From e6f3f7e4dc76eb8d8a546dc66621a02c5c84f4ac Mon Sep 17 00:00:00 2001
From: Noralf Trønnes <noralf@tronnes.org>
Date: Fri, 12 Apr 2019 11:41:30 +0200
Subject: spi: Add spi_is_bpw_supported()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This lets SPI clients check if the controller supports a particular word
width. drivers/gpu/drm/tinydrm/mipi-dbi.c will use this to determine if
the controller supports 16-bit for RGB565 pixels. If it doesn't it will
swap the bytes before transfer on little endian machines.

Signed-off-by: Noralf Trønnes <noralf@tronnes.org>
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/linux/spi/spi.h | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h
index 662b336aa2e4..b30e3d13a5ac 100644
--- a/include/linux/spi/spi.h
+++ b/include/linux/spi/spi.h
@@ -983,6 +983,26 @@ spi_max_transfer_size(struct spi_device *spi)
 	return min(tr_max, msg_max);
 }
 
+/**
+ * spi_is_bpw_supported - Check if bits per word is supported
+ * @spi: SPI device
+ * @bpw: Bits per word
+ *
+ * This function checks to see if the SPI controller supports @bpw.
+ *
+ * Returns:
+ * True if @bpw is supported, false otherwise.
+ */
+static inline bool spi_is_bpw_supported(struct spi_device *spi, u32 bpw)
+{
+	u32 bpw_mask = spi->master->bits_per_word_mask;
+
+	if (bpw == 8 || (bpw <= 32 && bpw_mask & SPI_BPW_MASK(bpw)))
+		return true;
+
+	return false;
+}
+
 /*---------------------------------------------------------------------------*/
 
 /* SPI transfer replacement methods which make use of spi_res */
-- 
cgit v1.2.3


From 67b886d290052dbf2bcfc876a5ae41a5fe461edf Mon Sep 17 00:00:00 2001
From: "Andrew F. Davis" <afd@ti.com>
Date: Thu, 21 Mar 2019 15:09:56 -0500
Subject: dma-buf: Remove leftover [un]map_atomic comments

The map_atomic/unmap_atomic callbacks have been removed, remove
the related comments.

Fixes: f664a5269542 ("dma-buf: remove kmap_atomic interface")
Signed-off-by: Andrew F. Davis <afd@ti.com>
Signed-off-by: Sumit Semwal <sumit.semwal@linaro.org>
Link: https://patchwork.freedesktop.org/patch/msgid/20190321200957.16938-1-afd@ti.com
---
 include/linux/dma-buf.h | 5 -----
 1 file changed, 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/dma-buf.h b/include/linux/dma-buf.h
index 58725f890b5b..e4a8dab2bc54 100644
--- a/include/linux/dma-buf.h
+++ b/include/linux/dma-buf.h
@@ -39,11 +39,6 @@ struct dma_buf_attachment;
 
 /**
  * struct dma_buf_ops - operations possible on struct dma_buf
- * @map_atomic: [optional] maps a page from the buffer into kernel address
- *		space, users may not block until the subsequent unmap call.
- *		This callback must not sleep.
- * @unmap_atomic: [optional] unmaps a atomically mapped page from the buffer.
- *		  This Callback must not sleep.
  * @map: [optional] maps a page from the buffer into kernel address space.
  * @unmap: [optional] unmaps a page from the buffer.
  * @vmap: [optional] creates a virtual mapping for the buffer into kernel
-- 
cgit v1.2.3


From d5ae7712b7ffbb435e8f3d98f2123eff4734c77f Mon Sep 17 00:00:00 2001
From: "Andrew F. Davis" <afd@ti.com>
Date: Thu, 21 Mar 2019 15:09:57 -0500
Subject: dma-buf: Update [un]map documentation to match the other functions

Other functions have inline documentation, but a couple still have
theirs at the top of the structure. Update the docs and move
them inline.

Signed-off-by: Andrew F. Davis <afd@ti.com>
Signed-off-by: Sumit Semwal <sumit.semwal@linaro.org>
Link: https://patchwork.freedesktop.org/patch/msgid/20190321200957.16938-2-afd@ti.com
---
 include/linux/dma-buf.h | 29 +++++++++++++++++++++++++----
 1 file changed, 25 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/dma-buf.h b/include/linux/dma-buf.h
index e4a8dab2bc54..a0bd071466fc 100644
--- a/include/linux/dma-buf.h
+++ b/include/linux/dma-buf.h
@@ -39,8 +39,6 @@ struct dma_buf_attachment;
 
 /**
  * struct dma_buf_ops - operations possible on struct dma_buf
- * @map: [optional] maps a page from the buffer into kernel address space.
- * @unmap: [optional] unmaps a page from the buffer.
  * @vmap: [optional] creates a virtual mapping for the buffer into kernel
  *	  address space. Same restrictions as for vmap and friends apply.
  * @vunmap: [optional] unmaps a vmap from the buffer
@@ -200,8 +198,6 @@ struct dma_buf_ops {
 	 * to be restarted.
 	 */
 	int (*end_cpu_access)(struct dma_buf *, enum dma_data_direction);
-	void *(*map)(struct dma_buf *, unsigned long);
-	void (*unmap)(struct dma_buf *, unsigned long, void *);
 
 	/**
 	 * @mmap:
@@ -240,6 +236,31 @@ struct dma_buf_ops {
 	 */
 	int (*mmap)(struct dma_buf *, struct vm_area_struct *vma);
 
+	/**
+	 * @map:
+	 *
+	 * Maps a page from the buffer into kernel address space. The page is
+	 * specified by offset into the buffer in PAGE_SIZE units.
+	 *
+	 * This callback is optional.
+	 *
+	 * Returns:
+	 *
+	 * Virtual address pointer where requested page can be accessed. NULL
+	 * on error or when this function is unimplemented by the exporter.
+	 */
+	void *(*map)(struct dma_buf *, unsigned long);
+
+	/**
+	 * @unmap:
+	 *
+	 * Unmaps a page from the buffer. Page offset and address pointer should
+	 * be the same as the one passed to and returned by matching call to map.
+	 *
+	 * This callback is optional.
+	 */
+	void (*unmap)(struct dma_buf *, unsigned long, void *);
+
 	void *(*vmap)(struct dma_buf *);
 	void (*vunmap)(struct dma_buf *, void *vaddr);
 };
-- 
cgit v1.2.3


From 0ff2de8bb163551ec4230a5a6f3c40c1f6adec4f Mon Sep 17 00:00:00 2001
From: Martin Sperl <kernel@martin.sperl.org>
Date: Sat, 23 Feb 2019 08:49:48 +0000
Subject: spi: core: allow defining time that cs is deasserted

For some SPI devices that support speed_hz > 1MHz the default 10 us delay
when cs_change = 1 is typically way too long and may result in poor spi bus
utilization.

This patch makes it possible to control the delay at microsecond or
nanosecond resolution on a per spi_transfer basis. It even allows an "as
fast as possible" mode with:
    xfer.cs_change_delay_unit = SPI_DELAY_UNIT_NSECS;
    xfer.cs_change_delay = 0;

The delay code is shared between delay_usecs and cs_change_delay for
consistency and reuse, so in the future the delay unit could also
apply to delay_usecs.

Note that on slower SoCs/CPUs actually reaching ns-range deasserts on cs is
not realistic, as the gpio overhead alone (without any delays added) may
already leave cs deasserted for more than 1us - at least on a raspberry pi.
But at the very least this way we can keep it as short as possible.

Signed-off-by: Martin Sperl <kernel@martin.sperl.org>
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 drivers/spi/spi.c       | 59 ++++++++++++++++++++++++++++++++++++++++---------
 include/linux/spi/spi.h |  7 ++++++
 2 files changed, 56 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/spi/spi.c b/drivers/spi/spi.c
index 5e75944ad5d1..7e8ffe3fdc00 100644
--- a/drivers/spi/spi.c
+++ b/drivers/spi/spi.c
@@ -1090,6 +1090,52 @@ static int spi_transfer_wait(struct spi_controller *ctlr,
 	return 0;
 }
 
+static void _spi_transfer_delay_ns(u32 ns)
+{
+	if (!ns)
+		return;
+	if (ns <= 1000) {
+		ndelay(ns);
+	} else {
+		u32 us = DIV_ROUND_UP(ns, 1000);
+
+		if (us <= 10)
+			udelay(us);
+		else
+			usleep_range(us, us + DIV_ROUND_UP(us, 10));
+	}
+}
+
+static void _spi_transfer_cs_change_delay(struct spi_message *msg,
+					  struct spi_transfer *xfer)
+{
+	u32 delay = xfer->cs_change_delay;
+	u32 unit = xfer->cs_change_delay_unit;
+
+	/* return early on "fast" mode - for everything but USECS */
+	if (!delay && unit != SPI_DELAY_UNIT_USECS)
+		return;
+
+	switch (unit) {
+	case SPI_DELAY_UNIT_USECS:
+		/* for compatibility use default of 10us */
+		if (!delay)
+			delay = 10000;
+		else
+			delay *= 1000;
+		break;
+	case SPI_DELAY_UNIT_NSECS: /* nothing to do here */
+		break;
+	default:
+		dev_err_once(&msg->spi->dev,
+			     "Use of unsupported delay unit %i, using default of 10us\n",
+			     xfer->cs_change_delay_unit);
+		delay = 10000;
+	}
+	/* now sleep for the requested amount of time */
+	_spi_transfer_delay_ns(delay);
+}
+
 /*
  * spi_transfer_one_message - Default implementation of transfer_one_message()
  *
@@ -1148,14 +1194,8 @@ static int spi_transfer_one_message(struct spi_controller *ctlr,
 		if (msg->status != -EINPROGRESS)
 			goto out;
 
-		if (xfer->delay_usecs) {
-			u16 us = xfer->delay_usecs;
-
-			if (us <= 10)
-				udelay(us);
-			else
-				usleep_range(us, us + DIV_ROUND_UP(us, 10));
-		}
+		if (xfer->delay_usecs)
+			_spi_transfer_delay_ns(xfer->delay_usecs * 1000);
 
 		if (xfer->cs_change) {
 			if (list_is_last(&xfer->transfer_list,
@@ -1163,7 +1203,7 @@ static int spi_transfer_one_message(struct spi_controller *ctlr,
 				keep_cs = true;
 			} else {
 				spi_set_cs(msg->spi, false);
-				udelay(10);
+				_spi_transfer_cs_change_delay(msg, xfer);
 				spi_set_cs(msg->spi, true);
 			}
 		}
@@ -3757,4 +3797,3 @@ err0:
  * include needing to have boardinfo data structures be much more public.
  */
 postcore_initcall(spi_init);
-
diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h
index 053abd22ad31..023beb9e9e4b 100644
--- a/include/linux/spi/spi.h
+++ b/include/linux/spi/spi.h
@@ -735,6 +735,9 @@ extern void spi_res_release(struct spi_controller *ctlr,
  * @bits_per_word: select a bits_per_word other than the device default
  *      for this transfer. If 0 the default (from @spi_device) is used.
  * @cs_change: affects chipselect after this transfer completes
+ * @cs_change_delay: delay between cs deassert and assert when
+ *      @cs_change is set and @spi_transfer is not the last in @spi_message
+ * @cs_change_delay_unit: unit of cs_change_delay
  * @delay_usecs: microseconds to delay after this transfer before
  *	(optionally) changing the chipselect status, then starting
  *	the next transfer or completing this @spi_message.
@@ -824,6 +827,10 @@ struct spi_transfer {
 	u8		bits_per_word;
 	u8		word_delay_usecs;
 	u16		delay_usecs;
+	u16		cs_change_delay;
+	u8		cs_change_delay_unit;
+#define SPI_DELAY_UNIT_USECS	0
+#define SPI_DELAY_UNIT_NSECS	1
 	u32		speed_hz;
 	u16		word_delay;
 
-- 
cgit v1.2.3


From d5864e5bed96db7230da45463d6ae7af5b3b4399 Mon Sep 17 00:00:00 2001
From: Martin Sperl <kernel@martin.sperl.org>
Date: Sat, 23 Feb 2019 08:49:50 +0000
Subject: spi: core: allow defining time that cs is deasserted as a multiple of
 SCK

Support setting the delay between cs deassert and assert as
a multiple of the SPI clock period.

Signed-off-by: Martin Sperl <kernel@martin.sperl.org>
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 drivers/spi/spi.c       | 8 ++++++++
 include/linux/spi/spi.h | 1 +
 2 files changed, 9 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/spi/spi.c b/drivers/spi/spi.c
index 7e8ffe3fdc00..cfa3c3decb8a 100644
--- a/drivers/spi/spi.c
+++ b/drivers/spi/spi.c
@@ -1111,6 +1111,7 @@ static void _spi_transfer_cs_change_delay(struct spi_message *msg,
 {
 	u32 delay = xfer->cs_change_delay;
 	u32 unit = xfer->cs_change_delay_unit;
+	u32 hz;
 
 	/* return early on "fast" mode - for everything but USECS */
 	if (!delay && unit != SPI_DELAY_UNIT_USECS)
@@ -1126,6 +1127,13 @@ static void _spi_transfer_cs_change_delay(struct spi_message *msg,
 		break;
 	case SPI_DELAY_UNIT_NSECS: /* nothing to do here */
 		break;
+	case SPI_DELAY_UNIT_SCK:
+		/* if there is no effective speed known, then approximate
+		 * by underestimating with half the requested hz
+		 */
+		hz = xfer->effective_speed_hz ?: xfer->speed_hz / 2;
+		delay *= DIV_ROUND_UP(1000000000, hz);
+		break;
 	default:
 		dev_err_once(&msg->spi->dev,
 			     "Use of unsupported delay unit %i, using default of 10us\n",
diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h
index 023beb9e9e4b..e552a036cb4d 100644
--- a/include/linux/spi/spi.h
+++ b/include/linux/spi/spi.h
@@ -831,6 +831,7 @@ struct spi_transfer {
 	u8		cs_change_delay_unit;
 #define SPI_DELAY_UNIT_USECS	0
 #define SPI_DELAY_UNIT_NSECS	1
+#define SPI_DELAY_UNIT_SCK	2
 	u32		speed_hz;
 	u16		word_delay;
 
-- 
cgit v1.2.3


From aec71d794731c441a9b7ee9705efedd2f6054173 Mon Sep 17 00:00:00 2001
From: Miroslav Benes <mbenes@suse.cz>
Date: Tue, 7 May 2019 15:08:14 +0200
Subject: livepatch: Remove stale kobj_added entries from kernel-doc
 descriptions

Commit 4d141ab3416d ("livepatch: Remove custom kobject state handling")
removed kobj_added members of klp_func, klp_object and klp_patch
structures. kernel-doc descriptions were omitted by accident. Remove
them.

Reported-by: Kamalesh Babulal <kamalesh@linux.vnet.ibm.com>
Signed-off-by: Miroslav Benes <mbenes@suse.cz>
Acked-by: Joe Lawrence <joe.lawrence@redhat.com>
Reviewed-by: Kamalesh Babulal <kamalesh@linux.vnet.ibm.com>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 include/linux/livepatch.h | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/livepatch.h b/include/linux/livepatch.h
index a14bab1a0a3e..955d46f37b72 100644
--- a/include/linux/livepatch.h
+++ b/include/linux/livepatch.h
@@ -47,7 +47,6 @@
  * @stack_node:	list node for klp_ops func_stack list
  * @old_size:	size of the old function
  * @new_size:	size of the new function
- * @kobj_added: @kobj has been added and needs freeing
  * @nop:        temporary patch to use the original code again; dyn. allocated
  * @patched:	the func has been added to the klp_ops list
  * @transition:	the func is currently being applied or reverted
@@ -125,7 +124,6 @@ struct klp_callbacks {
  * @node:	list node for klp_patch obj_list
  * @mod:	kernel module associated with the patched object
  *		(NULL for vmlinux)
- * @kobj_added: @kobj has been added and needs freeing
  * @dynamic:    temporary object for nop functions; dynamically allocated
  * @patched:	the object's funcs have been added to the klp_ops list
  */
@@ -152,7 +150,6 @@ struct klp_object {
  * @list:	list node for global list of actively used patches
  * @kobj:	kobject for sysfs resources
  * @obj_list:	dynamic list of the object entries
- * @kobj_added: @kobj has been added and needs freeing
  * @enabled:	the patch is enabled (but operation may be incomplete)
  * @forced:	was involved in a forced transition
  * @free_work:	patch cleanup from workqueue-context
-- 
cgit v1.2.3


From 09ed79d6d75f06cc963a78f25463251b0a758dc7 Mon Sep 17 00:00:00 2001
From: Roman Gushchin <guro@fb.com>
Date: Tue, 7 May 2019 10:01:47 -0700
Subject: percpu_ref: introduce PERCPU_REF_ALLOW_REINIT flag

In most cases percpu reference counters are not switched to the
percpu mode after they reach the atomic mode. Some obvious exceptions
are reference counters which are initialized into the atomic
mode (using PERCPU_REF_INIT_ATOMIC and PERCPU_REF_INIT_DEAD flags),
and there are few other exceptions.

But in most cases there is no way back, and once the reference counter
is switched to the atomic mode, there is no reason to wait for
percpu_ref_exit() to release the percpu memory. Of course, the size
of a single counter is not so big, but because it can pin the whole
percpu block in memory, the memory footprint can be noticeable
(e.g. on my 32 CPUs machine a percpu block is 8Mb large).

To make releasing of the percpu memory as early as possible, let's
introduce the PERCPU_REF_ALLOW_REINIT flag with the following semantics:
it has to be set in order to switch a percpu reference counter to the
percpu mode after the initialization. PERCPU_REF_INIT_ATOMIC and
PERCPU_REF_INIT_DEAD flags will implicitly assume PERCPU_REF_ALLOW_REINIT.

This patch doesn't introduce any functional change to avoid any
regressions. It will be done later in the patchset after adjusting
all call sites, which are reviving percpu counters.

Signed-off-by: Roman Gushchin <guro@fb.com>
Acked-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Dennis Zhou <dennis@kernel.org>
---
 include/linux/percpu-refcount.h | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/percpu-refcount.h b/include/linux/percpu-refcount.h
index b297cd1cd4f1..0f0240af8520 100644
--- a/include/linux/percpu-refcount.h
+++ b/include/linux/percpu-refcount.h
@@ -75,14 +75,21 @@ enum {
 	 * operation using percpu_ref_switch_to_percpu().  If initialized
 	 * with this flag, the ref will stay in atomic mode until
 	 * percpu_ref_switch_to_percpu() is invoked on it.
+	 * Implies ALLOW_REINIT.
 	 */
 	PERCPU_REF_INIT_ATOMIC	= 1 << 0,
 
 	/*
 	 * Start dead w/ ref == 0 in atomic mode.  Must be revived with
-	 * percpu_ref_reinit() before used.  Implies INIT_ATOMIC.
+	 * percpu_ref_reinit() before used.  Implies INIT_ATOMIC and
+	 * ALLOW_REINIT.
 	 */
 	PERCPU_REF_INIT_DEAD	= 1 << 1,
+
+	/*
+	 * Allow switching from atomic mode to percpu mode.
+	 */
+	PERCPU_REF_ALLOW_REINIT	= 1 << 2,
 };
 
 struct percpu_ref {
-- 
cgit v1.2.3


From 7d9ab9b6adffd9c474c1274acb5f6208f9a09cf3 Mon Sep 17 00:00:00 2001
From: Roman Gushchin <guro@fb.com>
Date: Tue, 7 May 2019 10:01:50 -0700
Subject: percpu_ref: release percpu memory early without
 PERCPU_REF_ALLOW_REINIT

Release percpu memory after finishing the switch to the atomic mode
only if PERCPU_REF_ALLOW_REINIT isn't set.

Signed-off-by: Roman Gushchin <guro@fb.com>
Acked-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Dennis Zhou <dennis@kernel.org>
---
 include/linux/percpu-refcount.h |  1 +
 lib/percpu-refcount.c           | 13 +++++++++++--
 2 files changed, 12 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/percpu-refcount.h b/include/linux/percpu-refcount.h
index 0f0240af8520..7aef0abc194a 100644
--- a/include/linux/percpu-refcount.h
+++ b/include/linux/percpu-refcount.h
@@ -102,6 +102,7 @@ struct percpu_ref {
 	percpu_ref_func_t	*release;
 	percpu_ref_func_t	*confirm_switch;
 	bool			force_atomic:1;
+	bool			allow_reinit:1;
 	struct rcu_head		rcu;
 };
 
diff --git a/lib/percpu-refcount.c b/lib/percpu-refcount.c
index 9877682e49c7..501b517bd3db 100644
--- a/lib/percpu-refcount.c
+++ b/lib/percpu-refcount.c
@@ -69,11 +69,14 @@ int percpu_ref_init(struct percpu_ref *ref, percpu_ref_func_t *release,
 		return -ENOMEM;
 
 	ref->force_atomic = flags & PERCPU_REF_INIT_ATOMIC;
+	ref->allow_reinit = flags & PERCPU_REF_ALLOW_REINIT;
 
-	if (flags & (PERCPU_REF_INIT_ATOMIC | PERCPU_REF_INIT_DEAD))
+	if (flags & (PERCPU_REF_INIT_ATOMIC | PERCPU_REF_INIT_DEAD)) {
 		ref->percpu_count_ptr |= __PERCPU_REF_ATOMIC;
-	else
+		ref->allow_reinit = true;
+	} else {
 		start_count += PERCPU_COUNT_BIAS;
+	}
 
 	if (flags & PERCPU_REF_INIT_DEAD)
 		ref->percpu_count_ptr |= __PERCPU_REF_DEAD;
@@ -119,6 +122,9 @@ static void percpu_ref_call_confirm_rcu(struct rcu_head *rcu)
 	ref->confirm_switch = NULL;
 	wake_up_all(&percpu_ref_switch_waitq);
 
+	if (!ref->allow_reinit)
+		percpu_ref_exit(ref);
+
 	/* drop ref from percpu_ref_switch_to_atomic() */
 	percpu_ref_put(ref);
 }
@@ -194,6 +200,9 @@ static void __percpu_ref_switch_to_percpu(struct percpu_ref *ref)
 	if (!(ref->percpu_count_ptr & __PERCPU_REF_ATOMIC))
 		return;
 
+	if (WARN_ON_ONCE(!ref->allow_reinit))
+		return;
+
 	atomic_long_add(PERCPU_COUNT_BIAS, &ref->count);
 
 	/*
-- 
cgit v1.2.3


From 5d7e2b5ed5858fe739d4cb8ad22dcce7bd9dbe7b Mon Sep 17 00:00:00 2001
From: Martin Sperl <kernel@martin.sperl.org>
Date: Sat, 23 Feb 2019 08:49:49 +0000
Subject: spi: core: allow reporting the effectively used speed_hz for a
 transfer

Provide a means for the spi bus driver to report the spi clock
frequency effectively used for each spi_transfer.

Signed-off-by: Martin Sperl <kernel@martin.sperl.org>
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 drivers/spi/spi.c       | 1 +
 include/linux/spi/spi.h | 5 +++++
 2 files changed, 6 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/spi/spi.c b/drivers/spi/spi.c
index cfa3c3decb8a..e9bf0c23da50 100644
--- a/drivers/spi/spi.c
+++ b/drivers/spi/spi.c
@@ -3126,6 +3126,7 @@ static int __spi_validate(struct spi_device *spi, struct spi_message *message)
 	 */
 	message->frame_length = 0;
 	list_for_each_entry(xfer, &message->transfers, transfer_list) {
+		xfer->effective_speed_hz = 0;
 		message->frame_length += xfer->len;
 		if (!xfer->bits_per_word)
 			xfer->bits_per_word = spi->bits_per_word;
diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h
index f55b20254612..d0c5ba746e01 100644
--- a/include/linux/spi/spi.h
+++ b/include/linux/spi/spi.h
@@ -745,6 +745,9 @@ extern void spi_res_release(struct spi_controller *ctlr,
  *	(set by bits_per_word) transmission.
  * @word_delay: clock cycles to inter word delay after each word size
  *	(set by bits_per_word) transmission.
+ * @effective_speed_hz: the effective SCK-speed that was used to
+ *      transfer this transfer. Set to 0 if the spi bus driver does
+ *      not support it.
  * @transfer_list: transfers are sequenced through @spi_message.transfers
  * @tx_sg: Scatterlist for transmit, currently not for client use
  * @rx_sg: Scatterlist for receive, currently not for client use
@@ -835,6 +838,8 @@ struct spi_transfer {
 	u32		speed_hz;
 	u16		word_delay;
 
+	u32		effective_speed_hz;
+
 	struct list_head transfer_list;
 };
 
-- 
cgit v1.2.3


From 6319aee10e530315689db7609a7d4c444124ff22 Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Wed, 8 May 2019 15:19:13 +0530
Subject: opp: Attach genpds to devices from within OPP core

The OPP core requires the virtual device pointers to set performance
state on behalf of the device, for the multiple power domain case. The
genpd API (dev_pm_domain_attach_by_name()) has evolved now to support
even the single power domain case and that lets us add common code for
handling both the cases more efficiently.

The virtual device structure returned by dev_pm_domain_attach_by_name()
isn't normally used by the cpufreq drivers as they don't manage power
on/off of the domains and so is only useful for the OPP core.

This patch moves all the complexity into the OPP core to make the end
drivers simple. The earlier APIs dev_pm_opp_{set|put}_genpd_virt_dev()
are reworked into dev_pm_opp_{attach|detach}_genpd(). The new helper
dev_pm_opp_attach_genpd() accepts a NULL terminated array of strings
which contains names of all the genpd's to attach. It then attaches all
the domains and saves the pointers to the virtual devices. The other
helper undoes the work done by this helper.

Tested-by: Niklas Cassel <niklas.cassel@linaro.org>
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 drivers/opp/core.c     | 128 +++++++++++++++++++++++++++++++------------------
 include/linux/pm_opp.h |   8 ++--
 2 files changed, 86 insertions(+), 50 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/opp/core.c b/drivers/opp/core.c
index 0e7703fe733f..67d6b0caeab1 100644
--- a/drivers/opp/core.c
+++ b/drivers/opp/core.c
@@ -1744,91 +1744,127 @@ void dev_pm_opp_unregister_set_opp_helper(struct opp_table *opp_table)
 }
 EXPORT_SYMBOL_GPL(dev_pm_opp_unregister_set_opp_helper);
 
+static void _opp_detach_genpd(struct opp_table *opp_table)
+{
+	int index;
+
+	for (index = 0; index < opp_table->required_opp_count; index++) {
+		if (!opp_table->genpd_virt_devs[index])
+			continue;
+
+		dev_pm_domain_detach(opp_table->genpd_virt_devs[index], false);
+		opp_table->genpd_virt_devs[index] = NULL;
+	}
+}
+
 /**
- * dev_pm_opp_set_genpd_virt_dev - Set virtual genpd device for an index
- * @dev: Consumer device for which the genpd device is getting set.
- * @virt_dev: virtual genpd device.
- * @index: index.
+ * dev_pm_opp_attach_genpd - Attach genpd(s) for the device and save virtual device pointer
+ * @dev: Consumer device for which the genpd is getting attached.
+ * @names: Null terminated array of pointers containing names of genpd to attach.
  *
  * Multiple generic power domains for a device are supported with the help of
  * virtual genpd devices, which are created for each consumer device - genpd
  * pair. These are the device structures which are attached to the power domain
  * and are required by the OPP core to set the performance state of the genpd.
+ * The same API also works for the case where single genpd is available and so
+ * we don't need to support that separately.
  *
  * This helper will normally be called by the consumer driver of the device
- * "dev", as only that has details of the genpd devices.
+ * "dev", as only that has details of the genpd names.
  *
- * This helper needs to be called once for each of those virtual devices, but
- * only if multiple domains are available for a device. Otherwise the original
- * device structure will be used instead by the OPP core.
+ * This helper needs to be called once with a list of all genpd to attach.
+ * Otherwise the original device structure will be used instead by the OPP core.
  */
-struct opp_table *dev_pm_opp_set_genpd_virt_dev(struct device *dev,
-						struct device *virt_dev,
-						int index)
+struct opp_table *dev_pm_opp_attach_genpd(struct device *dev, const char **names)
 {
 	struct opp_table *opp_table;
+	struct device *virt_dev;
+	int index, ret = -EINVAL;
+	const char **name = names;
 
 	opp_table = dev_pm_opp_get_opp_table(dev);
 	if (!opp_table)
 		return ERR_PTR(-ENOMEM);
 
+	/*
+	 * If the genpd's OPP table isn't already initialized, parsing of the
+	 * required-opps fails for dev. We should retry this after genpd's OPP
+	 * table is added.
+	 */
+	if (!opp_table->required_opp_count) {
+		ret = -EPROBE_DEFER;
+		goto put_table;
+	}
+
 	mutex_lock(&opp_table->genpd_virt_dev_lock);
 
-	if (unlikely(!opp_table->genpd_virt_devs ||
-		     index >= opp_table->required_opp_count ||
-		     opp_table->genpd_virt_devs[index])) {
+	while (*name) {
+		index = of_property_match_string(dev->of_node,
+						 "power-domain-names", *name);
+		if (index < 0) {
+			dev_err(dev, "Failed to find power domain: %s (%d)\n",
+				*name, index);
+			goto err;
+		}
 
-		dev_err(dev, "Invalid request to set required device\n");
-		dev_pm_opp_put_opp_table(opp_table);
-		mutex_unlock(&opp_table->genpd_virt_dev_lock);
+		if (index >= opp_table->required_opp_count) {
+			dev_err(dev, "Index can't be greater than required-opp-count - 1, %s (%d : %d)\n",
+				*name, opp_table->required_opp_count, index);
+			goto err;
+		}
 
-		return ERR_PTR(-EINVAL);
+		if (opp_table->genpd_virt_devs[index]) {
+			dev_err(dev, "Genpd virtual device already set %s\n",
+				*name);
+			goto err;
+		}
+
+		virt_dev = dev_pm_domain_attach_by_name(dev, *name);
+		if (IS_ERR(virt_dev)) {
+			ret = PTR_ERR(virt_dev);
+			dev_err(dev, "Couldn't attach to pm_domain: %d\n", ret);
+			goto err;
+		}
+
+		opp_table->genpd_virt_devs[index] = virt_dev;
+		name++;
 	}
 
-	opp_table->genpd_virt_devs[index] = virt_dev;
 	mutex_unlock(&opp_table->genpd_virt_dev_lock);
 
 	return opp_table;
+
+err:
+	_opp_detach_genpd(opp_table);
+	mutex_unlock(&opp_table->genpd_virt_dev_lock);
+
+put_table:
+	dev_pm_opp_put_opp_table(opp_table);
+
+	return ERR_PTR(ret);
 }
+EXPORT_SYMBOL_GPL(dev_pm_opp_attach_genpd);
 
 /**
- * dev_pm_opp_put_genpd_virt_dev() - Releases resources blocked for genpd device.
- * @opp_table: OPP table returned by dev_pm_opp_set_genpd_virt_dev().
- * @virt_dev: virtual genpd device.
- *
- * This releases the resource previously acquired with a call to
- * dev_pm_opp_set_genpd_virt_dev(). The consumer driver shall call this helper
- * if it doesn't want OPP core to update performance state of a power domain
- * anymore.
+ * dev_pm_opp_detach_genpd() - Detach genpd(s) from the device.
+ * @opp_table: OPP table returned by dev_pm_opp_attach_genpd().
+ *
+ * This detaches the genpd(s), resets the virtual device pointers, and puts the
+ * OPP table.
  */
-void dev_pm_opp_put_genpd_virt_dev(struct opp_table *opp_table,
-				   struct device *virt_dev)
+void dev_pm_opp_detach_genpd(struct opp_table *opp_table)
 {
-	int i;
-
 	/*
 	 * Acquire genpd_virt_dev_lock to make sure virt_dev isn't getting
 	 * used in parallel.
 	 */
 	mutex_lock(&opp_table->genpd_virt_dev_lock);
-
-	for (i = 0; i < opp_table->required_opp_count; i++) {
-		if (opp_table->genpd_virt_devs[i] != virt_dev)
-			continue;
-
-		opp_table->genpd_virt_devs[i] = NULL;
-		dev_pm_opp_put_opp_table(opp_table);
-
-		/* Drop the vote */
-		dev_pm_genpd_set_performance_state(virt_dev, 0);
-		break;
-	}
-
+	_opp_detach_genpd(opp_table);
 	mutex_unlock(&opp_table->genpd_virt_dev_lock);
 
-	if (unlikely(i == opp_table->required_opp_count))
-		dev_err(virt_dev, "Failed to find required device entry\n");
+	dev_pm_opp_put_opp_table(opp_table);
 }
+EXPORT_SYMBOL_GPL(dev_pm_opp_detach_genpd);
 
 /**
  * dev_pm_opp_xlate_performance_state() - Find required OPP's pstate for src_table.
diff --git a/include/linux/pm_opp.h b/include/linux/pm_opp.h
index b150fe97ce5a..be570761b77a 100644
--- a/include/linux/pm_opp.h
+++ b/include/linux/pm_opp.h
@@ -131,8 +131,8 @@ struct opp_table *dev_pm_opp_set_clkname(struct device *dev, const char * name);
 void dev_pm_opp_put_clkname(struct opp_table *opp_table);
 struct opp_table *dev_pm_opp_register_set_opp_helper(struct device *dev, int (*set_opp)(struct dev_pm_set_opp_data *data));
 void dev_pm_opp_unregister_set_opp_helper(struct opp_table *opp_table);
-struct opp_table *dev_pm_opp_set_genpd_virt_dev(struct device *dev, struct device *virt_dev, int index);
-void dev_pm_opp_put_genpd_virt_dev(struct opp_table *opp_table, struct device *virt_dev);
+struct opp_table *dev_pm_opp_attach_genpd(struct device *dev, const char **names);
+void dev_pm_opp_detach_genpd(struct opp_table *opp_table);
 int dev_pm_opp_xlate_performance_state(struct opp_table *src_table, struct opp_table *dst_table, unsigned int pstate);
 int dev_pm_opp_set_rate(struct device *dev, unsigned long target_freq);
 int dev_pm_opp_set_sharing_cpus(struct device *cpu_dev, const struct cpumask *cpumask);
@@ -295,12 +295,12 @@ static inline struct opp_table *dev_pm_opp_set_clkname(struct device *dev, const
 
 static inline void dev_pm_opp_put_clkname(struct opp_table *opp_table) {}
 
-static inline struct opp_table *dev_pm_opp_set_genpd_virt_dev(struct device *dev, struct device *virt_dev, int index)
+static inline struct opp_table *dev_pm_opp_attach_genpd(struct device *dev, const char **names)
 {
 	return ERR_PTR(-ENOTSUPP);
 }
 
-static inline void dev_pm_opp_put_genpd_virt_dev(struct opp_table *opp_table, struct device *virt_dev) {}
+static inline void dev_pm_opp_detach_genpd(struct opp_table *opp_table) {}
 
 static inline int dev_pm_opp_xlate_performance_state(struct opp_table *src_table, struct opp_table *dst_table, unsigned int pstate)
 {
-- 
cgit v1.2.3


From 4c1ca625c622b7a9f04c2949fd1ffdc6effa86de Mon Sep 17 00:00:00 2001
From: Nick Crews <ncrews@chromium.org>
Date: Tue, 16 Apr 2019 19:20:47 -0600
Subject: platform/chrome: wilco_ec: Add Boot on AC support

Boot on AC is a policy which makes the device boot from S5 when AC
power is connected. This is useful for users who want to run their
device headless or with a dock.

Signed-off-by: Nick Crews <ncrews@chromium.org>
Signed-off-by: Enric Balletbo i Serra <enric.balletbo@collabora.com>
---
 Documentation/ABI/testing/sysfs-platform-wilco-ec |  9 +++
 drivers/platform/chrome/wilco_ec/Makefile         |  2 +-
 drivers/platform/chrome/wilco_ec/core.c           |  9 +++
 drivers/platform/chrome/wilco_ec/sysfs.c          | 77 +++++++++++++++++++++++
 include/linux/platform_data/wilco-ec.h            | 12 ++++
 5 files changed, 108 insertions(+), 1 deletion(-)
 create mode 100644 Documentation/ABI/testing/sysfs-platform-wilco-ec
 create mode 100644 drivers/platform/chrome/wilco_ec/sysfs.c

(limited to 'include/linux')

diff --git a/Documentation/ABI/testing/sysfs-platform-wilco-ec b/Documentation/ABI/testing/sysfs-platform-wilco-ec
new file mode 100644
index 000000000000..8e5d6eee44db
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-platform-wilco-ec
@@ -0,0 +1,9 @@
+What:		/sys/bus/platform/devices/GOOG000C\:00/boot_on_ac
+Date:		April 2019
+KernelVersion:	5.3
+Description:
+		Boot on AC is a policy which makes the device boot from S5
+		when AC power is connected. This is useful for users who
+		want to run their device headless or with a dock.
+
+		Input should be parseable by kstrtou8() to 0 or 1.
diff --git a/drivers/platform/chrome/wilco_ec/Makefile b/drivers/platform/chrome/wilco_ec/Makefile
index 29b734137786..72df9b5e1983 100644
--- a/drivers/platform/chrome/wilco_ec/Makefile
+++ b/drivers/platform/chrome/wilco_ec/Makefile
@@ -1,6 +1,6 @@
 # SPDX-License-Identifier: GPL-2.0
 
-wilco_ec-objs				:= core.o mailbox.o properties.o
+wilco_ec-objs				:= core.o mailbox.o properties.o sysfs.o
 obj-$(CONFIG_WILCO_EC)			+= wilco_ec.o
 wilco_ec_debugfs-objs			:= debugfs.o
 obj-$(CONFIG_WILCO_EC_DEBUGFS)		+= wilco_ec_debugfs.o
diff --git a/drivers/platform/chrome/wilco_ec/core.c b/drivers/platform/chrome/wilco_ec/core.c
index 05e1e2be1c91..abd15d04e57b 100644
--- a/drivers/platform/chrome/wilco_ec/core.c
+++ b/drivers/platform/chrome/wilco_ec/core.c
@@ -89,8 +89,16 @@ static int wilco_ec_probe(struct platform_device *pdev)
 		goto unregister_debugfs;
 	}
 
+	ret = wilco_ec_add_sysfs(ec);
+	if (ret < 0) {
+		dev_err(dev, "Failed to create sysfs entries: %d", ret);
+		goto unregister_rtc;
+	}
+
 	return 0;
 
+unregister_rtc:
+	platform_device_unregister(ec->rtc_pdev);
 unregister_debugfs:
 	if (ec->debugfs_pdev)
 		platform_device_unregister(ec->debugfs_pdev);
@@ -102,6 +110,7 @@ static int wilco_ec_remove(struct platform_device *pdev)
 {
 	struct wilco_ec_device *ec = platform_get_drvdata(pdev);
 
+	wilco_ec_remove_sysfs(ec);
 	platform_device_unregister(ec->rtc_pdev);
 	if (ec->debugfs_pdev)
 		platform_device_unregister(ec->debugfs_pdev);
diff --git a/drivers/platform/chrome/wilco_ec/sysfs.c b/drivers/platform/chrome/wilco_ec/sysfs.c
new file mode 100644
index 000000000000..f84f0480460a
--- /dev/null
+++ b/drivers/platform/chrome/wilco_ec/sysfs.c
@@ -0,0 +1,77 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright 2019 Google LLC
+ *
+ * Sysfs properties to view and modify EC-controlled features on Wilco devices.
+ * The entries will appear under /sys/bus/platform/devices/GOOG000C:00/
+ *
+ * See Documentation/ABI/testing/sysfs-platform-wilco-ec for more information.
+ */
+
+#include <linux/platform_data/wilco-ec.h>
+#include <linux/sysfs.h>
+
+#define CMD_KB_CMOS			0x7C
+#define SUB_CMD_KB_CMOS_AUTO_ON		0x03
+
+struct boot_on_ac_request {
+	u8 cmd;			/* Always CMD_KB_CMOS */
+	u8 reserved1;
+	u8 sub_cmd;		/* Always SUB_CMD_KB_CMOS_AUTO_ON */
+	u8 reserved3to5[3];
+	u8 val;			/* Either 0 or 1 */
+	u8 reserved7;
+} __packed;
+
+static ssize_t boot_on_ac_store(struct device *dev,
+				struct device_attribute *attr,
+				const char *buf, size_t count)
+{
+	struct wilco_ec_device *ec = dev_get_drvdata(dev);
+	struct boot_on_ac_request rq;
+	struct wilco_ec_message msg;
+	int ret;
+	u8 val;
+
+	ret = kstrtou8(buf, 10, &val);
+	if (ret < 0)
+		return ret;
+	if (val > 1)
+		return -EINVAL;
+
+	memset(&rq, 0, sizeof(rq));
+	rq.cmd = CMD_KB_CMOS;
+	rq.sub_cmd = SUB_CMD_KB_CMOS_AUTO_ON;
+	rq.val = val;
+
+	memset(&msg, 0, sizeof(msg));
+	msg.type = WILCO_EC_MSG_LEGACY;
+	msg.request_data = &rq;
+	msg.request_size = sizeof(rq);
+	ret = wilco_ec_mailbox(ec, &msg);
+	if (ret < 0)
+		return ret;
+
+	return count;
+}
+
+static DEVICE_ATTR_WO(boot_on_ac);
+
+static struct attribute *wilco_dev_attrs[] = {
+	&dev_attr_boot_on_ac.attr,
+	NULL,
+};
+
+static struct attribute_group wilco_dev_attr_group = {
+	.attrs = wilco_dev_attrs,
+};
+
+int wilco_ec_add_sysfs(struct wilco_ec_device *ec)
+{
+	return sysfs_create_group(&ec->dev->kobj, &wilco_dev_attr_group);
+}
+
+void wilco_ec_remove_sysfs(struct wilco_ec_device *ec)
+{
+	sysfs_remove_group(&ec->dev->kobj, &wilco_dev_attr_group);
+}
diff --git a/include/linux/platform_data/wilco-ec.h b/include/linux/platform_data/wilco-ec.h
index 50a21bd5fd44..af68fc0563cc 100644
--- a/include/linux/platform_data/wilco-ec.h
+++ b/include/linux/platform_data/wilco-ec.h
@@ -194,4 +194,16 @@ int wilco_ec_get_byte_property(struct wilco_ec_device *ec, u32 property_id,
 int wilco_ec_set_byte_property(struct wilco_ec_device *ec, u32 property_id,
 			       u8 val);
 
+/**
+ * wilco_ec_add_sysfs() - Create sysfs entries
+ * @ec: Wilco EC device
+ *
+ * wilco_ec_remove_sysfs() needs to be called afterwards
+ * to perform the necessary cleanup.
+ *
+ * Return: 0 on success or negative error code on failure.
+ */
+int wilco_ec_add_sysfs(struct wilco_ec_device *ec);
+void wilco_ec_remove_sysfs(struct wilco_ec_device *ec);
+
 #endif /* WILCO_EC_H */
-- 
cgit v1.2.3


From 0c0b7ea23aed0b55ef2f9803f13ddaae1943713d Mon Sep 17 00:00:00 2001
From: Nick Crews <ncrews@chromium.org>
Date: Wed, 24 Apr 2019 10:56:50 -0600
Subject: platform/chrome: wilco_ec: Add property helper library

A Property is typically a data item that is stored to NVRAM
by the EC. Each of these data items has an index associated
with it, known as the Property ID (PID). Properties may have
variable lengths, up to a max of WILCO_EC_PROPERTY_MAX_SIZE
bytes. Properties can be simple integers, or they may be more
complex binary data.

This patch adds support for getting and setting properties.
This will be useful for setting the charge algorithm and charge
schedules, which all use properties.

Signed-off-by: Nick Crews <ncrews@chromium.org>
Signed-off-by: Enric Balletbo i Serra <enric.balletbo@collabora.com>
---
 drivers/platform/chrome/wilco_ec/Makefile     |   2 +-
 drivers/platform/chrome/wilco_ec/properties.c | 132 ++++++++++++++++++++++++++
 include/linux/platform_data/wilco-ec.h        |  71 ++++++++++++++
 3 files changed, 204 insertions(+), 1 deletion(-)
 create mode 100644 drivers/platform/chrome/wilco_ec/properties.c

(limited to 'include/linux')

diff --git a/drivers/platform/chrome/wilco_ec/Makefile b/drivers/platform/chrome/wilco_ec/Makefile
index 063e7fb4ea17..29b734137786 100644
--- a/drivers/platform/chrome/wilco_ec/Makefile
+++ b/drivers/platform/chrome/wilco_ec/Makefile
@@ -1,6 +1,6 @@
 # SPDX-License-Identifier: GPL-2.0
 
-wilco_ec-objs				:= core.o mailbox.o
+wilco_ec-objs				:= core.o mailbox.o properties.o
 obj-$(CONFIG_WILCO_EC)			+= wilco_ec.o
 wilco_ec_debugfs-objs			:= debugfs.o
 obj-$(CONFIG_WILCO_EC_DEBUGFS)		+= wilco_ec_debugfs.o
diff --git a/drivers/platform/chrome/wilco_ec/properties.c b/drivers/platform/chrome/wilco_ec/properties.c
new file mode 100644
index 000000000000..e69682c95ea2
--- /dev/null
+++ b/drivers/platform/chrome/wilco_ec/properties.c
@@ -0,0 +1,132 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright 2019 Google LLC
+ */
+
+#include <linux/platform_data/wilco-ec.h>
+#include <linux/string.h>
+#include <linux/unaligned/le_memmove.h>
+
+/* Operation code; what the EC should do with the property */
+enum ec_property_op {
+	EC_OP_GET = 0,
+	EC_OP_SET = 1,
+};
+
+struct ec_property_request {
+	u8 op; /* One of enum ec_property_op */
+	u8 property_id[4]; /* The 32 bit PID is stored Little Endian */
+	u8 length;
+	u8 data[WILCO_EC_PROPERTY_MAX_SIZE];
+} __packed;
+
+struct ec_property_response {
+	u8 reserved[2];
+	u8 op; /* One of enum ec_property_op */
+	u8 property_id[4]; /* The 32 bit PID is stored Little Endian */
+	u8 length;
+	u8 data[WILCO_EC_PROPERTY_MAX_SIZE];
+} __packed;
+
+static int send_property_msg(struct wilco_ec_device *ec,
+			     struct ec_property_request *rq,
+			     struct ec_property_response *rs)
+{
+	struct wilco_ec_message ec_msg;
+	int ret;
+
+	memset(&ec_msg, 0, sizeof(ec_msg));
+	ec_msg.type = WILCO_EC_MSG_PROPERTY;
+	ec_msg.request_data = rq;
+	ec_msg.request_size = sizeof(*rq);
+	ec_msg.response_data = rs;
+	ec_msg.response_size = sizeof(*rs);
+
+	ret = wilco_ec_mailbox(ec, &ec_msg);
+	if (ret < 0)
+		return ret;
+	if (rs->op != rq->op)
+		return -EBADMSG;
+	if (memcmp(rq->property_id, rs->property_id, sizeof(rs->property_id)))
+		return -EBADMSG;
+
+	return 0;
+}
+
+int wilco_ec_get_property(struct wilco_ec_device *ec,
+			  struct wilco_ec_property_msg *prop_msg)
+{
+	struct ec_property_request rq;
+	struct ec_property_response rs;
+	int ret;
+
+	memset(&rq, 0, sizeof(rq));
+	rq.op = EC_OP_GET;
+	put_unaligned_le32(prop_msg->property_id, rq.property_id);
+
+	ret = send_property_msg(ec, &rq, &rs);
+	if (ret < 0)
+		return ret;
+
+	prop_msg->length = rs.length;
+	memcpy(prop_msg->data, rs.data, rs.length);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(wilco_ec_get_property);
+
+int wilco_ec_set_property(struct wilco_ec_device *ec,
+			  struct wilco_ec_property_msg *prop_msg)
+{
+	struct ec_property_request rq;
+	struct ec_property_response rs;
+	int ret;
+
+	memset(&rq, 0, sizeof(rq));
+	rq.op = EC_OP_SET;
+	put_unaligned_le32(prop_msg->property_id, rq.property_id);
+	rq.length = prop_msg->length;
+	memcpy(rq.data, prop_msg->data, prop_msg->length);
+
+	ret = send_property_msg(ec, &rq, &rs);
+	if (ret < 0)
+		return ret;
+	if (rs.length != prop_msg->length)
+		return -EBADMSG;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(wilco_ec_set_property);
+
+int wilco_ec_get_byte_property(struct wilco_ec_device *ec, u32 property_id,
+			       u8 *val)
+{
+	struct wilco_ec_property_msg msg;
+	int ret;
+
+	msg.property_id = property_id;
+
+	ret = wilco_ec_get_property(ec, &msg);
+	if (ret < 0)
+		return ret;
+	if (msg.length != 1)
+		return -EBADMSG;
+
+	*val = msg.data[0];
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(wilco_ec_get_byte_property);
+
+int wilco_ec_set_byte_property(struct wilco_ec_device *ec, u32 property_id,
+			       u8 val)
+{
+	struct wilco_ec_property_msg msg;
+
+	msg.property_id = property_id;
+	msg.data[0] = val;
+	msg.length = 1;
+
+	return wilco_ec_set_property(ec, &msg);
+}
+EXPORT_SYMBOL_GPL(wilco_ec_set_byte_property);
diff --git a/include/linux/platform_data/wilco-ec.h b/include/linux/platform_data/wilco-ec.h
index 1ff224793c99..50a21bd5fd44 100644
--- a/include/linux/platform_data/wilco-ec.h
+++ b/include/linux/platform_data/wilco-ec.h
@@ -123,4 +123,75 @@ struct wilco_ec_message {
  */
 int wilco_ec_mailbox(struct wilco_ec_device *ec, struct wilco_ec_message *msg);
 
+/*
+ * A Property is typically a data item that is stored to NVRAM
+ * by the EC. Each of these data items has an index associated
+ * with it, known as the Property ID (PID). Properties may have
+ * variable lengths, up to a max of WILCO_EC_PROPERTY_MAX_SIZE
+ * bytes. Properties can be simple integers, or they may be more
+ * complex binary data.
+ */
+
+#define WILCO_EC_PROPERTY_MAX_SIZE	4
+
+/**
+ * struct wilco_ec_property_msg - Message to get or set a property.
+ * @property_id: Which property to get or set.
+ * @length: Number of bytes of |data| that are used.
+ * @data: Actual property data.
+ */
+struct wilco_ec_property_msg {
+	u32 property_id;
+	int length;
+	u8 data[WILCO_EC_PROPERTY_MAX_SIZE];
+};
+
+/**
+ * wilco_ec_get_property() - Retrieve a property from the EC.
+ * @ec: Embedded Controller device.
+ * @prop_msg: Message for request and response.
+ *
+ * The property_id field of |prop_msg| should be filled before calling this
+ * function. The result will be stored in the data and length fields.
+ *
+ * Return: 0 on success, negative error code on failure.
+ */
+int wilco_ec_get_property(struct wilco_ec_device *ec,
+			  struct wilco_ec_property_msg *prop_msg);
+
+/**
+ * wilco_ec_set_property() - Store a property on the EC.
+ * @ec: Embedded Controller device.
+ * @prop_msg: Message for request and response.
+ *
+ * The property_id, length, and data fields of |prop_msg| should be
+ * filled before calling this function.
+ *
+ * Return: 0 on success, negative error code on failure.
+ */
+int wilco_ec_set_property(struct wilco_ec_device *ec,
+			  struct wilco_ec_property_msg *prop_msg);
+
+/**
+ * wilco_ec_get_byte_property() - Retrieve a byte-size property from the EC.
+ * @ec: Embedded Controller device.
+ * @property_id: Which property to retrieve.
+ * @val: The result value, will be filled by this function.
+ *
+ * Return: 0 on success, negative error code on failure.
+ */
+int wilco_ec_get_byte_property(struct wilco_ec_device *ec, u32 property_id,
+			       u8 *val);
+
+/**
+ * wilco_ec_set_byte_property() - Store a byte-size property on the EC.
+ * @ec: Embedded Controller device.
+ * @property_id: Which property to store.
+ * @val: Value to store.
+ *
+ * Return: 0 on success, negative error code on failure.
+ */
+int wilco_ec_set_byte_property(struct wilco_ec_device *ec, u32 property_id,
+			       u8 val);
+
 #endif /* WILCO_EC_H */
-- 
cgit v1.2.3


From 2ad1f7a91449de48d4bd5d1ec361ba7bb9026505 Mon Sep 17 00:00:00 2001
From: Nick Crews <ncrews@chromium.org>
Date: Wed, 8 May 2019 15:38:09 -0600
Subject: platform/chrome: wilco_ec: Remove 256 byte transfers

The 0xF6 command, intended to send and receive 256 byte payloads to
and from the EC, is not needed. The 0xF5 command for 32 byte
payloads is sufficient. This patch removes support for the 0xF6
command and 256 byte payloads.

Signed-off-by: Nick Crews <ncrews@chromium.org>
Signed-off-by: Enric Balletbo i Serra <enric.balletbo@collabora.com>
---
 Documentation/ABI/testing/debugfs-wilco-ec | 16 +++++++---------
 drivers/platform/chrome/wilco_ec/core.c    |  4 +---
 drivers/platform/chrome/wilco_ec/debugfs.c | 10 ++--------
 drivers/platform/chrome/wilco_ec/mailbox.c | 21 +++++----------------
 include/linux/platform_data/wilco-ec.h     |  9 ++-------
 5 files changed, 17 insertions(+), 43 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/ABI/testing/debugfs-wilco-ec b/Documentation/ABI/testing/debugfs-wilco-ec
index 73a5a66ddca6..9d8d9d2def5b 100644
--- a/Documentation/ABI/testing/debugfs-wilco-ec
+++ b/Documentation/ABI/testing/debugfs-wilco-ec
@@ -23,11 +23,9 @@ Description:
 
 		For writing, bytes 0-1 indicate the message type, one of enum
 		wilco_ec_msg_type. Byte 2+ consist of the data passed in the
-		request, starting at MBOX[0]
-
-		At least three bytes are required for writing, two for the type
-		and at least a single byte of data. Only the first
-		EC_MAILBOX_DATA_SIZE bytes of MBOX will be used.
+		request, starting at MBOX[0]. At least three bytes are required
+		for writing, two for the type and at least a single byte of
+		data.
 
 		Example:
 		// Request EC info type 3 (EC firmware build date)
@@ -40,7 +38,7 @@ Description:
 		$ cat /sys/kernel/debug/wilco_ec/raw
 		00 00 31 32 2f 32 31 2f 31 38 00 38 00 01 00 2f 00  ..12/21/18.8...
 
-		Note that the first 32 bytes of the received MBOX[] will be
-		printed, even if some of the data is junk. It is up to you to
-		know how many of the first bytes of data are the actual
-		response.
+		Note that the first 16 bytes of the received MBOX[] will be
+		printed, even if some of the data is junk, and bytes 17 to 32
+		are skipped. It is up to you to know how many of the first bytes
+		of data are the actual response.
diff --git a/drivers/platform/chrome/wilco_ec/core.c b/drivers/platform/chrome/wilco_ec/core.c
index abd15d04e57b..45cf3a5ed062 100644
--- a/drivers/platform/chrome/wilco_ec/core.c
+++ b/drivers/platform/chrome/wilco_ec/core.c
@@ -52,9 +52,7 @@ static int wilco_ec_probe(struct platform_device *pdev)
 	ec->dev = dev;
 	mutex_init(&ec->mailbox_lock);
 
-	/* Largest data buffer size requirement is extended data response */
-	ec->data_size = sizeof(struct wilco_ec_response) +
-		EC_MAILBOX_DATA_SIZE_EXTENDED;
+	ec->data_size = sizeof(struct wilco_ec_response) + EC_MAILBOX_DATA_SIZE;
 	ec->data_buffer = devm_kzalloc(dev, ec->data_size, GFP_KERNEL);
 	if (!ec->data_buffer)
 		return -ENOMEM;
diff --git a/drivers/platform/chrome/wilco_ec/debugfs.c b/drivers/platform/chrome/wilco_ec/debugfs.c
index f163476d080d..281ec595e8e0 100644
--- a/drivers/platform/chrome/wilco_ec/debugfs.c
+++ b/drivers/platform/chrome/wilco_ec/debugfs.c
@@ -17,13 +17,13 @@
 #define DRV_NAME "wilco-ec-debugfs"
 
 /* The 256 raw bytes will take up more space when represented as a hex string */
-#define FORMATTED_BUFFER_SIZE (EC_MAILBOX_DATA_SIZE_EXTENDED * 4)
+#define FORMATTED_BUFFER_SIZE (EC_MAILBOX_DATA_SIZE * 4)
 
 struct wilco_ec_debugfs {
 	struct wilco_ec_device *ec;
 	struct dentry *dir;
 	size_t response_size;
-	u8 raw_data[EC_MAILBOX_DATA_SIZE_EXTENDED];
+	u8 raw_data[EC_MAILBOX_DATA_SIZE];
 	u8 formatted_data[FORMATTED_BUFFER_SIZE];
 };
 static struct wilco_ec_debugfs *debug_info;
@@ -124,12 +124,6 @@ static ssize_t raw_write(struct file *file, const char __user *user_buf,
 	msg.response_data = debug_info->raw_data;
 	msg.response_size = EC_MAILBOX_DATA_SIZE;
 
-	/* Telemetry commands use extended response data */
-	if (msg.type == WILCO_EC_MSG_TELEMETRY_LONG) {
-		msg.flags |= WILCO_EC_FLAG_EXTENDED_DATA;
-		msg.response_size = EC_MAILBOX_DATA_SIZE_EXTENDED;
-	}
-
 	ret = wilco_ec_mailbox(debug_info->ec, &msg);
 	if (ret < 0)
 		return ret;
diff --git a/drivers/platform/chrome/wilco_ec/mailbox.c b/drivers/platform/chrome/wilco_ec/mailbox.c
index 7fb58b487963..ced1f9f3dcee 100644
--- a/drivers/platform/chrome/wilco_ec/mailbox.c
+++ b/drivers/platform/chrome/wilco_ec/mailbox.c
@@ -119,7 +119,6 @@ static int wilco_ec_transfer(struct wilco_ec_device *ec,
 	struct wilco_ec_response *rs;
 	u8 checksum;
 	u8 flag;
-	size_t size;
 
 	/* Write request header, then data */
 	cros_ec_lpc_io_bytes_mec(MEC_IO_WRITE, 0, sizeof(*rq), (u8 *)rq);
@@ -148,21 +147,11 @@ static int wilco_ec_transfer(struct wilco_ec_device *ec,
 		return -EIO;
 	}
 
-	/*
-	 * The EC always returns either EC_MAILBOX_DATA_SIZE or
-	 * EC_MAILBOX_DATA_SIZE_EXTENDED bytes of data, so we need to
-	 * calculate the checksum on **all** of this data, even if we
-	 * won't use all of it.
-	 */
-	if (msg->flags & WILCO_EC_FLAG_EXTENDED_DATA)
-		size = EC_MAILBOX_DATA_SIZE_EXTENDED;
-	else
-		size = EC_MAILBOX_DATA_SIZE;
-
 	/* Read back response */
 	rs = ec->data_buffer;
 	checksum = cros_ec_lpc_io_bytes_mec(MEC_IO_READ, 0,
-					    sizeof(*rs) + size, (u8 *)rs);
+					    sizeof(*rs) + EC_MAILBOX_DATA_SIZE,
+					    (u8 *)rs);
 	if (checksum) {
 		dev_dbg(ec->dev, "bad packet checksum 0x%02x\n", rs->checksum);
 		return -EBADMSG;
@@ -173,9 +162,9 @@ static int wilco_ec_transfer(struct wilco_ec_device *ec,
 		return -EBADMSG;
 	}
 
-	if (rs->data_size != size) {
-		dev_dbg(ec->dev, "unexpected packet size (%u != %zu)",
-			rs->data_size, size);
+	if (rs->data_size != EC_MAILBOX_DATA_SIZE) {
+		dev_dbg(ec->dev, "unexpected packet size (%u != %u)",
+			rs->data_size, EC_MAILBOX_DATA_SIZE);
 		return -EMSGSIZE;
 	}
 
diff --git a/include/linux/platform_data/wilco-ec.h b/include/linux/platform_data/wilco-ec.h
index af68fc0563cc..e3ce9ce49b11 100644
--- a/include/linux/platform_data/wilco-ec.h
+++ b/include/linux/platform_data/wilco-ec.h
@@ -13,12 +13,9 @@
 
 /* Message flags for using the mailbox() interface */
 #define WILCO_EC_FLAG_NO_RESPONSE	BIT(0) /* EC does not respond */
-#define WILCO_EC_FLAG_EXTENDED_DATA	BIT(1) /* EC returns 256 data bytes */
 
 /* Normal commands have a maximum 32 bytes of data */
 #define EC_MAILBOX_DATA_SIZE		32
-/* Extended commands have 256 bytes of response data */
-#define EC_MAILBOX_DATA_SIZE_EXTENDED	256
 
 /**
  * struct wilco_ec_device - Wilco Embedded Controller handle.
@@ -85,14 +82,12 @@ struct wilco_ec_response {
  * enum wilco_ec_msg_type - Message type to select a set of command codes.
  * @WILCO_EC_MSG_LEGACY: Legacy EC messages for standard EC behavior.
  * @WILCO_EC_MSG_PROPERTY: Get/Set/Sync EC controlled NVRAM property.
- * @WILCO_EC_MSG_TELEMETRY_SHORT: 32 bytes of telemetry data provided by the EC.
- * @WILCO_EC_MSG_TELEMETRY_LONG: 256 bytes of telemetry data provided by the EC.
+ * @WILCO_EC_MSG_TELEMETRY: Request telemetry data from the EC.
  */
 enum wilco_ec_msg_type {
 	WILCO_EC_MSG_LEGACY = 0x00f0,
 	WILCO_EC_MSG_PROPERTY = 0x00f2,
-	WILCO_EC_MSG_TELEMETRY_SHORT = 0x00f5,
-	WILCO_EC_MSG_TELEMETRY_LONG = 0x00f6,
+	WILCO_EC_MSG_TELEMETRY = 0x00f5,
 };
 
 /**
-- 
cgit v1.2.3


From 824bd1be3ed01d67197098650d0c62b176087b11 Mon Sep 17 00:00:00 2001
From: Wolfram Sang <wsa+renesas@sang-engineering.com>
Date: Mon, 20 May 2019 11:34:45 +0200
Subject: regulator: da9063: move definitions out of a header into the driver

Those definitions are only used within the driver meanwhile, so put them
there.

Signed-off-by: Wolfram Sang <wsa+renesas@sang-engineering.com>
Reviewed-by: Simon Horman <horms+renesas@verge.net.au>
Acked-by: Steve Twiss <stwiss.opensource@diasemi.com>
Tested-by: Steve Twiss <stwiss.opensource@diasemi.com>
Reviewed-by: Geert Uytterhoeven <geert+renesas@glider.be>
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 drivers/regulator/da9063-regulator.c | 44 +++++++++++++++++++++++++++++++-
 include/linux/mfd/da9063/pdata.h     | 49 ------------------------------------
 2 files changed, 43 insertions(+), 50 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/regulator/da9063-regulator.c b/drivers/regulator/da9063-regulator.c
index 43aa0df30346..da95197fdb4f 100644
--- a/drivers/regulator/da9063-regulator.c
+++ b/drivers/regulator/da9063-regulator.c
@@ -19,7 +19,6 @@
 #include <linux/regulator/machine.h>
 #include <linux/regulator/of_regulator.h>
 #include <linux/mfd/da9063/core.h>
-#include <linux/mfd/da9063/pdata.h>
 #include <linux/mfd/da9063/registers.h>
 
 
@@ -28,6 +27,49 @@
 	REG_FIELD(_reg, __builtin_ffs((int)_mask) - 1, \
 		sizeof(unsigned int) * 8 - __builtin_clz((_mask)) - 1)
 
+/* DA9063 and DA9063L regulator IDs */
+enum {
+	/* BUCKs */
+	DA9063_ID_BCORE1,
+	DA9063_ID_BCORE2,
+	DA9063_ID_BPRO,
+	DA9063_ID_BMEM,
+	DA9063_ID_BIO,
+	DA9063_ID_BPERI,
+
+	/* BCORE1 and BCORE2 in merged mode */
+	DA9063_ID_BCORES_MERGED,
+	/* BMEM and BIO in merged mode */
+	DA9063_ID_BMEM_BIO_MERGED,
+	/* When two BUCKs are merged, they cannot be reused separately */
+
+	/* LDOs on both DA9063 and DA9063L */
+	DA9063_ID_LDO3,
+	DA9063_ID_LDO7,
+	DA9063_ID_LDO8,
+	DA9063_ID_LDO9,
+	DA9063_ID_LDO11,
+
+	/* DA9063-only LDOs */
+	DA9063_ID_LDO1,
+	DA9063_ID_LDO2,
+	DA9063_ID_LDO4,
+	DA9063_ID_LDO5,
+	DA9063_ID_LDO6,
+	DA9063_ID_LDO10,
+};
+
+/* Old regulator platform data */
+struct da9063_regulator_data {
+	int				id;
+	struct regulator_init_data	*initdata;
+};
+
+struct da9063_regulators_pdata {
+	unsigned			n_regulators;
+	struct da9063_regulator_data	*regulator_data;
+};
+
 /* Regulator capabilities and registers description */
 struct da9063_regulator_info {
 	struct regulator_desc desc;
diff --git a/include/linux/mfd/da9063/pdata.h b/include/linux/mfd/da9063/pdata.h
index 50bed4f89c1a..21a2d107f0cf 100644
--- a/include/linux/mfd/da9063/pdata.h
+++ b/include/linux/mfd/da9063/pdata.h
@@ -16,55 +16,6 @@
 #ifndef __MFD_DA9063_PDATA_H__
 #define __MFD_DA9063_PDATA_H__
 
-#include <linux/regulator/machine.h>
-
-/*
- * Regulator configuration
- */
-/* DA9063 and DA9063L regulator IDs */
-enum {
-	/* BUCKs */
-	DA9063_ID_BCORE1,
-	DA9063_ID_BCORE2,
-	DA9063_ID_BPRO,
-	DA9063_ID_BMEM,
-	DA9063_ID_BIO,
-	DA9063_ID_BPERI,
-
-	/* BCORE1 and BCORE2 in merged mode */
-	DA9063_ID_BCORES_MERGED,
-	/* BMEM and BIO in merged mode */
-	DA9063_ID_BMEM_BIO_MERGED,
-	/* When two BUCKs are merged, they cannot be reused separately */
-
-	/* LDOs on both DA9063 and DA9063L */
-	DA9063_ID_LDO3,
-	DA9063_ID_LDO7,
-	DA9063_ID_LDO8,
-	DA9063_ID_LDO9,
-	DA9063_ID_LDO11,
-
-	/* DA9063-only LDOs */
-	DA9063_ID_LDO1,
-	DA9063_ID_LDO2,
-	DA9063_ID_LDO4,
-	DA9063_ID_LDO5,
-	DA9063_ID_LDO6,
-	DA9063_ID_LDO10,
-};
-
-/* Regulators platform data */
-struct da9063_regulator_data {
-	int				id;
-	struct regulator_init_data	*initdata;
-};
-
-struct da9063_regulators_pdata {
-	unsigned			n_regulators;
-	struct da9063_regulator_data	*regulator_data;
-};
-
-
 /*
  * RGB LED configuration
  */
-- 
cgit v1.2.3


From ec9964b4803300fb86f8e8fd9b421e59f7a71dc5 Mon Sep 17 00:00:00 2001
From: Lubomir Rintel <lkundrak@v3.sk>
Date: Mon, 13 May 2019 09:56:34 +0200
Subject: Platform: OLPC: Move EC-specific functionality out from x86

Move the olpc-ec driver away from the X86 OLPC platform so that it could be
used by the ARM based laptops too. Notably, the driver for the OLPC battery,
which is also used on the ARM models, builds on this driver's interface.

It is actually platform independent: the OLPC EC commands with their arguments
and responses are mostly the same, even though the delivery mechanism is
different.

Signed-off-by: Lubomir Rintel <lkundrak@v3.sk>
Acked-by: Pavel Machek <pavel@ucw.cz>
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
---
 arch/x86/include/asm/olpc.h         |  31 ----------
 arch/x86/platform/olpc/olpc.c       | 119 ++++++------------------------------
 drivers/platform/olpc/olpc-ec.c     |  99 +++++++++++++++++++++++++++++-
 drivers/power/supply/olpc_battery.c |   1 -
 include/linux/olpc-ec.h             |  32 +++++++++-
 5 files changed, 145 insertions(+), 137 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/include/asm/olpc.h b/arch/x86/include/asm/olpc.h
index c2bf1de5d901..6fe76282aceb 100644
--- a/arch/x86/include/asm/olpc.h
+++ b/arch/x86/include/asm/olpc.h
@@ -9,12 +9,10 @@
 struct olpc_platform_t {
 	int flags;
 	uint32_t boardrev;
-	int ecver;
 };
 
 #define OLPC_F_PRESENT		0x01
 #define OLPC_F_DCON		0x02
-#define OLPC_F_EC_WIDE_SCI	0x04
 
 #ifdef CONFIG_OLPC
 
@@ -64,13 +62,6 @@ static inline int olpc_board_at_least(uint32_t rev)
 	return olpc_platform_info.boardrev >= rev;
 }
 
-extern void olpc_ec_wakeup_set(u16 value);
-extern void olpc_ec_wakeup_clear(u16 value);
-extern bool olpc_ec_wakeup_available(void);
-
-extern int olpc_ec_mask_write(u16 bits);
-extern int olpc_ec_sci_query(u16 *sci_value);
-
 #else
 
 static inline int machine_is_olpc(void)
@@ -83,14 +74,6 @@ static inline int olpc_has_dcon(void)
 	return 0;
 }
 
-static inline void olpc_ec_wakeup_set(u16 value) { }
-static inline void olpc_ec_wakeup_clear(u16 value) { }
-
-static inline bool olpc_ec_wakeup_available(void)
-{
-	return false;
-}
-
 #endif
 
 #ifdef CONFIG_OLPC_XO1_PM
@@ -101,20 +84,6 @@ extern void olpc_xo1_pm_wakeup_clear(u16 value);
 
 extern int pci_olpc_init(void);
 
-/* SCI source values */
-
-#define EC_SCI_SRC_EMPTY	0x00
-#define EC_SCI_SRC_GAME		0x01
-#define EC_SCI_SRC_BATTERY	0x02
-#define EC_SCI_SRC_BATSOC	0x04
-#define EC_SCI_SRC_BATERR	0x08
-#define EC_SCI_SRC_EBOOK	0x10	/* XO-1 only */
-#define EC_SCI_SRC_WLAN		0x20	/* XO-1 only */
-#define EC_SCI_SRC_ACPWR	0x40
-#define EC_SCI_SRC_BATCRIT	0x80
-#define EC_SCI_SRC_GPWAKE	0x100	/* XO-1.5 only */
-#define EC_SCI_SRC_ALL		0x1FF
-
 /* GPIO assignments */
 
 #define OLPC_GPIO_MIC_AC	1
diff --git a/arch/x86/platform/olpc/olpc.c b/arch/x86/platform/olpc/olpc.c
index f0e920fb98ad..c6c62b4f251f 100644
--- a/arch/x86/platform/olpc/olpc.c
+++ b/arch/x86/platform/olpc/olpc.c
@@ -30,9 +30,6 @@
 struct olpc_platform_t olpc_platform_info;
 EXPORT_SYMBOL_GPL(olpc_platform_info);
 
-/* EC event mask to be applied during suspend (defining wakeup sources). */
-static u16 ec_wakeup_mask;
-
 /* what the timeout *should* be (in ms) */
 #define EC_BASE_TIMEOUT 20
 
@@ -186,83 +183,6 @@ err:
 	return ret;
 }
 
-void olpc_ec_wakeup_set(u16 value)
-{
-	ec_wakeup_mask |= value;
-}
-EXPORT_SYMBOL_GPL(olpc_ec_wakeup_set);
-
-void olpc_ec_wakeup_clear(u16 value)
-{
-	ec_wakeup_mask &= ~value;
-}
-EXPORT_SYMBOL_GPL(olpc_ec_wakeup_clear);
-
-/*
- * Returns true if the compile and runtime configurations allow for EC events
- * to wake the system.
- */
-bool olpc_ec_wakeup_available(void)
-{
-	if (!machine_is_olpc())
-		return false;
-
-	/*
-	 * XO-1 EC wakeups are available when olpc-xo1-sci driver is
-	 * compiled in
-	 */
-#ifdef CONFIG_OLPC_XO1_SCI
-	if (olpc_platform_info.boardrev < olpc_board_pre(0xd0)) /* XO-1 */
-		return true;
-#endif
-
-	/*
-	 * XO-1.5 EC wakeups are available when olpc-xo15-sci driver is
-	 * compiled in
-	 */
-#ifdef CONFIG_OLPC_XO15_SCI
-	if (olpc_platform_info.boardrev >= olpc_board_pre(0xd0)) /* XO-1.5 */
-		return true;
-#endif
-
-	return false;
-}
-EXPORT_SYMBOL_GPL(olpc_ec_wakeup_available);
-
-int olpc_ec_mask_write(u16 bits)
-{
-	if (olpc_platform_info.flags & OLPC_F_EC_WIDE_SCI) {
-		__be16 ec_word = cpu_to_be16(bits);
-		return olpc_ec_cmd(EC_WRITE_EXT_SCI_MASK, (void *) &ec_word, 2,
-				   NULL, 0);
-	} else {
-		unsigned char ec_byte = bits & 0xff;
-		return olpc_ec_cmd(EC_WRITE_SCI_MASK, &ec_byte, 1, NULL, 0);
-	}
-}
-EXPORT_SYMBOL_GPL(olpc_ec_mask_write);
-
-int olpc_ec_sci_query(u16 *sci_value)
-{
-	int ret;
-
-	if (olpc_platform_info.flags & OLPC_F_EC_WIDE_SCI) {
-		__be16 ec_word;
-		ret = olpc_ec_cmd(EC_EXT_SCI_QUERY,
-			NULL, 0, (void *) &ec_word, 2);
-		if (ret == 0)
-			*sci_value = be16_to_cpu(ec_word);
-	} else {
-		unsigned char ec_byte;
-		ret = olpc_ec_cmd(EC_SCI_QUERY, NULL, 0, &ec_byte, 1);
-		if (ret == 0)
-			*sci_value = ec_byte;
-	}
-
-	return ret;
-}
-EXPORT_SYMBOL_GPL(olpc_ec_sci_query);
-
 static bool __init check_ofw_architecture(struct device_node *root)
 {
 	const char *olpc_arch;
@@ -296,6 +216,10 @@ static bool __init platform_detect(void)
 	if (success) {
 		olpc_platform_info.boardrev = get_board_revision(root);
 		olpc_platform_info.flags |= OLPC_F_PRESENT;
+
+		pr_info("OLPC board revision %s%X\n",
+			((olpc_platform_info.boardrev & 0xf) < 8) ? "pre" : "",
+			olpc_platform_info.boardrev >> 4);
 	}
 
 	of_node_put(root);
@@ -315,27 +239,8 @@ static int __init add_xo1_platform_devices(void)
 	return PTR_ERR_OR_ZERO(pdev);
 }
 
-static int olpc_xo1_ec_probe(struct platform_device *pdev)
-{
-	/* get the EC revision */
-	olpc_ec_cmd(EC_FIRMWARE_REV, NULL, 0,
-			(unsigned char *) &olpc_platform_info.ecver, 1);
-
-	/* EC version 0x5f adds support for wide SCI mask */
-	if (olpc_platform_info.ecver >= 0x5f)
-		olpc_platform_info.flags |= OLPC_F_EC_WIDE_SCI;
-
-	pr_info("OLPC board revision %s%X (EC=%x)\n",
-			((olpc_platform_info.boardrev & 0xf) < 8) ? "pre" : "",
-			olpc_platform_info.boardrev >> 4,
-			olpc_platform_info.ecver);
-
-	return 0;
-}
 static int olpc_xo1_ec_suspend(struct platform_device *pdev)
 {
-	olpc_ec_mask_write(ec_wakeup_mask);
-
 	/*
 	 * Squelch SCIs while suspended.  This is a fix for
 	 * <http://dev.laptop.org/ticket/1835>.
@@ -359,15 +264,27 @@ static int olpc_xo1_ec_resume(struct platform_device *pdev)
 }
 
 static struct olpc_ec_driver ec_xo1_driver = {
-	.probe = olpc_xo1_ec_probe,
 	.suspend = olpc_xo1_ec_suspend,
 	.resume = olpc_xo1_ec_resume,
 	.ec_cmd = olpc_xo1_ec_cmd,
+#ifdef CONFIG_OLPC_XO1_SCI
+	/*
+	 * XO-1 EC wakeups are available when olpc-xo1-sci driver is
+	 * compiled in
+	 */
+	.wakeup_available = true,
+#endif
 };
 
 static struct olpc_ec_driver ec_xo1_5_driver = {
-	.probe = olpc_xo1_ec_probe,
 	.ec_cmd = olpc_xo1_ec_cmd,
+#ifdef CONFIG_OLPC_XO15_SCI
+	/*
+	 * XO-1.5 EC wakeups are available when olpc-xo15-sci driver is
+	 * compiled in
+	 */
+	.wakeup_available = true,
+#endif
 };
 
 static int __init olpc_init(void)
diff --git a/drivers/platform/olpc/olpc-ec.c b/drivers/platform/olpc/olpc-ec.c
index 981955dce926..2a647455a368 100644
--- a/drivers/platform/olpc/olpc-ec.c
+++ b/drivers/platform/olpc/olpc-ec.c
@@ -32,6 +32,7 @@ struct ec_cmd_desc {
 
 struct olpc_ec_priv {
 	struct olpc_ec_driver *drv;
+	u8 version;
 	struct work_struct worker;
 	struct mutex cmd_lock;
 
@@ -41,6 +42,12 @@ struct olpc_ec_priv {
 
 	struct dentry *dbgfs_dir;
 
+	/*
+	 * EC event mask to be applied during suspend (defining wakeup
+	 * sources).
+	 */
+	u16 ec_wakeup_mask;
+
 	/*
 	 * Running an EC command while suspending means we don't always finish
 	 * the command before the machine suspends.  This means that the EC
@@ -149,6 +156,88 @@ int olpc_ec_cmd(u8 cmd, u8 *inbuf, size_t inlen, u8 *outbuf, size_t outlen)
 }
 EXPORT_SYMBOL_GPL(olpc_ec_cmd);
 
+void olpc_ec_wakeup_set(u16 value)
+{
+	struct olpc_ec_priv *ec = ec_priv;
+
+	if (WARN_ON(!ec))
+		return;
+
+	ec->ec_wakeup_mask |= value;
+}
+EXPORT_SYMBOL_GPL(olpc_ec_wakeup_set);
+
+void olpc_ec_wakeup_clear(u16 value)
+{
+	struct olpc_ec_priv *ec = ec_priv;
+
+	if (WARN_ON(!ec))
+		return;
+
+	ec->ec_wakeup_mask &= ~value;
+}
+EXPORT_SYMBOL_GPL(olpc_ec_wakeup_clear);
+
+int olpc_ec_mask_write(u16 bits)
+{
+	struct olpc_ec_priv *ec = ec_priv;
+
+	if (WARN_ON(!ec))
+		return -ENODEV;
+
+	/* EC version 0x5f adds support for wide SCI mask */
+	if (ec->version >= 0x5f) {
+		__be16 ec_word = cpu_to_be16(bits);
+
+		return olpc_ec_cmd(EC_WRITE_EXT_SCI_MASK, (void *)&ec_word, 2, NULL, 0);
+	} else {
+		u8 ec_byte = bits & 0xff;
+
+		return olpc_ec_cmd(EC_WRITE_SCI_MASK, &ec_byte, 1, NULL, 0);
+	}
+}
+EXPORT_SYMBOL_GPL(olpc_ec_mask_write);
+
+/*
+ * Returns true if the compile and runtime configurations allow for EC events
+ * to wake the system.
+ */
+bool olpc_ec_wakeup_available(void)
+{
+	if (WARN_ON(!ec_driver))
+		return false;
+
+	return ec_driver->wakeup_available;
+}
+EXPORT_SYMBOL_GPL(olpc_ec_wakeup_available);
+
+int olpc_ec_sci_query(u16 *sci_value)
+{
+	struct olpc_ec_priv *ec = ec_priv;
+	int ret;
+
+	if (WARN_ON(!ec))
+		return -ENODEV;
+
+	/* EC version 0x5f adds support for wide SCI mask */
+	if (ec->version >= 0x5f) {
+		__be16 ec_word;
+
+		ret = olpc_ec_cmd(EC_EXT_SCI_QUERY, NULL, 0, (void *)&ec_word, 2);
+		if (ret == 0)
+			*sci_value = be16_to_cpu(ec_word);
+	} else {
+		u8 ec_byte;
+
+		ret = olpc_ec_cmd(EC_SCI_QUERY, NULL, 0, &ec_byte, 1);
+		if (ret == 0)
+			*sci_value = ec_byte;
+	}
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(olpc_ec_sci_query);
+
 #ifdef CONFIG_DEBUG_FS
 
 /*
@@ -276,14 +365,16 @@ static int olpc_ec_probe(struct platform_device *pdev)
 	ec_priv = ec;
 	platform_set_drvdata(pdev, ec);
 
-	err = ec_driver->probe ? ec_driver->probe(pdev) : 0;
+	/* get the EC revision */
+	err = olpc_ec_cmd(EC_FIRMWARE_REV, NULL, 0, &ec->version, 1);
 	if (err) {
 		ec_priv = NULL;
 		kfree(ec);
-	} else {
-		ec->dbgfs_dir = olpc_ec_setup_debugfs();
+		return err;
 	}
 
+	ec->dbgfs_dir = olpc_ec_setup_debugfs();
+
 	return err;
 }
 
@@ -293,6 +384,8 @@ static int olpc_ec_suspend(struct device *dev)
 	struct olpc_ec_priv *ec = platform_get_drvdata(pdev);
 	int err = 0;
 
+	olpc_ec_mask_write(ec->ec_wakeup_mask);
+
 	if (ec_driver->suspend)
 		err = ec_driver->suspend(pdev);
 	if (!err)
diff --git a/drivers/power/supply/olpc_battery.c b/drivers/power/supply/olpc_battery.c
index 7720e4c2ac0b..066ec9a11153 100644
--- a/drivers/power/supply/olpc_battery.c
+++ b/drivers/power/supply/olpc_battery.c
@@ -20,7 +20,6 @@
 #include <linux/jiffies.h>
 #include <linux/sched.h>
 #include <linux/olpc-ec.h>
-#include <asm/olpc.h>
 
 
 #define EC_BAT_VOLTAGE	0x10	/* uint16_t,	*9.76/32,    mV   */
diff --git a/include/linux/olpc-ec.h b/include/linux/olpc-ec.h
index 79bdc6328c52..7fa3d27f7fee 100644
--- a/include/linux/olpc-ec.h
+++ b/include/linux/olpc-ec.h
@@ -16,14 +16,28 @@
 #define EC_SCI_QUERY			0x84
 #define EC_EXT_SCI_QUERY		0x85
 
+/* SCI source values */
+#define EC_SCI_SRC_EMPTY        0x00
+#define EC_SCI_SRC_GAME         0x01
+#define EC_SCI_SRC_BATTERY      0x02
+#define EC_SCI_SRC_BATSOC       0x04
+#define EC_SCI_SRC_BATERR       0x08
+#define EC_SCI_SRC_EBOOK        0x10    /* XO-1 only */
+#define EC_SCI_SRC_WLAN         0x20    /* XO-1 only */
+#define EC_SCI_SRC_ACPWR        0x40
+#define EC_SCI_SRC_BATCRIT      0x80
+#define EC_SCI_SRC_GPWAKE       0x100   /* XO-1.5 only */
+#define EC_SCI_SRC_ALL          0x1FF
+
 struct platform_device;
 
 struct olpc_ec_driver {
-	int (*probe)(struct platform_device *);
 	int (*suspend)(struct platform_device *);
 	int (*resume)(struct platform_device *);
 
 	int (*ec_cmd)(u8, u8 *, size_t, u8 *, size_t, void *);
+
+	bool wakeup_available;
 };
 
 #ifdef CONFIG_OLPC
@@ -33,11 +47,27 @@ extern void olpc_ec_driver_register(struct olpc_ec_driver *drv, void *arg);
 extern int olpc_ec_cmd(u8 cmd, u8 *inbuf, size_t inlen, u8 *outbuf,
 		size_t outlen);
 
+extern void olpc_ec_wakeup_set(u16 value);
+extern void olpc_ec_wakeup_clear(u16 value);
+
+extern int olpc_ec_mask_write(u16 bits);
+extern int olpc_ec_sci_query(u16 *sci_value);
+
+extern bool olpc_ec_wakeup_available(void);
+
 #else
 
 static inline int olpc_ec_cmd(u8 cmd, u8 *inbuf, size_t inlen, u8 *outbuf,
 		size_t outlen) { return -ENODEV; }
 
+static inline void olpc_ec_wakeup_set(u16 value) { }
+static inline void olpc_ec_wakeup_clear(u16 value) { }
+
+static inline bool olpc_ec_wakeup_available(void)
+{
+	return false;
+}
+
 #endif /* CONFIG_OLPC */
 
 #endif /* _LINUX_OLPC_EC_H */
-- 
cgit v1.2.3


From 8097548f3af9ec990169574ad9d874052b78bff8 Mon Sep 17 00:00:00 2001
From: Lubomir Rintel <lkundrak@v3.sk>
Date: Mon, 13 May 2019 09:56:36 +0200
Subject: Platform: OLPC: Use BIT() and GENMASK() for event masks

Just a cosmetic tidy-up.

Signed-off-by: Lubomir Rintel <lkundrak@v3.sk>
Reviewed-by: Andy Shevchenko <andy.shevchenko@gmail.com>
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
---
 include/linux/olpc-ec.h | 23 ++++++++++++-----------
 1 file changed, 12 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/olpc-ec.h b/include/linux/olpc-ec.h
index 7fa3d27f7fee..f7b6a7eda232 100644
--- a/include/linux/olpc-ec.h
+++ b/include/linux/olpc-ec.h
@@ -2,6 +2,8 @@
 #ifndef _LINUX_OLPC_EC_H
 #define _LINUX_OLPC_EC_H
 
+#include <linux/bits.h>
+
 /* XO-1 EC commands */
 #define EC_FIRMWARE_REV			0x08
 #define EC_WRITE_SCI_MASK		0x1b
@@ -17,17 +19,16 @@
 #define EC_EXT_SCI_QUERY		0x85
 
 /* SCI source values */
-#define EC_SCI_SRC_EMPTY        0x00
-#define EC_SCI_SRC_GAME         0x01
-#define EC_SCI_SRC_BATTERY      0x02
-#define EC_SCI_SRC_BATSOC       0x04
-#define EC_SCI_SRC_BATERR       0x08
-#define EC_SCI_SRC_EBOOK        0x10    /* XO-1 only */
-#define EC_SCI_SRC_WLAN         0x20    /* XO-1 only */
-#define EC_SCI_SRC_ACPWR        0x40
-#define EC_SCI_SRC_BATCRIT      0x80
-#define EC_SCI_SRC_GPWAKE       0x100   /* XO-1.5 only */
-#define EC_SCI_SRC_ALL          0x1FF
+#define EC_SCI_SRC_GAME         BIT(0)
+#define EC_SCI_SRC_BATTERY      BIT(1)
+#define EC_SCI_SRC_BATSOC       BIT(2)
+#define EC_SCI_SRC_BATERR       BIT(3)
+#define EC_SCI_SRC_EBOOK        BIT(4)    /* XO-1 only */
+#define EC_SCI_SRC_WLAN         BIT(5)    /* XO-1 only */
+#define EC_SCI_SRC_ACPWR        BIT(6)
+#define EC_SCI_SRC_BATCRIT      BIT(7)
+#define EC_SCI_SRC_GPWAKE       BIT(8)   /* XO-1.5 only */
+#define EC_SCI_SRC_ALL          GENMASK(8, 0)
 
 struct platform_device;
 
-- 
cgit v1.2.3


From 0c3d931b3ab9efeea4948b5373c62095449d0101 Mon Sep 17 00:00:00 2001
From: Lubomir Rintel <lkundrak@v3.sk>
Date: Mon, 13 May 2019 09:56:37 +0200
Subject: Platform: OLPC: Add XO-1.75 EC driver

It's based off the driver from the OLPC kernel sources. Somewhat
modernized and cleaned up, for better or worse.

Modified to plug into the olpc-ec driver infrastructure (so that battery
interface and debugfs could be reused) and the SPI slave framework.

Signed-off-by: Lubomir Rintel <lkundrak@v3.sk>
Reviewed-by: Andy Shevchenko <andy.shevchenko@gmail.com>
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
---
 arch/x86/Kconfig                      |   1 +
 drivers/platform/Kconfig              |   2 +
 drivers/platform/Makefile             |   2 +-
 drivers/platform/olpc/Kconfig         |  14 +
 drivers/platform/olpc/Makefile        |   3 +-
 drivers/platform/olpc/olpc-xo175-ec.c | 752 ++++++++++++++++++++++++++++++++++
 include/linux/olpc-ec.h               |   4 +-
 7 files changed, 774 insertions(+), 4 deletions(-)
 create mode 100644 drivers/platform/olpc/Kconfig
 create mode 100644 drivers/platform/olpc/olpc-xo175-ec.c

(limited to 'include/linux')

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 2bbbd4d1ba31..cb1c073b3c7e 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -2698,6 +2698,7 @@ config OLPC
 	select OF
 	select OF_PROMTREE
 	select IRQ_DOMAIN
+	select OLPC_EC
 	---help---
 	  Add support for detecting the unique features of the OLPC
 	  XO hardware.
diff --git a/drivers/platform/Kconfig b/drivers/platform/Kconfig
index d4c2e424a700..4313d73d3618 100644
--- a/drivers/platform/Kconfig
+++ b/drivers/platform/Kconfig
@@ -10,3 +10,5 @@ source "drivers/platform/goldfish/Kconfig"
 source "drivers/platform/chrome/Kconfig"
 
 source "drivers/platform/mellanox/Kconfig"
+
+source "drivers/platform/olpc/Kconfig"
diff --git a/drivers/platform/Makefile b/drivers/platform/Makefile
index 4b2ce58bcd9c..6fda58c021ca 100644
--- a/drivers/platform/Makefile
+++ b/drivers/platform/Makefile
@@ -6,6 +6,6 @@
 obj-$(CONFIG_X86)		+= x86/
 obj-$(CONFIG_MELLANOX_PLATFORM)	+= mellanox/
 obj-$(CONFIG_MIPS)		+= mips/
-obj-$(CONFIG_OLPC)		+= olpc/
+obj-$(CONFIG_OLPC_EC)		+= olpc/
 obj-$(CONFIG_GOLDFISH)		+= goldfish/
 obj-$(CONFIG_CHROME_PLATFORMS)	+= chrome/
diff --git a/drivers/platform/olpc/Kconfig b/drivers/platform/olpc/Kconfig
new file mode 100644
index 000000000000..559f843199d7
--- /dev/null
+++ b/drivers/platform/olpc/Kconfig
@@ -0,0 +1,14 @@
+config OLPC_EC
+	bool
+
+config OLPC_XO175_EC
+	tristate "OLPC XO 1.75 Embedded Controller"
+	depends on ARCH_MMP || COMPILE_TEST
+	select SPI_SLAVE
+	select OLPC_EC
+	help
+	  Include support for the OLPC XO Embedded Controller (EC). The EC
+	  provides various platform services, including support for the power,
+	  button, restart, shutdown and battery charging status.
+
+	  Unless you have an OLPC XO laptop, you will want to say N.
diff --git a/drivers/platform/olpc/Makefile b/drivers/platform/olpc/Makefile
index dc8b26bc7209..01fe6ba01665 100644
--- a/drivers/platform/olpc/Makefile
+++ b/drivers/platform/olpc/Makefile
@@ -1,4 +1,5 @@
 #
 # OLPC XO platform-specific drivers
 #
-obj-$(CONFIG_OLPC)		+= olpc-ec.o
+obj-$(CONFIG_OLPC_EC)		+= olpc-ec.o
+obj-$(CONFIG_OLPC_XO175_EC)	+= olpc-xo175-ec.o
diff --git a/drivers/platform/olpc/olpc-xo175-ec.c b/drivers/platform/olpc/olpc-xo175-ec.c
new file mode 100644
index 000000000000..344d14f3da54
--- /dev/null
+++ b/drivers/platform/olpc/olpc-xo175-ec.c
@@ -0,0 +1,752 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Driver for the OLPC XO-1.75 Embedded Controller.
+ *
+ * The EC protocol is documented at:
+ * http://wiki.laptop.org/go/XO_1.75_HOST_to_EC_Protocol
+ *
+ * Copyright (C) 2010 One Laptop per Child Foundation.
+ * Copyright (C) 2018 Lubomir Rintel <lkundrak@v3.sk>
+ */
+
+#include <linux/completion.h>
+#include <linux/ctype.h>
+#include <linux/delay.h>
+#include <linux/gpio/consumer.h>
+#include <linux/input.h>
+#include <linux/kfifo.h>
+#include <linux/module.h>
+#include <linux/olpc-ec.h>
+#include <linux/platform_device.h>
+#include <linux/power_supply.h>
+#include <linux/reboot.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/spi/spi.h>
+
+struct ec_cmd_t {
+	u8 cmd;
+	u8 bytes_returned;
+};
+
+enum ec_chan_t {
+	CHAN_NONE = 0,
+	CHAN_SWITCH,
+	CHAN_CMD_RESP,
+	CHAN_KEYBOARD,
+	CHAN_TOUCHPAD,
+	CHAN_EVENT,
+	CHAN_DEBUG,
+	CHAN_CMD_ERROR,
+};
+
+/*
+ * EC events
+ */
+#define EVENT_AC_CHANGE			1  /* AC plugged/unplugged */
+#define EVENT_BATTERY_STATUS		2  /* Battery low/full/error/gone */
+#define EVENT_BATTERY_CRITICAL		3  /* Battery critical voltage */
+#define EVENT_BATTERY_SOC_CHANGE	4  /* 1% SOC Change */
+#define EVENT_BATTERY_ERROR		5  /* Abnormal error, query for cause */
+#define EVENT_POWER_PRESSED		6  /* Power button was pressed */
+#define EVENT_POWER_PRESS_WAKE		7  /* Woken up with a power button */
+#define EVENT_TIMED_HOST_WAKE		8  /* Host wake timer */
+#define EVENT_OLS_HIGH_LIMIT		9  /* OLS crossed dark threshold */
+#define EVENT_OLS_LOW_LIMIT		10 /* OLS crossed light threshold */
+
+/*
+ * EC commands
+ * (from http://dev.laptop.org/git/users/rsmith/ec-1.75/tree/ec_cmd.h)
+ */
+#define CMD_GET_API_VERSION		0x08 /* out: u8 */
+#define CMD_READ_VOLTAGE		0x10 /* out: u16, *9.76/32, mV */
+#define CMD_READ_CURRENT		0x11 /* out: s16, *15.625/120, mA */
+#define CMD_READ_ACR			0x12 /* out: s16, *6250/15, uAh */
+#define CMD_READ_BATT_TEMPERATURE	0x13 /* out: u16, *100/256, deg C */
+#define CMD_READ_AMBIENT_TEMPERATURE	0x14 /* unimplemented, no hardware */
+#define CMD_READ_BATTERY_STATUS		0x15 /* out: u8, bitmask */
+#define CMD_READ_SOC			0x16 /* out: u8, percentage */
+#define CMD_READ_GAUGE_ID		0x17 /* out: u8 * 8 */
+#define CMD_READ_GAUGE_DATA		0x18 /* in: u8 addr, out: u8 data */
+#define CMD_READ_BOARD_ID		0x19 /* out: u16 (platform id) */
+#define CMD_READ_BATT_ERR_CODE		0x1f /* out: u8, error bitmask */
+#define CMD_SET_DCON_POWER		0x26 /* in: u8 */
+#define CMD_RESET_EC			0x28 /* none */
+#define CMD_READ_BATTERY_TYPE		0x2c /* out: u8 */
+#define CMD_SET_AUTOWAK			0x33 /* out: u8 */
+#define CMD_SET_EC_WAKEUP_TIMER		0x36 /* in: u32, out: ? */
+#define CMD_READ_EXT_SCI_MASK		0x37 /* ? */
+#define CMD_WRITE_EXT_SCI_MASK		0x38 /* ? */
+#define CMD_CLEAR_EC_WAKEUP_TIMER	0x39 /* none */
+#define CMD_ENABLE_RUNIN_DISCHARGE	0x3B /* none */
+#define CMD_DISABLE_RUNIN_DISCHARGE	0x3C /* none */
+#define CMD_READ_MPPT_ACTIVE		0x3d /* out: u8 */
+#define CMD_READ_MPPT_LIMIT		0x3e /* out: u8 */
+#define CMD_SET_MPPT_LIMIT		0x3f /* in: u8 */
+#define CMD_DISABLE_MPPT		0x40 /* none */
+#define CMD_ENABLE_MPPT			0x41 /* none */
+#define CMD_READ_VIN			0x42 /* out: u16 */
+#define CMD_EXT_SCI_QUERY		0x43 /* ? */
+#define RSP_KEYBOARD_DATA		0x48 /* ? */
+#define RSP_TOUCHPAD_DATA		0x49 /* ? */
+#define CMD_GET_FW_VERSION		0x4a /* out: u8 * 16 */
+#define CMD_POWER_CYCLE			0x4b /* none */
+#define CMD_POWER_OFF			0x4c /* none */
+#define CMD_RESET_EC_SOFT		0x4d /* none */
+#define CMD_READ_GAUGE_U16		0x4e /* ? */
+#define CMD_ENABLE_MOUSE		0x4f /* ? */
+#define CMD_ECHO			0x52 /* in: u8 * 5, out: u8 * 5 */
+#define CMD_GET_FW_DATE			0x53 /* out: u8 * 16 */
+#define CMD_GET_FW_USER			0x54 /* out: u8 * 16 */
+#define CMD_TURN_OFF_POWER		0x55 /* none (same as 0x4c) */
+#define CMD_READ_OLS			0x56 /* out: u16 */
+#define CMD_OLS_SMT_LEDON		0x57 /* none */
+#define CMD_OLS_SMT_LEDOFF		0x58 /* none */
+#define CMD_START_OLS_ASSY		0x59 /* none */
+#define CMD_STOP_OLS_ASSY		0x5a /* none */
+#define CMD_OLS_SMTTEST_STOP		0x5b /* none */
+#define CMD_READ_VIN_SCALED		0x5c /* out: u16 */
+#define CMD_READ_BAT_MIN_W		0x5d /* out: u16 */
+#define CMD_READ_BAR_MAX_W		0x5e /* out: u16 */
+#define CMD_RESET_BAT_MINMAX_W		0x5f /* none */
+#define CMD_READ_LOCATION		0x60 /* in: u16 addr, out: u8 data */
+#define CMD_WRITE_LOCATION		0x61 /* in: u16 addr, u8 data */
+#define CMD_KEYBOARD_CMD		0x62 /* in: u8, out: ? */
+#define CMD_TOUCHPAD_CMD		0x63 /* in: u8, out: ? */
+#define CMD_GET_FW_HASH			0x64 /* out: u8 * 16 */
+#define CMD_SUSPEND_HINT		0x65 /* in: u8 */
+#define CMD_ENABLE_WAKE_TIMER		0x66 /* in: u8 */
+#define CMD_SET_WAKE_TIMER		0x67 /* in: u32 */
+#define CMD_ENABLE_WAKE_AUTORESET	0x68 /* in: u8 */
+#define CMD_OLS_SET_LIMITS		0x69 /* in: u16, u16 */
+#define CMD_OLS_GET_LIMITS		0x6a /* out: u16, u16 */
+#define CMD_OLS_SET_CEILING		0x6b /* in: u16 */
+#define CMD_OLS_GET_CEILING		0x6c /* out: u16 */
+
+/*
+ * Accepted EC commands, and how many bytes they return. There are plenty
+ * of EC commands that are no longer implemented, or are implemented only on
+ * certain older boards.
+ */
+static const struct ec_cmd_t olpc_xo175_ec_cmds[] = {
+	{ CMD_GET_API_VERSION, 1 },
+	{ CMD_READ_VOLTAGE, 2 },
+	{ CMD_READ_CURRENT, 2 },
+	{ CMD_READ_ACR, 2 },
+	{ CMD_READ_BATT_TEMPERATURE, 2 },
+	{ CMD_READ_BATTERY_STATUS, 1 },
+	{ CMD_READ_SOC, 1 },
+	{ CMD_READ_GAUGE_ID, 8 },
+	{ CMD_READ_GAUGE_DATA, 1 },
+	{ CMD_READ_BOARD_ID, 2 },
+	{ CMD_READ_BATT_ERR_CODE, 1 },
+	{ CMD_SET_DCON_POWER, 0 },
+	{ CMD_RESET_EC, 0 },
+	{ CMD_READ_BATTERY_TYPE, 1 },
+	{ CMD_ENABLE_RUNIN_DISCHARGE, 0 },
+	{ CMD_DISABLE_RUNIN_DISCHARGE, 0 },
+	{ CMD_READ_MPPT_ACTIVE, 1 },
+	{ CMD_READ_MPPT_LIMIT, 1 },
+	{ CMD_SET_MPPT_LIMIT, 0 },
+	{ CMD_DISABLE_MPPT, 0 },
+	{ CMD_ENABLE_MPPT, 0 },
+	{ CMD_READ_VIN, 2 },
+	{ CMD_GET_FW_VERSION, 16 },
+	{ CMD_POWER_CYCLE, 0 },
+	{ CMD_POWER_OFF, 0 },
+	{ CMD_RESET_EC_SOFT, 0 },
+	{ CMD_ECHO, 5 },
+	{ CMD_GET_FW_DATE, 16 },
+	{ CMD_GET_FW_USER, 16 },
+	{ CMD_TURN_OFF_POWER, 0 },
+	{ CMD_READ_OLS, 2 },
+	{ CMD_OLS_SMT_LEDON, 0 },
+	{ CMD_OLS_SMT_LEDOFF, 0 },
+	{ CMD_START_OLS_ASSY, 0 },
+	{ CMD_STOP_OLS_ASSY, 0 },
+	{ CMD_OLS_SMTTEST_STOP, 0 },
+	{ CMD_READ_VIN_SCALED, 2 },
+	{ CMD_READ_BAT_MIN_W, 2 },
+	{ CMD_READ_BAR_MAX_W, 2 },
+	{ CMD_RESET_BAT_MINMAX_W, 0 },
+	{ CMD_READ_LOCATION, 1 },
+	{ CMD_WRITE_LOCATION, 0 },
+	{ CMD_GET_FW_HASH, 16 },
+	{ CMD_SUSPEND_HINT, 0 },
+	{ CMD_ENABLE_WAKE_TIMER, 0 },
+	{ CMD_SET_WAKE_TIMER, 0 },
+	{ CMD_ENABLE_WAKE_AUTORESET, 0 },
+	{ CMD_OLS_SET_LIMITS, 0 },
+	{ CMD_OLS_GET_LIMITS, 4 },
+	{ CMD_OLS_SET_CEILING, 0 },
+	{ CMD_OLS_GET_CEILING, 2 },
+	{ CMD_READ_EXT_SCI_MASK, 2 },
+	{ CMD_WRITE_EXT_SCI_MASK, 0 },
+
+	{ }
+};
+
+#define EC_MAX_CMD_DATA_LEN	5
+#define EC_MAX_RESP_LEN		16
+
+#define LOG_BUF_SIZE		128
+
+#define PM_WAKEUP_TIME		1000
+
+#define EC_ALL_EVENTS		GENMASK(15, 0)
+
+enum ec_state_t {
+	CMD_STATE_IDLE = 0,
+	CMD_STATE_WAITING_FOR_SWITCH,
+	CMD_STATE_CMD_IN_TX_FIFO,
+	CMD_STATE_CMD_SENT,
+	CMD_STATE_RESP_RECEIVED,
+	CMD_STATE_ERROR_RECEIVED,
+};
+
+struct olpc_xo175_ec_cmd {
+	u8 command;
+	u8 nr_args;
+	u8 data_len;
+	u8 args[EC_MAX_CMD_DATA_LEN];
+};
+
+struct olpc_xo175_ec_resp {
+	u8 channel;
+	u8 byte;
+};
+
+struct olpc_xo175_ec {
+	bool suspended;
+
+	/* SPI related stuff. */
+	struct spi_device *spi;
+	struct spi_transfer xfer;
+	struct spi_message msg;
+	union {
+		struct olpc_xo175_ec_cmd cmd;
+		struct olpc_xo175_ec_resp resp;
+	} tx_buf, rx_buf;
+
+	/* GPIO for the CMD signals. */
+	struct gpio_desc *gpio_cmd;
+
+	/* Command handling related state. */
+	spinlock_t cmd_state_lock;
+	int cmd_state;
+	bool cmd_running;
+	struct completion cmd_done;
+	struct olpc_xo175_ec_cmd cmd;
+	u8 resp_data[EC_MAX_RESP_LEN];
+	int expected_resp_len;
+	int resp_len;
+
+	/* Power button. */
+	struct input_dev *pwrbtn;
+
+	/* Debug handling. */
+	char logbuf[LOG_BUF_SIZE];
+	int logbuf_len;
+};
+
+static struct platform_device *olpc_ec;
+
+static int olpc_xo175_ec_resp_len(u8 cmd)
+{
+	const struct ec_cmd_t *p;
+
+	for (p = olpc_xo175_ec_cmds; p->cmd; p++) {
+		if (p->cmd == cmd)
+			return p->bytes_returned;
+	}
+
+	return -EINVAL;
+}
+
+static void olpc_xo175_ec_flush_logbuf(struct olpc_xo175_ec *priv)
+{
+	dev_dbg(&priv->spi->dev, "got debug string [%*pE]\n",
+				priv->logbuf_len, priv->logbuf);
+	priv->logbuf_len = 0;
+}
+
+static void olpc_xo175_ec_complete(void *arg);
+
+static void olpc_xo175_ec_send_command(struct olpc_xo175_ec *priv, void *cmd,
+								size_t cmdlen)
+{
+	int ret;
+
+	memcpy(&priv->tx_buf, cmd, cmdlen);
+	priv->xfer.len = cmdlen;
+
+	spi_message_init_with_transfers(&priv->msg, &priv->xfer, 1);
+
+	priv->msg.complete = olpc_xo175_ec_complete;
+	priv->msg.context = priv;
+
+	ret = spi_async(priv->spi, &priv->msg);
+	if (ret)
+		dev_err(&priv->spi->dev, "spi_async() failed %d\n", ret);
+}
+
+static void olpc_xo175_ec_read_packet(struct olpc_xo175_ec *priv)
+{
+	u8 nonce[] = {0xA5, 0x5A};
+
+	olpc_xo175_ec_send_command(priv, nonce, sizeof(nonce));
+}
+
+static void olpc_xo175_ec_complete(void *arg)
+{
+	struct olpc_xo175_ec *priv = arg;
+	struct device *dev = &priv->spi->dev;
+	struct power_supply *psy;
+	unsigned long flags;
+	u8 channel;
+	u8 byte;
+	int ret;
+
+	ret = priv->msg.status;
+	if (ret) {
+		dev_err(dev, "SPI transfer failed: %d\n", ret);
+
+		spin_lock_irqsave(&priv->cmd_state_lock, flags);
+		if (priv->cmd_running) {
+			priv->resp_len = 0;
+			priv->cmd_state = CMD_STATE_ERROR_RECEIVED;
+			complete(&priv->cmd_done);
+		}
+		spin_unlock_irqrestore(&priv->cmd_state_lock, flags);
+
+		if (ret != -EINTR)
+			olpc_xo175_ec_read_packet(priv);
+
+		return;
+	}
+
+	channel = priv->rx_buf.resp.channel;
+	byte = priv->rx_buf.resp.byte;
+
+	switch (channel) {
+	case CHAN_NONE:
+		spin_lock_irqsave(&priv->cmd_state_lock, flags);
+
+		if (!priv->cmd_running) {
+			/* We can safely ignore these */
+			dev_err(dev, "spurious FIFO read packet\n");
+			spin_unlock_irqrestore(&priv->cmd_state_lock, flags);
+			return;
+		}
+
+		priv->cmd_state = CMD_STATE_CMD_SENT;
+		if (!priv->expected_resp_len)
+			complete(&priv->cmd_done);
+		olpc_xo175_ec_read_packet(priv);
+
+		spin_unlock_irqrestore(&priv->cmd_state_lock, flags);
+		return;
+
+	case CHAN_SWITCH:
+		spin_lock_irqsave(&priv->cmd_state_lock, flags);
+
+		if (!priv->cmd_running) {
+			/* Just go with the flow */
+			dev_err(dev, "spurious SWITCH packet\n");
+			memset(&priv->cmd, 0, sizeof(priv->cmd));
+			priv->cmd.command = CMD_ECHO;
+		}
+
+		priv->cmd_state = CMD_STATE_CMD_IN_TX_FIFO;
+
+		/* Throw command into TxFIFO */
+		gpiod_set_value_cansleep(priv->gpio_cmd, 0);
+		olpc_xo175_ec_send_command(priv, &priv->cmd, sizeof(priv->cmd));
+
+		spin_unlock_irqrestore(&priv->cmd_state_lock, flags);
+		return;
+
+	case CHAN_CMD_RESP:
+		spin_lock_irqsave(&priv->cmd_state_lock, flags);
+
+		if (!priv->cmd_running) {
+			dev_err(dev, "spurious response packet\n");
+		} else if (priv->resp_len >= priv->expected_resp_len) {
+			dev_err(dev, "too many response packets\n");
+		} else {
+			priv->resp_data[priv->resp_len++] = byte;
+			if (priv->resp_len == priv->expected_resp_len) {
+				priv->cmd_state = CMD_STATE_RESP_RECEIVED;
+				complete(&priv->cmd_done);
+			}
+		}
+
+		spin_unlock_irqrestore(&priv->cmd_state_lock, flags);
+		break;
+
+	case CHAN_CMD_ERROR:
+		spin_lock_irqsave(&priv->cmd_state_lock, flags);
+
+		if (!priv->cmd_running) {
+			dev_err(dev, "spurious cmd error packet\n");
+		} else {
+			priv->resp_data[0] = byte;
+			priv->resp_len = 1;
+			priv->cmd_state = CMD_STATE_ERROR_RECEIVED;
+			complete(&priv->cmd_done);
+		}
+		spin_unlock_irqrestore(&priv->cmd_state_lock, flags);
+		break;
+
+	case CHAN_KEYBOARD:
+		dev_warn(dev, "keyboard is not supported\n");
+		break;
+
+	case CHAN_TOUCHPAD:
+		dev_warn(dev, "touchpad is not supported\n");
+		break;
+
+	case CHAN_EVENT:
+		dev_dbg(dev, "got event %.2x\n", byte);
+		switch (byte) {
+		case EVENT_AC_CHANGE:
+			psy = power_supply_get_by_name("olpc-ac");
+			if (psy) {
+				power_supply_changed(psy);
+				power_supply_put(psy);
+			}
+			break;
+		case EVENT_BATTERY_STATUS:
+		case EVENT_BATTERY_CRITICAL:
+		case EVENT_BATTERY_SOC_CHANGE:
+		case EVENT_BATTERY_ERROR:
+			psy = power_supply_get_by_name("olpc-battery");
+			if (psy) {
+				power_supply_changed(psy);
+				power_supply_put(psy);
+			}
+			break;
+		case EVENT_POWER_PRESSED:
+			input_report_key(priv->pwrbtn, KEY_POWER, 1);
+			input_sync(priv->pwrbtn);
+			input_report_key(priv->pwrbtn, KEY_POWER, 0);
+			input_sync(priv->pwrbtn);
+			/* fall through */
+		case EVENT_POWER_PRESS_WAKE:
+		case EVENT_TIMED_HOST_WAKE:
+			pm_wakeup_event(priv->pwrbtn->dev.parent,
+						PM_WAKEUP_TIME);
+			break;
+		default:
+			dev_dbg(dev, "ignored unknown event %.2x\n", byte);
+			break;
+		}
+		break;
+
+	case CHAN_DEBUG:
+		if (byte == '\n') {
+			olpc_xo175_ec_flush_logbuf(priv);
+		} else if (isprint(byte)) {
+			priv->logbuf[priv->logbuf_len++] = byte;
+			if (priv->logbuf_len == LOG_BUF_SIZE)
+				olpc_xo175_ec_flush_logbuf(priv);
+		}
+		break;
+
+	default:
+		dev_warn(dev, "unknown channel: %d, %.2x\n", channel, byte);
+		break;
+	}
+
+	/* Most non-command packets get the TxFIFO refilled and an ACK. */
+	olpc_xo175_ec_read_packet(priv);
+}
+
+/*
+ * Send one EC command and wait up to 4 s for its complete response.
+ * Returns 0 on success, -EOVERFLOW for oversized input/response, -EBUSY
+ * while suspended, -ETIMEDOUT, or -EREMOTEIO on an error or incomplete reply.
+ * This function is protected with a mutex, so only one instance runs at a
+ * time; we also wait for all response bytes, not just those the caller
+ * requests, so no command starts while old response bytes are incoming.
+ */
+static int olpc_xo175_ec_cmd(u8 cmd, u8 *inbuf, size_t inlen, u8 *resp,
+					size_t resp_len, void *ec_cb_arg)
+{
+	struct olpc_xo175_ec *priv = ec_cb_arg;
+	struct device *dev = &priv->spi->dev;
+	unsigned long flags;
+	size_t nr_bytes;
+	int ret = 0;
+
+	dev_dbg(dev, "CMD %x, %zu bytes expected\n", cmd, resp_len);
+
+	if (inlen > sizeof(priv->cmd.args)) {
+		dev_err(dev, "command len %zu too big!\n", inlen);
+		return -EOVERFLOW;
+	}
+
+	/* Suspending in the middle of an EC command hoses things badly! */
+	if (WARN_ON(priv->suspended))
+		return -EBUSY;
+
+	/* Ensure a valid command and return bytes */
+	ret = olpc_xo175_ec_resp_len(cmd);
+	if (ret < 0) {
+		dev_err_ratelimited(dev, "unknown command 0x%x\n", cmd);
+
+		/*
+		 * Assume the best in our callers, and allow unknown commands
+		 * through. I'm not the charitable type, but it was beaten
+		 * into me. Just maintain a minimum standard of sanity.
+		 */
+		if (resp_len > sizeof(priv->resp_data)) {
+			dev_err(dev, "response too big: %zu!\n", resp_len);
+			return -EOVERFLOW;
+		}
+		nr_bytes = resp_len;
+	} else {
+		nr_bytes = (size_t)ret;
+	}
+	resp_len = min(resp_len, nr_bytes);
+
+	spin_lock_irqsave(&priv->cmd_state_lock, flags);
+
+	/* Initialize the state machine */
+	init_completion(&priv->cmd_done);
+	priv->cmd_running = true;
+	priv->cmd_state = CMD_STATE_WAITING_FOR_SWITCH;
+	memset(&priv->cmd, 0, sizeof(priv->cmd));
+	priv->cmd.command = cmd;
+	priv->cmd.nr_args = inlen;
+	priv->cmd.data_len = 0;
+	memcpy(priv->cmd.args, inbuf, inlen);
+	priv->expected_resp_len = nr_bytes;
+	priv->resp_len = 0;
+
+	/* Tickle the cmd gpio to get things started */
+	gpiod_set_value_cansleep(priv->gpio_cmd, 1);
+
+	spin_unlock_irqrestore(&priv->cmd_state_lock, flags);
+
+	/* The irq handler should do the rest */
+	if (!wait_for_completion_timeout(&priv->cmd_done,
+			msecs_to_jiffies(4000))) {
+		dev_err(dev, "EC cmd error: timeout in STATE %d\n",
+				priv->cmd_state);
+		gpiod_set_value_cansleep(priv->gpio_cmd, 0);
+		spi_slave_abort(priv->spi);
+		olpc_xo175_ec_read_packet(priv);
+		return -ETIMEDOUT;
+	}
+
+	spin_lock_irqsave(&priv->cmd_state_lock, flags);
+
+	/* Deal with the results. */
+	if (priv->cmd_state == CMD_STATE_ERROR_RECEIVED) {
+		/* EC-provided error is in the single response byte */
+		dev_err(dev, "command 0x%x returned error 0x%x\n",
+						cmd, priv->resp_data[0]);
+		ret = -EREMOTEIO;
+	} else if (priv->resp_len != nr_bytes) {
+		dev_err(dev, "command 0x%x returned %d bytes, expected %zu bytes\n",
+						cmd, priv->resp_len, nr_bytes);
+		ret = -EREMOTEIO;
+	} else {
+		/*
+		 * priv->resp_data may hold more than was asked for; copy only
+		 * resp_len bytes (resp_len <= priv->resp_len == nr_bytes) and
+		 * drop the length that the command lookup left in ret.
+		 */
+		memcpy(resp, priv->resp_data, resp_len);
+		ret = 0;
+	}
+
+	/* This should already be low, but just in case. */
+	gpiod_set_value_cansleep(priv->gpio_cmd, 0);
+	priv->cmd_running = false;
+
+	spin_unlock_irqrestore(&priv->cmd_state_lock, flags);
+
+	return ret;
+}
+
+static int olpc_xo175_ec_set_event_mask(unsigned int mask)
+{
+	u8 args[2];
+
+	args[0] = mask >> 0;
+	args[1] = mask >> 8;
+	return olpc_ec_cmd(CMD_WRITE_EXT_SCI_MASK, args, 2, NULL, 0);
+}
+
+static void olpc_xo175_ec_power_off(void)
+{
+	while (1) {
+		olpc_ec_cmd(CMD_POWER_OFF, NULL, 0, NULL, 0);
+		mdelay(1000);
+	}
+}
+
+static int __maybe_unused olpc_xo175_ec_suspend(struct device *dev)
+{
+	struct olpc_xo175_ec *priv = dev_get_drvdata(dev);
+	static struct {
+		u8 suspend;
+		u32 suspend_count;
+	} __packed hintargs;
+	static unsigned int suspend_count;
+
+	/*
+	 * SOC_SLEEP is not wired to the EC on B3 and earlier boards.
+	 * This command lets the EC know instead. The suspend count doesn't seem
+	 * to be used anywhere but in the EC debug output.
+	 */
+	hintargs.suspend = 1;
+	hintargs.suspend_count = suspend_count++;
+	olpc_ec_cmd(CMD_SUSPEND_HINT, (void *)&hintargs, sizeof(hintargs),
+								NULL, 0);
+
+	/*
+	 * After we've sent the suspend hint, don't allow further EC commands
+	 * to be run until we've resumed. Userspace tasks should be frozen,
+	 * but kernel threads and interrupts could still schedule EC commands.
+	 */
+	priv->suspended = true;
+
+	return 0;
+}
+
+static int __maybe_unused olpc_xo175_ec_resume_noirq(struct device *dev)
+{
+	struct olpc_xo175_ec *priv = dev_get_drvdata(dev);
+
+	priv->suspended = false;
+
+	return 0;
+}
+
+static int __maybe_unused olpc_xo175_ec_resume(struct device *dev)
+{
+	u8 x = 0;
+
+	/*
+	 * The resume hint is only needed if no other commands are
+	 * being sent during resume. all it does is tell the EC
+	 * the SoC is definitely awake.
+	 */
+	olpc_ec_cmd(CMD_SUSPEND_HINT, &x, 1, NULL, 0);
+
+	/* Enable all EC events while we're awake */
+	olpc_xo175_ec_set_event_mask(EC_ALL_EVENTS);
+
+	return 0;
+}
+
+static struct olpc_ec_driver olpc_xo175_ec_driver = {
+	.ec_cmd = olpc_xo175_ec_cmd,
+};
+
+static int olpc_xo175_ec_remove(struct spi_device *spi)
+{
+	if (pm_power_off == olpc_xo175_ec_power_off)
+		pm_power_off = NULL;
+
+	spi_slave_abort(spi);
+
+	platform_device_unregister(olpc_ec);
+	olpc_ec = NULL;
+
+	return 0;
+}
+
+static int olpc_xo175_ec_probe(struct spi_device *spi)
+{
+	struct olpc_xo175_ec *priv;
+	int ret;
+
+	if (olpc_ec) {
+		dev_err(&spi->dev, "OLPC EC already registered.\n");
+		return -EBUSY;
+	}
+
+	priv = devm_kzalloc(&spi->dev, sizeof(*priv), GFP_KERNEL);
+	if (!priv)
+		return -ENOMEM;
+
+	priv->gpio_cmd = devm_gpiod_get(&spi->dev, "cmd", GPIOD_OUT_LOW);
+	if (IS_ERR(priv->gpio_cmd)) {
+		dev_err(&spi->dev, "failed to get cmd gpio: %ld\n",
+					PTR_ERR(priv->gpio_cmd));
+		return PTR_ERR(priv->gpio_cmd);
+	}
+
+	priv->spi = spi;
+
+	spin_lock_init(&priv->cmd_state_lock);
+	priv->cmd_state = CMD_STATE_IDLE;
+	init_completion(&priv->cmd_done);
+
+	priv->logbuf_len = 0;
+
+	/* Set up power button input device */
+	priv->pwrbtn = devm_input_allocate_device(&spi->dev);
+	if (!priv->pwrbtn)
+		return -ENOMEM;
+	priv->pwrbtn->name = "Power Button";
+	priv->pwrbtn->dev.parent = &spi->dev;
+	input_set_capability(priv->pwrbtn, EV_KEY, KEY_POWER);
+	ret = input_register_device(priv->pwrbtn);
+	if (ret) {
+		dev_err(&spi->dev, "error registering input device: %d\n", ret);
+		return ret;
+	}
+
+	spi_set_drvdata(spi, priv);
+
+	priv->xfer.rx_buf = &priv->rx_buf;
+	priv->xfer.tx_buf = &priv->tx_buf;
+
+	olpc_xo175_ec_read_packet(priv);
+
+	olpc_ec_driver_register(&olpc_xo175_ec_driver, priv);
+	olpc_ec = platform_device_register_resndata(&spi->dev, "olpc-ec", -1,
+							NULL, 0, NULL, 0);
+
+	/* Enable all EC events while we're awake */
+	olpc_xo175_ec_set_event_mask(EC_ALL_EVENTS);
+
+	if (pm_power_off == NULL)
+		pm_power_off = olpc_xo175_ec_power_off;
+
+	dev_info(&spi->dev, "OLPC XO-1.75 Embedded Controller driver\n");
+
+	return 0;
+}
+
+static const struct dev_pm_ops olpc_xo175_ec_pm_ops = {
+	SET_NOIRQ_SYSTEM_SLEEP_PM_OPS(NULL, olpc_xo175_ec_resume_noirq)
+	SET_RUNTIME_PM_OPS(olpc_xo175_ec_suspend, olpc_xo175_ec_resume, NULL)
+};
+
+static const struct of_device_id olpc_xo175_ec_of_match[] = {
+	{ .compatible = "olpc,xo1.75-ec" },
+	{ }
+};
+MODULE_DEVICE_TABLE(of, olpc_xo175_ec_of_match);
+
+static struct spi_driver olpc_xo175_ec_spi_driver = {
+	.driver = {
+		.name	= "olpc-xo175-ec",
+		.of_match_table = olpc_xo175_ec_of_match,
+		.pm = &olpc_xo175_ec_pm_ops,
+	},
+	.probe		= olpc_xo175_ec_probe,
+	.remove		= olpc_xo175_ec_remove,
+};
+module_spi_driver(olpc_xo175_ec_spi_driver);
+
+MODULE_DESCRIPTION("OLPC XO-1.75 Embedded Controller driver");
+MODULE_AUTHOR("Lennert Buytenhek <buytenh@wantstofly.org>"); /* Functionality */
+MODULE_AUTHOR("Lubomir Rintel <lkundrak@v3.sk>"); /* Bugs */
+MODULE_LICENSE("GPL");
diff --git a/include/linux/olpc-ec.h b/include/linux/olpc-ec.h
index f7b6a7eda232..c4602364e909 100644
--- a/include/linux/olpc-ec.h
+++ b/include/linux/olpc-ec.h
@@ -41,7 +41,7 @@ struct olpc_ec_driver {
 	bool wakeup_available;
 };
 
-#ifdef CONFIG_OLPC
+#ifdef CONFIG_OLPC_EC
 
 extern void olpc_ec_driver_register(struct olpc_ec_driver *drv, void *arg);
 
@@ -69,6 +69,6 @@ static inline bool olpc_ec_wakeup_available(void)
 	return false;
 }
 
-#endif /* CONFIG_OLPC */
+#endif /* CONFIG_OLPC_EC */
 
 #endif /* _LINUX_OLPC_EC_H */
-- 
cgit v1.2.3


From 9a0f780958bbcb85604636fa340e2a1efaa4f432 Mon Sep 17 00:00:00 2001
From: Simon Horman <horms+renesas@verge.net.au>
Date: Mon, 13 May 2019 13:39:51 +0200
Subject: dmaengine: sudmac: remove unused driver

SUDMAC driver was introduced in v3.10 but was never integrated for use
by any platform. As it is unused remove it.

Signed-off-by: Simon Horman <horms+renesas@verge.net.au>
Acked-by: Yoshihiro Shimoda <yoshihiro.shimoda.uh@renesas.com>
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 drivers/dma/sh/Kconfig  |   6 -
 drivers/dma/sh/Makefile |   1 -
 drivers/dma/sh/sudmac.c | 414 ------------------------------------------------
 include/linux/sudmac.h  |  52 ------
 4 files changed, 473 deletions(-)
 delete mode 100644 drivers/dma/sh/sudmac.c
 delete mode 100644 include/linux/sudmac.h

(limited to 'include/linux')

diff --git a/drivers/dma/sh/Kconfig b/drivers/dma/sh/Kconfig
index 4d6b02b3b1f1..54d5d0369d3c 100644
--- a/drivers/dma/sh/Kconfig
+++ b/drivers/dma/sh/Kconfig
@@ -47,9 +47,3 @@ config RENESAS_USB_DMAC
 	help
 	  This driver supports the USB-DMA controller found in the Renesas
 	  SoCs.
-
-config SUDMAC
-	tristate "Renesas SUDMAC support"
-	depends on SH_DMAE_BASE
-	help
-	  Enable support for the Renesas SUDMAC controllers.
diff --git a/drivers/dma/sh/Makefile b/drivers/dma/sh/Makefile
index 42110dd57a56..112fbd22bb3f 100644
--- a/drivers/dma/sh/Makefile
+++ b/drivers/dma/sh/Makefile
@@ -15,4 +15,3 @@ obj-$(CONFIG_SH_DMAE) += shdma.o
 
 obj-$(CONFIG_RCAR_DMAC) += rcar-dmac.o
 obj-$(CONFIG_RENESAS_USB_DMAC) += usb-dmac.o
-obj-$(CONFIG_SUDMAC) += sudmac.o
diff --git a/drivers/dma/sh/sudmac.c b/drivers/dma/sh/sudmac.c
deleted file mode 100644
index 30cc3553cb8b..000000000000
--- a/drivers/dma/sh/sudmac.c
+++ /dev/null
@@ -1,414 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Renesas SUDMAC support
- *
- * Copyright (C) 2013 Renesas Solutions Corp.
- *
- * based on drivers/dma/sh/shdma.c:
- * Copyright (C) 2011-2012 Guennadi Liakhovetski <g.liakhovetski@gmx.de>
- * Copyright (C) 2009 Nobuhiro Iwamatsu <iwamatsu.nobuhiro@renesas.com>
- * Copyright (C) 2009 Renesas Solutions, Inc. All rights reserved.
- * Copyright (C) 2007 Freescale Semiconductor, Inc. All rights reserved.
- */
-
-#include <linux/dmaengine.h>
-#include <linux/err.h>
-#include <linux/init.h>
-#include <linux/interrupt.h>
-#include <linux/module.h>
-#include <linux/platform_device.h>
-#include <linux/slab.h>
-#include <linux/sudmac.h>
-
-struct sudmac_chan {
-	struct shdma_chan shdma_chan;
-	void __iomem *base;
-	char dev_id[16];	/* unique name per DMAC of channel */
-
-	u32 offset;		/* for CFG, BA, BBC, CA, CBC, DEN */
-	u32 cfg;
-	u32 dint_end_bit;
-};
-
-struct sudmac_device {
-	struct shdma_dev shdma_dev;
-	struct sudmac_pdata *pdata;
-	void __iomem *chan_reg;
-};
-
-struct sudmac_regs {
-	u32 base_addr;
-	u32 base_byte_count;
-};
-
-struct sudmac_desc {
-	struct sudmac_regs hw;
-	struct shdma_desc shdma_desc;
-};
-
-#define to_chan(schan) container_of(schan, struct sudmac_chan, shdma_chan)
-#define to_desc(sdesc) container_of(sdesc, struct sudmac_desc, shdma_desc)
-#define to_sdev(sc) container_of(sc->shdma_chan.dma_chan.device, \
-				 struct sudmac_device, shdma_dev.dma_dev)
-
-/* SUDMAC register */
-#define SUDMAC_CH0CFG		0x00
-#define SUDMAC_CH0BA		0x10
-#define SUDMAC_CH0BBC		0x18
-#define SUDMAC_CH0CA		0x20
-#define SUDMAC_CH0CBC		0x28
-#define SUDMAC_CH0DEN		0x30
-#define SUDMAC_DSTSCLR		0x38
-#define SUDMAC_DBUFCTRL		0x3C
-#define SUDMAC_DINTCTRL		0x40
-#define SUDMAC_DINTSTS		0x44
-#define SUDMAC_DINTSTSCLR	0x48
-#define SUDMAC_CH0SHCTRL	0x50
-
-/* Definitions for the sudmac_channel.config */
-#define SUDMAC_SENDBUFM	0x1000 /* b12: Transmit Buffer Mode */
-#define SUDMAC_RCVENDM	0x0100 /* b8: Receive Data Transfer End Mode */
-#define SUDMAC_LBA_WAIT	0x0030 /* b5-4: Local Bus Access Wait */
-
-/* Definitions for the sudmac_channel.dint_end_bit */
-#define SUDMAC_CH1ENDE	0x0002 /* b1: Ch1 DMA Transfer End Int Enable */
-#define SUDMAC_CH0ENDE	0x0001 /* b0: Ch0 DMA Transfer End Int Enable */
-
-#define SUDMAC_DRV_NAME "sudmac"
-
-static void sudmac_writel(struct sudmac_chan *sc, u32 data, u32 reg)
-{
-	iowrite32(data, sc->base + reg);
-}
-
-static u32 sudmac_readl(struct sudmac_chan *sc, u32 reg)
-{
-	return ioread32(sc->base + reg);
-}
-
-static bool sudmac_is_busy(struct sudmac_chan *sc)
-{
-	u32 den = sudmac_readl(sc, SUDMAC_CH0DEN + sc->offset);
-
-	if (den)
-		return true; /* working */
-
-	return false; /* waiting */
-}
-
-static void sudmac_set_reg(struct sudmac_chan *sc, struct sudmac_regs *hw,
-			   struct shdma_desc *sdesc)
-{
-	sudmac_writel(sc, sc->cfg, SUDMAC_CH0CFG + sc->offset);
-	sudmac_writel(sc, hw->base_addr, SUDMAC_CH0BA + sc->offset);
-	sudmac_writel(sc, hw->base_byte_count, SUDMAC_CH0BBC + sc->offset);
-}
-
-static void sudmac_start(struct sudmac_chan *sc)
-{
-	u32 dintctrl = sudmac_readl(sc, SUDMAC_DINTCTRL);
-
-	sudmac_writel(sc, dintctrl | sc->dint_end_bit, SUDMAC_DINTCTRL);
-	sudmac_writel(sc, 1, SUDMAC_CH0DEN + sc->offset);
-}
-
-static void sudmac_start_xfer(struct shdma_chan *schan,
-			      struct shdma_desc *sdesc)
-{
-	struct sudmac_chan *sc = to_chan(schan);
-	struct sudmac_desc *sd = to_desc(sdesc);
-
-	sudmac_set_reg(sc, &sd->hw, sdesc);
-	sudmac_start(sc);
-}
-
-static bool sudmac_channel_busy(struct shdma_chan *schan)
-{
-	struct sudmac_chan *sc = to_chan(schan);
-
-	return sudmac_is_busy(sc);
-}
-
-static void sudmac_setup_xfer(struct shdma_chan *schan, int slave_id)
-{
-}
-
-static const struct sudmac_slave_config *sudmac_find_slave(
-	struct sudmac_chan *sc, int slave_id)
-{
-	struct sudmac_device *sdev = to_sdev(sc);
-	struct sudmac_pdata *pdata = sdev->pdata;
-	const struct sudmac_slave_config *cfg;
-	int i;
-
-	for (i = 0, cfg = pdata->slave; i < pdata->slave_num; i++, cfg++)
-		if (cfg->slave_id == slave_id)
-			return cfg;
-
-	return NULL;
-}
-
-static int sudmac_set_slave(struct shdma_chan *schan, int slave_id,
-			    dma_addr_t slave_addr, bool try)
-{
-	struct sudmac_chan *sc = to_chan(schan);
-	const struct sudmac_slave_config *cfg = sudmac_find_slave(sc, slave_id);
-
-	if (!cfg)
-		return -ENODEV;
-
-	return 0;
-}
-
-static inline void sudmac_dma_halt(struct sudmac_chan *sc)
-{
-	u32 dintctrl = sudmac_readl(sc, SUDMAC_DINTCTRL);
-
-	sudmac_writel(sc, 0, SUDMAC_CH0DEN + sc->offset);
-	sudmac_writel(sc, dintctrl & ~sc->dint_end_bit, SUDMAC_DINTCTRL);
-	sudmac_writel(sc, sc->dint_end_bit, SUDMAC_DINTSTSCLR);
-}
-
-static int sudmac_desc_setup(struct shdma_chan *schan,
-			     struct shdma_desc *sdesc,
-			     dma_addr_t src, dma_addr_t dst, size_t *len)
-{
-	struct sudmac_chan *sc = to_chan(schan);
-	struct sudmac_desc *sd = to_desc(sdesc);
-
-	dev_dbg(sc->shdma_chan.dev, "%s: src=%pad, dst=%pad, len=%zu\n",
-		__func__, &src, &dst, *len);
-
-	if (*len > schan->max_xfer_len)
-		*len = schan->max_xfer_len;
-
-	if (dst)
-		sd->hw.base_addr = dst;
-	else if (src)
-		sd->hw.base_addr = src;
-	sd->hw.base_byte_count = *len;
-
-	return 0;
-}
-
-static void sudmac_halt(struct shdma_chan *schan)
-{
-	struct sudmac_chan *sc = to_chan(schan);
-
-	sudmac_dma_halt(sc);
-}
-
-static bool sudmac_chan_irq(struct shdma_chan *schan, int irq)
-{
-	struct sudmac_chan *sc = to_chan(schan);
-	u32 dintsts = sudmac_readl(sc, SUDMAC_DINTSTS);
-
-	if (!(dintsts & sc->dint_end_bit))
-		return false;
-
-	/* DMA stop */
-	sudmac_dma_halt(sc);
-
-	return true;
-}
-
-static size_t sudmac_get_partial(struct shdma_chan *schan,
-				 struct shdma_desc *sdesc)
-{
-	struct sudmac_chan *sc = to_chan(schan);
-	struct sudmac_desc *sd = to_desc(sdesc);
-	u32 current_byte_count = sudmac_readl(sc, SUDMAC_CH0CBC + sc->offset);
-
-	return sd->hw.base_byte_count - current_byte_count;
-}
-
-static bool sudmac_desc_completed(struct shdma_chan *schan,
-				  struct shdma_desc *sdesc)
-{
-	struct sudmac_chan *sc = to_chan(schan);
-	struct sudmac_desc *sd = to_desc(sdesc);
-	u32 current_addr = sudmac_readl(sc, SUDMAC_CH0CA + sc->offset);
-
-	return sd->hw.base_addr + sd->hw.base_byte_count == current_addr;
-}
-
-static int sudmac_chan_probe(struct sudmac_device *su_dev, int id, int irq,
-			     unsigned long flags)
-{
-	struct shdma_dev *sdev = &su_dev->shdma_dev;
-	struct platform_device *pdev = to_platform_device(sdev->dma_dev.dev);
-	struct sudmac_chan *sc;
-	struct shdma_chan *schan;
-	int err;
-
-	sc = devm_kzalloc(&pdev->dev, sizeof(struct sudmac_chan), GFP_KERNEL);
-	if (!sc)
-		return -ENOMEM;
-
-	schan = &sc->shdma_chan;
-	schan->max_xfer_len = 64 * 1024 * 1024 - 1;
-
-	shdma_chan_probe(sdev, schan, id);
-
-	sc->base = su_dev->chan_reg;
-
-	/* get platform_data */
-	sc->offset = su_dev->pdata->channel->offset;
-	if (su_dev->pdata->channel->config & SUDMAC_TX_BUFFER_MODE)
-		sc->cfg |= SUDMAC_SENDBUFM;
-	if (su_dev->pdata->channel->config & SUDMAC_RX_END_MODE)
-		sc->cfg |= SUDMAC_RCVENDM;
-	sc->cfg |= (su_dev->pdata->channel->wait << 4) & SUDMAC_LBA_WAIT;
-
-	if (su_dev->pdata->channel->dint_end_bit & SUDMAC_DMA_BIT_CH0)
-		sc->dint_end_bit |= SUDMAC_CH0ENDE;
-	if (su_dev->pdata->channel->dint_end_bit & SUDMAC_DMA_BIT_CH1)
-		sc->dint_end_bit |= SUDMAC_CH1ENDE;
-
-	/* set up channel irq */
-	if (pdev->id >= 0)
-		snprintf(sc->dev_id, sizeof(sc->dev_id), "sudmac%d.%d",
-			 pdev->id, id);
-	else
-		snprintf(sc->dev_id, sizeof(sc->dev_id), "sudmac%d", id);
-
-	err = shdma_request_irq(schan, irq, flags, sc->dev_id);
-	if (err) {
-		dev_err(sdev->dma_dev.dev,
-			"DMA channel %d request_irq failed %d\n", id, err);
-		goto err_no_irq;
-	}
-
-	return 0;
-
-err_no_irq:
-	/* remove from dmaengine device node */
-	shdma_chan_remove(schan);
-	return err;
-}
-
-static void sudmac_chan_remove(struct sudmac_device *su_dev)
-{
-	struct shdma_chan *schan;
-	int i;
-
-	shdma_for_each_chan(schan, &su_dev->shdma_dev, i) {
-		BUG_ON(!schan);
-
-		shdma_chan_remove(schan);
-	}
-}
-
-static dma_addr_t sudmac_slave_addr(struct shdma_chan *schan)
-{
-	/* SUDMAC doesn't need the address */
-	return 0;
-}
-
-static struct shdma_desc *sudmac_embedded_desc(void *buf, int i)
-{
-	return &((struct sudmac_desc *)buf)[i].shdma_desc;
-}
-
-static const struct shdma_ops sudmac_shdma_ops = {
-	.desc_completed = sudmac_desc_completed,
-	.halt_channel = sudmac_halt,
-	.channel_busy = sudmac_channel_busy,
-	.slave_addr = sudmac_slave_addr,
-	.desc_setup = sudmac_desc_setup,
-	.set_slave = sudmac_set_slave,
-	.setup_xfer = sudmac_setup_xfer,
-	.start_xfer = sudmac_start_xfer,
-	.embedded_desc = sudmac_embedded_desc,
-	.chan_irq = sudmac_chan_irq,
-	.get_partial = sudmac_get_partial,
-};
-
-static int sudmac_probe(struct platform_device *pdev)
-{
-	struct sudmac_pdata *pdata = dev_get_platdata(&pdev->dev);
-	int err, i;
-	struct sudmac_device *su_dev;
-	struct dma_device *dma_dev;
-	struct resource *chan, *irq_res;
-
-	/* get platform data */
-	if (!pdata)
-		return -ENODEV;
-
-	irq_res = platform_get_resource(pdev, IORESOURCE_IRQ, 0);
-	if (!irq_res)
-		return -ENODEV;
-
-	err = -ENOMEM;
-	su_dev = devm_kzalloc(&pdev->dev, sizeof(struct sudmac_device),
-			      GFP_KERNEL);
-	if (!su_dev)
-		return err;
-
-	dma_dev = &su_dev->shdma_dev.dma_dev;
-
-	chan = platform_get_resource(pdev, IORESOURCE_MEM, 0);
-	su_dev->chan_reg = devm_ioremap_resource(&pdev->dev, chan);
-	if (IS_ERR(su_dev->chan_reg))
-		return PTR_ERR(su_dev->chan_reg);
-
-	dma_cap_set(DMA_SLAVE, dma_dev->cap_mask);
-
-	su_dev->shdma_dev.ops = &sudmac_shdma_ops;
-	su_dev->shdma_dev.desc_size = sizeof(struct sudmac_desc);
-	err = shdma_init(&pdev->dev, &su_dev->shdma_dev, pdata->channel_num);
-	if (err < 0)
-		return err;
-
-	/* platform data */
-	su_dev->pdata = dev_get_platdata(&pdev->dev);
-
-	platform_set_drvdata(pdev, su_dev);
-
-	/* Create DMA Channel */
-	for (i = 0; i < pdata->channel_num; i++) {
-		err = sudmac_chan_probe(su_dev, i, irq_res->start, IRQF_SHARED);
-		if (err)
-			goto chan_probe_err;
-	}
-
-	err = dma_async_device_register(&su_dev->shdma_dev.dma_dev);
-	if (err < 0)
-		goto chan_probe_err;
-
-	return err;
-
-chan_probe_err:
-	sudmac_chan_remove(su_dev);
-
-	shdma_cleanup(&su_dev->shdma_dev);
-
-	return err;
-}
-
-static int sudmac_remove(struct platform_device *pdev)
-{
-	struct sudmac_device *su_dev = platform_get_drvdata(pdev);
-	struct dma_device *dma_dev = &su_dev->shdma_dev.dma_dev;
-
-	dma_async_device_unregister(dma_dev);
-	sudmac_chan_remove(su_dev);
-	shdma_cleanup(&su_dev->shdma_dev);
-
-	return 0;
-}
-
-static struct platform_driver sudmac_driver = {
-	.driver		= {
-		.name	= SUDMAC_DRV_NAME,
-	},
-	.probe		= sudmac_probe,
-	.remove		= sudmac_remove,
-};
-module_platform_driver(sudmac_driver);
-
-MODULE_AUTHOR("Yoshihiro Shimoda");
-MODULE_DESCRIPTION("Renesas SUDMAC driver");
-MODULE_LICENSE("GPL v2");
-MODULE_ALIAS("platform:" SUDMAC_DRV_NAME);
diff --git a/include/linux/sudmac.h b/include/linux/sudmac.h
deleted file mode 100644
index 377b8a5788fa..000000000000
--- a/include/linux/sudmac.h
+++ /dev/null
@@ -1,52 +0,0 @@
-/*
- * Header for the SUDMAC driver
- *
- * Copyright (C) 2013 Renesas Solutions Corp.
- *
- * This is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- */
-#ifndef SUDMAC_H
-#define SUDMAC_H
-
-#include <linux/dmaengine.h>
-#include <linux/shdma-base.h>
-#include <linux/types.h>
-
-/* Used by slave DMA clients to request DMA to/from a specific peripheral */
-struct sudmac_slave {
-	struct shdma_slave	shdma_slave;	/* Set by the platform */
-};
-
-/*
- * Supplied by platforms to specify, how a DMA channel has to be configured for
- * a certain peripheral
- */
-struct sudmac_slave_config {
-	int		slave_id;
-};
-
-struct sudmac_channel {
-	unsigned long	offset;
-	unsigned long	config;
-	unsigned long	wait;		/* The configuable range is 0 to 3 */
-	unsigned long	dint_end_bit;
-};
-
-struct sudmac_pdata {
-	const struct sudmac_slave_config *slave;
-	int slave_num;
-	const struct sudmac_channel *channel;
-	int channel_num;
-};
-
-/* Definitions for the sudmac_channel.config */
-#define SUDMAC_TX_BUFFER_MODE	BIT(0)
-#define SUDMAC_RX_END_MODE	BIT(1)
-
-/* Definitions for the sudmac_channel.dint_end_bit */
-#define SUDMAC_DMA_BIT_CH0	BIT(0)
-#define SUDMAC_DMA_BIT_CH1	BIT(1)
-
-#endif
-- 
cgit v1.2.3


From 7e5f7bb08b8cefd3a7e8961861f47fe1f0e830d4 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 20 May 2019 13:44:57 +0100
Subject: unexport simple_dname()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/d_path.c            | 1 -
 fs/internal.h          | 1 +
 include/linux/dcache.h | 1 -
 3 files changed, 1 insertion(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/fs/d_path.c b/fs/d_path.c
index e8fce6b1174f..a7d0a96b35ce 100644
--- a/fs/d_path.c
+++ b/fs/d_path.c
@@ -316,7 +316,6 @@ char *simple_dname(struct dentry *dentry, char *buffer, int buflen)
 		end = ERR_PTR(-ENAMETOOLONG);
 	return end;
 }
-EXPORT_SYMBOL(simple_dname);
 
 /*
  * Write full pathname from the root of the filesystem into the buffer.
diff --git a/fs/internal.h b/fs/internal.h
index 0010889f2e85..1ac2b8f6c621 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -160,6 +160,7 @@ extern int d_set_mounted(struct dentry *dentry);
 extern long prune_dcache_sb(struct super_block *sb, struct shrink_control *sc);
 extern struct dentry *d_alloc_cursor(struct dentry *);
 extern struct dentry * d_alloc_pseudo(struct super_block *, const struct qstr *);
+extern char *simple_dname(struct dentry *, char *, int);
 
 /*
  * read_write.c
diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index f14e587c5d5d..361305ddd75e 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -291,7 +291,6 @@ static inline unsigned d_count(const struct dentry *dentry)
  */
 extern __printf(4, 5)
 char *dynamic_dname(struct dentry *, char *, int, const char *, ...);
-extern char *simple_dname(struct dentry *, char *, int);
 
 extern char *__d_path(const struct path *, const struct path *, char *, int);
 extern char *d_absolute_path(const struct path *, char *, int);
-- 
cgit v1.2.3


From 97a7968448cb0ef5c15e3d395746b108b1a55556 Mon Sep 17 00:00:00 2001
From: Chris Brandt <chris.brandt@renesas.com>
Date: Wed, 15 May 2019 10:20:41 -0500
Subject: usb: renesas_usbhs: move flags to param

Move options from 'flags' field in private structure to param structure
where other options are already being kept.

Signed-off-by: Chris Brandt <chris.brandt@renesas.com>
Reviewed-by: Yoshihiro Shimoda <yoshihiro.shimoda.uh@renesas.com>
Reviewed-by: Simon Horman <horms+renesas@verge.net.au>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/usb/renesas_usbhs/common.c | 23 +++++++----------------
 drivers/usb/renesas_usbhs/common.h |  2 --
 include/linux/usb/renesas_usbhs.h  |  1 +
 3 files changed, 8 insertions(+), 18 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/usb/renesas_usbhs/common.c b/drivers/usb/renesas_usbhs/common.c
index 0ca89de7f842..1de7a44f3415 100644
--- a/drivers/usb/renesas_usbhs/common.c
+++ b/drivers/usb/renesas_usbhs/common.c
@@ -43,15 +43,6 @@
  *			| ....  |	+-----------+
  */
 
-
-#define USBHSF_RUNTIME_PWCTRL	(1 << 0)
-
-/* status */
-#define usbhsc_flags_init(p)   do {(p)->flags = 0; } while (0)
-#define usbhsc_flags_set(p, b) ((p)->flags |=  (b))
-#define usbhsc_flags_clr(p, b) ((p)->flags &= ~(b))
-#define usbhsc_flags_has(p, b) ((p)->flags &   (b))
-
 /*
  * platform call back
  *
@@ -479,7 +470,7 @@ static void usbhsc_hotplug(struct usbhs_priv *priv)
 		dev_dbg(&pdev->dev, "%s enable\n", __func__);
 
 		/* power on */
-		if (usbhsc_flags_has(priv, USBHSF_RUNTIME_PWCTRL))
+		if (usbhs_get_dparam(priv, runtime_pwctrl))
 			usbhsc_power_ctrl(priv, enable);
 
 		/* bus init */
@@ -499,7 +490,7 @@ static void usbhsc_hotplug(struct usbhs_priv *priv)
 		usbhsc_bus_init(priv);
 
 		/* power off */
-		if (usbhsc_flags_has(priv, USBHSF_RUNTIME_PWCTRL))
+		if (usbhs_get_dparam(priv, runtime_pwctrl))
 			usbhsc_power_ctrl(priv, enable);
 
 		usbhs_mod_change(priv, -1);
@@ -733,7 +724,7 @@ static int usbhs_probe(struct platform_device *pdev)
 	/* FIXME */
 	/* runtime power control ? */
 	if (priv->pfunc.get_vbus)
-		usbhsc_flags_set(priv, USBHSF_RUNTIME_PWCTRL);
+		usbhs_get_dparam(priv, runtime_pwctrl) = 1;
 
 	/*
 	 * priv settings
@@ -807,7 +798,7 @@ static int usbhs_probe(struct platform_device *pdev)
 
 	/* power control */
 	pm_runtime_enable(&pdev->dev);
-	if (!usbhsc_flags_has(priv, USBHSF_RUNTIME_PWCTRL)) {
+	if (!usbhs_get_dparam(priv, runtime_pwctrl)) {
 		usbhsc_power_ctrl(priv, 1);
 		usbhs_mod_autonomy_mode(priv);
 	}
@@ -848,7 +839,7 @@ static int usbhs_remove(struct platform_device *pdev)
 	dfunc->notify_hotplug = NULL;
 
 	/* power off */
-	if (!usbhsc_flags_has(priv, USBHSF_RUNTIME_PWCTRL))
+	if (!usbhs_get_dparam(priv, runtime_pwctrl))
 		usbhsc_power_ctrl(priv, 0);
 
 	pm_runtime_disable(&pdev->dev);
@@ -873,7 +864,7 @@ static __maybe_unused int usbhsc_suspend(struct device *dev)
 		usbhs_mod_change(priv, -1);
 	}
 
-	if (mod || !usbhsc_flags_has(priv, USBHSF_RUNTIME_PWCTRL))
+	if (mod || !usbhs_get_dparam(priv, runtime_pwctrl))
 		usbhsc_power_ctrl(priv, 0);
 
 	return 0;
@@ -884,7 +875,7 @@ static __maybe_unused int usbhsc_resume(struct device *dev)
 	struct usbhs_priv *priv = dev_get_drvdata(dev);
 	struct platform_device *pdev = usbhs_priv_to_pdev(priv);
 
-	if (!usbhsc_flags_has(priv, USBHSF_RUNTIME_PWCTRL)) {
+	if (!usbhs_get_dparam(priv, runtime_pwctrl)) {
 		usbhsc_power_ctrl(priv, 1);
 		usbhs_mod_autonomy_mode(priv);
 	}
diff --git a/drivers/usb/renesas_usbhs/common.h b/drivers/usb/renesas_usbhs/common.h
index de1a6638bf68..1fbffb7bbc8f 100644
--- a/drivers/usb/renesas_usbhs/common.h
+++ b/drivers/usb/renesas_usbhs/common.h
@@ -260,8 +260,6 @@ struct usbhs_priv {
 
 	spinlock_t		lock;
 
-	u32 flags;
-
 	/*
 	 * module control
 	 */
diff --git a/include/linux/usb/renesas_usbhs.h b/include/linux/usb/renesas_usbhs.h
index 53924f8e840c..17fae6e504cc 100644
--- a/include/linux/usb/renesas_usbhs.h
+++ b/include/linux/usb/renesas_usbhs.h
@@ -189,6 +189,7 @@ struct renesas_usbhs_driver_param {
 	u32 has_otg:1; /* for controlling PWEN/EXTLP */
 	u32 has_sudmac:1; /* for SUDMAC */
 	u32 has_usb_dmac:1; /* for USB-DMAC */
+	u32 runtime_pwctrl:1;
 #define USBHS_USB_DMAC_XFER_SIZE	32	/* hardcode the xfer size */
 };
 
-- 
cgit v1.2.3


From 2195e3af9079ea067079e98446ea6a457c81a98c Mon Sep 17 00:00:00 2001
From: Chris Brandt <chris.brandt@renesas.com>
Date: Wed, 15 May 2019 10:20:42 -0500
Subject: usb: renesas_usbhs: add support for CNEN bit

For some SoCs, CNEN must be set for USB Device mode operation.

Signed-off-by: Chris Brandt <chris.brandt@renesas.com>
Reviewed-by: Yoshihiro Shimoda <yoshihiro.shimoda.uh@renesas.com>
Reviewed-by: Simon Horman <horms+renesas@verge.net.au>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/usb/renesas_usbhs/common.c | 6 ++++++
 drivers/usb/renesas_usbhs/common.h | 1 +
 include/linux/usb/renesas_usbhs.h  | 1 +
 3 files changed, 8 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/usb/renesas_usbhs/common.c b/drivers/usb/renesas_usbhs/common.c
index 1de7a44f3415..734fb4e542c5 100644
--- a/drivers/usb/renesas_usbhs/common.c
+++ b/drivers/usb/renesas_usbhs/common.c
@@ -114,6 +114,12 @@ void usbhs_sys_function_ctrl(struct usbhs_priv *priv, int enable)
 	u16 mask = DCFM | DRPD | DPRPU | HSE | USBE;
 	u16 val  = HSE | USBE;
 
+	/* CNEN bit is required for function operation */
+	if (usbhs_get_dparam(priv, has_cnen)) {
+		mask |= CNEN;
+		val  |= CNEN;
+	}
+
 	/*
 	 * if enable
 	 *
diff --git a/drivers/usb/renesas_usbhs/common.h b/drivers/usb/renesas_usbhs/common.h
index 1fbffb7bbc8f..de74ebd1a347 100644
--- a/drivers/usb/renesas_usbhs/common.h
+++ b/drivers/usb/renesas_usbhs/common.h
@@ -104,6 +104,7 @@ struct usbhs_priv;
 
 /* SYSCFG */
 #define SCKE	(1 << 10)	/* USB Module Clock Enable */
+#define CNEN	(1 << 8)	/* Single-ended receiver operation Enable */
 #define HSE	(1 << 7)	/* High-Speed Operation Enable */
 #define DCFM	(1 << 6)	/* Controller Function Select */
 #define DRPD	(1 << 5)	/* D+ Line/D- Line Resistance Control */
diff --git a/include/linux/usb/renesas_usbhs.h b/include/linux/usb/renesas_usbhs.h
index 17fae6e504cc..9097a38fcda8 100644
--- a/include/linux/usb/renesas_usbhs.h
+++ b/include/linux/usb/renesas_usbhs.h
@@ -190,6 +190,7 @@ struct renesas_usbhs_driver_param {
 	u32 has_sudmac:1; /* for SUDMAC */
 	u32 has_usb_dmac:1; /* for USB-DMAC */
 	u32 runtime_pwctrl:1;
+	u32 has_cnen:1;
 #define USBHS_USB_DMAC_XFER_SIZE	32	/* hardcode the xfer size */
 };
 
-- 
cgit v1.2.3


From f756066990607dbe8ea5579c925b48e646891f3e Mon Sep 17 00:00:00 2001
From: Chris Brandt <chris.brandt@renesas.com>
Date: Wed, 15 May 2019 10:20:43 -0500
Subject: usb: renesas_usbhs: support byte addressable CFIFO

Some SoCs have a CFIFO register that is byte addressable. This means
when the CFIFO access is set to 32-bit, you can write 8-bit values to
addresses CFIFO+0, CFIFO+1, CFIFO+2, CFIFO+3.

Signed-off-by: Chris Brandt <chris.brandt@renesas.com>
Reviewed-by: Yoshihiro Shimoda <yoshihiro.shimoda.uh@renesas.com>
Reviewed-by: Simon Horman <horms+renesas@verge.net.au>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/usb/renesas_usbhs/fifo.c  | 9 +++++++--
 include/linux/usb/renesas_usbhs.h | 1 +
 2 files changed, 8 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/usb/renesas_usbhs/fifo.c b/drivers/usb/renesas_usbhs/fifo.c
index 39fa2fc1b8b7..452b456ac24e 100644
--- a/drivers/usb/renesas_usbhs/fifo.c
+++ b/drivers/usb/renesas_usbhs/fifo.c
@@ -543,8 +543,13 @@ static int usbhsf_pio_try_push(struct usbhs_pkt *pkt, int *is_done)
 	}
 
 	/* the rest operation */
-	for (i = 0; i < len; i++)
-		iowrite8(buf[i], addr + (0x03 - (i & 0x03)));
+	if (usbhs_get_dparam(priv, cfifo_byte_addr)) {
+		for (i = 0; i < len; i++)
+			iowrite8(buf[i], addr + (i & 0x03));
+	} else {
+		for (i = 0; i < len; i++)
+			iowrite8(buf[i], addr + (0x03 - (i & 0x03)));
+	}
 
 	/*
 	 * variable update
diff --git a/include/linux/usb/renesas_usbhs.h b/include/linux/usb/renesas_usbhs.h
index 9097a38fcda8..87043fd21d54 100644
--- a/include/linux/usb/renesas_usbhs.h
+++ b/include/linux/usb/renesas_usbhs.h
@@ -191,6 +191,7 @@ struct renesas_usbhs_driver_param {
 	u32 has_usb_dmac:1; /* for USB-DMAC */
 	u32 runtime_pwctrl:1;
 	u32 has_cnen:1;
+	u32 cfifo_byte_addr:1; /* CFIFO is byte addressable */
 #define USBHS_USB_DMAC_XFER_SIZE	32	/* hardcode the xfer size */
 };
 
-- 
cgit v1.2.3


From b69dce6341053cd51f3692a2ab3825140fad6ab8 Mon Sep 17 00:00:00 2001
From: Chris Brandt <chris.brandt@renesas.com>
Date: Wed, 15 May 2019 10:20:44 -0500
Subject: usb: renesas_usbhs: Add support for RZ/A2

The RZ/A2 is similar to the R-Car Gen3 with some small differences.

Signed-off-by: Chris Brandt <chris.brandt@renesas.com>
Reviewed-by: Simon Horman <horms+renesas@verge.net.au>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/usb/renesas_usbhs/Makefile |  2 +-
 drivers/usb/renesas_usbhs/common.c | 15 ++++++++
 drivers/usb/renesas_usbhs/rza.h    |  1 +
 drivers/usb/renesas_usbhs/rza2.c   | 72 ++++++++++++++++++++++++++++++++++++++
 include/linux/usb/renesas_usbhs.h  |  1 +
 5 files changed, 90 insertions(+), 1 deletion(-)
 create mode 100644 drivers/usb/renesas_usbhs/rza2.c

(limited to 'include/linux')

diff --git a/drivers/usb/renesas_usbhs/Makefile b/drivers/usb/renesas_usbhs/Makefile
index 5c5b51bb48ef..a1fed56b0957 100644
--- a/drivers/usb/renesas_usbhs/Makefile
+++ b/drivers/usb/renesas_usbhs/Makefile
@@ -5,7 +5,7 @@
 
 obj-$(CONFIG_USB_RENESAS_USBHS)	+= renesas_usbhs.o
 
-renesas_usbhs-y			:= common.o mod.o pipe.o fifo.o rcar2.o rcar3.o rza.o
+renesas_usbhs-y			:= common.o mod.o pipe.o fifo.o rcar2.o rcar3.o rza.o rza2.o
 
 ifneq ($(CONFIG_USB_RENESAS_USBHS_HCD),)
 	renesas_usbhs-y		+= mod_host.o
diff --git a/drivers/usb/renesas_usbhs/common.c b/drivers/usb/renesas_usbhs/common.c
index 734fb4e542c5..c7c9c5d75a56 100644
--- a/drivers/usb/renesas_usbhs/common.c
+++ b/drivers/usb/renesas_usbhs/common.c
@@ -571,6 +571,17 @@ static const struct usbhs_of_data rza1_data = {
 	}
 };
 
+static const struct usbhs_of_data rza2_data = {
+	.platform_callback = &usbhs_rza2_ops,
+	.param = {
+		.type = USBHS_TYPE_RZA2,
+		.has_cnen = 1,
+		.cfifo_byte_addr = 1,
+		.pipe_configs = usbhsc_new_pipe,
+		.pipe_size = ARRAY_SIZE(usbhsc_new_pipe),
+	}
+};
+
 /*
  *		platform functions
  */
@@ -619,6 +630,10 @@ static const struct of_device_id usbhs_of_match[] = {
 		.compatible = "renesas,rza1-usbhs",
 		.data = &rza1_data,
 	},
+	{
+		.compatible = "renesas,rza2-usbhs",
+		.data = &rza2_data,
+	},
 	{ },
 };
 MODULE_DEVICE_TABLE(of, usbhs_of_match);
diff --git a/drivers/usb/renesas_usbhs/rza.h b/drivers/usb/renesas_usbhs/rza.h
index ca917ca54f6d..073a53d1d442 100644
--- a/drivers/usb/renesas_usbhs/rza.h
+++ b/drivers/usb/renesas_usbhs/rza.h
@@ -2,3 +2,4 @@
 #include "common.h"
 
 extern const struct renesas_usbhs_platform_callback usbhs_rza1_ops;
+extern const struct renesas_usbhs_platform_callback usbhs_rza2_ops;
diff --git a/drivers/usb/renesas_usbhs/rza2.c b/drivers/usb/renesas_usbhs/rza2.c
new file mode 100644
index 000000000000..9d8551f93533
--- /dev/null
+++ b/drivers/usb/renesas_usbhs/rza2.c
@@ -0,0 +1,72 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Renesas USB driver RZ/A2 initialization and power control
+ *
+ * Copyright (C) 2019 Chris Brandt
+ * Copyright (C) 2019 Renesas Electronics Corporation
+ */
+
+#include <linux/delay.h>
+#include <linux/io.h>
+#include <linux/of_device.h>
+#include <linux/phy/phy.h>
+#include "common.h"
+#include "rza.h"
+
+static int usbhs_rza2_hardware_init(struct platform_device *pdev)
+{
+	struct usbhs_priv *priv = usbhs_pdev_to_priv(pdev);
+	struct phy *phy = phy_get(&pdev->dev, "usb");
+
+	if (IS_ERR(phy))
+		return PTR_ERR(phy);
+
+	priv->phy = phy;
+	return 0;
+}
+
+static int usbhs_rza2_hardware_exit(struct platform_device *pdev)
+{
+	struct usbhs_priv *priv = usbhs_pdev_to_priv(pdev);
+
+	phy_put(priv->phy);
+	priv->phy = NULL;
+
+	return 0;
+}
+
+static int usbhs_rza2_power_ctrl(struct platform_device *pdev,
+				void __iomem *base, int enable)
+{
+	struct usbhs_priv *priv = usbhs_pdev_to_priv(pdev);
+	int retval = 0;
+
+	if (!priv->phy)
+		return -ENODEV;
+
+	if (enable) {
+		retval = phy_init(priv->phy);
+		usbhs_bset(priv, SUSPMODE, SUSPM, SUSPM);
+		udelay(100);	/* Wait for PLL to become stable */
+		if (!retval)
+			retval = phy_power_on(priv->phy);
+	} else {
+		usbhs_bset(priv, SUSPMODE, SUSPM, 0);
+		phy_power_off(priv->phy);
+		phy_exit(priv->phy);
+	}
+
+	return retval;
+}
+
+static int usbhs_rza2_get_id(struct platform_device *pdev)
+{
+	return USBHS_GADGET;
+}
+
+const struct renesas_usbhs_platform_callback usbhs_rza2_ops = {
+	.hardware_init = usbhs_rza2_hardware_init,
+	.hardware_exit = usbhs_rza2_hardware_exit,
+	.power_ctrl = usbhs_rza2_power_ctrl,
+	.get_id = usbhs_rza2_get_id,
+};
diff --git a/include/linux/usb/renesas_usbhs.h b/include/linux/usb/renesas_usbhs.h
index 87043fd21d54..3f53043fb56b 100644
--- a/include/linux/usb/renesas_usbhs.h
+++ b/include/linux/usb/renesas_usbhs.h
@@ -199,6 +199,7 @@ struct renesas_usbhs_driver_param {
 #define USBHS_TYPE_RCAR_GEN3		2
 #define USBHS_TYPE_RCAR_GEN3_WITH_PLL	3
 #define USBHS_TYPE_RZA1			4
+#define USBHS_TYPE_RZA2			5
 
 /*
  * option:
-- 
cgit v1.2.3


From b48345aafb203803ccda4488cb5409b1ed435c0a Mon Sep 17 00:00:00 2001
From: Richard Guy Briggs <rgb@redhat.com>
Date: Fri, 10 May 2019 12:21:49 -0400
Subject: audit: deliver signal_info regardless of syscall

When a process signals the audit daemon (shutdown, rotate, resume,
reconfig) but syscall auditing is not enabled, we still want to know the
identity of the process sending the signal to the audit daemon.

Move audit_signal_info() out of syscall auditing to general auditing but
create a new function audit_signal_info_syscall() to take care of the
syscall dependent parts for when syscall auditing is enabled.

Please see the github kernel audit issue
https://github.com/linux-audit/audit-kernel/issues/111

Signed-off-by: Richard Guy Briggs <rgb@redhat.com>
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 include/linux/audit.h |  9 +++++++++
 kernel/audit.c        | 27 +++++++++++++++++++++++++++
 kernel/audit.h        |  8 ++++++--
 kernel/auditsc.c      | 19 +++----------------
 kernel/signal.c       |  2 +-
 5 files changed, 46 insertions(+), 19 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/audit.h b/include/linux/audit.h
index 43a23e28ba23..b4078560cb73 100644
--- a/include/linux/audit.h
+++ b/include/linux/audit.h
@@ -196,6 +196,9 @@ static inline unsigned int audit_get_sessionid(struct task_struct *tsk)
 }
 
 extern u32 audit_enabled;
+
+extern int audit_signal_info(int sig, struct task_struct *t);
+
 #else /* CONFIG_AUDIT */
 static inline __printf(4, 5)
 void audit_log(struct audit_context *ctx, gfp_t gfp_mask, int type,
@@ -249,6 +252,12 @@ static inline unsigned int audit_get_sessionid(struct task_struct *tsk)
 }
 
 #define audit_enabled AUDIT_OFF
+
+static inline int audit_signal_info(int sig, struct task_struct *t)
+{
+	return 0;
+}
+
 #endif /* CONFIG_AUDIT */
 
 #ifdef CONFIG_AUDIT_COMPAT_GENERIC
diff --git a/kernel/audit.c b/kernel/audit.c
index b96bf69183f4..67399ff72d43 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -2273,6 +2273,33 @@ out:
 	return rc;
 }
 
+/**
+ * audit_signal_info - record signal info for shutting down audit subsystem
+ * @sig: signal value
+ * @t: task being signaled
+ *
+ * If the audit subsystem is being terminated, record the task (pid)
+ * and uid that is doing that.
+ */
+int audit_signal_info(int sig, struct task_struct *t)
+{
+	kuid_t uid = current_uid(), auid;
+
+	if (auditd_test_task(t) &&
+	    (sig == SIGTERM || sig == SIGHUP ||
+	     sig == SIGUSR1 || sig == SIGUSR2)) {
+		audit_sig_pid = task_tgid_nr(current);
+		auid = audit_get_loginuid(current);
+		if (uid_valid(auid))
+			audit_sig_uid = auid;
+		else
+			audit_sig_uid = uid;
+		security_task_getsecid(current, &audit_sig_sid);
+	}
+
+	return audit_signal_info_syscall(t);
+}
+
 /**
  * audit_log_end - end one audit record
  * @ab: the audit_buffer
diff --git a/kernel/audit.h b/kernel/audit.h
index 2071725a999f..996d94faad43 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -299,7 +299,7 @@ extern const char *audit_tree_path(struct audit_tree *tree);
 extern void audit_put_tree(struct audit_tree *tree);
 extern void audit_kill_trees(struct audit_context *context);
 
-extern int audit_signal_info(int sig, struct task_struct *t);
+extern int audit_signal_info_syscall(struct task_struct *t);
 extern void audit_filter_inodes(struct task_struct *tsk,
 				struct audit_context *ctx);
 extern struct list_head *audit_killed_trees(void);
@@ -330,7 +330,11 @@ extern struct list_head *audit_killed_trees(void);
 #define audit_tree_path(rule) ""	/* never called */
 #define audit_kill_trees(context) BUG()
 
-#define audit_signal_info(s, t) AUDIT_DISABLED
+static inline int audit_signal_info_syscall(struct task_struct *t)
+{
+	return 0;
+}
+
 #define audit_filter_inodes(t, c) AUDIT_DISABLED
 #endif /* CONFIG_AUDITSYSCALL */
 
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 95ae27edd417..30aa07b0115f 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -2360,30 +2360,17 @@ void __audit_ptrace(struct task_struct *t)
 }
 
 /**
- * audit_signal_info - record signal info for shutting down audit subsystem
- * @sig: signal value
+ * audit_signal_info_syscall - record signal info for syscalls
  * @t: task being signaled
  *
  * If the audit subsystem is being terminated, record the task (pid)
  * and uid that is doing that.
  */
-int audit_signal_info(int sig, struct task_struct *t)
+int audit_signal_info_syscall(struct task_struct *t)
 {
 	struct audit_aux_data_pids *axp;
 	struct audit_context *ctx = audit_context();
-	kuid_t uid = current_uid(), auid, t_uid = task_uid(t);
-
-	if (auditd_test_task(t) &&
-	    (sig == SIGTERM || sig == SIGHUP ||
-	     sig == SIGUSR1 || sig == SIGUSR2)) {
-		audit_sig_pid = task_tgid_nr(current);
-		auid = audit_get_loginuid(current);
-		if (uid_valid(auid))
-			audit_sig_uid = auid;
-		else
-			audit_sig_uid = uid;
-		security_task_getsecid(current, &audit_sig_sid);
-	}
+	kuid_t t_uid = task_uid(t);
 
 	if (!audit_signals || audit_dummy_context())
 		return 0;
diff --git a/kernel/signal.c b/kernel/signal.c
index a1eb44dc9ff5..5cfc8611867b 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -44,6 +44,7 @@
 #include <linux/posix-timers.h>
 #include <linux/livepatch.h>
 #include <linux/cgroup.h>
+#include <linux/audit.h>
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/signal.h>
@@ -53,7 +54,6 @@
 #include <asm/unistd.h>
 #include <asm/siginfo.h>
 #include <asm/cacheflush.h>
-#include "audit.h"	/* audit_signal_info() */
 
 /*
  * SLAB caches for signal bits.
-- 
cgit v1.2.3


From 2e21865faf4fd7ca99eb2ace072c6d618059e342 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Wed, 22 May 2019 14:06:51 +0100
Subject: keys: sparse: Fix key_fs[ug]id_changed()

Sparse warnings are incurred by key_fs[ug]id_changed() due to unprotected
accesses of tsk->cred, which is marked __rcu.

Fix this by passing the new cred struct to these functions from
commit_creds() rather than the task pointer.

Signed-off-by: David Howells <dhowells@redhat.com>
Reviewed-by: James Morris <jamorris@linux.microsoft.com>
---
 include/linux/key.h          |  8 ++++----
 kernel/cred.c                |  4 ++--
 security/keys/process_keys.c | 22 ++++++++++------------
 3 files changed, 16 insertions(+), 18 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/key.h b/include/linux/key.h
index 7099985e35a9..1f09aad1c98c 100644
--- a/include/linux/key.h
+++ b/include/linux/key.h
@@ -402,8 +402,8 @@ extern struct ctl_table key_sysctls[];
  * the userspace interface
  */
 extern int install_thread_keyring_to_cred(struct cred *cred);
-extern void key_fsuid_changed(struct task_struct *tsk);
-extern void key_fsgid_changed(struct task_struct *tsk);
+extern void key_fsuid_changed(struct cred *new_cred);
+extern void key_fsgid_changed(struct cred *new_cred);
 extern void key_init(void);
 
 #else /* CONFIG_KEYS */
@@ -418,8 +418,8 @@ extern void key_init(void);
 #define make_key_ref(k, p)		NULL
 #define key_ref_to_ptr(k)		NULL
 #define is_key_possessed(k)		0
-#define key_fsuid_changed(t)		do { } while(0)
-#define key_fsgid_changed(t)		do { } while(0)
+#define key_fsuid_changed(c)		do { } while(0)
+#define key_fsgid_changed(c)		do { } while(0)
 #define key_init()			do { } while(0)
 
 #endif /* CONFIG_KEYS */
diff --git a/kernel/cred.c b/kernel/cred.c
index 45d77284aed0..3bd40de9e192 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -455,9 +455,9 @@ int commit_creds(struct cred *new)
 
 	/* alter the thread keyring */
 	if (!uid_eq(new->fsuid, old->fsuid))
-		key_fsuid_changed(task);
+		key_fsuid_changed(new);
 	if (!gid_eq(new->fsgid, old->fsgid))
-		key_fsgid_changed(task);
+		key_fsgid_changed(new);
 
 	/* do it
 	 * RLIMIT_NPROC limits on user->processes have already been checked
diff --git a/security/keys/process_keys.c b/security/keys/process_keys.c
index f05f7125a7d5..ba5d3172cafe 100644
--- a/security/keys/process_keys.c
+++ b/security/keys/process_keys.c
@@ -293,28 +293,26 @@ static int install_session_keyring(struct key *keyring)
 /*
  * Handle the fsuid changing.
  */
-void key_fsuid_changed(struct task_struct *tsk)
+void key_fsuid_changed(struct cred *new_cred)
 {
 	/* update the ownership of the thread keyring */
-	BUG_ON(!tsk->cred);
-	if (tsk->cred->thread_keyring) {
-		down_write(&tsk->cred->thread_keyring->sem);
-		tsk->cred->thread_keyring->uid = tsk->cred->fsuid;
-		up_write(&tsk->cred->thread_keyring->sem);
+	if (new_cred->thread_keyring) {
+		down_write(&new_cred->thread_keyring->sem);
+		new_cred->thread_keyring->uid = new_cred->fsuid;
+		up_write(&new_cred->thread_keyring->sem);
 	}
 }
 
 /*
  * Handle the fsgid changing.
  */
-void key_fsgid_changed(struct task_struct *tsk)
+void key_fsgid_changed(struct cred *new_cred)
 {
 	/* update the ownership of the thread keyring */
-	BUG_ON(!tsk->cred);
-	if (tsk->cred->thread_keyring) {
-		down_write(&tsk->cred->thread_keyring->sem);
-		tsk->cred->thread_keyring->gid = tsk->cred->fsgid;
-		up_write(&tsk->cred->thread_keyring->sem);
+	if (new_cred->thread_keyring) {
+		down_write(&new_cred->thread_keyring->sem);
+		new_cred->thread_keyring->gid = new_cred->fsgid;
+		up_write(&new_cred->thread_keyring->sem);
 	}
 }
 
-- 
cgit v1.2.3


From f13e143e7444bffc53f5c2904aeed76646da69d6 Mon Sep 17 00:00:00 2001
From: Christian König <christian.koenig@amd.com>
Date: Tue, 3 Jul 2018 16:42:26 +0200
Subject: dma-buf: start caching of sg_table objects v2
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

To allow a smooth transition from pinning buffer objects to dynamic
invalidation we first start to cache the sg_table for an attachment.

v2: keep closer to the DRM implementation

Signed-off-by: Christian König <christian.koenig@amd.com>
Reviewed-by: Daniel Vetter <daniel.vetter@ffwll.ch>
Link: https://patchwork.kernel.org/patch/10943053/
---
 drivers/dma-buf/dma-buf.c | 27 +++++++++++++++++++++++++--
 include/linux/dma-buf.h   | 13 +++++++++++++
 2 files changed, 38 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/dma-buf/dma-buf.c b/drivers/dma-buf/dma-buf.c
index 3ae6c0c2cc02..f4104a21b069 100644
--- a/drivers/dma-buf/dma-buf.c
+++ b/drivers/dma-buf/dma-buf.c
@@ -576,6 +576,7 @@ struct dma_buf_attachment *dma_buf_attach(struct dma_buf *dmabuf,
 	list_add(&attach->node, &dmabuf->attachments);
 
 	mutex_unlock(&dmabuf->lock);
+
 	return attach;
 
 err_attach:
@@ -598,6 +599,9 @@ void dma_buf_detach(struct dma_buf *dmabuf, struct dma_buf_attachment *attach)
 	if (WARN_ON(!dmabuf || !attach))
 		return;
 
+	if (attach->sgt)
+		dmabuf->ops->unmap_dma_buf(attach, attach->sgt, attach->dir);
+
 	mutex_lock(&dmabuf->lock);
 	list_del(&attach->node);
 	if (dmabuf->ops->detach)
@@ -633,10 +637,27 @@ struct sg_table *dma_buf_map_attachment(struct dma_buf_attachment *attach,
 	if (WARN_ON(!attach || !attach->dmabuf))
 		return ERR_PTR(-EINVAL);
 
+	if (attach->sgt) {
+		/*
+		 * Two mappings with different directions for the same
+		 * attachment are not allowed.
+		 */
+		if (attach->dir != direction &&
+		    attach->dir != DMA_BIDIRECTIONAL)
+			return ERR_PTR(-EBUSY);
+
+		return attach->sgt;
+	}
+
 	sg_table = attach->dmabuf->ops->map_dma_buf(attach, direction);
 	if (!sg_table)
 		sg_table = ERR_PTR(-ENOMEM);
 
+	if (!IS_ERR(sg_table) && attach->dmabuf->ops->cache_sgt_mapping) {
+		attach->sgt = sg_table;
+		attach->dir = direction;
+	}
+
 	return sg_table;
 }
 EXPORT_SYMBOL_GPL(dma_buf_map_attachment);
@@ -660,8 +681,10 @@ void dma_buf_unmap_attachment(struct dma_buf_attachment *attach,
 	if (WARN_ON(!attach || !attach->dmabuf || !sg_table))
 		return;
 
-	attach->dmabuf->ops->unmap_dma_buf(attach, sg_table,
-						direction);
+	if (attach->sgt == sg_table)
+		return;
+
+	attach->dmabuf->ops->unmap_dma_buf(attach, sg_table, direction);
 }
 EXPORT_SYMBOL_GPL(dma_buf_unmap_attachment);
 
diff --git a/include/linux/dma-buf.h b/include/linux/dma-buf.h
index a0bd071466fc..8a327566d7f4 100644
--- a/include/linux/dma-buf.h
+++ b/include/linux/dma-buf.h
@@ -44,6 +44,15 @@ struct dma_buf_attachment;
  * @vunmap: [optional] unmaps a vmap from the buffer
  */
 struct dma_buf_ops {
+	/**
+	  * @cache_sgt_mapping:
+	  *
+	  * If true the framework will cache the first mapping made for each
+	  * attachment. This avoids creating mappings for attachments multiple
+	  * times.
+	  */
+	bool cache_sgt_mapping;
+
 	/**
 	 * @attach:
 	 *
@@ -323,6 +332,8 @@ struct dma_buf {
  * @dmabuf: buffer for this attachment.
  * @dev: device attached to the buffer.
  * @node: list of dma_buf_attachment.
+ * @sgt: cached mapping.
+ * @dir: direction of cached mapping.
  * @priv: exporter specific attachment data.
  *
  * This structure holds the attachment information between the dma_buf buffer
@@ -338,6 +349,8 @@ struct dma_buf_attachment {
 	struct dma_buf *dmabuf;
 	struct device *dev;
 	struct list_head node;
+	struct sg_table *sgt;
+	enum dma_data_direction dir;
 	void *priv;
 };
 
-- 
cgit v1.2.3


From fbb5d0353c62d10c3699ec844d2d015a762952d7 Mon Sep 17 00:00:00 2001
From: Uma Shankar <uma.shankar@intel.com>
Date: Thu, 16 May 2019 19:40:06 +0530
Subject: drm: Add HDR source metadata property
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This patch adds a blob property to get HDR metadata
information from userspace. This will be sent as part
of AVI Infoframe to panel.

It also implements get() and set() functions for HDR output
metadata property. The blob data is received from userspace and
saved in connector state, the same is returned as blob in get
property call to userspace.

v2: Rebase and modified the metadata structure elements
as per Ville's POC changes.

v3: No Change

v4: Addressed Shashank's review comments

v5: Rebase.

v6: Addressed Brian Starkey's review comments, defined
new structure with header for dynamic metadata scalability.
Merge get/set property functions for metadata in this patch.

v7: Addressed Jonas Karlman review comments and defined separate
structure for infoframe to better align with CTA 861.G spec. Added
Shashank's RB.

v8: Addressed Ville's review comments. Moved sink metadata structure
out of uapi headers as suggested by Jonas Karlman.

v9: Rebase and addressed Jonas Karlman review comments.

v10: Addressed Ville's review comments, dropped the metdata_changed
state variable as it's not needed anymore.

Signed-off-by: Uma Shankar <uma.shankar@intel.com>
Reviewed-by: Shashank Sharma <shashank.sharma@intel.com>
Signed-off-by: Ville Syrjälä <ville.syrjala@linux.intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/1558015817-12025-2-git-send-email-uma.shankar@intel.com
---
 drivers/gpu/drm/drm_atomic_uapi.c | 12 ++++++++++++
 drivers/gpu/drm/drm_connector.c   |  6 ++++++
 include/drm/drm_connector.h       | 10 ++++++++++
 include/drm/drm_mode_config.h     |  7 +++++++
 include/linux/hdmi.h              | 26 ++++++++++++++++++++++++++
 include/uapi/drm/drm_mode.h       | 23 +++++++++++++++++++++++
 6 files changed, 84 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/gpu/drm/drm_atomic_uapi.c b/drivers/gpu/drm/drm_atomic_uapi.c
index 428d82662dc4..125605ff45af 100644
--- a/drivers/gpu/drm/drm_atomic_uapi.c
+++ b/drivers/gpu/drm/drm_atomic_uapi.c
@@ -676,6 +676,8 @@ static int drm_atomic_connector_set_property(struct drm_connector *connector,
 {
 	struct drm_device *dev = connector->dev;
 	struct drm_mode_config *config = &dev->mode_config;
+	bool replaced = false;
+	int ret;
 
 	if (property == config->prop_crtc_id) {
 		struct drm_crtc *crtc = drm_crtc_find(dev, file_priv, val);
@@ -726,6 +728,13 @@ static int drm_atomic_connector_set_property(struct drm_connector *connector,
 		 */
 		if (state->link_status != DRM_LINK_STATUS_GOOD)
 			state->link_status = val;
+	} else if (property == config->hdr_output_metadata_property) {
+		ret = drm_atomic_replace_property_blob_from_id(dev,
+				&state->hdr_output_metadata,
+				val,
+				sizeof(struct hdr_output_metadata), -1,
+				&replaced);
+		return ret;
 	} else if (property == config->aspect_ratio_property) {
 		state->picture_aspect_ratio = val;
 	} else if (property == config->content_type_property) {
@@ -814,6 +823,9 @@ drm_atomic_connector_get_property(struct drm_connector *connector,
 		*val = state->colorspace;
 	} else if (property == connector->scaling_mode_property) {
 		*val = state->scaling_mode;
+	} else if (property == config->hdr_output_metadata_property) {
+		*val = state->hdr_output_metadata ?
+			state->hdr_output_metadata->base.id : 0;
 	} else if (property == connector->content_protection_property) {
 		*val = state->content_protection;
 	} else if (property == config->writeback_fb_id_property) {
diff --git a/drivers/gpu/drm/drm_connector.c b/drivers/gpu/drm/drm_connector.c
index b34c3d38bf15..365ace0c0c9e 100644
--- a/drivers/gpu/drm/drm_connector.c
+++ b/drivers/gpu/drm/drm_connector.c
@@ -1058,6 +1058,12 @@ int drm_connector_create_standard_properties(struct drm_device *dev)
 		return -ENOMEM;
 	dev->mode_config.non_desktop_property = prop;
 
+	prop = drm_property_create(dev, DRM_MODE_PROP_BLOB,
+				   "HDR_OUTPUT_METADATA", 0);
+	if (!prop)
+		return -ENOMEM;
+	dev->mode_config.hdr_output_metadata_property = prop;
+
 	return 0;
 }
 
diff --git a/include/drm/drm_connector.h b/include/drm/drm_connector.h
index f43f40d5888a..f0e987df4c1e 100644
--- a/include/drm/drm_connector.h
+++ b/include/drm/drm_connector.h
@@ -603,6 +603,12 @@ struct drm_connector_state {
 	 * and the connector bpc limitations obtained from edid.
 	 */
 	u8 max_bpc;
+
+	/**
+	 * @hdr_output_metadata:
+	 * DRM blob property for HDR output metadata
+	 */
+	struct drm_property_blob *hdr_output_metadata;
 };
 
 /**
@@ -1243,6 +1249,10 @@ struct drm_connector {
 	 * &drm_mode_config.connector_free_work.
 	 */
 	struct llist_node free_node;
+
+	/* HDR metadata */
+	struct hdr_output_metadata hdr_output_metadata;
+	struct hdr_sink_metadata hdr_sink_metadata;
 };
 
 #define obj_to_connector(x) container_of(x, struct drm_connector, base)
diff --git a/include/drm/drm_mode_config.h b/include/drm/drm_mode_config.h
index 7f60e8eb269a..c031b5a9d8d1 100644
--- a/include/drm/drm_mode_config.h
+++ b/include/drm/drm_mode_config.h
@@ -836,6 +836,13 @@ struct drm_mode_config {
 	 */
 	struct drm_property *writeback_out_fence_ptr_property;
 
+	/**
+	 * hdr_output_metadata_property: Connector property containing hdr
+	 * metadata. This will be provided by userspace compositors based
+	 * on HDR content
+	 */
+	struct drm_property *hdr_output_metadata_property;
+
 	/* dumb ioctl parameters */
 	uint32_t preferred_depth, prefer_shadow;
 
diff --git a/include/linux/hdmi.h b/include/linux/hdmi.h
index 927ad6451105..6780476dcbff 100644
--- a/include/linux/hdmi.h
+++ b/include/linux/hdmi.h
@@ -152,6 +152,16 @@ enum hdmi_content_type {
 	HDMI_CONTENT_TYPE_GAME,
 };
 
+enum hdmi_metadata_type {
+	HDMI_STATIC_METADATA_TYPE1 = 1,
+};
+
+enum hdmi_eotf {
+	HDMI_EOTF_TRADITIONAL_GAMMA_SDR,
+	HDMI_EOTF_TRADITIONAL_GAMMA_HDR,
+	HDMI_EOTF_SMPTE_ST2084,
+};
+
 struct hdmi_avi_infoframe {
 	enum hdmi_infoframe_type type;
 	unsigned char version;
@@ -320,6 +330,22 @@ struct hdmi_vendor_infoframe {
 	unsigned int s3d_ext_data;
 };
 
+/* HDR Metadata as per 861.G spec */
+struct hdr_static_metadata {
+	__u8 eotf;
+	__u8 metadata_type;
+	__u16 max_cll;
+	__u16 max_fall;
+	__u16 min_cll;
+};
+
+struct hdr_sink_metadata {
+	__u32 metadata_type;
+	union {
+		struct hdr_static_metadata hdmi_type1;
+	};
+};
+
 int hdmi_vendor_infoframe_init(struct hdmi_vendor_infoframe *frame);
 ssize_t hdmi_vendor_infoframe_pack(struct hdmi_vendor_infoframe *frame,
 				   void *buffer, size_t size);
diff --git a/include/uapi/drm/drm_mode.h b/include/uapi/drm/drm_mode.h
index 83cd1636b9be..997a7e05c0c6 100644
--- a/include/uapi/drm/drm_mode.h
+++ b/include/uapi/drm/drm_mode.h
@@ -630,6 +630,29 @@ struct drm_color_lut {
 	__u16 reserved;
 };
 
+/* HDR Metadata Infoframe as per 861.G spec */
+struct hdr_metadata_infoframe {
+	__u8 eotf;
+	__u8 metadata_type;
+	struct {
+		__u16 x, y;
+		} display_primaries[3];
+	struct {
+		__u16 x, y;
+		} white_point;
+	__u16 max_display_mastering_luminance;
+	__u16 min_display_mastering_luminance;
+	__u16 max_cll;
+	__u16 max_fall;
+};
+
+struct hdr_output_metadata {
+	__u32 metadata_type;
+	union {
+		struct hdr_metadata_infoframe hdmi_metadata_type1;
+	};
+};
+
 #define DRM_MODE_PAGE_FLIP_EVENT 0x01
 #define DRM_MODE_PAGE_FLIP_ASYNC 0x02
 #define DRM_MODE_PAGE_FLIP_TARGET_ABSOLUTE 0x4
-- 
cgit v1.2.3


From 2cdbfd66a82969770ce1a7032fb1e2155a08cee8 Mon Sep 17 00:00:00 2001
From: Uma Shankar <uma.shankar@intel.com>
Date: Thu, 16 May 2019 19:40:09 +0530
Subject: drm: Enable HDR infoframe support
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Enable Dynamic Range and Mastering Infoframe for HDR
content, which is defined in CEA 861.3 spec.

The metadata will be computed based on blending
policy in userspace compositors and passed as a connector
property blob to driver. The same will be sent as infoframe
to panels which support HDR.

Added the const version of infoframe for DRM metadata
for HDR.

v2: Rebase and added Ville's POC changes.

v3: No Change

v4: Addressed Shashank's review comments and merged the
patch making drm infoframe function arguments as constant.

v5: Rebase

v6: Fixed checkpatch warnings with --strict option. Addressed
Shashank's review comments and added his RB.

v7: Addressed Brian Starkey's review comments. Merged 2 patches
into one.

v8: Addressed Jonas Karlman review comments.

v9: Addressed Jonas Karlman review comments.

v10: Addressed Ville's review comments.

v11: Added BUILD_BUG_ON and sizeof instead of magic numbers as
per Ville's comments.

Signed-off-by: Uma Shankar <uma.shankar@intel.com>
Reviewed-by: Shashank Sharma <shashank.sharma@intel.com>
Signed-off-by: Ville Syrjälä <ville.syrjala@linux.intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/1558015817-12025-5-git-send-email-uma.shankar@intel.com
---
 drivers/gpu/drm/drm_edid.c |  72 +++++++++++++++++
 drivers/video/hdmi.c       | 190 +++++++++++++++++++++++++++++++++++++++++++++
 include/drm/drm_edid.h     |   5 ++
 include/linux/hdmi.h       |  28 +++++++
 4 files changed, 295 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/gpu/drm/drm_edid.c b/drivers/gpu/drm/drm_edid.c
index a5ef9f45fee0..73560c9437cd 100644
--- a/drivers/gpu/drm/drm_edid.c
+++ b/drivers/gpu/drm/drm_edid.c
@@ -4904,6 +4904,78 @@ static bool is_hdmi2_sink(struct drm_connector *connector)
 		connector->display_info.color_formats & DRM_COLOR_FORMAT_YCRCB420;
 }
 
+static inline bool is_eotf_supported(u8 output_eotf, u8 sink_eotf)
+{
+	return sink_eotf & BIT(output_eotf);
+}
+
+/**
+ * drm_hdmi_infoframe_set_hdr_metadata() - fill an HDMI DRM infoframe with
+ *                                         HDR metadata from userspace
+ * @frame: HDMI DRM infoframe
+ * @conn_state: connector state containing HDR metadata from userspace
+ *
+ * Return: 0 on success or a negative error code on failure.
+ */
+int
+drm_hdmi_infoframe_set_hdr_metadata(struct hdmi_drm_infoframe *frame,
+				    const struct drm_connector_state *conn_state)
+{
+	struct drm_connector *connector;
+	struct hdr_output_metadata *hdr_metadata;
+	int err;
+
+	if (!frame || !conn_state)
+		return -EINVAL;
+
+	connector = conn_state->connector;
+
+	if (!conn_state->hdr_output_metadata)
+		return -EINVAL;
+
+	hdr_metadata = conn_state->hdr_output_metadata->data;
+
+	if (!hdr_metadata || !connector)
+		return -EINVAL;
+
+	/* Sink EOTF is Bit map while infoframe is absolute values */
+	if (!is_eotf_supported(hdr_metadata->hdmi_metadata_type1.eotf,
+	    connector->hdr_sink_metadata.hdmi_type1.eotf)) {
+		DRM_DEBUG_KMS("EOTF Not Supported\n");
+		return -EINVAL;
+	}
+
+	err = hdmi_drm_infoframe_init(frame);
+	if (err < 0)
+		return err;
+
+	frame->eotf = hdr_metadata->hdmi_metadata_type1.eotf;
+	frame->metadata_type = hdr_metadata->hdmi_metadata_type1.metadata_type;
+
+	BUILD_BUG_ON(sizeof(frame->display_primaries) !=
+		     sizeof(hdr_metadata->hdmi_metadata_type1.display_primaries));
+	BUILD_BUG_ON(sizeof(frame->white_point) !=
+		     sizeof(hdr_metadata->hdmi_metadata_type1.white_point));
+
+	memcpy(&frame->display_primaries,
+	       &hdr_metadata->hdmi_metadata_type1.display_primaries,
+	       sizeof(frame->display_primaries));
+
+	memcpy(&frame->white_point,
+	       &hdr_metadata->hdmi_metadata_type1.white_point,
+	       sizeof(frame->white_point));
+
+	frame->max_display_mastering_luminance =
+		hdr_metadata->hdmi_metadata_type1.max_display_mastering_luminance;
+	frame->min_display_mastering_luminance =
+		hdr_metadata->hdmi_metadata_type1.min_display_mastering_luminance;
+	frame->max_fall = hdr_metadata->hdmi_metadata_type1.max_fall;
+	frame->max_cll = hdr_metadata->hdmi_metadata_type1.max_cll;
+
+	return 0;
+}
+EXPORT_SYMBOL(drm_hdmi_infoframe_set_hdr_metadata);
+
 /**
  * drm_hdmi_avi_infoframe_from_display_mode() - fill an HDMI AVI infoframe with
  *                                              data from a DRM display mode
diff --git a/drivers/video/hdmi.c b/drivers/video/hdmi.c
index 799ae49774f5..481f0367dfd3 100644
--- a/drivers/video/hdmi.c
+++ b/drivers/video/hdmi.c
@@ -650,6 +650,150 @@ hdmi_vendor_any_infoframe_check_only(const union hdmi_vendor_any_infoframe *fram
 	return 0;
 }
 
+/**
+ * hdmi_drm_infoframe_init() - initialize an HDMI Dynamic Range and
+ * mastering infoframe
+ * @frame: HDMI DRM infoframe
+ *
+ * Returns 0 on success or a negative error code on failure.
+ */
+int hdmi_drm_infoframe_init(struct hdmi_drm_infoframe *frame)
+{
+	memset(frame, 0, sizeof(*frame));
+
+	frame->type = HDMI_INFOFRAME_TYPE_DRM;
+	frame->version = 1;
+	frame->length = HDMI_DRM_INFOFRAME_SIZE;
+
+	return 0;
+}
+EXPORT_SYMBOL(hdmi_drm_infoframe_init);
+
+static int hdmi_drm_infoframe_check_only(const struct hdmi_drm_infoframe *frame)
+{
+	if (frame->type != HDMI_INFOFRAME_TYPE_DRM ||
+	    frame->version != 1)
+		return -EINVAL;
+
+	if (frame->length != HDMI_DRM_INFOFRAME_SIZE)
+		return -EINVAL;
+
+	return 0;
+}
+
+/**
+ * hdmi_drm_infoframe_check() - check a HDMI DRM infoframe
+ * @frame: HDMI DRM infoframe
+ *
+ * Validates that the infoframe is consistent.
+ * Returns 0 on success or a negative error code on failure.
+ */
+int hdmi_drm_infoframe_check(struct hdmi_drm_infoframe *frame)
+{
+	return hdmi_drm_infoframe_check_only(frame);
+}
+EXPORT_SYMBOL(hdmi_drm_infoframe_check);
+
+/**
+ * hdmi_drm_infoframe_pack_only() - write HDMI DRM infoframe to binary buffer
+ * @frame: HDMI DRM infoframe
+ * @buffer: destination buffer
+ * @size: size of buffer
+ *
+ * Packs the information contained in the @frame structure into a binary
+ * representation that can be written into the corresponding controller
+ * registers. Also computes the checksum as required by section 5.3.5 of
+ * the HDMI 1.4 specification.
+ *
+ * Returns the number of bytes packed into the binary buffer or a negative
+ * error code on failure.
+ */
+ssize_t hdmi_drm_infoframe_pack_only(const struct hdmi_drm_infoframe *frame,
+				     void *buffer, size_t size)
+{
+	u8 *ptr = buffer;
+	size_t length;
+	int i;
+
+	length = HDMI_INFOFRAME_HEADER_SIZE + frame->length;
+
+	if (size < length)
+		return -ENOSPC;
+
+	memset(buffer, 0, size);
+
+	ptr[0] = frame->type;
+	ptr[1] = frame->version;
+	ptr[2] = frame->length;
+	ptr[3] = 0; /* checksum */
+
+	/* start infoframe payload */
+	ptr += HDMI_INFOFRAME_HEADER_SIZE;
+
+	*ptr++ = frame->eotf;
+	*ptr++ = frame->metadata_type;
+
+	for (i = 0; i < 3; i++) {
+		*ptr++ = frame->display_primaries[i].x;
+		*ptr++ = frame->display_primaries[i].x >> 8;
+		*ptr++ = frame->display_primaries[i].y;
+		*ptr++ = frame->display_primaries[i].y >> 8;
+	}
+
+	*ptr++ = frame->white_point.x;
+	*ptr++ = frame->white_point.x >> 8;
+
+	*ptr++ = frame->white_point.y;
+	*ptr++ = frame->white_point.y >> 8;
+
+	*ptr++ = frame->max_display_mastering_luminance;
+	*ptr++ = frame->max_display_mastering_luminance >> 8;
+
+	*ptr++ = frame->min_display_mastering_luminance;
+	*ptr++ = frame->min_display_mastering_luminance >> 8;
+
+	*ptr++ = frame->max_cll;
+	*ptr++ = frame->max_cll >> 8;
+
+	*ptr++ = frame->max_fall;
+	*ptr++ = frame->max_fall >> 8;
+
+	hdmi_infoframe_set_checksum(buffer, length);
+
+	return length;
+}
+EXPORT_SYMBOL(hdmi_drm_infoframe_pack_only);
+
+/**
+ * hdmi_drm_infoframe_pack() - check a HDMI DRM infoframe,
+ *                             and write it to binary buffer
+ * @frame: HDMI DRM infoframe
+ * @buffer: destination buffer
+ * @size: size of buffer
+ *
+ * Validates that the infoframe is consistent and updates derived fields
+ * (eg. length) based on other fields, after which it packs the information
+ * contained in the @frame structure into a binary representation that
+ * can be written into the corresponding controller registers. This function
+ * also computes the checksum as required by section 5.3.5 of the HDMI 1.4
+ * specification.
+ *
+ * Returns the number of bytes packed into the binary buffer or a negative
+ * error code on failure.
+ */
+ssize_t hdmi_drm_infoframe_pack(struct hdmi_drm_infoframe *frame,
+				void *buffer, size_t size)
+{
+	int ret;
+
+	ret = hdmi_drm_infoframe_check(frame);
+	if (ret)
+		return ret;
+
+	return hdmi_drm_infoframe_pack_only(frame, buffer, size);
+}
+EXPORT_SYMBOL(hdmi_drm_infoframe_pack);
+
 /*
  * hdmi_vendor_any_infoframe_check() - check a vendor infoframe
  */
@@ -758,6 +902,10 @@ hdmi_infoframe_pack_only(const union hdmi_infoframe *frame, void *buffer, size_t
 		length = hdmi_avi_infoframe_pack_only(&frame->avi,
 						      buffer, size);
 		break;
+	case HDMI_INFOFRAME_TYPE_DRM:
+		length = hdmi_drm_infoframe_pack_only(&frame->drm,
+						      buffer, size);
+		break;
 	case HDMI_INFOFRAME_TYPE_SPD:
 		length = hdmi_spd_infoframe_pack_only(&frame->spd,
 						      buffer, size);
@@ -806,6 +954,9 @@ hdmi_infoframe_pack(union hdmi_infoframe *frame,
 	case HDMI_INFOFRAME_TYPE_AVI:
 		length = hdmi_avi_infoframe_pack(&frame->avi, buffer, size);
 		break;
+	case HDMI_INFOFRAME_TYPE_DRM:
+		length = hdmi_drm_infoframe_pack(&frame->drm, buffer, size);
+		break;
 	case HDMI_INFOFRAME_TYPE_SPD:
 		length = hdmi_spd_infoframe_pack(&frame->spd, buffer, size);
 		break;
@@ -838,6 +989,8 @@ static const char *hdmi_infoframe_type_get_name(enum hdmi_infoframe_type type)
 		return "Source Product Description (SPD)";
 	case HDMI_INFOFRAME_TYPE_AUDIO:
 		return "Audio";
+	case HDMI_INFOFRAME_TYPE_DRM:
+		return "Dynamic Range and Mastering";
 	}
 	return "Reserved";
 }
@@ -1284,6 +1437,40 @@ static void hdmi_audio_infoframe_log(const char *level,
 			frame->downmix_inhibit ? "Yes" : "No");
 }
 
+/**
+ * hdmi_drm_infoframe_log() - log info of HDMI DRM infoframe
+ * @level: logging level
+ * @dev: device
+ * @frame: HDMI DRM infoframe
+ */
+static void hdmi_drm_infoframe_log(const char *level,
+				   struct device *dev,
+				   const struct hdmi_drm_infoframe *frame)
+{
+	int i;
+
+	hdmi_infoframe_log_header(level, dev,
+				  (struct hdmi_any_infoframe *)frame);
+	hdmi_log("length: %d\n", frame->length);
+	hdmi_log("metadata type: %d\n", frame->metadata_type);
+	hdmi_log("eotf: %d\n", frame->eotf);
+	for (i = 0; i < 3; i++) {
+		hdmi_log("x[%d]: %d\n", i, frame->display_primaries[i].x);
+		hdmi_log("y[%d]: %d\n", i, frame->display_primaries[i].y);
+	}
+
+	hdmi_log("white point x: %d\n", frame->white_point.x);
+	hdmi_log("white point y: %d\n", frame->white_point.y);
+
+	hdmi_log("max_display_mastering_luminance: %d\n",
+		 frame->max_display_mastering_luminance);
+	hdmi_log("min_display_mastering_luminance: %d\n",
+		 frame->min_display_mastering_luminance);
+
+	hdmi_log("max_cll: %d\n", frame->max_cll);
+	hdmi_log("max_fall: %d\n", frame->max_fall);
+}
+
 static const char *
 hdmi_3d_structure_get_name(enum hdmi_3d_structure s3d_struct)
 {
@@ -1372,6 +1559,9 @@ void hdmi_infoframe_log(const char *level,
 	case HDMI_INFOFRAME_TYPE_VENDOR:
 		hdmi_vendor_any_infoframe_log(level, dev, &frame->vendor);
 		break;
+	case HDMI_INFOFRAME_TYPE_DRM:
+		hdmi_drm_infoframe_log(level, dev, &frame->drm);
+		break;
 	}
 }
 EXPORT_SYMBOL(hdmi_infoframe_log);
diff --git a/include/drm/drm_edid.h b/include/drm/drm_edid.h
index 9d3b5b93102c..0e21e91c4314 100644
--- a/include/drm/drm_edid.h
+++ b/include/drm/drm_edid.h
@@ -25,6 +25,7 @@
 
 #include <linux/types.h>
 #include <linux/hdmi.h>
+#include <drm/drm_mode.h>
 
 struct drm_device;
 struct i2c_adapter;
@@ -370,6 +371,10 @@ drm_hdmi_avi_infoframe_quant_range(struct hdmi_avi_infoframe *frame,
 				   const struct drm_display_mode *mode,
 				   enum hdmi_quantization_range rgb_quant_range);
 
+int
+drm_hdmi_infoframe_set_hdr_metadata(struct hdmi_drm_infoframe *frame,
+				    const struct drm_connector_state *conn_state);
+
 /**
  * drm_eld_mnl - Get ELD monitor name length in bytes.
  * @eld: pointer to an eld memory structure with mnl set
diff --git a/include/linux/hdmi.h b/include/linux/hdmi.h
index 6780476dcbff..bcf3c6c3499e 100644
--- a/include/linux/hdmi.h
+++ b/include/linux/hdmi.h
@@ -47,6 +47,7 @@ enum hdmi_infoframe_type {
 	HDMI_INFOFRAME_TYPE_AVI = 0x82,
 	HDMI_INFOFRAME_TYPE_SPD = 0x83,
 	HDMI_INFOFRAME_TYPE_AUDIO = 0x84,
+	HDMI_INFOFRAME_TYPE_DRM = 0x87,
 };
 
 #define HDMI_IEEE_OUI 0x000c03
@@ -55,6 +56,7 @@ enum hdmi_infoframe_type {
 #define HDMI_AVI_INFOFRAME_SIZE    13
 #define HDMI_SPD_INFOFRAME_SIZE    25
 #define HDMI_AUDIO_INFOFRAME_SIZE  10
+#define HDMI_DRM_INFOFRAME_SIZE    26
 
 #define HDMI_INFOFRAME_SIZE(type)	\
 	(HDMI_INFOFRAME_HEADER_SIZE + HDMI_ ## type ## _INFOFRAME_SIZE)
@@ -185,12 +187,37 @@ struct hdmi_avi_infoframe {
 	unsigned short right_bar;
 };
 
+/* DRM Infoframe as per CTA 861.G spec */
+struct hdmi_drm_infoframe {
+	enum hdmi_infoframe_type type;
+	unsigned char version;
+	unsigned char length;
+	enum hdmi_eotf eotf;
+	enum hdmi_metadata_type metadata_type;
+	struct {
+		u16 x, y;
+	} display_primaries[3];
+	struct {
+		u16 x, y;
+	} white_point;
+	u16 max_display_mastering_luminance;
+	u16 min_display_mastering_luminance;
+	u16 max_cll;
+	u16 max_fall;
+};
+
 int hdmi_avi_infoframe_init(struct hdmi_avi_infoframe *frame);
 ssize_t hdmi_avi_infoframe_pack(struct hdmi_avi_infoframe *frame, void *buffer,
 				size_t size);
 ssize_t hdmi_avi_infoframe_pack_only(const struct hdmi_avi_infoframe *frame,
 				     void *buffer, size_t size);
 int hdmi_avi_infoframe_check(struct hdmi_avi_infoframe *frame);
+int hdmi_drm_infoframe_init(struct hdmi_drm_infoframe *frame);
+ssize_t hdmi_drm_infoframe_pack(struct hdmi_drm_infoframe *frame, void *buffer,
+				size_t size);
+ssize_t hdmi_drm_infoframe_pack_only(const struct hdmi_drm_infoframe *frame,
+				     void *buffer, size_t size);
+int hdmi_drm_infoframe_check(struct hdmi_drm_infoframe *frame);
 
 enum hdmi_spd_sdi {
 	HDMI_SPD_SDI_UNKNOWN,
@@ -381,6 +408,7 @@ union hdmi_infoframe {
 	struct hdmi_spd_infoframe spd;
 	union hdmi_vendor_any_infoframe vendor;
 	struct hdmi_audio_infoframe audio;
+	struct hdmi_drm_infoframe drm;
 };
 
 ssize_t hdmi_infoframe_pack(union hdmi_infoframe *frame, void *buffer,
-- 
cgit v1.2.3


From b5e3eed1eeb363c148e2935d9d3c12c30a280de6 Mon Sep 17 00:00:00 2001
From: Ville Syrjälä <ville.syrjala@linux.intel.com>
Date: Thu, 16 May 2019 19:40:12 +0530
Subject: drm: Add HLG EOTF
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add HLG EOTF to the list of supported EOTF transfer functions.
Hybrid Log-Gamma (HLG) is a high dynamic range (HDR) standard.
HLG defines a nonlinear transfer function in which the lower
half of the signal values use a gamma curve and the upper half
of the signal values use a logarithmic curve.

v2: Rebase

v3: Fixed a warning message

v4: Addressed Shashank's review comments

v5: Addressed Jonas Karlman's review comment and dropped the i915
tag from header.

Signed-off-by: Ville Syrjälä <ville.syrjala@linux.intel.com>
Signed-off-by: Uma Shankar <uma.shankar@intel.com>
Reviewed-by: Shashank Sharma <shashank.sharma@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/1558015817-12025-8-git-send-email-uma.shankar@intel.com
---
 drivers/gpu/drm/drm_edid.c | 3 ++-
 include/linux/hdmi.h       | 1 +
 2 files changed, 3 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/gpu/drm/drm_edid.c b/drivers/gpu/drm/drm_edid.c
index 73560c9437cd..262510c2a670 100644
--- a/drivers/gpu/drm/drm_edid.c
+++ b/drivers/gpu/drm/drm_edid.c
@@ -3854,7 +3854,8 @@ static uint8_t eotf_supported(const u8 *edid_ext)
 	return edid_ext[2] &
 		(BIT(HDMI_EOTF_TRADITIONAL_GAMMA_SDR) |
 		 BIT(HDMI_EOTF_TRADITIONAL_GAMMA_HDR) |
-		 BIT(HDMI_EOTF_SMPTE_ST2084));
+		 BIT(HDMI_EOTF_SMPTE_ST2084) |
+		 BIT(HDMI_EOTF_BT_2100_HLG));
 }
 
 static uint8_t hdr_metadata_type(const u8 *edid_ext)
diff --git a/include/linux/hdmi.h b/include/linux/hdmi.h
index bcf3c6c3499e..ee55ba589cdc 100644
--- a/include/linux/hdmi.h
+++ b/include/linux/hdmi.h
@@ -162,6 +162,7 @@ enum hdmi_eotf {
 	HDMI_EOTF_TRADITIONAL_GAMMA_SDR,
 	HDMI_EOTF_TRADITIONAL_GAMMA_HDR,
 	HDMI_EOTF_SMPTE_ST2084,
+	HDMI_EOTF_BT_2100_HLG,
 };
 
 struct hdmi_avi_infoframe {
-- 
cgit v1.2.3


From 70f1b0d34bdf03065fe869e93cc17cad1ea20c4a Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Thu, 7 Feb 2019 19:44:12 -0600
Subject: signal/usb: Replace kill_pid_info_as_cred with kill_pid_usb_asyncio

The usb support for asyncio encoded one of its values in the wrong
field.  It should have used si_value but instead used si_addr which is
not present in the _rt union member of struct siginfo.

The practical result of this is that on a 64bit big endian kernel
when delivering a signal to a 32bit process the si_addr field
is set to NULL, instead of the expected pointer value.

This issue can not be fixed in copy_siginfo_to_user32 as the usb
usage of the _sigfault (aka si_addr) member of the siginfo
union when SI_ASYNCIO is set is incompatible with the POSIX and
glibc usage of the _rt member of the siginfo union.

Therefore replace kill_pid_info_as_cred with kill_pid_usb_asyncio, a
dedicated function for this one specific case.  There are no other
users of kill_pid_info_as_cred so this specialization should have no
impact on the amount of code in the kernel.  Instead of a siginfo_t,
which is difficult and error prone, have kill_pid_usb_asyncio take 3
arguments: a signal number, an errno value, and an address encoded as
a sigval_t.  The encoding of the address as a sigval_t allows the
code that reads the userspace request for a signal to handle this
compat issue along with all of the other compat issues.

Add BUILD_BUG_ONs in kernel/signal.c to ensure that we can now place
the pointer value in si_pid (instead of si_addr).  That is, the
code now verifies that si_pid and si_addr always occur at the same
location.  Further, the code verifies that for native structures a value
placed in si_pid and spilling into si_uid will appear in userspace in
si_addr (on a byte by byte copy of siginfo or a field by field copy of
siginfo).  The code also verifies that for a 64bit kernel and a 32bit
userspace the 32bit pointer will fit in si_pid.

I have used the usbsig.c program below written by Alan Stern and
slightly tweaked by me to run on a big endian machine to verify the
issue exists (on sparc64) and to confirm the patch below fixes the issue.

 /* usbsig.c -- test USB async signal delivery */

 #define _GNU_SOURCE
 #include <stdio.h>
 #include <fcntl.h>
 #include <signal.h>
 #include <string.h>
 #include <sys/ioctl.h>
 #include <unistd.h>
 #include <endian.h>
 #include <linux/usb/ch9.h>
 #include <linux/usbdevice_fs.h>

 static struct usbdevfs_urb urb;
 static struct usbdevfs_disconnectsignal ds;
 static volatile sig_atomic_t done = 0;

 /*
  * Handler that main() installs for SIGUSR1, the signal requested for
  * completion of the submitted URB.  Prints the siginfo fields that were
  * delivered and then "Good" if si_addr matches the address of the global
  * urb, "Bad" otherwise.
  */
 void urb_handler(int sig, siginfo_t *info , void *ucontext)
 {
 	const char *verdict = "Bad";

 	if (info->si_addr == &urb)
 		verdict = "Good";

 	printf("Got signal %d, signo %d errno %d code %d addr: %p urb: %p\n",
 	       sig, info->si_signo, info->si_errno, info->si_code,
 	       info->si_addr, &urb);
 	printf("%s\n", verdict);
 }

 /*
  * Handler that main() installs for SIGUSR2, the signal requested for
  * device disconnect.  Prints the siginfo fields that were delivered and
  * then "Good" if si_addr matches the address of the global ds, "Bad"
  * otherwise.  Finally sets done so the wait loop in main() terminates.
  */
 void ds_handler(int sig, siginfo_t *info , void *ucontext)
 {
 	const char *verdict = "Bad";

 	if (info->si_addr == &ds)
 		verdict = "Good";

 	printf("Got signal %d, signo %d errno %d code %d addr: %p ds: %p\n",
 	       sig, info->si_signo, info->si_errno, info->si_code,
 	       info->si_addr, &ds);
 	printf("%s\n", verdict);

 	done = 1;
 }

 /*
  * Test driver.  Takes exactly one argument, the usb device file to open.
  * It installs the two signal handlers, submits one control URB that asks
  * for SIGUSR1 on completion, reaps it, registers SIGUSR2 as the
  * disconnect signal, and then sleeps until ds_handler sets done.
  *
  * Returns 0 on success and 1 on a usage error or any failed call.
  */
 int main(int argc, char **argv)
 {
 	char *devfilename;
 	int fd;
 	int rc;
 	struct sigaction act;
 	struct usb_ctrlrequest *req;
 	void *ptr;
 	char buf[80];

 	if (argc != 2) {
 		fprintf(stderr, "Usage: usbsig device-file-name\n");
 		return 1;
 	}

 	devfilename = argv[1];
 	fd = open(devfilename, O_RDWR);
 	if (fd == -1) {
 		perror("Error opening device file");
 		return 1;
 	}

 	/* SIGUSR1 -> urb_handler; SA_SIGINFO so the handler gets a siginfo_t. */
 	act.sa_sigaction = urb_handler;
 	sigemptyset(&act.sa_mask);
 	act.sa_flags = SA_SIGINFO;

 	rc = sigaction(SIGUSR1, &act, NULL);
 	if (rc == -1) {
 		perror("Error in sigaction");
 		return 1;
 	}

 	/* SIGUSR2 -> ds_handler, set up the same way. */
 	act.sa_sigaction = ds_handler;
 	sigemptyset(&act.sa_mask);
 	act.sa_flags = SA_SIGINFO;

 	rc = sigaction(SIGUSR2, &act, NULL);
 	if (rc == -1) {
 		perror("Error in sigaction");
 		return 1;
 	}

 	/*
 	 * Describe a control transfer on endpoint 0 (IN) that uses buf as
 	 * its buffer and requests SIGUSR1 via signr.
 	 */
 	memset(&urb, 0, sizeof(urb));
 	urb.type = USBDEVFS_URB_TYPE_CONTROL;
 	urb.endpoint = USB_DIR_IN | 0;
 	urb.buffer = buf;
 	urb.buffer_length = sizeof(buf);
 	urb.signr = SIGUSR1;

 	/*
 	 * The setup packet lives at the start of buf: a standard
 	 * GET_DESCRIPTOR request for the device descriptor.  wLength is the
 	 * space left in buf after the setup packet.  Multi-byte fields are
 	 * converted to little endian with htole16().
 	 */
 	req = (struct usb_ctrlrequest *) buf;
 	req->bRequestType = USB_DIR_IN | USB_TYPE_STANDARD | USB_RECIP_DEVICE;
 	req->bRequest = USB_REQ_GET_DESCRIPTOR;
 	req->wValue = htole16(USB_DT_DEVICE << 8);
 	req->wIndex = htole16(0);
 	req->wLength = htole16(sizeof(buf) - sizeof(*req));

 	rc = ioctl(fd, USBDEVFS_SUBMITURB, &urb);
 	if (rc == -1) {
 		perror("Error in SUBMITURB ioctl");
 		return 1;
 	}

 	/* Reap the URB; whatever is stored in ptr is not used afterwards. */
 	rc = ioctl(fd, USBDEVFS_REAPURB, &ptr);
 	if (rc == -1) {
 		perror("Error in REAPURB ioctl");
 		return 1;
 	}

 	/*
 	 * Register SIGUSR2 as the disconnect signal.  The context is the
 	 * address of ds itself, which is what ds_handler compares si_addr
 	 * against.
 	 */
 	memset(&ds, 0, sizeof(ds));
 	ds.signr = SIGUSR2;
 	ds.context = &ds;
 	rc = ioctl(fd, USBDEVFS_DISCSIGNAL, &ds);
 	if (rc == -1) {
 		perror("Error in DISCSIGNAL ioctl");
 		return 1;
 	}

 	/* Poll once a second until ds_handler has run and set done. */
 	printf("Waiting for usb disconnect\n");
 	while (!done) {
 		sleep(1);
 	}

 	close(fd);
 	return 0;
 }

Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: linux-usb@vger.kernel.org
Cc: Alan Stern <stern@rowland.harvard.edu>
Cc: Oliver Neukum <oneukum@suse.com>
Fixes: v2.3.39
Cc: stable@vger.kernel.org
Acked-by: Alan Stern <stern@rowland.harvard.edu>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 drivers/usb/core/devio.c     | 48 +++++++++++++++---------------
 include/linux/sched/signal.h |  2 +-
 kernel/signal.c              | 69 +++++++++++++++++++++++++++++++++++++++-----
 3 files changed, 86 insertions(+), 33 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/usb/core/devio.c b/drivers/usb/core/devio.c
index fa783531ee88..a02448105527 100644
--- a/drivers/usb/core/devio.c
+++ b/drivers/usb/core/devio.c
@@ -63,7 +63,7 @@ struct usb_dev_state {
 	unsigned int discsignr;
 	struct pid *disc_pid;
 	const struct cred *cred;
-	void __user *disccontext;
+	sigval_t disccontext;
 	unsigned long ifclaimed;
 	u32 disabled_bulk_eps;
 	bool privileges_dropped;
@@ -90,6 +90,7 @@ struct async {
 	unsigned int ifnum;
 	void __user *userbuffer;
 	void __user *userurb;
+	sigval_t userurb_sigval;
 	struct urb *urb;
 	struct usb_memory *usbm;
 	unsigned int mem_usage;
@@ -582,22 +583,19 @@ static void async_completed(struct urb *urb)
 {
 	struct async *as = urb->context;
 	struct usb_dev_state *ps = as->ps;
-	struct kernel_siginfo sinfo;
 	struct pid *pid = NULL;
 	const struct cred *cred = NULL;
 	unsigned long flags;
-	int signr;
+	sigval_t addr;
+	int signr, errno;
 
 	spin_lock_irqsave(&ps->lock, flags);
 	list_move_tail(&as->asynclist, &ps->async_completed);
 	as->status = urb->status;
 	signr = as->signr;
 	if (signr) {
-		clear_siginfo(&sinfo);
-		sinfo.si_signo = as->signr;
-		sinfo.si_errno = as->status;
-		sinfo.si_code = SI_ASYNCIO;
-		sinfo.si_addr = as->userurb;
+		errno = as->status;
+		addr = as->userurb_sigval;
 		pid = get_pid(as->pid);
 		cred = get_cred(as->cred);
 	}
@@ -615,7 +613,7 @@ static void async_completed(struct urb *urb)
 	spin_unlock_irqrestore(&ps->lock, flags);
 
 	if (signr) {
-		kill_pid_info_as_cred(sinfo.si_signo, &sinfo, pid, cred);
+		kill_pid_usb_asyncio(signr, errno, addr, pid, cred);
 		put_pid(pid);
 		put_cred(cred);
 	}
@@ -1427,7 +1425,7 @@ find_memory_area(struct usb_dev_state *ps, const struct usbdevfs_urb *uurb)
 
 static int proc_do_submiturb(struct usb_dev_state *ps, struct usbdevfs_urb *uurb,
 			struct usbdevfs_iso_packet_desc __user *iso_frame_desc,
-			void __user *arg)
+			void __user *arg, sigval_t userurb_sigval)
 {
 	struct usbdevfs_iso_packet_desc *isopkt = NULL;
 	struct usb_host_endpoint *ep;
@@ -1727,6 +1725,7 @@ static int proc_do_submiturb(struct usb_dev_state *ps, struct usbdevfs_urb *uurb
 	isopkt = NULL;
 	as->ps = ps;
 	as->userurb = arg;
+	as->userurb_sigval = userurb_sigval;
 	if (as->usbm) {
 		unsigned long uurb_start = (unsigned long)uurb->buffer;
 
@@ -1801,13 +1800,17 @@ static int proc_do_submiturb(struct usb_dev_state *ps, struct usbdevfs_urb *uurb
 static int proc_submiturb(struct usb_dev_state *ps, void __user *arg)
 {
 	struct usbdevfs_urb uurb;
+	sigval_t userurb_sigval;
 
 	if (copy_from_user(&uurb, arg, sizeof(uurb)))
 		return -EFAULT;
 
+	memset(&userurb_sigval, 0, sizeof(userurb_sigval));
+	userurb_sigval.sival_ptr = arg;
+
 	return proc_do_submiturb(ps, &uurb,
 			(((struct usbdevfs_urb __user *)arg)->iso_frame_desc),
-			arg);
+			arg, userurb_sigval);
 }
 
 static int proc_unlinkurb(struct usb_dev_state *ps, void __user *arg)
@@ -1977,7 +1980,7 @@ static int proc_disconnectsignal_compat(struct usb_dev_state *ps, void __user *a
 	if (copy_from_user(&ds, arg, sizeof(ds)))
 		return -EFAULT;
 	ps->discsignr = ds.signr;
-	ps->disccontext = compat_ptr(ds.context);
+	ps->disccontext.sival_int = ds.context;
 	return 0;
 }
 
@@ -2005,13 +2008,17 @@ static int get_urb32(struct usbdevfs_urb *kurb,
 static int proc_submiturb_compat(struct usb_dev_state *ps, void __user *arg)
 {
 	struct usbdevfs_urb uurb;
+	sigval_t userurb_sigval;
 
 	if (get_urb32(&uurb, (struct usbdevfs_urb32 __user *)arg))
 		return -EFAULT;
 
+	memset(&userurb_sigval, 0, sizeof(userurb_sigval));
+	userurb_sigval.sival_int = ptr_to_compat(arg);
+
 	return proc_do_submiturb(ps, &uurb,
 			((struct usbdevfs_urb32 __user *)arg)->iso_frame_desc,
-			arg);
+			arg, userurb_sigval);
 }
 
 static int processcompl_compat(struct async *as, void __user * __user *arg)
@@ -2092,7 +2099,7 @@ static int proc_disconnectsignal(struct usb_dev_state *ps, void __user *arg)
 	if (copy_from_user(&ds, arg, sizeof(ds)))
 		return -EFAULT;
 	ps->discsignr = ds.signr;
-	ps->disccontext = ds.context;
+	ps->disccontext.sival_ptr = ds.context;
 	return 0;
 }
 
@@ -2614,22 +2621,15 @@ const struct file_operations usbdev_file_operations = {
 static void usbdev_remove(struct usb_device *udev)
 {
 	struct usb_dev_state *ps;
-	struct kernel_siginfo sinfo;
 
 	while (!list_empty(&udev->filelist)) {
 		ps = list_entry(udev->filelist.next, struct usb_dev_state, list);
 		destroy_all_async(ps);
 		wake_up_all(&ps->wait);
 		list_del_init(&ps->list);
-		if (ps->discsignr) {
-			clear_siginfo(&sinfo);
-			sinfo.si_signo = ps->discsignr;
-			sinfo.si_errno = EPIPE;
-			sinfo.si_code = SI_ASYNCIO;
-			sinfo.si_addr = ps->disccontext;
-			kill_pid_info_as_cred(ps->discsignr, &sinfo,
-					ps->disc_pid, ps->cred);
-		}
+		if (ps->discsignr)
+			kill_pid_usb_asyncio(ps->discsignr, EPIPE, ps->disccontext,
+					     ps->disc_pid, ps->cred);
 	}
 }
 
diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h
index 38a0f0785323..c68ca81db0a1 100644
--- a/include/linux/sched/signal.h
+++ b/include/linux/sched/signal.h
@@ -329,7 +329,7 @@ extern void force_sigsegv(int sig, struct task_struct *p);
 extern int force_sig_info(int, struct kernel_siginfo *, struct task_struct *);
 extern int __kill_pgrp_info(int sig, struct kernel_siginfo *info, struct pid *pgrp);
 extern int kill_pid_info(int sig, struct kernel_siginfo *info, struct pid *pid);
-extern int kill_pid_info_as_cred(int, struct kernel_siginfo *, struct pid *,
+extern int kill_pid_usb_asyncio(int sig, int errno, sigval_t addr, struct pid *,
 				const struct cred *);
 extern int kill_pgrp(struct pid *pid, int sig, int priv);
 extern int kill_pid(struct pid *pid, int sig, int priv);
diff --git a/kernel/signal.c b/kernel/signal.c
index a1eb44dc9ff5..18040d6bd63a 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1439,13 +1439,44 @@ static inline bool kill_as_cred_perm(const struct cred *cred,
 	       uid_eq(cred->uid, pcred->uid);
 }
 
-/* like kill_pid_info(), but doesn't use uid/euid of "current" */
-int kill_pid_info_as_cred(int sig, struct kernel_siginfo *info, struct pid *pid,
-			 const struct cred *cred)
+/*
+ * The usb asyncio usage of siginfo is wrong.  The glibc support
+ * for asyncio which uses SI_ASYNCIO assumes the layout is SIL_RT.
+ * AKA after the generic fields:
+ *	kernel_pid_t	si_pid;
+ *	kernel_uid32_t	si_uid;
+ *	sigval_t	si_value;
+ *
+ * Unfortunately when usb generates SI_ASYNCIO it assumes the layout
+ * after the generic fields is:
+ *	void __user 	*si_addr;
+ *
+ * This is a practical problem when there is a 64bit big endian kernel
+ * and a 32bit userspace, as the 32bit address will be encoded in the
+ * low 32bits of the pointer.  Those low 32bits will be stored at a
+ * higher address than they appear in a 32bit pointer.  So userspace
+ * will not see the address it was expecting for its completions.
+ *
+ * There is nothing in the encoding that can allow
+ * copy_siginfo_to_user32 to detect this confusion of formats, so
+ * handle this by requiring the caller of kill_pid_usb_asyncio to
+ * notice when this situation takes place and to store the 32bit
+ * pointer in sival_int, instead of sival_ptr of the sigval_t addr
+ * parameter.
+ */
+int kill_pid_usb_asyncio(int sig, int errno, sigval_t addr,
+			 struct pid *pid, const struct cred *cred)
 {
-	int ret = -EINVAL;
+	struct kernel_siginfo info;
 	struct task_struct *p;
 	unsigned long flags;
+	int ret = -EINVAL;
+
+	clear_siginfo(&info);
+	info.si_signo = sig;
+	info.si_errno = errno;
+	info.si_code = SI_ASYNCIO;
+	*((sigval_t *)&info.si_pid) = addr;
 
 	if (!valid_signal(sig))
 		return ret;
@@ -1456,17 +1487,17 @@ int kill_pid_info_as_cred(int sig, struct kernel_siginfo *info, struct pid *pid,
 		ret = -ESRCH;
 		goto out_unlock;
 	}
-	if (si_fromuser(info) && !kill_as_cred_perm(cred, p)) {
+	if (!kill_as_cred_perm(cred, p)) {
 		ret = -EPERM;
 		goto out_unlock;
 	}
-	ret = security_task_kill(p, info, sig, cred);
+	ret = security_task_kill(p, &info, sig, cred);
 	if (ret)
 		goto out_unlock;
 
 	if (sig) {
 		if (lock_task_sighand(p, &flags)) {
-			ret = __send_signal(sig, info, p, PIDTYPE_TGID, 0);
+			ret = __send_signal(sig, &info, p, PIDTYPE_TGID, 0);
 			unlock_task_sighand(p, &flags);
 		} else
 			ret = -ESRCH;
@@ -1475,7 +1506,7 @@ out_unlock:
 	rcu_read_unlock();
 	return ret;
 }
-EXPORT_SYMBOL_GPL(kill_pid_info_as_cred);
+EXPORT_SYMBOL_GPL(kill_pid_usb_asyncio);
 
 /*
  * kill_something_info() interprets pid in interesting ways just like kill(2).
@@ -4474,6 +4505,28 @@ static inline void siginfo_buildtime_checks(void)
 	CHECK_OFFSET(si_syscall);
 	CHECK_OFFSET(si_arch);
 #undef CHECK_OFFSET
+
+	/* usb asyncio */
+	BUILD_BUG_ON(offsetof(struct siginfo, si_pid) !=
+		     offsetof(struct siginfo, si_addr));
+	if (sizeof(int) == sizeof(void __user *)) {
+		BUILD_BUG_ON(sizeof_field(struct siginfo, si_pid) !=
+			     sizeof(void __user *));
+	} else {
+		BUILD_BUG_ON((sizeof_field(struct siginfo, si_pid) +
+			      sizeof_field(struct siginfo, si_uid)) !=
+			     sizeof(void __user *));
+		BUILD_BUG_ON(offsetofend(struct siginfo, si_pid) !=
+			     offsetof(struct siginfo, si_uid));
+	}
+#ifdef CONFIG_COMPAT
+	BUILD_BUG_ON(offsetof(struct compat_siginfo, si_pid) !=
+		     offsetof(struct compat_siginfo, si_addr));
+	BUILD_BUG_ON(sizeof_field(struct compat_siginfo, si_pid) !=
+		     sizeof(compat_uptr_t));
+	BUILD_BUG_ON(sizeof_field(struct compat_siginfo, si_pid) !=
+		     sizeof_field(struct siginfo, si_pid));
+#endif
 }
 
 void __init signals_init(void)
-- 
cgit v1.2.3


From 0db355d499f10a79b6a5161e77c7eba8f062bde4 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 22 May 2019 15:00:25 -0700
Subject: ipv4/igmp: shrink struct ip_sf_list

Removing two 4-byte holes allows the kmalloc-32 kmem cache to be
used instead of kmalloc-64 on 64bit kernels.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/igmp.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/igmp.h b/include/linux/igmp.h
index 9c94b2ea789c..6649cb78de4a 100644
--- a/include/linux/igmp.h
+++ b/include/linux/igmp.h
@@ -65,8 +65,8 @@ struct ip_mc_socklist {
 
 struct ip_sf_list {
 	struct ip_sf_list	*sf_next;
-	__be32			sf_inaddr;
 	unsigned long		sf_count[2];	/* include/exclude counts */
+	__be32			sf_inaddr;
 	unsigned char		sf_gsresp;	/* include in g & s response? */
 	unsigned char		sf_oldin;	/* change state */
 	unsigned char		sf_crcount;	/* retrans. left to send */
-- 
cgit v1.2.3


From c08e7e4c8a6f04e01d16117eb4a0077059ec2cd4 Mon Sep 17 00:00:00 2001
From: Guillaume La Roque <glaroque@baylibre.com>
Date: Tue, 14 May 2019 10:26:48 +0200
Subject: pinctrl: generic: add new 'drive-strength-microamp' property support

Add drive-strength-microamp property support to allow drive strength in uA

Signed-off-by: Guillaume La Roque <glaroque@baylibre.com>
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
---
 drivers/pinctrl/pinconf-generic.c       | 2 ++
 include/linux/pinctrl/pinconf-generic.h | 3 +++
 2 files changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/pinctrl/pinconf-generic.c b/drivers/pinctrl/pinconf-generic.c
index b4f7f8a458ea..d0cbdb1ad76a 100644
--- a/drivers/pinctrl/pinconf-generic.c
+++ b/drivers/pinctrl/pinconf-generic.c
@@ -39,6 +39,7 @@ static const struct pin_config_item conf_items[] = {
 	PCONFDUMP(PIN_CONFIG_DRIVE_OPEN_SOURCE, "output drive open source", NULL, false),
 	PCONFDUMP(PIN_CONFIG_DRIVE_PUSH_PULL, "output drive push pull", NULL, false),
 	PCONFDUMP(PIN_CONFIG_DRIVE_STRENGTH, "output drive strength", "mA", true),
+	PCONFDUMP(PIN_CONFIG_DRIVE_STRENGTH_UA, "output drive strength", "uA", true),
 	PCONFDUMP(PIN_CONFIG_INPUT_DEBOUNCE, "input debounce", "usec", true),
 	PCONFDUMP(PIN_CONFIG_INPUT_ENABLE, "input enabled", NULL, false),
 	PCONFDUMP(PIN_CONFIG_INPUT_SCHMITT, "input schmitt trigger", NULL, false),
@@ -167,6 +168,7 @@ static const struct pinconf_generic_params dt_params[] = {
 	{ "drive-open-source", PIN_CONFIG_DRIVE_OPEN_SOURCE, 0 },
 	{ "drive-push-pull", PIN_CONFIG_DRIVE_PUSH_PULL, 0 },
 	{ "drive-strength", PIN_CONFIG_DRIVE_STRENGTH, 0 },
+	{ "drive-strength-microamp", PIN_CONFIG_DRIVE_STRENGTH_UA, 0 },
 	{ "input-debounce", PIN_CONFIG_INPUT_DEBOUNCE, 0 },
 	{ "input-disable", PIN_CONFIG_INPUT_ENABLE, 0 },
 	{ "input-enable", PIN_CONFIG_INPUT_ENABLE, 1 },
diff --git a/include/linux/pinctrl/pinconf-generic.h b/include/linux/pinctrl/pinconf-generic.h
index 6c0680641108..72d06d6a3099 100644
--- a/include/linux/pinctrl/pinconf-generic.h
+++ b/include/linux/pinctrl/pinconf-generic.h
@@ -55,6 +55,8 @@
  *	push-pull mode, the argument is ignored.
  * @PIN_CONFIG_DRIVE_STRENGTH: the pin will sink or source at most the current
  *	passed as argument. The argument is in mA.
+ * @PIN_CONFIG_DRIVE_STRENGTH_UA: the pin will sink or source at most the current
+ *	passed as argument. The argument is in uA.
  * @PIN_CONFIG_INPUT_DEBOUNCE: this will configure the pin to debounce mode,
  *	which means it will wait for signals to settle when reading inputs. The
  *	argument gives the debounce time in usecs. Setting the
@@ -112,6 +114,7 @@ enum pin_config_param {
 	PIN_CONFIG_DRIVE_OPEN_SOURCE,
 	PIN_CONFIG_DRIVE_PUSH_PULL,
 	PIN_CONFIG_DRIVE_STRENGTH,
+	PIN_CONFIG_DRIVE_STRENGTH_UA,
 	PIN_CONFIG_INPUT_DEBOUNCE,
 	PIN_CONFIG_INPUT_ENABLE,
 	PIN_CONFIG_INPUT_SCHMITT,
-- 
cgit v1.2.3


From 036f394dd77f8117346874151793ec38967d843f Mon Sep 17 00:00:00 2001
From: Benjamin Gaignard <benjamin.gaignard@st.com>
Date: Wed, 22 May 2019 17:29:24 +0200
Subject: pinctrl: Enable device link creation for pin control

A pin controller may want to create a link between itself
and its clients to be sure of suspend/resume call ordering.

Introduce a link_consumers field in the pinctrl_desc structure to let
the pinctrl core know that the controller expects a link to be created.

Signed-off-by: Benjamin Gaignard <benjamin.gaignard@st.com>
[Renamed create_link to link_consumers]
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
---
 drivers/pinctrl/core.c          | 11 +++++++++++
 include/linux/pinctrl/pinctrl.h |  5 +++++
 2 files changed, 16 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/pinctrl/core.c b/drivers/pinctrl/core.c
index c6ff4d5fa482..d757c51d7114 100644
--- a/drivers/pinctrl/core.c
+++ b/drivers/pinctrl/core.c
@@ -1216,6 +1216,15 @@ struct pinctrl_state *pinctrl_lookup_state(struct pinctrl *p,
 }
 EXPORT_SYMBOL_GPL(pinctrl_lookup_state);
 
+static void pinctrl_link_add(struct pinctrl_dev *pctldev,
+			     struct device *consumer)
+{
+	if (pctldev->desc->link_consumers)
+		device_link_add(consumer, pctldev->dev,
+				DL_FLAG_PM_RUNTIME |
+				DL_FLAG_AUTOREMOVE_CONSUMER);
+}
+
 /**
  * pinctrl_commit_state() - select/activate/program a pinctrl state to HW
  * @p: the pinctrl handle for the device that requests configuration
@@ -1261,6 +1270,8 @@ static int pinctrl_commit_state(struct pinctrl *p, struct pinctrl_state *state)
 		if (ret < 0) {
 			goto unapply_new_state;
 		}
+
+		pinctrl_link_add(setting->pctldev, p->dev);
 	}
 
 	p->state = state;
diff --git a/include/linux/pinctrl/pinctrl.h b/include/linux/pinctrl/pinctrl.h
index 8f5dbb84547a..2744113f1024 100644
--- a/include/linux/pinctrl/pinctrl.h
+++ b/include/linux/pinctrl/pinctrl.h
@@ -125,6 +125,10 @@ struct pinctrl_ops {
  *	the hardware description
  * @custom_conf_items: Information how to print @params in debugfs, must be
  *	the same size as the @custom_params, i.e. @num_custom_params
+ * @link_consumers: If true create a device link between pinctrl and its
+ *	consumers (i.e. the devices requesting pin control states). This is
+ *	sometimes necessary to ascertain the right suspend/resume order for
+ *	example.
  */
 struct pinctrl_desc {
 	const char *name;
@@ -139,6 +143,7 @@ struct pinctrl_desc {
 	const struct pinconf_generic_params *custom_params;
 	const struct pin_config_item *custom_conf_items;
 #endif
+	bool link_consumers;
 };
 
 /* External interface to pin controller */
-- 
cgit v1.2.3


From 0e344d8c709fe01d882fc0fb5452bedfe5eba67a Mon Sep 17 00:00:00 2001
From: Len Brown <len.brown@intel.com>
Date: Mon, 13 May 2019 13:58:47 -0400
Subject: cpu/topology: Export die_id

Export die_id in cpu topology, for the benefit of hardware that has
multiple-die/package.

Signed-off-by: Len Brown <len.brown@intel.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Ingo Molnar <mingo@kernel.org>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: linux-doc@vger.kernel.org
Link: https://lkml.kernel.org/r/e7d1caaf4fbd24ee40db6d557ab28d7d83298900.1557769318.git.len.brown@intel.com
---
 Documentation/cputopology.txt | 15 ++++++++++++---
 drivers/base/topology.c       |  4 ++++
 include/linux/topology.h      |  3 +++
 3 files changed, 19 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/cputopology.txt b/Documentation/cputopology.txt
index cb61277e2308..2ff8a1e9a2db 100644
--- a/Documentation/cputopology.txt
+++ b/Documentation/cputopology.txt
@@ -12,6 +12,12 @@ physical_package_id:
 	socket number, but the actual value is architecture and platform
 	dependent.
 
+die_id:
+
+	the CPU die ID of cpuX. Typically it is the hardware platform's
+	identifier (rather than the kernel's).  The actual value is
+	architecture and platform dependent.
+
 core_id:
 
 	the CPU core ID of cpuX. Typically it is the hardware platform's
@@ -81,6 +87,7 @@ For an architecture to support this feature, it must define some of
 these macros in include/asm-XXX/topology.h::
 
 	#define topology_physical_package_id(cpu)
+	#define topology_die_id(cpu)
 	#define topology_core_id(cpu)
 	#define topology_book_id(cpu)
 	#define topology_drawer_id(cpu)
@@ -99,9 +106,11 @@ provides default definitions for any of the above macros that are
 not defined by include/asm-XXX/topology.h:
 
 1) topology_physical_package_id: -1
-2) topology_core_id: 0
-3) topology_sibling_cpumask: just the given CPU
-4) topology_core_cpumask: just the given CPU
+2) topology_die_id: -1
+3) topology_core_id: 0
+4) topology_sibling_cpumask: just the given CPU
+5) topology_core_cpumask: just the given CPU
+6) topology_die_cpumask: just the given CPU
 
 For architectures that don't support books (CONFIG_SCHED_BOOK) there are no
 default definitions for topology_book_id() and topology_book_cpumask().
diff --git a/drivers/base/topology.c b/drivers/base/topology.c
index 5fd9f167ecc1..50352cf96f85 100644
--- a/drivers/base/topology.c
+++ b/drivers/base/topology.c
@@ -43,6 +43,9 @@ static ssize_t name##_list_show(struct device *dev,			\
 define_id_show_func(physical_package_id);
 static DEVICE_ATTR_RO(physical_package_id);
 
+define_id_show_func(die_id);
+static DEVICE_ATTR_RO(die_id);
+
 define_id_show_func(core_id);
 static DEVICE_ATTR_RO(core_id);
 
@@ -72,6 +75,7 @@ static DEVICE_ATTR_RO(drawer_siblings_list);
 
 static struct attribute *default_attrs[] = {
 	&dev_attr_physical_package_id.attr,
+	&dev_attr_die_id.attr,
 	&dev_attr_core_id.attr,
 	&dev_attr_thread_siblings.attr,
 	&dev_attr_thread_siblings_list.attr,
diff --git a/include/linux/topology.h b/include/linux/topology.h
index cb0775e1ee4b..5cc8595dd0e4 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -184,6 +184,9 @@ static inline int cpu_to_mem(int cpu)
 #ifndef topology_physical_package_id
 #define topology_physical_package_id(cpu)	((void)(cpu), -1)
 #endif
+#ifndef topology_die_id
+#define topology_die_id(cpu)			((void)(cpu), -1)
+#endif
 #ifndef topology_core_id
 #define topology_core_id(cpu)			((void)(cpu), 0)
 #endif
-- 
cgit v1.2.3


From 2e4c54dac7b360c3820399bdf06cde9134a4495b Mon Sep 17 00:00:00 2001
From: Len Brown <len.brown@intel.com>
Date: Mon, 13 May 2019 13:58:56 -0400
Subject: topology: Create core_cpus and die_cpus sysfs attributes

Create CPU topology sysfs attributes: "core_cpus" and "core_cpus_list"

These attributes represent all of the logical CPUs that share the
same core.

These attributes are synonymous with the existing "thread_siblings" and
"thread_siblings_list" attributes, which will be deprecated.

Create CPU topology sysfs attributes: "die_cpus" and "die_cpus_list".
These attributes represent all of the logical CPUs that share the
same die.

Suggested-by: Brice Goglin <Brice.Goglin@inria.fr>
Signed-off-by: Len Brown <len.brown@intel.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Ingo Molnar <mingo@kernel.org>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/071c23a298cd27ede6ed0b6460cae190d193364f.1557769318.git.len.brown@intel.com
---
 Documentation/cputopology.txt   | 21 +++++++++++++++------
 arch/x86/include/asm/smp.h      |  1 +
 arch/x86/include/asm/topology.h |  1 +
 arch/x86/kernel/smpboot.c       | 22 ++++++++++++++++++++++
 arch/x86/xen/smp_pv.c           |  1 +
 drivers/base/topology.c         | 12 ++++++++++++
 include/linux/topology.h        |  3 +++
 7 files changed, 55 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/cputopology.txt b/Documentation/cputopology.txt
index 48af5c290e20..b90dafcc8237 100644
--- a/Documentation/cputopology.txt
+++ b/Documentation/cputopology.txt
@@ -36,15 +36,15 @@ drawer_id:
 	identifier (rather than the kernel's).	The actual value is
 	architecture and platform dependent.
 
-thread_siblings:
+core_cpus:
 
-	internal kernel map of cpuX's hardware threads within the same
-	core as cpuX.
+	internal kernel map of CPUs within the same core.
+	(deprecated name: "thread_siblings")
 
-thread_siblings_list:
+core_cpus_list:
 
-	human-readable list of cpuX's hardware threads within the same
-	core as cpuX.
+	human-readable list of CPUs within the same core.
+	(deprecated name: "thread_siblings_list")
 
 package_cpus:
 
@@ -56,6 +56,14 @@ package_cpus_list:
 	human-readable list of CPUs sharing the same physical_package_id.
 	(deprecated name: "core_siblings_list")
 
+die_cpus:
+
+	internal kernel map of CPUs within the same die.
+
+die_cpus_list:
+
+	human-readable list of CPUs within the same die.
+
 book_siblings:
 
 	internal kernel map of cpuX's hardware threads within the same
@@ -93,6 +101,7 @@ these macros in include/asm-XXX/topology.h::
 	#define topology_drawer_id(cpu)
 	#define topology_sibling_cpumask(cpu)
 	#define topology_core_cpumask(cpu)
+	#define topology_die_cpumask(cpu)
 	#define topology_book_cpumask(cpu)
 	#define topology_drawer_cpumask(cpu)
 
diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
index da545df207b2..b673a226ad6c 100644
--- a/arch/x86/include/asm/smp.h
+++ b/arch/x86/include/asm/smp.h
@@ -23,6 +23,7 @@ extern unsigned int num_processors;
 
 DECLARE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_sibling_map);
 DECLARE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_core_map);
+DECLARE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_die_map);
 /* cpus sharing the last level cache: */
 DECLARE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_llc_shared_map);
 DECLARE_PER_CPU_READ_MOSTLY(u16, cpu_llc_id);
diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h
index 9de16b4f6023..4b14d2318251 100644
--- a/arch/x86/include/asm/topology.h
+++ b/arch/x86/include/asm/topology.h
@@ -111,6 +111,7 @@ extern const struct cpumask *cpu_coregroup_mask(int cpu);
 #define topology_core_id(cpu)			(cpu_data(cpu).cpu_core_id)
 
 #ifdef CONFIG_SMP
+#define topology_die_cpumask(cpu)		(per_cpu(cpu_die_map, cpu))
 #define topology_core_cpumask(cpu)		(per_cpu(cpu_core_map, cpu))
 #define topology_sibling_cpumask(cpu)		(per_cpu(cpu_sibling_map, cpu))
 
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index a6e01b6c2709..1a19a5171949 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -91,6 +91,10 @@ EXPORT_PER_CPU_SYMBOL(cpu_sibling_map);
 DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_core_map);
 EXPORT_PER_CPU_SYMBOL(cpu_core_map);
 
+/* representing HT, core, and die siblings of each logical CPU */
+DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_die_map);
+EXPORT_PER_CPU_SYMBOL(cpu_die_map);
+
 DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_llc_shared_map);
 
 /* Per CPU bogomips and other parameters */
@@ -509,6 +513,15 @@ static bool match_pkg(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
 	return false;
 }
 
+static bool match_die(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
+{
+	if ((c->phys_proc_id == o->phys_proc_id) &&
+		(c->cpu_die_id == o->cpu_die_id))
+		return true;
+	return false;
+}
+
+
 #if defined(CONFIG_SCHED_SMT) || defined(CONFIG_SCHED_MC)
 static inline int x86_sched_itmt_flags(void)
 {
@@ -571,6 +584,7 @@ void set_cpu_sibling_map(int cpu)
 		cpumask_set_cpu(cpu, topology_sibling_cpumask(cpu));
 		cpumask_set_cpu(cpu, cpu_llc_shared_mask(cpu));
 		cpumask_set_cpu(cpu, topology_core_cpumask(cpu));
+		cpumask_set_cpu(cpu, topology_die_cpumask(cpu));
 		c->booted_cores = 1;
 		return;
 	}
@@ -619,6 +633,9 @@ void set_cpu_sibling_map(int cpu)
 		}
 		if (match_pkg(c, o) && !topology_same_node(c, o))
 			x86_has_numa_in_package = true;
+
+		if ((i == cpu) || (has_mp && match_die(c, o)))
+			link_mask(topology_die_cpumask, cpu, i);
 	}
 
 	threads = cpumask_weight(topology_sibling_cpumask(cpu));
@@ -1223,6 +1240,7 @@ static __init void disable_smp(void)
 		physid_set_mask_of_physid(0, &phys_cpu_present_map);
 	cpumask_set_cpu(0, topology_sibling_cpumask(0));
 	cpumask_set_cpu(0, topology_core_cpumask(0));
+	cpumask_set_cpu(0, topology_die_cpumask(0));
 }
 
 /*
@@ -1318,6 +1336,7 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
 	for_each_possible_cpu(i) {
 		zalloc_cpumask_var(&per_cpu(cpu_sibling_map, i), GFP_KERNEL);
 		zalloc_cpumask_var(&per_cpu(cpu_core_map, i), GFP_KERNEL);
+		zalloc_cpumask_var(&per_cpu(cpu_die_map, i), GFP_KERNEL);
 		zalloc_cpumask_var(&per_cpu(cpu_llc_shared_map, i), GFP_KERNEL);
 	}
 
@@ -1538,6 +1557,8 @@ static void remove_siblinginfo(int cpu)
 			cpu_data(sibling).booted_cores--;
 	}
 
+	for_each_cpu(sibling, topology_die_cpumask(cpu))
+		cpumask_clear_cpu(cpu, topology_die_cpumask(sibling));
 	for_each_cpu(sibling, topology_sibling_cpumask(cpu))
 		cpumask_clear_cpu(cpu, topology_sibling_cpumask(sibling));
 	for_each_cpu(sibling, cpu_llc_shared_mask(cpu))
@@ -1545,6 +1566,7 @@ static void remove_siblinginfo(int cpu)
 	cpumask_clear(cpu_llc_shared_mask(cpu));
 	cpumask_clear(topology_sibling_cpumask(cpu));
 	cpumask_clear(topology_core_cpumask(cpu));
+	cpumask_clear(topology_die_cpumask(cpu));
 	c->cpu_core_id = 0;
 	c->booted_cores = 0;
 	cpumask_clear_cpu(cpu, cpu_sibling_setup_mask);
diff --git a/arch/x86/xen/smp_pv.c b/arch/x86/xen/smp_pv.c
index 590fcf863006..77d81c1a63e9 100644
--- a/arch/x86/xen/smp_pv.c
+++ b/arch/x86/xen/smp_pv.c
@@ -251,6 +251,7 @@ static void __init xen_pv_smp_prepare_cpus(unsigned int max_cpus)
 	for_each_possible_cpu(i) {
 		zalloc_cpumask_var(&per_cpu(cpu_sibling_map, i), GFP_KERNEL);
 		zalloc_cpumask_var(&per_cpu(cpu_core_map, i), GFP_KERNEL);
+		zalloc_cpumask_var(&per_cpu(cpu_die_map, i), GFP_KERNEL);
 		zalloc_cpumask_var(&per_cpu(cpu_llc_shared_map, i), GFP_KERNEL);
 	}
 	set_cpu_sibling_map(0);
diff --git a/drivers/base/topology.c b/drivers/base/topology.c
index dc3c19b482f3..4e033d4cc0dc 100644
--- a/drivers/base/topology.c
+++ b/drivers/base/topology.c
@@ -53,10 +53,18 @@ define_siblings_show_func(thread_siblings, sibling_cpumask);
 static DEVICE_ATTR_RO(thread_siblings);
 static DEVICE_ATTR_RO(thread_siblings_list);
 
+define_siblings_show_func(core_cpus, sibling_cpumask);
+static DEVICE_ATTR_RO(core_cpus);
+static DEVICE_ATTR_RO(core_cpus_list);
+
 define_siblings_show_func(core_siblings, core_cpumask);
 static DEVICE_ATTR_RO(core_siblings);
 static DEVICE_ATTR_RO(core_siblings_list);
 
+define_siblings_show_func(die_cpus, die_cpumask);
+static DEVICE_ATTR_RO(die_cpus);
+static DEVICE_ATTR_RO(die_cpus_list);
+
 define_siblings_show_func(package_cpus, core_cpumask);
 static DEVICE_ATTR_RO(package_cpus);
 static DEVICE_ATTR_RO(package_cpus_list);
@@ -83,8 +91,12 @@ static struct attribute *default_attrs[] = {
 	&dev_attr_core_id.attr,
 	&dev_attr_thread_siblings.attr,
 	&dev_attr_thread_siblings_list.attr,
+	&dev_attr_core_cpus.attr,
+	&dev_attr_core_cpus_list.attr,
 	&dev_attr_core_siblings.attr,
 	&dev_attr_core_siblings_list.attr,
+	&dev_attr_die_cpus.attr,
+	&dev_attr_die_cpus_list.attr,
 	&dev_attr_package_cpus.attr,
 	&dev_attr_package_cpus_list.attr,
 #ifdef CONFIG_SCHED_BOOK
diff --git a/include/linux/topology.h b/include/linux/topology.h
index 5cc8595dd0e4..47a3e3c08036 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -196,6 +196,9 @@ static inline int cpu_to_mem(int cpu)
 #ifndef topology_core_cpumask
 #define topology_core_cpumask(cpu)		cpumask_of(cpu)
 #endif
+#ifndef topology_die_cpumask
+#define topology_die_cpumask(cpu)		cpumask_of(cpu)
+#endif
 
 #ifdef CONFIG_SCHED_SMT
 static inline const struct cpumask *cpu_smt_mask(int cpu)
-- 
cgit v1.2.3


From 924b5867e7bd6a6a98014f0517b747465b108011 Mon Sep 17 00:00:00 2001
From: Douglas Anderson <dianders@chromium.org>
Date: Wed, 15 May 2019 09:48:12 -0700
Subject: spi: Allow SPI devices to request the pumping thread be realtime

Right now the only way to get the SPI pumping thread bumped up to
realtime priority is for the controller to request it.  However it may
be that the controller works fine with the normal priority but
communication to a particular SPI device on the bus needs realtime
priority.

Let's add a way for devices to request realtime priority when they set
themselves up.

NOTE: this will just affect the priority of transfers that end up on
the SPI core's pumping thread.  In many cases transfers happen in the
context of the caller so if you need realtime priority for all
transfers you should ensure the calling context is also realtime
priority.

Signed-off-by: Douglas Anderson <dianders@chromium.org>
Reviewed-by: Guenter Roeck <groeck@chromium.org>
Tested-by: Enric Balletbo i Serra <enric.balletbo@collabora.com>
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 drivers/spi/spi.c       | 36 ++++++++++++++++++++++++++++++------
 include/linux/spi/spi.h |  2 ++
 2 files changed, 32 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/spi/spi.c b/drivers/spi/spi.c
index 5e75944ad5d1..18f70e4bbb31 100644
--- a/drivers/spi/spi.c
+++ b/drivers/spi/spi.c
@@ -1364,10 +1364,32 @@ static void spi_pump_messages(struct kthread_work *work)
 	__spi_pump_messages(ctlr, true);
 }
 
-static int spi_init_queue(struct spi_controller *ctlr)
+/**
+ * spi_set_thread_rt - set the controller to pump at realtime priority
+ * @ctlr: controller to boost priority of
+ *
+ * This can be called because the controller requested realtime priority
+ * (by setting the ->rt value before calling spi_register_controller()) or
+ * because a device on the bus said that its transfers needed realtime
+ * priority.
+ *
+ * NOTE: at the moment if any device on a bus says it needs realtime then
+ * the thread will be at realtime priority for all transfers on that
+ * controller.  If this eventually becomes a problem we may see if we can
+ * find a way to boost the priority only temporarily during relevant
+ * transfers.
+ */
+static void spi_set_thread_rt(struct spi_controller *ctlr)
 {
 	struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
 
+	dev_info(&ctlr->dev,
+		"will run message pump with realtime priority\n");
+	sched_setscheduler(ctlr->kworker_task, SCHED_FIFO, &param);
+}
+
+static int spi_init_queue(struct spi_controller *ctlr)
+{
 	ctlr->running = false;
 	ctlr->busy = false;
 
@@ -1387,11 +1409,8 @@ static int spi_init_queue(struct spi_controller *ctlr)
 	 * request and the scheduling of the message pump thread. Without this
 	 * setting the message pump thread will remain at default priority.
 	 */
-	if (ctlr->rt) {
-		dev_info(&ctlr->dev,
-			"will run message pump with realtime priority\n");
-		sched_setscheduler(ctlr->kworker_task, SCHED_FIFO, &param);
-	}
+	if (ctlr->rt)
+		spi_set_thread_rt(ctlr);
 
 	return 0;
 }
@@ -2982,6 +3001,11 @@ int spi_setup(struct spi_device *spi)
 
 	spi_set_cs(spi, false);
 
+	if (spi->rt && !spi->controller->rt) {
+		spi->controller->rt = true;
+		spi_set_thread_rt(spi->controller);
+	}
+
 	dev_dbg(&spi->dev, "setup mode %d, %s%s%s%s%u bits/w, %u Hz max --> %d\n",
 			(int) (spi->mode & (SPI_CPOL | SPI_CPHA)),
 			(spi->mode & SPI_CS_HIGH) ? "cs_high, " : "",
diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h
index 053abd22ad31..15505c2485d6 100644
--- a/include/linux/spi/spi.h
+++ b/include/linux/spi/spi.h
@@ -109,6 +109,7 @@ void spi_statistics_add_transfer_stats(struct spi_statistics *stats,
  *	This may be changed by the device's driver, or left at the
  *	default (0) indicating protocol words are eight bit bytes.
  *	The spi_transfer.bits_per_word can override this for each transfer.
+ * @rt: Make the pump thread real time priority.
  * @irq: Negative, or the number passed to request_irq() to receive
  *	interrupts from this device.
  * @controller_state: Controller's runtime state
@@ -143,6 +144,7 @@ struct spi_device {
 	u32			max_speed_hz;
 	u8			chip_select;
 	u8			bits_per_word;
+	bool			rt;
 	u32			mode;
 #define	SPI_CPHA	0x01			/* clock phase */
 #define	SPI_CPOL	0x02			/* clock polarity */
-- 
cgit v1.2.3


From 1bd33bf0fe6d3012410db0302187199871b510a0 Mon Sep 17 00:00:00 2001
From: Esben Haabendal <esben@geanix.com>
Date: Thu, 23 May 2019 14:02:20 +0200
Subject: net: ll_temac: Prepare indirect register access for multicast support

With .ndo_set_rx_mode/temac_set_multicast_list() being called in atomic
context (holding addr_list_lock), and temac_set_multicast_list() needing
to access temac indirect registers, the mutex used to synchronize indirect
register access is a no-no.

Replace it with a spinlock, and avoid sleeping in
temac_indirect_busywait().

To avoid excessive holding of the lock, which is now a spinlock, the
temac_device_reset() function is changed to only hold the lock for short
periods.  With timeouts, it could otherwise be holding the spinlock for more
than 2 seconds.

Signed-off-by: Esben Haabendal <esben@geanix.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/xilinx/ll_temac.h        |   5 +-
 drivers/net/ethernet/xilinx/ll_temac_main.c   | 240 ++++++++++++++++++--------
 drivers/net/ethernet/xilinx/ll_temac_mdio.c   |  20 +--
 include/linux/platform_data/xilinx-ll-temac.h |   3 +-
 4 files changed, 179 insertions(+), 89 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/xilinx/ll_temac.h b/drivers/net/ethernet/xilinx/ll_temac.h
index 1aeda084b8f1..276292bca334 100644
--- a/drivers/net/ethernet/xilinx/ll_temac.h
+++ b/drivers/net/ethernet/xilinx/ll_temac.h
@@ -361,7 +361,7 @@ struct temac_local {
 	/* For synchronization of indirect register access.  Must be
 	 * shared mutex between interfaces in same TEMAC block.
 	 */
-	struct mutex *indirect_mutex;
+	spinlock_t *indirect_lock;
 	u32 options;			/* Current options word */
 	int last_link;
 	unsigned int temac_features;
@@ -388,8 +388,9 @@ struct temac_local {
 /* xilinx_temac.c */
 int temac_indirect_busywait(struct temac_local *lp);
 u32 temac_indirect_in32(struct temac_local *lp, int reg);
+u32 temac_indirect_in32_locked(struct temac_local *lp, int reg);
 void temac_indirect_out32(struct temac_local *lp, int reg, u32 value);
-
+void temac_indirect_out32_locked(struct temac_local *lp, int reg, u32 value);
 
 /* xilinx_temac_mdio.c */
 int temac_mdio_setup(struct temac_local *lp, struct platform_device *pdev);
diff --git a/drivers/net/ethernet/xilinx/ll_temac_main.c b/drivers/net/ethernet/xilinx/ll_temac_main.c
index 65fb549241b2..cc58bd8c12f6 100644
--- a/drivers/net/ethernet/xilinx/ll_temac_main.c
+++ b/drivers/net/ethernet/xilinx/ll_temac_main.c
@@ -53,6 +53,7 @@
 #include <linux/slab.h>
 #include <linux/interrupt.h>
 #include <linux/dma-mapping.h>
+#include <linux/processor.h>
 #include <linux/platform_data/xilinx-ll-temac.h>
 
 #include "ll_temac.h"
@@ -84,51 +85,118 @@ static void _temac_iow_le(struct temac_local *lp, int offset, u32 value)
 	return iowrite32(value, lp->regs + offset);
 }
 
+static bool hard_acs_rdy(struct temac_local *lp)
+{
+	return temac_ior(lp, XTE_RDY0_OFFSET) & XTE_RDY0_HARD_ACS_RDY_MASK;
+}
+
+static bool hard_acs_rdy_or_timeout(struct temac_local *lp, ktime_t timeout)
+{
+	ktime_t cur = ktime_get();
+
+	return hard_acs_rdy(lp) || ktime_after(cur, timeout);
+}
+
+/* Poll for maximum 20 ms.  This is similar to the 2 jiffies @ 100 Hz
+ * that was used before, and should cover MDIO bus speed down to 3200
+ * Hz.
+ */
+#define HARD_ACS_RDY_POLL_NS (20 * NSEC_PER_MSEC)
+
+/**
+ * temac_indirect_busywait - Wait for current indirect register access
+ * to complete.
+ */
 int temac_indirect_busywait(struct temac_local *lp)
 {
-	unsigned long end = jiffies + 2;
+	ktime_t timeout = ktime_add_ns(ktime_get(), HARD_ACS_RDY_POLL_NS);
 
-	while (!(temac_ior(lp, XTE_RDY0_OFFSET) & XTE_RDY0_HARD_ACS_RDY_MASK)) {
-		if (time_before_eq(end, jiffies)) {
-			WARN_ON(1);
-			return -ETIMEDOUT;
-		}
-		usleep_range(500, 1000);
-	}
-	return 0;
+	spin_until_cond(hard_acs_rdy_or_timeout(lp, timeout));
+	if (WARN_ON(!hard_acs_rdy(lp)))
+		return -ETIMEDOUT;
+	else
+		return 0;
 }
 
 /**
- * temac_indirect_in32
- *
- * lp->indirect_mutex must be held when calling this function
+ * temac_indirect_in32 - Indirect register read access.  This function
+ * must be called without lp->indirect_lock being held.
  */
 u32 temac_indirect_in32(struct temac_local *lp, int reg)
 {
-	u32 val;
+	unsigned long flags;
+	int val;
+
+	spin_lock_irqsave(lp->indirect_lock, flags);
+	val = temac_indirect_in32_locked(lp, reg);
+	spin_unlock_irqrestore(lp->indirect_lock, flags);
+	return val;
+}
 
-	if (temac_indirect_busywait(lp))
+/**
+ * temac_indirect_in32_locked - Indirect register read access.  This
+ * function must be called with lp->indirect_lock being held.  Use
+ * this together with spin_lock_irqsave/spin_unlock_irqrestore to avoid
+ * repeated lock/unlock and to ensure uninterrupted access to indirect
+ * registers.
+ */
+u32 temac_indirect_in32_locked(struct temac_local *lp, int reg)
+{
+	/* This initial wait should normally not spin, as we always
+	 * try to wait for indirect access to complete before
+	 * releasing the indirect_lock.
+	 */
+	if (WARN_ON(temac_indirect_busywait(lp)))
 		return -ETIMEDOUT;
+	/* Initiate read from indirect register */
 	temac_iow(lp, XTE_CTL0_OFFSET, reg);
-	if (temac_indirect_busywait(lp))
+	/* Wait for indirect register access to complete.  We really
+	 * should not see timeouts, and could even end up causing
+	 * problem for following indirect access, so let's make a bit
+	 * of WARN noise.
+	 */
+	if (WARN_ON(temac_indirect_busywait(lp)))
 		return -ETIMEDOUT;
-	val = temac_ior(lp, XTE_LSW0_OFFSET);
-
-	return val;
+	/* Value is ready now */
+	return temac_ior(lp, XTE_LSW0_OFFSET);
 }
 
 /**
- * temac_indirect_out32
- *
- * lp->indirect_mutex must be held when calling this function
+ * temac_indirect_out32 - Indirect register write access.  This function
+ * must be called without lp->indirect_lock being held.
  */
 void temac_indirect_out32(struct temac_local *lp, int reg, u32 value)
 {
-	if (temac_indirect_busywait(lp))
+	unsigned long flags;
+
+	spin_lock_irqsave(lp->indirect_lock, flags);
+	temac_indirect_out32_locked(lp, reg, value);
+	spin_unlock_irqrestore(lp->indirect_lock, flags);
+}
+
+/**
+ * temac_indirect_out32_locked - Indirect register write access.  This
+ * function must be called with lp->indirect_lock being held.  Use
+ * this together with spin_lock_irqsave/spin_unlock_irqrestore to avoid
+ * repeated lock/unlock and to ensure uninterrupted access to indirect
+ * registers.
+ */
+void temac_indirect_out32_locked(struct temac_local *lp, int reg, u32 value)
+{
+	/* As in temac_indirect_in32_locked(), we should normally not
+	 * spin here.  And if it happens, we actually end up silently
+	 * ignoring the write request.  Ouch.
+	 */
+	if (WARN_ON(temac_indirect_busywait(lp)))
 		return;
+	/* Initiate write to indirect register */
 	temac_iow(lp, XTE_LSW0_OFFSET, value);
 	temac_iow(lp, XTE_CTL0_OFFSET, CNTLREG_WRITE_ENABLE_MASK | reg);
-	temac_indirect_busywait(lp);
+	/* As in temac_indirect_in32_locked(), we should not see timeouts
+	 * here.  And if it happens, we continue before the write has
+	 * completed.  Not good.
+	 */
+	WARN_ON(temac_indirect_busywait(lp));
 }
 
 /**
@@ -344,20 +412,21 @@ out:
 static void temac_do_set_mac_address(struct net_device *ndev)
 {
 	struct temac_local *lp = netdev_priv(ndev);
+	unsigned long flags;
 
 	/* set up unicast MAC address filter set its mac address */
-	mutex_lock(lp->indirect_mutex);
-	temac_indirect_out32(lp, XTE_UAW0_OFFSET,
-			     (ndev->dev_addr[0]) |
-			     (ndev->dev_addr[1] << 8) |
-			     (ndev->dev_addr[2] << 16) |
-			     (ndev->dev_addr[3] << 24));
+	spin_lock_irqsave(lp->indirect_lock, flags);
+	temac_indirect_out32_locked(lp, XTE_UAW0_OFFSET,
+				    (ndev->dev_addr[0]) |
+				    (ndev->dev_addr[1] << 8) |
+				    (ndev->dev_addr[2] << 16) |
+				    (ndev->dev_addr[3] << 24));
 	/* There are reserved bits in EUAW1
 	 * so don't affect them Set MAC bits [47:32] in EUAW1 */
-	temac_indirect_out32(lp, XTE_UAW1_OFFSET,
-			     (ndev->dev_addr[4] & 0x000000ff) |
-			     (ndev->dev_addr[5] << 8));
-	mutex_unlock(lp->indirect_mutex);
+	temac_indirect_out32_locked(lp, XTE_UAW1_OFFSET,
+				    (ndev->dev_addr[4] & 0x000000ff) |
+				    (ndev->dev_addr[5] << 8));
+	spin_unlock_irqrestore(lp->indirect_lock, flags);
 }
 
 static int temac_init_mac_address(struct net_device *ndev, const void *address)
@@ -383,42 +452,56 @@ static int temac_set_mac_address(struct net_device *ndev, void *p)
 static void temac_set_multicast_list(struct net_device *ndev)
 {
 	struct temac_local *lp = netdev_priv(ndev);
-	u32 multi_addr_msw, multi_addr_lsw, val;
+	u32 multi_addr_msw, multi_addr_lsw;
 	int i;
+	unsigned long flags;
+	bool promisc_mode_disabled = false;
 
-	mutex_lock(lp->indirect_mutex);
-	if (ndev->flags & (IFF_ALLMULTI | IFF_PROMISC) ||
-	    netdev_mc_count(ndev) > MULTICAST_CAM_TABLE_NUM) {
+	if (ndev->flags & (IFF_PROMISC | IFF_ALLMULTI) ||
+	    (netdev_mc_count(ndev) > MULTICAST_CAM_TABLE_NUM)) {
 		temac_indirect_out32(lp, XTE_AFM_OFFSET, XTE_AFM_EPPRM_MASK);
 		dev_info(&ndev->dev, "Promiscuous mode enabled.\n");
-	} else if (!netdev_mc_empty(ndev)) {
+		return;
+	}
+
+	spin_lock_irqsave(lp->indirect_lock, flags);
+
+	if (!netdev_mc_empty(ndev)) {
 		struct netdev_hw_addr *ha;
 
 		i = 0;
 		netdev_for_each_mc_addr(ha, ndev) {
-			if (i >= MULTICAST_CAM_TABLE_NUM)
+			if (WARN_ON(i >= MULTICAST_CAM_TABLE_NUM))
 				break;
 			multi_addr_msw = ((ha->addr[3] << 24) |
 					  (ha->addr[2] << 16) |
 					  (ha->addr[1] << 8) |
 					  (ha->addr[0]));
-			temac_indirect_out32(lp, XTE_MAW0_OFFSET,
-					     multi_addr_msw);
+			temac_indirect_out32_locked(lp, XTE_MAW0_OFFSET,
+						    multi_addr_msw);
 			multi_addr_lsw = ((ha->addr[5] << 8) |
 					  (ha->addr[4]) | (i << 16));
-			temac_indirect_out32(lp, XTE_MAW1_OFFSET,
-					     multi_addr_lsw);
+			temac_indirect_out32_locked(lp, XTE_MAW1_OFFSET,
+						    multi_addr_lsw);
 			i++;
 		}
 	} else {
-		val = temac_indirect_in32(lp, XTE_AFM_OFFSET);
-		temac_indirect_out32(lp, XTE_AFM_OFFSET,
-				     val & ~XTE_AFM_EPPRM_MASK);
-		temac_indirect_out32(lp, XTE_MAW0_OFFSET, 0);
-		temac_indirect_out32(lp, XTE_MAW1_OFFSET, 0);
-		dev_info(&ndev->dev, "Promiscuous mode disabled.\n");
+		temac_indirect_out32_locked(lp, XTE_MAW0_OFFSET, 0);
+		temac_indirect_out32_locked(lp, XTE_MAW1_OFFSET, i << 16);
+		}
 	}
-	mutex_unlock(lp->indirect_mutex);
+
+	/* Enable address filter block if currently disabled */
+	if (temac_indirect_in32_locked(lp, XTE_AFM_OFFSET)
+	    & XTE_AFM_EPPRM_MASK) {
+		temac_indirect_out32_locked(lp, XTE_AFM_OFFSET, 0);
+		promisc_mode_disabled = true;
+	}
+
+	spin_unlock_irqrestore(lp->indirect_lock, flags);
+
+	if (promisc_mode_disabled)
+		dev_info(&ndev->dev, "Promiscuous mode disabled.\n");
 }
 
 static struct temac_option {
@@ -509,17 +592,19 @@ static u32 temac_setoptions(struct net_device *ndev, u32 options)
 	struct temac_local *lp = netdev_priv(ndev);
 	struct temac_option *tp = &temac_options[0];
 	int reg;
+	unsigned long flags;
 
-	mutex_lock(lp->indirect_mutex);
+	spin_lock_irqsave(lp->indirect_lock, flags);
 	while (tp->opt) {
-		reg = temac_indirect_in32(lp, tp->reg) & ~tp->m_or;
-		if (options & tp->opt)
+		reg = temac_indirect_in32_locked(lp, tp->reg) & ~tp->m_or;
+		if (options & tp->opt) {
 			reg |= tp->m_or;
-		temac_indirect_out32(lp, tp->reg, reg);
+			temac_indirect_out32_locked(lp, tp->reg, reg);
+		}
 		tp++;
 	}
+	spin_unlock_irqrestore(lp->indirect_lock, flags);
 	lp->options |= options;
-	mutex_unlock(lp->indirect_mutex);
 
 	return 0;
 }
@@ -530,6 +615,7 @@ static void temac_device_reset(struct net_device *ndev)
 	struct temac_local *lp = netdev_priv(ndev);
 	u32 timeout;
 	u32 val;
+	unsigned long flags;
 
 	/* Perform a software reset */
 
@@ -538,7 +624,6 @@ static void temac_device_reset(struct net_device *ndev)
 
 	dev_dbg(&ndev->dev, "%s()\n", __func__);
 
-	mutex_lock(lp->indirect_mutex);
 	/* Reset the receiver and wait for it to finish reset */
 	temac_indirect_out32(lp, XTE_RXC1_OFFSET, XTE_RXC1_RXRST_MASK);
 	timeout = 1000;
@@ -564,8 +649,11 @@ static void temac_device_reset(struct net_device *ndev)
 	}
 
 	/* Disable the receiver */
-	val = temac_indirect_in32(lp, XTE_RXC1_OFFSET);
-	temac_indirect_out32(lp, XTE_RXC1_OFFSET, val & ~XTE_RXC1_RXEN_MASK);
+	spin_lock_irqsave(lp->indirect_lock, flags);
+	val = temac_indirect_in32_locked(lp, XTE_RXC1_OFFSET);
+	temac_indirect_out32_locked(lp, XTE_RXC1_OFFSET,
+				    val & ~XTE_RXC1_RXEN_MASK);
+	spin_unlock_irqrestore(lp->indirect_lock, flags);
 
 	/* Reset Local Link (DMA) */
 	lp->dma_out(lp, DMA_CONTROL_REG, DMA_CONTROL_RST);
@@ -585,12 +673,12 @@ static void temac_device_reset(struct net_device *ndev)
 				"temac_device_reset descriptor allocation failed\n");
 	}
 
-	temac_indirect_out32(lp, XTE_RXC0_OFFSET, 0);
-	temac_indirect_out32(lp, XTE_RXC1_OFFSET, 0);
-	temac_indirect_out32(lp, XTE_TXC_OFFSET, 0);
-	temac_indirect_out32(lp, XTE_FCC_OFFSET, XTE_FCC_RXFLO_MASK);
-
-	mutex_unlock(lp->indirect_mutex);
+	spin_lock_irqsave(lp->indirect_lock, flags);
+	temac_indirect_out32_locked(lp, XTE_RXC0_OFFSET, 0);
+	temac_indirect_out32_locked(lp, XTE_RXC1_OFFSET, 0);
+	temac_indirect_out32_locked(lp, XTE_TXC_OFFSET, 0);
+	temac_indirect_out32_locked(lp, XTE_FCC_OFFSET, XTE_FCC_RXFLO_MASK);
+	spin_unlock_irqrestore(lp->indirect_lock, flags);
 
 	/* Sync default options with HW
 	 * but leave receiver and transmitter disabled.  */
@@ -614,13 +702,14 @@ static void temac_adjust_link(struct net_device *ndev)
 	struct phy_device *phy = ndev->phydev;
 	u32 mii_speed;
 	int link_state;
+	unsigned long flags;
 
 	/* hash together the state values to decide if something has changed */
 	link_state = phy->speed | (phy->duplex << 1) | phy->link;
 
-	mutex_lock(lp->indirect_mutex);
 	if (lp->last_link != link_state) {
-		mii_speed = temac_indirect_in32(lp, XTE_EMCFG_OFFSET);
+		spin_lock_irqsave(lp->indirect_lock, flags);
+		mii_speed = temac_indirect_in32_locked(lp, XTE_EMCFG_OFFSET);
 		mii_speed &= ~XTE_EMCFG_LINKSPD_MASK;
 
 		switch (phy->speed) {
@@ -630,11 +719,12 @@ static void temac_adjust_link(struct net_device *ndev)
 		}
 
 		/* Write new speed setting out to TEMAC */
-		temac_indirect_out32(lp, XTE_EMCFG_OFFSET, mii_speed);
+		temac_indirect_out32_locked(lp, XTE_EMCFG_OFFSET, mii_speed);
+		spin_unlock_irqrestore(lp->indirect_lock, flags);
+
 		lp->last_link = link_state;
 		phy_print_status(phy);
 	}
-	mutex_unlock(lp->indirect_mutex);
 }
 
 #ifdef CONFIG_64BIT
@@ -1096,17 +1186,17 @@ static int temac_probe(struct platform_device *pdev)
 
 	/* Setup mutex for synchronization of indirect register access */
 	if (pdata) {
-		if (!pdata->indirect_mutex) {
+		if (!pdata->indirect_lock) {
 			dev_err(&pdev->dev,
-				"indirect_mutex missing in platform_data\n");
+				"indirect_lock missing in platform_data\n");
 			return -EINVAL;
 		}
-		lp->indirect_mutex = pdata->indirect_mutex;
+		lp->indirect_lock = pdata->indirect_lock;
 	} else {
-		lp->indirect_mutex = devm_kmalloc(&pdev->dev,
-						  sizeof(*lp->indirect_mutex),
-						  GFP_KERNEL);
-		mutex_init(lp->indirect_mutex);
+		lp->indirect_lock = devm_kmalloc(&pdev->dev,
+						 sizeof(*lp->indirect_lock),
+						 GFP_KERNEL);
+		spin_lock_init(lp->indirect_lock);
 	}
 
 	/* map device registers */
diff --git a/drivers/net/ethernet/xilinx/ll_temac_mdio.c b/drivers/net/ethernet/xilinx/ll_temac_mdio.c
index a4667326f745..6fd2dea4e60f 100644
--- a/drivers/net/ethernet/xilinx/ll_temac_mdio.c
+++ b/drivers/net/ethernet/xilinx/ll_temac_mdio.c
@@ -25,14 +25,15 @@ static int temac_mdio_read(struct mii_bus *bus, int phy_id, int reg)
 {
 	struct temac_local *lp = bus->priv;
 	u32 rc;
+	unsigned long flags;
 
 	/* Write the PHY address to the MIIM Access Initiator register.
 	 * When the transfer completes, the PHY register value will appear
 	 * in the LSW0 register */
-	mutex_lock(lp->indirect_mutex);
+	spin_lock_irqsave(lp->indirect_lock, flags);
 	temac_iow(lp, XTE_LSW0_OFFSET, (phy_id << 5) | reg);
-	rc = temac_indirect_in32(lp, XTE_MIIMAI_OFFSET);
-	mutex_unlock(lp->indirect_mutex);
+	rc = temac_indirect_in32_locked(lp, XTE_MIIMAI_OFFSET);
+	spin_unlock_irqrestore(lp->indirect_lock, flags);
 
 	dev_dbg(lp->dev, "temac_mdio_read(phy_id=%i, reg=%x) == %x\n",
 		phy_id, reg, rc);
@@ -43,6 +44,7 @@ static int temac_mdio_read(struct mii_bus *bus, int phy_id, int reg)
 static int temac_mdio_write(struct mii_bus *bus, int phy_id, int reg, u16 val)
 {
 	struct temac_local *lp = bus->priv;
+	unsigned long flags;
 
 	dev_dbg(lp->dev, "temac_mdio_write(phy_id=%i, reg=%x, val=%x)\n",
 		phy_id, reg, val);
@@ -50,10 +52,10 @@ static int temac_mdio_write(struct mii_bus *bus, int phy_id, int reg, u16 val)
 	/* First write the desired value into the write data register
 	 * and then write the address into the access initiator register
 	 */
-	mutex_lock(lp->indirect_mutex);
-	temac_indirect_out32(lp, XTE_MGTDR_OFFSET, val);
-	temac_indirect_out32(lp, XTE_MIIMAI_OFFSET, (phy_id << 5) | reg);
-	mutex_unlock(lp->indirect_mutex);
+	spin_lock_irqsave(lp->indirect_lock, flags);
+	temac_indirect_out32_locked(lp, XTE_MGTDR_OFFSET, val);
+	temac_indirect_out32_locked(lp, XTE_MIIMAI_OFFSET, (phy_id << 5) | reg);
+	spin_unlock_irqrestore(lp->indirect_lock, flags);
 
 	return 0;
 }
@@ -87,9 +89,7 @@ int temac_mdio_setup(struct temac_local *lp, struct platform_device *pdev)
 
 	/* Enable the MDIO bus by asserting the enable bit and writing
 	 * in the clock config */
-	mutex_lock(lp->indirect_mutex);
 	temac_indirect_out32(lp, XTE_MC_OFFSET, 1 << 6 | clk_div);
-	mutex_unlock(lp->indirect_mutex);
 
 	bus = devm_mdiobus_alloc(&pdev->dev);
 	if (!bus)
@@ -116,10 +116,8 @@ int temac_mdio_setup(struct temac_local *lp, struct platform_device *pdev)
 	if (rc)
 		return rc;
 
-	mutex_lock(lp->indirect_mutex);
 	dev_dbg(lp->dev, "MDIO bus registered;  MC:%x\n",
 		temac_indirect_in32(lp, XTE_MC_OFFSET));
-	mutex_unlock(lp->indirect_mutex);
 	return 0;
 }
 
diff --git a/include/linux/platform_data/xilinx-ll-temac.h b/include/linux/platform_data/xilinx-ll-temac.h
index 368530f98176..f4a68136afa6 100644
--- a/include/linux/platform_data/xilinx-ll-temac.h
+++ b/include/linux/platform_data/xilinx-ll-temac.h
@@ -4,6 +4,7 @@
 
 #include <linux/if_ether.h>
 #include <linux/phy.h>
+#include <linux/spinlock.h>
 
 struct ll_temac_platform_data {
 	bool txcsum;		/* Enable/disable TX checksum */
@@ -21,7 +22,7 @@ struct ll_temac_platform_data {
 	 * TEMAC IP block, the same mutex should be passed here, as
 	 * they share the same DCR bus bridge.
 	 */
-	struct mutex *indirect_mutex;
+	spinlock_t *indirect_lock;
 	/* DMA channel control setup */
 	u8 tx_irq_timeout;	/* TX Interrupt Delay Time-out */
 	u8 tx_irq_count;	/* TX Interrupt Coalescing Threshold Count */
-- 
cgit v1.2.3


From 9395da4efbd46661f0049d24d54d1cea63241fc9 Mon Sep 17 00:00:00 2001
From: Subash Abhinov Kasiviswanathan <subashab@codeaurora.org>
Date: Wed, 22 May 2019 14:21:07 -0600
Subject: net: qualcomm: rmnet: Move common struct definitions to include

Create if_rmnet.h and move the rmnet MAP packet structs to this
common include file. To account for portablity, add little and
big endian bitfield definitions similar to the ip & tcp headers.

The definitions in the headers can now be re-used by the
upcoming ipa driver series as well as qmi_wwan.

Signed-off-by: Subash Abhinov Kasiviswanathan <subashab@codeaurora.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/qualcomm/rmnet/rmnet_map.h | 25 +----------
 include/linux/if_rmnet.h                        | 55 +++++++++++++++++++++++++
 2 files changed, 56 insertions(+), 24 deletions(-)
 create mode 100644 include/linux/if_rmnet.h

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/qualcomm/rmnet/rmnet_map.h b/drivers/net/ethernet/qualcomm/rmnet/rmnet_map.h
index 884f1f52dcc2..991d7e285736 100644
--- a/drivers/net/ethernet/qualcomm/rmnet/rmnet_map.h
+++ b/drivers/net/ethernet/qualcomm/rmnet/rmnet_map.h
@@ -12,6 +12,7 @@
 
 #ifndef _RMNET_MAP_H_
 #define _RMNET_MAP_H_
+#include <linux/if_rmnet.h>
 
 struct rmnet_map_control_command {
 	u8  command_name;
@@ -39,30 +40,6 @@ enum rmnet_map_commands {
 	RMNET_MAP_COMMAND_ENUM_LENGTH
 };
 
-struct rmnet_map_header {
-	u8  pad_len:6;
-	u8  reserved_bit:1;
-	u8  cd_bit:1;
-	u8  mux_id;
-	__be16 pkt_len;
-}  __aligned(1);
-
-struct rmnet_map_dl_csum_trailer {
-	u8  reserved1;
-	u8  valid:1;
-	u8  reserved2:7;
-	u16 csum_start_offset;
-	u16 csum_length;
-	__be16 csum_value;
-} __aligned(1);
-
-struct rmnet_map_ul_csum_header {
-	__be16 csum_start_offset;
-	u16 csum_insert_offset:14;
-	u16 udp_ip4_ind:1;
-	u16 csum_enabled:1;
-} __aligned(1);
-
 #define RMNET_MAP_GET_MUX_ID(Y) (((struct rmnet_map_header *) \
 				 (Y)->data)->mux_id)
 #define RMNET_MAP_GET_CD_BIT(Y) (((struct rmnet_map_header *) \
diff --git a/include/linux/if_rmnet.h b/include/linux/if_rmnet.h
new file mode 100644
index 000000000000..b4f5403383fc
--- /dev/null
+++ b/include/linux/if_rmnet.h
@@ -0,0 +1,55 @@
+/* SPDX-License-Identifier: GPL-2.0-only
+ * Copyright (c) 2013-2019, The Linux Foundation. All rights reserved.
+ */
+
+#ifndef _LINUX_IF_RMNET_H_
+#define _LINUX_IF_RMNET_H_
+
+struct rmnet_map_header {
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+	u8  pad_len:6;
+	u8  reserved_bit:1;
+	u8  cd_bit:1;
+#elif defined (__BIG_ENDIAN_BITFIELD)
+	u8  cd_bit:1;
+	u8  reserved_bit:1;
+	u8  pad_len:6;
+#else
+#error	"Please fix <asm/byteorder.h>"
+#endif
+	u8  mux_id;
+	__be16 pkt_len;
+}  __aligned(1);
+
+struct rmnet_map_dl_csum_trailer {
+	u8  reserved1;
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+	u8  valid:1;
+	u8  reserved2:7;
+#elif defined (__BIG_ENDIAN_BITFIELD)
+	u8  reserved2:7;
+	u8  valid:1;
+#else
+#error	"Please fix <asm/byteorder.h>"
+#endif
+	u16 csum_start_offset;
+	u16 csum_length;
+	__be16 csum_value;
+} __aligned(1);
+
+struct rmnet_map_ul_csum_header {
+	__be16 csum_start_offset;
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+	u16 csum_insert_offset:14;
+	u16 udp_ip4_ind:1;
+	u16 csum_enabled:1;
+#elif defined (__BIG_ENDIAN_BITFIELD)
+	u16 csum_enabled:1;
+	u16 udp_ip4_ind:1;
+	u16 csum_insert_offset:14;
+#else
+#error	"Please fix <asm/byteorder.h>"
+#endif
+} __aligned(1);
+
+#endif /* !(_LINUX_IF_RMNET_H_) */
-- 
cgit v1.2.3


From a8f500af0ccffc3d2aaf9018537981cb173865a1 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@kernel.org>
Date: Tue, 21 May 2019 20:17:06 -0700
Subject: bpf: split explored_states

Split explored_states into a prune_point boolean mark
and a linked list of explored states.
This removes STATE_LIST_MARK hack and allows marks to be separate from states.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/linux/bpf_verifier.h |  1 +
 kernel/bpf/verifier.c        | 31 +++++++++++++------------------
 2 files changed, 14 insertions(+), 18 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 1305ccbd8fe6..02bba09a0ea1 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -233,6 +233,7 @@ struct bpf_insn_aux_data {
 	int sanitize_stack_off; /* stack slot to be cleared */
 	bool seen; /* this insn was processed by the verifier */
 	u8 alu_state; /* used in combination with alu_limit */
+	bool prune_point;
 	unsigned int orig_idx; /* original instruction index */
 };
 
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 736b5a0d4848..6a3e69ba891e 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -5436,7 +5436,6 @@ enum {
 	BRANCH = 2,
 };
 
-#define STATE_LIST_MARK ((struct bpf_verifier_state_list *) -1L)
 static struct bpf_verifier_state_list **explored_state(
 					struct bpf_verifier_env *env,
 					int idx)
@@ -5446,7 +5445,7 @@ static struct bpf_verifier_state_list **explored_state(
 
 static void init_explored_state(struct bpf_verifier_env *env, int idx)
 {
-	env->explored_states[idx] = STATE_LIST_MARK;
+	env->insn_aux_data[idx].prune_point = true;
 }
 
 /* t, w, e - match pseudo-code above:
@@ -6018,10 +6017,7 @@ static void clean_live_states(struct bpf_verifier_env *env, int insn,
 	int i;
 
 	sl = *explored_state(env, insn);
-	if (!sl)
-		return;
-
-	while (sl != STATE_LIST_MARK) {
+	while (sl) {
 		if (sl->state.curframe != cur->curframe)
 			goto next;
 		for (i = 0; i <= cur->curframe; i++)
@@ -6376,18 +6372,18 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
 	struct bpf_verifier_state *cur = env->cur_state, *new;
 	int i, j, err, states_cnt = 0;
 
-	pprev = explored_state(env, insn_idx);
-	sl = *pprev;
-
-	if (!sl)
+	if (!env->insn_aux_data[insn_idx].prune_point)
 		/* this 'insn_idx' instruction wasn't marked, so we will not
 		 * be doing state search here
 		 */
 		return 0;
 
+	pprev = explored_state(env, insn_idx);
+	sl = *pprev;
+
 	clean_live_states(env, insn_idx, cur);
 
-	while (sl != STATE_LIST_MARK) {
+	while (sl) {
 		if (states_equal(env, &sl->state, cur)) {
 			sl->hit_cnt++;
 			/* reached equivalent register/stack state,
@@ -8145,13 +8141,12 @@ static void free_states(struct bpf_verifier_env *env)
 	for (i = 0; i < env->prog->len; i++) {
 		sl = env->explored_states[i];
 
-		if (sl)
-			while (sl != STATE_LIST_MARK) {
-				sln = sl->next;
-				free_verifier_state(&sl->state, false);
-				kfree(sl);
-				sl = sln;
-			}
+		while (sl) {
+			sln = sl->next;
+			free_verifier_state(&sl->state, false);
+			kfree(sl);
+			sl = sln;
+		}
 	}
 
 	kvfree(env->explored_states);
-- 
cgit v1.2.3


From dc2a4ebc0b44a212fcf72242210e56aa17e7317b Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@kernel.org>
Date: Tue, 21 May 2019 20:17:07 -0700
Subject: bpf: convert explored_states to hash table

All prune points inside a callee bpf function most likely will have
different callsites. For example, if function foo() is called from
two callsites, half of the explored states in all prune points in foo()
will be useless for subsequent walking of one of those callsites.
Fortunately, the explored_states pruning heuristics keep the number of
states per prune point small, but walking these states is still a waste of
cpu time when the callsite of the current state is different from the
callsite of the explored state.

To improve pruning logic, convert explored_states into a hash table and
use a simple insn_idx ^ callsite hash to select the hash bucket.
This optimization has no effect on programs without bpf2bpf calls
and drastically improves programs with calls.
In the latter case it reduces total memory consumption in 1M scale tests
by almost 3 times (peak_states drops from 5752 to 2016).

Care should be taken when comparing the states for equivalency.
Since the same hash bucket can now contain states with different indices
the insn_idx has to be part of verifier_state and compared.

Different hash table sizes and different hash functions were explored,
but the results were not significantly better vs this patch.
They can be improved in the future.

Hit/miss heuristic is not counting index miscompare as a miss.
Otherwise verifier stats become unstable when experimenting
with different hash functions.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/linux/bpf_verifier.h |  1 +
 kernel/bpf/verifier.c        | 23 ++++++++++++++++++-----
 2 files changed, 19 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 02bba09a0ea1..405b502283c5 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -187,6 +187,7 @@ struct bpf_func_state {
 struct bpf_verifier_state {
 	/* call stack tracking */
 	struct bpf_func_state *frame[MAX_CALL_FRAMES];
+	u32 insn_idx;
 	u32 curframe;
 	u32 active_spin_lock;
 	bool speculative;
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 6a3e69ba891e..550091c7a46a 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -5436,11 +5436,19 @@ enum {
 	BRANCH = 2,
 };
 
+static u32 state_htab_size(struct bpf_verifier_env *env)
+{
+	return env->prog->len;
+}
+
 static struct bpf_verifier_state_list **explored_state(
 					struct bpf_verifier_env *env,
 					int idx)
 {
-	return &env->explored_states[idx];
+	struct bpf_verifier_state *cur = env->cur_state;
+	struct bpf_func_state *state = cur->frame[cur->curframe];
+
+	return &env->explored_states[(idx ^ state->callsite) % state_htab_size(env)];
 }
 
 static void init_explored_state(struct bpf_verifier_env *env, int idx)
@@ -6018,7 +6026,8 @@ static void clean_live_states(struct bpf_verifier_env *env, int insn,
 
 	sl = *explored_state(env, insn);
 	while (sl) {
-		if (sl->state.curframe != cur->curframe)
+		if (sl->state.insn_idx != insn ||
+		    sl->state.curframe != cur->curframe)
 			goto next;
 		for (i = 0; i <= cur->curframe; i++)
 			if (sl->state.frame[i]->callsite != cur->frame[i]->callsite)
@@ -6384,6 +6393,9 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
 	clean_live_states(env, insn_idx, cur);
 
 	while (sl) {
+		states_cnt++;
+		if (sl->state.insn_idx != insn_idx)
+			goto next;
 		if (states_equal(env, &sl->state, cur)) {
 			sl->hit_cnt++;
 			/* reached equivalent register/stack state,
@@ -6401,7 +6413,6 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
 				return err;
 			return 1;
 		}
-		states_cnt++;
 		sl->miss_cnt++;
 		/* heuristic to determine whether this state is beneficial
 		 * to keep checking from state equivalence point of view.
@@ -6428,6 +6439,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
 			sl = *pprev;
 			continue;
 		}
+next:
 		pprev = &sl->next;
 		sl = *pprev;
 	}
@@ -6459,6 +6471,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
 		kfree(new_sl);
 		return err;
 	}
+	new->insn_idx = insn_idx;
 	new_sl->next = *explored_state(env, insn_idx);
 	*explored_state(env, insn_idx) = new_sl;
 	/* connect new state to parentage chain. Current frame needs all
@@ -8138,7 +8151,7 @@ static void free_states(struct bpf_verifier_env *env)
 	if (!env->explored_states)
 		return;
 
-	for (i = 0; i < env->prog->len; i++) {
+	for (i = 0; i < state_htab_size(env); i++) {
 		sl = env->explored_states[i];
 
 		while (sl) {
@@ -8246,7 +8259,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr,
 			goto skip_full_check;
 	}
 
-	env->explored_states = kvcalloc(env->prog->len,
+	env->explored_states = kvcalloc(state_htab_size(env),
 				       sizeof(struct bpf_verifier_state_list *),
 				       GFP_USER);
 	ret = -ENOMEM;
-- 
cgit v1.2.3


From 59fcdce425b7c947ccea03a16e393af9bb4d6262 Mon Sep 17 00:00:00 2001
From: Stephen Boyd <sboyd@kernel.org>
Date: Thu, 23 May 2019 17:05:59 -0700
Subject: clk: Remove ifdef for COMMON_CLK in clk-provider.h

This ifdef has been there since the beginning of this file, but it
doesn't really seem to serve any purpose besides obfuscating the struct
definitions and #defines here from compilation units that include it.
Let's always expose these function prototypes and struct definitions so
that code can inspect clk providers without needing to have
CONFIG_COMMON_CLK enabled.

Signed-off-by: Stephen Boyd <sboyd@kernel.org>
---
 include/linux/clk-provider.h | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h
index bb6118f79784..3bced2ec9f26 100644
--- a/include/linux/clk-provider.h
+++ b/include/linux/clk-provider.h
@@ -9,8 +9,6 @@
 #include <linux/of.h>
 #include <linux/of_clk.h>
 
-#ifdef CONFIG_COMMON_CLK
-
 /*
  * flags used across common struct clk.  these flags should only affect the
  * top-level framework.  custom flags for dealing with hardware specifics
@@ -1019,5 +1017,4 @@ static inline int of_clk_detect_critical(struct device_node *np, int index,
 
 void clk_gate_restore_context(struct clk_hw *hw);
 
-#endif /* CONFIG_COMMON_CLK */
 #endif /* CLK_PROVIDER_H */
-- 
cgit v1.2.3


From 30d5a945743cd05ec5c847f2e38c2fbda5e00944 Mon Sep 17 00:00:00 2001
From: Stephen Boyd <sboyd@kernel.org>
Date: Thu, 23 May 2019 17:11:57 -0700
Subject: clk: Unexport __clk_of_table

This symbol doesn't need to be exported to clk providers anymore.
Originally, it was hidden inside clk.c, but then OMAP needed to get
access to it in commit 819b4861c18d ("CLK: ti: add init support for
clock IP blocks"), but eventually that code also changed in commit
c08ee14cc663 ("clk: ti: change clock init to use generic of_clk_init")
and we were left with this exported. Move this back into clk.c so that
it isn't exposed anymore.

Signed-off-by: Stephen Boyd <sboyd@kernel.org>
---
 drivers/clk/clk.c            | 1 +
 include/linux/clk-provider.h | 4 ----
 2 files changed, 1 insertion(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/clk/clk.c b/drivers/clk/clk.c
index aa51756fd4d6..b34e84bb8167 100644
--- a/drivers/clk/clk.c
+++ b/drivers/clk/clk.c
@@ -4038,6 +4038,7 @@ struct of_clk_provider {
 	void *data;
 };
 
+extern struct of_device_id __clk_of_table;
 static const struct of_device_id __clk_of_table_sentinel
 	__used __section(__clk_of_table_end);
 
diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h
index 3bced2ec9f26..9ba000e3a50d 100644
--- a/include/linux/clk-provider.h
+++ b/include/linux/clk-provider.h
@@ -865,8 +865,6 @@ static inline long divider_ro_round_rate(struct clk_hw *hw, unsigned long rate,
  */
 unsigned long clk_hw_round_rate(struct clk_hw *hw, unsigned long rate);
 
-struct of_device_id;
-
 struct clk_onecell_data {
 	struct clk **clks;
 	unsigned int clk_num;
@@ -877,8 +875,6 @@ struct clk_hw_onecell_data {
 	struct clk_hw *hws[];
 };
 
-extern struct of_device_id __clk_of_table;
-
 #define CLK_OF_DECLARE(name, compat, fn) OF_DECLARE_1(clk, name, compat, fn)
 
 /*
-- 
cgit v1.2.3


From 418a3ab1e7785799193c0f8628cd0f01c00a03ae Mon Sep 17 00:00:00 2001
From: Nadav Amit <namit@vmware.com>
Date: Thu, 25 Apr 2019 04:54:42 -0700
Subject: mm/balloon_compaction: List interfaces

Introduce interfaces for ballooning enqueueing and dequeueing of a list
of pages. These interfaces reduce the overhead of storing and restoring
IRQs by batching the operations. In addition they do not panic if the
list of pages is empty.

Cc: Jason Wang <jasowang@redhat.com>
Cc: linux-mm@kvack.org
Cc: virtualization@lists.linux-foundation.org
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Reviewed-by: Xavier Deguillard <xdeguillard@vmware.com>
Signed-off-by: Nadav Amit <namit@vmware.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/balloon_compaction.h |   4 ++
 mm/balloon_compaction.c            | 144 +++++++++++++++++++++++++++----------
 2 files changed, 110 insertions(+), 38 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/balloon_compaction.h b/include/linux/balloon_compaction.h
index f31521dcb09a..338aa27e4773 100644
--- a/include/linux/balloon_compaction.h
+++ b/include/linux/balloon_compaction.h
@@ -64,6 +64,10 @@ extern struct page *balloon_page_alloc(void);
 extern void balloon_page_enqueue(struct balloon_dev_info *b_dev_info,
 				 struct page *page);
 extern struct page *balloon_page_dequeue(struct balloon_dev_info *b_dev_info);
+extern size_t balloon_page_list_enqueue(struct balloon_dev_info *b_dev_info,
+				      struct list_head *pages);
+extern size_t balloon_page_list_dequeue(struct balloon_dev_info *b_dev_info,
+				     struct list_head *pages, size_t n_req_pages);
 
 static inline void balloon_devinfo_init(struct balloon_dev_info *balloon)
 {
diff --git a/mm/balloon_compaction.c b/mm/balloon_compaction.c
index ef858d547e2d..b7bd72612c5a 100644
--- a/mm/balloon_compaction.c
+++ b/mm/balloon_compaction.c
@@ -10,6 +10,105 @@
 #include <linux/export.h>
 #include <linux/balloon_compaction.h>
 
+static void balloon_page_enqueue_one(struct balloon_dev_info *b_dev_info,
+				     struct page *page)
+{
+	/*
+	 * Block others from accessing the 'page' when we get around to
+	 * establishing additional references. We should be the only one
+	 * holding a reference to the 'page' at this point. If we are not, then
+	 * memory corruption is possible and we should stop execution.
+	 */
+	BUG_ON(!trylock_page(page));
+	list_del(&page->lru);
+	balloon_page_insert(b_dev_info, page);
+	unlock_page(page);
+	__count_vm_event(BALLOON_INFLATE);
+}
+
+/**
+ * balloon_page_list_enqueue() - inserts a list of pages into the balloon page
+ *				 list.
+ * @b_dev_info: balloon device descriptor where we will insert a new page to
+ * @pages: pages to enqueue - allocated using balloon_page_alloc.
+ *
+ * Driver must call this function to properly enqueue balloon pages before
+ * definitively removing them from the guest system.
+ *
+ * Return: number of pages that were enqueued.
+ */
+size_t balloon_page_list_enqueue(struct balloon_dev_info *b_dev_info,
+				 struct list_head *pages)
+{
+	struct page *page, *tmp;
+	unsigned long flags;
+	size_t n_pages = 0;
+
+	spin_lock_irqsave(&b_dev_info->pages_lock, flags);
+	list_for_each_entry_safe(page, tmp, pages, lru) {
+		balloon_page_enqueue_one(b_dev_info, page);
+		n_pages++;
+	}
+	spin_unlock_irqrestore(&b_dev_info->pages_lock, flags);
+	return n_pages;
+}
+EXPORT_SYMBOL_GPL(balloon_page_list_enqueue);
+
+/**
+ * balloon_page_list_dequeue() - removes pages from balloon's page list and
+ *				 returns a list of the pages.
+ * @b_dev_info: balloon device descriptor where we will grab a page from.
+ * @pages: pointer to the list of pages that would be returned to the caller.
+ * @n_req_pages: number of requested pages.
+ *
+ * Driver must call this function to properly de-allocate previously enlisted
+ * balloon pages before definitively releasing them back to the guest system.
+ * This function tries to remove @n_req_pages from the ballooned pages and
+ * return them to the caller in the @pages list.
+ *
+ * Note that this function may fail to dequeue some pages if the page list is
+ * temporarily empty because pages are isolated for compaction.
+ *
+ * Return: number of pages that were added to the @pages list.
+ */
+size_t balloon_page_list_dequeue(struct balloon_dev_info *b_dev_info,
+				 struct list_head *pages, size_t n_req_pages)
+{
+	struct page *page, *tmp;
+	unsigned long flags;
+	size_t n_pages = 0;
+
+	spin_lock_irqsave(&b_dev_info->pages_lock, flags);
+	list_for_each_entry_safe(page, tmp, &b_dev_info->pages, lru) {
+		if (n_pages == n_req_pages)
+			break;
+
+		/*
+		 * Block others from accessing the 'page' while we get around to
+		 * establishing additional references and preparing the 'page'
+		 * to be released by the balloon driver.
+		 */
+		if (!trylock_page(page))
+			continue;
+
+		if (IS_ENABLED(CONFIG_BALLOON_COMPACTION) &&
+		    PageIsolated(page)) {
+			/* raced with isolation */
+			unlock_page(page);
+			continue;
+		}
+		balloon_page_delete(page);
+		__count_vm_event(BALLOON_DEFLATE);
+		list_add(&page->lru, pages);
+		unlock_page(page);
+		n_pages++;
+	}
+	spin_unlock_irqrestore(&b_dev_info->pages_lock, flags);
+
+	return n_pages;
+}
+EXPORT_SYMBOL_GPL(balloon_page_list_dequeue);
+
 /*
  * balloon_page_alloc - allocates a new page for insertion into the balloon
  *			  page list.
@@ -43,17 +142,9 @@ void balloon_page_enqueue(struct balloon_dev_info *b_dev_info,
 {
 	unsigned long flags;
 
-	/*
-	 * Block others from accessing the 'page' when we get around to
-	 * establishing additional references. We should be the only one
-	 * holding a reference to the 'page' at this point.
-	 */
-	BUG_ON(!trylock_page(page));
 	spin_lock_irqsave(&b_dev_info->pages_lock, flags);
-	balloon_page_insert(b_dev_info, page);
-	__count_vm_event(BALLOON_INFLATE);
+	balloon_page_enqueue_one(b_dev_info, page);
 	spin_unlock_irqrestore(&b_dev_info->pages_lock, flags);
-	unlock_page(page);
 }
 EXPORT_SYMBOL_GPL(balloon_page_enqueue);
 
@@ -70,36 +161,13 @@ EXPORT_SYMBOL_GPL(balloon_page_enqueue);
  */
 struct page *balloon_page_dequeue(struct balloon_dev_info *b_dev_info)
 {
-	struct page *page, *tmp;
 	unsigned long flags;
-	bool dequeued_page;
+	LIST_HEAD(pages);
+	int n_pages;
 
-	dequeued_page = false;
-	spin_lock_irqsave(&b_dev_info->pages_lock, flags);
-	list_for_each_entry_safe(page, tmp, &b_dev_info->pages, lru) {
-		/*
-		 * Block others from accessing the 'page' while we get around
-		 * establishing additional references and preparing the 'page'
-		 * to be released by the balloon driver.
-		 */
-		if (trylock_page(page)) {
-#ifdef CONFIG_BALLOON_COMPACTION
-			if (PageIsolated(page)) {
-				/* raced with isolation */
-				unlock_page(page);
-				continue;
-			}
-#endif
-			balloon_page_delete(page);
-			__count_vm_event(BALLOON_DEFLATE);
-			unlock_page(page);
-			dequeued_page = true;
-			break;
-		}
-	}
-	spin_unlock_irqrestore(&b_dev_info->pages_lock, flags);
+	n_pages = balloon_page_list_dequeue(b_dev_info, &pages, 1);
 
-	if (!dequeued_page) {
+	if (n_pages != 1) {
 		/*
 		 * If we are unable to dequeue a balloon page because the page
 		 * list is empty and there is no isolated pages, then something
@@ -112,9 +180,9 @@ struct page *balloon_page_dequeue(struct balloon_dev_info *b_dev_info)
 			     !b_dev_info->isolated_pages))
 			BUG();
 		spin_unlock_irqrestore(&b_dev_info->pages_lock, flags);
-		page = NULL;
+		return NULL;
 	}
-	return page;
+	return list_first_entry(&pages, struct page, lru);
 }
 EXPORT_SYMBOL_GPL(balloon_page_dequeue);
 
-- 
cgit v1.2.3


From 4618d6719743b60f1da4b8112c4518ee46110b94 Mon Sep 17 00:00:00 2001
From: Heiner Kallweit <hkallweit1@gmail.com>
Date: Thu, 23 May 2019 20:06:49 +0200
Subject: net: phy: add interface mode PHY_INTERFACE_MODE_USXGMII

Add support for interface mode PHY_INTERFACE_MODE_USXGMII.

Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/phy.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/phy.h b/include/linux/phy.h
index 073fb151b5a9..7180b1d1e5e3 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -103,6 +103,7 @@ typedef enum {
 	PHY_INTERFACE_MODE_XAUI,
 	/* 10GBASE-KR, XFI, SFI - single lane 10G Serdes */
 	PHY_INTERFACE_MODE_10GKR,
+	PHY_INTERFACE_MODE_USXGMII,
 	PHY_INTERFACE_MODE_MAX,
 } phy_interface_t;
 
@@ -178,6 +179,8 @@ static inline const char *phy_modes(phy_interface_t interface)
 		return "xaui";
 	case PHY_INTERFACE_MODE_10GKR:
 		return "10gbase-kr";
+	case PHY_INTERFACE_MODE_USXGMII:
+		return "usxgmii";
 	default:
 		return "unknown";
 	}
-- 
cgit v1.2.3


From 3fce8e1eb9945c2771360542b71ff717460ba4d7 Mon Sep 17 00:00:00 2001
From: Dan Murphy <dmurphy@ti.com>
Date: Mon, 6 May 2019 14:16:11 -0500
Subject: leds: TI LMU: Add common code for TI LMU devices

Create a TI LMU common framework for TI LMU devices that share
common features.

Currently the runtime ramp and brightness setting have
been identified as common features with common register settings.

This work is derived from Milo Kim's TI LMU MFD code.

Signed-off-by: Dan Murphy <dmurphy@ti.com>
Acked-by: Pavel Machek <pavel@ucw.cz>
Signed-off-by: Jacek Anaszewski <jacek.anaszewski@gmail.com>
---
 drivers/leds/Kconfig               |   9 +++
 drivers/leds/Makefile              |   1 +
 drivers/leds/leds-ti-lmu-common.c  | 156 +++++++++++++++++++++++++++++++++++++
 include/linux/leds-ti-lmu-common.h |  47 +++++++++++
 4 files changed, 213 insertions(+)
 create mode 100644 drivers/leds/leds-ti-lmu-common.c
 create mode 100644 include/linux/leds-ti-lmu-common.h

(limited to 'include/linux')

diff --git a/drivers/leds/Kconfig b/drivers/leds/Kconfig
index 71be87bdb926..4e262696be19 100644
--- a/drivers/leds/Kconfig
+++ b/drivers/leds/Kconfig
@@ -783,6 +783,15 @@ config LEDS_NIC78BX
 	  To compile this driver as a module, choose M here: the module
 	  will be called leds-nic78bx.
 
+config LEDS_TI_LMU_COMMON
+	tristate "LED driver for TI LMU"
+	depends on LEDS_CLASS
+	depends on REGMAP
+	help
+	  Say Y to enable the LED driver for TI LMU devices.
+	  This supports common features between the TI LM3532, LM3631, LM3632,
+	  LM3633, LM3695 and LM3697.
+
 comment "LED Triggers"
 source "drivers/leds/trigger/Kconfig"
 
diff --git a/drivers/leds/Makefile b/drivers/leds/Makefile
index 1e9702ebffee..84c49339a8e3 100644
--- a/drivers/leds/Makefile
+++ b/drivers/leds/Makefile
@@ -81,6 +81,7 @@ obj-$(CONFIG_LEDS_MT6323)		+= leds-mt6323.o
 obj-$(CONFIG_LEDS_LM3692X)		+= leds-lm3692x.o
 obj-$(CONFIG_LEDS_SC27XX_BLTC)		+= leds-sc27xx-bltc.o
 obj-$(CONFIG_LEDS_LM3601X)		+= leds-lm3601x.o
+obj-$(CONFIG_LEDS_TI_LMU_COMMON)	+= leds-ti-lmu-common.o
 
 # LED SPI Drivers
 obj-$(CONFIG_LEDS_CR0014114)		+= leds-cr0014114.o
diff --git a/drivers/leds/leds-ti-lmu-common.c b/drivers/leds/leds-ti-lmu-common.c
new file mode 100644
index 000000000000..adc7293004f1
--- /dev/null
+++ b/drivers/leds/leds-ti-lmu-common.c
@@ -0,0 +1,156 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright 2015 Texas Instruments
+// Copyright 2018 Sebastian Reichel
+// Copyright 2018 Pavel Machek <pavel@ucw.cz>
+// TI LMU LED common framework, based on previous work from
+// Milo Kim <milo.kim@ti.com>
+
+#include <linux/bitops.h>
+#include <linux/err.h>
+#include <linux/of_device.h>
+
+#include <linux/leds-ti-lmu-common.h>
+
+const static int ramp_table[16] = {2048, 262000, 524000, 1049000, 2090000,
+				4194000, 8389000, 16780000, 33550000, 41940000,
+				50330000, 58720000, 67110000, 83880000,
+				100660000, 117440000};
+
+static int ti_lmu_common_update_brightness(struct ti_lmu_bank *lmu_bank,
+					   int brightness)
+{
+	struct regmap *regmap = lmu_bank->regmap;
+	u8 reg, val;
+	int ret;
+
+	/*
+	 * Brightness register update
+	 *
+	 * 11 bit dimming: update LSB bits and write MSB byte.
+	 *		   MSB brightness should be shifted.
+	 *  8 bit dimming: write MSB byte.
+	 */
+	if (lmu_bank->max_brightness == MAX_BRIGHTNESS_11BIT) {
+		reg = lmu_bank->lsb_brightness_reg;
+		ret = regmap_update_bits(regmap, reg,
+					 LMU_11BIT_LSB_MASK,
+					 brightness);
+		if (ret)
+			return ret;
+
+		val = brightness >> LMU_11BIT_MSB_SHIFT;
+	} else {
+		val = brightness;
+	}
+
+	reg = lmu_bank->msb_brightness_reg;
+
+	return regmap_write(regmap, reg, val);
+}
+
+int ti_lmu_common_set_brightness(struct ti_lmu_bank *lmu_bank, int brightness)
+{
+	return ti_lmu_common_update_brightness(lmu_bank, brightness);
+}
+EXPORT_SYMBOL(ti_lmu_common_set_brightness);
+
+static int ti_lmu_common_convert_ramp_to_index(unsigned int usec)
+{
+	int size = ARRAY_SIZE(ramp_table);
+	int i;
+
+	if (usec <= ramp_table[0])
+		return 0;
+
+	if (usec > ramp_table[size - 1])
+		return size - 1;
+
+	for (i = 1; i < size; i++) {
+		if (usec == ramp_table[i])
+			return i;
+
+		/* Find an approximate index by looking up the table */
+		if (usec > ramp_table[i - 1] && usec < ramp_table[i]) {
+			if (usec - ramp_table[i - 1] < ramp_table[i] - usec)
+				return i - 1;
+			else
+				return i;
+		}
+	}
+
+	return -EINVAL;
+}
+
+int ti_lmu_common_set_ramp(struct ti_lmu_bank *lmu_bank)
+{
+	struct regmap *regmap = lmu_bank->regmap;
+	u8 ramp, ramp_up, ramp_down;
+
+	if (lmu_bank->ramp_up_usec == 0 && lmu_bank->ramp_down_usec == 0) {
+		ramp_up = 0;
+		ramp_down = 0;
+	} else {
+		ramp_up = ti_lmu_common_convert_ramp_to_index(lmu_bank->ramp_up_usec);
+		ramp_down = ti_lmu_common_convert_ramp_to_index(lmu_bank->ramp_down_usec);
+	}
+
+	if (ramp_up < 0 || ramp_down < 0)
+		return -EINVAL;
+
+	ramp = (ramp_up << 4) | ramp_down;
+
+	return regmap_write(regmap, lmu_bank->runtime_ramp_reg, ramp);
+
+}
+EXPORT_SYMBOL(ti_lmu_common_set_ramp);
+
+int ti_lmu_common_get_ramp_params(struct device *dev,
+				  struct fwnode_handle *child,
+				  struct ti_lmu_bank *lmu_data)
+{
+	int ret;
+
+	ret = fwnode_property_read_u32(child, "ramp-up-us",
+				 &lmu_data->ramp_up_usec);
+	if (ret)
+		dev_warn(dev, "ramp-up-us property missing\n");
+
+
+	ret = fwnode_property_read_u32(child, "ramp-down-us",
+				 &lmu_data->ramp_down_usec);
+	if (ret)
+		dev_warn(dev, "ramp-down-us property missing\n");
+
+	return 0;
+}
+EXPORT_SYMBOL(ti_lmu_common_get_ramp_params);
+
+int ti_lmu_common_get_brt_res(struct device *dev, struct fwnode_handle *child,
+				  struct ti_lmu_bank *lmu_data)
+{
+	int ret;
+
+	ret = device_property_read_u32(dev, "ti,brightness-resolution",
+				       &lmu_data->max_brightness);
+	if (ret)
+		ret = fwnode_property_read_u32(child,
+					       "ti,brightness-resolution",
+					       &lmu_data->max_brightness);
+	if (lmu_data->max_brightness <= 0) {
+		lmu_data->max_brightness = MAX_BRIGHTNESS_8BIT;
+		return ret;
+	}
+
+	if (lmu_data->max_brightness > MAX_BRIGHTNESS_11BIT)
+			lmu_data->max_brightness = MAX_BRIGHTNESS_11BIT;
+
+
+	return 0;
+}
+EXPORT_SYMBOL(ti_lmu_common_get_brt_res);
+
+MODULE_DESCRIPTION("TI LMU common LED framework");
+MODULE_AUTHOR("Sebastian Reichel");
+MODULE_AUTHOR("Dan Murphy <dmurphy@ti.com>");
+MODULE_LICENSE("GPL v2");
+MODULE_ALIAS("ti-lmu-led-common");
diff --git a/include/linux/leds-ti-lmu-common.h b/include/linux/leds-ti-lmu-common.h
new file mode 100644
index 000000000000..5eb111f38803
--- /dev/null
+++ b/include/linux/leds-ti-lmu-common.h
@@ -0,0 +1,47 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+// TI LMU Common Core
+// Copyright (C) 2018 Texas Instruments Incorporated - http://www.ti.com/
+
+#ifndef _TI_LMU_COMMON_H_
+#define _TI_LMU_COMMON_H_
+
+#include <linux/delay.h>
+#include <linux/device.h>
+#include <linux/init.h>
+#include <linux/leds.h>
+#include <linux/module.h>
+#include <linux/regmap.h>
+#include <linux/slab.h>
+#include <uapi/linux/uleds.h>
+
+#define LMU_11BIT_LSB_MASK	(BIT(0) | BIT(1) | BIT(2))
+#define LMU_11BIT_MSB_SHIFT	3
+
+#define MAX_BRIGHTNESS_8BIT	255
+#define MAX_BRIGHTNESS_11BIT	2047
+
+struct ti_lmu_bank {
+	struct regmap *regmap;
+
+	int max_brightness;
+
+	u8 lsb_brightness_reg;
+	u8 msb_brightness_reg;
+
+	u8 runtime_ramp_reg;
+	u32 ramp_up_usec;
+	u32 ramp_down_usec;
+};
+
+int ti_lmu_common_set_brightness(struct ti_lmu_bank *lmu_bank, int brightness);
+
+int ti_lmu_common_set_ramp(struct ti_lmu_bank *lmu_bank);
+
+int ti_lmu_common_get_ramp_params(struct device *dev,
+				  struct fwnode_handle *child,
+				  struct ti_lmu_bank *lmu_data);
+
+int ti_lmu_common_get_brt_res(struct device *dev, struct fwnode_handle *child,
+			      struct ti_lmu_bank *lmu_data);
+
+#endif /* _TI_LMU_COMMON_H_ */
-- 
cgit v1.2.3


From b86b9ba55a2e0d1013db26084385d83dd7d0b475 Mon Sep 17 00:00:00 2001
From: Dan Murphy <dmurphy@ti.com>
Date: Mon, 6 May 2019 14:16:13 -0500
Subject: mfd: ti-lmu: Remove support for LM3697

Remove support for the LM3697 from the ti-lmu driver in favor
of a dedicated LED driver.

Signed-off-by: Dan Murphy <dmurphy@ti.com>
Acked-by: Lee Jones <lee.jones@linaro.org>
Signed-off-by: Jacek Anaszewski <jacek.anaszewski@gmail.com>
---
 drivers/mfd/Kconfig                 |  2 +-
 drivers/mfd/ti-lmu.c                | 17 --------------
 include/linux/mfd/ti-lmu-register.h | 44 -------------------------------------
 include/linux/mfd/ti-lmu.h          |  1 -
 4 files changed, 1 insertion(+), 63 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mfd/Kconfig b/drivers/mfd/Kconfig
index 294d9567cc71..8933485b28e7 100644
--- a/drivers/mfd/Kconfig
+++ b/drivers/mfd/Kconfig
@@ -1336,7 +1336,7 @@ config MFD_TI_LMU
 	help
 	  Say yes here to enable support for TI LMU chips.
 
-	  TI LMU MFD supports LM3532, LM3631, LM3632, LM3633, LM3695 and LM3697.
+	  TI LMU MFD supports LM3532, LM3631, LM3632, LM3633, and LM3695.
 	  It consists of backlight, LED and regulator driver.
 	  It provides consistent device controls for lighting functions.
 
diff --git a/drivers/mfd/ti-lmu.c b/drivers/mfd/ti-lmu.c
index b06cb908d1aa..89b1c5b584af 100644
--- a/drivers/mfd/ti-lmu.c
+++ b/drivers/mfd/ti-lmu.c
@@ -111,20 +111,6 @@ static const struct mfd_cell lm3695_devices[] = {
 	},
 };
 
-static const struct mfd_cell lm3697_devices[] = {
-	{
-		.name          = "ti-lmu-backlight",
-		.id            = LM3697,
-		.of_compatible = "ti,lm3697-backlight",
-	},
-	/* Monitoring driver for open/short circuit detection */
-	{
-		.name          = "ti-lmu-fault-monitor",
-		.id            = LM3697,
-		.of_compatible = "ti,lm3697-fault-monitor",
-	},
-};
-
 #define TI_LMU_DATA(chip, max_reg)		\
 static const struct ti_lmu_data chip##_data =	\
 {						\
@@ -137,7 +123,6 @@ TI_LMU_DATA(lm3631, LM3631_MAX_REG);
 TI_LMU_DATA(lm3632, LM3632_MAX_REG);
 TI_LMU_DATA(lm3633, LM3633_MAX_REG);
 TI_LMU_DATA(lm3695, LM3695_MAX_REG);
-TI_LMU_DATA(lm3697, LM3697_MAX_REG);
 
 static int ti_lmu_probe(struct i2c_client *cl, const struct i2c_device_id *id)
 {
@@ -206,7 +191,6 @@ static const struct of_device_id ti_lmu_of_match[] = {
 	{ .compatible = "ti,lm3632", .data = &lm3632_data },
 	{ .compatible = "ti,lm3633", .data = &lm3633_data },
 	{ .compatible = "ti,lm3695", .data = &lm3695_data },
-	{ .compatible = "ti,lm3697", .data = &lm3697_data },
 	{ }
 };
 MODULE_DEVICE_TABLE(of, ti_lmu_of_match);
@@ -216,7 +200,6 @@ static const struct i2c_device_id ti_lmu_ids[] = {
 	{ "lm3632", LM3632 },
 	{ "lm3633", LM3633 },
 	{ "lm3695", LM3695 },
-	{ "lm3697", LM3697 },
 	{ }
 };
 MODULE_DEVICE_TABLE(i2c, ti_lmu_ids);
diff --git a/include/linux/mfd/ti-lmu-register.h b/include/linux/mfd/ti-lmu-register.h
index f09510561a55..76998b01764b 100644
--- a/include/linux/mfd/ti-lmu-register.h
+++ b/include/linux/mfd/ti-lmu-register.h
@@ -189,48 +189,4 @@
 #define LM3695_REG_BRT_MSB			0x14
 
 #define LM3695_MAX_REG				0x14
-
-/* LM3697 */
-#define LM3697_REG_HVLED_OUTPUT_CFG		0x10
-#define LM3697_HVLED1_CFG_MASK			BIT(0)
-#define LM3697_HVLED2_CFG_MASK			BIT(1)
-#define LM3697_HVLED3_CFG_MASK			BIT(2)
-#define LM3697_HVLED1_CFG_SHIFT			0
-#define LM3697_HVLED2_CFG_SHIFT			1
-#define LM3697_HVLED3_CFG_SHIFT			2
-
-#define LM3697_REG_BL0_RAMP			0x11
-#define LM3697_REG_BL1_RAMP			0x12
-#define LM3697_RAMPUP_MASK			0xF0
-#define LM3697_RAMPUP_SHIFT			4
-#define LM3697_RAMPDN_MASK			0x0F
-#define LM3697_RAMPDN_SHIFT			0
-
-#define LM3697_REG_RAMP_CONF			0x14
-#define LM3697_RAMP_MASK			0x0F
-#define LM3697_RAMP_EACH			0x05
-
-#define LM3697_REG_PWM_CFG			0x1C
-#define LM3697_PWM_A_MASK			BIT(0)
-#define LM3697_PWM_B_MASK			BIT(1)
-
-#define LM3697_REG_IMAX_A			0x17
-#define LM3697_REG_IMAX_B			0x18
-
-#define LM3697_REG_FEEDBACK_ENABLE		0x19
-
-#define LM3697_REG_BRT_A_LSB			0x20
-#define LM3697_REG_BRT_A_MSB			0x21
-#define LM3697_REG_BRT_B_LSB			0x22
-#define LM3697_REG_BRT_B_MSB			0x23
-
-#define LM3697_REG_ENABLE			0x24
-
-#define LM3697_REG_OPEN_FAULT_STATUS		0xB0
-
-#define LM3697_REG_SHORT_FAULT_STATUS		0xB2
-
-#define LM3697_REG_MONITOR_ENABLE		0xB4
-
-#define LM3697_MAX_REG				0xB4
 #endif
diff --git a/include/linux/mfd/ti-lmu.h b/include/linux/mfd/ti-lmu.h
index 7762c1bce55d..54e9d272e81c 100644
--- a/include/linux/mfd/ti-lmu.h
+++ b/include/linux/mfd/ti-lmu.h
@@ -26,7 +26,6 @@ enum ti_lmu_id {
 	LM3632,
 	LM3633,
 	LM3695,
-	LM3697,
 	LMU_MAX_ID,
 };
 
-- 
cgit v1.2.3


From 9b4d2b635bd0cf8dfc45223f66fd85792fd2dc7b Mon Sep 17 00:00:00 2001
From: Stephen Boyd <swboyd@chromium.org>
Date: Tue, 14 May 2019 13:40:52 -0700
Subject: of/fdt: Remove dead code and mark functions with __init

Some functions in here are never called, and others are only called
during __init. Remove the dead code and some dead exports for functions
that don't exist (I'm looking at you of_fdt_get_string!). Mark some
functions with __init so we can throw them away after we boot up and
poke at the FDT blob too.

Cc: Hsin-Yi Wang <hsinyi@chromium.org>
Signed-off-by: Stephen Boyd <swboyd@chromium.org>
Signed-off-by: Rob Herring <robh@kernel.org>
---
 drivers/of/fdt.c       | 37 +++++--------------------------------
 include/linux/of_fdt.h | 11 -----------
 2 files changed, 5 insertions(+), 43 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/of/fdt.c b/drivers/of/fdt.c
index de893c9616a1..918098c9f72a 100644
--- a/drivers/of/fdt.c
+++ b/drivers/of/fdt.c
@@ -38,7 +38,7 @@
  * memory entries in the /memory node. This function may be called
  * any time after initial_boot_param is set.
  */
-void of_fdt_limit_memory(int limit)
+void __init of_fdt_limit_memory(int limit)
 {
 	int memory;
 	int len;
@@ -110,25 +110,6 @@ static int of_fdt_is_compatible(const void *blob,
 	return 0;
 }
 
-/**
- * of_fdt_is_big_endian - Return true if given node needs BE MMIO accesses
- * @blob: A device tree blob
- * @node: node to test
- *
- * Returns true if the node has a "big-endian" property, or if the kernel
- * was compiled for BE *and* the node has a "native-endian" property.
- * Returns false otherwise.
- */
-bool of_fdt_is_big_endian(const void *blob, unsigned long node)
-{
-	if (fdt_getprop(blob, node, "big-endian", NULL))
-		return true;
-	if (IS_ENABLED(CONFIG_CPU_BIG_ENDIAN) &&
-	    fdt_getprop(blob, node, "native-endian", NULL))
-		return true;
-	return false;
-}
-
 static bool of_fdt_device_is_available(const void *blob, unsigned long node)
 {
 	const char *status = fdt_getprop(blob, node, "status", NULL);
@@ -145,8 +126,8 @@ static bool of_fdt_device_is_available(const void *blob, unsigned long node)
 /**
  * of_fdt_match - Return true if node matches a list of compatible values
  */
-int of_fdt_match(const void *blob, unsigned long node,
-                 const char *const *compat)
+static int __init of_fdt_match(const void *blob, unsigned long node,
+			       const char *const *compat)
 {
 	unsigned int tmp, score = 0;
 
@@ -758,7 +739,7 @@ int __init of_scan_flat_dt_subnodes(unsigned long parent,
  * @return offset of the subnode, or -FDT_ERR_NOTFOUND if there is none
  */
 
-int of_get_flat_dt_subnode_by_name(unsigned long node, const char *uname)
+int __init of_get_flat_dt_subnode_by_name(unsigned long node, const char *uname)
 {
 	return fdt_subnode_offset(initial_boot_params, node, uname);
 }
@@ -771,14 +752,6 @@ unsigned long __init of_get_flat_dt_root(void)
 	return 0;
 }
 
-/**
- * of_get_flat_dt_size - Return the total size of the FDT
- */
-int __init of_get_flat_dt_size(void)
-{
-	return fdt_totalsize(initial_boot_params);
-}
-
 /**
  * of_get_flat_dt_prop - Given a node in the flat blob, return the property ptr
  *
@@ -804,7 +777,7 @@ int __init of_flat_dt_is_compatible(unsigned long node, const char *compat)
 /**
  * of_flat_dt_match - Return true if node matches a list of compatible values
  */
-int __init of_flat_dt_match(unsigned long node, const char *const *compat)
+static int __init of_flat_dt_match(unsigned long node, const char *const *compat)
 {
 	return of_fdt_match(initial_boot_params, node, compat);
 }
diff --git a/include/linux/of_fdt.h b/include/linux/of_fdt.h
index a713e5d156d8..acf820e88952 100644
--- a/include/linux/of_fdt.h
+++ b/include/linux/of_fdt.h
@@ -23,15 +23,6 @@
 struct device_node;
 
 /* For scanning an arbitrary device-tree at any time */
-extern char *of_fdt_get_string(const void *blob, u32 offset);
-extern void *of_fdt_get_property(const void *blob,
-				 unsigned long node,
-				 const char *name,
-				 int *size);
-extern bool of_fdt_is_big_endian(const void *blob,
-				 unsigned long node);
-extern int of_fdt_match(const void *blob, unsigned long node,
-			const char *const *compat);
 extern void *of_fdt_unflatten_tree(const unsigned long *blob,
 				   struct device_node *dad,
 				   struct device_node **mynodes);
@@ -64,9 +55,7 @@ extern int of_get_flat_dt_subnode_by_name(unsigned long node,
 extern const void *of_get_flat_dt_prop(unsigned long node, const char *name,
 				       int *size);
 extern int of_flat_dt_is_compatible(unsigned long node, const char *name);
-extern int of_flat_dt_match(unsigned long node, const char *const *matches);
 extern unsigned long of_get_flat_dt_root(void);
-extern int of_get_flat_dt_size(void);
 extern uint32_t of_get_flat_dt_phandle(unsigned long node);
 
 extern int early_init_dt_scan_chosen(unsigned long node, const char *uname,
-- 
cgit v1.2.3


From 5327ed3d44b754f5cc51d5b3f18e442eaebacff5 Mon Sep 17 00:00:00 2001
From: Jiong Wang <jiong.wang@netronome.com>
Date: Fri, 24 May 2019 23:25:12 +0100
Subject: bpf: verifier: mark verified-insn with sub-register zext flag

eBPF ISA specification requires high 32-bit cleared when low 32-bit
sub-register is written. This applies to destination register of ALU32 etc.
JIT back-ends must guarantee this semantic when doing code-gen. The x86_64
and AArch64 ISAs have the same semantics, so the corresponding JIT back-ends
don't need to do extra work.

However, 32-bit arches (arm, x86, nfp etc.) and some other 64-bit arches
(PowerPC, SPARC etc) need to do explicit zero extension to meet this
requirement, otherwise code like the following will fail.

  u64_value = (u64) u32_value
  ... other uses of u64_value

This is because the compiler could exploit the semantic described above and
omit the zero extensions for extending u32_value to u64_value. These JIT
back-ends are expected to guarantee it by inserting extra zero extensions,
which however could significantly increase the code size. Some benchmarks
show that ~40% of total insns could be sub-register writes, meaning at
least ~40% extra code-gen.

One observation is that these extra zero extensions are not always
necessary. Take the above code snippet for example: it is possible that
u32_value will never be cast into a u64, in which case the high 32 bits of
u32_value can be ignored and the extra zero extension can be eliminated.

This patch implements this idea: insns defining sub-registers will be
marked when the high 32 bits of the defined sub-register matter. For
unmarked insns, it is safe to eliminate the high 32-bit clearance.

Algo:
 - Split read flags into READ32 and READ64.

 - Record index of insn that does sub-register write. Keep the index inside
   reg state and update it during verifier insn walking.

 - A full register read on a sub-register marks its definition insn as
   needing zero extension on dst register.

   A new sub-register write overrides the old one.

 - When propagating read64 during path pruning, also mark any insn defining
   a sub-register that is read in the pruned path as full-register.

Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: Jiong Wang <jiong.wang@netronome.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf_verifier.h |  14 +++-
 kernel/bpf/verifier.c        | 173 +++++++++++++++++++++++++++++++++++++++----
 2 files changed, 171 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 405b502283c5..704ed7971472 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -36,9 +36,11 @@
  */
 enum bpf_reg_liveness {
 	REG_LIVE_NONE = 0, /* reg hasn't been read or written this branch */
-	REG_LIVE_READ, /* reg was read, so we're sensitive to initial value */
-	REG_LIVE_WRITTEN, /* reg was written first, screening off later reads */
-	REG_LIVE_DONE = 4, /* liveness won't be updating this register anymore */
+	REG_LIVE_READ32 = 0x1, /* reg was read, so we're sensitive to initial value */
+	REG_LIVE_READ64 = 0x2, /* likewise, but full 64-bit content matters */
+	REG_LIVE_READ = REG_LIVE_READ32 | REG_LIVE_READ64,
+	REG_LIVE_WRITTEN = 0x4, /* reg was written first, screening off later reads */
+	REG_LIVE_DONE = 0x8, /* liveness won't be updating this register anymore */
 };
 
 struct bpf_reg_state {
@@ -131,6 +133,11 @@ struct bpf_reg_state {
 	 * pointing to bpf_func_state.
 	 */
 	u32 frameno;
+	/* Tracks subreg definition. The stored value is the insn_idx of the
+	 * writing insn. This is safe because subreg_def is used before any insn
+	 * patching which only happens after main verification finished.
+	 */
+	s32 subreg_def;
 	enum bpf_reg_liveness live;
 };
 
@@ -233,6 +240,7 @@ struct bpf_insn_aux_data {
 	int ctx_field_size; /* the ctx field size for load insn, maybe 0 */
 	int sanitize_stack_off; /* stack slot to be cleared */
 	bool seen; /* this insn was processed by the verifier */
+	bool zext_dst; /* this insn zero extends dst reg */
 	u8 alu_state; /* used in combination with alu_limit */
 	bool prune_point;
 	unsigned int orig_idx; /* original instruction index */
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 550091c7a46a..f6b4c7148c3e 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -982,6 +982,7 @@ static void mark_reg_not_init(struct bpf_verifier_env *env,
 	__mark_reg_not_init(regs + regno);
 }
 
+#define DEF_NOT_SUBREG	(0)
 static void init_reg_state(struct bpf_verifier_env *env,
 			   struct bpf_func_state *state)
 {
@@ -992,6 +993,7 @@ static void init_reg_state(struct bpf_verifier_env *env,
 		mark_reg_not_init(env, regs, i);
 		regs[i].live = REG_LIVE_NONE;
 		regs[i].parent = NULL;
+		regs[i].subreg_def = DEF_NOT_SUBREG;
 	}
 
 	/* frame pointer */
@@ -1137,7 +1139,7 @@ next:
  */
 static int mark_reg_read(struct bpf_verifier_env *env,
 			 const struct bpf_reg_state *state,
-			 struct bpf_reg_state *parent)
+			 struct bpf_reg_state *parent, u8 flag)
 {
 	bool writes = parent == state->parent; /* Observe write marks */
 	int cnt = 0;
@@ -1152,17 +1154,26 @@ static int mark_reg_read(struct bpf_verifier_env *env,
 				parent->var_off.value, parent->off);
 			return -EFAULT;
 		}
-		if (parent->live & REG_LIVE_READ)
+		/* The first condition is more likely to be true than the
+		 * second, checked it first.
+		 */
+		if ((parent->live & REG_LIVE_READ) == flag ||
+		    parent->live & REG_LIVE_READ64)
 			/* The parentage chain never changes and
 			 * this parent was already marked as LIVE_READ.
 			 * There is no need to keep walking the chain again and
 			 * keep re-marking all parents as LIVE_READ.
 			 * This case happens when the same register is read
 			 * multiple times without writes into it in-between.
+			 * Also, if parent has the stronger REG_LIVE_READ64 set,
+			 * then no need to set the weak REG_LIVE_READ32.
 			 */
 			break;
 		/* ... then we depend on parent's value */
-		parent->live |= REG_LIVE_READ;
+		parent->live |= flag;
+		/* REG_LIVE_READ64 overrides REG_LIVE_READ32. */
+		if (flag == REG_LIVE_READ64)
+			parent->live &= ~REG_LIVE_READ32;
 		state = parent;
 		parent = state->parent;
 		writes = true;
@@ -1174,12 +1185,111 @@ static int mark_reg_read(struct bpf_verifier_env *env,
 	return 0;
 }
 
+/* This function is supposed to be used by the following 32-bit optimization
+ * code only. It returns TRUE if the source or destination register operates
+ * on 64-bit, otherwise return FALSE.
+ */
+static bool is_reg64(struct bpf_verifier_env *env, struct bpf_insn *insn,
+		     u32 regno, struct bpf_reg_state *reg, enum reg_arg_type t)
+{
+	u8 code, class, op;
+
+	code = insn->code;
+	class = BPF_CLASS(code);
+	op = BPF_OP(code);
+	if (class == BPF_JMP) {
+		/* BPF_EXIT for "main" will reach here. Return TRUE
+		 * conservatively.
+		 */
+		if (op == BPF_EXIT)
+			return true;
+		if (op == BPF_CALL) {
+			/* BPF to BPF call will reach here because of marking
+			 * caller saved clobber with DST_OP_NO_MARK for which we
+			 * don't care the register def because they are anyway
+			 * marked as NOT_INIT already.
+			 */
+			if (insn->src_reg == BPF_PSEUDO_CALL)
+				return false;
+			/* Helper call will reach here because of arg type
+			 * check, conservatively return TRUE.
+			 */
+			if (t == SRC_OP)
+				return true;
+
+			return false;
+		}
+	}
+
+	if (class == BPF_ALU64 || class == BPF_JMP ||
+	    /* BPF_END always use BPF_ALU class. */
+	    (class == BPF_ALU && op == BPF_END && insn->imm == 64))
+		return true;
+
+	if (class == BPF_ALU || class == BPF_JMP32)
+		return false;
+
+	if (class == BPF_LDX) {
+		if (t != SRC_OP)
+			return BPF_SIZE(code) == BPF_DW;
+		/* LDX source must be ptr. */
+		return true;
+	}
+
+	if (class == BPF_STX) {
+		if (reg->type != SCALAR_VALUE)
+			return true;
+		return BPF_SIZE(code) == BPF_DW;
+	}
+
+	if (class == BPF_LD) {
+		u8 mode = BPF_MODE(code);
+
+		/* LD_IMM64 */
+		if (mode == BPF_IMM)
+			return true;
+
+		/* Both LD_IND and LD_ABS return 32-bit data. */
+		if (t != SRC_OP)
+			return  false;
+
+		/* Implicit ctx ptr. */
+		if (regno == BPF_REG_6)
+			return true;
+
+		/* Explicit source could be any width. */
+		return true;
+	}
+
+	if (class == BPF_ST)
+		/* The only source register for BPF_ST is a ptr. */
+		return true;
+
+	/* Conservatively return true at default. */
+	return true;
+}
+
+static void mark_insn_zext(struct bpf_verifier_env *env,
+			   struct bpf_reg_state *reg)
+{
+	s32 def_idx = reg->subreg_def;
+
+	if (def_idx == DEF_NOT_SUBREG)
+		return;
+
+	env->insn_aux_data[def_idx - 1].zext_dst = true;
+	/* The dst will be zero extended, so won't be sub-register anymore. */
+	reg->subreg_def = DEF_NOT_SUBREG;
+}
+
 static int check_reg_arg(struct bpf_verifier_env *env, u32 regno,
 			 enum reg_arg_type t)
 {
 	struct bpf_verifier_state *vstate = env->cur_state;
 	struct bpf_func_state *state = vstate->frame[vstate->curframe];
+	struct bpf_insn *insn = env->prog->insnsi + env->insn_idx;
 	struct bpf_reg_state *reg, *regs = state->regs;
+	bool rw64;
 
 	if (regno >= MAX_BPF_REG) {
 		verbose(env, "R%d is invalid\n", regno);
@@ -1187,6 +1297,7 @@ static int check_reg_arg(struct bpf_verifier_env *env, u32 regno,
 	}
 
 	reg = &regs[regno];
+	rw64 = is_reg64(env, insn, regno, reg, t);
 	if (t == SRC_OP) {
 		/* check whether register used as source operand can be read */
 		if (reg->type == NOT_INIT) {
@@ -1197,7 +1308,11 @@ static int check_reg_arg(struct bpf_verifier_env *env, u32 regno,
 		if (regno == BPF_REG_FP)
 			return 0;
 
-		return mark_reg_read(env, reg, reg->parent);
+		if (rw64)
+			mark_insn_zext(env, reg);
+
+		return mark_reg_read(env, reg, reg->parent,
+				     rw64 ? REG_LIVE_READ64 : REG_LIVE_READ32);
 	} else {
 		/* check whether register used as dest operand can be written to */
 		if (regno == BPF_REG_FP) {
@@ -1205,6 +1320,7 @@ static int check_reg_arg(struct bpf_verifier_env *env, u32 regno,
 			return -EACCES;
 		}
 		reg->live |= REG_LIVE_WRITTEN;
+		reg->subreg_def = rw64 ? DEF_NOT_SUBREG : env->insn_idx + 1;
 		if (t == DST_OP)
 			mark_reg_unknown(env, regs, regno);
 	}
@@ -1384,7 +1500,8 @@ static int check_stack_read(struct bpf_verifier_env *env,
 			state->regs[value_regno].live |= REG_LIVE_WRITTEN;
 		}
 		mark_reg_read(env, &reg_state->stack[spi].spilled_ptr,
-			      reg_state->stack[spi].spilled_ptr.parent);
+			      reg_state->stack[spi].spilled_ptr.parent,
+			      REG_LIVE_READ64);
 		return 0;
 	} else {
 		int zeros = 0;
@@ -1401,7 +1518,8 @@ static int check_stack_read(struct bpf_verifier_env *env,
 			return -EACCES;
 		}
 		mark_reg_read(env, &reg_state->stack[spi].spilled_ptr,
-			      reg_state->stack[spi].spilled_ptr.parent);
+			      reg_state->stack[spi].spilled_ptr.parent,
+			      REG_LIVE_READ64);
 		if (value_regno >= 0) {
 			if (zeros == size) {
 				/* any size read into register is zero extended,
@@ -2110,6 +2228,12 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
 						    value_regno);
 				if (reg_type_may_be_null(reg_type))
 					regs[value_regno].id = ++env->id_gen;
+				/* A load of ctx field could have different
+				 * actual load size with the one encoded in the
+				 * insn. When the dst is PTR, it is for sure not
+				 * a sub-register.
+				 */
+				regs[value_regno].subreg_def = DEF_NOT_SUBREG;
 			}
 			regs[value_regno].type = reg_type;
 		}
@@ -2369,7 +2493,8 @@ mark:
 		 * the whole slot to be marked as 'read'
 		 */
 		mark_reg_read(env, &state->stack[spi].spilled_ptr,
-			      state->stack[spi].spilled_ptr.parent);
+			      state->stack[spi].spilled_ptr.parent,
+			      REG_LIVE_READ64);
 	}
 	return update_stack_depth(env, state, min_off);
 }
@@ -3333,6 +3458,9 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
 		check_reg_arg(env, caller_saved[i], DST_OP_NO_MARK);
 	}
 
+	/* helper call returns 64-bit value. */
+	regs[BPF_REG_0].subreg_def = DEF_NOT_SUBREG;
+
 	/* update return register (already marked as written above) */
 	if (fn->ret_type == RET_INTEGER) {
 		/* sets type to SCALAR_VALUE */
@@ -4264,6 +4392,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
 				 */
 				*dst_reg = *src_reg;
 				dst_reg->live |= REG_LIVE_WRITTEN;
+				dst_reg->subreg_def = DEF_NOT_SUBREG;
 			} else {
 				/* R1 = (u32) R2 */
 				if (is_pointer_value(env, insn->src_reg)) {
@@ -4274,6 +4403,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
 				} else if (src_reg->type == SCALAR_VALUE) {
 					*dst_reg = *src_reg;
 					dst_reg->live |= REG_LIVE_WRITTEN;
+					dst_reg->subreg_def = env->insn_idx + 1;
 				} else {
 					mark_reg_unknown(env, regs,
 							 insn->dst_reg);
@@ -5353,6 +5483,8 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)
 	 * Already marked as written above.
 	 */
 	mark_reg_unknown(env, regs, BPF_REG_0);
+	/* ld_abs load up to 32-bit skb data. */
+	regs[BPF_REG_0].subreg_def = env->insn_idx + 1;
 	return 0;
 }
 
@@ -6309,20 +6441,33 @@ static bool states_equal(struct bpf_verifier_env *env,
 	return true;
 }
 
+/* Return 0 if no propagation happened. Return negative error code if error
+ * happened. Otherwise, return the propagated bit.
+ */
 static int propagate_liveness_reg(struct bpf_verifier_env *env,
 				  struct bpf_reg_state *reg,
 				  struct bpf_reg_state *parent_reg)
 {
+	u8 parent_flag = parent_reg->live & REG_LIVE_READ;
+	u8 flag = reg->live & REG_LIVE_READ;
 	int err;
 
-	if (parent_reg->live & REG_LIVE_READ || !(reg->live & REG_LIVE_READ))
+	/* When comes here, read flags of PARENT_REG or REG could be any of
+	 * REG_LIVE_READ64, REG_LIVE_READ32, REG_LIVE_NONE. There is no need
+	 * of propagation if PARENT_REG has strongest REG_LIVE_READ64.
+	 */
+	if (parent_flag == REG_LIVE_READ64 ||
+	    /* Or if there is no read flag from REG. */
+	    !flag ||
+	    /* Or if the read flag from REG is the same as PARENT_REG. */
+	    parent_flag == flag)
 		return 0;
 
-	err = mark_reg_read(env, reg, parent_reg);
+	err = mark_reg_read(env, reg, parent_reg, flag);
 	if (err)
 		return err;
 
-	return 0;
+	return flag;
 }
 
 /* A write screens off any subsequent reads; but write marks come from the
@@ -6356,8 +6501,10 @@ static int propagate_liveness(struct bpf_verifier_env *env,
 		for (i = frame < vstate->curframe ? BPF_REG_6 : 0; i < BPF_REG_FP; i++) {
 			err = propagate_liveness_reg(env, &state_reg[i],
 						     &parent_reg[i]);
-			if (err)
+			if (err < 0)
 				return err;
+			if (err == REG_LIVE_READ64)
+				mark_insn_zext(env, &parent_reg[i]);
 		}
 
 		/* Propagate stack slots. */
@@ -6367,11 +6514,11 @@ static int propagate_liveness(struct bpf_verifier_env *env,
 			state_reg = &state->stack[i].spilled_ptr;
 			err = propagate_liveness_reg(env, state_reg,
 						     parent_reg);
-			if (err)
+			if (err < 0)
 				return err;
 		}
 	}
-	return err;
+	return 0;
 }
 
 static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
-- 
cgit v1.2.3


From 7d134041a89610ae552501fc88652805addcdee4 Mon Sep 17 00:00:00 2001
From: Jiong Wang <jiong.wang@netronome.com>
Date: Fri, 24 May 2019 23:25:14 +0100
Subject: bpf: introduce new mov32 variant for doing explicit zero extension

The encoding for this new variant is based on the BPF_X format. The "imm"
field used to be 0 only; now it can also be 1, which means doing zero
extension unconditionally:

  .code = BPF_ALU | BPF_MOV | BPF_X
  .dst_reg = DST
  .src_reg = SRC
  .imm  = 1

We use this new form for doing zero extension for which verifier will
guarantee SRC == DST.

Implications on JIT back-ends when doing code-gen for
BPF_ALU | BPF_MOV | BPF_X:
  1. No change if hardware already does zero extension unconditionally for
     sub-register write.
  2. Otherwise, when seeing imm == 1, just generate insns to clear high
     32-bit. No need to generate insns for the move because when imm == 1,
     dst_reg is the same as src_reg at the moment.

The interpreter doesn't need to change either. It already does unconditional
zero extension for mov32.

One helper macro BPF_ZEXT_REG is added to help create a zero extension
insn using this new mov32 variant.

One helper function insn_is_zext is added for checking whether an insn is a
zero extension on dst. This will be widely used by a few JIT back-ends in
later patches in this set.

Signed-off-by: Jiong Wang <jiong.wang@netronome.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/filter.h | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/filter.h b/include/linux/filter.h
index 7148bab96943..bb10ffb88452 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -160,6 +160,20 @@ struct ctl_table_header;
 		.off   = 0,					\
 		.imm   = IMM })
 
+/* Special form of mov32, used for doing explicit zero extension on dst. */
+#define BPF_ZEXT_REG(DST)					\
+	((struct bpf_insn) {					\
+		.code  = BPF_ALU | BPF_MOV | BPF_X,		\
+		.dst_reg = DST,					\
+		.src_reg = DST,					\
+		.off   = 0,					\
+		.imm   = 1 })
+
+static inline bool insn_is_zext(const struct bpf_insn *insn)
+{
+	return insn->code == (BPF_ALU | BPF_MOV | BPF_X) && insn->imm == 1;
+}
+
 /* BPF_LD_IMM64 macro encodes single 'load 64-bit immediate' insn */
 #define BPF_LD_IMM64(DST, IMM)					\
 	BPF_LD_IMM64_RAW(DST, 0, IMM)
-- 
cgit v1.2.3


From a4b1d3c1ddf6cb441187b6c130a473c16a05a356 Mon Sep 17 00:00:00 2001
From: Jiong Wang <jiong.wang@netronome.com>
Date: Fri, 24 May 2019 23:25:15 +0100
Subject: bpf: verifier: insert zero extension according to analysis result

After the previous patches, the verifier will mark an insn if it really
needs zero extension on dst_reg.

It is then for back-ends to decide how to use such information to eliminate
unnecessary zero extension code-gen during JIT compilation.

One approach is for the verifier to insert explicit zero extensions, in a
generic way, for those insns that need them; JIT back-ends then do not
generate zero extension for sub-register writes by default.

However, only those back-ends which do not have hardware zero extension
want this optimization. Back-ends like x86_64 and AArch64 have hardware
zero extension support, so the insertion should be disabled for them.

This patch introduces a new target hook "bpf_jit_needs_zext" which returns
false by default, meaning verifier zero extension insertion is disabled by
default. A back-end could override this hook to return true if it doesn't
have hardware support and wants the verifier to insert zero extensions
explicitly.

Offload targets do not use this native target hook, instead, they could
get the optimization results using bpf_prog_offload_ops.finalize.

NOTE: arches could have diverse features; it is possible for one arch
to have hardware zero extension support for some sub-register write insns
but not for all. For example, PowerPC and SPARC have zero-extending loads,
but not alu32. So when verifier zero extension insertion is enabled, these
JIT back-ends need to peephole insns to remove the zero extensions inserted
for insns that actually have hardware zero extension support. The peephole
could be as simple as looking at the next insn: if it is a special zero
extension insn, then it is safe to eliminate it if the current insn has
hardware zero extension support.

Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: Jiong Wang <jiong.wang@netronome.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h    |  1 +
 include/linux/filter.h |  1 +
 kernel/bpf/core.c      |  9 +++++++++
 kernel/bpf/verifier.c  | 41 +++++++++++++++++++++++++++++++++++++++++
 4 files changed, 52 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 4fb3aa2dc975..d98141edb74b 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -370,6 +370,7 @@ struct bpf_prog_aux {
 	u32 id;
 	u32 func_cnt; /* used by non-func prog as the number of func progs */
 	u32 func_idx; /* 0 for non-func prog, the index in func array for func prog */
+	bool verifier_zext; /* Zero extensions has been inserted by verifier. */
 	bool offload_requested;
 	struct bpf_prog **func;
 	void *jit_data; /* JIT specific data. arch dependent */
diff --git a/include/linux/filter.h b/include/linux/filter.h
index bb10ffb88452..ba8b65270e0d 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -825,6 +825,7 @@ u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
 
 struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog);
 void bpf_jit_compile(struct bpf_prog *prog);
+bool bpf_jit_needs_zext(void);
 bool bpf_helper_changes_pkt_data(void *func);
 
 static inline bool bpf_dump_raw_ok(void)
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 242a643af82f..3675b19ecb90 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -2090,6 +2090,15 @@ bool __weak bpf_helper_changes_pkt_data(void *func)
 	return false;
 }
 
+/* Return TRUE if the JIT backend wants verifier to enable sub-register usage
+ * analysis code and wants explicit zero extension inserted by verifier.
+ * Otherwise, return FALSE.
+ */
+bool __weak bpf_jit_needs_zext(void)
+{
+	return false;
+}
+
 /* To execute LD_ABS/LD_IND instructions __bpf_prog_run() may call
  * skb_copy_bits(), so provide a weak definition of it for NET-less config.
  */
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index a6af3166acae..d4394a84b9eb 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -7640,6 +7640,38 @@ static int opt_remove_nops(struct bpf_verifier_env *env)
 	return 0;
 }
 
+/* Append a zext of dst_reg after each zext_dst insn; returns 0 or -ENOMEM. */
+static int opt_subreg_zext_lo32(struct bpf_verifier_env *env)
+{
+	struct bpf_insn_aux_data *aux = env->insn_aux_data;
+	struct bpf_insn *insns = env->prog->insnsi;
+	int i, delta = 0, len = env->prog->len;
+	struct bpf_insn zext_patch[2];
+	struct bpf_prog *new_prog;
+
+	zext_patch[1] = BPF_ZEXT_REG(0);
+	for (i = 0; i < len; i++) {
+		int adj_idx = i + delta;
+		struct bpf_insn insn;
+
+		if (!aux[adj_idx].zext_dst)
+			continue;
+		insn = insns[adj_idx];
+		zext_patch[0] = insn;
+		zext_patch[1].dst_reg = insn.dst_reg;
+		zext_patch[1].src_reg = insn.dst_reg;
+		new_prog = bpf_patch_insn_data(env, adj_idx, zext_patch, 2);
+		if (!new_prog)
+			return -ENOMEM;
+		env->prog = new_prog;
+		insns = new_prog->insnsi;
+		aux = env->insn_aux_data;
+		/* One insn was replaced by two, so later insns shifted by one. */
+		delta += 1;
+	}
+	return 0;
+}
+
 /* convert load instructions that access fields of a context type into a
  * sequence of instructions that access fields of the underlying structure:
  *     struct __sk_buff    -> struct sk_buff
@@ -8490,6 +8522,15 @@ skip_full_check:
 	if (ret == 0)
 		ret = fixup_bpf_calls(env);
 
+	/* do 32-bit optimization after insn patching has done so those patched
+	 * insns could be handled correctly.
+	 */
+	if (ret == 0 && bpf_jit_needs_zext() &&
+	    !bpf_prog_is_dev_bound(env->prog->aux)) {
+		ret = opt_subreg_zext_lo32(env);
+		env->prog->aux->verifier_zext = !ret;
+	}
+
 	if (ret == 0)
 		ret = fixup_call_args(env);
 
-- 
cgit v1.2.3


From 23634ebc1d946f19eb112d4455c1d84948875e31 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.ibm.com>
Date: Sun, 24 Mar 2019 15:25:51 -0700
Subject: rcu: Check for wakeup-safe conditions in rcu_read_unlock_special()

When RCU core processing is offloaded from RCU_SOFTIRQ to the rcuc
kthreads, a full and unconditional wakeup is required to initiate RCU
core processing.  In contrast, when RCU core processing is carried
out by RCU_SOFTIRQ, a raise_softirq() suffices.  Of course, there are
situations where raise_softirq() does a full wakeup, but these do not
occur with normal usage of rcu_read_unlock().

The reason that full wakeups can be problematic is that the scheduler
sometimes invokes rcu_read_unlock() with its pi or rq locks held,
which can of course result in deadlock in CONFIG_PREEMPT=y kernels when
rcu_read_unlock() invokes the scheduler.  Scheduler invocations can happen
in the following situations: (1) The just-ended reader has been subjected
to RCU priority boosting, in which case rcu_read_unlock() must deboost,
(2) Interrupts were disabled across the call to rcu_read_unlock(), so
the quiescent state must be deferred, requiring a wakeup of the rcuc
kthread corresponding to the current CPU.

Now, the scheduler may hold one of its locks across rcu_read_unlock()
only if preemption has been disabled across the entire RCU read-side
critical section, which in the days prior to RCU flavor consolidation
meant that rcu_read_unlock() never needed to do wakeups.  However, this
is no longer the case for any but the first rcu_read_unlock() following a
condition (e.g., preempted RCU reader) requiring special rcu_read_unlock()
attention.  For example, an RCU read-side critical section might be
preempted, but preemption might be disabled across the rcu_read_unlock().
The rcu_read_unlock() must defer the quiescent state, and therefore
leaves the task queued on its leaf rcu_node structure.  If a scheduler
interrupt occurs, the scheduler might well invoke rcu_read_unlock() with
one of its locks held.  However, the preempted task is still queued, so
rcu_read_unlock() will attempt to defer the quiescent state once more.
When RCU core processing is carried out by RCU_SOFTIRQ, this works just
fine: The raise_softirq() function simply sets a bit in a per-CPU mask
and the RCU core processing will be undertaken upon return from interrupt.

Not so when RCU core processing is carried out by the rcuc kthread: In this
case, the required wakeup can result in deadlock.

The initial solution to this problem was to use set_tsk_need_resched() and
set_preempt_need_resched() to force a future context switch, which allows
rcu_preempt_note_context_switch() to report the deferred quiescent state
to RCU's core processing.  Unfortunately for expedited grace periods,
there can be a significant delay between the call for a context switch
and the actual context switch.

This commit therefore introduces a ->deferred_qs flag to the task_struct
structure's rcu_special structure.  This flag is initially false, and
is set to true by the first call to rcu_read_unlock() requiring special
attention, then finally reset back to false when the quiescent state is
finally reported.  Then rcu_read_unlock() attempts full wakeups only when
->deferred_qs is false, that is, on the first rcu_read_unlock() requiring
special attention.  Note that a chain of RCU readers linked by some other
sort of reader may find that a later rcu_read_unlock() is once again able
to do a full wakeup, courtesy of an intervening preemption:

	rcu_read_lock();
	/* preempted */
	local_irq_disable();
	rcu_read_unlock(); /* Can do full wakeup, sets ->deferred_qs. */
	rcu_read_lock();
	local_irq_enable();
	preempt_disable();
	rcu_read_unlock(); /* Cannot do full wakeup, ->deferred_qs set. */
	rcu_read_lock();
	preempt_enable();
	/* preempted, ->deferred_qs reset. */
	local_irq_disable();
	rcu_read_unlock(); /* Can again do full wakeup, sets ->deferred_qs. */

Such linked RCU readers do not yet seem to appear in the Linux kernel, and
it is probably best if they don't.  However, RCU needs to handle them, and
some variations on this theme could make even raise_softirq() unsafe due to
the possibility of its doing a full wakeup.  This commit therefore also
avoids invoking raise_softirq() when the ->deferred_qs flag is set.

Signed-off-by: Paul E. McKenney <paulmck@linux.ibm.com>
Cc: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
 include/linux/sched.h    |  2 +-
 kernel/rcu/tree_plugin.h | 19 ++++++++++++++-----
 2 files changed, 15 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 11837410690f..942a44c1b8eb 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -565,7 +565,7 @@ union rcu_special {
 		u8			blocked;
 		u8			need_qs;
 		u8			exp_hint; /* Hint for performance. */
-		u8			pad; /* No garbage from compiler! */
+		u8			deferred_qs;
 	} b; /* Bits. */
 	u32 s; /* Set of bits. */
 };
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 21611862e083..75110ea75d01 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -455,6 +455,7 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags)
 		local_irq_restore(flags);
 		return;
 	}
+	t->rcu_read_unlock_special.b.deferred_qs = false;
 	if (special.b.need_qs) {
 		rcu_qs();
 		t->rcu_read_unlock_special.b.need_qs = false;
@@ -605,16 +606,24 @@ static void rcu_read_unlock_special(struct task_struct *t)
 	local_irq_save(flags);
 	irqs_were_disabled = irqs_disabled_flags(flags);
 	if (preempt_bh_were_disabled || irqs_were_disabled) {
-		WRITE_ONCE(t->rcu_read_unlock_special.b.exp_hint, false);
-		/* Need to defer quiescent state until everything is enabled. */
-		if (irqs_were_disabled && use_softirq) {
-			/* Enabling irqs does not reschedule, so... */
+		t->rcu_read_unlock_special.b.exp_hint = false;
+		// Need to defer quiescent state until everything is enabled.
+		if (irqs_were_disabled && use_softirq &&
+		    (in_irq() || !t->rcu_read_unlock_special.b.deferred_qs)) {
+			// Using softirq, safe to awaken, and we get
+			// no help from enabling irqs, unlike bh/preempt.
 			raise_softirq_irqoff(RCU_SOFTIRQ);
+		} else if (irqs_were_disabled && !use_softirq &&
+			   !t->rcu_read_unlock_special.b.deferred_qs) {
+			// Safe to awaken and we get no help from enabling
+			// irqs, unlike bh/preempt.
+			invoke_rcu_core();
 		} else {
-			/* Enabling BH or preempt does reschedule, so... */
+			// Enabling BH or preempt does reschedule, so...
 			set_tsk_need_resched(current);
 			set_preempt_need_resched();
 		}
+		t->rcu_read_unlock_special.b.deferred_qs = true;
 		local_irq_restore(flags);
 		return;
 	}
-- 
cgit v1.2.3


From 71d8d1531e0904e08adf1540e191bd707dfd73da Mon Sep 17 00:00:00 2001
From: "Joel Fernandes (Google)" <joel@joelfernandes.org>
Date: Tue, 26 Mar 2019 15:24:08 -0400
Subject: lockdep: Add assertion to check if in an interrupt

In rcu_is_cpu_rrupt_from_idle(), we want to check if it is called from
within an interrupt, but want to do such checking only for debug builds.
lockdep already tracks when we enter an interrupt. Let us expose it as an
assertion macro so it can be used to assert this.

Suggested-by: Steven Rostedt <rostedt@goodmis.org>
Cc: kernel-team@android.com
Cc: rcu@vger.kernel.org
Signed-off-by: Joel Fernandes (Google) <joel@joelfernandes.org>
Signed-off-by: Paul E. McKenney <paulmck@linux.ibm.com>
---
 include/linux/lockdep.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h
index 6e2377e6c1d6..e8eef38b2213 100644
--- a/include/linux/lockdep.h
+++ b/include/linux/lockdep.h
@@ -632,11 +632,18 @@ do {									\
 			  "IRQs not disabled as expected\n");		\
 	} while (0)
 
+#define lockdep_assert_in_irq() do {					\
+		WARN_ONCE(debug_locks && !current->lockdep_recursion &&	\
+			  !current->hardirq_context,			\
+			  "Not in hardirq as expected\n");		\
+	} while (0)
+
 #else
 # define might_lock(lock) do { } while (0)
 # define might_lock_read(lock) do { } while (0)
 # define lockdep_assert_irqs_enabled() do { } while (0)
 # define lockdep_assert_irqs_disabled() do { } while (0)
+# define lockdep_assert_in_irq() do { } while (0)
 #endif
 
 #ifdef CONFIG_LOCKDEP
-- 
cgit v1.2.3


From 1f58bb18f6f28d1df0b7144d90bc90ee5672416d Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 20 May 2019 13:44:57 +0100
Subject: mount_pseudo(): drop 'name' argument, switch to d_make_root()

Once upon a time we used to set ->d_name of e.g. pipefs root
so that d_path() on pipes would work.  These days it's
completely pointless - dentries of pipes are not even connected
to pipefs root.  However, mount_pseudo() had set the root
dentry name (passed as the second argument) and callers
kept inventing names to pass to it.  Including those that
didn't *have* any non-root dentries to start with...

All of that had been pointless for about 8 years now; it's
time to get rid of that cargo-culting...

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 arch/ia64/kernel/perfmon.c      |  2 +-
 drivers/dax/super.c             |  2 +-
 drivers/gpu/drm/drm_drv.c       |  6 +-----
 drivers/misc/cxl/api.c          |  3 +--
 drivers/scsi/cxlflash/ocxl_hw.c |  3 +--
 drivers/virtio/virtio_balloon.c |  3 +--
 fs/aio.c                        |  3 +--
 fs/anon_inodes.c                |  4 ++--
 fs/block_dev.c                  |  2 +-
 fs/btrfs/tests/btrfs-tests.c    |  2 +-
 fs/libfs.c                      | 12 +++---------
 fs/nsfs.c                       |  2 +-
 fs/pipe.c                       |  2 +-
 include/linux/fs.h              |  6 +++---
 mm/z3fold.c                     |  2 +-
 mm/zsmalloc.c                   |  2 +-
 net/socket.c                    |  2 +-
 17 files changed, 22 insertions(+), 36 deletions(-)

(limited to 'include/linux')

diff --git a/arch/ia64/kernel/perfmon.c b/arch/ia64/kernel/perfmon.c
index 7a969f4c3534..a30da6f2c28e 100644
--- a/arch/ia64/kernel/perfmon.c
+++ b/arch/ia64/kernel/perfmon.c
@@ -602,7 +602,7 @@ static const struct dentry_operations pfmfs_dentry_operations;
 static struct dentry *
 pfmfs_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data)
 {
-	return mount_pseudo(fs_type, "pfm:", NULL, &pfmfs_dentry_operations,
+	return mount_pseudo(fs_type, NULL, &pfmfs_dentry_operations,
 			PFMFS_MAGIC);
 }
 
diff --git a/drivers/dax/super.c b/drivers/dax/super.c
index 35f051efaf35..f83814eea5ad 100644
--- a/drivers/dax/super.c
+++ b/drivers/dax/super.c
@@ -440,7 +440,7 @@ static const struct super_operations dax_sops = {
 static struct dentry *dax_mount(struct file_system_type *fs_type,
 		int flags, const char *dev_name, void *data)
 {
-	return mount_pseudo(fs_type, "dax:", &dax_sops, NULL, DAXFS_MAGIC);
+	return mount_pseudo(fs_type, &dax_sops, NULL, DAXFS_MAGIC);
 }
 
 static struct file_system_type dax_fs_type = {
diff --git a/drivers/gpu/drm/drm_drv.c b/drivers/gpu/drm/drm_drv.c
index 8b44ac9a92ae..48365c62a190 100644
--- a/drivers/gpu/drm/drm_drv.c
+++ b/drivers/gpu/drm/drm_drv.c
@@ -535,11 +535,7 @@ static struct vfsmount *drm_fs_mnt;
 static struct dentry *drm_fs_mount(struct file_system_type *fs_type, int flags,
 				   const char *dev_name, void *data)
 {
-	return mount_pseudo(fs_type,
-			    "drm:",
-			    NULL,
-			    NULL,
-			    0x010203ff);
+	return mount_pseudo(fs_type, NULL, NULL, 0x010203ff);
 }
 
 static struct file_system_type drm_fs_type = {
diff --git a/drivers/misc/cxl/api.c b/drivers/misc/cxl/api.c
index a59c7af79873..1f2b0535a8cf 100644
--- a/drivers/misc/cxl/api.c
+++ b/drivers/misc/cxl/api.c
@@ -40,8 +40,7 @@ static struct vfsmount *cxl_vfs_mount;
 static struct dentry *cxl_fs_mount(struct file_system_type *fs_type, int flags,
 				const char *dev_name, void *data)
 {
-	return mount_pseudo(fs_type, "cxl:", NULL, NULL,
-			CXL_PSEUDO_FS_MAGIC);
+	return mount_pseudo(fs_type, NULL, NULL, CXL_PSEUDO_FS_MAGIC);
 }
 
 static struct file_system_type cxl_fs_type = {
diff --git a/drivers/scsi/cxlflash/ocxl_hw.c b/drivers/scsi/cxlflash/ocxl_hw.c
index 31cfdf2c8c30..38e1fbd2b406 100644
--- a/drivers/scsi/cxlflash/ocxl_hw.c
+++ b/drivers/scsi/cxlflash/ocxl_hw.c
@@ -48,8 +48,7 @@ static struct dentry *ocxlflash_fs_mount(struct file_system_type *fs_type,
 					 int flags, const char *dev_name,
 					 void *data)
 {
-	return mount_pseudo(fs_type, "ocxlflash:", NULL, NULL,
-			    OCXLFLASH_FS_MAGIC);
+	return mount_pseudo(fs_type, NULL, NULL, OCXLFLASH_FS_MAGIC);
 }
 
 static struct file_system_type ocxlflash_fs_type = {
diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c
index 554d1a98d193..62bafc4f2662 100644
--- a/drivers/virtio/virtio_balloon.c
+++ b/drivers/virtio/virtio_balloon.c
@@ -761,8 +761,7 @@ static int virtballoon_migratepage(struct balloon_dev_info *vb_dev_info,
 static struct dentry *balloon_mount(struct file_system_type *fs_type,
 		int flags, const char *dev_name, void *data)
 {
-	return mount_pseudo(fs_type, "balloon-kvm:", NULL, NULL,
-				BALLOON_KVM_MAGIC);
+	return mount_pseudo(fs_type, NULL, NULL, BALLOON_KVM_MAGIC);
 }
 
 static struct file_system_type balloon_fs = {
diff --git a/fs/aio.c b/fs/aio.c
index 3490d1fa0e16..09bc35fa6810 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -252,8 +252,7 @@ static struct file *aio_private_file(struct kioctx *ctx, loff_t nr_pages)
 static struct dentry *aio_mount(struct file_system_type *fs_type,
 				int flags, const char *dev_name, void *data)
 {
-	struct dentry *root = mount_pseudo(fs_type, "aio:", NULL, NULL,
-					   AIO_RING_MAGIC);
+	struct dentry *root = mount_pseudo(fs_type, NULL, NULL, AIO_RING_MAGIC);
 
 	if (!IS_ERR(root))
 		root->d_sb->s_iflags |= SB_I_NOEXEC;
diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c
index 91262c34b797..644d0837aafe 100644
--- a/fs/anon_inodes.c
+++ b/fs/anon_inodes.c
@@ -41,8 +41,8 @@ static const struct dentry_operations anon_inodefs_dentry_operations = {
 static struct dentry *anon_inodefs_mount(struct file_system_type *fs_type,
 				int flags, const char *dev_name, void *data)
 {
-	return mount_pseudo(fs_type, "anon_inode:", NULL,
-			&anon_inodefs_dentry_operations, ANON_INODE_FS_MAGIC);
+	return mount_pseudo(fs_type, NULL, &anon_inodefs_dentry_operations,
+			    ANON_INODE_FS_MAGIC);
 }
 
 static struct file_system_type anon_inode_fs_type = {
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 0f7552a87d54..3143da7b0998 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -837,7 +837,7 @@ static struct dentry *bd_mount(struct file_system_type *fs_type,
 	int flags, const char *dev_name, void *data)
 {
 	struct dentry *dent;
-	dent = mount_pseudo(fs_type, "bdev:", &bdev_sops, NULL, BDEVFS_MAGIC);
+	dent = mount_pseudo(fs_type, &bdev_sops, NULL, BDEVFS_MAGIC);
 	if (!IS_ERR(dent))
 		dent->d_sb->s_iflags |= SB_I_CGROUPWB;
 	return dent;
diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c
index 9238fd4f1734..6da54323eaf8 100644
--- a/fs/btrfs/tests/btrfs-tests.c
+++ b/fs/btrfs/tests/btrfs-tests.c
@@ -36,7 +36,7 @@ static struct dentry *btrfs_test_mount(struct file_system_type *fs_type,
 				       int flags, const char *dev_name,
 				       void *data)
 {
-	return mount_pseudo(fs_type, "btrfs_test:", &btrfs_test_super_ops,
+	return mount_pseudo(fs_type, &btrfs_test_super_ops,
 			    NULL, BTRFS_TEST_MAGIC);
 }
 
diff --git a/fs/libfs.c b/fs/libfs.c
index 4b59b1816efb..030e545f586e 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -239,14 +239,12 @@ static const struct super_operations simple_super_operations = {
  * Common helper for pseudo-filesystems (sockfs, pipefs, bdev - stuff that
  * will never be mountable)
  */
-struct dentry *mount_pseudo_xattr(struct file_system_type *fs_type, char *name,
+struct dentry *mount_pseudo_xattr(struct file_system_type *fs_type,
 	const struct super_operations *ops, const struct xattr_handler **xattr,
 	const struct dentry_operations *dops, unsigned long magic)
 {
 	struct super_block *s;
-	struct dentry *dentry;
 	struct inode *root;
-	struct qstr d_name = QSTR_INIT(name, strlen(name));
 
 	s = sget_userns(fs_type, NULL, set_anon_super, SB_KERNMOUNT|SB_NOUSER,
 			&init_user_ns, NULL);
@@ -271,13 +269,9 @@ struct dentry *mount_pseudo_xattr(struct file_system_type *fs_type, char *name,
 	root->i_ino = 1;
 	root->i_mode = S_IFDIR | S_IRUSR | S_IWUSR;
 	root->i_atime = root->i_mtime = root->i_ctime = current_time(root);
-	dentry = __d_alloc(s, &d_name);
-	if (!dentry) {
-		iput(root);
+	s->s_root = d_make_root(root);
+	if (!s->s_root)
 		goto Enomem;
-	}
-	d_instantiate(dentry, root);
-	s->s_root = dentry;
 	s->s_d_op = dops;
 	s->s_flags |= SB_ACTIVE;
 	return dget(s->s_root);
diff --git a/fs/nsfs.c b/fs/nsfs.c
index e3bf08c5af41..b3c49ddc0f85 100644
--- a/fs/nsfs.c
+++ b/fs/nsfs.c
@@ -261,7 +261,7 @@ static const struct super_operations nsfs_ops = {
 static struct dentry *nsfs_mount(struct file_system_type *fs_type,
 			int flags, const char *dev_name, void *data)
 {
-	return mount_pseudo(fs_type, "nsfs:", &nsfs_ops,
+	return mount_pseudo(fs_type, &nsfs_ops,
 			&ns_dentry_operations, NSFS_MAGIC);
 }
 static struct file_system_type nsfs = {
diff --git a/fs/pipe.c b/fs/pipe.c
index 41065901106b..99a023730e6f 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -1185,7 +1185,7 @@ static const struct super_operations pipefs_ops = {
 static struct dentry *pipefs_mount(struct file_system_type *fs_type,
 			 int flags, const char *dev_name, void *data)
 {
-	return mount_pseudo(fs_type, "pipe:", &pipefs_ops,
+	return mount_pseudo(fs_type, &pipefs_ops,
 			&pipefs_dentry_operations, PIPEFS_MAGIC);
 }
 
diff --git a/include/linux/fs.h b/include/linux/fs.h
index f7fdfe93e25d..b06251dd429f 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2257,18 +2257,18 @@ struct super_block *sget(struct file_system_type *type,
 			int (*test)(struct super_block *,void *),
 			int (*set)(struct super_block *,void *),
 			int flags, void *data);
-extern struct dentry *mount_pseudo_xattr(struct file_system_type *, char *,
+extern struct dentry *mount_pseudo_xattr(struct file_system_type *,
 					 const struct super_operations *ops,
 					 const struct xattr_handler **xattr,
 					 const struct dentry_operations *dops,
 					 unsigned long);
 
 static inline struct dentry *
-mount_pseudo(struct file_system_type *fs_type, char *name,
+mount_pseudo(struct file_system_type *fs_type,
 	     const struct super_operations *ops,
 	     const struct dentry_operations *dops, unsigned long magic)
 {
-	return mount_pseudo_xattr(fs_type, name, ops, NULL, dops, magic);
+	return mount_pseudo_xattr(fs_type, ops, NULL, dops, magic);
 }
 
 /* Alas, no aliases. Too much hassle with bringing module.h everywhere */
diff --git a/mm/z3fold.c b/mm/z3fold.c
index 0b14daf930a8..abeb5bcbea57 100644
--- a/mm/z3fold.c
+++ b/mm/z3fold.c
@@ -242,7 +242,7 @@ static inline void free_handle(unsigned long handle)
 static struct dentry *z3fold_do_mount(struct file_system_type *fs_type,
 				int flags, const char *dev_name, void *data)
 {
-	return mount_pseudo(fs_type, "z3fold:", NULL, NULL, 0x33);
+	return mount_pseudo(fs_type, NULL, NULL, 0x33);
 }
 
 static struct file_system_type z3fold_fs = {
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index d9f831f63625..ef230be8c03e 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -1817,7 +1817,7 @@ static void lock_zspage(struct zspage *zspage)
 static struct dentry *zs_mount(struct file_system_type *fs_type,
 				int flags, const char *dev_name, void *data)
 {
-	return mount_pseudo(fs_type, "zsmalloc:", NULL, NULL, ZSMALLOC_MAGIC);
+	return mount_pseudo(fs_type, NULL, NULL, ZSMALLOC_MAGIC);
 }
 
 static struct file_system_type zsmalloc_fs = {
diff --git a/net/socket.c b/net/socket.c
index 472fbefa5d9b..c86679584eed 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -362,7 +362,7 @@ static const struct xattr_handler *sockfs_xattr_handlers[] = {
 static struct dentry *sockfs_mount(struct file_system_type *fs_type,
 			 int flags, const char *dev_name, void *data)
 {
-	return mount_pseudo_xattr(fs_type, "socket:", &sockfs_ops,
+	return mount_pseudo_xattr(fs_type, &sockfs_ops,
 				  sockfs_xattr_handlers,
 				  &sockfs_dentry_operations, SOCKFS_MAGIC);
 }
-- 
cgit v1.2.3


From bb7b6b2bbdb827e68cd506c8f5e3ba13215cccb2 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Mon, 25 Mar 2019 16:38:28 +0000
Subject: vfs: Kill mount_ns()

Kill mount_ns() as it has been replaced by vfs_get_super() in the new mount
API.

Signed-off-by: David Howells <dhowells@redhat.com>
cc: linux-fsdevel@vger.kernel.org
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/super.c         | 38 --------------------------------------
 include/linux/fs.h |  3 ---
 2 files changed, 41 deletions(-)

(limited to 'include/linux')

diff --git a/fs/super.c b/fs/super.c
index 3ba91d70c2a8..6919f5c728f0 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -1143,44 +1143,6 @@ void kill_litter_super(struct super_block *sb)
 }
 EXPORT_SYMBOL(kill_litter_super);
 
-static int ns_test_super(struct super_block *sb, void *data)
-{
-	return sb->s_fs_info == data;
-}
-
-static int ns_set_super(struct super_block *sb, void *data)
-{
-	sb->s_fs_info = data;
-	return set_anon_super(sb, NULL);
-}
-
-struct dentry *mount_ns(struct file_system_type *fs_type,
-	int flags, void *data, void *ns, struct user_namespace *user_ns,
-	int (*fill_super)(struct super_block *, void *, int))
-{
-	struct super_block *sb;
-
-	sb = sget_userns(fs_type, ns_test_super, ns_set_super, flags,
-			 user_ns, ns);
-	if (IS_ERR(sb))
-		return ERR_CAST(sb);
-
-	if (!sb->s_root) {
-		int err;
-		err = fill_super(sb, data, flags & SB_SILENT ? 1 : 0);
-		if (err) {
-			deactivate_locked_super(sb);
-			return ERR_PTR(err);
-		}
-
-		sb->s_flags |= SB_ACTIVE;
-	}
-
-	return dget(sb->s_root);
-}
-
-EXPORT_SYMBOL(mount_ns);
-
 int set_anon_super_fc(struct super_block *sb, struct fs_context *fc)
 {
 	return set_anon_super(sb, NULL);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index b06251dd429f..790342cf4df9 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2206,9 +2206,6 @@ struct file_system_type {
 
 #define MODULE_ALIAS_FS(NAME) MODULE_ALIAS("fs-" NAME)
 
-extern struct dentry *mount_ns(struct file_system_type *fs_type,
-	int flags, void *data, void *ns, struct user_namespace *user_ns,
-	int (*fill_super)(struct super_block *, void *, int));
 #ifdef CONFIG_BLOCK
 extern struct dentry *mount_bdev(struct file_system_type *fs_type,
 	int flags, const char *dev_name, void *data,
-- 
cgit v1.2.3


From c80fa7c8301c10ad10d997b9e86b4aeac5923b3e Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Mon, 25 Mar 2019 16:38:23 +0000
Subject: vfs: Provide sb->s_iflags settings in fs_context struct

Provide a field in the fs_context struct through which bits in the
sb->s_iflags superblock field can be set.

Signed-off-by: David Howells <dhowells@redhat.com>
cc: linux-fsdevel@vger.kernel.org
---
 fs/super.c                 | 1 +
 include/linux/fs_context.h | 1 +
 2 files changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/fs/super.c b/fs/super.c
index 72b4a5afcfd6..f836b67abffe 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -540,6 +540,7 @@ retry:
 	}
 	fc->s_fs_info = NULL;
 	s->s_type = fc->fs_type;
+	s->s_iflags |= fc->s_iflags;
 	strlcpy(s->s_id, s->s_type->name, sizeof(s->s_id));
 	list_add_tail(&s->s_list, &super_blocks);
 	hlist_add_head(&s->s_instances, &s->s_type->fs_supers);
diff --git a/include/linux/fs_context.h b/include/linux/fs_context.h
index 1f966670c8dc..c995b852ba40 100644
--- a/include/linux/fs_context.h
+++ b/include/linux/fs_context.h
@@ -103,6 +103,7 @@ struct fs_context {
 	void			*s_fs_info;	/* Proposed s_fs_info */
 	unsigned int		sb_flags;	/* Proposed superblock flags (SB_*) */
 	unsigned int		sb_flags_mask;	/* Superblock flags that were changed */
+	unsigned int		s_iflags;	/* OR'd with sb->s_iflags */
 	unsigned int		lsm_flags;	/* Information flags from the fs to the LSM */
 	enum fs_context_purpose	purpose:8;
 	enum fs_context_phase	phase:8;	/* The phase the context is in */
-- 
cgit v1.2.3


From 31d6d5ce53400d6dc58e29ddd8dc184b3ba89d66 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Mon, 25 Mar 2019 16:38:23 +0000
Subject: vfs: Provide a mount_pseudo-replacement for the new mount API

Provide a function, init_pseudo(), that provides a common
infrastructure for converting pseudo-filesystems that can never be
mountable.

[AV: once all users of mount_pseudo_xattr() get converted, it will be folded
into pseudo_fs_get_tree()]

Signed-off-by: David Howells <dhowells@redhat.com>
cc: linux-fsdevel@vger.kernel.org
---
 fs/libfs.c                | 46 ++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/pseudo_fs.h | 16 ++++++++++++++++
 2 files changed, 62 insertions(+)
 create mode 100644 include/linux/pseudo_fs.h

(limited to 'include/linux')

diff --git a/fs/libfs.c b/fs/libfs.c
index 030e545f586e..edef70d35438 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -16,6 +16,8 @@
 #include <linux/exportfs.h>
 #include <linux/writeback.h>
 #include <linux/buffer_head.h> /* sync_mapping_buffers */
+#include <linux/fs_context.h>
+#include <linux/pseudo_fs.h>
 
 #include <linux/uaccess.h>
 
@@ -235,6 +237,50 @@ static const struct super_operations simple_super_operations = {
 	.statfs		= simple_statfs,
 };
 
+static int pseudo_fs_get_tree(struct fs_context *fc)
+{
+	struct pseudo_fs_context *ctx = fc->fs_private;
+	struct dentry *root;
+
+	root = mount_pseudo_xattr(fc->fs_type,
+				  ctx->ops, ctx->xattr,
+			          ctx->dops, ctx->magic);
+	if (IS_ERR(root))
+		return PTR_ERR(root);
+
+	fc->root = root;
+	return 0;
+}
+
+static void pseudo_fs_free(struct fs_context *fc)
+{
+	kfree(fc->fs_private);
+}
+
+static const struct fs_context_operations pseudo_fs_context_ops = {
+	.free		= pseudo_fs_free,
+	.get_tree	= pseudo_fs_get_tree,
+};
+
+/*
+ * Common helper for pseudo-filesystems (sockfs, pipefs, bdev - stuff that
+ * will never be mountable)
+ */
+struct pseudo_fs_context *init_pseudo(struct fs_context *fc,
+					unsigned long magic)
+{
+	struct pseudo_fs_context *ctx;
+
+	ctx = kzalloc(sizeof(struct pseudo_fs_context), GFP_KERNEL);
+	if (likely(ctx)) {
+		ctx->magic = magic;
+		fc->fs_private = ctx;
+		fc->ops = &pseudo_fs_context_ops;
+	}
+	return ctx;
+}
+EXPORT_SYMBOL(init_pseudo);
+
 /*
  * Common helper for pseudo-filesystems (sockfs, pipefs, bdev - stuff that
  * will never be mountable)
diff --git a/include/linux/pseudo_fs.h b/include/linux/pseudo_fs.h
new file mode 100644
index 000000000000..eceda1d1407a
--- /dev/null
+++ b/include/linux/pseudo_fs.h
@@ -0,0 +1,16 @@
+#ifndef __LINUX_PSEUDO_FS__
+#define __LINUX_PSEUDO_FS__
+
+#include <linux/fs_context.h>
+
+struct pseudo_fs_context {
+	const struct super_operations *ops;
+	const struct xattr_handler **xattr;
+	const struct dentry_operations *dops;
+	unsigned long magic;
+};
+
+struct pseudo_fs_context *init_pseudo(struct fs_context *fc,
+				      unsigned long magic);
+
+#endif
-- 
cgit v1.2.3


From 8d9e46d80777b484f8f0945c317ad618224d7811 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sat, 11 May 2019 11:43:59 -0400
Subject: fold mount_pseudo_xattr() into pseudo_fs_get_tree()

... now that all other callers are gone

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/libfs.c         | 88 +++++++++++++++++++++---------------------------------
 include/linux/fs.h | 13 --------
 2 files changed, 34 insertions(+), 67 deletions(-)

(limited to 'include/linux')

diff --git a/fs/libfs.c b/fs/libfs.c
index edef70d35438..7df3c9a85f6b 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -240,16 +240,43 @@ static const struct super_operations simple_super_operations = {
 static int pseudo_fs_get_tree(struct fs_context *fc)
 {
 	struct pseudo_fs_context *ctx = fc->fs_private;
-	struct dentry *root;
+	struct super_block *s;
+	struct inode *root;
 
-	root = mount_pseudo_xattr(fc->fs_type,
-				  ctx->ops, ctx->xattr,
-			          ctx->dops, ctx->magic);
-	if (IS_ERR(root))
-		return PTR_ERR(root);
+	s = sget_userns(fc->fs_type, NULL, set_anon_super, SB_KERNMOUNT|SB_NOUSER,
+			&init_user_ns, NULL);
+	if (IS_ERR(s))
+		return PTR_ERR(s);
 
-	fc->root = root;
+	s->s_maxbytes = MAX_LFS_FILESIZE;
+	s->s_blocksize = PAGE_SIZE;
+	s->s_blocksize_bits = PAGE_SHIFT;
+	s->s_magic = ctx->magic;
+	s->s_op = ctx->ops ?: &simple_super_operations;
+	s->s_xattr = ctx->xattr;
+	s->s_time_gran = 1;
+	root = new_inode(s);
+	if (!root)
+		goto Enomem;
+	/*
+	 * since this is the first inode, make it number 1. New inodes created
+	 * after this must take care not to collide with it (by passing
+	 * max_reserved of 1 to iunique).
+	 */
+	root->i_ino = 1;
+	root->i_mode = S_IFDIR | S_IRUSR | S_IWUSR;
+	root->i_atime = root->i_mtime = root->i_ctime = current_time(root);
+	s->s_root = d_make_root(root);
+	if (!s->s_root)
+		goto Enomem;
+	s->s_d_op = ctx->dops;
+	s->s_flags |= SB_ACTIVE;
+	fc->root = dget(s->s_root);
 	return 0;
+
+Enomem:
+	deactivate_locked_super(s);
+	return -ENOMEM;
 }
 
 static void pseudo_fs_free(struct fs_context *fc)
@@ -281,53 +308,6 @@ struct pseudo_fs_context *init_pseudo(struct fs_context *fc,
 }
 EXPORT_SYMBOL(init_pseudo);
 
-/*
- * Common helper for pseudo-filesystems (sockfs, pipefs, bdev - stuff that
- * will never be mountable)
- */
-struct dentry *mount_pseudo_xattr(struct file_system_type *fs_type,
-	const struct super_operations *ops, const struct xattr_handler **xattr,
-	const struct dentry_operations *dops, unsigned long magic)
-{
-	struct super_block *s;
-	struct inode *root;
-
-	s = sget_userns(fs_type, NULL, set_anon_super, SB_KERNMOUNT|SB_NOUSER,
-			&init_user_ns, NULL);
-	if (IS_ERR(s))
-		return ERR_CAST(s);
-
-	s->s_maxbytes = MAX_LFS_FILESIZE;
-	s->s_blocksize = PAGE_SIZE;
-	s->s_blocksize_bits = PAGE_SHIFT;
-	s->s_magic = magic;
-	s->s_op = ops ? ops : &simple_super_operations;
-	s->s_xattr = xattr;
-	s->s_time_gran = 1;
-	root = new_inode(s);
-	if (!root)
-		goto Enomem;
-	/*
-	 * since this is the first inode, make it number 1. New inodes created
-	 * after this must take care not to collide with it (by passing
-	 * max_reserved of 1 to iunique).
-	 */
-	root->i_ino = 1;
-	root->i_mode = S_IFDIR | S_IRUSR | S_IWUSR;
-	root->i_atime = root->i_mtime = root->i_ctime = current_time(root);
-	s->s_root = d_make_root(root);
-	if (!s->s_root)
-		goto Enomem;
-	s->s_d_op = dops;
-	s->s_flags |= SB_ACTIVE;
-	return dget(s->s_root);
-
-Enomem:
-	deactivate_locked_super(s);
-	return ERR_PTR(-ENOMEM);
-}
-EXPORT_SYMBOL(mount_pseudo_xattr);
-
 int simple_open(struct inode *inode, struct file *file)
 {
 	if (inode->i_private)
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 790342cf4df9..d625acabbfcf 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2254,19 +2254,6 @@ struct super_block *sget(struct file_system_type *type,
 			int (*test)(struct super_block *,void *),
 			int (*set)(struct super_block *,void *),
 			int flags, void *data);
-extern struct dentry *mount_pseudo_xattr(struct file_system_type *,
-					 const struct super_operations *ops,
-					 const struct xattr_handler **xattr,
-					 const struct dentry_operations *dops,
-					 unsigned long);
-
-static inline struct dentry *
-mount_pseudo(struct file_system_type *fs_type,
-	     const struct super_operations *ops,
-	     const struct dentry_operations *dops, unsigned long magic)
-{
-	return mount_pseudo_xattr(fs_type, ops, NULL, dops, magic);
-}
 
 /* Alas, no aliases. Too much hassle with bringing module.h everywhere */
 #define fops_get(fops) \
-- 
cgit v1.2.3


From 023d066a0d0a87696c04b0de2ceae53063d0b655 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Mon, 25 Mar 2019 16:38:28 +0000
Subject: vfs: Kill sget_userns()

Kill sget_userns(), folding it into sget() as that's the only remaining
user.

Signed-off-by: David Howells <dhowells@redhat.com>
cc: linux-fsdevel@vger.kernel.org
---
 fs/super.c         | 54 ++++++++++++++++--------------------------------------
 include/linux/fs.h |  5 -----
 2 files changed, 16 insertions(+), 43 deletions(-)

(limited to 'include/linux')

diff --git a/fs/super.c b/fs/super.c
index f836b67abffe..ca2302501d32 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -563,24 +563,31 @@ share_extant_sb:
 EXPORT_SYMBOL(sget_fc);
 
 /**
- *	sget_userns -	find or create a superblock
- *	@type:	filesystem type superblock should belong to
- *	@test:	comparison callback
- *	@set:	setup callback
- *	@flags:	mount flags
- *	@user_ns: User namespace for the super_block
- *	@data:	argument to each of them
+ *	sget	-	find or create a superblock
+ *	@type:	  filesystem type superblock should belong to
+ *	@test:	  comparison callback
+ *	@set:	  setup callback
+ *	@flags:	  mount flags
+ *	@data:	  argument to each of them
  */
-struct super_block *sget_userns(struct file_system_type *type,
+struct super_block *sget(struct file_system_type *type,
 			int (*test)(struct super_block *,void *),
 			int (*set)(struct super_block *,void *),
-			int flags, struct user_namespace *user_ns,
+			int flags,
 			void *data)
 {
+	struct user_namespace *user_ns = current_user_ns();
 	struct super_block *s = NULL;
 	struct super_block *old;
 	int err;
 
+	/* We don't yet pass the user namespace of the parent
+	 * mount through to here so always use &init_user_ns
+	 * until that changes.
+	 */
+	if (flags & SB_SUBMOUNT)
+		user_ns = &init_user_ns;
+
 retry:
 	spin_lock(&sb_lock);
 	if (test) {
@@ -621,35 +628,6 @@ retry:
 	register_shrinker_prepared(&s->s_shrink);
 	return s;
 }
-
-EXPORT_SYMBOL(sget_userns);
-
-/**
- *	sget	-	find or create a superblock
- *	@type:	  filesystem type superblock should belong to
- *	@test:	  comparison callback
- *	@set:	  setup callback
- *	@flags:	  mount flags
- *	@data:	  argument to each of them
- */
-struct super_block *sget(struct file_system_type *type,
-			int (*test)(struct super_block *,void *),
-			int (*set)(struct super_block *,void *),
-			int flags,
-			void *data)
-{
-	struct user_namespace *user_ns = current_user_ns();
-
-	/* We don't yet pass the user namespace of the parent
-	 * mount through to here so always use &init_user_ns
-	 * until that changes.
-	 */
-	if (flags & SB_SUBMOUNT)
-		user_ns = &init_user_ns;
-
-	return sget_userns(type, test, set, flags, user_ns, data);
-}
-
 EXPORT_SYMBOL(sget);
 
 void drop_super(struct super_block *sb)
diff --git a/include/linux/fs.h b/include/linux/fs.h
index d625acabbfcf..71421856ff2c 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2245,11 +2245,6 @@ void free_anon_bdev(dev_t);
 struct super_block *sget_fc(struct fs_context *fc,
 			    int (*test)(struct super_block *, struct fs_context *),
 			    int (*set)(struct super_block *, struct fs_context *));
-struct super_block *sget_userns(struct file_system_type *type,
-			int (*test)(struct super_block *,void *),
-			int (*set)(struct super_block *,void *),
-			int flags, struct user_namespace *user_ns,
-			void *data);
 struct super_block *sget(struct file_system_type *type,
 			int (*test)(struct super_block *,void *),
 			int (*set)(struct super_block *,void *),
-- 
cgit v1.2.3


From 7375dca1647fa978310f2d706ddbff537f72110b Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (VMware)" <rostedt@goodmis.org>
Date: Mon, 20 May 2019 09:26:24 -0400
Subject: ftrace: Make enable and update parameters bool when applicable

The code modification functions have "enable" and "update" variables that
are sometimes "int" but used as "bool". Remove the ambiguity and make them
"bool" when they are only used for true or false values.

Link: http://lkml.kernel.org/r/e1429923d9eda92a3cf5ee9e33c7eacce539781d.1558115654.git.naveen.n.rao@linux.vnet.ibm.com

Reported-by: "Naveen N. Rao" <naveen.n.rao@linux.vnet.ibm.com>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 include/linux/ftrace.h |  4 ++--
 kernel/trace/ftrace.c  | 20 ++++++++++----------
 2 files changed, 12 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 25e2995d4a4c..8a8cb3c401b2 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -427,8 +427,8 @@ struct dyn_ftrace *ftrace_rec_iter_record(struct ftrace_rec_iter *iter);
 	     iter = ftrace_rec_iter_next(iter))
 
 
-int ftrace_update_record(struct dyn_ftrace *rec, int enable);
-int ftrace_test_record(struct dyn_ftrace *rec, int enable);
+int ftrace_update_record(struct dyn_ftrace *rec, bool enable);
+int ftrace_test_record(struct dyn_ftrace *rec, bool enable);
 void ftrace_run_stop_machine(int command);
 unsigned long ftrace_location(unsigned long ip);
 unsigned long ftrace_location_range(unsigned long start, unsigned long end);
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index a12aff849c04..4f2c26bebe2a 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -1768,7 +1768,7 @@ static bool __ftrace_hash_rec_update(struct ftrace_ops *ops,
 		count++;
 
 		/* Must match FTRACE_UPDATE_CALLS in ftrace_modify_all_code() */
-		update |= ftrace_test_record(rec, 1) != FTRACE_UPDATE_IGNORE;
+		update |= ftrace_test_record(rec, true) != FTRACE_UPDATE_IGNORE;
 
 		/* Shortcut, if we handled all records, we are done. */
 		if (!all && count == hash->count)
@@ -2047,7 +2047,7 @@ void ftrace_bug(int failed, struct dyn_ftrace *rec)
 	}
 }
 
-static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update)
+static int ftrace_check_record(struct dyn_ftrace *rec, bool enable, bool update)
 {
 	unsigned long flag = 0UL;
 
@@ -2146,28 +2146,28 @@ static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update)
 /**
  * ftrace_update_record, set a record that now is tracing or not
  * @rec: the record to update
- * @enable: set to 1 if the record is tracing, zero to force disable
+ * @enable: set to true if the record is tracing, false to force disable
  *
  * The records that represent all functions that can be traced need
  * to be updated when tracing has been enabled.
  */
-int ftrace_update_record(struct dyn_ftrace *rec, int enable)
+int ftrace_update_record(struct dyn_ftrace *rec, bool enable)
 {
-	return ftrace_check_record(rec, enable, 1);
+	return ftrace_check_record(rec, enable, true);
 }
 
 /**
  * ftrace_test_record, check if the record has been enabled or not
  * @rec: the record to test
- * @enable: set to 1 to check if enabled, 0 if it is disabled
+ * @enable: set to true to check if enabled, false if it is disabled
  *
  * The arch code may need to test if a record is already set to
  * tracing to determine how to modify the function code that it
  * represents.
  */
-int ftrace_test_record(struct dyn_ftrace *rec, int enable)
+int ftrace_test_record(struct dyn_ftrace *rec, bool enable)
 {
-	return ftrace_check_record(rec, enable, 0);
+	return ftrace_check_record(rec, enable, false);
 }
 
 static struct ftrace_ops *
@@ -2356,7 +2356,7 @@ unsigned long ftrace_get_addr_curr(struct dyn_ftrace *rec)
 }
 
 static int
-__ftrace_replace_code(struct dyn_ftrace *rec, int enable)
+__ftrace_replace_code(struct dyn_ftrace *rec, bool enable)
 {
 	unsigned long ftrace_old_addr;
 	unsigned long ftrace_addr;
@@ -2395,7 +2395,7 @@ void __weak ftrace_replace_code(int mod_flags)
 {
 	struct dyn_ftrace *rec;
 	struct ftrace_page *pg;
-	int enable = mod_flags & FTRACE_MODIFY_ENABLE_FL;
+	bool enable = mod_flags & FTRACE_MODIFY_ENABLE_FL;
 	int schedulable = mod_flags & FTRACE_MODIFY_MAY_SLEEP_FL;
 	int failed;
 
-- 
cgit v1.2.3


From 2d8d8fac3b4eab035dcd0068e1f5a746a697fbb3 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@kernel.org>
Date: Wed, 15 May 2019 14:38:06 +0900
Subject: x86/uaccess: Allow access_ok() in irq context if pagefault_disabled

WARN_ON_IN_IRQ() assumes that access_ok() and the following
user memory access can sleep. But this assumption is not
always correct; when pagefaults are disabled, the following
memory access will just return -EFAULT and never sleep.

Add a pagefault_disabled() check to WARN_ON_ONCE() so that
it ignores the case where it is called with pagefaults disabled.
For this purpose, convert pagefault_disabled() into
an inline function.

Link: http://lkml.kernel.org/r/155789868664.26965.7932665824135793317.stgit@devnote2

Acked-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Masami Hiramatsu <mhiramat@kernel.org>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 arch/x86/include/asm/uaccess.h | 4 +++-
 include/linux/uaccess.h        | 5 ++++-
 2 files changed, 7 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
index c82abd6e4ca3..9c4435307ff8 100644
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -66,7 +66,9 @@ static inline bool __chk_range_not_ok(unsigned long addr, unsigned long size, un
 })
 
 #ifdef CONFIG_DEBUG_ATOMIC_SLEEP
-# define WARN_ON_IN_IRQ()	WARN_ON_ONCE(!in_task())
+static inline bool pagefault_disabled(void);
+# define WARN_ON_IN_IRQ()	\
+	WARN_ON_ONCE(!in_task() && !pagefault_disabled())
 #else
 # define WARN_ON_IN_IRQ()
 #endif
diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h
index 2b70130af585..5a43ef7db492 100644
--- a/include/linux/uaccess.h
+++ b/include/linux/uaccess.h
@@ -203,7 +203,10 @@ static inline void pagefault_enable(void)
 /*
  * Is the pagefault handler disabled? If so, user access methods will not sleep.
  */
-#define pagefault_disabled() (current->pagefault_disabled != 0)
+static inline bool pagefault_disabled(void)
+{
+	return current->pagefault_disabled != 0;
+}
 
 /*
  * The pagefault handler is in general disabled by pagefault_disable() or
-- 
cgit v1.2.3


From 3d7081822f7f9eab867d9bcc8fd635208ec438e0 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@kernel.org>
Date: Wed, 15 May 2019 14:38:18 +0900
Subject: uaccess: Add non-pagefault user-space read functions

Add probe_user_read(), strncpy_from_unsafe_user() and
strnlen_unsafe_user(), which allow callers to access user-space
memory in IRQ context.

The current probe_kernel_read() and strncpy_from_unsafe() cannot
be used for user-space memory, because they set
KERNEL_DS while accessing data. On some architectures the user
address space and the kernel address space can co-exist, but on
others they cannot. In that case, setting KERNEL_DS means the given
address is treated as a kernel address.
Also, strnlen_user() is only available from user context since
it can sleep if pagefaults are enabled.

To access user-space memory without pagefaults, we need
these new functions, which set USER_DS while accessing
the data.

Link: http://lkml.kernel.org/r/155789869802.26965.4940338412595759063.stgit@devnote2

Acked-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Masami Hiramatsu <mhiramat@kernel.org>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 include/linux/uaccess.h |  14 ++++++
 mm/maccess.c            | 122 +++++++++++++++++++++++++++++++++++++++++++++---
 2 files changed, 130 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h
index 5a43ef7db492..9c435c3f2105 100644
--- a/include/linux/uaccess.h
+++ b/include/linux/uaccess.h
@@ -242,6 +242,17 @@ static inline unsigned long __copy_from_user_inatomic_nocache(void *to,
 extern long probe_kernel_read(void *dst, const void *src, size_t size);
 extern long __probe_kernel_read(void *dst, const void *src, size_t size);
 
+/*
+ * probe_user_read(): safely attempt to read from a location in user space
+ * @dst: pointer to the buffer that shall take the data
+ * @src: address to read from
+ * @size: size of the data chunk
+ *
+ * Safely read from address @src to the buffer at @dst.  If a kernel fault
+ * happens, handle that and return -EFAULT.
+ */
+extern long probe_user_read(void *dst, const void __user *src, size_t size);
+
 /*
  * probe_kernel_write(): safely attempt to write to a location
  * @dst: address to write to
@@ -255,6 +266,9 @@ extern long notrace probe_kernel_write(void *dst, const void *src, size_t size);
 extern long notrace __probe_kernel_write(void *dst, const void *src, size_t size);
 
 extern long strncpy_from_unsafe(char *dst, const void *unsafe_addr, long count);
+extern long strncpy_from_unsafe_user(char *dst, const void __user *unsafe_addr,
+				     long count);
+extern long strnlen_unsafe_user(const void __user *unsafe_addr, long count);
 
 /**
  * probe_kernel_address(): safely attempt to read from a location
diff --git a/mm/maccess.c b/mm/maccess.c
index ec00be51a24f..19c8c3dc14df 100644
--- a/mm/maccess.c
+++ b/mm/maccess.c
@@ -5,8 +5,20 @@
 #include <linux/mm.h>
 #include <linux/uaccess.h>
 
+static __always_inline long
+probe_read_common(void *dst, const void __user *src, size_t size)
+{
+	long ret;
+
+	pagefault_disable();
+	ret = __copy_from_user_inatomic(dst, src, size);
+	pagefault_enable();
+
+	return ret ? -EFAULT : 0;
+}
+
 /**
- * probe_kernel_read(): safely attempt to read from a location
+ * probe_kernel_read(): safely attempt to read from a kernel-space location
  * @dst: pointer to the buffer that shall take the data
  * @src: address to read from
  * @size: size of the data chunk
@@ -29,16 +41,40 @@ long __probe_kernel_read(void *dst, const void *src, size_t size)
 	mm_segment_t old_fs = get_fs();
 
 	set_fs(KERNEL_DS);
-	pagefault_disable();
-	ret = __copy_from_user_inatomic(dst,
-			(__force const void __user *)src, size);
-	pagefault_enable();
+	ret = probe_read_common(dst, (__force const void __user *)src, size);
 	set_fs(old_fs);
 
-	return ret ? -EFAULT : 0;
+	return ret;
 }
 EXPORT_SYMBOL_GPL(probe_kernel_read);
 
+/**
+ * probe_user_read(): safely attempt to read from a user-space location
+ * @dst: pointer to the buffer that shall take the data
+ * @src: address to read from. This must be a user address.
+ * @size: size of the data chunk
+ *
+ * Safely read from user address @src to the buffer at @dst. If a kernel fault
+ * happens, handle that and return -EFAULT.
+ */
+
+long __weak probe_user_read(void *dst, const void __user *src, size_t size)
+    __attribute__((alias("__probe_user_read")));
+
+long __probe_user_read(void *dst, const void __user *src, size_t size)
+{
+	long ret = -EFAULT;
+	mm_segment_t old_fs = get_fs();
+
+	set_fs(USER_DS);
+	if (access_ok(src, size))
+		ret = probe_read_common(dst, src, size);
+	set_fs(old_fs);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(probe_user_read);
+
 /**
  * probe_kernel_write(): safely attempt to write to a location
  * @dst: address to write to
@@ -66,6 +102,7 @@ long __probe_kernel_write(void *dst, const void *src, size_t size)
 }
 EXPORT_SYMBOL_GPL(probe_kernel_write);
 
+
 /**
  * strncpy_from_unsafe: - Copy a NUL terminated string from unsafe address.
  * @dst:   Destination address, in kernel space.  This buffer must be at
@@ -105,3 +142,76 @@ long strncpy_from_unsafe(char *dst, const void *unsafe_addr, long count)
 
 	return ret ? -EFAULT : src - unsafe_addr;
 }
+
+/**
+ * strncpy_from_unsafe_user: - Copy a NUL terminated string from unsafe user
+ *				address.
+ * @dst:   Destination address, in kernel space.  This buffer must be at
+ *         least @count bytes long.
+ * @unsafe_addr: Unsafe user address.
+ * @count: Maximum number of bytes to copy, including the trailing NUL.
+ *
+ * Copies a NUL-terminated string from unsafe user address to kernel buffer.
+ *
+ * On success, returns the length of the string INCLUDING the trailing NUL.
+ *
+ * If access fails, returns -EFAULT (some data may have been copied
+ * and the trailing NUL added).
+ *
+ * If @count is smaller than the length of the string, copies @count-1 bytes,
+ * sets the last byte of @dst buffer to NUL and returns @count.
+ */
+long strncpy_from_unsafe_user(char *dst, const void __user *unsafe_addr,
+			      long count)
+{
+	mm_segment_t old_fs = get_fs();
+	long ret;
+
+	if (unlikely(count <= 0))
+		return 0;
+
+	set_fs(USER_DS);
+	pagefault_disable();
+	ret = strncpy_from_user(dst, unsafe_addr, count);
+	pagefault_enable();
+	set_fs(old_fs);
+
+	if (ret >= count) {
+		ret = count;
+		dst[ret - 1] = '\0';
+	} else if (ret > 0) {
+		ret++;
+	}
+
+	return ret;
+}
+
+/**
+ * strnlen_unsafe_user: - Get the size of a user string INCLUDING final NUL.
+ * @unsafe_addr: The string to measure.
+ * @count: Maximum count (including NUL)
+ *
+ * Get the size of a NUL-terminated string in user space without pagefault.
+ *
+ * Returns the size of the string INCLUDING the terminating NUL.
+ *
+ * If the string is too long, returns a number larger than @count. User
+ * has to check the return value against "> count".
+ * On exception (or invalid count), returns 0.
+ *
+ * Unlike strnlen_user, this can be used from IRQ handler etc. because
+ * it disables pagefaults.
+ */
+long strnlen_unsafe_user(const void __user *unsafe_addr, long count)
+{
+	mm_segment_t old_fs = get_fs();
+	int ret;
+
+	set_fs(USER_DS);
+	pagefault_disable();
+	ret = strnlen_user(unsafe_addr, count);
+	pagefault_enable();
+	set_fs(old_fs);
+
+	return ret;
+}
-- 
cgit v1.2.3


From 87a90956eeab260a469a51897bfda27b28adf67d Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@kernel.org>
Date: Wed, 22 May 2019 17:27:44 +0900
Subject: uaccess: Add a prototype of non-static __probe_user_read()

Declare a prototype for the non-static __probe_user_read() in
uaccess.h, in the same way as for __probe_kernel_read().

Reported-by: kbuild test robot <lkp@intel.com>
Signed-off-by: Masami Hiramatsu <mhiramat@kernel.org>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 include/linux/uaccess.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h
index 9c435c3f2105..34a038563d97 100644
--- a/include/linux/uaccess.h
+++ b/include/linux/uaccess.h
@@ -252,6 +252,7 @@ extern long __probe_kernel_read(void *dst, const void *src, size_t size);
  * happens, handle that and return -EFAULT.
  */
 extern long probe_user_read(void *dst, const void __user *src, size_t size);
+extern long __probe_user_read(void *dst, const void __user *src, size_t size);
 
 /*
  * probe_kernel_write(): safely attempt to write to a location
-- 
cgit v1.2.3


From 08eb1fb0f77b0036568d2228f3425f2595d671bb Mon Sep 17 00:00:00 2001
From: Michal Kalderon <michal.kalderon@marvell.com>
Date: Sun, 26 May 2019 15:22:22 +0300
Subject: qed*: Change hwfn used for sb initialization

When initializing status blocks use the affined hwfn
instead of the leading one for RDMA / Storage

Signed-off-by: Ariel Elior <ariel.elior@marvell.com>
Signed-off-by: Michal Kalderon <michal.kalderon@marvell.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/infiniband/hw/qedr/main.c            |  3 +-
 drivers/net/ethernet/qlogic/qed/qed_main.c   | 47 ++++++++++++++++------------
 drivers/net/ethernet/qlogic/qede/qede_main.c |  3 +-
 include/linux/qed/qed_if.h                   | 10 +++++-
 4 files changed, 40 insertions(+), 23 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/infiniband/hw/qedr/main.c b/drivers/infiniband/hw/qedr/main.c
index 083c2c00a8e9..806b3d0e57d8 100644
--- a/drivers/infiniband/hw/qedr/main.c
+++ b/drivers/infiniband/hw/qedr/main.c
@@ -312,7 +312,8 @@ static void qedr_free_mem_sb(struct qedr_dev *dev,
 			     struct qed_sb_info *sb_info, int sb_id)
 {
 	if (sb_info->sb_virt) {
-		dev->ops->common->sb_release(dev->cdev, sb_info, sb_id);
+		dev->ops->common->sb_release(dev->cdev, sb_info, sb_id,
+					     QED_SB_TYPE_CNQ);
 		dma_free_coherent(&dev->pdev->dev, sizeof(*sb_info->sb_virt),
 				  (void *)sb_info->sb_virt, sb_info->sb_phys);
 	}
diff --git a/drivers/net/ethernet/qlogic/qed/qed_main.c b/drivers/net/ethernet/qlogic/qed/qed_main.c
index 6de23b56b294..7f19fefe0d79 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_main.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_main.c
@@ -1301,26 +1301,21 @@ static u32 qed_sb_init(struct qed_dev *cdev,
 {
 	struct qed_hwfn *p_hwfn;
 	struct qed_ptt *p_ptt;
-	int hwfn_index;
 	u16 rel_sb_id;
-	u8 n_hwfns;
 	u32 rc;
 
-	/* RoCE uses single engine and CMT uses two engines. When using both
-	 * we force only a single engine. Storage uses only engine 0 too.
-	 */
-	if (type == QED_SB_TYPE_L2_QUEUE)
-		n_hwfns = cdev->num_hwfns;
-	else
-		n_hwfns = 1;
-
-	hwfn_index = sb_id % n_hwfns;
-	p_hwfn = &cdev->hwfns[hwfn_index];
-	rel_sb_id = sb_id / n_hwfns;
+	/* RoCE/Storage use a single engine in CMT mode while L2 uses both */
+	if (type == QED_SB_TYPE_L2_QUEUE) {
+		p_hwfn = &cdev->hwfns[sb_id % cdev->num_hwfns];
+		rel_sb_id = sb_id / cdev->num_hwfns;
+	} else {
+		p_hwfn = QED_AFFIN_HWFN(cdev);
+		rel_sb_id = sb_id;
+	}
 
 	DP_VERBOSE(cdev, NETIF_MSG_INTR,
 		   "hwfn [%d] <--[init]-- SB %04x [0x%04x upper]\n",
-		   hwfn_index, rel_sb_id, sb_id);
+		   IS_LEAD_HWFN(p_hwfn) ? 0 : 1, rel_sb_id, sb_id);
 
 	if (IS_PF(p_hwfn->cdev)) {
 		p_ptt = qed_ptt_acquire(p_hwfn);
@@ -1339,20 +1334,26 @@ static u32 qed_sb_init(struct qed_dev *cdev,
 }
 
 static u32 qed_sb_release(struct qed_dev *cdev,
-			  struct qed_sb_info *sb_info, u16 sb_id)
+			  struct qed_sb_info *sb_info,
+			  u16 sb_id,
+			  enum qed_sb_type type)
 {
 	struct qed_hwfn *p_hwfn;
-	int hwfn_index;
 	u16 rel_sb_id;
 	u32 rc;
 
-	hwfn_index = sb_id % cdev->num_hwfns;
-	p_hwfn = &cdev->hwfns[hwfn_index];
-	rel_sb_id = sb_id / cdev->num_hwfns;
+	/* RoCE/Storage use a single engine in CMT mode while L2 uses both */
+	if (type == QED_SB_TYPE_L2_QUEUE) {
+		p_hwfn = &cdev->hwfns[sb_id % cdev->num_hwfns];
+		rel_sb_id = sb_id / cdev->num_hwfns;
+	} else {
+		p_hwfn = QED_AFFIN_HWFN(cdev);
+		rel_sb_id = sb_id;
+	}
 
 	DP_VERBOSE(cdev, NETIF_MSG_INTR,
 		   "hwfn [%d] <--[init]-- SB %04x [0x%04x upper]\n",
-		   hwfn_index, rel_sb_id, sb_id);
+		   IS_LEAD_HWFN(p_hwfn) ? 0 : 1, rel_sb_id, sb_id);
 
 	rc = qed_int_sb_release(p_hwfn, sb_info, rel_sb_id);
 
@@ -2372,6 +2373,11 @@ static int qed_read_module_eeprom(struct qed_dev *cdev, char *buf,
 	return rc;
 }
 
+static u8 qed_get_affin_hwfn_idx(struct qed_dev *cdev)
+{
+	return QED_AFFIN_HWFN_IDX(cdev);
+}
+
 static struct qed_selftest_ops qed_selftest_ops_pass = {
 	.selftest_memory = &qed_selftest_memory,
 	.selftest_interrupt = &qed_selftest_interrupt,
@@ -2419,6 +2425,7 @@ const struct qed_common_ops qed_common_ops_pass = {
 	.db_recovery_add = &qed_db_recovery_add,
 	.db_recovery_del = &qed_db_recovery_del,
 	.read_module_eeprom = &qed_read_module_eeprom,
+	.get_affin_hwfn_idx = &qed_get_affin_hwfn_idx,
 };
 
 void qed_get_protocol_stats(struct qed_dev *cdev,
diff --git a/drivers/net/ethernet/qlogic/qede/qede_main.c b/drivers/net/ethernet/qlogic/qede/qede_main.c
index 02a97c659e29..a9684a881f2a 100644
--- a/drivers/net/ethernet/qlogic/qede/qede_main.c
+++ b/drivers/net/ethernet/qlogic/qede/qede_main.c
@@ -1306,7 +1306,8 @@ static void qede_free_mem_sb(struct qede_dev *edev, struct qed_sb_info *sb_info,
 			     u16 sb_id)
 {
 	if (sb_info->sb_virt) {
-		edev->ops->common->sb_release(edev->cdev, sb_info, sb_id);
+		edev->ops->common->sb_release(edev->cdev, sb_info, sb_id,
+					      QED_SB_TYPE_L2_QUEUE);
 		dma_free_coherent(&edev->pdev->dev, sizeof(*sb_info->sb_virt),
 				  (void *)sb_info->sb_virt, sb_info->sb_phys);
 		memset(sb_info, 0, sizeof(*sb_info));
diff --git a/include/linux/qed/qed_if.h b/include/linux/qed/qed_if.h
index 48841e5dab90..eef02e64b422 100644
--- a/include/linux/qed/qed_if.h
+++ b/include/linux/qed/qed_if.h
@@ -907,7 +907,8 @@ struct qed_common_ops {
 
 	u32		(*sb_release)(struct qed_dev *cdev,
 				      struct qed_sb_info *sb_info,
-				      u16 sb_id);
+				      u16 sb_id,
+				      enum qed_sb_type type);
 
 	void		(*simd_handler_config)(struct qed_dev *cdev,
 					       void *token,
@@ -1123,6 +1124,13 @@ struct qed_common_ops {
  */
 	int (*read_module_eeprom)(struct qed_dev *cdev,
 				  char *buf, u8 dev_addr, u32 offset, u32 len);
+
+/**
+ * @brief get_affin_hwfn_idx
+ *
+ * @param cdev
+ */
+	u8 (*get_affin_hwfn_idx)(struct qed_dev *cdev);
 };
 
 #define MASK_FIELD(_name, _value) \
-- 
cgit v1.2.3


From 3576e99e08217f291290ac62431c7e330ac111c4 Mon Sep 17 00:00:00 2001
From: Michal Kalderon <michal.kalderon@marvell.com>
Date: Sun, 26 May 2019 15:22:27 +0300
Subject: qed*: Add iWARP 100g support

Add iWARP engine affinity setting for supporting iWARP over 100g.
iWARP cannot be distinguished by the LLH from L2, hence the
engine division will affect L2 as well. For this reason we add
a parameter to devlink to determine the engine division.

Signed-off-by: Ariel Elior <ariel.elior@marvell.com>
Signed-off-by: Michal Kalderon <michal.kalderon@marvell.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/infiniband/hw/qedr/main.c          | 13 +++++++++++++
 drivers/net/ethernet/qlogic/qed/qed_rdma.c | 31 ++++++++++++++++++++++++++++++
 include/linux/qed/qed_rdma_if.h            |  2 ++
 3 files changed, 46 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/infiniband/hw/qedr/main.c b/drivers/infiniband/hw/qedr/main.c
index 055a63144480..5ebf3c53b3fb 100644
--- a/drivers/infiniband/hw/qedr/main.c
+++ b/drivers/infiniband/hw/qedr/main.c
@@ -871,7 +871,16 @@ static struct qedr_dev *qedr_add(struct qed_dev *cdev, struct pci_dev *pdev,
 	dev->user_dpm_enabled = dev_info.user_dpm_enabled;
 	dev->rdma_type = dev_info.rdma_type;
 	dev->num_hwfns = dev_info.common.num_hwfns;
+
+	if (IS_IWARP(dev) && QEDR_IS_CMT(dev)) {
+		rc = dev->ops->iwarp_set_engine_affin(cdev, false);
+		if (rc) {
+			DP_ERR(dev, "iWARP is disabled over a 100g device Enabling it may impact L2 performance. To enable it run devlink dev param set <dev> name iwarp_cmt value true cmode runtime\n");
+			goto init_err;
+		}
+	}
 	dev->affin_hwfn_idx = dev->ops->common->get_affin_hwfn_idx(cdev);
+
 	dev->rdma_ctx = dev->ops->rdma_get_rdma_ctx(cdev);
 
 	dev->num_cnq = dev->ops->rdma_get_min_cnq_msix(cdev);
@@ -932,6 +941,10 @@ static void qedr_remove(struct qedr_dev *dev)
 	qedr_stop_hw(dev);
 	qedr_sync_free_irqs(dev);
 	qedr_free_resources(dev);
+
+	if (IS_IWARP(dev) && QEDR_IS_CMT(dev))
+		dev->ops->iwarp_set_engine_affin(dev->cdev, true);
+
 	ib_dealloc_device(&dev->ibdev);
 }
 
diff --git a/drivers/net/ethernet/qlogic/qed/qed_rdma.c b/drivers/net/ethernet/qlogic/qed/qed_rdma.c
index e4d63359864e..f900fde448db 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_rdma.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_rdma.c
@@ -1916,6 +1916,36 @@ static int qed_roce_ll2_set_mac_filter(struct qed_dev *cdev,
 	return rc;
 }
 
+static int qed_iwarp_set_engine_affin(struct qed_dev *cdev, bool b_reset)
+{
+	enum qed_eng eng;
+	u8 ppfid = 0;
+	int rc;
+
+	/* Make sure iwarp cmt mode is enabled before setting affinity */
+	if (!cdev->iwarp_cmt)
+		return -EINVAL;
+
+	if (b_reset)
+		eng = QED_BOTH_ENG;
+	else
+		eng = cdev->l2_affin_hint ? QED_ENG1 : QED_ENG0;
+
+	rc = qed_llh_set_ppfid_affinity(cdev, ppfid, eng);
+	if (rc) {
+		DP_NOTICE(cdev,
+			  "Failed to set the engine affinity of ppfid %d\n",
+			  ppfid);
+		return rc;
+	}
+
+	DP_VERBOSE(cdev, (QED_MSG_RDMA | QED_MSG_SP),
+		   "LLH: Set the engine affinity of non-RoCE packets as %d\n",
+		   eng);
+
+	return 0;
+}
+
 static const struct qed_rdma_ops qed_rdma_ops_pass = {
 	.common = &qed_common_ops_pass,
 	.fill_dev_info = &qed_fill_rdma_dev_info,
@@ -1955,6 +1985,7 @@ static const struct qed_rdma_ops qed_rdma_ops_pass = {
 	.ll2_set_fragment_of_tx_packet = &qed_ll2_set_fragment_of_tx_packet,
 	.ll2_set_mac_filter = &qed_roce_ll2_set_mac_filter,
 	.ll2_get_stats = &qed_ll2_get_stats,
+	.iwarp_set_engine_affin = &qed_iwarp_set_engine_affin,
 	.iwarp_connect = &qed_iwarp_connect,
 	.iwarp_create_listen = &qed_iwarp_create_listen,
 	.iwarp_destroy_listen = &qed_iwarp_destroy_listen,
diff --git a/include/linux/qed/qed_rdma_if.h b/include/linux/qed/qed_rdma_if.h
index d15f8e4815e3..898f595ea3d6 100644
--- a/include/linux/qed/qed_rdma_if.h
+++ b/include/linux/qed/qed_rdma_if.h
@@ -670,6 +670,8 @@ struct qed_rdma_ops {
 	int (*ll2_set_mac_filter)(struct qed_dev *cdev,
 				  u8 *old_mac_address, u8 *new_mac_address);
 
+	int (*iwarp_set_engine_affin)(struct qed_dev *cdev, bool b_reset);
+
 	int (*iwarp_connect)(void *rdma_cxt,
 			     struct qed_iwarp_connect_in *iparams,
 			     struct qed_iwarp_connect_out *oparams);
-- 
cgit v1.2.3


From 8242c59315b7b40ac97e1274d715665569992ff4 Mon Sep 17 00:00:00 2001
From: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
Date: Wed, 22 May 2019 14:47:19 -0500
Subject: soundwire: add port-related definitions

Somehow the previous header files did not include definitions for
sink/source, flow and grouping.

Signed-off-by: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 include/linux/soundwire/sdw.h | 53 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 53 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/soundwire/sdw.h b/include/linux/soundwire/sdw.h
index 35662d9c2c62..69ae680a5a21 100644
--- a/include/linux/soundwire/sdw.h
+++ b/include/linux/soundwire/sdw.h
@@ -41,6 +41,31 @@ struct sdw_slave;
 #define SDW_DAI_ID_RANGE_START		100
 #define SDW_DAI_ID_RANGE_END		200
 
+enum {
+	SDW_PORT_DIRN_SINK = 0,
+	SDW_PORT_DIRN_SOURCE,
+	SDW_PORT_DIRN_MAX,
+};
+
+/*
+ * constants for flow control, ports and transport
+ *
+ * these are bit masks as devices can have multiple capabilities
+ */
+
+/*
+ * flow modes for SDW port. These can be isochronous, tx controlled,
+ * rx controlled or async
+ */
+#define SDW_PORT_FLOW_MODE_ISOCH	0
+#define SDW_PORT_FLOW_MODE_TX_CNTRL	BIT(0)
+#define SDW_PORT_FLOW_MODE_RX_CNTRL	BIT(1)
+#define SDW_PORT_FLOW_MODE_ASYNC	GENMASK(1, 0)
+
+/* sample packaging for block. It can be per port or per channel */
+#define SDW_BLOCK_PACKG_PER_PORT	BIT(0)
+#define SDW_BLOCK_PACKG_PER_CH		BIT(1)
+
 /**
  * enum sdw_slave_status - Slave status
  * @SDW_SLAVE_UNATTACHED: Slave is not attached with the bus.
@@ -76,6 +101,14 @@ enum sdw_command_response {
 	SDW_CMD_FAIL_OTHER = 4,
 };
 
+/* block group count enum */
+enum sdw_dpn_grouping {
+	SDW_BLK_GRP_CNT_1 = 0,
+	SDW_BLK_GRP_CNT_2 = 1,
+	SDW_BLK_GRP_CNT_3 = 2,
+	SDW_BLK_GRP_CNT_4 = 3,
+};
+
 /**
  * enum sdw_stream_type: data stream type
  *
@@ -100,6 +133,26 @@ enum sdw_data_direction {
 	SDW_DATA_DIR_TX = 1,
 };
 
+/**
+ * enum sdw_port_data_mode: Data Port mode
+ *
+ * @SDW_PORT_DATA_MODE_NORMAL: Normal data mode where audio data is received
+ * and transmitted.
+ * @SDW_PORT_DATA_MODE_STATIC_1: Simple test mode which uses static value of
+ * logic 1. The encoding will result in signal transitions at every bitslot
+ * owned by this Port
+ * @SDW_PORT_DATA_MODE_STATIC_0: Simple test mode which uses static value of
+ * logic 0. The encoding will result in no signal transitions
+ * @SDW_PORT_DATA_MODE_PRBS: Test mode which uses a PRBS generator to produce
+ * a pseudo random data pattern that is transferred
+ */
+enum sdw_port_data_mode {
+	SDW_PORT_DATA_MODE_NORMAL = 0,
+	SDW_PORT_DATA_MODE_STATIC_1 = 1,
+	SDW_PORT_DATA_MODE_STATIC_0 = 2,
+	SDW_PORT_DATA_MODE_PRBS = 3,
+};
+
 /*
  * SDW properties, defined in MIPI DisCo spec v1.0
  */
-- 
cgit v1.2.3


From 53f3c097375c94e87b41227ebdff83c4f4e3af41 Mon Sep 17 00:00:00 2001
From: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
Date: Wed, 22 May 2019 14:47:20 -0500
Subject: soundwire: remove master data port properties

The SoundWire and DisCo specifications do not define Master data ports
or related properties. Data ports are only defined for Slave devices,
so remove the unused member in properties.

Credits: this patch is based on an earlier internal contribution by
Vinod Koul, Sanyog Kale, Shreyas Nc and Hardik Shah.

Signed-off-by: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 include/linux/soundwire/sdw.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/soundwire/sdw.h b/include/linux/soundwire/sdw.h
index 69ae680a5a21..831a370eaedd 100644
--- a/include/linux/soundwire/sdw.h
+++ b/include/linux/soundwire/sdw.h
@@ -377,7 +377,6 @@ struct sdw_slave_prop {
  * @dynamic_frame: Dynamic frame supported
  * @err_threshold: Number of times that software may retry sending a single
  * command
- * @dpn_prop: Data Port N properties
  */
 struct sdw_master_prop {
 	u32 revision;
@@ -393,7 +392,6 @@ struct sdw_master_prop {
 	u32 default_col;
 	bool dynamic_frame;
 	u32 err_threshold;
-	struct sdw_dpn_prop *dpn_prop;
 };
 
 int sdw_master_read_prop(struct sdw_bus *bus);
-- 
cgit v1.2.3


From 446701d1d161f04aa107903bfe7ec14e1f6cd17f Mon Sep 17 00:00:00 2001
From: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
Date: Wed, 22 May 2019 14:47:21 -0500
Subject: soundwire: mipi-disco: remove master_count property for masters

The master_count is only defined for a Controller or a Slave in the
MIPI DisCo for SoundWire document.

Signed-off-by: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 include/linux/soundwire/sdw.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/soundwire/sdw.h b/include/linux/soundwire/sdw.h
index 831a370eaedd..14376d8458c3 100644
--- a/include/linux/soundwire/sdw.h
+++ b/include/linux/soundwire/sdw.h
@@ -364,7 +364,6 @@ struct sdw_slave_prop {
 /**
  * struct sdw_master_prop - Master properties
  * @revision: MIPI spec version of the implementation
- * @master_count: Number of masters
  * @clk_stop_mode: Bitmap for Clock Stop modes supported
  * @max_freq: Maximum Bus clock frequency, in Hz
  * @num_clk_gears: Number of clock gears supported
@@ -380,7 +379,6 @@ struct sdw_slave_prop {
  */
 struct sdw_master_prop {
 	u32 revision;
-	u32 master_count;
 	enum sdw_clk_stop_mode clk_stop_mode;
 	u32 max_freq;
 	u32 num_clk_gears;
-- 
cgit v1.2.3


From 3424305b8be456a8e23c951b8c9aebad0c765ff7 Mon Sep 17 00:00:00 2001
From: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
Date: Wed, 22 May 2019 14:47:22 -0500
Subject: soundwire: rename 'freq' fields

Rename all fields with 'freq' as 'clk_freq' to follow the MIPI
specification and avoid confusion between bus clock and audio clocks.

No functionality change.

Signed-off-by: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 drivers/soundwire/bus.c        |  4 ++--
 drivers/soundwire/intel.c      | 11 ++++++-----
 drivers/soundwire/mipi_disco.c | 23 ++++++++++++-----------
 drivers/soundwire/stream.c     |  2 +-
 include/linux/soundwire/sdw.h  | 12 ++++++------
 5 files changed, 27 insertions(+), 25 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/soundwire/bus.c b/drivers/soundwire/bus.c
index aac35fc3cf22..96e42df8f458 100644
--- a/drivers/soundwire/bus.c
+++ b/drivers/soundwire/bus.c
@@ -87,7 +87,7 @@ int sdw_add_bus_master(struct sdw_bus *bus)
 
 	/*
 	 * Initialize clock values based on Master properties. The max
-	 * frequency is read from max_freq property. Current assumption
+	 * frequency is read from max_clk_freq property. Current assumption
 	 * is that the bus will start at highest clock frequency when
 	 * powered on.
 	 *
@@ -95,7 +95,7 @@ int sdw_add_bus_master(struct sdw_bus *bus)
 	 * to start with bank 0 (Table 40 of Spec)
 	 */
 	prop = &bus->prop;
-	bus->params.max_dr_freq = prop->max_freq * SDW_DOUBLE_RATE_FACTOR;
+	bus->params.max_dr_freq = prop->max_clk_freq * SDW_DOUBLE_RATE_FACTOR;
 	bus->params.curr_dr_freq = bus->params.max_dr_freq;
 	bus->params.curr_bank = SDW_BANK0;
 	bus->params.next_bank = SDW_BANK1;
diff --git a/drivers/soundwire/intel.c b/drivers/soundwire/intel.c
index 31336b0271b0..4ac141730b13 100644
--- a/drivers/soundwire/intel.c
+++ b/drivers/soundwire/intel.c
@@ -796,13 +796,14 @@ static int intel_prop_read(struct sdw_bus *bus)
 	sdw_master_read_prop(bus);
 
 	/* BIOS is not giving some values correctly. So, lets override them */
-	bus->prop.num_freq = 1;
-	bus->prop.freq = devm_kcalloc(bus->dev, bus->prop.num_freq,
-				      sizeof(*bus->prop.freq), GFP_KERNEL);
-	if (!bus->prop.freq)
+	bus->prop.num_clk_freq = 1;
+	bus->prop.clk_freq = devm_kcalloc(bus->dev, bus->prop.num_clk_freq,
+					  sizeof(*bus->prop.clk_freq),
+					  GFP_KERNEL);
+	if (!bus->prop.clk_freq)
 		return -ENOMEM;
 
-	bus->prop.freq[0] = bus->prop.max_freq;
+	bus->prop.clk_freq[0] = bus->prop.max_clk_freq;
 	bus->prop.err_threshold = 5;
 
 	return 0;
diff --git a/drivers/soundwire/mipi_disco.c b/drivers/soundwire/mipi_disco.c
index 6df68584c963..b1770af43fa8 100644
--- a/drivers/soundwire/mipi_disco.c
+++ b/drivers/soundwire/mipi_disco.c
@@ -58,31 +58,32 @@ int sdw_master_read_prop(struct sdw_bus *bus)
 
 	fwnode_property_read_u32(link,
 				 "mipi-sdw-max-clock-frequency",
-				 &prop->max_freq);
+				 &prop->max_clk_freq);
 
 	nval = fwnode_property_read_u32_array(link,
 			"mipi-sdw-clock-frequencies-supported", NULL, 0);
 	if (nval > 0) {
-		prop->num_freq = nval;
-		prop->freq = devm_kcalloc(bus->dev, prop->num_freq,
-					  sizeof(*prop->freq), GFP_KERNEL);
-		if (!prop->freq)
+		prop->num_clk_freq = nval;
+		prop->clk_freq = devm_kcalloc(bus->dev, prop->num_clk_freq,
+					      sizeof(*prop->clk_freq),
+					      GFP_KERNEL);
+		if (!prop->clk_freq)
 			return -ENOMEM;
 
 		fwnode_property_read_u32_array(link,
 				"mipi-sdw-clock-frequencies-supported",
-				prop->freq, prop->num_freq);
+				prop->clk_freq, prop->num_clk_freq);
 	}
 
 	/*
 	 * Check the frequencies supported. If FW doesn't provide max
 	 * freq, then populate here by checking values.
 	 */
-	if (!prop->max_freq && prop->freq) {
-		prop->max_freq = prop->freq[0];
-		for (i = 1; i < prop->num_freq; i++) {
-			if (prop->freq[i] > prop->max_freq)
-				prop->max_freq = prop->freq[i];
+	if (!prop->max_clk_freq && prop->clk_freq) {
+		prop->max_clk_freq = prop->clk_freq[0];
+		for (i = 1; i < prop->num_clk_freq; i++) {
+			if (prop->clk_freq[i] > prop->max_clk_freq)
+				prop->max_clk_freq = prop->clk_freq[i];
 		}
 	}
 
diff --git a/drivers/soundwire/stream.c b/drivers/soundwire/stream.c
index d01060dbee96..89edc897b8eb 100644
--- a/drivers/soundwire/stream.c
+++ b/drivers/soundwire/stream.c
@@ -1474,7 +1474,7 @@ static int _sdw_prepare_stream(struct sdw_stream_runtime *stream)
 		memcpy(&params, &bus->params, sizeof(params));
 
 		/* TODO: Support Asynchronous mode */
-		if ((prop->max_freq % stream->params.rate) != 0) {
+		if ((prop->max_clk_freq % stream->params.rate) != 0) {
 			dev_err(bus->dev, "Async mode not supported\n");
 			return -EINVAL;
 		}
diff --git a/include/linux/soundwire/sdw.h b/include/linux/soundwire/sdw.h
index 14376d8458c3..c6ded0d7a9f2 100644
--- a/include/linux/soundwire/sdw.h
+++ b/include/linux/soundwire/sdw.h
@@ -365,11 +365,11 @@ struct sdw_slave_prop {
  * struct sdw_master_prop - Master properties
  * @revision: MIPI spec version of the implementation
  * @clk_stop_mode: Bitmap for Clock Stop modes supported
- * @max_freq: Maximum Bus clock frequency, in Hz
+ * @max_clk_freq: Maximum Bus clock frequency, in Hz
  * @num_clk_gears: Number of clock gears supported
  * @clk_gears: Clock gears supported
- * @num_freq: Number of clock frequencies supported, in Hz
- * @freq: Clock frequencies supported, in Hz
+ * @num_clk_freq: Number of clock frequencies supported, in Hz
+ * @clk_freq: Clock frequencies supported, in Hz
  * @default_frame_rate: Controller default Frame rate, in Hz
  * @default_row: Number of rows
  * @default_col: Number of columns
@@ -380,11 +380,11 @@ struct sdw_slave_prop {
 struct sdw_master_prop {
 	u32 revision;
 	enum sdw_clk_stop_mode clk_stop_mode;
-	u32 max_freq;
+	u32 max_clk_freq;
 	u32 num_clk_gears;
 	u32 *clk_gears;
-	u32 num_freq;
-	u32 *freq;
+	u32 num_clk_freq;
+	u32 *clk_freq;
 	u32 default_frame_rate;
 	u32 default_row;
 	u32 default_col;
-- 
cgit v1.2.3


From 53d2e9c3773478d00d1851cc048d7ef9e60c7b6d Mon Sep 17 00:00:00 2001
From: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
Date: Wed, 22 May 2019 14:47:23 -0500
Subject: soundwire: mipi-disco: fix clock stop modes

Fix support for clock_stop_mode0 and 1. The existing code uses a
bitmask between enums, one of which is zero. Or-ing with zero is
not very useful in general... Fix by or-ing with a BIT dependent on the
enum value.

Signed-off-by: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 drivers/soundwire/mipi_disco.c | 4 ++--
 include/linux/soundwire/sdw.h  | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/soundwire/mipi_disco.c b/drivers/soundwire/mipi_disco.c
index b1770af43fa8..efb87ee0e7fc 100644
--- a/drivers/soundwire/mipi_disco.c
+++ b/drivers/soundwire/mipi_disco.c
@@ -50,11 +50,11 @@ int sdw_master_read_prop(struct sdw_bus *bus)
 
 	if (fwnode_property_read_bool(link,
 				      "mipi-sdw-clock-stop-mode0-supported"))
-		prop->clk_stop_mode = SDW_CLK_STOP_MODE0;
+		prop->clk_stop_modes |= BIT(SDW_CLK_STOP_MODE0);
 
 	if (fwnode_property_read_bool(link,
 				      "mipi-sdw-clock-stop-mode1-supported"))
-		prop->clk_stop_mode |= SDW_CLK_STOP_MODE1;
+		prop->clk_stop_modes |= BIT(SDW_CLK_STOP_MODE1);
 
 	fwnode_property_read_u32(link,
 				 "mipi-sdw-max-clock-frequency",
diff --git a/include/linux/soundwire/sdw.h b/include/linux/soundwire/sdw.h
index c6ded0d7a9f2..0e3fdd03e589 100644
--- a/include/linux/soundwire/sdw.h
+++ b/include/linux/soundwire/sdw.h
@@ -364,7 +364,7 @@ struct sdw_slave_prop {
 /**
  * struct sdw_master_prop - Master properties
  * @revision: MIPI spec version of the implementation
- * @clk_stop_mode: Bitmap for Clock Stop modes supported
+ * @clk_stop_modes: Bitmap, bit N set when clock-stop-modeN supported
  * @max_clk_freq: Maximum Bus clock frequency, in Hz
  * @num_clk_gears: Number of clock gears supported
  * @clk_gears: Clock gears supported
@@ -379,7 +379,7 @@ struct sdw_slave_prop {
  */
 struct sdw_master_prop {
 	u32 revision;
-	enum sdw_clk_stop_mode clk_stop_mode;
+	u32 clk_stop_modes;
 	u32 max_clk_freq;
 	u32 num_clk_gears;
 	u32 *clk_gears;
-- 
cgit v1.2.3


From f6686a7d026dc00cecca2d2cefb7f75bb2f84801 Mon Sep 17 00:00:00 2001
From: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
Date: Wed, 22 May 2019 14:47:24 -0500
Subject: soundwire: clarify comment

The MIPI DisCo spec refers to dynamic frame shape, not to dynamic
shape. Clarify.

Signed-off-by: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 include/linux/soundwire/sdw.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/soundwire/sdw.h b/include/linux/soundwire/sdw.h
index 0e3fdd03e589..b7efa819d425 100644
--- a/include/linux/soundwire/sdw.h
+++ b/include/linux/soundwire/sdw.h
@@ -373,7 +373,7 @@ struct sdw_slave_prop {
  * @default_frame_rate: Controller default Frame rate, in Hz
  * @default_row: Number of rows
  * @default_col: Number of columns
- * @dynamic_frame: Dynamic frame supported
+ * @dynamic_frame: Dynamic frame shape supported
  * @err_threshold: Number of times that software may retry sending a single
  * command
  */
-- 
cgit v1.2.3


From 8acbbfec280f1ee72ebeec407e39aa0d1b879b59 Mon Sep 17 00:00:00 2001
From: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
Date: Wed, 22 May 2019 14:47:25 -0500
Subject: soundwire: rename/clarify MIPI DisCo properties

The existing definitions are ambiguous and possibly misleading.

For DP0, 'flow-control' is only relevant for the BRA protocol and
should not be confused with async modes, which are explicitly not
supported for DP0. Add a prefix to follow the MIPI DisCo definition.

The use of 'device_interrupts' is also questionable. The MIPI
SoundWire spec defines Slave-, DP0- and DPN-level
implementation-defined interrupts. Using the 'device' prefix in the
last two cases is misleading: not only is the term 'device' overloaded,
but these properties are only valid at the DP0 and DPn levels. Rename
to follow the MIPI definitions, no need to be creative here.

Signed-off-by: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 drivers/soundwire/bus.c        |  2 +-
 drivers/soundwire/mipi_disco.c |  6 +++---
 drivers/soundwire/stream.c     |  6 +++---
 include/linux/soundwire/sdw.h  | 13 +++++++------
 4 files changed, 14 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/soundwire/bus.c b/drivers/soundwire/bus.c
index 96e42df8f458..fe745830a261 100644
--- a/drivers/soundwire/bus.c
+++ b/drivers/soundwire/bus.c
@@ -648,7 +648,7 @@ static int sdw_initialize_slave(struct sdw_slave *slave)
 		return 0;
 
 	/* Enable DP0 interrupts */
-	val = prop->dp0_prop->device_interrupts;
+	val = prop->dp0_prop->imp_def_interrupts;
 	val |= SDW_DP0_INT_PORT_READY | SDW_DP0_INT_BRA_FAILURE;
 
 	ret = sdw_update(slave, SDW_DP0_INTMASK, val, val);
diff --git a/drivers/soundwire/mipi_disco.c b/drivers/soundwire/mipi_disco.c
index efb87ee0e7fc..79fee1b21ab6 100644
--- a/drivers/soundwire/mipi_disco.c
+++ b/drivers/soundwire/mipi_disco.c
@@ -150,13 +150,13 @@ static int sdw_slave_read_dp0(struct sdw_slave *slave,
 				dp0->words, dp0->num_words);
 	}
 
-	dp0->flow_controlled = fwnode_property_read_bool(port,
+	dp0->BRA_flow_controlled = fwnode_property_read_bool(port,
 				"mipi-sdw-bra-flow-controlled");
 
 	dp0->simple_ch_prep_sm = fwnode_property_read_bool(port,
 				"mipi-sdw-simplified-channel-prepare-sm");
 
-	dp0->device_interrupts = fwnode_property_read_bool(port,
+	dp0->imp_def_interrupts = fwnode_property_read_bool(port,
 				"mipi-sdw-imp-def-dp0-interrupts-supported");
 
 	return 0;
@@ -225,7 +225,7 @@ static int sdw_slave_read_dpn(struct sdw_slave *slave,
 
 		fwnode_property_read_u32(node,
 				"mipi-sdw-imp-def-dpn-interrupts-supported",
-				&dpn[i].device_interrupts);
+				&dpn[i].imp_def_interrupts);
 
 		fwnode_property_read_u32(node, "mipi-sdw-min-channel-number",
 					 &dpn[i].min_ch);
diff --git a/drivers/soundwire/stream.c b/drivers/soundwire/stream.c
index 89edc897b8eb..ce9cb7fa4724 100644
--- a/drivers/soundwire/stream.c
+++ b/drivers/soundwire/stream.c
@@ -439,7 +439,7 @@ static int sdw_prep_deprep_slave_ports(struct sdw_bus *bus,
 
 	prep_ch.bank = bus->params.next_bank;
 
-	if (dpn_prop->device_interrupts || !dpn_prop->simple_ch_prep_sm)
+	if (dpn_prop->imp_def_interrupts || !dpn_prop->simple_ch_prep_sm)
 		intr = true;
 
 	/*
@@ -449,7 +449,7 @@ static int sdw_prep_deprep_slave_ports(struct sdw_bus *bus,
 	 */
 	if (prep && intr) {
 		ret = sdw_configure_dpn_intr(s_rt->slave, p_rt->num, prep,
-					     dpn_prop->device_interrupts);
+					     dpn_prop->imp_def_interrupts);
 		if (ret < 0)
 			return ret;
 	}
@@ -493,7 +493,7 @@ static int sdw_prep_deprep_slave_ports(struct sdw_bus *bus,
 	/* Disable interrupt after Port de-prepare */
 	if (!prep && intr)
 		ret = sdw_configure_dpn_intr(s_rt->slave, p_rt->num, prep,
-					     dpn_prop->device_interrupts);
+					     dpn_prop->imp_def_interrupts);
 
 	return ret;
 }
diff --git a/include/linux/soundwire/sdw.h b/include/linux/soundwire/sdw.h
index b7efa819d425..bea46bd8b6ce 100644
--- a/include/linux/soundwire/sdw.h
+++ b/include/linux/soundwire/sdw.h
@@ -206,10 +206,11 @@ enum sdw_clk_stop_mode {
  * (inclusive)
  * @num_words: number of wordlengths supported
  * @words: wordlengths supported
- * @flow_controlled: Slave implementation results in an OK_NotReady
+ * @BRA_flow_controlled: Slave implementation results in an OK_NotReady
  * response
  * @simple_ch_prep_sm: If channel prepare sequence is required
- * @device_interrupts: If implementation-defined interrupts are supported
+ * @imp_def_interrupts: If set, each bit corresponds to support for
+ * implementation-defined interrupts
  *
  * The wordlengths are specified by Spec as max, min AND number of
  * discrete values, implementation can define based on the wordlengths they
@@ -220,9 +221,9 @@ struct sdw_dp0_prop {
 	u32 min_word;
 	u32 num_words;
 	u32 *words;
-	bool flow_controlled;
+	bool BRA_flow_controlled;
 	bool simple_ch_prep_sm;
-	bool device_interrupts;
+	bool imp_def_interrupts;
 };
 
 /**
@@ -272,7 +273,7 @@ struct sdw_dpn_audio_mode {
  * @simple_ch_prep_sm: If the port supports simplified channel prepare state
  * machine
  * @ch_prep_timeout: Port-specific timeout value, in milliseconds
- * @device_interrupts: If set, each bit corresponds to support for
+ * @imp_def_interrupts: If set, each bit corresponds to support for
  * implementation-defined interrupts
  * @max_ch: Maximum channels supported
  * @min_ch: Minimum channels supported
@@ -297,7 +298,7 @@ struct sdw_dpn_prop {
 	u32 max_grouping;
 	bool simple_ch_prep_sm;
 	u32 ch_prep_timeout;
-	u32 device_interrupts;
+	u32 imp_def_interrupts;
 	u32 max_ch;
 	u32 min_ch;
 	u32 num_ch;
-- 
cgit v1.2.3


From f5151311c3f37f6edc85b2253ccf6d3e2a4c4c26 Mon Sep 17 00:00:00 2001
From: Baolin Wang <baolin.wang@linaro.org>
Date: Mon, 20 May 2019 19:32:14 +0800
Subject: dmaengine: Add matching device node validation in
 __dma_request_channel()

When a user tries to request a DMA channel with __dma_request_channel(), the
core does not validate whether it is the correct DMA device to request from.
That forces each DMA engine driver to validate the device node in its filter
function if necessary.

Thus we can add the matching device node validation to the DMA engine core,
so that the device node validation can be removed from the drivers.

Tested-by: Peter Ujfalusi <peter.ujfalusi@ti.com>
Signed-off-by: Baolin Wang <baolin.wang@linaro.org>
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 drivers/dma/dmaengine.c   | 10 ++++++++--
 drivers/dma/of-dma.c      |  4 ++--
 include/linux/dmaengine.h | 12 ++++++++----
 3 files changed, 18 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/dma/dmaengine.c b/drivers/dma/dmaengine.c
index 3a11b1092e80..610080c629bb 100644
--- a/drivers/dma/dmaengine.c
+++ b/drivers/dma/dmaengine.c
@@ -641,11 +641,13 @@ EXPORT_SYMBOL_GPL(dma_get_any_slave_channel);
  * @mask: capabilities that the channel must satisfy
  * @fn: optional callback to disposition available channels
  * @fn_param: opaque parameter to pass to dma_filter_fn
+ * @np: device node to look for DMA channels
  *
  * Returns pointer to appropriate DMA channel on success or NULL.
  */
 struct dma_chan *__dma_request_channel(const dma_cap_mask_t *mask,
-				       dma_filter_fn fn, void *fn_param)
+				       dma_filter_fn fn, void *fn_param,
+				       struct device_node *np)
 {
 	struct dma_device *device, *_d;
 	struct dma_chan *chan = NULL;
@@ -653,6 +655,10 @@ struct dma_chan *__dma_request_channel(const dma_cap_mask_t *mask,
 	/* Find a channel */
 	mutex_lock(&dma_list_mutex);
 	list_for_each_entry_safe(device, _d, &dma_device_list, global_node) {
+		/* Finds a DMA controller with matching device node */
+		if (np && device->dev->of_node && np != device->dev->of_node)
+			continue;
+
 		chan = find_candidate(device, mask, fn, fn_param);
 		if (!IS_ERR(chan))
 			break;
@@ -769,7 +775,7 @@ struct dma_chan *dma_request_chan_by_mask(const dma_cap_mask_t *mask)
 	if (!mask)
 		return ERR_PTR(-ENODEV);
 
-	chan = __dma_request_channel(mask, NULL, NULL);
+	chan = __dma_request_channel(mask, NULL, NULL, NULL);
 	if (!chan) {
 		mutex_lock(&dma_list_mutex);
 		if (list_empty(&dma_device_list))
diff --git a/drivers/dma/of-dma.c b/drivers/dma/of-dma.c
index 91fd395c90c4..6b43d04da05d 100644
--- a/drivers/dma/of-dma.c
+++ b/drivers/dma/of-dma.c
@@ -316,8 +316,8 @@ struct dma_chan *of_dma_simple_xlate(struct of_phandle_args *dma_spec,
 	if (count != 1)
 		return NULL;
 
-	return dma_request_channel(info->dma_cap, info->filter_fn,
-			&dma_spec->args[0]);
+	return __dma_request_channel(&info->dma_cap, info->filter_fn,
+				     &dma_spec->args[0], dma_spec->np);
 }
 EXPORT_SYMBOL_GPL(of_dma_simple_xlate);
 
diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h
index d49ec5c31944..504085b2bf21 100644
--- a/include/linux/dmaengine.h
+++ b/include/linux/dmaengine.h
@@ -1314,7 +1314,8 @@ enum dma_status dma_sync_wait(struct dma_chan *chan, dma_cookie_t cookie);
 enum dma_status dma_wait_for_async_tx(struct dma_async_tx_descriptor *tx);
 void dma_issue_pending_all(void);
 struct dma_chan *__dma_request_channel(const dma_cap_mask_t *mask,
-					dma_filter_fn fn, void *fn_param);
+				       dma_filter_fn fn, void *fn_param,
+				       struct device_node *np);
 struct dma_chan *dma_request_slave_channel(struct device *dev, const char *name);
 
 struct dma_chan *dma_request_chan(struct device *dev, const char *name);
@@ -1339,7 +1340,9 @@ static inline void dma_issue_pending_all(void)
 {
 }
 static inline struct dma_chan *__dma_request_channel(const dma_cap_mask_t *mask,
-					      dma_filter_fn fn, void *fn_param)
+						     dma_filter_fn fn,
+						     void *fn_param,
+						     struct device_node *np)
 {
 	return NULL;
 }
@@ -1411,7 +1414,8 @@ void dma_async_device_unregister(struct dma_device *device);
 void dma_run_dependencies(struct dma_async_tx_descriptor *tx);
 struct dma_chan *dma_get_slave_channel(struct dma_chan *chan);
 struct dma_chan *dma_get_any_slave_channel(struct dma_device *device);
-#define dma_request_channel(mask, x, y) __dma_request_channel(&(mask), x, y)
+#define dma_request_channel(mask, x, y) \
+	__dma_request_channel(&(mask), x, y, NULL)
 #define dma_request_slave_channel_compat(mask, x, y, dev, name) \
 	__dma_request_slave_channel_compat(&(mask), x, y, dev, name)
 
@@ -1429,6 +1433,6 @@ static inline struct dma_chan
 	if (!fn || !fn_param)
 		return NULL;
 
-	return __dma_request_channel(mask, fn, fn_param);
+	return __dma_request_channel(mask, fn, fn_param, NULL);
 }
 #endif /* DMAENGINE_H */
-- 
cgit v1.2.3


From 990c0b53bf6599a9c9c7df1529dde681dee6cf64 Mon Sep 17 00:00:00 2001
From: Baolin Wang <baolin.wang@linaro.org>
Date: Mon, 20 May 2019 19:32:16 +0800
Subject: dmaengine: imx-sdma: Let the core do the device node validation

Let the DMA engine core do the device node validation instead of drivers.

Signed-off-by: Baolin Wang <baolin.wang@linaro.org>
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 drivers/dma/imx-sdma.c                | 9 ++-------
 include/linux/platform_data/dma-imx.h | 1 -
 2 files changed, 2 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/dma/imx-sdma.c b/drivers/dma/imx-sdma.c
index 99d9f431ae2c..ca296f0849ef 100644
--- a/drivers/dma/imx-sdma.c
+++ b/drivers/dma/imx-sdma.c
@@ -1934,16 +1934,11 @@ disable_clk_ipg:
 static bool sdma_filter_fn(struct dma_chan *chan, void *fn_param)
 {
 	struct sdma_channel *sdmac = to_sdma_chan(chan);
-	struct sdma_engine *sdma = sdmac->sdma;
 	struct imx_dma_data *data = fn_param;
 
 	if (!imx_dma_is_general_purpose(chan))
 		return false;
 
-	/* return false if it's not the right device */
-	if (sdma->dev->of_node != data->of_node)
-		return false;
-
 	sdmac->data = *data;
 	chan->private = &sdmac->data;
 
@@ -1971,9 +1966,9 @@ static struct dma_chan *sdma_xlate(struct of_phandle_args *dma_spec,
 	 * be set to sdmac->event_id1.
 	 */
 	data.dma_request2 = 0;
-	data.of_node = ofdma->of_node;
 
-	return dma_request_channel(mask, sdma_filter_fn, &data);
+	return __dma_request_channel(&mask, sdma_filter_fn, &data,
+				     ofdma->of_node);
 }
 
 static int sdma_probe(struct platform_device *pdev)
diff --git a/include/linux/platform_data/dma-imx.h b/include/linux/platform_data/dma-imx.h
index 9daea8d42a10..7d964e787299 100644
--- a/include/linux/platform_data/dma-imx.h
+++ b/include/linux/platform_data/dma-imx.h
@@ -55,7 +55,6 @@ struct imx_dma_data {
 	int dma_request2; /* secondary DMA request line */
 	enum sdma_peripheral_type peripheral_type;
 	int priority;
-	struct device_node *of_node;
 };
 
 static inline int imx_dma_is_ipu(struct dma_chan *chan)
-- 
cgit v1.2.3


From d27ac2e02bf256d4e824e7c1e1e1afa2b96cefcc Mon Sep 17 00:00:00 2001
From: Alexandru Ardelean <alexandru.ardelean@analog.com>
Date: Mon, 27 May 2019 09:55:16 +0300
Subject: include: fpga: adi-axi-common.h: add common regs & defs header

The AXI HDL cores provided for Analog Devices reference designs all share
some common base registers (e.g. version register at address 0x00).

To reduce duplication for this, a common header is added to define these
registers as well as bitfields & macros to work with these registers.

Signed-off-by: Alexandru Ardelean <alexandru.ardelean@analog.com>
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 include/linux/fpga/adi-axi-common.h | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)
 create mode 100644 include/linux/fpga/adi-axi-common.h

(limited to 'include/linux')

diff --git a/include/linux/fpga/adi-axi-common.h b/include/linux/fpga/adi-axi-common.h
new file mode 100644
index 000000000000..7fc95d5c95bb
--- /dev/null
+++ b/include/linux/fpga/adi-axi-common.h
@@ -0,0 +1,19 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Analog Devices AXI common registers & definitions
+ *
+ * Copyright 2019 Analog Devices Inc.
+ *
+ * https://wiki.analog.com/resources/fpga/docs/axi_ip
+ * https://wiki.analog.com/resources/fpga/docs/hdl/regmap
+ */
+
+#ifndef ADI_AXI_COMMON_H_
+#define ADI_AXI_COMMON_H_
+
+#define	ADI_AXI_REG_VERSION			0x0000
+
+#define ADI_AXI_PCORE_VER(major, minor, patch)	\
+	(((major) << 16) | ((minor) << 8) | (patch))
+
+#endif /* ADI_AXI_COMMON_H_ */
-- 
cgit v1.2.3


From cdd3a2499d30695730b22dc025c00b9b28884c6b Mon Sep 17 00:00:00 2001
From: Sai Praneeth Prakhya <sai.praneeth.prakhya@intel.com>
Date: Fri, 24 May 2019 16:40:16 -0700
Subject: iommu/vt-d: Introduce macros useful for dumping DMAR table

A scalable mode DMAR table walk would involve looking at bits in each stage
of walk, like,
1. Is PASID enabled in the context entry?
2. What's the size of PASID directory?
3. Is the PASID directory entry present?
4. Is the PASID table entry present?
5. Number of PASID table entries?

Hence, add these macros that will later be used during this walk.
Apart from adding new macros, move existing macros (like
pasid_pde_is_present(), get_pasid_table_from_pde() and pasid_supported())
to appropriate header files so that they could be reused.

Cc: Joerg Roedel <joro@8bytes.org>
Cc: Ashok Raj <ashok.raj@intel.com>
Cc: Lu Baolu <baolu.lu@linux.intel.com>
Cc: Sohil Mehta <sohil.mehta@intel.com>
Cc: David Woodhouse <dwmw2@infradead.org>
Cc: Jacob Pan <jacob.jun.pan@linux.intel.com>
Cc: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Reviewed-by: Lu Baolu <baolu.lu@linux.intel.com>
Reviewed-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Signed-off-by: Sai Praneeth Prakhya <sai.praneeth.prakhya@intel.com>
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 drivers/iommu/intel-iommu.c |  6 +-----
 drivers/iommu/intel-pasid.c | 17 -----------------
 drivers/iommu/intel-pasid.h | 26 ++++++++++++++++++++++++++
 include/linux/intel-iommu.h |  6 ++++++
 4 files changed, 33 insertions(+), 22 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c
index 1b7ad80c0537..9722c2ffe428 100644
--- a/drivers/iommu/intel-iommu.c
+++ b/drivers/iommu/intel-iommu.c
@@ -357,6 +357,7 @@ int dmar_disabled = 0;
 int dmar_disabled = 1;
 #endif /*CONFIG_INTEL_IOMMU_DEFAULT_ON*/
 
+int intel_iommu_sm;
 int intel_iommu_enabled = 0;
 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
 
@@ -364,17 +365,12 @@ static int dmar_map_gfx = 1;
 static int dmar_forcedac;
 static int intel_iommu_strict;
 static int intel_iommu_superpage = 1;
-static int intel_iommu_sm;
 static int iommu_identity_mapping;
 
 #define IDENTMAP_ALL		1
 #define IDENTMAP_GFX		2
 #define IDENTMAP_AZALIA		4
 
-#define sm_supported(iommu)	(intel_iommu_sm && ecap_smts((iommu)->ecap))
-#define pasid_supported(iommu)	(sm_supported(iommu) &&			\
-				 ecap_pasid((iommu)->ecap))
-
 int intel_iommu_gfx_mapped;
 EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
 
diff --git a/drivers/iommu/intel-pasid.c b/drivers/iommu/intel-pasid.c
index 2fefeafda437..6895a23b2157 100644
--- a/drivers/iommu/intel-pasid.c
+++ b/drivers/iommu/intel-pasid.c
@@ -169,23 +169,6 @@ attach_out:
 	return 0;
 }
 
-/* Get PRESENT bit of a PASID directory entry. */
-static inline bool
-pasid_pde_is_present(struct pasid_dir_entry *pde)
-{
-	return READ_ONCE(pde->val) & PASID_PTE_PRESENT;
-}
-
-/* Get PASID table from a PASID directory entry. */
-static inline struct pasid_entry *
-get_pasid_table_from_pde(struct pasid_dir_entry *pde)
-{
-	if (!pasid_pde_is_present(pde))
-		return NULL;
-
-	return phys_to_virt(READ_ONCE(pde->val) & PDE_PFN_MASK);
-}
-
 void intel_pasid_free_table(struct device *dev)
 {
 	struct device_domain_info *info;
diff --git a/drivers/iommu/intel-pasid.h b/drivers/iommu/intel-pasid.h
index 23537b3f34e3..fc8cd8f17de1 100644
--- a/drivers/iommu/intel-pasid.h
+++ b/drivers/iommu/intel-pasid.h
@@ -18,6 +18,10 @@
 #define PDE_PFN_MASK			PAGE_MASK
 #define PASID_PDE_SHIFT			6
 #define MAX_NR_PASID_BITS		20
+#define PASID_TBL_ENTRIES		BIT(PASID_PDE_SHIFT)
+
+#define is_pasid_enabled(entry)		(((entry)->lo >> 3) & 0x1)
+#define get_pasid_dir_size(entry)	(1 << ((((entry)->lo >> 9) & 0x7) + 7))
 
 /*
  * Domain ID reserved for pasid entries programmed for first-level
@@ -49,6 +53,28 @@ struct pasid_table {
 	struct list_head	dev;		/* device list */
 };
 
+/* Get PRESENT bit of a PASID directory entry. */
+static inline bool pasid_pde_is_present(struct pasid_dir_entry *pde)
+{
+	return READ_ONCE(pde->val) & PASID_PTE_PRESENT;
+}
+
+/* Get PASID table from a PASID directory entry. */
+static inline struct pasid_entry *
+get_pasid_table_from_pde(struct pasid_dir_entry *pde)
+{
+	if (!pasid_pde_is_present(pde))
+		return NULL;
+
+	return phys_to_virt(READ_ONCE(pde->val) & PDE_PFN_MASK);
+}
+
+/* Get PRESENT bit of a PASID table entry. */
+static inline bool pasid_pte_is_present(struct pasid_entry *pte)
+{
+	return READ_ONCE(pte->val[0]) & PASID_PTE_PRESENT;
+}
+
 extern u32 intel_pasid_max_id;
 int intel_pasid_alloc_id(void *ptr, int start, int end, gfp_t gfp);
 void intel_pasid_free_id(int pasid);
diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h
index 6925a18a5ca3..4140726867a9 100644
--- a/include/linux/intel-iommu.h
+++ b/include/linux/intel-iommu.h
@@ -447,6 +447,12 @@ enum {
 #define VTD_FLAG_TRANS_PRE_ENABLED	(1 << 0)
 #define VTD_FLAG_IRQ_REMAP_PRE_ENABLED	(1 << 1)
 
+extern int intel_iommu_sm;
+
+#define sm_supported(iommu)	(intel_iommu_sm && ecap_smts((iommu)->ecap))
+#define pasid_supported(iommu)	(sm_supported(iommu) &&			\
+				 ecap_pasid((iommu)->ecap))
+
 struct pasid_entry;
 struct pasid_state_entry;
 struct page_req_dsc;
-- 
cgit v1.2.3


From cb44c9a0ab21a9ae4dfcabac1ed8e38aa872d1af Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Tue, 21 May 2019 10:03:48 -0500
Subject: signal: Remove task parameter from force_sigsegv

The function force_sigsegv is always called on the current task,
so passing in current is redundant. Dropping the parameter makes
this fact obvious.

This also makes it clear force_sigsegv always calls force_sig
on the current task.

Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 arch/arc/kernel/process.c      | 2 +-
 arch/ia64/kernel/signal.c      | 6 +++---
 arch/nios2/kernel/signal.c     | 2 +-
 arch/sparc/kernel/signal32.c   | 4 ++--
 arch/sparc/kernel/signal_64.c  | 2 +-
 arch/um/kernel/skas/mmu.c      | 2 +-
 arch/um/kernel/trap.c          | 2 +-
 arch/unicore32/kernel/signal.c | 2 +-
 fs/exec.c                      | 2 +-
 include/linux/sched/signal.h   | 2 +-
 kernel/rseq.c                  | 2 +-
 kernel/signal.c                | 6 ++++--
 12 files changed, 18 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arc/kernel/process.c b/arch/arc/kernel/process.c
index 641c364fc232..725e556678a4 100644
--- a/arch/arc/kernel/process.c
+++ b/arch/arc/kernel/process.c
@@ -313,7 +313,7 @@ int elf_check_arch(const struct elf32_hdr *x)
 	eflags = x->e_flags;
 	if ((eflags & EF_ARC_OSABI_MSK) != EF_ARC_OSABI_CURRENT) {
 		pr_err("ABI mismatch - you need newer toolchain\n");
-		force_sigsegv(SIGSEGV, current);
+		force_sigsegv(SIGSEGV);
 		return 0;
 	}
 
diff --git a/arch/ia64/kernel/signal.c b/arch/ia64/kernel/signal.c
index 6062fd14e34e..518cceb5d4af 100644
--- a/arch/ia64/kernel/signal.c
+++ b/arch/ia64/kernel/signal.c
@@ -257,7 +257,7 @@ setup_frame(struct ksignal *ksig, sigset_t *set, struct sigscratch *scr)
 			 */
 			check_sp = (new_sp - sizeof(*frame)) & -STACK_ALIGN;
 			if (!likely(on_sig_stack(check_sp))) {
-				force_sigsegv(ksig->sig, current);
+				force_sigsegv(ksig->sig);
 				return 1;
 			}
 		}
@@ -265,7 +265,7 @@ setup_frame(struct ksignal *ksig, sigset_t *set, struct sigscratch *scr)
 	frame = (void __user *) ((new_sp - sizeof(*frame)) & -STACK_ALIGN);
 
 	if (!access_ok(frame, sizeof(*frame))) {
-		force_sigsegv(ksig->sig, current);
+		force_sigsegv(ksig->sig);
 		return 1;
 	}
 
@@ -282,7 +282,7 @@ setup_frame(struct ksignal *ksig, sigset_t *set, struct sigscratch *scr)
 	err |= setup_sigcontext(&frame->sc, set, scr);
 
 	if (unlikely(err)) {
-		force_sigsegv(ksig->sig, current);
+		force_sigsegv(ksig->sig);
 		return 1;
 	}
 
diff --git a/arch/nios2/kernel/signal.c b/arch/nios2/kernel/signal.c
index 4a81876b6086..9bf38531b189 100644
--- a/arch/nios2/kernel/signal.c
+++ b/arch/nios2/kernel/signal.c
@@ -211,7 +211,7 @@ static int setup_rt_frame(struct ksignal *ksig, sigset_t *set,
 	return 0;
 
 give_sigsegv:
-	force_sigsegv(ksig->sig, current);
+	force_sigsegv(ksig->sig);
 	return -EFAULT;
 }
 
diff --git a/arch/sparc/kernel/signal32.c b/arch/sparc/kernel/signal32.c
index e800ce13cc6e..fb431d47a532 100644
--- a/arch/sparc/kernel/signal32.c
+++ b/arch/sparc/kernel/signal32.c
@@ -375,7 +375,7 @@ static int setup_frame32(struct ksignal *ksig, struct pt_regs *regs,
 			pr_info("%s[%d] bad frame in setup_frame32: %08lx TPC %08lx O7 %08lx\n",
 				current->comm, current->pid, (unsigned long)sf,
 				regs->tpc, regs->u_regs[UREG_I7]);
-		force_sigsegv(ksig->sig, current);
+		force_sigsegv(ksig->sig);
 		return -EINVAL;
 	}
 
@@ -509,7 +509,7 @@ static int setup_rt_frame32(struct ksignal *ksig, struct pt_regs *regs,
 			pr_info("%s[%d] bad frame in setup_rt_frame32: %08lx TPC %08lx O7 %08lx\n",
 				current->comm, current->pid, (unsigned long)sf,
 				regs->tpc, regs->u_regs[UREG_I7]);
-		force_sigsegv(ksig->sig, current);
+		force_sigsegv(ksig->sig);
 		return -EINVAL;
 	}
 
diff --git a/arch/sparc/kernel/signal_64.c b/arch/sparc/kernel/signal_64.c
index ca70787efd8e..9d50190cf312 100644
--- a/arch/sparc/kernel/signal_64.c
+++ b/arch/sparc/kernel/signal_64.c
@@ -374,7 +374,7 @@ setup_rt_frame(struct ksignal *ksig, struct pt_regs *regs)
 			pr_info("%s[%d] bad frame in setup_rt_frame: %016lx TPC %016lx O7 %016lx\n",
 				current->comm, current->pid, (unsigned long)sf,
 				regs->tpc, regs->u_regs[UREG_I7]);
-		force_sigsegv(ksig->sig, current);
+		force_sigsegv(ksig->sig);
 		return -EINVAL;
 	}
 
diff --git a/arch/um/kernel/skas/mmu.c b/arch/um/kernel/skas/mmu.c
index 7a1f2a936fd1..29e7f5f9f188 100644
--- a/arch/um/kernel/skas/mmu.c
+++ b/arch/um/kernel/skas/mmu.c
@@ -119,7 +119,7 @@ void uml_setup_stubs(struct mm_struct *mm)
 	return;
 
 out:
-	force_sigsegv(SIGSEGV, current);
+	force_sigsegv(SIGSEGV);
 }
 
 void arch_exit_mmap(struct mm_struct *mm)
diff --git a/arch/um/kernel/trap.c b/arch/um/kernel/trap.c
index 0e8b6158f224..646059402ab3 100644
--- a/arch/um/kernel/trap.c
+++ b/arch/um/kernel/trap.c
@@ -169,7 +169,7 @@ static void bad_segv(struct faultinfo fi, unsigned long ip)
 
 void fatal_sigsegv(void)
 {
-	force_sigsegv(SIGSEGV, current);
+	force_sigsegv(SIGSEGV);
 	do_signal(&current->thread.regs);
 	/*
 	 * This is to tell gcc that we're not returning - do_signal
diff --git a/arch/unicore32/kernel/signal.c b/arch/unicore32/kernel/signal.c
index 63be04809d40..75f27dc68bd0 100644
--- a/arch/unicore32/kernel/signal.c
+++ b/arch/unicore32/kernel/signal.c
@@ -386,7 +386,7 @@ static void do_signal(struct pt_regs *regs, int syscall)
 					regs->UCreg_pc = KERN_RESTART_CODE;
 				} else {
 					regs->UCreg_sp += 4;
-					force_sigsegv(0, current);
+					force_sigsegv(0);
 				}
 		}
 		if (regs->UCreg_00 == -ERESTARTNOHAND ||
diff --git a/fs/exec.c b/fs/exec.c
index d88584ebf07f..f5568e45d521 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1662,7 +1662,7 @@ int search_binary_handler(struct linux_binprm *bprm)
 		if (retval < 0 && !bprm->mm) {
 			/* we got to flush_old_exec() and failed after it */
 			read_unlock(&binfmt_lock);
-			force_sigsegv(SIGSEGV, current);
+			force_sigsegv(SIGSEGV);
 			return retval;
 		}
 		if (retval != -ENOEXEC || !bprm->file) {
diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h
index c68ca81db0a1..8af3101da782 100644
--- a/include/linux/sched/signal.h
+++ b/include/linux/sched/signal.h
@@ -325,7 +325,7 @@ int force_sig_pkuerr(void __user *addr, u32 pkey);
 int force_sig_ptrace_errno_trap(int errno, void __user *addr);
 
 extern int send_sig_info(int, struct kernel_siginfo *, struct task_struct *);
-extern void force_sigsegv(int sig, struct task_struct *p);
+extern void force_sigsegv(int sig);
 extern int force_sig_info(int, struct kernel_siginfo *, struct task_struct *);
 extern int __kill_pgrp_info(int sig, struct kernel_siginfo *info, struct pid *pgrp);
 extern int kill_pid_info(int sig, struct kernel_siginfo *info, struct pid *pid);
diff --git a/kernel/rseq.c b/kernel/rseq.c
index 9424ee90589e..e1aa3ebee291 100644
--- a/kernel/rseq.c
+++ b/kernel/rseq.c
@@ -277,7 +277,7 @@ void __rseq_handle_notify_resume(struct ksignal *ksig, struct pt_regs *regs)
 
 error:
 	sig = ksig ? ksig->sig : 0;
-	force_sigsegv(sig, t);
+	force_sigsegv(sig);
 }
 
 #ifdef CONFIG_DEBUG_RSEQ
diff --git a/kernel/signal.c b/kernel/signal.c
index 39a3eca5ce22..f7669d240ce4 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1607,8 +1607,10 @@ EXPORT_SYMBOL(force_sig);
  * the problem was already a SIGSEGV, we'll want to
  * make sure we don't even try to deliver the signal..
  */
-void force_sigsegv(int sig, struct task_struct *p)
+void force_sigsegv(int sig)
 {
+	struct task_struct *p = current;
+
 	if (sig == SIGSEGV) {
 		unsigned long flags;
 		spin_lock_irqsave(&p->sighand->siglock, flags);
@@ -2717,7 +2719,7 @@ static void signal_delivered(struct ksignal *ksig, int stepping)
 void signal_setup_done(int failed, struct ksignal *ksig, int stepping)
 {
 	if (failed)
-		force_sigsegv(ksig->sig, current);
+		force_sigsegv(ksig->sig);
 	else
 		signal_delivered(ksig, stepping);
 }
-- 
cgit v1.2.3


From 3cf5d076fb4d48979f382bc9452765bf8b79e740 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Thu, 23 May 2019 10:17:27 -0500
Subject: signal: Remove task parameter from force_sig

All of the remaining callers pass current into force_sig so
remove the task parameter to make this obvious and to make
misuse more difficult in the future.

This also makes it clear force_sig passes current into force_sig_info.

Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 arch/alpha/kernel/signal.c              |  4 ++--
 arch/arc/kernel/process.c               |  2 +-
 arch/arc/kernel/signal.c                |  2 +-
 arch/arm/kernel/signal.c                |  4 ++--
 arch/arm64/kernel/traps.c               |  2 +-
 arch/c6x/kernel/signal.c                |  2 +-
 arch/csky/kernel/signal.c               |  4 +---
 arch/h8300/kernel/ptrace_h.c            |  4 ++--
 arch/h8300/kernel/ptrace_s.c            |  2 +-
 arch/h8300/kernel/signal.c              |  2 +-
 arch/hexagon/kernel/signal.c            |  2 +-
 arch/hexagon/kernel/traps.c             | 10 ++++-----
 arch/ia64/kernel/signal.c               |  2 +-
 arch/ia64/kernel/traps.c                |  6 +++---
 arch/m68k/kernel/signal.c               |  4 ++--
 arch/m68k/kernel/traps.c                | 16 +++++++--------
 arch/microblaze/kernel/signal.c         |  2 +-
 arch/mips/kernel/branch.c               | 18 ++++++++---------
 arch/mips/kernel/kprobes.c              |  2 +-
 arch/mips/kernel/signal.c               |  8 ++++----
 arch/mips/kernel/signal_n32.c           |  4 ++--
 arch/mips/kernel/signal_o32.c           |  8 ++++----
 arch/mips/kernel/traps.c                | 36 ++++++++++++++++-----------------
 arch/mips/kernel/unaligned.c            | 20 +++++++++---------
 arch/mips/sgi-ip22/ip22-berr.c          |  2 +-
 arch/mips/sgi-ip22/ip28-berr.c          |  2 +-
 arch/mips/sgi-ip27/ip27-berr.c          |  2 +-
 arch/mips/sgi-ip32/ip32-berr.c          |  2 +-
 arch/nds32/kernel/signal.c              |  2 +-
 arch/nds32/kernel/traps.c               |  6 +++---
 arch/nios2/kernel/signal.c              |  2 +-
 arch/openrisc/kernel/signal.c           |  2 +-
 arch/openrisc/kernel/traps.c            |  4 ++--
 arch/parisc/kernel/signal.c             |  2 +-
 arch/powerpc/kernel/signal_32.c         |  6 +++---
 arch/powerpc/kernel/signal_64.c         |  2 +-
 arch/powerpc/platforms/cell/spufs/run.c |  2 +-
 arch/riscv/kernel/signal.c              |  2 +-
 arch/s390/kernel/compat_signal.c        |  4 ++--
 arch/s390/kernel/signal.c               |  4 ++--
 arch/sh/kernel/cpu/sh2a/fpu.c           |  2 +-
 arch/sh/kernel/cpu/sh4/fpu.c            |  2 +-
 arch/sh/kernel/cpu/sh5/fpu.c            |  4 +---
 arch/sh/kernel/ptrace_64.c              |  4 ++--
 arch/sh/kernel/signal_32.c              |  4 ++--
 arch/sh/kernel/signal_64.c              |  4 ++--
 arch/sh/kernel/traps.c                  |  4 ++--
 arch/sh/kernel/traps_32.c               |  8 +++-----
 arch/sh/kernel/traps_64.c               |  2 +-
 arch/sparc/kernel/process_64.c          |  2 +-
 arch/sparc/kernel/signal32.c            |  4 ++--
 arch/sparc/kernel/signal_32.c           |  4 ++--
 arch/sparc/kernel/signal_64.c           |  6 +++---
 arch/sparc/kernel/traps_64.c            |  2 +-
 arch/sparc/mm/fault_32.c                |  2 +-
 arch/um/kernel/exec.c                   |  2 +-
 arch/um/kernel/tlb.c                    |  4 ++--
 arch/um/kernel/trap.c                   |  2 +-
 arch/unicore32/kernel/signal.c          |  2 +-
 arch/x86/entry/vsyscall/vsyscall_64.c   |  2 +-
 arch/x86/kernel/cpu/mce/core.c          |  2 +-
 arch/x86/kernel/signal.c                |  2 +-
 arch/x86/kernel/traps.c                 |  4 ++--
 arch/x86/kernel/uprobes.c               |  2 +-
 arch/x86/kernel/vm86_32.c               |  2 +-
 arch/x86/mm/mpx.c                       |  2 +-
 arch/x86/um/signal.c                    |  4 ++--
 arch/xtensa/kernel/signal.c             |  2 +-
 arch/xtensa/kernel/traps.c              |  6 +++---
 drivers/misc/lkdtm/bugs.c               |  2 +-
 include/linux/sched/signal.h            |  2 +-
 include/linux/syscalls.h                |  2 +-
 kernel/events/uprobes.c                 |  4 ++--
 kernel/rseq.c                           |  2 +-
 kernel/signal.c                         |  6 +++---
 security/safesetid/lsm.c                |  4 ++--
 76 files changed, 160 insertions(+), 166 deletions(-)

(limited to 'include/linux')

diff --git a/arch/alpha/kernel/signal.c b/arch/alpha/kernel/signal.c
index 33e904a05881..a813020d2f11 100644
--- a/arch/alpha/kernel/signal.c
+++ b/arch/alpha/kernel/signal.c
@@ -225,7 +225,7 @@ do_sigreturn(struct sigcontext __user *sc)
 	return;
 
 give_sigsegv:
-	force_sig(SIGSEGV, current);
+	force_sig(SIGSEGV);
 }
 
 asmlinkage void
@@ -253,7 +253,7 @@ do_rt_sigreturn(struct rt_sigframe __user *frame)
 	return;
 
 give_sigsegv:
-	force_sig(SIGSEGV, current);
+	force_sig(SIGSEGV);
 }
 
 
diff --git a/arch/arc/kernel/process.c b/arch/arc/kernel/process.c
index 725e556678a4..deee16d5c03f 100644
--- a/arch/arc/kernel/process.c
+++ b/arch/arc/kernel/process.c
@@ -100,7 +100,7 @@ fault:
 		 goto again;
 
 fail:
-	force_sig(SIGSEGV, current);
+	force_sig(SIGSEGV);
 	return ret;
 }
 
diff --git a/arch/arc/kernel/signal.c b/arch/arc/kernel/signal.c
index 1bfb7de696bd..547c8f0cdc3a 100644
--- a/arch/arc/kernel/signal.c
+++ b/arch/arc/kernel/signal.c
@@ -197,7 +197,7 @@ SYSCALL_DEFINE0(rt_sigreturn)
 	return regs->r0;
 
 badframe:
-	force_sig(SIGSEGV, current);
+	force_sig(SIGSEGV);
 	return 0;
 }
 
diff --git a/arch/arm/kernel/signal.c b/arch/arm/kernel/signal.c
index be5edfdde558..3870e0588d53 100644
--- a/arch/arm/kernel/signal.c
+++ b/arch/arm/kernel/signal.c
@@ -250,7 +250,7 @@ asmlinkage int sys_sigreturn(struct pt_regs *regs)
 	return regs->ARM_r0;
 
 badframe:
-	force_sig(SIGSEGV, current);
+	force_sig(SIGSEGV);
 	return 0;
 }
 
@@ -283,7 +283,7 @@ asmlinkage int sys_rt_sigreturn(struct pt_regs *regs)
 	return regs->ARM_r0;
 
 badframe:
-	force_sig(SIGSEGV, current);
+	force_sig(SIGSEGV);
 	return 0;
 }
 
diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c
index e45d5b440fb1..64abe8450780 100644
--- a/arch/arm64/kernel/traps.c
+++ b/arch/arm64/kernel/traps.c
@@ -257,7 +257,7 @@ void arm64_force_sig_fault(int signo, int code, void __user *addr,
 {
 	arm64_show_signal(signo, str);
 	if (signo == SIGKILL)
-		force_sig(SIGKILL, current);
+		force_sig(SIGKILL);
 	else
 		force_sig_fault(signo, code, addr, current);
 }
diff --git a/arch/c6x/kernel/signal.c b/arch/c6x/kernel/signal.c
index 33b9f69c38f7..775de34b233a 100644
--- a/arch/c6x/kernel/signal.c
+++ b/arch/c6x/kernel/signal.c
@@ -93,7 +93,7 @@ asmlinkage int do_rt_sigreturn(struct pt_regs *regs)
 	return regs->a4;
 
 badframe:
-	force_sig(SIGSEGV, current);
+	force_sig(SIGSEGV);
 	return 0;
 }
 
diff --git a/arch/csky/kernel/signal.c b/arch/csky/kernel/signal.c
index 04a43cfd4e09..7c09adeb58bb 100644
--- a/arch/csky/kernel/signal.c
+++ b/arch/csky/kernel/signal.c
@@ -61,7 +61,6 @@ SYSCALL_DEFINE0(rt_sigreturn)
 {
 	struct pt_regs *regs = current_pt_regs();
 	struct rt_sigframe __user *frame;
-	struct task_struct *task;
 	sigset_t set;
 
 	/* Always make any pending restarted system calls return -EINTR */
@@ -86,8 +85,7 @@ SYSCALL_DEFINE0(rt_sigreturn)
 	return regs->a0;
 
 badframe:
-	task = current;
-	force_sig(SIGSEGV, task);
+	force_sig(SIGSEGV);
 	return 0;
 }
 
diff --git a/arch/h8300/kernel/ptrace_h.c b/arch/h8300/kernel/ptrace_h.c
index f5ff3b794c85..15db45a03b04 100644
--- a/arch/h8300/kernel/ptrace_h.c
+++ b/arch/h8300/kernel/ptrace_h.c
@@ -250,7 +250,7 @@ asmlinkage void trace_trap(unsigned long bp)
 {
 	if ((unsigned long)current->thread.breakinfo.addr == bp) {
 		user_disable_single_step(current);
-		force_sig(SIGTRAP, current);
+		force_sig(SIGTRAP);
 	} else
-		force_sig(SIGILL, current);
+		force_sig(SIGILL);
 }
diff --git a/arch/h8300/kernel/ptrace_s.c b/arch/h8300/kernel/ptrace_s.c
index c0af930052c0..ee21f37b7ed4 100644
--- a/arch/h8300/kernel/ptrace_s.c
+++ b/arch/h8300/kernel/ptrace_s.c
@@ -40,5 +40,5 @@ void user_enable_single_step(struct task_struct *child)
 asmlinkage void trace_trap(unsigned long bp)
 {
 	(void)bp;
-	force_sig(SIGTRAP, current);
+	force_sig(SIGTRAP);
 }
diff --git a/arch/h8300/kernel/signal.c b/arch/h8300/kernel/signal.c
index e0f2b708e5d9..ef7489b7c459 100644
--- a/arch/h8300/kernel/signal.c
+++ b/arch/h8300/kernel/signal.c
@@ -126,7 +126,7 @@ asmlinkage int sys_rt_sigreturn(void)
 	return er0;
 
 badframe:
-	force_sig(SIGSEGV, current);
+	force_sig(SIGSEGV);
 	return 0;
 }
 
diff --git a/arch/hexagon/kernel/signal.c b/arch/hexagon/kernel/signal.c
index 31e2cf95f189..0433fcbb496c 100644
--- a/arch/hexagon/kernel/signal.c
+++ b/arch/hexagon/kernel/signal.c
@@ -265,6 +265,6 @@ asmlinkage int sys_rt_sigreturn(void)
 	return regs->r00;
 
 badframe:
-	force_sig(SIGSEGV, current);
+	force_sig(SIGSEGV);
 	return 0;
 }
diff --git a/arch/hexagon/kernel/traps.c b/arch/hexagon/kernel/traps.c
index 91ee04842c22..e634414361df 100644
--- a/arch/hexagon/kernel/traps.c
+++ b/arch/hexagon/kernel/traps.c
@@ -252,7 +252,7 @@ int die_if_kernel(char *str, struct pt_regs *regs, long err)
 static void misaligned_instruction(struct pt_regs *regs)
 {
 	die_if_kernel("Misaligned Instruction", regs, 0);
-	force_sig(SIGBUS, current);
+	force_sig(SIGBUS);
 }
 
 /*
@@ -263,19 +263,19 @@ static void misaligned_instruction(struct pt_regs *regs)
 static void misaligned_data_load(struct pt_regs *regs)
 {
 	die_if_kernel("Misaligned Data Load", regs, 0);
-	force_sig(SIGBUS, current);
+	force_sig(SIGBUS);
 }
 
 static void misaligned_data_store(struct pt_regs *regs)
 {
 	die_if_kernel("Misaligned Data Store", regs, 0);
-	force_sig(SIGBUS, current);
+	force_sig(SIGBUS);
 }
 
 static void illegal_instruction(struct pt_regs *regs)
 {
 	die_if_kernel("Illegal Instruction", regs, 0);
-	force_sig(SIGILL, current);
+	force_sig(SIGILL);
 }
 
 /*
@@ -285,7 +285,7 @@ static void illegal_instruction(struct pt_regs *regs)
 static void precise_bus_error(struct pt_regs *regs)
 {
 	die_if_kernel("Precise Bus Error", regs, 0);
-	force_sig(SIGBUS, current);
+	force_sig(SIGBUS);
 }
 
 /*
diff --git a/arch/ia64/kernel/signal.c b/arch/ia64/kernel/signal.c
index 518cceb5d4af..e5044aed9452 100644
--- a/arch/ia64/kernel/signal.c
+++ b/arch/ia64/kernel/signal.c
@@ -152,7 +152,7 @@ ia64_rt_sigreturn (struct sigscratch *scr)
 	return retval;
 
   give_sigsegv:
-	force_sig(SIGSEGV, current);
+	force_sig(SIGSEGV);
 	return retval;
 }
 
diff --git a/arch/ia64/kernel/traps.c b/arch/ia64/kernel/traps.c
index 85d8616ac4f6..0a3adbfebc2a 100644
--- a/arch/ia64/kernel/traps.c
+++ b/arch/ia64/kernel/traps.c
@@ -589,14 +589,14 @@ ia64_fault (unsigned long vector, unsigned long isr, unsigned long ifa,
 		printk(KERN_ERR "Unexpected IA-32 exception (Trap 45)\n");
 		printk(KERN_ERR "  iip - 0x%lx, ifa - 0x%lx, isr - 0x%lx\n",
 		       iip, ifa, isr);
-		force_sig(SIGSEGV, current);
+		force_sig(SIGSEGV);
 		return;
 
 	      case 46:
 		printk(KERN_ERR "Unexpected IA-32 intercept trap (Trap 46)\n");
 		printk(KERN_ERR "  iip - 0x%lx, ifa - 0x%lx, isr - 0x%lx, iim - 0x%lx\n",
 		       iip, ifa, isr, iim);
-		force_sig(SIGSEGV, current);
+		force_sig(SIGSEGV);
 		return;
 
 	      case 47:
@@ -608,5 +608,5 @@ ia64_fault (unsigned long vector, unsigned long isr, unsigned long ifa,
 		break;
 	}
 	if (!die_if_kernel(buf, &regs, error))
-		force_sig(SIGILL, current);
+		force_sig(SIGILL);
 }
diff --git a/arch/m68k/kernel/signal.c b/arch/m68k/kernel/signal.c
index 87e7f3639839..05610e6924c1 100644
--- a/arch/m68k/kernel/signal.c
+++ b/arch/m68k/kernel/signal.c
@@ -803,7 +803,7 @@ asmlinkage int do_sigreturn(struct pt_regs *regs, struct switch_stack *sw)
 	return regs->d0;
 
 badframe:
-	force_sig(SIGSEGV, current);
+	force_sig(SIGSEGV);
 	return 0;
 }
 
@@ -825,7 +825,7 @@ asmlinkage int do_rt_sigreturn(struct pt_regs *regs, struct switch_stack *sw)
 	return regs->d0;
 
 badframe:
-	force_sig(SIGSEGV, current);
+	force_sig(SIGSEGV);
 	return 0;
 }
 
diff --git a/arch/m68k/kernel/traps.c b/arch/m68k/kernel/traps.c
index b2fd000b9285..2b6e143abd73 100644
--- a/arch/m68k/kernel/traps.c
+++ b/arch/m68k/kernel/traps.c
@@ -431,7 +431,7 @@ static inline void bus_error030 (struct frame *fp)
 			pr_err("BAD KERNEL BUSERR\n");
 
 			die_if_kernel("Oops", &fp->ptregs,0);
-			force_sig(SIGKILL, current);
+			force_sig(SIGKILL);
 			return;
 		}
 	} else {
@@ -463,7 +463,7 @@ static inline void bus_error030 (struct frame *fp)
 				 !(ssw & RW) ? "write" : "read", addr,
 				 fp->ptregs.pc);
 			die_if_kernel ("Oops", &fp->ptregs, buserr_type);
-			force_sig (SIGBUS, current);
+			force_sig (SIGBUS);
 			return;
 		}
 
@@ -493,7 +493,7 @@ static inline void bus_error030 (struct frame *fp)
 			do_page_fault (&fp->ptregs, addr, 0);
        } else {
 		pr_debug("protection fault on insn access (segv).\n");
-		force_sig (SIGSEGV, current);
+		force_sig (SIGSEGV);
        }
 }
 #else
@@ -571,7 +571,7 @@ static inline void bus_error030 (struct frame *fp)
 			       !(ssw & RW) ? "write" : "read", addr,
 			       fp->ptregs.pc);
 			die_if_kernel("Oops",&fp->ptregs,mmusr);
-			force_sig(SIGSEGV, current);
+			force_sig(SIGSEGV);
 			return;
 		} else {
 #if 0
@@ -598,7 +598,7 @@ static inline void bus_error030 (struct frame *fp)
 #endif
 			pr_debug("Unknown SIGSEGV - 1\n");
 			die_if_kernel("Oops",&fp->ptregs,mmusr);
-			force_sig(SIGSEGV, current);
+			force_sig(SIGSEGV);
 			return;
 		}
 
@@ -621,7 +621,7 @@ static inline void bus_error030 (struct frame *fp)
 	buserr:
 		pr_err("BAD KERNEL BUSERR\n");
 		die_if_kernel("Oops",&fp->ptregs,0);
-		force_sig(SIGKILL, current);
+		force_sig(SIGKILL);
 		return;
 	}
 
@@ -660,7 +660,7 @@ static inline void bus_error030 (struct frame *fp)
 			addr, fp->ptregs.pc);
 		pr_debug("Unknown SIGSEGV - 2\n");
 		die_if_kernel("Oops",&fp->ptregs,mmusr);
-		force_sig(SIGSEGV, current);
+		force_sig(SIGSEGV);
 		return;
 	}
 
@@ -804,7 +804,7 @@ asmlinkage void buserr_c(struct frame *fp)
 	default:
 	  die_if_kernel("bad frame format",&fp->ptregs,0);
 	  pr_debug("Unknown SIGSEGV - 4\n");
-	  force_sig(SIGSEGV, current);
+	  force_sig(SIGSEGV);
 	}
 }
 
diff --git a/arch/microblaze/kernel/signal.c b/arch/microblaze/kernel/signal.c
index 0685696349bb..cdd4feb279c5 100644
--- a/arch/microblaze/kernel/signal.c
+++ b/arch/microblaze/kernel/signal.c
@@ -108,7 +108,7 @@ asmlinkage long sys_rt_sigreturn(struct pt_regs *regs)
 	return rval;
 
 badframe:
-	force_sig(SIGSEGV, current);
+	force_sig(SIGSEGV);
 	return 0;
 }
 
diff --git a/arch/mips/kernel/branch.c b/arch/mips/kernel/branch.c
index 180ad081afcf..1db29957a931 100644
--- a/arch/mips/kernel/branch.c
+++ b/arch/mips/kernel/branch.c
@@ -32,7 +32,7 @@ int __isa_exception_epc(struct pt_regs *regs)
 	/* Calculate exception PC in branch delay slot. */
 	if (__get_user(inst, (u16 __user *) msk_isa16_mode(epc))) {
 		/* This should never happen because delay slot was checked. */
-		force_sig(SIGSEGV, current);
+		force_sig(SIGSEGV);
 		return epc;
 	}
 	if (cpu_has_mips16) {
@@ -305,7 +305,7 @@ int __microMIPS_compute_return_epc(struct pt_regs *regs)
 	return 0;
 
 sigsegv:
-	force_sig(SIGSEGV, current);
+	force_sig(SIGSEGV);
 	return -EFAULT;
 }
 
@@ -328,7 +328,7 @@ int __MIPS16e_compute_return_epc(struct pt_regs *regs)
 	/* Read the instruction. */
 	addr = (u16 __user *)msk_isa16_mode(epc);
 	if (__get_user(inst.full, addr)) {
-		force_sig(SIGSEGV, current);
+		force_sig(SIGSEGV);
 		return -EFAULT;
 	}
 
@@ -343,7 +343,7 @@ int __MIPS16e_compute_return_epc(struct pt_regs *regs)
 	case MIPS16e_jal_op:
 		addr += 1;
 		if (__get_user(inst2, addr)) {
-			force_sig(SIGSEGV, current);
+			force_sig(SIGSEGV);
 			return -EFAULT;
 		}
 		fullinst = ((unsigned)inst.full << 16) | inst2;
@@ -829,17 +829,17 @@ int __compute_return_epc_for_insn(struct pt_regs *regs,
 sigill_dsp:
 	pr_debug("%s: DSP branch but not DSP ASE - sending SIGILL.\n",
 		 current->comm);
-	force_sig(SIGILL, current);
+	force_sig(SIGILL);
 	return -EFAULT;
 sigill_r2r6:
 	pr_debug("%s: R2 branch but r2-to-r6 emulator is not present - sending SIGILL.\n",
 		 current->comm);
-	force_sig(SIGILL, current);
+	force_sig(SIGILL);
 	return -EFAULT;
 sigill_r6:
 	pr_debug("%s: R6 branch but no MIPSr6 ISA support - sending SIGILL.\n",
 		 current->comm);
-	force_sig(SIGILL, current);
+	force_sig(SIGILL);
 	return -EFAULT;
 }
 EXPORT_SYMBOL_GPL(__compute_return_epc_for_insn);
@@ -859,7 +859,7 @@ int __compute_return_epc(struct pt_regs *regs)
 	 */
 	addr = (unsigned int __user *) epc;
 	if (__get_user(insn.word, addr)) {
-		force_sig(SIGSEGV, current);
+		force_sig(SIGSEGV);
 		return -EFAULT;
 	}
 
@@ -867,7 +867,7 @@ int __compute_return_epc(struct pt_regs *regs)
 
 unaligned:
 	printk("%s: unaligned epc - sending SIGBUS.\n", current->comm);
-	force_sig(SIGBUS, current);
+	force_sig(SIGBUS);
 	return -EFAULT;
 }
 
diff --git a/arch/mips/kernel/kprobes.c b/arch/mips/kernel/kprobes.c
index 54cd675c5d1d..62af3ed65794 100644
--- a/arch/mips/kernel/kprobes.c
+++ b/arch/mips/kernel/kprobes.c
@@ -232,7 +232,7 @@ static int evaluate_branch_instruction(struct kprobe *p, struct pt_regs *regs,
 
 unaligned:
 	pr_notice("%s: unaligned epc - sending SIGBUS.\n", current->comm);
-	force_sig(SIGBUS, current);
+	force_sig(SIGBUS);
 	return -EFAULT;
 
 }
diff --git a/arch/mips/kernel/signal.c b/arch/mips/kernel/signal.c
index d75337974ee9..f6efabcb4e92 100644
--- a/arch/mips/kernel/signal.c
+++ b/arch/mips/kernel/signal.c
@@ -641,7 +641,7 @@ asmlinkage void sys_sigreturn(void)
 	if (sig < 0)
 		goto badframe;
 	else if (sig)
-		force_sig(sig, current);
+		force_sig(sig);
 
 	/*
 	 * Don't let your children do this ...
@@ -654,7 +654,7 @@ asmlinkage void sys_sigreturn(void)
 	/* Unreached */
 
 badframe:
-	force_sig(SIGSEGV, current);
+	force_sig(SIGSEGV);
 }
 #endif /* CONFIG_TRAD_SIGNALS */
 
@@ -678,7 +678,7 @@ asmlinkage void sys_rt_sigreturn(void)
 	if (sig < 0)
 		goto badframe;
 	else if (sig)
-		force_sig(sig, current);
+		force_sig(sig);
 
 	if (restore_altstack(&frame->rs_uc.uc_stack))
 		goto badframe;
@@ -694,7 +694,7 @@ asmlinkage void sys_rt_sigreturn(void)
 	/* Unreached */
 
 badframe:
-	force_sig(SIGSEGV, current);
+	force_sig(SIGSEGV);
 }
 
 #ifdef CONFIG_TRAD_SIGNALS
diff --git a/arch/mips/kernel/signal_n32.c b/arch/mips/kernel/signal_n32.c
index c498b027823e..a7601e862261 100644
--- a/arch/mips/kernel/signal_n32.c
+++ b/arch/mips/kernel/signal_n32.c
@@ -84,7 +84,7 @@ asmlinkage void sysn32_rt_sigreturn(void)
 	if (sig < 0)
 		goto badframe;
 	else if (sig)
-		force_sig(sig, current);
+		force_sig(sig);
 
 	if (compat_restore_altstack(&frame->rs_uc.uc_stack))
 		goto badframe;
@@ -100,7 +100,7 @@ asmlinkage void sysn32_rt_sigreturn(void)
 	/* Unreached */
 
 badframe:
-	force_sig(SIGSEGV, current);
+	force_sig(SIGSEGV);
 }
 
 static int setup_rt_frame_n32(void *sig_return, struct ksignal *ksig,
diff --git a/arch/mips/kernel/signal_o32.c b/arch/mips/kernel/signal_o32.c
index df259618e834..299a7a28ca33 100644
--- a/arch/mips/kernel/signal_o32.c
+++ b/arch/mips/kernel/signal_o32.c
@@ -171,7 +171,7 @@ asmlinkage void sys32_rt_sigreturn(void)
 	if (sig < 0)
 		goto badframe;
 	else if (sig)
-		force_sig(sig, current);
+		force_sig(sig);
 
 	if (compat_restore_altstack(&frame->rs_uc.uc_stack))
 		goto badframe;
@@ -187,7 +187,7 @@ asmlinkage void sys32_rt_sigreturn(void)
 	/* Unreached */
 
 badframe:
-	force_sig(SIGSEGV, current);
+	force_sig(SIGSEGV);
 }
 
 static int setup_rt_frame_32(void *sig_return, struct ksignal *ksig,
@@ -273,7 +273,7 @@ asmlinkage void sys32_sigreturn(void)
 	if (sig < 0)
 		goto badframe;
 	else if (sig)
-		force_sig(sig, current);
+		force_sig(sig);
 
 	/*
 	 * Don't let your children do this ...
@@ -286,5 +286,5 @@ asmlinkage void sys32_sigreturn(void)
 	/* Unreached */
 
 badframe:
-	force_sig(SIGSEGV, current);
+	force_sig(SIGSEGV);
 }
diff --git a/arch/mips/kernel/traps.c b/arch/mips/kernel/traps.c
index c52766a5b85f..a6031b045b95 100644
--- a/arch/mips/kernel/traps.c
+++ b/arch/mips/kernel/traps.c
@@ -482,7 +482,7 @@ asmlinkage void do_be(struct pt_regs *regs)
 		goto out;
 
 	die_if_kernel("Oops", regs);
-	force_sig(SIGBUS, current);
+	force_sig(SIGBUS);
 
 out:
 	exception_exit(prev_state);
@@ -765,7 +765,7 @@ int process_fpemu_return(int sig, void __user *fault_addr, unsigned long fcr31)
 		return 1;
 
 	default:
-		force_sig(sig, current);
+		force_sig(sig);
 		return 1;
 	}
 }
@@ -947,7 +947,7 @@ void do_trap_or_bp(struct pt_regs *regs, unsigned int code, int si_code,
 		break;
 	case BRK_BUG:
 		die_if_kernel("Kernel bug detected", regs);
-		force_sig(SIGTRAP, current);
+		force_sig(SIGTRAP);
 		break;
 	case BRK_MEMU:
 		/*
@@ -962,7 +962,7 @@ void do_trap_or_bp(struct pt_regs *regs, unsigned int code, int si_code,
 			return;
 
 		die_if_kernel("Math emu break/trap", regs);
-		force_sig(SIGTRAP, current);
+		force_sig(SIGTRAP);
 		break;
 	default:
 		scnprintf(b, sizeof(b), "%s instruction in kernel code", str);
@@ -970,7 +970,7 @@ void do_trap_or_bp(struct pt_regs *regs, unsigned int code, int si_code,
 		if (si_code) {
 			force_sig_fault(SIGTRAP, si_code, NULL,	current);
 		} else {
-			force_sig(SIGTRAP, current);
+			force_sig(SIGTRAP);
 		}
 	}
 }
@@ -1063,7 +1063,7 @@ out:
 	return;
 
 out_sigsegv:
-	force_sig(SIGSEGV, current);
+	force_sig(SIGSEGV);
 	goto out;
 }
 
@@ -1105,7 +1105,7 @@ out:
 	return;
 
 out_sigsegv:
-	force_sig(SIGSEGV, current);
+	force_sig(SIGSEGV);
 	goto out;
 }
 
@@ -1191,7 +1191,7 @@ no_r2_instr:
 	if (unlikely(status > 0)) {
 		regs->cp0_epc = old_epc;		/* Undo skip-over.  */
 		regs->regs[31] = old31;
-		force_sig(status, current);
+		force_sig(status);
 	}
 
 out:
@@ -1220,7 +1220,7 @@ static int default_cu2_call(struct notifier_block *nfb, unsigned long action,
 
 	die_if_kernel("COP2: Unhandled kernel unaligned access or invalid "
 			      "instruction", regs);
-	force_sig(SIGILL, current);
+	force_sig(SIGILL);
 
 	return NOTIFY_OK;
 }
@@ -1383,7 +1383,7 @@ asmlinkage void do_cpu(struct pt_regs *regs)
 		if (unlikely(status > 0)) {
 			regs->cp0_epc = old_epc;	/* Undo skip-over.  */
 			regs->regs[31] = old31;
-			force_sig(status, current);
+			force_sig(status);
 		}
 
 		break;
@@ -1403,7 +1403,7 @@ asmlinkage void do_cpu(struct pt_regs *regs)
 		 * emulator too.
 		 */
 		if (raw_cpu_has_fpu || !cpu_has_mips_4_5_64_r2_r6) {
-			force_sig(SIGILL, current);
+			force_sig(SIGILL);
 			break;
 		}
 		/* Fall through.  */
@@ -1437,7 +1437,7 @@ asmlinkage void do_cpu(struct pt_regs *regs)
 #else /* CONFIG_MIPS_FP_SUPPORT */
 	case 1:
 	case 3:
-		force_sig(SIGILL, current);
+		force_sig(SIGILL);
 		break;
 #endif /* CONFIG_MIPS_FP_SUPPORT */
 
@@ -1464,7 +1464,7 @@ asmlinkage void do_msa_fpe(struct pt_regs *regs, unsigned int msacsr)
 	local_irq_enable();
 
 	die_if_kernel("do_msa_fpe invoked from kernel context!", regs);
-	force_sig(SIGFPE, current);
+	force_sig(SIGFPE);
 out:
 	exception_exit(prev_state);
 }
@@ -1477,7 +1477,7 @@ asmlinkage void do_msa(struct pt_regs *regs)
 	prev_state = exception_enter();
 
 	if (!cpu_has_msa || test_thread_flag(TIF_32BIT_FPREGS)) {
-		force_sig(SIGILL, current);
+		force_sig(SIGILL);
 		goto out;
 	}
 
@@ -1485,7 +1485,7 @@ asmlinkage void do_msa(struct pt_regs *regs)
 
 	err = enable_restore_fp_context(1);
 	if (err)
-		force_sig(SIGILL, current);
+		force_sig(SIGILL);
 out:
 	exception_exit(prev_state);
 }
@@ -1495,7 +1495,7 @@ asmlinkage void do_mdmx(struct pt_regs *regs)
 	enum ctx_state prev_state;
 
 	prev_state = exception_enter();
-	force_sig(SIGILL, current);
+	force_sig(SIGILL);
 	exception_exit(prev_state);
 }
 
@@ -1592,7 +1592,7 @@ asmlinkage void do_mt(struct pt_regs *regs)
 	}
 	die_if_kernel("MIPS MT Thread exception in kernel", regs);
 
-	force_sig(SIGILL, current);
+	force_sig(SIGILL);
 }
 
 
@@ -1601,7 +1601,7 @@ asmlinkage void do_dsp(struct pt_regs *regs)
 	if (cpu_has_dsp)
 		panic("Unexpected DSP exception");
 
-	force_sig(SIGILL, current);
+	force_sig(SIGILL);
 }
 
 asmlinkage void do_reserved(struct pt_regs *regs)
diff --git a/arch/mips/kernel/unaligned.c b/arch/mips/kernel/unaligned.c
index 76e33f940971..92bd2b0f0548 100644
--- a/arch/mips/kernel/unaligned.c
+++ b/arch/mips/kernel/unaligned.c
@@ -1365,20 +1365,20 @@ fault:
 		return;
 
 	die_if_kernel("Unhandled kernel unaligned access", regs);
-	force_sig(SIGSEGV, current);
+	force_sig(SIGSEGV);
 
 	return;
 
 sigbus:
 	die_if_kernel("Unhandled kernel unaligned access", regs);
-	force_sig(SIGBUS, current);
+	force_sig(SIGBUS);
 
 	return;
 
 sigill:
 	die_if_kernel
 	    ("Unhandled kernel unaligned access or invalid instruction", regs);
-	force_sig(SIGILL, current);
+	force_sig(SIGILL);
 }
 
 /* Recode table from 16-bit register notation to 32-bit GPR. */
@@ -1991,20 +1991,20 @@ fault:
 		return;
 
 	die_if_kernel("Unhandled kernel unaligned access", regs);
-	force_sig(SIGSEGV, current);
+	force_sig(SIGSEGV);
 
 	return;
 
 sigbus:
 	die_if_kernel("Unhandled kernel unaligned access", regs);
-	force_sig(SIGBUS, current);
+	force_sig(SIGBUS);
 
 	return;
 
 sigill:
 	die_if_kernel
 	    ("Unhandled kernel unaligned access or invalid instruction", regs);
-	force_sig(SIGILL, current);
+	force_sig(SIGILL);
 }
 
 static void emulate_load_store_MIPS16e(struct pt_regs *regs, void __user * addr)
@@ -2271,20 +2271,20 @@ fault:
 		return;
 
 	die_if_kernel("Unhandled kernel unaligned access", regs);
-	force_sig(SIGSEGV, current);
+	force_sig(SIGSEGV);
 
 	return;
 
 sigbus:
 	die_if_kernel("Unhandled kernel unaligned access", regs);
-	force_sig(SIGBUS, current);
+	force_sig(SIGBUS);
 
 	return;
 
 sigill:
 	die_if_kernel
 	    ("Unhandled kernel unaligned access or invalid instruction", regs);
-	force_sig(SIGILL, current);
+	force_sig(SIGILL);
 }
 
 asmlinkage void do_ade(struct pt_regs *regs)
@@ -2364,7 +2364,7 @@ asmlinkage void do_ade(struct pt_regs *regs)
 
 sigbus:
 	die_if_kernel("Kernel unaligned instruction access", regs);
-	force_sig(SIGBUS, current);
+	force_sig(SIGBUS);
 
 	/*
 	 * XXX On return from the signal handler we should advance the epc
diff --git a/arch/mips/sgi-ip22/ip22-berr.c b/arch/mips/sgi-ip22/ip22-berr.c
index 34bb9801d5ff..dc0110a607a5 100644
--- a/arch/mips/sgi-ip22/ip22-berr.c
+++ b/arch/mips/sgi-ip22/ip22-berr.c
@@ -98,7 +98,7 @@ void ip22_be_interrupt(int irq)
 	       field, regs->cp0_epc, field, regs->regs[31]);
 	/* Assume it would be too dangerous to continue ... */
 	die_if_kernel("Oops", regs);
-	force_sig(SIGBUS, current);
+	force_sig(SIGBUS);
 }
 
 static int ip22_be_handler(struct pt_regs *regs, int is_fixup)
diff --git a/arch/mips/sgi-ip22/ip28-berr.c b/arch/mips/sgi-ip22/ip28-berr.c
index 082541d33161..c0cf7baee36d 100644
--- a/arch/mips/sgi-ip22/ip28-berr.c
+++ b/arch/mips/sgi-ip22/ip28-berr.c
@@ -462,7 +462,7 @@ void ip22_be_interrupt(int irq)
 	if (ip28_be_interrupt(regs) != MIPS_BE_DISCARD) {
 		/* Assume it would be too dangerous to continue ... */
 		die_if_kernel("Oops", regs);
-		force_sig(SIGBUS, current);
+		force_sig(SIGBUS);
 	} else if (debug_be_interrupt)
 		show_regs(regs);
 }
diff --git a/arch/mips/sgi-ip27/ip27-berr.c b/arch/mips/sgi-ip27/ip27-berr.c
index 83efe03d5c60..73ad29b180fb 100644
--- a/arch/mips/sgi-ip27/ip27-berr.c
+++ b/arch/mips/sgi-ip27/ip27-berr.c
@@ -74,7 +74,7 @@ int ip27_be_handler(struct pt_regs *regs, int is_fixup)
 	show_regs(regs);
 	dump_tlb_all();
 	while(1);
-	force_sig(SIGBUS, current);
+	force_sig(SIGBUS);
 }
 
 void __init ip27_be_init(void)
diff --git a/arch/mips/sgi-ip32/ip32-berr.c b/arch/mips/sgi-ip32/ip32-berr.c
index c1f12a9cf305..c860f95ab7ed 100644
--- a/arch/mips/sgi-ip32/ip32-berr.c
+++ b/arch/mips/sgi-ip32/ip32-berr.c
@@ -29,7 +29,7 @@ static int ip32_be_handler(struct pt_regs *regs, int is_fixup)
 	show_regs(regs);
 	dump_tlb_all();
 	while(1);
-	force_sig(SIGBUS, current);
+	force_sig(SIGBUS);
 }
 
 void __init ip32_be_init(void)
diff --git a/arch/nds32/kernel/signal.c b/arch/nds32/kernel/signal.c
index 5f7660aa2d68..fe61513982b4 100644
--- a/arch/nds32/kernel/signal.c
+++ b/arch/nds32/kernel/signal.c
@@ -163,7 +163,7 @@ asmlinkage long sys_rt_sigreturn(struct pt_regs *regs)
 	return regs->uregs[0];
 
 badframe:
-	force_sig(SIGSEGV, current);
+	force_sig(SIGSEGV);
 	return 0;
 }
 
diff --git a/arch/nds32/kernel/traps.c b/arch/nds32/kernel/traps.c
index 5aa7c17da27a..8d84b8b30eb6 100644
--- a/arch/nds32/kernel/traps.c
+++ b/arch/nds32/kernel/traps.c
@@ -288,7 +288,7 @@ void unhandled_interruption(struct pt_regs *regs)
 	show_regs(regs);
 	if (!user_mode(regs))
 		do_exit(SIGKILL);
-	force_sig(SIGKILL, current);
+	force_sig(SIGKILL);
 }
 
 void unhandled_exceptions(unsigned long entry, unsigned long addr,
@@ -299,7 +299,7 @@ void unhandled_exceptions(unsigned long entry, unsigned long addr,
 	show_regs(regs);
 	if (!user_mode(regs))
 		do_exit(SIGKILL);
-	force_sig(SIGKILL, current);
+	force_sig(SIGKILL);
 }
 
 extern int do_page_fault(unsigned long entry, unsigned long addr,
@@ -326,7 +326,7 @@ void do_revinsn(struct pt_regs *regs)
 	show_regs(regs);
 	if (!user_mode(regs))
 		do_exit(SIGILL);
-	force_sig(SIGILL, current);
+	force_sig(SIGILL);
 }
 
 #ifdef CONFIG_ALIGNMENT_TRAP
diff --git a/arch/nios2/kernel/signal.c b/arch/nios2/kernel/signal.c
index 9bf38531b189..a42dd09c6578 100644
--- a/arch/nios2/kernel/signal.c
+++ b/arch/nios2/kernel/signal.c
@@ -120,7 +120,7 @@ asmlinkage int do_rt_sigreturn(struct switch_stack *sw)
 	return rval;
 
 badframe:
-	force_sig(SIGSEGV, current);
+	force_sig(SIGSEGV);
 	return 0;
 }
 
diff --git a/arch/openrisc/kernel/signal.c b/arch/openrisc/kernel/signal.c
index 5ac9d3b1d615..0337d1e1d2d5 100644
--- a/arch/openrisc/kernel/signal.c
+++ b/arch/openrisc/kernel/signal.c
@@ -99,7 +99,7 @@ asmlinkage long _sys_rt_sigreturn(struct pt_regs *regs)
 	return regs->gpr[11];
 
 badframe:
-	force_sig(SIGSEGV, current);
+	force_sig(SIGSEGV);
 	return 0;
 }
 
diff --git a/arch/openrisc/kernel/traps.c b/arch/openrisc/kernel/traps.c
index 6ed7293ef007..0fad2e46ff43 100644
--- a/arch/openrisc/kernel/traps.c
+++ b/arch/openrisc/kernel/traps.c
@@ -376,7 +376,7 @@ static inline void simulate_lwa(struct pt_regs *regs, unsigned long address,
 
 	if (get_user(value, lwa_addr)) {
 		if (user_mode(regs)) {
-			force_sig(SIGSEGV, current);
+			force_sig(SIGSEGV);
 			return;
 		}
 
@@ -423,7 +423,7 @@ static inline void simulate_swa(struct pt_regs *regs, unsigned long address,
 
 	if (put_user(regs->gpr[rb], vaddr)) {
 		if (user_mode(regs)) {
-			force_sig(SIGSEGV, current);
+			force_sig(SIGSEGV);
 			return;
 		}
 
diff --git a/arch/parisc/kernel/signal.c b/arch/parisc/kernel/signal.c
index 848c1934680b..02895a8f2c55 100644
--- a/arch/parisc/kernel/signal.c
+++ b/arch/parisc/kernel/signal.c
@@ -164,7 +164,7 @@ sys_rt_sigreturn(struct pt_regs *regs, int in_syscall)
 
 give_sigsegv:
 	DBG(1,"sys_rt_sigreturn: Sending SIGSEGV\n");
-	force_sig(SIGSEGV, current);
+	force_sig(SIGSEGV);
 	return;
 }
 
diff --git a/arch/powerpc/kernel/signal_32.c b/arch/powerpc/kernel/signal_32.c
index ede4f04281ae..fd48cdc0a4ff 100644
--- a/arch/powerpc/kernel/signal_32.c
+++ b/arch/powerpc/kernel/signal_32.c
@@ -1249,7 +1249,7 @@ SYSCALL_DEFINE0(rt_sigreturn)
 				   current->comm, current->pid,
 				   rt_sf, regs->nip, regs->link);
 
-	force_sig(SIGSEGV, current);
+	force_sig(SIGSEGV);
 	return 0;
 }
 
@@ -1338,7 +1338,7 @@ SYSCALL_DEFINE3(debug_setcontext, struct ucontext __user *, ctx,
 					   current->comm, current->pid,
 					   ctx, regs->nip, regs->link);
 
-		force_sig(SIGSEGV, current);
+		force_sig(SIGSEGV);
 		goto out;
 	}
 
@@ -1516,6 +1516,6 @@ badframe:
 				   current->comm, current->pid,
 				   addr, regs->nip, regs->link);
 
-	force_sig(SIGSEGV, current);
+	force_sig(SIGSEGV);
 	return 0;
 }
diff --git a/arch/powerpc/kernel/signal_64.c b/arch/powerpc/kernel/signal_64.c
index 06c299ef6132..ea08d848f558 100644
--- a/arch/powerpc/kernel/signal_64.c
+++ b/arch/powerpc/kernel/signal_64.c
@@ -812,7 +812,7 @@ badframe:
 				   current->comm, current->pid, "rt_sigreturn",
 				   (long)uc, regs->nip, regs->link);
 
-	force_sig(SIGSEGV, current);
+	force_sig(SIGSEGV);
 	return 0;
 }
 
diff --git a/arch/powerpc/platforms/cell/spufs/run.c b/arch/powerpc/platforms/cell/spufs/run.c
index 07f82d7395ff..3f2380f40f99 100644
--- a/arch/powerpc/platforms/cell/spufs/run.c
+++ b/arch/powerpc/platforms/cell/spufs/run.c
@@ -443,7 +443,7 @@ long spufs_run_spu(struct spu_context *ctx, u32 *npc, u32 *event)
 
 	else if (unlikely((status & SPU_STATUS_STOPPED_BY_STOP)
 	    && (status >> SPU_STOP_STATUS_SHIFT) == 0x3fff)) {
-		force_sig(SIGTRAP, current);
+		force_sig(SIGTRAP);
 		ret = -ERESTARTSYS;
 	}
 
diff --git a/arch/riscv/kernel/signal.c b/arch/riscv/kernel/signal.c
index 804d6ee4f3c5..50c0e64372b0 100644
--- a/arch/riscv/kernel/signal.c
+++ b/arch/riscv/kernel/signal.c
@@ -139,7 +139,7 @@ badframe:
 			task->comm, task_pid_nr(task), __func__,
 			frame, (void *)regs->sepc, (void *)regs->sp);
 	}
-	force_sig(SIGSEGV, task);
+	force_sig(SIGSEGV);
 	return 0;
 }
 
diff --git a/arch/s390/kernel/compat_signal.c b/arch/s390/kernel/compat_signal.c
index 6f2a193ccccc..38d4bdbc34b9 100644
--- a/arch/s390/kernel/compat_signal.c
+++ b/arch/s390/kernel/compat_signal.c
@@ -194,7 +194,7 @@ COMPAT_SYSCALL_DEFINE0(sigreturn)
 	load_sigregs();
 	return regs->gprs[2];
 badframe:
-	force_sig(SIGSEGV, current);
+	force_sig(SIGSEGV);
 	return 0;
 }
 
@@ -217,7 +217,7 @@ COMPAT_SYSCALL_DEFINE0(rt_sigreturn)
 	load_sigregs();
 	return regs->gprs[2];
 badframe:
-	force_sig(SIGSEGV, current);
+	force_sig(SIGSEGV);
 	return 0;
 }	
 
diff --git a/arch/s390/kernel/signal.c b/arch/s390/kernel/signal.c
index 22f08245aa5d..e6fca5498e1f 100644
--- a/arch/s390/kernel/signal.c
+++ b/arch/s390/kernel/signal.c
@@ -232,7 +232,7 @@ SYSCALL_DEFINE0(sigreturn)
 	load_sigregs();
 	return regs->gprs[2];
 badframe:
-	force_sig(SIGSEGV, current);
+	force_sig(SIGSEGV);
 	return 0;
 }
 
@@ -256,7 +256,7 @@ SYSCALL_DEFINE0(rt_sigreturn)
 	load_sigregs();
 	return regs->gprs[2];
 badframe:
-	force_sig(SIGSEGV, current);
+	force_sig(SIGSEGV);
 	return 0;
 }
 
diff --git a/arch/sh/kernel/cpu/sh2a/fpu.c b/arch/sh/kernel/cpu/sh2a/fpu.c
index 74b48db86dd7..0bcff11a4843 100644
--- a/arch/sh/kernel/cpu/sh2a/fpu.c
+++ b/arch/sh/kernel/cpu/sh2a/fpu.c
@@ -568,5 +568,5 @@ BUILD_TRAP_HANDLER(fpu_error)
 		return;
 	}
 
-	force_sig(SIGFPE, tsk);
+	force_sig(SIGFPE);
 }
diff --git a/arch/sh/kernel/cpu/sh4/fpu.c b/arch/sh/kernel/cpu/sh4/fpu.c
index 1ff56e5ba990..03ffd8cdf542 100644
--- a/arch/sh/kernel/cpu/sh4/fpu.c
+++ b/arch/sh/kernel/cpu/sh4/fpu.c
@@ -421,5 +421,5 @@ BUILD_TRAP_HANDLER(fpu_error)
 		}
 	}
 
-	force_sig(SIGFPE, tsk);
+	force_sig(SIGFPE);
 }
diff --git a/arch/sh/kernel/cpu/sh5/fpu.c b/arch/sh/kernel/cpu/sh5/fpu.c
index 9218d9ed787e..3966b5ee8e93 100644
--- a/arch/sh/kernel/cpu/sh5/fpu.c
+++ b/arch/sh/kernel/cpu/sh5/fpu.c
@@ -100,9 +100,7 @@ void restore_fpu(struct task_struct *tsk)
 
 asmlinkage void do_fpu_error(unsigned long ex, struct pt_regs *regs)
 {
-	struct task_struct *tsk = current;
-
 	regs->pc += 4;
 
-	force_sig(SIGFPE, tsk);
+	force_sig(SIGFPE);
 }
diff --git a/arch/sh/kernel/ptrace_64.c b/arch/sh/kernel/ptrace_64.c
index 3390349ff976..11085e48eaa6 100644
--- a/arch/sh/kernel/ptrace_64.c
+++ b/arch/sh/kernel/ptrace_64.c
@@ -550,7 +550,7 @@ asmlinkage void do_single_step(unsigned long long vec, struct pt_regs *regs)
 	   continually stepping. */
 	local_irq_enable();
 	regs->sr &= ~SR_SSTEP;
-	force_sig(SIGTRAP, current);
+	force_sig(SIGTRAP);
 }
 
 /* Called with interrupts disabled */
@@ -561,7 +561,7 @@ BUILD_TRAP_HANDLER(breakpoint)
 	/* We need to forward step the PC, to counteract the backstep done
 	   in signal.c. */
 	local_irq_enable();
-	force_sig(SIGTRAP, current);
+	force_sig(SIGTRAP);
 	regs->pc += 4;
 }
 
diff --git a/arch/sh/kernel/signal_32.c b/arch/sh/kernel/signal_32.c
index 2a2121ba8ebe..24473fa6c3b6 100644
--- a/arch/sh/kernel/signal_32.c
+++ b/arch/sh/kernel/signal_32.c
@@ -176,7 +176,7 @@ asmlinkage int sys_sigreturn(void)
 	return r0;
 
 badframe:
-	force_sig(SIGSEGV, current);
+	force_sig(SIGSEGV);
 	return 0;
 }
 
@@ -207,7 +207,7 @@ asmlinkage int sys_rt_sigreturn(void)
 	return r0;
 
 badframe:
-	force_sig(SIGSEGV, current);
+	force_sig(SIGSEGV);
 	return 0;
 }
 
diff --git a/arch/sh/kernel/signal_64.c b/arch/sh/kernel/signal_64.c
index f1f1598879c2..b9aaa9266b34 100644
--- a/arch/sh/kernel/signal_64.c
+++ b/arch/sh/kernel/signal_64.c
@@ -277,7 +277,7 @@ asmlinkage int sys_sigreturn(unsigned long r2, unsigned long r3,
 	return (int) ret;
 
 badframe:
-	force_sig(SIGSEGV, current);
+	force_sig(SIGSEGV);
 	return 0;
 }
 
@@ -311,7 +311,7 @@ asmlinkage int sys_rt_sigreturn(unsigned long r2, unsigned long r3,
 	return (int) ret;
 
 badframe:
-	force_sig(SIGSEGV, current);
+	force_sig(SIGSEGV);
 	return 0;
 }
 
diff --git a/arch/sh/kernel/traps.c b/arch/sh/kernel/traps.c
index 8b49cced663d..63cf17bc760d 100644
--- a/arch/sh/kernel/traps.c
+++ b/arch/sh/kernel/traps.c
@@ -141,7 +141,7 @@ BUILD_TRAP_HANDLER(debug)
 		       SIGTRAP) == NOTIFY_STOP)
 		return;
 
-	force_sig(SIGTRAP, current);
+	force_sig(SIGTRAP);
 }
 
 /*
@@ -167,7 +167,7 @@ BUILD_TRAP_HANDLER(bug)
 	}
 #endif
 
-	force_sig(SIGTRAP, current);
+	force_sig(SIGTRAP);
 }
 
 BUILD_TRAP_HANDLER(nmi)
diff --git a/arch/sh/kernel/traps_32.c b/arch/sh/kernel/traps_32.c
index f2a18b5fafd8..bd5568c8e7f0 100644
--- a/arch/sh/kernel/traps_32.c
+++ b/arch/sh/kernel/traps_32.c
@@ -611,7 +611,6 @@ asmlinkage void do_reserved_inst(void)
 {
 	struct pt_regs *regs = current_pt_regs();
 	unsigned long error_code;
-	struct task_struct *tsk = current;
 
 #ifdef CONFIG_SH_FPU_EMU
 	unsigned short inst = 0;
@@ -633,7 +632,7 @@ asmlinkage void do_reserved_inst(void)
 		/* Enable DSP mode, and restart instruction. */
 		regs->sr |= SR_DSP;
 		/* Save DSP mode */
-		tsk->thread.dsp_status.status |= SR_DSP;
+		current->thread.dsp_status.status |= SR_DSP;
 		return;
 	}
 #endif
@@ -641,7 +640,7 @@ asmlinkage void do_reserved_inst(void)
 	error_code = lookup_exception_vector();
 
 	local_irq_enable();
-	force_sig(SIGILL, tsk);
+	force_sig(SIGILL);
 	die_if_no_fixup("reserved instruction", regs, error_code);
 }
 
@@ -697,7 +696,6 @@ asmlinkage void do_illegal_slot_inst(void)
 {
 	struct pt_regs *regs = current_pt_regs();
 	unsigned long inst;
-	struct task_struct *tsk = current;
 
 	if (kprobe_handle_illslot(regs->pc) == 0)
 		return;
@@ -716,7 +714,7 @@ asmlinkage void do_illegal_slot_inst(void)
 	inst = lookup_exception_vector();
 
 	local_irq_enable();
-	force_sig(SIGILL, tsk);
+	force_sig(SIGILL);
 	die_if_no_fixup("illegal slot instruction", regs, inst);
 }
 
diff --git a/arch/sh/kernel/traps_64.c b/arch/sh/kernel/traps_64.c
index 8ce90a7da67d..37046f3a26d3 100644
--- a/arch/sh/kernel/traps_64.c
+++ b/arch/sh/kernel/traps_64.c
@@ -599,7 +599,7 @@ static void do_unhandled_exception(int signr, char *str, unsigned long error,
 				   struct pt_regs *regs)
 {
 	if (user_mode(regs))
-		force_sig(signr, current);
+		force_sig(signr);
 
 	die_if_no_fixup(str, regs, error);
 }
diff --git a/arch/sparc/kernel/process_64.c b/arch/sparc/kernel/process_64.c
index 59eaf6227af1..c4bccd97f3cf 100644
--- a/arch/sparc/kernel/process_64.c
+++ b/arch/sparc/kernel/process_64.c
@@ -570,7 +570,7 @@ void fault_in_user_windows(struct pt_regs *regs)
 
 barf:
 	set_thread_wsaved(window + 1);
-	force_sig(SIGSEGV, current);
+	force_sig(SIGSEGV);
 }
 
 asmlinkage long sparc_do_fork(unsigned long clone_flags,
diff --git a/arch/sparc/kernel/signal32.c b/arch/sparc/kernel/signal32.c
index fb431d47a532..a237810aa9f4 100644
--- a/arch/sparc/kernel/signal32.c
+++ b/arch/sparc/kernel/signal32.c
@@ -170,7 +170,7 @@ void do_sigreturn32(struct pt_regs *regs)
 	return;
 
 segv:
-	force_sig(SIGSEGV, current);
+	force_sig(SIGSEGV);
 }
 
 asmlinkage void do_rt_sigreturn32(struct pt_regs *regs)
@@ -256,7 +256,7 @@ asmlinkage void do_rt_sigreturn32(struct pt_regs *regs)
 	set_current_blocked(&set);
 	return;
 segv:
-	force_sig(SIGSEGV, current);
+	force_sig(SIGSEGV);
 }
 
 static void __user *get_sigframe(struct ksignal *ksig, struct pt_regs *regs, unsigned long framesize)
diff --git a/arch/sparc/kernel/signal_32.c b/arch/sparc/kernel/signal_32.c
index 83953780ca01..42c3de313fd6 100644
--- a/arch/sparc/kernel/signal_32.c
+++ b/arch/sparc/kernel/signal_32.c
@@ -137,7 +137,7 @@ asmlinkage void do_sigreturn(struct pt_regs *regs)
 	return;
 
 segv_and_exit:
-	force_sig(SIGSEGV, current);
+	force_sig(SIGSEGV);
 }
 
 asmlinkage void do_rt_sigreturn(struct pt_regs *regs)
@@ -196,7 +196,7 @@ asmlinkage void do_rt_sigreturn(struct pt_regs *regs)
 	set_current_blocked(&set);
 	return;
 segv:
-	force_sig(SIGSEGV, current);
+	force_sig(SIGSEGV);
 }
 
 static inline void __user *get_sigframe(struct ksignal *ksig, struct pt_regs *regs, unsigned long framesize)
diff --git a/arch/sparc/kernel/signal_64.c b/arch/sparc/kernel/signal_64.c
index 9d50190cf312..69ae814b7e90 100644
--- a/arch/sparc/kernel/signal_64.c
+++ b/arch/sparc/kernel/signal_64.c
@@ -134,7 +134,7 @@ out:
 	exception_exit(prev_state);
 	return;
 do_sigsegv:
-	force_sig(SIGSEGV, current);
+	force_sig(SIGSEGV);
 	goto out;
 }
 
@@ -228,7 +228,7 @@ out:
 	exception_exit(prev_state);
 	return;
 do_sigsegv:
-	force_sig(SIGSEGV, current);
+	force_sig(SIGSEGV);
 	goto out;
 }
 
@@ -320,7 +320,7 @@ void do_rt_sigreturn(struct pt_regs *regs)
 	set_current_blocked(&set);
 	return;
 segv:
-	force_sig(SIGSEGV, current);
+	force_sig(SIGSEGV);
 }
 
 static inline void __user *get_sigframe(struct ksignal *ksig, struct pt_regs *regs, unsigned long framesize)
diff --git a/arch/sparc/kernel/traps_64.c b/arch/sparc/kernel/traps_64.c
index 0cd02a64a451..12bfc7e215ca 100644
--- a/arch/sparc/kernel/traps_64.c
+++ b/arch/sparc/kernel/traps_64.c
@@ -2181,7 +2181,7 @@ bool sun4v_nonresum_error_user_handled(struct pt_regs *regs,
 				addr += PAGE_SIZE;
 			}
 		}
-		force_sig(SIGKILL, current);
+		force_sig(SIGKILL);
 
 		return true;
 	}
diff --git a/arch/sparc/mm/fault_32.c b/arch/sparc/mm/fault_32.c
index b0440b0edd97..2731faf415ba 100644
--- a/arch/sparc/mm/fault_32.c
+++ b/arch/sparc/mm/fault_32.c
@@ -425,7 +425,7 @@ do_sigbus:
 static void check_stack_aligned(unsigned long sp)
 {
 	if (sp & 0x7UL)
-		force_sig(SIGILL, current);
+		force_sig(SIGILL);
 }
 
 void window_overflow_fault(void)
diff --git a/arch/um/kernel/exec.c b/arch/um/kernel/exec.c
index a43d42bf0a86..783b9247161f 100644
--- a/arch/um/kernel/exec.c
+++ b/arch/um/kernel/exec.c
@@ -32,7 +32,7 @@ void flush_thread(void)
 	if (ret) {
 		printk(KERN_ERR "flush_thread - clearing address space failed, "
 		       "err = %d\n", ret);
-		force_sig(SIGKILL, current);
+		force_sig(SIGKILL);
 	}
 	get_safe_registers(current_pt_regs()->regs.gp,
 			   current_pt_regs()->regs.fp);
diff --git a/arch/um/kernel/tlb.c b/arch/um/kernel/tlb.c
index 8347161c2ae0..45f739bf302f 100644
--- a/arch/um/kernel/tlb.c
+++ b/arch/um/kernel/tlb.c
@@ -329,7 +329,7 @@ void fix_range_common(struct mm_struct *mm, unsigned long start_addr,
 		       "process: %d\n", task_tgid_vnr(current));
 		/* We are under mmap_sem, release it such that current can terminate */
 		up_write(&current->mm->mmap_sem);
-		force_sig(SIGKILL, current);
+		force_sig(SIGKILL);
 		do_signal(&current->thread.regs);
 	}
 }
@@ -487,7 +487,7 @@ void flush_tlb_page(struct vm_area_struct *vma, unsigned long address)
 
 kill:
 	printk(KERN_ERR "Failed to flush page for address 0x%lx\n", address);
-	force_sig(SIGKILL, current);
+	force_sig(SIGKILL);
 }
 
 pgd_t *pgd_offset_proc(struct mm_struct *mm, unsigned long address)
diff --git a/arch/um/kernel/trap.c b/arch/um/kernel/trap.c
index 646059402ab3..1c943c66063f 100644
--- a/arch/um/kernel/trap.c
+++ b/arch/um/kernel/trap.c
@@ -309,7 +309,7 @@ void relay_signal(int sig, struct siginfo *si, struct uml_pt_regs *regs)
 	} else {
 		printk(KERN_ERR "Attempted to relay unknown signal %d (si_code = %d) with errno %d\n",
 		       sig, code, err);
-		force_sig(sig, current);
+		force_sig(sig);
 	}
 }
 
diff --git a/arch/unicore32/kernel/signal.c b/arch/unicore32/kernel/signal.c
index 75f27dc68bd0..070fa58d23a9 100644
--- a/arch/unicore32/kernel/signal.c
+++ b/arch/unicore32/kernel/signal.c
@@ -129,7 +129,7 @@ asmlinkage int __sys_rt_sigreturn(struct pt_regs *regs)
 	return regs->UCreg_00;
 
 badframe:
-	force_sig(SIGSEGV, current);
+	force_sig(SIGSEGV);
 	return 0;
 }
 
diff --git a/arch/x86/entry/vsyscall/vsyscall_64.c b/arch/x86/entry/vsyscall/vsyscall_64.c
index d9d81ad7a400..7ea87f4ad0b7 100644
--- a/arch/x86/entry/vsyscall/vsyscall_64.c
+++ b/arch/x86/entry/vsyscall/vsyscall_64.c
@@ -268,7 +268,7 @@ do_ret:
 	return true;
 
 sigsegv:
-	force_sig(SIGSEGV, current);
+	force_sig(SIGSEGV);
 	return true;
 }
 
diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
index 5112a50e6486..e11ac124dd37 100644
--- a/arch/x86/kernel/cpu/mce/core.c
+++ b/arch/x86/kernel/cpu/mce/core.c
@@ -1329,7 +1329,7 @@ void do_machine_check(struct pt_regs *regs, long error_code)
 		local_irq_enable();
 
 		if (kill_it || do_memory_failure(&m))
-			force_sig(SIGBUS, current);
+			force_sig(SIGBUS);
 		local_irq_disable();
 		ist_end_non_atomic();
 	} else {
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index 364813cea647..7cf508f78c8c 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -857,7 +857,7 @@ void signal_fault(struct pt_regs *regs, void __user *frame, char *where)
 		pr_cont("\n");
 	}
 
-	force_sig(SIGSEGV, me);
+	force_sig(SIGSEGV);
 }
 
 #ifdef CONFIG_X86_X32_ABI
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 8b6d03e55d2f..e54f0cad4b2e 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -254,7 +254,7 @@ do_trap(int trapnr, int signr, char *str, struct pt_regs *regs,
 	show_signal(tsk, signr, "trap ", str, regs, error_code);
 
 	if (!sicode)
-		force_sig(signr, tsk);
+		force_sig(signr);
 	else
 		force_sig_fault(signr, sicode, addr, tsk);
 }
@@ -566,7 +566,7 @@ do_general_protection(struct pt_regs *regs, long error_code)
 
 	show_signal(tsk, SIGSEGV, "", desc, regs, error_code);
 
-	force_sig(SIGSEGV, tsk);
+	force_sig(SIGSEGV);
 }
 NOKPROBE_SYMBOL(do_general_protection);
 
diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c
index ccf03416e434..18239d5a8b53 100644
--- a/arch/x86/kernel/uprobes.c
+++ b/arch/x86/kernel/uprobes.c
@@ -1087,7 +1087,7 @@ arch_uretprobe_hijack_return_addr(unsigned long trampoline_vaddr, struct pt_regs
 		pr_err("return address clobbered: pid=%d, %%sp=%#lx, %%ip=%#lx\n",
 		       current->pid, regs->sp, regs->ip);
 
-		force_sig(SIGSEGV, current);
+		force_sig(SIGSEGV);
 	}
 
 	return -1;
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c
index 6a38717d179c..a76c12b38e92 100644
--- a/arch/x86/kernel/vm86_32.c
+++ b/arch/x86/kernel/vm86_32.c
@@ -583,7 +583,7 @@ int handle_vm86_trap(struct kernel_vm86_regs *regs, long error_code, int trapno)
 		return 1; /* we let this handle by the calling routine */
 	current->thread.trap_nr = trapno;
 	current->thread.error_code = error_code;
-	force_sig(SIGTRAP, current);
+	force_sig(SIGTRAP);
 	return 0;
 }
 
diff --git a/arch/x86/mm/mpx.c b/arch/x86/mm/mpx.c
index 0d1c47cbbdd6..895fb7a9294d 100644
--- a/arch/x86/mm/mpx.c
+++ b/arch/x86/mm/mpx.c
@@ -912,7 +912,7 @@ void mpx_notify_unmap(struct mm_struct *mm, unsigned long start,
 
 	ret = mpx_unmap_tables(mm, start, end);
 	if (ret)
-		force_sig(SIGSEGV, current);
+		force_sig(SIGSEGV);
 }
 
 /* MPX cannot handle addresses above 47 bits yet. */
diff --git a/arch/x86/um/signal.c b/arch/x86/um/signal.c
index 8b4a71efe7ee..7c11c9e5d7ea 100644
--- a/arch/x86/um/signal.c
+++ b/arch/x86/um/signal.c
@@ -471,7 +471,7 @@ long sys_sigreturn(void)
 	return PT_REGS_SYSCALL_RET(&current->thread.regs);
 
  segfault:
-	force_sig(SIGSEGV, current);
+	force_sig(SIGSEGV);
 	return 0;
 }
 
@@ -577,6 +577,6 @@ long sys_rt_sigreturn(void)
 	return PT_REGS_SYSCALL_RET(&current->thread.regs);
 
  segfault:
-	force_sig(SIGSEGV, current);
+	force_sig(SIGSEGV);
 	return 0;
 }
diff --git a/arch/xtensa/kernel/signal.c b/arch/xtensa/kernel/signal.c
index dc22a238ed9c..fbedf2aba09d 100644
--- a/arch/xtensa/kernel/signal.c
+++ b/arch/xtensa/kernel/signal.c
@@ -270,7 +270,7 @@ asmlinkage long xtensa_rt_sigreturn(long a0, long a1, long a2, long a3,
 	return ret;
 
 badframe:
-	force_sig(SIGSEGV, current);
+	force_sig(SIGSEGV);
 	return 0;
 }
 
diff --git a/arch/xtensa/kernel/traps.c b/arch/xtensa/kernel/traps.c
index 454d53096bc9..6f26b254091b 100644
--- a/arch/xtensa/kernel/traps.c
+++ b/arch/xtensa/kernel/traps.c
@@ -184,7 +184,7 @@ void do_unhandled(struct pt_regs *regs, unsigned long exccause)
 			    "\tEXCCAUSE is %ld\n",
 			    current->comm, task_pid_nr(current), regs->pc,
 			    exccause);
-	force_sig(SIGILL, current);
+	force_sig(SIGILL);
 }
 
 /*
@@ -306,7 +306,7 @@ do_illegal_instruction(struct pt_regs *regs)
 
 	pr_info_ratelimited("Illegal Instruction in '%s' (pid = %d, pc = %#010lx)\n",
 			    current->comm, task_pid_nr(current), regs->pc);
-	force_sig(SIGILL, current);
+	force_sig(SIGILL);
 }
 
 
@@ -354,7 +354,7 @@ do_debug(struct pt_regs *regs)
 
 	/* If in user mode, send SIGTRAP signal to current process */
 
-	force_sig(SIGTRAP, current);
+	force_sig(SIGTRAP);
 }
 
 
diff --git a/drivers/misc/lkdtm/bugs.c b/drivers/misc/lkdtm/bugs.c
index 7eebbdfbcacd..86556adb1482 100644
--- a/drivers/misc/lkdtm/bugs.c
+++ b/drivers/misc/lkdtm/bugs.c
@@ -225,7 +225,7 @@ void lkdtm_CORRUPT_USER_DS(void)
 	set_fs(KERNEL_DS);
 
 	/* Make sure we do not keep running with a KERNEL_DS! */
-	force_sig(SIGKILL, current);
+	force_sig(SIGKILL);
 }
 
 /* Test that VMAP_STACK is actually allocating with a leading guard page */
diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h
index 8af3101da782..e9df3f0cce48 100644
--- a/include/linux/sched/signal.h
+++ b/include/linux/sched/signal.h
@@ -335,7 +335,7 @@ extern int kill_pgrp(struct pid *pid, int sig, int priv);
 extern int kill_pid(struct pid *pid, int sig, int priv);
 extern __must_check bool do_notify_parent(struct task_struct *, int);
 extern void __wake_up_parent(struct task_struct *p, struct task_struct *parent);
-extern void force_sig(int, struct task_struct *);
+extern void force_sig(int);
 extern int send_sig(int, struct task_struct *, int);
 extern int zap_other_threads(struct task_struct *p);
 extern struct sigqueue *sigqueue_alloc(void);
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index e2870fe1be5b..fd6e0f5ebfdf 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -266,7 +266,7 @@ static inline void addr_limit_user_check(void)
 
 	if (CHECK_DATA_CORRUPTION(!segment_eq(get_fs(), USER_DS),
 				  "Invalid address limit on user-mode return"))
-		force_sig(SIGKILL, current);
+		force_sig(SIGKILL);
 
 #ifdef TIF_FSCHECK
 	clear_thread_flag(TIF_FSCHECK);
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 78f61bfc6b79..359122185cfb 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -2112,7 +2112,7 @@ static void handle_trampoline(struct pt_regs *regs)
 
  sigill:
 	uprobe_warn(current, "handle uretprobe, sending SIGILL.");
-	force_sig(SIGILL, current);
+	force_sig(SIGILL);
 
 }
 
@@ -2228,7 +2228,7 @@ static void handle_singlestep(struct uprobe_task *utask, struct pt_regs *regs)
 
 	if (unlikely(err)) {
 		uprobe_warn(current, "execute the probed insn, sending SIGILL.");
-		force_sig(SIGILL, current);
+		force_sig(SIGILL);
 	}
 }
 
diff --git a/kernel/rseq.c b/kernel/rseq.c
index e1aa3ebee291..27c48eb7de40 100644
--- a/kernel/rseq.c
+++ b/kernel/rseq.c
@@ -296,7 +296,7 @@ void rseq_syscall(struct pt_regs *regs)
 		return;
 	if (!access_ok(t->rseq, sizeof(*t->rseq)) ||
 	    rseq_get_rseq_cs(t, &rseq_cs) || in_rseq_cs(ip, &rseq_cs))
-		force_sig(SIGSEGV, t);
+		force_sig(SIGSEGV);
 }
 
 #endif
diff --git a/kernel/signal.c b/kernel/signal.c
index f7669d240ce4..20878c4c28c2 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1595,9 +1595,9 @@ send_sig(int sig, struct task_struct *p, int priv)
 }
 EXPORT_SYMBOL(send_sig);
 
-void force_sig(int sig, struct task_struct *p)
+void force_sig(int sig)
 {
-	force_sig_info(sig, SEND_SIG_PRIV, p);
+	force_sig_info(sig, SEND_SIG_PRIV, current);
 }
 EXPORT_SYMBOL(force_sig);
 
@@ -1617,7 +1617,7 @@ void force_sigsegv(int sig)
 		p->sighand->action[sig - 1].sa.sa_handler = SIG_DFL;
 		spin_unlock_irqrestore(&p->sighand->siglock, flags);
 	}
-	force_sig(SIGSEGV, p);
+	force_sig(SIGSEGV);
 }
 
 int force_sig_fault(int sig, int code, void __user *addr
diff --git a/security/safesetid/lsm.c b/security/safesetid/lsm.c
index cecd38e2ac80..06d4259f9ab1 100644
--- a/security/safesetid/lsm.c
+++ b/security/safesetid/lsm.c
@@ -111,7 +111,7 @@ static int check_uid_transition(kuid_t parent, kuid_t child)
 	 * that could arise from a missing whitelist entry preventing a
 	 * privileged process from dropping to a lesser-privileged one.
 	 */
-	force_sig(SIGKILL, current);
+	force_sig(SIGKILL);
 	return -EACCES;
 }
 
@@ -203,7 +203,7 @@ static int safesetid_task_fix_setuid(struct cred *new,
 		break;
 	default:
 		pr_warn("Unknown setid state %d\n", flags);
-		force_sig(SIGKILL, current);
+		force_sig(SIGKILL);
 		return -EINVAL;
 	}
 	return 0;
-- 
cgit v1.2.3


From f8eac9011b6be56acfb5d1d0dfd5ee30082a12ee Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Tue, 5 Feb 2019 18:14:19 -0600
Subject: signal: Remove task parameter from force_sig_mceerr

All of the callers pass current into force_sig_mceerr so remove the
task parameter to make this obvious.

This also makes it clear that force_sig_mceerr passes current
into force_sig_info.

Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 arch/arm64/kernel/traps.c    | 2 +-
 arch/parisc/mm/fault.c       | 2 +-
 arch/powerpc/mm/fault.c      | 3 +--
 arch/x86/mm/fault.c          | 2 +-
 include/linux/sched/signal.h | 2 +-
 kernel/signal.c              | 4 ++--
 mm/memory-failure.c          | 2 +-
 7 files changed, 8 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c
index 64abe8450780..c76a64c1bcb3 100644
--- a/arch/arm64/kernel/traps.c
+++ b/arch/arm64/kernel/traps.c
@@ -266,7 +266,7 @@ void arm64_force_sig_mceerr(int code, void __user *addr, short lsb,
 			    const char *str)
 {
 	arm64_show_signal(SIGBUS, str);
-	force_sig_mceerr(code, addr, lsb, current);
+	force_sig_mceerr(code, addr, lsb);
 }
 
 void arm64_force_sig_ptrace_errno_trap(int errno, void __user *addr,
diff --git a/arch/parisc/mm/fault.c b/arch/parisc/mm/fault.c
index c8e8b7c05558..56ceacb3401d 100644
--- a/arch/parisc/mm/fault.c
+++ b/arch/parisc/mm/fault.c
@@ -403,7 +403,7 @@ bad_area:
 				lsb = PAGE_SHIFT;
 
 			force_sig_mceerr(BUS_MCEERR_AR, (void __user *) address,
-					 lsb, current);
+					 lsb);
 			return;
 		}
 #endif
diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
index b5d3578d9f65..6ed6c341c670 100644
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c
@@ -182,8 +182,7 @@ static int do_sigbus(struct pt_regs *regs, unsigned long address,
 		if (fault & VM_FAULT_HWPOISON)
 			lsb = PAGE_SHIFT;
 
-		force_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, lsb,
-				 current);
+		force_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, lsb);
 		return 0;
 	}
 
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 46df4c6aae46..c431326ee3fa 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -1040,7 +1040,7 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address,
 			lsb = hstate_index_to_shift(VM_FAULT_GET_HINDEX(fault));
 		if (fault & VM_FAULT_HWPOISON)
 			lsb = PAGE_SHIFT;
-		force_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, lsb, tsk);
+		force_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, lsb);
 		return;
 	}
 #endif
diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h
index e9df3f0cce48..4178bb1f7709 100644
--- a/include/linux/sched/signal.h
+++ b/include/linux/sched/signal.h
@@ -316,7 +316,7 @@ int send_sig_fault(int sig, int code, void __user *addr
 	___ARCH_SI_IA64(int imm, unsigned int flags, unsigned long isr)
 	, struct task_struct *t);
 
-int force_sig_mceerr(int code, void __user *, short, struct task_struct *);
+int force_sig_mceerr(int code, void __user *, short);
 int send_sig_mceerr(int code, void __user *, short, struct task_struct *);
 
 int force_sig_bnderr(void __user *addr, void __user *lower, void __user *upper);
diff --git a/kernel/signal.c b/kernel/signal.c
index 20878c4c28c2..398489facf9f 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1666,7 +1666,7 @@ int send_sig_fault(int sig, int code, void __user *addr
 	return send_sig_info(info.si_signo, &info, t);
 }
 
-int force_sig_mceerr(int code, void __user *addr, short lsb, struct task_struct *t)
+int force_sig_mceerr(int code, void __user *addr, short lsb)
 {
 	struct kernel_siginfo info;
 
@@ -1677,7 +1677,7 @@ int force_sig_mceerr(int code, void __user *addr, short lsb, struct task_struct
 	info.si_code = code;
 	info.si_addr = addr;
 	info.si_addr_lsb = lsb;
-	return force_sig_info(info.si_signo, &info, t);
+	return force_sig_info(info.si_signo, &info, current);
 }
 
 int send_sig_mceerr(int code, void __user *addr, short lsb, struct task_struct *t)
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index fc8b51744579..bc749265a8f3 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -221,7 +221,7 @@ static int kill_proc(struct to_kill *tk, unsigned long pfn, int flags)
 
 	if ((flags & MF_ACTION_REQUIRED) && t->mm == current->mm) {
 		ret = force_sig_mceerr(BUS_MCEERR_AR, (void __user *)tk->addr,
-				       addr_lsb, current);
+				       addr_lsb);
 	} else {
 		/*
 		 * Don't use force here, it's convenient if the signal
-- 
cgit v1.2.3


From 7423e01741dd6a5f1255f589145313f0fb1c8cbe Mon Sep 17 00:00:00 2001
From: Lu Baolu <baolu.lu@linux.intel.com>
Date: Sat, 25 May 2019 13:41:22 +0800
Subject: iommu: Add API to request DMA domain for device

Normally, while the iommu is probing a device, a default domain will
be allocated and attached to the device. The domain type of
the default domain is statically defined, which results in a
situation where the allocated default domain isn't suitable
for the device due to some limitations. We already have the API
iommu_request_dm_for_dev() to replace a DMA domain with an
identity one. This adds iommu_request_dma_domain_for_dev()
to request a dma domain if an allocated identity domain isn't
suitable for the device in question.

Signed-off-by: Lu Baolu <baolu.lu@linux.intel.com>
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 drivers/iommu/iommu.c | 36 +++++++++++++++++++++++++-----------
 include/linux/iommu.h |  6 ++++++
 2 files changed, 31 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index 67ee6623f9b2..2fca04c3dbaf 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -1907,10 +1907,10 @@ struct iommu_resv_region *iommu_alloc_resv_region(phys_addr_t start,
 	return region;
 }
 
-/* Request that a device is direct mapped by the IOMMU */
-int iommu_request_dm_for_dev(struct device *dev)
+static int
+request_default_domain_for_dev(struct device *dev, unsigned long type)
 {
-	struct iommu_domain *dm_domain;
+	struct iommu_domain *domain;
 	struct iommu_group *group;
 	int ret;
 
@@ -1923,8 +1923,7 @@ int iommu_request_dm_for_dev(struct device *dev)
 
 	/* Check if the default domain is already direct mapped */
 	ret = 0;
-	if (group->default_domain &&
-	    group->default_domain->type == IOMMU_DOMAIN_IDENTITY)
+	if (group->default_domain && group->default_domain->type == type)
 		goto out;
 
 	/* Don't change mappings of existing devices */
@@ -1934,23 +1933,26 @@ int iommu_request_dm_for_dev(struct device *dev)
 
 	/* Allocate a direct mapped domain */
 	ret = -ENOMEM;
-	dm_domain = __iommu_domain_alloc(dev->bus, IOMMU_DOMAIN_IDENTITY);
-	if (!dm_domain)
+	domain = __iommu_domain_alloc(dev->bus, type);
+	if (!domain)
 		goto out;
 
 	/* Attach the device to the domain */
-	ret = __iommu_attach_group(dm_domain, group);
+	ret = __iommu_attach_group(domain, group);
 	if (ret) {
-		iommu_domain_free(dm_domain);
+		iommu_domain_free(domain);
 		goto out;
 	}
 
+	iommu_group_create_direct_mappings(group, dev);
+
 	/* Make the direct mapped domain the default for this group */
 	if (group->default_domain)
 		iommu_domain_free(group->default_domain);
-	group->default_domain = dm_domain;
+	group->default_domain = domain;
 
-	dev_info(dev, "Using iommu direct mapping\n");
+	dev_info(dev, "Using iommu %s mapping\n",
+		 type == IOMMU_DOMAIN_DMA ? "dma" : "direct");
 
 	ret = 0;
 out:
@@ -1960,6 +1962,18 @@ out:
 	return ret;
 }
 
+/* Request that a device is direct mapped by the IOMMU */
+int iommu_request_dm_for_dev(struct device *dev)
+{
+	return request_default_domain_for_dev(dev, IOMMU_DOMAIN_IDENTITY);
+}
+
+/* Request that a device can't be direct mapped by the IOMMU */
+int iommu_request_dma_domain_for_dev(struct device *dev)
+{
+	return request_default_domain_for_dev(dev, IOMMU_DOMAIN_DMA);
+}
+
 const struct iommu_ops *iommu_ops_from_fwnode(struct fwnode_handle *fwnode)
 {
 	const struct iommu_ops *ops = NULL;
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index a815cf6f6f47..91af22a344e2 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -362,6 +362,7 @@ extern void iommu_set_fault_handler(struct iommu_domain *domain,
 extern void iommu_get_resv_regions(struct device *dev, struct list_head *list);
 extern void iommu_put_resv_regions(struct device *dev, struct list_head *list);
 extern int iommu_request_dm_for_dev(struct device *dev);
+extern int iommu_request_dma_domain_for_dev(struct device *dev);
 extern struct iommu_resv_region *
 iommu_alloc_resv_region(phys_addr_t start, size_t length, int prot,
 			enum iommu_resv_type type);
@@ -626,6 +627,11 @@ static inline int iommu_request_dm_for_dev(struct device *dev)
 	return -ENODEV;
 }
 
+static inline int iommu_request_dma_domain_for_dev(struct device *dev)
+{
+	return -ENODEV;
+}
+
 static inline int iommu_attach_group(struct iommu_domain *domain,
 				     struct iommu_group *group)
 {
-- 
cgit v1.2.3


From 185da893fab1caa458c47f032a6f53717dbae2eb Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 20 May 2019 09:29:26 +0200
Subject: iommu/dma: Cleanup dma-iommu.h

There is no need for a __KERNEL__ guard outside uapi, so remove it, and
add a missing comment describing the #else cpp statement.  Last but not
least, include <linux/errno.h> instead of the asm version, which is
frowned upon.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Robin Murphy <robin.murphy@arm.com>
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 include/linux/dma-iommu.h | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/dma-iommu.h b/include/linux/dma-iommu.h
index 476e0c54de2d..dfb83f9c24dc 100644
--- a/include/linux/dma-iommu.h
+++ b/include/linux/dma-iommu.h
@@ -16,9 +16,8 @@
 #ifndef __DMA_IOMMU_H
 #define __DMA_IOMMU_H
 
-#ifdef __KERNEL__
+#include <linux/errno.h>
 #include <linux/types.h>
-#include <asm/errno.h>
 
 #ifdef CONFIG_IOMMU_DMA
 #include <linux/dma-mapping.h>
@@ -86,7 +85,7 @@ void iommu_dma_compose_msi_msg(struct msi_desc *desc,
 
 void iommu_dma_get_resv_regions(struct device *dev, struct list_head *list);
 
-#else
+#else /* CONFIG_IOMMU_DMA */
 
 struct iommu_domain;
 struct msi_desc;
@@ -128,5 +127,4 @@ static inline void iommu_dma_get_resv_regions(struct device *dev, struct list_he
 }
 
 #endif	/* CONFIG_IOMMU_DMA */
-#endif	/* __KERNEL__ */
 #endif	/* __DMA_IOMMU_H */
-- 
cgit v1.2.3


From af751d4308a7c80434b5f40fd44288d33dc1962f Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 20 May 2019 09:29:27 +0200
Subject: iommu/dma: Remove the flush_page callback

We now have an arch_dma_prep_coherent architecture hook that is used
for the generic DMA remap allocator, and we should use the same
interface for the dma-iommu code.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Robin Murphy <robin.murphy@arm.com>
Acked-by: Catalin Marinas <catalin.marinas@arm.com>
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 arch/arm64/mm/dma-mapping.c | 8 +-------
 drivers/iommu/dma-iommu.c   | 8 +++-----
 include/linux/dma-iommu.h   | 3 +--
 3 files changed, 5 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm64/mm/dma-mapping.c b/arch/arm64/mm/dma-mapping.c
index 674860e3e478..10a8852c8b6a 100644
--- a/arch/arm64/mm/dma-mapping.c
+++ b/arch/arm64/mm/dma-mapping.c
@@ -104,12 +104,6 @@ arch_initcall(arm64_dma_init);
 #include <linux/platform_device.h>
 #include <linux/amba/bus.h>
 
-/* Thankfully, all cache ops are by VA so we can ignore phys here */
-static void flush_page(struct device *dev, const void *virt, phys_addr_t phys)
-{
-	__dma_flush_area(virt, PAGE_SIZE);
-}
-
 static void *__iommu_alloc_attrs(struct device *dev, size_t size,
 				 dma_addr_t *handle, gfp_t gfp,
 				 unsigned long attrs)
@@ -186,7 +180,7 @@ static void *__iommu_alloc_attrs(struct device *dev, size_t size,
 		struct page **pages;
 
 		pages = iommu_dma_alloc(dev, iosize, gfp, attrs, ioprot,
-					handle, flush_page);
+					handle);
 		if (!pages)
 			return NULL;
 
diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
index 129c4badf9ae..aac12433ffef 100644
--- a/drivers/iommu/dma-iommu.c
+++ b/drivers/iommu/dma-iommu.c
@@ -22,6 +22,7 @@
 #include <linux/acpi_iort.h>
 #include <linux/device.h>
 #include <linux/dma-iommu.h>
+#include <linux/dma-noncoherent.h>
 #include <linux/gfp.h>
 #include <linux/huge_mm.h>
 #include <linux/iommu.h>
@@ -560,8 +561,6 @@ void iommu_dma_free(struct device *dev, struct page **pages, size_t size,
  * @attrs: DMA attributes for this allocation
  * @prot: IOMMU mapping flags
  * @handle: Out argument for allocated DMA handle
- * @flush_page: Arch callback which must ensure PAGE_SIZE bytes from the
- *		given VA/PA are visible to the given non-coherent device.
  *
  * If @size is less than PAGE_SIZE, then a full CPU page will be allocated,
  * but an IOMMU which supports smaller pages might not map the whole thing.
@@ -570,8 +569,7 @@ void iommu_dma_free(struct device *dev, struct page **pages, size_t size,
  *	   or NULL on failure.
  */
 struct page **iommu_dma_alloc(struct device *dev, size_t size, gfp_t gfp,
-		unsigned long attrs, int prot, dma_addr_t *handle,
-		void (*flush_page)(struct device *, const void *, phys_addr_t))
+		unsigned long attrs, int prot, dma_addr_t *handle)
 {
 	struct iommu_domain *domain = iommu_get_dma_domain(dev);
 	struct iommu_dma_cookie *cookie = domain->iova_cookie;
@@ -615,7 +613,7 @@ struct page **iommu_dma_alloc(struct device *dev, size_t size, gfp_t gfp,
 		 */
 		sg_miter_start(&miter, sgt.sgl, sgt.orig_nents, SG_MITER_FROM_SG);
 		while (sg_miter_next(&miter))
-			flush_page(dev, miter.addr, page_to_phys(miter.page));
+			arch_dma_prep_coherent(miter.page, PAGE_SIZE);
 		sg_miter_stop(&miter);
 	}
 
diff --git a/include/linux/dma-iommu.h b/include/linux/dma-iommu.h
index dfb83f9c24dc..e1ef265b578b 100644
--- a/include/linux/dma-iommu.h
+++ b/include/linux/dma-iommu.h
@@ -44,8 +44,7 @@ int dma_info_to_prot(enum dma_data_direction dir, bool coherent,
  * the arch code to take care of attributes and cache maintenance
  */
 struct page **iommu_dma_alloc(struct device *dev, size_t size, gfp_t gfp,
-		unsigned long attrs, int prot, dma_addr_t *handle,
-		void (*flush_page)(struct device *, const void *, phys_addr_t));
+		unsigned long attrs, int prot, dma_addr_t *handle);
 void iommu_dma_free(struct device *dev, struct page **pages, size_t size,
 		dma_addr_t *handle);
 
-- 
cgit v1.2.3


From 06d60728ff5c01795ac0bad66a5c42e3e78dcb6b Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 20 May 2019 09:29:29 +0200
Subject: iommu/dma: move the arm64 wrappers to common code

There is nothing really arm64 specific in the iommu_dma_ops
implementation, so move it to dma-iommu.c and keep a lot of symbols
self-contained.  Note the implementation does depend on the
DMA_DIRECT_REMAP infrastructure for now, so we'll have to make the
DMA_IOMMU support depend on it, but this will be relaxed soon.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Acked-by: Robin Murphy <robin.murphy@arm.com>
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 arch/arm64/mm/dma-mapping.c | 394 +------------------------------------------
 drivers/iommu/Kconfig       |   1 +
 drivers/iommu/dma-iommu.c   | 398 ++++++++++++++++++++++++++++++++++++++++----
 include/linux/dma-iommu.h   |  42 +----
 4 files changed, 378 insertions(+), 457 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm64/mm/dma-mapping.c b/arch/arm64/mm/dma-mapping.c
index 10a8852c8b6a..d1661f78eb4d 100644
--- a/arch/arm64/mm/dma-mapping.c
+++ b/arch/arm64/mm/dma-mapping.c
@@ -27,6 +27,7 @@
 #include <linux/dma-direct.h>
 #include <linux/dma-noncoherent.h>
 #include <linux/dma-contiguous.h>
+#include <linux/dma-iommu.h>
 #include <linux/vmalloc.h>
 #include <linux/swiotlb.h>
 #include <linux/pci.h>
@@ -58,37 +59,6 @@ void arch_dma_prep_coherent(struct page *page, size_t size)
 	__dma_flush_area(page_address(page), size);
 }
 
-#ifdef CONFIG_IOMMU_DMA
-static int __swiotlb_get_sgtable_page(struct sg_table *sgt,
-				      struct page *page, size_t size)
-{
-	int ret = sg_alloc_table(sgt, 1, GFP_KERNEL);
-
-	if (!ret)
-		sg_set_page(sgt->sgl, page, PAGE_ALIGN(size), 0);
-
-	return ret;
-}
-
-static int __swiotlb_mmap_pfn(struct vm_area_struct *vma,
-			      unsigned long pfn, size_t size)
-{
-	int ret = -ENXIO;
-	unsigned long nr_vma_pages = vma_pages(vma);
-	unsigned long nr_pages = PAGE_ALIGN(size) >> PAGE_SHIFT;
-	unsigned long off = vma->vm_pgoff;
-
-	if (off < nr_pages && nr_vma_pages <= (nr_pages - off)) {
-		ret = remap_pfn_range(vma, vma->vm_start,
-				      pfn + off,
-				      vma->vm_end - vma->vm_start,
-				      vma->vm_page_prot);
-	}
-
-	return ret;
-}
-#endif /* CONFIG_IOMMU_DMA */
-
 static int __init arm64_dma_init(void)
 {
 	WARN_TAINT(ARCH_DMA_MINALIGN < cache_line_size(),
@@ -100,374 +70,18 @@ static int __init arm64_dma_init(void)
 arch_initcall(arm64_dma_init);
 
 #ifdef CONFIG_IOMMU_DMA
-#include <linux/dma-iommu.h>
-#include <linux/platform_device.h>
-#include <linux/amba/bus.h>
-
-static void *__iommu_alloc_attrs(struct device *dev, size_t size,
-				 dma_addr_t *handle, gfp_t gfp,
-				 unsigned long attrs)
-{
-	bool coherent = dev_is_dma_coherent(dev);
-	int ioprot = dma_info_to_prot(DMA_BIDIRECTIONAL, coherent, attrs);
-	size_t iosize = size;
-	void *addr;
-
-	if (WARN(!dev, "cannot create IOMMU mapping for unknown device\n"))
-		return NULL;
-
-	size = PAGE_ALIGN(size);
-
-	/*
-	 * Some drivers rely on this, and we probably don't want the
-	 * possibility of stale kernel data being read by devices anyway.
-	 */
-	gfp |= __GFP_ZERO;
-
-	if (!gfpflags_allow_blocking(gfp)) {
-		struct page *page;
-		/*
-		 * In atomic context we can't remap anything, so we'll only
-		 * get the virtually contiguous buffer we need by way of a
-		 * physically contiguous allocation.
-		 */
-		if (coherent) {
-			page = alloc_pages(gfp, get_order(size));
-			addr = page ? page_address(page) : NULL;
-		} else {
-			addr = dma_alloc_from_pool(size, &page, gfp);
-		}
-		if (!addr)
-			return NULL;
-
-		*handle = iommu_dma_map_page(dev, page, 0, iosize, ioprot);
-		if (*handle == DMA_MAPPING_ERROR) {
-			if (coherent)
-				__free_pages(page, get_order(size));
-			else
-				dma_free_from_pool(addr, size);
-			addr = NULL;
-		}
-	} else if (attrs & DMA_ATTR_FORCE_CONTIGUOUS) {
-		pgprot_t prot = arch_dma_mmap_pgprot(dev, PAGE_KERNEL, attrs);
-		struct page *page;
-
-		page = dma_alloc_from_contiguous(dev, size >> PAGE_SHIFT,
-					get_order(size), gfp & __GFP_NOWARN);
-		if (!page)
-			return NULL;
-
-		*handle = iommu_dma_map_page(dev, page, 0, iosize, ioprot);
-		if (*handle == DMA_MAPPING_ERROR) {
-			dma_release_from_contiguous(dev, page,
-						    size >> PAGE_SHIFT);
-			return NULL;
-		}
-		addr = dma_common_contiguous_remap(page, size, VM_USERMAP,
-						   prot,
-						   __builtin_return_address(0));
-		if (addr) {
-			if (!coherent)
-				__dma_flush_area(page_to_virt(page), iosize);
-			memset(addr, 0, size);
-		} else {
-			iommu_dma_unmap_page(dev, *handle, iosize, 0, attrs);
-			dma_release_from_contiguous(dev, page,
-						    size >> PAGE_SHIFT);
-		}
-	} else {
-		pgprot_t prot = arch_dma_mmap_pgprot(dev, PAGE_KERNEL, attrs);
-		struct page **pages;
-
-		pages = iommu_dma_alloc(dev, iosize, gfp, attrs, ioprot,
-					handle);
-		if (!pages)
-			return NULL;
-
-		addr = dma_common_pages_remap(pages, size, VM_USERMAP, prot,
-					      __builtin_return_address(0));
-		if (!addr)
-			iommu_dma_free(dev, pages, iosize, handle);
-	}
-	return addr;
-}
-
-static void __iommu_free_attrs(struct device *dev, size_t size, void *cpu_addr,
-			       dma_addr_t handle, unsigned long attrs)
-{
-	size_t iosize = size;
-
-	size = PAGE_ALIGN(size);
-	/*
-	 * @cpu_addr will be one of 4 things depending on how it was allocated:
-	 * - A remapped array of pages for contiguous allocations.
-	 * - A remapped array of pages from iommu_dma_alloc(), for all
-	 *   non-atomic allocations.
-	 * - A non-cacheable alias from the atomic pool, for atomic
-	 *   allocations by non-coherent devices.
-	 * - A normal lowmem address, for atomic allocations by
-	 *   coherent devices.
-	 * Hence how dodgy the below logic looks...
-	 */
-	if (dma_in_atomic_pool(cpu_addr, size)) {
-		iommu_dma_unmap_page(dev, handle, iosize, 0, 0);
-		dma_free_from_pool(cpu_addr, size);
-	} else if (attrs & DMA_ATTR_FORCE_CONTIGUOUS) {
-		struct page *page = vmalloc_to_page(cpu_addr);
-
-		iommu_dma_unmap_page(dev, handle, iosize, 0, attrs);
-		dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT);
-		dma_common_free_remap(cpu_addr, size, VM_USERMAP);
-	} else if (is_vmalloc_addr(cpu_addr)){
-		struct vm_struct *area = find_vm_area(cpu_addr);
-
-		if (WARN_ON(!area || !area->pages))
-			return;
-		iommu_dma_free(dev, area->pages, iosize, &handle);
-		dma_common_free_remap(cpu_addr, size, VM_USERMAP);
-	} else {
-		iommu_dma_unmap_page(dev, handle, iosize, 0, 0);
-		__free_pages(virt_to_page(cpu_addr), get_order(size));
-	}
-}
-
-static int __iommu_mmap_attrs(struct device *dev, struct vm_area_struct *vma,
-			      void *cpu_addr, dma_addr_t dma_addr, size_t size,
-			      unsigned long attrs)
-{
-	struct vm_struct *area;
-	int ret;
-
-	vma->vm_page_prot = arch_dma_mmap_pgprot(dev, vma->vm_page_prot, attrs);
-
-	if (dma_mmap_from_dev_coherent(dev, vma, cpu_addr, size, &ret))
-		return ret;
-
-	if (!is_vmalloc_addr(cpu_addr)) {
-		unsigned long pfn = page_to_pfn(virt_to_page(cpu_addr));
-		return __swiotlb_mmap_pfn(vma, pfn, size);
-	}
-
-	if (attrs & DMA_ATTR_FORCE_CONTIGUOUS) {
-		/*
-		 * DMA_ATTR_FORCE_CONTIGUOUS allocations are always remapped,
-		 * hence in the vmalloc space.
-		 */
-		unsigned long pfn = vmalloc_to_pfn(cpu_addr);
-		return __swiotlb_mmap_pfn(vma, pfn, size);
-	}
-
-	area = find_vm_area(cpu_addr);
-	if (WARN_ON(!area || !area->pages))
-		return -ENXIO;
-
-	return iommu_dma_mmap(area->pages, size, vma);
-}
-
-static int __iommu_get_sgtable(struct device *dev, struct sg_table *sgt,
-			       void *cpu_addr, dma_addr_t dma_addr,
-			       size_t size, unsigned long attrs)
-{
-	unsigned int count = PAGE_ALIGN(size) >> PAGE_SHIFT;
-	struct vm_struct *area = find_vm_area(cpu_addr);
-
-	if (!is_vmalloc_addr(cpu_addr)) {
-		struct page *page = virt_to_page(cpu_addr);
-		return __swiotlb_get_sgtable_page(sgt, page, size);
-	}
-
-	if (attrs & DMA_ATTR_FORCE_CONTIGUOUS) {
-		/*
-		 * DMA_ATTR_FORCE_CONTIGUOUS allocations are always remapped,
-		 * hence in the vmalloc space.
-		 */
-		struct page *page = vmalloc_to_page(cpu_addr);
-		return __swiotlb_get_sgtable_page(sgt, page, size);
-	}
-
-	if (WARN_ON(!area || !area->pages))
-		return -ENXIO;
-
-	return sg_alloc_table_from_pages(sgt, area->pages, count, 0, size,
-					 GFP_KERNEL);
-}
-
-static void __iommu_sync_single_for_cpu(struct device *dev,
-					dma_addr_t dev_addr, size_t size,
-					enum dma_data_direction dir)
-{
-	phys_addr_t phys;
-
-	if (dev_is_dma_coherent(dev))
-		return;
-
-	phys = iommu_iova_to_phys(iommu_get_dma_domain(dev), dev_addr);
-	arch_sync_dma_for_cpu(dev, phys, size, dir);
-}
-
-static void __iommu_sync_single_for_device(struct device *dev,
-					   dma_addr_t dev_addr, size_t size,
-					   enum dma_data_direction dir)
-{
-	phys_addr_t phys;
-
-	if (dev_is_dma_coherent(dev))
-		return;
-
-	phys = iommu_iova_to_phys(iommu_get_dma_domain(dev), dev_addr);
-	arch_sync_dma_for_device(dev, phys, size, dir);
-}
-
-static dma_addr_t __iommu_map_page(struct device *dev, struct page *page,
-				   unsigned long offset, size_t size,
-				   enum dma_data_direction dir,
-				   unsigned long attrs)
-{
-	bool coherent = dev_is_dma_coherent(dev);
-	int prot = dma_info_to_prot(dir, coherent, attrs);
-	dma_addr_t dev_addr = iommu_dma_map_page(dev, page, offset, size, prot);
-
-	if (!coherent && !(attrs & DMA_ATTR_SKIP_CPU_SYNC) &&
-	    dev_addr != DMA_MAPPING_ERROR)
-		__dma_map_area(page_address(page) + offset, size, dir);
-
-	return dev_addr;
-}
-
-static void __iommu_unmap_page(struct device *dev, dma_addr_t dev_addr,
-			       size_t size, enum dma_data_direction dir,
-			       unsigned long attrs)
-{
-	if ((attrs & DMA_ATTR_SKIP_CPU_SYNC) == 0)
-		__iommu_sync_single_for_cpu(dev, dev_addr, size, dir);
-
-	iommu_dma_unmap_page(dev, dev_addr, size, dir, attrs);
-}
-
-static void __iommu_sync_sg_for_cpu(struct device *dev,
-				    struct scatterlist *sgl, int nelems,
-				    enum dma_data_direction dir)
-{
-	struct scatterlist *sg;
-	int i;
-
-	if (dev_is_dma_coherent(dev))
-		return;
-
-	for_each_sg(sgl, sg, nelems, i)
-		arch_sync_dma_for_cpu(dev, sg_phys(sg), sg->length, dir);
-}
-
-static void __iommu_sync_sg_for_device(struct device *dev,
-				       struct scatterlist *sgl, int nelems,
-				       enum dma_data_direction dir)
-{
-	struct scatterlist *sg;
-	int i;
-
-	if (dev_is_dma_coherent(dev))
-		return;
-
-	for_each_sg(sgl, sg, nelems, i)
-		arch_sync_dma_for_device(dev, sg_phys(sg), sg->length, dir);
-}
-
-static int __iommu_map_sg_attrs(struct device *dev, struct scatterlist *sgl,
-				int nelems, enum dma_data_direction dir,
-				unsigned long attrs)
-{
-	bool coherent = dev_is_dma_coherent(dev);
-
-	if ((attrs & DMA_ATTR_SKIP_CPU_SYNC) == 0)
-		__iommu_sync_sg_for_device(dev, sgl, nelems, dir);
-
-	return iommu_dma_map_sg(dev, sgl, nelems,
-				dma_info_to_prot(dir, coherent, attrs));
-}
-
-static void __iommu_unmap_sg_attrs(struct device *dev,
-				   struct scatterlist *sgl, int nelems,
-				   enum dma_data_direction dir,
-				   unsigned long attrs)
-{
-	if ((attrs & DMA_ATTR_SKIP_CPU_SYNC) == 0)
-		__iommu_sync_sg_for_cpu(dev, sgl, nelems, dir);
-
-	iommu_dma_unmap_sg(dev, sgl, nelems, dir, attrs);
-}
-
-static const struct dma_map_ops iommu_dma_ops = {
-	.alloc = __iommu_alloc_attrs,
-	.free = __iommu_free_attrs,
-	.mmap = __iommu_mmap_attrs,
-	.get_sgtable = __iommu_get_sgtable,
-	.map_page = __iommu_map_page,
-	.unmap_page = __iommu_unmap_page,
-	.map_sg = __iommu_map_sg_attrs,
-	.unmap_sg = __iommu_unmap_sg_attrs,
-	.sync_single_for_cpu = __iommu_sync_single_for_cpu,
-	.sync_single_for_device = __iommu_sync_single_for_device,
-	.sync_sg_for_cpu = __iommu_sync_sg_for_cpu,
-	.sync_sg_for_device = __iommu_sync_sg_for_device,
-	.map_resource = iommu_dma_map_resource,
-	.unmap_resource = iommu_dma_unmap_resource,
-};
-
-static int __init __iommu_dma_init(void)
-{
-	return iommu_dma_init();
-}
-arch_initcall(__iommu_dma_init);
-
-static void __iommu_setup_dma_ops(struct device *dev, u64 dma_base, u64 size,
-				  const struct iommu_ops *ops)
-{
-	struct iommu_domain *domain;
-
-	if (!ops)
-		return;
-
-	/*
-	 * The IOMMU core code allocates the default DMA domain, which the
-	 * underlying IOMMU driver needs to support via the dma-iommu layer.
-	 */
-	domain = iommu_get_domain_for_dev(dev);
-
-	if (!domain)
-		goto out_err;
-
-	if (domain->type == IOMMU_DOMAIN_DMA) {
-		if (iommu_dma_init_domain(domain, dma_base, size, dev))
-			goto out_err;
-
-		dev->dma_ops = &iommu_dma_ops;
-	}
-
-	return;
-
-out_err:
-	 pr_warn("Failed to set up IOMMU for device %s; retaining platform DMA ops\n",
-		 dev_name(dev));
-}
-
 void arch_teardown_dma_ops(struct device *dev)
 {
 	dev->dma_ops = NULL;
 }
-
-#else
-
-static void __iommu_setup_dma_ops(struct device *dev, u64 dma_base, u64 size,
-				  const struct iommu_ops *iommu)
-{ }
-
-#endif  /* CONFIG_IOMMU_DMA */
+#endif
 
 void arch_setup_dma_ops(struct device *dev, u64 dma_base, u64 size,
 			const struct iommu_ops *iommu, bool coherent)
 {
 	dev->dma_coherent = coherent;
-	__iommu_setup_dma_ops(dev, dma_base, size, iommu);
+	if (iommu)
+		iommu_setup_dma_ops(dev, dma_base, size);
 
 #ifdef CONFIG_XEN
 	if (xen_initial_domain())
diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig
index 83664db5221d..d6d063160dd6 100644
--- a/drivers/iommu/Kconfig
+++ b/drivers/iommu/Kconfig
@@ -97,6 +97,7 @@ config IOMMU_DMA
 	select IOMMU_IOVA
 	select IRQ_MSI_IOMMU
 	select NEED_SG_DMA_LENGTH
+	depends on DMA_DIRECT_REMAP
 
 config FSL_PAMU
 	bool "Freescale IOMMU support"
diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
index 9b7f120d7381..e34ba23353cb 100644
--- a/drivers/iommu/dma-iommu.c
+++ b/drivers/iommu/dma-iommu.c
@@ -21,6 +21,7 @@
 
 #include <linux/acpi_iort.h>
 #include <linux/device.h>
+#include <linux/dma-contiguous.h>
 #include <linux/dma-iommu.h>
 #include <linux/dma-noncoherent.h>
 #include <linux/gfp.h>
@@ -79,11 +80,6 @@ static struct iommu_dma_cookie *cookie_alloc(enum iommu_dma_cookie_type type)
 	return cookie;
 }
 
-int iommu_dma_init(void)
-{
-	return iova_cache_get();
-}
-
 /**
  * iommu_get_dma_cookie - Acquire DMA-API resources for a domain
  * @domain: IOMMU domain to prepare for DMA-API usage
@@ -314,7 +310,7 @@ static void iommu_dma_flush_iotlb_all(struct iova_domain *iovad)
  * to ensure it is an invalid IOVA. It is safe to reinitialise a domain, but
  * any change which could make prior IOVAs invalid will fail.
  */
-int iommu_dma_init_domain(struct iommu_domain *domain, dma_addr_t base,
+static int iommu_dma_init_domain(struct iommu_domain *domain, dma_addr_t base,
 		u64 size, struct device *dev)
 {
 	struct iommu_dma_cookie *cookie = domain->iova_cookie;
@@ -365,7 +361,6 @@ int iommu_dma_init_domain(struct iommu_domain *domain, dma_addr_t base,
 
 	return iova_reserve_iommu_regions(dev, domain);
 }
-EXPORT_SYMBOL(iommu_dma_init_domain);
 
 /**
  * dma_info_to_prot - Translate DMA API directions and attributes to IOMMU API
@@ -376,7 +371,7 @@ EXPORT_SYMBOL(iommu_dma_init_domain);
  *
  * Return: corresponding IOMMU API page protection flags
  */
-int dma_info_to_prot(enum dma_data_direction dir, bool coherent,
+static int dma_info_to_prot(enum dma_data_direction dir, bool coherent,
 		     unsigned long attrs)
 {
 	int prot = coherent ? IOMMU_CACHE : 0;
@@ -535,17 +530,17 @@ static struct page **__iommu_dma_alloc_pages(struct device *dev,
 }
 
 /**
- * iommu_dma_free - Free a buffer allocated by iommu_dma_alloc()
+ * iommu_dma_free - Free a buffer allocated by __iommu_dma_alloc()
  * @dev: Device which owns this buffer
- * @pages: Array of buffer pages as returned by iommu_dma_alloc()
+ * @pages: Array of buffer pages as returned by __iommu_dma_alloc()
  * @size: Size of buffer in bytes
  * @handle: DMA address of buffer
  *
  * Frees both the pages associated with the buffer, and the array
  * describing them
  */
-void iommu_dma_free(struct device *dev, struct page **pages, size_t size,
-		dma_addr_t *handle)
+static void __iommu_dma_free(struct device *dev, struct page **pages,
+		size_t size, dma_addr_t *handle)
 {
 	__iommu_dma_unmap(iommu_get_dma_domain(dev), *handle, size);
 	__iommu_dma_free_pages(pages, PAGE_ALIGN(size) >> PAGE_SHIFT);
@@ -553,7 +548,7 @@ void iommu_dma_free(struct device *dev, struct page **pages, size_t size,
 }
 
 /**
- * iommu_dma_alloc - Allocate and map a buffer contiguous in IOVA space
+ * __iommu_dma_alloc - Allocate and map a buffer contiguous in IOVA space
  * @dev: Device to allocate memory for. Must be a real device
  *	 attached to an iommu_dma_domain
  * @size: Size of buffer in bytes
@@ -568,8 +563,8 @@ void iommu_dma_free(struct device *dev, struct page **pages, size_t size,
  * Return: Array of struct page pointers describing the buffer,
  *	   or NULL on failure.
  */
-struct page **iommu_dma_alloc(struct device *dev, size_t size, gfp_t gfp,
-		unsigned long attrs, int prot, dma_addr_t *handle)
+static struct page **__iommu_dma_alloc(struct device *dev, size_t size,
+		gfp_t gfp, unsigned long attrs, int prot, dma_addr_t *handle)
 {
 	struct iommu_domain *domain = iommu_get_dma_domain(dev);
 	struct iommu_dma_cookie *cookie = domain->iova_cookie;
@@ -631,20 +626,72 @@ out_free_pages:
 }
 
 /**
- * iommu_dma_mmap - Map a buffer into provided user VMA
- * @pages: Array representing buffer from iommu_dma_alloc()
+ * __iommu_dma_mmap - Map a buffer into provided user VMA
+ * @pages: Array representing buffer from __iommu_dma_alloc()
  * @size: Size of buffer in bytes
  * @vma: VMA describing requested userspace mapping
  *
  * Maps the pages of the buffer in @pages into @vma. The caller is responsible
  * for verifying the correct size and protection of @vma beforehand.
  */
-
-int iommu_dma_mmap(struct page **pages, size_t size, struct vm_area_struct *vma)
+static int __iommu_dma_mmap(struct page **pages, size_t size,
+		struct vm_area_struct *vma)
 {
 	return vm_map_pages(vma, pages, PAGE_ALIGN(size) >> PAGE_SHIFT);
 }
 
+static void iommu_dma_sync_single_for_cpu(struct device *dev,
+		dma_addr_t dma_handle, size_t size, enum dma_data_direction dir)
+{
+	phys_addr_t phys;
+
+	if (dev_is_dma_coherent(dev))
+		return;
+
+	phys = iommu_iova_to_phys(iommu_get_dma_domain(dev), dma_handle);
+	arch_sync_dma_for_cpu(dev, phys, size, dir);
+}
+
+static void iommu_dma_sync_single_for_device(struct device *dev,
+		dma_addr_t dma_handle, size_t size, enum dma_data_direction dir)
+{
+	phys_addr_t phys;
+
+	if (dev_is_dma_coherent(dev))
+		return;
+
+	phys = iommu_iova_to_phys(iommu_get_dma_domain(dev), dma_handle);
+	arch_sync_dma_for_device(dev, phys, size, dir);
+}
+
+static void iommu_dma_sync_sg_for_cpu(struct device *dev,
+		struct scatterlist *sgl, int nelems,
+		enum dma_data_direction dir)
+{
+	struct scatterlist *sg;
+	int i;
+
+	if (dev_is_dma_coherent(dev))
+		return;
+
+	for_each_sg(sgl, sg, nelems, i)
+		arch_sync_dma_for_cpu(dev, sg_phys(sg), sg->length, dir);
+}
+
+static void iommu_dma_sync_sg_for_device(struct device *dev,
+		struct scatterlist *sgl, int nelems,
+		enum dma_data_direction dir)
+{
+	struct scatterlist *sg;
+	int i;
+
+	if (dev_is_dma_coherent(dev))
+		return;
+
+	for_each_sg(sgl, sg, nelems, i)
+		arch_sync_dma_for_device(dev, sg_phys(sg), sg->length, dir);
+}
+
 static dma_addr_t __iommu_dma_map(struct device *dev, phys_addr_t phys,
 		size_t size, int prot, struct iommu_domain *domain)
 {
@@ -668,19 +715,44 @@ static dma_addr_t __iommu_dma_map(struct device *dev, phys_addr_t phys,
 	return iova + iova_off;
 }
 
-dma_addr_t iommu_dma_map_page(struct device *dev, struct page *page,
+static dma_addr_t __iommu_dma_map_page(struct device *dev, struct page *page,
 		unsigned long offset, size_t size, int prot)
 {
 	return __iommu_dma_map(dev, page_to_phys(page) + offset, size, prot,
 			iommu_get_dma_domain(dev));
 }
 
-void iommu_dma_unmap_page(struct device *dev, dma_addr_t handle, size_t size,
-		enum dma_data_direction dir, unsigned long attrs)
+static void __iommu_dma_unmap_page(struct device *dev, dma_addr_t handle,
+		size_t size, enum dma_data_direction dir, unsigned long attrs)
 {
 	__iommu_dma_unmap(iommu_get_dma_domain(dev), handle, size);
 }
 
+static dma_addr_t iommu_dma_map_page(struct device *dev, struct page *page,
+		unsigned long offset, size_t size, enum dma_data_direction dir,
+		unsigned long attrs)
+{
+	phys_addr_t phys = page_to_phys(page) + offset;
+	bool coherent = dev_is_dma_coherent(dev);
+	dma_addr_t dma_handle;
+
+	dma_handle =__iommu_dma_map(dev, phys, size,
+			dma_info_to_prot(dir, coherent, attrs),
+			iommu_get_dma_domain(dev));
+	if (!coherent && !(attrs & DMA_ATTR_SKIP_CPU_SYNC) &&
+	    dma_handle != DMA_MAPPING_ERROR)
+		arch_sync_dma_for_device(dev, phys, size, dir);
+	return dma_handle;
+}
+
+static void iommu_dma_unmap_page(struct device *dev, dma_addr_t dma_handle,
+		size_t size, enum dma_data_direction dir, unsigned long attrs)
+{
+	if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
+		iommu_dma_sync_single_for_cpu(dev, dma_handle, size, dir);
+	__iommu_dma_unmap(iommu_get_dma_domain(dev), dma_handle, size);
+}
+
 /*
  * Prepare a successfully-mapped scatterlist to give back to the caller.
  *
@@ -763,18 +835,22 @@ static void __invalidate_sg(struct scatterlist *sg, int nents)
  * impedance-matching, to be able to hand off a suitably-aligned list,
  * but still preserve the original offsets and sizes for the caller.
  */
-int iommu_dma_map_sg(struct device *dev, struct scatterlist *sg,
-		int nents, int prot)
+static int iommu_dma_map_sg(struct device *dev, struct scatterlist *sg,
+		int nents, enum dma_data_direction dir, unsigned long attrs)
 {
 	struct iommu_domain *domain = iommu_get_dma_domain(dev);
 	struct iommu_dma_cookie *cookie = domain->iova_cookie;
 	struct iova_domain *iovad = &cookie->iovad;
 	struct scatterlist *s, *prev = NULL;
+	int prot = dma_info_to_prot(dir, dev_is_dma_coherent(dev), attrs);
 	dma_addr_t iova;
 	size_t iova_len = 0;
 	unsigned long mask = dma_get_seg_boundary(dev);
 	int i;
 
+	if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
+		iommu_dma_sync_sg_for_device(dev, sg, nents, dir);
+
 	/*
 	 * Work out how much IOVA space we need, and align the segments to
 	 * IOVA granules for the IOMMU driver to handle. With some clever
@@ -834,12 +910,16 @@ out_restore_sg:
 	return 0;
 }
 
-void iommu_dma_unmap_sg(struct device *dev, struct scatterlist *sg, int nents,
-		enum dma_data_direction dir, unsigned long attrs)
+static void iommu_dma_unmap_sg(struct device *dev, struct scatterlist *sg,
+		int nents, enum dma_data_direction dir, unsigned long attrs)
 {
 	dma_addr_t start, end;
 	struct scatterlist *tmp;
 	int i;
+
+	if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
+		iommu_dma_sync_sg_for_cpu(dev, sg, nents, dir);
+
 	/*
 	 * The scatterlist segments are mapped into a single
 	 * contiguous IOVA allocation, so this is incredibly easy.
@@ -854,7 +934,7 @@ void iommu_dma_unmap_sg(struct device *dev, struct scatterlist *sg, int nents,
 	__iommu_dma_unmap(iommu_get_dma_domain(dev), start, end - start);
 }
 
-dma_addr_t iommu_dma_map_resource(struct device *dev, phys_addr_t phys,
+static dma_addr_t iommu_dma_map_resource(struct device *dev, phys_addr_t phys,
 		size_t size, enum dma_data_direction dir, unsigned long attrs)
 {
 	return __iommu_dma_map(dev, phys, size,
@@ -862,12 +942,268 @@ dma_addr_t iommu_dma_map_resource(struct device *dev, phys_addr_t phys,
 			iommu_get_dma_domain(dev));
 }
 
-void iommu_dma_unmap_resource(struct device *dev, dma_addr_t handle,
+static void iommu_dma_unmap_resource(struct device *dev, dma_addr_t handle,
 		size_t size, enum dma_data_direction dir, unsigned long attrs)
 {
 	__iommu_dma_unmap(iommu_get_dma_domain(dev), handle, size);
 }
 
+static void *iommu_dma_alloc(struct device *dev, size_t size,
+		dma_addr_t *handle, gfp_t gfp, unsigned long attrs)
+{
+	bool coherent = dev_is_dma_coherent(dev);
+	int ioprot = dma_info_to_prot(DMA_BIDIRECTIONAL, coherent, attrs);
+	size_t iosize = size;
+	void *addr;
+
+	size = PAGE_ALIGN(size);
+	gfp |= __GFP_ZERO;
+
+	if (!gfpflags_allow_blocking(gfp)) {
+		struct page *page;
+		/*
+		 * In atomic context we can't remap anything, so we'll only
+		 * get the virtually contiguous buffer we need by way of a
+		 * physically contiguous allocation.
+		 */
+		if (coherent) {
+			page = alloc_pages(gfp, get_order(size));
+			addr = page ? page_address(page) : NULL;
+		} else {
+			addr = dma_alloc_from_pool(size, &page, gfp);
+		}
+		if (!addr)
+			return NULL;
+
+		*handle = __iommu_dma_map_page(dev, page, 0, iosize, ioprot);
+		if (*handle == DMA_MAPPING_ERROR) {
+			if (coherent)
+				__free_pages(page, get_order(size));
+			else
+				dma_free_from_pool(addr, size);
+			addr = NULL;
+		}
+	} else if (attrs & DMA_ATTR_FORCE_CONTIGUOUS) {
+		pgprot_t prot = arch_dma_mmap_pgprot(dev, PAGE_KERNEL, attrs);
+		struct page *page;
+
+		page = dma_alloc_from_contiguous(dev, size >> PAGE_SHIFT,
+					get_order(size), gfp & __GFP_NOWARN);
+		if (!page)
+			return NULL;
+
+		*handle = __iommu_dma_map_page(dev, page, 0, iosize, ioprot);
+		if (*handle == DMA_MAPPING_ERROR) {
+			dma_release_from_contiguous(dev, page,
+						    size >> PAGE_SHIFT);
+			return NULL;
+		}
+		addr = dma_common_contiguous_remap(page, size, VM_USERMAP,
+						   prot,
+						   __builtin_return_address(0));
+		if (addr) {
+			if (!coherent)
+				arch_dma_prep_coherent(page, iosize);
+			memset(addr, 0, size);
+		} else {
+			__iommu_dma_unmap_page(dev, *handle, iosize, 0, attrs);
+			dma_release_from_contiguous(dev, page,
+						    size >> PAGE_SHIFT);
+		}
+	} else {
+		pgprot_t prot = arch_dma_mmap_pgprot(dev, PAGE_KERNEL, attrs);
+		struct page **pages;
+
+		pages = __iommu_dma_alloc(dev, iosize, gfp, attrs, ioprot,
+					handle);
+		if (!pages)
+			return NULL;
+
+		addr = dma_common_pages_remap(pages, size, VM_USERMAP, prot,
+					      __builtin_return_address(0));
+		if (!addr)
+			__iommu_dma_free(dev, pages, iosize, handle);
+	}
+	return addr;
+}
+
+static void iommu_dma_free(struct device *dev, size_t size, void *cpu_addr,
+		dma_addr_t handle, unsigned long attrs)
+{
+	size_t iosize = size;
+
+	size = PAGE_ALIGN(size);
+	/*
+	 * @cpu_addr will be one of 4 things depending on how it was allocated:
+	 * - A remapped array of pages for contiguous allocations.
+	 * - A remapped array of pages from __iommu_dma_alloc(), for all
+	 *   non-atomic allocations.
+	 * - A non-cacheable alias from the atomic pool, for atomic
+	 *   allocations by non-coherent devices.
+	 * - A normal lowmem address, for atomic allocations by
+	 *   coherent devices.
+	 * Hence how dodgy the below logic looks...
+	 */
+	if (dma_in_atomic_pool(cpu_addr, size)) {
+		__iommu_dma_unmap_page(dev, handle, iosize, 0, 0);
+		dma_free_from_pool(cpu_addr, size);
+	} else if (attrs & DMA_ATTR_FORCE_CONTIGUOUS) {
+		struct page *page = vmalloc_to_page(cpu_addr);
+
+		__iommu_dma_unmap_page(dev, handle, iosize, 0, attrs);
+		dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT);
+		dma_common_free_remap(cpu_addr, size, VM_USERMAP);
+	} else if (is_vmalloc_addr(cpu_addr)){
+		struct vm_struct *area = find_vm_area(cpu_addr);
+
+		if (WARN_ON(!area || !area->pages))
+			return;
+		__iommu_dma_free(dev, area->pages, iosize, &handle);
+		dma_common_free_remap(cpu_addr, size, VM_USERMAP);
+	} else {
+		__iommu_dma_unmap_page(dev, handle, iosize, 0, 0);
+		__free_pages(virt_to_page(cpu_addr), get_order(size));
+	}
+}
+
+static int __iommu_dma_mmap_pfn(struct vm_area_struct *vma,
+			      unsigned long pfn, size_t size)
+{
+	int ret = -ENXIO;
+	unsigned long nr_vma_pages = vma_pages(vma);
+	unsigned long nr_pages = PAGE_ALIGN(size) >> PAGE_SHIFT;
+	unsigned long off = vma->vm_pgoff;
+
+	if (off < nr_pages && nr_vma_pages <= (nr_pages - off)) {
+		ret = remap_pfn_range(vma, vma->vm_start,
+				      pfn + off,
+				      vma->vm_end - vma->vm_start,
+				      vma->vm_page_prot);
+	}
+
+	return ret;
+}
+
+static int iommu_dma_mmap(struct device *dev, struct vm_area_struct *vma,
+		void *cpu_addr, dma_addr_t dma_addr, size_t size,
+		unsigned long attrs)
+{
+	unsigned long nr_pages = PAGE_ALIGN(size) >> PAGE_SHIFT;
+	unsigned long off = vma->vm_pgoff;
+	struct vm_struct *area;
+	int ret;
+
+	vma->vm_page_prot = arch_dma_mmap_pgprot(dev, vma->vm_page_prot, attrs);
+
+	if (dma_mmap_from_dev_coherent(dev, vma, cpu_addr, size, &ret))
+		return ret;
+
+	if (off >= nr_pages || vma_pages(vma) > nr_pages - off)
+		return -ENXIO;
+
+	if (!is_vmalloc_addr(cpu_addr)) {
+		unsigned long pfn = page_to_pfn(virt_to_page(cpu_addr));
+		return __iommu_dma_mmap_pfn(vma, pfn, size);
+	}
+
+	if (attrs & DMA_ATTR_FORCE_CONTIGUOUS) {
+		/*
+		 * DMA_ATTR_FORCE_CONTIGUOUS allocations are always remapped,
+		 * hence in the vmalloc space.
+		 */
+		unsigned long pfn = vmalloc_to_pfn(cpu_addr);
+		return __iommu_dma_mmap_pfn(vma, pfn, size);
+	}
+
+	area = find_vm_area(cpu_addr);
+	if (WARN_ON(!area || !area->pages))
+		return -ENXIO;
+
+	return __iommu_dma_mmap(area->pages, size, vma);
+}
+
+static int __iommu_dma_get_sgtable_page(struct sg_table *sgt, struct page *page,
+		size_t size)
+{
+	int ret = sg_alloc_table(sgt, 1, GFP_KERNEL);
+
+	if (!ret)
+		sg_set_page(sgt->sgl, page, PAGE_ALIGN(size), 0);
+	return ret;
+}
+
+static int iommu_dma_get_sgtable(struct device *dev, struct sg_table *sgt,
+		void *cpu_addr, dma_addr_t dma_addr, size_t size,
+		unsigned long attrs)
+{
+	unsigned int count = PAGE_ALIGN(size) >> PAGE_SHIFT;
+	struct vm_struct *area = find_vm_area(cpu_addr);
+
+	if (!is_vmalloc_addr(cpu_addr)) {
+		struct page *page = virt_to_page(cpu_addr);
+		return __iommu_dma_get_sgtable_page(sgt, page, size);
+	}
+
+	if (attrs & DMA_ATTR_FORCE_CONTIGUOUS) {
+		/*
+		 * DMA_ATTR_FORCE_CONTIGUOUS allocations are always remapped,
+		 * hence in the vmalloc space.
+		 */
+		struct page *page = vmalloc_to_page(cpu_addr);
+		return __iommu_dma_get_sgtable_page(sgt, page, size);
+	}
+
+	if (WARN_ON(!area || !area->pages))
+		return -ENXIO;
+
+	return sg_alloc_table_from_pages(sgt, area->pages, count, 0, size,
+					 GFP_KERNEL);
+}
+
+static const struct dma_map_ops iommu_dma_ops = {
+	.alloc			= iommu_dma_alloc,
+	.free			= iommu_dma_free,
+	.mmap			= iommu_dma_mmap,
+	.get_sgtable		= iommu_dma_get_sgtable,
+	.map_page		= iommu_dma_map_page,
+	.unmap_page		= iommu_dma_unmap_page,
+	.map_sg			= iommu_dma_map_sg,
+	.unmap_sg		= iommu_dma_unmap_sg,
+	.sync_single_for_cpu	= iommu_dma_sync_single_for_cpu,
+	.sync_single_for_device	= iommu_dma_sync_single_for_device,
+	.sync_sg_for_cpu	= iommu_dma_sync_sg_for_cpu,
+	.sync_sg_for_device	= iommu_dma_sync_sg_for_device,
+	.map_resource		= iommu_dma_map_resource,
+	.unmap_resource		= iommu_dma_unmap_resource,
+};
+
+/*
+ * The IOMMU core code allocates the default DMA domain, which the underlying
+ * IOMMU driver needs to support via the dma-iommu layer.
+ */
+void iommu_setup_dma_ops(struct device *dev, u64 dma_base, u64 size)
+{
+	struct iommu_domain *domain = iommu_get_domain_for_dev(dev);
+
+	if (!domain)
+		goto out_err;
+
+	/*
+	 * The IOMMU core code allocates the default DMA domain, which the
+	 * underlying IOMMU driver needs to support via the dma-iommu layer.
+	 */
+	if (domain->type == IOMMU_DOMAIN_DMA) {
+		if (iommu_dma_init_domain(domain, dma_base, size, dev))
+			goto out_err;
+		dev->dma_ops = &iommu_dma_ops;
+	}
+
+	return;
+out_err:
+	 pr_warn("Failed to set up IOMMU for device %s; retaining platform DMA ops\n",
+		 dev_name(dev));
+}
+
 static struct iommu_dma_msi_page *iommu_dma_get_msi_page(struct device *dev,
 		phys_addr_t msi_addr, struct iommu_domain *domain)
 {
@@ -948,3 +1284,9 @@ void iommu_dma_compose_msi_msg(struct msi_desc *desc,
 	msg->address_lo &= cookie_msi_granule(domain->iova_cookie) - 1;
 	msg->address_lo += lower_32_bits(msi_page->iova);
 }
+
+static int iommu_dma_init(void)
+{
+	return iova_cache_get();
+}
+arch_initcall(iommu_dma_init);
diff --git a/include/linux/dma-iommu.h b/include/linux/dma-iommu.h
index e1ef265b578b..b3cc3fb84079 100644
--- a/include/linux/dma-iommu.h
+++ b/include/linux/dma-iommu.h
@@ -24,49 +24,13 @@
 #include <linux/iommu.h>
 #include <linux/msi.h>
 
-int iommu_dma_init(void);
-
 /* Domain management interface for IOMMU drivers */
 int iommu_get_dma_cookie(struct iommu_domain *domain);
 int iommu_get_msi_cookie(struct iommu_domain *domain, dma_addr_t base);
 void iommu_put_dma_cookie(struct iommu_domain *domain);
 
 /* Setup call for arch DMA mapping code */
-int iommu_dma_init_domain(struct iommu_domain *domain, dma_addr_t base,
-		u64 size, struct device *dev);
-
-/* General helpers for DMA-API <-> IOMMU-API interaction */
-int dma_info_to_prot(enum dma_data_direction dir, bool coherent,
-		     unsigned long attrs);
-
-/*
- * These implement the bulk of the relevant DMA mapping callbacks, but require
- * the arch code to take care of attributes and cache maintenance
- */
-struct page **iommu_dma_alloc(struct device *dev, size_t size, gfp_t gfp,
-		unsigned long attrs, int prot, dma_addr_t *handle);
-void iommu_dma_free(struct device *dev, struct page **pages, size_t size,
-		dma_addr_t *handle);
-
-int iommu_dma_mmap(struct page **pages, size_t size, struct vm_area_struct *vma);
-
-dma_addr_t iommu_dma_map_page(struct device *dev, struct page *page,
-		unsigned long offset, size_t size, int prot);
-int iommu_dma_map_sg(struct device *dev, struct scatterlist *sg,
-		int nents, int prot);
-
-/*
- * Arch code with no special attribute handling may use these
- * directly as DMA mapping callbacks for simplicity
- */
-void iommu_dma_unmap_page(struct device *dev, dma_addr_t handle, size_t size,
-		enum dma_data_direction dir, unsigned long attrs);
-void iommu_dma_unmap_sg(struct device *dev, struct scatterlist *sg, int nents,
-		enum dma_data_direction dir, unsigned long attrs);
-dma_addr_t iommu_dma_map_resource(struct device *dev, phys_addr_t phys,
-		size_t size, enum dma_data_direction dir, unsigned long attrs);
-void iommu_dma_unmap_resource(struct device *dev, dma_addr_t handle,
-		size_t size, enum dma_data_direction dir, unsigned long attrs);
+void iommu_setup_dma_ops(struct device *dev, u64 dma_base, u64 size);
 
 /* The DMA API isn't _quite_ the whole story, though... */
 /*
@@ -91,9 +55,9 @@ struct msi_desc;
 struct msi_msg;
 struct device;
 
-static inline int iommu_dma_init(void)
+static inline void iommu_setup_dma_ops(struct device *dev, u64 dma_base,
+		u64 size)
 {
-	return 0;
 }
 
 static inline int iommu_get_dma_cookie(struct iommu_domain *domain)
-- 
cgit v1.2.3


From a9f4d93dbeb6f5ccb50c6362ba944afe34cb8f12 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 20 May 2019 09:29:46 +0200
Subject: iommu/dma: Switch copyright boilerplace to SPDX

Signed-off-by: Christoph Hellwig <hch@lst.de>
Acked-by: Robin Murphy <robin.murphy@arm.com>
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 drivers/iommu/dma-iommu.c | 13 +------------
 include/linux/dma-iommu.h | 13 +------------
 2 files changed, 2 insertions(+), 24 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
index 0aff220c4aed..0cd49c2d3770 100644
--- a/drivers/iommu/dma-iommu.c
+++ b/drivers/iommu/dma-iommu.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
 /*
  * A fairly generic DMA-API to IOMMU-API glue layer.
  *
@@ -5,18 +6,6 @@
  *
  * based in part on arch/arm/mm/dma-mapping.c:
  * Copyright (C) 2000-2004 Russell King
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.
  */
 
 #include <linux/acpi_iort.h>
diff --git a/include/linux/dma-iommu.h b/include/linux/dma-iommu.h
index b3cc3fb84079..05556f4d9cce 100644
--- a/include/linux/dma-iommu.h
+++ b/include/linux/dma-iommu.h
@@ -1,17 +1,6 @@
+/* SPDX-License-Identifier: GPL-2.0 */
 /*
  * Copyright (C) 2014-2015 ARM Ltd.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.
  */
 #ifndef __DMA_IOMMU_H
 #define __DMA_IOMMU_H
-- 
cgit v1.2.3


From 88c50322449a4dea8687ed155968d19cfc763393 Mon Sep 17 00:00:00 2001
From: Przemyslaw Gaj <pgaj@cadence.com>
Date: Tue, 16 Apr 2019 09:36:14 +0100
Subject: i3c: Drop support for I2C 10 bit addresing

This patch drops support for I2C devices with 10 bit addressing. When I2C
device with 10 bit address is defined in DT, I3C master registration fails.

Address space for I2C devices has been reduced and ->i2c_funcs() hook has been
removed.

Because this patch series dropped support for 10 bit I2C devices, support is
also dropped in Cadence I3C master driver and Synopsys DesignWare I3C master
driver.

Signed-off-by: Przemyslaw Gaj <pgaj@cadence.com>
Signed-off-by: Boris Brezillon <boris.brezillon@collabora.com>
---
 drivers/i3c/master.c                 | 21 ++++++++++++++-------
 drivers/i3c/master/dw-i3c-master.c   |  6 ------
 drivers/i3c/master/i3c-master-cdns.c | 10 +---------
 include/linux/i3c/master.h           |  5 +----
 4 files changed, 16 insertions(+), 26 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/i3c/master.c b/drivers/i3c/master.c
index 5f4bd52121fe..b9d2b88928e1 100644
--- a/drivers/i3c/master.c
+++ b/drivers/i3c/master.c
@@ -1963,6 +1963,16 @@ of_i3c_master_add_i2c_boardinfo(struct i3c_master_controller *master,
 	if (ret)
 		return ret;
 
+	/*
+	 * The I3C Specification does not clearly say I2C devices with 10-bit
+	 * address are supported. These devices can't be passed properly through
+	 * DEFSLVS command.
+	 */
+	if (boardinfo->base.flags & I2C_CLIENT_TEN) {
+		dev_err(&master->dev, "I2C device with 10 bit address not supported.");
+		return -ENOTSUPP;
+	}
+
 	/* LVR is encoded in reg[2]. */
 	boardinfo->lvr = reg[2];
 
@@ -2111,16 +2121,14 @@ static int i3c_master_i2c_adapter_xfer(struct i2c_adapter *adap,
 	return ret ? ret : nxfers;
 }
 
-static u32 i3c_master_i2c_functionalities(struct i2c_adapter *adap)
+static u32 i3c_master_i2c_funcs(struct i2c_adapter *adapter)
 {
-	struct i3c_master_controller *master = i2c_adapter_to_i3c_master(adap);
-
-	return master->ops->i2c_funcs(master);
+	return I2C_FUNC_SMBUS_EMUL | I2C_FUNC_I2C;
 }
 
 static const struct i2c_algorithm i3c_master_i2c_algo = {
 	.master_xfer = i3c_master_i2c_adapter_xfer,
-	.functionality = i3c_master_i2c_functionalities,
+	.functionality = i3c_master_i2c_funcs,
 };
 
 static int i3c_master_i2c_adapter_init(struct i3c_master_controller *master)
@@ -2379,8 +2387,7 @@ EXPORT_SYMBOL_GPL(i3c_generic_ibi_recycle_slot);
 static int i3c_master_check_ops(const struct i3c_master_controller_ops *ops)
 {
 	if (!ops || !ops->bus_init || !ops->priv_xfers ||
-	    !ops->send_ccc_cmd || !ops->do_daa || !ops->i2c_xfers ||
-	    !ops->i2c_funcs)
+	    !ops->send_ccc_cmd || !ops->do_daa || !ops->i2c_xfers)
 		return -EINVAL;
 
 	if (ops->request_ibi &&
diff --git a/drivers/i3c/master/dw-i3c-master.c b/drivers/i3c/master/dw-i3c-master.c
index 1d83c97431c7..d6e517e69f84 100644
--- a/drivers/i3c/master/dw-i3c-master.c
+++ b/drivers/i3c/master/dw-i3c-master.c
@@ -1060,11 +1060,6 @@ static void dw_i3c_master_detach_i2c_dev(struct i2c_dev_desc *dev)
 	kfree(data);
 }
 
-static u32 dw_i3c_master_i2c_funcs(struct i3c_master_controller *m)
-{
-	return I2C_FUNC_I2C | I2C_FUNC_SMBUS_EMUL;
-}
-
 static irqreturn_t dw_i3c_master_irq_handler(int irq, void *dev_id)
 {
 	struct dw_i3c_master *master = dev_id;
@@ -1099,7 +1094,6 @@ static const struct i3c_master_controller_ops dw_mipi_i3c_ops = {
 	.attach_i2c_dev = dw_i3c_master_attach_i2c_dev,
 	.detach_i2c_dev = dw_i3c_master_detach_i2c_dev,
 	.i2c_xfers = dw_i3c_master_i2c_xfers,
-	.i2c_funcs = dw_i3c_master_i2c_funcs,
 };
 
 static int dw_i3c_probe(struct platform_device *pdev)
diff --git a/drivers/i3c/master/i3c-master-cdns.c b/drivers/i3c/master/i3c-master-cdns.c
index 8889a4fdb454..237f24adddc6 100644
--- a/drivers/i3c/master/i3c-master-cdns.c
+++ b/drivers/i3c/master/i3c-master-cdns.c
@@ -864,11 +864,6 @@ static int cdns_i3c_master_i2c_xfers(struct i2c_dev_desc *dev,
 	return ret;
 }
 
-static u32 cdns_i3c_master_i2c_funcs(struct i3c_master_controller *m)
-{
-	return I2C_FUNC_SMBUS_EMUL | I2C_FUNC_I2C | I2C_FUNC_10BIT_ADDR;
-}
-
 struct cdns_i3c_i2c_dev_data {
 	u16 id;
 	s16 ibi;
@@ -1010,9 +1005,7 @@ static int cdns_i3c_master_attach_i2c_dev(struct i2c_dev_desc *dev)
 	master->free_rr_slots &= ~BIT(slot);
 	i2c_dev_set_master_data(dev, data);
 
-	writel(prepare_rr0_dev_address(dev->boardinfo->base.addr) |
-	       (dev->boardinfo->base.flags & I2C_CLIENT_TEN ?
-		DEV_ID_RR0_LVR_EXT_ADDR : 0),
+	writel(prepare_rr0_dev_address(dev->boardinfo->base.addr),
 	       master->regs + DEV_ID_RR0(data->id));
 	writel(dev->boardinfo->lvr, master->regs + DEV_ID_RR2(data->id));
 	writel(readl(master->regs + DEVS_CTRL) |
@@ -1518,7 +1511,6 @@ static const struct i3c_master_controller_ops cdns_i3c_master_ops = {
 	.send_ccc_cmd = cdns_i3c_master_send_ccc_cmd,
 	.priv_xfers = cdns_i3c_master_priv_xfers,
 	.i2c_xfers = cdns_i3c_master_i2c_xfers,
-	.i2c_funcs = cdns_i3c_master_i2c_funcs,
 	.enable_ibi = cdns_i3c_master_enable_ibi,
 	.disable_ibi = cdns_i3c_master_disable_ibi,
 	.request_ibi = cdns_i3c_master_request_ibi,
diff --git a/include/linux/i3c/master.h b/include/linux/i3c/master.h
index f13fd8b1dd79..eca8337bdaa5 100644
--- a/include/linux/i3c/master.h
+++ b/include/linux/i3c/master.h
@@ -48,7 +48,7 @@ struct i3c_i2c_dev_desc {
 #define I3C_LVR_I2C_INDEX(x)		((x) << 5)
 #define I3C_LVR_I2C_FM_MODE		BIT(4)
 
-#define I2C_MAX_ADDR			GENMASK(9, 0)
+#define I2C_MAX_ADDR			GENMASK(6, 0)
 
 /**
  * struct i2c_dev_boardinfo - I2C device board information
@@ -390,8 +390,6 @@ struct i3c_bus {
  *	       and i2c_put_dma_safe_msg_buf() helpers provided by the I2C
  *	       framework.
  *	       This method is mandatory.
- * @i2c_funcs: expose the supported I2C functionalities.
- *	       This method is mandatory.
  * @request_ibi: attach an IBI handler to an I3C device. This implies defining
  *		 an IBI handler and the constraints of the IBI (maximum payload
  *		 length and number of pre-allocated slots).
@@ -437,7 +435,6 @@ struct i3c_master_controller_ops {
 	void (*detach_i2c_dev)(struct i2c_dev_desc *dev);
 	int (*i2c_xfers)(struct i2c_dev_desc *dev,
 			 const struct i2c_msg *xfers, int nxfers);
-	u32 (*i2c_funcs)(struct i3c_master_controller *master);
 	int (*request_ibi)(struct i3c_dev_desc *dev,
 			   const struct i3c_ibi_setup *req);
 	void (*free_ibi)(struct i3c_dev_desc *dev);
-- 
cgit v1.2.3


From 4ec066c7b1476e0ca66a7acdb575627a5d1a1ee6 Mon Sep 17 00:00:00 2001
From: Lu Baolu <baolu.lu@linux.intel.com>
Date: Sat, 25 May 2019 13:41:33 +0800
Subject: iommu/vt-d: Cleanup get_valid_domain_for_dev()

Previously, get_valid_domain_for_dev() is used to retrieve the
DMA domain which has been attached to the device or allocate one
if no domain has been attached yet. As we have delegated the DMA
domain management to upper layer, this function is used purely to
allocate a private DMA domain if the default domain doesn't work
for this device. Cleanup the code for readability.

Signed-off-by: Lu Baolu <baolu.lu@linux.intel.com>
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 drivers/iommu/intel-iommu.c | 18 ++++++++----------
 include/linux/intel-iommu.h |  1 -
 2 files changed, 8 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c
index c8b73802f0e0..ebc06ee79dce 100644
--- a/drivers/iommu/intel-iommu.c
+++ b/drivers/iommu/intel-iommu.c
@@ -2609,7 +2609,6 @@ static struct dmar_domain *find_or_alloc_domain(struct device *dev, int gaw)
 	}
 
 out:
-
 	return domain;
 }
 
@@ -3558,16 +3557,17 @@ static unsigned long intel_alloc_iova(struct device *dev,
 	return iova_pfn;
 }
 
-struct dmar_domain *get_valid_domain_for_dev(struct device *dev)
+static struct dmar_domain *get_private_domain_for_dev(struct device *dev)
 {
 	struct dmar_domain *domain, *tmp;
 	struct dmar_rmrr_unit *rmrr;
 	struct device *i_dev;
 	int i, ret;
 
+	/* Device shouldn't be attached by any domains. */
 	domain = find_domain(dev);
 	if (domain)
-		goto out;
+		return NULL;
 
 	domain = find_or_alloc_domain(dev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
 	if (!domain)
@@ -3597,11 +3597,9 @@ struct dmar_domain *get_valid_domain_for_dev(struct device *dev)
 	}
 
 out:
-
 	if (!domain)
 		dev_err(dev, "Allocating domain failed\n");
 
-
 	return domain;
 }
 
@@ -3638,7 +3636,7 @@ static bool iommu_need_mapping(struct device *dev)
 				dmar_domain = to_dmar_domain(domain);
 				dmar_domain->flags |= DOMAIN_FLAG_LOSE_CHILDREN;
 			}
-			get_valid_domain_for_dev(dev);
+			get_private_domain_for_dev(dev);
 		}
 
 		dev_info(dev, "32bit DMA uses non-identity mapping\n");
@@ -3660,7 +3658,7 @@ static dma_addr_t __intel_map_single(struct device *dev, phys_addr_t paddr,
 
 	BUG_ON(dir == DMA_NONE);
 
-	domain = get_valid_domain_for_dev(dev);
+	domain = find_domain(dev);
 	if (!domain)
 		return DMA_MAPPING_ERROR;
 
@@ -3875,7 +3873,7 @@ static int intel_map_sg(struct device *dev, struct scatterlist *sglist, int nele
 	if (!iommu_need_mapping(dev))
 		return dma_direct_map_sg(dev, sglist, nelems, dir, attrs);
 
-	domain = get_valid_domain_for_dev(dev);
+	domain = find_domain(dev);
 	if (!domain)
 		return 0;
 
@@ -5547,7 +5545,7 @@ static int intel_iommu_add_device(struct device *dev)
 			ret = iommu_request_dma_domain_for_dev(dev);
 			if (ret) {
 				dmar_domain->flags |= DOMAIN_FLAG_LOSE_CHILDREN;
-				if (!get_valid_domain_for_dev(dev)) {
+				if (!get_private_domain_for_dev(dev)) {
 					dev_warn(dev,
 						 "Failed to get a private domain.\n");
 					return -ENOMEM;
@@ -5640,7 +5638,7 @@ int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct device *dev)
 	u64 ctx_lo;
 	int ret;
 
-	domain = get_valid_domain_for_dev(dev);
+	domain = find_domain(dev);
 	if (!domain)
 		return -EINVAL;
 
diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h
index 4140726867a9..5b961c8ca64c 100644
--- a/include/linux/intel-iommu.h
+++ b/include/linux/intel-iommu.h
@@ -660,7 +660,6 @@ extern int qi_submit_sync(struct qi_desc *desc, struct intel_iommu *iommu);
 
 extern int dmar_ir_support(void);
 
-struct dmar_domain *get_valid_domain_for_dev(struct device *dev);
 void *alloc_pgtable_page(int node);
 void free_pgtable_page(void *vaddr);
 struct intel_iommu *domain_get_iommu(struct dmar_domain *domain);
-- 
cgit v1.2.3


From 153969fd952d81ab8f57574f9be1a90b0a0fa791 Mon Sep 17 00:00:00 2001
From: Linus Walleij <linus.walleij@linaro.org>
Date: Tue, 21 May 2019 03:38:25 +0200
Subject: ARM: versatile: Drop CLCD platform data

The Versatile family no longer makes any use of the CLCD
platform data, we have moved over all users to the DRM
driver that has built-in handling of the displays. Delete
the old auxdata.

Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
---
 arch/arm/mach-versatile/versatile_dt.c             | 157 ---------------------
 include/linux/platform_data/video-clcd-versatile.h |  28 ----
 2 files changed, 185 deletions(-)
 delete mode 100644 include/linux/platform_data/video-clcd-versatile.h

(limited to 'include/linux')

diff --git a/arch/arm/mach-versatile/versatile_dt.c b/arch/arm/mach-versatile/versatile_dt.c
index 028463af726d..b5ff1ea5a944 100644
--- a/arch/arm/mach-versatile/versatile_dt.c
+++ b/arch/arm/mach-versatile/versatile_dt.c
@@ -29,8 +29,6 @@
 #include <linux/of_platform.h>
 #include <linux/slab.h>
 #include <linux/amba/bus.h>
-#include <linux/amba/clcd.h>
-#include <linux/platform_data/video-clcd-versatile.h>
 #include <linux/amba/mmci.h>
 #include <asm/mach-types.h>
 #include <asm/mach/arch.h>
@@ -47,14 +45,12 @@
  */
 #define VERSATILE_SYS_PCICTL_OFFSET           0x44
 #define VERSATILE_SYS_MCI_OFFSET              0x48
-#define VERSATILE_SYS_CLCD_OFFSET             0x50
 
 /*
  * VERSATILE peripheral addresses
  */
 #define VERSATILE_MMCI0_BASE           0x10005000	/* MMC interface */
 #define VERSATILE_MMCI1_BASE           0x1000B000	/* MMC Interface */
-#define VERSATILE_CLCD_BASE            0x10120000	/* CLCD */
 #define VERSATILE_SCTL_BASE            0x101E0000	/* System controller */
 #define VERSATILE_IB2_BASE             0x24000000	/* IB2 module */
 #define VERSATILE_IB2_CTL_BASE		(VERSATILE_IB2_BASE + 0x03000000)
@@ -96,158 +92,6 @@ static struct mmci_platform_data mmc1_plat_data = {
 	.status		= mmc_status,
 };
 
-/*
- * CLCD support.
- */
-#define SYS_CLCD_MODE_MASK	(3 << 0)
-#define SYS_CLCD_MODE_888	(0 << 0)
-#define SYS_CLCD_MODE_5551	(1 << 0)
-#define SYS_CLCD_MODE_565_RLSB	(2 << 0)
-#define SYS_CLCD_MODE_565_BLSB	(3 << 0)
-#define SYS_CLCD_NLCDIOON	(1 << 2)
-#define SYS_CLCD_VDDPOSSWITCH	(1 << 3)
-#define SYS_CLCD_PWR3V5SWITCH	(1 << 4)
-#define SYS_CLCD_ID_MASK	(0x1f << 8)
-#define SYS_CLCD_ID_SANYO_3_8	(0x00 << 8)
-#define SYS_CLCD_ID_UNKNOWN_8_4	(0x01 << 8)
-#define SYS_CLCD_ID_EPSON_2_2	(0x02 << 8)
-#define SYS_CLCD_ID_SANYO_2_5	(0x07 << 8)
-#define SYS_CLCD_ID_VGA		(0x1f << 8)
-
-static bool is_sanyo_2_5_lcd;
-
-/*
- * Disable all display connectors on the interface module.
- */
-static void versatile_clcd_disable(struct clcd_fb *fb)
-{
-	void __iomem *sys_clcd = versatile_sys_base + VERSATILE_SYS_CLCD_OFFSET;
-	u32 val;
-
-	val = readl(sys_clcd);
-	val &= ~SYS_CLCD_NLCDIOON | SYS_CLCD_PWR3V5SWITCH;
-	writel(val, sys_clcd);
-
-	/*
-	 * If the LCD is Sanyo 2x5 in on the IB2 board, turn the back-light off
-	 */
-	if (of_machine_is_compatible("arm,versatile-ab") && is_sanyo_2_5_lcd) {
-		unsigned long ctrl;
-
-		ctrl = readl(versatile_ib2_ctrl);
-		ctrl &= ~0x01;
-		writel(ctrl, versatile_ib2_ctrl);
-	}
-}
-
-/*
- * Enable the relevant connector on the interface module.
- */
-static void versatile_clcd_enable(struct clcd_fb *fb)
-{
-	struct fb_var_screeninfo *var = &fb->fb.var;
-	void __iomem *sys_clcd = versatile_sys_base + VERSATILE_SYS_CLCD_OFFSET;
-	u32 val;
-
-	val = readl(sys_clcd);
-	val &= ~SYS_CLCD_MODE_MASK;
-
-	switch (var->green.length) {
-	case 5:
-		val |= SYS_CLCD_MODE_5551;
-		break;
-	case 6:
-		if (var->red.offset == 0)
-			val |= SYS_CLCD_MODE_565_RLSB;
-		else
-			val |= SYS_CLCD_MODE_565_BLSB;
-		break;
-	case 8:
-		val |= SYS_CLCD_MODE_888;
-		break;
-	}
-
-	/*
-	 * Set the MUX
-	 */
-	writel(val, sys_clcd);
-
-	/*
-	 * And now enable the PSUs
-	 */
-	val |= SYS_CLCD_NLCDIOON | SYS_CLCD_PWR3V5SWITCH;
-	writel(val, sys_clcd);
-
-	/*
-	 * If the LCD is Sanyo 2x5 in on the IB2 board, turn the back-light on
-	 */
-	if (of_machine_is_compatible("arm,versatile-ab") && is_sanyo_2_5_lcd) {
-		unsigned long ctrl;
-
-		ctrl = readl(versatile_ib2_ctrl);
-		ctrl |= 0x01;
-		writel(ctrl, versatile_ib2_ctrl);
-	}
-}
-
-/*
- * Detect which LCD panel is connected, and return the appropriate
- * clcd_panel structure.  Note: we do not have any information on
- * the required timings for the 8.4in panel, so we presently assume
- * VGA timings.
- */
-static int versatile_clcd_setup(struct clcd_fb *fb)
-{
-	void __iomem *sys_clcd = versatile_sys_base + VERSATILE_SYS_CLCD_OFFSET;
-	const char *panel_name;
-	u32 val;
-
-	is_sanyo_2_5_lcd = false;
-
-	val = readl(sys_clcd) & SYS_CLCD_ID_MASK;
-	if (val == SYS_CLCD_ID_SANYO_3_8)
-		panel_name = "Sanyo TM38QV67A02A";
-	else if (val == SYS_CLCD_ID_SANYO_2_5) {
-		panel_name = "Sanyo QVGA Portrait";
-		is_sanyo_2_5_lcd = true;
-	} else if (val == SYS_CLCD_ID_EPSON_2_2)
-		panel_name = "Epson L2F50113T00";
-	else if (val == SYS_CLCD_ID_VGA)
-		panel_name = "VGA";
-	else {
-		printk(KERN_ERR "CLCD: unknown LCD panel ID 0x%08x, using VGA\n",
-			val);
-		panel_name = "VGA";
-	}
-
-	fb->panel = versatile_clcd_get_panel(panel_name);
-	if (!fb->panel)
-		return -EINVAL;
-
-	return versatile_clcd_setup_dma(fb, SZ_1M);
-}
-
-static void versatile_clcd_decode(struct clcd_fb *fb, struct clcd_regs *regs)
-{
-	clcdfb_decode(fb, regs);
-
-	/* Always clear BGR for RGB565: we do the routing externally */
-	if (fb->fb.var.green.length == 6)
-		regs->cntl &= ~CNTL_BGR;
-}
-
-static struct clcd_board clcd_plat_data = {
-	.name		= "Versatile",
-	.caps		= CLCD_CAP_5551 | CLCD_CAP_565 | CLCD_CAP_888,
-	.check		= clcdfb_check,
-	.decode		= versatile_clcd_decode,
-	.disable	= versatile_clcd_disable,
-	.enable		= versatile_clcd_enable,
-	.setup		= versatile_clcd_setup,
-	.mmap		= versatile_clcd_mmap_dma,
-	.remove		= versatile_clcd_remove_dma,
-};
-
 /*
  * Lookup table for attaching a specific name and platform_data pointer to
  * devices as they get created by of_platform_populate().  Ideally this table
@@ -257,7 +101,6 @@ static struct clcd_board clcd_plat_data = {
 struct of_dev_auxdata versatile_auxdata_lookup[] __initdata = {
 	OF_DEV_AUXDATA("arm,primecell", VERSATILE_MMCI0_BASE, "fpga:05", &mmc0_plat_data),
 	OF_DEV_AUXDATA("arm,primecell", VERSATILE_MMCI1_BASE, "fpga:0b", &mmc1_plat_data),
-	OF_DEV_AUXDATA("arm,primecell", VERSATILE_CLCD_BASE, "dev:20", &clcd_plat_data),
 	{}
 };
 
diff --git a/include/linux/platform_data/video-clcd-versatile.h b/include/linux/platform_data/video-clcd-versatile.h
deleted file mode 100644
index 305ebaec3afd..000000000000
--- a/include/linux/platform_data/video-clcd-versatile.h
+++ /dev/null
@@ -1,28 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef PLAT_CLCD_H
-#define PLAT_CLCD_H
-
-#ifdef CONFIG_PLAT_VERSATILE_CLCD
-struct clcd_panel *versatile_clcd_get_panel(const char *);
-int versatile_clcd_setup_dma(struct clcd_fb *, unsigned long);
-int versatile_clcd_mmap_dma(struct clcd_fb *, struct vm_area_struct *);
-void versatile_clcd_remove_dma(struct clcd_fb *);
-#else
-static inline struct clcd_panel *versatile_clcd_get_panel(const char *s)
-{
-	return NULL;
-}
-static inline int versatile_clcd_setup_dma(struct clcd_fb *fb, unsigned long framesize)
-{
-	return -ENODEV;
-}
-static inline int versatile_clcd_mmap_dma(struct clcd_fb *fb, struct vm_area_struct *vm)
-{
-	return -ENODEV;
-}
-static inline void versatile_clcd_remove_dma(struct clcd_fb *fb)
-{
-}
-#endif
-
-#endif
-- 
cgit v1.2.3


From 2b2f7def058a5386838ef4dba70a860285f79e66 Mon Sep 17 00:00:00 2001
From: Tony Lindgren <tony@atomide.com>
Date: Mon, 27 May 2019 04:51:53 -0700
Subject: bus: ti-sysc: Add support for missing clockdomain handling

We need to let ti-sysc driver manage clockdomain autoidle for the
duration of reset, enable and idle. And we need to do it before we
enable the clock and after we disable it. Currently we are still
relying on platform callbacks indirectly managing clockdomain autoidle.
But I noticed that for device tree only probed drivers it now happens
only after we enable the clocks and before we disable the clocks,
while it should be the other way around. So far I have not noticed
any issues with this though.

Let's add new ti_sysc_clkdm_deny_idle() and ti_sysc_clkdm_allow_idle()
functions for ti-sysc driver to use to manage clockdomains directly via
platform data callbacks. Note that we can implement the clockdomain
functions in pdata-quirks.c as for probing devices without "ti,hwmods"
custom property we don't need to use the other platform data callbacks.

Let's do this in one patch as there is still an unlikely chance we
may need to apply this as a fix for v5.2 for dropping legacy platform
data for some devices. We also do have the option of adding back the
platform data if needed in case of trouble.

Tested-by: Keerthy <j-keerthy@ti.com>
Signed-off-by: Tony Lindgren <tony@atomide.com>
---
 arch/arm/mach-omap2/omap_hwmod.c      |  39 ++---------
 arch/arm/mach-omap2/pdata-quirks.c    |  60 ++++++++++++++++
 drivers/bus/ti-sysc.c                 | 127 +++++++++++++++++++++++++++-------
 include/linux/platform_data/ti-sysc.h |   8 +++
 4 files changed, 174 insertions(+), 60 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm/mach-omap2/omap_hwmod.c b/arch/arm/mach-omap2/omap_hwmod.c
index 405ac24def05..932ba221e8e7 100644
--- a/arch/arm/mach-omap2/omap_hwmod.c
+++ b/arch/arm/mach-omap2/omap_hwmod.c
@@ -3445,6 +3445,7 @@ static int omap_hwmod_check_module(struct device *dev,
  * @dev: struct device
  * @oh: module
  * @sysc_fields: sysc register bits
+ * @clkdm: clockdomain
  * @rev_offs: revision register offset
  * @sysc_offs: sysconfig register offset
  * @syss_offs: sysstatus register offset
@@ -3456,6 +3457,7 @@ static int omap_hwmod_check_module(struct device *dev,
 static int omap_hwmod_allocate_module(struct device *dev, struct omap_hwmod *oh,
 				      const struct ti_sysc_module_data *data,
 				      struct sysc_regbits *sysc_fields,
+				      struct clockdomain *clkdm,
 				      s32 rev_offs, s32 sysc_offs,
 				      s32 syss_offs, u32 sysc_flags,
 				      u32 idlemodes)
@@ -3463,8 +3465,6 @@ static int omap_hwmod_allocate_module(struct device *dev, struct omap_hwmod *oh,
 	struct omap_hwmod_class_sysconfig *sysc;
 	struct omap_hwmod_class *class = NULL;
 	struct omap_hwmod_ocp_if *oi = NULL;
-	struct clockdomain *clkdm = NULL;
-	struct clk *clk = NULL;
 	void __iomem *regs = NULL;
 	unsigned long flags;
 
@@ -3511,36 +3511,6 @@ static int omap_hwmod_allocate_module(struct device *dev, struct omap_hwmod *oh,
 		oi->user = OCP_USER_MPU | OCP_USER_SDMA;
 	}
 
-	if (!oh->_clk) {
-		struct clk_hw_omap *hwclk;
-
-		clk = of_clk_get_by_name(dev->of_node, "fck");
-		if (!IS_ERR(clk))
-			clk_prepare(clk);
-		else
-			clk = NULL;
-
-		/*
-		 * Populate clockdomain based on dts clock. It is needed for
-		 * clkdm_deny_idle() and clkdm_allow_idle() until we have have
-		 * interconnect driver and reset driver capable of blocking
-		 * clockdomain idle during reset, enable and idle.
-		 */
-		if (clk) {
-			hwclk = to_clk_hw_omap(__clk_get_hw(clk));
-			if (hwclk && hwclk->clkdm_name)
-				clkdm = clkdm_lookup(hwclk->clkdm_name);
-		}
-
-		/*
-		 * Note that we assume interconnect driver manages the clocks
-		 * and do not need to populate oh->_clk for dynamically
-		 * allocated modules.
-		 */
-		clk_unprepare(clk);
-		clk_put(clk);
-	}
-
 	spin_lock_irqsave(&oh->_lock, flags);
 	if (regs)
 		oh->_mpu_rt_va = regs;
@@ -3626,7 +3596,7 @@ int omap_hwmod_init_module(struct device *dev,
 	u32 sysc_flags, idlemodes;
 	int error;
 
-	if (!dev || !data)
+	if (!dev || !data || !data->name || !cookie)
 		return -EINVAL;
 
 	oh = _lookup(data->name);
@@ -3697,7 +3667,8 @@ int omap_hwmod_init_module(struct device *dev,
 		return error;
 
 	return omap_hwmod_allocate_module(dev, oh, data, sysc_fields,
-					  rev_offs, sysc_offs, syss_offs,
+					  cookie->clkdm, rev_offs,
+					  sysc_offs, syss_offs,
 					  sysc_flags, idlemodes);
 }
 
diff --git a/arch/arm/mach-omap2/pdata-quirks.c b/arch/arm/mach-omap2/pdata-quirks.c
index a2ecc5e69abb..b09cc4e8d240 100644
--- a/arch/arm/mach-omap2/pdata-quirks.c
+++ b/arch/arm/mach-omap2/pdata-quirks.c
@@ -29,6 +29,7 @@
 #include <linux/platform_data/wkup_m3.h>
 #include <linux/platform_data/asoc-ti-mcbsp.h>
 
+#include "clockdomain.h"
 #include "common.h"
 #include "common-board-devices.h"
 #include "control.h"
@@ -463,6 +464,62 @@ static void __init dra7x_evm_mmc_quirk(void)
 }
 #endif
 
+static struct clockdomain *ti_sysc_find_one_clockdomain(struct clk *clk)
+{
+	struct clockdomain *clkdm = NULL;
+	struct clk_hw_omap *hwclk;
+
+	hwclk = to_clk_hw_omap(__clk_get_hw(clk));
+	if (hwclk && hwclk->clkdm_name)
+		clkdm = clkdm_lookup(hwclk->clkdm_name);
+
+	return clkdm;
+}
+
+/**
+ * ti_sysc_clkdm_init - find clockdomain based on clock
+ * @fck: device functional clock
+ * @ick: device interface clock
+ * @dev: struct device
+ *
+ * Populate clockdomain based on clock. It is needed for
+ * clkdm_deny_idle() and clkdm_allow_idle() for blocking clockdomain
+ * idle during reset, enable and idle.
+ *
+ * Note that we assume interconnect driver manages the clocks
+ * and do not need to populate oh->_clk for dynamically
+ * allocated modules.
+ */
+static int ti_sysc_clkdm_init(struct device *dev,
+			      struct clk *fck, struct clk *ick,
+			      struct ti_sysc_cookie *cookie)
+{
+	if (fck)
+		cookie->clkdm = ti_sysc_find_one_clockdomain(fck);
+	if (cookie->clkdm)
+		return 0;
+	if (ick)
+		cookie->clkdm = ti_sysc_find_one_clockdomain(ick);
+	if (cookie->clkdm)
+		return 0;
+
+	return -ENODEV;
+}
+
+static void ti_sysc_clkdm_deny_idle(struct device *dev,
+				    const struct ti_sysc_cookie *cookie)
+{
+	if (cookie->clkdm)
+		clkdm_deny_idle(cookie->clkdm);
+}
+
+static void ti_sysc_clkdm_allow_idle(struct device *dev,
+				     const struct ti_sysc_cookie *cookie)
+{
+	if (cookie->clkdm)
+		clkdm_allow_idle(cookie->clkdm);
+}
+
 static int ti_sysc_enable_module(struct device *dev,
 				 const struct ti_sysc_cookie *cookie)
 {
@@ -494,6 +551,9 @@ static struct of_dev_auxdata omap_auxdata_lookup[];
 
 static struct ti_sysc_platform_data ti_sysc_pdata = {
 	.auxdata = omap_auxdata_lookup,
+	.init_clockdomain = ti_sysc_clkdm_init,
+	.clkdm_deny_idle = ti_sysc_clkdm_deny_idle,
+	.clkdm_allow_idle = ti_sysc_clkdm_allow_idle,
 	.init_module = omap_hwmod_init_module,
 	.enable_module = ti_sysc_enable_module,
 	.idle_module = ti_sysc_idle_module,
diff --git a/drivers/bus/ti-sysc.c b/drivers/bus/ti-sysc.c
index b72741668c92..e86f7850206a 100644
--- a/drivers/bus/ti-sysc.c
+++ b/drivers/bus/ti-sysc.c
@@ -422,6 +422,30 @@ static void sysc_disable_opt_clocks(struct sysc *ddata)
 	}
 }
 
+static void sysc_clkdm_deny_idle(struct sysc *ddata)
+{
+	struct ti_sysc_platform_data *pdata;
+
+	if (ddata->legacy_mode)
+		return;
+
+	pdata = dev_get_platdata(ddata->dev);
+	if (pdata && pdata->clkdm_deny_idle)
+		pdata->clkdm_deny_idle(ddata->dev, &ddata->cookie);
+}
+
+static void sysc_clkdm_allow_idle(struct sysc *ddata)
+{
+	struct ti_sysc_platform_data *pdata;
+
+	if (ddata->legacy_mode)
+		return;
+
+	pdata = dev_get_platdata(ddata->dev);
+	if (pdata && pdata->clkdm_allow_idle)
+		pdata->clkdm_allow_idle(ddata->dev, &ddata->cookie);
+}
+
 /**
  * sysc_init_resets - init rstctrl reset line if configured
  * @ddata: device driver data
@@ -795,6 +819,7 @@ static void sysc_show_registers(struct sysc *ddata)
 
 #define SYSC_IDLE_MASK	(SYSC_NR_IDLEMODES - 1)
 
+/* Caller needs to manage sysc_clkdm_deny_idle() and sysc_clkdm_allow_idle() */
 static int sysc_enable_module(struct device *dev)
 {
 	struct sysc *ddata;
@@ -805,11 +830,6 @@ static int sysc_enable_module(struct device *dev)
 	if (ddata->offsets[SYSC_SYSCONFIG] == -ENODEV)
 		return 0;
 
-	/*
-	 * TODO: Need to prevent clockdomain autoidle?
-	 * See clkdm_deny_idle() in arch/mach-omap2/omap_hwmod.c
-	 */
-
 	regbits = ddata->cap->regbits;
 	reg = sysc_read(ddata, ddata->offsets[SYSC_SYSCONFIG]);
 
@@ -861,6 +881,7 @@ static int sysc_best_idle_mode(u32 idlemodes, u32 *best_mode)
 	return 0;
 }
 
+/* Caller needs to manage sysc_clkdm_deny_idle() and sysc_clkdm_allow_idle() */
 static int sysc_disable_module(struct device *dev)
 {
 	struct sysc *ddata;
@@ -872,11 +893,6 @@ static int sysc_disable_module(struct device *dev)
 	if (ddata->offsets[SYSC_SYSCONFIG] == -ENODEV)
 		return 0;
 
-	/*
-	 * TODO: Need to prevent clockdomain autoidle?
-	 * See clkdm_deny_idle() in arch/mach-omap2/omap_hwmod.c
-	 */
-
 	regbits = ddata->cap->regbits;
 	reg = sysc_read(ddata, ddata->offsets[SYSC_SYSCONFIG]);
 
@@ -966,14 +982,16 @@ static int __maybe_unused sysc_runtime_suspend(struct device *dev)
 	if (!ddata->enabled)
 		return 0;
 
+	sysc_clkdm_deny_idle(ddata);
+
 	if (ddata->legacy_mode) {
 		error = sysc_runtime_suspend_legacy(dev, ddata);
 		if (error)
-			return error;
+			goto err_allow_idle;
 	} else {
 		error = sysc_disable_module(dev);
 		if (error)
-			return error;
+			goto err_allow_idle;
 	}
 
 	sysc_disable_main_clocks(ddata);
@@ -983,6 +1001,9 @@ static int __maybe_unused sysc_runtime_suspend(struct device *dev)
 
 	ddata->enabled = false;
 
+err_allow_idle:
+	sysc_clkdm_allow_idle(ddata);
+
 	return error;
 }
 
@@ -996,10 +1017,12 @@ static int __maybe_unused sysc_runtime_resume(struct device *dev)
 	if (ddata->enabled)
 		return 0;
 
+	sysc_clkdm_deny_idle(ddata);
+
 	if (sysc_opt_clks_needed(ddata)) {
 		error = sysc_enable_opt_clocks(ddata);
 		if (error)
-			return error;
+			goto err_allow_idle;
 	}
 
 	error = sysc_enable_main_clocks(ddata);
@@ -1018,6 +1041,8 @@ static int __maybe_unused sysc_runtime_resume(struct device *dev)
 
 	ddata->enabled = true;
 
+	sysc_clkdm_allow_idle(ddata);
+
 	return 0;
 
 err_main_clocks:
@@ -1025,6 +1050,8 @@ err_main_clocks:
 err_opt_clocks:
 	if (sysc_opt_clks_needed(ddata))
 		sysc_disable_opt_clocks(ddata);
+err_allow_idle:
+	sysc_clkdm_allow_idle(ddata);
 
 	return error;
 }
@@ -1245,6 +1272,33 @@ static void sysc_init_revision_quirks(struct sysc *ddata)
 	}
 }
 
+static int sysc_clockdomain_init(struct sysc *ddata)
+{
+	struct ti_sysc_platform_data *pdata = dev_get_platdata(ddata->dev);
+	struct clk *fck = NULL, *ick = NULL;
+	int error;
+
+	if (!pdata || !pdata->init_clockdomain)
+		return 0;
+
+	switch (ddata->nr_clocks) {
+	case 2:
+		ick = ddata->clocks[SYSC_ICK];
+		/* fallthrough */
+	case 1:
+		fck = ddata->clocks[SYSC_FCK];
+		break;
+	case 0:
+		return 0;
+	}
+
+	error = pdata->init_clockdomain(ddata->dev, fck, ick, &ddata->cookie);
+	if (!error || error == -ENODEV)
+		return 0;
+
+	return error;
+}
+
 /*
  * Note that pdata->init_module() typically does a reset first. After
  * pdata->init_module() is done, PM runtime can be used for the interconnect
@@ -1255,7 +1309,7 @@ static int sysc_legacy_init(struct sysc *ddata)
 	struct ti_sysc_platform_data *pdata = dev_get_platdata(ddata->dev);
 	int error;
 
-	if (!ddata->legacy_mode || !pdata || !pdata->init_module)
+	if (!pdata || !pdata->init_module)
 		return 0;
 
 	error = pdata->init_module(ddata->dev, ddata->mdata, &ddata->cookie);
@@ -1347,7 +1401,13 @@ static int sysc_init_module(struct sysc *ddata)
 	    (SYSC_QUIRK_NO_IDLE | SYSC_QUIRK_NO_IDLE_ON_INIT))
 		manage_clocks = false;
 
+	error = sysc_clockdomain_init(ddata);
+	if (error)
+		return error;
+
 	if (manage_clocks) {
+		sysc_clkdm_deny_idle(ddata);
+
 		error = sysc_enable_opt_clocks(ddata);
 		if (error)
 			return error;
@@ -1360,20 +1420,33 @@ static int sysc_init_module(struct sysc *ddata)
 	ddata->revision = sysc_read_revision(ddata);
 	sysc_init_revision_quirks(ddata);
 
-	error = sysc_legacy_init(ddata);
-	if (error)
-		goto err_main_clocks;
+	if (ddata->legacy_mode) {
+		error = sysc_legacy_init(ddata);
+		if (error)
+			goto err_main_clocks;
+	}
+
+	if (!ddata->legacy_mode && manage_clocks) {
+		error = sysc_enable_module(ddata->dev);
+		if (error)
+			goto err_main_clocks;
+	}
 
 	error = sysc_reset(ddata);
 	if (error)
 		dev_err(ddata->dev, "Reset failed with %d\n", error);
 
+	if (!ddata->legacy_mode && manage_clocks)
+		sysc_disable_module(ddata->dev);
+
 err_main_clocks:
 	if (manage_clocks)
 		sysc_disable_main_clocks(ddata);
 err_opt_clocks:
-	if (manage_clocks)
+	if (manage_clocks) {
 		sysc_disable_opt_clocks(ddata);
+		sysc_clkdm_allow_idle(ddata);
+	}
 
 	return error;
 }
@@ -2012,20 +2085,22 @@ static int sysc_init_pdata(struct sysc *ddata)
 	struct ti_sysc_platform_data *pdata = dev_get_platdata(ddata->dev);
 	struct ti_sysc_module_data *mdata;
 
-	if (!pdata || !ddata->legacy_mode)
+	if (!pdata)
 		return 0;
 
 	mdata = devm_kzalloc(ddata->dev, sizeof(*mdata), GFP_KERNEL);
 	if (!mdata)
 		return -ENOMEM;
 
-	mdata->name = ddata->legacy_mode;
-	mdata->module_pa = ddata->module_pa;
-	mdata->module_size = ddata->module_size;
-	mdata->offsets = ddata->offsets;
-	mdata->nr_offsets = SYSC_MAX_REGS;
-	mdata->cap = ddata->cap;
-	mdata->cfg = &ddata->cfg;
+	if (ddata->legacy_mode) {
+		mdata->name = ddata->legacy_mode;
+		mdata->module_pa = ddata->module_pa;
+		mdata->module_size = ddata->module_size;
+		mdata->offsets = ddata->offsets;
+		mdata->nr_offsets = SYSC_MAX_REGS;
+		mdata->cap = ddata->cap;
+		mdata->cfg = &ddata->cfg;
+	}
 
 	ddata->mdata = mdata;
 
diff --git a/include/linux/platform_data/ti-sysc.h b/include/linux/platform_data/ti-sysc.h
index 9256c0305968..6626fd31e309 100644
--- a/include/linux/platform_data/ti-sysc.h
+++ b/include/linux/platform_data/ti-sysc.h
@@ -19,6 +19,7 @@ enum ti_sysc_module_type {
 
 struct ti_sysc_cookie {
 	void *data;
+	void *clkdm;
 };
 
 /**
@@ -125,9 +126,16 @@ struct ti_sysc_module_data {
 };
 
 struct device;
+struct clk;
 
 struct ti_sysc_platform_data {
 	struct of_dev_auxdata *auxdata;
+	int (*init_clockdomain)(struct device *dev, struct clk *fck,
+				struct clk *ick, struct ti_sysc_cookie *cookie);
+	void (*clkdm_deny_idle)(struct device *dev,
+				const struct ti_sysc_cookie *cookie);
+	void (*clkdm_allow_idle)(struct device *dev,
+				 const struct ti_sysc_cookie *cookie);
 	int (*init_module)(struct device *dev,
 			   const struct ti_sysc_module_data *data,
 			   struct ti_sysc_cookie *cookie);
-- 
cgit v1.2.3


From e0db94fe87dacd72be0699adcc29e321db7f1689 Mon Sep 17 00:00:00 2001
From: Tony Lindgren <tony@atomide.com>
Date: Mon, 27 May 2019 04:51:53 -0700
Subject: bus: ti-sysc: Make OCP reset work for sysstatus and sysconfig reset
 bits

We've had minimal OCP softreset support in ti-sysc interconnect target
module driver only used for MCAN driver so far. But it turns out that
MCAN has the sysstatus register resetdone bit inverted compared to most
other modules.

Let's make OCP softreset work for other typical cases with reset status
in sysstatus or sysconfig register so we can use the new functions for
sysc_enable_module() and sysc_disable_module() without "ti,hwmods"
property in the following patches.

Tested-by: Keerthy <j-keerthy@ti.com>
Signed-off-by: Tony Lindgren <tony@atomide.com>
---
 drivers/bus/ti-sysc.c                 | 72 ++++++++++++++++++++++++++---------
 include/linux/platform_data/ti-sysc.h |  1 +
 2 files changed, 55 insertions(+), 18 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/bus/ti-sysc.c b/drivers/bus/ti-sysc.c
index f00997eea207..f4a048430cd1 100644
--- a/drivers/bus/ti-sysc.c
+++ b/drivers/bus/ti-sysc.c
@@ -153,6 +153,26 @@ static u32 sysc_read_revision(struct sysc *ddata)
 	return sysc_read(ddata, offset);
 }
 
+static u32 sysc_read_sysconfig(struct sysc *ddata)
+{
+	int offset = ddata->offsets[SYSC_SYSCONFIG];
+
+	if (offset < 0)
+		return 0;
+
+	return sysc_read(ddata, offset);
+}
+
+static u32 sysc_read_sysstatus(struct sysc *ddata)
+{
+	int offset = ddata->offsets[SYSC_SYSSTATUS];
+
+	if (offset < 0)
+		return 0;
+
+	return sysc_read(ddata, offset);
+}
+
 static int sysc_add_named_clock_from_child(struct sysc *ddata,
 					   const char *name,
 					   const char *optfck_name)
@@ -1369,34 +1389,49 @@ static int sysc_rstctrl_reset_deassert(struct sysc *ddata, bool reset)
 	return reset_control_deassert(ddata->rsts);
 }
 
+/*
+ * Note that the caller must ensure the interconnect target module is enabled
+ * before calling reset. Otherwise reset will not complete.
+ */
 static int sysc_reset(struct sysc *ddata)
 {
-	int offset = ddata->offsets[SYSC_SYSCONFIG];
-	int val;
+	int sysc_offset, syss_offset, sysc_val, rstval, quirks, error = 0;
+	u32 sysc_mask, syss_done;
+
+	sysc_offset = ddata->offsets[SYSC_SYSCONFIG];
+	syss_offset = ddata->offsets[SYSC_SYSSTATUS];
+	quirks = ddata->cfg.quirks;
 
-	if (ddata->legacy_mode || offset < 0 ||
+	if (ddata->legacy_mode || sysc_offset < 0 ||
+	    ddata->cap->regbits->srst_shift < 0 ||
 	    ddata->cfg.quirks & SYSC_QUIRK_NO_RESET_ON_INIT)
 		return 0;
 
-	/*
-	 * Currently only support reset status in sysstatus.
-	 * Warn and return error in all other cases
-	 */
-	if (!ddata->cfg.syss_mask) {
-		dev_err(ddata->dev, "No ti,syss-mask. Reset failed\n");
-		return -EINVAL;
-	}
+	sysc_mask = BIT(ddata->cap->regbits->srst_shift);
 
-	val = sysc_read(ddata, offset);
-	val |= (0x1 << ddata->cap->regbits->srst_shift);
-	sysc_write(ddata, offset, val);
+	if (ddata->cfg.quirks & SYSS_QUIRK_RESETDONE_INVERTED)
+		syss_done = 0;
+	else
+		syss_done = ddata->cfg.syss_mask;
+
+	sysc_val = sysc_read_sysconfig(ddata);
+	sysc_val |= sysc_mask;
+	sysc_write(ddata, sysc_offset, sysc_val);
 
 	/* Poll on reset status */
-	offset = ddata->offsets[SYSC_SYSSTATUS];
+	if (syss_offset >= 0) {
+		error = readx_poll_timeout(sysc_read_sysstatus, ddata, rstval,
+					   (rstval & ddata->cfg.syss_mask) ==
+					   syss_done,
+					   100, MAX_MODULE_SOFTRESET_WAIT);
+
+	} else if (ddata->cfg.quirks & SYSC_QUIRK_RESET_STATUS) {
+		error = readx_poll_timeout(sysc_read_sysconfig, ddata, rstval,
+					   !(rstval & sysc_mask),
+					   100, MAX_MODULE_SOFTRESET_WAIT);
+	}
 
-	return readl_poll_timeout(ddata->module_va + offset, val,
-				  (val & ddata->cfg.syss_mask) == 0x0,
-				  100, MAX_MODULE_SOFTRESET_WAIT);
+	return error;
 }
 
 /*
@@ -2099,6 +2134,7 @@ static const struct sysc_capabilities sysc_dra7_mcan = {
 	.type = TI_SYSC_DRA7_MCAN,
 	.sysc_mask = SYSC_DRA7_MCAN_ENAWAKEUP | SYSC_OMAP4_SOFTRESET,
 	.regbits = &sysc_regbits_dra7_mcan,
+	.mod_quirks = SYSS_QUIRK_RESETDONE_INVERTED,
 };
 
 static int sysc_init_pdata(struct sysc *ddata)
diff --git a/include/linux/platform_data/ti-sysc.h b/include/linux/platform_data/ti-sysc.h
index 6626fd31e309..8822e99ff813 100644
--- a/include/linux/platform_data/ti-sysc.h
+++ b/include/linux/platform_data/ti-sysc.h
@@ -47,6 +47,7 @@ struct sysc_regbits {
 	s8 emufree_shift;
 };
 
+#define SYSS_QUIRK_RESETDONE_INVERTED	BIT(14)
 #define SYSC_QUIRK_SWSUP_MSTANDBY	BIT(13)
 #define SYSC_QUIRK_SWSUP_SIDLE_ACT	BIT(12)
 #define SYSC_QUIRK_SWSUP_SIDLE		BIT(11)
-- 
cgit v1.2.3


From f34e1176df34b87e88eb65cd730255c913f92f8c Mon Sep 17 00:00:00 2001
From: Weitao Hou <houweitaoo@gmail.com>
Date: Mon, 20 May 2019 13:09:48 +0800
Subject: iommu/vt-d: Fix typo in SVM code comment

Fix 'acccess' to 'access'.

Signed-off-by: Weitao Hou <houweitaoo@gmail.com>
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 include/linux/intel-svm.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/intel-svm.h b/include/linux/intel-svm.h
index e3f76315ca4d..8dfead70699c 100644
--- a/include/linux/intel-svm.h
+++ b/include/linux/intel-svm.h
@@ -57,7 +57,7 @@ struct svm_dev_ops {
 
 /**
  * intel_svm_bind_mm() - Bind the current process to a PASID
- * @dev:	Device to be granted acccess
+ * @dev:	Device to be granted access
  * @pasid:	Address for allocated PASID
  * @flags:	Flags. Later for requesting supervisor mode, etc.
  * @ops:	Callbacks to device driver
-- 
cgit v1.2.3


From 621dc2fdcea1496ddd472c297d42e8d6642b05bc Mon Sep 17 00:00:00 2001
From: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Date: Tue, 28 May 2019 15:36:44 +0200
Subject: acpi/irq: Implement helper to create hierarchical domains

ACPI permits arbitrary producer->consumer interrupt links to be
described in AML, which means a topology such as the following
is perfectly legal:

  Device (EXIU) {
    Name (_HID, "SCX0008")
    Name (_UID, Zero)
    Name (_CRS, ResourceTemplate () {
      ...
    })
  }

  Device (GPIO) {
    Name (_HID, "SCX0007")
    Name (_UID, Zero)
    Name (_CRS, ResourceTemplate () {
      Memory32Fixed (ReadWrite, SYNQUACER_GPIO_BASE, SYNQUACER_GPIO_SIZE)
      Interrupt (ResourceConsumer, Edge, ActiveHigh, ExclusiveAndWake, 0, "\\_SB.EXIU") {
        7,
      }
    })
    ...
  }

The EXIU in this example is the external interrupt unit as can be found
on Socionext SynQuacer based platforms, which converts a block of 32 SPIs
from arbitrary polarity/trigger into level-high, with a separate set
of config/mask/unmask/clear controls.

The existing DT based driver in drivers/irqchip/irq-sni-exiu.c models
this as a hierarchical domain stacked on top of the GIC's irqdomain.
Since the GIC is modeled as a DT node as well, obtaining a reference
to this irqdomain is easily done by going through the parent link.

On ACPI systems, however, the GIC is not modeled as an object in the
namespace, and so device objects cannot refer to it directly. So in
order to obtain the irqdomain reference when driving the EXIU in ACPI
mode, we need a helper that implicitly grabs the default domain as the
parent of the hierarchy for interrupts allocated out of the global GSI
pool.

Reviewed-by: Mika Westerberg <mika.westerberg@linux.intel.com>
Reviewed-by: Lorenzo Pieralisi <lorenzo.pieralisi@arm.com>
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
---
 drivers/acpi/irq.c   | 26 ++++++++++++++++++++++++++
 include/linux/acpi.h |  7 +++++++
 2 files changed, 33 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/acpi/irq.c b/drivers/acpi/irq.c
index c3b2222e2129..ce6b25a3b7a7 100644
--- a/drivers/acpi/irq.c
+++ b/drivers/acpi/irq.c
@@ -295,3 +295,29 @@ void __init acpi_set_irq_model(enum acpi_irq_model_id model,
 	acpi_irq_model = model;
 	acpi_gsi_domain_id = fwnode;
 }
+
+/**
+ * acpi_irq_create_hierarchy - Create a hierarchical IRQ domain with the default
+ *                             GSI domain as its parent.
+ * @flags:      Irq domain flags associated with the domain
+ * @size:       Size of the domain.
+ * @fwnode:     Optional fwnode of the interrupt controller
+ * @ops:        Pointer to the interrupt domain callbacks
+ * @host_data:  Controller private data pointer
+ */
+struct irq_domain *acpi_irq_create_hierarchy(unsigned int flags,
+					     unsigned int size,
+					     struct fwnode_handle *fwnode,
+					     const struct irq_domain_ops *ops,
+					     void *host_data)
+{
+	struct irq_domain *d = irq_find_matching_fwnode(acpi_gsi_domain_id,
+							DOMAIN_BUS_ANY);
+
+	if (!d)
+		return NULL;
+
+	return irq_domain_create_hierarchy(d, flags, size, fwnode, ops,
+					   host_data);
+}
+EXPORT_SYMBOL_GPL(acpi_irq_create_hierarchy);
diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index 98440df7fe42..70de4bc30cea 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -23,6 +23,7 @@
 
 #include <linux/errno.h>
 #include <linux/ioport.h>	/* for struct resource */
+#include <linux/irqdomain.h>
 #include <linux/resource_ext.h>
 #include <linux/device.h>
 #include <linux/property.h>
@@ -327,6 +328,12 @@ int acpi_isa_irq_to_gsi (unsigned isa_irq, u32 *gsi);
 void acpi_set_irq_model(enum acpi_irq_model_id model,
 			struct fwnode_handle *fwnode);
 
+struct irq_domain *acpi_irq_create_hierarchy(unsigned int flags,
+					     unsigned int size,
+					     struct fwnode_handle *fwnode,
+					     const struct irq_domain_ops *ops,
+					     void *host_data);
+
 #ifdef CONFIG_X86_IO_APIC
 extern int acpi_get_override_irq(u32 gsi, int *trigger, int *polarity);
 #else
-- 
cgit v1.2.3


From 12edff045bc6dd3ab1565cc02fa4841803c2a633 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.ibm.com>
Date: Tue, 9 Apr 2019 07:48:18 -0700
Subject: rcu: Make kfree_rcu() ignore NULL pointers

This commit makes the kfree_rcu() macro's semantics be consistent
with the likes of kfree() by adding a check for NULL pointers, so
that kfree_rcu(NULL, ...) is a no-op.

Reported-by: Andriy Shevchenko <andriy.shevchenko@linux.intel.com>
Reported-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Paul E. McKenney <paulmck@linux.ibm.com>
Reviewed-by: Andriy Shevchenko <andriy.shevchenko@linux.intel.com>
---
 include/linux/rcupdate.h | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 922bb6848813..915460ec0872 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -805,7 +805,7 @@ static inline notrace void rcu_read_unlock_sched_notrace(void)
 /**
  * kfree_rcu() - kfree an object after a grace period.
  * @ptr:	pointer to kfree
- * @rcu_head:	the name of the struct rcu_head within the type of @ptr.
+ * @rhf:	the name of the struct rcu_head within the type of @ptr.
  *
  * Many rcu callbacks functions just call kfree() on the base structure.
  * These functions are trivial, but their size adds up, and furthermore
@@ -828,9 +828,13 @@ static inline notrace void rcu_read_unlock_sched_notrace(void)
  * The BUILD_BUG_ON check must not involve any function calls, hence the
  * checks are done in macros here.
  */
-#define kfree_rcu(ptr, rcu_head)					\
-	__kfree_rcu(&((ptr)->rcu_head), offsetof(typeof(*(ptr)), rcu_head))
-
+#define kfree_rcu(ptr, rhf)						\
+do {									\
+	typeof (ptr) ___p = (ptr);					\
+									\
+	if (___p)							\
+		__kfree_rcu(&((___p)->rhf), offsetof(typeof(*(ptr)), rhf)); \
+} while (0)
 
 /*
  * Place this after a lock-acquisition primitive to guarantee that
-- 
cgit v1.2.3


From fe15b50cdeeebd9248bf27e3c31278668f08bc04 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.ibm.com>
Date: Fri, 5 Apr 2019 16:15:00 -0700
Subject: srcu: Allocate per-CPU data for DEFINE_SRCU() in modules

Adding DEFINE_SRCU() or DEFINE_STATIC_SRCU() to a loadable module requires
that the size of the reserved region be increased, which is not something
we want to be doing all that often.  One approach would be to require
that loadable modules define an srcu_struct and invoke init_srcu_struct()
from their module_init function and cleanup_srcu_struct() from their
module_exit function.  However, this is more than a bit user unfriendly.

This commit therefore creates an ___srcu_struct_ptrs linker section,
and pointers to srcu_struct structures created by DEFINE_SRCU() and
DEFINE_STATIC_SRCU() within a module are placed into that module's
___srcu_struct_ptrs section.  The required init_srcu_struct() and
cleanup_srcu_struct() functions are then automatically invoked as needed
when that module is loaded and unloaded, thus allowing modules to continue
to use DEFINE_SRCU() and DEFINE_STATIC_SRCU() while avoiding the need
to increase the size of the reserved region.

Many of the algorithms and some of the code were cheerfully cherry-picked
from other code making use of linker sections, perhaps most notably from
tracepoints.  All bugs are nevertheless the sole property of the author.

Suggested-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
[ paulmck: Use __section() and use "default" in srcu_module_notify()'s
  "switch" statement as suggested by Joel Fernandes. ]
Signed-off-by: Paul E. McKenney <paulmck@linux.ibm.com>
Tested-by: Joel Fernandes (Google) <joel@joelfernandes.org>
---
 include/asm-generic/vmlinux.lds.h |  4 +++
 include/linux/module.h            |  5 +++
 include/linux/srcutree.h          | 14 +++++++--
 kernel/module.c                   |  5 +++
 kernel/rcu/srcutree.c             | 65 +++++++++++++++++++++++++++++++++++++++
 5 files changed, 90 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index 088987e9a3ea..ba1ad39468fc 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -337,6 +337,10 @@
 		KEEP(*(__tracepoints_ptrs)) /* Tracepoints: pointer array */ \
 		__stop___tracepoints_ptrs = .;				\
 		*(__tracepoints_strings)/* Tracepoints: strings */	\
+		. = ALIGN(8);						\
+		__start___srcu_struct = .;				\
+		*(___srcu_struct_ptrs)					\
+		__end___srcu_struct = .;				\
 	}								\
 									\
 	.rodata1          : AT(ADDR(.rodata1) - LOAD_OFFSET) {		\
diff --git a/include/linux/module.h b/include/linux/module.h
index 188998d3dca9..1455812dd325 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -21,6 +21,7 @@
 #include <linux/rbtree_latch.h>
 #include <linux/error-injection.h>
 #include <linux/tracepoint-defs.h>
+#include <linux/srcu.h>
 
 #include <linux/percpu.h>
 #include <asm/module.h>
@@ -450,6 +451,10 @@ struct module {
 	unsigned int num_tracepoints;
 	tracepoint_ptr_t *tracepoints_ptrs;
 #endif
+#ifdef CONFIG_TREE_SRCU
+	unsigned int num_srcu_structs;
+	struct srcu_struct **srcu_struct_ptrs;
+#endif
 #ifdef CONFIG_BPF_EVENTS
 	unsigned int num_bpf_raw_events;
 	struct bpf_raw_event_map *bpf_raw_events;
diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h
index 7f7c8c050f63..8af1824c46a8 100644
--- a/include/linux/srcutree.h
+++ b/include/linux/srcutree.h
@@ -120,9 +120,17 @@ struct srcu_struct {
  *
  * See include/linux/percpu-defs.h for the rules on per-CPU variables.
  */
-#define __DEFINE_SRCU(name, is_static)					\
-	static DEFINE_PER_CPU(struct srcu_data, name##_srcu_data);\
-	is_static struct srcu_struct name = __SRCU_STRUCT_INIT(name, name##_srcu_data)
+#ifdef MODULE
+# define __DEFINE_SRCU(name, is_static)					\
+	is_static struct srcu_struct name;				\
+	struct srcu_struct *__srcu_struct_##name			\
+		__section("___srcu_struct_ptrs") = &name
+#else
+# define __DEFINE_SRCU(name, is_static)					\
+	static DEFINE_PER_CPU(struct srcu_data, name##_srcu_data);	\
+	is_static struct srcu_struct name =				\
+		__SRCU_STRUCT_INIT(name, name##_srcu_data)
+#endif
 #define DEFINE_SRCU(name)		__DEFINE_SRCU(name, /* not static */)
 #define DEFINE_STATIC_SRCU(name)	__DEFINE_SRCU(name, static)
 
diff --git a/kernel/module.c b/kernel/module.c
index 6e6712b3aaf5..c79a53b629b6 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -3095,6 +3095,11 @@ static int find_module_sections(struct module *mod, struct load_info *info)
 					     sizeof(*mod->tracepoints_ptrs),
 					     &mod->num_tracepoints);
 #endif
+#ifdef CONFIG_TREE_SRCU
+	mod->srcu_struct_ptrs = section_objs(info, "___srcu_struct_ptrs",
+					     sizeof(*mod->srcu_struct_ptrs),
+					     &mod->num_srcu_structs);
+#endif
 #ifdef CONFIG_BPF_EVENTS
 	mod->bpf_raw_events = section_objs(info, "__bpf_raw_tp_map",
 					   sizeof(*mod->bpf_raw_events),
diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index 9b761e546de8..2ded2614a2f4 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -1310,3 +1310,68 @@ void __init srcu_init(void)
 		queue_work(rcu_gp_wq, &ssp->work.work);
 	}
 }
+
+#ifdef CONFIG_MODULES
+
+/* Initialize any global-scope srcu_struct structures used by this module. */
+static int srcu_module_coming(struct module *mod)
+{
+	int i;
+	struct srcu_struct **sspp = mod->srcu_struct_ptrs;
+	int ret;
+
+	for (i = 0; i < mod->num_srcu_structs; i++) {
+		ret = init_srcu_struct(*(sspp++));
+		if (WARN_ON_ONCE(ret))
+			return ret;
+	}
+	return 0;
+}
+
+/* Clean up any global-scope srcu_struct structures used by this module. */
+static void srcu_module_going(struct module *mod)
+{
+	int i;
+	struct srcu_struct **sspp = mod->srcu_struct_ptrs;
+
+	for (i = 0; i < mod->num_srcu_structs; i++)
+		cleanup_srcu_struct(*(sspp++));
+}
+
+/* Handle one module, either coming or going. */
+static int srcu_module_notify(struct notifier_block *self,
+			      unsigned long val, void *data)
+{
+	struct module *mod = data;
+	int ret = 0;
+
+	switch (val) {
+	case MODULE_STATE_COMING:
+		ret = srcu_module_coming(mod);
+		break;
+	case MODULE_STATE_GOING:
+		srcu_module_going(mod);
+		break;
+	default:
+		break;
+	}
+	return ret;
+}
+
+static struct notifier_block srcu_module_nb = {
+	.notifier_call = srcu_module_notify,
+	.priority = 0,
+};
+
+static __init int init_srcu_module_notifier(void)
+{
+	int ret;
+
+	ret = register_module_notifier(&srcu_module_nb);
+	if (ret)
+		pr_warn("Failed to register srcu module notifier\n");
+	return ret;
+}
+late_initcall(init_srcu_module_notifier);
+
+#endif /* #ifdef CONFIG_MODULES */
-- 
cgit v1.2.3


From 056b89e7e699742cc060ce722d3f26effe51b4aa Mon Sep 17 00:00:00 2001
From: "Joel Fernandes (Google)" <joel@joelfernandes.org>
Date: Thu, 11 Apr 2019 16:24:21 -0400
Subject: module: Make srcu_struct ptr array as read-only

Since commit title ("srcu: Allocate per-CPU data for DEFINE_SRCU() in
modules"), modules that call DEFINE_{STATIC,}SRCU will have a new array
of srcu_struct pointers, which is used by srcu code to initialize and
clean up these structures and save valuable per-cpu reserved space.

There is no reason for this array of pointers to be writable, and leaving
it writable can cause security or other hidden bugs. Mark these as
read-only after the module init has completed.

Tested with the following diff to ensure array not writable:

(diff is a bit reduced to avoid patch command getting confused)
 a/kernel/module.c
 b/kernel/module.c
  -3506,6 +3506,14  static noinline int do_init_module [snip]
 	rcu_assign_pointer(mod->kallsyms, &mod->core_kallsyms);
 #endif
 	module_enable_ro(mod, true);
+
+	if (mod->srcu_struct_ptrs) {
+		// Check if srcu_struct_ptrs access is possible
+		char x = *(char *)mod->srcu_struct_ptrs;
+		*(char *)mod->srcu_struct_ptrs = 0;
+		*(char *)mod->srcu_struct_ptrs = x;
+	}
+
 	mod_tree_remove_init(mod);
 	disable_ro_nx(&mod->init_layout);
 	module_arch_freeing_init(mod);

Cc: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Cc: paulmck@linux.vnet.ibm.com
Cc: rostedt@goodmis.org
Cc: mathieu.desnoyers@efficios.com
Cc: rcu@vger.kernel.org
Cc: kernel-hardening@lists.openwall.com
Cc: kernel-team@android.com
Signed-off-by: Joel Fernandes (Google) <joel@joelfernandes.org>
Signed-off-by: Paul E. McKenney <paulmck@linux.ibm.com>
---
 include/linux/srcutree.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h
index 8af1824c46a8..9cfcc8a756ae 100644
--- a/include/linux/srcutree.h
+++ b/include/linux/srcutree.h
@@ -123,7 +123,7 @@ struct srcu_struct {
 #ifdef MODULE
 # define __DEFINE_SRCU(name, is_static)					\
 	is_static struct srcu_struct name;				\
-	struct srcu_struct *__srcu_struct_##name			\
+	struct srcu_struct * const __srcu_struct_##name			\
 		__section("___srcu_struct_ptrs") = &name
 #else
 # define __DEFINE_SRCU(name, is_static)					\
-- 
cgit v1.2.3


From 95bf33b55ff4465399bad843f1d8d618c8baf1f3 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Tue, 23 Apr 2019 14:07:24 +0200
Subject: rcu/sync: Kill rcu_sync_type/gp_type

Now that the RCU flavors have been consolidated, rcu_sync_type makes no
sense because none of internal update functions aside from .held() depend
on gp_type.  This commit therefore removes this field and consolidates
the relevant code.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
[ paulmck: Added RCU and RCU-bh checks to rcu_sync_is_idle(). ]
[ paulmck: And applied subsequent feedback from Oleg Nesterov. ]
Signed-off-by: Paul E. McKenney <paulmck@linux.ibm.com>
---
 include/linux/percpu-rwsem.h  |  2 +-
 include/linux/rcu_sync.h      | 36 ++++++++--------------------
 kernel/locking/percpu-rwsem.c |  2 +-
 kernel/rcu/sync.c             | 55 ++++---------------------------------------
 4 files changed, 17 insertions(+), 78 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/percpu-rwsem.h b/include/linux/percpu-rwsem.h
index 03cb4b6f842e..6887636ea169 100644
--- a/include/linux/percpu-rwsem.h
+++ b/include/linux/percpu-rwsem.h
@@ -20,7 +20,7 @@ struct percpu_rw_semaphore {
 #define DEFINE_STATIC_PERCPU_RWSEM(name)				\
 static DEFINE_PER_CPU(unsigned int, __percpu_rwsem_rc_##name);		\
 static struct percpu_rw_semaphore name = {				\
-	.rss = __RCU_SYNC_INITIALIZER(name.rss, RCU_SCHED_SYNC),	\
+	.rss = __RCU_SYNC_INITIALIZER(name.rss),			\
 	.read_count = &__percpu_rwsem_rc_##name,			\
 	.rw_sem = __RWSEM_INITIALIZER(name.rw_sem),			\
 	.writer = __RCUWAIT_INITIALIZER(name.writer),			\
diff --git a/include/linux/rcu_sync.h b/include/linux/rcu_sync.h
index 6fc53a1345b3..87971e85519c 100644
--- a/include/linux/rcu_sync.h
+++ b/include/linux/rcu_sync.h
@@ -13,8 +13,6 @@
 #include <linux/wait.h>
 #include <linux/rcupdate.h>
 
-enum rcu_sync_type { RCU_SYNC, RCU_SCHED_SYNC, RCU_BH_SYNC };
-
 /* Structure to mediate between updaters and fastpath-using readers.  */
 struct rcu_sync {
 	int			gp_state;
@@ -23,52 +21,38 @@ struct rcu_sync {
 
 	int			cb_state;
 	struct rcu_head		cb_head;
-
-	enum rcu_sync_type	gp_type;
 };
 
-extern void rcu_sync_lockdep_assert(struct rcu_sync *);
-
 /**
  * rcu_sync_is_idle() - Are readers permitted to use their fastpaths?
  * @rsp: Pointer to rcu_sync structure to use for synchronization
  *
- * Returns true if readers are permitted to use their fastpaths.
- * Must be invoked within an RCU read-side critical section whose
- * flavor matches that of the rcu_sync struture.
+ * Returns true if readers are permitted to use their fastpaths.  Must be
+ * invoked within some flavor of RCU read-side critical section.
  */
 static inline bool rcu_sync_is_idle(struct rcu_sync *rsp)
 {
-#ifdef CONFIG_PROVE_RCU
-	rcu_sync_lockdep_assert(rsp);
-#endif
+	RCU_LOCKDEP_WARN(!rcu_read_lock_held() &&
+			 !rcu_read_lock_bh_held() &&
+			 !rcu_read_lock_sched_held(),
+			 "suspicious rcu_sync_is_idle() usage");
 	return !rsp->gp_state; /* GP_IDLE */
 }
 
-extern void rcu_sync_init(struct rcu_sync *, enum rcu_sync_type);
+extern void rcu_sync_init(struct rcu_sync *);
 extern void rcu_sync_enter_start(struct rcu_sync *);
 extern void rcu_sync_enter(struct rcu_sync *);
 extern void rcu_sync_exit(struct rcu_sync *);
 extern void rcu_sync_dtor(struct rcu_sync *);
 
-#define __RCU_SYNC_INITIALIZER(name, type) {				\
+#define __RCU_SYNC_INITIALIZER(name) {					\
 		.gp_state = 0,						\
 		.gp_count = 0,						\
 		.gp_wait = __WAIT_QUEUE_HEAD_INITIALIZER(name.gp_wait),	\
 		.cb_state = 0,						\
-		.gp_type = type,					\
 	}
 
-#define	__DEFINE_RCU_SYNC(name, type)	\
-	struct rcu_sync_struct name = __RCU_SYNC_INITIALIZER(name, type)
-
-#define DEFINE_RCU_SYNC(name)		\
-	__DEFINE_RCU_SYNC(name, RCU_SYNC)
-
-#define DEFINE_RCU_SCHED_SYNC(name)	\
-	__DEFINE_RCU_SYNC(name, RCU_SCHED_SYNC)
-
-#define DEFINE_RCU_BH_SYNC(name)	\
-	__DEFINE_RCU_SYNC(name, RCU_BH_SYNC)
+#define	DEFINE_RCU_SYNC(name)	\
+	struct rcu_sync name = __RCU_SYNC_INITIALIZER(name)
 
 #endif /* _LINUX_RCU_SYNC_H_ */
diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c
index f17dad99eec8..48cab93a47fd 100644
--- a/kernel/locking/percpu-rwsem.c
+++ b/kernel/locking/percpu-rwsem.c
@@ -17,7 +17,7 @@ int __percpu_init_rwsem(struct percpu_rw_semaphore *sem,
 		return -ENOMEM;
 
 	/* ->rw_sem represents the whole percpu_rw_semaphore for lockdep */
-	rcu_sync_init(&sem->rss, RCU_SCHED_SYNC);
+	rcu_sync_init(&sem->rss);
 	__init_rwsem(&sem->rw_sem, name, rwsem_key);
 	rcuwait_init(&sem->writer);
 	sem->readers_block = 0;
diff --git a/kernel/rcu/sync.c b/kernel/rcu/sync.c
index a8304d90573f..ee427e138dad 100644
--- a/kernel/rcu/sync.c
+++ b/kernel/rcu/sync.c
@@ -10,65 +10,20 @@
 #include <linux/rcu_sync.h>
 #include <linux/sched.h>
 
-#ifdef CONFIG_PROVE_RCU
-#define __INIT_HELD(func)	.held = func,
-#else
-#define __INIT_HELD(func)
-#endif
-
-static const struct {
-	void (*sync)(void);
-	void (*call)(struct rcu_head *, void (*)(struct rcu_head *));
-	void (*wait)(void);
-#ifdef CONFIG_PROVE_RCU
-	int  (*held)(void);
-#endif
-} gp_ops[] = {
-	[RCU_SYNC] = {
-		.sync = synchronize_rcu,
-		.call = call_rcu,
-		.wait = rcu_barrier,
-		__INIT_HELD(rcu_read_lock_held)
-	},
-	[RCU_SCHED_SYNC] = {
-		.sync = synchronize_rcu,
-		.call = call_rcu,
-		.wait = rcu_barrier,
-		__INIT_HELD(rcu_read_lock_sched_held)
-	},
-	[RCU_BH_SYNC] = {
-		.sync = synchronize_rcu,
-		.call = call_rcu,
-		.wait = rcu_barrier,
-		__INIT_HELD(rcu_read_lock_bh_held)
-	},
-};
-
 enum { GP_IDLE = 0, GP_PENDING, GP_PASSED };
 enum { CB_IDLE = 0, CB_PENDING, CB_REPLAY };
 
 #define	rss_lock	gp_wait.lock
 
-#ifdef CONFIG_PROVE_RCU
-void rcu_sync_lockdep_assert(struct rcu_sync *rsp)
-{
-	RCU_LOCKDEP_WARN(!gp_ops[rsp->gp_type].held(),
-			 "suspicious rcu_sync_is_idle() usage");
-}
-
-EXPORT_SYMBOL_GPL(rcu_sync_lockdep_assert);
-#endif
-
 /**
  * rcu_sync_init() - Initialize an rcu_sync structure
  * @rsp: Pointer to rcu_sync structure to be initialized
  * @type: Flavor of RCU with which to synchronize rcu_sync structure
  */
-void rcu_sync_init(struct rcu_sync *rsp, enum rcu_sync_type type)
+void rcu_sync_init(struct rcu_sync *rsp)
 {
 	memset(rsp, 0, sizeof(*rsp));
 	init_waitqueue_head(&rsp->gp_wait);
-	rsp->gp_type = type;
 }
 
 /**
@@ -114,7 +69,7 @@ void rcu_sync_enter(struct rcu_sync *rsp)
 
 	WARN_ON_ONCE(need_wait && need_sync);
 	if (need_sync) {
-		gp_ops[rsp->gp_type].sync();
+		synchronize_rcu();
 		rsp->gp_state = GP_PASSED;
 		wake_up_all(&rsp->gp_wait);
 	} else if (need_wait) {
@@ -167,7 +122,7 @@ static void rcu_sync_func(struct rcu_head *rhp)
 		 * to catch a later GP.
 		 */
 		rsp->cb_state = CB_PENDING;
-		gp_ops[rsp->gp_type].call(&rsp->cb_head, rcu_sync_func);
+		call_rcu(&rsp->cb_head, rcu_sync_func);
 	} else {
 		/*
 		 * We're at least a GP after rcu_sync_exit(); eveybody will now
@@ -195,7 +150,7 @@ void rcu_sync_exit(struct rcu_sync *rsp)
 	if (!--rsp->gp_count) {
 		if (rsp->cb_state == CB_IDLE) {
 			rsp->cb_state = CB_PENDING;
-			gp_ops[rsp->gp_type].call(&rsp->cb_head, rcu_sync_func);
+			call_rcu(&rsp->cb_head, rcu_sync_func);
 		} else if (rsp->cb_state == CB_PENDING) {
 			rsp->cb_state = CB_REPLAY;
 		}
@@ -220,7 +175,7 @@ void rcu_sync_dtor(struct rcu_sync *rsp)
 	spin_unlock_irq(&rsp->rss_lock);
 
 	if (cb_state != CB_IDLE) {
-		gp_ops[rsp->gp_type].wait();
+		rcu_barrier();
 		WARN_ON_ONCE(rsp->cb_state != CB_IDLE);
 	}
 }
-- 
cgit v1.2.3


From 3f2947b78151ec938dc06aea4ba0e11e56becdff Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Tue, 23 Apr 2019 18:32:41 +0200
Subject: locking/percpu-rwsem: Add DEFINE_PERCPU_RWSEM(), use it to initialize
 cgroup_threadgroup_rwsem

Turn DEFINE_STATIC_PERCPU_RWSEM() into __DEFINE_PERCPU_RWSEM() with the
additional "is_static" argument to introduce DEFINE_PERCPU_RWSEM().

Change cgroup.c to use DEFINE_PERCPU_RWSEM(cgroup_threadgroup_rwsem).

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Reviewed-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Paul E. McKenney <paulmck@linux.ibm.com>
---
 include/linux/percpu-rwsem.h | 8 ++++++--
 kernel/cgroup/cgroup.c       | 3 +--
 2 files changed, 7 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/percpu-rwsem.h b/include/linux/percpu-rwsem.h
index 6887636ea169..2809b44cbbee 100644
--- a/include/linux/percpu-rwsem.h
+++ b/include/linux/percpu-rwsem.h
@@ -17,14 +17,18 @@ struct percpu_rw_semaphore {
 	int			readers_block;
 };
 
-#define DEFINE_STATIC_PERCPU_RWSEM(name)				\
+#define __DEFINE_PERCPU_RWSEM(name, is_static)				\
 static DEFINE_PER_CPU(unsigned int, __percpu_rwsem_rc_##name);		\
-static struct percpu_rw_semaphore name = {				\
+is_static struct percpu_rw_semaphore name = {				\
 	.rss = __RCU_SYNC_INITIALIZER(name.rss),			\
 	.read_count = &__percpu_rwsem_rc_##name,			\
 	.rw_sem = __RWSEM_INITIALIZER(name.rw_sem),			\
 	.writer = __RCUWAIT_INITIALIZER(name.writer),			\
 }
+#define DEFINE_PERCPU_RWSEM(name)		\
+	__DEFINE_PERCPU_RWSEM(name, /* not static */)
+#define DEFINE_STATIC_PERCPU_RWSEM(name)	\
+	__DEFINE_PERCPU_RWSEM(name, static)
 
 extern int __percpu_down_read(struct percpu_rw_semaphore *, int);
 extern void __percpu_up_read(struct percpu_rw_semaphore *);
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 217cec4e22c6..b112e93388dc 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -101,7 +101,7 @@ static DEFINE_SPINLOCK(cgroup_idr_lock);
  */
 static DEFINE_SPINLOCK(cgroup_file_kn_lock);
 
-struct percpu_rw_semaphore cgroup_threadgroup_rwsem;
+DEFINE_PERCPU_RWSEM(cgroup_threadgroup_rwsem);
 
 #define cgroup_assert_mutex_or_rcu_locked()				\
 	RCU_LOCKDEP_WARN(!rcu_read_lock_held() &&			\
@@ -5616,7 +5616,6 @@ int __init cgroup_init(void)
 	int ssid;
 
 	BUILD_BUG_ON(CGROUP_SUBSYS_COUNT > 16);
-	BUG_ON(percpu_init_rwsem(&cgroup_threadgroup_rwsem));
 	BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files));
 	BUG_ON(cgroup_init_cftypes(NULL, cgroup1_base_files));
 
-- 
cgit v1.2.3


From 89da3b94bb97417ca2c5b0ce3a28643819030247 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Thu, 25 Apr 2019 18:50:55 +0200
Subject: rcu/sync: Simplify the state machine

With this patch rcu_sync has a single state variable and the transition rules
become really simple:

	GP_IDLE   - owned by the first rcu_sync_enter() which moves it to

	GP_ENTER  - owned by rcu-callback which moves it to

	GP_PASSED - owned by the last rcu_sync_exit() which moves it to

	GP_EXIT   - and this is the only "nontrivial" state.

		rcu-callback moves it back to GP_IDLE unless another enter()
		comes before a GP pass.

		If rcu-callback is invoked before the next rcu_sync_exit() it
		must see gp_count incremented by that enter() and set GP_PASSED.

		Otherwise, if the next rcu_sync_exit() wins the race, it will
		move it to

	GP_REPLAY - owned by rcu-callback which moves it to GP_EXIT

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
[ paulmck: While here, apply READ_ONCE() and WRITE_ONCE() to ->gp_state. ]
[ paulmck: Tweaks to make htmldocs happy. (Reported by kbuild test robot.) ]
Signed-off-by: Paul E. McKenney <paulmck@linux.ibm.com>
---
 include/linux/rcu_sync.h |   4 +-
 kernel/rcu/sync.c        | 165 +++++++++++++++++++++++++++--------------------
 2 files changed, 96 insertions(+), 73 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/rcu_sync.h b/include/linux/rcu_sync.h
index 87971e85519c..9b83865d24f9 100644
--- a/include/linux/rcu_sync.h
+++ b/include/linux/rcu_sync.h
@@ -19,7 +19,6 @@ struct rcu_sync {
 	int			gp_count;
 	wait_queue_head_t	gp_wait;
 
-	int			cb_state;
 	struct rcu_head		cb_head;
 };
 
@@ -36,7 +35,7 @@ static inline bool rcu_sync_is_idle(struct rcu_sync *rsp)
 			 !rcu_read_lock_bh_held() &&
 			 !rcu_read_lock_sched_held(),
 			 "suspicious rcu_sync_is_idle() usage");
-	return !rsp->gp_state; /* GP_IDLE */
+	return !READ_ONCE(rsp->gp_state); /* GP_IDLE */
 }
 
 extern void rcu_sync_init(struct rcu_sync *);
@@ -49,7 +48,6 @@ extern void rcu_sync_dtor(struct rcu_sync *);
 		.gp_state = 0,						\
 		.gp_count = 0,						\
 		.gp_wait = __WAIT_QUEUE_HEAD_INITIALIZER(name.gp_wait),	\
-		.cb_state = 0,						\
 	}
 
 #define	DEFINE_RCU_SYNC(name)	\
diff --git a/kernel/rcu/sync.c b/kernel/rcu/sync.c
index ee427e138dad..d4558ab7a07d 100644
--- a/kernel/rcu/sync.c
+++ b/kernel/rcu/sync.c
@@ -10,15 +10,13 @@
 #include <linux/rcu_sync.h>
 #include <linux/sched.h>
 
-enum { GP_IDLE = 0, GP_PENDING, GP_PASSED };
-enum { CB_IDLE = 0, CB_PENDING, CB_REPLAY };
+enum { GP_IDLE = 0, GP_ENTER, GP_PASSED, GP_EXIT, GP_REPLAY };
 
 #define	rss_lock	gp_wait.lock
 
 /**
  * rcu_sync_init() - Initialize an rcu_sync structure
  * @rsp: Pointer to rcu_sync structure to be initialized
- * @type: Flavor of RCU with which to synchronize rcu_sync structure
  */
 void rcu_sync_init(struct rcu_sync *rsp)
 {
@@ -41,56 +39,26 @@ void rcu_sync_enter_start(struct rcu_sync *rsp)
 	rsp->gp_state = GP_PASSED;
 }
 
-/**
- * rcu_sync_enter() - Force readers onto slowpath
- * @rsp: Pointer to rcu_sync structure to use for synchronization
- *
- * This function is used by updaters who need readers to make use of
- * a slowpath during the update.  After this function returns, all
- * subsequent calls to rcu_sync_is_idle() will return false, which
- * tells readers to stay off their fastpaths.  A later call to
- * rcu_sync_exit() re-enables reader slowpaths.
- *
- * When called in isolation, rcu_sync_enter() must wait for a grace
- * period, however, closely spaced calls to rcu_sync_enter() can
- * optimize away the grace-period wait via a state machine implemented
- * by rcu_sync_enter(), rcu_sync_exit(), and rcu_sync_func().
- */
-void rcu_sync_enter(struct rcu_sync *rsp)
-{
-	bool need_wait, need_sync;
 
-	spin_lock_irq(&rsp->rss_lock);
-	need_wait = rsp->gp_count++;
-	need_sync = rsp->gp_state == GP_IDLE;
-	if (need_sync)
-		rsp->gp_state = GP_PENDING;
-	spin_unlock_irq(&rsp->rss_lock);
+static void rcu_sync_func(struct rcu_head *rhp);
 
-	WARN_ON_ONCE(need_wait && need_sync);
-	if (need_sync) {
-		synchronize_rcu();
-		rsp->gp_state = GP_PASSED;
-		wake_up_all(&rsp->gp_wait);
-	} else if (need_wait) {
-		wait_event(rsp->gp_wait, rsp->gp_state == GP_PASSED);
-	} else {
-		/*
-		 * Possible when there's a pending CB from a rcu_sync_exit().
-		 * Nobody has yet been allowed the 'fast' path and thus we can
-		 * avoid doing any sync(). The callback will get 'dropped'.
-		 */
-		WARN_ON_ONCE(rsp->gp_state != GP_PASSED);
-	}
+static void rcu_sync_call(struct rcu_sync *rsp)
+{
+	call_rcu(&rsp->cb_head, rcu_sync_func);
 }
 
 /**
  * rcu_sync_func() - Callback function managing reader access to fastpath
  * @rhp: Pointer to rcu_head in rcu_sync structure to use for synchronization
  *
- * This function is passed to one of the call_rcu() functions by
+ * This function is passed to call_rcu() function by rcu_sync_enter() and
  * rcu_sync_exit(), so that it is invoked after a grace period following the
- * that invocation of rcu_sync_exit().  It takes action based on events that
+ * invocation of enter/exit.
+ *
+ * If it is called by rcu_sync_enter() it signals that all the readers were
+ * switched onto slow path.
+ *
+ * If it is called by rcu_sync_exit() it takes action based on events that
  * have taken place in the meantime, so that closely spaced rcu_sync_enter()
  * and rcu_sync_exit() pairs need not wait for a grace period.
  *
@@ -107,35 +75,88 @@ static void rcu_sync_func(struct rcu_head *rhp)
 	struct rcu_sync *rsp = container_of(rhp, struct rcu_sync, cb_head);
 	unsigned long flags;
 
-	WARN_ON_ONCE(rsp->gp_state != GP_PASSED);
-	WARN_ON_ONCE(rsp->cb_state == CB_IDLE);
+	WARN_ON_ONCE(READ_ONCE(rsp->gp_state) == GP_IDLE);
+	WARN_ON_ONCE(READ_ONCE(rsp->gp_state) == GP_PASSED);
 
 	spin_lock_irqsave(&rsp->rss_lock, flags);
 	if (rsp->gp_count) {
 		/*
-		 * A new rcu_sync_begin() has happened; drop the callback.
+		 * We're at least a GP after the GP_IDLE->GP_ENTER transition.
 		 */
-		rsp->cb_state = CB_IDLE;
-	} else if (rsp->cb_state == CB_REPLAY) {
+		WRITE_ONCE(rsp->gp_state, GP_PASSED);
+		wake_up_locked(&rsp->gp_wait);
+	} else if (rsp->gp_state == GP_REPLAY) {
 		/*
-		 * A new rcu_sync_exit() has happened; requeue the callback
-		 * to catch a later GP.
+		 * A new rcu_sync_exit() has happened; requeue the callback to
+		 * catch a later GP.
 		 */
-		rsp->cb_state = CB_PENDING;
-		call_rcu(&rsp->cb_head, rcu_sync_func);
+		WRITE_ONCE(rsp->gp_state, GP_EXIT);
+		rcu_sync_call(rsp);
 	} else {
 		/*
-		 * We're at least a GP after rcu_sync_exit(); eveybody will now
-		 * have observed the write side critical section. Let 'em rip!.
+		 * We're at least a GP after the last rcu_sync_exit(); everybody
+		 * will now have observed the write side critical section.
+		 * Let 'em rip!
 		 */
-		rsp->cb_state = CB_IDLE;
-		rsp->gp_state = GP_IDLE;
+		WRITE_ONCE(rsp->gp_state, GP_IDLE);
 	}
 	spin_unlock_irqrestore(&rsp->rss_lock, flags);
 }
 
 /**
- * rcu_sync_exit() - Allow readers back onto fast patch after grace period
+ * rcu_sync_enter() - Force readers onto slowpath
+ * @rsp: Pointer to rcu_sync structure to use for synchronization
+ *
+ * This function is used by updaters who need readers to make use of
+ * a slowpath during the update.  After this function returns, all
+ * subsequent calls to rcu_sync_is_idle() will return false, which
+ * tells readers to stay off their fastpaths.  A later call to
+ * rcu_sync_exit() re-enables reader fastpaths.
+ *
+ * When called in isolation, rcu_sync_enter() must wait for a grace
+ * period, however, closely spaced calls to rcu_sync_enter() can
+ * optimize away the grace-period wait via a state machine implemented
+ * by rcu_sync_enter(), rcu_sync_exit(), and rcu_sync_func().
+ */
+void rcu_sync_enter(struct rcu_sync *rsp)
+{
+	int gp_state;
+
+	spin_lock_irq(&rsp->rss_lock);
+	gp_state = rsp->gp_state;
+	if (gp_state == GP_IDLE) {
+		WRITE_ONCE(rsp->gp_state, GP_ENTER);
+		WARN_ON_ONCE(rsp->gp_count);
+		/*
+		 * Note that we could simply do rcu_sync_call(rsp) here and
+		 * avoid the "if (gp_state == GP_IDLE)" block below.
+		 *
+		 * However, synchronize_rcu() can be faster if rcu_expedited
+		 * or rcu_blocking_is_gp() is true.
+		 *
+		 * Another reason is that we can't wait for rcu callback if
+		 * we are called at early boot time but this shouldn't happen.
+		 */
+	}
+	rsp->gp_count++;
+	spin_unlock_irq(&rsp->rss_lock);
+
+	if (gp_state == GP_IDLE) {
+		/*
+		 * See the comment above, this simply does the "synchronous"
+		 * call_rcu(rcu_sync_func) which does GP_ENTER -> GP_PASSED.
+		 */
+		synchronize_rcu();
+		rcu_sync_func(&rsp->cb_head);
+		/* Not really needed, wait_event() would see GP_PASSED. */
+		return;
+	}
+
+	wait_event(rsp->gp_wait, READ_ONCE(rsp->gp_state) >= GP_PASSED);
+}
+
+/**
+ * rcu_sync_exit() - Allow readers back onto fast path after grace period
  * @rsp: Pointer to rcu_sync structure to use for synchronization
  *
  * This function is used by updaters who have completed, and can therefore
@@ -146,13 +167,16 @@ static void rcu_sync_func(struct rcu_head *rhp)
  */
 void rcu_sync_exit(struct rcu_sync *rsp)
 {
+	WARN_ON_ONCE(READ_ONCE(rsp->gp_state) == GP_IDLE);
+	WARN_ON_ONCE(READ_ONCE(rsp->gp_count) == 0);
+
 	spin_lock_irq(&rsp->rss_lock);
 	if (!--rsp->gp_count) {
-		if (rsp->cb_state == CB_IDLE) {
-			rsp->cb_state = CB_PENDING;
-			call_rcu(&rsp->cb_head, rcu_sync_func);
-		} else if (rsp->cb_state == CB_PENDING) {
-			rsp->cb_state = CB_REPLAY;
+		if (rsp->gp_state == GP_PASSED) {
+			WRITE_ONCE(rsp->gp_state, GP_EXIT);
+			rcu_sync_call(rsp);
+		} else if (rsp->gp_state == GP_EXIT) {
+			WRITE_ONCE(rsp->gp_state, GP_REPLAY);
 		}
 	}
 	spin_unlock_irq(&rsp->rss_lock);
@@ -164,18 +188,19 @@ void rcu_sync_exit(struct rcu_sync *rsp)
  */
 void rcu_sync_dtor(struct rcu_sync *rsp)
 {
-	int cb_state;
+	int gp_state;
 
-	WARN_ON_ONCE(rsp->gp_count);
+	WARN_ON_ONCE(READ_ONCE(rsp->gp_count));
+	WARN_ON_ONCE(READ_ONCE(rsp->gp_state) == GP_PASSED);
 
 	spin_lock_irq(&rsp->rss_lock);
-	if (rsp->cb_state == CB_REPLAY)
-		rsp->cb_state = CB_PENDING;
-	cb_state = rsp->cb_state;
+	if (rsp->gp_state == GP_REPLAY)
+		WRITE_ONCE(rsp->gp_state, GP_EXIT);
+	gp_state = rsp->gp_state;
 	spin_unlock_irq(&rsp->rss_lock);
 
-	if (cb_state != CB_IDLE) {
+	if (gp_state != GP_IDLE) {
 		rcu_barrier();
-		WARN_ON_ONCE(rsp->cb_state != CB_IDLE);
+		WARN_ON_ONCE(rsp->gp_state != GP_IDLE);
 	}
 }
-- 
cgit v1.2.3


From ff3bf92d90d396e51eb78c5ecde11a994ab7a179 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.ibm.com>
Date: Tue, 9 Apr 2019 14:44:49 -0700
Subject: torture: Allow inter-stutter interval to be specified

Currently, the inter-stutter interval is the same as the stutter duration,
that is, whatever number of jiffies is passed into torture_stutter_init().
This has worked well for quite some time, but the addition of
forward-progress testing to rcutorture can delay processes for several
seconds, which can triple the time that they are stuttered.

This commit therefore adds a second argument to torture_stutter_init()
that specifies the inter-stutter interval.  While locktorture preserves
the current behavior, rcutorture uses the RCU CPU stall warning interval
to provide a wider inter-stutter interval.

Signed-off-by: Paul E. McKenney <paulmck@linux.ibm.com>
---
 include/linux/torture.h      | 2 +-
 kernel/locking/locktorture.c | 2 +-
 kernel/rcu/rcutorture.c      | 5 ++++-
 kernel/torture.c             | 6 ++++--
 4 files changed, 10 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/torture.h b/include/linux/torture.h
index 23d80db426d7..a620118385bb 100644
--- a/include/linux/torture.h
+++ b/include/linux/torture.h
@@ -66,7 +66,7 @@ int torture_shutdown_init(int ssecs, void (*cleanup)(void));
 
 /* Task stuttering, which forces load/no-load transitions. */
 bool stutter_wait(const char *title);
-int torture_stutter_init(int s);
+int torture_stutter_init(int s, int sgap);
 
 /* Initialization and cleanup. */
 bool torture_init_begin(char *ttype, int v);
diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c
index 80a463d31a8d..c513031cd7e3 100644
--- a/kernel/locking/locktorture.c
+++ b/kernel/locking/locktorture.c
@@ -975,7 +975,7 @@ static int __init lock_torture_init(void)
 			goto unwind;
 	}
 	if (stutter > 0) {
-		firsterr = torture_stutter_init(stutter);
+		firsterr = torture_stutter_init(stutter, stutter);
 		if (firsterr)
 			goto unwind;
 	}
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 954ac2b98619..a16d6abe1715 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -2373,7 +2373,10 @@ rcu_torture_init(void)
 	if (stutter < 0)
 		stutter = 0;
 	if (stutter) {
-		firsterr = torture_stutter_init(stutter * HZ);
+		int t;
+
+		t = cur_ops->stall_dur ? cur_ops->stall_dur() : stutter * HZ;
+		firsterr = torture_stutter_init(stutter * HZ, t);
 		if (firsterr)
 			goto unwind;
 	}
diff --git a/kernel/torture.c b/kernel/torture.c
index de0e0ecf88e1..a8d9bdfba7c3 100644
--- a/kernel/torture.c
+++ b/kernel/torture.c
@@ -570,6 +570,7 @@ static void torture_shutdown_cleanup(void)
 static struct task_struct *stutter_task;
 static int stutter_pause_test;
 static int stutter;
+static int stutter_gap;
 
 /*
  * Block until the stutter interval ends.  This must be called periodically
@@ -621,7 +622,7 @@ static int torture_stutter(void *arg)
 		}
 		WRITE_ONCE(stutter_pause_test, 0);
 		if (!torture_must_stop())
-			schedule_timeout_interruptible(stutter);
+			schedule_timeout_interruptible(stutter_gap);
 		torture_shutdown_absorb("torture_stutter");
 	} while (!torture_must_stop());
 	torture_kthread_stopping("torture_stutter");
@@ -631,9 +632,10 @@ static int torture_stutter(void *arg)
 /*
  * Initialize and kick off the torture_stutter kthread.
  */
-int torture_stutter_init(const int s)
+int torture_stutter_init(const int s, const int sgap)
 {
 	stutter = s;
+	stutter_gap = sgap;
 	return torture_create_kthread(torture_stutter, NULL, stutter_task);
 }
 EXPORT_SYMBOL_GPL(torture_stutter_init);
-- 
cgit v1.2.3


From 0b3b094ac9a7bb1fcf5d694f3ec981e6864a63d3 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Wed, 15 May 2019 16:28:34 +0200
Subject: fanotify: Disallow permission events for proc filesystem

Proc filesystem has special locking rules for various files. Thus
fanotify which opens files on event delivery can easily deadlock
against another process that waits for fanotify permission event to be
handled. Since permission events on /proc have doubtful value anyway,
just disallow them.

Link: https://lore.kernel.org/linux-fsdevel/20190320131642.GE9485@quack2.suse.cz/
Reviewed-by: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/notify/fanotify/fanotify_user.c | 22 ++++++++++++++++++++++
 fs/proc/root.c                     |  2 +-
 include/linux/fs.h                 |  1 +
 3 files changed, 24 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index a90bb19dcfa2..91006f47e420 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -920,6 +920,22 @@ static int fanotify_test_fid(struct path *path, __kernel_fsid_t *fsid)
 	return 0;
 }
 
+static int fanotify_events_supported(struct path *path, __u64 mask)
+{
+	/*
+	 * Some filesystems such as 'proc' acquire unusual locks when opening
+	 * files. For them fanotify permission events have high chances of
+	 * deadlocking the system - open done when reporting fanotify event
+	 * blocks on this "unusual" lock while another process holding the lock
+	 * waits for fanotify permission event to be answered. Just disallow
+	 * permission events for such filesystems.
+	 */
+	if (mask & FANOTIFY_PERM_EVENTS &&
+	    path->mnt->mnt_sb->s_type->fs_flags & FS_DISALLOW_NOTIFY_PERM)
+		return -EINVAL;
+	return 0;
+}
+
 static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
 			    int dfd, const char  __user *pathname)
 {
@@ -1018,6 +1034,12 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
 	if (ret)
 		goto fput_and_out;
 
+	if (flags & FAN_MARK_ADD) {
+		ret = fanotify_events_supported(&path, mask);
+		if (ret)
+			goto path_put_and_out;
+	}
+
 	if (FAN_GROUP_FLAG(group, FAN_REPORT_FID)) {
 		ret = fanotify_test_fid(&path, &__fsid);
 		if (ret)
diff --git a/fs/proc/root.c b/fs/proc/root.c
index 8b145e7b9661..522199e9525e 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -211,7 +211,7 @@ static struct file_system_type proc_fs_type = {
 	.init_fs_context	= proc_init_fs_context,
 	.parameters		= &proc_fs_parameters,
 	.kill_sb		= proc_kill_sb,
-	.fs_flags		= FS_USERNS_MOUNT,
+	.fs_flags		= FS_USERNS_MOUNT | FS_DISALLOW_NOTIFY_PERM,
 };
 
 void __init proc_root_init(void)
diff --git a/include/linux/fs.h b/include/linux/fs.h
index f7fdfe93e25d..c7136c98b5ba 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2184,6 +2184,7 @@ struct file_system_type {
 #define FS_BINARY_MOUNTDATA	2
 #define FS_HAS_SUBTYPE		4
 #define FS_USERNS_MOUNT		8	/* Can be mounted by userns root */
+#define FS_DISALLOW_NOTIFY_PERM	16	/* Disable fanotify permission events */
 #define FS_RENAME_DOES_D_MOVE	32768	/* FS will handle d_move() during rename() internally. */
 	int (*init_fs_context)(struct fs_context *);
 	const struct fs_parameter_description *parameters;
-- 
cgit v1.2.3


From 4bfc0bb2c60e2f4cc8eb60f03cf8dfa72336272a Mon Sep 17 00:00:00 2001
From: Roman Gushchin <guro@fb.com>
Date: Sat, 25 May 2019 09:37:39 -0700
Subject: bpf: decouple the lifetime of cgroup_bpf from cgroup itself

Currently the lifetime of bpf programs attached to a cgroup is bound
to the lifetime of the cgroup itself. It means that if a user
forgets to detach (or intentionally avoids detaching) a bpf program
before removing the cgroup, it will stay attached until the cgroup is
released. Since the cgroup can stay in the dying state (the state
between being rmdir()'ed and being released) for a very long time, it
leads to a waste of memory. Also, it prevents implementing
memcg-based memory accounting for bpf objects, because a circular
reference dependency would occur: charged memory pages pin the
corresponding memory cgroup, and if the memory cgroup in turn pins
the attached bpf program, nothing will ever be released.

A dying cgroup can not contain any processes, so the only chance for
an attached bpf program to be executed is a live socket associated
with the cgroup. So in order to release all bpf data early, let's
count associated sockets using a new percpu refcounter. On cgroup
removal the counter is transitioned to the atomic mode, and as soon
as it reaches 0, all bpf programs are detached.

Because cgroup_bpf_release() can block, it can't be called from
the percpu ref counter callback directly, so instead an asynchronous
work is scheduled.

The reference counter is not socket-specific and can be used for any
other type of program that can be executed from a cgroup-bpf hook
outside of process context, should such a need arise in the future.

Signed-off-by: Roman Gushchin <guro@fb.com>
Cc: jolsa@redhat.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf-cgroup.h | 11 +++++++++--
 include/linux/cgroup.h     | 18 ++++++++++++++++++
 kernel/bpf/cgroup.c        | 41 +++++++++++++++++++++++++++++++++++++----
 kernel/cgroup/cgroup.c     | 11 ++++++++---
 4 files changed, 72 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
index cb3c6b3b89c8..9f100fc422c3 100644
--- a/include/linux/bpf-cgroup.h
+++ b/include/linux/bpf-cgroup.h
@@ -6,6 +6,7 @@
 #include <linux/errno.h>
 #include <linux/jump_label.h>
 #include <linux/percpu.h>
+#include <linux/percpu-refcount.h>
 #include <linux/rbtree.h>
 #include <uapi/linux/bpf.h>
 
@@ -72,10 +73,16 @@ struct cgroup_bpf {
 
 	/* temp storage for effective prog array used by prog_attach/detach */
 	struct bpf_prog_array __rcu *inactive;
+
+	/* reference counter used to detach bpf programs after cgroup removal */
+	struct percpu_ref refcnt;
+
+	/* cgroup_bpf is released using a work queue */
+	struct work_struct release_work;
 };
 
-void cgroup_bpf_put(struct cgroup *cgrp);
 int cgroup_bpf_inherit(struct cgroup *cgrp);
+void cgroup_bpf_offline(struct cgroup *cgrp);
 
 int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog,
 			enum bpf_attach_type type, u32 flags);
@@ -283,8 +290,8 @@ int cgroup_bpf_prog_query(const union bpf_attr *attr,
 
 struct bpf_prog;
 struct cgroup_bpf {};
-static inline void cgroup_bpf_put(struct cgroup *cgrp) {}
 static inline int cgroup_bpf_inherit(struct cgroup *cgrp) { return 0; }
+static inline void cgroup_bpf_offline(struct cgroup *cgrp) {}
 
 static inline int cgroup_bpf_prog_attach(const union bpf_attr *attr,
 					 enum bpf_prog_type ptype,
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index c0077adeea83..49e8facf7c4a 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -924,4 +924,22 @@ static inline bool cgroup_task_frozen(struct task_struct *task)
 
 #endif /* !CONFIG_CGROUPS */
 
+#ifdef CONFIG_CGROUP_BPF
+static inline void cgroup_bpf_get(struct cgroup *cgrp)
+{
+	percpu_ref_get(&cgrp->bpf.refcnt);
+}
+
+static inline void cgroup_bpf_put(struct cgroup *cgrp)
+{
+	percpu_ref_put(&cgrp->bpf.refcnt);
+}
+
+#else /* CONFIG_CGROUP_BPF */
+
+static inline void cgroup_bpf_get(struct cgroup *cgrp) {}
+static inline void cgroup_bpf_put(struct cgroup *cgrp) {}
+
+#endif /* CONFIG_CGROUP_BPF */
+
 #endif /* _LINUX_CGROUP_H */
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index fcde0f7b2585..d995edbe816d 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -22,12 +22,21 @@
 DEFINE_STATIC_KEY_FALSE(cgroup_bpf_enabled_key);
 EXPORT_SYMBOL(cgroup_bpf_enabled_key);
 
+void cgroup_bpf_offline(struct cgroup *cgrp)
+{
+	cgroup_get(cgrp);
+	percpu_ref_kill(&cgrp->bpf.refcnt);
+}
+
 /**
- * cgroup_bpf_put() - put references of all bpf programs
- * @cgrp: the cgroup to modify
+ * cgroup_bpf_release() - put references of all bpf programs and
+ *                        release all cgroup bpf data
+ * @work: work structure embedded into the cgroup to modify
  */
-void cgroup_bpf_put(struct cgroup *cgrp)
+static void cgroup_bpf_release(struct work_struct *work)
 {
+	struct cgroup *cgrp = container_of(work, struct cgroup,
+					   bpf.release_work);
 	enum bpf_cgroup_storage_type stype;
 	unsigned int type;
 
@@ -47,6 +56,22 @@ void cgroup_bpf_put(struct cgroup *cgrp)
 		}
 		bpf_prog_array_free(cgrp->bpf.effective[type]);
 	}
+
+	percpu_ref_exit(&cgrp->bpf.refcnt);
+	cgroup_put(cgrp);
+}
+
+/**
+ * cgroup_bpf_release_fn() - callback used to schedule releasing
+ *                           of bpf cgroup data
+ * @ref: percpu ref counter structure
+ */
+static void cgroup_bpf_release_fn(struct percpu_ref *ref)
+{
+	struct cgroup *cgrp = container_of(ref, struct cgroup, bpf.refcnt);
+
+	INIT_WORK(&cgrp->bpf.release_work, cgroup_bpf_release);
+	queue_work(system_wq, &cgrp->bpf.release_work);
 }
 
 /* count number of elements in the list.
@@ -167,7 +192,12 @@ int cgroup_bpf_inherit(struct cgroup *cgrp)
  */
 #define	NR ARRAY_SIZE(cgrp->bpf.effective)
 	struct bpf_prog_array __rcu *arrays[NR] = {};
-	int i;
+	int ret, i;
+
+	ret = percpu_ref_init(&cgrp->bpf.refcnt, cgroup_bpf_release_fn, 0,
+			      GFP_KERNEL);
+	if (ret)
+		return ret;
 
 	for (i = 0; i < NR; i++)
 		INIT_LIST_HEAD(&cgrp->bpf.progs[i]);
@@ -183,6 +213,9 @@ int cgroup_bpf_inherit(struct cgroup *cgrp)
 cleanup:
 	for (i = 0; i < NR; i++)
 		bpf_prog_array_free(arrays[i]);
+
+	percpu_ref_exit(&cgrp->bpf.refcnt);
+
 	return -ENOMEM;
 }
 
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 217cec4e22c6..ef9cfbfc82a9 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -4955,8 +4955,6 @@ static void css_release_work_fn(struct work_struct *work)
 		if (cgrp->kn)
 			RCU_INIT_POINTER(*(void __rcu __force **)&cgrp->kn->priv,
 					 NULL);
-
-		cgroup_bpf_put(cgrp);
 	}
 
 	mutex_unlock(&cgroup_mutex);
@@ -5482,6 +5480,8 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
 
 	cgroup1_check_for_release(parent);
 
+	cgroup_bpf_offline(cgrp);
+
 	/* put the base reference */
 	percpu_ref_kill(&cgrp->self.refcnt);
 
@@ -6221,6 +6221,7 @@ void cgroup_sk_alloc(struct sock_cgroup_data *skcd)
 		 * Don't use cgroup_get_live().
 		 */
 		cgroup_get(sock_cgroup_ptr(skcd));
+		cgroup_bpf_get(sock_cgroup_ptr(skcd));
 		return;
 	}
 
@@ -6232,6 +6233,7 @@ void cgroup_sk_alloc(struct sock_cgroup_data *skcd)
 		cset = task_css_set(current);
 		if (likely(cgroup_tryget(cset->dfl_cgrp))) {
 			skcd->val = (unsigned long)cset->dfl_cgrp;
+			cgroup_bpf_get(cset->dfl_cgrp);
 			break;
 		}
 		cpu_relax();
@@ -6242,7 +6244,10 @@ void cgroup_sk_alloc(struct sock_cgroup_data *skcd)
 
 void cgroup_sk_free(struct sock_cgroup_data *skcd)
 {
-	cgroup_put(sock_cgroup_ptr(skcd));
+	struct cgroup *cgrp = sock_cgroup_ptr(skcd);
+
+	cgroup_bpf_put(cgrp);
+	cgroup_put(cgrp);
 }
 
 #endif	/* CONFIG_SOCK_CGROUP_DATA */
-- 
cgit v1.2.3


From d2d0727b1654e11563f181f4d3d48b9275514480 Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Mon, 20 May 2019 09:29:39 -0700
Subject: fscrypt: simplify bounce page handling

Currently, bounce page handling for writes to encrypted files is
unnecessarily complicated.  A fscrypt_ctx is allocated along with each
bounce page, page_private(bounce_page) points to this fscrypt_ctx, and
fscrypt_ctx::w::control_page points to the original pagecache page.

However, because writes don't use the fscrypt_ctx for anything else,
there's no reason why page_private(bounce_page) can't just point to the
original pagecache page directly.

Therefore, this patch makes this change.  In the process, it also cleans
up the API exposed to filesystems that allows testing whether a page is
a bounce page, getting the pagecache page from a bounce page, and
freeing a bounce page.

Reviewed-by: Chandan Rajendra <chandan@linux.ibm.com>
Signed-off-by: Eric Biggers <ebiggers@google.com>
---
 fs/crypto/bio.c             |  38 +++-------------
 fs/crypto/crypto.c          | 104 +++++++++++++++-----------------------------
 fs/crypto/fscrypt_private.h |   4 +-
 fs/ext4/page-io.c           |  36 ++++++---------
 fs/f2fs/data.c              |  12 +++--
 include/linux/fscrypt.h     |  38 +++++++++++-----
 6 files changed, 84 insertions(+), 148 deletions(-)

(limited to 'include/linux')

diff --git a/fs/crypto/bio.c b/fs/crypto/bio.c
index b46021ebde85..c857b70b5328 100644
--- a/fs/crypto/bio.c
+++ b/fs/crypto/bio.c
@@ -70,46 +70,18 @@ void fscrypt_enqueue_decrypt_bio(struct fscrypt_ctx *ctx, struct bio *bio)
 }
 EXPORT_SYMBOL(fscrypt_enqueue_decrypt_bio);
 
-void fscrypt_pullback_bio_page(struct page **page, bool restore)
-{
-	struct fscrypt_ctx *ctx;
-	struct page *bounce_page;
-
-	/* The bounce data pages are unmapped. */
-	if ((*page)->mapping)
-		return;
-
-	/* The bounce data page is unmapped. */
-	bounce_page = *page;
-	ctx = (struct fscrypt_ctx *)page_private(bounce_page);
-
-	/* restore control page */
-	*page = ctx->w.control_page;
-
-	if (restore)
-		fscrypt_restore_control_page(bounce_page);
-}
-EXPORT_SYMBOL(fscrypt_pullback_bio_page);
-
 int fscrypt_zeroout_range(const struct inode *inode, pgoff_t lblk,
 				sector_t pblk, unsigned int len)
 {
-	struct fscrypt_ctx *ctx;
-	struct page *ciphertext_page = NULL;
+	struct page *ciphertext_page;
 	struct bio *bio;
 	int ret, err = 0;
 
 	BUG_ON(inode->i_sb->s_blocksize != PAGE_SIZE);
 
-	ctx = fscrypt_get_ctx(GFP_NOFS);
-	if (IS_ERR(ctx))
-		return PTR_ERR(ctx);
-
-	ciphertext_page = fscrypt_alloc_bounce_page(ctx, GFP_NOWAIT);
-	if (IS_ERR(ciphertext_page)) {
-		err = PTR_ERR(ciphertext_page);
-		goto errout;
-	}
+	ciphertext_page = fscrypt_alloc_bounce_page(GFP_NOWAIT);
+	if (!ciphertext_page)
+		return -ENOMEM;
 
 	while (len--) {
 		err = fscrypt_do_page_crypto(inode, FS_ENCRYPT, lblk,
@@ -147,7 +119,7 @@ int fscrypt_zeroout_range(const struct inode *inode, pgoff_t lblk,
 	}
 	err = 0;
 errout:
-	fscrypt_release_ctx(ctx);
+	fscrypt_free_bounce_page(ciphertext_page);
 	return err;
 }
 EXPORT_SYMBOL(fscrypt_zeroout_range);
diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c
index 335a362ee446..881e2a69f8a6 100644
--- a/fs/crypto/crypto.c
+++ b/fs/crypto/crypto.c
@@ -64,18 +64,11 @@ EXPORT_SYMBOL(fscrypt_enqueue_decrypt_work);
  *
  * If the encryption context was allocated from the pre-allocated pool, returns
  * it to that pool. Else, frees it.
- *
- * If there's a bounce page in the context, this frees that.
  */
 void fscrypt_release_ctx(struct fscrypt_ctx *ctx)
 {
 	unsigned long flags;
 
-	if (ctx->flags & FS_CTX_HAS_BOUNCE_BUFFER_FL && ctx->w.bounce_page) {
-		mempool_free(ctx->w.bounce_page, fscrypt_bounce_page_pool);
-		ctx->w.bounce_page = NULL;
-	}
-	ctx->w.control_page = NULL;
 	if (ctx->flags & FS_CTX_REQUIRES_FREE_ENCRYPT_FL) {
 		kmem_cache_free(fscrypt_ctx_cachep, ctx);
 	} else {
@@ -100,14 +93,8 @@ struct fscrypt_ctx *fscrypt_get_ctx(gfp_t gfp_flags)
 	unsigned long flags;
 
 	/*
-	 * We first try getting the ctx from a free list because in
-	 * the common case the ctx will have an allocated and
-	 * initialized crypto tfm, so it's probably a worthwhile
-	 * optimization. For the bounce page, we first try getting it
-	 * from the kernel allocator because that's just about as fast
-	 * as getting it from a list and because a cache of free pages
-	 * should generally be a "last resort" option for a filesystem
-	 * to be able to do its job.
+	 * First try getting a ctx from the free list so that we don't have to
+	 * call into the slab allocator.
 	 */
 	spin_lock_irqsave(&fscrypt_ctx_lock, flags);
 	ctx = list_first_entry_or_null(&fscrypt_free_ctxs,
@@ -123,11 +110,31 @@ struct fscrypt_ctx *fscrypt_get_ctx(gfp_t gfp_flags)
 	} else {
 		ctx->flags &= ~FS_CTX_REQUIRES_FREE_ENCRYPT_FL;
 	}
-	ctx->flags &= ~FS_CTX_HAS_BOUNCE_BUFFER_FL;
 	return ctx;
 }
 EXPORT_SYMBOL(fscrypt_get_ctx);
 
+struct page *fscrypt_alloc_bounce_page(gfp_t gfp_flags)
+{
+	return mempool_alloc(fscrypt_bounce_page_pool, gfp_flags);
+}
+
+/**
+ * fscrypt_free_bounce_page() - free a ciphertext bounce page
+ *
+ * Free a bounce page that was allocated by fscrypt_encrypt_page(), or by
+ * fscrypt_alloc_bounce_page() directly.
+ */
+void fscrypt_free_bounce_page(struct page *bounce_page)
+{
+	if (!bounce_page)
+		return;
+	set_page_private(bounce_page, (unsigned long)NULL);
+	ClearPagePrivate(bounce_page);
+	mempool_free(bounce_page, fscrypt_bounce_page_pool);
+}
+EXPORT_SYMBOL(fscrypt_free_bounce_page);
+
 void fscrypt_generate_iv(union fscrypt_iv *iv, u64 lblk_num,
 			 const struct fscrypt_info *ci)
 {
@@ -186,16 +193,6 @@ int fscrypt_do_page_crypto(const struct inode *inode, fscrypt_direction_t rw,
 	return 0;
 }
 
-struct page *fscrypt_alloc_bounce_page(struct fscrypt_ctx *ctx,
-				       gfp_t gfp_flags)
-{
-	ctx->w.bounce_page = mempool_alloc(fscrypt_bounce_page_pool, gfp_flags);
-	if (ctx->w.bounce_page == NULL)
-		return ERR_PTR(-ENOMEM);
-	ctx->flags |= FS_CTX_HAS_BOUNCE_BUFFER_FL;
-	return ctx->w.bounce_page;
-}
-
 /**
  * fscypt_encrypt_page() - Encrypts a page
  * @inode:     The inode for which the encryption should take place
@@ -210,22 +207,12 @@ struct page *fscrypt_alloc_bounce_page(struct fscrypt_ctx *ctx,
  *             previously written data.
  * @gfp_flags: The gfp flag for memory allocation
  *
- * Encrypts @page using the ctx encryption context. Performs encryption
- * either in-place or into a newly allocated bounce page.
- * Called on the page write path.
- *
- * Bounce page allocation is the default.
- * In this case, the contents of @page are encrypted and stored in an
- * allocated bounce page. @page has to be locked and the caller must call
- * fscrypt_restore_control_page() on the returned ciphertext page to
- * release the bounce buffer and the encryption context.
- *
- * In-place encryption is used by setting the FS_CFLG_OWN_PAGES flag in
- * fscrypt_operations. Here, the input-page is returned with its content
- * encrypted.
+ * Encrypts @page.  If the filesystem set FS_CFLG_OWN_PAGES, then the data is
+ * encrypted in-place and @page is returned.  Else, a bounce page is allocated,
+ * the data is encrypted into the bounce page, and the bounce page is returned.
+ * The caller is responsible for calling fscrypt_free_bounce_page().
  *
- * Return: A page with the encrypted content on success. Else, an
- * error value or NULL.
+ * Return: A page containing the encrypted data on success, else an ERR_PTR()
  */
 struct page *fscrypt_encrypt_page(const struct inode *inode,
 				struct page *page,
@@ -234,7 +221,6 @@ struct page *fscrypt_encrypt_page(const struct inode *inode,
 				u64 lblk_num, gfp_t gfp_flags)
 
 {
-	struct fscrypt_ctx *ctx;
 	struct page *ciphertext_page = page;
 	int err;
 
@@ -253,30 +239,20 @@ struct page *fscrypt_encrypt_page(const struct inode *inode,
 
 	BUG_ON(!PageLocked(page));
 
-	ctx = fscrypt_get_ctx(gfp_flags);
-	if (IS_ERR(ctx))
-		return ERR_CAST(ctx);
-
 	/* The encryption operation will require a bounce page. */
-	ciphertext_page = fscrypt_alloc_bounce_page(ctx, gfp_flags);
-	if (IS_ERR(ciphertext_page))
-		goto errout;
+	ciphertext_page = fscrypt_alloc_bounce_page(gfp_flags);
+	if (!ciphertext_page)
+		return ERR_PTR(-ENOMEM);
 
-	ctx->w.control_page = page;
 	err = fscrypt_do_page_crypto(inode, FS_ENCRYPT, lblk_num,
 				     page, ciphertext_page, len, offs,
 				     gfp_flags);
 	if (err) {
-		ciphertext_page = ERR_PTR(err);
-		goto errout;
+		fscrypt_free_bounce_page(ciphertext_page);
+		return ERR_PTR(err);
 	}
 	SetPagePrivate(ciphertext_page);
-	set_page_private(ciphertext_page, (unsigned long)ctx);
-	lock_page(ciphertext_page);
-	return ciphertext_page;
-
-errout:
-	fscrypt_release_ctx(ctx);
+	set_page_private(ciphertext_page, (unsigned long)page);
 	return ciphertext_page;
 }
 EXPORT_SYMBOL(fscrypt_encrypt_page);
@@ -355,18 +331,6 @@ const struct dentry_operations fscrypt_d_ops = {
 	.d_revalidate = fscrypt_d_revalidate,
 };
 
-void fscrypt_restore_control_page(struct page *page)
-{
-	struct fscrypt_ctx *ctx;
-
-	ctx = (struct fscrypt_ctx *)page_private(page);
-	set_page_private(page, (unsigned long)NULL);
-	ClearPagePrivate(page);
-	unlock_page(page);
-	fscrypt_release_ctx(ctx);
-}
-EXPORT_SYMBOL(fscrypt_restore_control_page);
-
 static void fscrypt_destroy(void)
 {
 	struct fscrypt_ctx *pos, *n;
diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h
index 7da276159593..4122ee1a0b7b 100644
--- a/fs/crypto/fscrypt_private.h
+++ b/fs/crypto/fscrypt_private.h
@@ -94,7 +94,6 @@ typedef enum {
 } fscrypt_direction_t;
 
 #define FS_CTX_REQUIRES_FREE_ENCRYPT_FL		0x00000001
-#define FS_CTX_HAS_BOUNCE_BUFFER_FL		0x00000002
 
 static inline bool fscrypt_valid_enc_modes(u32 contents_mode,
 					   u32 filenames_mode)
@@ -123,8 +122,7 @@ extern int fscrypt_do_page_crypto(const struct inode *inode,
 				  struct page *dest_page,
 				  unsigned int len, unsigned int offs,
 				  gfp_t gfp_flags);
-extern struct page *fscrypt_alloc_bounce_page(struct fscrypt_ctx *ctx,
-					      gfp_t gfp_flags);
+extern struct page *fscrypt_alloc_bounce_page(gfp_t gfp_flags);
 extern const struct dentry_operations fscrypt_d_ops;
 
 extern void __printf(3, 4) __cold
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 4690618a92e9..13d5ecc0af03 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -66,9 +66,7 @@ static void ext4_finish_bio(struct bio *bio)
 
 	bio_for_each_segment_all(bvec, bio, iter_all) {
 		struct page *page = bvec->bv_page;
-#ifdef CONFIG_FS_ENCRYPTION
-		struct page *data_page = NULL;
-#endif
+		struct page *bounce_page = NULL;
 		struct buffer_head *bh, *head;
 		unsigned bio_start = bvec->bv_offset;
 		unsigned bio_end = bio_start + bvec->bv_len;
@@ -78,13 +76,10 @@ static void ext4_finish_bio(struct bio *bio)
 		if (!page)
 			continue;
 
-#ifdef CONFIG_FS_ENCRYPTION
-		if (!page->mapping) {
-			/* The bounce data pages are unmapped. */
-			data_page = page;
-			fscrypt_pullback_bio_page(&page, false);
+		if (fscrypt_is_bounce_page(page)) {
+			bounce_page = page;
+			page = fscrypt_pagecache_page(bounce_page);
 		}
-#endif
 
 		if (bio->bi_status) {
 			SetPageError(page);
@@ -111,10 +106,7 @@ static void ext4_finish_bio(struct bio *bio)
 		bit_spin_unlock(BH_Uptodate_Lock, &head->b_state);
 		local_irq_restore(flags);
 		if (!under_io) {
-#ifdef CONFIG_FS_ENCRYPTION
-			if (data_page)
-				fscrypt_restore_control_page(data_page);
-#endif
+			fscrypt_free_bounce_page(bounce_page);
 			end_page_writeback(page);
 		}
 	}
@@ -415,7 +407,7 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
 			struct writeback_control *wbc,
 			bool keep_towrite)
 {
-	struct page *data_page = NULL;
+	struct page *bounce_page = NULL;
 	struct inode *inode = page->mapping->host;
 	unsigned block_start;
 	struct buffer_head *bh, *head;
@@ -479,10 +471,10 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
 		gfp_t gfp_flags = GFP_NOFS;
 
 	retry_encrypt:
-		data_page = fscrypt_encrypt_page(inode, page, PAGE_SIZE, 0,
-						page->index, gfp_flags);
-		if (IS_ERR(data_page)) {
-			ret = PTR_ERR(data_page);
+		bounce_page = fscrypt_encrypt_page(inode, page, PAGE_SIZE, 0,
+						   page->index, gfp_flags);
+		if (IS_ERR(bounce_page)) {
+			ret = PTR_ERR(bounce_page);
 			if (ret == -ENOMEM && wbc->sync_mode == WB_SYNC_ALL) {
 				if (io->io_bio) {
 					ext4_io_submit(io);
@@ -491,7 +483,7 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
 				gfp_flags |= __GFP_NOFAIL;
 				goto retry_encrypt;
 			}
-			data_page = NULL;
+			bounce_page = NULL;
 			goto out;
 		}
 	}
@@ -500,8 +492,7 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
 	do {
 		if (!buffer_async_write(bh))
 			continue;
-		ret = io_submit_add_bh(io, inode,
-				       data_page ? data_page : page, bh);
+		ret = io_submit_add_bh(io, inode, bounce_page ?: page, bh);
 		if (ret) {
 			/*
 			 * We only get here on ENOMEM.  Not much else
@@ -517,8 +508,7 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
 	/* Error stopped previous loop? Clean up buffers... */
 	if (ret) {
 	out:
-		if (data_page)
-			fscrypt_restore_control_page(data_page);
+		fscrypt_free_bounce_page(bounce_page);
 		printk_ratelimited(KERN_ERR "%s: ret = %d\n", __func__, ret);
 		redirty_page_for_writepage(wbc, page);
 		do {
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index eda4181d2092..968ebdbcb583 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -185,7 +185,7 @@ static void f2fs_write_end_io(struct bio *bio)
 			continue;
 		}
 
-		fscrypt_pullback_bio_page(&page, true);
+		fscrypt_finalize_bounce_page(&page);
 
 		if (unlikely(bio->bi_status)) {
 			mapping_set_error(page->mapping, -EIO);
@@ -362,10 +362,9 @@ static bool __has_merged_page(struct f2fs_bio_info *io, struct inode *inode,
 
 	bio_for_each_segment_all(bvec, io->bio, iter_all) {
 
-		if (bvec->bv_page->mapping)
-			target = bvec->bv_page;
-		else
-			target = fscrypt_control_page(bvec->bv_page);
+		target = bvec->bv_page;
+		if (fscrypt_is_bounce_page(target))
+			target = fscrypt_pagecache_page(target);
 
 		if (inode && inode == target->mapping->host)
 			return true;
@@ -1900,8 +1899,7 @@ got_it:
 		err = f2fs_inplace_write_data(fio);
 		if (err) {
 			if (f2fs_encrypted_file(inode))
-				fscrypt_pullback_bio_page(&fio->encrypted_page,
-									true);
+				fscrypt_finalize_bounce_page(&fio->encrypted_page);
 			if (PageWriteback(page))
 				end_page_writeback(page);
 		} else {
diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h
index f7680ef1abd2..d016fa384d60 100644
--- a/include/linux/fscrypt.h
+++ b/include/linux/fscrypt.h
@@ -112,12 +112,17 @@ extern struct page *fscrypt_encrypt_page(const struct inode *, struct page *,
 extern int fscrypt_decrypt_page(const struct inode *, struct page *, unsigned int,
 				unsigned int, u64);
 
-static inline struct page *fscrypt_control_page(struct page *page)
+static inline bool fscrypt_is_bounce_page(struct page *page)
 {
-	return ((struct fscrypt_ctx *)page_private(page))->w.control_page;
+	return page->mapping == NULL;
 }
 
-extern void fscrypt_restore_control_page(struct page *);
+static inline struct page *fscrypt_pagecache_page(struct page *bounce_page)
+{
+	return (struct page *)page_private(bounce_page);
+}
+
+extern void fscrypt_free_bounce_page(struct page *bounce_page);
 
 /* policy.c */
 extern int fscrypt_ioctl_set_policy(struct file *, const void __user *);
@@ -223,7 +228,6 @@ static inline bool fscrypt_match_name(const struct fscrypt_name *fname,
 extern void fscrypt_decrypt_bio(struct bio *);
 extern void fscrypt_enqueue_decrypt_bio(struct fscrypt_ctx *ctx,
 					struct bio *bio);
-extern void fscrypt_pullback_bio_page(struct page **, bool);
 extern int fscrypt_zeroout_range(const struct inode *, pgoff_t, sector_t,
 				 unsigned int);
 
@@ -300,15 +304,19 @@ static inline int fscrypt_decrypt_page(const struct inode *inode,
 	return -EOPNOTSUPP;
 }
 
-static inline struct page *fscrypt_control_page(struct page *page)
+static inline bool fscrypt_is_bounce_page(struct page *page)
+{
+	return false;
+}
+
+static inline struct page *fscrypt_pagecache_page(struct page *bounce_page)
 {
 	WARN_ON_ONCE(1);
 	return ERR_PTR(-EINVAL);
 }
 
-static inline void fscrypt_restore_control_page(struct page *page)
+static inline void fscrypt_free_bounce_page(struct page *bounce_page)
 {
-	return;
 }
 
 /* policy.c */
@@ -410,11 +418,6 @@ static inline void fscrypt_enqueue_decrypt_bio(struct fscrypt_ctx *ctx,
 {
 }
 
-static inline void fscrypt_pullback_bio_page(struct page **page, bool restore)
-{
-	return;
-}
-
 static inline int fscrypt_zeroout_range(const struct inode *inode, pgoff_t lblk,
 					sector_t pblk, unsigned int len)
 {
@@ -692,4 +695,15 @@ static inline int fscrypt_encrypt_symlink(struct inode *inode,
 	return 0;
 }
 
+/* If *pagep is a bounce page, free it and set *pagep to the pagecache page */
+static inline void fscrypt_finalize_bounce_page(struct page **pagep)
+{
+	struct page *page = *pagep;
+
+	if (fscrypt_is_bounce_page(page)) {
+		*pagep = fscrypt_pagecache_page(page);
+		fscrypt_free_bounce_page(page);
+	}
+}
+
 #endif	/* _LINUX_FSCRYPT_H */
-- 
cgit v1.2.3


From 2a415a0257314cb2e49fb9ac4c6770837112f261 Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Mon, 20 May 2019 09:29:40 -0700
Subject: fscrypt: remove the "write" part of struct fscrypt_ctx

Now that fscrypt_ctx is not used for writes, remove the 'w' fields.

Reviewed-by: Chandan Rajendra <chandan@linux.ibm.com>
Signed-off-by: Eric Biggers <ebiggers@google.com>
---
 fs/crypto/bio.c         | 11 +++++------
 fs/crypto/crypto.c      | 14 +++++++-------
 include/linux/fscrypt.h |  7 ++-----
 3 files changed, 14 insertions(+), 18 deletions(-)

(limited to 'include/linux')

diff --git a/fs/crypto/bio.c b/fs/crypto/bio.c
index c857b70b5328..c53425348387 100644
--- a/fs/crypto/bio.c
+++ b/fs/crypto/bio.c
@@ -53,9 +53,8 @@ EXPORT_SYMBOL(fscrypt_decrypt_bio);
 
 static void completion_pages(struct work_struct *work)
 {
-	struct fscrypt_ctx *ctx =
-		container_of(work, struct fscrypt_ctx, r.work);
-	struct bio *bio = ctx->r.bio;
+	struct fscrypt_ctx *ctx = container_of(work, struct fscrypt_ctx, work);
+	struct bio *bio = ctx->bio;
 
 	__fscrypt_decrypt_bio(bio, true);
 	fscrypt_release_ctx(ctx);
@@ -64,9 +63,9 @@ static void completion_pages(struct work_struct *work)
 
 void fscrypt_enqueue_decrypt_bio(struct fscrypt_ctx *ctx, struct bio *bio)
 {
-	INIT_WORK(&ctx->r.work, completion_pages);
-	ctx->r.bio = bio;
-	fscrypt_enqueue_decrypt_work(&ctx->r.work);
+	INIT_WORK(&ctx->work, completion_pages);
+	ctx->bio = bio;
+	fscrypt_enqueue_decrypt_work(&ctx->work);
 }
 EXPORT_SYMBOL(fscrypt_enqueue_decrypt_bio);
 
diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c
index 881e2a69f8a6..9dd7a643eae0 100644
--- a/fs/crypto/crypto.c
+++ b/fs/crypto/crypto.c
@@ -59,11 +59,11 @@ void fscrypt_enqueue_decrypt_work(struct work_struct *work)
 EXPORT_SYMBOL(fscrypt_enqueue_decrypt_work);
 
 /**
- * fscrypt_release_ctx() - Releases an encryption context
- * @ctx: The encryption context to release.
+ * fscrypt_release_ctx() - Release a decryption context
+ * @ctx: The decryption context to release.
  *
- * If the encryption context was allocated from the pre-allocated pool, returns
- * it to that pool. Else, frees it.
+ * If the decryption context was allocated from the pre-allocated pool, return
+ * it to that pool.  Else, free it.
  */
 void fscrypt_release_ctx(struct fscrypt_ctx *ctx)
 {
@@ -80,12 +80,12 @@ void fscrypt_release_ctx(struct fscrypt_ctx *ctx)
 EXPORT_SYMBOL(fscrypt_release_ctx);
 
 /**
- * fscrypt_get_ctx() - Gets an encryption context
+ * fscrypt_get_ctx() - Get a decryption context
  * @gfp_flags:   The gfp flag for memory allocation
  *
- * Allocates and initializes an encryption context.
+ * Allocate and initialize a decryption context.
  *
- * Return: A new encryption context on success; an ERR_PTR() otherwise.
+ * Return: A new decryption context on success; an ERR_PTR() otherwise.
  */
 struct fscrypt_ctx *fscrypt_get_ctx(gfp_t gfp_flags)
 {
diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h
index d016fa384d60..1c7287f146a9 100644
--- a/include/linux/fscrypt.h
+++ b/include/linux/fscrypt.h
@@ -63,16 +63,13 @@ struct fscrypt_operations {
 	unsigned int max_namelen;
 };
 
+/* Decryption work */
 struct fscrypt_ctx {
 	union {
-		struct {
-			struct page *bounce_page;	/* Ciphertext page */
-			struct page *control_page;	/* Original page  */
-		} w;
 		struct {
 			struct bio *bio;
 			struct work_struct work;
-		} r;
+		};
 		struct list_head free_list;	/* Free list */
 	};
 	u8 flags;				/* Flags */
-- 
cgit v1.2.3


From 03569f2fb8e734f281379767de674e23c38b0b14 Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Mon, 20 May 2019 09:29:43 -0700
Subject: fscrypt: introduce fscrypt_encrypt_block_inplace()

fscrypt_encrypt_page() behaves very differently depending on whether the
filesystem set FS_CFLG_OWN_PAGES in its fscrypt_operations.  This makes
the function difficult to understand and document.  It also makes it so
that all callers have to provide inode and lblk_num, when fscrypt could
determine these itself for pagecache pages.

Therefore, move the FS_CFLG_OWN_PAGES behavior into a new function
fscrypt_encrypt_block_inplace().

This is in preparation for allowing encryption on ext4 filesystems with
blocksize != PAGE_SIZE.

Reviewed-by: Chandan Rajendra <chandan@linux.ibm.com>
Signed-off-by: Eric Biggers <ebiggers@google.com>
---
 fs/crypto/crypto.c      | 50 ++++++++++++++++++++++++++++++-------------------
 fs/ubifs/crypto.c       | 12 ++++++------
 include/linux/fscrypt.h | 13 +++++++++++++
 3 files changed, 50 insertions(+), 25 deletions(-)

(limited to 'include/linux')

diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c
index 59337287e580..2969a1dff10b 100644
--- a/fs/crypto/crypto.c
+++ b/fs/crypto/crypto.c
@@ -200,8 +200,7 @@ int fscrypt_crypt_block(const struct inode *inode, fscrypt_direction_t rw,
 /**
  * fscypt_encrypt_page() - Encrypts a page
  * @inode:     The inode for which the encryption should take place
- * @page:      The page to encrypt. Must be locked for bounce-page
- *             encryption.
+ * @page:      The page to encrypt. Must be locked.
  * @len:       Length of data to encrypt in @page and encrypted
  *             data in returned page.
  * @offs:      Offset of data within @page and returned
@@ -211,10 +210,9 @@ int fscrypt_crypt_block(const struct inode *inode, fscrypt_direction_t rw,
  *             previously written data.
  * @gfp_flags: The gfp flag for memory allocation
  *
- * Encrypts @page.  If the filesystem set FS_CFLG_OWN_PAGES, then the data is
- * encrypted in-place and @page is returned.  Else, a bounce page is allocated,
- * the data is encrypted into the bounce page, and the bounce page is returned.
- * The caller is responsible for calling fscrypt_free_bounce_page().
+ * Encrypts @page.  A bounce page is allocated, the data is encrypted into the
+ * bounce page, and the bounce page is returned.  The caller is responsible for
+ * calling fscrypt_free_bounce_page().
  *
  * Return: A page containing the encrypted data on success, else an ERR_PTR()
  */
@@ -225,24 +223,12 @@ struct page *fscrypt_encrypt_page(const struct inode *inode,
 				u64 lblk_num, gfp_t gfp_flags)
 
 {
-	struct page *ciphertext_page = page;
+	struct page *ciphertext_page;
 	int err;
 
-	if (inode->i_sb->s_cop->flags & FS_CFLG_OWN_PAGES) {
-		/* with inplace-encryption we just encrypt the page */
-		err = fscrypt_crypt_block(inode, FS_ENCRYPT, lblk_num, page,
-					  ciphertext_page, len, offs,
-					  gfp_flags);
-		if (err)
-			return ERR_PTR(err);
-
-		return ciphertext_page;
-	}
-
 	if (WARN_ON_ONCE(!PageLocked(page)))
 		return ERR_PTR(-EINVAL);
 
-	/* The encryption operation will require a bounce page. */
 	ciphertext_page = fscrypt_alloc_bounce_page(gfp_flags);
 	if (!ciphertext_page)
 		return ERR_PTR(-ENOMEM);
@@ -259,6 +245,32 @@ struct page *fscrypt_encrypt_page(const struct inode *inode,
 }
 EXPORT_SYMBOL(fscrypt_encrypt_page);
 
+/**
+ * fscrypt_encrypt_block_inplace() - Encrypt a filesystem block in-place
+ * @inode:     The inode to which this block belongs
+ * @page:      The page containing the block to encrypt
+ * @len:       Size of block to encrypt.  Doesn't need to be a multiple of the
+ *		fs block size, but must be a multiple of FS_CRYPTO_BLOCK_SIZE.
+ * @offs:      Byte offset within @page at which the block to encrypt begins
+ * @lblk_num:  Filesystem logical block number of the block, i.e. the 0-based
+ *		number of the block within the file
+ * @gfp_flags: Memory allocation flags
+ *
+ * Encrypt a possibly-compressed filesystem block that is located in an
+ * arbitrary page, not necessarily in the original pagecache page.  The @inode
+ * and @lblk_num must be specified, as they can't be determined from @page.
+ *
+ * Return: 0 on success; -errno on failure
+ */
+int fscrypt_encrypt_block_inplace(const struct inode *inode, struct page *page,
+				  unsigned int len, unsigned int offs,
+				  u64 lblk_num, gfp_t gfp_flags)
+{
+	return fscrypt_crypt_block(inode, FS_ENCRYPT, lblk_num, page, page,
+				   len, offs, gfp_flags);
+}
+EXPORT_SYMBOL(fscrypt_encrypt_block_inplace);
+
 /**
  * fscrypt_decrypt_page() - Decrypts a page in-place
  * @inode:     The corresponding inode for the page to decrypt.
diff --git a/fs/ubifs/crypto.c b/fs/ubifs/crypto.c
index 4aaedf2d7f44..032efdad2e66 100644
--- a/fs/ubifs/crypto.c
+++ b/fs/ubifs/crypto.c
@@ -29,8 +29,8 @@ int ubifs_encrypt(const struct inode *inode, struct ubifs_data_node *dn,
 {
 	struct ubifs_info *c = inode->i_sb->s_fs_info;
 	void *p = &dn->data;
-	struct page *ret;
 	unsigned int pad_len = round_up(in_len, UBIFS_CIPHER_BLOCK_SIZE);
+	int err;
 
 	ubifs_assert(c, pad_len <= *out_len);
 	dn->compr_size = cpu_to_le16(in_len);
@@ -39,11 +39,11 @@ int ubifs_encrypt(const struct inode *inode, struct ubifs_data_node *dn,
 	if (pad_len != in_len)
 		memset(p + in_len, 0, pad_len - in_len);
 
-	ret = fscrypt_encrypt_page(inode, virt_to_page(&dn->data), pad_len,
-			offset_in_page(&dn->data), block, GFP_NOFS);
-	if (IS_ERR(ret)) {
-		ubifs_err(c, "fscrypt_encrypt_page failed: %ld", PTR_ERR(ret));
-		return PTR_ERR(ret);
+	err = fscrypt_encrypt_block_inplace(inode, virt_to_page(p), pad_len,
+					    offset_in_page(p), block, GFP_NOFS);
+	if (err) {
+		ubifs_err(c, "fscrypt_encrypt_block_inplace() failed: %d", err);
+		return err;
 	}
 	*out_len = pad_len;
 
diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h
index 1c7287f146a9..a9b2d26e615d 100644
--- a/include/linux/fscrypt.h
+++ b/include/linux/fscrypt.h
@@ -106,6 +106,10 @@ extern void fscrypt_release_ctx(struct fscrypt_ctx *);
 extern struct page *fscrypt_encrypt_page(const struct inode *, struct page *,
 						unsigned int, unsigned int,
 						u64, gfp_t);
+extern int fscrypt_encrypt_block_inplace(const struct inode *inode,
+					 struct page *page, unsigned int len,
+					 unsigned int offs, u64 lblk_num,
+					 gfp_t gfp_flags);
 extern int fscrypt_decrypt_page(const struct inode *, struct page *, unsigned int,
 				unsigned int, u64);
 
@@ -293,6 +297,15 @@ static inline struct page *fscrypt_encrypt_page(const struct inode *inode,
 	return ERR_PTR(-EOPNOTSUPP);
 }
 
+static inline int fscrypt_encrypt_block_inplace(const struct inode *inode,
+						struct page *page,
+						unsigned int len,
+						unsigned int offs, u64 lblk_num,
+						gfp_t gfp_flags)
+{
+	return -EOPNOTSUPP;
+}
+
 static inline int fscrypt_decrypt_page(const struct inode *inode,
 				       struct page *page,
 				       unsigned int len, unsigned int offs,
-- 
cgit v1.2.3


From 53bc1d854c64c20d967dab15b111baca02a6d99e Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Mon, 20 May 2019 09:29:44 -0700
Subject: fscrypt: support encrypting multiple filesystem blocks per page

Rename fscrypt_encrypt_page() to fscrypt_encrypt_pagecache_blocks() and
redefine its behavior to encrypt all filesystem blocks from the given
region of the given page, rather than assuming that the region consists
of just one filesystem block.  Also remove the 'inode' and 'lblk_num'
parameters, since they can be retrieved from the page as it's already
assumed to be a pagecache page.

This is in preparation for allowing encryption on ext4 filesystems with
blocksize != PAGE_SIZE.

This is based on work by Chandan Rajendra.

Reviewed-by: Chandan Rajendra <chandan@linux.ibm.com>
Signed-off-by: Eric Biggers <ebiggers@google.com>
---
 fs/crypto/crypto.c      | 67 ++++++++++++++++++++++++++++---------------------
 fs/ext4/page-io.c       |  4 +--
 fs/f2fs/data.c          |  5 ++--
 include/linux/fscrypt.h | 17 +++++++------
 4 files changed, 53 insertions(+), 40 deletions(-)

(limited to 'include/linux')

diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c
index 2969a1dff10b..ff43a13c3abf 100644
--- a/fs/crypto/crypto.c
+++ b/fs/crypto/crypto.c
@@ -122,8 +122,8 @@ struct page *fscrypt_alloc_bounce_page(gfp_t gfp_flags)
 /**
  * fscrypt_free_bounce_page() - free a ciphertext bounce page
  *
- * Free a bounce page that was allocated by fscrypt_encrypt_page(), or by
- * fscrypt_alloc_bounce_page() directly.
+ * Free a bounce page that was allocated by fscrypt_encrypt_pagecache_blocks(),
+ * or by fscrypt_alloc_bounce_page() directly.
  */
 void fscrypt_free_bounce_page(struct page *bounce_page)
 {
@@ -198,52 +198,63 @@ int fscrypt_crypt_block(const struct inode *inode, fscrypt_direction_t rw,
 }
 
 /**
- * fscypt_encrypt_page() - Encrypts a page
- * @inode:     The inode for which the encryption should take place
- * @page:      The page to encrypt. Must be locked.
- * @len:       Length of data to encrypt in @page and encrypted
- *             data in returned page.
- * @offs:      Offset of data within @page and returned
- *             page holding encrypted data.
- * @lblk_num:  Logical block number. This must be unique for multiple
- *             calls with same inode, except when overwriting
- *             previously written data.
- * @gfp_flags: The gfp flag for memory allocation
+ * fscrypt_encrypt_pagecache_blocks() - Encrypt filesystem blocks from a pagecache page
+ * @page:      The locked pagecache page containing the block(s) to encrypt
+ * @len:       Total size of the block(s) to encrypt.  Must be a nonzero
+ *		multiple of the filesystem's block size.
+ * @offs:      Byte offset within @page of the first block to encrypt.  Must be
+ *		a multiple of the filesystem's block size.
+ * @gfp_flags: Memory allocation flags
+ *
+ * A new bounce page is allocated, and the specified block(s) are encrypted into
+ * it.  In the bounce page, the ciphertext block(s) will be located at the same
+ * offsets at which the plaintext block(s) were located in the source page; any
+ * other parts of the bounce page will be left uninitialized.  However, normally
+ * blocksize == PAGE_SIZE and the whole page is encrypted at once.
  *
- * Encrypts @page.  A bounce page is allocated, the data is encrypted into the
- * bounce page, and the bounce page is returned.  The caller is responsible for
- * calling fscrypt_free_bounce_page().
+ * This is for use by the filesystem's ->writepages() method.
  *
- * Return: A page containing the encrypted data on success, else an ERR_PTR()
+ * Return: the new encrypted bounce page on success; an ERR_PTR() on failure
  */
-struct page *fscrypt_encrypt_page(const struct inode *inode,
-				struct page *page,
-				unsigned int len,
-				unsigned int offs,
-				u64 lblk_num, gfp_t gfp_flags)
+struct page *fscrypt_encrypt_pagecache_blocks(struct page *page,
+					      unsigned int len,
+					      unsigned int offs,
+					      gfp_t gfp_flags)
 
 {
+	const struct inode *inode = page->mapping->host;
+	const unsigned int blockbits = inode->i_blkbits;
+	const unsigned int blocksize = 1 << blockbits;
 	struct page *ciphertext_page;
+	u64 lblk_num = ((u64)page->index << (PAGE_SHIFT - blockbits)) +
+		       (offs >> blockbits);
+	unsigned int i;
 	int err;
 
 	if (WARN_ON_ONCE(!PageLocked(page)))
 		return ERR_PTR(-EINVAL);
 
+	if (WARN_ON_ONCE(len <= 0 || !IS_ALIGNED(len | offs, blocksize)))
+		return ERR_PTR(-EINVAL);
+
 	ciphertext_page = fscrypt_alloc_bounce_page(gfp_flags);
 	if (!ciphertext_page)
 		return ERR_PTR(-ENOMEM);
 
-	err = fscrypt_crypt_block(inode, FS_ENCRYPT, lblk_num, page,
-				  ciphertext_page, len, offs, gfp_flags);
-	if (err) {
-		fscrypt_free_bounce_page(ciphertext_page);
-		return ERR_PTR(err);
+	for (i = offs; i < offs + len; i += blocksize, lblk_num++) {
+		err = fscrypt_crypt_block(inode, FS_ENCRYPT, lblk_num,
+					  page, ciphertext_page,
+					  blocksize, i, gfp_flags);
+		if (err) {
+			fscrypt_free_bounce_page(ciphertext_page);
+			return ERR_PTR(err);
+		}
 	}
 	SetPagePrivate(ciphertext_page);
 	set_page_private(ciphertext_page, (unsigned long)page);
 	return ciphertext_page;
 }
-EXPORT_SYMBOL(fscrypt_encrypt_page);
+EXPORT_SYMBOL(fscrypt_encrypt_pagecache_blocks);
 
 /**
  * fscrypt_encrypt_block_inplace() - Encrypt a filesystem block in-place
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 13d5ecc0af03..40ee33df5764 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -471,8 +471,8 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
 		gfp_t gfp_flags = GFP_NOFS;
 
 	retry_encrypt:
-		bounce_page = fscrypt_encrypt_page(inode, page, PAGE_SIZE, 0,
-						   page->index, gfp_flags);
+		bounce_page = fscrypt_encrypt_pagecache_blocks(page, PAGE_SIZE,
+							       0, gfp_flags);
 		if (IS_ERR(bounce_page)) {
 			ret = PTR_ERR(bounce_page);
 			if (ret == -ENOMEM && wbc->sync_mode == WB_SYNC_ALL) {
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 968ebdbcb583..a546ac8685ea 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -1726,8 +1726,9 @@ static int encrypt_one_page(struct f2fs_io_info *fio)
 	f2fs_wait_on_block_writeback(inode, fio->old_blkaddr);
 
 retry_encrypt:
-	fio->encrypted_page = fscrypt_encrypt_page(inode, fio->page,
-			PAGE_SIZE, 0, fio->page->index, gfp_flags);
+	fio->encrypted_page = fscrypt_encrypt_pagecache_blocks(fio->page,
+							       PAGE_SIZE, 0,
+							       gfp_flags);
 	if (IS_ERR(fio->encrypted_page)) {
 		/* flush pending IOs and wait for a while in the ENOMEM case */
 		if (PTR_ERR(fio->encrypted_page) == -ENOMEM) {
diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h
index a9b2d26e615d..c7e16bd16a6c 100644
--- a/include/linux/fscrypt.h
+++ b/include/linux/fscrypt.h
@@ -103,9 +103,11 @@ static inline void fscrypt_handle_d_move(struct dentry *dentry)
 extern void fscrypt_enqueue_decrypt_work(struct work_struct *);
 extern struct fscrypt_ctx *fscrypt_get_ctx(gfp_t);
 extern void fscrypt_release_ctx(struct fscrypt_ctx *);
-extern struct page *fscrypt_encrypt_page(const struct inode *, struct page *,
-						unsigned int, unsigned int,
-						u64, gfp_t);
+
+extern struct page *fscrypt_encrypt_pagecache_blocks(struct page *page,
+						     unsigned int len,
+						     unsigned int offs,
+						     gfp_t gfp_flags);
 extern int fscrypt_encrypt_block_inplace(const struct inode *inode,
 					 struct page *page, unsigned int len,
 					 unsigned int offs, u64 lblk_num,
@@ -288,11 +290,10 @@ static inline void fscrypt_release_ctx(struct fscrypt_ctx *ctx)
 	return;
 }
 
-static inline struct page *fscrypt_encrypt_page(const struct inode *inode,
-						struct page *page,
-						unsigned int len,
-						unsigned int offs,
-						u64 lblk_num, gfp_t gfp_flags)
+static inline struct page *fscrypt_encrypt_pagecache_blocks(struct page *page,
+							    unsigned int len,
+							    unsigned int offs,
+							    gfp_t gfp_flags)
 {
 	return ERR_PTR(-EOPNOTSUPP);
 }
-- 
cgit v1.2.3


From 41adbcb7267b0060682576d523956160b5c617bd Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Mon, 20 May 2019 09:29:46 -0700
Subject: fscrypt: introduce fscrypt_decrypt_block_inplace()

Currently fscrypt_decrypt_page() does one of two logically distinct
things depending on whether FS_CFLG_OWN_PAGES is set in the filesystem's
fscrypt_operations: decrypt a pagecache page in-place, or decrypt a
filesystem block in-place in any page.  Currently these happen to share
the same implementation, but this conflates the notion of blocks and
pages.  It also makes it so that all callers have to provide inode and
lblk_num, when fscrypt could determine these itself for pagecache pages.

Therefore, move the FS_CFLG_OWN_PAGES behavior into a new function
fscrypt_decrypt_block_inplace().  This mirrors
fscrypt_encrypt_block_inplace().

This is in preparation for allowing encryption on ext4 filesystems with
blocksize != PAGE_SIZE.

Reviewed-by: Chandan Rajendra <chandan@linux.ibm.com>
Signed-off-by: Eric Biggers <ebiggers@google.com>
---
 fs/crypto/crypto.c      | 31 +++++++++++++++++++++++++++----
 fs/ubifs/crypto.c       |  7 ++++---
 include/linux/fscrypt.h | 11 +++++++++++
 3 files changed, 42 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c
index ff43a13c3abf..f82c45ac285a 100644
--- a/fs/crypto/crypto.c
+++ b/fs/crypto/crypto.c
@@ -285,8 +285,7 @@ EXPORT_SYMBOL(fscrypt_encrypt_block_inplace);
 /**
  * fscrypt_decrypt_page() - Decrypts a page in-place
  * @inode:     The corresponding inode for the page to decrypt.
- * @page:      The page to decrypt. Must be locked in case
- *             it is a writeback page (FS_CFLG_OWN_PAGES unset).
+ * @page:      The page to decrypt. Must be locked.
  * @len:       Number of bytes in @page to be decrypted.
  * @offs:      Start of data in @page.
  * @lblk_num:  Logical block number.
@@ -300,8 +299,7 @@ EXPORT_SYMBOL(fscrypt_encrypt_block_inplace);
 int fscrypt_decrypt_page(const struct inode *inode, struct page *page,
 			unsigned int len, unsigned int offs, u64 lblk_num)
 {
-	if (WARN_ON_ONCE(!PageLocked(page) &&
-			 !(inode->i_sb->s_cop->flags & FS_CFLG_OWN_PAGES)))
+	if (WARN_ON_ONCE(!PageLocked(page)))
 		return -EINVAL;
 
 	return fscrypt_crypt_block(inode, FS_DECRYPT, lblk_num, page, page,
@@ -309,6 +307,31 @@ int fscrypt_decrypt_page(const struct inode *inode, struct page *page,
 }
 EXPORT_SYMBOL(fscrypt_decrypt_page);
 
+/**
+ * fscrypt_decrypt_block_inplace() - Decrypt a filesystem block in-place
+ * @inode:     The inode to which this block belongs
+ * @page:      The page containing the block to decrypt
+ * @len:       Size of block to decrypt.  Doesn't need to be a multiple of the
+ *		fs block size, but must be a multiple of FS_CRYPTO_BLOCK_SIZE.
+ * @offs:      Byte offset within @page at which the block to decrypt begins
+ * @lblk_num:  Filesystem logical block number of the block, i.e. the 0-based
+ *		number of the block within the file
+ *
+ * Decrypt a possibly-compressed filesystem block that is located in an
+ * arbitrary page, not necessarily in the original pagecache page.  The @inode
+ * and @lblk_num must be specified, as they can't be determined from @page.
+ *
+ * Return: 0 on success; -errno on failure
+ */
+int fscrypt_decrypt_block_inplace(const struct inode *inode, struct page *page,
+				  unsigned int len, unsigned int offs,
+				  u64 lblk_num)
+{
+	return fscrypt_crypt_block(inode, FS_DECRYPT, lblk_num, page, page,
+				   len, offs, GFP_NOFS);
+}
+EXPORT_SYMBOL(fscrypt_decrypt_block_inplace);
+
 /*
  * Validate dentries in encrypted directories to make sure we aren't potentially
  * caching stale dentries after a key has been added.
diff --git a/fs/ubifs/crypto.c b/fs/ubifs/crypto.c
index 032efdad2e66..22be7aeb96c4 100644
--- a/fs/ubifs/crypto.c
+++ b/fs/ubifs/crypto.c
@@ -64,10 +64,11 @@ int ubifs_decrypt(const struct inode *inode, struct ubifs_data_node *dn,
 	}
 
 	ubifs_assert(c, dlen <= UBIFS_BLOCK_SIZE);
-	err = fscrypt_decrypt_page(inode, virt_to_page(&dn->data), dlen,
-			offset_in_page(&dn->data), block);
+	err = fscrypt_decrypt_block_inplace(inode, virt_to_page(&dn->data),
+					    dlen, offset_in_page(&dn->data),
+					    block);
 	if (err) {
-		ubifs_err(c, "fscrypt_decrypt_page failed: %i", err);
+		ubifs_err(c, "fscrypt_decrypt_block_inplace() failed: %d", err);
 		return err;
 	}
 	*out_len = clen;
diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h
index c7e16bd16a6c..315affc99b05 100644
--- a/include/linux/fscrypt.h
+++ b/include/linux/fscrypt.h
@@ -114,6 +114,9 @@ extern int fscrypt_encrypt_block_inplace(const struct inode *inode,
 					 gfp_t gfp_flags);
 extern int fscrypt_decrypt_page(const struct inode *, struct page *, unsigned int,
 				unsigned int, u64);
+extern int fscrypt_decrypt_block_inplace(const struct inode *inode,
+					 struct page *page, unsigned int len,
+					 unsigned int offs, u64 lblk_num);
 
 static inline bool fscrypt_is_bounce_page(struct page *page)
 {
@@ -315,6 +318,14 @@ static inline int fscrypt_decrypt_page(const struct inode *inode,
 	return -EOPNOTSUPP;
 }
 
+static inline int fscrypt_decrypt_block_inplace(const struct inode *inode,
+						struct page *page,
+						unsigned int len,
+						unsigned int offs, u64 lblk_num)
+{
+	return -EOPNOTSUPP;
+}
+
 static inline bool fscrypt_is_bounce_page(struct page *page)
 {
 	return false;
-- 
cgit v1.2.3


From aa8bc1ac6ef32a332671ca25e06cfd277a3839a5 Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Mon, 20 May 2019 09:29:47 -0700
Subject: fscrypt: support decrypting multiple filesystem blocks per page

Rename fscrypt_decrypt_page() to fscrypt_decrypt_pagecache_blocks() and
redefine its behavior to decrypt all filesystem blocks in the given
region of the given page, rather than assuming that the region consists
of just one filesystem block.  Also remove the 'inode' and 'lblk_num'
parameters, since they can be retrieved from the page as it's already
assumed to be a pagecache page.

This is in preparation for allowing encryption on ext4 filesystems with
blocksize != PAGE_SIZE.

This is based on work by Chandan Rajendra.

Reviewed-by: Chandan Rajendra <chandan@linux.ibm.com>
Signed-off-by: Eric Biggers <ebiggers@google.com>
---
 fs/crypto/bio.c         |  3 +--
 fs/crypto/crypto.c      | 46 ++++++++++++++++++++++++++++++++--------------
 fs/ext4/inode.c         |  7 +++----
 include/linux/fscrypt.h | 12 ++++++------
 4 files changed, 42 insertions(+), 26 deletions(-)

(limited to 'include/linux')

diff --git a/fs/crypto/bio.c b/fs/crypto/bio.c
index f9111ffa12ff..61da06fda45c 100644
--- a/fs/crypto/bio.c
+++ b/fs/crypto/bio.c
@@ -33,8 +33,7 @@ static void __fscrypt_decrypt_bio(struct bio *bio, bool done)
 
 	bio_for_each_segment_all(bv, bio, iter_all) {
 		struct page *page = bv->bv_page;
-		int ret = fscrypt_decrypt_page(page->mapping->host, page,
-				PAGE_SIZE, 0, page->index);
+		int ret = fscrypt_decrypt_pagecache_blocks(page, PAGE_SIZE, 0);
 
 		if (ret)
 			SetPageError(page);
diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c
index f82c45ac285a..45c3d0427fb2 100644
--- a/fs/crypto/crypto.c
+++ b/fs/crypto/crypto.c
@@ -283,29 +283,47 @@ int fscrypt_encrypt_block_inplace(const struct inode *inode, struct page *page,
 EXPORT_SYMBOL(fscrypt_encrypt_block_inplace);
 
 /**
- * fscrypt_decrypt_page() - Decrypts a page in-place
- * @inode:     The corresponding inode for the page to decrypt.
- * @page:      The page to decrypt. Must be locked.
- * @len:       Number of bytes in @page to be decrypted.
- * @offs:      Start of data in @page.
- * @lblk_num:  Logical block number.
+ * fscrypt_decrypt_pagecache_blocks() - Decrypt filesystem blocks in a pagecache page
+ * @page:      The locked pagecache page containing the block(s) to decrypt
+ * @len:       Total size of the block(s) to decrypt.  Must be a nonzero
+ *		multiple of the filesystem's block size.
+ * @offs:      Byte offset within @page of the first block to decrypt.  Must be
+ *		a multiple of the filesystem's block size.
  *
- * Decrypts page in-place using the ctx encryption context.
+ * The specified block(s) are decrypted in-place within the pagecache page,
+ * which must still be locked and not uptodate.  Normally, blocksize ==
+ * PAGE_SIZE and the whole page is decrypted at once.
  *
- * Called from the read completion callback.
+ * This is for use by the filesystem's ->readpages() method.
  *
- * Return: Zero on success, non-zero otherwise.
+ * Return: 0 on success; -errno on failure
  */
-int fscrypt_decrypt_page(const struct inode *inode, struct page *page,
-			unsigned int len, unsigned int offs, u64 lblk_num)
+int fscrypt_decrypt_pagecache_blocks(struct page *page, unsigned int len,
+				     unsigned int offs)
 {
+	const struct inode *inode = page->mapping->host;
+	const unsigned int blockbits = inode->i_blkbits;
+	const unsigned int blocksize = 1 << blockbits;
+	u64 lblk_num = ((u64)page->index << (PAGE_SHIFT - blockbits)) +
+		       (offs >> blockbits);
+	unsigned int i;
+	int err;
+
 	if (WARN_ON_ONCE(!PageLocked(page)))
 		return -EINVAL;
 
-	return fscrypt_crypt_block(inode, FS_DECRYPT, lblk_num, page, page,
-				   len, offs, GFP_NOFS);
+	if (WARN_ON_ONCE(len <= 0 || !IS_ALIGNED(len | offs, blocksize)))
+		return -EINVAL;
+
+	for (i = offs; i < offs + len; i += blocksize, lblk_num++) {
+		err = fscrypt_crypt_block(inode, FS_DECRYPT, lblk_num, page,
+					  page, blocksize, i, GFP_NOFS);
+		if (err)
+			return err;
+	}
+	return 0;
 }
-EXPORT_SYMBOL(fscrypt_decrypt_page);
+EXPORT_SYMBOL(fscrypt_decrypt_pagecache_blocks);
 
 /**
  * fscrypt_decrypt_block_inplace() - Decrypt a filesystem block in-place
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index c7f77c643008..8bfd8941f5ff 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1232,8 +1232,7 @@ static int ext4_block_write_begin(struct page *page, loff_t pos, unsigned len,
 	if (unlikely(err))
 		page_zero_new_buffers(page, from, to);
 	else if (decrypt)
-		err = fscrypt_decrypt_page(page->mapping->host, page,
-				PAGE_SIZE, 0, page->index);
+		err = fscrypt_decrypt_pagecache_blocks(page, PAGE_SIZE, 0);
 	return err;
 }
 #endif
@@ -4066,8 +4065,8 @@ static int __ext4_block_zero_page_range(handle_t *handle,
 			/* We expect the key to be set. */
 			BUG_ON(!fscrypt_has_encryption_key(inode));
 			BUG_ON(blocksize != PAGE_SIZE);
-			WARN_ON_ONCE(fscrypt_decrypt_page(page->mapping->host,
-						page, PAGE_SIZE, 0, page->index));
+			WARN_ON_ONCE(fscrypt_decrypt_pagecache_blocks(
+						page, PAGE_SIZE, 0));
 		}
 	}
 	if (ext4_should_journal_data(inode)) {
diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h
index 315affc99b05..bd8f207a2fb6 100644
--- a/include/linux/fscrypt.h
+++ b/include/linux/fscrypt.h
@@ -112,8 +112,9 @@ extern int fscrypt_encrypt_block_inplace(const struct inode *inode,
 					 struct page *page, unsigned int len,
 					 unsigned int offs, u64 lblk_num,
 					 gfp_t gfp_flags);
-extern int fscrypt_decrypt_page(const struct inode *, struct page *, unsigned int,
-				unsigned int, u64);
+
+extern int fscrypt_decrypt_pagecache_blocks(struct page *page, unsigned int len,
+					    unsigned int offs);
 extern int fscrypt_decrypt_block_inplace(const struct inode *inode,
 					 struct page *page, unsigned int len,
 					 unsigned int offs, u64 lblk_num);
@@ -310,10 +311,9 @@ static inline int fscrypt_encrypt_block_inplace(const struct inode *inode,
 	return -EOPNOTSUPP;
 }
 
-static inline int fscrypt_decrypt_page(const struct inode *inode,
-				       struct page *page,
-				       unsigned int len, unsigned int offs,
-				       u64 lblk_num)
+static inline int fscrypt_decrypt_pagecache_blocks(struct page *page,
+						   unsigned int len,
+						   unsigned int offs)
 {
 	return -EOPNOTSUPP;
 }
-- 
cgit v1.2.3


From 54e9c9d4b506b611228890752d1cfa960e0965e1 Mon Sep 17 00:00:00 2001
From: Stanislav Fomichev <sdf@google.com>
Date: Tue, 28 May 2019 14:14:41 -0700
Subject: bpf: remove __rcu annotations from bpf_prog_array

Drop __rcu annotations and rcu read sections from bpf_prog_array
helper functions. They are not needed since all existing callers
call those helpers from the rcu update side while holding a mutex.
This guarantees that use-after-free could not happen.

In the next patches I'll fix the callers with missing
rcu_dereference_protected to make sparse/lockdep happy. The proper
way to use these helpers is:

	struct bpf_prog_array __rcu *progs = ...;
	struct bpf_prog_array *p;

	mutex_lock(&mtx);
	p = rcu_dereference_protected(progs, lockdep_is_held(&mtx));
	bpf_prog_array_length(p);
	bpf_prog_array_copy_to_user(p, ...);
	bpf_prog_array_delete_safe(p, ...);
	bpf_prog_array_copy_info(p, ...);
	bpf_prog_array_copy(p, ...);
	bpf_prog_array_free(p);
	mutex_unlock(&mtx);

No functional changes! rcu_dereference_protected with lockdep_is_held
should catch any cases where we update prog array without a mutex
(I've looked at existing call sites and I think we hold a mutex
everywhere).

Motivation is to fix sparse warnings:
kernel/bpf/core.c:1803:9: warning: incorrect type in argument 1 (different address spaces)
kernel/bpf/core.c:1803:9:    expected struct callback_head *head
kernel/bpf/core.c:1803:9:    got struct callback_head [noderef] <asn:4> *
kernel/bpf/core.c:1877:44: warning: incorrect type in initializer (different address spaces)
kernel/bpf/core.c:1877:44:    expected struct bpf_prog_array_item *item
kernel/bpf/core.c:1877:44:    got struct bpf_prog_array_item [noderef] <asn:4> *
kernel/bpf/core.c:1901:26: warning: incorrect type in assignment (different address spaces)
kernel/bpf/core.c:1901:26:    expected struct bpf_prog_array_item *existing
kernel/bpf/core.c:1901:26:    got struct bpf_prog_array_item [noderef] <asn:4> *
kernel/bpf/core.c:1935:26: warning: incorrect type in assignment (different address spaces)
kernel/bpf/core.c:1935:26:    expected struct bpf_prog_array_item *[assigned] existing
kernel/bpf/core.c:1935:26:    got struct bpf_prog_array_item [noderef] <asn:4> *

v2:
* remove comment about potential race; that can't happen
  because all callers are in rcu-update section

Cc: Roman Gushchin <guro@fb.com>
Acked-by: Roman Gushchin <guro@fb.com>
Signed-off-by: Stanislav Fomichev <sdf@google.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/linux/bpf.h | 12 ++++++------
 kernel/bpf/core.c   | 37 +++++++++++++------------------------
 2 files changed, 19 insertions(+), 30 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index d98141edb74b..ff3e00ff84d2 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -514,17 +514,17 @@ struct bpf_prog_array {
 };
 
 struct bpf_prog_array *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags);
-void bpf_prog_array_free(struct bpf_prog_array __rcu *progs);
-int bpf_prog_array_length(struct bpf_prog_array __rcu *progs);
-int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *progs,
+void bpf_prog_array_free(struct bpf_prog_array *progs);
+int bpf_prog_array_length(struct bpf_prog_array *progs);
+int bpf_prog_array_copy_to_user(struct bpf_prog_array *progs,
 				__u32 __user *prog_ids, u32 cnt);
 
-void bpf_prog_array_delete_safe(struct bpf_prog_array __rcu *progs,
+void bpf_prog_array_delete_safe(struct bpf_prog_array *progs,
 				struct bpf_prog *old_prog);
-int bpf_prog_array_copy_info(struct bpf_prog_array __rcu *array,
+int bpf_prog_array_copy_info(struct bpf_prog_array *array,
 			     u32 *prog_ids, u32 request_cnt,
 			     u32 *prog_cnt);
-int bpf_prog_array_copy(struct bpf_prog_array __rcu *old_array,
+int bpf_prog_array_copy(struct bpf_prog_array *old_array,
 			struct bpf_prog *exclude_prog,
 			struct bpf_prog *include_prog,
 			struct bpf_prog_array **new_array);
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 3675b19ecb90..33fb292f2e30 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -1795,38 +1795,33 @@ struct bpf_prog_array *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags)
 	return &empty_prog_array.hdr;
 }
 
-void bpf_prog_array_free(struct bpf_prog_array __rcu *progs)
+void bpf_prog_array_free(struct bpf_prog_array *progs)
 {
-	if (!progs ||
-	    progs == (struct bpf_prog_array __rcu *)&empty_prog_array.hdr)
+	if (!progs || progs == &empty_prog_array.hdr)
 		return;
 	kfree_rcu(progs, rcu);
 }
 
-int bpf_prog_array_length(struct bpf_prog_array __rcu *array)
+int bpf_prog_array_length(struct bpf_prog_array *array)
 {
 	struct bpf_prog_array_item *item;
 	u32 cnt = 0;
 
-	rcu_read_lock();
-	item = rcu_dereference(array)->items;
-	for (; item->prog; item++)
+	for (item = array->items; item->prog; item++)
 		if (item->prog != &dummy_bpf_prog.prog)
 			cnt++;
-	rcu_read_unlock();
 	return cnt;
 }
 
 
-static bool bpf_prog_array_copy_core(struct bpf_prog_array __rcu *array,
+static bool bpf_prog_array_copy_core(struct bpf_prog_array *array,
 				     u32 *prog_ids,
 				     u32 request_cnt)
 {
 	struct bpf_prog_array_item *item;
 	int i = 0;
 
-	item = rcu_dereference_check(array, 1)->items;
-	for (; item->prog; item++) {
+	for (item = array->items; item->prog; item++) {
 		if (item->prog == &dummy_bpf_prog.prog)
 			continue;
 		prog_ids[i] = item->prog->aux->id;
@@ -1839,7 +1834,7 @@ static bool bpf_prog_array_copy_core(struct bpf_prog_array __rcu *array,
 	return !!(item->prog);
 }
 
-int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *array,
+int bpf_prog_array_copy_to_user(struct bpf_prog_array *array,
 				__u32 __user *prog_ids, u32 cnt)
 {
 	unsigned long err = 0;
@@ -1850,18 +1845,12 @@ int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *array,
 	 * cnt = bpf_prog_array_length();
 	 * if (cnt > 0)
 	 *     bpf_prog_array_copy_to_user(..., cnt);
-	 * so below kcalloc doesn't need extra cnt > 0 check, but
-	 * bpf_prog_array_length() releases rcu lock and
-	 * prog array could have been swapped with empty or larger array,
-	 * so always copy 'cnt' prog_ids to the user.
-	 * In a rare race the user will see zero prog_ids
+	 * so below kcalloc doesn't need extra cnt > 0 check.
 	 */
 	ids = kcalloc(cnt, sizeof(u32), GFP_USER | __GFP_NOWARN);
 	if (!ids)
 		return -ENOMEM;
-	rcu_read_lock();
 	nospc = bpf_prog_array_copy_core(array, ids, cnt);
-	rcu_read_unlock();
 	err = copy_to_user(prog_ids, ids, cnt * sizeof(u32));
 	kfree(ids);
 	if (err)
@@ -1871,19 +1860,19 @@ int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *array,
 	return 0;
 }
 
-void bpf_prog_array_delete_safe(struct bpf_prog_array __rcu *array,
+void bpf_prog_array_delete_safe(struct bpf_prog_array *array,
 				struct bpf_prog *old_prog)
 {
-	struct bpf_prog_array_item *item = array->items;
+	struct bpf_prog_array_item *item;
 
-	for (; item->prog; item++)
+	for (item = array->items; item->prog; item++)
 		if (item->prog == old_prog) {
 			WRITE_ONCE(item->prog, &dummy_bpf_prog.prog);
 			break;
 		}
 }
 
-int bpf_prog_array_copy(struct bpf_prog_array __rcu *old_array,
+int bpf_prog_array_copy(struct bpf_prog_array *old_array,
 			struct bpf_prog *exclude_prog,
 			struct bpf_prog *include_prog,
 			struct bpf_prog_array **new_array)
@@ -1947,7 +1936,7 @@ int bpf_prog_array_copy(struct bpf_prog_array __rcu *old_array,
 	return 0;
 }
 
-int bpf_prog_array_copy_info(struct bpf_prog_array __rcu *array,
+int bpf_prog_array_copy_info(struct bpf_prog_array *array,
 			     u32 *prog_ids, u32 request_cnt,
 			     u32 *prog_cnt)
 {
-- 
cgit v1.2.3


From dbcc1ba26e43bd32cb308e50ac4cb4a29d2f5967 Mon Sep 17 00:00:00 2001
From: Stanislav Fomichev <sdf@google.com>
Date: Tue, 28 May 2019 14:14:43 -0700
Subject: bpf: cgroup: properly use bpf_prog_array api

Now that we don't have __rcu markers on the bpf_prog_array helpers,
let's use proper rcu_dereference_protected to obtain array pointer
under mutex.

We also don't need __rcu annotations on cgroup_bpf.inactive since
it's not read/updated concurrently.

v4:
* drop cgroup_rcu_xyz wrappers and use rcu APIs directly; this
  presumably makes it clearer which mutex/refcount protects each
  particular place

v3:
* amend cgroup_rcu_dereference to include percpu_ref_is_dying;
  cgroup_bpf is now reference counted and we don't hold cgroup_mutex
  anymore in cgroup_bpf_release

v2:
* replace xchg with rcu_swap_protected

Cc: Roman Gushchin <guro@fb.com>
Signed-off-by: Stanislav Fomichev <sdf@google.com>
Acked-by: Roman Gushchin <guro@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/linux/bpf-cgroup.h |  2 +-
 kernel/bpf/cgroup.c        | 28 +++++++++++++++++-----------
 2 files changed, 18 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
index 9f100fc422c3..b631ee75762d 100644
--- a/include/linux/bpf-cgroup.h
+++ b/include/linux/bpf-cgroup.h
@@ -72,7 +72,7 @@ struct cgroup_bpf {
 	u32 flags[MAX_BPF_ATTACH_TYPE];
 
 	/* temp storage for effective prog array used by prog_attach/detach */
-	struct bpf_prog_array __rcu *inactive;
+	struct bpf_prog_array *inactive;
 
 	/* reference counter used to detach bpf programs after cgroup removal */
 	struct percpu_ref refcnt;
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index d995edbe816d..ff594eb86fd7 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -38,6 +38,7 @@ static void cgroup_bpf_release(struct work_struct *work)
 	struct cgroup *cgrp = container_of(work, struct cgroup,
 					   bpf.release_work);
 	enum bpf_cgroup_storage_type stype;
+	struct bpf_prog_array *old_array;
 	unsigned int type;
 
 	for (type = 0; type < ARRAY_SIZE(cgrp->bpf.progs); type++) {
@@ -54,7 +55,10 @@ static void cgroup_bpf_release(struct work_struct *work)
 			kfree(pl);
 			static_branch_dec(&cgroup_bpf_enabled_key);
 		}
-		bpf_prog_array_free(cgrp->bpf.effective[type]);
+		old_array = rcu_dereference_protected(
+				cgrp->bpf.effective[type],
+				percpu_ref_is_dying(&cgrp->bpf.refcnt));
+		bpf_prog_array_free(old_array);
 	}
 
 	percpu_ref_exit(&cgrp->bpf.refcnt);
@@ -126,7 +130,7 @@ static bool hierarchy_allows_attach(struct cgroup *cgrp,
  */
 static int compute_effective_progs(struct cgroup *cgrp,
 				   enum bpf_attach_type type,
-				   struct bpf_prog_array __rcu **array)
+				   struct bpf_prog_array **array)
 {
 	enum bpf_cgroup_storage_type stype;
 	struct bpf_prog_array *progs;
@@ -164,17 +168,16 @@ static int compute_effective_progs(struct cgroup *cgrp,
 		}
 	} while ((p = cgroup_parent(p)));
 
-	rcu_assign_pointer(*array, progs);
+	*array = progs;
 	return 0;
 }
 
 static void activate_effective_progs(struct cgroup *cgrp,
 				     enum bpf_attach_type type,
-				     struct bpf_prog_array __rcu *array)
+				     struct bpf_prog_array *old_array)
 {
-	struct bpf_prog_array __rcu *old_array;
-
-	old_array = xchg(&cgrp->bpf.effective[type], array);
+	rcu_swap_protected(cgrp->bpf.effective[type], old_array,
+			   lockdep_is_held(&cgroup_mutex));
 	/* free prog array after grace period, since __cgroup_bpf_run_*()
 	 * might be still walking the array
 	 */
@@ -191,7 +194,7 @@ int cgroup_bpf_inherit(struct cgroup *cgrp)
  * that array below is variable length
  */
 #define	NR ARRAY_SIZE(cgrp->bpf.effective)
-	struct bpf_prog_array __rcu *arrays[NR] = {};
+	struct bpf_prog_array *arrays[NR] = {};
 	int ret, i;
 
 	ret = percpu_ref_init(&cgrp->bpf.refcnt, cgroup_bpf_release_fn, 0,
@@ -477,10 +480,14 @@ int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
 	enum bpf_attach_type type = attr->query.attach_type;
 	struct list_head *progs = &cgrp->bpf.progs[type];
 	u32 flags = cgrp->bpf.flags[type];
+	struct bpf_prog_array *effective;
 	int cnt, ret = 0, i;
 
+	effective = rcu_dereference_protected(cgrp->bpf.effective[type],
+					      lockdep_is_held(&cgroup_mutex));
+
 	if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE)
-		cnt = bpf_prog_array_length(cgrp->bpf.effective[type]);
+		cnt = bpf_prog_array_length(effective);
 	else
 		cnt = prog_list_length(progs);
 
@@ -497,8 +504,7 @@ int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
 	}
 
 	if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE) {
-		return bpf_prog_array_copy_to_user(cgrp->bpf.effective[type],
-						   prog_ids, cnt);
+		return bpf_prog_array_copy_to_user(effective, prog_ids, cnt);
 	} else {
 		struct bpf_prog_list *pl;
 		u32 id;
-- 
cgit v1.2.3


From 91ca180dbdd687d45fe4aab055b02d29c91b90df Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Wed, 6 Feb 2019 16:39:13 -0600
Subject: signal: Use force_sig_fault_to_task for the two calls that don't
 deliver to current

In preparation for removing the task parameter from force_sig_fault
introduce force_sig_fault_to_task and use it for the two cases where
it matters.

On mips force_fcr31_sig calls force_sig_fault and is called on either
the current task, or a task that is suspended and is being switched to
by the scheduler.  This is safe because the task being switched to by
the scheduler is guaranteed to be suspended.  This ensures that
task->sighand is stable while the signal is delivered to it.

On parisc user_enable_single_step calls force_sig_fault and is in turn
called by ptrace_request.  The function ptrace_request always calls
user_enable_single_step on a child that is stopped for tracing.  The
child being traced and not reaped ensures that child->sighand is not
NULL, and that the child will not change child->sighand.

Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 arch/mips/kernel/traps.c     |  2 +-
 arch/parisc/kernel/ptrace.c  |  6 +++---
 include/linux/sched/signal.h |  4 ++++
 kernel/signal.c              | 12 +++++++++++-
 4 files changed, 19 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/arch/mips/kernel/traps.c b/arch/mips/kernel/traps.c
index a6031b045b95..62df48b6fb46 100644
--- a/arch/mips/kernel/traps.c
+++ b/arch/mips/kernel/traps.c
@@ -733,7 +733,7 @@ void force_fcr31_sig(unsigned long fcr31, void __user *fault_addr,
 	else if (fcr31 & FPU_CSR_INE_X)
 		si_code = FPE_FLTRES;
 
-	force_sig_fault(SIGFPE, si_code, fault_addr, tsk);
+	force_sig_fault_to_task(SIGFPE, si_code, fault_addr, tsk);
 }
 
 int process_fpemu_return(int sig, void __user *fault_addr, unsigned long fcr31)
diff --git a/arch/parisc/kernel/ptrace.c b/arch/parisc/kernel/ptrace.c
index a3d2fb4e6dd2..f642ba378ffa 100644
--- a/arch/parisc/kernel/ptrace.c
+++ b/arch/parisc/kernel/ptrace.c
@@ -88,9 +88,9 @@ void user_enable_single_step(struct task_struct *task)
 		ptrace_disable(task);
 		/* Don't wake up the task, but let the
 		   parent know something happened. */
-		force_sig_fault(SIGTRAP, TRAP_TRACE,
-				(void __user *) (task_regs(task)->iaoq[0] & ~3),
-				task);
+		force_sig_fault_to_task(SIGTRAP, TRAP_TRACE,
+					(void __user *) (task_regs(task)->iaoq[0] & ~3),
+					task);
 		/* notify_parent(task, SIGCHLD); */
 		return;
 	}
diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h
index 4178bb1f7709..507af66a1fc8 100644
--- a/include/linux/sched/signal.h
+++ b/include/linux/sched/signal.h
@@ -307,6 +307,10 @@ static inline void kernel_signal_stop(void)
 # define ___ARCH_SI_IA64(_a1, _a2, _a3)
 #endif
 
+int force_sig_fault_to_task(int sig, int code, void __user *addr
+	___ARCH_SI_TRAPNO(int trapno)
+	___ARCH_SI_IA64(int imm, unsigned int flags, unsigned long isr)
+	, struct task_struct *t);
 int force_sig_fault(int sig, int code, void __user *addr
 	___ARCH_SI_TRAPNO(int trapno)
 	___ARCH_SI_IA64(int imm, unsigned int flags, unsigned long isr)
diff --git a/kernel/signal.c b/kernel/signal.c
index 398489facf9f..e420489ac4c9 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1620,7 +1620,7 @@ void force_sigsegv(int sig)
 	force_sig(SIGSEGV);
 }
 
-int force_sig_fault(int sig, int code, void __user *addr
+int force_sig_fault_to_task(int sig, int code, void __user *addr
 	___ARCH_SI_TRAPNO(int trapno)
 	___ARCH_SI_IA64(int imm, unsigned int flags, unsigned long isr)
 	, struct task_struct *t)
@@ -1643,6 +1643,16 @@ int force_sig_fault(int sig, int code, void __user *addr
 	return force_sig_info(info.si_signo, &info, t);
 }
 
+int force_sig_fault(int sig, int code, void __user *addr
+	___ARCH_SI_TRAPNO(int trapno)
+	___ARCH_SI_IA64(int imm, unsigned int flags, unsigned long isr)
+	, struct task_struct *t)
+{
+	return force_sig_fault_to_task(sig, code, addr
+				       ___ARCH_SI_TRAPNO(trapno)
+				       ___ARCH_SI_IA64(imm, flags, isr), t);
+}
+
 int send_sig_fault(int sig, int code, void __user *addr
 	___ARCH_SI_TRAPNO(int trapno)
 	___ARCH_SI_IA64(int imm, unsigned int flags, unsigned long isr)
-- 
cgit v1.2.3


From 2e1661d2673667d886cd40ad9f414cb6db48d8da Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Thu, 23 May 2019 11:04:24 -0500
Subject: signal: Remove the task parameter from force_sig_fault

As synchronous exceptions really only make sense against the current
task (otherwise how are you synchronous) remove the task parameter
from force_sig_fault to make it explicit that is what is going
on.

The two known exceptions that deliver a synchronous exception to a
stopped ptraced task have already been changed to
force_sig_fault_to_task.

The callers have been changed with the following emacs regular expression
(with obvious variations on the architectures that take more arguments)
to avoid typos:

force_sig_fault[(]\([^,]+\)[,]\([^,]+\)[,]\([^,]+\)[,]\W+current[)]
->
force_sig_fault(\1,\2,\3)

Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 arch/alpha/kernel/traps.c                 |  2 +-
 arch/alpha/mm/fault.c                     |  4 ++--
 arch/arc/kernel/traps.c                   |  2 +-
 arch/arc/mm/fault.c                       |  4 ++--
 arch/arm/kernel/ptrace.c                  |  2 +-
 arch/arm/kernel/traps.c                   |  2 +-
 arch/arm/mm/alignment.c                   |  2 +-
 arch/arm/mm/fault.c                       |  2 +-
 arch/arm64/kernel/traps.c                 |  2 +-
 arch/c6x/kernel/traps.c                   |  2 +-
 arch/csky/abiv1/alignment.c               |  2 +-
 arch/csky/abiv2/fpu.c                     |  2 +-
 arch/csky/kernel/traps.c                  |  2 +-
 arch/csky/mm/fault.c                      |  4 ++--
 arch/hexagon/kernel/traps.c               |  2 +-
 arch/hexagon/mm/vm_fault.c                |  4 ++--
 arch/ia64/kernel/brl_emu.c                |  6 ++---
 arch/ia64/kernel/traps.c                  | 18 +++++++-------
 arch/ia64/kernel/unaligned.c              |  2 +-
 arch/ia64/mm/fault.c                      |  2 +-
 arch/m68k/kernel/traps.c                  |  4 ++--
 arch/m68k/mm/fault.c                      |  4 ++--
 arch/microblaze/kernel/exceptions.c       |  2 +-
 arch/microblaze/mm/fault.c                |  2 +-
 arch/mips/kernel/traps.c                  | 12 +++++-----
 arch/mips/mm/fault.c                      |  4 ++--
 arch/nds32/kernel/fpu.c                   |  2 +-
 arch/nds32/kernel/traps.c                 |  4 ++--
 arch/nds32/mm/fault.c                     |  4 ++--
 arch/nios2/kernel/traps.c                 |  2 +-
 arch/openrisc/kernel/traps.c              |  8 +++----
 arch/openrisc/mm/fault.c                  |  4 ++--
 arch/parisc/kernel/traps.c                | 14 +++++------
 arch/parisc/kernel/unaligned.c            |  4 ++--
 arch/parisc/math-emu/driver.c             |  2 +-
 arch/parisc/mm/fault.c                    |  2 +-
 arch/powerpc/kernel/process.c             |  2 +-
 arch/powerpc/kernel/traps.c               |  4 ++--
 arch/powerpc/mm/fault.c                   |  2 +-
 arch/powerpc/platforms/cell/spufs/fault.c |  9 ++++---
 arch/riscv/kernel/traps.c                 |  4 ++--
 arch/s390/kernel/traps.c                  |  6 ++---
 arch/s390/mm/fault.c                      |  6 ++---
 arch/sh/kernel/hw_breakpoint.c            |  2 +-
 arch/sh/kernel/traps_32.c                 |  4 ++--
 arch/sh/math-emu/math.c                   |  2 +-
 arch/sh/mm/fault.c                        |  2 +-
 arch/sparc/kernel/process_64.c            |  2 +-
 arch/sparc/kernel/sys_sparc_32.c          |  2 +-
 arch/sparc/kernel/sys_sparc_64.c          |  2 +-
 arch/sparc/kernel/traps_32.c              |  4 ++--
 arch/sparc/kernel/traps_64.c              | 39 ++++++++++++++-----------------
 arch/sparc/mm/fault_32.c                  |  2 +-
 arch/sparc/mm/fault_64.c                  |  2 +-
 arch/um/kernel/ptrace.c                   |  3 +--
 arch/um/kernel/trap.c                     | 12 ++++------
 arch/unicore32/kernel/traps.c             |  2 +-
 arch/unicore32/mm/fault.c                 |  2 +-
 arch/x86/entry/vsyscall/vsyscall_64.c     |  2 +-
 arch/x86/kernel/ptrace.c                  |  2 +-
 arch/x86/kernel/traps.c                   |  4 ++--
 arch/x86/kernel/umip.c                    |  2 +-
 arch/x86/mm/fault.c                       |  7 +++---
 arch/xtensa/kernel/traps.c                |  2 +-
 arch/xtensa/mm/fault.c                    |  4 ++--
 include/linux/sched/signal.h              |  3 +--
 kernel/signal.c                           |  5 ++--
 67 files changed, 137 insertions(+), 151 deletions(-)

(limited to 'include/linux')

diff --git a/arch/alpha/kernel/traps.c b/arch/alpha/kernel/traps.c
index bc9627698796..f6b9664ac504 100644
--- a/arch/alpha/kernel/traps.c
+++ b/arch/alpha/kernel/traps.c
@@ -402,7 +402,7 @@ do_entDbg(struct pt_regs *regs)
 {
 	die_if_kernel("Instruction fault", regs, 0, NULL);
 
-	force_sig_fault(SIGILL, ILL_ILLOPC, (void __user *)regs->pc, 0, current);
+	force_sig_fault(SIGILL, ILL_ILLOPC, (void __user *)regs->pc, 0);
 }
 
 
diff --git a/arch/alpha/mm/fault.c b/arch/alpha/mm/fault.c
index 188fc9256baf..741e61ef9d3f 100644
--- a/arch/alpha/mm/fault.c
+++ b/arch/alpha/mm/fault.c
@@ -221,13 +221,13 @@ retry:
 	up_read(&mm->mmap_sem);
 	/* Send a sigbus, regardless of whether we were in kernel
 	   or user mode.  */
-	force_sig_fault(SIGBUS, BUS_ADRERR, (void __user *) address, 0, current);
+	force_sig_fault(SIGBUS, BUS_ADRERR, (void __user *) address, 0);
 	if (!user_mode(regs))
 		goto no_context;
 	return;
 
  do_sigsegv:
-	force_sig_fault(SIGSEGV, si_code, (void __user *) address, 0, current);
+	force_sig_fault(SIGSEGV, si_code, (void __user *) address, 0);
 	return;
 
 #ifdef CONFIG_ALPHA_LARGE_VMALLOC
diff --git a/arch/arc/kernel/traps.c b/arch/arc/kernel/traps.c
index e618fbb3e28d..fc56efc25488 100644
--- a/arch/arc/kernel/traps.c
+++ b/arch/arc/kernel/traps.c
@@ -50,7 +50,7 @@ unhandled_exception(const char *str, struct pt_regs *regs,
 
 		tsk->thread.fault_address = (__force unsigned int)addr;
 
-		force_sig_fault(signo, si_code, addr, current);
+		force_sig_fault(signo, si_code, addr);
 
 	} else {
 		/* If not due to copy_(to|from)_user, we are doomed */
diff --git a/arch/arc/mm/fault.c b/arch/arc/mm/fault.c
index d5d4758d7e75..5001f6418e92 100644
--- a/arch/arc/mm/fault.c
+++ b/arch/arc/mm/fault.c
@@ -202,7 +202,7 @@ bad_area_nosemaphore:
 	/* User mode accesses just cause a SIGSEGV */
 	if (user_mode(regs)) {
 		tsk->thread.fault_address = address;
-		force_sig_fault(SIGSEGV, si_code, (void __user *)address, current);
+		force_sig_fault(SIGSEGV, si_code, (void __user *)address);
 		return;
 	}
 
@@ -237,5 +237,5 @@ do_sigbus:
 		goto no_context;
 
 	tsk->thread.fault_address = address;
-	force_sig_fault(SIGBUS, BUS_ADRERR, (void __user *)address, current);
+	force_sig_fault(SIGBUS, BUS_ADRERR, (void __user *)address);
 }
diff --git a/arch/arm/kernel/ptrace.c b/arch/arm/kernel/ptrace.c
index f9cbd08a9075..1512d6b5e1cf 100644
--- a/arch/arm/kernel/ptrace.c
+++ b/arch/arm/kernel/ptrace.c
@@ -204,7 +204,7 @@ void ptrace_disable(struct task_struct *child)
 void ptrace_break(struct pt_regs *regs)
 {
 	force_sig_fault(SIGTRAP, TRAP_BRKPT,
-			(void __user *)instruction_pointer(regs), current);
+			(void __user *)instruction_pointer(regs));
 }
 
 static int break_trap(struct pt_regs *regs, unsigned int instr)
diff --git a/arch/arm/kernel/traps.c b/arch/arm/kernel/traps.c
index 288989c7355d..a32342fa3e4a 100644
--- a/arch/arm/kernel/traps.c
+++ b/arch/arm/kernel/traps.c
@@ -372,7 +372,7 @@ void arm_notify_die(const char *str, struct pt_regs *regs,
 		current->thread.error_code = err;
 		current->thread.trap_no = trap;
 
-		force_sig_fault(signo, si_code, addr, current);
+		force_sig_fault(signo, si_code, addr);
 	} else {
 		die(str, regs, err);
 	}
diff --git a/arch/arm/mm/alignment.c b/arch/arm/mm/alignment.c
index e376883ab35b..a6fffd788c9c 100644
--- a/arch/arm/mm/alignment.c
+++ b/arch/arm/mm/alignment.c
@@ -948,7 +948,7 @@ do_alignment(unsigned long addr, unsigned int fsr, struct pt_regs *regs)
 		goto fixup;
 
 	if (ai_usermode & UM_SIGNAL) {
-		force_sig_fault(SIGBUS, BUS_ADRALN, (void __user *)addr, current);
+		force_sig_fault(SIGBUS, BUS_ADRALN, (void __user *)addr);
 	} else {
 		/*
 		 * We're about to disable the alignment trap and return to
diff --git a/arch/arm/mm/fault.c b/arch/arm/mm/fault.c
index 03007ea4cc72..49e8ec2e9e7b 100644
--- a/arch/arm/mm/fault.c
+++ b/arch/arm/mm/fault.c
@@ -184,7 +184,7 @@ __do_user_fault(unsigned long addr, unsigned int fsr, unsigned int sig,
 	tsk->thread.address = addr;
 	tsk->thread.error_code = fsr;
 	tsk->thread.trap_no = 14;
-	force_sig_fault(sig, code, (void __user *)addr, current);
+	force_sig_fault(sig, code, (void __user *)addr);
 }
 
 void do_bad_area(unsigned long addr, unsigned int fsr, struct pt_regs *regs)
diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c
index c76a64c1bcb3..a490a4a32e77 100644
--- a/arch/arm64/kernel/traps.c
+++ b/arch/arm64/kernel/traps.c
@@ -259,7 +259,7 @@ void arm64_force_sig_fault(int signo, int code, void __user *addr,
 	if (signo == SIGKILL)
 		force_sig(SIGKILL);
 	else
-		force_sig_fault(signo, code, addr, current);
+		force_sig_fault(signo, code, addr);
 }
 
 void arm64_force_sig_mceerr(int code, void __user *addr, short lsb,
diff --git a/arch/c6x/kernel/traps.c b/arch/c6x/kernel/traps.c
index 5c60aea3b75a..ca54d1dd2aee 100644
--- a/arch/c6x/kernel/traps.c
+++ b/arch/c6x/kernel/traps.c
@@ -253,7 +253,7 @@ static void do_trap(struct exception_info *except_info, struct pt_regs *regs)
 	die_if_kernel(except_info->kernel_str, regs, addr);
 
 	force_sig_fault(except_info->signo, except_info->code,
-			(void __user *)addr, current);
+			(void __user *)addr);
 }
 
 /*
diff --git a/arch/csky/abiv1/alignment.c b/arch/csky/abiv1/alignment.c
index d789be36eb4f..27ef5b2c43ab 100644
--- a/arch/csky/abiv1/alignment.c
+++ b/arch/csky/abiv1/alignment.c
@@ -283,7 +283,7 @@ bad_area:
 		do_exit(SIGKILL);
 	}
 
-	force_sig_fault(SIGBUS, BUS_ADRALN, (void __user *)addr, current);
+	force_sig_fault(SIGBUS, BUS_ADRALN, (void __user *)addr);
 }
 
 static struct ctl_table alignment_tbl[4] = {
diff --git a/arch/csky/abiv2/fpu.c b/arch/csky/abiv2/fpu.c
index e7e11344005a..86d187d4e5af 100644
--- a/arch/csky/abiv2/fpu.c
+++ b/arch/csky/abiv2/fpu.c
@@ -124,7 +124,7 @@ void fpu_fpe(struct pt_regs *regs)
 			code = FPE_FLTRES;
 	}
 
-	force_sig_fault(sig, code, (void __user *)regs->pc, current);
+	force_sig_fault(sig, code, (void __user *)regs->pc);
 }
 
 #define FMFVR_FPU_REGS(vrx, vry)	\
diff --git a/arch/csky/kernel/traps.c b/arch/csky/kernel/traps.c
index f487a9b996ae..2792e9601ac5 100644
--- a/arch/csky/kernel/traps.c
+++ b/arch/csky/kernel/traps.c
@@ -106,7 +106,7 @@ void buserr(struct pt_regs *regs)
 	pr_err("User mode Bus Error\n");
 	show_regs(regs);
 
-	force_sig_fault(SIGSEGV, 0, (void __user *)regs->pc, current);
+	force_sig_fault(SIGSEGV, 0, (void __user *)regs->pc);
 }
 
 #define USR_BKPT 0x1464
diff --git a/arch/csky/mm/fault.c b/arch/csky/mm/fault.c
index 18041f46ded1..f76618b630f9 100644
--- a/arch/csky/mm/fault.c
+++ b/arch/csky/mm/fault.c
@@ -179,7 +179,7 @@ bad_area:
 bad_area_nosemaphore:
 	/* User mode accesses just cause a SIGSEGV */
 	if (user_mode(regs)) {
-		force_sig_fault(SIGSEGV, si_code, (void __user *)address, current);
+		force_sig_fault(SIGSEGV, si_code, (void __user *)address);
 		return;
 	}
 
@@ -212,5 +212,5 @@ do_sigbus:
 	if (!user_mode(regs))
 		goto no_context;
 
-	force_sig_fault(SIGBUS, BUS_ADRERR, (void __user *)address, current);
+	force_sig_fault(SIGBUS, BUS_ADRERR, (void __user *)address);
 }
diff --git a/arch/hexagon/kernel/traps.c b/arch/hexagon/kernel/traps.c
index e634414361df..b8a69b2e3f3d 100644
--- a/arch/hexagon/kernel/traps.c
+++ b/arch/hexagon/kernel/traps.c
@@ -420,7 +420,7 @@ void do_trap0(struct pt_regs *regs)
 			 * may want to use a different trap0 flavor.
 			 */
 			force_sig_fault(SIGTRAP, TRAP_BRKPT,
-					(void __user *) pt_elr(regs), current);
+					(void __user *) pt_elr(regs));
 		} else {
 #ifdef CONFIG_KGDB
 			kgdb_handle_exception(pt_cause(regs), SIGTRAP,
diff --git a/arch/hexagon/mm/vm_fault.c b/arch/hexagon/mm/vm_fault.c
index eb263e61daf4..2b3e22509cdf 100644
--- a/arch/hexagon/mm/vm_fault.c
+++ b/arch/hexagon/mm/vm_fault.c
@@ -148,14 +148,14 @@ good_area:
 		si_signo = SIGSEGV;
 		si_code  = SEGV_ACCERR;
 	}
-	force_sig_fault(si_signo, si_code, (void __user *)address, current);
+	force_sig_fault(si_signo, si_code, (void __user *)address);
 	return;
 
 bad_area:
 	up_read(&mm->mmap_sem);
 
 	if (user_mode(regs)) {
-		force_sig_fault(SIGSEGV, si_code, (void __user *)address, current);
+		force_sig_fault(SIGSEGV, si_code, (void __user *)address);
 		return;
 	}
 	/* Kernel-mode fault falls through */
diff --git a/arch/ia64/kernel/brl_emu.c b/arch/ia64/kernel/brl_emu.c
index c0239bf77a09..782c481d7052 100644
--- a/arch/ia64/kernel/brl_emu.c
+++ b/arch/ia64/kernel/brl_emu.c
@@ -197,21 +197,21 @@ ia64_emulate_brl (struct pt_regs *regs, unsigned long ar_ec)
 		 */
 		printk(KERN_DEBUG "Woah! Unimplemented Instruction Address Trap!\n");
 		force_sig_fault(SIGILL, ILL_BADIADDR, (void __user *)NULL,
-				0, 0, 0, current);
+				0, 0, 0);
 	} else if (ia64_psr(regs)->tb) {
 		/*
 		 *  Branch Tracing is enabled.
 		 *  Force a taken branch signal.
 		 */
 		force_sig_fault(SIGTRAP, TRAP_BRANCH, (void __user *)NULL,
-				0, 0, 0, current);
+				0, 0, 0);
 	} else if (ia64_psr(regs)->ss) {
 		/*
 		 *  Single Step is enabled.
 		 *  Force a trace signal.
 		 */
 		force_sig_fault(SIGTRAP, TRAP_TRACE, (void __user *)NULL,
-				0, 0, 0, current);
+				0, 0, 0);
 	}
 	return rv;
 }
diff --git a/arch/ia64/kernel/traps.c b/arch/ia64/kernel/traps.c
index 0a3adbfebc2a..e13cb905930f 100644
--- a/arch/ia64/kernel/traps.c
+++ b/arch/ia64/kernel/traps.c
@@ -176,7 +176,7 @@ __kprobes ia64_bad_break (unsigned long break_num, struct pt_regs *regs)
 	}
 	force_sig_fault(sig, code,
 			(void __user *) (regs->cr_iip + ia64_psr(regs)->ri),
-			break_num, 0 /* clear __ISR_VALID */, 0, current);
+			break_num, 0 /* clear __ISR_VALID */, 0);
 }
 
 /*
@@ -353,7 +353,7 @@ handle_fpu_swa (int fp_fault, struct pt_regs *regs, unsigned long isr)
 			}
 			force_sig_fault(SIGFPE, si_code,
 					(void __user *) (regs->cr_iip + ia64_psr(regs)->ri),
-					0, __ISR_VALID, isr, current);
+					0, __ISR_VALID, isr);
 		}
 	} else {
 		if (exception == -1) {
@@ -373,7 +373,7 @@ handle_fpu_swa (int fp_fault, struct pt_regs *regs, unsigned long isr)
 			}
 			force_sig_fault(SIGFPE, si_code,
 					(void __user *) (regs->cr_iip + ia64_psr(regs)->ri),
-					0, __ISR_VALID, isr, current);
+					0, __ISR_VALID, isr);
 		}
 	}
 	return 0;
@@ -408,7 +408,7 @@ ia64_illegal_op_fault (unsigned long ec, long arg1, long arg2, long arg3,
 
 	force_sig_fault(SIGILL, ILL_ILLOPC,
 			(void __user *) (regs.cr_iip + ia64_psr(&regs)->ri),
-			0, 0, 0, current);
+			0, 0, 0);
 	return rv;
 }
 
@@ -483,7 +483,7 @@ ia64_fault (unsigned long vector, unsigned long isr, unsigned long ifa,
 							+ ia64_psr(&regs)->ri);
 			}
 			force_sig_fault(sig, code, addr,
-					vector, __ISR_VALID, isr, current);
+					vector, __ISR_VALID, isr);
 			return;
 		} else if (ia64_done_with_exception(&regs))
 			return;
@@ -493,7 +493,7 @@ ia64_fault (unsigned long vector, unsigned long isr, unsigned long ifa,
 	      case 31: /* Unsupported Data Reference */
 		if (user_mode(&regs)) {
 			force_sig_fault(SIGILL, ILL_ILLOPN, (void __user *) iip,
-					vector, __ISR_VALID, isr, current);
+					vector, __ISR_VALID, isr);
 			return;
 		}
 		sprintf(buf, "Unsupported data reference");
@@ -542,7 +542,7 @@ ia64_fault (unsigned long vector, unsigned long isr, unsigned long ifa,
 			       	== NOTIFY_STOP)
 			return;
 		force_sig_fault(SIGTRAP, si_code, (void __user *) ifa,
-				0, __ISR_VALID, isr, current);
+				0, __ISR_VALID, isr);
 		return;
 
 	      case 32: /* fp fault */
@@ -550,7 +550,7 @@ ia64_fault (unsigned long vector, unsigned long isr, unsigned long ifa,
 		result = handle_fpu_swa((vector == 32) ? 1 : 0, &regs, isr);
 		if ((result < 0) || (current->thread.flags & IA64_THREAD_FPEMU_SIGFPE)) {
 			force_sig_fault(SIGFPE, FPE_FLTINV, (void __user *) iip,
-					0, __ISR_VALID, isr, current);
+					0, __ISR_VALID, isr);
 		}
 		return;
 
@@ -578,7 +578,7 @@ ia64_fault (unsigned long vector, unsigned long isr, unsigned long ifa,
 			if (user_mode(&regs)) {
 				force_sig_fault(SIGILL, ILL_BADIADDR,
 						(void __user *) iip,
-						0, 0, 0, current);
+						0, 0, 0);
 				return;
 			}
 			sprintf(buf, "Unimplemented Instruction Address fault");
diff --git a/arch/ia64/kernel/unaligned.c b/arch/ia64/kernel/unaligned.c
index a167a3824b35..eb7d5df59fa3 100644
--- a/arch/ia64/kernel/unaligned.c
+++ b/arch/ia64/kernel/unaligned.c
@@ -1537,6 +1537,6 @@ ia64_handle_unaligned (unsigned long ifa, struct pt_regs *regs)
 	}
   force_sigbus:
 	force_sig_fault(SIGBUS, BUS_ADRALN, (void __user *) ifa,
-			0, 0, 0, current);
+			0, 0, 0);
 	goto done;
 }
diff --git a/arch/ia64/mm/fault.c b/arch/ia64/mm/fault.c
index 5baeb022f474..3c3a283d3172 100644
--- a/arch/ia64/mm/fault.c
+++ b/arch/ia64/mm/fault.c
@@ -249,7 +249,7 @@ retry:
 	}
 	if (user_mode(regs)) {
 		force_sig_fault(signal, code, (void __user *) address,
-				0, __ISR_VALID, isr, current);
+				0, __ISR_VALID, isr);
 		return;
 	}
 
diff --git a/arch/m68k/kernel/traps.c b/arch/m68k/kernel/traps.c
index 2b6e143abd73..344f93d36a9a 100644
--- a/arch/m68k/kernel/traps.c
+++ b/arch/m68k/kernel/traps.c
@@ -1127,7 +1127,7 @@ asmlinkage void trap_c(struct frame *fp)
 		addr = (void __user*) fp->un.fmtb.daddr;
 		break;
 	}
-	force_sig_fault(sig, si_code, addr, current);
+	force_sig_fault(sig, si_code, addr);
 }
 
 void die_if_kernel (char *str, struct pt_regs *fp, int nr)
@@ -1159,6 +1159,6 @@ asmlinkage void fpsp040_die(void)
 #ifdef CONFIG_M68KFPU_EMU
 asmlinkage void fpemu_signal(int signal, int code, void *addr)
 {
-	force_sig_fault(signal, code, addr, current);
+	force_sig_fault(signal, code, addr);
 }
 #endif
diff --git a/arch/m68k/mm/fault.c b/arch/m68k/mm/fault.c
index 9b6163c05a75..e9b1d7585b43 100644
--- a/arch/m68k/mm/fault.c
+++ b/arch/m68k/mm/fault.c
@@ -30,13 +30,13 @@ int send_fault_sig(struct pt_regs *regs)
 	pr_debug("send_fault_sig: %p,%d,%d\n", addr, signo, si_code);
 
 	if (user_mode(regs)) {
-		force_sig_fault(signo, si_code, addr, current);
+		force_sig_fault(signo, si_code, addr);
 	} else {
 		if (fixup_exception(regs))
 			return -1;
 
 		//if (signo == SIGBUS)
-		//	force_sig_fault(si_signo, si_code, addr, current);
+		//	force_sig_fault(si_signo, si_code, addr);
 
 		/*
 		 * Oops. The kernel tried to access some bad page. We'll have to
diff --git a/arch/microblaze/kernel/exceptions.c b/arch/microblaze/kernel/exceptions.c
index eafff21fcb0e..cf99c411503e 100644
--- a/arch/microblaze/kernel/exceptions.c
+++ b/arch/microblaze/kernel/exceptions.c
@@ -63,7 +63,7 @@ void _exception(int signr, struct pt_regs *regs, int code, unsigned long addr)
 	if (kernel_mode(regs))
 		die("Exception in kernel mode", regs, signr);
 
-	force_sig_fault(signr, code, (void __user *)addr, current);
+	force_sig_fault(signr, code, (void __user *)addr);
 }
 
 asmlinkage void full_exception(struct pt_regs *regs, unsigned int type,
diff --git a/arch/microblaze/mm/fault.c b/arch/microblaze/mm/fault.c
index 202ad6a494f5..e6a810b0c7ad 100644
--- a/arch/microblaze/mm/fault.c
+++ b/arch/microblaze/mm/fault.c
@@ -289,7 +289,7 @@ out_of_memory:
 do_sigbus:
 	up_read(&mm->mmap_sem);
 	if (user_mode(regs)) {
-		force_sig_fault(SIGBUS, BUS_ADRERR, (void __user *)address, current);
+		force_sig_fault(SIGBUS, BUS_ADRERR, (void __user *)address);
 		return;
 	}
 	bad_page_fault(regs, address, SIGBUS);
diff --git a/arch/mips/kernel/traps.c b/arch/mips/kernel/traps.c
index 62df48b6fb46..be4a7b25269c 100644
--- a/arch/mips/kernel/traps.c
+++ b/arch/mips/kernel/traps.c
@@ -705,7 +705,7 @@ asmlinkage void do_ov(struct pt_regs *regs)
 	prev_state = exception_enter();
 	die_if_kernel("Integer overflow", regs);
 
-	force_sig_fault(SIGFPE, FPE_INTOVF, (void __user *)regs->cp0_epc, current);
+	force_sig_fault(SIGFPE, FPE_INTOVF, (void __user *)regs->cp0_epc);
 	exception_exit(prev_state);
 }
 
@@ -750,7 +750,7 @@ int process_fpemu_return(int sig, void __user *fault_addr, unsigned long fcr31)
 		return 1;
 
 	case SIGBUS:
-		force_sig_fault(SIGBUS, BUS_ADRERR, fault_addr, current);
+		force_sig_fault(SIGBUS, BUS_ADRERR, fault_addr);
 		return 1;
 
 	case SIGSEGV:
@@ -761,7 +761,7 @@ int process_fpemu_return(int sig, void __user *fault_addr, unsigned long fcr31)
 		else
 			si_code = SEGV_MAPERR;
 		up_read(&current->mm->mmap_sem);
-		force_sig_fault(SIGSEGV, si_code, fault_addr, current);
+		force_sig_fault(SIGSEGV, si_code, fault_addr);
 		return 1;
 
 	default:
@@ -943,7 +943,7 @@ void do_trap_or_bp(struct pt_regs *regs, unsigned int code, int si_code,
 		die_if_kernel(b, regs);
 		force_sig_fault(SIGFPE,
 				code == BRK_DIVZERO ? FPE_INTDIV : FPE_INTOVF,
-				(void __user *) regs->cp0_epc, current);
+				(void __user *) regs->cp0_epc);
 		break;
 	case BRK_BUG:
 		die_if_kernel("Kernel bug detected", regs);
@@ -968,7 +968,7 @@ void do_trap_or_bp(struct pt_regs *regs, unsigned int code, int si_code,
 		scnprintf(b, sizeof(b), "%s instruction in kernel code", str);
 		die_if_kernel(b, regs);
 		if (si_code) {
-			force_sig_fault(SIGTRAP, si_code, NULL,	current);
+			force_sig_fault(SIGTRAP, si_code, NULL);
 		} else {
 			force_sig(SIGTRAP);
 		}
@@ -1521,7 +1521,7 @@ asmlinkage void do_watch(struct pt_regs *regs)
 	if (test_tsk_thread_flag(current, TIF_LOAD_WATCH)) {
 		mips_read_watch_registers();
 		local_irq_enable();
-		force_sig_fault(SIGTRAP, TRAP_HWBKPT, NULL, current);
+		force_sig_fault(SIGTRAP, TRAP_HWBKPT, NULL);
 	} else {
 		mips_clear_watch_registers();
 		local_irq_enable();
diff --git a/arch/mips/mm/fault.c b/arch/mips/mm/fault.c
index e63abd492f65..f589aa8f47d9 100644
--- a/arch/mips/mm/fault.c
+++ b/arch/mips/mm/fault.c
@@ -223,7 +223,7 @@ bad_area_nosemaphore:
 			pr_cont("\n");
 		}
 		current->thread.trap_nr = (regs->cp0_cause >> 2) & 0x1f;
-		force_sig_fault(SIGSEGV, si_code, (void __user *)address, current);
+		force_sig_fault(SIGSEGV, si_code, (void __user *)address);
 		return;
 	}
 
@@ -279,7 +279,7 @@ do_sigbus:
 #endif
 	current->thread.trap_nr = (regs->cp0_cause >> 2) & 0x1f;
 	tsk->thread.cp0_badvaddr = address;
-	force_sig_fault(SIGBUS, BUS_ADRERR, (void __user *)address, current);
+	force_sig_fault(SIGBUS, BUS_ADRERR, (void __user *)address);
 
 	return;
 #ifndef CONFIG_64BIT
diff --git a/arch/nds32/kernel/fpu.c b/arch/nds32/kernel/fpu.c
index fddd40c7a16f..1f8694c6bd5a 100644
--- a/arch/nds32/kernel/fpu.c
+++ b/arch/nds32/kernel/fpu.c
@@ -246,7 +246,7 @@ inline void handle_fpu_exception(struct pt_regs *regs)
 	}
 
 	force_sig_fault(si_signo, si_code,
-			(void __user *)instruction_pointer(regs), current);
+			(void __user *)instruction_pointer(regs));
 done:
 	own_fpu();
 }
diff --git a/arch/nds32/kernel/traps.c b/arch/nds32/kernel/traps.c
index a16e97f7bc75..f4d386b52622 100644
--- a/arch/nds32/kernel/traps.c
+++ b/arch/nds32/kernel/traps.c
@@ -205,7 +205,7 @@ int bad_syscall(int n, struct pt_regs *regs)
 	}
 
 	force_sig_fault(SIGILL, ILL_ILLTRP,
-			(void __user *)instruction_pointer(regs) - 4, current);
+			(void __user *)instruction_pointer(regs) - 4);
 	die_if_kernel("Oops - bad syscall", regs, n);
 	return regs->uregs[0];
 }
@@ -263,7 +263,7 @@ static void send_sigtrap(struct pt_regs *regs, int error_code, int si_code)
 	tsk->thread.error_code = error_code;
 
 	force_sig_fault(SIGTRAP, si_code,
-			(void __user *)instruction_pointer(regs), current);
+			(void __user *)instruction_pointer(regs));
 }
 
 void do_debug_trap(unsigned long entry, unsigned long addr,
diff --git a/arch/nds32/mm/fault.c b/arch/nds32/mm/fault.c
index 38441113c202..064ae5d2159d 100644
--- a/arch/nds32/mm/fault.c
+++ b/arch/nds32/mm/fault.c
@@ -271,7 +271,7 @@ bad_area_nosemaphore:
 		tsk->thread.address = addr;
 		tsk->thread.error_code = error_code;
 		tsk->thread.trap_no = entry;
-		force_sig_fault(SIGSEGV, si_code, (void __user *)addr, current);
+		force_sig_fault(SIGSEGV, si_code, (void __user *)addr);
 		return;
 	}
 
@@ -340,7 +340,7 @@ do_sigbus:
 	tsk->thread.address = addr;
 	tsk->thread.error_code = error_code;
 	tsk->thread.trap_no = entry;
-	force_sig_fault(SIGBUS, BUS_ADRERR, (void __user *)addr, current);
+	force_sig_fault(SIGBUS, BUS_ADRERR, (void __user *)addr);
 
 	return;
 
diff --git a/arch/nios2/kernel/traps.c b/arch/nios2/kernel/traps.c
index 3bc3cd22b750..486db793923c 100644
--- a/arch/nios2/kernel/traps.c
+++ b/arch/nios2/kernel/traps.c
@@ -26,7 +26,7 @@ static DEFINE_SPINLOCK(die_lock);
 
 static void _send_sig(int signo, int code, unsigned long addr)
 {
-	force_sig_fault(signo, code, (void __user *) addr, current);
+	force_sig_fault(signo, code, (void __user *) addr);
 }
 
 void die(const char *str, struct pt_regs *regs, long err)
diff --git a/arch/openrisc/kernel/traps.c b/arch/openrisc/kernel/traps.c
index 0fad2e46ff43..a4cc6e59c57f 100644
--- a/arch/openrisc/kernel/traps.c
+++ b/arch/openrisc/kernel/traps.c
@@ -249,7 +249,7 @@ void __init trap_init(void)
 
 asmlinkage void do_trap(struct pt_regs *regs, unsigned long address)
 {
-	force_sig_fault(SIGTRAP, TRAP_TRACE, (void __user *)address, current);
+	force_sig_fault(SIGTRAP, TRAP_TRACE, (void __user *)address);
 
 	regs->pc += 4;
 }
@@ -258,7 +258,7 @@ asmlinkage void do_unaligned_access(struct pt_regs *regs, unsigned long address)
 {
 	if (user_mode(regs)) {
 		/* Send a SIGBUS */
-		force_sig_fault(SIGBUS, BUS_ADRALN, (void __user *)address, current);
+		force_sig_fault(SIGBUS, BUS_ADRALN, (void __user *)address);
 	} else {
 		printk("KERNEL: Unaligned Access 0x%.8lx\n", address);
 		show_registers(regs);
@@ -271,7 +271,7 @@ asmlinkage void do_bus_fault(struct pt_regs *regs, unsigned long address)
 {
 	if (user_mode(regs)) {
 		/* Send a SIGBUS */
-		force_sig_fault(SIGBUS, BUS_ADRERR, (void __user *)address, current);
+		force_sig_fault(SIGBUS, BUS_ADRERR, (void __user *)address);
 	} else {		/* Kernel mode */
 		printk("KERNEL: Bus error (SIGBUS) 0x%.8lx\n", address);
 		show_registers(regs);
@@ -466,7 +466,7 @@ asmlinkage void do_illegal_instruction(struct pt_regs *regs,
 
 	if (user_mode(regs)) {
 		/* Send a SIGILL */
-		force_sig_fault(SIGILL, ILL_ILLOPC, (void __user *)address, current);
+		force_sig_fault(SIGILL, ILL_ILLOPC, (void __user *)address);
 	} else {		/* Kernel mode */
 		printk("KERNEL: Illegal instruction (SIGILL) 0x%.8lx\n",
 		       address);
diff --git a/arch/openrisc/mm/fault.c b/arch/openrisc/mm/fault.c
index f8b3a5a6ba3a..ae9468c22c9d 100644
--- a/arch/openrisc/mm/fault.c
+++ b/arch/openrisc/mm/fault.c
@@ -213,7 +213,7 @@ bad_area_nosemaphore:
 	/* User mode accesses just cause a SIGSEGV */
 
 	if (user_mode(regs)) {
-		force_sig_fault(SIGSEGV, si_code, (void __user *)address, current);
+		force_sig_fault(SIGSEGV, si_code, (void __user *)address);
 		return;
 	}
 
@@ -278,7 +278,7 @@ do_sigbus:
 	 * Send a sigbus, regardless of whether we were in kernel
 	 * or user mode.
 	 */
-	force_sig_fault(SIGBUS, BUS_ADRERR, (void __user *)address, current);
+	force_sig_fault(SIGBUS, BUS_ADRERR, (void __user *)address);
 
 	/* Kernel mode? Handle exceptions or die */
 	if (!user_mode(regs))
diff --git a/arch/parisc/kernel/traps.c b/arch/parisc/kernel/traps.c
index 096e319adeb3..58dcf445e32f 100644
--- a/arch/parisc/kernel/traps.c
+++ b/arch/parisc/kernel/traps.c
@@ -275,7 +275,7 @@ void die_if_kernel(char *str, struct pt_regs *regs, long err)
 static void handle_gdb_break(struct pt_regs *regs, int wot)
 {
 	force_sig_fault(SIGTRAP, wot,
-			(void __user *) (regs->iaoq[0] & ~3), current);
+			(void __user *) (regs->iaoq[0] & ~3));
 }
 
 static void handle_break(struct pt_regs *regs)
@@ -609,13 +609,13 @@ void notrace handle_interruption(int code, struct pt_regs *regs)
 		si_code = ILL_PRVREG;
 	give_sigill:
 		force_sig_fault(SIGILL, si_code,
-				(void __user *) regs->iaoq[0], current);
+				(void __user *) regs->iaoq[0]);
 		return;
 
 	case 12:
 		/* Overflow Trap, let the userland signal handler do the cleanup */
 		force_sig_fault(SIGFPE, FPE_INTOVF,
-				(void __user *) regs->iaoq[0], current);
+				(void __user *) regs->iaoq[0]);
 		return;
 		
 	case 13:
@@ -627,7 +627,7 @@ void notrace handle_interruption(int code, struct pt_regs *regs)
 			 * to by si_addr.
 			 */
 			force_sig_fault(SIGFPE, FPE_CONDTRAP,
-					(void __user *) regs->iaoq[0], current);
+					(void __user *) regs->iaoq[0]);
 			return;
 		} 
 		/* The kernel doesn't want to handle condition codes */
@@ -739,7 +739,7 @@ void notrace handle_interruption(int code, struct pt_regs *regs)
 		force_sig_fault(SIGSEGV, SEGV_MAPERR,
 				(code == 7)?
 				((void __user *) regs->iaoq[0]) :
-				((void __user *) regs->ior), current);
+				((void __user *) regs->ior));
 		return;
 
 	case 28: 
@@ -754,7 +754,7 @@ void notrace handle_interruption(int code, struct pt_regs *regs)
 				task_pid_nr(current), current->comm);
 			/* SIGBUS, for lack of a better one. */
 			force_sig_fault(SIGBUS, BUS_OBJERR,
-					(void __user *)regs->ior, current);
+					(void __user *)regs->ior);
 			return;
 		}
 		pdc_chassis_send_status(PDC_CHASSIS_DIRECT_PANIC);
@@ -770,7 +770,7 @@ void notrace handle_interruption(int code, struct pt_regs *regs)
 				code, fault_space,
 				task_pid_nr(current), current->comm);
 		force_sig_fault(SIGSEGV, SEGV_MAPERR,
-				(void __user *)regs->ior, current);
+				(void __user *)regs->ior);
 		return;
 	    }
 	}
diff --git a/arch/parisc/kernel/unaligned.c b/arch/parisc/kernel/unaligned.c
index 932bfc0b7cd8..3ccc3a69469c 100644
--- a/arch/parisc/kernel/unaligned.c
+++ b/arch/parisc/kernel/unaligned.c
@@ -690,14 +690,14 @@ void handle_unaligned(struct pt_regs *regs)
 		if (ret == ERR_PAGEFAULT)
 		{
 			force_sig_fault(SIGSEGV, SEGV_MAPERR,
-					(void __user *)regs->ior, current);
+					(void __user *)regs->ior);
 		}
 		else
 		{
 force_sigbus:
 			/* couldn't handle it ... */
 			force_sig_fault(SIGBUS, BUS_ADRALN,
-					(void __user *)regs->ior, current);
+					(void __user *)regs->ior);
 		}
 		
 		return;
diff --git a/arch/parisc/math-emu/driver.c b/arch/parisc/math-emu/driver.c
index 0590e05571d1..f3e0bddcbb38 100644
--- a/arch/parisc/math-emu/driver.c
+++ b/arch/parisc/math-emu/driver.c
@@ -117,7 +117,7 @@ handle_fpe(struct pt_regs *regs)
 	memcpy(regs->fr, frcopy, sizeof regs->fr);
 	if (signalcode != 0) {
 	    force_sig_fault(signalcode >> 24, signalcode & 0xffffff,
-			    (void __user *) regs->iaoq[0], current);
+			    (void __user *) regs->iaoq[0]);
 	    return -1;
 	}
 
diff --git a/arch/parisc/mm/fault.c b/arch/parisc/mm/fault.c
index 56ceacb3401d..6dd4669ce7a5 100644
--- a/arch/parisc/mm/fault.c
+++ b/arch/parisc/mm/fault.c
@@ -409,7 +409,7 @@ bad_area:
 #endif
 		show_signal_msg(regs, code, address, tsk, vma);
 
-		force_sig_fault(signo, si_code, (void __user *) address, current);
+		force_sig_fault(signo, si_code, (void __user *) address);
 		return;
 	}
 
diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index 87da40129927..1b5b1477afa2 100644
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -643,7 +643,7 @@ void do_break (struct pt_regs *regs, unsigned long address,
 	hw_breakpoint_disable();
 
 	/* Deliver the signal to userspace */
-	force_sig_fault(SIGTRAP, TRAP_HWBKPT, (void __user *)address, current);
+	force_sig_fault(SIGTRAP, TRAP_HWBKPT, (void __user *)address);
 }
 #endif	/* CONFIG_PPC_ADV_DEBUG_REGS */
 
diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c
index 83e59fdaa62d..dfc61f2f69a0 100644
--- a/arch/powerpc/kernel/traps.c
+++ b/arch/powerpc/kernel/traps.c
@@ -301,7 +301,7 @@ NOKPROBE_SYMBOL(die);
 
 void user_single_step_report(struct pt_regs *regs)
 {
-	force_sig_fault(SIGTRAP, TRAP_TRACE, (void __user *)regs->nip, current);
+	force_sig_fault(SIGTRAP, TRAP_TRACE, (void __user *)regs->nip);
 }
 
 static void show_signal_msg(int signr, struct pt_regs *regs, int code,
@@ -367,7 +367,7 @@ void _exception(int signr, struct pt_regs *regs, int code, unsigned long addr)
 	if (!exception_common(signr, regs, code, addr))
 		return;
 
-	force_sig_fault(signr, code, (void __user *)addr, current);
+	force_sig_fault(signr, code, (void __user *)addr);
 }
 
 /*
diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
index 6ed6c341c670..02c70fa535ef 100644
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c
@@ -187,7 +187,7 @@ static int do_sigbus(struct pt_regs *regs, unsigned long address,
 	}
 
 #endif
-	force_sig_fault(SIGBUS, BUS_ADRERR, (void __user *)address, current);
+	force_sig_fault(SIGBUS, BUS_ADRERR, (void __user *)address);
 	return 0;
 }
 
diff --git a/arch/powerpc/platforms/cell/spufs/fault.c b/arch/powerpc/platforms/cell/spufs/fault.c
index 971ac43b5d60..6634c0c5ed9e 100644
--- a/arch/powerpc/platforms/cell/spufs/fault.c
+++ b/arch/powerpc/platforms/cell/spufs/fault.c
@@ -44,22 +44,21 @@ static void spufs_handle_event(struct spu_context *ctx,
 
 	switch (type) {
 	case SPE_EVENT_INVALID_DMA:
-		force_sig_fault(SIGBUS, BUS_OBJERR, NULL, current);
+		force_sig_fault(SIGBUS, BUS_OBJERR, NULL);
 		break;
 	case SPE_EVENT_SPE_DATA_STORAGE:
 		ctx->ops->restart_dma(ctx);
-		force_sig_fault(SIGSEGV, SEGV_ACCERR, (void __user *)ea,
-				current);
+		force_sig_fault(SIGSEGV, SEGV_ACCERR, (void __user *)ea);
 		break;
 	case SPE_EVENT_DMA_ALIGNMENT:
 		/* DAR isn't set for an alignment fault :( */
-		force_sig_fault(SIGBUS, BUS_ADRALN, NULL, current);
+		force_sig_fault(SIGBUS, BUS_ADRALN, NULL);
 		break;
 	case SPE_EVENT_SPE_ERROR:
 		force_sig_fault(
 			SIGILL, ILL_ILLOPC,
 			(void __user *)(unsigned long)
-			ctx->ops->npc_read(ctx) - 4, current);
+			ctx->ops->npc_read(ctx) - 4);
 		break;
 	}
 }
diff --git a/arch/riscv/kernel/traps.c b/arch/riscv/kernel/traps.c
index 6d67892dfc82..859ab550d52a 100644
--- a/arch/riscv/kernel/traps.c
+++ b/arch/riscv/kernel/traps.c
@@ -76,7 +76,7 @@ void do_trap(struct pt_regs *regs, int signo, int code, unsigned long addr)
 		show_regs(regs);
 	}
 
-	force_sig_fault(signo, code, (void __user *)addr, current);
+	force_sig_fault(signo, code, (void __user *)addr);
 }
 
 static void do_trap_error(struct pt_regs *regs, int signo, int code,
@@ -149,7 +149,7 @@ asmlinkage void do_trap_break(struct pt_regs *regs)
 	}
 #endif /* CONFIG_GENERIC_BUG */
 
-	force_sig_fault(SIGTRAP, TRAP_BRKPT, (void __user *)(regs->sepc), current);
+	force_sig_fault(SIGTRAP, TRAP_BRKPT, (void __user *)(regs->sepc));
 }
 
 #ifdef CONFIG_GENERIC_BUG
diff --git a/arch/s390/kernel/traps.c b/arch/s390/kernel/traps.c
index 82e81a9f7112..ac44dbfc4a7e 100644
--- a/arch/s390/kernel/traps.c
+++ b/arch/s390/kernel/traps.c
@@ -45,7 +45,7 @@ int is_valid_bugaddr(unsigned long addr)
 void do_report_trap(struct pt_regs *regs, int si_signo, int si_code, char *str)
 {
 	if (user_mode(regs)) {
-		force_sig_fault(si_signo, si_code, get_trap_ip(regs), current);
+		force_sig_fault(si_signo, si_code, get_trap_ip(regs));
 		report_user_fault(regs, si_signo, 0);
         } else {
                 const struct exception_table_entry *fixup;
@@ -79,7 +79,7 @@ void do_per_trap(struct pt_regs *regs)
 	if (!current->ptrace)
 		return;
 	force_sig_fault(SIGTRAP, TRAP_HWBKPT,
-		(void __force __user *) current->thread.per_event.address, current);
+		(void __force __user *) current->thread.per_event.address);
 }
 NOKPROBE_SYMBOL(do_per_trap);
 
@@ -165,7 +165,7 @@ void illegal_op(struct pt_regs *regs)
 			return;
 		if (*((__u16 *) opcode) == S390_BREAKPOINT_U16) {
 			if (current->ptrace)
-				force_sig_fault(SIGTRAP, TRAP_BRKPT, location, current);
+				force_sig_fault(SIGTRAP, TRAP_BRKPT, location);
 			else
 				signal = SIGILL;
 #ifdef CONFIG_UPROBES
diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c
index c220399ae196..79afed544cac 100644
--- a/arch/s390/mm/fault.c
+++ b/arch/s390/mm/fault.c
@@ -243,8 +243,7 @@ static noinline void do_sigsegv(struct pt_regs *regs, int si_code)
 {
 	report_user_fault(regs, SIGSEGV, 1);
 	force_sig_fault(SIGSEGV, si_code,
-			(void __user *)(regs->int_parm_long & __FAIL_ADDR_MASK),
-			current);
+			(void __user *)(regs->int_parm_long & __FAIL_ADDR_MASK));
 }
 
 const struct exception_table_entry *s390_search_extables(unsigned long addr)
@@ -305,8 +304,7 @@ static noinline void do_sigbus(struct pt_regs *regs)
 	 * or user mode.
 	 */
 	force_sig_fault(SIGBUS, BUS_ADRERR,
-			(void __user *)(regs->int_parm_long & __FAIL_ADDR_MASK),
-			current);
+			(void __user *)(regs->int_parm_long & __FAIL_ADDR_MASK));
 }
 
 static noinline int signal_return(struct pt_regs *regs)
diff --git a/arch/sh/kernel/hw_breakpoint.c b/arch/sh/kernel/hw_breakpoint.c
index bc96b16288c1..3bd010b4c55f 100644
--- a/arch/sh/kernel/hw_breakpoint.c
+++ b/arch/sh/kernel/hw_breakpoint.c
@@ -338,7 +338,7 @@ static int __kprobes hw_breakpoint_handler(struct die_args *args)
 		/* Deliver the signal to userspace */
 		if (!arch_check_bp_in_kernelspace(&bp->hw.info)) {
 			force_sig_fault(SIGTRAP, TRAP_HWBKPT,
-					(void __user *)NULL, current);
+					(void __user *)NULL);
 		}
 
 		rcu_read_unlock();
diff --git a/arch/sh/kernel/traps_32.c b/arch/sh/kernel/traps_32.c
index bd5568c8e7f0..058c6181bb30 100644
--- a/arch/sh/kernel/traps_32.c
+++ b/arch/sh/kernel/traps_32.c
@@ -533,7 +533,7 @@ uspace_segv:
 		       "access (PC %lx PR %lx)\n", current->comm, regs->pc,
 		       regs->pr);
 
-		force_sig_fault(SIGBUS, si_code, (void __user *)address, current);
+		force_sig_fault(SIGBUS, si_code, (void __user *)address);
 	} else {
 		inc_unaligned_kernel_access();
 
@@ -603,7 +603,7 @@ asmlinkage void do_divide_error(unsigned long r4)
 		/* Let gcc know unhandled cases don't make it past here */
 		return;
 	}
-	force_sig_fault(SIGFPE, code, NULL, current);
+	force_sig_fault(SIGFPE, code, NULL);
 }
 #endif
 
diff --git a/arch/sh/math-emu/math.c b/arch/sh/math-emu/math.c
index fe261b0983cc..e8be0eca0444 100644
--- a/arch/sh/math-emu/math.c
+++ b/arch/sh/math-emu/math.c
@@ -560,7 +560,7 @@ static int ieee_fpe_handler(struct pt_regs *regs)
 			task_thread_info(tsk)->status |= TS_USEDFPU;
 		} else {
 			force_sig_fault(SIGFPE, FPE_FLTINV,
-					(void __user *)regs->pc, current);
+					(void __user *)regs->pc);
 		}
 
 		regs->pc = nextpc;
diff --git a/arch/sh/mm/fault.c b/arch/sh/mm/fault.c
index 851a3cbb2b9c..3093bc372138 100644
--- a/arch/sh/mm/fault.c
+++ b/arch/sh/mm/fault.c
@@ -41,7 +41,7 @@ static inline int notify_page_fault(struct pt_regs *regs, int trap)
 static void
 force_sig_info_fault(int si_signo, int si_code, unsigned long address)
 {
-	force_sig_fault(si_signo, si_code, (void __user *)address, current);
+	force_sig_fault(si_signo, si_code, (void __user *)address);
 }
 
 /*
diff --git a/arch/sparc/kernel/process_64.c b/arch/sparc/kernel/process_64.c
index c4bccd97f3cf..4282116e28e7 100644
--- a/arch/sparc/kernel/process_64.c
+++ b/arch/sparc/kernel/process_64.c
@@ -519,7 +519,7 @@ void synchronize_user_stack(void)
 
 static void stack_unaligned(unsigned long sp)
 {
-	force_sig_fault(SIGBUS, BUS_ADRALN, (void __user *) sp, 0, current);
+	force_sig_fault(SIGBUS, BUS_ADRALN, (void __user *) sp, 0);
 }
 
 static const char uwfault32[] = KERN_INFO \
diff --git a/arch/sparc/kernel/sys_sparc_32.c b/arch/sparc/kernel/sys_sparc_32.c
index 452e4d080855..be77538bc038 100644
--- a/arch/sparc/kernel/sys_sparc_32.c
+++ b/arch/sparc/kernel/sys_sparc_32.c
@@ -151,7 +151,7 @@ sparc_breakpoint (struct pt_regs *regs)
 #ifdef DEBUG_SPARC_BREAKPOINT
         printk ("TRAP: Entering kernel PC=%x, nPC=%x\n", regs->pc, regs->npc);
 #endif
-	force_sig_fault(SIGTRAP, TRAP_BRKPT, (void __user *)regs->pc, 0, current);
+	force_sig_fault(SIGTRAP, TRAP_BRKPT, (void __user *)regs->pc, 0);
 
 #ifdef DEBUG_SPARC_BREAKPOINT
 	printk ("TRAP: Returning to space: PC=%x nPC=%x\n", regs->pc, regs->npc);
diff --git a/arch/sparc/kernel/sys_sparc_64.c b/arch/sparc/kernel/sys_sparc_64.c
index 9825ca6a6020..ccc88926bc00 100644
--- a/arch/sparc/kernel/sys_sparc_64.c
+++ b/arch/sparc/kernel/sys_sparc_64.c
@@ -511,7 +511,7 @@ asmlinkage void sparc_breakpoint(struct pt_regs *regs)
 #ifdef DEBUG_SPARC_BREAKPOINT
         printk ("TRAP: Entering kernel PC=%lx, nPC=%lx\n", regs->tpc, regs->tnpc);
 #endif
-	force_sig_fault(SIGTRAP, TRAP_BRKPT, (void __user *)regs->tpc, 0, current);
+	force_sig_fault(SIGTRAP, TRAP_BRKPT, (void __user *)regs->tpc, 0);
 #ifdef DEBUG_SPARC_BREAKPOINT
 	printk ("TRAP: Returning to space: PC=%lx nPC=%lx\n", regs->tpc, regs->tnpc);
 #endif
diff --git a/arch/sparc/kernel/traps_32.c b/arch/sparc/kernel/traps_32.c
index bcdfc6168dd5..4ceecad556a9 100644
--- a/arch/sparc/kernel/traps_32.c
+++ b/arch/sparc/kernel/traps_32.c
@@ -103,7 +103,7 @@ void do_hw_interrupt(struct pt_regs *regs, unsigned long type)
 		die_if_kernel("Kernel bad trap", regs);
 
 	force_sig_fault(SIGILL, ILL_ILLTRP,
-			(void __user *)regs->pc, type - 0x80, current);
+			(void __user *)regs->pc, type - 0x80);
 }
 
 void do_illegal_instruction(struct pt_regs *regs, unsigned long pc, unsigned long npc,
@@ -327,7 +327,7 @@ void handle_reg_access(struct pt_regs *regs, unsigned long pc, unsigned long npc
 	printk("Register Access Exception at PC %08lx NPC %08lx PSR %08lx\n",
 	       pc, npc, psr);
 #endif
-	force_sig_fault(SIGBUS, BUS_OBJERR, (void __user *)pc, 0, current);
+	force_sig_fault(SIGBUS, BUS_OBJERR, (void __user *)pc, 0);
 }
 
 void handle_cp_disabled(struct pt_regs *regs, unsigned long pc, unsigned long npc,
diff --git a/arch/sparc/kernel/traps_64.c b/arch/sparc/kernel/traps_64.c
index 12bfc7e215ca..614d92c18506 100644
--- a/arch/sparc/kernel/traps_64.c
+++ b/arch/sparc/kernel/traps_64.c
@@ -107,7 +107,7 @@ void bad_trap(struct pt_regs *regs, long lvl)
 		regs->tnpc &= 0xffffffff;
 	}
 	force_sig_fault(SIGILL, ILL_ILLTRP,
-			(void __user *)regs->tpc, lvl, current);
+			(void __user *)regs->tpc, lvl);
 }
 
 void bad_trap_tl1(struct pt_regs *regs, long lvl)
@@ -201,7 +201,7 @@ void spitfire_insn_access_exception(struct pt_regs *regs, unsigned long sfsr, un
 		regs->tnpc &= 0xffffffff;
 	}
 	force_sig_fault(SIGSEGV, SEGV_MAPERR,
-			(void __user *)regs->tpc, 0, current);
+			(void __user *)regs->tpc, 0);
 out:
 	exception_exit(prev_state);
 }
@@ -236,7 +236,7 @@ void sun4v_insn_access_exception(struct pt_regs *regs, unsigned long addr, unsig
 		regs->tpc &= 0xffffffff;
 		regs->tnpc &= 0xffffffff;
 	}
-	force_sig_fault(SIGSEGV, SEGV_MAPERR, (void __user *) addr, 0, current);
+	force_sig_fault(SIGSEGV, SEGV_MAPERR, (void __user *) addr, 0);
 }
 
 void sun4v_insn_access_exception_tl1(struct pt_regs *regs, unsigned long addr, unsigned long type_ctx)
@@ -321,7 +321,7 @@ void spitfire_data_access_exception(struct pt_regs *regs, unsigned long sfsr, un
 	if (is_no_fault_exception(regs))
 		return;
 
-	force_sig_fault(SIGSEGV, SEGV_MAPERR, (void __user *)sfar, 0, current);
+	force_sig_fault(SIGSEGV, SEGV_MAPERR, (void __user *)sfar, 0);
 out:
 	exception_exit(prev_state);
 }
@@ -385,16 +385,13 @@ void sun4v_data_access_exception(struct pt_regs *regs, unsigned long addr, unsig
 	 */
 	switch (type) {
 	case HV_FAULT_TYPE_INV_ASI:
-		force_sig_fault(SIGILL, ILL_ILLADR, (void __user *)addr, 0,
-				current);
+		force_sig_fault(SIGILL, ILL_ILLADR, (void __user *)addr, 0);
 		break;
 	case HV_FAULT_TYPE_MCD_DIS:
-		force_sig_fault(SIGSEGV, SEGV_ACCADI, (void __user *)addr, 0,
-				current);
+		force_sig_fault(SIGSEGV, SEGV_ACCADI, (void __user *)addr, 0);
 		break;
 	default:
-		force_sig_fault(SIGSEGV, SEGV_MAPERR, (void __user *)addr, 0,
-				current);
+		force_sig_fault(SIGSEGV, SEGV_MAPERR, (void __user *)addr, 0);
 		break;
 	}
 }
@@ -571,7 +568,7 @@ static void spitfire_ue_log(unsigned long afsr, unsigned long afar, unsigned lon
 		regs->tpc &= 0xffffffff;
 		regs->tnpc &= 0xffffffff;
 	}
-	force_sig_fault(SIGBUS, BUS_OBJERR, (void *)0, 0, current);
+	force_sig_fault(SIGBUS, BUS_OBJERR, (void *)0, 0);
 }
 
 void spitfire_access_error(struct pt_regs *regs, unsigned long status_encoded, unsigned long afar)
@@ -2073,7 +2070,7 @@ void do_mcd_err(struct pt_regs *regs, struct sun4v_error_entry ent)
 	 * code
 	 */
 	force_sig_fault(SIGSEGV, SEGV_ADIDERR, (void __user *)ent.err_raddr,
-			0, current);
+			0);
 }
 
 /* We run with %pil set to PIL_NORMAL_MAX and PSTATE_IE enabled in %pstate.
@@ -2187,7 +2184,7 @@ bool sun4v_nonresum_error_user_handled(struct pt_regs *regs,
 	}
 	if (attrs & SUN4V_ERR_ATTRS_PIO) {
 		force_sig_fault(SIGBUS, BUS_ADRERR,
-				(void __user *)sun4v_get_vaddr(regs), 0, current);
+				(void __user *)sun4v_get_vaddr(regs), 0);
 		return true;
 	}
 
@@ -2344,7 +2341,7 @@ static void do_fpe_common(struct pt_regs *regs)
 				code = FPE_FLTRES;
 		}
 		force_sig_fault(SIGFPE, code,
-				(void __user *)regs->tpc, 0, current);
+				(void __user *)regs->tpc, 0);
 	}
 }
 
@@ -2399,7 +2396,7 @@ void do_tof(struct pt_regs *regs)
 		regs->tnpc &= 0xffffffff;
 	}
 	force_sig_fault(SIGEMT, EMT_TAGOVF,
-			(void __user *)regs->tpc, 0, current);
+			(void __user *)regs->tpc, 0);
 out:
 	exception_exit(prev_state);
 }
@@ -2419,7 +2416,7 @@ void do_div0(struct pt_regs *regs)
 		regs->tnpc &= 0xffffffff;
 	}
 	force_sig_fault(SIGFPE, FPE_INTDIV,
-			(void __user *)regs->tpc, 0, current);
+			(void __user *)regs->tpc, 0);
 out:
 	exception_exit(prev_state);
 }
@@ -2615,7 +2612,7 @@ void do_illegal_instruction(struct pt_regs *regs)
 			}
 		}
 	}
-	force_sig_fault(SIGILL, ILL_ILLOPC, (void __user *)pc, 0, current);
+	force_sig_fault(SIGILL, ILL_ILLOPC, (void __user *)pc, 0);
 out:
 	exception_exit(prev_state);
 }
@@ -2635,7 +2632,7 @@ void mem_address_unaligned(struct pt_regs *regs, unsigned long sfar, unsigned lo
 	if (is_no_fault_exception(regs))
 		return;
 
-	force_sig_fault(SIGBUS, BUS_ADRALN, (void __user *)sfar, 0, current);
+	force_sig_fault(SIGBUS, BUS_ADRALN, (void __user *)sfar, 0);
 out:
 	exception_exit(prev_state);
 }
@@ -2653,7 +2650,7 @@ void sun4v_do_mna(struct pt_regs *regs, unsigned long addr, unsigned long type_c
 	if (is_no_fault_exception(regs))
 		return;
 
-	force_sig_fault(SIGBUS, BUS_ADRALN, (void __user *) addr, 0, current);
+	force_sig_fault(SIGBUS, BUS_ADRALN, (void __user *) addr, 0);
 }
 
 /* sun4v_mem_corrupt_detect_precise() - Handle precise exception on an ADI
@@ -2700,7 +2697,7 @@ void sun4v_mem_corrupt_detect_precise(struct pt_regs *regs, unsigned long addr,
 		regs->tpc &= 0xffffffff;
 		regs->tnpc &= 0xffffffff;
 	}
-	force_sig_fault(SIGSEGV, SEGV_ADIPERR, (void __user *)addr, 0, current);
+	force_sig_fault(SIGSEGV, SEGV_ADIPERR, (void __user *)addr, 0);
 }
 
 void do_privop(struct pt_regs *regs)
@@ -2716,7 +2713,7 @@ void do_privop(struct pt_regs *regs)
 		regs->tnpc &= 0xffffffff;
 	}
 	force_sig_fault(SIGILL, ILL_PRVOPC,
-			(void __user *)regs->tpc, 0, current);
+			(void __user *)regs->tpc, 0);
 out:
 	exception_exit(prev_state);
 }
diff --git a/arch/sparc/mm/fault_32.c b/arch/sparc/mm/fault_32.c
index 2731faf415ba..8d69de111470 100644
--- a/arch/sparc/mm/fault_32.c
+++ b/arch/sparc/mm/fault_32.c
@@ -131,7 +131,7 @@ static void __do_fault_siginfo(int code, int sig, struct pt_regs *regs,
 		show_signal_msg(regs, sig, code,
 				addr, current);
 
-	force_sig_fault(sig, code, (void __user *) addr, 0, current);
+	force_sig_fault(sig, code, (void __user *) addr, 0);
 }
 
 static unsigned long compute_si_addr(struct pt_regs *regs, int text_fault)
diff --git a/arch/sparc/mm/fault_64.c b/arch/sparc/mm/fault_64.c
index 8f8a604c1300..83fda4d9c3b2 100644
--- a/arch/sparc/mm/fault_64.c
+++ b/arch/sparc/mm/fault_64.c
@@ -187,7 +187,7 @@ static void do_fault_siginfo(int code, int sig, struct pt_regs *regs,
 	if (unlikely(show_unhandled_signals))
 		show_signal_msg(regs, sig, code, addr, current);
 
-	force_sig_fault(sig, code, (void __user *) addr, 0, current);
+	force_sig_fault(sig, code, (void __user *) addr, 0);
 }
 
 static unsigned int get_fault_insn(struct pt_regs *regs, unsigned int insn)
diff --git a/arch/um/kernel/ptrace.c b/arch/um/kernel/ptrace.c
index 1797dfe9ce6d..da1e96b1ec3e 100644
--- a/arch/um/kernel/ptrace.c
+++ b/arch/um/kernel/ptrace.c
@@ -117,8 +117,7 @@ static void send_sigtrap(struct uml_pt_regs *regs, int error_code)
 	/* Send us the fake SIGTRAP */
 	force_sig_fault(SIGTRAP, TRAP_BRKPT,
 			/* User-mode eip? */
-			UPT_IS_USER(regs) ? (void __user *) UPT_IP(regs) : NULL,
-			current);
+			UPT_IS_USER(regs) ? (void __user *) UPT_IP(regs) : NULL);
 }
 
 /*
diff --git a/arch/um/kernel/trap.c b/arch/um/kernel/trap.c
index 1c943c66063f..58fe36856182 100644
--- a/arch/um/kernel/trap.c
+++ b/arch/um/kernel/trap.c
@@ -163,8 +163,7 @@ static void show_segv_info(struct uml_pt_regs *regs)
 static void bad_segv(struct faultinfo fi, unsigned long ip)
 {
 	current->thread.arch.faultinfo = fi;
-	force_sig_fault(SIGSEGV, SEGV_ACCERR, (void __user *) FAULT_ADDRESS(fi),
-			current);
+	force_sig_fault(SIGSEGV, SEGV_ACCERR, (void __user *) FAULT_ADDRESS(fi));
 }
 
 void fatal_sigsegv(void)
@@ -268,13 +267,11 @@ unsigned long segv(struct faultinfo fi, unsigned long ip, int is_user,
 
 	if (err == -EACCES) {
 		current->thread.arch.faultinfo = fi;
-		force_sig_fault(SIGBUS, BUS_ADRERR, (void __user *)address,
-				current);
+		force_sig_fault(SIGBUS, BUS_ADRERR, (void __user *)address);
 	} else {
 		BUG_ON(err != -EFAULT);
 		current->thread.arch.faultinfo = fi;
-		force_sig_fault(SIGSEGV, si_code, (void __user *) address,
-				current);
+		force_sig_fault(SIGSEGV, si_code, (void __user *) address);
 	}
 
 out:
@@ -304,8 +301,7 @@ void relay_signal(int sig, struct siginfo *si, struct uml_pt_regs *regs)
 	if ((err == 0) && (siginfo_layout(sig, code) == SIL_FAULT)) {
 		struct faultinfo *fi = UPT_FAULTINFO(regs);
 		current->thread.arch.faultinfo = *fi;
-		force_sig_fault(sig, code, (void __user *)FAULT_ADDRESS(*fi),
-				current);
+		force_sig_fault(sig, code, (void __user *)FAULT_ADDRESS(*fi));
 	} else {
 		printk(KERN_ERR "Attempted to relay unknown signal %d (si_code = %d) with errno %d\n",
 		       sig, code, err);
diff --git a/arch/unicore32/kernel/traps.c b/arch/unicore32/kernel/traps.c
index fb376d83e043..a0878035cda7 100644
--- a/arch/unicore32/kernel/traps.c
+++ b/arch/unicore32/kernel/traps.c
@@ -248,7 +248,7 @@ void uc32_notify_die(const char *str, struct pt_regs *regs,
 		current->thread.error_code = err;
 		current->thread.trap_no = trap;
 
-		force_sig_fault(sig, code, addr, current);
+		force_sig_fault(sig, code, addr);
 	} else
 		die(str, regs, err);
 }
diff --git a/arch/unicore32/mm/fault.c b/arch/unicore32/mm/fault.c
index 313547a93513..c85ba5339c1f 100644
--- a/arch/unicore32/mm/fault.c
+++ b/arch/unicore32/mm/fault.c
@@ -124,7 +124,7 @@ static void __do_user_fault(unsigned long addr, unsigned int fsr,
 	tsk->thread.address = addr;
 	tsk->thread.error_code = fsr;
 	tsk->thread.trap_no = 14;
-	force_sig_fault(sig, code, (void __user *)addr, current);
+	force_sig_fault(sig, code, (void __user *)addr);
 }
 
 void do_bad_area(unsigned long addr, unsigned int fsr, struct pt_regs *regs)
diff --git a/arch/x86/entry/vsyscall/vsyscall_64.c b/arch/x86/entry/vsyscall/vsyscall_64.c
index 7ea87f4ad0b7..2f31faf339d5 100644
--- a/arch/x86/entry/vsyscall/vsyscall_64.c
+++ b/arch/x86/entry/vsyscall/vsyscall_64.c
@@ -106,7 +106,7 @@ static bool write_ok_or_segv(unsigned long ptr, size_t size)
 		thread->cr2		= ptr;
 		thread->trap_nr		= X86_TRAP_PF;
 
-		force_sig_fault(SIGSEGV, SEGV_MAPERR, (void __user *)ptr, current);
+		force_sig_fault(SIGSEGV, SEGV_MAPERR, (void __user *)ptr);
 		return false;
 	} else {
 		return true;
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 34d27b2dc7a1..8f8f197389db 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -1369,7 +1369,7 @@ void send_sigtrap(struct pt_regs *regs, int error_code, int si_code)
 
 	/* Send us the fake SIGTRAP */
 	force_sig_fault(SIGTRAP, si_code,
-			user_mode(regs) ? (void __user *)regs->ip : NULL, current);
+			user_mode(regs) ? (void __user *)regs->ip : NULL);
 }
 
 void user_single_step_report(struct pt_regs *regs)
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 945b9a0719dd..87095a477154 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -256,7 +256,7 @@ do_trap(int trapnr, int signr, char *str, struct pt_regs *regs,
 	if (!sicode)
 		force_sig(signr);
 	else
-		force_sig_fault(signr, sicode, addr, current);
+		force_sig_fault(signr, sicode, addr);
 }
 NOKPROBE_SYMBOL(do_trap);
 
@@ -856,7 +856,7 @@ static void math_error(struct pt_regs *regs, int error_code, int trapnr)
 		return;
 
 	force_sig_fault(SIGFPE, si_code,
-			(void __user *)uprobe_get_trap_addr(regs), current);
+			(void __user *)uprobe_get_trap_addr(regs));
 }
 
 dotraplinkage void do_coprocessor_error(struct pt_regs *regs, long error_code)
diff --git a/arch/x86/kernel/umip.c b/arch/x86/kernel/umip.c
index 68cdcd717c85..5b345add550f 100644
--- a/arch/x86/kernel/umip.c
+++ b/arch/x86/kernel/umip.c
@@ -277,7 +277,7 @@ static void force_sig_info_umip_fault(void __user *addr, struct pt_regs *regs)
 	tsk->thread.error_code	= X86_PF_USER | X86_PF_WRITE;
 	tsk->thread.trap_nr	= X86_TRAP_PF;
 
-	force_sig_fault(SIGSEGV, SEGV_MAPERR, addr, current);
+	force_sig_fault(SIGSEGV, SEGV_MAPERR, addr);
 
 	if (!(show_unhandled_signals && unhandled_signal(tsk, SIGSEGV)))
 		return;
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 16a5d1b615a7..46ac96aa7c81 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -756,8 +756,7 @@ no_context(struct pt_regs *regs, unsigned long error_code,
 			set_signal_archinfo(address, error_code);
 
 			/* XXX: hwpoison faults will set the wrong code. */
-			force_sig_fault(signal, si_code, (void __user *)address,
-					current);
+			force_sig_fault(signal, si_code, (void __user *)address);
 		}
 
 		/*
@@ -918,7 +917,7 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
 		if (si_code == SEGV_PKUERR)
 			force_sig_pkuerr((void __user *)address, pkey);
 
-		force_sig_fault(SIGSEGV, si_code, (void __user *)address, current);
+		force_sig_fault(SIGSEGV, si_code, (void __user *)address);
 
 		return;
 	}
@@ -1044,7 +1043,7 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address,
 		return;
 	}
 #endif
-	force_sig_fault(SIGBUS, BUS_ADRERR, (void __user *)address, current);
+	force_sig_fault(SIGBUS, BUS_ADRERR, (void __user *)address);
 }
 
 static noinline void
diff --git a/arch/xtensa/kernel/traps.c b/arch/xtensa/kernel/traps.c
index 6f26b254091b..f060348c1b23 100644
--- a/arch/xtensa/kernel/traps.c
+++ b/arch/xtensa/kernel/traps.c
@@ -330,7 +330,7 @@ do_unaligned_user (struct pt_regs *regs)
 			    "(pid = %d, pc = %#010lx)\n",
 			    regs->excvaddr, current->comm,
 			    task_pid_nr(current), regs->pc);
-	force_sig_fault(SIGBUS, BUS_ADRALN, (void *) regs->excvaddr, current);
+	force_sig_fault(SIGBUS, BUS_ADRALN, (void *) regs->excvaddr);
 }
 #endif
 
diff --git a/arch/xtensa/mm/fault.c b/arch/xtensa/mm/fault.c
index 2ab0e0dcd166..f81b1478da61 100644
--- a/arch/xtensa/mm/fault.c
+++ b/arch/xtensa/mm/fault.c
@@ -157,7 +157,7 @@ bad_area:
 	if (user_mode(regs)) {
 		current->thread.bad_vaddr = address;
 		current->thread.error_code = is_write;
-		force_sig_fault(SIGSEGV, code, (void *) address, current);
+		force_sig_fault(SIGSEGV, code, (void *) address);
 		return;
 	}
 	bad_page_fault(regs, address, SIGSEGV);
@@ -182,7 +182,7 @@ do_sigbus:
 	 * or user mode.
 	 */
 	current->thread.bad_vaddr = address;
-	force_sig_fault(SIGBUS, BUS_ADRERR, (void *) address, current);
+	force_sig_fault(SIGBUS, BUS_ADRERR, (void *) address);
 
 	/* Kernel mode? Handle exceptions or die */
 	if (!user_mode(regs))
diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h
index 507af66a1fc8..7f872506e1de 100644
--- a/include/linux/sched/signal.h
+++ b/include/linux/sched/signal.h
@@ -313,8 +313,7 @@ int force_sig_fault_to_task(int sig, int code, void __user *addr
 	, struct task_struct *t);
 int force_sig_fault(int sig, int code, void __user *addr
 	___ARCH_SI_TRAPNO(int trapno)
-	___ARCH_SI_IA64(int imm, unsigned int flags, unsigned long isr)
-	, struct task_struct *t);
+	___ARCH_SI_IA64(int imm, unsigned int flags, unsigned long isr));
 int send_sig_fault(int sig, int code, void __user *addr
 	___ARCH_SI_TRAPNO(int trapno)
 	___ARCH_SI_IA64(int imm, unsigned int flags, unsigned long isr)
diff --git a/kernel/signal.c b/kernel/signal.c
index e420489ac4c9..d92b636b4e9d 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1645,12 +1645,11 @@ int force_sig_fault_to_task(int sig, int code, void __user *addr
 
 int force_sig_fault(int sig, int code, void __user *addr
 	___ARCH_SI_TRAPNO(int trapno)
-	___ARCH_SI_IA64(int imm, unsigned int flags, unsigned long isr)
-	, struct task_struct *t)
+	___ARCH_SI_IA64(int imm, unsigned int flags, unsigned long isr))
 {
 	return force_sig_fault_to_task(sig, code, addr
 				       ___ARCH_SI_TRAPNO(trapno)
-				       ___ARCH_SI_IA64(imm, flags, isr), t);
+				       ___ARCH_SI_IA64(imm, flags, isr), current);
 }
 
 int send_sig_fault(int sig, int code, void __user *addr
-- 
cgit v1.2.3


From a89e9b8abf82725e4ac96100e07c8104dbe8a240 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Wed, 15 May 2019 10:11:09 -0500
Subject: signal: Remove the signal number and task parameters from
 force_sig_info

force_sig_info always delivers to the current task and the signal
parameter always matches info.si_signo.  So remove those parameters to
make it a simpler less error prone interface, and to make it clear
that none of the callers are doing anything clever.

This guarantees that force_sig_info will not grow any new buggy
callers that attempt to call force_sig on a non-current task, or that
pass a signal number that does not match info.si_signo.

Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 include/linux/ptrace.h       |  2 +-
 include/linux/sched/signal.h |  2 +-
 kernel/seccomp.c             |  2 +-
 kernel/signal.c              | 14 +++++++-------
 4 files changed, 10 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h
index d5084ebd9f03..2a9df80ea887 100644
--- a/include/linux/ptrace.h
+++ b/include/linux/ptrace.h
@@ -355,7 +355,7 @@ static inline void user_single_step_report(struct pt_regs *regs)
 	info.si_code = SI_USER;
 	info.si_pid = 0;
 	info.si_uid = 0;
-	force_sig_info(info.si_signo, &info, current);
+	force_sig_info(&info);
 }
 #endif
 
diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h
index 7f872506e1de..532458698bde 100644
--- a/include/linux/sched/signal.h
+++ b/include/linux/sched/signal.h
@@ -329,7 +329,7 @@ int force_sig_ptrace_errno_trap(int errno, void __user *addr);
 
 extern int send_sig_info(int, struct kernel_siginfo *, struct task_struct *);
 extern void force_sigsegv(int sig);
-extern int force_sig_info(int, struct kernel_siginfo *, struct task_struct *);
+extern int force_sig_info(struct kernel_siginfo *);
 extern int __kill_pgrp_info(int sig, struct kernel_siginfo *info, struct pid *pgrp);
 extern int kill_pid_info(int sig, struct kernel_siginfo *info, struct pid *pid);
 extern int kill_pid_usb_asyncio(int sig, int errno, sigval_t addr, struct pid *,
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 811b4a86cdf6..dba52a7db5e8 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -609,7 +609,7 @@ static void seccomp_send_sigsys(int syscall, int reason)
 {
 	struct kernel_siginfo info;
 	seccomp_init_siginfo(&info, syscall, reason);
-	force_sig_info(SIGSYS, &info, current);
+	force_sig_info(&info);
 }
 #endif	/* CONFIG_SECCOMP_FILTER */
 
diff --git a/kernel/signal.c b/kernel/signal.c
index 0984158cd41a..ff6944e4964e 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1325,9 +1325,9 @@ force_sig_info_to_task(struct kernel_siginfo *info, struct task_struct *t)
 	return ret;
 }
 
-int force_sig_info(int sig, struct kernel_siginfo *info, struct task_struct *t)
+int force_sig_info(struct kernel_siginfo *info)
 {
-	return force_sig_info_to_task(info, t);
+	return force_sig_info_to_task(info, current);
 }
 
 /*
@@ -1619,7 +1619,7 @@ void force_sig(int sig)
 	info.si_code = SI_KERNEL;
 	info.si_pid = 0;
 	info.si_uid = 0;
-	force_sig_info(info.si_signo, &info, current);
+	force_sig_info(&info);
 }
 EXPORT_SYMBOL(force_sig);
 
@@ -1708,7 +1708,7 @@ int force_sig_mceerr(int code, void __user *addr, short lsb)
 	info.si_code = code;
 	info.si_addr = addr;
 	info.si_addr_lsb = lsb;
-	return force_sig_info(info.si_signo, &info, current);
+	return force_sig_info(&info);
 }
 
 int send_sig_mceerr(int code, void __user *addr, short lsb, struct task_struct *t)
@@ -1737,7 +1737,7 @@ int force_sig_bnderr(void __user *addr, void __user *lower, void __user *upper)
 	info.si_addr  = addr;
 	info.si_lower = lower;
 	info.si_upper = upper;
-	return force_sig_info(info.si_signo, &info, current);
+	return force_sig_info(&info);
 }
 
 #ifdef SEGV_PKUERR
@@ -1751,7 +1751,7 @@ int force_sig_pkuerr(void __user *addr, u32 pkey)
 	info.si_code  = SEGV_PKUERR;
 	info.si_addr  = addr;
 	info.si_pkey  = pkey;
-	return force_sig_info(info.si_signo, &info, current);
+	return force_sig_info(&info);
 }
 #endif
 
@@ -1767,7 +1767,7 @@ int force_sig_ptrace_errno_trap(int errno, void __user *addr)
 	info.si_errno = errno;
 	info.si_code  = TRAP_HWBKPT;
 	info.si_addr  = addr;
-	return force_sig_info(info.si_signo, &info, current);
+	return force_sig_info(&info);
 }
 
 int kill_pgrp(struct pid *pid, int sig, int priv)
-- 
cgit v1.2.3


From 279758f8001f0014b15656a4ef130a20852f6df6 Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Tue, 28 May 2019 15:02:31 +0800
Subject: rhashtable: Add rht_ptr_rcu and improve rht_ptr

This patch moves common code between rht_ptr and rht_ptr_exclusive
into __rht_ptr.  It also adds a new helper rht_ptr_rcu exclusively
for the RCU case.  This way rht_ptr becomes a lock-only construct
so we can use the lighter rcu_dereference_protected primitive.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/rhashtable.h | 36 ++++++++++++++++++------------------
 1 file changed, 18 insertions(+), 18 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h
index 9f8bc06d4136..beb9a9da1699 100644
--- a/include/linux/rhashtable.h
+++ b/include/linux/rhashtable.h
@@ -352,37 +352,38 @@ static inline void rht_unlock(struct bucket_table *tbl,
 static inline struct rhash_head __rcu *__rht_ptr(
 	struct rhash_lock_head *const *bkt)
 {
-	return (struct rhash_head __rcu *)((unsigned long)*bkt & ~BIT(0));
+	return (struct rhash_head __rcu *)
+		((unsigned long)*bkt & ~BIT(0) ?:
+		 (unsigned long)RHT_NULLS_MARKER(bkt));
 }
 
 /*
  * Where 'bkt' is a bucket and might be locked:
- *   rht_ptr() dereferences that pointer and clears the lock bit.
+ *   rht_ptr_rcu() dereferences that pointer and clears the lock bit.
+ *   rht_ptr() dereferences in a context where the bucket is locked.
  *   rht_ptr_exclusive() dereferences in a context where exclusive
  *            access is guaranteed, such as when destroying the table.
  */
+static inline struct rhash_head *rht_ptr_rcu(
+	struct rhash_lock_head *const *bkt)
+{
+	struct rhash_head __rcu *p = __rht_ptr(bkt);
+
+	return rcu_dereference(p);
+}
+
 static inline struct rhash_head *rht_ptr(
 	struct rhash_lock_head *const *bkt,
 	struct bucket_table *tbl,
 	unsigned int hash)
 {
-	struct rhash_head __rcu *p = __rht_ptr(bkt);
-
-	if (!p)
-		return RHT_NULLS_MARKER(bkt);
-
-	return rht_dereference_bucket_rcu(p, tbl, hash);
+	return rht_dereference_bucket(__rht_ptr(bkt), tbl, hash);
 }
 
 static inline struct rhash_head *rht_ptr_exclusive(
 	struct rhash_lock_head *const *bkt)
 {
-	struct rhash_head __rcu *p = __rht_ptr(bkt);
-
-	if (!p)
-		return RHT_NULLS_MARKER(bkt);
-
-	return rcu_dereference_protected(p, 1);
+	return rcu_dereference_protected(__rht_ptr(bkt), 1);
 }
 
 static inline void rht_assign_locked(struct rhash_lock_head **bkt,
@@ -509,7 +510,7 @@ static inline void rht_assign_unlock(struct bucket_table *tbl,
  */
 #define rht_for_each_rcu(pos, tbl, hash)			\
 	for (({barrier(); }),					\
-	     pos = rht_ptr(rht_bucket(tbl, hash), tbl, hash);	\
+	     pos = rht_ptr_rcu(rht_bucket(tbl, hash));		\
 	     !rht_is_a_nulls(pos);				\
 	     pos = rcu_dereference_raw(pos->next))
 
@@ -546,8 +547,7 @@ static inline void rht_assign_unlock(struct bucket_table *tbl,
  */
 #define rht_for_each_entry_rcu(tpos, pos, tbl, hash, member)		   \
 	rht_for_each_entry_rcu_from(tpos, pos,				   \
-				    rht_ptr(rht_bucket(tbl, hash),	   \
-					    tbl, hash),			   \
+				    rht_ptr_rcu(rht_bucket(tbl, hash)),	   \
 				    tbl, hash, member)
 
 /**
@@ -603,7 +603,7 @@ restart:
 	hash = rht_key_hashfn(ht, tbl, key, params);
 	bkt = rht_bucket(tbl, hash);
 	do {
-		rht_for_each_rcu_from(he, rht_ptr(bkt, tbl, hash), tbl, hash) {
+		rht_for_each_rcu_from(he, rht_ptr_rcu(bkt), tbl, hash) {
 			if (params.obj_cmpfn ?
 			    params.obj_cmpfn(&arg, rht_obj(ht, he)) :
 			    rhashtable_compare(&arg, rht_obj(ht, he)))
-- 
cgit v1.2.3


From 44cc27e43fa3b8977373915a8e7f515a9d263343 Mon Sep 17 00:00:00 2001
From: Ioana Ciornei <ioana.ciornei@nxp.com>
Date: Tue, 28 May 2019 20:38:12 +0300
Subject: net: phylink: Add struct phylink_config to PHYLINK API

The phylink_config structure will encapsulate a pointer to a struct
device and the operation type requested for this instance of PHYLINK.
This patch does not make any functional changes, it just transitions the
PHYLINK internals and all its users to the new API.

A pointer to a phylink_config structure will be passed to
phylink_create() instead of the net_device directly. Also, the same
phylink_config pointer will be passed back to all phylink_mac_ops
callbacks instead of the net_device. Using this mechanism, a PHYLINK
user can get the original net_device using a structure such as
'to_net_dev(config->dev)' or directly the structure containing the
phylink_config using a container_of call.

At the moment, only the PHYLINK_NETDEV is defined as a valid operation
type for PHYLINK. In this mode, a valid reference to a struct device
linked to the original net_device should be passed to PHYLINK through
the phylink_config structure.

This API change is mainly driven by the necessity of adding a new
operation type in PHYLINK that disconnects the phy_device from the
net_device and also works when the net_device is lacking.

Signed-off-by: Ioana Ciornei <ioana.ciornei@nxp.com>
Signed-off-by: Vladimir Oltean <olteanv@gmail.com>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Reviewed-by: Maxime Chevallier <maxime.chevallier@bootlin.com>
Tested-by: Maxime Chevallier <maxime.chevallier@bootlin.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/sfp-phylink.rst        |  5 ++-
 drivers/net/ethernet/marvell/mvneta.c           | 36 ++++++++++------
 drivers/net/ethernet/marvell/mvpp2/mvpp2.h      |  1 +
 drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c | 43 +++++++++++--------
 drivers/net/phy/phylink.c                       | 26 ++++++++----
 include/linux/phylink.h                         | 56 ++++++++++++++++---------
 include/net/dsa.h                               |  2 +
 net/dsa/slave.c                                 | 31 ++++++++------
 8 files changed, 128 insertions(+), 72 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/networking/sfp-phylink.rst b/Documentation/networking/sfp-phylink.rst
index 5bd26cb07244..91446b431b70 100644
--- a/Documentation/networking/sfp-phylink.rst
+++ b/Documentation/networking/sfp-phylink.rst
@@ -98,6 +98,7 @@ this documentation.
 4. Add::
 
 	struct phylink *phylink;
+	struct phylink_config phylink_config;
 
    to the driver's private data structure.  We shall refer to the
    driver's private data pointer as ``priv`` below, and the driver's
@@ -223,8 +224,10 @@ this documentation.
    .. code-block:: c
 
 	struct phylink *phylink;
+	priv->phylink_config.dev = &dev.dev;
+	priv->phylink_config.type = PHYLINK_NETDEV;
 
-	phylink = phylink_create(dev, node, phy_mode, &phylink_ops);
+	phylink = phylink_create(&priv->phylink_config, node, phy_mode, &phylink_ops);
 	if (IS_ERR(phylink)) {
 		err = PTR_ERR(phylink);
 		fail probe;
diff --git a/drivers/net/ethernet/marvell/mvneta.c b/drivers/net/ethernet/marvell/mvneta.c
index e758650b2c26..adbbcdde73e6 100644
--- a/drivers/net/ethernet/marvell/mvneta.c
+++ b/drivers/net/ethernet/marvell/mvneta.c
@@ -437,6 +437,7 @@ struct mvneta_port {
 	struct device_node *dn;
 	unsigned int tx_csum_limit;
 	struct phylink *phylink;
+	struct phylink_config phylink_config;
 	struct phy *comphy;
 
 	struct mvneta_bm *bm_priv;
@@ -3356,9 +3357,11 @@ static int mvneta_set_mac_addr(struct net_device *dev, void *addr)
 	return 0;
 }
 
-static void mvneta_validate(struct net_device *ndev, unsigned long *supported,
+static void mvneta_validate(struct phylink_config *config,
+			    unsigned long *supported,
 			    struct phylink_link_state *state)
 {
+	struct net_device *ndev = to_net_dev(config->dev);
 	struct mvneta_port *pp = netdev_priv(ndev);
 	__ETHTOOL_DECLARE_LINK_MODE_MASK(mask) = { 0, };
 
@@ -3408,9 +3411,10 @@ static void mvneta_validate(struct net_device *ndev, unsigned long *supported,
 	phylink_helper_basex_speed(state);
 }
 
-static int mvneta_mac_link_state(struct net_device *ndev,
+static int mvneta_mac_link_state(struct phylink_config *config,
 				 struct phylink_link_state *state)
 {
+	struct net_device *ndev = to_net_dev(config->dev);
 	struct mvneta_port *pp = netdev_priv(ndev);
 	u32 gmac_stat;
 
@@ -3438,8 +3442,9 @@ static int mvneta_mac_link_state(struct net_device *ndev,
 	return 1;
 }
 
-static void mvneta_mac_an_restart(struct net_device *ndev)
+static void mvneta_mac_an_restart(struct phylink_config *config)
 {
+	struct net_device *ndev = to_net_dev(config->dev);
 	struct mvneta_port *pp = netdev_priv(ndev);
 	u32 gmac_an = mvreg_read(pp, MVNETA_GMAC_AUTONEG_CONFIG);
 
@@ -3449,9 +3454,10 @@ static void mvneta_mac_an_restart(struct net_device *ndev)
 		    gmac_an & ~MVNETA_GMAC_INBAND_RESTART_AN);
 }
 
-static void mvneta_mac_config(struct net_device *ndev, unsigned int mode,
-	const struct phylink_link_state *state)
+static void mvneta_mac_config(struct phylink_config *config, unsigned int mode,
+			      const struct phylink_link_state *state)
 {
+	struct net_device *ndev = to_net_dev(config->dev);
 	struct mvneta_port *pp = netdev_priv(ndev);
 	u32 new_ctrl0, gmac_ctrl0 = mvreg_read(pp, MVNETA_GMAC_CTRL_0);
 	u32 new_ctrl2, gmac_ctrl2 = mvreg_read(pp, MVNETA_GMAC_CTRL_2);
@@ -3581,9 +3587,10 @@ static void mvneta_set_eee(struct mvneta_port *pp, bool enable)
 	mvreg_write(pp, MVNETA_LPI_CTRL_1, lpi_ctl1);
 }
 
-static void mvneta_mac_link_down(struct net_device *ndev, unsigned int mode,
-				 phy_interface_t interface)
+static void mvneta_mac_link_down(struct phylink_config *config,
+				 unsigned int mode, phy_interface_t interface)
 {
+	struct net_device *ndev = to_net_dev(config->dev);
 	struct mvneta_port *pp = netdev_priv(ndev);
 	u32 val;
 
@@ -3600,10 +3607,11 @@ static void mvneta_mac_link_down(struct net_device *ndev, unsigned int mode,
 	mvneta_set_eee(pp, false);
 }
 
-static void mvneta_mac_link_up(struct net_device *ndev, unsigned int mode,
+static void mvneta_mac_link_up(struct phylink_config *config, unsigned int mode,
 			       phy_interface_t interface,
 			       struct phy_device *phy)
 {
+	struct net_device *ndev = to_net_dev(config->dev);
 	struct mvneta_port *pp = netdev_priv(ndev);
 	u32 val;
 
@@ -4500,8 +4508,14 @@ static int mvneta_probe(struct platform_device *pdev)
 		comphy = NULL;
 	}
 
-	phylink = phylink_create(dev, pdev->dev.fwnode, phy_mode,
-				 &mvneta_phylink_ops);
+	pp = netdev_priv(dev);
+	spin_lock_init(&pp->lock);
+
+	pp->phylink_config.dev = &dev->dev;
+	pp->phylink_config.type = PHYLINK_NETDEV;
+
+	phylink = phylink_create(&pp->phylink_config, pdev->dev.fwnode,
+				 phy_mode, &mvneta_phylink_ops);
 	if (IS_ERR(phylink)) {
 		err = PTR_ERR(phylink);
 		goto err_free_irq;
@@ -4513,8 +4527,6 @@ static int mvneta_probe(struct platform_device *pdev)
 
 	dev->ethtool_ops = &mvneta_eth_tool_ops;
 
-	pp = netdev_priv(dev);
-	spin_lock_init(&pp->lock);
 	pp->phylink = phylink;
 	pp->comphy = comphy;
 	pp->phy_interface = phy_mode;
diff --git a/drivers/net/ethernet/marvell/mvpp2/mvpp2.h b/drivers/net/ethernet/marvell/mvpp2/mvpp2.h
index 18ae8d06b692..d67c970f02e5 100644
--- a/drivers/net/ethernet/marvell/mvpp2/mvpp2.h
+++ b/drivers/net/ethernet/marvell/mvpp2/mvpp2.h
@@ -915,6 +915,7 @@ struct mvpp2_port {
 
 	phy_interface_t phy_interface;
 	struct phylink *phylink;
+	struct phylink_config phylink_config;
 	struct phy *comphy;
 
 	struct mvpp2_bm_pool *pool_long;
diff --git a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c
index 3ed713b8dea5..757f8e31645e 100644
--- a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c
+++ b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c
@@ -56,9 +56,9 @@ static struct {
 /* The prototype is added here to be used in start_dev when using ACPI. This
  * will be removed once phylink is used for all modes (dt+ACPI).
  */
-static void mvpp2_mac_config(struct net_device *dev, unsigned int mode,
+static void mvpp2_mac_config(struct phylink_config *config, unsigned int mode,
 			     const struct phylink_link_state *state);
-static void mvpp2_mac_link_up(struct net_device *dev, unsigned int mode,
+static void mvpp2_mac_link_up(struct phylink_config *config, unsigned int mode,
 			      phy_interface_t interface, struct phy_device *phy);
 
 /* Queue modes */
@@ -3239,9 +3239,9 @@ static void mvpp2_start_dev(struct mvpp2_port *port)
 		struct phylink_link_state state = {
 			.interface = port->phy_interface,
 		};
-		mvpp2_mac_config(port->dev, MLO_AN_INBAND, &state);
-		mvpp2_mac_link_up(port->dev, MLO_AN_INBAND, port->phy_interface,
-				  NULL);
+		mvpp2_mac_config(&port->phylink_config, MLO_AN_INBAND, &state);
+		mvpp2_mac_link_up(&port->phylink_config, MLO_AN_INBAND,
+				  port->phy_interface, NULL);
 	}
 
 	netif_tx_start_all_queues(port->dev);
@@ -4463,11 +4463,12 @@ static void mvpp2_port_copy_mac_addr(struct net_device *dev, struct mvpp2 *priv,
 	eth_hw_addr_random(dev);
 }
 
-static void mvpp2_phylink_validate(struct net_device *dev,
+static void mvpp2_phylink_validate(struct phylink_config *config,
 				   unsigned long *supported,
 				   struct phylink_link_state *state)
 {
-	struct mvpp2_port *port = netdev_priv(dev);
+	struct mvpp2_port *port = container_of(config, struct mvpp2_port,
+					       phylink_config);
 	__ETHTOOL_DECLARE_LINK_MODE_MASK(mask) = { 0, };
 
 	/* Invalid combinations */
@@ -4591,10 +4592,11 @@ static void mvpp2_gmac_link_state(struct mvpp2_port *port,
 		state->pause |= MLO_PAUSE_TX;
 }
 
-static int mvpp2_phylink_mac_link_state(struct net_device *dev,
+static int mvpp2_phylink_mac_link_state(struct phylink_config *config,
 					struct phylink_link_state *state)
 {
-	struct mvpp2_port *port = netdev_priv(dev);
+	struct mvpp2_port *port = container_of(config, struct mvpp2_port,
+					       phylink_config);
 
 	if (port->priv->hw_version == MVPP22 && port->gop_id == 0) {
 		u32 mode = readl(port->base + MVPP22_XLG_CTRL3_REG);
@@ -4610,9 +4612,10 @@ static int mvpp2_phylink_mac_link_state(struct net_device *dev,
 	return 1;
 }
 
-static void mvpp2_mac_an_restart(struct net_device *dev)
+static void mvpp2_mac_an_restart(struct phylink_config *config)
 {
-	struct mvpp2_port *port = netdev_priv(dev);
+	struct mvpp2_port *port = container_of(config, struct mvpp2_port,
+					       phylink_config);
 	u32 val = readl(port->base + MVPP2_GMAC_AUTONEG_CONFIG);
 
 	writel(val | MVPP2_GMAC_IN_BAND_RESTART_AN,
@@ -4797,9 +4800,10 @@ static void mvpp2_gmac_config(struct mvpp2_port *port, unsigned int mode,
 	}
 }
 
-static void mvpp2_mac_config(struct net_device *dev, unsigned int mode,
+static void mvpp2_mac_config(struct phylink_config *config, unsigned int mode,
 			     const struct phylink_link_state *state)
 {
+	struct net_device *dev = to_net_dev(config->dev);
 	struct mvpp2_port *port = netdev_priv(dev);
 	bool change_interface = port->phy_interface != state->interface;
 
@@ -4839,9 +4843,10 @@ static void mvpp2_mac_config(struct net_device *dev, unsigned int mode,
 	mvpp2_port_enable(port);
 }
 
-static void mvpp2_mac_link_up(struct net_device *dev, unsigned int mode,
+static void mvpp2_mac_link_up(struct phylink_config *config, unsigned int mode,
 			      phy_interface_t interface, struct phy_device *phy)
 {
+	struct net_device *dev = to_net_dev(config->dev);
 	struct mvpp2_port *port = netdev_priv(dev);
 	u32 val;
 
@@ -4866,9 +4871,10 @@ static void mvpp2_mac_link_up(struct net_device *dev, unsigned int mode,
 	netif_tx_wake_all_queues(dev);
 }
 
-static void mvpp2_mac_link_down(struct net_device *dev, unsigned int mode,
-				phy_interface_t interface)
+static void mvpp2_mac_link_down(struct phylink_config *config,
+				unsigned int mode, phy_interface_t interface)
 {
+	struct net_device *dev = to_net_dev(config->dev);
 	struct mvpp2_port *port = netdev_priv(dev);
 	u32 val;
 
@@ -5125,8 +5131,11 @@ static int mvpp2_port_probe(struct platform_device *pdev,
 
 	/* Phylink isn't used w/ ACPI as of now */
 	if (port_node) {
-		phylink = phylink_create(dev, port_fwnode, phy_mode,
-					 &mvpp2_phylink_ops);
+		port->phylink_config.dev = &dev->dev;
+		port->phylink_config.type = PHYLINK_NETDEV;
+
+		phylink = phylink_create(&port->phylink_config, port_fwnode,
+					 phy_mode, &mvpp2_phylink_ops);
 		if (IS_ERR(phylink)) {
 			err = PTR_ERR(phylink);
 			goto err_free_port_pcpu;
diff --git a/drivers/net/phy/phylink.c b/drivers/net/phy/phylink.c
index 83ab83c3edba..5a283bf9d402 100644
--- a/drivers/net/phy/phylink.c
+++ b/drivers/net/phy/phylink.c
@@ -41,6 +41,7 @@ struct phylink {
 	/* private: */
 	struct net_device *netdev;
 	const struct phylink_mac_ops *ops;
+	struct phylink_config *config;
 
 	unsigned long phylink_disable_state; /* bitmask of disables */
 	struct phy_device *phydev;
@@ -111,7 +112,7 @@ static const char *phylink_an_mode_str(unsigned int mode)
 static int phylink_validate(struct phylink *pl, unsigned long *supported,
 			    struct phylink_link_state *state)
 {
-	pl->ops->validate(pl->netdev, supported, state);
+	pl->ops->validate(pl->config, supported, state);
 
 	return phylink_is_empty_linkmode(supported) ? -EINVAL : 0;
 }
@@ -299,7 +300,7 @@ static void phylink_mac_config(struct phylink *pl,
 		   __ETHTOOL_LINK_MODE_MASK_NBITS, state->advertising,
 		   state->pause, state->link, state->an_enabled);
 
-	pl->ops->mac_config(pl->netdev, pl->link_an_mode, state);
+	pl->ops->mac_config(pl->config, pl->link_an_mode, state);
 }
 
 static void phylink_mac_config_up(struct phylink *pl,
@@ -313,12 +314,11 @@ static void phylink_mac_an_restart(struct phylink *pl)
 {
 	if (pl->link_config.an_enabled &&
 	    phy_interface_mode_is_8023z(pl->link_config.interface))
-		pl->ops->mac_an_restart(pl->netdev);
+		pl->ops->mac_an_restart(pl->config);
 }
 
 static int phylink_get_mac_state(struct phylink *pl, struct phylink_link_state *state)
 {
-	struct net_device *ndev = pl->netdev;
 
 	linkmode_copy(state->advertising, pl->link_config.advertising);
 	linkmode_zero(state->lp_advertising);
@@ -330,7 +330,7 @@ static int phylink_get_mac_state(struct phylink *pl, struct phylink_link_state *
 	state->an_complete = 0;
 	state->link = 1;
 
-	return pl->ops->mac_link_state(ndev, state);
+	return pl->ops->mac_link_state(pl->config, state);
 }
 
 /* The fixed state is... fixed except for the link state,
@@ -400,7 +400,7 @@ static void phylink_mac_link_up(struct phylink *pl,
 {
 	struct net_device *ndev = pl->netdev;
 
-	pl->ops->mac_link_up(ndev, pl->link_an_mode,
+	pl->ops->mac_link_up(pl->config, pl->link_an_mode,
 			     pl->phy_state.interface,
 			     pl->phydev);
 
@@ -418,7 +418,7 @@ static void phylink_mac_link_down(struct phylink *pl)
 	struct net_device *ndev = pl->netdev;
 
 	netif_carrier_off(ndev);
-	pl->ops->mac_link_down(ndev, pl->link_an_mode,
+	pl->ops->mac_link_down(pl->config, pl->link_an_mode,
 			       pl->phy_state.interface);
 	netdev_info(ndev, "Link is Down\n");
 }
@@ -553,7 +553,7 @@ static int phylink_register_sfp(struct phylink *pl,
  * Returns a pointer to a &struct phylink, or an error-pointer value. Users
  * must use IS_ERR() to check for errors from this function.
  */
-struct phylink *phylink_create(struct net_device *ndev,
+struct phylink *phylink_create(struct phylink_config *config,
 			       struct fwnode_handle *fwnode,
 			       phy_interface_t iface,
 			       const struct phylink_mac_ops *ops)
@@ -567,7 +567,15 @@ struct phylink *phylink_create(struct net_device *ndev,
 
 	mutex_init(&pl->state_mutex);
 	INIT_WORK(&pl->resolve, phylink_resolve);
-	pl->netdev = ndev;
+
+	pl->config = config;
+	if (config->type == PHYLINK_NETDEV) {
+		pl->netdev = to_net_dev(config->dev);
+	} else {
+		kfree(pl);
+		return ERR_PTR(-EINVAL);
+	}
+
 	pl->phy_state.interface = iface;
 	pl->link_interface = iface;
 	if (iface == PHY_INTERFACE_MODE_MOCA)
diff --git a/include/linux/phylink.h b/include/linux/phylink.h
index 6411c624f63a..67f35f07ac4b 100644
--- a/include/linux/phylink.h
+++ b/include/linux/phylink.h
@@ -54,6 +54,20 @@ struct phylink_link_state {
 	unsigned int an_complete:1;
 };
 
+enum phylink_op_type {
+	PHYLINK_NETDEV = 0,
+};
+
+/**
+ * struct phylink_config - PHYLINK configuration structure
+ * @dev: a pointer to a struct device associated with the MAC
+ * @type: operation type of PHYLINK instance
+ */
+struct phylink_config {
+	struct device *dev;
+	enum phylink_op_type type;
+};
+
 /**
  * struct phylink_mac_ops - MAC operations structure.
  * @validate: Validate and update the link configuration.
@@ -66,16 +80,17 @@ struct phylink_link_state {
  * The individual methods are described more fully below.
  */
 struct phylink_mac_ops {
-	void (*validate)(struct net_device *ndev, unsigned long *supported,
+	void (*validate)(struct phylink_config *config,
+			 unsigned long *supported,
 			 struct phylink_link_state *state);
-	int (*mac_link_state)(struct net_device *ndev,
+	int (*mac_link_state)(struct phylink_config *config,
 			      struct phylink_link_state *state);
-	void (*mac_config)(struct net_device *ndev, unsigned int mode,
+	void (*mac_config)(struct phylink_config *config, unsigned int mode,
 			   const struct phylink_link_state *state);
-	void (*mac_an_restart)(struct net_device *ndev);
-	void (*mac_link_down)(struct net_device *ndev, unsigned int mode,
+	void (*mac_an_restart)(struct phylink_config *config);
+	void (*mac_link_down)(struct phylink_config *config, unsigned int mode,
 			      phy_interface_t interface);
-	void (*mac_link_up)(struct net_device *ndev, unsigned int mode,
+	void (*mac_link_up)(struct phylink_config *config, unsigned int mode,
 			    phy_interface_t interface,
 			    struct phy_device *phy);
 };
@@ -83,7 +98,7 @@ struct phylink_mac_ops {
 #if 0 /* For kernel-doc purposes only. */
 /**
  * validate - Validate and update the link configuration
- * @ndev: a pointer to a &struct net_device for the MAC.
+ * @config: a pointer to a &struct phylink_config.
  * @supported: ethtool bitmask for supported link modes.
  * @state: a pointer to a &struct phylink_link_state.
  *
@@ -100,12 +115,12 @@ struct phylink_mac_ops {
  * based on @state->advertising and/or @state->speed and update
  * @state->interface accordingly.
  */
-void validate(struct net_device *ndev, unsigned long *supported,
+void validate(struct phylink_config *config, unsigned long *supported,
 	      struct phylink_link_state *state);
 
 /**
  * mac_link_state() - Read the current link state from the hardware
- * @ndev: a pointer to a &struct net_device for the MAC.
+ * @config: a pointer to a &struct phylink_config.
  * @state: a pointer to a &struct phylink_link_state.
  *
  * Read the current link state from the MAC, reporting the current
@@ -114,12 +129,12 @@ void validate(struct net_device *ndev, unsigned long *supported,
  * negotiation completion state in @state->an_complete, and link
  * up state in @state->link.
  */
-int mac_link_state(struct net_device *ndev,
+int mac_link_state(struct phylink_config *config,
 		   struct phylink_link_state *state);
 
 /**
  * mac_config() - configure the MAC for the selected mode and state
- * @ndev: a pointer to a &struct net_device for the MAC.
+ * @config: a pointer to a &struct phylink_config.
  * @mode: one of %MLO_AN_FIXED, %MLO_AN_PHY, %MLO_AN_INBAND.
  * @state: a pointer to a &struct phylink_link_state.
  *
@@ -157,18 +172,18 @@ int mac_link_state(struct net_device *ndev,
  * down.  This "update" behaviour is critical to avoid bouncing the
  * link up status.
  */
-void mac_config(struct net_device *ndev, unsigned int mode,
+void mac_config(struct phylink_config *config, unsigned int mode,
 		const struct phylink_link_state *state);
 
 /**
  * mac_an_restart() - restart 802.3z BaseX autonegotiation
- * @ndev: a pointer to a &struct net_device for the MAC.
+ * @config: a pointer to a &struct phylink_config.
  */
-void mac_an_restart(struct net_device *ndev);
+void mac_an_restart(struct phylink_config *config);
 
 /**
  * mac_link_down() - take the link down
- * @ndev: a pointer to a &struct net_device for the MAC.
+ * @config: a pointer to a &struct phylink_config.
  * @mode: link autonegotiation mode
  * @interface: link &typedef phy_interface_t mode
  *
@@ -177,12 +192,12 @@ void mac_an_restart(struct net_device *ndev);
  * Energy Efficient Ethernet MAC configuration. Interface type
  * selection must be done in mac_config().
  */
-void mac_link_down(struct net_device *ndev, unsigned int mode,
+void mac_link_down(struct phylink_config *config, unsigned int mode,
 		   phy_interface_t interface);
 
 /**
  * mac_link_up() - allow the link to come up
- * @ndev: a pointer to a &struct net_device for the MAC.
+ * @config: a pointer to a &struct phylink_config.
  * @mode: link autonegotiation mode
  * @interface: link &typedef phy_interface_t mode
  * @phy: any attached phy
@@ -193,13 +208,14 @@ void mac_link_down(struct net_device *ndev, unsigned int mode,
  * phy_init_eee() and perform appropriate MAC configuration for EEE.
  * Interface type selection must be done in mac_config().
  */
-void mac_link_up(struct net_device *ndev, unsigned int mode,
+void mac_link_up(struct phylink_config *config, unsigned int mode,
 		 phy_interface_t interface,
 		 struct phy_device *phy);
 #endif
 
-struct phylink *phylink_create(struct net_device *, struct fwnode_handle *,
-	phy_interface_t iface, const struct phylink_mac_ops *ops);
+struct phylink *phylink_create(struct phylink_config *, struct fwnode_handle *,
+			       phy_interface_t iface,
+			       const struct phylink_mac_ops *ops);
 void phylink_destroy(struct phylink *);
 
 int phylink_connect_phy(struct phylink *, struct phy_device *);
diff --git a/include/net/dsa.h b/include/net/dsa.h
index 685294817712..a7f36219904f 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -22,6 +22,7 @@
 #include <linux/net_tstamp.h>
 #include <linux/phy.h>
 #include <linux/platform_data/dsa.h>
+#include <linux/phylink.h>
 #include <net/devlink.h>
 #include <net/switchdev.h>
 
@@ -193,6 +194,7 @@ struct dsa_port {
 	struct net_device	*bridge_dev;
 	struct devlink_port	devlink_port;
 	struct phylink		*pl;
+	struct phylink_config	pl_config;
 
 	struct work_struct	xmit_work;
 	struct sk_buff_head	xmit_queue;
diff --git a/net/dsa/slave.c b/net/dsa/slave.c
index 9892ca1f6859..48e017637d4f 100644
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -1164,11 +1164,11 @@ static struct device_type dsa_type = {
 	.name	= "dsa",
 };
 
-static void dsa_slave_phylink_validate(struct net_device *dev,
+static void dsa_slave_phylink_validate(struct phylink_config *config,
 				       unsigned long *supported,
 				       struct phylink_link_state *state)
 {
-	struct dsa_port *dp = dsa_slave_to_port(dev);
+	struct dsa_port *dp = container_of(config, struct dsa_port, pl_config);
 	struct dsa_switch *ds = dp->ds;
 
 	if (!ds->ops->phylink_validate)
@@ -1177,10 +1177,10 @@ static void dsa_slave_phylink_validate(struct net_device *dev,
 	ds->ops->phylink_validate(ds, dp->index, supported, state);
 }
 
-static int dsa_slave_phylink_mac_link_state(struct net_device *dev,
+static int dsa_slave_phylink_mac_link_state(struct phylink_config *config,
 					    struct phylink_link_state *state)
 {
-	struct dsa_port *dp = dsa_slave_to_port(dev);
+	struct dsa_port *dp = container_of(config, struct dsa_port, pl_config);
 	struct dsa_switch *ds = dp->ds;
 
 	/* Only called for SGMII and 802.3z */
@@ -1190,11 +1190,11 @@ static int dsa_slave_phylink_mac_link_state(struct net_device *dev,
 	return ds->ops->phylink_mac_link_state(ds, dp->index, state);
 }
 
-static void dsa_slave_phylink_mac_config(struct net_device *dev,
+static void dsa_slave_phylink_mac_config(struct phylink_config *config,
 					 unsigned int mode,
 					 const struct phylink_link_state *state)
 {
-	struct dsa_port *dp = dsa_slave_to_port(dev);
+	struct dsa_port *dp = container_of(config, struct dsa_port, pl_config);
 	struct dsa_switch *ds = dp->ds;
 
 	if (!ds->ops->phylink_mac_config)
@@ -1203,9 +1203,9 @@ static void dsa_slave_phylink_mac_config(struct net_device *dev,
 	ds->ops->phylink_mac_config(ds, dp->index, mode, state);
 }
 
-static void dsa_slave_phylink_mac_an_restart(struct net_device *dev)
+static void dsa_slave_phylink_mac_an_restart(struct phylink_config *config)
 {
-	struct dsa_port *dp = dsa_slave_to_port(dev);
+	struct dsa_port *dp = container_of(config, struct dsa_port, pl_config);
 	struct dsa_switch *ds = dp->ds;
 
 	if (!ds->ops->phylink_mac_an_restart)
@@ -1214,11 +1214,12 @@ static void dsa_slave_phylink_mac_an_restart(struct net_device *dev)
 	ds->ops->phylink_mac_an_restart(ds, dp->index);
 }
 
-static void dsa_slave_phylink_mac_link_down(struct net_device *dev,
+static void dsa_slave_phylink_mac_link_down(struct phylink_config *config,
 					    unsigned int mode,
 					    phy_interface_t interface)
 {
-	struct dsa_port *dp = dsa_slave_to_port(dev);
+	struct dsa_port *dp = container_of(config, struct dsa_port, pl_config);
+	struct net_device *dev = dp->slave;
 	struct dsa_switch *ds = dp->ds;
 
 	if (!ds->ops->phylink_mac_link_down) {
@@ -1230,12 +1231,13 @@ static void dsa_slave_phylink_mac_link_down(struct net_device *dev,
 	ds->ops->phylink_mac_link_down(ds, dp->index, mode, interface);
 }
 
-static void dsa_slave_phylink_mac_link_up(struct net_device *dev,
+static void dsa_slave_phylink_mac_link_up(struct phylink_config *config,
 					  unsigned int mode,
 					  phy_interface_t interface,
 					  struct phy_device *phydev)
 {
-	struct dsa_port *dp = dsa_slave_to_port(dev);
+	struct dsa_port *dp = container_of(config, struct dsa_port, pl_config);
+	struct net_device *dev = dp->slave;
 	struct dsa_switch *ds = dp->ds;
 
 	if (!ds->ops->phylink_mac_link_up) {
@@ -1303,7 +1305,10 @@ static int dsa_slave_phy_setup(struct net_device *slave_dev)
 	if (mode < 0)
 		mode = PHY_INTERFACE_MODE_NA;
 
-	dp->pl = phylink_create(slave_dev, of_fwnode_handle(port_dn), mode,
+	dp->pl_config.dev = &slave_dev->dev;
+	dp->pl_config.type = PHYLINK_NETDEV;
+
+	dp->pl = phylink_create(&dp->pl_config, of_fwnode_handle(port_dn), mode,
 				&dsa_slave_phylink_mac_ops);
 	if (IS_ERR(dp->pl)) {
 		netdev_err(slave_dev,
-- 
cgit v1.2.3


From 43de61959b999279bafb031c0c9bdf0f6cd1c501 Mon Sep 17 00:00:00 2001
From: Ioana Ciornei <ioana.ciornei@nxp.com>
Date: Tue, 28 May 2019 20:38:13 +0300
Subject: net: phylink: Add PHYLINK_DEV operation type

In the PHYLINK_DEV operation type, the PHYLINK infrastructure can work
without an attached net_device. For printing usecases, instead, a struct
device * should be passed to PHYLINK using the phylink_config structure.

Also, netif_carrier_* calls are guarded by the presence of a valid
net_device. When using the PHYLINK_DEV operation type, we cannot check
link status using the netif_carrier_ok() API so instead, keep an
internal state of the MAC and call mac_link_{down,up} only when the link
changed.

Signed-off-by: Ioana Ciornei <ioana.ciornei@nxp.com>
Signed-off-by: Vladimir Oltean <olteanv@gmail.com>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/phylink.c | 25 ++++++++++++++++++++-----
 include/linux/phylink.h   |  1 +
 2 files changed, 21 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/phy/phylink.c b/drivers/net/phy/phylink.c
index 5a283bf9d402..5f6120f3fa3f 100644
--- a/drivers/net/phy/phylink.c
+++ b/drivers/net/phy/phylink.c
@@ -42,6 +42,8 @@ struct phylink {
 	struct net_device *netdev;
 	const struct phylink_mac_ops *ops;
 	struct phylink_config *config;
+	struct device *dev;
+	unsigned int old_link_state:1;
 
 	unsigned long phylink_disable_state; /* bitmask of disables */
 	struct phy_device *phydev;
@@ -404,7 +406,8 @@ static void phylink_mac_link_up(struct phylink *pl,
 			     pl->phy_state.interface,
 			     pl->phydev);
 
-	netif_carrier_on(ndev);
+	if (ndev)
+		netif_carrier_on(ndev);
 
 	netdev_info(ndev,
 		    "Link is Up - %s/%s - flow control %s\n",
@@ -417,7 +420,8 @@ static void phylink_mac_link_down(struct phylink *pl)
 {
 	struct net_device *ndev = pl->netdev;
 
-	netif_carrier_off(ndev);
+	if (ndev)
+		netif_carrier_off(ndev);
 	pl->ops->mac_link_down(pl->config, pl->link_an_mode,
 			       pl->phy_state.interface);
 	netdev_info(ndev, "Link is Down\n");
@@ -428,6 +432,7 @@ static void phylink_resolve(struct work_struct *w)
 	struct phylink *pl = container_of(w, struct phylink, resolve);
 	struct phylink_link_state link_state;
 	struct net_device *ndev = pl->netdev;
+	int link_changed;
 
 	mutex_lock(&pl->state_mutex);
 	if (pl->phylink_disable_state) {
@@ -470,7 +475,13 @@ static void phylink_resolve(struct work_struct *w)
 		}
 	}
 
-	if (link_state.link != netif_carrier_ok(ndev)) {
+	if (pl->netdev)
+		link_changed = (link_state.link != netif_carrier_ok(ndev));
+	else
+		link_changed = (link_state.link != pl->old_link_state);
+
+	if (link_changed) {
+		pl->old_link_state = link_state.link;
 		if (!link_state.link)
 			phylink_mac_link_down(pl);
 		else
@@ -571,6 +582,8 @@ struct phylink *phylink_create(struct phylink_config *config,
 	pl->config = config;
 	if (config->type == PHYLINK_NETDEV) {
 		pl->netdev = to_net_dev(config->dev);
+	} else if (config->type == PHYLINK_DEV) {
+		pl->dev = config->dev;
 	} else {
 		kfree(pl);
 		return ERR_PTR(-EINVAL);
@@ -910,7 +923,8 @@ void phylink_start(struct phylink *pl)
 		    phy_modes(pl->link_config.interface));
 
 	/* Always set the carrier off */
-	netif_carrier_off(pl->netdev);
+	if (pl->netdev)
+		netif_carrier_off(pl->netdev);
 
 	/* Apply the link configuration to the MAC when starting. This allows
 	 * a fixed-link to start with the correct parameters, and also
@@ -1255,7 +1269,8 @@ int phylink_ethtool_set_pauseparam(struct phylink *pl,
 		switch (pl->link_an_mode) {
 		case MLO_AN_PHY:
 			/* Silently mark the carrier down, and then trigger a resolve */
-			netif_carrier_off(pl->netdev);
+			if (pl->netdev)
+				netif_carrier_off(pl->netdev);
 			phylink_run_resolve(pl);
 			break;
 
diff --git a/include/linux/phylink.h b/include/linux/phylink.h
index 67f35f07ac4b..0f6f65bb9d44 100644
--- a/include/linux/phylink.h
+++ b/include/linux/phylink.h
@@ -56,6 +56,7 @@ struct phylink_link_state {
 
 enum phylink_op_type {
 	PHYLINK_NETDEV = 0,
+	PHYLINK_DEV,
 };
 
 /**
-- 
cgit v1.2.3


From 84ede58dfcd1db6f04f71dd3ccd5328271b346da Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Mon, 20 May 2019 09:54:46 -0700
Subject: crypto: hash - remove CRYPTO_ALG_TYPE_DIGEST

Remove the unnecessary constant CRYPTO_ALG_TYPE_DIGEST, which has the
same value as CRYPTO_ALG_TYPE_HASH.

Signed-off-by: Eric Biggers <ebiggers@google.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 Documentation/crypto/architecture.rst | 4 +---
 crypto/cryptd.c                       | 2 +-
 include/linux/crypto.h                | 1 -
 3 files changed, 2 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/crypto/architecture.rst b/Documentation/crypto/architecture.rst
index ee8ff0762d7f..3eae1ae7f798 100644
--- a/Documentation/crypto/architecture.rst
+++ b/Documentation/crypto/architecture.rst
@@ -208,9 +208,7 @@ the aforementioned cipher types:
 -  CRYPTO_ALG_TYPE_KPP Key-agreement Protocol Primitive (KPP) such as
    an ECDH or DH implementation
 
--  CRYPTO_ALG_TYPE_DIGEST Raw message digest
-
--  CRYPTO_ALG_TYPE_HASH Alias for CRYPTO_ALG_TYPE_DIGEST
+-  CRYPTO_ALG_TYPE_HASH Raw message digest
 
 -  CRYPTO_ALG_TYPE_SHASH Synchronous multi-block hash
 
diff --git a/crypto/cryptd.c b/crypto/cryptd.c
index 1bf777b76512..c34d10309b1b 100644
--- a/crypto/cryptd.c
+++ b/crypto/cryptd.c
@@ -925,7 +925,7 @@ static int cryptd_create(struct crypto_template *tmpl, struct rtattr **tb)
 	switch (algt->type & algt->mask & CRYPTO_ALG_TYPE_MASK) {
 	case CRYPTO_ALG_TYPE_BLKCIPHER:
 		return cryptd_create_skcipher(tmpl, tb, &queue);
-	case CRYPTO_ALG_TYPE_DIGEST:
+	case CRYPTO_ALG_TYPE_HASH:
 		return cryptd_create_hash(tmpl, tb, &queue);
 	case CRYPTO_ALG_TYPE_AEAD:
 		return cryptd_create_aead(tmpl, tb, &queue);
diff --git a/include/linux/crypto.h b/include/linux/crypto.h
index f2565a103158..311237b1dab0 100644
--- a/include/linux/crypto.h
+++ b/include/linux/crypto.h
@@ -54,7 +54,6 @@
 #define CRYPTO_ALG_TYPE_SCOMPRESS	0x0000000b
 #define CRYPTO_ALG_TYPE_RNG		0x0000000c
 #define CRYPTO_ALG_TYPE_AKCIPHER	0x0000000d
-#define CRYPTO_ALG_TYPE_DIGEST		0x0000000e
 #define CRYPTO_ALG_TYPE_HASH		0x0000000e
 #define CRYPTO_ALG_TYPE_SHASH		0x0000000e
 #define CRYPTO_ALG_TYPE_AHASH		0x0000000f
-- 
cgit v1.2.3


From 1e91a2e5d827e643cbabad66d133f155a7fcb0de Mon Sep 17 00:00:00 2001
From: Ruslan Babayev <ruslan@babayev.com>
Date: Tue, 28 May 2019 16:02:32 -0700
Subject: i2c: acpi: export i2c_acpi_find_adapter_by_handle

This allows drivers to lookup i2c adapters on ACPI based systems similar to
of_get_i2c_adapter_by_node() with DT based systems.

Signed-off-by: Ruslan Babayev <ruslan@babayev.com>
Cc: xe-linux-external@cisco.com
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/i2c/i2c-core-acpi.c | 3 ++-
 include/linux/i2c.h         | 6 ++++++
 2 files changed, 8 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/i2c/i2c-core-acpi.c b/drivers/i2c/i2c-core-acpi.c
index 272800692088..964687534754 100644
--- a/drivers/i2c/i2c-core-acpi.c
+++ b/drivers/i2c/i2c-core-acpi.c
@@ -337,7 +337,7 @@ static int i2c_acpi_find_match_device(struct device *dev, void *data)
 	return ACPI_COMPANION(dev) == data;
 }
 
-static struct i2c_adapter *i2c_acpi_find_adapter_by_handle(acpi_handle handle)
+struct i2c_adapter *i2c_acpi_find_adapter_by_handle(acpi_handle handle)
 {
 	struct device *dev;
 
@@ -345,6 +345,7 @@ static struct i2c_adapter *i2c_acpi_find_adapter_by_handle(acpi_handle handle)
 			      i2c_acpi_find_match_adapter);
 	return dev ? i2c_verify_adapter(dev) : NULL;
 }
+EXPORT_SYMBOL_GPL(i2c_acpi_find_adapter_by_handle);
 
 static struct i2c_client *i2c_acpi_find_client_by_adev(struct acpi_device *adev)
 {
diff --git a/include/linux/i2c.h b/include/linux/i2c.h
index 1308126fc384..e982b8913b73 100644
--- a/include/linux/i2c.h
+++ b/include/linux/i2c.h
@@ -14,6 +14,7 @@
 #ifndef _LINUX_I2C_H
 #define _LINUX_I2C_H
 
+#include <linux/acpi.h>		/* for acpi_handle */
 #include <linux/mod_devicetable.h>
 #include <linux/device.h>	/* for struct device */
 #include <linux/sched.h>	/* for completion */
@@ -981,6 +982,7 @@ bool i2c_acpi_get_i2c_resource(struct acpi_resource *ares,
 u32 i2c_acpi_find_bus_speed(struct device *dev);
 struct i2c_client *i2c_acpi_new_device(struct device *dev, int index,
 				       struct i2c_board_info *info);
+struct i2c_adapter *i2c_acpi_find_adapter_by_handle(acpi_handle handle);
 #else
 static inline bool i2c_acpi_get_i2c_resource(struct acpi_resource *ares,
 					     struct acpi_resource_i2c_serialbus **i2c)
@@ -996,6 +998,10 @@ static inline struct i2c_client *i2c_acpi_new_device(struct device *dev,
 {
 	return NULL;
 }
+static inline struct i2c_adapter *i2c_acpi_find_adapter_by_handle(acpi_handle handle)
+{
+	return NULL;
+}
 #endif /* CONFIG_ACPI */
 
 #endif /* _LINUX_I2C_H */
-- 
cgit v1.2.3


From 764dd163ac922f8683b5bcd3007251ce7b26cd33 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Wed, 29 May 2019 13:25:38 +0200
Subject: netfilter: nf_conntrack_bridge: add support for IPv6

br_defrag() and br_fragment() indirections are added in case that IPv6
support comes as a module, to avoid pulling unnecessary dependencies in.

The new fraglist iterator and fragment transformer APIs are used to
implement the refragmentation code.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netfilter_ipv6.h             |  50 ++++++++++++
 net/bridge/netfilter/nf_conntrack_bridge.c |  59 +++++++++++++-
 net/ipv6/netfilter.c                       | 123 +++++++++++++++++++++++++++++
 3 files changed, 230 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter_ipv6.h b/include/linux/netfilter_ipv6.h
index 12113e502656..a21b8c9623ee 100644
--- a/include/linux/netfilter_ipv6.h
+++ b/include/linux/netfilter_ipv6.h
@@ -19,6 +19,7 @@ struct ip6_rt_info {
 };
 
 struct nf_queue_entry;
+struct nf_ct_bridge_frag_data;
 
 /*
  * Hook functions for ipv6 to allow xt_* modules to be built-in even
@@ -39,6 +40,15 @@ struct nf_ipv6_ops {
 	int (*fragment)(struct net *net, struct sock *sk, struct sk_buff *skb,
 			int (*output)(struct net *, struct sock *, struct sk_buff *));
 	int (*reroute)(struct sk_buff *skb, const struct nf_queue_entry *entry);
+#if IS_MODULE(CONFIG_IPV6)
+	int (*br_defrag)(struct net *net, struct sk_buff *skb, u32 user);
+	int (*br_fragment)(struct net *net, struct sock *sk,
+			   struct sk_buff *skb,
+			   struct nf_ct_bridge_frag_data *data,
+			   int (*output)(struct net *, struct sock *sk,
+					 const struct nf_ct_bridge_frag_data *data,
+					 struct sk_buff *));
+#endif
 };
 
 #ifdef CONFIG_NETFILTER
@@ -86,6 +96,46 @@ static inline int nf_ip6_route(struct net *net, struct dst_entry **dst,
 #endif
 }
 
+static inline int nf_ipv6_br_defrag(struct net *net, struct sk_buff *skb,
+				    u32 user)
+{
+#if IS_MODULE(CONFIG_IPV6)
+	const struct nf_ipv6_ops *v6_ops = nf_get_ipv6_ops();
+
+	if (!v6_ops)
+		return 1;
+
+	return v6_ops->br_defrag(net, skb, user);
+#else
+	return nf_ct_frag6_gather(net, skb, user);
+#endif
+}
+
+int br_ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
+		    struct nf_ct_bridge_frag_data *data,
+		    int (*output)(struct net *, struct sock *sk,
+				  const struct nf_ct_bridge_frag_data *data,
+				  struct sk_buff *));
+
+static inline int nf_br_ip6_fragment(struct net *net, struct sock *sk,
+				     struct sk_buff *skb,
+				     struct nf_ct_bridge_frag_data *data,
+				     int (*output)(struct net *, struct sock *sk,
+						   const struct nf_ct_bridge_frag_data *data,
+						   struct sk_buff *))
+{
+#if IS_MODULE(CONFIG_IPV6)
+	const struct nf_ipv6_ops *v6_ops = nf_get_ipv6_ops();
+
+	if (!v6_ops)
+		return 1;
+
+	return v6_ops->br_fragment(net, sk, skb, data, output);
+#else
+	return br_ip6_fragment(net, sk, skb, data, output);
+#endif
+}
+
 int ip6_route_me_harder(struct net *net, struct sk_buff *skb);
 
 static inline int nf_ip6_route_me_harder(struct net *net, struct sk_buff *skb)
diff --git a/net/bridge/netfilter/nf_conntrack_bridge.c b/net/bridge/netfilter/nf_conntrack_bridge.c
index 2571528ed582..b675cd7c1a82 100644
--- a/net/bridge/netfilter/nf_conntrack_bridge.c
+++ b/net/bridge/netfilter/nf_conntrack_bridge.c
@@ -163,6 +163,31 @@ static unsigned int nf_ct_br_defrag4(struct sk_buff *skb,
 	return NF_STOLEN;
 }
 
+static unsigned int nf_ct_br_defrag6(struct sk_buff *skb,
+				     const struct nf_hook_state *state)
+{
+	u16 zone_id = NF_CT_DEFAULT_ZONE_ID;
+	enum ip_conntrack_info ctinfo;
+	struct br_input_skb_cb cb;
+	const struct nf_conn *ct;
+	int err;
+
+	ct = nf_ct_get(skb, &ctinfo);
+	if (ct)
+		zone_id = nf_ct_zone_id(nf_ct_zone(ct), CTINFO2DIR(ctinfo));
+
+	br_skb_cb_save(skb, &cb, sizeof(struct inet6_skb_parm));
+
+	err = nf_ipv6_br_defrag(state->net, skb,
+				IP_DEFRAG_CONNTRACK_BRIDGE_IN + zone_id);
+	/* queued */
+	if (err == -EINPROGRESS)
+		return NF_STOLEN;
+
+	br_skb_cb_restore(skb, &cb, IP6CB(skb)->frag_max_size);
+	return err == 0 ? NF_ACCEPT : NF_DROP;
+}
+
 static int nf_ct_br_ip_check(const struct sk_buff *skb)
 {
 	const struct iphdr *iph;
@@ -177,6 +202,23 @@ static int nf_ct_br_ip_check(const struct sk_buff *skb)
 	len = ntohs(iph->tot_len);
 	if (skb->len < nhoff + len ||
 	    len < (iph->ihl * 4))
+                return -1;
+
+	return 0;
+}
+
+static int nf_ct_br_ipv6_check(const struct sk_buff *skb)
+{
+	const struct ipv6hdr *hdr;
+	int nhoff, len;
+
+	nhoff = skb_network_offset(skb);
+	hdr = ipv6_hdr(skb);
+	if (hdr->version != 6)
+		return -1;
+
+	len = ntohs(hdr->payload_len) + sizeof(struct ipv6hdr) + nhoff;
+	if (skb->len < len)
 		return -1;
 
 	return 0;
@@ -212,7 +254,19 @@ static unsigned int nf_ct_bridge_pre(void *priv, struct sk_buff *skb,
 		ret = nf_ct_br_defrag4(skb, &bridge_state);
 		break;
 	case htons(ETH_P_IPV6):
-		/* fall through */
+		if (!pskb_may_pull(skb, sizeof(struct ipv6hdr)))
+			return NF_ACCEPT;
+
+		len = sizeof(struct ipv6hdr) + ntohs(ipv6_hdr(skb)->payload_len);
+		if (pskb_trim_rcsum(skb, len))
+			return NF_ACCEPT;
+
+		if (nf_ct_br_ipv6_check(skb))
+			return NF_ACCEPT;
+
+		bridge_state.pf = NFPROTO_IPV6;
+		ret = nf_ct_br_defrag6(skb, &bridge_state);
+		break;
 	default:
 		nf_ct_set(skb, NULL, IP_CT_UNTRACKED);
 		return NF_ACCEPT;
@@ -254,7 +308,8 @@ nf_ct_bridge_refrag(struct sk_buff *skb, const struct nf_hook_state *state,
 		nf_br_ip_fragment(state->net, state->sk, skb, &data, output);
 		break;
 	case htons(ETH_P_IPV6):
-		return NF_ACCEPT;
+		nf_br_ip6_fragment(state->net, state->sk, skb, &data, output);
+		break;
 	default:
 		WARN_ON_ONCE(1);
 		return NF_DROP;
diff --git a/net/ipv6/netfilter.c b/net/ipv6/netfilter.c
index 1240ccd57f39..c6665382acb5 100644
--- a/net/ipv6/netfilter.c
+++ b/net/ipv6/netfilter.c
@@ -16,6 +16,9 @@
 #include <net/ip6_route.h>
 #include <net/xfrm.h>
 #include <net/netfilter/nf_queue.h>
+#include <net/netfilter/nf_conntrack_bridge.h>
+#include <net/netfilter/ipv6/nf_defrag_ipv6.h>
+#include "../bridge/br_private.h"
 
 int ip6_route_me_harder(struct net *net, struct sk_buff *skb)
 {
@@ -109,6 +112,122 @@ int __nf_ip6_route(struct net *net, struct dst_entry **dst,
 }
 EXPORT_SYMBOL_GPL(__nf_ip6_route);
 
+int br_ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
+		    struct nf_ct_bridge_frag_data *data,
+		    int (*output)(struct net *, struct sock *sk,
+				  const struct nf_ct_bridge_frag_data *data,
+				  struct sk_buff *))
+{
+	int frag_max_size = BR_INPUT_SKB_CB(skb)->frag_max_size;
+	struct ip6_frag_state state;
+	u8 *prevhdr, nexthdr = 0;
+	unsigned int mtu, hlen;
+	int hroom, err = 0;
+	__be32 frag_id;
+
+	err = ip6_find_1stfragopt(skb, &prevhdr);
+	if (err < 0)
+		goto blackhole;
+	hlen = err;
+	nexthdr = *prevhdr;
+
+	mtu = skb->dev->mtu;
+	if (frag_max_size > mtu ||
+	    frag_max_size < IPV6_MIN_MTU)
+		goto blackhole;
+
+	mtu = frag_max_size;
+	if (mtu < hlen + sizeof(struct frag_hdr) + 8)
+		goto blackhole;
+	mtu -= hlen + sizeof(struct frag_hdr);
+
+	frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
+				    &ipv6_hdr(skb)->saddr);
+
+	if (skb->ip_summed == CHECKSUM_PARTIAL &&
+	    (err = skb_checksum_help(skb)))
+		goto blackhole;
+
+	hroom = LL_RESERVED_SPACE(skb->dev);
+	if (skb_has_frag_list(skb)) {
+		unsigned int first_len = skb_pagelen(skb);
+		struct ip6_fraglist_iter iter;
+		struct sk_buff *frag2;
+
+		if (first_len - hlen > mtu ||
+		    skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
+			goto blackhole;
+
+		if (skb_cloned(skb))
+			goto slow_path;
+
+		skb_walk_frags(skb, frag2) {
+			if (frag2->len > mtu ||
+			    skb_headroom(frag2) < (hlen + hroom + sizeof(struct frag_hdr)))
+				goto blackhole;
+
+			/* Partially cloned skb? */
+			if (skb_shared(frag2))
+				goto slow_path;
+		}
+
+		err = ip6_fraglist_init(skb, hlen, prevhdr, nexthdr, frag_id,
+					&iter);
+		if (err < 0)
+			goto blackhole;
+
+		for (;;) {
+			/* Prepare header of the next frame,
+			 * before previous one went down.
+			 */
+			if (iter.frag)
+				ip6_fraglist_prepare(skb, &iter);
+
+			err = output(net, sk, data, skb);
+			if (err || !iter.frag)
+				break;
+
+			skb = ip6_fraglist_next(&iter);
+		}
+
+		kfree(iter.tmp_hdr);
+		if (!err)
+			return 0;
+
+		kfree_skb_list(iter.frag_list);
+		return err;
+	}
+slow_path:
+	/* This is a linearized skbuff, the original geometry is lost for us.
+	 * This may also be a clone skbuff, we could preserve the geometry for
+	 * the copies but probably not worth the effort.
+	 */
+	ip6_frag_init(skb, hlen, mtu, skb->dev->needed_tailroom,
+		      LL_RESERVED_SPACE(skb->dev), prevhdr, nexthdr, frag_id,
+		      &state);
+
+	while (state.left > 0) {
+		struct sk_buff *skb2;
+
+		skb2 = ip6_frag_next(skb, &state);
+		if (IS_ERR(skb2)) {
+			err = PTR_ERR(skb2);
+			goto blackhole;
+		}
+
+		err = output(net, sk, data, skb2);
+		if (err)
+			goto blackhole;
+	}
+	consume_skb(skb);
+	return err;
+
+blackhole:
+	kfree_skb(skb);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(br_ip6_fragment);
+
 static const struct nf_ipv6_ops ipv6ops = {
 #if IS_MODULE(CONFIG_IPV6)
 	.chk_addr		= ipv6_chk_addr,
@@ -119,6 +238,10 @@ static const struct nf_ipv6_ops ipv6ops = {
 	.route_input		= ip6_route_input,
 	.fragment		= ip6_fragment,
 	.reroute		= nf_ip6_reroute,
+#if IS_MODULE(CONFIG_NF_CONNTRACK_BRIDGE)
+	.br_defrag		= nf_ct_frag6_gather,
+	.br_fragment		= br_ip6_fragment,
+#endif
 };
 
 int __init ipv6_netfilter_init(void)
-- 
cgit v1.2.3


From ed0ac5c7ec3763e3261c48e3c5d4b7528b60fd85 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Mon, 20 May 2019 21:51:50 +0100
Subject: keys: Add a keyctl to move a key between keyrings

Add a keyctl to atomically move a link to a key from one keyring to
another.  The key must exist in "from" keyring and a flag can be given to
cause the operation to fail if there's a matching key already in the "to"
keyring.

This can be done with:

	keyctl(KEYCTL_MOVE,
	       key_serial_t key,
	       key_serial_t from_keyring,
	       key_serial_t to_keyring,
	       unsigned int flags);

The key being moved must grant Link permission and both keyrings must grant
Write permission.

flags should be 0 or KEYCTL_MOVE_EXCL, with the latter preventing
displacement of a matching key from the "to" keyring.

Signed-off-by: David Howells <dhowells@redhat.com>
---
 Documentation/security/keys/core.rst |  21 +++++++
 include/linux/key.h                  |   5 ++
 include/uapi/linux/keyctl.h          |   3 +
 security/keys/compat.c               |   3 +
 security/keys/internal.h             |   3 +
 security/keys/keyctl.c               |  52 +++++++++++++++++
 security/keys/keyring.c              | 108 +++++++++++++++++++++++++++++++++++
 7 files changed, 195 insertions(+)

(limited to 'include/linux')

diff --git a/Documentation/security/keys/core.rst b/Documentation/security/keys/core.rst
index 9521c4207f01..823d29bf44f7 100644
--- a/Documentation/security/keys/core.rst
+++ b/Documentation/security/keys/core.rst
@@ -577,6 +577,27 @@ The keyctl syscall functions are:
      added.
 
 
+  *  Move a key from one keyring to another::
+
+	long keyctl(KEYCTL_MOVE,
+		    key_serial_t id,
+		    key_serial_t from_ring_id,
+		    key_serial_t to_ring_id,
+		    unsigned int flags);
+
+     Move the key specified by "id" from the keyring specified by
+     "from_ring_id" to the keyring specified by "to_ring_id".  If the two
+     keyrings are the same, nothing is done.
+
+     "flags" can have KEYCTL_MOVE_EXCL set in it to cause the operation to fail
+     with EEXIST if a matching key exists in the destination keyring, otherwise
+     such a key will be replaced.
+
+     A process must have link permission on the key for this function to be
+     successful and write permission on both keyrings.  Any errors that can
+     occur from KEYCTL_LINK also apply on the destination keyring here.
+
+
   *  Unlink a key or keyring from another keyring::
 
 	long keyctl(KEYCTL_UNLINK, key_serial_t keyring, key_serial_t key);
diff --git a/include/linux/key.h b/include/linux/key.h
index 1f09aad1c98c..612e1cf84049 100644
--- a/include/linux/key.h
+++ b/include/linux/key.h
@@ -310,6 +310,11 @@ extern int key_update(key_ref_t key,
 extern int key_link(struct key *keyring,
 		    struct key *key);
 
+extern int key_move(struct key *key,
+		    struct key *from_keyring,
+		    struct key *to_keyring,
+		    unsigned int flags);
+
 extern int key_unlink(struct key *keyring,
 		      struct key *key);
 
diff --git a/include/uapi/linux/keyctl.h b/include/uapi/linux/keyctl.h
index f45ee0f69c0c..fd9fb11b312b 100644
--- a/include/uapi/linux/keyctl.h
+++ b/include/uapi/linux/keyctl.h
@@ -67,6 +67,7 @@
 #define KEYCTL_PKEY_SIGN		27	/* Create a public key signature */
 #define KEYCTL_PKEY_VERIFY		28	/* Verify a public key signature */
 #define KEYCTL_RESTRICT_KEYRING		29	/* Restrict keys allowed to link to a keyring */
+#define KEYCTL_MOVE			30	/* Move keys between keyrings */
 
 /* keyctl structures */
 struct keyctl_dh_params {
@@ -112,4 +113,6 @@ struct keyctl_pkey_params {
 	__u32		__spare[7];
 };
 
+#define KEYCTL_MOVE_EXCL	0x00000001 /* Do not displace from the to-keyring */
+
 #endif /*  _LINUX_KEYCTL_H */
diff --git a/security/keys/compat.c b/security/keys/compat.c
index 9482df601dc3..b326bc4f84d7 100644
--- a/security/keys/compat.c
+++ b/security/keys/compat.c
@@ -159,6 +159,9 @@ COMPAT_SYSCALL_DEFINE5(keyctl, u32, option,
 		return keyctl_pkey_verify(compat_ptr(arg2), compat_ptr(arg3),
 					  compat_ptr(arg4), compat_ptr(arg5));
 
+	case KEYCTL_MOVE:
+		return keyctl_keyring_move(arg2, arg3, arg4, arg5);
+
 	default:
 		return -EOPNOTSUPP;
 	}
diff --git a/security/keys/internal.h b/security/keys/internal.h
index 25cdd0cbdc06..b54a58c025ae 100644
--- a/security/keys/internal.h
+++ b/security/keys/internal.h
@@ -95,6 +95,8 @@ extern void key_type_put(struct key_type *ktype);
 
 extern int __key_link_lock(struct key *keyring,
 			   const struct keyring_index_key *index_key);
+extern int __key_move_lock(struct key *l_keyring, struct key *u_keyring,
+			   const struct keyring_index_key *index_key);
 extern int __key_link_begin(struct key *keyring,
 			    const struct keyring_index_key *index_key,
 			    struct assoc_array_edit **_edit);
@@ -217,6 +219,7 @@ extern long keyctl_update_key(key_serial_t, const void __user *, size_t);
 extern long keyctl_revoke_key(key_serial_t);
 extern long keyctl_keyring_clear(key_serial_t);
 extern long keyctl_keyring_link(key_serial_t, key_serial_t);
+extern long keyctl_keyring_move(key_serial_t, key_serial_t, key_serial_t, unsigned int);
 extern long keyctl_keyring_unlink(key_serial_t, key_serial_t);
 extern long keyctl_describe_key(key_serial_t, char __user *, size_t);
 extern long keyctl_keyring_search(key_serial_t, const char __user *,
diff --git a/security/keys/keyctl.c b/security/keys/keyctl.c
index 0f947bcbad46..bbfe7d92d41c 100644
--- a/security/keys/keyctl.c
+++ b/security/keys/keyctl.c
@@ -572,6 +572,52 @@ error:
 	return ret;
 }
 
+/*
+ * Move a link to a key from one keyring to another, displacing any matching
+ * key from the destination keyring.
+ *
+ * The key must grant the caller Link permission and both keyrings must grant
+ * the caller Write permission.  There must also be a link in the from keyring
+ * to the key.  If both keyrings are the same, nothing is done.
+ *
+ * If successful, 0 will be returned.
+ */
+long keyctl_keyring_move(key_serial_t id, key_serial_t from_ringid,
+			 key_serial_t to_ringid, unsigned int flags)
+{
+	key_ref_t key_ref, from_ref, to_ref;
+	long ret;
+
+	if (flags & ~KEYCTL_MOVE_EXCL)
+		return -EINVAL;
+
+	key_ref = lookup_user_key(id, KEY_LOOKUP_CREATE, KEY_NEED_LINK);
+	if (IS_ERR(key_ref))
+		return PTR_ERR(key_ref);
+
+	from_ref = lookup_user_key(from_ringid, 0, KEY_NEED_WRITE);
+	if (IS_ERR(from_ref)) {
+		ret = PTR_ERR(from_ref);
+		goto error2;
+	}
+
+	to_ref = lookup_user_key(to_ringid, KEY_LOOKUP_CREATE, KEY_NEED_WRITE);
+	if (IS_ERR(to_ref)) {
+		ret = PTR_ERR(to_ref);
+		goto error3;
+	}
+
+	ret = key_move(key_ref_to_ptr(key_ref), key_ref_to_ptr(from_ref),
+		       key_ref_to_ptr(to_ref), flags);
+
+	key_ref_put(to_ref);
+error3:
+	key_ref_put(from_ref);
+error2:
+	key_ref_put(key_ref);
+	return ret;
+}
+
 /*
  * Return a description of a key to userspace.
  *
@@ -1772,6 +1818,12 @@ SYSCALL_DEFINE5(keyctl, int, option, unsigned long, arg2, unsigned long, arg3,
 			(const void __user *)arg4,
 			(const void __user *)arg5);
 
+	case KEYCTL_MOVE:
+		return keyctl_keyring_move((key_serial_t)arg2,
+					   (key_serial_t)arg3,
+					   (key_serial_t)arg4,
+					   (unsigned int)arg5);
+
 	default:
 		return -EOPNOTSUPP;
 	}
diff --git a/security/keys/keyring.c b/security/keys/keyring.c
index 12acad3db6cf..67066bb58b83 100644
--- a/security/keys/keyring.c
+++ b/security/keys/keyring.c
@@ -1221,6 +1221,40 @@ int __key_link_lock(struct key *keyring,
 	return 0;
 }
 
+/*
+ * Lock keyrings for move (link/unlink combination).
+ */
+int __key_move_lock(struct key *l_keyring, struct key *u_keyring,
+		    const struct keyring_index_key *index_key)
+	__acquires(&l_keyring->sem)
+	__acquires(&u_keyring->sem)
+	__acquires(&keyring_serialise_link_lock)
+{
+	if (l_keyring->type != &key_type_keyring ||
+	    u_keyring->type != &key_type_keyring)
+		return -ENOTDIR;
+
+	/* We have to be very careful here to take the keyring locks in the
+	 * right order, lest we open ourselves to deadlocking against another
+	 * move operation.
+	 */
+	if (l_keyring < u_keyring) {
+		down_write(&l_keyring->sem);
+		down_write_nested(&u_keyring->sem, 1);
+	} else {
+		down_write(&u_keyring->sem);
+		down_write_nested(&l_keyring->sem, 1);
+	}
+
+	/* Serialise link/link calls to prevent parallel calls causing a cycle
+	 * when linking two keyrings in opposite orders.
+	 */
+	if (index_key->type == &key_type_keyring)
+		mutex_lock(&keyring_serialise_link_lock);
+
+	return 0;
+}
+
 /*
  * Preallocate memory so that a key can be linked into to a keyring.
  */
@@ -1494,6 +1528,80 @@ int key_unlink(struct key *keyring, struct key *key)
 }
 EXPORT_SYMBOL(key_unlink);
 
+/**
+ * key_move - Move a key from one keyring to another
+ * @key: The key to move
+ * @from_keyring: The keyring to remove the link from.
+ * @to_keyring: The keyring to make the link in.
+ * @flags: Qualifying flags, such as KEYCTL_MOVE_EXCL.
+ *
+ * Make a link in @to_keyring to a key, such that the keyring holds a reference
+ * on that key and the key can potentially be found by searching that keyring
+ * whilst simultaneously removing a link to the key from @from_keyring.
+ *
+ * This function will write-lock both keyring's semaphores and will consume
+ * some of the user's key data quota to hold the link on @to_keyring.
+ *
+ * Returns 0 if successful, -ENOTDIR if either keyring isn't a keyring,
+ * -EKEYREVOKED if either keyring has been revoked, -ENFILE if the second
+ * keyring is full, -EDQUOT if there is insufficient key data quota remaining
+ * to add another link or -ENOMEM if there's insufficient memory.  If
+ * KEYCTL_MOVE_EXCL is set, then -EEXIST will be returned if there's already a
+ * matching key in @to_keyring.
+ *
+ * It is assumed that the caller has checked that it is permitted for a link to
+ * be made (the keyring should have Write permission and the key Link
+ * permission).
+ */
+int key_move(struct key *key,
+	     struct key *from_keyring,
+	     struct key *to_keyring,
+	     unsigned int flags)
+{
+	struct assoc_array_edit *from_edit = NULL, *to_edit = NULL;
+	int ret;
+
+	kenter("%d,%d,%d", key->serial, from_keyring->serial, to_keyring->serial);
+
+	if (from_keyring == to_keyring)
+		return 0;
+
+	key_check(key);
+	key_check(from_keyring);
+	key_check(to_keyring);
+
+	ret = __key_move_lock(from_keyring, to_keyring, &key->index_key);
+	if (ret < 0)
+		goto out;
+	ret = __key_unlink_begin(from_keyring, key, &from_edit);
+	if (ret < 0)
+		goto error;
+	ret = __key_link_begin(to_keyring, &key->index_key, &to_edit);
+	if (ret < 0)
+		goto error;
+
+	ret = -EEXIST;
+	if (to_edit->dead_leaf && (flags & KEYCTL_MOVE_EXCL))
+		goto error;
+
+	ret = __key_link_check_restriction(to_keyring, key);
+	if (ret < 0)
+		goto error;
+	ret = __key_link_check_live_key(to_keyring, key);
+	if (ret < 0)
+		goto error;
+
+	__key_unlink(from_keyring, key, &from_edit);
+	__key_link(key, &to_edit);
+error:
+	__key_link_end(to_keyring, &key->index_key, to_edit);
+	__key_unlink_end(from_keyring, key, from_edit);
+out:
+	kleave(" = %d", ret);
+	return ret;
+}
+EXPORT_SYMBOL(key_move);
+
 /**
  * keyring_clear - Clear a keyring
  * @keyring: The keyring to clear.
-- 
cgit v1.2.3


From 07b0928918c694c845a387cc16256a8b63ced4fc Mon Sep 17 00:00:00 2001
From: Heiner Kallweit <hkallweit1@gmail.com>
Date: Thu, 30 May 2019 15:09:15 +0200
Subject: net: phy: enable interrupts when PHY is attached already

This patch is a step towards allowing PHY drivers to handle more
interrupt sources than just link change. E.g. several PHYs have
built-in temperature monitoring and can raise an interrupt if a
temperature threshold is exceeded. We may be interested in such
interrupts also if the phylib state machine isn't started.
Therefore move enabling interrupts to phy_request_interrupt().

v2:
- patch added to series

Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/phy.c        | 36 ++++++++++++++++++++++--------------
 drivers/net/phy/phy_device.c |  2 +-
 include/linux/phy.h          |  1 +
 3 files changed, 24 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c
index e8885429293a..4ba71dc3aee7 100644
--- a/drivers/net/phy/phy.c
+++ b/drivers/net/phy/phy.c
@@ -799,10 +799,10 @@ static int phy_enable_interrupts(struct phy_device *phydev)
 }
 
 /**
- * phy_request_interrupt - request interrupt for a PHY device
+ * phy_request_interrupt - request and enable interrupt for a PHY device
  * @phydev: target phy_device struct
  *
- * Description: Request the interrupt for the given PHY.
+ * Description: Request and enable the interrupt for the given PHY.
  *   If this fails, then we set irq to PHY_POLL.
  *   This should only be called with a valid IRQ number.
  */
@@ -817,10 +817,30 @@ void phy_request_interrupt(struct phy_device *phydev)
 		phydev_warn(phydev, "Error %d requesting IRQ %d, falling back to polling\n",
 			    err, phydev->irq);
 		phydev->irq = PHY_POLL;
+	} else {
+		if (phy_enable_interrupts(phydev)) {
+			phydev_warn(phydev, "Can't enable interrupt, falling back to polling\n");
+			phy_free_interrupt(phydev);
+			phydev->irq = PHY_POLL;
+		}
 	}
 }
 EXPORT_SYMBOL(phy_request_interrupt);
 
+/**
+ * phy_free_interrupt - disable and free interrupt for a PHY device
+ * @phydev: target phy_device struct
+ *
+ * Description: Disable and free the interrupt for the given PHY.
+ *   This should only be called with a valid IRQ number.
+ */
+void phy_free_interrupt(struct phy_device *phydev)
+{
+	phy_disable_interrupts(phydev);
+	free_irq(phydev->irq, phydev);
+}
+EXPORT_SYMBOL(phy_free_interrupt);
+
 /**
  * phy_stop - Bring down the PHY link, and stop checking the status
  * @phydev: target phy_device struct
@@ -835,9 +855,6 @@ void phy_stop(struct phy_device *phydev)
 
 	mutex_lock(&phydev->lock);
 
-	if (phy_interrupt_is_valid(phydev))
-		phy_disable_interrupts(phydev);
-
 	phydev->state = PHY_HALTED;
 
 	mutex_unlock(&phydev->lock);
@@ -864,8 +881,6 @@ EXPORT_SYMBOL(phy_stop);
  */
 void phy_start(struct phy_device *phydev)
 {
-	int err;
-
 	mutex_lock(&phydev->lock);
 
 	if (phydev->state != PHY_READY && phydev->state != PHY_HALTED) {
@@ -877,13 +892,6 @@ void phy_start(struct phy_device *phydev)
 	/* if phy was suspended, bring the physical link up again */
 	__phy_resume(phydev);
 
-	/* make sure interrupts are enabled for the PHY */
-	if (phy_interrupt_is_valid(phydev)) {
-		err = phy_enable_interrupts(phydev);
-		if (err < 0)
-			goto out;
-	}
-
 	phydev->state = PHY_UP;
 
 	phy_start_machine(phydev);
diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c
index 8b4fc3b4f269..2c879ba01f35 100644
--- a/drivers/net/phy/phy_device.c
+++ b/drivers/net/phy/phy_device.c
@@ -1016,7 +1016,7 @@ void phy_disconnect(struct phy_device *phydev)
 		phy_stop(phydev);
 
 	if (phy_interrupt_is_valid(phydev))
-		free_irq(phydev->irq, phydev);
+		phy_free_interrupt(phydev);
 
 	phydev->adjust_link = NULL;
 
diff --git a/include/linux/phy.h b/include/linux/phy.h
index 7180b1d1e5e3..72e1196f9799 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -1147,6 +1147,7 @@ int phy_ethtool_ksettings_set(struct phy_device *phydev,
 			      const struct ethtool_link_ksettings *cmd);
 int phy_mii_ioctl(struct phy_device *phydev, struct ifreq *ifr, int cmd);
 void phy_request_interrupt(struct phy_device *phydev);
+void phy_free_interrupt(struct phy_device *phydev);
 void phy_print_status(struct phy_device *phydev);
 int phy_set_max_speed(struct phy_device *phydev, u32 max_speed);
 void phy_remove_link_mode(struct phy_device *phydev, u32 link_mode);
-- 
cgit v1.2.3


From 49644e68f472c6480e015253fa4d7448c6cfa2aa Mon Sep 17 00:00:00 2001
From: Heiner Kallweit <hkallweit1@gmail.com>
Date: Thu, 30 May 2019 15:10:06 +0200
Subject: net: phy: add callback for custom interrupt handler to struct
 phy_driver

The phylib interrupt handler handles link change events only currently.
However PHY drivers may want to use other interrupt sources too,
e.g. to report temperature monitoring events. Therefore add a callback
to struct phy_driver allowing PHY drivers to implement a custom
interrupt handler.

Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Suggested-by: Russell King - ARM Linux admin <linux@armlinux.org.uk>
Acked-by: Russell King <rmk+kernel@armlinux.org.uk>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/phy.c | 9 +++++++--
 include/linux/phy.h   | 3 +++
 2 files changed, 10 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c
index 4ba71dc3aee7..c6b0010a6d20 100644
--- a/drivers/net/phy/phy.c
+++ b/drivers/net/phy/phy.c
@@ -772,8 +772,13 @@ static irqreturn_t phy_interrupt(int irq, void *phy_dat)
 	if (phydev->drv->did_interrupt && !phydev->drv->did_interrupt(phydev))
 		return IRQ_NONE;
 
-	/* reschedule state queue work to run as soon as possible */
-	phy_trigger_machine(phydev);
+	if (phydev->drv->handle_interrupt) {
+		if (phydev->drv->handle_interrupt(phydev))
+			goto phy_err;
+	} else {
+		/* reschedule state queue work to run as soon as possible */
+		phy_trigger_machine(phydev);
+	}
 
 	if (phy_clear_interrupt(phydev))
 		goto phy_err;
diff --git a/include/linux/phy.h b/include/linux/phy.h
index 72e1196f9799..16cd33915496 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -537,6 +537,9 @@ struct phy_driver {
 	 */
 	int (*did_interrupt)(struct phy_device *phydev);
 
+	/* Override default interrupt handling */
+	int (*handle_interrupt)(struct phy_device *phydev);
+
 	/* Clears up any memory if needed */
 	void (*remove)(struct phy_device *phydev);
 
-- 
cgit v1.2.3


From 97b33bdf9bddb6bebc2e87148df3e30aa7a13b2d Mon Sep 17 00:00:00 2001
From: Heiner Kallweit <hkallweit1@gmail.com>
Date: Thu, 30 May 2019 15:11:06 +0200
Subject: net: phy: export phy_queue_state_machine

We face the issue that link change interrupt and link status may be
reported by different PHY layers. As a result the link change
interrupt may occur before the link status changes.
Export phy_queue_state_machine to allow PHY drivers to specify a
delay between link status change interrupt and link status check.

v2:
- change jiffies parameter type to unsigned long

Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Suggested-by: Russell King <rmk+kernel@armlinux.org.uk>
Acked-by: Russell King <rmk+kernel@armlinux.org.uk>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/phy.c | 8 +++++---
 include/linux/phy.h   | 2 +-
 2 files changed, 6 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c
index c6b0010a6d20..84671d868a80 100644
--- a/drivers/net/phy/phy.c
+++ b/drivers/net/phy/phy.c
@@ -29,6 +29,8 @@
 #include <linux/uaccess.h>
 #include <linux/atomic.h>
 
+#define PHY_STATE_TIME	HZ
+
 #define PHY_STATE_STR(_state)			\
 	case PHY_##_state:			\
 		return __stringify(_state);	\
@@ -478,12 +480,12 @@ int phy_mii_ioctl(struct phy_device *phydev, struct ifreq *ifr, int cmd)
 }
 EXPORT_SYMBOL(phy_mii_ioctl);
 
-static void phy_queue_state_machine(struct phy_device *phydev,
-				    unsigned int secs)
+void phy_queue_state_machine(struct phy_device *phydev, unsigned long jiffies)
 {
 	mod_delayed_work(system_power_efficient_wq, &phydev->state_queue,
-			 secs * HZ);
+			 jiffies);
 }
+EXPORT_SYMBOL(phy_queue_state_machine);
 
 static void phy_trigger_machine(struct phy_device *phydev)
 {
diff --git a/include/linux/phy.h b/include/linux/phy.h
index 16cd33915496..dc4b51060ebc 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -188,7 +188,6 @@ static inline const char *phy_modes(phy_interface_t interface)
 
 
 #define PHY_INIT_TIMEOUT	100000
-#define PHY_STATE_TIME		1
 #define PHY_FORCE_TIMEOUT	10
 
 #define PHY_MAX_ADDR	32
@@ -1140,6 +1139,7 @@ int phy_driver_register(struct phy_driver *new_driver, struct module *owner);
 int phy_drivers_register(struct phy_driver *new_driver, int n,
 			 struct module *owner);
 void phy_state_machine(struct work_struct *work);
+void phy_queue_state_machine(struct phy_device *phydev, unsigned long jiffies);
 void phy_mac_interrupt(struct phy_device *phydev);
 void phy_start_machine(struct phy_device *phydev);
 void phy_stop_machine(struct phy_device *phydev);
-- 
cgit v1.2.3


From 229b4e0728e0a6ddca2645e73696d5b104fbbbfb Mon Sep 17 00:00:00 2001
From: Changbin Du <changbin.du@gmail.com>
Date: Tue, 14 May 2019 22:47:24 +0800
Subject: Documentation: PCI: convert pci.txt to reST

Convert plain text documentation to reStructuredText format and add it to
Sphinx TOC tree.  No essential content change.

Move the description of struct pci_driver and struct pci_device_id into
in-source comments.

Signed-off-by: Changbin Du <changbin.du@gmail.com>
[bhelgaas: fix kernel-doc warnings related to moving descriptions to
linux/pci.h, fix "space tab" whitespace errors in mod_devicetable.h]
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Reviewed-by: Mauro Carvalho Chehab <mchehab+samsung@kernel.org>
---
 Documentation/PCI/index.rst     |   2 +
 Documentation/PCI/pci.rst       | 578 ++++++++++++++++++++++++++++++++++++
 Documentation/PCI/pci.txt       | 636 ----------------------------------------
 include/linux/mod_devicetable.h |  29 +-
 include/linux/pci.h             |  48 ++-
 5 files changed, 651 insertions(+), 642 deletions(-)
 create mode 100644 Documentation/PCI/pci.rst
 delete mode 100644 Documentation/PCI/pci.txt

(limited to 'include/linux')

diff --git a/Documentation/PCI/index.rst b/Documentation/PCI/index.rst
index c2f8728d11cf..7babf43709b0 100644
--- a/Documentation/PCI/index.rst
+++ b/Documentation/PCI/index.rst
@@ -7,3 +7,5 @@ Linux PCI Bus Subsystem
 .. toctree::
    :maxdepth: 2
    :numbered:
+
+   pci
diff --git a/Documentation/PCI/pci.rst b/Documentation/PCI/pci.rst
new file mode 100644
index 000000000000..6864f9a70f5f
--- /dev/null
+++ b/Documentation/PCI/pci.rst
@@ -0,0 +1,578 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+==============================
+How To Write Linux PCI Drivers
+==============================
+
+:Authors: - Martin Mares <mj@ucw.cz>
+          - Grant Grundler <grundler@parisc-linux.org>
+
+The world of PCI is vast and full of (mostly unpleasant) surprises.
+Since each CPU architecture implements different chip-sets and PCI devices
+have different requirements (erm, "features"), the result is the PCI support
+in the Linux kernel is not as trivial as one would wish. This short paper
+tries to introduce all potential driver authors to Linux APIs for
+PCI device drivers.
+
+A more complete resource is the third edition of "Linux Device Drivers"
+by Jonathan Corbet, Alessandro Rubini, and Greg Kroah-Hartman.
+LDD3 is available for free (under Creative Commons License) from:
+http://lwn.net/Kernel/LDD3/.
+
+However, keep in mind that all documents are subject to "bit rot".
+Refer to the source code if things are not working as described here.
+
+Please send questions/comments/patches about Linux PCI API to the
+"Linux PCI" <linux-pci@atrey.karlin.mff.cuni.cz> mailing list.
+
+
+Structure of PCI drivers
+========================
+PCI drivers "discover" PCI devices in a system via pci_register_driver().
+Actually, it's the other way around. When the PCI generic code discovers
+a new device, the driver with a matching "description" will be notified.
+Details on this below.
+
+pci_register_driver() leaves most of the probing for devices to
+the PCI layer and supports online insertion/removal of devices [thus
+supporting hot-pluggable PCI, CardBus, and Express-Card in a single driver].
+pci_register_driver() call requires passing in a table of function
+pointers and thus dictates the high level structure of a driver.
+
+Once the driver knows about a PCI device and takes ownership, the
+driver generally needs to perform the following initialization:
+
+  - Enable the device
+  - Request MMIO/IOP resources
+  - Set the DMA mask size (for both coherent and streaming DMA)
+  - Allocate and initialize shared control data (dma_alloc_coherent())
+  - Access device configuration space (if needed)
+  - Register IRQ handler (request_irq())
+  - Initialize non-PCI (i.e. LAN/SCSI/etc parts of the chip)
+  - Enable DMA/processing engines
+
+When done using the device, and perhaps the module needs to be unloaded,
+the driver needs to take the following steps:
+
+  - Disable the device from generating IRQs
+  - Release the IRQ (free_irq())
+  - Stop all DMA activity
+  - Release DMA buffers (both streaming and coherent)
+  - Unregister from other subsystems (e.g. scsi or netdev)
+  - Release MMIO/IOP resources
+  - Disable the device
+
+Most of these topics are covered in the following sections.
+For the rest look at LDD3 or <linux/pci.h> .
+
+If the PCI subsystem is not configured (CONFIG_PCI is not set), most of
+the PCI functions described below are defined as inline functions either
+completely empty or just returning an appropriate error code to avoid
+lots of ifdefs in the drivers.
+
+
+pci_register_driver() call
+==========================
+
+PCI device drivers call ``pci_register_driver()`` during their
+initialization with a pointer to a structure describing the driver
+(``struct pci_driver``):
+
+.. kernel-doc:: include/linux/pci.h
+   :functions: pci_driver
+
+The ID table is an array of ``struct pci_device_id`` entries ending with an
+all-zero entry.  Definitions with static const are generally preferred.
+
+.. kernel-doc:: include/linux/mod_devicetable.h
+   :functions: pci_device_id
+
+Most drivers only need ``PCI_DEVICE()`` or ``PCI_DEVICE_CLASS()`` to set up
+a pci_device_id table.
+
+New PCI IDs may be added to a device driver pci_ids table at runtime
+as shown below::
+
+  echo "vendor device subvendor subdevice class class_mask driver_data" > \
+  /sys/bus/pci/drivers/{driver}/new_id
+
+All fields are passed in as hexadecimal values (no leading 0x).
+The vendor and device fields are mandatory, the others are optional. Users
+need pass only as many optional fields as necessary:
+
+  - subvendor and subdevice fields default to PCI_ANY_ID (FFFFFFFF)
+  - class and class_mask fields default to 0
+  - driver_data defaults to 0UL.
+
+Note that driver_data must match the value used by any of the pci_device_id
+entries defined in the driver. This makes the driver_data field mandatory
+if all the pci_device_id entries have a non-zero driver_data value.
+
+Once added, the driver probe routine will be invoked for any unclaimed
+PCI devices listed in its (newly updated) pci_ids list.
+
+When the driver exits, it just calls pci_unregister_driver() and the PCI layer
+automatically calls the remove hook for all devices handled by the driver.
+
+
+"Attributes" for driver functions/data
+--------------------------------------
+
+Please mark the initialization and cleanup functions where appropriate
+(the corresponding macros are defined in <linux/init.h>):
+
+	======		=================================================
+	__init		Initialization code. Thrown away after the driver
+			initializes.
+	__exit		Exit code. Ignored for non-modular drivers.
+	======		=================================================
+
+Tips on when/where to use the above attributes:
+	- The module_init()/module_exit() functions (and all
+	  initialization functions called _only_ from these)
+	  should be marked __init/__exit.
+
+	- Do not mark the struct pci_driver.
+
+	- Do NOT mark a function if you are not sure which mark to use.
+	  Better to not mark the function than mark the function wrong.
+
+
+How to find PCI devices manually
+================================
+
+PCI drivers should have a really good reason for not using the
+pci_register_driver() interface to search for PCI devices.
+The main reason PCI devices are controlled by multiple drivers
+is because one PCI device implements several different HW services.
+E.g. combined serial/parallel port/floppy controller.
+
+A manual search may be performed using the following constructs:
+
+Searching by vendor and device ID::
+
+	struct pci_dev *dev = NULL;
+	while (dev = pci_get_device(VENDOR_ID, DEVICE_ID, dev))
+		configure_device(dev);
+
+Searching by class ID (iterate in a similar way)::
+
+	pci_get_class(CLASS_ID, dev)
+
+Searching by both vendor/device and subsystem vendor/device ID::
+
+	pci_get_subsys(VENDOR_ID,DEVICE_ID, SUBSYS_VENDOR_ID, SUBSYS_DEVICE_ID, dev).
+
+You can use the constant PCI_ANY_ID as a wildcard replacement for
+VENDOR_ID or DEVICE_ID.  This allows searching for any device from a
+specific vendor, for example.
+
+These functions are hotplug-safe. They increment the reference count on
+the pci_dev that they return. You must eventually (possibly at module unload)
+decrement the reference count on these devices by calling pci_dev_put().
+
+
+Device Initialization Steps
+===========================
+
+As noted in the introduction, most PCI drivers need the following steps
+for device initialization:
+
+  - Enable the device
+  - Request MMIO/IOP resources
+  - Set the DMA mask size (for both coherent and streaming DMA)
+  - Allocate and initialize shared control data (dma_alloc_coherent())
+  - Access device configuration space (if needed)
+  - Register IRQ handler (request_irq())
+  - Initialize non-PCI (i.e. LAN/SCSI/etc parts of the chip)
+  - Enable DMA/processing engines.
+
+The driver can access PCI config space registers at any time.
+(Well, almost. When running BIST, config space can go away...but
+that will just result in a PCI Bus Master Abort and config reads
+will return garbage).
+
+
+Enable the PCI device
+---------------------
+Before touching any device registers, the driver needs to enable
+the PCI device by calling pci_enable_device(). This will:
+
+  - wake up the device if it was in suspended state,
+  - allocate I/O and memory regions of the device (if BIOS did not),
+  - allocate an IRQ (if BIOS did not).
+
+.. note::
+   pci_enable_device() can fail! Check the return value.
+
+.. warning::
+   OS BUG: we don't check resource allocations before enabling those
+   resources. The sequence would make more sense if we called
+   pci_request_resources() before calling pci_enable_device().
+   Currently, the device drivers can't detect the bug when two
+   devices have been allocated the same range. This is not a common
+   problem and unlikely to get fixed soon.
+
+   This has been discussed before but not changed as of 2.6.19:
+   http://lkml.org/lkml/2006/3/2/194
+
+
+pci_set_master() will enable DMA by setting the bus master bit
+in the PCI_COMMAND register. It also fixes the latency timer value if
+it's set to something bogus by the BIOS.  pci_clear_master() will
+disable DMA by clearing the bus master bit.
+
+If the PCI device can use the PCI Memory-Write-Invalidate transaction,
+call pci_set_mwi().  This enables the PCI_COMMAND bit for Mem-Wr-Inval
+and also ensures that the cache line size register is set correctly.
+Check the return value of pci_set_mwi() as not all architectures
+or chip-sets may support Memory-Write-Invalidate.  Alternatively,
+if Mem-Wr-Inval would be nice to have but is not required, call
+pci_try_set_mwi() to have the system do its best effort at enabling
+Mem-Wr-Inval.
+
+
+Request MMIO/IOP resources
+--------------------------
+Memory (MMIO), and I/O port addresses should NOT be read directly
+from the PCI device config space. Use the values in the pci_dev structure
+as the PCI "bus address" might have been remapped to a "host physical"
+address by the arch/chip-set specific kernel support.
+
+See Documentation/io-mapping.txt for how to access device registers
+or device memory.
+
+The device driver needs to call pci_request_region() to verify
+no other device is already using the same address resource.
+Conversely, drivers should call pci_release_region() AFTER
+calling pci_disable_device().
+The idea is to prevent two devices colliding on the same address range.
+
+.. tip::
+   See OS BUG comment above. Currently (2.6.19), the driver can only
+   determine MMIO and IO Port resource availability _after_ calling
+   pci_enable_device().
+
+Generic flavors of pci_request_region() are request_mem_region()
+(for MMIO ranges) and request_region() (for IO Port ranges).
+Use these for address resources that are not described by "normal" PCI
+BARs.
+
+Also see pci_request_selected_regions() below.
+
+
+Set the DMA mask size
+---------------------
+.. note::
+   If anything below doesn't make sense, please refer to
+   Documentation/DMA-API.txt. This section is just a reminder that
+   drivers need to indicate DMA capabilities of the device and is not
+   an authoritative source for DMA interfaces.
+
+While all drivers should explicitly indicate the DMA capability
+(e.g. 32 or 64 bit) of the PCI bus master, devices with more than
+32-bit bus master capability for streaming data need the driver
+to "register" this capability by calling pci_set_dma_mask() with
+appropriate parameters.  In general this allows more efficient DMA
+on systems where System RAM exists above 4G _physical_ address.
+
+Drivers for all PCI-X and PCIe compliant devices must call
+pci_set_dma_mask() as they are 64-bit DMA devices.
+
+Similarly, drivers must also "register" this capability if the device
+can directly address "consistent memory" in System RAM above 4G physical
+address by calling pci_set_consistent_dma_mask().
+Again, this includes drivers for all PCI-X and PCIe compliant devices.
+Many 64-bit "PCI" devices (before PCI-X) and some PCI-X devices are
+64-bit DMA capable for payload ("streaming") data but not control
+("consistent") data.
+
+
+Setup shared control data
+-------------------------
+Once the DMA masks are set, the driver can allocate "consistent" (a.k.a. shared)
+memory.  See Documentation/DMA-API.txt for a full description of
+the DMA APIs. This section is just a reminder that it needs to be done
+before enabling DMA on the device.
+
+
+Initialize device registers
+---------------------------
+Some drivers will need specific "capability" fields programmed
+or other "vendor specific" register initialized or reset.
+E.g. clearing pending interrupts.
+
+
+Register IRQ handler
+--------------------
+While calling request_irq() is the last step described here,
+this is often just another intermediate step to initialize a device.
+This step can often be deferred until the device is opened for use.
+
+All interrupt handlers for IRQ lines should be registered with IRQF_SHARED
+and use the devid to map IRQs to devices (remember that all PCI IRQ lines
+can be shared).
+
+request_irq() will associate an interrupt handler and device handle
+with an interrupt number. Historically interrupt numbers represent
+IRQ lines which run from the PCI device to the Interrupt controller.
+With MSI and MSI-X (more below) the interrupt number is a CPU "vector".
+
+request_irq() also enables the interrupt. Make sure the device is
+quiesced and does not have any interrupts pending before registering
+the interrupt handler.
+
+MSI and MSI-X are PCI capabilities. Both are "Message Signaled Interrupts"
+which deliver interrupts to the CPU via a DMA write to a Local APIC.
+The fundamental difference between MSI and MSI-X is how multiple
+"vectors" get allocated. MSI requires contiguous blocks of vectors
+while MSI-X can allocate several individual ones.
+
+MSI capability can be enabled by calling pci_alloc_irq_vectors() with the
+PCI_IRQ_MSI and/or PCI_IRQ_MSIX flags before calling request_irq(). This
+causes the PCI support to program CPU vector data into the PCI device
+capability registers. Many architectures, chip-sets, or BIOSes do NOT
+support MSI or MSI-X and a call to pci_alloc_irq_vectors with just
+the PCI_IRQ_MSI and PCI_IRQ_MSIX flags will fail, so try to always
+specify PCI_IRQ_LEGACY as well.
+
+Drivers that have different interrupt handlers for MSI/MSI-X and
+legacy INTx should choose the right one based on the msi_enabled
+and msix_enabled flags in the pci_dev structure after calling
+pci_alloc_irq_vectors.
+
+There are (at least) two really good reasons for using MSI:
+
+1) MSI is an exclusive interrupt vector by definition.
+   This means the interrupt handler doesn't have to verify
+   its device caused the interrupt.
+
+2) MSI avoids DMA/IRQ race conditions. DMA to host memory is guaranteed
+   to be visible to the host CPU(s) when the MSI is delivered. This
+   is important for both data coherency and avoiding stale control data.
+   This guarantee allows the driver to omit MMIO reads to flush
+   the DMA stream.
+
+See drivers/infiniband/hw/mthca/ or drivers/net/tg3.c for examples
+of MSI/MSI-X usage.
+
+
+PCI device shutdown
+===================
+
+When a PCI device driver is being unloaded, most of the following
+steps need to be performed:
+
+  - Disable the device from generating IRQs
+  - Release the IRQ (free_irq())
+  - Stop all DMA activity
+  - Release DMA buffers (both streaming and consistent)
+  - Unregister from other subsystems (e.g. scsi or netdev)
+  - Disable device from responding to MMIO/IO Port addresses
+  - Release MMIO/IO Port resource(s)
+
+
+Stop IRQs on the device
+-----------------------
+How to do this is chip/device specific. If it's not done, it opens
+the possibility of a "screaming interrupt" if (and only if)
+the IRQ is shared with another device.
+
+When the shared IRQ handler is "unhooked", the remaining devices
+using the same IRQ line will still need the IRQ enabled. Thus if the
+"unhooked" device asserts IRQ line, the system will respond assuming
+it was one of the remaining devices that asserted the IRQ line. Since none
+of the other devices will handle the IRQ, the system will "hang" until
+it decides the IRQ isn't going to get handled and masks the IRQ (100,000
+iterations later). Once the shared IRQ is masked, the remaining devices
+will stop functioning properly. Not a nice situation.
+
+This is another reason to use MSI or MSI-X if it's available.
+MSI and MSI-X are defined to be exclusive interrupts and thus
+are not susceptible to the "screaming interrupt" problem.
+
+
+Release the IRQ
+---------------
+Once the device is quiesced (no more IRQs), one can call free_irq().
+This function will return control once any pending IRQs are handled,
+"unhook" the driver's IRQ handler from that IRQ, and finally release
+the IRQ if no one else is using it.
+
+
+Stop all DMA activity
+---------------------
+It's extremely important to stop all DMA operations BEFORE attempting
+to deallocate DMA control data. Failure to do so can result in memory
+corruption, hangs, and on some chip-sets a hard crash.
+
+Stopping DMA after stopping the IRQs can avoid races where the
+IRQ handler might restart DMA engines.
+
+While this step sounds obvious and trivial, several "mature" drivers
+didn't get this step right in the past.
+
+
+Release DMA buffers
+-------------------
+Once DMA is stopped, clean up streaming DMA first.
+I.e. unmap data buffers and return buffers to "upstream"
+owners if there is one.
+
+Then clean up "consistent" buffers which contain the control data.
+
+See Documentation/DMA-API.txt for details on unmapping interfaces.
+
+
+Unregister from other subsystems
+--------------------------------
+Most low level PCI device drivers support some other subsystem
+like USB, ALSA, SCSI, NetDev, Infiniband, etc. Make sure your
+driver isn't losing resources from that other subsystem.
+If this happens, typically the symptom is an Oops (panic) when
+the subsystem attempts to call into a driver that has been unloaded.
+
+
+Disable Device from responding to MMIO/IO Port addresses
+--------------------------------------------------------
+iounmap() MMIO or IO Port resources and then call pci_disable_device().
+This is the symmetric opposite of pci_enable_device().
+Do not access device registers after calling pci_disable_device().
+
+
+Release MMIO/IO Port Resource(s)
+--------------------------------
+Call pci_release_region() to mark the MMIO or IO Port range as available.
+Failure to do so usually results in the inability to reload the driver.
+
+
+How to access PCI config space
+==============================
+
+You can use `pci_(read|write)_config_(byte|word|dword)` to access the config
+space of a device represented by `struct pci_dev *`. All these functions return
+0 when successful or an error code (`PCIBIOS_...`) which can be translated to a
+text string by pcibios_strerror. Most drivers expect that accesses to valid PCI
+devices don't fail.
+
+If you don't have a struct pci_dev available, you can call
+`pci_bus_(read|write)_config_(byte|word|dword)` to access a given device
+and function on that bus.
+
+If you access fields in the standard portion of the config header, please
+use symbolic names of locations and bits declared in <linux/pci.h>.
+
+If you need to access Extended PCI Capability registers, just call
+pci_find_capability() for the particular capability and it will find the
+corresponding register block for you.
+
+
+Other interesting functions
+===========================
+
+=============================	================================================
+pci_get_domain_bus_and_slot()	Find pci_dev corresponding to given domain,
+				bus, and slot number. If the device is
+				found, its reference count is increased.
+pci_set_power_state()		Set PCI Power Management state (0=D0 ... 3=D3)
+pci_find_capability()		Find specified capability in device's capability
+				list.
+pci_resource_start()		Returns bus start address for a given PCI region
+pci_resource_end()		Returns bus end address for a given PCI region
+pci_resource_len()		Returns the byte length of a PCI region
+pci_set_drvdata()		Set private driver data pointer for a pci_dev
+pci_get_drvdata()		Return private driver data pointer for a pci_dev
+pci_set_mwi()			Enable Memory-Write-Invalidate transactions.
+pci_clear_mwi()			Disable Memory-Write-Invalidate transactions.
+=============================	================================================
+
+
+Miscellaneous hints
+===================
+
+When displaying PCI device names to the user (for example when a driver wants
+to tell the user what card it has found), please use pci_name(pci_dev).
+
+Always refer to the PCI devices by a pointer to the pci_dev structure.
+All PCI layer functions use this identification and it's the only
+reasonable one. Don't use bus/slot/function numbers except for very
+special purposes -- on systems with multiple primary buses their semantics
+can be pretty complex.
+
+Don't try to turn on Fast Back to Back writes in your driver.  All devices
+on the bus need to be capable of doing it, so this is something which needs
+to be handled by platform and generic code, not individual drivers.
+
+
+Vendor and device identifications
+=================================
+
+Do not add new device or vendor IDs to include/linux/pci_ids.h unless they
+are shared across multiple drivers.  You can add private definitions in
+your driver if they're helpful, or just use plain hex constants.
+
+The device IDs are arbitrary hex numbers (vendor controlled) and normally used
+only in a single location, the pci_device_id table.
+
+Please DO submit new vendor/device IDs to http://pci-ids.ucw.cz/.
+There are mirrors of the pci.ids file at http://pciids.sourceforge.net/
+and https://github.com/pciutils/pciids.
+
+
+Obsolete functions
+==================
+
+There are several functions which you might come across when trying to
+port an old driver to the new PCI interface.  They are no longer present
+in the kernel as they aren't compatible with hotplug or PCI domains or
+having sane locking.
+
+=================	===========================================
+pci_find_device()	Superseded by pci_get_device()
+pci_find_subsys()	Superseded by pci_get_subsys()
+pci_find_slot()		Superseded by pci_get_domain_bus_and_slot()
+pci_get_slot()		Superseded by pci_get_domain_bus_and_slot()
+=================	===========================================
+
+The alternative is the traditional PCI device driver that walks PCI
+device lists. This is still possible but discouraged.
+
+
+MMIO Space and "Write Posting"
+==============================
+
+Converting a driver from using I/O Port space to using MMIO space
+often requires some additional changes. Specifically, "write posting"
+needs to be handled. Many drivers (e.g. tg3, acenic, sym53c8xx_2)
+already do this. I/O Port space guarantees write transactions reach the PCI
+device before the CPU can continue. Writes to MMIO space allow the CPU
+to continue before the transaction reaches the PCI device. HW weenies
+call this "Write Posting" because the write completion is "posted" to
+the CPU before the transaction has reached its destination.
+
+Thus, timing sensitive code should add readl() where the CPU is
+expected to wait before doing other work.  The classic "bit banging"
+sequence works fine for I/O Port space::
+
+       for (i = 8; --i; val >>= 1) {
+               outb(val & 1, ioport_reg);      /* write bit */
+               udelay(10);
+       }
+
+The same sequence for MMIO space should be::
+
+       for (i = 8; --i; val >>= 1) {
+               writeb(val & 1, mmio_reg);      /* write bit */
+               readb(safe_mmio_reg);           /* flush posted write */
+               udelay(10);
+       }
+
+It is important that "safe_mmio_reg" not have any side effects that
+interfere with the correct operation of the device.
+
+Another case to watch out for is when resetting a PCI device. Use PCI
+Configuration space reads to flush the writel(). This will gracefully
+handle the PCI master abort on all platforms if the PCI device is
+expected to not respond to a readl().  Most x86 platforms will allow
+MMIO reads to master abort (a.k.a. "Soft Fail") and return garbage
+(e.g. ~0). But many RISC platforms will crash (a.k.a. "Hard Fail").
diff --git a/Documentation/PCI/pci.txt b/Documentation/PCI/pci.txt
deleted file mode 100644
index badb26ac33dc..000000000000
--- a/Documentation/PCI/pci.txt
+++ /dev/null
@@ -1,636 +0,0 @@
-
-			How To Write Linux PCI Drivers
-
-		by Martin Mares <mj@ucw.cz> on 07-Feb-2000
-	updated by Grant Grundler <grundler@parisc-linux.org> on 23-Dec-2006
-
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-The world of PCI is vast and full of (mostly unpleasant) surprises.
-Since each CPU architecture implements different chip-sets and PCI devices
-have different requirements (erm, "features"), the result is the PCI support
-in the Linux kernel is not as trivial as one would wish. This short paper
-tries to introduce all potential driver authors to Linux APIs for
-PCI device drivers.
-
-A more complete resource is the third edition of "Linux Device Drivers"
-by Jonathan Corbet, Alessandro Rubini, and Greg Kroah-Hartman.
-LDD3 is available for free (under Creative Commons License) from:
-
-	http://lwn.net/Kernel/LDD3/
-
-However, keep in mind that all documents are subject to "bit rot".
-Refer to the source code if things are not working as described here.
-
-Please send questions/comments/patches about Linux PCI API to the
-"Linux PCI" <linux-pci@atrey.karlin.mff.cuni.cz> mailing list.
-
-
-
-0. Structure of PCI drivers
-~~~~~~~~~~~~~~~~~~~~~~~~~~~
-PCI drivers "discover" PCI devices in a system via pci_register_driver().
-Actually, it's the other way around. When the PCI generic code discovers
-a new device, the driver with a matching "description" will be notified.
-Details on this below.
-
-pci_register_driver() leaves most of the probing for devices to
-the PCI layer and supports online insertion/removal of devices [thus
-supporting hot-pluggable PCI, CardBus, and Express-Card in a single driver].
-pci_register_driver() call requires passing in a table of function
-pointers and thus dictates the high level structure of a driver.
-
-Once the driver knows about a PCI device and takes ownership, the
-driver generally needs to perform the following initialization:
-
-	Enable the device
-	Request MMIO/IOP resources
-	Set the DMA mask size (for both coherent and streaming DMA)
-	Allocate and initialize shared control data (pci_allocate_coherent())
-	Access device configuration space (if needed)
-	Register IRQ handler (request_irq())
-	Initialize non-PCI (i.e. LAN/SCSI/etc parts of the chip)
-	Enable DMA/processing engines
-
-When done using the device, and perhaps the module needs to be unloaded,
-the driver needs to take the follow steps:
-	Disable the device from generating IRQs
-	Release the IRQ (free_irq())
-	Stop all DMA activity
-	Release DMA buffers (both streaming and coherent)
-	Unregister from other subsystems (e.g. scsi or netdev)
-	Release MMIO/IOP resources
-	Disable the device
-
-Most of these topics are covered in the following sections.
-For the rest look at LDD3 or <linux/pci.h> .
-
-If the PCI subsystem is not configured (CONFIG_PCI is not set), most of
-the PCI functions described below are defined as inline functions either
-completely empty or just returning an appropriate error codes to avoid
-lots of ifdefs in the drivers.
-
-
-
-1. pci_register_driver() call
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-PCI device drivers call pci_register_driver() during their
-initialization with a pointer to a structure describing the driver
-(struct pci_driver):
-
-	field name	Description
-	----------	------------------------------------------------------
-	id_table	Pointer to table of device ID's the driver is
-			interested in.  Most drivers should export this
-			table using MODULE_DEVICE_TABLE(pci,...).
-
-	probe		This probing function gets called (during execution
-			of pci_register_driver() for already existing
-			devices or later if a new device gets inserted) for
-			all PCI devices which match the ID table and are not
-			"owned" by the other drivers yet. This function gets
-			passed a "struct pci_dev *" for each device whose
-			entry in the ID table matches the device. The probe
-			function returns zero when the driver chooses to
-			take "ownership" of the device or an error code
-			(negative number) otherwise.
-			The probe function always gets called from process
-			context, so it can sleep.
-
-	remove		The remove() function gets called whenever a device
-			being handled by this driver is removed (either during
-			deregistration of the driver or when it's manually
-			pulled out of a hot-pluggable slot).
-			The remove function always gets called from process
-			context, so it can sleep.
-
-	suspend		Put device into low power state.
-	suspend_late	Put device into low power state.
-
-	resume_early	Wake device from low power state.
-	resume		Wake device from low power state.
-
-		(Please see Documentation/power/pci.txt for descriptions
-		of PCI Power Management and the related functions.)
-
-	shutdown	Hook into reboot_notifier_list (kernel/sys.c).
-			Intended to stop any idling DMA operations.
-			Useful for enabling wake-on-lan (NIC) or changing
-			the power state of a device before reboot.
-			e.g. drivers/net/e100.c.
-
-	err_handler	See Documentation/PCI/pci-error-recovery.txt
-
-
-The ID table is an array of struct pci_device_id entries ending with an
-all-zero entry.  Definitions with static const are generally preferred.
-
-Each entry consists of:
-
-	vendor,device	Vendor and device ID to match (or PCI_ANY_ID)
-
-	subvendor,	Subsystem vendor and device ID to match (or PCI_ANY_ID)
-	subdevice,
-
-	class		Device class, subclass, and "interface" to match.
-			See Appendix D of the PCI Local Bus Spec or
-			include/linux/pci_ids.h for a full list of classes.
-			Most drivers do not need to specify class/class_mask
-			as vendor/device is normally sufficient.
-
-	class_mask	limit which sub-fields of the class field are compared.
-			See drivers/scsi/sym53c8xx_2/ for example of usage.
-
-	driver_data	Data private to the driver.
-			Most drivers don't need to use driver_data field.
-			Best practice is to use driver_data as an index
-			into a static list of equivalent device types,
-			instead of using it as a pointer.
-
-
-Most drivers only need PCI_DEVICE() or PCI_DEVICE_CLASS() to set up
-a pci_device_id table.
-
-New PCI IDs may be added to a device driver pci_ids table at runtime
-as shown below:
-
-echo "vendor device subvendor subdevice class class_mask driver_data" > \
-/sys/bus/pci/drivers/{driver}/new_id
-
-All fields are passed in as hexadecimal values (no leading 0x).
-The vendor and device fields are mandatory, the others are optional. Users
-need pass only as many optional fields as necessary:
-	o subvendor and subdevice fields default to PCI_ANY_ID (FFFFFFFF)
-	o class and classmask fields default to 0
-	o driver_data defaults to 0UL.
-
-Note that driver_data must match the value used by any of the pci_device_id
-entries defined in the driver. This makes the driver_data field mandatory
-if all the pci_device_id entries have a non-zero driver_data value.
-
-Once added, the driver probe routine will be invoked for any unclaimed
-PCI devices listed in its (newly updated) pci_ids list.
-
-When the driver exits, it just calls pci_unregister_driver() and the PCI layer
-automatically calls the remove hook for all devices handled by the driver.
-
-
-1.1 "Attributes" for driver functions/data
-
-Please mark the initialization and cleanup functions where appropriate
-(the corresponding macros are defined in <linux/init.h>):
-
-	__init		Initialization code. Thrown away after the driver
-			initializes.
-	__exit		Exit code. Ignored for non-modular drivers.
-
-Tips on when/where to use the above attributes:
-	o The module_init()/module_exit() functions (and all
-	  initialization functions called _only_ from these)
-	  should be marked __init/__exit.
-
-	o Do not mark the struct pci_driver.
-
-	o Do NOT mark a function if you are not sure which mark to use.
-	  Better to not mark the function than mark the function wrong.
-
-
-
-2. How to find PCI devices manually
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-PCI drivers should have a really good reason for not using the
-pci_register_driver() interface to search for PCI devices.
-The main reason PCI devices are controlled by multiple drivers
-is because one PCI device implements several different HW services.
-E.g. combined serial/parallel port/floppy controller.
-
-A manual search may be performed using the following constructs:
-
-Searching by vendor and device ID:
-
-	struct pci_dev *dev = NULL;
-	while (dev = pci_get_device(VENDOR_ID, DEVICE_ID, dev))
-		configure_device(dev);
-
-Searching by class ID (iterate in a similar way):
-
-	pci_get_class(CLASS_ID, dev)
-
-Searching by both vendor/device and subsystem vendor/device ID:
-
-	pci_get_subsys(VENDOR_ID,DEVICE_ID, SUBSYS_VENDOR_ID, SUBSYS_DEVICE_ID, dev).
-
-You can use the constant PCI_ANY_ID as a wildcard replacement for
-VENDOR_ID or DEVICE_ID.  This allows searching for any device from a
-specific vendor, for example.
-
-These functions are hotplug-safe. They increment the reference count on
-the pci_dev that they return. You must eventually (possibly at module unload)
-decrement the reference count on these devices by calling pci_dev_put().
-
-
-
-3. Device Initialization Steps
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-As noted in the introduction, most PCI drivers need the following steps
-for device initialization:
-
-	Enable the device
-	Request MMIO/IOP resources
-	Set the DMA mask size (for both coherent and streaming DMA)
-	Allocate and initialize shared control data (pci_allocate_coherent())
-	Access device configuration space (if needed)
-	Register IRQ handler (request_irq())
-	Initialize non-PCI (i.e. LAN/SCSI/etc parts of the chip)
-	Enable DMA/processing engines.
-
-The driver can access PCI config space registers at any time.
-(Well, almost. When running BIST, config space can go away...but
-that will just result in a PCI Bus Master Abort and config reads
-will return garbage).
-
-
-3.1 Enable the PCI device
-~~~~~~~~~~~~~~~~~~~~~~~~~
-Before touching any device registers, the driver needs to enable
-the PCI device by calling pci_enable_device(). This will:
-	o wake up the device if it was in suspended state,
-	o allocate I/O and memory regions of the device (if BIOS did not),
-	o allocate an IRQ (if BIOS did not).
-
-NOTE: pci_enable_device() can fail! Check the return value.
-
-[ OS BUG: we don't check resource allocations before enabling those
-  resources. The sequence would make more sense if we called
-  pci_request_resources() before calling pci_enable_device().
-  Currently, the device drivers can't detect the bug when when two
-  devices have been allocated the same range. This is not a common
-  problem and unlikely to get fixed soon.
-
-  This has been discussed before but not changed as of 2.6.19:
-	http://lkml.org/lkml/2006/3/2/194
-]
-
-pci_set_master() will enable DMA by setting the bus master bit
-in the PCI_COMMAND register. It also fixes the latency timer value if
-it's set to something bogus by the BIOS.  pci_clear_master() will
-disable DMA by clearing the bus master bit.
-
-If the PCI device can use the PCI Memory-Write-Invalidate transaction,
-call pci_set_mwi().  This enables the PCI_COMMAND bit for Mem-Wr-Inval
-and also ensures that the cache line size register is set correctly.
-Check the return value of pci_set_mwi() as not all architectures
-or chip-sets may support Memory-Write-Invalidate.  Alternatively,
-if Mem-Wr-Inval would be nice to have but is not required, call
-pci_try_set_mwi() to have the system do its best effort at enabling
-Mem-Wr-Inval.
-
-
-3.2 Request MMIO/IOP resources
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Memory (MMIO), and I/O port addresses should NOT be read directly
-from the PCI device config space. Use the values in the pci_dev structure
-as the PCI "bus address" might have been remapped to a "host physical"
-address by the arch/chip-set specific kernel support.
-
-See Documentation/io-mapping.txt for how to access device registers
-or device memory.
-
-The device driver needs to call pci_request_region() to verify
-no other device is already using the same address resource.
-Conversely, drivers should call pci_release_region() AFTER
-calling pci_disable_device().
-The idea is to prevent two devices colliding on the same address range.
-
-[ See OS BUG comment above. Currently (2.6.19), The driver can only
-  determine MMIO and IO Port resource availability _after_ calling
-  pci_enable_device(). ]
-
-Generic flavors of pci_request_region() are request_mem_region()
-(for MMIO ranges) and request_region() (for IO Port ranges).
-Use these for address resources that are not described by "normal" PCI
-BARs.
-
-Also see pci_request_selected_regions() below.
-
-
-3.3 Set the DMA mask size
-~~~~~~~~~~~~~~~~~~~~~~~~~
-[ If anything below doesn't make sense, please refer to
-  Documentation/DMA-API.txt. This section is just a reminder that
-  drivers need to indicate DMA capabilities of the device and is not
-  an authoritative source for DMA interfaces. ]
-
-While all drivers should explicitly indicate the DMA capability
-(e.g. 32 or 64 bit) of the PCI bus master, devices with more than
-32-bit bus master capability for streaming data need the driver
-to "register" this capability by calling pci_set_dma_mask() with
-appropriate parameters.  In general this allows more efficient DMA
-on systems where System RAM exists above 4G _physical_ address.
-
-Drivers for all PCI-X and PCIe compliant devices must call
-pci_set_dma_mask() as they are 64-bit DMA devices.
-
-Similarly, drivers must also "register" this capability if the device
-can directly address "consistent memory" in System RAM above 4G physical
-address by calling pci_set_consistent_dma_mask().
-Again, this includes drivers for all PCI-X and PCIe compliant devices.
-Many 64-bit "PCI" devices (before PCI-X) and some PCI-X devices are
-64-bit DMA capable for payload ("streaming") data but not control
-("consistent") data.
-
-
-3.4 Setup shared control data
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Once the DMA masks are set, the driver can allocate "consistent" (a.k.a. shared)
-memory.  See Documentation/DMA-API.txt for a full description of
-the DMA APIs. This section is just a reminder that it needs to be done
-before enabling DMA on the device.
-
-
-3.5 Initialize device registers
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Some drivers will need specific "capability" fields programmed
-or other "vendor specific" register initialized or reset.
-E.g. clearing pending interrupts.
-
-
-3.6 Register IRQ handler
-~~~~~~~~~~~~~~~~~~~~~~~~
-While calling request_irq() is the last step described here,
-this is often just another intermediate step to initialize a device.
-This step can often be deferred until the device is opened for use.
-
-All interrupt handlers for IRQ lines should be registered with IRQF_SHARED
-and use the devid to map IRQs to devices (remember that all PCI IRQ lines
-can be shared).
-
-request_irq() will associate an interrupt handler and device handle
-with an interrupt number. Historically interrupt numbers represent
-IRQ lines which run from the PCI device to the Interrupt controller.
-With MSI and MSI-X (more below) the interrupt number is a CPU "vector".
-
-request_irq() also enables the interrupt. Make sure the device is
-quiesced and does not have any interrupts pending before registering
-the interrupt handler.
-
-MSI and MSI-X are PCI capabilities. Both are "Message Signaled Interrupts"
-which deliver interrupts to the CPU via a DMA write to a Local APIC.
-The fundamental difference between MSI and MSI-X is how multiple
-"vectors" get allocated. MSI requires contiguous blocks of vectors
-while MSI-X can allocate several individual ones.
-
-MSI capability can be enabled by calling pci_alloc_irq_vectors() with the
-PCI_IRQ_MSI and/or PCI_IRQ_MSIX flags before calling request_irq(). This
-causes the PCI support to program CPU vector data into the PCI device
-capability registers. Many architectures, chip-sets, or BIOSes do NOT
-support MSI or MSI-X and a call to pci_alloc_irq_vectors with just
-the PCI_IRQ_MSI and PCI_IRQ_MSIX flags will fail, so try to always
-specify PCI_IRQ_LEGACY as well.
-
-Drivers that have different interrupt handlers for MSI/MSI-X and
-legacy INTx should chose the right one based on the msi_enabled
-and msix_enabled flags in the pci_dev structure after calling
-pci_alloc_irq_vectors.
-
-There are (at least) two really good reasons for using MSI:
-1) MSI is an exclusive interrupt vector by definition.
-   This means the interrupt handler doesn't have to verify
-   its device caused the interrupt.
-
-2) MSI avoids DMA/IRQ race conditions. DMA to host memory is guaranteed
-   to be visible to the host CPU(s) when the MSI is delivered. This
-   is important for both data coherency and avoiding stale control data.
-   This guarantee allows the driver to omit MMIO reads to flush
-   the DMA stream.
-
-See drivers/infiniband/hw/mthca/ or drivers/net/tg3.c for examples
-of MSI/MSI-X usage.
-
-
-
-4. PCI device shutdown
-~~~~~~~~~~~~~~~~~~~~~~~
-
-When a PCI device driver is being unloaded, most of the following
-steps need to be performed:
-
-	Disable the device from generating IRQs
-	Release the IRQ (free_irq())
-	Stop all DMA activity
-	Release DMA buffers (both streaming and consistent)
-	Unregister from other subsystems (e.g. scsi or netdev)
-	Disable device from responding to MMIO/IO Port addresses
-	Release MMIO/IO Port resource(s)
-
-
-4.1 Stop IRQs on the device
-~~~~~~~~~~~~~~~~~~~~~~~~~~~
-How to do this is chip/device specific. If it's not done, it opens
-the possibility of a "screaming interrupt" if (and only if)
-the IRQ is shared with another device.
-
-When the shared IRQ handler is "unhooked", the remaining devices
-using the same IRQ line will still need the IRQ enabled. Thus if the
-"unhooked" device asserts IRQ line, the system will respond assuming
-it was one of the remaining devices asserted the IRQ line. Since none
-of the other devices will handle the IRQ, the system will "hang" until
-it decides the IRQ isn't going to get handled and masks the IRQ (100,000
-iterations later). Once the shared IRQ is masked, the remaining devices
-will stop functioning properly. Not a nice situation.
-
-This is another reason to use MSI or MSI-X if it's available.
-MSI and MSI-X are defined to be exclusive interrupts and thus
-are not susceptible to the "screaming interrupt" problem.
-
-
-4.2 Release the IRQ
-~~~~~~~~~~~~~~~~~~~
-Once the device is quiesced (no more IRQs), one can call free_irq().
-This function will return control once any pending IRQs are handled,
-"unhook" the drivers IRQ handler from that IRQ, and finally release
-the IRQ if no one else is using it.
-
-
-4.3 Stop all DMA activity
-~~~~~~~~~~~~~~~~~~~~~~~~~
-It's extremely important to stop all DMA operations BEFORE attempting
-to deallocate DMA control data. Failure to do so can result in memory
-corruption, hangs, and on some chip-sets a hard crash.
-
-Stopping DMA after stopping the IRQs can avoid races where the
-IRQ handler might restart DMA engines.
-
-While this step sounds obvious and trivial, several "mature" drivers
-didn't get this step right in the past.
-
-
-4.4 Release DMA buffers
-~~~~~~~~~~~~~~~~~~~~~~~
-Once DMA is stopped, clean up streaming DMA first.
-I.e. unmap data buffers and return buffers to "upstream"
-owners if there is one.
-
-Then clean up "consistent" buffers which contain the control data.
-
-See Documentation/DMA-API.txt for details on unmapping interfaces.
-
-
-4.5 Unregister from other subsystems
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Most low level PCI device drivers support some other subsystem
-like USB, ALSA, SCSI, NetDev, Infiniband, etc. Make sure your
-driver isn't losing resources from that other subsystem.
-If this happens, typically the symptom is an Oops (panic) when
-the subsystem attempts to call into a driver that has been unloaded.
-
-
-4.6 Disable Device from responding to MMIO/IO Port addresses
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-io_unmap() MMIO or IO Port resources and then call pci_disable_device().
-This is the symmetric opposite of pci_enable_device().
-Do not access device registers after calling pci_disable_device().
-
-
-4.7 Release MMIO/IO Port Resource(s)
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Call pci_release_region() to mark the MMIO or IO Port range as available.
-Failure to do so usually results in the inability to reload the driver.
-
-
-
-5. How to access PCI config space
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-You can use pci_(read|write)_config_(byte|word|dword) to access the config
-space of a device represented by struct pci_dev *. All these functions return 0
-when successful or an error code (PCIBIOS_...) which can be translated to a text
-string by pcibios_strerror. Most drivers expect that accesses to valid PCI
-devices don't fail.
-
-If you don't have a struct pci_dev available, you can call
-pci_bus_(read|write)_config_(byte|word|dword) to access a given device
-and function on that bus.
-
-If you access fields in the standard portion of the config header, please
-use symbolic names of locations and bits declared in <linux/pci.h>.
-
-If you need to access Extended PCI Capability registers, just call
-pci_find_capability() for the particular capability and it will find the
-corresponding register block for you.
-
-
-
-6. Other interesting functions
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-pci_get_domain_bus_and_slot()	Find pci_dev corresponding to given domain,
-				bus and slot and number. If the device is
-				found, its reference count is increased.
-pci_set_power_state()		Set PCI Power Management state (0=D0 ... 3=D3)
-pci_find_capability()		Find specified capability in device's capability
-				list.
-pci_resource_start()		Returns bus start address for a given PCI region
-pci_resource_end()		Returns bus end address for a given PCI region
-pci_resource_len()		Returns the byte length of a PCI region
-pci_set_drvdata()		Set private driver data pointer for a pci_dev
-pci_get_drvdata()		Return private driver data pointer for a pci_dev
-pci_set_mwi()			Enable Memory-Write-Invalidate transactions.
-pci_clear_mwi()			Disable Memory-Write-Invalidate transactions.
-
-
-
-7. Miscellaneous hints
-~~~~~~~~~~~~~~~~~~~~~~
-
-When displaying PCI device names to the user (for example when a driver wants
-to tell the user what card has it found), please use pci_name(pci_dev).
-
-Always refer to the PCI devices by a pointer to the pci_dev structure.
-All PCI layer functions use this identification and it's the only
-reasonable one. Don't use bus/slot/function numbers except for very
-special purposes -- on systems with multiple primary buses their semantics
-can be pretty complex.
-
-Don't try to turn on Fast Back to Back writes in your driver.  All devices
-on the bus need to be capable of doing it, so this is something which needs
-to be handled by platform and generic code, not individual drivers.
-
-
-
-8. Vendor and device identifications
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Do not add new device or vendor IDs to include/linux/pci_ids.h unless they
-are shared across multiple drivers.  You can add private definitions in
-your driver if they're helpful, or just use plain hex constants.
-
-The device IDs are arbitrary hex numbers (vendor controlled) and normally used
-only in a single location, the pci_device_id table.
-
-Please DO submit new vendor/device IDs to http://pci-ids.ucw.cz/.
-There are mirrors of the pci.ids file at http://pciids.sourceforge.net/
-and https://github.com/pciutils/pciids.
-
-
-
-9. Obsolete functions
-~~~~~~~~~~~~~~~~~~~~~
-
-There are several functions which you might come across when trying to
-port an old driver to the new PCI interface.  They are no longer present
-in the kernel as they aren't compatible with hotplug or PCI domains or
-having sane locking.
-
-pci_find_device()	Superseded by pci_get_device()
-pci_find_subsys()	Superseded by pci_get_subsys()
-pci_find_slot()		Superseded by pci_get_domain_bus_and_slot()
-pci_get_slot()		Superseded by pci_get_domain_bus_and_slot()
-
-
-The alternative is the traditional PCI device driver that walks PCI
-device lists. This is still possible but discouraged.
-
-
-
-10. MMIO Space and "Write Posting"
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Converting a driver from using I/O Port space to using MMIO space
-often requires some additional changes. Specifically, "write posting"
-needs to be handled. Many drivers (e.g. tg3, acenic, sym53c8xx_2)
-already do this. I/O Port space guarantees write transactions reach the PCI
-device before the CPU can continue. Writes to MMIO space allow the CPU
-to continue before the transaction reaches the PCI device. HW weenies
-call this "Write Posting" because the write completion is "posted" to
-the CPU before the transaction has reached its destination.
-
-Thus, timing sensitive code should add readl() where the CPU is
-expected to wait before doing other work.  The classic "bit banging"
-sequence works fine for I/O Port space:
-
-       for (i = 8; --i; val >>= 1) {
-               outb(val & 1, ioport_reg);      /* write bit */
-               udelay(10);
-       }
-
-The same sequence for MMIO space should be:
-
-       for (i = 8; --i; val >>= 1) {
-               writeb(val & 1, mmio_reg);      /* write bit */
-               readb(safe_mmio_reg);           /* flush posted write */
-               udelay(10);
-       }
-
-It is important that "safe_mmio_reg" not have any side effects that
-interferes with the correct operation of the device.
-
-Another case to watch out for is when resetting a PCI device. Use PCI
-Configuration space reads to flush the writel(). This will gracefully
-handle the PCI master abort on all platforms if the PCI device is
-expected to not respond to a readl().  Most x86 platforms will allow
-MMIO reads to master abort (a.k.a. "Soft Fail") and return garbage
-(e.g. ~0). But many RISC platforms will crash (a.k.a."Hard Fail").
-
diff --git a/include/linux/mod_devicetable.h b/include/linux/mod_devicetable.h
index 448621c32e4d..664c0fb1d53d 100644
--- a/include/linux/mod_devicetable.h
+++ b/include/linux/mod_devicetable.h
@@ -16,6 +16,25 @@ typedef unsigned long kernel_ulong_t;
 
 #define PCI_ANY_ID (~0)
 
+/**
+ * struct pci_device_id - PCI device ID structure
+ * @vendor:		Vendor ID to match (or PCI_ANY_ID)
+ * @device:		Device ID to match (or PCI_ANY_ID)
+ * @subvendor:		Subsystem vendor ID to match (or PCI_ANY_ID)
+ * @subdevice:		Subsystem device ID to match (or PCI_ANY_ID)
+ * @class:		Device class, subclass, and "interface" to match.
+ *			See Appendix D of the PCI Local Bus Spec or
+ *			include/linux/pci_ids.h for a full list of classes.
+ *			Most drivers do not need to specify class/class_mask
+ *			as vendor/device is normally sufficient.
+ * @class_mask:		Limit which sub-fields of the class field are compared.
+ *			See drivers/scsi/sym53c8xx_2/ for example of usage.
+ * @driver_data:	Data private to the driver.
+ *			Most drivers don't need to use driver_data field.
+ *			Best practice is to use driver_data as an index
+ *			into a static list of equivalent device types,
+ *			instead of using it as a pointer.
+ */
 struct pci_device_id {
 	__u32 vendor, device;		/* Vendor and device ID or PCI_ANY_ID*/
 	__u32 subvendor, subdevice;	/* Subsystem ID's or PCI_ANY_ID */
@@ -257,17 +276,17 @@ struct pcmcia_device_id {
 	__u16		match_flags;
 
 	__u16		manf_id;
-	__u16 		card_id;
+	__u16		card_id;
 
-	__u8  		func_id;
+	__u8		func_id;
 
 	/* for real multi-function devices */
-	__u8  		function;
+	__u8		function;
 
 	/* for pseudo multi-function devices */
-	__u8  		device_no;
+	__u8		device_no;
 
-	__u32 		prod_id_hash[4];
+	__u32		prod_id_hash[4];
 
 	/* not matched against in kernelspace */
 	const char *	prod_id[4];
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 4a5a84d7bdd4..b74b2a4e6df2 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -151,6 +151,8 @@ static inline const char *pci_power_name(pci_power_t state)
 #define PCI_PM_BUS_WAIT		50
 
 /**
+ * typedef pci_channel_state_t
+ *
  * The pci_channel state describes connectivity between the CPU and
  * the PCI device.  If some PCI bus between here and the PCI device
  * has crashed or locked up, this info is reflected here.
@@ -775,6 +777,50 @@ struct pci_error_handlers {
 
 
 struct module;
+
+/**
+ * struct pci_driver - PCI driver structure
+ * @node:	List of driver structures.
+ * @name:	Driver name.
+ * @id_table:	Pointer to table of device IDs the driver is
+ *		interested in.  Most drivers should export this
+ *		table using MODULE_DEVICE_TABLE(pci,...).
+ * @probe:	This probing function gets called (during execution
+ *		of pci_register_driver() for already existing
+ *		devices or later if a new device gets inserted) for
+ *		all PCI devices which match the ID table and are not
+ *		"owned" by the other drivers yet. This function gets
+ *		passed a "struct pci_dev \*" for each device whose
+ *		entry in the ID table matches the device. The probe
+ *		function returns zero when the driver chooses to
+ *		take "ownership" of the device or an error code
+ *		(negative number) otherwise.
+ *		The probe function always gets called from process
+ *		context, so it can sleep.
+ * @remove:	The remove() function gets called whenever a device
+ *		being handled by this driver is removed (either during
+ *		deregistration of the driver or when it's manually
+ *		pulled out of a hot-pluggable slot).
+ *		The remove function always gets called from process
+ *		context, so it can sleep.
+ * @suspend:	Put device into low power state.
+ * @suspend_late: Put device into low power state.
+ * @resume_early: Wake device from low power state.
+ * @resume:	Wake device from low power state.
+ *		(Please see Documentation/power/pci.txt for descriptions
+ *		of PCI Power Management and the related functions.)
+ * @shutdown:	Hook into reboot_notifier_list (kernel/sys.c).
+ *		Intended to stop any idling DMA operations.
+ *		Useful for enabling wake-on-lan (NIC) or changing
+ *		the power state of a device before reboot.
+ *		e.g. drivers/net/e100.c.
+ * @sriov_configure: Optional driver callback to allow configuration of
+ *		number of VFs to enable via sysfs "sriov_numvfs" file.
+ * @err_handler: See Documentation/PCI/pci-error-recovery.rst
+ * @groups:	Sysfs attribute groups.
+ * @driver:	Driver model structure.
+ * @dynids:	List of dynamically added device IDs.
+ */
 struct pci_driver {
 	struct list_head	node;
 	const char		*name;
@@ -2206,7 +2252,7 @@ static inline u8 pci_vpd_srdt_tag(const u8 *srdt)
 
 /**
  * pci_vpd_info_field_size - Extracts the information field length
- * @lrdt: Pointer to the beginning of an information field header
+ * @info_field: Pointer to the beginning of an information field header
  *
  * Returns the extracted information field length.
  */
-- 
cgit v1.2.3


From 9c3c0c2048149d946d7f3ebdcbe70e2946750bfb Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Thu, 18 Apr 2019 22:43:36 +0200
Subject: isdn: remove isdn4linux

With all isdn4linux hardware drivers gone, this is only a wrapper around
CAPI to support old user space. However, from looking at the mailing
list, it seems that the last time anyone asked about it was in 2014,
when the upgrade from a linux-2.4 installation failed, and mISDN was
suggested as a replacement.

The largest public ISDN network (Deutsche Telekom) was supposed to be
shut down in 2018, which must have drastically reduced the number of legacy
installations.

When we last discussed removing i4l in 2016, Karsten Keil suggested
revisiting this in 2018. I guess this is overdue.

Link: http://listserv.isdn4linux.de/pipermail/isdn4linux/2014-October/006165.html
Link: https://patchwork.kernel.org/patch/8484861/#17900371
Link: https://listserv.isdn4linux.de/pipermail/isdn4linux/2019-April/thread.html
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
---
 Documentation/isdn/INTERFACE        |  759 -------
 Documentation/isdn/INTERFACE.fax    |  163 --
 Documentation/isdn/README           |  599 ------
 Documentation/isdn/README.FAQ       |   26 -
 Documentation/isdn/README.audio     |  138 --
 Documentation/isdn/README.concap    |  259 ---
 Documentation/isdn/README.diversion |  127 --
 Documentation/isdn/README.fax       |   45 -
 Documentation/isdn/README.hfc-pci   |   41 -
 Documentation/isdn/README.syncppp   |   58 -
 Documentation/isdn/README.x25       |  184 --
 Documentation/isdn/syncPPP.FAQ      |  224 ---
 Documentation/process/changes.rst   |   16 +-
 MAINTAINERS                         |    2 -
 drivers/isdn/Kconfig                |   26 -
 drivers/isdn/Makefile               |    2 -
 drivers/isdn/capi/Kconfig           |    9 -
 drivers/isdn/capi/capidrv.c         | 2525 -----------------------
 drivers/isdn/capi/capidrv.h         |  140 --
 drivers/isdn/divert/Makefile        |   10 -
 drivers/isdn/divert/divert_init.c   |   82 -
 drivers/isdn/divert/divert_procfs.c |  336 ----
 drivers/isdn/divert/isdn_divert.c   |  846 --------
 drivers/isdn/divert/isdn_divert.h   |  132 --
 drivers/isdn/i4l/Kconfig            |  127 --
 drivers/isdn/i4l/Makefile           |   14 -
 drivers/isdn/i4l/isdn_audio.c       |  711 -------
 drivers/isdn/i4l/isdn_audio.h       |   44 -
 drivers/isdn/i4l/isdn_bsdcomp.c     |  930 ---------
 drivers/isdn/i4l/isdn_common.c      | 2368 ----------------------
 drivers/isdn/i4l/isdn_common.h      |   47 -
 drivers/isdn/i4l/isdn_concap.c      |   99 -
 drivers/isdn/i4l/isdn_concap.h      |   11 -
 drivers/isdn/i4l/isdn_net.c         | 3198 -----------------------------
 drivers/isdn/i4l/isdn_net.h         |  151 --
 drivers/isdn/i4l/isdn_ppp.c         | 3046 ----------------------------
 drivers/isdn/i4l/isdn_ppp.h         |   41 -
 drivers/isdn/i4l/isdn_tty.c         | 3756 -----------------------------------
 drivers/isdn/i4l/isdn_tty.h         |  120 --
 drivers/isdn/i4l/isdn_ttyfax.c      | 1123 -----------
 drivers/isdn/i4l/isdn_ttyfax.h      |   17 -
 drivers/isdn/i4l/isdn_v110.c        |  625 ------
 drivers/isdn/i4l/isdn_v110.h        |   29 -
 drivers/isdn/i4l/isdn_x25iface.c    |  332 ----
 drivers/isdn/i4l/isdn_x25iface.h    |   30 -
 drivers/isdn/isdnloop/Makefile      |    6 -
 drivers/isdn/isdnloop/isdnloop.c    | 1528 --------------
 drivers/isdn/isdnloop/isdnloop.h    |  112 --
 include/linux/concap.h              |  112 --
 include/linux/isdn.h                |  473 -----
 include/linux/isdn_divertif.h       |   35 -
 include/linux/isdn_ppp.h            |  194 --
 include/linux/isdnif.h              |  505 -----
 include/linux/wanrouter.h           |   11 -
 include/uapi/linux/isdn.h           |  144 --
 include/uapi/linux/isdn_divertif.h  |   31 -
 include/uapi/linux/isdn_ppp.h       |   68 -
 include/uapi/linux/isdnif.h         |   57 -
 include/uapi/linux/wanrouter.h      |   18 -
 59 files changed, 2 insertions(+), 26860 deletions(-)
 delete mode 100644 Documentation/isdn/INTERFACE
 delete mode 100644 Documentation/isdn/INTERFACE.fax
 delete mode 100644 Documentation/isdn/README
 delete mode 100644 Documentation/isdn/README.FAQ
 delete mode 100644 Documentation/isdn/README.audio
 delete mode 100644 Documentation/isdn/README.concap
 delete mode 100644 Documentation/isdn/README.diversion
 delete mode 100644 Documentation/isdn/README.fax
 delete mode 100644 Documentation/isdn/README.hfc-pci
 delete mode 100644 Documentation/isdn/README.syncppp
 delete mode 100644 Documentation/isdn/README.x25
 delete mode 100644 Documentation/isdn/syncPPP.FAQ
 delete mode 100644 drivers/isdn/capi/capidrv.c
 delete mode 100644 drivers/isdn/capi/capidrv.h
 delete mode 100644 drivers/isdn/divert/Makefile
 delete mode 100644 drivers/isdn/divert/divert_init.c
 delete mode 100644 drivers/isdn/divert/divert_procfs.c
 delete mode 100644 drivers/isdn/divert/isdn_divert.c
 delete mode 100644 drivers/isdn/divert/isdn_divert.h
 delete mode 100644 drivers/isdn/i4l/Kconfig
 delete mode 100644 drivers/isdn/i4l/isdn_audio.c
 delete mode 100644 drivers/isdn/i4l/isdn_audio.h
 delete mode 100644 drivers/isdn/i4l/isdn_bsdcomp.c
 delete mode 100644 drivers/isdn/i4l/isdn_common.c
 delete mode 100644 drivers/isdn/i4l/isdn_common.h
 delete mode 100644 drivers/isdn/i4l/isdn_concap.c
 delete mode 100644 drivers/isdn/i4l/isdn_concap.h
 delete mode 100644 drivers/isdn/i4l/isdn_net.c
 delete mode 100644 drivers/isdn/i4l/isdn_net.h
 delete mode 100644 drivers/isdn/i4l/isdn_ppp.c
 delete mode 100644 drivers/isdn/i4l/isdn_ppp.h
 delete mode 100644 drivers/isdn/i4l/isdn_tty.c
 delete mode 100644 drivers/isdn/i4l/isdn_tty.h
 delete mode 100644 drivers/isdn/i4l/isdn_ttyfax.c
 delete mode 100644 drivers/isdn/i4l/isdn_ttyfax.h
 delete mode 100644 drivers/isdn/i4l/isdn_v110.c
 delete mode 100644 drivers/isdn/i4l/isdn_v110.h
 delete mode 100644 drivers/isdn/i4l/isdn_x25iface.c
 delete mode 100644 drivers/isdn/i4l/isdn_x25iface.h
 delete mode 100644 drivers/isdn/isdnloop/Makefile
 delete mode 100644 drivers/isdn/isdnloop/isdnloop.c
 delete mode 100644 drivers/isdn/isdnloop/isdnloop.h
 delete mode 100644 include/linux/concap.h
 delete mode 100644 include/linux/isdn.h
 delete mode 100644 include/linux/isdn_divertif.h
 delete mode 100644 include/linux/isdn_ppp.h
 delete mode 100644 include/linux/isdnif.h
 delete mode 100644 include/linux/wanrouter.h
 delete mode 100644 include/uapi/linux/isdn.h
 delete mode 100644 include/uapi/linux/isdn_divertif.h
 delete mode 100644 include/uapi/linux/isdn_ppp.h
 delete mode 100644 include/uapi/linux/isdnif.h
 delete mode 100644 include/uapi/linux/wanrouter.h

(limited to 'include/linux')

diff --git a/Documentation/isdn/INTERFACE b/Documentation/isdn/INTERFACE
deleted file mode 100644
index 5df17e5b25c8..000000000000
--- a/Documentation/isdn/INTERFACE
+++ /dev/null
@@ -1,759 +0,0 @@
-$Id: INTERFACE,v 1.15.8.2 2001/03/13 16:17:07 kai Exp $
-
-Description of the Interface between Linklevel and Hardwarelevel
-  of isdn4linux:
-
-
-  The Communication between Linklevel (LL) and Hardwarelevel (HL)
-  is based on the struct isdn_if (defined in isdnif.h).
-
-  An HL-driver can register itself at LL by calling the function
-  register_isdn() with a pointer to that struct. Prior to that, it has
-  to preset some of the fields of isdn_if. The LL sets the rest of
-  the fields. All further communication is done via callbacks using
-  the function-pointers defined in isdn_if.
-
-  Changes/Version numbering:
-
-  During development of the ISDN subsystem, several changes have been
-  made to the interface. Before it went into kernel, the package
-  had a unique version number. The last version, distributed separately
-  was 0.7.4. When the subsystem went into kernel, every functional unit
-  got a separate version number. These numbers are shown at initialization,
-  separated by slashes:
-
-     c.c/t.t/n.n/p.p/a.a/v.v
-
-  where
-
-   c.c is the revision of the common code.
-   t.t is the revision of the tty related code.
-   n.n is the revision of the network related code.
-   p.p is the revision of the ppp related code.
-   a.a is the revision of the audio related code.
-   v.v is the revision of the V.110 related code.
-
-  Changes in this document are marked with '***CHANGEx' where x represents
-  the version number. If that number starts with 0, it refers to the old,
-  separately distributed package. If it starts with one of the letters
-  above, it refers to the revision of the corresponding module. 
-  ***CHANGEIx refers to the revision number of the isdnif.h  
-
-1. Description of the fields of isdn_if:
-
-  int channels;
-
-    This field has to be set by the HL-driver to the number of channels
-    supported prior to calling register_isdn(). Upon return of the call,
-    the LL puts an id there, which has to be used by the HL-driver when
-    invoking the other callbacks.
-
-  int maxbufsize;
-
-    ***CHANGE0.6: New since this version.
-
-    Also to be preset by the HL-driver. With this value the HL-driver
-    tells the LL the maximum size of a data-packet it will accept. 
-
-  unsigned long features;
-
-    To be preset by the HL-driver. Using this field, the HL-driver
-    announces the features supported. At the moment this is limited to
-    report the supported layer2 and layer3-protocols. For setting this
-    field the constants ISDN_FEATURE..., declared in isdnif.h have to be
-    used.
-
-    ***CHANGE0.7.1: The line type (1TR6, EDSS1) has to be set.
-
-  unsigned short hl_hdrlen;
-
-    ***CHANGE0.7.4: New field.
-
-    To be preset by the HL-driver, if it supports sk_buff's. The driver
-    should put here the amount of additional space needed in sk_buff's for
-    its internal purposes. Drivers not supporting sk_buff's should 
-    initialize this field to 0.
-
-  void (*rcvcallb_skb)(int, int, struct sk_buff *)
-
-    ***CHANGE0.7.4: New field.
-
-    This field will be set by LL. The HL-driver delivers received data-
-    packets by calling this function. Upon calling, the HL-driver must
-    already have its private data pulled off the head of the sk_buff.
-
-    Parameter:
-      int              driver-Id
-      int              Channel-number locally to the driver. (starting with 0)
-      struct sk_buff * Pointer to sk_buff, containing received data.
-
-  int (*statcallb)(isdn_ctrl*);
-
-    This field will be set by LL. This function has to be called by the
-    HL-driver for signaling status-changes or other events to the LL.
-
-    Parameter:
-      isdn_ctrl*
-
-      The struct isdn_ctrl also defined in isdn_if. The exact meanings of its
-      fields are described together with the descriptions of the possible
-      events. Here is only a short description of the fields:
-
-        driver  = driver Id.
-        command = event-type. (one of the constants ISDN_STAT_...)
-        arg     = depends on event-type.
-        num     = depends on event-type.
-
-    Returnvalue:
-      0 on success, else -1
-
-  int (*command)(isdn_ctrl*);
-
-    This field has to be preset by the HL-driver. It points to a function,
-    to be called by LL to perform functions like dialing, B-channel
-    setup, etc. The exact meaning of the parameters is described with the
-    descriptions of the possible commands.
-
-    Parameter:
-      isdn_ctrl*
-        driver  = driver-Id
-        command = command to perform. (one of the constants ISDN_CMD_...)
-        arg     = depends on command.
-        num     = depends on command.
-    
-    Returnvalue:
-      >=0 on success, else error-code (-ENODEV etc.)
-
-  int (*writebuf_skb)(int, int, int, struct sk_buff *)
-
-    ***CHANGE0.7.4: New field.
-    ***CHANGEI.1.21: New field.
-
-    This field has to be preset by the HL-driver. The given function will
-    be called by the LL for delivering data to be send via B-Channel.
-
- 
-    Parameter:
-      int              driver-Id ***CHANGE0.7.4: New parameter.
-      int              channel-number locally to the HL-driver. (starts with 0)
-      int	       ack ***ChangeI1.21: New parameter
-		       If this is !0, the driver has to signal the delivery
-		       by sending an ISDN_STAT_BSENT. If this is 0, the driver
-		       MUST NOT send an ISDN_STAT_BSENT.
-      struct sk_buff * Pointer to sk_buff containing data to be send via
-                       B-channel.
-
-    Returnvalue:
-      Length of data accepted on success, else error-code (-EINVAL on
-      oversized packets etc.)
-
-  int (*writecmd)(u_char*, int, int, int, int);
-
-    This field has to be preset by the HL-driver. The given function will be
-    called to perform write-requests on /dev/isdnctrl (i.e. sending commands
-    to the card) The data-format is hardware-specific. This function is
-    intended for debugging only. It is not necessary for normal operation
-    and never will be called by the tty-emulation- or network-code. If
-    this function is not supported, the driver has to set NULL here.
-
-    Parameter:
-      u_char* pointer to data.
-      int     length of data.
-      int     flag: 0 = call from within kernel-space. (HL-driver must use
-                        memcpy, may NOT use schedule())
-                    1 = call from user-space. (HL-driver must use
-                        memcpy_fromfs, use of schedule() allowed)
-      int     driver-Id.
-      int     channel-number locally to the HL-driver. (starts with 0)
-
-***CHANGEI1.14: The driver-Id and channel-number are new since this revision.
-
-    Returnvalue:
-      Length of data accepted on success, else error-code (-EINVAL etc.)
-
-  int (*readstat)(u_char*, int, int, int, int);
-
-    This field has to be preset by the HL-driver. The given function will be
-    called to perform read-requests on /dev/isdnctrl (i.e. reading replies
-    from the card) The data-format is hardware-specific. This function is
-    intended for debugging only. It is not necessary for normal operation
-    and never will be called by the tty-emulation- or network-code. If
-    this function is not supported, the driver has to set NULL here.
-
-    Parameter:
-      u_char* pointer to data.
-      int     length of data.
-      int     flag: 0 = call from within kernel-space. (HL-driver must use
-                        memcpy, may NOT use schedule())
-                    1 = call from user-space. (HL-driver must use
-                        memcpy_fromfs, use of schedule() allowed)
-      int     driver-Id.
-      int     channel-number locally to the HL-driver. (starts with 0)
-
-***CHANGEI1.14: The driver-Id and channel-number are new since this revision.
-
-    Returnvalue:
-      Length of data on success, else error-code (-EINVAL etc.)
-
-  char id[20];
-       ***CHANGE0.7: New since this version.
-
-   This string has to be preset by the HL-driver. Its purpose is for
-   identification of the driver by the user. Eg.: it is shown in the
-   status-info of /dev/isdninfo. Furthermore it is used as Id for binding
-   net-interfaces to a specific channel. If a string of length zero is
-   given, upon return, isdn4linux will replace it by a generic name. (line0,
-   line1 etc.) It is recommended to make this string configurable during
-   module-load-time. (copy a global variable to this string.) For doing that,
-   modules 1.2.8 or newer are necessary.
-
-2. Description of the commands, a HL-driver has to support:
-
-   All commands will be performed by calling the function command() described
-   above from within the LL. The field command of the struct-parameter will
-   contain the desired command, the field driver is always set to the
-   appropriate driver-Id.
-
-   Until now, the following commands are defined:
-
-***CHANGEI1.34: The parameter "num" has been replaced by a union "parm" containing
-                the old "num" and a new setup_type struct used for ISDN_CMD_DIAL
-                and ISDN_STAT_ICALL callback.
-
-   ISDN_CMD_IOCTL:
-
-     This command is intended for performing ioctl-calls for configuring
-     hardware or similar purposes (setting port-addresses, loading firmware
-     etc.) For this purpose, in the LL all ioctl-calls with an argument
-     >= IIOCDRVCTL (0x100) will be handed transparently to this
-     function after subtracting 0x100 and placing the result in arg.
-     Example:
-       If a userlevel-program calls ioctl(0x101,...) the function gets
-       called with the field command set to 1.
-
-     Parameter:
-       driver   = driver-Id.
-       command  = ISDN_CMD_IOCTL
-       arg      = Original ioctl-cmd - IIOCDRVCTL
-       parm.num = first bytes filled with (unsigned long)arg
-   
-     Returnvalue:
-       Depending on driver.
-
-  
-  ISDN_CMD_DIAL:
-
-    This command is used to tell the HL-driver it should dial a given
-    number.
-
-    Parameter:
-      driver      = driver-Id.
-      command     = ISDN_CMD_DIAL
-      arg         = channel-number locally to the driver. (starting with 0)
-      
-      parm.setup.phone  = An ASCII-String containing the number to dial.
-      parm.setup.eazmsn = An ASCII-String containing the own EAZ or MSN.
-      parm.setup.si1    = The Service-Indicator.
-      parm.setup.si2    = Additional Service-Indicator.
-
-                    If the Line has been designed as SPV (a special german
-                    feature, meaning semi-leased-line) the phone has to
-                    start with an "S".
-      ***CHANGE0.6: In previous versions the EAZ has been given in the
-                    highbyte of arg.
-    ***CHANGE0.7.1: New since this version: ServiceIndicator and AddInfo.
-
-  ISDN_CMD_ACCEPTD:
-
-    With this command, the HL-driver is told to accept a D-Channel-setup.
-    (Response to an incoming call)
-
-    Parameter:
-      driver      = driver-Id.
-      command     = ISDN_CMD_ACCEPTD
-      arg         = channel-number locally to the driver. (starting with 0)
-      parm        = unused.
-
-  ISDN_CMD_ACCEPTB:
-
-    With this command, the HL-driver is told to perform a B-Channel-setup.
-    (after establishing D-Channel-Connection)
-
-    Parameter:
-      driver      = driver-Id.
-      command     = ISDN_CMD_ACCEPTB
-      arg         = channel-number locally to the driver. (starting with 0)
-      parm        = unused.
-
-  ISDN_CMD_HANGUP:
-
-    With this command, the HL-driver is told to hangup (B-Channel if
-    established first, then D-Channel). This command is also used for
-    actively rejecting an incoming call.
-
-    Parameter:
-      driver      = driver-Id.
-      command     = ISDN_CMD_HANGUP
-      arg         = channel-number locally to the driver. (starting with 0)
-      parm        = unused.
-
-  ISDN_CMD_CLREAZ:
-
-    With this command, the HL-driver is told not to signal incoming
-    calls to the LL.
-
-    Parameter:
-      driver      = driver-Id.
-      command     = ISDN_CMD_CLREAZ
-      arg         = channel-number locally to the driver. (starting with 0)
-      parm        = unused.
-
-  ISDN_CMD_SETEAZ:
-
-    With this command, the HL-driver is told to signal incoming calls for
-    the given EAZs/MSNs to the LL.
-
-    Parameter:
-      driver      = driver-Id.
-      command     = ISDN_CMD_SETEAZ
-      arg         = channel-number locally to the driver. (starting with 0)
-      parm.num    = ASCII-String, containing the desired EAZ's/MSN's
-                    (comma-separated). If an empty String is given, the
-                    HL-driver should respond to ALL incoming calls,
-                    regardless of the destination-address.
-      ***CHANGE0.6: New since this version the "empty-string"-feature.
-
-  ISDN_CMD_GETEAZ: (currently unused)
-
-    With this command, the HL-driver is told to report the current setting
-    given with ISDN_CMD_SETEAZ.
-
-    Parameter:
-      driver      = driver-Id.
-      command     = ISDN_CMD_GETEAZ
-      arg         = channel-number locally to the driver. (starting with 0)
-      parm.num    = ASCII-String, containing the current EAZ's/MSN's
-
-  ISDN_CMD_SETSIL: (currently unused)
-
-    With this command, the HL-driver is told to signal only incoming
-    calls with the given Service-Indicators.
-
-    Parameter:
-      driver      = driver-Id.
-      command     = ISDN_CMD_SETSIL
-      arg         = channel-number locally to the driver. (starting with 0)
-      parm.num    = ASCII-String, containing the desired Service-Indicators.
-
-  ISDN_CMD_GETSIL: (currently unused)
-
-    With this command, the HL-driver is told to return the current
-    Service-Indicators it will respond to.
-
-    Parameter:
-      driver      = driver-Id.
-      command     = ISDN_CMD_GETSIL
-      arg         = channel-number locally to the driver. (starting with 0)
-      parm.num    = ASCII-String, containing the current Service-Indicators.
-
-  ISDN_CMD_SETL2:
-
-    With this command, the HL-driver is told to select the given Layer-2-
-    protocol. This command is issued by the LL prior to ISDN_CMD_DIAL or
-    ISDN_CMD_ACCEPTD.
-
-
-    Parameter:
-      driver      = driver-Id.
-      command     = ISDN_CMD_SETL2
-      arg         = channel-number locally to the driver. (starting with 0)
-                    logical or'ed with (protocol-Id << 8)
-                    protocol-Id is one of the constants ISDN_PROTO_L2...
-      parm        = unused.
-
-  ISDN_CMD_GETL2: (currently unused)
-
-    With this command, the HL-driver is told to return the current
-    setting of the Layer-2-protocol.
-
-    Parameter:
-      driver      = driver-Id.
-      command     = ISDN_CMD_GETL2
-      arg         = channel-number locally to the driver. (starting with 0)
-      parm        = unused.
-    Returnvalue:
-      current protocol-Id (one of the constants ISDN_L2_PROTO)
-
-  ISDN_CMD_SETL3:
-
-    With this command, the HL-driver is told to select the given Layer-3-
-    protocol. This command is issued by the LL prior to ISDN_CMD_DIAL or
-    ISDN_CMD_ACCEPTD.
-
-
-    Parameter:
-      driver      = driver-Id.
-      command     = ISDN_CMD_SETL3
-      arg         = channel-number locally to the driver. (starting with 0)
-                    logical or'ed with (protocol-Id << 8)
-                    protocol-Id is one of the constants ISDN_PROTO_L3...
-      parm.fax    = Pointer to T30_s fax struct. (fax usage only)
-
-  ISDN_CMD_GETL3: (currently unused)
-
-    With this command, the HL-driver is told to return the current
-    setting of the Layer-3-protocol.
-
-    Parameter:
-      driver      = driver-Id.
-      command     = ISDN_CMD_GETL3
-      arg         = channel-number locally to the driver. (starting with 0)
-      parm        = unused.
-    Returnvalue:
-      current protocol-Id (one of the constants ISDN_L3_PROTO)
-
-  ISDN_CMD_PROCEED: 
-
-    With this command, the HL-driver is told to proceed with an incoming call.
-
-    Parameter:
-      driver      = driver-Id.
-      command     = ISDN_CMD_PROCEED
-      arg         = channel-number locally to the driver. (starting with 0)
-      setup.eazmsn= empty string or string send as uus1 in DSS1 with 
-                    PROCEED message
-
-  ISDN_CMD_ALERT: 
-
-    With this command, the HL-driver is told to alert a proceeding call.
-
-    Parameter:
-      driver      = driver-Id.
-      command     = ISDN_CMD_ALERT
-      arg         = channel-number locally to the driver. (starting with 0)
-      setup.eazmsn= empty string or string send as uus1 in DSS1 with 
-                    ALERT message
-
-  ISDN_CMD_REDIR: 
-
-    With this command, the HL-driver is told to redirect a call in proceeding
-    or alerting state.  
-
-    Parameter:
-      driver      = driver-Id.
-      command     = ISDN_CMD_REDIR
-      arg         = channel-number locally to the driver. (starting with 0)
-      setup.eazmsn= empty string or string send as uus1 in DSS1 protocol
-      setup.screen= screening indicator
-      setup.phone = redirected to party number
-
-  ISDN_CMD_PROT_IO:
-
-    With this call, the LL-driver invokes protocol specific features through
-    the LL.
-    The call is not implicitly bound to a connection.
-
-    Parameter:
-      driver      = driver-Id
-      command     = ISDN_CMD_PROT_IO
-      arg         = The lower 8 Bits define the addressed protocol as defined
-                    in ISDN_PTYPE..., the upper bits are used to differentiate
-                    the protocol specific CMD.  
-      
-      para        = protocol and function specific. See isdnif.h for detail.
-
-
-  ISDN_CMD_FAXCMD:
-
-    With this command the HL-driver receives a fax sub-command.
-    For details refer to INTERFACE.fax
-
-    Parameter:
-      driver      = driver-Id.
-      command     = ISDN_CMD_FAXCMD
-      arg         = channel-number locally to the driver. (starting with 0)
-      parm        = unused.
-
-
-3. Description of the events to be signaled by the HL-driver to the LL.
-
-  All status-changes are signaled via calling the previously described
-  function statcallb(). The field command of the struct isdn_cmd has
-  to be set by the HL-driver with the appropriate Status-Id (event-number).
-  The field arg has to be set to the channel-number (locally to the driver,
-  starting with 0) to which this event applies. (Exception: STAVAIL-event)
-
-  Until now, the following Status-Ids are defined:
-
-  ISDN_STAT_STAVAIL:
-
-    With this call, the HL-driver signals the availability of new data
-    for readstat(). Used only for debugging-purposes, see description
-    of readstat().
-
-    Parameter:
-      driver      = driver-Id
-      command     = ISDN_STAT_STAVAIL
-      arg         = length of available data.
-      parm        = unused.
-
-  ISDN_STAT_ICALL:
-  ISDN_STAT_ICALLW:
-
-    With this call, the HL-driver signals an incoming call to the LL.
-    If ICALLW is signalled the incoming call is a waiting call without
-    an available B-chan.
-
-    Parameter:
-      driver            = driver-Id
-      command           = ISDN_STAT_ICALL
-      arg               = channel-number, locally to the driver. (starting with 0)
-      para.setup.phone  = Callernumber.
-      para.setup.eazmsn = CalledNumber.
-      para.setup.si1    = Service Indicator.
-      para.setup.si2    = Additional Service Indicator.
-      para.setup.plan   = octet 3 from Calling party number Information Element.
-      para.setup.screen = octet 3a from Calling party number Information Element.
-
-    Return:
-      0           = No device matching this call.
-      1           = At least one device matching this call (RING on ttyI).
-                    HL-driver may send ALERTING on the D-channel in this case.
-      2           = Call will be rejected.
-      3           = Incoming called party number is currently incomplete.
-                    Additional digits are required. 
-                    Used for signalling with PtP connections.
-      4	          = Call will be held in a proceeding state 
-                    (HL driver sends PROCEEDING)
-                    Used when a user space prog needs time to interpret a call
-		    para.setup.eazmsn may be filled with an uus1 message of
-		    30 octets maximum. Empty string if no uus. 
-      5           = Call will be actively deflected to another party
-                    Only available in DSS1/EURO protocol
-		    para.setup.phone must be set to destination party number
-		    para.setup.eazmsn may be filled with an uus1 message of
-		    30 octets maximum. Empty string if no uus. 
-      -1          = An error happened. (Invalid parameters for example.)
-  The keypad support now is included in the dial command.	        
-
-
-  ISDN_STAT_RUN:
-
-    With this call, the HL-driver signals availability of the ISDN-card.
-    (after initializing, loading firmware)
-
-    Parameter:
-      driver      = driver-Id
-      command     = ISDN_STAT_RUN
-      arg         = unused.
-      parm        = unused.
-
-  ISDN_STAT_STOP:
-
-    With this call, the HL-driver signals unavailability of the ISDN-card.
-    (before unloading, while resetting/reconfiguring the card)
-
-    Parameter:
-      driver      = driver-Id
-      command     = ISDN_STAT_STOP
-      arg         = unused.
-      parm        = unused.
-
-  ISDN_STAT_DCONN:
-
-   With this call, the HL-driver signals the successful establishment of
-   a D-Channel-connection. (Response to ISDN_CMD_ACCEPTD or ISDN_CMD_DIAL)
-
-    Parameter:
-      driver      = driver-Id
-      command     = ISDN_STAT_DCONN
-      arg         = channel-number, locally to the driver. (starting with 0)
-      parm        = unused.
-
-  ISDN_STAT_BCONN:
-
-   With this call, the HL-driver signals the successful establishment of
-   a B-Channel-connection. (Response to ISDN_CMD_ACCEPTB or because the
-   remote-station has initiated establishment)
-
-   The HL driver should call this when the logical l2/l3 protocol 
-   connection on top of the physical B-channel is established.
-
-    Parameter:
-      driver      = driver-Id
-      command     = ISDN_STAT_BCONN
-      arg         = channel-number, locally to the driver. (starting with 0)
-      parm.num    = ASCII-String, containing type of connection (for analog
-		    modem only). This will be appended to the CONNECT message
-		    e.g. 14400/V.32bis
-
-  ISDN_STAT_DHUP:
-
-   With this call, the HL-driver signals the shutdown of a
-   D-Channel-connection. This could be a response to a prior ISDN_CMD_HANGUP,
-   or caused by a remote-hangup or if the remote-station has actively
-   rejected a call.
-
-    Parameter:
-      driver      = driver-Id
-      command     = ISDN_STAT_DHUP
-      arg         = channel-number, locally to the driver. (starting with 0)
-      parm        = unused.
-
-  ISDN_STAT_BHUP:
-
-   With this call, the HL-driver signals the shutdown of a
-   B-Channel-connection. This could be a response to a prior ISDN_CMD_HANGUP,
-   or caused by a remote-hangup.
-
-   The HL driver should call this as soon as the logical l2/l3 protocol 
-   connection on top of the physical B-channel is released.
-
-    Parameter:
-      driver      = driver-Id
-      command     = ISDN_STAT_BHUP
-      arg         = channel-number, locally to the driver. (starting with 0)
-      parm        = unused.
-
-  ISDN_STAT_CINF:
-
-   With this call, the HL-driver delivers charge-unit information to the
-   LL.
-
-    Parameter:
-      driver      = driver-Id
-      command     = ISDN_STAT_CINF
-      arg         = channel-number, locally to the driver. (starting with 0)
-      parm.num    = ASCII string containing charge-units (digits only).
-
-  ISDN_STAT_LOAD: (currently unused)
-
-  ISDN_STAT_UNLOAD:
-
-   With this call, the HL-driver signals that it will be unloaded now. This
-   tells the LL to release all corresponding data-structures.
-
-    Parameter:
-      driver      = driver-Id
-      command     = ISDN_STAT_UNLOAD
-      arg         = unused.
-      parm        = unused.
-
-  ISDN_STAT_BSENT:
-
-    With this call the HL-driver signals the delivery of a data-packet.
-    This callback is used by the network-interfaces only, tty-Emulation
-    does not need this call.
-
-    Parameter:
-      driver      = driver-Id
-      command     = ISDN_STAT_BSENT
-      arg         = channel-number, locally to the driver. (starting with 0)
-      parm.length = ***CHANGEI.1.21: New field.
-		    the driver has to set this to the original length
-		    of the skb at the time of receiving it from the linklevel.
-
-  ISDN_STAT_NODCH:
-
-    With this call, the driver has to respond to a prior ISDN_CMD_DIAL, if
-    no D-Channel is available.
-
-    Parameter:
-      driver      = driver-Id
-      command     = ISDN_STAT_NODCH
-      arg         = channel-number, locally to the driver. (starting with 0)
-      parm        = unused.
-
-  ISDN_STAT_ADDCH: 
-
-    This call is for HL-drivers, which are unable to check card-type
-    or numbers of supported channels before they have loaded any firmware
-    using ioctl. Those HL-drivers simply set the channel-parameter to a
-    minimum channel-number when registering, and later if they know
-    the real amount, perform this call, allocating additional channels.
-
-    Parameter:
-      driver      = driver-Id
-      command     = ISDN_STAT_ADDCH
-      arg         = number of channels to be added.
-      parm        = unused.
-
-  ISDN_STAT_CAUSE:
-
-    With this call, the HL-driver delivers CAUSE-messages to the LL.
-    Currently the LL does not use these messages. Their contents are simply
-    logged via kernel-messages. Therefore, currently the format of the
-    messages is completely free. However they should be printable.
-
-    Parameter:
-      driver      = driver-Id
-      command     = ISDN_STAT_CAUSE
-      arg         = channel-number, locally to the driver. (starting with 0)
-      parm.num    = ASCII string containing CAUSE-message.
-
-  ISDN_STAT_DISPLAY:
-
-    With this call, the HL-driver delivers DISPLAY-messages to the LL.
-    Currently the LL does not use these messages.
-
-    Parameter:
-      driver      = driver-Id
-      command     = ISDN_STAT_DISPLAY
-      arg         = channel-number, locally to the driver. (starting with 0)
-      para.display= string containing DISPLAY-message.
-
-  ISDN_STAT_PROT:
-
-    With this call, the HL-driver delivers protocol specific infos to the LL.
-    The call is not implicitly bound to a connection.
-
-    Parameter:
-      driver      = driver-Id
-      command     = ISDN_STAT_PROT
-      arg         = The lower 8 Bits define the addressed protocol as defined
-                    in ISDN_PTYPE..., the upper bits are used to differentiate
-                    the protocol specific STAT.  
-      
-      para        = protocol and function specific. See isdnif.h for detail.
-
-  ISDN_STAT_DISCH:
-
-    With this call, the HL-driver signals the LL to disable or enable the
-    use of supplied channel and driver.
-    The call may be used to reduce the available number of B-channels after
-    loading the driver. The LL has to ignore a disabled channel when searching
-    for free channels. The HL driver itself never delivers STAT callbacks for
-    disabled channels. 	    
-    The LL returns a nonzero code if the operation was not successful or the
-    selected channel is actually regarded as busy.
-
-    Parameter:
-      driver      = driver-Id
-      command     = ISDN_STAT_DISCH
-      arg         = channel-number, locally to the driver. (starting with 0)
-      parm.num[0] = 0 if channel shall be disabled, else enabled.
-
-  ISDN_STAT_L1ERR:
-
-    ***CHANGEI1.21 new status message.
-    A signal can be sent to the linklevel if a Layer1-error results in
-    packet-loss on receive or send. The field errcode of the cmd.parm
-    union describes the error more precisely.
-
-    Parameter:
-      driver      = driver-Id
-      command     = ISDN_STAT_L1ERR
-      arg         = channel-number, locally to the driver. (starting with 0)
-      parm.errcode= ISDN_STAT_L1ERR_SEND:     Packet lost while sending.
-		    ISDN_STAT_L1ERR_RECV:     Packet lost while receiving.
-  ISDN_STAT_FAXIND:
-
-    With this call the HL-driver signals a fax sub-command to the LL.
-    For details refer to INTERFACE.fax
-
-    Parameter:
-      driver      = driver-Id.
-      command     = ISDN_STAT_FAXIND
-      arg         = channel-number, locally to the driver. (starting with 0)
-      parm        = unused.
-
diff --git a/Documentation/isdn/INTERFACE.fax b/Documentation/isdn/INTERFACE.fax
deleted file mode 100644
index 9c8c6d914ec7..000000000000
--- a/Documentation/isdn/INTERFACE.fax
+++ /dev/null
@@ -1,163 +0,0 @@
-$Id: INTERFACE.fax,v 1.2 2000/08/06 09:22:50 armin Exp $
-
-
-Description of the fax-subinterface between linklevel and hardwarelevel of 
-  isdn4linux. 
-
-  The communication between linklevel (LL) and hardwarelevel (HL) for fax
-  is based on the struct T30_s (defined in isdnif.h).
-  This struct is allocated in the LL.  
-  In order to use fax, the LL provides the pointer to this struct with the 
-  command ISDN_CMD_SETL3 (parm.fax). This pointer expires in case of hangup 
-  and when a new channel to a new connection is assigned. 
-
-
-Data handling:
-  In send-mode the HL-driver has to handle the <DLE> codes and the bit-order 
-  conversion by itself. 
-  In receive-mode the LL-driver takes care of the bit-order conversion
-  (specified by +FBOR)
-
-Structure T30_s description:
-
-  This structure stores the values (set by AT-commands), the remote-
-  capability-values and the command-codes between LL and HL.
-
-  If the HL-driver receives ISDN_CMD_FAXCMD, all needed information
-  is in this struct set by the LL.
-  To signal information to the LL, the HL-driver has to set the 
-  parameters and use ISDN_STAT_FAXIND.
-  (Please refer to INTERFACE)
-
-Structure T30_s:
-
-  All members are 8-bit unsigned (__u8)
-
-  -  resolution     
-  -  rate
-  -  width
-  -  length
-  -  compression
-  -  ecm
-  -  binary
-  -  scantime
-  -  id[]
-  Local faxmachine's parameters, set by +FDIS, +FDCS, +FLID, ...
-
-  -  r_resolution
-  -  r_rate
-  -  r_width
-  -  r_length
-  -  r_compression
-  -  r_ecm
-  -  r_binary
-  -  r_scantime
-  -  r_id[]
-  Remote faxmachine's parameters. To be set by HL-driver.
-
-  -  phase      
-  Defines the actual state of fax connection. Set by HL or LL
-  depending on progress and type of connection.
-  If the phase changes because of an AT command, the LL driver
-  changes this value. Otherwise the HL-driver takes care of it, but
-  only necessary on call establishment (from IDLE to PHASE_A).
-  (one of the constants ISDN_FAX_PHASE_[IDLE,A,B,C,D,E])
-
-  -  direction
-  Defines outgoing/send or incoming/receive connection.
-  (ISDN_TTY_FAX_CONN_[IN,OUT])
-
-  -  code
-  Commands from LL to HL; possible constants : 
-      ISDN_TTY_FAX_DR        signals +FDR command to HL
-
-      ISDN_TTY_FAX_DT        signals +FDT command to HL 
-
-      ISDN_TTY_FAX_ET        signals +FET command to HL
-
-
-  Other than that the "code" is set with the hangup-code value at
-  the end of connection for the +FHNG message.
-        
-  -  r_code 
-  Commands from HL to LL; possible constants :
-      ISDN_TTY_FAX_CFR       output of +FCFR message. 
-
-      ISDN_TTY_FAX_RID       output of remote ID set in r_id[]
-                             (+FCSI/+FTSI on send/receive)
-
-      ISDN_TTY_FAX_DCS       output of +FDCS and CONNECT message,
-                             switching to phase C.
-
-      ISDN_TTY_FAX_ET        signals end of data,
-                             switching to phase D.
-
-      ISDN_TTY_FAX_FCON      signals the established, outgoing connection,
-                             switching to phase B.
-
-      ISDN_TTY_FAX_FCON_I    signals the established, incoming connection,
-                             switching to phase B.
-
-      ISDN_TTY_FAX_DIS       output of +FDIS message and values.
-
-      ISDN_TTY_FAX_SENT      signals that all data has been sent 
-                             and <DLE><ETX> is acknowledged,
-                             OK message will be sent.
-
-      ISDN_TTY_FAX_PTS       signals a msg-confirmation (page sent successfully),
-                             depending on fet value:
-                             0: output OK message (more pages follow)
-                             1: switching to phase B (next document)
-
-      ISDN_TTY_FAX_TRAIN_OK  output of +FDCS and OK message (for receive mode).
-
-      ISDN_TTY_FAX_EOP       signals end of data in receive mode,
-                             switching to phase D.
-
-      ISDN_TTY_FAX_HNG       output of the +FHNG and value set by code and
-                             OK message, switching to phase E.
-
-
-  -  badlin
-  Value of +FBADLIN  
-
-  -  badmul
-  Value of +FBADMUL
-
-  -  bor
-  Value of +FBOR
-
-  -  fet
-  Value of +FET command in send-mode.
-  Set by HL in receive-mode for +FET message.
-
-  -  pollid[]  
-  ID-string, set by +FCIG
-
-  -  cq
-  Value of +FCQ
-
-  -  cr
-  Value of +FCR
-
-  -  ctcrty
-  Value of +FCTCRTY
-
-  -  minsp
-  Value of +FMINSP
-
-  -  phcto
-  Value of +FPHCTO
-
-  -  rel
-  Value of +FREL
-
-  -  nbc
-  Value of +FNBC (0,1)
-  (+FNBC is not a known class 2 fax command, I added this to change the
-   automatic "best capabilities" connection in the eicon HL-driver)
-
-  
-Armin
-mac@melware.de
-
diff --git a/Documentation/isdn/README b/Documentation/isdn/README
deleted file mode 100644
index 74bd2bdb455b..000000000000
--- a/Documentation/isdn/README
+++ /dev/null
@@ -1,599 +0,0 @@
-README for the ISDN-subsystem
-
-1. Preface
-
-  1.1 Introduction
-
-  This README describes how to set up and how to use the different parts
-  of the ISDN-subsystem.
-
-  For using the ISDN-subsystem, some additional userlevel programs are
-  necessary. Those programs and some contributed utilities are available
-  at
-
-   ftp.isdn4linux.de
-
-   /pub/isdn4linux/isdn4k-utils-<VersionNumber>.tar.gz
-
-
-  We also have set up a mailing-list:
-
-   The isdn4linux-project originates in Germany, and therefore, for historical
-   reasons, the mailing-list's primary language is German. However, mails
-   written in English have always been welcome.
-
-   To subscribe: write an email to majordomo@listserv.isdn4linux.de,
-   Subject irrelevant, in the message body:
-   subscribe isdn4linux <your_email_address>
-
-   To write to the mailing-list, write to isdn4linux@listserv.isdn4linux.de
-
-   This mailinglist is bidirectionally gated to the newsgroup
-
-     de.alt.comm.isdn4linux
-
-  There is also a well maintained FAQ in English available at
-     https://www.mhessler.de/i4lfaq/
-  It can be viewed online, or downloaded in sgml/text/html format.
-  The FAQ can also be viewed online at
-     https://www.isdn4linux.de/faq/i4lfaq.html
-  or downloaded from
-     ftp://ftp.isdn4linux.de/pub/isdn4linux/FAQ/
-
-  1.2 Technical details
-
-  In the following Text, the terms MSN and EAZ are used.
-
-  MSN is the abbreviation for (M)ultiple(S)ubscriber(N)umber, and applies
-  to Euro(EDSS1)-type lines. Usually it is simply the phone number.
-
-  EAZ is the abbreviation of (E)ndgeraete(A)uswahl(Z)iffer and
-  applies to German 1TR6-type lines. This is a one-digit string,
-  simply appended to the base phone number
-
-  The internal handling is nearly identical, so substitute whichever
-  term applies to your local ISDN-environment.
-
-  When the link-level-module isdn.o is loaded, it supports up to 16
-  low-level-modules with up to 64 channels. (The number 64 is arbitrarily
-  chosen and can be configured at compile-time --ISDN_MAX in isdn.h).
-  A low-level-driver can register itself through an interface (which is
-  defined in isdnif.h) and gets assigned a slot.
-  The following char-devices are made available for each channel:
-
-  A raw-control-device with the following functions:
-     write: raw D-channel-messages (format: depends on driver).
-     read:  raw D-channel-messages (format: depends on driver).
-     ioctl: depends on driver, i.e. for the ICN-driver, the base-address of
-            the ports and the shared memory on the card can be set and read
-            also the boot-code and the protocol software can be loaded into
-            the card.
-
-   O N L Y !!!  for debugging (no locking against other devices):
-   One raw-data-device with the following functions:
-     write: data to B-channel.
-     read:  data from B-channel.
-
-   In addition the following devices are made available:
-
-   128 tty-devices (64 cuix and 64 ttyIx) with integrated modem-emulator:
-   The functionality is almost the same as that of a serial device
-   (the line-discs are handled by the kernel), which lets you run
-   SLIP, CSLIP and asynchronous PPP through the devices. We have tested
-   Seyon, minicom, CSLIP (uri-dip) PPP, mgetty, XCept and Hylafax. 
-
-   The modem-emulation supports the following:
-           1.3.1 Commands:
-
-               ATA      Answer incoming call.
-               ATD<No.> Dial, the number may contain:
-                        [0-9] and [,#.*WPT-S]
-                        the latter are ignored until 'S'.
-                        The 'S' must precede the number, if
-                        the line is a SPV (German 1TR6).
-               ATE0     Echo off.
-               ATE1     Echo on (default).
-               ATH      Hang-up.
-               ATH1     Off hook (ignored).
-               ATH0     Hang-up.
-               ATI      Return "ISDN for Linux...".
-               ATI0        "
-               ATI1        "
-               ATI2     Report of last connection.
-               ATO      On line (data mode).
-               ATQ0     Enable result codes (default).
-               ATQ1     Disable result codes.
-               ATSx=y   Set register x to y.
-               ATSx?    Show contents of register x.
-               ATV0     Numeric responses.
-               ATV1     English responses (default).
-               ATZ      Load registers and EAZ/MSN from Profile.
-               AT&Bx    Set Send-Packet-size to x (max. 4000)
-                        The real packet-size may be limited by the
-                        low-level-driver used. e.g. the HiSax-Module-
-                        limit is 2000. You will get NO Error-Message,
-                        if you set it to higher values, because at the
-                        time of giving this command the corresponding
-                        driver may not be selected (see "Automatic
-                        Assignment") however the size of outgoing packets
-                        will be limited correctly.
-               AT&D0    Ignore DTR
-               AT&D2    DTR-low-edge: Hang up and return to
-                        command mode (default).
-               AT&D3    Same as AT&D2 but also resets all registers.
-               AT&Ex    Set the EAZ/MSN for this channel to x.
-               AT&F     Reset all registers and profile to "factory-defaults"
-               AT&Lx    Set list of phone numbers to listen on.  x is a
-                        list of wildcard patterns separated by semicolon.
-                        If this is set, it has precedence over the MSN set
-                        by AT&E.
-               AT&Rx    Select V.110 bitrate adaption.
-                        This command enables V.110 protocol with 9600 baud
-                        (x=9600), 19200 baud (x=19200) or 38400 baud
-                        (x=38400). A value of x=0 disables V.110 switching
-                        back to default X.75. This command sets the following
-                        Registers:
-                          Reg 14 (Layer-2 protocol):
-                            x = 0:     0
-                            x = 9600:  7
-                            x = 19200: 8
-                            x = 38400: 9
-                          Reg 18.2 = 1
-                          Reg 19 (Additional Service Indicator):
-                            x = 0:       0
-                            x = 9600:  197
-                            x = 19200: 199
-                            x = 38400: 198
-                          Note on value in Reg 19:
-                            There is _NO_ common convention for 38400 baud.
-                            The value 198 is chosen arbitrarily. Users
-                            _MUST_ negotiate this value before establishing
-                            a connection.
-               AT&Sx    Set window-size (x = 1..8) (not yet implemented)
-               AT&V     Show all settings.
-               AT&W0    Write registers and EAZ/MSN to profile. See also
-                        iprofd (5.c in this README).
-               AT&X0    BTX-mode and T.70-mode off (default)
-               AT&X1    BTX-mode on. (S13.1=1, S13.5=0 S14=0, S16=7, S18=7, S19=0)
-               AT&X2    T.70-mode on. (S13.1=1, S13.5=1, S14=0, S16=7, S18=7, S19=0)
-               AT+Rx    Resume a suspended call with CallID x (x = 1,2,3...)
-               AT+Sx    Suspend a call with CallID x (x = 1,2,3...)
-
-           For voice-mode commands refer to README.audio
-
-           1.3.2 Escape sequence:
-               During a connection, the emulation reacts just like
-               a normal modem to the escape sequence <DELAY>+++<DELAY>.
-               (The escape character - default '+' - can be set in the
-               register 2).
-               The DELAY must at least be 1.5 seconds long and delay
-               between the escape characters must not exceed 0.5 seconds.
-
-           1.3.3 Registers:
-
-              Nr.  Default  Description
-              0    0        Answer on ring number.
-                            (no auto-answer if S0=0).
-              1    0        Count of rings.
-              2    43       Escape character.
-                            (a value >= 128 disables the escape sequence).
-              3    13       Carriage return character (ASCII).
-              4    10       Line feed character (ASCII).
-              5    8        Backspace character (ASCII).
-              6    3        Delay in seconds before dialing.
-              7    60       Wait for carrier.
-              8    2        Pause time for comma (ignored)
-              9    6        Carrier detect time (ignored)
-             10    7        Carrier loss to disconnect time (ignored).
-             11    70       Touch tone timing (ignored).
-             12    69       Bit coded register:
-                            Bit 0:    0 = Suppress response messages.
-                                      1 = Show response messages.
-                            Bit 1:    0 = English response messages.
-                                      1 = Numeric response messages.
-                            Bit 2:    0 = Echo off.
-                                      1 = Echo on.
-                            Bit 3     0 = DCD always on.
-                                      1 = DCD follows carrier.
-                            Bit 4     0 = CTS follows RTS
-                                      1 = Ignore RTS, CTS always on.
-                            Bit 5     0 = return to command mode on DTR low.
-                                      1 = Same as 0 but also resets all
-                                          registers.
-                                      See also register 13, bit 2
-                            Bit 6     0 = DSR always on.
-                                      1 = DSR only on if channel is available.
-                            Bit 7     0 = Cisco-PPP-flag-hack off (default).
-                                      1 = Cisco-PPP-flag-hack on.
-             13   0         Bit coded register:
-                            Bit 0:    0 = Use delayed tty-send-algorithm
-                                      1 = Direct tty-send.
-                            Bit 1:    0 = T.70 protocol (Only for BTX!) off
-                                      1 = T.70 protocol (Only for BTX!) on
-                            Bit 2:    0 = Don't hangup on DTR low.
-                                      1 = Hangup on DTR low.
-                            Bit 3:    0 = Standard response messages
-                                      1 = Extended response messages
-                            Bit 4:    0 = CALLER NUMBER before every RING.
-                                      1 = CALLER NUMBER after first RING.
-                            Bit 5:    0 = T.70 extended protocol off
-                                      1 = T.70 extended protocol on
-                            Bit 6:    0 = Special RUNG Message off
-                                      1 = Special RUNG Message on
-                                          "RUNG" is delivered on a ttyI, if
-                                          an incoming call happened (RING) and
-                                          the remote party hung up before any
-                                          local ATA was given.
-			    Bit 7:    0 = Don't show display messages from net
-                                      1 = Show display messages from net
-				          (S12 Bit 1 must be 0 too)      
-             14   0         Layer-2 protocol:
-                                      0 = X75/LAPB with I-frames
-                                      1 = X75/LAPB with UI-frames
-                                      2 = X75/LAPB with BUI-frames
-                                      3 = HDLC
-                                      4 = Transparent (audio)
-                                      7 = V.110, 9600 baud
-                                      8 = V.110, 19200 baud
-                                      9 = V.110, 38400 baud
-                                     10 = Analog Modem (only if hardware supports this)
-                                     11 = Fax G3 (only if hardware supports this)
-             15   0         Layer-3 protocol:
-                                      0 = transparent
-                                      1 = transparent with audio features (e.g. DSP)
-                                      2 = Fax G3 Class 2 commands (S14 has to be set to 11)
-                                      3 = Fax G3 Class 1 commands (S14 has to be set to 11)
-             16   250       Send-Packet-size/16
-             17   8         Window-size (not yet implemented)
-             18   4         Bit coded register, Service-Octet-1 to accept,
-                            or to be used on dialout:
-                            Bit 0:    Service 1 (audio) when set.
-                            Bit 1:    Service 5 (BTX) when set.
-                            Bit 2:    Service 7 (data) when set.
-                            Note: It is possible to set more than one
-                                  bit. In this case, on incoming calls
-                                  the selected services are accepted,
-                                  and if the service is "audio", the
-                                  Layer-2-protocol is automatically
-                                  changed to 4 regardless of the setting
-                                  of register 14. On outgoing calls,
-                                  the most significant 1-bit is chosen to
-                                  select the outgoing service octet.
-             19   0         Service-Octet-2
-             20   0         Bit coded register (readonly)
-                            Service-Octet-1 of last call.
-                            Bit mapping is the same as register 18
-             21   0         Bit coded register (readonly)
-                            Set on incoming call (during RING) to
-                            octet 3 of calling party number IE (Numbering plan)
-                            See section 4.5.10 of ITU Q.931
-             22   0         Bit coded register (readonly)
-                            Set on incoming call (during RING) to
-                            octet 3a of calling party number IE (Screening info)
-                            See section 4.5.10 of ITU Q.931
-             23   0         Bit coded register:
-                            Bit 0:    0 = Add CPN to RING message off
-                                      1 = Add CPN to RING message on
-                            Bit 1:    0 = Add CPN to FCON message off
-                                      1 = Add CPN to FCON message on
-                            Bit 2:    0 = Add CDN to RING/FCON message off
-                                      1 = Add CDN to RING/FCON message on
-
-  Last but not least a (at the moment fairly primitive) device to request
-  the line-status (/dev/isdninfo) is made available.
-
-  Automatic assignment of devices to lines:
-
-  All inactive physical lines are listening to all EAZs for incoming
-  calls and are NOT assigned to a specific tty or network interface.
-  When an incoming call is detected, the driver looks first for a network
-  interface and then for an opened tty which:
-
-  1. is configured for the same EAZ.
-  2. has the same protocol settings for the B-channel.
-  3. (only for network interfaces if the security flag is set)
-     contains the caller number in its access list.
-  4. Either the channel is not bound exclusively to another Net-interface, or
-     it is bound AND the other checks apply to exactly this interface.
-     (For usage of the bind-features, refer to the isdnctrl-man-page)
-
-  Only when a matching interface or tty is found is the call accepted
-  and the "connection" between the low-level-layer and the link-level-layer
-  is established and kept until the end of the connection.
-  In all other cases no connection is established. Isdn4linux can be
-  configured to either do NOTHING in this case (which is useful, if
-  other, external devices with the same EAZ/MSN are connected to the bus)
-  or to reject the call actively. (isdnctrl busreject ...)
-
-  For an outgoing call, the inactive physical lines are searched.
-  The call is placed on the first physical line, which supports the
-  requested protocols for the B-channel. If a net-interface, however
-  is pre-bound to a channel, this channel is used directly.
-
-  This makes it possible to configure several network interfaces and ttys
-  for one EAZ, if the network interfaces are set to secure operation.
-  If an incoming call matches one network interface, it gets connected to it.
-  If another incoming call for the same EAZ arrives, which does not match
-  a network interface, the first tty gets a "RING" and so on.
-
-2 System prerequisites:
-
-  ATTENTION!
-
-  Always use the latest module utilities. The current version is
-  named in Documentation/Changes. Some old versions of insmod
-  are not capable of setting the driver-Ids correctly.
-
-3. Lowlevel-driver configuration.
-
-   Configuration depends on how the drivers are built. See the
-   README.<yourDriver> for information on driver-specific setup.
-
-4. Device-inodes
-
-   The major and minor numbers and their names are described in
-   Documentation/admin-guide/devices.rst. The major numbers are:
-
-     43 for the ISDN-tty's.
-     44 for the ISDN-callout-tty's.
-     45 for control/info/debug devices.
-
-5. Application
-
-   a) For some card-types, firmware has to be loaded into the cards, before
-      proceeding with device-independent setup. See README.<yourDriver>
-      for how to do that.
-
-   b) If you only intend to use ttys, you are nearly ready now.
-
-   c) If you want to have really permanent "Modem"-settings on disk, you
-      can start the daemon iprofd. Give it a path to a file at the command-
-      line. It will store the profile-settings in this file every time
-      an AT&W0 is performed on any ISDN-tty. If the file already exists,
-      all profiles are initialized from this file. If you want to unload
-      any of the modules, kill iprofd first.
-
-   d) For networking, continue: Create an interface:
-       isdnctrl addif isdn0
-
-   e) Set the EAZ (or MSN for Euro-ISDN):
-       isdnctrl eaz isdn0 2
-
-     (For 1TR6 a single digit is allowed, for Euro-ISDN the number is your
-      real MSN e.g.: Phone-Number)
-
-   f) Set the number for outgoing calls on the interface:
-       isdnctrl addphone isdn0 out 1234567
-       ... (this can be executed more than once, all assigned numbers are
-            tried in order)
-      and the number(s) for incoming calls:
-       isdnctrl addphone isdn0 in 1234567
-
-   g) Set the timeout for hang-up:
-       isdnctrl huptimeout isdn0 <timeout_in_seconds>
-
-   h) additionally you may activate charge-hang-up (= Hang up before
-      next charge-info, this only works, if your isdn-provider transmits
-      the charge-info during and after the connection):
-       isdnctrl chargehup isdn0 on
-
-   i) Set the dial mode of the interface:
-       isdnctrl dialmode isdn0 auto
-      "off" means that you (or the system) cannot make any connection
-        (neither incoming nor outgoing connections are possible). Use
-        this if you want to be sure that no connections will be made.
-      "auto" means that the interface is in auto-dial mode, and will
-        attempt to make a connection whenever a network data packet needs
-        the interface's link. Note that this can cause unexpected dialouts,
-        and lead to a high phone bill! Some daemons or other pc's that use
-        this interface can cause this.
-        Incoming connections are also possible.
-      "manual" is a dial mode created to prevent the unexpected dialouts.
-        In this mode, the interface will never make any connections on its
-        own. You must explicitly initiate a connection with "isdnctrl dial
-        isdn0". However, after an idle time of no traffic as configured for
-	the huptimeout value with isdnctrl, the connection _will_ be ended.
-	If you don't want any automatic hangup, set the huptimeout value to 0.
-        "manual" is the default.
-
-   j) Setup the interface with ifconfig as usual, and set a route to it.
-
-   k) (optional) If you run X11 and have Tcl/Tk-wish version 4.0, you can use
-     the script tools/tcltk/isdnmon. You can add actions for line-status
-     changes. See the comments at the beginning of the script for how to
-     do that. There are other tty-based tools in the tools-subdirectory
-     contributed by Michael Knigge (imon), Volker Götz (imontty) and
-     Andreas Kool (isdnmon).
-
-   l) For initial testing, you can set the verbose-level to 2 (default: 0).
-      Then all incoming calls are logged, even if they are not addressed
-      to one of the configured net-interfaces:
-      isdnctrl verbose 2
-
-  Now you are ready! A ping to the set address should now result in an
-  automatic dial-out (look at syslog kernel-messages).
-  The phone numbers and EAZs can be assigned at any time with isdnctrl.
-  You can add as many interfaces as you like with addif following the
-  directions above. Of course, there may be some limitations. But we have
-  tested as many as 20 interfaces without any problem. However, if you
-  don't give an interface name to addif, the  kernel will assign a name
-  which starts with "eth". The number of "eth"-interfaces is limited by
-  the kernel.
-
-5. Additional options for isdnctrl:
-
-   "isdnctrl secure <InterfaceName> on"
-   Only incoming calls, for which the caller-id is listed in the access
-   list of the interface are accepted. You can add caller-id's with the
-   command "isdnctrl addphone <InterfaceName> in <caller-id>"
-   Euro-ISDN does not transmit the leading '0' of the caller-id for an
-   incoming call, therefore you should configure it accordingly.
-   If the real number for the dialout e.g. is "09311234567" the number
-   to configure here is "9311234567". The pattern-match function
-   works similarly to the shell mechanism.
-
-     ?     one arbitrary digit
-     *     zero or arbitrary many digits
-     [123] one of the digits in the list
-     [1-5] one digit between '1' and '5'
-           a '^' as the first character in a list inverts the list
-
-
-   "isdnctrl secure <InterfaceName> off"
-   Switch off secure operation (default).
-
-   "isdnctrl ihup <InterfaceName> [on|off]"
-   Switch the hang-up-timer for incoming calls on or off.
-
-   "isdnctrl eaz <InterfaceName>"
-   Returns the EAZ of an interface.
-
-   "isdnctrl delphone <InterfaceName> in|out <number>"
-   Deletes a number from one of the access-lists of the interface.
-
-   "isdnctrl delif <InterfaceName>"
-   Removes the interface (and possible slaves) from the kernel.
-   (You have to unregister it with "ifconfig <InterfaceName> down" before).
-
-   "isdnctrl callback <InterfaceName> [on|off]"
-   Switches an interface to callback-mode. In this mode, an incoming call
-   will be rejected and after this the remote-station will be called. If
-   you test this feature by using ping, some routers will re-dial very
-   quickly, so that the callback from isdn4linux may not be recognized.
-   In this case use ping with the option -i <sec> to increase the interval
-   between echo-packets.
-
-   "isdnctrl cbdelay <InterfaceName> [seconds]"
-   Sets the delay (default 5 sec) between an incoming call and start of
-   dialing when callback is enabled.
-
-   "isdnctrl cbhup <InterfaceName> [on|off]"
-   This enables (default) or disables an active hangup (reject) when getting an
-   incoming call for an interface which is configured for callback.
-
-   "isdnctrl encap <InterfaceName> <EncapType>"
-   Selects the type of packet-encapsulation. The encapsulation can be changed
-   only while an interface is down.
-
-   At the moment the following values are supported:
-
-   rawip    (Default) Selects raw-IP-encapsulation. This means, MAC-headers
-            are stripped off.
-   ip       IP with type-field. Same as IP but the type-field of the MAC-header
-            is preserved.
-   x25iface X.25 interface encapsulation (first byte semantics as defined in
-            ../networking/x25-iface.txt). Use this for running the linux
-            X.25 network protocol stack (AF_X25 sockets) on top of isdn.
-   cisco-h  A special-mode for communicating with a Cisco, which is configured
-            to do "hdlc"
-   ethernet No stripping. Packets are sent with full MAC-header.
-            The Ethernet-address of the interface is faked, from its
-            IP-address: fc:fc:i1:i2:i3:i4, where i1-4 are the IP-addr.-values.
-   syncppp  Synchronous PPP
-
-   uihdlc   HDLC with UI-frame-header (for use with DOS ISPA, option -h1)
-
-
-   NOTE:    x25iface encapsulation is currently experimental. Please
-            read README.x25 for further details
-
-
-   Watching packets, using standard-tcpdump will fail for all encapsulations
-   except ethernet because tcpdump does not know how to handle packets
-   without MAC-header. A patch for tcpdump is included in the utility-package
-   mentioned above.
-
-   "isdnctrl l2_prot <InterfaceName> <L2-ProtocolName>"
-   Selects a layer-2-protocol.
-   (With the ICN-driver and the HiSax-driver, "x75i" and "hdlc" are available.
-   With other drivers, "x75ui", "x75bui", "x25dte", "x25dce" may be
-   possible too. See README.x25 for x25 related l2 protocols.)
-
-   "isdnctrl l3_prot <InterfaceName> <L3-ProtocolName>"
-   The same for layer-3. (At the moment only "trans" is allowed)
-
-   "isdnctrl list <InterfaceName>"
-   Shows all parameters of an interface and the charge-info.
-   Try "all" as the interface name.
-
-   "isdnctrl hangup <InterfaceName>"
-   Forces hangup of an interface.
-
-   "isdnctrl bind <InterfaceName> <DriverId>,<ChannelNumber> [exclusive]"
-   If you are using more than one ISDN card, it is sometimes necessary to
-   dial out using a specific card or even preserve a specific channel for
-   dialout of a specific net-interface. This can be done with the above
-   command. Replace <DriverId> by whatever you assigned while loading the
-   module. The <ChannelNumber> is counted from zero. The upper limit
-   depends on the card used. At the moment no card supports more than
-   2 channels, so the upper limit is one.
-
-   "isdnctrl unbind <InterfaceName>"
-   unbinds a previously bound interface.
-
-   "isdnctrl busreject <DriverId> on|off"
-   If switched on, isdn4linux replies with a REJECT to incoming calls that it
-   cannot match to any configured interface.
-   If switched off, nothing happens in this case.
-   You normally should NOT enable this feature, if the ISDN adapter is not
-   the only device connected to the S0-bus. Otherwise it could happen that
-   isdn4linux rejects an incoming call, which belongs to another device on
-   the bus.
-
-   "isdnctrl addslave <InterfaceName> <SlaveName>"
-   Creates a slave interface for channel-bundling. Slave interfaces are
-   not seen by the kernel, but their ISDN-part can be configured with
-   isdnctrl as usual. (Phone numbers, EAZ/MSN, timeouts etc.) If more
-   than two channels are to be bundled, feel free to create as many as you
-   want. InterfaceName must be a real interface, NOT a slave. Slave interfaces
-   start dialing, if the master interface resp. the previous slave interface
-   has a load of more than 7000 cps. They hangup if the load goes under 7000
-   cps, according to their "huptimeout"-parameter.
-
-   "isdnctrl sdelay <InterfaceName> secs."
-   This sets the minimum time an Interface has to be fully loaded, until
-   it sends a dial-request to its slave.
-
-   "isdnctrl dial <InterfaceName>"
-   Forces an interface to start dialing even if no packets are to be
-   transferred.
-
-   "isdnctrl mapping <DriverId> MSN0,MSN1,MSN2,...MSN9"
-   This installs a mapping table for EAZ<->MSN-mapping for a single line.
-   Missing MSN's have to be given as "-" or can be omitted, if at the end
-   of the commandline.
-   With this command, it's now possible to have an interface listening to
-   mixed 1TR6- and Euro-Type lines. In this case, the interface has to be
-   configured to a 1TR6-type EAZ (one digit). The mapping is also valid
-   for tty-emulation. Seen from the interface/tty-level the mapping
-   CAN be used, however it's possible to use single tty's/interfaces with
-   real MSN's (more digits) also, in which case the mapping will be ignored.
-   Here is an example:
-
-   You have a 1TR6-type line with base-nr. 1234567 and a Euro-line with
-   MSN's 987654, 987655 and 987656. The DriverId for the Euro-line is "EURO".
-
-   isdnctrl mapping EURO -,987654,987655,987656,-,987655
-   ...
-   isdnctrl eaz isdn0 1      # listen on 12345671(1tr6) and 987654(euro)
-   ...
-   isdnctrl eaz isdn1 4      # listen on 12345674(1tr6) only.
-   ...
-   isdnctrl eaz isdn2 987654 # listen on 987654(euro) only.
-
-   Same scheme is used with AT&E...  at the tty's.
-
-6. If you want to write a new low-level-driver, you are welcome.
-   The interface to the link-level-module is described in the file INTERFACE.
-   If the interface should be expanded for any reason, don't do it
-   on your own, send me a mail containing the proposed changes and
-   some reasoning about them.
-   If other drivers will not be affected, I will include the changes
-   in the next release.
-   For developers only, there is a second mailing-list. Write to me
-   (fritz@isdn4linux.de), if you want to join that list.
-
-Have fun!
-
- -Fritz
-
diff --git a/Documentation/isdn/README.FAQ b/Documentation/isdn/README.FAQ
deleted file mode 100644
index e5dd1addacdd..000000000000
--- a/Documentation/isdn/README.FAQ
+++ /dev/null
@@ -1,26 +0,0 @@
-
-The FAQ for isdn4linux
-======================
-
-Please note that there is a big FAQ available in the isdn4k-utils.
-You find it in:
- isdn4k-utils/FAQ/i4lfaq.sgml
-
-In case you just want to see the FAQ online, or download the newest version,
-you can have a look at my website:
-https://www.mhessler.de/i4lfaq/ (view + download)
-or:
-https://www.isdn4linux.de/faq/4lfaq.html (view)
-
-As the extension tells, the FAQ is in SGML format, and you can convert it
-into text/html/... format by using the sgml2txt/sgml2html/... tools.
-Alternatively, you can also do a 'configure; make all' in the FAQ directory.
-
-
-Please have a look at the FAQ before posting anything in the Mailinglist,
-or the newsgroup!
-
-
-Matthias Hessler
-hessler@isdn4linux.de
-
diff --git a/Documentation/isdn/README.audio b/Documentation/isdn/README.audio
deleted file mode 100644
index 8ebca19290d9..000000000000
--- a/Documentation/isdn/README.audio
+++ /dev/null
@@ -1,138 +0,0 @@
-$Id: README.audio,v 1.8 1999/07/11 17:17:29 armin Exp $
-
-ISDN subsystem for Linux.
-  Description of audio mode.
-
-When enabled during kernel configuration, the tty emulator of the ISDN
-subsystem is capable of a reduced set of commands to support audio.
-This document describes the commands supported and the format of
-audio data.
-
-Commands for enabling/disabling audio mode:
-
-        AT+FCLASS=8      Enable audio mode.
-                         This affects the following registers:
-                           S18: Bits 0 and 2 are set.
-                           S16: Set to 48 and any further change to
-                                larger values is blocked.
-        AT+FCLASS=0      Disable audio mode.
-                         Register 18 is set to 4.
-        AT+FCLASS=?      Show possible modes.
-        AT+FCLASS?       Report current mode (0 or 8).
-
-Commands supported in audio mode:
-
-All audio mode commands have one of the following forms:
-
-        AT+Vxx?          Show current setting.
-        AT+Vxx=?         Show possible settings.
-        AT+Vxx=v         Set simple parameter.
-        AT+Vxx=v,v ...   Set complex parameter.
-
-where xx is a two-character code and v are alphanumerical parameters.
-The following commands are supported:
-
-        AT+VNH=x         Auto hangup setting. NO EFFECT, supported
-                         for compatibility only.
-        AT+VNH?          Always reporting "1"
-        AT+VNH=?         Always reporting "1"
-
-        AT+VIP           Reset all audio parameters.
-
-        AT+VLS=x         Line select. x is one of the following:
-                           0 = No device.
-                           2 = Phone line.
-        AT+VLS=?         Always reporting "0,2"
-        AT+VLS?          Show current line.
-
-        AT+VRX           Start recording. Emulator responds with
-                         CONNECT and starts sending audio data to
-                         the application. See below for data format
-
-        AT+VSD=x,y       Set silence-detection parameters.
-                         Possible parameters:
-                           x = 0 ... 31  sensitivity threshold level.
-                                         (default 0 , deactivated)
-                           y = 0 ... 255 range of interval in units
-                                         of 0.1 second. (default 70)
-        AT+VSD=?         Report possible parameters.
-        AT+VSD?          Show current parameters.
-
-        AT+VDD=x,y       Set DTMF-detection parameters.
-                         Only possible if online and during this connection.
-                         Possible parameters:
-                           x = 0 ... 15  sensitivity threshold level.
-                                         (default 0 , I4L soft-decode)
-                                         (1-15 soft-decode off, hardware on)
-                           y = 0 ... 255 tone duration in units of 5ms.
-                                         Not for I4L soft decode (default 8, 40ms)
-        AT+VDD=?         Report possible parameters.
-        AT+VDD?          Show current parameters.
-
-        AT+VSM=x         Select audio data format.
-                         Possible parameters:
-                           2 = ADPCM-2
-                           3 = ADPCM-3
-                           4 = ADPCM-4
-                           5 = aLAW
-                           6 = uLAW
-        AT+VSM=?         Show possible audio formats.
-
-        AT+VTX           Start audio playback. Emulator responds
-                         with CONNECT and starts sending audio data
-                         received from the application via phone line.
-General behavior and description of data formats/protocol.
-    when a connection is made:
-
-      On incoming calls, if the application responds to a RING
-      with ATA, depending on the calling service, the emulator
-      responds with either CONNECT (data call) or VCON (voice call).
-      
-      On outgoing voice calls, the emulator responds with VCON
-      upon connection setup.
-
-  Audio recording.
-
-    When receiving audio data, a kind of bisync protocol is used.
-    Upon AT+VRX command, the emulator responds with CONNECT, and
-    starts sending audio data to the application. There are several
-    escape sequences defined, all using DLE (0x10) as Escape char:
-
-    <DLE><ETX>              End of audio data. (i.e. caused by a
-                            hangup of the remote side) Emulator stops
-                            recording, responding with VCON.
-    <DLE><DC4>		    Abort recording, (sent by appl.) Emulator
-			    stops recording, sends DLE,ETX.
-    <DLE><DLE>              Escape sequence for DLE in data stream.
-    <DLE>0                  Touchtone "0" received.
-         ...
-    <DLE>9                  Touchtone "9" received.
-    <DLE>#                  Touchtone "#" received.
-    <DLE>*                  Touchtone "*" received.
-    <DLE>A                  Touchtone "A" received.
-    <DLE>B                  Touchtone "B" received.
-    <DLE>C                  Touchtone "C" received.
-    <DLE>D                  Touchtone "D" received.
-
-    <DLE>q                  quiet. Silence detected after non-silence.
-    <DLE>s                  silence. Silence detected from the
-                            start of recording.
-
-    Currently unsupported DLE sequences:
-
-    <DLE>c                  FAX calling tone received.
-    <DLE>b                  busy tone received.
-
-  Audio playback.
-
-    When sending audio data, upon AT+VTX command, emulator responds with
-    CONNECT, and starts transferring data from application to the phone line.
-    The same DLE sequences apply to this mode.
-
-  Full-Duplex-Audio:
-
-    When _both_ commands for recording and playback are given in _one_
-    AT-command-line (i.e.: "AT+VTX+VRX"), full-duplex-mode is selected.
-	In this mode, the only way to stop recording is sending <DLE><DC4>
-    and the only way to stop playback is to send <DLE><ETX>.
-
diff --git a/Documentation/isdn/README.concap b/Documentation/isdn/README.concap
deleted file mode 100644
index a76d74845a4c..000000000000
--- a/Documentation/isdn/README.concap
+++ /dev/null
@@ -1,259 +0,0 @@
-Description of the "concap" encapsulation protocol interface
-============================================================
-
-The "concap" interface is intended to be used by network device
-drivers that need to process an encapsulation protocol. 
-It is assumed that the protocol interacts with a linux network device by
-- data transmission
-- connection control (establish, release)
-Thus, the mnemonic: "CONnection CONtrolling eNCAPsulation Protocol".
-
-This is currently only used inside the isdn subsystem. But it might
-also be useful to other kinds of network devices. Thus, if you want
-to suggest changes that improve usability or performance of the
-interface, please let me know. I'm willing to include them in future
-releases (even if I needed to adapt the current isdn code to the
-changed interface).
-
-
-Why is this useful?
-===================
-
-The encapsulation protocols used on top of WAN connections or permanent
-point-to-point links are frequently chosen upon bilateral agreement.
-Thus, a device driver for a certain type of hardware must support
-several different encapsulation protocols at once.
-
-The isdn device driver did already support several different
-encapsulation protocols. The encapsulation protocol is configured by a
-user space utility (isdnctrl). The isdn network interface code then
-uses several case statements which select appropriate actions
-depending on the currently configured encapsulation protocol.
-
-In contrast, LAN network interfaces always used a single encapsulation
-protocol which is unique to the hardware type of the interface. The LAN
-encapsulation is usually done by just sticking a header on the data. Thus,
-traditional linux network device drivers used to process the
-encapsulation protocol directly (usually by just providing a hard_header()
-method in the device structure) using some hardware type specific support
-functions. This is simple, direct and efficient. But it doesn't fit all
-the requirements for complex WAN encapsulations. 
-
-
-   The configurability of the encapsulation protocol to be used
-   makes isdn network interfaces more flexible, but also much more
-   complex than traditional lan network interfaces.
-
-
-Many Encapsulation protocols used on top of WAN connections will not just
-stick a header on the data. They also might need to set up or release
-the WAN connection. They also might want to send other data for their
-private purpose over the wire, e.g. ppp does a lot of link level
-negotiation before the first piece of user data can be transmitted.
-Such encapsulation protocols for WAN devices are typically more complex
-than encapsulation protocols for lan devices. Thus, network interface
-code for typical WAN devices also tends to be more complex.
-
-
-In order to support Linux' x25 PLP implementation on top of
-isdn network interfaces I could have introduced yet another branch to
-the various case statements inside drivers/isdn/isdn_net.c.
-This eventually made isdn_net.c even more complex. In addition, it made
-isdn_net.c harder to maintain. Thus, by identifying an abstract
-interface between the network interface code and the encapsulation
-protocol, complexity could be reduced and maintainability could be
-increased.
-
-
-Likewise, a similar encapsulation protocol will frequently be needed by
-several different interfaces of even different hardware type, e.g. the
-synchronous ppp implementation used by the isdn driver and the
-asynchronous ppp implementation used by the ppp driver have a lot of
-similar code in them. By cleanly separating the encapsulation protocol
-from the hardware specific interface stuff such code could be shared
-better in future.
-
-
-When operating over dial-up-connections (e.g. telephone lines via modem,
-non-permanent virtual circuits of wide area networks, ISDN) many
-encapsulation protocols will need to control the connection. Therefore,
-some basic connection control primitives are supported. The type and
-semantics of the connection (i.e the ISO layer where connection service
-is provided) is outside our scope and might be different depending on
-the encapsulation protocol used, e.g. for a ppp module using our service
-on top of a modem connection a connect_request will result in dialing
-a (somewhere else configured) remote phone number. For an X25-interface
-module (LAPB semantics, as defined in Documentation/networking/x25-iface.txt)
-a connect_request will ask for establishing a reliable lapb
-datalink connection.
-
-
-The encapsulation protocol currently provides the following
-service primitives to the network device.
-
-- create a new encapsulation protocol instance
-- delete encapsulation protocol instance and free all its resources
-- initialize (open) the encapsulation protocol instance for use.
-- deactivate (close) an encapsulation protocol instance.
-- process (xmit) data handed down by upper protocol layer
-- receive data from lower (hardware) layer
-- process connect indication from lower (hardware) layer
-- process disconnect indication from lower (hardware) layer
-
-
-The network interface driver accesses those primitives via callbacks
-provided by the encapsulation protocol instance within a
-struct concap_proto_ops.
-
-struct concap_proto_ops{
-
-	/* create a new encapsulation protocol instance of same type */
-	struct concap_proto *  (*proto_new) (void);
-
-	/* delete encapsulation protocol instance and free all its resources.
-	   cprot may no longer be referenced after calling this */
-	void (*proto_del)(struct concap_proto *cprot);
-
-	/* initialize the protocol's data. To be called at interface startup
-	   or when the device driver resets the interface. All services of the
-	   encapsulation protocol may be used after this*/
-	int (*restart)(struct concap_proto *cprot, 
-		       struct net_device *ndev,
-		       struct concap_device_ops *dops);
-
-	/* deactivate an encapsulation protocol instance. The encapsulation
-	   protocol may not call any *dops methods after this. */
-	int (*close)(struct concap_proto *cprot);
-
-	/* process a frame handed down to us by upper layer */
-	int (*encap_and_xmit)(struct concap_proto *cprot, struct sk_buff *skb);
-
-	/* to be called for each data entity received from lower layer*/ 
-	int (*data_ind)(struct concap_proto *cprot, struct sk_buff *skb);
-
-	/* to be called when a connection was set up/down.
-	   Protocols that don't process these primitives might fill in
-	   dummy methods here */
-	int (*connect_ind)(struct concap_proto *cprot);
-	int (*disconn_ind)(struct concap_proto *cprot);
-};
-
-
-The data structures are defined in the header file include/linux/concap.h.
-
-
-A Network interface using encapsulation protocols must also provide
-some service primitives to the encapsulation protocol:
-
-- request data being submitted by lower layer (device hardware) 
-- request a connection being set up by lower layer 
-- request a connection being released by lower layer
-
-The encapsulation protocol accesses those primitives via callbacks
-provided by the network interface within a struct concap_device_ops.
-
-struct concap_device_ops{
-
-	/* to request data be submitted by device */ 
-	int (*data_req)(struct concap_proto *, struct sk_buff *);
-
-	/* Control methods must be set to NULL by devices which do not
-	   support connection control. */
-	/* to request a connection be set up */ 
-	int (*connect_req)(struct concap_proto *);
-
-	/* to request a connection be released */
-	int (*disconn_req)(struct concap_proto *);	
-};
-
-The network interface does not explicitly provide a receive service
-because the encapsulation protocol directly calls netif_rx(). 
-
-
-
-
-An encapsulation protocol itself is actually the
-struct concap_proto{
-	struct net_device *net_dev;		/* net device using our service  */
-	struct concap_device_ops *dops; /* callbacks provided by device */
- 	struct concap_proto_ops  *pops; /* callbacks provided by us */
-	int flags;
-	void *proto_data;               /* protocol specific private data, to
-					   be accessed via *pops methods only*/
-	/*
-	  :
-	  whatever 
-	  :
-	  */
-};
-
-Most of this is filled in when the device requests the protocol to 
-be reset (opened). The network interface must provide the net_dev and
-dops pointers. Other concap_proto members should be considered private
-data that are only accessed by the pops callback functions. Likewise,
-a concap proto should access the network device's private data
-only by means of the callbacks referred to by the dops pointer.
-
-
-A possible extended device structure which uses the connection controlling
-encapsulation services could look like this:
-
-struct concap_device{
-	struct net_device net_dev;
-	struct my_priv  /* device->local stuff */
-			/* the my_priv struct might contain a 
-			   struct concap_device_ops *dops;
-	                   to provide the device specific callbacks
-			*/
-	struct concap_proto *cprot;        /* callbacks provided by protocol */
-};
-
-
-
-Misc Thoughts
-=============
-
-The concept of the concap proto might help to reuse protocol code and
-reduce the complexity of certain network interface implementations.
-The trade off is that it introduces yet another procedure call layer
-when processing the protocol. This has of course some impact on
-performance. However, typically the concap interface will be used by
-devices attached to slow lines (like telephone, isdn, leased synchronous
-lines). For such slow lines, the overhead is probably negligible.
-This might no longer hold for certain high speed WAN links (like
-ATM).
-
-
-If general linux network interfaces explicitly supported concap
-protocols (e.g. by a member struct concap_proto* in struct net_device)
-then the interface of the service function could be changed
-by passing a pointer of type (struct net_device*) instead of
-type (struct concap_proto*). Doing so would make many of the service
-functions compatible to network device support functions.
-
-e.g. instead of the concap protocol's service function
-
-  int (*encap_and_xmit)(struct concap_proto *cprot, struct sk_buff *skb);
-
-we could have
-
-  int (*encap_and_xmit)(struct net_device *ndev, struct sk_buff *skb);
-
-As this is compatible to the dev->hard_start_xmit() method, the device
-driver could directly register the concap protocol's encap_and_xmit()
-function as its hard_start_xmit() method. This would eliminate one
-procedure call layer.
-
-
-The device's data request function could also be defined as
- 
-  int (*data_req)(struct net_device *ndev, struct sk_buff *skb);
-
-This might even allow for some protocol stacking. And the network
-interface might even register the same data_req() function directly
-as its hard_start_xmit() method when a zero layer encapsulation
-protocol is configured. Thus, eliminating the performance penalty
-of the concap interface when a trivial concap protocol is used.
-Nevertheless, the device remains able to support encapsulation
-protocol configuration.
-
diff --git a/Documentation/isdn/README.diversion b/Documentation/isdn/README.diversion
deleted file mode 100644
index bddcd5fb86ff..000000000000
--- a/Documentation/isdn/README.diversion
+++ /dev/null
@@ -1,127 +0,0 @@
-The isdn diversion services are a supporting module working together with
-the isdn4linux and the HiSax module for passive cards. 
-Active cards, TAs and cards using a own or other driver than the HiSax 
-module need to be adapted to the HL<->LL interface described in a separate 
-document. The diversion services may be used with all cards supported by 
-the HiSax driver.
-The diversion kernel interface and controlling tool divertctrl were written
-by Werner Cornelius (werner@isdn4linux.de or werner@titro.de) under the
-GNU General Public License.
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
-Table of contents
-=================
-
-1. Features of the i4l diversion services 
-   (Or what can the i4l diversion services do for me)
-
-2. Required hard- and software
-
-3. Compiling, installing and loading/unloading the module  
-   Tracing calling and diversion information 
-
-4. Tracing calling and diversion information
- 
-5. Format of the divert device ASCII output
- 
-
-1. Features of the i4l diversion services 
-   (Or what can the i4l diversion services do for me)
-
-   The i4l diversion services offers call forwarding and logging normally 
-   only supported by isdn phones. Incoming calls may be diverted 
-   unconditionally (CFU), when not reachable (CFNR) or on busy condition 
-   (CFB). 
-   The diversions may be invoked statically in the providers exchange
-   as normally done by isdn phones. In this case all incoming calls
-   with a special (or all) service identifiers are forwarded if the 
-   forwarding reason is met. Activated static services may also be 
-   interrogated (queried).
-   The i4l diversion services additionally offers a dynamic version of
-   call forwarding which is not preprogrammed inside the providers exchange
-   but dynamically activated by i4l.
-   In this case all incoming calls are checked by rules that may be
-   compared to the mechanism of ipfwadm or ipchains. If a given rule matches
-   the checking process is finished and the rule matching will be applied
-   to the call.
-   The rules include primary and secondary service identifiers, called 
-   number and subaddress, callers number and subaddress and whether the rule
-   matches to all filtered calls or only those when all B-channel resources
-   are exhausted.
-   Actions that may be invoked by a rule are ignore, proceed, reject, 
-   direct divert or delayed divert of a call.
-   All incoming calls matching a rule except the ignore rule a reported and
-   logged as ASCII via the proc filesystem (/proc/net/isdn/divert). If proceed
-   is selected the call will be held in a proceeding state (without ringing)
-   for a certain amount of time to let an external program or client decide
-   how to handle the call. 
-            
-
-2. Required hard- and software
-   
-   For using the i4l diversion services the isdn line must be of a EURO/DSS1
-   type. Additionally the i4l services only work together with the HiSax 
-   driver for passive isdn cards. All HiSax supported cards may be used for
-   the diversion purposes.
-   The static diversion services require the provider having static services
-   CFU, CFNR, CFB activated on an MSN-line. The static services may not be 
-   used on a point-to-point connection. Further the static services are only
-   available in some countries (for example germany). Countries requiring the 
-   keypad protocol for activating static diversions (like the netherlands) are
-   not supported but may use the tty devices for this purpose.
-   The dynamic diversion services may be used in all countries if the provider
-   enables the feature CF (call forwarding). This should work on both MSN- and
-   point-to-point lines.
-   To add and delete rules the additional divertctrl program is needed. This
-   program is part of the isdn4kutils package.   
-
-3. Compiling, installing and loading/unloading the module  
-   Tracing calling and diversion information 
-
-
-   To compile the i4l code with diversion support you need to say yes to the 
-   DSS1 diversion services when selecting the i4l options in the kernel 
-   config (menuconfig or config).
-   After having properly activated a make modules and make modules_install all
-   required modules will be correctly installed in the needed modules dirs.
-   As the diversion services are currently not included in the scripts of most
-   standard distributions you will have to add a "insmod dss1_divert" after
-   having loaded the global isdn module.
-   The module can be loaded without any command line parameters.
-   If the module is actually loaded and active may be checked with a 
-   "cat /proc/modules" or "ls /proc/net/isdn/divert". The divert file is 
-   dynamically created by the diversion module and removed when the module is
-   unloaded.
-
-
-4. Tracing calling and diversion information
- 
-   You also may put a "cat /proc/net/isdn/divert" in the background with the
-   output redirected to a file. Then all actions of the module are logged.
-   The divert file in the proc system may be opened more than once, so in 
-   conjunction with inetd and a small remote client on other machines inside
-   your network incoming calls and reactions by the module may be shown on 
-   every listening machine. 
-   If a call is reported as proceeding an external program or client may 
-   specify during a certain amount of time (normally 4 to 10 seconds) what
-   to do with that call.      
-   To unload the module all open files to the device in the proc system must
-   be closed. Otherwise the module (and isdn.o) may not be unloaded. 
-
-5. Format of the divert device ASCII output
- 
-   To be done later
-
diff --git a/Documentation/isdn/README.fax b/Documentation/isdn/README.fax
deleted file mode 100644
index 5314958a8a6e..000000000000
--- a/Documentation/isdn/README.fax
+++ /dev/null
@@ -1,45 +0,0 @@
-
-Fax with isdn4linux
-===================
-
-When enabled during kernel configuration, the tty emulator
-of the ISDN subsystem is capable of the Fax Class 2 commands.
-
-This only makes sense under the following conditions :
-
-- You need the commands as dummy, because you are using
-  hylafax (with patch) for AVM capi.
-- You want to use the fax capabilities of your isdn-card.
-  (supported cards are listed below)
-
-
-NOTE: This implementation does *not* support fax with passive
-      ISDN-cards (known as softfax). The low-level driver of
-      the ISDN-card and/or the card itself must support this.
-
-
-Supported ISDN-Cards
---------------------
-
-Eicon DIVA Server BRI/PCI
-	- full support with both B-channels.
-
-Eicon DIVA Server 4BRI/PCI
-	- full support with all B-channels.
-
-Eicon DIVA Server PRI/PCI
-	- full support on amount of B-channels
-		depending on DSPs on board.
-
-
-
-The command set is known as Class 2 (not Class 2.0) and
-can be activated by AT+FCLASS=2
-
-
-The interface between the link-level-module and the hardware-level driver
-is described in the files INTERFACE.fax and INTERFACE.
-
-Armin
-mac@melware.de
-
diff --git a/Documentation/isdn/README.hfc-pci b/Documentation/isdn/README.hfc-pci
deleted file mode 100644
index e8a4ef0226e8..000000000000
--- a/Documentation/isdn/README.hfc-pci
+++ /dev/null
@@ -1,41 +0,0 @@
-The driver for the HFC-PCI and HFC-PCI-A chips from CCD may be used
-for many OEM cards using this chips.
-Additionally the driver has a special feature which makes it possible
-to read the echo-channel of the isdn bus. So all frames in both directions
-may be logged.
-When the echo logging feature is used the number of available B-channels
-for a HFC-PCI card is reduced to 1. Of course this is only relevant to
-the card, not to the isdn line.
-To activate the echo mode the following ioctls must be entered:
-
-hisaxctrl <driver/cardname> 10 1
-
-This reduces the available channels to 1. There must not be open connections
-through this card when entering the command.
-And then:
-
-hisaxctrl <driver/cardname> 12 1
-
-This enables the echo mode. If Hex logging is activated the isdnctrlx 
-devices show a output with a line beginning of HEX: for the providers
-exchange and ECHO: for isdn devices sending to the provider.
-
-If more than one HFC-PCI cards are installed, a specific card may be selected
-at the hisax module load command line. Supply the load command with the desired
-IO-address of the desired card. 
-Example:
-There tree cards installed in your machine at IO-base addresses 0xd000, 0xd400 
-and 0xdc00
-If you want to use the card at 0xd400 standalone you should supply the insmod
-or depmod with type=35 io=0xd400.
-If you want to use all three cards, but the order needs to be at 0xdc00,0xd400,
-0xd000 you may give the parameters type=35,35,35 io=0xdc00,0xd400,0xd00 
-Then the desired card will be the initialised in the desired order.
-If the io parameter is used the io addresses of all used cards should be 
-supplied else the parameter is assumed 0 and a auto search for a free card is
-invoked which may not give the wanted result. 
-
-Comments and reports to werner@isdn4linux.de or werner@isdn-development.de
-
-
-
diff --git a/Documentation/isdn/README.syncppp b/Documentation/isdn/README.syncppp
deleted file mode 100644
index 27d260095cce..000000000000
--- a/Documentation/isdn/README.syncppp
+++ /dev/null
@@ -1,58 +0,0 @@
-Some additional information for setting up a syncPPP
-connection using network interfaces.
----------------------------------------------------------------
-
-You need one thing beside the isdn4linux package:
-
-  a patched pppd .. (I called it ipppd to show the difference)
-
-Compiling isdn4linux with sync PPP:
------------------------------------
-To compile isdn4linux with the sync PPP part, you have
-to answer the appropriate question when doing a "make config"
-Don't forget to load the slhc.o
-module before the isdn.o module, if VJ-compression support
-is not compiled into your kernel. (e.g if you have no PPP or
-CSLIP in the kernel)
-
-Using isdn4linux with sync PPP:
--------------------------------
-Sync PPP is just another encapsulation for isdn4linux. The
-name to enable sync PPP encapsulation is 'syncppp' .. e.g:
-
-  /sbin/isdnctrl encap ippp0 syncppp
-
-The name of the interface is here 'ippp0'. You need 
-one interface with the name 'ippp0' to saturate the
-ipppd, which checks the ppp version via this interface.
-Currently, all devices must have the name ipppX where
-'X' is a decimal value.
-
-To set up a PPP connection you need the ipppd .. You must start 
-the ipppd once after installing the modules. The ipppd 
-communicates with the isdn4linux link-level driver using the
-/dev/ippp0 to /dev/ippp15 devices. One ipppd can handle
-all devices at once. If you want to use two PPP connections
-at the same time, you have to connect the ipppd to two
-devices .. and so on. 
-I've implemented one additional option for the ipppd:
- 'useifip' will get (if set to not 0.0.0.0) the IP address 
- for the negotiation from the attached network-interface. 
-(also: ipppd will try to negotiate pointopoint IP as remote IP)
-You must disable BSD-compression, this implementation can't
-handle compressed packets.
-
-Check the etc/rc.isdn.syncppp in the isdn4kernel-util package
-for an example setup script.
-
-To use the MPPP stuff, you must configure a slave device
-with isdn4linux. Now call the ipppd with the '+mp' option.
-To increase the number of links, you must use the
-'addlink' option of the isdnctrl tool. (rc.isdn.syncppp.MPPP is
-an example script)
-
-enjoy it,
-    michael
-     
-
-
diff --git a/Documentation/isdn/README.x25 b/Documentation/isdn/README.x25
deleted file mode 100644
index e561a77c4e22..000000000000
--- a/Documentation/isdn/README.x25
+++ /dev/null
@@ -1,184 +0,0 @@
-  
-X.25 support within isdn4linux
-==============================
-
-This is alpha/beta test code. Use it completely at your own risk.
-As new versions appear, the stuff described here might suddenly change
-or become invalid without notice.
-
-Keep in mind:
-
-You are using several new parts of the 2.2.x kernel series which
-have not been tested in a large scale. Therefore, you might encounter
-more bugs as usual.
-
-- If you connect to an X.25 neighbour not operated by yourself, ASK the
-  other side first. Be prepared that bugs in the protocol implementation
-  might result in problems.
-
-- This implementation has never wiped out my whole hard disk yet. But as
-  this is experimental code, don't blame me if that happened to you.
-  Backing up important data will never harm.
-
-- Monitor your isdn connections while using this software. This should
-  prevent you from undesired phone bills in case of driver problems.
-  
- 
-
-
-How to configure the kernel
-===========================
- 
-The ITU-T (former CCITT) X.25 network protocol layer has been implemented
-in the Linux source tree since version 2.1.16. The isdn subsystem might be 
-useful to run X.25 on top of ISDN. If you want to try it, select
-
-   "CCITT X.25 Packet Layer"
-
-from the networking options as well as
-
-   "ISDN Support" and "X.25 PLP on Top of ISDN"
-
-from the ISDN subsystem options when you configure your kernel for
-compilation. You currently also need to enable
-"Prompt for development and/or incomplete code/drivers" from the
-"Code maturity level options" menu. For the x25trace utility to work
-you also need to enable "Packet socket".
-
-For local testing it is also recommended to enable the isdnloop driver
-from the isdn subsystem's configuration menu.
-
-For testing, it is recommended that all isdn drivers and the X.25 PLP
-protocol are compiled as loadable modules. Like this, you can recover
-from certain errors by simply unloading and reloading the modules.
-
-
-
-What's it for? How to use it?
-=============================
-
-X.25 on top of isdn might be useful with two different scenarios:
-
-- You might want to access a public X.25 data network from your Linux box.
-  You can use i4l if you were physically connected to the X.25 switch
-  by an ISDN B-channel (leased line as well as dial up connection should
-  work).
-
-  This corresponds to ITU-T recommendation X.31 Case A (circuit-mode
-  access to PSPDN [packet switched public data network]).
-
-  NOTE: X.31 also covers a Case B (access to PSPDN via virtual
-  circuit / packet mode service). The latter mode (which in theory
-  also allows using the D-channel) is not supported by isdn4linux.
-  It should however be possible to establish such packet mode connections
-  with certain active isdn cards provided that the firmware supports X.31
-  and the driver exports this functionality to the user. Currently, 
-  the AVM B1 driver is the only driver which does so. (It should be
-  possible to access D-channel X.31 with active AVM cards using the
-  CAPI interface of the AVM-B1 driver).
-
-- Or you might want to operate certain ISDN teleservices on your linux
-  box. A lot of those teleservices run on top of the ISO-8208
-  (DTE-DTE mode) network layer protocol. ISO-8208 is essentially the
-  same as ITU-T X.25.
-
-  Popular candidates of such teleservices are EUROfile transfer or any
-  teleservice applying ITU-T recommendation T.90.
-
-To use the X.25 protocol on top of isdn, just create an isdn network
-interface as usual, configure your own and/or peer's ISDN numbers,
-and choose x25iface encapsulation by
-
-   isdnctrl encap <iface-name> x25iface.
-
-Once encap is set like this, the device can be used by the X.25 packet layer.
-
-All the stuff needed for X.25 is implemented inside the isdn link
-level (mainly isdn_net.c and some new source files). Thus, it should
-work with every existing HL driver. I was able to successfully open X.25
-connections on top of the isdnloop driver and the hisax driver.
-"x25iface"-encapsulation bypasses demand dialing. Dialing will be
-initiated when the upper (X.25 packet) layer requests the lapb datalink to
-be established. But hangup timeout is still active. Whenever a hangup
-occurs, all existing X.25 connections on that link will be cleared
-It is recommended to use sufficiently large hangup-timeouts for the
-isdn interfaces.
-
-
-In order to set up a conforming protocol stack you also need to
-specify the proper l2_prot parameter:
-
-To operate in ISO-8208  X.25 DTE-DTE mode, use
-
-   isdnctrl l2_prot <iface-name> x75i
-
-To access an X.25 network switch via isdn (your linux box is the DTE), use
-
-   isdnctrl l2_prot <iface-name> x25dte
-
-To mimic an X.25 network switch (DCE side of the connection), use
-
-   isdnctrl l2_prot <iface-name> x25dce
-
-However, x25dte or x25dce is currently not supported by any real HL
-level driver. The main difference between x75i and x25dte/dce is that
-x25d[tc]e uses fixed lap_b addresses. With x75i, the side which
-initiates the isdn connection uses the DTE's lap_b address while the
-called side used the DCE's lap_b address. Thus, l2_prot x75i might
-probably work if you access a public X.25 network as long as the
-corresponding isdn connection is set up by you. At least one test
-was successful to connect via isdn4linux to an X.25 switch using this
-trick. At the switch side, a terminal adapter X.21 was used to connect
-it to the isdn.
-
-
-How to set up a test installation?
-==================================
-
-To test X.25 on top of isdn, you need to get
-
-- a recent version of the "isdnctrl" program that supports setting the new
-  X.25 specific parameters.
-
-- the x25-utils-2.X package from 
-  ftp://ftp.hes.iki.fi/pub/ham/linux/ax25/x25utils-*
-  (don't confuse the x25-utils with the ax25-utils)
-
-- an application program that uses linux PF_X25 sockets (some are
-  contained in the x25-util package).
-
-Before compiling the user level utilities make sure that the compiler/
-preprocessor will fetch the proper kernel header files of this kernel
-source tree. Either make /usr/include/linux a symbolic link pointing to 
-this kernel's include/linux directory or set the appropriate compiler flags.
-
-When all drivers and interfaces are loaded and configured you need to
-ifconfig the network interfaces up and add X.25-routes to them. Use
-the usual ifconfig tool.
-
-ifconfig <iface-name> up
-
-But a special x25route tool (distributed with the x25-util package)
-is needed to set up X.25 routes. I.e. 
-
-x25route add 01 <iface-name>
-
-will cause all x.25 connections to the destination X.25-address
-"01" to be routed to your created isdn network interface.
-
-There are currently no real X.25 applications available. However, for
-tests, the x25-utils package contains a modified version of telnet
-and telnetd that uses X.25 sockets instead of tcp/ip sockets. You can
-use those for your first tests. Furthermore, you might check
-ftp://ftp.hamburg.pop.de/pub/LOCAL/linux/i4l-eft/ which contains some
-alpha-test implementation ("eftp4linux") of the EUROfile transfer
-protocol.
-
-The scripts distributed with the eftp4linux test releases might also
-provide useful examples for setting up X.25 on top of isdn.
-
-The x25-utility package also contains an x25trace tool that can be
-used to monitor X.25 packets received by the network interfaces.
-The /proc/net/x25* files also contain useful information. 
-
-- Henner
diff --git a/Documentation/isdn/syncPPP.FAQ b/Documentation/isdn/syncPPP.FAQ
deleted file mode 100644
index 3257a4bc0786..000000000000
--- a/Documentation/isdn/syncPPP.FAQ
+++ /dev/null
@@ -1,224 +0,0 @@
-simple isdn4linux PPP FAQ .. to be continued .. not 'debugged' 
--------------------------------------------------------------------
-
-Q01: what's pppd, ipppd, syncPPP, asyncPPP ??
-Q02: error message "this system lacks PPP support"
-Q03: strange information using 'ifconfig'
-Q04: MPPP?? What's that and how can I use it ...
-Q05: I tried MPPP but it doesn't work 
-Q06: can I use asynchronous PPP encapsulation with network devices
-Q07: A SunISDN machine can't connect to my i4l system
-Q08: I wanna talk to several machines, which need different configs
-Q09: Starting the ipppd, I get only error messages from i4l
-Q10: I wanna use dynamic IP address assignment 
-Q11: I can't connect. How can I check where the problem is.
-Q12: How can I reduce login delay? 
-
--------------------------------------------------------------------
-
-Q01: pppd, ipppd, syncPPP, asyncPPP .. what is that ?
-   what should I use?
-A: The pppd is for asynchronous PPP .. asynchronous means
-   here, the framing is character based. (e.g when
-   using ttyI* or tty* devices)
-
-   The ipppd handles PPP packets coming in HDLC
-   frames (bit based protocol) ... The PPP driver
-   in isdn4linux pushes all IP packets direct
-   to the network layer and all PPP protocol
-   frames to the /dev/ippp* device. 
-   So, the ipppd is a simple external network
-   protocol handler.
-
-   If you login into a remote machine using the
-   /dev/ttyI* devices and then enable PPP on the
-   remote terminal server -> use the 'old' pppd
-
-   If your remote side immediately starts to send
-   frames ... you probably connect to a 
-   syncPPP machine .. use the network device part
-   of isdn4linux with the 'syncppp' encapsulation
-   and make sure, that the ipppd is running and 
-   connected to at least one /dev/ippp*. Check the 
-   isdn4linux manual on how to configure a network device.
-
---
-
-Q02: when I start the ipppd .. I only get the
-   error message "this system lacks PPP support"
-A: check that at least the device 'ippp0' exists.
-   (you can check this e.g with the program 'ifconfig')
-   The ipppd NEEDS this device under THIS name .. 
-   If this device doesn't exists, use:
-	isdnctrl addif ippp0
-	isdnctrl encap ippp0 syncppp
-	... (see isdn4linux doc for more) ...
-A: Maybe you have compiled the ipppd with another
-   kernel source tree than the kernel you currently
-   run ... 
-
---
-
-Q03: when I list the netdevices with ifconfig I see, that
-   my ISDN interface has a HWaddr and IRQ=0 and Base 
-   address = 0 
-A: The device is a fake ethernet device .. ignore IRQ and baseaddr
-   You need the HWaddr only for ethernet encapsulation.
-   
---
-
-Q04: MPPP?? What's that and how can I use it ...
-
-A: MPPP or MP or MPP (Warning: MP is also an 
-   acronym for 'Multi Processor') stands for
-   Multi Point to Point and means bundling
-   of several channels to one logical stream.
-   To enable MPPP negotiation you must call the
-   ipppd with the '+mp' option. 
-   You must also configure a slave device for
-   every additional channel. (see the i4l manual
-   for more)
-   To use channel bundling you must first activate
-   the 'master' or initial call. Now you can add 
-   the slave channels with the command:
-       isdnctrl addlink <device>
-   e.g:
-       isdnctrl addlink ippp0
-   This is different from other encapsulations of
-   isdn4linux! With syncPPP, there is no automatic
-   activation of slave devices.
-
---
-
-Q05: I tried MPPP but it doesn't work .. the ipppd
-   writes in the debug log something like:
-   .. rcvd [0][proto=0x3d] c0 00 00 00 80 fd 01 01 00 0a ...
-   .. sent [0][LCP ProtRej id=0x2 00 3d c0 00 00 00 80 fd 01 ...
-
-A: you forgot to compile MPPP/RFC1717 support into the
-   ISDN Subsystem. Recompile with this option enabled.
-
---
-
-Q06: can I use asynchronous PPP encapsulation
-   over the network interface of isdn4linux ..
-
-A: No .. that's not possible .. Use the standard
-   PPP package over the /dev/ttyI* devices. You
-   must not use the ipppd for this.
-   
---
-
-Q07: A SunISDN machine tries to connect my i4l system,
-   which doesn't work.
-   Checking the debug log I just saw garbage like:
-!![ ... fill in the line ... ]!!
-
-A: The Sun tries to talk asynchronous PPP ... i4l
-   can't understand this ... try to use the ttyI*
-   devices with the standard PPP/pppd package
-
-A: (from Alexanter Strauss: )
-!![ ... fill in mail ]!!
-
---
-
-Q08: I wanna talk to remote machines, which need
-   a different configuration. The only way
-   I found to do this is to kill the ipppd and
-   start a new one with another config to connect
-   to the second machine. 
-
-A: you must bind a network interface explicitly to
-   an ippp device, where you can connect a (for this
-   interface) individually configured ipppd.
-
---
-
-Q09: When I start the ipppd I only get error messages
-   from the i4l driver .. 
-
-A: When starting, the ipppd calls functions which may 
-   trigger a network packet. (e.g gethostbyname()).
-   Without the ipppd (at this moment, it is not
-   fully started) we can't handle this network request.
-   Try to configure hostnames necessary for the ipppd
-   in your local /etc/hosts file or in a way, that
-   your system can resolve it without using an
-   isdn/ippp network-interface.
-
---
-
-Q10: I wanna use dynamic IP address assignment ... How 
-   must I configure the network device.
-
-A: At least you must have a route which forwards
-   a packet to the ippp network-interface to trigger
-   the dial-on-demand.
-   A default route to the ippp-interface will work.
-   Now you must choose a dummy IP address for your
-   interface.
-   If for some reason you can't set the default
-   route to the ippp interface, you may take any 
-   address of the subnet from which you expect your
-   dynamic IP number and set a 'network route' for
-   this subnet to the ippp interface.
-   To allow overriding of the dummy address you
-   must call the ipppd with the 'ipcp-accept-local' option.
-
-A: You must know, how the ipppd gets the addresses it wanna
-   configure. If you don't give any option, the ipppd
-   tries to negotiate the local host address!
-   With the option 'noipdefault' it requests an address
-   from the remote machine. With 'useifip' it gets the
-   addresses from the net interface. Or you set the address
-   on the option line with the <a.b.c.d:e.f.g.h> option.
-   Note: the IP address of the remote machine must be configured
-   locally or the remote machine must send it in an IPCP request.
-   If your side doesn't know the IP address after negotiation, it
-   closes the connection!
-   You must allow overriding of address with the 'ipcp-accept-*'
-   options, if you have set your own or the remote address 
-   explicitly.
-
-A: Maybe you try these options .. e.g:   
-
-    /sbin/ipppd :$REMOTE noipdefault /dev/ippp0
-
-   where REMOTE must be the address of the remote machine (the
-   machine, which gives you your address)
-
---
-
-Q11: I can't connect. How can I check where the problem is.
-
-A: A good help log is the debug output from the ipppd...
-   Check whether you can find there:
-   - only a few LCP-conf-req SENT messages (less then 10)
-     and then a Term-REQ:
-     -> check whether your ISDN card is well configured
-        it seems, that your machine doesn't dial
-        (IRQ,IO,Proto, etc problems)
-        Configure your ISDN card to print debug messages and
-        check the /dev/isdnctrl output next time. There
-        you can see, whether there is activity on the card/line.
-   - there are at least a few RECV messages in the log:
-     -> fine: your card is dialing and your remote machine
-        tries to talk with you. Maybe only a missing 
-        authentication. Check your ipppd configuration again.
-   - the ipppd exits for some reason:
-     -> not good ... check /var/adm/syslog and /var/adm/daemon.
-        Could be a bug in the ipppd.
-
---
-
-Q12: How can I reduce login delay?
-
-A: Log a login session ('debug' log) and check which options 
-  your remote side rejects. Next time configure your ipppd
-  to not negotiate these options. Another 'side effect' is, that
-  this increases redundancy. (e.g your remote side is buggy and
-  rejects options in a wrong way).
-
-
-
diff --git a/Documentation/process/changes.rst b/Documentation/process/changes.rst
index 18735dc460a0..111636ad1bad 100644
--- a/Documentation/process/changes.rst
+++ b/Documentation/process/changes.rst
@@ -23,8 +23,8 @@ running, the suggested command should tell you.
 
 Again, keep in mind that this list assumes you are already functionally
 running a Linux kernel.  Also, not all tools are necessary on all
-systems; obviously, if you don't have any ISDN hardware, for example,
-you probably needn't concern yourself with isdn4k-utils.
+systems; obviously, if you don't have any PC Card hardware, for example,
+you probably needn't concern yourself with pcmciautils.
 
 ====================== ===============  ========================================
         Program        Minimal version       Command to check the version
@@ -45,7 +45,6 @@ btrfs-progs            0.18             btrfsck
 pcmciautils            004              pccardctl -V
 quota-tools            3.09             quota -V
 PPP                    2.4.0            pppd --version
-isdn4k-utils           3.1pre1          isdnctrl 2>&1|grep version
 nfs-utils              1.0.5            showmount --version
 procps                 3.2.0            ps --version
 oprofile               0.9              oprofiled --version
@@ -279,12 +278,6 @@ which can be made by::
 
 as root.
 
-Isdn4k-utils
-------------
-
-Due to changes in the length of the phone number field, isdn4k-utils
-needs to be recompiled or (preferably) upgraded.
-
 NFS-utils
 ---------
 
@@ -448,11 +441,6 @@ PPP
 
 - <ftp://ftp.samba.org/pub/ppp/>
 
-Isdn4k-utils
-------------
-
-- <ftp://ftp.isdn4linux.de/pub/isdn4linux/utils/>
-
 NFS-utils
 ---------
 
diff --git a/MAINTAINERS b/MAINTAINERS
index 0c55b0fedbe2..3a761e680296 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -8371,9 +8371,7 @@ T:	git git://git.kernel.org/pub/scm/linux/kernel/git/kkeil/isdn-2.6.git
 S:	Maintained
 F:	Documentation/isdn/
 F:	drivers/isdn/
-F:	include/linux/isdn.h
 F:	include/linux/isdn/
-F:	include/uapi/linux/isdn.h
 F:	include/uapi/linux/isdn/
 
 IT87 HARDWARE MONITORING DRIVER
diff --git a/drivers/isdn/Kconfig b/drivers/isdn/Kconfig
index 1ca4d70d198a..6e3bf833c67e 100644
--- a/drivers/isdn/Kconfig
+++ b/drivers/isdn/Kconfig
@@ -21,27 +21,6 @@ menuconfig ISDN
 
 if ISDN
 
-menuconfig ISDN_I4L
-	tristate "Old ISDN4Linux (deprecated)"
-	depends on TTY
-	---help---
-	  This driver allows you to use an ISDN adapter for networking
-	  connections and as dialin/out device.  The isdn-tty's have a built
-	  in AT-compatible modem emulator.  Network devices support autodial,
-	  channel-bundling, callback and caller-authentication without having
-	  a daemon running.  A reduced T.70 protocol is supported with tty's
-	  suitable for German BTX.  On D-Channel, the protocols EDSS1
-	  (Euro-ISDN) and 1TR6 (German style) are supported.  See
-	  <file:Documentation/isdn/README> for more information.
-
-	  ISDN support in the linux kernel is moving towards a new API,
-	  called CAPI (Common ISDN Application Programming Interface).
-	  Therefore the old ISDN4Linux layer will eventually become obsolete.
-	  It is still available, though, for use with adapters that are not
-	  supported by the new CAPI subsystem yet.
-
-source "drivers/isdn/i4l/Kconfig"
-
 menuconfig ISDN_CAPI
 	tristate "CAPI 2.0 subsystem"
 	help
@@ -71,9 +50,4 @@ source "drivers/isdn/hysdn/Kconfig"
 
 source "drivers/isdn/mISDN/Kconfig"
 
-config ISDN_HDLC
-	tristate
-	select CRC_CCITT
-	select BITREVERSE
-
 endif # ISDN
diff --git a/drivers/isdn/Makefile b/drivers/isdn/Makefile
index 7487f0bbe855..379b4a03c321 100644
--- a/drivers/isdn/Makefile
+++ b/drivers/isdn/Makefile
@@ -7,7 +7,5 @@ obj-$(CONFIG_ISDN_I4L)			+= i4l/
 obj-$(CONFIG_ISDN_CAPI)			+= capi/
 obj-$(CONFIG_MISDN)			+= mISDN/
 obj-$(CONFIG_ISDN)			+= hardware/
-obj-$(CONFIG_ISDN_DIVERSION)		+= divert/
-obj-$(CONFIG_ISDN_DRV_LOOP)		+= isdnloop/
 obj-$(CONFIG_HYSDN)			+= hysdn/
 obj-$(CONFIG_ISDN_DRV_GIGASET)		+= gigaset/
diff --git a/drivers/isdn/capi/Kconfig b/drivers/isdn/capi/Kconfig
index abaadce376c5..089dbee18f36 100644
--- a/drivers/isdn/capi/Kconfig
+++ b/drivers/isdn/capi/Kconfig
@@ -27,15 +27,6 @@ config ISDN_CAPI_MIDDLEWARE
 	  device.  If you want to use pppd with pppdcapiplugin to dial up to
 	  your ISP, say Y here.
 
-config ISDN_CAPI_CAPIDRV
-	tristate "CAPI2.0 capidrv interface support"
-	depends on ISDN_I4L
-	help
-	  This option provides the glue code to hook up CAPI driven cards to
-	  the legacy isdn4linux link layer.  If you have a card which is
-	  supported by a CAPI driver, but still want to use old features like
-	  ippp interfaces or ttyI emulation, say Y/M here.
-
 config ISDN_CAPI_CAPIDRV_VERBOSE
 	bool "Verbose reason code reporting"
 	depends on ISDN_CAPI_CAPIDRV
diff --git a/drivers/isdn/capi/capidrv.c b/drivers/isdn/capi/capidrv.c
deleted file mode 100644
index e8949f3dcae1..000000000000
--- a/drivers/isdn/capi/capidrv.c
+++ /dev/null
@@ -1,2525 +0,0 @@
-/* $Id: capidrv.c,v 1.1.2.2 2004/01/12 23:17:24 keil Exp $
- *
- * ISDN4Linux Driver, using capi20 interface (kernelcapi)
- *
- * Copyright 1997 by Carsten Paeth <calle@calle.de>
- *
- * This software may be used and distributed according to the terms
- * of the GNU General Public License, incorporated herein by reference.
- *
- */
-
-#include <linux/compiler.h>
-#include <linux/module.h>
-#include <linux/errno.h>
-#include <linux/kernel.h>
-#include <linux/major.h>
-#include <linux/slab.h>
-#include <linux/fcntl.h>
-#include <linux/fs.h>
-#include <linux/signal.h>
-#include <linux/mm.h>
-#include <linux/timer.h>
-#include <linux/wait.h>
-#include <linux/skbuff.h>
-#include <linux/isdn.h>
-#include <linux/isdnif.h>
-#include <linux/proc_fs.h>
-#include <linux/seq_file.h>
-#include <linux/capi.h>
-#include <linux/kernelcapi.h>
-#include <linux/ctype.h>
-#include <linux/init.h>
-#include <linux/moduleparam.h>
-
-#include <linux/isdn/capiutil.h>
-#include <linux/isdn/capicmd.h>
-#include "capidrv.h"
-
-static int debugmode = 0;
-
-MODULE_DESCRIPTION("CAPI4Linux: Interface to ISDN4Linux");
-MODULE_AUTHOR("Carsten Paeth");
-MODULE_LICENSE("GPL");
-module_param(debugmode, uint, S_IRUGO | S_IWUSR);
-
-/* -------- type definitions ----------------------------------------- */
-
-
-struct capidrv_contr {
-
-	struct capidrv_contr *next;
-	struct module *owner;
-	u32 contrnr;
-	char name[20];
-
-	/*
-	 * for isdn4linux
-	 */
-	isdn_if interface;
-	int myid;
-
-	/*
-	 * LISTEN state
-	 */
-	int state;
-	u32 cipmask;
-	u32 cipmask2;
-	struct timer_list listentimer;
-
-	/*
-	 * ID of capi message sent
-	 */
-	u16 msgid;
-
-	/*
-	 * B-Channels
-	 */
-	int nbchan;
-	struct capidrv_bchan {
-		struct capidrv_contr *contr;
-		u8 msn[ISDN_MSNLEN];
-		int l2;
-		int l3;
-		u8 num[ISDN_MSNLEN];
-		u8 mynum[ISDN_MSNLEN];
-		int si1;
-		int si2;
-		int incoming;
-		int disconnecting;
-		struct capidrv_plci {
-			struct capidrv_plci *next;
-			u32 plci;
-			u32 ncci;	/* ncci for CONNECT_ACTIVE_IND */
-			u16 msgid;	/* to identfy CONNECT_CONF */
-			int chan;
-			int state;
-			int leasedline;
-			struct capidrv_ncci {
-				struct capidrv_ncci *next;
-				struct capidrv_plci *plcip;
-				u32 ncci;
-				u16 msgid;	/* to identfy CONNECT_B3_CONF */
-				int chan;
-				int state;
-				int oldstate;
-				/* */
-				u16 datahandle;
-				struct ncci_datahandle_queue {
-					struct ncci_datahandle_queue *next;
-					u16                         datahandle;
-					int                           len;
-				} *ackqueue;
-			} *ncci_list;
-		} *plcip;
-		struct capidrv_ncci *nccip;
-	} *bchans;
-
-	struct capidrv_plci *plci_list;
-
-	/* for q931 data */
-	u8  q931_buf[4096];
-	u8 *q931_read;
-	u8 *q931_write;
-	u8 *q931_end;
-};
-
-
-struct capidrv_data {
-	struct capi20_appl ap;
-	int ncontr;
-	struct capidrv_contr *contr_list;
-};
-
-typedef struct capidrv_plci capidrv_plci;
-typedef struct capidrv_ncci capidrv_ncci;
-typedef struct capidrv_contr capidrv_contr;
-typedef struct capidrv_data capidrv_data;
-typedef struct capidrv_bchan capidrv_bchan;
-
-/* -------- data definitions ----------------------------------------- */
-
-static capidrv_data global;
-static DEFINE_SPINLOCK(global_lock);
-
-static void handle_dtrace_data(capidrv_contr *card,
-			       int send, int level2, u8 *data, u16 len);
-
-/* -------- convert functions ---------------------------------------- */
-
-static inline u32 b1prot(int l2, int l3)
-{
-	switch (l2) {
-	case ISDN_PROTO_L2_X75I:
-	case ISDN_PROTO_L2_X75UI:
-	case ISDN_PROTO_L2_X75BUI:
-		return 0;
-	case ISDN_PROTO_L2_HDLC:
-	default:
-		return 0;
-	case ISDN_PROTO_L2_TRANS:
-		return 1;
-	case ISDN_PROTO_L2_V11096:
-	case ISDN_PROTO_L2_V11019:
-	case ISDN_PROTO_L2_V11038:
-		return 2;
-	case ISDN_PROTO_L2_FAX:
-		return 4;
-	case ISDN_PROTO_L2_MODEM:
-		return 8;
-	}
-}
-
-static inline u32 b2prot(int l2, int l3)
-{
-	switch (l2) {
-	case ISDN_PROTO_L2_X75I:
-	case ISDN_PROTO_L2_X75UI:
-	case ISDN_PROTO_L2_X75BUI:
-	default:
-		return 0;
-	case ISDN_PROTO_L2_HDLC:
-	case ISDN_PROTO_L2_TRANS:
-	case ISDN_PROTO_L2_V11096:
-	case ISDN_PROTO_L2_V11019:
-	case ISDN_PROTO_L2_V11038:
-	case ISDN_PROTO_L2_MODEM:
-		return 1;
-	case ISDN_PROTO_L2_FAX:
-		return 4;
-	}
-}
-
-static inline u32 b3prot(int l2, int l3)
-{
-	switch (l2) {
-	case ISDN_PROTO_L2_X75I:
-	case ISDN_PROTO_L2_X75UI:
-	case ISDN_PROTO_L2_X75BUI:
-	case ISDN_PROTO_L2_HDLC:
-	case ISDN_PROTO_L2_TRANS:
-	case ISDN_PROTO_L2_V11096:
-	case ISDN_PROTO_L2_V11019:
-	case ISDN_PROTO_L2_V11038:
-	case ISDN_PROTO_L2_MODEM:
-	default:
-		return 0;
-	case ISDN_PROTO_L2_FAX:
-		return 4;
-	}
-}
-
-static _cstruct b1config_async_v110(u16 rate)
-{
-	/* CAPI-Spec "B1 Configuration" */
-	static unsigned char buf[9];
-	buf[0] = 8; /* len */
-	/* maximum bitrate */
-	buf[1] = rate & 0xff; buf[2] = (rate >> 8) & 0xff;
-	buf[3] = 8; buf[4] = 0; /* 8 bits per character */
-	buf[5] = 0; buf[6] = 0; /* parity none */
-	buf[7] = 0; buf[8] = 0; /* 1 stop bit */
-	return buf;
-}
-
-static _cstruct b1config(int l2, int l3)
-{
-	switch (l2) {
-	case ISDN_PROTO_L2_X75I:
-	case ISDN_PROTO_L2_X75UI:
-	case ISDN_PROTO_L2_X75BUI:
-	case ISDN_PROTO_L2_HDLC:
-	case ISDN_PROTO_L2_TRANS:
-	default:
-		return NULL;
-	case ISDN_PROTO_L2_V11096:
-		return b1config_async_v110(9600);
-	case ISDN_PROTO_L2_V11019:
-		return b1config_async_v110(19200);
-	case ISDN_PROTO_L2_V11038:
-		return b1config_async_v110(38400);
-	}
-}
-
-static inline u16 si2cip(u8 si1, u8 si2)
-{
-	static const u8 cip[17][5] =
-		{
-			/*  0  1  2  3  4  */
-			{0, 0, 0, 0, 0},	/*0 */
-			{16, 16, 4, 26, 16},	/*1 */
-			{17, 17, 17, 4, 4},	/*2 */
-			{2, 2, 2, 2, 2},	/*3 */
-			{18, 18, 18, 18, 18},	/*4 */
-			{2, 2, 2, 2, 2},	/*5 */
-			{0, 0, 0, 0, 0},	/*6 */
-			{2, 2, 2, 2, 2},	/*7 */
-			{2, 2, 2, 2, 2},	/*8 */
-			{21, 21, 21, 21, 21},	/*9 */
-			{19, 19, 19, 19, 19},	/*10 */
-			{0, 0, 0, 0, 0},	/*11 */
-			{0, 0, 0, 0, 0},	/*12 */
-			{0, 0, 0, 0, 0},	/*13 */
-			{0, 0, 0, 0, 0},	/*14 */
-			{22, 22, 22, 22, 22},	/*15 */
-			{27, 27, 27, 28, 27}	/*16 */
-		};
-	if (si1 > 16)
-		si1 = 0;
-	if (si2 > 4)
-		si2 = 0;
-
-	return (u16) cip[si1][si2];
-}
-
-static inline u8 cip2si1(u16 cipval)
-{
-	static const u8 si[32] =
-		{7, 1, 7, 7, 1, 1, 7, 7,	/*0-7 */
-		 7, 1, 0, 0, 0, 0, 0, 0,	/*8-15 */
-		 1, 2, 4, 10, 9, 9, 15, 7,	/*16-23 */
-		 7, 7, 1, 16, 16, 0, 0, 0};	/*24-31 */
-
-	if (cipval > 31)
-		cipval = 0;	/* .... */
-	return si[cipval];
-}
-
-static inline u8 cip2si2(u16 cipval)
-{
-	static const u8 si[32] =
-		{0, 0, 0, 0, 2, 3, 0, 0,	/*0-7 */
-		 0, 3, 0, 0, 0, 0, 0, 0,	/*8-15 */
-		 1, 2, 0, 0, 9, 0, 0, 0,	/*16-23 */
-		 0, 0, 3, 2, 3, 0, 0, 0};	/*24-31 */
-
-	if (cipval > 31)
-		cipval = 0;	/* .... */
-	return si[cipval];
-}
-
-
-/* -------- controller management ------------------------------------- */
-
-static inline capidrv_contr *findcontrbydriverid(int driverid)
-{
-	unsigned long flags;
-	capidrv_contr *p;
-
-	spin_lock_irqsave(&global_lock, flags);
-	for (p = global.contr_list; p; p = p->next)
-		if (p->myid == driverid)
-			break;
-	spin_unlock_irqrestore(&global_lock, flags);
-	return p;
-}
-
-static capidrv_contr *findcontrbynumber(u32 contr)
-{
-	unsigned long flags;
-	capidrv_contr *p = global.contr_list;
-
-	spin_lock_irqsave(&global_lock, flags);
-	for (p = global.contr_list; p; p = p->next)
-		if (p->contrnr == contr)
-			break;
-	spin_unlock_irqrestore(&global_lock, flags);
-	return p;
-}
-
-
-/* -------- plci management ------------------------------------------ */
-
-static capidrv_plci *new_plci(capidrv_contr *card, int chan)
-{
-	capidrv_plci *plcip;
-
-	plcip = kzalloc(sizeof(capidrv_plci), GFP_ATOMIC);
-
-	if (plcip == NULL)
-		return NULL;
-
-	plcip->state = ST_PLCI_NONE;
-	plcip->plci = 0;
-	plcip->msgid = 0;
-	plcip->chan = chan;
-	plcip->next = card->plci_list;
-	card->plci_list = plcip;
-	card->bchans[chan].plcip = plcip;
-
-	return plcip;
-}
-
-static capidrv_plci *find_plci_by_plci(capidrv_contr *card, u32 plci)
-{
-	capidrv_plci *p;
-	for (p = card->plci_list; p; p = p->next)
-		if (p->plci == plci)
-			return p;
-	return NULL;
-}
-
-static capidrv_plci *find_plci_by_msgid(capidrv_contr *card, u16 msgid)
-{
-	capidrv_plci *p;
-	for (p = card->plci_list; p; p = p->next)
-		if (p->msgid == msgid)
-			return p;
-	return NULL;
-}
-
-static capidrv_plci *find_plci_by_ncci(capidrv_contr *card, u32 ncci)
-{
-	capidrv_plci *p;
-	for (p = card->plci_list; p; p = p->next)
-		if (p->plci == (ncci & 0xffff))
-			return p;
-	return NULL;
-}
-
-static void free_plci(capidrv_contr *card, capidrv_plci *plcip)
-{
-	capidrv_plci **pp;
-
-	for (pp = &card->plci_list; *pp; pp = &(*pp)->next) {
-		if (*pp == plcip) {
-			*pp = (*pp)->next;
-			card->bchans[plcip->chan].plcip = NULL;
-			card->bchans[plcip->chan].disconnecting = 0;
-			card->bchans[plcip->chan].incoming = 0;
-			kfree(plcip);
-			return;
-		}
-	}
-	printk(KERN_ERR "capidrv-%d: free_plci %p (0x%x) not found, Huh?\n",
-	       card->contrnr, plcip, plcip->plci);
-}
-
-/* -------- ncci management ------------------------------------------ */
-
-static inline capidrv_ncci *new_ncci(capidrv_contr *card,
-				     capidrv_plci *plcip,
-				     u32 ncci)
-{
-	capidrv_ncci *nccip;
-
-	nccip = kzalloc(sizeof(capidrv_ncci), GFP_ATOMIC);
-
-	if (nccip == NULL)
-		return NULL;
-
-	nccip->ncci = ncci;
-	nccip->state = ST_NCCI_NONE;
-	nccip->plcip = plcip;
-	nccip->chan = plcip->chan;
-	nccip->datahandle = 0;
-
-	nccip->next = plcip->ncci_list;
-	plcip->ncci_list = nccip;
-
-	card->bchans[plcip->chan].nccip = nccip;
-
-	return nccip;
-}
-
-static inline capidrv_ncci *find_ncci(capidrv_contr *card, u32 ncci)
-{
-	capidrv_plci *plcip;
-	capidrv_ncci *p;
-
-	if ((plcip = find_plci_by_ncci(card, ncci)) == NULL)
-		return NULL;
-
-	for (p = plcip->ncci_list; p; p = p->next)
-		if (p->ncci == ncci)
-			return p;
-	return NULL;
-}
-
-static inline capidrv_ncci *find_ncci_by_msgid(capidrv_contr *card,
-					       u32 ncci, u16 msgid)
-{
-	capidrv_plci *plcip;
-	capidrv_ncci *p;
-
-	if ((plcip = find_plci_by_ncci(card, ncci)) == NULL)
-		return NULL;
-
-	for (p = plcip->ncci_list; p; p = p->next)
-		if (p->msgid == msgid)
-			return p;
-	return NULL;
-}
-
-static void free_ncci(capidrv_contr *card, struct capidrv_ncci *nccip)
-{
-	struct capidrv_ncci **pp;
-
-	for (pp = &(nccip->plcip->ncci_list); *pp; pp = &(*pp)->next) {
-		if (*pp == nccip) {
-			*pp = (*pp)->next;
-			break;
-		}
-	}
-	card->bchans[nccip->chan].nccip = NULL;
-	kfree(nccip);
-}
-
-static int capidrv_add_ack(struct capidrv_ncci *nccip,
-			   u16 datahandle, int len)
-{
-	struct ncci_datahandle_queue *n, **pp;
-
-	n = kmalloc(sizeof(struct ncci_datahandle_queue), GFP_ATOMIC);
-	if (!n) {
-		printk(KERN_ERR "capidrv: kmalloc ncci_datahandle failed\n");
-		return -1;
-	}
-	n->next = NULL;
-	n->datahandle = datahandle;
-	n->len = len;
-	for (pp = &nccip->ackqueue; *pp; pp = &(*pp)->next);
-	*pp = n;
-	return 0;
-}
-
-static int capidrv_del_ack(struct capidrv_ncci *nccip, u16 datahandle)
-{
-	struct ncci_datahandle_queue **pp, *p;
-	int len;
-
-	for (pp = &nccip->ackqueue; *pp; pp = &(*pp)->next) {
-		if ((*pp)->datahandle == datahandle) {
-			p = *pp;
-			len = p->len;
-			*pp = (*pp)->next;
-			kfree(p);
-			return len;
-		}
-	}
-	return -1;
-}
-
-/* -------- convert and send capi message ---------------------------- */
-
-static void send_message(capidrv_contr *card, _cmsg *cmsg)
-{
-	struct sk_buff *skb;
-	size_t len;
-
-	if (capi_cmsg2message(cmsg, cmsg->buf)) {
-		printk(KERN_ERR "capidrv::send_message: parser failure\n");
-		return;
-	}
-	len = CAPIMSG_LEN(cmsg->buf);
-	skb = alloc_skb(len, GFP_ATOMIC);
-	if (!skb) {
-		printk(KERN_ERR "capidrv::send_message: can't allocate mem\n");
-		return;
-	}
-	skb_put_data(skb, cmsg->buf, len);
-	if (capi20_put_message(&global.ap, skb) != CAPI_NOERROR)
-		kfree_skb(skb);
-}
-
-/* -------- state machine -------------------------------------------- */
-
-struct listenstatechange {
-	int actstate;
-	int nextstate;
-	int event;
-};
-
-static struct listenstatechange listentable[] =
-{
-	{ST_LISTEN_NONE, ST_LISTEN_WAIT_CONF, EV_LISTEN_REQ},
-	{ST_LISTEN_ACTIVE, ST_LISTEN_ACTIVE_WAIT_CONF, EV_LISTEN_REQ},
-	{ST_LISTEN_WAIT_CONF, ST_LISTEN_NONE, EV_LISTEN_CONF_ERROR},
-	{ST_LISTEN_ACTIVE_WAIT_CONF, ST_LISTEN_ACTIVE, EV_LISTEN_CONF_ERROR},
-	{ST_LISTEN_WAIT_CONF, ST_LISTEN_NONE, EV_LISTEN_CONF_EMPTY},
-	{ST_LISTEN_ACTIVE_WAIT_CONF, ST_LISTEN_NONE, EV_LISTEN_CONF_EMPTY},
-	{ST_LISTEN_WAIT_CONF, ST_LISTEN_ACTIVE, EV_LISTEN_CONF_OK},
-	{ST_LISTEN_ACTIVE_WAIT_CONF, ST_LISTEN_ACTIVE, EV_LISTEN_CONF_OK},
-	{},
-};
-
-static void listen_change_state(capidrv_contr *card, int event)
-{
-	struct listenstatechange *p = listentable;
-	while (p->event) {
-		if (card->state == p->actstate && p->event == event) {
-			if (debugmode)
-				printk(KERN_DEBUG "capidrv-%d: listen_change_state %d -> %d\n",
-				       card->contrnr, card->state, p->nextstate);
-			card->state = p->nextstate;
-			return;
-		}
-		p++;
-	}
-	printk(KERN_ERR "capidrv-%d: listen_change_state state=%d event=%d ????\n",
-	       card->contrnr, card->state, event);
-
-}
-
-/* ------------------------------------------------------------------ */
-
-static void p0(capidrv_contr *card, capidrv_plci *plci)
-{
-	isdn_ctrl cmd;
-
-	card->bchans[plci->chan].contr = NULL;
-	cmd.command = ISDN_STAT_DHUP;
-	cmd.driver = card->myid;
-	cmd.arg = plci->chan;
-	card->interface.statcallb(&cmd);
-	free_plci(card, plci);
-}
-
-/* ------------------------------------------------------------------ */
-
-struct plcistatechange {
-	int actstate;
-	int nextstate;
-	int event;
-	void (*changefunc)(capidrv_contr *card, capidrv_plci *plci);
-};
-
-static struct plcistatechange plcitable[] =
-{
-	/* P-0 */
-	{ST_PLCI_NONE, ST_PLCI_OUTGOING, EV_PLCI_CONNECT_REQ, NULL},
-	{ST_PLCI_NONE, ST_PLCI_ALLOCATED, EV_PLCI_FACILITY_IND_UP, NULL},
-	{ST_PLCI_NONE, ST_PLCI_INCOMING, EV_PLCI_CONNECT_IND, NULL},
-	{ST_PLCI_NONE, ST_PLCI_RESUMEING, EV_PLCI_RESUME_REQ, NULL},
-	/* P-0.1 */
-	{ST_PLCI_OUTGOING, ST_PLCI_NONE, EV_PLCI_CONNECT_CONF_ERROR, p0},
-	{ST_PLCI_OUTGOING, ST_PLCI_ALLOCATED, EV_PLCI_CONNECT_CONF_OK, NULL},
-	/* P-1 */
-	{ST_PLCI_ALLOCATED, ST_PLCI_ACTIVE, EV_PLCI_CONNECT_ACTIVE_IND, NULL},
-	{ST_PLCI_ALLOCATED, ST_PLCI_DISCONNECTING, EV_PLCI_DISCONNECT_REQ, NULL},
-	{ST_PLCI_ALLOCATED, ST_PLCI_DISCONNECTING, EV_PLCI_FACILITY_IND_DOWN, NULL},
-	{ST_PLCI_ALLOCATED, ST_PLCI_DISCONNECTED, EV_PLCI_DISCONNECT_IND, NULL},
-	/* P-ACT */
-	{ST_PLCI_ACTIVE, ST_PLCI_DISCONNECTING, EV_PLCI_DISCONNECT_REQ, NULL},
-	{ST_PLCI_ACTIVE, ST_PLCI_DISCONNECTING, EV_PLCI_FACILITY_IND_DOWN, NULL},
-	{ST_PLCI_ACTIVE, ST_PLCI_DISCONNECTED, EV_PLCI_DISCONNECT_IND, NULL},
-	{ST_PLCI_ACTIVE, ST_PLCI_HELD, EV_PLCI_HOLD_IND, NULL},
-	{ST_PLCI_ACTIVE, ST_PLCI_DISCONNECTING, EV_PLCI_SUSPEND_IND, NULL},
-	/* P-2 */
-	{ST_PLCI_INCOMING, ST_PLCI_DISCONNECTING, EV_PLCI_CONNECT_REJECT, NULL},
-	{ST_PLCI_INCOMING, ST_PLCI_FACILITY_IND, EV_PLCI_FACILITY_IND_UP, NULL},
-	{ST_PLCI_INCOMING, ST_PLCI_ACCEPTING, EV_PLCI_CONNECT_RESP, NULL},
-	{ST_PLCI_INCOMING, ST_PLCI_DISCONNECTING, EV_PLCI_DISCONNECT_REQ, NULL},
-	{ST_PLCI_INCOMING, ST_PLCI_DISCONNECTING, EV_PLCI_FACILITY_IND_DOWN, NULL},
-	{ST_PLCI_INCOMING, ST_PLCI_DISCONNECTED, EV_PLCI_DISCONNECT_IND, NULL},
-	{ST_PLCI_INCOMING, ST_PLCI_DISCONNECTING, EV_PLCI_CD_IND, NULL},
-	/* P-3 */
-	{ST_PLCI_FACILITY_IND, ST_PLCI_DISCONNECTING, EV_PLCI_CONNECT_REJECT, NULL},
-	{ST_PLCI_FACILITY_IND, ST_PLCI_ACCEPTING, EV_PLCI_CONNECT_ACTIVE_IND, NULL},
-	{ST_PLCI_FACILITY_IND, ST_PLCI_DISCONNECTING, EV_PLCI_DISCONNECT_REQ, NULL},
-	{ST_PLCI_FACILITY_IND, ST_PLCI_DISCONNECTING, EV_PLCI_FACILITY_IND_DOWN, NULL},
-	{ST_PLCI_FACILITY_IND, ST_PLCI_DISCONNECTED, EV_PLCI_DISCONNECT_IND, NULL},
-	/* P-4 */
-	{ST_PLCI_ACCEPTING, ST_PLCI_ACTIVE, EV_PLCI_CONNECT_ACTIVE_IND, NULL},
-	{ST_PLCI_ACCEPTING, ST_PLCI_DISCONNECTING, EV_PLCI_DISCONNECT_REQ, NULL},
-	{ST_PLCI_ACCEPTING, ST_PLCI_DISCONNECTING, EV_PLCI_FACILITY_IND_DOWN, NULL},
-	{ST_PLCI_ACCEPTING, ST_PLCI_DISCONNECTED, EV_PLCI_DISCONNECT_IND, NULL},
-	/* P-5 */
-	{ST_PLCI_DISCONNECTING, ST_PLCI_DISCONNECTED, EV_PLCI_DISCONNECT_IND, NULL},
-	/* P-6 */
-	{ST_PLCI_DISCONNECTED, ST_PLCI_NONE, EV_PLCI_DISCONNECT_RESP, p0},
-	/* P-0.Res */
-	{ST_PLCI_RESUMEING, ST_PLCI_NONE, EV_PLCI_RESUME_CONF_ERROR, p0},
-	{ST_PLCI_RESUMEING, ST_PLCI_RESUME, EV_PLCI_RESUME_CONF_OK, NULL},
-	/* P-RES */
-	{ST_PLCI_RESUME, ST_PLCI_ACTIVE, EV_PLCI_RESUME_IND, NULL},
-	/* P-HELD */
-	{ST_PLCI_HELD, ST_PLCI_ACTIVE, EV_PLCI_RETRIEVE_IND, NULL},
-	{},
-};
-
-static void plci_change_state(capidrv_contr *card, capidrv_plci *plci, int event)
-{
-	struct plcistatechange *p = plcitable;
-	while (p->event) {
-		if (plci->state == p->actstate && p->event == event) {
-			if (debugmode)
-				printk(KERN_DEBUG "capidrv-%d: plci_change_state:0x%x %d -> %d\n",
-				       card->contrnr, plci->plci, plci->state, p->nextstate);
-			plci->state = p->nextstate;
-			if (p->changefunc)
-				p->changefunc(card, plci);
-			return;
-		}
-		p++;
-	}
-	printk(KERN_ERR "capidrv-%d: plci_change_state:0x%x state=%d event=%d ????\n",
-	       card->contrnr, plci->plci, plci->state, event);
-}
-
-/* ------------------------------------------------------------------ */
-
-static _cmsg cmsg;
-
-static void n0(capidrv_contr *card, capidrv_ncci *ncci)
-{
-	isdn_ctrl cmd;
-
-	capi_fill_DISCONNECT_REQ(&cmsg,
-				 global.ap.applid,
-				 card->msgid++,
-				 ncci->plcip->plci,
-				 NULL,	/* BChannelinformation */
-				 NULL,	/* Keypadfacility */
-				 NULL,	/* Useruserdata */   /* $$$$ */
-				 NULL	/* Facilitydataarray */
-		);
-	plci_change_state(card, ncci->plcip, EV_PLCI_DISCONNECT_REQ);
-	send_message(card, &cmsg);
-
-	cmd.command = ISDN_STAT_BHUP;
-	cmd.driver = card->myid;
-	cmd.arg = ncci->chan;
-	card->interface.statcallb(&cmd);
-	free_ncci(card, ncci);
-}
-
-/* ------------------------------------------------------------------ */
-
-struct nccistatechange {
-	int actstate;
-	int nextstate;
-	int event;
-	void (*changefunc)(capidrv_contr *card, capidrv_ncci *ncci);
-};
-
-static struct nccistatechange nccitable[] =
-{
-	/* N-0 */
-	{ST_NCCI_NONE, ST_NCCI_OUTGOING, EV_NCCI_CONNECT_B3_REQ, NULL},
-	{ST_NCCI_NONE, ST_NCCI_INCOMING, EV_NCCI_CONNECT_B3_IND, NULL},
-	/* N-0.1 */
-	{ST_NCCI_OUTGOING, ST_NCCI_ALLOCATED, EV_NCCI_CONNECT_B3_CONF_OK, NULL},
-	{ST_NCCI_OUTGOING, ST_NCCI_NONE, EV_NCCI_CONNECT_B3_CONF_ERROR, n0},
-	/* N-1 */
-	{ST_NCCI_INCOMING, ST_NCCI_DISCONNECTING, EV_NCCI_CONNECT_B3_REJECT, NULL},
-	{ST_NCCI_INCOMING, ST_NCCI_ALLOCATED, EV_NCCI_CONNECT_B3_RESP, NULL},
-	{ST_NCCI_INCOMING, ST_NCCI_DISCONNECTED, EV_NCCI_DISCONNECT_B3_IND, NULL},
-	{ST_NCCI_INCOMING, ST_NCCI_DISCONNECTING, EV_NCCI_DISCONNECT_B3_REQ, NULL},
-	/* N-2 */
-	{ST_NCCI_ALLOCATED, ST_NCCI_ACTIVE, EV_NCCI_CONNECT_B3_ACTIVE_IND, NULL},
-	{ST_NCCI_ALLOCATED, ST_NCCI_DISCONNECTED, EV_NCCI_DISCONNECT_B3_IND, NULL},
-	{ST_NCCI_ALLOCATED, ST_NCCI_DISCONNECTING, EV_NCCI_DISCONNECT_B3_REQ, NULL},
-	/* N-ACT */
-	{ST_NCCI_ACTIVE, ST_NCCI_ACTIVE, EV_NCCI_RESET_B3_IND, NULL},
-	{ST_NCCI_ACTIVE, ST_NCCI_RESETING, EV_NCCI_RESET_B3_REQ, NULL},
-	{ST_NCCI_ACTIVE, ST_NCCI_DISCONNECTED, EV_NCCI_DISCONNECT_B3_IND, NULL},
-	{ST_NCCI_ACTIVE, ST_NCCI_DISCONNECTING, EV_NCCI_DISCONNECT_B3_REQ, NULL},
-	/* N-3 */
-	{ST_NCCI_RESETING, ST_NCCI_ACTIVE, EV_NCCI_RESET_B3_IND, NULL},
-	{ST_NCCI_RESETING, ST_NCCI_DISCONNECTED, EV_NCCI_DISCONNECT_B3_IND, NULL},
-	{ST_NCCI_RESETING, ST_NCCI_DISCONNECTING, EV_NCCI_DISCONNECT_B3_REQ, NULL},
-	/* N-4 */
-	{ST_NCCI_DISCONNECTING, ST_NCCI_DISCONNECTED, EV_NCCI_DISCONNECT_B3_IND, NULL},
-	{ST_NCCI_DISCONNECTING, ST_NCCI_PREVIOUS, EV_NCCI_DISCONNECT_B3_CONF_ERROR, NULL},
-	/* N-5 */
-	{ST_NCCI_DISCONNECTED, ST_NCCI_NONE, EV_NCCI_DISCONNECT_B3_RESP, n0},
-	{},
-};
-
-static void ncci_change_state(capidrv_contr *card, capidrv_ncci *ncci, int event)
-{
-	struct nccistatechange *p = nccitable;
-	while (p->event) {
-		if (ncci->state == p->actstate && p->event == event) {
-			if (debugmode)
-				printk(KERN_DEBUG "capidrv-%d: ncci_change_state:0x%x %d -> %d\n",
-				       card->contrnr, ncci->ncci, ncci->state, p->nextstate);
-			if (p->nextstate == ST_NCCI_PREVIOUS) {
-				ncci->state = ncci->oldstate;
-				ncci->oldstate = p->actstate;
-			} else {
-				ncci->oldstate = p->actstate;
-				ncci->state = p->nextstate;
-			}
-			if (p->changefunc)
-				p->changefunc(card, ncci);
-			return;
-		}
-		p++;
-	}
-	printk(KERN_ERR "capidrv-%d: ncci_change_state:0x%x state=%d event=%d ????\n",
-	       card->contrnr, ncci->ncci, ncci->state, event);
-}
-
-/* ------------------------------------------------------------------- */
-
-static inline int new_bchan(capidrv_contr *card)
-{
-	int i;
-	for (i = 0; i < card->nbchan; i++) {
-		if (card->bchans[i].plcip == NULL) {
-			card->bchans[i].disconnecting = 0;
-			return i;
-		}
-	}
-	return -1;
-}
-
-/* ------------------------------------------------------------------- */
-static char *capi_info2str(u16 reason)
-{
-#ifndef CONFIG_ISDN_CAPI_CAPIDRV_VERBOSE
-	return "..";
-#else
-	switch (reason) {
-
-/*-- informative values (corresponding message was processed) -----*/
-	case 0x0001:
-		return "NCPI not supported by current protocol, NCPI ignored";
-	case 0x0002:
-		return "Flags not supported by current protocol, flags ignored";
-	case 0x0003:
-		return "Alert already sent by another application";
-
-/*-- error information concerning CAPI_REGISTER -----*/
-	case 0x1001:
-		return "Too many applications";
-	case 0x1002:
-		return "Logical block size too small, must be at least 128 Bytes";
-	case 0x1003:
-		return "Buffer exceeds 64 kByte";
-	case 0x1004:
-		return "Message buffer size too small, must be at least 1024 Bytes";
-	case 0x1005:
-		return "Max. number of logical connections not supported";
-	case 0x1006:
-		return "Reserved";
-	case 0x1007:
-		return "The message could not be accepted because of an internal busy condition";
-	case 0x1008:
-		return "OS resource error (no memory ?)";
-	case 0x1009:
-		return "CAPI not installed";
-	case 0x100A:
-		return "Controller does not support external equipment";
-	case 0x100B:
-		return "Controller does only support external equipment";
-
-/*-- error information concerning message exchange functions -----*/
-	case 0x1101:
-		return "Illegal application number";
-	case 0x1102:
-		return "Illegal command or subcommand or message length less than 12 bytes";
-	case 0x1103:
-		return "The message could not be accepted because of a queue full condition !! The error code does not imply that CAPI cannot receive messages directed to another controller, PLCI or NCCI";
-	case 0x1104:
-		return "Queue is empty";
-	case 0x1105:
-		return "Queue overflow, a message was lost !! This indicates a configuration error. The only recovery from this error is to perform a CAPI_RELEASE";
-	case 0x1106:
-		return "Unknown notification parameter";
-	case 0x1107:
-		return "The Message could not be accepted because of an internal busy condition";
-	case 0x1108:
-		return "OS Resource error (no memory ?)";
-	case 0x1109:
-		return "CAPI not installed";
-	case 0x110A:
-		return "Controller does not support external equipment";
-	case 0x110B:
-		return "Controller does only support external equipment";
-
-/*-- error information concerning resource / coding problems -----*/
-	case 0x2001:
-		return "Message not supported in current state";
-	case 0x2002:
-		return "Illegal Controller / PLCI / NCCI";
-	case 0x2003:
-		return "Out of PLCI";
-	case 0x2004:
-		return "Out of NCCI";
-	case 0x2005:
-		return "Out of LISTEN";
-	case 0x2006:
-		return "Out of FAX resources (protocol T.30)";
-	case 0x2007:
-		return "Illegal message parameter coding";
-
-/*-- error information concerning requested services  -----*/
-	case 0x3001:
-		return "B1 protocol not supported";
-	case 0x3002:
-		return "B2 protocol not supported";
-	case 0x3003:
-		return "B3 protocol not supported";
-	case 0x3004:
-		return "B1 protocol parameter not supported";
-	case 0x3005:
-		return "B2 protocol parameter not supported";
-	case 0x3006:
-		return "B3 protocol parameter not supported";
-	case 0x3007:
-		return "B protocol combination not supported";
-	case 0x3008:
-		return "NCPI not supported";
-	case 0x3009:
-		return "CIP Value unknown";
-	case 0x300A:
-		return "Flags not supported (reserved bits)";
-	case 0x300B:
-		return "Facility not supported";
-	case 0x300C:
-		return "Data length not supported by current protocol";
-	case 0x300D:
-		return "Reset procedure not supported by current protocol";
-
-/*-- informations about the clearing of a physical connection -----*/
-	case 0x3301:
-		return "Protocol error layer 1 (broken line or B-channel removed by signalling protocol)";
-	case 0x3302:
-		return "Protocol error layer 2";
-	case 0x3303:
-		return "Protocol error layer 3";
-	case 0x3304:
-		return "Another application got that call";
-/*-- T.30 specific reasons -----*/
-	case 0x3311:
-		return "Connecting not successful (remote station is no FAX G3 machine)";
-	case 0x3312:
-		return "Connecting not successful (training error)";
-	case 0x3313:
-		return "Disconnected before transfer (remote station does not support transfer mode, e.g. resolution)";
-	case 0x3314:
-		return "Disconnected during transfer (remote abort)";
-	case 0x3315:
-		return "Disconnected during transfer (remote procedure error, e.g. unsuccessful repetition of T.30 commands)";
-	case 0x3316:
-		return "Disconnected during transfer (local tx data underrun)";
-	case 0x3317:
-		return "Disconnected during transfer (local rx data overflow)";
-	case 0x3318:
-		return "Disconnected during transfer (local abort)";
-	case 0x3319:
-		return "Illegal parameter coding (e.g. SFF coding error)";
-
-/*-- disconnect causes from the network according to ETS 300 102-1/Q.931 -----*/
-	case 0x3481: return "Unallocated (unassigned) number";
-	case 0x3482: return "No route to specified transit network";
-	case 0x3483: return "No route to destination";
-	case 0x3486: return "Channel unacceptable";
-	case 0x3487:
-		return "Call awarded and being delivered in an established channel";
-	case 0x3490: return "Normal call clearing";
-	case 0x3491: return "User busy";
-	case 0x3492: return "No user responding";
-	case 0x3493: return "No answer from user (user alerted)";
-	case 0x3495: return "Call rejected";
-	case 0x3496: return "Number changed";
-	case 0x349A: return "Non-selected user clearing";
-	case 0x349B: return "Destination out of order";
-	case 0x349C: return "Invalid number format";
-	case 0x349D: return "Facility rejected";
-	case 0x349E: return "Response to STATUS ENQUIRY";
-	case 0x349F: return "Normal, unspecified";
-	case 0x34A2: return "No circuit / channel available";
-	case 0x34A6: return "Network out of order";
-	case 0x34A9: return "Temporary failure";
-	case 0x34AA: return "Switching equipment congestion";
-	case 0x34AB: return "Access information discarded";
-	case 0x34AC: return "Requested circuit / channel not available";
-	case 0x34AF: return "Resources unavailable, unspecified";
-	case 0x34B1: return "Quality of service unavailable";
-	case 0x34B2: return "Requested facility not subscribed";
-	case 0x34B9: return "Bearer capability not authorized";
-	case 0x34BA: return "Bearer capability not presently available";
-	case 0x34BF: return "Service or option not available, unspecified";
-	case 0x34C1: return "Bearer capability not implemented";
-	case 0x34C2: return "Channel type not implemented";
-	case 0x34C5: return "Requested facility not implemented";
-	case 0x34C6: return "Only restricted digital information bearer capability is available";
-	case 0x34CF: return "Service or option not implemented, unspecified";
-	case 0x34D1: return "Invalid call reference value";
-	case 0x34D2: return "Identified channel does not exist";
-	case 0x34D3: return "A suspended call exists, but this call identity does not";
-	case 0x34D4: return "Call identity in use";
-	case 0x34D5: return "No call suspended";
-	case 0x34D6: return "Call having the requested call identity has been cleared";
-	case 0x34D8: return "Incompatible destination";
-	case 0x34DB: return "Invalid transit network selection";
-	case 0x34DF: return "Invalid message, unspecified";
-	case 0x34E0: return "Mandatory information element is missing";
-	case 0x34E1: return "Message type non-existent or not implemented";
-	case 0x34E2: return "Message not compatible with call state or message type non-existent or not implemented";
-	case 0x34E3: return "Information element non-existent or not implemented";
-	case 0x34E4: return "Invalid information element contents";
-	case 0x34E5: return "Message not compatible with call state";
-	case 0x34E6: return "Recovery on timer expiry";
-	case 0x34EF: return "Protocol error, unspecified";
-	case 0x34FF: return "Interworking, unspecified";
-
-	default: return "No additional information";
-	}
-#endif
-}
-
-static void handle_controller(_cmsg *cmsg)
-{
-	capidrv_contr *card = findcontrbynumber(cmsg->adr.adrController & 0x7f);
-
-	if (!card) {
-		printk(KERN_ERR "capidrv: %s from unknown controller 0x%x\n",
-		       capi_cmd2str(cmsg->Command, cmsg->Subcommand),
-		       cmsg->adr.adrController & 0x7f);
-		return;
-	}
-	switch (CAPICMD(cmsg->Command, cmsg->Subcommand)) {
-
-	case CAPI_LISTEN_CONF:	/* Controller */
-		if (debugmode)
-			printk(KERN_DEBUG "capidrv-%d: listenconf Info=0x%4x (%s) cipmask=0x%x\n",
-			       card->contrnr, cmsg->Info, capi_info2str(cmsg->Info), card->cipmask);
-		if (cmsg->Info) {
-			listen_change_state(card, EV_LISTEN_CONF_ERROR);
-		} else if (card->cipmask == 0) {
-			listen_change_state(card, EV_LISTEN_CONF_EMPTY);
-		} else {
-			listen_change_state(card, EV_LISTEN_CONF_OK);
-		}
-		break;
-
-	case CAPI_MANUFACTURER_IND:	/* Controller */
-		if (cmsg->ManuID == 0x214D5641
-		    && cmsg->Class == 0
-		    && cmsg->Function == 1) {
-			u8  *data = cmsg->ManuData + 3;
-			u16  len = cmsg->ManuData[0];
-			u16 layer;
-			int direction;
-			if (len == 255) {
-				len = (cmsg->ManuData[1] | (cmsg->ManuData[2] << 8));
-				data += 2;
-			}
-			len -= 2;
-			layer = ((*(data - 1)) << 8) | *(data - 2);
-			if (layer & 0x300)
-				direction = (layer & 0x200) ? 0 : 1;
-			else direction = (layer & 0x800) ? 0 : 1;
-			if (layer & 0x0C00) {
-				if ((layer & 0xff) == 0x80) {
-					handle_dtrace_data(card, direction, 1, data, len);
-					break;
-				}
-			} else if ((layer & 0xff) < 0x80) {
-				handle_dtrace_data(card, direction, 0, data, len);
-				break;
-			}
-			printk(KERN_INFO "capidrv-%d: %s from controller 0x%x layer 0x%x, ignored\n",
-			       card->contrnr,
-			       capi_cmd2str(cmsg->Command, cmsg->Subcommand),
-			       cmsg->adr.adrController, layer);
-			break;
-		}
-		goto ignored;
-	case CAPI_MANUFACTURER_CONF:	/* Controller */
-		if (cmsg->ManuID == 0x214D5641) {
-			char *s = NULL;
-			switch (cmsg->Class) {
-			case 0: break;
-			case 1: s = "unknown class"; break;
-			case 2: s = "unknown function"; break;
-			default: s = "unknown error"; break;
-			}
-			if (s)
-				printk(KERN_INFO "capidrv-%d: %s from controller 0x%x function %d: %s\n",
-				       card->contrnr,
-				       capi_cmd2str(cmsg->Command, cmsg->Subcommand),
-				       cmsg->adr.adrController,
-				       cmsg->Function, s);
-			break;
-		}
-		goto ignored;
-	case CAPI_FACILITY_IND:	/* Controller/plci/ncci */
-		goto ignored;
-	case CAPI_FACILITY_CONF:	/* Controller/plci/ncci */
-		goto ignored;
-	case CAPI_INFO_IND:	/* Controller/plci */
-		goto ignored;
-	case CAPI_INFO_CONF:	/* Controller/plci */
-		goto ignored;
-
-	default:
-		printk(KERN_ERR "capidrv-%d: got %s from controller 0x%x ???",
-		       card->contrnr,
-		       capi_cmd2str(cmsg->Command, cmsg->Subcommand),
-		       cmsg->adr.adrController);
-	}
-	return;
-
-ignored:
-	printk(KERN_INFO "capidrv-%d: %s from controller 0x%x ignored\n",
-	       card->contrnr,
-	       capi_cmd2str(cmsg->Command, cmsg->Subcommand),
-	       cmsg->adr.adrController);
-}
-
-static void handle_incoming_call(capidrv_contr *card, _cmsg *cmsg)
-{
-	capidrv_plci *plcip;
-	capidrv_bchan *bchan;
-	isdn_ctrl cmd;
-	int chan;
-
-	if ((chan = new_bchan(card)) == -1) {
-		printk(KERN_ERR "capidrv-%d: incoming call on not existing bchan ?\n", card->contrnr);
-		return;
-	}
-	bchan = &card->bchans[chan];
-	if ((plcip = new_plci(card, chan)) == NULL) {
-		printk(KERN_ERR "capidrv-%d: incoming call: no memory, sorry.\n", card->contrnr);
-		return;
-	}
-	bchan->incoming = 1;
-	plcip->plci = cmsg->adr.adrPLCI;
-	plci_change_state(card, plcip, EV_PLCI_CONNECT_IND);
-
-	cmd.command = ISDN_STAT_ICALL;
-	cmd.driver = card->myid;
-	cmd.arg = chan;
-	memset(&cmd.parm.setup, 0, sizeof(cmd.parm.setup));
-	strncpy(cmd.parm.setup.phone,
-		cmsg->CallingPartyNumber + 3,
-		cmsg->CallingPartyNumber[0] - 2);
-	strncpy(cmd.parm.setup.eazmsn,
-		cmsg->CalledPartyNumber + 2,
-		cmsg->CalledPartyNumber[0] - 1);
-	cmd.parm.setup.si1 = cip2si1(cmsg->CIPValue);
-	cmd.parm.setup.si2 = cip2si2(cmsg->CIPValue);
-	cmd.parm.setup.plan = cmsg->CallingPartyNumber[1];
-	cmd.parm.setup.screen = cmsg->CallingPartyNumber[2];
-
-	printk(KERN_INFO "capidrv-%d: incoming call %s,%d,%d,%s\n",
-	       card->contrnr,
-	       cmd.parm.setup.phone,
-	       cmd.parm.setup.si1,
-	       cmd.parm.setup.si2,
-	       cmd.parm.setup.eazmsn);
-
-	if (cmd.parm.setup.si1 == 1 && cmd.parm.setup.si2 != 0) {
-		printk(KERN_INFO "capidrv-%d: patching si2=%d to 0 for VBOX\n",
-		       card->contrnr,
-		       cmd.parm.setup.si2);
-		cmd.parm.setup.si2 = 0;
-	}
-
-	switch (card->interface.statcallb(&cmd)) {
-	case 0:
-	case 3:
-		/* No device matching this call.
-		 * and isdn_common.c has send a HANGUP command
-		 * which is ignored in state ST_PLCI_INCOMING,
-		 * so we send RESP to ignore the call
-		 */
-		capi_cmsg_answer(cmsg);
-		cmsg->Reject = 1;	/* ignore */
-		plci_change_state(card, plcip, EV_PLCI_CONNECT_REJECT);
-		send_message(card, cmsg);
-		printk(KERN_INFO "capidrv-%d: incoming call %s,%d,%d,%s ignored\n",
-		       card->contrnr,
-		       cmd.parm.setup.phone,
-		       cmd.parm.setup.si1,
-		       cmd.parm.setup.si2,
-		       cmd.parm.setup.eazmsn);
-		break;
-	case 1:
-		/* At least one device matching this call (RING on ttyI)
-		 * HL-driver may send ALERTING on the D-channel in this
-		 * case.
-		 * really means: RING on ttyI or a net interface
-		 * accepted this call already.
-		 *
-		 * If the call was accepted, state has already changed,
-		 * and CONNECT_RESP already sent.
-		 */
-		if (plcip->state == ST_PLCI_INCOMING) {
-			printk(KERN_INFO "capidrv-%d: incoming call %s,%d,%d,%s tty alerting\n",
-			       card->contrnr,
-			       cmd.parm.setup.phone,
-			       cmd.parm.setup.si1,
-			       cmd.parm.setup.si2,
-			       cmd.parm.setup.eazmsn);
-			capi_fill_ALERT_REQ(cmsg,
-					    global.ap.applid,
-					    card->msgid++,
-					    plcip->plci,	/* adr */
-					    NULL,/* BChannelinformation */
-					    NULL,/* Keypadfacility */
-					    NULL,/* Useruserdata */
-					    NULL /* Facilitydataarray */
-				);
-			plcip->msgid = cmsg->Messagenumber;
-			send_message(card, cmsg);
-		} else {
-			printk(KERN_INFO "capidrv-%d: incoming call %s,%d,%d,%s on netdev\n",
-			       card->contrnr,
-			       cmd.parm.setup.phone,
-			       cmd.parm.setup.si1,
-			       cmd.parm.setup.si2,
-			       cmd.parm.setup.eazmsn);
-		}
-		break;
-
-	case 2:		/* Call will be rejected. */
-		capi_cmsg_answer(cmsg);
-		cmsg->Reject = 2;	/* reject call, normal call clearing */
-		plci_change_state(card, plcip, EV_PLCI_CONNECT_REJECT);
-		send_message(card, cmsg);
-		break;
-
-	default:
-		/* An error happened. (Invalid parameters for example.) */
-		capi_cmsg_answer(cmsg);
-		cmsg->Reject = 8;	/* reject call,
-					   destination out of order */
-		plci_change_state(card, plcip, EV_PLCI_CONNECT_REJECT);
-		send_message(card, cmsg);
-		break;
-	}
-	return;
-}
-
-static void handle_plci(_cmsg *cmsg)
-{
-	capidrv_contr *card = findcontrbynumber(cmsg->adr.adrController & 0x7f);
-	capidrv_plci *plcip;
-	isdn_ctrl cmd;
-	_cdebbuf *cdb;
-
-	if (!card) {
-		printk(KERN_ERR "capidrv: %s from unknown controller 0x%x\n",
-		       capi_cmd2str(cmsg->Command, cmsg->Subcommand),
-		       cmsg->adr.adrController & 0x7f);
-		return;
-	}
-	switch (CAPICMD(cmsg->Command, cmsg->Subcommand)) {
-
-	case CAPI_DISCONNECT_IND:	/* plci */
-		if (cmsg->Reason) {
-			printk(KERN_INFO "capidrv-%d: %s reason 0x%x (%s) for plci 0x%x\n",
-			       card->contrnr,
-			       capi_cmd2str(cmsg->Command, cmsg->Subcommand),
-			       cmsg->Reason, capi_info2str(cmsg->Reason), cmsg->adr.adrPLCI);
-		}
-		if (!(plcip = find_plci_by_plci(card, cmsg->adr.adrPLCI))) {
-			capi_cmsg_answer(cmsg);
-			send_message(card, cmsg);
-			goto notfound;
-		}
-		card->bchans[plcip->chan].disconnecting = 1;
-		plci_change_state(card, plcip, EV_PLCI_DISCONNECT_IND);
-		capi_cmsg_answer(cmsg);
-		plci_change_state(card, plcip, EV_PLCI_DISCONNECT_RESP);
-		send_message(card, cmsg);
-		break;
-
-	case CAPI_DISCONNECT_CONF:	/* plci */
-		if (cmsg->Info) {
-			printk(KERN_INFO "capidrv-%d: %s info 0x%x (%s) for plci 0x%x\n",
-			       card->contrnr,
-			       capi_cmd2str(cmsg->Command, cmsg->Subcommand),
-			       cmsg->Info, capi_info2str(cmsg->Info),
-			       cmsg->adr.adrPLCI);
-		}
-		if (!(plcip = find_plci_by_plci(card, cmsg->adr.adrPLCI)))
-			goto notfound;
-
-		card->bchans[plcip->chan].disconnecting = 1;
-		break;
-
-	case CAPI_ALERT_CONF:	/* plci */
-		if (cmsg->Info) {
-			printk(KERN_INFO "capidrv-%d: %s info 0x%x (%s) for plci 0x%x\n",
-			       card->contrnr,
-			       capi_cmd2str(cmsg->Command, cmsg->Subcommand),
-			       cmsg->Info, capi_info2str(cmsg->Info),
-			       cmsg->adr.adrPLCI);
-		}
-		break;
-
-	case CAPI_CONNECT_IND:	/* plci */
-		handle_incoming_call(card, cmsg);
-		break;
-
-	case CAPI_CONNECT_CONF:	/* plci */
-		if (cmsg->Info) {
-			printk(KERN_INFO "capidrv-%d: %s info 0x%x (%s) for plci 0x%x\n",
-			       card->contrnr,
-			       capi_cmd2str(cmsg->Command, cmsg->Subcommand),
-			       cmsg->Info, capi_info2str(cmsg->Info),
-			       cmsg->adr.adrPLCI);
-		}
-		if (!(plcip = find_plci_by_msgid(card, cmsg->Messagenumber)))
-			goto notfound;
-
-		plcip->plci = cmsg->adr.adrPLCI;
-		if (cmsg->Info) {
-			plci_change_state(card, plcip, EV_PLCI_CONNECT_CONF_ERROR);
-		} else {
-			plci_change_state(card, plcip, EV_PLCI_CONNECT_CONF_OK);
-		}
-		break;
-
-	case CAPI_CONNECT_ACTIVE_IND:	/* plci */
-
-		if (!(plcip = find_plci_by_plci(card, cmsg->adr.adrPLCI)))
-			goto notfound;
-
-		if (card->bchans[plcip->chan].incoming) {
-			capi_cmsg_answer(cmsg);
-			plci_change_state(card, plcip, EV_PLCI_CONNECT_ACTIVE_IND);
-			send_message(card, cmsg);
-		} else {
-			capidrv_ncci *nccip;
-			capi_cmsg_answer(cmsg);
-			send_message(card, cmsg);
-
-			nccip = new_ncci(card, plcip, cmsg->adr.adrPLCI);
-
-			if (!nccip) {
-				printk(KERN_ERR "capidrv-%d: no mem for ncci, sorry\n", card->contrnr);
-				break;	/* $$$$ */
-			}
-			capi_fill_CONNECT_B3_REQ(cmsg,
-						 global.ap.applid,
-						 card->msgid++,
-						 plcip->plci,	/* adr */
-						 NULL	/* NCPI */
-				);
-			nccip->msgid = cmsg->Messagenumber;
-			plci_change_state(card, plcip,
-					  EV_PLCI_CONNECT_ACTIVE_IND);
-			ncci_change_state(card, nccip, EV_NCCI_CONNECT_B3_REQ);
-			send_message(card, cmsg);
-			cmd.command = ISDN_STAT_DCONN;
-			cmd.driver = card->myid;
-			cmd.arg = plcip->chan;
-			card->interface.statcallb(&cmd);
-		}
-		break;
-
-	case CAPI_INFO_IND:	/* Controller/plci */
-
-		if (!(plcip = find_plci_by_plci(card, cmsg->adr.adrPLCI)))
-			goto notfound;
-
-		if (cmsg->InfoNumber == 0x4000) {
-			if (cmsg->InfoElement[0] == 4) {
-				cmd.command = ISDN_STAT_CINF;
-				cmd.driver = card->myid;
-				cmd.arg = plcip->chan;
-				sprintf(cmd.parm.num, "%lu",
-					(unsigned long)
-					((u32) cmsg->InfoElement[1]
-					 | ((u32) (cmsg->InfoElement[2]) << 8)
-					 | ((u32) (cmsg->InfoElement[3]) << 16)
-					 | ((u32) (cmsg->InfoElement[4]) << 24)));
-				card->interface.statcallb(&cmd);
-				break;
-			}
-		}
-		cdb = capi_cmsg2str(cmsg);
-		if (cdb) {
-			printk(KERN_WARNING "capidrv-%d: %s\n",
-			       card->contrnr, cdb->buf);
-			cdebbuf_free(cdb);
-		} else
-			printk(KERN_WARNING "capidrv-%d: CAPI_INFO_IND InfoNumber %x not handled\n",
-			       card->contrnr, cmsg->InfoNumber);
-
-		break;
-
-	case CAPI_CONNECT_ACTIVE_CONF:		/* plci */
-		goto ignored;
-	case CAPI_SELECT_B_PROTOCOL_CONF:	/* plci */
-		goto ignored;
-	case CAPI_FACILITY_IND:	/* Controller/plci/ncci */
-		goto ignored;
-	case CAPI_FACILITY_CONF:	/* Controller/plci/ncci */
-		goto ignored;
-
-	case CAPI_INFO_CONF:	/* Controller/plci */
-		goto ignored;
-
-	default:
-		printk(KERN_ERR "capidrv-%d: got %s for plci 0x%x ???",
-		       card->contrnr,
-		       capi_cmd2str(cmsg->Command, cmsg->Subcommand),
-		       cmsg->adr.adrPLCI);
-	}
-	return;
-ignored:
-	printk(KERN_INFO "capidrv-%d: %s for plci 0x%x ignored\n",
-	       card->contrnr,
-	       capi_cmd2str(cmsg->Command, cmsg->Subcommand),
-	       cmsg->adr.adrPLCI);
-	return;
-notfound:
-	printk(KERN_ERR "capidrv-%d: %s: plci 0x%x not found\n",
-	       card->contrnr,
-	       capi_cmd2str(cmsg->Command, cmsg->Subcommand),
-	       cmsg->adr.adrPLCI);
-	return;
-}
-
-static void handle_ncci(_cmsg *cmsg)
-{
-	capidrv_contr *card = findcontrbynumber(cmsg->adr.adrController & 0x7f);
-	capidrv_plci *plcip;
-	capidrv_ncci *nccip;
-	isdn_ctrl cmd;
-	int len;
-
-	if (!card) {
-		printk(KERN_ERR "capidrv: %s from unknown controller 0x%x\n",
-		       capi_cmd2str(cmsg->Command, cmsg->Subcommand),
-		       cmsg->adr.adrController & 0x7f);
-		return;
-	}
-	switch (CAPICMD(cmsg->Command, cmsg->Subcommand)) {
-
-	case CAPI_CONNECT_B3_ACTIVE_IND:	/* ncci */
-		if (!(nccip = find_ncci(card, cmsg->adr.adrNCCI)))
-			goto notfound;
-
-		capi_cmsg_answer(cmsg);
-		ncci_change_state(card, nccip, EV_NCCI_CONNECT_B3_ACTIVE_IND);
-		send_message(card, cmsg);
-
-		cmd.command = ISDN_STAT_BCONN;
-		cmd.driver = card->myid;
-		cmd.arg = nccip->chan;
-		card->interface.statcallb(&cmd);
-
-		printk(KERN_INFO "capidrv-%d: chan %d up with ncci 0x%x\n",
-		       card->contrnr, nccip->chan, nccip->ncci);
-		break;
-
-	case CAPI_CONNECT_B3_ACTIVE_CONF:	/* ncci */
-		goto ignored;
-
-	case CAPI_CONNECT_B3_IND:	/* ncci */
-
-		plcip = find_plci_by_ncci(card, cmsg->adr.adrNCCI);
-		if (plcip) {
-			nccip = new_ncci(card, plcip, cmsg->adr.adrNCCI);
-			if (nccip) {
-				ncci_change_state(card, nccip, EV_NCCI_CONNECT_B3_IND);
-				capi_fill_CONNECT_B3_RESP(cmsg,
-							  global.ap.applid,
-							  card->msgid++,
-							  nccip->ncci,	/* adr */
-							  0,	/* Reject */
-							  NULL	/* NCPI */
-					);
-				ncci_change_state(card, nccip, EV_NCCI_CONNECT_B3_RESP);
-				send_message(card, cmsg);
-				break;
-			}
-			printk(KERN_ERR "capidrv-%d: no mem for ncci, sorry\n",							card->contrnr);
-		} else {
-			printk(KERN_ERR "capidrv-%d: %s: plci for ncci 0x%x not found\n",
-			       card->contrnr,
-			       capi_cmd2str(cmsg->Command, cmsg->Subcommand),
-			       cmsg->adr.adrNCCI);
-		}
-		capi_fill_CONNECT_B3_RESP(cmsg,
-					  global.ap.applid,
-					  card->msgid++,
-					  cmsg->adr.adrNCCI,
-					  2,	/* Reject */
-					  NULL	/* NCPI */
-			);
-		send_message(card, cmsg);
-		break;
-
-	case CAPI_CONNECT_B3_CONF:	/* ncci */
-
-		if (!(nccip = find_ncci_by_msgid(card,
-						 cmsg->adr.adrNCCI,
-						 cmsg->Messagenumber)))
-			goto notfound;
-
-		nccip->ncci = cmsg->adr.adrNCCI;
-		if (cmsg->Info) {
-			printk(KERN_INFO "capidrv-%d: %s info 0x%x (%s) for ncci 0x%x\n",
-			       card->contrnr,
-			       capi_cmd2str(cmsg->Command, cmsg->Subcommand),
-			       cmsg->Info, capi_info2str(cmsg->Info),
-			       cmsg->adr.adrNCCI);
-		}
-
-		if (cmsg->Info)
-			ncci_change_state(card, nccip, EV_NCCI_CONNECT_B3_CONF_ERROR);
-		else
-			ncci_change_state(card, nccip, EV_NCCI_CONNECT_B3_CONF_OK);
-		break;
-
-	case CAPI_CONNECT_B3_T90_ACTIVE_IND:	/* ncci */
-		capi_cmsg_answer(cmsg);
-		send_message(card, cmsg);
-		break;
-
-	case CAPI_DATA_B3_IND:	/* ncci */
-		/* handled in handle_data() */
-		goto ignored;
-
-	case CAPI_DATA_B3_CONF:	/* ncci */
-		if (cmsg->Info) {
-			printk(KERN_WARNING "CAPI_DATA_B3_CONF: Info %x - %s\n",
-			       cmsg->Info, capi_info2str(cmsg->Info));
-		}
-		if (!(nccip = find_ncci(card, cmsg->adr.adrNCCI)))
-			goto notfound;
-
-		len = capidrv_del_ack(nccip, cmsg->DataHandle);
-		if (len < 0)
-			break;
-		cmd.command = ISDN_STAT_BSENT;
-		cmd.driver = card->myid;
-		cmd.arg = nccip->chan;
-		cmd.parm.length = len;
-		card->interface.statcallb(&cmd);
-		break;
-
-	case CAPI_DISCONNECT_B3_IND:	/* ncci */
-		if (!(nccip = find_ncci(card, cmsg->adr.adrNCCI)))
-			goto notfound;
-
-		card->bchans[nccip->chan].disconnecting = 1;
-		ncci_change_state(card, nccip, EV_NCCI_DISCONNECT_B3_IND);
-		capi_cmsg_answer(cmsg);
-		ncci_change_state(card, nccip, EV_NCCI_DISCONNECT_B3_RESP);
-		send_message(card, cmsg);
-		break;
-
-	case CAPI_DISCONNECT_B3_CONF:	/* ncci */
-		if (!(nccip = find_ncci(card, cmsg->adr.adrNCCI)))
-			goto notfound;
-		if (cmsg->Info) {
-			printk(KERN_INFO "capidrv-%d: %s info 0x%x (%s) for ncci 0x%x\n",
-			       card->contrnr,
-			       capi_cmd2str(cmsg->Command, cmsg->Subcommand),
-			       cmsg->Info, capi_info2str(cmsg->Info),
-			       cmsg->adr.adrNCCI);
-			ncci_change_state(card, nccip, EV_NCCI_DISCONNECT_B3_CONF_ERROR);
-		}
-		break;
-
-	case CAPI_RESET_B3_IND:	/* ncci */
-		if (!(nccip = find_ncci(card, cmsg->adr.adrNCCI)))
-			goto notfound;
-		ncci_change_state(card, nccip, EV_NCCI_RESET_B3_IND);
-		capi_cmsg_answer(cmsg);
-		send_message(card, cmsg);
-		break;
-
-	case CAPI_RESET_B3_CONF:	/* ncci */
-		goto ignored;	/* $$$$ */
-
-	case CAPI_FACILITY_IND:	/* Controller/plci/ncci */
-		goto ignored;
-	case CAPI_FACILITY_CONF:	/* Controller/plci/ncci */
-		goto ignored;
-
-	default:
-		printk(KERN_ERR "capidrv-%d: got %s for ncci 0x%x ???",
-		       card->contrnr,
-		       capi_cmd2str(cmsg->Command, cmsg->Subcommand),
-		       cmsg->adr.adrNCCI);
-	}
-	return;
-ignored:
-	printk(KERN_INFO "capidrv-%d: %s for ncci 0x%x ignored\n",
-	       card->contrnr,
-	       capi_cmd2str(cmsg->Command, cmsg->Subcommand),
-	       cmsg->adr.adrNCCI);
-	return;
-notfound:
-	printk(KERN_ERR "capidrv-%d: %s: ncci 0x%x not found\n",
-	       card->contrnr,
-	       capi_cmd2str(cmsg->Command, cmsg->Subcommand),
-	       cmsg->adr.adrNCCI);
-}
-
-
-static void handle_data(_cmsg *cmsg, struct sk_buff *skb)
-{
-	capidrv_contr *card = findcontrbynumber(cmsg->adr.adrController & 0x7f);
-	capidrv_ncci *nccip;
-
-	if (!card) {
-		printk(KERN_ERR "capidrv: %s from unknown controller 0x%x\n",
-		       capi_cmd2str(cmsg->Command, cmsg->Subcommand),
-		       cmsg->adr.adrController & 0x7f);
-		kfree_skb(skb);
-		return;
-	}
-	if (!(nccip = find_ncci(card, cmsg->adr.adrNCCI))) {
-		printk(KERN_ERR "capidrv-%d: %s: ncci 0x%x not found\n",
-		       card->contrnr,
-		       capi_cmd2str(cmsg->Command, cmsg->Subcommand),
-		       cmsg->adr.adrNCCI);
-		kfree_skb(skb);
-		return;
-	}
-	(void) skb_pull(skb, CAPIMSG_LEN(skb->data));
-	card->interface.rcvcallb_skb(card->myid, nccip->chan, skb);
-	capi_cmsg_answer(cmsg);
-	send_message(card, cmsg);
-}
-
-static _cmsg s_cmsg;
-
-static void capidrv_recv_message(struct capi20_appl *ap, struct sk_buff *skb)
-{
-	if (capi_message2cmsg(&s_cmsg, skb->data)) {
-		printk(KERN_ERR "capidrv: applid=%d: received invalid message\n",
-		       ap->applid);
-		kfree_skb(skb);
-		return;
-	}
-	if (debugmode > 3) {
-		_cdebbuf *cdb = capi_cmsg2str(&s_cmsg);
-
-		if (cdb) {
-			printk(KERN_DEBUG "%s: applid=%d %s\n", __func__,
-			       ap->applid, cdb->buf);
-			cdebbuf_free(cdb);
-		} else
-			printk(KERN_DEBUG "%s: applid=%d %s not traced\n",
-			       __func__, ap->applid,
-			       capi_cmd2str(s_cmsg.Command, s_cmsg.Subcommand));
-	}
-	if (s_cmsg.Command == CAPI_DATA_B3
-	    && s_cmsg.Subcommand == CAPI_IND) {
-		handle_data(&s_cmsg, skb);
-		return;
-	}
-	if ((s_cmsg.adr.adrController & 0xffffff00) == 0)
-		handle_controller(&s_cmsg);
-	else if ((s_cmsg.adr.adrPLCI & 0xffff0000) == 0)
-		handle_plci(&s_cmsg);
-	else
-		handle_ncci(&s_cmsg);
-	/*
-	 * data of skb used in s_cmsg,
-	 * free data when s_cmsg is not used again
-	 * thanks to Lars Heete <hel@admin.de>
-	 */
-	kfree_skb(skb);
-}
-
-/* ------------------------------------------------------------------- */
-
-#define PUTBYTE_TO_STATUS(card, byte)				\
-	do {							\
-		*(card)->q931_write++ = (byte);			\
-		if ((card)->q931_write > (card)->q931_end)	\
-			(card)->q931_write = (card)->q931_buf;	\
-	} while (0)
-
-static void handle_dtrace_data(capidrv_contr *card,
-			       int send, int level2, u8 *data, u16 len)
-{
-	u8 *p, *end;
-	isdn_ctrl cmd;
-
-	if (!len) {
-		printk(KERN_DEBUG "capidrv-%d: avmb1_q931_data: len == %d\n",
-		       card->contrnr, len);
-		return;
-	}
-
-	if (level2) {
-		PUTBYTE_TO_STATUS(card, 'D');
-		PUTBYTE_TO_STATUS(card, '2');
-		PUTBYTE_TO_STATUS(card, send ? '>' : '<');
-		PUTBYTE_TO_STATUS(card, ':');
-	} else {
-		PUTBYTE_TO_STATUS(card, 'D');
-		PUTBYTE_TO_STATUS(card, '3');
-		PUTBYTE_TO_STATUS(card, send ? '>' : '<');
-		PUTBYTE_TO_STATUS(card, ':');
-	}
-
-	for (p = data, end = data + len; p < end; p++) {
-		PUTBYTE_TO_STATUS(card, ' ');
-		PUTBYTE_TO_STATUS(card, hex_asc_hi(*p));
-		PUTBYTE_TO_STATUS(card, hex_asc_lo(*p));
-	}
-	PUTBYTE_TO_STATUS(card, '\n');
-
-	cmd.command = ISDN_STAT_STAVAIL;
-	cmd.driver = card->myid;
-	cmd.arg = len * 3 + 5;
-	card->interface.statcallb(&cmd);
-}
-
-/* ------------------------------------------------------------------- */
-
-static _cmsg cmdcmsg;
-
-static int capidrv_ioctl(isdn_ctrl *c, capidrv_contr *card)
-{
-	switch (c->arg) {
-	case 1:
-		debugmode = (int)(*((unsigned int *)c->parm.num));
-		printk(KERN_DEBUG "capidrv-%d: debugmode=%d\n",
-		       card->contrnr, debugmode);
-		return 0;
-	default:
-		printk(KERN_DEBUG "capidrv-%d: capidrv_ioctl(%ld) called ??\n",
-		       card->contrnr, c->arg);
-		return -EINVAL;
-	}
-	return -EINVAL;
-}
-
-/*
- * Handle leased lines (CAPI-Bundling)
- */
-
-struct internal_bchannelinfo {
-	unsigned short channelalloc;
-	unsigned short operation;
-	unsigned char  cmask[31];
-};
-
-static int decodeFVteln(char *teln, unsigned long *bmaskp, int *activep)
-{
-	unsigned long bmask = 0;
-	int active = !0;
-	char *s;
-	int i;
-
-	if (strncmp(teln, "FV:", 3) != 0)
-		return 1;
-	s = teln + 3;
-	while (*s && *s == ' ') s++;
-	if (!*s) return -2;
-	if (*s == 'p' || *s == 'P') {
-		active = 0;
-		s++;
-	}
-	if (*s == 'a' || *s == 'A') {
-		active = !0;
-		s++;
-	}
-	while (*s) {
-		int digit1 = 0;
-		int digit2 = 0;
-		char *endp;
-
-		digit1 = simple_strtoul(s, &endp, 10);
-		if (s == endp)
-			return -3;
-		s = endp;
-
-		if (digit1 <= 0 || digit1 > 30) return -4;
-		if (*s == 0 || *s == ',' || *s == ' ') {
-			bmask |= (1 << digit1);
-			digit1 = 0;
-			if (*s) s++;
-			continue;
-		}
-		if (*s != '-') return -5;
-		s++;
-
-		digit2 = simple_strtoul(s, &endp, 10);
-		if (s == endp)
-			return -3;
-		s = endp;
-
-		if (digit2 <= 0 || digit2 > 30) return -4;
-		if (*s == 0 || *s == ',' || *s == ' ') {
-			if (digit1 > digit2)
-				for (i = digit2; i <= digit1; i++)
-					bmask |= (1 << i);
-			else
-				for (i = digit1; i <= digit2; i++)
-					bmask |= (1 << i);
-			digit1 = digit2 = 0;
-			if (*s) s++;
-			continue;
-		}
-		return -6;
-	}
-	if (activep) *activep = active;
-	if (bmaskp) *bmaskp = bmask;
-	return 0;
-}
-
-static int FVteln2capi20(char *teln, u8 AdditionalInfo[1 + 2 + 2 + 31])
-{
-	unsigned long bmask;
-	int active;
-	int rc, i;
-
-	rc = decodeFVteln(teln, &bmask, &active);
-	if (rc) return rc;
-	/* Length */
-	AdditionalInfo[0] = 2 + 2 + 31;
-	/* Channel: 3 => use channel allocation */
-	AdditionalInfo[1] = 3; AdditionalInfo[2] = 0;
-	/* Operation: 0 => DTE mode, 1 => DCE mode */
-	if (active) {
-		AdditionalInfo[3] = 0; AdditionalInfo[4] = 0;
-	} else {
-		AdditionalInfo[3] = 1; AdditionalInfo[4] = 0;
-	}
-	/* Channel mask array */
-	AdditionalInfo[5] = 0; /* no D-Channel */
-	for (i = 1; i <= 30; i++)
-		AdditionalInfo[5 + i] = (bmask & (1 << i)) ? 0xff : 0;
-	return 0;
-}
-
-static int capidrv_command(isdn_ctrl *c, capidrv_contr *card)
-{
-	isdn_ctrl cmd;
-	struct capidrv_bchan *bchan;
-	struct capidrv_plci *plcip;
-	u8 AdditionalInfo[1 + 2 + 2 + 31];
-	int rc, isleasedline = 0;
-
-	if (c->command == ISDN_CMD_IOCTL)
-		return capidrv_ioctl(c, card);
-
-	switch (c->command) {
-	case ISDN_CMD_DIAL: {
-		u8 calling[ISDN_MSNLEN + 3];
-		u8 called[ISDN_MSNLEN + 2];
-
-		if (debugmode)
-			printk(KERN_DEBUG "capidrv-%d: ISDN_CMD_DIAL(ch=%ld,\"%s,%d,%d,%s\")\n",
-			       card->contrnr,
-			       c->arg,
-			       c->parm.setup.phone,
-			       c->parm.setup.si1,
-			       c->parm.setup.si2,
-			       c->parm.setup.eazmsn);
-
-		bchan = &card->bchans[c->arg % card->nbchan];
-
-		if (bchan->plcip) {
-			printk(KERN_ERR "capidrv-%d: dail ch=%ld,\"%s,%d,%d,%s\" in use (plci=0x%x)\n",
-			       card->contrnr,
-			       c->arg,
-			       c->parm.setup.phone,
-			       c->parm.setup.si1,
-			       c->parm.setup.si2,
-			       c->parm.setup.eazmsn,
-			       bchan->plcip->plci);
-			return 0;
-		}
-		bchan->si1 = c->parm.setup.si1;
-		bchan->si2 = c->parm.setup.si2;
-
-		strncpy(bchan->num, c->parm.setup.phone, sizeof(bchan->num));
-		strncpy(bchan->mynum, c->parm.setup.eazmsn, sizeof(bchan->mynum));
-		rc = FVteln2capi20(bchan->num, AdditionalInfo);
-		isleasedline = (rc == 0);
-		if (rc < 0)
-			printk(KERN_ERR "capidrv-%d: WARNING: invalid leased linedefinition \"%s\"\n", card->contrnr, bchan->num);
-
-		if (isleasedline) {
-			calling[0] = 0;
-			called[0] = 0;
-			if (debugmode)
-				printk(KERN_DEBUG "capidrv-%d: connecting leased line\n", card->contrnr);
-		} else {
-			calling[0] = strlen(bchan->mynum) + 2;
-			calling[1] = 0;
-			calling[2] = 0x80;
-			strncpy(calling + 3, bchan->mynum, ISDN_MSNLEN);
-			called[0] = strlen(bchan->num) + 1;
-			called[1] = 0x80;
-			strncpy(called + 2, bchan->num, ISDN_MSNLEN);
-		}
-
-		capi_fill_CONNECT_REQ(&cmdcmsg,
-				      global.ap.applid,
-				      card->msgid++,
-				      card->contrnr,	/* adr */
-				      si2cip(bchan->si1, bchan->si2),	/* cipvalue */
-				      called,	/* CalledPartyNumber */
-				      calling,	/* CallingPartyNumber */
-				      NULL,	/* CalledPartySubaddress */
-				      NULL,	/* CallingPartySubaddress */
-				      b1prot(bchan->l2, bchan->l3),	/* B1protocol */
-				      b2prot(bchan->l2, bchan->l3),	/* B2protocol */
-				      b3prot(bchan->l2, bchan->l3),	/* B3protocol */
-				      b1config(bchan->l2, bchan->l3),	/* B1configuration */
-				      NULL,	/* B2configuration */
-				      NULL,	/* B3configuration */
-				      NULL,	/* BC */
-				      NULL,	/* LLC */
-				      NULL,	/* HLC */
-				      /* BChannelinformation */
-				      isleasedline ? AdditionalInfo : NULL,
-				      NULL,	/* Keypadfacility */
-				      NULL,	/* Useruserdata */
-				      NULL	/* Facilitydataarray */
-			);
-		if ((plcip = new_plci(card, (c->arg % card->nbchan))) == NULL) {
-			cmd.command = ISDN_STAT_DHUP;
-			cmd.driver = card->myid;
-			cmd.arg = (c->arg % card->nbchan);
-			card->interface.statcallb(&cmd);
-			return -1;
-		}
-		plcip->msgid = cmdcmsg.Messagenumber;
-		plcip->leasedline = isleasedline;
-		plci_change_state(card, plcip, EV_PLCI_CONNECT_REQ);
-		send_message(card, &cmdcmsg);
-		return 0;
-	}
-
-	case ISDN_CMD_ACCEPTD:
-
-		bchan = &card->bchans[c->arg % card->nbchan];
-		if (debugmode)
-			printk(KERN_DEBUG "capidrv-%d: ISDN_CMD_ACCEPTD(ch=%ld) l2=%d l3=%d\n",
-			       card->contrnr,
-			       c->arg, bchan->l2, bchan->l3);
-
-		capi_fill_CONNECT_RESP(&cmdcmsg,
-				       global.ap.applid,
-				       card->msgid++,
-				       bchan->plcip->plci,	/* adr */
-				       0,	/* Reject */
-				       b1prot(bchan->l2, bchan->l3),	/* B1protocol */
-				       b2prot(bchan->l2, bchan->l3),	/* B2protocol */
-				       b3prot(bchan->l2, bchan->l3),	/* B3protocol */
-				       b1config(bchan->l2, bchan->l3),	/* B1configuration */
-				       NULL,	/* B2configuration */
-				       NULL,	/* B3configuration */
-				       NULL,	/* ConnectedNumber */
-				       NULL,	/* ConnectedSubaddress */
-				       NULL,	/* LLC */
-				       NULL,	/* BChannelinformation */
-				       NULL,	/* Keypadfacility */
-				       NULL,	/* Useruserdata */
-				       NULL	/* Facilitydataarray */
-			);
-		if (capi_cmsg2message(&cmdcmsg, cmdcmsg.buf)) {
-			printk(KERN_ERR "capidrv-%d: capidrv_command: parser failure\n",
-			       card->contrnr);
-			return -EINVAL;
-		}
-		plci_change_state(card, bchan->plcip, EV_PLCI_CONNECT_RESP);
-		send_message(card, &cmdcmsg);
-		return 0;
-
-	case ISDN_CMD_ACCEPTB:
-		if (debugmode)
-			printk(KERN_DEBUG "capidrv-%d: ISDN_CMD_ACCEPTB(ch=%ld)\n",
-			       card->contrnr,
-			       c->arg);
-		return -ENOSYS;
-
-	case ISDN_CMD_HANGUP:
-		if (debugmode)
-			printk(KERN_DEBUG "capidrv-%d: ISDN_CMD_HANGUP(ch=%ld)\n",
-			       card->contrnr,
-			       c->arg);
-		bchan = &card->bchans[c->arg % card->nbchan];
-
-		if (bchan->disconnecting) {
-			if (debugmode)
-				printk(KERN_DEBUG "capidrv-%d: chan %ld already disconnecting ...\n",
-				       card->contrnr,
-				       c->arg);
-			return 0;
-		}
-		if (bchan->nccip) {
-			bchan->disconnecting = 1;
-			capi_fill_DISCONNECT_B3_REQ(&cmdcmsg,
-						    global.ap.applid,
-						    card->msgid++,
-						    bchan->nccip->ncci,
-						    NULL	/* NCPI */
-				);
-			ncci_change_state(card, bchan->nccip, EV_NCCI_DISCONNECT_B3_REQ);
-			send_message(card, &cmdcmsg);
-			return 0;
-		} else if (bchan->plcip) {
-			if (bchan->plcip->state == ST_PLCI_INCOMING) {
-				/*
-				 * just ignore, we a called from
-				 * isdn_status_callback(),
-				 * which will return 0 or 2, this is handled
-				 * by the CONNECT_IND handler
-				 */
-				bchan->disconnecting = 1;
-				return 0;
-			} else if (bchan->plcip->plci) {
-				bchan->disconnecting = 1;
-				capi_fill_DISCONNECT_REQ(&cmdcmsg,
-							 global.ap.applid,
-							 card->msgid++,
-							 bchan->plcip->plci,
-							 NULL,	/* BChannelinformation */
-							 NULL,	/* Keypadfacility */
-							 NULL,	/* Useruserdata */
-							 NULL	/* Facilitydataarray */
-					);
-				plci_change_state(card, bchan->plcip, EV_PLCI_DISCONNECT_REQ);
-				send_message(card, &cmdcmsg);
-				return 0;
-			} else {
-				printk(KERN_ERR "capidrv-%d: chan %ld disconnect request while waiting for CONNECT_CONF\n",
-				       card->contrnr,
-				       c->arg);
-				return -EINVAL;
-			}
-		}
-		printk(KERN_ERR "capidrv-%d: chan %ld disconnect request on free channel\n",
-		       card->contrnr,
-		       c->arg);
-		return -EINVAL;
-/* ready */
-
-	case ISDN_CMD_SETL2:
-		if (debugmode)
-			printk(KERN_DEBUG "capidrv-%d: set L2 on chan %ld to %ld\n",
-			       card->contrnr,
-			       (c->arg & 0xff), (c->arg >> 8));
-		bchan = &card->bchans[(c->arg & 0xff) % card->nbchan];
-		bchan->l2 = (c->arg >> 8);
-		return 0;
-
-	case ISDN_CMD_SETL3:
-		if (debugmode)
-			printk(KERN_DEBUG "capidrv-%d: set L3 on chan %ld to %ld\n",
-			       card->contrnr,
-			       (c->arg & 0xff), (c->arg >> 8));
-		bchan = &card->bchans[(c->arg & 0xff) % card->nbchan];
-		bchan->l3 = (c->arg >> 8);
-		return 0;
-
-	case ISDN_CMD_SETEAZ:
-		if (debugmode)
-			printk(KERN_DEBUG "capidrv-%d: set EAZ \"%s\" on chan %ld\n",
-			       card->contrnr,
-			       c->parm.num, c->arg);
-		bchan = &card->bchans[c->arg % card->nbchan];
-		strncpy(bchan->msn, c->parm.num, ISDN_MSNLEN);
-		return 0;
-
-	case ISDN_CMD_CLREAZ:
-		if (debugmode)
-			printk(KERN_DEBUG "capidrv-%d: clearing EAZ on chan %ld\n",
-			       card->contrnr, c->arg);
-		bchan = &card->bchans[c->arg % card->nbchan];
-		bchan->msn[0] = 0;
-		return 0;
-
-	default:
-		printk(KERN_ERR "capidrv-%d: ISDN_CMD_%d, Huh?\n",
-		       card->contrnr, c->command);
-		return -EINVAL;
-	}
-	return 0;
-}
-
-static int if_command(isdn_ctrl *c)
-{
-	capidrv_contr *card = findcontrbydriverid(c->driver);
-
-	if (card)
-		return capidrv_command(c, card);
-
-	printk(KERN_ERR
-	       "capidrv: if_command %d called with invalid driverId %d!\n",
-	       c->command, c->driver);
-	return -ENODEV;
-}
-
-static _cmsg sendcmsg;
-
-static int if_sendbuf(int id, int channel, int doack, struct sk_buff *skb)
-{
-	capidrv_contr *card = findcontrbydriverid(id);
-	capidrv_bchan *bchan;
-	capidrv_ncci *nccip;
-	int len = skb->len;
-	int msglen;
-	u16 errcode;
-	u16 datahandle;
-	u32 data;
-
-	if (!card) {
-		printk(KERN_ERR "capidrv: if_sendbuf called with invalid driverId %d!\n",
-		       id);
-		return 0;
-	}
-	if (debugmode > 4)
-		printk(KERN_DEBUG "capidrv-%d: sendbuf len=%d skb=%p doack=%d\n",
-		       card->contrnr, len, skb, doack);
-	bchan = &card->bchans[channel % card->nbchan];
-	nccip = bchan->nccip;
-	if (!nccip || nccip->state != ST_NCCI_ACTIVE) {
-		printk(KERN_ERR "capidrv-%d: if_sendbuf: %s:%d: chan not up!\n",
-		       card->contrnr, card->name, channel);
-		return 0;
-	}
-	datahandle = nccip->datahandle;
-
-	/*
-	 * Here we copy pointer skb->data into the 32-bit 'Data' field.
-	 * The 'Data' field is not used in practice in linux kernel
-	 * (neither in 32 or 64 bit), but should have some value,
-	 * since a CAPI message trace will display it.
-	 *
-	 * The correct value in the 32 bit case is the address of the
-	 * data, in 64 bit it makes no sense, we use 0 there.
-	 */
-
-#ifdef CONFIG_64BIT
-	data = 0;
-#else
-	data = (unsigned long) skb->data;
-#endif
-
-	capi_fill_DATA_B3_REQ(&sendcmsg, global.ap.applid, card->msgid++,
-			      nccip->ncci,	/* adr */
-			      data,		/* Data */
-			      skb->len,		/* DataLength */
-			      datahandle,	/* DataHandle */
-			      0	/* Flags */
-		);
-
-	if (capidrv_add_ack(nccip, datahandle, doack ? (int)skb->len : -1) < 0)
-		return 0;
-
-	if (capi_cmsg2message(&sendcmsg, sendcmsg.buf)) {
-		printk(KERN_ERR "capidrv-%d: if_sendbuf: parser failure\n",
-		       card->contrnr);
-		return -EINVAL;
-	}
-	msglen = CAPIMSG_LEN(sendcmsg.buf);
-	if (skb_headroom(skb) < msglen) {
-		struct sk_buff *nskb = skb_realloc_headroom(skb, msglen);
-		if (!nskb) {
-			printk(KERN_ERR "capidrv-%d: if_sendbuf: no memory\n",
-			       card->contrnr);
-			(void)capidrv_del_ack(nccip, datahandle);
-			return 0;
-		}
-		printk(KERN_DEBUG "capidrv-%d: only %d bytes headroom, need %d\n",
-		       card->contrnr, skb_headroom(skb), msglen);
-		memcpy(skb_push(nskb, msglen), sendcmsg.buf, msglen);
-		errcode = capi20_put_message(&global.ap, nskb);
-		if (errcode == CAPI_NOERROR) {
-			dev_kfree_skb(skb);
-			nccip->datahandle++;
-			return len;
-		}
-		if (debugmode > 3)
-			printk(KERN_DEBUG "capidrv-%d: sendbuf putmsg ret(%x) - %s\n",
-			       card->contrnr, errcode, capi_info2str(errcode));
-		(void)capidrv_del_ack(nccip, datahandle);
-		dev_kfree_skb(nskb);
-		return errcode == CAPI_SENDQUEUEFULL ? 0 : -1;
-	} else {
-		memcpy(skb_push(skb, msglen), sendcmsg.buf, msglen);
-		errcode = capi20_put_message(&global.ap, skb);
-		if (errcode == CAPI_NOERROR) {
-			nccip->datahandle++;
-			return len;
-		}
-		if (debugmode > 3)
-			printk(KERN_DEBUG "capidrv-%d: sendbuf putmsg ret(%x) - %s\n",
-			       card->contrnr, errcode, capi_info2str(errcode));
-		skb_pull(skb, msglen);
-		(void)capidrv_del_ack(nccip, datahandle);
-		return errcode == CAPI_SENDQUEUEFULL ? 0 : -1;
-	}
-}
-
-static int if_readstat(u8 __user *buf, int len, int id, int channel)
-{
-	capidrv_contr *card = findcontrbydriverid(id);
-	int count;
-	u8 __user *p;
-
-	if (!card) {
-		printk(KERN_ERR "capidrv: if_readstat called with invalid driverId %d!\n",
-		       id);
-		return -ENODEV;
-	}
-
-	for (p = buf, count = 0; count < len; p++, count++) {
-		if (put_user(*card->q931_read++, p))
-			return -EFAULT;
-		if (card->q931_read > card->q931_end)
-			card->q931_read = card->q931_buf;
-	}
-	return count;
-
-}
-
-static void enable_dchannel_trace(capidrv_contr *card)
-{
-	u8 manufacturer[CAPI_MANUFACTURER_LEN];
-	capi_version version;
-	u16 contr = card->contrnr;
-	u16 errcode;
-	u16 avmversion[3];
-
-	errcode = capi20_get_manufacturer(contr, manufacturer);
-	if (errcode != CAPI_NOERROR) {
-		printk(KERN_ERR "%s: can't get manufacturer (0x%x)\n",
-		       card->name, errcode);
-		return;
-	}
-	if (strstr(manufacturer, "AVM") == NULL) {
-		printk(KERN_ERR "%s: not from AVM, no d-channel trace possible (%s)\n",
-		       card->name, manufacturer);
-		return;
-	}
-	errcode = capi20_get_version(contr, &version);
-	if (errcode != CAPI_NOERROR) {
-		printk(KERN_ERR "%s: can't get version (0x%x)\n",
-		       card->name, errcode);
-		return;
-	}
-	avmversion[0] = (version.majormanuversion >> 4) & 0x0f;
-	avmversion[1] = (version.majormanuversion << 4) & 0xf0;
-	avmversion[1] |= (version.minormanuversion >> 4) & 0x0f;
-	avmversion[2] |= version.minormanuversion & 0x0f;
-
-	if (avmversion[0] > 3 || (avmversion[0] == 3 && avmversion[1] > 5)) {
-		printk(KERN_INFO "%s: D2 trace enabled\n", card->name);
-		capi_fill_MANUFACTURER_REQ(&cmdcmsg, global.ap.applid,
-					   card->msgid++,
-					   contr,
-					   0x214D5641,  /* ManuID */
-					   0,           /* Class */
-					   1,           /* Function */
-					   (_cstruct)"\004\200\014\000\000");
-	} else {
-		printk(KERN_INFO "%s: D3 trace enabled\n", card->name);
-		capi_fill_MANUFACTURER_REQ(&cmdcmsg, global.ap.applid,
-					   card->msgid++,
-					   contr,
-					   0x214D5641,  /* ManuID */
-					   0,           /* Class */
-					   1,           /* Function */
-					   (_cstruct)"\004\002\003\000\000");
-	}
-	send_message(card, &cmdcmsg);
-}
-
-
-static void send_listen(capidrv_contr *card)
-{
-	capi_fill_LISTEN_REQ(&cmdcmsg, global.ap.applid,
-			     card->msgid++,
-			     card->contrnr, /* controller */
-			     1 << 6,	/* Infomask */
-			     card->cipmask,
-			     card->cipmask2,
-			     NULL, NULL);
-	listen_change_state(card, EV_LISTEN_REQ);
-	send_message(card, &cmdcmsg);
-}
-
-static void listentimerfunc(struct timer_list *t)
-{
-	capidrv_contr *card = from_timer(card, t, listentimer);
-	if (card->state != ST_LISTEN_NONE && card->state != ST_LISTEN_ACTIVE)
-		printk(KERN_ERR "%s: controller dead ??\n", card->name);
-	send_listen(card);
-	mod_timer(&card->listentimer, jiffies + 60 * HZ);
-}
-
-
-static int capidrv_addcontr(u16 contr, struct capi_profile *profp)
-{
-	capidrv_contr *card;
-	unsigned long flags;
-	isdn_ctrl cmd;
-	char id[20];
-	int i;
-
-	sprintf(id, "capidrv-%d", contr);
-	if (!try_module_get(THIS_MODULE)) {
-		printk(KERN_WARNING "capidrv: (%s) Could not reserve module\n", id);
-		return -1;
-	}
-	if (!(card = kzalloc(sizeof(capidrv_contr), GFP_ATOMIC))) {
-		printk(KERN_WARNING
-		       "capidrv: (%s) Could not allocate contr-struct.\n", id);
-		return -1;
-	}
-	card->owner = THIS_MODULE;
-	timer_setup(&card->listentimer, listentimerfunc, 0);
-	strcpy(card->name, id);
-	card->contrnr = contr;
-	card->nbchan = profp->nbchannel;
-	card->bchans = kmalloc_array(card->nbchan, sizeof(capidrv_bchan),
-				     GFP_ATOMIC);
-	if (!card->bchans) {
-		printk(KERN_WARNING
-		       "capidrv: (%s) Could not allocate bchan-structs.\n", id);
-		module_put(card->owner);
-		kfree(card);
-		return -1;
-	}
-	card->interface.channels = profp->nbchannel;
-	card->interface.maxbufsize = 2048;
-	card->interface.command = if_command;
-	card->interface.writebuf_skb = if_sendbuf;
-	card->interface.writecmd = NULL;
-	card->interface.readstat = if_readstat;
-	card->interface.features =
-		ISDN_FEATURE_L2_HDLC |
-		ISDN_FEATURE_L2_TRANS |
-		ISDN_FEATURE_L3_TRANS |
-		ISDN_FEATURE_P_UNKNOWN |
-		ISDN_FEATURE_L2_X75I |
-		ISDN_FEATURE_L2_X75UI |
-		ISDN_FEATURE_L2_X75BUI;
-	if (profp->support1 & (1 << 2))
-		card->interface.features |=
-			ISDN_FEATURE_L2_V11096 |
-			ISDN_FEATURE_L2_V11019 |
-			ISDN_FEATURE_L2_V11038;
-	if (profp->support1 & (1 << 8))
-		card->interface.features |= ISDN_FEATURE_L2_MODEM;
-	card->interface.hl_hdrlen = 22; /* len of DATA_B3_REQ */
-	strncpy(card->interface.id, id, sizeof(card->interface.id) - 1);
-
-
-	card->q931_read = card->q931_buf;
-	card->q931_write = card->q931_buf;
-	card->q931_end = card->q931_buf + sizeof(card->q931_buf) - 1;
-
-	if (!register_isdn(&card->interface)) {
-		printk(KERN_ERR "capidrv: Unable to register contr %s\n", id);
-		kfree(card->bchans);
-		module_put(card->owner);
-		kfree(card);
-		return -1;
-	}
-	card->myid = card->interface.channels;
-	memset(card->bchans, 0, sizeof(capidrv_bchan) * card->nbchan);
-	for (i = 0; i < card->nbchan; i++) {
-		card->bchans[i].contr = card;
-	}
-
-	spin_lock_irqsave(&global_lock, flags);
-	card->next = global.contr_list;
-	global.contr_list = card;
-	global.ncontr++;
-	spin_unlock_irqrestore(&global_lock, flags);
-
-	cmd.command = ISDN_STAT_RUN;
-	cmd.driver = card->myid;
-	card->interface.statcallb(&cmd);
-
-	card->cipmask = 0x1FFF03FF;	/* any */
-	card->cipmask2 = 0;
-
-	send_listen(card);
-	mod_timer(&card->listentimer, jiffies + 60 * HZ);
-
-	printk(KERN_INFO "%s: now up (%d B channels)\n",
-	       card->name, card->nbchan);
-
-	enable_dchannel_trace(card);
-
-	return 0;
-}
-
-static int capidrv_delcontr(u16 contr)
-{
-	capidrv_contr **pp, *card;
-	unsigned long flags;
-	isdn_ctrl cmd;
-
-	spin_lock_irqsave(&global_lock, flags);
-	for (card = global.contr_list; card; card = card->next) {
-		if (card->contrnr == contr)
-			break;
-	}
-	if (!card) {
-		spin_unlock_irqrestore(&global_lock, flags);
-		printk(KERN_ERR "capidrv: delcontr: no contr %u\n", contr);
-		return -1;
-	}
-
-	/* FIXME: maybe a race condition the card should be removed
-	 * here from global list /kkeil
-	 */
-	spin_unlock_irqrestore(&global_lock, flags);
-
-	del_timer(&card->listentimer);
-
-	if (debugmode)
-		printk(KERN_DEBUG "capidrv-%d: id=%d unloading\n",
-		       card->contrnr, card->myid);
-
-	cmd.command = ISDN_STAT_STOP;
-	cmd.driver = card->myid;
-	card->interface.statcallb(&cmd);
-
-	while (card->nbchan) {
-
-		cmd.command = ISDN_STAT_DISCH;
-		cmd.driver = card->myid;
-		cmd.arg = card->nbchan - 1;
-		cmd.parm.num[0] = 0;
-		if (debugmode)
-			printk(KERN_DEBUG "capidrv-%d: id=%d disable chan=%ld\n",
-			       card->contrnr, card->myid, cmd.arg);
-		card->interface.statcallb(&cmd);
-
-		if (card->bchans[card->nbchan - 1].nccip)
-			free_ncci(card, card->bchans[card->nbchan - 1].nccip);
-		if (card->bchans[card->nbchan - 1].plcip)
-			free_plci(card, card->bchans[card->nbchan - 1].plcip);
-		if (card->plci_list)
-			printk(KERN_ERR "capidrv: bug in free_plci()\n");
-		card->nbchan--;
-	}
-	kfree(card->bchans);
-	card->bchans = NULL;
-
-	if (debugmode)
-		printk(KERN_DEBUG "capidrv-%d: id=%d isdn unload\n",
-		       card->contrnr, card->myid);
-
-	cmd.command = ISDN_STAT_UNLOAD;
-	cmd.driver = card->myid;
-	card->interface.statcallb(&cmd);
-
-	if (debugmode)
-		printk(KERN_DEBUG "capidrv-%d: id=%d remove contr from list\n",
-		       card->contrnr, card->myid);
-
-	spin_lock_irqsave(&global_lock, flags);
-	for (pp = &global.contr_list; *pp; pp = &(*pp)->next) {
-		if (*pp == card) {
-			*pp = (*pp)->next;
-			card->next = NULL;
-			global.ncontr--;
-			break;
-		}
-	}
-	spin_unlock_irqrestore(&global_lock, flags);
-
-	module_put(card->owner);
-	printk(KERN_INFO "%s: now down.\n", card->name);
-	kfree(card);
-	return 0;
-}
-
-
-static int
-lower_callback(struct notifier_block *nb, unsigned long val, void *v)
-{
-	capi_profile profile;
-	u32 contr = (long)v;
-
-	switch (val) {
-	case CAPICTR_UP:
-		printk(KERN_INFO "capidrv: controller %hu up\n", contr);
-		if (capi20_get_profile(contr, &profile) == CAPI_NOERROR)
-			(void) capidrv_addcontr(contr, &profile);
-		break;
-	case CAPICTR_DOWN:
-		printk(KERN_INFO "capidrv: controller %hu down\n", contr);
-		(void) capidrv_delcontr(contr);
-		break;
-	}
-	return NOTIFY_OK;
-}
-
-/*
- * /proc/capi/capidrv:
- * nrecvctlpkt nrecvdatapkt nsendctlpkt nsenddatapkt
- */
-static int __maybe_unused capidrv_proc_show(struct seq_file *m, void *v)
-{
-	seq_printf(m, "%lu %lu %lu %lu\n",
-		   global.ap.nrecvctlpkt,
-		   global.ap.nrecvdatapkt,
-		   global.ap.nsentctlpkt,
-		   global.ap.nsentdatapkt);
-	return 0;
-}
-
-static void __init proc_init(void)
-{
-	proc_create_single("capi/capidrv", 0, NULL, capidrv_proc_show);
-}
-
-static void __exit proc_exit(void)
-{
-	remove_proc_entry("capi/capidrv", NULL);
-}
-
-static struct notifier_block capictr_nb = {
-	.notifier_call = lower_callback,
-};
-
-static int __init capidrv_init(void)
-{
-	capi_profile profile;
-	u32 ncontr, contr;
-	u16 errcode;
-
-	global.ap.rparam.level3cnt = -2;  /* number of bchannels twice */
-	global.ap.rparam.datablkcnt = 16;
-	global.ap.rparam.datablklen = 2048;
-
-	global.ap.recv_message = capidrv_recv_message;
-	errcode = capi20_register(&global.ap);
-	if (errcode) {
-		return -EIO;
-	}
-
-	register_capictr_notifier(&capictr_nb);
-
-	errcode = capi20_get_profile(0, &profile);
-	if (errcode != CAPI_NOERROR) {
-		unregister_capictr_notifier(&capictr_nb);
-		capi20_release(&global.ap);
-		return -EIO;
-	}
-
-	ncontr = profile.ncontroller;
-	for (contr = 1; contr <= ncontr; contr++) {
-		errcode = capi20_get_profile(contr, &profile);
-		if (errcode != CAPI_NOERROR)
-			continue;
-		(void) capidrv_addcontr(contr, &profile);
-	}
-	proc_init();
-
-	return 0;
-}
-
-static void __exit capidrv_exit(void)
-{
-	unregister_capictr_notifier(&capictr_nb);
-	capi20_release(&global.ap);
-
-	proc_exit();
-}
-
-module_init(capidrv_init);
-module_exit(capidrv_exit);
diff --git a/drivers/isdn/capi/capidrv.h b/drivers/isdn/capi/capidrv.h
deleted file mode 100644
index 4466b2e0176d..000000000000
--- a/drivers/isdn/capi/capidrv.h
+++ /dev/null
@@ -1,140 +0,0 @@
-/* $Id: capidrv.h,v 1.2.8.2 2001/09/23 22:24:33 kai Exp $
- *
- * ISDN4Linux Driver, using capi20 interface (kernelcapi)
- *
- * Copyright 1997 by Carsten Paeth <calle@calle.de>
- *
- * This software may be used and distributed according to the terms
- * of the GNU General Public License, incorporated herein by reference.
- *
- */
-
-#ifndef __CAPIDRV_H__
-#define __CAPIDRV_H__
-
-/*
- * LISTEN state machine
- */
-#define ST_LISTEN_NONE			0	/* L-0 */
-#define ST_LISTEN_WAIT_CONF		1	/* L-0.1 */
-#define ST_LISTEN_ACTIVE		2	/* L-1 */
-#define ST_LISTEN_ACTIVE_WAIT_CONF	3	/* L-1.1 */
-
-
-#define EV_LISTEN_REQ			1	/* L-0 -> L-0.1
-						   L-1 -> L-1.1 */
-#define EV_LISTEN_CONF_ERROR		2	/* L-0.1 -> L-0
-						   L-1.1 -> L-1 */
-#define EV_LISTEN_CONF_EMPTY		3	/* L-0.1 -> L-0
-						   L-1.1 -> L-0 */
-#define EV_LISTEN_CONF_OK		4	/* L-0.1 -> L-1
-						   L-1.1 -> L.1 */
-
-/*
- * per plci state machine
- */
-#define ST_PLCI_NONE			0	/* P-0 */
-#define ST_PLCI_OUTGOING		1	/* P-0.1 */
-#define ST_PLCI_ALLOCATED		2	/* P-1 */
-#define ST_PLCI_ACTIVE			3	/* P-ACT */
-#define ST_PLCI_INCOMING		4	/* P-2 */
-#define ST_PLCI_FACILITY_IND		5	/* P-3 */
-#define ST_PLCI_ACCEPTING		6	/* P-4 */
-#define ST_PLCI_DISCONNECTING		7	/* P-5 */
-#define ST_PLCI_DISCONNECTED		8	/* P-6 */
-#define ST_PLCI_RESUMEING		9	/* P-0.Res */
-#define ST_PLCI_RESUME			10	/* P-Res */
-#define ST_PLCI_HELD			11	/* P-HELD */
-
-#define EV_PLCI_CONNECT_REQ		1	/* P-0 -> P-0.1
-						 */
-#define EV_PLCI_CONNECT_CONF_ERROR	2	/* P-0.1 -> P-0
-						 */
-#define EV_PLCI_CONNECT_CONF_OK		3	/* P-0.1 -> P-1
-						 */
-#define EV_PLCI_FACILITY_IND_UP		4	/* P-0 -> P-1
-						 */
-#define EV_PLCI_CONNECT_IND		5	/* P-0 -> P-2
-						 */
-#define EV_PLCI_CONNECT_ACTIVE_IND	6	/* P-1 -> P-ACT
-						 */
-#define EV_PLCI_CONNECT_REJECT		7	/* P-2 -> P-5
-						   P-3 -> P-5
-						*/
-#define EV_PLCI_DISCONNECT_REQ		8	/* P-1 -> P-5
-						   P-2 -> P-5
-						   P-3 -> P-5
-						   P-4 -> P-5
-						   P-ACT -> P-5
-						   P-Res -> P-5 (*)
-						   P-HELD -> P-5 (*)
-						*/
-#define EV_PLCI_DISCONNECT_IND		9	/* P-1 -> P-6
-						   P-2 -> P-6
-						   P-3 -> P-6
-						   P-4 -> P-6
-						   P-5 -> P-6
-						   P-ACT -> P-6
-						   P-Res -> P-6 (*)
-						   P-HELD -> P-6 (*)
-						*/
-#define EV_PLCI_FACILITY_IND_DOWN	10	/* P-0.1 -> P-5
-						   P-1 -> P-5
-						   P-ACT -> P-5
-						   P-2 -> P-5
-						   P-3 -> P-5
-						   P-4 -> P-5
-						*/
-#define EV_PLCI_DISCONNECT_RESP		11	/* P-6 -> P-0
-						 */
-#define EV_PLCI_CONNECT_RESP		12	/* P-6 -> P-0
-						 */
-
-#define EV_PLCI_RESUME_REQ		13	/* P-0 -> P-0.Res
-						 */
-#define EV_PLCI_RESUME_CONF_OK		14	/* P-0.Res -> P-Res
-						 */
-#define EV_PLCI_RESUME_CONF_ERROR	15	/* P-0.Res -> P-0
-						 */
-#define EV_PLCI_RESUME_IND		16	/* P-Res -> P-ACT
-						 */
-#define EV_PLCI_HOLD_IND		17	/* P-ACT -> P-HELD
-						 */
-#define EV_PLCI_RETRIEVE_IND		18	/* P-HELD -> P-ACT
-						 */
-#define EV_PLCI_SUSPEND_IND		19	/* P-ACT -> P-5
-						 */
-#define EV_PLCI_CD_IND			20	/* P-2 -> P-5
-						 */
-
-/*
- * per ncci state machine
- */
-#define ST_NCCI_PREVIOUS			-1
-#define ST_NCCI_NONE				0	/* N-0 */
-#define ST_NCCI_OUTGOING			1	/* N-0.1 */
-#define ST_NCCI_INCOMING			2	/* N-1 */
-#define ST_NCCI_ALLOCATED			3	/* N-2 */
-#define ST_NCCI_ACTIVE				4	/* N-ACT */
-#define ST_NCCI_RESETING			5	/* N-3 */
-#define ST_NCCI_DISCONNECTING			6	/* N-4 */
-#define ST_NCCI_DISCONNECTED			7	/* N-5 */
-
-#define EV_NCCI_CONNECT_B3_REQ			1	/* N-0 -> N-0.1 */
-#define EV_NCCI_CONNECT_B3_IND			2	/* N-0 -> N.1 */
-#define EV_NCCI_CONNECT_B3_CONF_OK		3	/* N-0.1 -> N.2 */
-#define EV_NCCI_CONNECT_B3_CONF_ERROR		4	/* N-0.1 -> N.0 */
-#define EV_NCCI_CONNECT_B3_REJECT		5	/* N-1 -> N-4 */
-#define EV_NCCI_CONNECT_B3_RESP			6	/* N-1 -> N-2 */
-#define EV_NCCI_CONNECT_B3_ACTIVE_IND		7	/* N-2 -> N-ACT */
-#define EV_NCCI_RESET_B3_REQ			8	/* N-ACT -> N-3 */
-#define EV_NCCI_RESET_B3_IND			9	/* N-3 -> N-ACT */
-#define EV_NCCI_DISCONNECT_B3_IND		10	/* N-4 -> N.5 */
-#define EV_NCCI_DISCONNECT_B3_CONF_ERROR	11	/* N-4 -> previous */
-#define EV_NCCI_DISCONNECT_B3_REQ		12	/* N-1 -> N-4
-							   N-2 -> N-4
-							   N-3 -> N-4
-							   N-ACT -> N-4 */
-#define EV_NCCI_DISCONNECT_B3_RESP		13	/* N-5 -> N-0 */
-
-#endif				/* __CAPIDRV_H__ */
diff --git a/drivers/isdn/divert/Makefile b/drivers/isdn/divert/Makefile
deleted file mode 100644
index 07684fe53537..000000000000
--- a/drivers/isdn/divert/Makefile
+++ /dev/null
@@ -1,10 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-only
-# Makefile for the dss1_divert ISDN module
-
-# Each configuration option enables a list of files.
-
-obj-$(CONFIG_ISDN_DIVERSION)	+= dss1_divert.o
-
-# Multipart objects.
-
-dss1_divert-y			:= isdn_divert.o divert_procfs.o divert_init.o
diff --git a/drivers/isdn/divert/divert_init.c b/drivers/isdn/divert/divert_init.c
deleted file mode 100644
index 267dede13bfd..000000000000
--- a/drivers/isdn/divert/divert_init.c
+++ /dev/null
@@ -1,82 +0,0 @@
-/* $Id divert_init.c,v 1.5.6.2 2001/01/24 22:18:17 kai Exp $
- *
- * Module init for DSS1 diversion services for i4l.
- *
- * Copyright 1999       by Werner Cornelius (werner@isdn4linux.de)
- *
- * This software may be used and distributed according to the terms
- * of the GNU General Public License, incorporated herein by reference.
- *
- */
-
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/kernel.h>
-
-#include "isdn_divert.h"
-
-MODULE_DESCRIPTION("ISDN4Linux: Call diversion support");
-MODULE_AUTHOR("Werner Cornelius");
-MODULE_LICENSE("GPL");
-
-/****************************************/
-/* structure containing interface to hl */
-/****************************************/
-isdn_divert_if divert_if = {
-	DIVERT_IF_MAGIC,	/* magic value */
-	DIVERT_CMD_REG,		/* register cmd */
-	ll_callback,		/* callback routine from ll */
-	NULL,			/* command still not specified */
-	NULL,			/* drv_to_name */
-	NULL,			/* name_to_drv */
-};
-
-/*************************/
-/* Module interface code */
-/* no cmd line parms     */
-/*************************/
-static int __init divert_init(void)
-{
-	int i;
-
-	if (divert_dev_init()) {
-		printk(KERN_WARNING "dss1_divert: cannot install device, not loaded\n");
-		return (-EIO);
-	}
-	if ((i = DIVERT_REG_NAME(&divert_if)) != DIVERT_NO_ERR) {
-		divert_dev_deinit();
-		printk(KERN_WARNING "dss1_divert: error %d registering module, not loaded\n", i);
-		return (-EIO);
-	}
-	printk(KERN_INFO "dss1_divert module successfully installed\n");
-	return (0);
-}
-
-/**********************/
-/* Module deinit code */
-/**********************/
-static void __exit divert_exit(void)
-{
-	unsigned long flags;
-	int i;
-
-	spin_lock_irqsave(&divert_lock, flags);
-	divert_if.cmd = DIVERT_CMD_REL; /* release */
-	if ((i = DIVERT_REG_NAME(&divert_if)) != DIVERT_NO_ERR) {
-		printk(KERN_WARNING "dss1_divert: error %d releasing module\n", i);
-		spin_unlock_irqrestore(&divert_lock, flags);
-		return;
-	}
-	if (divert_dev_deinit()) {
-		printk(KERN_WARNING "dss1_divert: device busy, remove cancelled\n");
-		spin_unlock_irqrestore(&divert_lock, flags);
-		return;
-	}
-	spin_unlock_irqrestore(&divert_lock, flags);
-	deleterule(-1); /* delete all rules and free mem */
-	deleteprocs();
-	printk(KERN_INFO "dss1_divert module successfully removed \n");
-}
-
-module_init(divert_init);
-module_exit(divert_exit);
diff --git a/drivers/isdn/divert/divert_procfs.c b/drivers/isdn/divert/divert_procfs.c
deleted file mode 100644
index 342585e04fd3..000000000000
--- a/drivers/isdn/divert/divert_procfs.c
+++ /dev/null
@@ -1,336 +0,0 @@
-/* $Id: divert_procfs.c,v 1.11.6.2 2001/09/23 22:24:36 kai Exp $
- *
- * Filesystem handling for the diversion supplementary services.
- *
- * Copyright 1998       by Werner Cornelius (werner@isdn4linux.de)
- *
- * This software may be used and distributed according to the terms
- * of the GNU General Public License, incorporated herein by reference.
- *
- */
-
-#include <linux/module.h>
-#include <linux/poll.h>
-#include <linux/slab.h>
-#ifdef CONFIG_PROC_FS
-#include <linux/proc_fs.h>
-#else
-#include <linux/fs.h>
-#endif
-#include <linux/sched.h>
-#include <linux/isdnif.h>
-#include <net/net_namespace.h>
-#include <linux/mutex.h>
-#include "isdn_divert.h"
-
-
-/*********************************/
-/* Variables for interface queue */
-/*********************************/
-ulong if_used = 0;		/* number of interface users */
-static DEFINE_MUTEX(isdn_divert_mutex);
-static struct divert_info *divert_info_head = NULL;	/* head of queue */
-static struct divert_info *divert_info_tail = NULL;	/* pointer to last entry */
-static DEFINE_SPINLOCK(divert_info_lock);/* lock for queue */
-static wait_queue_head_t rd_queue;
-
-/*********************************/
-/* put an info buffer into queue */
-/*********************************/
-void
-put_info_buffer(char *cp)
-{
-	struct divert_info *ib;
-	unsigned long flags;
-
-	if (if_used <= 0)
-		return;
-	if (!cp)
-		return;
-	if (!*cp)
-		return;
-	if (!(ib = kmalloc(sizeof(struct divert_info) + strlen(cp), GFP_ATOMIC)))
-		return;	/* no memory */
-	strcpy(ib->info_start, cp);	/* set output string */
-	ib->next = NULL;
-	spin_lock_irqsave(&divert_info_lock, flags);
-	ib->usage_cnt = if_used;
-	if (!divert_info_head)
-		divert_info_head = ib;	/* new head */
-	else
-		divert_info_tail->next = ib;	/* follows existing messages */
-	divert_info_tail = ib;	/* new tail */
-
-	/* delete old entrys */
-	while (divert_info_head->next) {
-		if ((divert_info_head->usage_cnt <= 0) &&
-		    (divert_info_head->next->usage_cnt <= 0)) {
-			ib = divert_info_head;
-			divert_info_head = divert_info_head->next;
-			kfree(ib);
-		} else
-			break;
-	}			/* divert_info_head->next */
-	spin_unlock_irqrestore(&divert_info_lock, flags);
-	wake_up_interruptible(&(rd_queue));
-}				/* put_info_buffer */
-
-#ifdef CONFIG_PROC_FS
-
-/**********************************/
-/* deflection device read routine */
-/**********************************/
-static ssize_t
-isdn_divert_read(struct file *file, char __user *buf, size_t count, loff_t *off)
-{
-	struct divert_info *inf;
-	int len;
-
-	if (!(inf = *((struct divert_info **) file->private_data))) {
-		if (file->f_flags & O_NONBLOCK)
-			return -EAGAIN;
-		wait_event_interruptible(rd_queue, (inf =
-			*((struct divert_info **) file->private_data)));
-	}
-	if (!inf)
-		return (0);
-
-	inf->usage_cnt--;	/* new usage count */
-	file->private_data = &inf->next;	/* next structure */
-	if ((len = strlen(inf->info_start)) <= count) {
-		if (copy_to_user(buf, inf->info_start, len))
-			return -EFAULT;
-		*off += len;
-		return (len);
-	}
-	return (0);
-}				/* isdn_divert_read */
-
-/**********************************/
-/* deflection device write routine */
-/**********************************/
-static ssize_t
-isdn_divert_write(struct file *file, const char __user *buf, size_t count, loff_t *off)
-{
-	return (-ENODEV);
-}				/* isdn_divert_write */
-
-
-/***************************************/
-/* select routines for various kernels */
-/***************************************/
-static __poll_t
-isdn_divert_poll(struct file *file, poll_table *wait)
-{
-	__poll_t mask = 0;
-
-	poll_wait(file, &(rd_queue), wait);
-	/* mask = EPOLLOUT | EPOLLWRNORM; */
-	if (*((struct divert_info **) file->private_data)) {
-		mask |= EPOLLIN | EPOLLRDNORM;
-	}
-	return mask;
-}				/* isdn_divert_poll */
-
-/****************/
-/* Open routine */
-/****************/
-static int
-isdn_divert_open(struct inode *ino, struct file *filep)
-{
-	unsigned long flags;
-
-	spin_lock_irqsave(&divert_info_lock, flags);
-	if_used++;
-	if (divert_info_head)
-		filep->private_data = &(divert_info_tail->next);
-	else
-		filep->private_data = &divert_info_head;
-	spin_unlock_irqrestore(&divert_info_lock, flags);
-	/*  start_divert(); */
-	return nonseekable_open(ino, filep);
-}				/* isdn_divert_open */
-
-/*******************/
-/* close routine   */
-/*******************/
-static int
-isdn_divert_close(struct inode *ino, struct file *filep)
-{
-	struct divert_info *inf;
-	unsigned long flags;
-
-	spin_lock_irqsave(&divert_info_lock, flags);
-	if_used--;
-	inf = *((struct divert_info **) filep->private_data);
-	while (inf) {
-		inf->usage_cnt--;
-		inf = inf->next;
-	}
-	if (if_used <= 0)
-		while (divert_info_head) {
-			inf = divert_info_head;
-			divert_info_head = divert_info_head->next;
-			kfree(inf);
-		}
-	spin_unlock_irqrestore(&divert_info_lock, flags);
-	return (0);
-}				/* isdn_divert_close */
-
-/*********/
-/* IOCTL */
-/*********/
-static int isdn_divert_ioctl_unlocked(struct file *file, uint cmd, ulong arg)
-{
-	divert_ioctl dioctl;
-	int i;
-	unsigned long flags;
-	divert_rule *rulep;
-	char *cp;
-
-	if (copy_from_user(&dioctl, (void __user *) arg, sizeof(dioctl)))
-		return -EFAULT;
-
-	switch (cmd) {
-	case IIOCGETVER:
-		dioctl.drv_version = DIVERT_IIOC_VERSION;	/* set version */
-		break;
-
-	case IIOCGETDRV:
-		if ((dioctl.getid.drvid = divert_if.name_to_drv(dioctl.getid.drvnam)) < 0)
-			return (-EINVAL);
-		break;
-
-	case IIOCGETNAM:
-		cp = divert_if.drv_to_name(dioctl.getid.drvid);
-		if (!cp)
-			return (-EINVAL);
-		if (!*cp)
-			return (-EINVAL);
-		strcpy(dioctl.getid.drvnam, cp);
-		break;
-
-	case IIOCGETRULE:
-		if (!(rulep = getruleptr(dioctl.getsetrule.ruleidx)))
-			return (-EINVAL);
-		dioctl.getsetrule.rule = *rulep;	/* copy data */
-		break;
-
-	case IIOCMODRULE:
-		if (!(rulep = getruleptr(dioctl.getsetrule.ruleidx)))
-			return (-EINVAL);
-		spin_lock_irqsave(&divert_lock, flags);
-		*rulep = dioctl.getsetrule.rule;	/* copy data */
-		spin_unlock_irqrestore(&divert_lock, flags);
-		return (0);	/* no copy required */
-		break;
-
-	case IIOCINSRULE:
-		return (insertrule(dioctl.getsetrule.ruleidx, &dioctl.getsetrule.rule));
-		break;
-
-	case IIOCDELRULE:
-		return (deleterule(dioctl.getsetrule.ruleidx));
-		break;
-
-	case IIOCDODFACT:
-		return (deflect_extern_action(dioctl.fwd_ctrl.subcmd,
-					      dioctl.fwd_ctrl.callid,
-					      dioctl.fwd_ctrl.to_nr));
-
-	case IIOCDOCFACT:
-	case IIOCDOCFDIS:
-	case IIOCDOCFINT:
-		if (!divert_if.drv_to_name(dioctl.cf_ctrl.drvid))
-			return (-EINVAL);	/* invalid driver */
-		if (strnlen(dioctl.cf_ctrl.msn, sizeof(dioctl.cf_ctrl.msn)) ==
-		    sizeof(dioctl.cf_ctrl.msn))
-			return -EINVAL;
-		if (strnlen(dioctl.cf_ctrl.fwd_nr, sizeof(dioctl.cf_ctrl.fwd_nr)) ==
-		    sizeof(dioctl.cf_ctrl.fwd_nr))
-			return -EINVAL;
-		if ((i = cf_command(dioctl.cf_ctrl.drvid,
-				    (cmd == IIOCDOCFACT) ? 1 : (cmd == IIOCDOCFDIS) ? 0 : 2,
-				    dioctl.cf_ctrl.cfproc,
-				    dioctl.cf_ctrl.msn,
-				    dioctl.cf_ctrl.service,
-				    dioctl.cf_ctrl.fwd_nr,
-				    &dioctl.cf_ctrl.procid)))
-			return (i);
-		break;
-
-	default:
-		return (-EINVAL);
-	}			/* switch cmd */
-	return copy_to_user((void __user *)arg, &dioctl, sizeof(dioctl)) ? -EFAULT : 0;
-}				/* isdn_divert_ioctl */
-
-static long isdn_divert_ioctl(struct file *file, uint cmd, ulong arg)
-{
-	long ret;
-
-	mutex_lock(&isdn_divert_mutex);
-	ret = isdn_divert_ioctl_unlocked(file, cmd, arg);
-	mutex_unlock(&isdn_divert_mutex);
-
-	return ret;
-}
-
-static const struct file_operations isdn_fops =
-{
-	.owner          = THIS_MODULE,
-	.llseek         = no_llseek,
-	.read           = isdn_divert_read,
-	.write          = isdn_divert_write,
-	.poll           = isdn_divert_poll,
-	.unlocked_ioctl = isdn_divert_ioctl,
-	.open           = isdn_divert_open,
-	.release        = isdn_divert_close,
-};
-
-/****************************/
-/* isdn subdir in /proc/net */
-/****************************/
-static struct proc_dir_entry *isdn_proc_entry = NULL;
-static struct proc_dir_entry *isdn_divert_entry = NULL;
-#endif	/* CONFIG_PROC_FS */
-
-/***************************************************************************/
-/* divert_dev_init must be called before the proc filesystem may be used   */
-/***************************************************************************/
-int
-divert_dev_init(void)
-{
-
-	init_waitqueue_head(&rd_queue);
-
-#ifdef CONFIG_PROC_FS
-	isdn_proc_entry = proc_mkdir("isdn", init_net.proc_net);
-	if (!isdn_proc_entry)
-		return (-1);
-	isdn_divert_entry = proc_create("divert", S_IFREG | S_IRUGO,
-					isdn_proc_entry, &isdn_fops);
-	if (!isdn_divert_entry) {
-		remove_proc_entry("isdn", init_net.proc_net);
-		return (-1);
-	}
-#endif	/* CONFIG_PROC_FS */
-
-	return (0);
-}				/* divert_dev_init */
-
-/***************************************************************************/
-/* divert_dev_deinit must be called before leaving isdn when included as   */
-/* a module.                                                               */
-/***************************************************************************/
-int
-divert_dev_deinit(void)
-{
-
-#ifdef CONFIG_PROC_FS
-	remove_proc_entry("divert", isdn_proc_entry);
-	remove_proc_entry("isdn", init_net.proc_net);
-#endif	/* CONFIG_PROC_FS */
-
-	return (0);
-}				/* divert_dev_deinit */
diff --git a/drivers/isdn/divert/isdn_divert.c b/drivers/isdn/divert/isdn_divert.c
deleted file mode 100644
index 5620fd2c6009..000000000000
--- a/drivers/isdn/divert/isdn_divert.c
+++ /dev/null
@@ -1,846 +0,0 @@
-/* $Id: isdn_divert.c,v 1.6.6.3 2001/09/23 22:24:36 kai Exp $
- *
- * DSS1 main diversion supplementary handling for i4l.
- *
- * Copyright 1999       by Werner Cornelius (werner@isdn4linux.de)
- *
- * This software may be used and distributed according to the terms
- * of the GNU General Public License, incorporated herein by reference.
- *
- */
-
-#include <linux/proc_fs.h>
-#include <linux/slab.h>
-#include <linux/timer.h>
-#include <linux/jiffies.h>
-
-#include "isdn_divert.h"
-
-/**********************************/
-/* structure keeping calling info */
-/**********************************/
-struct call_struc {
-	isdn_ctrl ics; /* delivered setup + driver parameters */
-	ulong divert_id; /* Id delivered to user */
-	unsigned char akt_state; /* actual state */
-	char deflect_dest[35]; /* deflection destination */
-	struct timer_list timer; /* timer control structure */
-	char info[90]; /* device info output */
-	struct call_struc *next; /* pointer to next entry */
-	struct call_struc *prev;
-};
-
-
-/********************************************/
-/* structure keeping deflection table entry */
-/********************************************/
-struct deflect_struc {
-	struct deflect_struc *next, *prev;
-	divert_rule rule; /* used rule */
-};
-
-
-/*****************************************/
-/* variables for main diversion services */
-/*****************************************/
-/* diversion/deflection processes */
-static struct call_struc *divert_head = NULL; /* head of remembered entrys */
-static ulong next_id = 1; /* next info id */
-static struct deflect_struc *table_head = NULL;
-static struct deflect_struc *table_tail = NULL;
-static unsigned char extern_wait_max = 4; /* maximum wait in s for external process */
-
-DEFINE_SPINLOCK(divert_lock);
-
-/***************************/
-/* timer callback function */
-/***************************/
-static void deflect_timer_expire(struct timer_list *t)
-{
-	unsigned long flags;
-	struct call_struc *cs = from_timer(cs, t, timer);
-
-	spin_lock_irqsave(&divert_lock, flags);
-	del_timer(&cs->timer); /* delete active timer */
-	spin_unlock_irqrestore(&divert_lock, flags);
-
-	switch (cs->akt_state) {
-	case DEFLECT_PROCEED:
-		cs->ics.command = ISDN_CMD_HANGUP; /* cancel action */
-		divert_if.ll_cmd(&cs->ics);
-		spin_lock_irqsave(&divert_lock, flags);
-		cs->akt_state = DEFLECT_AUTODEL; /* delete after timeout */
-		cs->timer.expires = jiffies + (HZ * AUTODEL_TIME);
-		add_timer(&cs->timer);
-		spin_unlock_irqrestore(&divert_lock, flags);
-		break;
-
-	case DEFLECT_ALERT:
-		cs->ics.command = ISDN_CMD_REDIR; /* protocol */
-		strlcpy(cs->ics.parm.setup.phone, cs->deflect_dest, sizeof(cs->ics.parm.setup.phone));
-		strcpy(cs->ics.parm.setup.eazmsn, "Testtext delayed");
-		divert_if.ll_cmd(&cs->ics);
-		spin_lock_irqsave(&divert_lock, flags);
-		cs->akt_state = DEFLECT_AUTODEL; /* delete after timeout */
-		cs->timer.expires = jiffies + (HZ * AUTODEL_TIME);
-		add_timer(&cs->timer);
-		spin_unlock_irqrestore(&divert_lock, flags);
-		break;
-
-	case DEFLECT_AUTODEL:
-	default:
-		spin_lock_irqsave(&divert_lock, flags);
-		if (cs->prev)
-			cs->prev->next = cs->next; /* forward link */
-		else
-			divert_head = cs->next;
-		if (cs->next)
-			cs->next->prev = cs->prev; /* back link */
-		spin_unlock_irqrestore(&divert_lock, flags);
-		kfree(cs);
-		return;
-
-	} /* switch */
-} /* deflect_timer_func */
-
-
-/*****************************************/
-/* handle call forwarding de/activations */
-/* 0 = deact, 1 = act, 2 = interrogate   */
-/*****************************************/
-int cf_command(int drvid, int mode,
-	       u_char proc, char *msn,
-	       u_char service, char *fwd_nr, ulong *procid)
-{
-	unsigned long flags;
-	int retval, msnlen;
-	int fwd_len;
-	char *p, *ielenp, tmp[60];
-	struct call_struc *cs;
-
-	if (strchr(msn, '.')) return (-EINVAL); /* subaddress not allowed in msn */
-	if ((proc & 0x7F) > 2) return (-EINVAL);
-	proc &= 3;
-	p = tmp;
-	*p++ = 0x30; /* enumeration */
-	ielenp = p++; /* remember total length position */
-	*p++ = 0xa; /* proc tag */
-	*p++ = 1;   /* length */
-	*p++ = proc & 0x7F; /* procedure to de/activate/interrogate */
-	*p++ = 0xa; /* service tag */
-	*p++ = 1;   /* length */
-	*p++ = service; /* service to handle */
-
-	if (mode == 1) {
-		if (!*fwd_nr) return (-EINVAL); /* destination missing */
-		if (strchr(fwd_nr, '.')) return (-EINVAL); /* subaddress not allowed */
-		fwd_len = strlen(fwd_nr);
-		*p++ = 0x30; /* number enumeration */
-		*p++ = fwd_len + 2; /* complete forward to len */
-		*p++ = 0x80; /* fwd to nr */
-		*p++ = fwd_len; /* length of number */
-		strcpy(p, fwd_nr); /* copy number */
-		p += fwd_len; /* pointer beyond fwd */
-	} /* activate */
-
-	msnlen = strlen(msn);
-	*p++ = 0x80; /* msn number */
-	if (msnlen > 1) {
-		*p++ = msnlen; /* length */
-		strcpy(p, msn);
-		p += msnlen;
-	} else
-		*p++ = 0;
-
-	*ielenp = p - ielenp - 1; /* set total IE length */
-
-	/* allocate mem for information struct */
-	if (!(cs = kmalloc(sizeof(struct call_struc), GFP_ATOMIC)))
-		return (-ENOMEM); /* no memory */
-	timer_setup(&cs->timer, deflect_timer_expire, 0);
-	cs->info[0] = '\0';
-	cs->ics.driver = drvid;
-	cs->ics.command = ISDN_CMD_PROT_IO; /* protocol specific io */
-	cs->ics.arg = DSS1_CMD_INVOKE; /* invoke supplementary service */
-	cs->ics.parm.dss1_io.proc = (mode == 1) ? 7 : (mode == 2) ? 11 : 8; /* operation */
-	cs->ics.parm.dss1_io.timeout = 4000; /* from ETS 300 207-1 */
-	cs->ics.parm.dss1_io.datalen = p - tmp; /* total len */
-	cs->ics.parm.dss1_io.data = tmp; /* start of buffer */
-
-	spin_lock_irqsave(&divert_lock, flags);
-	cs->ics.parm.dss1_io.ll_id = next_id++; /* id for callback */
-	spin_unlock_irqrestore(&divert_lock, flags);
-	*procid = cs->ics.parm.dss1_io.ll_id;
-
-	sprintf(cs->info, "%d 0x%lx %s%s 0 %s %02x %d%s%s\n",
-		(!mode) ? DIVERT_DEACTIVATE : (mode == 1) ? DIVERT_ACTIVATE : DIVERT_REPORT,
-		cs->ics.parm.dss1_io.ll_id,
-		(mode != 2) ? "" : "0 ",
-		divert_if.drv_to_name(cs->ics.driver),
-		msn,
-		service & 0xFF,
-		proc,
-		(mode != 1) ? "" : " 0 ",
-		(mode != 1) ? "" : fwd_nr);
-
-	retval = divert_if.ll_cmd(&cs->ics); /* execute command */
-
-	if (!retval) {
-		cs->prev = NULL;
-		spin_lock_irqsave(&divert_lock, flags);
-		cs->next = divert_head;
-		divert_head = cs;
-		spin_unlock_irqrestore(&divert_lock, flags);
-	} else
-		kfree(cs);
-	return (retval);
-} /* cf_command */
-
-
-/****************************************/
-/* handle a external deflection command */
-/****************************************/
-int deflect_extern_action(u_char cmd, ulong callid, char *to_nr)
-{
-	struct call_struc *cs;
-	isdn_ctrl ic;
-	unsigned long flags;
-	int i;
-
-	if ((cmd & 0x7F) > 2) return (-EINVAL); /* invalid command */
-	cs = divert_head; /* start of parameter list */
-	while (cs) {
-		if (cs->divert_id == callid) break; /* found */
-		cs = cs->next;
-	} /* search entry */
-	if (!cs) return (-EINVAL); /* invalid callid */
-
-	ic.driver = cs->ics.driver;
-	ic.arg = cs->ics.arg;
-	i = -EINVAL;
-	if (cs->akt_state == DEFLECT_AUTODEL) return (i); /* no valid call */
-	switch (cmd & 0x7F) {
-	case 0: /* hangup */
-		del_timer(&cs->timer);
-		ic.command = ISDN_CMD_HANGUP;
-		i = divert_if.ll_cmd(&ic);
-		spin_lock_irqsave(&divert_lock, flags);
-		cs->akt_state = DEFLECT_AUTODEL; /* delete after timeout */
-		cs->timer.expires = jiffies + (HZ * AUTODEL_TIME);
-		add_timer(&cs->timer);
-		spin_unlock_irqrestore(&divert_lock, flags);
-		break;
-
-	case 1: /* alert */
-		if (cs->akt_state == DEFLECT_ALERT) return (0);
-		cmd &= 0x7F; /* never wait */
-		del_timer(&cs->timer);
-		ic.command = ISDN_CMD_ALERT;
-		if ((i = divert_if.ll_cmd(&ic))) {
-			spin_lock_irqsave(&divert_lock, flags);
-			cs->akt_state = DEFLECT_AUTODEL; /* delete after timeout */
-			cs->timer.expires = jiffies + (HZ * AUTODEL_TIME);
-			add_timer(&cs->timer);
-			spin_unlock_irqrestore(&divert_lock, flags);
-		} else
-			cs->akt_state = DEFLECT_ALERT;
-		break;
-
-	case 2: /* redir */
-		del_timer(&cs->timer);
-		strlcpy(cs->ics.parm.setup.phone, to_nr, sizeof(cs->ics.parm.setup.phone));
-		strcpy(cs->ics.parm.setup.eazmsn, "Testtext manual");
-		ic.command = ISDN_CMD_REDIR;
-		if ((i = divert_if.ll_cmd(&ic))) {
-			spin_lock_irqsave(&divert_lock, flags);
-			cs->akt_state = DEFLECT_AUTODEL; /* delete after timeout */
-			cs->timer.expires = jiffies + (HZ * AUTODEL_TIME);
-			add_timer(&cs->timer);
-			spin_unlock_irqrestore(&divert_lock, flags);
-		} else
-			cs->akt_state = DEFLECT_ALERT;
-		break;
-
-	} /* switch */
-	return (i);
-} /* deflect_extern_action */
-
-/********************************/
-/* insert a new rule before idx */
-/********************************/
-int insertrule(int idx, divert_rule *newrule)
-{
-	struct deflect_struc *ds, *ds1 = NULL;
-	unsigned long flags;
-
-	if (!(ds = kmalloc(sizeof(struct deflect_struc), GFP_KERNEL)))
-		return (-ENOMEM); /* no memory */
-
-	ds->rule = *newrule; /* set rule */
-
-	spin_lock_irqsave(&divert_lock, flags);
-
-	if (idx >= 0) {
-		ds1 = table_head;
-		while ((ds1) && (idx > 0))
-		{ idx--;
-			ds1 = ds1->next;
-		}
-		if (!ds1) idx = -1;
-	}
-
-	if (idx < 0) {
-		ds->prev = table_tail; /* previous entry */
-		ds->next = NULL; /* end of chain */
-		if (ds->prev)
-			ds->prev->next = ds; /* last forward */
-		else
-			table_head = ds; /* is first entry */
-		table_tail = ds; /* end of queue */
-	} else {
-		ds->next = ds1; /* next entry */
-		ds->prev = ds1->prev; /* prev entry */
-		ds1->prev = ds; /* backward chain old element */
-		if (!ds->prev)
-			table_head = ds; /* first element */
-	}
-
-	spin_unlock_irqrestore(&divert_lock, flags);
-	return (0);
-} /* insertrule */
-
-/***********************************/
-/* delete the rule at position idx */
-/***********************************/
-int deleterule(int idx)
-{
-	struct deflect_struc *ds, *ds1;
-	unsigned long flags;
-
-	if (idx < 0) {
-		spin_lock_irqsave(&divert_lock, flags);
-		ds = table_head;
-		table_head = NULL;
-		table_tail = NULL;
-		spin_unlock_irqrestore(&divert_lock, flags);
-		while (ds) {
-			ds1 = ds;
-			ds = ds->next;
-			kfree(ds1);
-		}
-		return (0);
-	}
-
-	spin_lock_irqsave(&divert_lock, flags);
-	ds = table_head;
-
-	while ((ds) && (idx > 0)) {
-		idx--;
-		ds = ds->next;
-	}
-
-	if (!ds) {
-		spin_unlock_irqrestore(&divert_lock, flags);
-		return (-EINVAL);
-	}
-
-	if (ds->next)
-		ds->next->prev = ds->prev; /* backward chain */
-	else
-		table_tail = ds->prev; /* end of chain */
-
-	if (ds->prev)
-		ds->prev->next = ds->next; /* forward chain */
-	else
-		table_head = ds->next; /* start of chain */
-
-	spin_unlock_irqrestore(&divert_lock, flags);
-	kfree(ds);
-	return (0);
-} /* deleterule */
-
-/*******************************************/
-/* get a pointer to a specific rule number */
-/*******************************************/
-divert_rule *getruleptr(int idx)
-{
-	struct deflect_struc *ds = table_head;
-
-	if (idx < 0) return (NULL);
-	while ((ds) && (idx >= 0)) {
-		if (!(idx--)) {
-			return (&ds->rule);
-			break;
-		}
-		ds = ds->next;
-	}
-	return (NULL);
-} /* getruleptr */
-
-/*************************************************/
-/* called from common module on an incoming call */
-/*************************************************/
-static int isdn_divert_icall(isdn_ctrl *ic)
-{
-	int retval = 0;
-	unsigned long flags;
-	struct call_struc *cs = NULL;
-	struct deflect_struc *dv;
-	char *p, *p1;
-	u_char accept;
-
-	/* first check the internal deflection table */
-	for (dv = table_head; dv; dv = dv->next) {
-		/* scan table */
-		if (((dv->rule.callopt == 1) && (ic->command == ISDN_STAT_ICALLW)) ||
-		    ((dv->rule.callopt == 2) && (ic->command == ISDN_STAT_ICALL)))
-			continue; /* call option check */
-		if (!(dv->rule.drvid & (1L << ic->driver)))
-			continue; /* driver not matching */
-		if ((dv->rule.si1) && (dv->rule.si1 != ic->parm.setup.si1))
-			continue; /* si1 not matching */
-		if ((dv->rule.si2) && (dv->rule.si2 != ic->parm.setup.si2))
-			continue; /* si2 not matching */
-
-		p = dv->rule.my_msn;
-		p1 = ic->parm.setup.eazmsn;
-		accept = 0;
-		while (*p) {
-			/* complete compare */
-			if (*p == '-') {
-				accept = 1; /* call accepted */
-				break;
-			}
-			if (*p++ != *p1++)
-				break; /* not accepted */
-			if ((!*p) && (!*p1))
-				accept = 1;
-		} /* complete compare */
-		if (!accept) continue; /* not accepted */
-
-		if ((strcmp(dv->rule.caller, "0")) ||
-		    (ic->parm.setup.phone[0])) {
-			p = dv->rule.caller;
-			p1 = ic->parm.setup.phone;
-			accept = 0;
-			while (*p) {
-				/* complete compare */
-				if (*p == '-') {
-					accept = 1; /* call accepted */
-					break;
-				}
-				if (*p++ != *p1++)
-					break; /* not accepted */
-				if ((!*p) && (!*p1))
-					accept = 1;
-			} /* complete compare */
-			if (!accept) continue; /* not accepted */
-		}
-
-		switch (dv->rule.action) {
-		case DEFLECT_IGNORE:
-			return 0;
-
-		case DEFLECT_ALERT:
-		case DEFLECT_PROCEED:
-		case DEFLECT_REPORT:
-		case DEFLECT_REJECT:
-			if (dv->rule.action == DEFLECT_PROCEED)
-				if ((!if_used) || ((!extern_wait_max) && (!dv->rule.waittime)))
-					return (0); /* no external deflection needed */
-			if (!(cs = kmalloc(sizeof(struct call_struc), GFP_ATOMIC)))
-				return (0); /* no memory */
-			timer_setup(&cs->timer, deflect_timer_expire, 0);
-			cs->info[0] = '\0';
-
-			cs->ics = *ic; /* copy incoming data */
-			if (!cs->ics.parm.setup.phone[0]) strcpy(cs->ics.parm.setup.phone, "0");
-			if (!cs->ics.parm.setup.eazmsn[0]) strcpy(cs->ics.parm.setup.eazmsn, "0");
-			cs->ics.parm.setup.screen = dv->rule.screen;
-			if (dv->rule.waittime)
-				cs->timer.expires = jiffies + (HZ * dv->rule.waittime);
-			else if (dv->rule.action == DEFLECT_PROCEED)
-				cs->timer.expires = jiffies + (HZ * extern_wait_max);
-			else
-				cs->timer.expires = 0;
-			cs->akt_state = dv->rule.action;
-			spin_lock_irqsave(&divert_lock, flags);
-			cs->divert_id = next_id++; /* new sequence number */
-			spin_unlock_irqrestore(&divert_lock, flags);
-			cs->prev = NULL;
-			if (cs->akt_state == DEFLECT_ALERT) {
-				strcpy(cs->deflect_dest, dv->rule.to_nr);
-				if (!cs->timer.expires) {
-					strcpy(ic->parm.setup.eazmsn,
-					       "Testtext direct");
-					ic->parm.setup.screen = dv->rule.screen;
-					strlcpy(ic->parm.setup.phone, dv->rule.to_nr, sizeof(ic->parm.setup.phone));
-					cs->akt_state = DEFLECT_AUTODEL; /* delete after timeout */
-					cs->timer.expires = jiffies + (HZ * AUTODEL_TIME);
-					retval = 5;
-				} else
-					retval = 1; /* alerting */
-			} else {
-				cs->deflect_dest[0] = '\0';
-				retval = 4; /* only proceed */
-			}
-			snprintf(cs->info, sizeof(cs->info),
-				 "%d 0x%lx %s %s %s %s 0x%x 0x%x %d %d %s\n",
-				 cs->akt_state,
-				 cs->divert_id,
-				 divert_if.drv_to_name(cs->ics.driver),
-				 (ic->command == ISDN_STAT_ICALLW) ? "1" : "0",
-				 cs->ics.parm.setup.phone,
-				 cs->ics.parm.setup.eazmsn,
-				 cs->ics.parm.setup.si1,
-				 cs->ics.parm.setup.si2,
-				 cs->ics.parm.setup.screen,
-				 dv->rule.waittime,
-				 cs->deflect_dest);
-			if ((dv->rule.action == DEFLECT_REPORT) ||
-			    (dv->rule.action == DEFLECT_REJECT)) {
-				put_info_buffer(cs->info);
-				kfree(cs); /* remove */
-				return ((dv->rule.action == DEFLECT_REPORT) ? 0 : 2); /* nothing to do */
-			}
-			break;
-
-		default:
-			return 0; /* ignore call */
-		} /* switch action */
-		break; /* will break the 'for' looping */
-	} /* scan_table */
-
-	if (cs) {
-		cs->prev = NULL;
-		spin_lock_irqsave(&divert_lock, flags);
-		cs->next = divert_head;
-		divert_head = cs;
-		if (cs->timer.expires) add_timer(&cs->timer);
-		spin_unlock_irqrestore(&divert_lock, flags);
-
-		put_info_buffer(cs->info);
-		return (retval);
-	} else
-		return (0);
-} /* isdn_divert_icall */
-
-
-void deleteprocs(void)
-{
-	struct call_struc *cs, *cs1;
-	unsigned long flags;
-
-	spin_lock_irqsave(&divert_lock, flags);
-	cs = divert_head;
-	divert_head = NULL;
-	while (cs) {
-		del_timer(&cs->timer);
-		cs1 = cs;
-		cs = cs->next;
-		kfree(cs1);
-	}
-	spin_unlock_irqrestore(&divert_lock, flags);
-} /* deleteprocs */
-
-/****************************************************/
-/* put a address including address type into buffer */
-/****************************************************/
-static int put_address(char *st, u_char *p, int len)
-{
-	u_char retval = 0;
-	u_char adr_typ = 0; /* network standard */
-
-	if (len < 2) return (retval);
-	if (*p == 0xA1) {
-		retval = *(++p) + 2; /* total length */
-		if (retval > len) return (0); /* too short */
-		len = retval - 2; /* remaining length */
-		if (len < 3) return (0);
-		if ((*(++p) != 0x0A) || (*(++p) != 1)) return (0);
-		adr_typ = *(++p);
-		len -= 3;
-		p++;
-		if (len < 2) return (0);
-		if (*p++ != 0x12) return (0);
-		if (*p > len) return (0); /* check number length */
-		len = *p++;
-	} else if (*p == 0x80) {
-		retval = *(++p) + 2; /* total length */
-		if (retval > len) return (0);
-		len = retval - 2;
-		p++;
-	} else
-		return (0); /* invalid address information */
-
-	sprintf(st, "%d ", adr_typ);
-	st += strlen(st);
-	if (!len)
-		*st++ = '-';
-	else
-		while (len--)
-			*st++ = *p++;
-	*st = '\0';
-	return (retval);
-} /* put_address */
-
-/*************************************/
-/* report a successful interrogation */
-/*************************************/
-static int interrogate_success(isdn_ctrl *ic, struct call_struc *cs)
-{
-	char *src = ic->parm.dss1_io.data;
-	int restlen = ic->parm.dss1_io.datalen;
-	int cnt = 1;
-	u_char n, n1;
-	char st[90], *p, *stp;
-
-	if (restlen < 2) return (-100); /* frame too short */
-	if (*src++ != 0x30) return (-101);
-	if ((n = *src++) > 0x81) return (-102); /* invalid length field */
-	restlen -= 2; /* remaining bytes */
-	if (n == 0x80) {
-		if (restlen < 2) return (-103);
-		if ((*(src + restlen - 1)) || (*(src + restlen - 2))) return (-104);
-		restlen -= 2;
-	} else if (n == 0x81) {
-		n = *src++;
-		restlen--;
-		if (n > restlen) return (-105);
-		restlen = n;
-	} else if (n > restlen)
-		return (-106);
-	else
-		restlen = n; /* standard format */
-	if (restlen < 3) return (-107); /* no procedure */
-	if ((*src++ != 2) || (*src++ != 1) || (*src++ != 0x0B)) return (-108);
-	restlen -= 3;
-	if (restlen < 2) return (-109); /* list missing */
-	if (*src == 0x31) {
-		src++;
-		if ((n = *src++) > 0x81) return (-110); /* invalid length field */
-		restlen -= 2; /* remaining bytes */
-		if (n == 0x80) {
-			if (restlen < 2) return (-111);
-			if ((*(src + restlen - 1)) || (*(src + restlen - 2))) return (-112);
-			restlen -= 2;
-		} else if (n == 0x81) {
-			n = *src++;
-			restlen--;
-			if (n > restlen) return (-113);
-			restlen = n;
-		} else if (n > restlen)
-			return (-114);
-		else
-			restlen = n; /* standard format */
-	} /* result list header */
-
-	while (restlen >= 2) {
-		stp = st;
-		sprintf(stp, "%d 0x%lx %d %s ", DIVERT_REPORT, ic->parm.dss1_io.ll_id,
-			cnt++, divert_if.drv_to_name(ic->driver));
-		stp += strlen(stp);
-		if (*src++ != 0x30) return (-115); /* invalid enum */
-		n = *src++;
-		restlen -= 2;
-		if (n > restlen) return (-116); /* enum length wrong */
-		restlen -= n;
-		p = src; /* one entry */
-		src += n;
-		if (!(n1 = put_address(stp, p, n & 0xFF))) continue;
-		stp += strlen(stp);
-		p += n1;
-		n -= n1;
-		if (n < 6) continue; /* no service and proc */
-		if ((*p++ != 0x0A) || (*p++ != 1)) continue;
-		sprintf(stp, " 0x%02x ", (*p++) & 0xFF);
-		stp += strlen(stp);
-		if ((*p++ != 0x0A) || (*p++ != 1)) continue;
-		sprintf(stp, "%d ", (*p++) & 0xFF);
-		stp += strlen(stp);
-		n -= 6;
-		if (n > 2) {
-			if (*p++ != 0x30) continue;
-			if (*p > (n - 2)) continue;
-			n = *p++;
-			if (!(n1 = put_address(stp, p, n & 0xFF))) continue;
-			stp += strlen(stp);
-		}
-		sprintf(stp, "\n");
-		put_info_buffer(st);
-	} /* while restlen */
-	if (restlen) return (-117);
-	return (0);
-} /* interrogate_success */
-
-/*********************************************/
-/* callback for protocol specific extensions */
-/*********************************************/
-static int prot_stat_callback(isdn_ctrl *ic)
-{
-	struct call_struc *cs, *cs1;
-	int i;
-	unsigned long flags;
-
-	cs = divert_head; /* start of list */
-	cs1 = NULL;
-	while (cs) {
-		if (ic->driver == cs->ics.driver) {
-			switch (cs->ics.arg) {
-			case DSS1_CMD_INVOKE:
-				if ((cs->ics.parm.dss1_io.ll_id == ic->parm.dss1_io.ll_id) &&
-				    (cs->ics.parm.dss1_io.hl_id == ic->parm.dss1_io.hl_id)) {
-					switch (ic->arg) {
-					case DSS1_STAT_INVOKE_ERR:
-						sprintf(cs->info, "128 0x%lx 0x%x\n",
-							ic->parm.dss1_io.ll_id,
-							ic->parm.dss1_io.timeout);
-						put_info_buffer(cs->info);
-						break;
-
-					case DSS1_STAT_INVOKE_RES:
-						switch (cs->ics.parm.dss1_io.proc) {
-						case  7:
-						case  8:
-							put_info_buffer(cs->info);
-							break;
-
-						case  11:
-							i = interrogate_success(ic, cs);
-							if (i)
-								sprintf(cs->info, "%d 0x%lx %d\n", DIVERT_REPORT,
-									ic->parm.dss1_io.ll_id, i);
-							put_info_buffer(cs->info);
-							break;
-
-						default:
-							printk(KERN_WARNING "dss1_divert: unknown proc %d\n", cs->ics.parm.dss1_io.proc);
-							break;
-						}
-
-						break;
-
-					default:
-						printk(KERN_WARNING "dss1_divert unknown invoke answer %lx\n", ic->arg);
-						break;
-					}
-					cs1 = cs; /* remember structure */
-					cs = NULL;
-					continue; /* abort search */
-				} /* id found */
-				break;
-
-			case DSS1_CMD_INVOKE_ABORT:
-				printk(KERN_WARNING "dss1_divert unhandled invoke abort\n");
-				break;
-
-			default:
-				printk(KERN_WARNING "dss1_divert unknown cmd 0x%lx\n", cs->ics.arg);
-				break;
-			} /* switch ics.arg */
-			cs = cs->next;
-		} /* driver ok */
-	}
-
-	if (!cs1) {
-		printk(KERN_WARNING "dss1_divert unhandled process\n");
-		return (0);
-	}
-
-	if (cs1->ics.driver == -1) {
-		spin_lock_irqsave(&divert_lock, flags);
-		del_timer(&cs1->timer);
-		if (cs1->prev)
-			cs1->prev->next = cs1->next; /* forward link */
-		else
-			divert_head = cs1->next;
-		if (cs1->next)
-			cs1->next->prev = cs1->prev; /* back link */
-		spin_unlock_irqrestore(&divert_lock, flags);
-		kfree(cs1);
-	}
-
-	return (0);
-} /* prot_stat_callback */
-
-
-/***************************/
-/* status callback from HL */
-/***************************/
-static int isdn_divert_stat_callback(isdn_ctrl *ic)
-{
-	struct call_struc *cs, *cs1;
-	unsigned long flags;
-	int retval;
-
-	retval = -1;
-	cs = divert_head; /* start of list */
-	while (cs) {
-		if ((ic->driver == cs->ics.driver) &&
-		    (ic->arg == cs->ics.arg)) {
-			switch (ic->command) {
-			case ISDN_STAT_DHUP:
-				sprintf(cs->info, "129 0x%lx\n", cs->divert_id);
-				del_timer(&cs->timer);
-				cs->ics.driver = -1;
-				break;
-
-			case ISDN_STAT_CAUSE:
-				sprintf(cs->info, "130 0x%lx %s\n", cs->divert_id, ic->parm.num);
-				break;
-
-			case ISDN_STAT_REDIR:
-				sprintf(cs->info, "131 0x%lx\n", cs->divert_id);
-				del_timer(&cs->timer);
-				cs->ics.driver = -1;
-				break;
-
-			default:
-				sprintf(cs->info, "999 0x%lx 0x%x\n", cs->divert_id, (int)(ic->command));
-				break;
-			}
-			put_info_buffer(cs->info);
-			retval = 0;
-		}
-		cs1 = cs;
-		cs = cs->next;
-		if (cs1->ics.driver == -1) {
-			spin_lock_irqsave(&divert_lock, flags);
-			if (cs1->prev)
-				cs1->prev->next = cs1->next; /* forward link */
-			else
-				divert_head = cs1->next;
-			if (cs1->next)
-				cs1->next->prev = cs1->prev; /* back link */
-			spin_unlock_irqrestore(&divert_lock, flags);
-			kfree(cs1);
-		}
-	}
-	return (retval); /* not found */
-} /* isdn_divert_stat_callback */
-
-
-/********************/
-/* callback from ll */
-/********************/
-int ll_callback(isdn_ctrl *ic)
-{
-	switch (ic->command) {
-	case ISDN_STAT_ICALL:
-	case ISDN_STAT_ICALLW:
-		return (isdn_divert_icall(ic));
-		break;
-
-	case ISDN_STAT_PROT:
-		if ((ic->arg & 0xFF) == ISDN_PTYPE_EURO) {
-			if (ic->arg != DSS1_STAT_INVOKE_BRD)
-				return (prot_stat_callback(ic));
-			else
-				return (0); /* DSS1 invoke broadcast */
-		} else
-			return (-1); /* protocol not euro */
-
-	default:
-		return (isdn_divert_stat_callback(ic));
-	}
-} /* ll_callback */
diff --git a/drivers/isdn/divert/isdn_divert.h b/drivers/isdn/divert/isdn_divert.h
deleted file mode 100644
index 55033dd872c0..000000000000
--- a/drivers/isdn/divert/isdn_divert.h
+++ /dev/null
@@ -1,132 +0,0 @@
-/* $Id: isdn_divert.h,v 1.5.6.1 2001/09/23 22:24:36 kai Exp $
- *
- * Header for the diversion supplementary ioctl interface.
- *
- * Copyright 1998       by Werner Cornelius (werner@ikt.de)
- *
- * This software may be used and distributed according to the terms
- * of the GNU General Public License, incorporated herein by reference.
- *
- */
-
-#include <linux/ioctl.h>
-#include <linux/types.h>
-
-/******************************************/
-/* IOCTL codes for interface to user prog */
-/******************************************/
-#define DIVERT_IIOC_VERSION 0x01 /* actual version */
-#define IIOCGETVER   _IO('I', 1)  /* get version of interface */
-#define IIOCGETDRV   _IO('I', 2)  /* get driver number */
-#define IIOCGETNAM   _IO('I', 3)  /* get driver name */
-#define IIOCGETRULE  _IO('I', 4)  /* read one rule */
-#define IIOCMODRULE  _IO('I', 5)  /* modify/replace a rule */
-#define IIOCINSRULE  _IO('I', 6)  /* insert/append one rule */
-#define IIOCDELRULE  _IO('I', 7)  /* delete a rule */
-#define IIOCDODFACT  _IO('I', 8)  /* hangup/reject/alert/immediately deflect a call */
-#define IIOCDOCFACT  _IO('I', 9)  /* activate control forwarding in PBX */
-#define IIOCDOCFDIS  _IO('I', 10)  /* deactivate control forwarding in PBX */
-#define IIOCDOCFINT  _IO('I', 11)  /* interrogate control forwarding in PBX */
-
-/*************************************/
-/* states reported through interface */
-/*************************************/
-#define DEFLECT_IGNORE    0  /* ignore incoming call */
-#define DEFLECT_REPORT    1  /* only report */
-#define DEFLECT_PROCEED   2  /* deflect when externally triggered */
-#define DEFLECT_ALERT     3  /* alert and deflect after delay */
-#define DEFLECT_REJECT    4  /* reject immediately */
-#define DIVERT_ACTIVATE   5  /* diversion activate */
-#define DIVERT_DEACTIVATE 6  /* diversion deactivate */
-#define DIVERT_REPORT     7  /* interrogation result */
-#define DEFLECT_AUTODEL 255  /* only for internal use */
-
-#define DEFLECT_ALL_IDS   0xFFFFFFFF /* all drivers selected */
-
-typedef struct {
-	ulong drvid;     /* driver ids, bit mapped */
-	char my_msn[35]; /* desired msn, subaddr allowed */
-	char caller[35]; /* caller id, partial string with * + subaddr allowed */
-	char to_nr[35];  /* deflected to number incl. subaddress */
-	u_char si1, si2;  /* service indicators, si1=bitmask, si1+2 0 = all */
-	u_char screen;   /* screening: 0 = no info, 1 = info, 2 = nfo with nr */
-	u_char callopt;  /* option for call handling:
-			    0 = all calls
-			    1 = only non waiting calls
-			    2 = only waiting calls */
-	u_char action;   /* desired action:
-			    0 = don't report call -> ignore
-			    1 = report call, do not allow/proceed for deflection
-			    2 = report call, send proceed, wait max waittime secs
-			    3 = report call, alert and deflect after waittime
-			    4 = report call, reject immediately
-			    actions 1-2 only take place if interface is opened
-			 */
-	u_char waittime; /* maximum wait time for proceeding */
-} divert_rule;
-
-typedef union {
-	int drv_version; /* return of driver version */
-	struct {
-		int drvid;		/* id of driver */
-		char drvnam[30];	/* name of driver */
-	} getid;
-	struct {
-		int ruleidx;	/* index of rule */
-		divert_rule rule;	/* rule parms */
-	} getsetrule;
-	struct {
-		u_char subcmd;  /* 0 = hangup/reject,
-			     1 = alert,
-			     2 = deflect */
-		ulong callid;   /* id of call delivered by ascii output */
-		char to_nr[35]; /* destination when deflect,
-				   else uus1 string (maxlen 31),
-				   data from rule used if empty */
-	} fwd_ctrl;
-	struct {
-		int drvid;      /* id of driver */
-		u_char cfproc;  /* cfu = 0, cfb = 1, cfnr = 2 */
-		ulong procid;   /* process id returned when no error */
-		u_char service; /* basically coded service, 0 = all */
-		char msn[25];   /* desired msn, empty = all */
-		char fwd_nr[35];/* forwarded to number + subaddress */
-	} cf_ctrl;
-} divert_ioctl;
-
-#ifdef __KERNEL__
-
-#include <linux/isdnif.h>
-#include <linux/isdn_divertif.h>
-
-#define AUTODEL_TIME 30 /* timeout in s to delete internal entries */
-
-/**************************************************/
-/* structure keeping ascii info for device output */
-/**************************************************/
-struct divert_info {
-	struct divert_info *next;
-	ulong usage_cnt; /* number of files still to work */
-	char info_start[2]; /* info string start */
-};
-
-
-/**************/
-/* Prototypes */
-/**************/
-extern spinlock_t divert_lock;
-
-extern ulong if_used; /* number of interface users */
-extern int divert_dev_deinit(void);
-extern int divert_dev_init(void);
-extern void put_info_buffer(char *);
-extern int ll_callback(isdn_ctrl *);
-extern isdn_divert_if divert_if;
-extern divert_rule *getruleptr(int);
-extern int insertrule(int, divert_rule *);
-extern int deleterule(int);
-extern void deleteprocs(void);
-extern int deflect_extern_action(u_char, ulong, char *);
-extern int cf_command(int, int, u_char, char *, u_char, char *, ulong *);
-
-#endif /* __KERNEL__ */
diff --git a/drivers/isdn/i4l/Kconfig b/drivers/isdn/i4l/Kconfig
deleted file mode 100644
index cacde8de38a3..000000000000
--- a/drivers/isdn/i4l/Kconfig
+++ /dev/null
@@ -1,127 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-only
-#
-# Old ISDN4Linux config
-#
-
-if ISDN_I4L
-
-config ISDN_PPP
-	bool "Support synchronous PPP"
-	depends on INET
-	select SLHC
-	help
-	  Over digital connections such as ISDN, there is no need to
-	  synchronize sender and recipient's clocks with start and stop bits
-	  as is done over analog telephone lines. Instead, one can use
-	  "synchronous PPP". Saying Y here will include this protocol. This
-	  protocol is used by Cisco and Sun for example. So you want to say Y
-	  here if the other end of your ISDN connection supports it. You will
-	  need a special version of pppd (called ipppd) for using this
-	  feature. See <file:Documentation/isdn/README.syncppp> and
-	  <file:Documentation/isdn/syncPPP.FAQ> for more information.
-
-config ISDN_PPP_VJ
-	bool "Use VJ-compression with synchronous PPP"
-	depends on ISDN_PPP
-	help
-	  This enables Van Jacobson header compression for synchronous PPP.
-	  Say Y if the other end of the connection supports it.
-
-config ISDN_MPP
-	bool "Support generic MP (RFC 1717)"
-	depends on ISDN_PPP
-	help
-	  With synchronous PPP enabled, it is possible to increase throughput
-	  by bundling several ISDN-connections, using this protocol. See
-	  <file:Documentation/isdn/README.syncppp> for more information.
-
-config IPPP_FILTER
-	bool "Filtering for synchronous PPP"
-	depends on ISDN_PPP
-	help
-	  Say Y here if you want to be able to filter the packets passing over
-	  IPPP interfaces.  This allows you to control which packets count as
-	  activity (i.e. which packets will reset the idle timer or bring up
-	  a demand-dialled link) and which packets are to be dropped entirely.
-	  You need to say Y here if you wish to use the pass-filter and
-	  active-filter options to ipppd.
-
-config ISDN_PPP_BSDCOMP
-	tristate "Support BSD compression"
-	depends on ISDN_PPP
-	help
-	  Support for the BSD-Compress compression method for PPP, which uses
-	  the LZW compression method to compress each PPP packet before it is
-	  sent over the wire. The machine at the other end of the PPP link
-	  (usually your ISP) has to support the BSD-Compress compression
-	  method as well for this to be useful. Even if they don't support it,
-	  it is safe to say Y here.
-
-config ISDN_AUDIO
-	bool "Support audio via ISDN"
-	help
-	  If you say Y here, the modem-emulator will support a subset of the
-	  EIA Class 8 Voice commands. Using a getty with voice-support
-	  (mgetty+sendfax by <gert@greenie.muc.de> with an extension, available
-	  with the ISDN utility package for example), you will be able to use
-	  your Linux box as an ISDN-answering machine. Of course, this must be
-	  supported by the lowlevel driver also. Currently, the HiSax driver
-	  is the only voice-supporting driver. See
-	  <file:Documentation/isdn/README.audio> for more information.
-
-config ISDN_TTY_FAX
-	bool "Support AT-Fax Class 1 and 2 commands"
-	depends on ISDN_AUDIO
-	help
-	  If you say Y here, the modem-emulator will support a subset of the
-	  Fax Class 1 and 2 commands. Using a getty with fax-support
-	  (mgetty+sendfax, hylafax), you will be able to use your Linux box as
-	  an ISDN-fax-machine. This must be supported by the lowlevel driver
-	  also. See <file:Documentation/isdn/README.fax> for more information.
-
-config ISDN_X25
-	bool "X.25 PLP on top of ISDN"
-	depends on X25
-	help
-	  This feature provides the X.25 protocol over ISDN connections.
-	  See <file:Documentation/isdn/README.x25> for more information
-	  if you are thinking about using this.
-
-
-menu "ISDN feature submodules"
-
-config ISDN_DRV_LOOP
-	tristate "isdnloop support"
-	depends on BROKEN_ON_SMP
-	help
-	  This driver provides a virtual ISDN card. Its primary purpose is
-	  testing of linklevel features or configuration without getting
-	  charged by your service-provider for lots of phone calls.
-	  You need will need the loopctrl utility from the latest isdn4k-utils
-	  package to set up this driver.
-
-config ISDN_DIVERSION
-	tristate "Support isdn diversion services"
-	help
-	  This option allows you to use some supplementary diversion
-	  services in conjunction with the HiSax driver on an EURO/DSS1
-	  line.
-
-	  Supported options are CD (call deflection), CFU (Call forward
-	  unconditional), CFB (Call forward when busy) and CFNR (call forward
-	  not reachable). Additionally the actual CFU, CFB and CFNR state may
-	  be interrogated.
-
-	  The use of CFU, CFB, CFNR and interrogation may be limited to some
-	  countries. The keypad protocol is still not implemented. CD should
-	  work in all countries if the service has been subscribed to.
-
-	  Please read the file <file:Documentation/isdn/README.diversion>.
-
-endmenu
-
-comment "ISDN4Linux hardware drivers"
-
-# end ISDN_I4L
-endif
-
diff --git a/drivers/isdn/i4l/Makefile b/drivers/isdn/i4l/Makefile
index be77500c9e86..11fe697739d5 100644
--- a/drivers/isdn/i4l/Makefile
+++ b/drivers/isdn/i4l/Makefile
@@ -3,18 +3,4 @@
 
 # Each configuration option enables a list of files.
 
-obj-$(CONFIG_ISDN_I4L)		+= isdn.o
-obj-$(CONFIG_ISDN_PPP_BSDCOMP)	+= isdn_bsdcomp.o
 obj-$(CONFIG_ISDN_HDLC)		+= isdnhdlc.o
-
-# Multipart objects.
-
-isdn-y				:= isdn_net.o isdn_tty.o isdn_v110.o isdn_common.o
-
-# Optional parts of multipart objects.
-
-isdn-$(CONFIG_ISDN_PPP)		+= isdn_ppp.o
-isdn-$(CONFIG_ISDN_X25)		+= isdn_concap.o isdn_x25iface.o
-isdn-$(CONFIG_ISDN_AUDIO)		+= isdn_audio.o
-isdn-$(CONFIG_ISDN_TTY_FAX)	+= isdn_ttyfax.o
-
diff --git a/drivers/isdn/i4l/isdn_audio.c b/drivers/isdn/i4l/isdn_audio.c
deleted file mode 100644
index b6bcd1eca128..000000000000
--- a/drivers/isdn/i4l/isdn_audio.c
+++ /dev/null
@@ -1,711 +0,0 @@
-/* $Id: isdn_audio.c,v 1.1.2.2 2004/01/12 22:37:18 keil Exp $
- *
- * Linux ISDN subsystem, audio conversion and compression (linklevel).
- *
- * Copyright 1994-1999 by Fritz Elfert (fritz@isdn4linux.de)
- * DTMF code (c) 1996 by Christian Mock (cm@kukuruz.ping.at)
- * Silence detection (c) 1998 by Armin Schindler (mac@gismo.telekom.de)
- *
- * This software may be used and distributed according to the terms
- * of the GNU General Public License, incorporated herein by reference.
- *
- */
-
-#include <linux/isdn.h>
-#include <linux/slab.h>
-#include "isdn_audio.h"
-#include "isdn_common.h"
-
-char *isdn_audio_revision = "$Revision: 1.1.2.2 $";
-
-/*
- * Misc. lookup-tables.
- */
-
-/* ulaw -> signed 16-bit */
-static short isdn_audio_ulaw_to_s16[] =
-{
-	0x8284, 0x8684, 0x8a84, 0x8e84, 0x9284, 0x9684, 0x9a84, 0x9e84,
-	0xa284, 0xa684, 0xaa84, 0xae84, 0xb284, 0xb684, 0xba84, 0xbe84,
-	0xc184, 0xc384, 0xc584, 0xc784, 0xc984, 0xcb84, 0xcd84, 0xcf84,
-	0xd184, 0xd384, 0xd584, 0xd784, 0xd984, 0xdb84, 0xdd84, 0xdf84,
-	0xe104, 0xe204, 0xe304, 0xe404, 0xe504, 0xe604, 0xe704, 0xe804,
-	0xe904, 0xea04, 0xeb04, 0xec04, 0xed04, 0xee04, 0xef04, 0xf004,
-	0xf0c4, 0xf144, 0xf1c4, 0xf244, 0xf2c4, 0xf344, 0xf3c4, 0xf444,
-	0xf4c4, 0xf544, 0xf5c4, 0xf644, 0xf6c4, 0xf744, 0xf7c4, 0xf844,
-	0xf8a4, 0xf8e4, 0xf924, 0xf964, 0xf9a4, 0xf9e4, 0xfa24, 0xfa64,
-	0xfaa4, 0xfae4, 0xfb24, 0xfb64, 0xfba4, 0xfbe4, 0xfc24, 0xfc64,
-	0xfc94, 0xfcb4, 0xfcd4, 0xfcf4, 0xfd14, 0xfd34, 0xfd54, 0xfd74,
-	0xfd94, 0xfdb4, 0xfdd4, 0xfdf4, 0xfe14, 0xfe34, 0xfe54, 0xfe74,
-	0xfe8c, 0xfe9c, 0xfeac, 0xfebc, 0xfecc, 0xfedc, 0xfeec, 0xfefc,
-	0xff0c, 0xff1c, 0xff2c, 0xff3c, 0xff4c, 0xff5c, 0xff6c, 0xff7c,
-	0xff88, 0xff90, 0xff98, 0xffa0, 0xffa8, 0xffb0, 0xffb8, 0xffc0,
-	0xffc8, 0xffd0, 0xffd8, 0xffe0, 0xffe8, 0xfff0, 0xfff8, 0x0000,
-	0x7d7c, 0x797c, 0x757c, 0x717c, 0x6d7c, 0x697c, 0x657c, 0x617c,
-	0x5d7c, 0x597c, 0x557c, 0x517c, 0x4d7c, 0x497c, 0x457c, 0x417c,
-	0x3e7c, 0x3c7c, 0x3a7c, 0x387c, 0x367c, 0x347c, 0x327c, 0x307c,
-	0x2e7c, 0x2c7c, 0x2a7c, 0x287c, 0x267c, 0x247c, 0x227c, 0x207c,
-	0x1efc, 0x1dfc, 0x1cfc, 0x1bfc, 0x1afc, 0x19fc, 0x18fc, 0x17fc,
-	0x16fc, 0x15fc, 0x14fc, 0x13fc, 0x12fc, 0x11fc, 0x10fc, 0x0ffc,
-	0x0f3c, 0x0ebc, 0x0e3c, 0x0dbc, 0x0d3c, 0x0cbc, 0x0c3c, 0x0bbc,
-	0x0b3c, 0x0abc, 0x0a3c, 0x09bc, 0x093c, 0x08bc, 0x083c, 0x07bc,
-	0x075c, 0x071c, 0x06dc, 0x069c, 0x065c, 0x061c, 0x05dc, 0x059c,
-	0x055c, 0x051c, 0x04dc, 0x049c, 0x045c, 0x041c, 0x03dc, 0x039c,
-	0x036c, 0x034c, 0x032c, 0x030c, 0x02ec, 0x02cc, 0x02ac, 0x028c,
-	0x026c, 0x024c, 0x022c, 0x020c, 0x01ec, 0x01cc, 0x01ac, 0x018c,
-	0x0174, 0x0164, 0x0154, 0x0144, 0x0134, 0x0124, 0x0114, 0x0104,
-	0x00f4, 0x00e4, 0x00d4, 0x00c4, 0x00b4, 0x00a4, 0x0094, 0x0084,
-	0x0078, 0x0070, 0x0068, 0x0060, 0x0058, 0x0050, 0x0048, 0x0040,
-	0x0038, 0x0030, 0x0028, 0x0020, 0x0018, 0x0010, 0x0008, 0x0000
-};
-
-/* alaw -> signed 16-bit */
-static short isdn_audio_alaw_to_s16[] =
-{
-	0x13fc, 0xec04, 0x0144, 0xfebc, 0x517c, 0xae84, 0x051c, 0xfae4,
-	0x0a3c, 0xf5c4, 0x0048, 0xffb8, 0x287c, 0xd784, 0x028c, 0xfd74,
-	0x1bfc, 0xe404, 0x01cc, 0xfe34, 0x717c, 0x8e84, 0x071c, 0xf8e4,
-	0x0e3c, 0xf1c4, 0x00c4, 0xff3c, 0x387c, 0xc784, 0x039c, 0xfc64,
-	0x0ffc, 0xf004, 0x0104, 0xfefc, 0x417c, 0xbe84, 0x041c, 0xfbe4,
-	0x083c, 0xf7c4, 0x0008, 0xfff8, 0x207c, 0xdf84, 0x020c, 0xfdf4,
-	0x17fc, 0xe804, 0x018c, 0xfe74, 0x617c, 0x9e84, 0x061c, 0xf9e4,
-	0x0c3c, 0xf3c4, 0x0084, 0xff7c, 0x307c, 0xcf84, 0x030c, 0xfcf4,
-	0x15fc, 0xea04, 0x0164, 0xfe9c, 0x597c, 0xa684, 0x059c, 0xfa64,
-	0x0b3c, 0xf4c4, 0x0068, 0xff98, 0x2c7c, 0xd384, 0x02cc, 0xfd34,
-	0x1dfc, 0xe204, 0x01ec, 0xfe14, 0x797c, 0x8684, 0x07bc, 0xf844,
-	0x0f3c, 0xf0c4, 0x00e4, 0xff1c, 0x3c7c, 0xc384, 0x03dc, 0xfc24,
-	0x11fc, 0xee04, 0x0124, 0xfedc, 0x497c, 0xb684, 0x049c, 0xfb64,
-	0x093c, 0xf6c4, 0x0028, 0xffd8, 0x247c, 0xdb84, 0x024c, 0xfdb4,
-	0x19fc, 0xe604, 0x01ac, 0xfe54, 0x697c, 0x9684, 0x069c, 0xf964,
-	0x0d3c, 0xf2c4, 0x00a4, 0xff5c, 0x347c, 0xcb84, 0x034c, 0xfcb4,
-	0x12fc, 0xed04, 0x0134, 0xfecc, 0x4d7c, 0xb284, 0x04dc, 0xfb24,
-	0x09bc, 0xf644, 0x0038, 0xffc8, 0x267c, 0xd984, 0x026c, 0xfd94,
-	0x1afc, 0xe504, 0x01ac, 0xfe54, 0x6d7c, 0x9284, 0x06dc, 0xf924,
-	0x0dbc, 0xf244, 0x00b4, 0xff4c, 0x367c, 0xc984, 0x036c, 0xfc94,
-	0x0f3c, 0xf0c4, 0x00f4, 0xff0c, 0x3e7c, 0xc184, 0x03dc, 0xfc24,
-	0x07bc, 0xf844, 0x0008, 0xfff8, 0x1efc, 0xe104, 0x01ec, 0xfe14,
-	0x16fc, 0xe904, 0x0174, 0xfe8c, 0x5d7c, 0xa284, 0x05dc, 0xfa24,
-	0x0bbc, 0xf444, 0x0078, 0xff88, 0x2e7c, 0xd184, 0x02ec, 0xfd14,
-	0x14fc, 0xeb04, 0x0154, 0xfeac, 0x557c, 0xaa84, 0x055c, 0xfaa4,
-	0x0abc, 0xf544, 0x0058, 0xffa8, 0x2a7c, 0xd584, 0x02ac, 0xfd54,
-	0x1cfc, 0xe304, 0x01cc, 0xfe34, 0x757c, 0x8a84, 0x075c, 0xf8a4,
-	0x0ebc, 0xf144, 0x00d4, 0xff2c, 0x3a7c, 0xc584, 0x039c, 0xfc64,
-	0x10fc, 0xef04, 0x0114, 0xfeec, 0x457c, 0xba84, 0x045c, 0xfba4,
-	0x08bc, 0xf744, 0x0018, 0xffe8, 0x227c, 0xdd84, 0x022c, 0xfdd4,
-	0x18fc, 0xe704, 0x018c, 0xfe74, 0x657c, 0x9a84, 0x065c, 0xf9a4,
-	0x0cbc, 0xf344, 0x0094, 0xff6c, 0x327c, 0xcd84, 0x032c, 0xfcd4
-};
-
-/* alaw -> ulaw */
-static char isdn_audio_alaw_to_ulaw[] =
-{
-	0xab, 0x2b, 0xe3, 0x63, 0x8b, 0x0b, 0xc9, 0x49,
-	0xba, 0x3a, 0xf6, 0x76, 0x9b, 0x1b, 0xd7, 0x57,
-	0xa3, 0x23, 0xdd, 0x5d, 0x83, 0x03, 0xc1, 0x41,
-	0xb2, 0x32, 0xeb, 0x6b, 0x93, 0x13, 0xcf, 0x4f,
-	0xaf, 0x2f, 0xe7, 0x67, 0x8f, 0x0f, 0xcd, 0x4d,
-	0xbe, 0x3e, 0xfe, 0x7e, 0x9f, 0x1f, 0xdb, 0x5b,
-	0xa7, 0x27, 0xdf, 0x5f, 0x87, 0x07, 0xc5, 0x45,
-	0xb6, 0x36, 0xef, 0x6f, 0x97, 0x17, 0xd3, 0x53,
-	0xa9, 0x29, 0xe1, 0x61, 0x89, 0x09, 0xc7, 0x47,
-	0xb8, 0x38, 0xf2, 0x72, 0x99, 0x19, 0xd5, 0x55,
-	0xa1, 0x21, 0xdc, 0x5c, 0x81, 0x01, 0xbf, 0x3f,
-	0xb0, 0x30, 0xe9, 0x69, 0x91, 0x11, 0xce, 0x4e,
-	0xad, 0x2d, 0xe5, 0x65, 0x8d, 0x0d, 0xcb, 0x4b,
-	0xbc, 0x3c, 0xfa, 0x7a, 0x9d, 0x1d, 0xd9, 0x59,
-	0xa5, 0x25, 0xde, 0x5e, 0x85, 0x05, 0xc3, 0x43,
-	0xb4, 0x34, 0xed, 0x6d, 0x95, 0x15, 0xd1, 0x51,
-	0xac, 0x2c, 0xe4, 0x64, 0x8c, 0x0c, 0xca, 0x4a,
-	0xbb, 0x3b, 0xf8, 0x78, 0x9c, 0x1c, 0xd8, 0x58,
-	0xa4, 0x24, 0xde, 0x5e, 0x84, 0x04, 0xc2, 0x42,
-	0xb3, 0x33, 0xec, 0x6c, 0x94, 0x14, 0xd0, 0x50,
-	0xb0, 0x30, 0xe8, 0x68, 0x90, 0x10, 0xce, 0x4e,
-	0xbf, 0x3f, 0xfe, 0x7e, 0xa0, 0x20, 0xdc, 0x5c,
-	0xa8, 0x28, 0xe0, 0x60, 0x88, 0x08, 0xc6, 0x46,
-	0xb7, 0x37, 0xf0, 0x70, 0x98, 0x18, 0xd4, 0x54,
-	0xaa, 0x2a, 0xe2, 0x62, 0x8a, 0x0a, 0xc8, 0x48,
-	0xb9, 0x39, 0xf4, 0x74, 0x9a, 0x1a, 0xd6, 0x56,
-	0xa2, 0x22, 0xdd, 0x5d, 0x82, 0x02, 0xc0, 0x40,
-	0xb1, 0x31, 0xea, 0x6a, 0x92, 0x12, 0xcf, 0x4f,
-	0xae, 0x2e, 0xe6, 0x66, 0x8e, 0x0e, 0xcc, 0x4c,
-	0xbd, 0x3d, 0xfc, 0x7c, 0x9e, 0x1e, 0xda, 0x5a,
-	0xa6, 0x26, 0xdf, 0x5f, 0x86, 0x06, 0xc4, 0x44,
-	0xb5, 0x35, 0xee, 0x6e, 0x96, 0x16, 0xd2, 0x52
-};
-
-/* ulaw -> alaw */
-static char isdn_audio_ulaw_to_alaw[] =
-{
-	0xab, 0x55, 0xd5, 0x15, 0x95, 0x75, 0xf5, 0x35,
-	0xb5, 0x45, 0xc5, 0x05, 0x85, 0x65, 0xe5, 0x25,
-	0xa5, 0x5d, 0xdd, 0x1d, 0x9d, 0x7d, 0xfd, 0x3d,
-	0xbd, 0x4d, 0xcd, 0x0d, 0x8d, 0x6d, 0xed, 0x2d,
-	0xad, 0x51, 0xd1, 0x11, 0x91, 0x71, 0xf1, 0x31,
-	0xb1, 0x41, 0xc1, 0x01, 0x81, 0x61, 0xe1, 0x21,
-	0x59, 0xd9, 0x19, 0x99, 0x79, 0xf9, 0x39, 0xb9,
-	0x49, 0xc9, 0x09, 0x89, 0x69, 0xe9, 0x29, 0xa9,
-	0xd7, 0x17, 0x97, 0x77, 0xf7, 0x37, 0xb7, 0x47,
-	0xc7, 0x07, 0x87, 0x67, 0xe7, 0x27, 0xa7, 0xdf,
-	0x9f, 0x7f, 0xff, 0x3f, 0xbf, 0x4f, 0xcf, 0x0f,
-	0x8f, 0x6f, 0xef, 0x2f, 0x53, 0x13, 0x73, 0x33,
-	0xb3, 0x43, 0xc3, 0x03, 0x83, 0x63, 0xe3, 0x23,
-	0xa3, 0x5b, 0xdb, 0x1b, 0x9b, 0x7b, 0xfb, 0x3b,
-	0xbb, 0xbb, 0x4b, 0x4b, 0xcb, 0xcb, 0x0b, 0x0b,
-	0x8b, 0x8b, 0x6b, 0x6b, 0xeb, 0xeb, 0x2b, 0x2b,
-	0xab, 0x54, 0xd4, 0x14, 0x94, 0x74, 0xf4, 0x34,
-	0xb4, 0x44, 0xc4, 0x04, 0x84, 0x64, 0xe4, 0x24,
-	0xa4, 0x5c, 0xdc, 0x1c, 0x9c, 0x7c, 0xfc, 0x3c,
-	0xbc, 0x4c, 0xcc, 0x0c, 0x8c, 0x6c, 0xec, 0x2c,
-	0xac, 0x50, 0xd0, 0x10, 0x90, 0x70, 0xf0, 0x30,
-	0xb0, 0x40, 0xc0, 0x00, 0x80, 0x60, 0xe0, 0x20,
-	0x58, 0xd8, 0x18, 0x98, 0x78, 0xf8, 0x38, 0xb8,
-	0x48, 0xc8, 0x08, 0x88, 0x68, 0xe8, 0x28, 0xa8,
-	0xd6, 0x16, 0x96, 0x76, 0xf6, 0x36, 0xb6, 0x46,
-	0xc6, 0x06, 0x86, 0x66, 0xe6, 0x26, 0xa6, 0xde,
-	0x9e, 0x7e, 0xfe, 0x3e, 0xbe, 0x4e, 0xce, 0x0e,
-	0x8e, 0x6e, 0xee, 0x2e, 0x52, 0x12, 0x72, 0x32,
-	0xb2, 0x42, 0xc2, 0x02, 0x82, 0x62, 0xe2, 0x22,
-	0xa2, 0x5a, 0xda, 0x1a, 0x9a, 0x7a, 0xfa, 0x3a,
-	0xba, 0xba, 0x4a, 0x4a, 0xca, 0xca, 0x0a, 0x0a,
-	0x8a, 0x8a, 0x6a, 0x6a, 0xea, 0xea, 0x2a, 0x2a
-};
-
-#define NCOEFF            8     /* number of frequencies to be analyzed       */
-#define DTMF_TRESH     4000     /* above this is dtmf                         */
-#define SILENCE_TRESH   200     /* below this is silence                      */
-#define AMP_BITS          9     /* bits per sample, reduced to avoid overflow */
-#define LOGRP             0
-#define HIGRP             1
-
-/* For DTMF recognition:
- * 2 * cos(2 * PI * k / N) precalculated for all k
- */
-static int cos2pik[NCOEFF] =
-{
-	55813, 53604, 51193, 48591, 38114, 33057, 25889, 18332
-};
-
-static char dtmf_matrix[4][4] =
-{
-	{'1', '2', '3', 'A'},
-	{'4', '5', '6', 'B'},
-	{'7', '8', '9', 'C'},
-	{'*', '0', '#', 'D'}
-};
-
-static inline void
-isdn_audio_tlookup(const u_char *table, u_char *buff, unsigned long n)
-{
-#ifdef __i386__
-	unsigned long d0, d1, d2, d3;
-	__asm__ __volatile__(
-		"cld\n"
-		"1:\tlodsb\n\t"
-		"xlatb\n\t"
-		"stosb\n\t"
-		"loop 1b\n\t"
-		:	"=&b"(d0), "=&c"(d1), "=&D"(d2), "=&S"(d3)
-		:	"0"((long) table), "1"(n), "2"((long) buff), "3"((long) buff)
-		:	"memory", "ax");
-#else
-	while (n--)
-		*buff = table[*(unsigned char *)buff], buff++;
-#endif
-}
-
-void
-isdn_audio_ulaw2alaw(unsigned char *buff, unsigned long len)
-{
-	isdn_audio_tlookup(isdn_audio_ulaw_to_alaw, buff, len);
-}
-
-void
-isdn_audio_alaw2ulaw(unsigned char *buff, unsigned long len)
-{
-	isdn_audio_tlookup(isdn_audio_alaw_to_ulaw, buff, len);
-}
-
-/*
- * linear <-> adpcm conversion stuff
- * Most parts from the mgetty-package.
- * (C) by Gert Doering and Klaus Weidner
- * Used by permission of Gert Doering
- */
-
-
-#define ZEROTRAP                /* turn on the trap as per the MIL-STD */
-#undef ZEROTRAP
-#define BIAS 0x84               /* define the add-in bias for 16 bit samples */
-#define CLIP 32635
-
-static unsigned char
-isdn_audio_linear2ulaw(int sample)
-{
-	static int exp_lut[256] =
-		{
-			0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3,
-			4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
-			5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
-			5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
-			6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
-			6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
-			6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
-			6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
-			7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-			7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-			7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-			7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-			7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-			7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-			7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-			7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7
-		};
-	int sign,
-		exponent,
-		mantissa;
-	unsigned char ulawbyte;
-
-	/* Get the sample into sign-magnitude. */
-	sign = (sample >> 8) & 0x80;	/* set aside the sign  */
-	if (sign != 0)
-		sample = -sample;	/* get magnitude       */
-	if (sample > CLIP)
-		sample = CLIP;  /* clip the magnitude  */
-
-	/* Convert from 16 bit linear to ulaw. */
-	sample = sample + BIAS;
-	exponent = exp_lut[(sample >> 7) & 0xFF];
-	mantissa = (sample >> (exponent + 3)) & 0x0F;
-	ulawbyte = ~(sign | (exponent << 4) | mantissa);
-#ifdef ZEROTRAP
-	/* optional CCITT trap */
-	if (ulawbyte == 0)
-		ulawbyte = 0x02;
-#endif
-	return (ulawbyte);
-}
-
-
-static int Mx[3][8] =
-{
-	{0x3800, 0x5600, 0, 0, 0, 0, 0, 0},
-	{0x399a, 0x3a9f, 0x4d14, 0x6607, 0, 0, 0, 0},
-	{0x3556, 0x3556, 0x399A, 0x3A9F, 0x4200, 0x4D14, 0x6607, 0x6607},
-};
-
-static int bitmask[9] =
-{
-	0, 0x01, 0x03, 0x07, 0x0f, 0x1f, 0x3f, 0x7f, 0xff
-};
-
-static int
-isdn_audio_get_bits(adpcm_state *s, unsigned char **in, int *len)
-{
-	while (s->nleft < s->nbits) {
-		int d = *((*in)++);
-		(*len)--;
-		s->word = (s->word << 8) | d;
-		s->nleft += 8;
-	}
-	s->nleft -= s->nbits;
-	return (s->word >> s->nleft) & bitmask[s->nbits];
-}
-
-static void
-isdn_audio_put_bits(int data, int nbits, adpcm_state *s,
-		    unsigned char **out, int *len)
-{
-	s->word = (s->word << nbits) | (data & bitmask[nbits]);
-	s->nleft += nbits;
-	while (s->nleft >= 8) {
-		int d = (s->word >> (s->nleft - 8));
-		*(out[0]++) = d & 255;
-		(*len)++;
-		s->nleft -= 8;
-	}
-}
-
-adpcm_state *
-isdn_audio_adpcm_init(adpcm_state *s, int nbits)
-{
-	if (!s)
-		s = kmalloc(sizeof(adpcm_state), GFP_ATOMIC);
-	if (s) {
-		s->a = 0;
-		s->d = 5;
-		s->word = 0;
-		s->nleft = 0;
-		s->nbits = nbits;
-	}
-	return s;
-}
-
-dtmf_state *
-isdn_audio_dtmf_init(dtmf_state *s)
-{
-	if (!s)
-		s = kmalloc(sizeof(dtmf_state), GFP_ATOMIC);
-	if (s) {
-		s->idx = 0;
-		s->last = ' ';
-	}
-	return s;
-}
-
-/*
- * Decompression of adpcm data to a/u-law
- *
- */
-
-int
-isdn_audio_adpcm2xlaw(adpcm_state *s, int fmt, unsigned char *in,
-		      unsigned char *out, int len)
-{
-	int a = s->a;
-	int d = s->d;
-	int nbits = s->nbits;
-	int olen = 0;
-
-	while (len) {
-		int e = isdn_audio_get_bits(s, &in, &len);
-		int sign;
-
-		if (nbits == 4 && e == 0)
-			d = 4;
-		sign = (e >> (nbits - 1)) ? -1 : 1;
-		e &= bitmask[nbits - 1];
-		a += sign * ((e << 1) + 1) * d >> 1;
-		if (d & 1)
-			a++;
-		if (fmt)
-			*out++ = isdn_audio_ulaw_to_alaw[
-				isdn_audio_linear2ulaw(a << 2)];
-		else
-			*out++ = isdn_audio_linear2ulaw(a << 2);
-		olen++;
-		d = (d * Mx[nbits - 2][e] + 0x2000) >> 14;
-		if (d < 5)
-			d = 5;
-	}
-	s->a = a;
-	s->d = d;
-	return olen;
-}
-
-int
-isdn_audio_xlaw2adpcm(adpcm_state *s, int fmt, unsigned char *in,
-		      unsigned char *out, int len)
-{
-	int a = s->a;
-	int d = s->d;
-	int nbits = s->nbits;
-	int olen = 0;
-
-	while (len--) {
-		int e = 0,
-			nmax = 1 << (nbits - 1);
-		int sign,
-			delta;
-
-		if (fmt)
-			delta = (isdn_audio_alaw_to_s16[*in++] >> 2) - a;
-		else
-			delta = (isdn_audio_ulaw_to_s16[*in++] >> 2) - a;
-		if (delta < 0) {
-			e = nmax;
-			delta = -delta;
-		}
-		while (--nmax && delta > d) {
-			delta -= d;
-			e++;
-		}
-		if (nbits == 4 && ((e & 0x0f) == 0))
-			e = 8;
-		isdn_audio_put_bits(e, nbits, s, &out, &olen);
-		sign = (e >> (nbits - 1)) ? -1 : 1;
-		e &= bitmask[nbits - 1];
-
-		a += sign * ((e << 1) + 1) * d >> 1;
-		if (d & 1)
-			a++;
-		d = (d * Mx[nbits - 2][e] + 0x2000) >> 14;
-		if (d < 5)
-			d = 5;
-	}
-	s->a = a;
-	s->d = d;
-	return olen;
-}
-
-/*
- * Goertzel algorithm.
- * See http://ptolemy.eecs.berkeley.edu/papers/96/dtmf_ict/
- * for more info.
- * Result is stored into an sk_buff and queued up for later
- * evaluation.
- */
-static void
-isdn_audio_goertzel(int *sample, modem_info *info)
-{
-	int sk,
-		sk1,
-		sk2;
-	int k,
-		n;
-	struct sk_buff *skb;
-	int *result;
-
-	skb = dev_alloc_skb(sizeof(int) * NCOEFF);
-	if (!skb) {
-		printk(KERN_WARNING
-		       "isdn_audio: Could not alloc DTMF result for ttyI%d\n",
-		       info->line);
-		return;
-	}
-	result = skb_put(skb, sizeof(int) * NCOEFF);
-	for (k = 0; k < NCOEFF; k++) {
-		sk = sk1 = sk2 = 0;
-		for (n = 0; n < DTMF_NPOINTS; n++) {
-			sk = sample[n] + ((cos2pik[k] * sk1) >> 15) - sk2;
-			sk2 = sk1;
-			sk1 = sk;
-		}
-		/* Avoid overflows */
-		sk >>= 1;
-		sk2 >>= 1;
-		/* compute |X(k)|**2 */
-		/* report overflows. This should not happen. */
-		/* Comment this out if desired */
-		if (sk < -32768 || sk > 32767)
-			printk(KERN_DEBUG
-			       "isdn_audio: dtmf goertzel overflow, sk=%d\n", sk);
-		if (sk2 < -32768 || sk2 > 32767)
-			printk(KERN_DEBUG
-			       "isdn_audio: dtmf goertzel overflow, sk2=%d\n", sk2);
-		result[k] =
-			((sk * sk) >> AMP_BITS) -
-			((((cos2pik[k] * sk) >> 15) * sk2) >> AMP_BITS) +
-			((sk2 * sk2) >> AMP_BITS);
-	}
-	skb_queue_tail(&info->dtmf_queue, skb);
-	isdn_timer_ctrl(ISDN_TIMER_MODEMREAD, 1);
-}
-
-void
-isdn_audio_eval_dtmf(modem_info *info)
-{
-	struct sk_buff *skb;
-	int *result;
-	dtmf_state *s;
-	int silence;
-	int i;
-	int di;
-	int ch;
-	int grp[2];
-	char what;
-	char *p;
-	int thresh;
-
-	while ((skb = skb_dequeue(&info->dtmf_queue))) {
-		result = (int *) skb->data;
-		s = info->dtmf_state;
-		grp[LOGRP] = grp[HIGRP] = -1;
-		silence = 0;
-		thresh = 0;
-		for (i = 0; i < NCOEFF; i++) {
-			if (result[i] > DTMF_TRESH) {
-				if (result[i] > thresh)
-					thresh = result[i];
-			}
-			else if (result[i] < SILENCE_TRESH)
-				silence++;
-		}
-		if (silence == NCOEFF)
-			what = ' ';
-		else {
-			if (thresh > 0)	{
-				thresh = thresh >> 4;  /* touchtones must match within 12 dB */
-				for (i = 0; i < NCOEFF; i++) {
-					if (result[i] < thresh)
-						continue;  /* ignore */
-					/* good level found. This is allowed only one time per group */
-					if (i < NCOEFF / 2) {
-						/* lowgroup*/
-						if (grp[LOGRP] >= 0) {
-							// Bad. Another tone found. */
-							grp[LOGRP] = -1;
-							break;
-						}
-						else
-							grp[LOGRP] = i;
-					}
-					else { /* higroup */
-						if (grp[HIGRP] >= 0) { // Bad. Another tone found. */
-							grp[HIGRP] = -1;
-							break;
-						}
-						else
-							grp[HIGRP] = i - NCOEFF/2;
-					}
-				}
-				if ((grp[LOGRP] >= 0) && (grp[HIGRP] >= 0)) {
-					what = dtmf_matrix[grp[LOGRP]][grp[HIGRP]];
-					if (s->last != ' ' && s->last != '.')
-						s->last = what;	/* min. 1 non-DTMF between DTMF */
-				} else
-					what = '.';
-			}
-			else
-				what = '.';
-		}
-		if ((what != s->last) && (what != ' ') && (what != '.')) {
-			printk(KERN_DEBUG "dtmf: tt='%c'\n", what);
-			p = skb->data;
-			*p++ = 0x10;
-			*p = what;
-			skb_trim(skb, 2);
-			ISDN_AUDIO_SKB_DLECOUNT(skb) = 0;
-			ISDN_AUDIO_SKB_LOCK(skb) = 0;
-			di = info->isdn_driver;
-			ch = info->isdn_channel;
-			__skb_queue_tail(&dev->drv[di]->rpqueue[ch], skb);
-			dev->drv[di]->rcvcount[ch] += 2;
-			/* Schedule dequeuing */
-			if ((dev->modempoll) && (info->rcvsched))
-				isdn_timer_ctrl(ISDN_TIMER_MODEMREAD, 1);
-			wake_up_interruptible(&dev->drv[di]->rcv_waitq[ch]);
-		} else
-			kfree_skb(skb);
-		s->last = what;
-	}
-}
-
-/*
- * Decode DTMF tones, queue result in separate sk_buf for
- * later examination.
- * Parameters:
- *   s    = pointer to state-struct.
- *   buf  = input audio data
- *   len  = size of audio data.
- *   fmt  = audio data format (0 = ulaw, 1 = alaw)
- */
-void
-isdn_audio_calc_dtmf(modem_info *info, unsigned char *buf, int len, int fmt)
-{
-	dtmf_state *s = info->dtmf_state;
-	int i;
-	int c;
-
-	while (len) {
-		c = DTMF_NPOINTS - s->idx;
-		if (c > len)
-			c = len;
-		if (c <= 0)
-			break;
-		for (i = 0; i < c; i++) {
-			if (fmt)
-				s->buf[s->idx++] =
-					isdn_audio_alaw_to_s16[*buf++] >> (15 - AMP_BITS);
-			else
-				s->buf[s->idx++] =
-					isdn_audio_ulaw_to_s16[*buf++] >> (15 - AMP_BITS);
-		}
-		if (s->idx == DTMF_NPOINTS) {
-			isdn_audio_goertzel(s->buf, info);
-			s->idx = 0;
-		}
-		len -= c;
-	}
-}
-
-silence_state *
-isdn_audio_silence_init(silence_state *s)
-{
-	if (!s)
-		s = kmalloc(sizeof(silence_state), GFP_ATOMIC);
-	if (s) {
-		s->idx = 0;
-		s->state = 0;
-	}
-	return s;
-}
-
-void
-isdn_audio_calc_silence(modem_info *info, unsigned char *buf, int len, int fmt)
-{
-	silence_state *s = info->silence_state;
-	int i;
-	signed char c;
-
-	if (!info->emu.vpar[1]) return;
-
-	for (i = 0; i < len; i++) {
-		if (fmt)
-			c = isdn_audio_alaw_to_ulaw[*buf++];
-		else
-			c = *buf++;
-
-		if (c > 0) c -= 128;
-		c = abs(c);
-
-		if (c > (info->emu.vpar[1] * 4)) {
-			s->idx = 0;
-			s->state = 1;
-		} else {
-			if (s->idx < 210000) s->idx++;
-		}
-	}
-}
-
-void
-isdn_audio_put_dle_code(modem_info *info, u_char code)
-{
-	struct sk_buff *skb;
-	int di;
-	int ch;
-	char *p;
-
-	skb = dev_alloc_skb(2);
-	if (!skb) {
-		printk(KERN_WARNING
-		       "isdn_audio: Could not alloc skb for ttyI%d\n",
-		       info->line);
-		return;
-	}
-	p = skb_put(skb, 2);
-	p[0] = 0x10;
-	p[1] = code;
-	ISDN_AUDIO_SKB_DLECOUNT(skb) = 0;
-	ISDN_AUDIO_SKB_LOCK(skb) = 0;
-	di = info->isdn_driver;
-	ch = info->isdn_channel;
-	__skb_queue_tail(&dev->drv[di]->rpqueue[ch], skb);
-	dev->drv[di]->rcvcount[ch] += 2;
-	/* Schedule dequeuing */
-	if ((dev->modempoll) && (info->rcvsched))
-		isdn_timer_ctrl(ISDN_TIMER_MODEMREAD, 1);
-	wake_up_interruptible(&dev->drv[di]->rcv_waitq[ch]);
-}
-
-void
-isdn_audio_eval_silence(modem_info *info)
-{
-	silence_state *s = info->silence_state;
-	char what;
-
-	what = ' ';
-
-	if (s->idx > (info->emu.vpar[2] * 800)) {
-		s->idx = 0;
-		if (!s->state) {	/* silence from beginning of rec */
-			what = 's';
-		} else {
-			what = 'q';
-		}
-	}
-	if ((what == 's') || (what == 'q')) {
-		printk(KERN_DEBUG "ttyI%d: %s\n", info->line,
-		       (what == 's') ? "silence" : "quiet");
-		isdn_audio_put_dle_code(info, what);
-	}
-}
diff --git a/drivers/isdn/i4l/isdn_audio.h b/drivers/isdn/i4l/isdn_audio.h
deleted file mode 100644
index 013c3582e0d1..000000000000
--- a/drivers/isdn/i4l/isdn_audio.h
+++ /dev/null
@@ -1,44 +0,0 @@
-/* $Id: isdn_audio.h,v 1.1.2.2 2004/01/12 22:37:18 keil Exp $
- *
- * Linux ISDN subsystem, audio conversion and compression (linklevel).
- *
- * Copyright 1994-1999 by Fritz Elfert (fritz@isdn4linux.de)
- *
- * This software may be used and distributed according to the terms
- * of the GNU General Public License, incorporated herein by reference.
- *
- */
-
-#define DTMF_NPOINTS 205        /* Number of samples for DTMF recognition */
-typedef struct adpcm_state {
-	int a;
-	int d;
-	int word;
-	int nleft;
-	int nbits;
-} adpcm_state;
-
-typedef struct dtmf_state {
-	char last;
-	char llast;
-	int idx;
-	int buf[DTMF_NPOINTS];
-} dtmf_state;
-
-typedef struct silence_state {
-	int state;
-	unsigned int idx;
-} silence_state;
-
-extern void isdn_audio_ulaw2alaw(unsigned char *, unsigned long);
-extern void isdn_audio_alaw2ulaw(unsigned char *, unsigned long);
-extern adpcm_state *isdn_audio_adpcm_init(adpcm_state *, int);
-extern int isdn_audio_adpcm2xlaw(adpcm_state *, int, unsigned char *, unsigned char *, int);
-extern int isdn_audio_xlaw2adpcm(adpcm_state *, int, unsigned char *, unsigned char *, int);
-extern void isdn_audio_calc_dtmf(modem_info *, unsigned char *, int, int);
-extern void isdn_audio_eval_dtmf(modem_info *);
-dtmf_state *isdn_audio_dtmf_init(dtmf_state *);
-extern void isdn_audio_calc_silence(modem_info *, unsigned char *, int, int);
-extern void isdn_audio_eval_silence(modem_info *);
-silence_state *isdn_audio_silence_init(silence_state *);
-extern void isdn_audio_put_dle_code(modem_info *, u_char);
diff --git a/drivers/isdn/i4l/isdn_bsdcomp.c b/drivers/isdn/i4l/isdn_bsdcomp.c
deleted file mode 100644
index 7f28b967ed19..000000000000
--- a/drivers/isdn/i4l/isdn_bsdcomp.c
+++ /dev/null
@@ -1,930 +0,0 @@
-/*
- * BSD compression module
- *
- * Patched version for ISDN syncPPP written 1997/1998 by Michael Hipp
- * The whole module is now SKB based.
- *
- */
-
-/*
- * Update: The Berkeley copyright was changed, and the change
- * is retroactive to all "true" BSD software (ie everything
- * from UCB as opposed to other peoples code that just carried
- * the same license). The new copyright doesn't clash with the
- * GPL, so the module-only restriction has been removed..
- */
-
-/*
- * Original copyright notice:
- *
- * Copyright (c) 1985, 1986 The Regents of the University of California.
- * All rights reserved.
- *
- * This code is derived from software contributed to Berkeley by
- * James A. Woods, derived from original work by Spencer Thomas
- * and Joseph Orost.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in the
- *    documentation and/or other materials provided with the distribution.
- * 3. All advertising materials mentioning features or use of this software
- *    must display the following acknowledgement:
- *	This product includes software developed by the University of
- *	California, Berkeley and its contributors.
- * 4. Neither the name of the University nor the names of its contributors
- *    may be used to endorse or promote products derived from this software
- *    without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- */
-
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/kernel.h>
-#include <linux/types.h>
-#include <linux/fcntl.h>
-#include <linux/interrupt.h>
-#include <linux/ptrace.h>
-#include <linux/ioport.h>
-#include <linux/in.h>
-#include <linux/slab.h>
-#include <linux/tty.h>
-#include <linux/errno.h>
-#include <linux/string.h>	/* used in new tty drivers */
-#include <linux/signal.h>	/* used in new tty drivers */
-#include <linux/bitops.h>
-
-#include <asm/byteorder.h>
-#include <asm/types.h>
-
-#include <linux/if.h>
-
-#include <linux/if_ether.h>
-#include <linux/netdevice.h>
-#include <linux/skbuff.h>
-#include <linux/inet.h>
-#include <linux/ioctl.h>
-#include <linux/vmalloc.h>
-
-#include <linux/ppp_defs.h>
-
-#include <linux/isdn.h>
-#include <linux/isdn_ppp.h>
-#include <linux/ip.h>
-#include <linux/tcp.h>
-#include <linux/if_arp.h>
-#include <linux/ppp-comp.h>
-
-#include "isdn_ppp.h"
-
-MODULE_DESCRIPTION("ISDN4Linux: BSD Compression for PPP over ISDN");
-MODULE_LICENSE("Dual BSD/GPL");
-
-#define BSD_VERSION(x)	((x) >> 5)
-#define BSD_NBITS(x)	((x) & 0x1F)
-
-#define BSD_CURRENT_VERSION	1
-
-#define DEBUG 1
-
-/*
- * A dictionary for doing BSD compress.
- */
-
-struct bsd_dict {
-	u32 fcode;
-	u16 codem1;		/* output of hash table -1 */
-	u16 cptr;		/* map code to hash table entry */
-};
-
-struct bsd_db {
-	int            totlen;		/* length of this structure */
-	unsigned int   hsize;		/* size of the hash table */
-	unsigned char  hshift;		/* used in hash function */
-	unsigned char  n_bits;		/* current bits/code */
-	unsigned char  maxbits;		/* maximum bits/code */
-	unsigned char  debug;		/* non-zero if debug desired */
-	unsigned char  unit;		/* ppp unit number */
-	u16 seqno;			/* sequence # of next packet */
-	unsigned int   mru;		/* size of receive (decompress) bufr */
-	unsigned int   maxmaxcode;	/* largest valid code */
-	unsigned int   max_ent;		/* largest code in use */
-	unsigned int   in_count;	/* uncompressed bytes, aged */
-	unsigned int   bytes_out;	/* compressed bytes, aged */
-	unsigned int   ratio;		/* recent compression ratio */
-	unsigned int   checkpoint;	/* when to next check the ratio */
-	unsigned int   clear_count;	/* times dictionary cleared */
-	unsigned int   incomp_count;	/* incompressible packets */
-	unsigned int   incomp_bytes;	/* incompressible bytes */
-	unsigned int   uncomp_count;	/* uncompressed packets */
-	unsigned int   uncomp_bytes;	/* uncompressed bytes */
-	unsigned int   comp_count;	/* compressed packets */
-	unsigned int   comp_bytes;	/* compressed bytes */
-	unsigned short  *lens;		/* array of lengths of codes */
-	struct bsd_dict *dict;		/* dictionary */
-	int xmit;
-};
-
-#define BSD_OVHD	2		/* BSD compress overhead/packet */
-#define MIN_BSD_BITS	9
-#define BSD_INIT_BITS	MIN_BSD_BITS
-#define MAX_BSD_BITS	15
-
-/*
- * the next two codes should not be changed lightly, as they must not
- * lie within the contiguous general code space.
- */
-#define CLEAR	256			/* table clear output code */
-#define FIRST	257			/* first free entry */
-#define LAST	255
-
-#define MAXCODE(b)	((1 << (b)) - 1)
-#define BADCODEM1	MAXCODE(MAX_BSD_BITS)
-
-#define BSD_HASH(prefix, suffix, hshift) ((((unsigned long)(suffix)) << (hshift)) \
-					  ^ (unsigned long)(prefix))
-#define BSD_KEY(prefix, suffix)		((((unsigned long)(suffix)) << 16) \
-					 + (unsigned long)(prefix))
-
-#define CHECK_GAP	10000		/* Ratio check interval */
-
-#define RATIO_SCALE_LOG	8
-#define RATIO_SCALE	(1 << RATIO_SCALE_LOG)
-#define RATIO_MAX	(0x7fffffff >> RATIO_SCALE_LOG)
-
-/*
- * clear the dictionary
- */
-
-static void bsd_clear(struct bsd_db *db)
-{
-	db->clear_count++;
-	db->max_ent      = FIRST - 1;
-	db->n_bits       = BSD_INIT_BITS;
-	db->bytes_out    = 0;
-	db->in_count     = 0;
-	db->incomp_count = 0;
-	db->ratio	     = 0;
-	db->checkpoint   = CHECK_GAP;
-}
-
-/*
- * If the dictionary is full, then see if it is time to reset it.
- *
- * Compute the compression ratio using fixed-point arithmetic
- * with 8 fractional bits.
- *
- * Since we have an infinite stream instead of a single file,
- * watch only the local compression ratio.
- *
- * Since both peers must reset the dictionary at the same time even in
- * the absence of CLEAR codes (while packets are incompressible), they
- * must compute the same ratio.
- */
-static int bsd_check(struct bsd_db *db)	/* 1=output CLEAR */
-{
-	unsigned int new_ratio;
-
-	if (db->in_count >= db->checkpoint)
-	{
-		/* age the ratio by limiting the size of the counts */
-		if (db->in_count >= RATIO_MAX || db->bytes_out >= RATIO_MAX)
-		{
-			db->in_count  -= (db->in_count  >> 2);
-			db->bytes_out -= (db->bytes_out >> 2);
-		}
-
-		db->checkpoint = db->in_count + CHECK_GAP;
-
-		if (db->max_ent >= db->maxmaxcode)
-		{
-			/* Reset the dictionary only if the ratio is worse,
-			 * or if it looks as if it has been poisoned
-			 * by incompressible data.
-			 *
-			 * This does not overflow, because
-			 *	db->in_count <= RATIO_MAX.
-			 */
-
-			new_ratio = db->in_count << RATIO_SCALE_LOG;
-			if (db->bytes_out != 0)
-			{
-				new_ratio /= db->bytes_out;
-			}
-
-			if (new_ratio < db->ratio || new_ratio < 1 * RATIO_SCALE)
-			{
-				bsd_clear(db);
-				return 1;
-			}
-			db->ratio = new_ratio;
-		}
-	}
-	return 0;
-}
-
-/*
- * Return statistics.
- */
-
-static void bsd_stats(void *state, struct compstat *stats)
-{
-	struct bsd_db *db = (struct bsd_db *) state;
-
-	stats->unc_bytes    = db->uncomp_bytes;
-	stats->unc_packets  = db->uncomp_count;
-	stats->comp_bytes   = db->comp_bytes;
-	stats->comp_packets = db->comp_count;
-	stats->inc_bytes    = db->incomp_bytes;
-	stats->inc_packets  = db->incomp_count;
-	stats->in_count     = db->in_count;
-	stats->bytes_out    = db->bytes_out;
-}
-
-/*
- * Reset state, as on a CCP ResetReq.
- */
-static void bsd_reset(void *state, unsigned char code, unsigned char id,
-		      unsigned char *data, unsigned len,
-		      struct isdn_ppp_resetparams *rsparm)
-{
-	struct bsd_db *db = (struct bsd_db *) state;
-
-	bsd_clear(db);
-	db->seqno       = 0;
-	db->clear_count = 0;
-}
-
-/*
- * Release the compression structure
- */
-static void bsd_free(void *state)
-{
-	struct bsd_db *db = (struct bsd_db *) state;
-
-	if (db) {
-		/*
-		 * Release the dictionary
-		 */
-		vfree(db->dict);
-		db->dict = NULL;
-
-		/*
-		 * Release the string buffer
-		 */
-		vfree(db->lens);
-		db->lens = NULL;
-
-		/*
-		 * Finally release the structure itself.
-		 */
-		kfree(db);
-	}
-}
-
-
-/*
- * Allocate space for a (de) compressor.
- */
-static void *bsd_alloc(struct isdn_ppp_comp_data *data)
-{
-	int bits;
-	unsigned int hsize, hshift, maxmaxcode;
-	struct bsd_db *db;
-	int decomp;
-
-	static unsigned int htab[][2] = {
-		{ 5003 , 4 } , { 5003 , 4 } , { 5003 , 4 } , { 5003 , 4 } ,
-		{ 9001 , 5 } , { 18013 , 6 } , { 35023 , 7 } , { 69001 , 8 }
-	};
-
-	if (data->optlen != 1 || data->num != CI_BSD_COMPRESS
-	    || BSD_VERSION(data->options[0]) != BSD_CURRENT_VERSION)
-		return NULL;
-
-	bits = BSD_NBITS(data->options[0]);
-
-	if (bits < 9 || bits > 15)
-		return NULL;
-
-	hsize = htab[bits - 9][0];
-	hshift = htab[bits - 9][1];
-
-	/*
-	 * Allocate the main control structure for this instance.
-	 */
-	maxmaxcode = MAXCODE(bits);
-	db = kzalloc(sizeof(struct bsd_db), GFP_KERNEL);
-	if (!db)
-		return NULL;
-
-	db->xmit = data->flags & IPPP_COMP_FLAG_XMIT;
-	decomp = db->xmit ? 0 : 1;
-
-	/*
-	 * Allocate space for the dictionary. This may be more than one page in
-	 * length.
-	 */
-	db->dict = vmalloc(array_size(hsize, sizeof(struct bsd_dict)));
-	if (!db->dict) {
-		bsd_free(db);
-		return NULL;
-	}
-
-	/*
-	 * If this is the compression buffer then there is no length data.
-	 * For decompression, the length information is needed as well.
-	 */
-	if (!decomp)
-		db->lens = NULL;
-	else {
-		db->lens = vmalloc(array_size(sizeof(db->lens[0]),
-					      maxmaxcode + 1));
-		if (!db->lens) {
-			bsd_free(db);
-			return (NULL);
-		}
-	}
-
-	/*
-	 * Initialize the data information for the compression code
-	 */
-	db->totlen = sizeof(struct bsd_db) + (sizeof(struct bsd_dict) * hsize);
-	db->hsize = hsize;
-	db->hshift = hshift;
-	db->maxmaxcode = maxmaxcode;
-	db->maxbits = bits;
-
-	return (void *)db;
-}
-
-/*
- * Initialize the database.
- */
-static int bsd_init(void *state, struct isdn_ppp_comp_data *data, int unit, int debug)
-{
-	struct bsd_db *db = state;
-	int indx;
-	int decomp;
-
-	if (!state || !data) {
-		printk(KERN_ERR "isdn_bsd_init: [%d] ERR, state %lx data %lx\n", unit, (long)state, (long)data);
-		return 0;
-	}
-
-	decomp = db->xmit ? 0 : 1;
-
-	if (data->optlen != 1 || data->num != CI_BSD_COMPRESS
-	    || (BSD_VERSION(data->options[0]) != BSD_CURRENT_VERSION)
-	    || (BSD_NBITS(data->options[0]) != db->maxbits)
-	    || (decomp && db->lens == NULL)) {
-		printk(KERN_ERR "isdn_bsd: %d %d %d %d %lx\n", data->optlen, data->num, data->options[0], decomp, (unsigned long)db->lens);
-		return 0;
-	}
-
-	if (decomp)
-		for (indx = LAST; indx >= 0; indx--)
-			db->lens[indx] = 1;
-
-	indx = db->hsize;
-	while (indx-- != 0) {
-		db->dict[indx].codem1 = BADCODEM1;
-		db->dict[indx].cptr   = 0;
-	}
-
-	db->unit = unit;
-	db->mru  = 0;
-
-	db->debug = 1;
-
-	bsd_reset(db, 0, 0, NULL, 0, NULL);
-
-	return 1;
-}
-
-/*
- * Obtain pointers to the various structures in the compression tables
- */
-
-#define dict_ptrx(p, idx) &(p->dict[idx])
-#define lens_ptrx(p, idx) &(p->lens[idx])
-
-#ifdef DEBUG
-static unsigned short *lens_ptr(struct bsd_db *db, int idx)
-{
-	if ((unsigned int) idx > (unsigned int) db->maxmaxcode) {
-		printk(KERN_DEBUG "<9>ppp: lens_ptr(%d) > max\n", idx);
-		idx = 0;
-	}
-	return lens_ptrx(db, idx);
-}
-
-static struct bsd_dict *dict_ptr(struct bsd_db *db, int idx)
-{
-	if ((unsigned int) idx >= (unsigned int) db->hsize) {
-		printk(KERN_DEBUG "<9>ppp: dict_ptr(%d) > max\n", idx);
-		idx = 0;
-	}
-	return dict_ptrx(db, idx);
-}
-
-#else
-#define lens_ptr(db, idx) lens_ptrx(db, idx)
-#define dict_ptr(db, idx) dict_ptrx(db, idx)
-#endif
-
-/*
- * compress a packet
- */
-static int bsd_compress(void *state, struct sk_buff *skb_in, struct sk_buff *skb_out, int proto)
-{
-	struct bsd_db *db;
-	int hshift;
-	unsigned int max_ent;
-	unsigned int n_bits;
-	unsigned int bitno;
-	unsigned long accm;
-	int ent;
-	unsigned long fcode;
-	struct bsd_dict *dictp;
-	unsigned char c;
-	int hval, disp, ilen, mxcode;
-	unsigned char *rptr = skb_in->data;
-	int isize = skb_in->len;
-
-#define OUTPUT(ent)							\
-	{								\
-		bitno -= n_bits;					\
-		accm |= ((ent) << bitno);				\
-		do	{						\
-			if (skb_out && skb_tailroom(skb_out) > 0)	\
-				skb_put_u8(skb_out, (u8)(accm >> 24));	\
-			accm <<= 8;					\
-			bitno += 8;					\
-		} while (bitno <= 24);					\
-	}
-
-	/*
-	 * If the protocol is not in the range we're interested in,
-	 * just return without compressing the packet.  If it is,
-	 * the protocol becomes the first byte to compress.
-	 */
-	printk(KERN_DEBUG "bsd_compress called with %x\n", proto);
-
-	ent = proto;
-	if (proto < 0x21 || proto > 0xf9 || !(proto & 0x1))
-		return 0;
-
-	db      = (struct bsd_db *) state;
-	hshift  = db->hshift;
-	max_ent = db->max_ent;
-	n_bits  = db->n_bits;
-	bitno   = 32;
-	accm    = 0;
-	mxcode  = MAXCODE(n_bits);
-
-	/* This is the PPP header information */
-	if (skb_out && skb_tailroom(skb_out) >= 2) {
-		char *v = skb_put(skb_out, 2);
-		/* we only push our own data on the header,
-		   AC,PC and protos is pushed by caller  */
-		v[0] = db->seqno >> 8;
-		v[1] = db->seqno;
-	}
-
-	ilen = ++isize; /* This is off by one, but that is what is in draft! */
-
-	while (--ilen > 0) {
-		c = *rptr++;
-		fcode = BSD_KEY(ent, c);
-		hval = BSD_HASH(ent, c, hshift);
-		dictp = dict_ptr(db, hval);
-
-		/* Validate and then check the entry. */
-		if (dictp->codem1 >= max_ent)
-			goto nomatch;
-
-		if (dictp->fcode == fcode) {
-			ent = dictp->codem1 + 1;
-			continue;	/* found (prefix,suffix) */
-		}
-
-		/* continue probing until a match or invalid entry */
-		disp = (hval == 0) ? 1 : hval;
-
-		do {
-			hval += disp;
-			if (hval >= db->hsize)
-				hval -= db->hsize;
-			dictp = dict_ptr(db, hval);
-			if (dictp->codem1 >= max_ent)
-				goto nomatch;
-		} while (dictp->fcode != fcode);
-
-		ent = dictp->codem1 + 1;	/* finally found (prefix,suffix) */
-		continue;
-
-	nomatch:
-		OUTPUT(ent);		/* output the prefix */
-
-		/* code -> hashtable */
-		if (max_ent < db->maxmaxcode) {
-			struct bsd_dict *dictp2;
-			struct bsd_dict *dictp3;
-			int indx;
-
-			/* expand code size if needed */
-			if (max_ent >= mxcode) {
-				db->n_bits = ++n_bits;
-				mxcode = MAXCODE(n_bits);
-			}
-
-			/*
-			 * Invalidate old hash table entry using
-			 * this code, and then take it over.
-			 */
-			dictp2 = dict_ptr(db, max_ent + 1);
-			indx   = dictp2->cptr;
-			dictp3 = dict_ptr(db, indx);
-
-			if (dictp3->codem1 == max_ent)
-				dictp3->codem1 = BADCODEM1;
-
-			dictp2->cptr   = hval;
-			dictp->codem1  = max_ent;
-			dictp->fcode = fcode;
-			db->max_ent    = ++max_ent;
-
-			if (db->lens) {
-				unsigned short *len1 = lens_ptr(db, max_ent);
-				unsigned short *len2 = lens_ptr(db, ent);
-				*len1 = *len2 + 1;
-			}
-		}
-		ent = c;
-	}
-
-	OUTPUT(ent);		/* output the last code */
-
-	if (skb_out)
-		db->bytes_out    += skb_out->len; /* Do not count bytes from here */
-	db->uncomp_bytes += isize;
-	db->in_count     += isize;
-	++db->uncomp_count;
-	++db->seqno;
-
-	if (bitno < 32)
-		++db->bytes_out; /* must be set before calling bsd_check */
-
-	/*
-	 * Generate the clear command if needed
-	 */
-
-	if (bsd_check(db))
-		OUTPUT(CLEAR);
-
-	/*
-	 * Pad dribble bits of last code with ones.
-	 * Do not emit a completely useless byte of ones.
-	 */
-	if (bitno < 32 && skb_out && skb_tailroom(skb_out) > 0)
-		skb_put_u8(skb_out,
-			   (unsigned char)((accm | (0xff << (bitno - 8))) >> 24));
-
-	/*
-	 * Increase code size if we would have without the packet
-	 * boundary because the decompressor will do so.
-	 */
-	if (max_ent >= mxcode && max_ent < db->maxmaxcode)
-		db->n_bits++;
-
-	/* If output length is too large then this is an incompressible frame. */
-	if (!skb_out || skb_out->len >= skb_in->len) {
-		++db->incomp_count;
-		db->incomp_bytes += isize;
-		return 0;
-	}
-
-	/* Count the number of compressed frames */
-	++db->comp_count;
-	db->comp_bytes += skb_out->len;
-	return skb_out->len;
-
-#undef OUTPUT
-}
-
-/*
- * Update the "BSD Compress" dictionary on the receiver for
- * incompressible data by pretending to compress the incoming data.
- */
-static void bsd_incomp(void *state, struct sk_buff *skb_in, int proto)
-{
-	bsd_compress(state, skb_in, NULL, proto);
-}
-
-/*
- * Decompress "BSD Compress".
- */
-static int bsd_decompress(void *state, struct sk_buff *skb_in, struct sk_buff *skb_out,
-			  struct isdn_ppp_resetparams *rsparm)
-{
-	struct bsd_db *db;
-	unsigned int max_ent;
-	unsigned long accm;
-	unsigned int bitno;		/* 1st valid bit in accm */
-	unsigned int n_bits;
-	unsigned int tgtbitno;	/* bitno when we have a code */
-	struct bsd_dict *dictp;
-	int seq;
-	unsigned int incode;
-	unsigned int oldcode;
-	unsigned int finchar;
-	unsigned char *p, *ibuf;
-	int ilen;
-	int codelen;
-	int extra;
-
-	db       = (struct bsd_db *) state;
-	max_ent  = db->max_ent;
-	accm     = 0;
-	bitno    = 32;		/* 1st valid bit in accm */
-	n_bits   = db->n_bits;
-	tgtbitno = 32 - n_bits;	/* bitno when we have a code */
-
-	printk(KERN_DEBUG "bsd_decompress called\n");
-
-	if (!skb_in || !skb_out) {
-		printk(KERN_ERR "bsd_decompress called with NULL parameter\n");
-		return DECOMP_ERROR;
-	}
-
-	/*
-	 * Get the sequence number.
-	 */
-	if ((p = skb_pull(skb_in, 2)) == NULL) {
-		return DECOMP_ERROR;
-	}
-	p -= 2;
-	seq = (p[0] << 8) + p[1];
-	ilen = skb_in->len;
-	ibuf = skb_in->data;
-
-	/*
-	 * Check the sequence number and give up if it differs from
-	 * the value we're expecting.
-	 */
-	if (seq != db->seqno) {
-		if (db->debug) {
-			printk(KERN_DEBUG "bsd_decomp%d: bad sequence # %d, expected %d\n",
-			       db->unit, seq, db->seqno - 1);
-		}
-		return DECOMP_ERROR;
-	}
-
-	++db->seqno;
-	db->bytes_out += ilen;
-
-	if (skb_tailroom(skb_out) > 0)
-		skb_put_u8(skb_out, 0);
-	else
-		return DECOMP_ERR_NOMEM;
-
-	oldcode = CLEAR;
-
-	/*
-	 * Keep the checkpoint correctly so that incompressible packets
-	 * clear the dictionary at the proper times.
-	 */
-
-	for (;;) {
-		if (ilen-- <= 0) {
-			db->in_count += (skb_out->len - 1); /* don't count the header */
-			break;
-		}
-
-		/*
-		 * Accumulate bytes until we have a complete code.
-		 * Then get the next code, relying on the 32-bit,
-		 * unsigned accm to mask the result.
-		 */
-
-		bitno -= 8;
-		accm  |= *ibuf++ << bitno;
-		if (tgtbitno < bitno)
-			continue;
-
-		incode = accm >> tgtbitno;
-		accm <<= n_bits;
-		bitno += n_bits;
-
-		/*
-		 * The dictionary must only be cleared at the end of a packet.
-		 */
-
-		if (incode == CLEAR) {
-			if (ilen > 0) {
-				if (db->debug)
-					printk(KERN_DEBUG "bsd_decomp%d: bad CLEAR\n", db->unit);
-				return DECOMP_FATALERROR;	/* probably a bug */
-			}
-			bsd_clear(db);
-			break;
-		}
-
-		if ((incode > max_ent + 2) || (incode > db->maxmaxcode)
-		    || (incode > max_ent && oldcode == CLEAR)) {
-			if (db->debug) {
-				printk(KERN_DEBUG "bsd_decomp%d: bad code 0x%x oldcode=0x%x ",
-				       db->unit, incode, oldcode);
-				printk(KERN_DEBUG "max_ent=0x%x skb->Len=%d seqno=%d\n",
-				       max_ent, skb_out->len, db->seqno);
-			}
-			return DECOMP_FATALERROR;	/* probably a bug */
-		}
-
-		/* Special case for KwKwK string. */
-		if (incode > max_ent) {
-			finchar = oldcode;
-			extra   = 1;
-		} else {
-			finchar = incode;
-			extra   = 0;
-		}
-
-		codelen = *(lens_ptr(db, finchar));
-		if (skb_tailroom(skb_out) < codelen + extra) {
-			if (db->debug) {
-				printk(KERN_DEBUG "bsd_decomp%d: ran out of mru\n", db->unit);
-#ifdef DEBUG
-				printk(KERN_DEBUG "  len=%d, finchar=0x%x, codelen=%d,skblen=%d\n",
-				       ilen, finchar, codelen, skb_out->len);
-#endif
-			}
-			return DECOMP_FATALERROR;
-		}
-
-		/*
-		 * Decode this code and install it in the decompressed buffer.
-		 */
-
-		p = skb_put(skb_out, codelen);
-		p += codelen;
-		while (finchar > LAST) {
-			struct bsd_dict *dictp2 = dict_ptr(db, finchar);
-
-			dictp = dict_ptr(db, dictp2->cptr);
-
-#ifdef DEBUG
-			if (--codelen <= 0 || dictp->codem1 != finchar - 1) {
-				if (codelen <= 0) {
-					printk(KERN_ERR "bsd_decomp%d: fell off end of chain ", db->unit);
-					printk(KERN_ERR "0x%x at 0x%x by 0x%x, max_ent=0x%x\n", incode, finchar, dictp2->cptr, max_ent);
-				} else {
-					if (dictp->codem1 != finchar - 1) {
-						printk(KERN_ERR "bsd_decomp%d: bad code chain 0x%x finchar=0x%x ", db->unit, incode, finchar);
-						printk(KERN_ERR "oldcode=0x%x cptr=0x%x codem1=0x%x\n", oldcode, dictp2->cptr, dictp->codem1);
-					}
-				}
-				return DECOMP_FATALERROR;
-			}
-#endif
-
-			{
-				u32 fcode = dictp->fcode;
-				*--p    = (fcode >> 16) & 0xff;
-				finchar = fcode & 0xffff;
-			}
-		}
-		*--p = finchar;
-
-#ifdef DEBUG
-		if (--codelen != 0)
-			printk(KERN_ERR "bsd_decomp%d: short by %d after code 0x%x, max_ent=0x%x\n", db->unit, codelen, incode, max_ent);
-#endif
-
-		if (extra)		/* the KwKwK case again */
-			skb_put_u8(skb_out, finchar);
-
-		/*
-		 * If not first code in a packet, and
-		 * if not out of code space, then allocate a new code.
-		 *
-		 * Keep the hash table correct so it can be used
-		 * with uncompressed packets.
-		 */
-		if (oldcode != CLEAR && max_ent < db->maxmaxcode) {
-			struct bsd_dict *dictp2, *dictp3;
-			u16 *lens1, *lens2;
-			unsigned long fcode;
-			int hval, disp, indx;
-
-			fcode = BSD_KEY(oldcode, finchar);
-			hval  = BSD_HASH(oldcode, finchar, db->hshift);
-			dictp = dict_ptr(db, hval);
-
-			/* look for a free hash table entry */
-			if (dictp->codem1 < max_ent) {
-				disp = (hval == 0) ? 1 : hval;
-				do {
-					hval += disp;
-					if (hval >= db->hsize)
-						hval -= db->hsize;
-					dictp = dict_ptr(db, hval);
-				} while (dictp->codem1 < max_ent);
-			}
-
-			/*
-			 * Invalidate previous hash table entry
-			 * assigned this code, and then take it over
-			 */
-
-			dictp2 = dict_ptr(db, max_ent + 1);
-			indx   = dictp2->cptr;
-			dictp3 = dict_ptr(db, indx);
-
-			if (dictp3->codem1 == max_ent)
-				dictp3->codem1 = BADCODEM1;
-
-			dictp2->cptr   = hval;
-			dictp->codem1  = max_ent;
-			dictp->fcode = fcode;
-			db->max_ent    = ++max_ent;
-
-			/* Update the length of this string. */
-			lens1  = lens_ptr(db, max_ent);
-			lens2  = lens_ptr(db, oldcode);
-			*lens1 = *lens2 + 1;
-
-			/* Expand code size if needed. */
-			if (max_ent >= MAXCODE(n_bits) && max_ent < db->maxmaxcode) {
-				db->n_bits = ++n_bits;
-				tgtbitno   = 32-n_bits;
-			}
-		}
-		oldcode = incode;
-	}
-
-	++db->comp_count;
-	++db->uncomp_count;
-	db->comp_bytes   += skb_in->len - BSD_OVHD;
-	db->uncomp_bytes += skb_out->len;
-
-	if (bsd_check(db)) {
-		if (db->debug)
-			printk(KERN_DEBUG "bsd_decomp%d: peer should have cleared dictionary on %d\n",
-			       db->unit, db->seqno - 1);
-	}
-	return skb_out->len;
-}
-
-/*************************************************************
- * Table of addresses for the BSD compression module
- *************************************************************/
-
-static struct isdn_ppp_compressor ippp_bsd_compress = {
-	.owner          = THIS_MODULE,
-	.num            = CI_BSD_COMPRESS,
-	.alloc          = bsd_alloc,
-	.free           = bsd_free,
-	.init           = bsd_init,
-	.reset          = bsd_reset,
-	.compress       = bsd_compress,
-	.decompress     = bsd_decompress,
-	.incomp         = bsd_incomp,
-	.stat           = bsd_stats,
-};
-
-/*************************************************************
- * Module support routines
- *************************************************************/
-
-static int __init isdn_bsdcomp_init(void)
-{
-	int answer = isdn_ppp_register_compressor(&ippp_bsd_compress);
-	if (answer == 0)
-		printk(KERN_INFO "PPP BSD Compression module registered\n");
-	return answer;
-}
-
-static void __exit isdn_bsdcomp_exit(void)
-{
-	isdn_ppp_unregister_compressor(&ippp_bsd_compress);
-}
-
-module_init(isdn_bsdcomp_init);
-module_exit(isdn_bsdcomp_exit);
diff --git a/drivers/isdn/i4l/isdn_common.c b/drivers/isdn/i4l/isdn_common.c
deleted file mode 100644
index 74ee00f5b310..000000000000
--- a/drivers/isdn/i4l/isdn_common.c
+++ /dev/null
@@ -1,2368 +0,0 @@
-/* $Id: isdn_common.c,v 1.1.2.3 2004/02/10 01:07:13 keil Exp $
- *
- * Linux ISDN subsystem, common used functions (linklevel).
- *
- * Copyright 1994-1999  by Fritz Elfert (fritz@isdn4linux.de)
- * Copyright 1995,96    Thinking Objects Software GmbH Wuerzburg
- * Copyright 1995,96    by Michael Hipp (Michael.Hipp@student.uni-tuebingen.de)
- *
- * This software may be used and distributed according to the terms
- * of the GNU General Public License, incorporated herein by reference.
- *
- */
-
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/poll.h>
-#include <linux/slab.h>
-#include <linux/vmalloc.h>
-#include <linux/isdn.h>
-#include <linux/mutex.h>
-#include "isdn_common.h"
-#include "isdn_tty.h"
-#include "isdn_net.h"
-#include "isdn_ppp.h"
-#ifdef CONFIG_ISDN_AUDIO
-#include "isdn_audio.h"
-#endif
-#ifdef CONFIG_ISDN_DIVERSION_MODULE
-#define CONFIG_ISDN_DIVERSION
-#endif
-#ifdef CONFIG_ISDN_DIVERSION
-#include <linux/isdn_divertif.h>
-#endif /* CONFIG_ISDN_DIVERSION */
-#include "isdn_v110.h"
-
-/* Debugflags */
-#undef ISDN_DEBUG_STATCALLB
-
-MODULE_DESCRIPTION("ISDN4Linux: link layer");
-MODULE_AUTHOR("Fritz Elfert");
-MODULE_LICENSE("GPL");
-
-isdn_dev *dev;
-
-static DEFINE_MUTEX(isdn_mutex);
-static char *isdn_revision = "$Revision: 1.1.2.3 $";
-
-extern char *isdn_net_revision;
-#ifdef CONFIG_ISDN_PPP
-extern char *isdn_ppp_revision;
-#else
-static char *isdn_ppp_revision = ": none $";
-#endif
-#ifdef CONFIG_ISDN_AUDIO
-extern char *isdn_audio_revision;
-#else
-static char *isdn_audio_revision = ": none $";
-#endif
-extern char *isdn_v110_revision;
-
-#ifdef CONFIG_ISDN_DIVERSION
-static isdn_divert_if *divert_if; /* = NULL */
-#endif /* CONFIG_ISDN_DIVERSION */
-
-
-static int isdn_writebuf_stub(int, int, const u_char __user *, int);
-static void set_global_features(void);
-static int isdn_wildmat(char *s, char *p);
-static int isdn_add_channels(isdn_driver_t *d, int drvidx, int n, int adding);
-
-static inline void
-isdn_lock_driver(isdn_driver_t *drv)
-{
-	try_module_get(drv->interface->owner);
-	drv->locks++;
-}
-
-void
-isdn_lock_drivers(void)
-{
-	int i;
-
-	for (i = 0; i < ISDN_MAX_DRIVERS; i++) {
-		if (!dev->drv[i])
-			continue;
-		isdn_lock_driver(dev->drv[i]);
-	}
-}
-
-static inline void
-isdn_unlock_driver(isdn_driver_t *drv)
-{
-	if (drv->locks > 0) {
-		drv->locks--;
-		module_put(drv->interface->owner);
-	}
-}
-
-void
-isdn_unlock_drivers(void)
-{
-	int i;
-
-	for (i = 0; i < ISDN_MAX_DRIVERS; i++) {
-		if (!dev->drv[i])
-			continue;
-		isdn_unlock_driver(dev->drv[i]);
-	}
-}
-
-#if defined(ISDN_DEBUG_NET_DUMP) || defined(ISDN_DEBUG_MODEM_DUMP)
-void
-isdn_dumppkt(char *s, u_char *p, int len, int dumplen)
-{
-	int dumpc;
-
-	printk(KERN_DEBUG "%s(%d) ", s, len);
-	for (dumpc = 0; (dumpc < dumplen) && (len); len--, dumpc++)
-		printk(" %02x", *p++);
-	printk("\n");
-}
-#endif
-
-/*
- * I picked the pattern-matching-functions from an old GNU-tar version (1.10)
- * It was originally written and put to PD by rs@mirror.TMC.COM (Rich Salz)
- */
-static int
-isdn_star(char *s, char *p)
-{
-	while (isdn_wildmat(s, p)) {
-		if (*++s == '\0')
-			return (2);
-	}
-	return (0);
-}
-
-/*
- * Shell-type Pattern-matching for incoming caller-Ids
- * This function gets a string in s and checks, if it matches the pattern
- * given in p.
- *
- * Return:
- *   0 = match.
- *   1 = no match.
- *   2 = no match. Would eventually match, if s would be longer.
- *
- * Possible Patterns:
- *
- * '?'     matches one character
- * '*'     matches zero or more characters
- * [xyz]   matches the set of characters in brackets.
- * [^xyz]  matches any single character not in the set of characters
- */
-
-static int
-isdn_wildmat(char *s, char *p)
-{
-	register int last;
-	register int matched;
-	register int reverse;
-	register int nostar = 1;
-
-	if (!(*s) && !(*p))
-		return (1);
-	for (; *p; s++, p++)
-		switch (*p) {
-		case '\\':
-			/* Literal match with following character. */
-			p++;
-			/* fall through */
-		default:
-			if (*s != *p)
-				return (*s == '\0') ? 2 : 1;
-					continue;
-		case '?':
-			/* Match anything. */
-			if (*s == '\0')
-				return (2);
-			continue;
-		case '*':
-			nostar = 0;
-			/* Trailing star matches everything. */
-			return (*++p ? isdn_star(s, p) : 0);
-		case '[':
-			/* [^....] means inverse character class. */
-			if ((reverse = (p[1] == '^')))
-				p++;
-			for (last = 0, matched = 0; *++p && (*p != ']'); last = *p)
-				/* This next line requires a good C compiler. */
-				if (*p == '-' ? *s <= *++p && *s >= last : *s == *p)
-					matched = 1;
-			if (matched == reverse)
-				return (1);
-			continue;
-		}
-	return (*s == '\0') ? 0 : nostar;
-}
-
-int isdn_msncmp(const char *msn1, const char *msn2)
-{
-	char TmpMsn1[ISDN_MSNLEN];
-	char TmpMsn2[ISDN_MSNLEN];
-	char *p;
-
-	for (p = TmpMsn1; *msn1 && *msn1 != ':';)  // Strip off a SPID
-		*p++ = *msn1++;
-	*p = '\0';
-
-	for (p = TmpMsn2; *msn2 && *msn2 != ':';)  // Strip off a SPID
-		*p++ = *msn2++;
-	*p = '\0';
-
-	return isdn_wildmat(TmpMsn1, TmpMsn2);
-}
-
-int
-isdn_dc2minor(int di, int ch)
-{
-	int i;
-	for (i = 0; i < ISDN_MAX_CHANNELS; i++)
-		if (dev->chanmap[i] == ch && dev->drvmap[i] == di)
-			return i;
-	return -1;
-}
-
-static int isdn_timer_cnt1 = 0;
-static int isdn_timer_cnt2 = 0;
-static int isdn_timer_cnt3 = 0;
-
-static void
-isdn_timer_funct(struct timer_list *unused)
-{
-	int tf = dev->tflags;
-	if (tf & ISDN_TIMER_FAST) {
-		if (tf & ISDN_TIMER_MODEMREAD)
-			isdn_tty_readmodem();
-		if (tf & ISDN_TIMER_MODEMPLUS)
-			isdn_tty_modem_escape();
-		if (tf & ISDN_TIMER_MODEMXMIT)
-			isdn_tty_modem_xmit();
-	}
-	if (tf & ISDN_TIMER_SLOW) {
-		if (++isdn_timer_cnt1 >= ISDN_TIMER_02SEC) {
-			isdn_timer_cnt1 = 0;
-			if (tf & ISDN_TIMER_NETDIAL)
-				isdn_net_dial();
-		}
-		if (++isdn_timer_cnt2 >= ISDN_TIMER_1SEC) {
-			isdn_timer_cnt2 = 0;
-			if (tf & ISDN_TIMER_NETHANGUP)
-				isdn_net_autohup();
-			if (++isdn_timer_cnt3 >= ISDN_TIMER_RINGING) {
-				isdn_timer_cnt3 = 0;
-				if (tf & ISDN_TIMER_MODEMRING)
-					isdn_tty_modem_ring();
-			}
-			if (tf & ISDN_TIMER_CARRIER)
-				isdn_tty_carrier_timeout();
-		}
-	}
-	if (tf)
-		mod_timer(&dev->timer, jiffies + ISDN_TIMER_RES);
-}
-
-void
-isdn_timer_ctrl(int tf, int onoff)
-{
-	unsigned long flags;
-	int old_tflags;
-
-	spin_lock_irqsave(&dev->timerlock, flags);
-	if ((tf & ISDN_TIMER_SLOW) && (!(dev->tflags & ISDN_TIMER_SLOW))) {
-		/* If the slow-timer wasn't activated until now */
-		isdn_timer_cnt1 = 0;
-		isdn_timer_cnt2 = 0;
-	}
-	old_tflags = dev->tflags;
-	if (onoff)
-		dev->tflags |= tf;
-	else
-		dev->tflags &= ~tf;
-	if (dev->tflags && !old_tflags)
-		mod_timer(&dev->timer, jiffies + ISDN_TIMER_RES);
-	spin_unlock_irqrestore(&dev->timerlock, flags);
-}
-
-/*
- * Receive a packet from B-Channel. (Called from low-level-module)
- */
-static void
-isdn_receive_skb_callback(int di, int channel, struct sk_buff *skb)
-{
-	int i;
-
-	if ((i = isdn_dc2minor(di, channel)) == -1) {
-		dev_kfree_skb(skb);
-		return;
-	}
-	/* Update statistics */
-	dev->ibytes[i] += skb->len;
-
-	/* First, try to deliver data to network-device */
-	if (isdn_net_rcv_skb(i, skb))
-		return;
-
-	/* V.110 handling
-	 * makes sense for async streams only, so it is
-	 * called after possible net-device delivery.
-	 */
-	if (dev->v110[i]) {
-		atomic_inc(&dev->v110use[i]);
-		skb = isdn_v110_decode(dev->v110[i], skb);
-		atomic_dec(&dev->v110use[i]);
-		if (!skb)
-			return;
-	}
-
-	/* No network-device found, deliver to tty or raw-channel */
-	if (skb->len) {
-		if (isdn_tty_rcv_skb(i, di, channel, skb))
-			return;
-		wake_up_interruptible(&dev->drv[di]->rcv_waitq[channel]);
-	} else
-		dev_kfree_skb(skb);
-}
-
-/*
- * Intercept command from Linklevel to Lowlevel.
- * If layer 2 protocol is V.110 and this is not supported by current
- * lowlevel-driver, use driver's transparent mode and handle V.110 in
- * linklevel instead.
- */
-int
-isdn_command(isdn_ctrl *cmd)
-{
-	if (cmd->driver == -1) {
-		printk(KERN_WARNING "isdn_command command(%x) driver -1\n", cmd->command);
-		return (1);
-	}
-	if (!dev->drv[cmd->driver]) {
-		printk(KERN_WARNING "isdn_command command(%x) dev->drv[%d] NULL\n",
-		       cmd->command, cmd->driver);
-		return (1);
-	}
-	if (!dev->drv[cmd->driver]->interface) {
-		printk(KERN_WARNING "isdn_command command(%x) dev->drv[%d]->interface NULL\n",
-		       cmd->command, cmd->driver);
-		return (1);
-	}
-	if (cmd->command == ISDN_CMD_SETL2) {
-		int idx = isdn_dc2minor(cmd->driver, cmd->arg & 255);
-		unsigned long l2prot = (cmd->arg >> 8) & 255;
-		unsigned long features = (dev->drv[cmd->driver]->interface->features
-					  >> ISDN_FEATURE_L2_SHIFT) &
-			ISDN_FEATURE_L2_MASK;
-		unsigned long l2_feature = (1 << l2prot);
-
-		switch (l2prot) {
-		case ISDN_PROTO_L2_V11096:
-		case ISDN_PROTO_L2_V11019:
-		case ISDN_PROTO_L2_V11038:
-			/* If V.110 requested, but not supported by
-			 * HL-driver, set emulator-flag and change
-			 * Layer-2 to transparent
-			 */
-			if (!(features & l2_feature)) {
-				dev->v110emu[idx] = l2prot;
-				cmd->arg = (cmd->arg & 255) |
-					(ISDN_PROTO_L2_TRANS << 8);
-			} else
-				dev->v110emu[idx] = 0;
-		}
-	}
-	return dev->drv[cmd->driver]->interface->command(cmd);
-}
-
-void
-isdn_all_eaz(int di, int ch)
-{
-	isdn_ctrl cmd;
-
-	if (di < 0)
-		return;
-	cmd.driver = di;
-	cmd.arg = ch;
-	cmd.command = ISDN_CMD_SETEAZ;
-	cmd.parm.num[0] = '\0';
-	isdn_command(&cmd);
-}
-
-/*
- * Begin of a CAPI like LL<->HL interface, currently used only for
- * supplementary service (CAPI 2.0 part III)
- */
-#include <linux/isdn/capicmd.h>
-
-static int
-isdn_capi_rec_hl_msg(capi_msg *cm)
-{
-	switch (cm->Command) {
-	case CAPI_FACILITY:
-		/* in the moment only handled in tty */
-		return (isdn_tty_capi_facility(cm));
-	default:
-		return (-1);
-	}
-}
-
-static int
-isdn_status_callback(isdn_ctrl *c)
-{
-	int di;
-	u_long flags;
-	int i;
-	int r;
-	int retval = 0;
-	isdn_ctrl cmd;
-	isdn_net_dev *p;
-
-	di = c->driver;
-	i = isdn_dc2minor(di, c->arg);
-	switch (c->command) {
-	case ISDN_STAT_BSENT:
-		if (i < 0)
-			return -1;
-		if (dev->global_flags & ISDN_GLOBAL_STOPPED)
-			return 0;
-		if (isdn_net_stat_callback(i, c))
-			return 0;
-		if (isdn_v110_stat_callback(i, c))
-			return 0;
-		if (isdn_tty_stat_callback(i, c))
-			return 0;
-		wake_up_interruptible(&dev->drv[di]->snd_waitq[c->arg]);
-		break;
-	case ISDN_STAT_STAVAIL:
-		dev->drv[di]->stavail += c->arg;
-		wake_up_interruptible(&dev->drv[di]->st_waitq);
-		break;
-	case ISDN_STAT_RUN:
-		dev->drv[di]->flags |= DRV_FLAG_RUNNING;
-		for (i = 0; i < ISDN_MAX_CHANNELS; i++)
-			if (dev->drvmap[i] == di)
-				isdn_all_eaz(di, dev->chanmap[i]);
-		set_global_features();
-		break;
-	case ISDN_STAT_STOP:
-		dev->drv[di]->flags &= ~DRV_FLAG_RUNNING;
-		break;
-	case ISDN_STAT_ICALL:
-		if (i < 0)
-			return -1;
-#ifdef ISDN_DEBUG_STATCALLB
-		printk(KERN_DEBUG "ICALL (net): %d %ld %s\n", di, c->arg, c->parm.num);
-#endif
-		if (dev->global_flags & ISDN_GLOBAL_STOPPED) {
-			cmd.driver = di;
-			cmd.arg = c->arg;
-			cmd.command = ISDN_CMD_HANGUP;
-			isdn_command(&cmd);
-			return 0;
-		}
-		/* Try to find a network-interface which will accept incoming call */
-		r = ((c->command == ISDN_STAT_ICALLW) ? 0 : isdn_net_find_icall(di, c->arg, i, &c->parm.setup));
-		switch (r) {
-		case 0:
-			/* No network-device replies.
-			 * Try ttyI's.
-			 * These return 0 on no match, 1 on match and
-			 * 3 on eventually match, if CID is longer.
-			 */
-			if (c->command == ISDN_STAT_ICALL)
-				if ((retval = isdn_tty_find_icall(di, c->arg, &c->parm.setup))) return (retval);
-#ifdef CONFIG_ISDN_DIVERSION
-			if (divert_if)
-				if ((retval = divert_if->stat_callback(c)))
-					return (retval); /* processed */
-#endif /* CONFIG_ISDN_DIVERSION */
-			if ((!retval) && (dev->drv[di]->flags & DRV_FLAG_REJBUS)) {
-				/* No tty responding */
-				cmd.driver = di;
-				cmd.arg = c->arg;
-				cmd.command = ISDN_CMD_HANGUP;
-				isdn_command(&cmd);
-				retval = 2;
-			}
-			break;
-		case 1:
-			/* Schedule connection-setup */
-			isdn_net_dial();
-			cmd.driver = di;
-			cmd.arg = c->arg;
-			cmd.command = ISDN_CMD_ACCEPTD;
-			for (p = dev->netdev; p; p = p->next)
-				if (p->local->isdn_channel == cmd.arg)
-				{
-					strcpy(cmd.parm.setup.eazmsn, p->local->msn);
-					isdn_command(&cmd);
-					retval = 1;
-					break;
-				}
-			break;
-
-		case 2:	/* For calling back, first reject incoming call ... */
-		case 3:	/* Interface found, but down, reject call actively  */
-			retval = 2;
-			printk(KERN_INFO "isdn: Rejecting Call\n");
-			cmd.driver = di;
-			cmd.arg = c->arg;
-			cmd.command = ISDN_CMD_HANGUP;
-			isdn_command(&cmd);
-			if (r == 3)
-				break;
-			/* Fall through */
-		case 4:
-			/* ... then start callback. */
-			isdn_net_dial();
-			break;
-		case 5:
-			/* Number would eventually match, if longer */
-			retval = 3;
-			break;
-		}
-#ifdef ISDN_DEBUG_STATCALLB
-		printk(KERN_DEBUG "ICALL: ret=%d\n", retval);
-#endif
-		return retval;
-		break;
-	case ISDN_STAT_CINF:
-		if (i < 0)
-			return -1;
-#ifdef ISDN_DEBUG_STATCALLB
-		printk(KERN_DEBUG "CINF: %ld %s\n", c->arg, c->parm.num);
-#endif
-		if (dev->global_flags & ISDN_GLOBAL_STOPPED)
-			return 0;
-		if (strcmp(c->parm.num, "0"))
-			isdn_net_stat_callback(i, c);
-		isdn_tty_stat_callback(i, c);
-		break;
-	case ISDN_STAT_CAUSE:
-#ifdef ISDN_DEBUG_STATCALLB
-		printk(KERN_DEBUG "CAUSE: %ld %s\n", c->arg, c->parm.num);
-#endif
-		printk(KERN_INFO "isdn: %s,ch%ld cause: %s\n",
-		       dev->drvid[di], c->arg, c->parm.num);
-		isdn_tty_stat_callback(i, c);
-#ifdef CONFIG_ISDN_DIVERSION
-		if (divert_if)
-			divert_if->stat_callback(c);
-#endif /* CONFIG_ISDN_DIVERSION */
-		break;
-	case ISDN_STAT_DISPLAY:
-#ifdef ISDN_DEBUG_STATCALLB
-		printk(KERN_DEBUG "DISPLAY: %ld %s\n", c->arg, c->parm.display);
-#endif
-		isdn_tty_stat_callback(i, c);
-#ifdef CONFIG_ISDN_DIVERSION
-		if (divert_if)
-			divert_if->stat_callback(c);
-#endif /* CONFIG_ISDN_DIVERSION */
-		break;
-	case ISDN_STAT_DCONN:
-		if (i < 0)
-			return -1;
-#ifdef ISDN_DEBUG_STATCALLB
-		printk(KERN_DEBUG "DCONN: %ld\n", c->arg);
-#endif
-		if (dev->global_flags & ISDN_GLOBAL_STOPPED)
-			return 0;
-		/* Find any net-device, waiting for D-channel setup */
-		if (isdn_net_stat_callback(i, c))
-			break;
-		isdn_v110_stat_callback(i, c);
-		/* Find any ttyI, waiting for D-channel setup */
-		if (isdn_tty_stat_callback(i, c)) {
-			cmd.driver = di;
-			cmd.arg = c->arg;
-			cmd.command = ISDN_CMD_ACCEPTB;
-			isdn_command(&cmd);
-			break;
-		}
-		break;
-	case ISDN_STAT_DHUP:
-		if (i < 0)
-			return -1;
-#ifdef ISDN_DEBUG_STATCALLB
-		printk(KERN_DEBUG "DHUP: %ld\n", c->arg);
-#endif
-		if (dev->global_flags & ISDN_GLOBAL_STOPPED)
-			return 0;
-		dev->drv[di]->online &= ~(1 << (c->arg));
-		isdn_info_update();
-		/* Signal hangup to network-devices */
-		if (isdn_net_stat_callback(i, c))
-			break;
-		isdn_v110_stat_callback(i, c);
-		if (isdn_tty_stat_callback(i, c))
-			break;
-#ifdef CONFIG_ISDN_DIVERSION
-		if (divert_if)
-			divert_if->stat_callback(c);
-#endif /* CONFIG_ISDN_DIVERSION */
-		break;
-		break;
-	case ISDN_STAT_BCONN:
-		if (i < 0)
-			return -1;
-#ifdef ISDN_DEBUG_STATCALLB
-		printk(KERN_DEBUG "BCONN: %ld\n", c->arg);
-#endif
-		/* Signal B-channel-connect to network-devices */
-		if (dev->global_flags & ISDN_GLOBAL_STOPPED)
-			return 0;
-		dev->drv[di]->online |= (1 << (c->arg));
-		isdn_info_update();
-		if (isdn_net_stat_callback(i, c))
-			break;
-		isdn_v110_stat_callback(i, c);
-		if (isdn_tty_stat_callback(i, c))
-			break;
-		break;
-	case ISDN_STAT_BHUP:
-		if (i < 0)
-			return -1;
-#ifdef ISDN_DEBUG_STATCALLB
-		printk(KERN_DEBUG "BHUP: %ld\n", c->arg);
-#endif
-		if (dev->global_flags & ISDN_GLOBAL_STOPPED)
-			return 0;
-		dev->drv[di]->online &= ~(1 << (c->arg));
-		isdn_info_update();
-#ifdef CONFIG_ISDN_X25
-		/* Signal hangup to network-devices */
-		if (isdn_net_stat_callback(i, c))
-			break;
-#endif
-		isdn_v110_stat_callback(i, c);
-		if (isdn_tty_stat_callback(i, c))
-			break;
-		break;
-	case ISDN_STAT_NODCH:
-		if (i < 0)
-			return -1;
-#ifdef ISDN_DEBUG_STATCALLB
-		printk(KERN_DEBUG "NODCH: %ld\n", c->arg);
-#endif
-		if (dev->global_flags & ISDN_GLOBAL_STOPPED)
-			return 0;
-		if (isdn_net_stat_callback(i, c))
-			break;
-		if (isdn_tty_stat_callback(i, c))
-			break;
-		break;
-	case ISDN_STAT_ADDCH:
-		spin_lock_irqsave(&dev->lock, flags);
-		if (isdn_add_channels(dev->drv[di], di, c->arg, 1)) {
-			spin_unlock_irqrestore(&dev->lock, flags);
-			return -1;
-		}
-		spin_unlock_irqrestore(&dev->lock, flags);
-		isdn_info_update();
-		break;
-	case ISDN_STAT_DISCH:
-		spin_lock_irqsave(&dev->lock, flags);
-		for (i = 0; i < ISDN_MAX_CHANNELS; i++)
-			if ((dev->drvmap[i] == di) &&
-			    (dev->chanmap[i] == c->arg)) {
-				if (c->parm.num[0])
-					dev->usage[i] &= ~ISDN_USAGE_DISABLED;
-				else
-					if (USG_NONE(dev->usage[i])) {
-						dev->usage[i] |= ISDN_USAGE_DISABLED;
-					}
-					else
-						retval = -1;
-				break;
-			}
-		spin_unlock_irqrestore(&dev->lock, flags);
-		isdn_info_update();
-		break;
-	case ISDN_STAT_UNLOAD:
-		while (dev->drv[di]->locks > 0) {
-			isdn_unlock_driver(dev->drv[di]);
-		}
-		spin_lock_irqsave(&dev->lock, flags);
-		isdn_tty_stat_callback(i, c);
-		for (i = 0; i < ISDN_MAX_CHANNELS; i++)
-			if (dev->drvmap[i] == di) {
-				dev->drvmap[i] = -1;
-				dev->chanmap[i] = -1;
-				dev->usage[i] &= ~ISDN_USAGE_DISABLED;
-			}
-		dev->drivers--;
-		dev->channels -= dev->drv[di]->channels;
-		kfree(dev->drv[di]->rcverr);
-		kfree(dev->drv[di]->rcvcount);
-		for (i = 0; i < dev->drv[di]->channels; i++)
-			skb_queue_purge(&dev->drv[di]->rpqueue[i]);
-		kfree(dev->drv[di]->rpqueue);
-		kfree(dev->drv[di]->rcv_waitq);
-		kfree(dev->drv[di]);
-		dev->drv[di] = NULL;
-		dev->drvid[di][0] = '\0';
-		isdn_info_update();
-		set_global_features();
-		spin_unlock_irqrestore(&dev->lock, flags);
-		return 0;
-	case ISDN_STAT_L1ERR:
-		break;
-	case CAPI_PUT_MESSAGE:
-		return (isdn_capi_rec_hl_msg(&c->parm.cmsg));
-#ifdef CONFIG_ISDN_TTY_FAX
-	case ISDN_STAT_FAXIND:
-		isdn_tty_stat_callback(i, c);
-		break;
-#endif
-#ifdef CONFIG_ISDN_AUDIO
-	case ISDN_STAT_AUDIO:
-		isdn_tty_stat_callback(i, c);
-		break;
-#endif
-#ifdef CONFIG_ISDN_DIVERSION
-	case ISDN_STAT_PROT:
-	case ISDN_STAT_REDIR:
-		if (divert_if)
-			return (divert_if->stat_callback(c));
-#endif /* CONFIG_ISDN_DIVERSION */
-		/* fall through */
-	default:
-		return -1;
-	}
-	return 0;
-}
-
-/*
- * Get integer from char-pointer, set pointer to end of number
- */
-int
-isdn_getnum(char **p)
-{
-	int v = -1;
-
-	while (*p[0] >= '0' && *p[0] <= '9')
-		v = ((v < 0) ? 0 : (v * 10)) + (int) ((*p[0]++) - '0');
-	return v;
-}
-
-#define DLE 0x10
-
-/*
- * isdn_readbchan() tries to get data from the read-queue.
- * It MUST be called with interrupts off.
- *
- * Be aware that this is not an atomic operation when sleep != 0, even though
- * interrupts are turned off! Well, like that we are currently only called
- * on behalf of a read system call on raw device files (which are documented
- * to be dangerous and for debugging purpose only). The inode semaphore
- * takes care that this is not called for the same minor device number while
- * we are sleeping, but access is not serialized against simultaneous read()
- * from the corresponding ttyI device. Can other ugly events, like changes
- * of the mapping (di,ch)<->minor, happen during the sleep? --he
- */
-int
-isdn_readbchan(int di, int channel, u_char *buf, u_char *fp, int len, wait_queue_head_t *sleep)
-{
-	int count;
-	int count_pull;
-	int count_put;
-	int dflag;
-	struct sk_buff *skb;
-	u_char *cp;
-
-	if (!dev->drv[di])
-		return 0;
-	if (skb_queue_empty(&dev->drv[di]->rpqueue[channel])) {
-		if (sleep)
-			wait_event_interruptible(*sleep,
-				!skb_queue_empty(&dev->drv[di]->rpqueue[channel]));
-		else
-			return 0;
-	}
-	if (len > dev->drv[di]->rcvcount[channel])
-		len = dev->drv[di]->rcvcount[channel];
-	cp = buf;
-	count = 0;
-	while (len) {
-		if (!(skb = skb_peek(&dev->drv[di]->rpqueue[channel])))
-			break;
-#ifdef CONFIG_ISDN_AUDIO
-		if (ISDN_AUDIO_SKB_LOCK(skb))
-			break;
-		ISDN_AUDIO_SKB_LOCK(skb) = 1;
-		if ((ISDN_AUDIO_SKB_DLECOUNT(skb)) || (dev->drv[di]->DLEflag & (1 << channel))) {
-			char *p = skb->data;
-			unsigned long DLEmask = (1 << channel);
-
-			dflag = 0;
-			count_pull = count_put = 0;
-			while ((count_pull < skb->len) && (len > 0)) {
-				len--;
-				if (dev->drv[di]->DLEflag & DLEmask) {
-					*cp++ = DLE;
-					dev->drv[di]->DLEflag &= ~DLEmask;
-				} else {
-					*cp++ = *p;
-					if (*p == DLE) {
-						dev->drv[di]->DLEflag |= DLEmask;
-						(ISDN_AUDIO_SKB_DLECOUNT(skb))--;
-					}
-					p++;
-					count_pull++;
-				}
-				count_put++;
-			}
-			if (count_pull >= skb->len)
-				dflag = 1;
-		} else {
-#endif
-			/* No DLE's in buff, so simply copy it */
-			dflag = 1;
-			if ((count_pull = skb->len) > len) {
-				count_pull = len;
-				dflag = 0;
-			}
-			count_put = count_pull;
-			skb_copy_from_linear_data(skb, cp, count_put);
-			cp += count_put;
-			len -= count_put;
-#ifdef CONFIG_ISDN_AUDIO
-		}
-#endif
-		count += count_put;
-		if (fp) {
-			memset(fp, 0, count_put);
-			fp += count_put;
-		}
-		if (dflag) {
-			/* We got all the data in this buff.
-			 * Now we can dequeue it.
-			 */
-			if (fp)
-				*(fp - 1) = 0xff;
-#ifdef CONFIG_ISDN_AUDIO
-			ISDN_AUDIO_SKB_LOCK(skb) = 0;
-#endif
-			skb = skb_dequeue(&dev->drv[di]->rpqueue[channel]);
-			dev_kfree_skb(skb);
-		} else {
-			/* Not yet emptied this buff, so it
-			 * must stay in the queue, for further calls
-			 * but we pull off the data we got until now.
-			 */
-			skb_pull(skb, count_pull);
-#ifdef CONFIG_ISDN_AUDIO
-			ISDN_AUDIO_SKB_LOCK(skb) = 0;
-#endif
-		}
-		dev->drv[di]->rcvcount[channel] -= count_put;
-	}
-	return count;
-}
-
-/*
- * isdn_readbchan_tty() tries to get data from the read-queue.
- * It MUST be called with interrupts off.
- *
- * Be aware that this is not an atomic operation when sleep != 0, even though
- * interrupts are turned off! Well, like that we are currently only called
- * on behalf of a read system call on raw device files (which are documented
- * to be dangerous and for debugging purpose only). The inode semaphore
- * takes care that this is not called for the same minor device number while
- * we are sleeping, but access is not serialized against simultaneous read()
- * from the corresponding ttyI device. Can other ugly events, like changes
- * of the mapping (di,ch)<->minor, happen during the sleep? --he
- */
-int
-isdn_readbchan_tty(int di, int channel, struct tty_port *port, int cisco_hack)
-{
-	int count;
-	int count_pull;
-	int count_put;
-	int dflag;
-	struct sk_buff *skb;
-	char last = 0;
-	int len;
-
-	if (!dev->drv[di])
-		return 0;
-	if (skb_queue_empty(&dev->drv[di]->rpqueue[channel]))
-		return 0;
-
-	len = tty_buffer_request_room(port, dev->drv[di]->rcvcount[channel]);
-	if (len == 0)
-		return len;
-
-	count = 0;
-	while (len) {
-		if (!(skb = skb_peek(&dev->drv[di]->rpqueue[channel])))
-			break;
-#ifdef CONFIG_ISDN_AUDIO
-		if (ISDN_AUDIO_SKB_LOCK(skb))
-			break;
-		ISDN_AUDIO_SKB_LOCK(skb) = 1;
-		if ((ISDN_AUDIO_SKB_DLECOUNT(skb)) || (dev->drv[di]->DLEflag & (1 << channel))) {
-			char *p = skb->data;
-			unsigned long DLEmask = (1 << channel);
-
-			dflag = 0;
-			count_pull = count_put = 0;
-			while ((count_pull < skb->len) && (len > 0)) {
-				/* push every character but the last to the tty buffer directly */
-				if (count_put)
-					tty_insert_flip_char(port, last, TTY_NORMAL);
-				len--;
-				if (dev->drv[di]->DLEflag & DLEmask) {
-					last = DLE;
-					dev->drv[di]->DLEflag &= ~DLEmask;
-				} else {
-					last = *p;
-					if (last == DLE) {
-						dev->drv[di]->DLEflag |= DLEmask;
-						(ISDN_AUDIO_SKB_DLECOUNT(skb))--;
-					}
-					p++;
-					count_pull++;
-				}
-				count_put++;
-			}
-			if (count_pull >= skb->len)
-				dflag = 1;
-		} else {
-#endif
-			/* No DLE's in buff, so simply copy it */
-			dflag = 1;
-			if ((count_pull = skb->len) > len) {
-				count_pull = len;
-				dflag = 0;
-			}
-			count_put = count_pull;
-			if (count_put > 1)
-				tty_insert_flip_string(port, skb->data, count_put - 1);
-			last = skb->data[count_put - 1];
-			len -= count_put;
-#ifdef CONFIG_ISDN_AUDIO
-		}
-#endif
-		count += count_put;
-		if (dflag) {
-			/* We got all the data in this buff.
-			 * Now we can dequeue it.
-			 */
-			if (cisco_hack)
-				tty_insert_flip_char(port, last, 0xFF);
-			else
-				tty_insert_flip_char(port, last, TTY_NORMAL);
-#ifdef CONFIG_ISDN_AUDIO
-			ISDN_AUDIO_SKB_LOCK(skb) = 0;
-#endif
-			skb = skb_dequeue(&dev->drv[di]->rpqueue[channel]);
-			dev_kfree_skb(skb);
-		} else {
-			tty_insert_flip_char(port, last, TTY_NORMAL);
-			/* Not yet emptied this buff, so it
-			 * must stay in the queue, for further calls
-			 * but we pull off the data we got until now.
-			 */
-			skb_pull(skb, count_pull);
-#ifdef CONFIG_ISDN_AUDIO
-			ISDN_AUDIO_SKB_LOCK(skb) = 0;
-#endif
-		}
-		dev->drv[di]->rcvcount[channel] -= count_put;
-	}
-	return count;
-}
-
-
-static inline int
-isdn_minor2drv(int minor)
-{
-	return (dev->drvmap[minor]);
-}
-
-static inline int
-isdn_minor2chan(int minor)
-{
-	return (dev->chanmap[minor]);
-}
-
-static char *
-isdn_statstr(void)
-{
-	static char istatbuf[2048];
-	char *p;
-	int i;
-
-	sprintf(istatbuf, "idmap:\t");
-	p = istatbuf + strlen(istatbuf);
-	for (i = 0; i < ISDN_MAX_CHANNELS; i++) {
-		sprintf(p, "%s ", (dev->drvmap[i] < 0) ? "-" : dev->drvid[dev->drvmap[i]]);
-		p = istatbuf + strlen(istatbuf);
-	}
-	sprintf(p, "\nchmap:\t");
-	p = istatbuf + strlen(istatbuf);
-	for (i = 0; i < ISDN_MAX_CHANNELS; i++) {
-		sprintf(p, "%d ", dev->chanmap[i]);
-		p = istatbuf + strlen(istatbuf);
-	}
-	sprintf(p, "\ndrmap:\t");
-	p = istatbuf + strlen(istatbuf);
-	for (i = 0; i < ISDN_MAX_CHANNELS; i++) {
-		sprintf(p, "%d ", dev->drvmap[i]);
-		p = istatbuf + strlen(istatbuf);
-	}
-	sprintf(p, "\nusage:\t");
-	p = istatbuf + strlen(istatbuf);
-	for (i = 0; i < ISDN_MAX_CHANNELS; i++) {
-		sprintf(p, "%d ", dev->usage[i]);
-		p = istatbuf + strlen(istatbuf);
-	}
-	sprintf(p, "\nflags:\t");
-	p = istatbuf + strlen(istatbuf);
-	for (i = 0; i < ISDN_MAX_DRIVERS; i++) {
-		if (dev->drv[i]) {
-			sprintf(p, "%ld ", dev->drv[i]->online);
-			p = istatbuf + strlen(istatbuf);
-		} else {
-			sprintf(p, "? ");
-			p = istatbuf + strlen(istatbuf);
-		}
-	}
-	sprintf(p, "\nphone:\t");
-	p = istatbuf + strlen(istatbuf);
-	for (i = 0; i < ISDN_MAX_CHANNELS; i++) {
-		sprintf(p, "%s ", dev->num[i]);
-		p = istatbuf + strlen(istatbuf);
-	}
-	sprintf(p, "\n");
-	return istatbuf;
-}
-
-/* Module interface-code */
-
-void
-isdn_info_update(void)
-{
-	infostruct *p = dev->infochain;
-
-	while (p) {
-		*(p->private) = 1;
-		p = (infostruct *) p->next;
-	}
-	wake_up_interruptible(&(dev->info_waitq));
-}
-
-static ssize_t
-isdn_read(struct file *file, char __user *buf, size_t count, loff_t *off)
-{
-	uint minor = iminor(file_inode(file));
-	int len = 0;
-	int drvidx;
-	int chidx;
-	int retval;
-	char *p;
-
-	mutex_lock(&isdn_mutex);
-	if (minor == ISDN_MINOR_STATUS) {
-		if (!file->private_data) {
-			if (file->f_flags & O_NONBLOCK) {
-				retval = -EAGAIN;
-				goto out;
-			}
-			wait_event_interruptible(dev->info_waitq,
-						 file->private_data);
-		}
-		p = isdn_statstr();
-		file->private_data = NULL;
-		if ((len = strlen(p)) <= count) {
-			if (copy_to_user(buf, p, len)) {
-				retval = -EFAULT;
-				goto out;
-			}
-			*off += len;
-			retval = len;
-			goto out;
-		}
-		retval = 0;
-		goto out;
-	}
-	if (!dev->drivers) {
-		retval = -ENODEV;
-		goto out;
-	}
-	if (minor <= ISDN_MINOR_BMAX) {
-		printk(KERN_WARNING "isdn_read minor %d obsolete!\n", minor);
-		drvidx = isdn_minor2drv(minor);
-		if (drvidx < 0) {
-			retval = -ENODEV;
-			goto out;
-		}
-		if (!(dev->drv[drvidx]->flags & DRV_FLAG_RUNNING)) {
-			retval = -ENODEV;
-			goto out;
-		}
-		chidx = isdn_minor2chan(minor);
-		if (!(p = kmalloc(count, GFP_KERNEL))) {
-			retval = -ENOMEM;
-			goto out;
-		}
-		len = isdn_readbchan(drvidx, chidx, p, NULL, count,
-				     &dev->drv[drvidx]->rcv_waitq[chidx]);
-		*off += len;
-		if (copy_to_user(buf, p, len))
-			len = -EFAULT;
-		kfree(p);
-		retval = len;
-		goto out;
-	}
-	if (minor <= ISDN_MINOR_CTRLMAX) {
-		drvidx = isdn_minor2drv(minor - ISDN_MINOR_CTRL);
-		if (drvidx < 0) {
-			retval = -ENODEV;
-			goto out;
-		}
-		if (!dev->drv[drvidx]->stavail) {
-			if (file->f_flags & O_NONBLOCK) {
-				retval = -EAGAIN;
-				goto out;
-			}
-			wait_event_interruptible(dev->drv[drvidx]->st_waitq,
-						 dev->drv[drvidx]->stavail);
-		}
-		if (dev->drv[drvidx]->interface->readstat) {
-			if (count > dev->drv[drvidx]->stavail)
-				count = dev->drv[drvidx]->stavail;
-			len = dev->drv[drvidx]->interface->readstat(buf, count,
-								    drvidx, isdn_minor2chan(minor - ISDN_MINOR_CTRL));
-			if (len < 0) {
-				retval = len;
-				goto out;
-			}
-		} else {
-			len = 0;
-		}
-		if (len)
-			dev->drv[drvidx]->stavail -= len;
-		else
-			dev->drv[drvidx]->stavail = 0;
-		*off += len;
-		retval = len;
-		goto out;
-	}
-#ifdef CONFIG_ISDN_PPP
-	if (minor <= ISDN_MINOR_PPPMAX) {
-		retval = isdn_ppp_read(minor - ISDN_MINOR_PPP, file, buf, count);
-		goto out;
-	}
-#endif
-	retval = -ENODEV;
-out:
-	mutex_unlock(&isdn_mutex);
-	return retval;
-}
-
-static ssize_t
-isdn_write(struct file *file, const char __user *buf, size_t count, loff_t *off)
-{
-	uint minor = iminor(file_inode(file));
-	int drvidx;
-	int chidx;
-	int retval;
-
-	if (minor == ISDN_MINOR_STATUS)
-		return -EPERM;
-	if (!dev->drivers)
-		return -ENODEV;
-
-	mutex_lock(&isdn_mutex);
-	if (minor <= ISDN_MINOR_BMAX) {
-		printk(KERN_WARNING "isdn_write minor %d obsolete!\n", minor);
-		drvidx = isdn_minor2drv(minor);
-		if (drvidx < 0) {
-			retval = -ENODEV;
-			goto out;
-		}
-		if (!(dev->drv[drvidx]->flags & DRV_FLAG_RUNNING)) {
-			retval = -ENODEV;
-			goto out;
-		}
-		chidx = isdn_minor2chan(minor);
-		wait_event_interruptible(dev->drv[drvidx]->snd_waitq[chidx],
-			(retval = isdn_writebuf_stub(drvidx, chidx, buf, count)));
-		goto out;
-	}
-	if (minor <= ISDN_MINOR_CTRLMAX) {
-		drvidx = isdn_minor2drv(minor - ISDN_MINOR_CTRL);
-		if (drvidx < 0) {
-			retval = -ENODEV;
-			goto out;
-		}
-		/*
-		 * We want to use the isdnctrl device to load the firmware
-		 *
-		 if (!(dev->drv[drvidx]->flags & DRV_FLAG_RUNNING))
-		 return -ENODEV;
-		*/
-		if (dev->drv[drvidx]->interface->writecmd)
-			retval = dev->drv[drvidx]->interface->
-				writecmd(buf, count, drvidx,
-					 isdn_minor2chan(minor - ISDN_MINOR_CTRL));
-		else
-			retval = count;
-		goto out;
-	}
-#ifdef CONFIG_ISDN_PPP
-	if (minor <= ISDN_MINOR_PPPMAX) {
-		retval = isdn_ppp_write(minor - ISDN_MINOR_PPP, file, buf, count);
-		goto out;
-	}
-#endif
-	retval = -ENODEV;
-out:
-	mutex_unlock(&isdn_mutex);
-	return retval;
-}
-
-static __poll_t
-isdn_poll(struct file *file, poll_table *wait)
-{
-	__poll_t mask = 0;
-	unsigned int minor = iminor(file_inode(file));
-	int drvidx = isdn_minor2drv(minor - ISDN_MINOR_CTRL);
-
-	mutex_lock(&isdn_mutex);
-	if (minor == ISDN_MINOR_STATUS) {
-		poll_wait(file, &(dev->info_waitq), wait);
-		/* mask = EPOLLOUT | EPOLLWRNORM; */
-		if (file->private_data) {
-			mask |= EPOLLIN | EPOLLRDNORM;
-		}
-		goto out;
-	}
-	if (minor >= ISDN_MINOR_CTRL && minor <= ISDN_MINOR_CTRLMAX) {
-		if (drvidx < 0) {
-			/* driver deregistered while file open */
-			mask = EPOLLHUP;
-			goto out;
-		}
-		poll_wait(file, &(dev->drv[drvidx]->st_waitq), wait);
-		mask = EPOLLOUT | EPOLLWRNORM;
-		if (dev->drv[drvidx]->stavail) {
-			mask |= EPOLLIN | EPOLLRDNORM;
-		}
-		goto out;
-	}
-#ifdef CONFIG_ISDN_PPP
-	if (minor <= ISDN_MINOR_PPPMAX) {
-		mask = isdn_ppp_poll(file, wait);
-		goto out;
-	}
-#endif
-	mask = EPOLLERR;
-out:
-	mutex_unlock(&isdn_mutex);
-	return mask;
-}
-
-
-static int
-isdn_ioctl(struct file *file, uint cmd, ulong arg)
-{
-	uint minor = iminor(file_inode(file));
-	isdn_ctrl c;
-	int drvidx;
-	int ret;
-	int i;
-	char __user *p;
-	char *s;
-	union iocpar {
-		char name[10];
-		char bname[22];
-		isdn_ioctl_struct iocts;
-		isdn_net_ioctl_phone phone;
-		isdn_net_ioctl_cfg cfg;
-	} iocpar;
-	void __user *argp = (void __user *)arg;
-
-#define name  iocpar.name
-#define bname iocpar.bname
-#define iocts iocpar.iocts
-#define phone iocpar.phone
-#define cfg   iocpar.cfg
-
-	if (minor == ISDN_MINOR_STATUS) {
-		switch (cmd) {
-		case IIOCGETDVR:
-			return (TTY_DV +
-				(NET_DV << 8) +
-				(INF_DV << 16));
-		case IIOCGETCPS:
-			if (arg) {
-				ulong __user *p = argp;
-				int i;
-				for (i = 0; i < ISDN_MAX_CHANNELS; i++) {
-					put_user(dev->ibytes[i], p++);
-					put_user(dev->obytes[i], p++);
-				}
-				return 0;
-			} else
-				return -EINVAL;
-			break;
-		case IIOCNETGPN:
-			/* Get peer phone number of a connected
-			 * isdn network interface */
-			if (arg) {
-				if (copy_from_user(&phone, argp, sizeof(phone)))
-					return -EFAULT;
-				return isdn_net_getpeer(&phone, argp);
-			} else
-				return -EINVAL;
-		default:
-			return -EINVAL;
-		}
-	}
-	if (!dev->drivers)
-		return -ENODEV;
-	if (minor <= ISDN_MINOR_BMAX) {
-		drvidx = isdn_minor2drv(minor);
-		if (drvidx < 0)
-			return -ENODEV;
-		if (!(dev->drv[drvidx]->flags & DRV_FLAG_RUNNING))
-			return -ENODEV;
-		return 0;
-	}
-	if (minor <= ISDN_MINOR_CTRLMAX) {
-/*
- * isdn net devices manage lots of configuration variables as linked lists.
- * Those lists must only be manipulated from user space. Some of the ioctl's
- * service routines access user space and are not atomic. Therefore, ioctl's
- * manipulating the lists and ioctl's sleeping while accessing the lists
- * are serialized by means of a semaphore.
- */
-		switch (cmd) {
-		case IIOCNETDWRSET:
-			printk(KERN_INFO "INFO: ISDN_DW_ABC_EXTENSION not enabled\n");
-			return (-EINVAL);
-		case IIOCNETLCR:
-			printk(KERN_INFO "INFO: ISDN_ABC_LCR_SUPPORT not enabled\n");
-			return -ENODEV;
-		case IIOCNETAIF:
-			/* Add a network-interface */
-			if (arg) {
-				if (copy_from_user(name, argp, sizeof(name)))
-					return -EFAULT;
-				s = name;
-			} else {
-				s = NULL;
-			}
-			ret = mutex_lock_interruptible(&dev->mtx);
-			if (ret) return ret;
-			if ((s = isdn_net_new(s, NULL))) {
-				if (copy_to_user(argp, s, strlen(s) + 1)) {
-					ret = -EFAULT;
-				} else {
-					ret = 0;
-				}
-			} else
-				ret = -ENODEV;
-			mutex_unlock(&dev->mtx);
-			return ret;
-		case IIOCNETASL:
-			/* Add a slave to a network-interface */
-			if (arg) {
-				if (copy_from_user(bname, argp, sizeof(bname) - 1))
-					return -EFAULT;
-				bname[sizeof(bname)-1] = 0;
-			} else
-				return -EINVAL;
-			ret = mutex_lock_interruptible(&dev->mtx);
-			if (ret) return ret;
-			if ((s = isdn_net_newslave(bname))) {
-				if (copy_to_user(argp, s, strlen(s) + 1)) {
-					ret = -EFAULT;
-				} else {
-					ret = 0;
-				}
-			} else
-				ret = -ENODEV;
-			mutex_unlock(&dev->mtx);
-			return ret;
-		case IIOCNETDIF:
-			/* Delete a network-interface */
-			if (arg) {
-				if (copy_from_user(name, argp, sizeof(name)))
-					return -EFAULT;
-				ret = mutex_lock_interruptible(&dev->mtx);
-				if (ret) return ret;
-				ret = isdn_net_rm(name);
-				mutex_unlock(&dev->mtx);
-				return ret;
-			} else
-				return -EINVAL;
-		case IIOCNETSCF:
-			/* Set configurable parameters of a network-interface */
-			if (arg) {
-				if (copy_from_user(&cfg, argp, sizeof(cfg)))
-					return -EFAULT;
-				return isdn_net_setcfg(&cfg);
-			} else
-				return -EINVAL;
-		case IIOCNETGCF:
-			/* Get configurable parameters of a network-interface */
-			if (arg) {
-				if (copy_from_user(&cfg, argp, sizeof(cfg)))
-					return -EFAULT;
-				if (!(ret = isdn_net_getcfg(&cfg))) {
-					if (copy_to_user(argp, &cfg, sizeof(cfg)))
-						return -EFAULT;
-				}
-				return ret;
-			} else
-				return -EINVAL;
-		case IIOCNETANM:
-			/* Add a phone-number to a network-interface */
-			if (arg) {
-				if (copy_from_user(&phone, argp, sizeof(phone)))
-					return -EFAULT;
-				ret = mutex_lock_interruptible(&dev->mtx);
-				if (ret) return ret;
-				ret = isdn_net_addphone(&phone);
-				mutex_unlock(&dev->mtx);
-				return ret;
-			} else
-				return -EINVAL;
-		case IIOCNETGNM:
-			/* Get list of phone-numbers of a network-interface */
-			if (arg) {
-				if (copy_from_user(&phone, argp, sizeof(phone)))
-					return -EFAULT;
-				ret = mutex_lock_interruptible(&dev->mtx);
-				if (ret) return ret;
-				ret = isdn_net_getphones(&phone, argp);
-				mutex_unlock(&dev->mtx);
-				return ret;
-			} else
-				return -EINVAL;
-		case IIOCNETDNM:
-			/* Delete a phone-number of a network-interface */
-			if (arg) {
-				if (copy_from_user(&phone, argp, sizeof(phone)))
-					return -EFAULT;
-				ret = mutex_lock_interruptible(&dev->mtx);
-				if (ret) return ret;
-				ret = isdn_net_delphone(&phone);
-				mutex_unlock(&dev->mtx);
-				return ret;
-			} else
-				return -EINVAL;
-		case IIOCNETDIL:
-			/* Force dialing of a network-interface */
-			if (arg) {
-				if (copy_from_user(name, argp, sizeof(name)))
-					return -EFAULT;
-				return isdn_net_force_dial(name);
-			} else
-				return -EINVAL;
-#ifdef CONFIG_ISDN_PPP
-		case IIOCNETALN:
-			if (!arg)
-				return -EINVAL;
-			if (copy_from_user(name, argp, sizeof(name)))
-				return -EFAULT;
-			return isdn_ppp_dial_slave(name);
-		case IIOCNETDLN:
-			if (!arg)
-				return -EINVAL;
-			if (copy_from_user(name, argp, sizeof(name)))
-				return -EFAULT;
-			return isdn_ppp_hangup_slave(name);
-#endif
-		case IIOCNETHUP:
-			/* Force hangup of a network-interface */
-			if (!arg)
-				return -EINVAL;
-			if (copy_from_user(name, argp, sizeof(name)))
-				return -EFAULT;
-			return isdn_net_force_hangup(name);
-			break;
-		case IIOCSETVER:
-			dev->net_verbose = arg;
-			printk(KERN_INFO "isdn: Verbose-Level is %d\n", dev->net_verbose);
-			return 0;
-		case IIOCSETGST:
-			if (arg)
-				dev->global_flags |= ISDN_GLOBAL_STOPPED;
-			else
-				dev->global_flags &= ~ISDN_GLOBAL_STOPPED;
-			printk(KERN_INFO "isdn: Global Mode %s\n",
-			       (dev->global_flags & ISDN_GLOBAL_STOPPED) ? "stopped" : "running");
-			return 0;
-		case IIOCSETBRJ:
-			drvidx = -1;
-			if (arg) {
-				int i;
-				char *p;
-				if (copy_from_user(&iocts, argp,
-						   sizeof(isdn_ioctl_struct)))
-					return -EFAULT;
-				iocts.drvid[sizeof(iocts.drvid) - 1] = 0;
-				if (strlen(iocts.drvid)) {
-					if ((p = strchr(iocts.drvid, ',')))
-						*p = 0;
-					drvidx = -1;
-					for (i = 0; i < ISDN_MAX_DRIVERS; i++)
-						if (!(strcmp(dev->drvid[i], iocts.drvid))) {
-							drvidx = i;
-							break;
-						}
-				}
-			}
-			if (drvidx == -1)
-				return -ENODEV;
-			if (iocts.arg)
-				dev->drv[drvidx]->flags |= DRV_FLAG_REJBUS;
-			else
-				dev->drv[drvidx]->flags &= ~DRV_FLAG_REJBUS;
-			return 0;
-		case IIOCSIGPRF:
-			dev->profd = current;
-			return 0;
-			break;
-		case IIOCGETPRF:
-			/* Get all Modem-Profiles */
-			if (arg) {
-				char __user *p = argp;
-				int i;
-
-				for (i = 0; i < ISDN_MAX_CHANNELS; i++) {
-					if (copy_to_user(p, dev->mdm.info[i].emu.profile,
-							 ISDN_MODEM_NUMREG))
-						return -EFAULT;
-					p += ISDN_MODEM_NUMREG;
-					if (copy_to_user(p, dev->mdm.info[i].emu.pmsn, ISDN_MSNLEN))
-						return -EFAULT;
-					p += ISDN_MSNLEN;
-					if (copy_to_user(p, dev->mdm.info[i].emu.plmsn, ISDN_LMSNLEN))
-						return -EFAULT;
-					p += ISDN_LMSNLEN;
-				}
-				return (ISDN_MODEM_NUMREG + ISDN_MSNLEN + ISDN_LMSNLEN) * ISDN_MAX_CHANNELS;
-			} else
-				return -EINVAL;
-			break;
-		case IIOCSETPRF:
-			/* Set all Modem-Profiles */
-			if (arg) {
-				char __user *p = argp;
-				int i;
-
-				for (i = 0; i < ISDN_MAX_CHANNELS; i++) {
-					if (copy_from_user(dev->mdm.info[i].emu.profile, p,
-							   ISDN_MODEM_NUMREG))
-						return -EFAULT;
-					p += ISDN_MODEM_NUMREG;
-					if (copy_from_user(dev->mdm.info[i].emu.plmsn, p, ISDN_LMSNLEN))
-						return -EFAULT;
-					p += ISDN_LMSNLEN;
-					if (copy_from_user(dev->mdm.info[i].emu.pmsn, p, ISDN_MSNLEN))
-						return -EFAULT;
-					p += ISDN_MSNLEN;
-				}
-				return 0;
-			} else
-				return -EINVAL;
-			break;
-		case IIOCSETMAP:
-		case IIOCGETMAP:
-			/* Set/Get MSN->EAZ-Mapping for a driver */
-			if (arg) {
-
-				if (copy_from_user(&iocts, argp,
-						   sizeof(isdn_ioctl_struct)))
-					return -EFAULT;
-				iocts.drvid[sizeof(iocts.drvid) - 1] = 0;
-				if (strlen(iocts.drvid)) {
-					drvidx = -1;
-					for (i = 0; i < ISDN_MAX_DRIVERS; i++)
-						if (!(strcmp(dev->drvid[i], iocts.drvid))) {
-							drvidx = i;
-							break;
-						}
-				} else
-					drvidx = 0;
-				if (drvidx == -1)
-					return -ENODEV;
-				if (cmd == IIOCSETMAP) {
-					int loop = 1;
-
-					p = (char __user *) iocts.arg;
-					i = 0;
-					while (loop) {
-						int j = 0;
-
-						while (1) {
-							get_user(bname[j], p++);
-							switch (bname[j]) {
-							case '\0':
-								loop = 0;
-								/* Fall through */
-							case ',':
-								bname[j] = '\0';
-								strcpy(dev->drv[drvidx]->msn2eaz[i], bname);
-								j = ISDN_MSNLEN;
-								break;
-							default:
-								j++;
-							}
-							if (j >= ISDN_MSNLEN)
-								break;
-						}
-						if (++i > 9)
-							break;
-					}
-				} else {
-					p = (char __user *) iocts.arg;
-					for (i = 0; i < 10; i++) {
-						snprintf(bname, sizeof(bname), "%s%s",
-							 strlen(dev->drv[drvidx]->msn2eaz[i]) ?
-							 dev->drv[drvidx]->msn2eaz[i] : "_",
-							 (i < 9) ? "," : "\0");
-						if (copy_to_user(p, bname, strlen(bname) + 1))
-							return -EFAULT;
-						p += strlen(bname);
-					}
-				}
-				return 0;
-			} else
-				return -EINVAL;
-		case IIOCDBGVAR:
-			return -EINVAL;
-		default:
-			if ((cmd & IIOCDRVCTL) == IIOCDRVCTL)
-				cmd = ((cmd >> _IOC_NRSHIFT) & _IOC_NRMASK) & ISDN_DRVIOCTL_MASK;
-			else
-				return -EINVAL;
-			if (arg) {
-				int i;
-				char *p;
-				if (copy_from_user(&iocts, argp, sizeof(isdn_ioctl_struct)))
-					return -EFAULT;
-				iocts.drvid[sizeof(iocts.drvid) - 1] = 0;
-				if (strlen(iocts.drvid)) {
-					if ((p = strchr(iocts.drvid, ',')))
-						*p = 0;
-					drvidx = -1;
-					for (i = 0; i < ISDN_MAX_DRIVERS; i++)
-						if (!(strcmp(dev->drvid[i], iocts.drvid))) {
-							drvidx = i;
-							break;
-						}
-				} else
-					drvidx = 0;
-				if (drvidx == -1)
-					return -ENODEV;
-				c.driver = drvidx;
-				c.command = ISDN_CMD_IOCTL;
-				c.arg = cmd;
-				memcpy(c.parm.num, &iocts.arg, sizeof(ulong));
-				ret = isdn_command(&c);
-				memcpy(&iocts.arg, c.parm.num, sizeof(ulong));
-				if (copy_to_user(argp, &iocts, sizeof(isdn_ioctl_struct)))
-					return -EFAULT;
-				return ret;
-			} else
-				return -EINVAL;
-		}
-	}
-#ifdef CONFIG_ISDN_PPP
-	if (minor <= ISDN_MINOR_PPPMAX)
-		return (isdn_ppp_ioctl(minor - ISDN_MINOR_PPP, file, cmd, arg));
-#endif
-	return -ENODEV;
-
-#undef name
-#undef bname
-#undef iocts
-#undef phone
-#undef cfg
-}
-
-static long
-isdn_unlocked_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
-{
-	int ret;
-
-	mutex_lock(&isdn_mutex);
-	ret = isdn_ioctl(file, cmd, arg);
-	mutex_unlock(&isdn_mutex);
-
-	return ret;
-}
-
-/*
- * Open the device code.
- */
-static int
-isdn_open(struct inode *ino, struct file *filep)
-{
-	uint minor = iminor(ino);
-	int drvidx;
-	int chidx;
-	int retval = -ENODEV;
-
-	mutex_lock(&isdn_mutex);
-	if (minor == ISDN_MINOR_STATUS) {
-		infostruct *p;
-
-		if ((p = kmalloc(sizeof(infostruct), GFP_KERNEL))) {
-			p->next = (char *) dev->infochain;
-			p->private = (char *) &(filep->private_data);
-			dev->infochain = p;
-			/* At opening we allow a single update */
-			filep->private_data = (char *) 1;
-			retval = 0;
-			goto out;
-		} else {
-			retval = -ENOMEM;
-			goto out;
-		}
-	}
-	if (!dev->channels)
-		goto out;
-	if (minor <= ISDN_MINOR_BMAX) {
-		printk(KERN_WARNING "isdn_open minor %d obsolete!\n", minor);
-		drvidx = isdn_minor2drv(minor);
-		if (drvidx < 0)
-			goto out;
-		chidx = isdn_minor2chan(minor);
-		if (!(dev->drv[drvidx]->flags & DRV_FLAG_RUNNING))
-			goto out;
-		if (!(dev->drv[drvidx]->online & (1 << chidx)))
-			goto out;
-		isdn_lock_drivers();
-		retval = 0;
-		goto out;
-	}
-	if (minor <= ISDN_MINOR_CTRLMAX) {
-		drvidx = isdn_minor2drv(minor - ISDN_MINOR_CTRL);
-		if (drvidx < 0)
-			goto out;
-		isdn_lock_drivers();
-		retval = 0;
-		goto out;
-	}
-#ifdef CONFIG_ISDN_PPP
-	if (minor <= ISDN_MINOR_PPPMAX) {
-		retval = isdn_ppp_open(minor - ISDN_MINOR_PPP, filep);
-		if (retval == 0)
-			isdn_lock_drivers();
-		goto out;
-	}
-#endif
-out:
-	nonseekable_open(ino, filep);
-	mutex_unlock(&isdn_mutex);
-	return retval;
-}
-
-static int
-isdn_close(struct inode *ino, struct file *filep)
-{
-	uint minor = iminor(ino);
-
-	mutex_lock(&isdn_mutex);
-	if (minor == ISDN_MINOR_STATUS) {
-		infostruct *p = dev->infochain;
-		infostruct *q = NULL;
-
-		while (p) {
-			if (p->private == (char *) &(filep->private_data)) {
-				if (q)
-					q->next = p->next;
-				else
-					dev->infochain = (infostruct *) (p->next);
-				kfree(p);
-				goto out;
-			}
-			q = p;
-			p = (infostruct *) (p->next);
-		}
-		printk(KERN_WARNING "isdn: No private data while closing isdnctrl\n");
-		goto out;
-	}
-	isdn_unlock_drivers();
-	if (minor <= ISDN_MINOR_BMAX)
-		goto out;
-	if (minor <= ISDN_MINOR_CTRLMAX) {
-		if (dev->profd == current)
-			dev->profd = NULL;
-		goto out;
-	}
-#ifdef CONFIG_ISDN_PPP
-	if (minor <= ISDN_MINOR_PPPMAX)
-		isdn_ppp_release(minor - ISDN_MINOR_PPP, filep);
-#endif
-
-out:
-	mutex_unlock(&isdn_mutex);
-	return 0;
-}
-
-static const struct file_operations isdn_fops =
-{
-	.owner		= THIS_MODULE,
-	.llseek		= no_llseek,
-	.read		= isdn_read,
-	.write		= isdn_write,
-	.poll		= isdn_poll,
-	.unlocked_ioctl	= isdn_unlocked_ioctl,
-	.open		= isdn_open,
-	.release	= isdn_close,
-};
-
-char *
-isdn_map_eaz2msn(char *msn, int di)
-{
-	isdn_driver_t *this = dev->drv[di];
-	int i;
-
-	if (strlen(msn) == 1) {
-		i = msn[0] - '0';
-		if ((i >= 0) && (i <= 9))
-			if (strlen(this->msn2eaz[i]))
-				return (this->msn2eaz[i]);
-	}
-	return (msn);
-}
-
-/*
- * Find an unused ISDN-channel, whose feature-flags match the
- * given L2- and L3-protocols.
- */
-#define L2V (~(ISDN_FEATURE_L2_V11096 | ISDN_FEATURE_L2_V11019 | ISDN_FEATURE_L2_V11038))
-
-/*
- * This function must be called with holding the dev->lock.
- */
-int
-isdn_get_free_channel(int usage, int l2_proto, int l3_proto, int pre_dev
-		      , int pre_chan, char *msn)
-{
-	int i;
-	ulong features;
-	ulong vfeatures;
-
-	features = ((1 << l2_proto) | (0x10000 << l3_proto));
-	vfeatures = (((1 << l2_proto) | (0x10000 << l3_proto)) &
-		     ~(ISDN_FEATURE_L2_V11096 | ISDN_FEATURE_L2_V11019 | ISDN_FEATURE_L2_V11038));
-	/* If Layer-2 protocol is V.110, accept drivers with
-	 * transparent feature even if these don't support V.110
-	 * because we can emulate this in linklevel.
-	 */
-	for (i = 0; i < ISDN_MAX_CHANNELS; i++)
-		if (USG_NONE(dev->usage[i]) &&
-		    (dev->drvmap[i] != -1)) {
-			int d = dev->drvmap[i];
-			if ((dev->usage[i] & ISDN_USAGE_EXCLUSIVE) &&
-			    ((pre_dev != d) || (pre_chan != dev->chanmap[i])))
-				continue;
-			if (!strcmp(isdn_map_eaz2msn(msn, d), "-"))
-				continue;
-			if (dev->usage[i] & ISDN_USAGE_DISABLED)
-				continue; /* usage not allowed */
-			if (dev->drv[d]->flags & DRV_FLAG_RUNNING) {
-				if (((dev->drv[d]->interface->features & features) == features) ||
-				    (((dev->drv[d]->interface->features & vfeatures) == vfeatures) &&
-				     (dev->drv[d]->interface->features & ISDN_FEATURE_L2_TRANS))) {
-					if ((pre_dev < 0) || (pre_chan < 0)) {
-						dev->usage[i] &= ISDN_USAGE_EXCLUSIVE;
-						dev->usage[i] |= usage;
-						isdn_info_update();
-						return i;
-					} else {
-						if ((pre_dev == d) && (pre_chan == dev->chanmap[i])) {
-							dev->usage[i] &= ISDN_USAGE_EXCLUSIVE;
-							dev->usage[i] |= usage;
-							isdn_info_update();
-							return i;
-						}
-					}
-				}
-			}
-		}
-	return -1;
-}
-
-/*
- * Set state of ISDN-channel to 'unused'
- */
-void
-isdn_free_channel(int di, int ch, int usage)
-{
-	int i;
-
-	if ((di < 0) || (ch < 0)) {
-		printk(KERN_WARNING "%s: called with invalid drv(%d) or channel(%d)\n",
-		       __func__, di, ch);
-		return;
-	}
-	for (i = 0; i < ISDN_MAX_CHANNELS; i++)
-		if (((!usage) || ((dev->usage[i] & ISDN_USAGE_MASK) == usage)) &&
-		    (dev->drvmap[i] == di) &&
-		    (dev->chanmap[i] == ch)) {
-			dev->usage[i] &= (ISDN_USAGE_NONE | ISDN_USAGE_EXCLUSIVE);
-			strcpy(dev->num[i], "???");
-			dev->ibytes[i] = 0;
-			dev->obytes[i] = 0;
-// 20.10.99 JIM, try to reinitialize v110 !
-			dev->v110emu[i] = 0;
-			atomic_set(&(dev->v110use[i]), 0);
-			isdn_v110_close(dev->v110[i]);
-			dev->v110[i] = NULL;
-// 20.10.99 JIM, try to reinitialize v110 !
-			isdn_info_update();
-			if (dev->drv[di])
-				skb_queue_purge(&dev->drv[di]->rpqueue[ch]);
-		}
-}
-
-/*
- * Cancel Exclusive-Flag for ISDN-channel
- */
-void
-isdn_unexclusive_channel(int di, int ch)
-{
-	int i;
-
-	for (i = 0; i < ISDN_MAX_CHANNELS; i++)
-		if ((dev->drvmap[i] == di) &&
-		    (dev->chanmap[i] == ch)) {
-			dev->usage[i] &= ~ISDN_USAGE_EXCLUSIVE;
-			isdn_info_update();
-			return;
-		}
-}
-
-/*
- *  writebuf replacement for SKB_ABLE drivers
- */
-static int
-isdn_writebuf_stub(int drvidx, int chan, const u_char __user *buf, int len)
-{
-	int ret;
-	int hl = dev->drv[drvidx]->interface->hl_hdrlen;
-	struct sk_buff *skb = alloc_skb(hl + len, GFP_ATOMIC);
-
-	if (!skb)
-		return -ENOMEM;
-	skb_reserve(skb, hl);
-	if (copy_from_user(skb_put(skb, len), buf, len)) {
-		dev_kfree_skb(skb);
-		return -EFAULT;
-	}
-	ret = dev->drv[drvidx]->interface->writebuf_skb(drvidx, chan, 1, skb);
-	if (ret <= 0)
-		dev_kfree_skb(skb);
-	if (ret > 0)
-		dev->obytes[isdn_dc2minor(drvidx, chan)] += ret;
-	return ret;
-}
-
-/*
- * Return: length of data on success, -ERRcode on failure.
- */
-int
-isdn_writebuf_skb_stub(int drvidx, int chan, int ack, struct sk_buff *skb)
-{
-	int ret;
-	struct sk_buff *nskb = NULL;
-	int v110_ret = skb->len;
-	int idx = isdn_dc2minor(drvidx, chan);
-
-	if (dev->v110[idx]) {
-		atomic_inc(&dev->v110use[idx]);
-		nskb = isdn_v110_encode(dev->v110[idx], skb);
-		atomic_dec(&dev->v110use[idx]);
-		if (!nskb)
-			return 0;
-		v110_ret = *((int *)nskb->data);
-		skb_pull(nskb, sizeof(int));
-		if (!nskb->len) {
-			dev_kfree_skb(nskb);
-			return v110_ret;
-		}
-		/* V.110 must always be acknowledged */
-		ack = 1;
-		ret = dev->drv[drvidx]->interface->writebuf_skb(drvidx, chan, ack, nskb);
-	} else {
-		int hl = dev->drv[drvidx]->interface->hl_hdrlen;
-
-		if (skb_headroom(skb) < hl) {
-			/*
-			 * This should only occur when new HL driver with
-			 * increased hl_hdrlen was loaded after netdevice
-			 * was created and connected to the new driver.
-			 *
-			 * The V.110 branch (re-allocates on its own) does
-			 * not need this
-			 */
-			struct sk_buff *skb_tmp;
-
-			skb_tmp = skb_realloc_headroom(skb, hl);
-			printk(KERN_DEBUG "isdn_writebuf_skb_stub: reallocating headroom%s\n", skb_tmp ? "" : " failed");
-			if (!skb_tmp) return -ENOMEM; /* 0 better? */
-			ret = dev->drv[drvidx]->interface->writebuf_skb(drvidx, chan, ack, skb_tmp);
-			if (ret > 0) {
-				dev_kfree_skb(skb);
-			} else {
-				dev_kfree_skb(skb_tmp);
-			}
-		} else {
-			ret = dev->drv[drvidx]->interface->writebuf_skb(drvidx, chan, ack, skb);
-		}
-	}
-	if (ret > 0) {
-		dev->obytes[idx] += ret;
-		if (dev->v110[idx]) {
-			atomic_inc(&dev->v110use[idx]);
-			dev->v110[idx]->skbuser++;
-			atomic_dec(&dev->v110use[idx]);
-			/* For V.110 return unencoded data length */
-			ret = v110_ret;
-			/* if the complete frame was send we free the skb;
-			   if not upper function will requeue the skb */
-			if (ret == skb->len)
-				dev_kfree_skb(skb);
-		}
-	} else
-		if (dev->v110[idx])
-			dev_kfree_skb(nskb);
-	return ret;
-}
-
-static int
-isdn_add_channels(isdn_driver_t *d, int drvidx, int n, int adding)
-{
-	int j, k, m;
-
-	init_waitqueue_head(&d->st_waitq);
-	if (d->flags & DRV_FLAG_RUNNING)
-		return -1;
-	if (n < 1) return 0;
-
-	m = (adding) ? d->channels + n : n;
-
-	if (dev->channels + n > ISDN_MAX_CHANNELS) {
-		printk(KERN_WARNING "register_isdn: Max. %d channels supported\n",
-		       ISDN_MAX_CHANNELS);
-		return -1;
-	}
-
-	if ((adding) && (d->rcverr))
-		kfree(d->rcverr);
-	if (!(d->rcverr = kcalloc(m, sizeof(int), GFP_ATOMIC))) {
-		printk(KERN_WARNING "register_isdn: Could not alloc rcverr\n");
-		return -1;
-	}
-
-	if ((adding) && (d->rcvcount))
-		kfree(d->rcvcount);
-	if (!(d->rcvcount = kcalloc(m, sizeof(int), GFP_ATOMIC))) {
-		printk(KERN_WARNING "register_isdn: Could not alloc rcvcount\n");
-		if (!adding)
-			kfree(d->rcverr);
-		return -1;
-	}
-
-	if ((adding) && (d->rpqueue)) {
-		for (j = 0; j < d->channels; j++)
-			skb_queue_purge(&d->rpqueue[j]);
-		kfree(d->rpqueue);
-	}
-	d->rpqueue = kmalloc_array(m, sizeof(struct sk_buff_head), GFP_ATOMIC);
-	if (!d->rpqueue) {
-		printk(KERN_WARNING "register_isdn: Could not alloc rpqueue\n");
-		if (!adding) {
-			kfree(d->rcvcount);
-			kfree(d->rcverr);
-		}
-		return -1;
-	}
-	for (j = 0; j < m; j++) {
-		skb_queue_head_init(&d->rpqueue[j]);
-	}
-
-	if ((adding) && (d->rcv_waitq))
-		kfree(d->rcv_waitq);
-	d->rcv_waitq = kmalloc(array3_size(sizeof(wait_queue_head_t), 2, m),
-			       GFP_ATOMIC);
-	if (!d->rcv_waitq) {
-		printk(KERN_WARNING "register_isdn: Could not alloc rcv_waitq\n");
-		if (!adding) {
-			kfree(d->rpqueue);
-			kfree(d->rcvcount);
-			kfree(d->rcverr);
-		}
-		return -1;
-	}
-	d->snd_waitq = d->rcv_waitq + m;
-	for (j = 0; j < m; j++) {
-		init_waitqueue_head(&d->rcv_waitq[j]);
-		init_waitqueue_head(&d->snd_waitq[j]);
-	}
-
-	dev->channels += n;
-	for (j = d->channels; j < m; j++)
-		for (k = 0; k < ISDN_MAX_CHANNELS; k++)
-			if (dev->chanmap[k] < 0) {
-				dev->chanmap[k] = j;
-				dev->drvmap[k] = drvidx;
-				break;
-			}
-	d->channels = m;
-	return 0;
-}
-
-/*
- * Low-level-driver registration
- */
-
-static void
-set_global_features(void)
-{
-	int drvidx;
-
-	dev->global_features = 0;
-	for (drvidx = 0; drvidx < ISDN_MAX_DRIVERS; drvidx++) {
-		if (!dev->drv[drvidx])
-			continue;
-		if (dev->drv[drvidx]->interface)
-			dev->global_features |= dev->drv[drvidx]->interface->features;
-	}
-}
-
-#ifdef CONFIG_ISDN_DIVERSION
-
-static char *map_drvname(int di)
-{
-	if ((di < 0) || (di >= ISDN_MAX_DRIVERS))
-		return (NULL);
-	return (dev->drvid[di]); /* driver name */
-} /* map_drvname */
-
-static int map_namedrv(char *id)
-{  int i;
-
-	for (i = 0; i < ISDN_MAX_DRIVERS; i++)
-	{ if (!strcmp(dev->drvid[i], id))
-			return (i);
-	}
-	return (-1);
-} /* map_namedrv */
-
-int DIVERT_REG_NAME(isdn_divert_if *i_div)
-{
-	if (i_div->if_magic != DIVERT_IF_MAGIC)
-		return (DIVERT_VER_ERR);
-	switch (i_div->cmd)
-	{
-	case DIVERT_CMD_REL:
-		if (divert_if != i_div)
-			return (DIVERT_REL_ERR);
-		divert_if = NULL; /* free interface */
-		return (DIVERT_NO_ERR);
-
-	case DIVERT_CMD_REG:
-		if (divert_if)
-			return (DIVERT_REG_ERR);
-		i_div->ll_cmd = isdn_command; /* set command function */
-		i_div->drv_to_name = map_drvname;
-		i_div->name_to_drv = map_namedrv;
-		divert_if = i_div; /* remember interface */
-		return (DIVERT_NO_ERR);
-
-	default:
-		return (DIVERT_CMD_ERR);
-	}
-} /* DIVERT_REG_NAME */
-
-EXPORT_SYMBOL(DIVERT_REG_NAME);
-
-#endif /* CONFIG_ISDN_DIVERSION */
-
-
-EXPORT_SYMBOL(register_isdn);
-#ifdef CONFIG_ISDN_PPP
-EXPORT_SYMBOL(isdn_ppp_register_compressor);
-EXPORT_SYMBOL(isdn_ppp_unregister_compressor);
-#endif
-
-int
-register_isdn(isdn_if *i)
-{
-	isdn_driver_t *d;
-	int j;
-	ulong flags;
-	int drvidx;
-
-	if (dev->drivers >= ISDN_MAX_DRIVERS) {
-		printk(KERN_WARNING "register_isdn: Max. %d drivers supported\n",
-		       ISDN_MAX_DRIVERS);
-		return 0;
-	}
-	if (!i->writebuf_skb) {
-		printk(KERN_WARNING "register_isdn: No write routine given.\n");
-		return 0;
-	}
-	if (!(d = kzalloc(sizeof(isdn_driver_t), GFP_KERNEL))) {
-		printk(KERN_WARNING "register_isdn: Could not alloc driver-struct\n");
-		return 0;
-	}
-
-	d->maxbufsize = i->maxbufsize;
-	d->pktcount = 0;
-	d->stavail = 0;
-	d->flags = DRV_FLAG_LOADED;
-	d->online = 0;
-	d->interface = i;
-	d->channels = 0;
-	spin_lock_irqsave(&dev->lock, flags);
-	for (drvidx = 0; drvidx < ISDN_MAX_DRIVERS; drvidx++)
-		if (!dev->drv[drvidx])
-			break;
-	if (isdn_add_channels(d, drvidx, i->channels, 0)) {
-		spin_unlock_irqrestore(&dev->lock, flags);
-		kfree(d);
-		return 0;
-	}
-	i->channels = drvidx;
-	i->rcvcallb_skb = isdn_receive_skb_callback;
-	i->statcallb = isdn_status_callback;
-	if (!strlen(i->id))
-		sprintf(i->id, "line%d", drvidx);
-	for (j = 0; j < drvidx; j++)
-		if (!strcmp(i->id, dev->drvid[j]))
-			sprintf(i->id, "line%d", drvidx);
-	dev->drv[drvidx] = d;
-	strcpy(dev->drvid[drvidx], i->id);
-	isdn_info_update();
-	dev->drivers++;
-	set_global_features();
-	spin_unlock_irqrestore(&dev->lock, flags);
-	return 1;
-}
-
-/*
-*****************************************************************************
-* And now the modules code.
-*****************************************************************************
-*/
-
-static char *
-isdn_getrev(const char *revision)
-{
-	char *rev;
-	char *p;
-
-	if ((p = strchr(revision, ':'))) {
-		rev = p + 2;
-		p = strchr(rev, '$');
-		*--p = 0;
-	} else
-		rev = "???";
-	return rev;
-}
-
-/*
- * Allocate and initialize all data, register modem-devices
- */
-static int __init isdn_init(void)
-{
-	int i;
-	char tmprev[50];
-
-	dev = vzalloc(sizeof(isdn_dev));
-	if (!dev) {
-		printk(KERN_WARNING "isdn: Could not allocate device-struct.\n");
-		return -EIO;
-	}
-	timer_setup(&dev->timer, isdn_timer_funct, 0);
-	spin_lock_init(&dev->lock);
-	spin_lock_init(&dev->timerlock);
-#ifdef MODULE
-	dev->owner = THIS_MODULE;
-#endif
-	mutex_init(&dev->mtx);
-	init_waitqueue_head(&dev->info_waitq);
-	for (i = 0; i < ISDN_MAX_CHANNELS; i++) {
-		dev->drvmap[i] = -1;
-		dev->chanmap[i] = -1;
-		dev->m_idx[i] = -1;
-		strcpy(dev->num[i], "???");
-	}
-	if (register_chrdev(ISDN_MAJOR, "isdn", &isdn_fops)) {
-		printk(KERN_WARNING "isdn: Could not register control devices\n");
-		vfree(dev);
-		return -EIO;
-	}
-	if ((isdn_tty_modem_init()) < 0) {
-		printk(KERN_WARNING "isdn: Could not register tty devices\n");
-		vfree(dev);
-		unregister_chrdev(ISDN_MAJOR, "isdn");
-		return -EIO;
-	}
-#ifdef CONFIG_ISDN_PPP
-	if (isdn_ppp_init() < 0) {
-		printk(KERN_WARNING "isdn: Could not create PPP-device-structs\n");
-		isdn_tty_exit();
-		unregister_chrdev(ISDN_MAJOR, "isdn");
-		vfree(dev);
-		return -EIO;
-	}
-#endif                          /* CONFIG_ISDN_PPP */
-
-	strcpy(tmprev, isdn_revision);
-	printk(KERN_NOTICE "ISDN subsystem Rev: %s/", isdn_getrev(tmprev));
-	strcpy(tmprev, isdn_net_revision);
-	printk("%s/", isdn_getrev(tmprev));
-	strcpy(tmprev, isdn_ppp_revision);
-	printk("%s/", isdn_getrev(tmprev));
-	strcpy(tmprev, isdn_audio_revision);
-	printk("%s/", isdn_getrev(tmprev));
-	strcpy(tmprev, isdn_v110_revision);
-	printk("%s", isdn_getrev(tmprev));
-
-#ifdef MODULE
-	printk(" loaded\n");
-#else
-	printk("\n");
-#endif
-	isdn_info_update();
-	return 0;
-}
-
-/*
- * Unload module
- */
-static void __exit isdn_exit(void)
-{
-#ifdef CONFIG_ISDN_PPP
-	isdn_ppp_cleanup();
-#endif
-	if (isdn_net_rmall() < 0) {
-		printk(KERN_WARNING "isdn: net-device busy, remove cancelled\n");
-		return;
-	}
-	isdn_tty_exit();
-	unregister_chrdev(ISDN_MAJOR, "isdn");
-	del_timer_sync(&dev->timer);
-	/* call vfree with interrupts enabled, else it will hang */
-	vfree(dev);
-	printk(KERN_NOTICE "ISDN-subsystem unloaded\n");
-}
-
-module_init(isdn_init);
-module_exit(isdn_exit);
diff --git a/drivers/isdn/i4l/isdn_common.h b/drivers/isdn/i4l/isdn_common.h
deleted file mode 100644
index 2260ef07ab9c..000000000000
--- a/drivers/isdn/i4l/isdn_common.h
+++ /dev/null
@@ -1,47 +0,0 @@
-/* $Id: isdn_common.h,v 1.1.2.2 2004/01/12 22:37:19 keil Exp $
- *
- * header for Linux ISDN subsystem
- * common used functions and debugging-switches (linklevel).
- *
- * Copyright 1994-1999  by Fritz Elfert (fritz@isdn4linux.de)
- * Copyright 1995,96    by Thinking Objects Software GmbH Wuerzburg
- * Copyright 1995,96    by Michael Hipp (Michael.Hipp@student.uni-tuebingen.de)
- *
- * This software may be used and distributed according to the terms
- * of the GNU General Public License, incorporated herein by reference.
- *
- */
-
-#undef  ISDN_DEBUG_MODEM_OPEN
-#undef  ISDN_DEBUG_MODEM_IOCTL
-#undef  ISDN_DEBUG_MODEM_WAITSENT
-#undef  ISDN_DEBUG_MODEM_HUP
-#undef  ISDN_DEBUG_MODEM_ICALL
-#undef  ISDN_DEBUG_MODEM_DUMP
-#undef  ISDN_DEBUG_MODEM_VOICE
-#undef  ISDN_DEBUG_AT
-#undef  ISDN_DEBUG_NET_DUMP
-#undef  ISDN_DEBUG_NET_DIAL
-#undef  ISDN_DEBUG_NET_ICALL
-
-/* Prototypes */
-extern void isdn_lock_drivers(void);
-extern void isdn_unlock_drivers(void);
-extern void isdn_free_channel(int di, int ch, int usage);
-extern void isdn_all_eaz(int di, int ch);
-extern int isdn_command(isdn_ctrl *);
-extern int isdn_dc2minor(int di, int ch);
-extern void isdn_info_update(void);
-extern char *isdn_map_eaz2msn(char *msn, int di);
-extern void isdn_timer_ctrl(int tf, int onoff);
-extern void isdn_unexclusive_channel(int di, int ch);
-extern int isdn_getnum(char **);
-extern int isdn_readbchan(int, int, u_char *, u_char *, int, wait_queue_head_t *);
-extern int isdn_readbchan_tty(int, int, struct tty_port *, int);
-extern int isdn_get_free_channel(int, int, int, int, int, char *);
-extern int isdn_writebuf_skb_stub(int, int, int, struct sk_buff *);
-extern int register_isdn(isdn_if *i);
-extern int isdn_msncmp(const char *,  const char *);
-#if defined(ISDN_DEBUG_NET_DUMP) || defined(ISDN_DEBUG_MODEM_DUMP)
-extern void isdn_dumppkt(char *, u_char *, int, int);
-#endif
diff --git a/drivers/isdn/i4l/isdn_concap.c b/drivers/isdn/i4l/isdn_concap.c
deleted file mode 100644
index 336523ec077c..000000000000
--- a/drivers/isdn/i4l/isdn_concap.c
+++ /dev/null
@@ -1,99 +0,0 @@
-/* $Id: isdn_concap.c,v 1.1.2.2 2004/01/12 22:37:19 keil Exp $
- *
- * Linux ISDN subsystem, protocol encapsulation
- *
- * This software may be used and distributed according to the terms
- * of the GNU General Public License, incorporated herein by reference.
- *
- */
-
-/* Stuff to support the concap_proto by isdn4linux. isdn4linux - specific
- * stuff goes here. Stuff that depends only on the concap protocol goes to
- * another -- protocol specific -- source file.
- *
- */
-
-
-#include <linux/isdn.h>
-#include "isdn_x25iface.h"
-#include "isdn_net.h"
-#include <linux/concap.h>
-#include "isdn_concap.h"
-
-
-/* The following set of device service operations are for encapsulation
-   protocols that require for reliable datalink semantics. That means:
-
-   - before any data is to be submitted the connection must explicitly
-   be set up.
-   - after the successful set up of the connection is signalled the
-   connection is considered to be reliably up.
-
-   Auto-dialing ist not compatible with this requirements. Thus, auto-dialing
-   is completely bypassed.
-
-   It might be possible to implement a (non standardized) datalink protocol
-   that provides a reliable data link service while using some auto dialing
-   mechanism. Such a protocol would need an auxiliary channel (i.e. user-user-
-   signaling on the D-channel) while the B-channel is down.
-*/
-
-
-static int isdn_concap_dl_data_req(struct concap_proto *concap, struct sk_buff *skb)
-{
-	struct net_device *ndev = concap->net_dev;
-	isdn_net_dev *nd = ((isdn_net_local *) netdev_priv(ndev))->netdev;
-	isdn_net_local *lp = isdn_net_get_locked_lp(nd);
-
-	IX25DEBUG("isdn_concap_dl_data_req: %s \n", concap->net_dev->name);
-	if (!lp) {
-		IX25DEBUG("isdn_concap_dl_data_req: %s : isdn_net_send_skb returned %d\n", concap->net_dev->name, 1);
-		return 1;
-	}
-	lp->huptimer = 0;
-	isdn_net_writebuf_skb(lp, skb);
-	spin_unlock_bh(&lp->xmit_lock);
-	IX25DEBUG("isdn_concap_dl_data_req: %s : isdn_net_send_skb returned %d\n", concap->net_dev->name, 0);
-	return 0;
-}
-
-
-static int isdn_concap_dl_connect_req(struct concap_proto *concap)
-{
-	struct net_device *ndev = concap->net_dev;
-	isdn_net_local *lp = netdev_priv(ndev);
-	int ret;
-	IX25DEBUG("isdn_concap_dl_connect_req: %s \n", ndev->name);
-
-	/* dial ... */
-	ret = isdn_net_dial_req(lp);
-	if (ret) IX25DEBUG("dialing failed\n");
-	return ret;
-}
-
-static int isdn_concap_dl_disconn_req(struct concap_proto *concap)
-{
-	IX25DEBUG("isdn_concap_dl_disconn_req: %s \n", concap->net_dev->name);
-
-	isdn_net_hangup(concap->net_dev);
-	return 0;
-}
-
-struct concap_device_ops isdn_concap_reliable_dl_dops = {
-	.data_req = &isdn_concap_dl_data_req,
-	.connect_req = &isdn_concap_dl_connect_req,
-	.disconn_req = &isdn_concap_dl_disconn_req
-};
-
-/* The following should better go into a dedicated source file such that
-   this sourcefile does not need to include any protocol specific header
-   files. For now:
-*/
-struct concap_proto *isdn_concap_new(int encap)
-{
-	switch (encap) {
-	case ISDN_NET_ENCAP_X25IFACE:
-		return isdn_x25iface_proto_new();
-	}
-	return NULL;
-}
diff --git a/drivers/isdn/i4l/isdn_concap.h b/drivers/isdn/i4l/isdn_concap.h
deleted file mode 100644
index cd7e3ba74e25..000000000000
--- a/drivers/isdn/i4l/isdn_concap.h
+++ /dev/null
@@ -1,11 +0,0 @@
-/* $Id: isdn_concap.h,v 1.1.2.2 2004/01/12 22:37:19 keil Exp $
- *
- * Linux ISDN subsystem, protocol encapsulation
- *
- * This software may be used and distributed according to the terms
- * of the GNU General Public License, incorporated herein by reference.
- *
- */
-
-extern struct concap_device_ops isdn_concap_reliable_dl_dops;
-extern struct concap_proto *isdn_concap_new(int);
diff --git a/drivers/isdn/i4l/isdn_net.c b/drivers/isdn/i4l/isdn_net.c
deleted file mode 100644
index c138f66f2659..000000000000
--- a/drivers/isdn/i4l/isdn_net.c
+++ /dev/null
@@ -1,3198 +0,0 @@
-/* $Id: isdn_net.c,v 1.1.2.2 2004/01/12 22:37:19 keil Exp $
- *
- * Linux ISDN subsystem, network interfaces and related functions (linklevel).
- *
- * Copyright 1994-1998  by Fritz Elfert (fritz@isdn4linux.de)
- * Copyright 1995,96    by Thinking Objects Software GmbH Wuerzburg
- * Copyright 1995,96    by Michael Hipp (Michael.Hipp@student.uni-tuebingen.de)
- *
- * This software may be used and distributed according to the terms
- * of the GNU General Public License, incorporated herein by reference.
- *
- * Data Over Voice (DOV) support added - Guy Ellis 23-Mar-02
- *                                       guy@traverse.com.au
- * Outgoing calls - looks for a 'V' in first char of dialed number
- * Incoming calls - checks first character of eaz as follows:
- *   Numeric - accept DATA only - original functionality
- *   'V'     - accept VOICE (DOV) only
- *   'B'     - accept BOTH DATA and DOV types
- *
- * Jan 2001: fix CISCO HDLC      Bjoern A. Zeeb <i4l@zabbadoz.net>
- *           for info on the protocol, see
- *           http://i4l.zabbadoz.net/i4l/cisco-hdlc.txt
- */
-
-#include <linux/isdn.h>
-#include <linux/slab.h>
-#include <net/arp.h>
-#include <net/dst.h>
-#include <net/pkt_sched.h>
-#include <linux/inetdevice.h>
-#include "isdn_common.h"
-#include "isdn_net.h"
-#ifdef CONFIG_ISDN_PPP
-#include "isdn_ppp.h"
-#endif
-#ifdef CONFIG_ISDN_X25
-#include <linux/concap.h>
-#include "isdn_concap.h"
-#endif
-
-
-/*
- * Outline of new tbusy handling:
- *
- * Old method, roughly spoken, consisted of setting tbusy when entering
- * isdn_net_start_xmit() and at several other locations and clearing
- * it from isdn_net_start_xmit() thread when sending was successful.
- *
- * With 2.3.x multithreaded network core, to prevent problems, tbusy should
- * only be set by the isdn_net_start_xmit() thread and only when a tx-busy
- * condition is detected. Other threads (in particular isdn_net_stat_callb())
- * are only allowed to clear tbusy.
- *
- * -HE
- */
-
-/*
- * About SOFTNET:
- * Most of the changes were pretty obvious and basically done by HE already.
- *
- * One problem of the isdn net device code is that it uses struct net_device
- * for masters and slaves. However, only master interface are registered to
- * the network layer, and therefore, it only makes sense to call netif_*
- * functions on them.
- *
- * --KG
- */
-
-/*
- * Find out if the netdevice has been ifup-ed yet.
- * For slaves, look at the corresponding master.
- */
-static __inline__ int isdn_net_device_started(isdn_net_dev *n)
-{
-	isdn_net_local *lp = n->local;
-	struct net_device *dev;
-
-	if (lp->master)
-		dev = lp->master;
-	else
-		dev = n->dev;
-	return netif_running(dev);
-}
-
-/*
- * wake up the network -> net_device queue.
- * For slaves, wake the corresponding master interface.
- */
-static __inline__ void isdn_net_device_wake_queue(isdn_net_local *lp)
-{
-	if (lp->master)
-		netif_wake_queue(lp->master);
-	else
-		netif_wake_queue(lp->netdev->dev);
-}
-
-/*
- * stop the network -> net_device queue.
- * For slaves, stop the corresponding master interface.
- */
-static __inline__ void isdn_net_device_stop_queue(isdn_net_local *lp)
-{
-	if (lp->master)
-		netif_stop_queue(lp->master);
-	else
-		netif_stop_queue(lp->netdev->dev);
-}
-
-/*
- * find out if the net_device which this lp belongs to (lp can be
- * master or slave) is busy. It's busy iff all (master and slave)
- * queues are busy
- */
-static __inline__ int isdn_net_device_busy(isdn_net_local *lp)
-{
-	isdn_net_local *nlp;
-	isdn_net_dev *nd;
-	unsigned long flags;
-
-	if (!isdn_net_lp_busy(lp))
-		return 0;
-
-	if (lp->master)
-		nd = ISDN_MASTER_PRIV(lp)->netdev;
-	else
-		nd = lp->netdev;
-
-	spin_lock_irqsave(&nd->queue_lock, flags);
-	nlp = lp->next;
-	while (nlp != lp) {
-		if (!isdn_net_lp_busy(nlp)) {
-			spin_unlock_irqrestore(&nd->queue_lock, flags);
-			return 0;
-		}
-		nlp = nlp->next;
-	}
-	spin_unlock_irqrestore(&nd->queue_lock, flags);
-	return 1;
-}
-
-static __inline__ void isdn_net_inc_frame_cnt(isdn_net_local *lp)
-{
-	atomic_inc(&lp->frame_cnt);
-	if (isdn_net_device_busy(lp))
-		isdn_net_device_stop_queue(lp);
-}
-
-static __inline__ void isdn_net_dec_frame_cnt(isdn_net_local *lp)
-{
-	atomic_dec(&lp->frame_cnt);
-
-	if (!(isdn_net_device_busy(lp))) {
-		if (!skb_queue_empty(&lp->super_tx_queue)) {
-			schedule_work(&lp->tqueue);
-		} else {
-			isdn_net_device_wake_queue(lp);
-		}
-	}
-}
-
-static __inline__ void isdn_net_zero_frame_cnt(isdn_net_local *lp)
-{
-	atomic_set(&lp->frame_cnt, 0);
-}
-
-/* For 2.2.x we leave the transmitter busy timeout at 2 secs, just
- * to be safe.
- * For 2.3.x we push it up to 20 secs, because call establishment
- * (in particular callback) may take such a long time, and we
- * don't want confusing messages in the log. However, there is a slight
- * possibility that this large timeout will break other things like MPPP,
- * which might rely on the tx timeout. If so, we'll find out this way...
- */
-
-#define ISDN_NET_TX_TIMEOUT (20 * HZ)
-
-/* Prototypes */
-
-static int isdn_net_force_dial_lp(isdn_net_local *);
-static netdev_tx_t isdn_net_start_xmit(struct sk_buff *,
-				       struct net_device *);
-
-static void isdn_net_ciscohdlck_connected(isdn_net_local *lp);
-static void isdn_net_ciscohdlck_disconnected(isdn_net_local *lp);
-
-char *isdn_net_revision = "$Revision: 1.1.2.2 $";
-
-/*
- * Code for raw-networking over ISDN
- */
-
-static void
-isdn_net_unreachable(struct net_device *dev, struct sk_buff *skb, char *reason)
-{
-	if (skb) {
-
-		u_short proto = ntohs(skb->protocol);
-
-		printk(KERN_DEBUG "isdn_net: %s: %s, signalling dst_link_failure %s\n",
-		       dev->name,
-		       (reason != NULL) ? reason : "unknown",
-		       (proto != ETH_P_IP) ? "Protocol != ETH_P_IP" : "");
-
-		dst_link_failure(skb);
-	}
-	else {  /* dial not triggered by rawIP packet */
-		printk(KERN_DEBUG "isdn_net: %s: %s\n",
-		       dev->name,
-		       (reason != NULL) ? reason : "reason unknown");
-	}
-}
-
-static void
-isdn_net_reset(struct net_device *dev)
-{
-#ifdef CONFIG_ISDN_X25
-	struct concap_device_ops *dops =
-		((isdn_net_local *)netdev_priv(dev))->dops;
-	struct concap_proto *cprot =
-		((isdn_net_local *)netdev_priv(dev))->netdev->cprot;
-#endif
-#ifdef CONFIG_ISDN_X25
-	if (cprot && cprot->pops && dops)
-		cprot->pops->restart(cprot, dev, dops);
-#endif
-}
-
-/* Open/initialize the board. */
-static int
-isdn_net_open(struct net_device *dev)
-{
-	int i;
-	struct net_device *p;
-	struct in_device *in_dev;
-
-	/* moved here from isdn_net_reset, because only the master has an
-	   interface associated which is supposed to be started. BTW:
-	   we need to call netif_start_queue, not netif_wake_queue here */
-	netif_start_queue(dev);
-
-	isdn_net_reset(dev);
-	/* Fill in the MAC-level header (not needed, but for compatibility... */
-	for (i = 0; i < ETH_ALEN - sizeof(u32); i++)
-		dev->dev_addr[i] = 0xfc;
-	if ((in_dev = dev->ip_ptr) != NULL) {
-		/*
-		 *      Any address will do - we take the first
-		 */
-		struct in_ifaddr *ifa = in_dev->ifa_list;
-		if (ifa != NULL)
-			memcpy(dev->dev_addr + 2, &ifa->ifa_local, 4);
-	}
-
-	/* If this interface has slaves, start them also */
-	p = MASTER_TO_SLAVE(dev);
-	if (p) {
-		while (p) {
-			isdn_net_reset(p);
-			p = MASTER_TO_SLAVE(p);
-		}
-	}
-	isdn_lock_drivers();
-	return 0;
-}
-
-/*
- * Assign an ISDN-channel to a net-interface
- */
-static void
-isdn_net_bind_channel(isdn_net_local *lp, int idx)
-{
-	lp->flags |= ISDN_NET_CONNECTED;
-	lp->isdn_device = dev->drvmap[idx];
-	lp->isdn_channel = dev->chanmap[idx];
-	dev->rx_netdev[idx] = lp->netdev;
-	dev->st_netdev[idx] = lp->netdev;
-}
-
-/*
- * unbind a net-interface (resets interface after an error)
- */
-static void
-isdn_net_unbind_channel(isdn_net_local *lp)
-{
-	skb_queue_purge(&lp->super_tx_queue);
-
-	if (!lp->master) {	/* reset only master device */
-		/* Moral equivalent of dev_purge_queues():
-		   BEWARE! This chunk of code cannot be called from hardware
-		   interrupt handler. I hope it is true. --ANK
-		*/
-		qdisc_reset_all_tx(lp->netdev->dev);
-	}
-	lp->dialstate = 0;
-	dev->rx_netdev[isdn_dc2minor(lp->isdn_device, lp->isdn_channel)] = NULL;
-	dev->st_netdev[isdn_dc2minor(lp->isdn_device, lp->isdn_channel)] = NULL;
-	if (lp->isdn_device != -1 && lp->isdn_channel != -1)
-		isdn_free_channel(lp->isdn_device, lp->isdn_channel,
-				  ISDN_USAGE_NET);
-	lp->flags &= ~ISDN_NET_CONNECTED;
-	lp->isdn_device = -1;
-	lp->isdn_channel = -1;
-}
-
-/*
- * Perform auto-hangup and cps-calculation for net-interfaces.
- *
- * auto-hangup:
- * Increment idle-counter (this counter is reset on any incoming or
- * outgoing packet), if counter exceeds configured limit either do a
- * hangup immediately or - if configured - wait until just before the next
- * charge-info.
- *
- * cps-calculation (needed for dynamic channel-bundling):
- * Since this function is called every second, simply reset the
- * byte-counter of the interface after copying it to the cps-variable.
- */
-static unsigned long last_jiffies = -HZ;
-
-void
-isdn_net_autohup(void)
-{
-	isdn_net_dev *p = dev->netdev;
-	int anymore;
-
-	anymore = 0;
-	while (p) {
-		isdn_net_local *l = p->local;
-		if (jiffies == last_jiffies)
-			l->cps = l->transcount;
-		else
-			l->cps = (l->transcount * HZ) / (jiffies - last_jiffies);
-		l->transcount = 0;
-		if (dev->net_verbose > 3)
-			printk(KERN_DEBUG "%s: %d bogocps\n", p->dev->name, l->cps);
-		if ((l->flags & ISDN_NET_CONNECTED) && (!l->dialstate)) {
-			anymore = 1;
-			l->huptimer++;
-			/*
-			 * if there is some dialmode where timeout-hangup
-			 * should _not_ be done, check for that here
-			 */
-			if ((l->onhtime) &&
-			    (l->huptimer > l->onhtime))
-			{
-				if (l->hupflags & ISDN_MANCHARGE &&
-				    l->hupflags & ISDN_CHARGEHUP) {
-					while (time_after(jiffies, l->chargetime + l->chargeint))
-						l->chargetime += l->chargeint;
-					if (time_after(jiffies, l->chargetime + l->chargeint - 2 * HZ))
-						if (l->outgoing || l->hupflags & ISDN_INHUP)
-							isdn_net_hangup(p->dev);
-				} else if (l->outgoing) {
-					if (l->hupflags & ISDN_CHARGEHUP) {
-						if (l->hupflags & ISDN_WAITCHARGE) {
-							printk(KERN_DEBUG "isdn_net: Hupflags of %s are %X\n",
-							       p->dev->name, l->hupflags);
-							isdn_net_hangup(p->dev);
-						} else if (time_after(jiffies, l->chargetime + l->chargeint)) {
-							printk(KERN_DEBUG
-							       "isdn_net: %s: chtime = %lu, chint = %d\n",
-							       p->dev->name, l->chargetime, l->chargeint);
-							isdn_net_hangup(p->dev);
-						}
-					} else
-						isdn_net_hangup(p->dev);
-				} else if (l->hupflags & ISDN_INHUP)
-					isdn_net_hangup(p->dev);
-			}
-
-			if (dev->global_flags & ISDN_GLOBAL_STOPPED || (ISDN_NET_DIALMODE(*l) == ISDN_NET_DM_OFF)) {
-				isdn_net_hangup(p->dev);
-				break;
-			}
-		}
-		p = (isdn_net_dev *) p->next;
-	}
-	last_jiffies = jiffies;
-	isdn_timer_ctrl(ISDN_TIMER_NETHANGUP, anymore);
-}
-
-static void isdn_net_lp_disconnected(isdn_net_local *lp)
-{
-	isdn_net_rm_from_bundle(lp);
-}
-
-/*
- * Handle status-messages from ISDN-interfacecard.
- * This function is called from within the main-status-dispatcher
- * isdn_status_callback, which itself is called from the low-level driver.
- * Return: 1 = Event handled, 0 = not for us or unknown Event.
- */
-int
-isdn_net_stat_callback(int idx, isdn_ctrl *c)
-{
-	isdn_net_dev *p = dev->st_netdev[idx];
-	int cmd = c->command;
-
-	if (p) {
-		isdn_net_local *lp = p->local;
-#ifdef CONFIG_ISDN_X25
-		struct concap_proto *cprot = lp->netdev->cprot;
-		struct concap_proto_ops *pops = cprot ? cprot->pops : NULL;
-#endif
-		switch (cmd) {
-		case ISDN_STAT_BSENT:
-			/* A packet has successfully been sent out */
-			if ((lp->flags & ISDN_NET_CONNECTED) &&
-			    (!lp->dialstate)) {
-				isdn_net_dec_frame_cnt(lp);
-				lp->stats.tx_packets++;
-				lp->stats.tx_bytes += c->parm.length;
-			}
-			return 1;
-		case ISDN_STAT_DCONN:
-			/* D-Channel is up */
-			switch (lp->dialstate) {
-			case 4:
-			case 7:
-			case 8:
-				lp->dialstate++;
-				return 1;
-			case 12:
-				lp->dialstate = 5;
-				return 1;
-			}
-			break;
-		case ISDN_STAT_DHUP:
-			/* Either D-Channel-hangup or error during dialout */
-#ifdef CONFIG_ISDN_X25
-			/* If we are not connencted then dialing had
-			   failed. If there are generic encap protocol
-			   receiver routines signal the closure of
-			   the link*/
-
-			if (!(lp->flags & ISDN_NET_CONNECTED)
-			    && pops && pops->disconn_ind)
-				pops->disconn_ind(cprot);
-#endif /* CONFIG_ISDN_X25 */
-			if ((!lp->dialstate) && (lp->flags & ISDN_NET_CONNECTED)) {
-				if (lp->p_encap == ISDN_NET_ENCAP_CISCOHDLCK)
-					isdn_net_ciscohdlck_disconnected(lp);
-#ifdef CONFIG_ISDN_PPP
-				if (lp->p_encap == ISDN_NET_ENCAP_SYNCPPP)
-					isdn_ppp_free(lp);
-#endif
-				isdn_net_lp_disconnected(lp);
-				isdn_all_eaz(lp->isdn_device, lp->isdn_channel);
-				printk(KERN_INFO "%s: remote hangup\n", p->dev->name);
-				printk(KERN_INFO "%s: Chargesum is %d\n", p->dev->name,
-				       lp->charge);
-				isdn_net_unbind_channel(lp);
-				return 1;
-			}
-			break;
-#ifdef CONFIG_ISDN_X25
-		case ISDN_STAT_BHUP:
-			/* B-Channel-hangup */
-			/* try if there are generic encap protocol
-			   receiver routines and signal the closure of
-			   the link */
-			if (pops && pops->disconn_ind) {
-				pops->disconn_ind(cprot);
-				return 1;
-			}
-			break;
-#endif /* CONFIG_ISDN_X25 */
-		case ISDN_STAT_BCONN:
-			/* B-Channel is up */
-			isdn_net_zero_frame_cnt(lp);
-			switch (lp->dialstate) {
-			case 5:
-			case 6:
-			case 7:
-			case 8:
-			case 9:
-			case 10:
-			case 12:
-				if (lp->dialstate <= 6) {
-					dev->usage[idx] |= ISDN_USAGE_OUTGOING;
-					isdn_info_update();
-				} else
-					dev->rx_netdev[idx] = p;
-				lp->dialstate = 0;
-				isdn_timer_ctrl(ISDN_TIMER_NETHANGUP, 1);
-				if (lp->p_encap == ISDN_NET_ENCAP_CISCOHDLCK)
-					isdn_net_ciscohdlck_connected(lp);
-				if (lp->p_encap != ISDN_NET_ENCAP_SYNCPPP) {
-					if (lp->master) { /* is lp a slave? */
-						isdn_net_dev *nd = ISDN_MASTER_PRIV(lp)->netdev;
-						isdn_net_add_to_bundle(nd, lp);
-					}
-				}
-				printk(KERN_INFO "isdn_net: %s connected\n", p->dev->name);
-				/* If first Chargeinfo comes before B-Channel connect,
-				 * we correct the timestamp here.
-				 */
-				lp->chargetime = jiffies;
-
-				/* reset dial-timeout */
-				lp->dialstarted = 0;
-				lp->dialwait_timer = 0;
-
-#ifdef CONFIG_ISDN_PPP
-				if (lp->p_encap == ISDN_NET_ENCAP_SYNCPPP)
-					isdn_ppp_wakeup_daemon(lp);
-#endif
-#ifdef CONFIG_ISDN_X25
-				/* try if there are generic concap receiver routines */
-				if (pops)
-					if (pops->connect_ind)
-						pops->connect_ind(cprot);
-#endif /* CONFIG_ISDN_X25 */
-				/* ppp needs to do negotiations first */
-				if (lp->p_encap != ISDN_NET_ENCAP_SYNCPPP)
-					isdn_net_device_wake_queue(lp);
-				return 1;
-			}
-			break;
-		case ISDN_STAT_NODCH:
-			/* No D-Channel avail. */
-			if (lp->dialstate == 4) {
-				lp->dialstate--;
-				return 1;
-			}
-			break;
-		case ISDN_STAT_CINF:
-			/* Charge-info from TelCo. Calculate interval between
-			 * charge-infos and set timestamp for last info for
-			 * usage by isdn_net_autohup()
-			 */
-			lp->charge++;
-			if (lp->hupflags & ISDN_HAVECHARGE) {
-				lp->hupflags &= ~ISDN_WAITCHARGE;
-				lp->chargeint = jiffies - lp->chargetime - (2 * HZ);
-			}
-			if (lp->hupflags & ISDN_WAITCHARGE)
-				lp->hupflags |= ISDN_HAVECHARGE;
-			lp->chargetime = jiffies;
-			printk(KERN_DEBUG "isdn_net: Got CINF chargetime of %s now %lu\n",
-			       p->dev->name, lp->chargetime);
-			return 1;
-		}
-	}
-	return 0;
-}
-
-/*
- * Perform dialout for net-interfaces and timeout-handling for
- * D-Channel-up and B-Channel-up Messages.
- * This function is initially called from within isdn_net_start_xmit() or
- * or isdn_net_find_icall() after initializing the dialstate for an
- * interface. If further calls are needed, the function schedules itself
- * for a timer-callback via isdn_timer_function().
- * The dialstate is also affected by incoming status-messages from
- * the ISDN-Channel which are handled in isdn_net_stat_callback() above.
- */
-void
-isdn_net_dial(void)
-{
-	isdn_net_dev *p = dev->netdev;
-	int anymore = 0;
-	int i;
-	isdn_ctrl cmd;
-	u_char *phone_number;
-
-	while (p) {
-		isdn_net_local *lp = p->local;
-
-#ifdef ISDN_DEBUG_NET_DIAL
-		if (lp->dialstate)
-			printk(KERN_DEBUG "%s: dialstate=%d\n", p->dev->name, lp->dialstate);
-#endif
-		switch (lp->dialstate) {
-		case 0:
-			/* Nothing to do for this interface */
-			break;
-		case 1:
-			/* Initiate dialout. Set phone-number-pointer to first number
-			 * of interface.
-			 */
-			lp->dial = lp->phone[1];
-			if (!lp->dial) {
-				printk(KERN_WARNING "%s: phone number deleted?\n",
-				       p->dev->name);
-				isdn_net_hangup(p->dev);
-				break;
-			}
-			anymore = 1;
-
-			if (lp->dialtimeout > 0)
-				if (lp->dialstarted == 0 || time_after(jiffies, lp->dialstarted + lp->dialtimeout + lp->dialwait)) {
-					lp->dialstarted = jiffies;
-					lp->dialwait_timer = 0;
-				}
-
-			lp->dialstate++;
-			/* Fall through */
-		case 2:
-			/* Prepare dialing. Clear EAZ, then set EAZ. */
-			cmd.driver = lp->isdn_device;
-			cmd.arg = lp->isdn_channel;
-			cmd.command = ISDN_CMD_CLREAZ;
-			isdn_command(&cmd);
-			sprintf(cmd.parm.num, "%s", isdn_map_eaz2msn(lp->msn, cmd.driver));
-			cmd.command = ISDN_CMD_SETEAZ;
-			isdn_command(&cmd);
-			lp->dialretry = 0;
-			anymore = 1;
-			lp->dialstate++;
-			/* Fall through */
-		case 3:
-			/* Setup interface, dial current phone-number, switch to next number.
-			 * If list of phone-numbers is exhausted, increment
-			 * retry-counter.
-			 */
-			if (dev->global_flags & ISDN_GLOBAL_STOPPED || (ISDN_NET_DIALMODE(*lp) == ISDN_NET_DM_OFF)) {
-				char *s;
-				if (dev->global_flags & ISDN_GLOBAL_STOPPED)
-					s = "dial suppressed: isdn system stopped";
-				else
-					s = "dial suppressed: dialmode `off'";
-				isdn_net_unreachable(p->dev, NULL, s);
-				isdn_net_hangup(p->dev);
-				break;
-			}
-			cmd.driver = lp->isdn_device;
-			cmd.command = ISDN_CMD_SETL2;
-			cmd.arg = lp->isdn_channel + (lp->l2_proto << 8);
-			isdn_command(&cmd);
-			cmd.driver = lp->isdn_device;
-			cmd.command = ISDN_CMD_SETL3;
-			cmd.arg = lp->isdn_channel + (lp->l3_proto << 8);
-			isdn_command(&cmd);
-			cmd.driver = lp->isdn_device;
-			cmd.arg = lp->isdn_channel;
-			if (!lp->dial) {
-				printk(KERN_WARNING "%s: phone number deleted?\n",
-				       p->dev->name);
-				isdn_net_hangup(p->dev);
-				break;
-			}
-			if (!strncmp(lp->dial->num, "LEASED", strlen("LEASED"))) {
-				lp->dialstate = 4;
-				printk(KERN_INFO "%s: Open leased line ...\n", p->dev->name);
-			} else {
-				if (lp->dialtimeout > 0)
-					if (time_after(jiffies, lp->dialstarted + lp->dialtimeout)) {
-						lp->dialwait_timer = jiffies + lp->dialwait;
-						lp->dialstarted = 0;
-						isdn_net_unreachable(p->dev, NULL, "dial: timed out");
-						isdn_net_hangup(p->dev);
-						break;
-					}
-
-				cmd.driver = lp->isdn_device;
-				cmd.command = ISDN_CMD_DIAL;
-				cmd.parm.setup.si2 = 0;
-
-				/* check for DOV */
-				phone_number = lp->dial->num;
-				if ((*phone_number == 'v') ||
-				    (*phone_number == 'V')) { /* DOV call */
-					cmd.parm.setup.si1 = 1;
-				} else { /* DATA call */
-					cmd.parm.setup.si1 = 7;
-				}
-
-				strcpy(cmd.parm.setup.phone, phone_number);
-				/*
-				 * Switch to next number or back to start if at end of list.
-				 */
-				if (!(lp->dial = (isdn_net_phone *) lp->dial->next)) {
-					lp->dial = lp->phone[1];
-					lp->dialretry++;
-
-					if (lp->dialretry > lp->dialmax) {
-						if (lp->dialtimeout == 0) {
-							lp->dialwait_timer = jiffies + lp->dialwait;
-							lp->dialstarted = 0;
-							isdn_net_unreachable(p->dev, NULL, "dial: tried all numbers dialmax times");
-						}
-						isdn_net_hangup(p->dev);
-						break;
-					}
-				}
-				sprintf(cmd.parm.setup.eazmsn, "%s",
-					isdn_map_eaz2msn(lp->msn, cmd.driver));
-				i = isdn_dc2minor(lp->isdn_device, lp->isdn_channel);
-				if (i >= 0) {
-					strcpy(dev->num[i], cmd.parm.setup.phone);
-					dev->usage[i] |= ISDN_USAGE_OUTGOING;
-					isdn_info_update();
-				}
-				printk(KERN_INFO "%s: dialing %d %s... %s\n", p->dev->name,
-				       lp->dialretry, cmd.parm.setup.phone,
-				       (cmd.parm.setup.si1 == 1) ? "DOV" : "");
-				lp->dtimer = 0;
-#ifdef ISDN_DEBUG_NET_DIAL
-				printk(KERN_DEBUG "dial: d=%d c=%d\n", lp->isdn_device,
-				       lp->isdn_channel);
-#endif
-				isdn_command(&cmd);
-			}
-			lp->huptimer = 0;
-			lp->outgoing = 1;
-			if (lp->chargeint) {
-				lp->hupflags |= ISDN_HAVECHARGE;
-				lp->hupflags &= ~ISDN_WAITCHARGE;
-			} else {
-				lp->hupflags |= ISDN_WAITCHARGE;
-				lp->hupflags &= ~ISDN_HAVECHARGE;
-			}
-			anymore = 1;
-			lp->dialstate =
-				(lp->cbdelay &&
-				 (lp->flags & ISDN_NET_CBOUT)) ? 12 : 4;
-			break;
-		case 4:
-			/* Wait for D-Channel-connect.
-			 * If timeout, switch back to state 3.
-			 * Dialmax-handling moved to state 3.
-			 */
-			if (lp->dtimer++ > ISDN_TIMER_DTIMEOUT10)
-				lp->dialstate = 3;
-			anymore = 1;
-			break;
-		case 5:
-			/* Got D-Channel-Connect, send B-Channel-request */
-			cmd.driver = lp->isdn_device;
-			cmd.arg = lp->isdn_channel;
-			cmd.command = ISDN_CMD_ACCEPTB;
-			anymore = 1;
-			lp->dtimer = 0;
-			lp->dialstate++;
-			isdn_command(&cmd);
-			break;
-		case 6:
-			/* Wait for B- or D-Channel-connect. If timeout,
-			 * switch back to state 3.
-			 */
-#ifdef ISDN_DEBUG_NET_DIAL
-			printk(KERN_DEBUG "dialtimer2: %d\n", lp->dtimer);
-#endif
-			if (lp->dtimer++ > ISDN_TIMER_DTIMEOUT10)
-				lp->dialstate = 3;
-			anymore = 1;
-			break;
-		case 7:
-			/* Got incoming Call, setup L2 and L3 protocols,
-			 * then wait for D-Channel-connect
-			 */
-#ifdef ISDN_DEBUG_NET_DIAL
-			printk(KERN_DEBUG "dialtimer4: %d\n", lp->dtimer);
-#endif
-			cmd.driver = lp->isdn_device;
-			cmd.command = ISDN_CMD_SETL2;
-			cmd.arg = lp->isdn_channel + (lp->l2_proto << 8);
-			isdn_command(&cmd);
-			cmd.driver = lp->isdn_device;
-			cmd.command = ISDN_CMD_SETL3;
-			cmd.arg = lp->isdn_channel + (lp->l3_proto << 8);
-			isdn_command(&cmd);
-			if (lp->dtimer++ > ISDN_TIMER_DTIMEOUT15)
-				isdn_net_hangup(p->dev);
-			else {
-				anymore = 1;
-				lp->dialstate++;
-			}
-			break;
-		case 9:
-			/* Got incoming D-Channel-Connect, send B-Channel-request */
-			cmd.driver = lp->isdn_device;
-			cmd.arg = lp->isdn_channel;
-			cmd.command = ISDN_CMD_ACCEPTB;
-			isdn_command(&cmd);
-			anymore = 1;
-			lp->dtimer = 0;
-			lp->dialstate++;
-			break;
-		case 8:
-		case 10:
-			/*  Wait for B- or D-channel-connect */
-#ifdef ISDN_DEBUG_NET_DIAL
-			printk(KERN_DEBUG "dialtimer4: %d\n", lp->dtimer);
-#endif
-			if (lp->dtimer++ > ISDN_TIMER_DTIMEOUT10)
-				isdn_net_hangup(p->dev);
-			else
-				anymore = 1;
-			break;
-		case 11:
-			/* Callback Delay */
-			if (lp->dtimer++ > lp->cbdelay)
-				lp->dialstate = 1;
-			anymore = 1;
-			break;
-		case 12:
-			/* Remote does callback. Hangup after cbdelay, then wait for incoming
-			 * call (in state 4).
-			 */
-			if (lp->dtimer++ > lp->cbdelay)
-			{
-				printk(KERN_INFO "%s: hangup waiting for callback ...\n", p->dev->name);
-				lp->dtimer = 0;
-				lp->dialstate = 4;
-				cmd.driver = lp->isdn_device;
-				cmd.command = ISDN_CMD_HANGUP;
-				cmd.arg = lp->isdn_channel;
-				isdn_command(&cmd);
-				isdn_all_eaz(lp->isdn_device, lp->isdn_channel);
-			}
-			anymore = 1;
-			break;
-		default:
-			printk(KERN_WARNING "isdn_net: Illegal dialstate %d for device %s\n",
-			       lp->dialstate, p->dev->name);
-		}
-		p = (isdn_net_dev *) p->next;
-	}
-	isdn_timer_ctrl(ISDN_TIMER_NETDIAL, anymore);
-}
-
-/*
- * Perform hangup for a net-interface.
- */
-void
-isdn_net_hangup(struct net_device *d)
-{
-	isdn_net_local *lp = netdev_priv(d);
-	isdn_ctrl cmd;
-#ifdef CONFIG_ISDN_X25
-	struct concap_proto *cprot = lp->netdev->cprot;
-	struct concap_proto_ops *pops = cprot ? cprot->pops : NULL;
-#endif
-
-	if (lp->flags & ISDN_NET_CONNECTED) {
-		if (lp->slave != NULL) {
-			isdn_net_local *slp = ISDN_SLAVE_PRIV(lp);
-			if (slp->flags & ISDN_NET_CONNECTED) {
-				printk(KERN_INFO
-				       "isdn_net: hang up slave %s before %s\n",
-				       lp->slave->name, d->name);
-				isdn_net_hangup(lp->slave);
-			}
-		}
-		printk(KERN_INFO "isdn_net: local hangup %s\n", d->name);
-#ifdef CONFIG_ISDN_PPP
-		if (lp->p_encap == ISDN_NET_ENCAP_SYNCPPP)
-			isdn_ppp_free(lp);
-#endif
-		isdn_net_lp_disconnected(lp);
-#ifdef CONFIG_ISDN_X25
-		/* try if there are generic encap protocol
-		   receiver routines and signal the closure of
-		   the link */
-		if (pops && pops->disconn_ind)
-			pops->disconn_ind(cprot);
-#endif /* CONFIG_ISDN_X25 */
-
-		cmd.driver = lp->isdn_device;
-		cmd.command = ISDN_CMD_HANGUP;
-		cmd.arg = lp->isdn_channel;
-		isdn_command(&cmd);
-		printk(KERN_INFO "%s: Chargesum is %d\n", d->name, lp->charge);
-		isdn_all_eaz(lp->isdn_device, lp->isdn_channel);
-	}
-	isdn_net_unbind_channel(lp);
-}
-
-typedef struct {
-	__be16 source;
-	__be16 dest;
-} ip_ports;
-
-static void
-isdn_net_log_skb(struct sk_buff *skb, isdn_net_local *lp)
-{
-	/* hopefully, this was set correctly */
-	const u_char *p = skb_network_header(skb);
-	unsigned short proto = ntohs(skb->protocol);
-	int data_ofs;
-	ip_ports *ipp;
-	char addinfo[100];
-
-	addinfo[0] = '\0';
-	/* This check stolen from 2.1.72 dev_queue_xmit_nit() */
-	if (p < skb->data || skb_network_header(skb) >= skb_tail_pointer(skb)) {
-		/* fall back to old isdn_net_log_packet method() */
-		char *buf = skb->data;
-
-		printk(KERN_DEBUG "isdn_net: protocol %04x is buggy, dev %s\n", skb->protocol, lp->netdev->dev->name);
-		p = buf;
-		proto = ETH_P_IP;
-		switch (lp->p_encap) {
-		case ISDN_NET_ENCAP_IPTYP:
-			proto = ntohs(*(__be16 *)&buf[0]);
-			p = &buf[2];
-			break;
-		case ISDN_NET_ENCAP_ETHER:
-			proto = ntohs(*(__be16 *)&buf[12]);
-			p = &buf[14];
-			break;
-		case ISDN_NET_ENCAP_CISCOHDLC:
-			proto = ntohs(*(__be16 *)&buf[2]);
-			p = &buf[4];
-			break;
-#ifdef CONFIG_ISDN_PPP
-		case ISDN_NET_ENCAP_SYNCPPP:
-			proto = ntohs(skb->protocol);
-			p = &buf[IPPP_MAX_HEADER];
-			break;
-#endif
-		}
-	}
-	data_ofs = ((p[0] & 15) * 4);
-	switch (proto) {
-	case ETH_P_IP:
-		switch (p[9]) {
-		case 1:
-			strcpy(addinfo, " ICMP");
-			break;
-		case 2:
-			strcpy(addinfo, " IGMP");
-			break;
-		case 4:
-			strcpy(addinfo, " IPIP");
-			break;
-		case 6:
-			ipp = (ip_ports *) (&p[data_ofs]);
-			sprintf(addinfo, " TCP, port: %d -> %d", ntohs(ipp->source),
-				ntohs(ipp->dest));
-			break;
-		case 8:
-			strcpy(addinfo, " EGP");
-			break;
-		case 12:
-			strcpy(addinfo, " PUP");
-			break;
-		case 17:
-			ipp = (ip_ports *) (&p[data_ofs]);
-			sprintf(addinfo, " UDP, port: %d -> %d", ntohs(ipp->source),
-				ntohs(ipp->dest));
-			break;
-		case 22:
-			strcpy(addinfo, " IDP");
-			break;
-		}
-		printk(KERN_INFO "OPEN: %pI4 -> %pI4%s\n",
-		       p + 12, p + 16, addinfo);
-		break;
-	case ETH_P_ARP:
-		printk(KERN_INFO "OPEN: ARP %pI4 -> *.*.*.* ?%pI4\n",
-		       p + 14, p + 24);
-		break;
-	}
-}
-
-/*
- * this function is used to send supervisory data, i.e. data which was
- * not received from the network layer, but e.g. frames from ipppd, CCP
- * reset frames etc.
- */
-void isdn_net_write_super(isdn_net_local *lp, struct sk_buff *skb)
-{
-	if (in_irq()) {
-		// we can't grab the lock from irq context,
-		// so we just queue the packet
-		skb_queue_tail(&lp->super_tx_queue, skb);
-		schedule_work(&lp->tqueue);
-		return;
-	}
-
-	spin_lock_bh(&lp->xmit_lock);
-	if (!isdn_net_lp_busy(lp)) {
-		isdn_net_writebuf_skb(lp, skb);
-	} else {
-		skb_queue_tail(&lp->super_tx_queue, skb);
-	}
-	spin_unlock_bh(&lp->xmit_lock);
-}
-
-/*
- * called from tq_immediate
- */
-static void isdn_net_softint(struct work_struct *work)
-{
-	isdn_net_local *lp = container_of(work, isdn_net_local, tqueue);
-	struct sk_buff *skb;
-
-	spin_lock_bh(&lp->xmit_lock);
-	while (!isdn_net_lp_busy(lp)) {
-		skb = skb_dequeue(&lp->super_tx_queue);
-		if (!skb)
-			break;
-		isdn_net_writebuf_skb(lp, skb);
-	}
-	spin_unlock_bh(&lp->xmit_lock);
-}
-
-/*
- * all frames sent from the (net) LL to a HL driver should go via this function
- * it's serialized by the caller holding the lp->xmit_lock spinlock
- */
-void isdn_net_writebuf_skb(isdn_net_local *lp, struct sk_buff *skb)
-{
-	int ret;
-	int len = skb->len;     /* save len */
-
-	/* before obtaining the lock the caller should have checked that
-	   the lp isn't busy */
-	if (isdn_net_lp_busy(lp)) {
-		printk("isdn BUG at %s:%d!\n", __FILE__, __LINE__);
-		goto error;
-	}
-
-	if (!(lp->flags & ISDN_NET_CONNECTED)) {
-		printk("isdn BUG at %s:%d!\n", __FILE__, __LINE__);
-		goto error;
-	}
-	ret = isdn_writebuf_skb_stub(lp->isdn_device, lp->isdn_channel, 1, skb);
-	if (ret != len) {
-		/* we should never get here */
-		printk(KERN_WARNING "%s: HL driver queue full\n", lp->netdev->dev->name);
-		goto error;
-	}
-
-	lp->transcount += len;
-	isdn_net_inc_frame_cnt(lp);
-	return;
-
-error:
-	dev_kfree_skb(skb);
-	lp->stats.tx_errors++;
-
-}
-
-
-/*
- *  Helper function for isdn_net_start_xmit.
- *  When called, the connection is already established.
- *  Based on cps-calculation, check if device is overloaded.
- *  If so, and if a slave exists, trigger dialing for it.
- *  If any slave is online, deliver packets using a simple round robin
- *  scheme.
- *
- *  Return: 0 on success, !0 on failure.
- */
-
-static int
-isdn_net_xmit(struct net_device *ndev, struct sk_buff *skb)
-{
-	isdn_net_dev *nd;
-	isdn_net_local *slp;
-	isdn_net_local *lp = netdev_priv(ndev);
-	int retv = NETDEV_TX_OK;
-
-	if (((isdn_net_local *) netdev_priv(ndev))->master) {
-		printk("isdn BUG at %s:%d!\n", __FILE__, __LINE__);
-		dev_kfree_skb(skb);
-		return NETDEV_TX_OK;
-	}
-
-	/* For the other encaps the header has already been built */
-#ifdef CONFIG_ISDN_PPP
-	if (lp->p_encap == ISDN_NET_ENCAP_SYNCPPP) {
-		return isdn_ppp_xmit(skb, ndev);
-	}
-#endif
-	nd = ((isdn_net_local *) netdev_priv(ndev))->netdev;
-	lp = isdn_net_get_locked_lp(nd);
-	if (!lp) {
-		printk(KERN_WARNING "%s: all channels busy - requeuing!\n", ndev->name);
-		return NETDEV_TX_BUSY;
-	}
-	/* we have our lp locked from now on */
-
-	/* Reset hangup-timeout */
-	lp->huptimer = 0; // FIXME?
-	isdn_net_writebuf_skb(lp, skb);
-	spin_unlock_bh(&lp->xmit_lock);
-
-	/* the following stuff is here for backwards compatibility.
-	 * in future, start-up and hangup of slaves (based on current load)
-	 * should move to userspace and get based on an overall cps
-	 * calculation
-	 */
-	if (lp->cps > lp->triggercps) {
-		if (lp->slave) {
-			if (!lp->sqfull) {
-				/* First time overload: set timestamp only */
-				lp->sqfull = 1;
-				lp->sqfull_stamp = jiffies;
-			} else {
-				/* subsequent overload: if slavedelay exceeded, start dialing */
-				if (time_after(jiffies, lp->sqfull_stamp + lp->slavedelay)) {
-					slp = ISDN_SLAVE_PRIV(lp);
-					if (!(slp->flags & ISDN_NET_CONNECTED)) {
-						isdn_net_force_dial_lp(ISDN_SLAVE_PRIV(lp));
-					}
-				}
-			}
-		}
-	} else {
-		if (lp->sqfull && time_after(jiffies, lp->sqfull_stamp + lp->slavedelay + (10 * HZ))) {
-			lp->sqfull = 0;
-		}
-		/* this is a hack to allow auto-hangup for slaves on moderate loads */
-		nd->queue = nd->local;
-	}
-
-	return retv;
-
-}
-
-static void
-isdn_net_adjust_hdr(struct sk_buff *skb, struct net_device *dev)
-{
-	isdn_net_local *lp = netdev_priv(dev);
-	if (!skb)
-		return;
-	if (lp->p_encap == ISDN_NET_ENCAP_ETHER) {
-		const int pullsize = skb_network_offset(skb) - ETH_HLEN;
-		if (pullsize > 0) {
-			printk(KERN_DEBUG "isdn_net: Pull junk %d\n", pullsize);
-			skb_pull(skb, pullsize);
-		}
-	}
-}
-
-
-static void isdn_net_tx_timeout(struct net_device *ndev)
-{
-	isdn_net_local *lp = netdev_priv(ndev);
-
-	printk(KERN_WARNING "isdn_tx_timeout dev %s dialstate %d\n", ndev->name, lp->dialstate);
-	if (!lp->dialstate) {
-		lp->stats.tx_errors++;
-		/*
-		 * There is a certain probability that this currently
-		 * works at all because if we always wake up the interface,
-		 * then upper layer will try to send the next packet
-		 * immediately. And then, the old clean_up logic in the
-		 * driver will hopefully continue to work as it used to do.
-		 *
-		 * This is rather primitive right know, we better should
-		 * clean internal queues here, in particular for multilink and
-		 * ppp, and reset HL driver's channel, too.   --HE
-		 *
-		 * actually, this may not matter at all, because ISDN hardware
-		 * should not see transmitter hangs at all IMO
-		 * changed KERN_DEBUG to KERN_WARNING to find out if this is
-		 * ever called   --KG
-		 */
-	}
-	netif_trans_update(ndev);
-	netif_wake_queue(ndev);
-}
-
-/*
- * Try sending a packet.
- * If this interface isn't connected to a ISDN-Channel, find a free channel,
- * and start dialing.
- */
-static netdev_tx_t
-isdn_net_start_xmit(struct sk_buff *skb, struct net_device *ndev)
-{
-	isdn_net_local *lp = netdev_priv(ndev);
-#ifdef CONFIG_ISDN_X25
-	struct concap_proto *cprot = lp->netdev->cprot;
-/* At this point hard_start_xmit() passes control to the encapsulation
-   protocol (if present).
-   For X.25 auto-dialing is completly bypassed because:
-   - It does not conform with the semantics of a reliable datalink
-   service as needed by X.25 PLP.
-   - I don't want that the interface starts dialing when the network layer
-   sends a message which requests to disconnect the lapb link (or if it
-   sends any other message not resulting in data transmission).
-   Instead, dialing will be initiated by the encapsulation protocol entity
-   when a dl_establish request is received from the upper layer.
-*/
-	if (cprot && cprot->pops) {
-		int ret = cprot->pops->encap_and_xmit(cprot, skb);
-
-		if (ret)
-			netif_stop_queue(ndev);
-		return ret;
-	} else
-#endif
-		/* auto-dialing xmit function */
-	{
-#ifdef ISDN_DEBUG_NET_DUMP
-		u_char *buf;
-#endif
-		isdn_net_adjust_hdr(skb, ndev);
-#ifdef ISDN_DEBUG_NET_DUMP
-		buf = skb->data;
-		isdn_dumppkt("S:", buf, skb->len, 40);
-#endif
-
-		if (!(lp->flags & ISDN_NET_CONNECTED)) {
-			int chi;
-			/* only do autodial if allowed by config */
-			if (!(ISDN_NET_DIALMODE(*lp) == ISDN_NET_DM_AUTO)) {
-				isdn_net_unreachable(ndev, skb, "dial rejected: interface not in dialmode `auto'");
-				dev_kfree_skb(skb);
-				return NETDEV_TX_OK;
-			}
-			if (lp->phone[1]) {
-				ulong flags;
-
-				if (lp->dialwait_timer <= 0)
-					if (lp->dialstarted > 0 && lp->dialtimeout > 0 && time_before(jiffies, lp->dialstarted + lp->dialtimeout + lp->dialwait))
-						lp->dialwait_timer = lp->dialstarted + lp->dialtimeout + lp->dialwait;
-
-				if (lp->dialwait_timer > 0) {
-					if (time_before(jiffies, lp->dialwait_timer)) {
-						isdn_net_unreachable(ndev, skb, "dial rejected: retry-time not reached");
-						dev_kfree_skb(skb);
-						return NETDEV_TX_OK;
-					} else
-						lp->dialwait_timer = 0;
-				}
-				/* Grab a free ISDN-Channel */
-				spin_lock_irqsave(&dev->lock, flags);
-				if (((chi =
-				      isdn_get_free_channel(
-					      ISDN_USAGE_NET,
-					      lp->l2_proto,
-					      lp->l3_proto,
-					      lp->pre_device,
-					      lp->pre_channel,
-					      lp->msn)
-					     ) < 0) &&
-				    ((chi =
-				      isdn_get_free_channel(
-					      ISDN_USAGE_NET,
-					      lp->l2_proto,
-					      lp->l3_proto,
-					      lp->pre_device,
-					      lp->pre_channel^1,
-					      lp->msn)
-					    ) < 0)) {
-					spin_unlock_irqrestore(&dev->lock, flags);
-					isdn_net_unreachable(ndev, skb,
-							     "No channel");
-					dev_kfree_skb(skb);
-					return NETDEV_TX_OK;
-				}
-				/* Log packet, which triggered dialing */
-				if (dev->net_verbose)
-					isdn_net_log_skb(skb, lp);
-				lp->dialstate = 1;
-				/* Connect interface with channel */
-				isdn_net_bind_channel(lp, chi);
-#ifdef CONFIG_ISDN_PPP
-				if (lp->p_encap == ISDN_NET_ENCAP_SYNCPPP) {
-					/* no 'first_skb' handling for syncPPP */
-					if (isdn_ppp_bind(lp) < 0) {
-						dev_kfree_skb(skb);
-						isdn_net_unbind_channel(lp);
-						spin_unlock_irqrestore(&dev->lock, flags);
-						return NETDEV_TX_OK;	/* STN (skb to nirvana) ;) */
-					}
-#ifdef CONFIG_IPPP_FILTER
-					if (isdn_ppp_autodial_filter(skb, lp)) {
-						isdn_ppp_free(lp);
-						isdn_net_unbind_channel(lp);
-						spin_unlock_irqrestore(&dev->lock, flags);
-						isdn_net_unreachable(ndev, skb, "dial rejected: packet filtered");
-						dev_kfree_skb(skb);
-						return NETDEV_TX_OK;
-					}
-#endif
-					spin_unlock_irqrestore(&dev->lock, flags);
-					isdn_net_dial();	/* Initiate dialing */
-					netif_stop_queue(ndev);
-					return NETDEV_TX_BUSY;	/* let upper layer requeue skb packet */
-				}
-#endif
-				/* Initiate dialing */
-				spin_unlock_irqrestore(&dev->lock, flags);
-				isdn_net_dial();
-				isdn_net_device_stop_queue(lp);
-				return NETDEV_TX_BUSY;
-			} else {
-				isdn_net_unreachable(ndev, skb,
-						     "No phone number");
-				dev_kfree_skb(skb);
-				return NETDEV_TX_OK;
-			}
-		} else {
-			/* Device is connected to an ISDN channel */
-			netif_trans_update(ndev);
-			if (!lp->dialstate) {
-				/* ISDN connection is established, try sending */
-				int ret;
-				ret = (isdn_net_xmit(ndev, skb));
-				if (ret) netif_stop_queue(ndev);
-				return ret;
-			} else
-				netif_stop_queue(ndev);
-		}
-	}
-	return NETDEV_TX_BUSY;
-}
-
-/*
- * Shutdown a net-interface.
- */
-static int
-isdn_net_close(struct net_device *dev)
-{
-	struct net_device *p;
-#ifdef CONFIG_ISDN_X25
-	struct concap_proto *cprot =
-		((isdn_net_local *)netdev_priv(dev))->netdev->cprot;
-	/* printk(KERN_DEBUG "isdn_net_close %s\n" , dev-> name); */
-#endif
-
-#ifdef CONFIG_ISDN_X25
-	if (cprot && cprot->pops) cprot->pops->close(cprot);
-#endif
-	netif_stop_queue(dev);
-	p = MASTER_TO_SLAVE(dev);
-	if (p) {
-		/* If this interface has slaves, stop them also */
-		while (p) {
-#ifdef CONFIG_ISDN_X25
-			cprot = ((isdn_net_local *)netdev_priv(p))
-				->netdev->cprot;
-			if (cprot && cprot->pops)
-				cprot->pops->close(cprot);
-#endif
-			isdn_net_hangup(p);
-			p = MASTER_TO_SLAVE(p);
-		}
-	}
-	isdn_net_hangup(dev);
-	isdn_unlock_drivers();
-	return 0;
-}
-
-/*
- * Get statistics
- */
-static struct net_device_stats *
-isdn_net_get_stats(struct net_device *dev)
-{
-	isdn_net_local *lp = netdev_priv(dev);
-	return &lp->stats;
-}
-
-/*      This is simply a copy from std. eth.c EXCEPT we pull ETH_HLEN
- *      instead of dev->hard_header_len off. This is done because the
- *      lowlevel-driver has already pulled off its stuff when we get
- *      here and this routine only gets called with p_encap == ETHER.
- *      Determine the packet's protocol ID. The rule here is that we
- *      assume 802.3 if the type field is short enough to be a length.
- *      This is normal practice and works for any 'now in use' protocol.
- */
-
-static __be16
-isdn_net_type_trans(struct sk_buff *skb, struct net_device *dev)
-{
-	struct ethhdr *eth;
-	unsigned char *rawp;
-
-	skb_reset_mac_header(skb);
-	skb_pull(skb, ETH_HLEN);
-	eth = eth_hdr(skb);
-
-	if (*eth->h_dest & 1) {
-		if (ether_addr_equal(eth->h_dest, dev->broadcast))
-			skb->pkt_type = PACKET_BROADCAST;
-		else
-			skb->pkt_type = PACKET_MULTICAST;
-	}
-	/*
-	 *      This ALLMULTI check should be redundant by 1.4
-	 *      so don't forget to remove it.
-	 */
-
-	else if (dev->flags & (IFF_PROMISC /*| IFF_ALLMULTI*/)) {
-		if (!ether_addr_equal(eth->h_dest, dev->dev_addr))
-			skb->pkt_type = PACKET_OTHERHOST;
-	}
-	if (ntohs(eth->h_proto) >= ETH_P_802_3_MIN)
-		return eth->h_proto;
-
-	rawp = skb->data;
-
-	/*
-	 *      This is a magic hack to spot IPX packets. Older Novell breaks
-	 *      the protocol design and runs IPX over 802.3 without an 802.2 LLC
-	 *      layer. We look for FFFF which isn't a used 802.2 SSAP/DSAP. This
-	 *      won't work for fault tolerant netware but does for the rest.
-	 */
-	if (*(unsigned short *) rawp == 0xFFFF)
-		return htons(ETH_P_802_3);
-	/*
-	 *      Real 802.2 LLC
-	 */
-	return htons(ETH_P_802_2);
-}
-
-
-/*
- * CISCO HDLC keepalive specific stuff
- */
-static struct sk_buff*
-isdn_net_ciscohdlck_alloc_skb(isdn_net_local *lp, int len)
-{
-	unsigned short hl = dev->drv[lp->isdn_device]->interface->hl_hdrlen;
-	struct sk_buff *skb;
-
-	skb = alloc_skb(hl + len, GFP_ATOMIC);
-	if (skb)
-		skb_reserve(skb, hl);
-	else
-		printk("isdn out of mem at %s:%d!\n", __FILE__, __LINE__);
-	return skb;
-}
-
-/* cisco hdlck device private ioctls */
-static int
-isdn_ciscohdlck_dev_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
-{
-	isdn_net_local *lp = netdev_priv(dev);
-	unsigned long len = 0;
-	unsigned long expires = 0;
-	int tmp = 0;
-	int period = lp->cisco_keepalive_period;
-	s8 debserint = lp->cisco_debserint;
-	int rc = 0;
-
-	if (lp->p_encap != ISDN_NET_ENCAP_CISCOHDLCK)
-		return -EINVAL;
-
-	switch (cmd) {
-		/* get/set keepalive period */
-	case SIOCGKEEPPERIOD:
-		len = (unsigned long)sizeof(lp->cisco_keepalive_period);
-		if (copy_to_user(ifr->ifr_data,
-				 &lp->cisco_keepalive_period, len))
-			rc = -EFAULT;
-		break;
-	case SIOCSKEEPPERIOD:
-		tmp = lp->cisco_keepalive_period;
-		len = (unsigned long)sizeof(lp->cisco_keepalive_period);
-		if (copy_from_user(&period, ifr->ifr_data, len))
-			rc = -EFAULT;
-		if ((period > 0) && (period <= 32767))
-			lp->cisco_keepalive_period = period;
-		else
-			rc = -EINVAL;
-		if (!rc && (tmp != lp->cisco_keepalive_period)) {
-			expires = (unsigned long)(jiffies +
-						  lp->cisco_keepalive_period * HZ);
-			mod_timer(&lp->cisco_timer, expires);
-			printk(KERN_INFO "%s: Keepalive period set "
-			       "to %d seconds.\n",
-			       dev->name, lp->cisco_keepalive_period);
-		}
-		break;
-
-		/* get/set debugging */
-	case SIOCGDEBSERINT:
-		len = (unsigned long)sizeof(lp->cisco_debserint);
-		if (copy_to_user(ifr->ifr_data,
-				 &lp->cisco_debserint, len))
-			rc = -EFAULT;
-		break;
-	case SIOCSDEBSERINT:
-		len = (unsigned long)sizeof(lp->cisco_debserint);
-		if (copy_from_user(&debserint,
-				   ifr->ifr_data, len))
-			rc = -EFAULT;
-		if ((debserint >= 0) && (debserint <= 64))
-			lp->cisco_debserint = debserint;
-		else
-			rc = -EINVAL;
-		break;
-
-	default:
-		rc = -EINVAL;
-		break;
-	}
-	return (rc);
-}
-
-
-static int isdn_net_ioctl(struct net_device *dev,
-			  struct ifreq *ifr, int cmd)
-{
-	isdn_net_local *lp = netdev_priv(dev);
-
-	switch (lp->p_encap) {
-#ifdef CONFIG_ISDN_PPP
-	case ISDN_NET_ENCAP_SYNCPPP:
-		return isdn_ppp_dev_ioctl(dev, ifr, cmd);
-#endif
-	case ISDN_NET_ENCAP_CISCOHDLCK:
-		return isdn_ciscohdlck_dev_ioctl(dev, ifr, cmd);
-	default:
-		return -EINVAL;
-	}
-}
-
-/* called via cisco_timer.function */
-static void
-isdn_net_ciscohdlck_slarp_send_keepalive(struct timer_list *t)
-{
-	isdn_net_local *lp = from_timer(lp, t, cisco_timer);
-	struct sk_buff *skb;
-	unsigned char *p;
-	unsigned long last_cisco_myseq = lp->cisco_myseq;
-	int myseq_diff = 0;
-
-	if (!(lp->flags & ISDN_NET_CONNECTED) || lp->dialstate) {
-		printk("isdn BUG at %s:%d!\n", __FILE__, __LINE__);
-		return;
-	}
-	lp->cisco_myseq++;
-
-	myseq_diff = (lp->cisco_myseq - lp->cisco_mineseen);
-	if ((lp->cisco_line_state) && ((myseq_diff >= 3) || (myseq_diff <= -3))) {
-		/* line up -> down */
-		lp->cisco_line_state = 0;
-		printk(KERN_WARNING
-		       "UPDOWN: Line protocol on Interface %s,"
-		       " changed state to down\n", lp->netdev->dev->name);
-		/* should stop routing higher-level data across */
-	} else if ((!lp->cisco_line_state) &&
-		   (myseq_diff >= 0) && (myseq_diff <= 2)) {
-		/* line down -> up */
-		lp->cisco_line_state = 1;
-		printk(KERN_WARNING
-		       "UPDOWN: Line protocol on Interface %s,"
-		       " changed state to up\n", lp->netdev->dev->name);
-		/* restart routing higher-level data across */
-	}
-
-	if (lp->cisco_debserint)
-		printk(KERN_DEBUG "%s: HDLC "
-		       "myseq %lu, mineseen %lu%c, yourseen %lu, %s\n",
-		       lp->netdev->dev->name, last_cisco_myseq, lp->cisco_mineseen,
-		       ((last_cisco_myseq == lp->cisco_mineseen) ? '*' : 040),
-		       lp->cisco_yourseq,
-		       ((lp->cisco_line_state) ? "line up" : "line down"));
-
-	skb = isdn_net_ciscohdlck_alloc_skb(lp, 4 + 14);
-	if (!skb)
-		return;
-
-	p = skb_put(skb, 4 + 14);
-
-	/* cisco header */
-	*(u8 *)(p + 0) = CISCO_ADDR_UNICAST;
-	*(u8 *)(p + 1) = CISCO_CTRL;
-	*(__be16 *)(p + 2) = cpu_to_be16(CISCO_TYPE_SLARP);
-
-	/* slarp keepalive */
-	*(__be32 *)(p +  4) = cpu_to_be32(CISCO_SLARP_KEEPALIVE);
-	*(__be32 *)(p +  8) = cpu_to_be32(lp->cisco_myseq);
-	*(__be32 *)(p + 12) = cpu_to_be32(lp->cisco_yourseq);
-	*(__be16 *)(p + 16) = cpu_to_be16(0xffff); // reliability, always 0xffff
-	p += 18;
-
-	isdn_net_write_super(lp, skb);
-
-	lp->cisco_timer.expires = jiffies + lp->cisco_keepalive_period * HZ;
-
-	add_timer(&lp->cisco_timer);
-}
-
-static void
-isdn_net_ciscohdlck_slarp_send_request(isdn_net_local *lp)
-{
-	struct sk_buff *skb;
-	unsigned char *p;
-
-	skb = isdn_net_ciscohdlck_alloc_skb(lp, 4 + 14);
-	if (!skb)
-		return;
-
-	p = skb_put(skb, 4 + 14);
-
-	/* cisco header */
-	*(u8 *)(p + 0) = CISCO_ADDR_UNICAST;
-	*(u8 *)(p + 1) = CISCO_CTRL;
-	*(__be16 *)(p + 2) = cpu_to_be16(CISCO_TYPE_SLARP);
-
-	/* slarp request */
-	*(__be32 *)(p +  4) = cpu_to_be32(CISCO_SLARP_REQUEST);
-	*(__be32 *)(p +  8) = cpu_to_be32(0); // address
-	*(__be32 *)(p + 12) = cpu_to_be32(0); // netmask
-	*(__be16 *)(p + 16) = cpu_to_be16(0); // unused
-	p += 18;
-
-	isdn_net_write_super(lp, skb);
-}
-
-static void
-isdn_net_ciscohdlck_connected(isdn_net_local *lp)
-{
-	lp->cisco_myseq = 0;
-	lp->cisco_mineseen = 0;
-	lp->cisco_yourseq = 0;
-	lp->cisco_keepalive_period = ISDN_TIMER_KEEPINT;
-	lp->cisco_last_slarp_in = 0;
-	lp->cisco_line_state = 0;
-	lp->cisco_debserint = 0;
-
-	/* send slarp request because interface/seq.no.s reset */
-	isdn_net_ciscohdlck_slarp_send_request(lp);
-
-	timer_setup(&lp->cisco_timer,
-		    isdn_net_ciscohdlck_slarp_send_keepalive, 0);
-	lp->cisco_timer.expires = jiffies + lp->cisco_keepalive_period * HZ;
-	add_timer(&lp->cisco_timer);
-}
-
-static void
-isdn_net_ciscohdlck_disconnected(isdn_net_local *lp)
-{
-	del_timer(&lp->cisco_timer);
-}
-
-static void
-isdn_net_ciscohdlck_slarp_send_reply(isdn_net_local *lp)
-{
-	struct sk_buff *skb;
-	unsigned char *p;
-	struct in_device *in_dev = NULL;
-	__be32 addr = 0;		/* local ipv4 address */
-	__be32 mask = 0;		/* local netmask */
-
-	if ((in_dev = lp->netdev->dev->ip_ptr) != NULL) {
-		/* take primary(first) address of interface */
-		struct in_ifaddr *ifa = in_dev->ifa_list;
-		if (ifa != NULL) {
-			addr = ifa->ifa_local;
-			mask = ifa->ifa_mask;
-		}
-	}
-
-	skb = isdn_net_ciscohdlck_alloc_skb(lp, 4 + 14);
-	if (!skb)
-		return;
-
-	p = skb_put(skb, 4 + 14);
-
-	/* cisco header */
-	*(u8 *)(p + 0) = CISCO_ADDR_UNICAST;
-	*(u8 *)(p + 1) = CISCO_CTRL;
-	*(__be16 *)(p + 2) = cpu_to_be16(CISCO_TYPE_SLARP);
-
-	/* slarp reply, send own ip/netmask; if values are nonsense remote
-	 * should think we are unable to provide it with an address via SLARP */
-	*(__be32 *)(p +  4) = cpu_to_be32(CISCO_SLARP_REPLY);
-	*(__be32 *)(p +  8) = addr; // address
-	*(__be32 *)(p + 12) = mask; // netmask
-	*(__be16 *)(p + 16) = cpu_to_be16(0); // unused
-	p += 18;
-
-	isdn_net_write_super(lp, skb);
-}
-
-static void
-isdn_net_ciscohdlck_slarp_in(isdn_net_local *lp, struct sk_buff *skb)
-{
-	unsigned char *p;
-	int period;
-	u32 code;
-	u32 my_seq;
-	u32 your_seq;
-	__be32 local;
-	__be32 *addr, *mask;
-
-	if (skb->len < 14)
-		return;
-
-	p = skb->data;
-	code = be32_to_cpup((__be32 *)p);
-	p += 4;
-
-	switch (code) {
-	case CISCO_SLARP_REQUEST:
-		lp->cisco_yourseq = 0;
-		isdn_net_ciscohdlck_slarp_send_reply(lp);
-		break;
-	case CISCO_SLARP_REPLY:
-		addr = (__be32 *)p;
-		mask = (__be32 *)(p + 4);
-		if (*mask != cpu_to_be32(0xfffffffc))
-			goto slarp_reply_out;
-		if ((*addr & cpu_to_be32(3)) == cpu_to_be32(0) ||
-		    (*addr & cpu_to_be32(3)) == cpu_to_be32(3))
-			goto slarp_reply_out;
-		local = *addr ^ cpu_to_be32(3);
-		printk(KERN_INFO "%s: got slarp reply: remote ip: %pI4, local ip: %pI4 mask: %pI4\n",
-		       lp->netdev->dev->name, addr, &local, mask);
-		break;
-	slarp_reply_out:
-		printk(KERN_INFO "%s: got invalid slarp reply (%pI4/%pI4) - ignored\n",
-		       lp->netdev->dev->name, addr, mask);
-		break;
-	case CISCO_SLARP_KEEPALIVE:
-		period = (int)((jiffies - lp->cisco_last_slarp_in
-				+ HZ / 2 - 1) / HZ);
-		if (lp->cisco_debserint &&
-		    (period != lp->cisco_keepalive_period) &&
-		    lp->cisco_last_slarp_in) {
-			printk(KERN_DEBUG "%s: Keepalive period mismatch - "
-			       "is %d but should be %d.\n",
-			       lp->netdev->dev->name, period,
-			       lp->cisco_keepalive_period);
-		}
-		lp->cisco_last_slarp_in = jiffies;
-		my_seq = be32_to_cpup((__be32 *)(p + 0));
-		your_seq = be32_to_cpup((__be32 *)(p + 4));
-		p += 10;
-		lp->cisco_yourseq = my_seq;
-		lp->cisco_mineseen = your_seq;
-		break;
-	}
-}
-
-static void
-isdn_net_ciscohdlck_receive(isdn_net_local *lp, struct sk_buff *skb)
-{
-	unsigned char *p;
-	u8 addr;
-	u8 ctrl;
-	u16 type;
-
-	if (skb->len < 4)
-		goto out_free;
-
-	p = skb->data;
-	addr = *(u8 *)(p + 0);
-	ctrl = *(u8 *)(p + 1);
-	type = be16_to_cpup((__be16 *)(p + 2));
-	p += 4;
-	skb_pull(skb, 4);
-
-	if (addr != CISCO_ADDR_UNICAST && addr != CISCO_ADDR_BROADCAST) {
-		printk(KERN_WARNING "%s: Unknown Cisco addr 0x%02x\n",
-		       lp->netdev->dev->name, addr);
-		goto out_free;
-	}
-	if (ctrl != CISCO_CTRL) {
-		printk(KERN_WARNING "%s: Unknown Cisco ctrl 0x%02x\n",
-		       lp->netdev->dev->name, ctrl);
-		goto out_free;
-	}
-
-	switch (type) {
-	case CISCO_TYPE_SLARP:
-		isdn_net_ciscohdlck_slarp_in(lp, skb);
-		goto out_free;
-	case CISCO_TYPE_CDP:
-		if (lp->cisco_debserint)
-			printk(KERN_DEBUG "%s: Received CDP packet. use "
-			       "\"no cdp enable\" on cisco.\n",
-			       lp->netdev->dev->name);
-		goto out_free;
-	default:
-		/* no special cisco protocol */
-		skb->protocol = htons(type);
-		netif_rx(skb);
-		return;
-	}
-
-out_free:
-	kfree_skb(skb);
-}
-
-/*
- * Got a packet from ISDN-Channel.
- */
-static void
-isdn_net_receive(struct net_device *ndev, struct sk_buff *skb)
-{
-	isdn_net_local *lp = netdev_priv(ndev);
-	isdn_net_local *olp = lp;	/* original 'lp' */
-#ifdef CONFIG_ISDN_X25
-	struct concap_proto *cprot = lp->netdev->cprot;
-#endif
-	lp->transcount += skb->len;
-
-	lp->stats.rx_packets++;
-	lp->stats.rx_bytes += skb->len;
-	if (lp->master) {
-		/* Bundling: If device is a slave-device, deliver to master, also
-		 * handle master's statistics and hangup-timeout
-		 */
-		ndev = lp->master;
-		lp = netdev_priv(ndev);
-		lp->stats.rx_packets++;
-		lp->stats.rx_bytes += skb->len;
-	}
-	skb->dev = ndev;
-	skb->pkt_type = PACKET_HOST;
-	skb_reset_mac_header(skb);
-#ifdef ISDN_DEBUG_NET_DUMP
-	isdn_dumppkt("R:", skb->data, skb->len, 40);
-#endif
-	switch (lp->p_encap) {
-	case ISDN_NET_ENCAP_ETHER:
-		/* Ethernet over ISDN */
-		olp->huptimer = 0;
-		lp->huptimer = 0;
-		skb->protocol = isdn_net_type_trans(skb, ndev);
-		break;
-	case ISDN_NET_ENCAP_UIHDLC:
-		/* HDLC with UI-frame (for ispa with -h1 option) */
-		olp->huptimer = 0;
-		lp->huptimer = 0;
-		skb_pull(skb, 2);
-		/* Fall through */
-	case ISDN_NET_ENCAP_RAWIP:
-		/* RAW-IP without MAC-Header */
-		olp->huptimer = 0;
-		lp->huptimer = 0;
-		skb->protocol = htons(ETH_P_IP);
-		break;
-	case ISDN_NET_ENCAP_CISCOHDLCK:
-		isdn_net_ciscohdlck_receive(lp, skb);
-		return;
-	case ISDN_NET_ENCAP_CISCOHDLC:
-		/* CISCO-HDLC IP with type field and  fake I-frame-header */
-		skb_pull(skb, 2);
-		/* Fall through */
-	case ISDN_NET_ENCAP_IPTYP:
-		/* IP with type field */
-		olp->huptimer = 0;
-		lp->huptimer = 0;
-		skb->protocol = *(__be16 *)&(skb->data[0]);
-		skb_pull(skb, 2);
-		if (*(unsigned short *) skb->data == 0xFFFF)
-			skb->protocol = htons(ETH_P_802_3);
-		break;
-#ifdef CONFIG_ISDN_PPP
-	case ISDN_NET_ENCAP_SYNCPPP:
-		/* huptimer is done in isdn_ppp_push_higher */
-		isdn_ppp_receive(lp->netdev, olp, skb);
-		return;
-#endif
-
-	default:
-#ifdef CONFIG_ISDN_X25
-		/* try if there are generic sync_device receiver routines */
-		if (cprot) if (cprot->pops)
-				   if (cprot->pops->data_ind) {
-					   cprot->pops->data_ind(cprot, skb);
-					   return;
-				   };
-#endif /* CONFIG_ISDN_X25 */
-		printk(KERN_WARNING "%s: unknown encapsulation, dropping\n",
-		       lp->netdev->dev->name);
-		kfree_skb(skb);
-		return;
-	}
-
-	netif_rx(skb);
-	return;
-}
-
-/*
- * A packet arrived via ISDN. Search interface-chain for a corresponding
- * interface. If found, deliver packet to receiver-function and return 1,
- * else return 0.
- */
-int
-isdn_net_rcv_skb(int idx, struct sk_buff *skb)
-{
-	isdn_net_dev *p = dev->rx_netdev[idx];
-
-	if (p) {
-		isdn_net_local *lp = p->local;
-		if ((lp->flags & ISDN_NET_CONNECTED) &&
-		    (!lp->dialstate)) {
-			isdn_net_receive(p->dev, skb);
-			return 1;
-		}
-	}
-	return 0;
-}
-
-/*
- *  build an header
- *  depends on encaps that is being used.
- */
-
-static int isdn_net_header(struct sk_buff *skb, struct net_device *dev,
-			   unsigned short type,
-			   const void *daddr, const void *saddr, unsigned plen)
-{
-	isdn_net_local *lp = netdev_priv(dev);
-	unsigned char *p;
-	int len = 0;
-
-	switch (lp->p_encap) {
-	case ISDN_NET_ENCAP_ETHER:
-		len = eth_header(skb, dev, type, daddr, saddr, plen);
-		break;
-#ifdef CONFIG_ISDN_PPP
-	case ISDN_NET_ENCAP_SYNCPPP:
-		/* stick on a fake header to keep fragmentation code happy. */
-		len = IPPP_MAX_HEADER;
-		skb_push(skb, len);
-		break;
-#endif
-	case ISDN_NET_ENCAP_RAWIP:
-		printk(KERN_WARNING "isdn_net_header called with RAW_IP!\n");
-		len = 0;
-		break;
-	case ISDN_NET_ENCAP_IPTYP:
-		/* ethernet type field */
-		*((__be16 *)skb_push(skb, 2)) = htons(type);
-		len = 2;
-		break;
-	case ISDN_NET_ENCAP_UIHDLC:
-		/* HDLC with UI-Frames (for ispa with -h1 option) */
-		*((__be16 *)skb_push(skb, 2)) = htons(0x0103);
-		len = 2;
-		break;
-	case ISDN_NET_ENCAP_CISCOHDLC:
-	case ISDN_NET_ENCAP_CISCOHDLCK:
-		p = skb_push(skb, 4);
-		*(u8 *)(p + 0) = CISCO_ADDR_UNICAST;
-		*(u8 *)(p + 1) = CISCO_CTRL;
-		*(__be16 *)(p + 2) = cpu_to_be16(type);
-		p += 4;
-		len = 4;
-		break;
-#ifdef CONFIG_ISDN_X25
-	default:
-		/* try if there are generic concap protocol routines */
-		if (lp->netdev->cprot) {
-			printk(KERN_WARNING "isdn_net_header called with concap_proto!\n");
-			len = 0;
-			break;
-		}
-		break;
-#endif /* CONFIG_ISDN_X25 */
-	}
-	return len;
-}
-
-static int isdn_header_cache(const struct neighbour *neigh, struct hh_cache *hh,
-			     __be16 type)
-{
-	const struct net_device *dev = neigh->dev;
-	isdn_net_local *lp = netdev_priv(dev);
-
-	if (lp->p_encap == ISDN_NET_ENCAP_ETHER)
-		return eth_header_cache(neigh, hh, type);
-	return -1;
-}
-
-static void isdn_header_cache_update(struct hh_cache *hh,
-				     const struct net_device *dev,
-				     const unsigned char *haddr)
-{
-	isdn_net_local *lp = netdev_priv(dev);
-	if (lp->p_encap == ISDN_NET_ENCAP_ETHER)
-		eth_header_cache_update(hh, dev, haddr);
-}
-
-static const struct header_ops isdn_header_ops = {
-	.create = isdn_net_header,
-	.cache = isdn_header_cache,
-	.cache_update = isdn_header_cache_update,
-};
-
-/*
- * Interface-setup. (just after registering a new interface)
- */
-static int
-isdn_net_init(struct net_device *ndev)
-{
-	ushort max_hlhdr_len = 0;
-	int drvidx;
-
-	/*
-	 *  up till binding we ask the protocol layer to reserve as much
-	 *  as we might need for HL layer
-	 */
-
-	for (drvidx = 0; drvidx < ISDN_MAX_DRIVERS; drvidx++)
-		if (dev->drv[drvidx])
-			if (max_hlhdr_len < dev->drv[drvidx]->interface->hl_hdrlen)
-				max_hlhdr_len = dev->drv[drvidx]->interface->hl_hdrlen;
-
-	ndev->hard_header_len = ETH_HLEN + max_hlhdr_len;
-	return 0;
-}
-
-static void
-isdn_net_swapbind(int drvidx)
-{
-	isdn_net_dev *p;
-
-#ifdef ISDN_DEBUG_NET_ICALL
-	printk(KERN_DEBUG "n_fi: swapping ch of %d\n", drvidx);
-#endif
-	p = dev->netdev;
-	while (p) {
-		if (p->local->pre_device == drvidx)
-			switch (p->local->pre_channel) {
-			case 0:
-				p->local->pre_channel = 1;
-				break;
-			case 1:
-				p->local->pre_channel = 0;
-				break;
-			}
-		p = (isdn_net_dev *) p->next;
-	}
-}
-
-static void
-isdn_net_swap_usage(int i1, int i2)
-{
-	int u1 = dev->usage[i1] & ISDN_USAGE_EXCLUSIVE;
-	int u2 = dev->usage[i2] & ISDN_USAGE_EXCLUSIVE;
-
-#ifdef ISDN_DEBUG_NET_ICALL
-	printk(KERN_DEBUG "n_fi: usage of %d and %d\n", i1, i2);
-#endif
-	dev->usage[i1] &= ~ISDN_USAGE_EXCLUSIVE;
-	dev->usage[i1] |= u2;
-	dev->usage[i2] &= ~ISDN_USAGE_EXCLUSIVE;
-	dev->usage[i2] |= u1;
-	isdn_info_update();
-}
-
-/*
- * An incoming call-request has arrived.
- * Search the interface-chain for an appropriate interface.
- * If found, connect the interface to the ISDN-channel and initiate
- * D- and B-Channel-setup. If secure-flag is set, accept only
- * configured phone-numbers. If callback-flag is set, initiate
- * callback-dialing.
- *
- * Return-Value: 0 = No appropriate interface for this call.
- *               1 = Call accepted
- *               2 = Reject call, wait cbdelay, then call back
- *               3 = Reject call
- *               4 = Wait cbdelay, then call back
- *               5 = No appropriate interface for this call,
- *                   would eventually match if CID was longer.
- */
-
-int
-isdn_net_find_icall(int di, int ch, int idx, setup_parm *setup)
-{
-	char *eaz;
-	int si1;
-	int si2;
-	int ematch;
-	int wret;
-	int swapped;
-	int sidx = 0;
-	u_long flags;
-	isdn_net_dev *p;
-	isdn_net_phone *n;
-	char nr[ISDN_MSNLEN];
-	char *my_eaz;
-
-	/* Search name in netdev-chain */
-	if (!setup->phone[0]) {
-		nr[0] = '0';
-		nr[1] = '\0';
-		printk(KERN_INFO "isdn_net: Incoming call without OAD, assuming '0'\n");
-	} else
-		strlcpy(nr, setup->phone, ISDN_MSNLEN);
-	si1 = (int) setup->si1;
-	si2 = (int) setup->si2;
-	if (!setup->eazmsn[0]) {
-		printk(KERN_WARNING "isdn_net: Incoming call without CPN, assuming '0'\n");
-		eaz = "0";
-	} else
-		eaz = setup->eazmsn;
-	if (dev->net_verbose > 1)
-		printk(KERN_INFO "isdn_net: call from %s,%d,%d -> %s\n", nr, si1, si2, eaz);
-	/* Accept DATA and VOICE calls at this stage
-	 * local eaz is checked later for allowed call types
-	 */
-	if ((si1 != 7) && (si1 != 1)) {
-		if (dev->net_verbose > 1)
-			printk(KERN_INFO "isdn_net: Service-Indicator not 1 or 7, ignored\n");
-		return 0;
-	}
-	n = (isdn_net_phone *) 0;
-	p = dev->netdev;
-	ematch = wret = swapped = 0;
-#ifdef ISDN_DEBUG_NET_ICALL
-	printk(KERN_DEBUG "n_fi: di=%d ch=%d idx=%d usg=%d\n", di, ch, idx,
-	       dev->usage[idx]);
-#endif
-	while (p) {
-		int matchret;
-		isdn_net_local *lp = p->local;
-
-		/* If last check has triggered as binding-swap, revert it */
-		switch (swapped) {
-		case 2:
-			isdn_net_swap_usage(idx, sidx);
-			/* fall through */
-		case 1:
-			isdn_net_swapbind(di);
-			break;
-		}
-		swapped = 0;
-		/* check acceptable call types for DOV */
-		my_eaz = isdn_map_eaz2msn(lp->msn, di);
-		if (si1 == 1) { /* it's a DOV call, check if we allow it */
-			if (*my_eaz == 'v' || *my_eaz == 'V' ||
-			    *my_eaz == 'b' || *my_eaz == 'B')
-				my_eaz++; /* skip to allow a match */
-			else
-				my_eaz = NULL; /* force non match */
-		} else { /* it's a DATA call, check if we allow it */
-			if (*my_eaz == 'b' || *my_eaz == 'B')
-				my_eaz++; /* skip to allow a match */
-		}
-		if (my_eaz)
-			matchret = isdn_msncmp(eaz, my_eaz);
-		else
-			matchret = 1;
-		if (!matchret)
-			ematch = 1;
-
-		/* Remember if more numbers eventually can match */
-		if (matchret > wret)
-			wret = matchret;
-#ifdef ISDN_DEBUG_NET_ICALL
-		printk(KERN_DEBUG "n_fi: if='%s', l.msn=%s, l.flags=%d, l.dstate=%d\n",
-		       p->dev->name, lp->msn, lp->flags, lp->dialstate);
-#endif
-		if ((!matchret) &&                                        /* EAZ is matching   */
-		    (((!(lp->flags & ISDN_NET_CONNECTED)) &&              /* but not connected */
-		      (USG_NONE(dev->usage[idx]))) ||                     /* and ch. unused or */
-		     ((((lp->dialstate == 4) || (lp->dialstate == 12)) && /* if dialing        */
-		       (!(lp->flags & ISDN_NET_CALLBACK)))                /* but no callback   */
-			     )))
-		{
-#ifdef ISDN_DEBUG_NET_ICALL
-			printk(KERN_DEBUG "n_fi: match1, pdev=%d pch=%d\n",
-			       lp->pre_device, lp->pre_channel);
-#endif
-			if (dev->usage[idx] & ISDN_USAGE_EXCLUSIVE) {
-				if ((lp->pre_channel != ch) ||
-				    (lp->pre_device != di)) {
-					/* Here we got a problem:
-					 * If using an ICN-Card, an incoming call is always signaled on
-					 * on the first channel of the card, if both channels are
-					 * down. However this channel may be bound exclusive. If the
-					 * second channel is free, this call should be accepted.
-					 * The solution is horribly but it runs, so what:
-					 * We exchange the exclusive bindings of the two channels, the
-					 * corresponding variables in the interface-structs.
-					 */
-					if (ch == 0) {
-						sidx = isdn_dc2minor(di, 1);
-#ifdef ISDN_DEBUG_NET_ICALL
-						printk(KERN_DEBUG "n_fi: ch is 0\n");
-#endif
-						if (USG_NONE(dev->usage[sidx])) {
-							/* Second Channel is free, now see if it is bound
-							 * exclusive too. */
-							if (dev->usage[sidx] & ISDN_USAGE_EXCLUSIVE) {
-#ifdef ISDN_DEBUG_NET_ICALL
-								printk(KERN_DEBUG "n_fi: 2nd channel is down and bound\n");
-#endif
-								/* Yes, swap bindings only, if the original
-								 * binding is bound to channel 1 of this driver */
-								if ((lp->pre_device == di) &&
-								    (lp->pre_channel == 1)) {
-									isdn_net_swapbind(di);
-									swapped = 1;
-								} else {
-									/* ... else iterate next device */
-									p = (isdn_net_dev *) p->next;
-									continue;
-								}
-							} else {
-#ifdef ISDN_DEBUG_NET_ICALL
-								printk(KERN_DEBUG "n_fi: 2nd channel is down and unbound\n");
-#endif
-								/* No, swap always and swap excl-usage also */
-								isdn_net_swap_usage(idx, sidx);
-								isdn_net_swapbind(di);
-								swapped = 2;
-							}
-							/* Now check for exclusive binding again */
-#ifdef ISDN_DEBUG_NET_ICALL
-							printk(KERN_DEBUG "n_fi: final check\n");
-#endif
-							if ((dev->usage[idx] & ISDN_USAGE_EXCLUSIVE) &&
-							    ((lp->pre_channel != ch) ||
-							     (lp->pre_device != di))) {
-#ifdef ISDN_DEBUG_NET_ICALL
-								printk(KERN_DEBUG "n_fi: final check failed\n");
-#endif
-								p = (isdn_net_dev *) p->next;
-								continue;
-							}
-						}
-					} else {
-						/* We are already on the second channel, so nothing to do */
-#ifdef ISDN_DEBUG_NET_ICALL
-						printk(KERN_DEBUG "n_fi: already on 2nd channel\n");
-#endif
-					}
-				}
-			}
-#ifdef ISDN_DEBUG_NET_ICALL
-			printk(KERN_DEBUG "n_fi: match2\n");
-#endif
-			n = lp->phone[0];
-			if (lp->flags & ISDN_NET_SECURE) {
-				while (n) {
-					if (!isdn_msncmp(nr, n->num))
-						break;
-					n = (isdn_net_phone *) n->next;
-				}
-			}
-			if (n || (!(lp->flags & ISDN_NET_SECURE))) {
-#ifdef ISDN_DEBUG_NET_ICALL
-				printk(KERN_DEBUG "n_fi: match3\n");
-#endif
-				/* matching interface found */
-
-				/*
-				 * Is the state STOPPED?
-				 * If so, no dialin is allowed,
-				 * so reject actively.
-				 * */
-				if (ISDN_NET_DIALMODE(*lp) == ISDN_NET_DM_OFF) {
-					printk(KERN_INFO "incoming call, interface %s `stopped' -> rejected\n",
-					       p->dev->name);
-					return 3;
-				}
-				/*
-				 * Is the interface up?
-				 * If not, reject the call actively.
-				 */
-				if (!isdn_net_device_started(p)) {
-					printk(KERN_INFO "%s: incoming call, interface down -> rejected\n",
-					       p->dev->name);
-					return 3;
-				}
-				/* Interface is up, now see if it's a slave. If so, see if
-				 * it's master and parent slave is online. If not, reject the call.
-				 */
-				if (lp->master) {
-					isdn_net_local *mlp = ISDN_MASTER_PRIV(lp);
-					printk(KERN_DEBUG "ICALLslv: %s\n", p->dev->name);
-					printk(KERN_DEBUG "master=%s\n", lp->master->name);
-					if (mlp->flags & ISDN_NET_CONNECTED) {
-						printk(KERN_DEBUG "master online\n");
-						/* Master is online, find parent-slave (master if first slave) */
-						while (mlp->slave) {
-							if (ISDN_SLAVE_PRIV(mlp) == lp)
-								break;
-							mlp = ISDN_SLAVE_PRIV(mlp);
-						}
-					} else
-						printk(KERN_DEBUG "master offline\n");
-					/* Found parent, if it's offline iterate next device */
-					printk(KERN_DEBUG "mlpf: %d\n", mlp->flags & ISDN_NET_CONNECTED);
-					if (!(mlp->flags & ISDN_NET_CONNECTED)) {
-						p = (isdn_net_dev *) p->next;
-						continue;
-					}
-				}
-				if (lp->flags & ISDN_NET_CALLBACK) {
-					int chi;
-					/*
-					 * Is the state MANUAL?
-					 * If so, no callback can be made,
-					 * so reject actively.
-					 * */
-					if (ISDN_NET_DIALMODE(*lp) == ISDN_NET_DM_OFF) {
-						printk(KERN_INFO "incoming call for callback, interface %s `off' -> rejected\n",
-						       p->dev->name);
-						return 3;
-					}
-					printk(KERN_DEBUG "%s: call from %s -> %s, start callback\n",
-					       p->dev->name, nr, eaz);
-					if (lp->phone[1]) {
-						/* Grab a free ISDN-Channel */
-						spin_lock_irqsave(&dev->lock, flags);
-						if ((chi =
-						     isdn_get_free_channel(
-							     ISDN_USAGE_NET,
-							     lp->l2_proto,
-							     lp->l3_proto,
-							     lp->pre_device,
-							     lp->pre_channel,
-							     lp->msn)
-							    ) < 0) {
-
-							printk(KERN_WARNING "isdn_net_find_icall: No channel for %s\n",
-							       p->dev->name);
-							spin_unlock_irqrestore(&dev->lock, flags);
-							return 0;
-						}
-						/* Setup dialstate. */
-						lp->dtimer = 0;
-						lp->dialstate = 11;
-						/* Connect interface with channel */
-						isdn_net_bind_channel(lp, chi);
-#ifdef CONFIG_ISDN_PPP
-						if (lp->p_encap == ISDN_NET_ENCAP_SYNCPPP)
-							if (isdn_ppp_bind(lp) < 0) {
-								spin_unlock_irqrestore(&dev->lock, flags);
-								isdn_net_unbind_channel(lp);
-								return 0;
-							}
-#endif
-						spin_unlock_irqrestore(&dev->lock, flags);
-						/* Initiate dialing by returning 2 or 4 */
-						return (lp->flags & ISDN_NET_CBHUP) ? 2 : 4;
-					} else
-						printk(KERN_WARNING "isdn_net: %s: No phone number\n",
-						       p->dev->name);
-					return 0;
-				} else {
-					printk(KERN_DEBUG "%s: call from %s -> %s accepted\n",
-					       p->dev->name, nr, eaz);
-					/* if this interface is dialing, it does it probably on a different
-					   device, so free this device */
-					if ((lp->dialstate == 4) || (lp->dialstate == 12)) {
-#ifdef CONFIG_ISDN_PPP
-						if (lp->p_encap == ISDN_NET_ENCAP_SYNCPPP)
-							isdn_ppp_free(lp);
-#endif
-						isdn_net_lp_disconnected(lp);
-						isdn_free_channel(lp->isdn_device, lp->isdn_channel,
-								  ISDN_USAGE_NET);
-					}
-					spin_lock_irqsave(&dev->lock, flags);
-					dev->usage[idx] &= ISDN_USAGE_EXCLUSIVE;
-					dev->usage[idx] |= ISDN_USAGE_NET;
-					strcpy(dev->num[idx], nr);
-					isdn_info_update();
-					dev->st_netdev[idx] = lp->netdev;
-					lp->isdn_device = di;
-					lp->isdn_channel = ch;
-					lp->ppp_slot = -1;
-					lp->flags |= ISDN_NET_CONNECTED;
-					lp->dialstate = 7;
-					lp->dtimer = 0;
-					lp->outgoing = 0;
-					lp->huptimer = 0;
-					lp->hupflags |= ISDN_WAITCHARGE;
-					lp->hupflags &= ~ISDN_HAVECHARGE;
-#ifdef CONFIG_ISDN_PPP
-					if (lp->p_encap == ISDN_NET_ENCAP_SYNCPPP) {
-						if (isdn_ppp_bind(lp) < 0) {
-							isdn_net_unbind_channel(lp);
-							spin_unlock_irqrestore(&dev->lock, flags);
-							return 0;
-						}
-					}
-#endif
-					spin_unlock_irqrestore(&dev->lock, flags);
-					return 1;
-				}
-			}
-		}
-		p = (isdn_net_dev *) p->next;
-	}
-	/* If none of configured EAZ/MSN matched and not verbose, be silent */
-	if (!ematch || dev->net_verbose)
-		printk(KERN_INFO "isdn_net: call from %s -> %d %s ignored\n", nr, di, eaz);
-	return (wret == 2) ? 5 : 0;
-}
-
-/*
- * Search list of net-interfaces for an interface with given name.
- */
-isdn_net_dev *
-isdn_net_findif(char *name)
-{
-	isdn_net_dev *p = dev->netdev;
-
-	while (p) {
-		if (!strcmp(p->dev->name, name))
-			return p;
-		p = (isdn_net_dev *) p->next;
-	}
-	return (isdn_net_dev *) NULL;
-}
-
-/*
- * Force a net-interface to dial out.
- * This is called from the userlevel-routine below or
- * from isdn_net_start_xmit().
- */
-static int
-isdn_net_force_dial_lp(isdn_net_local *lp)
-{
-	if ((!(lp->flags & ISDN_NET_CONNECTED)) && !lp->dialstate) {
-		int chi;
-		if (lp->phone[1]) {
-			ulong flags;
-
-			/* Grab a free ISDN-Channel */
-			spin_lock_irqsave(&dev->lock, flags);
-			if ((chi = isdn_get_free_channel(
-				     ISDN_USAGE_NET,
-				     lp->l2_proto,
-				     lp->l3_proto,
-				     lp->pre_device,
-				     lp->pre_channel,
-				     lp->msn)) < 0) {
-				printk(KERN_WARNING "isdn_net_force_dial: No channel for %s\n",
-				       lp->netdev->dev->name);
-				spin_unlock_irqrestore(&dev->lock, flags);
-				return -EAGAIN;
-			}
-			lp->dialstate = 1;
-			/* Connect interface with channel */
-			isdn_net_bind_channel(lp, chi);
-#ifdef CONFIG_ISDN_PPP
-			if (lp->p_encap == ISDN_NET_ENCAP_SYNCPPP)
-				if (isdn_ppp_bind(lp) < 0) {
-					isdn_net_unbind_channel(lp);
-					spin_unlock_irqrestore(&dev->lock, flags);
-					return -EAGAIN;
-				}
-#endif
-			/* Initiate dialing */
-			spin_unlock_irqrestore(&dev->lock, flags);
-			isdn_net_dial();
-			return 0;
-		} else
-			return -EINVAL;
-	} else
-		return -EBUSY;
-}
-
-/*
- * This is called from certain upper protocol layers (multilink ppp
- * and x25iface encapsulation module) that want to initiate dialing
- * themselves.
- */
-int
-isdn_net_dial_req(isdn_net_local *lp)
-{
-	/* is there a better error code? */
-	if (!(ISDN_NET_DIALMODE(*lp) == ISDN_NET_DM_AUTO)) return -EBUSY;
-
-	return isdn_net_force_dial_lp(lp);
-}
-
-/*
- * Force a net-interface to dial out.
- * This is always called from within userspace (ISDN_IOCTL_NET_DIAL).
- */
-int
-isdn_net_force_dial(char *name)
-{
-	isdn_net_dev *p = isdn_net_findif(name);
-
-	if (!p)
-		return -ENODEV;
-	return (isdn_net_force_dial_lp(p->local));
-}
-
-/* The ISDN-specific entries in the device structure. */
-static const struct net_device_ops isdn_netdev_ops = {
-	.ndo_init	      = isdn_net_init,
-	.ndo_open	      = isdn_net_open,
-	.ndo_stop	      = isdn_net_close,
-	.ndo_do_ioctl	      = isdn_net_ioctl,
-
-	.ndo_start_xmit	      = isdn_net_start_xmit,
-	.ndo_get_stats	      = isdn_net_get_stats,
-	.ndo_tx_timeout	      = isdn_net_tx_timeout,
-};
-
-/*
- * Helper for alloc_netdev()
- */
-static void _isdn_setup(struct net_device *dev)
-{
-	isdn_net_local *lp = netdev_priv(dev);
-
-	ether_setup(dev);
-
-	/* Setup the generic properties */
-	dev->flags = IFF_NOARP | IFF_POINTOPOINT;
-
-	/* isdn prepends a header in the tx path, can't share skbs */
-	dev->priv_flags &= ~IFF_TX_SKB_SHARING;
-	dev->header_ops = NULL;
-	dev->netdev_ops = &isdn_netdev_ops;
-
-	/* for clients with MPPP maybe higher values better */
-	dev->tx_queue_len = 30;
-
-	lp->p_encap = ISDN_NET_ENCAP_RAWIP;
-	lp->magic = ISDN_NET_MAGIC;
-	lp->last = lp;
-	lp->next = lp;
-	lp->isdn_device = -1;
-	lp->isdn_channel = -1;
-	lp->pre_device = -1;
-	lp->pre_channel = -1;
-	lp->exclusive = -1;
-	lp->ppp_slot = -1;
-	lp->pppbind = -1;
-	skb_queue_head_init(&lp->super_tx_queue);
-	lp->l2_proto = ISDN_PROTO_L2_X75I;
-	lp->l3_proto = ISDN_PROTO_L3_TRANS;
-	lp->triggercps = 6000;
-	lp->slavedelay = 10 * HZ;
-	lp->hupflags = ISDN_INHUP;	/* Do hangup even on incoming calls */
-	lp->onhtime = 10;	/* Default hangup-time for saving costs */
-	lp->dialmax = 1;
-	/* Hangup before Callback, manual dial */
-	lp->flags = ISDN_NET_CBHUP | ISDN_NET_DM_MANUAL;
-	lp->cbdelay = 25;	/* Wait 5 secs before Callback */
-	lp->dialtimeout = -1;  /* Infinite Dial-Timeout */
-	lp->dialwait = 5 * HZ; /* Wait 5 sec. after failed dial */
-	lp->dialstarted = 0;   /* Jiffies of last dial-start */
-	lp->dialwait_timer = 0;  /* Jiffies of earliest next dial-start */
-}
-
-/*
- * Allocate a new network-interface and initialize its data structures.
- */
-char *
-isdn_net_new(char *name, struct net_device *master)
-{
-	isdn_net_dev *netdev;
-
-	/* Avoid creating an existing interface */
-	if (isdn_net_findif(name)) {
-		printk(KERN_WARNING "isdn_net: interface %s already exists\n", name);
-		return NULL;
-	}
-	if (name == NULL)
-		return NULL;
-	if (!(netdev = kzalloc(sizeof(isdn_net_dev), GFP_KERNEL))) {
-		printk(KERN_WARNING "isdn_net: Could not allocate net-device\n");
-		return NULL;
-	}
-	netdev->dev = alloc_netdev(sizeof(isdn_net_local), name,
-				   NET_NAME_UNKNOWN, _isdn_setup);
-	if (!netdev->dev) {
-		printk(KERN_WARNING "isdn_net: Could not allocate network device\n");
-		kfree(netdev);
-		return NULL;
-	}
-	netdev->local = netdev_priv(netdev->dev);
-
-	if (master) {
-		/* Device shall be a slave */
-		struct net_device *p = MASTER_TO_SLAVE(master);
-		struct net_device *q = master;
-
-		netdev->local->master = master;
-		/* Put device at end of slave-chain */
-		while (p) {
-			q = p;
-			p = MASTER_TO_SLAVE(p);
-		}
-		MASTER_TO_SLAVE(q) = netdev->dev;
-	} else {
-		/* Device shall be a master */
-		/*
-		 * Watchdog timer (currently) for master only.
-		 */
-		netdev->dev->watchdog_timeo = ISDN_NET_TX_TIMEOUT;
-		if (register_netdev(netdev->dev) != 0) {
-			printk(KERN_WARNING "isdn_net: Could not register net-device\n");
-			free_netdev(netdev->dev);
-			kfree(netdev);
-			return NULL;
-		}
-	}
-	netdev->queue = netdev->local;
-	spin_lock_init(&netdev->queue_lock);
-
-	netdev->local->netdev = netdev;
-
-	INIT_WORK(&netdev->local->tqueue, isdn_net_softint);
-	spin_lock_init(&netdev->local->xmit_lock);
-
-	/* Put into to netdev-chain */
-	netdev->next = (void *) dev->netdev;
-	dev->netdev = netdev;
-	return netdev->dev->name;
-}
-
-char *
-isdn_net_newslave(char *parm)
-{
-	char *p = strchr(parm, ',');
-	isdn_net_dev *n;
-	char newname[10];
-
-	if (p) {
-		/* Slave-Name MUST not be empty or overflow 'newname' */
-		if (strscpy(newname, p + 1, sizeof(newname)) <= 0)
-			return NULL;
-		*p = 0;
-		/* Master must already exist */
-		if (!(n = isdn_net_findif(parm)))
-			return NULL;
-		/* Master must be a real interface, not a slave */
-		if (n->local->master)
-			return NULL;
-		/* Master must not be started yet */
-		if (isdn_net_device_started(n))
-			return NULL;
-		return (isdn_net_new(newname, n->dev));
-	}
-	return NULL;
-}
-
-/*
- * Set interface-parameters.
- * Always set all parameters, so the user-level application is responsible
- * for not overwriting existing setups. It has to get the current
- * setup first, if only selected parameters are to be changed.
- */
-int
-isdn_net_setcfg(isdn_net_ioctl_cfg *cfg)
-{
-	isdn_net_dev *p = isdn_net_findif(cfg->name);
-	ulong features;
-	int i;
-	int drvidx;
-	int chidx;
-	char drvid[25];
-
-	if (p) {
-		isdn_net_local *lp = p->local;
-
-		/* See if any registered driver supports the features we want */
-		features = ((1 << cfg->l2_proto) << ISDN_FEATURE_L2_SHIFT) |
-			((1 << cfg->l3_proto) << ISDN_FEATURE_L3_SHIFT);
-		for (i = 0; i < ISDN_MAX_DRIVERS; i++)
-			if (dev->drv[i])
-				if ((dev->drv[i]->interface->features & features) == features)
-					break;
-		if (i == ISDN_MAX_DRIVERS) {
-			printk(KERN_WARNING "isdn_net: No driver with selected features\n");
-			return -ENODEV;
-		}
-		if (lp->p_encap != cfg->p_encap) {
-#ifdef CONFIG_ISDN_X25
-			struct concap_proto *cprot = p->cprot;
-#endif
-			if (isdn_net_device_started(p)) {
-				printk(KERN_WARNING "%s: cannot change encap when if is up\n",
-				       p->dev->name);
-				return -EBUSY;
-			}
-#ifdef CONFIG_ISDN_X25
-			if (cprot && cprot->pops)
-				cprot->pops->proto_del(cprot);
-			p->cprot = NULL;
-			lp->dops = NULL;
-			/* ... ,  prepare for configuration of new one ... */
-			switch (cfg->p_encap) {
-			case ISDN_NET_ENCAP_X25IFACE:
-				lp->dops = &isdn_concap_reliable_dl_dops;
-			}
-			/* ... and allocate new one ... */
-			p->cprot = isdn_concap_new(cfg->p_encap);
-			/* p -> cprot == NULL now if p_encap is not supported
-			   by means of the concap_proto mechanism */
-			/* the protocol is not configured yet; this will
-			   happen later when isdn_net_reset() is called */
-#endif
-		}
-		switch (cfg->p_encap) {
-		case ISDN_NET_ENCAP_SYNCPPP:
-#ifndef CONFIG_ISDN_PPP
-			printk(KERN_WARNING "%s: SyncPPP support not configured\n",
-			       p->dev->name);
-			return -EINVAL;
-#else
-			p->dev->type = ARPHRD_PPP;	/* change ARP type */
-			p->dev->addr_len = 0;
-#endif
-			break;
-		case ISDN_NET_ENCAP_X25IFACE:
-#ifndef CONFIG_ISDN_X25
-			printk(KERN_WARNING "%s: isdn-x25 support not configured\n",
-			       p->dev->name);
-			return -EINVAL;
-#else
-			p->dev->type = ARPHRD_X25;	/* change ARP type */
-			p->dev->addr_len = 0;
-#endif
-			break;
-		case ISDN_NET_ENCAP_CISCOHDLCK:
-			break;
-		default:
-			if (cfg->p_encap >= 0 &&
-			    cfg->p_encap <= ISDN_NET_ENCAP_MAX_ENCAP)
-				break;
-			printk(KERN_WARNING
-			       "%s: encapsulation protocol %d not supported\n",
-			       p->dev->name, cfg->p_encap);
-			return -EINVAL;
-		}
-		if (strlen(cfg->drvid)) {
-			/* A bind has been requested ... */
-			char *c,
-				*e;
-
-			if (strnlen(cfg->drvid, sizeof(cfg->drvid)) ==
-			    sizeof(cfg->drvid))
-				return -EINVAL;
-			drvidx = -1;
-			chidx = -1;
-			strcpy(drvid, cfg->drvid);
-			if ((c = strchr(drvid, ','))) {
-				/* The channel-number is appended to the driver-Id with a comma */
-				chidx = (int) simple_strtoul(c + 1, &e, 10);
-				if (e == c)
-					chidx = -1;
-				*c = '\0';
-			}
-			for (i = 0; i < ISDN_MAX_DRIVERS; i++)
-				/* Lookup driver-Id in array */
-				if (!(strcmp(dev->drvid[i], drvid))) {
-					drvidx = i;
-					break;
-				}
-			if ((drvidx == -1) || (chidx == -1))
-				/* Either driver-Id or channel-number invalid */
-				return -ENODEV;
-		} else {
-			/* Parameters are valid, so get them */
-			drvidx = lp->pre_device;
-			chidx = lp->pre_channel;
-		}
-		if (cfg->exclusive > 0) {
-			unsigned long flags;
-
-			/* If binding is exclusive, try to grab the channel */
-			spin_lock_irqsave(&dev->lock, flags);
-			if ((i = isdn_get_free_channel(ISDN_USAGE_NET,
-						       lp->l2_proto, lp->l3_proto, drvidx,
-						       chidx, lp->msn)) < 0) {
-				/* Grab failed, because desired channel is in use */
-				lp->exclusive = -1;
-				spin_unlock_irqrestore(&dev->lock, flags);
-				return -EBUSY;
-			}
-			/* All went ok, so update isdninfo */
-			dev->usage[i] = ISDN_USAGE_EXCLUSIVE;
-			isdn_info_update();
-			spin_unlock_irqrestore(&dev->lock, flags);
-			lp->exclusive = i;
-		} else {
-			/* Non-exclusive binding or unbind. */
-			lp->exclusive = -1;
-			if ((lp->pre_device != -1) && (cfg->exclusive == -1)) {
-				isdn_unexclusive_channel(lp->pre_device, lp->pre_channel);
-				isdn_free_channel(lp->pre_device, lp->pre_channel, ISDN_USAGE_NET);
-				drvidx = -1;
-				chidx = -1;
-			}
-		}
-		strlcpy(lp->msn, cfg->eaz, sizeof(lp->msn));
-		lp->pre_device = drvidx;
-		lp->pre_channel = chidx;
-		lp->onhtime = cfg->onhtime;
-		lp->charge = cfg->charge;
-		lp->l2_proto = cfg->l2_proto;
-		lp->l3_proto = cfg->l3_proto;
-		lp->cbdelay = cfg->cbdelay;
-		lp->dialmax = cfg->dialmax;
-		lp->triggercps = cfg->triggercps;
-		lp->slavedelay = cfg->slavedelay * HZ;
-		lp->pppbind = cfg->pppbind;
-		lp->dialtimeout = cfg->dialtimeout >= 0 ? cfg->dialtimeout * HZ : -1;
-		lp->dialwait = cfg->dialwait * HZ;
-		if (cfg->secure)
-			lp->flags |= ISDN_NET_SECURE;
-		else
-			lp->flags &= ~ISDN_NET_SECURE;
-		if (cfg->cbhup)
-			lp->flags |= ISDN_NET_CBHUP;
-		else
-			lp->flags &= ~ISDN_NET_CBHUP;
-		switch (cfg->callback) {
-		case 0:
-			lp->flags &= ~(ISDN_NET_CALLBACK | ISDN_NET_CBOUT);
-			break;
-		case 1:
-			lp->flags |= ISDN_NET_CALLBACK;
-			lp->flags &= ~ISDN_NET_CBOUT;
-			break;
-		case 2:
-			lp->flags |= ISDN_NET_CBOUT;
-			lp->flags &= ~ISDN_NET_CALLBACK;
-			break;
-		}
-		lp->flags &= ~ISDN_NET_DIALMODE_MASK;	/* first all bits off */
-		if (cfg->dialmode && !(cfg->dialmode & ISDN_NET_DIALMODE_MASK)) {
-			/* old isdnctrl version, where only 0 or 1 is given */
-			printk(KERN_WARNING
-			       "Old isdnctrl version detected! Please update.\n");
-			lp->flags |= ISDN_NET_DM_OFF; /* turn on `off' bit */
-		}
-		else {
-			lp->flags |= cfg->dialmode;  /* turn on selected bits */
-		}
-		if (cfg->chargehup)
-			lp->hupflags |= ISDN_CHARGEHUP;
-		else
-			lp->hupflags &= ~ISDN_CHARGEHUP;
-		if (cfg->ihup)
-			lp->hupflags |= ISDN_INHUP;
-		else
-			lp->hupflags &= ~ISDN_INHUP;
-		if (cfg->chargeint > 10) {
-			lp->hupflags |= ISDN_CHARGEHUP | ISDN_HAVECHARGE | ISDN_MANCHARGE;
-			lp->chargeint = cfg->chargeint * HZ;
-		}
-		if (cfg->p_encap != lp->p_encap) {
-			if (cfg->p_encap == ISDN_NET_ENCAP_RAWIP) {
-				p->dev->header_ops = NULL;
-				p->dev->flags = IFF_NOARP | IFF_POINTOPOINT;
-			} else {
-				p->dev->header_ops = &isdn_header_ops;
-				if (cfg->p_encap == ISDN_NET_ENCAP_ETHER)
-					p->dev->flags = IFF_BROADCAST | IFF_MULTICAST;
-				else
-					p->dev->flags = IFF_NOARP | IFF_POINTOPOINT;
-			}
-		}
-		lp->p_encap = cfg->p_encap;
-		return 0;
-	}
-	return -ENODEV;
-}
-
-/*
- * Perform get-interface-parameters.ioctl
- */
-int
-isdn_net_getcfg(isdn_net_ioctl_cfg *cfg)
-{
-	isdn_net_dev *p = isdn_net_findif(cfg->name);
-
-	if (p) {
-		isdn_net_local *lp = p->local;
-
-		strcpy(cfg->eaz, lp->msn);
-		cfg->exclusive = lp->exclusive;
-		if (lp->pre_device >= 0) {
-			sprintf(cfg->drvid, "%s,%d", dev->drvid[lp->pre_device],
-				lp->pre_channel);
-		} else
-			cfg->drvid[0] = '\0';
-		cfg->onhtime = lp->onhtime;
-		cfg->charge = lp->charge;
-		cfg->l2_proto = lp->l2_proto;
-		cfg->l3_proto = lp->l3_proto;
-		cfg->p_encap = lp->p_encap;
-		cfg->secure = (lp->flags & ISDN_NET_SECURE) ? 1 : 0;
-		cfg->callback = 0;
-		if (lp->flags & ISDN_NET_CALLBACK)
-			cfg->callback = 1;
-		if (lp->flags & ISDN_NET_CBOUT)
-			cfg->callback = 2;
-		cfg->cbhup = (lp->flags & ISDN_NET_CBHUP) ? 1 : 0;
-		cfg->dialmode = lp->flags & ISDN_NET_DIALMODE_MASK;
-		cfg->chargehup = (lp->hupflags & ISDN_CHARGEHUP) ? 1 : 0;
-		cfg->ihup = (lp->hupflags & ISDN_INHUP) ? 1 : 0;
-		cfg->cbdelay = lp->cbdelay;
-		cfg->dialmax = lp->dialmax;
-		cfg->triggercps = lp->triggercps;
-		cfg->slavedelay = lp->slavedelay / HZ;
-		cfg->chargeint = (lp->hupflags & ISDN_CHARGEHUP) ?
-			(lp->chargeint / HZ) : 0;
-		cfg->pppbind = lp->pppbind;
-		cfg->dialtimeout = lp->dialtimeout >= 0 ? lp->dialtimeout / HZ : -1;
-		cfg->dialwait = lp->dialwait / HZ;
-		if (lp->slave) {
-			if (strlen(lp->slave->name) >= 10)
-				strcpy(cfg->slave, "too-long");
-			else
-				strcpy(cfg->slave, lp->slave->name);
-		} else
-			cfg->slave[0] = '\0';
-		if (lp->master) {
-			if (strlen(lp->master->name) >= 10)
-				strcpy(cfg->master, "too-long");
-			else
-				strcpy(cfg->master, lp->master->name);
-		} else
-			cfg->master[0] = '\0';
-		return 0;
-	}
-	return -ENODEV;
-}
-
-/*
- * Add a phone-number to an interface.
- */
-int
-isdn_net_addphone(isdn_net_ioctl_phone *phone)
-{
-	isdn_net_dev *p = isdn_net_findif(phone->name);
-	isdn_net_phone *n;
-
-	if (p) {
-		if (!(n = kmalloc(sizeof(isdn_net_phone), GFP_KERNEL)))
-			return -ENOMEM;
-		strlcpy(n->num, phone->phone, sizeof(n->num));
-		n->next = p->local->phone[phone->outgoing & 1];
-		p->local->phone[phone->outgoing & 1] = n;
-		return 0;
-	}
-	return -ENODEV;
-}
-
-/*
- * Copy a string of all phone-numbers of an interface to user space.
- * This might sleep and must be called with the isdn semaphore down.
- */
-int
-isdn_net_getphones(isdn_net_ioctl_phone *phone, char __user *phones)
-{
-	isdn_net_dev *p = isdn_net_findif(phone->name);
-	int inout = phone->outgoing & 1;
-	int more = 0;
-	int count = 0;
-	isdn_net_phone *n;
-
-	if (!p)
-		return -ENODEV;
-	inout &= 1;
-	for (n = p->local->phone[inout]; n; n = n->next) {
-		if (more) {
-			put_user(' ', phones++);
-			count++;
-		}
-		if (copy_to_user(phones, n->num, strlen(n->num) + 1)) {
-			return -EFAULT;
-		}
-		phones += strlen(n->num);
-		count += strlen(n->num);
-		more = 1;
-	}
-	put_user(0, phones);
-	count++;
-	return count;
-}
-
-/*
- * Copy a string containing the peer's phone number of a connected interface
- * to user space.
- */
-int
-isdn_net_getpeer(isdn_net_ioctl_phone *phone, isdn_net_ioctl_phone __user *peer)
-{
-	isdn_net_dev *p = isdn_net_findif(phone->name);
-	int ch, dv, idx;
-
-	if (!p)
-		return -ENODEV;
-	/*
-	 * Theoretical race: while this executes, the remote number might
-	 * become invalid (hang up) or change (new connection), resulting
-	 * in (partially) wrong number copied to user. This race
-	 * currently ignored.
-	 */
-	ch = p->local->isdn_channel;
-	dv = p->local->isdn_device;
-	if (ch < 0 && dv < 0)
-		return -ENOTCONN;
-	idx = isdn_dc2minor(dv, ch);
-	if (idx < 0)
-		return -ENODEV;
-	/* for pre-bound channels, we need this extra check */
-	if (strncmp(dev->num[idx], "???", 3) == 0)
-		return -ENOTCONN;
-	strncpy(phone->phone, dev->num[idx], ISDN_MSNLEN);
-	phone->outgoing = USG_OUTGOING(dev->usage[idx]);
-	if (copy_to_user(peer, phone, sizeof(*peer)))
-		return -EFAULT;
-	return 0;
-}
-/*
- * Delete a phone-number from an interface.
- */
-int
-isdn_net_delphone(isdn_net_ioctl_phone *phone)
-{
-	isdn_net_dev *p = isdn_net_findif(phone->name);
-	int inout = phone->outgoing & 1;
-	isdn_net_phone *n;
-	isdn_net_phone *m;
-
-	if (p) {
-		n = p->local->phone[inout];
-		m = NULL;
-		while (n) {
-			if (!strcmp(n->num, phone->phone)) {
-				if (p->local->dial == n)
-					p->local->dial = n->next;
-				if (m)
-					m->next = n->next;
-				else
-					p->local->phone[inout] = n->next;
-				kfree(n);
-				return 0;
-			}
-			m = n;
-			n = (isdn_net_phone *) n->next;
-		}
-		return -EINVAL;
-	}
-	return -ENODEV;
-}
-
-/*
- * Delete all phone-numbers of an interface.
- */
-static int
-isdn_net_rmallphone(isdn_net_dev *p)
-{
-	isdn_net_phone *n;
-	isdn_net_phone *m;
-	int i;
-
-	for (i = 0; i < 2; i++) {
-		n = p->local->phone[i];
-		while (n) {
-			m = n->next;
-			kfree(n);
-			n = m;
-		}
-		p->local->phone[i] = NULL;
-	}
-	p->local->dial = NULL;
-	return 0;
-}
-
-/*
- * Force a hangup of a network-interface.
- */
-int
-isdn_net_force_hangup(char *name)
-{
-	isdn_net_dev *p = isdn_net_findif(name);
-	struct net_device *q;
-
-	if (p) {
-		if (p->local->isdn_device < 0)
-			return 1;
-		q = p->local->slave;
-		/* If this interface has slaves, do a hangup for them also. */
-		while (q) {
-			isdn_net_hangup(q);
-			q = MASTER_TO_SLAVE(q);
-		}
-		isdn_net_hangup(p->dev);
-		return 0;
-	}
-	return -ENODEV;
-}
-
-/*
- * Helper-function for isdn_net_rm: Do the real work.
- */
-static int
-isdn_net_realrm(isdn_net_dev *p, isdn_net_dev *q)
-{
-	u_long flags;
-
-	if (isdn_net_device_started(p)) {
-		return -EBUSY;
-	}
-#ifdef CONFIG_ISDN_X25
-	if (p->cprot && p->cprot->pops)
-		p->cprot->pops->proto_del(p->cprot);
-#endif
-	/* Free all phone-entries */
-	isdn_net_rmallphone(p);
-	/* If interface is bound exclusive, free channel-usage */
-	if (p->local->exclusive != -1)
-		isdn_unexclusive_channel(p->local->pre_device, p->local->pre_channel);
-	if (p->local->master) {
-		/* It's a slave-device, so update master's slave-pointer if necessary */
-		if (((isdn_net_local *) ISDN_MASTER_PRIV(p->local))->slave ==
-		    p->dev)
-			((isdn_net_local *)ISDN_MASTER_PRIV(p->local))->slave =
-				p->local->slave;
-	} else {
-		/* Unregister only if it's a master-device */
-		unregister_netdev(p->dev);
-	}
-	/* Unlink device from chain */
-	spin_lock_irqsave(&dev->lock, flags);
-	if (q)
-		q->next = p->next;
-	else
-		dev->netdev = p->next;
-	if (p->local->slave) {
-		/* If this interface has a slave, remove it also */
-		char *slavename = p->local->slave->name;
-		isdn_net_dev *n = dev->netdev;
-		q = NULL;
-		while (n) {
-			if (!strcmp(n->dev->name, slavename)) {
-				spin_unlock_irqrestore(&dev->lock, flags);
-				isdn_net_realrm(n, q);
-				spin_lock_irqsave(&dev->lock, flags);
-				break;
-			}
-			q = n;
-			n = (isdn_net_dev *)n->next;
-		}
-	}
-	spin_unlock_irqrestore(&dev->lock, flags);
-	/* If no more net-devices remain, disable auto-hangup timer */
-	if (dev->netdev == NULL)
-		isdn_timer_ctrl(ISDN_TIMER_NETHANGUP, 0);
-	free_netdev(p->dev);
-	kfree(p);
-
-	return 0;
-}
-
-/*
- * Remove a single network-interface.
- */
-int
-isdn_net_rm(char *name)
-{
-	u_long flags;
-	isdn_net_dev *p;
-	isdn_net_dev *q;
-
-	/* Search name in netdev-chain */
-	spin_lock_irqsave(&dev->lock, flags);
-	p = dev->netdev;
-	q = NULL;
-	while (p) {
-		if (!strcmp(p->dev->name, name)) {
-			spin_unlock_irqrestore(&dev->lock, flags);
-			return (isdn_net_realrm(p, q));
-		}
-		q = p;
-		p = (isdn_net_dev *) p->next;
-	}
-	spin_unlock_irqrestore(&dev->lock, flags);
-	/* If no more net-devices remain, disable auto-hangup timer */
-	if (dev->netdev == NULL)
-		isdn_timer_ctrl(ISDN_TIMER_NETHANGUP, 0);
-	return -ENODEV;
-}
-
-/*
- * Remove all network-interfaces
- */
-int
-isdn_net_rmall(void)
-{
-	u_long flags;
-	int ret;
-
-	/* Walk through netdev-chain */
-	spin_lock_irqsave(&dev->lock, flags);
-	while (dev->netdev) {
-		if (!dev->netdev->local->master) {
-			/* Remove master-devices only, slaves get removed with their master */
-			spin_unlock_irqrestore(&dev->lock, flags);
-			if ((ret = isdn_net_realrm(dev->netdev, NULL))) {
-				return ret;
-			}
-			spin_lock_irqsave(&dev->lock, flags);
-		}
-	}
-	dev->netdev = NULL;
-	spin_unlock_irqrestore(&dev->lock, flags);
-	return 0;
-}
diff --git a/drivers/isdn/i4l/isdn_net.h b/drivers/isdn/i4l/isdn_net.h
deleted file mode 100644
index cca6d68da171..000000000000
--- a/drivers/isdn/i4l/isdn_net.h
+++ /dev/null
@@ -1,151 +0,0 @@
-/* $Id: isdn_net.h,v 1.1.2.2 2004/01/12 22:37:19 keil Exp $
- *
- * header for Linux ISDN subsystem, network related functions (linklevel).
- *
- * Copyright 1994-1999  by Fritz Elfert (fritz@isdn4linux.de)
- * Copyright 1995,96    by Thinking Objects Software GmbH Wuerzburg
- * Copyright 1995,96    by Michael Hipp (Michael.Hipp@student.uni-tuebingen.de)
- *
- * This software may be used and distributed according to the terms
- * of the GNU General Public License, incorporated herein by reference.
- *
- */
-
-/* Definitions for hupflags:                */
-#define ISDN_WAITCHARGE  1      /* did not get a charge info yet            */
-#define ISDN_HAVECHARGE  2      /* We know a charge info                    */
-#define ISDN_CHARGEHUP   4      /* We want to use the charge mechanism      */
-#define ISDN_INHUP       8      /* Even if incoming, close after huptimeout */
-#define ISDN_MANCHARGE  16      /* Charge Interval manually set             */
-
-/*
- * Definitions for Cisco-HDLC header.
- */
-
-#define CISCO_ADDR_UNICAST    0x0f
-#define CISCO_ADDR_BROADCAST  0x8f
-#define CISCO_CTRL            0x00
-#define CISCO_TYPE_CDP        0x2000
-#define CISCO_TYPE_SLARP      0x8035
-#define CISCO_SLARP_REQUEST   0
-#define CISCO_SLARP_REPLY     1
-#define CISCO_SLARP_KEEPALIVE 2
-
-extern char *isdn_net_new(char *, struct net_device *);
-extern char *isdn_net_newslave(char *);
-extern int isdn_net_rm(char *);
-extern int isdn_net_rmall(void);
-extern int isdn_net_stat_callback(int, isdn_ctrl *);
-extern int isdn_net_setcfg(isdn_net_ioctl_cfg *);
-extern int isdn_net_getcfg(isdn_net_ioctl_cfg *);
-extern int isdn_net_addphone(isdn_net_ioctl_phone *);
-extern int isdn_net_getphones(isdn_net_ioctl_phone *, char __user *);
-extern int isdn_net_getpeer(isdn_net_ioctl_phone *, isdn_net_ioctl_phone __user *);
-extern int isdn_net_delphone(isdn_net_ioctl_phone *);
-extern int isdn_net_find_icall(int, int, int, setup_parm *);
-extern void isdn_net_hangup(struct net_device *);
-extern void isdn_net_dial(void);
-extern void isdn_net_autohup(void);
-extern int isdn_net_force_hangup(char *);
-extern int isdn_net_force_dial(char *);
-extern isdn_net_dev *isdn_net_findif(char *);
-extern int isdn_net_rcv_skb(int, struct sk_buff *);
-extern int isdn_net_dial_req(isdn_net_local *);
-extern void isdn_net_writebuf_skb(isdn_net_local *lp, struct sk_buff *skb);
-extern void isdn_net_write_super(isdn_net_local *lp, struct sk_buff *skb);
-
-#define ISDN_NET_MAX_QUEUE_LENGTH 2
-
-#define ISDN_MASTER_PRIV(lp) ((isdn_net_local *) netdev_priv(lp->master))
-#define ISDN_SLAVE_PRIV(lp) ((isdn_net_local *) netdev_priv(lp->slave))
-#define MASTER_TO_SLAVE(master)					\
-	(((isdn_net_local *) netdev_priv(master))->slave)
-
-/*
- * is this particular channel busy?
- */
-static __inline__ int isdn_net_lp_busy(isdn_net_local *lp)
-{
-	if (atomic_read(&lp->frame_cnt) < ISDN_NET_MAX_QUEUE_LENGTH)
-		return 0;
-	else
-		return 1;
-}
-
-/*
- * For the given net device, this will get a non-busy channel out of the
- * corresponding bundle. The returned channel is locked.
- */
-static __inline__ isdn_net_local *isdn_net_get_locked_lp(isdn_net_dev *nd)
-{
-	unsigned long flags;
-	isdn_net_local *lp;
-
-	spin_lock_irqsave(&nd->queue_lock, flags);
-	lp = nd->queue;         /* get lp on top of queue */
-	while (isdn_net_lp_busy(nd->queue)) {
-		nd->queue = nd->queue->next;
-		if (nd->queue == lp) { /* not found -- should never happen */
-			lp = NULL;
-			goto errout;
-		}
-	}
-	lp = nd->queue;
-	nd->queue = nd->queue->next;
-	spin_unlock_irqrestore(&nd->queue_lock, flags);
-	spin_lock(&lp->xmit_lock);
-	local_bh_disable();
-	return lp;
-errout:
-	spin_unlock_irqrestore(&nd->queue_lock, flags);
-	return lp;
-}
-
-/*
- * add a channel to a bundle
- */
-static __inline__ void isdn_net_add_to_bundle(isdn_net_dev *nd, isdn_net_local *nlp)
-{
-	isdn_net_local *lp;
-	unsigned long flags;
-
-	spin_lock_irqsave(&nd->queue_lock, flags);
-
-	lp = nd->queue;
-//	printk(KERN_DEBUG "%s: lp:%s(%p) nlp:%s(%p) last(%p)\n",
-//		__func__, lp->name, lp, nlp->name, nlp, lp->last);
-	nlp->last = lp->last;
-	lp->last->next = nlp;
-	lp->last = nlp;
-	nlp->next = lp;
-	nd->queue = nlp;
-
-	spin_unlock_irqrestore(&nd->queue_lock, flags);
-}
-/*
- * remove a channel from the bundle it belongs to
- */
-static __inline__ void isdn_net_rm_from_bundle(isdn_net_local *lp)
-{
-	isdn_net_local *master_lp = lp;
-	unsigned long flags;
-
-	if (lp->master)
-		master_lp = ISDN_MASTER_PRIV(lp);
-
-//	printk(KERN_DEBUG "%s: lp:%s(%p) mlp:%s(%p) last(%p) next(%p) mndq(%p)\n",
-//		__func__, lp->name, lp, master_lp->name, master_lp, lp->last, lp->next, master_lp->netdev->queue);
-	spin_lock_irqsave(&master_lp->netdev->queue_lock, flags);
-	lp->last->next = lp->next;
-	lp->next->last = lp->last;
-	if (master_lp->netdev->queue == lp) {
-		master_lp->netdev->queue = lp->next;
-		if (lp->next == lp) { /* last in queue */
-			master_lp->netdev->queue = master_lp->netdev->local;
-		}
-	}
-	lp->next = lp->last = lp;	/* (re)set own pointers */
-//	printk(KERN_DEBUG "%s: mndq(%p)\n",
-//		__func__, master_lp->netdev->queue);
-	spin_unlock_irqrestore(&master_lp->netdev->queue_lock, flags);
-}
diff --git a/drivers/isdn/i4l/isdn_ppp.c b/drivers/isdn/i4l/isdn_ppp.c
deleted file mode 100644
index 7e0f419c14f8..000000000000
--- a/drivers/isdn/i4l/isdn_ppp.c
+++ /dev/null
@@ -1,3046 +0,0 @@
-/* $Id: isdn_ppp.c,v 1.1.2.3 2004/02/10 01:07:13 keil Exp $
- *
- * Linux ISDN subsystem, functions for synchronous PPP (linklevel).
- *
- * Copyright 1995,96 by Michael Hipp (Michael.Hipp@student.uni-tuebingen.de)
- *
- * This software may be used and distributed according to the terms
- * of the GNU General Public License, incorporated herein by reference.
- *
- */
-
-#include <linux/isdn.h>
-#include <linux/poll.h>
-#include <linux/ppp-comp.h>
-#include <linux/slab.h>
-#ifdef CONFIG_IPPP_FILTER
-#include <linux/filter.h>
-#endif
-
-#include "isdn_common.h"
-#include "isdn_ppp.h"
-#include "isdn_net.h"
-
-#ifndef PPP_IPX
-#define PPP_IPX 0x002b
-#endif
-
-/* Prototypes */
-static int isdn_ppp_fill_rq(unsigned char *buf, int len, int proto, int slot);
-static int isdn_ppp_closewait(int slot);
-static void isdn_ppp_push_higher(isdn_net_dev *net_dev, isdn_net_local *lp,
-				 struct sk_buff *skb, int proto);
-static int isdn_ppp_if_get_unit(char *namebuf);
-static int isdn_ppp_set_compressor(struct ippp_struct *is, struct isdn_ppp_comp_data *);
-static struct sk_buff *isdn_ppp_decompress(struct sk_buff *,
-					   struct ippp_struct *, struct ippp_struct *, int *proto);
-static void isdn_ppp_receive_ccp(isdn_net_dev *net_dev, isdn_net_local *lp,
-				 struct sk_buff *skb, int proto);
-static struct sk_buff *isdn_ppp_compress(struct sk_buff *skb_in, int *proto,
-					 struct ippp_struct *is, struct ippp_struct *master, int type);
-static void isdn_ppp_send_ccp(isdn_net_dev *net_dev, isdn_net_local *lp,
-			      struct sk_buff *skb);
-
-/* New CCP stuff */
-static void isdn_ppp_ccp_kickup(struct ippp_struct *is);
-static void isdn_ppp_ccp_xmit_reset(struct ippp_struct *is, int proto,
-				    unsigned char code, unsigned char id,
-				    unsigned char *data, int len);
-static struct ippp_ccp_reset *isdn_ppp_ccp_reset_alloc(struct ippp_struct *is);
-static void isdn_ppp_ccp_reset_free(struct ippp_struct *is);
-static void isdn_ppp_ccp_reset_free_state(struct ippp_struct *is,
-					  unsigned char id);
-static void isdn_ppp_ccp_timer_callback(struct timer_list *t);
-static struct ippp_ccp_reset_state *isdn_ppp_ccp_reset_alloc_state(struct ippp_struct *is,
-								   unsigned char id);
-static void isdn_ppp_ccp_reset_trans(struct ippp_struct *is,
-				     struct isdn_ppp_resetparams *rp);
-static void isdn_ppp_ccp_reset_ack_rcvd(struct ippp_struct *is,
-					unsigned char id);
-
-
-
-#ifdef CONFIG_ISDN_MPP
-static ippp_bundle *isdn_ppp_bundle_arr = NULL;
-
-static int isdn_ppp_mp_bundle_array_init(void);
-static int isdn_ppp_mp_init(isdn_net_local *lp, ippp_bundle *add_to);
-static void isdn_ppp_mp_receive(isdn_net_dev *net_dev, isdn_net_local *lp,
-				struct sk_buff *skb);
-static void isdn_ppp_mp_cleanup(isdn_net_local *lp);
-
-static int isdn_ppp_bundle(struct ippp_struct *, int unit);
-#endif	/* CONFIG_ISDN_MPP */
-
-char *isdn_ppp_revision = "$Revision: 1.1.2.3 $";
-
-static struct ippp_struct *ippp_table[ISDN_MAX_CHANNELS];
-
-static struct isdn_ppp_compressor *ipc_head = NULL;
-
-/*
- * frame log (debug)
- */
-static void
-isdn_ppp_frame_log(char *info, char *data, int len, int maxlen, int unit, int slot)
-{
-	int cnt,
-		j,
-		i;
-	char buf[80];
-
-	if (len < maxlen)
-		maxlen = len;
-
-	for (i = 0, cnt = 0; cnt < maxlen; i++) {
-		for (j = 0; j < 16 && cnt < maxlen; j++, cnt++)
-			sprintf(buf + j * 3, "%02x ", (unsigned char)data[cnt]);
-		printk(KERN_DEBUG "[%d/%d].%s[%d]: %s\n", unit, slot, info, i, buf);
-	}
-}
-
-/*
- * unbind isdn_net_local <=> ippp-device
- * note: it can happen, that we hangup/free the master before the slaves
- *       in this case we bind another lp to the master device
- */
-int
-isdn_ppp_free(isdn_net_local *lp)
-{
-	struct ippp_struct *is;
-
-	if (lp->ppp_slot < 0 || lp->ppp_slot >= ISDN_MAX_CHANNELS) {
-		printk(KERN_ERR "%s: ppp_slot(%d) out of range\n",
-		       __func__, lp->ppp_slot);
-		return 0;
-	}
-
-#ifdef CONFIG_ISDN_MPP
-	spin_lock(&lp->netdev->pb->lock);
-#endif
-	isdn_net_rm_from_bundle(lp);
-#ifdef CONFIG_ISDN_MPP
-	if (lp->netdev->pb->ref_ct == 1)	/* last link in queue? */
-		isdn_ppp_mp_cleanup(lp);
-
-	lp->netdev->pb->ref_ct--;
-	spin_unlock(&lp->netdev->pb->lock);
-#endif /* CONFIG_ISDN_MPP */
-	if (lp->ppp_slot < 0 || lp->ppp_slot >= ISDN_MAX_CHANNELS) {
-		printk(KERN_ERR "%s: ppp_slot(%d) now invalid\n",
-		       __func__, lp->ppp_slot);
-		return 0;
-	}
-	is = ippp_table[lp->ppp_slot];
-	if ((is->state & IPPP_CONNECT))
-		isdn_ppp_closewait(lp->ppp_slot);	/* force wakeup on ippp device */
-	else if (is->state & IPPP_ASSIGNED)
-		is->state = IPPP_OPEN;	/* fallback to 'OPEN but not ASSIGNED' state */
-
-	if (is->debug & 0x1)
-		printk(KERN_DEBUG "isdn_ppp_free %d %lx %lx\n", lp->ppp_slot, (long) lp, (long) is->lp);
-
-	is->lp = NULL;          /* link is down .. set lp to NULL */
-	lp->ppp_slot = -1;      /* is this OK ?? */
-
-	return 0;
-}
-
-/*
- * bind isdn_net_local <=> ippp-device
- *
- * This function is allways called with holding dev->lock so
- * no additional lock is needed
- */
-int
-isdn_ppp_bind(isdn_net_local *lp)
-{
-	int i;
-	int unit = 0;
-	struct ippp_struct *is;
-	int retval;
-
-	if (lp->pppbind < 0) {  /* device bounded to ippp device ? */
-		isdn_net_dev *net_dev = dev->netdev;
-		char exclusive[ISDN_MAX_CHANNELS];	/* exclusive flags */
-		memset(exclusive, 0, ISDN_MAX_CHANNELS);
-		while (net_dev) {	/* step through net devices to find exclusive minors */
-			isdn_net_local *lp = net_dev->local;
-			if (lp->pppbind >= 0)
-				exclusive[lp->pppbind] = 1;
-			net_dev = net_dev->next;
-		}
-		/*
-		 * search a free device / slot
-		 */
-		for (i = 0; i < ISDN_MAX_CHANNELS; i++) {
-			if (ippp_table[i]->state == IPPP_OPEN && !exclusive[ippp_table[i]->minor]) {	/* OPEN, but not connected! */
-				break;
-			}
-		}
-	} else {
-		for (i = 0; i < ISDN_MAX_CHANNELS; i++) {
-			if (ippp_table[i]->minor == lp->pppbind &&
-			    (ippp_table[i]->state & IPPP_OPEN) == IPPP_OPEN)
-				break;
-		}
-	}
-
-	if (i >= ISDN_MAX_CHANNELS) {
-		printk(KERN_WARNING "isdn_ppp_bind: Can't find a (free) connection to the ipppd daemon.\n");
-		retval = -1;
-		goto out;
-	}
-	/* get unit number from interface name .. ugly! */
-	unit = isdn_ppp_if_get_unit(lp->netdev->dev->name);
-	if (unit < 0) {
-		printk(KERN_ERR "isdn_ppp_bind: illegal interface name %s.\n",
-		       lp->netdev->dev->name);
-		retval = -1;
-		goto out;
-	}
-
-	lp->ppp_slot = i;
-	is = ippp_table[i];
-	is->lp = lp;
-	is->unit = unit;
-	is->state = IPPP_OPEN | IPPP_ASSIGNED;	/* assigned to a netdevice but not connected */
-#ifdef CONFIG_ISDN_MPP
-	retval = isdn_ppp_mp_init(lp, NULL);
-	if (retval < 0)
-		goto out;
-#endif /* CONFIG_ISDN_MPP */
-
-	retval = lp->ppp_slot;
-
-out:
-	return retval;
-}
-
-/*
- * kick the ipppd on the device
- * (wakes up daemon after B-channel connect)
- */
-
-void
-isdn_ppp_wakeup_daemon(isdn_net_local *lp)
-{
-	if (lp->ppp_slot < 0 || lp->ppp_slot >= ISDN_MAX_CHANNELS) {
-		printk(KERN_ERR "%s: ppp_slot(%d) out of range\n",
-		       __func__, lp->ppp_slot);
-		return;
-	}
-	ippp_table[lp->ppp_slot]->state = IPPP_OPEN | IPPP_CONNECT | IPPP_NOBLOCK;
-	wake_up_interruptible(&ippp_table[lp->ppp_slot]->wq);
-}
-
-/*
- * there was a hangup on the netdevice
- * force wakeup of the ippp device
- * go into 'device waits for release' state
- */
-static int
-isdn_ppp_closewait(int slot)
-{
-	struct ippp_struct *is;
-
-	if (slot < 0 || slot >= ISDN_MAX_CHANNELS) {
-		printk(KERN_ERR "%s: slot(%d) out of range\n",
-		       __func__, slot);
-		return 0;
-	}
-	is = ippp_table[slot];
-	if (is->state)
-		wake_up_interruptible(&is->wq);
-	is->state = IPPP_CLOSEWAIT;
-	return 1;
-}
-
-/*
- * isdn_ppp_find_slot / isdn_ppp_free_slot
- */
-
-static int
-isdn_ppp_get_slot(void)
-{
-	int i;
-	for (i = 0; i < ISDN_MAX_CHANNELS; i++) {
-		if (!ippp_table[i]->state)
-			return i;
-	}
-	return -1;
-}
-
-/*
- * isdn_ppp_open
- */
-
-int
-isdn_ppp_open(int min, struct file *file)
-{
-	int slot;
-	struct ippp_struct *is;
-
-	if (min < 0 || min >= ISDN_MAX_CHANNELS)
-		return -ENODEV;
-
-	slot = isdn_ppp_get_slot();
-	if (slot < 0) {
-		return -EBUSY;
-	}
-	is = file->private_data = ippp_table[slot];
-
-	printk(KERN_DEBUG "ippp, open, slot: %d, minor: %d, state: %04x\n",
-	       slot, min, is->state);
-
-	/* compression stuff */
-	is->link_compressor   = is->compressor = NULL;
-	is->link_decompressor = is->decompressor = NULL;
-	is->link_comp_stat    = is->comp_stat = NULL;
-	is->link_decomp_stat  = is->decomp_stat = NULL;
-	is->compflags = 0;
-
-	is->reset = isdn_ppp_ccp_reset_alloc(is);
-	if (!is->reset)
-		return -ENOMEM;
-
-	is->lp = NULL;
-	is->mp_seqno = 0;       /* MP sequence number */
-	is->pppcfg = 0;         /* ppp configuration */
-	is->mpppcfg = 0;        /* mppp configuration */
-	is->last_link_seqno = -1;	/* MP: maybe set to Bundle-MIN, when joining a bundle ?? */
-	is->unit = -1;          /* set, when we have our interface */
-	is->mru = 1524;         /* MRU, default 1524 */
-	is->maxcid = 16;        /* VJ: maxcid */
-	is->tk = current;
-	init_waitqueue_head(&is->wq);
-	is->first = is->rq + NUM_RCV_BUFFS - 1;	/* receive queue */
-	is->last = is->rq;
-	is->minor = min;
-#ifdef CONFIG_ISDN_PPP_VJ
-	/*
-	 * VJ header compression init
-	 */
-	is->slcomp = slhc_init(16, 16);	/* not necessary for 2. link in bundle */
-	if (IS_ERR(is->slcomp)) {
-		isdn_ppp_ccp_reset_free(is);
-		return PTR_ERR(is->slcomp);
-	}
-#endif
-#ifdef CONFIG_IPPP_FILTER
-	is->pass_filter = NULL;
-	is->active_filter = NULL;
-#endif
-	is->state = IPPP_OPEN;
-
-	return 0;
-}
-
-/*
- * release ippp device
- */
-void
-isdn_ppp_release(int min, struct file *file)
-{
-	int i;
-	struct ippp_struct *is;
-
-	if (min < 0 || min >= ISDN_MAX_CHANNELS)
-		return;
-	is = file->private_data;
-
-	if (!is) {
-		printk(KERN_ERR "%s: no file->private_data\n", __func__);
-		return;
-	}
-	if (is->debug & 0x1)
-		printk(KERN_DEBUG "ippp: release, minor: %d %lx\n", min, (long) is->lp);
-
-	if (is->lp) {           /* a lp address says: this link is still up */
-		isdn_net_dev *p = is->lp->netdev;
-
-		if (!p) {
-			printk(KERN_ERR "%s: no lp->netdev\n", __func__);
-			return;
-		}
-		is->state &= ~IPPP_CONNECT;	/* -> effect: no call of wakeup */
-		/*
-		 * isdn_net_hangup() calls isdn_ppp_free()
-		 * isdn_ppp_free() sets is->lp to NULL and lp->ppp_slot to -1
-		 * removing the IPPP_CONNECT flag omits calling of isdn_ppp_wakeup_daemon()
-		 */
-		isdn_net_hangup(p->dev);
-	}
-	for (i = 0; i < NUM_RCV_BUFFS; i++) {
-		kfree(is->rq[i].buf);
-		is->rq[i].buf = NULL;
-	}
-	is->first = is->rq + NUM_RCV_BUFFS - 1;	/* receive queue */
-	is->last = is->rq;
-
-#ifdef CONFIG_ISDN_PPP_VJ
-/* TODO: if this was the previous master: link the slcomp to the new master */
-	slhc_free(is->slcomp);
-	is->slcomp = NULL;
-#endif
-#ifdef CONFIG_IPPP_FILTER
-	if (is->pass_filter) {
-		bpf_prog_destroy(is->pass_filter);
-		is->pass_filter = NULL;
-	}
-
-	if (is->active_filter) {
-		bpf_prog_destroy(is->active_filter);
-		is->active_filter = NULL;
-	}
-#endif
-
-/* TODO: if this was the previous master: link the stuff to the new master */
-	if (is->comp_stat)
-		is->compressor->free(is->comp_stat);
-	if (is->link_comp_stat)
-		is->link_compressor->free(is->link_comp_stat);
-	if (is->link_decomp_stat)
-		is->link_decompressor->free(is->link_decomp_stat);
-	if (is->decomp_stat)
-		is->decompressor->free(is->decomp_stat);
-	is->compressor   = is->link_compressor   = NULL;
-	is->decompressor = is->link_decompressor = NULL;
-	is->comp_stat    = is->link_comp_stat    = NULL;
-	is->decomp_stat  = is->link_decomp_stat  = NULL;
-
-	/* Clean up if necessary */
-	if (is->reset)
-		isdn_ppp_ccp_reset_free(is);
-
-	/* this slot is ready for new connections */
-	is->state = 0;
-}
-
-/*
- * get_arg .. ioctl helper
- */
-static int
-get_arg(void __user *b, void *val, int len)
-{
-	if (len <= 0)
-		len = sizeof(void *);
-	if (copy_from_user(val, b, len))
-		return -EFAULT;
-	return 0;
-}
-
-/*
- * set arg .. ioctl helper
- */
-static int
-set_arg(void __user *b, void *val, int len)
-{
-	if (len <= 0)
-		len = sizeof(void *);
-	if (copy_to_user(b, val, len))
-		return -EFAULT;
-	return 0;
-}
-
-#ifdef CONFIG_IPPP_FILTER
-static int get_filter(void __user *arg, struct sock_filter **p)
-{
-	struct sock_fprog uprog;
-	struct sock_filter *code = NULL;
-	int len;
-
-	if (copy_from_user(&uprog, arg, sizeof(uprog)))
-		return -EFAULT;
-
-	if (!uprog.len) {
-		*p = NULL;
-		return 0;
-	}
-
-	/* uprog.len is unsigned short, so no overflow here */
-	len = uprog.len * sizeof(struct sock_filter);
-	code = memdup_user(uprog.filter, len);
-	if (IS_ERR(code))
-		return PTR_ERR(code);
-
-	*p = code;
-	return uprog.len;
-}
-#endif /* CONFIG_IPPP_FILTER */
-
-/*
- * ippp device ioctl
- */
-int
-isdn_ppp_ioctl(int min, struct file *file, unsigned int cmd, unsigned long arg)
-{
-	unsigned long val;
-	int r, i, j;
-	struct ippp_struct *is;
-	isdn_net_local *lp;
-	struct isdn_ppp_comp_data data;
-	void __user *argp = (void __user *)arg;
-
-	is = file->private_data;
-	lp = is->lp;
-
-	if (is->debug & 0x1)
-		printk(KERN_DEBUG "isdn_ppp_ioctl: minor: %d cmd: %x state: %x\n", min, cmd, is->state);
-
-	if (!(is->state & IPPP_OPEN))
-		return -EINVAL;
-
-	switch (cmd) {
-	case PPPIOCBUNDLE:
-#ifdef CONFIG_ISDN_MPP
-		if (!(is->state & IPPP_CONNECT))
-			return -EINVAL;
-		if ((r = get_arg(argp, &val, sizeof(val))))
-			return r;
-		printk(KERN_DEBUG "iPPP-bundle: minor: %d, slave unit: %d, master unit: %d\n",
-		       (int) min, (int) is->unit, (int) val);
-		return isdn_ppp_bundle(is, val);
-#else
-		return -1;
-#endif
-		break;
-	case PPPIOCGUNIT:	/* get ppp/isdn unit number */
-		if ((r = set_arg(argp, &is->unit, sizeof(is->unit))))
-			return r;
-		break;
-	case PPPIOCGIFNAME:
-		if (!lp)
-			return -EINVAL;
-		if ((r = set_arg(argp, lp->netdev->dev->name,
-				 strlen(lp->netdev->dev->name))))
-			return r;
-		break;
-	case PPPIOCGMPFLAGS:	/* get configuration flags */
-		if ((r = set_arg(argp, &is->mpppcfg, sizeof(is->mpppcfg))))
-			return r;
-		break;
-	case PPPIOCSMPFLAGS:	/* set configuration flags */
-		if ((r = get_arg(argp, &val, sizeof(val))))
-			return r;
-		is->mpppcfg = val;
-		break;
-	case PPPIOCGFLAGS:	/* get configuration flags */
-		if ((r = set_arg(argp, &is->pppcfg, sizeof(is->pppcfg))))
-			return r;
-		break;
-	case PPPIOCSFLAGS:	/* set configuration flags */
-		if ((r = get_arg(argp, &val, sizeof(val)))) {
-			return r;
-		}
-		if (val & SC_ENABLE_IP && !(is->pppcfg & SC_ENABLE_IP) && (is->state & IPPP_CONNECT)) {
-			if (lp) {
-				/* OK .. we are ready to send buffers */
-				is->pppcfg = val; /* isdn_ppp_xmit test for SC_ENABLE_IP !!! */
-				netif_wake_queue(lp->netdev->dev);
-				break;
-			}
-		}
-		is->pppcfg = val;
-		break;
-	case PPPIOCGIDLE:	/* get idle time information */
-		if (lp) {
-			struct ppp_idle pidle;
-			pidle.xmit_idle = pidle.recv_idle = lp->huptimer;
-			if ((r = set_arg(argp, &pidle, sizeof(struct ppp_idle))))
-				return r;
-		}
-		break;
-	case PPPIOCSMRU:	/* set receive unit size for PPP */
-		if ((r = get_arg(argp, &val, sizeof(val))))
-			return r;
-		is->mru = val;
-		break;
-	case PPPIOCSMPMRU:
-		break;
-	case PPPIOCSMPMTU:
-		break;
-	case PPPIOCSMAXCID:	/* set the maximum compression slot id */
-		if ((r = get_arg(argp, &val, sizeof(val))))
-			return r;
-		val++;
-		if (is->maxcid != val) {
-#ifdef CONFIG_ISDN_PPP_VJ
-			struct slcompress *sltmp;
-#endif
-			if (is->debug & 0x1)
-				printk(KERN_DEBUG "ippp, ioctl: changed MAXCID to %ld\n", val);
-			is->maxcid = val;
-#ifdef CONFIG_ISDN_PPP_VJ
-			sltmp = slhc_init(16, val);
-			if (IS_ERR(sltmp))
-				return PTR_ERR(sltmp);
-			if (is->slcomp)
-				slhc_free(is->slcomp);
-			is->slcomp = sltmp;
-#endif
-		}
-		break;
-	case PPPIOCGDEBUG:
-		if ((r = set_arg(argp, &is->debug, sizeof(is->debug))))
-			return r;
-		break;
-	case PPPIOCSDEBUG:
-		if ((r = get_arg(argp, &val, sizeof(val))))
-			return r;
-		is->debug = val;
-		break;
-	case PPPIOCGCOMPRESSORS:
-	{
-		unsigned long protos[8] = {0,};
-		struct isdn_ppp_compressor *ipc = ipc_head;
-		while (ipc) {
-			j = ipc->num / (sizeof(long) * 8);
-			i = ipc->num % (sizeof(long) * 8);
-			if (j < 8)
-				protos[j] |= (1UL << i);
-			ipc = ipc->next;
-		}
-		if ((r = set_arg(argp, protos, 8 * sizeof(long))))
-			return r;
-	}
-	break;
-	case PPPIOCSCOMPRESSOR:
-		if ((r = get_arg(argp, &data, sizeof(struct isdn_ppp_comp_data))))
-			return r;
-		return isdn_ppp_set_compressor(is, &data);
-	case PPPIOCGCALLINFO:
-	{
-		struct pppcallinfo pci;
-		memset((char *)&pci, 0, sizeof(struct pppcallinfo));
-		if (lp)
-		{
-			strncpy(pci.local_num, lp->msn, 63);
-			if (lp->dial) {
-				strncpy(pci.remote_num, lp->dial->num, 63);
-			}
-			pci.charge_units = lp->charge;
-			if (lp->outgoing)
-				pci.calltype = CALLTYPE_OUTGOING;
-			else
-				pci.calltype = CALLTYPE_INCOMING;
-			if (lp->flags & ISDN_NET_CALLBACK)
-				pci.calltype |= CALLTYPE_CALLBACK;
-		}
-		return set_arg(argp, &pci, sizeof(struct pppcallinfo));
-	}
-#ifdef CONFIG_IPPP_FILTER
-	case PPPIOCSPASS:
-	{
-		struct sock_fprog_kern fprog;
-		struct sock_filter *code;
-		int err, len = get_filter(argp, &code);
-
-		if (len < 0)
-			return len;
-
-		fprog.len = len;
-		fprog.filter = code;
-
-		if (is->pass_filter) {
-			bpf_prog_destroy(is->pass_filter);
-			is->pass_filter = NULL;
-		}
-		if (fprog.filter != NULL)
-			err = bpf_prog_create(&is->pass_filter, &fprog);
-		else
-			err = 0;
-		kfree(code);
-
-		return err;
-	}
-	case PPPIOCSACTIVE:
-	{
-		struct sock_fprog_kern fprog;
-		struct sock_filter *code;
-		int err, len = get_filter(argp, &code);
-
-		if (len < 0)
-			return len;
-
-		fprog.len = len;
-		fprog.filter = code;
-
-		if (is->active_filter) {
-			bpf_prog_destroy(is->active_filter);
-			is->active_filter = NULL;
-		}
-		if (fprog.filter != NULL)
-			err = bpf_prog_create(&is->active_filter, &fprog);
-		else
-			err = 0;
-		kfree(code);
-
-		return err;
-	}
-#endif /* CONFIG_IPPP_FILTER */
-	default:
-		break;
-	}
-	return 0;
-}
-
-__poll_t
-isdn_ppp_poll(struct file *file, poll_table *wait)
-{
-	__poll_t mask;
-	struct ippp_buf_queue *bf, *bl;
-	u_long flags;
-	struct ippp_struct *is;
-
-	is = file->private_data;
-
-	if (is->debug & 0x2)
-		printk(KERN_DEBUG "isdn_ppp_poll: minor: %d\n",
-		       iminor(file_inode(file)));
-
-	/* just registers wait_queue hook. This doesn't really wait. */
-	poll_wait(file, &is->wq, wait);
-
-	if (!(is->state & IPPP_OPEN)) {
-		if (is->state == IPPP_CLOSEWAIT)
-			return EPOLLHUP;
-		printk(KERN_DEBUG "isdn_ppp: device not open\n");
-		return EPOLLERR;
-	}
-	/* we're always ready to send .. */
-	mask = EPOLLOUT | EPOLLWRNORM;
-
-	spin_lock_irqsave(&is->buflock, flags);
-	bl = is->last;
-	bf = is->first;
-	/*
-	 * if IPPP_NOBLOCK is set we return even if we have nothing to read
-	 */
-	if (bf->next != bl || (is->state & IPPP_NOBLOCK)) {
-		is->state &= ~IPPP_NOBLOCK;
-		mask |= EPOLLIN | EPOLLRDNORM;
-	}
-	spin_unlock_irqrestore(&is->buflock, flags);
-	return mask;
-}
-
-/*
- *  fill up isdn_ppp_read() queue ..
- */
-
-static int
-isdn_ppp_fill_rq(unsigned char *buf, int len, int proto, int slot)
-{
-	struct ippp_buf_queue *bf, *bl;
-	u_long flags;
-	u_char *nbuf;
-	struct ippp_struct *is;
-
-	if (slot < 0 || slot >= ISDN_MAX_CHANNELS) {
-		printk(KERN_WARNING "ippp: illegal slot(%d).\n", slot);
-		return 0;
-	}
-	is = ippp_table[slot];
-
-	if (!(is->state & IPPP_CONNECT)) {
-		printk(KERN_DEBUG "ippp: device not activated.\n");
-		return 0;
-	}
-	nbuf = kmalloc(len + 4, GFP_ATOMIC);
-	if (!nbuf) {
-		printk(KERN_WARNING "ippp: Can't alloc buf\n");
-		return 0;
-	}
-	nbuf[0] = PPP_ALLSTATIONS;
-	nbuf[1] = PPP_UI;
-	nbuf[2] = proto >> 8;
-	nbuf[3] = proto & 0xff;
-	memcpy(nbuf + 4, buf, len);
-
-	spin_lock_irqsave(&is->buflock, flags);
-	bf = is->first;
-	bl = is->last;
-
-	if (bf == bl) {
-		printk(KERN_WARNING "ippp: Queue is full; discarding first buffer\n");
-		bf = bf->next;
-		kfree(bf->buf);
-		is->first = bf;
-	}
-	bl->buf = (char *) nbuf;
-	bl->len = len + 4;
-
-	is->last = bl->next;
-	spin_unlock_irqrestore(&is->buflock, flags);
-	wake_up_interruptible(&is->wq);
-	return len;
-}
-
-/*
- * read() .. non-blocking: ipppd calls it only after select()
- *           reports, that there is data
- */
-
-int
-isdn_ppp_read(int min, struct file *file, char __user *buf, int count)
-{
-	struct ippp_struct *is;
-	struct ippp_buf_queue *b;
-	u_long flags;
-	u_char *save_buf;
-
-	is = file->private_data;
-
-	if (!(is->state & IPPP_OPEN))
-		return 0;
-
-	spin_lock_irqsave(&is->buflock, flags);
-	b = is->first->next;
-	save_buf = b->buf;
-	if (!save_buf) {
-		spin_unlock_irqrestore(&is->buflock, flags);
-		return -EAGAIN;
-	}
-	if (b->len < count)
-		count = b->len;
-	b->buf = NULL;
-	is->first = b;
-
-	spin_unlock_irqrestore(&is->buflock, flags);
-	if (copy_to_user(buf, save_buf, count))
-		count = -EFAULT;
-	kfree(save_buf);
-
-	return count;
-}
-
-/*
- * ipppd wanna write a packet to the card .. non-blocking
- */
-
-int
-isdn_ppp_write(int min, struct file *file, const char __user *buf, int count)
-{
-	isdn_net_local *lp;
-	struct ippp_struct *is;
-	int proto;
-
-	is = file->private_data;
-
-	if (!(is->state & IPPP_CONNECT))
-		return 0;
-
-	lp = is->lp;
-
-	/* -> push it directly to the lowlevel interface */
-
-	if (!lp)
-		printk(KERN_DEBUG "isdn_ppp_write: lp == NULL\n");
-	else {
-		if (lp->isdn_device < 0 || lp->isdn_channel < 0) {
-			unsigned char protobuf[4];
-			/*
-			 * Don't reset huptimer for
-			 * LCP packets. (Echo requests).
-			 */
-			if (copy_from_user(protobuf, buf, 4))
-				return -EFAULT;
-
-			proto = PPP_PROTOCOL(protobuf);
-			if (proto != PPP_LCP)
-				lp->huptimer = 0;
-
-			return 0;
-		}
-
-		if ((dev->drv[lp->isdn_device]->flags & DRV_FLAG_RUNNING) &&
-		    lp->dialstate == 0 &&
-		    (lp->flags & ISDN_NET_CONNECTED)) {
-			unsigned short hl;
-			struct sk_buff *skb;
-			unsigned char *cpy_buf;
-			/*
-			 * we need to reserve enough space in front of
-			 * sk_buff. old call to dev_alloc_skb only reserved
-			 * 16 bytes, now we are looking what the driver want
-			 */
-			hl = dev->drv[lp->isdn_device]->interface->hl_hdrlen;
-			skb = alloc_skb(hl + count, GFP_ATOMIC);
-			if (!skb) {
-				printk(KERN_WARNING "isdn_ppp_write: out of memory!\n");
-				return count;
-			}
-			skb_reserve(skb, hl);
-			cpy_buf = skb_put(skb, count);
-			if (copy_from_user(cpy_buf, buf, count))
-			{
-				kfree_skb(skb);
-				return -EFAULT;
-			}
-
-			/*
-			 * Don't reset huptimer for
-			 * LCP packets. (Echo requests).
-			 */
-			proto = PPP_PROTOCOL(cpy_buf);
-			if (proto != PPP_LCP)
-				lp->huptimer = 0;
-
-			if (is->debug & 0x40) {
-				printk(KERN_DEBUG "ppp xmit: len %d\n", (int) skb->len);
-				isdn_ppp_frame_log("xmit", skb->data, skb->len, 32, is->unit, lp->ppp_slot);
-			}
-
-			isdn_ppp_send_ccp(lp->netdev, lp, skb); /* keeps CCP/compression states in sync */
-
-			isdn_net_write_super(lp, skb);
-		}
-	}
-	return count;
-}
-
-/*
- * init memory, structures etc.
- */
-
-int
-isdn_ppp_init(void)
-{
-	int i,
-		j;
-
-#ifdef CONFIG_ISDN_MPP
-	if (isdn_ppp_mp_bundle_array_init() < 0)
-		return -ENOMEM;
-#endif /* CONFIG_ISDN_MPP */
-
-	for (i = 0; i < ISDN_MAX_CHANNELS; i++) {
-		if (!(ippp_table[i] = kzalloc(sizeof(struct ippp_struct), GFP_KERNEL))) {
-			printk(KERN_WARNING "isdn_ppp_init: Could not alloc ippp_table\n");
-			for (j = 0; j < i; j++)
-				kfree(ippp_table[j]);
-			return -1;
-		}
-		spin_lock_init(&ippp_table[i]->buflock);
-		ippp_table[i]->state = 0;
-		ippp_table[i]->first = ippp_table[i]->rq + NUM_RCV_BUFFS - 1;
-		ippp_table[i]->last = ippp_table[i]->rq;
-
-		for (j = 0; j < NUM_RCV_BUFFS; j++) {
-			ippp_table[i]->rq[j].buf = NULL;
-			ippp_table[i]->rq[j].last = ippp_table[i]->rq +
-				(NUM_RCV_BUFFS + j - 1) % NUM_RCV_BUFFS;
-			ippp_table[i]->rq[j].next = ippp_table[i]->rq + (j + 1) % NUM_RCV_BUFFS;
-		}
-	}
-	return 0;
-}
-
-void
-isdn_ppp_cleanup(void)
-{
-	int i;
-
-	for (i = 0; i < ISDN_MAX_CHANNELS; i++)
-		kfree(ippp_table[i]);
-
-#ifdef CONFIG_ISDN_MPP
-	kfree(isdn_ppp_bundle_arr);
-#endif /* CONFIG_ISDN_MPP */
-
-}
-
-/*
- * check for address/control field and skip if allowed
- * retval != 0 -> discard packet silently
- */
-static int isdn_ppp_skip_ac(struct ippp_struct *is, struct sk_buff *skb)
-{
-	if (skb->len < 1)
-		return -1;
-
-	if (skb->data[0] == 0xff) {
-		if (skb->len < 2)
-			return -1;
-
-		if (skb->data[1] != 0x03)
-			return -1;
-
-		// skip address/control (AC) field
-		skb_pull(skb, 2);
-	} else {
-		if (is->pppcfg & SC_REJ_COMP_AC)
-			// if AC compression was not negotiated, but used, discard packet
-			return -1;
-	}
-	return 0;
-}
-
-/*
- * get the PPP protocol header and pull skb
- * retval < 0 -> discard packet silently
- */
-static int isdn_ppp_strip_proto(struct sk_buff *skb)
-{
-	int proto;
-
-	if (skb->len < 1)
-		return -1;
-
-	if (skb->data[0] & 0x1) {
-		// protocol field is compressed
-		proto = skb->data[0];
-		skb_pull(skb, 1);
-	} else {
-		if (skb->len < 2)
-			return -1;
-		proto = ((int) skb->data[0] << 8) + skb->data[1];
-		skb_pull(skb, 2);
-	}
-	return proto;
-}
-
-
-/*
- * handler for incoming packets on a syncPPP interface
- */
-void isdn_ppp_receive(isdn_net_dev *net_dev, isdn_net_local *lp, struct sk_buff *skb)
-{
-	struct ippp_struct *is;
-	int slot;
-	int proto;
-
-	BUG_ON(net_dev->local->master); // we're called with the master device always
-
-	slot = lp->ppp_slot;
-	if (slot < 0 || slot >= ISDN_MAX_CHANNELS) {
-		printk(KERN_ERR "isdn_ppp_receive: lp->ppp_slot(%d)\n",
-		       lp->ppp_slot);
-		kfree_skb(skb);
-		return;
-	}
-	is = ippp_table[slot];
-
-	if (is->debug & 0x4) {
-		printk(KERN_DEBUG "ippp_receive: is:%08lx lp:%08lx slot:%d unit:%d len:%d\n",
-		       (long)is, (long)lp, lp->ppp_slot, is->unit, (int)skb->len);
-		isdn_ppp_frame_log("receive", skb->data, skb->len, 32, is->unit, lp->ppp_slot);
-	}
-
-	if (isdn_ppp_skip_ac(is, skb) < 0) {
-		kfree_skb(skb);
-		return;
-	}
-	proto = isdn_ppp_strip_proto(skb);
-	if (proto < 0) {
-		kfree_skb(skb);
-		return;
-	}
-
-#ifdef CONFIG_ISDN_MPP
-	if (is->compflags & SC_LINK_DECOMP_ON) {
-		skb = isdn_ppp_decompress(skb, is, NULL, &proto);
-		if (!skb) // decompression error
-			return;
-	}
-
-	if (!(is->mpppcfg & SC_REJ_MP_PROT)) { // we agreed to receive MPPP
-		if (proto == PPP_MP) {
-			isdn_ppp_mp_receive(net_dev, lp, skb);
-			return;
-		}
-	}
-#endif
-	isdn_ppp_push_higher(net_dev, lp, skb, proto);
-}
-
-/*
- * we receive a reassembled frame, MPPP has been taken care of before.
- * address/control and protocol have been stripped from the skb
- * note: net_dev has to be master net_dev
- */
-static void
-isdn_ppp_push_higher(isdn_net_dev *net_dev, isdn_net_local *lp, struct sk_buff *skb, int proto)
-{
-	struct net_device *dev = net_dev->dev;
-	struct ippp_struct *is, *mis;
-	isdn_net_local *mlp = NULL;
-	int slot;
-
-	slot = lp->ppp_slot;
-	if (slot < 0 || slot >= ISDN_MAX_CHANNELS) {
-		printk(KERN_ERR "isdn_ppp_push_higher: lp->ppp_slot(%d)\n",
-		       lp->ppp_slot);
-		goto drop_packet;
-	}
-	is = ippp_table[slot];
-
-	if (lp->master) { // FIXME?
-		mlp = ISDN_MASTER_PRIV(lp);
-		slot = mlp->ppp_slot;
-		if (slot < 0 || slot >= ISDN_MAX_CHANNELS) {
-			printk(KERN_ERR "isdn_ppp_push_higher: master->ppp_slot(%d)\n",
-			       lp->ppp_slot);
-			goto drop_packet;
-		}
-	}
-	mis = ippp_table[slot];
-
-	if (is->debug & 0x10) {
-		printk(KERN_DEBUG "push, skb %d %04x\n", (int) skb->len, proto);
-		isdn_ppp_frame_log("rpush", skb->data, skb->len, 32, is->unit, lp->ppp_slot);
-	}
-	if (mis->compflags & SC_DECOMP_ON) {
-		skb = isdn_ppp_decompress(skb, is, mis, &proto);
-		if (!skb) // decompression error
-			return;
-	}
-	switch (proto) {
-	case PPP_IPX:  /* untested */
-		if (is->debug & 0x20)
-			printk(KERN_DEBUG "isdn_ppp: IPX\n");
-		skb->protocol = htons(ETH_P_IPX);
-		break;
-	case PPP_IP:
-		if (is->debug & 0x20)
-			printk(KERN_DEBUG "isdn_ppp: IP\n");
-		skb->protocol = htons(ETH_P_IP);
-		break;
-	case PPP_COMP:
-	case PPP_COMPFRAG:
-		printk(KERN_INFO "isdn_ppp: unexpected compressed frame dropped\n");
-		goto drop_packet;
-#ifdef CONFIG_ISDN_PPP_VJ
-	case PPP_VJC_UNCOMP:
-		if (is->debug & 0x20)
-			printk(KERN_DEBUG "isdn_ppp: VJC_UNCOMP\n");
-		if (net_dev->local->ppp_slot < 0) {
-			printk(KERN_ERR "%s: net_dev->local->ppp_slot(%d) out of range\n",
-			       __func__, net_dev->local->ppp_slot);
-			goto drop_packet;
-		}
-		if (slhc_remember(ippp_table[net_dev->local->ppp_slot]->slcomp, skb->data, skb->len) <= 0) {
-			printk(KERN_WARNING "isdn_ppp: received illegal VJC_UNCOMP frame!\n");
-			goto drop_packet;
-		}
-		skb->protocol = htons(ETH_P_IP);
-		break;
-	case PPP_VJC_COMP:
-		if (is->debug & 0x20)
-			printk(KERN_DEBUG "isdn_ppp: VJC_COMP\n");
-		{
-			struct sk_buff *skb_old = skb;
-			int pkt_len;
-			skb = dev_alloc_skb(skb_old->len + 128);
-
-			if (!skb) {
-				printk(KERN_WARNING "%s: Memory squeeze, dropping packet.\n", dev->name);
-				skb = skb_old;
-				goto drop_packet;
-			}
-			skb_put(skb, skb_old->len + 128);
-			skb_copy_from_linear_data(skb_old, skb->data,
-						  skb_old->len);
-			if (net_dev->local->ppp_slot < 0) {
-				printk(KERN_ERR "%s: net_dev->local->ppp_slot(%d) out of range\n",
-				       __func__, net_dev->local->ppp_slot);
-				goto drop_packet;
-			}
-			pkt_len = slhc_uncompress(ippp_table[net_dev->local->ppp_slot]->slcomp,
-						  skb->data, skb_old->len);
-			kfree_skb(skb_old);
-			if (pkt_len < 0)
-				goto drop_packet;
-
-			skb_trim(skb, pkt_len);
-			skb->protocol = htons(ETH_P_IP);
-		}
-		break;
-#endif
-	case PPP_CCP:
-	case PPP_CCPFRAG:
-		isdn_ppp_receive_ccp(net_dev, lp, skb, proto);
-		/* Dont pop up ResetReq/Ack stuff to the daemon any
-		   longer - the job is done already */
-		if (skb->data[0] == CCP_RESETREQ ||
-		    skb->data[0] == CCP_RESETACK)
-			break;
-		/* fall through */
-	default:
-		isdn_ppp_fill_rq(skb->data, skb->len, proto, lp->ppp_slot);	/* push data to pppd device */
-		kfree_skb(skb);
-		return;
-	}
-
-#ifdef CONFIG_IPPP_FILTER
-	/* check if the packet passes the pass and active filters
-	 * the filter instructions are constructed assuming
-	 * a four-byte PPP header on each packet (which is still present) */
-	skb_push(skb, 4);
-
-	{
-		u_int16_t *p = (u_int16_t *) skb->data;
-
-		*p = 0;	/* indicate inbound */
-	}
-
-	if (is->pass_filter
-	    && BPF_PROG_RUN(is->pass_filter, skb) == 0) {
-		if (is->debug & 0x2)
-			printk(KERN_DEBUG "IPPP: inbound frame filtered.\n");
-		kfree_skb(skb);
-		return;
-	}
-	if (!(is->active_filter
-	      && BPF_PROG_RUN(is->active_filter, skb) == 0)) {
-		if (is->debug & 0x2)
-			printk(KERN_DEBUG "IPPP: link-active filter: resetting huptimer.\n");
-		lp->huptimer = 0;
-		if (mlp)
-			mlp->huptimer = 0;
-	}
-	skb_pull(skb, 4);
-#else /* CONFIG_IPPP_FILTER */
-	lp->huptimer = 0;
-	if (mlp)
-		mlp->huptimer = 0;
-#endif /* CONFIG_IPPP_FILTER */
-	skb->dev = dev;
-	skb_reset_mac_header(skb);
-	netif_rx(skb);
-	/* net_dev->local->stats.rx_packets++; done in isdn_net.c */
-	return;
-
-drop_packet:
-	net_dev->local->stats.rx_dropped++;
-	kfree_skb(skb);
-}
-
-/*
- * isdn_ppp_skb_push ..
- * checks whether we have enough space at the beginning of the skb
- * and allocs a new SKB if necessary
- */
-static unsigned char *isdn_ppp_skb_push(struct sk_buff **skb_p, int len)
-{
-	struct sk_buff *skb = *skb_p;
-
-	if (skb_headroom(skb) < len) {
-		struct sk_buff *nskb = skb_realloc_headroom(skb, len);
-
-		if (!nskb) {
-			printk(KERN_ERR "isdn_ppp_skb_push: can't realloc headroom!\n");
-			dev_kfree_skb(skb);
-			return NULL;
-		}
-		printk(KERN_DEBUG "isdn_ppp_skb_push:under %d %d\n", skb_headroom(skb), len);
-		dev_kfree_skb(skb);
-		*skb_p = nskb;
-		return skb_push(nskb, len);
-	}
-	return skb_push(skb, len);
-}
-
-/*
- * send ppp frame .. we expect a PIDCOMPressable proto --
- *  (here: currently always PPP_IP,PPP_VJC_COMP,PPP_VJC_UNCOMP)
- *
- * VJ compression may change skb pointer!!! .. requeue with old
- * skb isn't allowed!!
- */
-
-int
-isdn_ppp_xmit(struct sk_buff *skb, struct net_device *netdev)
-{
-	isdn_net_local *lp, *mlp;
-	isdn_net_dev *nd;
-	unsigned int proto = PPP_IP;     /* 0x21 */
-	struct ippp_struct *ipt, *ipts;
-	int slot, retval = NETDEV_TX_OK;
-
-	mlp = netdev_priv(netdev);
-	nd = mlp->netdev;       /* get master lp */
-
-	slot = mlp->ppp_slot;
-	if (slot < 0 || slot >= ISDN_MAX_CHANNELS) {
-		printk(KERN_ERR "isdn_ppp_xmit: lp->ppp_slot(%d)\n",
-		       mlp->ppp_slot);
-		kfree_skb(skb);
-		goto out;
-	}
-	ipts = ippp_table[slot];
-
-	if (!(ipts->pppcfg & SC_ENABLE_IP)) {	/* PPP connected ? */
-		if (ipts->debug & 0x1)
-			printk(KERN_INFO "%s: IP frame delayed.\n", netdev->name);
-		retval = NETDEV_TX_BUSY;
-		goto out;
-	}
-
-	switch (ntohs(skb->protocol)) {
-	case ETH_P_IP:
-		proto = PPP_IP;
-		break;
-	case ETH_P_IPX:
-		proto = PPP_IPX;	/* untested */
-		break;
-	default:
-		printk(KERN_ERR "isdn_ppp: skipped unsupported protocol: %#x.\n",
-		       skb->protocol);
-		dev_kfree_skb(skb);
-		goto out;
-	}
-
-	lp = isdn_net_get_locked_lp(nd);
-	if (!lp) {
-		printk(KERN_WARNING "%s: all channels busy - requeuing!\n", netdev->name);
-		retval = NETDEV_TX_BUSY;
-		goto out;
-	}
-	/* we have our lp locked from now on */
-
-	slot = lp->ppp_slot;
-	if (slot < 0 || slot >= ISDN_MAX_CHANNELS) {
-		printk(KERN_ERR "isdn_ppp_xmit: lp->ppp_slot(%d)\n",
-		       lp->ppp_slot);
-		kfree_skb(skb);
-		goto unlock;
-	}
-	ipt = ippp_table[slot];
-
-	/*
-	 * after this line .. requeueing in the device queue is no longer allowed!!!
-	 */
-
-	/* Pull off the fake header we stuck on earlier to keep
-	 * the fragmentation code happy.
-	 */
-	skb_pull(skb, IPPP_MAX_HEADER);
-
-#ifdef CONFIG_IPPP_FILTER
-	/* check if we should pass this packet
-	 * the filter instructions are constructed assuming
-	 * a four-byte PPP header on each packet */
-	*(u8 *)skb_push(skb, 4) = 1; /* indicate outbound */
-
-	{
-		__be16 *p = (__be16 *)skb->data;
-
-		p++;
-		*p = htons(proto);
-	}
-
-	if (ipt->pass_filter
-	    && BPF_PROG_RUN(ipt->pass_filter, skb) == 0) {
-		if (ipt->debug & 0x4)
-			printk(KERN_DEBUG "IPPP: outbound frame filtered.\n");
-		kfree_skb(skb);
-		goto unlock;
-	}
-	if (!(ipt->active_filter
-	      && BPF_PROG_RUN(ipt->active_filter, skb) == 0)) {
-		if (ipt->debug & 0x4)
-			printk(KERN_DEBUG "IPPP: link-active filter: resetting huptimer.\n");
-		lp->huptimer = 0;
-	}
-	skb_pull(skb, 4);
-#else /* CONFIG_IPPP_FILTER */
-	lp->huptimer = 0;
-#endif /* CONFIG_IPPP_FILTER */
-
-	if (ipt->debug & 0x4)
-		printk(KERN_DEBUG "xmit skb, len %d\n", (int) skb->len);
-	if (ipts->debug & 0x40)
-		isdn_ppp_frame_log("xmit0", skb->data, skb->len, 32, ipts->unit, lp->ppp_slot);
-
-#ifdef CONFIG_ISDN_PPP_VJ
-	if (proto == PPP_IP && ipts->pppcfg & SC_COMP_TCP) {	/* ipts here? probably yes, but check this again */
-		struct sk_buff *new_skb;
-		unsigned short hl;
-		/*
-		 * we need to reserve enough space in front of
-		 * sk_buff. old call to dev_alloc_skb only reserved
-		 * 16 bytes, now we are looking what the driver want.
-		 */
-		hl = dev->drv[lp->isdn_device]->interface->hl_hdrlen + IPPP_MAX_HEADER;
-		/*
-		 * Note: hl might still be insufficient because the method
-		 * above does not account for a possibible MPPP slave channel
-		 * which had larger HL header space requirements than the
-		 * master.
-		 */
-		new_skb = alloc_skb(hl + skb->len, GFP_ATOMIC);
-		if (new_skb) {
-			u_char *buf;
-			int pktlen;
-
-			skb_reserve(new_skb, hl);
-			new_skb->dev = skb->dev;
-			skb_put(new_skb, skb->len);
-			buf = skb->data;
-
-			pktlen = slhc_compress(ipts->slcomp, skb->data, skb->len, new_skb->data,
-					       &buf, !(ipts->pppcfg & SC_NO_TCP_CCID));
-
-			if (buf != skb->data) {
-				if (new_skb->data != buf)
-					printk(KERN_ERR "isdn_ppp: FATAL error after slhc_compress!!\n");
-				dev_kfree_skb(skb);
-				skb = new_skb;
-			} else {
-				dev_kfree_skb(new_skb);
-			}
-
-			skb_trim(skb, pktlen);
-			if (skb->data[0] & SL_TYPE_COMPRESSED_TCP) {	/* cslip? style -> PPP */
-				proto = PPP_VJC_COMP;
-				skb->data[0] ^= SL_TYPE_COMPRESSED_TCP;
-			} else {
-				if (skb->data[0] >= SL_TYPE_UNCOMPRESSED_TCP)
-					proto = PPP_VJC_UNCOMP;
-				skb->data[0] = (skb->data[0] & 0x0f) | 0x40;
-			}
-		}
-	}
-#endif
-
-	/*
-	 * normal (single link) or bundle compression
-	 */
-	if (ipts->compflags & SC_COMP_ON) {
-		/* We send compressed only if both down- und upstream
-		   compression is negotiated, that means, CCP is up */
-		if (ipts->compflags & SC_DECOMP_ON) {
-			skb = isdn_ppp_compress(skb, &proto, ipt, ipts, 0);
-		} else {
-			printk(KERN_DEBUG "isdn_ppp: CCP not yet up - sending as-is\n");
-		}
-	}
-
-	if (ipt->debug & 0x24)
-		printk(KERN_DEBUG "xmit2 skb, len %d, proto %04x\n", (int) skb->len, proto);
-
-#ifdef CONFIG_ISDN_MPP
-	if (ipt->mpppcfg & SC_MP_PROT) {
-		/* we get mp_seqno from static isdn_net_local */
-		long mp_seqno = ipts->mp_seqno;
-		ipts->mp_seqno++;
-		if (ipt->mpppcfg & SC_OUT_SHORT_SEQ) {
-			unsigned char *data = isdn_ppp_skb_push(&skb, 3);
-			if (!data)
-				goto unlock;
-			mp_seqno &= 0xfff;
-			data[0] = MP_BEGIN_FRAG | MP_END_FRAG | ((mp_seqno >> 8) & 0xf);	/* (B)egin & (E)ndbit .. */
-			data[1] = mp_seqno & 0xff;
-			data[2] = proto;	/* PID compression */
-		} else {
-			unsigned char *data = isdn_ppp_skb_push(&skb, 5);
-			if (!data)
-				goto unlock;
-			data[0] = MP_BEGIN_FRAG | MP_END_FRAG;	/* (B)egin & (E)ndbit .. */
-			data[1] = (mp_seqno >> 16) & 0xff;	/* sequence number: 24bit */
-			data[2] = (mp_seqno >> 8) & 0xff;
-			data[3] = (mp_seqno >> 0) & 0xff;
-			data[4] = proto;	/* PID compression */
-		}
-		proto = PPP_MP; /* MP Protocol, 0x003d */
-	}
-#endif
-
-	/*
-	 * 'link in bundle' compression  ...
-	 */
-	if (ipt->compflags & SC_LINK_COMP_ON)
-		skb = isdn_ppp_compress(skb, &proto, ipt, ipts, 1);
-
-	if ((ipt->pppcfg & SC_COMP_PROT) && (proto <= 0xff)) {
-		unsigned char *data = isdn_ppp_skb_push(&skb, 1);
-		if (!data)
-			goto unlock;
-		data[0] = proto & 0xff;
-	}
-	else {
-		unsigned char *data = isdn_ppp_skb_push(&skb, 2);
-		if (!data)
-			goto unlock;
-		data[0] = (proto >> 8) & 0xff;
-		data[1] = proto & 0xff;
-	}
-	if (!(ipt->pppcfg & SC_COMP_AC)) {
-		unsigned char *data = isdn_ppp_skb_push(&skb, 2);
-		if (!data)
-			goto unlock;
-		data[0] = 0xff;    /* All Stations */
-		data[1] = 0x03;    /* Unnumbered information */
-	}
-
-	/* tx-stats are now updated via BSENT-callback */
-
-	if (ipts->debug & 0x40) {
-		printk(KERN_DEBUG "skb xmit: len: %d\n", (int) skb->len);
-		isdn_ppp_frame_log("xmit", skb->data, skb->len, 32, ipt->unit, lp->ppp_slot);
-	}
-
-	isdn_net_writebuf_skb(lp, skb);
-
-unlock:
-	spin_unlock_bh(&lp->xmit_lock);
-out:
-	return retval;
-}
-
-#ifdef CONFIG_IPPP_FILTER
-/*
- * check if this packet may trigger auto-dial.
- */
-
-int isdn_ppp_autodial_filter(struct sk_buff *skb, isdn_net_local *lp)
-{
-	struct ippp_struct *is = ippp_table[lp->ppp_slot];
-	u_int16_t proto;
-	int drop = 0;
-
-	switch (ntohs(skb->protocol)) {
-	case ETH_P_IP:
-		proto = PPP_IP;
-		break;
-	case ETH_P_IPX:
-		proto = PPP_IPX;
-		break;
-	default:
-		printk(KERN_ERR "isdn_ppp_autodial_filter: unsupported protocol 0x%x.\n",
-		       skb->protocol);
-		return 1;
-	}
-
-	/* the filter instructions are constructed assuming
-	 * a four-byte PPP header on each packet. we have to
-	 * temporarily remove part of the fake header stuck on
-	 * earlier.
-	 */
-	*(u8 *)skb_pull(skb, IPPP_MAX_HEADER - 4) = 1; /* indicate outbound */
-
-	{
-		__be16 *p = (__be16 *)skb->data;
-
-		p++;
-		*p = htons(proto);
-	}
-
-	drop |= is->pass_filter
-		&& BPF_PROG_RUN(is->pass_filter, skb) == 0;
-	drop |= is->active_filter
-		&& BPF_PROG_RUN(is->active_filter, skb) == 0;
-
-	skb_push(skb, IPPP_MAX_HEADER - 4);
-	return drop;
-}
-#endif
-#ifdef CONFIG_ISDN_MPP
-
-/* this is _not_ rfc1990 header, but something we convert both short and long
- * headers to for convinience's sake:
- *	byte 0 is flags as in rfc1990
- *	bytes 1...4 is 24-bit seqence number converted to host byte order
- */
-#define MP_HEADER_LEN	5
-
-#define MP_LONGSEQ_MASK		0x00ffffff
-#define MP_SHORTSEQ_MASK	0x00000fff
-#define MP_LONGSEQ_MAX		MP_LONGSEQ_MASK
-#define MP_SHORTSEQ_MAX		MP_SHORTSEQ_MASK
-#define MP_LONGSEQ_MAXBIT	((MP_LONGSEQ_MASK + 1) >> 1)
-#define MP_SHORTSEQ_MAXBIT	((MP_SHORTSEQ_MASK + 1) >> 1)
-
-/* sequence-wrap safe comparisons (for long sequence)*/
-#define MP_LT(a, b)	((a - b) & MP_LONGSEQ_MAXBIT)
-#define MP_LE(a, b)	!((b - a) & MP_LONGSEQ_MAXBIT)
-#define MP_GT(a, b)	((b - a) & MP_LONGSEQ_MAXBIT)
-#define MP_GE(a, b)	!((a - b) & MP_LONGSEQ_MAXBIT)
-
-#define MP_SEQ(f)	((*(u32 *)(f->data + 1)))
-#define MP_FLAGS(f)	(f->data[0])
-
-static int isdn_ppp_mp_bundle_array_init(void)
-{
-	int i;
-	int sz = ISDN_MAX_CHANNELS * sizeof(ippp_bundle);
-	if ((isdn_ppp_bundle_arr = kzalloc(sz, GFP_KERNEL)) == NULL)
-		return -ENOMEM;
-	for (i = 0; i < ISDN_MAX_CHANNELS; i++)
-		spin_lock_init(&isdn_ppp_bundle_arr[i].lock);
-	return 0;
-}
-
-static ippp_bundle *isdn_ppp_mp_bundle_alloc(void)
-{
-	int i;
-	for (i = 0; i < ISDN_MAX_CHANNELS; i++)
-		if (isdn_ppp_bundle_arr[i].ref_ct <= 0)
-			return (isdn_ppp_bundle_arr + i);
-	return NULL;
-}
-
-static int isdn_ppp_mp_init(isdn_net_local *lp, ippp_bundle *add_to)
-{
-	struct ippp_struct *is;
-
-	if (lp->ppp_slot < 0) {
-		printk(KERN_ERR "%s: lp->ppp_slot(%d) out of range\n",
-		       __func__, lp->ppp_slot);
-		return (-EINVAL);
-	}
-
-	is = ippp_table[lp->ppp_slot];
-	if (add_to) {
-		if (lp->netdev->pb)
-			lp->netdev->pb->ref_ct--;
-		lp->netdev->pb = add_to;
-	} else {		/* first link in a bundle */
-		is->mp_seqno = 0;
-		if ((lp->netdev->pb = isdn_ppp_mp_bundle_alloc()) == NULL)
-			return -ENOMEM;
-		lp->next = lp->last = lp;	/* nobody else in a queue */
-		lp->netdev->pb->frags = NULL;
-		lp->netdev->pb->frames = 0;
-		lp->netdev->pb->seq = UINT_MAX;
-	}
-	lp->netdev->pb->ref_ct++;
-
-	is->last_link_seqno = 0;
-	return 0;
-}
-
-static u32 isdn_ppp_mp_get_seq(int short_seq,
-			       struct sk_buff *skb, u32 last_seq);
-static struct sk_buff *isdn_ppp_mp_discard(ippp_bundle *mp,
-					   struct sk_buff *from, struct sk_buff *to);
-static void isdn_ppp_mp_reassembly(isdn_net_dev *net_dev, isdn_net_local *lp,
-				   struct sk_buff *from, struct sk_buff *to);
-static void isdn_ppp_mp_free_skb(ippp_bundle *mp, struct sk_buff *skb);
-static void isdn_ppp_mp_print_recv_pkt(int slot, struct sk_buff *skb);
-
-static void isdn_ppp_mp_receive(isdn_net_dev *net_dev, isdn_net_local *lp,
-				struct sk_buff *skb)
-{
-	struct ippp_struct *is;
-	isdn_net_local *lpq;
-	ippp_bundle *mp;
-	isdn_mppp_stats *stats;
-	struct sk_buff *newfrag, *frag, *start, *nextf;
-	u32 newseq, minseq, thisseq;
-	unsigned long flags;
-	int slot;
-
-	spin_lock_irqsave(&net_dev->pb->lock, flags);
-	mp = net_dev->pb;
-	stats = &mp->stats;
-	slot = lp->ppp_slot;
-	if (slot < 0 || slot >= ISDN_MAX_CHANNELS) {
-		printk(KERN_ERR "%s: lp->ppp_slot(%d)\n",
-		       __func__, lp->ppp_slot);
-		stats->frame_drops++;
-		dev_kfree_skb(skb);
-		spin_unlock_irqrestore(&mp->lock, flags);
-		return;
-	}
-	is = ippp_table[slot];
-	if (++mp->frames > stats->max_queue_len)
-		stats->max_queue_len = mp->frames;
-
-	if (is->debug & 0x8)
-		isdn_ppp_mp_print_recv_pkt(lp->ppp_slot, skb);
-
-	newseq = isdn_ppp_mp_get_seq(is->mpppcfg & SC_IN_SHORT_SEQ,
-				     skb, is->last_link_seqno);
-
-
-	/* if this packet seq # is less than last already processed one,
-	 * toss it right away, but check for sequence start case first
-	 */
-	if (mp->seq > MP_LONGSEQ_MAX && (newseq & MP_LONGSEQ_MAXBIT)) {
-		mp->seq = newseq;	/* the first packet: required for
-					 * rfc1990 non-compliant clients --
-					 * prevents constant packet toss */
-	} else if (MP_LT(newseq, mp->seq)) {
-		stats->frame_drops++;
-		isdn_ppp_mp_free_skb(mp, skb);
-		spin_unlock_irqrestore(&mp->lock, flags);
-		return;
-	}
-
-	/* find the minimum received sequence number over all links */
-	is->last_link_seqno = minseq = newseq;
-	for (lpq = net_dev->queue;;) {
-		slot = lpq->ppp_slot;
-		if (slot < 0 || slot >= ISDN_MAX_CHANNELS) {
-			printk(KERN_ERR "%s: lpq->ppp_slot(%d)\n",
-			       __func__, lpq->ppp_slot);
-		} else {
-			u32 lls = ippp_table[slot]->last_link_seqno;
-			if (MP_LT(lls, minseq))
-				minseq = lls;
-		}
-		if ((lpq = lpq->next) == net_dev->queue)
-			break;
-	}
-	if (MP_LT(minseq, mp->seq))
-		minseq = mp->seq;	/* can't go beyond already processed
-					 * packets */
-	newfrag = skb;
-
-	/* if this new fragment is before the first one, then enqueue it now. */
-	if ((frag = mp->frags) == NULL || MP_LT(newseq, MP_SEQ(frag))) {
-		newfrag->next = frag;
-		mp->frags = frag = newfrag;
-		newfrag = NULL;
-	}
-
-	start = MP_FLAGS(frag) & MP_BEGIN_FRAG &&
-		MP_SEQ(frag) == mp->seq ? frag : NULL;
-
-	/*
-	 * main fragment traversing loop
-	 *
-	 * try to accomplish several tasks:
-	 * - insert new fragment into the proper sequence slot (once that's done
-	 *   newfrag will be set to NULL)
-	 * - reassemble any complete fragment sequence (non-null 'start'
-	 *   indicates there is a contiguous sequence present)
-	 * - discard any incomplete sequences that are below minseq -- due
-	 *   to the fact that sender always increment sequence number, if there
-	 *   is an incomplete sequence below minseq, no new fragments would
-	 *   come to complete such sequence and it should be discarded
-	 *
-	 * loop completes when we accomplished the following tasks:
-	 * - new fragment is inserted in the proper sequence ('newfrag' is
-	 *   set to NULL)
-	 * - we hit a gap in the sequence, so no reassembly/processing is
-	 *   possible ('start' would be set to NULL)
-	 *
-	 * algorithm for this code is derived from code in the book
-	 * 'PPP Design And Debugging' by James Carlson (Addison-Wesley)
-	 */
-	while (start != NULL || newfrag != NULL) {
-
-		thisseq = MP_SEQ(frag);
-		nextf = frag->next;
-
-		/* drop any duplicate fragments */
-		if (newfrag != NULL && thisseq == newseq) {
-			isdn_ppp_mp_free_skb(mp, newfrag);
-			newfrag = NULL;
-		}
-
-		/* insert new fragment before next element if possible. */
-		if (newfrag != NULL && (nextf == NULL ||
-					MP_LT(newseq, MP_SEQ(nextf)))) {
-			newfrag->next = nextf;
-			frag->next = nextf = newfrag;
-			newfrag = NULL;
-		}
-
-		if (start != NULL) {
-			/* check for misplaced start */
-			if (start != frag && (MP_FLAGS(frag) & MP_BEGIN_FRAG)) {
-				printk(KERN_WARNING"isdn_mppp(seq %d): new "
-				       "BEGIN flag with no prior END", thisseq);
-				stats->seqerrs++;
-				stats->frame_drops++;
-				start = isdn_ppp_mp_discard(mp, start, frag);
-				nextf = frag->next;
-			}
-		} else if (MP_LE(thisseq, minseq)) {
-			if (MP_FLAGS(frag) & MP_BEGIN_FRAG)
-				start = frag;
-			else {
-				if (MP_FLAGS(frag) & MP_END_FRAG)
-					stats->frame_drops++;
-				if (mp->frags == frag)
-					mp->frags = nextf;
-				isdn_ppp_mp_free_skb(mp, frag);
-				frag = nextf;
-				continue;
-			}
-		}
-
-		/* if start is non-null and we have end fragment, then
-		 * we have full reassembly sequence -- reassemble
-		 * and process packet now
-		 */
-		if (start != NULL && (MP_FLAGS(frag) & MP_END_FRAG)) {
-			minseq = mp->seq = (thisseq + 1) & MP_LONGSEQ_MASK;
-			/* Reassemble the packet then dispatch it */
-			isdn_ppp_mp_reassembly(net_dev, lp, start, nextf);
-
-			start = NULL;
-			frag = NULL;
-
-			mp->frags = nextf;
-		}
-
-		/* check if need to update start pointer: if we just
-		 * reassembled the packet and sequence is contiguous
-		 * then next fragment should be the start of new reassembly
-		 * if sequence is contiguous, but we haven't reassembled yet,
-		 * keep going.
-		 * if sequence is not contiguous, either clear everything
-		 * below low watermark and set start to the next frag or
-		 * clear start ptr.
-		 */
-		if (nextf != NULL &&
-		    ((thisseq + 1) & MP_LONGSEQ_MASK) == MP_SEQ(nextf)) {
-			/* if we just reassembled and the next one is here,
-			 * then start another reassembly. */
-
-			if (frag == NULL) {
-				if (MP_FLAGS(nextf) & MP_BEGIN_FRAG)
-					start = nextf;
-				else
-				{
-					printk(KERN_WARNING"isdn_mppp(seq %d):"
-					       " END flag with no following "
-					       "BEGIN", thisseq);
-					stats->seqerrs++;
-				}
-			}
-
-		} else {
-			if (nextf != NULL && frag != NULL &&
-			    MP_LT(thisseq, minseq)) {
-				/* we've got a break in the sequence
-				 * and we not at the end yet
-				 * and we did not just reassembled
-				 *(if we did, there wouldn't be anything before)
-				 * and we below the low watermark
-				 * discard all the frames below low watermark
-				 * and start over */
-				stats->frame_drops++;
-				mp->frags = isdn_ppp_mp_discard(mp, start, nextf);
-			}
-			/* break in the sequence, no reassembly */
-			start = NULL;
-		}
-
-		frag = nextf;
-	}	/* while -- main loop */
-
-	if (mp->frags == NULL)
-		mp->frags = frag;
-
-	/* rather straighforward way to deal with (not very) possible
-	 * queue overflow */
-	if (mp->frames > MP_MAX_QUEUE_LEN) {
-		stats->overflows++;
-		while (mp->frames > MP_MAX_QUEUE_LEN) {
-			frag = mp->frags->next;
-			isdn_ppp_mp_free_skb(mp, mp->frags);
-			mp->frags = frag;
-		}
-	}
-	spin_unlock_irqrestore(&mp->lock, flags);
-}
-
-static void isdn_ppp_mp_cleanup(isdn_net_local *lp)
-{
-	struct sk_buff *frag = lp->netdev->pb->frags;
-	struct sk_buff *nextfrag;
-	while (frag) {
-		nextfrag = frag->next;
-		isdn_ppp_mp_free_skb(lp->netdev->pb, frag);
-		frag = nextfrag;
-	}
-	lp->netdev->pb->frags = NULL;
-}
-
-static u32 isdn_ppp_mp_get_seq(int short_seq,
-			       struct sk_buff *skb, u32 last_seq)
-{
-	u32 seq;
-	int flags = skb->data[0] & (MP_BEGIN_FRAG | MP_END_FRAG);
-
-	if (!short_seq)
-	{
-		seq = ntohl(*(__be32 *)skb->data) & MP_LONGSEQ_MASK;
-		skb_push(skb, 1);
-	}
-	else
-	{
-		/* convert 12-bit short seq number to 24-bit long one
-		 */
-		seq = ntohs(*(__be16 *)skb->data) & MP_SHORTSEQ_MASK;
-
-		/* check for seqence wrap */
-		if (!(seq &  MP_SHORTSEQ_MAXBIT) &&
-		    (last_seq &  MP_SHORTSEQ_MAXBIT) &&
-		    (unsigned long)last_seq <= MP_LONGSEQ_MAX)
-			seq |= (last_seq + MP_SHORTSEQ_MAX + 1) &
-				(~MP_SHORTSEQ_MASK & MP_LONGSEQ_MASK);
-		else
-			seq |= last_seq & (~MP_SHORTSEQ_MASK & MP_LONGSEQ_MASK);
-
-		skb_push(skb, 3);	/* put converted seqence back in skb */
-	}
-	*(u32 *)(skb->data + 1) = seq;	/* put seqence back in _host_ byte
-					 * order */
-	skb->data[0] = flags;	        /* restore flags */
-	return seq;
-}
-
-static struct sk_buff *isdn_ppp_mp_discard(ippp_bundle *mp,
-					   struct sk_buff *from,
-					   struct sk_buff *to)
-{
-	if (from)
-		while (from != to) {
-			struct sk_buff *next = from->next;
-			isdn_ppp_mp_free_skb(mp, from);
-			from = next;
-		}
-	return from;
-}
-
-static void isdn_ppp_mp_reassembly(isdn_net_dev *net_dev, isdn_net_local *lp,
-				   struct sk_buff *from, struct sk_buff *to)
-{
-	ippp_bundle *mp = net_dev->pb;
-	int proto;
-	struct sk_buff *skb;
-	unsigned int tot_len;
-
-	if (lp->ppp_slot < 0 || lp->ppp_slot >= ISDN_MAX_CHANNELS) {
-		printk(KERN_ERR "%s: lp->ppp_slot(%d) out of range\n",
-		       __func__, lp->ppp_slot);
-		return;
-	}
-	if (MP_FLAGS(from) == (MP_BEGIN_FRAG | MP_END_FRAG)) {
-		if (ippp_table[lp->ppp_slot]->debug & 0x40)
-			printk(KERN_DEBUG "isdn_mppp: reassembly: frame %d, "
-			       "len %d\n", MP_SEQ(from), from->len);
-		skb = from;
-		skb_pull(skb, MP_HEADER_LEN);
-		mp->frames--;
-	} else {
-		struct sk_buff *frag;
-		int n;
-
-		for (tot_len = n = 0, frag = from; frag != to; frag = frag->next, n++)
-			tot_len += frag->len - MP_HEADER_LEN;
-
-		if (ippp_table[lp->ppp_slot]->debug & 0x40)
-			printk(KERN_DEBUG"isdn_mppp: reassembling frames %d "
-			       "to %d, len %d\n", MP_SEQ(from),
-			       (MP_SEQ(from) + n - 1) & MP_LONGSEQ_MASK, tot_len);
-		if ((skb = dev_alloc_skb(tot_len)) == NULL) {
-			printk(KERN_ERR "isdn_mppp: cannot allocate sk buff "
-			       "of size %d\n", tot_len);
-			isdn_ppp_mp_discard(mp, from, to);
-			return;
-		}
-
-		while (from != to) {
-			unsigned int len = from->len - MP_HEADER_LEN;
-
-			skb_copy_from_linear_data_offset(from, MP_HEADER_LEN,
-							 skb_put(skb, len),
-							 len);
-			frag = from->next;
-			isdn_ppp_mp_free_skb(mp, from);
-			from = frag;
-		}
-	}
-	proto = isdn_ppp_strip_proto(skb);
-	isdn_ppp_push_higher(net_dev, lp, skb, proto);
-}
-
-static void isdn_ppp_mp_free_skb(ippp_bundle *mp, struct sk_buff *skb)
-{
-	dev_kfree_skb(skb);
-	mp->frames--;
-}
-
-static void isdn_ppp_mp_print_recv_pkt(int slot, struct sk_buff *skb)
-{
-	printk(KERN_DEBUG "mp_recv: %d/%d -> %02x %02x %02x %02x %02x %02x\n",
-	       slot, (int) skb->len,
-	       (int) skb->data[0], (int) skb->data[1], (int) skb->data[2],
-	       (int) skb->data[3], (int) skb->data[4], (int) skb->data[5]);
-}
-
-static int
-isdn_ppp_bundle(struct ippp_struct *is, int unit)
-{
-	char ifn[IFNAMSIZ + 1];
-	isdn_net_dev *p;
-	isdn_net_local *lp, *nlp;
-	int rc;
-	unsigned long flags;
-
-	sprintf(ifn, "ippp%d", unit);
-	p = isdn_net_findif(ifn);
-	if (!p) {
-		printk(KERN_ERR "ippp_bundle: cannot find %s\n", ifn);
-		return -EINVAL;
-	}
-
-	spin_lock_irqsave(&p->pb->lock, flags);
-
-	nlp = is->lp;
-	lp = p->queue;
-	if (nlp->ppp_slot < 0 || nlp->ppp_slot >= ISDN_MAX_CHANNELS ||
-	    lp->ppp_slot < 0 || lp->ppp_slot >= ISDN_MAX_CHANNELS) {
-		printk(KERN_ERR "ippp_bundle: binding to invalid slot %d\n",
-		       nlp->ppp_slot < 0 || nlp->ppp_slot >= ISDN_MAX_CHANNELS ?
-		       nlp->ppp_slot : lp->ppp_slot);
-		rc = -EINVAL;
-		goto out;
-	}
-
-	isdn_net_add_to_bundle(p, nlp);
-
-	ippp_table[nlp->ppp_slot]->unit = ippp_table[lp->ppp_slot]->unit;
-
-	/* maybe also SC_CCP stuff */
-	ippp_table[nlp->ppp_slot]->pppcfg |= ippp_table[lp->ppp_slot]->pppcfg &
-		(SC_ENABLE_IP | SC_NO_TCP_CCID | SC_REJ_COMP_TCP);
-	ippp_table[nlp->ppp_slot]->mpppcfg |= ippp_table[lp->ppp_slot]->mpppcfg &
-		(SC_MP_PROT | SC_REJ_MP_PROT | SC_OUT_SHORT_SEQ | SC_IN_SHORT_SEQ);
-	rc = isdn_ppp_mp_init(nlp, p->pb);
-out:
-	spin_unlock_irqrestore(&p->pb->lock, flags);
-	return rc;
-}
-
-#endif /* CONFIG_ISDN_MPP */
-
-/*
- * network device ioctl handlers
- */
-
-static int
-isdn_ppp_dev_ioctl_stats(int slot, struct ifreq *ifr, struct net_device *dev)
-{
-	struct ppp_stats __user *res = ifr->ifr_data;
-	struct ppp_stats t;
-	isdn_net_local *lp = netdev_priv(dev);
-
-	/* build a temporary stat struct and copy it to user space */
-
-	memset(&t, 0, sizeof(struct ppp_stats));
-	if (dev->flags & IFF_UP) {
-		t.p.ppp_ipackets = lp->stats.rx_packets;
-		t.p.ppp_ibytes = lp->stats.rx_bytes;
-		t.p.ppp_ierrors = lp->stats.rx_errors;
-		t.p.ppp_opackets = lp->stats.tx_packets;
-		t.p.ppp_obytes = lp->stats.tx_bytes;
-		t.p.ppp_oerrors = lp->stats.tx_errors;
-#ifdef CONFIG_ISDN_PPP_VJ
-		if (slot >= 0 && ippp_table[slot]->slcomp) {
-			struct slcompress *slcomp = ippp_table[slot]->slcomp;
-			t.vj.vjs_packets = slcomp->sls_o_compressed + slcomp->sls_o_uncompressed;
-			t.vj.vjs_compressed = slcomp->sls_o_compressed;
-			t.vj.vjs_searches = slcomp->sls_o_searches;
-			t.vj.vjs_misses = slcomp->sls_o_misses;
-			t.vj.vjs_errorin = slcomp->sls_i_error;
-			t.vj.vjs_tossed = slcomp->sls_i_tossed;
-			t.vj.vjs_uncompressedin = slcomp->sls_i_uncompressed;
-			t.vj.vjs_compressedin = slcomp->sls_i_compressed;
-		}
-#endif
-	}
-	if (copy_to_user(res, &t, sizeof(struct ppp_stats)))
-		return -EFAULT;
-	return 0;
-}
-
-int
-isdn_ppp_dev_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
-{
-	int error = 0;
-	int len;
-	isdn_net_local *lp = netdev_priv(dev);
-
-
-	if (lp->p_encap != ISDN_NET_ENCAP_SYNCPPP)
-		return -EINVAL;
-
-	switch (cmd) {
-#define PPP_VERSION "2.3.7"
-	case SIOCGPPPVER:
-		len = strlen(PPP_VERSION) + 1;
-		if (copy_to_user(ifr->ifr_data, PPP_VERSION, len))
-			error = -EFAULT;
-		break;
-
-	case SIOCGPPPSTATS:
-		error = isdn_ppp_dev_ioctl_stats(lp->ppp_slot, ifr, dev);
-		break;
-	default:
-		error = -EINVAL;
-		break;
-	}
-	return error;
-}
-
-static int
-isdn_ppp_if_get_unit(char *name)
-{
-	int len,
-		i,
-		unit = 0,
-		deci;
-
-	len = strlen(name);
-
-	if (strncmp("ippp", name, 4) || len > 8)
-		return -1;
-
-	for (i = 0, deci = 1; i < len; i++, deci *= 10) {
-		char a = name[len - i - 1];
-		if (a >= '0' && a <= '9')
-			unit += (a - '0') * deci;
-		else
-			break;
-	}
-	if (!i || len - i != 4)
-		unit = -1;
-
-	return unit;
-}
-
-
-int
-isdn_ppp_dial_slave(char *name)
-{
-#ifdef CONFIG_ISDN_MPP
-	isdn_net_dev *ndev;
-	isdn_net_local *lp;
-	struct net_device *sdev;
-
-	if (!(ndev = isdn_net_findif(name)))
-		return 1;
-	lp = ndev->local;
-	if (!(lp->flags & ISDN_NET_CONNECTED))
-		return 5;
-
-	sdev = lp->slave;
-	while (sdev) {
-		isdn_net_local *mlp = netdev_priv(sdev);
-		if (!(mlp->flags & ISDN_NET_CONNECTED))
-			break;
-		sdev = mlp->slave;
-	}
-	if (!sdev)
-		return 2;
-
-	isdn_net_dial_req(netdev_priv(sdev));
-	return 0;
-#else
-	return -1;
-#endif
-}
-
-int
-isdn_ppp_hangup_slave(char *name)
-{
-#ifdef CONFIG_ISDN_MPP
-	isdn_net_dev *ndev;
-	isdn_net_local *lp;
-	struct net_device *sdev;
-
-	if (!(ndev = isdn_net_findif(name)))
-		return 1;
-	lp = ndev->local;
-	if (!(lp->flags & ISDN_NET_CONNECTED))
-		return 5;
-
-	sdev = lp->slave;
-	while (sdev) {
-		isdn_net_local *mlp = netdev_priv(sdev);
-
-		if (mlp->slave) { /* find last connected link in chain */
-			isdn_net_local *nlp = ISDN_SLAVE_PRIV(mlp);
-
-			if (!(nlp->flags & ISDN_NET_CONNECTED))
-				break;
-		} else if (mlp->flags & ISDN_NET_CONNECTED)
-			break;
-
-		sdev = mlp->slave;
-	}
-	if (!sdev)
-		return 2;
-
-	isdn_net_hangup(sdev);
-	return 0;
-#else
-	return -1;
-#endif
-}
-
-/*
- * PPP compression stuff
- */
-
-
-/* Push an empty CCP Data Frame up to the daemon to wake it up and let it
-   generate a CCP Reset-Request or tear down CCP altogether */
-
-static void isdn_ppp_ccp_kickup(struct ippp_struct *is)
-{
-	isdn_ppp_fill_rq(NULL, 0, PPP_COMP, is->lp->ppp_slot);
-}
-
-/* In-kernel handling of CCP Reset-Request and Reset-Ack is necessary,
-   but absolutely nontrivial. The most abstruse problem we are facing is
-   that the generation, reception and all the handling of timeouts and
-   resends including proper request id management should be entirely left
-   to the (de)compressor, but indeed is not covered by the current API to
-   the (de)compressor. The API is a prototype version from PPP where only
-   some (de)compressors have yet been implemented and all of them are
-   rather simple in their reset handling. Especially, their is only one
-   outstanding ResetAck at a time with all of them and ResetReq/-Acks do
-   not have parameters. For this very special case it was sufficient to
-   just return an error code from the decompressor and have a single
-   reset() entry to communicate all the necessary information between
-   the framework and the (de)compressor. Bad enough, LZS is different
-   (and any other compressor may be different, too). It has multiple
-   histories (eventually) and needs to Reset each of them independently
-   and thus uses multiple outstanding Acks and history numbers as an
-   additional parameter to Reqs/Acks.
-   All that makes it harder to port the reset state engine into the
-   kernel because it is not just the same simple one as in (i)pppd but
-   it must be able to pass additional parameters and have multiple out-
-   standing Acks. We are trying to achieve the impossible by handling
-   reset transactions independent by their id. The id MUST change when
-   the data portion changes, thus any (de)compressor who uses more than
-   one resettable state must provide and recognize individual ids for
-   each individual reset transaction. The framework itself does _only_
-   differentiate them by id, because it has no other semantics like the
-   (de)compressor might.
-   This looks like a major redesign of the interface would be nice,
-   but I don't have an idea how to do it better. */
-
-/* Send a CCP Reset-Request or Reset-Ack directly from the kernel. This is
-   getting that lengthy because there is no simple "send-this-frame-out"
-   function above but every wrapper does a bit different. Hope I guess
-   correct in this hack... */
-
-static void isdn_ppp_ccp_xmit_reset(struct ippp_struct *is, int proto,
-				    unsigned char code, unsigned char id,
-				    unsigned char *data, int len)
-{
-	struct sk_buff *skb;
-	unsigned char *p;
-	int hl;
-	int cnt = 0;
-	isdn_net_local *lp = is->lp;
-
-	/* Alloc large enough skb */
-	hl = dev->drv[lp->isdn_device]->interface->hl_hdrlen;
-	skb = alloc_skb(len + hl + 16, GFP_ATOMIC);
-	if (!skb) {
-		printk(KERN_WARNING
-		       "ippp: CCP cannot send reset - out of memory\n");
-		return;
-	}
-	skb_reserve(skb, hl);
-
-	/* We may need to stuff an address and control field first */
-	if (!(is->pppcfg & SC_COMP_AC)) {
-		p = skb_put(skb, 2);
-		*p++ = 0xff;
-		*p++ = 0x03;
-	}
-
-	/* Stuff proto, code, id and length */
-	p = skb_put(skb, 6);
-	*p++ = (proto >> 8);
-	*p++ = (proto & 0xff);
-	*p++ = code;
-	*p++ = id;
-	cnt = 4 + len;
-	*p++ = (cnt >> 8);
-	*p++ = (cnt & 0xff);
-
-	/* Now stuff remaining bytes */
-	if (len) {
-		skb_put_data(skb, data, len);
-	}
-
-	/* skb is now ready for xmit */
-	printk(KERN_DEBUG "Sending CCP Frame:\n");
-	isdn_ppp_frame_log("ccp-xmit", skb->data, skb->len, 32, is->unit, lp->ppp_slot);
-
-	isdn_net_write_super(lp, skb);
-}
-
-/* Allocate the reset state vector */
-static struct ippp_ccp_reset *isdn_ppp_ccp_reset_alloc(struct ippp_struct *is)
-{
-	struct ippp_ccp_reset *r;
-	r = kzalloc(sizeof(struct ippp_ccp_reset), GFP_KERNEL);
-	if (!r) {
-		printk(KERN_ERR "ippp_ccp: failed to allocate reset data"
-		       " structure - no mem\n");
-		return NULL;
-	}
-	printk(KERN_DEBUG "ippp_ccp: allocated reset data structure %p\n", r);
-	is->reset = r;
-	return r;
-}
-
-/* Destroy the reset state vector. Kill all pending timers first. */
-static void isdn_ppp_ccp_reset_free(struct ippp_struct *is)
-{
-	unsigned int id;
-
-	printk(KERN_DEBUG "ippp_ccp: freeing reset data structure %p\n",
-	       is->reset);
-	for (id = 0; id < 256; id++) {
-		if (is->reset->rs[id]) {
-			isdn_ppp_ccp_reset_free_state(is, (unsigned char)id);
-		}
-	}
-	kfree(is->reset);
-	is->reset = NULL;
-}
-
-/* Free a given state and clear everything up for later reallocation */
-static void isdn_ppp_ccp_reset_free_state(struct ippp_struct *is,
-					  unsigned char id)
-{
-	struct ippp_ccp_reset_state *rs;
-
-	if (is->reset->rs[id]) {
-		printk(KERN_DEBUG "ippp_ccp: freeing state for id %d\n", id);
-		rs = is->reset->rs[id];
-		/* Make sure the kernel will not call back later */
-		if (rs->ta)
-			del_timer(&rs->timer);
-		is->reset->rs[id] = NULL;
-		kfree(rs);
-	} else {
-		printk(KERN_WARNING "ippp_ccp: id %d is not allocated\n", id);
-	}
-}
-
-/* The timer callback function which is called when a ResetReq has timed out,
-   aka has never been answered by a ResetAck */
-static void isdn_ppp_ccp_timer_callback(struct timer_list *t)
-{
-	struct ippp_ccp_reset_state *rs =
-		from_timer(rs, t, timer);
-
-	if (!rs) {
-		printk(KERN_ERR "ippp_ccp: timer cb with zero closure.\n");
-		return;
-	}
-	if (rs->ta && rs->state == CCPResetSentReq) {
-		/* We are correct here */
-		if (!rs->expra) {
-			/* Hmm, there is no Ack really expected. We can clean
-			   up the state now, it will be reallocated if the
-			   decompressor insists on another reset */
-			rs->ta = 0;
-			isdn_ppp_ccp_reset_free_state(rs->is, rs->id);
-			return;
-		}
-		printk(KERN_DEBUG "ippp_ccp: CCP Reset timed out for id %d\n",
-		       rs->id);
-		/* Push it again */
-		isdn_ppp_ccp_xmit_reset(rs->is, PPP_CCP, CCP_RESETREQ, rs->id,
-					rs->data, rs->dlen);
-		/* Restart timer */
-		rs->timer.expires = jiffies + HZ * 5;
-		add_timer(&rs->timer);
-	} else {
-		printk(KERN_WARNING "ippp_ccp: timer cb in wrong state %d\n",
-		       rs->state);
-	}
-}
-
-/* Allocate a new reset transaction state */
-static struct ippp_ccp_reset_state *isdn_ppp_ccp_reset_alloc_state(struct ippp_struct *is,
-								   unsigned char id)
-{
-	struct ippp_ccp_reset_state *rs;
-	if (is->reset->rs[id]) {
-		printk(KERN_WARNING "ippp_ccp: old state exists for id %d\n",
-		       id);
-		return NULL;
-	} else {
-		rs = kzalloc(sizeof(struct ippp_ccp_reset_state), GFP_ATOMIC);
-		if (!rs)
-			return NULL;
-		rs->state = CCPResetIdle;
-		rs->is = is;
-		rs->id = id;
-		timer_setup(&rs->timer, isdn_ppp_ccp_timer_callback, 0);
-		is->reset->rs[id] = rs;
-	}
-	return rs;
-}
-
-
-/* A decompressor wants a reset with a set of parameters - do what is
-   necessary to fulfill it */
-static void isdn_ppp_ccp_reset_trans(struct ippp_struct *is,
-				     struct isdn_ppp_resetparams *rp)
-{
-	struct ippp_ccp_reset_state *rs;
-
-	if (rp->valid) {
-		/* The decompressor defines parameters by itself */
-		if (rp->rsend) {
-			/* And he wants us to send a request */
-			if (!(rp->idval)) {
-				printk(KERN_ERR "ippp_ccp: decompressor must"
-				       " specify reset id\n");
-				return;
-			}
-			if (is->reset->rs[rp->id]) {
-				/* There is already a transaction in existence
-				   for this id. May be still waiting for a
-				   Ack or may be wrong. */
-				rs = is->reset->rs[rp->id];
-				if (rs->state == CCPResetSentReq && rs->ta) {
-					printk(KERN_DEBUG "ippp_ccp: reset"
-					       " trans still in progress"
-					       " for id %d\n", rp->id);
-				} else {
-					printk(KERN_WARNING "ippp_ccp: reset"
-					       " trans in wrong state %d for"
-					       " id %d\n", rs->state, rp->id);
-				}
-			} else {
-				/* Ok, this is a new transaction */
-				printk(KERN_DEBUG "ippp_ccp: new trans for id"
-				       " %d to be started\n", rp->id);
-				rs = isdn_ppp_ccp_reset_alloc_state(is, rp->id);
-				if (!rs) {
-					printk(KERN_ERR "ippp_ccp: out of mem"
-					       " allocing ccp trans\n");
-					return;
-				}
-				rs->state = CCPResetSentReq;
-				rs->expra = rp->expra;
-				if (rp->dtval) {
-					rs->dlen = rp->dlen;
-					memcpy(rs->data, rp->data, rp->dlen);
-				}
-				/* HACK TODO - add link comp here */
-				isdn_ppp_ccp_xmit_reset(is, PPP_CCP,
-							CCP_RESETREQ, rs->id,
-							rs->data, rs->dlen);
-				/* Start the timer */
-				rs->timer.expires = jiffies + 5 * HZ;
-				add_timer(&rs->timer);
-				rs->ta = 1;
-			}
-		} else {
-			printk(KERN_DEBUG "ippp_ccp: no reset sent\n");
-		}
-	} else {
-		/* The reset params are invalid. The decompressor does not
-		   care about them, so we just send the minimal requests
-		   and increase ids only when an Ack is received for a
-		   given id */
-		if (is->reset->rs[is->reset->lastid]) {
-			/* There is already a transaction in existence
-			   for this id. May be still waiting for a
-			   Ack or may be wrong. */
-			rs = is->reset->rs[is->reset->lastid];
-			if (rs->state == CCPResetSentReq && rs->ta) {
-				printk(KERN_DEBUG "ippp_ccp: reset"
-				       " trans still in progress"
-				       " for id %d\n", rp->id);
-			} else {
-				printk(KERN_WARNING "ippp_ccp: reset"
-				       " trans in wrong state %d for"
-				       " id %d\n", rs->state, rp->id);
-			}
-		} else {
-			printk(KERN_DEBUG "ippp_ccp: new trans for id"
-			       " %d to be started\n", is->reset->lastid);
-			rs = isdn_ppp_ccp_reset_alloc_state(is,
-							    is->reset->lastid);
-			if (!rs) {
-				printk(KERN_ERR "ippp_ccp: out of mem"
-				       " allocing ccp trans\n");
-				return;
-			}
-			rs->state = CCPResetSentReq;
-			/* We always expect an Ack if the decompressor doesn't
-			   know	better */
-			rs->expra = 1;
-			rs->dlen = 0;
-			/* HACK TODO - add link comp here */
-			isdn_ppp_ccp_xmit_reset(is, PPP_CCP, CCP_RESETREQ,
-						rs->id, NULL, 0);
-			/* Start the timer */
-			rs->timer.expires = jiffies + 5 * HZ;
-			add_timer(&rs->timer);
-			rs->ta = 1;
-		}
-	}
-}
-
-/* An Ack was received for this id. This means we stop the timer and clean
-   up the state prior to calling the decompressors reset routine. */
-static void isdn_ppp_ccp_reset_ack_rcvd(struct ippp_struct *is,
-					unsigned char id)
-{
-	struct ippp_ccp_reset_state *rs = is->reset->rs[id];
-
-	if (rs) {
-		if (rs->ta && rs->state == CCPResetSentReq) {
-			/* Great, we are correct */
-			if (!rs->expra)
-				printk(KERN_DEBUG "ippp_ccp: ResetAck received"
-				       " for id %d but not expected\n", id);
-		} else {
-			printk(KERN_INFO "ippp_ccp: ResetAck received out of"
-			       "sync for id %d\n", id);
-		}
-		if (rs->ta) {
-			rs->ta = 0;
-			del_timer(&rs->timer);
-		}
-		isdn_ppp_ccp_reset_free_state(is, id);
-	} else {
-		printk(KERN_INFO "ippp_ccp: ResetAck received for unknown id"
-		       " %d\n", id);
-	}
-	/* Make sure the simple reset stuff uses a new id next time */
-	is->reset->lastid++;
-}
-
-/*
- * decompress packet
- *
- * if master = 0, we're trying to uncompress an per-link compressed packet,
- * as opposed to an compressed reconstructed-from-MPPP packet.
- * proto is updated to protocol field of uncompressed packet.
- *
- * retval: decompressed packet,
- *         same packet if uncompressed,
- *	   NULL if decompression error
- */
-
-static struct sk_buff *isdn_ppp_decompress(struct sk_buff *skb, struct ippp_struct *is, struct ippp_struct *master,
-					   int *proto)
-{
-	void *stat = NULL;
-	struct isdn_ppp_compressor *ipc = NULL;
-	struct sk_buff *skb_out;
-	int len;
-	struct ippp_struct *ri;
-	struct isdn_ppp_resetparams rsparm;
-	unsigned char rsdata[IPPP_RESET_MAXDATABYTES];
-
-	if (!master) {
-		// per-link decompression
-		stat = is->link_decomp_stat;
-		ipc = is->link_decompressor;
-		ri = is;
-	} else {
-		stat = master->decomp_stat;
-		ipc = master->decompressor;
-		ri = master;
-	}
-
-	if (!ipc) {
-		// no decompressor -> we can't decompress.
-		printk(KERN_DEBUG "ippp: no decompressor defined!\n");
-		return skb;
-	}
-	BUG_ON(!stat); // if we have a compressor, stat has been set as well
-
-	if ((master && *proto == PPP_COMP) || (!master && *proto == PPP_COMPFRAG)) {
-		// compressed packets are compressed by their protocol type
-
-		// Set up reset params for the decompressor
-		memset(&rsparm, 0, sizeof(rsparm));
-		rsparm.data = rsdata;
-		rsparm.maxdlen = IPPP_RESET_MAXDATABYTES;
-
-		skb_out = dev_alloc_skb(is->mru + PPP_HDRLEN);
-		if (!skb_out) {
-			kfree_skb(skb);
-			printk(KERN_ERR "ippp: decomp memory allocation failure\n");
-			return NULL;
-		}
-		len = ipc->decompress(stat, skb, skb_out, &rsparm);
-		kfree_skb(skb);
-		if (len <= 0) {
-			switch (len) {
-			case DECOMP_ERROR:
-				printk(KERN_INFO "ippp: decomp wants reset %s params\n",
-				       rsparm.valid ? "with" : "without");
-
-				isdn_ppp_ccp_reset_trans(ri, &rsparm);
-				break;
-			case DECOMP_FATALERROR:
-				ri->pppcfg |= SC_DC_FERROR;
-				/* Kick ipppd to recognize the error */
-				isdn_ppp_ccp_kickup(ri);
-				break;
-			}
-			kfree_skb(skb_out);
-			return NULL;
-		}
-		*proto = isdn_ppp_strip_proto(skb_out);
-		if (*proto < 0) {
-			kfree_skb(skb_out);
-			return NULL;
-		}
-		return skb_out;
-	} else {
-		// uncompressed packets are fed through the decompressor to
-		// update the decompressor state
-		ipc->incomp(stat, skb, *proto);
-		return skb;
-	}
-}
-
-/*
- * compress a frame
- *   type=0: normal/bundle compression
- *       =1: link compression
- * returns original skb if we haven't compressed the frame
- * and a new skb pointer if we've done it
- */
-static struct sk_buff *isdn_ppp_compress(struct sk_buff *skb_in, int *proto,
-					 struct ippp_struct *is, struct ippp_struct *master, int type)
-{
-	int ret;
-	int new_proto;
-	struct isdn_ppp_compressor *compressor;
-	void *stat;
-	struct sk_buff *skb_out;
-
-	/* we do not compress control protocols */
-	if (*proto < 0 || *proto > 0x3fff) {
-		return skb_in;
-	}
-
-	if (type) { /* type=1 => Link compression */
-		return skb_in;
-	}
-	else {
-		if (!master) {
-			compressor = is->compressor;
-			stat = is->comp_stat;
-		}
-		else {
-			compressor = master->compressor;
-			stat = master->comp_stat;
-		}
-		new_proto = PPP_COMP;
-	}
-
-	if (!compressor) {
-		printk(KERN_ERR "isdn_ppp: No compressor set!\n");
-		return skb_in;
-	}
-	if (!stat) {
-		printk(KERN_ERR "isdn_ppp: Compressor not initialized?\n");
-		return skb_in;
-	}
-
-	/* Allow for at least 150 % expansion (for now) */
-	skb_out = alloc_skb(skb_in->len + skb_in->len / 2 + 32 +
-			    skb_headroom(skb_in), GFP_ATOMIC);
-	if (!skb_out)
-		return skb_in;
-	skb_reserve(skb_out, skb_headroom(skb_in));
-
-	ret = (compressor->compress)(stat, skb_in, skb_out, *proto);
-	if (!ret) {
-		dev_kfree_skb(skb_out);
-		return skb_in;
-	}
-
-	dev_kfree_skb(skb_in);
-	*proto = new_proto;
-	return skb_out;
-}
-
-/*
- * we received a CCP frame ..
- * not a clean solution, but we MUST handle a few cases in the kernel
- */
-static void isdn_ppp_receive_ccp(isdn_net_dev *net_dev, isdn_net_local *lp,
-				 struct sk_buff *skb, int proto)
-{
-	struct ippp_struct *is;
-	struct ippp_struct *mis;
-	int len;
-	struct isdn_ppp_resetparams rsparm;
-	unsigned char rsdata[IPPP_RESET_MAXDATABYTES];
-
-	printk(KERN_DEBUG "Received CCP frame from peer slot(%d)\n",
-	       lp->ppp_slot);
-	if (lp->ppp_slot < 0 || lp->ppp_slot >= ISDN_MAX_CHANNELS) {
-		printk(KERN_ERR "%s: lp->ppp_slot(%d) out of range\n",
-		       __func__, lp->ppp_slot);
-		return;
-	}
-	is = ippp_table[lp->ppp_slot];
-	isdn_ppp_frame_log("ccp-rcv", skb->data, skb->len, 32, is->unit, lp->ppp_slot);
-
-	if (lp->master) {
-		int slot = ISDN_MASTER_PRIV(lp)->ppp_slot;
-		if (slot < 0 || slot >= ISDN_MAX_CHANNELS) {
-			printk(KERN_ERR "%s: slot(%d) out of range\n",
-			       __func__, slot);
-			return;
-		}
-		mis = ippp_table[slot];
-	} else
-		mis = is;
-
-	switch (skb->data[0]) {
-	case CCP_CONFREQ:
-		if (is->debug & 0x10)
-			printk(KERN_DEBUG "Disable compression here!\n");
-		if (proto == PPP_CCP)
-			mis->compflags &= ~SC_COMP_ON;
-		else
-			is->compflags &= ~SC_LINK_COMP_ON;
-		break;
-	case CCP_TERMREQ:
-	case CCP_TERMACK:
-		if (is->debug & 0x10)
-			printk(KERN_DEBUG "Disable (de)compression here!\n");
-		if (proto == PPP_CCP)
-			mis->compflags &= ~(SC_DECOMP_ON | SC_COMP_ON);
-		else
-			is->compflags &= ~(SC_LINK_DECOMP_ON | SC_LINK_COMP_ON);
-		break;
-	case CCP_CONFACK:
-		/* if we RECEIVE an ackowledge we enable the decompressor */
-		if (is->debug & 0x10)
-			printk(KERN_DEBUG "Enable decompression here!\n");
-		if (proto == PPP_CCP) {
-			if (!mis->decompressor)
-				break;
-			mis->compflags |= SC_DECOMP_ON;
-		} else {
-			if (!is->decompressor)
-				break;
-			is->compflags |= SC_LINK_DECOMP_ON;
-		}
-		break;
-
-	case CCP_RESETACK:
-		printk(KERN_DEBUG "Received ResetAck from peer\n");
-		len = (skb->data[2] << 8) | skb->data[3];
-		len -= 4;
-
-		if (proto == PPP_CCP) {
-			/* If a reset Ack was outstanding for this id, then
-			   clean up the state engine */
-			isdn_ppp_ccp_reset_ack_rcvd(mis, skb->data[1]);
-			if (mis->decompressor && mis->decomp_stat)
-				mis->decompressor->
-					reset(mis->decomp_stat,
-					      skb->data[0],
-					      skb->data[1],
-					      len ? &skb->data[4] : NULL,
-					      len, NULL);
-			/* TODO: This is not easy to decide here */
-			mis->compflags &= ~SC_DECOMP_DISCARD;
-		}
-		else {
-			isdn_ppp_ccp_reset_ack_rcvd(is, skb->data[1]);
-			if (is->link_decompressor && is->link_decomp_stat)
-				is->link_decompressor->
-					reset(is->link_decomp_stat,
-					      skb->data[0],
-					      skb->data[1],
-					      len ? &skb->data[4] : NULL,
-					      len, NULL);
-			/* TODO: neither here */
-			is->compflags &= ~SC_LINK_DECOMP_DISCARD;
-		}
-		break;
-
-	case CCP_RESETREQ:
-		printk(KERN_DEBUG "Received ResetReq from peer\n");
-		/* Receiving a ResetReq means we must reset our compressor */
-		/* Set up reset params for the reset entry */
-		memset(&rsparm, 0, sizeof(rsparm));
-		rsparm.data = rsdata;
-		rsparm.maxdlen = IPPP_RESET_MAXDATABYTES;
-		/* Isolate data length */
-		len = (skb->data[2] << 8) | skb->data[3];
-		len -= 4;
-		if (proto == PPP_CCP) {
-			if (mis->compressor && mis->comp_stat)
-				mis->compressor->
-					reset(mis->comp_stat,
-					      skb->data[0],
-					      skb->data[1],
-					      len ? &skb->data[4] : NULL,
-					      len, &rsparm);
-		}
-		else {
-			if (is->link_compressor && is->link_comp_stat)
-				is->link_compressor->
-					reset(is->link_comp_stat,
-					      skb->data[0],
-					      skb->data[1],
-					      len ? &skb->data[4] : NULL,
-					      len, &rsparm);
-		}
-		/* Ack the Req as specified by rsparm */
-		if (rsparm.valid) {
-			/* Compressor reset handler decided how to answer */
-			if (rsparm.rsend) {
-				/* We should send a Frame */
-				isdn_ppp_ccp_xmit_reset(is, proto, CCP_RESETACK,
-							rsparm.idval ? rsparm.id
-							: skb->data[1],
-							rsparm.dtval ?
-							rsparm.data : NULL,
-							rsparm.dtval ?
-							rsparm.dlen : 0);
-			} else {
-				printk(KERN_DEBUG "ResetAck suppressed\n");
-			}
-		} else {
-			/* We answer with a straight reflected Ack */
-			isdn_ppp_ccp_xmit_reset(is, proto, CCP_RESETACK,
-						skb->data[1],
-						len ? &skb->data[4] : NULL,
-						len);
-		}
-		break;
-	}
-}
-
-
-/*
- * Daemon sends a CCP frame ...
- */
-
-/* TODO: Clean this up with new Reset semantics */
-
-/* I believe the CCP handling as-is is done wrong. Compressed frames
- * should only be sent/received after CCP reaches UP state, which means
- * both sides have sent CONF_ACK. Currently, we handle both directions
- * independently, which means we may accept compressed frames too early
- * (supposedly not a problem), but may also mean we send compressed frames
- * too early, which may turn out to be a problem.
- * This part of state machine should actually be handled by (i)pppd, but
- * that's too big of a change now. --kai
- */
-
-/* Actually, we might turn this into an advantage: deal with the RFC in
- * the old tradition of beeing generous on what we accept, but beeing
- * strict on what we send. Thus we should just
- * - accept compressed frames as soon as decompression is negotiated
- * - send compressed frames only when decomp *and* comp are negotiated
- * - drop rx compressed frames if we cannot decomp (instead of pushing them
- *   up to ipppd)
- * and I tried to modify this file according to that. --abp
- */
-
-static void isdn_ppp_send_ccp(isdn_net_dev *net_dev, isdn_net_local *lp, struct sk_buff *skb)
-{
-	struct ippp_struct *mis, *is;
-	int proto, slot = lp->ppp_slot;
-	unsigned char *data;
-
-	if (!skb || skb->len < 3)
-		return;
-	if (slot < 0 || slot >= ISDN_MAX_CHANNELS) {
-		printk(KERN_ERR "%s: lp->ppp_slot(%d) out of range\n",
-		       __func__, slot);
-		return;
-	}
-	is = ippp_table[slot];
-	/* Daemon may send with or without address and control field comp */
-	data = skb->data;
-	if (!(is->pppcfg & SC_COMP_AC) && data[0] == 0xff && data[1] == 0x03) {
-		data += 2;
-		if (skb->len < 5)
-			return;
-	}
-
-	proto = ((int)data[0]<<8) + data[1];
-	if (proto != PPP_CCP && proto != PPP_CCPFRAG)
-		return;
-
-	printk(KERN_DEBUG "Received CCP frame from daemon:\n");
-	isdn_ppp_frame_log("ccp-xmit", skb->data, skb->len, 32, is->unit, lp->ppp_slot);
-
-	if (lp->master) {
-		slot = ISDN_MASTER_PRIV(lp)->ppp_slot;
-		if (slot < 0 || slot >= ISDN_MAX_CHANNELS) {
-			printk(KERN_ERR "%s: slot(%d) out of range\n",
-			       __func__, slot);
-			return;
-		}
-		mis = ippp_table[slot];
-	} else
-		mis = is;
-	if (mis != is)
-		printk(KERN_DEBUG "isdn_ppp: Ouch! Master CCP sends on slave slot!\n");
-
-	switch (data[2]) {
-	case CCP_CONFREQ:
-		if (is->debug & 0x10)
-			printk(KERN_DEBUG "Disable decompression here!\n");
-		if (proto == PPP_CCP)
-			is->compflags &= ~SC_DECOMP_ON;
-		else
-			is->compflags &= ~SC_LINK_DECOMP_ON;
-		break;
-	case CCP_TERMREQ:
-	case CCP_TERMACK:
-		if (is->debug & 0x10)
-			printk(KERN_DEBUG "Disable (de)compression here!\n");
-		if (proto == PPP_CCP)
-			is->compflags &= ~(SC_DECOMP_ON | SC_COMP_ON);
-		else
-			is->compflags &= ~(SC_LINK_DECOMP_ON | SC_LINK_COMP_ON);
-		break;
-	case CCP_CONFACK:
-		/* if we SEND an ackowledge we can/must enable the compressor */
-		if (is->debug & 0x10)
-			printk(KERN_DEBUG "Enable compression here!\n");
-		if (proto == PPP_CCP) {
-			if (!is->compressor)
-				break;
-			is->compflags |= SC_COMP_ON;
-		} else {
-			if (!is->compressor)
-				break;
-			is->compflags |= SC_LINK_COMP_ON;
-		}
-		break;
-	case CCP_RESETACK:
-		/* If we send a ACK we should reset our compressor */
-		if (is->debug & 0x10)
-			printk(KERN_DEBUG "Reset decompression state here!\n");
-		printk(KERN_DEBUG "ResetAck from daemon passed by\n");
-		if (proto == PPP_CCP) {
-			/* link to master? */
-			if (is->compressor && is->comp_stat)
-				is->compressor->reset(is->comp_stat, 0, 0,
-						      NULL, 0, NULL);
-			is->compflags &= ~SC_COMP_DISCARD;
-		}
-		else {
-			if (is->link_compressor && is->link_comp_stat)
-				is->link_compressor->reset(is->link_comp_stat,
-							   0, 0, NULL, 0, NULL);
-			is->compflags &= ~SC_LINK_COMP_DISCARD;
-		}
-		break;
-	case CCP_RESETREQ:
-		/* Just let it pass by */
-		printk(KERN_DEBUG "ResetReq from daemon passed by\n");
-		break;
-	}
-}
-
-int isdn_ppp_register_compressor(struct isdn_ppp_compressor *ipc)
-{
-	ipc->next = ipc_head;
-	ipc->prev = NULL;
-	if (ipc_head) {
-		ipc_head->prev = ipc;
-	}
-	ipc_head = ipc;
-	return 0;
-}
-
-int isdn_ppp_unregister_compressor(struct isdn_ppp_compressor *ipc)
-{
-	if (ipc->prev)
-		ipc->prev->next = ipc->next;
-	else
-		ipc_head = ipc->next;
-	if (ipc->next)
-		ipc->next->prev = ipc->prev;
-	ipc->prev = ipc->next = NULL;
-	return 0;
-}
-
-static int isdn_ppp_set_compressor(struct ippp_struct *is, struct isdn_ppp_comp_data *data)
-{
-	struct isdn_ppp_compressor *ipc = ipc_head;
-	int ret;
-	void *stat;
-	int num = data->num;
-
-	if (is->debug & 0x10)
-		printk(KERN_DEBUG "[%d] Set %s type %d\n", is->unit,
-		       (data->flags & IPPP_COMP_FLAG_XMIT) ? "compressor" : "decompressor", num);
-
-	/* If is has no valid reset state vector, we cannot allocate a
-	   decompressor. The decompressor would cause reset transactions
-	   sooner or later, and they need that vector. */
-
-	if (!(data->flags & IPPP_COMP_FLAG_XMIT) && !is->reset) {
-		printk(KERN_ERR "ippp_ccp: no reset data structure - can't"
-		       " allow decompression.\n");
-		return -ENOMEM;
-	}
-
-	while (ipc) {
-		if (ipc->num == num) {
-			stat = ipc->alloc(data);
-			if (stat) {
-				ret = ipc->init(stat, data, is->unit, 0);
-				if (!ret) {
-					printk(KERN_ERR "Can't init (de)compression!\n");
-					ipc->free(stat);
-					stat = NULL;
-					break;
-				}
-			}
-			else {
-				printk(KERN_ERR "Can't alloc (de)compression!\n");
-				break;
-			}
-
-			if (data->flags & IPPP_COMP_FLAG_XMIT) {
-				if (data->flags & IPPP_COMP_FLAG_LINK) {
-					if (is->link_comp_stat)
-						is->link_compressor->free(is->link_comp_stat);
-					is->link_comp_stat = stat;
-					is->link_compressor = ipc;
-				}
-				else {
-					if (is->comp_stat)
-						is->compressor->free(is->comp_stat);
-					is->comp_stat = stat;
-					is->compressor = ipc;
-				}
-			}
-			else {
-				if (data->flags & IPPP_COMP_FLAG_LINK) {
-					if (is->link_decomp_stat)
-						is->link_decompressor->free(is->link_decomp_stat);
-					is->link_decomp_stat = stat;
-					is->link_decompressor = ipc;
-				}
-				else {
-					if (is->decomp_stat)
-						is->decompressor->free(is->decomp_stat);
-					is->decomp_stat = stat;
-					is->decompressor = ipc;
-				}
-			}
-			return 0;
-		}
-		ipc = ipc->next;
-	}
-	return -EINVAL;
-}
diff --git a/drivers/isdn/i4l/isdn_ppp.h b/drivers/isdn/i4l/isdn_ppp.h
deleted file mode 100644
index 34b8a2ce84f3..000000000000
--- a/drivers/isdn/i4l/isdn_ppp.h
+++ /dev/null
@@ -1,41 +0,0 @@
-/* $Id: isdn_ppp.h,v 1.1.2.2 2004/01/12 22:37:19 keil Exp $
- *
- * header for Linux ISDN subsystem, functions for synchronous PPP (linklevel).
- *
- * Copyright 1995,96 by Michael Hipp (Michael.Hipp@student.uni-tuebingen.de)
- *
- * This software may be used and distributed according to the terms
- * of the GNU General Public License, incorporated herein by reference.
- *
- */
-
-#include <linux/ppp_defs.h>     /* for PPP_PROTOCOL */
-#include <linux/isdn_ppp.h>	/* for isdn_ppp info */
-
-extern int isdn_ppp_read(int, struct file *, char __user *, int);
-extern int isdn_ppp_write(int, struct file *, const char __user *, int);
-extern int isdn_ppp_open(int, struct file *);
-extern int isdn_ppp_init(void);
-extern void isdn_ppp_cleanup(void);
-extern int isdn_ppp_free(isdn_net_local *);
-extern int isdn_ppp_bind(isdn_net_local *);
-extern int isdn_ppp_autodial_filter(struct sk_buff *, isdn_net_local *);
-extern int isdn_ppp_xmit(struct sk_buff *, struct net_device *);
-extern void isdn_ppp_receive(isdn_net_dev *, isdn_net_local *, struct sk_buff *);
-extern int isdn_ppp_dev_ioctl(struct net_device *, struct ifreq *, int);
-extern __poll_t isdn_ppp_poll(struct file *, struct poll_table_struct *);
-extern int isdn_ppp_ioctl(int, struct file *, unsigned int, unsigned long);
-extern void isdn_ppp_release(int, struct file *);
-extern int isdn_ppp_dial_slave(char *);
-extern void isdn_ppp_wakeup_daemon(isdn_net_local *);
-
-extern int isdn_ppp_register_compressor(struct isdn_ppp_compressor *ipc);
-extern int isdn_ppp_unregister_compressor(struct isdn_ppp_compressor *ipc);
-
-#define IPPP_OPEN	0x01
-#define IPPP_CONNECT	0x02
-#define IPPP_CLOSEWAIT	0x04
-#define IPPP_NOBLOCK	0x08
-#define IPPP_ASSIGNED	0x10
-
-#define IPPP_MAX_HEADER 10
diff --git a/drivers/isdn/i4l/isdn_tty.c b/drivers/isdn/i4l/isdn_tty.c
deleted file mode 100644
index 43700fc19a31..000000000000
--- a/drivers/isdn/i4l/isdn_tty.c
+++ /dev/null
@@ -1,3756 +0,0 @@
-/*
- * Linux ISDN subsystem, tty functions and AT-command emulator (linklevel).
- *
- * Copyright 1994-1999  by Fritz Elfert (fritz@isdn4linux.de)
- * Copyright 1995,96    by Thinking Objects Software GmbH Wuerzburg
- *
- * This software may be used and distributed according to the terms
- * of the GNU General Public License, incorporated herein by reference.
- *
- */
-#undef ISDN_TTY_STAT_DEBUG
-
-#include <linux/isdn.h>
-#include <linux/serial.h> /* ASYNC_* flags */
-#include <linux/slab.h>
-#include <linux/delay.h>
-#include <linux/mutex.h>
-#include <linux/sched/signal.h>
-#include "isdn_common.h"
-#include "isdn_tty.h"
-#ifdef CONFIG_ISDN_AUDIO
-#include "isdn_audio.h"
-#define VBUF 0x3e0
-#define VBUFX (VBUF/16)
-#endif
-
-#define FIX_FILE_TRANSFER
-#define	DUMMY_HAYES_AT
-
-/* Prototypes */
-
-static DEFINE_MUTEX(modem_info_mutex);
-static int isdn_tty_edit_at(const char *, int, modem_info *);
-static void isdn_tty_check_esc(const u_char *, u_char, int, int *, u_long *);
-static void isdn_tty_modem_reset_regs(modem_info *, int);
-static void isdn_tty_cmd_ATA(modem_info *);
-static void isdn_tty_flush_buffer(struct tty_struct *);
-static void isdn_tty_modem_result(int, modem_info *);
-#ifdef CONFIG_ISDN_AUDIO
-static int isdn_tty_countDLE(unsigned char *, int);
-#endif
-
-/* Leave this unchanged unless you know what you do! */
-#define MODEM_PARANOIA_CHECK
-#define MODEM_DO_RESTART
-
-static int bit2si[8] =
-{1, 5, 7, 7, 7, 7, 7, 7};
-static int si2bit[8] =
-{4, 1, 4, 4, 4, 4, 4, 4};
-
-/* isdn_tty_try_read() is called from within isdn_tty_rcv_skb()
- * to stuff incoming data directly into a tty's flip-buffer. This
- * is done to speed up tty-receiving if the receive-queue is empty.
- * This routine MUST be called with interrupts off.
- * Return:
- *  1 = Success
- *  0 = Failure, data has to be buffered and later processed by
- *      isdn_tty_readmodem().
- */
-static int
-isdn_tty_try_read(modem_info *info, struct sk_buff *skb)
-{
-	struct tty_port *port = &info->port;
-	int c;
-	int len;
-	char last;
-
-	if (!info->online)
-		return 0;
-
-	if (!(info->mcr & UART_MCR_RTS))
-		return 0;
-
-	len = skb->len
-#ifdef CONFIG_ISDN_AUDIO
-		+ ISDN_AUDIO_SKB_DLECOUNT(skb)
-#endif
-		;
-
-	c = tty_buffer_request_room(port, len);
-	if (c < len)
-		return 0;
-
-#ifdef CONFIG_ISDN_AUDIO
-	if (ISDN_AUDIO_SKB_DLECOUNT(skb)) {
-		int l = skb->len;
-		unsigned char *dp = skb->data;
-		while (--l) {
-			if (*dp == DLE)
-				tty_insert_flip_char(port, DLE, 0);
-			tty_insert_flip_char(port, *dp++, 0);
-		}
-		if (*dp == DLE)
-			tty_insert_flip_char(port, DLE, 0);
-		last = *dp;
-	} else {
-#endif
-		if (len > 1)
-			tty_insert_flip_string(port, skb->data, len - 1);
-		last = skb->data[len - 1];
-#ifdef CONFIG_ISDN_AUDIO
-	}
-#endif
-	if (info->emu.mdmreg[REG_CPPP] & BIT_CPPP)
-		tty_insert_flip_char(port, last, 0xFF);
-	else
-		tty_insert_flip_char(port, last, TTY_NORMAL);
-	tty_flip_buffer_push(port);
-	kfree_skb(skb);
-
-	return 1;
-}
-
-/* isdn_tty_readmodem() is called periodically from within timer-interrupt.
- * It tries getting received data from the receive queue an stuff it into
- * the tty's flip-buffer.
- */
-void
-isdn_tty_readmodem(void)
-{
-	int resched = 0;
-	int midx;
-	int i;
-	int r;
-	modem_info *info;
-
-	for (i = 0; i < ISDN_MAX_CHANNELS; i++) {
-		midx = dev->m_idx[i];
-		if (midx < 0)
-			continue;
-
-		info = &dev->mdm.info[midx];
-		if (!info->online)
-			continue;
-
-		r = 0;
-#ifdef CONFIG_ISDN_AUDIO
-		isdn_audio_eval_dtmf(info);
-		if ((info->vonline & 1) && (info->emu.vpar[1]))
-			isdn_audio_eval_silence(info);
-#endif
-		if (info->mcr & UART_MCR_RTS) {
-			/* CISCO AsyncPPP Hack */
-			if (!(info->emu.mdmreg[REG_CPPP] & BIT_CPPP))
-				r = isdn_readbchan_tty(info->isdn_driver,
-						info->isdn_channel,
-						&info->port, 0);
-			else
-				r = isdn_readbchan_tty(info->isdn_driver,
-						info->isdn_channel,
-						&info->port, 1);
-			if (r)
-				tty_flip_buffer_push(&info->port);
-		} else
-			r = 1;
-
-		if (r) {
-			info->rcvsched = 0;
-			resched = 1;
-		} else
-			info->rcvsched = 1;
-	}
-	if (!resched)
-		isdn_timer_ctrl(ISDN_TIMER_MODEMREAD, 0);
-}
-
-int
-isdn_tty_rcv_skb(int i, int di, int channel, struct sk_buff *skb)
-{
-	ulong flags;
-	int midx;
-#ifdef CONFIG_ISDN_AUDIO
-	int ifmt;
-#endif
-	modem_info *info;
-
-	if ((midx = dev->m_idx[i]) < 0) {
-		/* if midx is invalid, packet is not for tty */
-		return 0;
-	}
-	info = &dev->mdm.info[midx];
-#ifdef CONFIG_ISDN_AUDIO
-	ifmt = 1;
-
-	if ((info->vonline) && (!info->emu.vpar[4]))
-		isdn_audio_calc_dtmf(info, skb->data, skb->len, ifmt);
-	if ((info->vonline & 1) && (info->emu.vpar[1]))
-		isdn_audio_calc_silence(info, skb->data, skb->len, ifmt);
-#endif
-	if ((info->online < 2)
-#ifdef CONFIG_ISDN_AUDIO
-	    && (!(info->vonline & 1))
-#endif
-		) {
-		/* If Modem not listening, drop data */
-		kfree_skb(skb);
-		return 1;
-	}
-	if (info->emu.mdmreg[REG_T70] & BIT_T70) {
-		if (info->emu.mdmreg[REG_T70] & BIT_T70_EXT) {
-			/* T.70 decoding: throw away the T.70 header (2 or 4 bytes)   */
-			if (skb->data[0] == 3) /* pure data packet -> 4 byte headers  */
-				skb_pull(skb, 4);
-			else
-				if (skb->data[0] == 1) /* keepalive packet -> 2 byte hdr  */
-					skb_pull(skb, 2);
-		} else
-			/* T.70 decoding: Simply throw away the T.70 header (4 bytes) */
-			if ((skb->data[0] == 1) && ((skb->data[1] == 0) || (skb->data[1] == 1)))
-				skb_pull(skb, 4);
-	}
-#ifdef CONFIG_ISDN_AUDIO
-	ISDN_AUDIO_SKB_DLECOUNT(skb) = 0;
-	ISDN_AUDIO_SKB_LOCK(skb) = 0;
-	if (info->vonline & 1) {
-		/* voice conversion/compression */
-		switch (info->emu.vpar[3]) {
-		case 2:
-		case 3:
-		case 4:
-			/* adpcm
-			 * Since compressed data takes less
-			 * space, we can overwrite the buffer.
-			 */
-			skb_trim(skb, isdn_audio_xlaw2adpcm(info->adpcmr,
-							    ifmt,
-							    skb->data,
-							    skb->data,
-							    skb->len));
-			break;
-		case 5:
-			/* a-law */
-			if (!ifmt)
-				isdn_audio_ulaw2alaw(skb->data, skb->len);
-			break;
-		case 6:
-			/* u-law */
-			if (ifmt)
-				isdn_audio_alaw2ulaw(skb->data, skb->len);
-			break;
-		}
-		ISDN_AUDIO_SKB_DLECOUNT(skb) =
-			isdn_tty_countDLE(skb->data, skb->len);
-	}
-#ifdef CONFIG_ISDN_TTY_FAX
-	else {
-		if (info->faxonline & 2) {
-			isdn_tty_fax_bitorder(info, skb);
-			ISDN_AUDIO_SKB_DLECOUNT(skb) =
-				isdn_tty_countDLE(skb->data, skb->len);
-		}
-	}
-#endif
-#endif
-	/* Try to deliver directly via tty-buf if queue is empty */
-	spin_lock_irqsave(&info->readlock, flags);
-	if (skb_queue_empty(&dev->drv[di]->rpqueue[channel]))
-		if (isdn_tty_try_read(info, skb)) {
-			spin_unlock_irqrestore(&info->readlock, flags);
-			return 1;
-		}
-	/* Direct deliver failed or queue wasn't empty.
-	 * Queue up for later dequeueing via timer-irq.
-	 */
-	__skb_queue_tail(&dev->drv[di]->rpqueue[channel], skb);
-	dev->drv[di]->rcvcount[channel] +=
-		(skb->len
-#ifdef CONFIG_ISDN_AUDIO
-		 + ISDN_AUDIO_SKB_DLECOUNT(skb)
-#endif
-			);
-	spin_unlock_irqrestore(&info->readlock, flags);
-	/* Schedule dequeuing */
-	if ((dev->modempoll) && (info->rcvsched))
-		isdn_timer_ctrl(ISDN_TIMER_MODEMREAD, 1);
-	return 1;
-}
-
-static void
-isdn_tty_cleanup_xmit(modem_info *info)
-{
-	skb_queue_purge(&info->xmit_queue);
-#ifdef CONFIG_ISDN_AUDIO
-	skb_queue_purge(&info->dtmf_queue);
-#endif
-}
-
-static void
-isdn_tty_tint(modem_info *info)
-{
-	struct sk_buff *skb = skb_dequeue(&info->xmit_queue);
-	int len, slen;
-
-	if (!skb)
-		return;
-	len = skb->len;
-	if ((slen = isdn_writebuf_skb_stub(info->isdn_driver,
-					   info->isdn_channel, 1, skb)) == len) {
-		struct tty_struct *tty = info->port.tty;
-		info->send_outstanding++;
-		info->msr &= ~UART_MSR_CTS;
-		info->lsr &= ~UART_LSR_TEMT;
-		tty_wakeup(tty);
-		return;
-	}
-	if (slen < 0) {
-		/* Error: no channel, already shutdown, or wrong parameter */
-		dev_kfree_skb(skb);
-		return;
-	}
-	skb_queue_head(&info->xmit_queue, skb);
-}
-
-#ifdef CONFIG_ISDN_AUDIO
-static int
-isdn_tty_countDLE(unsigned char *buf, int len)
-{
-	int count = 0;
-
-	while (len--)
-		if (*buf++ == DLE)
-			count++;
-	return count;
-}
-
-/* This routine is called from within isdn_tty_write() to perform
- * DLE-decoding when sending audio-data.
- */
-static int
-isdn_tty_handleDLEdown(modem_info *info, atemu *m, int len)
-{
-	unsigned char *p = &info->port.xmit_buf[info->xmit_count];
-	int count = 0;
-
-	while (len > 0) {
-		if (m->lastDLE) {
-			m->lastDLE = 0;
-			switch (*p) {
-			case DLE:
-				/* Escape code */
-				if (len > 1)
-					memmove(p, p + 1, len - 1);
-				p--;
-				count++;
-				break;
-			case ETX:
-				/* End of data */
-				info->vonline |= 4;
-				return count;
-			case DC4:
-				/* Abort RX */
-				info->vonline &= ~1;
-#ifdef ISDN_DEBUG_MODEM_VOICE
-				printk(KERN_DEBUG
-				       "DLEdown: got DLE-DC4, send DLE-ETX on ttyI%d\n",
-				       info->line);
-#endif
-				isdn_tty_at_cout("\020\003", info);
-				if (!info->vonline) {
-#ifdef ISDN_DEBUG_MODEM_VOICE
-					printk(KERN_DEBUG
-					       "DLEdown: send VCON on ttyI%d\n",
-					       info->line);
-#endif
-					isdn_tty_at_cout("\r\nVCON\r\n", info);
-				}
-				/* Fall through */
-			case 'q':
-			case 's':
-				/* Silence */
-				if (len > 1)
-					memmove(p, p + 1, len - 1);
-				p--;
-				break;
-			}
-		} else {
-			if (*p == DLE)
-				m->lastDLE = 1;
-			else
-				count++;
-		}
-		p++;
-		len--;
-	}
-	if (len < 0) {
-		printk(KERN_WARNING "isdn_tty: len<0 in DLEdown\n");
-		return 0;
-	}
-	return count;
-}
-
-/* This routine is called from within isdn_tty_write() when receiving
- * audio-data. It interrupts receiving, if an character other than
- * ^S or ^Q is sent.
- */
-static int
-isdn_tty_end_vrx(const char *buf, int c)
-{
-	char ch;
-
-	while (c--) {
-		ch = *buf;
-		if ((ch != 0x11) && (ch != 0x13))
-			return 1;
-		buf++;
-	}
-	return 0;
-}
-
-static int voice_cf[7] =
-{0, 0, 4, 3, 2, 0, 0};
-
-#endif                          /* CONFIG_ISDN_AUDIO */
-
-/* isdn_tty_senddown() is called either directly from within isdn_tty_write()
- * or via timer-interrupt from within isdn_tty_modem_xmit(). It pulls
- * outgoing data from the tty's xmit-buffer, handles voice-decompression or
- * T.70 if necessary, and finally queues it up for sending via isdn_tty_tint.
- */
-static void
-isdn_tty_senddown(modem_info *info)
-{
-	int buflen;
-	int skb_res;
-#ifdef CONFIG_ISDN_AUDIO
-	int audio_len;
-#endif
-	struct sk_buff *skb;
-
-#ifdef CONFIG_ISDN_AUDIO
-	if (info->vonline & 4) {
-		info->vonline &= ~6;
-		if (!info->vonline) {
-#ifdef ISDN_DEBUG_MODEM_VOICE
-			printk(KERN_DEBUG
-			       "senddown: send VCON on ttyI%d\n",
-			       info->line);
-#endif
-			isdn_tty_at_cout("\r\nVCON\r\n", info);
-		}
-	}
-#endif
-	if (!(buflen = info->xmit_count))
-		return;
-	if ((info->emu.mdmreg[REG_CTS] & BIT_CTS) != 0)
-		info->msr &= ~UART_MSR_CTS;
-	info->lsr &= ~UART_LSR_TEMT;
-	/* info->xmit_count is modified here and in isdn_tty_write().
-	 * So we return here if isdn_tty_write() is in the
-	 * critical section.
-	 */
-	atomic_inc(&info->xmit_lock);
-	if (!(atomic_dec_and_test(&info->xmit_lock)))
-		return;
-	if (info->isdn_driver < 0) {
-		info->xmit_count = 0;
-		return;
-	}
-	skb_res = dev->drv[info->isdn_driver]->interface->hl_hdrlen + 4;
-#ifdef CONFIG_ISDN_AUDIO
-	if (info->vonline & 2)
-		audio_len = buflen * voice_cf[info->emu.vpar[3]];
-	else
-		audio_len = 0;
-	skb = dev_alloc_skb(skb_res + buflen + audio_len);
-#else
-	skb = dev_alloc_skb(skb_res + buflen);
-#endif
-	if (!skb) {
-		printk(KERN_WARNING
-		       "isdn_tty: Out of memory in ttyI%d senddown\n",
-		       info->line);
-		return;
-	}
-	skb_reserve(skb, skb_res);
-	skb_put_data(skb, info->port.xmit_buf, buflen);
-	info->xmit_count = 0;
-#ifdef CONFIG_ISDN_AUDIO
-	if (info->vonline & 2) {
-		/* For now, ifmt is fixed to 1 (alaw), since this
-		 * is used with ISDN everywhere in the world, except
-		 * US, Canada and Japan.
-		 * Later, when US-ISDN protocols are implemented,
-		 * this setting will depend on the D-channel protocol.
-		 */
-		int ifmt = 1;
-
-		/* voice conversion/decompression */
-		switch (info->emu.vpar[3]) {
-		case 2:
-		case 3:
-		case 4:
-			/* adpcm, compatible to ZyXel 1496 modem
-			 * with ROM revision 6.01
-			 */
-			audio_len = isdn_audio_adpcm2xlaw(info->adpcms,
-							  ifmt,
-							  skb->data,
-							  skb_put(skb, audio_len),
-							  buflen);
-			skb_pull(skb, buflen);
-			skb_trim(skb, audio_len);
-			break;
-		case 5:
-			/* a-law */
-			if (!ifmt)
-				isdn_audio_alaw2ulaw(skb->data,
-						     buflen);
-			break;
-		case 6:
-			/* u-law */
-			if (ifmt)
-				isdn_audio_ulaw2alaw(skb->data,
-						     buflen);
-			break;
-		}
-	}
-#endif                          /* CONFIG_ISDN_AUDIO */
-	if (info->emu.mdmreg[REG_T70] & BIT_T70) {
-		/* Add T.70 simplified header */
-		if (info->emu.mdmreg[REG_T70] & BIT_T70_EXT)
-			memcpy(skb_push(skb, 2), "\1\0", 2);
-		else
-			memcpy(skb_push(skb, 4), "\1\0\1\0", 4);
-	}
-	skb_queue_tail(&info->xmit_queue, skb);
-}
-
-/************************************************************
- *
- * Modem-functions
- *
- * mostly "stolen" from original Linux-serial.c and friends.
- *
- ************************************************************/
-
-/* The next routine is called once from within timer-interrupt
- * triggered within isdn_tty_modem_ncarrier(). It calls
- * isdn_tty_modem_result() to stuff a "NO CARRIER" Message
- * into the tty's buffer.
- */
-static void
-isdn_tty_modem_do_ncarrier(struct timer_list *t)
-{
-	modem_info *info = from_timer(info, t, nc_timer);
-	isdn_tty_modem_result(RESULT_NO_CARRIER, info);
-}
-
-/* Next routine is called, whenever the DTR-signal is raised.
- * It checks the ncarrier-flag, and triggers the above routine
- * when necessary. The ncarrier-flag is set, whenever DTR goes
- * low.
- */
-static void
-isdn_tty_modem_ncarrier(modem_info *info)
-{
-	if (info->ncarrier) {
-		info->nc_timer.expires = jiffies + HZ;
-		add_timer(&info->nc_timer);
-	}
-}
-
-/*
- * return the usage calculated by si and layer 2 protocol
- */
-static int
-isdn_calc_usage(int si, int l2)
-{
-	int usg = ISDN_USAGE_MODEM;
-
-#ifdef CONFIG_ISDN_AUDIO
-	if (si == 1) {
-		switch (l2) {
-		case ISDN_PROTO_L2_MODEM:
-			usg = ISDN_USAGE_MODEM;
-			break;
-#ifdef CONFIG_ISDN_TTY_FAX
-		case ISDN_PROTO_L2_FAX:
-			usg = ISDN_USAGE_FAX;
-			break;
-#endif
-		case ISDN_PROTO_L2_TRANS:
-		default:
-			usg = ISDN_USAGE_VOICE;
-			break;
-		}
-	}
-#endif
-	return (usg);
-}
-
-/* isdn_tty_dial() performs dialing of a tty an the necessary
- * setup of the lower levels before that.
- */
-static void
-isdn_tty_dial(char *n, modem_info *info, atemu *m)
-{
-	int usg = ISDN_USAGE_MODEM;
-	int si = 7;
-	int l2 = m->mdmreg[REG_L2PROT];
-	u_long flags;
-	isdn_ctrl cmd;
-	int i;
-	int j;
-
-	for (j = 7; j >= 0; j--)
-		if (m->mdmreg[REG_SI1] & (1 << j)) {
-			si = bit2si[j];
-			break;
-		}
-	usg = isdn_calc_usage(si, l2);
-#ifdef CONFIG_ISDN_AUDIO
-	if ((si == 1) &&
-	    (l2 != ISDN_PROTO_L2_MODEM)
-#ifdef CONFIG_ISDN_TTY_FAX
-	    && (l2 != ISDN_PROTO_L2_FAX)
-#endif
-		) {
-		l2 = ISDN_PROTO_L2_TRANS;
-		usg = ISDN_USAGE_VOICE;
-	}
-#endif
-	m->mdmreg[REG_SI1I] = si2bit[si];
-	spin_lock_irqsave(&dev->lock, flags);
-	i = isdn_get_free_channel(usg, l2, m->mdmreg[REG_L3PROT], -1, -1, m->msn);
-	if (i < 0) {
-		spin_unlock_irqrestore(&dev->lock, flags);
-		isdn_tty_modem_result(RESULT_NO_DIALTONE, info);
-	} else {
-		info->isdn_driver = dev->drvmap[i];
-		info->isdn_channel = dev->chanmap[i];
-		info->drv_index = i;
-		dev->m_idx[i] = info->line;
-		dev->usage[i] |= ISDN_USAGE_OUTGOING;
-		info->last_dir = 1;
-		strcpy(info->last_num, n);
-		isdn_info_update();
-		spin_unlock_irqrestore(&dev->lock, flags);
-		cmd.driver = info->isdn_driver;
-		cmd.arg = info->isdn_channel;
-		cmd.command = ISDN_CMD_CLREAZ;
-		isdn_command(&cmd);
-		strcpy(cmd.parm.num, isdn_map_eaz2msn(m->msn, info->isdn_driver));
-		cmd.driver = info->isdn_driver;
-		cmd.command = ISDN_CMD_SETEAZ;
-		isdn_command(&cmd);
-		cmd.driver = info->isdn_driver;
-		cmd.command = ISDN_CMD_SETL2;
-		info->last_l2 = l2;
-		cmd.arg = info->isdn_channel + (l2 << 8);
-		isdn_command(&cmd);
-		cmd.driver = info->isdn_driver;
-		cmd.command = ISDN_CMD_SETL3;
-		cmd.arg = info->isdn_channel + (m->mdmreg[REG_L3PROT] << 8);
-#ifdef CONFIG_ISDN_TTY_FAX
-		if (l2 == ISDN_PROTO_L2_FAX) {
-			cmd.parm.fax = info->fax;
-			info->fax->direction = ISDN_TTY_FAX_CONN_OUT;
-		}
-#endif
-		isdn_command(&cmd);
-		cmd.driver = info->isdn_driver;
-		cmd.arg = info->isdn_channel;
-		sprintf(cmd.parm.setup.phone, "%s", n);
-		sprintf(cmd.parm.setup.eazmsn, "%s",
-			isdn_map_eaz2msn(m->msn, info->isdn_driver));
-		cmd.parm.setup.si1 = si;
-		cmd.parm.setup.si2 = m->mdmreg[REG_SI2];
-		cmd.command = ISDN_CMD_DIAL;
-		info->dialing = 1;
-		info->emu.carrierwait = 0;
-		strcpy(dev->num[i], n);
-		isdn_info_update();
-		isdn_command(&cmd);
-		isdn_timer_ctrl(ISDN_TIMER_CARRIER, 1);
-	}
-}
-
-/* isdn_tty_hangup() disassociates a tty from the real
- * ISDN-line (hangup). The usage-status is cleared
- * and some cleanup is done also.
- */
-void
-isdn_tty_modem_hup(modem_info *info, int local)
-{
-	isdn_ctrl cmd;
-	int di, ch;
-
-	if (!info)
-		return;
-
-	di = info->isdn_driver;
-	ch = info->isdn_channel;
-	if (di < 0 || ch < 0)
-		return;
-
-	info->isdn_driver = -1;
-	info->isdn_channel = -1;
-
-#ifdef ISDN_DEBUG_MODEM_HUP
-	printk(KERN_DEBUG "Mhup ttyI%d\n", info->line);
-#endif
-	info->rcvsched = 0;
-	isdn_tty_flush_buffer(info->port.tty);
-	if (info->online) {
-		info->last_lhup = local;
-		info->online = 0;
-		isdn_tty_modem_result(RESULT_NO_CARRIER, info);
-	}
-#ifdef CONFIG_ISDN_AUDIO
-	info->vonline = 0;
-#ifdef CONFIG_ISDN_TTY_FAX
-	info->faxonline = 0;
-	info->fax->phase = ISDN_FAX_PHASE_IDLE;
-#endif
-	info->emu.vpar[4] = 0;
-	info->emu.vpar[5] = 8;
-	kfree(info->dtmf_state);
-	info->dtmf_state = NULL;
-	kfree(info->silence_state);
-	info->silence_state = NULL;
-	kfree(info->adpcms);
-	info->adpcms = NULL;
-	kfree(info->adpcmr);
-	info->adpcmr = NULL;
-#endif
-	if ((info->msr & UART_MSR_RI) &&
-	    (info->emu.mdmreg[REG_RUNG] & BIT_RUNG))
-		isdn_tty_modem_result(RESULT_RUNG, info);
-	info->msr &= ~(UART_MSR_DCD | UART_MSR_RI);
-	info->lsr |= UART_LSR_TEMT;
-
-	if (local) {
-		cmd.driver = di;
-		cmd.command = ISDN_CMD_HANGUP;
-		cmd.arg = ch;
-		isdn_command(&cmd);
-	}
-
-	isdn_all_eaz(di, ch);
-	info->emu.mdmreg[REG_RINGCNT] = 0;
-	isdn_free_channel(di, ch, 0);
-
-	if (info->drv_index >= 0) {
-		dev->m_idx[info->drv_index] = -1;
-		info->drv_index = -1;
-	}
-}
-
-/*
- * Begin of a CAPI like interface, currently used only for
- * supplementary service (CAPI 2.0 part III)
- */
-#include <linux/isdn/capicmd.h>
-#include <linux/module.h>
-
-int
-isdn_tty_capi_facility(capi_msg *cm) {
-	return (-1); /* dummy */
-}
-
-/* isdn_tty_suspend() tries to suspend the current tty connection
- */
-static void
-isdn_tty_suspend(char *id, modem_info *info, atemu *m)
-{
-	isdn_ctrl cmd;
-
-	int l;
-
-	if (!info)
-		return;
-
-#ifdef ISDN_DEBUG_MODEM_SERVICES
-	printk(KERN_DEBUG "Msusp ttyI%d\n", info->line);
-#endif
-	l = strlen(id);
-	if ((info->isdn_driver >= 0)) {
-		cmd.parm.cmsg.Length = l + 18;
-		cmd.parm.cmsg.Command = CAPI_FACILITY;
-		cmd.parm.cmsg.Subcommand = CAPI_REQ;
-		cmd.parm.cmsg.adr.Controller = info->isdn_driver + 1;
-		cmd.parm.cmsg.para[0] = 3; /* 16 bit 0x0003 suplementary service */
-		cmd.parm.cmsg.para[1] = 0;
-		cmd.parm.cmsg.para[2] = l + 3;
-		cmd.parm.cmsg.para[3] = 4; /* 16 bit 0x0004 Suspend */
-		cmd.parm.cmsg.para[4] = 0;
-		cmd.parm.cmsg.para[5] = l;
-		memcpy(&cmd.parm.cmsg.para[6], id, l);
-		cmd.command = CAPI_PUT_MESSAGE;
-		cmd.driver = info->isdn_driver;
-		cmd.arg = info->isdn_channel;
-		isdn_command(&cmd);
-	}
-}
-
-/* isdn_tty_resume() tries to resume a suspended call
- * setup of the lower levels before that. unfortunately here is no
- * checking for compatibility of used protocols implemented by Q931
- * It does the same things like isdn_tty_dial, the last command
- * is different, may be we can merge it.
- */
-
-static void
-isdn_tty_resume(char *id, modem_info *info, atemu *m)
-{
-	int usg = ISDN_USAGE_MODEM;
-	int si = 7;
-	int l2 = m->mdmreg[REG_L2PROT];
-	isdn_ctrl cmd;
-	ulong flags;
-	int i;
-	int j;
-	int l;
-
-	l = strlen(id);
-	for (j = 7; j >= 0; j--)
-		if (m->mdmreg[REG_SI1] & (1 << j)) {
-			si = bit2si[j];
-			break;
-		}
-	usg = isdn_calc_usage(si, l2);
-#ifdef CONFIG_ISDN_AUDIO
-	if ((si == 1) &&
-	    (l2 != ISDN_PROTO_L2_MODEM)
-#ifdef CONFIG_ISDN_TTY_FAX
-	    && (l2 != ISDN_PROTO_L2_FAX)
-#endif
-		) {
-		l2 = ISDN_PROTO_L2_TRANS;
-		usg = ISDN_USAGE_VOICE;
-	}
-#endif
-	m->mdmreg[REG_SI1I] = si2bit[si];
-	spin_lock_irqsave(&dev->lock, flags);
-	i = isdn_get_free_channel(usg, l2, m->mdmreg[REG_L3PROT], -1, -1, m->msn);
-	if (i < 0) {
-		spin_unlock_irqrestore(&dev->lock, flags);
-		isdn_tty_modem_result(RESULT_NO_DIALTONE, info);
-	} else {
-		info->isdn_driver = dev->drvmap[i];
-		info->isdn_channel = dev->chanmap[i];
-		info->drv_index = i;
-		dev->m_idx[i] = info->line;
-		dev->usage[i] |= ISDN_USAGE_OUTGOING;
-		info->last_dir = 1;
-//		strcpy(info->last_num, n);
-		isdn_info_update();
-		spin_unlock_irqrestore(&dev->lock, flags);
-		cmd.driver = info->isdn_driver;
-		cmd.arg = info->isdn_channel;
-		cmd.command = ISDN_CMD_CLREAZ;
-		isdn_command(&cmd);
-		strcpy(cmd.parm.num, isdn_map_eaz2msn(m->msn, info->isdn_driver));
-		cmd.driver = info->isdn_driver;
-		cmd.command = ISDN_CMD_SETEAZ;
-		isdn_command(&cmd);
-		cmd.driver = info->isdn_driver;
-		cmd.command = ISDN_CMD_SETL2;
-		info->last_l2 = l2;
-		cmd.arg = info->isdn_channel + (l2 << 8);
-		isdn_command(&cmd);
-		cmd.driver = info->isdn_driver;
-		cmd.command = ISDN_CMD_SETL3;
-		cmd.arg = info->isdn_channel + (m->mdmreg[REG_L3PROT] << 8);
-		isdn_command(&cmd);
-		cmd.driver = info->isdn_driver;
-		cmd.arg = info->isdn_channel;
-		cmd.parm.cmsg.Length = l + 18;
-		cmd.parm.cmsg.Command = CAPI_FACILITY;
-		cmd.parm.cmsg.Subcommand = CAPI_REQ;
-		cmd.parm.cmsg.adr.Controller = info->isdn_driver + 1;
-		cmd.parm.cmsg.para[0] = 3; /* 16 bit 0x0003 suplementary service */
-		cmd.parm.cmsg.para[1] = 0;
-		cmd.parm.cmsg.para[2] = l + 3;
-		cmd.parm.cmsg.para[3] = 5; /* 16 bit 0x0005 Resume */
-		cmd.parm.cmsg.para[4] = 0;
-		cmd.parm.cmsg.para[5] = l;
-		memcpy(&cmd.parm.cmsg.para[6], id, l);
-		cmd.command = CAPI_PUT_MESSAGE;
-		info->dialing = 1;
-//		strcpy(dev->num[i], n);
-		isdn_info_update();
-		isdn_command(&cmd);
-		isdn_timer_ctrl(ISDN_TIMER_CARRIER, 1);
-	}
-}
-
-/* isdn_tty_send_msg() sends a message to a HL driver
- * This is used for hybrid modem cards to send AT commands to it
- */
-
-static void
-isdn_tty_send_msg(modem_info *info, atemu *m, char *msg)
-{
-	int usg = ISDN_USAGE_MODEM;
-	int si = 7;
-	int l2 = m->mdmreg[REG_L2PROT];
-	isdn_ctrl cmd;
-	ulong flags;
-	int i;
-	int j;
-	int l;
-
-	l = min(strlen(msg), sizeof(cmd.parm) - sizeof(cmd.parm.cmsg)
-		+ sizeof(cmd.parm.cmsg.para) - 2);
-
-	if (!l) {
-		isdn_tty_modem_result(RESULT_ERROR, info);
-		return;
-	}
-	for (j = 7; j >= 0; j--)
-		if (m->mdmreg[REG_SI1] & (1 << j)) {
-			si = bit2si[j];
-			break;
-		}
-	usg = isdn_calc_usage(si, l2);
-#ifdef CONFIG_ISDN_AUDIO
-	if ((si == 1) &&
-	    (l2 != ISDN_PROTO_L2_MODEM)
-#ifdef CONFIG_ISDN_TTY_FAX
-	    && (l2 != ISDN_PROTO_L2_FAX)
-#endif
-		) {
-		l2 = ISDN_PROTO_L2_TRANS;
-		usg = ISDN_USAGE_VOICE;
-	}
-#endif
-	m->mdmreg[REG_SI1I] = si2bit[si];
-	spin_lock_irqsave(&dev->lock, flags);
-	i = isdn_get_free_channel(usg, l2, m->mdmreg[REG_L3PROT], -1, -1, m->msn);
-	if (i < 0) {
-		spin_unlock_irqrestore(&dev->lock, flags);
-		isdn_tty_modem_result(RESULT_NO_DIALTONE, info);
-	} else {
-		info->isdn_driver = dev->drvmap[i];
-		info->isdn_channel = dev->chanmap[i];
-		info->drv_index = i;
-		dev->m_idx[i] = info->line;
-		dev->usage[i] |= ISDN_USAGE_OUTGOING;
-		info->last_dir = 1;
-		isdn_info_update();
-		spin_unlock_irqrestore(&dev->lock, flags);
-		cmd.driver = info->isdn_driver;
-		cmd.arg = info->isdn_channel;
-		cmd.command = ISDN_CMD_CLREAZ;
-		isdn_command(&cmd);
-		strcpy(cmd.parm.num, isdn_map_eaz2msn(m->msn, info->isdn_driver));
-		cmd.driver = info->isdn_driver;
-		cmd.command = ISDN_CMD_SETEAZ;
-		isdn_command(&cmd);
-		cmd.driver = info->isdn_driver;
-		cmd.command = ISDN_CMD_SETL2;
-		info->last_l2 = l2;
-		cmd.arg = info->isdn_channel + (l2 << 8);
-		isdn_command(&cmd);
-		cmd.driver = info->isdn_driver;
-		cmd.command = ISDN_CMD_SETL3;
-		cmd.arg = info->isdn_channel + (m->mdmreg[REG_L3PROT] << 8);
-		isdn_command(&cmd);
-		cmd.driver = info->isdn_driver;
-		cmd.arg = info->isdn_channel;
-		cmd.parm.cmsg.Length = l + 14;
-		cmd.parm.cmsg.Command = CAPI_MANUFACTURER;
-		cmd.parm.cmsg.Subcommand = CAPI_REQ;
-		cmd.parm.cmsg.adr.Controller = info->isdn_driver + 1;
-		cmd.parm.cmsg.para[0] = l + 1;
-		strncpy(&cmd.parm.cmsg.para[1], msg, l);
-		cmd.parm.cmsg.para[l + 1] = 0xd;
-		cmd.command = CAPI_PUT_MESSAGE;
-/*		info->dialing = 1;
-		strcpy(dev->num[i], n);
-		isdn_info_update();
-*/
-		isdn_command(&cmd);
-	}
-}
-
-static inline int
-isdn_tty_paranoia_check(modem_info *info, char *name, const char *routine)
-{
-#ifdef MODEM_PARANOIA_CHECK
-	if (!info) {
-		printk(KERN_WARNING "isdn_tty: null info_struct for %s in %s\n",
-		       name, routine);
-		return 1;
-	}
-	if (info->magic != ISDN_ASYNC_MAGIC) {
-		printk(KERN_WARNING "isdn_tty: bad magic for modem struct %s in %s\n",
-		       name, routine);
-		return 1;
-	}
-#endif
-	return 0;
-}
-
-/*
- * This routine is called to set the UART divisor registers to match
- * the specified baud rate for a serial port.
- */
-static void
-isdn_tty_change_speed(modem_info *info)
-{
-	struct tty_port *port = &info->port;
-	uint cflag,
-		cval,
-		quot;
-	int i;
-
-	if (!port->tty)
-		return;
-	cflag = port->tty->termios.c_cflag;
-
-	quot = i = cflag & CBAUD;
-	if (i & CBAUDEX) {
-		i &= ~CBAUDEX;
-		if (i < 1 || i > 2)
-			port->tty->termios.c_cflag &= ~CBAUDEX;
-		else
-			i += 15;
-	}
-	if (quot) {
-		info->mcr |= UART_MCR_DTR;
-		isdn_tty_modem_ncarrier(info);
-	} else {
-		info->mcr &= ~UART_MCR_DTR;
-		if (info->emu.mdmreg[REG_DTRHUP] & BIT_DTRHUP) {
-#ifdef ISDN_DEBUG_MODEM_HUP
-			printk(KERN_DEBUG "Mhup in changespeed\n");
-#endif
-			if (info->online)
-				info->ncarrier = 1;
-			isdn_tty_modem_reset_regs(info, 0);
-			isdn_tty_modem_hup(info, 1);
-		}
-		return;
-	}
-	/* byte size and parity */
-	cval = cflag & (CSIZE | CSTOPB);
-	cval >>= 4;
-	if (cflag & PARENB)
-		cval |= UART_LCR_PARITY;
-	if (!(cflag & PARODD))
-		cval |= UART_LCR_EPAR;
-
-	tty_port_set_check_carrier(port, ~cflag & CLOCAL);
-}
-
-static int
-isdn_tty_startup(modem_info *info)
-{
-	if (tty_port_initialized(&info->port))
-		return 0;
-	isdn_lock_drivers();
-#ifdef ISDN_DEBUG_MODEM_OPEN
-	printk(KERN_DEBUG "starting up ttyi%d ...\n", info->line);
-#endif
-	/*
-	 * Now, initialize the UART
-	 */
-	info->mcr = UART_MCR_DTR | UART_MCR_RTS | UART_MCR_OUT2;
-	if (info->port.tty)
-		clear_bit(TTY_IO_ERROR, &info->port.tty->flags);
-	/*
-	 * and set the speed of the serial port
-	 */
-	isdn_tty_change_speed(info);
-
-	tty_port_set_initialized(&info->port, 1);
-	info->msr |= (UART_MSR_DSR | UART_MSR_CTS);
-	info->send_outstanding = 0;
-	return 0;
-}
-
-/*
- * This routine will shutdown a serial port; interrupts are disabled, and
- * DTR is dropped if the hangup on close termio flag is on.
- */
-static void
-isdn_tty_shutdown(modem_info *info)
-{
-	if (!tty_port_initialized(&info->port))
-		return;
-#ifdef ISDN_DEBUG_MODEM_OPEN
-	printk(KERN_DEBUG "Shutting down isdnmodem port %d ....\n", info->line);
-#endif
-	isdn_unlock_drivers();
-	info->msr &= ~UART_MSR_RI;
-	if (!info->port.tty || (info->port.tty->termios.c_cflag & HUPCL)) {
-		info->mcr &= ~(UART_MCR_DTR | UART_MCR_RTS);
-		if (info->emu.mdmreg[REG_DTRHUP] & BIT_DTRHUP) {
-			isdn_tty_modem_reset_regs(info, 0);
-#ifdef ISDN_DEBUG_MODEM_HUP
-			printk(KERN_DEBUG "Mhup in isdn_tty_shutdown\n");
-#endif
-			isdn_tty_modem_hup(info, 1);
-		}
-	}
-	if (info->port.tty)
-		set_bit(TTY_IO_ERROR, &info->port.tty->flags);
-
-	tty_port_set_initialized(&info->port, 0);
-}
-
-/* isdn_tty_write() is the main send-routine. It is called from the upper
- * levels within the kernel to perform sending data. Depending on the
- * online-flag it either directs output to the at-command-interpreter or
- * to the lower level. Additional tasks done here:
- *  - If online, check for escape-sequence (+++)
- *  - If sending audio-data, call isdn_tty_DLEdown() to parse DLE-codes.
- *  - If receiving audio-data, call isdn_tty_end_vrx() to abort if needed.
- *  - If dialing, abort dial.
- */
-static int
-isdn_tty_write(struct tty_struct *tty, const u_char *buf, int count)
-{
-	int c;
-	int total = 0;
-	modem_info *info = (modem_info *) tty->driver_data;
-	atemu *m = &info->emu;
-
-	if (isdn_tty_paranoia_check(info, tty->name, "isdn_tty_write"))
-		return 0;
-	/* See isdn_tty_senddown() */
-	atomic_inc(&info->xmit_lock);
-	while (1) {
-		c = count;
-		if (c > info->xmit_size - info->xmit_count)
-			c = info->xmit_size - info->xmit_count;
-		if (info->isdn_driver >= 0 && c > dev->drv[info->isdn_driver]->maxbufsize)
-			c = dev->drv[info->isdn_driver]->maxbufsize;
-		if (c <= 0)
-			break;
-		if ((info->online > 1)
-#ifdef CONFIG_ISDN_AUDIO
-		    || (info->vonline & 3)
-#endif
-			) {
-#ifdef CONFIG_ISDN_AUDIO
-			if (!info->vonline)
-#endif
-				isdn_tty_check_esc(buf, m->mdmreg[REG_ESC], c,
-						   &(m->pluscount),
-						   &(m->lastplus));
-			memcpy(&info->port.xmit_buf[info->xmit_count], buf, c);
-#ifdef CONFIG_ISDN_AUDIO
-			if (info->vonline) {
-				int cc = isdn_tty_handleDLEdown(info, m, c);
-				if (info->vonline & 2) {
-					if (!cc) {
-						/* If DLE decoding results in zero-transmit, but
-						 * c originally was non-zero, do a wakeup.
-						 */
-						tty_wakeup(tty);
-						info->msr |= UART_MSR_CTS;
-						info->lsr |= UART_LSR_TEMT;
-					}
-					info->xmit_count += cc;
-				}
-				if ((info->vonline & 3) == 1) {
-					/* Do NOT handle Ctrl-Q or Ctrl-S
-					 * when in full-duplex audio mode.
-					 */
-					if (isdn_tty_end_vrx(buf, c)) {
-						info->vonline &= ~1;
-#ifdef ISDN_DEBUG_MODEM_VOICE
-						printk(KERN_DEBUG
-						       "got !^Q/^S, send DLE-ETX,VCON on ttyI%d\n",
-						       info->line);
-#endif
-						isdn_tty_at_cout("\020\003\r\nVCON\r\n", info);
-					}
-				}
-			} else
-				if (TTY_IS_FCLASS1(info)) {
-					int cc = isdn_tty_handleDLEdown(info, m, c);
-
-					if (info->vonline & 4) { /* ETX seen */
-						isdn_ctrl c;
-
-						c.command = ISDN_CMD_FAXCMD;
-						c.driver = info->isdn_driver;
-						c.arg = info->isdn_channel;
-						c.parm.aux.cmd = ISDN_FAX_CLASS1_CTRL;
-						c.parm.aux.subcmd = ETX;
-						isdn_command(&c);
-					}
-					info->vonline = 0;
-#ifdef ISDN_DEBUG_MODEM_VOICE
-					printk(KERN_DEBUG "fax dle cc/c %d/%d\n", cc, c);
-#endif
-					info->xmit_count += cc;
-				} else
-#endif
-					info->xmit_count += c;
-		} else {
-			info->msr |= UART_MSR_CTS;
-			info->lsr |= UART_LSR_TEMT;
-			if (info->dialing) {
-				info->dialing = 0;
-#ifdef ISDN_DEBUG_MODEM_HUP
-				printk(KERN_DEBUG "Mhup in isdn_tty_write\n");
-#endif
-				isdn_tty_modem_result(RESULT_NO_CARRIER, info);
-				isdn_tty_modem_hup(info, 1);
-			} else
-				c = isdn_tty_edit_at(buf, c, info);
-		}
-		buf += c;
-		count -= c;
-		total += c;
-	}
-	atomic_dec(&info->xmit_lock);
-	if ((info->xmit_count) || !skb_queue_empty(&info->xmit_queue)) {
-		if (m->mdmreg[REG_DXMT] & BIT_DXMT) {
-			isdn_tty_senddown(info);
-			isdn_tty_tint(info);
-		}
-		isdn_timer_ctrl(ISDN_TIMER_MODEMXMIT, 1);
-	}
-	return total;
-}
-
-static int
-isdn_tty_write_room(struct tty_struct *tty)
-{
-	modem_info *info = (modem_info *) tty->driver_data;
-	int ret;
-
-	if (isdn_tty_paranoia_check(info, tty->name, "isdn_tty_write_room"))
-		return 0;
-	if (!info->online)
-		return info->xmit_size;
-	ret = info->xmit_size - info->xmit_count;
-	return (ret < 0) ? 0 : ret;
-}
-
-static int
-isdn_tty_chars_in_buffer(struct tty_struct *tty)
-{
-	modem_info *info = (modem_info *) tty->driver_data;
-
-	if (isdn_tty_paranoia_check(info, tty->name, "isdn_tty_chars_in_buffer"))
-		return 0;
-	if (!info->online)
-		return 0;
-	return (info->xmit_count);
-}
-
-static void
-isdn_tty_flush_buffer(struct tty_struct *tty)
-{
-	modem_info *info;
-
-	if (!tty) {
-		return;
-	}
-	info = (modem_info *) tty->driver_data;
-	if (isdn_tty_paranoia_check(info, tty->name, "isdn_tty_flush_buffer")) {
-		return;
-	}
-	isdn_tty_cleanup_xmit(info);
-	info->xmit_count = 0;
-	tty_wakeup(tty);
-}
-
-static void
-isdn_tty_flush_chars(struct tty_struct *tty)
-{
-	modem_info *info = (modem_info *) tty->driver_data;
-
-	if (isdn_tty_paranoia_check(info, tty->name, "isdn_tty_flush_chars"))
-		return;
-	if ((info->xmit_count) || !skb_queue_empty(&info->xmit_queue))
-		isdn_timer_ctrl(ISDN_TIMER_MODEMXMIT, 1);
-}
-
-/*
- * ------------------------------------------------------------
- * isdn_tty_throttle()
- *
- * This routine is called by the upper-layer tty layer to signal that
- * incoming characters should be throttled.
- * ------------------------------------------------------------
- */
-static void
-isdn_tty_throttle(struct tty_struct *tty)
-{
-	modem_info *info = (modem_info *) tty->driver_data;
-
-	if (isdn_tty_paranoia_check(info, tty->name, "isdn_tty_throttle"))
-		return;
-	if (I_IXOFF(tty))
-		info->x_char = STOP_CHAR(tty);
-	info->mcr &= ~UART_MCR_RTS;
-}
-
-static void
-isdn_tty_unthrottle(struct tty_struct *tty)
-{
-	modem_info *info = (modem_info *) tty->driver_data;
-
-	if (isdn_tty_paranoia_check(info, tty->name, "isdn_tty_unthrottle"))
-		return;
-	if (I_IXOFF(tty)) {
-		if (info->x_char)
-			info->x_char = 0;
-		else
-			info->x_char = START_CHAR(tty);
-	}
-	info->mcr |= UART_MCR_RTS;
-}
-
-/*
- * ------------------------------------------------------------
- * isdn_tty_ioctl() and friends
- * ------------------------------------------------------------
- */
-
-/*
- * isdn_tty_get_lsr_info - get line status register info
- *
- * Purpose: Let user call ioctl() to get info when the UART physically
- *          is emptied.  On bus types like RS485, the transmitter must
- *          release the bus after transmitting. This must be done when
- *          the transmit shift register is empty, not be done when the
- *          transmit holding register is empty.  This functionality
- *          allows RS485 driver to be written in user space.
- */
-static int
-isdn_tty_get_lsr_info(modem_info *info, uint __user *value)
-{
-	u_char status;
-	uint result;
-
-	status = info->lsr;
-	result = ((status & UART_LSR_TEMT) ? TIOCSER_TEMT : 0);
-	return put_user(result, value);
-}
-
-
-static int
-isdn_tty_tiocmget(struct tty_struct *tty)
-{
-	modem_info *info = (modem_info *) tty->driver_data;
-	u_char control, status;
-
-	if (isdn_tty_paranoia_check(info, tty->name, __func__))
-		return -ENODEV;
-	if (tty_io_error(tty))
-		return -EIO;
-
-	mutex_lock(&modem_info_mutex);
-#ifdef ISDN_DEBUG_MODEM_IOCTL
-	printk(KERN_DEBUG "ttyI%d ioctl TIOCMGET\n", info->line);
-#endif
-
-	control = info->mcr;
-	status = info->msr;
-	mutex_unlock(&modem_info_mutex);
-	return ((control & UART_MCR_RTS) ? TIOCM_RTS : 0)
-		| ((control & UART_MCR_DTR) ? TIOCM_DTR : 0)
-		| ((status & UART_MSR_DCD) ? TIOCM_CAR : 0)
-		| ((status & UART_MSR_RI) ? TIOCM_RNG : 0)
-		| ((status & UART_MSR_DSR) ? TIOCM_DSR : 0)
-		| ((status & UART_MSR_CTS) ? TIOCM_CTS : 0);
-}
-
-static int
-isdn_tty_tiocmset(struct tty_struct *tty,
-		  unsigned int set, unsigned int clear)
-{
-	modem_info *info = (modem_info *) tty->driver_data;
-
-	if (isdn_tty_paranoia_check(info, tty->name, __func__))
-		return -ENODEV;
-	if (tty_io_error(tty))
-		return -EIO;
-
-#ifdef ISDN_DEBUG_MODEM_IOCTL
-	printk(KERN_DEBUG "ttyI%d ioctl TIOCMxxx: %x %x\n", info->line, set, clear);
-#endif
-
-	mutex_lock(&modem_info_mutex);
-	if (set & TIOCM_RTS)
-		info->mcr |= UART_MCR_RTS;
-	if (set & TIOCM_DTR) {
-		info->mcr |= UART_MCR_DTR;
-		isdn_tty_modem_ncarrier(info);
-	}
-
-	if (clear & TIOCM_RTS)
-		info->mcr &= ~UART_MCR_RTS;
-	if (clear & TIOCM_DTR) {
-		info->mcr &= ~UART_MCR_DTR;
-		if (info->emu.mdmreg[REG_DTRHUP] & BIT_DTRHUP) {
-			isdn_tty_modem_reset_regs(info, 0);
-#ifdef ISDN_DEBUG_MODEM_HUP
-			printk(KERN_DEBUG "Mhup in TIOCMSET\n");
-#endif
-			if (info->online)
-				info->ncarrier = 1;
-			isdn_tty_modem_hup(info, 1);
-		}
-	}
-	mutex_unlock(&modem_info_mutex);
-	return 0;
-}
-
-static int
-isdn_tty_ioctl(struct tty_struct *tty, uint cmd, ulong arg)
-{
-	modem_info *info = (modem_info *) tty->driver_data;
-
-	if (isdn_tty_paranoia_check(info, tty->name, "isdn_tty_ioctl"))
-		return -ENODEV;
-	if (tty_io_error(tty))
-		return -EIO;
-	switch (cmd) {
-	case TIOCSERGETLSR:	/* Get line status register */
-#ifdef ISDN_DEBUG_MODEM_IOCTL
-		printk(KERN_DEBUG "ttyI%d ioctl TIOCSERGETLSR\n", info->line);
-#endif
-		return isdn_tty_get_lsr_info(info, (uint __user *) arg);
-	default:
-#ifdef ISDN_DEBUG_MODEM_IOCTL
-		printk(KERN_DEBUG "UNKNOWN ioctl 0x%08x on ttyi%d\n", cmd, info->line);
-#endif
-		return -ENOIOCTLCMD;
-	}
-	return 0;
-}
-
-static void
-isdn_tty_set_termios(struct tty_struct *tty, struct ktermios *old_termios)
-{
-	modem_info *info = (modem_info *) tty->driver_data;
-
-	mutex_lock(&modem_info_mutex);
-	if (!old_termios)
-		isdn_tty_change_speed(info);
-	else {
-		if (tty->termios.c_cflag == old_termios->c_cflag &&
-		    tty->termios.c_ispeed == old_termios->c_ispeed &&
-		    tty->termios.c_ospeed == old_termios->c_ospeed) {
-			mutex_unlock(&modem_info_mutex);
-			return;
-		}
-		isdn_tty_change_speed(info);
-	}
-	mutex_unlock(&modem_info_mutex);
-}
-
-/*
- * ------------------------------------------------------------
- * isdn_tty_open() and friends
- * ------------------------------------------------------------
- */
-
-static int isdn_tty_install(struct tty_driver *driver, struct tty_struct *tty)
-{
-	modem_info *info = &dev->mdm.info[tty->index];
-
-	if (isdn_tty_paranoia_check(info, tty->name, __func__))
-		return -ENODEV;
-
-	tty->driver_data = info;
-
-	return tty_port_install(&info->port, driver, tty);
-}
-
-/*
- * This routine is called whenever a serial port is opened.  It
- * enables interrupts for a serial port, linking in its async structure into
- * the IRQ chain.   It also performs the serial-specific
- * initialization for the tty structure.
- */
-static int
-isdn_tty_open(struct tty_struct *tty, struct file *filp)
-{
-	modem_info *info = tty->driver_data;
-	struct tty_port *port = &info->port;
-	int retval;
-
-#ifdef ISDN_DEBUG_MODEM_OPEN
-	printk(KERN_DEBUG "isdn_tty_open %s, count = %d\n", tty->name,
-	       port->count);
-#endif
-	port->count++;
-	port->tty = tty;
-	/*
-	 * Start up serial port
-	 */
-	retval = isdn_tty_startup(info);
-	if (retval) {
-#ifdef ISDN_DEBUG_MODEM_OPEN
-		printk(KERN_DEBUG "isdn_tty_open return after startup\n");
-#endif
-		return retval;
-	}
-	retval = tty_port_block_til_ready(port, tty, filp);
-	if (retval) {
-#ifdef ISDN_DEBUG_MODEM_OPEN
-		printk(KERN_DEBUG "isdn_tty_open return after isdn_tty_block_til_ready \n");
-#endif
-		return retval;
-	}
-#ifdef ISDN_DEBUG_MODEM_OPEN
-	printk(KERN_DEBUG "isdn_tty_open ttyi%d successful...\n", info->line);
-#endif
-	dev->modempoll++;
-#ifdef ISDN_DEBUG_MODEM_OPEN
-	printk(KERN_DEBUG "isdn_tty_open normal exit\n");
-#endif
-	return 0;
-}
-
-static void
-isdn_tty_close(struct tty_struct *tty, struct file *filp)
-{
-	modem_info *info = (modem_info *) tty->driver_data;
-	struct tty_port *port = &info->port;
-	ulong timeout;
-
-	if (!info || isdn_tty_paranoia_check(info, tty->name, "isdn_tty_close"))
-		return;
-	if (tty_hung_up_p(filp)) {
-#ifdef ISDN_DEBUG_MODEM_OPEN
-		printk(KERN_DEBUG "isdn_tty_close return after tty_hung_up_p\n");
-#endif
-		return;
-	}
-	if ((tty->count == 1) && (port->count != 1)) {
-		/*
-		 * Uh, oh.  tty->count is 1, which means that the tty
-		 * structure will be freed.  Info->count should always
-		 * be one in these conditions.  If it's greater than
-		 * one, we've got real problems, since it means the
-		 * serial port won't be shutdown.
-		 */
-		printk(KERN_ERR "isdn_tty_close: bad port count; tty->count is 1, "
-		       "info->count is %d\n", port->count);
-		port->count = 1;
-	}
-	if (--port->count < 0) {
-		printk(KERN_ERR "isdn_tty_close: bad port count for ttyi%d: %d\n",
-		       info->line, port->count);
-		port->count = 0;
-	}
-	if (port->count) {
-#ifdef ISDN_DEBUG_MODEM_OPEN
-		printk(KERN_DEBUG "isdn_tty_close after info->count != 0\n");
-#endif
-		return;
-	}
-	info->closing = 1;
-
-	tty->closing = 1;
-	/*
-	 * At this point we stop accepting input.  To do this, we
-	 * disable the receive line status interrupts, and tell the
-	 * interrupt driver to stop checking the data ready bit in the
-	 * line status register.
-	 */
-	if (tty_port_initialized(port)) {
-		tty_wait_until_sent(tty, 3000);	/* 30 seconds timeout */
-		/*
-		 * Before we drop DTR, make sure the UART transmitter
-		 * has completely drained; this is especially
-		 * important if there is a transmit FIFO!
-		 */
-		timeout = jiffies + HZ;
-		while (!(info->lsr & UART_LSR_TEMT)) {
-			schedule_timeout_interruptible(20);
-			if (time_after(jiffies, timeout))
-				break;
-		}
-	}
-	dev->modempoll--;
-	isdn_tty_shutdown(info);
-	isdn_tty_flush_buffer(tty);
-	tty_ldisc_flush(tty);
-	port->tty = NULL;
-	info->ncarrier = 0;
-
-	tty_port_close_end(port, tty);
-	info->closing = 0;
-#ifdef ISDN_DEBUG_MODEM_OPEN
-	printk(KERN_DEBUG "isdn_tty_close normal exit\n");
-#endif
-}
-
-/*
- * isdn_tty_hangup() --- called by tty_hangup() when a hangup is signaled.
- */
-static void
-isdn_tty_hangup(struct tty_struct *tty)
-{
-	modem_info *info = (modem_info *) tty->driver_data;
-	struct tty_port *port = &info->port;
-
-	if (isdn_tty_paranoia_check(info, tty->name, "isdn_tty_hangup"))
-		return;
-	isdn_tty_shutdown(info);
-	port->count = 0;
-	tty_port_set_active(port, 0);
-	port->tty = NULL;
-	wake_up_interruptible(&port->open_wait);
-}
-
-/* This routine initializes all emulator-data.
- */
-static void
-isdn_tty_reset_profile(atemu *m)
-{
-	m->profile[0] = 0;
-	m->profile[1] = 0;
-	m->profile[2] = 43;
-	m->profile[3] = 13;
-	m->profile[4] = 10;
-	m->profile[5] = 8;
-	m->profile[6] = 3;
-	m->profile[7] = 60;
-	m->profile[8] = 2;
-	m->profile[9] = 6;
-	m->profile[10] = 7;
-	m->profile[11] = 70;
-	m->profile[12] = 0x45;
-	m->profile[13] = 4;
-	m->profile[14] = ISDN_PROTO_L2_X75I;
-	m->profile[15] = ISDN_PROTO_L3_TRANS;
-	m->profile[16] = ISDN_SERIAL_XMIT_SIZE / 16;
-	m->profile[17] = ISDN_MODEM_WINSIZE;
-	m->profile[18] = 4;
-	m->profile[19] = 0;
-	m->profile[20] = 0;
-	m->profile[23] = 0;
-	m->pmsn[0] = '\0';
-	m->plmsn[0] = '\0';
-}
-
-#ifdef CONFIG_ISDN_AUDIO
-static void
-isdn_tty_modem_reset_vpar(atemu *m)
-{
-	m->vpar[0] = 2;         /* Voice-device            (2 = phone line) */
-	m->vpar[1] = 0;         /* Silence detection level (0 = none      ) */
-	m->vpar[2] = 70;        /* Silence interval        (7 sec.        ) */
-	m->vpar[3] = 2;         /* Compression type        (1 = ADPCM-2   ) */
-	m->vpar[4] = 0;         /* DTMF detection level    (0 = softcode  ) */
-	m->vpar[5] = 8;         /* DTMF interval           (8 * 5 ms.     ) */
-}
-#endif
-
-#ifdef CONFIG_ISDN_TTY_FAX
-static void
-isdn_tty_modem_reset_faxpar(modem_info *info)
-{
-	T30_s *f = info->fax;
-
-	f->code = 0;
-	f->phase = ISDN_FAX_PHASE_IDLE;
-	f->direction = 0;
-	f->resolution = 1;	/* fine */
-	f->rate = 5;		/* 14400 bit/s */
-	f->width = 0;
-	f->length = 0;
-	f->compression = 0;
-	f->ecm = 0;
-	f->binary = 0;
-	f->scantime = 0;
-	memset(&f->id[0], 32, FAXIDLEN - 1);
-	f->id[FAXIDLEN - 1] = 0;
-	f->badlin = 0;
-	f->badmul = 0;
-	f->bor = 0;
-	f->nbc = 0;
-	f->cq = 0;
-	f->cr = 0;
-	f->ctcrty = 0;
-	f->minsp = 0;
-	f->phcto = 30;
-	f->rel = 0;
-	memset(&f->pollid[0], 32, FAXIDLEN - 1);
-	f->pollid[FAXIDLEN - 1] = 0;
-}
-#endif
-
-static void
-isdn_tty_modem_reset_regs(modem_info *info, int force)
-{
-	atemu *m = &info->emu;
-	if ((m->mdmreg[REG_DTRR] & BIT_DTRR) || force) {
-		memcpy(m->mdmreg, m->profile, ISDN_MODEM_NUMREG);
-		memcpy(m->msn, m->pmsn, ISDN_MSNLEN);
-		memcpy(m->lmsn, m->plmsn, ISDN_LMSNLEN);
-		info->xmit_size = m->mdmreg[REG_PSIZE] * 16;
-	}
-#ifdef CONFIG_ISDN_AUDIO
-	isdn_tty_modem_reset_vpar(m);
-#endif
-#ifdef CONFIG_ISDN_TTY_FAX
-	isdn_tty_modem_reset_faxpar(info);
-#endif
-	m->mdmcmdl = 0;
-}
-
-static void
-modem_write_profile(atemu *m)
-{
-	memcpy(m->profile, m->mdmreg, ISDN_MODEM_NUMREG);
-	memcpy(m->pmsn, m->msn, ISDN_MSNLEN);
-	memcpy(m->plmsn, m->lmsn, ISDN_LMSNLEN);
-	if (dev->profd)
-		send_sig(SIGIO, dev->profd, 1);
-}
-
-static const struct tty_operations modem_ops = {
-	.install = isdn_tty_install,
-	.open = isdn_tty_open,
-	.close = isdn_tty_close,
-	.write = isdn_tty_write,
-	.flush_chars = isdn_tty_flush_chars,
-	.write_room = isdn_tty_write_room,
-	.chars_in_buffer = isdn_tty_chars_in_buffer,
-	.flush_buffer = isdn_tty_flush_buffer,
-	.ioctl = isdn_tty_ioctl,
-	.throttle = isdn_tty_throttle,
-	.unthrottle = isdn_tty_unthrottle,
-	.set_termios = isdn_tty_set_termios,
-	.hangup = isdn_tty_hangup,
-	.tiocmget = isdn_tty_tiocmget,
-	.tiocmset = isdn_tty_tiocmset,
-};
-
-static int isdn_tty_carrier_raised(struct tty_port *port)
-{
-	modem_info *info = container_of(port, modem_info, port);
-	return info->msr & UART_MSR_DCD;
-}
-
-static const struct tty_port_operations isdn_tty_port_ops = {
-	.carrier_raised = isdn_tty_carrier_raised,
-};
-
-int
-isdn_tty_modem_init(void)
-{
-	isdn_modem_t	*m;
-	int		i, retval;
-	modem_info	*info;
-
-	m = &dev->mdm;
-	m->tty_modem = alloc_tty_driver(ISDN_MAX_CHANNELS);
-	if (!m->tty_modem)
-		return -ENOMEM;
-	m->tty_modem->name = "ttyI";
-	m->tty_modem->major = ISDN_TTY_MAJOR;
-	m->tty_modem->minor_start = 0;
-	m->tty_modem->type = TTY_DRIVER_TYPE_SERIAL;
-	m->tty_modem->subtype = SERIAL_TYPE_NORMAL;
-	m->tty_modem->init_termios = tty_std_termios;
-	m->tty_modem->init_termios.c_cflag = B9600 | CS8 | CREAD | HUPCL | CLOCAL;
-	m->tty_modem->flags = TTY_DRIVER_REAL_RAW;
-	m->tty_modem->driver_name = "isdn_tty";
-	tty_set_operations(m->tty_modem, &modem_ops);
-	retval = tty_register_driver(m->tty_modem);
-	if (retval) {
-		printk(KERN_WARNING "isdn_tty: Couldn't register modem-device\n");
-		goto err;
-	}
-	for (i = 0; i < ISDN_MAX_CHANNELS; i++) {
-		info = &m->info[i];
-#ifdef CONFIG_ISDN_TTY_FAX
-		if (!(info->fax = kmalloc(sizeof(T30_s), GFP_KERNEL))) {
-			printk(KERN_ERR "Could not allocate fax t30-buffer\n");
-			retval = -ENOMEM;
-			goto err_unregister;
-		}
-#endif
-		tty_port_init(&info->port);
-		info->port.ops = &isdn_tty_port_ops;
-		spin_lock_init(&info->readlock);
-		sprintf(info->last_cause, "0000");
-		sprintf(info->last_num, "none");
-		info->last_dir = 0;
-		info->last_lhup = 1;
-		info->last_l2 = -1;
-		info->last_si = 0;
-		isdn_tty_reset_profile(&info->emu);
-		isdn_tty_modem_reset_regs(info, 1);
-		info->magic = ISDN_ASYNC_MAGIC;
-		info->line = i;
-		info->x_char = 0;
-		info->isdn_driver = -1;
-		info->isdn_channel = -1;
-		info->drv_index = -1;
-		info->xmit_size = ISDN_SERIAL_XMIT_SIZE;
-		timer_setup(&info->nc_timer, isdn_tty_modem_do_ncarrier, 0);
-		skb_queue_head_init(&info->xmit_queue);
-#ifdef CONFIG_ISDN_AUDIO
-		skb_queue_head_init(&info->dtmf_queue);
-#endif
-		info->port.xmit_buf = kmalloc(ISDN_SERIAL_XMIT_MAX + 5,
-				GFP_KERNEL);
-		if (!info->port.xmit_buf) {
-			printk(KERN_ERR "Could not allocate modem xmit-buffer\n");
-			retval = -ENOMEM;
-			goto err_unregister;
-		}
-		/* Make room for T.70 header */
-		info->port.xmit_buf += 4;
-	}
-	return 0;
-err_unregister:
-	for (i--; i >= 0; i--) {
-		info = &m->info[i];
-#ifdef CONFIG_ISDN_TTY_FAX
-		kfree(info->fax);
-#endif
-		kfree(info->port.xmit_buf - 4);
-		info->port.xmit_buf = NULL;
-		tty_port_destroy(&info->port);
-	}
-	tty_unregister_driver(m->tty_modem);
-err:
-	put_tty_driver(m->tty_modem);
-	m->tty_modem = NULL;
-	return retval;
-}
-
-void
-isdn_tty_exit(void)
-{
-	modem_info *info;
-	int i;
-
-	for (i = 0; i < ISDN_MAX_CHANNELS; i++) {
-		info = &dev->mdm.info[i];
-		isdn_tty_cleanup_xmit(info);
-#ifdef CONFIG_ISDN_TTY_FAX
-		kfree(info->fax);
-#endif
-		kfree(info->port.xmit_buf - 4);
-		info->port.xmit_buf = NULL;
-		tty_port_destroy(&info->port);
-	}
-	tty_unregister_driver(dev->mdm.tty_modem);
-	put_tty_driver(dev->mdm.tty_modem);
-	dev->mdm.tty_modem = NULL;
-}
-
-
-/*
- * isdn_tty_match_icall(char *MSN, atemu *tty_emulator, int dev_idx)
- *      match the MSN against the MSNs (glob patterns) defined for tty_emulator,
- *      and return 0 for match, 1 for no match, 2 if MSN could match if longer.
- */
-
-static int
-isdn_tty_match_icall(char *cid, atemu *emu, int di)
-{
-#ifdef ISDN_DEBUG_MODEM_ICALL
-	printk(KERN_DEBUG "m_fi: msn=%s lmsn=%s mmsn=%s mreg[SI1]=%d mreg[SI2]=%d\n",
-	       emu->msn, emu->lmsn, isdn_map_eaz2msn(emu->msn, di),
-	       emu->mdmreg[REG_SI1], emu->mdmreg[REG_SI2]);
-#endif
-	if (strlen(emu->lmsn)) {
-		char *p = emu->lmsn;
-		char *q;
-		int  tmp;
-		int  ret = 0;
-
-		while (1) {
-			if ((q = strchr(p, ';')))
-				*q = '\0';
-			if ((tmp = isdn_msncmp(cid, isdn_map_eaz2msn(p, di))) > ret)
-				ret = tmp;
-#ifdef ISDN_DEBUG_MODEM_ICALL
-			printk(KERN_DEBUG "m_fi: lmsnX=%s mmsn=%s -> tmp=%d\n",
-			       p, isdn_map_eaz2msn(emu->msn, di), tmp);
-#endif
-			if (q) {
-				*q = ';';
-				p = q;
-				p++;
-			}
-			if (!tmp)
-				return 0;
-			if (!q)
-				break;
-		}
-		return ret;
-	} else {
-		int tmp;
-		tmp = isdn_msncmp(cid, isdn_map_eaz2msn(emu->msn, di));
-#ifdef ISDN_DEBUG_MODEM_ICALL
-		printk(KERN_DEBUG "m_fi: mmsn=%s -> tmp=%d\n",
-		       isdn_map_eaz2msn(emu->msn, di), tmp);
-#endif
-		return tmp;
-	}
-}
-
-/*
- * An incoming call-request has arrived.
- * Search the tty-devices for an appropriate device and bind
- * it to the ISDN-Channel.
- * Return:
- *
- *  0 = No matching device found.
- *  1 = A matching device found.
- *  3 = No match found, but eventually would match, if
- *      CID is longer.
- */
-int
-isdn_tty_find_icall(int di, int ch, setup_parm *setup)
-{
-	char *eaz;
-	int i;
-	int wret;
-	int idx;
-	int si1;
-	int si2;
-	char *nr;
-	ulong flags;
-
-	if (!setup->phone[0]) {
-		nr = "0";
-		printk(KERN_INFO "isdn_tty: Incoming call without OAD, assuming '0'\n");
-	} else
-		nr = setup->phone;
-	si1 = (int) setup->si1;
-	si2 = (int) setup->si2;
-	if (!setup->eazmsn[0]) {
-		printk(KERN_WARNING "isdn_tty: Incoming call without CPN, assuming '0'\n");
-		eaz = "0";
-	} else
-		eaz = setup->eazmsn;
-#ifdef ISDN_DEBUG_MODEM_ICALL
-	printk(KERN_DEBUG "m_fi: eaz=%s si1=%d si2=%d\n", eaz, si1, si2);
-#endif
-	wret = 0;
-	spin_lock_irqsave(&dev->lock, flags);
-	for (i = 0; i < ISDN_MAX_CHANNELS; i++) {
-		modem_info *info = &dev->mdm.info[i];
-
-		if (info->port.count == 0)
-			continue;
-		if ((info->emu.mdmreg[REG_SI1] & si2bit[si1]) &&  /* SI1 is matching */
-		    (info->emu.mdmreg[REG_SI2] == si2))	{         /* SI2 is matching */
-			idx = isdn_dc2minor(di, ch);
-#ifdef ISDN_DEBUG_MODEM_ICALL
-			printk(KERN_DEBUG "m_fi: match1 wret=%d\n", wret);
-			printk(KERN_DEBUG "m_fi: idx=%d flags=%08lx drv=%d ch=%d usg=%d\n", idx,
-			       info->port.flags, info->isdn_driver,
-			       info->isdn_channel, dev->usage[idx]);
-#endif
-			if (
-#ifndef FIX_FILE_TRANSFER
-			    tty_port_active(&info->port) &&
-#endif
-				(info->isdn_driver == -1) &&
-				(info->isdn_channel == -1) &&
-				(USG_NONE(dev->usage[idx]))) {
-				int matchret;
-
-				if ((matchret = isdn_tty_match_icall(eaz, &info->emu, di)) > wret)
-					wret = matchret;
-				if (!matchret) {                  /* EAZ is matching */
-					info->isdn_driver = di;
-					info->isdn_channel = ch;
-					info->drv_index = idx;
-					dev->m_idx[idx] = info->line;
-					dev->usage[idx] &= ISDN_USAGE_EXCLUSIVE;
-					dev->usage[idx] |= isdn_calc_usage(si1, info->emu.mdmreg[REG_L2PROT]);
-					strcpy(dev->num[idx], nr);
-					strcpy(info->emu.cpn, eaz);
-					info->emu.mdmreg[REG_SI1I] = si2bit[si1];
-					info->emu.mdmreg[REG_PLAN] = setup->plan;
-					info->emu.mdmreg[REG_SCREEN] = setup->screen;
-					isdn_info_update();
-					spin_unlock_irqrestore(&dev->lock, flags);
-					printk(KERN_INFO "isdn_tty: call from %s, -> RING on ttyI%d\n", nr,
-					       info->line);
-					info->msr |= UART_MSR_RI;
-					isdn_tty_modem_result(RESULT_RING, info);
-					isdn_timer_ctrl(ISDN_TIMER_MODEMRING, 1);
-					return 1;
-				}
-			}
-		}
-	}
-	spin_unlock_irqrestore(&dev->lock, flags);
-	printk(KERN_INFO "isdn_tty: call from %s -> %s %s\n", nr, eaz,
-	       ((dev->drv[di]->flags & DRV_FLAG_REJBUS) && (wret != 2)) ? "rejected" : "ignored");
-	return (wret == 2) ? 3 : 0;
-}
-
-int
-isdn_tty_stat_callback(int i, isdn_ctrl *c)
-{
-	int mi;
-	modem_info *info;
-	char *e;
-
-	if (i < 0)
-		return 0;
-	if ((mi = dev->m_idx[i]) >= 0) {
-		info = &dev->mdm.info[mi];
-		switch (c->command) {
-		case ISDN_STAT_CINF:
-			printk(KERN_DEBUG "CHARGEINFO on ttyI%d: %ld %s\n", info->line, c->arg, c->parm.num);
-			info->emu.charge = (unsigned) simple_strtoul(c->parm.num, &e, 10);
-			if (e == (char *)c->parm.num)
-				info->emu.charge = 0;
-
-			break;
-		case ISDN_STAT_BSENT:
-#ifdef ISDN_TTY_STAT_DEBUG
-			printk(KERN_DEBUG "tty_STAT_BSENT ttyI%d\n", info->line);
-#endif
-			if ((info->isdn_driver == c->driver) &&
-			    (info->isdn_channel == c->arg)) {
-				info->msr |= UART_MSR_CTS;
-				if (info->send_outstanding)
-					if (!(--info->send_outstanding))
-						info->lsr |= UART_LSR_TEMT;
-				isdn_tty_tint(info);
-				return 1;
-			}
-			break;
-		case ISDN_STAT_CAUSE:
-#ifdef ISDN_TTY_STAT_DEBUG
-			printk(KERN_DEBUG "tty_STAT_CAUSE ttyI%d\n", info->line);
-#endif
-			/* Signal cause to tty-device */
-			strncpy(info->last_cause, c->parm.num, 5);
-			return 1;
-		case ISDN_STAT_DISPLAY:
-#ifdef ISDN_TTY_STAT_DEBUG
-			printk(KERN_DEBUG "tty_STAT_DISPLAY ttyI%d\n", info->line);
-#endif
-			/* Signal display to tty-device */
-			if ((info->emu.mdmreg[REG_DISPLAY] & BIT_DISPLAY) &&
-			    !(info->emu.mdmreg[REG_RESPNUM] & BIT_RESPNUM)) {
-				isdn_tty_at_cout("\r\n", info);
-				isdn_tty_at_cout("DISPLAY: ", info);
-				isdn_tty_at_cout(c->parm.display, info);
-				isdn_tty_at_cout("\r\n", info);
-			}
-			return 1;
-		case ISDN_STAT_DCONN:
-#ifdef ISDN_TTY_STAT_DEBUG
-			printk(KERN_DEBUG "tty_STAT_DCONN ttyI%d\n", info->line);
-#endif
-			if (tty_port_active(&info->port)) {
-				if (info->dialing == 1) {
-					info->dialing = 2;
-					return 1;
-				}
-			}
-			break;
-		case ISDN_STAT_DHUP:
-#ifdef ISDN_TTY_STAT_DEBUG
-			printk(KERN_DEBUG "tty_STAT_DHUP ttyI%d\n", info->line);
-#endif
-			if (tty_port_active(&info->port)) {
-				if (info->dialing == 1)
-					isdn_tty_modem_result(RESULT_BUSY, info);
-				if (info->dialing > 1)
-					isdn_tty_modem_result(RESULT_NO_CARRIER, info);
-				info->dialing = 0;
-#ifdef ISDN_DEBUG_MODEM_HUP
-				printk(KERN_DEBUG "Mhup in ISDN_STAT_DHUP\n");
-#endif
-				isdn_tty_modem_hup(info, 0);
-				return 1;
-			}
-			break;
-		case ISDN_STAT_BCONN:
-#ifdef ISDN_TTY_STAT_DEBUG
-			printk(KERN_DEBUG "tty_STAT_BCONN ttyI%d\n", info->line);
-#endif
-			/* Wake up any processes waiting
-			 * for incoming call of this device when
-			 * DCD follow the state of incoming carrier
-			 */
-			if (info->port.blocked_open &&
-			    (info->emu.mdmreg[REG_DCD] & BIT_DCD)) {
-				wake_up_interruptible(&info->port.open_wait);
-			}
-
-			/* Schedule CONNECT-Message to any tty
-			 * waiting for it and
-			 * set DCD-bit of its modem-status.
-			 */
-			if (tty_port_active(&info->port) ||
-			    (info->port.blocked_open &&
-			     (info->emu.mdmreg[REG_DCD] & BIT_DCD))) {
-				info->msr |= UART_MSR_DCD;
-				info->emu.charge = 0;
-				if (info->dialing & 0xf)
-					info->last_dir = 1;
-				else
-					info->last_dir = 0;
-				info->dialing = 0;
-				info->rcvsched = 1;
-				if (USG_MODEM(dev->usage[i])) {
-					if (info->emu.mdmreg[REG_L2PROT] == ISDN_PROTO_L2_MODEM) {
-						strcpy(info->emu.connmsg, c->parm.num);
-						isdn_tty_modem_result(RESULT_CONNECT, info);
-					} else
-						isdn_tty_modem_result(RESULT_CONNECT64000, info);
-				}
-				if (USG_VOICE(dev->usage[i]))
-					isdn_tty_modem_result(RESULT_VCON, info);
-				return 1;
-			}
-			break;
-		case ISDN_STAT_BHUP:
-#ifdef ISDN_TTY_STAT_DEBUG
-			printk(KERN_DEBUG "tty_STAT_BHUP ttyI%d\n", info->line);
-#endif
-			if (tty_port_active(&info->port)) {
-#ifdef ISDN_DEBUG_MODEM_HUP
-				printk(KERN_DEBUG "Mhup in ISDN_STAT_BHUP\n");
-#endif
-				isdn_tty_modem_hup(info, 0);
-				return 1;
-			}
-			break;
-		case ISDN_STAT_NODCH:
-#ifdef ISDN_TTY_STAT_DEBUG
-			printk(KERN_DEBUG "tty_STAT_NODCH ttyI%d\n", info->line);
-#endif
-			if (tty_port_active(&info->port)) {
-				if (info->dialing) {
-					info->dialing = 0;
-					info->last_l2 = -1;
-					info->last_si = 0;
-					sprintf(info->last_cause, "0000");
-					isdn_tty_modem_result(RESULT_NO_DIALTONE, info);
-				}
-				isdn_tty_modem_hup(info, 0);
-				return 1;
-			}
-			break;
-		case ISDN_STAT_UNLOAD:
-#ifdef ISDN_TTY_STAT_DEBUG
-			printk(KERN_DEBUG "tty_STAT_UNLOAD ttyI%d\n", info->line);
-#endif
-			for (i = 0; i < ISDN_MAX_CHANNELS; i++) {
-				info = &dev->mdm.info[i];
-				if (info->isdn_driver == c->driver) {
-					if (info->online)
-						isdn_tty_modem_hup(info, 1);
-				}
-			}
-			return 1;
-#ifdef CONFIG_ISDN_TTY_FAX
-		case ISDN_STAT_FAXIND:
-			if (tty_port_active(&info->port)) {
-				isdn_tty_fax_command(info, c);
-			}
-			break;
-#endif
-#ifdef CONFIG_ISDN_AUDIO
-		case ISDN_STAT_AUDIO:
-			if (tty_port_active(&info->port)) {
-				switch (c->parm.num[0]) {
-				case ISDN_AUDIO_DTMF:
-					if (info->vonline) {
-						isdn_audio_put_dle_code(info,
-									c->parm.num[1]);
-					}
-					break;
-				}
-			}
-			break;
-#endif
-		}
-	}
-	return 0;
-}
-
-/*********************************************************************
- Modem-Emulator-Routines
-*********************************************************************/
-
-#define cmdchar(c) ((c >= ' ') && (c <= 0x7f))
-
-/*
- * Put a message from the AT-emulator into receive-buffer of tty,
- * convert CR, LF, and BS to values in modem-registers 3, 4 and 5.
- */
-void
-isdn_tty_at_cout(char *msg, modem_info *info)
-{
-	struct tty_port *port = &info->port;
-	atemu *m = &info->emu;
-	char *p;
-	char c;
-	u_long flags;
-	struct sk_buff *skb = NULL;
-	char *sp = NULL;
-	int l;
-
-	if (!msg) {
-		printk(KERN_WARNING "isdn_tty: Null-Message in isdn_tty_at_cout\n");
-		return;
-	}
-
-	l = strlen(msg);
-
-	spin_lock_irqsave(&info->readlock, flags);
-	if (info->closing) {
-		spin_unlock_irqrestore(&info->readlock, flags);
-		return;
-	}
-
-	/* use queue instead of direct, if online and */
-	/* data is in queue or buffer is full */
-	if (info->online && ((tty_buffer_request_room(port, l) < l) ||
-			     !skb_queue_empty(&dev->drv[info->isdn_driver]->rpqueue[info->isdn_channel]))) {
-		skb = alloc_skb(l, GFP_ATOMIC);
-		if (!skb) {
-			spin_unlock_irqrestore(&info->readlock, flags);
-			return;
-		}
-		sp = skb_put(skb, l);
-#ifdef CONFIG_ISDN_AUDIO
-		ISDN_AUDIO_SKB_DLECOUNT(skb) = 0;
-		ISDN_AUDIO_SKB_LOCK(skb) = 0;
-#endif
-	}
-
-	for (p = msg; *p; p++) {
-		switch (*p) {
-		case '\r':
-			c = m->mdmreg[REG_CR];
-			break;
-		case '\n':
-			c = m->mdmreg[REG_LF];
-			break;
-		case '\b':
-			c = m->mdmreg[REG_BS];
-			break;
-		default:
-			c = *p;
-		}
-		if (skb) {
-			*sp++ = c;
-		} else {
-			if (tty_insert_flip_char(port, c, TTY_NORMAL) == 0)
-				break;
-		}
-	}
-	if (skb) {
-		__skb_queue_tail(&dev->drv[info->isdn_driver]->rpqueue[info->isdn_channel], skb);
-		dev->drv[info->isdn_driver]->rcvcount[info->isdn_channel] += skb->len;
-		spin_unlock_irqrestore(&info->readlock, flags);
-		/* Schedule dequeuing */
-		if (dev->modempoll && info->rcvsched)
-			isdn_timer_ctrl(ISDN_TIMER_MODEMREAD, 1);
-
-	} else {
-		spin_unlock_irqrestore(&info->readlock, flags);
-		tty_flip_buffer_push(port);
-	}
-}
-
-/*
- * Perform ATH Hangup
- */
-static void
-isdn_tty_on_hook(modem_info *info)
-{
-	if (info->isdn_channel >= 0) {
-#ifdef ISDN_DEBUG_MODEM_HUP
-		printk(KERN_DEBUG "Mhup in isdn_tty_on_hook\n");
-#endif
-		isdn_tty_modem_hup(info, 1);
-	}
-}
-
-static void
-isdn_tty_off_hook(void)
-{
-	printk(KERN_DEBUG "isdn_tty_off_hook\n");
-}
-
-#define PLUSWAIT1 (HZ / 2)      /* 0.5 sec. */
-#define PLUSWAIT2 (HZ * 3 / 2)  /* 1.5 sec */
-
-/*
- * Check Buffer for Modem-escape-sequence, activate timer-callback to
- * isdn_tty_modem_escape() if sequence found.
- *
- * Parameters:
- *   p          pointer to databuffer
- *   plus       escape-character
- *   count      length of buffer
- *   pluscount  count of valid escape-characters so far
- *   lastplus   timestamp of last character
- */
-static void
-isdn_tty_check_esc(const u_char *p, u_char plus, int count, int *pluscount,
-		   u_long *lastplus)
-{
-	if (plus > 127)
-		return;
-	if (count > 3) {
-		p += count - 3;
-		count = 3;
-		*pluscount = 0;
-	}
-	while (count > 0) {
-		if (*(p++) == plus) {
-			if ((*pluscount)++) {
-				/* Time since last '+' > 0.5 sec. ? */
-				if (time_after(jiffies, *lastplus + PLUSWAIT1))
-					*pluscount = 1;
-			} else {
-				/* Time since last non-'+' < 1.5 sec. ? */
-				if (time_before(jiffies, *lastplus + PLUSWAIT2))
-					*pluscount = 0;
-			}
-			if ((*pluscount == 3) && (count == 1))
-				isdn_timer_ctrl(ISDN_TIMER_MODEMPLUS, 1);
-			if (*pluscount > 3)
-				*pluscount = 1;
-		} else
-			*pluscount = 0;
-		*lastplus = jiffies;
-		count--;
-	}
-}
-
-/*
- * Return result of AT-emulator to tty-receive-buffer, depending on
- * modem-register 12, bit 0 and 1.
- * For CONNECT-messages also switch to online-mode.
- * For RING-message handle auto-ATA if register 0 != 0
- */
-
-static void
-isdn_tty_modem_result(int code, modem_info *info)
-{
-	atemu *m = &info->emu;
-	static char *msg[] =
-		{"OK", "CONNECT", "RING", "NO CARRIER", "ERROR",
-		 "CONNECT 64000", "NO DIALTONE", "BUSY", "NO ANSWER",
-		 "RINGING", "NO MSN/EAZ", "VCON", "RUNG"};
-	char s[ISDN_MSNLEN + 10];
-
-	switch (code) {
-	case RESULT_RING:
-		m->mdmreg[REG_RINGCNT]++;
-		if (m->mdmreg[REG_RINGCNT] == m->mdmreg[REG_RINGATA])
-			/* Automatically accept incoming call */
-			isdn_tty_cmd_ATA(info);
-		break;
-	case RESULT_NO_CARRIER:
-#ifdef ISDN_DEBUG_MODEM_HUP
-		printk(KERN_DEBUG "modem_result: NO CARRIER %d %d\n",
-		       info->closing, !info->port.tty);
-#endif
-		m->mdmreg[REG_RINGCNT] = 0;
-		del_timer(&info->nc_timer);
-		info->ncarrier = 0;
-		if (info->closing || !info->port.tty)
-			return;
-
-#ifdef CONFIG_ISDN_AUDIO
-		if (info->vonline & 1) {
-#ifdef ISDN_DEBUG_MODEM_VOICE
-			printk(KERN_DEBUG "res3: send DLE-ETX on ttyI%d\n",
-			       info->line);
-#endif
-			/* voice-recording, add DLE-ETX */
-			isdn_tty_at_cout("\020\003", info);
-		}
-		if (info->vonline & 2) {
-#ifdef ISDN_DEBUG_MODEM_VOICE
-			printk(KERN_DEBUG "res3: send DLE-DC4 on ttyI%d\n",
-			       info->line);
-#endif
-			/* voice-playing, add DLE-DC4 */
-			isdn_tty_at_cout("\020\024", info);
-		}
-#endif
-		break;
-	case RESULT_CONNECT:
-	case RESULT_CONNECT64000:
-		sprintf(info->last_cause, "0000");
-		if (!info->online)
-			info->online = 2;
-		break;
-	case RESULT_VCON:
-#ifdef ISDN_DEBUG_MODEM_VOICE
-		printk(KERN_DEBUG "res3: send VCON on ttyI%d\n",
-		       info->line);
-#endif
-		sprintf(info->last_cause, "0000");
-		if (!info->online)
-			info->online = 1;
-		break;
-	} /* switch (code) */
-
-	if (m->mdmreg[REG_RESP] & BIT_RESP) {
-		/* Show results */
-		if (m->mdmreg[REG_RESPNUM] & BIT_RESPNUM) {
-			/* Show numeric results only */
-			sprintf(s, "\r\n%d\r\n", code);
-			isdn_tty_at_cout(s, info);
-		} else {
-			if (code == RESULT_RING) {
-				/* return if "show RUNG" and ringcounter>1 */
-				if ((m->mdmreg[REG_RUNG] & BIT_RUNG) &&
-				    (m->mdmreg[REG_RINGCNT] > 1))
-					return;
-				/* print CID, _before_ _every_ ring */
-				if (!(m->mdmreg[REG_CIDONCE] & BIT_CIDONCE)) {
-					isdn_tty_at_cout("\r\nCALLER NUMBER: ", info);
-					isdn_tty_at_cout(dev->num[info->drv_index], info);
-					if (m->mdmreg[REG_CDN] & BIT_CDN) {
-						isdn_tty_at_cout("\r\nCALLED NUMBER: ", info);
-						isdn_tty_at_cout(info->emu.cpn, info);
-					}
-				}
-			}
-			isdn_tty_at_cout("\r\n", info);
-			isdn_tty_at_cout(msg[code], info);
-			switch (code) {
-			case RESULT_CONNECT:
-				switch (m->mdmreg[REG_L2PROT]) {
-				case ISDN_PROTO_L2_MODEM:
-					isdn_tty_at_cout(" ", info);
-					isdn_tty_at_cout(m->connmsg, info);
-					break;
-				}
-				break;
-			case RESULT_RING:
-				/* Append CPN, if enabled */
-				if ((m->mdmreg[REG_CPN] & BIT_CPN)) {
-					sprintf(s, "/%s", m->cpn);
-					isdn_tty_at_cout(s, info);
-				}
-				/* Print CID only once, _after_ 1st RING */
-				if ((m->mdmreg[REG_CIDONCE] & BIT_CIDONCE) &&
-				    (m->mdmreg[REG_RINGCNT] == 1)) {
-					isdn_tty_at_cout("\r\n", info);
-					isdn_tty_at_cout("CALLER NUMBER: ", info);
-					isdn_tty_at_cout(dev->num[info->drv_index], info);
-					if (m->mdmreg[REG_CDN] & BIT_CDN) {
-						isdn_tty_at_cout("\r\nCALLED NUMBER: ", info);
-						isdn_tty_at_cout(info->emu.cpn, info);
-					}
-				}
-				break;
-			case RESULT_NO_CARRIER:
-			case RESULT_NO_DIALTONE:
-			case RESULT_BUSY:
-			case RESULT_NO_ANSWER:
-				m->mdmreg[REG_RINGCNT] = 0;
-				/* Append Cause-Message if enabled */
-				if (m->mdmreg[REG_RESPXT] & BIT_RESPXT) {
-					sprintf(s, "/%s", info->last_cause);
-					isdn_tty_at_cout(s, info);
-				}
-				break;
-			case RESULT_CONNECT64000:
-				/* Append Protocol to CONNECT message */
-				switch (m->mdmreg[REG_L2PROT]) {
-				case ISDN_PROTO_L2_X75I:
-				case ISDN_PROTO_L2_X75UI:
-				case ISDN_PROTO_L2_X75BUI:
-					isdn_tty_at_cout("/X.75", info);
-					break;
-				case ISDN_PROTO_L2_HDLC:
-					isdn_tty_at_cout("/HDLC", info);
-					break;
-				case ISDN_PROTO_L2_V11096:
-					isdn_tty_at_cout("/V110/9600", info);
-					break;
-				case ISDN_PROTO_L2_V11019:
-					isdn_tty_at_cout("/V110/19200", info);
-					break;
-				case ISDN_PROTO_L2_V11038:
-					isdn_tty_at_cout("/V110/38400", info);
-					break;
-				}
-				if (m->mdmreg[REG_T70] & BIT_T70) {
-					isdn_tty_at_cout("/T.70", info);
-					if (m->mdmreg[REG_T70] & BIT_T70_EXT)
-						isdn_tty_at_cout("+", info);
-				}
-				break;
-			}
-			isdn_tty_at_cout("\r\n", info);
-		}
-	}
-	if (code == RESULT_NO_CARRIER) {
-		if (info->closing || (!info->port.tty))
-			return;
-
-		if (tty_port_check_carrier(&info->port))
-			tty_hangup(info->port.tty);
-	}
-}
-
-
-/*
- * Display a modem-register-value.
- */
-static void
-isdn_tty_show_profile(int ridx, modem_info *info)
-{
-	char v[6];
-
-	sprintf(v, "\r\n%d", info->emu.mdmreg[ridx]);
-	isdn_tty_at_cout(v, info);
-}
-
-/*
- * Get MSN-string from char-pointer, set pointer to end of number
- */
-static void
-isdn_tty_get_msnstr(char *n, char **p)
-{
-	int limit = ISDN_MSNLEN - 1;
-
-	while (((*p[0] >= '0' && *p[0] <= '9') ||
-		/* Why a comma ??? */
-		(*p[0] == ',') || (*p[0] == ':')) &&
-	       (limit--))
-		*n++ = *p[0]++;
-	*n = '\0';
-}
-
-/*
- * Get phone-number from modem-commandbuffer
- */
-static void
-isdn_tty_getdial(char *p, char *q, int cnt)
-{
-	int first = 1;
-	int limit = ISDN_MSNLEN - 1;	/* MUST match the size of interface var to avoid
-					   buffer overflow */
-
-	while (strchr(" 0123456789,#.*WPTSR-", *p) && *p && --cnt > 0) {
-		if ((*p >= '0' && *p <= '9') || ((*p == 'S') && first) ||
-		    ((*p == 'R') && first) ||
-		    (*p == '*') || (*p == '#')) {
-			*q++ = *p;
-			limit--;
-		}
-		if (!limit)
-			break;
-		p++;
-		first = 0;
-	}
-	*q = 0;
-}
-
-#define PARSE_ERROR { isdn_tty_modem_result(RESULT_ERROR, info); return; }
-#define PARSE_ERROR1 { isdn_tty_modem_result(RESULT_ERROR, info); return 1; }
-
-static void
-isdn_tty_report(modem_info *info)
-{
-	atemu *m = &info->emu;
-	char s[80];
-
-	isdn_tty_at_cout("\r\nStatistics of last connection:\r\n\r\n", info);
-	sprintf(s, "    Remote Number:    %s\r\n", info->last_num);
-	isdn_tty_at_cout(s, info);
-	sprintf(s, "    Direction:        %s\r\n", info->last_dir ? "outgoing" : "incoming");
-	isdn_tty_at_cout(s, info);
-	isdn_tty_at_cout("    Layer-2 Protocol: ", info);
-	switch (info->last_l2) {
-	case ISDN_PROTO_L2_X75I:
-		isdn_tty_at_cout("X.75i", info);
-		break;
-	case ISDN_PROTO_L2_X75UI:
-		isdn_tty_at_cout("X.75ui", info);
-		break;
-	case ISDN_PROTO_L2_X75BUI:
-		isdn_tty_at_cout("X.75bui", info);
-		break;
-	case ISDN_PROTO_L2_HDLC:
-		isdn_tty_at_cout("HDLC", info);
-		break;
-	case ISDN_PROTO_L2_V11096:
-		isdn_tty_at_cout("V.110 9600 Baud", info);
-		break;
-	case ISDN_PROTO_L2_V11019:
-		isdn_tty_at_cout("V.110 19200 Baud", info);
-		break;
-	case ISDN_PROTO_L2_V11038:
-		isdn_tty_at_cout("V.110 38400 Baud", info);
-		break;
-	case ISDN_PROTO_L2_TRANS:
-		isdn_tty_at_cout("transparent", info);
-		break;
-	case ISDN_PROTO_L2_MODEM:
-		isdn_tty_at_cout("modem", info);
-		break;
-	case ISDN_PROTO_L2_FAX:
-		isdn_tty_at_cout("fax", info);
-		break;
-	default:
-		isdn_tty_at_cout("unknown", info);
-		break;
-	}
-	if (m->mdmreg[REG_T70] & BIT_T70) {
-		isdn_tty_at_cout("/T.70", info);
-		if (m->mdmreg[REG_T70] & BIT_T70_EXT)
-			isdn_tty_at_cout("+", info);
-	}
-	isdn_tty_at_cout("\r\n", info);
-	isdn_tty_at_cout("    Service:          ", info);
-	switch (info->last_si) {
-	case 1:
-		isdn_tty_at_cout("audio\r\n", info);
-		break;
-	case 5:
-		isdn_tty_at_cout("btx\r\n", info);
-		break;
-	case 7:
-		isdn_tty_at_cout("data\r\n", info);
-		break;
-	default:
-		sprintf(s, "%d\r\n", info->last_si);
-		isdn_tty_at_cout(s, info);
-		break;
-	}
-	sprintf(s, "    Hangup location:  %s\r\n", info->last_lhup ? "local" : "remote");
-	isdn_tty_at_cout(s, info);
-	sprintf(s, "    Last cause:       %s\r\n", info->last_cause);
-	isdn_tty_at_cout(s, info);
-}
-
-/*
- * Parse AT&.. commands.
- */
-static int
-isdn_tty_cmd_ATand(char **p, modem_info *info)
-{
-	atemu *m = &info->emu;
-	int i;
-	char rb[100];
-
-#define MAXRB (sizeof(rb) - 1)
-
-	switch (*p[0]) {
-	case 'B':
-		/* &B - Set Buffersize */
-		p[0]++;
-		i = isdn_getnum(p);
-		if ((i < 0) || (i > ISDN_SERIAL_XMIT_MAX))
-			PARSE_ERROR1;
-#ifdef CONFIG_ISDN_AUDIO
-		if ((m->mdmreg[REG_SI1] & 1) && (i > VBUF))
-			PARSE_ERROR1;
-#endif
-		m->mdmreg[REG_PSIZE] = i / 16;
-		info->xmit_size = m->mdmreg[REG_PSIZE] * 16;
-		switch (m->mdmreg[REG_L2PROT]) {
-		case ISDN_PROTO_L2_V11096:
-		case ISDN_PROTO_L2_V11019:
-		case ISDN_PROTO_L2_V11038:
-			info->xmit_size /= 10;
-		}
-		break;
-	case 'C':
-		/* &C - DCD Status */
-		p[0]++;
-		switch (isdn_getnum(p)) {
-		case 0:
-			m->mdmreg[REG_DCD] &= ~BIT_DCD;
-			break;
-		case 1:
-			m->mdmreg[REG_DCD] |= BIT_DCD;
-			break;
-		default:
-			PARSE_ERROR1
-				}
-		break;
-	case 'D':
-		/* &D - Set DTR-Low-behavior */
-		p[0]++;
-		switch (isdn_getnum(p)) {
-		case 0:
-			m->mdmreg[REG_DTRHUP] &= ~BIT_DTRHUP;
-			m->mdmreg[REG_DTRR] &= ~BIT_DTRR;
-			break;
-		case 2:
-			m->mdmreg[REG_DTRHUP] |= BIT_DTRHUP;
-			m->mdmreg[REG_DTRR] &= ~BIT_DTRR;
-			break;
-		case 3:
-			m->mdmreg[REG_DTRHUP] |= BIT_DTRHUP;
-			m->mdmreg[REG_DTRR] |= BIT_DTRR;
-			break;
-		default:
-			PARSE_ERROR1
-				}
-		break;
-	case 'E':
-		/* &E -Set EAZ/MSN */
-		p[0]++;
-		isdn_tty_get_msnstr(m->msn, p);
-		break;
-	case 'F':
-		/* &F -Set Factory-Defaults */
-		p[0]++;
-		if (info->msr & UART_MSR_DCD)
-			PARSE_ERROR1;
-		isdn_tty_reset_profile(m);
-		isdn_tty_modem_reset_regs(info, 1);
-		break;
-#ifdef DUMMY_HAYES_AT
-	case 'K':
-		/* only for be compilant with common scripts */
-		/* &K Flowcontrol - no function */
-		p[0]++;
-		isdn_getnum(p);
-		break;
-#endif
-	case 'L':
-		/* &L -Set Numbers to listen on */
-		p[0]++;
-		i = 0;
-		while (*p[0] && (strchr("0123456789,-*[]?;", *p[0])) &&
-		       (i < ISDN_LMSNLEN - 1))
-			m->lmsn[i++] = *p[0]++;
-		m->lmsn[i] = '\0';
-		break;
-	case 'R':
-		/* &R - Set V.110 bitrate adaption */
-		p[0]++;
-		i = isdn_getnum(p);
-		switch (i) {
-		case 0:
-			/* Switch off V.110, back to X.75 */
-			m->mdmreg[REG_L2PROT] = ISDN_PROTO_L2_X75I;
-			m->mdmreg[REG_SI2] = 0;
-			info->xmit_size = m->mdmreg[REG_PSIZE] * 16;
-			break;
-		case 9600:
-			m->mdmreg[REG_L2PROT] = ISDN_PROTO_L2_V11096;
-			m->mdmreg[REG_SI2] = 197;
-			info->xmit_size = m->mdmreg[REG_PSIZE] * 16 / 10;
-			break;
-		case 19200:
-			m->mdmreg[REG_L2PROT] = ISDN_PROTO_L2_V11019;
-			m->mdmreg[REG_SI2] = 199;
-			info->xmit_size = m->mdmreg[REG_PSIZE] * 16 / 10;
-			break;
-		case 38400:
-			m->mdmreg[REG_L2PROT] = ISDN_PROTO_L2_V11038;
-			m->mdmreg[REG_SI2] = 198; /* no existing standard for this */
-			info->xmit_size = m->mdmreg[REG_PSIZE] * 16 / 10;
-			break;
-		default:
-			PARSE_ERROR1;
-		}
-		/* Switch off T.70 */
-		m->mdmreg[REG_T70] &= ~(BIT_T70 | BIT_T70_EXT);
-		/* Set Service 7 */
-		m->mdmreg[REG_SI1] |= 4;
-		break;
-	case 'S':
-		/* &S - Set Windowsize */
-		p[0]++;
-		i = isdn_getnum(p);
-		if ((i > 0) && (i < 9))
-			m->mdmreg[REG_WSIZE] = i;
-		else
-			PARSE_ERROR1;
-		break;
-	case 'V':
-		/* &V - Show registers */
-		p[0]++;
-		isdn_tty_at_cout("\r\n", info);
-		for (i = 0; i < ISDN_MODEM_NUMREG; i++) {
-			sprintf(rb, "S%02d=%03d%s", i,
-				m->mdmreg[i], ((i + 1) % 10) ? " " : "\r\n");
-			isdn_tty_at_cout(rb, info);
-		}
-		sprintf(rb, "\r\nEAZ/MSN: %.50s\r\n",
-			strlen(m->msn) ? m->msn : "None");
-		isdn_tty_at_cout(rb, info);
-		if (strlen(m->lmsn)) {
-			isdn_tty_at_cout("\r\nListen: ", info);
-			isdn_tty_at_cout(m->lmsn, info);
-			isdn_tty_at_cout("\r\n", info);
-		}
-		break;
-	case 'W':
-		/* &W - Write Profile */
-		p[0]++;
-		switch (*p[0]) {
-		case '0':
-			p[0]++;
-			modem_write_profile(m);
-			break;
-		default:
-			PARSE_ERROR1;
-		}
-		break;
-	case 'X':
-		/* &X - Switch to BTX-Mode and T.70 */
-		p[0]++;
-		switch (isdn_getnum(p)) {
-		case 0:
-			m->mdmreg[REG_T70] &= ~(BIT_T70 | BIT_T70_EXT);
-			info->xmit_size = m->mdmreg[REG_PSIZE] * 16;
-			break;
-		case 1:
-			m->mdmreg[REG_T70] |= BIT_T70;
-			m->mdmreg[REG_T70] &= ~BIT_T70_EXT;
-			m->mdmreg[REG_L2PROT] = ISDN_PROTO_L2_X75I;
-			info->xmit_size = 112;
-			m->mdmreg[REG_SI1] = 4;
-			m->mdmreg[REG_SI2] = 0;
-			break;
-		case 2:
-			m->mdmreg[REG_T70] |= (BIT_T70 | BIT_T70_EXT);
-			m->mdmreg[REG_L2PROT] = ISDN_PROTO_L2_X75I;
-			info->xmit_size = 112;
-			m->mdmreg[REG_SI1] = 4;
-			m->mdmreg[REG_SI2] = 0;
-			break;
-		default:
-			PARSE_ERROR1;
-		}
-		break;
-	default:
-		PARSE_ERROR1;
-	}
-	return 0;
-}
-
-static int
-isdn_tty_check_ats(int mreg, int mval, modem_info *info, atemu *m)
-{
-	/* Some plausibility checks */
-	switch (mreg) {
-	case REG_L2PROT:
-		if (mval > ISDN_PROTO_L2_MAX)
-			return 1;
-		break;
-	case REG_PSIZE:
-		if ((mval * 16) > ISDN_SERIAL_XMIT_MAX)
-			return 1;
-#ifdef CONFIG_ISDN_AUDIO
-		if ((m->mdmreg[REG_SI1] & 1) && (mval > VBUFX))
-			return 1;
-#endif
-		info->xmit_size = mval * 16;
-		switch (m->mdmreg[REG_L2PROT]) {
-		case ISDN_PROTO_L2_V11096:
-		case ISDN_PROTO_L2_V11019:
-		case ISDN_PROTO_L2_V11038:
-			info->xmit_size /= 10;
-		}
-		break;
-	case REG_SI1I:
-	case REG_PLAN:
-	case REG_SCREEN:
-		/* readonly registers */
-		return 1;
-	}
-	return 0;
-}
-
-/*
- * Perform ATS command
- */
-static int
-isdn_tty_cmd_ATS(char **p, modem_info *info)
-{
-	atemu *m = &info->emu;
-	int bitpos;
-	int mreg;
-	int mval;
-	int bval;
-
-	mreg = isdn_getnum(p);
-	if (mreg < 0 || mreg >= ISDN_MODEM_NUMREG)
-		PARSE_ERROR1;
-	switch (*p[0]) {
-	case '=':
-		p[0]++;
-		mval = isdn_getnum(p);
-		if (mval < 0 || mval > 255)
-			PARSE_ERROR1;
-		if (isdn_tty_check_ats(mreg, mval, info, m))
-			PARSE_ERROR1;
-		m->mdmreg[mreg] = mval;
-		break;
-	case '.':
-		/* Set/Clear a single bit */
-		p[0]++;
-		bitpos = isdn_getnum(p);
-		if ((bitpos < 0) || (bitpos > 7))
-			PARSE_ERROR1;
-		switch (*p[0]) {
-		case '=':
-			p[0]++;
-			bval = isdn_getnum(p);
-			if (bval < 0 || bval > 1)
-				PARSE_ERROR1;
-			if (bval)
-				mval = m->mdmreg[mreg] | (1 << bitpos);
-			else
-				mval = m->mdmreg[mreg] & ~(1 << bitpos);
-			if (isdn_tty_check_ats(mreg, mval, info, m))
-				PARSE_ERROR1;
-			m->mdmreg[mreg] = mval;
-			break;
-		case '?':
-			p[0]++;
-			isdn_tty_at_cout("\r\n", info);
-			isdn_tty_at_cout((m->mdmreg[mreg] & (1 << bitpos)) ? "1" : "0",
-					 info);
-			break;
-		default:
-			PARSE_ERROR1;
-		}
-		break;
-	case '?':
-		p[0]++;
-		isdn_tty_show_profile(mreg, info);
-		break;
-	default:
-		PARSE_ERROR1;
-		break;
-	}
-	return 0;
-}
-
-/*
- * Perform ATA command
- */
-static void
-isdn_tty_cmd_ATA(modem_info *info)
-{
-	atemu *m = &info->emu;
-	isdn_ctrl cmd;
-	int l2;
-
-	if (info->msr & UART_MSR_RI) {
-		/* Accept incoming call */
-		info->last_dir = 0;
-		strcpy(info->last_num, dev->num[info->drv_index]);
-		m->mdmreg[REG_RINGCNT] = 0;
-		info->msr &= ~UART_MSR_RI;
-		l2 = m->mdmreg[REG_L2PROT];
-#ifdef CONFIG_ISDN_AUDIO
-		/* If more than one bit set in reg18, autoselect Layer2 */
-		if ((m->mdmreg[REG_SI1] & m->mdmreg[REG_SI1I]) != m->mdmreg[REG_SI1]) {
-			if (m->mdmreg[REG_SI1I] == 1) {
-				if ((l2 != ISDN_PROTO_L2_MODEM) && (l2 != ISDN_PROTO_L2_FAX))
-					l2 = ISDN_PROTO_L2_TRANS;
-			} else
-				l2 = ISDN_PROTO_L2_X75I;
-		}
-#endif
-		cmd.driver = info->isdn_driver;
-		cmd.command = ISDN_CMD_SETL2;
-		cmd.arg = info->isdn_channel + (l2 << 8);
-		info->last_l2 = l2;
-		isdn_command(&cmd);
-		cmd.driver = info->isdn_driver;
-		cmd.command = ISDN_CMD_SETL3;
-		cmd.arg = info->isdn_channel + (m->mdmreg[REG_L3PROT] << 8);
-#ifdef CONFIG_ISDN_TTY_FAX
-		if (l2 == ISDN_PROTO_L2_FAX) {
-			cmd.parm.fax = info->fax;
-			info->fax->direction = ISDN_TTY_FAX_CONN_IN;
-		}
-#endif
-		isdn_command(&cmd);
-		cmd.driver = info->isdn_driver;
-		cmd.arg = info->isdn_channel;
-		cmd.command = ISDN_CMD_ACCEPTD;
-		info->dialing = 16;
-		info->emu.carrierwait = 0;
-		isdn_command(&cmd);
-		isdn_timer_ctrl(ISDN_TIMER_CARRIER, 1);
-	} else
-		isdn_tty_modem_result(RESULT_NO_ANSWER, info);
-}
-
-#ifdef CONFIG_ISDN_AUDIO
-/*
- * Parse AT+F.. commands
- */
-static int
-isdn_tty_cmd_PLUSF(char **p, modem_info *info)
-{
-	atemu *m = &info->emu;
-	char rs[20];
-
-	if (!strncmp(p[0], "CLASS", 5)) {
-		p[0] += 5;
-		switch (*p[0]) {
-		case '?':
-			p[0]++;
-			sprintf(rs, "\r\n%d",
-				(m->mdmreg[REG_SI1] & 1) ? 8 : 0);
-#ifdef CONFIG_ISDN_TTY_FAX
-			if (TTY_IS_FCLASS2(info))
-				sprintf(rs, "\r\n2");
-			else if (TTY_IS_FCLASS1(info))
-				sprintf(rs, "\r\n1");
-#endif
-			isdn_tty_at_cout(rs, info);
-			break;
-		case '=':
-			p[0]++;
-			switch (*p[0]) {
-			case '0':
-				p[0]++;
-				m->mdmreg[REG_L2PROT] = ISDN_PROTO_L2_X75I;
-				m->mdmreg[REG_L3PROT] = ISDN_PROTO_L3_TRANS;
-				m->mdmreg[REG_SI1] = 4;
-				info->xmit_size =
-					m->mdmreg[REG_PSIZE] * 16;
-				break;
-#ifdef CONFIG_ISDN_TTY_FAX
-			case '1':
-				p[0]++;
-				if (!(dev->global_features &
-				      ISDN_FEATURE_L3_FCLASS1))
-					PARSE_ERROR1;
-				m->mdmreg[REG_SI1] = 1;
-				m->mdmreg[REG_L2PROT] = ISDN_PROTO_L2_FAX;
-				m->mdmreg[REG_L3PROT] = ISDN_PROTO_L3_FCLASS1;
-				info->xmit_size =
-					m->mdmreg[REG_PSIZE] * 16;
-				break;
-			case '2':
-				p[0]++;
-				if (!(dev->global_features &
-				      ISDN_FEATURE_L3_FCLASS2))
-					PARSE_ERROR1;
-				m->mdmreg[REG_SI1] = 1;
-				m->mdmreg[REG_L2PROT] = ISDN_PROTO_L2_FAX;
-				m->mdmreg[REG_L3PROT] = ISDN_PROTO_L3_FCLASS2;
-				info->xmit_size =
-					m->mdmreg[REG_PSIZE] * 16;
-				break;
-#endif
-			case '8':
-				p[0]++;
-				/* L2 will change on dialout with si=1 */
-				m->mdmreg[REG_L2PROT] = ISDN_PROTO_L2_X75I;
-				m->mdmreg[REG_L3PROT] = ISDN_PROTO_L3_TRANS;
-				m->mdmreg[REG_SI1] = 5;
-				info->xmit_size = VBUF;
-				break;
-			case '?':
-				p[0]++;
-				strcpy(rs, "\r\n0,");
-#ifdef CONFIG_ISDN_TTY_FAX
-				if (dev->global_features &
-				    ISDN_FEATURE_L3_FCLASS1)
-					strcat(rs, "1,");
-				if (dev->global_features &
-				    ISDN_FEATURE_L3_FCLASS2)
-					strcat(rs, "2,");
-#endif
-				strcat(rs, "8");
-				isdn_tty_at_cout(rs, info);
-				break;
-			default:
-				PARSE_ERROR1;
-			}
-			break;
-		default:
-			PARSE_ERROR1;
-		}
-		return 0;
-	}
-#ifdef CONFIG_ISDN_TTY_FAX
-	return (isdn_tty_cmd_PLUSF_FAX(p, info));
-#else
-	PARSE_ERROR1;
-#endif
-}
-
-/*
- * Parse AT+V.. commands
- */
-static int
-isdn_tty_cmd_PLUSV(char **p, modem_info *info)
-{
-	atemu *m = &info->emu;
-	isdn_ctrl cmd;
-	static char *vcmd[] =
-		{"NH", "IP", "LS", "RX", "SD", "SM", "TX", "DD", NULL};
-	int i;
-	int par1;
-	int par2;
-	char rs[20];
-
-	i = 0;
-	while (vcmd[i]) {
-		if (!strncmp(vcmd[i], p[0], 2)) {
-			p[0] += 2;
-			break;
-		}
-		i++;
-	}
-	switch (i) {
-	case 0:
-		/* AT+VNH - Auto hangup feature */
-		switch (*p[0]) {
-		case '?':
-			p[0]++;
-			isdn_tty_at_cout("\r\n1", info);
-			break;
-		case '=':
-			p[0]++;
-			switch (*p[0]) {
-			case '1':
-				p[0]++;
-				break;
-			case '?':
-				p[0]++;
-				isdn_tty_at_cout("\r\n1", info);
-				break;
-			default:
-				PARSE_ERROR1;
-			}
-			break;
-		default:
-			PARSE_ERROR1;
-		}
-		break;
-	case 1:
-		/* AT+VIP - Reset all voice parameters */
-		isdn_tty_modem_reset_vpar(m);
-		break;
-	case 2:
-		/* AT+VLS - Select device, accept incoming call */
-		switch (*p[0]) {
-		case '?':
-			p[0]++;
-			sprintf(rs, "\r\n%d", m->vpar[0]);
-			isdn_tty_at_cout(rs, info);
-			break;
-		case '=':
-			p[0]++;
-			switch (*p[0]) {
-			case '0':
-				p[0]++;
-				m->vpar[0] = 0;
-				break;
-			case '2':
-				p[0]++;
-				m->vpar[0] = 2;
-				break;
-			case '?':
-				p[0]++;
-				isdn_tty_at_cout("\r\n0,2", info);
-				break;
-			default:
-				PARSE_ERROR1;
-			}
-			break;
-		default:
-			PARSE_ERROR1;
-		}
-		break;
-	case 3:
-		/* AT+VRX - Start recording */
-		if (!m->vpar[0])
-			PARSE_ERROR1;
-		if (info->online != 1) {
-			isdn_tty_modem_result(RESULT_NO_ANSWER, info);
-			return 1;
-		}
-		info->dtmf_state = isdn_audio_dtmf_init(info->dtmf_state);
-		if (!info->dtmf_state) {
-			printk(KERN_WARNING "isdn_tty: Couldn't malloc dtmf state\n");
-			PARSE_ERROR1;
-		}
-		info->silence_state = isdn_audio_silence_init(info->silence_state);
-		if (!info->silence_state) {
-			printk(KERN_WARNING "isdn_tty: Couldn't malloc silence state\n");
-			PARSE_ERROR1;
-		}
-		if (m->vpar[3] < 5) {
-			info->adpcmr = isdn_audio_adpcm_init(info->adpcmr, m->vpar[3]);
-			if (!info->adpcmr) {
-				printk(KERN_WARNING "isdn_tty: Couldn't malloc adpcm state\n");
-				PARSE_ERROR1;
-			}
-		}
-#ifdef ISDN_DEBUG_AT
-		printk(KERN_DEBUG "AT: +VRX\n");
-#endif
-		info->vonline |= 1;
-		isdn_tty_modem_result(RESULT_CONNECT, info);
-		return 0;
-		break;
-	case 4:
-		/* AT+VSD - Silence detection */
-		switch (*p[0]) {
-		case '?':
-			p[0]++;
-			sprintf(rs, "\r\n<%d>,<%d>",
-				m->vpar[1],
-				m->vpar[2]);
-			isdn_tty_at_cout(rs, info);
-			break;
-		case '=':
-			p[0]++;
-			if ((*p[0] >= '0') && (*p[0] <= '9')) {
-				par1 = isdn_getnum(p);
-				if ((par1 < 0) || (par1 > 31))
-					PARSE_ERROR1;
-				if (*p[0] != ',')
-					PARSE_ERROR1;
-				p[0]++;
-				par2 = isdn_getnum(p);
-				if ((par2 < 0) || (par2 > 255))
-					PARSE_ERROR1;
-				m->vpar[1] = par1;
-				m->vpar[2] = par2;
-				break;
-			} else
-				if (*p[0] == '?') {
-					p[0]++;
-					isdn_tty_at_cout("\r\n<0-31>,<0-255>",
-							 info);
-					break;
-				} else
-					PARSE_ERROR1;
-			break;
-		default:
-			PARSE_ERROR1;
-		}
-		break;
-	case 5:
-		/* AT+VSM - Select compression */
-		switch (*p[0]) {
-		case '?':
-			p[0]++;
-			sprintf(rs, "\r\n<%d>,<%d><8000>",
-				m->vpar[3],
-				m->vpar[1]);
-			isdn_tty_at_cout(rs, info);
-			break;
-		case '=':
-			p[0]++;
-			switch (*p[0]) {
-			case '2':
-			case '3':
-			case '4':
-			case '5':
-			case '6':
-				par1 = isdn_getnum(p);
-				if ((par1 < 2) || (par1 > 6))
-					PARSE_ERROR1;
-				m->vpar[3] = par1;
-				break;
-			case '?':
-				p[0]++;
-				isdn_tty_at_cout("\r\n2;ADPCM;2;0;(8000)\r\n",
-						 info);
-				isdn_tty_at_cout("3;ADPCM;3;0;(8000)\r\n",
-						 info);
-				isdn_tty_at_cout("4;ADPCM;4;0;(8000)\r\n",
-						 info);
-				isdn_tty_at_cout("5;ALAW;8;0;(8000)\r\n",
-						 info);
-				isdn_tty_at_cout("6;ULAW;8;0;(8000)\r\n",
-						 info);
-				break;
-			default:
-				PARSE_ERROR1;
-			}
-			break;
-		default:
-			PARSE_ERROR1;
-		}
-		break;
-	case 6:
-		/* AT+VTX - Start sending */
-		if (!m->vpar[0])
-			PARSE_ERROR1;
-		if (info->online != 1) {
-			isdn_tty_modem_result(RESULT_NO_ANSWER, info);
-			return 1;
-		}
-		info->dtmf_state = isdn_audio_dtmf_init(info->dtmf_state);
-		if (!info->dtmf_state) {
-			printk(KERN_WARNING "isdn_tty: Couldn't malloc dtmf state\n");
-			PARSE_ERROR1;
-		}
-		if (m->vpar[3] < 5) {
-			info->adpcms = isdn_audio_adpcm_init(info->adpcms, m->vpar[3]);
-			if (!info->adpcms) {
-				printk(KERN_WARNING "isdn_tty: Couldn't malloc adpcm state\n");
-				PARSE_ERROR1;
-			}
-		}
-#ifdef ISDN_DEBUG_AT
-		printk(KERN_DEBUG "AT: +VTX\n");
-#endif
-		m->lastDLE = 0;
-		info->vonline |= 2;
-		isdn_tty_modem_result(RESULT_CONNECT, info);
-		return 0;
-		break;
-	case 7:
-		/* AT+VDD - DTMF detection */
-		switch (*p[0]) {
-		case '?':
-			p[0]++;
-			sprintf(rs, "\r\n<%d>,<%d>",
-				m->vpar[4],
-				m->vpar[5]);
-			isdn_tty_at_cout(rs, info);
-			break;
-		case '=':
-			p[0]++;
-			if ((*p[0] >= '0') && (*p[0] <= '9')) {
-				if (info->online != 1)
-					PARSE_ERROR1;
-				par1 = isdn_getnum(p);
-				if ((par1 < 0) || (par1 > 15))
-					PARSE_ERROR1;
-				if (*p[0] != ',')
-					PARSE_ERROR1;
-				p[0]++;
-				par2 = isdn_getnum(p);
-				if ((par2 < 0) || (par2 > 255))
-					PARSE_ERROR1;
-				m->vpar[4] = par1;
-				m->vpar[5] = par2;
-				cmd.driver = info->isdn_driver;
-				cmd.command = ISDN_CMD_AUDIO;
-				cmd.arg = info->isdn_channel + (ISDN_AUDIO_SETDD << 8);
-				cmd.parm.num[0] = par1;
-				cmd.parm.num[1] = par2;
-				isdn_command(&cmd);
-				break;
-			} else
-				if (*p[0] == '?') {
-					p[0]++;
-					isdn_tty_at_cout("\r\n<0-15>,<0-255>",
-							 info);
-					break;
-				} else
-					PARSE_ERROR1;
-			break;
-		default:
-			PARSE_ERROR1;
-		}
-		break;
-	default:
-		PARSE_ERROR1;
-	}
-	return 0;
-}
-#endif                          /* CONFIG_ISDN_AUDIO */
-
-/*
- * Parse and perform an AT-command-line.
- */
-static void
-isdn_tty_parse_at(modem_info *info)
-{
-	atemu *m = &info->emu;
-	char *p;
-	char ds[ISDN_MSNLEN];
-
-#ifdef ISDN_DEBUG_AT
-	printk(KERN_DEBUG "AT: '%s'\n", m->mdmcmd);
-#endif
-	for (p = &m->mdmcmd[2]; *p;) {
-		switch (*p) {
-		case ' ':
-			p++;
-			break;
-		case 'A':
-			/* A - Accept incoming call */
-			p++;
-			isdn_tty_cmd_ATA(info);
-			return;
-		case 'D':
-			/* D - Dial */
-			if (info->msr & UART_MSR_DCD)
-				PARSE_ERROR;
-			if (info->msr & UART_MSR_RI) {
-				isdn_tty_modem_result(RESULT_NO_CARRIER, info);
-				return;
-			}
-			isdn_tty_getdial(++p, ds, sizeof ds);
-			p += strlen(p);
-			if (!strlen(m->msn))
-				isdn_tty_modem_result(RESULT_NO_MSN_EAZ, info);
-			else if (strlen(ds))
-				isdn_tty_dial(ds, info, m);
-			else
-				PARSE_ERROR;
-			return;
-		case 'E':
-			/* E - Turn Echo on/off */
-			p++;
-			switch (isdn_getnum(&p)) {
-			case 0:
-				m->mdmreg[REG_ECHO] &= ~BIT_ECHO;
-				break;
-			case 1:
-				m->mdmreg[REG_ECHO] |= BIT_ECHO;
-				break;
-			default:
-				PARSE_ERROR;
-			}
-			break;
-		case 'H':
-			/* H - On/Off-hook */
-			p++;
-			switch (*p) {
-			case '0':
-				p++;
-				isdn_tty_on_hook(info);
-				break;
-			case '1':
-				p++;
-				isdn_tty_off_hook();
-				break;
-			default:
-				isdn_tty_on_hook(info);
-				break;
-			}
-			break;
-		case 'I':
-			/* I - Information */
-			p++;
-			isdn_tty_at_cout("\r\nLinux ISDN", info);
-			switch (*p) {
-			case '0':
-			case '1':
-				p++;
-				break;
-			case '2':
-				p++;
-				isdn_tty_report(info);
-				break;
-			case '3':
-				p++;
-				snprintf(ds, sizeof(ds), "\r\n%d", info->emu.charge);
-				isdn_tty_at_cout(ds, info);
-				break;
-			default:;
-			}
-			break;
-#ifdef DUMMY_HAYES_AT
-		case 'L':
-		case 'M':
-			/* only for be compilant with common scripts */
-			/* no function */
-			p++;
-			isdn_getnum(&p);
-			break;
-#endif
-		case 'O':
-			/* O - Go online */
-			p++;
-			if (info->msr & UART_MSR_DCD)
-				/* if B-Channel is up */
-				isdn_tty_modem_result((m->mdmreg[REG_L2PROT] == ISDN_PROTO_L2_MODEM) ? RESULT_CONNECT : RESULT_CONNECT64000, info);
-			else
-				isdn_tty_modem_result(RESULT_NO_CARRIER, info);
-			return;
-		case 'Q':
-			/* Q - Turn Emulator messages on/off */
-			p++;
-			switch (isdn_getnum(&p)) {
-			case 0:
-				m->mdmreg[REG_RESP] |= BIT_RESP;
-				break;
-			case 1:
-				m->mdmreg[REG_RESP] &= ~BIT_RESP;
-				break;
-			default:
-				PARSE_ERROR;
-			}
-			break;
-		case 'S':
-			/* S - Set/Get Register */
-			p++;
-			if (isdn_tty_cmd_ATS(&p, info))
-				return;
-			break;
-		case 'V':
-			/* V - Numeric or ASCII Emulator-messages */
-			p++;
-			switch (isdn_getnum(&p)) {
-			case 0:
-				m->mdmreg[REG_RESP] |= BIT_RESPNUM;
-				break;
-			case 1:
-				m->mdmreg[REG_RESP] &= ~BIT_RESPNUM;
-				break;
-			default:
-				PARSE_ERROR;
-			}
-			break;
-		case 'Z':
-			/* Z - Load Registers from Profile */
-			p++;
-			if (info->msr & UART_MSR_DCD) {
-				info->online = 0;
-				isdn_tty_on_hook(info);
-			}
-			isdn_tty_modem_reset_regs(info, 1);
-			break;
-		case '+':
-			p++;
-			switch (*p) {
-#ifdef CONFIG_ISDN_AUDIO
-			case 'F':
-				p++;
-				if (isdn_tty_cmd_PLUSF(&p, info))
-					return;
-				break;
-			case 'V':
-				if ((!(m->mdmreg[REG_SI1] & 1)) ||
-				    (m->mdmreg[REG_L2PROT] == ISDN_PROTO_L2_MODEM))
-					PARSE_ERROR;
-				p++;
-				if (isdn_tty_cmd_PLUSV(&p, info))
-					return;
-				break;
-#endif                          /* CONFIG_ISDN_AUDIO */
-			case 'S':	/* SUSPEND */
-				p++;
-				isdn_tty_get_msnstr(ds, &p);
-				isdn_tty_suspend(ds, info, m);
-				break;
-			case 'R':	/* RESUME */
-				p++;
-				isdn_tty_get_msnstr(ds, &p);
-				isdn_tty_resume(ds, info, m);
-				break;
-			case 'M':	/* MESSAGE */
-				p++;
-				isdn_tty_send_msg(info, m, p);
-				break;
-			default:
-				PARSE_ERROR;
-			}
-			break;
-		case '&':
-			p++;
-			if (isdn_tty_cmd_ATand(&p, info))
-				return;
-			break;
-		default:
-			PARSE_ERROR;
-		}
-	}
-#ifdef CONFIG_ISDN_AUDIO
-	if (!info->vonline)
-#endif
-		isdn_tty_modem_result(RESULT_OK, info);
-}
-
-/* Need own toupper() because standard-toupper is not available
- * within modules.
- */
-#define my_toupper(c) (((c >= 'a') && (c <= 'z')) ? (c & 0xdf) : c)
-
-/*
- * Perform line-editing of AT-commands
- *
- * Parameters:
- *   p        inputbuffer
- *   count    length of buffer
- *   channel  index to line (minor-device)
- */
-static int
-isdn_tty_edit_at(const char *p, int count, modem_info *info)
-{
-	atemu *m = &info->emu;
-	int total = 0;
-	u_char c;
-	char eb[2];
-	int cnt;
-
-	for (cnt = count; cnt > 0; p++, cnt--) {
-		c = *p;
-		total++;
-		if (c == m->mdmreg[REG_CR] || c == m->mdmreg[REG_LF]) {
-			/* Separator (CR or LF) */
-			m->mdmcmd[m->mdmcmdl] = 0;
-			if (m->mdmreg[REG_ECHO] & BIT_ECHO) {
-				eb[0] = c;
-				eb[1] = 0;
-				isdn_tty_at_cout(eb, info);
-			}
-			if ((m->mdmcmdl >= 2) && (!(strncmp(m->mdmcmd, "AT", 2))))
-				isdn_tty_parse_at(info);
-			m->mdmcmdl = 0;
-			continue;
-		}
-		if (c == m->mdmreg[REG_BS] && m->mdmreg[REG_BS] < 128) {
-			/* Backspace-Function */
-			if ((m->mdmcmdl > 2) || (!m->mdmcmdl)) {
-				if (m->mdmcmdl)
-					m->mdmcmdl--;
-				if (m->mdmreg[REG_ECHO] & BIT_ECHO)
-					isdn_tty_at_cout("\b", info);
-			}
-			continue;
-		}
-		if (cmdchar(c)) {
-			if (m->mdmreg[REG_ECHO] & BIT_ECHO) {
-				eb[0] = c;
-				eb[1] = 0;
-				isdn_tty_at_cout(eb, info);
-			}
-			if (m->mdmcmdl < 255) {
-				c = my_toupper(c);
-				switch (m->mdmcmdl) {
-				case 1:
-					if (c == 'T') {
-						m->mdmcmd[m->mdmcmdl] = c;
-						m->mdmcmd[++m->mdmcmdl] = 0;
-						break;
-					} else
-						m->mdmcmdl = 0;
-					/* Fall through - check for 'A' */
-				case 0:
-					if (c == 'A') {
-						m->mdmcmd[m->mdmcmdl] = c;
-						m->mdmcmd[++m->mdmcmdl] = 0;
-					}
-					break;
-				default:
-					m->mdmcmd[m->mdmcmdl] = c;
-					m->mdmcmd[++m->mdmcmdl] = 0;
-				}
-			}
-		}
-	}
-	return total;
-}
-
-/*
- * Switch all modem-channels who are online and got a valid
- * escape-sequence 1.5 seconds ago, to command-mode.
- * This function is called every second via timer-interrupt from within
- * timer-dispatcher isdn_timer_function()
- */
-void
-isdn_tty_modem_escape(void)
-{
-	int ton = 0;
-	int i;
-	int midx;
-
-	for (i = 0; i < ISDN_MAX_CHANNELS; i++)
-		if (USG_MODEM(dev->usage[i]) && (midx = dev->m_idx[i]) >= 0) {
-			modem_info *info = &dev->mdm.info[midx];
-			if (info->online) {
-				ton = 1;
-				if ((info->emu.pluscount == 3) &&
-				    time_after(jiffies,
-					    info->emu.lastplus + PLUSWAIT2)) {
-					info->emu.pluscount = 0;
-					info->online = 0;
-					isdn_tty_modem_result(RESULT_OK, info);
-				}
-			}
-		}
-	isdn_timer_ctrl(ISDN_TIMER_MODEMPLUS, ton);
-}
-
-/*
- * Put a RING-message to all modem-channels who have the RI-bit set.
- * This function is called every second via timer-interrupt from within
- * timer-dispatcher isdn_timer_function()
- */
-void
-isdn_tty_modem_ring(void)
-{
-	int ton = 0;
-	int i;
-
-	for (i = 0; i < ISDN_MAX_CHANNELS; i++) {
-		modem_info *info = &dev->mdm.info[i];
-		if (info->msr & UART_MSR_RI) {
-			ton = 1;
-			isdn_tty_modem_result(RESULT_RING, info);
-		}
-	}
-	isdn_timer_ctrl(ISDN_TIMER_MODEMRING, ton);
-}
-
-/*
- * For all online tty's, try sending data to
- * the lower levels.
- */
-void
-isdn_tty_modem_xmit(void)
-{
-	int ton = 1;
-	int i;
-
-	for (i = 0; i < ISDN_MAX_CHANNELS; i++) {
-		modem_info *info = &dev->mdm.info[i];
-		if (info->online) {
-			ton = 1;
-			isdn_tty_senddown(info);
-			isdn_tty_tint(info);
-		}
-	}
-	isdn_timer_ctrl(ISDN_TIMER_MODEMXMIT, ton);
-}
-
-/*
- * Check all channels if we have a 'no carrier' timeout.
- * Timeout value is set by Register S7.
- */
-void
-isdn_tty_carrier_timeout(void)
-{
-	int ton = 0;
-	int i;
-
-	for (i = 0; i < ISDN_MAX_CHANNELS; i++) {
-		modem_info *info = &dev->mdm.info[i];
-		if (!info->dialing)
-			continue;
-		if (info->emu.carrierwait++ > info->emu.mdmreg[REG_WAITC]) {
-			info->dialing = 0;
-			isdn_tty_modem_result(RESULT_NO_CARRIER, info);
-			isdn_tty_modem_hup(info, 1);
-		} else
-			ton = 1;
-	}
-	isdn_timer_ctrl(ISDN_TIMER_CARRIER, ton);
-}
diff --git a/drivers/isdn/i4l/isdn_tty.h b/drivers/isdn/i4l/isdn_tty.h
deleted file mode 100644
index a6f801d2263b..000000000000
--- a/drivers/isdn/i4l/isdn_tty.h
+++ /dev/null
@@ -1,120 +0,0 @@
-/* $Id: isdn_tty.h,v 1.1.2.2 2004/01/12 22:37:19 keil Exp $
- *
- * header for Linux ISDN subsystem, tty related functions (linklevel).
- *
- * Copyright 1994-1999  by Fritz Elfert (fritz@isdn4linux.de)
- * Copyright 1995,96    by Thinking Objects Software GmbH Wuerzburg
- *
- * This software may be used and distributed according to the terms
- * of the GNU General Public License, incorporated herein by reference.
- *
- */
-
-
-#define DLE 0x10
-#define ETX 0x03
-#define DC4 0x14
-
-
-/*
- * Definition of some special Registers of AT-Emulator
- */
-#define REG_RINGATA   0
-#define REG_RINGCNT   1  /* ring counter register */
-#define REG_ESC       2
-#define REG_CR        3
-#define REG_LF        4
-#define REG_BS        5
-
-#define REG_WAITC     7
-
-#define REG_RESP     12  /* show response messages register */
-#define BIT_RESP      1  /* show response messages bit      */
-#define REG_RESPNUM  12  /* show numeric responses register */
-#define BIT_RESPNUM   2  /* show numeric responses bit      */
-#define REG_ECHO     12
-#define BIT_ECHO      4
-#define REG_DCD      12
-#define BIT_DCD       8
-#define REG_CTS      12
-#define BIT_CTS      16
-#define REG_DTRR     12
-#define BIT_DTRR     32
-#define REG_DSR      12
-#define BIT_DSR      64
-#define REG_CPPP     12
-#define BIT_CPPP    128
-
-#define REG_DXMT     13
-#define BIT_DXMT      1
-#define REG_T70      13
-#define BIT_T70       2
-#define BIT_T70_EXT  32
-#define REG_DTRHUP   13
-#define BIT_DTRHUP    4
-#define REG_RESPXT   13
-#define BIT_RESPXT    8
-#define REG_CIDONCE  13
-#define BIT_CIDONCE  16
-#define REG_RUNG     13  /* show RUNG message register      */
-#define BIT_RUNG     64  /* show RUNG message bit           */
-#define REG_DISPLAY  13
-#define BIT_DISPLAY 128
-
-#define REG_L2PROT   14
-#define REG_L3PROT   15
-#define REG_PSIZE    16
-#define REG_WSIZE    17
-#define REG_SI1      18
-#define REG_SI2      19
-#define REG_SI1I     20
-#define REG_PLAN     21
-#define REG_SCREEN   22
-
-#define REG_CPN      23
-#define BIT_CPN       1
-#define REG_CPNFCON  23
-#define BIT_CPNFCON   2
-#define REG_CDN      23
-#define BIT_CDN       4
-
-/* defines for result codes */
-#define RESULT_OK		0
-#define RESULT_CONNECT		1
-#define RESULT_RING		2
-#define RESULT_NO_CARRIER	3
-#define RESULT_ERROR		4
-#define RESULT_CONNECT64000	5
-#define RESULT_NO_DIALTONE	6
-#define RESULT_BUSY		7
-#define RESULT_NO_ANSWER	8
-#define RESULT_RINGING		9
-#define RESULT_NO_MSN_EAZ	10
-#define RESULT_VCON		11
-#define RESULT_RUNG		12
-
-#define TTY_IS_FCLASS1(info)						\
-	((info->emu.mdmreg[REG_L2PROT] == ISDN_PROTO_L2_FAX) &&		\
-	 (info->emu.mdmreg[REG_L3PROT] == ISDN_PROTO_L3_FCLASS1))
-#define TTY_IS_FCLASS2(info)						\
-	((info->emu.mdmreg[REG_L2PROT] == ISDN_PROTO_L2_FAX) &&		\
-	 (info->emu.mdmreg[REG_L3PROT] == ISDN_PROTO_L3_FCLASS2))
-
-extern void isdn_tty_modem_escape(void);
-extern void isdn_tty_modem_ring(void);
-extern void isdn_tty_carrier_timeout(void);
-extern void isdn_tty_modem_xmit(void);
-extern int  isdn_tty_modem_init(void);
-extern void isdn_tty_exit(void);
-extern void isdn_tty_readmodem(void);
-extern int  isdn_tty_find_icall(int, int, setup_parm *);
-extern int  isdn_tty_stat_callback(int, isdn_ctrl *);
-extern int  isdn_tty_rcv_skb(int, int, int, struct sk_buff *);
-extern int  isdn_tty_capi_facility(capi_msg *cm);
-extern void isdn_tty_at_cout(char *, modem_info *);
-extern void isdn_tty_modem_hup(modem_info *, int);
-#ifdef CONFIG_ISDN_TTY_FAX
-extern int  isdn_tty_cmd_PLUSF_FAX(char **, modem_info *);
-extern int  isdn_tty_fax_command(modem_info *, isdn_ctrl *);
-extern void isdn_tty_fax_bitorder(modem_info *, struct sk_buff *);
-#endif
diff --git a/drivers/isdn/i4l/isdn_ttyfax.c b/drivers/isdn/i4l/isdn_ttyfax.c
deleted file mode 100644
index 47aae4916730..000000000000
--- a/drivers/isdn/i4l/isdn_ttyfax.c
+++ /dev/null
@@ -1,1123 +0,0 @@
-/* $Id: isdn_ttyfax.c,v 1.1.2.2 2004/01/12 22:37:19 keil Exp $
- *
- * Linux ISDN subsystem, tty_fax AT-command emulator (linklevel).
- *
- * Copyright 1999    by Armin Schindler (mac@melware.de)
- * Copyright 1999    by Ralf Spachmann (mel@melware.de)
- * Copyright 1999    by Cytronics & Melware
- *
- * This software may be used and distributed according to the terms
- * of the GNU General Public License, incorporated herein by reference.
- *
- */
-
-#undef ISDN_TTY_FAX_STAT_DEBUG
-#undef ISDN_TTY_FAX_CMD_DEBUG
-
-#include <linux/isdn.h>
-#include "isdn_common.h"
-#include "isdn_tty.h"
-#include "isdn_ttyfax.h"
-
-
-static char *isdn_tty_fax_revision = "$Revision: 1.1.2.2 $";
-
-#define PARSE_ERROR1 { isdn_tty_fax_modem_result(1, info); return 1; }
-
-static char *
-isdn_getrev(const char *revision)
-{
-	char *rev;
-	char *p;
-
-	if ((p = strchr(revision, ':'))) {
-		rev = p + 2;
-		p = strchr(rev, '$');
-		*--p = 0;
-	} else
-		rev = "???";
-	return rev;
-}
-
-/*
- * Fax Class 2 Modem results
- *
- */
-
-static void
-isdn_tty_fax_modem_result(int code, modem_info *info)
-{
-	atemu *m = &info->emu;
-	T30_s *f = info->fax;
-	char rs[50];
-	char rss[50];
-	char *rp;
-	int i;
-	static char *msg[] =
-		{"OK", "ERROR", "+FCON", "+FCSI:", "+FDIS:",
-		 "+FHNG:", "+FDCS:", "CONNECT", "+FTSI:",
-		 "+FCFR", "+FPTS:", "+FET:"};
-
-
-	isdn_tty_at_cout("\r\n", info);
-	isdn_tty_at_cout(msg[code], info);
-
-#ifdef ISDN_TTY_FAX_CMD_DEBUG
-	printk(KERN_DEBUG "isdn_tty: Fax send %s on ttyI%d\n",
-	       msg[code], info->line);
-#endif
-	switch (code) {
-	case 0: /* OK */
-		break;
-	case 1: /* ERROR */
-		break;
-	case 2:	/* +FCON */
-		/* Append CPN, if enabled */
-		if ((m->mdmreg[REG_CPNFCON] & BIT_CPNFCON) &&
-		    (!(dev->usage[info->isdn_channel] & ISDN_USAGE_OUTGOING))) {
-			sprintf(rs, "/%s", m->cpn);
-			isdn_tty_at_cout(rs, info);
-		}
-		info->online = 1;
-		f->fet = 0;
-		if (f->phase == ISDN_FAX_PHASE_A)
-			f->phase = ISDN_FAX_PHASE_B;
-		break;
-	case 3:	/* +FCSI */
-	case 8:	/* +FTSI */
-		sprintf(rs, "\"%s\"", f->r_id);
-		isdn_tty_at_cout(rs, info);
-		break;
-	case 4:	/* +FDIS */
-		rs[0] = 0;
-		rp = &f->r_resolution;
-		for (i = 0; i < 8; i++) {
-			sprintf(rss, "%c%s", rp[i] + 48,
-				(i < 7) ? "," : "");
-			strcat(rs, rss);
-		}
-		isdn_tty_at_cout(rs, info);
-#ifdef ISDN_TTY_FAX_CMD_DEBUG
-		printk(KERN_DEBUG "isdn_tty: Fax DIS=%s on ttyI%d\n",
-		       rs, info->line);
-#endif
-		break;
-	case 5:	/* +FHNG */
-		sprintf(rs, "%d", f->code);
-		isdn_tty_at_cout(rs, info);
-		info->faxonline = 0;
-		break;
-	case 6:	/* +FDCS */
-		rs[0] = 0;
-		rp = &f->r_resolution;
-		for (i = 0; i < 8; i++) {
-			sprintf(rss, "%c%s", rp[i] + 48,
-				(i < 7) ? "," : "");
-			strcat(rs, rss);
-		}
-		isdn_tty_at_cout(rs, info);
-#ifdef ISDN_TTY_FAX_CMD_DEBUG
-		printk(KERN_DEBUG "isdn_tty: Fax DCS=%s on ttyI%d\n",
-		       rs, info->line);
-#endif
-		break;
-	case 7:	/* CONNECT */
-		info->faxonline |= 2;
-		break;
-	case 9:	/* FCFR */
-		break;
-	case 10:	/* FPTS */
-		isdn_tty_at_cout("1", info);
-		break;
-	case 11:	/* FET */
-		sprintf(rs, "%d", f->fet);
-		isdn_tty_at_cout(rs, info);
-		break;
-	}
-
-	isdn_tty_at_cout("\r\n", info);
-
-	switch (code) {
-	case 7:	/* CONNECT */
-		info->online = 2;
-		if (info->faxonline & 1) {
-			sprintf(rs, "%c", XON);
-			isdn_tty_at_cout(rs, info);
-		}
-		break;
-	}
-}
-
-static int
-isdn_tty_fax_command1(modem_info *info, isdn_ctrl *c)
-{
-	static char *msg[] =
-		{"OK", "CONNECT", "NO CARRIER", "ERROR", "FCERROR"};
-
-#ifdef ISDN_TTY_FAX_CMD_DEBUG
-	printk(KERN_DEBUG "isdn_tty: FCLASS1 cmd(%d)\n", c->parm.aux.cmd);
-#endif
-	if (c->parm.aux.cmd < ISDN_FAX_CLASS1_QUERY) {
-		if (info->online)
-			info->online = 1;
-		isdn_tty_at_cout("\r\n", info);
-		isdn_tty_at_cout(msg[c->parm.aux.cmd], info);
-		isdn_tty_at_cout("\r\n", info);
-	}
-	switch (c->parm.aux.cmd) {
-	case ISDN_FAX_CLASS1_CONNECT:
-		info->online = 2;
-		break;
-	case ISDN_FAX_CLASS1_OK:
-	case ISDN_FAX_CLASS1_FCERROR:
-	case ISDN_FAX_CLASS1_ERROR:
-	case ISDN_FAX_CLASS1_NOCARR:
-		break;
-	case ISDN_FAX_CLASS1_QUERY:
-		isdn_tty_at_cout("\r\n", info);
-		if (!c->parm.aux.para[0]) {
-			isdn_tty_at_cout(msg[ISDN_FAX_CLASS1_ERROR], info);
-			isdn_tty_at_cout("\r\n", info);
-		} else {
-			isdn_tty_at_cout(c->parm.aux.para, info);
-			isdn_tty_at_cout("\r\nOK\r\n", info);
-		}
-		break;
-	}
-	return (0);
-}
-
-int
-isdn_tty_fax_command(modem_info *info, isdn_ctrl *c)
-{
-	T30_s *f = info->fax;
-	char rs[10];
-
-	if (TTY_IS_FCLASS1(info))
-		return (isdn_tty_fax_command1(info, c));
-
-#ifdef ISDN_TTY_FAX_CMD_DEBUG
-	printk(KERN_DEBUG "isdn_tty: Fax cmd %d on ttyI%d\n",
-	       f->r_code, info->line);
-#endif
-	switch (f->r_code) {
-	case ISDN_TTY_FAX_FCON:
-		info->faxonline = 1;
-		isdn_tty_fax_modem_result(2, info);	/* +FCON */
-		return (0);
-	case ISDN_TTY_FAX_FCON_I:
-		info->faxonline = 16;
-		isdn_tty_fax_modem_result(2, info);	/* +FCON */
-		return (0);
-	case ISDN_TTY_FAX_RID:
-		if (info->faxonline & 1)
-			isdn_tty_fax_modem_result(3, info);	/* +FCSI */
-		if (info->faxonline & 16)
-			isdn_tty_fax_modem_result(8, info);	/* +FTSI */
-		return (0);
-	case ISDN_TTY_FAX_DIS:
-		isdn_tty_fax_modem_result(4, info);	/* +FDIS */
-		return (0);
-	case ISDN_TTY_FAX_HNG:
-		if (f->phase == ISDN_FAX_PHASE_C) {
-			if (f->direction == ISDN_TTY_FAX_CONN_IN) {
-				sprintf(rs, "%c%c", DLE, ETX);
-				isdn_tty_at_cout(rs, info);
-			} else {
-				sprintf(rs, "%c", 0x18);
-				isdn_tty_at_cout(rs, info);
-			}
-			info->faxonline &= ~2;	/* leave data mode */
-			info->online = 1;
-		}
-		f->phase = ISDN_FAX_PHASE_E;
-		isdn_tty_fax_modem_result(5, info);	/* +FHNG */
-		isdn_tty_fax_modem_result(0, info);	/* OK */
-		return (0);
-	case ISDN_TTY_FAX_DCS:
-		isdn_tty_fax_modem_result(6, info);	/* +FDCS */
-		isdn_tty_fax_modem_result(7, info);	/* CONNECT */
-		f->phase = ISDN_FAX_PHASE_C;
-		return (0);
-	case ISDN_TTY_FAX_TRAIN_OK:
-		isdn_tty_fax_modem_result(6, info);	/* +FDCS */
-		isdn_tty_fax_modem_result(0, info);	/* OK */
-		return (0);
-	case ISDN_TTY_FAX_SENT:
-		isdn_tty_fax_modem_result(0, info);	/* OK */
-		return (0);
-	case ISDN_TTY_FAX_CFR:
-		isdn_tty_fax_modem_result(9, info);	/* +FCFR */
-		return (0);
-	case ISDN_TTY_FAX_ET:
-		sprintf(rs, "%c%c", DLE, ETX);
-		isdn_tty_at_cout(rs, info);
-		isdn_tty_fax_modem_result(10, info);	/* +FPTS */
-		isdn_tty_fax_modem_result(11, info);	/* +FET */
-		isdn_tty_fax_modem_result(0, info);	/* OK */
-		info->faxonline &= ~2;	/* leave data mode */
-		info->online = 1;
-		f->phase = ISDN_FAX_PHASE_D;
-		return (0);
-	case ISDN_TTY_FAX_PTS:
-		isdn_tty_fax_modem_result(10, info);	/* +FPTS */
-		if (f->direction == ISDN_TTY_FAX_CONN_OUT) {
-			if (f->fet == 1)
-				f->phase = ISDN_FAX_PHASE_B;
-			if (f->fet == 0)
-				isdn_tty_fax_modem_result(0, info);	/* OK */
-		}
-		return (0);
-	case ISDN_TTY_FAX_EOP:
-		info->faxonline &= ~2;	/* leave data mode */
-		info->online = 1;
-		f->phase = ISDN_FAX_PHASE_D;
-		return (0);
-
-	}
-	return (-1);
-}
-
-
-void
-isdn_tty_fax_bitorder(modem_info *info, struct sk_buff *skb)
-{
-	__u8 LeftMask;
-	__u8 RightMask;
-	__u8 fBit;
-	__u8 Data;
-	int i;
-
-	if (!info->fax->bor) {
-		for (i = 0; i < skb->len; i++) {
-			Data = skb->data[i];
-			for (
-				LeftMask = 0x80, RightMask = 0x01;
-				LeftMask > RightMask;
-				LeftMask >>= 1, RightMask <<= 1
-				) {
-				fBit = (Data & LeftMask);
-				if (Data & RightMask)
-					Data |= LeftMask;
-				else
-					Data &= ~LeftMask;
-				if (fBit)
-					Data |= RightMask;
-				else
-					Data &= ~RightMask;
-
-			}
-			skb->data[i] = Data;
-		}
-	}
-}
-
-/*
- * Parse AT+F.. FAX class 1 commands
- */
-
-static int
-isdn_tty_cmd_FCLASS1(char **p, modem_info *info)
-{
-	static char *cmd[] =
-		{"AE", "TS", "RS", "TM", "RM", "TH", "RH"};
-	isdn_ctrl c;
-	int par, i;
-	u_long flags;
-
-	for (c.parm.aux.cmd = 0; c.parm.aux.cmd < 7; c.parm.aux.cmd++)
-		if (!strncmp(p[0], cmd[c.parm.aux.cmd], 2))
-			break;
-
-#ifdef ISDN_TTY_FAX_CMD_DEBUG
-	printk(KERN_DEBUG "isdn_tty_cmd_FCLASS1 (%s,%d)\n", p[0], c.parm.aux.cmd);
-#endif
-	if (c.parm.aux.cmd == 7)
-		PARSE_ERROR1;
-
-	p[0] += 2;
-	switch (*p[0]) {
-	case '?':
-		p[0]++;
-		c.parm.aux.subcmd = AT_QUERY;
-		break;
-	case '=':
-		p[0]++;
-		if (*p[0] == '?') {
-			p[0]++;
-			c.parm.aux.subcmd = AT_EQ_QUERY;
-		} else {
-			par = isdn_getnum(p);
-			if ((par < 0) || (par > 255))
-				PARSE_ERROR1;
-			c.parm.aux.subcmd = AT_EQ_VALUE;
-			c.parm.aux.para[0] = par;
-		}
-		break;
-	case 0:
-		c.parm.aux.subcmd = AT_COMMAND;
-		break;
-	default:
-		PARSE_ERROR1;
-	}
-	c.command = ISDN_CMD_FAXCMD;
-#ifdef ISDN_TTY_FAX_CMD_DEBUG
-	printk(KERN_DEBUG "isdn_tty_cmd_FCLASS1 %d/%d/%d)\n",
-	       c.parm.aux.cmd, c.parm.aux.subcmd, c.parm.aux.para[0]);
-#endif
-	if (info->isdn_driver < 0) {
-		if ((c.parm.aux.subcmd == AT_EQ_VALUE) ||
-		    (c.parm.aux.subcmd == AT_COMMAND)) {
-			PARSE_ERROR1;
-		}
-		spin_lock_irqsave(&dev->lock, flags);
-		/* get a temporary connection to the first free fax driver */
-		i = isdn_get_free_channel(ISDN_USAGE_FAX, ISDN_PROTO_L2_FAX,
-					  ISDN_PROTO_L3_FCLASS1, -1, -1, "00");
-		if (i < 0) {
-			spin_unlock_irqrestore(&dev->lock, flags);
-			PARSE_ERROR1;
-		}
-		info->isdn_driver = dev->drvmap[i];
-		info->isdn_channel = dev->chanmap[i];
-		info->drv_index = i;
-		dev->m_idx[i] = info->line;
-		spin_unlock_irqrestore(&dev->lock, flags);
-		c.driver = info->isdn_driver;
-		c.arg = info->isdn_channel;
-		isdn_command(&c);
-		spin_lock_irqsave(&dev->lock, flags);
-		isdn_free_channel(info->isdn_driver, info->isdn_channel,
-				  ISDN_USAGE_FAX);
-		info->isdn_driver = -1;
-		info->isdn_channel = -1;
-		if (info->drv_index >= 0) {
-			dev->m_idx[info->drv_index] = -1;
-			info->drv_index = -1;
-		}
-		spin_unlock_irqrestore(&dev->lock, flags);
-	} else {
-		c.driver = info->isdn_driver;
-		c.arg = info->isdn_channel;
-		isdn_command(&c);
-	}
-	return 1;
-}
-
-/*
- * Parse AT+F.. FAX class 2 commands
- */
-
-static int
-isdn_tty_cmd_FCLASS2(char **p, modem_info *info)
-{
-	atemu *m = &info->emu;
-	T30_s *f = info->fax;
-	isdn_ctrl cmd;
-	int par;
-	char rs[50];
-	char rss[50];
-	int maxdccval[] =
-		{1, 5, 2, 2, 3, 2, 0, 7};
-
-	/* FAA still unchanged */
-	if (!strncmp(p[0], "AA", 2)) {	/* TODO */
-		p[0] += 2;
-		switch (*p[0]) {
-		case '?':
-			p[0]++;
-			sprintf(rs, "\r\n%d", 0);
-			isdn_tty_at_cout(rs, info);
-			break;
-		case '=':
-			p[0]++;
-			par = isdn_getnum(p);
-			if ((par < 0) || (par > 255))
-				PARSE_ERROR1;
-			break;
-		default:
-			PARSE_ERROR1;
-		}
-		return 0;
-	}
-	/* BADLIN=value - dummy 0=disable errorchk disabled, 1-255 nr. of lines for making page bad */
-	if (!strncmp(p[0], "BADLIN", 6)) {
-		p[0] += 6;
-		switch (*p[0]) {
-		case '?':
-			p[0]++;
-			sprintf(rs, "\r\n%d", f->badlin);
-			isdn_tty_at_cout(rs, info);
-			break;
-		case '=':
-			p[0]++;
-			if (*p[0] == '?') {
-				p[0]++;
-				sprintf(rs, "\r\n0-255");
-				isdn_tty_at_cout(rs, info);
-			} else {
-				par = isdn_getnum(p);
-				if ((par < 0) || (par > 255))
-					PARSE_ERROR1;
-				f->badlin = par;
-#ifdef ISDN_TTY_FAX_STAT_DEBUG
-				printk(KERN_DEBUG "isdn_tty: Fax FBADLIN=%d\n", par);
-#endif
-			}
-			break;
-		default:
-			PARSE_ERROR1;
-		}
-		return 0;
-	}
-	/* BADMUL=value - dummy 0=disable errorchk disabled (threshold multiplier) */
-	if (!strncmp(p[0], "BADMUL", 6)) {
-		p[0] += 6;
-		switch (*p[0]) {
-		case '?':
-			p[0]++;
-			sprintf(rs, "\r\n%d", f->badmul);
-			isdn_tty_at_cout(rs, info);
-			break;
-		case '=':
-			p[0]++;
-			if (*p[0] == '?') {
-				p[0]++;
-				sprintf(rs, "\r\n0-255");
-				isdn_tty_at_cout(rs, info);
-			} else {
-				par = isdn_getnum(p);
-				if ((par < 0) || (par > 255))
-					PARSE_ERROR1;
-				f->badmul = par;
-#ifdef ISDN_TTY_FAX_STAT_DEBUG
-				printk(KERN_DEBUG "isdn_tty: Fax FBADMUL=%d\n", par);
-#endif
-			}
-			break;
-		default:
-			PARSE_ERROR1;
-		}
-		return 0;
-	}
-	/* BOR=n - Phase C bit order, 0=direct, 1=reverse */
-	if (!strncmp(p[0], "BOR", 3)) {
-		p[0] += 3;
-		switch (*p[0]) {
-		case '?':
-			p[0]++;
-			sprintf(rs, "\r\n%d", f->bor);
-			isdn_tty_at_cout(rs, info);
-			break;
-		case '=':
-			p[0]++;
-			if (*p[0] == '?') {
-				p[0]++;
-				sprintf(rs, "\r\n0,1");
-				isdn_tty_at_cout(rs, info);
-			} else {
-				par = isdn_getnum(p);
-				if ((par < 0) || (par > 1))
-					PARSE_ERROR1;
-				f->bor = par;
-#ifdef ISDN_TTY_FAX_STAT_DEBUG
-				printk(KERN_DEBUG "isdn_tty: Fax FBOR=%d\n", par);
-#endif
-			}
-			break;
-		default:
-			PARSE_ERROR1;
-		}
-		return 0;
-	}
-	/* NBC=n - No Best Capabilities */
-	if (!strncmp(p[0], "NBC", 3)) {
-		p[0] += 3;
-		switch (*p[0]) {
-		case '?':
-			p[0]++;
-			sprintf(rs, "\r\n%d", f->nbc);
-			isdn_tty_at_cout(rs, info);
-			break;
-		case '=':
-			p[0]++;
-			if (*p[0] == '?') {
-				p[0]++;
-				sprintf(rs, "\r\n0,1");
-				isdn_tty_at_cout(rs, info);
-			} else {
-				par = isdn_getnum(p);
-				if ((par < 0) || (par > 1))
-					PARSE_ERROR1;
-				f->nbc = par;
-#ifdef ISDN_TTY_FAX_STAT_DEBUG
-				printk(KERN_DEBUG "isdn_tty: Fax FNBC=%d\n", par);
-#endif
-			}
-			break;
-		default:
-			PARSE_ERROR1;
-		}
-		return 0;
-	}
-	/* BUF? - Readonly buffersize readout  */
-	if (!strncmp(p[0], "BUF?", 4)) {
-		p[0] += 4;
-#ifdef ISDN_TTY_FAX_STAT_DEBUG
-		printk(KERN_DEBUG "isdn_tty: Fax FBUF? (%d) \n", (16 * m->mdmreg[REG_PSIZE]));
-#endif
-		p[0]++;
-		sprintf(rs, "\r\n %d ", (16 * m->mdmreg[REG_PSIZE]));
-		isdn_tty_at_cout(rs, info);
-		return 0;
-	}
-	/* CIG=string - local fax station id string for polling rx */
-	if (!strncmp(p[0], "CIG", 3)) {
-		int i, r;
-		p[0] += 3;
-		switch (*p[0]) {
-		case '?':
-			p[0]++;
-			sprintf(rs, "\r\n\"%s\"", f->pollid);
-			isdn_tty_at_cout(rs, info);
-			break;
-		case '=':
-			p[0]++;
-			if (*p[0] == '?') {
-				p[0]++;
-				sprintf(rs, "\r\n\"STRING\"");
-				isdn_tty_at_cout(rs, info);
-			} else {
-				if (*p[0] == '"')
-					p[0]++;
-				for (i = 0; (*p[0]) && i < (FAXIDLEN - 1) && (*p[0] != '"'); i++) {
-					f->pollid[i] = *p[0]++;
-				}
-				if (*p[0] == '"')
-					p[0]++;
-				for (r = i; r < FAXIDLEN; r++) {
-					f->pollid[r] = 32;
-				}
-				f->pollid[FAXIDLEN - 1] = 0;
-#ifdef ISDN_TTY_FAX_STAT_DEBUG
-				printk(KERN_DEBUG "isdn_tty: Fax local poll ID rx \"%s\"\n", f->pollid);
-#endif
-			}
-			break;
-		default:
-			PARSE_ERROR1;
-		}
-		return 0;
-	}
-	/* CQ=n - copy qlty chk, 0= no chk, 1=only 1D chk, 2=1D+2D chk */
-	if (!strncmp(p[0], "CQ", 2)) {
-		p[0] += 2;
-		switch (*p[0]) {
-		case '?':
-			p[0]++;
-			sprintf(rs, "\r\n%d", f->cq);
-			isdn_tty_at_cout(rs, info);
-			break;
-		case '=':
-			p[0]++;
-			if (*p[0] == '?') {
-				p[0]++;
-				sprintf(rs, "\r\n0,1,2");
-				isdn_tty_at_cout(rs, info);
-			} else {
-				par = isdn_getnum(p);
-				if ((par < 0) || (par > 2))
-					PARSE_ERROR1;
-				f->cq = par;
-#ifdef ISDN_TTY_FAX_STAT_DEBUG
-				printk(KERN_DEBUG "isdn_tty: Fax FCQ=%d\n", par);
-#endif
-			}
-			break;
-		default:
-			PARSE_ERROR1;
-		}
-		return 0;
-	}
-	/* CR=n - can receive? 0= no data rx or poll remote dev, 1=do receive data or poll remote dev */
-	if (!strncmp(p[0], "CR", 2)) {
-		p[0] += 2;
-		switch (*p[0]) {
-		case '?':
-			p[0]++;
-			sprintf(rs, "\r\n%d", f->cr);	/* read actual value from struct and print */
-			isdn_tty_at_cout(rs, info);
-			break;
-		case '=':
-			p[0]++;
-			if (*p[0] == '?') {
-				p[0]++;
-				sprintf(rs, "\r\n0,1");		/* display online help */
-				isdn_tty_at_cout(rs, info);
-			} else {
-				par = isdn_getnum(p);
-				if ((par < 0) || (par > 1))
-					PARSE_ERROR1;
-				f->cr = par;
-#ifdef ISDN_TTY_FAX_STAT_DEBUG
-				printk(KERN_DEBUG "isdn_tty: Fax FCR=%d\n", par);
-#endif
-			}
-			break;
-		default:
-			PARSE_ERROR1;
-		}
-		return 0;
-	}
-	/* CTCRTY=value - ECM retry count */
-	if (!strncmp(p[0], "CTCRTY", 6)) {
-		p[0] += 6;
-		switch (*p[0]) {
-		case '?':
-			p[0]++;
-			sprintf(rs, "\r\n%d", f->ctcrty);
-			isdn_tty_at_cout(rs, info);
-			break;
-		case '=':
-			p[0]++;
-			if (*p[0] == '?') {
-				p[0]++;
-				sprintf(rs, "\r\n0-255");
-				isdn_tty_at_cout(rs, info);
-			} else {
-				par = isdn_getnum(p);
-				if ((par < 0) || (par > 255))
-					PARSE_ERROR1;
-				f->ctcrty = par;
-#ifdef ISDN_TTY_FAX_STAT_DEBUG
-				printk(KERN_DEBUG "isdn_tty: Fax FCTCRTY=%d\n", par);
-#endif
-			}
-			break;
-		default:
-			PARSE_ERROR1;
-		}
-		return 0;
-	}
-	/* DCC=vr,br,wd,ln,df,ec,bf,st - DCE capabilities parms */
-	if (!strncmp(p[0], "DCC", 3)) {
-		char *rp = &f->resolution;
-		int i;
-
-		p[0] += 3;
-		switch (*p[0]) {
-		case '?':
-			p[0]++;
-			strcpy(rs, "\r\n");
-			for (i = 0; i < 8; i++) {
-				sprintf(rss, "%c%s", rp[i] + 48,
-					(i < 7) ? "," : "");
-				strcat(rs, rss);
-			}
-			isdn_tty_at_cout(rs, info);
-			break;
-		case '=':
-			p[0]++;
-			if (*p[0] == '?') {
-				isdn_tty_at_cout("\r\n(0,1),(0-5),(0-2),(0-2),(0-3),(0-2),(0),(0-7)", info);
-				p[0]++;
-			} else {
-				for (i = 0; (((*p[0] >= '0') && (*p[0] <= '9')) || (*p[0] == ',')) && (i < 8); i++) {
-					if (*p[0] != ',') {
-						if ((*p[0] - 48) > maxdccval[i]) {
-							PARSE_ERROR1;
-						}
-						rp[i] = *p[0] - 48;
-						p[0]++;
-						if (*p[0] == ',')
-							p[0]++;
-					} else
-						p[0]++;
-				}
-#ifdef ISDN_TTY_FAX_STAT_DEBUG
-				printk(KERN_DEBUG "isdn_tty: Fax FDCC capabilities DCE=%d,%d,%d,%d,%d,%d,%d,%d\n",
-				       rp[0], rp[1], rp[2], rp[3], rp[4], rp[5], rp[6], rp[7]);
-#endif
-			}
-			break;
-		default:
-			PARSE_ERROR1;
-		}
-		return 0;
-	}
-	/* DIS=vr,br,wd,ln,df,ec,bf,st - current session parms */
-	if (!strncmp(p[0], "DIS", 3)) {
-		char *rp = &f->resolution;
-		int i;
-
-		p[0] += 3;
-		switch (*p[0]) {
-		case '?':
-			p[0]++;
-			strcpy(rs, "\r\n");
-			for (i = 0; i < 8; i++) {
-				sprintf(rss, "%c%s", rp[i] + 48,
-					(i < 7) ? "," : "");
-				strcat(rs, rss);
-			}
-			isdn_tty_at_cout(rs, info);
-			break;
-		case '=':
-			p[0]++;
-			if (*p[0] == '?') {
-				isdn_tty_at_cout("\r\n(0,1),(0-5),(0-2),(0-2),(0-3),(0-2),(0),(0-7)", info);
-				p[0]++;
-			} else {
-				for (i = 0; (((*p[0] >= '0') && (*p[0] <= '9')) || (*p[0] == ',')) && (i < 8); i++) {
-					if (*p[0] != ',') {
-						if ((*p[0] - 48) > maxdccval[i]) {
-							PARSE_ERROR1;
-						}
-						rp[i] = *p[0] - 48;
-						p[0]++;
-						if (*p[0] == ',')
-							p[0]++;
-					} else
-						p[0]++;
-				}
-#ifdef ISDN_TTY_FAX_STAT_DEBUG
-				printk(KERN_DEBUG "isdn_tty: Fax FDIS session parms=%d,%d,%d,%d,%d,%d,%d,%d\n",
-				       rp[0], rp[1], rp[2], rp[3], rp[4], rp[5], rp[6], rp[7]);
-#endif
-			}
-			break;
-		default:
-			PARSE_ERROR1;
-		}
-		return 0;
-	}
-	/* DR - Receive Phase C data command, initiates document reception */
-	if (!strncmp(p[0], "DR", 2)) {
-		p[0] += 2;
-		if ((info->faxonline & 16) &&	/* incoming connection */
-		    ((f->phase == ISDN_FAX_PHASE_B) || (f->phase == ISDN_FAX_PHASE_D))) {
-#ifdef ISDN_TTY_FAX_STAT_DEBUG
-			printk(KERN_DEBUG "isdn_tty: Fax FDR\n");
-#endif
-			f->code = ISDN_TTY_FAX_DR;
-			cmd.driver = info->isdn_driver;
-			cmd.arg = info->isdn_channel;
-			cmd.command = ISDN_CMD_FAXCMD;
-			isdn_command(&cmd);
-			if (f->phase == ISDN_FAX_PHASE_B) {
-				f->phase = ISDN_FAX_PHASE_C;
-			} else if (f->phase == ISDN_FAX_PHASE_D) {
-				switch (f->fet) {
-				case 0:	/* next page will be received */
-					f->phase = ISDN_FAX_PHASE_C;
-					isdn_tty_fax_modem_result(7, info);	/* CONNECT */
-					break;
-				case 1:	/* next doc will be received */
-					f->phase = ISDN_FAX_PHASE_B;
-					break;
-				case 2:	/* fax session is terminating */
-					f->phase = ISDN_FAX_PHASE_E;
-					break;
-				default:
-					PARSE_ERROR1;
-				}
-			}
-		} else {
-			PARSE_ERROR1;
-		}
-		return 1;
-	}
-	/* DT=df,vr,wd,ln - TX phase C data command (release DCE to proceed with negotiation) */
-	if (!strncmp(p[0], "DT", 2)) {
-		int i, val[] =
-			{4, 0, 2, 3};
-		char *rp = &f->resolution;
-
-		p[0] += 2;
-		if (!(info->faxonline & 1))	/* not outgoing connection */
-			PARSE_ERROR1;
-
-		for (i = 0; (((*p[0] >= '0') && (*p[0] <= '9')) || (*p[0] == ',')) && (i < 4); i++) {
-			if (*p[0] != ',') {
-				if ((*p[0] - 48) > maxdccval[val[i]]) {
-					PARSE_ERROR1;
-				}
-				rp[val[i]] = *p[0] - 48;
-				p[0]++;
-				if (*p[0] == ',')
-					p[0]++;
-			} else
-				p[0]++;
-		}
-#ifdef ISDN_TTY_FAX_STAT_DEBUG
-		printk(KERN_DEBUG "isdn_tty: Fax FDT tx data command parms=%d,%d,%d,%d\n",
-		       rp[4], rp[0], rp[2], rp[3]);
-#endif
-		if ((f->phase == ISDN_FAX_PHASE_B) || (f->phase == ISDN_FAX_PHASE_D)) {
-			f->code = ISDN_TTY_FAX_DT;
-			cmd.driver = info->isdn_driver;
-			cmd.arg = info->isdn_channel;
-			cmd.command = ISDN_CMD_FAXCMD;
-			isdn_command(&cmd);
-			if (f->phase == ISDN_FAX_PHASE_D) {
-				f->phase = ISDN_FAX_PHASE_C;
-				isdn_tty_fax_modem_result(7, info);	/* CONNECT */
-			}
-		} else {
-			PARSE_ERROR1;
-		}
-		return 1;
-	}
-	/* ECM=n - Error mode control 0=disabled, 2=enabled, handled by DCE alone incl. buff of partial pages */
-	if (!strncmp(p[0], "ECM", 3)) {
-		p[0] += 3;
-		switch (*p[0]) {
-		case '?':
-			p[0]++;
-			sprintf(rs, "\r\n%d", f->ecm);
-			isdn_tty_at_cout(rs, info);
-			break;
-		case '=':
-			p[0]++;
-			if (*p[0] == '?') {
-				p[0]++;
-				sprintf(rs, "\r\n0,2");
-				isdn_tty_at_cout(rs, info);
-			} else {
-				par = isdn_getnum(p);
-				if ((par != 0) && (par != 2))
-					PARSE_ERROR1;
-				f->ecm = par;
-#ifdef ISDN_TTY_FAX_STAT_DEBUG
-				printk(KERN_DEBUG "isdn_tty: Fax FECM=%d\n", par);
-#endif
-			}
-			break;
-		default:
-			PARSE_ERROR1;
-		}
-		return 0;
-	}
-	/* ET=n - End of page or document */
-	if (!strncmp(p[0], "ET=", 3)) {
-		p[0] += 3;
-		if (*p[0] == '?') {
-			p[0]++;
-			sprintf(rs, "\r\n0-2");
-			isdn_tty_at_cout(rs, info);
-		} else {
-			if ((f->phase != ISDN_FAX_PHASE_D) ||
-			    (!(info->faxonline & 1)))
-				PARSE_ERROR1;
-			par = isdn_getnum(p);
-			if ((par < 0) || (par > 2))
-				PARSE_ERROR1;
-			f->fet = par;
-			f->code = ISDN_TTY_FAX_ET;
-			cmd.driver = info->isdn_driver;
-			cmd.arg = info->isdn_channel;
-			cmd.command = ISDN_CMD_FAXCMD;
-			isdn_command(&cmd);
-#ifdef ISDN_TTY_FAX_STAT_DEBUG
-			printk(KERN_DEBUG "isdn_tty: Fax FET=%d\n", par);
-#endif
-			return 1;
-		}
-		return 0;
-	}
-	/* K - terminate */
-	if (!strncmp(p[0], "K", 1)) {
-		p[0] += 1;
-		if ((f->phase == ISDN_FAX_PHASE_IDLE) || (f->phase == ISDN_FAX_PHASE_E))
-			PARSE_ERROR1;
-		isdn_tty_modem_hup(info, 1);
-		return 1;
-	}
-	/* LID=string - local fax ID */
-	if (!strncmp(p[0], "LID", 3)) {
-		int i, r;
-		p[0] += 3;
-		switch (*p[0]) {
-		case '?':
-			p[0]++;
-			sprintf(rs, "\r\n\"%s\"", f->id);
-			isdn_tty_at_cout(rs, info);
-			break;
-		case '=':
-			p[0]++;
-			if (*p[0] == '?') {
-				p[0]++;
-				sprintf(rs, "\r\n\"STRING\"");
-				isdn_tty_at_cout(rs, info);
-			} else {
-				if (*p[0] == '"')
-					p[0]++;
-				for (i = 0; (*p[0]) && i < (FAXIDLEN - 1) && (*p[0] != '"'); i++) {
-					f->id[i] = *p[0]++;
-				}
-				if (*p[0] == '"')
-					p[0]++;
-				for (r = i; r < FAXIDLEN; r++) {
-					f->id[r] = 32;
-				}
-				f->id[FAXIDLEN - 1] = 0;
-#ifdef ISDN_TTY_FAX_STAT_DEBUG
-				printk(KERN_DEBUG "isdn_tty: Fax local ID \"%s\"\n", f->id);
-#endif
-			}
-			break;
-		default:
-			PARSE_ERROR1;
-		}
-		return 0;
-	}
-
-	/* MDL? - DCE Model       */
-	if (!strncmp(p[0], "MDL?", 4)) {
-		p[0] += 4;
-#ifdef ISDN_TTY_FAX_STAT_DEBUG
-		printk(KERN_DEBUG "isdn_tty: FMDL?\n");
-#endif
-		isdn_tty_at_cout("\r\nisdn4linux", info);
-		return 0;
-	}
-	/* MFR? - DCE Manufacturer */
-	if (!strncmp(p[0], "MFR?", 4)) {
-		p[0] += 4;
-#ifdef ISDN_TTY_FAX_STAT_DEBUG
-		printk(KERN_DEBUG "isdn_tty: FMFR?\n");
-#endif
-		isdn_tty_at_cout("\r\nisdn4linux", info);
-		return 0;
-	}
-	/* MINSP=n - Minimum Speed for Phase C */
-	if (!strncmp(p[0], "MINSP", 5)) {
-		p[0] += 5;
-		switch (*p[0]) {
-		case '?':
-			p[0]++;
-			sprintf(rs, "\r\n%d", f->minsp);
-			isdn_tty_at_cout(rs, info);
-			break;
-		case '=':
-			p[0]++;
-			if (*p[0] == '?') {
-				p[0]++;
-				sprintf(rs, "\r\n0-5");
-				isdn_tty_at_cout(rs, info);
-			} else {
-				par = isdn_getnum(p);
-				if ((par < 0) || (par > 5))
-					PARSE_ERROR1;
-				f->minsp = par;
-#ifdef ISDN_TTY_FAX_STAT_DEBUG
-				printk(KERN_DEBUG "isdn_tty: Fax FMINSP=%d\n", par);
-#endif
-			}
-			break;
-		default:
-			PARSE_ERROR1;
-		}
-		return 0;
-	}
-	/* PHCTO=value - DTE phase C timeout */
-	if (!strncmp(p[0], "PHCTO", 5)) {
-		p[0] += 5;
-		switch (*p[0]) {
-		case '?':
-			p[0]++;
-			sprintf(rs, "\r\n%d", f->phcto);
-			isdn_tty_at_cout(rs, info);
-			break;
-		case '=':
-			p[0]++;
-			if (*p[0] == '?') {
-				p[0]++;
-				sprintf(rs, "\r\n0-255");
-				isdn_tty_at_cout(rs, info);
-			} else {
-				par = isdn_getnum(p);
-				if ((par < 0) || (par > 255))
-					PARSE_ERROR1;
-				f->phcto = par;
-#ifdef ISDN_TTY_FAX_STAT_DEBUG
-				printk(KERN_DEBUG "isdn_tty: Fax FPHCTO=%d\n", par);
-#endif
-			}
-			break;
-		default:
-			PARSE_ERROR1;
-		}
-		return 0;
-	}
-
-	/* REL=n - Phase C received EOL alignment */
-	if (!strncmp(p[0], "REL", 3)) {
-		p[0] += 3;
-		switch (*p[0]) {
-		case '?':
-			p[0]++;
-			sprintf(rs, "\r\n%d", f->rel);
-			isdn_tty_at_cout(rs, info);
-			break;
-		case '=':
-			p[0]++;
-			if (*p[0] == '?') {
-				p[0]++;
-				sprintf(rs, "\r\n0,1");
-				isdn_tty_at_cout(rs, info);
-			} else {
-				par = isdn_getnum(p);
-				if ((par < 0) || (par > 1))
-					PARSE_ERROR1;
-				f->rel = par;
-#ifdef ISDN_TTY_FAX_STAT_DEBUG
-				printk(KERN_DEBUG "isdn_tty: Fax FREL=%d\n", par);
-#endif
-			}
-			break;
-		default:
-			PARSE_ERROR1;
-		}
-		return 0;
-	}
-	/* REV? - DCE Revision */
-	if (!strncmp(p[0], "REV?", 4)) {
-		p[0] += 4;
-#ifdef ISDN_TTY_FAX_STAT_DEBUG
-		printk(KERN_DEBUG "isdn_tty: FREV?\n");
-#endif
-		strcpy(rss, isdn_tty_fax_revision);
-		sprintf(rs, "\r\nRev: %s", isdn_getrev(rss));
-		isdn_tty_at_cout(rs, info);
-		return 0;
-	}
-
-	/* Phase C Transmit Data Block Size */
-	if (!strncmp(p[0], "TBC=", 4)) {	/* dummy, not used */
-		p[0] += 4;
-#ifdef ISDN_TTY_FAX_STAT_DEBUG
-		printk(KERN_DEBUG "isdn_tty: Fax FTBC=%c\n", *p[0]);
-#endif
-		switch (*p[0]) {
-		case '0':
-			p[0]++;
-			break;
-		default:
-			PARSE_ERROR1;
-		}
-		return 0;
-	}
-	printk(KERN_DEBUG "isdn_tty: unknown token=>AT+F%s<\n", p[0]);
-	PARSE_ERROR1;
-}
-
-int
-isdn_tty_cmd_PLUSF_FAX(char **p, modem_info *info)
-{
-	if (TTY_IS_FCLASS2(info))
-		return (isdn_tty_cmd_FCLASS2(p, info));
-	else if (TTY_IS_FCLASS1(info))
-		return (isdn_tty_cmd_FCLASS1(p, info));
-	PARSE_ERROR1;
-}
diff --git a/drivers/isdn/i4l/isdn_ttyfax.h b/drivers/isdn/i4l/isdn_ttyfax.h
deleted file mode 100644
index ccda4fcf8f7b..000000000000
--- a/drivers/isdn/i4l/isdn_ttyfax.h
+++ /dev/null
@@ -1,17 +0,0 @@
-/* $Id: isdn_ttyfax.h,v 1.1.2.2 2004/01/12 22:37:19 keil Exp $
- *
- * header for Linux ISDN subsystem, tty_fax related functions (linklevel).
- *
- * Copyright 1999   by Armin Schindler (mac@melware.de)
- * Copyright 1999   by Ralf Spachmann (mel@melware.de)
- * Copyright 1999   by Cytronics & Melware
- *
- * This software may be used and distributed according to the terms
- * of the GNU General Public License, incorporated herein by reference.
- *
- */
-
-
-#define XON	0x11
-#define XOFF	0x13
-#define DC2	0x12
diff --git a/drivers/isdn/i4l/isdn_v110.c b/drivers/isdn/i4l/isdn_v110.c
deleted file mode 100644
index d11fe76f138f..000000000000
--- a/drivers/isdn/i4l/isdn_v110.c
+++ /dev/null
@@ -1,625 +0,0 @@
-/* $Id: isdn_v110.c,v 1.1.2.2 2004/01/12 22:37:19 keil Exp $
- *
- * Linux ISDN subsystem, V.110 related functions (linklevel).
- *
- * Copyright by Thomas Pfeiffer (pfeiffer@pds.de)
- *
- * This software may be used and distributed according to the terms
- * of the GNU General Public License, incorporated herein by reference.
- *
- */
-
-#include <linux/string.h>
-#include <linux/kernel.h>
-#include <linux/slab.h>
-#include <linux/mm.h>
-#include <linux/delay.h>
-
-#include <linux/isdn.h>
-#include "isdn_v110.h"
-
-#undef ISDN_V110_DEBUG
-
-char *isdn_v110_revision = "$Revision: 1.1.2.2 $";
-
-#define V110_38400 255
-#define V110_19200  15
-#define V110_9600    3
-
-/*
- * The following data are precoded matrices, online and offline matrix
- * for 9600, 19200 und 38400, respectively
- */
-static unsigned char V110_OnMatrix_9600[] =
-{0xfc, 0xfc, 0xfc, 0xfc, 0xff, 0xff, 0xff, 0xfd, 0xff, 0xff,
- 0xff, 0xfd, 0xff, 0xff, 0xff, 0xfd, 0xff, 0xff, 0xff, 0xfd,
- 0xfd, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xfd, 0xff, 0xff,
- 0xff, 0xfd, 0xff, 0xff, 0xff, 0xfd, 0xff, 0xff, 0xff, 0xfd};
-
-static unsigned char V110_OffMatrix_9600[] =
-{0xfc, 0xfc, 0xfc, 0xfc, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
- 0xfd, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
-
-static unsigned char V110_OnMatrix_19200[] =
-{0xf0, 0xf0, 0xff, 0xf7, 0xff, 0xf7, 0xff, 0xf7, 0xff, 0xf7,
- 0xfd, 0xff, 0xff, 0xf7, 0xff, 0xf7, 0xff, 0xf7, 0xff, 0xf7};
-
-static unsigned char V110_OffMatrix_19200[] =
-{0xf0, 0xf0, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
- 0xfd, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
-
-static unsigned char V110_OnMatrix_38400[] =
-{0x00, 0x7f, 0x7f, 0x7f, 0x7f, 0xfd, 0x7f, 0x7f, 0x7f, 0x7f};
-
-static unsigned char V110_OffMatrix_38400[] =
-{0x00, 0xff, 0xff, 0xff, 0xff, 0xfd, 0xff, 0xff, 0xff, 0xff};
-
-/*
- * FlipBits reorders sequences of keylen bits in one byte.
- * E.g. source order 7654321 will be converted to 45670123 when keylen = 4,
- * and to 67452301 when keylen = 2. This is necessary because ordering on
- * the isdn line is the other way.
- */
-static inline unsigned char
-FlipBits(unsigned char c, int keylen)
-{
-	unsigned char b = c;
-	unsigned char bit = 128;
-	int i;
-	int j;
-	int hunks = (8 / keylen);
-
-	c = 0;
-	for (i = 0; i < hunks; i++) {
-		for (j = 0; j < keylen; j++) {
-			if (b & (bit >> j))
-				c |= bit >> (keylen - j - 1);
-		}
-		bit >>= keylen;
-	}
-	return c;
-}
-
-
-/* isdn_v110_open allocates and initializes private V.110 data
- * structures and returns a pointer to these.
- */
-static isdn_v110_stream *
-isdn_v110_open(unsigned char key, int hdrlen, int maxsize)
-{
-	int i;
-	isdn_v110_stream *v;
-
-	if ((v = kzalloc(sizeof(isdn_v110_stream), GFP_ATOMIC)) == NULL)
-		return NULL;
-	v->key = key;
-	v->nbits = 0;
-	for (i = 0; key & (1 << i); i++)
-		v->nbits++;
-
-	v->nbytes = 8 / v->nbits;
-	v->decodelen = 0;
-
-	switch (key) {
-	case V110_38400:
-		v->OnlineFrame = V110_OnMatrix_38400;
-		v->OfflineFrame = V110_OffMatrix_38400;
-		break;
-	case V110_19200:
-		v->OnlineFrame = V110_OnMatrix_19200;
-		v->OfflineFrame = V110_OffMatrix_19200;
-		break;
-	default:
-		v->OnlineFrame = V110_OnMatrix_9600;
-		v->OfflineFrame = V110_OffMatrix_9600;
-		break;
-	}
-	v->framelen = v->nbytes * 10;
-	v->SyncInit = 5;
-	v->introducer = 0;
-	v->dbit = 1;
-	v->b = 0;
-	v->skbres = hdrlen;
-	v->maxsize = maxsize - hdrlen;
-	if ((v->encodebuf = kmalloc(maxsize, GFP_ATOMIC)) == NULL) {
-		kfree(v);
-		return NULL;
-	}
-	return v;
-}
-
-/* isdn_v110_close frees private V.110 data structures */
-void
-isdn_v110_close(isdn_v110_stream *v)
-{
-	if (v == NULL)
-		return;
-#ifdef ISDN_V110_DEBUG
-	printk(KERN_DEBUG "v110 close\n");
-#endif
-	kfree(v->encodebuf);
-	kfree(v);
-}
-
-
-/*
- * ValidHeaderBytes return the number of valid bytes in v->decodebuf
- */
-static int
-ValidHeaderBytes(isdn_v110_stream *v)
-{
-	int i;
-	for (i = 0; (i < v->decodelen) && (i < v->nbytes); i++)
-		if ((v->decodebuf[i] & v->key) != 0)
-			break;
-	return i;
-}
-
-/*
- * SyncHeader moves the decodebuf ptr to the next valid header
- */
-static void
-SyncHeader(isdn_v110_stream *v)
-{
-	unsigned char *rbuf = v->decodebuf;
-	int len = v->decodelen;
-
-	if (len == 0)
-		return;
-	for (rbuf++, len--; len > 0; len--, rbuf++)	/* such den SyncHeader in buf ! */
-		if ((*rbuf & v->key) == 0)	/* erstes byte gefunden ?       */
-			break;  /* jupp!                        */
-	if (len)
-		memcpy(v->decodebuf, rbuf, len);
-
-	v->decodelen = len;
-#ifdef ISDN_V110_DEBUG
-	printk(KERN_DEBUG "isdn_v110: Header resync\n");
-#endif
-}
-
-/* DecodeMatrix takes n (n>=1) matrices (v110 frames, 10 bytes) where
-   len is the number of matrix-lines. len must be a multiple of 10, i.e.
-   only complete matices must be given.
-   From these, netto data is extracted and returned in buf. The return-value
-   is the bytecount of the decoded data.
-*/
-static int
-DecodeMatrix(isdn_v110_stream *v, unsigned char *m, int len, unsigned char *buf)
-{
-	int line = 0;
-	int buflen = 0;
-	int mbit = 64;
-	int introducer = v->introducer;
-	int dbit = v->dbit;
-	unsigned char b = v->b;
-
-	while (line < len) {    /* Are we done with all lines of the matrix? */
-		if ((line % 10) == 0) {	/* the 0. line of the matrix is always 0 ! */
-			if (m[line] != 0x00) {	/* not 0 ? -> error! */
-#ifdef ISDN_V110_DEBUG
-				printk(KERN_DEBUG "isdn_v110: DecodeMatrix, V110 Bad Header\n");
-				/* returning now is not the right thing, though :-( */
-#endif
-			}
-			line++; /* next line of matrix */
-			continue;
-		} else if ((line % 10) == 5) {	/* in line 5 there's only e-bits ! */
-			if ((m[line] & 0x70) != 0x30) {	/* 011 has to be at the beginning! */
-#ifdef ISDN_V110_DEBUG
-				printk(KERN_DEBUG "isdn_v110: DecodeMatrix, V110 Bad 5th line\n");
-				/* returning now is not the right thing, though :-( */
-#endif
-			}
-			line++; /* next line */
-			continue;
-		} else if (!introducer) {	/* every byte starts with 10 (stopbit, startbit) */
-			introducer = (m[line] & mbit) ? 0 : 1;	/* current bit of the matrix */
-		next_byte:
-			if (mbit > 2) {	/* was it the last bit in this line ? */
-				mbit >>= 1;	/* no -> take next */
-				continue;
-			}       /* otherwise start with leftmost bit in the next line */
-			mbit = 64;
-			line++;
-			continue;
-		} else {        /* otherwise we need to set a data bit */
-			if (m[line] & mbit)	/* was that bit set in the matrix ? */
-				b |= dbit;	/* yes -> set it in the data byte */
-			else
-				b &= dbit - 1;	/* no -> clear it in the data byte */
-			if (dbit < 128)	/* is that data byte done ? */
-				dbit <<= 1;	/* no, got the next bit */
-			else {  /* data byte is done */
-				buf[buflen++] = b;	/* copy byte into the output buffer */
-				introducer = b = 0;	/* init of the intro sequence and of the data byte */
-				dbit = 1;	/* next we look for the 0th bit */
-			}
-			goto next_byte;	/* look for next bit in the matrix */
-		}
-	}
-	v->introducer = introducer;
-	v->dbit = dbit;
-	v->b = b;
-	return buflen;          /* return number of bytes in the output buffer */
-}
-
-/*
- * DecodeStream receives V.110 coded data from the input stream. It recovers the
- * original frames.
- * The input stream doesn't need to be framed
- */
-struct sk_buff *
-isdn_v110_decode(isdn_v110_stream *v, struct sk_buff *skb)
-{
-	int i;
-	int j;
-	int len;
-	unsigned char *v110_buf;
-	unsigned char *rbuf;
-
-	if (!skb) {
-		printk(KERN_WARNING "isdn_v110_decode called with NULL skb!\n");
-		return NULL;
-	}
-	rbuf = skb->data;
-	len = skb->len;
-	if (v == NULL) {
-		/* invalid handle, no chance to proceed */
-		printk(KERN_WARNING "isdn_v110_decode called with NULL stream!\n");
-		dev_kfree_skb(skb);
-		return NULL;
-	}
-	if (v->decodelen == 0)  /* cache empty?               */
-		for (; len > 0; len--, rbuf++)	/* scan for SyncHeader in buf */
-			if ((*rbuf & v->key) == 0)
-				break;	/* found first byte           */
-	if (len == 0) {
-		dev_kfree_skb(skb);
-		return NULL;
-	}
-	/* copy new data to decode-buffer */
-	memcpy(&(v->decodebuf[v->decodelen]), rbuf, len);
-	v->decodelen += len;
-ReSync:
-	if (v->decodelen < v->nbytes) {	/* got a new header ? */
-		dev_kfree_skb(skb);
-		return NULL;    /* no, try later      */
-	}
-	if (ValidHeaderBytes(v) != v->nbytes) {	/* is that a valid header? */
-		SyncHeader(v);  /* no -> look for header */
-		goto ReSync;
-	}
-	len = (v->decodelen - (v->decodelen % (10 * v->nbytes))) / v->nbytes;
-	if ((v110_buf = kmalloc(len, GFP_ATOMIC)) == NULL) {
-		printk(KERN_WARNING "isdn_v110_decode: Couldn't allocate v110_buf\n");
-		dev_kfree_skb(skb);
-		return NULL;
-	}
-	for (i = 0; i < len; i++) {
-		v110_buf[i] = 0;
-		for (j = 0; j < v->nbytes; j++)
-			v110_buf[i] |= (v->decodebuf[(i * v->nbytes) + j] & v->key) << (8 - ((j + 1) * v->nbits));
-		v110_buf[i] = FlipBits(v110_buf[i], v->nbits);
-	}
-	v->decodelen = (v->decodelen % (10 * v->nbytes));
-	memcpy(v->decodebuf, &(v->decodebuf[len * v->nbytes]), v->decodelen);
-
-	skb_trim(skb, DecodeMatrix(v, v110_buf, len, skb->data));
-	kfree(v110_buf);
-	if (skb->len)
-		return skb;
-	else {
-		kfree_skb(skb);
-		return NULL;
-	}
-}
-
-/* EncodeMatrix takes input data in buf, len is the bytecount.
-   Data is encoded into v110 frames in m. Return value is the number of
-   matrix-lines generated.
-*/
-static int
-EncodeMatrix(unsigned char *buf, int len, unsigned char *m, int mlen)
-{
-	int line = 0;
-	int i = 0;
-	int mbit = 128;
-	int dbit = 1;
-	int introducer = 3;
-	int ibit[] = {0, 1, 1};
-
-	while ((i < len) && (line < mlen)) {	/* while we still have input data */
-		switch (line % 10) {	/* in which line of the matrix are we? */
-		case 0:
-			m[line++] = 0x00;	/* line 0 is always 0 */
-			mbit = 128;	/* go on with the 7th bit */
-			break;
-		case 5:
-			m[line++] = 0xbf;	/* line 5 is always 10111111 */
-			mbit = 128;	/* go on with the 7th bit */
-			break;
-		}
-		if (line >= mlen) {
-			printk(KERN_WARNING "isdn_v110 (EncodeMatrix): buffer full!\n");
-			return line;
-		}
-	next_bit:
-		switch (mbit) { /* leftmost or rightmost bit ? */
-		case 1:
-			line++;	/* rightmost -> go to next line */
-			if (line >= mlen) {
-				printk(KERN_WARNING "isdn_v110 (EncodeMatrix): buffer full!\n");
-				return line;
-			}
-			/* fall through */
-		case 128:
-			m[line] = 128;	/* leftmost -> set byte to 1000000 */
-			mbit = 64;	/* current bit in the matrix line */
-			continue;
-		}
-		if (introducer) {	/* set 110 sequence ? */
-			introducer--;	/* set on digit less */
-			m[line] |= ibit[introducer] ? mbit : 0;	/* set corresponding bit */
-			mbit >>= 1;	/* bit of matrix line  >> 1 */
-			goto next_bit;	/* and go on there */
-		}               /* else push data bits into the matrix! */
-		m[line] |= (buf[i] & dbit) ? mbit : 0;	/* set data bit in matrix */
-		if (dbit == 128) {	/* was it the last one? */
-			dbit = 1;	/* then go on with first bit of  */
-			i++;            /* next byte in input buffer */
-			if (i < len)	/* input buffer done ? */
-				introducer = 3;	/* no, write introducer 110 */
-			else {  /* input buffer done ! */
-				m[line] |= (mbit - 1) & 0xfe;	/* set remaining bits in line to 1 */
-				break;
-			}
-		} else          /* not the last data bit */
-			dbit <<= 1;	/* then go to next data bit */
-		mbit >>= 1;     /* go to next bit of matrix */
-		goto next_bit;
-
-	}
-	/* if necessary, generate remaining lines of the matrix... */
-	if ((line) && ((line + 10) < mlen))
-		switch (++line % 10) {
-		case 1:
-			m[line++] = 0xfe;
-			/* fall through */
-		case 2:
-			m[line++] = 0xfe;
-			/* fall through */
-		case 3:
-			m[line++] = 0xfe;
-			/* fall through */
-		case 4:
-			m[line++] = 0xfe;
-			/* fall through */
-		case 5:
-			m[line++] = 0xbf;
-			/* fall through */
-		case 6:
-			m[line++] = 0xfe;
-			/* fall through */
-		case 7:
-			m[line++] = 0xfe;
-			/* fall through */
-		case 8:
-			m[line++] = 0xfe;
-			/* fall through */
-		case 9:
-			m[line++] = 0xfe;
-		}
-	return line;            /* that's how many lines we have */
-}
-
-/*
- * Build a sync frame.
- */
-static struct sk_buff *
-isdn_v110_sync(isdn_v110_stream *v)
-{
-	struct sk_buff *skb;
-
-	if (v == NULL) {
-		/* invalid handle, no chance to proceed */
-		printk(KERN_WARNING "isdn_v110_sync called with NULL stream!\n");
-		return NULL;
-	}
-	if ((skb = dev_alloc_skb(v->framelen + v->skbres))) {
-		skb_reserve(skb, v->skbres);
-		skb_put_data(skb, v->OfflineFrame, v->framelen);
-	}
-	return skb;
-}
-
-/*
- * Build an idle frame.
- */
-static struct sk_buff *
-isdn_v110_idle(isdn_v110_stream *v)
-{
-	struct sk_buff *skb;
-
-	if (v == NULL) {
-		/* invalid handle, no chance to proceed */
-		printk(KERN_WARNING "isdn_v110_sync called with NULL stream!\n");
-		return NULL;
-	}
-	if ((skb = dev_alloc_skb(v->framelen + v->skbres))) {
-		skb_reserve(skb, v->skbres);
-		skb_put_data(skb, v->OnlineFrame, v->framelen);
-	}
-	return skb;
-}
-
-struct sk_buff *
-isdn_v110_encode(isdn_v110_stream *v, struct sk_buff *skb)
-{
-	int i;
-	int j;
-	int rlen;
-	int mlen;
-	int olen;
-	int size;
-	int sval1;
-	int sval2;
-	int nframes;
-	unsigned char *v110buf;
-	unsigned char *rbuf;
-	struct sk_buff *nskb;
-
-	if (v == NULL) {
-		/* invalid handle, no chance to proceed */
-		printk(KERN_WARNING "isdn_v110_encode called with NULL stream!\n");
-		return NULL;
-	}
-	if (!skb) {
-		/* invalid skb, no chance to proceed */
-		printk(KERN_WARNING "isdn_v110_encode called with NULL skb!\n");
-		return NULL;
-	}
-	rlen = skb->len;
-	nframes = (rlen + 3) / 4;
-	v110buf = v->encodebuf;
-	if ((nframes * 40) > v->maxsize) {
-		size = v->maxsize;
-		rlen = v->maxsize / 40;
-	} else
-		size = nframes * 40;
-	if (!(nskb = dev_alloc_skb(size + v->skbres + sizeof(int)))) {
-		printk(KERN_WARNING "isdn_v110_encode: Couldn't alloc skb\n");
-		return NULL;
-	}
-	skb_reserve(nskb, v->skbres + sizeof(int));
-	if (skb->len == 0) {
-		skb_put_data(nskb, v->OnlineFrame, v->framelen);
-		*((int *)skb_push(nskb, sizeof(int))) = 0;
-		return nskb;
-	}
-	mlen = EncodeMatrix(skb->data, rlen, v110buf, size);
-	/* now distribute 2 or 4 bits each to the output stream! */
-	rbuf = skb_put(nskb, size);
-	olen = 0;
-	sval1 = 8 - v->nbits;
-	sval2 = v->key << sval1;
-	for (i = 0; i < mlen; i++) {
-		v110buf[i] = FlipBits(v110buf[i], v->nbits);
-		for (j = 0; j < v->nbytes; j++) {
-			if (size--)
-				*rbuf++ = ~v->key | (((v110buf[i] << (j * v->nbits)) & sval2) >> sval1);
-			else {
-				printk(KERN_WARNING "isdn_v110_encode: buffers full!\n");
-				goto buffer_full;
-			}
-			olen++;
-		}
-	}
-buffer_full:
-	skb_trim(nskb, olen);
-	*((int *)skb_push(nskb, sizeof(int))) = rlen;
-	return nskb;
-}
-
-int
-isdn_v110_stat_callback(int idx, isdn_ctrl *c)
-{
-	isdn_v110_stream *v = NULL;
-	int i;
-	int ret = 0;
-
-	if (idx < 0)
-		return 0;
-	switch (c->command) {
-	case ISDN_STAT_BSENT:
-		/* Keep the send-queue of the driver filled
-		 * with frames:
-		 * If number of outstanding frames < 3,
-		 * send down an Idle-Frame (or an Sync-Frame, if
-		 * v->SyncInit != 0).
-		 */
-		if (!(v = dev->v110[idx]))
-			return 0;
-		atomic_inc(&dev->v110use[idx]);
-		for (i = 0; i * v->framelen < c->parm.length; i++) {
-			if (v->skbidle > 0) {
-				v->skbidle--;
-				ret = 1;
-			} else {
-				if (v->skbuser > 0)
-					v->skbuser--;
-				ret = 0;
-			}
-		}
-		for (i = v->skbuser + v->skbidle; i < 2; i++) {
-			struct sk_buff *skb;
-			if (v->SyncInit > 0)
-				skb = isdn_v110_sync(v);
-			else
-				skb = isdn_v110_idle(v);
-			if (skb) {
-				if (dev->drv[c->driver]->interface->writebuf_skb(c->driver, c->arg, 1, skb) <= 0) {
-					dev_kfree_skb(skb);
-					break;
-				} else {
-					if (v->SyncInit)
-						v->SyncInit--;
-					v->skbidle++;
-				}
-			} else
-				break;
-		}
-		atomic_dec(&dev->v110use[idx]);
-		return ret;
-	case ISDN_STAT_DHUP:
-	case ISDN_STAT_BHUP:
-		while (1) {
-			atomic_inc(&dev->v110use[idx]);
-			if (atomic_dec_and_test(&dev->v110use[idx])) {
-				isdn_v110_close(dev->v110[idx]);
-				dev->v110[idx] = NULL;
-				break;
-			}
-			mdelay(1);
-		}
-		break;
-	case ISDN_STAT_BCONN:
-		if (dev->v110emu[idx] && (dev->v110[idx] == NULL)) {
-			int hdrlen = dev->drv[c->driver]->interface->hl_hdrlen;
-			int maxsize = dev->drv[c->driver]->interface->maxbufsize;
-			atomic_inc(&dev->v110use[idx]);
-			switch (dev->v110emu[idx]) {
-			case ISDN_PROTO_L2_V11096:
-				dev->v110[idx] = isdn_v110_open(V110_9600, hdrlen, maxsize);
-				break;
-			case ISDN_PROTO_L2_V11019:
-				dev->v110[idx] = isdn_v110_open(V110_19200, hdrlen, maxsize);
-				break;
-			case ISDN_PROTO_L2_V11038:
-				dev->v110[idx] = isdn_v110_open(V110_38400, hdrlen, maxsize);
-				break;
-			default:;
-			}
-			if ((v = dev->v110[idx])) {
-				while (v->SyncInit) {
-					struct sk_buff *skb = isdn_v110_sync(v);
-					if (dev->drv[c->driver]->interface->writebuf_skb(c->driver, c->arg, 1, skb) <= 0) {
-						dev_kfree_skb(skb);
-						/* Unable to send, try later */
-						break;
-					}
-					v->SyncInit--;
-					v->skbidle++;
-				}
-			} else
-				printk(KERN_WARNING "isdn_v110: Couldn't open stream for chan %d\n", idx);
-			atomic_dec(&dev->v110use[idx]);
-		}
-		break;
-	default:
-		return 0;
-	}
-	return 0;
-}
diff --git a/drivers/isdn/i4l/isdn_v110.h b/drivers/isdn/i4l/isdn_v110.h
deleted file mode 100644
index de774ab598c9..000000000000
--- a/drivers/isdn/i4l/isdn_v110.h
+++ /dev/null
@@ -1,29 +0,0 @@
-/* $Id: isdn_v110.h,v 1.1.2.2 2004/01/12 22:37:19 keil Exp $
- *
- * Linux ISDN subsystem, V.110 related functions (linklevel).
- *
- * Copyright by Thomas Pfeiffer (pfeiffer@pds.de)
- *
- * This software may be used and distributed according to the terms
- * of the GNU General Public License, incorporated herein by reference.
- *
- */
-
-#ifndef _isdn_v110_h_
-#define _isdn_v110_h_
-
-/*
- * isdn_v110_encode will take raw data and encode it using V.110
- */
-extern struct sk_buff *isdn_v110_encode(isdn_v110_stream *, struct sk_buff *);
-
-/*
- * isdn_v110_decode receives V.110 coded data from the stream and rebuilds
- * frames from them. The source stream doesn't need to be framed.
- */
-extern struct sk_buff *isdn_v110_decode(isdn_v110_stream *, struct sk_buff *);
-
-extern int isdn_v110_stat_callback(int, isdn_ctrl *);
-extern void isdn_v110_close(isdn_v110_stream *v);
-
-#endif
diff --git a/drivers/isdn/i4l/isdn_x25iface.c b/drivers/isdn/i4l/isdn_x25iface.c
deleted file mode 100644
index 48bfbcb4a09d..000000000000
--- a/drivers/isdn/i4l/isdn_x25iface.c
+++ /dev/null
@@ -1,332 +0,0 @@
-/* $Id: isdn_x25iface.c,v 1.1.2.2 2004/01/12 22:37:19 keil Exp $
- *
- * Linux ISDN subsystem, X.25 related functions
- *
- * This software may be used and distributed according to the terms
- * of the GNU General Public License, incorporated herein by reference.
- *
- * stuff needed to support the Linux X.25 PLP code on top of devices that
- * can provide a lab_b service using the concap_proto mechanism.
- * This module supports a network interface which provides lapb_sematics
- * -- as defined in Documentation/networking/x25-iface.txt -- to
- * the upper layer and assumes that the lower layer provides a reliable
- * data link service by means of the concap_device_ops callbacks.
- *
- * Only protocol specific stuff goes here. Device specific stuff
- * goes to another -- device related -- concap_proto support source file.
- *
- */
-
-/* #include <linux/isdn.h> */
-#include <linux/netdevice.h>
-#include <linux/concap.h>
-#include <linux/slab.h>
-#include <linux/wanrouter.h>
-#include <net/x25device.h>
-#include "isdn_x25iface.h"
-
-/* for debugging messages not to cause an oops when device pointer is NULL*/
-#define MY_DEVNAME(dev)  ((dev) ? (dev)->name : "DEVICE UNSPECIFIED")
-
-
-typedef struct isdn_x25iface_proto_data {
-	int magic;
-	enum wan_states state;
-	/* Private stuff, not to be accessed via proto_data. We provide the
-	   other storage for the concap_proto instance here as well,
-	   enabling us to allocate both with just one kmalloc(): */
-	struct concap_proto priv;
-} ix25_pdata_t;
-
-
-
-/* is now in header file (extern): struct concap_proto * isdn_x25iface_proto_new(void); */
-static void isdn_x25iface_proto_del(struct concap_proto *);
-static int isdn_x25iface_proto_close(struct concap_proto *);
-static int isdn_x25iface_proto_restart(struct concap_proto *,
-				       struct net_device *,
-				       struct concap_device_ops *);
-static int isdn_x25iface_xmit(struct concap_proto *, struct sk_buff *);
-static int isdn_x25iface_receive(struct concap_proto *, struct sk_buff *);
-static int isdn_x25iface_connect_ind(struct concap_proto *);
-static int isdn_x25iface_disconn_ind(struct concap_proto *);
-
-
-static struct concap_proto_ops ix25_pops = {
-	.proto_new = &isdn_x25iface_proto_new,
-	.proto_del = &isdn_x25iface_proto_del,
-	.restart = &isdn_x25iface_proto_restart,
-	.close = &isdn_x25iface_proto_close,
-	.encap_and_xmit = &isdn_x25iface_xmit,
-	.data_ind = &isdn_x25iface_receive,
-	.connect_ind = &isdn_x25iface_connect_ind,
-	.disconn_ind = &isdn_x25iface_disconn_ind
-};
-
-/* error message helper function */
-static void illegal_state_warn(unsigned state, unsigned char firstbyte)
-{
-	printk(KERN_WARNING "isdn_x25iface: firstbyte %x illegal in"
-	       "current state %d\n", firstbyte, state);
-}
-
-/* check protocol data field for consistency */
-static int pdata_is_bad(ix25_pdata_t *pda) {
-
-	if (pda  &&  pda->magic == ISDN_X25IFACE_MAGIC) return 0;
-	printk(KERN_WARNING
-	       "isdn_x25iface_xxx: illegal pointer to proto data\n");
-	return 1;
-}
-
-/* create a new x25 interface protocol instance
- */
-struct concap_proto *isdn_x25iface_proto_new(void)
-{
-	ix25_pdata_t *tmp = kmalloc(sizeof(ix25_pdata_t), GFP_KERNEL);
-	IX25DEBUG("isdn_x25iface_proto_new\n");
-	if (tmp) {
-		tmp->magic = ISDN_X25IFACE_MAGIC;
-		tmp->state = WAN_UNCONFIGURED;
-		/* private data space used to hold the concap_proto data.
-		   Only to be accessed via the returned pointer */
-		spin_lock_init(&tmp->priv.lock);
-		tmp->priv.dops       = NULL;
-		tmp->priv.net_dev    = NULL;
-		tmp->priv.pops       = &ix25_pops;
-		tmp->priv.flags      = 0;
-		tmp->priv.proto_data = tmp;
-		return (&(tmp->priv));
-	}
-	return NULL;
-};
-
-/* close the x25iface encapsulation protocol
- */
-static int isdn_x25iface_proto_close(struct concap_proto *cprot) {
-
-	ix25_pdata_t *tmp;
-	int ret = 0;
-	ulong flags;
-
-	if (!cprot) {
-		printk(KERN_ERR "isdn_x25iface_proto_close: "
-		       "invalid concap_proto pointer\n");
-		return -1;
-	}
-	IX25DEBUG("isdn_x25iface_proto_close %s \n", MY_DEVNAME(cprot->net_dev));
-	spin_lock_irqsave(&cprot->lock, flags);
-	cprot->dops    = NULL;
-	cprot->net_dev = NULL;
-	tmp = cprot->proto_data;
-	if (pdata_is_bad(tmp)) {
-		ret = -1;
-	} else {
-		tmp->state = WAN_UNCONFIGURED;
-	}
-	spin_unlock_irqrestore(&cprot->lock, flags);
-	return ret;
-}
-
-/* Delete the x25iface encapsulation protocol instance
- */
-static void isdn_x25iface_proto_del(struct concap_proto *cprot) {
-
-	ix25_pdata_t *tmp;
-
-	IX25DEBUG("isdn_x25iface_proto_del \n");
-	if (!cprot) {
-		printk(KERN_ERR "isdn_x25iface_proto_del: "
-		       "concap_proto pointer is NULL\n");
-		return;
-	}
-	tmp = cprot->proto_data;
-	if (tmp == NULL) {
-		printk(KERN_ERR "isdn_x25iface_proto_del: inconsistent "
-		       "proto_data pointer (maybe already deleted?)\n");
-		return;
-	}
-	/* close if the protocol is still open */
-	if (cprot->dops) isdn_x25iface_proto_close(cprot);
-	/* freeing the storage should be sufficient now. But some additional
-	   settings might help to catch wild pointer bugs */
-	tmp->magic = 0;
-	cprot->proto_data = NULL;
-
-	kfree(tmp);
-	return;
-}
-
-/* (re-)initialize the data structures for x25iface encapsulation
- */
-static int isdn_x25iface_proto_restart(struct concap_proto *cprot,
-				       struct net_device *ndev,
-				       struct concap_device_ops *dops)
-{
-	ix25_pdata_t *pda = cprot->proto_data;
-	ulong flags;
-
-	IX25DEBUG("isdn_x25iface_proto_restart %s \n", MY_DEVNAME(ndev));
-
-	if (pdata_is_bad(pda)) return -1;
-
-	if (!(dops && dops->data_req && dops->connect_req
-	      && dops->disconn_req)) {
-		printk(KERN_WARNING "isdn_x25iface_restart: required dops"
-		       " missing\n");
-		isdn_x25iface_proto_close(cprot);
-		return -1;
-	}
-	spin_lock_irqsave(&cprot->lock, flags);
-	cprot->net_dev = ndev;
-	cprot->pops = &ix25_pops;
-	cprot->dops = dops;
-	pda->state = WAN_DISCONNECTED;
-	spin_unlock_irqrestore(&cprot->lock, flags);
-	return 0;
-}
-
-/* deliver a dl_data frame received from i4l HL driver to the network layer
- */
-static int isdn_x25iface_receive(struct concap_proto *cprot, struct sk_buff *skb)
-{
-	IX25DEBUG("isdn_x25iface_receive %s \n", MY_DEVNAME(cprot->net_dev));
-	if (((ix25_pdata_t *)(cprot->proto_data))
-	    ->state == WAN_CONNECTED) {
-		if (skb_push(skb, 1)) {
-			skb->data[0] = X25_IFACE_DATA;
-			skb->protocol = x25_type_trans(skb, cprot->net_dev);
-			netif_rx(skb);
-			return 0;
-		}
-	}
-	printk(KERN_WARNING "isdn_x25iface_receive %s: not connected, skb dropped\n", MY_DEVNAME(cprot->net_dev));
-	dev_kfree_skb(skb);
-	return -1;
-}
-
-/* a connection set up is indicated by lower layer
- */
-static int isdn_x25iface_connect_ind(struct concap_proto *cprot)
-{
-	struct sk_buff *skb;
-	enum wan_states *state_p
-		= &(((ix25_pdata_t *)(cprot->proto_data))->state);
-	IX25DEBUG("isdn_x25iface_connect_ind %s \n"
-		  , MY_DEVNAME(cprot->net_dev));
-	if (*state_p == WAN_UNCONFIGURED) {
-		printk(KERN_WARNING
-		       "isdn_x25iface_connect_ind while unconfigured %s\n"
-		       , MY_DEVNAME(cprot->net_dev));
-		return -1;
-	}
-	*state_p = WAN_CONNECTED;
-
-	skb = dev_alloc_skb(1);
-	if (skb) {
-		skb_put_u8(skb, X25_IFACE_CONNECT);
-		skb->protocol = x25_type_trans(skb, cprot->net_dev);
-		netif_rx(skb);
-		return 0;
-	} else {
-		printk(KERN_WARNING "isdn_x25iface_connect_ind: "
-		       " out of memory -- disconnecting\n");
-		cprot->dops->disconn_req(cprot);
-		return -1;
-	}
-}
-
-/* a disconnect is indicated by lower layer
- */
-static int isdn_x25iface_disconn_ind(struct concap_proto *cprot)
-{
-	struct sk_buff *skb;
-	enum wan_states *state_p
-		= &(((ix25_pdata_t *)(cprot->proto_data))->state);
-	IX25DEBUG("isdn_x25iface_disconn_ind %s \n", MY_DEVNAME(cprot->net_dev));
-	if (*state_p == WAN_UNCONFIGURED) {
-		printk(KERN_WARNING
-		       "isdn_x25iface_disconn_ind while unconfigured\n");
-		return -1;
-	}
-	if (!cprot->net_dev) return -1;
-	*state_p = WAN_DISCONNECTED;
-	skb = dev_alloc_skb(1);
-	if (skb) {
-		skb_put_u8(skb, X25_IFACE_DISCONNECT);
-		skb->protocol = x25_type_trans(skb, cprot->net_dev);
-		netif_rx(skb);
-		return 0;
-	} else {
-		printk(KERN_WARNING "isdn_x25iface_disconn_ind:"
-		       " out of memory\n");
-		return -1;
-	}
-}
-
-/* process a frame handed over to us from linux network layer. First byte
-   semantics as defined in Documentation/networking/x25-iface.txt
-*/
-static int isdn_x25iface_xmit(struct concap_proto *cprot, struct sk_buff *skb)
-{
-	unsigned char firstbyte = skb->data[0];
-	enum wan_states *state = &((ix25_pdata_t *)cprot->proto_data)->state;
-	int ret = 0;
-	IX25DEBUG("isdn_x25iface_xmit: %s first=%x state=%d\n",
-		  MY_DEVNAME(cprot->net_dev), firstbyte, *state);
-	switch (firstbyte) {
-	case X25_IFACE_DATA:
-		if (*state == WAN_CONNECTED) {
-			skb_pull(skb, 1);
-			netif_trans_update(cprot->net_dev);
-			ret = (cprot->dops->data_req(cprot, skb));
-			/* prepare for future retransmissions */
-			if (ret) skb_push(skb, 1);
-			return ret;
-		}
-		illegal_state_warn(*state, firstbyte);
-		break;
-	case X25_IFACE_CONNECT:
-		if (*state == WAN_DISCONNECTED) {
-			*state = WAN_CONNECTING;
-			ret = cprot->dops->connect_req(cprot);
-			if (ret) {
-				/* reset state and notify upper layer about
-				 * immidiatly failed attempts */
-				isdn_x25iface_disconn_ind(cprot);
-			}
-		} else {
-			illegal_state_warn(*state, firstbyte);
-		}
-		break;
-	case X25_IFACE_DISCONNECT:
-		switch (*state) {
-		case WAN_DISCONNECTED:
-			/* Should not happen. However, give upper layer a
-			   chance to recover from inconstistency  but don't
-			   trust the lower layer sending the disconn_confirm
-			   when already disconnected */
-			printk(KERN_WARNING "isdn_x25iface_xmit: disconnect "
-			       " requested while disconnected\n");
-			isdn_x25iface_disconn_ind(cprot);
-			break; /* prevent infinite loops */
-		case WAN_CONNECTING:
-		case WAN_CONNECTED:
-			*state = WAN_DISCONNECTED;
-			cprot->dops->disconn_req(cprot);
-			break;
-		default:
-			illegal_state_warn(*state, firstbyte);
-		}
-		break;
-	case X25_IFACE_PARAMS:
-		printk(KERN_WARNING "isdn_x25iface_xmit: setting of lapb"
-		       " options not yet supported\n");
-		break;
-	default:
-		printk(KERN_WARNING "isdn_x25iface_xmit: frame with illegal"
-		       " first byte %x ignored:\n", firstbyte);
-	}
-	dev_kfree_skb(skb);
-	return 0;
-}
diff --git a/drivers/isdn/i4l/isdn_x25iface.h b/drivers/isdn/i4l/isdn_x25iface.h
deleted file mode 100644
index ca08e082cf7c..000000000000
--- a/drivers/isdn/i4l/isdn_x25iface.h
+++ /dev/null
@@ -1,30 +0,0 @@
-/* $Id: isdn_x25iface.h,v 1.1.2.2 2004/01/12 22:37:19 keil Exp $
- *
- * header for Linux ISDN subsystem, x.25 related functions
- *
- * This software may be used and distributed according to the terms
- * of the GNU General Public License, incorporated herein by reference.
- *
- */
-
-#ifndef _LINUX_ISDN_X25IFACE_H
-#define _LINUX_ISDN_X25IFACE_H
-
-#define ISDN_X25IFACE_MAGIC 0x1e75a2b9
-/* #define DEBUG_ISDN_X25 if you want isdn_x25 debugging messages */
-#ifdef DEBUG_ISDN_X25
-#   define IX25DEBUG(fmt, args...) printk(KERN_DEBUG fmt, ##args)
-#else
-#   define IX25DEBUG(fmt, args...)
-#endif
-
-#include <linux/skbuff.h>
-#include <linux/isdn.h>
-#include <linux/concap.h>
-
-extern struct concap_proto_ops *isdn_x25iface_concap_proto_ops_pt;
-extern struct concap_proto *isdn_x25iface_proto_new(void);
-
-
-
-#endif
diff --git a/drivers/isdn/isdnloop/Makefile b/drivers/isdn/isdnloop/Makefile
deleted file mode 100644
index 5ff4c0e09768..000000000000
--- a/drivers/isdn/isdnloop/Makefile
+++ /dev/null
@@ -1,6 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-only
-# Makefile for the isdnloop ISDN device driver
-
-# Each configuration option enables a list of files.
-
-obj-$(CONFIG_ISDN_DRV_LOOP)	+= isdnloop.o
diff --git a/drivers/isdn/isdnloop/isdnloop.c b/drivers/isdn/isdnloop/isdnloop.c
deleted file mode 100644
index 755c6bbc9553..000000000000
--- a/drivers/isdn/isdnloop/isdnloop.c
+++ /dev/null
@@ -1,1528 +0,0 @@
-/* $Id: isdnloop.c,v 1.11.6.7 2001/11/11 19:54:31 kai Exp $
- *
- * ISDN low-level module implementing a dummy loop driver.
- *
- * Copyright 1997 by Fritz Elfert (fritz@isdn4linux.de)
- *
- * This software may be used and distributed according to the terms
- * of the GNU General Public License, incorporated herein by reference.
- *
- */
-
-#include <linux/module.h>
-#include <linux/interrupt.h>
-#include <linux/slab.h>
-#include <linux/init.h>
-#include <linux/sched.h>
-#include "isdnloop.h"
-
-static char *isdnloop_id = "loop0";
-
-MODULE_DESCRIPTION("ISDN4Linux: Pseudo Driver that simulates an ISDN card");
-MODULE_AUTHOR("Fritz Elfert");
-MODULE_LICENSE("GPL");
-module_param(isdnloop_id, charp, 0);
-MODULE_PARM_DESC(isdnloop_id, "ID-String of first card");
-
-static int isdnloop_addcard(char *);
-
-/*
- * Free queue completely.
- *
- * Parameter:
- *   card    = pointer to card struct
- *   channel = channel number
- */
-static void
-isdnloop_free_queue(isdnloop_card *card, int channel)
-{
-	struct sk_buff_head *queue = &card->bqueue[channel];
-
-	skb_queue_purge(queue);
-	card->sndcount[channel] = 0;
-}
-
-/*
- * Send B-Channel data to another virtual card.
- * This routine is called via timer-callback from isdnloop_pollbchan().
- *
- * Parameter:
- *   card = pointer to card struct.
- *   ch   = channel number (0-based)
- */
-static void
-isdnloop_bchan_send(isdnloop_card *card, int ch)
-{
-	isdnloop_card *rcard = card->rcard[ch];
-	int rch = card->rch[ch], len, ack;
-	struct sk_buff *skb;
-	isdn_ctrl cmd;
-
-	while (card->sndcount[ch]) {
-		skb = skb_dequeue(&card->bqueue[ch]);
-		if (skb) {
-			len = skb->len;
-			card->sndcount[ch] -= len;
-			ack = *(skb->head); /* used as scratch area */
-			cmd.driver = card->myid;
-			cmd.arg = ch;
-			if (rcard) {
-				rcard->interface.rcvcallb_skb(rcard->myid, rch, skb);
-			} else {
-				printk(KERN_WARNING "isdnloop: no rcard, skb dropped\n");
-				dev_kfree_skb(skb);
-
-			}
-			cmd.command = ISDN_STAT_BSENT;
-			cmd.parm.length = len;
-			card->interface.statcallb(&cmd);
-		} else
-			card->sndcount[ch] = 0;
-	}
-}
-
-/*
- * Send/Receive Data to/from the B-Channel.
- * This routine is called via timer-callback.
- * It schedules itself while any B-Channel is open.
- *
- * Parameter:
- *   data = pointer to card struct, set by kernel timer.data
- */
-static void
-isdnloop_pollbchan(struct timer_list *t)
-{
-	isdnloop_card *card = from_timer(card, t, rb_timer);
-	unsigned long flags;
-
-	if (card->flags & ISDNLOOP_FLAGS_B1ACTIVE)
-		isdnloop_bchan_send(card, 0);
-	if (card->flags & ISDNLOOP_FLAGS_B2ACTIVE)
-		isdnloop_bchan_send(card, 1);
-	if (card->flags & (ISDNLOOP_FLAGS_B1ACTIVE | ISDNLOOP_FLAGS_B2ACTIVE)) {
-		/* schedule b-channel polling again */
-		spin_lock_irqsave(&card->isdnloop_lock, flags);
-		card->rb_timer.expires = jiffies + ISDNLOOP_TIMER_BCREAD;
-		add_timer(&card->rb_timer);
-		card->flags |= ISDNLOOP_FLAGS_RBTIMER;
-		spin_unlock_irqrestore(&card->isdnloop_lock, flags);
-	} else
-		card->flags &= ~ISDNLOOP_FLAGS_RBTIMER;
-}
-
-/*
- * Parse ICN-type setup string and fill fields of setup-struct
- * with parsed data.
- *
- * Parameter:
- *   setup = setup string, format: [caller-id],si1,si2,[called-id]
- *   cmd   = pointer to struct to be filled.
- */
-static void
-isdnloop_parse_setup(char *setup, isdn_ctrl *cmd)
-{
-	char *t = setup;
-	char *s = strchr(t, ',');
-
-	*s++ = '\0';
-	strlcpy(cmd->parm.setup.phone, t, sizeof(cmd->parm.setup.phone));
-	s = strchr(t = s, ',');
-	*s++ = '\0';
-	if (!strlen(t))
-		cmd->parm.setup.si1 = 0;
-	else
-		cmd->parm.setup.si1 = simple_strtoul(t, NULL, 10);
-	s = strchr(t = s, ',');
-	*s++ = '\0';
-	if (!strlen(t))
-		cmd->parm.setup.si2 = 0;
-	else
-		cmd->parm.setup.si2 =
-			simple_strtoul(t, NULL, 10);
-	strlcpy(cmd->parm.setup.eazmsn, s, sizeof(cmd->parm.setup.eazmsn));
-	cmd->parm.setup.plan = 0;
-	cmd->parm.setup.screen = 0;
-}
-
-typedef struct isdnloop_stat {
-	char *statstr;
-	int command;
-	int action;
-} isdnloop_stat;
-/* *INDENT-OFF* */
-static isdnloop_stat isdnloop_stat_table[] = {
-	{"BCON_",          ISDN_STAT_BCONN, 1}, /* B-Channel connected        */
-	{"BDIS_",          ISDN_STAT_BHUP,  2}, /* B-Channel disconnected     */
-	{"DCON_",          ISDN_STAT_DCONN, 0}, /* D-Channel connected        */
-	{"DDIS_",          ISDN_STAT_DHUP,  0}, /* D-Channel disconnected     */
-	{"DCAL_I",         ISDN_STAT_ICALL, 3}, /* Incoming call dialup-line  */
-	{"DSCA_I",         ISDN_STAT_ICALL, 3}, /* Incoming call 1TR6-SPV     */
-	{"FCALL",          ISDN_STAT_ICALL, 4}, /* Leased line connection up  */
-	{"CIF",            ISDN_STAT_CINF,  5}, /* Charge-info, 1TR6-type     */
-	{"AOC",            ISDN_STAT_CINF,  6}, /* Charge-info, DSS1-type     */
-	{"CAU",            ISDN_STAT_CAUSE, 7}, /* Cause code                 */
-	{"TEI OK",         ISDN_STAT_RUN,   0}, /* Card connected to wallplug */
-	{"E_L1: ACT FAIL", ISDN_STAT_BHUP,  8}, /* Layer-1 activation failed  */
-	{"E_L2: DATA LIN", ISDN_STAT_BHUP,  8}, /* Layer-2 data link lost     */
-	{"E_L1: ACTIVATION FAILED",
-	 ISDN_STAT_BHUP,  8},         /* Layer-1 activation failed  */
-	{NULL, 0, -1}
-};
-/* *INDENT-ON* */
-
-
-/*
- * Parse Status message-strings from virtual card.
- * Depending on status, call statcallb for sending messages to upper
- * levels. Also set/reset B-Channel active-flags.
- *
- * Parameter:
- *   status  = status string to parse.
- *   channel = channel where message comes from.
- *   card    = card where message comes from.
- */
-static void
-isdnloop_parse_status(u_char *status, int channel, isdnloop_card *card)
-{
-	isdnloop_stat *s = isdnloop_stat_table;
-	int action = -1;
-	isdn_ctrl cmd;
-
-	while (s->statstr) {
-		if (!strncmp(status, s->statstr, strlen(s->statstr))) {
-			cmd.command = s->command;
-			action = s->action;
-			break;
-		}
-		s++;
-	}
-	if (action == -1)
-		return;
-	cmd.driver = card->myid;
-	cmd.arg = channel;
-	switch (action) {
-	case 1:
-		/* BCON_x */
-		card->flags |= (channel) ?
-			ISDNLOOP_FLAGS_B2ACTIVE : ISDNLOOP_FLAGS_B1ACTIVE;
-		break;
-	case 2:
-		/* BDIS_x */
-		card->flags &= ~((channel) ?
-				 ISDNLOOP_FLAGS_B2ACTIVE : ISDNLOOP_FLAGS_B1ACTIVE);
-		isdnloop_free_queue(card, channel);
-		break;
-	case 3:
-		/* DCAL_I and DSCA_I */
-		isdnloop_parse_setup(status + 6, &cmd);
-		break;
-	case 4:
-		/* FCALL */
-		sprintf(cmd.parm.setup.phone, "LEASED%d", card->myid);
-		sprintf(cmd.parm.setup.eazmsn, "%d", channel + 1);
-		cmd.parm.setup.si1 = 7;
-		cmd.parm.setup.si2 = 0;
-		cmd.parm.setup.plan = 0;
-		cmd.parm.setup.screen = 0;
-		break;
-	case 5:
-		/* CIF */
-		strlcpy(cmd.parm.num, status + 3, sizeof(cmd.parm.num));
-		break;
-	case 6:
-		/* AOC */
-		snprintf(cmd.parm.num, sizeof(cmd.parm.num), "%d",
-			 (int) simple_strtoul(status + 7, NULL, 16));
-		break;
-	case 7:
-		/* CAU */
-		status += 3;
-		if (strlen(status) == 4)
-			snprintf(cmd.parm.num, sizeof(cmd.parm.num), "%s%c%c",
-				 status + 2, *status, *(status + 1));
-		else
-			strlcpy(cmd.parm.num, status + 1, sizeof(cmd.parm.num));
-		break;
-	case 8:
-		/* Misc Errors on L1 and L2 */
-		card->flags &= ~ISDNLOOP_FLAGS_B1ACTIVE;
-		isdnloop_free_queue(card, 0);
-		cmd.arg = 0;
-		cmd.driver = card->myid;
-		card->interface.statcallb(&cmd);
-		cmd.command = ISDN_STAT_DHUP;
-		cmd.arg = 0;
-		cmd.driver = card->myid;
-		card->interface.statcallb(&cmd);
-		cmd.command = ISDN_STAT_BHUP;
-		card->flags &= ~ISDNLOOP_FLAGS_B2ACTIVE;
-		isdnloop_free_queue(card, 1);
-		cmd.arg = 1;
-		cmd.driver = card->myid;
-		card->interface.statcallb(&cmd);
-		cmd.command = ISDN_STAT_DHUP;
-		cmd.arg = 1;
-		cmd.driver = card->myid;
-		break;
-	}
-	card->interface.statcallb(&cmd);
-}
-
-/*
- * Store a cwcharacter into ringbuffer for reading from /dev/isdnctrl
- *
- * Parameter:
- *   card = pointer to card struct.
- *   c    = char to store.
- */
-static void
-isdnloop_putmsg(isdnloop_card *card, unsigned char c)
-{
-	ulong flags;
-
-	spin_lock_irqsave(&card->isdnloop_lock, flags);
-	*card->msg_buf_write++ = (c == 0xff) ? '\n' : c;
-	if (card->msg_buf_write == card->msg_buf_read) {
-		if (++card->msg_buf_read > card->msg_buf_end)
-			card->msg_buf_read = card->msg_buf;
-	}
-	if (card->msg_buf_write > card->msg_buf_end)
-		card->msg_buf_write = card->msg_buf;
-	spin_unlock_irqrestore(&card->isdnloop_lock, flags);
-}
-
-/*
- * Poll a virtual cards message queue.
- * If there are new status-replies from the card, copy them to
- * ringbuffer for reading on /dev/isdnctrl and call
- * isdnloop_parse_status() for processing them. Watch for special
- * Firmware bootmessage and parse it, to get the D-Channel protocol.
- * If there are B-Channels open, initiate a timer-callback to
- * isdnloop_pollbchan().
- * This routine is called periodically via timer interrupt.
- *
- * Parameter:
- *   data = pointer to card struct
- */
-static void
-isdnloop_polldchan(struct timer_list *t)
-{
-	isdnloop_card *card = from_timer(card, t, st_timer);
-	struct sk_buff *skb;
-	int avail;
-	int left;
-	u_char c;
-	int ch;
-	unsigned long flags;
-	u_char *p;
-	isdn_ctrl cmd;
-
-	skb = skb_dequeue(&card->dqueue);
-	if (skb)
-		avail = skb->len;
-	else
-		avail = 0;
-	for (left = avail; left > 0; left--) {
-		c = *skb->data;
-		skb_pull(skb, 1);
-		isdnloop_putmsg(card, c);
-		card->imsg[card->iptr] = c;
-		if (card->iptr < 59)
-			card->iptr++;
-		if (!skb->len) {
-			avail++;
-			isdnloop_putmsg(card, '\n');
-			card->imsg[card->iptr] = 0;
-			card->iptr = 0;
-			if (card->imsg[0] == '0' && card->imsg[1] >= '0' &&
-			    card->imsg[1] <= '2' && card->imsg[2] == ';') {
-				ch = (card->imsg[1] - '0') - 1;
-				p = &card->imsg[3];
-				isdnloop_parse_status(p, ch, card);
-			} else {
-				p = card->imsg;
-				if (!strncmp(p, "DRV1.", 5)) {
-					printk(KERN_INFO "isdnloop: (%s) %s\n", CID, p);
-					if (!strncmp(p + 7, "TC", 2)) {
-						card->ptype = ISDN_PTYPE_1TR6;
-						card->interface.features |= ISDN_FEATURE_P_1TR6;
-						printk(KERN_INFO
-						       "isdnloop: (%s) 1TR6-Protocol loaded and running\n", CID);
-					}
-					if (!strncmp(p + 7, "EC", 2)) {
-						card->ptype = ISDN_PTYPE_EURO;
-						card->interface.features |= ISDN_FEATURE_P_EURO;
-						printk(KERN_INFO
-						       "isdnloop: (%s) Euro-Protocol loaded and running\n", CID);
-					}
-					continue;
-
-				}
-			}
-		}
-	}
-	if (avail) {
-		cmd.command = ISDN_STAT_STAVAIL;
-		cmd.driver = card->myid;
-		cmd.arg = avail;
-		card->interface.statcallb(&cmd);
-	}
-	if (card->flags & (ISDNLOOP_FLAGS_B1ACTIVE | ISDNLOOP_FLAGS_B2ACTIVE))
-		if (!(card->flags & ISDNLOOP_FLAGS_RBTIMER)) {
-			/* schedule b-channel polling */
-			card->flags |= ISDNLOOP_FLAGS_RBTIMER;
-			spin_lock_irqsave(&card->isdnloop_lock, flags);
-			del_timer(&card->rb_timer);
-			card->rb_timer.expires = jiffies + ISDNLOOP_TIMER_BCREAD;
-			add_timer(&card->rb_timer);
-			spin_unlock_irqrestore(&card->isdnloop_lock, flags);
-		}
-	/* schedule again */
-	spin_lock_irqsave(&card->isdnloop_lock, flags);
-	card->st_timer.expires = jiffies + ISDNLOOP_TIMER_DCREAD;
-	add_timer(&card->st_timer);
-	spin_unlock_irqrestore(&card->isdnloop_lock, flags);
-}
-
-/*
- * Append a packet to the transmit buffer-queue.
- *
- * Parameter:
- *   channel = Number of B-channel
- *   skb     = packet to send.
- *   card    = pointer to card-struct
- * Return:
- *   Number of bytes transferred, -E??? on error
- */
-static int
-isdnloop_sendbuf(int channel, struct sk_buff *skb, isdnloop_card *card)
-{
-	int len = skb->len;
-	unsigned long flags;
-	struct sk_buff *nskb;
-
-	if (len > 4000) {
-		printk(KERN_WARNING
-		       "isdnloop: Send packet too large\n");
-		return -EINVAL;
-	}
-	if (len) {
-		if (!(card->flags & (channel ? ISDNLOOP_FLAGS_B2ACTIVE : ISDNLOOP_FLAGS_B1ACTIVE)))
-			return 0;
-		if (card->sndcount[channel] > ISDNLOOP_MAX_SQUEUE)
-			return 0;
-		spin_lock_irqsave(&card->isdnloop_lock, flags);
-		nskb = dev_alloc_skb(skb->len);
-		if (nskb) {
-			skb_copy_from_linear_data(skb,
-						  skb_put(nskb, len), len);
-			skb_queue_tail(&card->bqueue[channel], nskb);
-			dev_kfree_skb(skb);
-		} else
-			len = 0;
-		card->sndcount[channel] += len;
-		spin_unlock_irqrestore(&card->isdnloop_lock, flags);
-	}
-	return len;
-}
-
-/*
- * Read the messages from the card's ringbuffer
- *
- * Parameter:
- *   buf  = pointer to buffer.
- *   len  = number of bytes to read.
- *   user = flag, 1: called from userlevel 0: called from kernel.
- *   card = pointer to card struct.
- * Return:
- *   number of bytes actually transferred.
- */
-static int
-isdnloop_readstatus(u_char __user *buf, int len, isdnloop_card *card)
-{
-	int count;
-	u_char __user *p;
-
-	for (p = buf, count = 0; count < len; p++, count++) {
-		if (card->msg_buf_read == card->msg_buf_write)
-			return count;
-		if (put_user(*card->msg_buf_read++, p))
-			return -EFAULT;
-		if (card->msg_buf_read > card->msg_buf_end)
-			card->msg_buf_read = card->msg_buf;
-	}
-	return count;
-}
-
-/*
- * Simulate a card's response by appending it to the cards
- * message queue.
- *
- * Parameter:
- *   card = pointer to card struct.
- *   s    = pointer to message-string.
- *   ch   = channel: 0 = generic messages, 1 and 2 = D-channel messages.
- * Return:
- *   0 on success, 1 on memory squeeze.
- */
-static int
-isdnloop_fake(isdnloop_card *card, char *s, int ch)
-{
-	struct sk_buff *skb;
-	int len = strlen(s) + ((ch >= 0) ? 3 : 0);
-	skb = dev_alloc_skb(len);
-	if (!skb) {
-		printk(KERN_WARNING "isdnloop: Out of memory in isdnloop_fake\n");
-		return 1;
-	}
-	if (ch >= 0)
-		sprintf(skb_put(skb, 3), "%02d;", ch);
-	skb_put_data(skb, s, strlen(s));
-	skb_queue_tail(&card->dqueue, skb);
-	return 0;
-}
-/* *INDENT-OFF* */
-static isdnloop_stat isdnloop_cmd_table[] = {
-	{"BCON_R",         0,  1},	/* B-Channel connect        */
-	{"BCON_I",         0, 17},	/* B-Channel connect ind    */
-	{"BDIS_R",         0,  2},	/* B-Channel disconnect     */
-	{"DDIS_R",         0,  3},	/* D-Channel disconnect     */
-	{"DCON_R",         0, 16},	/* D-Channel connect        */
-	{"DSCA_R",         0,  4},	/* Dial 1TR6-SPV     */
-	{"DCAL_R",         0,  5},	/* Dial */
-	{"EAZC",           0,  6},	/* Clear EAZ listener */
-	{"EAZ",            0,  7},	/* Set EAZ listener */
-	{"SEEAZ",          0,  8},	/* Get EAZ listener */
-	{"MSN",            0,  9},	/* Set/Clear MSN listener */
-	{"MSALL",          0, 10},	/* Set multi MSN listeners */
-	{"SETSIL",         0, 11},	/* Set SI list     */
-	{"SEESIL",         0, 12},	/* Get SI list     */
-	{"SILC",           0, 13},	/* Clear SI list     */
-	{"LOCK",           0, -1},	/* LOCK channel     */
-	{"UNLOCK",         0, -1},	/* UNLOCK channel     */
-	{"FV2ON",          1, 14},	/* Leased mode on               */
-	{"FV2OFF",         1, 15},	/* Leased mode off              */
-	{NULL, 0, -1}
-};
-/* *INDENT-ON* */
-
-
-/*
- * Simulate an error-response from a card.
- *
- * Parameter:
- *   card = pointer to card struct.
- */
-static void
-isdnloop_fake_err(isdnloop_card *card)
-{
-	char buf[64];
-
-	snprintf(buf, sizeof(buf), "E%s", card->omsg);
-	isdnloop_fake(card, buf, -1);
-	isdnloop_fake(card, "NAK", -1);
-}
-
-static u_char ctable_eu[] = {0x00, 0x11, 0x01, 0x12};
-static u_char ctable_1t[] = {0x00, 0x3b, 0x01, 0x3a};
-
-/*
- * Assemble a simplified cause message depending on the
- * D-channel protocol used.
- *
- * Parameter:
- *   card = pointer to card struct.
- *   loc  = location: 0 = local, 1 = remote.
- *   cau  = cause: 1 = busy, 2 = nonexistent callerid, 3 = no user responding.
- * Return:
- *   Pointer to buffer containing the assembled message.
- */
-static char *
-isdnloop_unicause(isdnloop_card *card, int loc, int cau)
-{
-	static char buf[6];
-
-	switch (card->ptype) {
-	case ISDN_PTYPE_EURO:
-		sprintf(buf, "E%02X%02X", (loc) ? 4 : 2, ctable_eu[cau]);
-		break;
-	case ISDN_PTYPE_1TR6:
-		sprintf(buf, "%02X44", ctable_1t[cau]);
-		break;
-	default:
-		return "0000";
-	}
-	return buf;
-}
-
-/*
- * Release a virtual connection. Called from timer interrupt, when
- * called party did not respond.
- *
- * Parameter:
- *   card = pointer to card struct.
- *   ch   = channel (0-based)
- */
-static void
-isdnloop_atimeout(isdnloop_card *card, int ch)
-{
-	unsigned long flags;
-	char buf[60];
-
-	spin_lock_irqsave(&card->isdnloop_lock, flags);
-	if (card->rcard[ch]) {
-		isdnloop_fake(card->rcard[ch], "DDIS_I", card->rch[ch] + 1);
-		card->rcard[ch]->rcard[card->rch[ch]] = NULL;
-		card->rcard[ch] = NULL;
-	}
-	isdnloop_fake(card, "DDIS_I", ch + 1);
-	/* No user responding */
-	sprintf(buf, "CAU%s", isdnloop_unicause(card, 1, 3));
-	isdnloop_fake(card, buf, ch + 1);
-	spin_unlock_irqrestore(&card->isdnloop_lock, flags);
-}
-
-/*
- * Wrapper for isdnloop_atimeout().
- */
-static void
-isdnloop_atimeout0(struct timer_list *t)
-{
-	isdnloop_card *card = from_timer(card, t, c_timer[0]);
-
-	isdnloop_atimeout(card, 0);
-}
-
-/*
- * Wrapper for isdnloop_atimeout().
- */
-static void
-isdnloop_atimeout1(struct timer_list *t)
-{
-	isdnloop_card *card = from_timer(card, t, c_timer[1]);
-
-	isdnloop_atimeout(card, 1);
-}
-
-/*
- * Install a watchdog for a user, not responding.
- *
- * Parameter:
- *   card = pointer to card struct.
- *   ch   = channel to watch for.
- */
-static void
-isdnloop_start_ctimer(isdnloop_card *card, int ch)
-{
-	unsigned long flags;
-
-	spin_lock_irqsave(&card->isdnloop_lock, flags);
-	timer_setup(&card->c_timer[ch], ch ? isdnloop_atimeout1
-					   : isdnloop_atimeout0, 0);
-	card->c_timer[ch].expires = jiffies + ISDNLOOP_TIMER_ALERTWAIT;
-	add_timer(&card->c_timer[ch]);
-	spin_unlock_irqrestore(&card->isdnloop_lock, flags);
-}
-
-/*
- * Kill a pending channel watchdog.
- *
- * Parameter:
- *   card = pointer to card struct.
- *   ch   = channel (0-based).
- */
-static void
-isdnloop_kill_ctimer(isdnloop_card *card, int ch)
-{
-	unsigned long flags;
-
-	spin_lock_irqsave(&card->isdnloop_lock, flags);
-	del_timer(&card->c_timer[ch]);
-	spin_unlock_irqrestore(&card->isdnloop_lock, flags);
-}
-
-static u_char si2bit[] = {0, 1, 0, 0, 0, 2, 0, 4, 0, 0};
-static u_char bit2si[] = {1, 5, 7};
-
-/*
- * Try finding a listener for an outgoing call.
- *
- * Parameter:
- *   card = pointer to calling card.
- *   p    = pointer to ICN-type setup-string.
- *   lch  = channel of calling card.
- *   cmd  = pointer to struct to be filled when parsing setup.
- * Return:
- *   0 = found match, alerting should happen.
- *   1 = found matching number but it is busy.
- *   2 = no matching listener.
- *   3 = found matching number but SI does not match.
- */
-static int
-isdnloop_try_call(isdnloop_card *card, char *p, int lch, isdn_ctrl *cmd)
-{
-	isdnloop_card *cc = cards;
-	unsigned long flags;
-	int ch;
-	int num_match;
-	int i;
-	char *e;
-	char nbuf[32];
-
-	isdnloop_parse_setup(p, cmd);
-	while (cc) {
-		for (ch = 0; ch < 2; ch++) {
-			/* Exclude ourself */
-			if ((cc == card) && (ch == lch))
-				continue;
-			num_match = 0;
-			switch (cc->ptype) {
-			case ISDN_PTYPE_EURO:
-				for (i = 0; i < 3; i++)
-					if (!(strcmp(cc->s0num[i], cmd->parm.setup.phone)))
-						num_match = 1;
-				break;
-			case ISDN_PTYPE_1TR6:
-				e = cc->eazlist[ch];
-				while (*e) {
-					sprintf(nbuf, "%s%c", cc->s0num[0], *e);
-					if (!(strcmp(nbuf, cmd->parm.setup.phone)))
-						num_match = 1;
-					e++;
-				}
-			}
-			if (num_match) {
-				spin_lock_irqsave(&card->isdnloop_lock, flags);
-				/* channel idle? */
-				if (!(cc->rcard[ch])) {
-					/* Check SI */
-					if (!(si2bit[cmd->parm.setup.si1] & cc->sil[ch])) {
-						spin_unlock_irqrestore(&card->isdnloop_lock, flags);
-						return 3;
-					}
-					/* ch is idle, si and number matches */
-					cc->rcard[ch] = card;
-					cc->rch[ch] = lch;
-					card->rcard[lch] = cc;
-					card->rch[lch] = ch;
-					spin_unlock_irqrestore(&card->isdnloop_lock, flags);
-					return 0;
-				} else {
-					spin_unlock_irqrestore(&card->isdnloop_lock, flags);
-					/* num matches, but busy */
-					if (ch == 1)
-						return 1;
-				}
-			}
-		}
-		cc = cc->next;
-	}
-	return 2;
-}
-
-/*
- * Depending on D-channel protocol and caller/called, modify
- * phone number.
- *
- * Parameter:
- *   card   = pointer to card struct.
- *   phone  = pointer phone number.
- *   caller = flag: 1 = caller, 0 = called.
- * Return:
- *   pointer to new phone number.
- */
-static char *
-isdnloop_vstphone(isdnloop_card *card, char *phone, int caller)
-{
-	int i;
-	static char nphone[30];
-
-	if (!card) {
-		printk("BUG!!!\n");
-		return "";
-	}
-	switch (card->ptype) {
-	case ISDN_PTYPE_EURO:
-		if (caller) {
-			for (i = 0; i < 2; i++)
-				if (!(strcmp(card->s0num[i], phone)))
-					return phone;
-			return card->s0num[0];
-		}
-		return phone;
-		break;
-	case ISDN_PTYPE_1TR6:
-		if (caller) {
-			sprintf(nphone, "%s%c", card->s0num[0], phone[0]);
-			return nphone;
-		} else
-			return &phone[strlen(phone) - 1];
-		break;
-	}
-	return "";
-}
-
-/*
- * Parse an ICN-type command string sent to the 'card'.
- * Perform misc. actions depending on the command.
- *
- * Parameter:
- *   card = pointer to card struct.
- */
-static void
-isdnloop_parse_cmd(isdnloop_card *card)
-{
-	char *p = card->omsg;
-	isdn_ctrl cmd;
-	char buf[60];
-	isdnloop_stat *s = isdnloop_cmd_table;
-	int action = -1;
-	int i;
-	int ch;
-
-	if ((card->omsg[0] != '0') && (card->omsg[2] != ';')) {
-		isdnloop_fake_err(card);
-		return;
-	}
-	ch = card->omsg[1] - '0';
-	if ((ch < 0) || (ch > 2)) {
-		isdnloop_fake_err(card);
-		return;
-	}
-	p += 3;
-	while (s->statstr) {
-		if (!strncmp(p, s->statstr, strlen(s->statstr))) {
-			action = s->action;
-			if (s->command && (ch != 0)) {
-				isdnloop_fake_err(card);
-				return;
-			}
-			break;
-		}
-		s++;
-	}
-	if (action == -1)
-		return;
-	switch (action) {
-	case 1:
-		/* 0x;BCON_R */
-		if (card->rcard[ch - 1]) {
-			isdnloop_fake(card->rcard[ch - 1], "BCON_I",
-				      card->rch[ch - 1] + 1);
-			isdnloop_fake(card, "BCON_C", ch);
-		}
-		break;
-	case 17:
-		/* 0x;BCON_I */
-		if (card->rcard[ch - 1]) {
-			isdnloop_fake(card->rcard[ch - 1], "BCON_C",
-				      card->rch[ch - 1] + 1);
-		}
-		break;
-	case 2:
-		/* 0x;BDIS_R */
-		isdnloop_fake(card, "BDIS_C", ch);
-		if (card->rcard[ch - 1]) {
-			isdnloop_fake(card->rcard[ch - 1], "BDIS_I",
-				      card->rch[ch - 1] + 1);
-		}
-		break;
-	case 16:
-		/* 0x;DCON_R */
-		isdnloop_kill_ctimer(card, ch - 1);
-		if (card->rcard[ch - 1]) {
-			isdnloop_kill_ctimer(card->rcard[ch - 1], card->rch[ch - 1]);
-			isdnloop_fake(card->rcard[ch - 1], "DCON_C",
-				      card->rch[ch - 1] + 1);
-			isdnloop_fake(card, "DCON_C", ch);
-		}
-		break;
-	case 3:
-		/* 0x;DDIS_R */
-		isdnloop_kill_ctimer(card, ch - 1);
-		if (card->rcard[ch - 1]) {
-			isdnloop_kill_ctimer(card->rcard[ch - 1], card->rch[ch - 1]);
-			isdnloop_fake(card->rcard[ch - 1], "DDIS_I",
-				      card->rch[ch - 1] + 1);
-			card->rcard[ch - 1] = NULL;
-		}
-		isdnloop_fake(card, "DDIS_C", ch);
-		break;
-	case 4:
-		/* 0x;DSCA_Rdd,yy,zz,oo */
-		if (card->ptype != ISDN_PTYPE_1TR6) {
-			isdnloop_fake_err(card);
-			return;
-		}
-		/* Fall through */
-	case 5:
-		/* 0x;DCAL_Rdd,yy,zz,oo */
-		p += 6;
-		switch (isdnloop_try_call(card, p, ch - 1, &cmd)) {
-		case 0:
-			/* Alerting */
-			sprintf(buf, "D%s_I%s,%02d,%02d,%s",
-				(action == 4) ? "SCA" : "CAL",
-				isdnloop_vstphone(card, cmd.parm.setup.eazmsn, 1),
-				cmd.parm.setup.si1,
-				cmd.parm.setup.si2,
-				isdnloop_vstphone(card->rcard[ch - 1],
-						  cmd.parm.setup.phone, 0));
-			isdnloop_fake(card->rcard[ch - 1], buf, card->rch[ch - 1] + 1);
-			/* Fall through */
-		case 3:
-			/* si1 does not match, don't alert but start timer */
-			isdnloop_start_ctimer(card, ch - 1);
-			break;
-		case 1:
-			/* Remote busy */
-			isdnloop_fake(card, "DDIS_I", ch);
-			sprintf(buf, "CAU%s", isdnloop_unicause(card, 1, 1));
-			isdnloop_fake(card, buf, ch);
-			break;
-		case 2:
-			/* No such user */
-			isdnloop_fake(card, "DDIS_I", ch);
-			sprintf(buf, "CAU%s", isdnloop_unicause(card, 1, 2));
-			isdnloop_fake(card, buf, ch);
-			break;
-		}
-		break;
-	case 6:
-		/* 0x;EAZC */
-		card->eazlist[ch - 1][0] = '\0';
-		break;
-	case 7:
-		/* 0x;EAZ */
-		p += 3;
-		if (strlen(p) >= sizeof(card->eazlist[0]))
-			break;
-		strcpy(card->eazlist[ch - 1], p);
-		break;
-	case 8:
-		/* 0x;SEEAZ */
-		sprintf(buf, "EAZ-LIST: %s", card->eazlist[ch - 1]);
-		isdnloop_fake(card, buf, ch + 1);
-		break;
-	case 9:
-		/* 0x;MSN */
-		break;
-	case 10:
-		/* 0x;MSNALL */
-		break;
-	case 11:
-		/* 0x;SETSIL */
-		p += 6;
-		i = 0;
-		while (strchr("0157", *p)) {
-			if (i)
-				card->sil[ch - 1] |= si2bit[*p - '0'];
-			i = (*p++ == '0');
-		}
-		if (*p)
-			isdnloop_fake_err(card);
-		break;
-	case 12:
-		/* 0x;SEESIL */
-		sprintf(buf, "SIN-LIST: ");
-		p = buf + 10;
-		for (i = 0; i < 3; i++)
-			if (card->sil[ch - 1] & (1 << i))
-				p += sprintf(p, "%02d", bit2si[i]);
-		isdnloop_fake(card, buf, ch + 1);
-		break;
-	case 13:
-		/* 0x;SILC */
-		card->sil[ch - 1] = 0;
-		break;
-	case 14:
-		/* 00;FV2ON */
-		break;
-	case 15:
-		/* 00;FV2OFF */
-		break;
-	}
-}
-
-/*
- * Put command-strings into the of the 'card'. In reality, execute them
- * right in place by calling isdnloop_parse_cmd(). Also copy every
- * command to the read message ringbuffer, preceding it with a '>'.
- * These mesagges can be read at /dev/isdnctrl.
- *
- * Parameter:
- *   buf  = pointer to command buffer.
- *   len  = length of buffer data.
- *   user = flag: 1 = called form userlevel, 0 called from kernel.
- *   card = pointer to card struct.
- * Return:
- *   number of bytes transferred (currently always equals len).
- */
-static int
-isdnloop_writecmd(const u_char *buf, int len, int user, isdnloop_card *card)
-{
-	int xcount = 0;
-	int ocount = 1;
-	isdn_ctrl cmd;
-
-	while (len) {
-		int count = len;
-		u_char *p;
-		u_char msg[0x100];
-
-		if (count > 255)
-			count = 255;
-		if (user) {
-			if (copy_from_user(msg, buf, count))
-				return -EFAULT;
-		} else
-			memcpy(msg, buf, count);
-		isdnloop_putmsg(card, '>');
-		for (p = msg; count > 0; count--, p++) {
-			len--;
-			xcount++;
-			isdnloop_putmsg(card, *p);
-			card->omsg[card->optr] = *p;
-			if (*p == '\n') {
-				card->omsg[card->optr] = '\0';
-				card->optr = 0;
-				isdnloop_parse_cmd(card);
-				if (len) {
-					isdnloop_putmsg(card, '>');
-					ocount++;
-				}
-			} else {
-				if (card->optr < 59)
-					card->optr++;
-			}
-			ocount++;
-		}
-	}
-	cmd.command = ISDN_STAT_STAVAIL;
-	cmd.driver = card->myid;
-	cmd.arg = ocount;
-	card->interface.statcallb(&cmd);
-	return xcount;
-}
-
-/*
- * Delete card's pending timers, send STOP to linklevel
- */
-static void
-isdnloop_stopcard(isdnloop_card *card)
-{
-	unsigned long flags;
-	isdn_ctrl cmd;
-
-	spin_lock_irqsave(&card->isdnloop_lock, flags);
-	if (card->flags & ISDNLOOP_FLAGS_RUNNING) {
-		card->flags &= ~ISDNLOOP_FLAGS_RUNNING;
-		del_timer(&card->st_timer);
-		del_timer(&card->rb_timer);
-		del_timer(&card->c_timer[0]);
-		del_timer(&card->c_timer[1]);
-		cmd.command = ISDN_STAT_STOP;
-		cmd.driver = card->myid;
-		card->interface.statcallb(&cmd);
-	}
-	spin_unlock_irqrestore(&card->isdnloop_lock, flags);
-}
-
-/*
- * Stop all cards before unload.
- */
-static void
-isdnloop_stopallcards(void)
-{
-	isdnloop_card *p = cards;
-
-	while (p) {
-		isdnloop_stopcard(p);
-		p = p->next;
-	}
-}
-
-/*
- * Start a 'card'. Simulate card's boot message and set the phone
- * number(s) of the virtual 'S0-Interface'. Install D-channel
- * poll timer.
- *
- * Parameter:
- *   card  = pointer to card struct.
- *   sdefp = pointer to struct holding ioctl parameters.
- * Return:
- *   0 on success, -E??? otherwise.
- */
-static int
-isdnloop_start(isdnloop_card *card, isdnloop_sdef *sdefp)
-{
-	unsigned long flags;
-	isdnloop_sdef sdef;
-	int i;
-
-	if (card->flags & ISDNLOOP_FLAGS_RUNNING)
-		return -EBUSY;
-	if (copy_from_user((char *) &sdef, (char *) sdefp, sizeof(sdef)))
-		return -EFAULT;
-
-	for (i = 0; i < 3; i++) {
-		if (!memchr(sdef.num[i], 0, sizeof(sdef.num[i])))
-			return -EINVAL;
-	}
-
-	spin_lock_irqsave(&card->isdnloop_lock, flags);
-	switch (sdef.ptype) {
-	case ISDN_PTYPE_EURO:
-		if (isdnloop_fake(card, "DRV1.23EC-Q.931-CAPI-CNS-BASIS-20.02.96",
-				  -1)) {
-			spin_unlock_irqrestore(&card->isdnloop_lock, flags);
-			return -ENOMEM;
-		}
-		card->sil[0] = card->sil[1] = 4;
-		if (isdnloop_fake(card, "TEI OK", 0)) {
-			spin_unlock_irqrestore(&card->isdnloop_lock, flags);
-			return -ENOMEM;
-		}
-		for (i = 0; i < 3; i++) {
-			strlcpy(card->s0num[i], sdef.num[i],
-				sizeof(card->s0num[0]));
-		}
-		break;
-	case ISDN_PTYPE_1TR6:
-		if (isdnloop_fake(card, "DRV1.04TC-1TR6-CAPI-CNS-BASIS-29.11.95",
-				  -1)) {
-			spin_unlock_irqrestore(&card->isdnloop_lock, flags);
-			return -ENOMEM;
-		}
-		card->sil[0] = card->sil[1] = 4;
-		if (isdnloop_fake(card, "TEI OK", 0)) {
-			spin_unlock_irqrestore(&card->isdnloop_lock, flags);
-			return -ENOMEM;
-		}
-		strlcpy(card->s0num[0], sdef.num[0], sizeof(card->s0num[0]));
-		card->s0num[1][0] = '\0';
-		card->s0num[2][0] = '\0';
-		break;
-	default:
-		spin_unlock_irqrestore(&card->isdnloop_lock, flags);
-		printk(KERN_WARNING "isdnloop: Illegal D-channel protocol %d\n",
-		       sdef.ptype);
-		return -EINVAL;
-	}
-	timer_setup(&card->rb_timer, isdnloop_pollbchan, 0);
-	timer_setup(&card->st_timer, isdnloop_polldchan, 0);
-	card->st_timer.expires = jiffies + ISDNLOOP_TIMER_DCREAD;
-	add_timer(&card->st_timer);
-	card->flags |= ISDNLOOP_FLAGS_RUNNING;
-	spin_unlock_irqrestore(&card->isdnloop_lock, flags);
-	return 0;
-}
-
-/*
- * Main handler for commands sent by linklevel.
- */
-static int
-isdnloop_command(isdn_ctrl *c, isdnloop_card *card)
-{
-	ulong a;
-	int i;
-	char cbuf[80];
-	isdn_ctrl cmd;
-	isdnloop_cdef cdef;
-
-	switch (c->command) {
-	case ISDN_CMD_IOCTL:
-		memcpy(&a, c->parm.num, sizeof(ulong));
-		switch (c->arg) {
-		case ISDNLOOP_IOCTL_DEBUGVAR:
-			return (ulong) card;
-		case ISDNLOOP_IOCTL_STARTUP:
-			return isdnloop_start(card, (isdnloop_sdef *) a);
-			break;
-		case ISDNLOOP_IOCTL_ADDCARD:
-			if (copy_from_user((char *)&cdef,
-					   (char *)a,
-					   sizeof(cdef)))
-				return -EFAULT;
-			return isdnloop_addcard(cdef.id1);
-			break;
-		case ISDNLOOP_IOCTL_LEASEDCFG:
-			if (a) {
-				if (!card->leased) {
-					card->leased = 1;
-					while (card->ptype == ISDN_PTYPE_UNKNOWN)
-						schedule_timeout_interruptible(10);
-					schedule_timeout_interruptible(10);
-					sprintf(cbuf, "00;FV2ON\n01;EAZ1\n02;EAZ2\n");
-					i = isdnloop_writecmd(cbuf, strlen(cbuf), 0, card);
-					printk(KERN_INFO
-					       "isdnloop: (%s) Leased-line mode enabled\n",
-					       CID);
-					cmd.command = ISDN_STAT_RUN;
-					cmd.driver = card->myid;
-					cmd.arg = 0;
-					card->interface.statcallb(&cmd);
-				}
-			} else {
-				if (card->leased) {
-					card->leased = 0;
-					sprintf(cbuf, "00;FV2OFF\n");
-					i = isdnloop_writecmd(cbuf, strlen(cbuf), 0, card);
-					printk(KERN_INFO
-					       "isdnloop: (%s) Leased-line mode disabled\n",
-					       CID);
-					cmd.command = ISDN_STAT_RUN;
-					cmd.driver = card->myid;
-					cmd.arg = 0;
-					card->interface.statcallb(&cmd);
-				}
-			}
-			return 0;
-		default:
-			return -EINVAL;
-		}
-		break;
-	case ISDN_CMD_DIAL:
-		if (!(card->flags & ISDNLOOP_FLAGS_RUNNING))
-			return -ENODEV;
-		if (card->leased)
-			break;
-		if ((c->arg & 255) < ISDNLOOP_BCH) {
-			char *p;
-			char dcode[4];
-
-			a = c->arg;
-			p = c->parm.setup.phone;
-			if (*p == 's' || *p == 'S') {
-				/* Dial for SPV */
-				p++;
-				strcpy(dcode, "SCA");
-			} else
-				/* Normal Dial */
-				strcpy(dcode, "CAL");
-			snprintf(cbuf, sizeof(cbuf),
-				 "%02d;D%s_R%s,%02d,%02d,%s\n", (int) (a + 1),
-				 dcode, p, c->parm.setup.si1,
-				 c->parm.setup.si2, c->parm.setup.eazmsn);
-			i = isdnloop_writecmd(cbuf, strlen(cbuf), 0, card);
-		}
-		break;
-	case ISDN_CMD_ACCEPTD:
-		if (!(card->flags & ISDNLOOP_FLAGS_RUNNING))
-			return -ENODEV;
-		if (c->arg < ISDNLOOP_BCH) {
-			a = c->arg + 1;
-			cbuf[0] = 0;
-			switch (card->l2_proto[a - 1]) {
-			case ISDN_PROTO_L2_X75I:
-				sprintf(cbuf, "%02d;BX75\n", (int) a);
-				break;
-#ifdef CONFIG_ISDN_X25
-			case ISDN_PROTO_L2_X25DTE:
-				sprintf(cbuf, "%02d;BX2T\n", (int) a);
-				break;
-			case ISDN_PROTO_L2_X25DCE:
-				sprintf(cbuf, "%02d;BX2C\n", (int) a);
-				break;
-#endif
-			case ISDN_PROTO_L2_HDLC:
-				sprintf(cbuf, "%02d;BTRA\n", (int) a);
-				break;
-			}
-			if (strlen(cbuf))
-				i = isdnloop_writecmd(cbuf, strlen(cbuf), 0, card);
-			sprintf(cbuf, "%02d;DCON_R\n", (int) a);
-			i = isdnloop_writecmd(cbuf, strlen(cbuf), 0, card);
-		}
-		break;
-	case ISDN_CMD_ACCEPTB:
-		if (!(card->flags & ISDNLOOP_FLAGS_RUNNING))
-			return -ENODEV;
-		if (c->arg < ISDNLOOP_BCH) {
-			a = c->arg + 1;
-			switch (card->l2_proto[a - 1]) {
-			case ISDN_PROTO_L2_X75I:
-				sprintf(cbuf, "%02d;BCON_R,BX75\n", (int) a);
-				break;
-#ifdef CONFIG_ISDN_X25
-			case ISDN_PROTO_L2_X25DTE:
-				sprintf(cbuf, "%02d;BCON_R,BX2T\n", (int) a);
-				break;
-			case ISDN_PROTO_L2_X25DCE:
-				sprintf(cbuf, "%02d;BCON_R,BX2C\n", (int) a);
-				break;
-#endif
-			case ISDN_PROTO_L2_HDLC:
-				sprintf(cbuf, "%02d;BCON_R,BTRA\n", (int) a);
-				break;
-			default:
-				sprintf(cbuf, "%02d;BCON_R\n", (int) a);
-			}
-			printk(KERN_DEBUG "isdnloop writecmd '%s'\n", cbuf);
-			i = isdnloop_writecmd(cbuf, strlen(cbuf), 0, card);
-			break;
-		case ISDN_CMD_HANGUP:
-			if (!(card->flags & ISDNLOOP_FLAGS_RUNNING))
-				return -ENODEV;
-			if (c->arg < ISDNLOOP_BCH) {
-				a = c->arg + 1;
-				sprintf(cbuf, "%02d;BDIS_R\n%02d;DDIS_R\n", (int) a, (int) a);
-				i = isdnloop_writecmd(cbuf, strlen(cbuf), 0, card);
-			}
-			break;
-		case ISDN_CMD_SETEAZ:
-			if (!(card->flags & ISDNLOOP_FLAGS_RUNNING))
-				return -ENODEV;
-			if (card->leased)
-				break;
-			if (c->arg < ISDNLOOP_BCH) {
-				a = c->arg + 1;
-				if (card->ptype == ISDN_PTYPE_EURO) {
-					sprintf(cbuf, "%02d;MS%s%s\n", (int) a,
-						c->parm.num[0] ? "N" : "ALL", c->parm.num);
-				} else
-					sprintf(cbuf, "%02d;EAZ%s\n", (int) a,
-						c->parm.num[0] ? c->parm.num : (u_char *) "0123456789");
-				i = isdnloop_writecmd(cbuf, strlen(cbuf), 0, card);
-			}
-			break;
-		case ISDN_CMD_CLREAZ:
-			if (!(card->flags & ISDNLOOP_FLAGS_RUNNING))
-				return -ENODEV;
-			if (card->leased)
-				break;
-			if (c->arg < ISDNLOOP_BCH) {
-				a = c->arg + 1;
-				if (card->ptype == ISDN_PTYPE_EURO)
-					sprintf(cbuf, "%02d;MSNC\n", (int) a);
-				else
-					sprintf(cbuf, "%02d;EAZC\n", (int) a);
-				i = isdnloop_writecmd(cbuf, strlen(cbuf), 0, card);
-			}
-			break;
-		case ISDN_CMD_SETL2:
-			if (!(card->flags & ISDNLOOP_FLAGS_RUNNING))
-				return -ENODEV;
-			if ((c->arg & 255) < ISDNLOOP_BCH) {
-				a = c->arg;
-				switch (a >> 8) {
-				case ISDN_PROTO_L2_X75I:
-					sprintf(cbuf, "%02d;BX75\n", (int) (a & 255) + 1);
-					break;
-#ifdef CONFIG_ISDN_X25
-				case ISDN_PROTO_L2_X25DTE:
-					sprintf(cbuf, "%02d;BX2T\n", (int) (a & 255) + 1);
-					break;
-				case ISDN_PROTO_L2_X25DCE:
-					sprintf(cbuf, "%02d;BX2C\n", (int) (a & 255) + 1);
-					break;
-#endif
-				case ISDN_PROTO_L2_HDLC:
-					sprintf(cbuf, "%02d;BTRA\n", (int) (a & 255) + 1);
-					break;
-				case ISDN_PROTO_L2_TRANS:
-					sprintf(cbuf, "%02d;BTRA\n", (int) (a & 255) + 1);
-					break;
-				default:
-					return -EINVAL;
-				}
-				i = isdnloop_writecmd(cbuf, strlen(cbuf), 0, card);
-				card->l2_proto[a & 255] = (a >> 8);
-			}
-			break;
-		case ISDN_CMD_SETL3:
-			if (!(card->flags & ISDNLOOP_FLAGS_RUNNING))
-				return -ENODEV;
-			return 0;
-		default:
-			return -EINVAL;
-		}
-	}
-	return 0;
-}
-
-/*
- * Find card with given driverId
- */
-static inline isdnloop_card *
-isdnloop_findcard(int driverid)
-{
-	isdnloop_card *p = cards;
-
-	while (p) {
-		if (p->myid == driverid)
-			return p;
-		p = p->next;
-	}
-	return (isdnloop_card *) 0;
-}
-
-/*
- * Wrapper functions for interface to linklevel
- */
-static int
-if_command(isdn_ctrl *c)
-{
-	isdnloop_card *card = isdnloop_findcard(c->driver);
-
-	if (card)
-		return isdnloop_command(c, card);
-	printk(KERN_ERR
-	       "isdnloop: if_command called with invalid driverId!\n");
-	return -ENODEV;
-}
-
-static int
-if_writecmd(const u_char __user *buf, int len, int id, int channel)
-{
-	isdnloop_card *card = isdnloop_findcard(id);
-
-	if (card) {
-		if (!(card->flags & ISDNLOOP_FLAGS_RUNNING))
-			return -ENODEV;
-		return isdnloop_writecmd(buf, len, 1, card);
-	}
-	printk(KERN_ERR
-	       "isdnloop: if_writecmd called with invalid driverId!\n");
-	return -ENODEV;
-}
-
-static int
-if_readstatus(u_char __user *buf, int len, int id, int channel)
-{
-	isdnloop_card *card = isdnloop_findcard(id);
-
-	if (card) {
-		if (!(card->flags & ISDNLOOP_FLAGS_RUNNING))
-			return -ENODEV;
-		return isdnloop_readstatus(buf, len, card);
-	}
-	printk(KERN_ERR
-	       "isdnloop: if_readstatus called with invalid driverId!\n");
-	return -ENODEV;
-}
-
-static int
-if_sendbuf(int id, int channel, int ack, struct sk_buff *skb)
-{
-	isdnloop_card *card = isdnloop_findcard(id);
-
-	if (card) {
-		if (!(card->flags & ISDNLOOP_FLAGS_RUNNING))
-			return -ENODEV;
-		/* ack request stored in skb scratch area */
-		*(skb->head) = ack;
-		return isdnloop_sendbuf(channel, skb, card);
-	}
-	printk(KERN_ERR
-	       "isdnloop: if_sendbuf called with invalid driverId!\n");
-	return -ENODEV;
-}
-
-/*
- * Allocate a new card-struct, initialize it
- * link it into cards-list and register it at linklevel.
- */
-static isdnloop_card *
-isdnloop_initcard(char *id)
-{
-	isdnloop_card *card;
-	int i;
-	card = kzalloc(sizeof(isdnloop_card), GFP_KERNEL);
-	if (!card) {
-		printk(KERN_WARNING
-		       "isdnloop: (%s) Could not allocate card-struct.\n", id);
-		return (isdnloop_card *) 0;
-	}
-	card->interface.owner = THIS_MODULE;
-	card->interface.channels = ISDNLOOP_BCH;
-	card->interface.hl_hdrlen  = 1; /* scratch area for storing ack flag*/
-	card->interface.maxbufsize = 4000;
-	card->interface.command = if_command;
-	card->interface.writebuf_skb = if_sendbuf;
-	card->interface.writecmd = if_writecmd;
-	card->interface.readstat = if_readstatus;
-	card->interface.features = ISDN_FEATURE_L2_X75I |
-#ifdef CONFIG_ISDN_X25
-		ISDN_FEATURE_L2_X25DTE |
-		ISDN_FEATURE_L2_X25DCE |
-#endif
-		ISDN_FEATURE_L2_HDLC |
-		ISDN_FEATURE_L3_TRANS |
-		ISDN_FEATURE_P_UNKNOWN;
-	card->ptype = ISDN_PTYPE_UNKNOWN;
-	strlcpy(card->interface.id, id, sizeof(card->interface.id));
-	card->msg_buf_write = card->msg_buf;
-	card->msg_buf_read = card->msg_buf;
-	card->msg_buf_end = &card->msg_buf[sizeof(card->msg_buf) - 1];
-	for (i = 0; i < ISDNLOOP_BCH; i++) {
-		card->l2_proto[i] = ISDN_PROTO_L2_X75I;
-		skb_queue_head_init(&card->bqueue[i]);
-	}
-	skb_queue_head_init(&card->dqueue);
-	spin_lock_init(&card->isdnloop_lock);
-	card->next = cards;
-	cards = card;
-	if (!register_isdn(&card->interface)) {
-		cards = cards->next;
-		printk(KERN_WARNING
-		       "isdnloop: Unable to register %s\n", id);
-		kfree(card);
-		return (isdnloop_card *) 0;
-	}
-	card->myid = card->interface.channels;
-	return card;
-}
-
-static int
-isdnloop_addcard(char *id1)
-{
-	isdnloop_card *card;
-	card = isdnloop_initcard(id1);
-	if (!card) {
-		return -EIO;
-	}
-	printk(KERN_INFO
-	       "isdnloop: (%s) virtual card added\n",
-	       card->interface.id);
-	return 0;
-}
-
-static int __init
-isdnloop_init(void)
-{
-	if (isdnloop_id)
-		return isdnloop_addcard(isdnloop_id);
-
-	return 0;
-}
-
-static void __exit
-isdnloop_exit(void)
-{
-	isdn_ctrl cmd;
-	isdnloop_card *card = cards;
-	isdnloop_card *last;
-	int i;
-
-	isdnloop_stopallcards();
-	while (card) {
-		cmd.command = ISDN_STAT_UNLOAD;
-		cmd.driver = card->myid;
-		card->interface.statcallb(&cmd);
-		for (i = 0; i < ISDNLOOP_BCH; i++)
-			isdnloop_free_queue(card, i);
-		card = card->next;
-	}
-	card = cards;
-	while (card) {
-		last = card;
-		skb_queue_purge(&card->dqueue);
-		card = card->next;
-		kfree(last);
-	}
-	printk(KERN_NOTICE "isdnloop-ISDN-driver unloaded\n");
-}
-
-module_init(isdnloop_init);
-module_exit(isdnloop_exit);
diff --git a/drivers/isdn/isdnloop/isdnloop.h b/drivers/isdn/isdnloop/isdnloop.h
deleted file mode 100644
index e9e035552bb4..000000000000
--- a/drivers/isdn/isdnloop/isdnloop.h
+++ /dev/null
@@ -1,112 +0,0 @@
-/* $Id: isdnloop.h,v 1.5.6.3 2001/09/23 22:24:56 kai Exp $
- *
- * Loopback lowlevel module for testing of linklevel.
- *
- * Copyright 1997 by Fritz Elfert (fritz@isdn4linux.de)
- *
- * This software may be used and distributed according to the terms
- * of the GNU General Public License, incorporated herein by reference.
- *
- */
-
-#ifndef isdnloop_h
-#define isdnloop_h
-
-#define ISDNLOOP_IOCTL_DEBUGVAR  0
-#define ISDNLOOP_IOCTL_ADDCARD   1
-#define ISDNLOOP_IOCTL_LEASEDCFG 2
-#define ISDNLOOP_IOCTL_STARTUP   3
-
-/* Struct for adding new cards */
-typedef struct isdnloop_cdef {
-	char id1[10];
-} isdnloop_cdef;
-
-/* Struct for configuring cards */
-typedef struct isdnloop_sdef {
-	int ptype;
-	char num[3][20];
-} isdnloop_sdef;
-
-#if defined(__KERNEL__) || defined(__DEBUGVAR__)
-
-#ifdef __KERNEL__
-/* Kernel includes */
-
-#include <linux/errno.h>
-#include <linux/fs.h>
-#include <linux/major.h>
-#include <asm/io.h>
-#include <linux/kernel.h>
-#include <linux/signal.h>
-#include <linux/slab.h>
-#include <linux/mm.h>
-#include <linux/mman.h>
-#include <linux/ioport.h>
-#include <linux/timer.h>
-#include <linux/wait.h>
-#include <linux/isdnif.h>
-
-#endif                          /* __KERNEL__ */
-
-#define ISDNLOOP_FLAGS_B1ACTIVE 1	/* B-Channel-1 is open           */
-#define ISDNLOOP_FLAGS_B2ACTIVE 2	/* B-Channel-2 is open           */
-#define ISDNLOOP_FLAGS_RUNNING  4	/* Cards driver activated        */
-#define ISDNLOOP_FLAGS_RBTIMER  8	/* scheduling of B-Channel-poll  */
-#define ISDNLOOP_TIMER_BCREAD 1 /* B-Channel poll-cycle          */
-#define ISDNLOOP_TIMER_DCREAD (HZ/2)	/* D-Channel poll-cycle          */
-#define ISDNLOOP_TIMER_ALERTWAIT (10 * HZ)	/* Alert timeout                 */
-#define ISDNLOOP_MAX_SQUEUE 65536	/* Max. outstanding send-data    */
-#define ISDNLOOP_BCH 2          /* channels per card             */
-
-/*
- * Per card driver data
- */
-typedef struct isdnloop_card {
-	struct isdnloop_card *next;	/* Pointer to next device struct    */
-	struct isdnloop_card
-	*rcard[ISDNLOOP_BCH];   /* Pointer to 'remote' card         */
-	int rch[ISDNLOOP_BCH];  /* 'remote' channel                 */
-	int myid;               /* Driver-Nr. assigned by linklevel */
-	int leased;             /* Flag: This Adapter is connected  */
-	/*       to a leased line           */
-	int sil[ISDNLOOP_BCH];  /* SI's to listen for               */
-	char eazlist[ISDNLOOP_BCH][11];
-	/* EAZ's to listen for              */
-	char s0num[3][20];      /* 1TR6 base-number or MSN's        */
-	unsigned short flags;   /* Statusflags                      */
-	int ptype;              /* Protocol type (1TR6 or Euro)     */
-	struct timer_list st_timer;	/* Timer for Status-Polls           */
-	struct timer_list rb_timer;	/* Timer for B-Channel-Polls        */
-	struct timer_list
-	c_timer[ISDNLOOP_BCH]; /* Timer for Alerting               */
-	int l2_proto[ISDNLOOP_BCH];	/* Current layer-2-protocol         */
-	isdn_if interface;      /* Interface to upper layer         */
-	int iptr;               /* Index to imsg-buffer             */
-	char imsg[60];          /* Internal buf for status-parsing  */
-	int optr;               /* Index to omsg-buffer             */
-	char omsg[60];          /* Internal buf for cmd-parsing     */
-	char msg_buf[2048];     /* Buffer for status-messages       */
-	char *msg_buf_write;    /* Writepointer for statusbuffer    */
-	char *msg_buf_read;     /* Readpointer for statusbuffer     */
-	char *msg_buf_end;      /* Pointer to end of statusbuffer   */
-	int sndcount[ISDNLOOP_BCH];	/* Byte-counters for B-Ch.-send     */
-	struct sk_buff_head
-	bqueue[ISDNLOOP_BCH];  /* B-Channel queues                 */
-	struct sk_buff_head dqueue;	/* D-Channel queue                  */
-	spinlock_t isdnloop_lock;
-} isdnloop_card;
-
-/*
- * Main driver data
- */
-#ifdef __KERNEL__
-static isdnloop_card *cards = (isdnloop_card *) 0;
-#endif                          /* __KERNEL__ */
-
-/* Utility-Macros */
-
-#define CID (card->interface.id)
-
-#endif                          /* defined(__KERNEL__) || defined(__DEBUGVAR__) */
-#endif                          /* isdnloop_h */
diff --git a/include/linux/concap.h b/include/linux/concap.h
deleted file mode 100644
index 977acb3d1fb2..000000000000
--- a/include/linux/concap.h
+++ /dev/null
@@ -1,112 +0,0 @@
-/* $Id: concap.h,v 1.3.2.2 2004/01/12 23:08:35 keil Exp $
- *
- * Copyright 1997 by Henner Eisen <eis@baty.hanse.de>
- *
- * This software may be used and distributed according to the terms
- * of the GNU General Public License, incorporated herein by reference.
- */
-
-#ifndef _LINUX_CONCAP_H
-#define _LINUX_CONCAP_H
-
-#include <linux/skbuff.h>
-#include <linux/netdevice.h>
-
-/* Stuff to support encapsulation protocols genericly. The encapsulation
-   protocol is processed at the uppermost layer of the network interface.
-
-   Based on a ideas developed in a 'synchronous device' thread in the
-   linux-x25 mailing list contributed by Alan Cox, Thomasz Motylewski
-   and Jonathan Naylor.
-
-   For more documetation on this refer to Documentation/isdn/README.concap
-*/
-
-struct concap_proto_ops;
-struct concap_device_ops;
-
-/* this manages all data needed by the encapsulation protocol
- */
-struct concap_proto{
-	struct net_device *net_dev;	/* net device using our service  */
-	struct concap_device_ops *dops;	/* callbacks provided by device */
- 	struct concap_proto_ops  *pops;	/* callbacks provided by us */
- 	spinlock_t lock;
-	int flags;
-	void *proto_data;		/* protocol specific private data, to
-					   be accessed via *pops methods only*/
-	/*
-	  :
-	  whatever 
-	  :
-	  */
-};
-
-/* Operations to be supported by the net device. Called by the encapsulation
- * protocol entity. No receive method is offered because the encapsulation
- * protocol directly calls netif_rx().
- */
-struct concap_device_ops{
-
-	/* to request data is submitted by device*/ 
-	int (*data_req)(struct concap_proto *, struct sk_buff *);
-
-	/* Control methods must be set to NULL by devices which do not
-	   support connection control.*/
-	/* to request a connection is set up */ 
-	int (*connect_req)(struct concap_proto *);
-
-	/* to request a connection is released */
-	int (*disconn_req)(struct concap_proto *);	
-};
-
-/* Operations to be supported by the encapsulation protocol. Called by
- * device driver.
- */
-struct concap_proto_ops{
-
-	/* create a new encapsulation protocol instance of same type */
-	struct concap_proto *  (*proto_new) (void);
-
-	/* delete encapsulation protocol instance and free all its resources.
-	   cprot may no loger be referenced after calling this */
-	void (*proto_del)(struct concap_proto *cprot);
-
-	/* initialize the protocol's data. To be called at interface startup
-	   or when the device driver resets the interface. All services of the
-	   encapsulation protocol may be used after this*/
-	int (*restart)(struct concap_proto *cprot, 
-		       struct net_device *ndev,
-		       struct concap_device_ops *dops);
-
-	/* inactivate an encapsulation protocol instance. The encapsulation
-	   protocol may not call any *dops methods after this. */
-	int (*close)(struct concap_proto *cprot);
-
-	/* process a frame handed down to us by upper layer */
-	int (*encap_and_xmit)(struct concap_proto *cprot, struct sk_buff *skb);
-
-	/* to be called for each data entity received from lower layer*/ 
-	int (*data_ind)(struct concap_proto *cprot, struct sk_buff *skb);
-
-	/* to be called when a connection was set up/down.
-	   Protocols that don't process these primitives might fill in
-	   dummy methods here */
-	int (*connect_ind)(struct concap_proto *cprot);
-	int (*disconn_ind)(struct concap_proto *cprot);
-  /*
-    Some network device support functions, like net_header(), rebuild_header(),
-    and others, that depend solely on the encapsulation protocol, might
-    be provided here, too. The net device would just fill them in its
-    corresponding fields when it is opened.
-    */
-};
-
-/* dummy restart/close/connect/reset/disconn methods
- */
-extern int concap_nop(struct concap_proto *cprot); 
-
-/* dummy submit method
- */
-extern int concap_drop_skb(struct concap_proto *cprot, struct sk_buff *skb);
-#endif
diff --git a/include/linux/isdn.h b/include/linux/isdn.h
deleted file mode 100644
index df97c8444f5d..000000000000
--- a/include/linux/isdn.h
+++ /dev/null
@@ -1,473 +0,0 @@
-/* $Id: isdn.h,v 1.125.2.3 2004/02/10 01:07:14 keil Exp $
- *
- * Main header for the Linux ISDN subsystem (linklevel).
- *
- * Copyright 1994,95,96 by Fritz Elfert (fritz@isdn4linux.de)
- * Copyright 1995,96    by Thinking Objects Software GmbH Wuerzburg
- * Copyright 1995,96    by Michael Hipp (Michael.Hipp@student.uni-tuebingen.de)
- * 
- * This software may be used and distributed according to the terms
- * of the GNU General Public License, incorporated herein by reference.
- *
- */
-#ifndef __ISDN_H__
-#define __ISDN_H__
-
-
-#include <linux/errno.h>
-#include <linux/fs.h>
-#include <linux/major.h>
-#include <asm/io.h>
-#include <linux/kernel.h>
-#include <linux/signal.h>
-#include <linux/slab.h>
-#include <linux/timer.h>
-#include <linux/wait.h>
-#include <linux/tty.h>
-#include <linux/tty_flip.h>
-#include <linux/serial_reg.h>
-#include <linux/fcntl.h>
-#include <linux/types.h>
-#include <linux/interrupt.h>
-#include <linux/ip.h>
-#include <linux/in.h>
-#include <linux/netdevice.h>
-#include <linux/etherdevice.h>
-#include <linux/skbuff.h>
-#include <linux/tcp.h>
-#include <linux/mutex.h>
-#include <uapi/linux/isdn.h>
-
-#define ISDN_TTY_MAJOR    43
-#define ISDN_TTYAUX_MAJOR 44
-#define ISDN_MAJOR        45
-
-/* The minor-devicenumbers for Channel 0 and 1 are used as arguments for
- * physical Channel-Mapping, so they MUST NOT be changed without changing
- * the correspondent code in isdn.c
- */
-
-#define ISDN_MINOR_B        0
-#define ISDN_MINOR_BMAX     (ISDN_MAX_CHANNELS-1)
-#define ISDN_MINOR_CTRL     64
-#define ISDN_MINOR_CTRLMAX  (64 + (ISDN_MAX_CHANNELS-1))
-#define ISDN_MINOR_PPP      128
-#define ISDN_MINOR_PPPMAX   (128 + (ISDN_MAX_CHANNELS-1))
-#define ISDN_MINOR_STATUS   255
-
-#ifdef CONFIG_ISDN_PPP
-
-#ifdef CONFIG_ISDN_PPP_VJ
-#  include <net/slhc_vj.h>
-#endif
-
-#include <linux/ppp_defs.h>
-#include <linux/ppp-ioctl.h>
-
-#include <linux/isdn_ppp.h>
-#endif
-
-#ifdef CONFIG_ISDN_X25
-#  include <linux/concap.h>
-#endif
-
-#include <linux/isdnif.h>
-
-#define ISDN_DRVIOCTL_MASK       0x7f  /* Mask for Device-ioctl */
-
-/* Until now unused */
-#define ISDN_SERVICE_VOICE 1
-#define ISDN_SERVICE_AB    1<<1 
-#define ISDN_SERVICE_X21   1<<2
-#define ISDN_SERVICE_G4    1<<3
-#define ISDN_SERVICE_BTX   1<<4
-#define ISDN_SERVICE_DFUE  1<<5
-#define ISDN_SERVICE_X25   1<<6
-#define ISDN_SERVICE_TTX   1<<7
-#define ISDN_SERVICE_MIXED 1<<8
-#define ISDN_SERVICE_FW    1<<9
-#define ISDN_SERVICE_GTEL  1<<10
-#define ISDN_SERVICE_BTXN  1<<11
-#define ISDN_SERVICE_BTEL  1<<12
-
-/* Macros checking plain usage */
-#define USG_NONE(x)         ((x & ISDN_USAGE_MASK)==ISDN_USAGE_NONE)
-#define USG_RAW(x)          ((x & ISDN_USAGE_MASK)==ISDN_USAGE_RAW)
-#define USG_MODEM(x)        ((x & ISDN_USAGE_MASK)==ISDN_USAGE_MODEM)
-#define USG_VOICE(x)        ((x & ISDN_USAGE_MASK)==ISDN_USAGE_VOICE)
-#define USG_NET(x)          ((x & ISDN_USAGE_MASK)==ISDN_USAGE_NET)
-#define USG_FAX(x)          ((x & ISDN_USAGE_MASK)==ISDN_USAGE_FAX)
-#define USG_OUTGOING(x)     ((x & ISDN_USAGE_OUTGOING)==ISDN_USAGE_OUTGOING)
-#define USG_MODEMORVOICE(x) (((x & ISDN_USAGE_MASK)==ISDN_USAGE_MODEM) || \
-                             ((x & ISDN_USAGE_MASK)==ISDN_USAGE_VOICE)     )
-
-/* Timer-delays and scheduling-flags */
-#define ISDN_TIMER_RES         4                         /* Main Timer-Resolution   */
-#define ISDN_TIMER_02SEC       (HZ/ISDN_TIMER_RES/5)     /* Slow-Timer1 .2 sec      */
-#define ISDN_TIMER_1SEC        (HZ/ISDN_TIMER_RES)       /* Slow-Timer2 1 sec       */
-#define ISDN_TIMER_RINGING     5 /* tty RINGs = ISDN_TIMER_1SEC * this factor       */
-#define ISDN_TIMER_KEEPINT    10 /* Cisco-Keepalive = ISDN_TIMER_1SEC * this factor */
-#define ISDN_TIMER_MODEMREAD   1
-#define ISDN_TIMER_MODEMPLUS   2
-#define ISDN_TIMER_MODEMRING   4
-#define ISDN_TIMER_MODEMXMIT   8
-#define ISDN_TIMER_NETDIAL    16 
-#define ISDN_TIMER_NETHANGUP  32
-#define ISDN_TIMER_CARRIER   256 /* Wait for Carrier */
-#define ISDN_TIMER_FAST      (ISDN_TIMER_MODEMREAD | ISDN_TIMER_MODEMPLUS | \
-                              ISDN_TIMER_MODEMXMIT)
-#define ISDN_TIMER_SLOW      (ISDN_TIMER_MODEMRING | ISDN_TIMER_NETHANGUP | \
-                              ISDN_TIMER_NETDIAL | ISDN_TIMER_CARRIER)
-
-/* Timeout-Values for isdn_net_dial() */
-#define ISDN_TIMER_DTIMEOUT10 (10*HZ/(ISDN_TIMER_02SEC*(ISDN_TIMER_RES+1)))
-#define ISDN_TIMER_DTIMEOUT15 (15*HZ/(ISDN_TIMER_02SEC*(ISDN_TIMER_RES+1)))
-#define ISDN_TIMER_DTIMEOUT60 (60*HZ/(ISDN_TIMER_02SEC*(ISDN_TIMER_RES+1)))
-
-/* GLOBAL_FLAGS */
-#define ISDN_GLOBAL_STOPPED 1
-
-/*=================== Start of ip-over-ISDN stuff =========================*/
-
-/* Feature- and status-flags for a net-interface */
-#define ISDN_NET_CONNECTED  0x01       /* Bound to ISDN-Channel             */
-#define ISDN_NET_SECURE     0x02       /* Accept calls from phonelist only  */
-#define ISDN_NET_CALLBACK   0x04       /* activate callback                 */
-#define ISDN_NET_CBHUP      0x08       /* hangup before callback            */
-#define ISDN_NET_CBOUT      0x10       /* remote machine does callback      */
-
-#define ISDN_NET_MAGIC      0x49344C02 /* for paranoia-checking             */
-
-/* Phone-list-element */
-typedef struct {
-  void *next;
-  char num[ISDN_MSNLEN];
-} isdn_net_phone;
-
-/*
-   Principles when extending structures for generic encapsulation protocol
-   ("concap") support:
-   - Stuff which is hardware specific (here i4l-specific) goes in 
-     the netdev -> local structure (here: isdn_net_local)
-   - Stuff which is encapsulation protocol specific goes in the structure
-     which holds the linux device structure (here: isdn_net_device)
-*/
-
-/* Local interface-data */
-typedef struct isdn_net_local_s {
-  ulong                  magic;
-  struct net_device_stats stats;       /* Ethernet Statistics              */
-  int                    isdn_device;  /* Index to isdn-device             */
-  int                    isdn_channel; /* Index to isdn-channel            */
-  int			 ppp_slot;     /* PPPD device slot number          */
-  int                    pre_device;   /* Preselected isdn-device          */
-  int                    pre_channel;  /* Preselected isdn-channel         */
-  int                    exclusive;    /* If non-zero idx to reserved chan.*/
-  int                    flags;        /* Connection-flags                 */
-  int                    dialretry;    /* Counter for Dialout-retries      */
-  int                    dialmax;      /* Max. Number of Dial-retries      */
-  int                    cbdelay;      /* Delay before Callback starts     */
-  int                    dtimer;       /* Timeout-counter for dialing      */
-  char                   msn[ISDN_MSNLEN]; /* MSNs/EAZs for this interface */
-  u_char                 cbhup;        /* Flag: Reject Call before Callback*/
-  u_char                 dialstate;    /* State for dialing                */
-  u_char                 p_encap;      /* Packet encapsulation             */
-                                       /*   0 = Ethernet over ISDN         */
-				       /*   1 = RAW-IP                     */
-                                       /*   2 = IP with type field         */
-  u_char                 l2_proto;     /* Layer-2-protocol                 */
-				       /* See ISDN_PROTO_L2..-constants in */
-                                       /* isdnif.h                         */
-                                       /*   0 = X75/LAPB with I-Frames     */
-				       /*   1 = X75/LAPB with UI-Frames    */
-				       /*   2 = X75/LAPB with BUI-Frames   */
-				       /*   3 = HDLC                       */
-  u_char                 l3_proto;     /* Layer-3-protocol                 */
-				       /* See ISDN_PROTO_L3..-constants in */
-                                       /* isdnif.h                         */
-                                       /*   0 = Transparent                */
-  int                    huptimer;     /* Timeout-counter for auto-hangup  */
-  int                    charge;       /* Counter for charging units       */
-  ulong                  chargetime;   /* Timer for Charging info          */
-  int                    hupflags;     /* Flags for charge-unit-hangup:    */
-				       /* bit0: chargeint is invalid       */
-				       /* bit1: Getting charge-interval    */
-                                       /* bit2: Do charge-unit-hangup      */
-                                       /* bit3: Do hangup even on incoming */
-  int                    outgoing;     /* Flag: outgoing call              */
-  int                    onhtime;      /* Time to keep link up             */
-  int                    chargeint;    /* Interval between charge-infos    */
-  int                    onum;         /* Flag: at least 1 outgoing number */
-  int                    cps;          /* current speed of this interface  */
-  int                    transcount;   /* byte-counter for cps-calculation */
-  int                    sqfull;       /* Flag: netdev-queue overloaded    */
-  ulong                  sqfull_stamp; /* Start-Time of overload           */
-  ulong                  slavedelay;   /* Dynamic bundling delaytime       */
-  int                    triggercps;   /* BogoCPS needed for trigger slave */
-  isdn_net_phone         *phone[2];    /* List of remote-phonenumbers      */
-				       /* phone[0] = Incoming Numbers      */
-				       /* phone[1] = Outgoing Numbers      */
-  isdn_net_phone         *dial;        /* Pointer to dialed number         */
-  struct net_device      *master;      /* Ptr to Master device for slaves  */
-  struct net_device      *slave;       /* Ptr to Slave device for masters  */
-  struct isdn_net_local_s *next;       /* Ptr to next link in bundle       */
-  struct isdn_net_local_s *last;       /* Ptr to last link in bundle       */
-  struct isdn_net_dev_s  *netdev;      /* Ptr to netdev                    */
-  struct sk_buff_head    super_tx_queue; /* List of supervisory frames to  */
-	                               /* be transmitted asap              */
-  atomic_t frame_cnt;                  /* number of frames currently       */
-                        	       /* queued in HL driver              */    
-                                       /* Ptr to orig. hard_header_cache   */
-  spinlock_t             xmit_lock;    /* used to protect the xmit path of */
-                                       /* a particular channel (including  */
-                                       /* the frame_cnt                    */
-
-  int  pppbind;                        /* ippp device for bindings         */
-  int					dialtimeout;	/* How long shall we try on dialing? (jiffies) */
-  int					dialwait;		/* How long shall we wait after failed attempt? (jiffies) */
-  ulong					dialstarted;	/* jiffies of first dialing-attempt */
-  ulong					dialwait_timer;	/* jiffies of earliest next dialing-attempt */
-  int					huptimeout;		/* How long will the connection be up? (seconds) */
-#ifdef CONFIG_ISDN_X25
-  struct concap_device_ops *dops;      /* callbacks used by encapsulator   */
-#endif
-  /* use an own struct for that in later versions */
-  ulong cisco_myseq;                   /* Local keepalive seq. for Cisco   */
-  ulong cisco_mineseen;                /* returned keepalive seq. from remote */
-  ulong cisco_yourseq;                 /* Remote keepalive seq. for Cisco  */
-  int cisco_keepalive_period;		/* keepalive period */
-  ulong cisco_last_slarp_in;		/* jiffie of last keepalive packet we received */
-  char cisco_line_state;		/* state of line according to keepalive packets */
-  char cisco_debserint;			/* debugging flag of cisco hdlc with slarp */
-  struct timer_list cisco_timer;
-  struct work_struct tqueue;
-} isdn_net_local;
-
-/* the interface itself */
-typedef struct isdn_net_dev_s {
-  isdn_net_local *local;
-  isdn_net_local *queue;               /* circular list of all bundled
-					  channels, which are currently
-					  online                           */
-  spinlock_t queue_lock;               /* lock to protect queue            */
-  void *next;                          /* Pointer to next isdn-interface   */
-  struct net_device *dev;              /* interface to upper levels        */
-#ifdef CONFIG_ISDN_PPP
-  ippp_bundle * pb;		/* pointer to the common bundle structure
-   			         * with the per-bundle data */
-#endif
-#ifdef CONFIG_ISDN_X25
-  struct concap_proto  *cprot; /* connection oriented encapsulation protocol */
-#endif
-
-} isdn_net_dev;
-
-/*===================== End of ip-over-ISDN stuff ===========================*/
-
-/*======================= Start of ISDN-tty stuff ===========================*/
-
-#define ISDN_ASYNC_MAGIC          0x49344C01 /* for paranoia-checking        */
-#define ISDN_SERIAL_XMIT_SIZE           1024 /* Default bufsize for write    */
-#define ISDN_SERIAL_XMIT_MAX            4000 /* Maximum bufsize for write    */
-
-#ifdef CONFIG_ISDN_AUDIO
-/* For using sk_buffs with audio we need some private variables
- * within each sk_buff. For this purpose, we declare a struct here,
- * and put it always at the private skb->cb data array. A few macros help
- * accessing the variables.
- */
-typedef struct _isdn_audio_data {
-  unsigned short dle_count;
-  unsigned char  lock;
-} isdn_audio_data_t;
-
-#define ISDN_AUDIO_SKB_DLECOUNT(skb)	(((isdn_audio_data_t *)&skb->cb[0])->dle_count)
-#define ISDN_AUDIO_SKB_LOCK(skb)	(((isdn_audio_data_t *)&skb->cb[0])->lock)
-#endif
-
-/* Private data of AT-command-interpreter */
-typedef struct atemu {
-	u_char       profile[ISDN_MODEM_NUMREG]; /* Modem-Regs. Profile 0              */
-	u_char       mdmreg[ISDN_MODEM_NUMREG];  /* Modem-Registers                    */
-	char         pmsn[ISDN_MSNLEN];          /* EAZ/MSNs Profile 0                 */
-	char         msn[ISDN_MSNLEN];           /* EAZ/MSN                            */
-	char         plmsn[ISDN_LMSNLEN];        /* Listening MSNs Profile 0           */
-	char         lmsn[ISDN_LMSNLEN];         /* Listening MSNs                     */
-	char         cpn[ISDN_MSNLEN];           /* CalledPartyNumber on incoming call */
-	char         connmsg[ISDN_CMSGLEN];	 /* CONNECT-Msg from HL-Driver	       */
-#ifdef CONFIG_ISDN_AUDIO
-	u_char       vpar[10];                   /* Voice-parameters                   */
-	int          lastDLE;                    /* Flag for voice-coding: DLE seen    */
-#endif
-	int          mdmcmdl;                    /* Length of Modem-Commandbuffer      */
-	int          pluscount;                  /* Counter for +++ sequence           */
-	u_long       lastplus;                   /* Timestamp of last +                */
-	int	     carrierwait;                /* Seconds of carrier waiting         */
-	char         mdmcmd[255];                /* Modem-Commandbuffer                */
-	unsigned int charge;                     /* Charge units of current connection */
-} atemu;
-
-/* Private data (similar to async_struct in <linux/serial.h>) */
-typedef struct modem_info {
-  int			magic;
-  struct tty_port	port;
-  int			x_char;		 /* xon/xoff character             */
-  int			mcr;		 /* Modem control register         */
-  int                   msr;             /* Modem status register          */
-  int                   lsr;             /* Line status register           */
-  int			line;
-  int                   online;          /* 1 = B-Channel is up, drop data */
-					 /* 2 = B-Channel is up, deliver d.*/
-  int                   dialing;         /* Dial in progress or ATA        */
-  int                   closing;
-  int                   rcvsched;        /* Receive needs schedule         */
-  int                   isdn_driver;	 /* Index to isdn-driver           */
-  int                   isdn_channel;    /* Index to isdn-channel          */
-  int                   drv_index;       /* Index to dev->usage            */
-  int                   ncarrier;        /* Flag: schedule NO CARRIER      */
-  unsigned char         last_cause[8];   /* Last cause message             */
-  unsigned char         last_num[ISDN_MSNLEN];
-	                                 /* Last phone-number              */
-  unsigned char         last_l2;         /* Last layer-2 protocol          */
-  unsigned char         last_si;         /* Last service                   */
-  unsigned char         last_lhup;       /* Last hangup local?             */
-  unsigned char         last_dir;        /* Last direction (in or out)     */
-  struct timer_list     nc_timer;        /* Timer for delayed NO CARRIER   */
-  int                   send_outstanding;/* # of outstanding send-requests */
-  int                   xmit_size;       /* max. # of chars in xmit_buf    */
-  int                   xmit_count;      /* # of chars in xmit_buf         */
-  struct sk_buff_head   xmit_queue;      /* transmit queue                 */
-  atomic_t              xmit_lock;       /* Semaphore for isdn_tty_write   */
-#ifdef CONFIG_ISDN_AUDIO
-  int                   vonline;         /* Voice-channel status           */
-					 /* Bit 0 = recording              */
-					 /* Bit 1 = playback               */
-					 /* Bit 2 = playback, DLE-ETX seen */
-  struct sk_buff_head   dtmf_queue;      /* queue for dtmf results         */
-  void                  *adpcms;         /* state for adpcm decompression  */
-  void                  *adpcmr;         /* state for adpcm compression    */
-  void                  *dtmf_state;     /* state for dtmf decoder         */
-  void                  *silence_state;  /* state for silence detection    */
-#endif
-#ifdef CONFIG_ISDN_TTY_FAX
-  struct T30_s		*fax;		 /* T30 Fax Group 3 data/interface */
-  int			faxonline;	 /* Fax-channel status             */
-#endif
-  atemu                 emu;             /* AT-emulator data               */
-  spinlock_t	        readlock;
-} modem_info;
-
-#define ISDN_MODEM_WINSIZE 8
-
-/* Description of one ISDN-tty */
-typedef struct _isdn_modem {
-  int                refcount;				/* Number of opens        */
-  struct tty_driver  *tty_modem;			/* tty-device             */
-  struct tty_struct  *modem_table[ISDN_MAX_CHANNELS];	/* ?? copied from Orig    */
-  struct ktermios     *modem_termios[ISDN_MAX_CHANNELS];
-  struct ktermios     *modem_termios_locked[ISDN_MAX_CHANNELS];
-  modem_info         info[ISDN_MAX_CHANNELS];	   /* Private data           */
-} isdn_modem_t;
-
-/*======================= End of ISDN-tty stuff ============================*/
-
-/*======================== Start of V.110 stuff ============================*/
-#define V110_BUFSIZE 1024
-
-typedef struct {
-	int nbytes;                    /* 1 Matrixbyte -> nbytes in stream     */
-	int nbits;                     /* Number of used bits in streambyte    */
-	unsigned char key;             /* Bitmask in stream eg. 11 (nbits=2)   */
-	int decodelen;                 /* Amount of data in decodebuf          */
-	int SyncInit;                  /* Number of sync frames to send        */
-	unsigned char *OnlineFrame;    /* Precalculated V110 idle frame        */
-	unsigned char *OfflineFrame;   /* Precalculated V110 sync Frame        */
-	int framelen;                  /* Length of frames                     */
-	int skbuser;                   /* Number of unacked userdata skbs      */
-	int skbidle;                   /* Number of unacked idle/sync skbs     */
-	int introducer;                /* Local vars for decoder               */
-	int dbit;
-	unsigned char b;
-	int skbres;                    /* space to reserve in outgoing skb     */
-	int maxsize;                   /* maxbufsize of lowlevel driver        */
-	unsigned char *encodebuf;      /* temporary buffer for encoding        */
-	unsigned char decodebuf[V110_BUFSIZE]; /* incomplete V110 matrices     */
-} isdn_v110_stream;
-
-/*========================= End of V.110 stuff =============================*/
-
-/*======================= Start of general stuff ===========================*/
-
-typedef struct {
-	char *next;
-	char *private;
-} infostruct;
-
-#define DRV_FLAG_RUNNING 1
-#define DRV_FLAG_REJBUS  2
-#define DRV_FLAG_LOADED  4
-
-/* Description of hardware-level-driver */
-typedef struct _isdn_driver {
-	ulong               online;           /* Channel-Online flags             */
-	ulong               flags;            /* Misc driver Flags                */
-	int                 locks;            /* Number of locks for this driver  */
-	int                 channels;         /* Number of channels               */
-	wait_queue_head_t   st_waitq;         /* Wait-Queue for status-read's     */
-	int                 maxbufsize;       /* Maximum Buffersize supported     */
-	unsigned long       pktcount;         /* Until now: unused                */
-	int                 stavail;          /* Chars avail on Status-device     */
-	isdn_if            *interface;        /* Interface to driver              */
-	int                *rcverr;           /* Error-counters for B-Ch.-receive */
-	int                *rcvcount;         /* Byte-counters for B-Ch.-receive  */
-#ifdef CONFIG_ISDN_AUDIO
-	unsigned long      DLEflag;           /* Flags: Insert DLE at next read   */
-#endif
-	struct sk_buff_head *rpqueue;         /* Pointers to start of Rcv-Queue   */
-	wait_queue_head_t  *rcv_waitq;       /* Wait-Queues for B-Channel-Reads  */
-	wait_queue_head_t  *snd_waitq;       /* Wait-Queue for B-Channel-Send's  */
-	char               msn2eaz[10][ISDN_MSNLEN];  /* Mapping-Table MSN->EAZ   */
-} isdn_driver_t;
-
-/* Main driver-data */
-typedef struct isdn_devt {
-	struct module     *owner;
-	spinlock_t	  lock;
-	unsigned short    flags;		      /* Bitmapped Flags:           */
-	int               drivers;		      /* Current number of drivers  */
-	int               channels;		      /* Current number of channels */
-	int               net_verbose;                /* Verbose-Flag               */
-	int               modempoll;		      /* Flag: tty-read active      */
-	spinlock_t	  timerlock;
-	int               tflags;                     /* Timer-Flags:               */
-	/*  see ISDN_TIMER_..defines  */
-	int               global_flags;
-	infostruct        *infochain;                 /* List of open info-devs.    */
-	wait_queue_head_t info_waitq;                 /* Wait-Queue for isdninfo    */
-	struct timer_list timer;		      /* Misc.-function Timer       */
-	int               chanmap[ISDN_MAX_CHANNELS]; /* Map minor->device-channel  */
-	int               drvmap[ISDN_MAX_CHANNELS];  /* Map minor->driver-index    */
-	int               usage[ISDN_MAX_CHANNELS];   /* Used by tty/ip/voice       */
-	char              num[ISDN_MAX_CHANNELS][ISDN_MSNLEN];
-	/* Remote number of active ch.*/
-	int               m_idx[ISDN_MAX_CHANNELS];   /* Index for mdm....          */
-	isdn_driver_t     *drv[ISDN_MAX_DRIVERS];     /* Array of drivers           */
-	isdn_net_dev      *netdev;		      /* Linked list of net-if's    */
-	char              drvid[ISDN_MAX_DRIVERS][20];/* Driver-ID                 */
-	struct task_struct *profd;                    /* For iprofd                 */
-	isdn_modem_t      mdm;			      /* tty-driver-data            */
-	isdn_net_dev      *rx_netdev[ISDN_MAX_CHANNELS]; /* rx netdev-pointers     */
-	isdn_net_dev      *st_netdev[ISDN_MAX_CHANNELS]; /* stat netdev-pointers   */
-	ulong             ibytes[ISDN_MAX_CHANNELS];  /* Statistics incoming bytes  */
-	ulong             obytes[ISDN_MAX_CHANNELS];  /* Statistics outgoing bytes  */
-	int               v110emu[ISDN_MAX_CHANNELS]; /* V.110 emulator-mode 0=none */
-	atomic_t          v110use[ISDN_MAX_CHANNELS]; /* Usage-Semaphore for stream */
-	isdn_v110_stream  *v110[ISDN_MAX_CHANNELS];   /* V.110 private data         */
-	struct mutex      mtx;                        /* serialize list access*/
-	unsigned long     global_features;
-} isdn_dev;
-
-extern isdn_dev *dev;
-
-
-#endif /* __ISDN_H__ */
diff --git a/include/linux/isdn_divertif.h b/include/linux/isdn_divertif.h
deleted file mode 100644
index 19ab361f9f07..000000000000
--- a/include/linux/isdn_divertif.h
+++ /dev/null
@@ -1,35 +0,0 @@
-/* $Id: isdn_divertif.h,v 1.4.6.1 2001/09/23 22:25:05 kai Exp $
- *
- * Header for the diversion supplementary interface for i4l.
- *
- * Author    Werner Cornelius (werner@titro.de)
- * Copyright by Werner Cornelius (werner@titro.de)
- *
- * This software may be used and distributed according to the terms
- * of the GNU General Public License, incorporated herein by reference.
- *
- */
-#ifndef _LINUX_ISDN_DIVERTIF_H
-#define _LINUX_ISDN_DIVERTIF_H
-
-#include <linux/isdnif.h>
-#include <linux/types.h>
-#include <uapi/linux/isdn_divertif.h>
-
-/***************************************************************/
-/* structure exchanging data between isdn hl and divert module */
-/***************************************************************/ 
-typedef struct
-  { ulong if_magic; /* magic info and version */
-    int cmd; /* command */
-    int (*stat_callback)(isdn_ctrl *); /* supplied by divert module when calling */
-    int (*ll_cmd)(isdn_ctrl *); /* supplied by hl on return */
-    char * (*drv_to_name)(int); /* map a driver id to name, supplied by hl */
-    int (*name_to_drv)(char *); /* map a driver id to name, supplied by hl */
-  } isdn_divert_if;
-
-/*********************/
-/* function register */
-/*********************/
-extern int DIVERT_REG_NAME(isdn_divert_if *);
-#endif /* _LINUX_ISDN_DIVERTIF_H */
diff --git a/include/linux/isdn_ppp.h b/include/linux/isdn_ppp.h
deleted file mode 100644
index a0070c6dfaf8..000000000000
--- a/include/linux/isdn_ppp.h
+++ /dev/null
@@ -1,194 +0,0 @@
-/* Linux ISDN subsystem, sync PPP, interface to ipppd
- *
- * Copyright 1994-1999  by Fritz Elfert (fritz@isdn4linux.de)
- * Copyright 1995,96    Thinking Objects Software GmbH Wuerzburg
- * Copyright 1995,96    by Michael Hipp (Michael.Hipp@student.uni-tuebingen.de)
- * Copyright 2000-2002  by Kai Germaschewski (kai@germaschewski.name)
- *
- * This software may be used and distributed according to the terms
- * of the GNU General Public License, incorporated herein by reference.
- *
- */
-#ifndef _LINUX_ISDN_PPP_H
-#define _LINUX_ISDN_PPP_H
-
-
-
-
-#ifdef CONFIG_IPPP_FILTER
-#include <linux/filter.h>
-#endif
-#include <uapi/linux/isdn_ppp.h>
-
-#define DECOMP_ERR_NOMEM	(-10)
-
-#define MP_END_FRAG    0x40
-#define MP_BEGIN_FRAG  0x80
-
-#define MP_MAX_QUEUE_LEN	16
-
-/*
- * We need a way for the decompressor to influence the generation of CCP
- * Reset-Requests in a variety of ways. The decompressor is already returning
- * a lot of information (generated skb length, error conditions) so we use
- * another parameter. This parameter is a pointer to a structure which is
- * to be marked valid by the decompressor and only in this case is ever used.
- * Furthermore, the only case where this data is used is when the decom-
- * pressor returns DECOMP_ERROR.
- *
- * We use this same struct for the reset entry of the compressor to commu-
- * nicate to its caller how to deal with sending of a Reset Ack. In this
- * case, expra is not used, but other options still apply (suppressing
- * sending with rsend, appending arbitrary data, etc).
- */
-
-#define IPPP_RESET_MAXDATABYTES	32
-
-struct isdn_ppp_resetparams {
-  unsigned char valid:1;	/* rw Is this structure filled at all ? */
-  unsigned char rsend:1;	/* rw Should we send one at all ? */
-  unsigned char idval:1;	/* rw Is the id field valid ? */
-  unsigned char dtval:1;	/* rw Is the data field valid ? */
-  unsigned char expra:1;	/* rw Is an Ack expected for this Req ? */
-  unsigned char id;		/* wo Send CCP ResetReq with this id */
-  unsigned short maxdlen;	/* ro Max bytes to be stored in data field */
-  unsigned short dlen;		/* rw Bytes stored in data field */
-  unsigned char *data;		/* wo Data for ResetReq info field */
-};
-
-/*
- * this is an 'old friend' from ppp-comp.h under a new name 
- * check the original include for more information
- */
-struct isdn_ppp_compressor {
-  struct isdn_ppp_compressor *next, *prev;
-  struct module *owner;
-  int num; /* CCP compression protocol number */
-  
-  void *(*alloc) (struct isdn_ppp_comp_data *);
-  void (*free) (void *state);
-  int  (*init) (void *state, struct isdn_ppp_comp_data *,
-		int unit,int debug);
-  
-  /* The reset entry needs to get more exact information about the
-     ResetReq or ResetAck it was called with. The parameters are
-     obvious. If reset is called without a Req or Ack frame which
-     could be handed into it, code MUST be set to 0. Using rsparm,
-     the reset entry can control if and how a ResetAck is returned. */
-  
-  void (*reset) (void *state, unsigned char code, unsigned char id,
-		 unsigned char *data, unsigned len,
-		 struct isdn_ppp_resetparams *rsparm);
-  
-  int  (*compress) (void *state, struct sk_buff *in,
-		    struct sk_buff *skb_out, int proto);
-  
-	int  (*decompress) (void *state,struct sk_buff *in,
-			    struct sk_buff *skb_out,
-			    struct isdn_ppp_resetparams *rsparm);
-  
-  void (*incomp) (void *state, struct sk_buff *in,int proto);
-  void (*stat) (void *state, struct compstat *stats);
-};
-
-extern int isdn_ppp_register_compressor(struct isdn_ppp_compressor *);
-extern int isdn_ppp_unregister_compressor(struct isdn_ppp_compressor *);
-extern int isdn_ppp_dial_slave(char *);
-extern int isdn_ppp_hangup_slave(char *);
-
-typedef struct {
-  unsigned long seqerrs;
-  unsigned long frame_drops;
-  unsigned long overflows;
-  unsigned long max_queue_len;
-} isdn_mppp_stats;
-
-typedef struct {
-  int mp_mrru;                        /* unused                             */
-  struct sk_buff * frags;	/* fragments sl list -- use skb->next */
-  long frames;			/* number of frames in the frame list */
-  unsigned int seq;		/* last processed packet seq #: any packets
-  				 * with smaller seq # will be dropped
-				 * unconditionally */
-  spinlock_t lock;
-  int ref_ct;				 
-  /* statistics */
-  isdn_mppp_stats stats;
-} ippp_bundle;
-
-#define NUM_RCV_BUFFS     64
-
-struct ippp_buf_queue {
-  struct ippp_buf_queue *next;
-  struct ippp_buf_queue *last;
-  char *buf;                 /* NULL here indicates end of queue */
-  int len;
-};
-
-/* The data structure for one CCP reset transaction */
-enum ippp_ccp_reset_states {
-  CCPResetIdle,
-  CCPResetSentReq,
-  CCPResetRcvdReq,
-  CCPResetSentAck,
-  CCPResetRcvdAck
-};
-
-struct ippp_ccp_reset_state {
-  enum ippp_ccp_reset_states state;	/* State of this transaction */
-  struct ippp_struct *is;		/* Backlink to device stuff */
-  unsigned char id;			/* Backlink id index */
-  unsigned char ta:1;			/* The timer is active (flag) */
-  unsigned char expra:1;		/* We expect a ResetAck at all */
-  int dlen;				/* Databytes stored in data */
-  struct timer_list timer;		/* For timeouts/retries */
-  /* This is a hack but seems sufficient for the moment. We do not want
-     to have this be yet another allocation for some bytes, it is more
-     memory management overhead than the whole mess is worth. */
-  unsigned char data[IPPP_RESET_MAXDATABYTES];
-};
-
-/* The data structure keeping track of the currently outstanding CCP Reset
-   transactions. */
-struct ippp_ccp_reset {
-  struct ippp_ccp_reset_state *rs[256];	/* One per possible id */
-  unsigned char lastid;			/* Last id allocated by the engine */
-};
-
-struct ippp_struct {
-  struct ippp_struct *next_link;
-  int state;
-  spinlock_t buflock;
-  struct ippp_buf_queue rq[NUM_RCV_BUFFS]; /* packet queue for isdn_ppp_read() */
-  struct ippp_buf_queue *first;  /* pointer to (current) first packet */
-  struct ippp_buf_queue *last;   /* pointer to (current) last used packet in queue */
-  wait_queue_head_t wq;
-  struct task_struct *tk;
-  unsigned int mpppcfg;
-  unsigned int pppcfg;
-  unsigned int mru;
-  unsigned int mpmru;
-  unsigned int mpmtu;
-  unsigned int maxcid;
-  struct isdn_net_local_s *lp;
-  int unit;
-  int minor;
-  unsigned int last_link_seqno;
-  long mp_seqno;
-#ifdef CONFIG_ISDN_PPP_VJ
-  unsigned char *cbuf;
-  struct slcompress *slcomp;
-#endif
-#ifdef CONFIG_IPPP_FILTER
-  struct bpf_prog *pass_filter;   /* filter for packets to pass */
-  struct bpf_prog *active_filter; /* filter for pkts to reset idle */
-#endif
-  unsigned long debug;
-  struct isdn_ppp_compressor *compressor,*decompressor;
-  struct isdn_ppp_compressor *link_compressor,*link_decompressor;
-  void *decomp_stat,*comp_stat,*link_decomp_stat,*link_comp_stat;
-  struct ippp_ccp_reset *reset;	/* Allocated on demand, may never be needed */
-  unsigned long compflags;
-};
-
-#endif /* _LINUX_ISDN_PPP_H */
diff --git a/include/linux/isdnif.h b/include/linux/isdnif.h
deleted file mode 100644
index 8d80fdc68647..000000000000
--- a/include/linux/isdnif.h
+++ /dev/null
@@ -1,505 +0,0 @@
-/* $Id: isdnif.h,v 1.43.2.2 2004/01/12 23:08:35 keil Exp $
- *
- * Linux ISDN subsystem
- * Definition of the interface between the subsystem and its low-level drivers.
- *
- * Copyright 1994,95,96 by Fritz Elfert (fritz@isdn4linux.de)
- * Copyright 1995,96    Thinking Objects Software GmbH Wuerzburg
- * 
- * This software may be used and distributed according to the terms
- * of the GNU General Public License, incorporated herein by reference.
- *
- */
-#ifndef __ISDNIF_H__
-#define __ISDNIF_H__
-
-
-#include <linux/skbuff.h>
-#include <uapi/linux/isdnif.h>
-
-/***************************************************************************/
-/* Extensions made by Werner Cornelius (werner@ikt.de)                     */
-/*                                                                         */ 
-/* The proceed command holds a incoming call in a state to leave processes */
-/* enough time to check whether ist should be accepted.                    */
-/* The PROT_IO Command extends the interface to make protocol dependent    */
-/* features available (call diversion, call waiting...).                   */
-/*                                                                         */ 
-/* The PROT_IO Command is executed with the desired driver id and the arg  */
-/* parameter coded as follows:                                             */
-/* The lower 8 bits of arg contain the desired protocol from ISDN_PTYPE    */
-/* definitions. The upper 24 bits represent the protocol specific cmd/stat.*/
-/* Any additional data is protocol and command specific.                   */
-/* This mechanism also applies to the statcallb callback STAT_PROT.        */    
-/*                                                                         */
-/* This suggested extension permits an easy expansion of protocol specific */
-/* handling. Extensions may be added at any time without changing the HL   */
-/* driver code and not getting conflicts without certifications.           */
-/* The well known CAPI 2.0 interface handles such extensions in a similar  */
-/* way. Perhaps a protocol specific module may be added and separately     */
-/* loaded and linked to the basic isdn module for handling.                */                    
-/***************************************************************************/
-
-/*****************/
-/* DSS1 commands */ 
-/*****************/
-#define DSS1_CMD_INVOKE       ((0x00 << 8) | ISDN_PTYPE_EURO)   /* invoke a supplementary service */
-#define DSS1_CMD_INVOKE_ABORT ((0x01 << 8) | ISDN_PTYPE_EURO)   /* abort a invoke cmd */
-
-/*******************************/
-/* DSS1 Status callback values */
-/*******************************/
-#define DSS1_STAT_INVOKE_RES  ((0x80 << 8) | ISDN_PTYPE_EURO)   /* Result for invocation */
-#define DSS1_STAT_INVOKE_ERR  ((0x81 << 8) | ISDN_PTYPE_EURO)   /* Error Return for invocation */
-#define DSS1_STAT_INVOKE_BRD  ((0x82 << 8) | ISDN_PTYPE_EURO)   /* Deliver invoke broadcast info */
-
-
-/*********************************************************************/
-/* structures for DSS1 commands and callback                         */
-/*                                                                   */
-/* An action is invoked by sending a DSS1_CMD_INVOKE. The ll_id, proc*/
-/* timeout, datalen and data fields must be set before calling.      */
-/*                                                                   */
-/* The return value is a positive hl_id value also delivered in the  */
-/* hl_id field. A value of zero signals no more left hl_id capacitys.*/
-/* A negative return value signals errors in LL. So if the return    */
-/* value is <= 0 no action in LL will be taken -> request ignored    */
-/*                                                                   */
-/* The timeout field must be filled with a positive value specifying */
-/* the amount of time the INVOKED process waits for a reaction from  */
-/* the network.                                                      */
-/* If a response (either error or result) is received during this    */
-/* intervall, a reporting callback is initiated and the process will */
-/* be deleted, the hl identifier will be freed.                      */
-/* If no response is received during the specified intervall, a error*/
-/* callback is initiated with timeout set to -1 and a datalen set    */
-/* to 0.                                                             */
-/* If timeout is set to a value <= 0 during INVOCATION the process is*/
-/* immediately deleted after sending the data. No callback occurs !  */
-/*                                                                   */
-/* A currently waiting process may be aborted with INVOKE_ABORT. No  */
-/* callback will occur when a process has been aborted.              */
-/*                                                                   */
-/* Broadcast invoke frames from the network are reported via the     */
-/* STAT_INVOKE_BRD callback. The ll_id is set to 0, the other fields */
-/* are supplied by the network and not by the HL.                    */   
-/*********************************************************************/
-
-/*****************/
-/* NI1 commands */ 
-/*****************/
-#define NI1_CMD_INVOKE       ((0x00 << 8) | ISDN_PTYPE_NI1)   /* invoke a supplementary service */
-#define NI1_CMD_INVOKE_ABORT ((0x01 << 8) | ISDN_PTYPE_NI1)   /* abort a invoke cmd */
-
-/*******************************/
-/* NI1 Status callback values */
-/*******************************/
-#define NI1_STAT_INVOKE_RES  ((0x80 << 8) | ISDN_PTYPE_NI1)   /* Result for invocation */
-#define NI1_STAT_INVOKE_ERR  ((0x81 << 8) | ISDN_PTYPE_NI1)   /* Error Return for invocation */
-#define NI1_STAT_INVOKE_BRD  ((0x82 << 8) | ISDN_PTYPE_NI1)   /* Deliver invoke broadcast info */
-
-typedef struct
-  { ulong ll_id; /* ID supplied by LL when executing    */
-		 /* a command and returned by HL for    */
-                 /* INVOKE_RES and INVOKE_ERR           */
-    int hl_id;   /* ID supplied by HL when called       */
-                 /* for executing a cmd and delivered   */
-                 /* for results and errors              */
-                 /* must be supplied by LL when aborting*/  
-    int proc;    /* invoke procedure used by CMD_INVOKE */
-                 /* returned by callback and broadcast  */ 
-    int timeout; /* timeout for INVOKE CMD in ms        */
-                 /* -1  in stat callback when timed out */
-                 /* error value when error callback     */
-    int datalen; /* length of cmd or stat data          */
-    u_char *data;/* pointer to data delivered or send   */
-  } isdn_cmd_stat;
-
-/*
- * Commands from linklevel to lowlevel
- *
- */
-#define ISDN_CMD_IOCTL    0       /* Perform ioctl                         */
-#define ISDN_CMD_DIAL     1       /* Dial out                              */
-#define ISDN_CMD_ACCEPTD  2       /* Accept an incoming call on D-Chan.    */
-#define ISDN_CMD_ACCEPTB  3       /* Request B-Channel connect.            */
-#define ISDN_CMD_HANGUP   4       /* Hangup                                */
-#define ISDN_CMD_CLREAZ   5       /* Clear EAZ(s) of channel               */
-#define ISDN_CMD_SETEAZ   6       /* Set EAZ(s) of channel                 */
-#define ISDN_CMD_GETEAZ   7       /* Get EAZ(s) of channel                 */
-#define ISDN_CMD_SETSIL   8       /* Set Service-Indicator-List of channel */
-#define ISDN_CMD_GETSIL   9       /* Get Service-Indicator-List of channel */
-#define ISDN_CMD_SETL2   10       /* Set B-Chan. Layer2-Parameter          */
-#define ISDN_CMD_GETL2   11       /* Get B-Chan. Layer2-Parameter          */
-#define ISDN_CMD_SETL3   12       /* Set B-Chan. Layer3-Parameter          */
-#define ISDN_CMD_GETL3   13       /* Get B-Chan. Layer3-Parameter          */
-// #define ISDN_CMD_LOCK    14       /* Signal usage by upper levels          */
-// #define ISDN_CMD_UNLOCK  15       /* Release usage-lock                    */
-#define ISDN_CMD_SUSPEND 16       /* Suspend connection                    */
-#define ISDN_CMD_RESUME  17       /* Resume connection                     */
-#define ISDN_CMD_PROCEED 18       /* Proceed with call establishment       */
-#define ISDN_CMD_ALERT   19       /* Alert after Proceeding                */
-#define ISDN_CMD_REDIR   20       /* Redir a incoming call                 */
-#define ISDN_CMD_PROT_IO 21       /* Protocol specific commands            */
-#define CAPI_PUT_MESSAGE 22       /* CAPI message send down or up          */
-#define ISDN_CMD_FAXCMD  23       /* FAX commands to HL-driver             */
-#define ISDN_CMD_AUDIO   24       /* DSP, DTMF, ... settings               */
-
-/*
- * Status-Values delivered from lowlevel to linklevel via
- * statcallb().
- *
- */
-#define ISDN_STAT_STAVAIL 256    /* Raw status-data available             */
-#define ISDN_STAT_ICALL   257    /* Incoming call detected                */
-#define ISDN_STAT_RUN     258    /* Signal protocol-code is running       */
-#define ISDN_STAT_STOP    259    /* Signal halt of protocol-code          */
-#define ISDN_STAT_DCONN   260    /* Signal D-Channel connect              */
-#define ISDN_STAT_BCONN   261    /* Signal B-Channel connect              */
-#define ISDN_STAT_DHUP    262    /* Signal D-Channel disconnect           */
-#define ISDN_STAT_BHUP    263    /* Signal B-Channel disconnect           */
-#define ISDN_STAT_CINF    264    /* Charge-Info                           */
-#define ISDN_STAT_LOAD    265    /* Signal new lowlevel-driver is loaded  */
-#define ISDN_STAT_UNLOAD  266    /* Signal unload of lowlevel-driver      */
-#define ISDN_STAT_BSENT   267    /* Signal packet sent                    */
-#define ISDN_STAT_NODCH   268    /* Signal no D-Channel                   */
-#define ISDN_STAT_ADDCH   269    /* Add more Channels                     */
-#define ISDN_STAT_CAUSE   270    /* Cause-Message                         */
-#define ISDN_STAT_ICALLW  271    /* Incoming call without B-chan waiting  */
-#define ISDN_STAT_REDIR   272    /* Redir result                          */
-#define ISDN_STAT_PROT    273    /* protocol IO specific callback         */
-#define ISDN_STAT_DISPLAY 274    /* deliver a received display message    */
-#define ISDN_STAT_L1ERR   275    /* Signal Layer-1 Error                  */
-#define ISDN_STAT_FAXIND  276    /* FAX indications from HL-driver        */
-#define ISDN_STAT_AUDIO   277    /* DTMF, DSP indications                 */
-#define ISDN_STAT_DISCH   278    /* Disable/Enable channel usage          */
-
-/*
- * Audio commands
- */
-#define ISDN_AUDIO_SETDD	0	/* Set DTMF detection           */
-#define ISDN_AUDIO_DTMF		1	/* Rx/Tx DTMF                   */
-
-/*
- * Values for errcode field
- */
-#define ISDN_STAT_L1ERR_SEND 1
-#define ISDN_STAT_L1ERR_RECV 2
-
-/*
- * Values for feature-field of interface-struct.
- */
-/* Layer 2 */
-#define ISDN_FEATURE_L2_X75I    (0x0001 << ISDN_PROTO_L2_X75I)
-#define ISDN_FEATURE_L2_X75UI   (0x0001 << ISDN_PROTO_L2_X75UI)
-#define ISDN_FEATURE_L2_X75BUI  (0x0001 << ISDN_PROTO_L2_X75BUI)
-#define ISDN_FEATURE_L2_HDLC    (0x0001 << ISDN_PROTO_L2_HDLC)
-#define ISDN_FEATURE_L2_TRANS   (0x0001 << ISDN_PROTO_L2_TRANS)
-#define ISDN_FEATURE_L2_X25DTE  (0x0001 << ISDN_PROTO_L2_X25DTE)
-#define ISDN_FEATURE_L2_X25DCE  (0x0001 << ISDN_PROTO_L2_X25DCE)
-#define ISDN_FEATURE_L2_V11096  (0x0001 << ISDN_PROTO_L2_V11096)
-#define ISDN_FEATURE_L2_V11019  (0x0001 << ISDN_PROTO_L2_V11019)
-#define ISDN_FEATURE_L2_V11038  (0x0001 << ISDN_PROTO_L2_V11038)
-#define ISDN_FEATURE_L2_MODEM   (0x0001 << ISDN_PROTO_L2_MODEM)
-#define ISDN_FEATURE_L2_FAX	(0x0001 << ISDN_PROTO_L2_FAX)
-#define ISDN_FEATURE_L2_HDLC_56K (0x0001 << ISDN_PROTO_L2_HDLC_56K)
-
-#define ISDN_FEATURE_L2_MASK    (0x0FFFF) /* Max. 16 protocols */
-#define ISDN_FEATURE_L2_SHIFT   (0)
-
-/* Layer 3 */
-#define ISDN_FEATURE_L3_TRANS   (0x10000 << ISDN_PROTO_L3_TRANS)
-#define ISDN_FEATURE_L3_TRANSDSP (0x10000 << ISDN_PROTO_L3_TRANSDSP)
-#define ISDN_FEATURE_L3_FCLASS2	(0x10000 << ISDN_PROTO_L3_FCLASS2)
-#define ISDN_FEATURE_L3_FCLASS1	(0x10000 << ISDN_PROTO_L3_FCLASS1)
-
-#define ISDN_FEATURE_L3_MASK    (0x0FF0000) /* Max. 8 Protocols */
-#define ISDN_FEATURE_L3_SHIFT   (16)
-
-/* Signaling */
-#define ISDN_FEATURE_P_UNKNOWN  (0x1000000 << ISDN_PTYPE_UNKNOWN)
-#define ISDN_FEATURE_P_1TR6     (0x1000000 << ISDN_PTYPE_1TR6)
-#define ISDN_FEATURE_P_EURO     (0x1000000 << ISDN_PTYPE_EURO)
-#define ISDN_FEATURE_P_NI1      (0x1000000 << ISDN_PTYPE_NI1)
-
-#define ISDN_FEATURE_P_MASK     (0x0FF000000) /* Max. 8 Protocols */
-#define ISDN_FEATURE_P_SHIFT    (24)
-
-typedef struct setup_parm {
-    unsigned char phone[32];	/* Remote Phone-Number */
-    unsigned char eazmsn[32];	/* Local EAZ or MSN    */
-    unsigned char si1;      /* Service Indicator 1 */
-    unsigned char si2;      /* Service Indicator 2 */
-    unsigned char plan;     /* Numbering plan      */
-    unsigned char screen;   /* Screening info      */
-} setup_parm;
-
-
-#ifdef CONFIG_ISDN_TTY_FAX
-/* T.30 Fax G3 */
-
-#define FAXIDLEN 21
-
-typedef struct T30_s {
-	/* session parameters */
-	__u8 resolution;
-	__u8 rate;
-	__u8 width;
-	__u8 length;
-	__u8 compression;
-	__u8 ecm;
-	__u8 binary;
-	__u8 scantime;
-	__u8 id[FAXIDLEN];
-	/* additional parameters */
-	__u8 phase;
-	__u8 direction;
-	__u8 code;
-	__u8 badlin;
-	__u8 badmul;
-	__u8 bor;
-	__u8 fet;
-	__u8 pollid[FAXIDLEN];
-	__u8 cq;
-	__u8 cr;
-	__u8 ctcrty;
-	__u8 minsp;
-	__u8 phcto;
-	__u8 rel;
-	__u8 nbc;
-	/* remote station parameters */
-	__u8 r_resolution;
-	__u8 r_rate;
-	__u8 r_width;
-	__u8 r_length;
-	__u8 r_compression;
-	__u8 r_ecm;
-	__u8 r_binary;
-	__u8 r_scantime;
-	__u8 r_id[FAXIDLEN];
-	__u8 r_code;
-} __packed T30_s;
-
-#define ISDN_TTY_FAX_CONN_IN	0
-#define ISDN_TTY_FAX_CONN_OUT	1
-
-#define ISDN_TTY_FAX_FCON	0
-#define ISDN_TTY_FAX_DIS 	1
-#define ISDN_TTY_FAX_FTT 	2
-#define ISDN_TTY_FAX_MCF 	3
-#define ISDN_TTY_FAX_DCS 	4
-#define ISDN_TTY_FAX_TRAIN_OK	5
-#define ISDN_TTY_FAX_EOP 	6
-#define ISDN_TTY_FAX_EOM 	7
-#define ISDN_TTY_FAX_MPS 	8
-#define ISDN_TTY_FAX_DTC 	9
-#define ISDN_TTY_FAX_RID 	10
-#define ISDN_TTY_FAX_HNG 	11
-#define ISDN_TTY_FAX_DT  	12
-#define ISDN_TTY_FAX_FCON_I	13
-#define ISDN_TTY_FAX_DR  	14
-#define ISDN_TTY_FAX_ET  	15
-#define ISDN_TTY_FAX_CFR 	16
-#define ISDN_TTY_FAX_PTS 	17
-#define ISDN_TTY_FAX_SENT	18
-
-#define ISDN_FAX_PHASE_IDLE	0
-#define ISDN_FAX_PHASE_A	1
-#define ISDN_FAX_PHASE_B   	2
-#define ISDN_FAX_PHASE_C   	3
-#define ISDN_FAX_PHASE_D   	4
-#define ISDN_FAX_PHASE_E   	5
-
-#endif /* TTY_FAX */
-
-#define ISDN_FAX_CLASS1_FAE	0
-#define ISDN_FAX_CLASS1_FTS	1
-#define ISDN_FAX_CLASS1_FRS	2
-#define ISDN_FAX_CLASS1_FTM	3
-#define ISDN_FAX_CLASS1_FRM	4
-#define ISDN_FAX_CLASS1_FTH	5
-#define ISDN_FAX_CLASS1_FRH	6
-#define ISDN_FAX_CLASS1_CTRL	7
-
-#define ISDN_FAX_CLASS1_OK	0
-#define ISDN_FAX_CLASS1_CONNECT	1
-#define ISDN_FAX_CLASS1_NOCARR	2
-#define ISDN_FAX_CLASS1_ERROR	3
-#define ISDN_FAX_CLASS1_FCERROR	4
-#define ISDN_FAX_CLASS1_QUERY	5
-
-typedef struct {
-	__u8	cmd;
-	__u8	subcmd;
-	__u8	para[50];
-} aux_s;
-
-#define AT_COMMAND	0
-#define AT_EQ_VALUE	1
-#define AT_QUERY	2
-#define AT_EQ_QUERY	3
-
-/* CAPI structs */
-
-/* this is compatible to the old union size */
-#define MAX_CAPI_PARA_LEN 50
-
-typedef struct {
-	/* Header */
-	__u16 Length;
-	__u16 ApplId;
-	__u8 Command;
-	__u8 Subcommand;
-	__u16 Messagenumber;
-
-	/* Parameter */
-	union {
-		__u32 Controller;
-		__u32 PLCI;
-		__u32 NCCI;
-	} adr;
-	__u8 para[MAX_CAPI_PARA_LEN];
-} capi_msg;
-
-/*
- * Structure for exchanging above infos
- *
- */
-typedef struct {
-	int   driver;		/* Lowlevel-Driver-ID            */
-	int   command;		/* Command or Status (see above) */
-	ulong arg;		/* Additional Data               */
-	union {
-		ulong errcode;	/* Type of error with STAT_L1ERR	*/
-		int length;	/* Amount of bytes sent with STAT_BSENT	*/
-		u_char num[50];	/* Additional Data			*/
-		setup_parm setup;/* For SETUP msg			*/
-		capi_msg cmsg;	/* For CAPI like messages		*/
-		char display[85];/* display message data		*/ 
-		isdn_cmd_stat isdn_io; /* ISDN IO-parameter/result	*/
-		aux_s aux;	/* for modem commands/indications	*/
-#ifdef CONFIG_ISDN_TTY_FAX
-		T30_s	*fax;	/* Pointer to ttys fax struct		*/
-#endif
-		ulong userdata;	/* User Data */
-	} parm;
-} isdn_ctrl;
-
-#define dss1_io    isdn_io
-#define ni1_io     isdn_io
-
-/*
- * The interface-struct itself (initialized at load-time of lowlevel-driver)
- *
- * See Documentation/isdn/INTERFACE for a description, how the communication
- * between the ISDN subsystem and its drivers is done.
- *
- */
-typedef struct {
-  struct module *owner;
-
-  /* Number of channels supported by this driver
-   */
-  int channels;
-
-  /* 
-   * Maximum Size of transmit/receive-buffer this driver supports.
-   */
-  int maxbufsize;
-
-  /* Feature-Flags for this driver.
-   * See defines ISDN_FEATURE_... for Values
-   */
-  unsigned long features;
-
-  /*
-   * Needed for calculating
-   * dev->hard_header_len = linklayer header + hl_hdrlen;
-   * Drivers, not supporting sk_buff's should set this to 0.
-   */
-  unsigned short hl_hdrlen;
-
-  /*
-   * Receive-Callback using sk_buff's
-   * Parameters:
-   *             int                    Driver-ID
-   *             int                    local channel-number (0 ...)
-   *             struct sk_buff *skb    received Data
-   */
-  void (*rcvcallb_skb)(int, int, struct sk_buff *);
-
-  /* Status-Callback
-   * Parameters:
-   *             isdn_ctrl*
-   *                   driver  = Driver ID.
-   *                   command = One of above ISDN_STAT_... constants.
-   *                   arg     = depending on status-type.
-   *                   num     = depending on status-type.
-   */
-  int (*statcallb)(isdn_ctrl*);
-
-  /* Send command
-   * Parameters:
-   *             isdn_ctrl*
-   *                   driver  = Driver ID.
-   *                   command = One of above ISDN_CMD_... constants.
-   *                   arg     = depending on command.
-   *                   num     = depending on command.
-   */
-  int (*command)(isdn_ctrl*);
-
-  /*
-   * Send data using sk_buff's
-   * Parameters:
-   *             int                    driverId
-   *             int                    local channel-number (0...)
-   *             int                    Flag: Need ACK for this packet.
-   *             struct sk_buff *skb    Data to send
-   */
-  int (*writebuf_skb) (int, int, int, struct sk_buff *);
-
-  /* Send raw D-Channel-Commands
-   * Parameters:
-   *             u_char pointer data
-   *             int    length of data
-   *             int    driverId
-   *             int    local channel-number (0 ...)
-   */
-  int (*writecmd)(const u_char __user *, int, int, int);
-
-  /* Read raw Status replies
-   *             u_char pointer data (volatile)
-   *             int    length of buffer
-   *             int    driverId
-   *             int    local channel-number (0 ...)
-   */
-  int (*readstat)(u_char __user *, int, int, int);
-
-  char id[20];
-} isdn_if;
-
-/*
- * Function which must be called by lowlevel-driver at loadtime with
- * the following fields of above struct set:
- *
- * channels     Number of channels that will be supported.
- * hl_hdrlen    Space to preserve in sk_buff's when sending. Drivers, not
- *              supporting sk_buff's should set this to 0.
- * command      Address of Command-Handler.
- * features     Bitwise coded Features of this driver. (use ISDN_FEATURE_...)
- * writebuf_skb Address of Skbuff-Send-Handler.
- * writecmd        "    "  D-Channel  " which accepts raw D-Ch-Commands.
- * readstat        "    "  D-Channel  " which delivers raw Status-Data.
- *
- * The linklevel-driver fills the following fields:
- *
- * channels      Driver-ID assigned to this driver. (Must be used on all
- *               subsequent callbacks.
- * rcvcallb_skb  Address of handler for received Skbuff's.
- * statcallb        "    "     "    for status-changes.
- *
- */
-extern int register_isdn(isdn_if*);
-#include <linux/uaccess.h>
-
-#endif /* __ISDNIF_H__ */
diff --git a/include/linux/wanrouter.h b/include/linux/wanrouter.h
deleted file mode 100644
index f6358558f9f5..000000000000
--- a/include/linux/wanrouter.h
+++ /dev/null
@@ -1,11 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * wanrouter.h	Legacy declarations kept around until X25 is removed
- */
-
-#ifndef	_ROUTER_H
-#define	_ROUTER_H
-
-#include <uapi/linux/wanrouter.h>
-
-#endif	/* _ROUTER_H */
diff --git a/include/uapi/linux/isdn.h b/include/uapi/linux/isdn.h
deleted file mode 100644
index f371fd52ed75..000000000000
--- a/include/uapi/linux/isdn.h
+++ /dev/null
@@ -1,144 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
-/* $Id: isdn.h,v 1.125.2.3 2004/02/10 01:07:14 keil Exp $
- *
- * Main header for the Linux ISDN subsystem (linklevel).
- *
- * Copyright 1994,95,96 by Fritz Elfert (fritz@isdn4linux.de)
- * Copyright 1995,96    by Thinking Objects Software GmbH Wuerzburg
- * Copyright 1995,96    by Michael Hipp (Michael.Hipp@student.uni-tuebingen.de)
- * 
- * This software may be used and distributed according to the terms
- * of the GNU General Public License, incorporated herein by reference.
- *
- */
-
-#ifndef _UAPI__ISDN_H__
-#define _UAPI__ISDN_H__
-
-#include <linux/ioctl.h>
-#include <linux/tty.h>
-
-#define ISDN_MAX_DRIVERS    32
-#define ISDN_MAX_CHANNELS   64
-
-/* New ioctl-codes */
-#define IIOCNETAIF  _IO('I',1)
-#define IIOCNETDIF  _IO('I',2)
-#define IIOCNETSCF  _IO('I',3)
-#define IIOCNETGCF  _IO('I',4)
-#define IIOCNETANM  _IO('I',5)
-#define IIOCNETDNM  _IO('I',6)
-#define IIOCNETGNM  _IO('I',7)
-#define IIOCGETSET  _IO('I',8) /* no longer supported */
-#define IIOCSETSET  _IO('I',9) /* no longer supported */
-#define IIOCSETVER  _IO('I',10)
-#define IIOCNETHUP  _IO('I',11)
-#define IIOCSETGST  _IO('I',12)
-#define IIOCSETBRJ  _IO('I',13)
-#define IIOCSIGPRF  _IO('I',14)
-#define IIOCGETPRF  _IO('I',15)
-#define IIOCSETPRF  _IO('I',16)
-#define IIOCGETMAP  _IO('I',17)
-#define IIOCSETMAP  _IO('I',18)
-#define IIOCNETASL  _IO('I',19)
-#define IIOCNETDIL  _IO('I',20)
-#define IIOCGETCPS  _IO('I',21)
-#define IIOCGETDVR  _IO('I',22)
-#define IIOCNETLCR  _IO('I',23) /* dwabc ioctl for LCR from isdnlog */
-#define IIOCNETDWRSET  _IO('I',24) /* dwabc ioctl to reset abc-values to default on a net-interface */
-
-#define IIOCNETALN  _IO('I',32)
-#define IIOCNETDLN  _IO('I',33)
-
-#define IIOCNETGPN  _IO('I',34)
-
-#define IIOCDBGVAR  _IO('I',127)
-
-#define IIOCDRVCTL  _IO('I',128)
-
-/* cisco hdlck device private ioctls */
-#define SIOCGKEEPPERIOD	(SIOCDEVPRIVATE + 0)
-#define SIOCSKEEPPERIOD	(SIOCDEVPRIVATE + 1)
-#define SIOCGDEBSERINT	(SIOCDEVPRIVATE + 2)
-#define SIOCSDEBSERINT	(SIOCDEVPRIVATE + 3)
-
-/* Packet encapsulations for net-interfaces */
-#define ISDN_NET_ENCAP_ETHER      0
-#define ISDN_NET_ENCAP_RAWIP      1
-#define ISDN_NET_ENCAP_IPTYP      2
-#define ISDN_NET_ENCAP_CISCOHDLC  3 /* Without SLARP and keepalive */
-#define ISDN_NET_ENCAP_SYNCPPP    4
-#define ISDN_NET_ENCAP_UIHDLC     5
-#define ISDN_NET_ENCAP_CISCOHDLCK 6 /* With SLARP and keepalive    */
-#define ISDN_NET_ENCAP_X25IFACE   7 /* Documentation/networking/x25-iface.txt */
-#define ISDN_NET_ENCAP_MAX_ENCAP  ISDN_NET_ENCAP_X25IFACE
-
-/* Facility which currently uses an ISDN-channel */
-#define ISDN_USAGE_NONE       0
-#define ISDN_USAGE_RAW        1
-#define ISDN_USAGE_MODEM      2
-#define ISDN_USAGE_NET        3
-#define ISDN_USAGE_VOICE      4
-#define ISDN_USAGE_FAX        5
-#define ISDN_USAGE_MASK       7 /* Mask to get plain usage */
-#define ISDN_USAGE_DISABLED  32 /* This bit is set, if channel is disabled */
-#define ISDN_USAGE_EXCLUSIVE 64 /* This bit is set, if channel is exclusive */
-#define ISDN_USAGE_OUTGOING 128 /* This bit is set, if channel is outgoing  */
-
-#define ISDN_MODEM_NUMREG    24        /* Number of Modem-Registers        */
-#define ISDN_LMSNLEN         255 /* Length of tty's Listen-MSN string */
-#define ISDN_CMSGLEN	     50	 /* Length of CONNECT-Message to add for Modem */
-
-#define ISDN_MSNLEN          32
-#define NET_DV 0x06  /* Data version for isdn_net_ioctl_cfg   */
-#define TTY_DV 0x06  /* Data version for iprofd etc.          */
-
-#define INF_DV 0x01  /* Data version for /dev/isdninfo        */
-
-typedef struct {
-  char drvid[25];
-  unsigned long arg;
-} isdn_ioctl_struct;
-
-typedef struct {
-  char name[10];
-  char phone[ISDN_MSNLEN];
-  int  outgoing;
-} isdn_net_ioctl_phone;
-
-typedef struct {
-  char name[10];     /* Name of interface                     */
-  char master[10];   /* Name of Master for Bundling           */
-  char slave[10];    /* Name of Slave for Bundling            */
-  char eaz[256];     /* EAZ/MSN                               */
-  char drvid[25];    /* DriverId for Bindings                 */
-  int  onhtime;      /* Hangup-Timeout                        */
-  int  charge;       /* Charge-Units                          */
-  int  l2_proto;     /* Layer-2 protocol                      */
-  int  l3_proto;     /* Layer-3 protocol                      */
-  int  p_encap;      /* Encapsulation                         */
-  int  exclusive;    /* Channel, if bound exclusive           */
-  int  dialmax;      /* Dial Retry-Counter                    */
-  int  slavedelay;   /* Delay until slave starts up           */
-  int  cbdelay;      /* Delay before Callback                 */
-  int  chargehup;    /* Flag: Charge-Hangup                   */
-  int  ihup;         /* Flag: Hangup-Timeout on incoming line */
-  int  secure;       /* Flag: Secure                          */
-  int  callback;     /* Flag: Callback                        */
-  int  cbhup;        /* Flag: Reject Call before Callback     */
-  int  pppbind;      /* ippp device for bindings              */
-  int  chargeint;    /* Use fixed charge interval length      */
-  int  triggercps;   /* BogoCPS needed for triggering slave   */
-  int  dialtimeout;  /* Dial-Timeout                          */
-  int  dialwait;     /* Time to wait after failed dial        */
-  int  dialmode;     /* Flag: off / on / auto                 */
-} isdn_net_ioctl_cfg;
-
-#define ISDN_NET_DIALMODE_MASK  0xC0    /* bits for status                */
-#define ISDN_NET_DM_OFF	        0x00    /* this interface is stopped      */
-#define ISDN_NET_DM_MANUAL	0x40    /* this interface is on (manual)  */
-#define ISDN_NET_DM_AUTO	0x80    /* this interface is autodial     */
-#define ISDN_NET_DIALMODE(x) ((&(x))->flags & ISDN_NET_DIALMODE_MASK)
-
-
-#endif /* _UAPI__ISDN_H__ */
diff --git a/include/uapi/linux/isdn_divertif.h b/include/uapi/linux/isdn_divertif.h
deleted file mode 100644
index 0a17bb1bcb1b..000000000000
--- a/include/uapi/linux/isdn_divertif.h
+++ /dev/null
@@ -1,31 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
-/* $Id: isdn_divertif.h,v 1.4.6.1 2001/09/23 22:25:05 kai Exp $
- *
- * Header for the diversion supplementary interface for i4l.
- *
- * Author    Werner Cornelius (werner@titro.de)
- * Copyright by Werner Cornelius (werner@titro.de)
- *
- * This software may be used and distributed according to the terms
- * of the GNU General Public License, incorporated herein by reference.
- *
- */
-
-#ifndef _UAPI_LINUX_ISDN_DIVERTIF_H
-#define _UAPI_LINUX_ISDN_DIVERTIF_H
-
-/***********************************************************/
-/* magic value is also used to control version information */
-/***********************************************************/
-#define DIVERT_IF_MAGIC 0x25873401
-#define DIVERT_CMD_REG  0x00  /* register command */
-#define DIVERT_CMD_REL  0x01  /* release command */
-#define DIVERT_NO_ERR   0x00  /* return value no error */
-#define DIVERT_CMD_ERR  0x01  /* invalid cmd */
-#define DIVERT_VER_ERR  0x02  /* magic/version invalid */
-#define DIVERT_REG_ERR  0x03  /* module already registered */
-#define DIVERT_REL_ERR  0x04  /* module not registered */
-#define DIVERT_REG_NAME isdn_register_divert
-
-
-#endif /* _UAPI_LINUX_ISDN_DIVERTIF_H */
diff --git a/include/uapi/linux/isdn_ppp.h b/include/uapi/linux/isdn_ppp.h
deleted file mode 100644
index 0bdc4efaacb2..000000000000
--- a/include/uapi/linux/isdn_ppp.h
+++ /dev/null
@@ -1,68 +0,0 @@
-/* SPDX-License-Identifier: GPL-1.0+ WITH Linux-syscall-note */
-/* Linux ISDN subsystem, sync PPP, interface to ipppd
- *
- * Copyright 1994-1999  by Fritz Elfert (fritz@isdn4linux.de)
- * Copyright 1995,96    Thinking Objects Software GmbH Wuerzburg
- * Copyright 1995,96    by Michael Hipp (Michael.Hipp@student.uni-tuebingen.de)
- * Copyright 2000-2002  by Kai Germaschewski (kai@germaschewski.name)
- *
- * This software may be used and distributed according to the terms
- * of the GNU General Public License, incorporated herein by reference.
- *
- */
-
-#ifndef _UAPI_LINUX_ISDN_PPP_H
-#define _UAPI_LINUX_ISDN_PPP_H
-
-#define CALLTYPE_INCOMING 0x1
-#define CALLTYPE_OUTGOING 0x2
-#define CALLTYPE_CALLBACK 0x4
-
-#define IPPP_VERSION    "2.2.0"
-
-struct pppcallinfo
-{
-  int calltype;
-  unsigned char local_num[64];
-  unsigned char remote_num[64];
-  int charge_units;
-};
-
-#define PPPIOCGCALLINFO _IOWR('t',128,struct pppcallinfo)
-#define PPPIOCBUNDLE   _IOW('t',129,int)
-#define PPPIOCGMPFLAGS _IOR('t',130,int)
-#define PPPIOCSMPFLAGS _IOW('t',131,int)
-#define PPPIOCSMPMTU   _IOW('t',132,int)
-#define PPPIOCSMPMRU   _IOW('t',133,int)
-#define PPPIOCGCOMPRESSORS _IOR('t',134,unsigned long [8])
-#define PPPIOCSCOMPRESSOR _IOW('t',135,int)
-#define PPPIOCGIFNAME      _IOR('t',136, char [IFNAMSIZ] )
-
-
-#define SC_MP_PROT       0x00000200
-#define SC_REJ_MP_PROT   0x00000400
-#define SC_OUT_SHORT_SEQ 0x00000800
-#define SC_IN_SHORT_SEQ  0x00004000
-
-#define SC_DECOMP_ON		0x01
-#define SC_COMP_ON		0x02
-#define SC_DECOMP_DISCARD	0x04
-#define SC_COMP_DISCARD		0x08
-#define SC_LINK_DECOMP_ON	0x10
-#define SC_LINK_COMP_ON		0x20
-#define SC_LINK_DECOMP_DISCARD	0x40
-#define SC_LINK_COMP_DISCARD	0x80
-
-#define ISDN_PPP_COMP_MAX_OPTIONS 16
-
-#define IPPP_COMP_FLAG_XMIT 0x1
-#define IPPP_COMP_FLAG_LINK 0x2
-
-struct isdn_ppp_comp_data {
-  int num;
-  unsigned char options[ISDN_PPP_COMP_MAX_OPTIONS];
-  int optlen;
-  int flags;
-};
-
-#endif /* _UAPI_LINUX_ISDN_PPP_H */
diff --git a/include/uapi/linux/isdnif.h b/include/uapi/linux/isdnif.h
deleted file mode 100644
index 611a69196738..000000000000
--- a/include/uapi/linux/isdnif.h
+++ /dev/null
@@ -1,57 +0,0 @@
-/* SPDX-License-Identifier: GPL-1.0+ WITH Linux-syscall-note */
-/* $Id: isdnif.h,v 1.43.2.2 2004/01/12 23:08:35 keil Exp $
- *
- * Linux ISDN subsystem
- * Definition of the interface between the subsystem and its low-level drivers.
- *
- * Copyright 1994,95,96 by Fritz Elfert (fritz@isdn4linux.de)
- * Copyright 1995,96    Thinking Objects Software GmbH Wuerzburg
- * 
- * This software may be used and distributed according to the terms
- * of the GNU General Public License, incorporated herein by reference.
- *
- */
-
-#ifndef _UAPI__ISDNIF_H__
-#define _UAPI__ISDNIF_H__
-
-
-/*
- * Values for general protocol-selection
- */
-#define ISDN_PTYPE_UNKNOWN   0   /* Protocol undefined   */
-#define ISDN_PTYPE_1TR6      1   /* german 1TR6-protocol */
-#define ISDN_PTYPE_EURO      2   /* EDSS1-protocol       */
-#define ISDN_PTYPE_LEASED    3   /* for leased lines     */
-#define ISDN_PTYPE_NI1       4   /* US NI-1 protocol     */
-#define ISDN_PTYPE_MAX       7   /* Max. 8 Protocols     */
-
-/*
- * Values for Layer-2-protocol-selection
- */
-#define ISDN_PROTO_L2_X75I   0   /* X75/LAPB with I-Frames            */
-#define ISDN_PROTO_L2_X75UI  1   /* X75/LAPB with UI-Frames           */
-#define ISDN_PROTO_L2_X75BUI 2   /* X75/LAPB with UI-Frames           */
-#define ISDN_PROTO_L2_HDLC   3   /* HDLC                              */
-#define ISDN_PROTO_L2_TRANS  4   /* Transparent (Voice)               */
-#define ISDN_PROTO_L2_X25DTE 5   /* X25/LAPB DTE mode                 */
-#define ISDN_PROTO_L2_X25DCE 6   /* X25/LAPB DCE mode                 */
-#define ISDN_PROTO_L2_V11096 7   /* V.110 bitrate adaption 9600 Baud  */
-#define ISDN_PROTO_L2_V11019 8   /* V.110 bitrate adaption 19200 Baud */
-#define ISDN_PROTO_L2_V11038 9   /* V.110 bitrate adaption 38400 Baud */
-#define ISDN_PROTO_L2_MODEM  10  /* Analog Modem on Board */
-#define ISDN_PROTO_L2_FAX    11  /* Fax Group 2/3         */
-#define ISDN_PROTO_L2_HDLC_56K 12   /* HDLC 56k                          */
-#define ISDN_PROTO_L2_MAX    15  /* Max. 16 Protocols                 */
-
-/*
- * Values for Layer-3-protocol-selection
- */
-#define ISDN_PROTO_L3_TRANS	0	/* Transparent */
-#define ISDN_PROTO_L3_TRANSDSP	1	/* Transparent with DSP */
-#define ISDN_PROTO_L3_FCLASS2	2	/* Fax Group 2/3 CLASS 2 */
-#define ISDN_PROTO_L3_FCLASS1	3	/* Fax Group 2/3 CLASS 1 */
-#define ISDN_PROTO_L3_MAX	7	/* Max. 8 Protocols */
-
-
-#endif /* _UAPI__ISDNIF_H__ */
diff --git a/include/uapi/linux/wanrouter.h b/include/uapi/linux/wanrouter.h
deleted file mode 100644
index 2f1216d00caa..000000000000
--- a/include/uapi/linux/wanrouter.h
+++ /dev/null
@@ -1,18 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
-/*
- * wanrouter.h	Legacy declarations kept around until X25 is removed
- */
-
-#ifndef _UAPI_ROUTER_H
-#define _UAPI_ROUTER_H
-
-/* 'state' defines */
-enum wan_states
-{
-	WAN_UNCONFIGURED,	/* link/channel is not configured */
-	WAN_DISCONNECTED,	/* link/channel is disconnected */
-	WAN_CONNECTING,		/* connection is in progress */
-	WAN_CONNECTED		/* link/channel is operational */
-};
-
-#endif /* _UAPI_ROUTER_H */
-- 
cgit v1.2.3


From 99c2aa151a7182c58f9477a376304c538d9cc5ab Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Thu, 18 Apr 2019 22:57:08 +0200
Subject: isdn: hdlc: move into mISDN

The last remnant of the isdn4linux interface is now the isdnhdlc
support, used by the netjet driver. Move it next to that driver.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
---
 drivers/isdn/Makefile                  |   1 -
 drivers/isdn/hardware/mISDN/Kconfig    |   7 +-
 drivers/isdn/hardware/mISDN/Makefile   |   2 +
 drivers/isdn/hardware/mISDN/isdnhdlc.c | 630 +++++++++++++++++++++++++++++++++
 drivers/isdn/hardware/mISDN/isdnhdlc.h |  82 +++++
 drivers/isdn/hardware/mISDN/netjet.c   |   2 +-
 drivers/isdn/i4l/Makefile              |   6 -
 drivers/isdn/i4l/isdnhdlc.c            | 630 ---------------------------------
 include/linux/isdn/hdlc.h              |  82 -----
 9 files changed, 720 insertions(+), 722 deletions(-)
 create mode 100644 drivers/isdn/hardware/mISDN/isdnhdlc.c
 create mode 100644 drivers/isdn/hardware/mISDN/isdnhdlc.h
 delete mode 100644 drivers/isdn/i4l/Makefile
 delete mode 100644 drivers/isdn/i4l/isdnhdlc.c
 delete mode 100644 include/linux/isdn/hdlc.h

(limited to 'include/linux')

diff --git a/drivers/isdn/Makefile b/drivers/isdn/Makefile
index 379b4a03c321..f2a529c5a511 100644
--- a/drivers/isdn/Makefile
+++ b/drivers/isdn/Makefile
@@ -3,7 +3,6 @@
 
 # Object files in subdirectories
 
-obj-$(CONFIG_ISDN_I4L)			+= i4l/
 obj-$(CONFIG_ISDN_CAPI)			+= capi/
 obj-$(CONFIG_MISDN)			+= mISDN/
 obj-$(CONFIG_ISDN)			+= hardware/
diff --git a/drivers/isdn/hardware/mISDN/Kconfig b/drivers/isdn/hardware/mISDN/Kconfig
index a7a34a85b970..304f50c08da2 100644
--- a/drivers/isdn/hardware/mISDN/Kconfig
+++ b/drivers/isdn/hardware/mISDN/Kconfig
@@ -79,11 +79,14 @@ config MISDN_NETJET
 	depends on PCI
 	depends on TTY
 	select MISDN_IPAC
-	select ISDN_HDLC
-	select ISDN_I4L
+	select MISDN_HDLC
 	help
 	  Enable support for Traverse Technologies NETJet PCI cards.
 
+config MISDN_HDLC
+	tristate
+	select CRC_CCITT
+	select BITREVERSE
 
 config MISDN_IPAC
 	tristate
diff --git a/drivers/isdn/hardware/mISDN/Makefile b/drivers/isdn/hardware/mISDN/Makefile
index 422f9fd8ab9a..3f50f8c4753f 100644
--- a/drivers/isdn/hardware/mISDN/Makefile
+++ b/drivers/isdn/hardware/mISDN/Makefile
@@ -15,3 +15,5 @@ obj-$(CONFIG_MISDN_NETJET) += netjet.o
 # chip modules
 obj-$(CONFIG_MISDN_IPAC) += mISDNipac.o
 obj-$(CONFIG_MISDN_ISAR) += mISDNisar.o
+
+obj-$(CONFIG_MISDN_HDLC) += isdnhdlc.o
diff --git a/drivers/isdn/hardware/mISDN/isdnhdlc.c b/drivers/isdn/hardware/mISDN/isdnhdlc.c
new file mode 100644
index 000000000000..3a8b562e63b1
--- /dev/null
+++ b/drivers/isdn/hardware/mISDN/isdnhdlc.c
@@ -0,0 +1,630 @@
+/*
+ * isdnhdlc.c  --  General purpose ISDN HDLC decoder.
+ *
+ * Copyright (C)
+ *	2009	Karsten Keil		<keil@b1-systems.de>
+ *	2002	Wolfgang Mües		<wolfgang@iksw-muees.de>
+ *	2001	Frode Isaksen		<fisaksen@bewan.com>
+ *      2001	Kai Germaschewski	<kai.germaschewski@gmx.de>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/crc-ccitt.h>
+#include <linux/bitrev.h>
+#include "isdnhdlc.h"
+
+/*-------------------------------------------------------------------*/
+
+MODULE_AUTHOR("Wolfgang Mües <wolfgang@iksw-muees.de>, "
+	      "Frode Isaksen <fisaksen@bewan.com>, "
+	      "Kai Germaschewski <kai.germaschewski@gmx.de>");
+MODULE_DESCRIPTION("General purpose ISDN HDLC decoder");
+MODULE_LICENSE("GPL");
+
+/*-------------------------------------------------------------------*/
+
+enum {
+	HDLC_FAST_IDLE, HDLC_GET_FLAG_B0, HDLC_GETFLAG_B1A6, HDLC_GETFLAG_B7,
+	HDLC_GET_DATA, HDLC_FAST_FLAG
+};
+
+enum {
+	HDLC_SEND_DATA, HDLC_SEND_CRC1, HDLC_SEND_FAST_FLAG,
+	HDLC_SEND_FIRST_FLAG, HDLC_SEND_CRC2, HDLC_SEND_CLOSING_FLAG,
+	HDLC_SEND_IDLE1, HDLC_SEND_FAST_IDLE, HDLC_SENDFLAG_B0,
+	HDLC_SENDFLAG_B1A6, HDLC_SENDFLAG_B7, STOPPED, HDLC_SENDFLAG_ONE
+};
+
+void isdnhdlc_rcv_init(struct isdnhdlc_vars *hdlc, u32 features)
+{
+	memset(hdlc, 0, sizeof(struct isdnhdlc_vars));
+	hdlc->state = HDLC_GET_DATA;	/* decoder starts in the data state */
+	if (features & HDLC_56KBIT)
+		hdlc->do_adapt56 = 1;
+	if (features & HDLC_BITREVERSE)
+		hdlc->do_bitreverse = 1;
+}
+EXPORT_SYMBOL(isdnhdlc_rcv_init);
+
+void isdnhdlc_out_init(struct isdnhdlc_vars *hdlc, u32 features)
+{
+	memset(hdlc, 0, sizeof(struct isdnhdlc_vars));
+	if (features & HDLC_DCHANNEL) {
+		hdlc->dchannel = 1;
+		hdlc->state = HDLC_SEND_FIRST_FLAG;
+	} else {
+		hdlc->dchannel = 0;
+		hdlc->state = HDLC_SEND_FAST_FLAG;
+		hdlc->ffvalue = 0x7e;
+	}
+	hdlc->cbin = 0x7e;	/* HDLC flag pattern 01111110 */
+	if (features & HDLC_56KBIT) {
+		hdlc->do_adapt56 = 1;
+		hdlc->state = HDLC_SENDFLAG_B0;	/* overrides the above */
+	} else
+		hdlc->data_bits = 8;
+	if (features & HDLC_BITREVERSE)
+		hdlc->do_bitreverse = 1;
+}
+EXPORT_SYMBOL(isdnhdlc_out_init);
+
+static int
+check_frame(struct isdnhdlc_vars *hdlc)
+{
+	int status;
+
+	if (hdlc->dstpos < 2)	/* too small - framing error */
+		status = -HDLC_FRAMING_ERROR;
+	else if (hdlc->crc != 0xf0b8)	/* crc error */
+		status = -HDLC_CRC_ERROR;
+	else {
+		/* remove CRC */
+		hdlc->dstpos -= 2;
+		/* good frame */
+		status = hdlc->dstpos;
+	}
+	return status;
+}
+
+/*
+  isdnhdlc_decode - decodes HDLC frames from a transparent bit stream.
+
+  The source buffer is scanned for valid HDLC frames looking for
+  flags (01111110) to indicate the start of a frame. If the start of
+  the frame is found, the bit stuffing is removed (0 after 5 1's).
+  When a new flag is found, the complete frame has been received
+  and the CRC is checked.
+  If a valid frame is found, the function returns the frame length
+  excluding the CRC.
+  If no complete frame has been decoded yet, the function returns 0.
+  Errors are reported as negative values (see isdnhdlc.h):
+  If a framing error is found (too many 1s and not a flag, or a frame
+  too short to hold the CRC) the function returns -HDLC_FRAMING_ERROR.
+  If a CRC error is found the function returns -HDLC_CRC_ERROR.
+  If the frame length exceeds the destination buffer size, the status
+  is set to -HDLC_LENGTH_ERROR.
+  After a frame or an error the destination position restarts at 0.
+
+  src - source buffer
+  slen - source buffer length
+  count - number of bytes removed (decoded) from the source buffer
+  dst - destination buffer
+  dsize - destination buffer size
+  returns - length of the decoded frame in the destination buffer,
+  0 if no complete frame was decoded, or a negative error number.
+*/
+int isdnhdlc_decode(struct isdnhdlc_vars *hdlc, const u8 *src, int slen,
+		    int *count, u8 *dst, int dsize)
+{
+	int status = 0;
+
+	static const unsigned char fast_flag[] = {
+		0x00, 0x00, 0x00, 0x20, 0x30, 0x38, 0x3c, 0x3e, 0x3f
+	};
+
+	static const unsigned char fast_flag_value[] = {
+		0x00, 0x7e, 0xfc, 0xf9, 0xf3, 0xe7, 0xcf, 0x9f, 0x3f
+	};
+
+	static const unsigned char fast_abort[] = {
+		0x00, 0x00, 0x80, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe, 0xff
+	};
+
+#define handle_fast_flag(h)						\
+	do {								\
+		if (h->cbin == fast_flag[h->bit_shift]) {		\
+			h->ffvalue = fast_flag_value[h->bit_shift];	\
+			h->state = HDLC_FAST_FLAG;			\
+			h->ffbit_shift = h->bit_shift;			\
+			h->bit_shift = 1;				\
+		} else {						\
+			h->state = HDLC_GET_DATA;			\
+			h->data_received = 0;				\
+		}							\
+	} while (0)
+
+#define handle_abort(h)						\
+	do {							\
+		h->shift_reg = fast_abort[h->ffbit_shift - 1];	\
+		h->hdlc_bits1 = h->ffbit_shift - 2;		\
+		if (h->hdlc_bits1 < 0)				\
+			h->hdlc_bits1 = 0;			\
+		h->data_bits = h->ffbit_shift - 1;		\
+		h->state = HDLC_GET_DATA;			\
+		h->data_received = 0;				\
+	} while (0)
+
+	*count = slen;
+
+	while (slen > 0) {
+		if (hdlc->bit_shift == 0) {
+			/* the code is for bitreverse streams */
+			if (hdlc->do_bitreverse == 0)
+				hdlc->cbin = bitrev8(*src++);
+			else
+				hdlc->cbin = *src++;
+			slen--;
+			hdlc->bit_shift = 8;
+			if (hdlc->do_adapt56)
+				hdlc->bit_shift--;
+		}
+
+		switch (hdlc->state) {
+		case STOPPED:
+			return 0;
+		case HDLC_FAST_IDLE:
+			if (hdlc->cbin == 0xff) {
+				hdlc->bit_shift = 0;
+				break;
+			}
+			hdlc->state = HDLC_GET_FLAG_B0;
+			hdlc->hdlc_bits1 = 0;
+			hdlc->bit_shift = 8;
+			break;
+		case HDLC_GET_FLAG_B0:
+			if (!(hdlc->cbin & 0x80)) {
+				hdlc->state = HDLC_GETFLAG_B1A6;
+				hdlc->hdlc_bits1 = 0;
+			} else {
+				if ((!hdlc->do_adapt56) &&
+				    (++hdlc->hdlc_bits1 >= 8) &&
+				    (hdlc->bit_shift == 1))
+					hdlc->state = HDLC_FAST_IDLE;
+			}
+			hdlc->cbin <<= 1;
+			hdlc->bit_shift--;
+			break;
+		case HDLC_GETFLAG_B1A6:
+			if (hdlc->cbin & 0x80) {
+				hdlc->hdlc_bits1++;
+				if (hdlc->hdlc_bits1 == 6)
+					hdlc->state = HDLC_GETFLAG_B7;
+			} else
+				hdlc->hdlc_bits1 = 0;
+			hdlc->cbin <<= 1;
+			hdlc->bit_shift--;
+			break;
+		case HDLC_GETFLAG_B7:
+			if (hdlc->cbin & 0x80) {
+				hdlc->state = HDLC_GET_FLAG_B0;
+			} else {
+				hdlc->state = HDLC_GET_DATA;
+				hdlc->crc = 0xffff;
+				hdlc->shift_reg = 0;
+				hdlc->hdlc_bits1 = 0;
+				hdlc->data_bits = 0;
+				hdlc->data_received = 0;
+			}
+			hdlc->cbin <<= 1;
+			hdlc->bit_shift--;
+			break;
+		case HDLC_GET_DATA:
+			if (hdlc->cbin & 0x80) {
+				hdlc->hdlc_bits1++;
+				switch (hdlc->hdlc_bits1) {
+				case 6:
+					break;
+				case 7:
+					if (hdlc->data_received)
+						/* bad frame */
+						status = -HDLC_FRAMING_ERROR;
+					if (!hdlc->do_adapt56) {
+						if (hdlc->cbin == fast_abort
+						    [hdlc->bit_shift + 1]) {
+							hdlc->state =
+								HDLC_FAST_IDLE;
+							hdlc->bit_shift = 1;
+							break;
+						}
+					} else
+						hdlc->state = HDLC_GET_FLAG_B0;
+					break;
+				default:
+					hdlc->shift_reg >>= 1;
+					hdlc->shift_reg |= 0x80;
+					hdlc->data_bits++;
+					break;
+				}
+			} else {
+				switch (hdlc->hdlc_bits1) {
+				case 5:
+					break;
+				case 6:
+					if (hdlc->data_received)
+						status = check_frame(hdlc);
+					hdlc->crc = 0xffff;
+					hdlc->shift_reg = 0;
+					hdlc->data_bits = 0;
+					if (!hdlc->do_adapt56)
+						handle_fast_flag(hdlc);
+					else {
+						hdlc->state = HDLC_GET_DATA;
+						hdlc->data_received = 0;
+					}
+					break;
+				default:
+					hdlc->shift_reg >>= 1;
+					hdlc->data_bits++;
+					break;
+				}
+				hdlc->hdlc_bits1 = 0;
+			}
+			if (status) {
+				hdlc->dstpos = 0;
+				*count -= slen;
+				hdlc->cbin <<= 1;
+				hdlc->bit_shift--;
+				return status;
+			}
+			if (hdlc->data_bits == 8) {
+				hdlc->data_bits = 0;
+				hdlc->data_received = 1;
+				hdlc->crc = crc_ccitt_byte(hdlc->crc,
+							   hdlc->shift_reg);
+
+				/* good byte received */
+				if (hdlc->dstpos < dsize)
+					dst[hdlc->dstpos++] = hdlc->shift_reg;
+				else {
+					/* frame too long */
+					status = -HDLC_LENGTH_ERROR;
+					hdlc->dstpos = 0;
+				}
+			}
+			hdlc->cbin <<= 1;
+			hdlc->bit_shift--;
+			break;
+		case HDLC_FAST_FLAG:
+			if (hdlc->cbin == hdlc->ffvalue) {
+				hdlc->bit_shift = 0;
+				break;
+			} else {
+				if (hdlc->cbin == 0xff) {
+					hdlc->state = HDLC_FAST_IDLE;
+					hdlc->bit_shift = 0;
+				} else if (hdlc->ffbit_shift == 8) {
+					hdlc->state = HDLC_GETFLAG_B7;
+					break;
+				} else
+					handle_abort(hdlc);
+			}
+			break;
+		default:
+			break;
+		}
+	}
+	*count -= slen;
+	return 0;
+}
+EXPORT_SYMBOL(isdnhdlc_decode);
+/*
+  isdnhdlc_encode - encodes HDLC frames to a transparent bit stream.
+
+  The bit stream starts with a beginning flag (01111110). After
+  that each byte is added to the bit stream with bit stuffing added
+  (0 after 5 1's).
+  When the last byte has been removed from the source buffer, the
+  CRC (2 bytes) is added and the frame terminates with the ending flag.
+  For the dchannel, the idle character (all 1's) is also added at the end.
+  If this function is called with empty source buffer (slen=0), flags or
+  idle character will be generated.
+
+  src - source buffer
+  slen - source buffer length
+  count - number of bytes removed (encoded) from source buffer
+  dst - destination buffer
+  dsize - destination buffer size
+  returns - number of encoded bytes in the destination buffer
+*/
+int isdnhdlc_encode(struct isdnhdlc_vars *hdlc, const u8 *src, u16 slen,
+		    int *count, u8 *dst, int dsize)
+{
+	static const unsigned char xfast_flag_value[] = {
+		0x7e, 0x3f, 0x9f, 0xcf, 0xe7, 0xf3, 0xf9, 0xfc, 0x7e
+	};
+
+	int len = 0;
+
+	*count = slen;
+
+	/* special handling for one byte frames */
+	if ((slen == 1) && (hdlc->state == HDLC_SEND_FAST_FLAG))
+		hdlc->state = HDLC_SENDFLAG_ONE;
+	while (dsize > 0) {
+		if (hdlc->bit_shift == 0) {
+			if (slen && !hdlc->do_closing) {
+				hdlc->shift_reg = *src++;
+				slen--;
+				if (slen == 0)
+					/* closing sequence, CRC + flag(s) */
+					hdlc->do_closing = 1;
+				hdlc->bit_shift = 8;
+			} else {
+				if (hdlc->state == HDLC_SEND_DATA) {
+					if (hdlc->data_received) {
+						hdlc->state = HDLC_SEND_CRC1;
+						hdlc->crc ^= 0xffff;
+						hdlc->bit_shift = 8;
+						hdlc->shift_reg =
+							hdlc->crc & 0xff;
+					} else if (!hdlc->do_adapt56)
+						hdlc->state =
+							HDLC_SEND_FAST_FLAG;
+					else
+						hdlc->state =
+							HDLC_SENDFLAG_B0;
+				}
+
+			}
+		}
+
+		switch (hdlc->state) {
+		case STOPPED:
+			while (dsize--)
+				*dst++ = 0xff;
+			return dsize;
+		case HDLC_SEND_FAST_FLAG:
+			hdlc->do_closing = 0;
+			if (slen == 0) {
+				/* the code is for bitreverse streams */
+				if (hdlc->do_bitreverse == 0)
+					*dst++ = bitrev8(hdlc->ffvalue);
+				else
+					*dst++ = hdlc->ffvalue;
+				len++;
+				dsize--;
+				break;
+			}
+			/* fall through */
+		case HDLC_SENDFLAG_ONE:
+			if (hdlc->bit_shift == 8) {
+				hdlc->cbin = hdlc->ffvalue >>
+					(8 - hdlc->data_bits);
+				hdlc->state = HDLC_SEND_DATA;
+				hdlc->crc = 0xffff;
+				hdlc->hdlc_bits1 = 0;
+				hdlc->data_received = 1;
+			}
+			break;
+		case HDLC_SENDFLAG_B0:
+			hdlc->do_closing = 0;
+			hdlc->cbin <<= 1;
+			hdlc->data_bits++;
+			hdlc->hdlc_bits1 = 0;
+			hdlc->state = HDLC_SENDFLAG_B1A6;
+			break;
+		case HDLC_SENDFLAG_B1A6:
+			hdlc->cbin <<= 1;
+			hdlc->data_bits++;
+			hdlc->cbin++;
+			if (++hdlc->hdlc_bits1 == 6)
+				hdlc->state = HDLC_SENDFLAG_B7;
+			break;
+		case HDLC_SENDFLAG_B7:
+			hdlc->cbin <<= 1;
+			hdlc->data_bits++;
+			if (slen == 0) {
+				hdlc->state = HDLC_SENDFLAG_B0;
+				break;
+			}
+			if (hdlc->bit_shift == 8) {
+				hdlc->state = HDLC_SEND_DATA;
+				hdlc->crc = 0xffff;
+				hdlc->hdlc_bits1 = 0;
+				hdlc->data_received = 1;
+			}
+			break;
+		case HDLC_SEND_FIRST_FLAG:
+			hdlc->data_received = 1;
+			if (hdlc->data_bits == 8) {
+				hdlc->state = HDLC_SEND_DATA;
+				hdlc->crc = 0xffff;
+				hdlc->hdlc_bits1 = 0;
+				break;
+			}
+			hdlc->cbin <<= 1;
+			hdlc->data_bits++;
+			if (hdlc->shift_reg & 0x01)
+				hdlc->cbin++;
+			hdlc->shift_reg >>= 1;
+			hdlc->bit_shift--;
+			if (hdlc->bit_shift == 0) {
+				hdlc->state = HDLC_SEND_DATA;
+				hdlc->crc = 0xffff;
+				hdlc->hdlc_bits1 = 0;
+			}
+			break;
+		case HDLC_SEND_DATA:
+			hdlc->cbin <<= 1;
+			hdlc->data_bits++;
+			if (hdlc->hdlc_bits1 == 5) {
+				hdlc->hdlc_bits1 = 0;
+				break;
+			}
+			if (hdlc->bit_shift == 8)
+				hdlc->crc = crc_ccitt_byte(hdlc->crc,
+							   hdlc->shift_reg);
+			if (hdlc->shift_reg & 0x01) {
+				hdlc->hdlc_bits1++;
+				hdlc->cbin++;
+				hdlc->shift_reg >>= 1;
+				hdlc->bit_shift--;
+			} else {
+				hdlc->hdlc_bits1 = 0;
+				hdlc->shift_reg >>= 1;
+				hdlc->bit_shift--;
+			}
+			break;
+		case HDLC_SEND_CRC1:
+			hdlc->cbin <<= 1;
+			hdlc->data_bits++;
+			if (hdlc->hdlc_bits1 == 5) {
+				hdlc->hdlc_bits1 = 0;
+				break;
+			}
+			if (hdlc->shift_reg & 0x01) {
+				hdlc->hdlc_bits1++;
+				hdlc->cbin++;
+				hdlc->shift_reg >>= 1;
+				hdlc->bit_shift--;
+			} else {
+				hdlc->hdlc_bits1 = 0;
+				hdlc->shift_reg >>= 1;
+				hdlc->bit_shift--;
+			}
+			if (hdlc->bit_shift == 0) {
+				hdlc->shift_reg = (hdlc->crc >> 8);
+				hdlc->state = HDLC_SEND_CRC2;
+				hdlc->bit_shift = 8;
+			}
+			break;
+		case HDLC_SEND_CRC2:
+			hdlc->cbin <<= 1;
+			hdlc->data_bits++;
+			if (hdlc->hdlc_bits1 == 5) {
+				hdlc->hdlc_bits1 = 0;
+				break;
+			}
+			if (hdlc->shift_reg & 0x01) {
+				hdlc->hdlc_bits1++;
+				hdlc->cbin++;
+				hdlc->shift_reg >>= 1;
+				hdlc->bit_shift--;
+			} else {
+				hdlc->hdlc_bits1 = 0;
+				hdlc->shift_reg >>= 1;
+				hdlc->bit_shift--;
+			}
+			if (hdlc->bit_shift == 0) {
+				hdlc->shift_reg = 0x7e;
+				hdlc->state = HDLC_SEND_CLOSING_FLAG;
+				hdlc->bit_shift = 8;
+			}
+			break;
+		case HDLC_SEND_CLOSING_FLAG:
+			hdlc->cbin <<= 1;
+			hdlc->data_bits++;
+			if (hdlc->hdlc_bits1 == 5) {
+				hdlc->hdlc_bits1 = 0;
+				break;
+			}
+			if (hdlc->shift_reg & 0x01)
+				hdlc->cbin++;
+			hdlc->shift_reg >>= 1;
+			hdlc->bit_shift--;
+			if (hdlc->bit_shift == 0) {
+				hdlc->ffvalue =
+					xfast_flag_value[hdlc->data_bits];
+				if (hdlc->dchannel) {
+					hdlc->ffvalue = 0x7e;
+					hdlc->state = HDLC_SEND_IDLE1;
+					hdlc->bit_shift = 8-hdlc->data_bits;
+					if (hdlc->bit_shift == 0)
+						hdlc->state =
+							HDLC_SEND_FAST_IDLE;
+				} else {
+					if (!hdlc->do_adapt56) {
+						hdlc->state =
+							HDLC_SEND_FAST_FLAG;
+						hdlc->data_received = 0;
+					} else {
+						hdlc->state = HDLC_SENDFLAG_B0;
+						hdlc->data_received = 0;
+					}
+					/* Finished this frame, send flags */
+					if (dsize > 1)
+						dsize = 1;
+				}
+			}
+			break;
+		case HDLC_SEND_IDLE1:
+			hdlc->do_closing = 0;
+			hdlc->cbin <<= 1;
+			hdlc->cbin++;
+			hdlc->data_bits++;
+			hdlc->bit_shift--;
+			if (hdlc->bit_shift == 0) {
+				hdlc->state = HDLC_SEND_FAST_IDLE;
+				hdlc->bit_shift = 0;
+			}
+			break;
+		case HDLC_SEND_FAST_IDLE:
+			hdlc->do_closing = 0;
+			hdlc->cbin = 0xff;
+			hdlc->data_bits = 8;
+			if (hdlc->bit_shift == 8) {
+				hdlc->cbin = 0x7e;
+				hdlc->state = HDLC_SEND_FIRST_FLAG;
+			} else {
+				/* the code is for bitreverse streams */
+				if (hdlc->do_bitreverse == 0)
+					*dst++ = bitrev8(hdlc->cbin);
+				else
+					*dst++ = hdlc->cbin;
+				hdlc->bit_shift = 0;
+				hdlc->data_bits = 0;
+				len++;
+				dsize = 0;
+			}
+			break;
+		default:
+			break;
+		}
+		if (hdlc->do_adapt56) {
+			if (hdlc->data_bits == 7) {
+				hdlc->cbin <<= 1;
+				hdlc->cbin++;
+				hdlc->data_bits++;
+			}
+		}
+		if (hdlc->data_bits == 8) {
+			/* the code is for bitreverse streams */
+			if (hdlc->do_bitreverse == 0)
+				*dst++ = bitrev8(hdlc->cbin);
+			else
+				*dst++ = hdlc->cbin;
+			hdlc->data_bits = 0;
+			len++;
+			dsize--;
+		}
+	}
+	*count -= slen;
+
+	return len;
+}
+EXPORT_SYMBOL(isdnhdlc_encode);
diff --git a/drivers/isdn/hardware/mISDN/isdnhdlc.h b/drivers/isdn/hardware/mISDN/isdnhdlc.h
new file mode 100644
index 000000000000..96521370c782
--- /dev/null
+++ b/drivers/isdn/hardware/mISDN/isdnhdlc.h
@@ -0,0 +1,82 @@
+/*
+ * isdnhdlc.h  --  General purpose ISDN HDLC decoder.
+ *
+ * Implementation of a HDLC decoder/encoder in software.
+ * Necessary because some ISDN devices don't have HDLC
+ * controllers.
+ *
+ * Copyright (C)
+ *	2009	Karsten Keil		<keil@b1-systems.de>
+ *	2002	Wolfgang Mües		<wolfgang@iksw-muees.de>
+ *	2001	Frode Isaksen		<fisaksen@bewan.com>
+ *	2001	Kai Germaschewski	<kai.germaschewski@gmx.de>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#ifndef __ISDNHDLC_H__
+#define __ISDNHDLC_H__
+
+struct isdnhdlc_vars {
+	int bit_shift;		/* bits still to process in current byte */
+	int hdlc_bits1;		/* run length of consecutive 1 bits */
+	int data_bits;		/* bits collected towards the next byte */
+	int ffbit_shift;	/* decoding only: bit_shift at fast flag */
+	int state;		/* current decoder/encoder state */
+	int dstpos;		/* decoding only: write index into dst */
+
+	u16 crc;		/* running CRC of the current frame */
+
+	u8 cbin;		/* stream byte (decode: in, encode: out) */
+	u8 shift_reg;		/* frame byte (decode: built, encode: sent) */
+	u8 ffvalue;		/* flag byte pattern for fast flag handling */
+
+	/* set if transferring data */
+	u32 data_received:1;
+	/* set if D channel (send idle instead of flags) */
+	u32 dchannel:1;
+	/* set if 56K adaptation */
+	u32 do_adapt56:1;
+	/* set if in closing phase (need to send CRC + flag) */
+	u32 do_closing:1;
+	/* set if data is bitreverse (bitrev8() is then skipped) */
+	u32 do_bitreverse:1;
+};
+
+/* Feature Flags */
+#define HDLC_56KBIT	0x01
+#define HDLC_DCHANNEL	0x02
+#define HDLC_BITREVERSE	0x04
+
+/*
+  The return value from isdnhdlc_decode is
+  the frame length, 0 if no complete frame was decoded,
+  or a negative error number
+*/
+#define HDLC_FRAMING_ERROR     1
+#define HDLC_CRC_ERROR         2
+#define HDLC_LENGTH_ERROR      3
+
+extern void	isdnhdlc_rcv_init(struct isdnhdlc_vars *hdlc, u32 features);
+
+extern int	isdnhdlc_decode(struct isdnhdlc_vars *hdlc, const u8 *src,
+			int slen, int *count, u8 *dst, int dsize);
+
+extern void	isdnhdlc_out_init(struct isdnhdlc_vars *hdlc, u32 features);
+
+extern int	isdnhdlc_encode(struct isdnhdlc_vars *hdlc, const u8 *src,
+			u16 slen, int *count, u8 *dst, int dsize);
+
+#endif /* __ISDNHDLC_H__ */
diff --git a/drivers/isdn/hardware/mISDN/netjet.c b/drivers/isdn/hardware/mISDN/netjet.c
index 2b317cb63d06..93a2d361eda5 100644
--- a/drivers/isdn/hardware/mISDN/netjet.c
+++ b/drivers/isdn/hardware/mISDN/netjet.c
@@ -29,7 +29,7 @@
 #include "ipac.h"
 #include "iohelper.h"
 #include "netjet.h"
-#include <linux/isdn/hdlc.h>
+#include "isdnhdlc.h"
 
 #define NETJET_REV	"2.0"
 
diff --git a/drivers/isdn/i4l/Makefile b/drivers/isdn/i4l/Makefile
deleted file mode 100644
index 11fe697739d5..000000000000
--- a/drivers/isdn/i4l/Makefile
+++ /dev/null
@@ -1,6 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0
-# Makefile for the kernel ISDN subsystem and device drivers.
-
-# Each configuration option enables a list of files.
-
-obj-$(CONFIG_ISDN_HDLC)		+= isdnhdlc.o
diff --git a/drivers/isdn/i4l/isdnhdlc.c b/drivers/isdn/i4l/isdnhdlc.c
deleted file mode 100644
index 027d1c590679..000000000000
--- a/drivers/isdn/i4l/isdnhdlc.c
+++ /dev/null
@@ -1,630 +0,0 @@
-/*
- * isdnhdlc.c  --  General purpose ISDN HDLC decoder.
- *
- * Copyright (C)
- *	2009	Karsten Keil		<keil@b1-systems.de>
- *	2002	Wolfgang Mües		<wolfgang@iksw-muees.de>
- *	2001	Frode Isaksen		<fisaksen@bewan.com>
- *      2001	Kai Germaschewski	<kai.germaschewski@gmx.de>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- */
-
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/crc-ccitt.h>
-#include <linux/isdn/hdlc.h>
-#include <linux/bitrev.h>
-
-/*-------------------------------------------------------------------*/
-
-MODULE_AUTHOR("Wolfgang Mües <wolfgang@iksw-muees.de>, "
-	      "Frode Isaksen <fisaksen@bewan.com>, "
-	      "Kai Germaschewski <kai.germaschewski@gmx.de>");
-MODULE_DESCRIPTION("General purpose ISDN HDLC decoder");
-MODULE_LICENSE("GPL");
-
-/*-------------------------------------------------------------------*/
-
-enum {
-	HDLC_FAST_IDLE, HDLC_GET_FLAG_B0, HDLC_GETFLAG_B1A6, HDLC_GETFLAG_B7,
-	HDLC_GET_DATA, HDLC_FAST_FLAG
-};
-
-enum {
-	HDLC_SEND_DATA, HDLC_SEND_CRC1, HDLC_SEND_FAST_FLAG,
-	HDLC_SEND_FIRST_FLAG, HDLC_SEND_CRC2, HDLC_SEND_CLOSING_FLAG,
-	HDLC_SEND_IDLE1, HDLC_SEND_FAST_IDLE, HDLC_SENDFLAG_B0,
-	HDLC_SENDFLAG_B1A6, HDLC_SENDFLAG_B7, STOPPED, HDLC_SENDFLAG_ONE
-};
-
-void isdnhdlc_rcv_init(struct isdnhdlc_vars *hdlc, u32 features)
-{
-	memset(hdlc, 0, sizeof(struct isdnhdlc_vars));
-	hdlc->state = HDLC_GET_DATA;
-	if (features & HDLC_56KBIT)
-		hdlc->do_adapt56 = 1;
-	if (features & HDLC_BITREVERSE)
-		hdlc->do_bitreverse = 1;
-}
-EXPORT_SYMBOL(isdnhdlc_out_init);
-
-void isdnhdlc_out_init(struct isdnhdlc_vars *hdlc, u32 features)
-{
-	memset(hdlc, 0, sizeof(struct isdnhdlc_vars));
-	if (features & HDLC_DCHANNEL) {
-		hdlc->dchannel = 1;
-		hdlc->state = HDLC_SEND_FIRST_FLAG;
-	} else {
-		hdlc->dchannel = 0;
-		hdlc->state = HDLC_SEND_FAST_FLAG;
-		hdlc->ffvalue = 0x7e;
-	}
-	hdlc->cbin = 0x7e;
-	if (features & HDLC_56KBIT) {
-		hdlc->do_adapt56 = 1;
-		hdlc->state = HDLC_SENDFLAG_B0;
-	} else
-		hdlc->data_bits = 8;
-	if (features & HDLC_BITREVERSE)
-		hdlc->do_bitreverse = 1;
-}
-EXPORT_SYMBOL(isdnhdlc_rcv_init);
-
-static int
-check_frame(struct isdnhdlc_vars *hdlc)
-{
-	int status;
-
-	if (hdlc->dstpos < 2)	/* too small - framing error */
-		status = -HDLC_FRAMING_ERROR;
-	else if (hdlc->crc != 0xf0b8)	/* crc error */
-		status = -HDLC_CRC_ERROR;
-	else {
-		/* remove CRC */
-		hdlc->dstpos -= 2;
-		/* good frame */
-		status = hdlc->dstpos;
-	}
-	return status;
-}
-
-/*
-  isdnhdlc_decode - decodes HDLC frames from a transparent bit stream.
-
-  The source buffer is scanned for valid HDLC frames looking for
-  flags (01111110) to indicate the start of a frame. If the start of
-  the frame is found, the bit stuffing is removed (0 after 5 1's).
-  When a new flag is found, the complete frame has been received
-  and the CRC is checked.
-  If a valid frame is found, the function returns the frame length
-  excluding the CRC with the bit HDLC_END_OF_FRAME set.
-  If the beginning of a valid frame is found, the function returns
-  the length.
-  If a framing error is found (too many 1s and not a flag) the function
-  returns the length with the bit HDLC_FRAMING_ERROR set.
-  If a CRC error is found the function returns the length with the
-  bit HDLC_CRC_ERROR set.
-  If the frame length exceeds the destination buffer size, the function
-  returns the length with the bit HDLC_LENGTH_ERROR set.
-
-  src - source buffer
-  slen - source buffer length
-  count - number of bytes removed (decoded) from the source buffer
-  dst _ destination buffer
-  dsize - destination buffer size
-  returns - number of decoded bytes in the destination buffer and status
-  flag.
-*/
-int isdnhdlc_decode(struct isdnhdlc_vars *hdlc, const u8 *src, int slen,
-		    int *count, u8 *dst, int dsize)
-{
-	int status = 0;
-
-	static const unsigned char fast_flag[] = {
-		0x00, 0x00, 0x00, 0x20, 0x30, 0x38, 0x3c, 0x3e, 0x3f
-	};
-
-	static const unsigned char fast_flag_value[] = {
-		0x00, 0x7e, 0xfc, 0xf9, 0xf3, 0xe7, 0xcf, 0x9f, 0x3f
-	};
-
-	static const unsigned char fast_abort[] = {
-		0x00, 0x00, 0x80, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe, 0xff
-	};
-
-#define handle_fast_flag(h)						\
-	do {								\
-		if (h->cbin == fast_flag[h->bit_shift]) {		\
-			h->ffvalue = fast_flag_value[h->bit_shift];	\
-			h->state = HDLC_FAST_FLAG;			\
-			h->ffbit_shift = h->bit_shift;			\
-			h->bit_shift = 1;				\
-		} else {						\
-			h->state = HDLC_GET_DATA;			\
-			h->data_received = 0;				\
-		}							\
-	} while (0)
-
-#define handle_abort(h)						\
-	do {							\
-		h->shift_reg = fast_abort[h->ffbit_shift - 1];	\
-		h->hdlc_bits1 = h->ffbit_shift - 2;		\
-		if (h->hdlc_bits1 < 0)				\
-			h->hdlc_bits1 = 0;			\
-		h->data_bits = h->ffbit_shift - 1;		\
-		h->state = HDLC_GET_DATA;			\
-		h->data_received = 0;				\
-	} while (0)
-
-	*count = slen;
-
-	while (slen > 0) {
-		if (hdlc->bit_shift == 0) {
-			/* the code is for bitreverse streams */
-			if (hdlc->do_bitreverse == 0)
-				hdlc->cbin = bitrev8(*src++);
-			else
-				hdlc->cbin = *src++;
-			slen--;
-			hdlc->bit_shift = 8;
-			if (hdlc->do_adapt56)
-				hdlc->bit_shift--;
-		}
-
-		switch (hdlc->state) {
-		case STOPPED:
-			return 0;
-		case HDLC_FAST_IDLE:
-			if (hdlc->cbin == 0xff) {
-				hdlc->bit_shift = 0;
-				break;
-			}
-			hdlc->state = HDLC_GET_FLAG_B0;
-			hdlc->hdlc_bits1 = 0;
-			hdlc->bit_shift = 8;
-			break;
-		case HDLC_GET_FLAG_B0:
-			if (!(hdlc->cbin & 0x80)) {
-				hdlc->state = HDLC_GETFLAG_B1A6;
-				hdlc->hdlc_bits1 = 0;
-			} else {
-				if ((!hdlc->do_adapt56) &&
-				    (++hdlc->hdlc_bits1 >= 8) &&
-				    (hdlc->bit_shift == 1))
-					hdlc->state = HDLC_FAST_IDLE;
-			}
-			hdlc->cbin <<= 1;
-			hdlc->bit_shift--;
-			break;
-		case HDLC_GETFLAG_B1A6:
-			if (hdlc->cbin & 0x80) {
-				hdlc->hdlc_bits1++;
-				if (hdlc->hdlc_bits1 == 6)
-					hdlc->state = HDLC_GETFLAG_B7;
-			} else
-				hdlc->hdlc_bits1 = 0;
-			hdlc->cbin <<= 1;
-			hdlc->bit_shift--;
-			break;
-		case HDLC_GETFLAG_B7:
-			if (hdlc->cbin & 0x80) {
-				hdlc->state = HDLC_GET_FLAG_B0;
-			} else {
-				hdlc->state = HDLC_GET_DATA;
-				hdlc->crc = 0xffff;
-				hdlc->shift_reg = 0;
-				hdlc->hdlc_bits1 = 0;
-				hdlc->data_bits = 0;
-				hdlc->data_received = 0;
-			}
-			hdlc->cbin <<= 1;
-			hdlc->bit_shift--;
-			break;
-		case HDLC_GET_DATA:
-			if (hdlc->cbin & 0x80) {
-				hdlc->hdlc_bits1++;
-				switch (hdlc->hdlc_bits1) {
-				case 6:
-					break;
-				case 7:
-					if (hdlc->data_received)
-						/* bad frame */
-						status = -HDLC_FRAMING_ERROR;
-					if (!hdlc->do_adapt56) {
-						if (hdlc->cbin == fast_abort
-						    [hdlc->bit_shift + 1]) {
-							hdlc->state =
-								HDLC_FAST_IDLE;
-							hdlc->bit_shift = 1;
-							break;
-						}
-					} else
-						hdlc->state = HDLC_GET_FLAG_B0;
-					break;
-				default:
-					hdlc->shift_reg >>= 1;
-					hdlc->shift_reg |= 0x80;
-					hdlc->data_bits++;
-					break;
-				}
-			} else {
-				switch (hdlc->hdlc_bits1) {
-				case 5:
-					break;
-				case 6:
-					if (hdlc->data_received)
-						status = check_frame(hdlc);
-					hdlc->crc = 0xffff;
-					hdlc->shift_reg = 0;
-					hdlc->data_bits = 0;
-					if (!hdlc->do_adapt56)
-						handle_fast_flag(hdlc);
-					else {
-						hdlc->state = HDLC_GET_DATA;
-						hdlc->data_received = 0;
-					}
-					break;
-				default:
-					hdlc->shift_reg >>= 1;
-					hdlc->data_bits++;
-					break;
-				}
-				hdlc->hdlc_bits1 = 0;
-			}
-			if (status) {
-				hdlc->dstpos = 0;
-				*count -= slen;
-				hdlc->cbin <<= 1;
-				hdlc->bit_shift--;
-				return status;
-			}
-			if (hdlc->data_bits == 8) {
-				hdlc->data_bits = 0;
-				hdlc->data_received = 1;
-				hdlc->crc = crc_ccitt_byte(hdlc->crc,
-							   hdlc->shift_reg);
-
-				/* good byte received */
-				if (hdlc->dstpos < dsize)
-					dst[hdlc->dstpos++] = hdlc->shift_reg;
-				else {
-					/* frame too long */
-					status = -HDLC_LENGTH_ERROR;
-					hdlc->dstpos = 0;
-				}
-			}
-			hdlc->cbin <<= 1;
-			hdlc->bit_shift--;
-			break;
-		case HDLC_FAST_FLAG:
-			if (hdlc->cbin == hdlc->ffvalue) {
-				hdlc->bit_shift = 0;
-				break;
-			} else {
-				if (hdlc->cbin == 0xff) {
-					hdlc->state = HDLC_FAST_IDLE;
-					hdlc->bit_shift = 0;
-				} else if (hdlc->ffbit_shift == 8) {
-					hdlc->state = HDLC_GETFLAG_B7;
-					break;
-				} else
-					handle_abort(hdlc);
-			}
-			break;
-		default:
-			break;
-		}
-	}
-	*count -= slen;
-	return 0;
-}
-EXPORT_SYMBOL(isdnhdlc_decode);
-/*
-  isdnhdlc_encode - encodes HDLC frames to a transparent bit stream.
-
-  The bit stream starts with a beginning flag (01111110). After
-  that each byte is added to the bit stream with bit stuffing added
-  (0 after 5 1's).
-  When the last byte has been removed from the source buffer, the
-  CRC (2 bytes is added) and the frame terminates with the ending flag.
-  For the dchannel, the idle character (all 1's) is also added at the end.
-  If this function is called with empty source buffer (slen=0), flags or
-  idle character will be generated.
-
-  src - source buffer
-  slen - source buffer length
-  count - number of bytes removed (encoded) from source buffer
-  dst _ destination buffer
-  dsize - destination buffer size
-  returns - number of encoded bytes in the destination buffer
-*/
-int isdnhdlc_encode(struct isdnhdlc_vars *hdlc, const u8 *src, u16 slen,
-		    int *count, u8 *dst, int dsize)
-{
-	static const unsigned char xfast_flag_value[] = {
-		0x7e, 0x3f, 0x9f, 0xcf, 0xe7, 0xf3, 0xf9, 0xfc, 0x7e
-	};
-
-	int len = 0;
-
-	*count = slen;
-
-	/* special handling for one byte frames */
-	if ((slen == 1) && (hdlc->state == HDLC_SEND_FAST_FLAG))
-		hdlc->state = HDLC_SENDFLAG_ONE;
-	while (dsize > 0) {
-		if (hdlc->bit_shift == 0) {
-			if (slen && !hdlc->do_closing) {
-				hdlc->shift_reg = *src++;
-				slen--;
-				if (slen == 0)
-					/* closing sequence, CRC + flag(s) */
-					hdlc->do_closing = 1;
-				hdlc->bit_shift = 8;
-			} else {
-				if (hdlc->state == HDLC_SEND_DATA) {
-					if (hdlc->data_received) {
-						hdlc->state = HDLC_SEND_CRC1;
-						hdlc->crc ^= 0xffff;
-						hdlc->bit_shift = 8;
-						hdlc->shift_reg =
-							hdlc->crc & 0xff;
-					} else if (!hdlc->do_adapt56)
-						hdlc->state =
-							HDLC_SEND_FAST_FLAG;
-					else
-						hdlc->state =
-							HDLC_SENDFLAG_B0;
-				}
-
-			}
-		}
-
-		switch (hdlc->state) {
-		case STOPPED:
-			while (dsize--)
-				*dst++ = 0xff;
-			return dsize;
-		case HDLC_SEND_FAST_FLAG:
-			hdlc->do_closing = 0;
-			if (slen == 0) {
-				/* the code is for bitreverse streams */
-				if (hdlc->do_bitreverse == 0)
-					*dst++ = bitrev8(hdlc->ffvalue);
-				else
-					*dst++ = hdlc->ffvalue;
-				len++;
-				dsize--;
-				break;
-			}
-			/* fall through */
-		case HDLC_SENDFLAG_ONE:
-			if (hdlc->bit_shift == 8) {
-				hdlc->cbin = hdlc->ffvalue >>
-					(8 - hdlc->data_bits);
-				hdlc->state = HDLC_SEND_DATA;
-				hdlc->crc = 0xffff;
-				hdlc->hdlc_bits1 = 0;
-				hdlc->data_received = 1;
-			}
-			break;
-		case HDLC_SENDFLAG_B0:
-			hdlc->do_closing = 0;
-			hdlc->cbin <<= 1;
-			hdlc->data_bits++;
-			hdlc->hdlc_bits1 = 0;
-			hdlc->state = HDLC_SENDFLAG_B1A6;
-			break;
-		case HDLC_SENDFLAG_B1A6:
-			hdlc->cbin <<= 1;
-			hdlc->data_bits++;
-			hdlc->cbin++;
-			if (++hdlc->hdlc_bits1 == 6)
-				hdlc->state = HDLC_SENDFLAG_B7;
-			break;
-		case HDLC_SENDFLAG_B7:
-			hdlc->cbin <<= 1;
-			hdlc->data_bits++;
-			if (slen == 0) {
-				hdlc->state = HDLC_SENDFLAG_B0;
-				break;
-			}
-			if (hdlc->bit_shift == 8) {
-				hdlc->state = HDLC_SEND_DATA;
-				hdlc->crc = 0xffff;
-				hdlc->hdlc_bits1 = 0;
-				hdlc->data_received = 1;
-			}
-			break;
-		case HDLC_SEND_FIRST_FLAG:
-			hdlc->data_received = 1;
-			if (hdlc->data_bits == 8) {
-				hdlc->state = HDLC_SEND_DATA;
-				hdlc->crc = 0xffff;
-				hdlc->hdlc_bits1 = 0;
-				break;
-			}
-			hdlc->cbin <<= 1;
-			hdlc->data_bits++;
-			if (hdlc->shift_reg & 0x01)
-				hdlc->cbin++;
-			hdlc->shift_reg >>= 1;
-			hdlc->bit_shift--;
-			if (hdlc->bit_shift == 0) {
-				hdlc->state = HDLC_SEND_DATA;
-				hdlc->crc = 0xffff;
-				hdlc->hdlc_bits1 = 0;
-			}
-			break;
-		case HDLC_SEND_DATA:
-			hdlc->cbin <<= 1;
-			hdlc->data_bits++;
-			if (hdlc->hdlc_bits1 == 5) {
-				hdlc->hdlc_bits1 = 0;
-				break;
-			}
-			if (hdlc->bit_shift == 8)
-				hdlc->crc = crc_ccitt_byte(hdlc->crc,
-							   hdlc->shift_reg);
-			if (hdlc->shift_reg & 0x01) {
-				hdlc->hdlc_bits1++;
-				hdlc->cbin++;
-				hdlc->shift_reg >>= 1;
-				hdlc->bit_shift--;
-			} else {
-				hdlc->hdlc_bits1 = 0;
-				hdlc->shift_reg >>= 1;
-				hdlc->bit_shift--;
-			}
-			break;
-		case HDLC_SEND_CRC1:
-			hdlc->cbin <<= 1;
-			hdlc->data_bits++;
-			if (hdlc->hdlc_bits1 == 5) {
-				hdlc->hdlc_bits1 = 0;
-				break;
-			}
-			if (hdlc->shift_reg & 0x01) {
-				hdlc->hdlc_bits1++;
-				hdlc->cbin++;
-				hdlc->shift_reg >>= 1;
-				hdlc->bit_shift--;
-			} else {
-				hdlc->hdlc_bits1 = 0;
-				hdlc->shift_reg >>= 1;
-				hdlc->bit_shift--;
-			}
-			if (hdlc->bit_shift == 0) {
-				hdlc->shift_reg = (hdlc->crc >> 8);
-				hdlc->state = HDLC_SEND_CRC2;
-				hdlc->bit_shift = 8;
-			}
-			break;
-		case HDLC_SEND_CRC2:
-			hdlc->cbin <<= 1;
-			hdlc->data_bits++;
-			if (hdlc->hdlc_bits1 == 5) {
-				hdlc->hdlc_bits1 = 0;
-				break;
-			}
-			if (hdlc->shift_reg & 0x01) {
-				hdlc->hdlc_bits1++;
-				hdlc->cbin++;
-				hdlc->shift_reg >>= 1;
-				hdlc->bit_shift--;
-			} else {
-				hdlc->hdlc_bits1 = 0;
-				hdlc->shift_reg >>= 1;
-				hdlc->bit_shift--;
-			}
-			if (hdlc->bit_shift == 0) {
-				hdlc->shift_reg = 0x7e;
-				hdlc->state = HDLC_SEND_CLOSING_FLAG;
-				hdlc->bit_shift = 8;
-			}
-			break;
-		case HDLC_SEND_CLOSING_FLAG:
-			hdlc->cbin <<= 1;
-			hdlc->data_bits++;
-			if (hdlc->hdlc_bits1 == 5) {
-				hdlc->hdlc_bits1 = 0;
-				break;
-			}
-			if (hdlc->shift_reg & 0x01)
-				hdlc->cbin++;
-			hdlc->shift_reg >>= 1;
-			hdlc->bit_shift--;
-			if (hdlc->bit_shift == 0) {
-				hdlc->ffvalue =
-					xfast_flag_value[hdlc->data_bits];
-				if (hdlc->dchannel) {
-					hdlc->ffvalue = 0x7e;
-					hdlc->state = HDLC_SEND_IDLE1;
-					hdlc->bit_shift = 8-hdlc->data_bits;
-					if (hdlc->bit_shift == 0)
-						hdlc->state =
-							HDLC_SEND_FAST_IDLE;
-				} else {
-					if (!hdlc->do_adapt56) {
-						hdlc->state =
-							HDLC_SEND_FAST_FLAG;
-						hdlc->data_received = 0;
-					} else {
-						hdlc->state = HDLC_SENDFLAG_B0;
-						hdlc->data_received = 0;
-					}
-					/* Finished this frame, send flags */
-					if (dsize > 1)
-						dsize = 1;
-				}
-			}
-			break;
-		case HDLC_SEND_IDLE1:
-			hdlc->do_closing = 0;
-			hdlc->cbin <<= 1;
-			hdlc->cbin++;
-			hdlc->data_bits++;
-			hdlc->bit_shift--;
-			if (hdlc->bit_shift == 0) {
-				hdlc->state = HDLC_SEND_FAST_IDLE;
-				hdlc->bit_shift = 0;
-			}
-			break;
-		case HDLC_SEND_FAST_IDLE:
-			hdlc->do_closing = 0;
-			hdlc->cbin = 0xff;
-			hdlc->data_bits = 8;
-			if (hdlc->bit_shift == 8) {
-				hdlc->cbin = 0x7e;
-				hdlc->state = HDLC_SEND_FIRST_FLAG;
-			} else {
-				/* the code is for bitreverse streams */
-				if (hdlc->do_bitreverse == 0)
-					*dst++ = bitrev8(hdlc->cbin);
-				else
-					*dst++ = hdlc->cbin;
-				hdlc->bit_shift = 0;
-				hdlc->data_bits = 0;
-				len++;
-				dsize = 0;
-			}
-			break;
-		default:
-			break;
-		}
-		if (hdlc->do_adapt56) {
-			if (hdlc->data_bits == 7) {
-				hdlc->cbin <<= 1;
-				hdlc->cbin++;
-				hdlc->data_bits++;
-			}
-		}
-		if (hdlc->data_bits == 8) {
-			/* the code is for bitreverse streams */
-			if (hdlc->do_bitreverse == 0)
-				*dst++ = bitrev8(hdlc->cbin);
-			else
-				*dst++ = hdlc->cbin;
-			hdlc->data_bits = 0;
-			len++;
-			dsize--;
-		}
-	}
-	*count -= slen;
-
-	return len;
-}
-EXPORT_SYMBOL(isdnhdlc_encode);
diff --git a/include/linux/isdn/hdlc.h b/include/linux/isdn/hdlc.h
deleted file mode 100644
index 96521370c782..000000000000
--- a/include/linux/isdn/hdlc.h
+++ /dev/null
@@ -1,82 +0,0 @@
-/*
- * hdlc.h  --  General purpose ISDN HDLC decoder.
- *
- * Implementation of a HDLC decoder/encoder in software.
- * Necessary because some ISDN devices don't have HDLC
- * controllers.
- *
- * Copyright (C)
- *	2009	Karsten Keil		<keil@b1-systems.de>
- *	2002	Wolfgang Mües		<wolfgang@iksw-muees.de>
- *	2001	Frode Isaksen		<fisaksen@bewan.com>
- *	2001	Kai Germaschewski	<kai.germaschewski@gmx.de>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- */
-
-#ifndef __ISDNHDLC_H__
-#define __ISDNHDLC_H__
-
-struct isdnhdlc_vars {
-	int bit_shift;
-	int hdlc_bits1;
-	int data_bits;
-	int ffbit_shift;	/* encoding only */
-	int state;
-	int dstpos;
-
-	u16 crc;
-
-	u8 cbin;
-	u8 shift_reg;
-	u8 ffvalue;
-
-	/* set if transferring data */
-	u32 data_received:1;
-	/* set if D channel (send idle instead of flags) */
-	u32 dchannel:1;
-	/* set if 56K adaptation */
-	u32 do_adapt56:1;
-	/* set if in closing phase (need to send CRC + flag) */
-	u32 do_closing:1;
-	/* set if data is bitreverse */
-	u32 do_bitreverse:1;
-};
-
-/* Feature Flags */
-#define HDLC_56KBIT	0x01
-#define HDLC_DCHANNEL	0x02
-#define HDLC_BITREVERSE	0x04
-
-/*
-  The return value from isdnhdlc_decode is
-  the frame length, 0 if no complete frame was decoded,
-  or a negative error number
-*/
-#define HDLC_FRAMING_ERROR     1
-#define HDLC_CRC_ERROR         2
-#define HDLC_LENGTH_ERROR      3
-
-extern void	isdnhdlc_rcv_init(struct isdnhdlc_vars *hdlc, u32 features);
-
-extern int	isdnhdlc_decode(struct isdnhdlc_vars *hdlc, const u8 *src,
-			int slen, int *count, u8 *dst, int dsize);
-
-extern void	isdnhdlc_out_init(struct isdnhdlc_vars *hdlc, u32 features);
-
-extern int	isdnhdlc_encode(struct isdnhdlc_vars *hdlc, const u8 *src,
-			u16 slen, int *count, u8 *dst, int dsize);
-
-#endif /* __ISDNHDLC_H__ */
-- 
cgit v1.2.3


From 2cf6bffc49dae26edd12af6b57c8c780590380bf Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Thu, 23 May 2019 15:44:12 +0200
Subject: netfilter: replace skb_make_writable with skb_ensure_writable

This converts all remaining users and then removes skb_make_writable.

Suggested-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter.h        |  5 -----
 net/netfilter/core.c             | 22 ----------------------
 net/netfilter/nf_synproxy_core.c |  2 +-
 net/netfilter/nfnetlink_queue.c  |  2 +-
 net/netfilter/xt_DSCP.c          |  8 ++++----
 5 files changed, 6 insertions(+), 33 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h
index 996bc247ef6e..049aeb40fa35 100644
--- a/include/linux/netfilter.h
+++ b/include/linux/netfilter.h
@@ -336,11 +336,6 @@ int compat_nf_getsockopt(struct sock *sk, u_int8_t pf, int optval,
 		char __user *opt, int *len);
 #endif
 
-/* Call this before modifying an existing packet: ensures it is
-   modifiable and linear to the point you care about (writable_len).
-   Returns true or false. */
-int skb_make_writable(struct sk_buff *skb, unsigned int writable_len);
-
 struct flowi;
 struct nf_queue_entry;
 
diff --git a/net/netfilter/core.c b/net/netfilter/core.c
index b96fd3f54705..817a9e5d16e4 100644
--- a/net/netfilter/core.c
+++ b/net/netfilter/core.c
@@ -536,28 +536,6 @@ int nf_hook_slow(struct sk_buff *skb, struct nf_hook_state *state,
 }
 EXPORT_SYMBOL(nf_hook_slow);
 
-
-int skb_make_writable(struct sk_buff *skb, unsigned int writable_len)
-{
-	if (writable_len > skb->len)
-		return 0;
-
-	/* Not exclusive use of packet?  Must copy. */
-	if (!skb_cloned(skb)) {
-		if (writable_len <= skb_headlen(skb))
-			return 1;
-	} else if (skb_clone_writable(skb, writable_len))
-		return 1;
-
-	if (writable_len <= skb_headlen(skb))
-		writable_len = 0;
-	else
-		writable_len -= skb_headlen(skb);
-
-	return !!__pskb_pull_tail(skb, writable_len);
-}
-EXPORT_SYMBOL(skb_make_writable);
-
 /* This needs to be compiled in any case to avoid dependencies between the
  * nfnetlink_queue code and nf_conntrack.
  */
diff --git a/net/netfilter/nf_synproxy_core.c b/net/netfilter/nf_synproxy_core.c
index 8ff4d22f10b2..3d58a9e93e5a 100644
--- a/net/netfilter/nf_synproxy_core.c
+++ b/net/netfilter/nf_synproxy_core.c
@@ -196,7 +196,7 @@ unsigned int synproxy_tstamp_adjust(struct sk_buff *skb,
 	optoff = protoff + sizeof(struct tcphdr);
 	optend = protoff + th->doff * 4;
 
-	if (!skb_make_writable(skb, optend))
+	if (skb_ensure_writable(skb, optend))
 		return 0;
 
 	while (optoff < optend) {
diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c
index 27dac47b29c2..831f57008d78 100644
--- a/net/netfilter/nfnetlink_queue.c
+++ b/net/netfilter/nfnetlink_queue.c
@@ -863,7 +863,7 @@ nfqnl_mangle(void *data, int data_len, struct nf_queue_entry *e, int diff)
 		}
 		skb_put(e->skb, diff);
 	}
-	if (!skb_make_writable(e->skb, data_len))
+	if (skb_ensure_writable(e->skb, data_len))
 		return -ENOMEM;
 	skb_copy_to_linear_data(e->skb, data, data_len);
 	e->skb->ip_summed = CHECKSUM_NONE;
diff --git a/net/netfilter/xt_DSCP.c b/net/netfilter/xt_DSCP.c
index 098ed851b7a7..30d554d6c213 100644
--- a/net/netfilter/xt_DSCP.c
+++ b/net/netfilter/xt_DSCP.c
@@ -34,7 +34,7 @@ dscp_tg(struct sk_buff *skb, const struct xt_action_param *par)
 	u_int8_t dscp = ipv4_get_dsfield(ip_hdr(skb)) >> XT_DSCP_SHIFT;
 
 	if (dscp != dinfo->dscp) {
-		if (!skb_make_writable(skb, sizeof(struct iphdr)))
+		if (skb_ensure_writable(skb, sizeof(struct iphdr)))
 			return NF_DROP;
 
 		ipv4_change_dsfield(ip_hdr(skb),
@@ -52,7 +52,7 @@ dscp_tg6(struct sk_buff *skb, const struct xt_action_param *par)
 	u_int8_t dscp = ipv6_get_dsfield(ipv6_hdr(skb)) >> XT_DSCP_SHIFT;
 
 	if (dscp != dinfo->dscp) {
-		if (!skb_make_writable(skb, sizeof(struct ipv6hdr)))
+		if (skb_ensure_writable(skb, sizeof(struct ipv6hdr)))
 			return NF_DROP;
 
 		ipv6_change_dsfield(ipv6_hdr(skb),
@@ -82,7 +82,7 @@ tos_tg(struct sk_buff *skb, const struct xt_action_param *par)
 	nv   = (orig & ~info->tos_mask) ^ info->tos_value;
 
 	if (orig != nv) {
-		if (!skb_make_writable(skb, sizeof(struct iphdr)))
+		if (skb_ensure_writable(skb, sizeof(struct iphdr)))
 			return NF_DROP;
 		iph = ip_hdr(skb);
 		ipv4_change_dsfield(iph, 0, nv);
@@ -102,7 +102,7 @@ tos_tg6(struct sk_buff *skb, const struct xt_action_param *par)
 	nv   = (orig & ~info->tos_mask) ^ info->tos_value;
 
 	if (orig != nv) {
-		if (!skb_make_writable(skb, sizeof(struct iphdr)))
+		if (skb_ensure_writable(skb, sizeof(struct iphdr)))
 			return NF_DROP;
 		iph = ipv6_hdr(skb);
 		ipv6_change_dsfield(iph, 0, nv);
-- 
cgit v1.2.3


From c9bb6165a16e6d5498981a6c777b94a78e74462b Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Fri, 31 May 2019 11:15:26 +0200
Subject: netfilter: nf_conntrack_bridge: fix CONFIG_IPV6=y

This patch fixes a few problems with CONFIG_IPV6=y and
CONFIG_NF_CONNTRACK_BRIDGE=m:

In file included from net/netfilter/utils.c:5:
include/linux/netfilter_ipv6.h: In function 'nf_ipv6_br_defrag':
include/linux/netfilter_ipv6.h:110:9: error: implicit declaration of function 'nf_ct_frag6_gather'; did you mean 'nf_ct_attach'? [-Werror=implicit-function-declaration]

And these too:

net/ipv6/netfilter.c:242:2: error: unknown field 'br_defrag' specified in initializer
net/ipv6/netfilter.c:243:2: error: unknown field 'br_fragment' specified in initializer

This patch includes an original chunk from wenxu.

Fixes: 764dd163ac92 ("netfilter: nf_conntrack_bridge: add support for IPv6")
Reported-by: Stephen Rothwell <sfr@canb.auug.org.au>
Reported-by: Yuehaibing <yuehaibing@huawei.com>
Reported-by: kbuild test robot <lkp@intel.com>
Reported-by: wenxu <wenxu@ucloud.cn>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
Signed-off-by: wenxu <wenxu@ucloud.cn>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netfilter_ipv6.h | 2 ++
 net/ipv6/netfilter.c           | 2 +-
 2 files changed, 3 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter_ipv6.h b/include/linux/netfilter_ipv6.h
index a21b8c9623ee..3a3dc4b1f0e7 100644
--- a/include/linux/netfilter_ipv6.h
+++ b/include/linux/netfilter_ipv6.h
@@ -96,6 +96,8 @@ static inline int nf_ip6_route(struct net *net, struct dst_entry **dst,
 #endif
 }
 
+#include <net/netfilter/ipv6/nf_defrag_ipv6.h>
+
 static inline int nf_ipv6_br_defrag(struct net *net, struct sk_buff *skb,
 				    u32 user)
 {
diff --git a/net/ipv6/netfilter.c b/net/ipv6/netfilter.c
index c6665382acb5..9530cc280953 100644
--- a/net/ipv6/netfilter.c
+++ b/net/ipv6/netfilter.c
@@ -238,7 +238,7 @@ static const struct nf_ipv6_ops ipv6ops = {
 	.route_input		= ip6_route_input,
 	.fragment		= ip6_fragment,
 	.reroute		= nf_ip6_reroute,
-#if IS_MODULE(CONFIG_NF_CONNTRACK_BRIDGE)
+#if IS_MODULE(CONFIG_IPV6)
 	.br_defrag		= nf_ct_frag6_gather,
 	.br_fragment		= br_ip6_fragment,
 #endif
-- 
cgit v1.2.3


From a5e112e6424adb77d953eac20e6936b952fd6b32 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 13 May 2019 12:37:17 -0700
Subject: cgroup: add cgroup_parse_float()

cgroup already uses floating point for percent[ile] numbers and there
are several controllers which want to take them as input.  Add a
generic parse helper to handle inputs.

Update the interface convention documentation about the use of
percentage numbers.  While at it, also clarify the default time unit.

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 Documentation/admin-guide/cgroup-v2.rst |  6 +++++
 include/linux/cgroup.h                  |  2 ++
 kernel/cgroup/cgroup.c                  | 43 +++++++++++++++++++++++++++++++++
 3 files changed, 51 insertions(+)

(limited to 'include/linux')

diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
index 88e746074252..73b0c0d8df31 100644
--- a/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@ -696,6 +696,12 @@ Conventions
   informational files on the root cgroup which end up showing global
   information available elsewhere shouldn't exist.
 
+- The default time unit is microseconds.  If a different unit is ever
+  used, an explicit unit suffix must be present.
+
+- A parts-per quantity should use a percentage decimal with at least
+  two digit fractional part - e.g. 13.40.
+
 - If a controller implements weight based resource distribution, its
   interface file should be named "weight" and have the range [1,
   10000] with 100 as the default.  The values are chosen to allow
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 0297f930a56e..3745ecdad925 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -131,6 +131,8 @@ void cgroup_free(struct task_struct *p);
 int cgroup_init_early(void);
 int cgroup_init(void);
 
+int cgroup_parse_float(const char *input, unsigned dec_shift, s64 *v);
+
 /*
  * Iteration helpers and macros.
  */
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index a7df319c2e9a..7dffcfe17441 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -6387,4 +6387,47 @@ static int __init cgroup_sysfs_init(void)
 	return sysfs_create_group(kernel_kobj, &cgroup_sysfs_attr_group);
 }
 subsys_initcall(cgroup_sysfs_init);
+
+static u64 power_of_ten(int power)
+{
+	u64 v = 1;
+	while (power--)
+		v *= 10;
+	return v;
+}
+
+/**
+ * cgroup_parse_float - parse a floating number
+ * @input: input string
+ * @dec_shift: number of decimal digits to shift
+ * @v: output
+ *
+ * Parse a decimal floating point number in @input and store the result in
+ * @v with decimal point right shifted @dec_shift times.  For example, if
+ * @input is "12.3456" and @dec_shift is 3, *@v will be set to 12345.
+ * Returns 0 on success, -errno otherwise.
+ *
+ * There's nothing cgroup specific about this function except that it's
+ * currently the only user.
+ */
+int cgroup_parse_float(const char *input, unsigned dec_shift, s64 *v)
+{
+	s64 whole, frac = 0;
+	int fstart = 0, fend = 0, flen;
+
+	if (!sscanf(input, "%lld.%n%lld%n", &whole, &fstart, &frac, &fend))
+		return -EINVAL;
+	if (frac < 0)
+		return -EINVAL;
+
+	flen = fend > fstart ? fend - fstart : 0;
+	if (flen < dec_shift)
+		frac *= power_of_ten(dec_shift - flen);
+	else
+		frac = DIV_ROUND_CLOSEST_ULL(frac, power_of_ten(flen - dec_shift));
+
+	*v = whole * power_of_ten(dec_shift) + frac;
+	return 0;
+}
+
 #endif /* CONFIG_SYSFS */
-- 
cgit v1.2.3


From 0b9055a112fd86c07b9d4857b61019485ec6526f Mon Sep 17 00:00:00 2001
From: Moshe Shemesh <moshe@mellanox.com>
Date: Wed, 29 May 2019 22:50:24 +0000
Subject: net/mlx5: Add core dump register access HW bits

Add Firmware core dump registers and HW definitions.

Signed-off-by: Moshe Shemesh <moshe@mellanox.com>
Signed-off-by: Eran Ben Elisha <eranbe@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 include/linux/mlx5/driver.h   |  1 +
 include/linux/mlx5/mlx5_ifc.h | 17 ++++++++++++++++-
 2 files changed, 17 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 5a27246db883..b5431f7d97cb 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -107,6 +107,7 @@ enum {
 	MLX5_REG_FPGA_CAP	 = 0x4022,
 	MLX5_REG_FPGA_CTRL	 = 0x4023,
 	MLX5_REG_FPGA_ACCESS_REG = 0x4024,
+	MLX5_REG_CORE_DUMP	 = 0x402e,
 	MLX5_REG_PCAP		 = 0x5001,
 	MLX5_REG_PMTU		 = 0x5003,
 	MLX5_REG_PTYS		 = 0x5004,
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index 5e74305e2e57..7ee422e38826 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -715,7 +715,9 @@ struct mlx5_ifc_qos_cap_bits {
 };
 
 struct mlx5_ifc_debug_cap_bits {
-	u8         reserved_at_0[0x20];
+	u8         core_dump_general[0x1];
+	u8         core_dump_qp[0x1];
+	u8         reserved_at_2[0x1e];
 
 	u8         reserved_at_20[0x2];
 	u8         stall_detect[0x1];
@@ -2531,6 +2533,7 @@ union mlx5_ifc_hca_cap_union_bits {
 	struct mlx5_ifc_e_switch_cap_bits e_switch_cap;
 	struct mlx5_ifc_vector_calc_cap_bits vector_calc_cap;
 	struct mlx5_ifc_qos_cap_bits qos_cap;
+	struct mlx5_ifc_debug_cap_bits debug_cap;
 	struct mlx5_ifc_fpga_cap_bits fpga_cap;
 	u8         reserved_at_0[0x8000];
 };
@@ -8546,6 +8549,18 @@ struct mlx5_ifc_qcam_reg_bits {
 	u8         reserved_at_1c0[0x80];
 };
 
+struct mlx5_ifc_core_dump_reg_bits {
+	u8         reserved_at_0[0x18];
+	u8         core_dump_type[0x8];
+
+	u8         reserved_at_20[0x30];
+	u8         vhca_id[0x10];
+
+	u8         reserved_at_60[0x8];
+	u8         qpn[0x18];
+	u8         reserved_at_80[0x180];
+};
+
 struct mlx5_ifc_pcap_reg_bits {
 	u8         reserved_at_0[0x8];
 	u8         local_port[0x8];
-- 
cgit v1.2.3


From c6d4e45d3b44b71227588c2f76615380b3961f96 Mon Sep 17 00:00:00 2001
From: Eli Britstein <elibr@mellanox.com>
Date: Wed, 29 May 2019 22:50:29 +0000
Subject: net/mlx5: Introduce termination table bits

Termination table is a flow table with a termination flag. The flag
allows the firmware to assume that the specified actions are the last
actions list. This assumption allows the FW to safely perform potential
looping logic (e.g. hairpin). Introduce the bits for this attribute.

Signed-off-by: Eli Britstein <elibr@mellanox.com>
Reviewed-by: Oz Shlomo <ozsh@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c | 3 +++
 include/linux/mlx5/fs.h                          | 1 +
 include/linux/mlx5/mlx5_ifc.h                    | 6 ++++--
 3 files changed, 8 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c
index 013b1ca4a791..bb24c3797218 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c
@@ -147,6 +147,7 @@ static int mlx5_cmd_create_flow_table(struct mlx5_flow_root_namespace *ns,
 {
 	int en_encap = !!(ft->flags & MLX5_FLOW_TABLE_TUNNEL_EN_REFORMAT);
 	int en_decap = !!(ft->flags & MLX5_FLOW_TABLE_TUNNEL_EN_DECAP);
+	int term = !!(ft->flags & MLX5_FLOW_TABLE_TERMINATION);
 	u32 out[MLX5_ST_SZ_DW(create_flow_table_out)] = {0};
 	u32 in[MLX5_ST_SZ_DW(create_flow_table_in)]   = {0};
 	struct mlx5_core_dev *dev = ns->dev;
@@ -167,6 +168,8 @@ static int mlx5_cmd_create_flow_table(struct mlx5_flow_root_namespace *ns,
 		 en_decap);
 	MLX5_SET(create_flow_table_in, in, flow_table_context.reformat_en,
 		 en_encap);
+	MLX5_SET(create_flow_table_in, in, flow_table_context.termination_table,
+		 term);
 
 	switch (ft->op_mod) {
 	case FS_FT_OP_MOD_NORMAL:
diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h
index e690ba0f965c..2ddaa97f2179 100644
--- a/include/linux/mlx5/fs.h
+++ b/include/linux/mlx5/fs.h
@@ -47,6 +47,7 @@ enum {
 enum {
 	MLX5_FLOW_TABLE_TUNNEL_EN_REFORMAT = BIT(0),
 	MLX5_FLOW_TABLE_TUNNEL_EN_DECAP = BIT(1),
+	MLX5_FLOW_TABLE_TERMINATION = BIT(2),
 };
 
 #define LEFTOVERS_RULE_NUM	 2
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index 7ee422e38826..feaa909bf14f 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -382,7 +382,8 @@ struct mlx5_ifc_flow_table_prop_layout_bits {
 	u8	   reformat_and_modify_action[0x1];
 	u8         reserved_at_15[0x2];
 	u8	   table_miss_action_domain[0x1];
-	u8         reserved_at_18[0x8];
+	u8         termination_table[0x1];
+	u8         reserved_at_19[0x7];
 	u8         reserved_at_20[0x2];
 	u8         log_max_ft_size[0x6];
 	u8         log_max_modify_header_context[0x8];
@@ -7239,7 +7240,8 @@ struct mlx5_ifc_create_flow_table_out_bits {
 struct mlx5_ifc_flow_table_context_bits {
 	u8         reformat_en[0x1];
 	u8         decap_en[0x1];
-	u8         reserved_at_2[0x2];
+	u8         reserved_at_2[0x1];
+	u8         termination_table[0x1];
 	u8         table_miss_action[0x4];
 	u8         level[0x8];
 	u8         reserved_at_10[0x8];
-- 
cgit v1.2.3


From cd56f929e6a547180f889a4def370bdd6d48d223 Mon Sep 17 00:00:00 2001
From: Vu Pham <vuhuong@mellanox.com>
Date: Wed, 29 May 2019 22:50:34 +0000
Subject: net/mlx5: E-Switch, Replace host_params event with functions_changed
 event

To support SR-IOV on an E-Switch manager, num_vfs is queried from the
firmware whenever the E-Switch manager is notified by an
esw_functions_changed event.

Replace the host_params event with the esw_functions_changed event, which
is a more appropriate name.

While at it, also correct num_vfs type from int to u16 as expected by
the function mlx5_esw_query_functions().

Signed-off-by: Vu Pham <vuhuong@mellanox.com>
Reviewed-by: Parav Pandit <parav@mellanox.com>
Reviewed-by: Bodong Wang <bodong@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/cmd.c      |  4 +-
 drivers/net/ethernet/mellanox/mlx5/core/ecpf.c     | 27 ---------
 drivers/net/ethernet/mellanox/mlx5/core/ecpf.h     |  4 --
 drivers/net/ethernet/mellanox/mlx5/core/eq.c       |  3 +-
 drivers/net/ethernet/mellanox/mlx5/core/eswitch.c  | 32 +++++++++-
 drivers/net/ethernet/mellanox/mlx5/core/eswitch.h  |  6 +-
 .../ethernet/mellanox/mlx5/core/eswitch_offloads.c | 69 +++++++++++++---------
 drivers/net/ethernet/mellanox/mlx5/core/events.c   |  4 +-
 include/linux/mlx5/device.h                        |  2 +-
 include/linux/mlx5/mlx5_ifc.h                      |  6 +-
 10 files changed, 86 insertions(+), 71 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c
index 937ba4bcb056..7d3aec98e31f 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c
@@ -316,7 +316,7 @@ static int mlx5_internal_err_ret_value(struct mlx5_core_dev *dev, u16 op,
 	case MLX5_CMD_OP_DESTROY_GENERAL_OBJECT:
 	case MLX5_CMD_OP_DEALLOC_MEMIC:
 	case MLX5_CMD_OP_PAGE_FAULT_RESUME:
-	case MLX5_CMD_OP_QUERY_HOST_PARAMS:
+	case MLX5_CMD_OP_QUERY_ESW_FUNCTIONS:
 		return MLX5_CMD_STAT_OK;
 
 	case MLX5_CMD_OP_QUERY_HCA_CAP:
@@ -628,7 +628,7 @@ const char *mlx5_command_str(int command)
 	MLX5_COMMAND_STR_CASE(QUERY_MODIFY_HEADER_CONTEXT);
 	MLX5_COMMAND_STR_CASE(ALLOC_MEMIC);
 	MLX5_COMMAND_STR_CASE(DEALLOC_MEMIC);
-	MLX5_COMMAND_STR_CASE(QUERY_HOST_PARAMS);
+	MLX5_COMMAND_STR_CASE(QUERY_ESW_FUNCTIONS);
 	default: return "unknown command opcode";
 	}
 }
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/ecpf.c b/drivers/net/ethernet/mellanox/mlx5/core/ecpf.c
index 4746f2d28fb6..1bcf8b8f9713 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/ecpf.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/ecpf.c
@@ -83,30 +83,3 @@ void mlx5_ec_cleanup(struct mlx5_core_dev *dev)
 
 	mlx5_peer_pf_cleanup(dev);
 }
-
-static int mlx5_query_host_params_context(struct mlx5_core_dev *dev,
-					  u32 *out, int outlen)
-{
-	u32 in[MLX5_ST_SZ_DW(query_host_params_in)] = {};
-
-	MLX5_SET(query_host_params_in, in, opcode,
-		 MLX5_CMD_OP_QUERY_HOST_PARAMS);
-
-	return mlx5_cmd_exec(dev, in, sizeof(in), out, outlen);
-}
-
-int mlx5_query_host_params_num_vfs(struct mlx5_core_dev *dev, int *num_vf)
-{
-	u32 out[MLX5_ST_SZ_DW(query_host_params_out)] = {};
-	int err;
-
-	err = mlx5_query_host_params_context(dev, out, sizeof(out));
-	if (err)
-		return err;
-
-	*num_vf = MLX5_GET(query_host_params_out, out,
-			   host_params_context.host_num_of_vfs);
-	mlx5_core_dbg(dev, "host_num_of_vfs %d\n", *num_vf);
-
-	return 0;
-}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/ecpf.h b/drivers/net/ethernet/mellanox/mlx5/core/ecpf.h
index 346372df218f..d3d7a00a02ac 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/ecpf.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/ecpf.h
@@ -16,7 +16,6 @@ enum {
 bool mlx5_read_embedded_cpu(struct mlx5_core_dev *dev);
 int mlx5_ec_init(struct mlx5_core_dev *dev);
 void mlx5_ec_cleanup(struct mlx5_core_dev *dev);
-int mlx5_query_host_params_num_vfs(struct mlx5_core_dev *dev, int *num_vf);
 
 #else  /* CONFIG_MLX5_ESWITCH */
 
@@ -24,9 +23,6 @@ static inline bool
 mlx5_read_embedded_cpu(struct mlx5_core_dev *dev) { return false; }
 static inline int mlx5_ec_init(struct mlx5_core_dev *dev) { return 0; }
 static inline void mlx5_ec_cleanup(struct mlx5_core_dev *dev) {}
-static inline int
-mlx5_query_host_params_num_vfs(struct mlx5_core_dev *dev, int *num_vf)
-{ return -EOPNOTSUPP; }
 
 #endif /* CONFIG_MLX5_ESWITCH */
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eq.c b/drivers/net/ethernet/mellanox/mlx5/core/eq.c
index 23883d1fa22f..052bd70e4aa6 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eq.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eq.c
@@ -534,7 +534,8 @@ static u64 gather_async_events_mask(struct mlx5_core_dev *dev)
 		async_event_mask |= (1ull << MLX5_EVENT_TYPE_MONITOR_COUNTER);
 
 	if (mlx5_core_is_ecpf_esw_manager(dev))
-		async_event_mask |= (1ull << MLX5_EVENT_TYPE_HOST_PARAMS_CHANGE);
+		async_event_mask |=
+			(1ull << MLX5_EVENT_TYPE_ESW_FUNCTIONS_CHANGED);
 
 	return async_event_mask;
 }
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
index 9ea0ccfe5ef5..d8935232964a 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
@@ -1686,13 +1686,41 @@ static int eswitch_vport_event(struct notifier_block *nb,
 	return NOTIFY_OK;
 }
 
+static int query_esw_functions(struct mlx5_core_dev *dev,
+			       u32 *out, int outlen)
+{
+	u32 in[MLX5_ST_SZ_DW(query_esw_functions_in)] = {0};
+
+	MLX5_SET(query_esw_functions_in, in, opcode,
+		 MLX5_CMD_OP_QUERY_ESW_FUNCTIONS);
+
+	return mlx5_cmd_exec(dev, in, sizeof(in), out, outlen);
+}
+
+int mlx5_esw_query_functions(struct mlx5_core_dev *dev, u16 *num_vfs)
+{
+	u32 out[MLX5_ST_SZ_DW(query_esw_functions_out)] = {0};
+	int err;
+
+	err = query_esw_functions(dev, out, sizeof(out));
+	if (err)
+		return err;
+
+	*num_vfs = MLX5_GET(query_esw_functions_out, out,
+			    host_params_context.host_num_of_vfs);
+	esw_debug(dev, "host_num_of_vfs=%d\n", *num_vfs);
+
+	return 0;
+}
+
 /* Public E-Switch API */
 #define ESW_ALLOWED(esw) ((esw) && MLX5_ESWITCH_MANAGER((esw)->dev))
 
 int mlx5_eswitch_enable_sriov(struct mlx5_eswitch *esw, int nvfs, int mode)
 {
-	int vf_nvports = 0, total_nvports = 0;
 	struct mlx5_vport *vport;
+	int total_nvports = 0;
+	u16 vf_nvports = 0;
 	int err;
 	int i, enabled_events;
 
@@ -1712,7 +1740,7 @@ int mlx5_eswitch_enable_sriov(struct mlx5_eswitch *esw, int nvfs, int mode)
 
 	if (mode == SRIOV_OFFLOADS) {
 		if (mlx5_core_is_ecpf_esw_manager(esw->dev)) {
-			err = mlx5_query_host_params_num_vfs(esw->dev, &vf_nvports);
+			err = mlx5_esw_query_functions(esw->dev, &vf_nvports);
 			if (err)
 				return err;
 			total_nvports = esw->total_vports;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
index ed3fad689ec9..320dd83dd301 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
@@ -190,7 +190,7 @@ struct mlx5_host_work {
 	struct mlx5_eswitch	*esw;
 };
 
-struct mlx5_host_info {
+struct mlx5_esw_functions {
 	struct mlx5_nb		nb;
 	u16			num_vfs;
 };
@@ -219,7 +219,7 @@ struct mlx5_eswitch {
 	int                     mode;
 	int                     nvports;
 	u16                     manager_vport;
-	struct mlx5_host_info	host_info;
+	struct mlx5_esw_functions esw_funcs;
 };
 
 void esw_offloads_cleanup(struct mlx5_eswitch *esw);
@@ -386,6 +386,8 @@ bool mlx5_esw_lag_prereq(struct mlx5_core_dev *dev0,
 bool mlx5_esw_multipath_prereq(struct mlx5_core_dev *dev0,
 			       struct mlx5_core_dev *dev1);
 
+int mlx5_esw_query_functions(struct mlx5_core_dev *dev, u16 *num_vfs);
+
 #define MLX5_DEBUG_ESWITCH_MASK BIT(3)
 
 #define esw_info(__dev, format, ...)			\
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
index e09ae27485ee..83689678b400 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
@@ -41,7 +41,6 @@
 #include "en.h"
 #include "fs_core.h"
 #include "lib/devcom.h"
-#include "ecpf.h"
 #include "lib/eq.h"
 
 /* There are two match-all miss flows, one for unicast dst mac and
@@ -1782,57 +1781,79 @@ static void esw_offloads_steering_cleanup(struct mlx5_eswitch *esw)
 		esw_prio_tag_acls_cleanup(esw);
 }
 
-static void esw_host_params_event_handler(struct work_struct *work)
+static void esw_functions_changed_event_handler(struct work_struct *work)
 {
 	struct mlx5_host_work *host_work;
 	struct mlx5_eswitch *esw;
-	int err, num_vf = 0;
+	u16 num_vfs = 0;
+	int err;
 
 	host_work = container_of(work, struct mlx5_host_work, work);
 	esw = host_work->esw;
 
-	err = mlx5_query_host_params_num_vfs(esw->dev, &num_vf);
-	if (err || num_vf == esw->host_info.num_vfs)
+	err = mlx5_esw_query_functions(esw->dev, &num_vfs);
+	if (err || num_vfs == esw->esw_funcs.num_vfs)
 		goto out;
 
 	/* Number of VFs can only change from "0 to x" or "x to 0". */
-	if (esw->host_info.num_vfs > 0) {
-		esw_offloads_unload_vf_reps(esw, esw->host_info.num_vfs);
+	if (esw->esw_funcs.num_vfs > 0) {
+		esw_offloads_unload_vf_reps(esw, esw->esw_funcs.num_vfs);
 	} else {
-		err = esw_offloads_load_vf_reps(esw, num_vf);
+		err = esw_offloads_load_vf_reps(esw, num_vfs);
 
 		if (err)
 			goto out;
 	}
 
-	esw->host_info.num_vfs = num_vf;
+	esw->esw_funcs.num_vfs = num_vfs;
 
 out:
 	kfree(host_work);
 }
 
-static int esw_host_params_event(struct notifier_block *nb,
-				 unsigned long type, void *data)
+static int esw_functions_changed_event(struct notifier_block *nb,
+				       unsigned long type, void *data)
 {
+	struct mlx5_esw_functions *esw_funcs;
 	struct mlx5_host_work *host_work;
-	struct mlx5_host_info *host_info;
 	struct mlx5_eswitch *esw;
 
 	host_work = kzalloc(sizeof(*host_work), GFP_ATOMIC);
 	if (!host_work)
 		return NOTIFY_DONE;
 
-	host_info = mlx5_nb_cof(nb, struct mlx5_host_info, nb);
-	esw = container_of(host_info, struct mlx5_eswitch, host_info);
+	esw_funcs = mlx5_nb_cof(nb, struct mlx5_esw_functions, nb);
+	esw = container_of(esw_funcs, struct mlx5_eswitch, esw_funcs);
 
 	host_work->esw = esw;
 
-	INIT_WORK(&host_work->work, esw_host_params_event_handler);
+	INIT_WORK(&host_work->work, esw_functions_changed_event_handler);
 	queue_work(esw->work_queue, &host_work->work);
 
 	return NOTIFY_OK;
 }
 
+static void esw_functions_changed_event_init(struct mlx5_eswitch *esw,
+					     u16 vf_nvports)
+{
+	if (!mlx5_core_is_ecpf_esw_manager(esw->dev))
+		return;
+
+	MLX5_NB_INIT(&esw->esw_funcs.nb, esw_functions_changed_event,
+		     ESW_FUNCTIONS_CHANGED);
+	mlx5_eq_notifier_register(esw->dev, &esw->esw_funcs.nb);
+	esw->esw_funcs.num_vfs = vf_nvports;
+}
+
+static void esw_functions_changed_event_cleanup(struct mlx5_eswitch *esw)
+{
+	if (!mlx5_core_is_ecpf_esw_manager(esw->dev))
+		return;
+
+	mlx5_eq_notifier_unregister(esw->dev, &esw->esw_funcs.nb);
+	flush_workqueue(esw->work_queue);
+}
+
 int esw_offloads_init(struct mlx5_eswitch *esw, int vf_nvports,
 		      int total_nvports)
 {
@@ -1848,12 +1869,7 @@ int esw_offloads_init(struct mlx5_eswitch *esw, int vf_nvports,
 
 	esw_offloads_devcom_init(esw);
 
-	if (mlx5_core_is_ecpf_esw_manager(esw->dev)) {
-		MLX5_NB_INIT(&esw->host_info.nb, esw_host_params_event,
-			     HOST_PARAMS_CHANGE);
-		mlx5_eq_notifier_register(esw->dev, &esw->host_info.nb);
-		esw->host_info.num_vfs = vf_nvports;
-	}
+	esw_functions_changed_event_init(esw, vf_nvports);
 
 	mlx5_rdma_enable_roce(esw->dev);
 
@@ -1887,13 +1903,12 @@ void esw_offloads_cleanup(struct mlx5_eswitch *esw)
 {
 	u16 num_vfs;
 
-	if (mlx5_core_is_ecpf_esw_manager(esw->dev)) {
-		mlx5_eq_notifier_unregister(esw->dev, &esw->host_info.nb);
-		flush_workqueue(esw->work_queue);
-		num_vfs = esw->host_info.num_vfs;
-	} else {
+	esw_functions_changed_event_cleanup(esw);
+
+	if (mlx5_core_is_ecpf_esw_manager(esw->dev))
+		num_vfs = esw->esw_funcs.num_vfs;
+	else
 		num_vfs = esw->dev->priv.sriov.num_vfs;
-	}
 
 	mlx5_rdma_disable_roce(esw->dev);
 	esw_offloads_devcom_cleanup(esw);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/events.c b/drivers/net/ethernet/mellanox/mlx5/core/events.c
index a81e8d2168d8..8bcf3426b9c6 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/events.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/events.c
@@ -108,8 +108,8 @@ static const char *eqe_type_str(u8 type)
 		return "MLX5_EVENT_TYPE_STALL_EVENT";
 	case MLX5_EVENT_TYPE_CMD:
 		return "MLX5_EVENT_TYPE_CMD";
-	case MLX5_EVENT_TYPE_HOST_PARAMS_CHANGE:
-		return "MLX5_EVENT_TYPE_HOST_PARAMS_CHANGE";
+	case MLX5_EVENT_TYPE_ESW_FUNCTIONS_CHANGED:
+		return "MLX5_EVENT_TYPE_ESW_FUNCTIONS_CHANGED";
 	case MLX5_EVENT_TYPE_PAGE_REQUEST:
 		return "MLX5_EVENT_TYPE_PAGE_REQUEST";
 	case MLX5_EVENT_TYPE_PAGE_FAULT:
diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h
index fc2b6e807f06..5e760067ac41 100644
--- a/include/linux/mlx5/device.h
+++ b/include/linux/mlx5/device.h
@@ -342,7 +342,7 @@ enum mlx5_event {
 	MLX5_EVENT_TYPE_PAGE_FAULT	   = 0xc,
 	MLX5_EVENT_TYPE_NIC_VPORT_CHANGE   = 0xd,
 
-	MLX5_EVENT_TYPE_HOST_PARAMS_CHANGE = 0xe,
+	MLX5_EVENT_TYPE_ESW_FUNCTIONS_CHANGED = 0xe,
 
 	MLX5_EVENT_TYPE_DCT_DRAINED        = 0x1c,
 
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index feaa909bf14f..0780242a757a 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -155,7 +155,7 @@ enum {
 	MLX5_CMD_OP_QUERY_XRQ_DC_PARAMS_ENTRY     = 0x725,
 	MLX5_CMD_OP_SET_XRQ_DC_PARAMS_ENTRY       = 0x726,
 	MLX5_CMD_OP_QUERY_XRQ_ERROR_PARAMS        = 0x727,
-	MLX5_CMD_OP_QUERY_HOST_PARAMS             = 0x740,
+	MLX5_CMD_OP_QUERY_ESW_FUNCTIONS           = 0x740,
 	MLX5_CMD_OP_QUERY_VPORT_STATE             = 0x750,
 	MLX5_CMD_OP_MODIFY_VPORT_STATE            = 0x751,
 	MLX5_CMD_OP_QUERY_ESW_VPORT_CONTEXT       = 0x752,
@@ -9721,7 +9721,7 @@ struct mlx5_ifc_host_params_context_bits {
 	u8         reserved_at_80[0x180];
 };
 
-struct mlx5_ifc_query_host_params_in_bits {
+struct mlx5_ifc_query_esw_functions_in_bits {
 	u8         opcode[0x10];
 	u8         reserved_at_10[0x10];
 
@@ -9731,7 +9731,7 @@ struct mlx5_ifc_query_host_params_in_bits {
 	u8         reserved_at_40[0x40];
 };
 
-struct mlx5_ifc_query_host_params_out_bits {
+struct mlx5_ifc_query_esw_functions_out_bits {
 	u8         status[0x8];
 	u8         reserved_at_8[0x18];
 
-- 
cgit v1.2.3


From 6706a3b94f890145ca09797f748d2b30e1414fd3 Mon Sep 17 00:00:00 2001
From: Vu Pham <vuhuong@mellanox.com>
Date: Wed, 29 May 2019 22:50:37 +0000
Subject: net/mlx5: E-Switch, Honor eswitch functions changed event cap

Whenever a device supports the eswitch functions changed event, honor
that device setting. Do not limit it to ECPF.

Signed-off-by: Parav Pandit <parav@mellanox.com>
Signed-off-by: Vu Pham <vuhuong@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/eq.c               |  2 +-
 drivers/net/ethernet/mellanox/mlx5/core/eswitch.h          | 13 +++++++++++++
 drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c |  6 +++---
 include/linux/mlx5/mlx5_ifc.h                              |  4 +++-
 4 files changed, 20 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eq.c b/drivers/net/ethernet/mellanox/mlx5/core/eq.c
index 052bd70e4aa6..5e9319d3d90c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eq.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eq.c
@@ -533,7 +533,7 @@ static u64 gather_async_events_mask(struct mlx5_core_dev *dev)
 	if (MLX5_CAP_GEN(dev, max_num_of_monitor_counters))
 		async_event_mask |= (1ull << MLX5_EVENT_TYPE_MONITOR_COUNTER);
 
-	if (mlx5_core_is_ecpf_esw_manager(dev))
+	if (mlx5_eswitch_is_funcs_handler(dev))
 		async_event_mask |=
 			(1ull << MLX5_EVENT_TYPE_ESW_FUNCTIONS_CHANGED);
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
index 320dd83dd301..b524813cccac 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
@@ -406,6 +406,18 @@ static inline u16 mlx5_eswitch_manager_vport(struct mlx5_core_dev *dev)
 		MLX5_VPORT_ECPF : MLX5_VPORT_PF;
 }
 
+static inline bool mlx5_eswitch_is_funcs_handler(struct mlx5_core_dev *dev)
+{
+	/* Ideally device should have the functions changed supported
+	 * capability regardless of it being ECPF or PF wherever such
+	 * event should be processed such as on eswitch manager device.
+	 * However, some ECPF based device might not have this capability
+	 * set. Hence OR for ECPF check to cover such device.
+	 */
+	return MLX5_CAP_ESW(dev, esw_functions_changed) ||
+	       mlx5_core_is_ecpf_esw_manager(dev);
+}
+
 static inline int mlx5_eswitch_uplink_idx(struct mlx5_eswitch *esw)
 {
 	/* Uplink always locate at the last element of the array.*/
@@ -500,6 +512,7 @@ static inline void mlx5_eswitch_cleanup(struct mlx5_eswitch *esw) {}
 static inline int  mlx5_eswitch_enable_sriov(struct mlx5_eswitch *esw, int nvfs, int mode) { return 0; }
 static inline void mlx5_eswitch_disable_sriov(struct mlx5_eswitch *esw) {}
 static inline bool mlx5_esw_lag_prereq(struct mlx5_core_dev *dev0, struct mlx5_core_dev *dev1) { return true; }
+static inline bool mlx5_eswitch_is_funcs_handler(struct mlx5_core_dev *dev) { return false; }
 
 #define FDB_MAX_CHAIN 1
 #define FDB_SLOW_PATH_CHAIN (FDB_MAX_CHAIN + 1)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
index 83689678b400..05cb2fffd887 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
@@ -1836,7 +1836,7 @@ static int esw_functions_changed_event(struct notifier_block *nb,
 static void esw_functions_changed_event_init(struct mlx5_eswitch *esw,
 					     u16 vf_nvports)
 {
-	if (!mlx5_core_is_ecpf_esw_manager(esw->dev))
+	if (!mlx5_eswitch_is_funcs_handler(esw->dev))
 		return;
 
 	MLX5_NB_INIT(&esw->esw_funcs.nb, esw_functions_changed_event,
@@ -1847,7 +1847,7 @@ static void esw_functions_changed_event_init(struct mlx5_eswitch *esw,
 
 static void esw_functions_changed_event_cleanup(struct mlx5_eswitch *esw)
 {
-	if (!mlx5_core_is_ecpf_esw_manager(esw->dev))
+	if (!mlx5_eswitch_is_funcs_handler(esw->dev))
 		return;
 
 	mlx5_eq_notifier_unregister(esw->dev, &esw->esw_funcs.nb);
@@ -1905,7 +1905,7 @@ void esw_offloads_cleanup(struct mlx5_eswitch *esw)
 
 	esw_functions_changed_event_cleanup(esw);
 
-	if (mlx5_core_is_ecpf_esw_manager(esw->dev))
+	if (mlx5_eswitch_is_funcs_handler(esw->dev))
 		num_vfs = esw->esw_funcs.num_vfs;
 	else
 		num_vfs = esw->dev->priv.sriov.num_vfs;
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index 0780242a757a..6513b985c5e9 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -665,7 +665,9 @@ struct mlx5_ifc_e_switch_cap_bits {
 	u8         vport_svlan_insert[0x1];
 	u8         vport_cvlan_insert_if_not_exist[0x1];
 	u8         vport_cvlan_insert_overwrite[0x1];
-	u8         reserved_at_5[0x16];
+	u8         reserved_at_5[0x14];
+	u8         esw_functions_changed[0x1];
+	u8         reserved_at_1a[0x1];
 	u8         ecpf_vport_exists[0x1];
 	u8         counter_eswitch_affinity[0x1];
 	u8         merged_eswitch[0x1];
-- 
cgit v1.2.3


From 8693115af4c24d92b971ad895c5f329761ed5d38 Mon Sep 17 00:00:00 2001
From: Parav Pandit <parav@mellanox.com>
Date: Wed, 29 May 2019 22:50:41 +0000
Subject: {IB,net}/mlx5: Constify rep ops functions pointers

Currently, for every representor type and for every single vport, a copy
of the representor function pointers is stored even though they don't
change from one vport to another.

Additionally, the priv data entry for the rep is not passed during
registration, but it is copied. It is used (set and cleared) by the user
of the reps.

As we want to scale vports, to simplify and also to split constants
from data,

1. Rename mlx5_eswitch_rep_if to mlx5_eswitch_rep_ops to match the _ops
suffix used by other standard netdev and ibdev ops.
2. Constify the IB and Ethernet rep ops structure.
3. Instead of storing copy of all rep function pointers, store copy
per eswitch rep type.
4. Split data and function pointers to mlx5_eswitch_rep_ops and
mlx5_eswitch_rep_data.

Signed-off-by: Parav Pandit <parav@mellanox.com>
Reviewed-by: Mark Bloch <markb@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/infiniband/hw/mlx5/ib_rep.c                | 19 ++++++-----
 drivers/infiniband/hw/mlx5/ib_rep.h                |  2 +-
 drivers/net/ethernet/mellanox/mlx5/core/en_rep.c   | 15 +++++----
 drivers/net/ethernet/mellanox/mlx5/core/en_rep.h   |  2 +-
 drivers/net/ethernet/mellanox/mlx5/core/eswitch.h  |  1 +
 .../ethernet/mellanox/mlx5/core/eswitch_offloads.c | 38 ++++++++++------------
 include/linux/mlx5/eswitch.h                       | 20 +++++++-----
 7 files changed, 49 insertions(+), 48 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/infiniband/hw/mlx5/ib_rep.c b/drivers/infiniband/hw/mlx5/ib_rep.c
index cbcc40d776b9..22e651cb5534 100644
--- a/drivers/infiniband/hw/mlx5/ib_rep.c
+++ b/drivers/infiniband/hw/mlx5/ib_rep.c
@@ -60,7 +60,7 @@ mlx5_ib_vport_rep_load(struct mlx5_core_dev *dev, struct mlx5_eswitch_rep *rep)
 	if (!__mlx5_ib_add(ibdev, profile))
 		return -EINVAL;
 
-	rep->rep_if[REP_IB].priv = ibdev;
+	rep->rep_data[REP_IB].priv = ibdev;
 
 	return 0;
 }
@@ -70,13 +70,13 @@ mlx5_ib_vport_rep_unload(struct mlx5_eswitch_rep *rep)
 {
 	struct mlx5_ib_dev *dev;
 
-	if (!rep->rep_if[REP_IB].priv ||
+	if (!rep->rep_data[REP_IB].priv ||
 	    rep->vport != MLX5_VPORT_UPLINK)
 		return;
 
 	dev = mlx5_ib_rep_to_dev(rep);
 	__mlx5_ib_remove(dev, dev->profile, MLX5_IB_STAGE_MAX);
-	rep->rep_if[REP_IB].priv = NULL;
+	rep->rep_data[REP_IB].priv = NULL;
 }
 
 static void *mlx5_ib_vport_get_proto_dev(struct mlx5_eswitch_rep *rep)
@@ -84,16 +84,17 @@ static void *mlx5_ib_vport_get_proto_dev(struct mlx5_eswitch_rep *rep)
 	return mlx5_ib_rep_to_dev(rep);
 }
 
+static const struct mlx5_eswitch_rep_ops rep_ops = {
+	.load = mlx5_ib_vport_rep_load,
+	.unload = mlx5_ib_vport_rep_unload,
+	.get_proto_dev = mlx5_ib_vport_get_proto_dev,
+};
+
 void mlx5_ib_register_vport_reps(struct mlx5_core_dev *mdev)
 {
 	struct mlx5_eswitch *esw = mdev->priv.eswitch;
-	struct mlx5_eswitch_rep_if rep_if = {};
-
-	rep_if.load = mlx5_ib_vport_rep_load;
-	rep_if.unload = mlx5_ib_vport_rep_unload;
-	rep_if.get_proto_dev = mlx5_ib_vport_get_proto_dev;
 
-	mlx5_eswitch_register_vport_reps(esw, &rep_if, REP_IB);
+	mlx5_eswitch_register_vport_reps(esw, &rep_ops, REP_IB);
 }
 
 void mlx5_ib_unregister_vport_reps(struct mlx5_core_dev *mdev)
diff --git a/drivers/infiniband/hw/mlx5/ib_rep.h b/drivers/infiniband/hw/mlx5/ib_rep.h
index c995102b0276..22adce2d6795 100644
--- a/drivers/infiniband/hw/mlx5/ib_rep.h
+++ b/drivers/infiniband/hw/mlx5/ib_rep.h
@@ -72,6 +72,6 @@ struct net_device *mlx5_ib_get_rep_netdev(struct mlx5_eswitch *esw,
 static inline
 struct mlx5_ib_dev *mlx5_ib_rep_to_dev(struct mlx5_eswitch_rep *rep)
 {
-	return rep->rep_if[REP_IB].priv;
+	return rep->rep_data[REP_IB].priv;
 }
 #endif /* __MLX5_IB_REP_H__ */
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
index 91e24f1cead8..33f8f99681a5 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
@@ -1752,7 +1752,7 @@ mlx5e_vport_rep_load(struct mlx5_core_dev *dev, struct mlx5_eswitch_rep *rep)
 	}
 
 	rpriv->netdev = netdev;
-	rep->rep_if[REP_ETH].priv = rpriv;
+	rep->rep_data[REP_ETH].priv = rpriv;
 	INIT_LIST_HEAD(&rpriv->vport_sqs_list);
 
 	if (rep->vport == MLX5_VPORT_UPLINK) {
@@ -1826,16 +1826,17 @@ static void *mlx5e_vport_rep_get_proto_dev(struct mlx5_eswitch_rep *rep)
 	return rpriv->netdev;
 }
 
+static const struct mlx5_eswitch_rep_ops rep_ops = {
+	.load = mlx5e_vport_rep_load,
+	.unload = mlx5e_vport_rep_unload,
+	.get_proto_dev = mlx5e_vport_rep_get_proto_dev
+};
+
 void mlx5e_rep_register_vport_reps(struct mlx5_core_dev *mdev)
 {
 	struct mlx5_eswitch *esw = mdev->priv.eswitch;
-	struct mlx5_eswitch_rep_if rep_if = {};
-
-	rep_if.load = mlx5e_vport_rep_load;
-	rep_if.unload = mlx5e_vport_rep_unload;
-	rep_if.get_proto_dev = mlx5e_vport_rep_get_proto_dev;
 
-	mlx5_eswitch_register_vport_reps(esw, &rep_if, REP_ETH);
+	mlx5_eswitch_register_vport_reps(esw, &rep_ops, REP_ETH);
 }
 
 void mlx5e_rep_unregister_vport_reps(struct mlx5_core_dev *mdev)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.h b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.h
index c40c025afd99..e34573fd88c1 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.h
@@ -91,7 +91,7 @@ struct mlx5e_rep_priv {
 static inline
 struct mlx5e_rep_priv *mlx5e_rep_to_rep_priv(struct mlx5_eswitch_rep *rep)
 {
-	return rep->rep_if[REP_ETH].priv;
+	return rep->rep_data[REP_ETH].priv;
 }
 
 struct mlx5e_neigh {
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
index b524813cccac..135d9a29bbdf 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
@@ -173,6 +173,7 @@ struct mlx5_esw_offload {
 	struct mutex peer_mutex;
 	DECLARE_HASHTABLE(encap_tbl, 8);
 	DECLARE_HASHTABLE(mod_hdr_tbl, 8);
+	const struct mlx5_eswitch_rep_ops *rep_ops[NUM_REP_TYPES];
 	u8 inline_mode;
 	u64 num_flows;
 	u8 encap;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
index 05cb2fffd887..d6246ee042fa 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
@@ -332,7 +332,7 @@ static int esw_set_global_vlan_pop(struct mlx5_eswitch *esw, u8 val)
 	esw_debug(esw->dev, "%s applying global %s policy\n", __func__, val ? "pop" : "none");
 	for (vf_vport = 1; vf_vport < esw->enabled_vports; vf_vport++) {
 		rep = &esw->offloads.vport_reps[vf_vport];
-		if (atomic_read(&rep->rep_if[REP_ETH].state) != REP_LOADED)
+		if (atomic_read(&rep->rep_data[REP_ETH].state) != REP_LOADED)
 			continue;
 
 		err = __mlx5_eswitch_set_vport_vlan(esw, rep->vport, 0, 0, val);
@@ -1276,7 +1276,7 @@ int esw_offloads_init_reps(struct mlx5_eswitch *esw)
 		ether_addr_copy(rep->hw_id, hw_id);
 
 		for (rep_type = 0; rep_type < NUM_REP_TYPES; rep_type++)
-			atomic_set(&rep->rep_if[rep_type].state,
+			atomic_set(&rep->rep_data[rep_type].state,
 				   REP_UNREGISTERED);
 	}
 
@@ -1286,9 +1286,9 @@ int esw_offloads_init_reps(struct mlx5_eswitch *esw)
 static void __esw_offloads_unload_rep(struct mlx5_eswitch *esw,
 				      struct mlx5_eswitch_rep *rep, u8 rep_type)
 {
-	if (atomic_cmpxchg(&rep->rep_if[rep_type].state,
+	if (atomic_cmpxchg(&rep->rep_data[rep_type].state,
 			   REP_LOADED, REP_REGISTERED) == REP_LOADED)
-		rep->rep_if[rep_type].unload(rep);
+		esw->offloads.rep_ops[rep_type]->unload(rep);
 }
 
 static void __unload_reps_special_vport(struct mlx5_eswitch *esw, u8 rep_type)
@@ -1349,11 +1349,11 @@ static int __esw_offloads_load_rep(struct mlx5_eswitch *esw,
 {
 	int err = 0;
 
-	if (atomic_cmpxchg(&rep->rep_if[rep_type].state,
+	if (atomic_cmpxchg(&rep->rep_data[rep_type].state,
 			   REP_REGISTERED, REP_LOADED) == REP_REGISTERED) {
-		err = rep->rep_if[rep_type].load(esw->dev, rep);
+		err = esw->offloads.rep_ops[rep_type]->load(esw->dev, rep);
 		if (err)
-			atomic_set(&rep->rep_if[rep_type].state,
+			atomic_set(&rep->rep_data[rep_type].state,
 				   REP_REGISTERED);
 	}
 
@@ -2216,21 +2216,17 @@ int mlx5_devlink_eswitch_encap_mode_get(struct devlink *devlink, u8 *encap)
 }
 
 void mlx5_eswitch_register_vport_reps(struct mlx5_eswitch *esw,
-				      struct mlx5_eswitch_rep_if *__rep_if,
+				      const struct mlx5_eswitch_rep_ops *ops,
 				      u8 rep_type)
 {
-	struct mlx5_eswitch_rep_if *rep_if;
+	struct mlx5_eswitch_rep_data *rep_data;
 	struct mlx5_eswitch_rep *rep;
 	int i;
 
+	esw->offloads.rep_ops[rep_type] = ops;
 	mlx5_esw_for_all_reps(esw, i, rep) {
-		rep_if = &rep->rep_if[rep_type];
-		rep_if->load   = __rep_if->load;
-		rep_if->unload = __rep_if->unload;
-		rep_if->get_proto_dev = __rep_if->get_proto_dev;
-		rep_if->priv = __rep_if->priv;
-
-		atomic_set(&rep_if->state, REP_REGISTERED);
+		rep_data = &rep->rep_data[rep_type];
+		atomic_set(&rep_data->state, REP_REGISTERED);
 	}
 }
 EXPORT_SYMBOL(mlx5_eswitch_register_vport_reps);
@@ -2245,7 +2241,7 @@ void mlx5_eswitch_unregister_vport_reps(struct mlx5_eswitch *esw, u8 rep_type)
 		__unload_reps_all_vport(esw, max_vf, rep_type);
 
 	mlx5_esw_for_all_reps(esw, i, rep)
-		atomic_set(&rep->rep_if[rep_type].state, REP_UNREGISTERED);
+		atomic_set(&rep->rep_data[rep_type].state, REP_UNREGISTERED);
 }
 EXPORT_SYMBOL(mlx5_eswitch_unregister_vport_reps);
 
@@ -2254,7 +2250,7 @@ void *mlx5_eswitch_get_uplink_priv(struct mlx5_eswitch *esw, u8 rep_type)
 	struct mlx5_eswitch_rep *rep;
 
 	rep = mlx5_eswitch_get_rep(esw, MLX5_VPORT_UPLINK);
-	return rep->rep_if[rep_type].priv;
+	return rep->rep_data[rep_type].priv;
 }
 
 void *mlx5_eswitch_get_proto_dev(struct mlx5_eswitch *esw,
@@ -2265,9 +2261,9 @@ void *mlx5_eswitch_get_proto_dev(struct mlx5_eswitch *esw,
 
 	rep = mlx5_eswitch_get_rep(esw, vport);
 
-	if (atomic_read(&rep->rep_if[rep_type].state) == REP_LOADED &&
-	    rep->rep_if[rep_type].get_proto_dev)
-		return rep->rep_if[rep_type].get_proto_dev(rep);
+	if (atomic_read(&rep->rep_data[rep_type].state) == REP_LOADED &&
+	    esw->offloads.rep_ops[rep_type]->get_proto_dev)
+		return esw->offloads.rep_ops[rep_type]->get_proto_dev(rep);
 	return NULL;
 }
 EXPORT_SYMBOL(mlx5_eswitch_get_proto_dev);
diff --git a/include/linux/mlx5/eswitch.h b/include/linux/mlx5/eswitch.h
index 0ca77dd1429c..d81ee4df181c 100644
--- a/include/linux/mlx5/eswitch.h
+++ b/include/linux/mlx5/eswitch.h
@@ -29,17 +29,19 @@ enum {
 };
 
 struct mlx5_eswitch_rep;
-struct mlx5_eswitch_rep_if {
-	int		       (*load)(struct mlx5_core_dev *dev,
-				       struct mlx5_eswitch_rep *rep);
-	void		       (*unload)(struct mlx5_eswitch_rep *rep);
-	void		       *(*get_proto_dev)(struct mlx5_eswitch_rep *rep);
-	void			*priv;
-	atomic_t		state;
+struct mlx5_eswitch_rep_ops {
+	int (*load)(struct mlx5_core_dev *dev, struct mlx5_eswitch_rep *rep);
+	void (*unload)(struct mlx5_eswitch_rep *rep);
+	void *(*get_proto_dev)(struct mlx5_eswitch_rep *rep);
+};
+
+struct mlx5_eswitch_rep_data {
+	void *priv;
+	atomic_t state;
 };
 
 struct mlx5_eswitch_rep {
-	struct mlx5_eswitch_rep_if rep_if[NUM_REP_TYPES];
+	struct mlx5_eswitch_rep_data rep_data[NUM_REP_TYPES];
 	u16		       vport;
 	u8		       hw_id[ETH_ALEN];
 	u16		       vlan;
@@ -47,7 +49,7 @@ struct mlx5_eswitch_rep {
 };
 
 void mlx5_eswitch_register_vport_reps(struct mlx5_eswitch *esw,
-				      struct mlx5_eswitch_rep_if *rep_if,
+				      const struct mlx5_eswitch_rep_ops *ops,
 				      u8 rep_type);
 void mlx5_eswitch_unregister_vport_reps(struct mlx5_eswitch *esw, u8 rep_type);
 void *mlx5_eswitch_get_proto_dev(struct mlx5_eswitch *esw,
-- 
cgit v1.2.3


From 320587e6eac960591077b90271f40bfad24d6155 Mon Sep 17 00:00:00 2001
From: Russell King <rmk+kernel@armlinux.org.uk>
Date: Tue, 28 May 2019 10:57:34 +0100
Subject: net: sfp: add mandatory attach/detach methods for sfp buses

Add attach and detach methods for SFP buses, which will allow us to get
rid of the netdev storage in sfp-bus.

Signed-off-by: Russell King <rmk+kernel@armlinux.org.uk>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/phylink.c | 16 ++++++++++++++++
 drivers/net/phy/sfp-bus.c |  4 ++--
 include/linux/sfp.h       |  6 ++++++
 3 files changed, 24 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/phy/phylink.c b/drivers/net/phy/phylink.c
index eb07c3d8f09e..503f4b221696 100644
--- a/drivers/net/phy/phylink.c
+++ b/drivers/net/phy/phylink.c
@@ -1650,6 +1650,20 @@ int phylink_mii_ioctl(struct phylink *pl, struct ifreq *ifr, int cmd)
 }
 EXPORT_SYMBOL_GPL(phylink_mii_ioctl);
 
+static void phylink_sfp_attach(void *upstream, struct sfp_bus *bus)
+{
+	struct phylink *pl = upstream;
+
+	pl->netdev->sfp_bus = bus;
+}
+
+static void phylink_sfp_detach(void *upstream, struct sfp_bus *bus)
+{
+	struct phylink *pl = upstream;
+
+	pl->netdev->sfp_bus = NULL;
+}
+
 static int phylink_sfp_module_insert(void *upstream,
 				     const struct sfp_eeprom_id *id)
 {
@@ -1768,6 +1782,8 @@ static void phylink_sfp_disconnect_phy(void *upstream)
 }
 
 static const struct sfp_upstream_ops sfp_phylink_ops = {
+	.attach = phylink_sfp_attach,
+	.detach = phylink_sfp_detach,
 	.module_insert = phylink_sfp_module_insert,
 	.link_up = phylink_sfp_link_up,
 	.link_down = phylink_sfp_link_down,
diff --git a/drivers/net/phy/sfp-bus.c b/drivers/net/phy/sfp-bus.c
index e9c187946cca..0608203cc752 100644
--- a/drivers/net/phy/sfp-bus.c
+++ b/drivers/net/phy/sfp-bus.c
@@ -351,7 +351,7 @@ static int sfp_register_bus(struct sfp_bus *bus)
 	bus->socket_ops->attach(bus->sfp);
 	if (bus->started)
 		bus->socket_ops->start(bus->sfp);
-	bus->netdev->sfp_bus = bus;
+	bus->upstream_ops->attach(bus->upstream, bus);
 	bus->registered = true;
 	return 0;
 }
@@ -360,8 +360,8 @@ static void sfp_unregister_bus(struct sfp_bus *bus)
 {
 	const struct sfp_upstream_ops *ops = bus->upstream_ops;
 
-	bus->netdev->sfp_bus = NULL;
 	if (bus->registered) {
+		bus->upstream_ops->detach(bus->upstream, bus);
 		if (bus->started)
 			bus->socket_ops->stop(bus->sfp);
 		bus->socket_ops->detach(bus->sfp);
diff --git a/include/linux/sfp.h b/include/linux/sfp.h
index d9d9de3fcf8e..a3f0336dd703 100644
--- a/include/linux/sfp.h
+++ b/include/linux/sfp.h
@@ -469,6 +469,10 @@ struct sfp_bus;
 
 /**
  * struct sfp_upstream_ops - upstream operations structure
+ * @attach: called when the sfp socket driver is bound to the upstream
+ *   (mandatory).
+ * @detach: called when the sfp socket driver is unbound from the upstream
+ *   (mandatory).
  * @module_insert: called after a module has been detected to determine
  *   whether the module is supported for the upstream device.
  * @module_remove: called after the module has been removed.
@@ -481,6 +485,8 @@ struct sfp_bus;
  *   been removed.
  */
 struct sfp_upstream_ops {
+	void (*attach)(void *priv, struct sfp_bus *bus);
+	void (*detach)(void *priv, struct sfp_bus *bus);
 	int (*module_insert)(void *priv, const struct sfp_eeprom_id *id);
 	void (*module_remove)(void *priv);
 	void (*link_down)(void *priv);
-- 
cgit v1.2.3


From 54f70b3ba364f19291dc8b9cb096b02a00fb4461 Mon Sep 17 00:00:00 2001
From: Russell King <rmk+kernel@armlinux.org.uk>
Date: Tue, 28 May 2019 10:57:39 +0100
Subject: net: sfp: remove sfp-bus use of netdevs

The sfp-bus code now no longer has any use for the network device
structure, so remove its use.

Signed-off-by: Russell King <rmk+kernel@armlinux.org.uk>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/phylink.c |  3 +--
 drivers/net/phy/sfp-bus.c | 10 +++-------
 include/linux/sfp.h       |  6 ++----
 3 files changed, 6 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/phy/phylink.c b/drivers/net/phy/phylink.c
index 503f4b221696..f5b97dab3017 100644
--- a/drivers/net/phy/phylink.c
+++ b/drivers/net/phy/phylink.c
@@ -565,8 +565,7 @@ static int phylink_register_sfp(struct phylink *pl,
 		return ret;
 	}
 
-	pl->sfp_bus = sfp_register_upstream(ref.fwnode, pl->netdev, pl,
-					    &sfp_phylink_ops);
+	pl->sfp_bus = sfp_register_upstream(ref.fwnode, pl, &sfp_phylink_ops);
 	if (!pl->sfp_bus)
 		return -ENOMEM;
 
diff --git a/drivers/net/phy/sfp-bus.c b/drivers/net/phy/sfp-bus.c
index 0608203cc752..b23fc41896ef 100644
--- a/drivers/net/phy/sfp-bus.c
+++ b/drivers/net/phy/sfp-bus.c
@@ -24,7 +24,6 @@ struct sfp_bus {
 
 	const struct sfp_upstream_ops *upstream_ops;
 	void *upstream;
-	struct net_device *netdev;
 	struct phy_device *phydev;
 
 	bool registered;
@@ -443,13 +442,11 @@ static void sfp_upstream_clear(struct sfp_bus *bus)
 {
 	bus->upstream_ops = NULL;
 	bus->upstream = NULL;
-	bus->netdev = NULL;
 }
 
 /**
  * sfp_register_upstream() - Register the neighbouring device
  * @fwnode: firmware node for the SFP bus
- * @ndev: network device associated with the interface
  * @upstream: the upstream private data
  * @ops: the upstream's &struct sfp_upstream_ops
  *
@@ -460,7 +457,7 @@ static void sfp_upstream_clear(struct sfp_bus *bus)
  * On error, returns %NULL.
  */
 struct sfp_bus *sfp_register_upstream(struct fwnode_handle *fwnode,
-				      struct net_device *ndev, void *upstream,
+				      void *upstream,
 				      const struct sfp_upstream_ops *ops)
 {
 	struct sfp_bus *bus = sfp_bus_get(fwnode);
@@ -470,7 +467,6 @@ struct sfp_bus *sfp_register_upstream(struct fwnode_handle *fwnode,
 		rtnl_lock();
 		bus->upstream_ops = ops;
 		bus->upstream = upstream;
-		bus->netdev = ndev;
 
 		if (bus->sfp) {
 			ret = sfp_register_bus(bus);
@@ -592,7 +588,7 @@ struct sfp_bus *sfp_register_socket(struct device *dev, struct sfp *sfp,
 		bus->sfp = sfp;
 		bus->socket_ops = ops;
 
-		if (bus->netdev) {
+		if (bus->upstream_ops) {
 			ret = sfp_register_bus(bus);
 			if (ret)
 				sfp_socket_clear(bus);
@@ -612,7 +608,7 @@ EXPORT_SYMBOL_GPL(sfp_register_socket);
 void sfp_unregister_socket(struct sfp_bus *bus)
 {
 	rtnl_lock();
-	if (bus->netdev)
+	if (bus->upstream_ops)
 		sfp_unregister_bus(bus);
 	sfp_socket_clear(bus);
 	rtnl_unlock();
diff --git a/include/linux/sfp.h b/include/linux/sfp.h
index a3f0336dd703..1c35428e98bc 100644
--- a/include/linux/sfp.h
+++ b/include/linux/sfp.h
@@ -464,7 +464,6 @@ enum {
 struct fwnode_handle;
 struct ethtool_eeprom;
 struct ethtool_modinfo;
-struct net_device;
 struct sfp_bus;
 
 /**
@@ -510,7 +509,7 @@ int sfp_get_module_eeprom(struct sfp_bus *bus, struct ethtool_eeprom *ee,
 void sfp_upstream_start(struct sfp_bus *bus);
 void sfp_upstream_stop(struct sfp_bus *bus);
 struct sfp_bus *sfp_register_upstream(struct fwnode_handle *fwnode,
-				      struct net_device *ndev, void *upstream,
+				      void *upstream,
 				      const struct sfp_upstream_ops *ops);
 void sfp_unregister_upstream(struct sfp_bus *bus);
 #else
@@ -555,8 +554,7 @@ static inline void sfp_upstream_stop(struct sfp_bus *bus)
 }
 
 static inline struct sfp_bus *sfp_register_upstream(
-	struct fwnode_handle *fwnode,
-	struct net_device *ndev, void *upstream,
+	struct fwnode_handle *fwnode, void *upstream,
 	const struct sfp_upstream_ops *ops)
 {
 	return (struct sfp_bus *)-1;
-- 
cgit v1.2.3


From 0ccc171ea6a2fa34a6b898329c0a447c84e27057 Mon Sep 17 00:00:00 2001
From: Yevgeny Kliteynik <kliteyn@mellanox.com>
Date: Wed, 30 Jan 2019 17:21:55 +0200
Subject: net/mlx5: Geneve, Manage Geneve TLV options

Use Geneve TLV Options object to manage the flex parser matching
on the 32-bit options data.

When the first flow with certain class/type values is requested to
be offloaded, create a FW object with a FW command (Geneve TLV Options
general object) and start counting the number of flows using this object.

During this time, any request with different class/type values will
fail to be offloaded.
Once the refcount reaches 0, the TLV options general object is destroyed,
and a flow with any class/type parameters can then be offloaded.

Geneve TLV Options object is added to core device.
It is currently used to manage Geneve TLV options general
object allocation in FW and its reference counting only.
In the future it will also be used for managing geneve ports
by registering callbacks for ndo_udp_tunnel_add/del.

Reviewed-by: Oz Shlomo <ozsh@mellanox.com>
Signed-off-by: Yevgeny Kliteynik <kliteyn@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/Makefile   |   3 +-
 .../net/ethernet/mellanox/mlx5/core/lib/geneve.c   | 157 +++++++++++++++++++++
 .../net/ethernet/mellanox/mlx5/core/lib/geneve.h   |  33 +++++
 drivers/net/ethernet/mellanox/mlx5/core/main.c     |   4 +
 include/linux/mlx5/driver.h                        |   2 +
 5 files changed, 198 insertions(+), 1 deletion(-)
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/lib/geneve.c
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/lib/geneve.h

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Makefile b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
index 243368dc23db..e31027277a6e 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/Makefile
+++ b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
@@ -31,7 +31,8 @@ mlx5_core-$(CONFIG_MLX5_CORE_EN) += en_main.o en_common.o en_fs.o en_ethtool.o \
 mlx5_core-$(CONFIG_MLX5_EN_ARFS)     += en_arfs.o
 mlx5_core-$(CONFIG_MLX5_EN_RXNFC)    += en_fs_ethtool.o
 mlx5_core-$(CONFIG_MLX5_CORE_EN_DCB) += en_dcbnl.o en/port_buffer.o
-mlx5_core-$(CONFIG_MLX5_ESWITCH)     += en_rep.o en_tc.o en/tc_tun.o lib/port_tun.o lag_mp.o
+mlx5_core-$(CONFIG_MLX5_ESWITCH)     += en_rep.o en_tc.o en/tc_tun.o lib/port_tun.o lag_mp.o \
+					lib/geneve.o
 
 #
 # Core extra
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/geneve.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/geneve.c
new file mode 100644
index 000000000000..23361a9ae4fa
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/geneve.c
@@ -0,0 +1,157 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+/* Copyright (c) 2019 Mellanox Technologies. */
+
+#include <linux/kernel.h>
+#include "mlx5_core.h"
+#include "geneve.h"
+
+struct mlx5_geneve {
+	struct mlx5_core_dev *mdev;
+	__be16 opt_class;
+	u8 opt_type;
+	u32 obj_id;
+	struct mutex sync_lock; /* protect GENEVE obj operations */
+	u32 refcount;
+};
+
+static int mlx5_geneve_tlv_option_create(struct mlx5_core_dev *mdev,
+					 __be16 class,
+					 u8 type,
+					 u8 len)
+{
+	u32 in[MLX5_ST_SZ_DW(create_geneve_tlv_option_in)] = {};
+	u32 out[MLX5_ST_SZ_DW(general_obj_out_cmd_hdr)] = {};
+	u64 general_obj_types;
+	void *hdr, *opt;
+	u16 obj_id;
+	int err;
+
+	general_obj_types = MLX5_CAP_GEN_64(mdev, general_obj_types);
+	if (!(general_obj_types & MLX5_GENERAL_OBJ_TYPES_CAP_GENEVE_TLV_OPT))
+		return -EINVAL;
+
+	hdr = MLX5_ADDR_OF(create_geneve_tlv_option_in, in, hdr);
+	opt = MLX5_ADDR_OF(create_geneve_tlv_option_in, in, geneve_tlv_opt);
+
+	MLX5_SET(general_obj_in_cmd_hdr, hdr, opcode, MLX5_CMD_OP_CREATE_GENERAL_OBJECT);
+	MLX5_SET(general_obj_in_cmd_hdr, hdr, obj_type, MLX5_OBJ_TYPE_GENEVE_TLV_OPT);
+
+	MLX5_SET(geneve_tlv_option, opt, option_class, be16_to_cpu(class));
+	MLX5_SET(geneve_tlv_option, opt, option_type, type);
+	MLX5_SET(geneve_tlv_option, opt, option_data_length, len);
+
+	err = mlx5_cmd_exec(mdev, in, sizeof(in), out, sizeof(out));
+	if (err)
+		return err;
+
+	obj_id = MLX5_GET(general_obj_out_cmd_hdr, out, obj_id);
+	return obj_id;
+}
+
+static void mlx5_geneve_tlv_option_destroy(struct mlx5_core_dev *mdev, u16 obj_id)
+{
+	u32 out[MLX5_ST_SZ_DW(general_obj_out_cmd_hdr)] = {};
+	u32 in[MLX5_ST_SZ_DW(general_obj_in_cmd_hdr)] = {};
+
+	MLX5_SET(general_obj_in_cmd_hdr, in, opcode, MLX5_CMD_OP_DESTROY_GENERAL_OBJECT);
+	MLX5_SET(general_obj_in_cmd_hdr, in, obj_type, MLX5_OBJ_TYPE_GENEVE_TLV_OPT);
+	MLX5_SET(general_obj_in_cmd_hdr, in, obj_id, obj_id);
+
+	mlx5_cmd_exec(mdev, in, sizeof(in), out, sizeof(out));
+}
+
+int mlx5_geneve_tlv_option_add(struct mlx5_geneve *geneve, struct geneve_opt *opt)
+{
+	int res = 0;
+
+	if (IS_ERR_OR_NULL(geneve))
+		return -EOPNOTSUPP;
+
+	mutex_lock(&geneve->sync_lock);
+
+	if (geneve->refcount) {
+		if (geneve->opt_class == opt->opt_class &&
+		    geneve->opt_type == opt->type) {
+			/* We already have TLV options obj allocated */
+			geneve->refcount++;
+		} else {
+			/* TLV options obj allocated, but its params
+			 * do not match the new request.
+			 * We support only one such object.
+			 */
+			mlx5_core_warn(geneve->mdev,
+				       "Won't create Geneve TLV opt object with class:type:len = 0x%x:0x%x:%d (another class:type already exists)\n",
+				       be16_to_cpu(opt->opt_class),
+				       opt->type,
+				       opt->length);
+			res = -EOPNOTSUPP;
+			goto unlock;
+		}
+	} else {
+		/* We don't have any TLV options obj allocated */
+
+		res = mlx5_geneve_tlv_option_create(geneve->mdev,
+						    opt->opt_class,
+						    opt->type,
+						    opt->length);
+		if (res < 0) {
+			mlx5_core_warn(geneve->mdev,
+				       "Failed creating Geneve TLV opt object class:type:len = 0x%x:0x%x:%d (err=%d)\n",
+				       be16_to_cpu(opt->opt_class),
+				       opt->type, opt->length, res);
+			goto unlock;
+		}
+		geneve->opt_class = opt->opt_class;
+		geneve->opt_type = opt->type;
+		geneve->obj_id = res;
+		geneve->refcount++;
+	}
+
+unlock:
+	mutex_unlock(&geneve->sync_lock);
+	return res;
+}
+
+void mlx5_geneve_tlv_option_del(struct mlx5_geneve *geneve)
+{
+	if (IS_ERR_OR_NULL(geneve))
+		return;
+
+	mutex_lock(&geneve->sync_lock);
+	if (--geneve->refcount == 0) {
+		/* We've just removed the last user of Geneve option.
+		 * Now delete the object in FW.
+		 */
+		mlx5_geneve_tlv_option_destroy(geneve->mdev, geneve->obj_id);
+
+		geneve->opt_class = 0;
+		geneve->opt_type = 0;
+		geneve->obj_id = 0;
+	}
+	mutex_unlock(&geneve->sync_lock);
+}
+
+struct mlx5_geneve *mlx5_geneve_create(struct mlx5_core_dev *mdev)
+{
+	struct mlx5_geneve *geneve =
+		kzalloc(sizeof(*geneve), GFP_KERNEL);
+
+	if (!geneve)
+		return ERR_PTR(-ENOMEM);
+	geneve->mdev = mdev;
+	mutex_init(&geneve->sync_lock);
+
+	return geneve;
+}
+
+void mlx5_geneve_destroy(struct mlx5_geneve *geneve)
+{
+	if (IS_ERR_OR_NULL(geneve))
+		return;
+
+	/* Lockless since we are unloading */
+	if (geneve->refcount)
+		mlx5_geneve_tlv_option_destroy(geneve->mdev, geneve->obj_id);
+
+	kfree(geneve);
+}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/geneve.h b/drivers/net/ethernet/mellanox/mlx5/core/lib/geneve.h
new file mode 100644
index 000000000000..adee0cbba19c
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/geneve.h
@@ -0,0 +1,33 @@
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
+/* Copyright (c) 2019 Mellanox Technologies. */
+
+#ifndef __MLX5_GENEVE_H__
+#define __MLX5_GENEVE_H__
+
+#include <net/geneve.h>
+#include <linux/mlx5/driver.h>
+
+struct mlx5_geneve;
+
+#ifdef CONFIG_MLX5_ESWITCH
+
+struct mlx5_geneve *mlx5_geneve_create(struct mlx5_core_dev *mdev);
+void mlx5_geneve_destroy(struct mlx5_geneve *geneve);
+
+int mlx5_geneve_tlv_option_add(struct mlx5_geneve *geneve, struct geneve_opt *opt);
+void mlx5_geneve_tlv_option_del(struct mlx5_geneve *geneve);
+
+#else /* CONFIG_MLX5_ESWITCH */
+
+static inline struct mlx5_geneve
+*mlx5_geneve_create(struct mlx5_core_dev *mdev) { return NULL; }
+static inline void
+mlx5_geneve_destroy(struct mlx5_geneve *geneve) {}
+static inline int
+mlx5_geneve_tlv_option_add(struct mlx5_geneve *geneve, struct geneve_opt *opt) { return 0; }
+static inline void
+mlx5_geneve_tlv_option_del(struct mlx5_geneve *geneve) {}
+
+#endif /* CONFIG_MLX5_ESWITCH */
+
+#endif /* __MLX5_GENEVE_H__ */
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c
index 23d53163ce15..b27f9537256c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c
@@ -63,6 +63,7 @@
 #include "accel/tls.h"
 #include "lib/clock.h"
 #include "lib/vxlan.h"
+#include "lib/geneve.h"
 #include "lib/devcom.h"
 #include "diag/fw_tracer.h"
 #include "ecpf.h"
@@ -821,6 +822,7 @@ static int mlx5_init_once(struct mlx5_core_dev *dev)
 	mlx5_init_clock(dev);
 
 	dev->vxlan = mlx5_vxlan_create(dev);
+	dev->geneve = mlx5_geneve_create(dev);
 
 	err = mlx5_init_rl_table(dev);
 	if (err) {
@@ -865,6 +867,7 @@ err_mpfs_cleanup:
 err_rl_cleanup:
 	mlx5_cleanup_rl_table(dev);
 err_tables_cleanup:
+	mlx5_geneve_destroy(dev->geneve);
 	mlx5_vxlan_destroy(dev->vxlan);
 	mlx5_cleanup_mkey_table(dev);
 	mlx5_cleanup_qp_table(dev);
@@ -887,6 +890,7 @@ static void mlx5_cleanup_once(struct mlx5_core_dev *dev)
 	mlx5_eswitch_cleanup(dev->priv.eswitch);
 	mlx5_mpfs_cleanup(dev);
 	mlx5_cleanup_rl_table(dev);
+	mlx5_geneve_destroy(dev->geneve);
 	mlx5_vxlan_destroy(dev->vxlan);
 	mlx5_cleanup_clock(dev);
 	mlx5_cleanup_reserved_gids(dev);
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index b5431f7d97cb..3a810bf043fe 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -647,6 +647,7 @@ struct mlx5_clock {
 
 struct mlx5_fw_tracer;
 struct mlx5_vxlan;
+struct mlx5_geneve;
 
 struct mlx5_core_dev {
 	struct device *device;
@@ -681,6 +682,7 @@ struct mlx5_core_dev {
 	u32			issi;
 	struct mlx5e_resources  mlx5e_res;
 	struct mlx5_vxlan       *vxlan;
+	struct mlx5_geneve      *geneve;
 	struct {
 		struct mlx5_rsvd_gids	reserved_gids;
 		u32			roce_en;
-- 
cgit v1.2.3


From 87e5e6dab6c2a21fab2620f37786276d202e2ce0 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Tue, 14 May 2019 16:02:22 -0600
Subject: uio: make import_iovec()/compat_import_iovec() return bytes on
 success

Currently these functions return < 0 on error, and 0 for success.
Change that so that we return < 0 on error, but number of bytes
for success.

Some callers already treat the return value that way, others need a
slight tweak.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/aio.c            |  9 +++++----
 fs/io_uring.c       | 16 ++++++++--------
 fs/splice.c         |  8 ++++----
 include/linux/uio.h |  4 ++--
 lib/iov_iter.c      | 15 ++++++++-------
 net/compat.c        |  3 ++-
 net/socket.c        |  3 ++-
 7 files changed, 31 insertions(+), 27 deletions(-)

(limited to 'include/linux')

diff --git a/fs/aio.c b/fs/aio.c
index 3490d1fa0e16..41824c710b36 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -1479,8 +1479,9 @@ static int aio_prep_rw(struct kiocb *req, const struct iocb *iocb)
 	return 0;
 }
 
-static int aio_setup_rw(int rw, const struct iocb *iocb, struct iovec **iovec,
-		bool vectored, bool compat, struct iov_iter *iter)
+static ssize_t aio_setup_rw(int rw, const struct iocb *iocb,
+		struct iovec **iovec, bool vectored, bool compat,
+		struct iov_iter *iter)
 {
 	void __user *buf = (void __user *)(uintptr_t)iocb->aio_buf;
 	size_t len = iocb->aio_nbytes;
@@ -1537,7 +1538,7 @@ static int aio_read(struct kiocb *req, const struct iocb *iocb,
 		return -EINVAL;
 
 	ret = aio_setup_rw(READ, iocb, &iovec, vectored, compat, &iter);
-	if (ret)
+	if (ret < 0)
 		return ret;
 	ret = rw_verify_area(READ, file, &req->ki_pos, iov_iter_count(&iter));
 	if (!ret)
@@ -1565,7 +1566,7 @@ static int aio_write(struct kiocb *req, const struct iocb *iocb,
 		return -EINVAL;
 
 	ret = aio_setup_rw(WRITE, iocb, &iovec, vectored, compat, &iter);
-	if (ret)
+	if (ret < 0)
 		return ret;
 	ret = rw_verify_area(WRITE, file, &req->ki_pos, iov_iter_count(&iter));
 	if (!ret) {
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 0fbb486a320e..23e08c10f486 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -1003,9 +1003,9 @@ static int io_import_fixed(struct io_ring_ctx *ctx, int rw,
 	return 0;
 }
 
-static int io_import_iovec(struct io_ring_ctx *ctx, int rw,
-			   const struct sqe_submit *s, struct iovec **iovec,
-			   struct iov_iter *iter)
+static ssize_t io_import_iovec(struct io_ring_ctx *ctx, int rw,
+			       const struct sqe_submit *s, struct iovec **iovec,
+			       struct iov_iter *iter)
 {
 	const struct io_uring_sqe *sqe = s->sqe;
 	void __user *buf = u64_to_user_ptr(READ_ONCE(sqe->addr));
@@ -1023,7 +1023,7 @@ static int io_import_iovec(struct io_ring_ctx *ctx, int rw,
 	opcode = READ_ONCE(sqe->opcode);
 	if (opcode == IORING_OP_READ_FIXED ||
 	    opcode == IORING_OP_WRITE_FIXED) {
-		int ret = io_import_fixed(ctx, rw, sqe, iter);
+		ssize_t ret = io_import_fixed(ctx, rw, sqe, iter);
 		*iovec = NULL;
 		return ret;
 	}
@@ -1089,7 +1089,7 @@ static int io_read(struct io_kiocb *req, const struct sqe_submit *s,
 	struct iov_iter iter;
 	struct file *file;
 	size_t iov_count;
-	int ret;
+	ssize_t ret;
 
 	ret = io_prep_rw(req, s, force_nonblock);
 	if (ret)
@@ -1102,7 +1102,7 @@ static int io_read(struct io_kiocb *req, const struct sqe_submit *s,
 		return -EINVAL;
 
 	ret = io_import_iovec(req->ctx, READ, s, &iovec, &iter);
-	if (ret)
+	if (ret < 0)
 		return ret;
 
 	iov_count = iov_iter_count(&iter);
@@ -1136,7 +1136,7 @@ static int io_write(struct io_kiocb *req, const struct sqe_submit *s,
 	struct iov_iter iter;
 	struct file *file;
 	size_t iov_count;
-	int ret;
+	ssize_t ret;
 
 	ret = io_prep_rw(req, s, force_nonblock);
 	if (ret)
@@ -1149,7 +1149,7 @@ static int io_write(struct io_kiocb *req, const struct sqe_submit *s,
 		return -EINVAL;
 
 	ret = io_import_iovec(req->ctx, WRITE, s, &iovec, &iter);
-	if (ret)
+	if (ret < 0)
 		return ret;
 
 	iov_count = iov_iter_count(&iter);
diff --git a/fs/splice.c b/fs/splice.c
index 14cb602d9a2f..98412721f056 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -1356,7 +1356,7 @@ SYSCALL_DEFINE4(vmsplice, int, fd, const struct iovec __user *, uiov,
 	struct iovec iovstack[UIO_FASTIOV];
 	struct iovec *iov = iovstack;
 	struct iov_iter iter;
-	long error;
+	ssize_t error;
 	struct fd f;
 	int type;
 
@@ -1367,7 +1367,7 @@ SYSCALL_DEFINE4(vmsplice, int, fd, const struct iovec __user *, uiov,
 
 	error = import_iovec(type, uiov, nr_segs,
 			     ARRAY_SIZE(iovstack), &iov, &iter);
-	if (!error) {
+	if (error >= 0) {
 		error = do_vmsplice(f.file, &iter, flags);
 		kfree(iov);
 	}
@@ -1382,7 +1382,7 @@ COMPAT_SYSCALL_DEFINE4(vmsplice, int, fd, const struct compat_iovec __user *, io
 	struct iovec iovstack[UIO_FASTIOV];
 	struct iovec *iov = iovstack;
 	struct iov_iter iter;
-	long error;
+	ssize_t error;
 	struct fd f;
 	int type;
 
@@ -1393,7 +1393,7 @@ COMPAT_SYSCALL_DEFINE4(vmsplice, int, fd, const struct compat_iovec __user *, io
 
 	error = compat_import_iovec(type, iov32, nr_segs,
 			     ARRAY_SIZE(iovstack), &iov, &iter);
-	if (!error) {
+	if (error >= 0) {
 		error = do_vmsplice(f.file, &iter, flags);
 		kfree(iov);
 	}
diff --git a/include/linux/uio.h b/include/linux/uio.h
index 2d0131ad4604..a61ceb6575ab 100644
--- a/include/linux/uio.h
+++ b/include/linux/uio.h
@@ -279,13 +279,13 @@ bool csum_and_copy_from_iter_full(void *addr, size_t bytes, __wsum *csum, struct
 size_t hash_and_copy_to_iter(const void *addr, size_t bytes, void *hashp,
 		struct iov_iter *i);
 
-int import_iovec(int type, const struct iovec __user * uvector,
+ssize_t import_iovec(int type, const struct iovec __user * uvector,
 		 unsigned nr_segs, unsigned fast_segs,
 		 struct iovec **iov, struct iov_iter *i);
 
 #ifdef CONFIG_COMPAT
 struct compat_iovec;
-int compat_import_iovec(int type, const struct compat_iovec __user * uvector,
+ssize_t compat_import_iovec(int type, const struct compat_iovec __user * uvector,
 		 unsigned nr_segs, unsigned fast_segs,
 		 struct iovec **iov, struct iov_iter *i);
 #endif
diff --git a/lib/iov_iter.c b/lib/iov_iter.c
index f99c41d4eb54..f1e0569b4539 100644
--- a/lib/iov_iter.c
+++ b/lib/iov_iter.c
@@ -1634,9 +1634,9 @@ EXPORT_SYMBOL(dup_iter);
  * on-stack array was used or not (and regardless of whether this function
  * returns an error or not).
  *
- * Return: 0 on success or negative error code on error.
+ * Return: Negative error code on error, bytes imported on success
  */
-int import_iovec(int type, const struct iovec __user * uvector,
+ssize_t import_iovec(int type, const struct iovec __user * uvector,
 		 unsigned nr_segs, unsigned fast_segs,
 		 struct iovec **iov, struct iov_iter *i)
 {
@@ -1652,16 +1652,17 @@ int import_iovec(int type, const struct iovec __user * uvector,
 	}
 	iov_iter_init(i, type, p, nr_segs, n);
 	*iov = p == *iov ? NULL : p;
-	return 0;
+	return n;
 }
 EXPORT_SYMBOL(import_iovec);
 
 #ifdef CONFIG_COMPAT
 #include <linux/compat.h>
 
-int compat_import_iovec(int type, const struct compat_iovec __user * uvector,
-		 unsigned nr_segs, unsigned fast_segs,
-		 struct iovec **iov, struct iov_iter *i)
+ssize_t compat_import_iovec(int type,
+		const struct compat_iovec __user * uvector,
+		unsigned nr_segs, unsigned fast_segs,
+		struct iovec **iov, struct iov_iter *i)
 {
 	ssize_t n;
 	struct iovec *p;
@@ -1675,7 +1676,7 @@ int compat_import_iovec(int type, const struct compat_iovec __user * uvector,
 	}
 	iov_iter_init(i, type, p, nr_segs, n);
 	*iov = p == *iov ? NULL : p;
-	return 0;
+	return n;
 }
 #endif
 
diff --git a/net/compat.c b/net/compat.c
index 3f9ce609397f..0f7ded26059e 100644
--- a/net/compat.c
+++ b/net/compat.c
@@ -80,9 +80,10 @@ int get_compat_msghdr(struct msghdr *kmsg,
 
 	kmsg->msg_iocb = NULL;
 
-	return compat_import_iovec(save_addr ? READ : WRITE,
+	err = compat_import_iovec(save_addr ? READ : WRITE,
 				   compat_ptr(msg.msg_iov), msg.msg_iovlen,
 				   UIO_FASTIOV, iov, &kmsg->msg_iter);
+	return err < 0 ? err : 0;
 }
 
 /* Bleech... */
diff --git a/net/socket.c b/net/socket.c
index 72372dc5dd70..bffec466b4f1 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -2208,9 +2208,10 @@ static int copy_msghdr_from_user(struct msghdr *kmsg,
 
 	kmsg->msg_iocb = NULL;
 
-	return import_iovec(save_addr ? READ : WRITE,
+	err = import_iovec(save_addr ? READ : WRITE,
 			    msg.msg_iov, msg.msg_iovlen,
 			    UIO_FASTIOV, iov, &kmsg->msg_iter);
+	return err < 0 ? err : 0;
 }
 
 static int ___sys_sendmsg(struct socket *sock, struct user_msghdr __user *msg,
-- 
cgit v1.2.3


From 1f52f6c0b0e846908e9c1082dab1b3f7088b82ac Mon Sep 17 00:00:00 2001
From: brakmo <brakmo@fb.com>
Date: Tue, 28 May 2019 16:59:35 -0700
Subject: bpf: Create BPF_PROG_CGROUP_INET_EGRESS_RUN_ARRAY

Create new macro BPF_PROG_CGROUP_INET_EGRESS_RUN_ARRAY() to be used by
__cgroup_bpf_run_filter_skb for EGRESS BPF progs so BPF programs can
request cwr for TCP packets.

Current cgroup skb programs can only return 0 or 1 (0 to drop the
packet). This macro changes the behavior so the low order bit
indicates whether the packet should be dropped (0) or not (1)
and the next bit is used for congestion notification (cn).

Hence, new allowed return values of CGROUP EGRESS BPF programs are:
  0: drop packet
  1: keep packet
  2: drop packet and call cwr
  3: keep packet and call cwr

This macro then converts it to one of NET_XMIT values or -EPERM
that has the effect of dropping the packet with no cn.
  0: NET_XMIT_SUCCESS  skb should be transmitted (no cn)
  1: NET_XMIT_DROP     skb should be dropped and cwr called
  2: NET_XMIT_CN       skb should be transmitted and cwr called
  3: -EPERM            skb should be dropped (no cn)

Note that when more than one BPF program is called, the packet is
dropped if at least one of programs requests it be dropped, and
there is cn if at least one program returns cn.

Signed-off-by: Lawrence Brakmo <brakmo@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h | 50 ++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 50 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index ff3e00ff84d2..2cc58fc0f413 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -552,6 +552,56 @@ _out:							\
 		_ret;					\
 	 })
 
+/* To be used by __cgroup_bpf_run_filter_skb for EGRESS BPF progs
+ * so BPF programs can request cwr for TCP packets.
+ *
+ * Current cgroup skb programs can only return 0 or 1 (0 to drop the
+ * packet. This macro changes the behavior so the low order bit
+ * indicates whether the packet should be dropped (0) or not (1)
+ * and the next bit is a congestion notification bit. This could be
+ * used by TCP to call tcp_enter_cwr()
+ *
+ * Hence, new allowed return values of CGROUP EGRESS BPF programs are:
+ *   0: drop packet
+ *   1: keep packet
+ *   2: drop packet and cn
+ *   3: keep packet and cn
+ *
+ * This macro then converts it to one of the NET_XMIT or an error
+ * code that is then interpreted as drop packet (and no cn):
+ *   0: NET_XMIT_SUCCESS  skb should be transmitted
+ *   1: NET_XMIT_DROP     skb should be dropped and cn
+ *   2: NET_XMIT_CN       skb should be transmitted and cn
+ *   3: -EPERM            skb should be dropped
+ */
+#define BPF_PROG_CGROUP_INET_EGRESS_RUN_ARRAY(array, ctx, func)		\
+	({						\
+		struct bpf_prog_array_item *_item;	\
+		struct bpf_prog *_prog;			\
+		struct bpf_prog_array *_array;		\
+		u32 ret;				\
+		u32 _ret = 1;				\
+		u32 _cn = 0;				\
+		preempt_disable();			\
+		rcu_read_lock();			\
+		_array = rcu_dereference(array);	\
+		_item = &_array->items[0];		\
+		while ((_prog = READ_ONCE(_item->prog))) {		\
+			bpf_cgroup_storage_set(_item->cgroup_storage);	\
+			ret = func(_prog, ctx);		\
+			_ret &= (ret & 1);		\
+			_cn |= (ret & 2);		\
+			_item++;			\
+		}					\
+		rcu_read_unlock();			\
+		preempt_enable();			\
+		if (_ret)				\
+			_ret = (_cn ? NET_XMIT_CN : NET_XMIT_SUCCESS);	\
+		else					\
+			_ret = (_cn ? NET_XMIT_DROP : -EPERM);		\
+		_ret;					\
+	})
+
 #define BPF_PROG_RUN_ARRAY(array, ctx, func)		\
 	__BPF_PROG_RUN_ARRAY(array, ctx, func, false)
 
-- 
cgit v1.2.3


From 5cf1e91456301f8c4f6bbc63ff76cff12f92f31b Mon Sep 17 00:00:00 2001
From: brakmo <brakmo@fb.com>
Date: Tue, 28 May 2019 16:59:36 -0700
Subject: bpf: cgroup inet skb programs can return 0 to 3

Allows cgroup inet skb programs to return values in the range [0, 3].
The second bit is used to determine if congestion occurred and the higher
level protocol should decrease its rate, e.g. TCP would call tcp_enter_cwr().

The bpf_prog must set expected_attach_type to BPF_CGROUP_INET_EGRESS
at load time if it uses the new return values (i.e. 2 or 3).

The expected_attach_type is currently not enforced for
BPF_PROG_TYPE_CGROUP_SKB, meaning that an existing bpf_prog with
expected_attach_type set to BPF_CGROUP_INET_EGRESS can attach to
BPF_CGROUP_INET_INGRESS.  Blindly enforcing expected_attach_type will
break backward compatibility.

This patch adds an enforce_expected_attach_type bit to only
enforce the expected_attach_type when the program uses the new
return values.

Signed-off-by: Lawrence Brakmo <brakmo@fb.com>
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/filter.h |  3 ++-
 kernel/bpf/syscall.c   | 12 ++++++++++++
 kernel/bpf/verifier.c  | 16 +++++++++++++---
 3 files changed, 27 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/filter.h b/include/linux/filter.h
index ba8b65270e0d..43b45d6db36d 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -526,7 +526,8 @@ struct bpf_prog {
 				blinded:1,	/* Was blinded */
 				is_func:1,	/* program is a bpf function */
 				kprobe_override:1, /* Do we override a kprobe? */
-				has_callchain_buf:1; /* callchain buffer allocated? */
+				has_callchain_buf:1, /* callchain buffer allocated? */
+				enforce_expected_attach_type:1; /* Enforce expected_attach_type checking at attach time */
 	enum bpf_prog_type	type;		/* Type of BPF program */
 	enum bpf_attach_type	expected_attach_type; /* For some prog types */
 	u32			len;		/* Number of filter blocks */
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 3d546b6f4646..1539774d78c7 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1585,6 +1585,14 @@ bpf_prog_load_check_attach_type(enum bpf_prog_type prog_type,
 		default:
 			return -EINVAL;
 		}
+	case BPF_PROG_TYPE_CGROUP_SKB:
+		switch (expected_attach_type) {
+		case BPF_CGROUP_INET_INGRESS:
+		case BPF_CGROUP_INET_EGRESS:
+			return 0;
+		default:
+			return -EINVAL;
+		}
 	default:
 		return 0;
 	}
@@ -1836,6 +1844,10 @@ static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog,
 	case BPF_PROG_TYPE_CGROUP_SOCK:
 	case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
 		return attach_type == prog->expected_attach_type ? 0 : -EINVAL;
+	case BPF_PROG_TYPE_CGROUP_SKB:
+		return prog->enforce_expected_attach_type &&
+			prog->expected_attach_type != attach_type ?
+			-EINVAL : 0;
 	default:
 		return 0;
 	}
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 2778417e6e0c..5c2cb5bd84ce 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -5508,11 +5508,16 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)
 
 static int check_return_code(struct bpf_verifier_env *env)
 {
+	struct tnum enforce_attach_type_range = tnum_unknown;
 	struct bpf_reg_state *reg;
 	struct tnum range = tnum_range(0, 1);
 
 	switch (env->prog->type) {
 	case BPF_PROG_TYPE_CGROUP_SKB:
+		if (env->prog->expected_attach_type == BPF_CGROUP_INET_EGRESS) {
+			range = tnum_range(0, 3);
+			enforce_attach_type_range = tnum_range(2, 3);
+		}
 	case BPF_PROG_TYPE_CGROUP_SOCK:
 	case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
 	case BPF_PROG_TYPE_SOCK_OPS:
@@ -5531,18 +5536,23 @@ static int check_return_code(struct bpf_verifier_env *env)
 	}
 
 	if (!tnum_in(range, reg->var_off)) {
+		char tn_buf[48];
+
 		verbose(env, "At program exit the register R0 ");
 		if (!tnum_is_unknown(reg->var_off)) {
-			char tn_buf[48];
-
 			tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
 			verbose(env, "has value %s", tn_buf);
 		} else {
 			verbose(env, "has unknown scalar value");
 		}
-		verbose(env, " should have been 0 or 1\n");
+		tnum_strn(tn_buf, sizeof(tn_buf), range);
+		verbose(env, " should have been %s\n", tn_buf);
 		return -EINVAL;
 	}
+
+	if (!tnum_is_unknown(enforce_attach_type_range) &&
+	    tnum_in(enforce_attach_type_range, reg->var_off))
+		env->prog->enforce_expected_attach_type = 1;
 	return 0;
 }
 
-- 
cgit v1.2.3


From 3539b96e041c06e4317082816d90ec09160aeb11 Mon Sep 17 00:00:00 2001
From: Roman Gushchin <guro@fb.com>
Date: Wed, 29 May 2019 18:03:57 -0700
Subject: bpf: group memory related fields in struct bpf_map_memory

Group "user" and "pages" fields of bpf_map into the bpf_map_memory
structure. Later it can be extended with "memcg" and other related
information.

The main reason for such a change (besides cosmetics) is to pass the
bpf_map_memory structure to charging functions before the actual
allocation of bpf_map.

Signed-off-by: Roman Gushchin <guro@fb.com>
Acked-by: Song Liu <songliubraving@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h           | 10 +++++++---
 kernel/bpf/arraymap.c         |  2 +-
 kernel/bpf/cpumap.c           |  4 ++--
 kernel/bpf/devmap.c           |  4 ++--
 kernel/bpf/hashtab.c          |  4 ++--
 kernel/bpf/local_storage.c    |  2 +-
 kernel/bpf/lpm_trie.c         |  4 ++--
 kernel/bpf/queue_stack_maps.c |  2 +-
 kernel/bpf/reuseport_array.c  |  2 +-
 kernel/bpf/stackmap.c         |  4 ++--
 kernel/bpf/syscall.c          | 19 ++++++++++---------
 kernel/bpf/xskmap.c           |  4 ++--
 net/core/bpf_sk_storage.c     |  2 +-
 net/core/sock_map.c           |  4 ++--
 14 files changed, 36 insertions(+), 31 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 2cc58fc0f413..2e7c1c40d949 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -66,6 +66,11 @@ struct bpf_map_ops {
 				     u64 imm, u32 *off);
 };
 
+struct bpf_map_memory {
+	u32 pages;
+	struct user_struct *user;
+};
+
 struct bpf_map {
 	/* The first two cachelines with read-mostly members of which some
 	 * are also accessed in fast-path (e.g. ops, max_entries).
@@ -86,7 +91,7 @@ struct bpf_map {
 	u32 btf_key_type_id;
 	u32 btf_value_type_id;
 	struct btf *btf;
-	u32 pages;
+	struct bpf_map_memory memory;
 	bool unpriv_array;
 	bool frozen; /* write-once */
 	/* 48 bytes hole */
@@ -94,8 +99,7 @@ struct bpf_map {
 	/* The 3rd and 4th cacheline with misc members to avoid false sharing
 	 * particularly with refcounting.
 	 */
-	struct user_struct *user ____cacheline_aligned;
-	atomic_t refcnt;
+	atomic_t refcnt ____cacheline_aligned;
 	atomic_t usercnt;
 	struct work_struct work;
 	char name[BPF_OBJ_NAME_LEN];
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 584636c9e2eb..8fda24e78193 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -138,7 +138,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
 
 	/* copy mandatory map attributes */
 	bpf_map_init_from_attr(&array->map, attr);
-	array->map.pages = cost;
+	array->map.memory.pages = cost;
 	array->elem_size = elem_size;
 
 	if (percpu && bpf_array_alloc_percpu(array)) {
diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
index cf727d77c6c6..035268add724 100644
--- a/kernel/bpf/cpumap.c
+++ b/kernel/bpf/cpumap.c
@@ -108,10 +108,10 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr)
 	cost += cpu_map_bitmap_size(attr) * num_possible_cpus();
 	if (cost >= U32_MAX - PAGE_SIZE)
 		goto free_cmap;
-	cmap->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
+	cmap->map.memory.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
 
 	/* Notice returns -EPERM on if map size is larger than memlock limit */
-	ret = bpf_map_precharge_memlock(cmap->map.pages);
+	ret = bpf_map_precharge_memlock(cmap->map.memory.pages);
 	if (ret) {
 		err = ret;
 		goto free_cmap;
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index 1e525d70f833..f6c57efb1d0d 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -111,10 +111,10 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
 	if (cost >= U32_MAX - PAGE_SIZE)
 		goto free_dtab;
 
-	dtab->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
+	dtab->map.memory.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
 
 	/* if map size is larger than memlock limit, reject it early */
-	err = bpf_map_precharge_memlock(dtab->map.pages);
+	err = bpf_map_precharge_memlock(dtab->map.memory.pages);
 	if (err)
 		goto free_dtab;
 
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 0f2708fde5f7..15bf228d2e98 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -364,10 +364,10 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
 		/* make sure page count doesn't overflow */
 		goto free_htab;
 
-	htab->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
+	htab->map.memory.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
 
 	/* if map size is larger than memlock limit, reject it early */
-	err = bpf_map_precharge_memlock(htab->map.pages);
+	err = bpf_map_precharge_memlock(htab->map.memory.pages);
 	if (err)
 		goto free_htab;
 
diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c
index e48302ecb389..574325276650 100644
--- a/kernel/bpf/local_storage.c
+++ b/kernel/bpf/local_storage.c
@@ -303,7 +303,7 @@ static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr)
 	if (!map)
 		return ERR_PTR(-ENOMEM);
 
-	map->map.pages = pages;
+	map->map.memory.pages = pages;
 
 	/* copy mandatory map attributes */
 	bpf_map_init_from_attr(&map->map, attr);
diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c
index e61630c2e50b..8e423a582760 100644
--- a/kernel/bpf/lpm_trie.c
+++ b/kernel/bpf/lpm_trie.c
@@ -578,9 +578,9 @@ static struct bpf_map *trie_alloc(union bpf_attr *attr)
 		goto out_err;
 	}
 
-	trie->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
+	trie->map.memory.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
 
-	ret = bpf_map_precharge_memlock(trie->map.pages);
+	ret = bpf_map_precharge_memlock(trie->map.memory.pages);
 	if (ret)
 		goto out_err;
 
diff --git a/kernel/bpf/queue_stack_maps.c b/kernel/bpf/queue_stack_maps.c
index 0b140d236889..8a510e71d486 100644
--- a/kernel/bpf/queue_stack_maps.c
+++ b/kernel/bpf/queue_stack_maps.c
@@ -89,7 +89,7 @@ static struct bpf_map *queue_stack_map_alloc(union bpf_attr *attr)
 
 	bpf_map_init_from_attr(&qs->map, attr);
 
-	qs->map.pages = cost;
+	qs->map.memory.pages = cost;
 	qs->size = size;
 
 	raw_spin_lock_init(&qs->lock);
diff --git a/kernel/bpf/reuseport_array.c b/kernel/bpf/reuseport_array.c
index 18e225de80ff..819515242739 100644
--- a/kernel/bpf/reuseport_array.c
+++ b/kernel/bpf/reuseport_array.c
@@ -176,7 +176,7 @@ static struct bpf_map *reuseport_array_alloc(union bpf_attr *attr)
 
 	/* copy mandatory map attributes */
 	bpf_map_init_from_attr(&array->map, attr);
-	array->map.pages = cost;
+	array->map.memory.pages = cost;
 
 	return &array->map;
 }
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index 950ab2f28922..08d4efff73ac 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -131,9 +131,9 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
 	bpf_map_init_from_attr(&smap->map, attr);
 	smap->map.value_size = value_size;
 	smap->n_buckets = n_buckets;
-	smap->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
+	smap->map.memory.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
 
-	err = bpf_map_precharge_memlock(smap->map.pages);
+	err = bpf_map_precharge_memlock(smap->map.memory.pages);
 	if (err)
 		goto free_smap;
 
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 1539774d78c7..8289a2ce14fc 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -222,19 +222,20 @@ static int bpf_map_init_memlock(struct bpf_map *map)
 	struct user_struct *user = get_current_user();
 	int ret;
 
-	ret = bpf_charge_memlock(user, map->pages);
+	ret = bpf_charge_memlock(user, map->memory.pages);
 	if (ret) {
 		free_uid(user);
 		return ret;
 	}
-	map->user = user;
+	map->memory.user = user;
 	return ret;
 }
 
 static void bpf_map_release_memlock(struct bpf_map *map)
 {
-	struct user_struct *user = map->user;
-	bpf_uncharge_memlock(user, map->pages);
+	struct user_struct *user = map->memory.user;
+
+	bpf_uncharge_memlock(user, map->memory.pages);
 	free_uid(user);
 }
 
@@ -242,17 +243,17 @@ int bpf_map_charge_memlock(struct bpf_map *map, u32 pages)
 {
 	int ret;
 
-	ret = bpf_charge_memlock(map->user, pages);
+	ret = bpf_charge_memlock(map->memory.user, pages);
 	if (ret)
 		return ret;
-	map->pages += pages;
+	map->memory.pages += pages;
 	return ret;
 }
 
 void bpf_map_uncharge_memlock(struct bpf_map *map, u32 pages)
 {
-	bpf_uncharge_memlock(map->user, pages);
-	map->pages -= pages;
+	bpf_uncharge_memlock(map->memory.user, pages);
+	map->memory.pages -= pages;
 }
 
 static int bpf_map_alloc_id(struct bpf_map *map)
@@ -395,7 +396,7 @@ static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp)
 		   map->value_size,
 		   map->max_entries,
 		   map->map_flags,
-		   map->pages * 1ULL << PAGE_SHIFT,
+		   map->memory.pages * 1ULL << PAGE_SHIFT,
 		   map->id,
 		   READ_ONCE(map->frozen));
 
diff --git a/kernel/bpf/xskmap.c b/kernel/bpf/xskmap.c
index 686d244e798d..f816ee1a0fa0 100644
--- a/kernel/bpf/xskmap.c
+++ b/kernel/bpf/xskmap.c
@@ -40,10 +40,10 @@ static struct bpf_map *xsk_map_alloc(union bpf_attr *attr)
 	if (cost >= U32_MAX - PAGE_SIZE)
 		goto free_m;
 
-	m->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
+	m->map.memory.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
 
 	/* Notice returns -EPERM on if map size is larger than memlock limit */
-	err = bpf_map_precharge_memlock(m->map.pages);
+	err = bpf_map_precharge_memlock(m->map.memory.pages);
 	if (err)
 		goto free_m;
 
diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c
index 9a8aaf8e235d..92581c3ff220 100644
--- a/net/core/bpf_sk_storage.c
+++ b/net/core/bpf_sk_storage.c
@@ -659,7 +659,7 @@ static struct bpf_map *bpf_sk_storage_map_alloc(union bpf_attr *attr)
 	smap->elem_size = sizeof(struct bpf_sk_storage_elem) + attr->value_size;
 	smap->cache_idx = (unsigned int)atomic_inc_return(&cache_idx) %
 		BPF_SK_STORAGE_CACHE_SIZE;
-	smap->map.pages = pages;
+	smap->map.memory.pages = pages;
 
 	return &smap->map;
 }
diff --git a/net/core/sock_map.c b/net/core/sock_map.c
index be6092ac69f8..4eb5b6a1b29f 100644
--- a/net/core/sock_map.c
+++ b/net/core/sock_map.c
@@ -49,8 +49,8 @@ static struct bpf_map *sock_map_alloc(union bpf_attr *attr)
 		goto free_stab;
 	}
 
-	stab->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
-	err = bpf_map_precharge_memlock(stab->map.pages);
+	stab->map.memory.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
+	err = bpf_map_precharge_memlock(stab->map.memory.pages);
 	if (err)
 		goto free_stab;
 
-- 
cgit v1.2.3


From b936ca643ade11f265fa10e5fb71c20d9c5243f1 Mon Sep 17 00:00:00 2001
From: Roman Gushchin <guro@fb.com>
Date: Wed, 29 May 2019 18:03:58 -0700
Subject: bpf: rework memlock-based memory accounting for maps

In order to unify the existing memlock charging code with the
memcg-based memory accounting, which will be added later, let's
rework the current scheme.

Currently the following design is used:
  1) .alloc() callback optionally checks if the allocation will likely
     succeed using bpf_map_precharge_memlock()
  2) .alloc() performs actual allocations
  3) .alloc() callback calculates map cost and sets map.memory.pages
  4) map_create() calls bpf_map_init_memlock() which sets map.memory.user
     and performs actual charging; in case of failure the map is
     destroyed
  <map is in use>
  1) bpf_map_free_deferred() calls bpf_map_release_memlock(), which
     performs uncharge and releases the user
  2) .map_free() callback releases the memory

The scheme can be simplified and made more robust:
  1) .alloc() calculates map cost and calls bpf_map_charge_init()
  2) bpf_map_charge_init() sets map.memory.user and performs actual
    charge
  3) .alloc() performs actual allocations
  <map is in use>
  1) .map_free() callback releases the memory
  2) bpf_map_charge_finish() performs uncharge and releases the user

The new scheme also allows reusing the bpf_map_charge_init()/finish()
functions for memcg-based accounting. Because charges are performed
before actual allocations and uncharges after freeing the memory,
no bogus memory pressure can be created.

In cases when the map structure is not available (e.g. it's not
created yet, or is already destroyed), an on-stack bpf_map_memory
structure is used. The charge can be transferred with the
bpf_map_charge_move() function.

Signed-off-by: Roman Gushchin <guro@fb.com>
Acked-by: Song Liu <songliubraving@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h           |  5 +++-
 kernel/bpf/arraymap.c         | 10 +++++--
 kernel/bpf/cpumap.c           |  8 +++--
 kernel/bpf/devmap.c           | 13 ++++----
 kernel/bpf/hashtab.c          | 11 +++----
 kernel/bpf/local_storage.c    |  9 ++++--
 kernel/bpf/lpm_trie.c         |  5 ++--
 kernel/bpf/queue_stack_maps.c |  9 ++++--
 kernel/bpf/reuseport_array.c  |  9 ++++--
 kernel/bpf/stackmap.c         | 30 +++++++++++--------
 kernel/bpf/syscall.c          | 69 +++++++++++++++++++++----------------------
 kernel/bpf/xskmap.c           |  9 +++---
 net/core/bpf_sk_storage.c     |  8 +++--
 net/core/sock_map.c           |  5 ++--
 14 files changed, 112 insertions(+), 88 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 2e7c1c40d949..3c8f24f402bf 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -650,9 +650,12 @@ struct bpf_map *__bpf_map_get(struct fd f);
 struct bpf_map * __must_check bpf_map_inc(struct bpf_map *map, bool uref);
 void bpf_map_put_with_uref(struct bpf_map *map);
 void bpf_map_put(struct bpf_map *map);
-int bpf_map_precharge_memlock(u32 pages);
 int bpf_map_charge_memlock(struct bpf_map *map, u32 pages);
 void bpf_map_uncharge_memlock(struct bpf_map *map, u32 pages);
+int bpf_map_charge_init(struct bpf_map_memory *mem, u32 pages);
+void bpf_map_charge_finish(struct bpf_map_memory *mem);
+void bpf_map_charge_move(struct bpf_map_memory *dst,
+			 struct bpf_map_memory *src);
 void *bpf_map_area_alloc(size_t size, int numa_node);
 void bpf_map_area_free(void *base);
 void bpf_map_init_from_attr(struct bpf_map *map, union bpf_attr *attr);
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 8fda24e78193..3552da4407d9 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -83,6 +83,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
 	u32 elem_size, index_mask, max_entries;
 	bool unpriv = !capable(CAP_SYS_ADMIN);
 	u64 cost, array_size, mask64;
+	struct bpf_map_memory mem;
 	struct bpf_array *array;
 
 	elem_size = round_up(attr->value_size, 8);
@@ -125,23 +126,26 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
 	}
 	cost = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
 
-	ret = bpf_map_precharge_memlock(cost);
+	ret = bpf_map_charge_init(&mem, cost);
 	if (ret < 0)
 		return ERR_PTR(ret);
 
 	/* allocate all map elements and zero-initialize them */
 	array = bpf_map_area_alloc(array_size, numa_node);
-	if (!array)
+	if (!array) {
+		bpf_map_charge_finish(&mem);
 		return ERR_PTR(-ENOMEM);
+	}
 	array->index_mask = index_mask;
 	array->map.unpriv_array = unpriv;
 
 	/* copy mandatory map attributes */
 	bpf_map_init_from_attr(&array->map, attr);
-	array->map.memory.pages = cost;
+	bpf_map_charge_move(&array->map.memory, &mem);
 	array->elem_size = elem_size;
 
 	if (percpu && bpf_array_alloc_percpu(array)) {
+		bpf_map_charge_finish(&array->map.memory);
 		bpf_map_area_free(array);
 		return ERR_PTR(-ENOMEM);
 	}
diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
index 035268add724..c633c8d68023 100644
--- a/kernel/bpf/cpumap.c
+++ b/kernel/bpf/cpumap.c
@@ -108,10 +108,10 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr)
 	cost += cpu_map_bitmap_size(attr) * num_possible_cpus();
 	if (cost >= U32_MAX - PAGE_SIZE)
 		goto free_cmap;
-	cmap->map.memory.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
 
 	/* Notice returns -EPERM on if map size is larger than memlock limit */
-	ret = bpf_map_precharge_memlock(cmap->map.memory.pages);
+	ret = bpf_map_charge_init(&cmap->map.memory,
+				  round_up(cost, PAGE_SIZE) >> PAGE_SHIFT);
 	if (ret) {
 		err = ret;
 		goto free_cmap;
@@ -121,7 +121,7 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr)
 	cmap->flush_needed = __alloc_percpu(cpu_map_bitmap_size(attr),
 					    __alignof__(unsigned long));
 	if (!cmap->flush_needed)
-		goto free_cmap;
+		goto free_charge;
 
 	/* Alloc array for possible remote "destination" CPUs */
 	cmap->cpu_map = bpf_map_area_alloc(cmap->map.max_entries *
@@ -133,6 +133,8 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr)
 	return &cmap->map;
 free_percpu:
 	free_percpu(cmap->flush_needed);
+free_charge:
+	bpf_map_charge_finish(&cmap->map.memory);
 free_cmap:
 	kfree(cmap);
 	return ERR_PTR(err);
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index f6c57efb1d0d..371bd880ed58 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -111,10 +111,9 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
 	if (cost >= U32_MAX - PAGE_SIZE)
 		goto free_dtab;
 
-	dtab->map.memory.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
-
-	/* if map size is larger than memlock limit, reject it early */
-	err = bpf_map_precharge_memlock(dtab->map.memory.pages);
+	/* if map size is larger than memlock limit, reject it */
+	err = bpf_map_charge_init(&dtab->map.memory,
+				  round_up(cost, PAGE_SIZE) >> PAGE_SHIFT);
 	if (err)
 		goto free_dtab;
 
@@ -125,19 +124,21 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
 						__alignof__(unsigned long),
 						GFP_KERNEL | __GFP_NOWARN);
 	if (!dtab->flush_needed)
-		goto free_dtab;
+		goto free_charge;
 
 	dtab->netdev_map = bpf_map_area_alloc(dtab->map.max_entries *
 					      sizeof(struct bpf_dtab_netdev *),
 					      dtab->map.numa_node);
 	if (!dtab->netdev_map)
-		goto free_dtab;
+		goto free_charge;
 
 	spin_lock(&dev_map_lock);
 	list_add_tail_rcu(&dtab->list, &dev_map_list);
 	spin_unlock(&dev_map_lock);
 
 	return &dtab->map;
+free_charge:
+	bpf_map_charge_finish(&dtab->map.memory);
 free_dtab:
 	free_percpu(dtab->flush_needed);
 	kfree(dtab);
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 15bf228d2e98..b0bdc7b040ad 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -364,10 +364,9 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
 		/* make sure page count doesn't overflow */
 		goto free_htab;
 
-	htab->map.memory.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
-
-	/* if map size is larger than memlock limit, reject it early */
-	err = bpf_map_precharge_memlock(htab->map.memory.pages);
+	/* if map size is larger than memlock limit, reject it */
+	err = bpf_map_charge_init(&htab->map.memory,
+				  round_up(cost, PAGE_SIZE) >> PAGE_SHIFT);
 	if (err)
 		goto free_htab;
 
@@ -376,7 +375,7 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
 					   sizeof(struct bucket),
 					   htab->map.numa_node);
 	if (!htab->buckets)
-		goto free_htab;
+		goto free_charge;
 
 	if (htab->map.map_flags & BPF_F_ZERO_SEED)
 		htab->hashrnd = 0;
@@ -409,6 +408,8 @@ free_prealloc:
 	prealloc_destroy(htab);
 free_buckets:
 	bpf_map_area_free(htab->buckets);
+free_charge:
+	bpf_map_charge_finish(&htab->map.memory);
 free_htab:
 	kfree(htab);
 	return ERR_PTR(err);
diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c
index 574325276650..e49bfd4f4f6d 100644
--- a/kernel/bpf/local_storage.c
+++ b/kernel/bpf/local_storage.c
@@ -272,6 +272,7 @@ static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr)
 {
 	int numa_node = bpf_map_attr_numa_node(attr);
 	struct bpf_cgroup_storage_map *map;
+	struct bpf_map_memory mem;
 	u32 pages;
 	int ret;
 
@@ -294,16 +295,18 @@ static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr)
 
 	pages = round_up(sizeof(struct bpf_cgroup_storage_map), PAGE_SIZE) >>
 		PAGE_SHIFT;
-	ret = bpf_map_precharge_memlock(pages);
+	ret = bpf_map_charge_init(&mem, pages);
 	if (ret < 0)
 		return ERR_PTR(ret);
 
 	map = kmalloc_node(sizeof(struct bpf_cgroup_storage_map),
 			   __GFP_ZERO | GFP_USER, numa_node);
-	if (!map)
+	if (!map) {
+		bpf_map_charge_finish(&mem);
 		return ERR_PTR(-ENOMEM);
+	}
 
-	map->map.memory.pages = pages;
+	bpf_map_charge_move(&map->map.memory, &mem);
 
 	/* copy mandatory map attributes */
 	bpf_map_init_from_attr(&map->map, attr);
diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c
index 8e423a582760..6345a8d2dcd0 100644
--- a/kernel/bpf/lpm_trie.c
+++ b/kernel/bpf/lpm_trie.c
@@ -578,9 +578,8 @@ static struct bpf_map *trie_alloc(union bpf_attr *attr)
 		goto out_err;
 	}
 
-	trie->map.memory.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
-
-	ret = bpf_map_precharge_memlock(trie->map.memory.pages);
+	ret = bpf_map_charge_init(&trie->map.memory,
+				  round_up(cost, PAGE_SIZE) >> PAGE_SHIFT);
 	if (ret)
 		goto out_err;
 
diff --git a/kernel/bpf/queue_stack_maps.c b/kernel/bpf/queue_stack_maps.c
index 8a510e71d486..224cb0fd8f03 100644
--- a/kernel/bpf/queue_stack_maps.c
+++ b/kernel/bpf/queue_stack_maps.c
@@ -67,6 +67,7 @@ static int queue_stack_map_alloc_check(union bpf_attr *attr)
 static struct bpf_map *queue_stack_map_alloc(union bpf_attr *attr)
 {
 	int ret, numa_node = bpf_map_attr_numa_node(attr);
+	struct bpf_map_memory mem = {0};
 	struct bpf_queue_stack *qs;
 	u64 size, queue_size, cost;
 
@@ -77,19 +78,21 @@ static struct bpf_map *queue_stack_map_alloc(union bpf_attr *attr)
 
 	cost = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
 
-	ret = bpf_map_precharge_memlock(cost);
+	ret = bpf_map_charge_init(&mem, cost);
 	if (ret < 0)
 		return ERR_PTR(ret);
 
 	qs = bpf_map_area_alloc(queue_size, numa_node);
-	if (!qs)
+	if (!qs) {
+		bpf_map_charge_finish(&mem);
 		return ERR_PTR(-ENOMEM);
+	}
 
 	memset(qs, 0, sizeof(*qs));
 
 	bpf_map_init_from_attr(&qs->map, attr);
 
-	qs->map.memory.pages = cost;
+	bpf_map_charge_move(&qs->map.memory, &mem);
 	qs->size = size;
 
 	raw_spin_lock_init(&qs->lock);
diff --git a/kernel/bpf/reuseport_array.c b/kernel/bpf/reuseport_array.c
index 819515242739..5c6e25b1b9b1 100644
--- a/kernel/bpf/reuseport_array.c
+++ b/kernel/bpf/reuseport_array.c
@@ -151,6 +151,7 @@ static struct bpf_map *reuseport_array_alloc(union bpf_attr *attr)
 {
 	int err, numa_node = bpf_map_attr_numa_node(attr);
 	struct reuseport_array *array;
+	struct bpf_map_memory mem;
 	u64 cost, array_size;
 
 	if (!capable(CAP_SYS_ADMIN))
@@ -165,18 +166,20 @@ static struct bpf_map *reuseport_array_alloc(union bpf_attr *attr)
 		return ERR_PTR(-ENOMEM);
 	cost = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
 
-	err = bpf_map_precharge_memlock(cost);
+	err = bpf_map_charge_init(&mem, cost);
 	if (err)
 		return ERR_PTR(err);
 
 	/* allocate all map elements and zero-initialize them */
 	array = bpf_map_area_alloc(array_size, numa_node);
-	if (!array)
+	if (!array) {
+		bpf_map_charge_finish(&mem);
 		return ERR_PTR(-ENOMEM);
+	}
 
 	/* copy mandatory map attributes */
 	bpf_map_init_from_attr(&array->map, attr);
-	array->map.memory.pages = cost;
+	bpf_map_charge_move(&array->map.memory, &mem);
 
 	return &array->map;
 }
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index 08d4efff73ac..8da24ca65d97 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -89,6 +89,7 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
 {
 	u32 value_size = attr->value_size;
 	struct bpf_stack_map *smap;
+	struct bpf_map_memory mem;
 	u64 cost, n_buckets;
 	int err;
 
@@ -116,40 +117,43 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
 	n_buckets = roundup_pow_of_two(attr->max_entries);
 
 	cost = n_buckets * sizeof(struct stack_map_bucket *) + sizeof(*smap);
+	if (cost >= U32_MAX - PAGE_SIZE)
+		return ERR_PTR(-E2BIG);
+	cost += n_buckets * (value_size + sizeof(struct stack_map_bucket));
 	if (cost >= U32_MAX - PAGE_SIZE)
 		return ERR_PTR(-E2BIG);
 
+	err = bpf_map_charge_init(&mem,
+				  round_up(cost, PAGE_SIZE) >> PAGE_SHIFT);
+	if (err)
+		return ERR_PTR(err);
+
 	smap = bpf_map_area_alloc(cost, bpf_map_attr_numa_node(attr));
-	if (!smap)
+	if (!smap) {
+		bpf_map_charge_finish(&mem);
 		return ERR_PTR(-ENOMEM);
-
-	err = -E2BIG;
-	cost += n_buckets * (value_size + sizeof(struct stack_map_bucket));
-	if (cost >= U32_MAX - PAGE_SIZE)
-		goto free_smap;
+	}
 
 	bpf_map_init_from_attr(&smap->map, attr);
 	smap->map.value_size = value_size;
 	smap->n_buckets = n_buckets;
-	smap->map.memory.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
-
-	err = bpf_map_precharge_memlock(smap->map.memory.pages);
-	if (err)
-		goto free_smap;
 
 	err = get_callchain_buffers(sysctl_perf_event_max_stack);
 	if (err)
-		goto free_smap;
+		goto free_charge;
 
 	err = prealloc_elems_and_freelist(smap);
 	if (err)
 		goto put_buffers;
 
+	bpf_map_charge_move(&smap->map.memory, &mem);
+
 	return &smap->map;
 
 put_buffers:
 	put_callchain_buffers();
-free_smap:
+free_charge:
+	bpf_map_charge_finish(&mem);
 	bpf_map_area_free(smap);
 	return ERR_PTR(err);
 }
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 8289a2ce14fc..4a5ebad99154 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -188,19 +188,6 @@ void bpf_map_init_from_attr(struct bpf_map *map, union bpf_attr *attr)
 	map->numa_node = bpf_map_attr_numa_node(attr);
 }
 
-int bpf_map_precharge_memlock(u32 pages)
-{
-	struct user_struct *user = get_current_user();
-	unsigned long memlock_limit, cur;
-
-	memlock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
-	cur = atomic_long_read(&user->locked_vm);
-	free_uid(user);
-	if (cur + pages > memlock_limit)
-		return -EPERM;
-	return 0;
-}
-
 static int bpf_charge_memlock(struct user_struct *user, u32 pages)
 {
 	unsigned long memlock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
@@ -214,29 +201,40 @@ static int bpf_charge_memlock(struct user_struct *user, u32 pages)
 
 static void bpf_uncharge_memlock(struct user_struct *user, u32 pages)
 {
-	atomic_long_sub(pages, &user->locked_vm);
+	if (user)
+		atomic_long_sub(pages, &user->locked_vm);
 }
 
-static int bpf_map_init_memlock(struct bpf_map *map)
+int bpf_map_charge_init(struct bpf_map_memory *mem, u32 pages)
 {
 	struct user_struct *user = get_current_user();
 	int ret;
 
-	ret = bpf_charge_memlock(user, map->memory.pages);
+	ret = bpf_charge_memlock(user, pages);
 	if (ret) {
 		free_uid(user);
 		return ret;
 	}
-	map->memory.user = user;
-	return ret;
+
+	mem->pages = pages;
+	mem->user = user;
+
+	return 0;
 }
 
-static void bpf_map_release_memlock(struct bpf_map *map)
+void bpf_map_charge_finish(struct bpf_map_memory *mem)
 {
-	struct user_struct *user = map->memory.user;
+	bpf_uncharge_memlock(mem->user, mem->pages);
+	free_uid(mem->user);
+}
 
-	bpf_uncharge_memlock(user, map->memory.pages);
-	free_uid(user);
+void bpf_map_charge_move(struct bpf_map_memory *dst,
+			 struct bpf_map_memory *src)
+{
+	*dst = *src;
+
+	/* Make sure src will not be used for the redundant uncharging. */
+	memset(src, 0, sizeof(struct bpf_map_memory));
 }
 
 int bpf_map_charge_memlock(struct bpf_map *map, u32 pages)
@@ -304,11 +302,13 @@ void bpf_map_free_id(struct bpf_map *map, bool do_idr_lock)
 static void bpf_map_free_deferred(struct work_struct *work)
 {
 	struct bpf_map *map = container_of(work, struct bpf_map, work);
+	struct bpf_map_memory mem;
 
-	bpf_map_release_memlock(map);
+	bpf_map_charge_move(&mem, &map->memory);
 	security_bpf_map_free(map);
 	/* implementation dependent freeing */
 	map->ops->map_free(map);
+	bpf_map_charge_finish(&mem);
 }
 
 static void bpf_map_put_uref(struct bpf_map *map)
@@ -550,6 +550,7 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf,
 static int map_create(union bpf_attr *attr)
 {
 	int numa_node = bpf_map_attr_numa_node(attr);
+	struct bpf_map_memory mem;
 	struct bpf_map *map;
 	int f_flags;
 	int err;
@@ -574,7 +575,7 @@ static int map_create(union bpf_attr *attr)
 
 	err = bpf_obj_name_cpy(map->name, attr->map_name);
 	if (err)
-		goto free_map_nouncharge;
+		goto free_map;
 
 	atomic_set(&map->refcnt, 1);
 	atomic_set(&map->usercnt, 1);
@@ -584,20 +585,20 @@ static int map_create(union bpf_attr *attr)
 
 		if (!attr->btf_value_type_id) {
 			err = -EINVAL;
-			goto free_map_nouncharge;
+			goto free_map;
 		}
 
 		btf = btf_get_by_fd(attr->btf_fd);
 		if (IS_ERR(btf)) {
 			err = PTR_ERR(btf);
-			goto free_map_nouncharge;
+			goto free_map;
 		}
 
 		err = map_check_btf(map, btf, attr->btf_key_type_id,
 				    attr->btf_value_type_id);
 		if (err) {
 			btf_put(btf);
-			goto free_map_nouncharge;
+			goto free_map;
 		}
 
 		map->btf = btf;
@@ -609,15 +610,11 @@ static int map_create(union bpf_attr *attr)
 
 	err = security_bpf_map_alloc(map);
 	if (err)
-		goto free_map_nouncharge;
-
-	err = bpf_map_init_memlock(map);
-	if (err)
-		goto free_map_sec;
+		goto free_map;
 
 	err = bpf_map_alloc_id(map);
 	if (err)
-		goto free_map;
+		goto free_map_sec;
 
 	err = bpf_map_new_fd(map, f_flags);
 	if (err < 0) {
@@ -633,13 +630,13 @@ static int map_create(union bpf_attr *attr)
 
 	return err;
 
-free_map:
-	bpf_map_release_memlock(map);
 free_map_sec:
 	security_bpf_map_free(map);
-free_map_nouncharge:
+free_map:
 	btf_put(map->btf);
+	bpf_map_charge_move(&mem, &map->memory);
 	map->ops->map_free(map);
+	bpf_map_charge_finish(&mem);
 	return err;
 }
 
diff --git a/kernel/bpf/xskmap.c b/kernel/bpf/xskmap.c
index f816ee1a0fa0..a329dab7c7a4 100644
--- a/kernel/bpf/xskmap.c
+++ b/kernel/bpf/xskmap.c
@@ -40,10 +40,9 @@ static struct bpf_map *xsk_map_alloc(union bpf_attr *attr)
 	if (cost >= U32_MAX - PAGE_SIZE)
 		goto free_m;
 
-	m->map.memory.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
-
 	/* Notice returns -EPERM on if map size is larger than memlock limit */
-	err = bpf_map_precharge_memlock(m->map.memory.pages);
+	err = bpf_map_charge_init(&m->map.memory,
+				  round_up(cost, PAGE_SIZE) >> PAGE_SHIFT);
 	if (err)
 		goto free_m;
 
@@ -51,7 +50,7 @@ static struct bpf_map *xsk_map_alloc(union bpf_attr *attr)
 
 	m->flush_list = alloc_percpu(struct list_head);
 	if (!m->flush_list)
-		goto free_m;
+		goto free_charge;
 
 	for_each_possible_cpu(cpu)
 		INIT_LIST_HEAD(per_cpu_ptr(m->flush_list, cpu));
@@ -65,6 +64,8 @@ static struct bpf_map *xsk_map_alloc(union bpf_attr *attr)
 
 free_percpu:
 	free_percpu(m->flush_list);
+free_charge:
+	bpf_map_charge_finish(&m->map.memory);
 free_m:
 	kfree(m);
 	return ERR_PTR(err);
diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c
index 92581c3ff220..621a0b07ff11 100644
--- a/net/core/bpf_sk_storage.c
+++ b/net/core/bpf_sk_storage.c
@@ -640,13 +640,16 @@ static struct bpf_map *bpf_sk_storage_map_alloc(union bpf_attr *attr)
 	cost = sizeof(*smap->buckets) * nbuckets + sizeof(*smap);
 	pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
 
-	ret = bpf_map_precharge_memlock(pages);
-	if (ret < 0)
+	ret = bpf_map_charge_init(&smap->map.memory, pages);
+	if (ret < 0) {
+		kfree(smap);
 		return ERR_PTR(ret);
+	}
 
 	smap->buckets = kvcalloc(sizeof(*smap->buckets), nbuckets,
 				 GFP_USER | __GFP_NOWARN);
 	if (!smap->buckets) {
+		bpf_map_charge_finish(&smap->map.memory);
 		kfree(smap);
 		return ERR_PTR(-ENOMEM);
 	}
@@ -659,7 +662,6 @@ static struct bpf_map *bpf_sk_storage_map_alloc(union bpf_attr *attr)
 	smap->elem_size = sizeof(struct bpf_sk_storage_elem) + attr->value_size;
 	smap->cache_idx = (unsigned int)atomic_inc_return(&cache_idx) %
 		BPF_SK_STORAGE_CACHE_SIZE;
-	smap->map.memory.pages = pages;
 
 	return &smap->map;
 }
diff --git a/net/core/sock_map.c b/net/core/sock_map.c
index 4eb5b6a1b29f..1028c922a149 100644
--- a/net/core/sock_map.c
+++ b/net/core/sock_map.c
@@ -49,8 +49,8 @@ static struct bpf_map *sock_map_alloc(union bpf_attr *attr)
 		goto free_stab;
 	}
 
-	stab->map.memory.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
-	err = bpf_map_precharge_memlock(stab->map.memory.pages);
+	err = bpf_map_charge_init(&stab->map.memory,
+				  round_up(cost, PAGE_SIZE) >> PAGE_SHIFT);
 	if (err)
 		goto free_stab;
 
@@ -60,6 +60,7 @@ static struct bpf_map *sock_map_alloc(union bpf_attr *attr)
 	if (stab->sks)
 		return &stab->map;
 	err = -ENOMEM;
+	bpf_map_charge_finish(&stab->map.memory);
 free_stab:
 	kfree(stab);
 	return ERR_PTR(err);
-- 
cgit v1.2.3


From c85d69135a9175c50a823d04d62d932312d037b3 Mon Sep 17 00:00:00 2001
From: Roman Gushchin <guro@fb.com>
Date: Wed, 29 May 2019 18:03:59 -0700
Subject: bpf: move memory size checks to bpf_map_charge_init()

Most bpf map types do similar checks and bytes-to-pages
conversion during memory allocation and charging.

Let's unify these checks by moving them into bpf_map_charge_init().

Signed-off-by: Roman Gushchin <guro@fb.com>
Acked-by: Song Liu <songliubraving@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h           |  2 +-
 kernel/bpf/arraymap.c         |  8 +-------
 kernel/bpf/cpumap.c           |  5 +----
 kernel/bpf/devmap.c           |  5 +----
 kernel/bpf/hashtab.c          |  7 +------
 kernel/bpf/local_storage.c    |  5 +----
 kernel/bpf/lpm_trie.c         |  7 +------
 kernel/bpf/queue_stack_maps.c |  4 ----
 kernel/bpf/reuseport_array.c  | 10 ++--------
 kernel/bpf/stackmap.c         |  8 +-------
 kernel/bpf/syscall.c          |  9 +++++++--
 kernel/bpf/xskmap.c           |  5 +----
 net/core/bpf_sk_storage.c     |  4 +---
 net/core/sock_map.c           |  8 +-------
 14 files changed, 20 insertions(+), 67 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 3c8f24f402bf..e5a309e6a400 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -652,7 +652,7 @@ void bpf_map_put_with_uref(struct bpf_map *map);
 void bpf_map_put(struct bpf_map *map);
 int bpf_map_charge_memlock(struct bpf_map *map, u32 pages);
 void bpf_map_uncharge_memlock(struct bpf_map *map, u32 pages);
-int bpf_map_charge_init(struct bpf_map_memory *mem, u32 pages);
+int bpf_map_charge_init(struct bpf_map_memory *mem, size_t size);
 void bpf_map_charge_finish(struct bpf_map_memory *mem);
 void bpf_map_charge_move(struct bpf_map_memory *dst,
 			 struct bpf_map_memory *src);
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 3552da4407d9..0349cbf23cdb 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -117,14 +117,8 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
 
 	/* make sure there is no u32 overflow later in round_up() */
 	cost = array_size;
-	if (cost >= U32_MAX - PAGE_SIZE)
-		return ERR_PTR(-ENOMEM);
-	if (percpu) {
+	if (percpu)
 		cost += (u64)attr->max_entries * elem_size * num_possible_cpus();
-		if (cost >= U32_MAX - PAGE_SIZE)
-			return ERR_PTR(-ENOMEM);
-	}
-	cost = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
 
 	ret = bpf_map_charge_init(&mem, cost);
 	if (ret < 0)
diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
index c633c8d68023..b31a71909307 100644
--- a/kernel/bpf/cpumap.c
+++ b/kernel/bpf/cpumap.c
@@ -106,12 +106,9 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr)
 	/* make sure page count doesn't overflow */
 	cost = (u64) cmap->map.max_entries * sizeof(struct bpf_cpu_map_entry *);
 	cost += cpu_map_bitmap_size(attr) * num_possible_cpus();
-	if (cost >= U32_MAX - PAGE_SIZE)
-		goto free_cmap;
 
 	/* Notice returns -EPERM on if map size is larger than memlock limit */
-	ret = bpf_map_charge_init(&cmap->map.memory,
-				  round_up(cost, PAGE_SIZE) >> PAGE_SHIFT);
+	ret = bpf_map_charge_init(&cmap->map.memory, cost);
 	if (ret) {
 		err = ret;
 		goto free_cmap;
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index 371bd880ed58..5ae7cce5ef16 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -108,12 +108,9 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
 	/* make sure page count doesn't overflow */
 	cost = (u64) dtab->map.max_entries * sizeof(struct bpf_dtab_netdev *);
 	cost += dev_map_bitmap_size(attr) * num_possible_cpus();
-	if (cost >= U32_MAX - PAGE_SIZE)
-		goto free_dtab;
 
 	/* if map size is larger than memlock limit, reject it */
-	err = bpf_map_charge_init(&dtab->map.memory,
-				  round_up(cost, PAGE_SIZE) >> PAGE_SHIFT);
+	err = bpf_map_charge_init(&dtab->map.memory, cost);
 	if (err)
 		goto free_dtab;
 
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index b0bdc7b040ad..d92e05d9979b 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -360,13 +360,8 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
 	else
 	       cost += (u64) htab->elem_size * num_possible_cpus();
 
-	if (cost >= U32_MAX - PAGE_SIZE)
-		/* make sure page count doesn't overflow */
-		goto free_htab;
-
 	/* if map size is larger than memlock limit, reject it */
-	err = bpf_map_charge_init(&htab->map.memory,
-				  round_up(cost, PAGE_SIZE) >> PAGE_SHIFT);
+	err = bpf_map_charge_init(&htab->map.memory, cost);
 	if (err)
 		goto free_htab;
 
diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c
index e49bfd4f4f6d..addd6fdceec8 100644
--- a/kernel/bpf/local_storage.c
+++ b/kernel/bpf/local_storage.c
@@ -273,7 +273,6 @@ static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr)
 	int numa_node = bpf_map_attr_numa_node(attr);
 	struct bpf_cgroup_storage_map *map;
 	struct bpf_map_memory mem;
-	u32 pages;
 	int ret;
 
 	if (attr->key_size != sizeof(struct bpf_cgroup_storage_key))
@@ -293,9 +292,7 @@ static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr)
 		/* max_entries is not used and enforced to be 0 */
 		return ERR_PTR(-EINVAL);
 
-	pages = round_up(sizeof(struct bpf_cgroup_storage_map), PAGE_SIZE) >>
-		PAGE_SHIFT;
-	ret = bpf_map_charge_init(&mem, pages);
+	ret = bpf_map_charge_init(&mem, sizeof(struct bpf_cgroup_storage_map));
 	if (ret < 0)
 		return ERR_PTR(ret);
 
diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c
index 6345a8d2dcd0..09334f13a8a0 100644
--- a/kernel/bpf/lpm_trie.c
+++ b/kernel/bpf/lpm_trie.c
@@ -573,13 +573,8 @@ static struct bpf_map *trie_alloc(union bpf_attr *attr)
 	cost_per_node = sizeof(struct lpm_trie_node) +
 			attr->value_size + trie->data_size;
 	cost += (u64) attr->max_entries * cost_per_node;
-	if (cost >= U32_MAX - PAGE_SIZE) {
-		ret = -E2BIG;
-		goto out_err;
-	}
 
-	ret = bpf_map_charge_init(&trie->map.memory,
-				  round_up(cost, PAGE_SIZE) >> PAGE_SHIFT);
+	ret = bpf_map_charge_init(&trie->map.memory, cost);
 	if (ret)
 		goto out_err;
 
diff --git a/kernel/bpf/queue_stack_maps.c b/kernel/bpf/queue_stack_maps.c
index 224cb0fd8f03..f697647ceb54 100644
--- a/kernel/bpf/queue_stack_maps.c
+++ b/kernel/bpf/queue_stack_maps.c
@@ -73,10 +73,6 @@ static struct bpf_map *queue_stack_map_alloc(union bpf_attr *attr)
 
 	size = (u64) attr->max_entries + 1;
 	cost = queue_size = sizeof(*qs) + size * attr->value_size;
-	if (cost >= U32_MAX - PAGE_SIZE)
-		return ERR_PTR(-E2BIG);
-
-	cost = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
 
 	ret = bpf_map_charge_init(&mem, cost);
 	if (ret < 0)
diff --git a/kernel/bpf/reuseport_array.c b/kernel/bpf/reuseport_array.c
index 5c6e25b1b9b1..50c083ba978c 100644
--- a/kernel/bpf/reuseport_array.c
+++ b/kernel/bpf/reuseport_array.c
@@ -152,7 +152,7 @@ static struct bpf_map *reuseport_array_alloc(union bpf_attr *attr)
 	int err, numa_node = bpf_map_attr_numa_node(attr);
 	struct reuseport_array *array;
 	struct bpf_map_memory mem;
-	u64 cost, array_size;
+	u64 array_size;
 
 	if (!capable(CAP_SYS_ADMIN))
 		return ERR_PTR(-EPERM);
@@ -160,13 +160,7 @@ static struct bpf_map *reuseport_array_alloc(union bpf_attr *attr)
 	array_size = sizeof(*array);
 	array_size += (u64)attr->max_entries * sizeof(struct sock *);
 
-	/* make sure there is no u32 overflow later in round_up() */
-	cost = array_size;
-	if (cost >= U32_MAX - PAGE_SIZE)
-		return ERR_PTR(-ENOMEM);
-	cost = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
-
-	err = bpf_map_charge_init(&mem, cost);
+	err = bpf_map_charge_init(&mem, array_size);
 	if (err)
 		return ERR_PTR(err);
 
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index 8da24ca65d97..3d86072d8e32 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -117,14 +117,8 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
 	n_buckets = roundup_pow_of_two(attr->max_entries);
 
 	cost = n_buckets * sizeof(struct stack_map_bucket *) + sizeof(*smap);
-	if (cost >= U32_MAX - PAGE_SIZE)
-		return ERR_PTR(-E2BIG);
 	cost += n_buckets * (value_size + sizeof(struct stack_map_bucket));
-	if (cost >= U32_MAX - PAGE_SIZE)
-		return ERR_PTR(-E2BIG);
-
-	err = bpf_map_charge_init(&mem,
-				  round_up(cost, PAGE_SIZE) >> PAGE_SHIFT);
+	err = bpf_map_charge_init(&mem, cost);
 	if (err)
 		return ERR_PTR(err);
 
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 4a5ebad99154..4c53cbd3329d 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -205,11 +205,16 @@ static void bpf_uncharge_memlock(struct user_struct *user, u32 pages)
 		atomic_long_sub(pages, &user->locked_vm);
 }
 
-int bpf_map_charge_init(struct bpf_map_memory *mem, u32 pages)
+int bpf_map_charge_init(struct bpf_map_memory *mem, size_t size)
 {
-	struct user_struct *user = get_current_user();
+	u32 pages = round_up(size, PAGE_SIZE) >> PAGE_SHIFT;
+	struct user_struct *user;
 	int ret;
 
+	if (size >= U32_MAX - PAGE_SIZE)
+		return -E2BIG;
+
+	user = get_current_user();
 	ret = bpf_charge_memlock(user, pages);
 	if (ret) {
 		free_uid(user);
diff --git a/kernel/bpf/xskmap.c b/kernel/bpf/xskmap.c
index a329dab7c7a4..22066c28ba61 100644
--- a/kernel/bpf/xskmap.c
+++ b/kernel/bpf/xskmap.c
@@ -37,12 +37,9 @@ static struct bpf_map *xsk_map_alloc(union bpf_attr *attr)
 
 	cost = (u64)m->map.max_entries * sizeof(struct xdp_sock *);
 	cost += sizeof(struct list_head) * num_possible_cpus();
-	if (cost >= U32_MAX - PAGE_SIZE)
-		goto free_m;
 
 	/* Notice returns -EPERM on if map size is larger than memlock limit */
-	err = bpf_map_charge_init(&m->map.memory,
-				  round_up(cost, PAGE_SIZE) >> PAGE_SHIFT);
+	err = bpf_map_charge_init(&m->map.memory, cost);
 	if (err)
 		goto free_m;
 
diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c
index 621a0b07ff11..f40e3d35fd9c 100644
--- a/net/core/bpf_sk_storage.c
+++ b/net/core/bpf_sk_storage.c
@@ -626,7 +626,6 @@ static struct bpf_map *bpf_sk_storage_map_alloc(union bpf_attr *attr)
 	struct bpf_sk_storage_map *smap;
 	unsigned int i;
 	u32 nbuckets;
-	u32 pages;
 	u64 cost;
 	int ret;
 
@@ -638,9 +637,8 @@ static struct bpf_map *bpf_sk_storage_map_alloc(union bpf_attr *attr)
 	smap->bucket_log = ilog2(roundup_pow_of_two(num_possible_cpus()));
 	nbuckets = 1U << smap->bucket_log;
 	cost = sizeof(*smap->buckets) * nbuckets + sizeof(*smap);
-	pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
 
-	ret = bpf_map_charge_init(&smap->map.memory, pages);
+	ret = bpf_map_charge_init(&smap->map.memory, cost);
 	if (ret < 0) {
 		kfree(smap);
 		return ERR_PTR(ret);
diff --git a/net/core/sock_map.c b/net/core/sock_map.c
index 1028c922a149..52d4faeee18b 100644
--- a/net/core/sock_map.c
+++ b/net/core/sock_map.c
@@ -44,13 +44,7 @@ static struct bpf_map *sock_map_alloc(union bpf_attr *attr)
 
 	/* Make sure page count doesn't overflow. */
 	cost = (u64) stab->map.max_entries * sizeof(struct sock *);
-	if (cost >= U32_MAX - PAGE_SIZE) {
-		err = -EINVAL;
-		goto free_stab;
-	}
-
-	err = bpf_map_charge_init(&stab->map.memory,
-				  round_up(cost, PAGE_SIZE) >> PAGE_SHIFT);
+	err = bpf_map_charge_init(&stab->map.memory, cost);
 	if (err)
 		goto free_stab;
 
-- 
cgit v1.2.3


From 5213d7efc8ec26ed8938dce75427eff9275a62d9 Mon Sep 17 00:00:00 2001
From: Ruslan Babayev <ruslan@babayev.com>
Date: Tue, 28 May 2019 16:02:32 -0700
Subject: i2c: acpi: export i2c_acpi_find_adapter_by_handle

This allows drivers to look up i2c adapters on ACPI-based systems, similar to
of_get_i2c_adapter_by_node() on DT-based systems.

Signed-off-by: Ruslan Babayev <ruslan@babayev.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: Wolfram Sang <wsa@the-dreams.de>
---
 drivers/i2c/i2c-core-acpi.c | 3 ++-
 include/linux/i2c.h         | 6 ++++++
 2 files changed, 8 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/i2c/i2c-core-acpi.c b/drivers/i2c/i2c-core-acpi.c
index 272800692088..964687534754 100644
--- a/drivers/i2c/i2c-core-acpi.c
+++ b/drivers/i2c/i2c-core-acpi.c
@@ -337,7 +337,7 @@ static int i2c_acpi_find_match_device(struct device *dev, void *data)
 	return ACPI_COMPANION(dev) == data;
 }
 
-static struct i2c_adapter *i2c_acpi_find_adapter_by_handle(acpi_handle handle)
+struct i2c_adapter *i2c_acpi_find_adapter_by_handle(acpi_handle handle)
 {
 	struct device *dev;
 
@@ -345,6 +345,7 @@ static struct i2c_adapter *i2c_acpi_find_adapter_by_handle(acpi_handle handle)
 			      i2c_acpi_find_match_adapter);
 	return dev ? i2c_verify_adapter(dev) : NULL;
 }
+EXPORT_SYMBOL_GPL(i2c_acpi_find_adapter_by_handle);
 
 static struct i2c_client *i2c_acpi_find_client_by_adev(struct acpi_device *adev)
 {
diff --git a/include/linux/i2c.h b/include/linux/i2c.h
index 1308126fc384..e982b8913b73 100644
--- a/include/linux/i2c.h
+++ b/include/linux/i2c.h
@@ -14,6 +14,7 @@
 #ifndef _LINUX_I2C_H
 #define _LINUX_I2C_H
 
+#include <linux/acpi.h>		/* for acpi_handle */
 #include <linux/mod_devicetable.h>
 #include <linux/device.h>	/* for struct device */
 #include <linux/sched.h>	/* for completion */
@@ -981,6 +982,7 @@ bool i2c_acpi_get_i2c_resource(struct acpi_resource *ares,
 u32 i2c_acpi_find_bus_speed(struct device *dev);
 struct i2c_client *i2c_acpi_new_device(struct device *dev, int index,
 				       struct i2c_board_info *info);
+struct i2c_adapter *i2c_acpi_find_adapter_by_handle(acpi_handle handle);
 #else
 static inline bool i2c_acpi_get_i2c_resource(struct acpi_resource *ares,
 					     struct acpi_resource_i2c_serialbus **i2c)
@@ -996,6 +998,10 @@ static inline struct i2c_client *i2c_acpi_new_device(struct device *dev,
 {
 	return NULL;
 }
+static inline struct i2c_adapter *i2c_acpi_find_adapter_by_handle(acpi_handle handle)
+{
+	return NULL;
+}
 #endif /* CONFIG_ACPI */
 
 #endif /* _LINUX_I2C_H */
-- 
cgit v1.2.3


From ef11db3310e272d3d8dbe8739e0770820dd20e52 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Fri, 31 May 2019 18:27:04 +0200
Subject: net: inetdevice: provide replacement iterators for in_ifaddr walk

The ifa_list is protected either by rcu or rtnl lock, but the
current iterators do not account for this.

This adds two iterators as replacements; a later patch in
the series will update them with the needed rcu/rtnl_dereference calls.

It's not done in this patch yet to avoid sparse warnings -- the fields
lack the proper __rcu annotation.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/inetdevice.h | 10 +++++++++-
 net/ipv4/devinet.c         | 31 ++++++++++++++++---------------
 2 files changed, 25 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/inetdevice.h b/include/linux/inetdevice.h
index 367dc2a0f84a..d5d05503a04b 100644
--- a/include/linux/inetdevice.h
+++ b/include/linux/inetdevice.h
@@ -186,7 +186,7 @@ __be32 inet_confirm_addr(struct net *net, struct in_device *in_dev, __be32 dst,
 struct in_ifaddr *inet_ifa_byprefix(struct in_device *in_dev, __be32 prefix,
 				    __be32 mask);
 struct in_ifaddr *inet_lookup_ifaddr_rcu(struct net *net, __be32 addr);
-static __inline__ bool inet_ifa_match(__be32 addr, struct in_ifaddr *ifa)
+static inline bool inet_ifa_match(__be32 addr, const struct in_ifaddr *ifa)
 {
 	return !((addr^ifa->ifa_address)&ifa->ifa_mask);
 }
@@ -215,6 +215,14 @@ static __inline__ bool bad_mask(__be32 mask, __be32 addr)
 
 #define endfor_ifa(in_dev) }
 
+#define in_dev_for_each_ifa_rtnl(ifa, in_dev)			\
+	for (ifa = (in_dev)->ifa_list; ifa;			\
+	     ifa = ifa->ifa_next)
+
+#define in_dev_for_each_ifa_rcu(ifa, in_dev)			\
+	for (ifa = (in_dev)->ifa_list; ifa;			\
+	     ifa = ifa->ifa_next)
+
 static inline struct in_device *__in_dev_get_rcu(const struct net_device *dev)
 {
 	return rcu_dereference(dev->ip_ptr);
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index 701c5d113a34..7803a4d2951c 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -873,13 +873,12 @@ errout:
 static struct in_ifaddr *find_matching_ifa(struct in_ifaddr *ifa)
 {
 	struct in_device *in_dev = ifa->ifa_dev;
-	struct in_ifaddr *ifa1, **ifap;
+	struct in_ifaddr *ifa1;
 
 	if (!ifa->ifa_local)
 		return NULL;
 
-	for (ifap = &in_dev->ifa_list; (ifa1 = *ifap) != NULL;
-	     ifap = &ifa1->ifa_next) {
+	in_dev_for_each_ifa_rtnl(ifa1, in_dev) {
 		if (ifa1->ifa_mask == ifa->ifa_mask &&
 		    inet_ifa_match(ifa1->ifa_address, ifa) &&
 		    ifa1->ifa_local == ifa->ifa_local)
@@ -1208,7 +1207,7 @@ out:
 static int inet_gifconf(struct net_device *dev, char __user *buf, int len, int size)
 {
 	struct in_device *in_dev = __in_dev_get_rtnl(dev);
-	struct in_ifaddr *ifa;
+	const struct in_ifaddr *ifa;
 	struct ifreq ifr;
 	int done = 0;
 
@@ -1218,7 +1217,7 @@ static int inet_gifconf(struct net_device *dev, char __user *buf, int len, int s
 	if (!in_dev)
 		goto out;
 
-	for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next) {
+	in_dev_for_each_ifa_rtnl(ifa, in_dev) {
 		if (!buf) {
 			done += size;
 			continue;
@@ -1321,10 +1320,11 @@ EXPORT_SYMBOL(inet_select_addr);
 static __be32 confirm_addr_indev(struct in_device *in_dev, __be32 dst,
 			      __be32 local, int scope)
 {
-	int same = 0;
+	const struct in_ifaddr *ifa;
 	__be32 addr = 0;
+	int same = 0;
 
-	for_ifa(in_dev) {
+	in_dev_for_each_ifa_rcu(ifa, in_dev) {
 		if (!addr &&
 		    (local == ifa->ifa_local || !local) &&
 		    ifa->ifa_scope <= scope) {
@@ -1350,7 +1350,7 @@ static __be32 confirm_addr_indev(struct in_device *in_dev, __be32 dst,
 				same = 0;
 			}
 		}
-	} endfor_ifa(in_dev);
+	}
 
 	return same ? addr : 0;
 }
@@ -1424,7 +1424,7 @@ static void inetdev_changename(struct net_device *dev, struct in_device *in_dev)
 	struct in_ifaddr *ifa;
 	int named = 0;
 
-	for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next) {
+	in_dev_for_each_ifa_rtnl(ifa, in_dev) {
 		char old[IFNAMSIZ], *dot;
 
 		memcpy(old, ifa->ifa_label, IFNAMSIZ);
@@ -1454,10 +1454,9 @@ static void inetdev_send_gratuitous_arp(struct net_device *dev,
 					struct in_device *in_dev)
 
 {
-	struct in_ifaddr *ifa;
+	const struct in_ifaddr *ifa;
 
-	for (ifa = in_dev->ifa_list; ifa;
-	     ifa = ifa->ifa_next) {
+	in_dev_for_each_ifa_rtnl(ifa, in_dev) {
 		arp_send(ARPOP_REQUEST, ETH_P_ARP,
 			 ifa->ifa_local, dev,
 			 ifa->ifa_local, NULL,
@@ -1727,15 +1726,17 @@ static int in_dev_dump_addr(struct in_device *in_dev, struct sk_buff *skb,
 	int ip_idx = 0;
 	int err;
 
-	for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next, ip_idx++) {
-		if (ip_idx < s_ip_idx)
+	in_dev_for_each_ifa_rcu(ifa, in_dev) {
+		if (ip_idx < s_ip_idx) {
+			ip_idx++;
 			continue;
-
+		}
 		err = inet_fill_ifaddr(skb, ifa, fillargs);
 		if (err < 0)
 			goto done;
 
 		nl_dump_check_consistent(cb, nlmsg_hdr(skb));
+		ip_idx++;
 	}
 	err = 0;
 
-- 
cgit v1.2.3


From 2638eb8b50cfc16240e0bb080b9afbf541a9b39d Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Fri, 31 May 2019 18:27:09 +0200
Subject: net: ipv4: provide __rcu annotation for ifa_list

ifa_list is protected by rcu, yet code doesn't reflect this.

Add the __rcu annotations and fix up all places that are now reported by
sparse.

I've done this in the same commit to not add intermediate patches that
result in new warnings.

Reported-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/infiniband/hw/i40iw/i40iw_utils.c       | 12 ++--
 drivers/infiniband/hw/nes/nes.c                 |  8 ++-
 drivers/infiniband/hw/usnic/usnic_ib_main.c     | 15 +++--
 drivers/net/ethernet/via/via-velocity.h         |  2 +-
 drivers/net/plip/plip.c                         |  4 +-
 drivers/net/vmxnet3/vmxnet3_drv.c               | 19 ++++--
 drivers/net/wireless/ath/ath6kl/cfg80211.c      |  4 +-
 drivers/net/wireless/marvell/mwifiex/cfg80211.c |  2 +-
 drivers/staging/isdn/hysdn/hysdn_net.c          |  6 +-
 include/linux/inetdevice.h                      | 21 ++----
 net/core/netpoll.c                              | 10 ++-
 net/core/pktgen.c                               |  8 ++-
 net/ipv4/devinet.c                              | 88 ++++++++++++++++---------
 net/mac80211/main.c                             |  4 +-
 net/netfilter/nf_nat_redirect.c                 | 12 ++--
 15 files changed, 134 insertions(+), 81 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/infiniband/hw/i40iw/i40iw_utils.c b/drivers/infiniband/hw/i40iw/i40iw_utils.c
index 337410f40860..016524683e17 100644
--- a/drivers/infiniband/hw/i40iw/i40iw_utils.c
+++ b/drivers/infiniband/hw/i40iw/i40iw_utils.c
@@ -174,10 +174,14 @@ int i40iw_inetaddr_event(struct notifier_block *notifier,
 		rcu_read_lock();
 		in = __in_dev_get_rcu(upper_dev);
 
-		if (!in->ifa_list)
-			local_ipaddr = 0;
-		else
-			local_ipaddr = ntohl(in->ifa_list->ifa_address);
+		local_ipaddr = 0;
+		if (in) {
+			struct in_ifaddr *ifa;
+
+			ifa = rcu_dereference(in->ifa_list);
+			if (ifa)
+				local_ipaddr = ntohl(ifa->ifa_address);
+		}
 
 		rcu_read_unlock();
 	} else {
diff --git a/drivers/infiniband/hw/nes/nes.c b/drivers/infiniband/hw/nes/nes.c
index e00add6d78ec..29b324726ea6 100644
--- a/drivers/infiniband/hw/nes/nes.c
+++ b/drivers/infiniband/hw/nes/nes.c
@@ -183,7 +183,13 @@ static int nes_inetaddr_event(struct notifier_block *notifier,
 
 						rcu_read_lock();
 						in = __in_dev_get_rcu(upper_dev);
-						nesvnic->local_ipaddr = in->ifa_list->ifa_address;
+						if (in) {
+							struct in_ifaddr *ifa;
+
+							ifa = rcu_dereference(in->ifa_list);
+							if (ifa)
+								nesvnic->local_ipaddr = ifa->ifa_address;
+						}
 						rcu_read_unlock();
 					} else {
 						nesvnic->local_ipaddr = ifa->ifa_address;
diff --git a/drivers/infiniband/hw/usnic/usnic_ib_main.c b/drivers/infiniband/hw/usnic/usnic_ib_main.c
index d88d9f8a7f9a..34c1f9d6c915 100644
--- a/drivers/infiniband/hw/usnic/usnic_ib_main.c
+++ b/drivers/infiniband/hw/usnic/usnic_ib_main.c
@@ -427,11 +427,16 @@ static void *usnic_ib_device_add(struct pci_dev *dev)
 	if (netif_carrier_ok(us_ibdev->netdev))
 		usnic_fwd_carrier_up(us_ibdev->ufdev);
 
-	ind = in_dev_get(netdev);
-	if (ind->ifa_list)
-		usnic_fwd_add_ipaddr(us_ibdev->ufdev,
-				     ind->ifa_list->ifa_address);
-	in_dev_put(ind);
+	rcu_read_lock();
+	ind = __in_dev_get_rcu(netdev);
+	if (ind) {
+		const struct in_ifaddr *ifa;
+
+		ifa = rcu_dereference(ind->ifa_list);
+		if (ifa)
+			usnic_fwd_add_ipaddr(us_ibdev->ufdev, ifa->ifa_address);
+	}
+	rcu_read_unlock();
 
 	usnic_mac_ip_to_gid(us_ibdev->netdev->perm_addr,
 				us_ibdev->ufdev->inaddr, &gid.raw[0]);
diff --git a/drivers/net/ethernet/via/via-velocity.h b/drivers/net/ethernet/via/via-velocity.h
index c0ecc6c7b5e0..cdfe7809e3c1 100644
--- a/drivers/net/ethernet/via/via-velocity.h
+++ b/drivers/net/ethernet/via/via-velocity.h
@@ -1509,7 +1509,7 @@ static inline int velocity_get_ip(struct velocity_info *vptr)
 	rcu_read_lock();
 	in_dev = __in_dev_get_rcu(vptr->netdev);
 	if (in_dev != NULL) {
-		ifa = (struct in_ifaddr *) in_dev->ifa_list;
+		ifa = rcu_dereference(in_dev->ifa_list);
 		if (ifa != NULL) {
 			memcpy(vptr->ip_addr, &ifa->ifa_address, 4);
 			res = 0;
diff --git a/drivers/net/plip/plip.c b/drivers/net/plip/plip.c
index feb92ecd1880..3e3ac2e496a1 100644
--- a/drivers/net/plip/plip.c
+++ b/drivers/net/plip/plip.c
@@ -1012,7 +1012,7 @@ plip_rewrite_address(const struct net_device *dev, struct ethhdr *eth)
 	in_dev = __in_dev_get_rcu(dev);
 	if (in_dev) {
 		/* Any address will do - we take the first */
-		const struct in_ifaddr *ifa = in_dev->ifa_list;
+		const struct in_ifaddr *ifa = rcu_dereference(in_dev->ifa_list);
 		if (ifa) {
 			memcpy(eth->h_source, dev->dev_addr, ETH_ALEN);
 			memset(eth->h_dest, 0xfc, 2);
@@ -1107,7 +1107,7 @@ plip_open(struct net_device *dev)
 		/* Any address will do - we take the first. We already
 		   have the first two bytes filled with 0xfc, from
 		   plip_init_dev(). */
-		struct in_ifaddr *ifa=in_dev->ifa_list;
+		const struct in_ifaddr *ifa = rcu_dereference(in_dev->ifa_list);
 		if (ifa != NULL) {
 			memcpy(dev->dev_addr+2, &ifa->ifa_local, 4);
 		}
diff --git a/drivers/net/vmxnet3/vmxnet3_drv.c b/drivers/net/vmxnet3/vmxnet3_drv.c
index 89984fcab01e..1b2a18ea855c 100644
--- a/drivers/net/vmxnet3/vmxnet3_drv.c
+++ b/drivers/net/vmxnet3/vmxnet3_drv.c
@@ -3651,13 +3651,19 @@ vmxnet3_suspend(struct device *device)
 	}
 
 	if (adapter->wol & WAKE_ARP) {
-		in_dev = in_dev_get(netdev);
-		if (!in_dev)
+		rcu_read_lock();
+
+		in_dev = __in_dev_get_rcu(netdev);
+		if (!in_dev) {
+			rcu_read_unlock();
 			goto skip_arp;
+		}
 
-		ifa = (struct in_ifaddr *)in_dev->ifa_list;
-		if (!ifa)
+		ifa = rcu_dereference(in_dev->ifa_list);
+		if (!ifa) {
+			rcu_read_unlock();
 			goto skip_arp;
+		}
 
 		pmConf->filters[i].patternSize = ETH_HLEN + /* Ethernet header*/
 			sizeof(struct arphdr) +		/* ARP header */
@@ -3677,7 +3683,9 @@ vmxnet3_suspend(struct device *device)
 
 		/* The Unicast IPv4 address in 'tip' field. */
 		arpreq += 2 * ETH_ALEN + sizeof(u32);
-		*(u32 *)arpreq = ifa->ifa_address;
+		*(__be32 *)arpreq = ifa->ifa_address;
+
+		rcu_read_unlock();
 
 		/* The mask for the relevant bits. */
 		pmConf->filters[i].mask[0] = 0x00;
@@ -3686,7 +3694,6 @@ vmxnet3_suspend(struct device *device)
 		pmConf->filters[i].mask[3] = 0x00;
 		pmConf->filters[i].mask[4] = 0xC0; /* IPv4 TIP */
 		pmConf->filters[i].mask[5] = 0x03; /* IPv4 TIP */
-		in_dev_put(in_dev);
 
 		pmConf->wakeUpEvents |= VMXNET3_PM_WAKEUP_FILTER;
 		i++;
diff --git a/drivers/net/wireless/ath/ath6kl/cfg80211.c b/drivers/net/wireless/ath/ath6kl/cfg80211.c
index 5477a014e1fb..37cf602d8adf 100644
--- a/drivers/net/wireless/ath/ath6kl/cfg80211.c
+++ b/drivers/net/wireless/ath/ath6kl/cfg80211.c
@@ -2194,13 +2194,13 @@ static int ath6kl_wow_suspend_vif(struct ath6kl_vif *vif,
 	if (!in_dev)
 		return 0;
 
-	ifa = in_dev->ifa_list;
+	ifa = rtnl_dereference(in_dev->ifa_list);
 	memset(&ips, 0, sizeof(ips));
 
 	/* Configure IP addr only if IP address count < MAX_IP_ADDRS */
 	while (index < MAX_IP_ADDRS && ifa) {
 		ips[index] = ifa->ifa_local;
-		ifa = ifa->ifa_next;
+		ifa = rtnl_dereference(ifa->ifa_next);
 		index++;
 	}
 
diff --git a/drivers/net/wireless/marvell/mwifiex/cfg80211.c b/drivers/net/wireless/marvell/mwifiex/cfg80211.c
index e11a4bb67172..5a7cdb981789 100644
--- a/drivers/net/wireless/marvell/mwifiex/cfg80211.c
+++ b/drivers/net/wireless/marvell/mwifiex/cfg80211.c
@@ -3268,7 +3268,7 @@ static void mwifiex_set_auto_arp_mef_entry(struct mwifiex_private *priv,
 			in_dev = __in_dev_get_rtnl(adapter->priv[i]->netdev);
 			if (!in_dev)
 				continue;
-			ifa = in_dev->ifa_list;
+			ifa = rtnl_dereference(in_dev->ifa_list);
 			if (!ifa || !ifa->ifa_local)
 				continue;
 			ips[i] = ifa->ifa_local;
diff --git a/drivers/staging/isdn/hysdn/hysdn_net.c b/drivers/staging/isdn/hysdn/hysdn_net.c
index 8e9c34f33d86..bea37ae30ebb 100644
--- a/drivers/staging/isdn/hysdn/hysdn_net.c
+++ b/drivers/staging/isdn/hysdn/hysdn_net.c
@@ -70,9 +70,13 @@ net_open(struct net_device *dev)
 		for (i = 0; i < ETH_ALEN; i++)
 			dev->dev_addr[i] = 0xfc;
 		if ((in_dev = dev->ip_ptr) != NULL) {
-			struct in_ifaddr *ifa = in_dev->ifa_list;
+			const struct in_ifaddr *ifa;
+
+			rcu_read_lock();
+			ifa = rcu_dereference(in_dev->ifa_list);
 			if (ifa != NULL)
 				memcpy(dev->dev_addr + (ETH_ALEN - sizeof(ifa->ifa_local)), &ifa->ifa_local, sizeof(ifa->ifa_local));
+			rcu_read_unlock();
 		}
 	} else
 		memcpy(dev->dev_addr, card->mac_addr, ETH_ALEN);
diff --git a/include/linux/inetdevice.h b/include/linux/inetdevice.h
index d5d05503a04b..3515ca64e638 100644
--- a/include/linux/inetdevice.h
+++ b/include/linux/inetdevice.h
@@ -26,7 +26,7 @@ struct in_device {
 	struct net_device	*dev;
 	refcount_t		refcnt;
 	int			dead;
-	struct in_ifaddr	*ifa_list;	/* IP ifaddr chain		*/
+	struct in_ifaddr	__rcu *ifa_list;/* IP ifaddr chain		*/
 
 	struct ip_mc_list __rcu	*mc_list;	/* IP multicast filter chain    */
 	struct ip_mc_list __rcu	* __rcu *mc_hash;
@@ -136,7 +136,7 @@ static inline void ipv4_devconf_setall(struct in_device *in_dev)
 
 struct in_ifaddr {
 	struct hlist_node	hash;
-	struct in_ifaddr	*ifa_next;
+	struct in_ifaddr	__rcu *ifa_next;
 	struct in_device	*ifa_dev;
 	struct rcu_head		rcu_head;
 	__be32			ifa_local;
@@ -206,22 +206,13 @@ static __inline__ bool bad_mask(__be32 mask, __be32 addr)
 	return false;
 }
 
-#define for_primary_ifa(in_dev)	{ struct in_ifaddr *ifa; \
-  for (ifa = (in_dev)->ifa_list; ifa && !(ifa->ifa_flags&IFA_F_SECONDARY); ifa = ifa->ifa_next)
-
-#define for_ifa(in_dev)	{ struct in_ifaddr *ifa; \
-  for (ifa = (in_dev)->ifa_list; ifa; ifa = ifa->ifa_next)
-
-
-#define endfor_ifa(in_dev) }
-
 #define in_dev_for_each_ifa_rtnl(ifa, in_dev)			\
-	for (ifa = (in_dev)->ifa_list; ifa;			\
-	     ifa = ifa->ifa_next)
+	for (ifa = rtnl_dereference((in_dev)->ifa_list); ifa;	\
+	     ifa = rtnl_dereference(ifa->ifa_next))
 
 #define in_dev_for_each_ifa_rcu(ifa, in_dev)			\
-	for (ifa = (in_dev)->ifa_list; ifa;			\
-	     ifa = ifa->ifa_next)
+	for (ifa = rcu_dereference((in_dev)->ifa_list); ifa;	\
+	     ifa = rcu_dereference(ifa->ifa_next))
 
 static inline struct in_device *__in_dev_get_rcu(const struct net_device *dev)
 {
diff --git a/net/core/netpoll.c b/net/core/netpoll.c
index dd8b1a460d64..2cf27da1baeb 100644
--- a/net/core/netpoll.c
+++ b/net/core/netpoll.c
@@ -696,16 +696,22 @@ int netpoll_setup(struct netpoll *np)
 
 	if (!np->local_ip.ip) {
 		if (!np->ipv6) {
+			const struct in_ifaddr *ifa;
+
 			in_dev = __in_dev_get_rtnl(ndev);
+			if (!in_dev)
+				goto put_noaddr;
 
-			if (!in_dev || !in_dev->ifa_list) {
+			ifa = rtnl_dereference(in_dev->ifa_list);
+			if (!ifa) {
+put_noaddr:
 				np_err(np, "no IP address for %s, aborting\n",
 				       np->dev_name);
 				err = -EDESTADDRREQ;
 				goto put;
 			}
 
-			np->local_ip.ip = in_dev->ifa_list->ifa_local;
+			np->local_ip.ip = ifa->ifa_local;
 			np_info(np, "local IP %pI4\n", &np->local_ip.ip);
 		} else {
 #if IS_ENABLED(CONFIG_IPV6)
diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index 319ad5490fb3..4cd120dc30ad 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -2125,9 +2125,11 @@ static void pktgen_setup_inject(struct pktgen_dev *pkt_dev)
 			rcu_read_lock();
 			in_dev = __in_dev_get_rcu(pkt_dev->odev);
 			if (in_dev) {
-				if (in_dev->ifa_list) {
-					pkt_dev->saddr_min =
-					    in_dev->ifa_list->ifa_address;
+				const struct in_ifaddr *ifa;
+
+				ifa = rcu_dereference(in_dev->ifa_list);
+				if (ifa) {
+					pkt_dev->saddr_min = ifa->ifa_address;
 					pkt_dev->saddr_max = pkt_dev->saddr_min;
 				}
 			}
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index b45421b2b734..ebaea05b4033 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -194,7 +194,8 @@ static void rtmsg_ifa(int event, struct in_ifaddr *, struct nlmsghdr *, u32);
 
 static BLOCKING_NOTIFIER_HEAD(inetaddr_chain);
 static BLOCKING_NOTIFIER_HEAD(inetaddr_validator_chain);
-static void inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap,
+static void inet_del_ifa(struct in_device *in_dev,
+			 struct in_ifaddr __rcu **ifap,
 			 int destroy);
 #ifdef CONFIG_SYSCTL
 static int devinet_sysctl_register(struct in_device *idev);
@@ -300,8 +301,8 @@ static void in_dev_rcu_put(struct rcu_head *head)
 
 static void inetdev_destroy(struct in_device *in_dev)
 {
-	struct in_ifaddr *ifa;
 	struct net_device *dev;
+	struct in_ifaddr *ifa;
 
 	ASSERT_RTNL();
 
@@ -311,7 +312,7 @@ static void inetdev_destroy(struct in_device *in_dev)
 
 	ip_mc_destroy_dev(in_dev);
 
-	while ((ifa = in_dev->ifa_list) != NULL) {
+	while ((ifa = rtnl_dereference(in_dev->ifa_list)) != NULL) {
 		inet_del_ifa(in_dev, &in_dev->ifa_list, 0);
 		inet_free_ifa(ifa);
 	}
@@ -342,17 +343,20 @@ int inet_addr_onlink(struct in_device *in_dev, __be32 a, __be32 b)
 	return 0;
 }
 
-static void __inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap,
-			 int destroy, struct nlmsghdr *nlh, u32 portid)
+static void __inet_del_ifa(struct in_device *in_dev,
+			   struct in_ifaddr __rcu **ifap,
+			   int destroy, struct nlmsghdr *nlh, u32 portid)
 {
 	struct in_ifaddr *promote = NULL;
-	struct in_ifaddr *ifa, *ifa1 = *ifap;
-	struct in_ifaddr *last_prim = in_dev->ifa_list;
+	struct in_ifaddr *ifa, *ifa1;
+	struct in_ifaddr *last_prim;
 	struct in_ifaddr *prev_prom = NULL;
 	int do_promote = IN_DEV_PROMOTE_SECONDARIES(in_dev);
 
 	ASSERT_RTNL();
 
+	ifa1 = rtnl_dereference(*ifap);
+	last_prim = rtnl_dereference(in_dev->ifa_list);
 	if (in_dev->dead)
 		goto no_promotions;
 
@@ -361,9 +365,9 @@ static void __inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap,
 	 **/
 
 	if (!(ifa1->ifa_flags & IFA_F_SECONDARY)) {
-		struct in_ifaddr **ifap1 = &ifa1->ifa_next;
+		struct in_ifaddr __rcu **ifap1 = &ifa1->ifa_next;
 
-		while ((ifa = *ifap1) != NULL) {
+		while ((ifa = rtnl_dereference(*ifap1)) != NULL) {
 			if (!(ifa->ifa_flags & IFA_F_SECONDARY) &&
 			    ifa1->ifa_scope <= ifa->ifa_scope)
 				last_prim = ifa;
@@ -396,7 +400,7 @@ static void __inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap,
 	 * and later to add them back with new prefsrc. Do this
 	 * while all addresses are on the device list.
 	 */
-	for (ifa = promote; ifa; ifa = ifa->ifa_next) {
+	for (ifa = promote; ifa; ifa = rtnl_dereference(ifa->ifa_next)) {
 		if (ifa1->ifa_mask == ifa->ifa_mask &&
 		    inet_ifa_match(ifa1->ifa_address, ifa))
 			fib_del_ifaddr(ifa, ifa1);
@@ -422,19 +426,24 @@ no_promotions:
 	blocking_notifier_call_chain(&inetaddr_chain, NETDEV_DOWN, ifa1);
 
 	if (promote) {
-		struct in_ifaddr *next_sec = promote->ifa_next;
+		struct in_ifaddr *next_sec;
 
+		next_sec = rtnl_dereference(promote->ifa_next);
 		if (prev_prom) {
-			prev_prom->ifa_next = promote->ifa_next;
-			promote->ifa_next = last_prim->ifa_next;
-			last_prim->ifa_next = promote;
+			struct in_ifaddr *last_sec;
+
+			last_sec = rtnl_dereference(last_prim->ifa_next);
+			rcu_assign_pointer(prev_prom->ifa_next, next_sec);
+			rcu_assign_pointer(promote->ifa_next, last_sec);
+			rcu_assign_pointer(last_prim->ifa_next, promote);
 		}
 
 		promote->ifa_flags &= ~IFA_F_SECONDARY;
 		rtmsg_ifa(RTM_NEWADDR, promote, nlh, portid);
 		blocking_notifier_call_chain(&inetaddr_chain,
 				NETDEV_UP, promote);
-		for (ifa = next_sec; ifa; ifa = ifa->ifa_next) {
+		for (ifa = next_sec; ifa;
+		     ifa = rtnl_dereference(ifa->ifa_next)) {
 			if (ifa1->ifa_mask != ifa->ifa_mask ||
 			    !inet_ifa_match(ifa1->ifa_address, ifa))
 					continue;
@@ -446,7 +455,8 @@ no_promotions:
 		inet_free_ifa(ifa1);
 }
 
-static void inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap,
+static void inet_del_ifa(struct in_device *in_dev,
+			 struct in_ifaddr __rcu **ifap,
 			 int destroy)
 {
 	__inet_del_ifa(in_dev, ifap, destroy, NULL, 0);
@@ -459,9 +469,10 @@ static DECLARE_DELAYED_WORK(check_lifetime_work, check_lifetime);
 static int __inet_insert_ifa(struct in_ifaddr *ifa, struct nlmsghdr *nlh,
 			     u32 portid, struct netlink_ext_ack *extack)
 {
+	struct in_ifaddr __rcu **last_primary, **ifap;
 	struct in_device *in_dev = ifa->ifa_dev;
-	struct in_ifaddr *ifa1, **ifap, **last_primary;
 	struct in_validator_info ivi;
+	struct in_ifaddr *ifa1;
 	int ret;
 
 	ASSERT_RTNL();
@@ -474,8 +485,10 @@ static int __inet_insert_ifa(struct in_ifaddr *ifa, struct nlmsghdr *nlh,
 	ifa->ifa_flags &= ~IFA_F_SECONDARY;
 	last_primary = &in_dev->ifa_list;
 
-	for (ifap = &in_dev->ifa_list; (ifa1 = *ifap) != NULL;
-	     ifap = &ifa1->ifa_next) {
+	ifap = &in_dev->ifa_list;
+	ifa1 = rtnl_dereference(*ifap);
+
+	while (ifa1) {
 		if (!(ifa1->ifa_flags & IFA_F_SECONDARY) &&
 		    ifa->ifa_scope <= ifa1->ifa_scope)
 			last_primary = &ifa1->ifa_next;
@@ -491,6 +504,9 @@ static int __inet_insert_ifa(struct in_ifaddr *ifa, struct nlmsghdr *nlh,
 			}
 			ifa->ifa_flags |= IFA_F_SECONDARY;
 		}
+
+		ifap = &ifa1->ifa_next;
+		ifa1 = rtnl_dereference(*ifap);
 	}
 
 	/* Allow any devices that wish to register ifaddr validtors to weigh
@@ -516,8 +532,8 @@ static int __inet_insert_ifa(struct in_ifaddr *ifa, struct nlmsghdr *nlh,
 		ifap = last_primary;
 	}
 
-	ifa->ifa_next = *ifap;
-	*ifap = ifa;
+	rcu_assign_pointer(ifa->ifa_next, *ifap);
+	rcu_assign_pointer(*ifap, ifa);
 
 	inet_hash_insert(dev_net(in_dev->dev), ifa);
 
@@ -617,10 +633,12 @@ static int inet_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh,
 			    struct netlink_ext_ack *extack)
 {
 	struct net *net = sock_net(skb->sk);
+	struct in_ifaddr __rcu **ifap;
 	struct nlattr *tb[IFA_MAX+1];
 	struct in_device *in_dev;
 	struct ifaddrmsg *ifm;
-	struct in_ifaddr *ifa, **ifap;
+	struct in_ifaddr *ifa;
+
 	int err = -EINVAL;
 
 	ASSERT_RTNL();
@@ -637,7 +655,7 @@ static int inet_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh,
 		goto errout;
 	}
 
-	for (ifap = &in_dev->ifa_list; (ifa = *ifap) != NULL;
+	for (ifap = &in_dev->ifa_list; (ifa = rtnl_dereference(*ifap)) != NULL;
 	     ifap = &ifa->ifa_next) {
 		if (tb[IFA_LOCAL] &&
 		    ifa->ifa_local != nla_get_in_addr(tb[IFA_LOCAL]))
@@ -725,15 +743,20 @@ static void check_lifetime(struct work_struct *work)
 
 			if (ifa->ifa_valid_lft != INFINITY_LIFE_TIME &&
 			    age >= ifa->ifa_valid_lft) {
-				struct in_ifaddr **ifap;
-
-				for (ifap = &ifa->ifa_dev->ifa_list;
-				     *ifap != NULL; ifap = &(*ifap)->ifa_next) {
-					if (*ifap == ifa) {
+				struct in_ifaddr __rcu **ifap;
+				struct in_ifaddr *tmp;
+
+				ifap = &ifa->ifa_dev->ifa_list;
+				tmp = rtnl_dereference(*ifap);
+				while (tmp) {
+					tmp = rtnl_dereference(tmp->ifa_next);
+					if (rtnl_dereference(*ifap) == ifa) {
 						inet_del_ifa(ifa->ifa_dev,
 							     ifap, 1);
 						break;
 					}
+					ifap = &tmp->ifa_next;
+					tmp = rtnl_dereference(*ifap);
 				}
 			} else if (ifa->ifa_preferred_lft !=
 				   INFINITY_LIFE_TIME &&
@@ -977,8 +1000,8 @@ int devinet_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr)
 {
 	struct sockaddr_in sin_orig;
 	struct sockaddr_in *sin = (struct sockaddr_in *)&ifr->ifr_addr;
+	struct in_ifaddr __rcu **ifap = NULL;
 	struct in_device *in_dev;
-	struct in_ifaddr **ifap = NULL;
 	struct in_ifaddr *ifa = NULL;
 	struct net_device *dev;
 	char *colon;
@@ -1049,7 +1072,9 @@ int devinet_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr)
 			/* note: we only do this for a limited set of ioctls
 			   and only if the original address family was AF_INET.
 			   This is checked above. */
-			for (ifap = &in_dev->ifa_list; (ifa = *ifap) != NULL;
+
+			for (ifap = &in_dev->ifa_list;
+			     (ifa = rtnl_dereference(*ifap)) != NULL;
 			     ifap = &ifa->ifa_next) {
 				if (!strcmp(ifr->ifr_name, ifa->ifa_label) &&
 				    sin_orig.sin_addr.s_addr ==
@@ -1062,7 +1087,8 @@ int devinet_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr)
 		   4.3BSD-style and passed in junk so we fall back to
 		   comparing just the label */
 		if (!ifa) {
-			for (ifap = &in_dev->ifa_list; (ifa = *ifap) != NULL;
+			for (ifap = &in_dev->ifa_list;
+			     (ifa = rtnl_dereference(*ifap)) != NULL;
 			     ifap = &ifa->ifa_next)
 				if (!strcmp(ifr->ifr_name, ifa->ifa_label))
 					break;
diff --git a/net/mac80211/main.c b/net/mac80211/main.c
index 2b608044ae23..1f11907dc528 100644
--- a/net/mac80211/main.c
+++ b/net/mac80211/main.c
@@ -354,11 +354,11 @@ static int ieee80211_ifa_changed(struct notifier_block *nb,
 	sdata_lock(sdata);
 
 	/* Copy the addresses to the bss_conf list */
-	ifa = idev->ifa_list;
+	ifa = rtnl_dereference(idev->ifa_list);
 	while (ifa) {
 		if (c < IEEE80211_BSS_ARP_ADDR_LIST_LEN)
 			bss_conf->arp_addr_list[c] = ifa->ifa_address;
-		ifa = ifa->ifa_next;
+		ifa = rtnl_dereference(ifa->ifa_next);
 		c++;
 	}
 
diff --git a/net/netfilter/nf_nat_redirect.c b/net/netfilter/nf_nat_redirect.c
index 78a9e6454ff3..8598e80968e0 100644
--- a/net/netfilter/nf_nat_redirect.c
+++ b/net/netfilter/nf_nat_redirect.c
@@ -47,15 +47,17 @@ nf_nat_redirect_ipv4(struct sk_buff *skb,
 	if (hooknum == NF_INET_LOCAL_OUT) {
 		newdst = htonl(0x7F000001);
 	} else {
-		struct in_device *indev;
-		struct in_ifaddr *ifa;
+		const struct in_device *indev;
 
 		newdst = 0;
 
 		indev = __in_dev_get_rcu(skb->dev);
-		if (indev && indev->ifa_list) {
-			ifa = indev->ifa_list;
-			newdst = ifa->ifa_local;
+		if (indev) {
+			const struct in_ifaddr *ifa;
+
+			ifa = rcu_dereference(indev->ifa_list);
+			if (ifa)
+				newdst = ifa->ifa_local;
 		}
 
 		if (!newdst)
-- 
cgit v1.2.3


From 80488a6b1d3c3509b69d38d7c5ac7615889ea7e0 Mon Sep 17 00:00:00 2001
From: Heikki Krogerus <heikki.krogerus@linux.intel.com>
Date: Fri, 31 May 2019 17:15:34 +0300
Subject: software node: Add support for static node descriptors

Until now, software nodes could only be created
dynamically with the fwnode_create_software_node()
function. This introduces the struct software_node data
structure, which makes it possible to also describe
software nodes statically.

Statically described software nodes can be registered
with a new function, software_node_register(). This
also adds a helper, software_node_register_nodes(),
which makes it possible to register an array of struct
software_node, i.e. multiple nodes at the same time.

There is no difference between statically described and
dynamically allocated software nodes. Even the registration
does not differ, except that during node creation the device
properties are only copied if the node is created
dynamically. With statically described nodes, the property
entries in the descriptor (struct software_node) are
assigned directly to the new software node that is being
created without any copies.

Signed-off-by: Heikki Krogerus <heikki.krogerus@linux.intel.com>
Tested-by: Hans de Goede <hdegoede@redhat.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/base/swnode.c    | 256 +++++++++++++++++++++++++++++++++++------------
 include/linux/property.h |  19 ++++
 2 files changed, 212 insertions(+), 63 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/base/swnode.c b/drivers/base/swnode.c
index 7b321bf8424c..ef1a48fec718 100644
--- a/drivers/base/swnode.c
+++ b/drivers/base/swnode.c
@@ -11,25 +11,25 @@
 #include <linux/property.h>
 #include <linux/slab.h>
 
-struct software_node {
+struct swnode {
 	int id;
 	struct kobject kobj;
 	struct fwnode_handle fwnode;
+	const struct software_node *node;
 
 	/* hierarchy */
 	struct ida child_ids;
 	struct list_head entry;
 	struct list_head children;
-	struct software_node *parent;
+	struct swnode *parent;
 
-	/* properties */
-	const struct property_entry *properties;
+	unsigned int allocated:1;
 };
 
 static DEFINE_IDA(swnode_root_ids);
 static struct kset *swnode_kset;
 
-#define kobj_to_swnode(_kobj_) container_of(_kobj_, struct software_node, kobj)
+#define kobj_to_swnode(_kobj_) container_of(_kobj_, struct swnode, kobj)
 
 static const struct fwnode_operations software_node_ops;
 
@@ -37,17 +37,56 @@ bool is_software_node(const struct fwnode_handle *fwnode)
 {
 	return !IS_ERR_OR_NULL(fwnode) && fwnode->ops == &software_node_ops;
 }
+EXPORT_SYMBOL_GPL(is_software_node);
 
-#define to_software_node(__fwnode)					\
+#define to_swnode(__fwnode)						\
 	({								\
-		typeof(__fwnode) __to_software_node_fwnode = __fwnode;	\
+		typeof(__fwnode) __to_swnode_fwnode = __fwnode;		\
 									\
-		is_software_node(__to_software_node_fwnode) ?		\
-			container_of(__to_software_node_fwnode,		\
-				     struct software_node, fwnode) :	\
-			NULL;						\
+		is_software_node(__to_swnode_fwnode) ?			\
+			container_of(__to_swnode_fwnode,		\
+				     struct swnode, fwnode) : NULL;	\
 	})
 
+static struct swnode *
+software_node_to_swnode(const struct software_node *node)
+{
+	struct swnode *swnode;
+	struct kobject *k;
+
+	if (!node)
+		return NULL;
+
+	spin_lock(&swnode_kset->list_lock);
+
+	list_for_each_entry(k, &swnode_kset->list, entry) {
+		swnode = kobj_to_swnode(k);
+		if (swnode->node == node)
+			break;
+		swnode = NULL;
+	}
+
+	spin_unlock(&swnode_kset->list_lock);
+
+	return swnode;
+}
+
+const struct software_node *to_software_node(struct fwnode_handle *fwnode)
+{
+	struct swnode *swnode = to_swnode(fwnode);
+
+	return swnode ? swnode->node : NULL;
+}
+EXPORT_SYMBOL_GPL(to_software_node);
+
+struct fwnode_handle *software_node_fwnode(const struct software_node *node)
+{
+	struct swnode *swnode = software_node_to_swnode(node);
+
+	return swnode ? &swnode->fwnode : NULL;
+}
+EXPORT_SYMBOL_GPL(software_node_fwnode);
+
 /* -------------------------------------------------------------------------- */
 /* property_entry processing */
 
@@ -433,7 +472,7 @@ EXPORT_SYMBOL_GPL(property_entries_free);
 
 static struct fwnode_handle *software_node_get(struct fwnode_handle *fwnode)
 {
-	struct software_node *swnode = to_software_node(fwnode);
+	struct swnode *swnode = to_swnode(fwnode);
 
 	kobject_get(&swnode->kobj);
 
@@ -442,7 +481,7 @@ static struct fwnode_handle *software_node_get(struct fwnode_handle *fwnode)
 
 static void software_node_put(struct fwnode_handle *fwnode)
 {
-	struct software_node *swnode = to_software_node(fwnode);
+	struct swnode *swnode = to_swnode(fwnode);
 
 	kobject_put(&swnode->kobj);
 }
@@ -450,8 +489,9 @@ static void software_node_put(struct fwnode_handle *fwnode)
 static bool software_node_property_present(const struct fwnode_handle *fwnode,
 					   const char *propname)
 {
-	return !!property_entry_get(to_software_node(fwnode)->properties,
-				    propname);
+	struct swnode *swnode = to_swnode(fwnode);
+
+	return !!property_entry_get(swnode->node->properties, propname);
 }
 
 static int software_node_read_int_array(const struct fwnode_handle *fwnode,
@@ -459,9 +499,9 @@ static int software_node_read_int_array(const struct fwnode_handle *fwnode,
 					unsigned int elem_size, void *val,
 					size_t nval)
 {
-	struct software_node *swnode = to_software_node(fwnode);
+	struct swnode *swnode = to_swnode(fwnode);
 
-	return property_entry_read_int_array(swnode->properties, propname,
+	return property_entry_read_int_array(swnode->node->properties, propname,
 					     elem_size, val, nval);
 }
 
@@ -469,27 +509,26 @@ static int software_node_read_string_array(const struct fwnode_handle *fwnode,
 					   const char *propname,
 					   const char **val, size_t nval)
 {
-	struct software_node *swnode = to_software_node(fwnode);
+	struct swnode *swnode = to_swnode(fwnode);
 
-	return property_entry_read_string_array(swnode->properties, propname,
-						val, nval);
+	return property_entry_read_string_array(swnode->node->properties,
+						propname, val, nval);
 }
 
 static struct fwnode_handle *
 software_node_get_parent(const struct fwnode_handle *fwnode)
 {
-	struct software_node *swnode = to_software_node(fwnode);
+	struct swnode *swnode = to_swnode(fwnode);
 
-	return swnode ? (swnode->parent ? &swnode->parent->fwnode : NULL) :
-			NULL;
+	return swnode ? (swnode->parent ? &swnode->parent->fwnode : NULL) : NULL;
 }
 
 static struct fwnode_handle *
 software_node_get_next_child(const struct fwnode_handle *fwnode,
 			     struct fwnode_handle *child)
 {
-	struct software_node *p = to_software_node(fwnode);
-	struct software_node *c = to_software_node(child);
+	struct swnode *p = to_swnode(fwnode);
+	struct swnode *c = to_swnode(child);
 
 	if (!p || list_empty(&p->children) ||
 	    (c && list_is_last(&c->entry, &p->children)))
@@ -498,7 +537,7 @@ software_node_get_next_child(const struct fwnode_handle *fwnode,
 	if (c)
 		c = list_next_entry(c, entry);
 	else
-		c = list_first_entry(&p->children, struct software_node, entry);
+		c = list_first_entry(&p->children, struct swnode, entry);
 	return &c->fwnode;
 }
 
@@ -506,15 +545,15 @@ static struct fwnode_handle *
 software_node_get_named_child_node(const struct fwnode_handle *fwnode,
 				   const char *childname)
 {
-	struct software_node *swnode = to_software_node(fwnode);
+	struct swnode *swnode = to_swnode(fwnode);
 	const struct property_entry *prop;
-	struct software_node *child;
+	struct swnode *child;
 
 	if (!swnode || list_empty(&swnode->children))
 		return NULL;
 
 	list_for_each_entry(child, &swnode->children, entry) {
-		prop = property_entry_get(child->properties, "name");
+		prop = property_entry_get(child->node->properties, "name");
 		if (!prop)
 			continue;
 		if (!strcmp(childname, prop->value.str)) {
@@ -539,7 +578,7 @@ static const struct fwnode_operations software_node_ops = {
 /* -------------------------------------------------------------------------- */
 
 static int
-software_node_register_properties(struct software_node *swnode,
+software_node_register_properties(struct software_node *node,
 				  const struct property_entry *properties)
 {
 	struct property_entry *props;
@@ -548,17 +587,20 @@ software_node_register_properties(struct software_node *swnode,
 	if (IS_ERR(props))
 		return PTR_ERR(props);
 
-	swnode->properties = props;
+	node->properties = props;
 
 	return 0;
 }
 
 static void software_node_release(struct kobject *kobj)
 {
-	struct software_node *swnode = kobj_to_swnode(kobj);
+	struct swnode *swnode = kobj_to_swnode(kobj);
 
+	if (swnode->allocated) {
+		property_entries_free(swnode->node->properties);
+		kfree(swnode->node);
+	}
 	ida_destroy(&swnode->child_ids);
-	property_entries_free(swnode->properties);
 	kfree(swnode);
 }
 
@@ -567,66 +609,154 @@ static struct kobj_type software_node_type = {
 	.sysfs_ops = &kobj_sysfs_ops,
 };
 
-struct fwnode_handle *
-fwnode_create_software_node(const struct property_entry *properties,
-			    const struct fwnode_handle *parent)
+static struct fwnode_handle *
+swnode_register(const struct software_node *node, struct swnode *parent,
+		unsigned int allocated)
 {
-	struct software_node *p = NULL;
-	struct software_node *swnode;
+	struct swnode *swnode;
 	int ret;
 
-	if (parent) {
-		if (IS_ERR(parent))
-			return ERR_CAST(parent);
-		if (!is_software_node(parent))
-			return ERR_PTR(-EINVAL);
-		p = to_software_node(parent);
-	}
-
 	swnode = kzalloc(sizeof(*swnode), GFP_KERNEL);
-	if (!swnode)
-		return ERR_PTR(-ENOMEM);
+	if (!swnode) {
+		ret = -ENOMEM;
+		goto out_err;
+	}
 
-	ret = ida_simple_get(p ? &p->child_ids : &swnode_root_ids, 0, 0,
-			     GFP_KERNEL);
+	ret = ida_simple_get(parent ? &parent->child_ids : &swnode_root_ids,
+			     0, 0, GFP_KERNEL);
 	if (ret < 0) {
 		kfree(swnode);
-		return ERR_PTR(ret);
+		goto out_err;
 	}
 
 	swnode->id = ret;
+	swnode->node = node;
+	swnode->parent = parent;
+	swnode->allocated = allocated;
 	swnode->kobj.kset = swnode_kset;
 	swnode->fwnode.ops = &software_node_ops;
 
 	ida_init(&swnode->child_ids);
 	INIT_LIST_HEAD(&swnode->entry);
 	INIT_LIST_HEAD(&swnode->children);
-	swnode->parent = p;
 
-	ret = kobject_init_and_add(&swnode->kobj, &software_node_type,
-				   p ? &p->kobj : NULL, "node%d", swnode->id);
+	if (node->name)
+		ret = kobject_init_and_add(&swnode->kobj, &software_node_type,
+					   parent ? &parent->kobj : NULL,
+					   "%s", node->name);
+	else
+		ret = kobject_init_and_add(&swnode->kobj, &software_node_type,
+					   parent ? &parent->kobj : NULL,
+					   "node%d", swnode->id);
 	if (ret) {
 		kobject_put(&swnode->kobj);
 		return ERR_PTR(ret);
 	}
 
-	ret = software_node_register_properties(swnode, properties);
+	if (parent)
+		list_add_tail(&swnode->entry, &parent->children);
+
+	kobject_uevent(&swnode->kobj, KOBJ_ADD);
+	return &swnode->fwnode;
+
+out_err:
+	if (allocated)
+		property_entries_free(node->properties);
+	return ERR_PTR(ret);
+}
+
+/**
+ * software_node_register_nodes - Register an array of software nodes
+ * @nodes: Zero terminated array of software nodes to be registered
+ *
+ * Register multiple software nodes at once.
+ */
+int software_node_register_nodes(const struct software_node *nodes)
+{
+	int ret;
+	int i;
+
+	for (i = 0; nodes[i].name; i++) {
+		ret = software_node_register(&nodes[i]);
+		if (ret) {
+			software_node_unregister_nodes(nodes);
+			return ret;
+		}
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(software_node_register_nodes);
+
+/**
+ * software_node_unregister_nodes - Unregister an array of software nodes
+ * @nodes: Zero terminated array of software nodes to be unregistered
+ *
+ * Unregister multiple software nodes at once.
+ */
+void software_node_unregister_nodes(const struct software_node *nodes)
+{
+	struct swnode *swnode;
+	int i;
+
+	for (i = 0; nodes[i].name; i++) {
+		swnode = software_node_to_swnode(&nodes[i]);
+		if (swnode)
+			fwnode_remove_software_node(&swnode->fwnode);
+	}
+}
+EXPORT_SYMBOL_GPL(software_node_unregister_nodes);
+
+/**
+ * software_node_register - Register static software node
+ * @node: The software node to be registered
+ */
+int software_node_register(const struct software_node *node)
+{
+	struct swnode *parent = software_node_to_swnode(node->parent);
+
+	if (software_node_to_swnode(node))
+		return -EEXIST;
+
+	return PTR_ERR_OR_ZERO(swnode_register(node, parent, 0));
+}
+EXPORT_SYMBOL_GPL(software_node_register);
+
+struct fwnode_handle *
+fwnode_create_software_node(const struct property_entry *properties,
+			    const struct fwnode_handle *parent)
+{
+	struct software_node *node;
+	struct swnode *p = NULL;
+	int ret;
+
+	if (parent) {
+		if (IS_ERR(parent))
+			return ERR_CAST(parent);
+		if (!is_software_node(parent))
+			return ERR_PTR(-EINVAL);
+		p = to_swnode(parent);
+	}
+
+	node = kzalloc(sizeof(*node), GFP_KERNEL);
+	if (!node)
+		return ERR_PTR(-ENOMEM);
+
+	ret = software_node_register_properties(node, properties);
 	if (ret) {
-		kobject_put(&swnode->kobj);
+		kfree(node);
 		return ERR_PTR(ret);
 	}
 
-	if (p)
-		list_add_tail(&swnode->entry, &p->children);
+	node->parent = p ? p->node : NULL;
 
-	kobject_uevent(&swnode->kobj, KOBJ_ADD);
-	return &swnode->fwnode;
+	return swnode_register(node, p, 1);
 }
 EXPORT_SYMBOL_GPL(fwnode_create_software_node);
 
 void fwnode_remove_software_node(struct fwnode_handle *fwnode)
 {
-	struct software_node *swnode = to_software_node(fwnode);
+	struct swnode *swnode = to_swnode(fwnode);
 
 	if (!swnode)
 		return;
@@ -645,7 +775,7 @@ EXPORT_SYMBOL_GPL(fwnode_remove_software_node);
 int software_node_notify(struct device *dev, unsigned long action)
 {
 	struct fwnode_handle *fwnode = dev_fwnode(dev);
-	struct software_node *swnode;
+	struct swnode *swnode;
 	int ret;
 
 	if (!fwnode)
@@ -656,7 +786,7 @@ int software_node_notify(struct device *dev, unsigned long action)
 	if (!is_software_node(fwnode))
 		return 0;
 
-	swnode = to_software_node(fwnode);
+	swnode = to_swnode(fwnode);
 
 	switch (action) {
 	case KOBJ_ADD:
diff --git a/include/linux/property.h b/include/linux/property.h
index a29369c89e6e..a3813ded52ea 100644
--- a/include/linux/property.h
+++ b/include/linux/property.h
@@ -332,7 +332,26 @@ int fwnode_graph_parse_endpoint(const struct fwnode_handle *fwnode,
 /* -------------------------------------------------------------------------- */
 /* Software fwnode support - when HW description is incomplete or missing */
 
+/**
+ * struct software_node - Software node description
+ * @name: Name of the software node
+ * @parent: Parent of the software node
+ * @properties: Array of device properties
+ */
+struct software_node {
+	const char *name;
+	const struct software_node *parent;
+	const struct property_entry *properties;
+};
+
 bool is_software_node(const struct fwnode_handle *fwnode);
+const struct software_node *to_software_node(struct fwnode_handle *fwnode);
+struct fwnode_handle *software_node_fwnode(const struct software_node *node);
+
+int software_node_register_nodes(const struct software_node *nodes);
+void software_node_unregister_nodes(const struct software_node *nodes);
+
+int software_node_register(const struct software_node *node);
 
 int software_node_notify(struct device *dev, unsigned long action);
 
-- 
cgit v1.2.3


From b06184acf751fa52a3763e4fadfd2807e9703acd Mon Sep 17 00:00:00 2001
From: Heikki Krogerus <heikki.krogerus@linux.intel.com>
Date: Fri, 31 May 2019 17:15:36 +0300
Subject: software node: Add software_node_get_reference_args()

This makes it possible to support drivers that use the
fwnode_property_get_reference_args() function.

Signed-off-by: Heikki Krogerus <heikki.krogerus@linux.intel.com>
Tested-by: Hans de Goede <hdegoede@redhat.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/base/swnode.c    | 47 +++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/property.h | 28 ++++++++++++++++++++++++++++
 2 files changed, 75 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/base/swnode.c b/drivers/base/swnode.c
index 2d925fc2255f..e7b3aa3bd55a 100644
--- a/drivers/base/swnode.c
+++ b/drivers/base/swnode.c
@@ -560,6 +560,52 @@ software_node_get_named_child_node(const struct fwnode_handle *fwnode,
 	return NULL;
 }
 
+static int
+software_node_get_reference_args(const struct fwnode_handle *fwnode,
+				 const char *propname, const char *nargs_prop,
+				 unsigned int nargs, unsigned int index,
+				 struct fwnode_reference_args *args)
+{
+	struct swnode *swnode = to_swnode(fwnode);
+	const struct software_node_reference *ref;
+	const struct property_entry *prop;
+	struct fwnode_handle *refnode;
+	int i;
+
+	if (!swnode || !swnode->node->references)
+		return -ENOENT;
+
+	for (ref = swnode->node->references; ref->name; ref++)
+		if (!strcmp(ref->name, propname))
+			break;
+
+	if (!ref->name || index > (ref->nrefs - 1))
+		return -ENOENT;
+
+	refnode = software_node_fwnode(ref->refs[index].node);
+	if (!refnode)
+		return -ENOENT;
+
+	if (nargs_prop) {
+		prop = property_entry_get(swnode->node->properties, nargs_prop);
+		if (!prop)
+			return -EINVAL;
+
+		nargs = prop->value.u32_data;
+	}
+
+	if (nargs > NR_FWNODE_REFERENCE_ARGS)
+		return -EINVAL;
+
+	args->fwnode = software_node_get(refnode);
+	args->nargs = nargs;
+
+	for (i = 0; i < nargs; i++)
+		args->args[i] = ref->refs[index].args[i];
+
+	return 0;
+}
+
 static const struct fwnode_operations software_node_ops = {
 	.get = software_node_get,
 	.put = software_node_put,
@@ -569,6 +615,7 @@ static const struct fwnode_operations software_node_ops = {
 	.get_parent = software_node_get_parent,
 	.get_next_child_node = software_node_get_next_child,
 	.get_named_child_node = software_node_get_named_child_node,
+	.get_reference_args = software_node_get_reference_args
 };
 
 /* -------------------------------------------------------------------------- */
diff --git a/include/linux/property.h b/include/linux/property.h
index a3813ded52ea..abcde2f236a0 100644
--- a/include/linux/property.h
+++ b/include/linux/property.h
@@ -332,16 +332,44 @@ int fwnode_graph_parse_endpoint(const struct fwnode_handle *fwnode,
 /* -------------------------------------------------------------------------- */
 /* Software fwnode support - when HW description is incomplete or missing */
 
+struct software_node;
+
+/**
+ * struct software_node_ref_args - Reference with additional arguments
+ * @node: Reference to a software node
+ * @nargs: Number of elements in @args array
+ * @args: Integer arguments
+ */
+struct software_node_ref_args {
+	const struct software_node *node;
+	unsigned int nargs;
+	u64 args[NR_FWNODE_REFERENCE_ARGS];
+};
+
+/**
+ * struct software_node_reference - Named software node reference property
+ * @name: Name of the property
+ * @nrefs: Number of elements in @refs array
+ * @refs: Array of references with optional arguments
+ */
+struct software_node_reference {
+	const char *name;
+	unsigned int nrefs;
+	const struct software_node_ref_args *refs;
+};
+
 /**
  * struct software_node - Software node description
  * @name: Name of the software node
  * @parent: Parent of the software node
  * @properties: Array of device properties
+ * @references: Array of software node reference properties
  */
 struct software_node {
 	const char *name;
 	const struct software_node *parent;
 	const struct property_entry *properties;
+	const struct software_node_reference *references;
 };
 
 bool is_software_node(const struct fwnode_handle *fwnode);
-- 
cgit v1.2.3


From dad9bb017865ae794b6cdfac40d60b1466a09195 Mon Sep 17 00:00:00 2001
From: Heikki Krogerus <heikki.krogerus@linux.intel.com>
Date: Fri, 31 May 2019 17:15:37 +0300
Subject: driver core: Add helper device_find_child_by_name()

It looks like the child device is often matched with a name.
This introduces a helper that does it automatically.

Signed-off-by: Heikki Krogerus <heikki.krogerus@linux.intel.com>
Acked-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Tested-by: Hans de Goede <hdegoede@redhat.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/base/core.c    | 28 ++++++++++++++++++++++++++++
 include/linux/device.h |  2 ++
 2 files changed, 30 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/base/core.c b/drivers/base/core.c
index fd7511e04e62..b4c64528f13c 100644
--- a/drivers/base/core.c
+++ b/drivers/base/core.c
@@ -2474,6 +2474,34 @@ struct device *device_find_child(struct device *parent, void *data,
 }
 EXPORT_SYMBOL_GPL(device_find_child);
 
+/**
+ * device_find_child_by_name - device iterator for locating a child device.
+ * @parent: parent struct device
+ * @name: name of the child device
+ *
+ * This is similar to the device_find_child() function above, but it
+ * returns a reference to a device that has the name @name.
+ *
+ * NOTE: you will need to drop the reference with put_device() after use.
+ */
+struct device *device_find_child_by_name(struct device *parent,
+					 const char *name)
+{
+	struct klist_iter i;
+	struct device *child;
+
+	if (!parent)
+		return NULL;
+
+	klist_iter_init(&parent->p->klist_children, &i);
+	while ((child = next_device(&i)))
+		if (!strcmp(dev_name(child), name) && get_device(child))
+			break;
+	klist_iter_exit(&i);
+	return child;
+}
+EXPORT_SYMBOL_GPL(device_find_child_by_name);
+
 int __init devices_init(void)
 {
 	devices_kset = kset_create_and_add("devices", &device_uevent_ops, NULL);
diff --git a/include/linux/device.h b/include/linux/device.h
index e85264fb6616..5489a759e1c5 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -1250,6 +1250,8 @@ extern int device_for_each_child_reverse(struct device *dev, void *data,
 		     int (*fn)(struct device *dev, void *data));
 extern struct device *device_find_child(struct device *dev, void *data,
 				int (*match)(struct device *dev, void *data));
+extern struct device *device_find_child_by_name(struct device *parent,
+						const char *name);
 extern int device_rename(struct device *dev, const char *new_name);
 extern int device_move(struct device *dev, struct device *new_parent,
 		       enum dpm_order dpm_order);
-- 
cgit v1.2.3


From 83b34afb6b79c69f5478a7249451cab858af97d6 Mon Sep 17 00:00:00 2001
From: Heikki Krogerus <heikki.krogerus@linux.intel.com>
Date: Fri, 31 May 2019 17:15:39 +0300
Subject: device property: Introduce fwnode_find_reference()

In most cases the references that the drivers look for don't
have any arguments. This introduces a wrapper function for
fwnode_property_get_reference_args() that looks for
references by using only the name and index.

Signed-off-by: Heikki Krogerus <heikki.krogerus@linux.intel.com>
Tested-by: Hans de Goede <hdegoede@redhat.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/base/property.c  | 24 ++++++++++++++++++++++++
 include/linux/property.h |  4 ++++
 2 files changed, 28 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/base/property.c b/drivers/base/property.c
index 348b37e64944..81bd01ed4042 100644
--- a/drivers/base/property.c
+++ b/drivers/base/property.c
@@ -484,6 +484,30 @@ int fwnode_property_get_reference_args(const struct fwnode_handle *fwnode,
 }
 EXPORT_SYMBOL_GPL(fwnode_property_get_reference_args);
 
+/**
+ * fwnode_find_reference - Find named reference to a fwnode_handle
+ * @fwnode: Firmware node where to look for the reference
+ * @name: The name of the reference
+ * @index: Index of the reference
+ *
+ * @index can be used when the named reference holds a table of references.
+ *
+ * Returns pointer to the reference fwnode, or ERR_PTR. Caller is responsible to
+ * call fwnode_handle_put() on the returned fwnode pointer.
+ */
+struct fwnode_handle *fwnode_find_reference(const struct fwnode_handle *fwnode,
+					    const char *name,
+					    unsigned int index)
+{
+	struct fwnode_reference_args args;
+	int ret;
+
+	ret = fwnode_property_get_reference_args(fwnode, name, NULL, 0, index,
+						 &args);
+	return ret ? ERR_PTR(ret) : args.fwnode;
+}
+EXPORT_SYMBOL_GPL(fwnode_find_reference);
+
 /**
  * device_remove_properties - Remove properties from a device object.
  * @dev: Device whose properties to remove.
diff --git a/include/linux/property.h b/include/linux/property.h
index abcde2f236a0..088d4db7e949 100644
--- a/include/linux/property.h
+++ b/include/linux/property.h
@@ -79,6 +79,10 @@ int fwnode_property_get_reference_args(const struct fwnode_handle *fwnode,
 				       unsigned int nargs, unsigned int index,
 				       struct fwnode_reference_args *args);
 
+struct fwnode_handle *fwnode_find_reference(const struct fwnode_handle *fwnode,
+					    const char *name,
+					    unsigned int index);
+
 struct fwnode_handle *fwnode_get_parent(const struct fwnode_handle *fwnode);
 struct fwnode_handle *fwnode_get_next_parent(
 	struct fwnode_handle *fwnode);
-- 
cgit v1.2.3


From 3370db35193b241ba5836a66df6ec1a559108389 Mon Sep 17 00:00:00 2001
From: Heikki Krogerus <heikki.krogerus@linux.intel.com>
Date: Fri, 31 May 2019 17:15:41 +0300
Subject: usb: typec: Registering real device entries for the muxes

Register real device entries (struct device) for the mode
muxes as well as for the orientation switches.

The Type-C mux code was deliberately attempting to avoid
creation of separate device entries for the orientation
switch and the mode switch (alternate modes) because they
are not physical devices. They are functions of a single
physical multiplexer/demultiplexer switch device.

Unfortunately, because of the dependency we still have on the
underlying mux device driver, we had to put in hacks like
the one in commit 3e3b81965cbf ("usb: typec: mux: Take
care of driver module reference counting") to make sure the
driver does not disappear from underneath us. Even with
those hacks we were still left with a potential NULL pointer
dereference scenario, so just create the device entries
and let the core take care of the dependencies. No more
hacks needed.

Signed-off-by: Heikki Krogerus <heikki.krogerus@linux.intel.com>
Reviewed-by: Andy Shevchenko <andy.shevchenko@gmail.com>
Tested-by: Hans de Goede <hdegoede@redhat.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/platform/x86/intel_cht_int33fe.c |   4 +-
 drivers/usb/typec/bus.h                  |  15 ++
 drivers/usb/typec/class.c                |  17 ++-
 drivers/usb/typec/mux.c                  | 238 ++++++++++++++++++++++---------
 drivers/usb/typec/mux/pi3usb30532.c      |  46 +++---
 include/linux/usb/typec_mux.h            |  62 ++++----
 6 files changed, 259 insertions(+), 123 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/platform/x86/intel_cht_int33fe.c b/drivers/platform/x86/intel_cht_int33fe.c
index 6fa3cced6f8e..657b8d61554c 100644
--- a/drivers/platform/x86/intel_cht_int33fe.c
+++ b/drivers/platform/x86/intel_cht_int33fe.c
@@ -173,10 +173,10 @@ static int cht_int33fe_probe(struct platform_device *pdev)
 	}
 
 	data->connections[0].endpoint[0] = "port0";
-	data->connections[0].endpoint[1] = "i2c-pi3usb30532";
+	data->connections[0].endpoint[1] = "i2c-pi3usb30532-switch";
 	data->connections[0].id = "orientation-switch";
 	data->connections[1].endpoint[0] = "port0";
-	data->connections[1].endpoint[1] = "i2c-pi3usb30532";
+	data->connections[1].endpoint[1] = "i2c-pi3usb30532-mux";
 	data->connections[1].id = "mode-switch";
 	data->connections[2].endpoint[0] = "i2c-fusb302";
 	data->connections[2].endpoint[1] = "intel_xhci_usb_sw-role-switch";
diff --git a/drivers/usb/typec/bus.h b/drivers/usb/typec/bus.h
index db40e61d8b72..0c9661c96473 100644
--- a/drivers/usb/typec/bus.h
+++ b/drivers/usb/typec/bus.h
@@ -35,4 +35,19 @@ extern const struct device_type typec_port_dev_type;
 #define is_typec_altmode(_dev_) (_dev_->type == &typec_altmode_dev_type)
 #define is_typec_port(_dev_) (_dev_->type == &typec_port_dev_type)
 
+extern struct class typec_mux_class;
+
+struct typec_switch {
+	struct device dev;
+	typec_switch_set_fn_t set;
+};
+
+struct typec_mux {
+	struct device dev;
+	typec_mux_set_fn_t set;
+};
+
+#define to_typec_switch(_dev_) container_of(_dev_, struct typec_switch, dev)
+#define to_typec_mux(_dev_) container_of(_dev_, struct typec_mux, dev)
+
 #endif /* __USB_TYPEC_ALTMODE_H__ */
diff --git a/drivers/usb/typec/class.c b/drivers/usb/typec/class.c
index 2eb623841847..a18285a990a8 100644
--- a/drivers/usb/typec/class.c
+++ b/drivers/usb/typec/class.c
@@ -1646,13 +1646,25 @@ static int __init typec_init(void)
 	if (ret)
 		return ret;
 
+	ret = class_register(&typec_mux_class);
+	if (ret)
+		goto err_unregister_bus;
+
 	typec_class = class_create(THIS_MODULE, "typec");
 	if (IS_ERR(typec_class)) {
-		bus_unregister(&typec_bus);
-		return PTR_ERR(typec_class);
+		ret = PTR_ERR(typec_class);
+		goto err_unregister_mux_class;
 	}
 
 	return 0;
+
+err_unregister_mux_class:
+	class_unregister(&typec_mux_class);
+
+err_unregister_bus:
+	bus_unregister(&typec_bus);
+
+	return ret;
 }
 subsys_initcall(typec_init);
 
@@ -1661,6 +1673,7 @@ static void __exit typec_exit(void)
 	class_destroy(typec_class);
 	ida_destroy(&typec_index_ida);
 	bus_unregister(&typec_bus);
+	class_unregister(&typec_mux_class);
 }
 module_exit(typec_exit);
 
diff --git a/drivers/usb/typec/mux.c b/drivers/usb/typec/mux.c
index 2ce54f3fc79c..61b7bc58dd81 100644
--- a/drivers/usb/typec/mux.c
+++ b/drivers/usb/typec/mux.c
@@ -15,35 +15,47 @@
 #include <linux/slab.h>
 #include <linux/usb/typec_mux.h>
 
-static DEFINE_MUTEX(switch_lock);
-static DEFINE_MUTEX(mux_lock);
-static LIST_HEAD(switch_list);
-static LIST_HEAD(mux_list);
+#include "bus.h"
+
+static int name_match(struct device *dev, const void *name)
+{
+	return !strcmp((const char *)name, dev_name(dev));
+}
+
+static bool dev_name_ends_with(struct device *dev, const char *suffix)
+{
+	const char *name = dev_name(dev);
+	const int name_len = strlen(name);
+	const int suffix_len = strlen(suffix);
+
+	if (suffix_len > name_len)
+		return false;
+
+	return strcmp(name + (name_len - suffix_len), suffix) == 0;
+}
+
+static int switch_fwnode_match(struct device *dev, const void *fwnode)
+{
+	return dev_fwnode(dev) == fwnode && dev_name_ends_with(dev, "-switch");
+}
 
 static void *typec_switch_match(struct device_connection *con, int ep,
 				void *data)
 {
-	struct typec_switch *sw;
-
-	if (!con->fwnode) {
-		list_for_each_entry(sw, &switch_list, entry)
-			if (!strcmp(con->endpoint[ep], dev_name(sw->dev)))
-				return sw;
-		return ERR_PTR(-EPROBE_DEFER);
-	}
+	struct device *dev;
 
-	/*
-	 * With OF graph the mux node must have a boolean device property named
-	 * "orientation-switch".
-	 */
-	if (con->id && !fwnode_property_present(con->fwnode, con->id))
-		return NULL;
+	if (con->fwnode) {
+		if (con->id && !fwnode_property_present(con->fwnode, con->id))
+			return NULL;
 
-	list_for_each_entry(sw, &switch_list, entry)
-		if (dev_fwnode(sw->dev) == con->fwnode)
-			return sw;
+		dev = class_find_device(&typec_mux_class, NULL, con->fwnode,
+					switch_fwnode_match);
+	} else {
+		dev = class_find_device(&typec_mux_class, NULL,
+					con->endpoint[ep], name_match);
+	}
 
-	return con->id ? ERR_PTR(-EPROBE_DEFER) : NULL;
+	return dev ? to_typec_switch(dev) : ERR_PTR(-EPROBE_DEFER);
 }
 
 /**
@@ -59,14 +71,10 @@ struct typec_switch *typec_switch_get(struct device *dev)
 {
 	struct typec_switch *sw;
 
-	mutex_lock(&switch_lock);
 	sw = device_connection_find_match(dev, "orientation-switch", NULL,
 					  typec_switch_match);
-	if (!IS_ERR_OR_NULL(sw)) {
-		WARN_ON(!try_module_get(sw->dev->driver->owner));
-		get_device(sw->dev);
-	}
-	mutex_unlock(&switch_lock);
+	if (!IS_ERR_OR_NULL(sw))
+		WARN_ON(!try_module_get(sw->dev.parent->driver->owner));
 
 	return sw;
 }
@@ -81,28 +89,64 @@ EXPORT_SYMBOL_GPL(typec_switch_get);
 void typec_switch_put(struct typec_switch *sw)
 {
 	if (!IS_ERR_OR_NULL(sw)) {
-		module_put(sw->dev->driver->owner);
-		put_device(sw->dev);
+		module_put(sw->dev.parent->driver->owner);
+		put_device(&sw->dev);
 	}
 }
 EXPORT_SYMBOL_GPL(typec_switch_put);
 
+static void typec_switch_release(struct device *dev)
+{
+	kfree(to_typec_switch(dev));
+}
+
+static const struct device_type typec_switch_dev_type = {
+	.name = "orientation_switch",
+	.release = typec_switch_release,
+};
+
 /**
  * typec_switch_register - Register USB Type-C orientation switch
- * @sw: USB Type-C orientation switch
+ * @parent: Parent device
+ * @desc: Orientation switch description
  *
  * This function registers a switch that can be used for routing the correct
  * data pairs depending on the cable plug orientation from the USB Type-C
  * connector to the USB controllers. USB Type-C plugs can be inserted
  * right-side-up or upside-down.
  */
-int typec_switch_register(struct typec_switch *sw)
+struct typec_switch *
+typec_switch_register(struct device *parent,
+		      const struct typec_switch_desc *desc)
 {
-	mutex_lock(&switch_lock);
-	list_add_tail(&sw->entry, &switch_list);
-	mutex_unlock(&switch_lock);
+	struct typec_switch *sw;
+	int ret;
+
+	if (!desc || !desc->set)
+		return ERR_PTR(-EINVAL);
+
+	sw = kzalloc(sizeof(*sw), GFP_KERNEL);
+	if (!sw)
+		return ERR_PTR(-ENOMEM);
 
-	return 0;
+	sw->set = desc->set;
+
+	device_initialize(&sw->dev);
+	sw->dev.parent = parent;
+	sw->dev.fwnode = desc->fwnode;
+	sw->dev.class = &typec_mux_class;
+	sw->dev.type = &typec_switch_dev_type;
+	sw->dev.driver_data = desc->drvdata;
+	dev_set_name(&sw->dev, "%s-switch", dev_name(parent));
+
+	ret = device_add(&sw->dev);
+	if (ret) {
+		dev_err(parent, "failed to register switch (%d)\n", ret);
+		put_device(&sw->dev);
+		return ERR_PTR(ret);
+	}
+
+	return sw;
 }
 EXPORT_SYMBOL_GPL(typec_switch_register);
 
@@ -114,28 +158,44 @@ EXPORT_SYMBOL_GPL(typec_switch_register);
  */
 void typec_switch_unregister(struct typec_switch *sw)
 {
-	mutex_lock(&switch_lock);
-	list_del(&sw->entry);
-	mutex_unlock(&switch_lock);
+	if (!IS_ERR_OR_NULL(sw))
+		device_unregister(&sw->dev);
 }
 EXPORT_SYMBOL_GPL(typec_switch_unregister);
 
+void typec_switch_set_drvdata(struct typec_switch *sw, void *data)
+{
+	dev_set_drvdata(&sw->dev, data);
+}
+EXPORT_SYMBOL_GPL(typec_switch_set_drvdata);
+
+void *typec_switch_get_drvdata(struct typec_switch *sw)
+{
+	return dev_get_drvdata(&sw->dev);
+}
+EXPORT_SYMBOL_GPL(typec_switch_get_drvdata);
+
 /* ------------------------------------------------------------------------- */
 
+static int mux_fwnode_match(struct device *dev, const void *fwnode)
+{
+	return dev_fwnode(dev) == fwnode && dev_name_ends_with(dev, "-mux");
+}
+
 static void *typec_mux_match(struct device_connection *con, int ep, void *data)
 {
 	const struct typec_altmode_desc *desc = data;
-	struct typec_mux *mux;
-	int nval;
+	struct device *dev;
 	bool match;
+	int nval;
 	u16 *val;
 	int i;
 
 	if (!con->fwnode) {
-		list_for_each_entry(mux, &mux_list, entry)
-			if (!strcmp(con->endpoint[ep], dev_name(mux->dev)))
-				return mux;
-		return ERR_PTR(-EPROBE_DEFER);
+		dev = class_find_device(&typec_mux_class, NULL,
+					con->endpoint[ep], name_match);
+		/* No fwnode: the mux is looked up by its device name. */
+		return dev ? to_typec_mux(dev) : ERR_PTR(-EPROBE_DEFER);
 	}
 
 	/*
@@ -180,11 +240,10 @@ static void *typec_mux_match(struct device_connection *con, int ep, void *data)
 	return NULL;
 
 find_mux:
-	list_for_each_entry(mux, &mux_list, entry)
-		if (dev_fwnode(mux->dev) == con->fwnode)
-			return mux;
+	dev = class_find_device(&typec_mux_class, NULL, con->fwnode,
+				mux_fwnode_match);
 
-	return ERR_PTR(-EPROBE_DEFER);
+	return dev ? to_typec_mux(dev) : ERR_PTR(-EPROBE_DEFER);
 }
 
 /**
@@ -202,14 +261,10 @@ struct typec_mux *typec_mux_get(struct device *dev,
 {
 	struct typec_mux *mux;
 
-	mutex_lock(&mux_lock);
 	mux = device_connection_find_match(dev, "mode-switch", (void *)desc,
 					   typec_mux_match);
-	if (!IS_ERR_OR_NULL(mux)) {
-		WARN_ON(!try_module_get(mux->dev->driver->owner));
-		get_device(mux->dev);
-	}
-	mutex_unlock(&mux_lock);
+	if (!IS_ERR_OR_NULL(mux))
+		WARN_ON(!try_module_get(mux->dev.parent->driver->owner));
 
 	return mux;
 }
@@ -224,28 +279,63 @@ EXPORT_SYMBOL_GPL(typec_mux_get);
 void typec_mux_put(struct typec_mux *mux)
 {
 	if (!IS_ERR_OR_NULL(mux)) {
-		module_put(mux->dev->driver->owner);
-		put_device(mux->dev);
+		module_put(mux->dev.parent->driver->owner);
+		put_device(&mux->dev);
 	}
 }
 EXPORT_SYMBOL_GPL(typec_mux_put);
 
+static void typec_mux_release(struct device *dev)
+{
+	kfree(to_typec_mux(dev));
+}
+
+static const struct device_type typec_mux_dev_type = {
+	.name = "mode_switch",
+	.release = typec_mux_release,
+};
+
 /**
  * typec_mux_register - Register Multiplexer routing USB Type-C pins
- * @mux: USB Type-C Connector Multiplexer/DeMultiplexer
+ * @parent: Parent device
+ * @desc: Multiplexer description
  *
  * USB Type-C connectors can be used for alternate modes of operation besides
  * USB when Accessory/Alternate Modes are supported. With some of those modes,
  * the pins on the connector need to be reconfigured. This function registers
  * multiplexer switches routing the pins on the connector.
  */
-int typec_mux_register(struct typec_mux *mux)
+struct typec_mux *
+typec_mux_register(struct device *parent, const struct typec_mux_desc *desc)
 {
-	mutex_lock(&mux_lock);
-	list_add_tail(&mux->entry, &mux_list);
-	mutex_unlock(&mux_lock);
+	struct typec_mux *mux;
+	int ret;
+
+	if (!desc || !desc->set)
+		return ERR_PTR(-EINVAL);
+
+	mux = kzalloc(sizeof(*mux), GFP_KERNEL);
+	if (!mux)
+		return ERR_PTR(-ENOMEM);
+
+	mux->set = desc->set;
+
+	device_initialize(&mux->dev);
+	mux->dev.parent = parent;
+	mux->dev.fwnode = desc->fwnode;
+	mux->dev.class = &typec_mux_class;
+	mux->dev.type = &typec_mux_dev_type;
+	mux->dev.driver_data = desc->drvdata;
+	dev_set_name(&mux->dev, "%s-mux", dev_name(parent));
+
+	ret = device_add(&mux->dev);
+	if (ret) {
+		dev_err(parent, "failed to register mux (%d)\n", ret);
+		put_device(&mux->dev);
+		return ERR_PTR(ret);
+	}
 
-	return 0;
+	return mux;
 }
 EXPORT_SYMBOL_GPL(typec_mux_register);
 
@@ -257,8 +347,24 @@ EXPORT_SYMBOL_GPL(typec_mux_register);
  */
 void typec_mux_unregister(struct typec_mux *mux)
 {
-	mutex_lock(&mux_lock);
-	list_del(&mux->entry);
-	mutex_unlock(&mux_lock);
+	if (!IS_ERR_OR_NULL(mux))
+		device_unregister(&mux->dev);
 }
 EXPORT_SYMBOL_GPL(typec_mux_unregister);
+
+void typec_mux_set_drvdata(struct typec_mux *mux, void *data)
+{
+	dev_set_drvdata(&mux->dev, data);
+}
+EXPORT_SYMBOL_GPL(typec_mux_set_drvdata);
+
+void *typec_mux_get_drvdata(struct typec_mux *mux)
+{
+	return dev_get_drvdata(&mux->dev);
+}
+EXPORT_SYMBOL_GPL(typec_mux_get_drvdata);
+
+struct class typec_mux_class = {
+	.name = "typec_mux",
+	.owner = THIS_MODULE,
+};
diff --git a/drivers/usb/typec/mux/pi3usb30532.c b/drivers/usb/typec/mux/pi3usb30532.c
index 9294e85fd34b..5585b109095b 100644
--- a/drivers/usb/typec/mux/pi3usb30532.c
+++ b/drivers/usb/typec/mux/pi3usb30532.c
@@ -23,8 +23,8 @@
 struct pi3usb30532 {
 	struct i2c_client *client;
 	struct mutex lock; /* protects the cached conf register */
-	struct typec_switch sw;
-	struct typec_mux mux;
+	struct typec_switch *sw;
+	struct typec_mux *mux;
 	u8 conf;
 };
 
@@ -48,7 +48,7 @@ static int pi3usb30532_set_conf(struct pi3usb30532 *pi, u8 new_conf)
 static int pi3usb30532_sw_set(struct typec_switch *sw,
 			      enum typec_orientation orientation)
 {
-	struct pi3usb30532 *pi = container_of(sw, struct pi3usb30532, sw);
+	struct pi3usb30532 *pi = typec_switch_get_drvdata(sw);
 	u8 new_conf;
 	int ret;
 
@@ -75,7 +75,7 @@ static int pi3usb30532_sw_set(struct typec_switch *sw,
 
 static int pi3usb30532_mux_set(struct typec_mux *mux, int state)
 {
-	struct pi3usb30532 *pi = container_of(mux, struct pi3usb30532, mux);
+	struct pi3usb30532 *pi = typec_mux_get_drvdata(mux);
 	u8 new_conf;
 	int ret;
 
@@ -113,6 +113,8 @@ static int pi3usb30532_mux_set(struct typec_mux *mux, int state)
 static int pi3usb30532_probe(struct i2c_client *client)
 {
 	struct device *dev = &client->dev;
+	struct typec_switch_desc sw_desc;
+	struct typec_mux_desc mux_desc;
 	struct pi3usb30532 *pi;
 	int ret;
 
@@ -121,10 +123,6 @@ static int pi3usb30532_probe(struct i2c_client *client)
 		return -ENOMEM;
 
 	pi->client = client;
-	pi->sw.dev = dev;
-	pi->sw.set = pi3usb30532_sw_set;
-	pi->mux.dev = dev;
-	pi->mux.set = pi3usb30532_mux_set;
 	mutex_init(&pi->lock);
 
 	ret = i2c_smbus_read_byte_data(client, PI3USB30532_CONF);
@@ -134,17 +132,27 @@ static int pi3usb30532_probe(struct i2c_client *client)
 	}
 	pi->conf = ret;
 
-	ret = typec_switch_register(&pi->sw);
-	if (ret) {
-		dev_err(dev, "Error registering typec switch: %d\n", ret);
-		return ret;
+	sw_desc.drvdata = pi;
+	sw_desc.fwnode = dev->fwnode;
+	sw_desc.set = pi3usb30532_sw_set;
+
+	pi->sw = typec_switch_register(dev, &sw_desc);
+	if (IS_ERR(pi->sw)) {
+		dev_err(dev, "Error registering typec switch: %ld\n",
+			PTR_ERR(pi->sw));
+		return PTR_ERR(pi->sw);
 	}
 
-	ret = typec_mux_register(&pi->mux);
-	if (ret) {
-		typec_switch_unregister(&pi->sw);
-		dev_err(dev, "Error registering typec mux: %d\n", ret);
-		return ret;
+	mux_desc.drvdata = pi;
+	mux_desc.fwnode = dev->fwnode;
+	mux_desc.set = pi3usb30532_mux_set;
+
+	pi->mux = typec_mux_register(dev, &mux_desc);
+	if (IS_ERR(pi->mux)) {
+		typec_switch_unregister(pi->sw);
+		dev_err(dev, "Error registering typec mux: %ld\n",
+			PTR_ERR(pi->mux));
+		return PTR_ERR(pi->mux);
 	}
 
 	i2c_set_clientdata(client, pi);
@@ -155,8 +163,8 @@ static int pi3usb30532_remove(struct i2c_client *client)
 {
 	struct pi3usb30532 *pi = i2c_get_clientdata(client);
 
-	typec_mux_unregister(&pi->mux);
-	typec_switch_unregister(&pi->sw);
+	typec_mux_unregister(pi->mux);
+	typec_switch_unregister(pi->sw);
 	return 0;
 }
 
diff --git a/include/linux/usb/typec_mux.h b/include/linux/usb/typec_mux.h
index 43f40685e53c..873ace5b0cf8 100644
--- a/include/linux/usb/typec_mux.h
+++ b/include/linux/usb/typec_mux.h
@@ -3,54 +3,48 @@
 #ifndef __USB_TYPEC_MUX
 #define __USB_TYPEC_MUX
 
-#include <linux/list.h>
 #include <linux/usb/typec.h>
 
 struct device;
+struct typec_mux;
+struct typec_switch;
+struct fwnode_handle;
 
-/**
- * struct typec_switch - USB Type-C cable orientation switch
- * @dev: Switch device
- * @entry: List entry
- * @set: Callback to the driver for setting the orientation
- *
- * USB Type-C pin flipper switch routing the correct data pairs from the
- * connector to the USB controller depending on the orientation of the cable
- * plug.
- */
-struct typec_switch {
-	struct device *dev;
-	struct list_head entry;
-
-	int (*set)(struct typec_switch *sw, enum typec_orientation orientation);
-};
+typedef int (*typec_switch_set_fn_t)(struct typec_switch *sw,
+				     enum typec_orientation orientation);
 
-/**
- * struct typec_switch - USB Type-C connector pin mux
- * @dev: Mux device
- * @entry: List entry
- * @set: Callback to the driver for setting the state of the mux
- *
- * Pin Multiplexer/DeMultiplexer switch routing the USB Type-C connector pins to
- * different components depending on the requested mode of operation. Used with
- * Accessory/Alternate modes.
- */
-struct typec_mux {
-	struct device *dev;
-	struct list_head entry;
-
-	int (*set)(struct typec_mux *mux, int state);
+struct typec_switch_desc {
+	struct fwnode_handle *fwnode;
+	typec_switch_set_fn_t set;
+	void *drvdata;
 };
 
 struct typec_switch *typec_switch_get(struct device *dev);
 void typec_switch_put(struct typec_switch *sw);
-int typec_switch_register(struct typec_switch *sw);
+struct typec_switch *
+typec_switch_register(struct device *parent,
+		      const struct typec_switch_desc *desc);
 void typec_switch_unregister(struct typec_switch *sw);
 
+void typec_switch_set_drvdata(struct typec_switch *sw, void *data);
+void *typec_switch_get_drvdata(struct typec_switch *sw);
+
+typedef int (*typec_mux_set_fn_t)(struct typec_mux *mux, int state);
+
+struct typec_mux_desc {
+	struct fwnode_handle *fwnode;
+	typec_mux_set_fn_t set;
+	void *drvdata;
+};
+
 struct typec_mux *
 typec_mux_get(struct device *dev, const struct typec_altmode_desc *desc);
 void typec_mux_put(struct typec_mux *mux);
-int typec_mux_register(struct typec_mux *mux);
+struct typec_mux *
+typec_mux_register(struct device *parent, const struct typec_mux_desc *desc);
 void typec_mux_unregister(struct typec_mux *mux);
 
+void typec_mux_set_drvdata(struct typec_mux *mux, void *data);
+void *typec_mux_get_drvdata(struct typec_mux *mux);
+
 #endif /* __USB_TYPEC_MUX */
-- 
cgit v1.2.3


From 3bd3706251ee8ab67e69d9340ac2abdca217e733 Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Tue, 23 Apr 2019 16:26:36 +0200
Subject: sched/core: Provide a pointer to the valid CPU mask

In commit:

  4b53a3412d66 ("sched/core: Remove the tsk_nr_cpus_allowed() wrapper")

the tsk_nr_cpus_allowed() wrapper was removed. There was not
much difference in !RT but in RT we used this to implement
migrate_disable(). Within a migrate_disable() section the CPU mask is
restricted to a single CPU while the "normal" CPU mask remains untouched.

As an alternative implementation Ingo suggested to use:

	struct task_struct {
		const cpumask_t		*cpus_ptr;
		cpumask_t		cpus_mask;
        };
with
	t->cpus_ptr = &t->cpus_mask;

In -RT we then can switch the cpus_ptr to:

	t->cpus_ptr = &cpumask_of(task_cpu(p));

in a migration disabled region. The rules are simple:

 - Code that 'uses' ->cpus_allowed would use the pointer.
 - Code that 'modifies' ->cpus_allowed would use the direct mask.

Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20190423142636.14347-1-bigeasy@linutronix.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/ia64/kernel/mca.c                     |  2 +-
 arch/mips/include/asm/switch_to.h          |  4 +--
 arch/mips/kernel/mips-mt-fpaff.c           |  2 +-
 arch/mips/kernel/traps.c                   |  6 ++---
 arch/powerpc/platforms/cell/spufs/sched.c  |  2 +-
 arch/x86/kernel/cpu/resctrl/pseudo_lock.c  |  2 +-
 drivers/infiniband/hw/hfi1/affinity.c      |  6 ++---
 drivers/infiniband/hw/hfi1/sdma.c          |  3 +--
 drivers/infiniband/hw/qib/qib_file_ops.c   |  7 +++---
 fs/proc/array.c                            |  4 +--
 include/linux/sched.h                      |  5 ++--
 init/init_task.c                           |  3 ++-
 kernel/cgroup/cpuset.c                     |  2 +-
 kernel/fork.c                              |  2 ++
 kernel/sched/core.c                        | 40 +++++++++++++++---------------
 kernel/sched/cpudeadline.c                 |  4 +--
 kernel/sched/cpupri.c                      |  4 +--
 kernel/sched/deadline.c                    |  6 ++---
 kernel/sched/fair.c                        | 34 ++++++++++++-------------
 kernel/sched/rt.c                          |  4 +--
 kernel/trace/trace_hwlat.c                 |  2 +-
 lib/smp_processor_id.c                     |  2 +-
 samples/trace_events/trace-events-sample.c |  2 +-
 23 files changed, 75 insertions(+), 73 deletions(-)

(limited to 'include/linux')

diff --git a/arch/ia64/kernel/mca.c b/arch/ia64/kernel/mca.c
index 6a52d761854b..79190d877fa7 100644
--- a/arch/ia64/kernel/mca.c
+++ b/arch/ia64/kernel/mca.c
@@ -1831,7 +1831,7 @@ format_mca_init_stack(void *mca_data, unsigned long offset,
 	ti->cpu = cpu;
 	p->stack = ti;
 	p->state = TASK_UNINTERRUPTIBLE;
-	cpumask_set_cpu(cpu, &p->cpus_allowed);
+	cpumask_set_cpu(cpu, &p->cpus_mask);
 	INIT_LIST_HEAD(&p->tasks);
 	p->parent = p->real_parent = p->group_leader = p;
 	INIT_LIST_HEAD(&p->children);
diff --git a/arch/mips/include/asm/switch_to.h b/arch/mips/include/asm/switch_to.h
index 0f813bb753c6..09cbe9042828 100644
--- a/arch/mips/include/asm/switch_to.h
+++ b/arch/mips/include/asm/switch_to.h
@@ -42,7 +42,7 @@ extern struct task_struct *ll_task;
  * inline to try to keep the overhead down. If we have been forced to run on
  * a "CPU" with an FPU because of a previous high level of FP computation,
  * but did not actually use the FPU during the most recent time-slice (CU1
- * isn't set), we undo the restriction on cpus_allowed.
+ * isn't set), we undo the restriction on cpus_mask.
  *
  * We're not calling set_cpus_allowed() here, because we have no need to
  * force prompt migration - we're already switching the current CPU to a
@@ -57,7 +57,7 @@ do {									\
 	    test_ti_thread_flag(__prev_ti, TIF_FPUBOUND) &&		\
 	    (!(KSTK_STATUS(prev) & ST0_CU1))) {				\
 		clear_ti_thread_flag(__prev_ti, TIF_FPUBOUND);		\
-		prev->cpus_allowed = prev->thread.user_cpus_allowed;	\
+		prev->cpus_mask = prev->thread.user_cpus_allowed;	\
 	}								\
 	next->thread.emulated_fp = 0;					\
 } while(0)
diff --git a/arch/mips/kernel/mips-mt-fpaff.c b/arch/mips/kernel/mips-mt-fpaff.c
index a7c0f97e4b0d..1a08428eedcf 100644
--- a/arch/mips/kernel/mips-mt-fpaff.c
+++ b/arch/mips/kernel/mips-mt-fpaff.c
@@ -177,7 +177,7 @@ asmlinkage long mipsmt_sys_sched_getaffinity(pid_t pid, unsigned int len,
 	if (retval)
 		goto out_unlock;
 
-	cpumask_or(&allowed, &p->thread.user_cpus_allowed, &p->cpus_allowed);
+	cpumask_or(&allowed, &p->thread.user_cpus_allowed, p->cpus_ptr);
 	cpumask_and(&mask, &allowed, cpu_active_mask);
 
 out_unlock:
diff --git a/arch/mips/kernel/traps.c b/arch/mips/kernel/traps.c
index c52766a5b85f..ac7159263da0 100644
--- a/arch/mips/kernel/traps.c
+++ b/arch/mips/kernel/traps.c
@@ -891,12 +891,12 @@ static void mt_ase_fp_affinity(void)
 		 * restricted the allowed set to exclude any CPUs with FPUs,
 		 * we'll skip the procedure.
 		 */
-		if (cpumask_intersects(&current->cpus_allowed, &mt_fpu_cpumask)) {
+		if (cpumask_intersects(&current->cpus_mask, &mt_fpu_cpumask)) {
 			cpumask_t tmask;
 
 			current->thread.user_cpus_allowed
-				= current->cpus_allowed;
-			cpumask_and(&tmask, &current->cpus_allowed,
+				= current->cpus_mask;
+			cpumask_and(&tmask, &current->cpus_mask,
 				    &mt_fpu_cpumask);
 			set_cpus_allowed_ptr(current, &tmask);
 			set_thread_flag(TIF_FPUBOUND);
diff --git a/arch/powerpc/platforms/cell/spufs/sched.c b/arch/powerpc/platforms/cell/spufs/sched.c
index e56b553de27b..f18d5067cd0f 100644
--- a/arch/powerpc/platforms/cell/spufs/sched.c
+++ b/arch/powerpc/platforms/cell/spufs/sched.c
@@ -128,7 +128,7 @@ void __spu_update_sched_info(struct spu_context *ctx)
 	 * runqueue. The context will be rescheduled on the proper node
 	 * if it is timesliced or preempted.
 	 */
-	cpumask_copy(&ctx->cpus_allowed, &current->cpus_allowed);
+	cpumask_copy(&ctx->cpus_allowed, current->cpus_ptr);
 
 	/* Save the current cpu id for spu interrupt routing. */
 	ctx->last_ran = raw_smp_processor_id();
diff --git a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c
index 604c0e3bcc83..f68baccc69f0 100644
--- a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c
+++ b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c
@@ -1503,7 +1503,7 @@ static int pseudo_lock_dev_mmap(struct file *filp, struct vm_area_struct *vma)
 	 * may be scheduled elsewhere and invalidate entries in the
 	 * pseudo-locked region.
 	 */
-	if (!cpumask_subset(&current->cpus_allowed, &plr->d->cpu_mask)) {
+	if (!cpumask_subset(current->cpus_ptr, &plr->d->cpu_mask)) {
 		mutex_unlock(&rdtgroup_mutex);
 		return -EINVAL;
 	}
diff --git a/drivers/infiniband/hw/hfi1/affinity.c b/drivers/infiniband/hw/hfi1/affinity.c
index 4fe662c3bbc1..c142b23bb401 100644
--- a/drivers/infiniband/hw/hfi1/affinity.c
+++ b/drivers/infiniband/hw/hfi1/affinity.c
@@ -1038,7 +1038,7 @@ int hfi1_get_proc_affinity(int node)
 	struct hfi1_affinity_node *entry;
 	cpumask_var_t diff, hw_thread_mask, available_mask, intrs_mask;
 	const struct cpumask *node_mask,
-		*proc_mask = &current->cpus_allowed;
+		*proc_mask = current->cpus_ptr;
 	struct hfi1_affinity_node_list *affinity = &node_affinity;
 	struct cpu_mask_set *set = &affinity->proc;
 
@@ -1046,7 +1046,7 @@ int hfi1_get_proc_affinity(int node)
 	 * check whether process/context affinity has already
 	 * been set
 	 */
-	if (cpumask_weight(proc_mask) == 1) {
+	if (current->nr_cpus_allowed == 1) {
 		hfi1_cdbg(PROC, "PID %u %s affinity set to CPU %*pbl",
 			  current->pid, current->comm,
 			  cpumask_pr_args(proc_mask));
@@ -1057,7 +1057,7 @@ int hfi1_get_proc_affinity(int node)
 		cpu = cpumask_first(proc_mask);
 		cpumask_set_cpu(cpu, &set->used);
 		goto done;
-	} else if (cpumask_weight(proc_mask) < cpumask_weight(&set->mask)) {
+	} else if (current->nr_cpus_allowed < cpumask_weight(&set->mask)) {
 		hfi1_cdbg(PROC, "PID %u %s affinity set to CPU set(s) %*pbl",
 			  current->pid, current->comm,
 			  cpumask_pr_args(proc_mask));
diff --git a/drivers/infiniband/hw/hfi1/sdma.c b/drivers/infiniband/hw/hfi1/sdma.c
index b0110728f541..7e8139ee0cc1 100644
--- a/drivers/infiniband/hw/hfi1/sdma.c
+++ b/drivers/infiniband/hw/hfi1/sdma.c
@@ -855,14 +855,13 @@ struct sdma_engine *sdma_select_user_engine(struct hfi1_devdata *dd,
 {
 	struct sdma_rht_node *rht_node;
 	struct sdma_engine *sde = NULL;
-	const struct cpumask *current_mask = &current->cpus_allowed;
 	unsigned long cpu_id;
 
 	/*
 	 * To ensure that always the same sdma engine(s) will be
 	 * selected make sure the process is pinned to this CPU only.
 	 */
-	if (cpumask_weight(current_mask) != 1)
+	if (current->nr_cpus_allowed != 1)
 		goto out;
 
 	cpu_id = smp_processor_id();
diff --git a/drivers/infiniband/hw/qib/qib_file_ops.c b/drivers/infiniband/hw/qib/qib_file_ops.c
index 78fa634de98a..27b6e664e59d 100644
--- a/drivers/infiniband/hw/qib/qib_file_ops.c
+++ b/drivers/infiniband/hw/qib/qib_file_ops.c
@@ -1142,7 +1142,7 @@ static __poll_t qib_poll(struct file *fp, struct poll_table_struct *pt)
 static void assign_ctxt_affinity(struct file *fp, struct qib_devdata *dd)
 {
 	struct qib_filedata *fd = fp->private_data;
-	const unsigned int weight = cpumask_weight(&current->cpus_allowed);
+	const unsigned int weight = current->nr_cpus_allowed;
 	const struct cpumask *local_mask = cpumask_of_pcibus(dd->pcidev->bus);
 	int local_cpu;
 
@@ -1623,9 +1623,8 @@ static int qib_assign_ctxt(struct file *fp, const struct qib_user_info *uinfo)
 		ret = find_free_ctxt(i_minor - 1, fp, uinfo);
 	else {
 		int unit;
-		const unsigned int cpu = cpumask_first(&current->cpus_allowed);
-		const unsigned int weight =
-			cpumask_weight(&current->cpus_allowed);
+		const unsigned int cpu = cpumask_first(current->cpus_ptr);
+		const unsigned int weight = current->nr_cpus_allowed;
 
 		if (weight == 1 && !test_bit(cpu, qib_cpulist))
 			if (!find_hca(cpu, &unit) && unit >= 0)
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 2edbb657f859..84908556ea58 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -381,9 +381,9 @@ static inline void task_context_switch_counts(struct seq_file *m,
 static void task_cpus_allowed(struct seq_file *m, struct task_struct *task)
 {
 	seq_printf(m, "Cpus_allowed:\t%*pb\n",
-		   cpumask_pr_args(&task->cpus_allowed));
+		   cpumask_pr_args(task->cpus_ptr));
 	seq_printf(m, "Cpus_allowed_list:\t%*pbl\n",
-		   cpumask_pr_args(&task->cpus_allowed));
+		   cpumask_pr_args(task->cpus_ptr));
 }
 
 static inline void task_core_dumping(struct seq_file *m, struct mm_struct *mm)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 11837410690f..1b2590a8d038 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -651,7 +651,8 @@ struct task_struct {
 
 	unsigned int			policy;
 	int				nr_cpus_allowed;
-	cpumask_t			cpus_allowed;
+	const cpumask_t			*cpus_ptr;
+	cpumask_t			cpus_mask;
 
 #ifdef CONFIG_PREEMPT_RCU
 	int				rcu_read_lock_nesting;
@@ -1399,7 +1400,7 @@ extern struct pid *cad_pid;
 #define PF_SWAPWRITE		0x00800000	/* Allowed to write to swap */
 #define PF_MEMSTALL		0x01000000	/* Stalled due to lack of memory */
 #define PF_UMH			0x02000000	/* I'm an Usermodehelper process */
-#define PF_NO_SETAFFINITY	0x04000000	/* Userland is not allowed to meddle with cpus_allowed */
+#define PF_NO_SETAFFINITY	0x04000000	/* Userland is not allowed to meddle with cpus_mask */
 #define PF_MCE_EARLY		0x08000000      /* Early kill for mce process policy */
 #define PF_MEMALLOC_NOCMA	0x10000000	/* All allocation request will have _GFP_MOVABLE cleared */
 #define PF_FREEZER_SKIP		0x40000000	/* Freezer should not count it as freezable */
diff --git a/init/init_task.c b/init/init_task.c
index c70ef656d0f4..3c27c0efa316 100644
--- a/init/init_task.c
+++ b/init/init_task.c
@@ -72,7 +72,8 @@ struct task_struct init_task
 	.static_prio	= MAX_PRIO - 20,
 	.normal_prio	= MAX_PRIO - 20,
 	.policy		= SCHED_NORMAL,
-	.cpus_allowed	= CPU_MASK_ALL,
+	.cpus_ptr	= &init_task.cpus_mask,
+	.cpus_mask	= CPU_MASK_ALL,
 	.nr_cpus_allowed= NR_CPUS,
 	.mm		= NULL,
 	.active_mm	= &init_mm,
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 6a1942ed781c..fe90fa1899e6 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -2829,7 +2829,7 @@ static void cpuset_fork(struct task_struct *task)
 	if (task_css_is_root(task, cpuset_cgrp_id))
 		return;
 
-	set_cpus_allowed_ptr(task, &current->cpus_allowed);
+	set_cpus_allowed_ptr(task, current->cpus_ptr);
 	task->mems_allowed = current->mems_allowed;
 }
 
diff --git a/kernel/fork.c b/kernel/fork.c
index 75675b9bf6df..6be686283e55 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -894,6 +894,8 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
 #ifdef CONFIG_STACKPROTECTOR
 	tsk->stack_canary = get_random_canary();
 #endif
+	if (orig->cpus_ptr == &orig->cpus_mask)
+		tsk->cpus_ptr = &tsk->cpus_mask;
 
 	/*
 	 * One for us, one for whoever does the "release_task()" (usually
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 874c427742a9..93ab85f0d076 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -930,7 +930,7 @@ static inline bool is_per_cpu_kthread(struct task_struct *p)
  */
 static inline bool is_cpu_allowed(struct task_struct *p, int cpu)
 {
-	if (!cpumask_test_cpu(cpu, &p->cpus_allowed))
+	if (!cpumask_test_cpu(cpu, p->cpus_ptr))
 		return false;
 
 	if (is_per_cpu_kthread(p))
@@ -1025,7 +1025,7 @@ static int migration_cpu_stop(void *data)
 	local_irq_disable();
 	/*
 	 * We need to explicitly wake pending tasks before running
-	 * __migrate_task() such that we will not miss enforcing cpus_allowed
+	 * __migrate_task() such that we will not miss enforcing cpus_ptr
 	 * during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test.
 	 */
 	sched_ttwu_pending();
@@ -1056,7 +1056,7 @@ static int migration_cpu_stop(void *data)
  */
 void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask)
 {
-	cpumask_copy(&p->cpus_allowed, new_mask);
+	cpumask_copy(&p->cpus_mask, new_mask);
 	p->nr_cpus_allowed = cpumask_weight(new_mask);
 }
 
@@ -1126,7 +1126,7 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
 		goto out;
 	}
 
-	if (cpumask_equal(&p->cpus_allowed, new_mask))
+	if (cpumask_equal(p->cpus_ptr, new_mask))
 		goto out;
 
 	if (!cpumask_intersects(new_mask, cpu_valid_mask)) {
@@ -1286,10 +1286,10 @@ static int migrate_swap_stop(void *data)
 	if (task_cpu(arg->src_task) != arg->src_cpu)
 		goto unlock;
 
-	if (!cpumask_test_cpu(arg->dst_cpu, &arg->src_task->cpus_allowed))
+	if (!cpumask_test_cpu(arg->dst_cpu, arg->src_task->cpus_ptr))
 		goto unlock;
 
-	if (!cpumask_test_cpu(arg->src_cpu, &arg->dst_task->cpus_allowed))
+	if (!cpumask_test_cpu(arg->src_cpu, arg->dst_task->cpus_ptr))
 		goto unlock;
 
 	__migrate_swap_task(arg->src_task, arg->dst_cpu);
@@ -1331,10 +1331,10 @@ int migrate_swap(struct task_struct *cur, struct task_struct *p,
 	if (!cpu_active(arg.src_cpu) || !cpu_active(arg.dst_cpu))
 		goto out;
 
-	if (!cpumask_test_cpu(arg.dst_cpu, &arg.src_task->cpus_allowed))
+	if (!cpumask_test_cpu(arg.dst_cpu, arg.src_task->cpus_ptr))
 		goto out;
 
-	if (!cpumask_test_cpu(arg.src_cpu, &arg.dst_task->cpus_allowed))
+	if (!cpumask_test_cpu(arg.src_cpu, arg.dst_task->cpus_ptr))
 		goto out;
 
 	trace_sched_swap_numa(cur, arg.src_cpu, p, arg.dst_cpu);
@@ -1479,7 +1479,7 @@ void kick_process(struct task_struct *p)
 EXPORT_SYMBOL_GPL(kick_process);
 
 /*
- * ->cpus_allowed is protected by both rq->lock and p->pi_lock
+ * ->cpus_ptr is protected by both rq->lock and p->pi_lock
  *
  * A few notes on cpu_active vs cpu_online:
  *
@@ -1519,14 +1519,14 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
 		for_each_cpu(dest_cpu, nodemask) {
 			if (!cpu_active(dest_cpu))
 				continue;
-			if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
+			if (cpumask_test_cpu(dest_cpu, p->cpus_ptr))
 				return dest_cpu;
 		}
 	}
 
 	for (;;) {
 		/* Any allowed, online CPU? */
-		for_each_cpu(dest_cpu, &p->cpus_allowed) {
+		for_each_cpu(dest_cpu, p->cpus_ptr) {
 			if (!is_cpu_allowed(p, dest_cpu))
 				continue;
 
@@ -1570,7 +1570,7 @@ out:
 }
 
 /*
- * The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable.
+ * The caller (fork, wakeup) owns p->pi_lock, ->cpus_ptr is stable.
  */
 static inline
 int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags)
@@ -1580,11 +1580,11 @@ int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags)
 	if (p->nr_cpus_allowed > 1)
 		cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags);
 	else
-		cpu = cpumask_any(&p->cpus_allowed);
+		cpu = cpumask_any(p->cpus_ptr);
 
 	/*
 	 * In order not to call set_task_cpu() on a blocking task we need
-	 * to rely on ttwu() to place the task on a valid ->cpus_allowed
+	 * to rely on ttwu() to place the task on a valid ->cpus_ptr
 	 * CPU.
 	 *
 	 * Since this is common to all placement strategies, this lives here.
@@ -2395,7 +2395,7 @@ void wake_up_new_task(struct task_struct *p)
 #ifdef CONFIG_SMP
 	/*
 	 * Fork balancing, do it here and not earlier because:
-	 *  - cpus_allowed can change in the fork path
+	 *  - cpus_ptr can change in the fork path
 	 *  - any previously selected CPU might disappear through hotplug
 	 *
 	 * Use __set_task_cpu() to avoid calling sched_class::migrate_task_rq,
@@ -4267,7 +4267,7 @@ change:
 			 * the entire root_domain to become SCHED_DEADLINE. We
 			 * will also fail if there's no bandwidth available.
 			 */
-			if (!cpumask_subset(span, &p->cpus_allowed) ||
+			if (!cpumask_subset(span, p->cpus_ptr) ||
 			    rq->rd->dl_bw.bw == 0) {
 				task_rq_unlock(rq, p, &rf);
 				return -EPERM;
@@ -4866,7 +4866,7 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask)
 		goto out_unlock;
 
 	raw_spin_lock_irqsave(&p->pi_lock, flags);
-	cpumask_and(mask, &p->cpus_allowed, cpu_active_mask);
+	cpumask_and(mask, &p->cpus_mask, cpu_active_mask);
 	raw_spin_unlock_irqrestore(&p->pi_lock, flags);
 
 out_unlock:
@@ -5443,7 +5443,7 @@ int task_can_attach(struct task_struct *p,
 	 * allowed nodes is unnecessary.  Thus, cpusets are not
 	 * applicable for such threads.  This prevents checking for
 	 * success of set_cpus_allowed_ptr() on all attached tasks
-	 * before cpus_allowed may be changed.
+	 * before cpus_mask may be changed.
 	 */
 	if (p->flags & PF_NO_SETAFFINITY) {
 		ret = -EINVAL;
@@ -5470,7 +5470,7 @@ int migrate_task_to(struct task_struct *p, int target_cpu)
 	if (curr_cpu == target_cpu)
 		return 0;
 
-	if (!cpumask_test_cpu(target_cpu, &p->cpus_allowed))
+	if (!cpumask_test_cpu(target_cpu, p->cpus_ptr))
 		return -EINVAL;
 
 	/* TODO: This is not properly updating schedstats */
@@ -5608,7 +5608,7 @@ static void migrate_tasks(struct rq *dead_rq, struct rq_flags *rf)
 		put_prev_task(rq, next);
 
 		/*
-		 * Rules for changing task_struct::cpus_allowed are holding
+		 * Rules for changing task_struct::cpus_mask are holding
 		 * both pi_lock and rq->lock, such that holding either
 		 * stabilizes the mask.
 		 *
diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c
index 50316455ea66..d57fb2f8ae67 100644
--- a/kernel/sched/cpudeadline.c
+++ b/kernel/sched/cpudeadline.c
@@ -124,14 +124,14 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p,
 	const struct sched_dl_entity *dl_se = &p->dl;
 
 	if (later_mask &&
-	    cpumask_and(later_mask, cp->free_cpus, &p->cpus_allowed)) {
+	    cpumask_and(later_mask, cp->free_cpus, p->cpus_ptr)) {
 		return 1;
 	} else {
 		int best_cpu = cpudl_maximum(cp);
 
 		WARN_ON(best_cpu != -1 && !cpu_present(best_cpu));
 
-		if (cpumask_test_cpu(best_cpu, &p->cpus_allowed) &&
+		if (cpumask_test_cpu(best_cpu, p->cpus_ptr) &&
 		    dl_time_before(dl_se->deadline, cp->elements[0].dl)) {
 			if (later_mask)
 				cpumask_set_cpu(best_cpu, later_mask);
diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c
index daaadf939ccb..f7d2c10b4c92 100644
--- a/kernel/sched/cpupri.c
+++ b/kernel/sched/cpupri.c
@@ -98,11 +98,11 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p,
 		if (skip)
 			continue;
 
-		if (cpumask_any_and(&p->cpus_allowed, vec->mask) >= nr_cpu_ids)
+		if (cpumask_any_and(p->cpus_ptr, vec->mask) >= nr_cpu_ids)
 			continue;
 
 		if (lowest_mask) {
-			cpumask_and(lowest_mask, &p->cpus_allowed, vec->mask);
+			cpumask_and(lowest_mask, p->cpus_ptr, vec->mask);
 
 			/*
 			 * We have to ensure that we have at least one bit
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 43901fa3f269..c1ef30861068 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -538,7 +538,7 @@ static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p
 		 * If we cannot preempt any rq, fall back to pick any
 		 * online CPU:
 		 */
-		cpu = cpumask_any_and(cpu_active_mask, &p->cpus_allowed);
+		cpu = cpumask_any_and(cpu_active_mask, p->cpus_ptr);
 		if (cpu >= nr_cpu_ids) {
 			/*
 			 * Failed to find any suitable CPU.
@@ -1824,7 +1824,7 @@ static void set_curr_task_dl(struct rq *rq)
 static int pick_dl_task(struct rq *rq, struct task_struct *p, int cpu)
 {
 	if (!task_running(rq, p) &&
-	    cpumask_test_cpu(cpu, &p->cpus_allowed))
+	    cpumask_test_cpu(cpu, p->cpus_ptr))
 		return 1;
 	return 0;
 }
@@ -1974,7 +1974,7 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq)
 		/* Retry if something changed. */
 		if (double_lock_balance(rq, later_rq)) {
 			if (unlikely(task_rq(task) != rq ||
-				     !cpumask_test_cpu(later_rq->cpu, &task->cpus_allowed) ||
+				     !cpumask_test_cpu(later_rq->cpu, task->cpus_ptr) ||
 				     task_running(rq, task) ||
 				     !dl_task(task) ||
 				     !task_on_rq_queued(task))) {
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index f35930f5e528..8691a8fffe40 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1621,7 +1621,7 @@ static void task_numa_compare(struct task_numa_env *env,
 	 * be incurred if the tasks were swapped.
 	 */
 	/* Skip this swap candidate if cannot move to the source cpu */
-	if (!cpumask_test_cpu(env->src_cpu, &cur->cpus_allowed))
+	if (!cpumask_test_cpu(env->src_cpu, cur->cpus_ptr))
 		goto unlock;
 
 	/*
@@ -1718,7 +1718,7 @@ static void task_numa_find_cpu(struct task_numa_env *env,
 
 	for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) {
 		/* Skip this CPU if the source task cannot migrate */
-		if (!cpumask_test_cpu(cpu, &env->p->cpus_allowed))
+		if (!cpumask_test_cpu(cpu, env->p->cpus_ptr))
 			continue;
 
 		env->dst_cpu = cpu;
@@ -5831,7 +5831,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
 
 		/* Skip over this group if it has no CPUs allowed */
 		if (!cpumask_intersects(sched_group_span(group),
-					&p->cpus_allowed))
+					p->cpus_ptr))
 			continue;
 
 		local_group = cpumask_test_cpu(this_cpu,
@@ -5963,7 +5963,7 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this
 		return cpumask_first(sched_group_span(group));
 
 	/* Traverse only the allowed CPUs */
-	for_each_cpu_and(i, sched_group_span(group), &p->cpus_allowed) {
+	for_each_cpu_and(i, sched_group_span(group), p->cpus_ptr) {
 		if (available_idle_cpu(i)) {
 			struct rq *rq = cpu_rq(i);
 			struct cpuidle_state *idle = idle_get_state(rq);
@@ -6003,7 +6003,7 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p
 {
 	int new_cpu = cpu;
 
-	if (!cpumask_intersects(sched_domain_span(sd), &p->cpus_allowed))
+	if (!cpumask_intersects(sched_domain_span(sd), p->cpus_ptr))
 		return prev_cpu;
 
 	/*
@@ -6120,7 +6120,7 @@ static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int
 	if (!test_idle_cores(target, false))
 		return -1;
 
-	cpumask_and(cpus, sched_domain_span(sd), &p->cpus_allowed);
+	cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
 
 	for_each_cpu_wrap(core, cpus, target) {
 		bool idle = true;
@@ -6154,7 +6154,7 @@ static int select_idle_smt(struct task_struct *p, int target)
 		return -1;
 
 	for_each_cpu(cpu, cpu_smt_mask(target)) {
-		if (!cpumask_test_cpu(cpu, &p->cpus_allowed))
+		if (!cpumask_test_cpu(cpu, p->cpus_ptr))
 			continue;
 		if (available_idle_cpu(cpu))
 			return cpu;
@@ -6217,7 +6217,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t
 	for_each_cpu_wrap(cpu, sched_domain_span(sd), target) {
 		if (!--nr)
 			return -1;
-		if (!cpumask_test_cpu(cpu, &p->cpus_allowed))
+		if (!cpumask_test_cpu(cpu, p->cpus_ptr))
 			continue;
 		if (available_idle_cpu(cpu))
 			break;
@@ -6254,7 +6254,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
 	    recent_used_cpu != target &&
 	    cpus_share_cache(recent_used_cpu, target) &&
 	    available_idle_cpu(recent_used_cpu) &&
-	    cpumask_test_cpu(p->recent_used_cpu, &p->cpus_allowed)) {
+	    cpumask_test_cpu(p->recent_used_cpu, p->cpus_ptr)) {
 		/*
 		 * Replace recent_used_cpu with prev as it is a potential
 		 * candidate for the next wake:
@@ -6600,7 +6600,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
 		int max_spare_cap_cpu = -1;
 
 		for_each_cpu_and(cpu, perf_domain_span(pd), sched_domain_span(sd)) {
-			if (!cpumask_test_cpu(cpu, &p->cpus_allowed))
+			if (!cpumask_test_cpu(cpu, p->cpus_ptr))
 				continue;
 
 			/* Skip CPUs that will be overutilized. */
@@ -6689,7 +6689,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
 		}
 
 		want_affine = !wake_wide(p) && !wake_cap(p, cpu, prev_cpu) &&
-			      cpumask_test_cpu(cpu, &p->cpus_allowed);
+			      cpumask_test_cpu(cpu, p->cpus_ptr);
 	}
 
 	rcu_read_lock();
@@ -7445,14 +7445,14 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
 	/*
 	 * We do not migrate tasks that are:
 	 * 1) throttled_lb_pair, or
-	 * 2) cannot be migrated to this CPU due to cpus_allowed, or
+	 * 2) cannot be migrated to this CPU due to cpus_ptr, or
 	 * 3) running (obviously), or
 	 * 4) are cache-hot on their current CPU.
 	 */
 	if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
 		return 0;
 
-	if (!cpumask_test_cpu(env->dst_cpu, &p->cpus_allowed)) {
+	if (!cpumask_test_cpu(env->dst_cpu, p->cpus_ptr)) {
 		int cpu;
 
 		schedstat_inc(p->se.statistics.nr_failed_migrations_affine);
@@ -7472,7 +7472,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
 
 		/* Prevent to re-select dst_cpu via env's CPUs: */
 		for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) {
-			if (cpumask_test_cpu(cpu, &p->cpus_allowed)) {
+			if (cpumask_test_cpu(cpu, p->cpus_ptr)) {
 				env->flags |= LBF_DST_PINNED;
 				env->new_dst_cpu = cpu;
 				break;
@@ -8099,7 +8099,7 @@ static inline int check_misfit_status(struct rq *rq, struct sched_domain *sd)
 
 /*
  * Group imbalance indicates (and tries to solve) the problem where balancing
- * groups is inadequate due to ->cpus_allowed constraints.
+ * groups is inadequate due to ->cpus_ptr constraints.
  *
  * Imagine a situation of two groups of 4 CPUs each and 4 tasks each with a
  * cpumask covering 1 CPU of the first group and 3 CPUs of the second group.
@@ -8768,7 +8768,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
 	/*
 	 * If the busiest group is imbalanced the below checks don't
 	 * work because they assume all things are equal, which typically
-	 * isn't true due to cpus_allowed constraints and the like.
+	 * isn't true due to cpus_ptr constraints and the like.
 	 */
 	if (busiest->group_type == group_imbalanced)
 		goto force_balance;
@@ -9210,7 +9210,7 @@ more_balance:
 			 * if the curr task on busiest CPU can't be
 			 * moved to this_cpu:
 			 */
-			if (!cpumask_test_cpu(this_cpu, &busiest->curr->cpus_allowed)) {
+			if (!cpumask_test_cpu(this_cpu, busiest->curr->cpus_ptr)) {
 				raw_spin_unlock_irqrestore(&busiest->lock,
 							    flags);
 				env.flags |= LBF_ALL_PINNED;
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 1e6b909dca36..63ad7c90822c 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1614,7 +1614,7 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
 static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
 {
 	if (!task_running(rq, p) &&
-	    cpumask_test_cpu(cpu, &p->cpus_allowed))
+	    cpumask_test_cpu(cpu, p->cpus_ptr))
 		return 1;
 
 	return 0;
@@ -1751,7 +1751,7 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
 			 * Also make sure that it wasn't scheduled on its rq.
 			 */
 			if (unlikely(task_rq(task) != rq ||
-				     !cpumask_test_cpu(lowest_rq->cpu, &task->cpus_allowed) ||
+				     !cpumask_test_cpu(lowest_rq->cpu, task->cpus_ptr) ||
 				     task_running(rq, task) ||
 				     !rt_task(task) ||
 				     !task_on_rq_queued(task))) {
diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c
index 1e6db9cbe4dc..fa95139445b2 100644
--- a/kernel/trace/trace_hwlat.c
+++ b/kernel/trace/trace_hwlat.c
@@ -277,7 +277,7 @@ static void move_to_next_cpu(void)
 	 * of this thread, than stop migrating for the duration
 	 * of the current test.
 	 */
-	if (!cpumask_equal(current_mask, &current->cpus_allowed))
+	if (!cpumask_equal(current_mask, current->cpus_ptr))
 		goto disable;
 
 	get_online_cpus();
diff --git a/lib/smp_processor_id.c b/lib/smp_processor_id.c
index 157d9e31f6c2..60ba93fc42ce 100644
--- a/lib/smp_processor_id.c
+++ b/lib/smp_processor_id.c
@@ -23,7 +23,7 @@ unsigned int check_preemption_disabled(const char *what1, const char *what2)
 	 * Kernel threads bound to a single CPU can safely use
 	 * smp_processor_id():
 	 */
-	if (cpumask_equal(&current->cpus_allowed, cpumask_of(this_cpu)))
+	if (cpumask_equal(current->cpus_ptr, cpumask_of(this_cpu)))
 		goto out;
 
 	/*
diff --git a/samples/trace_events/trace-events-sample.c b/samples/trace_events/trace-events-sample.c
index 1da597aa6141..1a72b7d95cdc 100644
--- a/samples/trace_events/trace-events-sample.c
+++ b/samples/trace_events/trace-events-sample.c
@@ -34,7 +34,7 @@ static void simple_thread_func(int cnt)
 
 	/* Silly tracepoints */
 	trace_foo_bar("hello", cnt, array, random_strings[len],
-		      &current->cpus_allowed);
+		      current->cpus_ptr);
 
 	trace_foo_with_template_simple("HELLO", cnt);
 
-- 
cgit v1.2.3


From 5e83eafbfd3b351537c0d74467fc43e8a88f4ae4 Mon Sep 17 00:00:00 2001
From: Dietmar Eggemann <dietmar.eggemann@arm.com>
Date: Mon, 27 May 2019 07:21:10 +0100
Subject: sched/fair: Remove the rq->cpu_load[] update code

With LB_BIAS disabled, there is no need to update the rq->cpu_load[idx]
any more.

Signed-off-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Rik van Riel <riel@surriel.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Morten Rasmussen <morten.rasmussen@arm.com>
Cc: Patrick Bellasi <patrick.bellasi@arm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Quentin Perret <quentin.perret@arm.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Valentin Schneider <valentin.schneider@arm.com>
Cc: Vincent Guittot <vincent.guittot@linaro.org>
Link: https://lkml.kernel.org/r/20190527062116.11512-2-dietmar.eggemann@arm.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched/nohz.h |   8 --
 kernel/sched/core.c        |   1 -
 kernel/sched/fair.c        | 255 ---------------------------------------------
 kernel/sched/sched.h       |   6 --
 kernel/time/tick-sched.c   |   2 -
 5 files changed, 272 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched/nohz.h b/include/linux/sched/nohz.h
index b36f4cf38111..1abe91ff6e4a 100644
--- a/include/linux/sched/nohz.h
+++ b/include/linux/sched/nohz.h
@@ -6,14 +6,6 @@
  * This is the interface between the scheduler and nohz/dynticks:
  */
 
-#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
-extern void cpu_load_update_nohz_start(void);
-extern void cpu_load_update_nohz_stop(void);
-#else
-static inline void cpu_load_update_nohz_start(void) { }
-static inline void cpu_load_update_nohz_stop(void) { }
-#endif
-
 #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
 extern void nohz_balance_enter_idle(int cpu);
 extern int get_nohz_timer_target(void);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 93ab85f0d076..00b8966802a8 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3033,7 +3033,6 @@ void scheduler_tick(void)
 
 	update_rq_clock(rq);
 	curr->sched_class->task_tick(rq, curr, 0);
-	cpu_load_update_active(rq);
 	calc_global_load_tick(rq);
 	psi_task_tick(rq);
 
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 08b1cb06f968..1aab323f1b4b 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5322,71 +5322,6 @@ DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
 DEFINE_PER_CPU(cpumask_var_t, select_idle_mask);
 
 #ifdef CONFIG_NO_HZ_COMMON
-/*
- * per rq 'load' arrray crap; XXX kill this.
- */
-
-/*
- * The exact cpuload calculated at every tick would be:
- *
- *   load' = (1 - 1/2^i) * load + (1/2^i) * cur_load
- *
- * If a CPU misses updates for n ticks (as it was idle) and update gets
- * called on the n+1-th tick when CPU may be busy, then we have:
- *
- *   load_n   = (1 - 1/2^i)^n * load_0
- *   load_n+1 = (1 - 1/2^i)   * load_n + (1/2^i) * cur_load
- *
- * decay_load_missed() below does efficient calculation of
- *
- *   load' = (1 - 1/2^i)^n * load
- *
- * Because x^(n+m) := x^n * x^m we can decompose any x^n in power-of-2 factors.
- * This allows us to precompute the above in said factors, thereby allowing the
- * reduction of an arbitrary n in O(log_2 n) steps. (See also
- * fixed_power_int())
- *
- * The calculation is approximated on a 128 point scale.
- */
-#define DEGRADE_SHIFT		7
-
-static const u8 degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128};
-static const u8 degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
-	{   0,   0,  0,  0,  0,  0, 0, 0 },
-	{  64,  32,  8,  0,  0,  0, 0, 0 },
-	{  96,  72, 40, 12,  1,  0, 0, 0 },
-	{ 112,  98, 75, 43, 15,  1, 0, 0 },
-	{ 120, 112, 98, 76, 45, 16, 2, 0 }
-};
-
-/*
- * Update cpu_load for any missed ticks, due to tickless idle. The backlog
- * would be when CPU is idle and so we just decay the old load without
- * adding any new load.
- */
-static unsigned long
-decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
-{
-	int j = 0;
-
-	if (!missed_updates)
-		return load;
-
-	if (missed_updates >= degrade_zero_ticks[idx])
-		return 0;
-
-	if (idx == 1)
-		return load >> missed_updates;
-
-	while (missed_updates) {
-		if (missed_updates % 2)
-			load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;
-
-		missed_updates >>= 1;
-		j++;
-	}
-	return load;
-}
 
 static struct {
 	cpumask_var_t idle_cpus_mask;
@@ -5398,201 +5333,12 @@ static struct {
 
 #endif /* CONFIG_NO_HZ_COMMON */
 
-/**
- * __cpu_load_update - update the rq->cpu_load[] statistics
- * @this_rq: The rq to update statistics for
- * @this_load: The current load
- * @pending_updates: The number of missed updates
- *
- * Update rq->cpu_load[] statistics. This function is usually called every
- * scheduler tick (TICK_NSEC).
- *
- * This function computes a decaying average:
- *
- *   load[i]' = (1 - 1/2^i) * load[i] + (1/2^i) * load
- *
- * Because of NOHZ it might not get called on every tick which gives need for
- * the @pending_updates argument.
- *
- *   load[i]_n = (1 - 1/2^i) * load[i]_n-1 + (1/2^i) * load_n-1
- *             = A * load[i]_n-1 + B ; A := (1 - 1/2^i), B := (1/2^i) * load
- *             = A * (A * load[i]_n-2 + B) + B
- *             = A * (A * (A * load[i]_n-3 + B) + B) + B
- *             = A^3 * load[i]_n-3 + (A^2 + A + 1) * B
- *             = A^n * load[i]_0 + (A^(n-1) + A^(n-2) + ... + 1) * B
- *             = A^n * load[i]_0 + ((1 - A^n) / (1 - A)) * B
- *             = (1 - 1/2^i)^n * (load[i]_0 - load) + load
- *
- * In the above we've assumed load_n := load, which is true for NOHZ_FULL as
- * any change in load would have resulted in the tick being turned back on.
- *
- * For regular NOHZ, this reduces to:
- *
- *   load[i]_n = (1 - 1/2^i)^n * load[i]_0
- *
- * see decay_load_misses(). For NOHZ_FULL we get to subtract and add the extra
- * term.
- */
-static void cpu_load_update(struct rq *this_rq, unsigned long this_load,
-			    unsigned long pending_updates)
-{
-	unsigned long __maybe_unused tickless_load = this_rq->cpu_load[0];
-	int i, scale;
-
-	this_rq->nr_load_updates++;
-
-	/* Update our load: */
-	this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */
-	for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
-		unsigned long old_load, new_load;
-
-		/* scale is effectively 1 << i now, and >> i divides by scale */
-
-		old_load = this_rq->cpu_load[i];
-#ifdef CONFIG_NO_HZ_COMMON
-		old_load = decay_load_missed(old_load, pending_updates - 1, i);
-		if (tickless_load) {
-			old_load -= decay_load_missed(tickless_load, pending_updates - 1, i);
-			/*
-			 * old_load can never be a negative value because a
-			 * decayed tickless_load cannot be greater than the
-			 * original tickless_load.
-			 */
-			old_load += tickless_load;
-		}
-#endif
-		new_load = this_load;
-		/*
-		 * Round up the averaging division if load is increasing. This
-		 * prevents us from getting stuck on 9 if the load is 10, for
-		 * example.
-		 */
-		if (new_load > old_load)
-			new_load += scale - 1;
-
-		this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
-	}
-}
-
 /* Used instead of source_load when we know the type == 0 */
 static unsigned long weighted_cpuload(struct rq *rq)
 {
 	return cfs_rq_runnable_load_avg(&rq->cfs);
 }
 
-#ifdef CONFIG_NO_HZ_COMMON
-/*
- * There is no sane way to deal with nohz on smp when using jiffies because the
- * CPU doing the jiffies update might drift wrt the CPU doing the jiffy reading
- * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}.
- *
- * Therefore we need to avoid the delta approach from the regular tick when
- * possible since that would seriously skew the load calculation. This is why we
- * use cpu_load_update_periodic() for CPUs out of nohz. However we'll rely on
- * jiffies deltas for updates happening while in nohz mode (idle ticks, idle
- * loop exit, nohz_idle_balance, nohz full exit...)
- *
- * This means we might still be one tick off for nohz periods.
- */
-
-static void cpu_load_update_nohz(struct rq *this_rq,
-				 unsigned long curr_jiffies,
-				 unsigned long load)
-{
-	unsigned long pending_updates;
-
-	pending_updates = curr_jiffies - this_rq->last_load_update_tick;
-	if (pending_updates) {
-		this_rq->last_load_update_tick = curr_jiffies;
-		/*
-		 * In the regular NOHZ case, we were idle, this means load 0.
-		 * In the NOHZ_FULL case, we were non-idle, we should consider
-		 * its weighted load.
-		 */
-		cpu_load_update(this_rq, load, pending_updates);
-	}
-}
-
-/*
- * Called from nohz_idle_balance() to update the load ratings before doing the
- * idle balance.
- */
-static void cpu_load_update_idle(struct rq *this_rq)
-{
-	/*
-	 * bail if there's load or we're actually up-to-date.
-	 */
-	if (weighted_cpuload(this_rq))
-		return;
-
-	cpu_load_update_nohz(this_rq, READ_ONCE(jiffies), 0);
-}
-
-/*
- * Record CPU load on nohz entry so we know the tickless load to account
- * on nohz exit. cpu_load[0] happens then to be updated more frequently
- * than other cpu_load[idx] but it should be fine as cpu_load readers
- * shouldn't rely into synchronized cpu_load[*] updates.
- */
-void cpu_load_update_nohz_start(void)
-{
-	struct rq *this_rq = this_rq();
-
-	/*
-	 * This is all lockless but should be fine. If weighted_cpuload changes
-	 * concurrently we'll exit nohz. And cpu_load write can race with
-	 * cpu_load_update_idle() but both updater would be writing the same.
-	 */
-	this_rq->cpu_load[0] = weighted_cpuload(this_rq);
-}
-
-/*
- * Account the tickless load in the end of a nohz frame.
- */
-void cpu_load_update_nohz_stop(void)
-{
-	unsigned long curr_jiffies = READ_ONCE(jiffies);
-	struct rq *this_rq = this_rq();
-	unsigned long load;
-	struct rq_flags rf;
-
-	if (curr_jiffies == this_rq->last_load_update_tick)
-		return;
-
-	load = weighted_cpuload(this_rq);
-	rq_lock(this_rq, &rf);
-	update_rq_clock(this_rq);
-	cpu_load_update_nohz(this_rq, curr_jiffies, load);
-	rq_unlock(this_rq, &rf);
-}
-#else /* !CONFIG_NO_HZ_COMMON */
-static inline void cpu_load_update_nohz(struct rq *this_rq,
-					unsigned long curr_jiffies,
-					unsigned long load) { }
-#endif /* CONFIG_NO_HZ_COMMON */
-
-static void cpu_load_update_periodic(struct rq *this_rq, unsigned long load)
-{
-#ifdef CONFIG_NO_HZ_COMMON
-	/* See the mess around cpu_load_update_nohz(). */
-	this_rq->last_load_update_tick = READ_ONCE(jiffies);
-#endif
-	cpu_load_update(this_rq, load, 1);
-}
-
-/*
- * Called from scheduler_tick()
- */
-void cpu_load_update_active(struct rq *this_rq)
-{
-	unsigned long load = weighted_cpuload(this_rq);
-
-	if (tick_nohz_tick_stopped())
-		cpu_load_update_nohz(this_rq, READ_ONCE(jiffies), load);
-	else
-		cpu_load_update_periodic(this_rq, load);
-}
-
 /*
  * Return a low guess at the load of a migration-source CPU weighted
  * according to the scheduling class and "nice" value.
@@ -9876,7 +9622,6 @@ static bool _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
 
 			rq_lock_irqsave(rq, &rf);
 			update_rq_clock(rq);
-			cpu_load_update_idle(rq);
 			rq_unlock_irqrestore(rq, &rf);
 
 			if (flags & NOHZ_BALANCE_KICK)
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index c308410675ed..3750b5e53792 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -96,12 +96,6 @@ extern atomic_long_t calc_load_tasks;
 extern void calc_global_load_tick(struct rq *this_rq);
 extern long calc_load_fold_active(struct rq *this_rq, long adjust);
 
-#ifdef CONFIG_SMP
-extern void cpu_load_update_active(struct rq *this_rq);
-#else
-static inline void cpu_load_update_active(struct rq *this_rq) { }
-#endif
-
 /*
  * Helpers for converting nanosecond timing to jiffy resolution
  */
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index f4ee1a3428ae..be9707f68024 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -782,7 +782,6 @@ static void tick_nohz_stop_tick(struct tick_sched *ts, int cpu)
 	 */
 	if (!ts->tick_stopped) {
 		calc_load_nohz_start();
-		cpu_load_update_nohz_start();
 		quiet_vmstat();
 
 		ts->last_tick = hrtimer_get_expires(&ts->sched_timer);
@@ -829,7 +828,6 @@ static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now)
 {
 	/* Update jiffies first */
 	tick_do_update_jiffies64(now);
-	cpu_load_update_nohz_stop();
 
 	/*
 	 * Clear the timer idle flag, so we avoid IPIs on remote queueing and
-- 
cgit v1.2.3


From 0e1fef63d92d61ed561e504c3a078a827a0f9bfe Mon Sep 17 00:00:00 2001
From: Dietmar Eggemann <dietmar.eggemann@arm.com>
Date: Mon, 27 May 2019 07:21:14 +0100
Subject: sched/core: Remove sd->*_idx

The sched domain per rq load index files also disappear from the
/proc/sys/kernel/sched_domain/cpuX/domainY directories.

Signed-off-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Rik van Riel <riel@surriel.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Morten Rasmussen <morten.rasmussen@arm.com>
Cc: Patrick Bellasi <patrick.bellasi@arm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Quentin Perret <quentin.perret@arm.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Valentin Schneider <valentin.schneider@arm.com>
Cc: Vincent Guittot <vincent.guittot@linaro.org>
Link: https://lkml.kernel.org/r/20190527062116.11512-6-dietmar.eggemann@arm.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched/topology.h |  5 -----
 kernel/sched/debug.c           | 25 ++++++++++---------------
 kernel/sched/topology.c        | 10 ----------
 3 files changed, 10 insertions(+), 30 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index cfc0a89a7159..53afbe07354a 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -84,11 +84,6 @@ struct sched_domain {
 	unsigned int busy_factor;	/* less balancing by factor if busy */
 	unsigned int imbalance_pct;	/* No balance until over watermark */
 	unsigned int cache_nice_tries;	/* Leave cache hot tasks for # tries */
-	unsigned int busy_idx;
-	unsigned int idle_idx;
-	unsigned int newidle_idx;
-	unsigned int wake_idx;
-	unsigned int forkexec_idx;
 
 	int nohz_idle;			/* NOHZ IDLE status */
 	int flags;			/* See SD_* */
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index a0b0d6e21e5b..7ffde8ce82fd 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -251,25 +251,20 @@ set_table_entry(struct ctl_table *entry,
 static struct ctl_table *
 sd_alloc_ctl_domain_table(struct sched_domain *sd)
 {
-	struct ctl_table *table = sd_alloc_ctl_entry(14);
+	struct ctl_table *table = sd_alloc_ctl_entry(9);
 
 	if (table == NULL)
 		return NULL;
 
-	set_table_entry(&table[0],  "min_interval",	   &sd->min_interval,	     sizeof(long), 0644, proc_doulongvec_minmax);
-	set_table_entry(&table[1],  "max_interval",	   &sd->max_interval,	     sizeof(long), 0644, proc_doulongvec_minmax);
-	set_table_entry(&table[2],  "busy_idx",		   &sd->busy_idx,	     sizeof(int),  0644, proc_dointvec_minmax);
-	set_table_entry(&table[3],  "idle_idx",		   &sd->idle_idx,	     sizeof(int),  0644, proc_dointvec_minmax);
-	set_table_entry(&table[4],  "newidle_idx",	   &sd->newidle_idx,	     sizeof(int),  0644, proc_dointvec_minmax);
-	set_table_entry(&table[5],  "wake_idx",		   &sd->wake_idx,	     sizeof(int),  0644, proc_dointvec_minmax);
-	set_table_entry(&table[6],  "forkexec_idx",	   &sd->forkexec_idx,	     sizeof(int),  0644, proc_dointvec_minmax);
-	set_table_entry(&table[7],  "busy_factor",	   &sd->busy_factor,	     sizeof(int),  0644, proc_dointvec_minmax);
-	set_table_entry(&table[8],  "imbalance_pct",	   &sd->imbalance_pct,	     sizeof(int),  0644, proc_dointvec_minmax);
-	set_table_entry(&table[9],  "cache_nice_tries",	   &sd->cache_nice_tries,    sizeof(int),  0644, proc_dointvec_minmax);
-	set_table_entry(&table[10], "flags",		   &sd->flags,		     sizeof(int),  0644, proc_dointvec_minmax);
-	set_table_entry(&table[11], "max_newidle_lb_cost", &sd->max_newidle_lb_cost, sizeof(long), 0644, proc_doulongvec_minmax);
-	set_table_entry(&table[12], "name",		   sd->name,		CORENAME_MAX_SIZE, 0444, proc_dostring);
-	/* &table[13] is terminator */
+	set_table_entry(&table[0], "min_interval",	  &sd->min_interval,	    sizeof(long), 0644, proc_doulongvec_minmax);
+	set_table_entry(&table[1], "max_interval",	  &sd->max_interval,	    sizeof(long), 0644, proc_doulongvec_minmax);
+	set_table_entry(&table[2], "busy_factor",	  &sd->busy_factor,	    sizeof(int),  0644, proc_dointvec_minmax);
+	set_table_entry(&table[3], "imbalance_pct",	  &sd->imbalance_pct,	    sizeof(int),  0644, proc_dointvec_minmax);
+	set_table_entry(&table[4], "cache_nice_tries",	  &sd->cache_nice_tries,    sizeof(int),  0644, proc_dointvec_minmax);
+	set_table_entry(&table[5], "flags",		  &sd->flags,		    sizeof(int),  0644, proc_dointvec_minmax);
+	set_table_entry(&table[6], "max_newidle_lb_cost", &sd->max_newidle_lb_cost, sizeof(long), 0644, proc_doulongvec_minmax);
+	set_table_entry(&table[7], "name",		  sd->name,	       CORENAME_MAX_SIZE, 0444, proc_dostring);
+	/* &table[8] is terminator */
 
 	return table;
 }
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index f53f89df837d..63184cf0d0d7 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -1344,11 +1344,6 @@ sd_init(struct sched_domain_topology_level *tl,
 		.imbalance_pct		= 125,
 
 		.cache_nice_tries	= 0,
-		.busy_idx		= 0,
-		.idle_idx		= 0,
-		.newidle_idx		= 0,
-		.wake_idx		= 0,
-		.forkexec_idx		= 0,
 
 		.flags			= 1*SD_LOAD_BALANCE
 					| 1*SD_BALANCE_NEWIDLE
@@ -1400,13 +1395,10 @@ sd_init(struct sched_domain_topology_level *tl,
 	} else if (sd->flags & SD_SHARE_PKG_RESOURCES) {
 		sd->imbalance_pct = 117;
 		sd->cache_nice_tries = 1;
-		sd->busy_idx = 2;
 
 #ifdef CONFIG_NUMA
 	} else if (sd->flags & SD_NUMA) {
 		sd->cache_nice_tries = 2;
-		sd->busy_idx = 3;
-		sd->idle_idx = 2;
 
 		sd->flags &= ~SD_PREFER_SIBLING;
 		sd->flags |= SD_SERIALIZE;
@@ -1419,8 +1411,6 @@ sd_init(struct sched_domain_topology_level *tl,
 #endif
 	} else {
 		sd->cache_nice_tries = 1;
-		sd->busy_idx = 2;
-		sd->idle_idx = 1;
 	}
 
 	/*
-- 
cgit v1.2.3


From d16dbd1b8a29bb9f8aca2c2f3bd1a0d2b7621126 Mon Sep 17 00:00:00 2001
From: Yuyang Du <duyuyang@gmail.com>
Date: Mon, 6 May 2019 16:19:22 +0800
Subject: locking/lockdep: Update obsolete struct field description

The lock_chain struct definition has an outdated comment; update it and add
descriptions of the struct members.

Signed-off-by: Yuyang Du <duyuyang@gmail.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: bvanassche@acm.org
Cc: frederic@kernel.org
Cc: ming.lei@redhat.com
Cc: will.deacon@arm.com
Link: https://lkml.kernel.org/r/20190506081939.74287-7-duyuyang@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/lockdep.h | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h
index 6e2377e6c1d6..851d44fa5457 100644
--- a/include/linux/lockdep.h
+++ b/include/linux/lockdep.h
@@ -203,11 +203,17 @@ struct lock_list {
 	struct lock_list		*parent;
 };
 
-/*
- * We record lock dependency chains, so that we can cache them:
+/**
+ * struct lock_chain - lock dependency chain record
+ *
+ * @irq_context: the same as irq_context in held_lock below
+ * @depth:       the number of held locks in this chain
+ * @base:        the index in chain_hlocks for this chain
+ * @entry:       the collided lock chains in lock_chain hash list
+ * @chain_key:   the hash key of this lock_chain
  */
 struct lock_chain {
-	/* see BUILD_BUG_ON()s in lookup_chain_cache() */
+	/* see BUILD_BUG_ON()s in add_chain_cache() */
 	unsigned int			irq_context :  2,
 					depth       :  6,
 					base	    : 24;
-- 
cgit v1.2.3


From e196e479a3b844da6e6e71e0d2a8694040cb4e52 Mon Sep 17 00:00:00 2001
From: Yuyang Du <duyuyang@gmail.com>
Date: Mon, 6 May 2019 16:19:23 +0800
Subject: locking/lockdep: Use lockdep_init_task for task initiation
 consistently

Although there is a lockdep_init_task(), it does nothing; lockdep instead
initializes tasks by assigning the lockdep fields directly, and does so
inconsistently. Fix this by using lockdep_init_task().

Signed-off-by: Yuyang Du <duyuyang@gmail.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: bvanassche@acm.org
Cc: frederic@kernel.org
Cc: ming.lei@redhat.com
Cc: will.deacon@arm.com
Link: https://lkml.kernel.org/r/20190506081939.74287-8-duyuyang@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/lockdep.h  |  7 ++++++-
 init/init_task.c         |  2 ++
 kernel/fork.c            |  3 ---
 kernel/locking/lockdep.c | 11 ++++++++---
 4 files changed, 16 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h
index 851d44fa5457..5d05b8149f19 100644
--- a/include/linux/lockdep.h
+++ b/include/linux/lockdep.h
@@ -287,6 +287,8 @@ extern void lockdep_free_key_range(void *start, unsigned long size);
 extern asmlinkage void lockdep_sys_exit(void);
 extern void lockdep_set_selftest_task(struct task_struct *task);
 
+extern void lockdep_init_task(struct task_struct *task);
+
 extern void lockdep_off(void);
 extern void lockdep_on(void);
 
@@ -411,6 +413,10 @@ extern void lock_unpin_lock(struct lockdep_map *lock, struct pin_cookie);
 
 #else /* !CONFIG_LOCKDEP */
 
+static inline void lockdep_init_task(struct task_struct *task)
+{
+}
+
 static inline void lockdep_off(void)
 {
 }
@@ -503,7 +509,6 @@ enum xhlock_context_t {
 	{ .name = (_name), .key = (void *)(_key), }
 
 static inline void lockdep_invariant_state(bool force) {}
-static inline void lockdep_init_task(struct task_struct *task) {}
 static inline void lockdep_free_task(struct task_struct *task) {}
 
 #ifdef CONFIG_LOCK_STAT
diff --git a/init/init_task.c b/init/init_task.c
index c70ef656d0f4..1b15cb90d64f 100644
--- a/init/init_task.c
+++ b/init/init_task.c
@@ -166,6 +166,8 @@ struct task_struct init_task
 	.softirqs_enabled = 1,
 #endif
 #ifdef CONFIG_LOCKDEP
+	.lockdep_depth = 0, /* no locks held yet */
+	.curr_chain_key = 0,
 	.lockdep_recursion = 0,
 #endif
 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
diff --git a/kernel/fork.c b/kernel/fork.c
index 75675b9bf6df..735d0b4a89e2 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1984,9 +1984,6 @@ static __latent_entropy struct task_struct *copy_process(
 	p->pagefault_disabled = 0;
 
 #ifdef CONFIG_LOCKDEP
-	p->lockdep_depth = 0; /* no locks held yet */
-	p->curr_chain_key = 0;
-	p->lockdep_recursion = 0;
 	lockdep_init_task(p);
 #endif
 
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index bc1efc12a8c5..b7d9c28ecf3b 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -359,6 +359,13 @@ static inline u64 iterate_chain_key(u64 key, u32 idx)
 	return k0 | (u64)k1 << 32;
 }
 
+void lockdep_init_task(struct task_struct *task)
+{
+	task->lockdep_depth = 0; /* no locks held yet */
+	task->curr_chain_key = 0;
+	task->lockdep_recursion = 0;
+}
+
 void lockdep_off(void)
 {
 	current->lockdep_recursion++;
@@ -4589,9 +4596,7 @@ void lockdep_reset(void)
 	int i;
 
 	raw_local_irq_save(flags);
-	current->curr_chain_key = 0;
-	current->lockdep_depth = 0;
-	current->lockdep_recursion = 0;
+	lockdep_init_task(current);
 	memset(current->held_locks, 0, MAX_LOCK_DEPTH*sizeof(struct held_lock));
 	nr_hardirq_chains = 0;
 	nr_softirq_chains = 0;
-- 
cgit v1.2.3


From f6ec8829ac9d59b637366c13038f15d6f6156fe1 Mon Sep 17 00:00:00 2001
From: Yuyang Du <duyuyang@gmail.com>
Date: Mon, 6 May 2019 16:19:24 +0800
Subject: locking/lockdep: Define INITIAL_CHAIN_KEY for chain keys to start
 with

Chain keys are computed using the Jenkins hash function, which needs an initial
hash to start with. Dedicate a macro to make this clear and configurable. A
later patch changes this initial chain key.

Signed-off-by: Yuyang Du <duyuyang@gmail.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: bvanassche@acm.org
Cc: frederic@kernel.org
Cc: ming.lei@redhat.com
Cc: will.deacon@arm.com
Link: https://lkml.kernel.org/r/20190506081939.74287-9-duyuyang@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/lockdep.h  |  1 +
 init/init_task.c         |  2 +-
 kernel/locking/lockdep.c | 18 +++++++++---------
 3 files changed, 11 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h
index 5d05b8149f19..d4e69595dbd4 100644
--- a/include/linux/lockdep.h
+++ b/include/linux/lockdep.h
@@ -229,6 +229,7 @@ struct lock_chain {
  * bitfield and hitting the BUG in hlock_class().
  */
 #define MAX_LOCKDEP_KEYS		((1UL << MAX_LOCKDEP_KEYS_BITS) - 1)
+#define INITIAL_CHAIN_KEY		0
 
 struct held_lock {
 	/*
diff --git a/init/init_task.c b/init/init_task.c
index 1b15cb90d64f..afa6ad795355 100644
--- a/init/init_task.c
+++ b/init/init_task.c
@@ -167,7 +167,7 @@ struct task_struct init_task
 #endif
 #ifdef CONFIG_LOCKDEP
 	.lockdep_depth = 0, /* no locks held yet */
-	.curr_chain_key = 0,
+	.curr_chain_key = INITIAL_CHAIN_KEY,
 	.lockdep_recursion = 0,
 #endif
 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index b7d9c28ecf3b..9edf6f12b711 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -362,7 +362,7 @@ static inline u64 iterate_chain_key(u64 key, u32 idx)
 void lockdep_init_task(struct task_struct *task)
 {
 	task->lockdep_depth = 0; /* no locks held yet */
-	task->curr_chain_key = 0;
+	task->curr_chain_key = INITIAL_CHAIN_KEY;
 	task->lockdep_recursion = 0;
 }
 
@@ -857,7 +857,7 @@ static u16 chain_hlocks[MAX_LOCKDEP_CHAIN_HLOCKS];
 static bool check_lock_chain_key(struct lock_chain *chain)
 {
 #ifdef CONFIG_PROVE_LOCKING
-	u64 chain_key = 0;
+	u64 chain_key = INITIAL_CHAIN_KEY;
 	int i;
 
 	for (i = chain->base; i < chain->base + chain->depth; i++)
@@ -2524,7 +2524,7 @@ static void
 print_chain_keys_held_locks(struct task_struct *curr, struct held_lock *hlock_next)
 {
 	struct held_lock *hlock;
-	u64 chain_key = 0;
+	u64 chain_key = INITIAL_CHAIN_KEY;
 	int depth = curr->lockdep_depth;
 	int i = get_first_held_lock(curr, hlock_next);
 
@@ -2544,7 +2544,7 @@ print_chain_keys_held_locks(struct task_struct *curr, struct held_lock *hlock_ne
 static void print_chain_keys_chain(struct lock_chain *chain)
 {
 	int i;
-	u64 chain_key = 0;
+	u64 chain_key = INITIAL_CHAIN_KEY;
 	int class_id;
 
 	printk("depth: %u\n", chain->depth);
@@ -2848,7 +2848,7 @@ static void check_chain_key(struct task_struct *curr)
 #ifdef CONFIG_DEBUG_LOCKDEP
 	struct held_lock *hlock, *prev_hlock = NULL;
 	unsigned int i;
-	u64 chain_key = 0;
+	u64 chain_key = INITIAL_CHAIN_KEY;
 
 	for (i = 0; i < curr->lockdep_depth; i++) {
 		hlock = curr->held_locks + i;
@@ -2872,7 +2872,7 @@ static void check_chain_key(struct task_struct *curr)
 
 		if (prev_hlock && (prev_hlock->irq_context !=
 							hlock->irq_context))
-			chain_key = 0;
+			chain_key = INITIAL_CHAIN_KEY;
 		chain_key = iterate_chain_key(chain_key, hlock->class_idx);
 		prev_hlock = hlock;
 	}
@@ -3787,14 +3787,14 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
 		/*
 		 * How can we have a chain hash when we ain't got no keys?!
 		 */
-		if (DEBUG_LOCKS_WARN_ON(chain_key != 0))
+		if (DEBUG_LOCKS_WARN_ON(chain_key != INITIAL_CHAIN_KEY))
 			return 0;
 		chain_head = 1;
 	}
 
 	hlock->prev_chain_key = chain_key;
 	if (separate_irq_context(curr, hlock)) {
-		chain_key = 0;
+		chain_key = INITIAL_CHAIN_KEY;
 		chain_head = 1;
 	}
 	chain_key = iterate_chain_key(chain_key, class_idx);
@@ -4636,7 +4636,7 @@ static void remove_class_from_lock_chain(struct pending_free *pf,
 	return;
 
 recalc:
-	chain_key = 0;
+	chain_key = INITIAL_CHAIN_KEY;
 	for (i = chain->base; i < chain->base + chain->depth; i++)
 		chain_key = iterate_chain_key(chain_key, chain_hlocks[i] + 1);
 	if (chain->depth && chain->chain_key == chain_key)
-- 
cgit v1.2.3


From 01bb6f0af992a1e6b7797d92fd31a7864872e347 Mon Sep 17 00:00:00 2001
From: Yuyang Du <duyuyang@gmail.com>
Date: Mon, 6 May 2019 16:19:25 +0800
Subject: locking/lockdep: Change the range of class_idx in held_lock struct

held_lock->class_idx is used to point to the class of the held lock. The
index is shifted by 1 to make index 0 mean no class, which requires shifting
the class index back and forth; that is not worth doing.

The reason is: (1) there will be no "no-class" held_lock to begin with, and
(2) index 0 seems to be used for error checking, but if something did go
wrong, the index cannot be counted on to reveal it, because whatever went
wrong will not set class_idx to 0 on purpose to tell us so.

Therefore, change the index to start from 0. This saves a lot of
back-and-forth shifts and a class slot back to lock_classes.

Since index 0 is now used for lock class, we change the initial chain key to
-1 to avoid key collision, which is due to the fact that __jhash_mix(0, 0, 0) = 0.
Actually, the initial chain key can be any arbitrary value other than 0.

In addition, a bitmap is maintained to keep track of the used lock classes,
and we check the validity of the held lock against that bitmap.

Signed-off-by: Yuyang Du <duyuyang@gmail.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: bvanassche@acm.org
Cc: frederic@kernel.org
Cc: ming.lei@redhat.com
Cc: will.deacon@arm.com
Link: https://lkml.kernel.org/r/20190506081939.74287-10-duyuyang@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/lockdep.h  | 14 ++++++------
 kernel/locking/lockdep.c | 59 ++++++++++++++++++++++++++++++++----------------
 2 files changed, 46 insertions(+), 27 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h
index d4e69595dbd4..30a0f81aa130 100644
--- a/include/linux/lockdep.h
+++ b/include/linux/lockdep.h
@@ -223,13 +223,8 @@ struct lock_chain {
 };
 
 #define MAX_LOCKDEP_KEYS_BITS		13
-/*
- * Subtract one because we offset hlock->class_idx by 1 in order
- * to make 0 mean no class. This avoids overflowing the class_idx
- * bitfield and hitting the BUG in hlock_class().
- */
-#define MAX_LOCKDEP_KEYS		((1UL << MAX_LOCKDEP_KEYS_BITS) - 1)
-#define INITIAL_CHAIN_KEY		0
+#define MAX_LOCKDEP_KEYS		(1UL << MAX_LOCKDEP_KEYS_BITS)
+#define INITIAL_CHAIN_KEY		-1
 
 struct held_lock {
 	/*
@@ -254,6 +249,11 @@ struct held_lock {
 	u64 				waittime_stamp;
 	u64				holdtime_stamp;
 #endif
+	/*
+	 * class_idx is zero-indexed; it points to the element in
+	 * lock_classes this held lock instance belongs to. class_idx is in
+	 * the range from 0 to (MAX_LOCKDEP_KEYS-1) inclusive.
+	 */
 	unsigned int			class_idx:MAX_LOCKDEP_KEYS_BITS;
 	/*
 	 * The lock-stack is unified in that the lock chains of interrupt
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 9edf6f12b711..3eecae315885 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -151,17 +151,28 @@ unsigned long nr_lock_classes;
 static
 #endif
 struct lock_class lock_classes[MAX_LOCKDEP_KEYS];
+static DECLARE_BITMAP(lock_classes_in_use, MAX_LOCKDEP_KEYS);
 
 static inline struct lock_class *hlock_class(struct held_lock *hlock)
 {
-	if (!hlock->class_idx) {
+	unsigned int class_idx = hlock->class_idx;
+
+	/* Don't re-read hlock->class_idx, can't use READ_ONCE() on bitfield */
+	barrier();
+
+	if (!test_bit(class_idx, lock_classes_in_use)) {
 		/*
 		 * Someone passed in garbage, we give up.
 		 */
 		DEBUG_LOCKS_WARN_ON(1);
 		return NULL;
 	}
-	return lock_classes + hlock->class_idx - 1;
+
+	/*
+	 * At this point, if the passed hlock->class_idx is still garbage,
+	 * we just have to live with it
+	 */
+	return lock_classes + class_idx;
 }
 
 #ifdef CONFIG_LOCK_STAT
@@ -590,19 +601,22 @@ static void print_lock(struct held_lock *hlock)
 	/*
 	 * We can be called locklessly through debug_show_all_locks() so be
 	 * extra careful, the hlock might have been released and cleared.
+	 *
+	 * If this indeed happens, lets pretend it does not hurt to continue
+	 * to print the lock unless the hlock class_idx does not point to a
+	 * registered class. The rationale here is: since we don't attempt
+	 * to distinguish whether we are in this situation, if it just
+	 * happened we can't count on class_idx to tell either.
 	 */
-	unsigned int class_idx = hlock->class_idx;
+	struct lock_class *lock = hlock_class(hlock);
 
-	/* Don't re-read hlock->class_idx, can't use READ_ONCE() on bitfields: */
-	barrier();
-
-	if (!class_idx || (class_idx - 1) >= MAX_LOCKDEP_KEYS) {
+	if (!lock) {
 		printk(KERN_CONT "<RELEASED>\n");
 		return;
 	}
 
 	printk(KERN_CONT "%p", hlock->instance);
-	print_lock_name(lock_classes + class_idx - 1);
+	print_lock_name(lock);
 	printk(KERN_CONT ", at: %pS\n", (void *)hlock->acquire_ip);
 }
 
@@ -861,7 +875,7 @@ static bool check_lock_chain_key(struct lock_chain *chain)
 	int i;
 
 	for (i = chain->base; i < chain->base + chain->depth; i++)
-		chain_key = iterate_chain_key(chain_key, chain_hlocks[i] + 1);
+		chain_key = iterate_chain_key(chain_key, chain_hlocks[i]);
 	/*
 	 * The 'unsigned long long' casts avoid that a compiler warning
 	 * is reported when building tools/lib/lockdep.
@@ -1136,6 +1150,7 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
 		return NULL;
 	}
 	nr_lock_classes++;
+	__set_bit(class - lock_classes, lock_classes_in_use);
 	debug_atomic_inc(nr_unused_locks);
 	class->key = key;
 	class->name = lock->name;
@@ -2550,7 +2565,7 @@ static void print_chain_keys_chain(struct lock_chain *chain)
 	printk("depth: %u\n", chain->depth);
 	for (i = 0; i < chain->depth; i++) {
 		class_id = chain_hlocks[chain->base + i];
-		chain_key = print_chain_key_iteration(class_id + 1, chain_key);
+		chain_key = print_chain_key_iteration(class_id, chain_key);
 
 		print_lock_name(lock_classes + class_id);
 		printk("\n");
@@ -2601,7 +2616,7 @@ static int check_no_collision(struct task_struct *curr,
 	}
 
 	for (j = 0; j < chain->depth - 1; j++, i++) {
-		id = curr->held_locks[i].class_idx - 1;
+		id = curr->held_locks[i].class_idx;
 
 		if (DEBUG_LOCKS_WARN_ON(chain_hlocks[chain->base + j] != id)) {
 			print_collision(curr, hlock, chain);
@@ -2684,7 +2699,7 @@ static inline int add_chain_cache(struct task_struct *curr,
 	if (likely(nr_chain_hlocks + chain->depth <= MAX_LOCKDEP_CHAIN_HLOCKS)) {
 		chain->base = nr_chain_hlocks;
 		for (j = 0; j < chain->depth - 1; j++, i++) {
-			int lock_id = curr->held_locks[i].class_idx - 1;
+			int lock_id = curr->held_locks[i].class_idx;
 			chain_hlocks[chain->base + j] = lock_id;
 		}
 		chain_hlocks[chain->base + j] = class - lock_classes;
@@ -2864,10 +2879,12 @@ static void check_chain_key(struct task_struct *curr)
 				(unsigned long long)hlock->prev_chain_key);
 			return;
 		}
+
 		/*
-		 * Whoops ran out of static storage again?
+		 * hlock->class_idx can't go beyond MAX_LOCKDEP_KEYS, but is
+		 * it registered lock class index?
 		 */
-		if (DEBUG_LOCKS_WARN_ON(hlock->class_idx > MAX_LOCKDEP_KEYS))
+		if (DEBUG_LOCKS_WARN_ON(!test_bit(hlock->class_idx, lock_classes_in_use)))
 			return;
 
 		if (prev_hlock && (prev_hlock->irq_context !=
@@ -3715,7 +3732,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
 	if (DEBUG_LOCKS_WARN_ON(depth >= MAX_LOCK_DEPTH))
 		return 0;
 
-	class_idx = class - lock_classes + 1;
+	class_idx = class - lock_classes;
 
 	if (depth) {
 		hlock = curr->held_locks + depth - 1;
@@ -3777,9 +3794,9 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
 	 * the hash, not class->key.
 	 */
 	/*
-	 * Whoops, we did it again.. ran straight out of our static allocation.
+	 * Whoops, we did it again.. class_idx is invalid.
 	 */
-	if (DEBUG_LOCKS_WARN_ON(class_idx > MAX_LOCKDEP_KEYS))
+	if (DEBUG_LOCKS_WARN_ON(!test_bit(class_idx, lock_classes_in_use)))
 		return 0;
 
 	chain_key = curr->curr_chain_key;
@@ -3894,7 +3911,7 @@ static int match_held_lock(const struct held_lock *hlock,
 		if (DEBUG_LOCKS_WARN_ON(!hlock->nest_lock))
 			return 0;
 
-		if (hlock->class_idx == class - lock_classes + 1)
+		if (hlock->class_idx == class - lock_classes)
 			return 1;
 	}
 
@@ -3988,7 +4005,7 @@ __lock_set_class(struct lockdep_map *lock, const char *name,
 
 	lockdep_init_map(lock, name, key, 0);
 	class = register_lock_class(lock, subclass, 0);
-	hlock->class_idx = class - lock_classes + 1;
+	hlock->class_idx = class - lock_classes;
 
 	curr->lockdep_depth = i;
 	curr->curr_chain_key = hlock->prev_chain_key;
@@ -4638,7 +4655,7 @@ static void remove_class_from_lock_chain(struct pending_free *pf,
 recalc:
 	chain_key = INITIAL_CHAIN_KEY;
 	for (i = chain->base; i < chain->base + chain->depth; i++)
-		chain_key = iterate_chain_key(chain_key, chain_hlocks[i] + 1);
+		chain_key = iterate_chain_key(chain_key, chain_hlocks[i]);
 	if (chain->depth && chain->chain_key == chain_key)
 		return;
 	/* Overwrite the chain key for concurrent RCU readers. */
@@ -4712,6 +4729,7 @@ static void zap_class(struct pending_free *pf, struct lock_class *class)
 		WRITE_ONCE(class->key, NULL);
 		WRITE_ONCE(class->name, NULL);
 		nr_lock_classes--;
+		__clear_bit(class - lock_classes, lock_classes_in_use);
 	} else {
 		WARN_ONCE(true, "%s() failed for class %s\n", __func__,
 			  class->name);
@@ -5057,6 +5075,7 @@ void __init lockdep_init(void)
 
 	printk(" memory used by lock dependency info: %zu kB\n",
 	       (sizeof(lock_classes) +
+		sizeof(lock_classes_in_use) +
 		sizeof(classhash_table) +
 		sizeof(list_entries) +
 		sizeof(list_entries_in_use) +
-- 
cgit v1.2.3


From aac1f7f95f115d5a5329be05b80022e72df7d080 Mon Sep 17 00:00:00 2001
From: Jiri Olsa <jolsa@kernel.org>
Date: Sun, 12 May 2019 17:55:10 +0200
Subject: sysfs: Add sysfs_update_groups function

Add the sysfs_update_groups() function to update
multiple groups.

  sysfs_update_groups - given a directory kobject, update a bunch of attribute groups
  @kobj:      The kobject to update the group on
  @groups:    The attribute groups to update, NULL terminated

This function updates a bunch of attribute groups.  If an error occurs when
updating a group, all previously updated groups will be removed together
with already existing (not updated) attributes.

Signed-off-by: Jiri Olsa <jolsa@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: https://lkml.kernel.org/r/20190512155518.21468-2-jolsa@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 fs/sysfs/group.c      | 54 +++++++++++++++++++++++++++++++++++++--------------
 include/linux/sysfs.h |  8 ++++++++
 2 files changed, 47 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c
index 57038604d4a8..d41c21fef138 100644
--- a/fs/sysfs/group.c
+++ b/fs/sysfs/group.c
@@ -175,6 +175,26 @@ int sysfs_create_group(struct kobject *kobj,
 }
 EXPORT_SYMBOL_GPL(sysfs_create_group);
 
+static int internal_create_groups(struct kobject *kobj, int update,
+				  const struct attribute_group **groups)
+{
+	int error = 0;
+	int i;
+
+	if (!groups)
+		return 0;
+
+	for (i = 0; groups[i]; i++) {
+		error = internal_create_group(kobj, update, groups[i]);
+		if (error) {
+			while (--i >= 0)
+				sysfs_remove_group(kobj, groups[i]);
+			break;
+		}
+	}
+	return error;
+}
+
 /**
  * sysfs_create_groups - given a directory kobject, create a bunch of attribute groups
  * @kobj:	The kobject to create the group on
@@ -191,24 +211,28 @@ EXPORT_SYMBOL_GPL(sysfs_create_group);
 int sysfs_create_groups(struct kobject *kobj,
 			const struct attribute_group **groups)
 {
-	int error = 0;
-	int i;
-
-	if (!groups)
-		return 0;
-
-	for (i = 0; groups[i]; i++) {
-		error = sysfs_create_group(kobj, groups[i]);
-		if (error) {
-			while (--i >= 0)
-				sysfs_remove_group(kobj, groups[i]);
-			break;
-		}
-	}
-	return error;
+	return internal_create_groups(kobj, 0, groups);
 }
 EXPORT_SYMBOL_GPL(sysfs_create_groups);
 
+/**
+ * sysfs_update_groups - given a directory kobject, update a bunch of attribute groups
+ * @kobj:	The kobject to update the group on
+ * @groups:	The attribute groups to update, NULL terminated
+ *
+ * This function updates a bunch of attribute groups.  If an error occurs when
+ * updating a group, all previously updated groups will be removed together
+ * with already existing (not updated) attributes.
+ *
+ * Returns 0 on success or error code from sysfs_update_group on failure.
+ */
+int sysfs_update_groups(struct kobject *kobj,
+			const struct attribute_group **groups)
+{
+	return internal_create_groups(kobj, 1, groups);
+}
+EXPORT_SYMBOL_GPL(sysfs_update_groups);
+
 /**
  * sysfs_update_group - given a directory kobject, update an attribute group
  * @kobj:	The kobject to update the group on
diff --git a/include/linux/sysfs.h b/include/linux/sysfs.h
index 786816cf4aa5..965236795750 100644
--- a/include/linux/sysfs.h
+++ b/include/linux/sysfs.h
@@ -268,6 +268,8 @@ int __must_check sysfs_create_group(struct kobject *kobj,
 				    const struct attribute_group *grp);
 int __must_check sysfs_create_groups(struct kobject *kobj,
 				     const struct attribute_group **groups);
+int __must_check sysfs_update_groups(struct kobject *kobj,
+				     const struct attribute_group **groups);
 int sysfs_update_group(struct kobject *kobj,
 		       const struct attribute_group *grp);
 void sysfs_remove_group(struct kobject *kobj,
@@ -433,6 +435,12 @@ static inline int sysfs_create_groups(struct kobject *kobj,
 	return 0;
 }
 
+static inline int sysfs_update_groups(struct kobject *kobj,
+				      const struct attribute_group **groups)
+{
+	return 0;
+}
+
 static inline int sysfs_update_group(struct kobject *kobj,
 				const struct attribute_group *grp)
 {
-- 
cgit v1.2.3


From f3a3a8257e5a1a5e67cbb1afdbc4c1c6a26f1b22 Mon Sep 17 00:00:00 2001
From: Jiri Olsa <jolsa@kernel.org>
Date: Sun, 12 May 2019 17:55:11 +0200
Subject: perf/core: Add attr_groups_update into struct pmu

Adding attr_update attribute group into pmu, to allow
having multiple attribute groups for same group name.

This will allow us to update "events" or "format"
directories with attributes that depend on various
HW conditions.

For example having group_format_extra group that updates
"format" directory only if pmu version is 2 and higher:

  static umode_t
  extra_is_visible(struct kobject *kobj, struct attribute *attr, int i)
  {
         return x86_pmu.version >= 2 ? attr->mode : 0;
  }

  static struct attribute_group group_format_extra = {
         .name       = "format",
         .is_visible = extra_is_visible,
  };

Signed-off-by: Jiri Olsa <jolsa@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: https://lkml.kernel.org/r/20190512155518.21468-3-jolsa@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/perf_event.h | 1 +
 kernel/events/core.c       | 6 ++++++
 2 files changed, 7 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 0ab99c7b652d..3dc01cf98e16 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -255,6 +255,7 @@ struct pmu {
 	struct module			*module;
 	struct device			*dev;
 	const struct attribute_group	**attr_groups;
+	const struct attribute_group	**attr_update;
 	const char			*name;
 	int				type;
 
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 3005c80f621d..118ad1aef6af 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -9874,6 +9874,12 @@ static int pmu_dev_alloc(struct pmu *pmu)
 	if (ret)
 		goto del_dev;
 
+	if (pmu->attr_update)
+		ret = sysfs_update_groups(&pmu->dev->kobj, pmu->attr_update);
+
+	if (ret)
+		goto del_dev;
+
 out:
 	return ret;
 
-- 
cgit v1.2.3


From 3724921396dd1a07c93e3516b8d7c9ff570d17a9 Mon Sep 17 00:00:00 2001
From: Mark Rutland <mark.rutland@arm.com>
Date: Wed, 22 May 2019 14:22:48 +0100
Subject: locking/atomic: Use s64 for atomic64_t on 64-bit

Now that all architectures use s64 consistently as the base type for the
atomic64 API, let's have the CONFIG_64BIT definition of atomic64_t use
s64 as the underlying type for atomic64_t, rather than long, matching
the generated headers.

On architectures where atomic64_read(v) is READ_ONCE(v->counter), this
patch will cause the return type of atomic64_read() to be s64.

As of this patch, the atomic64 API can be relied upon to consistently
return s64 where a value rather than boolean condition is returned. This
should make code more robust, and simpler, allowing for the removal of
casts previously required to ensure consistent types.

Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Will Deacon <will.deacon@arm.com>
Cc: aou@eecs.berkeley.edu
Cc: arnd@arndb.de
Cc: bp@alien8.de
Cc: catalin.marinas@arm.com
Cc: davem@davemloft.net
Cc: fenghua.yu@intel.com
Cc: heiko.carstens@de.ibm.com
Cc: herbert@gondor.apana.org.au
Cc: ink@jurassic.park.msu.ru
Cc: jhogan@kernel.org
Cc: linux@armlinux.org.uk
Cc: mattst88@gmail.com
Cc: mpe@ellerman.id.au
Cc: palmer@sifive.com
Cc: paul.burton@mips.com
Cc: paulus@samba.org
Cc: ralf@linux-mips.org
Cc: rth@twiddle.net
Cc: tony.luck@intel.com
Cc: vgupta@synopsys.com
Link: https://lkml.kernel.org/r/20190522132250.26499-17-mark.rutland@arm.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/types.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/types.h b/include/linux/types.h
index 231114ae38f4..05030f608be3 100644
--- a/include/linux/types.h
+++ b/include/linux/types.h
@@ -174,7 +174,7 @@ typedef struct {
 
 #ifdef CONFIG_64BIT
 typedef struct {
-	long counter;
+	s64 counter;
 } atomic64_t;
 #endif
 
-- 
cgit v1.2.3


From 2d146b924ec3c0873f06308d149684dc1105d9a3 Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Date: Tue, 22 Jan 2019 16:21:07 +0100
Subject: backing-dev: no need to check return value of debugfs_create
 functions

When calling debugfs functions, there is no need to ever check the
return value.  The function can work or not, but the code logic should
never do something different based on this.

And as the return value does not matter at all, no need to save the
dentry in struct backing_dev_info, so delete it.

Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Anders Roxell <anders.roxell@linaro.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Michal Hocko <mhocko@suse.com>
Cc: linux-mm@kvack.org
Reviewed-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/backing-dev-defs.h |  1 -
 mm/backing-dev.c                 | 24 +++++-------------------
 2 files changed, 5 insertions(+), 20 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h
index 07e02d6df5ad..6a1a8a314d85 100644
--- a/include/linux/backing-dev-defs.h
+++ b/include/linux/backing-dev-defs.h
@@ -203,7 +203,6 @@ struct backing_dev_info {
 
 #ifdef CONFIG_DEBUG_FS
 	struct dentry *debug_dir;
-	struct dentry *debug_stats;
 #endif
 };
 
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 909dae445ea7..e8e89158adec 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -103,39 +103,25 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v)
 }
 DEFINE_SHOW_ATTRIBUTE(bdi_debug_stats);
 
-static int bdi_debug_register(struct backing_dev_info *bdi, const char *name)
+static void bdi_debug_register(struct backing_dev_info *bdi, const char *name)
 {
-	if (!bdi_debug_root)
-		return -ENOMEM;
-
 	bdi->debug_dir = debugfs_create_dir(name, bdi_debug_root);
-	if (!bdi->debug_dir)
-		return -ENOMEM;
-
-	bdi->debug_stats = debugfs_create_file("stats", 0444, bdi->debug_dir,
-					       bdi, &bdi_debug_stats_fops);
-	if (!bdi->debug_stats) {
-		debugfs_remove(bdi->debug_dir);
-		bdi->debug_dir = NULL;
-		return -ENOMEM;
-	}
 
-	return 0;
+	debugfs_create_file("stats", 0444, bdi->debug_dir, bdi,
+			    &bdi_debug_stats_fops);
 }
 
 static void bdi_debug_unregister(struct backing_dev_info *bdi)
 {
-	debugfs_remove(bdi->debug_stats);
-	debugfs_remove(bdi->debug_dir);
+	debugfs_remove_recursive(bdi->debug_dir);
 }
 #else
 static inline void bdi_debug_init(void)
 {
 }
-static inline int bdi_debug_register(struct backing_dev_info *bdi,
+static inline void bdi_debug_register(struct backing_dev_info *bdi,
 				      const char *name)
 {
-	return 0;
 }
 static inline void bdi_debug_unregister(struct backing_dev_info *bdi)
 {
-- 
cgit v1.2.3


From b1d2dc009dece4cd7e629419b52266ba51960a6b Mon Sep 17 00:00:00 2001
From: Nicolin Chen <nicoleotsuka@gmail.com>
Date: Thu, 23 May 2019 21:06:32 -0700
Subject: dma-contiguous: add dma_{alloc,free}_contiguous() helpers

Both dma_alloc_from_contiguous() and dma_release_from_contiguous() are
very simply implemented, but require callers to pass certain
parameters like count and align, and take a boolean parameter to check
__GFP_NOWARN in the allocation flags. So every function call duplicates
similar work:

	unsigned long order = get_order(size);
	size_t count = size >> PAGE_SHIFT;

	page = dma_alloc_from_contiguous(dev, count, order,
			gfp & __GFP_NOWARN);

	[...]

	dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT);

Additionally, as CMA can be used only in the context which permits
sleeping, most of callers do a gfpflags_allow_blocking() check and a
corresponding fallback allocation of normal pages upon any false result:

	if (gfpflags_allow_blocking(flag))
		page = dma_alloc_from_contiguous();
	if (!page)
		page = alloc_pages();

	[...]

	if (!dma_release_from_contiguous(dev, page, count))
		__free_pages(page, get_order(size));

So this patch simplifies those function calls by abstracting these
operations into the two new functions: dma_{alloc,free}_contiguous.

As some callers of dma_{alloc,release}_from_contiguous() might be
complicated, this patch just implements these two new functions to
kernel/dma/direct.c only as an initial step.

Suggested-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Nicolin Chen <nicoleotsuka@gmail.com>
Tested-by: dann frazier <dann.frazier@canonical.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 include/linux/dma-contiguous.h | 13 ++++++++++++
 kernel/dma/contiguous.c        | 47 ++++++++++++++++++++++++++++++++++++++++++
 kernel/dma/direct.c            | 24 ++++-----------------
 3 files changed, 64 insertions(+), 20 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/dma-contiguous.h b/include/linux/dma-contiguous.h
index 6665fa03c0d1..428f3b7b1c42 100644
--- a/include/linux/dma-contiguous.h
+++ b/include/linux/dma-contiguous.h
@@ -111,6 +111,8 @@ struct page *dma_alloc_from_contiguous(struct device *dev, size_t count,
 				       unsigned int order, bool no_warn);
 bool dma_release_from_contiguous(struct device *dev, struct page *pages,
 				 int count);
+struct page *dma_alloc_contiguous(struct device *dev, size_t size, gfp_t gfp);
+void dma_free_contiguous(struct device *dev, struct page *page, size_t size);
 
 #else
 
@@ -153,6 +155,17 @@ bool dma_release_from_contiguous(struct device *dev, struct page *pages,
 	return false;
 }
 
+static inline struct page *dma_alloc_contiguous(struct device *dev, size_t size,
+		gfp_t gfp)
+{
+	return NULL;
+}
+
+static inline void dma_free_contiguous(struct device *dev, struct page *page,
+		size_t size)
+{
+}
+
 #endif
 
 #endif
diff --git a/kernel/dma/contiguous.c b/kernel/dma/contiguous.c
index b2a87905846d..637b120d647b 100644
--- a/kernel/dma/contiguous.c
+++ b/kernel/dma/contiguous.c
@@ -214,6 +214,53 @@ bool dma_release_from_contiguous(struct device *dev, struct page *pages,
 	return cma_release(dev_get_cma_area(dev), pages, count);
 }
 
+/**
+ * dma_alloc_contiguous() - allocate contiguous pages
+ * @dev:   Pointer to device for which the allocation is performed.
+ * @size:  Requested allocation size.
+ * @gfp:   Allocation flags.
+ *
+ * Allocates a contiguous buffer for @dev: the device specific (or default
+ * global) CMA area is tried first when blocking is allowed, then normal pages
+ * of the full requested order. Returns the first page, or NULL on failure.
+ */
+struct page *dma_alloc_contiguous(struct device *dev, size_t size, gfp_t gfp)
+{
+	int node = dev ? dev_to_node(dev) : NUMA_NO_NODE;
+	size_t count = PAGE_ALIGN(size) >> PAGE_SHIFT;
+	size_t align = get_order(PAGE_ALIGN(size));
+	struct cma *cma = dev_get_cma_area(dev);
+	struct page *page = NULL;
+
+	/* CMA can be used only in the context which permits sleeping */
+	if (cma && gfpflags_allow_blocking(gfp)) {
+		size_t cma_align = min_t(size_t, align, CONFIG_CMA_ALIGNMENT);
+		page = cma_alloc(cma, count, cma_align, gfp & __GFP_NOWARN);
+	}
+
+	/* Fallback to normal pages; align must still be the unclamped order */
+	if (!page)
+		page = alloc_pages_node(node, gfp, align);
+	return page;
+}
+
+/**
+ * dma_free_contiguous() - release allocated pages
+ * @dev:   Pointer to device for which the pages were allocated.
+ * @page:  Pointer to the allocated pages.
+ * @size:  Size of allocated pages.
+ *
+ * Releases memory allocated by dma_alloc_contiguous(); the page count is
+ * rounded up as on allocation. cma_release() returns false for pages outside
+ * the contiguous area, in which case __free_pages() is used as a fallback.
+ */
+void dma_free_contiguous(struct device *dev, struct page *page, size_t size)
+{
+	if (!cma_release(dev_get_cma_area(dev), page,
+			 PAGE_ALIGN(size) >> PAGE_SHIFT))
+		__free_pages(page, get_order(size));
+}
+
 /*
  * Support for reserved memory regions defined in device tree
  */
diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
index 2c2772e9702a..0816c1e8b05a 100644
--- a/kernel/dma/direct.c
+++ b/kernel/dma/direct.c
@@ -96,8 +96,6 @@ static bool dma_coherent_ok(struct device *dev, phys_addr_t phys, size_t size)
 struct page *__dma_direct_alloc_pages(struct device *dev, size_t size,
 		dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs)
 {
-	unsigned int count = PAGE_ALIGN(size) >> PAGE_SHIFT;
-	int page_order = get_order(size);
 	struct page *page = NULL;
 	u64 phys_mask;
 
@@ -109,20 +107,9 @@ struct page *__dma_direct_alloc_pages(struct device *dev, size_t size,
 	gfp |= __dma_direct_optimal_gfp_mask(dev, dev->coherent_dma_mask,
 			&phys_mask);
 again:
-	/* CMA can be used only in the context which permits sleeping */
-	if (gfpflags_allow_blocking(gfp)) {
-		page = dma_alloc_from_contiguous(dev, count, page_order,
-						 gfp & __GFP_NOWARN);
-		if (page && !dma_coherent_ok(dev, page_to_phys(page), size)) {
-			dma_release_from_contiguous(dev, page, count);
-			page = NULL;
-		}
-	}
-	if (!page)
-		page = alloc_pages_node(dev_to_node(dev), gfp, page_order);
-
+	page = dma_alloc_contiguous(dev, size, gfp);
 	if (page && !dma_coherent_ok(dev, page_to_phys(page), size)) {
-		__free_pages(page, page_order);
+		dma_free_contiguous(dev, page, size);
 		page = NULL;
 
 		if (IS_ENABLED(CONFIG_ZONE_DMA32) &&
@@ -154,7 +141,7 @@ void *dma_direct_alloc_pages(struct device *dev, size_t size,
 	if (PageHighMem(page)) {
 		/*
 		 * Depending on the cma= arguments and per-arch setup
-		 * dma_alloc_from_contiguous could return highmem pages.
+		 * dma_alloc_contiguous could return highmem pages.
 		 * Without remapping there is no way to return them here,
 		 * so log an error and fail.
 		 */
@@ -176,10 +163,7 @@ void *dma_direct_alloc_pages(struct device *dev, size_t size,
 
 void __dma_direct_free_pages(struct device *dev, size_t size, struct page *page)
 {
-	unsigned int count = PAGE_ALIGN(size) >> PAGE_SHIFT;
-
-	if (!dma_release_from_contiguous(dev, page, count))
-		__free_pages(page, get_order(size));
+	dma_free_contiguous(dev, page, size);
 }
 
 void dma_direct_free_pages(struct device *dev, size_t size, void *cpu_addr,
-- 
cgit v1.2.3


From dd3dcede9fa0a0b661ac1f24843f4a1b1317fdb6 Mon Sep 17 00:00:00 2001
From: Nicolin Chen <nicoleotsuka@gmail.com>
Date: Wed, 29 May 2019 17:54:25 -0700
Subject: dma-contiguous: fix !CONFIG_DMA_CMA version of dma_{alloc,
 free}_contiguous()

Commit fdaeec198ada ("dma-contiguous: add dma_{alloc,free}_contiguous()
helpers") adds a pair of new helper functions so as to abstract code in
the dma-direct (and other places in the future), however it breaks QEMU
boot feature using x86_64 defconfig.

That's because x86_64 defconfig has CONFIG_DMA_CMA=n so those two newly
introduced helper functions are empty in their !CONFIG_DMA_CMA version,
while previously the platform independent dma-direct code had fallback
alloc_pages_node() and __free_pages().

So this patch fixes it by adding alloc_pages_node() and __free_pages()
in the !CONFIG_DMA_CMA version of the two helper functions.

Tested with below QEMU command:
  qemu-system-x86_64 -m 512m \
      -drive file=images/x86_64/rootfs.ext4,format=raw,if=ide \
      -append 'console=ttyS0 root=/dev/sda' -nographic \
      -kernel arch/x86_64/boot/bzImage

with the rootfs from the below link:
  https://github.com/ClangBuiltLinux/continuous-integration/raw/master/images/x86_64/rootfs.ext4

Fixes: fdaeec198ada ("dma-contiguous: add dma_{alloc,free}_contiguous() helpers")
Reported-by: Nathan Chancellor <natechancellor@gmail.com>
Signed-off-by: Nicolin Chen <nicoleotsuka@gmail.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 include/linux/dma-contiguous.h | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/dma-contiguous.h b/include/linux/dma-contiguous.h
index 428f3b7b1c42..c05d4e661489 100644
--- a/include/linux/dma-contiguous.h
+++ b/include/linux/dma-contiguous.h
@@ -50,6 +50,7 @@
 #ifdef __KERNEL__
 
 #include <linux/device.h>
+#include <linux/mm.h>
 
 struct cma;
 struct page;
@@ -155,15 +156,20 @@ bool dma_release_from_contiguous(struct device *dev, struct page *pages,
 	return false;
 }
 
+/* Use fallback alloc() and free() when CONFIG_DMA_CMA=n */
 static inline struct page *dma_alloc_contiguous(struct device *dev, size_t size,
 		gfp_t gfp)
 {
-	return NULL;
+	int node = dev ? dev_to_node(dev) : NUMA_NO_NODE;
+	size_t align = get_order(PAGE_ALIGN(size));
+
+	return alloc_pages_node(node, gfp, align);
 }
 
 static inline void dma_free_contiguous(struct device *dev, struct page *page,
 		size_t size)
 {
+	__free_pages(page, get_order(size));
 }
 
 #endif
-- 
cgit v1.2.3


From da83a722959a82733c3ca60030cc364ca2318c5a Mon Sep 17 00:00:00 2001
From: Fredrik Noring <noring@nocrew.org>
Date: Wed, 29 May 2019 13:28:39 +0300
Subject: lib/genalloc: add gen_pool_dma_zalloc() for zeroed DMA allocations

gen_pool_dma_zalloc() is a zeroed memory variant of
gen_pool_dma_alloc().  Also document the return values of both, and
indicate NULL as a "%NULL" constant.

Signed-off-by: Fredrik Noring <noring@nocrew.org>
Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 include/linux/genalloc.h |  1 +
 lib/genalloc.c           | 29 ++++++++++++++++++++++++++++-
 2 files changed, 29 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/genalloc.h b/include/linux/genalloc.h
index dd0a452373e7..6c62eeca754f 100644
--- a/include/linux/genalloc.h
+++ b/include/linux/genalloc.h
@@ -121,6 +121,7 @@ extern unsigned long gen_pool_alloc_algo(struct gen_pool *, size_t,
 		genpool_algo_t algo, void *data);
 extern void *gen_pool_dma_alloc(struct gen_pool *pool, size_t size,
 		dma_addr_t *dma);
+void *gen_pool_dma_zalloc(struct gen_pool *pool, size_t size, dma_addr_t *dma);
 extern void gen_pool_free(struct gen_pool *, unsigned long, size_t);
 extern void gen_pool_for_each_chunk(struct gen_pool *,
 	void (*)(struct gen_pool *, struct gen_pool_chunk *, void *), void *);
diff --git a/lib/genalloc.c b/lib/genalloc.c
index 7e85d1e37a6e..5db43476a19b 100644
--- a/lib/genalloc.c
+++ b/lib/genalloc.c
@@ -337,12 +337,14 @@ EXPORT_SYMBOL(gen_pool_alloc_algo);
  * gen_pool_dma_alloc - allocate special memory from the pool for DMA usage
  * @pool: pool to allocate from
  * @size: number of bytes to allocate from the pool
- * @dma: dma-view physical address return value.  Use NULL if unneeded.
+ * @dma: dma-view physical address return value.  Use %NULL if unneeded.
  *
  * Allocate the requested number of bytes from the specified pool.
  * Uses the pool allocation function (with first-fit algorithm by default).
  * Can not be used in NMI handler on architectures without
  * NMI-safe cmpxchg implementation.
+ *
+ * Return: virtual address of the allocated memory, or %NULL on failure
  */
 void *gen_pool_dma_alloc(struct gen_pool *pool, size_t size, dma_addr_t *dma)
 {
@@ -362,6 +364,31 @@ void *gen_pool_dma_alloc(struct gen_pool *pool, size_t size, dma_addr_t *dma)
 }
 EXPORT_SYMBOL(gen_pool_dma_alloc);
 
+/**
+ * gen_pool_dma_zalloc - allocate special zeroed memory from the pool for
+ * DMA usage
+ * @pool: pool to allocate from
+ * @size: number of bytes to allocate from the pool
+ * @dma: dma-view physical address return value.  Use %NULL if unneeded.
+ *
+ * Allocate the requested number of zeroed bytes from the specified pool.
+ * Uses the pool allocation function (with first-fit algorithm by default).
+ * Can not be used in NMI handler on architectures without
+ * NMI-safe cmpxchg implementation.
+ *
+ * Return: virtual address of the allocated zeroed memory, or %NULL on failure
+ */
+void *gen_pool_dma_zalloc(struct gen_pool *pool, size_t size, dma_addr_t *dma)
+{
+	void *vaddr = gen_pool_dma_alloc(pool, size, dma);
+
+	if (vaddr)
+		memset(vaddr, 0, size);
+
+	return vaddr;
+}
+EXPORT_SYMBOL(gen_pool_dma_zalloc);
+
 /**
  * gen_pool_free - free allocated special memory back to the pool
  * @pool: pool to free to
-- 
cgit v1.2.3


From b0310c2f09bbe8aebefb97ed67949a3a7092aca6 Mon Sep 17 00:00:00 2001
From: Laurentiu Tudor <laurentiu.tudor@nxp.com>
Date: Wed, 29 May 2019 13:28:40 +0300
Subject: USB: use genalloc for USB HCs with local memory

For HCs that have local memory, replace the current DMA API usage with
a genalloc generic allocator to manage the mappings for these devices.
To help users, introduce a new HCD API, usb_hcd_setup_local_mem(), that
will set up the genalloc backing the device local memory. It will
be used in subsequent patches.  This is in preparation for dropping
the existing "coherent" dma mem declaration APIs.  The current
implementation was relying on a short circuit in the DMA API that, in
the end, was acting as an allocator for these types of devices.

Signed-off-by: Laurentiu Tudor <laurentiu.tudor@nxp.com>
Tested-by: Fredrik Noring <noring@nocrew.org>
Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/usb/Kconfig         |  1 +
 drivers/usb/core/buffer.c   |  9 +++++++++
 drivers/usb/core/hcd.c      | 36 ++++++++++++++++++++++++++++++++++++
 drivers/usb/host/ohci-hcd.c | 23 ++++++++++++++++++-----
 drivers/usb/host/ohci-mem.c | 35 +++++++++++++++++++++++++++++++----
 drivers/usb/host/ohci.h     |  2 ++
 include/linux/usb/hcd.h     |  5 +++++
 7 files changed, 102 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/usb/Kconfig b/drivers/usb/Kconfig
index e4b27413f528..389c57d8eba7 100644
--- a/drivers/usb/Kconfig
+++ b/drivers/usb/Kconfig
@@ -45,6 +45,7 @@ config USB_ARCH_HAS_HCD
 config USB
 	tristate "Support for Host-side USB"
 	depends on USB_ARCH_HAS_HCD
+	select GENERIC_ALLOCATOR
 	select USB_COMMON
 	select NLS  # for UTF-8 strings
 	---help---
diff --git a/drivers/usb/core/buffer.c b/drivers/usb/core/buffer.c
index f641342cdec0..d2064ad7ad14 100644
--- a/drivers/usb/core/buffer.c
+++ b/drivers/usb/core/buffer.c
@@ -16,6 +16,7 @@
 #include <linux/io.h>
 #include <linux/dma-mapping.h>
 #include <linux/dmapool.h>
+#include <linux/genalloc.h>
 #include <linux/usb.h>
 #include <linux/usb/hcd.h>
 
@@ -124,6 +125,9 @@ void *hcd_buffer_alloc(
 	if (size == 0)
 		return NULL;
 
+	if (hcd->localmem_pool)
+		return gen_pool_dma_alloc(hcd->localmem_pool, size, dma);
+
 	/* some USB hosts just use PIO */
 	if (!IS_ENABLED(CONFIG_HAS_DMA) ||
 	    (!is_device_dma_capable(bus->sysdev) &&
@@ -152,6 +156,11 @@ void hcd_buffer_free(
 	if (!addr)
 		return;
 
+	if (hcd->localmem_pool) {
+		gen_pool_free(hcd->localmem_pool, (unsigned long)addr, size);
+		return;
+	}
+
 	if (!IS_ENABLED(CONFIG_HAS_DMA) ||
 	    (!is_device_dma_capable(bus->sysdev) &&
 	     !(hcd->driver->flags & HCD_LOCAL_MEM))) {
diff --git a/drivers/usb/core/hcd.c b/drivers/usb/core/hcd.c
index 94d22551fc1b..29b96e5e8621 100644
--- a/drivers/usb/core/hcd.c
+++ b/drivers/usb/core/hcd.c
@@ -29,6 +29,8 @@
 #include <linux/workqueue.h>
 #include <linux/pm_runtime.h>
 #include <linux/types.h>
+#include <linux/genalloc.h>
+#include <linux/io.h>
 
 #include <linux/phy/phy.h>
 #include <linux/usb.h>
@@ -3039,6 +3041,40 @@ usb_hcd_platform_shutdown(struct platform_device *dev)
 }
 EXPORT_SYMBOL_GPL(usb_hcd_platform_shutdown);
 
+/* Set up a genalloc pool over the HC's local memory at @phys_addr/@dma. */
+int usb_hcd_setup_local_mem(struct usb_hcd *hcd, phys_addr_t phys_addr,
+			    dma_addr_t dma, size_t size)
+{
+	int err;
+	void *local_mem;
+
+	hcd->localmem_pool = devm_gen_pool_create(hcd->self.sysdev, PAGE_SHIFT,
+						  dev_to_node(hcd->self.sysdev),
+						  dev_name(hcd->self.sysdev));
+	if (IS_ERR(hcd->localmem_pool))
+		return PTR_ERR(hcd->localmem_pool);
+
+	local_mem = devm_memremap(hcd->self.sysdev, phys_addr,
+				  size, MEMREMAP_WC);
+	if (IS_ERR(local_mem))
+		return PTR_ERR(local_mem);
+
+	/*
+	 * A dma_addr_t is passed where the arg type is phys_addr_t: the memory
+	 * is not backed by system memory, so there is no kernel mapping for it.
+	 */
+	err = gen_pool_add_virt(hcd->localmem_pool, (unsigned long)local_mem,
+				dma, size, dev_to_node(hcd->self.sysdev));
+	if (err < 0) {
+		dev_err(hcd->self.sysdev, "gen_pool_add_virt failed with %d\n",
+			err);
+		return err;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(usb_hcd_setup_local_mem);
+
 /*-------------------------------------------------------------------------*/
 
 #if IS_ENABLED(CONFIG_USB_MON)
diff --git a/drivers/usb/host/ohci-hcd.c b/drivers/usb/host/ohci-hcd.c
index 210181fd98d2..b200b19b44fa 100644
--- a/drivers/usb/host/ohci-hcd.c
+++ b/drivers/usb/host/ohci-hcd.c
@@ -40,6 +40,7 @@
 #include <linux/dmapool.h>
 #include <linux/workqueue.h>
 #include <linux/debugfs.h>
+#include <linux/genalloc.h>
 
 #include <asm/io.h>
 #include <asm/irq.h>
@@ -505,8 +506,15 @@ static int ohci_init (struct ohci_hcd *ohci)
 	timer_setup(&ohci->io_watchdog, io_watchdog_func, 0);
 	ohci->prev_frame_no = IO_WATCHDOG_OFF;
 
-	ohci->hcca = dma_alloc_coherent (hcd->self.controller,
-			sizeof(*ohci->hcca), &ohci->hcca_dma, GFP_KERNEL);
+	if (hcd->localmem_pool)
+		ohci->hcca = gen_pool_dma_alloc(hcd->localmem_pool,
+						sizeof(*ohci->hcca),
+						&ohci->hcca_dma);
+	else
+		ohci->hcca = dma_alloc_coherent(hcd->self.controller,
+						sizeof(*ohci->hcca),
+						&ohci->hcca_dma,
+						GFP_KERNEL);
 	if (!ohci->hcca)
 		return -ENOMEM;
 
@@ -990,9 +998,14 @@ static void ohci_stop (struct usb_hcd *hcd)
 	remove_debug_files (ohci);
 	ohci_mem_cleanup (ohci);
 	if (ohci->hcca) {
-		dma_free_coherent (hcd->self.controller,
-				sizeof *ohci->hcca,
-				ohci->hcca, ohci->hcca_dma);
+		if (hcd->localmem_pool)
+			gen_pool_free(hcd->localmem_pool,
+				      (unsigned long)ohci->hcca,
+				      sizeof(*ohci->hcca));
+		else
+			dma_free_coherent(hcd->self.controller,
+					  sizeof(*ohci->hcca),
+					  ohci->hcca, ohci->hcca_dma);
 		ohci->hcca = NULL;
 		ohci->hcca_dma = 0;
 	}
diff --git a/drivers/usb/host/ohci-mem.c b/drivers/usb/host/ohci-mem.c
index 3965ac0341eb..4afe27cc7e46 100644
--- a/drivers/usb/host/ohci-mem.c
+++ b/drivers/usb/host/ohci-mem.c
@@ -36,6 +36,13 @@ static void ohci_hcd_init (struct ohci_hcd *ohci)
 
 static int ohci_mem_init (struct ohci_hcd *ohci)
 {
+	/*
+	 * HCs with local memory allocate from localmem_pool so there's
+	 * no need to create the below dma pools.
+	 */
+	if (ohci_to_hcd(ohci)->localmem_pool)
+		return 0;
+
 	ohci->td_cache = dma_pool_create ("ohci_td",
 		ohci_to_hcd(ohci)->self.controller,
 		sizeof (struct td),
@@ -84,8 +91,12 @@ td_alloc (struct ohci_hcd *hc, gfp_t mem_flags)
 {
 	dma_addr_t	dma;
 	struct td	*td;
+	struct usb_hcd	*hcd = ohci_to_hcd(hc);
 
-	td = dma_pool_zalloc (hc->td_cache, mem_flags, &dma);
+	if (hcd->localmem_pool)
+		td = gen_pool_dma_zalloc(hcd->localmem_pool, sizeof(*td), &dma);
+	else
+		td = dma_pool_zalloc(hc->td_cache, mem_flags, &dma);
 	if (td) {
 		/* in case hc fetches it, make it look dead */
 		td->hwNextTD = cpu_to_hc32 (hc, dma);
@@ -99,6 +110,7 @@ static void
 td_free (struct ohci_hcd *hc, struct td *td)
 {
 	struct td	**prev = &hc->td_hash [TD_HASH_FUNC (td->td_dma)];
+	struct usb_hcd	*hcd = ohci_to_hcd(hc);
 
 	while (*prev && *prev != td)
 		prev = &(*prev)->td_hash;
@@ -106,7 +118,12 @@ td_free (struct ohci_hcd *hc, struct td *td)
 		*prev = td->td_hash;
 	else if ((td->hwINFO & cpu_to_hc32(hc, TD_DONE)) != 0)
 		ohci_dbg (hc, "no hash for td %p\n", td);
-	dma_pool_free (hc->td_cache, td, td->td_dma);
+
+	if (hcd->localmem_pool)
+		gen_pool_free(hcd->localmem_pool, (unsigned long)td,
+			      sizeof(*td));
+	else
+		dma_pool_free(hc->td_cache, td, td->td_dma);
 }
 
 /*-------------------------------------------------------------------------*/
@@ -117,8 +134,12 @@ ed_alloc (struct ohci_hcd *hc, gfp_t mem_flags)
 {
 	dma_addr_t	dma;
 	struct ed	*ed;
+	struct usb_hcd	*hcd = ohci_to_hcd(hc);
 
-	ed = dma_pool_zalloc (hc->ed_cache, mem_flags, &dma);
+	if (hcd->localmem_pool)
+		ed = gen_pool_dma_zalloc(hcd->localmem_pool, sizeof(*ed), &dma);
+	else
+		ed = dma_pool_zalloc(hc->ed_cache, mem_flags, &dma);
 	if (ed) {
 		INIT_LIST_HEAD (&ed->td_list);
 		ed->dma = dma;
@@ -129,6 +150,12 @@ ed_alloc (struct ohci_hcd *hc, gfp_t mem_flags)
 static void
 ed_free (struct ohci_hcd *hc, struct ed *ed)
 {
-	dma_pool_free (hc->ed_cache, ed, ed->dma);
+	struct usb_hcd	*hcd = ohci_to_hcd(hc);
+
+	if (hcd->localmem_pool)
+		gen_pool_free(hcd->localmem_pool, (unsigned long)ed,
+			      sizeof(*ed));
+	else
+		dma_pool_free(hc->ed_cache, ed, ed->dma);
 }
 
diff --git a/drivers/usb/host/ohci.h b/drivers/usb/host/ohci.h
index ef4813bfc5bf..b015b00774b2 100644
--- a/drivers/usb/host/ohci.h
+++ b/drivers/usb/host/ohci.h
@@ -385,6 +385,8 @@ struct ohci_hcd {
 
 	/*
 	 * memory management for queue data structures
+	 *
+	 * @td_cache and @ed_cache are %NULL if &usb_hcd.localmem_pool is used.
 	 */
 	struct dma_pool		*td_cache;
 	struct dma_pool		*ed_cache;
diff --git a/include/linux/usb/hcd.h b/include/linux/usb/hcd.h
index bb57b5af4700..127560a4bfa0 100644
--- a/include/linux/usb/hcd.h
+++ b/include/linux/usb/hcd.h
@@ -216,6 +216,9 @@ struct usb_hcd {
 #define	HC_IS_RUNNING(state) ((state) & __ACTIVE)
 #define	HC_IS_SUSPENDED(state) ((state) & __SUSPEND)
 
+	/* memory pool for HCs having local memory, or %NULL */
+	struct gen_pool         *localmem_pool;
+
 	/* more shared queuing code would be good; it should support
 	 * smarter scheduling, handle transaction translators, etc;
 	 * input size of periodic table to an interrupt scheduler.
@@ -461,6 +464,8 @@ extern int usb_add_hcd(struct usb_hcd *hcd,
 		unsigned int irqnum, unsigned long irqflags);
 extern void usb_remove_hcd(struct usb_hcd *hcd);
 extern int usb_hcd_find_raw_port_number(struct usb_hcd *hcd, int port1);
+int usb_hcd_setup_local_mem(struct usb_hcd *hcd, phys_addr_t phys_addr,
+			    dma_addr_t dma, size_t size);
 
 struct platform_device;
 extern void usb_hcd_platform_shutdown(struct platform_device *dev);
-- 
cgit v1.2.3


From 2d7a3dc3e24f43504b1f25eae8195e600f4cce8b Mon Sep 17 00:00:00 2001
From: Laurentiu Tudor <laurentiu.tudor@nxp.com>
Date: Wed, 29 May 2019 13:28:43 +0300
Subject: USB: drop HCD_LOCAL_MEM flag

With the addition of the local memory allocator, the HCD_LOCAL_MEM
flag can be dropped and the checks against it replaced with a check
for the localmem_pool ptr being initialized.

Signed-off-by: Laurentiu Tudor <laurentiu.tudor@nxp.com>
Tested-by: Fredrik Noring <noring@nocrew.org>
Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/usb/core/buffer.c      |  8 +++-----
 drivers/usb/core/hcd.c         | 15 ++++++---------
 drivers/usb/host/ehci-hcd.c    |  2 +-
 drivers/usb/host/fotg210-hcd.c |  2 +-
 drivers/usb/host/ohci-hcd.c    |  2 +-
 drivers/usb/host/ohci-sm501.c  |  5 +++--
 drivers/usb/host/ohci-tmio.c   |  2 +-
 drivers/usb/host/uhci-hcd.c    |  2 +-
 include/linux/usb/hcd.h        |  1 -
 9 files changed, 17 insertions(+), 22 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/usb/core/buffer.c b/drivers/usb/core/buffer.c
index d2064ad7ad14..1359b78a624e 100644
--- a/drivers/usb/core/buffer.c
+++ b/drivers/usb/core/buffer.c
@@ -68,7 +68,7 @@ int hcd_buffer_create(struct usb_hcd *hcd)
 
 	if (!IS_ENABLED(CONFIG_HAS_DMA) ||
 	    (!is_device_dma_capable(hcd->self.sysdev) &&
-	     !(hcd->driver->flags & HCD_LOCAL_MEM)))
+	     !hcd->localmem_pool))
 		return 0;
 
 	for (i = 0; i < HCD_BUFFER_POOLS; i++) {
@@ -130,8 +130,7 @@ void *hcd_buffer_alloc(
 
 	/* some USB hosts just use PIO */
 	if (!IS_ENABLED(CONFIG_HAS_DMA) ||
-	    (!is_device_dma_capable(bus->sysdev) &&
-	     !(hcd->driver->flags & HCD_LOCAL_MEM))) {
+	    !is_device_dma_capable(bus->sysdev)) {
 		*dma = ~(dma_addr_t) 0;
 		return kmalloc(size, mem_flags);
 	}
@@ -162,8 +161,7 @@ void hcd_buffer_free(
 	}
 
 	if (!IS_ENABLED(CONFIG_HAS_DMA) ||
-	    (!is_device_dma_capable(bus->sysdev) &&
-	     !(hcd->driver->flags & HCD_LOCAL_MEM))) {
+	    !is_device_dma_capable(bus->sysdev)) {
 		kfree(addr);
 		return;
 	}
diff --git a/drivers/usb/core/hcd.c b/drivers/usb/core/hcd.c
index 29b96e5e8621..fe631d18c1ed 100644
--- a/drivers/usb/core/hcd.c
+++ b/drivers/usb/core/hcd.c
@@ -1347,14 +1347,14 @@ EXPORT_SYMBOL_GPL(usb_hcd_unlink_urb_from_ep);
  * using regular system memory - like pci devices doing bus mastering.
  *
  * To support host controllers with limited dma capabilities we provide dma
- * bounce buffers. This feature can be enabled using the HCD_LOCAL_MEM flag.
+ * bounce buffers. This feature can be enabled by initializing
+ * hcd->localmem_pool using usb_hcd_setup_local_mem().
  * For this to work properly the host controller code must first use the
  * function dma_declare_coherent_memory() to point out which memory area
  * that should be used for dma allocations.
  *
- * The HCD_LOCAL_MEM flag then tells the usb code to allocate all data for
- * dma using dma_alloc_coherent() which in turn allocates from the memory
- * area pointed out with dma_declare_coherent_memory().
+ * The initialized hcd->localmem_pool then tells the usb code to allocate all
+ * data for dma using the genalloc API.
  *
  * So, to summarize...
  *
@@ -1364,9 +1364,6 @@ EXPORT_SYMBOL_GPL(usb_hcd_unlink_urb_from_ep);
  *   (a) "normal" kernel memory is no good, and
  *   (b) there's not enough to share
  *
- * - The only *portable* hook for such stuff in the
- *   DMA framework is dma_declare_coherent_memory()
- *
  * - So we use that, even though the primary requirement
  *   is that the memory be "local" (hence addressable
  *   by that device), not "coherent".
@@ -1533,7 +1530,7 @@ int usb_hcd_map_urb_for_dma(struct usb_hcd *hcd, struct urb *urb,
 						urb->setup_dma))
 				return -EAGAIN;
 			urb->transfer_flags |= URB_SETUP_MAP_SINGLE;
-		} else if (hcd->driver->flags & HCD_LOCAL_MEM) {
+		} else if (hcd->localmem_pool) {
 			ret = hcd_alloc_coherent(
 					urb->dev->bus, mem_flags,
 					&urb->setup_dma,
@@ -1603,7 +1600,7 @@ int usb_hcd_map_urb_for_dma(struct usb_hcd *hcd, struct urb *urb,
 				else
 					urb->transfer_flags |= URB_DMA_MAP_SINGLE;
 			}
-		} else if (hcd->driver->flags & HCD_LOCAL_MEM) {
+		} else if (hcd->localmem_pool) {
 			ret = hcd_alloc_coherent(
 					urb->dev->bus, mem_flags,
 					&urb->transfer_dma,
diff --git a/drivers/usb/host/ehci-hcd.c b/drivers/usb/host/ehci-hcd.c
index cdafa97f632d..9da7e22848c9 100644
--- a/drivers/usb/host/ehci-hcd.c
+++ b/drivers/usb/host/ehci-hcd.c
@@ -559,7 +559,7 @@ static int ehci_init(struct usb_hcd *hcd)
 	ehci->command = temp;
 
 	/* Accept arbitrarily long scatter-gather lists */
-	if (!(hcd->driver->flags & HCD_LOCAL_MEM))
+	if (!hcd->localmem_pool)
 		hcd->self.sg_tablesize = ~0;
 
 	/* Prepare for unlinking active QHs */
diff --git a/drivers/usb/host/fotg210-hcd.c b/drivers/usb/host/fotg210-hcd.c
index 0da68df259c8..5d74ff61fa4c 100644
--- a/drivers/usb/host/fotg210-hcd.c
+++ b/drivers/usb/host/fotg210-hcd.c
@@ -4995,7 +4995,7 @@ static int hcd_fotg210_init(struct usb_hcd *hcd)
 	fotg210->command = temp;
 
 	/* Accept arbitrarily long scatter-gather lists */
-	if (!(hcd->driver->flags & HCD_LOCAL_MEM))
+	if (!hcd->localmem_pool)
 		hcd->self.sg_tablesize = ~0;
 	return 0;
 }
diff --git a/drivers/usb/host/ohci-hcd.c b/drivers/usb/host/ohci-hcd.c
index b200b19b44fa..5801858d867e 100644
--- a/drivers/usb/host/ohci-hcd.c
+++ b/drivers/usb/host/ohci-hcd.c
@@ -448,7 +448,7 @@ static int ohci_init (struct ohci_hcd *ohci)
 	struct usb_hcd *hcd = ohci_to_hcd(ohci);
 
 	/* Accept arbitrarily long scatter-gather lists */
-	if (!(hcd->driver->flags & HCD_LOCAL_MEM))
+	if (!hcd->localmem_pool)
 		hcd->self.sg_tablesize = ~0;
 
 	if (distrust_firmware)
diff --git a/drivers/usb/host/ohci-sm501.c b/drivers/usb/host/ohci-sm501.c
index b710e100aec9..c158cda9e4b9 100644
--- a/drivers/usb/host/ohci-sm501.c
+++ b/drivers/usb/host/ohci-sm501.c
@@ -49,7 +49,7 @@ static const struct hc_driver ohci_sm501_hc_driver = {
 	 * generic hardware linkage
 	 */
 	.irq =			ohci_irq,
-	.flags =		HCD_USB11 | HCD_MEMORY | HCD_LOCAL_MEM,
+	.flags =		HCD_USB11 | HCD_MEMORY,
 
 	/*
 	 * basic lifecycle operations
@@ -153,7 +153,8 @@ static int ohci_hcd_sm501_drv_probe(struct platform_device *pdev)
 	 * fine. This is however not always the case - buffers may be allocated
 	 * using kmalloc() - so the usb core needs to be told that it must copy
 	 * data into our local memory if the buffers happen to be placed in
-	 * regular memory. The HCD_LOCAL_MEM flag does just that.
+	 * regular memory. A non-null hcd->localmem_pool initialized by the
+	 * call to usb_hcd_setup_local_mem() below does just that.
 	 */
 
 	if (usb_hcd_setup_local_mem(hcd, mem->start,
diff --git a/drivers/usb/host/ohci-tmio.c b/drivers/usb/host/ohci-tmio.c
index 3b84ce0c3f29..d5a293a707b6 100644
--- a/drivers/usb/host/ohci-tmio.c
+++ b/drivers/usb/host/ohci-tmio.c
@@ -153,7 +153,7 @@ static const struct hc_driver ohci_tmio_hc_driver = {
 
 	/* generic hardware linkage */
 	.irq =			ohci_irq,
-	.flags =		HCD_USB11 | HCD_MEMORY | HCD_LOCAL_MEM,
+	.flags =		HCD_USB11 | HCD_MEMORY,
 
 	/* basic lifecycle operations */
 	.start =		ohci_tmio_start,
diff --git a/drivers/usb/host/uhci-hcd.c b/drivers/usb/host/uhci-hcd.c
index 98deb5f64268..03bc59755123 100644
--- a/drivers/usb/host/uhci-hcd.c
+++ b/drivers/usb/host/uhci-hcd.c
@@ -581,7 +581,7 @@ static int uhci_start(struct usb_hcd *hcd)
 
 	hcd->uses_new_polling = 1;
 	/* Accept arbitrarily long scatter-gather lists */
-	if (!(hcd->driver->flags & HCD_LOCAL_MEM))
+	if (!hcd->localmem_pool)
 		hcd->self.sg_tablesize = ~0;
 
 	spin_lock_init(&uhci->lock);
diff --git a/include/linux/usb/hcd.h b/include/linux/usb/hcd.h
index 127560a4bfa0..bab27ccc8ff5 100644
--- a/include/linux/usb/hcd.h
+++ b/include/linux/usb/hcd.h
@@ -256,7 +256,6 @@ struct hc_driver {
 
 	int	flags;
 #define	HCD_MEMORY	0x0001		/* HC regs use memory (else I/O) */
-#define	HCD_LOCAL_MEM	0x0002		/* HC needs local memory */
 #define	HCD_SHARED	0x0004		/* Two (or more) usb_hcds share HW */
 #define	HCD_USB11	0x0010		/* USB 1.1 */
 #define	HCD_USB2	0x0020		/* USB 2.0 */
-- 
cgit v1.2.3


From c30700db9eaabb35e0b123301df35a6846e6b6b4 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 3 Jun 2019 08:43:51 +0200
Subject: dma-direct: provide generic support for uncached kernel segments

A few architectures support uncached kernel segments.  In that case we get
an uncached mapping for a given physical address by using an offset in the
uncached segment.  Implement support for this scheme in the generic
dma-direct code instead of duplicating it in arch hooks.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 arch/Kconfig                    |  8 ++++++++
 include/linux/dma-noncoherent.h |  3 +++
 kernel/dma/direct.c             | 17 +++++++++++++++--
 3 files changed, 26 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/arch/Kconfig b/arch/Kconfig
index c47b328eada0..e8d19c3cb91f 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -260,6 +260,14 @@ config ARCH_HAS_SET_MEMORY
 config ARCH_HAS_SET_DIRECT_MAP
 	bool
 
+#
+# Select if arch has an uncached kernel segment and provides the
+# uncached_kernel_address / cached_kernel_address symbols to use it
+#
+config ARCH_HAS_UNCACHED_SEGMENT
+	select ARCH_HAS_DMA_PREP_COHERENT
+	bool
+
 # Select if arch init_task must go in the __init_task_data section
 config ARCH_TASK_STRUCT_ON_STACK
        bool
diff --git a/include/linux/dma-noncoherent.h b/include/linux/dma-noncoherent.h
index 9741767e400f..7e0126a04e02 100644
--- a/include/linux/dma-noncoherent.h
+++ b/include/linux/dma-noncoherent.h
@@ -80,4 +80,7 @@ static inline void arch_dma_prep_coherent(struct page *page, size_t size)
 }
 #endif /* CONFIG_ARCH_HAS_DMA_PREP_COHERENT */
 
+void *uncached_kernel_address(void *addr);
+void *cached_kernel_address(void *addr);
+
 #endif /* _LINUX_DMA_NONCOHERENT_H */
diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
index 0816c1e8b05a..b67f0aa08aa3 100644
--- a/kernel/dma/direct.c
+++ b/kernel/dma/direct.c
@@ -158,6 +158,13 @@ void *dma_direct_alloc_pages(struct device *dev, size_t size,
 		*dma_handle = phys_to_dma(dev, page_to_phys(page));
 	}
 	memset(ret, 0, size);
+
+	if (IS_ENABLED(CONFIG_ARCH_HAS_UNCACHED_SEGMENT) &&
+	    !dev_is_dma_coherent(dev) && !(attrs & DMA_ATTR_NON_CONSISTENT)) {
+		arch_dma_prep_coherent(page, size);
+		ret = uncached_kernel_address(ret);
+	}
+
 	return ret;
 }
 
@@ -173,13 +180,18 @@ void dma_direct_free_pages(struct device *dev, size_t size, void *cpu_addr,
 
 	if (force_dma_unencrypted())
 		set_memory_encrypted((unsigned long)cpu_addr, 1 << page_order);
+
+	if (IS_ENABLED(CONFIG_ARCH_HAS_UNCACHED_SEGMENT) &&
+	    !dev_is_dma_coherent(dev) && !(attrs & DMA_ATTR_NON_CONSISTENT))
+		cpu_addr = cached_kernel_address(cpu_addr);
 	__dma_direct_free_pages(dev, size, virt_to_page(cpu_addr));
 }
 
 void *dma_direct_alloc(struct device *dev, size_t size,
 		dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs)
 {
-	if (!dev_is_dma_coherent(dev))
+	if (!IS_ENABLED(CONFIG_ARCH_HAS_UNCACHED_SEGMENT) &&
+	    !dev_is_dma_coherent(dev))
 		return arch_dma_alloc(dev, size, dma_handle, gfp, attrs);
 	return dma_direct_alloc_pages(dev, size, dma_handle, gfp, attrs);
 }
@@ -187,7 +199,8 @@ void *dma_direct_alloc(struct device *dev, size_t size,
 void dma_direct_free(struct device *dev, size_t size,
 		void *cpu_addr, dma_addr_t dma_addr, unsigned long attrs)
 {
-	if (!dev_is_dma_coherent(dev))
+	if (!IS_ENABLED(CONFIG_ARCH_HAS_UNCACHED_SEGMENT) &&
+	    !dev_is_dma_coherent(dev))
 		arch_dma_free(dev, size, cpu_addr, dma_addr, attrs);
 	else
 		dma_direct_free_pages(dev, size, cpu_addr, dma_addr, attrs);
-- 
cgit v1.2.3


From c9c2c27d7ceca8c2856c5008f2002bddb384f518 Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Date: Tue, 16 Apr 2019 15:46:55 +0200
Subject: debugfs: make debugfs_create_u32_array() return void

The single user of debugfs_create_u32_array() does not care about the
return value of it, so make it return void as there is no need to do
anything with the return value.

Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 Documentation/filesystems/debugfs.txt |  2 +-
 fs/debugfs/file.c                     | 14 ++++----------
 include/linux/debugfs.h               | 12 +++++-------
 3 files changed, 10 insertions(+), 18 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/filesystems/debugfs.txt b/Documentation/filesystems/debugfs.txt
index 4a0a9c3f4af6..9e27c843d00e 100644
--- a/Documentation/filesystems/debugfs.txt
+++ b/Documentation/filesystems/debugfs.txt
@@ -169,7 +169,7 @@ byte offsets over a base for the register block.
 
 If you want to dump an u32 array in debugfs, you can create file with:
 
-    struct dentry *debugfs_create_u32_array(const char *name, umode_t mode,
+    void debugfs_create_u32_array(const char *name, umode_t mode,
 			struct dentry *parent,
 			u32 *array, u32 elements);
 
diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c
index ddd708b09fa1..93e4ca6b2ad7 100644
--- a/fs/debugfs/file.c
+++ b/fs/debugfs/file.c
@@ -997,25 +997,19 @@ static const struct file_operations u32_array_fops = {
  * @array as data. If the @mode variable is so set it can be read from.
  * Writing is not supported. Seek within the file is also not supported.
  * Once array is created its size can not be changed.
- *
- * The function returns a pointer to dentry on success. If an error occurs,
- * %ERR_PTR(-ERROR) or NULL will be returned. If debugfs is not enabled in
- * the kernel, the value %ERR_PTR(-ENODEV) will be returned.
  */
-struct dentry *debugfs_create_u32_array(const char *name, umode_t mode,
-					    struct dentry *parent,
-					    u32 *array, u32 elements)
+void debugfs_create_u32_array(const char *name, umode_t mode,
+			      struct dentry *parent, u32 *array, u32 elements)
 {
 	struct array_data *data = kmalloc(sizeof(*data), GFP_KERNEL);
 
 	if (data == NULL)
-		return NULL;
+		return;
 
 	data->array = array;
 	data->elements = elements;
 
-	return debugfs_create_file_unsafe(name, mode, parent, data,
-					&u32_array_fops);
+	debugfs_create_file_unsafe(name, mode, parent, data, &u32_array_fops);
 }
 EXPORT_SYMBOL_GPL(debugfs_create_u32_array);
 
diff --git a/include/linux/debugfs.h b/include/linux/debugfs.h
index 3b0ba54cc4d5..58424eb3b329 100644
--- a/include/linux/debugfs.h
+++ b/include/linux/debugfs.h
@@ -133,9 +133,8 @@ struct dentry *debugfs_create_regset32(const char *name, umode_t mode,
 void debugfs_print_regs32(struct seq_file *s, const struct debugfs_reg32 *regs,
 			  int nregs, void __iomem *base, char *prefix);
 
-struct dentry *debugfs_create_u32_array(const char *name, umode_t mode,
-					struct dentry *parent,
-					u32 *array, u32 elements);
+void debugfs_create_u32_array(const char *name, umode_t mode,
+			      struct dentry *parent, u32 *array, u32 elements);
 
 struct dentry *debugfs_create_devm_seqfile(struct device *dev, const char *name,
 					   struct dentry *parent,
@@ -353,11 +352,10 @@ static inline bool debugfs_initialized(void)
 	return false;
 }
 
-static inline struct dentry *debugfs_create_u32_array(const char *name, umode_t mode,
-					struct dentry *parent,
-					u32 *array, u32 elements)
+static inline void debugfs_create_u32_array(const char *name, umode_t mode,
+					    struct dentry *parent, u32 *array,
+					    u32 elements)
 {
-	return ERR_PTR(-ENODEV);
 }
 
 static inline struct dentry *debugfs_create_devm_seqfile(struct device *dev,
-- 
cgit v1.2.3


From a09db883e5d938b525a86a4630fc04f98ff1063d Mon Sep 17 00:00:00 2001
From: Uma Shankar <uma.shankar@intel.com>
Date: Tue, 4 Jun 2019 16:47:02 +0530
Subject: drm: Fix docbook warnings in hdr metadata helper structures
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Fixes the following warnings:
./include/drm/drm_mode_config.h:841: warning: Incorrect use of
kernel-doc format:          * hdr_output_metadata_property: Connector
property containing hdr
./include/drm/drm_mode_config.h:918: warning: Function parameter or member 'hdr_output_metadata_property' not described in 'drm_mode_config'
./include/drm/drm_connector.h:1251: warning: Function parameter or member 'hdr_output_metadata' not described in 'drm_connector'
./include/drm/drm_connector.h:1251: warning: Function parameter or member 'hdr_sink_metadata' not described in 'drm_connector'

Also adds some property documentation for HDR Metadata Connector
Property in connector property create function.

v2: Fixed Sean Paul's review comments.

v3: Fixed Daniel Vetter's review comments, added the UAPI structure
definition section in kernel docs.

v4: Fixed Daniel Vetter's review comments.

v5: Added structure member references as per Daniel's suggestion.

Cc: Shashank Sharma <shashank.sharma@intel.com>
Cc: Ville Syrjä <ville.syrjala@linux.intel.com>
Cc: Maarten Lankhorst <maarten.lankhorst@linux.intel.com>
Cc: Maxime Ripard <maxime.ripard@bootlin.com>
Cc: Sean Paul <sean@poorly.run>
Cc: David Airlie <airlied@linux.ie>
Cc: Daniel Vetter <daniel@ffwll.ch>
Cc: Bartlomiej Zolnierkiewicz <b.zolnierkie@samsung.com>
Cc: "Ville Syrjä" <ville.syrjala@linux.intel.com>
Cc: Hans Verkuil <hansverk@cisco.com>
Cc: dri-devel@lists.freedesktop.org
Cc: linux-fbdev@vger.kernel.org
Reviewed-by: Sean Paul <sean@poorly.run> (v1)
Signed-off-by: Uma Shankar <uma.shankar@intel.com>
[danvet: Fix up markup: () for functions, & for structs. Style guide
also recommends to prepend struct for structures.]
Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
Link: https://patchwork.freedesktop.org/patch/msgid/1559647022-7336-1-git-send-email-uma.shankar@intel.com
---
 drivers/gpu/drm/drm_connector.c | 40 ++++++++++++++++++++++
 include/drm/drm_connector.h     |  1 +
 include/drm/drm_mode_config.h   |  4 +--
 include/linux/hdmi.h            | 12 +++++++
 include/uapi/drm/drm_mode.h     | 74 ++++++++++++++++++++++++++++++++++++++++-
 5 files changed, 128 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/gpu/drm/drm_connector.c b/drivers/gpu/drm/drm_connector.c
index c9ac8b9e83ea..e17586aaa80f 100644
--- a/drivers/gpu/drm/drm_connector.c
+++ b/drivers/gpu/drm/drm_connector.c
@@ -956,6 +956,46 @@ static const struct drm_prop_enum_list hdmi_colorspaces[] = {
  *	  is no longer protected and userspace should take appropriate action
  *	  (whatever that might be).
  *
+ * HDR_OUTPUT_METADATA:
+ *	Connector property to enable userspace to send HDR Metadata to
+ *	driver. This metadata is based on the composition and blending
+ *	policies decided by user, taking into account the hardware and
+ *	sink capabilities. The driver gets this metadata and creates a
+ *	Dynamic Range and Mastering Infoframe (DRM) in case of HDMI,
+ *	SDP packet (Non-audio INFOFRAME SDP v1.3) for DP. This is then
+ *	sent to sink. This notifies the sink of the upcoming frame's Color
+ *	Encoding and Luminance parameters.
+ *
+ *	Userspace first needs to detect the HDR capabilities of sink by
+ *	reading and parsing the EDID. Details of HDR metadata for HDMI
+ *	are added in CTA 861.G spec. For DP, it's defined in VESA DP
+ *	Standard v1.4. It needs to then get the metadata information
+ *	of the video/game/app content which is encoded in HDR (basically
+ *	using HDR transfer functions). With this information it needs to
+ *	decide on a blending policy and compose the relevant
+ *	layers/overlays into a common format. Once this blending is done,
+ *	userspace will be aware of the metadata of the composed frame to
+ *	be sent to sink. It then uses this property to communicate this
+ *	metadata to driver which then makes an Infoframe packet and sends
+ *	it to sink based on the type of encoder connected.
+ *
+ *	Userspace will be responsible to do Tone mapping operation in case:
+ *		- Some layers are HDR and others are SDR
+ *		- HDR layers luminance is not same as sink
+ *	It will even need to do colorspace conversion and get all layers
+ *	to one common colorspace for blending. It can use either GL, Media
+ *	or display engine to get this done based on the capabilities of the
+ *	associated hardware.
+ *
+ *	Driver expects metadata to be put in &struct hdr_output_metadata
+ *	structure from userspace. This is received as blob and stored in
+ *	&drm_connector_state.hdr_output_metadata. It parses EDID and saves the
+ *	sink metadata in &struct hdr_sink_metadata, as
+ *	&drm_connector.hdr_sink_metadata.  Driver uses
+ *	drm_hdmi_infoframe_set_hdr_metadata() helper to set the HDR metadata,
+ *	hdmi_drm_infoframe_pack() to pack the infoframe as per spec, in case of
+ *	HDMI encoder.
+ *
  * max bpc:
  *	This range property is used by userspace to limit the bit depth. When
  *	used the driver would limit the bpc in accordance with the valid range
diff --git a/include/drm/drm_connector.h b/include/drm/drm_connector.h
index 547656173c74..47e749b74e5f 100644
--- a/include/drm/drm_connector.h
+++ b/include/drm/drm_connector.h
@@ -1244,6 +1244,7 @@ struct drm_connector {
 	 */
 	struct llist_node free_node;
 
+	/** @hdr_sink_metadata: HDR Metadata Information read from sink */
 	struct hdr_sink_metadata hdr_sink_metadata;
 };
 
diff --git a/include/drm/drm_mode_config.h b/include/drm/drm_mode_config.h
index 4f88cc972407..759d462d028b 100644
--- a/include/drm/drm_mode_config.h
+++ b/include/drm/drm_mode_config.h
@@ -837,8 +837,8 @@ struct drm_mode_config {
 	struct drm_property *writeback_out_fence_ptr_property;
 
 	/**
-	 * hdr_output_metadata_property: Connector property containing hdr
-	 * metatda. This will be provided by userspace compositors based
+	 * @hdr_output_metadata_property: Connector property containing hdr
+	 * metadata. This will be provided by userspace compositors based
 	 * on HDR content
 	 */
 	struct drm_property *hdr_output_metadata_property;
diff --git a/include/linux/hdmi.h b/include/linux/hdmi.h
index ee55ba589cdc..9918a6c910c5 100644
--- a/include/linux/hdmi.h
+++ b/include/linux/hdmi.h
@@ -367,8 +367,19 @@ struct hdr_static_metadata {
 	__u16 min_cll;
 };
 
+/**
+ * struct hdr_sink_metadata - HDR sink metadata
+ *
+ * Metadata Information read from Sink's EDID
+ */
 struct hdr_sink_metadata {
+	/**
+	 * @metadata_type: Static_Metadata_Descriptor_ID.
+	 */
 	__u32 metadata_type;
+	/**
+	 * @hdmi_type1: HDR Metadata Infoframe.
+	 */
 	union {
 		struct hdr_static_metadata hdmi_type1;
 	};
@@ -398,6 +409,7 @@ union hdmi_vendor_any_infoframe {
  * @spd: spd infoframe
  * @vendor: union of all vendor infoframes
  * @audio: audio infoframe
+ * @drm: Dynamic Range and Mastering infoframe
  *
  * This is used by the generic pack function. This works since all infoframes
  * have the same header which also indicates which type of infoframe should be
diff --git a/include/uapi/drm/drm_mode.h b/include/uapi/drm/drm_mode.h
index 19b5cf368cff..5ab331e5dc23 100644
--- a/include/uapi/drm/drm_mode.h
+++ b/include/uapi/drm/drm_mode.h
@@ -33,6 +33,15 @@
 extern "C" {
 #endif
 
+/**
+ * DOC: overview
+ *
+ * DRM exposes many UAPI and structure definitions to have a consistent
+ * and standardized interface with user.
+ * Userspace can refer to these structure definitions and UAPI formats
+ * to communicate to driver
+ */
+
 #define DRM_CONNECTOR_NAME_LEN	32
 #define DRM_DISPLAY_MODE_LEN	32
 #define DRM_PROP_NAME_LEN	32
@@ -630,24 +639,87 @@ struct drm_color_lut {
 	__u16 reserved;
 };
 
-/* HDR Metadata Infoframe as per 861.G spec */
+/**
+ * struct hdr_metadata_infoframe - HDR Metadata Infoframe Data.
+ *
+ * HDR Metadata Infoframe as per CTA 861.G spec. This is expected
+ * to match exactly with the spec.
+ *
+ * Userspace is expected to pass the metadata information as per
+ * the format described in this structure.
+ */
 struct hdr_metadata_infoframe {
+	/**
+	 * @eotf: Electro-Optical Transfer Function (EOTF)
+	 * used in the stream.
+	 */
 	__u8 eotf;
+	/**
+	 * @metadata_type: Static_Metadata_Descriptor_ID.
+	 */
 	__u8 metadata_type;
+	/**
+	 * @display_primaries: Color Primaries of the Data.
+	 * These are coded as unsigned 16-bit values in units of
+	 * 0.00002, where 0x0000 represents zero and 0xC350
+	 * represents 1.0000.
+	 * @display_primaries.x: X coordinate of color primary.
+	 * @display_primaries.y: Y coordinate of color primary.
+	 */
 	struct {
 		__u16 x, y;
 		} display_primaries[3];
+	/**
+	 * @white_point: White Point of Colorspace Data.
+	 * These are coded as unsigned 16-bit values in units of
+	 * 0.00002, where 0x0000 represents zero and 0xC350
+	 * represents 1.0000.
+	 * @white_point.x: X coordinate of whitepoint of color primary.
+	 * @white_point.y: Y coordinate of whitepoint of color primary.
+	 */
 	struct {
 		__u16 x, y;
 		} white_point;
+	/**
+	 * @max_display_mastering_luminance: Max Mastering Display Luminance.
+	 * This value is coded as an unsigned 16-bit value in units of 1 cd/m2,
+	 * where 0x0001 represents 1 cd/m2 and 0xFFFF represents 65535 cd/m2.
+	 */
 	__u16 max_display_mastering_luminance;
+	/**
+	 * @min_display_mastering_luminance: Min Mastering Display Luminance.
+	 * This value is coded as an unsigned 16-bit value in units of
+	 * 0.0001 cd/m2, where 0x0001 represents 0.0001 cd/m2 and 0xFFFF
+	 * represents 6.5535 cd/m2.
+	 */
 	__u16 min_display_mastering_luminance;
+	/**
+	 * @max_cll: Max Content Light Level.
+	 * This value is coded as an unsigned 16-bit value in units of 1 cd/m2,
+	 * where 0x0001 represents 1 cd/m2 and 0xFFFF represents 65535 cd/m2.
+	 */
 	__u16 max_cll;
+	/**
+	 * @max_fall: Max Frame Average Light Level.
+	 * This value is coded as an unsigned 16-bit value in units of 1 cd/m2,
+	 * where 0x0001 represents 1 cd/m2 and 0xFFFF represents 65535 cd/m2.
+	 */
 	__u16 max_fall;
 };
 
+/**
+ * struct hdr_output_metadata - HDR output metadata
+ *
+ * Metadata Information to be passed from userspace
+ */
 struct hdr_output_metadata {
+	/**
+	 * @metadata_type: Static_Metadata_Descriptor_ID.
+	 */
 	__u32 metadata_type;
+	/**
+	 * @hdmi_metadata_type1: HDR Metadata Infoframe.
+	 */
 	union {
 		struct hdr_metadata_infoframe hdmi_metadata_type1;
 	};
-- 
cgit v1.2.3


From 9a83c84c3a491cbe7fc9dea3c43e26a8e67204d2 Mon Sep 17 00:00:00 2001
From: Shaokun Zhang <zhangshaokun@hisilicon.com>
Date: Tue, 28 May 2019 10:16:53 +0800
Subject: drivers: base: cacheinfo: Add variable to record max cache line size

Add coherency_max_size variable to record the maximum cache line size
for different cache levels. If it is available, we will synchronize
it as cache line size, otherwise we will use CTR_EL0.CWG reporting
in cache_line_size() for arm64.

Cc: "Rafael J. Wysocki" <rafael@kernel.org>
Cc: Jeremy Linton <jeremy.linton@arm.com>
Cc: Will Deacon <will.deacon@arm.com>
Reviewed-by: Sudeep Holla <sudeep.holla@arm.com>
Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Shaokun Zhang <zhangshaokun@hisilicon.com>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 drivers/base/cacheinfo.c  | 5 +++++
 include/linux/cacheinfo.h | 2 ++
 2 files changed, 7 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/base/cacheinfo.c b/drivers/base/cacheinfo.c
index a7359535caf5..8827c60f51e2 100644
--- a/drivers/base/cacheinfo.c
+++ b/drivers/base/cacheinfo.c
@@ -213,6 +213,8 @@ int __weak cache_setup_acpi(unsigned int cpu)
 	return -ENOTSUPP;
 }
 
+unsigned int coherency_max_size;
+
 static int cache_shared_cpu_map_setup(unsigned int cpu)
 {
 	struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu);
@@ -251,6 +253,9 @@ static int cache_shared_cpu_map_setup(unsigned int cpu)
 				cpumask_set_cpu(i, &this_leaf->shared_cpu_map);
 			}
 		}
+		/* record the maximum cache line size */
+		if (this_leaf->coherency_line_size > coherency_max_size)
+			coherency_max_size = this_leaf->coherency_line_size;
 	}
 
 	return 0;
diff --git a/include/linux/cacheinfo.h b/include/linux/cacheinfo.h
index 70e19bc6cc9f..46b92cd61d0c 100644
--- a/include/linux/cacheinfo.h
+++ b/include/linux/cacheinfo.h
@@ -17,6 +17,8 @@ enum cache_type {
 	CACHE_TYPE_UNIFIED = BIT(2),
 };
 
+extern unsigned int coherency_max_size;
+
 /**
  * struct cacheinfo - represent a cache leaf node
  * @id: This cache's id. It is unique among caches with the same (type, level).
-- 
cgit v1.2.3


From f257d6dcda0187693407e0c2e5dab69bdab3223f Mon Sep 17 00:00:00 2001
From: Sean Christopherson <sean.j.christopherson@intel.com>
Date: Fri, 19 Apr 2019 22:18:17 -0700
Subject: KVM: Directly return result from kvm_arch_check_processor_compat()

Add a wrapper to invoke kvm_arch_check_processor_compat() so that the
boilerplate ugliness of checking virtualization support on all CPUs is
hidden from the arch specific code.  x86's implementation in particular
is quite heinous, as it unnecessarily propagates the out-param pattern
into kvm_x86_ops.

While the x86 specific issue could be resolved solely by changing
kvm_x86_ops, make the change for all architectures as returning a value
directly is prettier and technically more robust, e.g. s390 doesn't set
the out param, which could lead to subtle breakage in the (highly
unlikely) scenario where the out-param was not pre-initialized by the
caller.

Opportunistically annotate svm_check_processor_compat() with __init.

Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Reviewed-by: Cornelia Huck <cohuck@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/mips/kvm/mips.c             | 4 ++--
 arch/powerpc/kvm/powerpc.c       | 4 ++--
 arch/s390/include/asm/kvm_host.h | 1 -
 arch/s390/kvm/kvm-s390.c         | 5 +++++
 arch/x86/include/asm/kvm_host.h  | 2 +-
 arch/x86/kvm/svm.c               | 4 ++--
 arch/x86/kvm/vmx/vmx.c           | 8 ++++----
 arch/x86/kvm/x86.c               | 4 ++--
 include/linux/kvm_host.h         | 2 +-
 virt/kvm/arm/arm.c               | 4 ++--
 virt/kvm/kvm_main.c              | 9 ++++++---
 11 files changed, 27 insertions(+), 20 deletions(-)

(limited to 'include/linux')

diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c
index 0369f26ab96d..2cfe839f0b3a 100644
--- a/arch/mips/kvm/mips.c
+++ b/arch/mips/kvm/mips.c
@@ -123,9 +123,9 @@ int kvm_arch_hardware_setup(void)
 	return 0;
 }
 
-void kvm_arch_check_processor_compat(void *rtn)
+int kvm_arch_check_processor_compat(void)
 {
-	*(int *)rtn = 0;
+	return 0;
 }
 
 int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index aa3a678711be..628d3c791ad7 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -425,9 +425,9 @@ int kvm_arch_hardware_setup(void)
 	return 0;
 }
 
-void kvm_arch_check_processor_compat(void *rtn)
+int kvm_arch_check_processor_compat(void)
 {
-	*(int *)rtn = kvmppc_core_check_processor_compat();
+	return kvmppc_core_check_processor_compat();
 }
 
 int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h
index 2b00a3ebee08..da5825a3c16b 100644
--- a/arch/s390/include/asm/kvm_host.h
+++ b/arch/s390/include/asm/kvm_host.h
@@ -905,7 +905,6 @@ extern int kvm_s390_gisc_register(struct kvm *kvm, u32 gisc);
 extern int kvm_s390_gisc_unregister(struct kvm *kvm, u32 gisc);
 
 static inline void kvm_arch_hardware_disable(void) {}
-static inline void kvm_arch_check_processor_compat(void *rtn) {}
 static inline void kvm_arch_sync_events(struct kvm *kvm) {}
 static inline void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) {}
 static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {}
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 28ebd647784c..7936af0a971f 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -227,6 +227,11 @@ int kvm_arch_hardware_enable(void)
 	return 0;
 }
 
+int kvm_arch_check_processor_compat(void)
+{
+	return 0;
+}
+
 static void kvm_gmap_notifier(struct gmap *gmap, unsigned long start,
 			      unsigned long end);
 
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 450d69a1e6fa..d5457c7bb243 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -999,7 +999,7 @@ struct kvm_x86_ops {
 	int (*disabled_by_bios)(void);             /* __init */
 	int (*hardware_enable)(void);
 	void (*hardware_disable)(void);
-	void (*check_processor_compatibility)(void *rtn);
+	int (*check_processor_compatibility)(void);/* __init */
 	int (*hardware_setup)(void);               /* __init */
 	void (*hardware_unsetup)(void);            /* __exit */
 	bool (*cpu_has_accelerated_tpr)(void);
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index c56f40d430e5..302cb409d452 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -5871,9 +5871,9 @@ svm_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
 	hypercall[2] = 0xd9;
 }
 
-static void svm_check_processor_compat(void *rtn)
+static int __init svm_check_processor_compat(void)
 {
-	*(int *)rtn = 0;
+	return 0;
 }
 
 static bool svm_cpu_has_accelerated_tpr(void)
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index b35b3800a3c0..0861c71a4379 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -6733,22 +6733,22 @@ static int vmx_vm_init(struct kvm *kvm)
 	return 0;
 }
 
-static void __init vmx_check_processor_compat(void *rtn)
+static int __init vmx_check_processor_compat(void)
 {
 	struct vmcs_config vmcs_conf;
 	struct vmx_capability vmx_cap;
 
-	*(int *)rtn = 0;
 	if (setup_vmcs_config(&vmcs_conf, &vmx_cap) < 0)
-		*(int *)rtn = -EIO;
+		return -EIO;
 	if (nested)
 		nested_vmx_setup_ctls_msrs(&vmcs_conf.nested, vmx_cap.ept,
 					   enable_apicv);
 	if (memcmp(&vmcs_config, &vmcs_conf, sizeof(struct vmcs_config)) != 0) {
 		printk(KERN_ERR "kvm: CPU %d feature inconsistency!\n",
 				smp_processor_id());
-		*(int *)rtn = -EIO;
+		return -EIO;
 	}
+	return 0;
 }
 
 static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 2a713a74ca2e..5cb9ac9b61ab 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -9082,9 +9082,9 @@ void kvm_arch_hardware_unsetup(void)
 	kvm_x86_ops->hardware_unsetup();
 }
 
-void kvm_arch_check_processor_compat(void *rtn)
+int kvm_arch_check_processor_compat(void)
 {
-	kvm_x86_ops->check_processor_compatibility(rtn);
+	return kvm_x86_ops->check_processor_compatibility();
 }
 
 bool kvm_vcpu_is_reset_bsp(struct kvm_vcpu *vcpu)
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 79fa4426509c..5e9fd7ad8018 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -870,7 +870,7 @@ int kvm_arch_hardware_enable(void);
 void kvm_arch_hardware_disable(void);
 int kvm_arch_hardware_setup(void);
 void kvm_arch_hardware_unsetup(void);
-void kvm_arch_check_processor_compat(void *rtn);
+int kvm_arch_check_processor_compat(void);
 int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu);
 bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu);
 int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu);
diff --git a/virt/kvm/arm/arm.c b/virt/kvm/arm/arm.c
index 7eeebe5e9da2..d2389033e9d6 100644
--- a/virt/kvm/arm/arm.c
+++ b/virt/kvm/arm/arm.c
@@ -105,9 +105,9 @@ int kvm_arch_hardware_setup(void)
 	return 0;
 }
 
-void kvm_arch_check_processor_compat(void *rtn)
+int kvm_arch_check_processor_compat(void)
 {
-	*(int *)rtn = 0;
+	return 0;
 }
 
 
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index ca54b09adf5b..b2579841263f 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -4224,6 +4224,11 @@ static void kvm_sched_out(struct preempt_notifier *pn,
 	kvm_arch_vcpu_put(vcpu);
 }
 
+static void check_processor_compat(void *rtn)
+{
+	*(int *)rtn = kvm_arch_check_processor_compat();
+}
+
 int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
 		  struct module *module)
 {
@@ -4255,9 +4260,7 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
 		goto out_free_0a;
 
 	for_each_online_cpu(cpu) {
-		smp_call_function_single(cpu,
-				kvm_arch_check_processor_compat,
-				&r, 1);
+		smp_call_function_single(cpu, check_processor_compat, &r, 1);
 		if (r < 0)
 			goto out_free_1;
 	}
-- 
cgit v1.2.3


From da29e4b466e6916a52e0e2f60054f855c324a9c2 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Mon, 3 Jun 2019 15:16:58 -0700
Subject: net/tls: fully initialize the msg wrapper skb

If strparser gets cornered into starting a new message from
an sk_buff which already has frags, it will allocate a new
skb to become the "wrapper" around the fragments of the
message.

This new skb does not inherit any metadata fields.  In case
of TLS offload this may lead to unnecessarily re-encrypting
the message, as skb->decrypted is not set for the wrapper skb.

Try to be conservative and copy all fields of the old skb that
strparser's user may reasonably need.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Dirk van der Merwe <dirk.vandermerwe@netronome.com>
Reviewed-by: Simon Horman <simon.horman@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h    |  1 +
 net/core/skbuff.c         | 25 +++++++++++++++++++++++++
 net/strparser/strparser.c |  8 ++------
 3 files changed, 28 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 2ee5e63195c0..98ff5ac98caa 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -1063,6 +1063,7 @@ struct sk_buff *alloc_skb_with_frags(unsigned long header_len,
 				     int max_page_order,
 				     int *errcode,
 				     gfp_t gfp_mask);
+struct sk_buff *alloc_skb_for_msg(struct sk_buff *first);
 
 /* Layout of fast clones : [skb1][skb2][fclone_ref] */
 struct sk_buff_fclones {
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 4a712a00243a..b50a5e3ac4e4 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -913,6 +913,31 @@ static struct sk_buff *__skb_clone(struct sk_buff *n, struct sk_buff *skb)
 #undef C
 }
 
+/**
+ * alloc_skb_for_msg() - allocate sk_buff to wrap frag list forming a msg
+ * @first: first sk_buff of the msg
+ */
+struct sk_buff *alloc_skb_for_msg(struct sk_buff *first)
+{
+	struct sk_buff *n;
+
+	n = alloc_skb(0, GFP_ATOMIC);
+	if (!n)
+		return NULL;
+
+	n->len = first->len;
+	n->data_len = first->len;
+	n->truesize = first->truesize;
+
+	skb_shinfo(n)->frag_list = first;
+
+	__copy_skb_header(n, first);
+	n->destructor = NULL;
+
+	return n;
+}
+EXPORT_SYMBOL_GPL(alloc_skb_for_msg);
+
 /**
  *	skb_morph	-	morph one skb into another
  *	@dst: the skb to receive the contents
diff --git a/net/strparser/strparser.c b/net/strparser/strparser.c
index e137698e8aef..3fe541b746b0 100644
--- a/net/strparser/strparser.c
+++ b/net/strparser/strparser.c
@@ -160,18 +160,14 @@ static int __strp_recv(read_descriptor_t *desc, struct sk_buff *orig_skb,
 					return 0;
 				}
 
-				skb = alloc_skb(0, GFP_ATOMIC);
+				skb = alloc_skb_for_msg(head);
 				if (!skb) {
 					STRP_STATS_INCR(strp->stats.mem_fail);
 					desc->error = -ENOMEM;
 					return 0;
 				}
-				skb->len = head->len;
-				skb->data_len = head->len;
-				skb->truesize = head->truesize;
-				*_strp_msg(skb) = *_strp_msg(head);
+
 				strp->skb_nextp = &head->next;
-				skb_shinfo(skb)->frag_list = head;
 				strp->skb_head = skb;
 				head = skb;
 			} else {
-- 
cgit v1.2.3


From 1210d1e6bad1e7ccccb19627b880a50d7c15dd51 Mon Sep 17 00:00:00 2001
From: Nick Crews <ncrews@chromium.org>
Date: Tue, 21 May 2019 13:20:45 -0600
Subject: platform/chrome: wilco_ec: Add telemetry char device interface

The Wilco Embedded Controller is able to send telemetry data
which is useful for enterprise applications. A daemon running on
the OS sends a command to the EC via a write() to a char device,
and can read the response with a read(). The write() request is
verified by the driver to ensure that it is performing only one
of the whitelisted commands, and that no extraneous data is
being transmitted to the EC. The response is passed directly
back to the reader with no modification.

The character device will appear as /dev/wilco_telemN, where N
is some small non-negative integer, starting with 0. Only one
process may have the file descriptor open at a time. The calling
userspace program needs to keep the device file descriptor open
between the calls to write() and read() in order to preserve the
response. Up to 32 bytes will be available for reading.

For testing purposes, try requesting the EC's firmware build
date, by sending the WILCO_EC_TELEM_GET_VERSION command with
argument index=3. i.e. write [0x38, 0x00, 0x03]
to the device node. An ASCII string of the build date is
returned.

Signed-off-by: Nick Crews <ncrews@chromium.org>
Signed-off-by: Enric Balletbo i Serra <enric.balletbo@collabora.com>
---
 drivers/platform/chrome/wilco_ec/Kconfig     |   7 +
 drivers/platform/chrome/wilco_ec/Makefile    |   2 +
 drivers/platform/chrome/wilco_ec/core.c      |  13 +
 drivers/platform/chrome/wilco_ec/debugfs.c   |   2 +-
 drivers/platform/chrome/wilco_ec/telemetry.c | 450 +++++++++++++++++++++++++++
 include/linux/platform_data/wilco-ec.h       |   2 +
 6 files changed, 475 insertions(+), 1 deletion(-)
 create mode 100644 drivers/platform/chrome/wilco_ec/telemetry.c

(limited to 'include/linux')

diff --git a/drivers/platform/chrome/wilco_ec/Kconfig b/drivers/platform/chrome/wilco_ec/Kconfig
index 2c8f6f15f28f..90336874af59 100644
--- a/drivers/platform/chrome/wilco_ec/Kconfig
+++ b/drivers/platform/chrome/wilco_ec/Kconfig
@@ -27,3 +27,10 @@ config WILCO_EC_EVENTS
 	  (such as power state changes) to userspace. The EC sends the events
 	  over ACPI, and a driver queues up the events to be read by a
 	  userspace daemon from /dev/wilco_event using read() and poll().
+
+config WILCO_EC_TELEMETRY
+	tristate "Enable querying telemetry data from EC"
+	depends on WILCO_EC
+	help
+	  If you say Y here, you get support to query EC telemetry data from
+	  /dev/wilco_telem0 using write() and then read().
diff --git a/drivers/platform/chrome/wilco_ec/Makefile b/drivers/platform/chrome/wilco_ec/Makefile
index 4d8a5f068f8b..bc817164596e 100644
--- a/drivers/platform/chrome/wilco_ec/Makefile
+++ b/drivers/platform/chrome/wilco_ec/Makefile
@@ -6,3 +6,5 @@ wilco_ec_debugfs-objs			:= debugfs.o
 obj-$(CONFIG_WILCO_EC_DEBUGFS)		+= wilco_ec_debugfs.o
 wilco_ec_events-objs			:= event.o
 obj-$(CONFIG_WILCO_EC_EVENTS)		+= wilco_ec_events.o
+wilco_ec_telem-objs			:= telemetry.o
+obj-$(CONFIG_WILCO_EC_TELEMETRY)	+= wilco_ec_telem.o
diff --git a/drivers/platform/chrome/wilco_ec/core.c b/drivers/platform/chrome/wilco_ec/core.c
index 45cf3a5ed062..3724bf4b77c6 100644
--- a/drivers/platform/chrome/wilco_ec/core.c
+++ b/drivers/platform/chrome/wilco_ec/core.c
@@ -93,8 +93,20 @@ static int wilco_ec_probe(struct platform_device *pdev)
 		goto unregister_rtc;
 	}
 
+	/* Register child device that will be found by the telemetry driver. */
+	ec->telem_pdev = platform_device_register_data(dev, "wilco_telem",
+						       PLATFORM_DEVID_AUTO,
+						       ec, sizeof(*ec));
+	if (IS_ERR(ec->telem_pdev)) {
+		dev_err(dev, "Failed to create telemetry platform device\n");
+		ret = PTR_ERR(ec->telem_pdev);
+		goto remove_sysfs;
+	}
+
 	return 0;
 
+remove_sysfs:
+	wilco_ec_remove_sysfs(ec);
 unregister_rtc:
 	platform_device_unregister(ec->rtc_pdev);
 unregister_debugfs:
@@ -109,6 +121,7 @@ static int wilco_ec_remove(struct platform_device *pdev)
 	struct wilco_ec_device *ec = platform_get_drvdata(pdev);
 
 	wilco_ec_remove_sysfs(ec);
+	platform_device_unregister(ec->telem_pdev);
 	platform_device_unregister(ec->rtc_pdev);
 	if (ec->debugfs_pdev)
 		platform_device_unregister(ec->debugfs_pdev);
diff --git a/drivers/platform/chrome/wilco_ec/debugfs.c b/drivers/platform/chrome/wilco_ec/debugfs.c
index 281ec595e8e0..8d65a1e2f1a3 100644
--- a/drivers/platform/chrome/wilco_ec/debugfs.c
+++ b/drivers/platform/chrome/wilco_ec/debugfs.c
@@ -16,7 +16,7 @@
 
 #define DRV_NAME "wilco-ec-debugfs"
 
-/* The 256 raw bytes will take up more space when represented as a hex string */
+/* The raw bytes will take up more space when represented as a hex string */
 #define FORMATTED_BUFFER_SIZE (EC_MAILBOX_DATA_SIZE * 4)
 
 struct wilco_ec_debugfs {
diff --git a/drivers/platform/chrome/wilco_ec/telemetry.c b/drivers/platform/chrome/wilco_ec/telemetry.c
new file mode 100644
index 000000000000..94cdc166c840
--- /dev/null
+++ b/drivers/platform/chrome/wilco_ec/telemetry.c
@@ -0,0 +1,450 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Telemetry communication for Wilco EC
+ *
+ * Copyright 2019 Google LLC
+ *
+ * The Wilco Embedded Controller is able to send telemetry data
+ * which is useful for enterprise applications. A daemon running on
+ * the OS sends a command to the EC via a write() to a char device,
+ * and can read the response with a read(). The write() request is
+ * verified by the driver to ensure that it is performing only one
+ * of the whitelisted commands, and that no extraneous data is
+ * being transmitted to the EC. The response is passed directly
+ * back to the reader with no modification.
+ *
+ * The character device will appear as /dev/wilco_telemN, where N
+ * is some small non-negative integer, starting with 0. Only one
+ * process may have the file descriptor open at a time. The calling
+ * userspace program needs to keep the device file descriptor open
+ * between the calls to write() and read() in order to preserve the
+ * response. Up to 32 bytes will be available for reading.
+ *
+ * For testing purposes, try requesting the EC's firmware build
+ * date, by sending the WILCO_EC_TELEM_GET_VERSION command with
+ * argument index=3. i.e. write [0x38, 0x00, 0x03]
+ * to the device node. An ASCII string of the build date is
+ * returned.
+ */
+
+#include <linux/cdev.h>
+#include <linux/device.h>
+#include <linux/fs.h>
+#include <linux/module.h>
+#include <linux/platform_data/wilco-ec.h>
+#include <linux/platform_device.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+#include <linux/uaccess.h>
+
+#define TELEM_DEV_NAME		"wilco_telem"
+#define TELEM_CLASS_NAME	TELEM_DEV_NAME
+#define DRV_NAME		TELEM_DEV_NAME
+#define TELEM_DEV_NAME_FMT	(TELEM_DEV_NAME "%d")
+static struct class telem_class = {
+	.owner	= THIS_MODULE,
+	.name	= TELEM_CLASS_NAME,
+};
+
+/* Keep track of all the device numbers used. */
+#define TELEM_MAX_DEV 128
+static int telem_major;
+static DEFINE_IDA(telem_ida);
+
+/* EC telemetry command codes */
+#define WILCO_EC_TELEM_GET_LOG			0x99
+#define WILCO_EC_TELEM_GET_VERSION		0x38
+#define WILCO_EC_TELEM_GET_FAN_INFO		0x2E
+#define WILCO_EC_TELEM_GET_DIAG_INFO		0xFA
+#define WILCO_EC_TELEM_GET_TEMP_INFO		0x95
+#define WILCO_EC_TELEM_GET_TEMP_READ		0x2C
+#define WILCO_EC_TELEM_GET_BATT_EXT_INFO	0x07
+
+#define TELEM_ARGS_SIZE_MAX	30
+
+/**
+ * struct wilco_ec_telem_request - Telemetry command and arguments sent to EC.
+ * @command: One of WILCO_EC_TELEM_GET_* command codes.
+ * @reserved: Must be 0.
+ * @args: The first N bytes are one of telem_args_get_* structs, the rest is 0.
+ */
+struct wilco_ec_telem_request {
+	u8 command;
+	u8 reserved;
+	u8 args[TELEM_ARGS_SIZE_MAX];
+} __packed;
+
+/*
+ * The following telem_args_get_* structs are embedded within the |args| field
+ * of wilco_ec_telem_request.
+ */
+
+struct telem_args_get_log {
+	u8 log_type;
+	u8 log_index;
+} __packed;
+
+/*
+ * Get a piece of info about the EC firmware version:
+ * 0 = label
+ * 1 = svn_rev
+ * 2 = model_no
+ * 3 = build_date
+ * 4 = frio_version
+ */
+struct telem_args_get_version {
+	u8 index;
+} __packed;
+
+struct telem_args_get_fan_info {
+	u8 command;
+	u8 fan_number;
+	u8 arg;
+} __packed;
+
+struct telem_args_get_diag_info {
+	u8 type;
+	u8 sub_type;
+} __packed;
+
+struct telem_args_get_temp_info {
+	u8 command;
+	u8 index;
+	u8 field;
+	u8 zone;
+} __packed;
+
+struct telem_args_get_temp_read {
+	u8 sensor_index;
+} __packed;
+
+struct telem_args_get_batt_ext_info {
+	u8 var_args[5];
+} __packed;
+
+/**
+ * check_telem_request() - Ensure that a request from userspace is valid.
+ * @rq: Request buffer copied from userspace.
+ * @size: Number of bytes copied from userspace.
+ *
+ * Return: 0 if valid, -EINVAL if bad command or reserved byte is non-zero,
+ *         -EMSGSIZE if the request is too long.
+ *
+ * We do not want to allow userspace to send arbitrary telemetry commands to
+ * the EC. Therefore we check to ensure that
+ * 1. The request follows the format of struct wilco_ec_telem_request.
+ * 2. The supplied command code is one of the whitelisted commands.
+ * 3. The request only contains the necessary data for the header and arguments.
+ */
+static int check_telem_request(struct wilco_ec_telem_request *rq,
+			       size_t size)
+{
+	size_t max_size = offsetof(struct wilco_ec_telem_request, args);
+
+	if (rq->reserved)
+		return -EINVAL;
+
+	switch (rq->command) {
+	case WILCO_EC_TELEM_GET_LOG:
+		max_size += sizeof(struct telem_args_get_log);
+		break;
+	case WILCO_EC_TELEM_GET_VERSION:
+		max_size += sizeof(struct telem_args_get_version);
+		break;
+	case WILCO_EC_TELEM_GET_FAN_INFO:
+		max_size += sizeof(struct telem_args_get_fan_info);
+		break;
+	case WILCO_EC_TELEM_GET_DIAG_INFO:
+		max_size += sizeof(struct telem_args_get_diag_info);
+		break;
+	case WILCO_EC_TELEM_GET_TEMP_INFO:
+		max_size += sizeof(struct telem_args_get_temp_info);
+		break;
+	case WILCO_EC_TELEM_GET_TEMP_READ:
+		max_size += sizeof(struct telem_args_get_temp_read);
+		break;
+	case WILCO_EC_TELEM_GET_BATT_EXT_INFO:
+		max_size += sizeof(struct telem_args_get_batt_ext_info);
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	return (size <= max_size) ? 0 : -EMSGSIZE;
+}
+
+/**
+ * struct telem_device_data - Data for a Wilco EC device that queries telemetry.
+ * @cdev: Char dev that userspace writes requests to and reads responses from.
+ * @dev: Device associated with the %cdev.
+ * @ec: Wilco EC that we will be communicating with using the mailbox interface.
+ * @available: 1 if the device can be opened, 0 while a file has it open.
+ */
+struct telem_device_data {
+	struct device dev;
+	struct cdev cdev;
+	struct wilco_ec_device *ec;
+	atomic_t available;
+};
+
+#define TELEM_RESPONSE_SIZE	EC_MAILBOX_DATA_SIZE
+
+/**
+ * struct telem_session_data - Data that exists between open() and release().
+ * @dev_data: Pointer to get back to the device data and EC.
+ * @request: Command and arguments sent to EC.
+ * @response: Response buffer of data from EC.
+ * @has_msg: Is there data available to read from a previous write?
+ */
+struct telem_session_data {
+	struct telem_device_data *dev_data;
+	struct wilco_ec_telem_request request;
+	u8 response[TELEM_RESPONSE_SIZE];
+	bool has_msg;
+};
+
+/**
+ * telem_open() - Callback for when the device node is opened.
+ * @inode: inode for this char device node.
+ * @filp: file for this char device node.
+ *
+ * We need to ensure that after writing a command to the device,
+ * the same userspace process reads the corresponding result.
+ * Therefore, we claim the "available" flag on opening the device, so that
+ * only one open file can communicate with the EC at a time.
+ *
+ * Return: 0 on success, -EBUSY if already open, -ENOMEM if allocation fails.
+ */
+static int telem_open(struct inode *inode, struct file *filp)
+{
+	struct telem_device_data *dev_data;
+	struct telem_session_data *sess_data;
+
+	/* Ensure device isn't already open */
+	dev_data = container_of(inode->i_cdev, struct telem_device_data, cdev);
+	if (atomic_cmpxchg(&dev_data->available, 1, 0) == 0)
+		return -EBUSY;
+
+	sess_data = kzalloc(sizeof(*sess_data), GFP_KERNEL);
+	if (!sess_data) {
+		atomic_set(&dev_data->available, 1);
+		return -ENOMEM;
+	}
+	/* Take the device reference last so the error path leaks nothing */
+	get_device(&dev_data->dev);
+	sess_data->dev_data = dev_data;
+	sess_data->has_msg = false;
+
+	nonseekable_open(inode, filp);
+	filp->private_data = sess_data;
+
+	return 0;
+}
+
+static ssize_t telem_write(struct file *filp, const char __user *buf,
+			   size_t count, loff_t *pos)
+{
+	struct telem_session_data *sess_data = filp->private_data;
+	struct wilco_ec_message msg = {};
+	int ret;
+
+	if (count > sizeof(sess_data->request))
+		return -EMSGSIZE;
+	/* Zero first: a short write must not resend bytes of an earlier one */
+	memset(&sess_data->request, 0, sizeof(sess_data->request));
+	if (copy_from_user(&sess_data->request, buf, count))
+		return -EFAULT;
+	ret = check_telem_request(&sess_data->request, count);
+	if (ret < 0)
+		return ret;
+
+	memset(sess_data->response, 0, sizeof(sess_data->response));
+	msg.type = WILCO_EC_MSG_TELEMETRY;
+	msg.request_data = &sess_data->request;
+	msg.request_size = sizeof(sess_data->request);
+	msg.response_data = sess_data->response;
+	msg.response_size = sizeof(sess_data->response);
+	ret = wilco_ec_mailbox(sess_data->dev_data->ec, &msg);
+	if (ret < 0)
+		return ret;
+	if (ret != sizeof(sess_data->response))
+		return -EMSGSIZE;
+
+	sess_data->has_msg = true;
+	return count;
+}
+
+static ssize_t telem_read(struct file *filp, char __user *buf, size_t count,
+			  loff_t *pos)
+{
+	struct telem_session_data *sess_data = filp->private_data;
+
+	if (!sess_data->has_msg)
+		return -ENODATA;
+	if (count > sizeof(sess_data->response))
+		return -EINVAL;
+
+	if (copy_to_user(buf, sess_data->response, count))
+		return -EFAULT;
+
+	sess_data->has_msg = false;
+
+	return count;
+}
+
+static int telem_release(struct inode *inode, struct file *filp)
+{
+	struct telem_session_data *sess_data = filp->private_data;
+
+	atomic_set(&sess_data->dev_data->available, 1);
+	put_device(&sess_data->dev_data->dev);
+	kfree(sess_data);
+
+	return 0;
+}
+
+static const struct file_operations telem_fops = {
+	.open = telem_open,
+	.write = telem_write,
+	.read = telem_read,
+	.release = telem_release,
+	.llseek = no_llseek,
+	.owner = THIS_MODULE,
+};
+
+/**
+ * telem_device_free() - Callback to free the telem_device_data structure.
+ * @d: The device embedded in our device data, which we have been ref counting.
+ *
+ * Once all open file descriptors are closed and the device has been removed,
+ * the refcount of the device will fall to 0 and this will be called.
+ */
+static void telem_device_free(struct device *d)
+{
+	struct telem_device_data *dev_data;
+
+	dev_data = container_of(d, struct telem_device_data, dev);
+	kfree(dev_data);
+}
+
+/**
+ * telem_device_probe() - Callback when creating a new device.
+ * @pdev: platform device that we will be receiving telems from.
+ *
+ * This finds a free minor number for the device, allocates and initializes
+ * some device data, and creates a new device and char dev node.
+ *
+ * Return: 0 on success, negative error code on failure.
+ */
+static int telem_device_probe(struct platform_device *pdev)
+{
+	struct telem_device_data *dev_data;
+	int error, minor;
+
+	/* Get the next available device number */
+	minor = ida_alloc_max(&telem_ida, TELEM_MAX_DEV-1, GFP_KERNEL);
+	if (minor < 0) {
+		error = minor;
+		dev_err(&pdev->dev, "Failed to find minor number: %d\n", error);
+		return error;
+	}
+
+	dev_data = kzalloc(sizeof(*dev_data), GFP_KERNEL);
+	if (!dev_data) {
+		ida_simple_remove(&telem_ida, minor);
+		return -ENOMEM;
+	}
+
+	/* Initialize the device data */
+	dev_data->ec = dev_get_platdata(&pdev->dev);
+	atomic_set(&dev_data->available, 1);
+	platform_set_drvdata(pdev, dev_data);
+
+	/* Initialize the device */
+	dev_data->dev.devt = MKDEV(telem_major, minor);
+	dev_data->dev.class = &telem_class;
+	dev_data->dev.release = telem_device_free;
+	dev_set_name(&dev_data->dev, TELEM_DEV_NAME_FMT, minor);
+	device_initialize(&dev_data->dev);
+
+	/* Initialize the character device and add it to userspace */
+	cdev_init(&dev_data->cdev, &telem_fops);
+	error = cdev_device_add(&dev_data->cdev, &dev_data->dev);
+	if (error) {
+		put_device(&dev_data->dev);
+		ida_simple_remove(&telem_ida, minor);
+		return error;
+	}
+
+	return 0;
+}
+
+static int telem_device_remove(struct platform_device *pdev)
+{
+	struct telem_device_data *dev_data = platform_get_drvdata(pdev);
+
+	cdev_device_del(&dev_data->cdev, &dev_data->dev);
+	ida_simple_remove(&telem_ida, MINOR(dev_data->dev.devt));
+	put_device(&dev_data->dev);	/* may free dev_data; must be last */
+
+	return 0;
+}
+
+static struct platform_driver telem_driver = {
+	.probe = telem_device_probe,
+	.remove = telem_device_remove,
+	.driver = {
+		.name = DRV_NAME,
+	},
+};
+
+static int __init telem_module_init(void)
+{
+	dev_t dev_num = 0;
+	int ret;
+
+	ret = class_register(&telem_class);
+	if (ret) {
+		pr_err(DRV_NAME ": Failed registering class: %d\n", ret);
+		return ret;
+	}
+
+	/* Request the kernel for device numbers, starting with minor=0 */
+	ret = alloc_chrdev_region(&dev_num, 0, TELEM_MAX_DEV, TELEM_DEV_NAME);
+	if (ret) {
+		pr_err(DRV_NAME ": Failed allocating dev numbers: %d\n", ret);
+		goto destroy_class;
+	}
+	telem_major = MAJOR(dev_num);
+
+	ret = platform_driver_register(&telem_driver);
+	if (ret < 0) {
+		pr_err(DRV_NAME ": Failed registering driver: %d\n", ret);
+		goto unregister_region;
+	}
+
+	return 0;
+
+unregister_region:
+	unregister_chrdev_region(MKDEV(telem_major, 0), TELEM_MAX_DEV);
+destroy_class:
+	class_unregister(&telem_class);
+	ida_destroy(&telem_ida);
+	return ret;
+}
+
+static void __exit telem_module_exit(void)
+{
+	platform_driver_unregister(&telem_driver);
+	unregister_chrdev_region(MKDEV(telem_major, 0), TELEM_MAX_DEV);
+	class_unregister(&telem_class);
+	ida_destroy(&telem_ida);
+}
+
+module_init(telem_module_init);
+module_exit(telem_module_exit);
+
+MODULE_AUTHOR("Nick Crews <ncrews@chromium.org>");
+MODULE_DESCRIPTION("Wilco EC telemetry driver");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("platform:" DRV_NAME);
diff --git a/include/linux/platform_data/wilco-ec.h b/include/linux/platform_data/wilco-ec.h
index e3ce9ce49b11..ad03b586a095 100644
--- a/include/linux/platform_data/wilco-ec.h
+++ b/include/linux/platform_data/wilco-ec.h
@@ -29,6 +29,7 @@
  * @data_size: Size of the data buffer used for EC communication.
  * @debugfs_pdev: The child platform_device used by the debugfs sub-driver.
  * @rtc_pdev: The child platform_device used by the RTC sub-driver.
+ * @telem_pdev: The child platform_device used by the telemetry sub-driver.
  */
 struct wilco_ec_device {
 	struct device *dev;
@@ -40,6 +41,7 @@ struct wilco_ec_device {
 	size_t data_size;
 	struct platform_device *debugfs_pdev;
 	struct platform_device *rtc_pdev;
+	struct platform_device *telem_pdev;
 };
 
 /**
-- 
cgit v1.2.3


From 4998f1efd1904dd21697aeeead270e3eb97691dd Mon Sep 17 00:00:00 2001
From: Jim Lin <jilin@nvidia.com>
Date: Mon, 3 Jun 2019 18:53:43 +0800
Subject: usb: Add devaddr in struct usb_device

The Clear_TT_Buffer request sent to the hub includes the address of
the LS/FS child device in wValue field. usb_hub_clear_tt_buffer()
uses udev->devnum to set the address wValue. This won't work for
devices connected to xHC.

For other host controllers udev->devnum is the same as the address of
the usb device, chosen and set by usb core. With xHC the controller
hardware assigns the address, and it may differ from devnum.

Here we add devaddr in "struct usb_device" for
usb_hub_clear_tt_buffer() to use.

Signed-off-by: Jim Lin <jilin@nvidia.com>
Acked-by: Alan Stern <stern@rowland.harvard.edu>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/usb/core/hub.c  | 4 +++-
 drivers/usb/host/xhci.c | 2 ++
 include/linux/usb.h     | 2 ++
 3 files changed, 7 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/usb/core/hub.c b/drivers/usb/core/hub.c
index 572e8c26a129..82cc3766cb23 100644
--- a/drivers/usb/core/hub.c
+++ b/drivers/usb/core/hub.c
@@ -873,7 +873,7 @@ int usb_hub_clear_tt_buffer(struct urb *urb)
 	/* info that CLEAR_TT_BUFFER needs */
 	clear->tt = tt->multi ? udev->ttport : 1;
 	clear->devinfo = usb_pipeendpoint (pipe);
-	clear->devinfo |= udev->devnum << 4;
+	clear->devinfo |= ((u16)udev->devaddr) << 4;
 	clear->devinfo |= usb_pipecontrol(pipe)
 			? (USB_ENDPOINT_XFER_CONTROL << 11)
 			: (USB_ENDPOINT_XFER_BULK << 11);
@@ -2125,6 +2125,8 @@ static void update_devnum(struct usb_device *udev, int devnum)
 	/* The address for a WUSB device is managed by wusbcore. */
 	if (!udev->wusb)
 		udev->devnum = devnum;
+	if (!udev->devaddr)
+		udev->devaddr = (u8)devnum;
 }
 
 static void hub_free_dev(struct usb_device *udev)
diff --git a/drivers/usb/host/xhci.c b/drivers/usb/host/xhci.c
index 20db378a6012..4f92643e3a4c 100644
--- a/drivers/usb/host/xhci.c
+++ b/drivers/usb/host/xhci.c
@@ -4125,6 +4125,8 @@ static int xhci_setup_device(struct usb_hcd *hcd, struct usb_device *udev,
 	/* Zero the input context control for later use */
 	ctrl_ctx->add_flags = 0;
 	ctrl_ctx->drop_flags = 0;
+	slot_ctx = xhci_get_slot_ctx(xhci, virt_dev->out_ctx);
+	udev->devaddr = (u8)(le32_to_cpu(slot_ctx->dev_state) & DEV_ADDR_MASK);
 
 	xhci_dbg_trace(xhci, trace_xhci_dbg_address,
 		       "Internal device address = %d",
diff --git a/include/linux/usb.h b/include/linux/usb.h
index ae82d9d1112b..83d35d993e8c 100644
--- a/include/linux/usb.h
+++ b/include/linux/usb.h
@@ -578,6 +578,7 @@ struct usb3_lpm_parameters {
  * @bus_mA: Current available from the bus
  * @portnum: parent port number (origin 1)
  * @level: number of USB hub ancestors
+ * @devaddr: device address, XHCI: assigned by HW, others: same as devnum
  * @can_submit: URBs may be submitted
  * @persist_enabled:  USB_PERSIST enabled for this device
  * @have_langid: whether string_langid is valid
@@ -661,6 +662,7 @@ struct usb_device {
 	unsigned short bus_mA;
 	u8 portnum;
 	u8 level;
+	u8 devaddr;
 
 	unsigned can_submit:1;
 	unsigned persist_enabled:1;
-- 
cgit v1.2.3


From 32a6cfdfd168982cd7cd2898372da5eb49e56daf Mon Sep 17 00:00:00 2001
From: Yoshihiro Shimoda <yoshihiro.shimoda.uh@renesas.com>
Date: Wed, 5 Jun 2019 15:16:21 +0900
Subject: usb: renesas_usbhs: remove sudmac support

SUDMAC feature was supported in v3.10, but was never used by
any platform. So, this patch removes it.

Signed-off-by: Yoshihiro Shimoda <yoshihiro.shimoda.uh@renesas.com>
Reviewed-by: Geert Uytterhoeven <geert+renesas@glider.be>
Reviewed-by: Simon Horman <horms+renesas@verge.net.au>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/usb/renesas_usbhs/fifo.c  | 6 +-----
 include/linux/usb/renesas_usbhs.h | 1 -
 2 files changed, 1 insertion(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/usb/renesas_usbhs/fifo.c b/drivers/usb/renesas_usbhs/fifo.c
index 452b456ac24e..e84d2ac2a30a 100644
--- a/drivers/usb/renesas_usbhs/fifo.c
+++ b/drivers/usb/renesas_usbhs/fifo.c
@@ -12,7 +12,6 @@
 #include "pipe.h"
 
 #define usbhsf_get_cfifo(p)	(&((p)->fifo_info.cfifo))
-#define usbhsf_is_cfifo(p, f)	(usbhsf_get_cfifo(p) == f)
 
 #define usbhsf_fifo_is_busy(f)	((f)->pipe) /* see usbhs_pipe_select_fifo */
 
@@ -325,10 +324,7 @@ static int usbhsf_fifo_select(struct usbhs_pipe *pipe,
 	}
 
 	/* "base" will be used below  */
-	if (usbhs_get_dparam(priv, has_sudmac) && !usbhsf_is_cfifo(priv, fifo))
-		usbhs_write(priv, fifo->sel, base);
-	else
-		usbhs_write(priv, fifo->sel, base | MBW_32);
+	usbhs_write(priv, fifo->sel, base | MBW_32);
 
 	/* check ISEL and CURPIPE value */
 	while (timeout--) {
diff --git a/include/linux/usb/renesas_usbhs.h b/include/linux/usb/renesas_usbhs.h
index 3f53043fb56b..a2481f4da841 100644
--- a/include/linux/usb/renesas_usbhs.h
+++ b/include/linux/usb/renesas_usbhs.h
@@ -187,7 +187,6 @@ struct renesas_usbhs_driver_param {
 	 * option:
 	 */
 	u32 has_otg:1; /* for controlling PWEN/EXTLP */
-	u32 has_sudmac:1; /* for SUDMAC */
 	u32 has_usb_dmac:1; /* for USB-DMAC */
 	u32 runtime_pwctrl:1;
 	u32 has_cnen:1;
-- 
cgit v1.2.3


From e60e982375244026ca46feeba0fb5bb4d51b5a67 Mon Sep 17 00:00:00 2001
From: Yoshihiro Shimoda <yoshihiro.shimoda.uh@renesas.com>
Date: Wed, 5 Jun 2019 15:16:22 +0900
Subject: usb: renesas_usbhs: remove controlling PWEN/EXTLP support

Controlling PWEN/EXTLP (named as "has_otg") was supported in v3.2,
but the last user (kzm9g) was removed by the commit 30f8925a57d8ad49
("ARM: shmobile: Remove legacy board code for KZM-A9-GT"). So, this
patch removes it.

Signed-off-by: Yoshihiro Shimoda <yoshihiro.shimoda.uh@renesas.com>
Reviewed-by: Geert Uytterhoeven <geert+renesas@glider.be>
Reviewed-by: Simon Horman <horms+renesas@verge.net.au>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/usb/renesas_usbhs/common.c | 4 ----
 include/linux/usb/renesas_usbhs.h  | 1 -
 2 files changed, 5 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/usb/renesas_usbhs/common.c b/drivers/usb/renesas_usbhs/common.c
index c7c9c5d75a56..a501ea609019 100644
--- a/drivers/usb/renesas_usbhs/common.c
+++ b/drivers/usb/renesas_usbhs/common.c
@@ -95,10 +95,6 @@ void usbhs_sys_host_ctrl(struct usbhs_priv *priv, int enable)
 {
 	u16 mask = DCFM | DRPD | DPRPU | HSE | USBE;
 	u16 val  = DCFM | DRPD | HSE | USBE;
-	int has_otg = usbhs_get_dparam(priv, has_otg);
-
-	if (has_otg)
-		usbhs_bset(priv, DVSTCTR, (EXTLP | PWEN), (EXTLP | PWEN));
 
 	/*
 	 * if enable
diff --git a/include/linux/usb/renesas_usbhs.h b/include/linux/usb/renesas_usbhs.h
index a2481f4da841..b2cba7c74444 100644
--- a/include/linux/usb/renesas_usbhs.h
+++ b/include/linux/usb/renesas_usbhs.h
@@ -186,7 +186,6 @@ struct renesas_usbhs_driver_param {
 	/*
 	 * option:
 	 */
-	u32 has_otg:1; /* for controlling PWEN/EXTLP */
 	u32 has_usb_dmac:1; /* for USB-DMAC */
 	u32 runtime_pwctrl:1;
 	u32 has_cnen:1;
-- 
cgit v1.2.3


From 0d9ce162cf46c99628cc5da9510b959c7976735b Mon Sep 17 00:00:00 2001
From: Junaid Shahid <junaids@google.com>
Date: Thu, 3 Jan 2019 17:14:28 -0800
Subject: kvm: Convert kvm_lock to a mutex

It doesn't seem as if there is any particular need for kvm_lock to be a
spinlock, so convert the lock to a mutex so that sleepable functions (in
particular cond_resched()) can be called while holding it.

Signed-off-by: Junaid Shahid <junaids@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 Documentation/virtual/kvm/locking.txt |  4 +---
 arch/s390/kvm/kvm-s390.c              |  4 ++--
 arch/x86/kvm/mmu.c                    |  4 ++--
 arch/x86/kvm/x86.c                    | 14 +++++++-------
 include/linux/kvm_host.h              |  2 +-
 virt/kvm/kvm_main.c                   | 30 +++++++++++++++---------------
 6 files changed, 28 insertions(+), 30 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/virtual/kvm/locking.txt b/Documentation/virtual/kvm/locking.txt
index 1bb8bcaf8497..635cd6eaf714 100644
--- a/Documentation/virtual/kvm/locking.txt
+++ b/Documentation/virtual/kvm/locking.txt
@@ -15,8 +15,6 @@ The acquisition orders for mutexes are as follows:
 
 On x86, vcpu->mutex is taken outside kvm->arch.hyperv.hv_lock.
 
-For spinlocks, kvm_lock is taken outside kvm->mmu_lock.
-
 Everything else is a leaf: no other lock is taken inside the critical
 sections.
 
@@ -169,7 +167,7 @@ which time it will be set using the Dirty tracking mechanism described above.
 ------------
 
 Name:		kvm_lock
-Type:		spinlock_t
+Type:		mutex
 Arch:		any
 Protects:	- vm_list
 
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 7936af0a971f..0fef9192f6ac 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -2423,13 +2423,13 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
 	kvm->arch.sca = (struct bsca_block *) get_zeroed_page(alloc_flags);
 	if (!kvm->arch.sca)
 		goto out_err;
-	spin_lock(&kvm_lock);
+	mutex_lock(&kvm_lock);
 	sca_offset += 16;
 	if (sca_offset + sizeof(struct bsca_block) > PAGE_SIZE)
 		sca_offset = 0;
 	kvm->arch.sca = (struct bsca_block *)
 			((char *) kvm->arch.sca + sca_offset);
-	spin_unlock(&kvm_lock);
+	mutex_unlock(&kvm_lock);
 
 	sprintf(debug_name, "kvm-%u", current->pid);
 
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 95ac393e2959..3384c539d150 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -5956,7 +5956,7 @@ mmu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
 	int nr_to_scan = sc->nr_to_scan;
 	unsigned long freed = 0;
 
-	spin_lock(&kvm_lock);
+	mutex_lock(&kvm_lock);
 
 	list_for_each_entry(kvm, &vm_list, vm_list) {
 		int idx;
@@ -5998,7 +5998,7 @@ mmu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
 		break;
 	}
 
-	spin_unlock(&kvm_lock);
+	mutex_unlock(&kvm_lock);
 	return freed;
 }
 
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 10feed6a01eb..6200d5a51f13 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -6719,7 +6719,7 @@ static void kvm_hyperv_tsc_notifier(void)
 	struct kvm_vcpu *vcpu;
 	int cpu;
 
-	spin_lock(&kvm_lock);
+	mutex_lock(&kvm_lock);
 	list_for_each_entry(kvm, &vm_list, vm_list)
 		kvm_make_mclock_inprogress_request(kvm);
 
@@ -6745,7 +6745,7 @@ static void kvm_hyperv_tsc_notifier(void)
 
 		spin_unlock(&ka->pvclock_gtod_sync_lock);
 	}
-	spin_unlock(&kvm_lock);
+	mutex_unlock(&kvm_lock);
 }
 #endif
 
@@ -6796,17 +6796,17 @@ static void __kvmclock_cpufreq_notifier(struct cpufreq_freqs *freq, int cpu)
 
 	smp_call_function_single(cpu, tsc_khz_changed, freq, 1);
 
-	spin_lock(&kvm_lock);
+	mutex_lock(&kvm_lock);
 	list_for_each_entry(kvm, &vm_list, vm_list) {
 		kvm_for_each_vcpu(i, vcpu, kvm) {
 			if (vcpu->cpu != cpu)
 				continue;
 			kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
-			if (vcpu->cpu != smp_processor_id())
+			if (vcpu->cpu != raw_smp_processor_id())
 				send_ipi = 1;
 		}
 	}
-	spin_unlock(&kvm_lock);
+	mutex_unlock(&kvm_lock);
 
 	if (freq->old < freq->new && send_ipi) {
 		/*
@@ -6929,12 +6929,12 @@ static void pvclock_gtod_update_fn(struct work_struct *work)
 	struct kvm_vcpu *vcpu;
 	int i;
 
-	spin_lock(&kvm_lock);
+	mutex_lock(&kvm_lock);
 	list_for_each_entry(kvm, &vm_list, vm_list)
 		kvm_for_each_vcpu(i, vcpu, kvm)
 			kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
 	atomic_set(&kvm_guest_has_master_clock, 0);
-	spin_unlock(&kvm_lock);
+	mutex_unlock(&kvm_lock);
 }
 
 static DECLARE_WORK(pvclock_gtod_work, pvclock_gtod_update_fn);
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 5e9fd7ad8018..abafddb9fe2c 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -162,7 +162,7 @@ static inline bool is_error_page(struct page *page)
 
 extern struct kmem_cache *kvm_vcpu_cache;
 
-extern spinlock_t kvm_lock;
+extern struct mutex kvm_lock;
 extern struct list_head vm_list;
 
 struct kvm_io_range {
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index b2579841263f..9613987ef4c8 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -98,7 +98,7 @@ EXPORT_SYMBOL_GPL(halt_poll_ns_shrink);
  *	kvm->lock --> kvm->slots_lock --> kvm->irq_lock
  */
 
-DEFINE_SPINLOCK(kvm_lock);
+DEFINE_MUTEX(kvm_lock);
 static DEFINE_RAW_SPINLOCK(kvm_count_lock);
 LIST_HEAD(vm_list);
 
@@ -683,9 +683,9 @@ static struct kvm *kvm_create_vm(unsigned long type)
 	if (r)
 		goto out_err;
 
-	spin_lock(&kvm_lock);
+	mutex_lock(&kvm_lock);
 	list_add(&kvm->vm_list, &vm_list);
-	spin_unlock(&kvm_lock);
+	mutex_unlock(&kvm_lock);
 
 	preempt_notifier_inc();
 
@@ -731,9 +731,9 @@ static void kvm_destroy_vm(struct kvm *kvm)
 	kvm_uevent_notify_change(KVM_EVENT_DESTROY_VM, kvm);
 	kvm_destroy_vm_debugfs(kvm);
 	kvm_arch_sync_events(kvm);
-	spin_lock(&kvm_lock);
+	mutex_lock(&kvm_lock);
 	list_del(&kvm->vm_list);
-	spin_unlock(&kvm_lock);
+	mutex_unlock(&kvm_lock);
 	kvm_free_irq_routing(kvm);
 	for (i = 0; i < KVM_NR_BUSES; i++) {
 		struct kvm_io_bus *bus = kvm_get_bus(kvm, i);
@@ -4034,13 +4034,13 @@ static int vm_stat_get(void *_offset, u64 *val)
 	u64 tmp_val;
 
 	*val = 0;
-	spin_lock(&kvm_lock);
+	mutex_lock(&kvm_lock);
 	list_for_each_entry(kvm, &vm_list, vm_list) {
 		stat_tmp.kvm = kvm;
 		vm_stat_get_per_vm((void *)&stat_tmp, &tmp_val);
 		*val += tmp_val;
 	}
-	spin_unlock(&kvm_lock);
+	mutex_unlock(&kvm_lock);
 	return 0;
 }
 
@@ -4053,12 +4053,12 @@ static int vm_stat_clear(void *_offset, u64 val)
 	if (val)
 		return -EINVAL;
 
-	spin_lock(&kvm_lock);
+	mutex_lock(&kvm_lock);
 	list_for_each_entry(kvm, &vm_list, vm_list) {
 		stat_tmp.kvm = kvm;
 		vm_stat_clear_per_vm((void *)&stat_tmp, 0);
 	}
-	spin_unlock(&kvm_lock);
+	mutex_unlock(&kvm_lock);
 
 	return 0;
 }
@@ -4073,13 +4073,13 @@ static int vcpu_stat_get(void *_offset, u64 *val)
 	u64 tmp_val;
 
 	*val = 0;
-	spin_lock(&kvm_lock);
+	mutex_lock(&kvm_lock);
 	list_for_each_entry(kvm, &vm_list, vm_list) {
 		stat_tmp.kvm = kvm;
 		vcpu_stat_get_per_vm((void *)&stat_tmp, &tmp_val);
 		*val += tmp_val;
 	}
-	spin_unlock(&kvm_lock);
+	mutex_unlock(&kvm_lock);
 	return 0;
 }
 
@@ -4092,12 +4092,12 @@ static int vcpu_stat_clear(void *_offset, u64 val)
 	if (val)
 		return -EINVAL;
 
-	spin_lock(&kvm_lock);
+	mutex_lock(&kvm_lock);
 	list_for_each_entry(kvm, &vm_list, vm_list) {
 		stat_tmp.kvm = kvm;
 		vcpu_stat_clear_per_vm((void *)&stat_tmp, 0);
 	}
-	spin_unlock(&kvm_lock);
+	mutex_unlock(&kvm_lock);
 
 	return 0;
 }
@@ -4118,7 +4118,7 @@ static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm)
 	if (!kvm_dev.this_device || !kvm)
 		return;
 
-	spin_lock(&kvm_lock);
+	mutex_lock(&kvm_lock);
 	if (type == KVM_EVENT_CREATE_VM) {
 		kvm_createvm_count++;
 		kvm_active_vms++;
@@ -4127,7 +4127,7 @@ static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm)
 	}
 	created = kvm_createvm_count;
 	active = kvm_active_vms;
-	spin_unlock(&kvm_lock);
+	mutex_unlock(&kvm_lock);
 
 	env = kzalloc(sizeof(*env), GFP_KERNEL_ACCOUNT);
 	if (!env)
-- 
cgit v1.2.3


From 1e390478cfb527e34c9ab89ba57212cb05c33c51 Mon Sep 17 00:00:00 2001
From: Thierry Reding <treding@nvidia.com>
Date: Wed, 5 Jun 2019 10:46:05 +0200
Subject: gpu: host1x: Increase maximum DMA segment size

Recent versions of the DMA API debug code have started to warn about
violations of the maximum DMA segment size. This is because the segment
size defaults to 64 KiB, which can easily be exceeded in large buffer
allocations such as used in DRM/KMS for framebuffers.

Technically the Tegra SMMU and ARM SMMU don't have a maximum segment
size (they map individual pages irrespective of whether they are
contiguous or not), so the choice of 4 MiB is a bit arbitrary here. The
maximum segment size is a 32-bit unsigned integer, though, so we can't
set it to the correct maximum size, which would be the size of the
aperture.

Signed-off-by: Thierry Reding <treding@nvidia.com>
---
 drivers/gpu/host1x/bus.c | 3 +++
 include/linux/host1x.h   | 2 ++
 2 files changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/gpu/host1x/bus.c b/drivers/gpu/host1x/bus.c
index 103fffc1904b..c9a637d9417e 100644
--- a/drivers/gpu/host1x/bus.c
+++ b/drivers/gpu/host1x/bus.c
@@ -425,6 +425,9 @@ static int host1x_device_add(struct host1x *host1x,
 
 	of_dma_configure(&device->dev, host1x->dev->of_node, true);
 
+	device->dev.dma_parms = &device->dma_parms;
+	dma_set_max_seg_size(&device->dev, SZ_4M);
+
 	err = host1x_device_parse_dt(device, driver);
 	if (err < 0) {
 		kfree(device);
diff --git a/include/linux/host1x.h b/include/linux/host1x.h
index 89110d896d72..aef6e2f73802 100644
--- a/include/linux/host1x.h
+++ b/include/linux/host1x.h
@@ -310,6 +310,8 @@ struct host1x_device {
 	struct list_head clients;
 
 	bool registered;
+
+	struct device_dma_parameters dma_parms;
 };
 
 static inline struct host1x_device *to_host1x_device(struct device *dev)
-- 
cgit v1.2.3


From 2bd229df5e2ecbc13909f71dbd196fced1d533ca Mon Sep 17 00:00:00 2001
From: Heiner Kallweit <hkallweit1@gmail.com>
Date: Tue, 4 Jun 2019 23:02:34 +0200
Subject: net: phy: remove state PHY_FORCING

In the early days of phylib we had a functionality that changed to the
next lower speed in fixed mode if no link was established after a
certain period of time. This functionality was removed years ago,
and state PHY_FORCING isn't needed any longer. Instead we can go from
UP to RUNNING or NOLINK directly (same as in autoneg mode).

Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/phy.c | 26 ++------------------------
 include/linux/phy.h   | 11 -----------
 2 files changed, 2 insertions(+), 35 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c
index 0084220d10dc..d9150765009e 100644
--- a/drivers/net/phy/phy.c
+++ b/drivers/net/phy/phy.c
@@ -43,7 +43,6 @@ static const char *phy_state_to_str(enum phy_state st)
 	PHY_STATE_STR(UP)
 	PHY_STATE_STR(RUNNING)
 	PHY_STATE_STR(NOLINK)
-	PHY_STATE_STR(FORCING)
 	PHY_STATE_STR(HALTED)
 	}
 
@@ -577,15 +576,8 @@ int phy_start_aneg(struct phy_device *phydev)
 	if (err < 0)
 		goto out_unlock;
 
-	if (phy_is_started(phydev)) {
-		if (phydev->autoneg == AUTONEG_ENABLE) {
-			err = phy_check_link_status(phydev);
-		} else {
-			phydev->state = PHY_FORCING;
-			phydev->link_timeout = PHY_FORCE_TIMEOUT;
-		}
-	}
-
+	if (phy_is_started(phydev))
+		err = phy_check_link_status(phydev);
 out_unlock:
 	mutex_unlock(&phydev->lock);
 
@@ -951,20 +943,6 @@ void phy_state_machine(struct work_struct *work)
 	case PHY_RUNNING:
 		err = phy_check_link_status(phydev);
 		break;
-	case PHY_FORCING:
-		err = genphy_update_link(phydev);
-		if (err)
-			break;
-
-		if (phydev->link) {
-			phydev->state = PHY_RUNNING;
-			phy_link_up(phydev);
-		} else {
-			if (0 == phydev->link_timeout--)
-				needs_aneg = true;
-			phy_link_down(phydev, false);
-		}
-		break;
 	case PHY_HALTED:
 		if (phydev->link) {
 			phydev->link = 0;
diff --git a/include/linux/phy.h b/include/linux/phy.h
index dc4b51060ebc..8caaf76685cd 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -297,12 +297,6 @@ struct phy_device *mdiobus_scan(struct mii_bus *bus, int addr);
  * - irq or timer will set RUNNING if link comes back
  * - phy_stop moves to HALTED
  *
- * FORCING: PHY is being configured with forced settings
- * - if link is up, move to RUNNING
- * - If link is down, we drop to the next highest setting, and
- *   retry (FORCING) after a timeout
- * - phy_stop moves to HALTED
- *
  * RUNNING: PHY is currently up, running, and possibly sending
  * and/or receiving packets
  * - irq or timer will set NOLINK if link goes down
@@ -319,7 +313,6 @@ enum phy_state {
 	PHY_UP,
 	PHY_RUNNING,
 	PHY_NOLINK,
-	PHY_FORCING,
 };
 
 /**
@@ -347,8 +340,6 @@ struct phy_c45_device_ids {
  * loopback_enabled: Set true if this phy has been loopbacked successfully.
  * state: state of the PHY for management purposes
  * dev_flags: Device-specific flags used by the PHY driver.
- * link_timeout: The number of timer firings to wait before the
- * giving up on the current attempt at acquiring a link
  * irq: IRQ number of the PHY's interrupt (-1 if none)
  * phy_timer: The timer for handling the state machine
  * attached_dev: The attached enet driver's device instance ptr
@@ -416,8 +407,6 @@ struct phy_device {
 	/* Energy efficient ethernet modes which should be prohibited */
 	u32 eee_broken_modes;
 
-	int link_timeout;
-
 #ifdef CONFIG_LED_TRIGGER_PHY
 	struct phy_led_trigger *phy_led_triggers;
 	unsigned int phy_num_led_triggers;
-- 
cgit v1.2.3


From 9c3cef54c50d93871eaa46c28a06de8bd03fab63 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Thu, 6 Jun 2019 11:34:28 +0200
Subject: VMCI: Fixup atomic64_t abuse

The VMCI driver is abusing atomic64_t and atomic_t; there are no actual
atomic RmW operations around.

Rewrite the code to use a regular u64 with READ_ONCE() and
WRITE_ONCE() and a cast to 'unsigned long'. This fully preserves
whatever brokenness there was (it's not endian-safe for starters, and
also looks to be missing ordering).

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/vmw_vmci_defs.h | 30 ++++++++++--------------------
 1 file changed, 10 insertions(+), 20 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/vmw_vmci_defs.h b/include/linux/vmw_vmci_defs.h
index 0c06178e4985..8ee472118f54 100644
--- a/include/linux/vmw_vmci_defs.h
+++ b/include/linux/vmw_vmci_defs.h
@@ -438,8 +438,8 @@ enum {
 struct vmci_queue_header {
 	/* All fields are 64bit and aligned. */
 	struct vmci_handle handle;	/* Identifier. */
-	atomic64_t producer_tail;	/* Offset in this queue. */
-	atomic64_t consumer_head;	/* Offset in peer queue. */
+	u64 producer_tail;	/* Offset in this queue. */
+	u64 consumer_head;	/* Offset in peer queue. */
 };
 
 /*
@@ -740,13 +740,9 @@ static inline void *vmci_event_data_payload(struct vmci_event_data *ev_data)
  * prefix will be used, so correctness isn't an issue, but using a
  * 64bit operation still adds unnecessary overhead.
  */
-static inline u64 vmci_q_read_pointer(atomic64_t *var)
+static inline u64 vmci_q_read_pointer(u64 *var)
 {
-#if defined(CONFIG_X86_32)
-	return atomic_read((atomic_t *)var);
-#else
-	return atomic64_read(var);
-#endif
+	return READ_ONCE(*(unsigned long *)var);
 }
 
 /*
@@ -755,23 +751,17 @@ static inline u64 vmci_q_read_pointer(atomic64_t *var)
  * never exceeds a 32bit value in this case. On 32bit SMP, using a
  * locked cmpxchg8b adds unnecessary overhead.
  */
-static inline void vmci_q_set_pointer(atomic64_t *var,
-				      u64 new_val)
+static inline void vmci_q_set_pointer(u64 *var, u64 new_val)
 {
-#if defined(CONFIG_X86_32)
-	return atomic_set((atomic_t *)var, (u32)new_val);
-#else
-	return atomic64_set(var, new_val);
-#endif
+	/* XXX buggered on big-endian */
+	WRITE_ONCE(*(unsigned long *)var, (unsigned long)new_val);
 }
 
 /*
  * Helper to add a given offset to a head or tail pointer. Wraps the
  * value of the pointer around the max size of the queue.
  */
-static inline void vmci_qp_add_pointer(atomic64_t *var,
-				       size_t add,
-				       u64 size)
+static inline void vmci_qp_add_pointer(u64 *var, size_t add, u64 size)
 {
 	u64 new_val = vmci_q_read_pointer(var);
 
@@ -848,8 +838,8 @@ static inline void vmci_q_header_init(struct vmci_queue_header *q_header,
 				      const struct vmci_handle handle)
 {
 	q_header->handle = handle;
-	atomic64_set(&q_header->producer_tail, 0);
-	atomic64_set(&q_header->consumer_head, 0);
+	q_header->producer_tail = 0;
+	q_header->consumer_head = 0;
 }
 
 /*
-- 
cgit v1.2.3


From 9c523efb749de01f0ec172d1160bb6ef8d1620a4 Mon Sep 17 00:00:00 2001
From: Dan Murphy <dmurphy@ti.com>
Date: Wed, 5 Jun 2019 07:56:31 -0500
Subject: mfd: ti-lmu: Add LM36274 support to the ti-lmu

Add the LM36274 register support to the ti-lmu MFD driver.

Signed-off-by: Dan Murphy <dmurphy@ti.com>
Acked-by: Lee Jones <lee.jones@linaro.org>
Acked-by: Pavel Machek <pavel@ucw.cz>
Signed-off-by: Jacek Anaszewski <jacek.anaszewski@gmail.com>
---
 drivers/mfd/Kconfig                 |  5 ++---
 drivers/mfd/ti-lmu.c                | 14 ++++++++++++++
 include/linux/mfd/ti-lmu-register.h | 23 +++++++++++++++++++++++
 include/linux/mfd/ti-lmu.h          |  4 ++++
 4 files changed, 43 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mfd/Kconfig b/drivers/mfd/Kconfig
index 8933485b28e7..a69aca3c2dab 100644
--- a/drivers/mfd/Kconfig
+++ b/drivers/mfd/Kconfig
@@ -1335,9 +1335,8 @@ config MFD_TI_LMU
 	select REGMAP_I2C
 	help
 	  Say yes here to enable support for TI LMU chips.
-
-	  TI LMU MFD supports LM3532, LM3631, LM3632, LM3633, and LM3695.
-	  It consists of backlight, LED and regulator driver.
+	  TI LMU MFD supports LM3532, LM3631, LM3632, LM3633, LM3695 and
+	  LM36274.  It consists of backlight, LED and regulator driver.
 	  It provides consistent device controls for lighting functions.
 
 config MFD_OMAP_USB_HOST
diff --git a/drivers/mfd/ti-lmu.c b/drivers/mfd/ti-lmu.c
index 89b1c5b584af..691ab9dd6236 100644
--- a/drivers/mfd/ti-lmu.c
+++ b/drivers/mfd/ti-lmu.c
@@ -111,6 +111,17 @@ static const struct mfd_cell lm3695_devices[] = {
 	},
 };
 
+static const struct mfd_cell lm36274_devices[] = {
+	LM363X_REGULATOR(LM36274_BOOST),
+	LM363X_REGULATOR(LM36274_LDO_POS),
+	LM363X_REGULATOR(LM36274_LDO_NEG),
+	{
+		.name          = "lm36274-leds",
+		.id            = LM36274,
+		.of_compatible = "ti,lm36274-backlight",
+	},
+};
+
 #define TI_LMU_DATA(chip, max_reg)		\
 static const struct ti_lmu_data chip##_data =	\
 {						\
@@ -123,6 +134,7 @@ TI_LMU_DATA(lm3631, LM3631_MAX_REG);
 TI_LMU_DATA(lm3632, LM3632_MAX_REG);
 TI_LMU_DATA(lm3633, LM3633_MAX_REG);
 TI_LMU_DATA(lm3695, LM3695_MAX_REG);
+TI_LMU_DATA(lm36274, LM36274_MAX_REG);
 
 static int ti_lmu_probe(struct i2c_client *cl, const struct i2c_device_id *id)
 {
@@ -191,6 +203,7 @@ static const struct of_device_id ti_lmu_of_match[] = {
 	{ .compatible = "ti,lm3632", .data = &lm3632_data },
 	{ .compatible = "ti,lm3633", .data = &lm3633_data },
 	{ .compatible = "ti,lm3695", .data = &lm3695_data },
+	{ .compatible = "ti,lm36274", .data = &lm36274_data },
 	{ }
 };
 MODULE_DEVICE_TABLE(of, ti_lmu_of_match);
@@ -200,6 +213,7 @@ static const struct i2c_device_id ti_lmu_ids[] = {
 	{ "lm3632", LM3632 },
 	{ "lm3633", LM3633 },
 	{ "lm3695", LM3695 },
+	{ "lm36274", LM36274 },
 	{ }
 };
 MODULE_DEVICE_TABLE(i2c, ti_lmu_ids);
diff --git a/include/linux/mfd/ti-lmu-register.h b/include/linux/mfd/ti-lmu-register.h
index 76998b01764b..076d8dea38fd 100644
--- a/include/linux/mfd/ti-lmu-register.h
+++ b/include/linux/mfd/ti-lmu-register.h
@@ -189,4 +189,27 @@
 #define LM3695_REG_BRT_MSB			0x14
 
 #define LM3695_MAX_REG				0x14
+
+/* LM36274 */
+#define LM36274_REG_REV				0x01
+#define LM36274_REG_BL_CFG_1			0x02
+#define LM36274_REG_BL_CFG_2			0x03
+#define LM36274_REG_BRT_LSB			0x04
+#define LM36274_REG_BRT_MSB			0x05
+#define LM36274_REG_BL_EN			0x08
+
+#define LM36274_REG_BIAS_CONFIG_1		0x09
+#define LM36274_EXT_EN_MASK			BIT(0)
+#define LM36274_EN_VNEG_MASK			BIT(1)
+#define LM36274_EN_VPOS_MASK			BIT(2)
+
+#define LM36274_REG_BIAS_CONFIG_2		0x0a
+#define LM36274_REG_BIAS_CONFIG_3		0x0b
+#define LM36274_REG_VOUT_BOOST			0x0c
+#define LM36274_REG_VOUT_POS			0x0d
+#define LM36274_REG_VOUT_NEG			0x0e
+#define LM36274_VOUT_MASK			0x3F
+
+#define LM36274_MAX_REG				0x13
+
 #endif
diff --git a/include/linux/mfd/ti-lmu.h b/include/linux/mfd/ti-lmu.h
index 54e9d272e81c..0957598c7d41 100644
--- a/include/linux/mfd/ti-lmu.h
+++ b/include/linux/mfd/ti-lmu.h
@@ -26,6 +26,7 @@ enum ti_lmu_id {
 	LM3632,
 	LM3633,
 	LM3695,
+	LM36274,
 	LMU_MAX_ID,
 };
 
@@ -67,6 +68,9 @@ enum lm363x_regulator_id {
 	LM3632_BOOST,		/* Boost output */
 	LM3632_LDO_POS,		/* Positive display bias output */
 	LM3632_LDO_NEG,		/* Negative display bias output */
+	LM36274_BOOST,		/* Boost output */
+	LM36274_LDO_POS,	/* Positive display bias output */
+	LM36274_LDO_NEG,	/* Negative display bias output */
 };
 
 /**
-- 
cgit v1.2.3


From 2076e5c0451ca943ff8ecc6def7239c84c77e070 Mon Sep 17 00:00:00 2001
From: Ralph Campbell <rcampbell@nvidia.com>
Date: Mon, 6 May 2019 16:29:38 -0700
Subject: mm/hmm: update HMM documentation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Update the HMM documentation to reflect the latest API and make a few
minor wording changes.

Cc: John Hubbard <jhubbard@nvidia.com>
Cc: Ira Weiny <ira.weiny@intel.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Balbir Singh <bsingharora@gmail.com>
Cc: Dan Carpenter <dan.carpenter@oracle.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Souptick Joarder <jrdr.linux@gmail.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Ralph Campbell <rcampbell@nvidia.com>
Reviewed-by: Jérôme Glisse <jglisse@redhat.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 Documentation/vm/hmm.rst | 141 +++++++++++++++++++++++++----------------------
 include/linux/hmm.h      |   7 ++-
 2 files changed, 78 insertions(+), 70 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/vm/hmm.rst b/Documentation/vm/hmm.rst
index 7cdf7282e022..7b6eeda5a7c0 100644
--- a/Documentation/vm/hmm.rst
+++ b/Documentation/vm/hmm.rst
@@ -10,7 +10,7 @@ of this being specialized struct page for such memory (see sections 5 to 7 of
 this document).
 
 HMM also provides optional helpers for SVM (Share Virtual Memory), i.e.,
-allowing a device to transparently access program address coherently with
+allowing a device to transparently access program addresses coherently with
 the CPU meaning that any valid pointer on the CPU is also a valid pointer
 for the device. This is becoming mandatory to simplify the use of advanced
 heterogeneous computing where GPU, DSP, or FPGA are used to perform various
@@ -22,8 +22,8 @@ expose the hardware limitations that are inherent to many platforms. The third
 section gives an overview of the HMM design. The fourth section explains how
 CPU page-table mirroring works and the purpose of HMM in this context. The
 fifth section deals with how device memory is represented inside the kernel.
-Finally, the last section presents a new migration helper that allows lever-
-aging the device DMA engine.
+Finally, the last section presents a new migration helper that allows
+leveraging the device DMA engine.
 
 .. contents:: :local:
 
@@ -39,20 +39,20 @@ address space. I use shared address space to refer to the opposite situation:
 i.e., one in which any application memory region can be used by a device
 transparently.
 
-Split address space happens because device can only access memory allocated
-through device specific API. This implies that all memory objects in a program
+Split address space happens because devices can only access memory allocated
+through a device specific API. This implies that all memory objects in a program
 are not equal from the device point of view which complicates large programs
 that rely on a wide set of libraries.
 
-Concretely this means that code that wants to leverage devices like GPUs needs
-to copy object between generically allocated memory (malloc, mmap private, mmap
+Concretely, this means that code that wants to leverage devices like GPUs needs
+to copy objects between generically allocated memory (malloc, mmap private, mmap
 share) and memory allocated through the device driver API (this still ends up
 with an mmap but of the device file).
 
 For flat data sets (array, grid, image, ...) this isn't too hard to achieve but
-complex data sets (list, tree, ...) are hard to get right. Duplicating a
+for complex data sets (list, tree, ...) it's hard to get right. Duplicating a
 complex data set needs to re-map all the pointer relations between each of its
-elements. This is error prone and program gets harder to debug because of the
+elements. This is error prone and programs get harder to debug because of the
 duplicate data set and addresses.
 
 Split address space also means that libraries cannot transparently use data
@@ -77,12 +77,12 @@ I/O bus, device memory characteristics
 
 I/O buses cripple shared address spaces due to a few limitations. Most I/O
 buses only allow basic memory access from device to main memory; even cache
-coherency is often optional. Access to device memory from CPU is even more
+coherency is often optional. Access to device memory from a CPU is even more
 limited. More often than not, it is not cache coherent.
 
 If we only consider the PCIE bus, then a device can access main memory (often
 through an IOMMU) and be cache coherent with the CPUs. However, it only allows
-a limited set of atomic operations from device on main memory. This is worse
+a limited set of atomic operations from the device on main memory. This is worse
 in the other direction: the CPU can only access a limited range of the device
 memory and cannot perform atomic operations on it. Thus device memory cannot
 be considered the same as regular memory from the kernel point of view.
@@ -93,20 +93,20 @@ The final limitation is latency. Access to main memory from the device has an
 order of magnitude higher latency than when the device accesses its own memory.
 
 Some platforms are developing new I/O buses or additions/modifications to PCIE
-to address some of these limitations (OpenCAPI, CCIX). They mainly allow two-
-way cache coherency between CPU and device and allow all atomic operations the
+to address some of these limitations (OpenCAPI, CCIX). They mainly allow
+two-way cache coherency between CPU and device and allow all atomic operations the
 architecture supports. Sadly, not all platforms are following this trend and
 some major architectures are left without hardware solutions to these problems.
 
 So for shared address space to make sense, not only must we allow devices to
 access any memory but we must also permit any memory to be migrated to device
-memory while device is using it (blocking CPU access while it happens).
+memory while the device is using it (blocking CPU access while it happens).
 
 
 Shared address space and migration
 ==================================
 
-HMM intends to provide two main features. First one is to share the address
+HMM intends to provide two main features. The first one is to share the address
 space by duplicating the CPU page table in the device page table so the same
 address points to the same physical memory for any valid main memory address in
 the process address space.
@@ -121,14 +121,14 @@ why HMM provides helpers to factor out everything that can be while leaving the
 hardware specific details to the device driver.
 
 The second mechanism HMM provides is a new kind of ZONE_DEVICE memory that
-allows allocating a struct page for each page of the device memory. Those pages
+allows allocating a struct page for each page of device memory. Those pages
 are special because the CPU cannot map them. However, they allow migrating
 main memory to device memory using existing migration mechanisms and everything
-looks like a page is swapped out to disk from the CPU point of view. Using a
-struct page gives the easiest and cleanest integration with existing mm mech-
-anisms. Here again, HMM only provides helpers, first to hotplug new ZONE_DEVICE
+looks like a page that is swapped out to disk from the CPU point of view. Using a
+struct page gives the easiest and cleanest integration with existing mm
+mechanisms. Here again, HMM only provides helpers, first to hotplug new ZONE_DEVICE
 memory for the device memory and second to perform migration. Policy decisions
-of what and when to migrate things is left to the device driver.
+of what and when to migrate is left to the device driver.
 
 Note that any CPU access to a device page triggers a page fault and a migration
 back to main memory. For example, when a page backing a given CPU address A is
@@ -136,8 +136,8 @@ migrated from a main memory page to a device page, then any CPU access to
 address A triggers a page fault and initiates a migration back to main memory.
 
 With these two features, HMM not only allows a device to mirror process address
-space and keeping both CPU and device page table synchronized, but also lever-
-ages device memory by migrating the part of the data set that is actively being
+space and keeps both CPU and device page tables synchronized, but also
+leverages device memory by migrating the part of the data set that is actively being
 used by the device.
 
 
@@ -151,21 +151,28 @@ registration of an hmm_mirror struct::
 
  int hmm_mirror_register(struct hmm_mirror *mirror,
                          struct mm_struct *mm);
- int hmm_mirror_register_locked(struct hmm_mirror *mirror,
-                                struct mm_struct *mm);
 
-
-The locked variant is to be used when the driver is already holding mmap_sem
-of the mm in write mode. The mirror struct has a set of callbacks that are used
+The mirror struct has a set of callbacks that are used
 to propagate CPU page tables::
 
  struct hmm_mirror_ops {
+     /* release() - release hmm_mirror
+      *
+      * @mirror: pointer to struct hmm_mirror
+      *
+      * This is called when the mm_struct is being released.  The callback
+      * must ensure that all access to any pages obtained from this mirror
+      * is halted before the callback returns. All future access should
+      * fault.
+      */
+     void (*release)(struct hmm_mirror *mirror);
+
      /* sync_cpu_device_pagetables() - synchronize page tables
       *
       * @mirror: pointer to struct hmm_mirror
-      * @update_type: type of update that occurred to the CPU page table
-      * @start: virtual start address of the range to update
-      * @end: virtual end address of the range to update
+      * @update: update information (see struct mmu_notifier_range)
+      * Return: -EAGAIN if update.blockable false and callback need to
+      *         block, 0 otherwise.
       *
       * This callback ultimately originates from mmu_notifiers when the CPU
       * page table is updated. The device driver must update its page table
@@ -176,14 +183,12 @@ to propagate CPU page tables::
       * page tables are completely updated (TLBs flushed, etc); this is a
       * synchronous call.
       */
-      void (*update)(struct hmm_mirror *mirror,
-                     enum hmm_update action,
-                     unsigned long start,
-                     unsigned long end);
+     int (*sync_cpu_device_pagetables)(struct hmm_mirror *mirror,
+                                       const struct hmm_update *update);
  };
 
 The device driver must perform the update action to the range (mark range
-read only, or fully unmap, ...). The device must be done with the update before
+read only, or fully unmap, etc.). The device must complete the update before
 the driver callback returns.
 
 When the device driver wants to populate a range of virtual addresses, it can
@@ -194,17 +199,18 @@ use either::
 
 The first one (hmm_range_snapshot()) will only fetch present CPU page table
 entries and will not trigger a page fault on missing or non-present entries.
-The second one does trigger a page fault on missing or read-only entry if the
-write parameter is true. Page faults use the generic mm page fault code path
-just like a CPU page fault.
+The second one does trigger a page fault on missing or read-only entries if
+write access is requested (see below). Page faults use the generic mm page
+fault code path just like a CPU page fault.
 
 Both functions copy CPU page table entries into their pfns array argument. Each
 entry in that array corresponds to an address in the virtual range. HMM
 provides a set of flags to help the driver identify special CPU page table
 entries.
 
-Locking with the update() callback is the most important aspect the driver must
-respect in order to keep things properly synchronized. The usage pattern is::
+Locking within the sync_cpu_device_pagetables() callback is the most important
+aspect the driver must respect in order to keep things properly synchronized.
+The usage pattern is::
 
  int driver_populate_range(...)
  {
@@ -239,11 +245,11 @@ respect in order to keep things properly synchronized. The usage pattern is::
             hmm_range_wait_until_valid(&range, TIMEOUT_IN_MSEC);
             goto again;
           }
-          hmm_mirror_unregister(&range);
+          hmm_range_unregister(&range);
           return ret;
       }
       take_lock(driver->update);
-      if (!range.valid) {
+      if (!hmm_range_valid(&range)) {
           release_lock(driver->update);
           up_read(&mm->mmap_sem);
           goto again;
@@ -251,15 +257,15 @@ respect in order to keep things properly synchronized. The usage pattern is::
 
       // Use pfns array content to update device page table
 
-      hmm_mirror_unregister(&range);
+      hmm_range_unregister(&range);
       release_lock(driver->update);
       up_read(&mm->mmap_sem);
       return 0;
  }
 
 The driver->update lock is the same lock that the driver takes inside its
-update() callback. That lock must be held before checking the range.valid
-field to avoid any race with a concurrent CPU page table update.
+sync_cpu_device_pagetables() callback. That lock must be held before calling
+hmm_range_valid() to avoid any race with a concurrent CPU page table update.
 
 HMM implements all this on top of the mmu_notifier API because we wanted a
 simpler API and also to be able to perform optimizations latter on like doing
@@ -279,46 +285,47 @@ concurrently).
 Leverage default_flags and pfn_flags_mask
 =========================================
 
-The hmm_range struct has 2 fields default_flags and pfn_flags_mask that allows
-to set fault or snapshot policy for a whole range instead of having to set them
-for each entries in the range.
+The hmm_range struct has 2 fields, default_flags and pfn_flags_mask, that specify
+fault or snapshot policy for the whole range instead of having to set them
+for each entry in the pfns array.
+
+For instance, if the device flags for range.flags are::
 
-For instance if the device flags for device entries are:
-    VALID (1 << 63)
-    WRITE (1 << 62)
+    range.flags[HMM_PFN_VALID] = (1 << 63);
+    range.flags[HMM_PFN_WRITE] = (1 << 62);
 
-Now let say that device driver wants to fault with at least read a range then
-it does set::
+and the device driver wants pages for a range with at least read permission,
+it sets::
 
     range->default_flags = (1 << 63);
     range->pfn_flags_mask = 0;
 
-and calls hmm_range_fault() as described above. This will fill fault all page
+and calls hmm_range_fault() as described above. This will fill fault all pages
 in the range with at least read permission.
 
-Now let say driver wants to do the same except for one page in the range for
-which its want to have write. Now driver set::
+Now let's say the driver wants to do the same except for one page in the range for
+which it wants to have write permission. Now driver set::
 
     range->default_flags = (1 << 63);
     range->pfn_flags_mask = (1 << 62);
     range->pfns[index_of_write] = (1 << 62);
 
-With this HMM will fault in all page with at least read (ie valid) and for the
+With this, HMM will fault in all pages with at least read (i.e., valid) and for the
 address == range->start + (index_of_write << PAGE_SHIFT) it will fault with
-write permission ie if the CPU pte does not have write permission set then HMM
+write permission i.e., if the CPU pte does not have write permission set then HMM
 will call handle_mm_fault().
 
-Note that HMM will populate the pfns array with write permission for any entry
-that have write permission within the CPU pte no matter what are the values set
+Note that HMM will populate the pfns array with write permission for any page
+that is mapped with CPU write permission no matter what values are set
 in default_flags or pfn_flags_mask.
 
 
 Represent and manage device memory from core kernel point of view
 =================================================================
 
-Several different designs were tried to support device memory. First one used
-a device specific data structure to keep information about migrated memory and
-HMM hooked itself in various places of mm code to handle any access to
+Several different designs were tried to support device memory. The first one
+used a device specific data structure to keep information about migrated memory
+and HMM hooked itself in various places of mm code to handle any access to
 addresses that were backed by device memory. It turns out that this ended up
 replicating most of the fields of struct page and also needed many kernel code
 paths to be updated to understand this new kind of memory.
@@ -341,7 +348,7 @@ The hmm_devmem_ops is where most of the important things are::
 
  struct hmm_devmem_ops {
      void (*free)(struct hmm_devmem *devmem, struct page *page);
-     int (*fault)(struct hmm_devmem *devmem,
+     vm_fault_t (*fault)(struct hmm_devmem *devmem,
                   struct vm_area_struct *vma,
                   unsigned long addr,
                   struct page *page,
@@ -417,9 +424,9 @@ willing to pay to keep all the code simpler.
 Memory cgroup (memcg) and rss accounting
 ========================================
 
-For now device memory is accounted as any regular page in rss counters (either
+For now, device memory is accounted as any regular page in rss counters (either
 anonymous if device page is used for anonymous, file if device page is used for
-file backed page or shmem if device page is used for shared memory). This is a
+file backed page, or shmem if device page is used for shared memory). This is a
 deliberate choice to keep existing applications, that might start using device
 memory without knowing about it, running unimpacted.
 
@@ -439,6 +446,6 @@ get more experience in how device memory is used and its impact on memory
 resource control.
 
 
-Note that device memory can never be pinned by device driver nor through GUP
+Note that device memory can never be pinned by a device driver nor through GUP
 and thus such memory is always free upon process exit. Or when last reference
 is dropped in case of shared memory or file backed memory.
diff --git a/include/linux/hmm.h b/include/linux/hmm.h
index 044a36d7c3f8..740bb00853f5 100644
--- a/include/linux/hmm.h
+++ b/include/linux/hmm.h
@@ -418,9 +418,10 @@ struct hmm_mirror_ops {
 	 *
 	 * @mirror: pointer to struct hmm_mirror
 	 *
-	 * This is called when the mm_struct is being released.
-	 * The callback should make sure no references to the mirror occur
-	 * after the callback returns.
+	 * This is called when the mm_struct is being released.  The callback
+	 * must ensure that all access to any pages obtained from this mirror
+	 * is halted before the callback returns. All future access should
+	 * fault.
 	 */
 	void (*release)(struct hmm_mirror *mirror);
 
-- 
cgit v1.2.3


From 085ea25064a9169eba5f2ed6484c111ab0f3ee79 Mon Sep 17 00:00:00 2001
From: Ralph Campbell <rcampbell@nvidia.com>
Date: Mon, 6 May 2019 16:29:39 -0700
Subject: mm/hmm: clean up some coding style and comments
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

There are no functional changes, just some coding style clean ups and
minor comment changes.

Cc: John Hubbard <jhubbard@nvidia.com>
Cc: Ira Weiny <ira.weiny@intel.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Balbir Singh <bsingharora@gmail.com>
Cc: Dan Carpenter <dan.carpenter@oracle.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Souptick Joarder <jrdr.linux@gmail.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Ralph Campbell <rcampbell@nvidia.com>
Reviewed-by: Jérôme Glisse <jglisse@redhat.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 include/linux/hmm.h | 71 +++++++++++++++++++++++++++--------------------------
 mm/hmm.c            | 62 ++++++++++++++++++++++++----------------------
 2 files changed, 68 insertions(+), 65 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/hmm.h b/include/linux/hmm.h
index 740bb00853f5..7007123842ba 100644
--- a/include/linux/hmm.h
+++ b/include/linux/hmm.h
@@ -21,8 +21,8 @@
  *
  * HMM address space mirroring API:
  *
- * Use HMM address space mirroring if you want to mirror range of the CPU page
- * table of a process into a device page table. Here, "mirror" means "keep
+ * Use HMM address space mirroring if you want to mirror a range of the CPU
+ * page tables of a process into a device page table. Here, "mirror" means "keep
  * synchronized". Prerequisites: the device must provide the ability to write-
  * protect its page tables (at PAGE_SIZE granularity), and must be able to
  * recover from the resulting potential page faults.
@@ -105,10 +105,11 @@ struct hmm {
  * HMM_PFN_WRITE: CPU page table has write permission set
  * HMM_PFN_DEVICE_PRIVATE: private device memory (ZONE_DEVICE)
  *
- * The driver provide a flags array, if driver valid bit for an entry is bit
- * 3 ie (entry & (1 << 3)) is true if entry is valid then driver must provide
+ * The driver provides a flags array for mapping page protections to device
+ * PTE bits. If the driver valid bit for an entry is bit 3,
+ * i.e., (entry & (1 << 3)), then the driver must provide
  * an array in hmm_range.flags with hmm_range.flags[HMM_PFN_VALID] == 1 << 3.
- * Same logic apply to all flags. This is same idea as vm_page_prot in vma
+ * Same logic apply to all flags. This is the same idea as vm_page_prot in vma
  * except that this is per device driver rather than per architecture.
  */
 enum hmm_pfn_flag_e {
@@ -129,13 +130,13 @@ enum hmm_pfn_flag_e {
  *      be mirrored by a device, because the entry will never have HMM_PFN_VALID
  *      set and the pfn value is undefined.
  *
- * Driver provide entry value for none entry, error entry and special entry,
- * driver can alias (ie use same value for error and special for instance). It
- * should not alias none and error or special.
+ * Driver provides values for none entry, error entry, and special entry.
+ * Driver can alias (i.e., use same value) error and special, but
+ * it should not alias none with error or special.
  *
  * HMM pfn value returned by hmm_vma_get_pfns() or hmm_vma_fault() will be:
  * hmm_range.values[HMM_PFN_ERROR] if CPU page table entry is poisonous,
- * hmm_range.values[HMM_PFN_NONE] if there is no CPU page table
+ * hmm_range.values[HMM_PFN_NONE] if there is no CPU page table entry,
  * hmm_range.values[HMM_PFN_SPECIAL] if CPU page table entry is a special one
  */
 enum hmm_pfn_value_e {
@@ -158,6 +159,7 @@ enum hmm_pfn_value_e {
  * @values: pfn value for some special case (none, special, error, ...)
  * @default_flags: default flags for the range (write, read, ... see hmm doc)
  * @pfn_flags_mask: allows to mask pfn flags so that only default_flags matter
+ * @page_shift: device virtual address shift value (should be >= PAGE_SHIFT)
  * @pfn_shifts: pfn shift value (should be <= PAGE_SHIFT)
  * @valid: pfns array did not change since it has been fill by an HMM function
  */
@@ -180,7 +182,7 @@ struct hmm_range {
 /*
  * hmm_range_page_shift() - return the page shift for the range
  * @range: range being queried
- * Returns: page shift (page size = 1 << page shift) for the range
+ * Return: page shift (page size = 1 << page shift) for the range
  */
 static inline unsigned hmm_range_page_shift(const struct hmm_range *range)
 {
@@ -190,7 +192,7 @@ static inline unsigned hmm_range_page_shift(const struct hmm_range *range)
 /*
  * hmm_range_page_size() - return the page size for the range
  * @range: range being queried
- * Returns: page size for the range in bytes
+ * Return: page size for the range in bytes
  */
 static inline unsigned long hmm_range_page_size(const struct hmm_range *range)
 {
@@ -201,7 +203,7 @@ static inline unsigned long hmm_range_page_size(const struct hmm_range *range)
  * hmm_range_wait_until_valid() - wait for range to be valid
  * @range: range affected by invalidation to wait on
  * @timeout: time out for wait in ms (ie abort wait after that period of time)
- * Returns: true if the range is valid, false otherwise.
+ * Return: true if the range is valid, false otherwise.
  */
 static inline bool hmm_range_wait_until_valid(struct hmm_range *range,
 					      unsigned long timeout)
@@ -222,7 +224,7 @@ static inline bool hmm_range_wait_until_valid(struct hmm_range *range,
 /*
  * hmm_range_valid() - test if a range is valid or not
  * @range: range
- * Returns: true if the range is valid, false otherwise.
+ * Return: true if the range is valid, false otherwise.
  */
 static inline bool hmm_range_valid(struct hmm_range *range)
 {
@@ -233,7 +235,7 @@ static inline bool hmm_range_valid(struct hmm_range *range)
  * hmm_device_entry_to_page() - return struct page pointed to by a device entry
  * @range: range use to decode device entry value
  * @entry: device entry value to get corresponding struct page from
- * Returns: struct page pointer if entry is a valid, NULL otherwise
+ * Return: struct page pointer if entry is a valid, NULL otherwise
  *
  * If the device entry is valid (ie valid flag set) then return the struct page
  * matching the entry value. Otherwise return NULL.
@@ -256,7 +258,7 @@ static inline struct page *hmm_device_entry_to_page(const struct hmm_range *rang
  * hmm_device_entry_to_pfn() - return pfn value store in a device entry
  * @range: range use to decode device entry value
  * @entry: device entry to extract pfn from
- * Returns: pfn value if device entry is valid, -1UL otherwise
+ * Return: pfn value if device entry is valid, -1UL otherwise
  */
 static inline unsigned long
 hmm_device_entry_to_pfn(const struct hmm_range *range, uint64_t pfn)
@@ -276,7 +278,7 @@ hmm_device_entry_to_pfn(const struct hmm_range *range, uint64_t pfn)
  * hmm_device_entry_from_page() - create a valid device entry for a page
  * @range: range use to encode HMM pfn value
  * @page: page for which to create the device entry
- * Returns: valid device entry for the page
+ * Return: valid device entry for the page
  */
 static inline uint64_t hmm_device_entry_from_page(const struct hmm_range *range,
 						  struct page *page)
@@ -289,7 +291,7 @@ static inline uint64_t hmm_device_entry_from_page(const struct hmm_range *range,
  * hmm_device_entry_from_pfn() - create a valid device entry value from pfn
  * @range: range use to encode HMM pfn value
  * @pfn: pfn value for which to create the device entry
- * Returns: valid device entry for the pfn
+ * Return: valid device entry for the pfn
  */
 static inline uint64_t hmm_device_entry_from_pfn(const struct hmm_range *range,
 						 unsigned long pfn)
@@ -394,7 +396,7 @@ enum hmm_update_event {
 };
 
 /*
- * struct hmm_update - HMM update informations for callback
+ * struct hmm_update - HMM update information for callback
  *
  * @start: virtual start address of the range to update
  * @end: virtual end address of the range to update
@@ -428,8 +430,8 @@ struct hmm_mirror_ops {
 	/* sync_cpu_device_pagetables() - synchronize page tables
 	 *
 	 * @mirror: pointer to struct hmm_mirror
-	 * @update: update informations (see struct hmm_update)
-	 * Returns: -EAGAIN if update.blockable false and callback need to
+	 * @update: update information (see struct hmm_update)
+	 * Return: -EAGAIN if update.blockable false and callback need to
 	 *          block, 0 otherwise.
 	 *
 	 * This callback ultimately originates from mmu_notifiers when the CPU
@@ -468,13 +470,13 @@ void hmm_mirror_unregister(struct hmm_mirror *mirror);
 /*
  * hmm_mirror_mm_is_alive() - test if mm is still alive
  * @mirror: the HMM mm mirror for which we want to lock the mmap_sem
- * Returns: false if the mm is dead, true otherwise
+ * Return: false if the mm is dead, true otherwise
  *
- * This is an optimization it will not accurately always return -EINVAL if the
- * mm is dead ie there can be false negative (process is being kill but HMM is
- * not yet inform of that). It is only intented to be use to optimize out case
- * where driver is about to do something time consuming and it would be better
- * to skip it if the mm is dead.
+ * This is an optimization, it will not always accurately return false if the
+ * mm is dead; i.e., there can be false negatives (process is being killed but
+ * HMM is not yet informed of that). It is only intended to be used to optimize
+ * out cases where the driver is about to do something time consuming and it
+ * would be better to skip it if the mm is dead.
  */
 static inline bool hmm_mirror_mm_is_alive(struct hmm_mirror *mirror)
 {
@@ -489,7 +491,6 @@ static inline bool hmm_mirror_mm_is_alive(struct hmm_mirror *mirror)
 	return true;
 }
 
-
 /*
  * Please see Documentation/vm/hmm.rst for how to use the range API.
  */
@@ -562,7 +563,7 @@ static inline int hmm_vma_fault(struct hmm_range *range, bool block)
 	ret = hmm_range_fault(range, block);
 	if (ret <= 0) {
 		if (ret == -EBUSY || !ret) {
-			/* Same as above  drop mmap_sem to match old API. */
+			/* Same as above, drop mmap_sem to match old API. */
 			up_read(&range->vma->vm_mm->mmap_sem);
 			ret = -EBUSY;
 		} else if (ret == -EAGAIN)
@@ -629,7 +630,7 @@ struct hmm_devmem_ops {
 	 * @page: pointer to struct page backing virtual address (unreliable)
 	 * @flags: FAULT_FLAG_* (see include/linux/mm.h)
 	 * @pmdp: page middle directory
-	 * Returns: VM_FAULT_MINOR/MAJOR on success or one of VM_FAULT_ERROR
+	 * Return: VM_FAULT_MINOR/MAJOR on success or one of VM_FAULT_ERROR
 	 *   on error
 	 *
 	 * The callback occurs whenever there is a CPU page fault or GUP on a
@@ -637,14 +638,14 @@ struct hmm_devmem_ops {
 	 * page back to regular memory (CPU accessible).
 	 *
 	 * The device driver is free to migrate more than one page from the
-	 * fault() callback as an optimization. However if device decide to
-	 * migrate more than one page it must always priotirize the faulting
+	 * fault() callback as an optimization. However if the device decides
+	 * to migrate more than one page it must always priotirize the faulting
 	 * address over the others.
 	 *
-	 * The struct page pointer is only given as an hint to allow quick
+	 * The struct page pointer is only given as a hint to allow quick
 	 * lookup of internal device driver data. A concurrent migration
-	 * might have already free that page and the virtual address might
-	 * not longer be back by it. So it should not be modified by the
+	 * might have already freed that page and the virtual address might
+	 * no longer be backed by it. So it should not be modified by the
 	 * callback.
 	 *
 	 * Note that mmap semaphore is held in read mode at least when this
@@ -671,7 +672,7 @@ struct hmm_devmem_ops {
  * @ref: per CPU refcount
  * @page_fault: callback when CPU fault on an unaddressable device page
  *
- * This an helper structure for device drivers that do not wish to implement
+ * This is a helper structure for device drivers that do not wish to implement
  * the gory details related to hotplugging new memoy and allocating struct
  * pages.
  *
diff --git a/mm/hmm.c b/mm/hmm.c
index c62ae414a3a2..4db5dcf110ba 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -153,9 +153,8 @@ static void hmm_release(struct mmu_notifier *mn, struct mm_struct *mm)
 
 	/* Wake-up everyone waiting on any range. */
 	mutex_lock(&hmm->lock);
-	list_for_each_entry(range, &hmm->ranges, list) {
+	list_for_each_entry(range, &hmm->ranges, list)
 		range->valid = false;
-	}
 	wake_up_all(&hmm->wq);
 	mutex_unlock(&hmm->lock);
 
@@ -166,9 +165,10 @@ static void hmm_release(struct mmu_notifier *mn, struct mm_struct *mm)
 		list_del_init(&mirror->list);
 		if (mirror->ops->release) {
 			/*
-			 * Drop mirrors_sem so callback can wait on any pending
-			 * work that might itself trigger mmu_notifier callback
-			 * and thus would deadlock with us.
+			 * Drop mirrors_sem so the release callback can wait
+			 * on any pending work that might itself trigger a
+			 * mmu_notifier callback and thus would deadlock with
+			 * us.
 			 */
 			up_write(&hmm->mirrors_sem);
 			mirror->ops->release(mirror);
@@ -223,11 +223,8 @@ static int hmm_invalidate_range_start(struct mmu_notifier *mn,
 		int ret;
 
 		ret = mirror->ops->sync_cpu_device_pagetables(mirror, &update);
-		if (!update.blockable && ret == -EAGAIN) {
-			up_read(&hmm->mirrors_sem);
-			ret = -EAGAIN;
-			goto out;
-		}
+		if (!update.blockable && ret == -EAGAIN)
+			break;
 	}
 	up_read(&hmm->mirrors_sem);
 
@@ -271,6 +268,7 @@ static const struct mmu_notifier_ops hmm_mmu_notifier_ops = {
  *
  * @mirror: new mirror struct to register
  * @mm: mm to register against
+ * Return: 0 on success, -ENOMEM if no memory, -EINVAL if invalid arguments
  *
  * To start mirroring a process address space, the device driver must register
  * an HMM mirror struct.
@@ -298,7 +296,7 @@ EXPORT_SYMBOL(hmm_mirror_register);
 /*
  * hmm_mirror_unregister() - unregister a mirror
  *
- * @mirror: new mirror struct to register
+ * @mirror: mirror struct to unregister
  *
  * Stop mirroring a process address space, and cleanup.
  */
@@ -372,7 +370,7 @@ static int hmm_pfns_bad(unsigned long addr,
  * @fault: should we fault or not ?
  * @write_fault: write fault ?
  * @walk: mm_walk structure
- * Returns: 0 on success, -EBUSY after page fault, or page fault error
+ * Return: 0 on success, -EBUSY after page fault, or page fault error
  *
  * This function will be called whenever pmd_none() or pte_none() returns true,
  * or whenever there is no page directory covering the virtual address range.
@@ -911,6 +909,7 @@ int hmm_range_register(struct hmm_range *range,
 		       unsigned page_shift)
 {
 	unsigned long mask = ((1UL << page_shift) - 1UL);
+	struct hmm *hmm;
 
 	range->valid = false;
 	range->hmm = NULL;
@@ -924,28 +923,29 @@ int hmm_range_register(struct hmm_range *range,
 	range->start = start;
 	range->end = end;
 
-	range->hmm = hmm_get_or_create(mm);
-	if (!range->hmm)
+	hmm = hmm_get_or_create(mm);
+	if (!hmm)
 		return -EFAULT;
 
 	/* Check if hmm_mm_destroy() was call. */
-	if (range->hmm->mm == NULL || range->hmm->dead) {
-		hmm_put(range->hmm);
+	if (hmm->mm == NULL || hmm->dead) {
+		hmm_put(hmm);
 		return -EFAULT;
 	}
 
-	/* Initialize range to track CPU page table update */
-	mutex_lock(&range->hmm->lock);
+	/* Initialize range to track CPU page table updates. */
+	mutex_lock(&hmm->lock);
 
-	list_add_rcu(&range->list, &range->hmm->ranges);
+	range->hmm = hmm;
+	list_add_rcu(&range->list, &hmm->ranges);
 
 	/*
 	 * If there are any concurrent notifiers we have to wait for them for
 	 * the range to be valid (see hmm_range_wait_until_valid()).
 	 */
-	if (!range->hmm->notifiers)
+	if (!hmm->notifiers)
 		range->valid = true;
-	mutex_unlock(&range->hmm->lock);
+	mutex_unlock(&hmm->lock);
 
 	return 0;
 }
@@ -960,17 +960,19 @@ EXPORT_SYMBOL(hmm_range_register);
  */
 void hmm_range_unregister(struct hmm_range *range)
 {
+	struct hmm *hmm = range->hmm;
+
 	/* Sanity check this really should not happen. */
-	if (range->hmm == NULL || range->end <= range->start)
+	if (hmm == NULL || range->end <= range->start)
 		return;
 
-	mutex_lock(&range->hmm->lock);
+	mutex_lock(&hmm->lock);
 	list_del_rcu(&range->list);
-	mutex_unlock(&range->hmm->lock);
+	mutex_unlock(&hmm->lock);
 
 	/* Drop reference taken by hmm_range_register() */
 	range->valid = false;
-	hmm_put(range->hmm);
+	hmm_put(hmm);
 	range->hmm = NULL;
 }
 EXPORT_SYMBOL(hmm_range_unregister);
@@ -978,7 +980,7 @@ EXPORT_SYMBOL(hmm_range_unregister);
 /*
  * hmm_range_snapshot() - snapshot CPU page table for a range
  * @range: range
- * Returns: -EINVAL if invalid argument, -ENOMEM out of memory, -EPERM invalid
+ * Return: -EINVAL if invalid argument, -ENOMEM out of memory, -EPERM invalid
  *          permission (for instance asking for write and range is read only),
  *          -EAGAIN if you need to retry, -EFAULT invalid (ie either no valid
  *          vma or it is illegal to access that range), number of valid pages
@@ -1061,7 +1063,7 @@ EXPORT_SYMBOL(hmm_range_snapshot);
  * hmm_range_fault() - try to fault some address in a virtual address range
  * @range: range being faulted
  * @block: allow blocking on fault (if true it sleeps and do not drop mmap_sem)
- * Returns: number of valid pages in range->pfns[] (from range start
+ * Return: number of valid pages in range->pfns[] (from range start
  *          address). This may be zero. If the return value is negative,
  *          then one of the following values may be returned:
  *
@@ -1179,7 +1181,7 @@ EXPORT_SYMBOL(hmm_range_fault);
  * @device: device against to dma map page to
  * @daddrs: dma address of mapped pages
  * @block: allow blocking on fault (if true it sleeps and do not drop mmap_sem)
- * Returns: number of pages mapped on success, -EAGAIN if mmap_sem have been
+ * Return: number of pages mapped on success, -EAGAIN if mmap_sem have been
  *          drop and you need to try again, some other error value otherwise
  *
  * Note same usage pattern as hmm_range_fault().
@@ -1267,7 +1269,7 @@ EXPORT_SYMBOL(hmm_range_dma_map);
  * @device: device against which dma map was done
  * @daddrs: dma address of mapped pages
  * @dirty: dirty page if it had the write flag set
- * Returns: number of page unmapped on success, -EINVAL otherwise
+ * Return: number of page unmapped on success, -EINVAL otherwise
  *
  * Note that caller MUST abide by mmu notifier or use HMM mirror and abide
  * to the sync_cpu_device_pagetables() callback so that it is safe here to
@@ -1390,7 +1392,7 @@ static void hmm_devmem_free(struct page *page, void *data)
  * @ops: memory event device driver callback (see struct hmm_devmem_ops)
  * @device: device struct to bind the resource too
  * @size: size in bytes of the device memory to add
- * Returns: pointer to new hmm_devmem struct ERR_PTR otherwise
+ * Return: pointer to new hmm_devmem struct ERR_PTR otherwise
  *
  * This function first finds an empty range of physical address big enough to
  * contain the new resource, and then hotplugs it as ZONE_DEVICE memory, which
-- 
cgit v1.2.3


From 3e582c6e911ffe6c90b9f90324bdf85fc728d0c3 Mon Sep 17 00:00:00 2001
From: Leilk Liu <leilk.liu@mediatek.com>
Date: Wed, 5 Jun 2019 11:07:04 +0800
Subject: spi: mediatek: add SPI_LSB_FIRST support

This patch adds SPI_LSB_FIRST feature support.

Signed-off-by: Leilk Liu <leilk.liu@mediatek.com>
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 drivers/spi/spi-mt65xx.c                 | 15 ++++++---------
 include/linux/platform_data/spi-mt65xx.h |  2 --
 2 files changed, 6 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/spi/spi-mt65xx.c b/drivers/spi/spi-mt65xx.c
index 0cce6f0ba824..7f4dc1844789 100644
--- a/drivers/spi/spi-mt65xx.c
+++ b/drivers/spi/spi-mt65xx.c
@@ -131,8 +131,6 @@ static const struct mtk_spi_compatible mt8183_compat = {
  * supplies it.
  */
 static const struct mtk_chip_config mtk_default_chip_info = {
-	.rx_mlsb = 1,
-	.tx_mlsb = 1,
 	.cs_pol = 0,
 	.sample_sel = 0,
 };
@@ -203,14 +201,13 @@ static int mtk_spi_prepare_message(struct spi_master *master,
 		reg_val &= ~SPI_CMD_CPOL;
 
 	/* set the mlsbx and mlsbtx */
-	if (chip_config->tx_mlsb)
-		reg_val |= SPI_CMD_TXMSBF;
-	else
+	if (spi->mode & SPI_LSB_FIRST) {
 		reg_val &= ~SPI_CMD_TXMSBF;
-	if (chip_config->rx_mlsb)
-		reg_val |= SPI_CMD_RXMSBF;
-	else
 		reg_val &= ~SPI_CMD_RXMSBF;
+	} else {
+		reg_val |= SPI_CMD_TXMSBF;
+		reg_val |= SPI_CMD_RXMSBF;
+	}
 
 	/* set the tx/rx endian */
 #ifdef __LITTLE_ENDIAN
@@ -607,7 +604,7 @@ static int mtk_spi_probe(struct platform_device *pdev)
 
 	master->auto_runtime_pm = true;
 	master->dev.of_node = pdev->dev.of_node;
-	master->mode_bits = SPI_CPOL | SPI_CPHA;
+	master->mode_bits = SPI_CPOL | SPI_CPHA | SPI_LSB_FIRST;
 
 	master->set_cs = mtk_spi_set_cs;
 	master->prepare_message = mtk_spi_prepare_message;
diff --git a/include/linux/platform_data/spi-mt65xx.h b/include/linux/platform_data/spi-mt65xx.h
index ba4e4bb70262..8d5df58a13ef 100644
--- a/include/linux/platform_data/spi-mt65xx.h
+++ b/include/linux/platform_data/spi-mt65xx.h
@@ -14,8 +14,6 @@
 
 /* Board specific platform_data */
 struct mtk_chip_config {
-	u32 tx_mlsb;
-	u32 rx_mlsb;
 	u32 cs_pol;
 	u32 sample_sel;
 };
-- 
cgit v1.2.3


From 6445500b43129baac36c56d629cf1dd9e1104167 Mon Sep 17 00:00:00 2001
From: Vitor Soares <Vitor.Soares@synopsys.com>
Date: Thu, 6 Jun 2019 17:12:02 +0200
Subject: regmap: add i3c bus support

Add basic support for the I3C bus.
This is a simple implementation that only supports
SDR Read and Write commands.

Signed-off-by: Vitor Soares <vitor.soares@synopsys.com>
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 drivers/base/regmap/Kconfig      |  6 +++-
 drivers/base/regmap/Makefile     |  1 +
 drivers/base/regmap/regmap-i3c.c | 60 ++++++++++++++++++++++++++++++++++++++++
 include/linux/regmap.h           | 20 ++++++++++++++
 4 files changed, 86 insertions(+), 1 deletion(-)
 create mode 100644 drivers/base/regmap/regmap-i3c.c

(limited to 'include/linux')

diff --git a/drivers/base/regmap/Kconfig b/drivers/base/regmap/Kconfig
index 6ad5ef48b61e..c8bbf5322720 100644
--- a/drivers/base/regmap/Kconfig
+++ b/drivers/base/regmap/Kconfig
@@ -4,7 +4,7 @@
 # subsystems should select the appropriate symbols.
 
 config REGMAP
-	default y if (REGMAP_I2C || REGMAP_SPI || REGMAP_SPMI || REGMAP_W1 || REGMAP_AC97 || REGMAP_MMIO || REGMAP_IRQ)
+	default y if (REGMAP_I2C || REGMAP_SPI || REGMAP_SPMI || REGMAP_W1 || REGMAP_AC97 || REGMAP_MMIO || REGMAP_IRQ || REGMAP_I3C)
 	select IRQ_DOMAIN if REGMAP_IRQ
 	bool
 
@@ -49,3 +49,7 @@ config REGMAP_SOUNDWIRE
 config REGMAP_SCCB
 	tristate
 	depends on I2C
+
+config REGMAP_I3C
+	tristate
+	depends on I3C
diff --git a/drivers/base/regmap/Makefile b/drivers/base/regmap/Makefile
index f5b4e8851d00..ff6c7d8ec1cd 100644
--- a/drivers/base/regmap/Makefile
+++ b/drivers/base/regmap/Makefile
@@ -16,3 +16,4 @@ obj-$(CONFIG_REGMAP_IRQ) += regmap-irq.o
 obj-$(CONFIG_REGMAP_W1) += regmap-w1.o
 obj-$(CONFIG_REGMAP_SOUNDWIRE) += regmap-sdw.o
 obj-$(CONFIG_REGMAP_SCCB) += regmap-sccb.o
+obj-$(CONFIG_REGMAP_I3C) += regmap-i3c.o
diff --git a/drivers/base/regmap/regmap-i3c.c b/drivers/base/regmap/regmap-i3c.c
new file mode 100644
index 000000000000..1578fb506683
--- /dev/null
+++ b/drivers/base/regmap/regmap-i3c.c
@@ -0,0 +1,60 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2018 Synopsys, Inc. and/or its affiliates.
+
+#include <linux/regmap.h>
+#include <linux/i3c/device.h>
+#include <linux/i3c/master.h>
+#include <linux/module.h>
+
+static int regmap_i3c_write(void *context, const void *data, size_t count)
+{
+	struct device *dev = context;
+	struct i3c_device *i3c = dev_to_i3cdev(dev);
+	struct i3c_priv_xfer xfers[] = {
+		{
+			.rnw = false,
+			.len = count,
+			.data.out = data,
+		},
+	};
+
+	return i3c_device_do_priv_xfers(i3c, xfers, 1);
+}
+
+static int regmap_i3c_read(void *context,
+			   const void *reg, size_t reg_size,
+			   void *val, size_t val_size)
+{
+	struct device *dev = context;
+	struct i3c_device *i3c = dev_to_i3cdev(dev);
+	struct i3c_priv_xfer xfers[2];
+
+	xfers[0].rnw = false;
+	xfers[0].len = reg_size;
+	xfers[0].data.out = reg;
+
+	xfers[1].rnw = true;
+	xfers[1].len = val_size;
+	xfers[1].data.in = val;
+
+	return i3c_device_do_priv_xfers(i3c, xfers, 2);
+}
+
+static struct regmap_bus regmap_i3c = {
+	.write = regmap_i3c_write,
+	.read = regmap_i3c_read,
+};
+
+struct regmap *__devm_regmap_init_i3c(struct i3c_device *i3c,
+				      const struct regmap_config *config,
+				      struct lock_class_key *lock_key,
+				      const char *lock_name)
+{
+	return __devm_regmap_init(&i3c->dev, &regmap_i3c, &i3c->dev, config,
+				  lock_key, lock_name);
+}
+EXPORT_SYMBOL_GPL(__devm_regmap_init_i3c);
+
+MODULE_AUTHOR("Vitor Soares <vitor.soares@synopsys.com>");
+MODULE_DESCRIPTION("Regmap I3C Module");
+MODULE_LICENSE("GPL v2");
diff --git a/include/linux/regmap.h b/include/linux/regmap.h
index daeec7dbd65c..f65984d98b07 100644
--- a/include/linux/regmap.h
+++ b/include/linux/regmap.h
@@ -25,6 +25,7 @@ struct module;
 struct clk;
 struct device;
 struct i2c_client;
+struct i3c_device;
 struct irq_domain;
 struct slim_device;
 struct spi_device;
@@ -624,6 +625,10 @@ struct regmap *__devm_regmap_init_slimbus(struct slim_device *slimbus,
 				 const struct regmap_config *config,
 				 struct lock_class_key *lock_key,
 				 const char *lock_name);
+struct regmap *__devm_regmap_init_i3c(struct i3c_device *i3c,
+				 const struct regmap_config *config,
+				 struct lock_class_key *lock_key,
+				 const char *lock_name);
 /*
  * Wrapper for regmap_init macros to include a unique lockdep key and name
  * for each call. No-op if CONFIG_LOCKDEP is not set.
@@ -982,6 +987,21 @@ bool regmap_ac97_default_volatile(struct device *dev, unsigned int reg);
 #define devm_regmap_init_slimbus(slimbus, config)			\
 	__regmap_lockdep_wrapper(__devm_regmap_init_slimbus, #config,	\
 				slimbus, config)
+
+/**
+ * devm_regmap_init_i3c() - Initialise managed register map
+ *
+ * @i3c: Device that will be interacted with
+ * @config: Configuration for register map
+ *
+ * The return value will be an ERR_PTR() on error or a valid pointer
+ * to a struct regmap.  The regmap will be automatically freed by the
+ * device management code.
+ */
+#define devm_regmap_init_i3c(i3c, config)				\
+	__regmap_lockdep_wrapper(__devm_regmap_init_i3c, #config,	\
+				i3c, config)
+
 int regmap_mmio_attach_clk(struct regmap *map, struct clk *clk);
 void regmap_mmio_detach_clk(struct regmap *map);
 void regmap_exit(struct regmap *map);
-- 
cgit v1.2.3


From 6d7c3cde93c1d9ac0b37f78ec3f2ff052159a242 Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg@mellanox.com>
Date: Wed, 22 May 2019 16:52:52 -0300
Subject: mm/hmm: fix use after free with struct hmm in the mmu notifiers

mmu_notifier_unregister_no_release() is not a fence and the mmu_notifier
system will continue to reference hmm->mn until the srcu grace period
expires.

This results in use-after-free races like this:

         CPU0                                     CPU1
                                               __mmu_notifier_invalidate_range_start()
                                                 srcu_read_lock
                                                 hlist_for_each ()
                                                   // mn == hmm->mn
hmm_mirror_unregister()
  hmm_put()
    hmm_free()
      mmu_notifier_unregister_no_release()
         hlist_del_init_rcu(hmm-mn->list)
			                           mn->ops->invalidate_range_start(mn, range);
					             mm_get_hmm()
      mm->hmm = NULL;
      kfree(hmm)
                                                     mutex_lock(&hmm->lock);

Use SRCU to kfree the hmm memory so that the notifiers can rely on hmm
existing. Get the now-safe hmm struct through container_of and directly
check kref_get_unless_zero to lock it against free.

Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
Reviewed-by: Ira Weiny <ira.weiny@intel.com>
Reviewed-by: John Hubbard <jhubbard@nvidia.com>
Reviewed-by: Ralph Campbell <rcampbell@nvidia.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Tested-by: Philip Yang <Philip.Yang@amd.com>
---
 include/linux/hmm.h |  1 +
 mm/hmm.c            | 23 +++++++++++++++++------
 2 files changed, 18 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/hmm.h b/include/linux/hmm.h
index 7007123842ba..cb01cf1fa3c0 100644
--- a/include/linux/hmm.h
+++ b/include/linux/hmm.h
@@ -93,6 +93,7 @@ struct hmm {
 	struct mmu_notifier	mmu_notifier;
 	struct rw_semaphore	mirrors_sem;
 	wait_queue_head_t	wq;
+	struct rcu_head		rcu;
 	long			notifiers;
 	bool			dead;
 };
diff --git a/mm/hmm.c b/mm/hmm.c
index 826816ab2377..f6956d78e3cb 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -104,6 +104,11 @@ error:
 	return NULL;
 }
 
+static void hmm_free_rcu(struct rcu_head *rcu)
+{
+	kfree(container_of(rcu, struct hmm, rcu));
+}
+
 static void hmm_free(struct kref *kref)
 {
 	struct hmm *hmm = container_of(kref, struct hmm, kref);
@@ -116,7 +121,7 @@ static void hmm_free(struct kref *kref)
 		mm->hmm = NULL;
 	spin_unlock(&mm->page_table_lock);
 
-	kfree(hmm);
+	mmu_notifier_call_srcu(&hmm->rcu, hmm_free_rcu);
 }
 
 static inline void hmm_put(struct hmm *hmm)
@@ -144,10 +149,14 @@ void hmm_mm_destroy(struct mm_struct *mm)
 
 static void hmm_release(struct mmu_notifier *mn, struct mm_struct *mm)
 {
-	struct hmm *hmm = mm_get_hmm(mm);
+	struct hmm *hmm = container_of(mn, struct hmm, mmu_notifier);
 	struct hmm_mirror *mirror;
 	struct hmm_range *range;
 
+	/* Bail out if hmm is in the process of being freed */
+	if (!kref_get_unless_zero(&hmm->kref))
+		return;
+
 	/* Report this HMM as dying. */
 	hmm->dead = true;
 
@@ -185,13 +194,14 @@ static void hmm_release(struct mmu_notifier *mn, struct mm_struct *mm)
 static int hmm_invalidate_range_start(struct mmu_notifier *mn,
 			const struct mmu_notifier_range *nrange)
 {
-	struct hmm *hmm = mm_get_hmm(nrange->mm);
+	struct hmm *hmm = container_of(mn, struct hmm, mmu_notifier);
 	struct hmm_mirror *mirror;
 	struct hmm_update update;
 	struct hmm_range *range;
 	int ret = 0;
 
-	VM_BUG_ON(!hmm);
+	if (!kref_get_unless_zero(&hmm->kref))
+		return 0;
 
 	update.start = nrange->start;
 	update.end = nrange->end;
@@ -236,9 +246,10 @@ out:
 static void hmm_invalidate_range_end(struct mmu_notifier *mn,
 			const struct mmu_notifier_range *nrange)
 {
-	struct hmm *hmm = mm_get_hmm(nrange->mm);
+	struct hmm *hmm = container_of(mn, struct hmm, mmu_notifier);
 
-	VM_BUG_ON(!hmm);
+	if (!kref_get_unless_zero(&hmm->kref))
+		return;
 
 	mutex_lock(&hmm->lock);
 	hmm->notifiers--;
-- 
cgit v1.2.3


From 5923ea6c2ce626f0aa8a547d5b7e5fce705dd3dc Mon Sep 17 00:00:00 2001
From: Linus Walleij <linus.walleij@linaro.org>
Date: Fri, 26 Apr 2019 14:40:18 +0200
Subject: gpio: pass lookup and descriptor flags to request_own

When a gpio_chip wants to request a descriptor from itself
using gpiochip_request_own_desc() it needs to be able to specify
fully how to use the descriptor, notably line inversion
semantics. The workaround in the gpiolib.c can be removed
and cases (such as SPI CS) where we need at times to request
a GPIO with line inversion semantics directly on a chip for
workarounds, can be fully supported with this call.

Fix up some users of the API that weren't really using the
last flag to set up the line as input or output properly
but instead just calling direction setting explicitly
after requesting the line.

Cc: Martin Sperl <kernel@martin.sperl.org>
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
---
 arch/arm/mach-omap1/ams-delta-fiq.c   |  4 +++-
 arch/arm/mach-omap1/board-ams-delta.c |  5 +++--
 drivers/gpio/gpio-mvebu.c             | 11 ++++-------
 drivers/gpio/gpiolib-acpi.c           |  6 +++---
 drivers/gpio/gpiolib.c                | 31 ++++++++++---------------------
 drivers/hid/hid-cp2112.c              |  7 +++++--
 drivers/memory/omap-gpmc.c            |  4 +++-
 include/linux/gpio/driver.h           |  4 +++-
 8 files changed, 34 insertions(+), 38 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm/mach-omap1/ams-delta-fiq.c b/arch/arm/mach-omap1/ams-delta-fiq.c
index 51212133ce06..46ca2d9d38ef 100644
--- a/arch/arm/mach-omap1/ams-delta-fiq.c
+++ b/arch/arm/mach-omap1/ams-delta-fiq.c
@@ -14,6 +14,7 @@
  * the Free Software Foundation.
  */
 #include <linux/gpio/consumer.h>
+#include <linux/gpio/machine.h>
 #include <linux/gpio/driver.h>
 #include <linux/interrupt.h>
 #include <linux/irq.h>
@@ -102,7 +103,8 @@ void __init ams_delta_init_fiq(struct gpio_chip *chip,
 	}
 
 	for (i = 0; i < ARRAY_SIZE(irq_data); i++) {
-		gpiod = gpiochip_request_own_desc(chip, i, pin_name[i], 0);
+		gpiod = gpiochip_request_own_desc(chip, i, pin_name[i],
+						  GPIO_ACTIVE_HIGH, GPIOD_IN);
 		if (IS_ERR(gpiod)) {
 			pr_err("%s: failed to get GPIO pin %d (%ld)\n",
 			       __func__, i, PTR_ERR(gpiod));
diff --git a/arch/arm/mach-omap1/board-ams-delta.c b/arch/arm/mach-omap1/board-ams-delta.c
index b6e814166ee0..e49542540fc6 100644
--- a/arch/arm/mach-omap1/board-ams-delta.c
+++ b/arch/arm/mach-omap1/board-ams-delta.c
@@ -13,6 +13,7 @@
  */
 #include <linux/gpio/driver.h>
 #include <linux/gpio/machine.h>
+#include <linux/gpio/consumer.h>
 #include <linux/gpio.h>
 #include <linux/kernel.h>
 #include <linux/init.h>
@@ -609,12 +610,12 @@ static void __init modem_assign_irq(struct gpio_chip *chip)
 	struct gpio_desc *gpiod;
 
 	gpiod = gpiochip_request_own_desc(chip, AMS_DELTA_GPIO_PIN_MODEM_IRQ,
-					  "modem_irq", 0);
+					  "modem_irq", GPIO_ACTIVE_HIGH,
+					  GPIOD_IN);
 	if (IS_ERR(gpiod)) {
 		pr_err("%s: modem IRQ GPIO request failed (%ld)\n", __func__,
 		       PTR_ERR(gpiod));
 	} else {
-		gpiod_direction_input(gpiod);
 		ams_delta_modem_ports[0].irq = gpiod_to_irq(gpiod);
 	}
 }
diff --git a/drivers/gpio/gpio-mvebu.c b/drivers/gpio/gpio-mvebu.c
index 059094ac44cb..869d47f89599 100644
--- a/drivers/gpio/gpio-mvebu.c
+++ b/drivers/gpio/gpio-mvebu.c
@@ -38,6 +38,7 @@
 #include <linux/err.h>
 #include <linux/gpio/driver.h>
 #include <linux/gpio/consumer.h>
+#include <linux/gpio/machine.h>
 #include <linux/init.h>
 #include <linux/io.h>
 #include <linux/irq.h>
@@ -618,18 +619,14 @@ static int mvebu_pwm_request(struct pwm_chip *chip, struct pwm_device *pwm)
 		ret = -EBUSY;
 	} else {
 		desc = gpiochip_request_own_desc(&mvchip->chip,
-						 pwm->hwpwm, "mvebu-pwm", 0);
+						 pwm->hwpwm, "mvebu-pwm",
+						 GPIO_ACTIVE_HIGH,
+						 GPIOD_OUT_LOW);
 		if (IS_ERR(desc)) {
 			ret = PTR_ERR(desc);
 			goto out;
 		}
 
-		ret = gpiod_direction_output(desc, 0);
-		if (ret) {
-			gpiochip_free_own_desc(desc);
-			goto out;
-		}
-
 		mvpwm->gpiod = desc;
 	}
 out:
diff --git a/drivers/gpio/gpiolib-acpi.c b/drivers/gpio/gpiolib-acpi.c
index c9fc9e232aaf..39f2f9035c11 100644
--- a/drivers/gpio/gpiolib-acpi.c
+++ b/drivers/gpio/gpiolib-acpi.c
@@ -217,14 +217,13 @@ static acpi_status acpi_gpiochip_alloc_event(struct acpi_resource *ares,
 	if (!handler)
 		return AE_OK;
 
-	desc = gpiochip_request_own_desc(chip, pin, "ACPI:Event", 0);
+	desc = gpiochip_request_own_desc(chip, pin, "ACPI:Event",
+					 GPIO_ACTIVE_HIGH, GPIOD_IN);
 	if (IS_ERR(desc)) {
 		dev_err(chip->parent, "Failed to request GPIO\n");
 		return AE_ERROR;
 	}
 
-	gpiod_direction_input(desc);
-
 	ret = gpiochip_lock_as_irq(chip, pin);
 	if (ret) {
 		dev_err(chip->parent, "Failed to lock GPIO as interrupt\n");
@@ -951,6 +950,7 @@ acpi_gpio_adr_space_handler(u32 function, acpi_physical_address address,
 			const char *label = "ACPI:OpRegion";
 
 			desc = gpiochip_request_own_desc(chip, pin, label,
+							 GPIO_ACTIVE_HIGH,
 							 flags);
 			if (IS_ERR(desc)) {
 				status = AE_ERROR;
diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c
index e013d417a936..4561cb39bdb4 100644
--- a/drivers/gpio/gpiolib.c
+++ b/drivers/gpio/gpiolib.c
@@ -2503,7 +2503,11 @@ EXPORT_SYMBOL_GPL(gpiochip_is_requested);
  * @chip: GPIO chip
  * @hwnum: hardware number of the GPIO for which to request the descriptor
  * @label: label for the GPIO
- * @flags: flags for this GPIO or 0 if default
+ * @lflags: lookup flags for this GPIO or 0 if default, this can be used to
+ * specify things like line inversion semantics with the machine flags
+ * such as GPIO_ACTIVE_LOW
+ * @dflags: descriptor request flags for this GPIO or 0 if default, this
+ * can be used to specify consumer semantics such as open drain
  *
  * Function allows GPIO chip drivers to request and use their own GPIO
  * descriptors via gpiolib API. Difference to gpiod_request() is that this
@@ -2517,9 +2521,9 @@ EXPORT_SYMBOL_GPL(gpiochip_is_requested);
  */
 struct gpio_desc *gpiochip_request_own_desc(struct gpio_chip *chip, u16 hwnum,
 					    const char *label,
-					    enum gpiod_flags flags)
+					    enum gpio_lookup_flags lflags,
+					    enum gpiod_flags dflags)
 {
-	unsigned long lflags = GPIO_LOOKUP_FLAGS_DEFAULT;
 	struct gpio_desc *desc = gpiochip_get_desc(chip, hwnum);
 	int err;
 
@@ -2532,7 +2536,7 @@ struct gpio_desc *gpiochip_request_own_desc(struct gpio_chip *chip, u16 hwnum,
 	if (err < 0)
 		return ERR_PTR(err);
 
-	err = gpiod_configure_flags(desc, label, lflags, flags);
+	err = gpiod_configure_flags(desc, label, lflags, dflags);
 	if (err) {
 		chip_err(chip, "setup of own GPIO %s failed\n", label);
 		gpiod_free_commit(desc);
@@ -4420,15 +4424,8 @@ int gpiod_hog(struct gpio_desc *desc, const char *name,
 	chip = gpiod_to_chip(desc);
 	hwnum = gpio_chip_hwgpio(desc);
 
-	/*
-	 * FIXME: not very elegant that we call gpiod_configure_flags()
-	 * twice here (once inside gpiochip_request_own_desc() and
-	 * again here), but the gpiochip_request_own_desc() is external
-	 * and cannot really pass the lflags so this is the lesser evil
-	 * at the moment. Pass zero as dflags on this first call so we
-	 * don't screw anything up.
-	 */
-	local_desc = gpiochip_request_own_desc(chip, hwnum, name, 0);
+	local_desc = gpiochip_request_own_desc(chip, hwnum, name,
+					       lflags, dflags);
 	if (IS_ERR(local_desc)) {
 		status = PTR_ERR(local_desc);
 		pr_err("requesting hog GPIO %s (chip %s, offset %d) failed, %d\n",
@@ -4436,14 +4433,6 @@ int gpiod_hog(struct gpio_desc *desc, const char *name,
 		return status;
 	}
 
-	status = gpiod_configure_flags(desc, name, lflags, dflags);
-	if (status < 0) {
-		pr_err("setup of hog GPIO %s (chip %s, offset %d) failed, %d\n",
-		       name, chip->label, hwnum, status);
-		gpiochip_free_own_desc(desc);
-		return status;
-	}
-
 	/* Mark GPIO as hogged so it can be identified and removed later */
 	set_bit(FLAG_IS_HOGGED, &desc->flags);
 
diff --git a/drivers/hid/hid-cp2112.c b/drivers/hid/hid-cp2112.c
index 47f65857408d..f6fb97a14de6 100644
--- a/drivers/hid/hid-cp2112.c
+++ b/drivers/hid/hid-cp2112.c
@@ -24,7 +24,8 @@
  *   https://www.silabs.com/documents/public/application-notes/an495-cp2112-interface-specification.pdf
  */
 
-#include <linux/gpio.h>
+#include <linux/gpio/consumer.h>
+#include <linux/gpio/machine.h>
 #include <linux/gpio/driver.h>
 #include <linux/hid.h>
 #include <linux/hidraw.h>
@@ -1203,7 +1204,9 @@ static int __maybe_unused cp2112_allocate_irq(struct cp2112_device *dev,
 		return -EINVAL;
 
 	dev->desc[pin] = gpiochip_request_own_desc(&dev->gc, pin,
-						   "HID/I2C:Event", 0);
+						   "HID/I2C:Event",
+						   GPIO_ACTIVE_HIGH,
+						   GPIOD_IN);
 	if (IS_ERR(dev->desc[pin])) {
 		dev_err(dev->gc.parent, "Failed to request GPIO\n");
 		return PTR_ERR(dev->desc[pin]);
diff --git a/drivers/memory/omap-gpmc.c b/drivers/memory/omap-gpmc.c
index f6297599433f..f4f98957dc86 100644
--- a/drivers/memory/omap-gpmc.c
+++ b/drivers/memory/omap-gpmc.c
@@ -22,6 +22,7 @@
 #include <linux/io.h>
 #include <linux/gpio/driver.h>
 #include <linux/gpio/consumer.h> /* GPIO descriptor enum */
+#include <linux/gpio/machine.h>
 #include <linux/interrupt.h>
 #include <linux/irqdomain.h>
 #include <linux/platform_device.h>
@@ -2172,7 +2173,8 @@ static int gpmc_probe_generic_child(struct platform_device *pdev,
 
 		waitpin_desc = gpiochip_request_own_desc(&gpmc->gpio_chip,
 							 wait_pin, "WAITPIN",
-							 0);
+							 GPIO_ACTIVE_HIGH,
+							 GPIOD_IN);
 		if (IS_ERR(waitpin_desc)) {
 			dev_err(&pdev->dev, "invalid wait-pin: %d\n", wait_pin);
 			ret = PTR_ERR(waitpin_desc);
diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h
index a1d273c96016..937c40fb61f7 100644
--- a/include/linux/gpio/driver.h
+++ b/include/linux/gpio/driver.h
@@ -18,6 +18,7 @@ struct seq_file;
 struct gpio_device;
 struct module;
 enum gpiod_flags;
+enum gpio_lookup_flags;
 
 #ifdef CONFIG_GPIOLIB
 
@@ -614,7 +615,8 @@ gpiochip_remove_pin_ranges(struct gpio_chip *chip)
 
 struct gpio_desc *gpiochip_request_own_desc(struct gpio_chip *chip, u16 hwnum,
 					    const char *label,
-					    enum gpiod_flags flags);
+					    enum gpio_lookup_flags lflags,
+					    enum gpiod_flags dflags);
 void gpiochip_free_own_desc(struct gpio_desc *desc);
 
 void devprop_gpiochip_set_names(struct gpio_chip *chip,
-- 
cgit v1.2.3


From 2809b392a62ae307da058a52d451b2fc3ce4de7e Mon Sep 17 00:00:00 2001
From: Sven Schnelle <svens@stackframe.org>
Date: Wed, 5 Jun 2019 22:32:21 +0200
Subject: compiler.h: add CC_USING_PATCHABLE_FUNCTION_ENTRY

This can be used for architectures implementing dynamic
ftrace via -fpatchable-function-entry.

Signed-off-by: Sven Schnelle <svens@stackframe.org>
Signed-off-by: Helge Deller <deller@gmx.de>
---
 include/linux/compiler_types.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h
index 19e58b9138a0..095d55c3834d 100644
--- a/include/linux/compiler_types.h
+++ b/include/linux/compiler_types.h
@@ -112,6 +112,8 @@ struct ftrace_likely_data {
 
 #if defined(CC_USING_HOTPATCH)
 #define notrace			__attribute__((hotpatch(0, 0)))
+#elif defined(CC_USING_PATCHABLE_FUNCTION_ENTRY)
+#define notrace			__attribute__((patchable_function_entry(0, 0)))
 #else
 #define notrace			__attribute__((__no_instrument_function__))
 #endif
-- 
cgit v1.2.3


From cb1aaebea8d79860181559d7b5d482aea63db113 Mon Sep 17 00:00:00 2001
From: Mauro Carvalho Chehab <mchehab+samsung@kernel.org>
Date: Fri, 7 Jun 2019 15:54:32 -0300
Subject: docs: fix broken documentation links

Mostly due to the x86 and ACPI conversions, several documentation
links are still pointing to the old files. Fix them.

Signed-off-by: Mauro Carvalho Chehab <mchehab+samsung@kernel.org>
Reviewed-by: Wolfram Sang <wsa@the-dreams.de>
Reviewed-by: Sven Van Asbroeck <TheSven73@gmail.com>
Reviewed-by: Bhupesh Sharma <bhsharma@redhat.com>
Acked-by: Mark Brown <broonie@kernel.org>
Signed-off-by: Jonathan Corbet <corbet@lwn.net>
---
 Documentation/acpi/dsd/leds.txt                          |  2 +-
 Documentation/admin-guide/kernel-parameters.rst          |  6 +++---
 Documentation/admin-guide/kernel-parameters.txt          | 16 ++++++++--------
 Documentation/admin-guide/ras.rst                        |  2 +-
 Documentation/devicetree/bindings/net/fsl-enetc.txt      |  7 +++----
 .../devicetree/bindings/pci/amlogic,meson-pcie.txt       |  2 +-
 .../bindings/regulator/qcom,rpmh-regulator.txt           |  2 +-
 Documentation/devicetree/booting-without-of.txt          |  2 +-
 Documentation/driver-api/gpio/board.rst                  |  2 +-
 Documentation/driver-api/gpio/consumer.rst               |  2 +-
 Documentation/firmware-guide/acpi/enumeration.rst        |  2 +-
 Documentation/firmware-guide/acpi/method-tracing.rst     |  2 +-
 Documentation/i2c/instantiating-devices                  |  2 +-
 Documentation/sysctl/kernel.txt                          |  4 ++--
 Documentation/translations/zh_CN/process/4.Coding.rst    |  2 +-
 Documentation/x86/x86_64/5level-paging.rst               |  2 +-
 Documentation/x86/x86_64/boot-options.rst                |  4 ++--
 Documentation/x86/x86_64/fake-numa-for-cpusets.rst       |  2 +-
 MAINTAINERS                                              |  4 ++--
 arch/arm/Kconfig                                         |  2 +-
 arch/arm64/kernel/kexec_image.c                          |  2 +-
 arch/x86/Kconfig                                         | 14 +++++++-------
 arch/x86/Kconfig.debug                                   |  2 +-
 arch/x86/boot/header.S                                   |  2 +-
 arch/x86/entry/entry_64.S                                |  2 +-
 arch/x86/include/asm/bootparam_utils.h                   |  2 +-
 arch/x86/include/asm/page_64_types.h                     |  2 +-
 arch/x86/include/asm/pgtable_64_types.h                  |  2 +-
 arch/x86/kernel/cpu/microcode/amd.c                      |  2 +-
 arch/x86/kernel/kexec-bzimage64.c                        |  2 +-
 arch/x86/kernel/pci-dma.c                                |  2 +-
 arch/x86/mm/tlb.c                                        |  2 +-
 arch/x86/platform/pvh/enlighten.c                        |  2 +-
 drivers/acpi/Kconfig                                     | 10 +++++-----
 drivers/net/ethernet/faraday/ftgmac100.c                 |  2 +-
 drivers/staging/fieldbus/Documentation/fieldbus_dev.txt  |  4 ++--
 drivers/vhost/vhost.c                                    |  2 +-
 include/acpi/acpi_drivers.h                              |  2 +-
 include/linux/fs_context.h                               |  2 +-
 include/linux/lsm_hooks.h                                |  2 +-
 mm/Kconfig                                               |  2 +-
 security/Kconfig                                         |  2 +-
 tools/include/linux/err.h                                |  2 +-
 tools/objtool/Documentation/stack-validation.txt         |  4 ++--
 44 files changed, 70 insertions(+), 71 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/acpi/dsd/leds.txt b/Documentation/acpi/dsd/leds.txt
index 81a63af42ed2..cc58b1a574c5 100644
--- a/Documentation/acpi/dsd/leds.txt
+++ b/Documentation/acpi/dsd/leds.txt
@@ -96,4 +96,4 @@ where
     <URL:http://www.uefi.org/sites/default/files/resources/_DSD-hierarchical-data-extension-UUID-v1.1.pdf>,
     referenced 2019-02-21.
 
-[7] Documentation/acpi/dsd/data-node-reference.txt
+[7] Documentation/firmware-guide/acpi/dsd/data-node-references.rst
diff --git a/Documentation/admin-guide/kernel-parameters.rst b/Documentation/admin-guide/kernel-parameters.rst
index 0124980dca2d..8d3273e32eb1 100644
--- a/Documentation/admin-guide/kernel-parameters.rst
+++ b/Documentation/admin-guide/kernel-parameters.rst
@@ -167,7 +167,7 @@ parameter is applicable::
 	X86-32	X86-32, aka i386 architecture is enabled.
 	X86-64	X86-64 architecture is enabled.
 			More X86-64 boot options can be found in
-			Documentation/x86/x86_64/boot-options.txt .
+			Documentation/x86/x86_64/boot-options.rst.
 	X86	Either 32-bit or 64-bit x86 (same as X86-32+X86-64)
 	X86_UV	SGI UV support is enabled.
 	XEN	Xen support is enabled
@@ -181,10 +181,10 @@ In addition, the following text indicates that the option::
 Parameters denoted with BOOT are actually interpreted by the boot
 loader, and have no meaning to the kernel directly.
 Do not modify the syntax of boot loader parameters without extreme
-need or coordination with <Documentation/x86/boot.txt>.
+need or coordination with <Documentation/x86/boot.rst>.
 
 There are also arch-specific kernel-parameters not documented here.
-See for example <Documentation/x86/x86_64/boot-options.txt>.
+See for example <Documentation/x86/x86_64/boot-options.rst>.
 
 Note that ALL kernel parameters listed below are CASE SENSITIVE, and that
 a trailing = on the name of any parameter states that that parameter will
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 79d043b8850d..1abd7e145357 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -53,7 +53,7 @@
 			ACPI_DEBUG_PRINT statements, e.g.,
 			    ACPI_DEBUG_PRINT((ACPI_DB_INFO, ...
 			The debug_level mask defaults to "info".  See
-			Documentation/acpi/debug.txt for more information about
+			Documentation/firmware-guide/acpi/debug.rst for more information about
 			debug layers and levels.
 
 			Enable processor driver info messages:
@@ -963,7 +963,7 @@
 			for details.
 
 	nompx		[X86] Disables Intel Memory Protection Extensions.
-			See Documentation/x86/intel_mpx.txt for more
+			See Documentation/x86/intel_mpx.rst for more
 			information about the feature.
 
 	nopku		[X86] Disable Memory Protection Keys CPU feature found
@@ -1189,7 +1189,7 @@
 			that is to be dynamically loaded by Linux. If there are
 			multiple variables with the same name but with different
 			vendor GUIDs, all of them will be loaded. See
-			Documentation/acpi/ssdt-overlays.txt for details.
+			Documentation/admin-guide/acpi/ssdt-overlays.rst for details.
 
 
 	eisa_irq_edge=	[PARISC,HW]
@@ -2383,7 +2383,7 @@
 
 	mce		[X86-32] Machine Check Exception
 
-	mce=option	[X86-64] See Documentation/x86/x86_64/boot-options.txt
+	mce=option	[X86-64] See Documentation/x86/x86_64/boot-options.rst
 
 	md=		[HW] RAID subsystems devices and level
 			See Documentation/admin-guide/md.rst.
@@ -2439,7 +2439,7 @@
 			set according to the
 			CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE kernel config
 			option.
-			See Documentation/memory-hotplug.txt.
+			See Documentation/admin-guide/mm/memory-hotplug.rst.
 
 	memmap=exactmap	[KNL,X86] Enable setting of an exact
 			E820 memory map, as specified by the user.
@@ -2528,7 +2528,7 @@
 			mem_encrypt=on:		Activate SME
 			mem_encrypt=off:	Do not activate SME
 
-			Refer to Documentation/x86/amd-memory-encryption.txt
+			Refer to Documentation/virtual/kvm/amd-memory-encryption.rst
 			for details on when memory encryption can be activated.
 
 	mem_sleep_default=	[SUSPEND] Default system suspend mode:
@@ -3529,7 +3529,7 @@
 			See Documentation/blockdev/paride.txt.
 
 	pirq=		[SMP,APIC] Manual mp-table setup
-			See Documentation/x86/i386/IO-APIC.txt.
+			See Documentation/x86/i386/IO-APIC.rst.
 
 	plip=		[PPT,NET] Parallel port network link
 			Format: { parport<nr> | timid | 0 }
@@ -5055,7 +5055,7 @@
 			Can be used multiple times for multiple devices.
 
 	vga=		[BOOT,X86-32] Select a particular video mode
-			See Documentation/x86/boot.txt and
+			See Documentation/x86/boot.rst and
 			Documentation/svga.txt.
 			Use vga=ask for menu.
 			This is actually a boot loader parameter; the value is
diff --git a/Documentation/admin-guide/ras.rst b/Documentation/admin-guide/ras.rst
index c7495e42e6f4..2b20f5f7380d 100644
--- a/Documentation/admin-guide/ras.rst
+++ b/Documentation/admin-guide/ras.rst
@@ -199,7 +199,7 @@ Architecture (MCA)\ [#f3]_.
   mode).
 
 .. [#f3] For more details about the Machine Check Architecture (MCA),
-  please read Documentation/x86/x86_64/machinecheck at the Kernel tree.
+  please read Documentation/x86/x86_64/machinecheck.rst at the Kernel tree.
 
 EDAC - Error Detection And Correction
 *************************************
diff --git a/Documentation/devicetree/bindings/net/fsl-enetc.txt b/Documentation/devicetree/bindings/net/fsl-enetc.txt
index c812e25ae90f..25fc687419db 100644
--- a/Documentation/devicetree/bindings/net/fsl-enetc.txt
+++ b/Documentation/devicetree/bindings/net/fsl-enetc.txt
@@ -16,8 +16,8 @@ Required properties:
 In this case, the ENETC node should include a "mdio" sub-node
 that in turn should contain the "ethernet-phy" node describing the
 external phy.  Below properties are required, their bindings
-already defined in ethernet.txt or phy.txt, under
-Documentation/devicetree/bindings/net/*.
+already defined in Documentation/devicetree/bindings/net/ethernet.txt or
+Documentation/devicetree/bindings/net/phy.txt.
 
 Required:
 
@@ -51,8 +51,7 @@ Example:
 connection:
 
 In this case, the ENETC port node defines a fixed link connection,
-as specified by "fixed-link.txt", under
-Documentation/devicetree/bindings/net/*.
+as specified by Documentation/devicetree/bindings/net/fixed-link.txt.
 
 Required:
 
diff --git a/Documentation/devicetree/bindings/pci/amlogic,meson-pcie.txt b/Documentation/devicetree/bindings/pci/amlogic,meson-pcie.txt
index 12b18f82d441..efa2c8b9b85a 100644
--- a/Documentation/devicetree/bindings/pci/amlogic,meson-pcie.txt
+++ b/Documentation/devicetree/bindings/pci/amlogic,meson-pcie.txt
@@ -3,7 +3,7 @@ Amlogic Meson AXG DWC PCIE SoC controller
 Amlogic Meson PCIe host controller is based on the Synopsys DesignWare PCI core.
 It shares common functions with the PCIe DesignWare core driver and
 inherits common properties defined in
-Documentation/devicetree/bindings/pci/designware-pci.txt.
+Documentation/devicetree/bindings/pci/designware-pcie.txt.
 
 Additional properties are described here:
 
diff --git a/Documentation/devicetree/bindings/regulator/qcom,rpmh-regulator.txt b/Documentation/devicetree/bindings/regulator/qcom,rpmh-regulator.txt
index 7ef2dbe48e8a..14d2eee96b3d 100644
--- a/Documentation/devicetree/bindings/regulator/qcom,rpmh-regulator.txt
+++ b/Documentation/devicetree/bindings/regulator/qcom,rpmh-regulator.txt
@@ -97,7 +97,7 @@ Second Level Nodes - Regulators
 		    sent for this regulator including those which are for a
 		    strictly lower power state.
 
-Other properties defined in Documentation/devicetree/bindings/regulator.txt
+Other properties defined in Documentation/devicetree/bindings/regulator/regulator.txt
 may also be used.  regulator-initial-mode and regulator-allowed-modes may be
 specified for VRM regulators using mode values from
 include/dt-bindings/regulator/qcom,rpmh-regulator.h.  regulator-allow-bypass
diff --git a/Documentation/devicetree/booting-without-of.txt b/Documentation/devicetree/booting-without-of.txt
index e86bd2f64117..60f8640f2b2f 100644
--- a/Documentation/devicetree/booting-without-of.txt
+++ b/Documentation/devicetree/booting-without-of.txt
@@ -277,7 +277,7 @@ it with special cases.
   the decompressor (the real mode entry point goes to the same  32bit
   entry point once it switched into protected mode). That entry point
   supports one calling convention which is documented in
-  Documentation/x86/boot.txt
+  Documentation/x86/boot.rst
   The physical pointer to the device-tree block (defined in chapter II)
   is passed via setup_data which requires at least boot protocol 2.09.
   The type filed is defined as
diff --git a/Documentation/driver-api/gpio/board.rst b/Documentation/driver-api/gpio/board.rst
index b37f3f7b8926..ce91518bf9f4 100644
--- a/Documentation/driver-api/gpio/board.rst
+++ b/Documentation/driver-api/gpio/board.rst
@@ -101,7 +101,7 @@ with the help of _DSD (Device Specific Data), introduced in ACPI 5.1::
 	}
 
 For more information about the ACPI GPIO bindings see
-Documentation/acpi/gpio-properties.txt.
+Documentation/firmware-guide/acpi/gpio-properties.rst.
 
 Platform Data
 -------------
diff --git a/Documentation/driver-api/gpio/consumer.rst b/Documentation/driver-api/gpio/consumer.rst
index 5e4d8aa68913..fdecb6d711db 100644
--- a/Documentation/driver-api/gpio/consumer.rst
+++ b/Documentation/driver-api/gpio/consumer.rst
@@ -437,7 +437,7 @@ case, it will be handled by the GPIO subsystem automatically.  However, if the
 _DSD is not present, the mappings between GpioIo()/GpioInt() resources and GPIO
 connection IDs need to be provided by device drivers.
 
-For details refer to Documentation/acpi/gpio-properties.txt
+For details refer to Documentation/firmware-guide/acpi/gpio-properties.rst
 
 
 Interacting With the Legacy GPIO Subsystem
diff --git a/Documentation/firmware-guide/acpi/enumeration.rst b/Documentation/firmware-guide/acpi/enumeration.rst
index 850be9696931..1252617b520f 100644
--- a/Documentation/firmware-guide/acpi/enumeration.rst
+++ b/Documentation/firmware-guide/acpi/enumeration.rst
@@ -339,7 +339,7 @@ a code like this::
 There are also devm_* versions of these functions which release the
 descriptors once the device is released.
 
-See Documentation/acpi/gpio-properties.txt for more information about the
+See Documentation/firmware-guide/acpi/gpio-properties.rst for more information about the
 _DSD binding related to GPIOs.
 
 MFD devices
diff --git a/Documentation/firmware-guide/acpi/method-tracing.rst b/Documentation/firmware-guide/acpi/method-tracing.rst
index d0b077b73f5f..0aa7e2c5d32a 100644
--- a/Documentation/firmware-guide/acpi/method-tracing.rst
+++ b/Documentation/firmware-guide/acpi/method-tracing.rst
@@ -68,7 +68,7 @@ c. Filter out the debug layer/level matched logs when the specified
 
 Where:
    0xXXXXXXXX/0xYYYYYYYY
-     Refer to Documentation/acpi/debug.txt for possible debug layer/level
+     Refer to Documentation/firmware-guide/acpi/debug.rst for possible debug layer/level
      masking values.
    \PPPP.AAAA.TTTT.HHHH
      Full path of a control method that can be found in the ACPI namespace.
diff --git a/Documentation/i2c/instantiating-devices b/Documentation/i2c/instantiating-devices
index 0d85ac1935b7..5a3e2f331e8c 100644
--- a/Documentation/i2c/instantiating-devices
+++ b/Documentation/i2c/instantiating-devices
@@ -85,7 +85,7 @@ Method 1c: Declare the I2C devices via ACPI
 -------------------------------------------
 
 ACPI can also describe I2C devices. There is special documentation for this
-which is currently located at Documentation/acpi/enumeration.txt.
+which is currently located at Documentation/firmware-guide/acpi/enumeration.rst.
 
 
 Method 2: Instantiate the devices explicitly
diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt
index f0c86fbb3b48..92f7f34b021a 100644
--- a/Documentation/sysctl/kernel.txt
+++ b/Documentation/sysctl/kernel.txt
@@ -155,7 +155,7 @@ is 0x15 and the full version number is 0x234, this file will contain
 the value 340 = 0x154.
 
 See the type_of_loader and ext_loader_type fields in
-Documentation/x86/boot.txt for additional information.
+Documentation/x86/boot.rst for additional information.
 
 ==============================================================
 
@@ -167,7 +167,7 @@ The complete bootloader version number.  In the example above, this
 file will contain the value 564 = 0x234.
 
 See the type_of_loader and ext_loader_ver fields in
-Documentation/x86/boot.txt for additional information.
+Documentation/x86/boot.rst for additional information.
 
 ==============================================================
 
diff --git a/Documentation/translations/zh_CN/process/4.Coding.rst b/Documentation/translations/zh_CN/process/4.Coding.rst
index 5301e9d55255..8bb777941394 100644
--- a/Documentation/translations/zh_CN/process/4.Coding.rst
+++ b/Documentation/translations/zh_CN/process/4.Coding.rst
@@ -241,7 +241,7 @@ scripts/coccinelle目录下已经打包了相当多的内核“语义补丁”
 
 任何添加新用户空间界面的代码（包括新的sysfs或/proc文件）都应该包含该界面的
 文档，该文档使用户空间开发人员能够知道他们在使用什么。请参阅
-Documentation/abi/readme，了解如何格式化此文档以及需要提供哪些信息。
+Documentation/ABI/README，了解如何格式化此文档以及需要提供哪些信息。
 
 文件 :ref:`Documentation/admin-guide/kernel-parameters.rst <kernelparameters>`
 描述了内核的所有引导时间参数。任何添加新参数的补丁都应该向该文件添加适当的
diff --git a/Documentation/x86/x86_64/5level-paging.rst b/Documentation/x86/x86_64/5level-paging.rst
index ab88a4514163..44856417e6a5 100644
--- a/Documentation/x86/x86_64/5level-paging.rst
+++ b/Documentation/x86/x86_64/5level-paging.rst
@@ -20,7 +20,7 @@ physical address space. This "ought to be enough for anybody" ©.
 QEMU 2.9 and later support 5-level paging.
 
 Virtual memory layout for 5-level paging is described in
-Documentation/x86/x86_64/mm.txt
+Documentation/x86/x86_64/mm.rst
 
 
 Enabling 5-level paging
diff --git a/Documentation/x86/x86_64/boot-options.rst b/Documentation/x86/x86_64/boot-options.rst
index 2f69836b8445..6a4285a3c7a4 100644
--- a/Documentation/x86/x86_64/boot-options.rst
+++ b/Documentation/x86/x86_64/boot-options.rst
@@ -9,7 +9,7 @@ only the AMD64 specific ones are listed here.
 
 Machine check
 =============
-Please see Documentation/x86/x86_64/machinecheck for sysfs runtime tunables.
+Please see Documentation/x86/x86_64/machinecheck.rst for sysfs runtime tunables.
 
    mce=off
 		Disable machine check
@@ -89,7 +89,7 @@ APICs
      Don't use the local APIC (alias for i386 compatibility)
 
    pirq=...
-	See Documentation/x86/i386/IO-APIC.txt
+	See Documentation/x86/i386/IO-APIC.rst
 
    noapictimer
 	Don't set up the APIC timer
diff --git a/Documentation/x86/x86_64/fake-numa-for-cpusets.rst b/Documentation/x86/x86_64/fake-numa-for-cpusets.rst
index 74fbb78b3c67..04df57b9aa3f 100644
--- a/Documentation/x86/x86_64/fake-numa-for-cpusets.rst
+++ b/Documentation/x86/x86_64/fake-numa-for-cpusets.rst
@@ -18,7 +18,7 @@ For more information on the features of cpusets, see
 Documentation/cgroup-v1/cpusets.txt.
 There are a number of different configurations you can use for your needs.  For
 more information on the numa=fake command line option and its various ways of
-configuring fake nodes, see Documentation/x86/x86_64/boot-options.txt.
+configuring fake nodes, see Documentation/x86/x86_64/boot-options.rst.
 
 For the purposes of this introduction, we'll assume a very primitive NUMA
 emulation setup of "numa=fake=4*512,".  This will split our system memory into
diff --git a/MAINTAINERS b/MAINTAINERS
index 5cfbea4ce575..26e0369c1641 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -3874,7 +3874,7 @@ F:	Documentation/devicetree/bindings/hwmon/cirrus,lochnagar.txt
 F:	Documentation/devicetree/bindings/pinctrl/cirrus,lochnagar.txt
 F:	Documentation/devicetree/bindings/regulator/cirrus,lochnagar.txt
 F:	Documentation/devicetree/bindings/sound/cirrus,lochnagar.txt
-F:	Documentation/hwmon/lochnagar
+F:	Documentation/hwmon/lochnagar.rst
 
 CISCO FCOE HBA DRIVER
 M:	Satish Kharat <satishkh@cisco.com>
@@ -11272,7 +11272,7 @@ NXP FXAS21002C DRIVER
 M:	Rui Miguel Silva <rmfrfs@gmail.com>
 L:	linux-iio@vger.kernel.org
 S:	Maintained
-F:	Documentation/devicetree/bindings/iio/gyroscope/fxas21002c.txt
+F:	Documentation/devicetree/bindings/iio/gyroscope/nxp,fxas21002c.txt
 F:	drivers/iio/gyro/fxas21002c_core.c
 F:	drivers/iio/gyro/fxas21002c.h
 F:	drivers/iio/gyro/fxas21002c_i2c.c
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index 8869742a85df..0f220264cc23 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -1263,7 +1263,7 @@ config SMP
 	  uniprocessor machines. On a uniprocessor machine, the kernel
 	  will run faster if you say N here.
 
-	  See also <file:Documentation/x86/i386/IO-APIC.txt>,
+	  See also <file:Documentation/x86/i386/IO-APIC.rst>,
 	  <file:Documentation/lockup-watchdogs.txt> and the SMP-HOWTO available at
 	  <http://tldp.org/HOWTO/SMP-HOWTO.html>.
 
diff --git a/arch/arm64/kernel/kexec_image.c b/arch/arm64/kernel/kexec_image.c
index 07bf740bea91..31cc2f423aa8 100644
--- a/arch/arm64/kernel/kexec_image.c
+++ b/arch/arm64/kernel/kexec_image.c
@@ -53,7 +53,7 @@ static void *image_load(struct kimage *image,
 
 	/*
 	 * We require a kernel with an unambiguous Image header. Per
-	 * Documentation/booting.txt, this is the case when image_size
+	 * Documentation/arm64/booting.txt, this is the case when image_size
 	 * is non-zero (practically speaking, since v3.17).
 	 */
 	h = (struct arm64_image_header *)kernel;
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index d87d53fcd261..9f1f7b47621c 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -395,7 +395,7 @@ config SMP
 	  Y to "Enhanced Real Time Clock Support", below. The "Advanced Power
 	  Management" code will be disabled if you say Y here.
 
-	  See also <file:Documentation/x86/i386/IO-APIC.txt>,
+	  See also <file:Documentation/x86/i386/IO-APIC.rst>,
 	  <file:Documentation/lockup-watchdogs.txt> and the SMP-HOWTO available at
 	  <http://www.tldp.org/docs.html#howto>.
 
@@ -1290,7 +1290,7 @@ config MICROCODE
 	  the Linux kernel.
 
 	  The preferred method to load microcode from a detached initrd is described
-	  in Documentation/x86/microcode.txt. For that you need to enable
+	  in Documentation/x86/microcode.rst. For that you need to enable
 	  CONFIG_BLK_DEV_INITRD in order for the loader to be able to scan the
 	  initrd for microcode blobs.
 
@@ -1329,7 +1329,7 @@ config MICROCODE_OLD_INTERFACE
 	  It is inadequate because it runs too late to be able to properly
 	  load microcode on a machine and it needs special tools. Instead, you
 	  should've switched to the early loading method with the initrd or
-	  builtin microcode by now: Documentation/x86/microcode.txt
+	  builtin microcode by now: Documentation/x86/microcode.rst
 
 config X86_MSR
 	tristate "/dev/cpu/*/msr - Model-specific register support"
@@ -1478,7 +1478,7 @@ config X86_5LEVEL
 	  A kernel with the option enabled can be booted on machines that
 	  support 4- or 5-level paging.
 
-	  See Documentation/x86/x86_64/5level-paging.txt for more
+	  See Documentation/x86/x86_64/5level-paging.rst for more
 	  information.
 
 	  Say N if unsure.
@@ -1626,7 +1626,7 @@ config ARCH_MEMORY_PROBE
 	depends on X86_64 && MEMORY_HOTPLUG
 	help
 	  This option enables a sysfs memory/probe interface for testing.
-	  See Documentation/memory-hotplug.txt for more information.
+	  See Documentation/admin-guide/mm/memory-hotplug.rst for more information.
 	  If you are unsure how to answer this question, answer N.
 
 config ARCH_PROC_KCORE_TEXT
@@ -1783,7 +1783,7 @@ config MTRR
 	  You can safely say Y even if your machine doesn't have MTRRs, you'll
 	  just add about 9 KB to your kernel.
 
-	  See <file:Documentation/x86/mtrr.txt> for more information.
+	  See <file:Documentation/x86/mtrr.rst> for more information.
 
 config MTRR_SANITIZER
 	def_bool y
@@ -1895,7 +1895,7 @@ config X86_INTEL_MPX
 	  process and adds some branches to paths used during
 	  exec() and munmap().
 
-	  For details, see Documentation/x86/intel_mpx.txt
+	  For details, see Documentation/x86/intel_mpx.rst
 
 	  If unsure, say N.
 
diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index f730680dc818..59f598543203 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -156,7 +156,7 @@ config IOMMU_DEBUG
 	  code. When you use it make sure you have a big enough
 	  IOMMU/AGP aperture.  Most of the options enabled by this can
 	  be set more finegrained using the iommu= command line
-	  options. See Documentation/x86/x86_64/boot-options.txt for more
+	  options. See Documentation/x86/x86_64/boot-options.rst for more
 	  details.
 
 config IOMMU_LEAK
diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S
index 850b8762e889..90d791ca1a95 100644
--- a/arch/x86/boot/header.S
+++ b/arch/x86/boot/header.S
@@ -313,7 +313,7 @@ start_sys_seg:	.word	SYSSEG		# obsolete and meaningless, but just
 
 type_of_loader:	.byte	0		# 0 means ancient bootloader, newer
 					# bootloaders know to change this.
-					# See Documentation/x86/boot.txt for
+					# See Documentation/x86/boot.rst for
 					# assigned ids
 
 # flags, unused bits must be zero (RFU) bit within loadflags
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 11aa3b2afa4d..33f9fc38d014 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -8,7 +8,7 @@
  *
  * entry.S contains the system-call and fault low-level handling routines.
  *
- * Some of this is documented in Documentation/x86/entry_64.txt
+ * Some of this is documented in Documentation/x86/entry_64.rst
  *
  * A note on terminology:
  * - iret frame:	Architecture defined interrupt frame from SS to RIP
diff --git a/arch/x86/include/asm/bootparam_utils.h b/arch/x86/include/asm/bootparam_utils.h
index f6f6ef436599..101eb944f13c 100644
--- a/arch/x86/include/asm/bootparam_utils.h
+++ b/arch/x86/include/asm/bootparam_utils.h
@@ -24,7 +24,7 @@ static void sanitize_boot_params(struct boot_params *boot_params)
 	 * IMPORTANT NOTE TO BOOTLOADER AUTHORS: do not simply clear
 	 * this field.  The purpose of this field is to guarantee
 	 * compliance with the x86 boot spec located in
-	 * Documentation/x86/boot.txt .  That spec says that the
+	 * Documentation/x86/boot.rst .  That spec says that the
 	 * *whole* structure should be cleared, after which only the
 	 * portion defined by struct setup_header (boot_params->hdr)
 	 * should be copied in.
diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h
index 793c14c372cb..288b065955b7 100644
--- a/arch/x86/include/asm/page_64_types.h
+++ b/arch/x86/include/asm/page_64_types.h
@@ -48,7 +48,7 @@
 
 #define __START_KERNEL_map	_AC(0xffffffff80000000, UL)
 
-/* See Documentation/x86/x86_64/mm.txt for a description of the memory map. */
+/* See Documentation/x86/x86_64/mm.rst for a description of the memory map. */
 
 #define __PHYSICAL_MASK_SHIFT	52
 
diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h
index 88bca456da99..52e5f5f2240d 100644
--- a/arch/x86/include/asm/pgtable_64_types.h
+++ b/arch/x86/include/asm/pgtable_64_types.h
@@ -103,7 +103,7 @@ extern unsigned int ptrs_per_p4d;
 #define PGDIR_MASK	(~(PGDIR_SIZE - 1))
 
 /*
- * See Documentation/x86/x86_64/mm.txt for a description of the memory map.
+ * See Documentation/x86/x86_64/mm.rst for a description of the memory map.
  *
  * Be very careful vs. KASLR when changing anything here. The KASLR address
  * range must not overlap with anything except the KASAN shadow area, which
diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c
index e1f3ba19ba54..06d4e67f31ab 100644
--- a/arch/x86/kernel/cpu/microcode/amd.c
+++ b/arch/x86/kernel/cpu/microcode/amd.c
@@ -61,7 +61,7 @@ static u8 amd_ucode_patch[PATCH_MAX_SIZE];
 
 /*
  * Microcode patch container file is prepended to the initrd in cpio
- * format. See Documentation/x86/microcode.txt
+ * format. See Documentation/x86/microcode.rst
  */
 static const char
 ucode_path[] __maybe_unused = "kernel/x86/microcode/AuthenticAMD.bin";
diff --git a/arch/x86/kernel/kexec-bzimage64.c b/arch/x86/kernel/kexec-bzimage64.c
index 22f60dd26460..b07e7069b09e 100644
--- a/arch/x86/kernel/kexec-bzimage64.c
+++ b/arch/x86/kernel/kexec-bzimage64.c
@@ -416,7 +416,7 @@ static void *bzImage64_load(struct kimage *image, char *kernel,
 	efi_map_offset = params_cmdline_sz;
 	efi_setup_data_offset = efi_map_offset + ALIGN(efi_map_sz, 16);
 
-	/* Copy setup header onto bootparams. Documentation/x86/boot.txt */
+	/* Copy setup header onto bootparams. Documentation/x86/boot.rst */
 	setup_header_size = 0x0202 + kernel[0x0201] - setup_hdr_offset;
 
 	/* Is there a limit on setup header size? */
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index dcd272dbd0a9..f62b498b18fb 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -70,7 +70,7 @@ void __init pci_iommu_alloc(void)
 }
 
 /*
- * See <Documentation/x86/x86_64/boot-options.txt> for the iommu kernel
+ * See <Documentation/x86/x86_64/boot-options.rst> for the iommu kernel
  * parameter documentation.
  */
 static __init int iommu_setup(char *p)
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 7f61431c75fb..400c1ba033aa 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -711,7 +711,7 @@ void native_flush_tlb_others(const struct cpumask *cpumask,
 }
 
 /*
- * See Documentation/x86/tlb.txt for details.  We choose 33
+ * See Documentation/x86/tlb.rst for details.  We choose 33
  * because it is large enough to cover the vast majority (at
  * least 95%) of allocations, and is small enough that we are
  * confident it will not cause too much overhead.  Each single
diff --git a/arch/x86/platform/pvh/enlighten.c b/arch/x86/platform/pvh/enlighten.c
index 1861a2ba0f2b..c0a502f7e3a7 100644
--- a/arch/x86/platform/pvh/enlighten.c
+++ b/arch/x86/platform/pvh/enlighten.c
@@ -86,7 +86,7 @@ static void __init init_pvh_bootparams(bool xen_guest)
 	}
 
 	/*
-	 * See Documentation/x86/boot.txt.
+	 * See Documentation/x86/boot.rst.
 	 *
 	 * Version 2.12 supports Xen entry point but we will use default x86/PC
 	 * environment (i.e. hardware_subarch 0).
diff --git a/drivers/acpi/Kconfig b/drivers/acpi/Kconfig
index 283ee94224c6..2438f37f2ca1 100644
--- a/drivers/acpi/Kconfig
+++ b/drivers/acpi/Kconfig
@@ -333,7 +333,7 @@ config ACPI_CUSTOM_DSDT_FILE
 	depends on !STANDALONE
 	help
 	  This option supports a custom DSDT by linking it into the kernel.
-	  See Documentation/acpi/dsdt-override.txt
+	  See Documentation/admin-guide/acpi/dsdt-override.rst
 
 	  Enter the full path name to the file which includes the AmlCode
 	  or dsdt_aml_code declaration.
@@ -355,7 +355,7 @@ config ACPI_TABLE_UPGRADE
 	  This option provides functionality to upgrade arbitrary ACPI tables
 	  via initrd. No functional change if no ACPI tables are passed via
 	  initrd, therefore it's safe to say Y.
-	  See Documentation/acpi/initrd_table_override.txt for details
+	  See Documentation/admin-guide/acpi/initrd_table_override.rst for details
 
 config ACPI_TABLE_OVERRIDE_VIA_BUILTIN_INITRD
 	bool "Override ACPI tables from built-in initrd"
@@ -365,7 +365,7 @@ config ACPI_TABLE_OVERRIDE_VIA_BUILTIN_INITRD
 	  This option provides functionality to override arbitrary ACPI tables
 	  from built-in uncompressed initrd.
 
-	  See Documentation/acpi/initrd_table_override.txt for details
+	  See Documentation/admin-guide/acpi/initrd_table_override.rst for details
 
 config ACPI_DEBUG
 	bool "Debug Statements"
@@ -374,7 +374,7 @@ config ACPI_DEBUG
 	  output and increases the kernel size by around 50K.
 
 	  Use the acpi.debug_layer and acpi.debug_level kernel command-line
-	  parameters documented in Documentation/acpi/debug.txt and
+	  parameters documented in Documentation/firmware-guide/acpi/debug.rst and
 	  Documentation/admin-guide/kernel-parameters.rst to control the type and
 	  amount of debug output.
 
@@ -445,7 +445,7 @@ config ACPI_CUSTOM_METHOD
 	help
 	  This debug facility allows ACPI AML methods to be inserted and/or
 	  replaced without rebooting the system. For details refer to:
-	  Documentation/acpi/method-customizing.txt.
+	  Documentation/firmware-guide/acpi/method-customizing.rst.
 
 	  NOTE: This option is security sensitive, because it allows arbitrary
 	  kernel memory to be written to by root (uid=0) users, allowing them
diff --git a/drivers/net/ethernet/faraday/ftgmac100.c b/drivers/net/ethernet/faraday/ftgmac100.c
index b17b79e612a3..ac6280ad43a1 100644
--- a/drivers/net/ethernet/faraday/ftgmac100.c
+++ b/drivers/net/ethernet/faraday/ftgmac100.c
@@ -1075,7 +1075,7 @@ static int ftgmac100_mii_probe(struct ftgmac100 *priv, phy_interface_t intf)
 	}
 
 	/* Indicate that we support PAUSE frames (see comment in
-	 * Documentation/networking/phy.txt)
+	 * Documentation/networking/phy.rst)
 	 */
 	phy_support_asym_pause(phydev);
 
diff --git a/drivers/staging/fieldbus/Documentation/fieldbus_dev.txt b/drivers/staging/fieldbus/Documentation/fieldbus_dev.txt
index 56af3f650fa3..89fb8e14676f 100644
--- a/drivers/staging/fieldbus/Documentation/fieldbus_dev.txt
+++ b/drivers/staging/fieldbus/Documentation/fieldbus_dev.txt
@@ -54,8 +54,8 @@ a limited few common behaviours and properties. This allows us to define
 a simple interface consisting of a character device and a set of sysfs files:
 
 See:
-Documentation/ABI/testing/sysfs-class-fieldbus-dev
-Documentation/ABI/testing/fieldbus-dev-cdev
+drivers/staging/fieldbus/Documentation/ABI/sysfs-class-fieldbus-dev
+drivers/staging/fieldbus/Documentation/ABI/fieldbus-dev-cdev
 
 Note that this simple interface does not provide a way to modify adapter
 configuration settings. It is therefore useful only for adapters that get their
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 1e3ed41ae1f3..69938dbae2d0 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -1694,7 +1694,7 @@ EXPORT_SYMBOL_GPL(vhost_dev_ioctl);
 
 /* TODO: This is really inefficient.  We need something like get_user()
  * (instruction directly accesses the data, with an exception table entry
- * returning -EFAULT). See Documentation/x86/exception-tables.txt.
+ * returning -EFAULT). See Documentation/x86/exception-tables.rst.
  */
 static int set_bit_to_user(int nr, void __user *addr)
 {
diff --git a/include/acpi/acpi_drivers.h b/include/acpi/acpi_drivers.h
index de1804aeaf69..98e3db7a89cd 100644
--- a/include/acpi/acpi_drivers.h
+++ b/include/acpi/acpi_drivers.h
@@ -25,7 +25,7 @@
 #define ACPI_MAX_STRING			80
 
 /*
- * Please update drivers/acpi/debug.c and Documentation/acpi/debug.txt
+ * Please update drivers/acpi/debug.c and Documentation/firmware-guide/acpi/debug.rst
  * if you add to this list.
  */
 #define ACPI_BUS_COMPONENT		0x00010000
diff --git a/include/linux/fs_context.h b/include/linux/fs_context.h
index 1f966670c8dc..623eb58560b9 100644
--- a/include/linux/fs_context.h
+++ b/include/linux/fs_context.h
@@ -85,7 +85,7 @@ struct fs_parameter {
  * Superblock creation fills in ->root whereas reconfiguration begins with this
  * already set.
  *
- * See Documentation/filesystems/mounting.txt
+ * See Documentation/filesystems/mount_api.txt
  */
 struct fs_context {
 	const struct fs_context_operations *ops;
diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h
index 47f58cfb6a19..df1318d85f7d 100644
--- a/include/linux/lsm_hooks.h
+++ b/include/linux/lsm_hooks.h
@@ -77,7 +77,7 @@
  *	state.  This is called immediately after commit_creds().
  *
  * Security hooks for mount using fs_context.
- *	[See also Documentation/filesystems/mounting.txt]
+ *	[See also Documentation/filesystems/mount_api.txt]
  *
  * @fs_context_dup:
  *	Allocate and attach a security structure to sc->security.  This pointer
diff --git a/mm/Kconfig b/mm/Kconfig
index ee8d1f311858..6e5fb81bde4b 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -165,7 +165,7 @@ config MEMORY_HOTPLUG_DEFAULT_ONLINE
 	  onlining policy (/sys/devices/system/memory/auto_online_blocks) which
 	  determines what happens to newly added memory regions. Policy setting
 	  can always be changed at runtime.
-	  See Documentation/memory-hotplug.txt for more information.
+	  See Documentation/admin-guide/mm/memory-hotplug.rst for more information.
 
 	  Say Y here if you want all hot-plugged memory blocks to appear in
 	  'online' state by default.
diff --git a/security/Kconfig b/security/Kconfig
index aeac3676dd4d..6d75ed71970c 100644
--- a/security/Kconfig
+++ b/security/Kconfig
@@ -62,7 +62,7 @@ config PAGE_TABLE_ISOLATION
 	  ensuring that the majority of kernel addresses are not mapped
 	  into userspace.
 
-	  See Documentation/x86/pti.txt for more details.
+	  See Documentation/x86/pti.rst for more details.
 
 config SECURITY_INFINIBAND
 	bool "Infiniband Security Hooks"
diff --git a/tools/include/linux/err.h b/tools/include/linux/err.h
index 2f5a12b88a86..25f2bb3a991d 100644
--- a/tools/include/linux/err.h
+++ b/tools/include/linux/err.h
@@ -20,7 +20,7 @@
  * Userspace note:
  * The same principle works for userspace, because 'error' pointers
  * fall down to the unused hole far from user space, as described
- * in Documentation/x86/x86_64/mm.txt for x86_64 arch:
+ * in Documentation/x86/x86_64/mm.rst for x86_64 arch:
  *
  * 0000000000000000 - 00007fffffffffff (=47 bits) user space, different per mm hole caused by [48:63] sign extension
  * ffffffffffe00000 - ffffffffffffffff (=2 MB) unused hole
diff --git a/tools/objtool/Documentation/stack-validation.txt b/tools/objtool/Documentation/stack-validation.txt
index 4dd11a554b9b..de094670050b 100644
--- a/tools/objtool/Documentation/stack-validation.txt
+++ b/tools/objtool/Documentation/stack-validation.txt
@@ -21,7 +21,7 @@ instructions).  Similarly, it knows how to follow switch statements, for
 which gcc sometimes uses jump tables.
 
 (Objtool also has an 'orc generate' subcommand which generates debuginfo
-for the ORC unwinder.  See Documentation/x86/orc-unwinder.txt in the
+for the ORC unwinder.  See Documentation/x86/orc-unwinder.rst in the
 kernel tree for more details.)
 
 
@@ -101,7 +101,7 @@ b) ORC (Oops Rewind Capability) unwind table generation
    band.  So it doesn't affect runtime performance and it can be
    reliable even when interrupts or exceptions are involved.
 
-   For more details, see Documentation/x86/orc-unwinder.txt.
+   For more details, see Documentation/x86/orc-unwinder.rst.
 
 c) Higher live patching compatibility rate
 
-- 
cgit v1.2.3


From 5c437fa29561f5809ef114ba3a5e80556cc43fb3 Mon Sep 17 00:00:00 2001
From: Mauro Carvalho Chehab <mchehab+samsung@kernel.org>
Date: Fri, 7 Jun 2019 15:54:35 -0300
Subject: docs: fs: fix broken links to vfs.txt which was renamed to vfs.rst

A recent documentation conversion renamed this file but forgot
to update the links.

Fixes: af96c1e304f7 ("docs: filesystems: vfs: Convert vfs.txt to RST")
Signed-off-by: Mauro Carvalho Chehab <mchehab+samsung@kernel.org>
Signed-off-by: Jonathan Corbet <corbet@lwn.net>
---
 Documentation/filesystems/porting | 10 +++++-----
 include/linux/dcache.h            |  4 ++--
 include/linux/fs.h                |  2 +-
 3 files changed, 8 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/filesystems/porting b/Documentation/filesystems/porting
index 3bd1148d8bb6..2813a19389fe 100644
--- a/Documentation/filesystems/porting
+++ b/Documentation/filesystems/porting
@@ -330,14 +330,14 @@ unreferenced dentries, and is now only called when the dentry refcount goes to
 [mandatory]
 
 	.d_compare() calling convention and locking rules are significantly
-changed. Read updated documentation in Documentation/filesystems/vfs.txt (and
+changed. Read updated documentation in Documentation/filesystems/vfs.rst (and
 look at examples of other filesystems) for guidance.
 
 ---
 [mandatory]
 
 	.d_hash() calling convention and locking rules are significantly
-changed. Read updated documentation in Documentation/filesystems/vfs.txt (and
+changed. Read updated documentation in Documentation/filesystems/vfs.rst (and
 look at examples of other filesystems) for guidance.
 
 ---
@@ -377,12 +377,12 @@ where possible.
 the filesystem provides it), which requires dropping out of rcu-walk mode. This
 may now be called in rcu-walk mode (nd->flags & LOOKUP_RCU). -ECHILD should be
 returned if the filesystem cannot handle rcu-walk. See
-Documentation/filesystems/vfs.txt for more details.
+Documentation/filesystems/vfs.rst for more details.
 
 	permission is an inode permission check that is called on many or all
 directory inodes on the way down a path walk (to check for exec permission). It
 must now be rcu-walk aware (mask & MAY_NOT_BLOCK).  See
-Documentation/filesystems/vfs.txt for more details.
+Documentation/filesystems/vfs.rst for more details.
  
 --
 [mandatory]
@@ -625,7 +625,7 @@ in your dentry operations instead.
 --
 [mandatory]
 	->clone_file_range() and ->dedupe_file_range have been replaced with
-	->remap_file_range().  See Documentation/filesystems/vfs.txt for more
+	->remap_file_range().  See Documentation/filesystems/vfs.rst for more
 	information.
 --
 [recommended]
diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index f14e587c5d5d..5e0eadf7de55 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -153,7 +153,7 @@ struct dentry_operations {
  * Locking rules for dentry_operations callbacks are to be found in
  * Documentation/filesystems/Locking. Keep it updated!
  *
- * FUrther descriptions are found in Documentation/filesystems/vfs.txt.
+ * FUrther descriptions are found in Documentation/filesystems/vfs.rst.
  * Keep it updated too!
  */
 
@@ -568,7 +568,7 @@ static inline struct dentry *d_backing_dentry(struct dentry *upper)
  * If dentry is on a union/overlay, then return the underlying, real dentry.
  * Otherwise return the dentry itself.
  *
- * See also: Documentation/filesystems/vfs.txt
+ * See also: Documentation/filesystems/vfs.rst
  */
 static inline struct dentry *d_real(struct dentry *dentry,
 				    const struct inode *inode)
diff --git a/include/linux/fs.h b/include/linux/fs.h
index f7fdfe93e25d..c564cf3f48d9 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1769,7 +1769,7 @@ struct block_device_operations;
 /*
  * These flags control the behavior of the remap_file_range function pointer.
  * If it is called with len == 0 that means "remap to end of source file".
- * See Documentation/filesystems/vfs.txt for more details about this call.
+ * See Documentation/filesystems/vfs.rst for more details about this call.
  *
  * REMAP_FILE_DEDUP: only remap if contents identical (i.e. deduplicate)
  * REMAP_FILE_CAN_SHORTEN: caller can handle a shortened request
-- 
cgit v1.2.3


From d461933638ae9fa49ad22f60a40de5b3ed414912 Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <olteanv@gmail.com>
Date: Sat, 8 Jun 2019 15:04:29 +0300
Subject: net: dsa: tag_8021q: Create helper function for removing VLAN header

This removes the existing implementation from tag_sja1105, which was
partially incorrect (it was not changing the MAC header offset, thereby
leaving it to point 4 bytes earlier than it should have).

This overwrites the VLAN tag by moving the Ethernet source and
destination MACs 4 bytes to the right. Then skb->data (assumed to be
pointing immediately after the EtherType) is temporarily pushed to the
beginning of the new Ethernet header, the new Ethernet header offset and
length are recorded, then skb->data is moved back to where it was.

Signed-off-by: Vladimir Oltean <olteanv@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/dsa/8021q.h | 16 ++++++-------
 net/dsa/tag_8021q.c       | 57 ++++++++++++++++++++++++++++++-----------------
 net/dsa/tag_sja1105.c     | 19 ++++++++--------
 3 files changed, 53 insertions(+), 39 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/dsa/8021q.h b/include/linux/dsa/8021q.h
index 3911e0586478..0aa803c451a3 100644
--- a/include/linux/dsa/8021q.h
+++ b/include/linux/dsa/8021q.h
@@ -20,9 +20,6 @@ int dsa_port_setup_8021q_tagging(struct dsa_switch *ds, int index,
 struct sk_buff *dsa_8021q_xmit(struct sk_buff *skb, struct net_device *netdev,
 			       u16 tpid, u16 tci);
 
-struct sk_buff *dsa_8021q_rcv(struct sk_buff *skb, struct net_device *netdev,
-			      struct packet_type *pt, u16 *tpid, u16 *tci);
-
 u16 dsa_8021q_tx_vid(struct dsa_switch *ds, int port);
 
 u16 dsa_8021q_rx_vid(struct dsa_switch *ds, int port);
@@ -31,6 +28,8 @@ int dsa_8021q_rx_switch_id(u16 vid);
 
 int dsa_8021q_rx_source_port(u16 vid);
 
+struct sk_buff *dsa_8021q_remove_header(struct sk_buff *skb);
+
 #else
 
 int dsa_port_setup_8021q_tagging(struct dsa_switch *ds, int index,
@@ -45,12 +44,6 @@ struct sk_buff *dsa_8021q_xmit(struct sk_buff *skb, struct net_device *netdev,
 	return NULL;
 }
 
-struct sk_buff *dsa_8021q_rcv(struct sk_buff *skb, struct net_device *netdev,
-			      struct packet_type *pt, u16 *tpid, u16 *tci)
-{
-	return NULL;
-}
-
 u16 dsa_8021q_tx_vid(struct dsa_switch *ds, int port)
 {
 	return 0;
@@ -71,6 +64,11 @@ int dsa_8021q_rx_source_port(u16 vid)
 	return 0;
 }
 
+struct sk_buff *dsa_8021q_remove_header(struct sk_buff *skb)
+{
+	return NULL;
+}
+
 #endif /* IS_ENABLED(CONFIG_NET_DSA_TAG_8021Q) */
 
 #endif /* _NET_DSA_8021Q_H */
diff --git a/net/dsa/tag_8021q.c b/net/dsa/tag_8021q.c
index 65a35e976d7b..6ebbd799c4eb 100644
--- a/net/dsa/tag_8021q.c
+++ b/net/dsa/tag_8021q.c
@@ -235,31 +235,48 @@ struct sk_buff *dsa_8021q_xmit(struct sk_buff *skb, struct net_device *netdev,
 }
 EXPORT_SYMBOL_GPL(dsa_8021q_xmit);
 
-struct sk_buff *dsa_8021q_rcv(struct sk_buff *skb, struct net_device *netdev,
-			      struct packet_type *pt, u16 *tpid, u16 *tci)
+/* In the DSA packet_type handler, skb->data points in the middle of the VLAN
+ * tag, after tpid and before tci. This is because so far, ETH_HLEN
+ * (DMAC, SMAC, EtherType) bytes were pulled.
+ * There are 2 bytes of VLAN tag left in skb->data, and upper
+ * layers expect the 'real' EtherType to be consumed as well.
+ * Coincidentally, a VLAN header is also of the same size as
+ * the number of bytes that need to be pulled.
+ *
+ * skb_mac_header                                      skb->data
+ * |                                                       |
+ * v                                                       v
+ * |   |   |   |   |   |   |   |   |   |   |   |   |   |   |   |   |   |   |
+ * +-----------------------+-----------------------+-------+-------+-------+
+ * |    Destination MAC    |      Source MAC       |  TPID |  TCI  | EType |
+ * +-----------------------+-----------------------+-------+-------+-------+
+ * ^                                               |               |
+ * |<--VLAN_HLEN-->to                              <---VLAN_HLEN--->
+ * from            |
+ *       >>>>>>>   v
+ *       >>>>>>>   |   |   |   |   |   |   |   |   |   |   |   |   |   |   |
+ *       >>>>>>>   +-----------------------+-----------------------+-------+
+ *       >>>>>>>   |    Destination MAC    |      Source MAC       | EType |
+ *                 +-----------------------+-----------------------+-------+
+ *                 ^                                                       ^
+ * (now part of    |                                                       |
+ *  skb->head)     skb_mac_header                                  skb->data
+ */
+struct sk_buff *dsa_8021q_remove_header(struct sk_buff *skb)
 {
-	struct vlan_ethhdr *tag;
-
-	if (unlikely(!pskb_may_pull(skb, VLAN_HLEN)))
-		return NULL;
+	u8 *from = skb_mac_header(skb);
+	u8 *dest = from + VLAN_HLEN;
 
-	tag = vlan_eth_hdr(skb);
-	*tpid = ntohs(tag->h_vlan_proto);
-	*tci = ntohs(tag->h_vlan_TCI);
-
-	/* skb->data points in the middle of the VLAN tag,
-	 * after tpid and before tci. This is because so far,
-	 * ETH_HLEN (DMAC, SMAC, EtherType) bytes were pulled.
-	 * There are 2 bytes of VLAN tag left in skb->data, and upper
-	 * layers expect the 'real' EtherType to be consumed as well.
-	 * Coincidentally, a VLAN header is also of the same size as
-	 * the number of bytes that need to be pulled.
-	 */
-	skb_pull_rcsum(skb, VLAN_HLEN);
+	memmove(dest, from, ETH_HLEN - VLAN_HLEN);
+	skb_pull(skb, VLAN_HLEN);
+	skb_push(skb, ETH_HLEN);
+	skb_reset_mac_header(skb);
+	skb_reset_mac_len(skb);
+	skb_pull_rcsum(skb, ETH_HLEN);
 
 	return skb;
 }
-EXPORT_SYMBOL_GPL(dsa_8021q_rcv);
+EXPORT_SYMBOL_GPL(dsa_8021q_remove_header);
 
 static const struct dsa_device_ops dsa_8021q_netdev_ops = {
 	.name		= "8021q",
diff --git a/net/dsa/tag_sja1105.c b/net/dsa/tag_sja1105.c
index d43737e6c3fb..77eeea004e92 100644
--- a/net/dsa/tag_sja1105.c
+++ b/net/dsa/tag_sja1105.c
@@ -66,17 +66,14 @@ static struct sk_buff *sja1105_rcv(struct sk_buff *skb,
 				   struct net_device *netdev,
 				   struct packet_type *pt)
 {
-	struct ethhdr *hdr = eth_hdr(skb);
-	u64 source_port, switch_id;
-	struct sk_buff *nskb;
+	int source_port, switch_id;
+	struct vlan_ethhdr *hdr;
 	u16 tpid, vid, tci;
 	bool is_tagged;
 
-	nskb = dsa_8021q_rcv(skb, netdev, pt, &tpid, &tci);
-	is_tagged = (nskb && tpid == ETH_P_SJA1105);
-
-	skb->priority = (tci & VLAN_PRIO_MASK) >> VLAN_PRIO_SHIFT;
-	vid = tci & VLAN_VID_MASK;
+	hdr = vlan_eth_hdr(skb);
+	tpid = ntohs(hdr->h_vlan_proto);
+	is_tagged = (tpid == ETH_P_SJA1105);
 
 	skb->offload_fwd_mark = 1;
 
@@ -92,8 +89,11 @@ static struct sk_buff *sja1105_rcv(struct sk_buff *skb,
 		hdr->h_dest[4] = 0;
 	} else {
 		/* Normal traffic path. */
+		tci = ntohs(hdr->h_vlan_TCI);
+		vid = tci & VLAN_VID_MASK;
 		source_port = dsa_8021q_rx_source_port(vid);
 		switch_id = dsa_8021q_rx_switch_id(vid);
+		skb->priority = (tci & VLAN_PRIO_MASK) >> VLAN_PRIO_SHIFT;
 	}
 
 	skb->dev = dsa_master_find_slave(netdev, switch_id, source_port);
@@ -106,8 +106,7 @@ static struct sk_buff *sja1105_rcv(struct sk_buff *skb,
 	 * it there, see dsa_switch_rcv: skb_push(skb, ETH_HLEN).
 	 */
 	if (is_tagged)
-		memmove(skb->data - ETH_HLEN, skb->data - ETH_HLEN - VLAN_HLEN,
-			ETH_HLEN - VLAN_HLEN);
+		skb = dsa_8021q_remove_header(skb);
 
 	return skb;
 }
-- 
cgit v1.2.3


From 47ed985e97f513b7746270e8c5d1f3a3f959b2da Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <olteanv@gmail.com>
Date: Sat, 8 Jun 2019 15:04:35 +0300
Subject: net: dsa: sja1105: Add logic for TX timestamping

On TX, timestamping is performed synchronously from the
port_deferred_xmit worker thread.
In management routes, the switch is requested to take egress timestamps
(again partial), which are reconstructed and appended to a clone of the
skb that was just sent.  The cloning is done by DSA and we retrieve the
pointer from the structure that DSA keeps in skb->cb.
Then these clones are enqueued to the socket's error queue for
application-level processing.

Signed-off-by: Vladimir Oltean <olteanv@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/dsa/sja1105/sja1105.h      |  10 ++++
 drivers/net/dsa/sja1105/sja1105_main.c |  55 ++++++++++++++++-
 drivers/net/dsa/sja1105/sja1105_ptp.c  | 106 +++++++++++++++++++++++++++++++++
 drivers/net/dsa/sja1105/sja1105_ptp.h  |  17 ++++++
 drivers/net/dsa/sja1105/sja1105_spi.c  |  14 +++++
 include/linux/dsa/sja1105.h            |   1 +
 6 files changed, 201 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/dsa/sja1105/sja1105.h b/drivers/net/dsa/sja1105/sja1105.h
index 3c6296203c21..5a4f83a3417b 100644
--- a/drivers/net/dsa/sja1105/sja1105.h
+++ b/drivers/net/dsa/sja1105/sja1105.h
@@ -33,6 +33,7 @@ struct sja1105_regs {
 	u64 ptpclk;
 	u64 ptpclkrate;
 	u64 ptptsclk;
+	u64 ptpegr_ts[SJA1105_NUM_PORTS];
 	u64 pad_mii_tx[SJA1105_NUM_PORTS];
 	u64 cgu_idiv[SJA1105_NUM_PORTS];
 	u64 rgmii_pad_mii_tx[SJA1105_NUM_PORTS];
@@ -56,6 +57,15 @@ struct sja1105_info {
 	 * switch core and device_id)
 	 */
 	u64 part_no;
+	/* E/T and P/Q/R/S have partial timestamps of different sizes.
+	 * They must be reconstructed on both families anyway to get the full
+	 * 64-bit values back.
+	 */
+	int ptp_ts_bits;
+	/* Also SPI commands are of different sizes to retrieve
+	 * the egress timestamps.
+	 */
+	int ptpegr_ts_bytes;
 	const struct sja1105_dynamic_table_ops *dyn_ops;
 	const struct sja1105_table_ops *static_ops;
 	const struct sja1105_regs *regs;
diff --git a/drivers/net/dsa/sja1105/sja1105_main.c b/drivers/net/dsa/sja1105/sja1105_main.c
index f897fdb12930..121ceccd8107 100644
--- a/drivers/net/dsa/sja1105/sja1105_main.c
+++ b/drivers/net/dsa/sja1105/sja1105_main.c
@@ -1565,7 +1565,7 @@ static int sja1105_setup(struct dsa_switch *ds)
 }
 
 static int sja1105_mgmt_xmit(struct dsa_switch *ds, int port, int slot,
-			     struct sk_buff *skb)
+			     struct sk_buff *skb, bool takets)
 {
 	struct sja1105_mgmt_entry mgmt_route = {0};
 	struct sja1105_private *priv = ds->priv;
@@ -1578,6 +1578,8 @@ static int sja1105_mgmt_xmit(struct dsa_switch *ds, int port, int slot,
 	mgmt_route.macaddr = ether_addr_to_u64(hdr->h_dest);
 	mgmt_route.destports = BIT(port);
 	mgmt_route.enfport = 1;
+	mgmt_route.tsreg = 0;
+	mgmt_route.takets = takets;
 
 	rc = sja1105_dynamic_config_write(priv, BLK_IDX_MGMT_ROUTE,
 					  slot, &mgmt_route, true);
@@ -1629,7 +1631,11 @@ static netdev_tx_t sja1105_port_deferred_xmit(struct dsa_switch *ds, int port,
 {
 	struct sja1105_private *priv = ds->priv;
 	struct sja1105_port *sp = &priv->ports[port];
+	struct skb_shared_hwtstamps shwt = {0};
 	int slot = sp->mgmt_slot;
+	struct sk_buff *clone;
+	u64 now, ts;
+	int rc;
 
 	/* The tragic fact about the switch having 4x2 slots for installing
 	 * management routes is that all of them except one are actually
@@ -1647,8 +1653,36 @@ static netdev_tx_t sja1105_port_deferred_xmit(struct dsa_switch *ds, int port,
 	 */
 	mutex_lock(&priv->mgmt_lock);
 
-	sja1105_mgmt_xmit(ds, port, slot, skb);
+	/* The clone, if there, was made by dsa_skb_tx_timestamp */
+	clone = DSA_SKB_CB(skb)->clone;
+
+	sja1105_mgmt_xmit(ds, port, slot, skb, !!clone);
+
+	if (!clone)
+		goto out;
+
+	skb_shinfo(clone)->tx_flags |= SKBTX_IN_PROGRESS;
+
+	mutex_lock(&priv->ptp_lock);
+
+	now = priv->tstamp_cc.read(&priv->tstamp_cc);
+
+	rc = sja1105_ptpegr_ts_poll(priv, slot, &ts);
+	if (rc < 0) {
+		dev_err(ds->dev, "xmit: timed out polling for tstamp\n");
+		kfree_skb(clone);
+		goto out_unlock_ptp;
+	}
+
+	ts = sja1105_tstamp_reconstruct(priv, now, ts);
+	ts = timecounter_cyc2time(&priv->tstamp_tc, ts);
 
+	shwt.hwtstamp = ns_to_ktime(ts);
+	skb_complete_tx_timestamp(clone, &shwt);
+
+out_unlock_ptp:
+	mutex_unlock(&priv->ptp_lock);
+out:
 	mutex_unlock(&priv->mgmt_lock);
 	return NETDEV_TX_OK;
 }
@@ -1677,6 +1711,22 @@ static int sja1105_set_ageing_time(struct dsa_switch *ds,
 	return sja1105_static_config_reload(priv);
 }
 
+/* Called from dsa_skb_tx_timestamp. This callback is just to make DSA clone
+ * the skb and have it available in DSA_SKB_CB in the .port_deferred_xmit
+ * callback, where we will timestamp it synchronously.
+ */
+bool sja1105_port_txtstamp(struct dsa_switch *ds, int port,
+			   struct sk_buff *skb, unsigned int type)
+{
+	struct sja1105_private *priv = ds->priv;
+	struct sja1105_port *sp = &priv->ports[port];
+
+	if (!sp->hwts_tx_en)
+		return false;
+
+	return true;
+}
+
 static const struct dsa_switch_ops sja1105_switch_ops = {
 	.get_tag_protocol	= sja1105_get_tag_protocol,
 	.setup			= sja1105_setup,
@@ -1701,6 +1751,7 @@ static const struct dsa_switch_ops sja1105_switch_ops = {
 	.port_mdb_add		= sja1105_mdb_add,
 	.port_mdb_del		= sja1105_mdb_del,
 	.port_deferred_xmit	= sja1105_port_deferred_xmit,
+	.port_txtstamp		= sja1105_port_txtstamp,
 };
 
 static int sja1105_check_device_id(struct sja1105_private *priv)
diff --git a/drivers/net/dsa/sja1105/sja1105_ptp.c b/drivers/net/dsa/sja1105/sja1105_ptp.c
index 47313a6ec932..01ecc8fb1b30 100644
--- a/drivers/net/dsa/sja1105/sja1105_ptp.c
+++ b/drivers/net/dsa/sja1105/sja1105_ptp.c
@@ -113,6 +113,112 @@ int sja1105pqrs_ptp_cmd(const void *ctx, const void *data)
 }
 EXPORT_SYMBOL_GPL(sja1105pqrs_ptp_cmd);
 
+/* The switch returns partial timestamps (24 bits for SJA1105 E/T, which wrap
+ * around in 0.135 seconds, and 32 bits for P/Q/R/S, wrapping around in 34.35
+ * seconds).
+ *
+ * This receives the RX or TX MAC timestamps, provided by hardware as
+ * the lower bits of the cycle counter, sampled at the time the timestamp was
+ * collected.
+ *
+ * To reconstruct into a full 64-bit-wide timestamp, the cycle counter is
+ * read and the high-order bits are filled in.
+ *
+ * Must be called within one wraparound period of the partial timestamp since
+ * it was generated by the MAC.
+ */
+u64 sja1105_tstamp_reconstruct(struct sja1105_private *priv, u64 now,
+			       u64 ts_partial)
+{
+	u64 partial_tstamp_mask = CYCLECOUNTER_MASK(priv->info->ptp_ts_bits);
+	u64 ts_reconstructed;
+
+	ts_reconstructed = (now & ~partial_tstamp_mask) | ts_partial;
+
+	/* Check lower bits of current cycle counter against the timestamp.
+	 * If the current cycle counter is lower than the partial timestamp,
+	 * then wraparound surely occurred and must be accounted for.
+	 */
+	if ((now & partial_tstamp_mask) <= ts_partial)
+		ts_reconstructed -= (partial_tstamp_mask + 1);
+
+	return ts_reconstructed;
+}
+EXPORT_SYMBOL_GPL(sja1105_tstamp_reconstruct);
+
+/* Reads the SPI interface for an egress timestamp generated by the switch
+ * for frames sent using management routes.
+ *
+ * SJA1105 E/T layout of the 4-byte SPI payload:
+ *
+ * 31    23    15    7     0
+ * |     |     |     |     |
+ * +-----+-----+-----+     ^
+ *          ^              |
+ *          |              |
+ *  24-bit timestamp   Update bit
+ *
+ *
+ * SJA1105 P/Q/R/S layout of the 8-byte SPI payload:
+ *
+ * 31    23    15    7     0     63    55    47    39    32
+ * |     |     |     |     |     |     |     |     |     |
+ *                         ^     +-----+-----+-----+-----+
+ *                         |                 ^
+ *                         |                 |
+ *                    Update bit    32-bit timestamp
+ *
+ * Notice that the update bit is in the same place.
+ * To have common code for E/T and P/Q/R/S for reading the timestamp,
+ * we need to juggle with the offset and the bit indices.
+ */
+int sja1105_ptpegr_ts_poll(struct sja1105_private *priv, int port, u64 *ts)
+{
+	const struct sja1105_regs *regs = priv->info->regs;
+	int tstamp_bit_start, tstamp_bit_end;
+	int timeout = 10;
+	u8 packed_buf[8];
+	u64 update;
+	int rc;
+
+	do {
+		rc = sja1105_spi_send_packed_buf(priv, SPI_READ,
+						 regs->ptpegr_ts[port],
+						 packed_buf,
+						 priv->info->ptpegr_ts_bytes);
+		if (rc < 0)
+			return rc;
+
+		sja1105_unpack(packed_buf, &update, 0, 0,
+			       priv->info->ptpegr_ts_bytes);
+		if (update)
+			break;
+
+		usleep_range(10, 50);
+	} while (--timeout);
+
+	if (!timeout)
+		return -ETIMEDOUT;
+
+	/* Point the end bit to the second 32-bit word on P/Q/R/S,
+	 * no-op on E/T.
+	 */
+	tstamp_bit_end = (priv->info->ptpegr_ts_bytes - 4) * 8;
+	/* Shift the 24-bit timestamp on E/T to be collected from 31:8.
+	 * No-op on P/Q/R/S.
+	 */
+	tstamp_bit_end += 32 - priv->info->ptp_ts_bits;
+	tstamp_bit_start = tstamp_bit_end + priv->info->ptp_ts_bits - 1;
+
+	*ts = 0;
+
+	sja1105_unpack(packed_buf, ts, tstamp_bit_start, tstamp_bit_end,
+		       priv->info->ptpegr_ts_bytes);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(sja1105_ptpegr_ts_poll);
+
 int sja1105_ptp_reset(struct sja1105_private *priv)
 {
 	struct dsa_switch *ds = priv->ds;
diff --git a/drivers/net/dsa/sja1105/sja1105_ptp.h b/drivers/net/dsa/sja1105/sja1105_ptp.h
index 137ffbb0a233..af456b0a4d27 100644
--- a/drivers/net/dsa/sja1105/sja1105_ptp.h
+++ b/drivers/net/dsa/sja1105/sja1105_ptp.h
@@ -10,6 +10,8 @@ int sja1105_ptp_clock_register(struct sja1105_private *priv);
 
 void sja1105_ptp_clock_unregister(struct sja1105_private *priv);
 
+int sja1105_ptpegr_ts_poll(struct sja1105_private *priv, int port, u64 *ts);
+
 int sja1105et_ptp_cmd(const void *ctx, const void *data);
 
 int sja1105pqrs_ptp_cmd(const void *ctx, const void *data);
@@ -17,6 +19,9 @@ int sja1105pqrs_ptp_cmd(const void *ctx, const void *data);
 int sja1105_get_ts_info(struct dsa_switch *ds, int port,
 			struct ethtool_ts_info *ts);
 
+u64 sja1105_tstamp_reconstruct(struct sja1105_private *priv, u64 now,
+			       u64 ts_partial);
+
 int sja1105_ptp_reset(struct sja1105_private *priv);
 
 #else
@@ -31,6 +36,18 @@ static inline void sja1105_ptp_clock_unregister(struct sja1105_private *priv)
 	return;
 }
 
+static inline int
+sja1105_ptpegr_ts_poll(struct sja1105_private *priv, int port, u64 *ts)
+{
+	return 0;
+}
+
+static inline u64 sja1105_tstamp_reconstruct(struct sja1105_private *priv,
+					     u64 now, u64 ts_partial)
+{
+	return 0;
+}
+
 static inline int sja1105_ptp_reset(struct sja1105_private *priv)
 {
 	return 0;
diff --git a/drivers/net/dsa/sja1105/sja1105_spi.c b/drivers/net/dsa/sja1105/sja1105_spi.c
index a0d08e6c22ff..d729a0f0b28e 100644
--- a/drivers/net/dsa/sja1105/sja1105_spi.c
+++ b/drivers/net/dsa/sja1105/sja1105_spi.c
@@ -514,6 +514,7 @@ static struct sja1105_regs sja1105et_regs = {
 	.rgmii_tx_clk = {0x100016, 0x10001D, 0x100024, 0x10002B, 0x100032},
 	.rmii_ref_clk = {0x100015, 0x10001C, 0x100023, 0x10002A, 0x100031},
 	.rmii_ext_tx_clk = {0x100018, 0x10001F, 0x100026, 0x10002D, 0x100034},
+	.ptpegr_ts = {0xC0, 0xC2, 0xC4, 0xC6, 0xC8},
 	.ptp_control = 0x17,
 	.ptpclk = 0x18, /* Spans 0x18 to 0x19 */
 	.ptpclkrate = 0x1A,
@@ -544,6 +545,7 @@ static struct sja1105_regs sja1105pqrs_regs = {
 	.rmii_ref_clk = {0x100015, 0x10001B, 0x100021, 0x100027, 0x10002D},
 	.rmii_ext_tx_clk = {0x100017, 0x10001D, 0x100023, 0x100029, 0x10002F},
 	.qlevel = {0x604, 0x614, 0x624, 0x634, 0x644},
+	.ptpegr_ts = {0xC0, 0xC4, 0xC8, 0xCC, 0xD0},
 	.ptp_control = 0x18,
 	.ptpclk = 0x19,
 	.ptpclkrate = 0x1B,
@@ -555,6 +557,8 @@ struct sja1105_info sja1105e_info = {
 	.part_no		= SJA1105ET_PART_NO,
 	.static_ops		= sja1105e_table_ops,
 	.dyn_ops		= sja1105et_dyn_ops,
+	.ptp_ts_bits		= 24,
+	.ptpegr_ts_bytes	= 4,
 	.reset_cmd		= sja1105et_reset_cmd,
 	.fdb_add_cmd		= sja1105et_fdb_add,
 	.fdb_del_cmd		= sja1105et_fdb_del,
@@ -567,6 +571,8 @@ struct sja1105_info sja1105t_info = {
 	.part_no		= SJA1105ET_PART_NO,
 	.static_ops		= sja1105t_table_ops,
 	.dyn_ops		= sja1105et_dyn_ops,
+	.ptp_ts_bits		= 24,
+	.ptpegr_ts_bytes	= 4,
 	.reset_cmd		= sja1105et_reset_cmd,
 	.fdb_add_cmd		= sja1105et_fdb_add,
 	.fdb_del_cmd		= sja1105et_fdb_del,
@@ -579,6 +585,8 @@ struct sja1105_info sja1105p_info = {
 	.part_no		= SJA1105P_PART_NO,
 	.static_ops		= sja1105p_table_ops,
 	.dyn_ops		= sja1105pqrs_dyn_ops,
+	.ptp_ts_bits		= 32,
+	.ptpegr_ts_bytes	= 8,
 	.reset_cmd		= sja1105pqrs_reset_cmd,
 	.fdb_add_cmd		= sja1105pqrs_fdb_add,
 	.fdb_del_cmd		= sja1105pqrs_fdb_del,
@@ -591,6 +599,8 @@ struct sja1105_info sja1105q_info = {
 	.part_no		= SJA1105Q_PART_NO,
 	.static_ops		= sja1105q_table_ops,
 	.dyn_ops		= sja1105pqrs_dyn_ops,
+	.ptp_ts_bits		= 32,
+	.ptpegr_ts_bytes	= 8,
 	.reset_cmd		= sja1105pqrs_reset_cmd,
 	.fdb_add_cmd		= sja1105pqrs_fdb_add,
 	.fdb_del_cmd		= sja1105pqrs_fdb_del,
@@ -603,6 +613,8 @@ struct sja1105_info sja1105r_info = {
 	.part_no		= SJA1105R_PART_NO,
 	.static_ops		= sja1105r_table_ops,
 	.dyn_ops		= sja1105pqrs_dyn_ops,
+	.ptp_ts_bits		= 32,
+	.ptpegr_ts_bytes	= 8,
 	.reset_cmd		= sja1105pqrs_reset_cmd,
 	.fdb_add_cmd		= sja1105pqrs_fdb_add,
 	.fdb_del_cmd		= sja1105pqrs_fdb_del,
@@ -616,6 +628,8 @@ struct sja1105_info sja1105s_info = {
 	.static_ops		= sja1105s_table_ops,
 	.dyn_ops		= sja1105pqrs_dyn_ops,
 	.regs			= &sja1105pqrs_regs,
+	.ptp_ts_bits		= 32,
+	.ptpegr_ts_bytes	= 8,
 	.reset_cmd		= sja1105pqrs_reset_cmd,
 	.fdb_add_cmd		= sja1105pqrs_fdb_add,
 	.fdb_del_cmd		= sja1105pqrs_fdb_del,
diff --git a/include/linux/dsa/sja1105.h b/include/linux/dsa/sja1105.h
index e46e18c47d41..5a956f335022 100644
--- a/include/linux/dsa/sja1105.h
+++ b/include/linux/dsa/sja1105.h
@@ -22,6 +22,7 @@
 
 struct sja1105_port {
 	struct dsa_port *dp;
+	bool hwts_tx_en;
 	int mgmt_slot;
 };
 
-- 
cgit v1.2.3


From d3f9b90bf19fad05889e4bead7dc1b336da56118 Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <olteanv@gmail.com>
Date: Sat, 8 Jun 2019 15:04:36 +0300
Subject: net: dsa: sja1105: Build a minimal understanding of meta frames

Meta frames are sent on the CPU port by the switch if RX timestamping is
enabled. They contain a partial timestamp of the previous frame.

They are Ethernet frames with the Ethernet header constructed out of:

- SJA1105_META_DMAC
- SJA1105_META_SMAC
- ETH_P_SJA1105_META

The Ethernet payload will be decoded in a follow-up patch.

Signed-off-by: Vladimir Oltean <olteanv@gmail.com>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/dsa/sja1105.h | 11 +++++++++++
 net/dsa/tag_sja1105.c       | 15 +++++++++++++++
 2 files changed, 26 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/dsa/sja1105.h b/include/linux/dsa/sja1105.h
index 5a956f335022..cc4a909d1007 100644
--- a/include/linux/dsa/sja1105.h
+++ b/include/linux/dsa/sja1105.h
@@ -12,6 +12,7 @@
 #include <net/dsa.h>
 
 #define ETH_P_SJA1105				ETH_P_DSA_8021Q
+#define ETH_P_SJA1105_META			0x0008
 
 /* IEEE 802.3 Annex 57A: Slow Protocols PDUs (01:80:C2:xx:xx:xx) */
 #define SJA1105_LINKLOCAL_FILTER_A		0x0180C2000000ull
@@ -20,6 +21,16 @@
 #define SJA1105_LINKLOCAL_FILTER_B		0x011B19000000ull
 #define SJA1105_LINKLOCAL_FILTER_B_MASK		0xFFFFFF000000ull
 
+/* Source and Destination MAC of follow-up meta frames.
+ * Whereas the choice of SMAC only affects the unique identification of the
+ * switch as sender of meta frames, the DMAC must be an address that is present
+ * in the DSA master port's multicast MAC filter.
+ * 01-80-C2-00-00-0E is a good choice for this, as all profiles of IEEE 1588
+ * over L2 use this address for some purpose already.
+ */
+#define SJA1105_META_SMAC			0x222222222222ull
+#define SJA1105_META_DMAC			0x0180C200000Eull
+
 struct sja1105_port {
 	struct dsa_port *dp;
 	bool hwts_tx_en;
diff --git a/net/dsa/tag_sja1105.c b/net/dsa/tag_sja1105.c
index cd8e0bfb5e75..0beb52518d56 100644
--- a/net/dsa/tag_sja1105.c
+++ b/net/dsa/tag_sja1105.c
@@ -22,6 +22,21 @@ static inline bool sja1105_is_link_local(const struct sk_buff *skb)
 	return false;
 }
 
+static inline bool sja1105_is_meta_frame(const struct sk_buff *skb)
+{
+	const struct ethhdr *hdr = eth_hdr(skb);
+	u64 smac = ether_addr_to_u64(hdr->h_source);
+	u64 dmac = ether_addr_to_u64(hdr->h_dest);
+
+	if (smac != SJA1105_META_SMAC)
+		return false;
+	if (dmac != SJA1105_META_DMAC)
+		return false;
+	if (ntohs(hdr->h_proto) != ETH_P_SJA1105_META)
+		return false;
+	return true;
+}
+
 /* This is the first time the tagger sees the frame on RX.
  * Figure out if we can decode it, and if we can, annotate skb->cb with how we
  * plan to do that, so we don't need to check again in the rcv function.
-- 
cgit v1.2.3


From 844d7edc6a34ae3a8236f1306e4f2615c8db1eac Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <olteanv@gmail.com>
Date: Sat, 8 Jun 2019 15:04:40 +0300
Subject: net: dsa: sja1105: Add a global sja1105_tagger_data structure

This will be used to keep state for RX timestamping. It is global
because the switch serializes timestampable and meta frames when
trapping them towards the CPU port (lower port indices have higher
priority) and therefore having one state machine per port would create
unnecessary complications.

Signed-off-by: Vladimir Oltean <olteanv@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/dsa/sja1105/sja1105.h      |  1 +
 drivers/net/dsa/sja1105/sja1105_main.c |  5 +++++
 include/linux/dsa/sja1105.h            | 15 +++++++++++++++
 3 files changed, 21 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/net/dsa/sja1105/sja1105.h b/drivers/net/dsa/sja1105/sja1105.h
index 5a4f83a3417b..0fc6fe9ada87 100644
--- a/drivers/net/dsa/sja1105/sja1105.h
+++ b/drivers/net/dsa/sja1105/sja1105.h
@@ -103,6 +103,7 @@ struct sja1105_private {
 	 * the switch doesn't confuse them with one another.
 	 */
 	struct mutex mgmt_lock;
+	struct sja1105_tagger_data tagger_data;
 };
 
 #include "sja1105_dynamic_config.h"
diff --git a/drivers/net/dsa/sja1105/sja1105_main.c b/drivers/net/dsa/sja1105/sja1105_main.c
index d129997174bb..3c11142f1c67 100644
--- a/drivers/net/dsa/sja1105/sja1105_main.c
+++ b/drivers/net/dsa/sja1105/sja1105_main.c
@@ -1828,6 +1828,7 @@ static int sja1105_check_device_id(struct sja1105_private *priv)
 
 static int sja1105_probe(struct spi_device *spi)
 {
+	struct sja1105_tagger_data *tagger_data;
 	struct device *dev = &spi->dev;
 	struct sja1105_private *priv;
 	struct dsa_switch *ds;
@@ -1882,12 +1883,16 @@ static int sja1105_probe(struct spi_device *spi)
 	ds->priv = priv;
 	priv->ds = ds;
 
+	tagger_data = &priv->tagger_data;
+	skb_queue_head_init(&tagger_data->skb_rxtstamp_queue);
+
 	/* Connections between dsa_port and sja1105_port */
 	for (i = 0; i < SJA1105_NUM_PORTS; i++) {
 		struct sja1105_port *sp = &priv->ports[i];
 
 		ds->ports[i].priv = sp;
 		sp->dp = &ds->ports[i];
+		sp->data = tagger_data;
 	}
 	mutex_init(&priv->mgmt_lock);
 
diff --git a/include/linux/dsa/sja1105.h b/include/linux/dsa/sja1105.h
index cc4a909d1007..2c4fce4eaf0d 100644
--- a/include/linux/dsa/sja1105.h
+++ b/include/linux/dsa/sja1105.h
@@ -31,7 +31,22 @@
 #define SJA1105_META_SMAC			0x222222222222ull
 #define SJA1105_META_DMAC			0x0180C200000Eull
 
+/* Global tagger data: each struct sja1105_port has a reference to
+ * the structure defined in struct sja1105_private.
+ */
+struct sja1105_tagger_data {
+	struct sk_buff_head skb_rxtstamp_queue;
+	struct work_struct rxtstamp_work;
+	struct sk_buff *stampable_skb;
+	/* Protects concurrent access to the meta state machine
+	 * from taggers running on multiple ports on SMP systems
+	 */
+	spinlock_t meta_lock;
+	bool hwts_rx_en;
+};
+
 struct sja1105_port {
+	struct sja1105_tagger_data *data;
 	struct dsa_port *dp;
 	bool hwts_tx_en;
 	int mgmt_slot;
-- 
cgit v1.2.3


From f3097be21bf17ae8785eea009cbc424f16611d9a Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <olteanv@gmail.com>
Date: Sat, 8 Jun 2019 15:04:42 +0300
Subject: net: dsa: sja1105: Add a state machine for RX timestamping

Meta frame reception relies on the hardware keeping its promise that it
will send no other traffic towards the CPU port between a link-local
frame and a meta frame.  Otherwise there is no other way to associate
the meta frame with the link-local frame it's holding a timestamp of.
The receive function is made stateful, and buffers a timestampable frame
until its meta frame arrives, then merges the two, drops the meta and
releases the link-local frame up the stack.

Signed-off-by: Vladimir Oltean <olteanv@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/dsa/sja1105/sja1105_main.c |  62 +++++++++++++++++
 include/linux/dsa/sja1105.h            |   7 ++
 net/dsa/tag_sja1105.c                  | 121 ++++++++++++++++++++++++++++++++-
 3 files changed, 189 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/net/dsa/sja1105/sja1105_main.c b/drivers/net/dsa/sja1105/sja1105_main.c
index 2b804eeca390..8963b21b3061 100644
--- a/drivers/net/dsa/sja1105/sja1105_main.c
+++ b/drivers/net/dsa/sja1105/sja1105_main.c
@@ -1600,6 +1600,14 @@ static int sja1105_setup(struct dsa_switch *ds)
 	return sja1105_setup_8021q_tagging(ds, true);
 }
 
+static void sja1105_teardown(struct dsa_switch *ds)
+{
+	struct sja1105_private *priv = ds->priv;
+
+	cancel_work_sync(&priv->tagger_data.rxtstamp_work);
+	skb_queue_purge(&priv->tagger_data.skb_rxtstamp_queue);
+}
+
 static int sja1105_mgmt_xmit(struct dsa_switch *ds, int port, int slot,
 			     struct sk_buff *skb, bool takets)
 {
@@ -1747,6 +1755,57 @@ static int sja1105_set_ageing_time(struct dsa_switch *ds,
 	return sja1105_static_config_reload(priv);
 }
 
+#define to_tagger(d) \
+	container_of((d), struct sja1105_tagger_data, rxtstamp_work)
+#define to_sja1105(d) \
+	container_of((d), struct sja1105_private, tagger_data)
+
+static void sja1105_rxtstamp_work(struct work_struct *work)
+{
+	struct sja1105_tagger_data *data = to_tagger(work);
+	struct sja1105_private *priv = to_sja1105(data);
+	struct sk_buff *skb;
+	u64 now;
+
+	mutex_lock(&priv->ptp_lock);
+
+	now = priv->tstamp_cc.read(&priv->tstamp_cc);
+
+	while ((skb = skb_dequeue(&data->skb_rxtstamp_queue)) != NULL) {
+		struct skb_shared_hwtstamps *shwt = skb_hwtstamps(skb);
+		u64 ts;
+
+		*shwt = (struct skb_shared_hwtstamps) {0};
+
+		ts = SJA1105_SKB_CB(skb)->meta_tstamp;
+		ts = sja1105_tstamp_reconstruct(priv, now, ts);
+		ts = timecounter_cyc2time(&priv->tstamp_tc, ts);
+
+		shwt->hwtstamp = ns_to_ktime(ts);
+		netif_rx_ni(skb);
+	}
+
+	mutex_unlock(&priv->ptp_lock);
+}
+
+/* Called from dsa_skb_defer_rx_timestamp */
+bool sja1105_port_rxtstamp(struct dsa_switch *ds, int port,
+			   struct sk_buff *skb, unsigned int type)
+{
+	struct sja1105_private *priv = ds->priv;
+	struct sja1105_tagger_data *data = &priv->tagger_data;
+
+	if (!data->hwts_rx_en)
+		return false;
+
+	/* We need to read the full PTP clock to reconstruct the Rx
+	 * timestamp. For that we need a sleepable context.
+	 */
+	skb_queue_tail(&data->skb_rxtstamp_queue, skb);
+	schedule_work(&data->rxtstamp_work);
+	return true;
+}
+
 /* Called from dsa_skb_tx_timestamp. This callback is just to make DSA clone
  * the skb and have it available in DSA_SKB_CB in the .port_deferred_xmit
  * callback, where we will timestamp it synchronously.
@@ -1766,6 +1825,7 @@ bool sja1105_port_txtstamp(struct dsa_switch *ds, int port,
 static const struct dsa_switch_ops sja1105_switch_ops = {
 	.get_tag_protocol	= sja1105_get_tag_protocol,
 	.setup			= sja1105_setup,
+	.teardown		= sja1105_teardown,
 	.set_ageing_time	= sja1105_set_ageing_time,
 	.phylink_validate	= sja1105_phylink_validate,
 	.phylink_mac_config	= sja1105_mac_config,
@@ -1787,6 +1847,7 @@ static const struct dsa_switch_ops sja1105_switch_ops = {
 	.port_mdb_add		= sja1105_mdb_add,
 	.port_mdb_del		= sja1105_mdb_del,
 	.port_deferred_xmit	= sja1105_port_deferred_xmit,
+	.port_rxtstamp		= sja1105_port_rxtstamp,
 	.port_txtstamp		= sja1105_port_txtstamp,
 };
 
@@ -1885,6 +1946,7 @@ static int sja1105_probe(struct spi_device *spi)
 
 	tagger_data = &priv->tagger_data;
 	skb_queue_head_init(&tagger_data->skb_rxtstamp_queue);
+	INIT_WORK(&tagger_data->rxtstamp_work, sja1105_rxtstamp_work);
 
 	/* Connections between dsa_port and sja1105_port */
 	for (i = 0; i < SJA1105_NUM_PORTS; i++) {
diff --git a/include/linux/dsa/sja1105.h b/include/linux/dsa/sja1105.h
index 2c4fce4eaf0d..79435cfc20eb 100644
--- a/include/linux/dsa/sja1105.h
+++ b/include/linux/dsa/sja1105.h
@@ -45,6 +45,13 @@ struct sja1105_tagger_data {
 	bool hwts_rx_en;
 };
 
+struct sja1105_skb_cb {
+	u32 meta_tstamp;
+};
+
+#define SJA1105_SKB_CB(skb) \
+	((struct sja1105_skb_cb *)DSA_SKB_CB_PRIV(skb))
+
 struct sja1105_port {
 	struct sja1105_tagger_data *data;
 	struct dsa_port *dp;
diff --git a/net/dsa/tag_sja1105.c b/net/dsa/tag_sja1105.c
index 5b51e96130c7..1d96c9d4a8e9 100644
--- a/net/dsa/tag_sja1105.c
+++ b/net/dsa/tag_sja1105.c
@@ -110,6 +110,124 @@ static struct sk_buff *sja1105_xmit(struct sk_buff *skb,
 			     ((pcp << VLAN_PRIO_SHIFT) | tx_vid));
 }
 
+static void sja1105_transfer_meta(struct sk_buff *skb,
+				  const struct sja1105_meta *meta)
+{
+	struct ethhdr *hdr = eth_hdr(skb);
+
+	hdr->h_dest[3] = meta->dmac_byte_3;
+	hdr->h_dest[4] = meta->dmac_byte_4;
+	SJA1105_SKB_CB(skb)->meta_tstamp = meta->tstamp;
+}
+
+/* This is a simple state machine which follows the hardware mechanism of
+ * generating RX timestamps:
+ *
+ * After each timestampable skb (all traffic for which send_meta1 and
+ * send_meta0 is true, aka all MAC-filtered link-local traffic) a meta frame
+ * containing a partial timestamp is immediately generated by the switch and
+ * sent as a follow-up to the link-local frame on the CPU port.
+ *
+ * The meta frames have no unique identifier (such as sequence number) by which
+ * one may pair them to the correct timestampable frame.
+ * Instead, the switch has internal logic that ensures no frames are sent on
+ * the CPU port between a link-local timestampable frame and its corresponding
+ * meta follow-up. It also ensures strict ordering between ports (lower ports
+ * have higher priority towards the CPU port). For this reason, a per-port
+ * data structure is not needed/desirable.
+ *
+ * This function pairs the link-local frame with its partial timestamp from the
+ * meta follow-up frame. The full timestamp will be reconstructed later in a
+ * work queue.
+ */
+static struct sk_buff
+*sja1105_rcv_meta_state_machine(struct sk_buff *skb,
+				struct sja1105_meta *meta,
+				bool is_link_local,
+				bool is_meta)
+{
+	struct sja1105_port *sp;
+	struct dsa_port *dp;
+
+	dp = dsa_slave_to_port(skb->dev);
+	sp = dp->priv;
+
+	/* Step 1: A timestampable frame was received.
+	 * Buffer it until we get its meta frame.
+	 */
+	if (is_link_local && sp->data->hwts_rx_en) {
+		spin_lock(&sp->data->meta_lock);
+		/* Was this a link-local frame instead of the meta
+		 * that we were expecting?
+		 */
+		if (sp->data->stampable_skb) {
+			dev_err_ratelimited(dp->ds->dev,
+					    "Expected meta frame, is %12llx "
+					    "in the DSA master multicast filter?\n",
+					    SJA1105_META_DMAC);
+		}
+
+		/* Hold a reference to avoid dsa_switch_rcv
+		 * from freeing the skb.
+		 */
+		sp->data->stampable_skb = skb_get(skb);
+		spin_unlock(&sp->data->meta_lock);
+
+		/* Tell DSA we got nothing */
+		return NULL;
+
+	/* Step 2: The meta frame arrived.
+	 * Time to take the stampable skb out of the closet, annotate it
+	 * with the partial timestamp, and pretend that we received it
+	 * just now (basically masquerade the buffered frame as the meta
+	 * frame, which serves no further purpose).
+	 */
+	} else if (is_meta) {
+		struct sk_buff *stampable_skb;
+
+		spin_lock(&sp->data->meta_lock);
+
+		stampable_skb = sp->data->stampable_skb;
+		sp->data->stampable_skb = NULL;
+
+		/* Was this a meta frame instead of the link-local
+		 * that we were expecting?
+		 */
+		if (!stampable_skb) {
+			dev_err_ratelimited(dp->ds->dev,
+					    "Unexpected meta frame\n");
+			spin_unlock(&sp->data->meta_lock);
+			return NULL;
+		}
+
+		if (stampable_skb->dev != skb->dev) {
+			dev_err_ratelimited(dp->ds->dev,
+					    "Meta frame on wrong port\n");
+			spin_unlock(&sp->data->meta_lock);
+			return NULL;
+		}
+
+		/* Free the meta frame and give DSA the buffered stampable_skb
+		 * for further processing up the network stack.
+		 */
+		kfree_skb(skb);
+
+		skb = skb_copy(stampable_skb, GFP_ATOMIC);
+		if (!skb) {
+			dev_err_ratelimited(dp->ds->dev,
+					    "Failed to copy stampable skb\n");
+			return NULL;
+		}
+		sja1105_transfer_meta(skb, meta);
+		/* The cached copy will be freed now */
+		skb_unref(stampable_skb);
+
+		spin_unlock(&sp->data->meta_lock);
+	}
+
+	return skb;
+}
+
 static struct sk_buff *sja1105_rcv(struct sk_buff *skb,
 				   struct net_device *netdev,
 				   struct packet_type *pt)
@@ -167,7 +285,8 @@ static struct sk_buff *sja1105_rcv(struct sk_buff *skb,
 	if (is_tagged)
 		skb = dsa_8021q_remove_header(skb);
 
-	return skb;
+	return sja1105_rcv_meta_state_machine(skb, &meta, is_link_local,
+					      is_meta);
 }
 
 static struct dsa_device_ops sja1105_netdev_ops = {
-- 
cgit v1.2.3


From 7f192e3cd316ba58c88dfa26796cf77789dd9872 Mon Sep 17 00:00:00 2001
From: Christian Brauner <christian@brauner.io>
Date: Sat, 25 May 2019 11:36:41 +0200
Subject: fork: add clone3

This adds the clone3 system call.

As mentioned several times already (cf. [7], [8]) here's the promised
patchset for clone3().

We recently merged the CLONE_PIDFD patchset (cf. [1]). It took the last
free flag from clone().

Independent of the CLONE_PIDFD patchset a time namespace has been discussed
at the Linux Plumbers Conference last year and has been sent out and reviewed
(cf. [5]). It is expected that it will go upstream in the not too distant
future. However, it relies on the addition of the CLONE_NEWTIME flag to
clone(). The only other good candidate - CLONE_DETACHED - is currently not
recyclable as we have identified at least two large or widely used
codebases that currently pass this flag (cf. [2], [3], and [4]). Given that
CLONE_PIDFD grabbed the last clone() flag the time namespace is effectively
blocked. clone3() has the advantage that it will unblock this patchset
again. In general, clone3() is extensible and allows for the implementation
of new features.

The idea is to keep clone3() very simple and close to the original clone(),
specifically, to keep on supporting old clone()-based workloads.
We know there have been various creative proposals for what a new process
creation syscall or even api is supposed to look like. Some people even
go so far as to argue that the traditional fork()+exec() split should be
abandoned in favor of an in-kernel version of spawn(). Independent of
whether or not we personally think spawn() is a good idea, this patchset
does not have and does not want to have anything to do with this.
One stance we take is that there's no real good alternative to
clone()+exec() and we need and want to support this model going forward;
independent of spawn().
The following requirements guided clone3():
- bump the number of available flags
- move arguments that are currently passed as separate arguments
  in clone() into a dedicated struct clone_args
  - choose a struct layout that is easy to handle on 32 and on 64 bit
  - choose a struct layout that is extensible
  - give new flags that currently need to abuse another flag's dedicated
    return argument in clone() their own dedicated return argument
    (e.g. CLONE_PIDFD)
  - use a separate kernel internal struct kernel_clone_args that is
    properly typed according to current kernel conventions in fork.c and is
    different from  the uapi struct clone_args
- port _do_fork() to use kernel_clone_args so that all process creation
  syscalls such as fork(), vfork(), clone(), and clone3() behave identically
  (Arnd suggested, that we can probably also port do_fork() itself in a
   separate patchset.)
- ease of transition for userspace from clone() to clone3()
  This very much means that we do *not* remove functionality that userspace
  currently relies on as the latter is a good way of creating a syscall
  that won't be adopted.
- do not try to be clever or complex: keep clone3() as dumb as possible

In accordance with Linus' suggestions (cf. [11]), clone3() has the following
signature:

/* uapi */
struct clone_args {
        __aligned_u64 flags;
        __aligned_u64 pidfd;
        __aligned_u64 child_tid;
        __aligned_u64 parent_tid;
        __aligned_u64 exit_signal;
        __aligned_u64 stack;
        __aligned_u64 stack_size;
        __aligned_u64 tls;
};

/* kernel internal */
struct kernel_clone_args {
        u64 flags;
        int __user *pidfd;
        int __user *child_tid;
        int __user *parent_tid;
        int exit_signal;
        unsigned long stack;
        unsigned long stack_size;
        unsigned long tls;
};

long sys_clone3(struct clone_args __user *uargs, size_t size)

clone3() cleanly supports all of the supported flags from clone() and thus
all legacy workloads.
The advantage of sticking close to the old clone() is the low cost for
userspace to switch to this new api. Quite a lot of userspace apis (e.g.
pthreads) are based on the clone() syscall. The new clone3() syscall
supporting all of the old workloads and opening up the ability to add new
features should make switching to it more appealing for userspace. In
essence, glibc can just write a simple wrapper to switch from clone() to
clone3().

There has been some interest in this patchset already. We have received a
patch from the CRIU corner for clone3() that would set the PID/TID of a
restored process without /proc/sys/kernel/ns_last_pid to eliminate a race.

/* User visible differences to legacy clone() */
- CLONE_DETACHED will cause EINVAL with clone3()
- CSIGNAL is deprecated
  It is superseded by a dedicated "exit_signal" argument in struct
  clone_args freeing up space for additional flags.
  This is based on a suggestion from Andrei and Linus (cf. [9] and [10])

/* References */
[1]: b3e5838252665ee4cfa76b82bdf1198dca81e5be
[2]: https://dxr.mozilla.org/mozilla-central/source/security/sandbox/linux/SandboxFilter.cpp#343
[3]: https://git.musl-libc.org/cgit/musl/tree/src/thread/pthread_create.c#n233
[4]: https://sources.debian.org/src/blcr/0.8.5-2.3/cr_module/cr_dump_self.c/?hl=740#L740
[5]: https://lore.kernel.org/lkml/20190425161416.26600-1-dima@arista.com/
[6]: https://lore.kernel.org/lkml/20190425161416.26600-2-dima@arista.com/
[7]: https://lore.kernel.org/lkml/CAHrFyr5HxpGXA2YrKza-oB-GGwJCqwPfyhD-Y5wbktWZdt0sGQ@mail.gmail.com/
[8]: https://lore.kernel.org/lkml/20190524102756.qjsjxukuq2f4t6bo@brauner.io/
[9]: https://lore.kernel.org/lkml/20190529222414.GA6492@gmail.com/
[10]: https://lore.kernel.org/lkml/CAHk-=whQP-Ykxi=zSYaV9iXsHsENa+2fdj-zYKwyeyed63Lsfw@mail.gmail.com/
[11]: https://lore.kernel.org/lkml/CAHk-=wieuV4hGwznPsX-8E0G2FKhx3NjZ9X3dTKh5zKd+iqOBw@mail.gmail.com/

Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Christian Brauner <christian@brauner.io>
Acked-by: Arnd Bergmann <arnd@arndb.de>
Acked-by: Serge Hallyn <serge@hallyn.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Pavel Emelyanov <xemul@virtuozzo.com>
Cc: Jann Horn <jannh@google.com>
Cc: David Howells <dhowells@redhat.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Adrian Reber <adrian@lisas.de>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Andrei Vagin <avagin@gmail.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Florian Weimer <fweimer@redhat.com>
Cc: linux-api@vger.kernel.org
---
 arch/x86/ia32/sys_ia32.c   |  12 ++-
 include/linux/sched/task.h |  17 +++-
 include/linux/syscalls.h   |   4 +
 include/uapi/linux/sched.h |  16 ++++
 kernel/fork.c              | 201 ++++++++++++++++++++++++++++++++++-----------
 5 files changed, 199 insertions(+), 51 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/ia32/sys_ia32.c b/arch/x86/ia32/sys_ia32.c
index a43212036257..64a6c952091e 100644
--- a/arch/x86/ia32/sys_ia32.c
+++ b/arch/x86/ia32/sys_ia32.c
@@ -237,6 +237,14 @@ COMPAT_SYSCALL_DEFINE5(x86_clone, unsigned long, clone_flags,
 		       unsigned long, newsp, int __user *, parent_tidptr,
 		       unsigned long, tls_val, int __user *, child_tidptr)
 {
-	return _do_fork(clone_flags, newsp, 0, parent_tidptr, child_tidptr,
-			tls_val);
+	struct kernel_clone_args args = {
+		.flags		= (clone_flags & ~CSIGNAL),
+		.child_tid	= child_tidptr,
+		.parent_tid	= parent_tidptr,
+		.exit_signal	= (clone_flags & CSIGNAL),
+		.stack		= newsp,
+		.tls		= tls_val,
+	};
+
+	return _do_fork(&args);
 }
diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h
index f1227f2c38a4..109a0df5af39 100644
--- a/include/linux/sched/task.h
+++ b/include/linux/sched/task.h
@@ -8,11 +8,26 @@
  */
 
 #include <linux/sched.h>
+#include <linux/uaccess.h>
 
 struct task_struct;
 struct rusage;
 union thread_union;
 
+/* All the bits taken by the old clone syscall. */
+#define CLONE_LEGACY_FLAGS 0xffffffffULL
+
+struct kernel_clone_args {
+	u64 flags;
+	int __user *pidfd;
+	int __user *child_tid;
+	int __user *parent_tid;
+	int exit_signal;
+	unsigned long stack;
+	unsigned long stack_size;
+	unsigned long tls;
+};
+
 /*
  * This serializes "schedule()" and also protects
  * the run-queue from deletions/modifications (but
@@ -73,7 +88,7 @@ extern void do_group_exit(int);
 extern void exit_files(struct task_struct *);
 extern void exit_itimers(struct signal_struct *);
 
-extern long _do_fork(unsigned long, unsigned long, unsigned long, int __user *, int __user *, unsigned long);
+extern long _do_fork(struct kernel_clone_args *kargs);
 extern long do_fork(unsigned long, unsigned long, unsigned long, int __user *, int __user *);
 struct task_struct *fork_idle(int);
 struct mm_struct *copy_init_mm(void);
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index e2870fe1be5b..60a81f374ca3 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -70,6 +70,7 @@ struct sigaltstack;
 struct rseq;
 union bpf_attr;
 struct io_uring_params;
+struct clone_args;
 
 #include <linux/types.h>
 #include <linux/aio_abi.h>
@@ -852,6 +853,9 @@ asmlinkage long sys_clone(unsigned long, unsigned long, int __user *,
 	       int __user *, unsigned long);
 #endif
 #endif
+
+asmlinkage long sys_clone3(struct clone_args __user *uargs, size_t size);
+
 asmlinkage long sys_execve(const char __user *filename,
 		const char __user *const __user *argv,
 		const char __user *const __user *envp);
diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h
index ed4ee170bee2..f5331dbdcaa2 100644
--- a/include/uapi/linux/sched.h
+++ b/include/uapi/linux/sched.h
@@ -2,6 +2,8 @@
 #ifndef _UAPI_LINUX_SCHED_H
 #define _UAPI_LINUX_SCHED_H
 
+#include <linux/types.h>
+
 /*
  * cloning flags:
  */
@@ -31,6 +33,20 @@
 #define CLONE_NEWNET		0x40000000	/* New network namespace */
 #define CLONE_IO		0x80000000	/* Clone io context */
 
+/*
+ * Arguments for the clone3 syscall
+ */
+struct clone_args {
+	__aligned_u64 flags;
+	__aligned_u64 pidfd;
+	__aligned_u64 child_tid;
+	__aligned_u64 parent_tid;
+	__aligned_u64 exit_signal;
+	__aligned_u64 stack;
+	__aligned_u64 stack_size;
+	__aligned_u64 tls;
+};
+
 /*
  * Scheduling policies
  */
diff --git a/kernel/fork.c b/kernel/fork.c
index b4cba953040a..08ff131f26b4 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1760,19 +1760,15 @@ static __always_inline void delayed_free_task(struct task_struct *tsk)
  * flags). The actual kick-off is left to the caller.
  */
 static __latent_entropy struct task_struct *copy_process(
-					unsigned long clone_flags,
-					unsigned long stack_start,
-					unsigned long stack_size,
-					int __user *parent_tidptr,
-					int __user *child_tidptr,
 					struct pid *pid,
 					int trace,
-					unsigned long tls,
-					int node)
+					int node,
+					struct kernel_clone_args *args)
 {
 	int pidfd = -1, retval;
 	struct task_struct *p;
 	struct multiprocess_signals delayed;
+	u64 clone_flags = args->flags;
 
 	/*
 	 * Don't allow sharing the root directory with processes in a different
@@ -1821,27 +1817,12 @@ static __latent_entropy struct task_struct *copy_process(
 	}
 
 	if (clone_flags & CLONE_PIDFD) {
-		int reserved;
-
 		/*
-		 * - CLONE_PARENT_SETTID is useless for pidfds and also
-		 *   parent_tidptr is used to return pidfds.
 		 * - CLONE_DETACHED is blocked so that we can potentially
 		 *   reuse it later for CLONE_PIDFD.
 		 * - CLONE_THREAD is blocked until someone really needs it.
 		 */
-		if (clone_flags &
-		    (CLONE_DETACHED | CLONE_PARENT_SETTID | CLONE_THREAD))
-			return ERR_PTR(-EINVAL);
-
-		/*
-		 * Verify that parent_tidptr is sane so we can potentially
-		 * reuse it later.
-		 */
-		if (get_user(reserved, parent_tidptr))
-			return ERR_PTR(-EFAULT);
-
-		if (reserved != 0)
+		if (clone_flags & (CLONE_DETACHED | CLONE_THREAD))
 			return ERR_PTR(-EINVAL);
 	}
 
@@ -1874,11 +1855,11 @@ static __latent_entropy struct task_struct *copy_process(
 	 * p->set_child_tid which is (ab)used as a kthread's data pointer for
 	 * kernel threads (PF_KTHREAD).
 	 */
-	p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL;
+	p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? args->child_tid : NULL;
 	/*
 	 * Clear TID on mm_release()?
 	 */
-	p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr : NULL;
+	p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? args->child_tid : NULL;
 
 	ftrace_graph_init_task(p);
 
@@ -2037,7 +2018,8 @@ static __latent_entropy struct task_struct *copy_process(
 	retval = copy_io(clone_flags, p);
 	if (retval)
 		goto bad_fork_cleanup_namespaces;
-	retval = copy_thread_tls(clone_flags, stack_start, stack_size, p, tls);
+	retval = copy_thread_tls(clone_flags, args->stack, args->stack_size, p,
+				 args->tls);
 	if (retval)
 		goto bad_fork_cleanup_io;
 
@@ -2062,7 +2044,7 @@ static __latent_entropy struct task_struct *copy_process(
 			goto bad_fork_free_pid;
 
 		pidfd = retval;
-		retval = put_user(pidfd, parent_tidptr);
+		retval = put_user(pidfd, args->pidfd);
 		if (retval)
 			goto bad_fork_put_pidfd;
 	}
@@ -2105,7 +2087,7 @@ static __latent_entropy struct task_struct *copy_process(
 		if (clone_flags & CLONE_PARENT)
 			p->exit_signal = current->group_leader->exit_signal;
 		else
-			p->exit_signal = (clone_flags & CSIGNAL);
+			p->exit_signal = args->exit_signal;
 		p->group_leader = p;
 		p->tgid = p->pid;
 	}
@@ -2313,8 +2295,11 @@ static inline void init_idle_pids(struct task_struct *idle)
 struct task_struct *fork_idle(int cpu)
 {
 	struct task_struct *task;
-	task = copy_process(CLONE_VM, 0, 0, NULL, NULL, &init_struct_pid, 0, 0,
-			    cpu_to_node(cpu));
+	struct kernel_clone_args args = {
+		.flags = CLONE_VM,
+	};
+
+	task = copy_process(&init_struct_pid, 0, cpu_to_node(cpu), &args);
 	if (!IS_ERR(task)) {
 		init_idle_pids(task);
 		init_idle(task, cpu);
@@ -2334,13 +2319,9 @@ struct mm_struct *copy_init_mm(void)
  * It copies the process, and if successful kick-starts
  * it and waits for it to finish using the VM if required.
  */
-long _do_fork(unsigned long clone_flags,
-	      unsigned long stack_start,
-	      unsigned long stack_size,
-	      int __user *parent_tidptr,
-	      int __user *child_tidptr,
-	      unsigned long tls)
+long _do_fork(struct kernel_clone_args *args)
 {
+	u64 clone_flags = args->flags;
 	struct completion vfork;
 	struct pid *pid;
 	struct task_struct *p;
@@ -2356,7 +2337,7 @@ long _do_fork(unsigned long clone_flags,
 	if (!(clone_flags & CLONE_UNTRACED)) {
 		if (clone_flags & CLONE_VFORK)
 			trace = PTRACE_EVENT_VFORK;
-		else if ((clone_flags & CSIGNAL) != SIGCHLD)
+		else if (args->exit_signal != SIGCHLD)
 			trace = PTRACE_EVENT_CLONE;
 		else
 			trace = PTRACE_EVENT_FORK;
@@ -2365,8 +2346,7 @@ long _do_fork(unsigned long clone_flags,
 			trace = 0;
 	}
 
-	p = copy_process(clone_flags, stack_start, stack_size, parent_tidptr,
-			 child_tidptr, NULL, trace, tls, NUMA_NO_NODE);
+	p = copy_process(NULL, trace, NUMA_NO_NODE, args);
 	add_latent_entropy();
 
 	if (IS_ERR(p))
@@ -2382,7 +2362,7 @@ long _do_fork(unsigned long clone_flags,
 	nr = pid_vnr(pid);
 
 	if (clone_flags & CLONE_PARENT_SETTID)
-		put_user(nr, parent_tidptr);
+		put_user(nr, args->parent_tid);
 
 	if (clone_flags & CLONE_VFORK) {
 		p->vfork_done = &vfork;
@@ -2414,8 +2394,16 @@ long do_fork(unsigned long clone_flags,
 	      int __user *parent_tidptr,
 	      int __user *child_tidptr)
 {
-	return _do_fork(clone_flags, stack_start, stack_size,
-			parent_tidptr, child_tidptr, 0);
+	struct kernel_clone_args args = {
+		.flags		= (clone_flags & ~CSIGNAL),
+		.child_tid	= child_tidptr,
+		.parent_tid	= parent_tidptr,
+		.exit_signal	= (clone_flags & CSIGNAL),
+		.stack		= stack_start,
+		.stack_size	= stack_size,
+	};
+
+	return _do_fork(&args);
 }
 #endif
 
@@ -2424,15 +2412,25 @@ long do_fork(unsigned long clone_flags,
  */
 pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)
 {
-	return _do_fork(flags|CLONE_VM|CLONE_UNTRACED, (unsigned long)fn,
-		(unsigned long)arg, NULL, NULL, 0);
+	struct kernel_clone_args args = {
+		.flags		= ((flags | CLONE_VM | CLONE_UNTRACED) & ~CSIGNAL),
+		.exit_signal	= (flags & CSIGNAL),
+		.stack		= (unsigned long)fn,
+		.stack_size	= (unsigned long)arg,
+	};
+
+	return _do_fork(&args);
 }
 
 #ifdef __ARCH_WANT_SYS_FORK
 SYSCALL_DEFINE0(fork)
 {
 #ifdef CONFIG_MMU
-	return _do_fork(SIGCHLD, 0, 0, NULL, NULL, 0);
+	struct kernel_clone_args args = {
+		.exit_signal = SIGCHLD,
+	};
+
+	return _do_fork(&args);
 #else
 	/* can not support in nommu mode */
 	return -EINVAL;
@@ -2443,8 +2441,12 @@ SYSCALL_DEFINE0(fork)
 #ifdef __ARCH_WANT_SYS_VFORK
 SYSCALL_DEFINE0(vfork)
 {
-	return _do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, 0,
-			0, NULL, NULL, 0);
+	struct kernel_clone_args args = {
+		.flags		= CLONE_VFORK | CLONE_VM,
+		.exit_signal	= SIGCHLD,
+	};
+
+	return _do_fork(&args);
 }
 #endif
 
@@ -2472,7 +2474,110 @@ SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
 		 unsigned long, tls)
 #endif
 {
-	return _do_fork(clone_flags, newsp, 0, parent_tidptr, child_tidptr, tls);
+	struct kernel_clone_args args = {
+		.flags		= (clone_flags & ~CSIGNAL),
+		.pidfd		= parent_tidptr,
+		.child_tid	= child_tidptr,
+		.parent_tid	= parent_tidptr,
+		.exit_signal	= (clone_flags & CSIGNAL),
+		.stack		= newsp,
+		.tls		= tls,
+	};
+
+	/* clone(CLONE_PIDFD) uses parent_tidptr to return a pidfd */
+	if ((clone_flags & CLONE_PIDFD) && (clone_flags & CLONE_PARENT_SETTID))
+		return -EINVAL;
+
+	return _do_fork(&args);
+}
+
+noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs,
+					      struct clone_args __user *uargs,
+					      size_t size)
+{
+	struct clone_args args;
+
+	if (unlikely(size > PAGE_SIZE))
+		return -E2BIG;
+
+	if (unlikely(size < sizeof(struct clone_args)))
+		return -EINVAL;
+
+	if (unlikely(!access_ok(uargs, size)))
+		return -EFAULT;
+
+	if (size > sizeof(struct clone_args)) {
+		unsigned char __user *addr;
+		unsigned char __user *end;
+		unsigned char val;
+
+		addr = (void __user *)uargs + sizeof(struct clone_args);
+		end = (void __user *)uargs + size;
+
+		for (; addr < end; addr++) {
+			if (get_user(val, addr))
+				return -EFAULT;
+			if (val)
+				return -E2BIG;
+		}
+
+		size = sizeof(struct clone_args);
+	}
+
+	if (copy_from_user(&args, uargs, size))
+		return -EFAULT;
+
+	*kargs = (struct kernel_clone_args){
+		.flags		= args.flags,
+		.pidfd		= u64_to_user_ptr(args.pidfd),
+		.child_tid	= u64_to_user_ptr(args.child_tid),
+		.parent_tid	= u64_to_user_ptr(args.parent_tid),
+		.exit_signal	= args.exit_signal,
+		.stack		= args.stack,
+		.stack_size	= args.stack_size,
+		.tls		= args.tls,
+	};
+
+	return 0;
+}
+
+static bool clone3_args_valid(const struct kernel_clone_args *kargs)
+{
+	/*
+	 * All lower bits of the flag word are taken.
+	 * Verify that no other unknown flags are passed along.
+	 */
+	if (kargs->flags & ~CLONE_LEGACY_FLAGS)
+		return false;
+
+	/*
+	 * - make the CLONE_DETACHED bit reuseable for clone3
+	 * - make the CSIGNAL bits reuseable for clone3
+	 */
+	if (kargs->flags & (CLONE_DETACHED | CSIGNAL))
+		return false;
+
+	if ((kargs->flags & (CLONE_THREAD | CLONE_PARENT)) &&
+	    kargs->exit_signal)
+		return false;
+
+	return true;
+}
+
+SYSCALL_DEFINE2(clone3, struct clone_args __user *, uargs, size_t, size)
+{
+	int err;
+
+	struct kernel_clone_args kargs;
+
+	err = copy_clone_args_from_user(&kargs, uargs, size);
+	if (err)
+		return err;
+
+	if (!clone3_args_valid(&kargs))
+		return -EINVAL;
+
+	return _do_fork(&kargs);
 }
 #endif
 
-- 
cgit v1.2.3


From f652e66fcca07e59f207bcca27c5566193feabd5 Mon Sep 17 00:00:00 2001
From: Masahiro Yamada <yamada.masahiro@socionext.com>
Date: Sun, 9 Jun 2019 23:43:13 +0900
Subject: pinctrl: add include guard to pinctrl-state.h

Signed-off-by: Masahiro Yamada <yamada.masahiro@socionext.com>
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
---
 include/linux/pinctrl/pinctrl-state.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/pinctrl/pinctrl-state.h b/include/linux/pinctrl/pinctrl-state.h
index a0e785815a64..635d97e9285e 100644
--- a/include/linux/pinctrl/pinctrl-state.h
+++ b/include/linux/pinctrl/pinctrl-state.h
@@ -3,6 +3,9 @@
  * Standard pin control state definitions
  */
 
+#ifndef __LINUX_PINCTRL_PINCTRL_STATE_H
+#define __LINUX_PINCTRL_PINCTRL_STATE_H
+
 /**
  * @PINCTRL_STATE_DEFAULT: the state the pinctrl handle shall be put
  *	into as default, usually this means the pins are up and ready to
@@ -31,3 +34,5 @@
 #define PINCTRL_STATE_INIT "init"
 #define PINCTRL_STATE_IDLE "idle"
 #define PINCTRL_STATE_SLEEP "sleep"
+
+#endif /* __LINUX_PINCTRL_PINCTRL_STATE_H */
-- 
cgit v1.2.3


From f16acc9d9b3761ae5e45219c9302f99e20919829 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Wed, 5 Jun 2019 08:04:47 -0700
Subject: vfs: introduce generic_copy_file_range()

Right now if vfs_copy_file_range() does not use any offload
mechanism, it falls back to calling do_splice_direct(). This fails
to do basic sanity checks on the files being copied. Before we
start adding this necessary functionality to the fallback path,
separate it out into generic_copy_file_range().

generic_copy_file_range() has the same prototype as
->copy_file_range() so that filesystems can use it in their custom
->copy_file_range() method if they so choose.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Amir Goldstein <amir73il@gmail.com>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/read_write.c    | 35 ++++++++++++++++++++++++++++++++---
 include/linux/fs.h |  3 +++
 2 files changed, 35 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/fs/read_write.c b/fs/read_write.c
index c543d965e288..676b02fae589 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -1565,6 +1565,36 @@ COMPAT_SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd,
 }
 #endif
 
+/**
+ * generic_copy_file_range - copy data between two files
+ * @file_in:	file structure to read from
+ * @pos_in:	file offset to read from
+ * @file_out:	file structure to write data to
+ * @pos_out:	file offset to write data to
+ * @len:	amount of data to copy
+ * @flags:	copy flags
+ *
+ * This is a generic filesystem helper to copy data from one file to another.
+ * It has no constraints on the source or destination file owners - the files
+ * can belong to different superblocks and different filesystem types. Short
+ * copies are allowed.
+ *
+ * This should be called from the @file_out filesystem, as per the
+ * ->copy_file_range() method.
+ *
+ * Returns the number of bytes copied or a negative error indicating the
+ * failure.
+ */
+
+ssize_t generic_copy_file_range(struct file *file_in, loff_t pos_in,
+				struct file *file_out, loff_t pos_out,
+				size_t len, unsigned int flags)
+{
+	return do_splice_direct(file_in, &pos_in, file_out, &pos_out,
+				len > MAX_RW_COUNT ? MAX_RW_COUNT : len, 0);
+}
+EXPORT_SYMBOL(generic_copy_file_range);
+
 /*
  * copy_file_range() differs from regular file read and write in that it
  * specifically allows return partial success.  When it does so is up to
@@ -1632,9 +1662,8 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
 			goto done;
 	}
 
-	ret = do_splice_direct(file_in, &pos_in, file_out, &pos_out,
-			len > MAX_RW_COUNT ? MAX_RW_COUNT : len, 0);
-
+	ret = generic_copy_file_range(file_in, pos_in, file_out, pos_out, len,
+				      flags);
 done:
 	if (ret > 0) {
 		fsnotify_access(file_in);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index f7fdfe93e25d..ea17858310ff 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1889,6 +1889,9 @@ extern ssize_t vfs_readv(struct file *, const struct iovec __user *,
 		unsigned long, loff_t *, rwf_t);
 extern ssize_t vfs_copy_file_range(struct file *, loff_t , struct file *,
 				   loff_t, size_t, unsigned int);
+extern ssize_t generic_copy_file_range(struct file *file_in, loff_t pos_in,
+				       struct file *file_out, loff_t pos_out,
+				       size_t len, unsigned int flags);
 extern int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
 					 struct file *file_out, loff_t pos_out,
 					 loff_t *count,
-- 
cgit v1.2.3


From a31713517dac0862a3f0ec9006df9160ce022b0c Mon Sep 17 00:00:00 2001
From: Amir Goldstein <amir73il@gmail.com>
Date: Wed, 5 Jun 2019 08:04:48 -0700
Subject: vfs: introduce generic_file_rw_checks()

Factor out helper with some checks on in/out file that are
common to clone_file_range and copy_file_range.

Suggested-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Amir Goldstein <amir73il@gmail.com>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/read_write.c    | 38 +++++++++++---------------------------
 include/linux/fs.h |  1 +
 mm/filemap.c       | 24 ++++++++++++++++++++++++
 3 files changed, 36 insertions(+), 27 deletions(-)

(limited to 'include/linux')

diff --git a/fs/read_write.c b/fs/read_write.c
index b63dcb4e4fe9..f1900bdb3127 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -1617,17 +1617,18 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
 			    struct file *file_out, loff_t pos_out,
 			    size_t len, unsigned int flags)
 {
-	struct inode *inode_in = file_inode(file_in);
-	struct inode *inode_out = file_inode(file_out);
 	ssize_t ret;
 
 	if (flags != 0)
 		return -EINVAL;
 
-	if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode))
-		return -EISDIR;
-	if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode))
-		return -EINVAL;
+	/* this could be relaxed once a method supports cross-fs copies */
+	if (file_inode(file_in)->i_sb != file_inode(file_out)->i_sb)
+		return -EXDEV;
+
+	ret = generic_file_rw_checks(file_in, file_out);
+	if (unlikely(ret))
+		return ret;
 
 	ret = rw_verify_area(READ, file_in, &pos_in, len);
 	if (unlikely(ret))
@@ -1637,15 +1638,6 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
 	if (unlikely(ret))
 		return ret;
 
-	if (!(file_in->f_mode & FMODE_READ) ||
-	    !(file_out->f_mode & FMODE_WRITE) ||
-	    (file_out->f_flags & O_APPEND))
-		return -EBADF;
-
-	/* this could be relaxed once a method supports cross-fs copies */
-	if (inode_in->i_sb != inode_out->i_sb)
-		return -EXDEV;
-
 	if (len == 0)
 		return 0;
 
@@ -2013,29 +2005,21 @@ loff_t do_clone_file_range(struct file *file_in, loff_t pos_in,
 			   struct file *file_out, loff_t pos_out,
 			   loff_t len, unsigned int remap_flags)
 {
-	struct inode *inode_in = file_inode(file_in);
-	struct inode *inode_out = file_inode(file_out);
 	loff_t ret;
 
 	WARN_ON_ONCE(remap_flags & REMAP_FILE_DEDUP);
 
-	if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode))
-		return -EISDIR;
-	if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode))
-		return -EINVAL;
-
 	/*
 	 * FICLONE/FICLONERANGE ioctls enforce that src and dest files are on
 	 * the same mount. Practically, they only need to be on the same file
 	 * system.
 	 */
-	if (inode_in->i_sb != inode_out->i_sb)
+	if (file_inode(file_in)->i_sb != file_inode(file_out)->i_sb)
 		return -EXDEV;
 
-	if (!(file_in->f_mode & FMODE_READ) ||
-	    !(file_out->f_mode & FMODE_WRITE) ||
-	    (file_out->f_flags & O_APPEND))
-		return -EBADF;
+	ret = generic_file_rw_checks(file_in, file_out);
+	if (ret < 0)
+		return ret;
 
 	if (!file_in->f_op->remap_file_range)
 		return -EOPNOTSUPP;
diff --git a/include/linux/fs.h b/include/linux/fs.h
index ea17858310ff..89b9b73eb581 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -3049,6 +3049,7 @@ extern ssize_t generic_write_checks(struct kiocb *, struct iov_iter *);
 extern int generic_remap_checks(struct file *file_in, loff_t pos_in,
 				struct file *file_out, loff_t pos_out,
 				loff_t *count, unsigned int remap_flags);
+extern int generic_file_rw_checks(struct file *file_in, struct file *file_out);
 extern ssize_t generic_file_read_iter(struct kiocb *, struct iov_iter *);
 extern ssize_t __generic_file_write_iter(struct kiocb *, struct iov_iter *);
 extern ssize_t generic_file_write_iter(struct kiocb *, struct iov_iter *);
diff --git a/mm/filemap.c b/mm/filemap.c
index df2006ba0cfa..a38619a4a6af 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -3041,6 +3041,30 @@ int generic_remap_checks(struct file *file_in, loff_t pos_in,
 	return 0;
 }
 
+
+/*
+ * Performs the checks shared by file copy and clone before moving data
+ * from @file_in to @file_out. Returns 0 or a negative errno.
+ */
+int generic_file_rw_checks(struct file *file_in, struct file *file_out)
+{
+	struct inode *src = file_inode(file_in);
+	struct inode *dst = file_inode(file_out);
+
+	/* Directories get -EISDIR, any other non-regular file -EINVAL */
+	if (S_ISDIR(src->i_mode) || S_ISDIR(dst->i_mode))
+		return -EISDIR;
+	if (!(S_ISREG(src->i_mode) && S_ISREG(dst->i_mode)))
+		return -EINVAL;
+
+	if (!(file_in->f_mode & FMODE_READ))
+		return -EBADF;
+	if (!(file_out->f_mode & FMODE_WRITE) || (file_out->f_flags & O_APPEND))
+		return -EBADF;
+
+	return 0;
+}
+
 int pagecache_write_begin(struct file *file, struct address_space *mapping,
 				loff_t pos, unsigned len, unsigned flags,
 				struct page **pagep, void **fsdata)
-- 
cgit v1.2.3


From 96e6e8f4a68df2d94800311163faa67124df24e5 Mon Sep 17 00:00:00 2001
From: Amir Goldstein <amir73il@gmail.com>
Date: Wed, 5 Jun 2019 08:04:49 -0700
Subject: vfs: add missing checks to copy_file_range

Like the clone and dedupe interfaces we've recently fixed, the
copy_file_range() implementation is missing basic sanity, limits and
boundary condition tests on the parameters that are passed to it
from userspace. Create a new "generic_copy_file_checks()" function
modelled on the generic_remap_checks() function to provide this
missing functionality.

[Amir] Shorten copy length instead of checking pos_in limits
because input file size already abides by the limits.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Amir Goldstein <amir73il@gmail.com>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/read_write.c    |  3 ++-
 include/linux/fs.h |  3 +++
 mm/filemap.c       | 53 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 58 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/fs/read_write.c b/fs/read_write.c
index f1900bdb3127..b0fb1176b628 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -1626,7 +1626,8 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
 	if (file_inode(file_in)->i_sb != file_inode(file_out)->i_sb)
 		return -EXDEV;
 
-	ret = generic_file_rw_checks(file_in, file_out);
+	ret = generic_copy_file_checks(file_in, pos_in, file_out, pos_out, &len,
+				       flags);
 	if (unlikely(ret))
 		return ret;
 
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 89b9b73eb581..e4d382c4342a 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -3050,6 +3050,9 @@ extern int generic_remap_checks(struct file *file_in, loff_t pos_in,
 				struct file *file_out, loff_t pos_out,
 				loff_t *count, unsigned int remap_flags);
 extern int generic_file_rw_checks(struct file *file_in, struct file *file_out);
+extern int generic_copy_file_checks(struct file *file_in, loff_t pos_in,
+				    struct file *file_out, loff_t pos_out,
+				    size_t *count, unsigned int flags);
 extern ssize_t generic_file_read_iter(struct kiocb *, struct iov_iter *);
 extern ssize_t __generic_file_write_iter(struct kiocb *, struct iov_iter *);
 extern ssize_t generic_file_write_iter(struct kiocb *, struct iov_iter *);
diff --git a/mm/filemap.c b/mm/filemap.c
index 44361928bbb0..aac71aef4c61 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -3056,6 +3056,59 @@ int generic_file_rw_checks(struct file *file_in, struct file *file_out)
 	return 0;
 }
 
+/*
+ * Performs the necessary checks before doing a file copy.
+ *
+ * May shorten the number of bytes to copy, written back via @req_count.
+ * Returns the error code that the caller should return, or zero if the
+ * copy should be allowed. @flags is accepted but not examined here.
+ */
+int generic_copy_file_checks(struct file *file_in, loff_t pos_in,
+			     struct file *file_out, loff_t pos_out,
+			     size_t *req_count, unsigned int flags)
+{
+	struct inode *inode_in = file_inode(file_in);
+	struct inode *inode_out = file_inode(file_out);
+	uint64_t count = *req_count;
+	loff_t size_in;
+	int ret;
+
+	ret = generic_file_rw_checks(file_in, file_out);
+	if (ret)
+		return ret;
+
+	/* Refuse immutable destinations and swapfiles on either side */
+	if (IS_IMMUTABLE(inode_out))
+		return -EPERM;
+
+	if (IS_SWAPFILE(inode_in) || IS_SWAPFILE(inode_out))
+		return -ETXTBSY;
+
+	/* Ensure neither offset wraps when the length is added. */
+	if (pos_in + count < pos_in || pos_out + count < pos_out)
+		return -EOVERFLOW;
+
+	/* Shorten the copy to the source EOF; at or past EOF copies nothing */
+	size_in = i_size_read(inode_in);
+	if (pos_in >= size_in)
+		count = 0;
+	else
+		count = min(count, size_in - (uint64_t)pos_in);
+
+	ret = generic_write_check_limits(file_out, pos_out, &count);
+	if (ret)
+		return ret;
+
+	/* Reject source and destination ranges that overlap in one file. */
+	if (inode_in == inode_out &&
+	    pos_out + count > pos_in &&
+	    pos_out < pos_in + count)
+		return -EINVAL;
+
+	*req_count = count;
+	return 0;
+}
+
 int pagecache_write_begin(struct file *file, struct address_space *mapping,
 				loff_t pos, unsigned len, unsigned flags,
 				struct page **pagep, void **fsdata)
-- 
cgit v1.2.3


From e38f7f53c35213b1cbce70eee5de7ced17f40d4a Mon Sep 17 00:00:00 2001
From: Amir Goldstein <amir73il@gmail.com>
Date: Wed, 5 Jun 2019 08:04:49 -0700
Subject: vfs: introduce file_modified() helper

The combination of file_remove_privs() and file_update_mtime() is
quite common in filesystem ->write_iter() methods.

Modelled after the helper file_accessed(), introduce file_modified()
and use it from generic_remap_file_range_prep().

Note that the order of calling file_remove_privs() before
file_update_mtime() in the helper was matched to the more common order by
filesystems and not the current order in generic_remap_file_range_prep().

Signed-off-by: Amir Goldstein <amir73il@gmail.com>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/inode.c         | 20 ++++++++++++++++++++
 fs/read_write.c    | 21 +++------------------
 include/linux/fs.h |  2 ++
 3 files changed, 25 insertions(+), 18 deletions(-)

(limited to 'include/linux')

diff --git a/fs/inode.c b/fs/inode.c
index df6542ec3b88..4348cfb14562 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -1899,6 +1899,26 @@ int file_update_time(struct file *file)
 }
 EXPORT_SYMBOL(file_update_time);
 
+/* Strip privileges, then refresh timestamps; caller holds the inode lock */
+int file_modified(struct file *file)
+{
+	/*
+	 * Clear the security bits if the process is not being run by root.
+	 * This keeps people from modifying setuid and setgid binaries.
+	 */
+	int ret = file_remove_privs(file);
+
+	if (ret)
+		return ret;
+
+	/* Files opened with FMODE_NOCMTIME skip the timestamp update */
+	if (!unlikely(file->f_mode & FMODE_NOCMTIME))
+		ret = file_update_time(file);
+
+	return ret;
+}
+EXPORT_SYMBOL(file_modified);
+
 int inode_needs_sync(struct inode *inode)
 {
 	if (IS_SYNC(inode))
diff --git a/fs/read_write.c b/fs/read_write.c
index b0fb1176b628..cec7e7b1f693 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -1980,25 +1980,10 @@ int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
 		return ret;
 
 	/* If can't alter the file contents, we're done. */
-	if (!(remap_flags & REMAP_FILE_DEDUP)) {
-		/* Update the timestamps, since we can alter file contents. */
-		if (!(file_out->f_mode & FMODE_NOCMTIME)) {
-			ret = file_update_time(file_out);
-			if (ret)
-				return ret;
-		}
+	if (!(remap_flags & REMAP_FILE_DEDUP))
+		ret = file_modified(file_out);
 
-		/*
-		 * Clear the security bits if the process is not being run by
-		 * root.  This keeps people from modifying setuid and setgid
-		 * binaries.
-		 */
-		ret = file_remove_privs(file_out);
-		if (ret)
-			return ret;
-	}
-
-	return 0;
+	return ret;
 }
 EXPORT_SYMBOL(generic_remap_file_range_prep);
 
diff --git a/include/linux/fs.h b/include/linux/fs.h
index e4d382c4342a..79ffa2958bd8 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2177,6 +2177,8 @@ static inline void file_accessed(struct file *file)
 		touch_atime(&file->f_path);
 }
 
+extern int file_modified(struct file *file);
+
 int sync_inode(struct inode *inode, struct writeback_control *wbc);
 int sync_inode_metadata(struct inode *inode, int wait);
 
-- 
cgit v1.2.3


From e63d79d1ffcd2201a2dbff1d7a1184b8f3ec74cf Mon Sep 17 00:00:00 2001
From: Gustavo Pimentel <Gustavo.Pimentel@synopsys.com>
Date: Tue, 4 Jun 2019 15:29:22 +0200
Subject: dmaengine: Add Synopsys eDMA IP core driver

Add Synopsys PCIe Endpoint eDMA IP core driver to kernel.

This IP is generally distributed with Synopsys PCIe Endpoint IP (depends
on the use and licensing agreement).

This core driver initializes and configures the eDMA IP using the
virt-dma helper functions and the dmaengine subsystem.

This driver can be compiled as a built-in or as an external kernel module.

To enable this driver just select the DW_EDMA option in the kernel
configuration; it requires and automatically selects the DMA_ENGINE and
DMA_VIRTUAL_CHANNELS options too.

In order to transfer data from point A to B as fast as possible this IP
requires a dedicated memory space containing linked list of elements.

All elements of this linked list are continuous and each one describes a
data transfer (source and destination addresses, length and a control
variable).

For the sake of simplicity, let's assume a memory space for write
channel 0 which allows about 42 elements.

+---------+
| Desc #0 |-+
+---------+ |
            V
       +----------+
       | Chunk #0 |-+
       |  CB = 1  | |  +----------+  +-----+  +-----------+  +-----+
       +----------+ +->| Burst #0 |->| ... |->| Burst #41 |->| llp |
            |          +----------+  +-----+  +-----------+  +-----+
            V
       +----------+
       | Chunk #1 |-+
       |  CB = 0  | |  +-----------+  +-----+  +-----------+  +-----+
       +----------+ +->| Burst #42 |->| ... |->| Burst #83 |->| llp |
            |          +-----------+  +-----+  +-----------+  +-----+
            V
       +----------+
       | Chunk #2 |-+
       |  CB = 1  | |  +-----------+  +-----+  +------------+  +-----+
       +----------+ +->| Burst #84 |->| ... |->| Burst #125 |->| llp |
            |          +-----------+  +-----+  +------------+  +-----+
            V
       +----------+
       | Chunk #3 |-+
       |  CB = 0  | |  +------------+  +-----+  +------------+  +-----+
       +----------+ +->| Burst #126 |->| ... |->| Burst #129 |->| llp |
                       +------------+  +-----+  +------------+  +-----+

Legend:
 - Linked list, also known as Chunk
 - Linked list element, also known as Burst
 - CB, also known as Change Bit, is a control bit (typically toggled)
that makes it easy to identify and differentiate between the current
linked list and the previous or the next one.
 - LLP is a special element that indicates the end of the linked list
element stream and also informs that the next CB should be toggled

On every last Burst of the Chunk (Burst #41, Burst #83, Burst #125 or
even Burst #129) some flags are set in its control variable (RIE and
LIE bits) that will trigger the sending of a "done" interrupt.

In the interrupt callback, it is decided whether to recycle the linked
list memory space by writing a new set of Burst elements (if there are
still Chunks to transfer) or whether the transfer is considered
completed (if there are no Chunks left to transfer).

In scatter-gather transfer mode, the client will submit a scatter-gather
list of n (in this case 130) elements, which will be divided into
multiple Chunks. Each Chunk will have a limited number of Bursts (in
this case 42) and, after transferring all its Bursts, an interrupt will
be triggered, which allows the whole dedicated linked list memory to be
recycled with the information for the next Chunk and its associated
Bursts, repeating the whole cycle again.

In cyclic transfer mode, the client will submit a buffer pointer, its
length and the number of repetitions; in this case each burst
corresponds directly to one repetition.

Each Burst can describe a data transfer from point A (source) to point
B (destination) with a length that can be from 1 byte up to 4 GB. Since
the dedicated memory space where the linked list resides is limited,
the whole n burst elements will be organized in several Chunks, which
will be used later to recycle the dedicated memory space to initiate a
new sequence of data transfers.

The whole transfer is considered completed when all bursts have been
transferred.

Currently this IP has a well-known register map, which includes
support for legacy and unroll modes. Legacy mode is the version of this
register map that has a multiplexer register that allows switching
registers between all write and read channels, and the unroll mode
repeats all write and read channel registers with an offset between
them. This register map is called v0.

The IP team is creating a new register map more suitable to the latest
PCIe features, which will very likely change the register map; that
version will be called v1. As soon as this new version is released by
the IP team, support for it will be included in this driver.

According to the logic, patches 1, 2 and 3 should be squashed into 1
unique patch, but for the sake of simplicity of review, it was divided
into these 3 patch files.

Signed-off-by: Gustavo Pimentel <gustavo.pimentel@synopsys.com>
Cc: Vinod Koul <vkoul@kernel.org>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Cc: Russell King <rmk+kernel@armlinux.org.uk>
Cc: Joao Pinto <jpinto@synopsys.com>
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 drivers/dma/Kconfig                |   2 +
 drivers/dma/Makefile               |   1 +
 drivers/dma/dw-edma/Kconfig        |   9 +
 drivers/dma/dw-edma/Makefile       |   4 +
 drivers/dma/dw-edma/dw-edma-core.c | 936 +++++++++++++++++++++++++++++++++++++
 drivers/dma/dw-edma/dw-edma-core.h | 165 +++++++
 include/linux/dma/edma.h           |  47 ++
 7 files changed, 1164 insertions(+)
 create mode 100644 drivers/dma/dw-edma/Kconfig
 create mode 100644 drivers/dma/dw-edma/Makefile
 create mode 100644 drivers/dma/dw-edma/dw-edma-core.c
 create mode 100644 drivers/dma/dw-edma/dw-edma-core.h
 create mode 100644 include/linux/dma/edma.h

(limited to 'include/linux')

diff --git a/drivers/dma/Kconfig b/drivers/dma/Kconfig
index eaf78f4e07ce..76859aa2688c 100644
--- a/drivers/dma/Kconfig
+++ b/drivers/dma/Kconfig
@@ -665,6 +665,8 @@ source "drivers/dma/qcom/Kconfig"
 
 source "drivers/dma/dw/Kconfig"
 
+source "drivers/dma/dw-edma/Kconfig"
+
 source "drivers/dma/hsu/Kconfig"
 
 source "drivers/dma/sh/Kconfig"
diff --git a/drivers/dma/Makefile b/drivers/dma/Makefile
index 6126e1c3a875..5bddf6f8790f 100644
--- a/drivers/dma/Makefile
+++ b/drivers/dma/Makefile
@@ -29,6 +29,7 @@ obj-$(CONFIG_DMA_SUN4I) += sun4i-dma.o
 obj-$(CONFIG_DMA_SUN6I) += sun6i-dma.o
 obj-$(CONFIG_DW_AXI_DMAC) += dw-axi-dmac/
 obj-$(CONFIG_DW_DMAC_CORE) += dw/
+obj-$(CONFIG_DW_EDMA) += dw-edma/
 obj-$(CONFIG_EP93XX_DMA) += ep93xx_dma.o
 obj-$(CONFIG_FSL_DMA) += fsldma.o
 obj-$(CONFIG_FSL_EDMA) += fsl-edma.o fsl-edma-common.o
diff --git a/drivers/dma/dw-edma/Kconfig b/drivers/dma/dw-edma/Kconfig
new file mode 100644
index 000000000000..3016bed63589
--- /dev/null
+++ b/drivers/dma/dw-edma/Kconfig
@@ -0,0 +1,9 @@
+# SPDX-License-Identifier: GPL-2.0
+
+config DW_EDMA
+	tristate "Synopsys DesignWare eDMA controller driver"
+	select DMA_ENGINE
+	select DMA_VIRTUAL_CHANNELS
+	help
+	  Support the Synopsys DesignWare eDMA controller, normally
+	  implemented on endpoints SoCs.
diff --git a/drivers/dma/dw-edma/Makefile b/drivers/dma/dw-edma/Makefile
new file mode 100644
index 000000000000..322401089891
--- /dev/null
+++ b/drivers/dma/dw-edma/Makefile
@@ -0,0 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0
+
+obj-$(CONFIG_DW_EDMA)		+= dw-edma.o
+dw-edma-objs			:= dw-edma-core.o
diff --git a/drivers/dma/dw-edma/dw-edma-core.c b/drivers/dma/dw-edma/dw-edma-core.c
new file mode 100644
index 000000000000..c9d032f49dc3
--- /dev/null
+++ b/drivers/dma/dw-edma/dw-edma-core.c
@@ -0,0 +1,936 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2018-2019 Synopsys, Inc. and/or its affiliates.
+ * Synopsys DesignWare eDMA core driver
+ *
+ * Author: Gustavo Pimentel <gustavo.pimentel@synopsys.com>
+ */
+
+#include <linux/module.h>
+#include <linux/device.h>
+#include <linux/kernel.h>
+#include <linux/pm_runtime.h>
+#include <linux/dmaengine.h>
+#include <linux/err.h>
+#include <linux/interrupt.h>
+#include <linux/dma/edma.h>
+#include <linux/pci.h>
+
+#include "dw-edma-core.h"
+#include "../dmaengine.h"
+#include "../virt-dma.h"
+
+static inline struct device *dchan2dev(struct dma_chan *dchan)
+{
+	/* struct device of the dmaengine channel */
+	return &dchan->dev->device;
+}
+
+static inline struct device *chan2dev(struct dw_edma_chan *chan)
+{
+	/* struct device of the dma_chan reached through chan->vc */
+	return &chan->vc.chan.dev->device;
+}
+
+static inline struct dw_edma_desc *vd2dw_edma_desc(struct virt_dma_desc *vd)
+{
+	/* @vd is the vd member embedded in a struct dw_edma_desc */
+	return container_of(vd, struct dw_edma_desc, vd);
+}
+
+static struct dw_edma_burst *dw_edma_alloc_burst(struct dw_edma_chunk *chunk)
+{
+	struct dw_edma_burst *node = kzalloc(sizeof(*node), GFP_NOWAIT);
+
+	if (unlikely(!node))
+		return NULL;
+
+	INIT_LIST_HEAD(&node->list);
+	if (!chunk->burst) {
+		/* First allocation: this node becomes the list head */
+		chunk->bursts_alloc = 0;
+		chunk->burst = node;
+		return node;
+	}
+
+	/* Head already present: append the node and count it */
+	list_add_tail(&node->list, &chunk->burst->list);
+	chunk->bursts_alloc++;
+
+	return node;
+}
+
+/* Allocate a chunk for @desc; the first one becomes the chunk list head */
+static struct dw_edma_chunk *dw_edma_alloc_chunk(struct dw_edma_desc *desc)
+{
+	struct dw_edma_chan *chan = desc->chan;
+	struct dw_edma *dw = chan->chip->dw;
+	struct dw_edma_chunk *chunk = kzalloc(sizeof(*chunk), GFP_NOWAIT);
+
+	if (unlikely(!chunk))
+		return NULL;
+
+	INIT_LIST_HEAD(&chunk->list);
+	chunk->chan = chan;
+	/* Toggling change bit (CB) in each chunk, this is a mechanism to
+	 * inform the eDMA HW block that this is a new linked list ready
+	 * to be consumed.
+	 *  - Odd chunks originate CB equal to 0
+	 *  - Even chunks originate CB equal to 1
+	 */
+	chunk->cb = !(desc->chunks_alloc % 2);
+	chunk->ll_region.paddr = dw->ll_region.paddr + chan->ll_off;
+	chunk->ll_region.vaddr = dw->ll_region.vaddr + chan->ll_off;
+
+	if (desc->chunk) {
+		/* Burst head first: on failure the list is left untouched */
+		if (!dw_edma_alloc_burst(chunk)) {
+			kfree(chunk);
+			return NULL;
+		}
+		desc->chunks_alloc++;
+		list_add_tail(&chunk->list, &desc->chunk->list);
+	} else {
+		/* List head */
+		chunk->burst = NULL;
+		desc->chunks_alloc = 0;
+		desc->chunk = chunk;
+	}
+
+	return chunk;
+}
+
+/* Allocate a descriptor for @chan together with its initial chunk */
+static struct dw_edma_desc *dw_edma_alloc_desc(struct dw_edma_chan *chan)
+{
+	struct dw_edma_desc *d = kzalloc(sizeof(*d), GFP_NOWAIT);
+
+	if (unlikely(!d))
+		return NULL;
+
+	d->chan = chan;
+	if (dw_edma_alloc_chunk(d))
+		return d;
+
+	/* No chunk could be allocated: drop the bare descriptor */
+	kfree(d);
+	return NULL;
+}
+
+static void dw_edma_free_burst(struct dw_edma_chunk *chunk)
+{
+	struct dw_edma_burst *child, *_next;
+
+	/* Free every burst queued behind the list head */
+	list_for_each_entry_safe(child, _next, &chunk->burst->list, list) {
+		list_del(&child->list);
+		kfree(child);
+		chunk->bursts_alloc--;
+	}
+
+	/* Free the head by name rather than through the stale loop cursor */
+	kfree(chunk->burst);
+	chunk->burst = NULL;
+}
+
+static void dw_edma_free_chunk(struct dw_edma_desc *desc)
+{
+	struct dw_edma_chunk *child, *_next;
+
+	if (!desc->chunk)
+		return;
+
+	/* Free every chunk behind the list head, bursts included */
+	list_for_each_entry_safe(child, _next, &desc->chunk->list, list) {
+		dw_edma_free_burst(child);
+		list_del(&child->list);
+		kfree(child);
+		desc->chunks_alloc--;
+	}
+
+	/* Free the head by name rather than through the stale loop cursor */
+	kfree(desc->chunk);
+	desc->chunk = NULL;
+}
+
+static void dw_edma_free_desc(struct dw_edma_desc *desc)
+{
+	dw_edma_free_chunk(desc);	/* chunks (and their bursts) first */
+	kfree(desc);
+}
+
+static void vchan_free_desc(struct virt_dma_desc *vdesc)
+{
+	dw_edma_free_desc(vd2dw_edma_desc(vdesc));	/* embedding desc */
+}
+
+/* Start the first queued chunk of the next descriptor, then release it */
+static void dw_edma_start_transfer(struct dw_edma_chan *chan)
+{
+	struct virt_dma_desc *vd = vchan_next_desc(&chan->vc);
+	struct dw_edma_chunk *next;
+	struct dw_edma_desc *desc;
+
+	if (!vd)
+		return;
+
+	desc = vd2dw_edma_desc(vd);
+	if (!desc)
+		return;
+
+	next = list_first_entry_or_null(&desc->chunk->list,
+				       struct dw_edma_chunk, list);
+	if (!next)
+		return;
+
+	dw_edma_v0_core_start(next, !desc->xfer_sz);
+	desc->xfer_sz += next->ll_region.sz;
+	list_del(&next->list);
+	dw_edma_free_burst(next);
+	kfree(next);
+	desc->chunks_alloc--;
+}
+
+static int dw_edma_device_config(struct dma_chan *dchan,
+				 struct dma_slave_config *config)
+{
+	struct dw_edma_chan *edma_chan = dchan2dw_edma_chan(dchan);
+
+	/* Keep a private copy; other channel ops check the configured flag */
+	memcpy(&edma_chan->config, config, sizeof(*config));
+	edma_chan->configured = true;
+	return 0;
+}
+
+/* Request a pause; only valid on a configured, busy, request-free channel */
+static int dw_edma_device_pause(struct dma_chan *dchan)
+{
+	struct dw_edma_chan *chan = dchan2dw_edma_chan(dchan);
+
+	if (!chan->configured)
+		return -EPERM;
+	if (chan->status != EDMA_ST_BUSY)
+		return -EPERM;
+	if (chan->request != EDMA_REQ_NONE)
+		return -EPERM;
+
+	/* Only record the request; the done interrupt applies it */
+	chan->request = EDMA_REQ_PAUSE;
+	return 0;
+}
+
+/* Resume a paused channel: mark it busy again and restart the transfer */
+static int dw_edma_device_resume(struct dma_chan *dchan)
+{
+	struct dw_edma_chan *chan = dchan2dw_edma_chan(dchan);
+
+	/* Must be configured, paused and free of any pending request */
+	if (!chan->configured)
+		return -EPERM;
+	if (chan->status != EDMA_ST_PAUSE)
+		return -EPERM;
+	if (chan->request != EDMA_REQ_NONE)
+		return -EPERM;
+
+	chan->status = EDMA_ST_BUSY;
+	dw_edma_start_transfer(chan);
+
+	return 0;
+}
+
+/* Stop the channel: reset it if it is not running, else request a stop */
+static int dw_edma_device_terminate_all(struct dma_chan *dchan)
+{
+	struct dw_edma_chan *chan = dchan2dw_edma_chan(dchan);
+	int err = 0;
+
+	if (!chan->configured) {
+		/* Do nothing */
+	} else if (chan->status == EDMA_ST_PAUSE) {
+		chan->status = EDMA_ST_IDLE;
+		chan->configured = false;
+	} else if (chan->status == EDMA_ST_IDLE) {
+		chan->configured = false;
+	} else if (dw_edma_v0_core_ch_status(chan) == DMA_COMPLETE) {
+		/*
+		 * The channel is in a false BUSY state, probably didn't
+		 * receive or lost an interrupt
+		 */
+		chan->status = EDMA_ST_IDLE;
+		chan->configured = false;
+	} else if (chan->request > EDMA_REQ_PAUSE) {
+		err = -EPERM;
+	} else {
+		chan->request = EDMA_REQ_STOP;
+	}
+
+	return err;
+}
+
+static void dw_edma_device_issue_pending(struct dma_chan *dchan)
+{
+	struct dw_edma_chan *chan = dchan2dw_edma_chan(dchan);
+	unsigned long flags;
+
+	spin_lock_irqsave(&chan->vc.lock, flags);
+	if (chan->configured && chan->request == EDMA_REQ_NONE &&
+	    chan->status == EDMA_ST_IDLE && vchan_issue_pending(&chan->vc)) {
+		chan->status = EDMA_ST_BUSY;	/* idle, no request in flight */
+		dw_edma_start_transfer(chan);
+	}
+	spin_unlock_irqrestore(&chan->vc.lock, flags);
+}
+
+/* Report cookie status and, when @txstate is given, the bytes left over */
+static enum dma_status
+dw_edma_device_tx_status(struct dma_chan *dchan, dma_cookie_t cookie,
+			 struct dma_tx_state *txstate)
+{
+	struct dw_edma_chan *chan = dchan2dw_edma_chan(dchan);
+	struct dw_edma_desc *desc;
+	struct virt_dma_desc *vd;
+	enum dma_status status;
+	unsigned long flags;
+	u32 left = 0;
+
+	status = dma_cookie_status(dchan, cookie, txstate);
+	if (status == DMA_COMPLETE)
+		return status;
+
+	/* An in-progress cookie on a paused channel is reported as paused */
+	if (status == DMA_IN_PROGRESS && chan->status == EDMA_ST_PAUSE)
+		status = DMA_PAUSED;
+
+	/* Without a state to fill there is nothing to look up */
+	if (txstate) {
+		spin_lock_irqsave(&chan->vc.lock, flags);
+		vd = vchan_find_desc(&chan->vc, cookie);
+		desc = vd ? vd2dw_edma_desc(vd) : NULL;
+		/* Residue: size queued minus size already started */
+		if (desc)
+			left = desc->alloc_sz - desc->xfer_sz;
+		spin_unlock_irqrestore(&chan->vc.lock, flags);
+	}
+
+	dma_set_residue(txstate, left);
+
+	return status;
+}
+
+/*
+ * Build a descriptor for a scatter-gather or cyclic transfer: one burst per
+ * sg entry (or per cyclic repetition), grouped into chunks of at most
+ * chan->ll_max bursts. Returns NULL on invalid arguments, on a channel that
+ * is not configured or when an allocation fails.
+ */
+static struct dma_async_tx_descriptor *
+dw_edma_device_transfer(struct dw_edma_transfer *xfer)
+{
+	struct dw_edma_chan *chan = dchan2dw_edma_chan(xfer->dchan);
+	enum dma_transfer_direction direction = xfer->direction;
+	phys_addr_t src_addr, dst_addr;
+	struct scatterlist *sg = NULL;
+	struct dw_edma_chunk *chunk;
+	struct dw_edma_burst *burst;
+	struct dw_edma_desc *desc;
+	u32 cnt;
+	u32 i;
+
+	if ((direction == DMA_MEM_TO_DEV && chan->dir == EDMA_DIR_WRITE) ||
+	    (direction == DMA_DEV_TO_MEM && chan->dir == EDMA_DIR_READ))
+		return NULL;
+
+	if (xfer->cyclic) {
+		if (!xfer->xfer.cyclic.len || !xfer->xfer.cyclic.cnt)
+			return NULL;
+	} else {
+		if (xfer->xfer.sg.len < 1)
+			return NULL;
+	}
+
+	if (!chan->configured)
+		return NULL;
+
+	desc = dw_edma_alloc_desc(chan);
+	if (unlikely(!desc))
+		goto err_alloc;
+
+	chunk = dw_edma_alloc_chunk(desc);
+	if (unlikely(!chunk))
+		goto err_alloc;
+
+	src_addr = chan->config.src_addr;
+	dst_addr = chan->config.dst_addr;
+
+	if (xfer->cyclic) {
+		cnt = xfer->xfer.cyclic.cnt;
+	} else {
+		cnt = xfer->xfer.sg.len;
+		sg = xfer->xfer.sg.sgl;
+	}
+
+	for (i = 0; i < cnt; i++) {
+		if (!xfer->cyclic && !sg)
+			break;
+
+		if (chunk->bursts_alloc == chan->ll_max) {
+			chunk = dw_edma_alloc_chunk(desc);
+			if (unlikely(!chunk))
+				goto err_alloc;
+		}
+
+		burst = dw_edma_alloc_burst(chunk);
+		if (unlikely(!burst))
+			goto err_alloc;
+
+		if (xfer->cyclic)
+			burst->sz = xfer->xfer.cyclic.len;
+		else
+			burst->sz = sg_dma_len(sg);
+
+		chunk->ll_region.sz += burst->sz;
+		desc->alloc_sz += burst->sz;
+
+		if (direction == DMA_DEV_TO_MEM) {
+			burst->sar = src_addr;
+			if (xfer->cyclic) {
+				burst->dar = xfer->xfer.cyclic.paddr;
+			} else {
+				burst->dar = sg_dma_address(sg);
+				/* Unlike what other drivers/IPs usually
+				 * assume, the peripheral memory here is
+				 * not a FIFO but linear memory, which is
+				 * why the source and destination
+				 * addresses both advance by the same
+				 * amount (the data length)
+				 */
+				src_addr += sg_dma_len(sg);
+			}
+		} else {
+			burst->dar = dst_addr;
+			if (xfer->cyclic) {
+				burst->sar = xfer->xfer.cyclic.paddr;
+			} else {
+				burst->sar = sg_dma_address(sg);
+				/* Linear device memory, see above */
+				dst_addr += sg_dma_len(sg);
+			}
+		}
+
+		if (!xfer->cyclic)
+			sg = sg_next(sg);
+	}
+
+	return vchan_tx_prep(&chan->vc, &desc->vd, xfer->flags);
+
+err_alloc:
+	if (desc)
+		dw_edma_free_desc(desc);
+
+	return NULL;
+}
+
+static struct dma_async_tx_descriptor *
+dw_edma_device_prep_slave_sg(struct dma_chan *dchan, struct scatterlist *sgl,
+			     unsigned int len,
+			     enum dma_transfer_direction direction,
+			     unsigned long flags, void *context)
+{
+	struct dw_edma_transfer xfer = {
+		.dchan = dchan,
+		.direction = direction,
+		.xfer.sg.sgl = sgl,
+		.xfer.sg.len = len,
+		.flags = flags,
+		.cyclic = false,	/* sg request, @context unused */
+	};
+
+	return dw_edma_device_transfer(&xfer);
+}
+
+static struct dma_async_tx_descriptor *
+dw_edma_device_prep_dma_cyclic(struct dma_chan *dchan, dma_addr_t paddr,
+			       size_t len, size_t count,
+			       enum dma_transfer_direction direction,
+			       unsigned long flags)
+{
+	struct dw_edma_transfer xfer = {
+		.dchan = dchan,
+		.direction = direction,
+		.xfer.cyclic.paddr = paddr,
+		.xfer.cyclic.len = len,
+		.xfer.cyclic.cnt = count,
+		.flags = flags,
+		.cyclic = true,		/* @count bursts of @len bytes */
+	};
+
+	return dw_edma_device_transfer(&xfer);
+}
+
+/* Done IRQ: advance the descriptor or apply a pending stop/pause request */
+static void dw_edma_done_interrupt(struct dw_edma_chan *chan)
+{
+	struct virt_dma_desc *vd;
+	unsigned long flags;
+
+	dw_edma_v0_core_clear_done_int(chan);
+
+	spin_lock_irqsave(&chan->vc.lock, flags);
+	vd = vchan_next_desc(&chan->vc);
+	if (!vd)
+		goto unlock;
+
+	switch (chan->request) {
+	case EDMA_REQ_NONE:
+		if (vd2dw_edma_desc(vd)->chunks_alloc) {
+			chan->status = EDMA_ST_BUSY;
+			dw_edma_start_transfer(chan);
+			break;
+		}
+		/* No chunk left: complete the cookie and go idle */
+		list_del(&vd->node);
+		vchan_cookie_complete(vd);
+		chan->status = EDMA_ST_IDLE;
+		break;
+	case EDMA_REQ_STOP:
+		list_del(&vd->node);
+		vchan_cookie_complete(vd);
+		chan->request = EDMA_REQ_NONE;
+		chan->status = EDMA_ST_IDLE;
+		break;
+	case EDMA_REQ_PAUSE:
+		/* Keep the descriptor queued while paused */
+		chan->request = EDMA_REQ_NONE;
+		chan->status = EDMA_ST_PAUSE;
+		break;
+	default:
+		break;
+	}
+unlock:
+	spin_unlock_irqrestore(&chan->vc.lock, flags);
+}
+
+static void dw_edma_abort_interrupt(struct dw_edma_chan *chan)
+{
+	struct virt_dma_desc *vd;
+	unsigned long flags;
+
+	dw_edma_v0_core_clear_abort_int(chan);
+
+	spin_lock_irqsave(&chan->vc.lock, flags);
+	vd = vchan_next_desc(&chan->vc);
+	if (vd) {
+		list_del(&vd->node);
+		vchan_cookie_complete(vd);
+	}
+	spin_unlock_irqrestore(&chan->vc.lock, flags);
+	chan->request = EDMA_REQ_NONE;
+	chan->status = EDMA_ST_IDLE;
+}
+
+static irqreturn_t dw_edma_interrupt(int irq, void *data, bool write)
+{
+	struct dw_edma_irq *dw_irq = data;
+	struct dw_edma *dw = dw_irq->dw;
+	unsigned long total, pos, val;
+	unsigned long off;
+	u32 mask;
+
+	if (write) {
+		total = dw->wr_ch_cnt;
+		off = 0;
+		mask = dw_irq->wr_mask;
+	} else {
+		total = dw->rd_ch_cnt;
+		off = dw->wr_ch_cnt;
+		mask = dw_irq->rd_mask;
+	}
+
+	val = dw_edma_v0_core_status_done_int(dw, write ?
+							  EDMA_DIR_WRITE :
+							  EDMA_DIR_READ);
+	val &= mask;
+	for_each_set_bit(pos, &val, total) {
+		struct dw_edma_chan *chan = &dw->chan[pos + off];
+
+		dw_edma_done_interrupt(chan);
+	}
+
+	val = dw_edma_v0_core_status_abort_int(dw, write ?
+							   EDMA_DIR_WRITE :
+							   EDMA_DIR_READ);
+	val &= mask;
+	for_each_set_bit(pos, &val, total) {
+		struct dw_edma_chan *chan = &dw->chan[pos + off];
+
+		dw_edma_abort_interrupt(chan);
+	}
+
+	return IRQ_HANDLED;
+}
+
+static inline irqreturn_t dw_edma_interrupt_write(int irq, void *data)
+{
+	return dw_edma_interrupt(irq, data, true);
+}
+
+static inline irqreturn_t dw_edma_interrupt_read(int irq, void *data)
+{
+	return dw_edma_interrupt(irq, data, false);
+}
+
+static irqreturn_t dw_edma_interrupt_common(int irq, void *data)
+{
+	dw_edma_interrupt(irq, data, true);
+	dw_edma_interrupt(irq, data, false);
+
+	return IRQ_HANDLED;
+}
+
+static int dw_edma_alloc_chan_resources(struct dma_chan *dchan)
+{
+	struct dw_edma_chan *chan = dchan2dw_edma_chan(dchan);
+
+	if (chan->status != EDMA_ST_IDLE)
+		return -EBUSY;
+
+	pm_runtime_get(chan->chip->dev);
+
+	return 0;
+}
+
+static void dw_edma_free_chan_resources(struct dma_chan *dchan)
+{
+	unsigned long timeout = jiffies + msecs_to_jiffies(5000);
+	struct dw_edma_chan *chan = dchan2dw_edma_chan(dchan);
+	int ret;
+
+	while (time_before(jiffies, timeout)) {
+		ret = dw_edma_device_terminate_all(dchan);
+		if (!ret)
+			break;
+
+		if (time_after_eq(jiffies, timeout))
+			return;	/* timed out; pm_runtime_put() is skipped */
+
+		cpu_relax();
+	}
+
+	pm_runtime_put(chan->chip->dev);
+}
+
+static int dw_edma_channel_setup(struct dw_edma_chip *chip, bool write,
+				 u32 wr_alloc, u32 rd_alloc)
+{
+	struct dw_edma_region *dt_region;
+	struct device *dev = chip->dev;
+	struct dw_edma *dw = chip->dw;
+	struct dw_edma_chan *chan;
+	size_t ll_chunk, dt_chunk;
+	struct dw_edma_irq *irq;
+	struct dma_device *dma;
+	u32 i, j, cnt, ch_cnt;
+	u32 alloc, off_alloc;
+	int err = 0;
+	u32 pos;
+
+	ch_cnt = dw->wr_ch_cnt + dw->rd_ch_cnt;
+	ll_chunk = dw->ll_region.sz;
+	dt_chunk = dw->dt_region.sz;
+
+	/* Calculate linked list chunk for each channel */
+	ll_chunk /= roundup_pow_of_two(ch_cnt);
+
+	/* Calculate data chunk for each channel */
+	dt_chunk /= roundup_pow_of_two(ch_cnt);
+
+	if (write) {
+		i = 0;
+		cnt = dw->wr_ch_cnt;
+		dma = &dw->wr_edma;
+		alloc = wr_alloc;
+		off_alloc = 0;
+	} else {
+		i = dw->wr_ch_cnt;
+		cnt = dw->rd_ch_cnt;
+		dma = &dw->rd_edma;
+		alloc = rd_alloc;
+		off_alloc = wr_alloc;
+	}
+
+	INIT_LIST_HEAD(&dma->channels);
+	for (j = 0; (alloc || dw->nr_irqs == 1) && j < cnt; j++, i++) {
+		chan = &dw->chan[i];
+
+		dt_region = devm_kzalloc(dev, sizeof(*dt_region), GFP_KERNEL);
+		if (!dt_region)
+			return -ENOMEM;
+
+		chan->vc.chan.private = dt_region;
+
+		chan->chip = chip;
+		chan->id = j;
+		chan->dir = write ? EDMA_DIR_WRITE : EDMA_DIR_READ;
+		chan->configured = false;
+		chan->request = EDMA_REQ_NONE;
+		chan->status = EDMA_ST_IDLE;
+
+		chan->ll_off = (ll_chunk * i);
+		chan->ll_max = (ll_chunk / EDMA_LL_SZ) - 1;
+
+		chan->dt_off = (dt_chunk * i);
+
+		dev_vdbg(dev, "L. List:\tChannel %s[%u] off=0x%.8lx, max_cnt=%u\n",
+			 write ? "write" : "read", j,
+			 chan->ll_off, chan->ll_max);
+
+		if (dw->nr_irqs == 1)
+			pos = 0;
+		else
+			pos = off_alloc + (j % alloc);
+
+		irq = &dw->irq[pos];
+
+		if (write)
+			irq->wr_mask |= BIT(j);
+		else
+			irq->rd_mask |= BIT(j);
+
+		irq->dw = dw;
+		memcpy(&chan->msi, &irq->msi, sizeof(chan->msi));
+
+		dev_vdbg(dev, "MSI:\t\tChannel %s[%u] addr=0x%.8x%.8x, data=0x%.8x\n",
+			 write ? "write" : "read", j,
+			 chan->msi.address_hi, chan->msi.address_lo,
+			 chan->msi.data);
+
+		chan->vc.desc_free = vchan_free_desc;
+		vchan_init(&chan->vc, dma);
+
+		dt_region->paddr = dw->dt_region.paddr + chan->dt_off;
+		dt_region->vaddr = dw->dt_region.vaddr + chan->dt_off;
+		dt_region->sz = dt_chunk;
+
+		dev_vdbg(dev, "Data:\tChannel %s[%u] off=0x%.8lx\n",
+			 write ? "write" : "read", j, chan->dt_off);
+
+		dw_edma_v0_core_device_config(chan);
+	}
+
+	/* Set DMA channel capabilities */
+	dma_cap_zero(dma->cap_mask);
+	dma_cap_set(DMA_SLAVE, dma->cap_mask);
+	dma_cap_set(DMA_CYCLIC, dma->cap_mask);
+	dma_cap_set(DMA_PRIVATE, dma->cap_mask);
+	dma->directions = BIT(write ? DMA_DEV_TO_MEM : DMA_MEM_TO_DEV);
+	dma->src_addr_widths = BIT(DMA_SLAVE_BUSWIDTH_4_BYTES);
+	dma->dst_addr_widths = BIT(DMA_SLAVE_BUSWIDTH_4_BYTES);
+	dma->residue_granularity = DMA_RESIDUE_GRANULARITY_DESCRIPTOR;
+	dma->chancnt = cnt;
+
+	/* Set DMA channel callbacks */
+	dma->dev = chip->dev;
+	dma->device_alloc_chan_resources = dw_edma_alloc_chan_resources;
+	dma->device_free_chan_resources = dw_edma_free_chan_resources;
+	dma->device_config = dw_edma_device_config;
+	dma->device_pause = dw_edma_device_pause;
+	dma->device_resume = dw_edma_device_resume;
+	dma->device_terminate_all = dw_edma_device_terminate_all;
+	dma->device_issue_pending = dw_edma_device_issue_pending;
+	dma->device_tx_status = dw_edma_device_tx_status;
+	dma->device_prep_slave_sg = dw_edma_device_prep_slave_sg;
+	dma->device_prep_dma_cyclic = dw_edma_device_prep_dma_cyclic;
+
+	dma_set_max_seg_size(dma->dev, U32_MAX);
+
+	/* Register DMA device */
+	err = dma_async_device_register(dma);
+
+	return err;
+}
+
+static inline void dw_edma_dec_irq_alloc(int *nr_irqs, u32 *alloc, u16 cnt)
+{
+	if (*nr_irqs && *alloc < cnt) {
+		(*alloc)++;
+		(*nr_irqs)--;
+	}
+}
+
+static inline void dw_edma_add_irq_mask(u32 *mask, u32 alloc, u16 cnt)
+{
+	while (*mask * alloc < cnt)
+		(*mask)++;
+}
+
+static int dw_edma_irq_request(struct dw_edma_chip *chip,
+			       u32 *wr_alloc, u32 *rd_alloc)
+{
+	struct device *dev = chip->dev;
+	struct dw_edma *dw = chip->dw;
+	u32 wr_mask = 1;
+	u32 rd_mask = 1;
+	int i, err = 0;
+	u32 ch_cnt;
+
+	ch_cnt = dw->wr_ch_cnt + dw->rd_ch_cnt;
+
+	if (dw->nr_irqs < 1)
+		return -EINVAL;
+
+	if (dw->nr_irqs == 1) {
+		/* Common IRQ shared among all channels */
+		err = request_irq(pci_irq_vector(to_pci_dev(dev), 0),
+				  dw_edma_interrupt_common,
+				  IRQF_SHARED, dw->name, &dw->irq[0]);
+		if (err) {
+			dw->nr_irqs = 0;
+			return err;
+		}
+
+		get_cached_msi_msg(pci_irq_vector(to_pci_dev(dev), 0),
+				   &dw->irq[0].msi);
+	} else {
+		/* Distribute IRQs equally among all channels */
+		int tmp = dw->nr_irqs;
+
+		while (tmp && (*wr_alloc + *rd_alloc) < ch_cnt) {
+			dw_edma_dec_irq_alloc(&tmp, wr_alloc, dw->wr_ch_cnt);
+			dw_edma_dec_irq_alloc(&tmp, rd_alloc, dw->rd_ch_cnt);
+		}
+
+		dw_edma_add_irq_mask(&wr_mask, *wr_alloc, dw->wr_ch_cnt);
+		dw_edma_add_irq_mask(&rd_mask, *rd_alloc, dw->rd_ch_cnt);
+
+		for (i = 0; i < (*wr_alloc + *rd_alloc); i++) {
+			err = request_irq(pci_irq_vector(to_pci_dev(dev), i),
+					  i < *wr_alloc ?
+						dw_edma_interrupt_write :
+						dw_edma_interrupt_read,
+					  IRQF_SHARED, dw->name,
+					  &dw->irq[i]);
+			if (err) {
+				dw->nr_irqs = i;
+				return err;
+			}
+
+			get_cached_msi_msg(pci_irq_vector(to_pci_dev(dev), i),
+					   &dw->irq[i].msi);
+		}
+
+		dw->nr_irqs = i;
+	}
+
+	return err;
+}
+
+int dw_edma_probe(struct dw_edma_chip *chip)
+{
+	struct device *dev = chip->dev;
+	struct dw_edma *dw = chip->dw;
+	u32 wr_alloc = 0;
+	u32 rd_alloc = 0;
+	int i, err;
+
+	raw_spin_lock_init(&dw->lock);
+
+	/* Find out how many write channels are supported by hardware */
+	dw->wr_ch_cnt = dw_edma_v0_core_ch_count(dw, EDMA_DIR_WRITE);
+	if (!dw->wr_ch_cnt)
+		return -EINVAL;
+
+	/* Find out how many read channels are supported by hardware */
+	dw->rd_ch_cnt = dw_edma_v0_core_ch_count(dw, EDMA_DIR_READ);
+	if (!dw->rd_ch_cnt)
+		return -EINVAL;
+
+	dev_vdbg(dev, "Channels:\twrite=%d, read=%d\n",
+		 dw->wr_ch_cnt, dw->rd_ch_cnt);
+
+	/* Allocate channels */
+	dw->chan = devm_kcalloc(dev, dw->wr_ch_cnt + dw->rd_ch_cnt,
+				sizeof(*dw->chan), GFP_KERNEL);
+	if (!dw->chan)
+		return -ENOMEM;
+
+	snprintf(dw->name, sizeof(dw->name), "dw-edma-core:%d", chip->id);
+
+	/* Disable eDMA, only to establish the ideal initial conditions */
+	dw_edma_v0_core_off(dw);
+
+	/* Request IRQs (on failure, nr_irqs is trimmed to those obtained) */
+	err = dw_edma_irq_request(chip, &wr_alloc, &rd_alloc);
+	if (err)
+		goto err_irq_free;
+
+	/* Setup write channels */
+	err = dw_edma_channel_setup(chip, true, wr_alloc, rd_alloc);
+	if (err)
+		goto err_irq_free;
+
+	/* Setup read channels */
+	err = dw_edma_channel_setup(chip, false, wr_alloc, rd_alloc);
+	if (err)
+		goto err_irq_free;
+
+	/* Power management */
+	pm_runtime_enable(dev);
+
+	/* Turn debugfs on */
+	dw_edma_v0_core_debugfs_on(chip);
+
+	return 0;
+
+err_irq_free:
+	for (i = (dw->nr_irqs - 1); i >= 0; i--)
+		free_irq(pci_irq_vector(to_pci_dev(dev), i), &dw->irq[i]);
+
+	dw->nr_irqs = 0;
+
+	return err;
+}
+EXPORT_SYMBOL_GPL(dw_edma_probe);
+
+int dw_edma_remove(struct dw_edma_chip *chip)
+{
+	struct dw_edma_chan *chan, *_chan;
+	struct device *dev = chip->dev;
+	struct dw_edma *dw = chip->dw;
+	int i;
+
+	/* Disable eDMA */
+	dw_edma_v0_core_off(dw);
+
+	/* Free irqs */
+	for (i = (dw->nr_irqs - 1); i >= 0; i--)
+		free_irq(pci_irq_vector(to_pci_dev(dev), i), &dw->irq[i]);
+
+	/* Power management */
+	pm_runtime_disable(dev);
+
+	list_for_each_entry_safe(chan, _chan, &dw->wr_edma.channels,
+				 vc.chan.device_node) {
+		list_del(&chan->vc.chan.device_node);
+		tasklet_kill(&chan->vc.task);
+	}
+
+	list_for_each_entry_safe(chan, _chan, &dw->rd_edma.channels,
+				 vc.chan.device_node) {
+		list_del(&chan->vc.chan.device_node);
+		tasklet_kill(&chan->vc.task);
+	}
+
+	/* Deregister eDMA device */
+	dma_async_device_unregister(&dw->wr_edma);
+	dma_async_device_unregister(&dw->rd_edma);
+
+	/* Turn debugfs off */
+	dw_edma_v0_core_debugfs_off();
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(dw_edma_remove);
+
+MODULE_LICENSE("GPL v2");
+MODULE_DESCRIPTION("Synopsys DesignWare eDMA controller core driver");
+MODULE_AUTHOR("Gustavo Pimentel <gustavo.pimentel@synopsys.com>");
diff --git a/drivers/dma/dw-edma/dw-edma-core.h b/drivers/dma/dw-edma/dw-edma-core.h
new file mode 100644
index 000000000000..b6cc90cbc9dc
--- /dev/null
+++ b/drivers/dma/dw-edma/dw-edma-core.h
@@ -0,0 +1,165 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2018-2019 Synopsys, Inc. and/or its affiliates.
+ * Synopsys DesignWare eDMA core driver
+ *
+ * Author: Gustavo Pimentel <gustavo.pimentel@synopsys.com>
+ */
+
+#ifndef _DW_EDMA_CORE_H
+#define _DW_EDMA_CORE_H
+
+#include <linux/msi.h>
+#include <linux/dma/edma.h>
+
+#include "../virt-dma.h"
+
+#define EDMA_LL_SZ					24
+
+enum dw_edma_dir {
+	EDMA_DIR_WRITE = 0,
+	EDMA_DIR_READ
+};
+
+enum dw_edma_mode {
+	EDMA_MODE_LEGACY = 0,
+	EDMA_MODE_UNROLL
+};
+
+enum dw_edma_request {
+	EDMA_REQ_NONE = 0,
+	EDMA_REQ_STOP,
+	EDMA_REQ_PAUSE
+};
+
+enum dw_edma_status {
+	EDMA_ST_IDLE = 0,
+	EDMA_ST_PAUSE,
+	EDMA_ST_BUSY
+};
+
+struct dw_edma_chan;
+struct dw_edma_chunk;
+
+struct dw_edma_burst {
+	struct list_head		list;
+	u64				sar;
+	u64				dar;
+	u32				sz;
+};
+
+struct dw_edma_region {
+	phys_addr_t			paddr;
+	dma_addr_t			vaddr;
+	size_t				sz;
+};
+
+struct dw_edma_chunk {
+	struct list_head		list;
+	struct dw_edma_chan		*chan;
+	struct dw_edma_burst		*burst;
+
+	u32				bursts_alloc;
+
+	u8				cb;
+	struct dw_edma_region		ll_region;	/* Linked list */
+};
+
+struct dw_edma_desc {
+	struct virt_dma_desc		vd;
+	struct dw_edma_chan		*chan;
+	struct dw_edma_chunk		*chunk;
+
+	u32				chunks_alloc;
+
+	u32				alloc_sz;
+	u32				xfer_sz;
+};
+
+struct dw_edma_chan {
+	struct virt_dma_chan		vc;
+	struct dw_edma_chip		*chip;
+	int				id;
+	enum dw_edma_dir		dir;
+
+	off_t				ll_off;
+	u32				ll_max;
+
+	off_t				dt_off;
+
+	struct msi_msg			msi;
+
+	enum dw_edma_request		request;
+	enum dw_edma_status		status;
+	u8				configured;
+
+	struct dma_slave_config		config;
+};
+
+struct dw_edma_irq {
+	struct msi_msg                  msi;
+	u32				wr_mask;
+	u32				rd_mask;
+	struct dw_edma			*dw;
+};
+
+struct dw_edma {
+	char				name[20];
+
+	struct dma_device		wr_edma;
+	u16				wr_ch_cnt;
+
+	struct dma_device		rd_edma;
+	u16				rd_ch_cnt;
+
+	struct dw_edma_region		rg_region;	/* Registers */
+	struct dw_edma_region		ll_region;	/* Linked list */
+	struct dw_edma_region		dt_region;	/* Data */
+
+	struct dw_edma_irq		*irq;
+	int				nr_irqs;
+
+	u32				version;
+	enum dw_edma_mode		mode;
+
+	struct dw_edma_chan		*chan;
+	const struct dw_edma_core_ops	*ops;
+
+	raw_spinlock_t			lock;		/* Only for legacy */
+};
+
+struct dw_edma_sg {
+	struct scatterlist		*sgl;
+	unsigned int			len;
+};
+
+struct dw_edma_cyclic {
+	dma_addr_t			paddr;
+	size_t				len;
+	size_t				cnt;
+};
+
+struct dw_edma_transfer {
+	struct dma_chan			*dchan;
+	union dw_edma_xfer {
+		struct dw_edma_sg	sg;
+		struct dw_edma_cyclic	cyclic;
+	} xfer;
+	enum dma_transfer_direction	direction;
+	unsigned long			flags;
+	bool				cyclic;
+};
+
+static inline
+struct dw_edma_chan *vc2dw_edma_chan(struct virt_dma_chan *vc)
+{
+	return container_of(vc, struct dw_edma_chan, vc);
+}
+
+static inline
+struct dw_edma_chan *dchan2dw_edma_chan(struct dma_chan *dchan)
+{
+	return vc2dw_edma_chan(to_virt_chan(dchan));
+}
+
+#endif /* _DW_EDMA_CORE_H */
diff --git a/include/linux/dma/edma.h b/include/linux/dma/edma.h
new file mode 100644
index 000000000000..cab6e18773da
--- /dev/null
+++ b/include/linux/dma/edma.h
@@ -0,0 +1,47 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2018-2019 Synopsys, Inc. and/or its affiliates.
+ * Synopsys DesignWare eDMA core driver
+ *
+ * Author: Gustavo Pimentel <gustavo.pimentel@synopsys.com>
+ */
+
+#ifndef _DW_EDMA_H
+#define _DW_EDMA_H
+
+#include <linux/device.h>
+#include <linux/dmaengine.h>
+
+struct dw_edma;
+
+/**
+ * struct dw_edma_chip - representation of DesignWare eDMA controller hardware
+ * @dev:		 struct device of the eDMA controller
+ * @id:			 instance ID
+ * @irq:		 irq line
+ * @dw:			 struct dw_edma that is filled in by dw_edma_probe()
+ */
+struct dw_edma_chip {
+	struct device		*dev;
+	int			id;
+	int			irq;
+	struct dw_edma		*dw;
+};
+
+/* Export to the platform drivers */
+#if IS_ENABLED(CONFIG_DW_EDMA)
+int dw_edma_probe(struct dw_edma_chip *chip);
+int dw_edma_remove(struct dw_edma_chip *chip);
+#else
+static inline int dw_edma_probe(struct dw_edma_chip *chip)
+{
+	return -ENODEV;
+}
+
+static inline int dw_edma_remove(struct dw_edma_chip *chip)
+{
+	return 0;
+}
+#endif /* CONFIG_DW_EDMA */
+
+#endif /* _DW_EDMA_H */
-- 
cgit v1.2.3


From 1f418f46503d72594bbe6407d97fd2ae1ce15ee6 Mon Sep 17 00:00:00 2001
From: Gustavo Pimentel <Gustavo.Pimentel@synopsys.com>
Date: Tue, 4 Jun 2019 15:29:25 +0200
Subject: PCI: Add Synopsys endpoint EDDA Device ID

Create and add Synopsys Endpoint EDDA Device ID to PCI ID list, since
this ID is now being used by two different drivers (pci_endpoint_test.ko
and dw-edma-pcie.ko).

Signed-off-by: Gustavo Pimentel <gustavo.pimentel@synopsys.com>
Acked-by: Bjorn Helgaas <bhelgaas@google.com>
Cc: Kishon Vijay Abraham I <kishon@ti.com>
Cc: Bjorn Helgaas <bhelgaas@google.com>
Cc: Lorenzo Pieralisi <lorenzo.pieralisi@arm.com>
Cc: Joao Pinto <jpinto@synopsys.com>
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 drivers/misc/pci_endpoint_test.c | 2 +-
 include/linux/pci_ids.h          | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/misc/pci_endpoint_test.c b/drivers/misc/pci_endpoint_test.c
index 7b015f2a1c6f..1f531c1b4f74 100644
--- a/drivers/misc/pci_endpoint_test.c
+++ b/drivers/misc/pci_endpoint_test.c
@@ -804,7 +804,7 @@ static const struct pci_device_id pci_endpoint_test_tbl[] = {
 	{ PCI_DEVICE(PCI_VENDOR_ID_TI, PCI_DEVICE_ID_TI_DRA74x) },
 	{ PCI_DEVICE(PCI_VENDOR_ID_TI, PCI_DEVICE_ID_TI_DRA72x) },
 	{ PCI_DEVICE(PCI_VENDOR_ID_FREESCALE, 0x81c0) },
-	{ PCI_DEVICE(PCI_VENDOR_ID_SYNOPSYS, 0xedda) },
+	{ PCI_DEVICE_DATA(SYNOPSYS, EDDA, NULL) },
 	{ PCI_DEVICE(PCI_VENDOR_ID_TI, PCI_DEVICE_ID_TI_AM654),
 	  .driver_data = (kernel_ulong_t)&am654_data
 	},
diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index 70e86148cb1e..4aad69fc4d6b 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -2366,6 +2366,7 @@
 #define PCI_DEVICE_ID_SYNOPSYS_HAPSUSB3		0xabcd
 #define PCI_DEVICE_ID_SYNOPSYS_HAPSUSB3_AXI	0xabce
 #define PCI_DEVICE_ID_SYNOPSYS_HAPSUSB31	0xabcf
+#define PCI_DEVICE_ID_SYNOPSYS_EDDA	0xedda
 
 #define PCI_VENDOR_ID_USR		0x16ec
 
-- 
cgit v1.2.3


From 2769bd79a9154b933cc774ee773dd78b04d2be60 Mon Sep 17 00:00:00 2001
From: Gwendal Grignou <gwendal@chromium.org>
Date: Mon, 3 Jun 2019 11:33:32 -0700
Subject: mfd: cros_ec: Update license term

Update to SPDX-License-Identifier, GPL-2.0

Signed-off-by: Gwendal Grignou <gwendal@chromium.org>
Acked-by: Enric Balletbo i Serra <enric.balletbo@collabora.com>
Acked-by: Benson Leung <bleung@chromium.org>
Reviewed-by: Fabien Lahoudere <fabien.lahoudere@collabora.com>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 include/linux/mfd/cros_ec_commands.h | 20 +++++---------------
 1 file changed, 5 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mfd/cros_ec_commands.h b/include/linux/mfd/cros_ec_commands.h
index dcec96f01879..48292d449921 100644
--- a/include/linux/mfd/cros_ec_commands.h
+++ b/include/linux/mfd/cros_ec_commands.h
@@ -1,25 +1,15 @@
+/* SPDX-License-Identifier: GPL-2.0 */
 /*
  * Host communication command constants for ChromeOS EC
  *
  * Copyright (C) 2012 Google, Inc
  *
- * This software is licensed under the terms of the GNU General Public
- * License version 2, as published by the Free Software Foundation, and
- * may be copied, distributed, and modified under those terms.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * The ChromeOS EC multi function device is used to mux all the requests
- * to the EC device for its multiple features: keyboard controller,
- * battery charging and regulator control, firmware update.
- *
- * NOTE: This file is copied verbatim from the ChromeOS EC Open Source
- * project in an attempt to make future updates easy to make.
+ * NOTE: This file is auto-generated from ChromeOS EC Open Source code from
+ * https://chromium.googlesource.com/chromiumos/platform/ec/+/master/include/ec_commands.h
  */
 
+/* Host communication command constants for Chrome EC */
+
 #ifndef __CROS_EC_COMMANDS_H
 #define __CROS_EC_COMMANDS_H
 
-- 
cgit v1.2.3


From c9f69d8b170c2044013bf0248300d25a068f3fb5 Mon Sep 17 00:00:00 2001
From: Gwendal Grignou <gwendal@chromium.org>
Date: Mon, 3 Jun 2019 11:33:33 -0700
Subject: mfd: cros_ec: Zero BUILD_ macro

Define out the build macro used when compiling the embedded controller
firmware.

Signed-off-by: Gwendal Grignou <gwendal@chromium.org>
Acked-by: Enric Balletbo i Serra <enric.balletbo@collabora.com>
Acked-by: Benson Leung <bleung@chromium.org>
Reviewed-by: Fabien Lahoudere <fabien.lahoudere@collabora.com>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 include/linux/mfd/cros_ec_commands.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mfd/cros_ec_commands.h b/include/linux/mfd/cros_ec_commands.h
index 48292d449921..7b8fac4d0c89 100644
--- a/include/linux/mfd/cros_ec_commands.h
+++ b/include/linux/mfd/cros_ec_commands.h
@@ -13,6 +13,11 @@
 #ifndef __CROS_EC_COMMANDS_H
 #define __CROS_EC_COMMANDS_H
 
+
+
+
+#define BUILD_ASSERT(_cond)
+
 /*
  * Current version of this protocol
  *
-- 
cgit v1.2.3


From df95a3bdf8f1fe110a50873f8f24eb2675b73d35 Mon Sep 17 00:00:00 2001
From: Gwendal Grignou <gwendal@chromium.org>
Date: Mon, 3 Jun 2019 11:33:34 -0700
Subject: mfd: cros_ec: set comments properly

Fix comments syntax and spelling errors.

Signed-off-by: Gwendal Grignou <gwendal@chromium.org>
Acked-by: Enric Balletbo i Serra <enric.balletbo@collabora.com>
Acked-by: Benson Leung <bleung@chromium.org>
Reviewed-by: Fabien Lahoudere <fabien.lahoudere@collabora.com>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 include/linux/mfd/cros_ec_commands.h | 65 ++++++++++++++++++++++--------------
 1 file changed, 40 insertions(+), 25 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mfd/cros_ec_commands.h b/include/linux/mfd/cros_ec_commands.h
index 7b8fac4d0c89..a7a7060f44f7 100644
--- a/include/linux/mfd/cros_ec_commands.h
+++ b/include/linux/mfd/cros_ec_commands.h
@@ -42,13 +42,16 @@
 /* Protocol version 2 */
 #define EC_LPC_ADDR_HOST_ARGS    0x800  /* And 0x801, 0x802, 0x803 */
 #define EC_LPC_ADDR_HOST_PARAM   0x804  /* For version 2 params; size is
-					 * EC_PROTO2_MAX_PARAM_SIZE */
+					 * EC_PROTO2_MAX_PARAM_SIZE
+					 */
 /* Protocol version 3 */
 #define EC_LPC_ADDR_HOST_PACKET  0x800  /* Offset of version 3 packet */
 #define EC_LPC_HOST_PACKET_SIZE  0x100  /* Max size of version 3 packet */
 
-/* The actual block is 0x800-0x8ff, but some BIOSes think it's 0x880-0x8ff
- * and they tell the kernel that so we have to think of it as two parts. */
+/*
+ * The actual block is 0x800-0x8ff, but some BIOSes think it's 0x880-0x8ff
+ * and they tell the kernel that so we have to think of it as two parts.
+ */
 #define EC_HOST_CMD_REGION0    0x800
 #define EC_HOST_CMD_REGION1    0x880
 #define EC_HOST_CMD_REGION_SIZE 0x80
@@ -324,7 +327,7 @@ struct ec_lpc_host_args {
  * If EC gets a command and this flag is not set, this is an old-style command.
  * Command version is 0 and params from host are at EC_LPC_ADDR_OLD_PARAM with
  * unknown length.  EC must respond with an old-style response (that is,
- * withouth setting EC_HOST_ARGS_FLAG_TO_HOST).
+ * without setting EC_HOST_ARGS_FLAG_TO_HOST).
  */
 #define EC_HOST_ARGS_FLAG_FROM_HOST 0x01
 /*
@@ -511,7 +514,7 @@ struct ec_host_response {
  * Notes on commands:
  *
  * Each command is an 16-bit command value.  Commands which take params or
- * return response data specify structs for that data.  If no struct is
+ * return response data specify structures for that data.  If no structure is
  * specified, the command does not input or output data, respectively.
  * Parameter/response length is implicit in the structs.  Some underlying
  * communication protocols (I2C, SPI) may add length or checksum headers, but
@@ -684,7 +687,7 @@ struct ec_response_get_cmd_versions {
 } __packed;
 
 /*
- * Check EC communcations status (busy). This is needed on i2c/spi but not
+ * Check EC communications status (busy). This is needed on i2c/spi but not
  * on lpc since it has its own out-of-band busy indicator.
  *
  * lpc must read the status from the command register. Attempting this on
@@ -721,7 +724,7 @@ struct ec_response_test_protocol {
 	uint8_t buf[32];
 } __packed;
 
-/* Get prococol information */
+/* Get protocol information */
 #define EC_CMD_GET_PROTOCOL_INFO	0x0b
 
 /* Flags for ec_response_get_protocol_info.flags */
@@ -767,7 +770,7 @@ struct ec_response_get_set_value {
 	uint32_t value;
 } __packed;
 
-/* More than one command can use these structs to get/set paramters. */
+/* More than one command can use these structs to get/set parameters. */
 #define EC_CMD_GSV_PAUSE_IN_S5	0x0c
 
 /*****************************************************************************/
@@ -917,8 +920,10 @@ struct ec_response_flash_info {
 	uint32_t protect_block_size;
 } __packed;
 
-/* Flags for version 1+ flash info command */
-/* EC flash erases bits to 0 instead of 1 */
+/*
+ * Flags for version 1+ flash info command
+ * EC flash erases bits to 0 instead of 1.
+ */
 #define EC_FLASH_INFO_ERASE_TO_0 (1 << 0)
 
 /**
@@ -941,7 +946,8 @@ struct ec_response_flash_info {
  * fields following.
  *
  * gcc anonymous structs don't seem to get along with the __packed directive;
- * if they did we'd define the version 0 struct as a sub-struct of this one.
+ * if they did we'd define the version 0 structure as a sub-structure of this
+ * one.
  */
 struct ec_response_flash_info_1 {
 	/* Version 0 fields; see above for description */
@@ -1036,7 +1042,7 @@ struct ec_params_flash_erase {
  * re-requesting the desired flags, or by a hard reset if that fails.
  */
 #define EC_FLASH_PROTECT_ERROR_INCONSISTENT (1 << 5)
-/* Entile flash code protected when the EC boots */
+/* Entire flash code protected when the EC boots */
 #define EC_FLASH_PROTECT_ALL_AT_BOOT        (1 << 6)
 
 /**
@@ -1629,7 +1635,7 @@ struct ec_response_motion_sensor_data {
 	union {
 		int16_t             data[3];
 		struct {
-			uint16_t    rsvd;
+			uint16_t    reserved;
 			uint32_t    timestamp;
 		} __packed;
 		struct {
@@ -1828,7 +1834,7 @@ struct ec_response_rtc {
 #define EC_CMD_RTC_SET_VALUE 0x46
 #define EC_CMD_RTC_SET_ALARM 0x47
 
-/* Pass as param to SET_ALARM to clear the current alarm */
+/* Pass as time param to SET_ALARM to clear the current alarm */
 #define EC_RTC_ALARM_CLEAR 0
 
 /*****************************************************************************/
@@ -1914,7 +1920,8 @@ enum ec_temp_thresholds {
 	EC_TEMP_THRESH_COUNT
 };
 
-/* Thermal configuration for one temperature sensor. Temps are in degrees K.
+/*
+ * Thermal configuration for one temperature sensor. Temps are in degrees K.
  * Zero values will be silently ignored by the thermal task.
  */
 struct ec_thermal_config {
@@ -1929,8 +1936,10 @@ struct ec_params_thermal_get_threshold_v1 {
 } __packed;
 /* This returns a struct ec_thermal_config */
 
-/* Version 1 - set config for one sensor.
- * Use read-modify-write for best results! */
+/*
+ * Version 1 - set config for one sensor.
+ * Use read-modify-write for best results!
+ */
 struct ec_params_thermal_set_threshold_v1 {
 	uint32_t sensor_num;
 	struct ec_thermal_config cfg;
@@ -2079,7 +2088,12 @@ enum mkbp_config_valid {
 	EC_MKBP_VALID_FIFO_MAX_DEPTH		= 1 << 7,
 };
 
-/* Configuration for our key scanning algorithm */
+/*
+ * Configuration for our key scanning algorithm.
+ *
+ * Note that this is used as a sub-structure of
+ * ec_{params/response}_mkbp_get_config.
+ */
 struct ec_mkbp_config {
 	uint32_t valid_mask;		/* valid fields */
 	uint8_t flags;		/* some flags (enum mkbp_config_flags) */
@@ -2362,6 +2376,7 @@ struct ec_params_gpio_set {
 struct ec_params_gpio_get {
 	char name[32];
 } __packed;
+
 struct ec_response_gpio_get {
 	uint8_t val;
 } __packed;
@@ -2402,8 +2417,10 @@ enum gpio_get_subcmd {
 /* I2C commands. Only available when flash write protect is unlocked. */
 
 /*
- * TODO(crosbug.com/p/23570): These commands are deprecated, and will be
- * removed soon.  Use EC_CMD_I2C_XFER instead.
+ * CAUTION: These commands are deprecated, and are not supported anymore in EC
+ * builds >= 8398.0.0 (see crosbug.com/p/23570).
+ *
+ * Use EC_CMD_I2C_PASSTHRU instead.
  */
 
 /* Read I2C bus */
@@ -2415,6 +2432,7 @@ struct ec_params_i2c_read {
 	uint8_t port;
 	uint8_t offset;
 } __packed;
+
 struct ec_response_i2c_read {
 	uint16_t data;
 } __packed;
@@ -2450,7 +2468,6 @@ struct ec_params_charge_control {
 } __packed;
 
 /*****************************************************************************/
-/* Console commands. Only available when flash write protect is unlocked. */
 
 /* Snapshot console output buffer for use by EC_CMD_CONSOLE_READ. */
 #define EC_CMD_CONSOLE_SNAPSHOT 0x97
@@ -2904,9 +2921,7 @@ enum ec_i2s_config {
 };
 
 struct ec_param_codec_i2s {
-	/*
-	 * enum ec_codec_i2s_subcmd
-	 */
+	/* enum ec_codec_i2s_subcmd */
 	uint8_t cmd;
 	union {
 		/*
@@ -2981,7 +2996,7 @@ struct ec_response_codec_gain {
 enum ec_reboot_cmd {
 	EC_REBOOT_CANCEL = 0,        /* Cancel a pending reboot */
 	EC_REBOOT_JUMP_RO = 1,       /* Jump to RO without rebooting */
-	EC_REBOOT_JUMP_RW = 2,       /* Jump to RW without rebooting */
+	EC_REBOOT_JUMP_RW = 2,       /* Jump to active RW without rebooting */
 	/* (command 3 was jump to RW-B) */
 	EC_REBOOT_COLD = 4,          /* Cold-reboot */
 	EC_REBOOT_DISABLE_JUMP = 5,  /* Disable jump until next reboot */
-- 
cgit v1.2.3


From 6f72c3f9bbdf08fde3b328c8bafbe8d667590b4e Mon Sep 17 00:00:00 2001
From: Gwendal Grignou <gwendal@chromium.org>
Date: Mon, 3 Jun 2019 11:33:35 -0700
Subject: mfd: cros_ec: add ec_align macros

To reduce code and improve performance of the embedded controller
firmware, pragma __aligned(2) or __aligned(4) are used when alignment to
16 or 32 bit boundary is expected.

Define all ec_align to packed when compiling kernel.

Signed-off-by: Gwendal Grignou <gwendal@chromium.org>
Acked-by: Enric Balletbo i Serra <enric.balletbo@collabora.com>
Acked-by: Benson Leung <bleung@chromium.org>
Reviewed-by: Fabien Lahoudere <fabien.lahoudere@collabora.com>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 include/linux/mfd/cros_ec_commands.h | 442 +++++++++++++++++++----------------
 1 file changed, 238 insertions(+), 204 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mfd/cros_ec_commands.h b/include/linux/mfd/cros_ec_commands.h
index a7a7060f44f7..c12ae9742e20 100644
--- a/include/linux/mfd/cros_ec_commands.h
+++ b/include/linux/mfd/cros_ec_commands.h
@@ -188,12 +188,46 @@
 #ifndef __ACPI__
 
 /*
- * Define __packed if someone hasn't beat us to it.  Linux kernel style
- * checking prefers __packed over __attribute__((packed)).
+ * Attributes for EC request and response packets.  Just defining __packed
+ * results in inefficient assembly code on ARM, if the structure is actually
+ * 32-bit aligned, as it should be for all buffers.
+ *
+ * Be very careful when adding these to existing structures.  They will round
+ * up the structure size to the specified boundary.
+ *
+ * Also be very careful to make sure that if a structure is included in some
+ * other parent structure that the alignment will still be true given the
+ * packing of the parent structure.  This is particularly important if the
+ * sub-structure will be passed as a pointer to another function, since that
+ * function will not know about the misalignment caused by the parent's packing.
+ *
+ * Also be very careful using __packed - particularly when nesting non-packed
+ * structures inside packed ones.  In fact, DO NOT use __packed directly;
+ * always use one of these attributes.
+ *
+ * Once everything is annotated properly, the following search strings should
+ * not return ANY matches in this file other than right here:
+ *
+ * "__packed" - generates inefficient code; all sub-structs must also be packed
+ *
+ * "struct [^_]" - all structs should be annotated, except for structs that are
+ * members of other structs/unions (and their original declarations should be
+ * annotated).
  */
-#ifndef __packed
-#define __packed __attribute__((packed))
-#endif
+
+/*
+ * Packed structures make no assumption about alignment, so they do inefficient
+ * byte-wise reads.
+ */
+#define __ec_align1 __packed
+#define __ec_align2 __packed
+#define __ec_align4 __packed
+#define __ec_align_size1 __packed
+#define __ec_align_offset1 __packed
+#define __ec_align_offset2 __packed
+#define __ec_todo_packed __packed
+#define __ec_todo_unpacked
+
 
 /* LPC command status byte masks */
 /* EC has written a byte in the data register and host hasn't read it yet */
@@ -317,7 +351,7 @@ struct ec_lpc_host_args {
 	uint8_t command_version;
 	uint8_t data_size;
 	uint8_t checksum;
-} __packed;
+} __ec_align4;
 
 /* Flags for ec_lpc_host_args.flags */
 /*
@@ -488,7 +522,7 @@ struct ec_host_request {
 	uint8_t command_version;
 	uint8_t reserved;
 	uint16_t data_len;
-} __packed;
+} __ec_align4;
 
 #define EC_HOST_RESPONSE_VERSION 3
 
@@ -507,7 +541,7 @@ struct ec_host_response {
 	uint16_t result;
 	uint16_t data_len;
 	uint16_t reserved;
-} __packed;
+} __ec_align4;
 
 /*****************************************************************************/
 /*
@@ -536,7 +570,7 @@ struct ec_host_response {
  */
 struct ec_response_proto_version {
 	uint32_t version;
-} __packed;
+} __ec_align4;
 
 /*
  * Hello.  This is a simple command to test the EC is responsive to
@@ -550,7 +584,7 @@ struct ec_response_proto_version {
  */
 struct ec_params_hello {
 	uint32_t in_data;
-} __packed;
+} __ec_align4;
 
 /**
  * struct ec_response_hello - Response to the hello command.
@@ -558,7 +592,7 @@ struct ec_params_hello {
  */
 struct ec_response_hello {
 	uint32_t out_data;
-} __packed;
+} __ec_align4;
 
 /* Get version number */
 #define EC_CMD_GET_VERSION 0x02
@@ -581,7 +615,7 @@ struct ec_response_get_version {
 	char version_string_rw[32];
 	char reserved[32];
 	uint32_t current_image;
-} __packed;
+} __ec_align4;
 
 /* Read test */
 #define EC_CMD_READ_TEST 0x03
@@ -594,7 +628,7 @@ struct ec_response_get_version {
 struct ec_params_read_test {
 	uint32_t offset;
 	uint32_t size;
-} __packed;
+} __ec_align4;
 
 /**
  * struct ec_response_read_test - Response to the read test command.
@@ -602,7 +636,7 @@ struct ec_params_read_test {
  */
 struct ec_response_read_test {
 	uint32_t data[32];
-} __packed;
+} __ec_align4;
 
 /*
  * Get build information
@@ -624,7 +658,7 @@ struct ec_response_get_chip_info {
 	char vendor[32];
 	char name[32];
 	char revision[32];
-} __packed;
+} __ec_align4;
 
 /* Get board HW version */
 #define EC_CMD_GET_BOARD_VERSION 0x06
@@ -635,7 +669,7 @@ struct ec_response_get_chip_info {
  */
 struct ec_response_board_version {
 	uint16_t board_version;
-} __packed;
+} __ec_align2;
 
 /*
  * Read memory-mapped data.
@@ -655,7 +689,7 @@ struct ec_response_board_version {
 struct ec_params_read_memmap {
 	uint8_t offset;
 	uint8_t size;
-} __packed;
+} __ec_align1;
 
 /* Read versions supported for a command */
 #define EC_CMD_GET_CMD_VERSIONS 0x08
@@ -666,7 +700,7 @@ struct ec_params_read_memmap {
  */
 struct ec_params_get_cmd_versions {
 	uint8_t cmd;
-} __packed;
+} __ec_align1;
 
 /**
  * struct ec_params_get_cmd_versions_v1 - Parameters for the get command
@@ -675,7 +709,7 @@ struct ec_params_get_cmd_versions {
  */
 struct ec_params_get_cmd_versions_v1 {
 	uint16_t cmd;
-} __packed;
+} __ec_align2;
 
 /**
  * struct ec_response_get_cmd_version - Response to the get command versions.
@@ -684,7 +718,7 @@ struct ec_params_get_cmd_versions_v1 {
  */
 struct ec_response_get_cmd_versions {
 	uint32_t version_mask;
-} __packed;
+} __ec_align4;
 
 /*
  * Check EC communications status (busy). This is needed on i2c/spi but not
@@ -707,7 +741,7 @@ enum ec_comms_status {
  */
 struct ec_response_get_comms_status {
 	uint32_t flags;		/* Mask of enum ec_comms_status */
-} __packed;
+} __ec_align4;
 
 /* Fake a variety of responses, purely for testing purposes. */
 #define EC_CMD_TEST_PROTOCOL		0x0a
@@ -717,12 +751,12 @@ struct ec_params_test_protocol {
 	uint32_t ec_result;
 	uint32_t ret_len;
 	uint8_t buf[32];
-} __packed;
+} __ec_align4;
 
 /* Here it comes... */
 struct ec_response_test_protocol {
 	uint8_t buf[32];
-} __packed;
+} __ec_align4;
 
 /* Get protocol information */
 #define EC_CMD_GET_PROTOCOL_INFO	0x0b
@@ -745,7 +779,7 @@ struct ec_response_get_protocol_info {
 	uint16_t max_request_packet_size;
 	uint16_t max_response_packet_size;
 	uint32_t flags;
-} __packed;
+} __ec_align4;
 
 
 /*****************************************************************************/
@@ -763,12 +797,12 @@ struct ec_response_get_protocol_info {
 struct ec_params_get_set_value {
 	uint32_t flags;
 	uint32_t value;
-} __packed;
+} __ec_align4;
 
 struct ec_response_get_set_value {
 	uint32_t flags;
 	uint32_t value;
-} __packed;
+} __ec_align4;
 
 /* More than one command can use these structs to get/set parameters. */
 #define EC_CMD_GSV_PAUSE_IN_S5	0x0c
@@ -893,7 +927,7 @@ enum ec_feature_code {
 
 struct ec_response_get_features {
 	uint32_t flags[2];
-} __packed;
+} __ec_align4;
 
 /*****************************************************************************/
 /* Flash commands */
@@ -918,7 +952,7 @@ struct ec_response_flash_info {
 	uint32_t write_block_size;
 	uint32_t erase_block_size;
 	uint32_t protect_block_size;
-} __packed;
+} __ec_align4;
 
 /*
  * Flags for version 1+ flash info command
@@ -959,7 +993,7 @@ struct ec_response_flash_info_1 {
 	/* Version 1 adds these fields: */
 	uint32_t write_ideal_size;
 	uint32_t flags;
-} __packed;
+} __ec_align4;
 
 /*
  * Read flash
@@ -976,7 +1010,7 @@ struct ec_response_flash_info_1 {
 struct ec_params_flash_read {
 	uint32_t offset;
 	uint32_t size;
-} __packed;
+} __ec_align4;
 
 /* Write flash */
 #define EC_CMD_FLASH_WRITE 0x12
@@ -994,7 +1028,7 @@ struct ec_params_flash_write {
 	uint32_t offset;
 	uint32_t size;
 	/* Followed by data to write */
-} __packed;
+} __ec_align4;
 
 /* Erase flash */
 #define EC_CMD_FLASH_ERASE 0x13
@@ -1007,7 +1041,7 @@ struct ec_params_flash_write {
 struct ec_params_flash_erase {
 	uint32_t offset;
 	uint32_t size;
-} __packed;
+} __ec_align4;
 
 /*
  * Get/set flash protection.
@@ -1053,7 +1087,7 @@ struct ec_params_flash_erase {
 struct ec_params_flash_protect {
 	uint32_t mask;
 	uint32_t flags;
-} __packed;
+} __ec_align4;
 
 /**
  * struct ec_response_flash_protect - Response to the flash protect command.
@@ -1068,7 +1102,7 @@ struct ec_response_flash_protect {
 	uint32_t flags;
 	uint32_t valid_flags;
 	uint32_t writable_flags;
-} __packed;
+} __ec_align4;
 
 /*
  * Note: commands 0x14 - 0x19 version 0 were old commands to get/set flash
@@ -1100,12 +1134,12 @@ enum ec_flash_region {
  */
 struct ec_params_flash_region_info {
 	uint32_t region;
-} __packed;
+} __ec_align4;
 
 struct ec_response_flash_region_info {
 	uint32_t offset;
 	uint32_t size;
-} __packed;
+} __ec_align4;
 
 /* Read/write VbNvContext */
 #define EC_CMD_VBNV_CONTEXT 0x17
@@ -1120,11 +1154,11 @@ enum ec_vbnvcontext_op {
 struct ec_params_vbnvcontext {
 	uint32_t op;
 	uint8_t block[EC_VBNV_BLOCK_SIZE];
-} __packed;
+} __ec_align4;
 
 struct ec_response_vbnvcontext {
 	uint8_t block[EC_VBNV_BLOCK_SIZE];
-} __packed;
+} __ec_align4;
 
 /*****************************************************************************/
 /* PWM commands */
@@ -1134,14 +1168,14 @@ struct ec_response_vbnvcontext {
 
 struct ec_response_pwm_get_fan_rpm {
 	uint32_t rpm;
-} __packed;
+} __ec_align4;
 
 /* Set target fan RPM */
-#define EC_CMD_PWM_SET_FAN_TARGET_RPM 0x21
+#define EC_CMD_PWM_SET_FAN_TARGET_RPM 0x0021
 
 struct ec_params_pwm_set_fan_target_rpm {
 	uint32_t rpm;
-} __packed;
+} __ec_align_size1;
 
 /* Get keyboard backlight */
 #define EC_CMD_PWM_GET_KEYBOARD_BACKLIGHT 0x22
@@ -1149,21 +1183,21 @@ struct ec_params_pwm_set_fan_target_rpm {
 struct ec_response_pwm_get_keyboard_backlight {
 	uint8_t percent;
 	uint8_t enabled;
-} __packed;
+} __ec_align1;
 
 /* Set keyboard backlight */
 #define EC_CMD_PWM_SET_KEYBOARD_BACKLIGHT 0x23
 
 struct ec_params_pwm_set_keyboard_backlight {
 	uint8_t percent;
-} __packed;
+} __ec_align1;
 
 /* Set target fan PWM duty cycle */
 #define EC_CMD_PWM_SET_FAN_DUTY 0x24
 
 struct ec_params_pwm_set_fan_duty {
 	uint32_t percent;
-} __packed;
+} __ec_align4;
 
 #define EC_CMD_PWM_SET_DUTY 0x25
 /* 16 bit duty cycle, 0xffff = 100% */
@@ -1183,18 +1217,18 @@ struct ec_params_pwm_set_duty {
 	uint16_t duty;     /* Duty cycle, EC_PWM_MAX_DUTY = 100% */
 	uint8_t pwm_type;  /* ec_pwm_type */
 	uint8_t index;     /* Type-specific index, or 0 if unique */
-} __packed;
+} __ec_align4;
 
 #define EC_CMD_PWM_GET_DUTY 0x26
 
 struct ec_params_pwm_get_duty {
 	uint8_t pwm_type;  /* ec_pwm_type */
 	uint8_t index;     /* Type-specific index, or 0 if unique */
-} __packed;
+} __ec_align1;
 
 struct ec_response_pwm_get_duty {
 	uint16_t duty;     /* Duty cycle, EC_PWM_MAX_DUTY = 100% */
-} __packed;
+} __ec_align2;
 
 /*****************************************************************************/
 /*
@@ -1207,7 +1241,7 @@ struct ec_response_pwm_get_duty {
 
 struct rgb_s {
 	uint8_t r, g, b;
-};
+} __ec_todo_unpacked;
 
 #define LB_BATTERY_LEVELS 4
 
@@ -1247,7 +1281,7 @@ struct lightbar_params_v0 {
 
 	/* Color palette */
 	struct rgb_s color[8];			/* 0-3 are Google colors */
-} __packed;
+} __ec_todo_packed;
 
 struct lightbar_params_v1 {
 	/* Timing */
@@ -1290,14 +1324,14 @@ struct lightbar_params_v1 {
 
 	/* Color palette */
 	struct rgb_s color[8];			/* 0-3 are Google colors */
-} __packed;
+} __ec_todo_packed;
 
 /* Lightbar program */
 #define EC_LB_PROG_LEN 192
 struct lightbar_program {
 	uint8_t size;
 	uint8_t data[EC_LB_PROG_LEN];
-};
+} __ec_todo_unpacked;
 
 struct ec_params_lightbar {
 	uint8_t cmd;		      /* Command (see enum lightbar_command) */
@@ -1307,23 +1341,23 @@ struct ec_params_lightbar {
 		} dump, off, on, init, get_seq, get_params_v0, get_params_v1,
 			version, get_brightness, get_demo, suspend, resume;
 
-		struct {
+		struct __ec_todo_unpacked {
 			uint8_t num;
 		} set_brightness, seq, demo;
 
-		struct {
+		struct __ec_todo_unpacked {
 			uint8_t ctrl, reg, value;
 		} reg;
 
-		struct {
+		struct __ec_todo_unpacked {
 			uint8_t led, red, green, blue;
 		} set_rgb;
 
-		struct {
+		struct __ec_todo_unpacked {
 			uint8_t led;
 		} get_rgb;
 
-		struct {
+		struct __ec_todo_unpacked {
 			uint8_t enable;
 		} manual_suspend_ctrl;
 
@@ -1331,31 +1365,31 @@ struct ec_params_lightbar {
 		struct lightbar_params_v1 set_params_v1;
 		struct lightbar_program set_program;
 	};
-} __packed;
+} __ec_todo_packed;
 
 struct ec_response_lightbar {
 	union {
-		struct {
-			struct {
+		struct __ec_todo_unpacked {
+			struct __ec_todo_unpacked {
 				uint8_t reg;
 				uint8_t ic0;
 				uint8_t ic1;
 			} vals[23];
 		} dump;
 
-		struct  {
+		struct __ec_todo_unpacked {
 			uint8_t num;
 		} get_seq, get_brightness, get_demo;
 
 		struct lightbar_params_v0 get_params_v0;
 		struct lightbar_params_v1 get_params_v1;
 
-		struct {
+		struct __ec_todo_unpacked {
 			uint32_t num;
 			uint32_t flags;
 		} version;
 
-		struct {
+		struct __ec_todo_unpacked {
 			uint8_t red, green, blue;
 		} get_rgb;
 
@@ -1365,7 +1399,7 @@ struct ec_response_lightbar {
 			demo, set_params_v0, set_params_v1,
 			set_program, manual_suspend_ctrl, suspend, resume;
 	};
-} __packed;
+} __ec_todo_packed;
 
 /* Lightbar commands */
 enum lightbar_command {
@@ -1432,7 +1466,7 @@ struct ec_params_led_control {
 	uint8_t flags;      /* Control flags */
 
 	uint8_t brightness[EC_LED_COLOR_COUNT];
-} __packed;
+} __ec_align1;
 
 struct ec_response_led_control {
 	/*
@@ -1443,7 +1477,7 @@ struct ec_response_led_control {
 	 * Other values means the LED is control by PWM.
 	 */
 	uint8_t brightness_range[EC_LED_COLOR_COUNT];
-} __packed;
+} __ec_align1;
 
 /*****************************************************************************/
 /* Verified boot commands */
@@ -1464,7 +1498,7 @@ struct ec_params_vboot_hash {
 	uint32_t offset;         /* Offset in flash to hash */
 	uint32_t size;           /* Number of bytes to hash */
 	uint8_t nonce_data[64];  /* Nonce data; ignored if nonce_size=0 */
-} __packed;
+} __ec_align4;
 
 struct ec_response_vboot_hash {
 	uint8_t status;          /* enum ec_vboot_hash_status */
@@ -1474,7 +1508,7 @@ struct ec_response_vboot_hash {
 	uint32_t offset;         /* Offset in flash which was hashed */
 	uint32_t size;           /* Number of bytes hashed */
 	uint8_t hash_digest[64]; /* Hash digest data */
-} __packed;
+} __ec_align4;
 
 enum ec_vboot_hash_cmd {
 	EC_VBOOT_HASH_GET = 0,       /* Get current hash status */
@@ -1634,23 +1668,23 @@ struct ec_response_motion_sensor_data {
 	/* Each sensor is up to 3-axis. */
 	union {
 		int16_t             data[3];
-		struct {
+		struct __ec_todo_packed {
 			uint16_t    reserved;
 			uint32_t    timestamp;
-		} __packed;
-		struct {
+		};
+		struct __ec_todo_unpacked {
 			uint8_t     activity; /* motionsensor_activity */
 			uint8_t     state;
 			int16_t     add_info[2];
 		};
 	};
-} __packed;
+} __ec_todo_packed;
 
 struct ec_params_motion_sense {
 	uint8_t cmd;
 	union {
 		/* Used for MOTIONSENSE_CMD_DUMP. */
-		struct {
+		struct __ec_todo_unpacked {
 			/* no args */
 		} dump;
 
@@ -1658,13 +1692,13 @@ struct ec_params_motion_sense {
 		 * Used for MOTIONSENSE_CMD_EC_RATE and
 		 * MOTIONSENSE_CMD_KB_WAKE_ANGLE.
 		 */
-		struct {
+		struct __ec_todo_unpacked {
 			/* Data to set or EC_MOTION_SENSE_NO_VALUE to read. */
 			int16_t data;
 		} ec_rate, kb_wake_angle;
 
 		/* Used for MOTIONSENSE_CMD_SENSOR_OFFSET */
-		struct {
+		struct __ec_todo_packed {
 			uint8_t sensor_num;
 
 			/*
@@ -1690,10 +1724,10 @@ struct ec_params_motion_sense {
 			 * Compass:       1/16 uT
 			 */
 			int16_t offset[3];
-		} __packed sensor_offset;
+		} sensor_offset;
 
 		/* Used for MOTIONSENSE_CMD_INFO. */
-		struct {
+		struct __ec_todo_packed {
 			uint8_t sensor_num;
 		} info;
 
@@ -1714,12 +1748,12 @@ struct ec_params_motion_sense {
 			int32_t data;
 		} sensor_odr, sensor_range;
 	};
-} __packed;
+} __ec_todo_packed;
 
 struct ec_response_motion_sense {
 	union {
 		/* Used for MOTIONSENSE_CMD_DUMP. */
-		struct {
+		struct __ec_todo_unpacked {
 			/* Flags representing the motion sensor module. */
 			uint8_t module_flags;
 
@@ -1734,7 +1768,7 @@ struct ec_response_motion_sense {
 		} dump;
 
 		/* Used for MOTIONSENSE_CMD_INFO. */
-		struct {
+		struct __ec_todo_unpacked {
 			/* Should be element of enum motionsensor_type. */
 			uint8_t type;
 
@@ -1753,18 +1787,18 @@ struct ec_response_motion_sense {
 		 * MOTIONSENSE_CMD_SENSOR_RANGE, and
 		 * MOTIONSENSE_CMD_KB_WAKE_ANGLE.
 		 */
-		struct {
+		struct __ec_todo_unpacked {
 			/* Current value of the parameter queried. */
 			int32_t ret;
 		} ec_rate, sensor_odr, sensor_range, kb_wake_angle;
 
 		/* Used for MOTIONSENSE_CMD_SENSOR_OFFSET */
-		struct {
+		struct __ec_todo_unpacked  {
 			int16_t temp;
 			int16_t offset[3];
 		} sensor_offset, perform_calib;
 	};
-} __packed;
+} __ec_todo_packed;
 
 /*****************************************************************************/
 /* USB charging control commands */
@@ -1775,7 +1809,7 @@ struct ec_response_motion_sense {
 struct ec_params_usb_charge_set_mode {
 	uint8_t usb_port_id;
 	uint8_t mode;
-} __packed;
+} __ec_align1;
 
 /*****************************************************************************/
 /* Persistent storage for host */
@@ -1791,7 +1825,7 @@ struct ec_response_pstore_info {
 	uint32_t pstore_size;
 	/* Access size; read/write offset and size must be a multiple of this */
 	uint32_t access_size;
-} __packed;
+} __ec_align4;
 
 /*
  * Read persistent storage
@@ -1803,7 +1837,7 @@ struct ec_response_pstore_info {
 struct ec_params_pstore_read {
 	uint32_t offset;   /* Byte offset to read */
 	uint32_t size;     /* Size to read in bytes */
-} __packed;
+} __ec_align4;
 
 /* Write persistent storage */
 #define EC_CMD_PSTORE_WRITE 0x42
@@ -1812,7 +1846,7 @@ struct ec_params_pstore_write {
 	uint32_t offset;   /* Byte offset to write */
 	uint32_t size;     /* Size to write in bytes */
 	uint8_t data[EC_PSTORE_SIZE_MAX];
-} __packed;
+} __ec_align4;
 
 /*****************************************************************************/
 /* Real-time clock */
@@ -1820,11 +1854,11 @@ struct ec_params_pstore_write {
 /* RTC params and response structures */
 struct ec_params_rtc {
 	uint32_t time;
-} __packed;
+} __ec_align4;
 
 struct ec_response_rtc {
 	uint32_t time;
-} __packed;
+} __ec_align4;
 
 /* These use ec_response_rtc */
 #define EC_CMD_RTC_GET_VALUE 0x44
@@ -1855,29 +1889,29 @@ enum ec_port80_subcmd {
 struct ec_params_port80_read {
 	uint16_t subcmd;
 	union {
-		struct {
+		struct __ec_todo_unpacked {
 			uint32_t offset;
 			uint32_t num_entries;
 		} read_buffer;
 	};
-} __packed;
+} __ec_todo_packed;
 
 struct ec_response_port80_read {
 	union {
-		struct {
+		struct __ec_todo_unpacked {
 			uint32_t writes;
 			uint32_t history_size;
 			uint32_t last_boot;
 		} get_info;
-		struct {
+		struct __ec_todo_unpacked {
 			uint16_t codes[EC_PORT80_SIZE_MAX];
 		} data;
 	};
-} __packed;
+} __ec_todo_packed;
 
 struct ec_response_port80_last_boot {
 	uint16_t code;
-} __packed;
+} __ec_align2;
 
 /*****************************************************************************/
 /* Thermal engine commands. Note that there are two implementations. We'll
@@ -1898,17 +1932,17 @@ struct ec_params_thermal_set_threshold {
 	uint8_t sensor_type;
 	uint8_t threshold_id;
 	uint16_t value;
-} __packed;
+} __ec_align2;
 
 /* Version 0 - get */
 struct ec_params_thermal_get_threshold {
 	uint8_t sensor_type;
 	uint8_t threshold_id;
-} __packed;
+} __ec_align1;
 
 struct ec_response_thermal_get_threshold {
 	uint16_t value;
-} __packed;
+} __ec_align2;
 
 
 /* The version 1 structs are visible. */
@@ -1928,12 +1962,12 @@ struct ec_thermal_config {
 	uint32_t temp_host[EC_TEMP_THRESH_COUNT]; /* levels of hotness */
 	uint32_t temp_fan_off;		/* no active cooling needed */
 	uint32_t temp_fan_max;		/* max active cooling needed */
-} __packed;
+} __ec_align4;
 
 /* Version 1 - get config for one sensor. */
 struct ec_params_thermal_get_threshold_v1 {
 	uint32_t sensor_num;
-} __packed;
+} __ec_align4;
 /* This returns a struct ec_thermal_config */
 
 /*
@@ -1943,7 +1977,7 @@ struct ec_params_thermal_get_threshold_v1 {
 struct ec_params_thermal_set_threshold_v1 {
 	uint32_t sensor_num;
 	struct ec_thermal_config cfg;
-} __packed;
+} __ec_align4;
 /* This returns no data */
 
 /****************************************************************************/
@@ -1956,14 +1990,14 @@ struct ec_params_thermal_set_threshold_v1 {
 
 struct ec_params_tmp006_get_calibration {
 	uint8_t index;
-} __packed;
+} __ec_align1;
 
 struct ec_response_tmp006_get_calibration {
 	float s0;
 	float b0;
 	float b1;
 	float b2;
-} __packed;
+} __ec_align4;
 
 /* Set TMP006 calibration data */
 #define EC_CMD_TMP006_SET_CALIBRATION 0x54
@@ -1975,19 +2009,19 @@ struct ec_params_tmp006_set_calibration {
 	float b0;
 	float b1;
 	float b2;
-} __packed;
+} __ec_align4;
 
 /* Read raw TMP006 data */
 #define EC_CMD_TMP006_GET_RAW 0x55
 
 struct ec_params_tmp006_get_raw {
 	uint8_t index;
-} __packed;
+} __ec_align1;
 
 struct ec_response_tmp006_get_raw {
 	int32_t t;  /* In 1/100 K */
 	int32_t v;  /* In nV */
-};
+} __ec_align4;
 
 /*****************************************************************************/
 /* MKBP - Matrix KeyBoard Protocol */
@@ -2014,12 +2048,12 @@ struct ec_response_mkbp_info {
 	uint32_t cols;
 	/* Formerly "switches", which was 0. */
 	uint8_t reserved;
-} __packed;
+} __ec_align_size1;
 
 struct ec_params_mkbp_info {
 	uint8_t info_type;
 	uint8_t event_type;
-} __packed;
+} __ec_align1;
 
 enum ec_mkbp_info_type {
 	/*
@@ -2067,7 +2101,7 @@ struct ec_params_mkbp_simulate_key {
 	uint8_t col;
 	uint8_t row;
 	uint8_t pressed;
-} __packed;
+} __ec_align1;
 
 /* Configure keyboard scanning */
 #define EC_CMD_MKBP_SET_CONFIG 0x64
@@ -2113,15 +2147,15 @@ struct ec_mkbp_config {
 	uint16_t debounce_up_us;	/* time for debounce on key up */
 	/* maximum depth to allow for fifo (0 = no keyscan output) */
 	uint8_t fifo_max_depth;
-} __packed;
+} __ec_align_size1;
 
 struct ec_params_mkbp_set_config {
 	struct ec_mkbp_config config;
-} __packed;
+} __ec_align_size1;
 
 struct ec_response_mkbp_get_config {
 	struct ec_mkbp_config config;
-} __packed;
+} __ec_align_size1;
 
 /* Run the key scan emulation */
 #define EC_CMD_KEYSCAN_SEQ_CTRL 0x66
@@ -2144,18 +2178,18 @@ enum ec_collect_flags {
 
 struct ec_collect_item {
 	uint8_t flags;		/* some flags (enum ec_collect_flags) */
-};
+} __ec_align1;
 
 struct ec_params_keyscan_seq_ctrl {
 	uint8_t cmd;	/* Command to send (enum ec_keyscan_seq_cmd) */
 	union {
-		struct {
+		struct __ec_align1 {
 			uint8_t active;		/* still active */
 			uint8_t num_items;	/* number of items */
 			/* Current item being presented */
 			uint8_t cur_item;
 		} status;
-		struct {
+		struct __ec_todo_unpacked {
 			/*
 			 * Absolute time for this scan, measured from the
 			 * start of the sequence.
@@ -2163,22 +2197,22 @@ struct ec_params_keyscan_seq_ctrl {
 			uint32_t time_us;
 			uint8_t scan[0];	/* keyscan data */
 		} add;
-		struct {
+		struct __ec_align1 {
 			uint8_t start_item;	/* First item to return */
 			uint8_t num_items;	/* Number of items to return */
 		} collect;
 	};
-} __packed;
+} __ec_todo_packed;
 
 struct ec_result_keyscan_seq_ctrl {
 	union {
-		struct {
+		struct __ec_todo_unpacked {
 			uint8_t num_items;	/* Number of items */
 			/* Data for each item */
 			struct ec_collect_item item[0];
 		} collect;
 	};
-} __packed;
+} __ec_todo_packed;
 
 /*
  * Command for retrieving the next pending MKBP event from the EC device
@@ -2216,8 +2250,8 @@ enum ec_mkbp_event {
 	EC_MKBP_EVENT_COUNT,
 };
 
-union ec_response_get_next_data {
-	uint8_t   key_matrix[13];
+union __ec_align_offset1 ec_response_get_next_data {
+	uint8_t key_matrix[13];
 
 	/* Unaligned */
 	uint32_t  host_event;
@@ -2225,9 +2259,9 @@ union ec_response_get_next_data {
 	uint32_t   buttons;
 	uint32_t   switches;
 	uint32_t   sysrq;
-} __packed;
+};
 
-union ec_response_get_next_data_v1 {
+union __ec_align_offset1 ec_response_get_next_data_v1 {
 	uint8_t key_matrix[16];
 	uint32_t host_event;
 	uint32_t buttons;
@@ -2235,19 +2269,19 @@ union ec_response_get_next_data_v1 {
 	uint32_t sysrq;
 	uint32_t cec_events;
 	uint8_t cec_message[16];
-} __packed;
+};
 
 struct ec_response_get_next_event {
 	uint8_t event_type;
 	/* Followed by event data if any */
 	union ec_response_get_next_data data;
-} __packed;
+} __ec_align1;
 
 struct ec_response_get_next_event_v1 {
 	uint8_t event_type;
 	/* Followed by event data if any */
 	union ec_response_get_next_data_v1 data;
-} __packed;
+} __ec_align1;
 
 /* Bit indices for buttons and switches.*/
 /* Buttons */
@@ -2268,12 +2302,12 @@ struct ec_response_get_next_event_v1 {
 
 struct ec_params_temp_sensor_get_info {
 	uint8_t id;
-} __packed;
+} __ec_align1;
 
 struct ec_response_temp_sensor_get_info {
 	char sensor_name[32];
 	uint8_t sensor_type;
-} __packed;
+} __ec_align1;
 
 /*****************************************************************************/
 
@@ -2292,11 +2326,11 @@ struct ec_response_temp_sensor_get_info {
  */
 struct ec_params_host_event_mask {
 	uint32_t mask;
-} __packed;
+} __ec_align4;
 
 struct ec_response_host_event_mask {
 	uint32_t mask;
-} __packed;
+} __ec_align4;
 
 /* These all use ec_response_host_event_mask */
 #define EC_CMD_HOST_EVENT_GET_B         0x87
@@ -2319,7 +2353,7 @@ struct ec_response_host_event_mask {
 
 struct ec_params_switch_enable_backlight {
 	uint8_t enabled;
-} __packed;
+} __ec_align1;
 
 /* Enable/disable WLAN/Bluetooth */
 #define EC_CMD_SWITCH_ENABLE_WIRELESS 0x91
@@ -2328,7 +2362,7 @@ struct ec_params_switch_enable_backlight {
 /* Version 0 params; no response */
 struct ec_params_switch_enable_wireless_v0 {
 	uint8_t enabled;
-} __packed;
+} __ec_align1;
 
 /* Version 1 params */
 struct ec_params_switch_enable_wireless_v1 {
@@ -2347,7 +2381,7 @@ struct ec_params_switch_enable_wireless_v1 {
 
 	/* Which flags to copy from suspend_flags */
 	uint8_t suspend_mask;
-} __packed;
+} __ec_align1;
 
 /* Version 1 response */
 struct ec_response_switch_enable_wireless_v1 {
@@ -2356,7 +2390,7 @@ struct ec_response_switch_enable_wireless_v1 {
 
 	/* Flags to leave enabled in S3 */
 	uint8_t suspend_flags;
-} __packed;
+} __ec_align1;
 
 /*****************************************************************************/
 /* GPIO commands. Only available on EC if write protect has been disabled. */
@@ -2367,7 +2401,7 @@ struct ec_response_switch_enable_wireless_v1 {
 struct ec_params_gpio_set {
 	char name[32];
 	uint8_t val;
-} __packed;
+} __ec_align1;
 
 /* Get GPIO value */
 #define EC_CMD_GPIO_GET 0x93
@@ -2375,37 +2409,37 @@ struct ec_params_gpio_set {
 /* Version 0 of input params and response */
 struct ec_params_gpio_get {
 	char name[32];
-} __packed;
+} __ec_align1;
 
 struct ec_response_gpio_get {
 	uint8_t val;
-} __packed;
+} __ec_align1;
 
 /* Version 1 of input params and response */
 struct ec_params_gpio_get_v1 {
 	uint8_t subcmd;
 	union {
-		struct {
+		struct __ec_align1 {
 			char name[32];
 		} get_value_by_name;
-		struct {
+		struct __ec_align1 {
 			uint8_t index;
 		} get_info;
 	};
-} __packed;
+} __ec_align1;
 
 struct ec_response_gpio_get_v1 {
 	union {
-		struct {
+		struct __ec_align1 {
 			uint8_t val;
 		} get_value_by_name, get_count;
-		struct {
+		struct __ec_todo_unpacked {
 			uint8_t val;
 			char name[32];
 			uint32_t flags;
 		} get_info;
 	};
-} __packed;
+} __ec_todo_packed;
 
 enum gpio_get_subcmd {
 	EC_GPIO_GET_BY_NAME = 0,
@@ -2431,11 +2465,11 @@ struct ec_params_i2c_read {
 	uint8_t read_size; /* Either 8 or 16. */
 	uint8_t port;
 	uint8_t offset;
-} __packed;
+} __ec_align_size1;
 
 struct ec_response_i2c_read {
 	uint16_t data;
-} __packed;
+} __ec_align2;
 
 /* Write I2C bus */
 #define EC_CMD_I2C_WRITE 0x95
@@ -2446,7 +2480,7 @@ struct ec_params_i2c_write {
 	uint8_t write_size; /* Either 8 or 16. */
 	uint8_t port;
 	uint8_t offset;
-} __packed;
+} __ec_align_size1;
 
 /*****************************************************************************/
 /* Charge state commands. Only available when flash write protect unlocked. */
@@ -2465,7 +2499,7 @@ enum ec_charge_control_mode {
 
 struct ec_params_charge_control {
 	uint32_t mode;  /* enum charge_control_mode */
-} __packed;
+} __ec_align4;
 
 /*****************************************************************************/
 
@@ -2493,7 +2527,7 @@ enum ec_console_read_subcmd {
 
 struct ec_params_console_read_v1 {
 	uint8_t subcmd; /* enum ec_console_read_subcmd */
-} __packed;
+} __ec_align1;
 
 /*****************************************************************************/
 
@@ -2511,7 +2545,7 @@ struct ec_params_console_read_v1 {
 
 struct ec_params_battery_cutoff {
 	uint8_t flags;
-} __packed;
+} __ec_align1;
 
 /*****************************************************************************/
 /* USB port mux control. */
@@ -2523,7 +2557,7 @@ struct ec_params_battery_cutoff {
 
 struct ec_params_usb_mux {
 	uint8_t mux;
-} __packed;
+} __ec_align1;
 
 /*****************************************************************************/
 /* LDOs / FETs control. */
@@ -2541,7 +2575,7 @@ enum ec_ldo_state {
 struct ec_params_ldo_set {
 	uint8_t index;
 	uint8_t state;
-} __packed;
+} __ec_align1;
 
 /*
  * Get LDO state.
@@ -2550,11 +2584,11 @@ struct ec_params_ldo_set {
 
 struct ec_params_ldo_get {
 	uint8_t index;
-} __packed;
+} __ec_align1;
 
 struct ec_response_ldo_get {
 	uint8_t state;
-} __packed;
+} __ec_align1;
 
 /*****************************************************************************/
 /* Power info. */
@@ -2570,7 +2604,7 @@ struct ec_response_power_info {
 	uint16_t voltage_system;
 	uint16_t current_system;
 	uint16_t usb_current_limit;
-} __packed;
+} __ec_align4;
 
 /*****************************************************************************/
 /* I2C passthru command */
@@ -2592,20 +2626,20 @@ struct ec_response_power_info {
 struct ec_params_i2c_passthru_msg {
 	uint16_t addr_flags;	/* I2C slave address (7 or 10 bits) and flags */
 	uint16_t len;		/* Number of bytes to read or write */
-} __packed;
+} __ec_align2;
 
 struct ec_params_i2c_passthru {
 	uint8_t port;		/* I2C port number */
 	uint8_t num_msgs;	/* Number of messages */
 	struct ec_params_i2c_passthru_msg msg[];
 	/* Data to write for all messages is concatenated here */
-} __packed;
+} __ec_align2;
 
 struct ec_response_i2c_passthru {
 	uint8_t i2c_status;	/* Status flags (EC_I2C_STATUS_...) */
 	uint8_t num_msgs;	/* Number of messages processed */
 	uint8_t data[];		/* Data read by messages concatenated here */
-} __packed;
+} __ec_align1;
 
 /*****************************************************************************/
 /* Power button hang detect */
@@ -2660,7 +2694,7 @@ struct ec_params_hang_detect {
 
 	/* Timeout in msec before generating warm reboot, if enabled */
 	uint16_t warm_reboot_timeout_msec;
-} __packed;
+} __ec_align4;
 
 /*****************************************************************************/
 /* Commands for battery charging */
@@ -2706,20 +2740,20 @@ struct ec_params_charge_state {
 			/* no args */
 		} get_state;
 
-		struct {
+		struct __ec_todo_unpacked {
 			uint32_t param;		/* enum charge_state_param */
 		} get_param;
 
-		struct {
+		struct __ec_todo_unpacked {
 			uint32_t param;		/* param to set */
 			uint32_t value;		/* value to set */
 		} set_param;
 	};
-} __packed;
+} __ec_todo_packed;
 
 struct ec_response_charge_state {
 	union {
-		struct {
+		struct __ec_align4 {
 			int ac;
 			int chg_voltage;
 			int chg_current;
@@ -2727,14 +2761,14 @@ struct ec_response_charge_state {
 			int batt_state_of_charge;
 		} get_state;
 
-		struct {
+		struct __ec_align4 {
 			uint32_t value;
 		} get_param;
 		struct {
 			/* no return values */
 		} set_param;
 	};
-} __packed;
+} __ec_align4;
 
 
 /*
@@ -2744,7 +2778,7 @@ struct ec_response_charge_state {
 
 struct ec_params_current_limit {
 	uint32_t limit; /* in mA */
-} __packed;
+} __ec_align4;
 
 /*
  * Set maximum external voltage / current.
@@ -2755,7 +2789,7 @@ struct ec_params_current_limit {
 struct ec_params_external_power_limit_v1 {
 	uint16_t current_lim; /* in mA, or EC_POWER_LIMIT_NONE to clear limit */
 	uint16_t voltage_lim; /* in mV, or EC_POWER_LIMIT_NONE to clear limit */
-} __packed;
+} __ec_align2;
 
 #define EC_POWER_LIMIT_NONE 0xffff
 
@@ -2771,7 +2805,7 @@ enum host_sleep_event {
 
 struct ec_params_host_sleep_event {
 	uint8_t sleep_event;
-} __packed;
+} __ec_align1;
 
 /*
  * Use a default timeout value (CONFIG_SLEEP_TIMEOUT_MS) for detecting sleep
@@ -2802,7 +2836,7 @@ struct ec_params_host_sleep_event_v1 {
 
 		/* No parameters for non-suspend messages. */
 	};
-} __packed;
+} __ec_align2;
 
 /* A timeout occurred when this bit is set */
 #define EC_HOST_RESUME_SLEEP_TIMEOUT 0x80000000
@@ -2828,7 +2862,7 @@ struct ec_response_host_sleep_event_v1 {
 
 		/* No response fields for non-resume messages. */
 	};
-} __packed;
+} __ec_align4;
 
 /*****************************************************************************/
 /* Smart battery pass-through */
@@ -2845,25 +2879,25 @@ struct ec_response_host_sleep_event_v1 {
 
 struct ec_params_sb_rd {
 	uint8_t reg;
-} __packed;
+} __ec_align1;
 
 struct ec_response_sb_rd_word {
 	uint16_t value;
-} __packed;
+} __ec_align2;
 
 struct ec_params_sb_wr_word {
 	uint8_t reg;
 	uint16_t value;
-} __packed;
+} __ec_align1;
 
 struct ec_response_sb_rd_block {
 	uint8_t data[32];
-} __packed;
+} __ec_align1;
 
 struct ec_params_sb_wr_block {
 	uint8_t reg;
 	uint16_t data[32];
-} __packed;
+} __ec_align1;
 
 /*****************************************************************************/
 /* Battery vendor parameters
@@ -2885,11 +2919,11 @@ struct ec_params_battery_vendor_param {
 	uint32_t param;
 	uint32_t value;
 	uint8_t mode;
-} __packed;
+} __ec_align_size1;
 
 struct ec_response_battery_vendor_param {
 	uint32_t value;
-} __packed;
+} __ec_align4;
 
 /*****************************************************************************/
 /* Commands for I2S recording on audio codec. */
@@ -2920,7 +2954,7 @@ enum ec_i2s_config {
 	EC_DAI_FMT_PCM_TDM = 5,
 };
 
-struct ec_param_codec_i2s {
+struct __ec_todo_packed ec_param_codec_i2s {
 	/* enum ec_codec_i2s_subcmd */
 	uint8_t cmd;
 	union {
@@ -2934,10 +2968,10 @@ struct ec_param_codec_i2s {
 		 * EC_CODEC_SET_GAIN
 		 * Value should be 0~43 for both channels.
 		 */
-		struct ec_param_codec_i2s_set_gain {
+		struct __ec_align1 ec_param_codec_i2s_set_gain {
 			uint8_t left;
 			uint8_t right;
-		} __packed gain;
+		} gain;
 
 		/*
 		 * EC_CODEC_I2S_ENABLE
@@ -2955,7 +2989,7 @@ struct ec_param_codec_i2s {
 		 * EC_CODEC_I2S_SET_TDM_CONFIG
 		 * Value should be one of ec_i2s_config.
 		 */
-		struct ec_param_codec_i2s_tdm {
+		struct __ec_todo_unpacked ec_param_codec_i2s_tdm {
 			/*
 			 * 0 to 496
 			 */
@@ -2966,14 +3000,14 @@ struct ec_param_codec_i2s {
 			int16_t ch1_delay;
 			uint8_t adjacent_to_ch0;
 			uint8_t adjacent_to_ch1;
-		} __packed tdm_param;
+		} tdm_param;
 
 		/*
 		 * EC_CODEC_I2S_SET_BCLK
 		 */
 		uint32_t bclk;
 	};
-} __packed;
+};
 
 /*
  * For subcommand EC_CODEC_GET_GAIN.
@@ -2981,7 +3015,7 @@ struct ec_param_codec_i2s {
 struct ec_response_codec_gain {
 	uint8_t left;
 	uint8_t right;
-} __packed;
+} __ec_align1;
 
 /*****************************************************************************/
 /* System commands */
@@ -3010,7 +3044,7 @@ enum ec_reboot_cmd {
 struct ec_params_reboot_ec {
 	uint8_t cmd;           /* enum ec_reboot_cmd */
 	uint8_t flags;         /* See EC_REBOOT_FLAG_* */
-} __packed;
+} __ec_align1;
 
 /*
  * Get information on last EC panic.
@@ -3153,7 +3187,7 @@ struct ec_params_reboot_ec {
  */
 struct ec_params_cec_write {
 	uint8_t msg[EC_MAX_CEC_MSG_LEN];
-} __packed;
+} __ec_align1;
 
 /* Set various CEC parameters */
 #define EC_CMD_CEC_SET 0x00BA
@@ -3169,7 +3203,7 @@ struct ec_params_cec_write {
 struct ec_params_cec_set {
 	uint8_t cmd; /* enum cec_command */
 	uint8_t val;
-} __packed;
+} __ec_align1;
 
 /* Read various CEC parameters */
 #define EC_CMD_CEC_GET 0x00BB
@@ -3180,7 +3214,7 @@ struct ec_params_cec_set {
  */
 struct ec_params_cec_get {
 	uint8_t cmd; /* enum cec_command */
-} __packed;
+} __ec_align1;
 
 /**
  * struct ec_response_cec_get - CEC parameters get response
@@ -3191,7 +3225,7 @@ struct ec_params_cec_get {
  */
 struct ec_response_cec_get {
 	uint8_t val;
-} __packed;
+} __ec_align1;
 
 /* CEC parameters command */
 enum ec_cec_command {
@@ -3264,13 +3298,13 @@ enum mkbp_cec_event {
 /* Status of EC being sent to PD */
 struct ec_params_pd_status {
 	int8_t batt_soc; /* battery state of charge */
-} __packed;
+} __ec_align1;
 
 /* Status of PD being sent back to EC */
 struct ec_response_pd_status {
 	int8_t status;        /* PD MCU status */
 	uint32_t curr_lim_ma; /* input current limit */
-} __packed;
+} __ec_align_size1;
 
 /* Set USB type-C port role and muxes */
 #define EC_CMD_USB_PD_CONTROL 0x101
@@ -3305,7 +3339,7 @@ struct ec_params_usb_pd_control {
 	uint8_t role;
 	uint8_t mux;
 	uint8_t swap;
-} __packed;
+} __ec_align1;
 
 #define PD_CTRL_RESP_ENABLED_COMMS      (1 << 0) /* Communication enabled */
 #define PD_CTRL_RESP_ENABLED_CONNECTED  (1 << 1) /* Device connected */
@@ -3324,7 +3358,7 @@ struct ec_response_usb_pd_control_v1 {
 	uint8_t role;
 	uint8_t polarity;
 	char state[32];
-} __packed;
+} __ec_align1;
 
 #define EC_CMD_USB_PD_PORTS 0x102
 
@@ -3333,14 +3367,14 @@ struct ec_response_usb_pd_control_v1 {
 
 struct ec_response_usb_pd_ports {
 	uint8_t num_ports;
-} __packed;
+} __ec_align1;
 
 #define EC_CMD_USB_PD_POWER_INFO 0x103
 
 #define PD_POWER_CHARGING_PORT 0xff
 struct ec_params_usb_pd_power_info {
 	uint8_t port;
-} __packed;
+} __ec_align1;
 
 enum usb_chg_type {
 	USB_CHG_TYPE_NONE,
@@ -3366,7 +3400,7 @@ struct usb_chg_measures {
 	uint16_t voltage_now;
 	uint16_t current_max;
 	uint16_t current_lim;
-} __packed;
+} __ec_align2;
 
 struct ec_response_usb_pd_power_info {
 	uint8_t role;
@@ -3375,11 +3409,11 @@ struct ec_response_usb_pd_power_info {
 	uint8_t reserved1;
 	struct usb_chg_measures meas;
 	uint32_t max_power;
-} __packed;
+} __ec_align4;
 
 struct ec_params_usb_pd_info_request {
 	uint8_t port;
-} __packed;
+} __ec_align1;
 
 /*
  * This command will return the number of USB PD charge port + the number
@@ -3389,7 +3423,7 @@ struct ec_params_usb_pd_info_request {
 #define EC_CMD_CHARGE_PORT_COUNT 0x0105
 struct ec_response_charge_port_count {
 	uint8_t port_count;
-} __packed;
+} __ec_align1;
 
 /* Read USB-PD Device discovery info */
 #define EC_CMD_USB_PD_DISCOVERY 0x0113
@@ -3397,7 +3431,7 @@ struct ec_params_usb_pd_discovery_entry {
 	uint16_t vid;  /* USB-IF VID */
 	uint16_t pid;  /* USB-IF PID */
 	uint8_t ptype; /* product type (hub,periph,cable,ama) */
-} __packed;
+} __ec_align_size1;
 
 /* Override default charge behavior */
 #define EC_CMD_PD_CHARGE_PORT_OVERRIDE 0x0114
@@ -3411,7 +3445,7 @@ enum usb_pd_override_ports {
 
 struct ec_params_charge_port_override {
 	int16_t override_port; /* Override port# */
-} __packed;
+} __ec_align2;
 
 /* Read (and delete) one entry of PD event log */
 #define EC_CMD_PD_GET_LOG_ENTRY 0x0115
@@ -3422,7 +3456,7 @@ struct ec_response_pd_log {
 	uint8_t size_port;  /* [7:5] port number [4:0] payload size in bytes */
 	uint16_t data;      /* type-defined data payload */
 	uint8_t payload[0]; /* optional additional data payload: 0..16 bytes */
-} __packed;
+} __ec_align4;
 
 /* The timestamp is the microsecond counter shifted to get about a ms. */
 #define PD_LOG_TIMESTAMP_SHIFT 10 /* 1 LSB = 1024us */
@@ -3488,14 +3522,14 @@ struct mcdp_version {
 	uint8_t major;
 	uint8_t minor;
 	uint16_t build;
-} __packed;
+} __ec_align4;
 
 struct mcdp_info {
 	uint8_t family[2];
 	uint8_t chipid[2];
 	struct mcdp_version irom;
 	struct mcdp_version fw;
-} __packed;
+} __ec_align4;
 
 /* struct mcdp_info field decoding */
 #define MCDP_CHIPID(chipid) ((chipid[0] << 8) | chipid[1])
@@ -3506,7 +3540,7 @@ struct mcdp_info {
 
 struct ec_params_usb_pd_mux_info {
 	uint8_t port; /* USB-C port number */
-} __packed;
+} __ec_align1;
 
 /* Flags representing mux state */
 #define USB_PD_MUX_USB_ENABLED       (1 << 0)
@@ -3516,7 +3550,7 @@ struct ec_params_usb_pd_mux_info {
 
 struct ec_response_usb_pd_mux_info {
 	uint8_t flags; /* USB_PD_MUX_*-encoded USB mux state */
-} __packed;
+} __ec_align1;
 
 /*****************************************************************************/
 /*
-- 
cgit v1.2.3


From ff8343328bb91f9f1e211f4ab65ba5eb5bc6d3dc Mon Sep 17 00:00:00 2001
From: Gwendal Grignou <gwendal@chromium.org>
Date: Mon, 3 Jun 2019 11:33:36 -0700
Subject: mfd: cros_ec: Define commands as 4-digit UPPER CASE hex values

This change is required for compilation of embedded controller firmware
to work properly (See CONFIG_HOSTCMD_SECTION_SORTED).

Signed-off-by: Gwendal Grignou <gwendal@chromium.org>
Acked-by: Enric Balletbo i Serra <enric.balletbo@collabora.com>
Acked-by: Benson Leung <bleung@chromium.org>
Reviewed-by: Fabien Lahoudere <fabien.lahoudere@collabora.com>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 include/linux/mfd/cros_ec_commands.h | 235 ++++++++++++++++++++---------------
 1 file changed, 136 insertions(+), 99 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mfd/cros_ec_commands.h b/include/linux/mfd/cros_ec_commands.h
index c12ae9742e20..8ad77d8a9141 100644
--- a/include/linux/mfd/cros_ec_commands.h
+++ b/include/linux/mfd/cros_ec_commands.h
@@ -553,6 +553,9 @@ struct ec_host_response {
  * Parameter/response length is implicit in the structs.  Some underlying
  * communication protocols (I2C, SPI) may add length or checksum headers, but
  * those are implementation-dependent and not defined here.
+ *
+ * All commands MUST be #defined to be 4-digit UPPER CASE hex values
+ * (e.g., 0x00AB, not 0xab) for CONFIG_HOSTCMD_SECTION_SORTED to work.
  */
 
 /*****************************************************************************/
@@ -562,7 +565,7 @@ struct ec_host_response {
  * Get protocol version, used to deal with non-backward compatible protocol
  * changes.
  */
-#define EC_CMD_PROTO_VERSION 0x00
+#define EC_CMD_PROTO_VERSION 0x0000
 
 /**
  * struct ec_response_proto_version - Response to the proto version command.
@@ -576,7 +579,7 @@ struct ec_response_proto_version {
  * Hello.  This is a simple command to test the EC is responsive to
  * commands.
  */
-#define EC_CMD_HELLO 0x01
+#define EC_CMD_HELLO 0x0001
 
 /**
  * struct ec_params_hello - Parameters to the hello command.
@@ -595,7 +598,7 @@ struct ec_response_hello {
 } __ec_align4;
 
 /* Get version number */
-#define EC_CMD_GET_VERSION 0x02
+#define EC_CMD_GET_VERSION 0x0002
 
 enum ec_current_image {
 	EC_IMAGE_UNKNOWN = 0,
@@ -618,7 +621,7 @@ struct ec_response_get_version {
 } __ec_align4;
 
 /* Read test */
-#define EC_CMD_READ_TEST 0x03
+#define EC_CMD_READ_TEST 0x0003
 
 /**
  * struct ec_params_read_test - Parameters for the read test command.
@@ -643,10 +646,10 @@ struct ec_response_read_test {
  *
  * Response is null-terminated string.
  */
-#define EC_CMD_GET_BUILD_INFO 0x04
+#define EC_CMD_GET_BUILD_INFO 0x0004
 
 /* Get chip info */
-#define EC_CMD_GET_CHIP_INFO 0x05
+#define EC_CMD_GET_CHIP_INFO 0x0005
 
 /**
  * struct ec_response_get_chip_info - Response to the get chip info command.
@@ -661,7 +664,7 @@ struct ec_response_get_chip_info {
 } __ec_align4;
 
 /* Get board HW version */
-#define EC_CMD_GET_BOARD_VERSION 0x06
+#define EC_CMD_GET_BOARD_VERSION 0x0006
 
 /**
  * struct ec_response_board_version - Response to the board version command.
@@ -679,7 +682,7 @@ struct ec_response_board_version {
  *
  * Response is params.size bytes of data.
  */
-#define EC_CMD_READ_MEMMAP 0x07
+#define EC_CMD_READ_MEMMAP 0x0007
 
 /**
  * struct ec_params_read_memmap - Parameters for the read memory map command.
@@ -692,7 +695,7 @@ struct ec_params_read_memmap {
 } __ec_align1;
 
 /* Read versions supported for a command */
-#define EC_CMD_GET_CMD_VERSIONS 0x08
+#define EC_CMD_GET_CMD_VERSIONS 0x0008
 
 /**
  * struct ec_params_get_cmd_versions - Parameters for the get command versions.
@@ -727,7 +730,7 @@ struct ec_response_get_cmd_versions {
  * lpc must read the status from the command register. Attempting this on
  * lpc will overwrite the args/parameter space and corrupt its data.
  */
-#define EC_CMD_GET_COMMS_STATUS		0x09
+#define EC_CMD_GET_COMMS_STATUS		0x0009
 
 /* Avoid using ec_status which is for return values */
 enum ec_comms_status {
@@ -744,7 +747,7 @@ struct ec_response_get_comms_status {
 } __ec_align4;
 
 /* Fake a variety of responses, purely for testing purposes. */
-#define EC_CMD_TEST_PROTOCOL		0x0a
+#define EC_CMD_TEST_PROTOCOL		0x000A
 
 /* Tell the EC what to send back to us. */
 struct ec_params_test_protocol {
@@ -759,7 +762,7 @@ struct ec_response_test_protocol {
 } __ec_align4;
 
 /* Get protocol information */
-#define EC_CMD_GET_PROTOCOL_INFO	0x0b
+#define EC_CMD_GET_PROTOCOL_INFO	0x000B
 
 /* Flags for ec_response_get_protocol_info.flags */
 /* EC_RES_IN_PROGRESS may be returned if a command is slow */
@@ -805,11 +808,11 @@ struct ec_response_get_set_value {
 } __ec_align4;
 
 /* More than one command can use these structs to get/set parameters. */
-#define EC_CMD_GSV_PAUSE_IN_S5	0x0c
+#define EC_CMD_GSV_PAUSE_IN_S5	0x000C
 
 /*****************************************************************************/
 /* List the features supported by the firmware */
-#define EC_CMD_GET_FEATURES  0x0d
+#define EC_CMD_GET_FEATURES  0x000D
 
 /* Supported features */
 enum ec_feature_code {
@@ -933,7 +936,7 @@ struct ec_response_get_features {
 /* Flash commands */
 
 /* Get flash info */
-#define EC_CMD_FLASH_INFO 0x10
+#define EC_CMD_FLASH_INFO 0x0010
 
 /**
  * struct ec_response_flash_info - Response to the flash info command.
@@ -1000,7 +1003,7 @@ struct ec_response_flash_info_1 {
  *
  * Response is params.size bytes of data.
  */
-#define EC_CMD_FLASH_READ 0x11
+#define EC_CMD_FLASH_READ 0x0011
 
 /**
  * struct ec_params_flash_read - Parameters for the flash read command.
@@ -1013,7 +1016,7 @@ struct ec_params_flash_read {
 } __ec_align4;
 
 /* Write flash */
-#define EC_CMD_FLASH_WRITE 0x12
+#define EC_CMD_FLASH_WRITE 0x0012
 #define EC_VER_FLASH_WRITE 1
 
 /* Version 0 of the flash command supported only 64 bytes of data */
@@ -1031,7 +1034,7 @@ struct ec_params_flash_write {
 } __ec_align4;
 
 /* Erase flash */
-#define EC_CMD_FLASH_ERASE 0x13
+#define EC_CMD_FLASH_ERASE 0x0013
 
 /**
  * struct ec_params_flash_erase - Parameters for the flash erase command.
@@ -1053,7 +1056,7 @@ struct ec_params_flash_erase {
  *
  * If mask=0, simply returns the current flags state.
  */
-#define EC_CMD_FLASH_PROTECT 0x15
+#define EC_CMD_FLASH_PROTECT 0x0015
 #define EC_VER_FLASH_PROTECT 1  /* Command version 1 */
 
 /* Flags for flash protection */
@@ -1110,7 +1113,7 @@ struct ec_response_flash_protect {
  */
 
 /* Get the region offset/size */
-#define EC_CMD_FLASH_REGION_INFO 0x16
+#define EC_CMD_FLASH_REGION_INFO 0x0016
 #define EC_VER_FLASH_REGION_INFO 1
 
 enum ec_flash_region {
@@ -1142,7 +1145,7 @@ struct ec_response_flash_region_info {
 } __ec_align4;
 
 /* Read/write VbNvContext */
-#define EC_CMD_VBNV_CONTEXT 0x17
+#define EC_CMD_VBNV_CONTEXT 0x0017
 #define EC_VER_VBNV_CONTEXT 1
 #define EC_VBNV_BLOCK_SIZE 16
 
@@ -1164,7 +1167,7 @@ struct ec_response_vbnvcontext {
 /* PWM commands */
 
 /* Get fan target RPM */
-#define EC_CMD_PWM_GET_FAN_TARGET_RPM 0x20
+#define EC_CMD_PWM_GET_FAN_TARGET_RPM 0x0020
 
 struct ec_response_pwm_get_fan_rpm {
 	uint32_t rpm;
@@ -1178,7 +1181,7 @@ struct ec_params_pwm_set_fan_target_rpm {
 } __ec_align_size1;
 
 /* Get keyboard backlight */
-#define EC_CMD_PWM_GET_KEYBOARD_BACKLIGHT 0x22
+#define EC_CMD_PWM_GET_KEYBOARD_BACKLIGHT 0x0022
 
 struct ec_response_pwm_get_keyboard_backlight {
 	uint8_t percent;
@@ -1186,20 +1189,20 @@ struct ec_response_pwm_get_keyboard_backlight {
 } __ec_align1;
 
 /* Set keyboard backlight */
-#define EC_CMD_PWM_SET_KEYBOARD_BACKLIGHT 0x23
+#define EC_CMD_PWM_SET_KEYBOARD_BACKLIGHT 0x0023
 
 struct ec_params_pwm_set_keyboard_backlight {
 	uint8_t percent;
 } __ec_align1;
 
 /* Set target fan PWM duty cycle */
-#define EC_CMD_PWM_SET_FAN_DUTY 0x24
+#define EC_CMD_PWM_SET_FAN_DUTY 0x0024
 
 struct ec_params_pwm_set_fan_duty {
 	uint32_t percent;
 } __ec_align4;
 
-#define EC_CMD_PWM_SET_DUTY 0x25
+#define EC_CMD_PWM_SET_DUTY 0x0025
 /* 16 bit duty cycle, 0xffff = 100% */
 #define EC_PWM_MAX_DUTY 0xffff
 
@@ -1219,7 +1222,7 @@ struct ec_params_pwm_set_duty {
 	uint8_t index;     /* Type-specific index, or 0 if unique */
 } __ec_align4;
 
-#define EC_CMD_PWM_GET_DUTY 0x26
+#define EC_CMD_PWM_GET_DUTY 0x0026
 
 struct ec_params_pwm_get_duty {
 	uint8_t pwm_type;  /* ec_pwm_type */
@@ -1237,7 +1240,7 @@ struct ec_response_pwm_get_duty {
  * into a subcommand. We'll make separate structs for subcommands with
  * different input args, so that we know how much to expect.
  */
-#define EC_CMD_LIGHTBAR_CMD 0x28
+#define EC_CMD_LIGHTBAR_CMD 0x0028
 
 struct rgb_s {
 	uint8_t r, g, b;
@@ -1431,7 +1434,7 @@ enum lightbar_command {
 /*****************************************************************************/
 /* LED control commands */
 
-#define EC_CMD_LED_CONTROL 0x29
+#define EC_CMD_LED_CONTROL 0x0029
 
 enum ec_led_id {
 	/* LED to indicate battery state of charge */
@@ -1488,7 +1491,7 @@ struct ec_response_led_control {
  */
 
 /* Verified boot hash command */
-#define EC_CMD_VBOOT_HASH 0x2A
+#define EC_CMD_VBOOT_HASH 0x002A
 
 struct ec_params_vboot_hash {
 	uint8_t cmd;             /* enum ec_vboot_hash_cmd */
@@ -1540,7 +1543,7 @@ enum ec_vboot_hash_status {
  * Motion sense commands. We'll make separate structs for sub-commands with
  * different input args, so that we know how much to expect.
  */
-#define EC_CMD_MOTION_SENSE_CMD 0x2B
+#define EC_CMD_MOTION_SENSE_CMD 0x002B
 
 /* Motion sense commands */
 enum motionsense_command {
@@ -1804,7 +1807,7 @@ struct ec_response_motion_sense {
 /* USB charging control commands */
 
 /* Set USB port charging mode */
-#define EC_CMD_USB_CHARGE_SET_MODE 0x30
+#define EC_CMD_USB_CHARGE_SET_MODE 0x0030
 
 struct ec_params_usb_charge_set_mode {
 	uint8_t usb_port_id;
@@ -1818,7 +1821,7 @@ struct ec_params_usb_charge_set_mode {
 #define EC_PSTORE_SIZE_MAX 64
 
 /* Get persistent storage info */
-#define EC_CMD_PSTORE_INFO 0x40
+#define EC_CMD_PSTORE_INFO 0x0040
 
 struct ec_response_pstore_info {
 	/* Persistent storage size, in bytes */
@@ -1832,7 +1835,7 @@ struct ec_response_pstore_info {
  *
  * Response is params.size bytes of data.
  */
-#define EC_CMD_PSTORE_READ 0x41
+#define EC_CMD_PSTORE_READ 0x0041
 
 struct ec_params_pstore_read {
 	uint32_t offset;   /* Byte offset to read */
@@ -1840,7 +1843,7 @@ struct ec_params_pstore_read {
 } __ec_align4;
 
 /* Write persistent storage */
-#define EC_CMD_PSTORE_WRITE 0x42
+#define EC_CMD_PSTORE_WRITE 0x0042
 
 struct ec_params_pstore_write {
 	uint32_t offset;   /* Byte offset to write */
@@ -1861,12 +1864,12 @@ struct ec_response_rtc {
 } __ec_align4;
 
 /* These use ec_response_rtc */
-#define EC_CMD_RTC_GET_VALUE 0x44
-#define EC_CMD_RTC_GET_ALARM 0x45
+#define EC_CMD_RTC_GET_VALUE 0x0044
+#define EC_CMD_RTC_GET_ALARM 0x0045
 
 /* These all use ec_params_rtc */
-#define EC_CMD_RTC_SET_VALUE 0x46
-#define EC_CMD_RTC_SET_ALARM 0x47
+#define EC_CMD_RTC_SET_VALUE 0x0046
+#define EC_CMD_RTC_SET_ALARM 0x0047
 
 /* Pass as time param to SET_ALARM to clear the current alarm */
 #define EC_RTC_ALARM_CLEAR 0
@@ -1878,8 +1881,8 @@ struct ec_response_rtc {
 #define EC_PORT80_SIZE_MAX 32
 
 /* Get last port80 code from previous boot */
-#define EC_CMD_PORT80_LAST_BOOT 0x48
-#define EC_CMD_PORT80_READ 0x48
+#define EC_CMD_PORT80_LAST_BOOT 0x0048
+#define EC_CMD_PORT80_READ 0x0048
 
 enum ec_port80_subcmd {
 	EC_PORT80_GET_INFO = 0,
@@ -1920,8 +1923,8 @@ struct ec_response_port80_last_boot {
  * Version 1 separates the CPU thermal limits from the fan control.
  */
 
-#define EC_CMD_THERMAL_SET_THRESHOLD 0x50
-#define EC_CMD_THERMAL_GET_THRESHOLD 0x51
+#define EC_CMD_THERMAL_SET_THRESHOLD 0x0050
+#define EC_CMD_THERMAL_GET_THRESHOLD 0x0051
 
 /* The version 0 structs are opaque. You have to know what they are for
  * the get/set commands to make any sense.
@@ -1983,10 +1986,10 @@ struct ec_params_thermal_set_threshold_v1 {
 /****************************************************************************/
 
 /* Toggle automatic fan control */
-#define EC_CMD_THERMAL_AUTO_FAN_CTRL 0x52
+#define EC_CMD_THERMAL_AUTO_FAN_CTRL 0x0052
 
 /* Get TMP006 calibration data */
-#define EC_CMD_TMP006_GET_CALIBRATION 0x53
+#define EC_CMD_TMP006_GET_CALIBRATION 0x0053
 
 struct ec_params_tmp006_get_calibration {
 	uint8_t index;
@@ -2000,7 +2003,7 @@ struct ec_response_tmp006_get_calibration {
 } __ec_align4;
 
 /* Set TMP006 calibration data */
-#define EC_CMD_TMP006_SET_CALIBRATION 0x54
+#define EC_CMD_TMP006_SET_CALIBRATION 0x0054
 
 struct ec_params_tmp006_set_calibration {
 	uint8_t index;
@@ -2012,7 +2015,7 @@ struct ec_params_tmp006_set_calibration {
 } __ec_align4;
 
 /* Read raw TMP006 data */
-#define EC_CMD_TMP006_GET_RAW 0x55
+#define EC_CMD_TMP006_GET_RAW 0x0055
 
 struct ec_params_tmp006_get_raw {
 	uint8_t index;
@@ -2036,12 +2039,12 @@ struct ec_response_tmp006_get_raw {
  * to obtain the instantaneous state, use EC_CMD_MKBP_INFO with the type
  * EC_MKBP_INFO_CURRENT and event EC_MKBP_EVENT_KEY_MATRIX.
  */
-#define EC_CMD_MKBP_STATE 0x60
+#define EC_CMD_MKBP_STATE 0x0060
 
 /*
  * Provide information about various MKBP things.  See enum ec_mkbp_info_type.
  */
-#define EC_CMD_MKBP_INFO 0x61
+#define EC_CMD_MKBP_INFO 0x0061
 
 struct ec_response_mkbp_info {
 	uint32_t rows;
@@ -2095,7 +2098,7 @@ enum ec_mkbp_info_type {
 };
 
 /* Simulate key press */
-#define EC_CMD_MKBP_SIMULATE_KEY 0x62
+#define EC_CMD_MKBP_SIMULATE_KEY 0x0062
 
 struct ec_params_mkbp_simulate_key {
 	uint8_t col;
@@ -2104,8 +2107,8 @@ struct ec_params_mkbp_simulate_key {
 } __ec_align1;
 
 /* Configure keyboard scanning */
-#define EC_CMD_MKBP_SET_CONFIG 0x64
-#define EC_CMD_MKBP_GET_CONFIG 0x65
+#define EC_CMD_MKBP_SET_CONFIG 0x0064
+#define EC_CMD_MKBP_GET_CONFIG 0x0065
 
 /* flags */
 enum mkbp_config_flags {
@@ -2158,7 +2161,7 @@ struct ec_response_mkbp_get_config {
 } __ec_align_size1;
 
 /* Run the key scan emulation */
-#define EC_CMD_KEYSCAN_SEQ_CTRL 0x66
+#define EC_CMD_KEYSCAN_SEQ_CTRL 0x0066
 
 enum ec_keyscan_seq_cmd {
 	EC_KEYSCAN_SEQ_STATUS = 0,	/* Get status information */
@@ -2219,7 +2222,7 @@ struct ec_result_keyscan_seq_ctrl {
  *
  * The device replies with UNAVAILABLE if there aren't any pending events.
  */
-#define EC_CMD_GET_NEXT_EVENT 0x67
+#define EC_CMD_GET_NEXT_EVENT 0x0067
 
 enum ec_mkbp_event {
 	/* Keyboard matrix changed. The event data is the new matrix state. */
@@ -2298,7 +2301,7 @@ struct ec_response_get_next_event_v1 {
 /* Temperature sensor commands */
 
 /* Read temperature sensor info */
-#define EC_CMD_TEMP_SENSOR_GET_INFO 0x70
+#define EC_CMD_TEMP_SENSOR_GET_INFO 0x0070
 
 struct ec_params_temp_sensor_get_info {
 	uint8_t id;
@@ -2333,30 +2336,30 @@ struct ec_response_host_event_mask {
 } __ec_align4;
 
 /* These all use ec_response_host_event_mask */
-#define EC_CMD_HOST_EVENT_GET_B         0x87
-#define EC_CMD_HOST_EVENT_GET_SMI_MASK  0x88
-#define EC_CMD_HOST_EVENT_GET_SCI_MASK  0x89
-#define EC_CMD_HOST_EVENT_GET_WAKE_MASK 0x8d
+#define EC_CMD_HOST_EVENT_GET_B         0x0087
+#define EC_CMD_HOST_EVENT_GET_SMI_MASK  0x0088
+#define EC_CMD_HOST_EVENT_GET_SCI_MASK  0x0089
+#define EC_CMD_HOST_EVENT_GET_WAKE_MASK 0x008D
 
 /* These all use ec_params_host_event_mask */
-#define EC_CMD_HOST_EVENT_SET_SMI_MASK  0x8a
-#define EC_CMD_HOST_EVENT_SET_SCI_MASK  0x8b
-#define EC_CMD_HOST_EVENT_CLEAR         0x8c
-#define EC_CMD_HOST_EVENT_SET_WAKE_MASK 0x8e
-#define EC_CMD_HOST_EVENT_CLEAR_B       0x8f
+#define EC_CMD_HOST_EVENT_SET_SMI_MASK  0x008A
+#define EC_CMD_HOST_EVENT_SET_SCI_MASK  0x008B
+#define EC_CMD_HOST_EVENT_CLEAR         0x008C
+#define EC_CMD_HOST_EVENT_SET_WAKE_MASK 0x008E
+#define EC_CMD_HOST_EVENT_CLEAR_B       0x008F
 
 /*****************************************************************************/
 /* Switch commands */
 
 /* Enable/disable LCD backlight */
-#define EC_CMD_SWITCH_ENABLE_BKLIGHT 0x90
+#define EC_CMD_SWITCH_ENABLE_BKLIGHT 0x0090
 
 struct ec_params_switch_enable_backlight {
 	uint8_t enabled;
 } __ec_align1;
 
 /* Enable/disable WLAN/Bluetooth */
-#define EC_CMD_SWITCH_ENABLE_WIRELESS 0x91
+#define EC_CMD_SWITCH_ENABLE_WIRELESS 0x0091
 #define EC_VER_SWITCH_ENABLE_WIRELESS 1
 
 /* Version 0 params; no response */
@@ -2396,7 +2399,7 @@ struct ec_response_switch_enable_wireless_v1 {
 /* GPIO commands. Only available on EC if write protect has been disabled. */
 
 /* Set GPIO output value */
-#define EC_CMD_GPIO_SET 0x92
+#define EC_CMD_GPIO_SET 0x0092
 
 struct ec_params_gpio_set {
 	char name[32];
@@ -2404,7 +2407,7 @@ struct ec_params_gpio_set {
 } __ec_align1;
 
 /* Get GPIO value */
-#define EC_CMD_GPIO_GET 0x93
+#define EC_CMD_GPIO_GET 0x0093
 
 /* Version 0 of input params and response */
 struct ec_params_gpio_get {
@@ -2458,7 +2461,7 @@ enum gpio_get_subcmd {
  */
 
 /* Read I2C bus */
-#define EC_CMD_I2C_READ 0x94
+#define EC_CMD_I2C_READ 0x0094
 
 struct ec_params_i2c_read {
 	uint16_t addr; /* 8-bit address (7-bit shifted << 1) */
@@ -2472,7 +2475,7 @@ struct ec_response_i2c_read {
 } __ec_align2;
 
 /* Write I2C bus */
-#define EC_CMD_I2C_WRITE 0x95
+#define EC_CMD_I2C_WRITE 0x0095
 
 struct ec_params_i2c_write {
 	uint16_t data;
@@ -2488,7 +2491,7 @@ struct ec_params_i2c_write {
 /* Force charge state machine to stop charging the battery or force it to
  * discharge the battery.
  */
-#define EC_CMD_CHARGE_CONTROL 0x96
+#define EC_CMD_CHARGE_CONTROL 0x0096
 #define EC_VER_CHARGE_CONTROL 1
 
 enum ec_charge_control_mode {
@@ -2504,7 +2507,7 @@ struct ec_params_charge_control {
 /*****************************************************************************/
 
 /* Snapshot console output buffer for use by EC_CMD_CONSOLE_READ. */
-#define EC_CMD_CONSOLE_SNAPSHOT 0x97
+#define EC_CMD_CONSOLE_SNAPSHOT 0x0097
 
 /*
  * Read data from the saved snapshot. If the subcmd parameter is
@@ -2518,7 +2521,7 @@ struct ec_params_charge_control {
  * Response is null-terminated string.  Empty string, if there is no more
  * remaining output.
  */
-#define EC_CMD_CONSOLE_READ 0x98
+#define EC_CMD_CONSOLE_READ 0x0098
 
 enum ec_console_read_subcmd {
 	CONSOLE_READ_NEXT = 0,
@@ -2538,8 +2541,7 @@ struct ec_params_console_read_v1 {
  *	  EC_RES_SUCCESS if the command was successful.
  *	  EC_RES_ERROR if the cut off command failed.
  */
-
-#define EC_CMD_BATTERY_CUT_OFF 0x99
+#define EC_CMD_BATTERY_CUT_OFF 0x0099
 
 #define EC_BATTERY_CUTOFF_FLAG_AT_SHUTDOWN	(1 << 0)
 
@@ -2553,7 +2555,7 @@ struct ec_params_battery_cutoff {
 /*
  * Switch USB mux or return to automatic switching.
  */
-#define EC_CMD_USB_MUX 0x9a
+#define EC_CMD_USB_MUX 0x009A
 
 struct ec_params_usb_mux {
 	uint8_t mux;
@@ -2570,7 +2572,7 @@ enum ec_ldo_state {
 /*
  * Switch on/off a LDO.
  */
-#define EC_CMD_LDO_SET 0x9b
+#define EC_CMD_LDO_SET 0x009B
 
 struct ec_params_ldo_set {
 	uint8_t index;
@@ -2580,7 +2582,7 @@ struct ec_params_ldo_set {
 /*
  * Get LDO state.
  */
-#define EC_CMD_LDO_GET 0x9c
+#define EC_CMD_LDO_GET 0x009C
 
 struct ec_params_ldo_get {
 	uint8_t index;
@@ -2596,7 +2598,7 @@ struct ec_response_ldo_get {
 /*
  * Get power info.
  */
-#define EC_CMD_POWER_INFO 0x9d
+#define EC_CMD_POWER_INFO 0x009D
 
 struct ec_response_power_info {
 	uint32_t usb_dev_type;
@@ -2609,7 +2611,7 @@ struct ec_response_power_info {
 /*****************************************************************************/
 /* I2C passthru command */
 
-#define EC_CMD_I2C_PASSTHRU 0x9e
+#define EC_CMD_I2C_PASSTHRU 0x009E
 
 /* Read data; if not present, message is a write */
 #define EC_I2C_FLAG_READ	(1 << 15)
@@ -2644,7 +2646,7 @@ struct ec_response_i2c_passthru {
 /*****************************************************************************/
 /* Power button hang detect */
 
-#define EC_CMD_HANG_DETECT 0x9f
+#define EC_CMD_HANG_DETECT 0x009F
 
 /* Reasons to start hang detection timer */
 /* Power button pressed */
@@ -2703,7 +2705,7 @@ struct ec_params_hang_detect {
  * This is the single catch-all host command to exchange data regarding the
  * charge state machine (v2 and up).
  */
-#define EC_CMD_CHARGE_STATE 0xa0
+#define EC_CMD_CHARGE_STATE 0x00A0
 
 /* Subcommands for this host command */
 enum charge_state_command {
@@ -2774,7 +2776,7 @@ struct ec_response_charge_state {
 /*
  * Set maximum battery charging current.
  */
-#define EC_CMD_CHARGE_CURRENT_LIMIT 0xa1
+#define EC_CMD_CHARGE_CURRENT_LIMIT 0x00A1
 
 struct ec_params_current_limit {
 	uint32_t limit; /* in mA */
@@ -2794,7 +2796,7 @@ struct ec_params_external_power_limit_v1 {
 #define EC_POWER_LIMIT_NONE 0xffff
 
 /* Inform the EC when entering a sleep state */
-#define EC_CMD_HOST_SLEEP_EVENT 0xa9
+#define EC_CMD_HOST_SLEEP_EVENT 0x00A9
 
 enum host_sleep_event {
 	HOST_SLEEP_EVENT_S3_SUSPEND   = 1,
@@ -2868,14 +2870,14 @@ struct ec_response_host_sleep_event_v1 {
 /* Smart battery pass-through */
 
 /* Get / Set 16-bit smart battery registers */
-#define EC_CMD_SB_READ_WORD   0xb0
-#define EC_CMD_SB_WRITE_WORD  0xb1
+#define EC_CMD_SB_READ_WORD   0x00B0
+#define EC_CMD_SB_WRITE_WORD  0x00B1
 
 /* Get / Set string smart battery parameters
  * formatted as SMBUS "block".
  */
-#define EC_CMD_SB_READ_BLOCK  0xb2
-#define EC_CMD_SB_WRITE_BLOCK 0xb3
+#define EC_CMD_SB_READ_BLOCK  0x00B2
+#define EC_CMD_SB_WRITE_BLOCK 0x00B3
 
 struct ec_params_sb_rd {
 	uint8_t reg;
@@ -2908,7 +2910,7 @@ struct ec_params_sb_wr_block {
  * requested value.
  */
 
-#define EC_CMD_BATTERY_VENDOR_PARAM 0xb4
+#define EC_CMD_BATTERY_VENDOR_PARAM 0x00B4
 
 enum ec_battery_vendor_param_mode {
 	BATTERY_VENDOR_PARAM_MODE_GET = 0,
@@ -3024,7 +3026,7 @@ struct ec_response_codec_gain {
  * TODO(crosbug.com/p/23747): This is a confusing name, since it doesn't
  * necessarily reboot the EC.  Rename to "image" or something similar?
  */
-#define EC_CMD_REBOOT_EC 0xd2
+#define EC_CMD_REBOOT_EC 0x00D2
 
 /* Command */
 enum ec_reboot_cmd {
@@ -3052,7 +3054,7 @@ struct ec_params_reboot_ec {
  * Returns variable-length platform-dependent panic information.  See panic.h
  * for details.
  */
-#define EC_CMD_GET_PANIC_INFO 0xd3
+#define EC_CMD_GET_PANIC_INFO 0x00D3
 
 /*****************************************************************************/
 /*
@@ -3260,7 +3262,7 @@ enum mkbp_cec_event {
  *
  * Use EC_CMD_REBOOT_EC to reboot the EC more politely.
  */
-#define EC_CMD_REBOOT 0xd1  /* Think "die" */
+#define EC_CMD_REBOOT 0x00D1  /* Think "die" */
 
 /*
  * Resend last response (not supported on LPC).
@@ -3269,7 +3271,7 @@ enum mkbp_cec_event {
  * there was no previous command, or the previous command's response was too
  * big to save.
  */
-#define EC_CMD_RESEND_RESPONSE 0xdb
+#define EC_CMD_RESEND_RESPONSE 0x00DB
 
 /*
  * This header byte on a command indicate version 0. Any header byte less
@@ -3281,7 +3283,7 @@ enum mkbp_cec_event {
  *
  * The old EC interface must not use commands 0xdc or higher.
  */
-#define EC_CMD_VERSION0 0xdc
+#define EC_CMD_VERSION0 0x00DC
 
 #endif  /* !__ACPI__ */
 
@@ -3293,7 +3295,7 @@ enum mkbp_cec_event {
  */
 
 /* EC to PD MCU exchange status command */
-#define EC_CMD_PD_EXCHANGE_STATUS 0x100
+#define EC_CMD_PD_EXCHANGE_STATUS 0x0100
 
 /* Status of EC being sent to PD */
 struct ec_params_pd_status {
@@ -3307,7 +3309,7 @@ struct ec_response_pd_status {
 } __ec_align_size1;
 
 /* Set USB type-C port role and muxes */
-#define EC_CMD_USB_PD_CONTROL 0x101
+#define EC_CMD_USB_PD_CONTROL 0x0101
 
 enum usb_pd_control_role {
 	USB_PD_CTRL_ROLE_NO_CHANGE = 0,
@@ -3360,7 +3362,7 @@ struct ec_response_usb_pd_control_v1 {
 	char state[32];
 } __ec_align1;
 
-#define EC_CMD_USB_PD_PORTS 0x102
+#define EC_CMD_USB_PD_PORTS 0x0102
 
 /* Maximum number of PD ports on a device, num_ports will be <= this */
 #define EC_USB_PD_MAX_PORTS 8
@@ -3369,7 +3371,7 @@ struct ec_response_usb_pd_ports {
 	uint8_t num_ports;
 } __ec_align1;
 
-#define EC_CMD_USB_PD_POWER_INFO 0x103
+#define EC_CMD_USB_PD_POWER_INFO 0x0103
 
 #define PD_POWER_CHARGING_PORT 0xff
 struct ec_params_usb_pd_power_info {
@@ -3536,7 +3538,7 @@ struct mcdp_info {
 #define MCDP_FAMILY(family) ((family[0] << 8) | family[1])
 
 /* Get info about USB-C SS muxes */
-#define EC_CMD_USB_PD_MUX_INFO 0x11a
+#define EC_CMD_USB_PD_MUX_INFO 0x011A
 
 struct ec_params_usb_pd_mux_info {
 	uint8_t port; /* USB-C port number */
@@ -3551,6 +3553,41 @@ struct ec_params_usb_pd_mux_info {
 struct ec_response_usb_pd_mux_info {
 	uint8_t flags; /* USB_PD_MUX_*-encoded USB mux state */
 } __ec_align1;
+/*****************************************************************************/
+/*
+ * Reserve a range of host commands for board-specific, experimental, or
+ * special purpose features. These can be (re)used without updating this file.
+ *
+ * CAUTION: Don't go nuts with this. Shipping products should document ALL
+ * their EC commands for easier development, testing, debugging, and support.
+ *
+ * All commands MUST be #defined to be 4-digit UPPER CASE hex values
+ * (e.g., 0x00AB, not 0xab) for CONFIG_HOSTCMD_SECTION_SORTED to work.
+ *
+ * In your experimental code, you may want to do something like this:
+ *
+ *   #define EC_CMD_MAGIC_FOO 0x0000
+ *   #define EC_CMD_MAGIC_BAR 0x0001
+ *   #define EC_CMD_MAGIC_HEY 0x0002
+ *
+ *   DECLARE_PRIVATE_HOST_COMMAND(EC_CMD_MAGIC_FOO, magic_foo_handler,
+ *      EC_VER_MASK(0));
+ *
+ *   DECLARE_PRIVATE_HOST_COMMAND(EC_CMD_MAGIC_BAR, magic_bar_handler,
+ *      EC_VER_MASK(0));
+ *
+ *   DECLARE_PRIVATE_HOST_COMMAND(EC_CMD_MAGIC_HEY, magic_hey_handler,
+ *      EC_VER_MASK(0));
+ */
+#define EC_CMD_BOARD_SPECIFIC_BASE 0x3E00
+#define EC_CMD_BOARD_SPECIFIC_LAST 0x3FFF
+
+/*
+ * Given the private host command offset, calculate the true private host
+ * command value.
+ */
+#define EC_PRIVATE_HOST_COMMAND_VALUE(command) \
+	(EC_CMD_BOARD_SPECIFIC_BASE + (command))
 
 /*****************************************************************************/
 /*
-- 
cgit v1.2.3


From 9e81656063776b38f8fa8ffd7a5a2d4d42bdd2b4 Mon Sep 17 00:00:00 2001
From: Gwendal Grignou <gwendal@chromium.org>
Date: Mon, 3 Jun 2019 11:33:37 -0700
Subject: mfd: cros_ec: use BIT macro

Replace (1 << ...) with BIT().

Signed-off-by: Gwendal Grignou <gwendal@chromium.org>
Acked-by: Enric Balletbo i Serra <enric.balletbo@collabora.com>
Acked-by: Benson Leung <bleung@chromium.org>
Reviewed-by: Fabien Lahoudere <fabien.lahoudere@collabora.com>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 include/linux/mfd/cros_ec_commands.h | 110 +++++++++++++++++------------------
 1 file changed, 55 insertions(+), 55 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mfd/cros_ec_commands.h b/include/linux/mfd/cros_ec_commands.h
index 8ad77d8a9141..e97e9e976bd0 100644
--- a/include/linux/mfd/cros_ec_commands.h
+++ b/include/linux/mfd/cros_ec_commands.h
@@ -28,7 +28,7 @@
 #define EC_PROTO_VERSION          0x00000002
 
 /* Command version mask */
-#define EC_VER_MASK(version) (1UL << (version))
+#define EC_VER_MASK(version) BIT(version)
 
 /* I/O addresses for ACPI commands */
 #define EC_LPC_ADDR_ACPI_DATA  0x62
@@ -57,13 +57,13 @@
 #define EC_HOST_CMD_REGION_SIZE 0x80
 
 /* EC command register bit functions */
-#define EC_LPC_CMDR_DATA	(1 << 0)  /* Data ready for host to read */
-#define EC_LPC_CMDR_PENDING	(1 << 1)  /* Write pending to EC */
-#define EC_LPC_CMDR_BUSY	(1 << 2)  /* EC is busy processing a command */
-#define EC_LPC_CMDR_CMD		(1 << 3)  /* Last host write was a command */
-#define EC_LPC_CMDR_ACPI_BRST	(1 << 4)  /* Burst mode (not used) */
-#define EC_LPC_CMDR_SCI		(1 << 5)  /* SCI event is pending */
-#define EC_LPC_CMDR_SMI		(1 << 6)  /* SMI event is pending */
+#define EC_LPC_CMDR_DATA	BIT(0)  /* Data ready for host to read */
+#define EC_LPC_CMDR_PENDING	BIT(1)  /* Write pending to EC */
+#define EC_LPC_CMDR_BUSY	BIT(2)  /* EC is busy processing a command */
+#define EC_LPC_CMDR_CMD		BIT(3)  /* Last host write was a command */
+#define EC_LPC_CMDR_ACPI_BRST	BIT(4)  /* Burst mode (not used) */
+#define EC_LPC_CMDR_SCI		BIT(5)  /* SCI event is pending */
+#define EC_LPC_CMDR_SMI		BIT(6)  /* SMI event is pending */
 
 #define EC_LPC_ADDR_MEMMAP       0x900
 #define EC_MEMMAP_SIZE         255 /* ACPI IO buffer max is 255 bytes */
@@ -110,8 +110,8 @@
 
 /* Define the format of the accelerometer mapped memory status byte. */
 #define EC_MEMMAP_ACC_STATUS_SAMPLE_ID_MASK  0x0f
-#define EC_MEMMAP_ACC_STATUS_BUSY_BIT        (1 << 4)
-#define EC_MEMMAP_ACC_STATUS_PRESENCE_BIT    (1 << 7)
+#define EC_MEMMAP_ACC_STATUS_BUSY_BIT        BIT(4)
+#define EC_MEMMAP_ACC_STATUS_PRESENCE_BIT    BIT(7)
 
 /* Number of temp sensors at EC_MEMMAP_TEMP_SENSOR */
 #define EC_TEMP_SENSOR_ENTRIES     16
@@ -336,7 +336,7 @@ enum host_event_code {
 	EC_HOST_EVENT_INVALID = 32
 };
 /* Host event mask */
-#define EC_HOST_EVENT_MASK(event_code) (1UL << ((event_code) - 1))
+#define EC_HOST_EVENT_MASK(event_code) BIT_ULL((event_code) - 1)
 
 /**
  * struct ec_lpc_host_args - Arguments at EC_LPC_ADDR_HOST_ARGS
@@ -734,7 +734,7 @@ struct ec_response_get_cmd_versions {
 
 /* Avoid using ec_status which is for return values */
 enum ec_comms_status {
-	EC_COMMS_STATUS_PROCESSING	= 1 << 0,	/* Processing cmd */
+	EC_COMMS_STATUS_PROCESSING	= BIT(0),	/* Processing cmd */
 };
 
 /**
@@ -766,7 +766,7 @@ struct ec_response_test_protocol {
 
 /* Flags for ec_response_get_protocol_info.flags */
 /* EC_RES_IN_PROGRESS may be returned if a command is slow */
-#define EC_PROTOCOL_INFO_IN_PROGRESS_SUPPORTED (1 << 0)
+#define EC_PROTOCOL_INFO_IN_PROGRESS_SUPPORTED BIT(0)
 
 /**
  * struct ec_response_get_protocol_info - Response to the get protocol info.
@@ -925,8 +925,8 @@ enum ec_feature_code {
 	EC_FEATURE_ISH = 40,
 };
 
-#define EC_FEATURE_MASK_0(event_code) (1UL << (event_code % 32))
-#define EC_FEATURE_MASK_1(event_code) (1UL << (event_code - 32))
+#define EC_FEATURE_MASK_0(event_code) BIT(event_code % 32)
+#define EC_FEATURE_MASK_1(event_code) BIT(event_code - 32)
 
 struct ec_response_get_features {
 	uint32_t flags[2];
@@ -961,7 +961,7 @@ struct ec_response_flash_info {
  * Flags for version 1+ flash info command
  * EC flash erases bits to 0 instead of 1.
  */
-#define EC_FLASH_INFO_ERASE_TO_0 (1 << 0)
+#define EC_FLASH_INFO_ERASE_TO_0 BIT(0)
 
 /**
  * struct ec_response_flash_info_1 - Response to the flash info v1 command.
@@ -1061,26 +1061,26 @@ struct ec_params_flash_erase {
 
 /* Flags for flash protection */
 /* RO flash code protected when the EC boots */
-#define EC_FLASH_PROTECT_RO_AT_BOOT         (1 << 0)
+#define EC_FLASH_PROTECT_RO_AT_BOOT         BIT(0)
 /*
  * RO flash code protected now.  If this bit is set, at-boot status cannot
  * be changed.
  */
-#define EC_FLASH_PROTECT_RO_NOW             (1 << 1)
+#define EC_FLASH_PROTECT_RO_NOW             BIT(1)
 /* Entire flash code protected now, until reboot. */
-#define EC_FLASH_PROTECT_ALL_NOW            (1 << 2)
+#define EC_FLASH_PROTECT_ALL_NOW            BIT(2)
 /* Flash write protect GPIO is asserted now */
-#define EC_FLASH_PROTECT_GPIO_ASSERTED      (1 << 3)
+#define EC_FLASH_PROTECT_GPIO_ASSERTED      BIT(3)
 /* Error - at least one bank of flash is stuck locked, and cannot be unlocked */
-#define EC_FLASH_PROTECT_ERROR_STUCK        (1 << 4)
+#define EC_FLASH_PROTECT_ERROR_STUCK        BIT(4)
 /*
  * Error - flash protection is in inconsistent state.  At least one bank of
  * flash which should be protected is not protected.  Usually fixed by
  * re-requesting the desired flags, or by a hard reset if that fails.
  */
-#define EC_FLASH_PROTECT_ERROR_INCONSISTENT (1 << 5)
+#define EC_FLASH_PROTECT_ERROR_INCONSISTENT BIT(5)
 /* Entire flash code protected when the EC boots */
-#define EC_FLASH_PROTECT_ALL_AT_BOOT        (1 << 6)
+#define EC_FLASH_PROTECT_ALL_AT_BOOT        BIT(6)
 
 /**
  * struct ec_params_flash_protect - Parameters for the flash protect command.
@@ -1451,8 +1451,8 @@ enum ec_led_id {
 };
 
 /* LED control flags */
-#define EC_LED_FLAGS_QUERY (1 << 0) /* Query LED capability only */
-#define EC_LED_FLAGS_AUTO  (1 << 1) /* Switch LED back to automatic control */
+#define EC_LED_FLAGS_QUERY BIT(0) /* Query LED capability only */
+#define EC_LED_FLAGS_AUTO  BIT(1) /* Switch LED back to automatic control */
 
 enum ec_led_colors {
 	EC_LED_COLOR_RED = 0,
@@ -2116,13 +2116,13 @@ enum mkbp_config_flags {
 };
 
 enum mkbp_config_valid {
-	EC_MKBP_VALID_SCAN_PERIOD		= 1 << 0,
-	EC_MKBP_VALID_POLL_TIMEOUT		= 1 << 1,
-	EC_MKBP_VALID_MIN_POST_SCAN_DELAY	= 1 << 3,
-	EC_MKBP_VALID_OUTPUT_SETTLE		= 1 << 4,
-	EC_MKBP_VALID_DEBOUNCE_DOWN		= 1 << 5,
-	EC_MKBP_VALID_DEBOUNCE_UP		= 1 << 6,
-	EC_MKBP_VALID_FIFO_MAX_DEPTH		= 1 << 7,
+	EC_MKBP_VALID_SCAN_PERIOD		= BIT(0),
+	EC_MKBP_VALID_POLL_TIMEOUT		= BIT(1),
+	EC_MKBP_VALID_MIN_POST_SCAN_DELAY	= BIT(3),
+	EC_MKBP_VALID_OUTPUT_SETTLE		= BIT(4),
+	EC_MKBP_VALID_DEBOUNCE_DOWN		= BIT(5),
+	EC_MKBP_VALID_DEBOUNCE_UP		= BIT(6),
+	EC_MKBP_VALID_FIFO_MAX_DEPTH		= BIT(7),
 };
 
 /*
@@ -2176,7 +2176,7 @@ enum ec_collect_flags {
 	 * Indicates this scan was processed by the EC. Due to timing, some
 	 * scans may be skipped.
 	 */
-	EC_KEYSCAN_SEQ_FLAG_DONE	= 1 << 0,
+	EC_KEYSCAN_SEQ_FLAG_DONE	= BIT(0),
 };
 
 struct ec_collect_item {
@@ -2543,7 +2543,7 @@ struct ec_params_console_read_v1 {
  */
 #define EC_CMD_BATTERY_CUT_OFF 0x0099
 
-#define EC_BATTERY_CUTOFF_FLAG_AT_SHUTDOWN	(1 << 0)
+#define EC_BATTERY_CUTOFF_FLAG_AT_SHUTDOWN	BIT(0)
 
 struct ec_params_battery_cutoff {
 	uint8_t flags;
@@ -2614,13 +2614,13 @@ struct ec_response_power_info {
 #define EC_CMD_I2C_PASSTHRU 0x009E
 
 /* Read data; if not present, message is a write */
-#define EC_I2C_FLAG_READ	(1 << 15)
+#define EC_I2C_FLAG_READ	BIT(15)
 
 /* Mask for address */
 #define EC_I2C_ADDR_MASK	0x3ff
 
-#define EC_I2C_STATUS_NAK	(1 << 0) /* Transfer was not acknowledged */
-#define EC_I2C_STATUS_TIMEOUT	(1 << 1) /* Timeout during transfer */
+#define EC_I2C_STATUS_NAK	BIT(0) /* Transfer was not acknowledged */
+#define EC_I2C_STATUS_TIMEOUT	BIT(1) /* Timeout during transfer */
 
 /* Any error */
 #define EC_I2C_STATUS_ERROR	(EC_I2C_STATUS_NAK | EC_I2C_STATUS_TIMEOUT)
@@ -2650,27 +2650,27 @@ struct ec_response_i2c_passthru {
 
 /* Reasons to start hang detection timer */
 /* Power button pressed */
-#define EC_HANG_START_ON_POWER_PRESS  (1 << 0)
+#define EC_HANG_START_ON_POWER_PRESS  BIT(0)
 
 /* Lid closed */
-#define EC_HANG_START_ON_LID_CLOSE    (1 << 1)
+#define EC_HANG_START_ON_LID_CLOSE    BIT(1)
 
  /* Lid opened */
-#define EC_HANG_START_ON_LID_OPEN     (1 << 2)
+#define EC_HANG_START_ON_LID_OPEN     BIT(2)
 
 /* Start of AP S3->S0 transition (booting or resuming from suspend) */
-#define EC_HANG_START_ON_RESUME       (1 << 3)
+#define EC_HANG_START_ON_RESUME       BIT(3)
 
 /* Reasons to cancel hang detection */
 
 /* Power button released */
-#define EC_HANG_STOP_ON_POWER_RELEASE (1 << 8)
+#define EC_HANG_STOP_ON_POWER_RELEASE BIT(8)
 
 /* Any host command from AP received */
-#define EC_HANG_STOP_ON_HOST_COMMAND  (1 << 9)
+#define EC_HANG_STOP_ON_HOST_COMMAND  BIT(9)
 
 /* Stop on end of AP S0->S3 transition (suspending or shutting down) */
-#define EC_HANG_STOP_ON_SUSPEND       (1 << 10)
+#define EC_HANG_STOP_ON_SUSPEND       BIT(10)
 
 /*
  * If this flag is set, all the other fields are ignored, and the hang detect
@@ -2678,14 +2678,14 @@ struct ec_response_i2c_passthru {
  * without reconfiguring any of the other hang detect settings.  Note that
  * you must previously have configured the timeouts.
  */
-#define EC_HANG_START_NOW             (1 << 30)
+#define EC_HANG_START_NOW             BIT(30)
 
 /*
  * If this flag is set, all the other fields are ignored (including
  * EC_HANG_START_NOW).  This provides the AP a way to stop the hang timer
  * without reconfiguring any of the other hang detect settings.
  */
-#define EC_HANG_STOP_NOW              (1 << 31)
+#define EC_HANG_STOP_NOW              BIT(31)
 
 struct ec_params_hang_detect {
 	/* Flags; see EC_HANG_* */
@@ -3040,8 +3040,8 @@ enum ec_reboot_cmd {
 };
 
 /* Flags for ec_params_reboot_ec.reboot_flags */
-#define EC_REBOOT_FLAG_RESERVED0      (1 << 0)  /* Was recovery request */
-#define EC_REBOOT_FLAG_ON_AP_SHUTDOWN (1 << 1)  /* Reboot after AP shutdown */
+#define EC_REBOOT_FLAG_RESERVED0      BIT(0)  /* Was recovery request */
+#define EC_REBOOT_FLAG_ON_AP_SHUTDOWN BIT(1)  /* Reboot after AP shutdown */
 
 struct ec_params_reboot_ec {
 	uint8_t cmd;           /* enum ec_reboot_cmd */
@@ -3343,9 +3343,9 @@ struct ec_params_usb_pd_control {
 	uint8_t swap;
 } __ec_align1;
 
-#define PD_CTRL_RESP_ENABLED_COMMS      (1 << 0) /* Communication enabled */
-#define PD_CTRL_RESP_ENABLED_CONNECTED  (1 << 1) /* Device connected */
-#define PD_CTRL_RESP_ENABLED_PD_CAPABLE (1 << 2) /* Partner is PD capable */
+#define PD_CTRL_RESP_ENABLED_COMMS      BIT(0) /* Communication enabled */
+#define PD_CTRL_RESP_ENABLED_CONNECTED  BIT(1) /* Device connected */
+#define PD_CTRL_RESP_ENABLED_PD_CAPABLE BIT(2) /* Partner is PD capable */
 
 #define PD_CTRL_RESP_ROLE_POWER         BIT(0) /* 0=SNK/1=SRC */
 #define PD_CTRL_RESP_ROLE_DATA          BIT(1) /* 0=UFP/1=DFP */
@@ -3545,10 +3545,10 @@ struct ec_params_usb_pd_mux_info {
 } __ec_align1;
 
 /* Flags representing mux state */
-#define USB_PD_MUX_USB_ENABLED       (1 << 0)
-#define USB_PD_MUX_DP_ENABLED        (1 << 1)
-#define USB_PD_MUX_POLARITY_INVERTED (1 << 2)
-#define USB_PD_MUX_HPD_IRQ           (1 << 3)
+#define USB_PD_MUX_USB_ENABLED       BIT(0) /* USB connected */
+#define USB_PD_MUX_DP_ENABLED        BIT(1) /* DP connected */
+#define USB_PD_MUX_POLARITY_INVERTED BIT(2) /* CC line Polarity inverted */
+#define USB_PD_MUX_HPD_IRQ           BIT(3) /* HPD IRQ is asserted */
 
 struct ec_response_usb_pd_mux_info {
 	uint8_t flags; /* USB_PD_MUX_*-encoded USB mux state */
-- 
cgit v1.2.3


From ce86c87d73512c8ac71b9b5335c8b830c7ac9491 Mon Sep 17 00:00:00 2001
From: Gwendal Grignou <gwendal@chromium.org>
Date: Mon, 3 Jun 2019 11:33:38 -0700
Subject: mfd: cros_ec: Update ACPI interface definition

Add more fields and improve API when EC presents data through ACPI
memory space.

Signed-off-by: Gwendal Grignou <gwendal@chromium.org>
Acked-by: Enric Balletbo i Serra <enric.balletbo@collabora.com>
Acked-by: Benson Leung <bleung@chromium.org>
Reviewed-by: Fabien Lahoudere <fabien.lahoudere@collabora.com>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 include/linux/mfd/cros_ec_commands.h | 419 ++++++++++++++++++++++++-----------
 1 file changed, 293 insertions(+), 126 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mfd/cros_ec_commands.h b/include/linux/mfd/cros_ec_commands.h
index e97e9e976bd0..575066b90bab 100644
--- a/include/linux/mfd/cros_ec_commands.h
+++ b/include/linux/mfd/cros_ec_commands.h
@@ -83,13 +83,15 @@
 /* Unused 0x28 - 0x2f */
 #define EC_MEMMAP_SWITCHES         0x30	/* 8 bits */
 /* Unused 0x31 - 0x33 */
-#define EC_MEMMAP_HOST_EVENTS      0x34 /* 32 bits */
-/* Reserve 0x38 - 0x3f for additional host event-related stuff */
-/* Battery values are all 32 bits */
+#define EC_MEMMAP_HOST_EVENTS      0x34 /* 64 bits */
+/* Battery values are all 32 bits, unless otherwise noted. */
 #define EC_MEMMAP_BATT_VOLT        0x40 /* Battery Present Voltage */
 #define EC_MEMMAP_BATT_RATE        0x44 /* Battery Present Rate */
 #define EC_MEMMAP_BATT_CAP         0x48 /* Battery Remaining Capacity */
-#define EC_MEMMAP_BATT_FLAG        0x4c /* Battery State, defined below */
+#define EC_MEMMAP_BATT_FLAG        0x4c /* Battery State, see below (8-bit) */
+#define EC_MEMMAP_BATT_COUNT       0x4d /* Battery Count (8-bit) */
+#define EC_MEMMAP_BATT_INDEX       0x4e /* Current Battery Data Index (8-bit) */
+/* Unused 0x4f */
 #define EC_MEMMAP_BATT_DCAP        0x50 /* Battery Design Capacity */
 #define EC_MEMMAP_BATT_DVLT        0x54 /* Battery Design Voltage */
 #define EC_MEMMAP_BATT_LFCC        0x58 /* Battery Last Full Charge Capacity */
@@ -103,10 +105,19 @@
 /* Unused 0x84 - 0x8f */
 #define EC_MEMMAP_ACC_STATUS       0x90 /* Accelerometer status (8 bits )*/
 /* Unused 0x91 */
-#define EC_MEMMAP_ACC_DATA         0x92 /* Accelerometer data 0x92 - 0x9f */
+#define EC_MEMMAP_ACC_DATA         0x92 /* Accelerometers data 0x92 - 0x9f */
+/* 0x92: Lid Angle if available, LID_ANGLE_UNRELIABLE otherwise */
+/* 0x94 - 0x99: 1st Accelerometer */
+/* 0x9a - 0x9f: 2nd Accelerometer */
 #define EC_MEMMAP_GYRO_DATA        0xa0 /* Gyroscope data 0xa0 - 0xa5 */
-/* Unused 0xa6 - 0xfe (remember, 0xff is NOT part of the memmap region) */
+/* Unused 0xa6 - 0xdf */
 
+/*
+ * ACPI is unable to access memory mapped data at or above this offset due to
+ * limitations of the ACPI protocol. Do not place data in the range 0xe0 - 0xfe
+ * which might be needed by ACPI.
+ */
+#define EC_MEMMAP_NO_ACPI 0xe0
 
 /* Define the format of the accelerometer mapped memory status byte. */
 #define EC_MEMMAP_ACC_STATUS_SAMPLE_ID_MASK  0x0f
@@ -155,6 +166,8 @@
 #define EC_BATT_FLAG_DISCHARGING  0x04
 #define EC_BATT_FLAG_CHARGING     0x08
 #define EC_BATT_FLAG_LEVEL_CRITICAL 0x10
+/* Set if some of the static/dynamic data is invalid (or outdated). */
+#define EC_BATT_FLAG_INVALID_DATA 0x20
 
 /* Switch flags at EC_MEMMAP_SWITCHES */
 #define EC_SWITCH_LID_OPEN               0x01
@@ -180,12 +193,200 @@
 #define EC_WIRELESS_SWITCH_WWAN       0x04  /* WWAN power */
 #define EC_WIRELESS_SWITCH_WLAN_POWER 0x08  /* WLAN power */
 
+/*****************************************************************************/
+/*
+ * ACPI commands
+ *
+ * These are valid ONLY on the ACPI command/data port.
+ */
+
+/*
+ * ACPI Read Embedded Controller
+ *
+ * This reads from ACPI memory space on the EC (EC_ACPI_MEM_*).
+ *
+ * Use the following sequence:
+ *
+ *    - Write EC_CMD_ACPI_READ to EC_LPC_ADDR_ACPI_CMD
+ *    - Wait for EC_LPC_CMDR_PENDING bit to clear
+ *    - Write address to EC_LPC_ADDR_ACPI_DATA
+ *    - Wait for EC_LPC_CMDR_DATA bit to set
+ *    - Read value from EC_LPC_ADDR_ACPI_DATA
+ */
+#define EC_CMD_ACPI_READ 0x0080
+
+/*
+ * ACPI Write Embedded Controller
+ *
+ * This writes to ACPI memory space on the EC (EC_ACPI_MEM_*).
+ *
+ * Use the following sequence:
+ *
+ *    - Write EC_CMD_ACPI_WRITE to EC_LPC_ADDR_ACPI_CMD
+ *    - Wait for EC_LPC_CMDR_PENDING bit to clear
+ *    - Write address to EC_LPC_ADDR_ACPI_DATA
+ *    - Wait for EC_LPC_CMDR_PENDING bit to clear
+ *    - Write value to EC_LPC_ADDR_ACPI_DATA
+ */
+#define EC_CMD_ACPI_WRITE 0x0081
+
+/*
+ * ACPI Burst Enable Embedded Controller
+ *
+ * This enables burst mode on the EC to allow the host to issue several
+ * commands back-to-back. While in this mode, writes to mapped multi-byte
+ * data are locked out to ensure data consistency.
+ */
+#define EC_CMD_ACPI_BURST_ENABLE 0x0082
+
+/*
+ * ACPI Burst Disable Embedded Controller
+ *
+ * This disables burst mode on the EC and stops preventing EC writes to mapped
+ * multi-byte data.
+ */
+#define EC_CMD_ACPI_BURST_DISABLE 0x0083
+
+/*
+ * ACPI Query Embedded Controller
+ *
+ * This clears the lowest-order bit in the currently pending host events, and
+ * sets the result code to the 1-based index of the bit (event 0x00000001 = 1,
+ * event 0x80000000 = 32), or 0 if no event was pending.
+ */
+#define EC_CMD_ACPI_QUERY_EVENT 0x0084
+
+/* Valid addresses in ACPI memory space, for read/write commands */
+
+/* Memory space version; set to EC_ACPI_MEM_VERSION_CURRENT */
+#define EC_ACPI_MEM_VERSION            0x00
+/*
+ * Test location; writing value here updates test complement byte to (0xff -
+ * value).
+ */
+#define EC_ACPI_MEM_TEST               0x01
+/* Test complement; writes here are ignored. */
+#define EC_ACPI_MEM_TEST_COMPLIMENT    0x02
+
+/* Keyboard backlight brightness percent (0 - 100) */
+#define EC_ACPI_MEM_KEYBOARD_BACKLIGHT 0x03
+/* DPTF Target Fan Duty (0-100, 0xff for auto/none) */
+#define EC_ACPI_MEM_FAN_DUTY           0x04
+
+/*
+ * DPTF temp thresholds. Any of the EC's temp sensors can have up to two
+ * independent thresholds attached to them. The current value of the ID
+ * register determines which sensor is affected by the THRESHOLD and COMMIT
+ * registers. The THRESHOLD register uses the same EC_TEMP_SENSOR_OFFSET scheme
+ * as the memory-mapped sensors. The COMMIT register applies those settings.
+ *
+ * The spec does not mandate any way to read back the threshold settings
+ * themselves, but when a threshold is crossed the AP needs a way to determine
+ * which sensor(s) are responsible. Each reading of the ID register clears and
+ * returns one sensor ID that has crossed one of its thresholds (in either
+ * direction) since the last read. A value of 0xFF means "no new thresholds
+ * have tripped". Setting or enabling the thresholds for a sensor will clear
+ * the unread event count for that sensor.
+ */
+#define EC_ACPI_MEM_TEMP_ID            0x05
+#define EC_ACPI_MEM_TEMP_THRESHOLD     0x06
+#define EC_ACPI_MEM_TEMP_COMMIT        0x07
+/*
+ * Here are the bits for the COMMIT register:
+ *   bit 0 selects the threshold index for the chosen sensor (0/1)
+ *   bit 1 enables/disables the selected threshold (0 = off, 1 = on)
+ * Each write to the commit register affects one threshold.
+ */
+#define EC_ACPI_MEM_TEMP_COMMIT_SELECT_MASK BIT(0)
+#define EC_ACPI_MEM_TEMP_COMMIT_ENABLE_MASK BIT(1)
+/*
+ * Example:
+ *
+ * Set the thresholds for sensor 2 to 50 C and 60 C:
+ *   write 2 to [0x05]      --  select temp sensor 2
+ *   write 0x7b to [0x06]   --  C_TO_K(50) - EC_TEMP_SENSOR_OFFSET
+ *   write 0x2 to [0x07]    --  enable threshold 0 with this value
+ *   write 0x85 to [0x06]   --  C_TO_K(60) - EC_TEMP_SENSOR_OFFSET
+ *   write 0x3 to [0x07]    --  enable threshold 1 with this value
+ *
+ * Disable the 60 C threshold, leaving the 50 C threshold unchanged:
+ *   write 2 to [0x05]      --  select temp sensor 2
+ *   write 0x1 to [0x07]    --  disable threshold 1
+ */
+
+/* DPTF battery charging current limit */
+#define EC_ACPI_MEM_CHARGING_LIMIT     0x08
+
+/* Charging limit is specified in 64 mA steps */
+#define EC_ACPI_MEM_CHARGING_LIMIT_STEP_MA   64
+/* Value to disable DPTF battery charging limit */
+#define EC_ACPI_MEM_CHARGING_LIMIT_DISABLED  0xff
+
+/*
+ * Report device orientation
+ *  Bits       Definition
+ *  3:1        Device DPTF Profile Number (DDPN)
+ *               0   = Reserved for backward compatibility (indicates no valid
+ *                     profile number. Host should fall back to using TBMD).
+ *              1..7 = DPTF Profile number to indicate to host which table needs
+ *                     to be loaded.
+ *   0         Tablet Mode Device Indicator (TBMD)
+ */
+#define EC_ACPI_MEM_DEVICE_ORIENTATION 0x09
+#define EC_ACPI_MEM_TBMD_SHIFT         0
+#define EC_ACPI_MEM_TBMD_MASK          0x1
+#define EC_ACPI_MEM_DDPN_SHIFT         1
+#define EC_ACPI_MEM_DDPN_MASK          0x7
+
+/*
+ * Report device features. Uses the same format as the host command, except:
+ *
+ * bit 0 (EC_FEATURE_LIMITED) changes meaning from "EC code has a limited set
+ * of features", which is of limited interest when the system is already
+ * interpreting ACPI bytecode, to "EC_FEATURES[0-7] is not supported". Since
+ * these are supported, it defaults to 0.
+ * This allows detecting the presence of this field since older versions of
+ * the EC codebase would simply return 0xff to that unknown address. Check
+ * FEATURES0 != 0xff (or FEATURES0[0] == 0) to make sure that the other bits
+ * are valid.
+ */
+#define EC_ACPI_MEM_DEVICE_FEATURES0 0x0a
+#define EC_ACPI_MEM_DEVICE_FEATURES1 0x0b
+#define EC_ACPI_MEM_DEVICE_FEATURES2 0x0c
+#define EC_ACPI_MEM_DEVICE_FEATURES3 0x0d
+#define EC_ACPI_MEM_DEVICE_FEATURES4 0x0e
+#define EC_ACPI_MEM_DEVICE_FEATURES5 0x0f
+#define EC_ACPI_MEM_DEVICE_FEATURES6 0x10
+#define EC_ACPI_MEM_DEVICE_FEATURES7 0x11
+
+#define EC_ACPI_MEM_BATTERY_INDEX    0x12
+
+/*
+ * USB Port Power. Each bit indicates whether the corresponding USB port's power
+ * is enabled (1) or disabled (0).
+ *   bit 0 USB port ID 0
+ *   ...
+ *   bit 7 USB port ID 7
+ */
+#define EC_ACPI_MEM_USB_PORT_POWER 0x13
+
+/*
+ * ACPI addresses 0x20 - 0xff map to EC_MEMMAP offset 0x00 - 0xdf.  This data
+ * is read-only from the AP.  Added in EC_ACPI_MEM_VERSION 2.
+ */
+#define EC_ACPI_MEM_MAPPED_BEGIN   0x20
+#define EC_ACPI_MEM_MAPPED_SIZE    0xe0
+
+/* Current version of ACPI memory address space */
+#define EC_ACPI_MEM_VERSION_CURRENT 2
+
+
 /*
  * This header file is used in coreboot both in C and ACPI code.  The ACPI code
  * is pre-processed to handle constants but the ASL compiler is unable to
  * handle actual C code so keep it separate.
  */
-#ifndef __ACPI__
+
 
 /*
  * Attributes for EC request and response packets.  Just defining __packed
@@ -238,7 +439,7 @@
 #define EC_LPC_STATUS_PROCESSING  0x04
 /* Last write to EC was a command, not data */
 #define EC_LPC_STATUS_LAST_CMD    0x08
-/* EC is in burst mode.  Unsupported by Chrome EC, so this bit is never set */
+/* EC is in burst mode */
 #define EC_LPC_STATUS_BURST_MODE  0x10
 /* SCI event is pending (requesting SCI query) */
 #define EC_LPC_STATUS_SCI_PENDING 0x20
@@ -2323,6 +2524,8 @@ struct ec_response_temp_sensor_get_info {
 /*****************************************************************************/
 /* Host event commands */
 
+
+/* Obsolete. New implementation should use EC_CMD_HOST_EVENT instead */
 /*
  * Host event mask params and response structures, shared by all of the host
  * event commands below.
@@ -2348,6 +2551,86 @@ struct ec_response_host_event_mask {
 #define EC_CMD_HOST_EVENT_SET_WAKE_MASK 0x008E
 #define EC_CMD_HOST_EVENT_CLEAR_B       0x008F
 
+/*
+ * Unified host event programming interface - Should be used by newer versions
+ * of BIOS/OS to program host events and masks
+ */
+
+struct ec_params_host_event {
+
+	/* Action requested by host - one of enum ec_host_event_action. */
+	uint8_t action;
+
+	/*
+	 * Mask type that the host requested the action on - one of
+	 * enum ec_host_event_mask_type.
+	 */
+	uint8_t mask_type;
+
+	/* Set to 0, ignore on read */
+	uint16_t reserved;
+
+	/* Value to be used in case of set operations. */
+	uint64_t value;
+} __ec_align4;
+
+/*
+ * Response structure returned by EC_CMD_HOST_EVENT.
+ * Update the value on a GET request. Set to 0 on SET/CLEAR
+ */
+
+struct ec_response_host_event {
+
+	/* Mask value in case of get operation */
+	uint64_t value;
+} __ec_align4;
+
+enum ec_host_event_action {
+	/*
+	 * params.value is ignored. Value of mask_type populated
+	 * in response.value
+	 */
+	EC_HOST_EVENT_GET,
+
+	/* Bits in params.value are set */
+	EC_HOST_EVENT_SET,
+
+	/* Bits in params.value are cleared */
+	EC_HOST_EVENT_CLEAR,
+};
+
+enum ec_host_event_mask_type {
+
+	/* Main host event copy */
+	EC_HOST_EVENT_MAIN,
+
+	/* Copy B of host events */
+	EC_HOST_EVENT_B,
+
+	/* SCI Mask */
+	EC_HOST_EVENT_SCI_MASK,
+
+	/* SMI Mask */
+	EC_HOST_EVENT_SMI_MASK,
+
+	/* Mask of events that should be always reported in hostevents */
+	EC_HOST_EVENT_ALWAYS_REPORT_MASK,
+
+	/* Active wake mask */
+	EC_HOST_EVENT_ACTIVE_WAKE_MASK,
+
+	/* Lazy wake mask for S0ix */
+	EC_HOST_EVENT_LAZY_WAKE_MASK_S0IX,
+
+	/* Lazy wake mask for S3 */
+	EC_HOST_EVENT_LAZY_WAKE_MASK_S3,
+
+	/* Lazy wake mask for S5 */
+	EC_HOST_EVENT_LAZY_WAKE_MASK_S5,
+};
+
+#define EC_CMD_HOST_EVENT       0x00A4
+
 /*****************************************************************************/
 /* Switch commands */
 
@@ -3056,122 +3339,6 @@ struct ec_params_reboot_ec {
  */
 #define EC_CMD_GET_PANIC_INFO 0x00D3
 
-/*****************************************************************************/
-/*
- * ACPI commands
- *
- * These are valid ONLY on the ACPI command/data port.
- */
-
-/*
- * ACPI Read Embedded Controller
- *
- * This reads from ACPI memory space on the EC (EC_ACPI_MEM_*).
- *
- * Use the following sequence:
- *
- *    - Write EC_CMD_ACPI_READ to EC_LPC_ADDR_ACPI_CMD
- *    - Wait for EC_LPC_CMDR_PENDING bit to clear
- *    - Write address to EC_LPC_ADDR_ACPI_DATA
- *    - Wait for EC_LPC_CMDR_DATA bit to set
- *    - Read value from EC_LPC_ADDR_ACPI_DATA
- */
-#define EC_CMD_ACPI_READ 0x80
-
-/*
- * ACPI Write Embedded Controller
- *
- * This reads from ACPI memory space on the EC (EC_ACPI_MEM_*).
- *
- * Use the following sequence:
- *
- *    - Write EC_CMD_ACPI_WRITE to EC_LPC_ADDR_ACPI_CMD
- *    - Wait for EC_LPC_CMDR_PENDING bit to clear
- *    - Write address to EC_LPC_ADDR_ACPI_DATA
- *    - Wait for EC_LPC_CMDR_PENDING bit to clear
- *    - Write value to EC_LPC_ADDR_ACPI_DATA
- */
-#define EC_CMD_ACPI_WRITE 0x81
-
-/*
- * ACPI Query Embedded Controller
- *
- * This clears the lowest-order bit in the currently pending host events, and
- * sets the result code to the 1-based index of the bit (event 0x00000001 = 1,
- * event 0x80000000 = 32), or 0 if no event was pending.
- */
-#define EC_CMD_ACPI_QUERY_EVENT 0x84
-
-/* Valid addresses in ACPI memory space, for read/write commands */
-
-/* Memory space version; set to EC_ACPI_MEM_VERSION_CURRENT */
-#define EC_ACPI_MEM_VERSION            0x00
-/*
- * Test location; writing value here updates test compliment byte to (0xff -
- * value).
- */
-#define EC_ACPI_MEM_TEST               0x01
-/* Test compliment; writes here are ignored. */
-#define EC_ACPI_MEM_TEST_COMPLIMENT    0x02
-
-/* Keyboard backlight brightness percent (0 - 100) */
-#define EC_ACPI_MEM_KEYBOARD_BACKLIGHT 0x03
-/* DPTF Target Fan Duty (0-100, 0xff for auto/none) */
-#define EC_ACPI_MEM_FAN_DUTY           0x04
-
-/*
- * DPTF temp thresholds. Any of the EC's temp sensors can have up to two
- * independent thresholds attached to them. The current value of the ID
- * register determines which sensor is affected by the THRESHOLD and COMMIT
- * registers. The THRESHOLD register uses the same EC_TEMP_SENSOR_OFFSET scheme
- * as the memory-mapped sensors. The COMMIT register applies those settings.
- *
- * The spec does not mandate any way to read back the threshold settings
- * themselves, but when a threshold is crossed the AP needs a way to determine
- * which sensor(s) are responsible. Each reading of the ID register clears and
- * returns one sensor ID that has crossed one of its threshold (in either
- * direction) since the last read. A value of 0xFF means "no new thresholds
- * have tripped". Setting or enabling the thresholds for a sensor will clear
- * the unread event count for that sensor.
- */
-#define EC_ACPI_MEM_TEMP_ID            0x05
-#define EC_ACPI_MEM_TEMP_THRESHOLD     0x06
-#define EC_ACPI_MEM_TEMP_COMMIT        0x07
-/*
- * Here are the bits for the COMMIT register:
- *   bit 0 selects the threshold index for the chosen sensor (0/1)
- *   bit 1 enables/disables the selected threshold (0 = off, 1 = on)
- * Each write to the commit register affects one threshold.
- */
-#define EC_ACPI_MEM_TEMP_COMMIT_SELECT_MASK (1 << 0)
-#define EC_ACPI_MEM_TEMP_COMMIT_ENABLE_MASK (1 << 1)
-/*
- * Example:
- *
- * Set the thresholds for sensor 2 to 50 C and 60 C:
- *   write 2 to [0x05]      --  select temp sensor 2
- *   write 0x7b to [0x06]   --  C_TO_K(50) - EC_TEMP_SENSOR_OFFSET
- *   write 0x2 to [0x07]    --  enable threshold 0 with this value
- *   write 0x85 to [0x06]   --  C_TO_K(60) - EC_TEMP_SENSOR_OFFSET
- *   write 0x3 to [0x07]    --  enable threshold 1 with this value
- *
- * Disable the 60 C threshold, leaving the 50 C threshold unchanged:
- *   write 2 to [0x05]      --  select temp sensor 2
- *   write 0x1 to [0x07]    --  disable threshold 1
- */
-
-/* DPTF battery charging current limit */
-#define EC_ACPI_MEM_CHARGING_LIMIT     0x08
-
-/* Charging limit is specified in 64 mA steps */
-#define EC_ACPI_MEM_CHARGING_LIMIT_STEP_MA   64
-/* Value to disable DPTF battery charging limit */
-#define EC_ACPI_MEM_CHARGING_LIMIT_DISABLED  0xff
-
-/* Current version of ACPI memory address space */
-#define EC_ACPI_MEM_VERSION_CURRENT 1
-
-
 /*****************************************************************************/
 /*
  * HDMI CEC commands
@@ -3285,8 +3452,6 @@ enum mkbp_cec_event {
  */
 #define EC_CMD_VERSION0 0x00DC
 
-#endif  /* !__ACPI__ */
-
 /*****************************************************************************/
 /*
  * PD commands
@@ -3627,4 +3792,6 @@ struct ec_response_usb_pd_mux_info {
 #define EC_LPC_ADDR_OLD_PARAM   EC_HOST_CMD_REGION1
 #define EC_OLD_PARAM_SIZE       EC_HOST_CMD_REGION_SIZE
 
+
+
 #endif  /* __CROS_EC_COMMANDS_H */
-- 
cgit v1.2.3


From e849b87487fb2ee448318d54381608f99ce2d4e0 Mon Sep 17 00:00:00 2001
From: Gwendal Grignou <gwendal@chromium.org>
Date: Mon, 3 Jun 2019 11:33:39 -0700
Subject: mfd: cros_ec: move HDMI CEC API definition

Move near the end of file.

Signed-off-by: Gwendal Grignou <gwendal@chromium.org>
Acked-by: Enric Balletbo i Serra <enric.balletbo@collabora.com>
Acked-by: Benson Leung <bleung@chromium.org>
Reviewed-by: Fabien Lahoudere <fabien.lahoudere@collabora.com>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 include/linux/mfd/cros_ec_commands.h | 148 ++++++++++++++++++-----------------
 1 file changed, 75 insertions(+), 73 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mfd/cros_ec_commands.h b/include/linux/mfd/cros_ec_commands.h
index 575066b90bab..d8bde2b5e9ce 100644
--- a/include/linux/mfd/cros_ec_commands.h
+++ b/include/linux/mfd/cros_ec_commands.h
@@ -3211,6 +3211,81 @@ struct ec_response_battery_vendor_param {
 } __ec_align4;
 
 /*****************************************************************************/
+/*
+ * HDMI CEC commands
+ *
+ * These commands are for sending and receiving message via HDMI CEC
+ */
+
+#define MAX_CEC_MSG_LEN 16
+
+/* CEC message from the AP to be written on the CEC bus */
+#define EC_CMD_CEC_WRITE_MSG 0x00B8
+
+/**
+ * struct ec_params_cec_write - Message to write to the CEC bus
+ * @msg: message content to write to the CEC bus
+ */
+struct ec_params_cec_write {
+	uint8_t msg[MAX_CEC_MSG_LEN];
+} __ec_align1;
+
+/* Set various CEC parameters */
+#define EC_CMD_CEC_SET 0x00BA
+
+/**
+ * struct ec_params_cec_set - CEC parameters set
+ * @cmd: parameter type, can be CEC_CMD_ENABLE or CEC_CMD_LOGICAL_ADDRESS
+ * @val: in case cmd is CEC_CMD_ENABLE, this field can be 0 to disable CEC
+ *	or 1 to enable CEC functionality, in case cmd is
+ *	CEC_CMD_LOGICAL_ADDRESS, this field encodes the requested logical
+ *	address between 0 and 15 or 0xff to unregister
+ */
+struct ec_params_cec_set {
+	uint8_t cmd; /* enum cec_command */
+	uint8_t val;
+} __ec_align1;
+
+/* Read various CEC parameters */
+#define EC_CMD_CEC_GET 0x00BB
+
+/**
+ * struct ec_params_cec_get - CEC parameters get
+ * @cmd: parameter type, can be CEC_CMD_ENABLE or CEC_CMD_LOGICAL_ADDRESS
+ */
+struct ec_params_cec_get {
+	uint8_t cmd; /* enum cec_command */
+} __ec_align1;
+
+/**
+ * struct ec_response_cec_get - CEC parameters get response
+ * @val: in case cmd was CEC_CMD_ENABLE, this field will 0 if CEC is
+ *	disabled or 1 if CEC functionality is enabled,
+ *	in case cmd was CEC_CMD_LOGICAL_ADDRESS, this will encode the
+ *	configured logical address between 0 and 15 or 0xff if unregistered
+ */
+struct ec_response_cec_get {
+	uint8_t val;
+} __ec_align1;
+
+/* CEC parameters command */
+enum cec_command {
+	/* CEC reading, writing and events enable */
+	CEC_CMD_ENABLE,
+	/* CEC logical address  */
+	CEC_CMD_LOGICAL_ADDRESS,
+};
+
+/* Events from CEC to AP */
+enum mkbp_cec_event {
+	/* Outgoing message was acknowledged by a follower */
+	EC_MKBP_CEC_SEND_OK			= BIT(0),
+	/* Outgoing message was not acknowledged */
+	EC_MKBP_CEC_SEND_FAILED			= BIT(1),
+};
+
+/*****************************************************************************/
+
 /* Commands for I2S recording on audio codec. */
 
 #define EC_CMD_CODEC_I2S 0x00BC
@@ -3339,79 +3414,6 @@ struct ec_params_reboot_ec {
  */
 #define EC_CMD_GET_PANIC_INFO 0x00D3
 
-/*****************************************************************************/
-/*
- * HDMI CEC commands
- *
- * These commands are for sending and receiving message via HDMI CEC
- */
-#define EC_MAX_CEC_MSG_LEN 16
-
-/* CEC message from the AP to be written on the CEC bus */
-#define EC_CMD_CEC_WRITE_MSG 0x00B8
-
-/**
- * struct ec_params_cec_write - Message to write to the CEC bus
- * @msg: message content to write to the CEC bus
- */
-struct ec_params_cec_write {
-	uint8_t msg[EC_MAX_CEC_MSG_LEN];
-} __ec_align1;
-
-/* Set various CEC parameters */
-#define EC_CMD_CEC_SET 0x00BA
-
-/**
- * struct ec_params_cec_set - CEC parameters set
- * @cmd: parameter type, can be CEC_CMD_ENABLE or CEC_CMD_LOGICAL_ADDRESS
- * @val: in case cmd is CEC_CMD_ENABLE, this field can be 0 to disable CEC
- *	or 1 to enable CEC functionality, in case cmd is CEC_CMD_LOGICAL_ADDRESS,
- *	this field encodes the requested logical address between 0 and 15
- *	or 0xff to unregister
- */
-struct ec_params_cec_set {
-	uint8_t cmd; /* enum cec_command */
-	uint8_t val;
-} __ec_align1;
-
-/* Read various CEC parameters */
-#define EC_CMD_CEC_GET 0x00BB
-
-/**
- * struct ec_params_cec_get - CEC parameters get
- * @cmd: parameter type, can be CEC_CMD_ENABLE or CEC_CMD_LOGICAL_ADDRESS
- */
-struct ec_params_cec_get {
-	uint8_t cmd; /* enum cec_command */
-} __ec_align1;
-
-/**
- * struct ec_response_cec_get - CEC parameters get response
- * @val: in case cmd was CEC_CMD_ENABLE, this field will 0 if CEC is
- *	disabled or 1 if CEC functionality is enabled,
- *	in case cmd was CEC_CMD_LOGICAL_ADDRESS, this will encode the
- *	configured logical address between 0 and 15 or 0xff if unregistered
- */
-struct ec_response_cec_get {
-	uint8_t val;
-} __ec_align1;
-
-/* CEC parameters command */
-enum ec_cec_command {
-	/* CEC reading, writing and events enable */
-	CEC_CMD_ENABLE,
-	/* CEC logical address  */
-	CEC_CMD_LOGICAL_ADDRESS,
-};
-
-/* Events from CEC to AP */
-enum mkbp_cec_event {
-	/* Outgoing message was acknowledged by a follower */
-	EC_MKBP_CEC_SEND_OK			= BIT(0),
-	/* Outgoing message was not acknowledged */
-	EC_MKBP_CEC_SEND_FAILED			= BIT(1),
-};
-
 /*****************************************************************************/
 /*
  * Special commands
-- 
cgit v1.2.3


From fd3bbf4a47445a47b95d03f501651df0054ce117 Mon Sep 17 00:00:00 2001
From: Gwendal Grignou <gwendal@chromium.org>
Date: Mon, 3 Jun 2019 11:33:40 -0700
Subject: mfd: cros_ec: Remove zero-size structs

Empty structure size is different between C and C++.
To prevent clang warning when compiling this include file in C++
programs, remove empty structures.

Signed-off-by: Gwendal Grignou <gwendal@chromium.org>
Acked-by: Enric Balletbo i Serra <enric.balletbo@collabora.com>
Acked-by: Benson Leung <bleung@chromium.org>
Reviewed-by: Fabien Lahoudere <fabien.lahoudere@collabora.com>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 include/linux/mfd/cros_ec_commands.h | 33 ++++++++++++++++++---------------
 1 file changed, 18 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mfd/cros_ec_commands.h b/include/linux/mfd/cros_ec_commands.h
index d8bde2b5e9ce..fabf341af97f 100644
--- a/include/linux/mfd/cros_ec_commands.h
+++ b/include/linux/mfd/cros_ec_commands.h
@@ -1540,10 +1540,14 @@ struct lightbar_program {
 struct ec_params_lightbar {
 	uint8_t cmd;		      /* Command (see enum lightbar_command) */
 	union {
-		struct {
-			/* no args */
-		} dump, off, on, init, get_seq, get_params_v0, get_params_v1,
-			version, get_brightness, get_demo, suspend, resume;
+		/*
+		 * The following commands have no args:
+		 *
+		 * dump, off, on, init, get_seq, get_params_v0, get_params_v1,
+		 * version, get_brightness, get_demo, suspend, resume
+		 *
+		 * Don't use an empty struct, because C++ hates that.
+		 */
 
 		struct __ec_todo_unpacked {
 			uint8_t num;
@@ -1597,11 +1601,13 @@ struct ec_response_lightbar {
 			uint8_t red, green, blue;
 		} get_rgb;
 
-		struct {
-			/* no return params */
-		} off, on, init, set_brightness, seq, reg, set_rgb,
-			demo, set_params_v0, set_params_v1,
-			set_program, manual_suspend_ctrl, suspend, resume;
+		/*
+		 * The following commands have no response:
+		 *
+		 * off, on, init, set_brightness, seq, reg, set_rgb,
+		 * set_params_v0, set_params_v1, set_program,
+		 * manual_suspend_ctrl, suspend, resume
+		 */
 	};
 } __ec_todo_packed;
 
@@ -3021,9 +3027,7 @@ enum charge_state_params {
 struct ec_params_charge_state {
 	uint8_t cmd;				/* enum charge_state_command */
 	union {
-		struct {
-			/* no args */
-		} get_state;
+		/* get_state has no args */
 
 		struct __ec_todo_unpacked {
 			uint32_t param;		/* enum charge_state_param */
@@ -3049,9 +3053,8 @@ struct ec_response_charge_state {
 		struct __ec_align4 {
 			uint32_t value;
 		} get_param;
-		struct {
-			/* no return values */
-		} set_param;
+
+		/* set_param returns no args */
 	};
 } __ec_align4;
 
-- 
cgit v1.2.3


From 3c46ae6160aff70e01bd180dd3ddcccf09fcf901 Mon Sep 17 00:00:00 2001
From: Gwendal Grignou <gwendal@chromium.org>
Date: Mon, 3 Jun 2019 11:33:41 -0700
Subject: mfd: cros_ec: Add Flash V2 commands API

Added for supporting larger embedded controller flash.

Signed-off-by: Gwendal Grignou <gwendal@chromium.org>
Acked-by: Enric Balletbo i Serra <enric.balletbo@collabora.com>
Acked-by: Benson Leung <bleung@chromium.org>
Reviewed-by: Fabien Lahoudere <fabien.lahoudere@collabora.com>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 include/linux/mfd/cros_ec_commands.h | 150 ++++++++++++++++++++++++++++++++++-
 1 file changed, 147 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mfd/cros_ec_commands.h b/include/linux/mfd/cros_ec_commands.h
index fabf341af97f..3d1d26f62bd3 100644
--- a/include/linux/mfd/cros_ec_commands.h
+++ b/include/linux/mfd/cros_ec_commands.h
@@ -1138,6 +1138,7 @@ struct ec_response_get_features {
 
 /* Get flash info */
 #define EC_CMD_FLASH_INFO 0x0010
+#define EC_VER_FLASH_INFO 2
 
 /**
  * struct ec_response_flash_info - Response to the flash info command.
@@ -1164,6 +1165,15 @@ struct ec_response_flash_info {
  */
 #define EC_FLASH_INFO_ERASE_TO_0 BIT(0)
 
+/*
+ * Flash must be selected for read/write/erase operations to succeed.  This may
+ * be necessary on a chip where write/erase can be corrupted by other board
+ * activity, or where the chip needs to enable some sort of programming voltage,
+ * or where the read/write/erase operations require cleanly suspending other
+ * chip functionality.
+ */
+#define EC_FLASH_INFO_SELECT_REQUIRED BIT(1)
+
 /**
  * struct ec_response_flash_info_1 - Response to the flash info v1 command.
  * @flash_size: Usable flash size in bytes.
@@ -1186,6 +1196,12 @@ struct ec_response_flash_info {
  * gcc anonymous structs don't seem to get along with the __packed directive;
  * if they did we'd define the version 0 structure as a sub-structure of this
  * one.
+ *
+ * Version 2 supports flash banks of different sizes:
+ * The caller specified the number of banks it has preallocated
+ * (num_banks_desc)
+ * The EC returns the number of banks describing the flash memory.
+ * It adds banks descriptions up to num_banks_desc.
  */
 struct ec_response_flash_info_1 {
 	/* Version 0 fields; see above for description */
@@ -1199,6 +1215,42 @@ struct ec_response_flash_info_1 {
 	uint32_t flags;
 } __ec_align4;
 
+struct ec_params_flash_info_2 {
+	/* Number of banks to describe */
+	uint16_t num_banks_desc;
+	/* Reserved; set 0; ignore on read */
+	uint8_t reserved[2];
+} __ec_align4;
+
+struct ec_flash_bank {
+	/* Number of sectors in this bank. */
+	uint16_t count;
+	/* Size in power of 2 of each sector (8 --> 256 bytes) */
+	uint8_t size_exp;
+	/* Minimal write size for the sectors in this bank */
+	uint8_t write_size_exp;
+	/* Erase size for the sectors in this bank */
+	uint8_t erase_size_exp;
+	/* Size for write protection, usually identical to erase size. */
+	uint8_t protect_size_exp;
+	/* Reserved; set 0; ignore on read */
+	uint8_t reserved[2];
+};
+
+struct ec_response_flash_info_2 {
+	/* Total flash in the EC. */
+	uint32_t flash_size;
+	/* Flags; see EC_FLASH_INFO_* */
+	uint32_t flags;
+	/* Maximum size to use to send data to write to the EC. */
+	uint32_t write_ideal_size;
+	/* Number of banks present in the EC. */
+	uint16_t num_banks_total;
+	/* Number of banks described in banks array. */
+	uint16_t num_banks_desc;
+	struct ec_flash_bank banks[0];
+} __ec_align4;
+
 /*
  * Read flash
  *
@@ -1238,7 +1290,7 @@ struct ec_params_flash_write {
 #define EC_CMD_FLASH_ERASE 0x0013
 
 /**
- * struct ec_params_flash_erase - Parameters for the flash erase command.
+ * struct ec_params_flash_erase - Parameters for the flash erase command, v0.
  * @offset: Byte offset to erase.
  * @size: Size to erase in bytes.
  */
@@ -1247,6 +1299,43 @@ struct ec_params_flash_erase {
 	uint32_t size;
 } __ec_align4;
 
+/*
+ * v1 add async erase:
+ * subcommands can return:
+ * EC_RES_SUCCESS : erased (see ERASE_SECTOR_ASYNC case below).
+ * EC_RES_INVALID_PARAM : offset/size are not aligned on an erase boundary.
+ * EC_RES_ERROR : other errors.
+ * EC_RES_BUSY : an existing erase operation is in progress.
+ * EC_RES_ACCESS_DENIED: Trying to erase running image.
+ *
+ * When ERASE_SECTOR_ASYNC returns EC_RES_SUCCESS, the operation is just
+ * properly queued. The user must call ERASE_GET_RESULT subcommand to get
+ * the proper result.
+ * When ERASE_GET_RESULT returns EC_RES_BUSY, the caller must wait and send
+ * ERASE_GET_RESULT again to get the result of ERASE_SECTOR_ASYNC.
+ * ERASE_GET_RESULT command may timeout on EC where flash access is not
+ * permitted while erasing. (For instance, STM32F4).
+ */
+enum ec_flash_erase_cmd {
+	FLASH_ERASE_SECTOR,     /* Erase and wait for result */
+	FLASH_ERASE_SECTOR_ASYNC,  /* Erase and return immediately. */
+	FLASH_ERASE_GET_RESULT,  /* Ask for last erase result */
+};
+
+/**
+ * struct ec_params_flash_erase_v1 - Parameters for the flash erase command, v1.
+ * @cmd: One of ec_flash_erase_cmd.
+ * @reserved: Pad byte; currently always contains 0.
+ * @flag: No flags defined yet; set to 0.
+ * @params: Same as v0 parameters.
+ */
+struct ec_params_flash_erase_v1 {
+	uint8_t  cmd;
+	uint8_t  reserved;
+	uint16_t flag;
+	struct ec_params_flash_erase params;
+} __ec_align4;
+
 /*
  * Get/set flash protection.
  *
@@ -1282,6 +1371,15 @@ struct ec_params_flash_erase {
 #define EC_FLASH_PROTECT_ERROR_INCONSISTENT BIT(5)
 /* Entire flash code protected when the EC boots */
 #define EC_FLASH_PROTECT_ALL_AT_BOOT        BIT(6)
+/* RW flash code protected when the EC boots */
+#define EC_FLASH_PROTECT_RW_AT_BOOT         BIT(7)
+/* RW flash code protected now. */
+#define EC_FLASH_PROTECT_RW_NOW             BIT(8)
+/* Rollback information flash region protected when the EC boots */
+#define EC_FLASH_PROTECT_ROLLBACK_AT_BOOT   BIT(9)
+/* Rollback information flash region protected now */
+#define EC_FLASH_PROTECT_ROLLBACK_NOW       BIT(10)
+
 
 /**
  * struct ec_params_flash_protect - Parameters for the flash protect command.
@@ -1320,16 +1418,31 @@ struct ec_response_flash_protect {
 enum ec_flash_region {
 	/* Region which holds read-only EC image */
 	EC_FLASH_REGION_RO = 0,
-	/* Region which holds rewritable EC image */
-	EC_FLASH_REGION_RW,
+	/*
+	 * Region which holds active RW image. 'Active' is different from
+	 * 'running'. Active means 'scheduled-to-run'. Since RO image is always
+	 * scheduled to run, active/non-active applies only to RW images (for
+	 * the same reason 'update' applies only to RW images). It's a state of
+	 * an image on a flash. Running image can be RO, RW_A, RW_B but active
+	 * image can only be RW_A or RW_B. In recovery mode, an active RW image
+	 * doesn't enter 'running' state but it's still active on a flash.
+	 */
+	EC_FLASH_REGION_ACTIVE,
 	/*
 	 * Region which should be write-protected in the factory (a superset of
 	 * EC_FLASH_REGION_RO)
 	 */
 	EC_FLASH_REGION_WP_RO,
+	/* Region which holds updatable (non-active) RW image */
+	EC_FLASH_REGION_UPDATE,
 	/* Number of regions */
 	EC_FLASH_REGION_COUNT,
 };
+/*
+ * 'RW' is vague if there are multiple RW images; we mean the active one,
+ * so the old constant is deprecated.
+ */
+#define EC_FLASH_REGION_RW EC_FLASH_REGION_ACTIVE
 
 /**
  * struct ec_params_flash_region_info - Parameters for the flash region info
@@ -1364,6 +1477,37 @@ struct ec_response_vbnvcontext {
 	uint8_t block[EC_VBNV_BLOCK_SIZE];
 } __ec_align4;
 
+
+/* Get SPI flash information */
+#define EC_CMD_FLASH_SPI_INFO 0x0018
+
+struct ec_response_flash_spi_info {
+	/* JEDEC info from command 0x9F (manufacturer, memory type, size) */
+	uint8_t jedec[3];
+
+	/* Pad byte; currently always contains 0 */
+	uint8_t reserved0;
+
+	/* Manufacturer / device ID from command 0x90 */
+	uint8_t mfr_dev_id[2];
+
+	/* Status registers from command 0x05 and 0x35 */
+	uint8_t sr1, sr2;
+} __ec_align1;
+
+
+/* Select flash during flash operations */
+#define EC_CMD_FLASH_SELECT 0x0019
+
+/**
+ * struct ec_params_flash_select - Parameters for the flash select command.
+ * @select: 1 to select flash, 0 to deselect flash
+ */
+struct ec_params_flash_select {
+	uint8_t select;
+} __ec_align4;
+
+
 /*****************************************************************************/
 /* PWM commands */
 
-- 
cgit v1.2.3


From 89193a04fce5b869eaf20a330deebcde0dd6806f Mon Sep 17 00:00:00 2001
From: Gwendal Grignou <gwendal@chromium.org>
Date: Mon, 3 Jun 2019 11:33:42 -0700
Subject: mfd: cros_ec: Add PWM_SET_DUTY API

Add API for fan control.

Signed-off-by: Gwendal Grignou <gwendal@chromium.org>
Acked-by: Enric Balletbo i Serra <enric.balletbo@collabora.com>
Acked-by: Benson Leung <bleung@chromium.org>
Reviewed-by: Fabien Lahoudere <fabien.lahoudere@collabora.com>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 include/linux/mfd/cros_ec_commands.h | 20 ++++++++++++++++++--
 1 file changed, 18 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mfd/cros_ec_commands.h b/include/linux/mfd/cros_ec_commands.h
index 3d1d26f62bd3..2b3a94a4f0f4 100644
--- a/include/linux/mfd/cros_ec_commands.h
+++ b/include/linux/mfd/cros_ec_commands.h
@@ -1521,11 +1521,19 @@ struct ec_response_pwm_get_fan_rpm {
 /* Set target fan RPM */
 #define EC_CMD_PWM_SET_FAN_TARGET_RPM 0x0021
 
-struct ec_params_pwm_set_fan_target_rpm {
+/* Version 0 of input params */
+struct ec_params_pwm_set_fan_target_rpm_v0 {
 	uint32_t rpm;
+} __ec_align4;
+
+/* Version 1 of input params */
+struct ec_params_pwm_set_fan_target_rpm_v1 {
+	uint32_t rpm;
+	uint8_t fan_idx;
 } __ec_align_size1;
 
 /* Get keyboard backlight */
+/* OBSOLETE - Use EC_CMD_PWM_SET_DUTY */
 #define EC_CMD_PWM_GET_KEYBOARD_BACKLIGHT 0x0022
 
 struct ec_response_pwm_get_keyboard_backlight {
@@ -1534,6 +1542,7 @@ struct ec_response_pwm_get_keyboard_backlight {
 } __ec_align1;
 
 /* Set keyboard backlight */
+/* OBSOLETE - Use EC_CMD_PWM_SET_DUTY */
 #define EC_CMD_PWM_SET_KEYBOARD_BACKLIGHT 0x0023
 
 struct ec_params_pwm_set_keyboard_backlight {
@@ -1543,10 +1552,17 @@ struct ec_params_pwm_set_keyboard_backlight {
 /* Set target fan PWM duty cycle */
 #define EC_CMD_PWM_SET_FAN_DUTY 0x0024
 
-struct ec_params_pwm_set_fan_duty {
+/* Version 0 of input params */
+struct ec_params_pwm_set_fan_duty_v0 {
 	uint32_t percent;
 } __ec_align4;
 
+/* Version 1 of input params */
+struct ec_params_pwm_set_fan_duty_v1 {
+	uint32_t percent;
+	uint8_t fan_idx;
+} __ec_align_size1;
+
 #define EC_CMD_PWM_SET_DUTY 0x0025
 /* 16 bit duty cycle, 0xffff = 100% */
 #define EC_PWM_MAX_DUTY 0xffff
-- 
cgit v1.2.3


From de83db57d7343a201f2ef0204929015ba268e098 Mon Sep 17 00:00:00 2001
From: Gwendal Grignou <gwendal@chromium.org>
Date: Mon, 3 Jun 2019 11:33:43 -0700
Subject: mfd: cros_ec: Add lightbar v2 API

New API split commands, improve EC command latency.

Signed-off-by: Gwendal Grignou <gwendal@chromium.org>
Acked-by: Enric Balletbo i Serra <enric.balletbo@collabora.com>
Acked-by: Benson Leung <bleung@chromium.org>
Reviewed-by: Fabien Lahoudere <fabien.lahoudere@collabora.com>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 include/linux/mfd/cros_ec_commands.h | 124 +++++++++++++++++++++++++++++++++--
 1 file changed, 120 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mfd/cros_ec_commands.h b/include/linux/mfd/cros_ec_commands.h
index 2b3a94a4f0f4..0ff1941288cf 100644
--- a/include/linux/mfd/cros_ec_commands.h
+++ b/include/linux/mfd/cros_ec_commands.h
@@ -1658,7 +1658,10 @@ struct lightbar_params_v1 {
 	int32_t s3_sleep_for;
 	int32_t s3_ramp_up;
 	int32_t s3_ramp_down;
+	int32_t s5_ramp_up;
+	int32_t s5_ramp_down;
 	int32_t tap_tick_delay;
+	int32_t tap_gate_delay;
 	int32_t tap_display_time;
 
 	/* Tap-for-battery params */
@@ -1686,11 +1689,82 @@ struct lightbar_params_v1 {
 	uint8_t s0_idx[2][LB_BATTERY_LEVELS];	/* AP is running */
 	uint8_t s3_idx[2][LB_BATTERY_LEVELS];	/* AP is sleeping */
 
+	/* s5: single color pulse on inhibited power-up */
+	uint8_t s5_idx;
+
 	/* Color palette */
 	struct rgb_s color[8];			/* 0-3 are Google colors */
 } __ec_todo_packed;
 
-/* Lightbar program */
+/* Lightbar command params v2
+ * crbug.com/467716
+ *
+ * lightbar_params_v1 was too big for i2c, therefore in v2, we split them up by
+ * logical groups to make it more manageable ( < 120 bytes).
+ *
+ * NOTE: Each of these groups must be less than 120 bytes.
+ */
+
+struct lightbar_params_v2_timing {
+	/* Timing */
+	int32_t google_ramp_up;
+	int32_t google_ramp_down;
+	int32_t s3s0_ramp_up;
+	int32_t s0_tick_delay[2];		/* AC=0/1 */
+	int32_t s0a_tick_delay[2];		/* AC=0/1 */
+	int32_t s0s3_ramp_down;
+	int32_t s3_sleep_for;
+	int32_t s3_ramp_up;
+	int32_t s3_ramp_down;
+	int32_t s5_ramp_up;
+	int32_t s5_ramp_down;
+	int32_t tap_tick_delay;
+	int32_t tap_gate_delay;
+	int32_t tap_display_time;
+} __ec_todo_packed;
+
+struct lightbar_params_v2_tap {
+	/* Tap-for-battery params */
+	uint8_t tap_pct_red;
+	uint8_t tap_pct_green;
+	uint8_t tap_seg_min_on;
+	uint8_t tap_seg_max_on;
+	uint8_t tap_seg_osc;
+	uint8_t tap_idx[3];
+} __ec_todo_packed;
+
+struct lightbar_params_v2_oscillation {
+	/* Oscillation */
+	uint8_t osc_min[2];			/* AC=0/1 */
+	uint8_t osc_max[2];			/* AC=0/1 */
+	uint8_t w_ofs[2];			/* AC=0/1 */
+} __ec_todo_packed;
+
+struct lightbar_params_v2_brightness {
+	/* Brightness limits based on the backlight and AC. */
+	uint8_t bright_bl_off_fixed[2];		/* AC=0/1 */
+	uint8_t bright_bl_on_min[2];		/* AC=0/1 */
+	uint8_t bright_bl_on_max[2];		/* AC=0/1 */
+} __ec_todo_packed;
+
+struct lightbar_params_v2_thresholds {
+	/* Battery level thresholds */
+	uint8_t battery_threshold[LB_BATTERY_LEVELS - 1];
+} __ec_todo_packed;
+
+struct lightbar_params_v2_colors {
+	/* Map [AC][battery_level] to color index */
+	uint8_t s0_idx[2][LB_BATTERY_LEVELS];	/* AP is running */
+	uint8_t s3_idx[2][LB_BATTERY_LEVELS];	/* AP is sleeping */
+
+	/* s5: single color pulse on inhibited power-up */
+	uint8_t s5_idx;
+
+	/* Color palette */
+	struct rgb_s color[8];			/* 0-3 are Google colors */
+} __ec_todo_packed;
+
+/* Lightbar program. */
 #define EC_LB_PROG_LEN 192
 struct lightbar_program {
 	uint8_t size;
@@ -1704,7 +1778,10 @@ struct ec_params_lightbar {
 		 * The following commands have no args:
 		 *
 		 * dump, off, on, init, get_seq, get_params_v0, get_params_v1,
-		 * version, get_brightness, get_demo, suspend, resume
+		 * version, get_brightness, get_demo, suspend, resume,
+		 * get_params_v2_timing, get_params_v2_tap, get_params_v2_osc,
+		 * get_params_v2_bright, get_params_v2_thlds,
+		 * get_params_v2_colors
 		 *
 		 * Don't use an empty struct, because C++ hates that.
 		 */
@@ -1731,6 +1808,14 @@ struct ec_params_lightbar {
 
 		struct lightbar_params_v0 set_params_v0;
 		struct lightbar_params_v1 set_params_v1;
+
+		struct lightbar_params_v2_timing set_v2par_timing;
+		struct lightbar_params_v2_tap set_v2par_tap;
+		struct lightbar_params_v2_oscillation set_v2par_osc;
+		struct lightbar_params_v2_brightness set_v2par_bright;
+		struct lightbar_params_v2_thresholds set_v2par_thlds;
+		struct lightbar_params_v2_colors set_v2par_colors;
+
 		struct lightbar_program set_program;
 	};
 } __ec_todo_packed;
@@ -1752,6 +1837,14 @@ struct ec_response_lightbar {
 		struct lightbar_params_v0 get_params_v0;
 		struct lightbar_params_v1 get_params_v1;
 
+
+		struct lightbar_params_v2_timing get_params_v2_timing;
+		struct lightbar_params_v2_tap get_params_v2_tap;
+		struct lightbar_params_v2_oscillation get_params_v2_osc;
+		struct lightbar_params_v2_brightness get_params_v2_bright;
+		struct lightbar_params_v2_thresholds get_params_v2_thlds;
+		struct lightbar_params_v2_colors get_params_v2_colors;
+
 		struct __ec_todo_unpacked {
 			uint32_t num;
 			uint32_t flags;
@@ -1764,9 +1857,11 @@ struct ec_response_lightbar {
 		/*
 		 * The following commands have no response:
 		 *
-		 * off, on, init, set_brightness, seq, reg, set_rgb,
+		 * off, on, init, set_brightness, seq, reg, set_rgb, demo,
 		 * set_params_v0, set_params_v1, set_program,
-		 * manual_suspend_ctrl, suspend, resume
+		 * manual_suspend_ctrl, suspend, resume, set_v2par_timing,
+		 * set_v2par_tap, set_v2par_osc, set_v2par_bright,
+		 * set_v2par_thlds, set_v2par_colors
 		 */
 	};
 } __ec_todo_packed;
@@ -1795,6 +1890,18 @@ enum lightbar_command {
 	LIGHTBAR_CMD_MANUAL_SUSPEND_CTRL = 19,
 	LIGHTBAR_CMD_SUSPEND = 20,
 	LIGHTBAR_CMD_RESUME = 21,
+	LIGHTBAR_CMD_GET_PARAMS_V2_TIMING = 22,
+	LIGHTBAR_CMD_SET_PARAMS_V2_TIMING = 23,
+	LIGHTBAR_CMD_GET_PARAMS_V2_TAP = 24,
+	LIGHTBAR_CMD_SET_PARAMS_V2_TAP = 25,
+	LIGHTBAR_CMD_GET_PARAMS_V2_OSCILLATION = 26,
+	LIGHTBAR_CMD_SET_PARAMS_V2_OSCILLATION = 27,
+	LIGHTBAR_CMD_GET_PARAMS_V2_BRIGHTNESS = 28,
+	LIGHTBAR_CMD_SET_PARAMS_V2_BRIGHTNESS = 29,
+	LIGHTBAR_CMD_GET_PARAMS_V2_THRESHOLDS = 30,
+	LIGHTBAR_CMD_SET_PARAMS_V2_THRESHOLDS = 31,
+	LIGHTBAR_CMD_GET_PARAMS_V2_COLORS = 32,
+	LIGHTBAR_CMD_SET_PARAMS_V2_COLORS = 33,
 	LIGHTBAR_NUM_CMDS
 };
 
@@ -1813,6 +1920,14 @@ enum ec_led_id {
 	EC_LED_ID_POWER_LED,
 	/* LED on power adapter or its plug */
 	EC_LED_ID_ADAPTER_LED,
+	/* LED to indicate left side */
+	EC_LED_ID_LEFT_LED,
+	/* LED to indicate right side */
+	EC_LED_ID_RIGHT_LED,
+	/* LED to indicate recovery mode with HW_REINIT */
+	EC_LED_ID_RECOVERY_HW_REINIT_LED,
+	/* LED to indicate sysrq debug mode. */
+	EC_LED_ID_SYSRQ_DEBUG_LED,
 
 	EC_LED_ID_COUNT
 };
@@ -1827,6 +1942,7 @@ enum ec_led_colors {
 	EC_LED_COLOR_BLUE,
 	EC_LED_COLOR_YELLOW,
 	EC_LED_COLOR_WHITE,
+	EC_LED_COLOR_AMBER,
 
 	EC_LED_COLOR_COUNT
 };
-- 
cgit v1.2.3


From 03f6896aeb5bd03b95d29a0f22b3820773d97b9d Mon Sep 17 00:00:00 2001
From: Gwendal Grignou <gwendal@chromium.org>
Date: Mon, 3 Jun 2019 11:33:44 -0700
Subject: mfd: cros_ec: Expand hash API

Improve API to verify EC image signature.

Signed-off-by: Gwendal Grignou <gwendal@chromium.org>
Acked-by: Enric Balletbo i Serra <enric.balletbo@collabora.com>
Acked-by: Benson Leung <bleung@chromium.org>
Reviewed-by: Fabien Lahoudere <fabien.lahoudere@collabora.com>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 include/linux/mfd/cros_ec_commands.h | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mfd/cros_ec_commands.h b/include/linux/mfd/cros_ec_commands.h
index 0ff1941288cf..76943e64998a 100644
--- a/include/linux/mfd/cros_ec_commands.h
+++ b/include/linux/mfd/cros_ec_commands.h
@@ -2018,8 +2018,15 @@ enum ec_vboot_hash_status {
  * If one of these is specified, the EC will automatically update offset and
  * size to the correct values for the specified image (RO or RW).
  */
-#define EC_VBOOT_HASH_OFFSET_RO 0xfffffffe
-#define EC_VBOOT_HASH_OFFSET_RW 0xfffffffd
+#define EC_VBOOT_HASH_OFFSET_RO		0xfffffffe
+#define EC_VBOOT_HASH_OFFSET_ACTIVE	0xfffffffd
+#define EC_VBOOT_HASH_OFFSET_UPDATE	0xfffffffc
+
+/*
+ * 'RW' is vague if there are multiple RW images; we mean the active one,
+ * so the old constant is deprecated.
+ */
+#define EC_VBOOT_HASH_OFFSET_RW EC_VBOOT_HASH_OFFSET_ACTIVE
 
 /*****************************************************************************/
 /*
-- 
cgit v1.2.3


From 2908c4ed296ee2107c03503328eb951d5bc58211 Mon Sep 17 00:00:00 2001
From: Gwendal Grignou <gwendal@chromium.org>
Date: Mon, 3 Jun 2019 11:33:45 -0700
Subject: mfd: cros_ec: Add EC transport protocol v4

Introduce a new transport protocol between EC and host.

Signed-off-by: Gwendal Grignou <gwendal@chromium.org>
Acked-by: Enric Balletbo i Serra <enric.balletbo@collabora.com>
Acked-by: Benson Leung <bleung@chromium.org>
Reviewed-by: Fabien Lahoudere <fabien.lahoudere@collabora.com>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 include/linux/mfd/cros_ec_commands.h | 143 ++++++++++++++++++++++++++++++++++-
 1 file changed, 141 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mfd/cros_ec_commands.h b/include/linux/mfd/cros_ec_commands.h
index 76943e64998a..40a8069a58e8 100644
--- a/include/linux/mfd/cros_ec_commands.h
+++ b/include/linux/mfd/cros_ec_commands.h
@@ -455,7 +455,10 @@
 #define EC_LPC_STATUS_BUSY_MASK \
 	(EC_LPC_STATUS_FROM_HOST | EC_LPC_STATUS_PROCESSING)
 
-/* Host command response codes */
+/*
+ * Host command response codes (16-bit).  Note that response codes should be
+ * stored in a uint16_t rather than directly in a value of this type.
+ */
 enum ec_status {
 	EC_RES_SUCCESS = 0,
 	EC_RES_INVALID_COMMAND = 1,
@@ -471,7 +474,13 @@ enum ec_status {
 	EC_RES_OVERFLOW = 11,		/* Table / data overflow */
 	EC_RES_INVALID_HEADER = 12,     /* Header contains invalid data */
 	EC_RES_REQUEST_TRUNCATED = 13,  /* Didn't get the entire request */
-	EC_RES_RESPONSE_TOO_BIG = 14    /* Response was too big to handle */
+	EC_RES_RESPONSE_TOO_BIG = 14,   /* Response was too big to handle */
+	EC_RES_BUS_ERROR = 15,		/* Communications bus error */
+	EC_RES_BUSY = 16,		/* Up but too busy.  Should retry */
+	EC_RES_INVALID_HEADER_VERSION = 17,  /* Header version invalid */
+	EC_RES_INVALID_HEADER_CRC = 18,      /* Header CRC invalid */
+	EC_RES_INVALID_DATA_CRC = 19,        /* Data CRC invalid */
+	EC_RES_DUP_UNAVAILABLE = 20,         /* Can't resend response */
 };
 
 /*
@@ -744,6 +753,136 @@ struct ec_host_response {
 	uint16_t reserved;
 } __ec_align4;
 
+/*****************************************************************************/
+
+/*
+ * Host command protocol V4.
+ *
+ * Packets always start with a request or response header.  They are followed
+ * by data_len bytes of data.  If the data_crc_present flag is set, the data
+ * bytes are followed by a CRC-8 of that data, using the x^8 + x^2 + x + 1
+ * polynomial.
+ *
+ * Host algorithm when sending a request q:
+ *
+ * 101) tries_left=(some value, e.g. 3);
+ * 102) q.seq_num++
+ * 103) q.seq_dup=0
+ * 104) Calculate q.header_crc.
+ * 105) Send request q to EC.
+ * 106) Wait for response r.  Go to 201 if received or 301 if timeout.
+ *
+ * 201) If r.struct_version != 4, go to 301.
+ * 202) If r.header_crc mismatches calculated CRC for r header, go to 301.
+ * 203) If r.data_crc_present and r.data_crc mismatches, go to 301.
+ * 204) If r.seq_num != q.seq_num, go to 301.
+ * 205) If r.seq_dup == q.seq_dup, return success.
+ * 207) If r.seq_dup == 1, go to 301.
+ * 208) Return error.
+ *
+ * 301) If --tries_left <= 0, return error.
+ * 302) If q.seq_dup == 1, go to 105.
+ * 303) q.seq_dup = 1
+ * 304) Go to 104.
+ *
+ * EC algorithm when receiving a request q.
+ * EC has response buffer r, error buffer e.
+ *
+ * 101) If q.struct_version != 4, set e.result = EC_RES_INVALID_HEADER_VERSION
+ *      and go to 301
+ * 102) If q.header_crc mismatches calculated CRC, set e.result =
+ *      EC_RES_INVALID_HEADER_CRC and go to 301
+ * 103) If q.data_crc_present, calculate data CRC.  If that mismatches the CRC
+ *      byte at the end of the packet, set e.result = EC_RES_INVALID_DATA_CRC
+ *      and go to 301.
+ * 104) If q.seq_dup == 0, go to 201.
+ * 105) If q.seq_num != r.seq_num, go to 201.
+ * 106) If q.seq_dup == r.seq_dup, go to 205, else go to 203.
+ *
+ * 201) Process request q into response r.
+ * 202) r.seq_num = q.seq_num
+ * 203) r.seq_dup = q.seq_dup
+ * 204) Calculate r.header_crc
+ * 205) If r.data_len > 0 and data is no longer available, set e.result =
+ *      EC_RES_DUP_UNAVAILABLE and go to 301.
+ * 206) Send response r.
+ *
+ * 301) e.seq_num = q.seq_num
+ * 302) e.seq_dup = q.seq_dup
+ * 303) Calculate e.header_crc.
+ * 304) Send error response e.
+ */
+
+/* Version 4 request from host */
+struct ec_host_request4 {
+	/*
+	 * bits 0-3: struct_version: Structure version (=4)
+	 * bit    4: is_response: Is response (=0)
+	 * bits 5-6: seq_num: Sequence number
+	 * bit    7: seq_dup: Sequence duplicate flag
+	 */
+	uint8_t fields0;
+
+	/*
+	 * bits 0-4: command_version: Command version
+	 * bits 5-6: Reserved (set 0, ignore on read)
+	 * bit    7: data_crc_present: Is data CRC present after data
+	 */
+	uint8_t fields1;
+
+	/* Command code (EC_CMD_*) */
+	uint16_t command;
+
+	/* Length of data which follows this header (not including data CRC) */
+	uint16_t data_len;
+
+	/* Reserved (set 0, ignore on read) */
+	uint8_t reserved;
+
+	/* CRC-8 of above fields, using x^8 + x^2 + x + 1 polynomial */
+	uint8_t header_crc;
+} __ec_align4;
+
+/* Version 4 response from EC */
+struct ec_host_response4 {
+	/*
+	 * bits 0-3: struct_version: Structure version (=4)
+	 * bit    4: is_response: Is response (=1)
+	 * bits 5-6: seq_num: Sequence number
+	 * bit    7: seq_dup: Sequence duplicate flag
+	 */
+	uint8_t fields0;
+
+	/*
+	 * bits 0-6: Reserved (set 0, ignore on read)
+	 * bit    7: data_crc_present: Is data CRC present after data
+	 */
+	uint8_t fields1;
+
+	/* Result code (EC_RES_*) */
+	uint16_t result;
+
+	/* Length of data which follows this header (not including data CRC) */
+	uint16_t data_len;
+
+	/* Reserved (set 0, ignore on read) */
+	uint8_t reserved;
+
+	/* CRC-8 of above fields, using x^8 + x^2 + x + 1 polynomial */
+	uint8_t header_crc;
+} __ec_align4;
+
+/* Fields in fields0 byte */
+#define EC_PACKET4_0_STRUCT_VERSION_MASK	0x0f
+#define EC_PACKET4_0_IS_RESPONSE_MASK		0x10
+#define EC_PACKET4_0_SEQ_NUM_SHIFT		5
+#define EC_PACKET4_0_SEQ_NUM_MASK		0x60
+#define EC_PACKET4_0_SEQ_DUP_MASK		0x80
+
+/* Fields in fields1 byte */
+#define EC_PACKET4_1_COMMAND_VERSION_MASK	0x1f  /* (request only) */
+#define EC_PACKET4_1_DATA_CRC_PRESENT_MASK	0x80
+
 /*****************************************************************************/
 /*
  * Notes on commands:
-- 
cgit v1.2.3


From a517bb4bb8c15e6f427496b9bb7eba89f0b96bbb Mon Sep 17 00:00:00 2001
From: Gwendal Grignou <gwendal@chromium.org>
Date: Mon, 3 Jun 2019 11:33:46 -0700
Subject: mfd: cros_ec: Complete MEMS sensor API

Add new command for batched mode, add support for more sensors.

Signed-off-by: Gwendal Grignou <gwendal@chromium.org>
Acked-by: Enric Balletbo i Serra <enric.balletbo@collabora.com>
Acked-by: Benson Leung <bleung@chromium.org>
Reviewed-by: Fabien Lahoudere <fabien.lahoudere@collabora.com>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 include/linux/mfd/cros_ec_commands.h | 464 ++++++++++++++++++++++++++++++-----
 1 file changed, 406 insertions(+), 58 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mfd/cros_ec_commands.h b/include/linux/mfd/cros_ec_commands.h
index 40a8069a58e8..701b03cfa445 100644
--- a/include/linux/mfd/cros_ec_commands.h
+++ b/include/linux/mfd/cros_ec_commands.h
@@ -2191,7 +2191,13 @@ enum motionsense_command {
 
 	/*
 	 * EC Rate command is a setter/getter command for the EC sampling rate
-	 * of all motion sensors in milliseconds.
+	 * in milliseconds.
+	 * It is per sensor; the EC runs the sample task at the minimum of all
+	 * sensors' EC_RATE.
+	 * For sensors without hardware FIFO, EC_RATE should be equal to 1/ODR
+	 * to collect all the sensor samples.
+	 * For sensors with hardware FIFO, EC_RATE is used as the maximal delay
+	 * to process all motion sensors, in milliseconds.
 	 */
 	MOTIONSENSE_CMD_EC_RATE = 2,
 
@@ -2222,32 +2228,76 @@ enum motionsense_command {
 	MOTIONSENSE_CMD_DATA = 6,
 
 	/*
-	 * Perform low level calibration.. On sensors that support it, ask to
-	 * do offset calibration.
+	 * Return sensor fifo info.
+	 */
+	MOTIONSENSE_CMD_FIFO_INFO = 7,
+
+	/*
+	 * Insert a flush element in the fifo and return sensor fifo info.
+	 * The host can use that element to synchronize its operation.
+	 */
+	MOTIONSENSE_CMD_FIFO_FLUSH = 8,
+
+	/*
+	 * Return a portion of the fifo.
+	 */
+	MOTIONSENSE_CMD_FIFO_READ = 9,
+
+	/*
+	 * Perform low level calibration.
+	 * On sensors that support it, ask to do offset calibration.
 	 */
 	MOTIONSENSE_CMD_PERFORM_CALIB = 10,
 
 	/*
-	 * Sensor Offset command is a setter/getter command for the offset used
-	 * for calibration. The offsets can be calculated by the host, or via
+	 * Sensor Offset command is a setter/getter command for the offset
+	 * used for calibration.
+	 * The offsets can be calculated by the host, or via
 	 * PERFORM_CALIB command.
 	 */
 	MOTIONSENSE_CMD_SENSOR_OFFSET = 11,
 
-	/* Number of motionsense sub-commands. */
-	MOTIONSENSE_NUM_CMDS
-};
+	/*
+	 * List available activities for a MOTION sensor.
+	 * Indicates if they are enabled or disabled.
+	 */
+	MOTIONSENSE_CMD_LIST_ACTIVITIES = 12,
+
+	/*
+	 * Activity management
+	 * Enable/Disable activity recognition.
+	 */
+	MOTIONSENSE_CMD_SET_ACTIVITY = 13,
+
+	/*
+	 * Lid Angle
+	 */
+	MOTIONSENSE_CMD_LID_ANGLE = 14,
+
+	/*
+	 * Allow the FIFO to trigger interrupt via MKBP events.
+	 * By default the FIFO does not send interrupt to process the FIFO
+	 * until the AP is ready or it is coming from a wakeup sensor.
+	 */
+	MOTIONSENSE_CMD_FIFO_INT_ENABLE = 15,
+
+	/*
+	 * Spoof the readings of the sensors.  The spoofed readings can be set
+	 * to arbitrary values, or will lock to the last read actual values.
+	 */
+	MOTIONSENSE_CMD_SPOOF = 16,
 
-enum motionsensor_id {
-	EC_MOTION_SENSOR_ACCEL_BASE = 0,
-	EC_MOTION_SENSOR_ACCEL_LID = 1,
-	EC_MOTION_SENSOR_GYRO = 2,
+	/* Set lid angle for tablet mode detection. */
+	MOTIONSENSE_CMD_TABLET_MODE_LID_ANGLE = 17,
 
 	/*
-	 * Note, if more sensors are added and this count changes, the padding
-	 * in ec_response_motion_sense dump command must be modified.
+	 * Sensor Scale command is a setter/getter command for the calibration
+	 * scale.
 	 */
-	EC_MOTION_SENSOR_COUNT = 3
+	MOTIONSENSE_CMD_SENSOR_SCALE = 18,
+
+	/* Number of motionsense sub-commands. */
+	MOTIONSENSE_NUM_CMDS
 };
 
 /* List of motion sensor types. */
@@ -2259,6 +2309,7 @@ enum motionsensor_type {
 	MOTIONSENSE_TYPE_LIGHT = 4,
 	MOTIONSENSE_TYPE_ACTIVITY = 5,
 	MOTIONSENSE_TYPE_BARO = 6,
+	MOTIONSENSE_TYPE_SYNC = 7,
 	MOTIONSENSE_TYPE_MAX,
 };
 
@@ -2266,36 +2317,48 @@ enum motionsensor_type {
 enum motionsensor_location {
 	MOTIONSENSE_LOC_BASE = 0,
 	MOTIONSENSE_LOC_LID = 1,
+	MOTIONSENSE_LOC_CAMERA = 2,
 	MOTIONSENSE_LOC_MAX,
 };
 
 /* List of motion sensor chips. */
 enum motionsensor_chip {
 	MOTIONSENSE_CHIP_KXCJ9 = 0,
+	MOTIONSENSE_CHIP_LSM6DS0 = 1,
+	MOTIONSENSE_CHIP_BMI160 = 2,
+	MOTIONSENSE_CHIP_SI1141 = 3,
+	MOTIONSENSE_CHIP_SI1142 = 4,
+	MOTIONSENSE_CHIP_SI1143 = 5,
+	MOTIONSENSE_CHIP_KX022 = 6,
+	MOTIONSENSE_CHIP_L3GD20H = 7,
+	MOTIONSENSE_CHIP_BMA255 = 8,
+	MOTIONSENSE_CHIP_BMP280 = 9,
+	MOTIONSENSE_CHIP_OPT3001 = 10,
+	MOTIONSENSE_CHIP_BH1730 = 11,
+	MOTIONSENSE_CHIP_GPIO = 12,
+	MOTIONSENSE_CHIP_LIS2DH = 13,
+	MOTIONSENSE_CHIP_LSM6DSM = 14,
+	MOTIONSENSE_CHIP_LIS2DE = 15,
+	MOTIONSENSE_CHIP_LIS2MDL = 16,
+	MOTIONSENSE_CHIP_LSM6DS3 = 17,
+	MOTIONSENSE_CHIP_LSM6DSO = 18,
+	MOTIONSENSE_CHIP_LNG2DM = 19,
+	MOTIONSENSE_CHIP_MAX,
 };
 
-/* Module flag masks used for the dump sub-command. */
-#define MOTIONSENSE_MODULE_FLAG_ACTIVE (1<<0)
-
-/* Sensor flag masks used for the dump sub-command. */
-#define MOTIONSENSE_SENSOR_FLAG_PRESENT (1<<0)
-
-/*
- * Send this value for the data element to only perform a read. If you
- * send any other value, the EC will interpret it as data to set and will
- * return the actual value set.
- */
-#define EC_MOTION_SENSE_NO_VALUE -1
-
-#define EC_MOTION_SENSE_INVALID_CALIB_TEMP 0x8000
-
-/* Set Calibration information */
-#define MOTION_SENSE_SET_OFFSET	1
+/* List of orientation positions */
+enum motionsensor_orientation {
+	MOTIONSENSE_ORIENTATION_LANDSCAPE = 0,
+	MOTIONSENSE_ORIENTATION_PORTRAIT = 1,
+	MOTIONSENSE_ORIENTATION_UPSIDE_DOWN_PORTRAIT = 2,
+	MOTIONSENSE_ORIENTATION_UPSIDE_DOWN_LANDSCAPE = 3,
+	MOTIONSENSE_ORIENTATION_UNKNOWN = 4,
+};
 
 struct ec_response_motion_sensor_data {
 	/* Flags for each sensor. */
 	uint8_t flags;
-	/* Sensor number the data comes from */
+	/* Sensor number the data comes from. */
 	uint8_t sensor_num;
 	/* Each sensor is up to 3-axis. */
 	union {
@@ -2312,22 +2375,138 @@ struct ec_response_motion_sensor_data {
 	};
 } __ec_todo_packed;
 
+/* Note: used in ec_response_get_next_data */
+struct ec_response_motion_sense_fifo_info {
+	/* Size of the fifo */
+	uint16_t size;
+	/* Amount of space used in the fifo */
+	uint16_t count;
+	/* Timestamp recorded in us.
+	 * aka accurate timestamp when host event was triggered.
+	 */
+	uint32_t timestamp;
+	/* Total number of vectors lost */
+	uint16_t total_lost;
+	/* Lost events since the last fifo_info, per sensor */
+	uint16_t lost[0];
+} __ec_todo_packed;
+
+struct ec_response_motion_sense_fifo_data {
+	uint32_t number_data;
+	struct ec_response_motion_sensor_data data[0];
+} __ec_todo_packed;
+
+/* List supported activity recognition */
+enum motionsensor_activity {
+	MOTIONSENSE_ACTIVITY_RESERVED = 0,
+	MOTIONSENSE_ACTIVITY_SIG_MOTION = 1,
+	MOTIONSENSE_ACTIVITY_DOUBLE_TAP = 2,
+	MOTIONSENSE_ACTIVITY_ORIENTATION = 3,
+};
+
+struct ec_motion_sense_activity {
+	uint8_t sensor_num;
+	uint8_t activity; /* one of enum motionsensor_activity */
+	uint8_t enable;   /* 1: enable, 0: disable */
+	uint8_t reserved;
+	uint16_t parameters[3]; /* activity dependent parameters */
+} __ec_todo_unpacked;
+
+/* Module flag masks used for the dump sub-command. */
+#define MOTIONSENSE_MODULE_FLAG_ACTIVE BIT(0)
+
+/* Sensor flag masks used for the dump sub-command. */
+#define MOTIONSENSE_SENSOR_FLAG_PRESENT BIT(0)
+
+/*
+ * Flush entry for synchronization.
+ * data contains time stamp
+ */
+#define MOTIONSENSE_SENSOR_FLAG_FLUSH BIT(0)
+#define MOTIONSENSE_SENSOR_FLAG_TIMESTAMP BIT(1)
+#define MOTIONSENSE_SENSOR_FLAG_WAKEUP BIT(2)
+#define MOTIONSENSE_SENSOR_FLAG_TABLET_MODE BIT(3)
+#define MOTIONSENSE_SENSOR_FLAG_ODR BIT(4)
+
+/*
+ * Send this value for the data element to only perform a read. If you
+ * send any other value, the EC will interpret it as data to set and will
+ * return the actual value set.
+ */
+#define EC_MOTION_SENSE_NO_VALUE -1
+
+#define EC_MOTION_SENSE_INVALID_CALIB_TEMP 0x8000
+
+/* MOTIONSENSE_CMD_SENSOR_OFFSET subcommand flag */
+/* Set Calibration information */
+#define MOTION_SENSE_SET_OFFSET BIT(0)
+
+/* Default Scale value, factor 1. */
+#define MOTION_SENSE_DEFAULT_SCALE BIT(15)
+
+#define LID_ANGLE_UNRELIABLE 500
+
+enum motionsense_spoof_mode {
+	/* Disable spoof mode. */
+	MOTIONSENSE_SPOOF_MODE_DISABLE = 0,
+
+	/* Enable spoof mode, but use provided component values. */
+	MOTIONSENSE_SPOOF_MODE_CUSTOM,
+
+	/* Enable spoof mode, but use the current sensor values. */
+	MOTIONSENSE_SPOOF_MODE_LOCK_CURRENT,
+
+	/* Query the current spoof mode status for the sensor. */
+	MOTIONSENSE_SPOOF_MODE_QUERY,
+};
+
 struct ec_params_motion_sense {
 	uint8_t cmd;
 	union {
 		/* Used for MOTIONSENSE_CMD_DUMP. */
 		struct __ec_todo_unpacked {
-			/* no args */
+			/*
+			 * Maximal number of sensors the host is expecting.
+			 * 0 means the host is only interested in the number
+			 * of sensors controlled by the EC.
+			 */
+			uint8_t max_sensor_count;
 		} dump;
 
 		/*
-		 * Used for MOTIONSENSE_CMD_EC_RATE and
-		 * MOTIONSENSE_CMD_KB_WAKE_ANGLE.
+		 * Used for MOTIONSENSE_CMD_KB_WAKE_ANGLE.
 		 */
 		struct __ec_todo_unpacked {
-			/* Data to set or EC_MOTION_SENSE_NO_VALUE to read. */
+			/* Data to set or EC_MOTION_SENSE_NO_VALUE to read.
+			 * kb_wake_angle: angle to wake up the AP.
+			 */
 			int16_t data;
-		} ec_rate, kb_wake_angle;
+		} kb_wake_angle;
+
+		/*
+		 * Used for MOTIONSENSE_CMD_INFO, MOTIONSENSE_CMD_DATA
+		 * and MOTIONSENSE_CMD_PERFORM_CALIB.
+		 */
+		struct __ec_todo_unpacked {
+			uint8_t sensor_num;
+		} info, info_3, data, fifo_flush, perform_calib,
+				list_activities;
+
+		/*
+		 * Used for MOTIONSENSE_CMD_EC_RATE, MOTIONSENSE_CMD_SENSOR_ODR
+		 * and MOTIONSENSE_CMD_SENSOR_RANGE.
+		 */
+		struct __ec_todo_unpacked {
+			uint8_t sensor_num;
+
+			/* Rounding flag, true for round-up, false for down. */
+			uint8_t roundup;
+
+			uint16_t reserved;
+
+			/* Data to set or EC_MOTION_SENSE_NO_VALUE to read. */
+			int32_t data;
+		} ec_rate, sensor_odr, sensor_range;
 
 		/* Used for MOTIONSENSE_CMD_SENSOR_OFFSET */
 		struct __ec_todo_packed {
@@ -2358,33 +2537,99 @@ struct ec_params_motion_sense {
 			int16_t offset[3];
 		} sensor_offset;
 
-		/* Used for MOTIONSENSE_CMD_INFO. */
+		/* Used for MOTIONSENSE_CMD_SENSOR_SCALE */
 		struct __ec_todo_packed {
 			uint8_t sensor_num;
-		} info;
 
-		/*
-		 * Used for MOTIONSENSE_CMD_SENSOR_ODR and
-		 * MOTIONSENSE_CMD_SENSOR_RANGE.
-		 */
-		struct {
-			/* Should be element of enum motionsensor_id. */
-			uint8_t sensor_num;
+			/*
+			 * bit 0: If set (MOTION_SENSE_SET_OFFSET), set
+			 * the calibration information in the EC.
+			 * If unset, just retrieve calibration information.
+			 */
+			uint16_t flags;
 
-			/* Rounding flag, true for round-up, false for down. */
-			uint8_t roundup;
+			/*
+			 * Temperature at calibration, in units of 0.01 C
+			 * 0x8000: invalid / unknown.
+			 * 0x0: 0C
+			 * 0x7fff: +327.67C
+			 */
+			int16_t temp;
 
-			uint16_t reserved;
+			/*
+			 * Scale for calibration:
+			 * By default scale is 1, it is encoded on 16bits:
+			 * 1 = BIT(15)
+			 * ~2 = 0xFFFF
+			 * ~0 = 0.
+			 */
+			uint16_t scale[3];
+		} sensor_scale;
 
-			/* Data to set or EC_MOTION_SENSE_NO_VALUE to read. */
-			int32_t data;
-		} sensor_odr, sensor_range;
+
+		/* Used for MOTIONSENSE_CMD_FIFO_INFO */
+		/* (no params) */
+
+		/* Used for MOTIONSENSE_CMD_FIFO_READ */
+		struct __ec_todo_unpacked {
+			/*
+			 * Number of expected vectors to return.
+			 * EC may return less or 0 if none available.
+			 */
+			uint32_t max_data_vector;
+		} fifo_read;
+
+		struct ec_motion_sense_activity set_activity;
+
+		/* Used for MOTIONSENSE_CMD_LID_ANGLE */
+		/* (no params) */
+
+		/* Used for MOTIONSENSE_CMD_FIFO_INT_ENABLE */
+		struct __ec_todo_unpacked {
+			/*
+			 * 1: enable, 0 disable fifo,
+			 * EC_MOTION_SENSE_NO_VALUE return value.
+			 */
+			int8_t enable;
+		} fifo_int_enable;
+
+		/* Used for MOTIONSENSE_CMD_SPOOF */
+		struct __ec_todo_packed {
+			uint8_t sensor_id;
+
+			/* See enum motionsense_spoof_mode. */
+			uint8_t spoof_enable;
+
+			/* Ignored, used for alignment. */
+			uint8_t reserved;
+
+			/* Individual component values to spoof. */
+			int16_t components[3];
+		} spoof;
+
+		/* Used for MOTIONSENSE_CMD_TABLET_MODE_LID_ANGLE. */
+		struct __ec_todo_unpacked {
+			/*
+			 * Lid angle threshold for switching between tablet and
+			 * clamshell mode.
+			 */
+			int16_t lid_angle;
+
+			/*
+			 * Hysteresis degree to prevent fluctuations between
+			 * clamshell and tablet mode if lid angle keeps
+			 * changing around the threshold. Lid motion driver will
+			 * use lid_angle + hys_degree to trigger tablet mode and
+			 * lid_angle - hys_degree to trigger clamshell mode.
+			 */
+			int16_t hys_degree;
+		} tablet_mode_threshold;
 	};
 } __ec_todo_packed;
 
 struct ec_response_motion_sense {
 	union {
-		/* Used for MOTIONSENSE_CMD_DUMP. */
+		/* Used for MOTIONSENSE_CMD_DUMP */
 		struct __ec_todo_unpacked {
 			/* Flags representing the motion sensor module. */
 			uint8_t module_flags;
@@ -2411,27 +2656,118 @@ struct ec_response_motion_sense {
 			uint8_t chip;
 		} info;
 
+		/* Used for MOTIONSENSE_CMD_INFO version 3 */
+		struct __ec_todo_unpacked {
+			/* Should be element of enum motionsensor_type. */
+			uint8_t type;
+
+			/* Should be element of enum motionsensor_location. */
+			uint8_t location;
+
+			/* Should be element of enum motionsensor_chip. */
+			uint8_t chip;
+
+			/* Minimum sensor sampling frequency */
+			uint32_t min_frequency;
+
+			/* Maximum sensor sampling frequency */
+			uint32_t max_frequency;
+
+			/* Max number of sensor events that could be in fifo */
+			uint32_t fifo_max_event_count;
+		} info_3;
+
 		/* Used for MOTIONSENSE_CMD_DATA */
 		struct ec_response_motion_sensor_data data;
 
 		/*
 		 * Used for MOTIONSENSE_CMD_EC_RATE, MOTIONSENSE_CMD_SENSOR_ODR,
-		 * MOTIONSENSE_CMD_SENSOR_RANGE, and
-		 * MOTIONSENSE_CMD_KB_WAKE_ANGLE.
+		 * MOTIONSENSE_CMD_SENSOR_RANGE,
+		 * MOTIONSENSE_CMD_KB_WAKE_ANGLE,
+		 * MOTIONSENSE_CMD_FIFO_INT_ENABLE and
+		 * MOTIONSENSE_CMD_SPOOF.
 		 */
 		struct __ec_todo_unpacked {
 			/* Current value of the parameter queried. */
 			int32_t ret;
-		} ec_rate, sensor_odr, sensor_range, kb_wake_angle;
+		} ec_rate, sensor_odr, sensor_range, kb_wake_angle,
+		  fifo_int_enable, spoof;
 
-		/* Used for MOTIONSENSE_CMD_SENSOR_OFFSET */
+		/*
+		 * Used for MOTIONSENSE_CMD_SENSOR_OFFSET,
+		 * PERFORM_CALIB.
+		 */
 		struct __ec_todo_unpacked  {
 			int16_t temp;
 			int16_t offset[3];
 		} sensor_offset, perform_calib;
+
+		/* Used for MOTIONSENSE_CMD_SENSOR_SCALE */
+		struct __ec_todo_unpacked  {
+			int16_t temp;
+			uint16_t scale[3];
+		} sensor_scale;
+
+		struct ec_response_motion_sense_fifo_info fifo_info, fifo_flush;
+
+		struct ec_response_motion_sense_fifo_data fifo_read;
+
+		struct __ec_todo_packed {
+			uint16_t reserved;
+			uint32_t enabled;
+			uint32_t disabled;
+		} list_activities;
+
+		/* No params for set activity */
+
+		/* Used for MOTIONSENSE_CMD_LID_ANGLE */
+		struct __ec_todo_unpacked {
+			/*
+			 * Angle between 0 and 360 degree if available,
+			 * LID_ANGLE_UNRELIABLE otherwise.
+			 */
+			uint16_t value;
+		} lid_angle;
+
+		/* Used for MOTIONSENSE_CMD_TABLET_MODE_LID_ANGLE. */
+		struct __ec_todo_unpacked {
+			/*
+			 * Lid angle threshold for switching between tablet and
+			 * clamshell mode.
+			 */
+			uint16_t lid_angle;
+
+			/* Hysteresis degree. */
+			uint16_t hys_degree;
+		} tablet_mode_threshold;
+
 	};
 } __ec_todo_packed;
 
+/*****************************************************************************/
+/* Force lid open command */
+
+/* Make lid event always open */
+#define EC_CMD_FORCE_LID_OPEN 0x002C
+
+struct ec_params_force_lid_open {
+	uint8_t enabled;
+} __ec_align1;
+
+/*****************************************************************************/
+/* Configure the behavior of the power button */
+#define EC_CMD_CONFIG_POWER_BUTTON 0x002D
+
+enum ec_config_power_button_flags {
+	/* Enable/Disable power button pulses for x86 devices */
+	EC_POWER_BUTTON_ENABLE_PULSE = BIT(0),
+};
+
+struct ec_params_config_power_button {
+	/* See enum ec_config_power_button_flags */
+	uint8_t flags;
+} __ec_align1;
+
 /*****************************************************************************/
 /* USB charging control commands */
 
@@ -2888,6 +3224,12 @@ union __ec_align_offset1 ec_response_get_next_data {
 	/* Unaligned */
 	uint32_t  host_event;
 
+	struct __ec_todo_unpacked {
+		/* For aligning the fifo_info */
+		uint8_t reserved[3];
+		struct ec_response_motion_sense_fifo_info info;
+	} sensor_fifo;
+
 	uint32_t   buttons;
 	uint32_t   switches;
 	uint32_t   sysrq;
@@ -2896,6 +3238,12 @@ union __ec_align_offset1 ec_response_get_next_data {
 union __ec_align_offset1 ec_response_get_next_data_v1 {
 	uint8_t key_matrix[16];
 	uint32_t host_event;
+	struct __ec_todo_unpacked {
+		/* For aligning the fifo_info */
+		uint8_t reserved[3];
+		struct ec_response_motion_sense_fifo_info info;
+	} sensor_fifo;
+
 	uint32_t buttons;
 	uint32_t switches;
 	uint32_t sysrq;
-- 
cgit v1.2.3


From 784dd15c930fd65576200bee225a2796e363c342 Mon Sep 17 00:00:00 2001
From: Gwendal Grignou <gwendal@chromium.org>
Date: Mon, 3 Jun 2019 11:33:47 -0700
Subject: mfd: cros_ec: Fix event processing API

Improve API between EC and Host to report events.

Signed-off-by: Gwendal Grignou <gwendal@chromium.org>
Acked-by: Enric Balletbo i Serra <enric.balletbo@collabora.com>
Acked-by: Benson Leung <bleung@chromium.org>
Reviewed-by: Fabien Lahoudere <fabien.lahoudere@collabora.com>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 include/linux/mfd/cros_ec_commands.h | 87 +++++++++++++++++++++++++++++++-----
 1 file changed, 76 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mfd/cros_ec_commands.h b/include/linux/mfd/cros_ec_commands.h
index 701b03cfa445..51fe65170ce6 100644
--- a/include/linux/mfd/cros_ec_commands.h
+++ b/include/linux/mfd/cros_ec_commands.h
@@ -500,7 +500,8 @@ enum host_event_code {
 	EC_HOST_EVENT_BATTERY_CRITICAL = 7,
 	EC_HOST_EVENT_BATTERY = 8,
 	EC_HOST_EVENT_THERMAL_THRESHOLD = 9,
-	EC_HOST_EVENT_THERMAL_OVERLOAD = 10,
+	/* Event generated by a device attached to the EC */
+	EC_HOST_EVENT_DEVICE = 10,
 	EC_HOST_EVENT_THERMAL = 11,
 	EC_HOST_EVENT_USB_CHARGER = 12,
 	EC_HOST_EVENT_KEY_PRESSED = 13,
@@ -527,15 +528,34 @@ enum host_event_code {
 	EC_HOST_EVENT_HANG_DETECT = 20,
 	/* Hang detect logic detected a hang and warm rebooted the AP */
 	EC_HOST_EVENT_HANG_REBOOT = 21,
+
 	/* PD MCU triggering host event */
 	EC_HOST_EVENT_PD_MCU = 22,
 
-	/* EC desires to change state of host-controlled USB mux */
-	EC_HOST_EVENT_USB_MUX = 28,
+	/* Battery Status flags have changed */
+	EC_HOST_EVENT_BATTERY_STATUS = 23,
+
+	/* EC encountered a panic, triggering a reset */
+	EC_HOST_EVENT_PANIC = 24,
+
+	/* Keyboard fastboot combo has been pressed */
+	EC_HOST_EVENT_KEYBOARD_FASTBOOT = 25,
 
 	/* EC RTC event occurred */
 	EC_HOST_EVENT_RTC = 26,
 
+	/* Emulate MKBP event */
+	EC_HOST_EVENT_MKBP = 27,
+
+	/* EC desires to change state of host-controlled USB mux */
+	EC_HOST_EVENT_USB_MUX = 28,
+
+	/* TABLET/LAPTOP mode or detachable base attach/detach event */
+	EC_HOST_EVENT_MODE_CHANGE = 29,
+
+	/* Keyboard recovery combo with hardware reinitialization */
+	EC_HOST_EVENT_KEYBOARD_RECOVERY_HW_REINIT = 30,
+
 	/*
 	 * The high bit of the event mask is not used as a host event code.  If
 	 * it reads back as set, then the entire event mask should be
@@ -1259,7 +1279,7 @@ enum ec_feature_code {
 	EC_FEATURE_REFINED_TABLET_MODE_HYSTERESIS = 37,
 	/* EC supports audio codec. */
 	EC_FEATURE_AUDIO_CODEC = 38,
-	/* EC Supports SCP. */
+	/* The MCU is a System Companion Processor (SCP). */
 	EC_FEATURE_SCP = 39,
 	/* The MCU is an Integrated Sensor Hub */
 	EC_FEATURE_ISH = 40,
@@ -3183,12 +3203,23 @@ struct ec_result_keyscan_seq_ctrl {
 } __ec_todo_packed;
 
 /*
- * Command for retrieving the next pending MKBP event from the EC device
+ * Get the next pending MKBP event.
  *
- * The device replies with UNAVAILABLE if there aren't any pending events.
+ * Returns EC_RES_UNAVAILABLE if there is no event pending.
  */
 #define EC_CMD_GET_NEXT_EVENT 0x0067
 
+#define EC_MKBP_HAS_MORE_EVENTS_SHIFT 7
+
+/*
+ * We use the most significant bit of the event type to indicate to the host
+ * that the EC has more MKBP events available to provide.
+ */
+#define EC_MKBP_HAS_MORE_EVENTS BIT(EC_MKBP_HAS_MORE_EVENTS_SHIFT)
+
+/* The mask to apply to get the raw event type */
+#define EC_MKBP_EVENT_TYPE_MASK (BIT(EC_MKBP_HAS_MORE_EVENTS_SHIFT) - 1)
+
 enum ec_mkbp_event {
 	/* Keyboard matrix changed. The event data is the new matrix state. */
 	EC_MKBP_EVENT_KEY_MATRIX = 0,
@@ -3205,9 +3236,21 @@ enum ec_mkbp_event {
 	/* The state of the switches have changed. */
 	EC_MKBP_EVENT_SWITCH = 4,
 
-	/* EC sent a sysrq command */
+	/* New Fingerprint sensor event, the event data is fp_events bitmap. */
+	EC_MKBP_EVENT_FINGERPRINT = 5,
+
+	/*
+	 * Sysrq event: send emulated sysrq. The event data is sysrq,
+	 * corresponding to the key to be pressed.
+	 */
 	EC_MKBP_EVENT_SYSRQ = 6,
 
+	/*
+	 * New 64-bit host event.
+	 * The event data is 8 bytes of host event flags.
+	 */
+	EC_MKBP_EVENT_HOST_EVENT64 = 7,
+
 	/* Notify the AP that something happened on CEC */
 	EC_MKBP_EVENT_CEC_EVENT = 8,
 
@@ -3217,12 +3260,14 @@ enum ec_mkbp_event {
 	/* Number of MKBP events */
 	EC_MKBP_EVENT_COUNT,
 };
+BUILD_ASSERT(EC_MKBP_EVENT_COUNT <= EC_MKBP_EVENT_TYPE_MASK);
 
 union __ec_align_offset1 ec_response_get_next_data {
 	uint8_t key_matrix[13];
 
 	/* Unaligned */
-	uint32_t  host_event;
+	uint32_t host_event;
+	uint64_t host_event64;
 
 	struct __ec_todo_unpacked {
 		/* For aligning the fifo_info */
@@ -3230,14 +3275,25 @@ union __ec_align_offset1 ec_response_get_next_data {
 		struct ec_response_motion_sense_fifo_info info;
 	} sensor_fifo;
 
-	uint32_t   buttons;
-	uint32_t   switches;
-	uint32_t   sysrq;
+	uint32_t buttons;
+
+	uint32_t switches;
+
+	uint32_t fp_events;
+
+	uint32_t sysrq;
+
+	/* CEC events from enum mkbp_cec_event */
+	uint32_t cec_events;
 };
 
 union __ec_align_offset1 ec_response_get_next_data_v1 {
 	uint8_t key_matrix[16];
+
+	/* Unaligned */
 	uint32_t host_event;
+	uint64_t host_event64;
+
 	struct __ec_todo_unpacked {
 		/* For aligning the fifo_info */
 		uint8_t reserved[3];
@@ -3245,11 +3301,19 @@ union __ec_align_offset1 ec_response_get_next_data_v1 {
 	} sensor_fifo;
 
 	uint32_t buttons;
+
 	uint32_t switches;
+
+	uint32_t fp_events;
+
 	uint32_t sysrq;
+
+	/* CEC events from enum mkbp_cec_event */
 	uint32_t cec_events;
+
 	uint8_t cec_message[16];
 };
+BUILD_ASSERT(sizeof(union ec_response_get_next_data_v1) == 16);
 
 struct ec_response_get_next_event {
 	uint8_t event_type;
@@ -3268,6 +3332,7 @@ struct ec_response_get_next_event_v1 {
 #define EC_MKBP_POWER_BUTTON	0
 #define EC_MKBP_VOL_UP		1
 #define EC_MKBP_VOL_DOWN	2
+#define EC_MKBP_RECOVERY	3
 
 /* Switches */
 #define EC_MKBP_LID_OPEN	0
-- 
cgit v1.2.3


From 716bf50ea8b19c397e9d86238c6e7b307cbac3f5 Mon Sep 17 00:00:00 2001
From: Gwendal Grignou <gwendal@chromium.org>
Date: Mon, 3 Jun 2019 11:33:48 -0700
Subject: mfd: cros_ec: Add fingerprint API

Add support for fingerprint sensors managed by embedded controller.

Signed-off-by: Gwendal Grignou <gwendal@chromium.org>
Acked-by: Enric Balletbo i Serra <enric.balletbo@collabora.com>
Acked-by: Benson Leung <bleung@chromium.org>
Reviewed-by: Fabien Lahoudere <fabien.lahoudere@collabora.com>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 include/linux/mfd/cros_ec_commands.h | 34 ++++++++++++++++++++++++++++++++++
 1 file changed, 34 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mfd/cros_ec_commands.h b/include/linux/mfd/cros_ec_commands.h
index 51fe65170ce6..d5d07a9957ec 100644
--- a/include/linux/mfd/cros_ec_commands.h
+++ b/include/linux/mfd/cros_ec_commands.h
@@ -3339,6 +3339,40 @@ struct ec_response_get_next_event_v1 {
 #define EC_MKBP_TABLET_MODE	1
 #define EC_MKBP_BASE_ATTACHED	2
 
+/* Fingerprint events in 'fp_events' for EC_MKBP_EVENT_FINGERPRINT */
+#define EC_MKBP_FP_RAW_EVENT(fp_events) ((fp_events) & 0x00FFFFFF)
+#define EC_MKBP_FP_ERRCODE(fp_events)   ((fp_events) & 0x0000000F)
+#define EC_MKBP_FP_ENROLL_PROGRESS_OFFSET 4
+#define EC_MKBP_FP_ENROLL_PROGRESS(fpe) (((fpe) & 0x00000FF0) \
+					 >> EC_MKBP_FP_ENROLL_PROGRESS_OFFSET)
+#define EC_MKBP_FP_MATCH_IDX_OFFSET 12
+#define EC_MKBP_FP_MATCH_IDX_MASK 0x0000F000
+#define EC_MKBP_FP_MATCH_IDX(fpe) (((fpe) & EC_MKBP_FP_MATCH_IDX_MASK) \
+					 >> EC_MKBP_FP_MATCH_IDX_OFFSET)
+#define EC_MKBP_FP_ENROLL               BIT(27)
+#define EC_MKBP_FP_MATCH                BIT(28)
+#define EC_MKBP_FP_FINGER_DOWN          BIT(29)
+#define EC_MKBP_FP_FINGER_UP            BIT(30)
+#define EC_MKBP_FP_IMAGE_READY          BIT(31)
+/* code given by EC_MKBP_FP_ERRCODE() when EC_MKBP_FP_ENROLL is set */
+#define EC_MKBP_FP_ERR_ENROLL_OK               0
+#define EC_MKBP_FP_ERR_ENROLL_LOW_QUALITY      1
+#define EC_MKBP_FP_ERR_ENROLL_IMMOBILE         2
+#define EC_MKBP_FP_ERR_ENROLL_LOW_COVERAGE     3
+#define EC_MKBP_FP_ERR_ENROLL_INTERNAL         5
+/* Can be used to detect if image was usable for enrollment or not. */
+#define EC_MKBP_FP_ERR_ENROLL_PROBLEM_MASK     1
+/* code given by EC_MKBP_FP_ERRCODE() when EC_MKBP_FP_MATCH is set */
+#define EC_MKBP_FP_ERR_MATCH_NO                0
+#define EC_MKBP_FP_ERR_MATCH_NO_INTERNAL       6
+#define EC_MKBP_FP_ERR_MATCH_NO_TEMPLATES      7
+#define EC_MKBP_FP_ERR_MATCH_NO_LOW_QUALITY    2
+#define EC_MKBP_FP_ERR_MATCH_NO_LOW_COVERAGE   4
+#define EC_MKBP_FP_ERR_MATCH_YES               1
+#define EC_MKBP_FP_ERR_MATCH_YES_UPDATED       3
+#define EC_MKBP_FP_ERR_MATCH_YES_UPDATE_FAILED 5
+
+
 /*****************************************************************************/
 /* Temperature sensor commands */
 
-- 
cgit v1.2.3


From 170309b438a3826b09401a84f3de911db5b627a7 Mon Sep 17 00:00:00 2001
From: Gwendal Grignou <gwendal@chromium.org>
Date: Mon, 3 Jun 2019 11:33:49 -0700
Subject: mfd: cros_ec: Fix temperature API

Improve API to retrieve temperature information.

Signed-off-by: Gwendal Grignou <gwendal@chromium.org>
Acked-by: Enric Balletbo i Serra <enric.balletbo@collabora.com>
Acked-by: Benson Leung <bleung@chromium.org>
Reviewed-by: Fabien Lahoudere <fabien.lahoudere@collabora.com>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 include/linux/mfd/cros_ec_commands.h | 64 ++++++++++++++++++++++++++++++++----
 1 file changed, 57 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mfd/cros_ec_commands.h b/include/linux/mfd/cros_ec_commands.h
index d5d07a9957ec..9a84aad7475a 100644
--- a/include/linux/mfd/cros_ec_commands.h
+++ b/include/linux/mfd/cros_ec_commands.h
@@ -2945,9 +2945,28 @@ enum ec_temp_thresholds {
 /*
  * Thermal configuration for one temperature sensor. Temps are in degrees K.
  * Zero values will be silently ignored by the thermal task.
+ *
+ * Setting the 'temp_host' value allows the thermal task to trigger an event
+ * with 1 degree hysteresis.
+ * For example,
+ *	temp_host[EC_TEMP_THRESH_HIGH] = 300 K
+ *	temp_host_release[EC_TEMP_THRESH_HIGH] = 0 K
+ * EC will throttle ap when temperature >= 301 K, and release throttling when
+ * temperature <= 299 K.
+ *
+ * Setting 'temp_host_release' gives the thermal task a custom hysteresis.
+ * For example,
+ *	temp_host[EC_TEMP_THRESH_HIGH] = 300 K
+ *	temp_host_release[EC_TEMP_THRESH_HIGH] = 295 K
+ * EC will throttle ap when temperature >= 301 K, and release throttling when
+ * temperature <= 294 K.
+ *
+ * Note that this structure is a sub-structure of
+ * ec_params_thermal_set_threshold_v1, but maintains its alignment there.
  */
 struct ec_thermal_config {
 	uint32_t temp_host[EC_TEMP_THRESH_COUNT]; /* levels of hotness */
+	uint32_t temp_host_release[EC_TEMP_THRESH_COUNT]; /* release levels */
 	uint32_t temp_fan_off;		/* no active cooling needed */
 	uint32_t temp_fan_max;		/* max active cooling needed */
 } __ec_align4;
@@ -2973,32 +2992,63 @@ struct ec_params_thermal_set_threshold_v1 {
 /* Toggle automatic fan control */
 #define EC_CMD_THERMAL_AUTO_FAN_CTRL 0x0052
 
-/* Get TMP006 calibration data */
+/* Version 1 of input params */
+struct ec_params_auto_fan_ctrl_v1 {
+	uint8_t fan_idx;
+} __ec_align1;
+
+/* Get/Set TMP006 calibration data */
 #define EC_CMD_TMP006_GET_CALIBRATION 0x0053
+#define EC_CMD_TMP006_SET_CALIBRATION 0x0054
+
+/*
+ * The original TMP006 calibration only needed four params, but now we need
+ * more. Since the algorithm is nothing but magic numbers anyway, we'll leave
+ * the params opaque. The v1 "get" response will include the algorithm number
+ * and how many params it requires. That way we can change the EC code without
+ * needing to update this file. We can also use a different algorithm on each
+ * sensor.
+ */
 
+/* This is the same struct for both v0 and v1. */
 struct ec_params_tmp006_get_calibration {
 	uint8_t index;
 } __ec_align1;
 
-struct ec_response_tmp006_get_calibration {
+/* Version 0 */
+struct ec_response_tmp006_get_calibration_v0 {
 	float s0;
 	float b0;
 	float b1;
 	float b2;
 } __ec_align4;
 
-/* Set TMP006 calibration data */
-#define EC_CMD_TMP006_SET_CALIBRATION 0x0054
-
-struct ec_params_tmp006_set_calibration {
+struct ec_params_tmp006_set_calibration_v0 {
 	uint8_t index;
-	uint8_t reserved[3];  /* Reserved; set 0 */
+	uint8_t reserved[3];
 	float s0;
 	float b0;
 	float b1;
 	float b2;
 } __ec_align4;
 
+/* Version 1 */
+struct ec_response_tmp006_get_calibration_v1 {
+	uint8_t algorithm;
+	uint8_t num_params;
+	uint8_t reserved[2];
+	float val[0];
+} __ec_align4;
+
+struct ec_params_tmp006_set_calibration_v1 {
+	uint8_t index;
+	uint8_t algorithm;
+	uint8_t num_params;
+	uint8_t reserved;
+	float val[0];
+} __ec_align4;
+
+
 /* Read raw TMP006 data */
 #define EC_CMD_TMP006_GET_RAW 0x0055
 
-- 
cgit v1.2.3


From e16efdf12105d921b44b78a0012acf2487f3245b Mon Sep 17 00:00:00 2001
From: Gwendal Grignou <gwendal@chromium.org>
Date: Mon, 3 Jun 2019 11:33:50 -0700
Subject: mfd: cros_ec: Complete Power and USB PD API

Improve API for USB Power Delivery and power management.

Signed-off-by: Gwendal Grignou <gwendal@chromium.org>
Acked-by: Enric Balletbo i Serra <enric.balletbo@collabora.com>
Acked-by: Benson Leung <bleung@chromium.org>
Reviewed-by: Fabien Lahoudere <fabien.lahoudere@collabora.com>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 include/linux/mfd/cros_ec_commands.h | 236 +++++++++++++++++++++++++++++++++--
 1 file changed, 228 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mfd/cros_ec_commands.h b/include/linux/mfd/cros_ec_commands.h
index 9a84aad7475a..e05cdcb12481 100644
--- a/include/linux/mfd/cros_ec_commands.h
+++ b/include/linux/mfd/cros_ec_commands.h
@@ -2796,7 +2796,8 @@ struct ec_params_config_power_button {
 
 struct ec_params_usb_charge_set_mode {
 	uint8_t usb_port_id;
-	uint8_t mode;
+	uint8_t mode:7;
+	uint8_t inhibit_charge:1;
 } __ec_align1;
 
 /*****************************************************************************/
@@ -3933,6 +3934,11 @@ enum charge_state_params {
 	CS_PARAM_CHG_INPUT_CURRENT,   /* charger input current limit */
 	CS_PARAM_CHG_STATUS,	      /* charger-specific status */
 	CS_PARAM_CHG_OPTION,	      /* charger-specific options */
+	CS_PARAM_LIMIT_POWER,	      /*
+				       * Check if power is limited due to
+				       * low battery and / or a weak external
+				       * charger. READ ONLY.
+				       */
 	/* How many so far? */
 	CS_NUM_BASE_PARAMS,
 
@@ -3940,6 +3946,17 @@ enum charge_state_params {
 	CS_PARAM_CUSTOM_PROFILE_MIN = 0x10000,
 	CS_PARAM_CUSTOM_PROFILE_MAX = 0x1ffff,
 
+	/* Range for CONFIG_CHARGE_STATE_DEBUG params */
+	CS_PARAM_DEBUG_MIN = 0x20000,
+	CS_PARAM_DEBUG_CTL_MODE = 0x20000,
+	CS_PARAM_DEBUG_MANUAL_MODE,
+	CS_PARAM_DEBUG_SEEMS_DEAD,
+	CS_PARAM_DEBUG_SEEMS_DISCONNECTED,
+	CS_PARAM_DEBUG_BATT_REMOVED,
+	CS_PARAM_DEBUG_MANUAL_CURRENT,
+	CS_PARAM_DEBUG_MANUAL_VOLTAGE,
+	CS_PARAM_DEBUG_MAX = 0x2ffff,
+
 	/* Other custom param ranges go here... */
 };
 
@@ -4000,6 +4017,16 @@ struct ec_params_external_power_limit_v1 {
 
 #define EC_POWER_LIMIT_NONE 0xffff
 
+/*
+ * Set maximum voltage & current of a dedicated charge port
+ */
+#define EC_CMD_OVERRIDE_DEDICATED_CHARGER_LIMIT 0x00A3
+
+struct ec_params_dedicated_charger_limit {
+	uint16_t current_lim; /* in mA */
+	uint16_t voltage_lim; /* in mV */
+} __ec_align2;
+
 /* Inform the EC when entering a sleep state */
 #define EC_CMD_HOST_SLEEP_EVENT 0x00A9
 
@@ -4385,18 +4412,53 @@ struct ec_params_reboot_ec {
 
 /* EC to PD MCU exchange status command */
 #define EC_CMD_PD_EXCHANGE_STATUS 0x0100
+#define EC_VER_PD_EXCHANGE_STATUS 2
+
+enum pd_charge_state {
+	PD_CHARGE_NO_CHANGE = 0, /* Don't change charge state */
+	PD_CHARGE_NONE,          /* No charging allowed */
+	PD_CHARGE_5V,            /* 5V charging only */
+	PD_CHARGE_MAX            /* Charge at max voltage */
+};
 
 /* Status of EC being sent to PD */
+#define EC_STATUS_HIBERNATING	BIT(0)
+
 struct ec_params_pd_status {
-	int8_t batt_soc; /* battery state of charge */
+	uint8_t status;       /* EC status */
+	int8_t batt_soc;      /* battery state of charge */
+	uint8_t charge_state; /* charging state (from enum pd_charge_state) */
 } __ec_align1;
 
 /* Status of PD being sent back to EC */
+#define PD_STATUS_HOST_EVENT      BIT(0) /* Forward host event to AP */
+#define PD_STATUS_IN_RW           BIT(1) /* Running RW image */
+#define PD_STATUS_JUMPED_TO_IMAGE BIT(2) /* Current image was jumped to */
+#define PD_STATUS_TCPC_ALERT_0    BIT(3) /* Alert active in port 0 TCPC */
+#define PD_STATUS_TCPC_ALERT_1    BIT(4) /* Alert active in port 1 TCPC */
+#define PD_STATUS_TCPC_ALERT_2    BIT(5) /* Alert active in port 2 TCPC */
+#define PD_STATUS_TCPC_ALERT_3    BIT(6) /* Alert active in port 3 TCPC */
+#define PD_STATUS_EC_INT_ACTIVE  (PD_STATUS_TCPC_ALERT_0 | \
+				      PD_STATUS_TCPC_ALERT_1 | \
+				      PD_STATUS_HOST_EVENT)
 struct ec_response_pd_status {
-	int8_t status;        /* PD MCU status */
-	uint32_t curr_lim_ma; /* input current limit */
+	uint32_t curr_lim_ma;       /* input current limit */
+	uint16_t status;            /* PD MCU status */
+	int8_t active_charge_port;  /* active charging port */
 } __ec_align_size1;
 
+/* AP to PD MCU host event status command, cleared on read */
+#define EC_CMD_PD_HOST_EVENT_STATUS 0x0104
+
+/* PD MCU host event status bits */
+#define PD_EVENT_UPDATE_DEVICE     BIT(0)
+#define PD_EVENT_POWER_CHANGE      BIT(1)
+#define PD_EVENT_IDENTITY_RECEIVED BIT(2)
+#define PD_EVENT_DATA_SWAP         BIT(3)
+struct ec_response_host_event_status {
+	uint32_t status;      /* PD MCU host event status */
+} __ec_align4;
+
 /* Set USB type-C port role and muxes */
 #define EC_CMD_USB_PD_CONTROL 0x0101
 
@@ -4406,6 +4468,8 @@ enum usb_pd_control_role {
 	USB_PD_CTRL_ROLE_TOGGLE_OFF = 2,
 	USB_PD_CTRL_ROLE_FORCE_SINK = 3,
 	USB_PD_CTRL_ROLE_FORCE_SOURCE = 4,
+	USB_PD_CTRL_ROLE_FREEZE = 5,
+	USB_PD_CTRL_ROLE_COUNT
 };
 
 enum usb_pd_control_mux {
@@ -4415,6 +4479,7 @@ enum usb_pd_control_mux {
 	USB_PD_CTRL_MUX_DP = 3,
 	USB_PD_CTRL_MUX_DOCK = 4,
 	USB_PD_CTRL_MUX_AUTO = 5,
+	USB_PD_CTRL_MUX_COUNT
 };
 
 enum usb_pd_control_swap {
@@ -4444,6 +4509,13 @@ struct ec_params_usb_pd_control {
 #define PD_CTRL_RESP_ROLE_USB_COMM      BIT(5) /* Partner USB comm capable */
 #define PD_CTRL_RESP_ROLE_EXT_POWERED   BIT(6) /* Partner externally powerd */
 
+struct ec_response_usb_pd_control {
+	uint8_t enabled;
+	uint8_t role;
+	uint8_t polarity;
+	uint8_t state;
+} __ec_align1;
+
 struct ec_response_usb_pd_control_v1 {
 	uint8_t enabled;
 	uint8_t role;
@@ -4451,6 +4523,25 @@ struct ec_response_usb_pd_control_v1 {
 	char state[32];
 } __ec_align1;
 
+/* Values representing usbc PD CC state */
+#define USBC_PD_CC_NONE		0 /* No accessory connected */
+#define USBC_PD_CC_NO_UFP	1 /* No UFP accessory connected */
+#define USBC_PD_CC_AUDIO_ACC	2 /* Audio accessory connected */
+#define USBC_PD_CC_DEBUG_ACC	3 /* Debug accessory connected */
+#define USBC_PD_CC_UFP_ATTACHED	4 /* UFP attached to usbc */
+#define USBC_PD_CC_DFP_ATTACHED	5 /* DFP attached to usbc */
+
+struct ec_response_usb_pd_control_v2 {
+	uint8_t enabled;
+	uint8_t role;
+	uint8_t polarity;
+	char state[32];
+	uint8_t cc_state; /* USBC_PD_CC_*-encoded cc state */
+	uint8_t dp_mode;  /* Current DP pin mode (MODE_DP_PIN_[A-E]) */
+	/* CL:1500994 Current cable type */
+	uint8_t reserved_cable_type;
+} __ec_align1;
+
 #define EC_CMD_USB_PD_PORTS 0x0102
 
 /* Maximum number of PD ports on a device, num_ports will be <= this */
@@ -4478,6 +4569,7 @@ enum usb_chg_type {
 	USB_CHG_TYPE_OTHER,
 	USB_CHG_TYPE_VBUS,
 	USB_CHG_TYPE_UNKNOWN,
+	USB_CHG_TYPE_DEDICATED,
 };
 enum usb_power_roles {
 	USB_PD_PORT_POWER_DISCONNECTED,
@@ -4502,9 +4594,6 @@ struct ec_response_usb_pd_power_info {
 	uint32_t max_power;
 } __ec_align4;
 
-struct ec_params_usb_pd_info_request {
-	uint8_t port;
-} __ec_align1;
 
 /*
  * This command will return the number of USB PD charge port + the number
@@ -4516,6 +4605,46 @@ struct ec_response_charge_port_count {
 	uint8_t port_count;
 } __ec_align1;
 
+/* Write USB-PD device FW */
+#define EC_CMD_USB_PD_FW_UPDATE 0x0110
+
+enum usb_pd_fw_update_cmds {
+	USB_PD_FW_REBOOT,
+	USB_PD_FW_FLASH_ERASE,
+	USB_PD_FW_FLASH_WRITE,
+	USB_PD_FW_ERASE_SIG,
+};
+
+struct ec_params_usb_pd_fw_update {
+	uint16_t dev_id;
+	uint8_t cmd;
+	uint8_t port;
+	uint32_t size;     /* Size to write in bytes */
+	/* Followed by data to write */
+} __ec_align4;
+
+/* Write USB-PD Accessory RW_HASH table entry */
+#define EC_CMD_USB_PD_RW_HASH_ENTRY 0x0111
+/* RW hash is first 20 bytes of SHA-256 of RW section */
+#define PD_RW_HASH_SIZE 20
+struct ec_params_usb_pd_rw_hash_entry {
+	uint16_t dev_id;
+	uint8_t dev_rw_hash[PD_RW_HASH_SIZE];
+	uint8_t reserved;        /*
+				  * For alignment of current_image
+				  * TODO(rspangler) but it's not aligned!
+				  * Should have been reserved[2].
+				  */
+	uint32_t current_image;  /* One of ec_current_image */
+} __ec_align1;
+
+/* Read USB-PD Accessory info */
+#define EC_CMD_USB_PD_DEV_INFO 0x0112
+
+struct ec_params_usb_pd_info_request {
+	uint8_t port;
+} __ec_align1;
+
 /* Read USB-PD Device discovery info */
 #define EC_CMD_USB_PD_DISCOVERY 0x0113
 struct ec_params_usb_pd_discovery_entry {
@@ -4538,7 +4667,11 @@ struct ec_params_charge_port_override {
 	int16_t override_port; /* Override port# */
 } __ec_align2;
 
-/* Read (and delete) one entry of PD event log */
+/*
+ * Read (and delete) one entry of PD event log.
+ * TODO(crbug.com/751742): Make this host command more generic to accommodate
+ * future non-PD logs that use the same internal EC event_log.
+ */
 #define EC_CMD_PD_GET_LOG_ENTRY 0x0115
 
 struct ec_response_pd_log {
@@ -4626,6 +4759,60 @@ struct mcdp_info {
 #define MCDP_CHIPID(chipid) ((chipid[0] << 8) | chipid[1])
 #define MCDP_FAMILY(family) ((family[0] << 8) | family[1])
 
+/* Get/Set USB-PD Alternate mode info */
+#define EC_CMD_USB_PD_GET_AMODE 0x0116
+struct ec_params_usb_pd_get_mode_request {
+	uint16_t svid_idx; /* SVID index to get */
+	uint8_t port;      /* port */
+} __ec_align_size1;
+
+struct ec_params_usb_pd_get_mode_response {
+	uint16_t svid;   /* SVID */
+	uint16_t opos;    /* Object Position */
+	uint32_t vdo[6]; /* Mode VDOs */
+} __ec_align4;
+
+#define EC_CMD_USB_PD_SET_AMODE 0x0117
+
+enum pd_mode_cmd {
+	PD_EXIT_MODE = 0,
+	PD_ENTER_MODE = 1,
+	/* Not a command.  Do NOT remove. */
+	PD_MODE_CMD_COUNT,
+};
+
+struct ec_params_usb_pd_set_mode_request {
+	uint32_t cmd;  /* enum pd_mode_cmd */
+	uint16_t svid; /* SVID to set */
+	uint8_t opos;  /* Object Position */
+	uint8_t port;  /* port */
+} __ec_align4;
+
+/* Ask the PD MCU to record a log of a requested type */
+#define EC_CMD_PD_WRITE_LOG_ENTRY 0x0118
+
+struct ec_params_pd_write_log_entry {
+	uint8_t type; /* event type : see PD_EVENT_xx above */
+	uint8_t port; /* port#, or 0 for events unrelated to a given port */
+} __ec_align1;
+
+
+/* Control USB-PD chip */
+#define EC_CMD_PD_CONTROL 0x0119
+
+enum ec_pd_control_cmd {
+	PD_SUSPEND = 0,      /* Suspend the PD chip (EC: stop talking to PD) */
+	PD_RESUME,           /* Resume the PD chip (EC: start talking to PD) */
+	PD_RESET,            /* Force reset the PD chip */
+	PD_CONTROL_DISABLE,  /* Disable further calls to this command */
+	PD_CHIP_ON,          /* Power on the PD chip */
+};
+
+struct ec_params_pd_control {
+	uint8_t chip;         /* chip id */
+	uint8_t subcmd;
+} __ec_align1;
+
 /* Get info about USB-C SS muxes */
 #define EC_CMD_USB_PD_MUX_INFO 0x011A
 
@@ -4638,10 +4825,43 @@ struct ec_params_usb_pd_mux_info {
 #define USB_PD_MUX_DP_ENABLED        BIT(1) /* DP connected */
 #define USB_PD_MUX_POLARITY_INVERTED BIT(2) /* CC line Polarity inverted */
 #define USB_PD_MUX_HPD_IRQ           BIT(3) /* HPD IRQ is asserted */
+#define USB_PD_MUX_HPD_LVL           BIT(4) /* HPD level is asserted */
 
 struct ec_response_usb_pd_mux_info {
 	uint8_t flags; /* USB_PD_MUX_*-encoded USB mux state */
 } __ec_align1;
+
+#define EC_CMD_PD_CHIP_INFO		0x011B
+
+struct ec_params_pd_chip_info {
+	uint8_t port;	/* USB-C port number */
+	uint8_t renew;	/* Force renewal */
+} __ec_align1;
+
+struct ec_response_pd_chip_info {
+	uint16_t vendor_id;
+	uint16_t product_id;
+	uint16_t device_id;
+	union {
+		uint8_t fw_version_string[8];
+		uint64_t fw_version_number;
+	};
+} __ec_align2;
+
+struct ec_response_pd_chip_info_v1 {
+	uint16_t vendor_id;
+	uint16_t product_id;
+	uint16_t device_id;
+	union {
+		uint8_t fw_version_string[8];
+		uint64_t fw_version_number;
+	};
+	union {
+		uint8_t min_req_fw_version_string[8];
+		uint64_t min_req_fw_version_number;
+	};
+} __ec_align2;
+
 /*****************************************************************************/
 /*
  * Reserve a range of host commands for board-specific, experimental, or
-- 
cgit v1.2.3


From fd5372848a6fab0f4ec80c554f384cb1a8c19bb7 Mon Sep 17 00:00:00 2001
From: Gwendal Grignou <gwendal@chromium.org>
Date: Mon, 3 Jun 2019 11:33:51 -0700
Subject: mfd: cros_ec: Add API for keyboard testing

Add command to allow keyboard testing in factory.

Signed-off-by: Gwendal Grignou <gwendal@chromium.org>
Acked-by: Enric Balletbo i Serra <enric.balletbo@collabora.com>
Acked-by: Benson Leung <bleung@chromium.org>
Reviewed-by: Fabien Lahoudere <fabien.lahoudere@collabora.com>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 include/linux/mfd/cros_ec_commands.h | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mfd/cros_ec_commands.h b/include/linux/mfd/cros_ec_commands.h
index e05cdcb12481..cc054a0a4c4c 100644
--- a/include/linux/mfd/cros_ec_commands.h
+++ b/include/linux/mfd/cros_ec_commands.h
@@ -3142,6 +3142,17 @@ struct ec_params_mkbp_simulate_key {
 	uint8_t pressed;
 } __ec_align1;
 
+#define EC_CMD_GET_KEYBOARD_ID 0x0063
+
+struct ec_response_keyboard_id {
+	uint32_t keyboard_id;
+} __ec_align4;
+
+enum keyboard_id {
+	KEYBOARD_ID_UNSUPPORTED = 0,
+	KEYBOARD_ID_UNREADABLE = 0xffffffff,
+};
+
 /* Configure keyboard scanning */
 #define EC_CMD_MKBP_SET_CONFIG 0x0064
 #define EC_CMD_MKBP_GET_CONFIG 0x0065
@@ -3390,6 +3401,13 @@ struct ec_response_get_next_event_v1 {
 #define EC_MKBP_TABLET_MODE	1
 #define EC_MKBP_BASE_ATTACHED	2
 
+/* Run keyboard factory test scanning */
+#define EC_CMD_KEYBOARD_FACTORY_TEST 0x0068
+
+struct ec_response_keyboard_factory_test {
+	uint16_t shorted;	/* Keyboard pins are shorted */
+} __ec_align2;
+
 /* Fingerprint events in 'fp_events' for EC_MKBP_EVENT_FINGERPRINT */
 #define EC_MKBP_FP_RAW_EVENT(fp_events) ((fp_events) & 0x00FFFFFF)
 #define EC_MKBP_FP_ERRCODE(fp_events)   ((fp_events) & 0x0000000F)
-- 
cgit v1.2.3


From b92be99f37427fbc5deb6cb1a246f096e302f92d Mon Sep 17 00:00:00 2001
From: Gwendal Grignou <gwendal@chromium.org>
Date: Mon, 3 Jun 2019 11:33:52 -0700
Subject: mfd: cros_ec: Add Hibernate API

Add support for controlling hibernation of the Embedded Controller.

Signed-off-by: Gwendal Grignou <gwendal@chromium.org>
Acked-by: Enric Balletbo i Serra <enric.balletbo@collabora.com>
Acked-by: Benson Leung <bleung@chromium.org>
Reviewed-by: Fabien Lahoudere <fabien.lahoudere@collabora.com>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 include/linux/mfd/cros_ec_commands.h | 72 +++++++++++++++++++++++++++++++++++-
 1 file changed, 70 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mfd/cros_ec_commands.h b/include/linux/mfd/cros_ec_commands.h
index cc054a0a4c4c..7f98c6e63ad1 100644
--- a/include/linux/mfd/cros_ec_commands.h
+++ b/include/linux/mfd/cros_ec_commands.h
@@ -4045,6 +4045,40 @@ struct ec_params_dedicated_charger_limit {
 	uint16_t voltage_lim; /* in mV */
 } __ec_align2;
 
+/*****************************************************************************/
+/* Hibernate/Deep Sleep Commands */
+
+/* Set the delay before going into hibernation. */
+#define EC_CMD_HIBERNATION_DELAY 0x00A8
+
+struct ec_params_hibernation_delay {
+	/*
+	 * Seconds to wait in G3 before hibernate.  Pass in 0 to read the
+	 * current settings without changing them.
+	 */
+	uint32_t seconds;
+} __ec_align4;
+
+struct ec_response_hibernation_delay {
+	/*
+	 * The current time in seconds in which the system has been in the G3
+	 * state.  This value is reset if the EC transitions out of G3.
+	 */
+	uint32_t time_g3;
+
+	/*
+	 * The current time remaining in seconds until the EC should hibernate.
+	 * This value is also reset if the EC transitions out of G3.
+	 */
+	uint32_t time_remaining;
+
+	/*
+	 * The current time in seconds that the EC should wait in G3 before
+	 * hibernating.
+	 */
+	uint32_t hibernate_delay;
+} __ec_align4;
+
 /* Inform the EC when entering a sleep state */
 #define EC_CMD_HOST_SLEEP_EVENT 0x00A9
 
@@ -4052,7 +4086,9 @@ enum host_sleep_event {
 	HOST_SLEEP_EVENT_S3_SUSPEND   = 1,
 	HOST_SLEEP_EVENT_S3_RESUME    = 2,
 	HOST_SLEEP_EVENT_S0IX_SUSPEND = 3,
-	HOST_SLEEP_EVENT_S0IX_RESUME  = 4
+	HOST_SLEEP_EVENT_S0IX_RESUME  = 4,
+	/* S3 suspend with additional enabled wake sources */
+	HOST_SLEEP_EVENT_S3_WAKEABLE_SUSPEND = 5,
 };
 
 struct ec_params_host_sleep_event {
@@ -4116,6 +4152,36 @@ struct ec_response_host_sleep_event_v1 {
 	};
 } __ec_align4;
 
+/*****************************************************************************/
+/* Device events */
+#define EC_CMD_DEVICE_EVENT 0x00AA
+
+enum ec_device_event {
+	EC_DEVICE_EVENT_TRACKPAD,
+	EC_DEVICE_EVENT_DSP,
+	EC_DEVICE_EVENT_WIFI,
+};
+
+enum ec_device_event_param {
+	/* Get and clear pending device events */
+	EC_DEVICE_EVENT_PARAM_GET_CURRENT_EVENTS,
+	/* Get device event mask */
+	EC_DEVICE_EVENT_PARAM_GET_ENABLED_EVENTS,
+	/* Set device event mask */
+	EC_DEVICE_EVENT_PARAM_SET_ENABLED_EVENTS,
+};
+
+#define EC_DEVICE_EVENT_MASK(event_code) BIT(event_code % 32)
+
+struct ec_params_device_event {
+	uint32_t event_mask;
+	uint8_t param;
+} __ec_align_size1;
+
+struct ec_response_device_event {
+	uint32_t event_mask;
+} __ec_align4;
+
 /*****************************************************************************/
 /* Smart battery pass-through */
 
@@ -4361,12 +4427,14 @@ enum ec_reboot_cmd {
 	/* (command 3 was jump to RW-B) */
 	EC_REBOOT_COLD = 4,          /* Cold-reboot */
 	EC_REBOOT_DISABLE_JUMP = 5,  /* Disable jump until next reboot */
-	EC_REBOOT_HIBERNATE = 6      /* Hibernate EC */
+	EC_REBOOT_HIBERNATE = 6,     /* Hibernate EC */
+	EC_REBOOT_HIBERNATE_CLEAR_AP_OFF = 7, /* and clears AP_OFF flag */
 };
 
 /* Flags for ec_params_reboot_ec.reboot_flags */
 #define EC_REBOOT_FLAG_RESERVED0      BIT(0)  /* Was recovery request */
 #define EC_REBOOT_FLAG_ON_AP_SHUTDOWN BIT(1)  /* Reboot after AP shutdown */
+#define EC_REBOOT_FLAG_SWITCH_RW_SLOT BIT(2)  /* Switch RW slot */
 
 struct ec_params_reboot_ec {
 	uint8_t cmd;           /* enum ec_reboot_cmd */
-- 
cgit v1.2.3


From 77c48c76ad91cf774df26c6e5c74c76842943802 Mon Sep 17 00:00:00 2001
From: Gwendal Grignou <gwendal@chromium.org>
Date: Mon, 3 Jun 2019 11:33:53 -0700
Subject: mfd: cros_ec: Add Smart Battery Firmware update API

Add API to update battery firmware.

Signed-off-by: Gwendal Grignou <gwendal@chromium.org>
Acked-by: Enric Balletbo i Serra <enric.balletbo@collabora.com>
Acked-by: Benson Leung <bleung@chromium.org>
Reviewed-by: Fabien Lahoudere <fabien.lahoudere@collabora.com>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 include/linux/mfd/cros_ec_commands.h | 73 ++++++++++++++++++++++++++++++++++++
 1 file changed, 73 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mfd/cros_ec_commands.h b/include/linux/mfd/cros_ec_commands.h
index 7f98c6e63ad1..49ea905cfd18 100644
--- a/include/linux/mfd/cros_ec_commands.h
+++ b/include/linux/mfd/cros_ec_commands.h
@@ -4243,6 +4243,79 @@ struct ec_response_battery_vendor_param {
 	uint32_t value;
 } __ec_align4;
 
+/*****************************************************************************/
+/*
+ * Smart Battery Firmware Update Commands
+ */
+#define EC_CMD_SB_FW_UPDATE 0x00B5
+
+enum ec_sb_fw_update_subcmd {
+	EC_SB_FW_UPDATE_PREPARE  = 0x0,
+	EC_SB_FW_UPDATE_INFO     = 0x1, /*query sb info */
+	EC_SB_FW_UPDATE_BEGIN    = 0x2, /*check if protected */
+	EC_SB_FW_UPDATE_WRITE    = 0x3, /*check if protected */
+	EC_SB_FW_UPDATE_END      = 0x4,
+	EC_SB_FW_UPDATE_STATUS   = 0x5,
+	EC_SB_FW_UPDATE_PROTECT  = 0x6,
+	EC_SB_FW_UPDATE_MAX      = 0x7,
+};
+
+#define SB_FW_UPDATE_CMD_WRITE_BLOCK_SIZE 32
+#define SB_FW_UPDATE_CMD_STATUS_SIZE 2
+#define SB_FW_UPDATE_CMD_INFO_SIZE 8
+
+struct ec_sb_fw_update_header {
+	uint16_t subcmd;  /* enum ec_sb_fw_update_subcmd */
+	uint16_t fw_id;   /* firmware id */
+} __ec_align4;
+
+struct ec_params_sb_fw_update {
+	struct ec_sb_fw_update_header hdr;
+	union {
+		/* EC_SB_FW_UPDATE_PREPARE  = 0x0 */
+		/* EC_SB_FW_UPDATE_INFO     = 0x1 */
+		/* EC_SB_FW_UPDATE_BEGIN    = 0x2 */
+		/* EC_SB_FW_UPDATE_END      = 0x4 */
+		/* EC_SB_FW_UPDATE_STATUS   = 0x5 */
+		/* EC_SB_FW_UPDATE_PROTECT  = 0x6 */
+		/* Those have no args */
+
+		/* EC_SB_FW_UPDATE_WRITE    = 0x3 */
+		struct __ec_align4 {
+			uint8_t  data[SB_FW_UPDATE_CMD_WRITE_BLOCK_SIZE];
+		} write;
+	};
+} __ec_align4;
+
+struct ec_response_sb_fw_update {
+	union {
+		/* EC_SB_FW_UPDATE_INFO     = 0x1 */
+		struct __ec_align1 {
+			uint8_t data[SB_FW_UPDATE_CMD_INFO_SIZE];
+		} info;
+
+		/* EC_SB_FW_UPDATE_STATUS   = 0x5 */
+		struct __ec_align1 {
+			uint8_t data[SB_FW_UPDATE_CMD_STATUS_SIZE];
+		} status;
+	};
+} __ec_align1;
+
+/*
+ * Entering Verified Boot Mode Command
+ * Default mode is VBOOT_MODE_NORMAL if EC did not receive this command.
+ * Valid Modes are: normal, developer, and recovery.
+ */
+#define EC_CMD_ENTERING_MODE 0x00B6
+
+struct ec_params_entering_mode {
+	int vboot_mode;
+} __ec_align4;
+
+#define VBOOT_MODE_NORMAL    0
+#define VBOOT_MODE_DEVELOPER 1
+#define VBOOT_MODE_RECOVERY  2
+
 /*****************************************************************************/
 /*
  * HDMI CEC commands
-- 
cgit v1.2.3


From a47bc8a4e88b35d9ea97ecd773cd7e0688b3322a Mon Sep 17 00:00:00 2001
From: Gwendal Grignou <gwendal@chromium.org>
Date: Mon, 3 Jun 2019 11:33:54 -0700
Subject: mfd: cros_ec: Add I2C passthru protection API

Prevent direct i2c access to device behind EC when not in development mode.

Signed-off-by: Gwendal Grignou <gwendal@chromium.org>
Acked-by: Enric Balletbo i Serra <enric.balletbo@collabora.com>
Acked-by: Benson Leung <bleung@chromium.org>
Reviewed-by: Fabien Lahoudere <fabien.lahoudere@collabora.com>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 include/linux/mfd/cros_ec_commands.h | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mfd/cros_ec_commands.h b/include/linux/mfd/cros_ec_commands.h
index 49ea905cfd18..59ad6bae3f9b 100644
--- a/include/linux/mfd/cros_ec_commands.h
+++ b/include/linux/mfd/cros_ec_commands.h
@@ -4316,6 +4316,28 @@ struct ec_params_entering_mode {
 #define VBOOT_MODE_DEVELOPER 1
 #define VBOOT_MODE_RECOVERY  2
 
+/*****************************************************************************/
+/*
+ * I2C passthru protection command: Protects I2C tunnels against access on
+ * certain addresses (board-specific).
+ */
+#define EC_CMD_I2C_PASSTHRU_PROTECT 0x00B7
+
+enum ec_i2c_passthru_protect_subcmd {
+	EC_CMD_I2C_PASSTHRU_PROTECT_STATUS = 0x0,
+	EC_CMD_I2C_PASSTHRU_PROTECT_ENABLE = 0x1,
+};
+
+struct ec_params_i2c_passthru_protect {
+	uint8_t subcmd;
+	uint8_t port;		/* I2C port number */
+} __ec_align1;
+
+struct ec_response_i2c_passthru_protect {
+	uint8_t status;		/* Status flags (0: unlocked, 1: locked) */
+} __ec_align1;
+
+
 /*****************************************************************************/
 /*
  * HDMI CEC commands
-- 
cgit v1.2.3


From d90a4121bf98d959a01306599b370d8f883a0737 Mon Sep 17 00:00:00 2001
From: Gwendal Grignou <gwendal@chromium.org>
Date: Mon, 3 Jun 2019 11:33:55 -0700
Subject: mfd: cros_ec: Add API for EC-EC communication

Allow EC to talk to other ECs that are not presented to the host.
Needed when ECs are present in detachable keyboards.

Signed-off-by: Gwendal Grignou <gwendal@chromium.org>
Acked-by: Enric Balletbo i Serra <enric.balletbo@collabora.com>
Acked-by: Benson Leung <bleung@chromium.org>
Reviewed-by: Fabien Lahoudere <fabien.lahoudere@collabora.com>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 include/linux/mfd/cros_ec_commands.h | 95 ++++++++++++++++++++++++++++++++++++
 1 file changed, 95 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mfd/cros_ec_commands.h b/include/linux/mfd/cros_ec_commands.h
index 59ad6bae3f9b..52fd9bfafc7f 100644
--- a/include/linux/mfd/cros_ec_commands.h
+++ b/include/linux/mfd/cros_ec_commands.h
@@ -5043,6 +5043,101 @@ struct ec_response_pd_chip_info_v1 {
 	};
 } __ec_align2;
 
+/*****************************************************************************/
+/* EC-EC communication commands: range 0x0600-0x06FF */
+
+#define EC_COMM_TEXT_MAX 8
+
+/*
+ * Get battery static information, i.e. information that never changes, or
+ * very infrequently.
+ */
+#define EC_CMD_BATTERY_GET_STATIC 0x0600
+
+/**
+ * struct ec_params_battery_static_info - Battery static info parameters
+ * @index: Battery index.
+ */
+struct ec_params_battery_static_info {
+	uint8_t index;
+} __ec_align_size1;
+
+/**
+ * struct ec_response_battery_static_info - Battery static info response
+ * @design_capacity: Battery Design Capacity (mAh)
+ * @design_voltage: Battery Design Voltage (mV)
+ * @manufacturer: Battery Manufacturer String
+ * @model: Battery Model Number String
+ * @serial: Battery Serial Number String
+ * @type: Battery Type String
+ * @cycle_count: Battery Cycle Count
+ */
+struct ec_response_battery_static_info {
+	uint16_t design_capacity;
+	uint16_t design_voltage;
+	char manufacturer[EC_COMM_TEXT_MAX];
+	char model[EC_COMM_TEXT_MAX];
+	char serial[EC_COMM_TEXT_MAX];
+	char type[EC_COMM_TEXT_MAX];
+	/* TODO(crbug.com/795991): Consider moving to dynamic structure. */
+	uint32_t cycle_count;
+} __ec_align4;
+
+/*
+ * Get battery dynamic information, i.e. information that is likely to change
+ * every time it is read.
+ */
+#define EC_CMD_BATTERY_GET_DYNAMIC 0x0601
+
+/**
+ * struct ec_params_battery_dynamic_info - Battery dynamic info parameters
+ * @index: Battery index.
+ */
+struct ec_params_battery_dynamic_info {
+	uint8_t index;
+} __ec_align_size1;
+
+/**
+ * struct ec_response_battery_dynamic_info - Battery dynamic info response
+ * @actual_voltage: Battery voltage (mV)
+ * @actual_current: Battery current (mA); negative=discharging
+ * @remaining_capacity: Remaining capacity (mAh)
+ * @full_capacity: Capacity (mAh, might change occasionally)
+ * @flags: Flags, see EC_BATT_FLAG_*
+ * @desired_voltage: Charging voltage desired by battery (mV)
+ * @desired_current: Charging current desired by battery (mA)
+ */
+struct ec_response_battery_dynamic_info {
+	int16_t actual_voltage;
+	int16_t actual_current;
+	int16_t remaining_capacity;
+	int16_t full_capacity;
+	int16_t flags;
+	int16_t desired_voltage;
+	int16_t desired_current;
+} __ec_align2;
+
+/*
+ * Control charger chip. Used to control charger chip on the slave.
+ */
+#define EC_CMD_CHARGER_CONTROL 0x0602
+
+/**
+ * struct ec_params_charger_control - Charger control parameters
+ * @max_current: Charger current (mA). Positive to allow base to draw up to
+ *     max_current and (possibly) charge battery, negative to request current
+ *     from base (OTG).
+ * @otg_voltage: Voltage (mV) to use in OTG mode, ignored if max_current is
+ *     >= 0.
+ * @allow_charging: Allow base battery charging (only makes sense if
+ *     max_current > 0).
+ */
+struct ec_params_charger_control {
+	int16_t max_current;
+	uint16_t otg_voltage;
+	uint8_t allow_charging;
+} __ec_align_size1;
+
 /*****************************************************************************/
 /*
  * Reserve a range of host commands for board-specific, experimental, or
-- 
cgit v1.2.3


From 6f9d485ca4c5d3ac223a1e49f604192be12e0676 Mon Sep 17 00:00:00 2001
From: Gwendal Grignou <gwendal@chromium.org>
Date: Mon, 3 Jun 2019 11:33:56 -0700
Subject: mfd: cros_ec: Add API for Touchpad support

Add API to control touchpad presented by Embedded Controller.

Signed-off-by: Gwendal Grignou <gwendal@chromium.org>
Acked-by: Enric Balletbo i Serra <enric.balletbo@collabora.com>
Acked-by: Benson Leung <bleung@chromium.org>
Reviewed-by: Fabien Lahoudere <fabien.lahoudere@collabora.com>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 include/linux/mfd/cros_ec_commands.h | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mfd/cros_ec_commands.h b/include/linux/mfd/cros_ec_commands.h
index 52fd9bfafc7f..1d0311df44d3 100644
--- a/include/linux/mfd/cros_ec_commands.h
+++ b/include/linux/mfd/cros_ec_commands.h
@@ -5043,6 +5043,32 @@ struct ec_response_pd_chip_info_v1 {
 	};
 } __ec_align2;
 
+/*****************************************************************************/
+/* Touchpad MCU commands: range 0x0500-0x05FF */
+
+/* Perform touchpad self test */
+#define EC_CMD_TP_SELF_TEST 0x0500
+
+/* Get number of frame types, and the size of each type */
+#define EC_CMD_TP_FRAME_INFO 0x0501
+
+struct ec_response_tp_frame_info {
+	uint32_t n_frames;
+	uint32_t frame_sizes[0];
+} __ec_align4;
+
+/* Create a snapshot of current frame readings */
+#define EC_CMD_TP_FRAME_SNAPSHOT 0x0502
+
+/* Read the frame */
+#define EC_CMD_TP_FRAME_GET 0x0503
+
+struct ec_params_tp_frame_get {
+	uint32_t frame_index;
+	uint32_t offset;
+	uint32_t size;
+} __ec_align4;
+
 /*****************************************************************************/
 /* EC-EC communication commands: range 0x0600-0x06FF */
 
-- 
cgit v1.2.3


From da038d6ee7a4e791eba6b1954a6c49f4f2856786 Mon Sep 17 00:00:00 2001
From: Gwendal Grignou <gwendal@chromium.org>
Date: Mon, 3 Jun 2019 11:33:57 -0700
Subject: mfd: cros_ec: Add API for Fingerprint support

Add API for fingerprint sensor presented by embedded controller.

Signed-off-by: Gwendal Grignou <gwendal@chromium.org>
Acked-by: Enric Balletbo i Serra <enric.balletbo@collabora.com>
Acked-by: Benson Leung <bleung@chromium.org>
Reviewed-by: Fabien Lahoudere <fabien.lahoudere@collabora.com>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 include/linux/mfd/cros_ec_commands.h | 228 +++++++++++++++++++++++++++++++++++
 1 file changed, 228 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mfd/cros_ec_commands.h b/include/linux/mfd/cros_ec_commands.h
index 1d0311df44d3..4a9ac3861bdd 100644
--- a/include/linux/mfd/cros_ec_commands.h
+++ b/include/linux/mfd/cros_ec_commands.h
@@ -5043,6 +5043,234 @@ struct ec_response_pd_chip_info_v1 {
 	};
 } __ec_align2;
 
+/*****************************************************************************/
+/* Fingerprint MCU commands: range 0x0400-0x040x */
+
+/* Fingerprint SPI sensor passthru command: prototyping ONLY */
+#define EC_CMD_FP_PASSTHRU 0x0400
+
+#define EC_FP_FLAG_NOT_COMPLETE 0x1
+
+struct ec_params_fp_passthru {
+	uint16_t len;		/* Number of bytes to write then read */
+	uint16_t flags;		/* EC_FP_FLAG_xxx */
+	uint8_t data[];		/* Data to send */
+} __ec_align2;
+
+/* Configure the Fingerprint MCU behavior */
+#define EC_CMD_FP_MODE 0x0402
+
+/* Put the sensor in its lowest power mode */
+#define FP_MODE_DEEPSLEEP      BIT(0)
+/* Wait to see a finger on the sensor */
+#define FP_MODE_FINGER_DOWN    BIT(1)
+/* Poll until the finger has left the sensor */
+#define FP_MODE_FINGER_UP      BIT(2)
+/* Capture the current finger image */
+#define FP_MODE_CAPTURE        BIT(3)
+/* Finger enrollment session on-going */
+#define FP_MODE_ENROLL_SESSION BIT(4)
+/* Enroll the current finger image */
+#define FP_MODE_ENROLL_IMAGE   BIT(5)
+/* Try to match the current finger image */
+#define FP_MODE_MATCH          BIT(6)
+/* Reset and re-initialize the sensor. */
+#define FP_MODE_RESET_SENSOR   BIT(7)
+/* special value: don't change anything just read back current mode */
+#define FP_MODE_DONT_CHANGE    BIT(31)
+
+#define FP_VALID_MODES (FP_MODE_DEEPSLEEP      | \
+			FP_MODE_FINGER_DOWN    | \
+			FP_MODE_FINGER_UP      | \
+			FP_MODE_CAPTURE        | \
+			FP_MODE_ENROLL_SESSION | \
+			FP_MODE_ENROLL_IMAGE   | \
+			FP_MODE_MATCH          | \
+			FP_MODE_RESET_SENSOR   | \
+			FP_MODE_DONT_CHANGE)
+
+/* Capture types defined in bits [30..28] */
+#define FP_MODE_CAPTURE_TYPE_SHIFT 28
+#define FP_MODE_CAPTURE_TYPE_MASK  (0x7 << FP_MODE_CAPTURE_TYPE_SHIFT)
+/*
+ * This enum must remain ordered, if you add new values you must ensure that
+ * FP_CAPTURE_TYPE_MAX is still the last one.
+ */
+enum fp_capture_type {
+	/* Full blown vendor-defined capture (produces 'frame_size' bytes) */
+	FP_CAPTURE_VENDOR_FORMAT = 0,
+	/* Simple raw image capture (produces width x height x bpp bits) */
+	FP_CAPTURE_SIMPLE_IMAGE = 1,
+	/* Self test pattern (e.g. checkerboard) */
+	FP_CAPTURE_PATTERN0 = 2,
+	/* Self test pattern (e.g. inverted checkerboard) */
+	FP_CAPTURE_PATTERN1 = 3,
+	/* Capture for Quality test with fixed contrast */
+	FP_CAPTURE_QUALITY_TEST = 4,
+	/* Capture for pixel reset value test */
+	FP_CAPTURE_RESET_TEST = 5,
+	FP_CAPTURE_TYPE_MAX,
+};
+/* Extracts the capture type from the sensor 'mode' word */
+#define FP_CAPTURE_TYPE(mode) (((mode) & FP_MODE_CAPTURE_TYPE_MASK) \
+				       >> FP_MODE_CAPTURE_TYPE_SHIFT)
+
+struct ec_params_fp_mode {
+	uint32_t mode; /* as defined by FP_MODE_ constants */
+} __ec_align4;
+
+struct ec_response_fp_mode {
+	uint32_t mode; /* as defined by FP_MODE_ constants */
+} __ec_align4;
+
+/* Retrieve Fingerprint sensor information */
+#define EC_CMD_FP_INFO 0x0403
+
+/* Number of dead pixels detected on the last maintenance */
+#define FP_ERROR_DEAD_PIXELS(errors) ((errors) & 0x3FF)
+/* Unknown number of dead pixels detected on the last maintenance */
+#define FP_ERROR_DEAD_PIXELS_UNKNOWN (0x3FF)
+/* No interrupt from the sensor */
+#define FP_ERROR_NO_IRQ    BIT(12)
+/* SPI communication error */
+#define FP_ERROR_SPI_COMM  BIT(13)
+/* Invalid sensor Hardware ID */
+#define FP_ERROR_BAD_HWID  BIT(14)
+/* Sensor initialization failed */
+#define FP_ERROR_INIT_FAIL BIT(15)
+
+struct ec_response_fp_info_v0 {
+	/* Sensor identification */
+	uint32_t vendor_id;
+	uint32_t product_id;
+	uint32_t model_id;
+	uint32_t version;
+	/* Image frame characteristics */
+	uint32_t frame_size;
+	uint32_t pixel_format; /* using V4L2_PIX_FMT_ */
+	uint16_t width;
+	uint16_t height;
+	uint16_t bpp;
+	uint16_t errors; /* see FP_ERROR_ flags above */
+} __ec_align4;
+
+struct ec_response_fp_info {
+	/* Sensor identification */
+	uint32_t vendor_id;
+	uint32_t product_id;
+	uint32_t model_id;
+	uint32_t version;
+	/* Image frame characteristics */
+	uint32_t frame_size;
+	uint32_t pixel_format; /* using V4L2_PIX_FMT_ */
+	uint16_t width;
+	uint16_t height;
+	uint16_t bpp;
+	uint16_t errors; /* see FP_ERROR_ flags above */
+	/* Template/finger current information */
+	uint32_t template_size;  /* max template size in bytes */
+	uint16_t template_max;   /* maximum number of fingers/templates */
+	uint16_t template_valid; /* number of valid fingers/templates */
+	uint32_t template_dirty; /* bitmap of templates with MCU side changes */
+	uint32_t template_version; /* version of the template format */
+} __ec_align4;
+
+/* Get the last captured finger frame or a template content */
+#define EC_CMD_FP_FRAME 0x0404
+
+/* constants defining the 'offset' field which also contains the frame index */
+#define FP_FRAME_INDEX_SHIFT       28
+/* Frame buffer where the captured image is stored */
+#define FP_FRAME_INDEX_RAW_IMAGE    0
+/* First frame buffer holding a template */
+#define FP_FRAME_INDEX_TEMPLATE     1
+#define FP_FRAME_GET_BUFFER_INDEX(offset) ((offset) >> FP_FRAME_INDEX_SHIFT)
+#define FP_FRAME_OFFSET_MASK       0x0FFFFFFF
+
+/* Version of the format of the encrypted templates. */
+#define FP_TEMPLATE_FORMAT_VERSION 3
+
+/* Constants for encryption parameters */
+#define FP_CONTEXT_NONCE_BYTES 12
+#define FP_CONTEXT_USERID_WORDS (32 / sizeof(uint32_t))
+#define FP_CONTEXT_TAG_BYTES 16
+#define FP_CONTEXT_SALT_BYTES 16
+#define FP_CONTEXT_TPM_BYTES 32
+
+struct ec_fp_template_encryption_metadata {
+	/*
+	 * Version of the structure format (N=3).
+	 */
+	uint16_t struct_version;
+	/* Reserved bytes, set to 0. */
+	uint16_t reserved;
+	/*
+	 * The salt is *only* ever used for key derivation. The nonce is unique,
+	 * a different one is used for every message.
+	 */
+	uint8_t nonce[FP_CONTEXT_NONCE_BYTES];
+	uint8_t salt[FP_CONTEXT_SALT_BYTES];
+	uint8_t tag[FP_CONTEXT_TAG_BYTES];
+};
+
+struct ec_params_fp_frame {
+	/*
+	 * The offset contains the template index or FP_FRAME_INDEX_RAW_IMAGE
+	 * in the high nibble, and the real offset within the frame in
+	 * FP_FRAME_OFFSET_MASK.
+	 */
+	uint32_t offset;
+	uint32_t size;
+} __ec_align4;
+
+/* Load a template into the MCU */
+#define EC_CMD_FP_TEMPLATE 0x0405
+
+/* Flag in the 'size' field indicating that the full template has been sent */
+#define FP_TEMPLATE_COMMIT 0x80000000
+
+struct ec_params_fp_template {
+	uint32_t offset;
+	uint32_t size;
+	uint8_t data[];
+} __ec_align4;
+
+/* Clear the current fingerprint user context and set a new one */
+#define EC_CMD_FP_CONTEXT 0x0406
+
+struct ec_params_fp_context {
+	uint32_t userid[FP_CONTEXT_USERID_WORDS];
+} __ec_align4;
+
+#define EC_CMD_FP_STATS 0x0407
+
+#define FPSTATS_CAPTURE_INV  BIT(0)
+#define FPSTATS_MATCHING_INV BIT(1)
+
+struct ec_response_fp_stats {
+	uint32_t capture_time_us;
+	uint32_t matching_time_us;
+	uint32_t overall_time_us;
+	struct {
+		uint32_t lo;
+		uint32_t hi;
+	} overall_t0;
+	uint8_t timestamps_invalid;
+	int8_t template_matched;
+} __ec_align2;
+
+#define EC_CMD_FP_SEED 0x0408
+struct ec_params_fp_seed {
+	/*
+	 * Version of the structure format (N=3).
+	 */
+	uint16_t struct_version;
+	/* Reserved bytes, set to 0. */
+	uint16_t reserved;
+	/* Seed from the TPM. */
+	uint8_t seed[FP_CONTEXT_TPM_BYTES];
+} __ec_align4;
+
 /*****************************************************************************/
 /* Touchpad MCU commands: range 0x0500-0x05FF */
 
-- 
cgit v1.2.3


From a0d50b31cee948de1be0ad14b78127a00530f43e Mon Sep 17 00:00:00 2001
From: Gwendal Grignou <gwendal@chromium.org>
Date: Mon, 3 Jun 2019 11:33:58 -0700
Subject: mfd: cros_ec: Add API for rwsig

Add command to retrieve signature of image stored in the RW memory
slot(s).

Signed-off-by: Gwendal Grignou <gwendal@chromium.org>
Acked-by: Enric Balletbo i Serra <enric.balletbo@collabora.com>
Acked-by: Benson Leung <bleung@chromium.org>
Reviewed-by: Fabien Lahoudere <fabien.lahoudere@collabora.com>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 include/linux/mfd/cros_ec_commands.h | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mfd/cros_ec_commands.h b/include/linux/mfd/cros_ec_commands.h
index 4a9ac3861bdd..3d3a37b11002 100644
--- a/include/linux/mfd/cros_ec_commands.h
+++ b/include/linux/mfd/cros_ec_commands.h
@@ -5043,6 +5043,32 @@ struct ec_response_pd_chip_info_v1 {
 	};
 } __ec_align2;
 
+/* Run RW signature verification and get status */
+#define EC_CMD_RWSIG_CHECK_STATUS	0x011C
+
+struct ec_response_rwsig_check_status {
+	uint32_t status;
+} __ec_align4;
+
+/* For controlling RWSIG task */
+#define EC_CMD_RWSIG_ACTION	0x011D
+
+enum rwsig_action {
+	RWSIG_ACTION_ABORT = 0,		/* Abort RWSIG and prevent jumping */
+	RWSIG_ACTION_CONTINUE = 1,	/* Jump to RW immediately */
+};
+
+struct ec_params_rwsig_action {
+	uint32_t action;
+} __ec_align4;
+
+/* Run verification on a slot */
+#define EC_CMD_EFS_VERIFY	0x011E
+
+struct ec_params_efs_verify {
+	uint8_t region;		/* enum ec_flash_region */
+} __ec_align1;
+
 /*****************************************************************************/
 /* Fingerprint MCU commands: range 0x0400-0x040x */
 
-- 
cgit v1.2.3


From cc3a032fd7128d03715c655ad66a263b6d518071 Mon Sep 17 00:00:00 2001
From: Gwendal Grignou <gwendal@chromium.org>
Date: Mon, 3 Jun 2019 11:33:59 -0700
Subject: mfd: cros_ec: Add SKU ID and Secure storage API

Add API to store SKU, Cros board information in EC flash memory.
Add API to store security data in EC.

Signed-off-by: Gwendal Grignou <gwendal@chromium.org>
Acked-by: Enric Balletbo i Serra <enric.balletbo@collabora.com>
Acked-by: Benson Leung <bleung@chromium.org>
Reviewed-by: Fabien Lahoudere <fabien.lahoudere@collabora.com>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 include/linux/mfd/cros_ec_commands.h | 107 +++++++++++++++++++++++++++++++++++
 1 file changed, 107 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mfd/cros_ec_commands.h b/include/linux/mfd/cros_ec_commands.h
index 3d3a37b11002..860a76274334 100644
--- a/include/linux/mfd/cros_ec_commands.h
+++ b/include/linux/mfd/cros_ec_commands.h
@@ -1292,6 +1292,17 @@ struct ec_response_get_features {
 	uint32_t flags[2];
 } __ec_align4;
 
+/*****************************************************************************/
+/* Get the board's SKU ID from EC */
+#define EC_CMD_GET_SKU_ID 0x000E
+
+/* Set SKU ID from AP */
+#define EC_CMD_SET_SKU_ID 0x000F
+
+struct ec_sku_id_info {
+	uint32_t sku_id;
+} __ec_align4;
+
 /*****************************************************************************/
 /* Flash commands */
 
@@ -2902,6 +2913,49 @@ struct ec_response_port80_last_boot {
 	uint16_t code;
 } __ec_align2;
 
+/*****************************************************************************/
+/* Temporary secure storage for host verified boot use */
+
+/* Number of bytes in a vstore slot */
+#define EC_VSTORE_SLOT_SIZE 64
+
+/* Maximum number of vstore slots */
+#define EC_VSTORE_SLOT_MAX 32
+
+/* Get persistent storage info */
+#define EC_CMD_VSTORE_INFO 0x0049
+struct ec_response_vstore_info {
+	/* Indicates which slots are locked */
+	uint32_t slot_locked;
+	/* Total number of slots available */
+	uint8_t slot_count;
+} __ec_align_size1;
+
+/*
+ * Read temporary secure storage
+ *
+ * Response is EC_VSTORE_SLOT_SIZE bytes of data.
+ */
+#define EC_CMD_VSTORE_READ 0x004A
+
+struct ec_params_vstore_read {
+	uint8_t slot; /* Slot to read from */
+} __ec_align1;
+
+struct ec_response_vstore_read {
+	uint8_t data[EC_VSTORE_SLOT_SIZE];
+} __ec_align1;
+
+/*
+ * Write temporary secure storage and lock it.
+ */
+#define EC_CMD_VSTORE_WRITE 0x004B
+
+struct ec_params_vstore_write {
+	uint8_t slot; /* Slot to write to */
+	uint8_t data[EC_VSTORE_SLOT_SIZE];
+} __ec_align1;
+
 /*****************************************************************************/
 /* Thermal engine commands. Note that there are two implementations. We'll
  * reuse the command number, but the data and behavior is incompatible.
@@ -5069,6 +5123,59 @@ struct ec_params_efs_verify {
 	uint8_t region;		/* enum ec_flash_region */
 } __ec_align1;
 
+/*
+ * Retrieve info from Cros Board Info store. Response is based on the data
+ * type. Integers return a uint32. Strings return a string, using the response
+ * size to determine how big it is.
+ */
+#define EC_CMD_GET_CROS_BOARD_INFO	0x011F
+/*
+ * Write info into Cros Board Info on EEPROM. Write fails if the board has
+ * hardware write-protect enabled.
+ */
+#define EC_CMD_SET_CROS_BOARD_INFO	0x0120
+
+enum cbi_data_tag {
+	CBI_TAG_BOARD_VERSION = 0, /* uint32_t or smaller */
+	CBI_TAG_OEM_ID = 1,        /* uint32_t or smaller */
+	CBI_TAG_SKU_ID = 2,        /* uint32_t or smaller */
+	CBI_TAG_DRAM_PART_NUM = 3, /* variable length ascii, nul terminated. */
+	CBI_TAG_OEM_NAME = 4,      /* variable length ascii, nul terminated. */
+	CBI_TAG_MODEL_ID = 5,      /* uint32_t or smaller */
+	CBI_TAG_COUNT,
+};
+
+/*
+ * Flags to control read operation
+ *
+ * RELOAD:  Invalidate cache and read data from EEPROM. Useful to verify
+ *          write was successful without reboot.
+ */
+#define CBI_GET_RELOAD		BIT(0)
+
+struct ec_params_get_cbi {
+	uint32_t tag;		/* enum cbi_data_tag */
+	uint32_t flag;		/* CBI_GET_* */
+} __ec_align4;
+
+/*
+ * Flags to control write behavior.
+ *
+ * NO_SYNC: Makes EC update data in RAM but skip writing to EEPROM. It's
+ *          useful when writing multiple fields in a row.
+ * INIT:    Needs to be set when creating a new CBI from scratch. All fields
+ *          will be initialized to zero first.
+ */
+#define CBI_SET_NO_SYNC		BIT(0)
+#define CBI_SET_INIT		BIT(1)
+
+struct ec_params_set_cbi {
+	uint32_t tag;		/* enum cbi_data_tag */
+	uint32_t flag;		/* CBI_SET_* */
+	uint32_t size;		/* Data size */
+	uint8_t data[];		/* For string and raw data */
+} __ec_align1;
+
 /*****************************************************************************/
 /* Fingerprint MCU commands: range 0x0400-0x040x */
 
-- 
cgit v1.2.3


From 2f2e6d14866e34b0982460760f770265215f08c5 Mon Sep 17 00:00:00 2001
From: Gwendal Grignou <gwendal@chromium.org>
Date: Mon, 3 Jun 2019 11:34:00 -0700
Subject: mfd: cros_ec: Add Management API entry points

Add commands for test and management.
Add command space for future development.

Signed-off-by: Gwendal Grignou <gwendal@chromium.org>
Acked-by: Enric Balletbo i Serra <enric.balletbo@collabora.com>
Acked-by: Benson Leung <bleung@chromium.org>
Reviewed-by: Fabien Lahoudere <fabien.lahoudere@collabora.com>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 include/linux/mfd/cros_ec_commands.h | 113 +++++++++++++++++++++++++++++++++++
 1 file changed, 113 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mfd/cros_ec_commands.h b/include/linux/mfd/cros_ec_commands.h
index 860a76274334..fc8babce1576 100644
--- a/include/linux/mfd/cros_ec_commands.h
+++ b/include/linux/mfd/cros_ec_commands.h
@@ -5176,6 +5176,119 @@ struct ec_params_set_cbi {
 	uint8_t data[];		/* For string and raw data */
 } __ec_align1;
 
+/*
+ * Information about resets of the AP by the EC and the EC's own uptime.
+ */
+#define EC_CMD_GET_UPTIME_INFO 0x0121
+
+struct ec_response_uptime_info {
+	/*
+	 * Number of milliseconds since the last EC boot. Sysjump resets
+	 * typically do not restart the EC's time_since_boot epoch.
+	 *
+	 * WARNING: The EC's sense of time is much less accurate than the AP's
+	 * sense of time, in both phase and frequency.  This timebase is similar
+	 * to CLOCK_MONOTONIC_RAW, but with 1% or more frequency error.
+	 */
+	uint32_t time_since_ec_boot_ms;
+
+	/*
+	 * Number of times the AP was reset by the EC since the last EC boot.
+	 * Note that the AP may be held in reset by the EC during the initial
+	 * boot sequence, such that the very first AP boot may count as more
+	 * than one here.
+	 */
+	uint32_t ap_resets_since_ec_boot;
+
+	/*
+	 * The set of flags which describe the EC's most recent reset.  See
+	 * include/system.h RESET_FLAG_* for details.
+	 */
+	uint32_t ec_reset_flags;
+
+	/* Empty log entries have both the cause and timestamp set to zero. */
+	struct ap_reset_log_entry {
+		/*
+		 * See include/chipset.h: enum chipset_{reset,shutdown}_reason
+		 * for details.
+		 */
+		uint16_t reset_cause;
+
+		/* Reserved for protocol growth. */
+		uint16_t reserved;
+
+		/*
+		 * The time of the reset's assertion, in milliseconds since the
+		 * last EC boot, in the same epoch as time_since_ec_boot_ms.
+		 * Set to zero if the log entry is empty.
+		 */
+		uint32_t reset_time_ms;
+	} recent_ap_reset[4];
+} __ec_align4;
+
+/*
+ * Add entropy to the device secret (stored in the rollback region).
+ *
+ * Depending on the chip, the operation may take a long time (e.g. to erase
+ * flash), so the commands are asynchronous.
+ */
+#define EC_CMD_ADD_ENTROPY	0x0122
+
+enum add_entropy_action {
+	/* Add entropy to the current secret. */
+	ADD_ENTROPY_ASYNC = 0,
+	/*
+	 * Add entropy, and also make sure that the previous secret is erased.
+	 * (this can be implemented by adding entropy multiple times until
+	 * all rollback blocks have been overwritten).
+	 */
+	ADD_ENTROPY_RESET_ASYNC = 1,
+	/* Read back result from the previous operation. */
+	ADD_ENTROPY_GET_RESULT = 2,
+};
+
+struct ec_params_rollback_add_entropy {
+	uint8_t action;
+} __ec_align1;
+
+/*
+ * Perform a single read of a given ADC channel.
+ */
+#define EC_CMD_ADC_READ		0x0123
+
+struct ec_params_adc_read {
+	uint8_t adc_channel;
+} __ec_align1;
+
+struct ec_response_adc_read {
+	int32_t adc_value;
+} __ec_align4;
+
+/*
+ * Read back rollback info
+ */
+#define EC_CMD_ROLLBACK_INFO		0x0124
+
+struct ec_response_rollback_info {
+	int32_t id; /* Incrementing number to indicate which region to use. */
+	int32_t rollback_min_version;
+	int32_t rw_rollback_version;
+} __ec_align4;
+
+
+/* Issue AP reset */
+#define EC_CMD_AP_RESET 0x0125
+
+/*****************************************************************************/
+/* The command range 0x200-0x2FF is reserved for Rotor. */
+
+/*****************************************************************************/
+/*
+ * Reserve a range of host commands for the CR51 firmware.
+ */
+#define EC_CMD_CR51_BASE 0x0300
+#define EC_CMD_CR51_LAST 0x03FF
+
 /*****************************************************************************/
 /* Fingerprint MCU commands: range 0x0400-0x040x */
 
-- 
cgit v1.2.3


From 3aa6be30da899619c44aa654313ba66eb44e7291 Mon Sep 17 00:00:00 2001
From: Gwendal Grignou <gwendal@chromium.org>
Date: Mon, 3 Jun 2019 11:34:01 -0700
Subject: mfd: cros_ec: Update I2S API

Improve I2S API.
Rename ec_response_codec_gain into ec_codec_i2s_gain,
update caller accordingly.

Signed-off-by: Gwendal Grignou <gwendal@chromium.org>
Acked-by: Enric Balletbo i Serra <enric.balletbo@collabora.com>
Acked-by: Benson Leung <bleung@chromium.org>
Reviewed-by: Fabien Lahoudere <fabien.lahoudere@collabora.com>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 include/linux/mfd/cros_ec_commands.h | 44 ++++++++++++++++--------------------
 sound/soc/codecs/cros_ec_codec.c     |  8 +++----
 2 files changed, 24 insertions(+), 28 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mfd/cros_ec_commands.h b/include/linux/mfd/cros_ec_commands.h
index fc8babce1576..fa397722f17e 100644
--- a/include/linux/mfd/cros_ec_commands.h
+++ b/include/linux/mfd/cros_ec_commands.h
@@ -4471,6 +4471,7 @@ enum mkbp_cec_event {
 /* Commands for I2S recording on audio codec. */
 
 #define EC_CMD_CODEC_I2S 0x00BC
+#define EC_WOV_I2S_SAMPLE_RATE 48000
 
 enum ec_codec_i2s_subcmd {
 	EC_CODEC_SET_SAMPLE_DEPTH = 0x0,
@@ -4480,6 +4481,7 @@ enum ec_codec_i2s_subcmd {
 	EC_CODEC_I2S_SET_CONFIG = 0x4,
 	EC_CODEC_I2S_SET_TDM_CONFIG = 0x5,
 	EC_CODEC_I2S_SET_BCLK = 0x6,
+	EC_CODEC_I2S_SUBCMD_COUNT = 0x7,
 };
 
 enum ec_sample_depth_value {
@@ -4496,6 +4498,21 @@ enum ec_i2s_config {
 	EC_DAI_FMT_PCM_TDM = 5,
 };
 
+/*
+ * For subcommand EC_CODEC_GET_GAIN.
+ */
+struct __ec_align1 ec_codec_i2s_gain {
+	uint8_t left;
+	uint8_t right;
+};
+
+struct __ec_todo_unpacked ec_param_codec_i2s_tdm {
+	int16_t ch0_delay; /* 0 to 496 */
+	int16_t ch1_delay; /* -1 to 496 */
+	uint8_t adjacent_to_ch0;
+	uint8_t adjacent_to_ch1;
+};
+
 struct __ec_todo_packed ec_param_codec_i2s {
 	/* enum ec_codec_i2s_subcmd */
 	uint8_t cmd;
@@ -4510,10 +4527,7 @@ struct __ec_todo_packed ec_param_codec_i2s {
 		 * EC_CODEC_SET_GAIN
 		 * Value should be 0~43 for both channels.
 		 */
-		struct __ec_align1 ec_param_codec_i2s_set_gain {
-			uint8_t left;
-			uint8_t right;
-		} gain;
+		struct ec_codec_i2s_gain gain;
 
 		/*
 		 * EC_CODEC_I2S_ENABLE
@@ -4522,7 +4536,7 @@ struct __ec_todo_packed ec_param_codec_i2s {
 		uint8_t i2s_enable;
 
 		/*
-		 * EC_CODEC_I2S_SET_COFNIG
+		 * EC_CODEC_I2S_SET_CONFIG
 		 * Value should be one of ec_i2s_config.
 		 */
 		uint8_t i2s_config;
@@ -4531,18 +4545,7 @@ struct __ec_todo_packed ec_param_codec_i2s {
 		 * EC_CODEC_I2S_SET_TDM_CONFIG
 		 * Value should be one of ec_i2s_config.
 		 */
-		struct __ec_todo_unpacked ec_param_codec_i2s_tdm {
-			/*
-			 * 0 to 496
-			 */
-			int16_t ch0_delay;
-			/*
-			 * -1 to 496
-			 */
-			int16_t ch1_delay;
-			uint8_t adjacent_to_ch0;
-			uint8_t adjacent_to_ch1;
-		} tdm_param;
+		struct ec_param_codec_i2s_tdm tdm_param;
 
 		/*
 		 * EC_CODEC_I2S_SET_BCLK
@@ -4551,13 +4554,6 @@ struct __ec_todo_packed ec_param_codec_i2s {
 	};
 };
 
-/*
- * For subcommand EC_CODEC_GET_GAIN.
- */
-struct ec_response_codec_gain {
-	uint8_t left;
-	uint8_t right;
-} __ec_align1;
 
 /*****************************************************************************/
 /* System commands */
diff --git a/sound/soc/codecs/cros_ec_codec.c b/sound/soc/codecs/cros_ec_codec.c
index 99a3af8a15ff..87830ed5ebf4 100644
--- a/sound/soc/codecs/cros_ec_codec.c
+++ b/sound/soc/codecs/cros_ec_codec.c
@@ -38,21 +38,21 @@ static const DECLARE_TLV_DB_SCALE(ec_mic_gain_tlv, 0, 100, 0);
 
 static int ec_command_get_gain(struct snd_soc_component *component,
 			       struct ec_param_codec_i2s *param,
-			       struct ec_response_codec_gain *resp)
+			       struct ec_codec_i2s_gain *resp)
 {
 	struct cros_ec_codec_data *codec_data =
 		snd_soc_component_get_drvdata(component);
 	struct cros_ec_device *ec_device = codec_data->ec_device;
 	u8 buffer[sizeof(struct cros_ec_command) +
 		  max(sizeof(struct ec_param_codec_i2s),
-		      sizeof(struct ec_response_codec_gain))];
+		      sizeof(struct ec_codec_i2s_gain))];
 	struct cros_ec_command *msg = (struct cros_ec_command *)&buffer;
 	int ret;
 
 	msg->version = 0;
 	msg->command = EC_CMD_CODEC_I2S;
 	msg->outsize = sizeof(struct ec_param_codec_i2s);
-	msg->insize = sizeof(struct ec_response_codec_gain);
+	msg->insize = sizeof(struct ec_codec_i2s_gain);
 
 	memcpy(msg->data, param, msg->outsize);
 
@@ -226,7 +226,7 @@ static int get_ec_mic_gain(struct snd_soc_component *component,
 			   u8 *left, u8 *right)
 {
 	struct ec_param_codec_i2s param;
-	struct ec_response_codec_gain resp;
+	struct ec_codec_i2s_gain resp;
 	int ret;
 
 	param.cmd = EC_CODEC_GET_GAIN;
-- 
cgit v1.2.3


From fe03d4745675cbd678cb8c50d951df0abafdcaee Mon Sep 17 00:00:00 2001
From: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
Date: Mon, 10 Jun 2019 13:00:24 +0200
Subject: Update my email address

It's better to use my kadlec@netfilter.org email address in
the source code. I might not be able to use
kadlec@blackhole.kfki.hu in the future.

Signed-off-by: Jozsef Kadlecsik <kadlec@netfilter.org>
Signed-off-by: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
---
 CREDITS                                        | 2 +-
 MAINTAINERS                                    | 2 +-
 include/linux/jhash.h                          | 2 +-
 include/linux/netfilter/ipset/ip_set.h         | 2 +-
 include/linux/netfilter/ipset/ip_set_counter.h | 2 +-
 include/linux/netfilter/ipset/ip_set_skbinfo.h | 2 +-
 include/linux/netfilter/ipset/ip_set_timeout.h | 2 +-
 include/uapi/linux/netfilter/ipset/ip_set.h    | 2 +-
 net/ipv4/netfilter/iptable_raw.c               | 2 +-
 net/ipv4/netfilter/nf_nat_h323.c               | 2 +-
 net/ipv6/netfilter/ip6table_raw.c              | 2 +-
 net/netfilter/ipset/ip_set_bitmap_gen.h        | 2 +-
 net/netfilter/ipset/ip_set_bitmap_ip.c         | 4 ++--
 net/netfilter/ipset/ip_set_bitmap_ipmac.c      | 4 ++--
 net/netfilter/ipset/ip_set_bitmap_port.c       | 4 ++--
 net/netfilter/ipset/ip_set_core.c              | 4 ++--
 net/netfilter/ipset/ip_set_getport.c           | 2 +-
 net/netfilter/ipset/ip_set_hash_gen.h          | 2 +-
 net/netfilter/ipset/ip_set_hash_ip.c           | 4 ++--
 net/netfilter/ipset/ip_set_hash_ipmark.c       | 2 +-
 net/netfilter/ipset/ip_set_hash_ipport.c       | 4 ++--
 net/netfilter/ipset/ip_set_hash_ipportip.c     | 4 ++--
 net/netfilter/ipset/ip_set_hash_ipportnet.c    | 4 ++--
 net/netfilter/ipset/ip_set_hash_mac.c          | 4 ++--
 net/netfilter/ipset/ip_set_hash_net.c          | 4 ++--
 net/netfilter/ipset/ip_set_hash_netiface.c     | 4 ++--
 net/netfilter/ipset/ip_set_hash_netnet.c       | 2 +-
 net/netfilter/ipset/ip_set_hash_netport.c      | 4 ++--
 net/netfilter/ipset/ip_set_hash_netportnet.c   | 2 +-
 net/netfilter/ipset/ip_set_list_set.c          | 4 ++--
 net/netfilter/nf_conntrack_h323_main.c         | 2 +-
 net/netfilter/nf_conntrack_proto_tcp.c         | 2 +-
 net/netfilter/xt_iprange.c                     | 4 ++--
 net/netfilter/xt_set.c                         | 4 ++--
 34 files changed, 49 insertions(+), 49 deletions(-)

(limited to 'include/linux')

diff --git a/CREDITS b/CREDITS
index 8e0342620a06..4200f4f91a16 100644
--- a/CREDITS
+++ b/CREDITS
@@ -1800,7 +1800,7 @@ S: 2300 Copenhagen S.
 S: Denmark
 
 N: Jozsef Kadlecsik
-E: kadlec@blackhole.kfki.hu
+E: kadlec@netfilter.org
 P: 1024D/470DB964 4CB3 1A05 713E 9BF7 FAC5  5809 DD8C B7B1 470D B964
 D: netfilter: TCP window tracking code
 D: netfilter: raw table
diff --git a/MAINTAINERS b/MAINTAINERS
index fcbd648b960e..4c65ce86fc9e 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -10858,7 +10858,7 @@ F:	drivers/net/ethernet/neterion/
 
 NETFILTER
 M:	Pablo Neira Ayuso <pablo@netfilter.org>
-M:	Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+M:	Jozsef Kadlecsik <kadlec@netfilter.org>
 M:	Florian Westphal <fw@strlen.de>
 L:	netfilter-devel@vger.kernel.org
 L:	coreteam@netfilter.org
diff --git a/include/linux/jhash.h b/include/linux/jhash.h
index 8037850f3104..ba2f6a9776b6 100644
--- a/include/linux/jhash.h
+++ b/include/linux/jhash.h
@@ -17,7 +17,7 @@
  * if SELF_TEST is defined.  You can use this free for any purpose.  It's in
  * the public domain.  It has no warranty.
  *
- * Copyright (C) 2009-2010 Jozsef Kadlecsik (kadlec@blackhole.kfki.hu)
+ * Copyright (C) 2009-2010 Jozsef Kadlecsik (kadlec@netfilter.org)
  *
  * I've modified Bob's hash to be useful in the Linux kernel, and
  * any bugs present are my fault.
diff --git a/include/linux/netfilter/ipset/ip_set.h b/include/linux/netfilter/ipset/ip_set.h
index e499d170f12d..f5c6e7cd6469 100644
--- a/include/linux/netfilter/ipset/ip_set.h
+++ b/include/linux/netfilter/ipset/ip_set.h
@@ -1,7 +1,7 @@
 /* Copyright (C) 2000-2002 Joakim Axelsson <gozem@linux.nu>
  *                         Patrick Schaaf <bof@bof.de>
  *                         Martin Josefsson <gandalf@wlug.westbo.se>
- * Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+ * Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@netfilter.org>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
diff --git a/include/linux/netfilter/ipset/ip_set_counter.h b/include/linux/netfilter/ipset/ip_set_counter.h
index 3d33a2c3f39f..305aeda2a899 100644
--- a/include/linux/netfilter/ipset/ip_set_counter.h
+++ b/include/linux/netfilter/ipset/ip_set_counter.h
@@ -1,7 +1,7 @@
 #ifndef _IP_SET_COUNTER_H
 #define _IP_SET_COUNTER_H
 
-/* Copyright (C) 2015 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+/* Copyright (C) 2015 Jozsef Kadlecsik <kadlec@netfilter.org>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
diff --git a/include/linux/netfilter/ipset/ip_set_skbinfo.h b/include/linux/netfilter/ipset/ip_set_skbinfo.h
index 29d7ef2bc3fa..fac57ef854c2 100644
--- a/include/linux/netfilter/ipset/ip_set_skbinfo.h
+++ b/include/linux/netfilter/ipset/ip_set_skbinfo.h
@@ -1,7 +1,7 @@
 #ifndef _IP_SET_SKBINFO_H
 #define _IP_SET_SKBINFO_H
 
-/* Copyright (C) 2015 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+/* Copyright (C) 2015 Jozsef Kadlecsik <kadlec@netfilter.org>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
diff --git a/include/linux/netfilter/ipset/ip_set_timeout.h b/include/linux/netfilter/ipset/ip_set_timeout.h
index 8ce271e187b6..dc74150f3432 100644
--- a/include/linux/netfilter/ipset/ip_set_timeout.h
+++ b/include/linux/netfilter/ipset/ip_set_timeout.h
@@ -1,7 +1,7 @@
 #ifndef _IP_SET_TIMEOUT_H
 #define _IP_SET_TIMEOUT_H
 
-/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@netfilter.org>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
diff --git a/include/uapi/linux/netfilter/ipset/ip_set.h b/include/uapi/linux/netfilter/ipset/ip_set.h
index ea69ca21ff23..eea166c52c36 100644
--- a/include/uapi/linux/netfilter/ipset/ip_set.h
+++ b/include/uapi/linux/netfilter/ipset/ip_set.h
@@ -2,7 +2,7 @@
 /* Copyright (C) 2000-2002 Joakim Axelsson <gozem@linux.nu>
  *                         Patrick Schaaf <bof@bof.de>
  *                         Martin Josefsson <gandalf@wlug.westbo.se>
- * Copyright (C) 2003-2011 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+ * Copyright (C) 2003-2011 Jozsef Kadlecsik <kadlec@netfilter.org>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
diff --git a/net/ipv4/netfilter/iptable_raw.c b/net/ipv4/netfilter/iptable_raw.c
index 6eefde5bc468..69697eb4bfc6 100644
--- a/net/ipv4/netfilter/iptable_raw.c
+++ b/net/ipv4/netfilter/iptable_raw.c
@@ -2,7 +2,7 @@
 /*
  * 'raw' table, which is the very first hooked in at PRE_ROUTING and LOCAL_OUT .
  *
- * Copyright (C) 2003 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+ * Copyright (C) 2003 Jozsef Kadlecsik <kadlec@netfilter.org>
  */
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 #include <linux/module.h>
diff --git a/net/ipv4/netfilter/nf_nat_h323.c b/net/ipv4/netfilter/nf_nat_h323.c
index 15f2b2604890..076b6b29d66d 100644
--- a/net/ipv4/netfilter/nf_nat_h323.c
+++ b/net/ipv4/netfilter/nf_nat_h323.c
@@ -7,7 +7,7 @@
  * This source code is licensed under General Public License version 2.
  *
  * Based on the 'brute force' H.323 NAT module by
- * Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+ * Jozsef Kadlecsik <kadlec@netfilter.org>
  */
 
 #include <linux/module.h>
diff --git a/net/ipv6/netfilter/ip6table_raw.c b/net/ipv6/netfilter/ip6table_raw.c
index 3f7d4691c423..a22100b1cf2c 100644
--- a/net/ipv6/netfilter/ip6table_raw.c
+++ b/net/ipv6/netfilter/ip6table_raw.c
@@ -2,7 +2,7 @@
 /*
  * IPv6 raw table, a port of the IPv4 raw table to IPv6
  *
- * Copyright (C) 2003 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+ * Copyright (C) 2003 Jozsef Kadlecsik <kadlec@netfilter.org>
  */
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 #include <linux/module.h>
diff --git a/net/netfilter/ipset/ip_set_bitmap_gen.h b/net/netfilter/ipset/ip_set_bitmap_gen.h
index 38ef2ea838cb..29c1e9a50601 100644
--- a/net/netfilter/ipset/ip_set_bitmap_gen.h
+++ b/net/netfilter/ipset/ip_set_bitmap_gen.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+/* Copyright (C) 2013 Jozsef Kadlecsik <kadlec@netfilter.org>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
diff --git a/net/netfilter/ipset/ip_set_bitmap_ip.c b/net/netfilter/ipset/ip_set_bitmap_ip.c
index 488d6d05c65c..5a66c5499700 100644
--- a/net/netfilter/ipset/ip_set_bitmap_ip.c
+++ b/net/netfilter/ipset/ip_set_bitmap_ip.c
@@ -1,6 +1,6 @@
 /* Copyright (C) 2000-2002 Joakim Axelsson <gozem@linux.nu>
  *                         Patrick Schaaf <bof@bof.de>
- * Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+ * Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@netfilter.org>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
@@ -31,7 +31,7 @@
 #define IPSET_TYPE_REV_MAX	3	/* skbinfo support added */
 
 MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
+MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>");
 IP_SET_MODULE_DESC("bitmap:ip", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX);
 MODULE_ALIAS("ip_set_bitmap:ip");
 
diff --git a/net/netfilter/ipset/ip_set_bitmap_ipmac.c b/net/netfilter/ipset/ip_set_bitmap_ipmac.c
index 980000fc3b50..ec7a8b12642c 100644
--- a/net/netfilter/ipset/ip_set_bitmap_ipmac.c
+++ b/net/netfilter/ipset/ip_set_bitmap_ipmac.c
@@ -1,7 +1,7 @@
 /* Copyright (C) 2000-2002 Joakim Axelsson <gozem@linux.nu>
  *                         Patrick Schaaf <bof@bof.de>
  *			   Martin Josefsson <gandalf@wlug.westbo.se>
- * Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+ * Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@netfilter.org>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
@@ -31,7 +31,7 @@
 #define IPSET_TYPE_REV_MAX	3	/* skbinfo support added */
 
 MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
+MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>");
 IP_SET_MODULE_DESC("bitmap:ip,mac", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX);
 MODULE_ALIAS("ip_set_bitmap:ip,mac");
 
diff --git a/net/netfilter/ipset/ip_set_bitmap_port.c b/net/netfilter/ipset/ip_set_bitmap_port.c
index b561ca8b3659..18275ec4924c 100644
--- a/net/netfilter/ipset/ip_set_bitmap_port.c
+++ b/net/netfilter/ipset/ip_set_bitmap_port.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@netfilter.org>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
@@ -26,7 +26,7 @@
 #define IPSET_TYPE_REV_MAX	3	/* skbinfo support added */
 
 MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
+MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>");
 IP_SET_MODULE_DESC("bitmap:port", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX);
 MODULE_ALIAS("ip_set_bitmap:port");
 
diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c
index 039892cd2b7d..18430ad2fdf2 100644
--- a/net/netfilter/ipset/ip_set_core.c
+++ b/net/netfilter/ipset/ip_set_core.c
@@ -1,6 +1,6 @@
 /* Copyright (C) 2000-2002 Joakim Axelsson <gozem@linux.nu>
  *                         Patrick Schaaf <bof@bof.de>
- * Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+ * Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@netfilter.org>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
@@ -51,7 +51,7 @@ static unsigned int max_sets;
 module_param(max_sets, int, 0600);
 MODULE_PARM_DESC(max_sets, "maximal number of sets");
 MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
+MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>");
 MODULE_DESCRIPTION("core IP set support");
 MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_IPSET);
 
diff --git a/net/netfilter/ipset/ip_set_getport.c b/net/netfilter/ipset/ip_set_getport.c
index 3f09cdb42562..dc7b46b41354 100644
--- a/net/netfilter/ipset/ip_set_getport.c
+++ b/net/netfilter/ipset/ip_set_getport.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2003-2011 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+/* Copyright (C) 2003-2011 Jozsef Kadlecsik <kadlec@netfilter.org>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
diff --git a/net/netfilter/ipset/ip_set_hash_gen.h b/net/netfilter/ipset/ip_set_hash_gen.h
index 623e0d675725..07ef941130a6 100644
--- a/net/netfilter/ipset/ip_set_hash_gen.h
+++ b/net/netfilter/ipset/ip_set_hash_gen.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+/* Copyright (C) 2013 Jozsef Kadlecsik <kadlec@netfilter.org>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
diff --git a/net/netfilter/ipset/ip_set_hash_ip.c b/net/netfilter/ipset/ip_set_hash_ip.c
index 613eb212cb48..7b82bf1104ce 100644
--- a/net/netfilter/ipset/ip_set_hash_ip.c
+++ b/net/netfilter/ipset/ip_set_hash_ip.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@netfilter.org>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
@@ -30,7 +30,7 @@
 #define IPSET_TYPE_REV_MAX	4	/* skbinfo support  */
 
 MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
+MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>");
 IP_SET_MODULE_DESC("hash:ip", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX);
 MODULE_ALIAS("ip_set_hash:ip");
 
diff --git a/net/netfilter/ipset/ip_set_hash_ipmark.c b/net/netfilter/ipset/ip_set_hash_ipmark.c
index f3ba8348cf9d..7d468f98a252 100644
--- a/net/netfilter/ipset/ip_set_hash_ipmark.c
+++ b/net/netfilter/ipset/ip_set_hash_ipmark.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@netfilter.org>
  * Copyright (C) 2013 Smoothwall Ltd. <vytas.dauksa@smoothwall.net>
  *
  * This program is free software; you can redistribute it and/or modify
diff --git a/net/netfilter/ipset/ip_set_hash_ipport.c b/net/netfilter/ipset/ip_set_hash_ipport.c
index ddb8039ec1d2..d358ee69d04b 100644
--- a/net/netfilter/ipset/ip_set_hash_ipport.c
+++ b/net/netfilter/ipset/ip_set_hash_ipport.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@netfilter.org>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
@@ -32,7 +32,7 @@
 #define IPSET_TYPE_REV_MAX	5 /* skbinfo support added */
 
 MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
+MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>");
 IP_SET_MODULE_DESC("hash:ip,port", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX);
 MODULE_ALIAS("ip_set_hash:ip,port");
 
diff --git a/net/netfilter/ipset/ip_set_hash_ipportip.c b/net/netfilter/ipset/ip_set_hash_ipportip.c
index a7f4d7a85420..0a304785f912 100644
--- a/net/netfilter/ipset/ip_set_hash_ipportip.c
+++ b/net/netfilter/ipset/ip_set_hash_ipportip.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@netfilter.org>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
@@ -32,7 +32,7 @@
 #define IPSET_TYPE_REV_MAX	5 /* skbinfo support added */
 
 MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
+MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>");
 IP_SET_MODULE_DESC("hash:ip,port,ip", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX);
 MODULE_ALIAS("ip_set_hash:ip,port,ip");
 
diff --git a/net/netfilter/ipset/ip_set_hash_ipportnet.c b/net/netfilter/ipset/ip_set_hash_ipportnet.c
index 88b83d6d3084..245f7d714870 100644
--- a/net/netfilter/ipset/ip_set_hash_ipportnet.c
+++ b/net/netfilter/ipset/ip_set_hash_ipportnet.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@netfilter.org>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
@@ -34,7 +34,7 @@
 #define IPSET_TYPE_REV_MAX	7 /* skbinfo support added */
 
 MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
+MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>");
 IP_SET_MODULE_DESC("hash:ip,port,net", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX);
 MODULE_ALIAS("ip_set_hash:ip,port,net");
 
diff --git a/net/netfilter/ipset/ip_set_hash_mac.c b/net/netfilter/ipset/ip_set_hash_mac.c
index 4fe5f243d0a3..3d1fc71dac38 100644
--- a/net/netfilter/ipset/ip_set_hash_mac.c
+++ b/net/netfilter/ipset/ip_set_hash_mac.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2014 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+/* Copyright (C) 2014 Jozsef Kadlecsik <kadlec@netfilter.org>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
@@ -23,7 +23,7 @@
 #define IPSET_TYPE_REV_MAX	0
 
 MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
+MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>");
 IP_SET_MODULE_DESC("hash:mac", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX);
 MODULE_ALIAS("ip_set_hash:mac");
 
diff --git a/net/netfilter/ipset/ip_set_hash_net.c b/net/netfilter/ipset/ip_set_hash_net.c
index 5449e23af13a..470701fda231 100644
--- a/net/netfilter/ipset/ip_set_hash_net.c
+++ b/net/netfilter/ipset/ip_set_hash_net.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@netfilter.org>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
@@ -31,7 +31,7 @@
 #define IPSET_TYPE_REV_MAX	6 /* skbinfo mapping support added */
 
 MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
+MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>");
 IP_SET_MODULE_DESC("hash:net", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX);
 MODULE_ALIAS("ip_set_hash:net");
 
diff --git a/net/netfilter/ipset/ip_set_hash_netiface.c b/net/netfilter/ipset/ip_set_hash_netiface.c
index f5164c1efce2..1df8656ad84d 100644
--- a/net/netfilter/ipset/ip_set_hash_netiface.c
+++ b/net/netfilter/ipset/ip_set_hash_netiface.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2011-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+/* Copyright (C) 2011-2013 Jozsef Kadlecsik <kadlec@netfilter.org>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
@@ -32,7 +32,7 @@
 #define IPSET_TYPE_REV_MAX	6 /* skbinfo support added */
 
 MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
+MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>");
 IP_SET_MODULE_DESC("hash:net,iface", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX);
 MODULE_ALIAS("ip_set_hash:net,iface");
 
diff --git a/net/netfilter/ipset/ip_set_hash_netnet.c b/net/netfilter/ipset/ip_set_hash_netnet.c
index 5a2b923bd81f..e0553be89600 100644
--- a/net/netfilter/ipset/ip_set_hash_netnet.c
+++ b/net/netfilter/ipset/ip_set_hash_netnet.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@netfilter.org>
  * Copyright (C) 2013 Oliver Smith <oliver@8.c.9.b.0.7.4.0.1.0.0.2.ip6.arpa>
  *
  * This program is free software; you can redistribute it and/or modify
diff --git a/net/netfilter/ipset/ip_set_hash_netport.c b/net/netfilter/ipset/ip_set_hash_netport.c
index 1a187be9ebc8..943d55d76fcf 100644
--- a/net/netfilter/ipset/ip_set_hash_netport.c
+++ b/net/netfilter/ipset/ip_set_hash_netport.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@netfilter.org>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
@@ -33,7 +33,7 @@
 #define IPSET_TYPE_REV_MAX	7 /* skbinfo support added */
 
 MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
+MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>");
 IP_SET_MODULE_DESC("hash:net,port", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX);
 MODULE_ALIAS("ip_set_hash:net,port");
 
diff --git a/net/netfilter/ipset/ip_set_hash_netportnet.c b/net/netfilter/ipset/ip_set_hash_netportnet.c
index 613e18e720a4..afaff99e578c 100644
--- a/net/netfilter/ipset/ip_set_hash_netportnet.c
+++ b/net/netfilter/ipset/ip_set_hash_netportnet.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@netfilter.org>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
diff --git a/net/netfilter/ipset/ip_set_list_set.c b/net/netfilter/ipset/ip_set_list_set.c
index 4f894165cdcd..ed4360072f64 100644
--- a/net/netfilter/ipset/ip_set_list_set.c
+++ b/net/netfilter/ipset/ip_set_list_set.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2008-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+/* Copyright (C) 2008-2013 Jozsef Kadlecsik <kadlec@netfilter.org>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
@@ -22,7 +22,7 @@
 #define IPSET_TYPE_REV_MAX	3 /* skbinfo support added */
 
 MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
+MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>");
 IP_SET_MODULE_DESC("list:set", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX);
 MODULE_ALIAS("ip_set_list:set");
 
diff --git a/net/netfilter/nf_conntrack_h323_main.c b/net/netfilter/nf_conntrack_h323_main.c
index 12de40390e97..1ff66e070cb2 100644
--- a/net/netfilter/nf_conntrack_h323_main.c
+++ b/net/netfilter/nf_conntrack_h323_main.c
@@ -7,7 +7,7 @@
  * This source code is licensed under General Public License version 2.
  *
  * Based on the 'brute force' H.323 connection tracking module by
- * Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+ * Jozsef Kadlecsik <kadlec@netfilter.org>
  *
  * For more information, please see http://nath323.sourceforge.net/
  */
diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c
index 7ba01d8ee165..60b68400435d 100644
--- a/net/netfilter/nf_conntrack_proto_tcp.c
+++ b/net/netfilter/nf_conntrack_proto_tcp.c
@@ -1,6 +1,6 @@
 /* (C) 1999-2001 Paul `Rusty' Russell
  * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
- * (C) 2002-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+ * (C) 2002-2013 Jozsef Kadlecsik <kadlec@netfilter.org>
  * (C) 2006-2012 Patrick McHardy <kaber@trash.net>
  *
  * This program is free software; you can redistribute it and/or modify
diff --git a/net/netfilter/xt_iprange.c b/net/netfilter/xt_iprange.c
index b46626cddd93..4ab4155706d7 100644
--- a/net/netfilter/xt_iprange.c
+++ b/net/netfilter/xt_iprange.c
@@ -1,7 +1,7 @@
 /*
  *	xt_iprange - Netfilter module to match IP address ranges
  *
- *	(C) 2003 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+ *	(C) 2003 Jozsef Kadlecsik <kadlec@netfilter.org>
  *	(C) CC Computer Consultants GmbH, 2008
  *
  *	This program is free software; you can redistribute it and/or modify
@@ -133,7 +133,7 @@ static void __exit iprange_mt_exit(void)
 module_init(iprange_mt_init);
 module_exit(iprange_mt_exit);
 MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
+MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>");
 MODULE_AUTHOR("Jan Engelhardt <jengelh@medozas.de>");
 MODULE_DESCRIPTION("Xtables: arbitrary IPv4 range matching");
 MODULE_ALIAS("ipt_iprange");
diff --git a/net/netfilter/xt_set.c b/net/netfilter/xt_set.c
index cf67bbe07dc2..f025c51ba375 100644
--- a/net/netfilter/xt_set.c
+++ b/net/netfilter/xt_set.c
@@ -1,7 +1,7 @@
 /* Copyright (C) 2000-2002 Joakim Axelsson <gozem@linux.nu>
  *                         Patrick Schaaf <bof@bof.de>
  *                         Martin Josefsson <gandalf@wlug.westbo.se>
- * Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+ * Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@netfilter.org>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
@@ -21,7 +21,7 @@
 #include <uapi/linux/netfilter/xt_set.h>
 
 MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
+MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>");
 MODULE_DESCRIPTION("Xtables: IP set match and target module");
 MODULE_ALIAS("xt_SET");
 MODULE_ALIAS("ipt_set");
-- 
cgit v1.2.3


From 4e23be473e3063a9d3bc06bb0aee89885fffab0e Mon Sep 17 00:00:00 2001
From: Tony Lindgren <tony@atomide.com>
Date: Mon, 10 Jun 2019 04:48:05 -0700
Subject: bus: ti-sysc: Add support for module specific reset quirks

Some older interconnect target modules need module internal clock
toggling quirks to reset properly. We've been doing this in the
platform code earlier, but need to be able to do it directly in the
ti-sysc driver when we no longer rely on the platform code.

Let's add reset handling for 1-wire, i2c and watchdog. Later on
we can add more modules like msdi and dss as they get tested.
For dra7 pcie, we should be able to just use the rstctrl reset
driver when available.

Signed-off-by: Tony Lindgren <tony@atomide.com>
---
 drivers/bus/ti-sysc.c                 | 129 ++++++++++++++++++++++++++++++++--
 include/linux/platform_data/ti-sysc.h |   3 +
 2 files changed, 127 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/bus/ti-sysc.c b/drivers/bus/ti-sysc.c
index a366ae548ec9..e6deabd8305d 100644
--- a/drivers/bus/ti-sysc.c
+++ b/drivers/bus/ti-sysc.c
@@ -71,6 +71,9 @@ static const char * const clock_names[SYSC_MAX_CLOCKS] = {
  * @name: name if available
  * @revision: interconnect target module revision
  * @needs_resume: runtime resume needed on resume from suspend
+ * @clk_enable_quirk: module specific clock enable quirk
+ * @clk_disable_quirk: module specific clock disable quirk
+ * @reset_done_quirk: module specific reset done quirk
  */
 struct sysc {
 	struct device *dev;
@@ -94,6 +97,9 @@ struct sysc {
 	unsigned int child_needs_resume:1;
 	unsigned int disable_on_idle:1;
 	struct delayed_work idle_work;
+	void (*clk_enable_quirk)(struct sysc *sysc);
+	void (*clk_disable_quirk)(struct sysc *sysc);
+	void (*reset_done_quirk)(struct sysc *sysc);
 };
 
 static void sysc_parse_dts_quirks(struct sysc *ddata, struct device_node *np,
@@ -760,8 +766,11 @@ static int sysc_ioremap(struct sysc *ddata)
 			    ddata->offsets[SYSC_SYSCONFIG],
 			    ddata->offsets[SYSC_SYSSTATUS]);
 
+		if (size < SZ_1K)
+			size = SZ_1K;
+
 		if ((size + sizeof(u32)) > ddata->module_size)
-			return -EINVAL;
+			size = ddata->module_size;
 	}
 
 	ddata->module_va = devm_ioremap(ddata->dev,
@@ -1234,6 +1243,22 @@ static const struct sysc_revision_quirk sysc_revision_quirks[] = {
 		   SYSC_QUIRK_EXT_OPT_CLOCK | SYSC_QUIRK_NO_RESET_ON_INIT |
 		   SYSC_QUIRK_SWSUP_SIDLE),
 
+	/* Quirks that need to be set based on detected module */
+	SYSC_QUIRK("hdq1w", 0, 0, 0x14, 0x18, 0x00000006, 0xffffffff,
+		   SYSC_MODULE_QUIRK_HDQ1W),
+	SYSC_QUIRK("hdq1w", 0, 0, 0x14, 0x18, 0x0000000a, 0xffffffff,
+		   SYSC_MODULE_QUIRK_HDQ1W),
+	SYSC_QUIRK("i2c", 0, 0, 0x20, 0x10, 0x00000036, 0x000000ff,
+		   SYSC_MODULE_QUIRK_I2C),
+	SYSC_QUIRK("i2c", 0, 0, 0x20, 0x10, 0x0000003c, 0x000000ff,
+		   SYSC_MODULE_QUIRK_I2C),
+	SYSC_QUIRK("i2c", 0, 0, 0x20, 0x10, 0x00000040, 0x000000ff,
+		   SYSC_MODULE_QUIRK_I2C),
+	SYSC_QUIRK("i2c", 0, 0, 0x10, 0x90, 0x5040000a, 0xfffff0f0,
+		   SYSC_MODULE_QUIRK_I2C),
+	SYSC_QUIRK("wdt", 0, 0, 0x10, 0x14, 0x502a0500, 0xfffff0f0,
+		   SYSC_MODULE_QUIRK_WDT),
+
 #ifdef DEBUG
 	SYSC_QUIRK("adc", 0, 0, 0x10, -1, 0x47300001, 0xffffffff, 0),
 	SYSC_QUIRK("atl", 0, 0, -1, -1, 0x0a070100, 0xffffffff, 0),
@@ -1247,11 +1272,8 @@ static const struct sysc_revision_quirk sysc_revision_quirks[] = {
 	SYSC_QUIRK("dwc3", 0, 0, 0x10, -1, 0x500a0200, 0xffffffff, 0),
 	SYSC_QUIRK("epwmss", 0, 0, 0x4, -1, 0x47400001, 0xffffffff, 0),
 	SYSC_QUIRK("gpu", 0, 0x1fc00, 0x1fc10, -1, 0, 0, 0),
-	SYSC_QUIRK("hdq1w", 0, 0, 0x14, 0x18, 0x00000006, 0xffffffff, 0),
-	SYSC_QUIRK("hdq1w", 0, 0, 0x14, 0x18, 0x0000000a, 0xffffffff, 0),
 	SYSC_QUIRK("hsi", 0, 0, 0x10, 0x14, 0x50043101, 0xffffffff, 0),
 	SYSC_QUIRK("iss", 0, 0, 0x10, -1, 0x40000101, 0xffffffff, 0),
-	SYSC_QUIRK("i2c", 0, 0, 0x10, 0x90, 0x5040000a, 0xfffff0f0, 0),
 	SYSC_QUIRK("lcdc", 0, 0, 0x54, -1, 0x4f201000, 0xffffffff, 0),
 	SYSC_QUIRK("mcasp", 0, 0, 0x4, -1, 0x44306302, 0xffffffff, 0),
 	SYSC_QUIRK("mcasp", 0, 0, 0x4, -1, 0x44307b02, 0xffffffff, 0),
@@ -1287,7 +1309,6 @@ static const struct sysc_revision_quirk sysc_revision_quirks[] = {
 	SYSC_QUIRK("usb_host_hs", 0, 0, 0x10, -1, 0x50700101, 0xffffffff, 0),
 	SYSC_QUIRK("usb_otg_hs", 0, 0x400, 0x404, 0x408, 0x00000050,
 		   0xffffffff, 0),
-	SYSC_QUIRK("wdt", 0, 0, 0x10, 0x14, 0x502a0500, 0xfffff0f0, 0),
 	SYSC_QUIRK("vfpe", 0, 0, 0x104, -1, 0x4d001200, 0xffffffff, 0),
 #endif
 };
@@ -1360,6 +1381,94 @@ static void sysc_init_revision_quirks(struct sysc *ddata)
 	}
 }
 
+/* 1-wire needs module's internal clocks enabled for reset */
+static void sysc_clk_enable_quirk_hdq1w(struct sysc *ddata)
+{
+	int offset = 0x0c;	/* HDQ_CTRL_STATUS */
+	u16 val;
+
+	val = sysc_read(ddata, offset);
+	val |= BIT(5);
+	sysc_write(ddata, offset, val);
+}
+
+/* I2C needs extra enable bit toggling for reset */
+static void sysc_clk_quirk_i2c(struct sysc *ddata, bool enable)
+{
+	int offset;
+	u16 val;
+
+	/* I2C_CON, omap2/3 is different from omap4 and later */
+	if ((ddata->revision & 0xffffff00) == 0x001f0000)
+		offset = 0x24;
+	else
+		offset = 0xa4;
+
+	/* I2C_EN */
+	val = sysc_read(ddata, offset);
+	if (enable)
+		val |= BIT(15);
+	else
+		val &= ~BIT(15);
+	sysc_write(ddata, offset, val);
+}
+
+static void sysc_clk_enable_quirk_i2c(struct sysc *ddata)
+{
+	sysc_clk_quirk_i2c(ddata, true);
+}
+
+static void sysc_clk_disable_quirk_i2c(struct sysc *ddata)
+{
+	sysc_clk_quirk_i2c(ddata, false);
+}
+
+/* Watchdog timer needs a disable sequence after reset */
+static void sysc_reset_done_quirk_wdt(struct sysc *ddata)
+{
+	int wps, spr, error;
+	u32 val;
+
+	wps = 0x34;
+	spr = 0x48;
+
+	sysc_write(ddata, spr, 0xaaaa);
+	error = readl_poll_timeout(ddata->module_va + wps, val,
+				   !(val & 0x10), 100,
+				   MAX_MODULE_SOFTRESET_WAIT);
+	if (error)
+		dev_warn(ddata->dev, "wdt disable spr failed\n");
+
+	sysc_write(ddata, wps, 0x5555);
+	error = readl_poll_timeout(ddata->module_va + wps, val,
+				   !(val & 0x10), 100,
+				   MAX_MODULE_SOFTRESET_WAIT);
+	if (error)
+		dev_warn(ddata->dev, "wdt disable wps failed\n");
+}
+
+static void sysc_init_module_quirks(struct sysc *ddata)
+{
+	if (ddata->legacy_mode || !ddata->name)
+		return;
+
+	if (ddata->cfg.quirks & SYSC_MODULE_QUIRK_HDQ1W) {
+		ddata->clk_enable_quirk = sysc_clk_enable_quirk_hdq1w;
+
+		return;
+	}
+
+	if (ddata->cfg.quirks & SYSC_MODULE_QUIRK_I2C) {
+		ddata->clk_enable_quirk = sysc_clk_enable_quirk_i2c;
+		ddata->clk_disable_quirk = sysc_clk_disable_quirk_i2c;
+
+		return;
+	}
+
+	if (ddata->cfg.quirks & SYSC_MODULE_QUIRK_WDT)
+		ddata->reset_done_quirk = sysc_reset_done_quirk_wdt;
+}
+
 static int sysc_clockdomain_init(struct sysc *ddata)
 {
 	struct ti_sysc_platform_data *pdata = dev_get_platdata(ddata->dev);
@@ -1468,10 +1577,16 @@ static int sysc_reset(struct sysc *ddata)
 	else
 		syss_done = ddata->cfg.syss_mask;
 
+	if (ddata->clk_disable_quirk)
+		ddata->clk_disable_quirk(ddata);
+
 	sysc_val = sysc_read_sysconfig(ddata);
 	sysc_val |= sysc_mask;
 	sysc_write(ddata, sysc_offset, sysc_val);
 
+	if (ddata->clk_enable_quirk)
+		ddata->clk_enable_quirk(ddata);
+
 	/* Poll on reset status */
 	if (syss_offset >= 0) {
 		error = readx_poll_timeout(sysc_read_sysstatus, ddata, rstval,
@@ -1485,6 +1600,9 @@ static int sysc_reset(struct sysc *ddata)
 					   100, MAX_MODULE_SOFTRESET_WAIT);
 	}
 
+	if (ddata->reset_done_quirk)
+		ddata->reset_done_quirk(ddata);
+
 	return error;
 }
 
@@ -1531,6 +1649,7 @@ static int sysc_init_module(struct sysc *ddata)
 
 	ddata->revision = sysc_read_revision(ddata);
 	sysc_init_revision_quirks(ddata);
+	sysc_init_module_quirks(ddata);
 
 	if (ddata->legacy_mode) {
 		error = sysc_legacy_init(ddata);
diff --git a/include/linux/platform_data/ti-sysc.h b/include/linux/platform_data/ti-sysc.h
index 8822e99ff813..0c587d4fc718 100644
--- a/include/linux/platform_data/ti-sysc.h
+++ b/include/linux/platform_data/ti-sysc.h
@@ -47,6 +47,9 @@ struct sysc_regbits {
 	s8 emufree_shift;
 };
 
+#define SYSC_MODULE_QUIRK_HDQ1W		BIT(17)
+#define SYSC_MODULE_QUIRK_I2C		BIT(16)
+#define SYSC_MODULE_QUIRK_WDT		BIT(15)
 #define SYSS_QUIRK_RESETDONE_INVERTED	BIT(14)
 #define SYSC_QUIRK_SWSUP_MSTANDBY	BIT(13)
 #define SYSC_QUIRK_SWSUP_SIDLE_ACT	BIT(12)
-- 
cgit v1.2.3


From e36acfe6c86d13eec62321e1e86a1ce287e52e7d Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg@mellanox.com>
Date: Thu, 23 May 2019 09:41:19 -0300
Subject: mm/hmm: Use hmm_mirror not mm as an argument for hmm_range_register

Ralph observes that hmm_range_register() can only be called by a driver
while a mirror is registered. Make this clear in the API by passing in the
mirror structure as a parameter.

This also simplifies understanding the lifetime model for struct hmm, as
the hmm pointer must be valid as part of a registered mirror so all we
need in hmm_range_register() is a simple kref_get.

Suggested-by: Ralph Campbell <rcampbell@nvidia.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
Reviewed-by: John Hubbard <jhubbard@nvidia.com>
Reviewed-by: Ralph Campbell <rcampbell@nvidia.com>
Reviewed-by: Ira Weiny <ira.weiny@intel.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Tested-by: Philip Yang <Philip.Yang@amd.com>
---
 drivers/gpu/drm/nouveau/nouveau_svm.c |  2 +-
 include/linux/hmm.h                   |  7 ++++---
 mm/hmm.c                              | 13 ++++---------
 3 files changed, 9 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/gpu/drm/nouveau/nouveau_svm.c b/drivers/gpu/drm/nouveau/nouveau_svm.c
index 93ed43c413f0..8c92374afcf2 100644
--- a/drivers/gpu/drm/nouveau/nouveau_svm.c
+++ b/drivers/gpu/drm/nouveau/nouveau_svm.c
@@ -649,7 +649,7 @@ nouveau_svm_fault(struct nvif_notify *notify)
 		range.values = nouveau_svm_pfn_values;
 		range.pfn_shift = NVIF_VMM_PFNMAP_V0_ADDR_SHIFT;
 again:
-		ret = hmm_vma_fault(&range, true);
+		ret = hmm_vma_fault(&svmm->mirror, &range, true);
 		if (ret == 0) {
 			mutex_lock(&svmm->mutex);
 			if (!hmm_vma_range_done(&range)) {
diff --git a/include/linux/hmm.h b/include/linux/hmm.h
index cb01cf1fa3c0..1fba6979adf4 100644
--- a/include/linux/hmm.h
+++ b/include/linux/hmm.h
@@ -496,7 +496,7 @@ static inline bool hmm_mirror_mm_is_alive(struct hmm_mirror *mirror)
  * Please see Documentation/vm/hmm.rst for how to use the range API.
  */
 int hmm_range_register(struct hmm_range *range,
-		       struct mm_struct *mm,
+		       struct hmm_mirror *mirror,
 		       unsigned long start,
 		       unsigned long end,
 		       unsigned page_shift);
@@ -532,7 +532,8 @@ static inline bool hmm_vma_range_done(struct hmm_range *range)
 }
 
 /* This is a temporary helper to avoid merge conflict between trees. */
-static inline int hmm_vma_fault(struct hmm_range *range, bool block)
+static inline int hmm_vma_fault(struct hmm_mirror *mirror,
+				struct hmm_range *range, bool block)
 {
 	long ret;
 
@@ -545,7 +546,7 @@ static inline int hmm_vma_fault(struct hmm_range *range, bool block)
 	range->default_flags = 0;
 	range->pfn_flags_mask = -1UL;
 
-	ret = hmm_range_register(range, range->vma->vm_mm,
+	ret = hmm_range_register(range, mirror,
 				 range->start, range->end,
 				 PAGE_SHIFT);
 	if (ret)
diff --git a/mm/hmm.c b/mm/hmm.c
index f6956d78e3cb..22a97ada108b 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -914,13 +914,13 @@ static void hmm_pfns_clear(struct hmm_range *range,
  * Track updates to the CPU page table see include/linux/hmm.h
  */
 int hmm_range_register(struct hmm_range *range,
-		       struct mm_struct *mm,
+		       struct hmm_mirror *mirror,
 		       unsigned long start,
 		       unsigned long end,
 		       unsigned page_shift)
 {
 	unsigned long mask = ((1UL << page_shift) - 1UL);
-	struct hmm *hmm;
+	struct hmm *hmm = mirror->hmm;
 
 	range->valid = false;
 	range->hmm = NULL;
@@ -934,20 +934,15 @@ int hmm_range_register(struct hmm_range *range,
 	range->start = start;
 	range->end = end;
 
-	hmm = hmm_get_or_create(mm);
-	if (!hmm)
-		return -EFAULT;
-
 	/* Check if hmm_mm_destroy() was call. */
-	if (hmm->mm == NULL || hmm->dead) {
-		hmm_put(hmm);
+	if (hmm->mm == NULL || hmm->dead)
 		return -EFAULT;
-	}
 
 	/* Initialize range to track CPU page table updates. */
 	mutex_lock(&hmm->lock);
 
 	range->hmm = hmm;
+	kref_get(&hmm->kref);
 	list_add_rcu(&range->list, &hmm->ranges);
 
 	/*
-- 
cgit v1.2.3


From c8a53b2db0aec40d8b217936e1b7f3d840c50390 Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg@mellanox.com>
Date: Thu, 23 May 2019 10:36:46 -0300
Subject: mm/hmm: Hold a mmgrab from hmm to mm
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

So long as a struct hmm pointer exists, so should the struct mm it is
linked to. Hold the mmgrab() as soon as a hmm is created, and mmdrop() it
once the hmm refcount goes to zero.

Since mmdrop() (ie a 0 kref on struct mm) is now impossible with a !NULL
mm->hmm, delete hmm_mm_destroy().

Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
Reviewed-by: Jérôme Glisse <jglisse@redhat.com>
Reviewed-by: John Hubbard <jhubbard@nvidia.com>
Reviewed-by: Ralph Campbell <rcampbell@nvidia.com>
Reviewed-by: Ira Weiny <ira.weiny@intel.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Tested-by: Philip Yang <Philip.Yang@amd.com>
---
 include/linux/hmm.h |  3 ---
 kernel/fork.c       |  1 -
 mm/hmm.c            | 22 ++++------------------
 3 files changed, 4 insertions(+), 22 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/hmm.h b/include/linux/hmm.h
index 1fba6979adf4..1d97b6d62c5b 100644
--- a/include/linux/hmm.h
+++ b/include/linux/hmm.h
@@ -577,14 +577,11 @@ static inline int hmm_vma_fault(struct hmm_mirror *mirror,
 }
 
 /* Below are for HMM internal use only! Not to be used by device driver! */
-void hmm_mm_destroy(struct mm_struct *mm);
-
 static inline void hmm_mm_init(struct mm_struct *mm)
 {
 	mm->hmm = NULL;
 }
 #else /* IS_ENABLED(CONFIG_HMM_MIRROR) */
-static inline void hmm_mm_destroy(struct mm_struct *mm) {}
 static inline void hmm_mm_init(struct mm_struct *mm) {}
 #endif /* IS_ENABLED(CONFIG_HMM_MIRROR) */
 
diff --git a/kernel/fork.c b/kernel/fork.c
index 75675b9bf6df..c704c3cedee7 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -673,7 +673,6 @@ void __mmdrop(struct mm_struct *mm)
 	WARN_ON_ONCE(mm == current->active_mm);
 	mm_free_pgd(mm);
 	destroy_context(mm);
-	hmm_mm_destroy(mm);
 	mmu_notifier_mm_destroy(mm);
 	check_mm(mm);
 	put_user_ns(mm->user_ns);
diff --git a/mm/hmm.c b/mm/hmm.c
index 22a97ada108b..080b17a2e87e 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -20,6 +20,7 @@
 #include <linux/swapops.h>
 #include <linux/hugetlb.h>
 #include <linux/memremap.h>
+#include <linux/sched/mm.h>
 #include <linux/jump_label.h>
 #include <linux/dma-mapping.h>
 #include <linux/mmu_notifier.h>
@@ -73,6 +74,7 @@ static struct hmm *hmm_get_or_create(struct mm_struct *mm)
 	hmm->notifiers = 0;
 	hmm->dead = false;
 	hmm->mm = mm;
+	mmgrab(hmm->mm);
 
 	spin_lock(&mm->page_table_lock);
 	if (!mm->hmm)
@@ -100,6 +102,7 @@ error_mm:
 		mm->hmm = NULL;
 	spin_unlock(&mm->page_table_lock);
 error:
+	mmdrop(hmm->mm);
 	kfree(hmm);
 	return NULL;
 }
@@ -121,6 +124,7 @@ static void hmm_free(struct kref *kref)
 		mm->hmm = NULL;
 	spin_unlock(&mm->page_table_lock);
 
+	mmdrop(hmm->mm);
 	mmu_notifier_call_srcu(&hmm->rcu, hmm_free_rcu);
 }
 
@@ -129,24 +133,6 @@ static inline void hmm_put(struct hmm *hmm)
 	kref_put(&hmm->kref, hmm_free);
 }
 
-void hmm_mm_destroy(struct mm_struct *mm)
-{
-	struct hmm *hmm;
-
-	spin_lock(&mm->page_table_lock);
-	hmm = mm_get_hmm(mm);
-	mm->hmm = NULL;
-	if (hmm) {
-		hmm->mm = NULL;
-		hmm->dead = true;
-		spin_unlock(&mm->page_table_lock);
-		hmm_put(hmm);
-		return;
-	}
-
-	spin_unlock(&mm->page_table_lock);
-}
-
 static void hmm_release(struct mmu_notifier *mn, struct mm_struct *mm)
 {
 	struct hmm *hmm = container_of(mn, struct hmm, mmu_notifier);
-- 
cgit v1.2.3


From 7fae8a9ced742f364604a88a53084f471cc9c6e5 Mon Sep 17 00:00:00 2001
From: Linus Walleij <linus.walleij@linaro.org>
Date: Sun, 9 Jun 2019 01:11:38 +0200
Subject: fmc: Decouple from Linux GPIO subsystem

FMC has its own GPIO handling, the inclusion of <linux/gpio.h>
is only to reuse some flags that we can just as well provide
using local defines.

Cc: Federico Vaga <federico.vaga@cern.ch>
Cc: Pat Riehecky <riehecky@fnal.gov>
Acked-by: Alessandro Rubini <rubini@gnudd.com>
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
---
 drivers/fmc/fmc-trivial.c |  1 -
 include/linux/fmc.h       | 18 ++++++++++--------
 2 files changed, 10 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/fmc/fmc-trivial.c b/drivers/fmc/fmc-trivial.c
index 8defdee3e3a3..b99dbc7ee203 100644
--- a/drivers/fmc/fmc-trivial.c
+++ b/drivers/fmc/fmc-trivial.c
@@ -15,7 +15,6 @@
 #include <linux/module.h>
 #include <linux/init.h>
 #include <linux/interrupt.h>
-#include <linux/gpio.h>
 #include <linux/fmc.h>
 
 static struct fmc_driver t_drv; /* initialized later */
diff --git a/include/linux/fmc.h b/include/linux/fmc.h
index b355f3806f3f..8661a46a676f 100644
--- a/include/linux/fmc.h
+++ b/include/linux/fmc.h
@@ -100,7 +100,7 @@ struct fmc_gpio {
 	char *carrier_name; /* name or NULL for virtual pins */
 	int gpio;
 	int _gpio;	/* internal use by the carrier */
-	int mode;	/* GPIOF_DIR_OUT etc, from <linux/gpio.h> */
+	int mode;	/* GPIOF_DIR_OUT etc */
 	int irqmode;	/* IRQF_TRIGGER_LOW and so on */
 };
 
@@ -114,13 +114,15 @@ struct fmc_gpio {
 #define FMC_GPIO_USER(x)	((x) + 0x1400)	/*  256 of them */
 /* We may add SCL and SDA, or other roles if the need arises */
 
-/* GPIOF_DIR_IN etc are missing before 3.0. copy from <linux/gpio.h> */
-#ifndef GPIOF_DIR_IN
-#  define GPIOF_DIR_OUT   (0 << 0)
-#  define GPIOF_DIR_IN    (1 << 0)
-#  define GPIOF_INIT_LOW  (0 << 1)
-#  define GPIOF_INIT_HIGH (1 << 1)
-#endif
+/*
+ * These are similar to the legacy Linux GPIO defines from <linux/gpio.h>
+ * but in fact FMC has its own GPIO handling and is not using the Linux
+ * GPIO subsystem.
+ */
+#define GPIOF_DIR_OUT   (0 << 0)
+#define GPIOF_DIR_IN    (1 << 0)
+#define GPIOF_INIT_LOW  (0 << 1)
+#define GPIOF_INIT_HIGH (1 << 1)
 
 /*
  * The operations are offered by each carrier and should make driver
-- 
cgit v1.2.3


From fada7fdc83c0bf8755956bff707c42b609223301 Mon Sep 17 00:00:00 2001
From: Jonathan Lemon <jonathan.lemon@gmail.com>
Date: Thu, 6 Jun 2019 13:59:40 -0700
Subject: bpf: Allow bpf_map_lookup_elem() on an xskmap

Currently, the AF_XDP code uses a separate map in order to
determine if an xsk is bound to a queue.  Instead of doing this,
have bpf_map_lookup_elem() return a xdp_sock.

Rearrange some xdp_sock members to eliminate structure holes.

Remove selftest - will be added back in later patch.

Signed-off-by: Jonathan Lemon <jonathan.lemon@gmail.com>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h                                |  8 +++++
 include/net/xdp_sock.h                             |  4 +--
 include/uapi/linux/bpf.h                           |  4 +++
 kernel/bpf/verifier.c                              | 26 ++++++++++++--
 kernel/bpf/xskmap.c                                |  7 ++++
 net/core/filter.c                                  | 40 ++++++++++++++++++++++
 .../selftests/bpf/verifier/prevent_map_lookup.c    | 15 --------
 7 files changed, 85 insertions(+), 19 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index e5a309e6a400..1fe137afa898 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -280,6 +280,7 @@ enum bpf_reg_type {
 	PTR_TO_TCP_SOCK,	 /* reg points to struct tcp_sock */
 	PTR_TO_TCP_SOCK_OR_NULL, /* reg points to struct tcp_sock or NULL */
 	PTR_TO_TP_BUFFER,	 /* reg points to a writable raw tp's buffer */
+	PTR_TO_XDP_SOCK,	 /* reg points to struct xdp_sock */
 };
 
 /* The information passed from prog-specific *_is_valid_access
@@ -727,6 +728,13 @@ void __cpu_map_insert_ctx(struct bpf_map *map, u32 index);
 void __cpu_map_flush(struct bpf_map *map);
 int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp,
 		    struct net_device *dev_rx);
+bool bpf_xdp_sock_is_valid_access(int off, int size, enum bpf_access_type type,
+				  struct bpf_insn_access_aux *info);
+u32 bpf_xdp_sock_convert_ctx_access(enum bpf_access_type type,
+				    const struct bpf_insn *si,
+				    struct bpf_insn *insn_buf,
+				    struct bpf_prog *prog,
+				    u32 *target_size);
 
 /* Return map's numa specified by userspace */
 static inline int bpf_map_attr_numa_node(const union bpf_attr *attr)
diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h
index d074b6d60f8a..ae0f368a62bb 100644
--- a/include/net/xdp_sock.h
+++ b/include/net/xdp_sock.h
@@ -58,11 +58,11 @@ struct xdp_sock {
 	struct xdp_umem *umem;
 	struct list_head flush_node;
 	u16 queue_id;
-	struct xsk_queue *tx ____cacheline_aligned_in_smp;
-	struct list_head list;
 	bool zc;
 	/* Protects multiple processes in the control path */
 	struct mutex mutex;
+	struct xsk_queue *tx ____cacheline_aligned_in_smp;
+	struct list_head list;
 	/* Mutual exclusion of NAPI TX thread and sendmsg error paths
 	 * in the SKB destructor callback.
 	 */
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 7c6aef253173..ae0907d8c03a 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -3083,6 +3083,10 @@ struct bpf_sock_tuple {
 	};
 };
 
+struct bpf_xdp_sock {
+	__u32 queue_id;
+};
+
 #define XDP_PACKET_HEADROOM 256
 
 /* User return codes for XDP prog type.
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 5c2cb5bd84ce..8d1786357a09 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -334,7 +334,8 @@ static bool type_is_sk_pointer(enum bpf_reg_type type)
 {
 	return type == PTR_TO_SOCKET ||
 		type == PTR_TO_SOCK_COMMON ||
-		type == PTR_TO_TCP_SOCK;
+		type == PTR_TO_TCP_SOCK ||
+		type == PTR_TO_XDP_SOCK;
 }
 
 static bool reg_type_may_be_null(enum bpf_reg_type type)
@@ -406,6 +407,7 @@ static const char * const reg_type_str[] = {
 	[PTR_TO_TCP_SOCK]	= "tcp_sock",
 	[PTR_TO_TCP_SOCK_OR_NULL] = "tcp_sock_or_null",
 	[PTR_TO_TP_BUFFER]	= "tp_buffer",
+	[PTR_TO_XDP_SOCK]	= "xdp_sock",
 };
 
 static char slot_type_char[] = {
@@ -1363,6 +1365,7 @@ static bool is_spillable_regtype(enum bpf_reg_type type)
 	case PTR_TO_SOCK_COMMON_OR_NULL:
 	case PTR_TO_TCP_SOCK:
 	case PTR_TO_TCP_SOCK_OR_NULL:
+	case PTR_TO_XDP_SOCK:
 		return true;
 	default:
 		return false;
@@ -1843,6 +1846,9 @@ static int check_sock_access(struct bpf_verifier_env *env, int insn_idx,
 	case PTR_TO_TCP_SOCK:
 		valid = bpf_tcp_sock_is_valid_access(off, size, t, &info);
 		break;
+	case PTR_TO_XDP_SOCK:
+		valid = bpf_xdp_sock_is_valid_access(off, size, t, &info);
+		break;
 	default:
 		valid = false;
 	}
@@ -2007,6 +2013,9 @@ static int check_ptr_alignment(struct bpf_verifier_env *env,
 	case PTR_TO_TCP_SOCK:
 		pointer_desc = "tcp_sock ";
 		break;
+	case PTR_TO_XDP_SOCK:
+		pointer_desc = "xdp_sock ";
+		break;
 	default:
 		break;
 	}
@@ -2905,10 +2914,14 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
 	 * appear.
 	 */
 	case BPF_MAP_TYPE_CPUMAP:
-	case BPF_MAP_TYPE_XSKMAP:
 		if (func_id != BPF_FUNC_redirect_map)
 			goto error;
 		break;
+	case BPF_MAP_TYPE_XSKMAP:
+		if (func_id != BPF_FUNC_redirect_map &&
+		    func_id != BPF_FUNC_map_lookup_elem)
+			goto error;
+		break;
 	case BPF_MAP_TYPE_ARRAY_OF_MAPS:
 	case BPF_MAP_TYPE_HASH_OF_MAPS:
 		if (func_id != BPF_FUNC_map_lookup_elem)
@@ -3799,6 +3812,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
 	case PTR_TO_SOCK_COMMON_OR_NULL:
 	case PTR_TO_TCP_SOCK:
 	case PTR_TO_TCP_SOCK_OR_NULL:
+	case PTR_TO_XDP_SOCK:
 		verbose(env, "R%d pointer arithmetic on %s prohibited\n",
 			dst, reg_type_str[ptr_reg->type]);
 		return -EACCES;
@@ -5038,6 +5052,9 @@ static void mark_ptr_or_null_reg(struct bpf_func_state *state,
 			if (reg->map_ptr->inner_map_meta) {
 				reg->type = CONST_PTR_TO_MAP;
 				reg->map_ptr = reg->map_ptr->inner_map_meta;
+			} else if (reg->map_ptr->map_type ==
+				   BPF_MAP_TYPE_XSKMAP) {
+				reg->type = PTR_TO_XDP_SOCK;
 			} else {
 				reg->type = PTR_TO_MAP_VALUE;
 			}
@@ -6299,6 +6316,7 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur,
 	case PTR_TO_SOCK_COMMON_OR_NULL:
 	case PTR_TO_TCP_SOCK:
 	case PTR_TO_TCP_SOCK_OR_NULL:
+	case PTR_TO_XDP_SOCK:
 		/* Only valid matches are exact, which memcmp() above
 		 * would have accepted
 		 */
@@ -6693,6 +6711,7 @@ static bool reg_type_mismatch_ok(enum bpf_reg_type type)
 	case PTR_TO_SOCK_COMMON_OR_NULL:
 	case PTR_TO_TCP_SOCK:
 	case PTR_TO_TCP_SOCK_OR_NULL:
+	case PTR_TO_XDP_SOCK:
 		return false;
 	default:
 		return true;
@@ -7826,6 +7845,9 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
 		case PTR_TO_TCP_SOCK:
 			convert_ctx_access = bpf_tcp_sock_convert_ctx_access;
 			break;
+		case PTR_TO_XDP_SOCK:
+			convert_ctx_access = bpf_xdp_sock_convert_ctx_access;
+			break;
 		default:
 			continue;
 		}
diff --git a/kernel/bpf/xskmap.c b/kernel/bpf/xskmap.c
index 413d75f4fc72..ef7338cebd18 100644
--- a/kernel/bpf/xskmap.c
+++ b/kernel/bpf/xskmap.c
@@ -151,6 +151,12 @@ void __xsk_map_flush(struct bpf_map *map)
 }
 
 static void *xsk_map_lookup_elem(struct bpf_map *map, void *key)
+{
+	WARN_ON_ONCE(!rcu_read_lock_held());
+	return __xsk_map_lookup_elem(map, *(u32 *)key);
+}
+
+static void *xsk_map_lookup_elem_sys_only(struct bpf_map *map, void *key)
 {
 	return ERR_PTR(-EOPNOTSUPP);
 }
@@ -218,6 +224,7 @@ const struct bpf_map_ops xsk_map_ops = {
 	.map_free = xsk_map_free,
 	.map_get_next_key = xsk_map_get_next_key,
 	.map_lookup_elem = xsk_map_lookup_elem,
+	.map_lookup_elem_sys_only = xsk_map_lookup_elem_sys_only,
 	.map_update_elem = xsk_map_update_elem,
 	.map_delete_elem = xsk_map_delete_elem,
 	.map_check_btf = map_check_no_btf,
diff --git a/net/core/filter.c b/net/core/filter.c
index f2777dc0b624..a5e4ac7fcbe5 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -5680,6 +5680,46 @@ BPF_CALL_1(bpf_skb_ecn_set_ce, struct sk_buff *, skb)
 	return INET_ECN_set_ce(skb);
 }
 
+bool bpf_xdp_sock_is_valid_access(int off, int size, enum bpf_access_type type,
+				  struct bpf_insn_access_aux *info)
+{
+	if (off < 0 || off >= offsetofend(struct bpf_xdp_sock, queue_id))
+		return false;
+
+	if (off % size != 0)
+		return false;
+
+	switch (off) {
+	default:
+		return size == sizeof(__u32);
+	}
+}
+
+u32 bpf_xdp_sock_convert_ctx_access(enum bpf_access_type type,
+				    const struct bpf_insn *si,
+				    struct bpf_insn *insn_buf,
+				    struct bpf_prog *prog, u32 *target_size)
+{
+	struct bpf_insn *insn = insn_buf;
+
+#define BPF_XDP_SOCK_GET(FIELD)						\
+	do {								\
+		BUILD_BUG_ON(FIELD_SIZEOF(struct xdp_sock, FIELD) >	\
+			     FIELD_SIZEOF(struct bpf_xdp_sock, FIELD));	\
+		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_sock, FIELD),\
+				      si->dst_reg, si->src_reg,		\
+				      offsetof(struct xdp_sock, FIELD)); \
+	} while (0)
+
+	switch (si->off) {
+	case offsetof(struct bpf_xdp_sock, queue_id):
+		BPF_XDP_SOCK_GET(queue_id);
+		break;
+	}
+
+	return insn - insn_buf;
+}
+
 static const struct bpf_func_proto bpf_skb_ecn_set_ce_proto = {
 	.func           = bpf_skb_ecn_set_ce,
 	.gpl_only       = false,
diff --git a/tools/testing/selftests/bpf/verifier/prevent_map_lookup.c b/tools/testing/selftests/bpf/verifier/prevent_map_lookup.c
index bbdba990fefb..da7a4b37cb98 100644
--- a/tools/testing/selftests/bpf/verifier/prevent_map_lookup.c
+++ b/tools/testing/selftests/bpf/verifier/prevent_map_lookup.c
@@ -28,21 +28,6 @@
 	.errstr = "cannot pass map_type 18 into func bpf_map_lookup_elem",
 	.prog_type = BPF_PROG_TYPE_SOCK_OPS,
 },
-{
-	"prevent map lookup in xskmap",
-	.insns = {
-	BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
-	BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
-	BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
-	BPF_LD_MAP_FD(BPF_REG_1, 0),
-	BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
-	BPF_EXIT_INSN(),
-	},
-	.fixup_map_xskmap = { 3 },
-	.result = REJECT,
-	.errstr = "cannot pass map_type 17 into func bpf_map_lookup_elem",
-	.prog_type = BPF_PROG_TYPE_XDP,
-},
 {
 	"prevent map lookup in stack trace",
 	.insns = {
-- 
cgit v1.2.3


From 90b4c55586155cf13bbafbd4e55327f89681859d Mon Sep 17 00:00:00 2001
From: Zeev Zilberman <zeev@amazon.com>
Date: Mon, 10 Jun 2019 13:52:01 +0300
Subject: irqchip/gic-v2m: Add support for Amazon Graviton variant of
 GICv3+GICv2m

Add support for Amazon Graviton custom variant of GICv2m, where the message
is encoded using the MSI message address, as opposed to standard
GICv2m, where the SPI number is encoded in the MSI message data.

In addition, the Graviton flavor of GICv2m is used alongside GICv3 (and not
GICv2).

Co-developed-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Signed-off-by: Zeev Zilberman <zeev@amazon.com>
Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
---
 drivers/irqchip/irq-gic-v2m.c          | 85 +++++++++++++++++++++++++++-------
 drivers/irqchip/irq-gic-v3.c           |  3 ++
 include/linux/irqchip/arm-gic-common.h |  5 ++
 include/linux/irqchip/arm-gic.h        |  3 --
 4 files changed, 76 insertions(+), 20 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/irqchip/irq-gic-v2m.c b/drivers/irqchip/irq-gic-v2m.c
index 3c77ab676e54..5356739d4799 100644
--- a/drivers/irqchip/irq-gic-v2m.c
+++ b/drivers/irqchip/irq-gic-v2m.c
@@ -56,6 +56,7 @@
 
 /* List of flags for specific v2m implementation */
 #define GICV2M_NEEDS_SPI_OFFSET		0x00000001
+#define GICV2M_GRAVITON_ADDRESS_ONLY	0x00000002
 
 static LIST_HEAD(v2m_nodes);
 static DEFINE_SPINLOCK(v2m_lock);
@@ -98,15 +99,26 @@ static struct msi_domain_info gicv2m_msi_domain_info = {
 	.chip	= &gicv2m_msi_irq_chip,
 };
 
+static phys_addr_t gicv2m_get_msi_addr(struct v2m_data *v2m, int hwirq)
+{
+	if (v2m->flags & GICV2M_GRAVITON_ADDRESS_ONLY)
+		return v2m->res.start | ((hwirq - 32) << 3);
+	else
+		return v2m->res.start + V2M_MSI_SETSPI_NS;
+}
+
 static void gicv2m_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
 {
 	struct v2m_data *v2m = irq_data_get_irq_chip_data(data);
-	phys_addr_t addr = v2m->res.start + V2M_MSI_SETSPI_NS;
+	phys_addr_t addr = gicv2m_get_msi_addr(v2m, data->hwirq);
 
 	msg->address_hi = upper_32_bits(addr);
 	msg->address_lo = lower_32_bits(addr);
-	msg->data = data->hwirq;
 
+	if (v2m->flags & GICV2M_GRAVITON_ADDRESS_ONLY)
+		msg->data = 0;
+	else
+		msg->data = data->hwirq;
 	if (v2m->flags & GICV2M_NEEDS_SPI_OFFSET)
 		msg->data -= v2m->spi_offset;
 
@@ -188,7 +200,7 @@ static int gicv2m_irq_domain_alloc(struct irq_domain *domain, unsigned int virq,
 	hwirq = v2m->spi_start + offset;
 
 	err = iommu_dma_prepare_msi(info->desc,
-				    v2m->res.start + V2M_MSI_SETSPI_NS);
+				    gicv2m_get_msi_addr(v2m, hwirq));
 	if (err)
 		return err;
 
@@ -307,7 +319,7 @@ static int gicv2m_allocate_domains(struct irq_domain *parent)
 
 static int __init gicv2m_init_one(struct fwnode_handle *fwnode,
 				  u32 spi_start, u32 nr_spis,
-				  struct resource *res)
+				  struct resource *res, u32 flags)
 {
 	int ret;
 	struct v2m_data *v2m;
@@ -320,6 +332,7 @@ static int __init gicv2m_init_one(struct fwnode_handle *fwnode,
 
 	INIT_LIST_HEAD(&v2m->entry);
 	v2m->fwnode = fwnode;
+	v2m->flags = flags;
 
 	memcpy(&v2m->res, res, sizeof(struct resource));
 
@@ -334,7 +347,14 @@ static int __init gicv2m_init_one(struct fwnode_handle *fwnode,
 		v2m->spi_start = spi_start;
 		v2m->nr_spis = nr_spis;
 	} else {
-		u32 typer = readl_relaxed(v2m->base + V2M_MSI_TYPER);
+		u32 typer;
+
+		/* Graviton should always have explicit spi_start/nr_spis */
+		if (v2m->flags & GICV2M_GRAVITON_ADDRESS_ONLY) {
+			ret = -EINVAL;
+			goto err_iounmap;
+		}
+		typer = readl_relaxed(v2m->base + V2M_MSI_TYPER);
 
 		v2m->spi_start = V2M_MSI_TYPER_BASE_SPI(typer);
 		v2m->nr_spis = V2M_MSI_TYPER_NUM_SPI(typer);
@@ -355,18 +375,21 @@ static int __init gicv2m_init_one(struct fwnode_handle *fwnode,
 	 *
 	 * Broadom NS2 GICv2m implementation has an erratum where the MSI data
 	 * is 'spi_number - 32'
+	 *
+	 * Reading that register fails on the Graviton implementation
 	 */
-	switch (readl_relaxed(v2m->base + V2M_MSI_IIDR)) {
-	case XGENE_GICV2M_MSI_IIDR:
-		v2m->flags |= GICV2M_NEEDS_SPI_OFFSET;
-		v2m->spi_offset = v2m->spi_start;
-		break;
-	case BCM_NS2_GICV2M_MSI_IIDR:
-		v2m->flags |= GICV2M_NEEDS_SPI_OFFSET;
-		v2m->spi_offset = 32;
-		break;
+	if (!(v2m->flags & GICV2M_GRAVITON_ADDRESS_ONLY)) {
+		switch (readl_relaxed(v2m->base + V2M_MSI_IIDR)) {
+		case XGENE_GICV2M_MSI_IIDR:
+			v2m->flags |= GICV2M_NEEDS_SPI_OFFSET;
+			v2m->spi_offset = v2m->spi_start;
+			break;
+		case BCM_NS2_GICV2M_MSI_IIDR:
+			v2m->flags |= GICV2M_NEEDS_SPI_OFFSET;
+			v2m->spi_offset = 32;
+			break;
+		}
 	}
-
 	v2m->bm = kcalloc(BITS_TO_LONGS(v2m->nr_spis), sizeof(long),
 			  GFP_KERNEL);
 	if (!v2m->bm) {
@@ -419,7 +442,8 @@ static int __init gicv2m_of_init(struct fwnode_handle *parent_handle,
 			pr_info("DT overriding V2M MSI_TYPER (base:%u, num:%u)\n",
 				spi_start, nr_spis);
 
-		ret = gicv2m_init_one(&child->fwnode, spi_start, nr_spis, &res);
+		ret = gicv2m_init_one(&child->fwnode, spi_start, nr_spis,
+				      &res, 0);
 		if (ret) {
 			of_node_put(child);
 			break;
@@ -451,6 +475,25 @@ static struct fwnode_handle *gicv2m_get_fwnode(struct device *dev)
 	return data->fwnode;
 }
 
+static bool acpi_check_amazon_graviton_quirks(void)
+{
+	static struct acpi_table_madt *madt;
+	acpi_status status;
+	bool rc = false;
+
+#define ACPI_AMZN_OEM_ID		"AMAZON"
+
+	status = acpi_get_table(ACPI_SIG_MADT, 0,
+				(struct acpi_table_header **)&madt);
+
+	if (ACPI_FAILURE(status) || !madt)
+		return rc;
+	rc = !memcmp(madt->header.oem_id, ACPI_AMZN_OEM_ID, ACPI_OEM_ID_SIZE);
+	acpi_put_table((struct acpi_table_header *)madt);
+
+	return rc;
+}
+
 static int __init
 acpi_parse_madt_msi(union acpi_subtable_headers *header,
 		    const unsigned long end)
@@ -460,6 +503,7 @@ acpi_parse_madt_msi(union acpi_subtable_headers *header,
 	u32 spi_start = 0, nr_spis = 0;
 	struct acpi_madt_generic_msi_frame *m;
 	struct fwnode_handle *fwnode;
+	u32 flags = 0;
 
 	m = (struct acpi_madt_generic_msi_frame *)header;
 	if (BAD_MADT_ENTRY(m, end))
@@ -469,6 +513,13 @@ acpi_parse_madt_msi(union acpi_subtable_headers *header,
 	res.end = m->base_address + SZ_4K - 1;
 	res.flags = IORESOURCE_MEM;
 
+	if (acpi_check_amazon_graviton_quirks()) {
+		pr_info("applying Amazon Graviton quirk\n");
+		res.end = res.start + SZ_8K - 1;
+		flags |= GICV2M_GRAVITON_ADDRESS_ONLY;
+		gicv2m_msi_domain_info.flags &= ~MSI_FLAG_MULTI_PCI_MSI;
+	}
+
 	if (m->flags & ACPI_MADT_OVERRIDE_SPI_VALUES) {
 		spi_start = m->spi_base;
 		nr_spis = m->spi_count;
@@ -483,7 +534,7 @@ acpi_parse_madt_msi(union acpi_subtable_headers *header,
 		return -EINVAL;
 	}
 
-	ret = gicv2m_init_one(fwnode, spi_start, nr_spis, &res);
+	ret = gicv2m_init_one(fwnode, spi_start, nr_spis, &res, flags);
 	if (ret)
 		irq_domain_free_fwnode(fwnode);
 
diff --git a/drivers/irqchip/irq-gic-v3.c b/drivers/irqchip/irq-gic-v3.c
index f44cd89cfc40..1282f81696b2 100644
--- a/drivers/irqchip/irq-gic-v3.c
+++ b/drivers/irqchip/irq-gic-v3.c
@@ -1343,6 +1343,9 @@ static int __init gic_init_bases(void __iomem *dist_base,
 	if (gic_dist_supports_lpis()) {
 		its_init(handle, &gic_data.rdists, gic_data.domain);
 		its_cpu_init();
+	} else {
+		if (IS_ENABLED(CONFIG_ARM_GIC_V2M))
+			gicv2m_init(handle, gic_data.domain);
 	}
 
 	if (gic_prio_masking_enabled()) {
diff --git a/include/linux/irqchip/arm-gic-common.h b/include/linux/irqchip/arm-gic-common.h
index 9a1a479a2bf4..62a882104790 100644
--- a/include/linux/irqchip/arm-gic-common.h
+++ b/include/linux/irqchip/arm-gic-common.h
@@ -39,4 +39,9 @@ struct gic_kvm_info {
 
 const struct gic_kvm_info *gic_get_kvm_info(void);
 
+struct irq_domain;
+struct fwnode_handle;
+int gicv2m_init(struct fwnode_handle *parent_handle,
+		struct irq_domain *parent);
+
 #endif /* __LINUX_IRQCHIP_ARM_GIC_COMMON_H */
diff --git a/include/linux/irqchip/arm-gic.h b/include/linux/irqchip/arm-gic.h
index 0f049b384ccd..7bd3bc6baa40 100644
--- a/include/linux/irqchip/arm-gic.h
+++ b/include/linux/irqchip/arm-gic.h
@@ -160,9 +160,6 @@ int gic_of_init_child(struct device *dev, struct gic_chip_data **gic, int irq);
  */
 void gic_init(void __iomem *dist , void __iomem *cpu);
 
-int gicv2m_init(struct fwnode_handle *parent_handle,
-		struct irq_domain *parent);
-
 void gic_send_sgi(unsigned int cpu_id, unsigned int irq);
 int gic_get_cpu_id(unsigned int cpu);
 void gic_migrate_target(unsigned int new_cpu_id);
-- 
cgit v1.2.3


From 78b99577b3934e3e787fe0c52aa1b59442c8bbb5 Mon Sep 17 00:00:00 2001
From: Masahiro Yamada <yamada.masahiro@socionext.com>
Date: Mon, 10 Jun 2019 00:09:53 +0900
Subject: pinctrl: remove unused pin_is_valid()

This function was used by pin_request() to pointlessly double-check
the pin validity, and it was the only user ever.

Since commit d2f6a1c6fb0e ("pinctrl: remove double pin validity
check."), no one has ever used it.

Signed-off-by: Masahiro Yamada <yamada.masahiro@socionext.com>
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
---
 drivers/pinctrl/core.c          | 23 -----------------------
 include/linux/pinctrl/pinctrl.h | 10 ----------
 2 files changed, 33 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/pinctrl/core.c b/drivers/pinctrl/core.c
index 04787eefe2a2..e745788fa36f 100644
--- a/drivers/pinctrl/core.c
+++ b/drivers/pinctrl/core.c
@@ -178,29 +178,6 @@ const char *pin_get_name(struct pinctrl_dev *pctldev, const unsigned pin)
 	return desc->name;
 }
 
-/**
- * pin_is_valid() - check if pin exists on controller
- * @pctldev: the pin control device to check the pin on
- * @pin: pin to check, use the local pin controller index number
- *
- * This tells us whether a certain pin exist on a certain pin controller or
- * not. Pin lists may be sparse, so some pins may not exist.
- */
-bool pin_is_valid(struct pinctrl_dev *pctldev, int pin)
-{
-	struct pin_desc *pindesc;
-
-	if (pin < 0)
-		return false;
-
-	mutex_lock(&pctldev->mutex);
-	pindesc = pin_desc_get(pctldev, pin);
-	mutex_unlock(&pctldev->mutex);
-
-	return pindesc != NULL;
-}
-EXPORT_SYMBOL_GPL(pin_is_valid);
-
 /* Deletes a range of pin descriptors */
 static void pinctrl_free_pindescs(struct pinctrl_dev *pctldev,
 				  const struct pinctrl_pin_desc *pins,
diff --git a/include/linux/pinctrl/pinctrl.h b/include/linux/pinctrl/pinctrl.h
index 2744113f1024..36a79fe7b84f 100644
--- a/include/linux/pinctrl/pinctrl.h
+++ b/include/linux/pinctrl/pinctrl.h
@@ -172,7 +172,6 @@ extern struct pinctrl_dev *devm_pinctrl_register(struct device *dev,
 extern void devm_pinctrl_unregister(struct device *dev,
 				struct pinctrl_dev *pctldev);
 
-extern bool pin_is_valid(struct pinctrl_dev *pctldev, int pin);
 extern void pinctrl_add_gpio_range(struct pinctrl_dev *pctldev,
 				struct pinctrl_gpio_range *range);
 extern void pinctrl_add_gpio_ranges(struct pinctrl_dev *pctldev,
@@ -203,15 +202,6 @@ struct pinctrl_dev *of_pinctrl_get(struct device_node *np)
 extern const char *pinctrl_dev_get_name(struct pinctrl_dev *pctldev);
 extern const char *pinctrl_dev_get_devname(struct pinctrl_dev *pctldev);
 extern void *pinctrl_dev_get_drvdata(struct pinctrl_dev *pctldev);
-#else
-
-struct pinctrl_dev;
-
-/* Sufficiently stupid default functions when pinctrl is not in use */
-static inline bool pin_is_valid(struct pinctrl_dev *pctldev, int pin)
-{
-	return pin >= 0;
-}
 
 #endif /* !CONFIG_PINCTRL */
 
-- 
cgit v1.2.3


From ec6bc2e9e81b8805390851d7c7c907b0ed08b646 Mon Sep 17 00:00:00 2001
From: Jacob Pan <jacob.jun.pan@linux.intel.com>
Date: Mon, 3 Jun 2019 15:57:46 +0100
Subject: driver core: Add per device iommu param

DMA faults can be detected by the IOMMU at device level. Adding a pointer
to struct device allows the IOMMU subsystem to report relevant faults
back to the device driver for further handling.
For directly assigned devices (or user space drivers), the guest OS is
responsible for handling and responding to per-device IOMMU faults.
Therefore we need a fault reporting mechanism to propagate faults beyond
the IOMMU subsystem.

There are two other IOMMU data pointers under struct device today, here
we introduce iommu_param as a parent pointer such that all device IOMMU
data can be consolidated here. The idea was suggested here by Greg KH
and Joerg. The name iommu_param is chosen here since iommu_data has been
used.

Suggested-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Jacob Pan <jacob.jun.pan@linux.intel.com>
Link: https://lkml.org/lkml/2017/10/6/81
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 include/linux/device.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/device.h b/include/linux/device.h
index e85264fb6616..f0a975abd6e9 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -42,6 +42,7 @@ struct iommu_ops;
 struct iommu_group;
 struct iommu_fwspec;
 struct dev_pin_info;
+struct iommu_param;
 
 struct bus_attribute {
 	struct attribute	attr;
@@ -959,6 +960,7 @@ struct dev_links_info {
  * 		device (i.e. the bus driver that discovered the device).
  * @iommu_group: IOMMU group the device belongs to.
  * @iommu_fwspec: IOMMU-specific properties supplied by firmware.
+ * @iommu_param: Per device generic IOMMU runtime data
  *
  * @offline_disabled: If set, the device is permanently online.
  * @offline:	Set after successful invocation of bus type's .offline().
@@ -1052,6 +1054,7 @@ struct device {
 	void	(*release)(struct device *dev);
 	struct iommu_group	*iommu_group;
 	struct iommu_fwspec	*iommu_fwspec;
+	struct iommu_param	*iommu_param;
 
 	bool			offline_disabled:1;
 	bool			offline:1;
-- 
cgit v1.2.3


From 4e32348ba5269aac1165f496b78189201568dd8c Mon Sep 17 00:00:00 2001
From: Jacob Pan <jacob.jun.pan@linux.intel.com>
Date: Mon, 3 Jun 2019 15:57:47 +0100
Subject: iommu: Introduce device fault data

Device faults detected by IOMMU can be reported outside the IOMMU
subsystem for further processing. This patch introduces
a generic device fault data structure.

The fault can be either an unrecoverable fault or a page request,
also referred to as a recoverable fault.

We only care about non internal faults that are likely to be reported
to an external subsystem.

Signed-off-by: Jacob Pan <jacob.jun.pan@linux.intel.com>
Signed-off-by: Jean-Philippe Brucker <jean-philippe.brucker@arm.com>
Signed-off-by: Liu, Yi L <yi.l.liu@linux.intel.com>
Signed-off-by: Ashok Raj <ashok.raj@intel.com>
Signed-off-by: Eric Auger <eric.auger@redhat.com>
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 include/linux/iommu.h      |  39 +++++++++++++++
 include/uapi/linux/iommu.h | 118 +++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 157 insertions(+)
 create mode 100644 include/uapi/linux/iommu.h

(limited to 'include/linux')

diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index a815cf6f6f47..2b05056d5fa7 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -25,6 +25,7 @@
 #include <linux/errno.h>
 #include <linux/err.h>
 #include <linux/of.h>
+#include <uapi/linux/iommu.h>
 
 #define IOMMU_READ	(1 << 0)
 #define IOMMU_WRITE	(1 << 1)
@@ -49,6 +50,7 @@ struct device;
 struct iommu_domain;
 struct notifier_block;
 struct iommu_sva;
+struct iommu_fault_event;
 
 /* iommu fault flags */
 #define IOMMU_FAULT_READ	0x0
@@ -58,6 +60,7 @@ typedef int (*iommu_fault_handler_t)(struct iommu_domain *,
 			struct device *, unsigned long, int, void *);
 typedef int (*iommu_mm_exit_handler_t)(struct device *dev, struct iommu_sva *,
 				       void *);
+typedef int (*iommu_dev_fault_handler_t)(struct iommu_fault *, void *);
 
 struct iommu_domain_geometry {
 	dma_addr_t aperture_start; /* First address that can be mapped    */
@@ -301,6 +304,41 @@ struct iommu_device {
 	struct device *dev;
 };
 
+/**
+ * struct iommu_fault_event - Generic fault event
+ *
+ * Can represent recoverable faults such as page requests or
+ * unrecoverable faults such as DMA or IRQ remapping faults.
+ *
+ * @fault: fault descriptor
+ */
+struct iommu_fault_event {
+	struct iommu_fault fault;
+};
+
+/**
+ * struct iommu_fault_param - per-device IOMMU fault data
+ * @handler: Callback function to handle IOMMU faults at device level
+ * @data: handler private data
+ */
+struct iommu_fault_param {
+	iommu_dev_fault_handler_t handler;
+	void *data;
+};
+
+/**
+ * struct iommu_param - collection of per-device IOMMU data
+ *
+ * @fault_param: IOMMU detected device fault reporting data
+ *
+ * TODO: migrate other per device data pointers under iommu_param, e.g.
+ *	struct iommu_group	*iommu_group;
+ *	struct iommu_fwspec	*iommu_fwspec;
+ */
+struct iommu_param {
+	struct iommu_fault_param *fault_param;
+};
+
 int  iommu_device_register(struct iommu_device *iommu);
 void iommu_device_unregister(struct iommu_device *iommu);
 int  iommu_device_sysfs_add(struct iommu_device *iommu,
@@ -504,6 +542,7 @@ struct iommu_ops {};
 struct iommu_group {};
 struct iommu_fwspec {};
 struct iommu_device {};
+struct iommu_fault_param {};
 
 static inline bool iommu_present(struct bus_type *bus)
 {
diff --git a/include/uapi/linux/iommu.h b/include/uapi/linux/iommu.h
new file mode 100644
index 000000000000..796402174d6c
--- /dev/null
+++ b/include/uapi/linux/iommu.h
@@ -0,0 +1,118 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+/*
+ * IOMMU user API definitions
+ */
+
+#ifndef _UAPI_IOMMU_H
+#define _UAPI_IOMMU_H
+
+#include <linux/types.h>
+
+#define IOMMU_FAULT_PERM_READ	(1 << 0) /* read */
+#define IOMMU_FAULT_PERM_WRITE	(1 << 1) /* write */
+#define IOMMU_FAULT_PERM_EXEC	(1 << 2) /* exec */
+#define IOMMU_FAULT_PERM_PRIV	(1 << 3) /* privileged */
+
+/* Generic fault types, can be expanded to cover IRQ remapping faults */
+enum iommu_fault_type {
+	IOMMU_FAULT_DMA_UNRECOV = 1,	/* unrecoverable fault */
+	IOMMU_FAULT_PAGE_REQ,		/* page request fault */
+};
+
+enum iommu_fault_reason {
+	IOMMU_FAULT_REASON_UNKNOWN = 0,
+
+	/* Could not access the PASID table (fetch caused external abort) */
+	IOMMU_FAULT_REASON_PASID_FETCH,
+
+	/* PASID entry is invalid or has configuration errors */
+	IOMMU_FAULT_REASON_BAD_PASID_ENTRY,
+
+	/*
+	 * PASID is out of range (e.g. exceeds the maximum PASID
+	 * supported by the IOMMU) or disabled.
+	 */
+	IOMMU_FAULT_REASON_PASID_INVALID,
+
+	/*
+	 * An external abort occurred fetching (or updating) a translation
+	 * table descriptor
+	 */
+	IOMMU_FAULT_REASON_WALK_EABT,
+
+	/*
+	 * Could not access the page table entry (Bad address),
+	 * actual translation fault
+	 */
+	IOMMU_FAULT_REASON_PTE_FETCH,
+
+	/* Protection flag check failed */
+	IOMMU_FAULT_REASON_PERMISSION,
+
+	/* access flag check failed */
+	IOMMU_FAULT_REASON_ACCESS,
+
+	/* Output address of a translation stage caused Address Size fault */
+	IOMMU_FAULT_REASON_OOR_ADDRESS,
+};
+
+/**
+ * struct iommu_fault_unrecoverable - Unrecoverable fault data
+ * @reason: reason of the fault, from &enum iommu_fault_reason
+ * @flags: parameters of this fault (IOMMU_FAULT_UNRECOV_* values)
+ * @pasid: Process Address Space ID
+ * @perm: permission access requested by the incoming transaction
+ *        (IOMMU_FAULT_PERM_* values)
+ * @addr: offending page address
+ * @fetch_addr: address that caused a fetch abort, if any
+ */
+struct iommu_fault_unrecoverable {
+	__u32	reason;
+#define IOMMU_FAULT_UNRECOV_PASID_VALID		(1 << 0)
+#define IOMMU_FAULT_UNRECOV_ADDR_VALID		(1 << 1)
+#define IOMMU_FAULT_UNRECOV_FETCH_ADDR_VALID	(1 << 2)
+	__u32	flags;
+	__u32	pasid;
+	__u32	perm;
+	__u64	addr;
+	__u64	fetch_addr;
+};
+
+/**
+ * struct iommu_fault_page_request - Page Request data
+ * @flags: encodes whether the corresponding fields are valid and whether this
+ *         is the last page in group (IOMMU_FAULT_PAGE_REQUEST_* values)
+ * @pasid: Process Address Space ID
+ * @grpid: Page Request Group Index
+ * @perm: requested page permissions (IOMMU_FAULT_PERM_* values)
+ * @addr: page address
+ * @private_data: device-specific private information
+ */
+struct iommu_fault_page_request {
+#define IOMMU_FAULT_PAGE_REQUEST_PASID_VALID	(1 << 0)
+#define IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE	(1 << 1)
+#define IOMMU_FAULT_PAGE_REQUEST_PRIV_DATA	(1 << 2)
+	__u32	flags;
+	__u32	pasid;
+	__u32	grpid;
+	__u32	perm;
+	__u64	addr;
+	__u64	private_data[2];
+};
+
+/**
+ * struct iommu_fault - Generic fault data
+ * @type: fault type from &enum iommu_fault_type
+ * @padding: reserved for future use (should be zero)
+ * @event: fault event, when @type is %IOMMU_FAULT_DMA_UNRECOV
+ * @prm: Page Request message, when @type is %IOMMU_FAULT_PAGE_REQ
+ */
+struct iommu_fault {
+	__u32	type;
+	__u32	padding;
+	union {
+		struct iommu_fault_unrecoverable event;
+		struct iommu_fault_page_request prm;
+	};
+};
+#endif /* _UAPI_IOMMU_H */
-- 
cgit v1.2.3


From 0c830e6b32826311fc2b9ea1f4679be0f4ef0933 Mon Sep 17 00:00:00 2001
From: Jacob Pan <jacob.jun.pan@linux.intel.com>
Date: Mon, 3 Jun 2019 15:57:48 +0100
Subject: iommu: Introduce device fault report API

Traditionally, device specific faults are detected and handled within
their own device drivers. When IOMMU is enabled, faults such as DMA
related transactions are detected by IOMMU. There is no generic
reporting mechanism to report faults back to the in-kernel device
driver or the guest OS in case of assigned devices.

This patch introduces a registration API for device specific fault
handlers. This differs from the existing iommu_set_fault_handler/
report_iommu_fault infrastructures in several ways:
- it allows reporting more sophisticated fault events (both
  unrecoverable faults and page request faults) due to the nature
  of the iommu_fault struct
- it is device specific and not domain specific.

The current iommu_report_device_fault() implementation only handles
the "shoot and forget" unrecoverable fault case. Handling of page
request faults or stalled faults will come later.

Signed-off-by: Jacob Pan <jacob.jun.pan@linux.intel.com>
Signed-off-by: Ashok Raj <ashok.raj@intel.com>
Signed-off-by: Jean-Philippe Brucker <jean-philippe.brucker@arm.com>
Signed-off-by: Eric Auger <eric.auger@redhat.com>
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 drivers/iommu/iommu.c | 146 ++++++++++++++++++++++++++++++++++++++++++++++++--
 include/linux/iommu.h |  29 ++++++++++
 2 files changed, 172 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index 3fa025f849e9..293a6fa716e0 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -107,15 +107,43 @@ void iommu_device_unregister(struct iommu_device *iommu)
 	spin_unlock(&iommu_device_lock);
 }
 
+static struct iommu_param *iommu_get_dev_param(struct device *dev)
+{
+	struct iommu_param *param = dev->iommu_param;
+
+	if (param)
+		return param;
+
+	param = kzalloc(sizeof(*param), GFP_KERNEL);
+	if (!param)
+		return NULL;
+
+	mutex_init(&param->lock);
+	dev->iommu_param = param;
+	return param;
+}
+
+static void iommu_free_dev_param(struct device *dev)
+{
+	kfree(dev->iommu_param);
+	dev->iommu_param = NULL;
+}
+
 int iommu_probe_device(struct device *dev)
 {
 	const struct iommu_ops *ops = dev->bus->iommu_ops;
-	int ret = -EINVAL;
+	int ret;
 
 	WARN_ON(dev->iommu_group);
+	if (!ops)
+		return -EINVAL;
 
-	if (ops)
-		ret = ops->add_device(dev);
+	if (!iommu_get_dev_param(dev))
+		return -ENOMEM;
+
+	ret = ops->add_device(dev);
+	if (ret)
+		iommu_free_dev_param(dev);
 
 	return ret;
 }
@@ -126,6 +154,8 @@ void iommu_release_device(struct device *dev)
 
 	if (dev->iommu_group)
 		ops->remove_device(dev);
+
+	iommu_free_dev_param(dev);
 }
 
 static struct iommu_domain *__iommu_domain_alloc(struct bus_type *bus,
@@ -854,6 +884,116 @@ int iommu_group_unregister_notifier(struct iommu_group *group,
 }
 EXPORT_SYMBOL_GPL(iommu_group_unregister_notifier);
 
+/**
+ * iommu_register_device_fault_handler() - Register a device fault handler
+ * @dev: the device
+ * @handler: the fault handler
+ * @data: private data passed as argument to the handler
+ *
+ * When an IOMMU fault event is received, this handler gets called with the
+ * fault event and data as argument. The handler should return 0 on success.
+ *
+ * Return 0 if the fault handler was installed successfully, or an error.
+ */
+int iommu_register_device_fault_handler(struct device *dev,
+					iommu_dev_fault_handler_t handler,
+					void *data)
+{
+	struct iommu_param *param = dev->iommu_param;
+	int ret = 0;
+
+	if (!param)
+		return -EINVAL;
+
+	mutex_lock(&param->lock);
+	/* Only allow one fault handler registered for each device */
+	if (param->fault_param) {
+		ret = -EBUSY;
+		goto done_unlock;
+	}
+
+	get_device(dev);
+	param->fault_param = kzalloc(sizeof(*param->fault_param), GFP_KERNEL);
+	if (!param->fault_param) {
+		put_device(dev);
+		ret = -ENOMEM;
+		goto done_unlock;
+	}
+	param->fault_param->handler = handler;
+	param->fault_param->data = data;
+
+done_unlock:
+	mutex_unlock(&param->lock);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(iommu_register_device_fault_handler);
+
+/**
+ * iommu_unregister_device_fault_handler() - Unregister the device fault handler
+ * @dev: the device
+ *
+ * Remove the device fault handler installed with
+ * iommu_register_device_fault_handler().
+ *
+ * Return 0 on success, or an error.
+ */
+int iommu_unregister_device_fault_handler(struct device *dev)
+{
+	struct iommu_param *param = dev->iommu_param;
+	int ret = 0;
+
+	if (!param)
+		return -EINVAL;
+
+	mutex_lock(&param->lock);
+
+	if (!param->fault_param)
+		goto unlock;
+
+	kfree(param->fault_param);
+	param->fault_param = NULL;
+	put_device(dev);
+unlock:
+	mutex_unlock(&param->lock);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(iommu_unregister_device_fault_handler);
+
+/**
+ * iommu_report_device_fault() - Report fault event to device driver
+ * @dev: the device
+ * @evt: fault event data
+ *
+ * Called by IOMMU drivers when a fault is detected, typically in a threaded IRQ
+ * handler.
+ *
+ * Return 0 on success, or an error.
+ */
+int iommu_report_device_fault(struct device *dev, struct iommu_fault_event *evt)
+{
+	struct iommu_param *param = dev->iommu_param;
+	struct iommu_fault_param *fparam;
+	int ret = 0;
+
+	if (!param || !evt)
+		return -EINVAL;
+
+	/* we only report device fault if there is a handler registered */
+	mutex_lock(&param->lock);
+	fparam = param->fault_param;
+	if (!fparam || !fparam->handler) {
+		ret = -EINVAL;
+		goto done_unlock;
+	}
+	ret = fparam->handler(&evt->fault, fparam->data);
+done_unlock:
+	mutex_unlock(&param->lock);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(iommu_report_device_fault);
+
 /**
  * iommu_group_id - Return ID for a group
  * @group: the group to ID
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index 2b05056d5fa7..3e783f5bf472 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -336,6 +336,7 @@ struct iommu_fault_param {
  *	struct iommu_fwspec	*iommu_fwspec;
  */
 struct iommu_param {
+	struct mutex lock;
 	struct iommu_fault_param *fault_param;
 };
 
@@ -428,6 +429,15 @@ extern int iommu_group_register_notifier(struct iommu_group *group,
 					 struct notifier_block *nb);
 extern int iommu_group_unregister_notifier(struct iommu_group *group,
 					   struct notifier_block *nb);
+extern int iommu_register_device_fault_handler(struct device *dev,
+					iommu_dev_fault_handler_t handler,
+					void *data);
+
+extern int iommu_unregister_device_fault_handler(struct device *dev);
+
+extern int iommu_report_device_fault(struct device *dev,
+				     struct iommu_fault_event *evt);
+
 extern int iommu_group_id(struct iommu_group *group);
 extern struct iommu_group *iommu_group_get_for_dev(struct device *dev);
 extern struct iommu_domain *iommu_group_default_domain(struct iommu_group *);
@@ -736,6 +746,25 @@ static inline int iommu_group_unregister_notifier(struct iommu_group *group,
 	return 0;
 }
 
+static inline
+int iommu_register_device_fault_handler(struct device *dev,
+					iommu_dev_fault_handler_t handler,
+					void *data)
+{
+	return -ENODEV;
+}
+
+static inline int iommu_unregister_device_fault_handler(struct device *dev)
+{
+	return 0;
+}
+
+static inline
+int iommu_report_device_fault(struct device *dev, struct iommu_fault_event *evt)
+{
+	return -ENODEV;
+}
+
 static inline int iommu_group_id(struct iommu_group *group)
 {
 	return -ENODEV;
-- 
cgit v1.2.3


From bf3255b3cfe2d06280340dbac3f44b65d3ee6da3 Mon Sep 17 00:00:00 2001
From: Jean-Philippe Brucker <jean-philippe.brucker@arm.com>
Date: Mon, 3 Jun 2019 15:57:49 +0100
Subject: iommu: Add recoverable fault reporting

Some IOMMU hardware features, for example PCI PRI and Arm SMMU Stall,
enable recoverable I/O page faults. Allow IOMMU drivers to report PRI Page
Requests and Stall events through the new fault reporting API. The
consumer of the fault can be either an I/O page fault handler in the host,
or a guest OS.

Once handled, the fault must be completed by sending a page response back
to the IOMMU. Add an iommu_page_response() function to complete a page
fault.

There are two ways to extend the userspace API:
* Add a field to iommu_page_response and a flag to
  iommu_page_response::flags describing the validity of this field.
* Introduce a new iommu_page_response_X structure with a different version
  number. The kernel must then support both versions.

Signed-off-by: Jacob Pan <jacob.jun.pan@linux.intel.com>
Signed-off-by: Jean-Philippe Brucker <jean-philippe.brucker@arm.com>
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 drivers/iommu/iommu.c      | 94 +++++++++++++++++++++++++++++++++++++++++++++-
 include/linux/iommu.h      | 19 ++++++++++
 include/uapi/linux/iommu.h | 35 +++++++++++++++++
 3 files changed, 146 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index 293a6fa716e0..ac1f29c19e59 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -891,7 +891,14 @@ EXPORT_SYMBOL_GPL(iommu_group_unregister_notifier);
  * @data: private data passed as argument to the handler
  *
  * When an IOMMU fault event is received, this handler gets called with the
- * fault event and data as argument. The handler should return 0 on success.
+ * fault event and data as argument. The handler should return 0 on success. If
+ * the fault is recoverable (IOMMU_FAULT_PAGE_REQ), the consumer should also
+ * complete the fault by calling iommu_page_response() with one of the following
+ * response codes:
+ * - IOMMU_PAGE_RESP_SUCCESS: retry the translation
+ * - IOMMU_PAGE_RESP_INVALID: terminate the fault
+ * - IOMMU_PAGE_RESP_FAILURE: terminate the fault and stop reporting
+ *   page faults if possible.
  *
  * Return 0 if the fault handler was installed successfully, or an error.
  */
@@ -921,6 +928,8 @@ int iommu_register_device_fault_handler(struct device *dev,
 	}
 	param->fault_param->handler = handler;
 	param->fault_param->data = data;
+	mutex_init(&param->fault_param->lock);
+	INIT_LIST_HEAD(&param->fault_param->faults);
 
 done_unlock:
 	mutex_unlock(&param->lock);
@@ -951,6 +960,12 @@ int iommu_unregister_device_fault_handler(struct device *dev)
 	if (!param->fault_param)
 		goto unlock;
 
+	/* we cannot unregister handler if there are pending faults */
+	if (!list_empty(&param->fault_param->faults)) {
+		ret = -EBUSY;
+		goto unlock;
+	}
+
 	kfree(param->fault_param);
 	param->fault_param = NULL;
 	put_device(dev);
@@ -967,13 +982,15 @@ EXPORT_SYMBOL_GPL(iommu_unregister_device_fault_handler);
  * @evt: fault event data
  *
  * Called by IOMMU drivers when a fault is detected, typically in a threaded IRQ
- * handler.
+ * handler. When this function fails and the fault is recoverable, it is the
+ * caller's responsibility to complete the fault.
  *
  * Return 0 on success, or an error.
  */
 int iommu_report_device_fault(struct device *dev, struct iommu_fault_event *evt)
 {
 	struct iommu_param *param = dev->iommu_param;
+	struct iommu_fault_event *evt_pending = NULL;
 	struct iommu_fault_param *fparam;
 	int ret = 0;
 
@@ -987,13 +1004,86 @@ int iommu_report_device_fault(struct device *dev, struct iommu_fault_event *evt)
 		ret = -EINVAL;
 		goto done_unlock;
 	}
+
+	if (evt->fault.type == IOMMU_FAULT_PAGE_REQ &&
+	    (evt->fault.prm.flags & IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE)) {
+		evt_pending = kmemdup(evt, sizeof(struct iommu_fault_event),
+				      GFP_KERNEL);
+		if (!evt_pending) {
+			ret = -ENOMEM;
+			goto done_unlock;
+		}
+		mutex_lock(&fparam->lock);
+		list_add_tail(&evt_pending->list, &fparam->faults);
+		mutex_unlock(&fparam->lock);
+	}
+
 	ret = fparam->handler(&evt->fault, fparam->data);
+	if (ret && evt_pending) {
+		mutex_lock(&fparam->lock);
+		list_del(&evt_pending->list);
+		mutex_unlock(&fparam->lock);
+		kfree(evt_pending);
+	}
 done_unlock:
 	mutex_unlock(&param->lock);
 	return ret;
 }
 EXPORT_SYMBOL_GPL(iommu_report_device_fault);
 
+int iommu_page_response(struct device *dev,
+			struct iommu_page_response *msg)
+{
+	bool pasid_valid;
+	int ret = -EINVAL;
+	struct iommu_fault_event *evt;
+	struct iommu_fault_page_request *prm;
+	struct iommu_param *param = dev->iommu_param;
+	struct iommu_domain *domain = iommu_get_domain_for_dev(dev);
+
+	if (!domain || !domain->ops->page_response)
+		return -ENODEV;
+
+	if (!param || !param->fault_param)
+		return -EINVAL;
+
+	if (msg->version != IOMMU_PAGE_RESP_VERSION_1 ||
+	    msg->flags & ~IOMMU_PAGE_RESP_PASID_VALID)
+		return -EINVAL;
+
+	/* Only send response if there is a fault report pending */
+	mutex_lock(&param->fault_param->lock);
+	if (list_empty(&param->fault_param->faults)) {
+		dev_warn_ratelimited(dev, "no pending PRQ, drop response\n");
+		goto done_unlock;
+	}
+	/*
+	 * Check if we have a matching page request pending to respond,
+	 * otherwise return -EINVAL
+	 */
+	list_for_each_entry(evt, &param->fault_param->faults, list) {
+		prm = &evt->fault.prm;
+		pasid_valid = prm->flags & IOMMU_FAULT_PAGE_REQUEST_PASID_VALID;
+
+		if ((pasid_valid && prm->pasid != msg->pasid) ||
+		    prm->grpid != msg->grpid)
+			continue;
+
+		/* Sanitize the reply */
+		msg->flags = pasid_valid ? IOMMU_PAGE_RESP_PASID_VALID : 0;
+
+		ret = domain->ops->page_response(dev, evt, msg);
+		list_del(&evt->list);
+		kfree(evt);
+		break;
+	}
+
+done_unlock:
+	mutex_unlock(&param->fault_param->lock);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(iommu_page_response);
+
 /**
  * iommu_group_id - Return ID for a group
  * @group: the group to ID
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index 3e783f5bf472..76c8cda61dfd 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -227,6 +227,7 @@ struct iommu_sva_ops {
  * @sva_bind: Bind process address space to device
  * @sva_unbind: Unbind process address space from device
  * @sva_get_pasid: Get PASID associated to a SVA handle
+ * @page_response: handle page request response
  * @pgsize_bitmap: bitmap of all possible supported page sizes
  */
 struct iommu_ops {
@@ -287,6 +288,10 @@ struct iommu_ops {
 	void (*sva_unbind)(struct iommu_sva *handle);
 	int (*sva_get_pasid)(struct iommu_sva *handle);
 
+	int (*page_response)(struct device *dev,
+			     struct iommu_fault_event *evt,
+			     struct iommu_page_response *msg);
+
 	unsigned long pgsize_bitmap;
 };
 
@@ -311,19 +316,25 @@ struct iommu_device {
  * unrecoverable faults such as DMA or IRQ remapping faults.
  *
  * @fault: fault descriptor
+ * @list: pending fault event list, used for tracking responses
  */
 struct iommu_fault_event {
 	struct iommu_fault fault;
+	struct list_head list;
 };
 
 /**
  * struct iommu_fault_param - per-device IOMMU fault data
  * @handler: Callback function to handle IOMMU faults at device level
  * @data: handler private data
+ * @faults: holds the pending faults which need a response
+ * @lock: protect pending faults list
  */
 struct iommu_fault_param {
 	iommu_dev_fault_handler_t handler;
 	void *data;
+	struct list_head faults;
+	struct mutex lock;
 };
 
 /**
@@ -437,6 +448,8 @@ extern int iommu_unregister_device_fault_handler(struct device *dev);
 
 extern int iommu_report_device_fault(struct device *dev,
 				     struct iommu_fault_event *evt);
+extern int iommu_page_response(struct device *dev,
+			       struct iommu_page_response *msg);
 
 extern int iommu_group_id(struct iommu_group *group);
 extern struct iommu_group *iommu_group_get_for_dev(struct device *dev);
@@ -765,6 +778,12 @@ int iommu_report_device_fault(struct device *dev, struct iommu_fault_event *evt)
 	return -ENODEV;
 }
 
+static inline int iommu_page_response(struct device *dev,
+				      struct iommu_page_response *msg)
+{
+	return -ENODEV;
+}
+
 static inline int iommu_group_id(struct iommu_group *group)
 {
 	return -ENODEV;
diff --git a/include/uapi/linux/iommu.h b/include/uapi/linux/iommu.h
index 796402174d6c..f45d8e9e59c3 100644
--- a/include/uapi/linux/iommu.h
+++ b/include/uapi/linux/iommu.h
@@ -115,4 +115,39 @@ struct iommu_fault {
 		struct iommu_fault_page_request prm;
 	};
 };
+
+/**
+ * enum iommu_page_response_code - Return status of fault handlers
+ * @IOMMU_PAGE_RESP_SUCCESS: Fault has been handled and the page tables
+ *	populated, retry the access. This is "Success" in PCI PRI.
+ * @IOMMU_PAGE_RESP_FAILURE: General error. Drop all subsequent faults from
+ *	this device if possible. This is "Response Failure" in PCI PRI.
+ * @IOMMU_PAGE_RESP_INVALID: Could not handle this fault, don't retry the
+ *	access. This is "Invalid Request" in PCI PRI.
+ */
+enum iommu_page_response_code {
+	IOMMU_PAGE_RESP_SUCCESS = 0,
+	IOMMU_PAGE_RESP_INVALID,
+	IOMMU_PAGE_RESP_FAILURE,
+};
+
+/**
+ * struct iommu_page_response - Generic page response information
+ * @version: API version of this structure
+ * @flags: encodes whether the corresponding fields are valid
+ *         (IOMMU_PAGE_RESP_PASID_VALID)
+ * @pasid: Process Address Space ID
+ * @grpid: Page Request Group Index
+ * @code: response code from &enum iommu_page_response_code
+ */
+struct iommu_page_response {
+#define IOMMU_PAGE_RESP_VERSION_1	1
+	__u32	version;
+#define IOMMU_PAGE_RESP_PASID_VALID	(1 << 0)
+	__u32	flags;
+	__u32	pasid;
+	__u32	grpid;
+	__u32	code;
+};
+
 #endif /* _UAPI_IOMMU_H */
-- 
cgit v1.2.3


From adfd373820906d376c8b643f1a279ac809605b6b Mon Sep 17 00:00:00 2001
From: Eric Auger <eric.auger@redhat.com>
Date: Mon, 3 Jun 2019 08:53:35 +0200
Subject: iommu: Introduce IOMMU_RESV_DIRECT_RELAXABLE reserved memory regions

Introduce a new type for reserved region. This corresponds
to directly mapped regions which are known to be relaxable
in some specific conditions, such as device assignment use
case. Well known examples are those used by USB controllers
providing PS/2 keyboard emulation for pre-boot BIOS and
early BOOT or RMRRs associated to IGD working in legacy mode.

Since commit c875d2c1b808 ("iommu/vt-d: Exclude devices using RMRRs
from IOMMU API domains") and commit 18436afdc11a ("iommu/vt-d: Allow
RMRR on graphics devices too"), those regions are currently
considered "safe" with respect to device assignment use case
which requires a non direct mapping at IOMMU physical level
(RAM GPA -> HPA mapping).

Those RMRRs currently exist and sometimes the device
attempts to access them, but this has not been considered
an issue until now.

However, at the moment, iommu_get_group_resv_regions() is
not able to differentiate between directly mapped
regions: those which must be absolutely enforced and those,
like the ones above, which are known to be relaxable.

This is a blocker for reporting severe conflicts between
non relaxable RMRRs (like MSI doorbells) and guest GPA space.

With this new reserved region type we will be able to use
iommu_get_group_resv_regions() to enumerate the IOVA space
that is usable through the IOMMU API without introducing
regressions with respect to existing device assignment
use cases (USB and IGD).

Signed-off-by: Eric Auger <eric.auger@redhat.com>
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 Documentation/ABI/testing/sysfs-kernel-iommu_groups |  9 +++++++++
 drivers/iommu/iommu.c                               | 12 +++++++-----
 include/linux/iommu.h                               |  6 ++++++
 3 files changed, 22 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/ABI/testing/sysfs-kernel-iommu_groups b/Documentation/ABI/testing/sysfs-kernel-iommu_groups
index 35c64e00b35c..017f5bc3920c 100644
--- a/Documentation/ABI/testing/sysfs-kernel-iommu_groups
+++ b/Documentation/ABI/testing/sysfs-kernel-iommu_groups
@@ -24,3 +24,12 @@ Description:    /sys/kernel/iommu_groups/reserved_regions list IOVA
 		region is described on a single line: the 1st field is
 		the base IOVA, the second is the end IOVA and the third
 		field describes the type of the region.
+
+What:		/sys/kernel/iommu_groups/reserved_regions
+Date: 		June 2019
+KernelVersion:  v5.3
+Contact: 	Eric Auger <eric.auger@redhat.com>
+Description:    In case an RMRR is used only by graphics or USB devices
+		it is now exposed as "direct-relaxable" instead of "direct".
+		In device assignment use case, for instance, those RMRR
+		are considered to be relaxable and safe.
diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index ba0661744a3d..46a06ff46e47 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -73,10 +73,11 @@ struct iommu_group_attribute {
 };
 
 static const char * const iommu_group_resv_type_string[] = {
-	[IOMMU_RESV_DIRECT]	= "direct",
-	[IOMMU_RESV_RESERVED]	= "reserved",
-	[IOMMU_RESV_MSI]	= "msi",
-	[IOMMU_RESV_SW_MSI]	= "msi",
+	[IOMMU_RESV_DIRECT]			= "direct",
+	[IOMMU_RESV_DIRECT_RELAXABLE]		= "direct-relaxable",
+	[IOMMU_RESV_RESERVED]			= "reserved",
+	[IOMMU_RESV_MSI]			= "msi",
+	[IOMMU_RESV_SW_MSI]			= "msi",
 };
 
 #define IOMMU_GROUP_ATTR(_name, _mode, _show, _store)		\
@@ -575,7 +576,8 @@ static int iommu_group_create_direct_mappings(struct iommu_group *group,
 		start = ALIGN(entry->start, pg_size);
 		end   = ALIGN(entry->start + entry->length, pg_size);
 
-		if (entry->type != IOMMU_RESV_DIRECT)
+		if (entry->type != IOMMU_RESV_DIRECT &&
+		    entry->type != IOMMU_RESV_DIRECT_RELAXABLE)
 			continue;
 
 		for (addr = start; addr < end; addr += pg_size) {
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index 91af22a344e2..ab7a1c85af75 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -135,6 +135,12 @@ enum iommu_attr {
 enum iommu_resv_type {
 	/* Memory regions which must be mapped 1:1 at all times */
 	IOMMU_RESV_DIRECT,
+	/*
+	 * Memory regions which are advertised to be 1:1 but are
+	 * commonly considered relaxable in some conditions,
+	 * for instance in device assignment use case (USB, Graphics)
+	 */
+	IOMMU_RESV_DIRECT_RELAXABLE,
 	/* Arbitrary "never map this or give it to a device" address ranges */
 	IOMMU_RESV_RESERVED,
 	/* Hardware MSI region (untranslated) */
-- 
cgit v1.2.3


From 18bd49c4c7c22a59634c8142d8618f5da8d29250 Mon Sep 17 00:00:00 2001
From: Russell King <rmk+kernel@armlinux.org.uk>
Date: Mon, 10 Jun 2019 20:11:00 +0300
Subject: gpio: omap: constify register tables

We must never alter the register tables; these are read-only as far
as the driver is concerned.  Constify these tables.

Signed-off-by: Russell King <rmk+kernel@armlinux.org.uk>
Signed-off-by: Grygorii Strashko <grygorii.strashko@ti.com>
Tested-by: Tony Lindgren <tony@atomide.com>
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
---
 drivers/gpio/gpio-omap.c                | 12 ++++++------
 include/linux/platform_data/gpio-omap.h |  2 +-
 2 files changed, 7 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/gpio/gpio-omap.c b/drivers/gpio/gpio-omap.c
index 1c5fa12bcf9f..039bbb1ae6cb 100644
--- a/drivers/gpio/gpio-omap.c
+++ b/drivers/gpio/gpio-omap.c
@@ -48,6 +48,8 @@ struct gpio_regs {
 
 struct gpio_bank {
 	void __iomem *base;
+	const struct omap_gpio_reg_offs *regs;
+
 	int irq;
 	u32 non_wakeup_gpios;
 	u32 enabled_non_wakeup_gpios;
@@ -75,8 +77,6 @@ struct gpio_bank {
 
 	void (*set_dataout)(struct gpio_bank *bank, unsigned gpio, int enable);
 	int (*get_context_loss_count)(struct device *dev);
-
-	struct omap_gpio_reg_offs *regs;
 };
 
 #define GPIO_MOD_CTRL_BIT	BIT(0)
@@ -1075,7 +1075,7 @@ static int omap_gpio_chip_init(struct gpio_bank *bank, struct irq_chip *irqc)
 
 static void omap_gpio_init_context(struct gpio_bank *p)
 {
-	struct omap_gpio_reg_offs *regs = p->regs;
+	const struct omap_gpio_reg_offs *regs = p->regs;
 	void __iomem *base = p->base;
 
 	p->context.ctrl		= readl_relaxed(base + regs->ctrl);
@@ -1094,7 +1094,7 @@ static void omap_gpio_init_context(struct gpio_bank *p)
 
 static void omap_gpio_restore_context(struct gpio_bank *bank)
 {
-	struct omap_gpio_reg_offs *regs = bank->regs;
+	const struct omap_gpio_reg_offs *regs = bank->regs;
 	void __iomem *base = bank->base;
 
 	writel_relaxed(bank->context.wake_en, base + regs->wkup_en);
@@ -1267,7 +1267,7 @@ static int gpio_omap_cpu_notifier(struct notifier_block *nb,
 	return NOTIFY_OK;
 }
 
-static struct omap_gpio_reg_offs omap2_gpio_regs = {
+static const struct omap_gpio_reg_offs omap2_gpio_regs = {
 	.revision =		OMAP24XX_GPIO_REVISION,
 	.direction =		OMAP24XX_GPIO_OE,
 	.datain =		OMAP24XX_GPIO_DATAIN,
@@ -1290,7 +1290,7 @@ static struct omap_gpio_reg_offs omap2_gpio_regs = {
 	.fallingdetect =	OMAP24XX_GPIO_FALLINGDETECT,
 };
 
-static struct omap_gpio_reg_offs omap4_gpio_regs = {
+static const struct omap_gpio_reg_offs omap4_gpio_regs = {
 	.revision =		OMAP4_GPIO_REVISION,
 	.direction =		OMAP4_GPIO_OE,
 	.datain =		OMAP4_GPIO_DATAIN,
diff --git a/include/linux/platform_data/gpio-omap.h b/include/linux/platform_data/gpio-omap.h
index 7c36370c062e..1ca400005233 100644
--- a/include/linux/platform_data/gpio-omap.h
+++ b/include/linux/platform_data/gpio-omap.h
@@ -200,7 +200,7 @@ struct omap_gpio_platform_data {
 	bool is_mpuio;		/* whether the bank is of type MPUIO */
 	u32 non_wakeup_gpios;
 
-	struct omap_gpio_reg_offs *regs;
+	const struct omap_gpio_reg_offs *regs;
 
 	/* Return context loss count due to PM states changing */
 	int (*get_context_loss_count)(struct device *dev);
-- 
cgit v1.2.3


From 68bc30bb9f33fc8d11e3d110d29e06490896a999 Mon Sep 17 00:00:00 2001
From: Aubrey Li <aubrey.li@linux.intel.com>
Date: Thu, 6 Jun 2019 09:22:34 +0800
Subject: proc: Add /proc/<pid>/arch_status

Exposing architecture specific per process information is useful for
various reasons. An example is the AVX512 usage on x86 which is important
for task placement for power/performance optimizations.

Adding this information to the existing /proc/pid/status file would be the
obvious choice, but it has been agreed that an explicit arch_status file
is better at separating the generic and architecture specific information.

[ tglx: Massage changelog ]

Signed-off-by: Aubrey Li <aubrey.li@linux.intel.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Andrew Morton <akpm@linux-foundation.org>
Cc: peterz@infradead.org
Cc: hpa@zytor.com
Cc: ak@linux.intel.com
Cc: tim.c.chen@linux.intel.com
Cc: dave.hansen@intel.com
Cc: arjan@linux.intel.com
Cc: adobriyan@gmail.com
Cc: aubrey.li@intel.com
Cc: linux-api@vger.kernel.org
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Tim Chen <tim.c.chen@linux.intel.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Linux API <linux-api@vger.kernel.org>
Link: https://lkml.kernel.org/r/20190606012236.9391-1-aubrey.li@linux.intel.com
---
 fs/proc/Kconfig         | 4 ++++
 fs/proc/base.c          | 6 ++++++
 include/linux/proc_fs.h | 9 +++++++++
 3 files changed, 19 insertions(+)

(limited to 'include/linux')

diff --git a/fs/proc/Kconfig b/fs/proc/Kconfig
index 62ee41b4bbd0..4c3dcb718961 100644
--- a/fs/proc/Kconfig
+++ b/fs/proc/Kconfig
@@ -98,3 +98,7 @@ config PROC_CHILDREN
 
 	  Say Y if you are running any user-space software which takes benefit from
 	  this interface. For example, rkt is such a piece of software.
+
+config PROC_PID_ARCH_STATUS
+	def_bool n
+	depends on PROC_FS
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 9c8ca6cd3ce4..ec436c61eece 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -3061,6 +3061,9 @@ static const struct pid_entry tgid_base_stuff[] = {
 #ifdef CONFIG_STACKLEAK_METRICS
 	ONE("stack_depth", S_IRUGO, proc_stack_depth),
 #endif
+#ifdef CONFIG_PROC_PID_ARCH_STATUS
+	ONE("arch_status", S_IRUGO, proc_pid_arch_status),
+#endif
 };
 
 static int proc_tgid_base_readdir(struct file *file, struct dir_context *ctx)
@@ -3449,6 +3452,9 @@ static const struct pid_entry tid_base_stuff[] = {
 #ifdef CONFIG_LIVEPATCH
 	ONE("patch_state",  S_IRUSR, proc_pid_patch_state),
 #endif
+#ifdef CONFIG_PROC_PID_ARCH_STATUS
+	ONE("arch_status", S_IRUGO, proc_pid_arch_status),
+#endif
 };
 
 static int proc_tid_base_readdir(struct file *file, struct dir_context *ctx)
diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h
index 52a283ba0465..a705aa2d03f9 100644
--- a/include/linux/proc_fs.h
+++ b/include/linux/proc_fs.h
@@ -75,6 +75,15 @@ struct proc_dir_entry *proc_create_net_single_write(const char *name, umode_t mo
 						    void *data);
 extern struct pid *tgid_pidfd_to_pid(const struct file *file);
 
+#ifdef CONFIG_PROC_PID_ARCH_STATUS
+/*
+ * The architecture which selects CONFIG_PROC_PID_ARCH_STATUS must
+ * provide proc_pid_arch_status() definition.
+ */
+int proc_pid_arch_status(struct seq_file *m, struct pid_namespace *ns,
+			struct pid *pid, struct task_struct *task);
+#endif /* CONFIG_PROC_PID_ARCH_STATUS */
+
 #else /* CONFIG_PROC_FS */
 
 static inline void proc_root_init(void)
-- 
cgit v1.2.3


From 0b673b6486998061b0489b09447ebe8452da0146 Mon Sep 17 00:00:00 2001
From: Florian Fainelli <f.fainelli@gmail.com>
Date: Wed, 8 May 2019 11:46:34 -0700
Subject: firmware: arm_scmi: fetch and store sensor scale

In preparation for dealing with scales within the SCMI HWMON driver,
fetch and store the sensor unit scale in the scmi_sensor_info structure.
To simplify computations for the upper layers, sign-extend the 5-bit
scale field to a full 8-bit signed value.

Reviewed-by: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
[sudeep.holla: update bitfield values as per specification]
Signed-off-by: Sudeep Holla <sudeep.holla@arm.com>
---
 drivers/firmware/arm_scmi/sensors.c | 6 ++++++
 include/linux/scmi_protocol.h       | 1 +
 2 files changed, 7 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/firmware/arm_scmi/sensors.c b/drivers/firmware/arm_scmi/sensors.c
index c00287b5f2c2..0e94ab56f679 100644
--- a/drivers/firmware/arm_scmi/sensors.c
+++ b/drivers/firmware/arm_scmi/sensors.c
@@ -34,6 +34,8 @@ struct scmi_msg_resp_sensor_description {
 		__le32 attributes_high;
 #define SENSOR_TYPE(x)		((x) & 0xff)
 #define SENSOR_SCALE(x)		(((x) >> 11) & 0x1f)
+#define SENSOR_SCALE_SIGN	BIT(4)
+#define SENSOR_SCALE_EXTEND	GENMASK(7, 5)
 #define SENSOR_UPDATE_SCALE(x)	(((x) >> 22) & 0x1f)
 #define SENSOR_UPDATE_BASE(x)	(((x) >> 27) & 0x1f)
 		    u8 name[SCMI_MAX_STR_SIZE];
@@ -140,6 +142,10 @@ static int scmi_sensor_description_get(const struct scmi_handle *handle,
 			s = &si->sensors[desc_index + cnt];
 			s->id = le32_to_cpu(buf->desc[cnt].id);
 			s->type = SENSOR_TYPE(attrh);
+			s->scale = SENSOR_SCALE(attrh);
+			/* Sign extend to a full s8 */
+			if (s->scale & SENSOR_SCALE_SIGN)
+				s->scale |= SENSOR_SCALE_EXTEND;
 			strlcpy(s->name, buf->desc[cnt].name, SCMI_MAX_STR_SIZE);
 		}
 
diff --git a/include/linux/scmi_protocol.h b/include/linux/scmi_protocol.h
index 3105055c00a7..9ff2e9357e9a 100644
--- a/include/linux/scmi_protocol.h
+++ b/include/linux/scmi_protocol.h
@@ -144,6 +144,7 @@ struct scmi_power_ops {
 struct scmi_sensor_info {
 	u32 id;
 	u8 type;
+	s8 scale;
 	char name[SCMI_MAX_STR_SIZE];
 };
 
-- 
cgit v1.2.3


From 81f4458c9c6998fcd37c427d16d76d4dba65d015 Mon Sep 17 00:00:00 2001
From: Tero Kristo <t-kristo@ti.com>
Date: Tue, 28 May 2019 16:10:24 +0300
Subject: firmware: ti_sci: extend clock identifiers from u8 to u32

Future SoCs are going to have more than 255 device clocks in certain cases,
and thus the API must be extended to support this. The support is added as
a backwards-compatible extension, in which the new u32 clock identifier
fields are only used if the existing u8 clock identifier is set to 255. In
all other cases, the existing u8 clock identifier is used. As the size of
the messages sent / received is not verified for existing devices / old
firmware, increasing the size of the messages from the end is also fine.
For this reason, depending on the ABI version isn't necessary either.

Acked-by: Santosh Shilimkar <ssantosh@kernel.org>
Signed-off-by: Tero Kristo <t-kristo@ti.com>
---
 drivers/firmware/ti_sci.c              | 115 ++++++++++++++++++++++++---------
 drivers/firmware/ti_sci.h              |  63 ++++++++++++++----
 include/linux/soc/ti/ti_sci_protocol.h |  28 ++++----
 3 files changed, 150 insertions(+), 56 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/firmware/ti_sci.c b/drivers/firmware/ti_sci.c
index ef93406ace1b..b417cef35769 100644
--- a/drivers/firmware/ti_sci.c
+++ b/drivers/firmware/ti_sci.c
@@ -916,7 +916,7 @@ static int ti_sci_cmd_get_device_resets(const struct ti_sci_handle *handle,
  * Return: 0 if all went well, else returns appropriate error value.
  */
 static int ti_sci_set_clock_state(const struct ti_sci_handle *handle,
-				  u32 dev_id, u8 clk_id,
+				  u32 dev_id, u32 clk_id,
 				  u32 flags, u8 state)
 {
 	struct ti_sci_info *info;
@@ -944,7 +944,12 @@ static int ti_sci_set_clock_state(const struct ti_sci_handle *handle,
 	}
 	req = (struct ti_sci_msg_req_set_clock_state *)xfer->xfer_buf;
 	req->dev_id = dev_id;
-	req->clk_id = clk_id;
+	if (clk_id < 255) {
+		req->clk_id = clk_id;
+	} else {
+		req->clk_id = 255;
+		req->clk_id_32 = clk_id;
+	}
 	req->request_state = state;
 
 	ret = ti_sci_do_xfer(info, xfer);
@@ -976,7 +981,7 @@ fail:
  * Return: 0 if all went well, else returns appropriate error value.
  */
 static int ti_sci_cmd_get_clock_state(const struct ti_sci_handle *handle,
-				      u32 dev_id, u8 clk_id,
+				      u32 dev_id, u32 clk_id,
 				      u8 *programmed_state, u8 *current_state)
 {
 	struct ti_sci_info *info;
@@ -1007,7 +1012,12 @@ static int ti_sci_cmd_get_clock_state(const struct ti_sci_handle *handle,
 	}
 	req = (struct ti_sci_msg_req_get_clock_state *)xfer->xfer_buf;
 	req->dev_id = dev_id;
-	req->clk_id = clk_id;
+	if (clk_id < 255) {
+		req->clk_id = clk_id;
+	} else {
+		req->clk_id = 255;
+		req->clk_id_32 = clk_id;
+	}
 
 	ret = ti_sci_do_xfer(info, xfer);
 	if (ret) {
@@ -1047,8 +1057,8 @@ fail:
  * Return: 0 if all went well, else returns appropriate error value.
  */
 static int ti_sci_cmd_get_clock(const struct ti_sci_handle *handle, u32 dev_id,
-				u8 clk_id, bool needs_ssc, bool can_change_freq,
-				bool enable_input_term)
+				u32 clk_id, bool needs_ssc,
+				bool can_change_freq, bool enable_input_term)
 {
 	u32 flags = 0;
 
@@ -1073,7 +1083,7 @@ static int ti_sci_cmd_get_clock(const struct ti_sci_handle *handle, u32 dev_id,
  * Return: 0 if all went well, else returns appropriate error value.
  */
 static int ti_sci_cmd_idle_clock(const struct ti_sci_handle *handle,
-				 u32 dev_id, u8 clk_id)
+				 u32 dev_id, u32 clk_id)
 {
 	return ti_sci_set_clock_state(handle, dev_id, clk_id, 0,
 				      MSG_CLOCK_SW_STATE_UNREQ);
@@ -1092,7 +1102,7 @@ static int ti_sci_cmd_idle_clock(const struct ti_sci_handle *handle,
  * Return: 0 if all went well, else returns appropriate error value.
  */
 static int ti_sci_cmd_put_clock(const struct ti_sci_handle *handle,
-				u32 dev_id, u8 clk_id)
+				u32 dev_id, u32 clk_id)
 {
 	return ti_sci_set_clock_state(handle, dev_id, clk_id, 0,
 				      MSG_CLOCK_SW_STATE_AUTO);
@@ -1110,7 +1120,7 @@ static int ti_sci_cmd_put_clock(const struct ti_sci_handle *handle,
  * Return: 0 if all went well, else returns appropriate error value.
  */
 static int ti_sci_cmd_clk_is_auto(const struct ti_sci_handle *handle,
-				  u32 dev_id, u8 clk_id, bool *req_state)
+				  u32 dev_id, u32 clk_id, bool *req_state)
 {
 	u8 state = 0;
 	int ret;
@@ -1139,7 +1149,7 @@ static int ti_sci_cmd_clk_is_auto(const struct ti_sci_handle *handle,
  * Return: 0 if all went well, else returns appropriate error value.
  */
 static int ti_sci_cmd_clk_is_on(const struct ti_sci_handle *handle, u32 dev_id,
-				u8 clk_id, bool *req_state, bool *curr_state)
+				u32 clk_id, bool *req_state, bool *curr_state)
 {
 	u8 c_state = 0, r_state = 0;
 	int ret;
@@ -1172,7 +1182,7 @@ static int ti_sci_cmd_clk_is_on(const struct ti_sci_handle *handle, u32 dev_id,
  * Return: 0 if all went well, else returns appropriate error value.
  */
 static int ti_sci_cmd_clk_is_off(const struct ti_sci_handle *handle, u32 dev_id,
-				 u8 clk_id, bool *req_state, bool *curr_state)
+				 u32 clk_id, bool *req_state, bool *curr_state)
 {
 	u8 c_state = 0, r_state = 0;
 	int ret;
@@ -1204,7 +1214,7 @@ static int ti_sci_cmd_clk_is_off(const struct ti_sci_handle *handle, u32 dev_id,
  * Return: 0 if all went well, else returns appropriate error value.
  */
 static int ti_sci_cmd_clk_set_parent(const struct ti_sci_handle *handle,
-				     u32 dev_id, u8 clk_id, u8 parent_id)
+				     u32 dev_id, u32 clk_id, u32 parent_id)
 {
 	struct ti_sci_info *info;
 	struct ti_sci_msg_req_set_clock_parent *req;
@@ -1231,8 +1241,18 @@ static int ti_sci_cmd_clk_set_parent(const struct ti_sci_handle *handle,
 	}
 	req = (struct ti_sci_msg_req_set_clock_parent *)xfer->xfer_buf;
 	req->dev_id = dev_id;
-	req->clk_id = clk_id;
-	req->parent_id = parent_id;
+	if (clk_id < 255) {
+		req->clk_id = clk_id;
+	} else {
+		req->clk_id = 255;
+		req->clk_id_32 = clk_id;
+	}
+	if (parent_id < 255) {
+		req->parent_id = parent_id;
+	} else {
+		req->parent_id = 255;
+		req->parent_id_32 = parent_id;
+	}
 
 	ret = ti_sci_do_xfer(info, xfer);
 	if (ret) {
@@ -1262,7 +1282,7 @@ fail:
  * Return: 0 if all went well, else returns appropriate error value.
  */
 static int ti_sci_cmd_clk_get_parent(const struct ti_sci_handle *handle,
-				     u32 dev_id, u8 clk_id, u8 *parent_id)
+				     u32 dev_id, u32 clk_id, u32 *parent_id)
 {
 	struct ti_sci_info *info;
 	struct ti_sci_msg_req_get_clock_parent *req;
@@ -1289,7 +1309,12 @@ static int ti_sci_cmd_clk_get_parent(const struct ti_sci_handle *handle,
 	}
 	req = (struct ti_sci_msg_req_get_clock_parent *)xfer->xfer_buf;
 	req->dev_id = dev_id;
-	req->clk_id = clk_id;
+	if (clk_id < 255) {
+		req->clk_id = clk_id;
+	} else {
+		req->clk_id = 255;
+		req->clk_id_32 = clk_id;
+	}
 
 	ret = ti_sci_do_xfer(info, xfer);
 	if (ret) {
@@ -1299,10 +1324,14 @@ static int ti_sci_cmd_clk_get_parent(const struct ti_sci_handle *handle,
 
 	resp = (struct ti_sci_msg_resp_get_clock_parent *)xfer->xfer_buf;
 
-	if (!ti_sci_is_response_ack(resp))
+	if (!ti_sci_is_response_ack(resp)) {
 		ret = -ENODEV;
-	else
-		*parent_id = resp->parent_id;
+	} else {
+		if (resp->parent_id < 255)
+			*parent_id = resp->parent_id;
+		else
+			*parent_id = resp->parent_id_32;
+	}
 
 fail:
 	ti_sci_put_one_xfer(&info->minfo, xfer);
@@ -1322,8 +1351,8 @@ fail:
  * Return: 0 if all went well, else returns appropriate error value.
  */
 static int ti_sci_cmd_clk_get_num_parents(const struct ti_sci_handle *handle,
-					  u32 dev_id, u8 clk_id,
-					  u8 *num_parents)
+					  u32 dev_id, u32 clk_id,
+					  u32 *num_parents)
 {
 	struct ti_sci_info *info;
 	struct ti_sci_msg_req_get_clock_num_parents *req;
@@ -1350,7 +1379,12 @@ static int ti_sci_cmd_clk_get_num_parents(const struct ti_sci_handle *handle,
 	}
 	req = (struct ti_sci_msg_req_get_clock_num_parents *)xfer->xfer_buf;
 	req->dev_id = dev_id;
-	req->clk_id = clk_id;
+	if (clk_id < 255) {
+		req->clk_id = clk_id;
+	} else {
+		req->clk_id = 255;
+		req->clk_id_32 = clk_id;
+	}
 
 	ret = ti_sci_do_xfer(info, xfer);
 	if (ret) {
@@ -1360,10 +1394,14 @@ static int ti_sci_cmd_clk_get_num_parents(const struct ti_sci_handle *handle,
 
 	resp = (struct ti_sci_msg_resp_get_clock_num_parents *)xfer->xfer_buf;
 
-	if (!ti_sci_is_response_ack(resp))
+	if (!ti_sci_is_response_ack(resp)) {
 		ret = -ENODEV;
-	else
-		*num_parents = resp->num_parents;
+	} else {
+		if (resp->num_parents < 255)
+			*num_parents = resp->num_parents;
+		else
+			*num_parents = resp->num_parents_32;
+	}
 
 fail:
 	ti_sci_put_one_xfer(&info->minfo, xfer);
@@ -1391,7 +1429,7 @@ fail:
  * Return: 0 if all went well, else returns appropriate error value.
  */
 static int ti_sci_cmd_clk_get_match_freq(const struct ti_sci_handle *handle,
-					 u32 dev_id, u8 clk_id, u64 min_freq,
+					 u32 dev_id, u32 clk_id, u64 min_freq,
 					 u64 target_freq, u64 max_freq,
 					 u64 *match_freq)
 {
@@ -1420,7 +1458,12 @@ static int ti_sci_cmd_clk_get_match_freq(const struct ti_sci_handle *handle,
 	}
 	req = (struct ti_sci_msg_req_query_clock_freq *)xfer->xfer_buf;
 	req->dev_id = dev_id;
-	req->clk_id = clk_id;
+	if (clk_id < 255) {
+		req->clk_id = clk_id;
+	} else {
+		req->clk_id = 255;
+		req->clk_id_32 = clk_id;
+	}
 	req->min_freq_hz = min_freq;
 	req->target_freq_hz = target_freq;
 	req->max_freq_hz = max_freq;
@@ -1463,7 +1506,7 @@ fail:
  * Return: 0 if all went well, else returns appropriate error value.
  */
 static int ti_sci_cmd_clk_set_freq(const struct ti_sci_handle *handle,
-				   u32 dev_id, u8 clk_id, u64 min_freq,
+				   u32 dev_id, u32 clk_id, u64 min_freq,
 				   u64 target_freq, u64 max_freq)
 {
 	struct ti_sci_info *info;
@@ -1491,7 +1534,12 @@ static int ti_sci_cmd_clk_set_freq(const struct ti_sci_handle *handle,
 	}
 	req = (struct ti_sci_msg_req_set_clock_freq *)xfer->xfer_buf;
 	req->dev_id = dev_id;
-	req->clk_id = clk_id;
+	if (clk_id < 255) {
+		req->clk_id = clk_id;
+	} else {
+		req->clk_id = 255;
+		req->clk_id_32 = clk_id;
+	}
 	req->min_freq_hz = min_freq;
 	req->target_freq_hz = target_freq;
 	req->max_freq_hz = max_freq;
@@ -1524,7 +1572,7 @@ fail:
  * Return: 0 if all went well, else returns appropriate error value.
  */
 static int ti_sci_cmd_clk_get_freq(const struct ti_sci_handle *handle,
-				   u32 dev_id, u8 clk_id, u64 *freq)
+				   u32 dev_id, u32 clk_id, u64 *freq)
 {
 	struct ti_sci_info *info;
 	struct ti_sci_msg_req_get_clock_freq *req;
@@ -1551,7 +1599,12 @@ static int ti_sci_cmd_clk_get_freq(const struct ti_sci_handle *handle,
 	}
 	req = (struct ti_sci_msg_req_get_clock_freq *)xfer->xfer_buf;
 	req->dev_id = dev_id;
-	req->clk_id = clk_id;
+	if (clk_id < 255) {
+		req->clk_id = clk_id;
+	} else {
+		req->clk_id = 255;
+		req->clk_id_32 = clk_id;
+	}
 
 	ret = ti_sci_do_xfer(info, xfer);
 	if (ret) {
diff --git a/drivers/firmware/ti_sci.h b/drivers/firmware/ti_sci.h
index 4983827151bf..ad0b47981b87 100644
--- a/drivers/firmware/ti_sci.h
+++ b/drivers/firmware/ti_sci.h
@@ -202,7 +202,8 @@ struct ti_sci_msg_req_set_device_resets {
  * @dev_id:	Device identifier this request is for
  * @clk_id:	Clock identifier for the device for this request.
  *		Each device has it's own set of clock inputs. This indexes
- *		which clock input to modify.
+ *		which clock input to modify. Set to 255 if clock ID is
+ *		greater than or equal to 255.
  * @request_state: Request the state for the clock to be set to.
  *		MSG_CLOCK_SW_STATE_UNREQ: The IP does not require this clock,
  *		it can be disabled, regardless of the state of the device
@@ -213,6 +214,9 @@ struct ti_sci_msg_req_set_device_resets {
  *		being required by the device.(default)
  *		MSG_CLOCK_SW_STATE_REQ:  Configure the clock to be enabled,
  *		regardless of the state of the device.
+ * @clk_id_32:	Clock identifier for the device for this request.
+ *		Only to be used if the clock ID is greater than or equal to
+ *		255.
  *
  * Normally, all required clocks are managed by TISCI entity, this is used
  * only for specific control *IF* required. Auto managed state is
@@ -234,6 +238,7 @@ struct ti_sci_msg_req_set_clock_state {
 #define MSG_CLOCK_SW_STATE_AUTO		1
 #define MSG_CLOCK_SW_STATE_REQ		2
 	u8 request_state;
+	u32 clk_id_32;
 } __packed;
 
 /**
@@ -242,7 +247,11 @@ struct ti_sci_msg_req_set_clock_state {
  * @dev_id:	Device identifier this request is for
  * @clk_id:	Clock identifier for the device for this request.
  *		Each device has it's own set of clock inputs. This indexes
- *		which clock input to get state of.
+ *		which clock input to get state of. Set to 255 if the clock
+ *		ID is greater than or equal to 255.
+ * @clk_id_32:	Clock identifier for the device for the request.
+ *		Only to be used if the clock ID is greater than or equal to
+ *		255.
  *
  * Request type is TI_SCI_MSG_GET_CLOCK_STATE, response is state
  * of the clock
@@ -251,6 +260,7 @@ struct ti_sci_msg_req_get_clock_state {
 	struct ti_sci_msg_hdr hdr;
 	u32 dev_id;
 	u8 clk_id;
+	u32 clk_id_32;
 } __packed;
 
 /**
@@ -278,9 +288,13 @@ struct ti_sci_msg_resp_get_clock_state {
  * @dev_id:	Device identifier this request is for
  * @clk_id:	Clock identifier for the device for this request.
  *		Each device has it's own set of clock inputs. This indexes
- *		which clock input to modify.
+ *		which clock input to modify. Set to 255 if clock ID is
+ *		greater than or equal to 255.
  * @parent_id:	The new clock parent is selectable by an index via this
- *		parameter.
+ *		parameter. Set to 255 if parent ID is greater than or
+ *		equal to 255.
+ * @clk_id_32:	Clock identifier if @clk_id field is 255.
+ * @parent_id_32:	Parent identifier if @parent_id is 255.
  *
  * Request type is TI_SCI_MSG_SET_CLOCK_PARENT, response is generic
  * ACK / NACK message.
@@ -290,6 +304,8 @@ struct ti_sci_msg_req_set_clock_parent {
 	u32 dev_id;
 	u8 clk_id;
 	u8 parent_id;
+	u32 clk_id_32;
+	u32 parent_id_32;
 } __packed;
 
 /**
@@ -298,7 +314,10 @@ struct ti_sci_msg_req_set_clock_parent {
  * @dev_id:	Device identifier this request is for
  * @clk_id:	Clock identifier for the device for this request.
  *		Each device has it's own set of clock inputs. This indexes
- *		which clock input to get the parent for.
+ *		which clock input to get the parent for. If this field
+ *		contains 255, the actual clock identifier is stored in
+ *		@clk_id_32.
+ * @clk_id_32:	Clock identifier if the @clk_id field contains 255.
  *
  * Request type is TI_SCI_MSG_GET_CLOCK_PARENT, response is parent information
  */
@@ -306,25 +325,32 @@ struct ti_sci_msg_req_get_clock_parent {
 	struct ti_sci_msg_hdr hdr;
 	u32 dev_id;
 	u8 clk_id;
+	u32 clk_id_32;
 } __packed;
 
 /**
  * struct ti_sci_msg_resp_get_clock_parent - Response with clock parent
  * @hdr:	Generic Header
- * @parent_id:	The current clock parent
+ * @parent_id:	The current clock parent. If set to 255, the current parent
+ *		ID can be found from the @parent_id_32 field.
+ * @parent_id_32:	Current clock parent if @parent_id field is set to
+ *			255.
  *
  * Response to TI_SCI_MSG_GET_CLOCK_PARENT.
  */
 struct ti_sci_msg_resp_get_clock_parent {
 	struct ti_sci_msg_hdr hdr;
 	u8 parent_id;
+	u32 parent_id_32;
 } __packed;
 
 /**
  * struct ti_sci_msg_req_get_clock_num_parents - Request to get clock parents
  * @hdr:	Generic header
  * @dev_id:	Device identifier this request is for
- * @clk_id:	Clock identifier for the device for this request.
+ * @clk_id:	Clock identifier for the device for this request. Set to
+ *		255 if clock ID is greater than or equal to 255.
+ * @clk_id_32:	Clock identifier if the @clk_id field contains 255.
  *
  * This request provides information about how many clock parent options
  * are available for a given clock to a device. This is typically used
@@ -337,18 +363,24 @@ struct ti_sci_msg_req_get_clock_num_parents {
 	struct ti_sci_msg_hdr hdr;
 	u32 dev_id;
 	u8 clk_id;
+	u32 clk_id_32;
 } __packed;
 
 /**
  * struct ti_sci_msg_resp_get_clock_num_parents - Response for get clk parents
  * @hdr:		Generic header
- * @num_parents:	Number of clock parents
+ * @num_parents:	Number of clock parents. If set to 255, the actual
+ *			number of parents is stored into @num_parents_32
+ *			field instead.
+ * @num_parents_32:	Number of clock parents if @num_parents field is
+ *			set to 255.
  *
  * Response to TI_SCI_MSG_GET_NUM_CLOCK_PARENTS
  */
 struct ti_sci_msg_resp_get_clock_num_parents {
 	struct ti_sci_msg_hdr hdr;
 	u8 num_parents;
+	u32 num_parents_32;
 } __packed;
 
 /**
@@ -363,7 +395,9 @@ struct ti_sci_msg_resp_get_clock_num_parents {
  * @max_freq_hz: The maximum allowable frequency in Hz. This is the maximum
  *		allowable programmed frequency and does not account for clock
  *		tolerances and jitter.
- * @clk_id:	Clock identifier for the device for this request.
+ * @clk_id:	Clock identifier for the device for this request. Set to
+ *		255 if clock identifier is greater than or equal to 255.
+ * @clk_id_32:	Clock identifier if @clk_id is set to 255.
  *
  * NOTE: Normally clock frequency management is automatically done by TISCI
  * entity. In case of specific requests, TISCI evaluates capability to achieve
@@ -380,6 +414,7 @@ struct ti_sci_msg_req_query_clock_freq {
 	u64 target_freq_hz;
 	u64 max_freq_hz;
 	u8 clk_id;
+	u32 clk_id_32;
 } __packed;
 
 /**
@@ -407,7 +442,9 @@ struct ti_sci_msg_resp_query_clock_freq {
  * @max_freq_hz: The maximum allowable frequency in Hz. This is the maximum
  *		allowable programmed frequency and does not account for clock
  *		tolerances and jitter.
- * @clk_id:	Clock identifier for the device for this request.
+ * @clk_id:	Clock identifier for the device for this request. Set to
+ *		255 if clock ID is greater than or equal to 255.
+ * @clk_id_32:	Clock identifier if @clk_id field is set to 255.
  *
  * NOTE: Normally clock frequency management is automatically done by TISCI
  * entity. In case of specific requests, TISCI evaluates capability to achieve
@@ -436,13 +473,16 @@ struct ti_sci_msg_req_set_clock_freq {
 	u64 target_freq_hz;
 	u64 max_freq_hz;
 	u8 clk_id;
+	u32 clk_id_32;
 } __packed;
 
 /**
  * struct ti_sci_msg_req_get_clock_freq - Request to get the clock frequency
  * @hdr:	Generic Header
  * @dev_id:	Device identifier this request is for
- * @clk_id:	Clock identifier for the device for this request.
+ * @clk_id:	Clock identifier for the device for this request. Set to
+ *		255 if clock ID is greater than or equal to 255.
+ * @clk_id_32:	Clock identifier if @clk_id field is set to 255.
  *
  * NOTE: Normally clock frequency management is automatically done by TISCI
  * entity. In some cases, clock frequencies are configured by host.
@@ -454,6 +494,7 @@ struct ti_sci_msg_req_get_clock_freq {
 	struct ti_sci_msg_hdr hdr;
 	u32 dev_id;
 	u8 clk_id;
+	u32 clk_id_32;
 } __packed;
 
 /**
diff --git a/include/linux/soc/ti/ti_sci_protocol.h b/include/linux/soc/ti/ti_sci_protocol.h
index 568722a041bf..406e6717d252 100644
--- a/include/linux/soc/ti/ti_sci_protocol.h
+++ b/include/linux/soc/ti/ti_sci_protocol.h
@@ -166,29 +166,29 @@ struct ti_sci_dev_ops {
  * managed by driver for that purpose.
  */
 struct ti_sci_clk_ops {
-	int (*get_clock)(const struct ti_sci_handle *handle, u32 did, u8 cid,
+	int (*get_clock)(const struct ti_sci_handle *handle, u32 did, u32 cid,
 			 bool needs_ssc, bool can_change_freq,
 			 bool enable_input_term);
-	int (*idle_clock)(const struct ti_sci_handle *handle, u32 did, u8 cid);
-	int (*put_clock)(const struct ti_sci_handle *handle, u32 did, u8 cid);
-	int (*is_auto)(const struct ti_sci_handle *handle, u32 did, u8 cid,
+	int (*idle_clock)(const struct ti_sci_handle *handle, u32 did, u32 cid);
+	int (*put_clock)(const struct ti_sci_handle *handle, u32 did, u32 cid);
+	int (*is_auto)(const struct ti_sci_handle *handle, u32 did, u32 cid,
 		       bool *req_state);
-	int (*is_on)(const struct ti_sci_handle *handle, u32 did, u8 cid,
+	int (*is_on)(const struct ti_sci_handle *handle, u32 did, u32 cid,
 		     bool *req_state, bool *current_state);
-	int (*is_off)(const struct ti_sci_handle *handle, u32 did, u8 cid,
+	int (*is_off)(const struct ti_sci_handle *handle, u32 did, u32 cid,
 		      bool *req_state, bool *current_state);
-	int (*set_parent)(const struct ti_sci_handle *handle, u32 did, u8 cid,
-			  u8 parent_id);
-	int (*get_parent)(const struct ti_sci_handle *handle, u32 did, u8 cid,
-			  u8 *parent_id);
+	int (*set_parent)(const struct ti_sci_handle *handle, u32 did, u32 cid,
+			  u32 parent_id);
+	int (*get_parent)(const struct ti_sci_handle *handle, u32 did, u32 cid,
+			  u32 *parent_id);
 	int (*get_num_parents)(const struct ti_sci_handle *handle, u32 did,
-			       u8 cid, u8 *num_parents);
+			       u32 cid, u32 *num_parents);
 	int (*get_best_match_freq)(const struct ti_sci_handle *handle, u32 did,
-				   u8 cid, u64 min_freq, u64 target_freq,
+				   u32 cid, u64 min_freq, u64 target_freq,
 				   u64 max_freq, u64 *match_freq);
-	int (*set_freq)(const struct ti_sci_handle *handle, u32 did, u8 cid,
+	int (*set_freq)(const struct ti_sci_handle *handle, u32 did, u32 cid,
 			u64 min_freq, u64 target_freq, u64 max_freq);
-	int (*get_freq)(const struct ti_sci_handle *handle, u32 did, u8 cid,
+	int (*get_freq)(const struct ti_sci_handle *handle, u32 did, u32 cid,
 			u64 *current_freq);
 };
 
-- 
cgit v1.2.3


From 6a80b30086b861b2591ba2a953042abd08c498e3 Mon Sep 17 00:00:00 2001
From: Linus Walleij <linus.walleij@linaro.org>
Date: Mon, 10 Jun 2019 16:04:39 +0200
Subject: fmc: Delete the FMC subsystem

The FMC subsystem was created in 2012 with the ambition to
drive development of drivers for this hardware upstream.

The current implementation has architectural flaws and would
need to be revamped using real hardware to something that can
reuse existing kernel abstractions in the subsystems for e.g.
I2C, FPGA and GPIO.

We have concluded that for the mainline kernel it will be
better to delete the subsystem and start over with a clean
slate when/if an active maintainer steps up.

For details see:
https://lkml.org/lkml/2018/10/29/534

Suggested-by: Federico Vaga <federico.vaga@cern.ch>
Cc: Pat Riehecky <riehecky@fnal.gov>
Acked-by: Alessandro Rubini <rubini@gnudd.com>
Signed-off-by: Federico Vaga <federico.vaga@cern.ch>
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
---
 Documentation/fmc/API.txt              |  47 ----
 Documentation/fmc/FMC-and-SDB.txt      |  88 --------
 Documentation/fmc/carrier.txt          | 311 --------------------------
 Documentation/fmc/fmc-chardev.txt      |  64 ------
 Documentation/fmc/fmc-fakedev.txt      |  36 ---
 Documentation/fmc/fmc-trivial.txt      |  17 --
 Documentation/fmc/fmc-write-eeprom.txt |  98 ---------
 Documentation/fmc/identifiers.txt      | 168 --------------
 Documentation/fmc/mezzanine.txt        | 123 -----------
 Documentation/fmc/parameters.txt       |  56 -----
 drivers/Kconfig                        |   2 -
 drivers/Makefile                       |   1 -
 drivers/fmc/Kconfig                    |  52 -----
 drivers/fmc/Makefile                   |  15 --
 drivers/fmc/fmc-chardev.c              | 199 -----------------
 drivers/fmc/fmc-core.c                 | 388 ---------------------------------
 drivers/fmc/fmc-debug.c                | 172 ---------------
 drivers/fmc/fmc-dump.c                 |  58 -----
 drivers/fmc/fmc-fakedev.c              | 355 ------------------------------
 drivers/fmc/fmc-match.c                | 113 ----------
 drivers/fmc/fmc-private.h              |   8 -
 drivers/fmc/fmc-sdb.c                  | 219 -------------------
 drivers/fmc/fmc-trivial.c              | 102 ---------
 drivers/fmc/fmc-write-eeprom.c         | 175 ---------------
 drivers/fmc/fru-parse.c                |  80 -------
 include/linux/fmc-sdb.h                |  39 ----
 include/linux/fmc.h                    | 271 -----------------------
 27 files changed, 3257 deletions(-)
 delete mode 100644 Documentation/fmc/API.txt
 delete mode 100644 Documentation/fmc/FMC-and-SDB.txt
 delete mode 100644 Documentation/fmc/carrier.txt
 delete mode 100644 Documentation/fmc/fmc-chardev.txt
 delete mode 100644 Documentation/fmc/fmc-fakedev.txt
 delete mode 100644 Documentation/fmc/fmc-trivial.txt
 delete mode 100644 Documentation/fmc/fmc-write-eeprom.txt
 delete mode 100644 Documentation/fmc/identifiers.txt
 delete mode 100644 Documentation/fmc/mezzanine.txt
 delete mode 100644 Documentation/fmc/parameters.txt
 delete mode 100644 drivers/fmc/Kconfig
 delete mode 100644 drivers/fmc/Makefile
 delete mode 100644 drivers/fmc/fmc-chardev.c
 delete mode 100644 drivers/fmc/fmc-core.c
 delete mode 100644 drivers/fmc/fmc-debug.c
 delete mode 100644 drivers/fmc/fmc-dump.c
 delete mode 100644 drivers/fmc/fmc-fakedev.c
 delete mode 100644 drivers/fmc/fmc-match.c
 delete mode 100644 drivers/fmc/fmc-private.h
 delete mode 100644 drivers/fmc/fmc-sdb.c
 delete mode 100644 drivers/fmc/fmc-trivial.c
 delete mode 100644 drivers/fmc/fmc-write-eeprom.c
 delete mode 100644 drivers/fmc/fru-parse.c
 delete mode 100644 include/linux/fmc-sdb.h
 delete mode 100644 include/linux/fmc.h

(limited to 'include/linux')

diff --git a/Documentation/fmc/API.txt b/Documentation/fmc/API.txt
deleted file mode 100644
index 06b06b92c794..000000000000
--- a/Documentation/fmc/API.txt
+++ /dev/null
@@ -1,47 +0,0 @@
-Functions Exported by fmc.ko
-****************************
-
-The FMC core exports the usual 4 functions that are needed for a bus to
-work, and a few more:
-
-        int fmc_driver_register(struct fmc_driver *drv);
-        void fmc_driver_unregister(struct fmc_driver *drv);
-        int fmc_device_register(struct fmc_device *fmc);
-        void fmc_device_unregister(struct fmc_device *fmc);
-
-        int fmc_device_register_n(struct fmc_device **fmc, int n);
-        void fmc_device_unregister_n(struct fmc_device **fmc, int n);
-
-        uint32_t fmc_readl(struct fmc_device *fmc, int offset);
-        void fmc_writel(struct fmc_device *fmc, uint32_t val, int off);
-        void *fmc_get_drvdata(struct fmc_device *fmc);
-        void fmc_set_drvdata(struct fmc_device *fmc, void *data);
-
-        int fmc_reprogram(struct fmc_device *f, struct fmc_driver *d, char *gw,
-                          int sdb_entry);
-
-The data structure that describe a device is detailed in *note FMC
-Device::, the one that describes a driver is detailed in *note FMC
-Driver::.  Please note that structures of type fmc_device must be
-allocated by the caller, but must not be released after unregistering.
-The fmc-bus itself takes care of releasing the structure when their use
-count reaches zero - actually, the device model does that in lieu of us.
-
-The functions to register and unregister n devices are meant to be used
-by carriers that host more than one mezzanine. The devices must all be
-registered at the same time because if the FPGA is reprogrammed, all
-devices in the array are affected. Usually, the driver matching the
-first device will reprogram the FPGA, so other devices must know they
-are already driven by a reprogrammed FPGA.
-
-If a carrier hosts slots that are driven by different FPGA devices, it
-should register as a group only mezzanines that are driven by the same
-FPGA, for the reason outlined above.
-
-Finally, the fmc_reprogram function calls the reprogram method (see
-*note The API Offered by Carriers:: and also scans the memory area for
-an SDB tree. You can pass -1 as sdb_entry to disable such scan.
-Otherwise, the function fails if no tree is found at the specified
-entry point.  The function is meant to factorize common code, and by
-the time you read this it is already used by the spec-sw and fine-delay
-modules.
diff --git a/Documentation/fmc/FMC-and-SDB.txt b/Documentation/fmc/FMC-and-SDB.txt
deleted file mode 100644
index fa14e0b24521..000000000000
--- a/Documentation/fmc/FMC-and-SDB.txt
+++ /dev/null
@@ -1,88 +0,0 @@
-
-FMC (FPGA Mezzanine Card) is the standard we use for our I/O devices,
-in the context of White Rabbit and related hardware.
-
-In our I/O environments we need to write drivers for each mezzanine
-card, and such drivers must work regardless of the carrier being used.
-To achieve this, we abstract the FMC interface.
-
-We have a carrier for PCI-E called SPEC and one for VME called SVEC,
-but more are planned.  Also, we support stand-alone devices (usually
-plugged on a SPEC card), controlled through Etherbone, developed by GSI.
-
-Code and documentation for the FMC bus was born as part of the spec-sw
-project, but now it lives in its own project. Other projects, i.e.
-software support for the various carriers, should include this as a
-submodule.
-
-The most up to date version of code and documentation is always
-available from the repository you can clone from:
-
-        git://ohwr.org/fmc-projects/fmc-bus.git (read-only)
-        git@ohwr.org:fmc-projects/fmc-bus.git (read-write for developers)
-
-Selected versions of the documentation, as well as complete tar
-archives for selected revisions are placed to the Files section of the
-project: `http://www.ohwr.org/projects/fmc-bus/files'
-
-
-What is FMC
-***********
-
-FMC, as said, stands for "FPGA Mezzanine Card". It is a standard
-developed by the VME consortium called VITA (VMEbus International Trade
-Association) and ratified by ANSI, the American National Standards
-Institute.  The official documentation is called "ANSI-VITA 57.1".
-
-The FMC card is an almost square PCB, around 70x75 millimeters, that is
-called mezzanine in this document.  It usually lives plugged into
-another PCB for power supply and control; such bigger circuit board is
-called carrier from now on, and a single carrier may host more than one
-mezzanine.
-
-In the typical application the mezzanine is mostly analog while the
-carrier is mostly digital, and hosts an FPGA that must be configured to
-match the specific mezzanine and the desired application. Thus, you may
-need to load different FPGA images to drive different instances of the
-same mezzanine.
-
-FMC, as such, is not a bus in the usual meaning of the term, because
-most carriers have only one connector, and carriers with several
-connectors have completely separate electrical connections to them.
-This package, however, implements a bus as a software abstraction.
-
-
-What is SDB
-***********
-
-SDB (Self Describing Bus) is a set of data structures that we use for
-enumerating the internal structure of an FPGA image. We also use it as
-a filesystem inside the FMC EEPROM.
-
-SDB is not mandatory for use of this FMC kernel bus, but if you have SDB
-this package can make good use of it.  SDB itself is developed in the
-fpga-config-space OHWR project. The link to the repository is
-`git://ohwr.org/hdl-core-lib/fpga-config-space.git' and what is used in
-this project lives in the sdbfs subdirectory in there.
-
-SDB support for FMC is described in *note FMC Identification:: and
-*note SDB Support::
-
-
-SDB Support
-***********
-
-The fmc.ko bus driver exports a few functions to help drivers taking
-advantage of the SDB information that may be present in your own FPGA
-memory image.
-
-The module exports the following functions, in the special header
-<linux/fmc-sdb.h>. The linux/ prefix in the name is there because we
-plan to submit it upstream in the future, and don't want to force
-changes on our drivers if that happens.
-
-         int fmc_scan_sdb_tree(struct fmc_device *fmc, unsigned long address);
-         void fmc_show_sdb_tree(struct fmc_device *fmc);
-         signed long fmc_find_sdb_device(struct sdb_array *tree, uint64_t vendor,
-                                         uint32_t device, unsigned long *sz);
-         int fmc_free_sdb_tree(struct fmc_device *fmc);
diff --git a/Documentation/fmc/carrier.txt b/Documentation/fmc/carrier.txt
deleted file mode 100644
index 5e4f1dd3e98b..000000000000
--- a/Documentation/fmc/carrier.txt
+++ /dev/null
@@ -1,311 +0,0 @@
-FMC Device
-**********
-
-Within the Linux bus framework, the FMC device is created and
-registered by the carrier driver. For example, the PCI driver for the
-SPEC card fills a data structure for each SPEC that it drives, and
-registers an associated FMC device for each card.  The SVEC driver can
-do exactly the same for the VME carrier (actually, it should do it
-twice, because the SVEC carries two FMC mezzanines).  Similarly, an
-Etherbone driver will be able to register its own FMC devices, offering
-communication primitives through frame exchange.
-
-The contents of the EEPROM within the FMC are used for identification
-purposes, i.e. for matching the device with its own driver. For this
-reason the device structure includes a complete copy of the EEPROM
-(actually, the carrier driver may choose whether or not to return it -
-for example we most likely won't have the whole EEPROM available for
-Etherbone devices).
-
-The following listing shows the current structure defining a device.
-Please note that all the machinery is in place but some details may
-still change in the future.  For this reason, there is a version field
-at the beginning of the structure.  As usual, the minor number will
-change for compatible changes (like a new flag) and the major number
-will increase when an incompatible change happens (for example, a
-change in layout of some fmc data structures).  Device writers should
-just set it to the value FMC_VERSION, and be ready to get back -EINVAL
-at registration time.
-
-     struct fmc_device {
-             unsigned long version;
-             unsigned long flags;
-             struct module *owner;           /* char device must pin it */
-             struct fmc_fru_id id;           /* for EEPROM-based match */
-             struct fmc_operations *op;      /* carrier-provided */
-             int irq;                        /* according to host bus. 0 == none */
-             int eeprom_len;                 /* Usually 8kB, may be less */
-             int eeprom_addr;                /* 0x50, 0x52 etc */
-             uint8_t *eeprom;                /* Full contents or leading part */
-             char *carrier_name;             /* "SPEC" or similar, for special use */
-             void *carrier_data;             /* "struct spec *" or equivalent */
-             __iomem void *fpga_base;        /* May be NULL (Etherbone) */
-             __iomem void *slot_base;        /* Set by the driver */
-             struct fmc_device **devarray;   /* Allocated by the bus */
-             int slot_id;                    /* Index in the slot array */
-             int nr_slots;                   /* Number of slots in this carrier */
-             unsigned long memlen;           /* Used for the char device */
-             struct device dev;              /* For Linux use */
-             struct device *hwdev;           /* The underlying hardware device */
-             unsigned long sdbfs_entry;
-             struct sdb_array *sdb;
-             uint32_t device_id;             /* Filled by the device */
-             char *mezzanine_name;           /* Defaults to ``fmc'' */
-             void *mezzanine_data;
-     };
-
-The meaning of most fields is summarized in the code comment above.
-
-The following fields must be filled by the carrier driver before
-registration:
-
-   * version: must be set to FMC_VERSION.
-
-   * owner: set to MODULE_OWNER.
-
-   * op: the operations to act on the device.
-
-   * irq: number for the mezzanine; may be zero.
-
-   * eeprom_len: length of the following array.
-
-   * eeprom_addr: 0x50 for first mezzanine and so on.
-
-   * eeprom: the full content of the I2C EEPROM.
-
-   * carrier_name.
-
-   * carrier_data: a unique pointer for the carrier.
-
-   * fpga_base: the I/O memory address (may be NULL).
-
-   * slot_id: the index of this slot (starting from zero).
-
-   * memlen: if fpga_base is valid, the length of I/O memory.
-
-   * hwdev: to be used in some dev_err() calls.
-
-   * device_id: a slot-specific unique integer number.
-
-
-Please note that the carrier should read its own EEPROM memory before
-registering the device, as well as fill all other fields listed above.
-
-The following fields should not be assigned, because they are filled
-later by either the bus or the device driver:
-
-   * flags.
-
-   * fru_id: filled by the bus, parsing the eeprom.
-
-   * slot_base: filled and used by the driver, if useful to it.
-
-   * devarray: an array of all mezzanines driven by a single FPGA.
-
-   * nr_slots: set by the core at registration time.
-
-   * dev: used by Linux.
-
-   * sdb: FPGA contents, scanned according to driver's directions.
-
-   * sdbfs_entry: SDB entry point in EEPROM: autodetected.
-
-   * mezzanine_data: available for the driver.
-
-   * mezzanine_name: filled by fmc-bus during identification.
-
-
-Note: mezzanine_data may be redundant, because Linux offers the drvdata
-approach, so the field may be removed in later versions of this bus
-implementation.
-
-As I write this, the SPEC carrier is already completely functional in
-the fmc-bus environment, and is a good reference to look at.
-
-
-The API Offered by Carriers
-===========================
-
-The carrier provides a number of methods by means of the
-`fmc_operations' structure, which currently is defined like this
-(again, it is a moving target, please refer to the header rather than
-this document):
-
-     struct fmc_operations {
-             uint32_t (*readl)(struct fmc_device *fmc, int offset);
-             void (*writel)(struct fmc_device *fmc, uint32_t value, int offset);
-             int (*reprogram)(struct fmc_device *f, struct fmc_driver *d, char *gw);
-             int (*validate)(struct fmc_device *fmc, struct fmc_driver *drv);
-             int (*irq_request)(struct fmc_device *fmc, irq_handler_t h,
-                                char *name, int flags);
-             void (*irq_ack)(struct fmc_device *fmc);
-             int (*irq_free)(struct fmc_device *fmc);
-             int (*gpio_config)(struct fmc_device *fmc, struct fmc_gpio *gpio,
-                                int ngpio);
-             int (*read_ee)(struct fmc_device *fmc, int pos, void *d, int l);
-             int (*write_ee)(struct fmc_device *fmc, int pos, const void *d, int l);
-     };
-
-The individual methods perform the following tasks:
-
-`readl'
-`writel'
-     These functions access FPGA registers by whatever means the
-     carrier offers. They are not expected to fail, and most of the time
-     they will just make a memory access to the host bus. If the
-     carrier provides a fpga_base pointer, the driver may use direct
-     access through that pointer. For this reason the header offers the
-     inline functions fmc_readl and fmc_writel that access fpga_base if
-     the respective method is NULL. A driver that wants to be portable
-     and efficient should use fmc_readl and fmc_writel.  For Etherbone,
-     or other non-local carriers, error-management is still to be
-     defined.
-
-`validate'
-     Module parameters are used to manage different applications for
-     two or more boards of the same kind. Validation is based on the
-     busid module parameter, if provided, and returns the matching
-     index in the associated array. See *note Module Parameters:: if in
-     doubt. If no match is found, `-ENOENT' is returned; if the user
-     didn't pass `busid=', all devices will pass validation.  The value
-     returned by the validate method can be used as index into other
-     parameters (for example, some drivers use the `lm32=' parameter in
-     this way). Such "generic parameters" are documented in *note
-     Module Parameters::, below. The validate method is used by
-     `fmc-trivial.ko', described in *note fmc-trivial::.
-
-`reprogram'
-     The carrier enumerates FMC devices by loading a standard (or
-     golden) FPGA binary that allows EEPROM access. Each driver, then,
-     will need to reprogram the FPGA by calling this function.  If the
-     name argument is NULL, the carrier should reprogram the golden
-     binary. If the gateware name has been overridden through module
-     parameters (in a carrier-specific way) the file loaded will match
-     the parameters. Per-device gateware names can be specified using
-     the `gateware=' parameter, see *note Module Parameters::.  Note:
-     Clients should call the new helper, fmc_reprogram, which both
-     calls this method and parses the SDB tree of the FPGA.
-
-`irq_request'
-`irq_ack'
-`irq_free'
-     Interrupt management is carrier-specific, so it is abstracted as
-     operations. The interrupt number is listed in the device
-     structure, and for the mezzanine driver the number is only
-     informative.  The handler will receive the fmc pointer as dev_id;
-     the flags argument is passed to the Linux request_irq function,
-     but fmc-specific flags may be added in the future. You'll most
-     likely want to pass the `IRQF_SHARED' flag.
-
-`gpio_config'
-     The method allows to configure a GPIO pin in the carrier, and read
-     its current value if it is configured as input. See *note The GPIO
-     Abstraction:: for details.
-
-`read_ee'
-`write_ee'
-     Read or write the EEPROM. The functions are expected to be only
-     called before reprogramming and the carrier should refuse them
-     with `ENODEV' after reprogramming.  The offset is expected to be
-     within 8kB (the current size), but addresses up to 1MB are
-     reserved to fit bigger I2C devices in the future. Carriers may
-     offer access to other internal flash memories using these same
-     methods: for example the SPEC driver may define that its carrier
-     I2C memory is seen at offset 1M and the internal SPI flash is seen
-     at offset 16M.  This multiplexing of several flash memories in the
-     same address space is carrier-specific and should only be used
-     by a driver that has verified the `carrier_name' field.
-
-
-
-The GPIO Abstraction
-====================
-
-Support for GPIO pins in the fmc-bus environment is not very
-straightforward and deserves special discussion.
-
-While the general idea of a carrier-independent driver seems to fly,
-configuration of specific signals within the carrier needs at least
-some knowledge of the carrier itself.  For this reason, the specific
-driver can request to configure carrier-specific GPIO pins, numbered
-from 0 to at most 4095.  Configuration is performed by passing a
-pointer to an array of struct fmc_gpio items, as well as the length of
-the array. This is the data structure:
-
-        struct fmc_gpio {
-                char *carrier_name;
-                int gpio;
-                int _gpio;      /* internal use by the carrier */
-                int mode;       /* GPIOF_DIR_OUT etc, from <linux/gpio.h> */
-                int irqmode;    /* IRQF_TRIGGER_LOW and so on */
-        };
-
-By specifying a carrier_name for each pin, the driver may access
-different pins in different carriers.  The gpio_config method is
-expected to return the number of pins successfully configured, ignoring
-requests for other carriers. However, if no pin is configured (because
-no structure at all refers to the current carrier_name), the operation
-returns an error so the caller will know that it is running under a
-yet-unsupported carrier.
-
-So, for example, a driver that has been developed and tested on both
-the SPEC and the SVEC may request configuration of two different GPIO
-pins, and expect one such configuration to succeed - if none succeeds
-it most likely means that the current carrier is a still-unknown one.
-
-If, however, your GPIO pin has a specific known role, you can pass a
-special number in the gpio field, using one of the following macros:
-
-        #define FMC_GPIO_RAW(x)         (x)             /* 4096 of them */
-        #define FMC_GPIO_IRQ(x)         ((x) + 0x1000)  /*  256 of them */
-        #define FMC_GPIO_LED(x)         ((x) + 0x1100)  /*  256 of them */
-        #define FMC_GPIO_KEY(x)         ((x) + 0x1200)  /*  256 of them */
-        #define FMC_GPIO_TP(x)          ((x) + 0x1300)  /*  256 of them */
-        #define FMC_GPIO_USER(x)        ((x) + 0x1400)  /*  256 of them */
-
-Use of virtual GPIO numbers (anything but FMC_GPIO_RAW) is allowed
-provided the carrier_name field in the data structure is left
-unspecified (NULL). Each carrier is responsible for providing a mapping
-between virtual and physical GPIO numbers. The carrier may then use the
-_gpio field to cache the result of this mapping.
-
-All carriers must map their I/O lines to the sets above starting from
-zero.  The SPEC, for example, maps interrupt pins 0 and 1, and test
-points 0 through 3 (even if the test points on the PCB are called
-5,6,7,8).
-
-If, for example, a driver requires a free LED and a test point (for a
-scope probe to be plugged at some point during development) it may ask
-for FMC_GPIO_LED(0) and FMC_GPIO_TP(0). Each carrier will provide
-suitable GPIO pins.  Clearly, the person running the drivers will know
-the order used by the specific carrier driver in assigning leds and
-testpoints, so to make a carrier-dependent use of the diagnostic tools.
-
-In theory, some form of autodetection should be possible: a driver like
-the wr-nic (which uses IRQ(1) on the SPEC card) should configure
-IRQ(0), make a test with software-generated interrupts and configure
-IRQ(1) if the test fails. This probing step should be used because even
-if the wr-nic gateware is known to use IRQ1 on the SPEC, the driver
-should be carrier-independent and thus use IRQ(0) as a first bet -
-actually, the knowledge that IRQ0 may fail is carrier-dependent
-information, but using it doesn't make the driver unsuitable for other
-carriers.
-
-The return value of gpio_config is defined as follows:
-
-   * If no pin in the array can be used by the carrier, `-ENODEV'.
-
-   * If at least one virtual GPIO number cannot be mapped, `-ENOENT'.
-
-   * On success, 0 or positive. The value returned is the number of
-     high input bits (if no input is configured, the value for success
-     is 0).
-
-While I admit the procedure is not completely straightforward, it
-allows configuration, input and output with a single carrier operation.
-Given the typical use case of FMC devices, GPIO operations are not
-expected to ever be in hot paths, and GPIO access so far has only been
-used to configure the interrupt pin, mode and polarity. Especially
-reading inputs is not expected to be common. If your device has GPIO
-capabilities in the hot path, you should consider using the kernel's
-GPIO mechanisms.
diff --git a/Documentation/fmc/fmc-chardev.txt b/Documentation/fmc/fmc-chardev.txt
deleted file mode 100644
index d9ccb278e597..000000000000
--- a/Documentation/fmc/fmc-chardev.txt
+++ /dev/null
@@ -1,64 +0,0 @@
-fmc-chardev
-===========
-
-This is a simple generic driver, that allows user access by means of a
-character device (actually, one for each mezzanine it takes hold of).
-
-The char device is created as a misc device. Its name in /dev (as
-created by udev) is the same name as the underlying FMC device. Thus,
-the name can be a silly fmc-0000 look-alike if the device has no
-identifiers nor bus_id, a more specific fmc-0400 if the device has a
-bus-specific address but no associated name, or something like
-fdelay-0400 if the FMC core can rely on both a mezzanine name and a bus
-address.
-
-Currently the driver only supports read and write: you can lseek to the
-desired address and read or write a register.
-
-The driver assumes all registers are 32-bit in size, and only accepts a
-single read or write per system call. However, as a result of Unix read
-and write semantics, users can simply fread or fwrite bigger areas in
-order to dump or store bigger memory areas.
-
-There is currently no support for mmap, user-space interrupt management
-and DMA buffers. They may be added in later versions, if the need
-arises.
-
-The example below shows raw access to a SPEC card programmed with its
-golden FPGA file, that features an SDB structure at offset 256 - i.e.
-64 words.  The mezzanine's EEPROM in this case is not programmed, so the
-default name is fmc-<bus><devfn>, and there are two cards in the system:
-
-  spusa.root# insmod fmc-chardev.ko
-  [ 1073.339332] spec 0000:02:00.0: Driver has no ID: matches all
-  [ 1073.345051] spec 0000:02:00.0: Created misc device "fmc-0200"
-  [ 1073.350821] spec 0000:04:00.0: Driver has no ID: matches all
-  [ 1073.356525] spec 0000:04:00.0: Created misc device "fmc-0400"
-  spusa.root# ls -l /dev/fmc*
-  crw------- 1 root root 10, 58 Nov 20 19:23 /dev/fmc-0200
-  crw------- 1 root root 10, 57 Nov 20 19:23 /dev/fmc-0400
-  spusa.root# dd bs=4 skip=64 count=1 if=/dev/fmc-0200 2> /dev/null | od -t x1z
-  0000000 2d 42 44 53                                      >-BDS<
-  0000004
-
-The simple program tools/fmc-mem in this package can access an FMC char
-device and read or write a word or a whole area.  Actually, the program
-is not specific to FMC at all, it just uses lseek, read and write.
-
-Its first argument is the device name, the second the offset, the third
-(if any) the value to write and the optional last argument that must
-begin with "+" is the number of bytes to read or write.  In case of
-repeated reading data is written to stdout; repeated writes read from
-stdin and the value argument is ignored.
-
-The following examples show reading the SDB magic number and the first
-SDB record from a SPEC device programmed with its golden image:
-
-     spusa.root# ./fmc-mem /dev/fmc-0200 100
-     5344422d
-     spusa.root# ./fmc-mem /dev/fmc-0200 100 +40 | od -Ax -t x1z
-     000000 2d 42 44 53 00 01 02 00 00 00 00 00 00 00 00 00  >-BDS............<
-     000010 00 00 00 00 ff 01 00 00 00 00 00 00 51 06 00 00  >............Q...<
-     000020 c9 42 a5 e6 02 00 00 00 11 05 12 20 2d 34 42 57  >.B......... -4BW<
-     000030 73 6f 72 43 72 61 62 73 49 53 47 2d 00 20 20 20  >sorCrabsISG-.   <
-     000040
diff --git a/Documentation/fmc/fmc-fakedev.txt b/Documentation/fmc/fmc-fakedev.txt
deleted file mode 100644
index e85b74a4ae30..000000000000
--- a/Documentation/fmc/fmc-fakedev.txt
+++ /dev/null
@@ -1,36 +0,0 @@
-fmc-fakedev
-===========
-
-This package includes a software-only device, called fmc-fakedev, which
-is able to register up to 4 mezzanines (by default it registers one).
-Unlike the SPEC driver, which creates an FMC device for each PCI card
-it manages, this module creates a single instance of its set of
-mezzanines.
-
-It is meant as the simplest possible example of how a driver should be
-written, and it includes a fake EEPROM image (built using the tools
-described in *note FMC Identification::), which by default is
-replicated for each fake mezzanine.
-
-You can also use this device to verify the match algorithms, by asking
-it to test your own EEPROM image. You can provide the image by means of
-the eeprom= module parameter: the new EEPROM image is loaded, as usual,
-by means of the firmware loader.  This example shows the defaults and a
-custom EEPROM image:
-
-     spusa.root# insmod fmc-fakedev.ko
-     [   99.971247]  fake-fmc-carrier: mezzanine 0
-     [   99.975393]       Manufacturer: fake-vendor
-     [   99.979624]       Product name: fake-design-for-testing
-     spusa.root# rmmod fmc-fakedev
-     spusa.root# insmod fmc-fakedev.ko eeprom=fdelay-eeprom.bin
-     [  121.447464]  fake-fmc-carrier: Mezzanine 0: eeprom "fdelay-eeprom.bin"
-     [  121.462725]  fake-fmc-carrier: mezzanine 0
-     [  121.466858]       Manufacturer: CERN
-     [  121.470477]       Product name: FmcDelay1ns4cha
-     spusa.root# rmmod fmc-fakedev
-
-After loading the device, you can use the write_ee method to modify its
-own internal fake EEPROM: whenever the image is overwritten starting at
-offset 0, the module will unregister and register again the FMC device.
-This is shown in fmc-write-eeprom.txt
diff --git a/Documentation/fmc/fmc-trivial.txt b/Documentation/fmc/fmc-trivial.txt
deleted file mode 100644
index d1910bc67159..000000000000
--- a/Documentation/fmc/fmc-trivial.txt
+++ /dev/null
@@ -1,17 +0,0 @@
-fmc-trivial
-===========
-
-The simple module fmc-trivial is just a simple client that registers an
-interrupt handler. I used it to verify the basic mechanism of the FMC
-bus and how interrupts worked.
-
-The module implements the generic FMC parameters, so it can program a
-different gateware file in each card. The whole list of parameters it
-accepts is:
-
-`busid='
-`gateware='
-     Generic parameters. See mezzanine.txt
-
-
-This driver is worth reading, in my opinion.
diff --git a/Documentation/fmc/fmc-write-eeprom.txt b/Documentation/fmc/fmc-write-eeprom.txt
deleted file mode 100644
index e0a9712156aa..000000000000
--- a/Documentation/fmc/fmc-write-eeprom.txt
+++ /dev/null
@@ -1,98 +0,0 @@
-fmc-write-eeprom
-================
-
-This module is designed to load a binary file from /lib/firmware and to
-write it to the internal EEPROM of the mezzanine card. This driver uses
-the `busid' generic parameter.
-
-Overwriting the EEPROM is not something you should do daily, and it is
-expected to only happen during manufacturing. For this reason, the
-module makes it unlikely for the random user to change a working EEPROM.
-
-However, since the EEPROM may include application-specific information
-other than the identification, later versions of this package added
-write-support through sysfs. See *note Accessing the EEPROM::.
-
-To avoid damaging the EEPROM content, the module takes the following
-measures:
-
-   * It accepts a `file=' argument (within /lib/firmware) and if no
-     such argument is received, it doesn't write anything to EEPROM
-     (i.e. there is no default file name).
-
-   * If the file name ends with `.bin' it is written verbatim starting
-     at offset 0.
-
-   * If the file name ends with `.tlv' it is interpreted as
-     type-length-value (i.e., it allows writev(2)-like operation).
-
-   * If the file name doesn't match any of the patterns above, it is
-     ignored and no write is performed.
-
-   * Only cards listed with `busid=' are written to. If no busid is
-     specified, no programming is done (and the probe function of the
-     driver will fail).
-
-
-Each TLV tuple is formatted in this way: the header is 5 bytes,
-followed by data. The first byte is `w' for write, the next two bytes
-represent the address, in little-endian byte order, and the next two
-represent the data length, in little-endian order. The length does not
-include the header (it is the actual number of bytes to be written).
-
-This is a real example that writes 5 bytes at position 0x110:
-
-        spusa.root# od -t x1 -Ax /lib/firmware/try.tlv
-        000000 77 10 01 05 00 30 31 32 33 34
-        00000a
-        spusa.root# insmod /tmp/fmc-write-eeprom.ko busid=0x0200 file=try.tlv
-        [19983.391498] spec 0000:03:00.0: write 5 bytes at 0x0110
-        [19983.414615] spec 0000:03:00.0: write_eeprom: success
-
-Please note that you'll most likely want to use SDBFS to build your
-EEPROM image, at least if your mezzanines are being used in the White
-Rabbit environment. For this reason the TLV format is not expected to
-be used much and is not expected to be developed further.
-
-If you want to try reflashing fake EEPROM devices, you can use the
-fmc-fakedev.ko module (see *note fmc-fakedev::).  Whenever you change
-the image starting at offset 0, it will deregister and register again
-after two seconds.  Please note, however, that if fmc-write-eeprom is
-still loaded, the system will associate it to the new device, which
-will be reprogrammed and thus will be unloaded after two seconds.  The
-following example removes the module after it reflashed fakedev the
-first time.
-
-     spusa.root# insmod fmc-fakedev.ko
-        [   72.984733]  fake-fmc: Manufacturer: fake-vendor
-        [   72.989434]  fake-fmc: Product name: fake-design-for-testing
-        spusa.root# insmod fmc-write-eeprom.ko busid=0 file=fdelay-eeprom.bin; \
-            rmmod fmc-write-eeprom
-        [  130.874098]  fake-fmc: Matching a generic driver (no ID)
-        [  130.887845]  fake-fmc: programming 6155 bytes
-        [  130.894567]  fake-fmc: write_eeprom: success
-        [  132.895794]  fake-fmc: Manufacturer: CERN
-        [  132.899872]  fake-fmc: Product name: FmcDelay1ns4cha
-
-
-Accessing the EEPROM
-=====================
-
-The bus creates a sysfs binary file called eeprom for each mezzanine it
-knows about:
-
-        spusa.root# cd /sys/bus/fmc/devices; ls -l */eeprom
-        -r--r--r-- 1 root root 8192 Feb 21 12:30 FmcAdc100m14b4cha-0800/eeprom
-        -r--r--r-- 1 root root 8192 Feb 21 12:30 FmcDelay1ns4cha-0200/eeprom
-        -r--r--r-- 1 root root 8192 Feb 21 12:30 FmcDio5cha-0400/eeprom
-
-Everybody can read the files and the superuser can also modify them, but
-the operation may fail on the carrier driver, if the carrier is unable to
-access the I2C bus.  For example, the spec driver can access the bus
-only with its golden gateware: after a mezzanine driver reprogrammed
-the FPGA with a custom circuit, the carrier is unable to access the
-EEPROM and returns ENOTSUPP.
-
-An alternative way to write the EEPROM is the mezzanine driver
-fmc-write-eeprom (See *note fmc-write-eeprom::), but the procedure is
-more complex.
diff --git a/Documentation/fmc/identifiers.txt b/Documentation/fmc/identifiers.txt
deleted file mode 100644
index 3bb577ff0d52..000000000000
--- a/Documentation/fmc/identifiers.txt
+++ /dev/null
@@ -1,168 +0,0 @@
-FMC Identification
-******************
-
-The FMC standard requires every compliant mezzanine to carry
-identification information in an I2C EEPROM.  The information must be
-laid out according to the "IPMI Platform Management FRU Information",
-where IPMI is a lie I'd better not expand, and FRU means "Field
-Replaceable Unit".
-
-The FRU information is an intricate unreadable binary blob that must
-live at offset 0 of the EEPROM, and typically extends for a few hundred
-bytes. The standard allows the application to use all the remaining
-storage area of the EEPROM as it wants.
-
-This chapter explains how to create your own EEPROM image and how to
-write it in your mezzanine, as well as how devices and drivers are
-paired at run time.  EEPROM programming uses tools that are part of this
-package and SDB (part of the fpga-config-space package).
-
-The first sections are only interesting for manufacturers who need to
-write the EEPROM. If you are just a software developer writing an FMC
-device or driver, you may jump straight to *note SDB Support::.
-
-
-Building the FRU Structure
-==========================
-
-If you want to know the internals of the FRU structure and despair, you
-can retrieve the document from
-`http://download.intel.com/design/servers/ipmi/FRU1011.pdf' .  The
-standard is awful and difficult without reason, so we only support the
-minimum mandatory subset - we create a simple structure and parse it
-back at run time, but we are not able to either generate or parse more
-arcane features like non-english languages and 6-bit text.  If you need
-more items of the FRU standard for your boards, please submit patches.
-
-This package includes the Python script that Matthieu Cattin wrote to
-generate the FRU binary blob, based on a helper libipmi by Manohar
-Vanga and Matthieu himself.  I changed the test script to receive
-parameters from the command line or from the environment (the command
-line takes precedence).
-
-To make a long story short, in order to build a standard-compliant
-binary file to be burned in your EEPROM, you need the following items:
-
-        Environment    Opt     Official Name          Default
----------------------------------------------------------------------
-        FRU_VENDOR     -v      "Board Manufacturer"   fmc-example
-        FRU_NAME       -n      "Board Product Name"   mezzanine
-        FRU_SERIAL     -s      "Board Serial Number"  0001
-        FRU_PART       -p      "Board Part Number"    sample-part
-        FRU_OUTPUT     -o      not applicable         /dev/stdout
-
-The "Official Name" above is what you find in the FRU official
-documentation, chapter 11, page 7 ("Board Info Area Format").  The
-output option is used to save the generated binary to a specific file
-name instead of stdout.
-
-You can pass the items to the FRU generator either in the environment
-or on the command line.  This package has currently no support for
-specifying power consumption or such stuff, but I plan to add it as
-soon as I find some time for that.
-
-FIXME: consumption etc for FRU are here or in PTS?
-
-The following example creates a binary image for a specific board:
-
-        ./tools/fru-generator -v CERN -n FmcAdc100m14b4cha \
-               -s HCCFFIA___-CR000003 -p EDA-02063-V5-0 > eeprom.bin
-
-The following example shows a script that builds several binary EEPROM
-images for a series of boards, changing the serial number for each of
-them. The script uses a mix of environment variables and command line
-options, and uses the same string patterns shown above.
-
-        #!/bin/sh
-
-        export FRU_VENDOR="CERN"
-        export FRU_NAME="FmcAdc100m14b4cha"
-        export FRU_PART="EDA-02063-V5-0"
-
-        serial="HCCFFIA___-CR"
-
-        for number in $(seq 1 50); do
-           # build number-string "ns"
-           ns="$(printf %06d $number)"
-           ./fru-generator -s "${serial}${ns}" > eeprom-${ns}.bin
-        done
-
-
-Using SDB-FS in the EEPROM
-==========================
-
-If you want to use SDB as a filesystem in the EEPROM device within the
-mezzanine, you should create one such filesystem using gensdbfs, from
-the fpga-config-space package on OHWR.
-
-By using an SDB filesystem you can cluster several files in a single
-EEPROM, so both the host system and a soft-core running in the FPGA (if
-any) can access extra production-time information.
-
-We chose to use SDB as a storage filesystem because the format is very
-simple, and both the host system and the soft-core will likely already
-include support code for such format. The SDB library offered by the
-fpga-config-space is less than 1kB under LM32, so it proves quite up to
-the task.
-
-The SDB entry point (which acts as a directory listing) cannot live at
-offset zero in the flash device, because the FRU information must live
-there.  To avoid wasting precious storage space while still allowing
-for more-than-minimal FRU structures, the fmc.ko will look for the SDB
-record at address 256, 512 and 1024.
-
-In order to generate the complete EEPROM image you'll need a
-configuration file for gensdbfs: you tell the program where to place
-the sdb entry point, and you must force the FRU data file to be placed
-at the beginning of the storage device. If needed, you can also place
-other files at a special offset (we sometimes do it for backward
-compatibility with drivers we wrote before implementing SDB for flash
-memory).
-
-The directory tools/sdbfs of this package includes a well-commented
-example that you may want to use as a starting point (the comments are
-in the file called -SDB-CONFIG-).  Reading documentation for gensdbfs
-is a suggested first step anyways.
-
-This package (generic FMC bus support) only accesses two files in the
-EEPROM: the FRU information, at offset zero, with a suggested filename
-of IPMI-FRU and the short name for the mezzanine, in a file called
-name. The IPMI-FRU name is not mandatory, but a strongly suggested
-choice; the name filename is mandatory, because this is the preferred
-short name used by the FMC core.  For example, a name of "fdelay" may
-supplement a Product Name like "FmcDelay1ns4cha" - exactly as
-demonstrated in `tools/sdbfs'.
-
-Note: SDB access to flash memory is not yet supported, so the short
-name currently in use is just the "Product Name" FRU string.
-
-The example in tools/sdbfs includes an extra file, that is needed by
-the fine-delay driver, and must live at a known address of 0x1800.  By
-running gensdbfs on that directory you can output your binary EEPROM
-image (here below spusa$ is the shell prompt):
-
-        spusa$ ../fru-generator -v CERN -n FmcDelay1ns4cha -s proto-0 \
-                      -p EDA-02267-V3 > IPMI-FRU
-        spusa$ ls -l
-        total 16
-        -rw-rw-r-- 1 rubini staff 975 Nov 19 18:08 --SDB-CONFIG--
-        -rw-rw-r-- 1 rubini staff 216 Nov 19 18:13 IPMI-FRU
-        -rw-rw-r-- 1 rubini staff  11 Nov 19 18:04 fd-calib
-        -rw-rw-r-- 1 rubini staff   7 Nov 19 18:04 name
-        spusa$ sudo gensdbfs . /lib/firmware/fdelay-eeprom.bin
-        spusa$ sdb-read -l -e 0x100 /lib/firmware/fdelay-eeprom.bin
-        /home/rubini/wip/sdbfs/userspace/sdb-read: listing format is to be defined
-        46696c6544617461:2e202020  00000100-000018ff .
-        46696c6544617461:6e616d65  00000200-00000206 name
-        46696c6544617461:66642d63  00001800-000018ff fd-calib
-        46696c6544617461:49504d49  00000000-000000d7 IPMI-FRU
-        spusa$ ../fru-dump /lib/firmware/fdelay-eeprom.bin
-        /lib/firmware/fdelay-eeprom.bin: manufacturer: CERN
-        /lib/firmware/fdelay-eeprom.bin: product-name: FmcDelay1ns4cha
-        /lib/firmware/fdelay-eeprom.bin: serial-number: proto-0
-        /lib/firmware/fdelay-eeprom.bin: part-number: EDA-02267-V3
-
-As expected, the output file is both a proper sdbfs object and an IPMI
-FRU information blob. The fd-calib file lives at offset 0x1800 and is
-over-allocated to 256 bytes, according to the configuration file for
-gensdbfs.
diff --git a/Documentation/fmc/mezzanine.txt b/Documentation/fmc/mezzanine.txt
deleted file mode 100644
index 87910dbfc91e..000000000000
--- a/Documentation/fmc/mezzanine.txt
+++ /dev/null
@@ -1,123 +0,0 @@
-FMC Driver
-**********
-
-An FMC driver is concerned with the specific mezzanine and associated
-gateware. As such, it is expected to be independent of the carrier
-being used: it will perform I/O accesses only by means of
-carrier-provided functions.
-
-The matching between device and driver is based on the content of the
-EEPROM (as mandated by the FMC standard) or by the actual cores
-configured in the FPGA; the latter technique is used when the FPGA is
-already programmed when the device is registered to the bus core.
-
-In some special cases it is possible for a driver to directly access
-FPGA registers, by means of the `fpga_base' field of the device
-structure. This may be needed for high-bandwidth peripherals like fast
-ADC cards. If the device module registered a remote device (for example
-by means of Etherbone), the `fpga_base' pointer will be NULL.
-Therefore, drivers must be ready to deal with NULL base pointers, and
-fail gracefully.  Most drivers, however, are not expected to access the
-pointer directly but run fmc_readl and fmc_writel instead, which will
-work in any case.
-
-In even more special cases, the driver may access carrier-specific
-functionality: the `carrier_name' string allows the driver to check
-which is the current carrier and make use of the `carrier_data'
-pointer.  We chose to use carrier names rather than numeric identifiers
-for greater flexibility, but also to avoid a central registry within
-the `fmc.h' file - we hope other users will exploit our framework with
-their own carriers.  An example use of carrier names is in GPIO setup
-(see *note The GPIO Abstraction::), although the name match is not
-expected to be performed by the driver.  If you depend on specific
-carriers, please check the carrier name and fail gracefully if your
-driver finds it is running in a yet-unknown-to-it environment.
-
-
-ID Table
-========
-
-Like most other Linux drivers, an FMC driver must list all the devices
-which it is able to drive.  This is usually done by means of a device
-table, but in FMC we can match hardware based either on the contents of
-their EEPROM or on the actual FPGA cores that can be enumerated.
-Therefore, we have two tables of identifiers.
-
-Matching of FRU information depends on two names, the manufacturer (or
-vendor) and the device (see *note FMC Identification::); for
-flexibility during production (i.e. before writing to the EEPROM) the
-bus supports a catch-all driver that specifies NULL strings. For this
-reason, the table is specified as pointer-and-length, not a
-null-terminated array - the entry with NULL names can be a valid entry.
-
-Matching on FPGA cores depends on two numeric fields: the 64-bit vendor
-number and the 32-bit device number. Support for matching based on
-class is not yet implemented.  Each device is expected to be uniquely
-identified by an array of cores (it matches if all of the cores are
-instantiated), and for consistency the list is passed as
-pointer-and-length.  Several similar devices can be driven by the same
-driver, and thus the driver specifies an array of such arrays.
-
-The complete set of involved data structures is thus the following:
-
-        struct fmc_fru_id { char *manufacturer; char *product_name; };
-        struct fmc_sdb_one_id { uint64_t vendor; uint32_t device; };
-        struct fmc_sdb_id { struct fmc_sdb_one_id *cores; int cores_nr; };
-
-        struct fmc_device_id {
-                struct fmc_fru_id *fru_id; int fru_id_nr;
-                struct fmc_sdb_id *sdb_id; int sdb_id_nr;
-        };
-
-A better reference, with full explanation, is the <linux/fmc.h> header.
-
-
-Module Parameters
-=================
-
-Most of the FMC drivers need the same set of kernel parameters. This
-package includes support to implement common parameters by means of
-fields in the `fmc_driver' structure and simple macro definitions.
-
-The parameters are carrier-specific, in that they rely on the busid
-concept, that varies among carriers. For the SPEC, the identifier is a
-PCI bus and devfn number, 16 bits wide in total; drivers for other
-carriers will most likely offer something similar but not identical,
-and some code duplication is unavoidable.
-
-This is the list of parameters that are common to several modules; to
-see how they are actually used, please look at spec-trivial.c.
-
-`busid='
-     This is an array of integers, listing carrier-specific
-     identification numbers. For PCI, for example, `0x0400' represents
-     bus 4, slot 0.  If any such ID is specified, the driver will only
-     accept to drive cards that appear in the list (even if the FMC ID
-     matches). This is accomplished by the validate carrier method.
-
-`gateware='
-     The argument is an array of strings. If no busid= is specified,
-     the first string of gateware= is used for all cards; otherwise the
-     identifiers and gateware names are paired one by one, in the order
-     specified.
-
-`show_sdb='
-     For modules supporting it, this parameter asks to show the SDB
-     internal structure by means of kernel messages. It is disabled by
-     default because those lines tend to hide more important messages,
-     if you look at the system console while loading the drivers.
-     Note: the parameter is being obsoleted, because fmc.ko itself now
-     supports dump_sdb= that applies to every client driver.
-
-
-For example, if you are using the trivial driver to load two different
-gateware files to two different cards, you can use the following
-parameters to load different binaries to the cards, after looking up
-the PCI identifiers. This has been tested with a SPEC carrier.
-
-        insmod fmc-trivial.ko \
-                              busid=0x0200,0x0400 \
-                              gateware=fmc/fine-delay.bin,fmc/simple-dio.bin
-
-Please note that not all sub-modules support all of those parameters.
-You can use modinfo to check what is supported by each module.
diff --git a/Documentation/fmc/parameters.txt b/Documentation/fmc/parameters.txt
deleted file mode 100644
index 59edf088e3a4..000000000000
--- a/Documentation/fmc/parameters.txt
+++ /dev/null
@@ -1,56 +0,0 @@
-Module Parameters in fmc.ko
-***************************
-
-The core driver receives two module parameters, meant to help debugging
-client modules. Both parameters can be modified by writing to
-/sys/module/fmc/parameters/, because they are used when client drivers
-or devices are registered, not when fmc.ko is loaded.
-
-`dump_eeprom='
-     If not zero, the parameter asks the bus controller to dump the
-     EEPROM of any device that is registered, using printk.
-
-`dump_sdb='
-     If not zero, the parameter prints the SDB tree of every FPGA that is
-     loaded by fmc_reprogram(). If greater than one, it asks to dump
-     the binary content of SDB records.  This currently only dumps the
-     top-level SDB array, though.
-
-
-EEPROM dumping avoids repeating lines, since most of the contents is
-usually empty and all bits are one or zero. This is an example of the
-output:
-
-        [ 6625.850480] spec 0000:02:00.0: FPGA programming successful
-        [ 6626.139949] spec 0000:02:00.0: Manufacturer: CERN
-        [ 6626.144666] spec 0000:02:00.0: Product name: FmcDelay1ns4cha
-        [ 6626.150370] FMC: mezzanine 0: 0000:02:00.0 on SPEC
-        [ 6626.155179] FMC: dumping eeprom 0x2000 (8192) bytes
-        [ 6626.160087] 0000: 01 00 00 01  00 0b 00 f3  01 0a 00 a5  85 87 c4 43
-        [ 6626.167069] 0010: 45 52 4e cf  46 6d 63 44  65 6c 61 79  31 6e 73 34
-        [ 6626.174019] 0020: 63 68 61 c7  70 72 6f 74  6f 2d 30 cc  45 44 41 2d
-        [ 6626.180975] 0030: 30 32 32 36  37 2d 56 33  da 32 30 31  32 2d 31 31
-        [...]
-        [ 6626.371366] 0200: 66 64 65 6c  61 79 0a 00  00 00 00 00  00 00 00 00
-        [ 6626.378359] 0210: 00 00 00 00  00 00 00 00  00 00 00 00  00 00 00 00
-        [ 6626.385361] [...]
-        [ 6626.387308] 1800: 70 6c 61 63  65 68 6f 6c  64 65 72 ff  ff ff ff ff
-        [ 6626.394259] 1810: ff ff ff ff  ff ff ff ff  ff ff ff ff  ff ff ff ff
-        [ 6626.401250] [...]
-
-The dump of SDB looks like the following; the example shows the simple
-golden gateware for the SPEC card, removing the leading timestamps to
-fit the page:
-
-        spec 0000:02:00.0: SDB: 00000651:e6a542c9 WB4-Crossbar-GSI
-        spec 0000:02:00.0: SDB: 0000ce42:ff07fc47 WR-Periph-Syscon (00000000-000000ff)
-        FMC: mezzanine 0: 0000:02:00.0 on SPEC
-        FMC: poor dump of sdb first level:
-        0000: 53 44 42 2d  00 02 01 00  00 00 00 00  00 00 00 00
-        0010: 00 00 00 00  00 00 01 ff  00 00 00 00  00 00 06 51
-        0020: e6 a5 42 c9  00 00 00 02  20 12 05 11  57 42 34 2d
-        0030: 43 72 6f 73  73 62 61 72  2d 47 53 49  20 20 20 00
-        0040: 00 00 01 01  00 00 00 07  00 00 00 00  00 00 00 00
-        0050: 00 00 00 00  00 00 00 ff  00 00 00 00  00 00 ce 42
-        0060: ff 07 fc 47  00 00 00 01  20 12 03 05  57 52 2d 50
-        0070: 65 72 69 70  68 2d 53 79  73 63 6f 6e  20 20 20 01
diff --git a/drivers/Kconfig b/drivers/Kconfig
index e8231663f201..61cf4ea2c229 100644
--- a/drivers/Kconfig
+++ b/drivers/Kconfig
@@ -188,8 +188,6 @@ source "drivers/ipack/Kconfig"
 
 source "drivers/reset/Kconfig"
 
-source "drivers/fmc/Kconfig"
-
 source "drivers/phy/Kconfig"
 
 source "drivers/powercap/Kconfig"
diff --git a/drivers/Makefile b/drivers/Makefile
index 28b030d7988d..6d37564e783c 100644
--- a/drivers/Makefile
+++ b/drivers/Makefile
@@ -168,7 +168,6 @@ obj-$(CONFIG_IIO)		+= iio/
 obj-$(CONFIG_VME_BUS)		+= vme/
 obj-$(CONFIG_IPACK_BUS)		+= ipack/
 obj-$(CONFIG_NTB)		+= ntb/
-obj-$(CONFIG_FMC)		+= fmc/
 obj-$(CONFIG_POWERCAP)		+= powercap/
 obj-$(CONFIG_MCB)		+= mcb/
 obj-$(CONFIG_PERF_EVENTS)	+= perf/
diff --git a/drivers/fmc/Kconfig b/drivers/fmc/Kconfig
deleted file mode 100644
index ae3d7f634932..000000000000
--- a/drivers/fmc/Kconfig
+++ /dev/null
@@ -1,52 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-only
-#
-# FMC (ANSI-VITA 57.1) bus support
-#
-
-menuconfig FMC
-	tristate "FMC support"
-	help
-
-	  FMC (FPGA Mezzanine Carrier) is a mechanical and electrical
-	  standard for mezzanine cards that plug into a carrier board.
-	  This kernel subsystem supports the matching between carrier
-	  and mezzanine based on identifiers stored in the internal I2C
-	  EEPROM, as well as having carrier-independent drivers.
-
-	  The framework was born outside of the kernel and at this time
-	  the off-tree code base is more complete.  Code and documentation
-	  is at git://ohwr.org/fmc-projects/fmc-bus.git .
-
-if FMC
-
-config FMC_FAKEDEV
-	tristate "FMC fake device (software testing)"
-	help
-	  This is a fake carrier, bringing a default EEPROM content
-	  that can be rewritten at run time and used for matching
-	  mezzanines.
-
-config FMC_TRIVIAL
-	tristate "FMC trivial mezzanine driver (software testing)"
-	help
-	  This is a fake mezzanine driver, to show how FMC works and test it.
-	  The driver also handles interrupts (we used it with a real carrier
-	  before the mezzanines were produced)
-
-config FMC_WRITE_EEPROM
-	tristate "FMC mezzanine driver to write I2C EEPROM"
-	help
-	  This driver matches every mezzanine device and can write the
-	  internal EEPROM of the PCB, using the firmware loader to get
-	  its binary and the function carrier->reprogram to actually do it.
-	  It is useful when the mezzanines are produced.
-
-config FMC_CHARDEV
-	tristate "FMC mezzanine driver that registers a char device"
-	help
-	  This driver matches every mezzanine device and allows user
-	  space to read and write registers using a char device. It
-	  can be used to write user-space drivers, or just get
-	  acquainted with a mezzanine before writing its specific driver.
-
-endif # FMC
diff --git a/drivers/fmc/Makefile b/drivers/fmc/Makefile
deleted file mode 100644
index e3da6192cf39..000000000000
--- a/drivers/fmc/Makefile
+++ /dev/null
@@ -1,15 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0
-
-obj-$(CONFIG_FMC) += fmc.o
-
-fmc-y = fmc-core.o
-fmc-y += fmc-match.o
-fmc-y += fmc-sdb.o
-fmc-y += fru-parse.o
-fmc-y += fmc-dump.o
-fmc-y += fmc-debug.o
-
-obj-$(CONFIG_FMC_FAKEDEV) += fmc-fakedev.o
-obj-$(CONFIG_FMC_TRIVIAL) += fmc-trivial.o
-obj-$(CONFIG_FMC_WRITE_EEPROM) += fmc-write-eeprom.o
-obj-$(CONFIG_FMC_CHARDEV) += fmc-chardev.o
diff --git a/drivers/fmc/fmc-chardev.c b/drivers/fmc/fmc-chardev.c
deleted file mode 100644
index 7d2091b5e978..000000000000
--- a/drivers/fmc/fmc-chardev.c
+++ /dev/null
@@ -1,199 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * Copyright (C) 2012 CERN (www.cern.ch)
- * Author: Alessandro Rubini <rubini@gnudd.com>
- *
- * This work is part of the White Rabbit project, a research effort led
- * by CERN, the European Institute for Nuclear Research.
- */
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/list.h>
-#include <linux/slab.h>
-#include <linux/fs.h>
-#include <linux/miscdevice.h>
-#include <linux/spinlock.h>
-#include <linux/fmc.h>
-#include <linux/uaccess.h>
-
-static LIST_HEAD(fc_devices);
-static DEFINE_SPINLOCK(fc_lock);
-
-struct fc_instance {
-	struct list_head list;
-	struct fmc_device *fmc;
-	struct miscdevice misc;
-};
-
-/* at open time, we must identify our device */
-static int fc_open(struct inode *ino, struct file *f)
-{
-	struct fmc_device *fmc;
-	struct fc_instance *fc;
-	int minor = iminor(ino);
-
-	list_for_each_entry(fc, &fc_devices, list)
-		if (fc->misc.minor == minor)
-			break;
-	if (fc->misc.minor != minor)
-		return -ENODEV;
-	fmc = fc->fmc;
-	if (try_module_get(fmc->owner) == 0)
-		return -ENODEV;
-
-	f->private_data = fmc;
-	return 0;
-}
-
-static int fc_release(struct inode *ino, struct file *f)
-{
-	struct fmc_device *fmc = f->private_data;
-	module_put(fmc->owner);
-	return 0;
-}
-
-/* read and write are simple after the default llseek has been used */
-static ssize_t fc_read(struct file *f, char __user *buf, size_t count,
-		       loff_t *offp)
-{
-	struct fmc_device *fmc = f->private_data;
-	unsigned long addr;
-	uint32_t val;
-
-	if (count < sizeof(val))
-		return -EINVAL;
-	count = sizeof(val);
-
-	addr = *offp;
-	if (addr > fmc->memlen)
-		return -ESPIPE; /* Illegal seek */
-	val = fmc_readl(fmc, addr);
-	if (copy_to_user(buf, &val, count))
-		return -EFAULT;
-	*offp += count;
-	return count;
-}
-
-static ssize_t fc_write(struct file *f, const char __user *buf, size_t count,
-			loff_t *offp)
-{
-	struct fmc_device *fmc = f->private_data;
-	unsigned long addr;
-	uint32_t val;
-
-	if (count < sizeof(val))
-		return -EINVAL;
-	count = sizeof(val);
-
-	addr = *offp;
-	if (addr > fmc->memlen)
-		return -ESPIPE; /* Illegal seek */
-	if (copy_from_user(&val, buf, count))
-		return -EFAULT;
-	fmc_writel(fmc, val, addr);
-	*offp += count;
-	return count;
-}
-
-static const struct file_operations fc_fops = {
-	.owner = THIS_MODULE,
-	.open = fc_open,
-	.release = fc_release,
-	.llseek = generic_file_llseek,
-	.read = fc_read,
-	.write = fc_write,
-};
-
-
-/* Device part .. */
-static int fc_probe(struct fmc_device *fmc);
-static int fc_remove(struct fmc_device *fmc);
-
-static struct fmc_driver fc_drv = {
-	.version = FMC_VERSION,
-	.driver.name = KBUILD_MODNAME,
-	.probe = fc_probe,
-	.remove = fc_remove,
-	/* no table: we want to match everything */
-};
-
-/* We accept the generic busid parameter */
-FMC_PARAM_BUSID(fc_drv);
-
-/* probe and remove must allocate and release a misc device */
-static int fc_probe(struct fmc_device *fmc)
-{
-	int ret;
-	int index = 0;
-
-	struct fc_instance *fc;
-
-	index = fmc_validate(fmc, &fc_drv);
-	if (index < 0)
-		return -EINVAL; /* not our device: invalid */
-
-	/* Create a char device: we want to create it anew */
-	fc = kzalloc(sizeof(*fc), GFP_KERNEL);
-	if (!fc)
-		return -ENOMEM;
-	fc->fmc = fmc;
-	fc->misc.minor = MISC_DYNAMIC_MINOR;
-	fc->misc.fops = &fc_fops;
-	fc->misc.name = kstrdup(dev_name(&fmc->dev), GFP_KERNEL);
-
-	ret = misc_register(&fc->misc);
-	if (ret < 0)
-		goto out;
-	spin_lock(&fc_lock);
-	list_add(&fc->list, &fc_devices);
-	spin_unlock(&fc_lock);
-	dev_info(&fc->fmc->dev, "Created misc device \"%s\"\n",
-		 fc->misc.name);
-	return 0;
-
-out:
-	kfree(fc->misc.name);
-	kfree(fc);
-	return ret;
-}
-
-static int fc_remove(struct fmc_device *fmc)
-{
-	struct fc_instance *fc;
-
-	list_for_each_entry(fc, &fc_devices, list)
-		if (fc->fmc == fmc)
-			break;
-	if (fc->fmc != fmc) {
-		dev_err(&fmc->dev, "remove called but not found\n");
-		return -ENODEV;
-	}
-
-	spin_lock(&fc_lock);
-	list_del(&fc->list);
-	spin_unlock(&fc_lock);
-	misc_deregister(&fc->misc);
-	kfree(fc->misc.name);
-	kfree(fc);
-
-	return 0;
-}
-
-
-static int fc_init(void)
-{
-	int ret;
-
-	ret = fmc_driver_register(&fc_drv);
-	return ret;
-}
-
-static void fc_exit(void)
-{
-	fmc_driver_unregister(&fc_drv);
-}
-
-module_init(fc_init);
-module_exit(fc_exit);
-
-MODULE_LICENSE("GPL");
diff --git a/drivers/fmc/fmc-core.c b/drivers/fmc/fmc-core.c
deleted file mode 100644
index 573f5471f680..000000000000
--- a/drivers/fmc/fmc-core.c
+++ /dev/null
@@ -1,388 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * Copyright (C) 2012 CERN (www.cern.ch)
- * Author: Alessandro Rubini <rubini@gnudd.com>
- *
- * This work is part of the White Rabbit project, a research effort led
- * by CERN, the European Institute for Nuclear Research.
- */
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/slab.h>
-#include <linux/init.h>
-#include <linux/device.h>
-#include <linux/fmc.h>
-#include <linux/fmc-sdb.h>
-
-#include "fmc-private.h"
-
-static int fmc_check_version(unsigned long version, const char *name)
-{
-	if (__FMC_MAJOR(version) != FMC_MAJOR) {
-		pr_err("%s: \"%s\" has wrong major (has %li, expected %i)\n",
-		       __func__, name, __FMC_MAJOR(version), FMC_MAJOR);
-		return -EINVAL;
-	}
-
-	if (__FMC_MINOR(version) != FMC_MINOR)
-		pr_info("%s: \"%s\" has wrong minor (has %li, expected %i)\n",
-		       __func__, name, __FMC_MINOR(version), FMC_MINOR);
-	return 0;
-}
-
-static int fmc_uevent(struct device *dev, struct kobj_uevent_env *env)
-{
-	/* struct fmc_device *fdev = to_fmc_device(dev); */
-
-	/* FIXME: The MODALIAS */
-	add_uevent_var(env, "MODALIAS=%s", "fmc");
-	return 0;
-}
-
-static int fmc_probe(struct device *dev)
-{
-	struct fmc_driver *fdrv = to_fmc_driver(dev->driver);
-	struct fmc_device *fdev = to_fmc_device(dev);
-
-	return fdrv->probe(fdev);
-}
-
-static int fmc_remove(struct device *dev)
-{
-	struct fmc_driver *fdrv = to_fmc_driver(dev->driver);
-	struct fmc_device *fdev = to_fmc_device(dev);
-
-	return fdrv->remove(fdev);
-}
-
-static void fmc_shutdown(struct device *dev)
-{
-	/* not implemented but mandatory */
-}
-
-static struct bus_type fmc_bus_type = {
-	.name = "fmc",
-	.match = fmc_match,
-	.uevent = fmc_uevent,
-	.probe = fmc_probe,
-	.remove = fmc_remove,
-	.shutdown = fmc_shutdown,
-};
-
-static void fmc_release(struct device *dev)
-{
-	struct fmc_device *fmc = container_of(dev, struct fmc_device, dev);
-
-	kfree(fmc);
-}
-
-/*
- * The eeprom is exported in sysfs, through a binary attribute
- */
-
-static ssize_t fmc_read_eeprom(struct file *file, struct kobject *kobj,
-			   struct bin_attribute *bin_attr,
-			   char *buf, loff_t off, size_t count)
-{
-	struct device *dev;
-	struct fmc_device *fmc;
-	int eelen;
-
-	dev = container_of(kobj, struct device, kobj);
-	fmc = container_of(dev, struct fmc_device, dev);
-	eelen = fmc->eeprom_len;
-	if (off > eelen)
-		return -ESPIPE;
-	if (off == eelen)
-		return 0; /* EOF */
-	if (off + count > eelen)
-		count = eelen - off;
-	memcpy(buf, fmc->eeprom + off, count);
-	return count;
-}
-
-static ssize_t fmc_write_eeprom(struct file *file, struct kobject *kobj,
-				struct bin_attribute *bin_attr,
-				char *buf, loff_t off, size_t count)
-{
-	struct device *dev;
-	struct fmc_device *fmc;
-
-	dev = container_of(kobj, struct device, kobj);
-	fmc = container_of(dev, struct fmc_device, dev);
-	return fmc->op->write_ee(fmc, off, buf, count);
-}
-
-static struct bin_attribute fmc_eeprom_attr = {
-	.attr = { .name = "eeprom", .mode = S_IRUGO | S_IWUSR, },
-	.size = 8192, /* more or less standard */
-	.read = fmc_read_eeprom,
-	.write = fmc_write_eeprom,
-};
-
-int fmc_irq_request(struct fmc_device *fmc, irq_handler_t h,
-		    char *name, int flags)
-{
-	if (fmc->op->irq_request)
-		return fmc->op->irq_request(fmc, h, name, flags);
-	return -EPERM;
-}
-EXPORT_SYMBOL(fmc_irq_request);
-
-void fmc_irq_free(struct fmc_device *fmc)
-{
-	if (fmc->op->irq_free)
-		fmc->op->irq_free(fmc);
-}
-EXPORT_SYMBOL(fmc_irq_free);
-
-void fmc_irq_ack(struct fmc_device *fmc)
-{
-	if (likely(fmc->op->irq_ack))
-		fmc->op->irq_ack(fmc);
-}
-EXPORT_SYMBOL(fmc_irq_ack);
-
-int fmc_validate(struct fmc_device *fmc, struct fmc_driver *drv)
-{
-	if (fmc->op->validate)
-		return fmc->op->validate(fmc, drv);
-	return -EPERM;
-}
-EXPORT_SYMBOL(fmc_validate);
-
-int fmc_gpio_config(struct fmc_device *fmc, struct fmc_gpio *gpio, int ngpio)
-{
-	if (fmc->op->gpio_config)
-		return fmc->op->gpio_config(fmc, gpio, ngpio);
-	return -EPERM;
-}
-EXPORT_SYMBOL(fmc_gpio_config);
-
-int fmc_read_ee(struct fmc_device *fmc, int pos, void *d, int l)
-{
-	if (fmc->op->read_ee)
-		return fmc->op->read_ee(fmc, pos, d, l);
-	return -EPERM;
-}
-EXPORT_SYMBOL(fmc_read_ee);
-
-int fmc_write_ee(struct fmc_device *fmc, int pos, const void *d, int l)
-{
-	if (fmc->op->write_ee)
-		return fmc->op->write_ee(fmc, pos, d, l);
-	return -EPERM;
-}
-EXPORT_SYMBOL(fmc_write_ee);
-
-/*
- * Functions for client modules follow
- */
-
-int fmc_driver_register(struct fmc_driver *drv)
-{
-	if (fmc_check_version(drv->version, drv->driver.name))
-		return -EINVAL;
-	drv->driver.bus = &fmc_bus_type;
-	return driver_register(&drv->driver);
-}
-EXPORT_SYMBOL(fmc_driver_register);
-
-void fmc_driver_unregister(struct fmc_driver *drv)
-{
-	driver_unregister(&drv->driver);
-}
-EXPORT_SYMBOL(fmc_driver_unregister);
-
-/*
- * When a device set is registered, all eeproms must be read
- * and all FRUs must be parsed
- */
-int fmc_device_register_n_gw(struct fmc_device **devs, int n,
-			  struct fmc_gateware *gw)
-{
-	struct fmc_device *fmc, **devarray;
-	uint32_t device_id;
-	int i, ret = 0;
-
-	if (n < 1)
-		return 0;
-
-	/* Check the version of the first data structure (function prints) */
-	if (fmc_check_version(devs[0]->version, devs[0]->carrier_name))
-		return -EINVAL;
-
-	devarray = kmemdup(devs, n * sizeof(*devs), GFP_KERNEL);
-	if (!devarray)
-		return -ENOMEM;
-
-	/* Make all other checks before continuing, for all devices */
-	for (i = 0; i < n; i++) {
-		fmc = devarray[i];
-		if (!fmc->hwdev) {
-			pr_err("%s: device nr. %i has no hwdev pointer\n",
-			       __func__, i);
-			ret = -EINVAL;
-			break;
-		}
-		if (fmc->flags & FMC_DEVICE_NO_MEZZANINE) {
-			dev_info(fmc->hwdev, "absent mezzanine in slot %d\n",
-				 fmc->slot_id);
-			continue;
-		}
-		if (!fmc->eeprom) {
-			dev_err(fmc->hwdev, "no eeprom provided for slot %i\n",
-				fmc->slot_id);
-			ret = -EINVAL;
-		}
-		if (!fmc->eeprom_addr) {
-			dev_err(fmc->hwdev, "no eeprom_addr for slot %i\n",
-				fmc->slot_id);
-			ret = -EINVAL;
-		}
-		if (!fmc->carrier_name || !fmc->carrier_data ||
-		    !fmc->device_id) {
-			dev_err(fmc->hwdev,
-				"device nr %i: carrier name, "
-				"data or dev_id not set\n", i);
-			ret = -EINVAL;
-		}
-		if (ret)
-			break;
-
-	}
-	if (ret) {
-		kfree(devarray);
-		return ret;
-	}
-
-	/* Validation is ok. Now init and register the devices */
-	for (i = 0; i < n; i++) {
-		fmc = devarray[i];
-
-		fmc->nr_slots = n; /* each slot must know how many are there */
-		fmc->devarray = devarray;
-
-		device_initialize(&fmc->dev);
-		fmc->dev.release = fmc_release;
-		fmc->dev.parent = fmc->hwdev;
-
-		/* Fill the identification stuff (may fail) */
-		fmc_fill_id_info(fmc);
-
-		fmc->dev.bus = &fmc_bus_type;
-
-		/* Name from mezzanine info or carrier info. Or 0,1,2.. */
-		device_id = fmc->device_id;
-		if (!fmc->mezzanine_name)
-			dev_set_name(&fmc->dev, "fmc-%04x", device_id);
-		else
-			dev_set_name(&fmc->dev, "%s-%04x", fmc->mezzanine_name,
-				     device_id);
-
-		if (gw) {
-			/*
-			 * The carrier already know the bitstream to load
-			 * for this set of FMC mezzanines.
-			 */
-			ret = fmc->op->reprogram_raw(fmc, NULL,
-						     gw->bitstream, gw->len);
-			if (ret) {
-				dev_warn(fmc->hwdev,
-					 "Invalid gateware for FMC mezzanine\n");
-				goto out;
-			}
-		}
-
-		ret = device_add(&fmc->dev);
-		if (ret < 0) {
-			dev_err(fmc->hwdev, "Slot %i: Failed in registering "
-				"\"%s\"\n", fmc->slot_id, fmc->dev.kobj.name);
-			goto out;
-		}
-		ret = sysfs_create_bin_file(&fmc->dev.kobj, &fmc_eeprom_attr);
-		if (ret < 0) {
-			dev_err(&fmc->dev, "Failed in registering eeprom\n");
-			goto out1;
-		}
-		/* This device went well, give information to the user */
-		fmc_dump_eeprom(fmc);
-		fmc_debug_init(fmc);
-	}
-	return 0;
-
-out1:
-	device_del(&fmc->dev);
-out:
-	kfree(devarray);
-	for (i--; i >= 0; i--) {
-		fmc_debug_exit(devs[i]);
-		sysfs_remove_bin_file(&devs[i]->dev.kobj, &fmc_eeprom_attr);
-		device_del(&devs[i]->dev);
-		fmc_free_id_info(devs[i]);
-		put_device(&devs[i]->dev);
-	}
-	return ret;
-
-}
-EXPORT_SYMBOL(fmc_device_register_n_gw);
-
-int fmc_device_register_n(struct fmc_device **devs, int n)
-{
-	return fmc_device_register_n_gw(devs, n, NULL);
-}
-EXPORT_SYMBOL(fmc_device_register_n);
-
-int fmc_device_register_gw(struct fmc_device *fmc, struct fmc_gateware *gw)
-{
-	return fmc_device_register_n_gw(&fmc, 1, gw);
-}
-EXPORT_SYMBOL(fmc_device_register_gw);
-
-int fmc_device_register(struct fmc_device *fmc)
-{
-	return fmc_device_register_n(&fmc, 1);
-}
-EXPORT_SYMBOL(fmc_device_register);
-
-void fmc_device_unregister_n(struct fmc_device **devs, int n)
-{
-	int i;
-
-	if (n < 1)
-		return;
-
-	/* Free devarray first, not used by the later loop */
-	kfree(devs[0]->devarray);
-
-	for (i = 0; i < n; i++) {
-		fmc_debug_exit(devs[i]);
-		sysfs_remove_bin_file(&devs[i]->dev.kobj, &fmc_eeprom_attr);
-		device_del(&devs[i]->dev);
-		fmc_free_id_info(devs[i]);
-		put_device(&devs[i]->dev);
-	}
-}
-EXPORT_SYMBOL(fmc_device_unregister_n);
-
-void fmc_device_unregister(struct fmc_device *fmc)
-{
-	fmc_device_unregister_n(&fmc, 1);
-}
-EXPORT_SYMBOL(fmc_device_unregister);
-
-/* Init and exit are trivial */
-static int fmc_init(void)
-{
-	return bus_register(&fmc_bus_type);
-}
-
-static void fmc_exit(void)
-{
-	bus_unregister(&fmc_bus_type);
-}
-
-module_init(fmc_init);
-module_exit(fmc_exit);
-
-MODULE_LICENSE("GPL");
diff --git a/drivers/fmc/fmc-debug.c b/drivers/fmc/fmc-debug.c
deleted file mode 100644
index 1734c7cf0e76..000000000000
--- a/drivers/fmc/fmc-debug.c
+++ /dev/null
@@ -1,172 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * Copyright (C) 2015 CERN (www.cern.ch)
- * Author: Federico Vaga <federico.vaga@cern.ch>
- */
-
-#include <linux/module.h>
-#include <linux/device.h>
-#include <linux/init.h>
-#include <linux/fs.h>
-#include <linux/debugfs.h>
-#include <linux/seq_file.h>
-#include <asm/byteorder.h>
-
-#include <linux/fmc.h>
-#include <linux/sdb.h>
-#include <linux/fmc-sdb.h>
-
-#define FMC_DBG_SDB_DUMP "dump_sdb"
-
-static char *__strip_trailing_space(char *buf, char *str, int len)
-{
-	int i = len - 1;
-
-	memcpy(buf, str, len);
-	buf[len] = '\0';
-	while (i >= 0 && buf[i] == ' ')
-		buf[i--] = '\0';
-	return buf;
-}
-
-#define __sdb_string(buf, field) ({			\
-	BUILD_BUG_ON(sizeof(buf) < sizeof(field));	\
-	__strip_trailing_space(buf, (void *)(field), sizeof(field));	\
-		})
-
-/**
- * We do not check seq_printf() errors because we want to see things in any case
- */
-static void fmc_sdb_dump_recursive(struct fmc_device *fmc, struct seq_file *s,
-				   const struct sdb_array *arr)
-{
-	unsigned long base = arr->baseaddr;
-	int i, j, n = arr->len, level = arr->level;
-	char tmp[64];
-
-	for (i = 0; i < n; i++) {
-		union  sdb_record *r;
-		struct sdb_product *p;
-		struct sdb_component *c;
-
-		r = &arr->record[i];
-		c = &r->dev.sdb_component;
-		p = &c->product;
-
-		for (j = 0; j < level; j++)
-			seq_printf(s, "   ");
-		switch (r->empty.record_type) {
-		case sdb_type_interconnect:
-			seq_printf(s, "%08llx:%08x %.19s\n",
-				   __be64_to_cpu(p->vendor_id),
-				   __be32_to_cpu(p->device_id),
-				   p->name);
-			break;
-		case sdb_type_device:
-			seq_printf(s, "%08llx:%08x %.19s (%08llx-%08llx)\n",
-				   __be64_to_cpu(p->vendor_id),
-				   __be32_to_cpu(p->device_id),
-				   p->name,
-				   __be64_to_cpu(c->addr_first) + base,
-				   __be64_to_cpu(c->addr_last) + base);
-			break;
-		case sdb_type_bridge:
-			seq_printf(s, "%08llx:%08x %.19s (bridge: %08llx)\n",
-				   __be64_to_cpu(p->vendor_id),
-				   __be32_to_cpu(p->device_id),
-				   p->name,
-				   __be64_to_cpu(c->addr_first) + base);
-			if (IS_ERR(arr->subtree[i])) {
-				seq_printf(s, "SDB: (bridge error %li)\n",
-					 PTR_ERR(arr->subtree[i]));
-				break;
-			}
-			fmc_sdb_dump_recursive(fmc, s, arr->subtree[i]);
-			break;
-		case sdb_type_integration:
-			seq_printf(s, "integration\n");
-			break;
-		case sdb_type_repo_url:
-			seq_printf(s, "Synthesis repository: %s\n",
-					  __sdb_string(tmp, r->repo_url.repo_url));
-			break;
-		case sdb_type_synthesis:
-			seq_printf(s, "Bitstream '%s' ",
-					  __sdb_string(tmp, r->synthesis.syn_name));
-			seq_printf(s, "synthesized %08x by %s ",
-					  __be32_to_cpu(r->synthesis.date),
-					  __sdb_string(tmp, r->synthesis.user_name));
-			seq_printf(s, "(%s version %x), ",
-					  __sdb_string(tmp, r->synthesis.tool_name),
-					  __be32_to_cpu(r->synthesis.tool_version));
-			seq_printf(s, "commit %pm\n",
-					  r->synthesis.commit_id);
-			break;
-		case sdb_type_empty:
-			seq_printf(s, "empty\n");
-			break;
-		default:
-			seq_printf(s, "UNKNOWN TYPE 0x%02x\n",
-				   r->empty.record_type);
-			break;
-		}
-	}
-}
-
-static int fmc_sdb_dump(struct seq_file *s, void *offset)
-{
-	struct fmc_device *fmc = s->private;
-
-	if (!fmc->sdb) {
-		seq_printf(s, "no SDB information\n");
-		return 0;
-	}
-
-	seq_printf(s, "FMC: %s (%s), slot %i, device %s\n", dev_name(fmc->hwdev),
-	fmc->carrier_name, fmc->slot_id, dev_name(&fmc->dev));
-	/* Dump SDB information */
-	fmc_sdb_dump_recursive(fmc, s, fmc->sdb);
-
-	return 0;
-}
-
-
-static int fmc_sdb_dump_open(struct inode *inode, struct file *file)
-{
-	struct fmc_device *fmc = inode->i_private;
-
-	return single_open(file, fmc_sdb_dump, fmc);
-}
-
-
-const struct file_operations fmc_dbgfs_sdb_dump = {
-	.owner = THIS_MODULE,
-	.open  = fmc_sdb_dump_open,
-	.read = seq_read,
-	.llseek = seq_lseek,
-	.release = single_release,
-};
-
-int fmc_debug_init(struct fmc_device *fmc)
-{
-	fmc->dbg_dir = debugfs_create_dir(dev_name(&fmc->dev), NULL);
-	if (IS_ERR_OR_NULL(fmc->dbg_dir)) {
-		pr_err("FMC: Cannot create debugfs\n");
-		return PTR_ERR(fmc->dbg_dir);
-	}
-
-	fmc->dbg_sdb_dump = debugfs_create_file(FMC_DBG_SDB_DUMP, 0444,
-						fmc->dbg_dir, fmc,
-						&fmc_dbgfs_sdb_dump);
-	if (IS_ERR_OR_NULL(fmc->dbg_sdb_dump))
-		pr_err("FMC: Cannot create debugfs file %s\n",
-		       FMC_DBG_SDB_DUMP);
-
-	return 0;
-}
-
-void fmc_debug_exit(struct fmc_device *fmc)
-{
-	if (fmc->dbg_dir)
-		debugfs_remove_recursive(fmc->dbg_dir);
-}
diff --git a/drivers/fmc/fmc-dump.c b/drivers/fmc/fmc-dump.c
deleted file mode 100644
index 6c81dbde1d16..000000000000
--- a/drivers/fmc/fmc-dump.c
+++ /dev/null
@@ -1,58 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * Copyright (C) 2013 CERN (www.cern.ch)
- * Author: Alessandro Rubini <rubini@gnudd.com>
- *
- * This work is part of the White Rabbit project, a research effort led
- * by CERN, the European Institute for Nuclear Research.
- */
-#include <linux/kernel.h>
-#include <linux/moduleparam.h>
-#include <linux/device.h>
-#include <linux/fmc.h>
-#include <linux/fmc-sdb.h>
-
-static int fmc_must_dump_eeprom;
-module_param_named(dump_eeprom, fmc_must_dump_eeprom, int, 0644);
-
-#define LINELEN 16
-
-/* Dumping 8k takes oh so much: avoid duplicate lines */
-static const uint8_t *dump_line(int addr, const uint8_t *line,
-				const uint8_t *prev)
-{
-	int i;
-
-	if (!prev || memcmp(line, prev, LINELEN)) {
-		pr_info("%04x: ", addr);
-		for (i = 0; i < LINELEN; ) {
-			printk(KERN_CONT "%02x", line[i]);
-			i++;
-			printk(i & 3 ? " " : i & (LINELEN - 1) ? "  " : "\n");
-		}
-		return line;
-	}
-	/* repeated line */
-	if (line == prev + LINELEN)
-		pr_info("[...]\n");
-	return prev;
-}
-
-void fmc_dump_eeprom(const struct fmc_device *fmc)
-{
-	const uint8_t *line, *prev;
-	int i;
-
-	if (!fmc_must_dump_eeprom)
-		return;
-
-	pr_info("FMC: %s (%s), slot %i, device %s\n", dev_name(fmc->hwdev),
-		fmc->carrier_name, fmc->slot_id, dev_name(&fmc->dev));
-	pr_info("FMC: dumping eeprom 0x%x (%i) bytes\n", fmc->eeprom_len,
-	       fmc->eeprom_len);
-
-	line = fmc->eeprom;
-	prev = NULL;
-	for (i = 0; i < fmc->eeprom_len; i += LINELEN, line += LINELEN)
-		prev = dump_line(i, line, prev);
-}
diff --git a/drivers/fmc/fmc-fakedev.c b/drivers/fmc/fmc-fakedev.c
deleted file mode 100644
index 941d0930969a..000000000000
--- a/drivers/fmc/fmc-fakedev.c
+++ /dev/null
@@ -1,355 +0,0 @@
-/*
- * Copyright (C) 2012 CERN (www.cern.ch)
- * Author: Alessandro Rubini <rubini@gnudd.com>
- *
- * Permission to use, copy, modify, and/or distribute this software for any
- * purpose with or without fee is hereby granted, provided that the above
- * copyright notice and this permission notice appear in all copies.
- *
- * The software is provided "as is"; the copyright holders disclaim
- * all warranties and liabilities, to the extent permitted by
- * applicable law.
- */
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/string.h>
-#include <linux/device.h>
-#include <linux/slab.h>
-#include <linux/firmware.h>
-#include <linux/workqueue.h>
-#include <linux/err.h>
-#include <linux/fmc.h>
-
-#define FF_EEPROM_SIZE		8192	/* The standard eeprom size */
-#define FF_MAX_MEZZANINES	4	/* Fakes a multi-mezzanine carrier */
-
-/* The user can pass up to 4 names of eeprom images to load */
-static char *ff_eeprom[FF_MAX_MEZZANINES];
-static int ff_nr_eeprom;
-module_param_array_named(eeprom, ff_eeprom, charp, &ff_nr_eeprom, 0444);
-
-/* The user can ask for a multi-mezzanine carrier, with the default eeprom */
-static int ff_nr_dev = 1;
-module_param_named(ndev, ff_nr_dev, int, 0444);
-
-
-/* Lazily, don't support the "standard" module parameters */
-
-/*
- * Eeprom built from these commands:
-
-	../fru-generator -v fake-vendor -n fake-design-for-testing \
-		-s 01234 -p none > IPMI-FRU
-
-	gensdbfs . ../fake-eeprom.bin
-*/
-static char ff_eeimg[FF_MAX_MEZZANINES][FF_EEPROM_SIZE] = {
-	{
-	0x01, 0x00, 0x00, 0x01, 0x00, 0x0c, 0x00, 0xf2, 0x01, 0x0b, 0x00, 0xb2,
-	0x86, 0x87, 0xcb, 0x66, 0x61, 0x6b, 0x65, 0x2d, 0x76, 0x65, 0x6e, 0x64,
-	0x6f, 0x72, 0xd7, 0x66, 0x61, 0x6b, 0x65, 0x2d, 0x64, 0x65, 0x73, 0x69,
-	0x67, 0x6e, 0x2d, 0x66, 0x6f, 0x72, 0x2d, 0x74, 0x65, 0x73, 0x74, 0x69,
-	0x6e, 0x67, 0xc5, 0x30, 0x31, 0x32, 0x33, 0x34, 0xc4, 0x6e, 0x6f, 0x6e,
-	0x65, 0xda, 0x32, 0x30, 0x31, 0x32, 0x2d, 0x31, 0x31, 0x2d, 0x31, 0x39,
-	0x20, 0x32, 0x32, 0x3a, 0x34, 0x32, 0x3a, 0x33, 0x30, 0x2e, 0x30, 0x37,
-	0x34, 0x30, 0x35, 0x35, 0xc1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x87,
-	0x02, 0x02, 0x0d, 0xf7, 0xf8, 0x02, 0xb0, 0x04, 0x74, 0x04, 0xec, 0x04,
-	0x00, 0x00, 0x00, 0x00, 0xe8, 0x03, 0x02, 0x02, 0x0d, 0x5c, 0x93, 0x01,
-	0x4a, 0x01, 0x39, 0x01, 0x5a, 0x01, 0x00, 0x00, 0x00, 0x00, 0xb8, 0x0b,
-	0x02, 0x02, 0x0d, 0x63, 0x8c, 0x00, 0xfa, 0x00, 0xed, 0x00, 0x06, 0x01,
-	0x00, 0x00, 0x00, 0x00, 0xa0, 0x0f, 0x01, 0x02, 0x0d, 0xfb, 0xf5, 0x05,
-	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-	0x01, 0x02, 0x0d, 0xfc, 0xf4, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x02, 0x0d, 0xfd, 0xf3, 0x03,
-	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-	0xfa, 0x82, 0x0b, 0xea, 0x8f, 0xa2, 0x12, 0x00, 0x00, 0x1e, 0x44, 0x00,
-	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-	0x00, 0x00, 0x00, 0x00, 0x53, 0x44, 0x42, 0x2d, 0x00, 0x03, 0x01, 0x01,
-	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00,
-	0x00, 0x00, 0x01, 0xc4, 0x46, 0x69, 0x6c, 0x65, 0x44, 0x61, 0x74, 0x61,
-	0x2e, 0x20, 0x20, 0x20, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00,
-	0x2e, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-	0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00,
-	0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0xc0,
-	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0xc4, 0x46, 0x69, 0x6c, 0x65,
-	0x44, 0x61, 0x74, 0x61, 0x6e, 0x61, 0x6d, 0x65, 0x00, 0x00, 0x00, 0x01,
-	0x00, 0x00, 0x00, 0x00, 0x6e, 0x61, 0x6d, 0x65, 0x20, 0x20, 0x20, 0x20,
-	0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x01,
-	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x00,
-	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xdf,
-	0x46, 0x69, 0x6c, 0x65, 0x44, 0x61, 0x74, 0x61, 0x49, 0x50, 0x4d, 0x49,
-	0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x49, 0x50, 0x4d, 0x49,
-	0x2d, 0x46, 0x52, 0x55, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-	0x20, 0x20, 0x20, 0x01, 0x66, 0x61, 0x6b, 0x65, 0x0a,
-	},
-};
-
-struct ff_dev {
-	struct fmc_device *fmc[FF_MAX_MEZZANINES];
-	struct device dev;
-};
-
-static struct ff_dev *ff_current_dev; /* We have 1 carrier, 1 slot */
-
-static int ff_reprogram(struct fmc_device *fmc, struct fmc_driver *drv,
-			  char *gw)
-{
-	const struct firmware *fw;
-	int ret;
-
-	if (!gw) {
-		/* program golden: success */
-		fmc->flags &= ~FMC_DEVICE_HAS_CUSTOM;
-		fmc->flags |= FMC_DEVICE_HAS_GOLDEN;
-		return 0;
-	}
-
-	dev_info(&fmc->dev, "reprogramming with %s\n", gw);
-	ret = request_firmware(&fw, gw, &fmc->dev);
-	if (ret < 0) {
-		dev_warn(&fmc->dev, "request firmware \"%s\": error %i\n",
-			 gw, ret);
-		goto out;
-	}
-	fmc->flags &= ~FMC_DEVICE_HAS_GOLDEN;
-	fmc->flags |= FMC_DEVICE_HAS_CUSTOM;
-
-out:
-	release_firmware(fw);
-	return ret;
-}
-
-static int ff_irq_request(struct fmc_device *fmc, irq_handler_t handler,
-			    char *name, int flags)
-{
-	return -EOPNOTSUPP;
-}
-
-/* FIXME: should also have some fake FMC GPIO mapping */
-
-
-/*
- * This work function is called when we changed the eeprom. It removes the
- * current fmc device and registers a new one, with different identifiers.
- */
-static struct ff_dev *ff_dev_create(void); /* defined later */
-
-static void ff_work_fn(struct work_struct *work)
-{
-	struct ff_dev *ff = ff_current_dev;
-	int ret;
-
-	fmc_device_unregister_n(ff->fmc, ff_nr_dev);
-	device_unregister(&ff->dev);
-	ff_current_dev = NULL;
-
-	ff = ff_dev_create();
-	if (IS_ERR(ff)) {
-		pr_warning("%s: can't re-create FMC devices\n", __func__);
-		return;
-	}
-	ret = fmc_device_register_n(ff->fmc, ff_nr_dev);
-	if (ret < 0) {
-		dev_warn(&ff->dev, "can't re-register FMC devices\n");
-		device_unregister(&ff->dev);
-		return;
-	}
-
-	ff_current_dev = ff;
-}
-
-static DECLARE_DELAYED_WORK(ff_work, ff_work_fn);
-
-
-/* low-level i2c */
-static int ff_eeprom_read(struct fmc_device *fmc, uint32_t offset,
-		void *buf, size_t size)
-{
-	if (offset > FF_EEPROM_SIZE)
-		return -EINVAL;
-	if (offset + size > FF_EEPROM_SIZE)
-		size = FF_EEPROM_SIZE - offset;
-	memcpy(buf, fmc->eeprom + offset, size);
-	return size;
-}
-
-static int ff_eeprom_write(struct fmc_device *fmc, uint32_t offset,
-		    const void *buf, size_t size)
-{
-	if (offset > FF_EEPROM_SIZE)
-		return -EINVAL;
-	if (offset + size > FF_EEPROM_SIZE)
-		size = FF_EEPROM_SIZE - offset;
-	dev_info(&fmc->dev, "write_eeprom: offset %i, size %zi\n",
-		 (int)offset, size);
-	memcpy(fmc->eeprom + offset, buf, size);
-	schedule_delayed_work(&ff_work, HZ * 2); /* remove, replug, in 2s */
-	return size;
-}
-
-/* i2c operations for fmc */
-static int ff_read_ee(struct fmc_device *fmc, int pos, void *data, int len)
-{
-	if (!(fmc->flags & FMC_DEVICE_HAS_GOLDEN))
-		return -EOPNOTSUPP;
-	return ff_eeprom_read(fmc, pos, data, len);
-}
-
-static int ff_write_ee(struct fmc_device *fmc, int pos,
-			 const void *data, int len)
-{
-	if (!(fmc->flags & FMC_DEVICE_HAS_GOLDEN))
-		return -EOPNOTSUPP;
-	return ff_eeprom_write(fmc, pos, data, len);
-}
-
-/* readl and writel do not do anything. Don't waste RAM with "base" */
-static uint32_t ff_readl(struct fmc_device *fmc, int offset)
-{
-	return 0;
-}
-
-static void ff_writel(struct fmc_device *fmc, uint32_t value, int offset)
-{
-	return;
-}
-
-/* validate is useful so fmc-write-eeprom will not reprogram every 2 seconds */
-static int ff_validate(struct fmc_device *fmc, struct fmc_driver *drv)
-{
-	int i;
-
-	if (!drv->busid_n)
-		return 0; /* everyhing is valid */
-	for (i = 0; i < drv->busid_n; i++)
-		if (drv->busid_val[i] == fmc->device_id)
-			return i;
-	return -ENOENT;
-}
-
-
-
-static struct fmc_operations ff_fmc_operations = {
-	.read32 =		ff_readl,
-	.write32 =		ff_writel,
-	.reprogram =		ff_reprogram,
-	.irq_request =		ff_irq_request,
-	.read_ee =		ff_read_ee,
-	.write_ee =		ff_write_ee,
-	.validate =		ff_validate,
-};
-
-/* This device is kmalloced: release it */
-static void ff_dev_release(struct device *dev)
-{
-	struct ff_dev *ff = container_of(dev, struct ff_dev, dev);
-	kfree(ff);
-}
-
-static struct fmc_device ff_template_fmc = {
-	.version = FMC_VERSION,
-	.owner = THIS_MODULE,
-	.carrier_name = "fake-fmc-carrier",
-	.device_id = 0xf001, /* fool */
-	.eeprom_len = sizeof(ff_eeimg[0]),
-	.memlen = 0x1000, /* 4k, to show something */
-	.op = &ff_fmc_operations,
-	.hwdev = NULL, /* filled at creation time */
-	.flags = FMC_DEVICE_HAS_GOLDEN,
-};
-
-static struct ff_dev *ff_dev_create(void)
-{
-	struct ff_dev *ff;
-	struct fmc_device *fmc;
-	int i, ret;
-
-	ff = kzalloc(sizeof(*ff), GFP_KERNEL);
-	if (!ff)
-		return ERR_PTR(-ENOMEM);
-	dev_set_name(&ff->dev, "fake-fmc-carrier");
-	ff->dev.release = ff_dev_release;
-
-	ret = device_register(&ff->dev);
-	if (ret < 0) {
-		put_device(&ff->dev);
-		return ERR_PTR(ret);
-	}
-
-	/* Create fmc structures that refer to this new "hw" device */
-	for (i = 0; i < ff_nr_dev; i++) {
-		fmc = kmemdup(&ff_template_fmc, sizeof(ff_template_fmc),
-			      GFP_KERNEL);
-		fmc->hwdev = &ff->dev;
-		fmc->carrier_data = ff;
-		fmc->nr_slots = ff_nr_dev;
-		/* the following fields are different for each slot */
-		fmc->eeprom = ff_eeimg[i];
-		fmc->eeprom_addr = 0x50 + 2 * i;
-		fmc->slot_id = i;
-		ff->fmc[i] = fmc;
-		/* increment the identifier, each must be different */
-		ff_template_fmc.device_id++;
-	}
-	return ff;
-}
-
-/* init and exit */
-static int ff_init(void)
-{
-	struct ff_dev *ff;
-	const struct firmware *fw;
-	int i, len, ret = 0;
-
-	/* Replicate the default eeprom for the max number of mezzanines */
-	for (i = 1; i < FF_MAX_MEZZANINES; i++)
-		memcpy(ff_eeimg[i], ff_eeimg[0], sizeof(ff_eeimg[0]));
-
-	if (ff_nr_eeprom > ff_nr_dev)
-		ff_nr_dev = ff_nr_eeprom;
-
-	ff = ff_dev_create();
-	if (IS_ERR(ff))
-		return PTR_ERR(ff);
-
-	/* If the user passed "eeprom=" as a parameter, fetch them */
-	for (i = 0; i < ff_nr_eeprom; i++) {
-		if (!strlen(ff_eeprom[i]))
-			continue;
-		ret = request_firmware(&fw, ff_eeprom[i], &ff->dev);
-		if (ret < 0) {
-			dev_err(&ff->dev, "Mezzanine %i: can't load \"%s\" "
-				"(error %i)\n", i, ff_eeprom[i], -ret);
-		} else {
-			len = min_t(size_t, fw->size, (size_t)FF_EEPROM_SIZE);
-			memcpy(ff_eeimg[i], fw->data, len);
-			release_firmware(fw);
-			dev_info(&ff->dev, "Mezzanine %i: eeprom \"%s\"\n", i,
-				ff_eeprom[i]);
-		}
-	}
-
-	ret = fmc_device_register_n(ff->fmc, ff_nr_dev);
-	if (ret) {
-		device_unregister(&ff->dev);
-		return ret;
-	}
-	ff_current_dev = ff;
-	return ret;
-}
-
-static void ff_exit(void)
-{
-	if (ff_current_dev) {
-		fmc_device_unregister_n(ff_current_dev->fmc, ff_nr_dev);
-		device_unregister(&ff_current_dev->dev);
-	}
-	cancel_delayed_work_sync(&ff_work);
-}
-
-module_init(ff_init);
-module_exit(ff_exit);
-
-MODULE_LICENSE("Dual BSD/GPL");
diff --git a/drivers/fmc/fmc-match.c b/drivers/fmc/fmc-match.c
deleted file mode 100644
index 995bd6041a67..000000000000
--- a/drivers/fmc/fmc-match.c
+++ /dev/null
@@ -1,113 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * Copyright (C) 2012 CERN (www.cern.ch)
- * Author: Alessandro Rubini <rubini@gnudd.com>
- *
- * This work is part of the White Rabbit project, a research effort led
- * by CERN, the European Institute for Nuclear Research.
- */
-#include <linux/kernel.h>
-#include <linux/slab.h>
-#include <linux/fmc.h>
-#include <linux/ipmi-fru.h>
-
-/* The fru parser is both user and kernel capable: it needs alloc */
-void *fru_alloc(size_t size)
-{
-	return kzalloc(size, GFP_KERNEL);
-}
-
-/* The actual match function */
-int fmc_match(struct device *dev, struct device_driver *drv)
-{
-	struct fmc_driver *fdrv = to_fmc_driver(drv);
-	struct fmc_device *fdev = to_fmc_device(dev);
-	struct fmc_fru_id *fid;
-	int i, matched = 0;
-
-	/* This currently only matches the EEPROM (FRU id) */
-	fid = fdrv->id_table.fru_id;
-	if (!fid) {
-		dev_warn(&fdev->dev, "Driver has no ID: matches all\n");
-		matched = 1;
-	} else {
-		if (!fdev->id.manufacturer || !fdev->id.product_name)
-			return 0; /* the device has no FRU information */
-		for (i = 0; i < fdrv->id_table.fru_id_nr; i++, fid++) {
-			if (fid->manufacturer &&
-			    strcmp(fid->manufacturer, fdev->id.manufacturer))
-				continue;
-			if (fid->product_name &&
-			    strcmp(fid->product_name, fdev->id.product_name))
-				continue;
-			matched = 1;
-			break;
-		}
-	}
-
-	/* FIXME: match SDB contents */
-	return matched;
-}
-
-/* This function creates ID info for a newly registered device */
-int fmc_fill_id_info(struct fmc_device *fmc)
-{
-	struct fru_common_header *h;
-	struct fru_board_info_area *bia;
-	int ret, allocated = 0;
-
-	/* If we know the eeprom length, try to read it off the device */
-	if (fmc->eeprom_len && !fmc->eeprom) {
-		fmc->eeprom = kzalloc(fmc->eeprom_len, GFP_KERNEL);
-		if (!fmc->eeprom)
-			return -ENOMEM;
-		allocated = 1;
-		ret = fmc_read_ee(fmc, 0, fmc->eeprom, fmc->eeprom_len);
-		if (ret < 0)
-			goto out;
-	}
-
-	/* If no eeprom, continue with other matches */
-	if (!fmc->eeprom)
-		return 0;
-
-	dev_info(fmc->hwdev, "mezzanine %i\n", fmc->slot_id); /* header */
-
-	/* So we have the eeprom: parse the FRU part (if any) */
-	h = (void *)fmc->eeprom;
-	if (h->format != 1) {
-		pr_info("      EEPROM has no FRU information\n");
-		goto out;
-	}
-	if (!fru_header_cksum_ok(h)) {
-		pr_info("      FRU: wrong header checksum\n");
-		goto out;
-	}
-	bia = fru_get_board_area(h);
-	if (!fru_bia_cksum_ok(bia)) {
-		pr_info("      FRU: wrong board area checksum\n");
-		goto out;
-	}
-	fmc->id.manufacturer = fru_get_board_manufacturer(h);
-	fmc->id.product_name = fru_get_product_name(h);
-	pr_info("      Manufacturer: %s\n", fmc->id.manufacturer);
-	pr_info("      Product name: %s\n", fmc->id.product_name);
-
-	/* Create the short name (FIXME: look in sdb as well) */
-	fmc->mezzanine_name = kstrdup(fmc->id.product_name, GFP_KERNEL);
-
-out:
-	if (allocated) {
-		kfree(fmc->eeprom);
-		fmc->eeprom = NULL;
-	}
-	return 0; /* no error: let other identification work */
-}
-
-/* Some ID data is allocated using fru_alloc() above, so release it */
-void fmc_free_id_info(struct fmc_device *fmc)
-{
-	kfree(fmc->mezzanine_name);
-	kfree(fmc->id.manufacturer);
-	kfree(fmc->id.product_name);
-}
diff --git a/drivers/fmc/fmc-private.h b/drivers/fmc/fmc-private.h
deleted file mode 100644
index 93cb8030f764..000000000000
--- a/drivers/fmc/fmc-private.h
+++ /dev/null
@@ -1,8 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * Copyright (C) 2015 CERN (www.cern.ch)
- * Author: Federico Vaga <federico.vaga@cern.ch>
- */
-
-extern int fmc_debug_init(struct fmc_device *fmc);
-extern void fmc_debug_exit(struct fmc_device *fmc);
diff --git a/drivers/fmc/fmc-sdb.c b/drivers/fmc/fmc-sdb.c
deleted file mode 100644
index 14758db1a5fb..000000000000
--- a/drivers/fmc/fmc-sdb.c
+++ /dev/null
@@ -1,219 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * Copyright (C) 2012 CERN (www.cern.ch)
- * Author: Alessandro Rubini <rubini@gnudd.com>
- *
- * This work is part of the White Rabbit project, a research effort led
- * by CERN, the European Institute for Nuclear Research.
- */
-#include <linux/module.h>
-#include <linux/slab.h>
-#include <linux/fmc.h>
-#include <linux/sdb.h>
-#include <linux/err.h>
-#include <linux/fmc-sdb.h>
-#include <asm/byteorder.h>
-
-static uint32_t __sdb_rd(struct fmc_device *fmc, unsigned long address,
-			int convert)
-{
-	uint32_t res = fmc_readl(fmc, address);
-	if (convert)
-		return __be32_to_cpu(res);
-	return res;
-}
-
-static struct sdb_array *__fmc_scan_sdb_tree(struct fmc_device *fmc,
-					     unsigned long sdb_addr,
-					     unsigned long reg_base, int level)
-{
-	uint32_t onew;
-	int i, j, n, convert = 0;
-	struct sdb_array *arr, *sub;
-
-	onew = fmc_readl(fmc, sdb_addr);
-	if (onew == SDB_MAGIC) {
-		/* Uh! If we are little-endian, we must convert */
-		if (SDB_MAGIC != __be32_to_cpu(SDB_MAGIC))
-			convert = 1;
-	} else if (onew == __be32_to_cpu(SDB_MAGIC)) {
-		/* ok, don't convert */
-	} else {
-		return ERR_PTR(-ENOENT);
-	}
-	/* So, the magic was there: get the count from offset 4*/
-	onew = __sdb_rd(fmc, sdb_addr + 4, convert);
-	n = __be16_to_cpu(*(uint16_t *)&onew);
-	arr = kzalloc(sizeof(*arr), GFP_KERNEL);
-	if (!arr)
-		return ERR_PTR(-ENOMEM);
-	arr->record = kcalloc(n, sizeof(arr->record[0]), GFP_KERNEL);
-	arr->subtree = kcalloc(n, sizeof(arr->subtree[0]), GFP_KERNEL);
-	if (!arr->record || !arr->subtree) {
-		kfree(arr->record);
-		kfree(arr->subtree);
-		kfree(arr);
-		return ERR_PTR(-ENOMEM);
-	}
-
-	arr->len = n;
-	arr->level = level;
-	arr->fmc = fmc;
-	for (i = 0; i < n; i++) {
-		union  sdb_record *r;
-
-		for (j = 0; j < sizeof(arr->record[0]); j += 4) {
-			*(uint32_t *)((void *)(arr->record + i) + j) =
-				__sdb_rd(fmc, sdb_addr + (i * 64) + j, convert);
-		}
-		r = &arr->record[i];
-		arr->subtree[i] = ERR_PTR(-ENODEV);
-		if (r->empty.record_type == sdb_type_bridge) {
-			struct sdb_component *c = &r->bridge.sdb_component;
-			uint64_t subaddr = __be64_to_cpu(r->bridge.sdb_child);
-			uint64_t newbase = __be64_to_cpu(c->addr_first);
-
-			subaddr += reg_base;
-			newbase += reg_base;
-			sub = __fmc_scan_sdb_tree(fmc, subaddr, newbase,
-						  level + 1);
-			arr->subtree[i] = sub; /* may be error */
-			if (IS_ERR(sub))
-				continue;
-			sub->parent = arr;
-			sub->baseaddr = newbase;
-		}
-	}
-	return arr;
-}
-
-int fmc_scan_sdb_tree(struct fmc_device *fmc, unsigned long address)
-{
-	struct sdb_array *ret;
-	if (fmc->sdb)
-		return -EBUSY;
-	ret = __fmc_scan_sdb_tree(fmc, address, 0 /* regs */, 0);
-	if (IS_ERR(ret))
-		return PTR_ERR(ret);
-	fmc->sdb = ret;
-	return 0;
-}
-EXPORT_SYMBOL(fmc_scan_sdb_tree);
-
-static void __fmc_sdb_free(struct sdb_array *arr)
-{
-	int i, n;
-
-	if (!arr)
-		return;
-	n = arr->len;
-	for (i = 0; i < n; i++) {
-		if (IS_ERR(arr->subtree[i]))
-			continue;
-		__fmc_sdb_free(arr->subtree[i]);
-	}
-	kfree(arr->record);
-	kfree(arr->subtree);
-	kfree(arr);
-}
-
-int fmc_free_sdb_tree(struct fmc_device *fmc)
-{
-	__fmc_sdb_free(fmc->sdb);
-	fmc->sdb = NULL;
-	return 0;
-}
-EXPORT_SYMBOL(fmc_free_sdb_tree);
-
-/* This helper calls reprogram and inizialized sdb as well */
-int fmc_reprogram_raw(struct fmc_device *fmc, struct fmc_driver *d,
-		      void *gw, unsigned long len, int sdb_entry)
-{
-	int ret;
-
-	ret = fmc->op->reprogram_raw(fmc, d, gw, len);
-	if (ret < 0)
-		return ret;
-	if (sdb_entry < 0)
-		return ret;
-
-	/* We are required to find SDB at a given offset */
-	ret = fmc_scan_sdb_tree(fmc, sdb_entry);
-	if (ret < 0) {
-		dev_err(&fmc->dev, "Can't find SDB at address 0x%x\n",
-			sdb_entry);
-		return -ENODEV;
-	}
-
-	return 0;
-}
-EXPORT_SYMBOL(fmc_reprogram_raw);
-
-/* This helper calls reprogram and inizialized sdb as well */
-int fmc_reprogram(struct fmc_device *fmc, struct fmc_driver *d, char *gw,
-			 int sdb_entry)
-{
-	int ret;
-
-	ret = fmc->op->reprogram(fmc, d, gw);
-	if (ret < 0)
-		return ret;
-	if (sdb_entry < 0)
-		return ret;
-
-	/* We are required to find SDB at a given offset */
-	ret = fmc_scan_sdb_tree(fmc, sdb_entry);
-	if (ret < 0) {
-		dev_err(&fmc->dev, "Can't find SDB at address 0x%x\n",
-			sdb_entry);
-		return -ENODEV;
-	}
-
-	return 0;
-}
-EXPORT_SYMBOL(fmc_reprogram);
-
-void fmc_show_sdb_tree(const struct fmc_device *fmc)
-{
-	pr_err("%s: not supported anymore, use debugfs to dump SDB\n",
-		__func__);
-}
-EXPORT_SYMBOL(fmc_show_sdb_tree);
-
-signed long fmc_find_sdb_device(struct sdb_array *tree,
-				uint64_t vid, uint32_t did, unsigned long *sz)
-{
-	signed long res = -ENODEV;
-	union  sdb_record *r;
-	struct sdb_product *p;
-	struct sdb_component *c;
-	int i, n = tree->len;
-	uint64_t last, first;
-
-	/* FIXME: what if the first interconnect is not at zero? */
-	for (i = 0; i < n; i++) {
-		r = &tree->record[i];
-		c = &r->dev.sdb_component;
-		p = &c->product;
-
-		if (!IS_ERR(tree->subtree[i]))
-			res = fmc_find_sdb_device(tree->subtree[i],
-						  vid, did, sz);
-		if (res >= 0)
-			return res + tree->baseaddr;
-		if (r->empty.record_type != sdb_type_device)
-			continue;
-		if (__be64_to_cpu(p->vendor_id) != vid)
-			continue;
-		if (__be32_to_cpu(p->device_id) != did)
-			continue;
-		/* found */
-		last = __be64_to_cpu(c->addr_last);
-		first = __be64_to_cpu(c->addr_first);
-		if (sz)
-			*sz = (typeof(*sz))(last + 1 - first);
-		return first + tree->baseaddr;
-	}
-	return res;
-}
-EXPORT_SYMBOL(fmc_find_sdb_device);
diff --git a/drivers/fmc/fmc-trivial.c b/drivers/fmc/fmc-trivial.c
deleted file mode 100644
index b99dbc7ee203..000000000000
--- a/drivers/fmc/fmc-trivial.c
+++ /dev/null
@@ -1,102 +0,0 @@
-/*
- * Copyright (C) 2012 CERN (www.cern.ch)
- * Author: Alessandro Rubini <rubini@gnudd.com>
- *
- * Permission to use, copy, modify, and/or distribute this software for any
- * purpose with or without fee is hereby granted, provided that the above
- * copyright notice and this permission notice appear in all copies.
- *
- * The software is provided "as is"; the copyright holders disclaim
- * all warranties and liabilities, to the extent permitted by
- * applicable law.
- */
-
-/* A trivial fmc driver that can load a gateware file and reports interrupts */
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/interrupt.h>
-#include <linux/fmc.h>
-
-static struct fmc_driver t_drv; /* initialized later */
-
-static irqreturn_t t_handler(int irq, void *dev_id)
-{
-	struct fmc_device *fmc = dev_id;
-
-	fmc_irq_ack(fmc);
-	dev_info(&fmc->dev, "received irq %i\n", irq);
-	return IRQ_HANDLED;
-}
-
-static struct fmc_gpio t_gpio[] = {
-	{
-		.gpio = FMC_GPIO_IRQ(0),
-		.mode = GPIOF_DIR_IN,
-		.irqmode = IRQF_TRIGGER_RISING,
-	}, {
-		.gpio = FMC_GPIO_IRQ(1),
-		.mode = GPIOF_DIR_IN,
-		.irqmode = IRQF_TRIGGER_RISING,
-	}
-};
-
-static int t_probe(struct fmc_device *fmc)
-{
-	int ret;
-	int index = 0;
-
-	index = fmc_validate(fmc, &t_drv);
-	if (index < 0)
-		return -EINVAL; /* not our device: invalid */
-
-	ret = fmc_irq_request(fmc, t_handler, "fmc-trivial", IRQF_SHARED);
-	if (ret < 0)
-		return ret;
-	/* ignore error code of call below, we really don't care */
-	fmc_gpio_config(fmc, t_gpio, ARRAY_SIZE(t_gpio));
-
-	ret = fmc_reprogram(fmc, &t_drv, "", 0);
-	if (ret == -EPERM) /* programming not supported */
-		ret = 0;
-	if (ret < 0)
-		fmc_irq_free(fmc);
-
-	/* FIXME: reprogram LM32 too */
-	return ret;
-}
-
-static int t_remove(struct fmc_device *fmc)
-{
-	fmc_irq_free(fmc);
-	return 0;
-}
-
-static struct fmc_driver t_drv = {
-	.version = FMC_VERSION,
-	.driver.name = KBUILD_MODNAME,
-	.probe = t_probe,
-	.remove = t_remove,
-	/* no table, as the current match just matches everything */
-};
-
- /* We accept the generic parameters */
-FMC_PARAM_BUSID(t_drv);
-FMC_PARAM_GATEWARE(t_drv);
-
-static int t_init(void)
-{
-	int ret;
-
-	ret = fmc_driver_register(&t_drv);
-	return ret;
-}
-
-static void t_exit(void)
-{
-	fmc_driver_unregister(&t_drv);
-}
-
-module_init(t_init);
-module_exit(t_exit);
-
-MODULE_LICENSE("Dual BSD/GPL");
diff --git a/drivers/fmc/fmc-write-eeprom.c b/drivers/fmc/fmc-write-eeprom.c
deleted file mode 100644
index 1c7826e3f526..000000000000
--- a/drivers/fmc/fmc-write-eeprom.c
+++ /dev/null
@@ -1,175 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * Copyright (C) 2012 CERN (www.cern.ch)
- * Author: Alessandro Rubini <rubini@gnudd.com>
- *
- * This work is part of the White Rabbit project, a research effort led
- * by CERN, the European Institute for Nuclear Research.
- */
-#include <linux/module.h>
-#include <linux/string.h>
-#include <linux/firmware.h>
-#include <linux/init.h>
-#include <linux/fmc.h>
-#include <asm/unaligned.h>
-
-/*
- * This module uses the firmware loader to program the whole or part
- * of the FMC eeprom. The meat is in the _run functions.  However, no
- * default file name is provided, to avoid accidental mishaps. Also,
- * you must pass the busid argument
- */
-static struct fmc_driver fwe_drv;
-
-FMC_PARAM_BUSID(fwe_drv);
-
-/* The "file=" is like the generic "gateware=" used elsewhere */
-static char *fwe_file[FMC_MAX_CARDS];
-static int fwe_file_n;
-module_param_array_named(file, fwe_file, charp, &fwe_file_n, 0444);
-
-static int fwe_run_tlv(struct fmc_device *fmc, const struct firmware *fw,
-	int write)
-{
-	const uint8_t *p = fw->data;
-	int len = fw->size;
-	uint16_t thislen, thisaddr;
-	int err;
-
-	/* format is: 'w' addr16 len16 data... */
-	while (len > 5) {
-		thisaddr = get_unaligned_le16(p+1);
-		thislen = get_unaligned_le16(p+3);
-		if (p[0] != 'w' || thislen + 5 > len) {
-			dev_err(&fmc->dev, "invalid tlv at offset %ti\n",
-				p - fw->data);
-			return -EINVAL;
-		}
-		err = 0;
-		if (write) {
-			dev_info(&fmc->dev, "write %i bytes at 0x%04x\n",
-				 thislen, thisaddr);
-			err = fmc_write_ee(fmc, thisaddr, p + 5, thislen);
-		}
-		if (err < 0) {
-			dev_err(&fmc->dev, "write failure @0x%04x\n",
-				thisaddr);
-			return err;
-		}
-		p += 5 + thislen;
-		len -= 5 + thislen;
-	}
-	if (write)
-		dev_info(&fmc->dev, "write_eeprom: success\n");
-	return 0;
-}
-
-static int fwe_run_bin(struct fmc_device *fmc, const struct firmware *fw)
-{
-	int ret;
-
-	dev_info(&fmc->dev, "programming %zi bytes\n", fw->size);
-	ret = fmc_write_ee(fmc, 0, (void *)fw->data, fw->size);
-	if (ret < 0) {
-		dev_info(&fmc->dev, "write_eeprom: error %i\n", ret);
-		return ret;
-	}
-	dev_info(&fmc->dev, "write_eeprom: success\n");
-	return 0;
-}
-
-static int fwe_run(struct fmc_device *fmc, const struct firmware *fw, char *s)
-{
-	char *last4 = s + strlen(s) - 4;
-	int err;
-
-	if (!strcmp(last4, ".bin"))
-		return fwe_run_bin(fmc, fw);
-	if (!strcmp(last4, ".tlv")) {
-		err = fwe_run_tlv(fmc, fw, 0);
-		if (!err)
-			err = fwe_run_tlv(fmc, fw, 1);
-		return err;
-	}
-	dev_err(&fmc->dev, "invalid file name \"%s\"\n", s);
-	return -EINVAL;
-}
-
-/*
- * Programming is done at probe time. Morever, only those listed with
- * busid= are programmed.
- * card is probed for, only one is programmed. Unfortunately, it's
- * difficult to know in advance when probing the first card if others
- * are there.
- */
-static int fwe_probe(struct fmc_device *fmc)
-{
-	int err, index = 0;
-	const struct firmware *fw;
-	struct device *dev = &fmc->dev;
-	char *s;
-
-	if (!fwe_drv.busid_n) {
-		dev_err(dev, "%s: no busid passed, refusing all cards\n",
-			KBUILD_MODNAME);
-		return -ENODEV;
-	}
-
-	index = fmc_validate(fmc, &fwe_drv);
-	if (index < 0) {
-		pr_err("%s: refusing device \"%s\"\n", KBUILD_MODNAME,
-		       dev_name(dev));
-		return -ENODEV;
-	}
-	if (index >= fwe_file_n) {
-		pr_err("%s: no filename for device index %i\n",
-			KBUILD_MODNAME, index);
-		return -ENODEV;
-	}
-	s = fwe_file[index];
-	if (!s) {
-		pr_err("%s: no filename for \"%s\" not programming\n",
-		       KBUILD_MODNAME, dev_name(dev));
-		return -ENOENT;
-	}
-	err = request_firmware(&fw, s, dev);
-	if (err < 0) {
-		dev_err(&fmc->dev, "request firmware \"%s\": error %i\n",
-			s, err);
-		return err;
-	}
-	fwe_run(fmc, fw, s);
-	release_firmware(fw);
-	return 0;
-}
-
-static int fwe_remove(struct fmc_device *fmc)
-{
-	return 0;
-}
-
-static struct fmc_driver fwe_drv = {
-	.version = FMC_VERSION,
-	.driver.name = KBUILD_MODNAME,
-	.probe = fwe_probe,
-	.remove = fwe_remove,
-	/* no table, as the current match just matches everything */
-};
-
-static int fwe_init(void)
-{
-	int ret;
-
-	ret = fmc_driver_register(&fwe_drv);
-	return ret;
-}
-
-static void fwe_exit(void)
-{
-	fmc_driver_unregister(&fwe_drv);
-}
-
-module_init(fwe_init);
-module_exit(fwe_exit);
-
-MODULE_LICENSE("GPL");
diff --git a/drivers/fmc/fru-parse.c b/drivers/fmc/fru-parse.c
deleted file mode 100644
index f551b81f4fd9..000000000000
--- a/drivers/fmc/fru-parse.c
+++ /dev/null
@@ -1,80 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * Copyright (C) 2012 CERN (www.cern.ch)
- * Author: Alessandro Rubini <rubini@gnudd.com>
- *
- * This work is part of the White Rabbit project, a research effort led
- * by CERN, the European Institute for Nuclear Research.
- */
-#include <linux/ipmi-fru.h>
-
-/* Some internal helpers */
-static struct fru_type_length *
-__fru_get_board_tl(struct fru_common_header *header, int nr)
-{
-	struct fru_board_info_area *bia;
-	struct fru_type_length *tl;
-
-	bia = fru_get_board_area(header);
-	tl = bia->tl;
-	while (nr > 0 && !fru_is_eof(tl)) {
-		tl = fru_next_tl(tl);
-		nr--;
-	}
-	if (fru_is_eof(tl))
-		return NULL;
-	return tl;
-}
-
-static char *__fru_alloc_get_tl(struct fru_common_header *header, int nr)
-{
-	struct fru_type_length *tl;
-	char *res;
-
-	tl = __fru_get_board_tl(header, nr);
-	if (!tl)
-		return NULL;
-
-	res = fru_alloc(fru_strlen(tl) + 1);
-	if (!res)
-		return NULL;
-	return fru_strcpy(res, tl);
-}
-
-/* Public checksum verifiers */
-int fru_header_cksum_ok(struct fru_common_header *header)
-{
-	uint8_t *ptr = (void *)header;
-	int i, sum;
-
-	for (i = sum = 0; i < sizeof(*header); i++)
-		sum += ptr[i];
-	return (sum & 0xff) == 0;
-}
-int fru_bia_cksum_ok(struct fru_board_info_area *bia)
-{
-	uint8_t *ptr = (void *)bia;
-	int i, sum;
-
-	for (i = sum = 0; i < 8 * bia->area_len; i++)
-		sum += ptr[i];
-	return (sum & 0xff) == 0;
-}
-
-/* Get various stuff, trivial */
-char *fru_get_board_manufacturer(struct fru_common_header *header)
-{
-	return __fru_alloc_get_tl(header, 0);
-}
-char *fru_get_product_name(struct fru_common_header *header)
-{
-	return __fru_alloc_get_tl(header, 1);
-}
-char *fru_get_serial_number(struct fru_common_header *header)
-{
-	return __fru_alloc_get_tl(header, 2);
-}
-char *fru_get_part_number(struct fru_common_header *header)
-{
-	return __fru_alloc_get_tl(header, 3);
-}
diff --git a/include/linux/fmc-sdb.h b/include/linux/fmc-sdb.h
deleted file mode 100644
index bec899f0867c..000000000000
--- a/include/linux/fmc-sdb.h
+++ /dev/null
@@ -1,39 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * This file is separate from sdb.h, because I want that one to remain
- * unchanged (as far as possible) from the official sdb distribution
- *
- * This file and associated functionality are a playground for me to
- * understand stuff which will later be implemented in more generic places.
- */
-#include <linux/sdb.h>
-
-/* This is the union of all currently defined types */
-union sdb_record {
-	struct sdb_interconnect ic;
-	struct sdb_device dev;
-	struct sdb_bridge bridge;
-	struct sdb_integration integr;
-	struct sdb_empty empty;
-	struct sdb_synthesis synthesis;
-	struct sdb_repo_url repo_url;
-};
-
-struct fmc_device;
-
-/* Every sdb table is turned into this structure */
-struct sdb_array {
-	int len;
-	int level;
-	unsigned long baseaddr;
-	struct fmc_device *fmc;		/* the device that hosts it */
-	struct sdb_array *parent;	/* NULL at root */
-	union sdb_record *record;	/* copies of the struct */
-	struct sdb_array **subtree;	/* only valid for bridge items */
-};
-
-extern int fmc_scan_sdb_tree(struct fmc_device *fmc, unsigned long address);
-extern void fmc_show_sdb_tree(const struct fmc_device *fmc);
-extern signed long fmc_find_sdb_device(struct sdb_array *tree, uint64_t vendor,
-				       uint32_t device, unsigned long *sz);
-extern int fmc_free_sdb_tree(struct fmc_device *fmc);
diff --git a/include/linux/fmc.h b/include/linux/fmc.h
deleted file mode 100644
index 8661a46a676f..000000000000
--- a/include/linux/fmc.h
+++ /dev/null
@@ -1,271 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * Copyright (C) 2012 CERN (www.cern.ch)
- * Author: Alessandro Rubini <rubini@gnudd.com>
- *
- * This work is part of the White Rabbit project, a research effort led
- * by CERN, the European Institute for Nuclear Research.
- */
-#ifndef __LINUX_FMC_H__
-#define __LINUX_FMC_H__
-#include <linux/types.h>
-#include <linux/moduleparam.h>
-#include <linux/device.h>
-#include <linux/list.h>
-#include <linux/interrupt.h>
-#include <linux/io.h>
-
-struct fmc_device;
-struct fmc_driver;
-
-/*
- * This bus abstraction is developed separately from drivers, so we need
- * to check the version of the data structures we receive.
- */
-
-#define FMC_MAJOR	3
-#define FMC_MINOR	0
-#define FMC_VERSION	((FMC_MAJOR << 16) | FMC_MINOR)
-#define __FMC_MAJOR(x)	((x) >> 16)
-#define __FMC_MINOR(x)	((x) & 0xffff)
-
-/*
- * The device identification, as defined by the IPMI FRU (Field Replaceable
- * Unit) includes four different strings to describe the device. Here we
- * only match the "Board Manufacturer" and the "Board Product Name",
- * ignoring the "Board Serial Number" and "Board Part Number". All 4 are
- * expected to be strings, so they are treated as zero-terminated C strings.
- * Unspecified string (NULL) means "any", so if both are unspecified this
- * is a catch-all driver. So null entries are allowed and we use array
- * and length. This is unlike pci and usb that use null-terminated arrays
- */
-struct fmc_fru_id {
-	char *manufacturer;
-	char *product_name;
-};
-
-/*
- * If the FPGA is already programmed (think Etherbone or the second
- * SVEC slot), we can match on SDB devices in the memory image. This
- * match uses an array of devices that must all be present, and the
- * match is based on vendor and device only. Further checks are expected
- * to happen in the probe function. Zero means "any" and catch-all is allowed.
- */
-struct fmc_sdb_one_id {
-	uint64_t vendor;
-	uint32_t device;
-};
-struct fmc_sdb_id {
-	struct fmc_sdb_one_id *cores;
-	int cores_nr;
-};
-
-struct fmc_device_id {
-	struct fmc_fru_id *fru_id;
-	int fru_id_nr;
-	struct fmc_sdb_id *sdb_id;
-	int sdb_id_nr;
-};
-
-/* This sizes the module_param_array used by generic module parameters */
-#define FMC_MAX_CARDS 32
-
-/* The driver is a pretty simple thing */
-struct fmc_driver {
-	unsigned long version;
-	struct device_driver driver;
-	int (*probe)(struct fmc_device *);
-	int (*remove)(struct fmc_device *);
-	const struct fmc_device_id id_table;
-	/* What follows is for generic module parameters */
-	int busid_n;
-	int busid_val[FMC_MAX_CARDS];
-	int gw_n;
-	char *gw_val[FMC_MAX_CARDS];
-};
-#define to_fmc_driver(x) container_of((x), struct fmc_driver, driver)
-
-/* These are the generic parameters, that drivers may instantiate */
-#define FMC_PARAM_BUSID(_d) \
-    module_param_array_named(busid, _d.busid_val, int, &_d.busid_n, 0444)
-#define FMC_PARAM_GATEWARE(_d) \
-    module_param_array_named(gateware, _d.gw_val, charp, &_d.gw_n, 0444)
-
-/*
- * Drivers may need to configure gpio pins in the carrier. To read input
- * (a very uncommon operation, and definitely not in the hot paths), just
- * configure one gpio only and get 0 or 1 as retval of the config method
- */
-struct fmc_gpio {
-	char *carrier_name; /* name or NULL for virtual pins */
-	int gpio;
-	int _gpio;	/* internal use by the carrier */
-	int mode;	/* GPIOF_DIR_OUT etc */
-	int irqmode;	/* IRQF_TRIGGER_LOW and so on */
-};
-
-/* The numbering of gpio pins allows access to raw pins or virtual roles */
-#define FMC_GPIO_RAW(x)		(x)		/* 4096 of them */
-#define __FMC_GPIO_IS_RAW(x)	((x) < 0x1000)
-#define FMC_GPIO_IRQ(x)		((x) + 0x1000)	/*  256 of them */
-#define FMC_GPIO_LED(x)		((x) + 0x1100)	/*  256 of them */
-#define FMC_GPIO_KEY(x)		((x) + 0x1200)	/*  256 of them */
-#define FMC_GPIO_TP(x)		((x) + 0x1300)	/*  256 of them */
-#define FMC_GPIO_USER(x)	((x) + 0x1400)	/*  256 of them */
-/* We may add SCL and SDA, or other roles if the need arises */
-
-/*
- * These are similar to the legacy Linux GPIO defines from <linux/gpio.h>
- * but in fact FMC has its own GPIO handling and is not using the Linux
- * GPIO subsystem.
- */
-#define GPIOF_DIR_OUT   (0 << 0)
-#define GPIOF_DIR_IN    (1 << 0)
-#define GPIOF_INIT_LOW  (0 << 1)
-#define GPIOF_INIT_HIGH (1 << 1)
-
-/*
- * The operations are offered by each carrier and should make driver
- * design completely independent of the carrier. Named GPIO pins may be
- * the exception.
- */
-struct fmc_operations {
-	uint32_t (*read32)(struct fmc_device *fmc, int offset);
-	void (*write32)(struct fmc_device *fmc, uint32_t value, int offset);
-	int (*validate)(struct fmc_device *fmc, struct fmc_driver *drv);
-	int (*reprogram_raw)(struct fmc_device *f, struct fmc_driver *d,
-			     void *gw, unsigned long len);
-	int (*reprogram)(struct fmc_device *f, struct fmc_driver *d, char *gw);
-	int (*irq_request)(struct fmc_device *fmc, irq_handler_t h,
-			   char *name, int flags);
-	void (*irq_ack)(struct fmc_device *fmc);
-	int (*irq_free)(struct fmc_device *fmc);
-	int (*gpio_config)(struct fmc_device *fmc, struct fmc_gpio *gpio,
-			   int ngpio);
-	int (*read_ee)(struct fmc_device *fmc, int pos, void *d, int l);
-	int (*write_ee)(struct fmc_device *fmc, int pos, const void *d, int l);
-};
-
-/* Prefer this helper rather than calling of fmc->reprogram directly */
-int fmc_reprogram_raw(struct fmc_device *fmc, struct fmc_driver *d,
-		      void *gw, unsigned long len, int sdb_entry);
-extern int fmc_reprogram(struct fmc_device *f, struct fmc_driver *d, char *gw,
-		     int sdb_entry);
-
-/*
- * The device reports all information needed to access hw.
- *
- * If we have eeprom_len and not contents, the core reads it.
- * Then, parsing of identifiers is done by the core which fills fmc_fru_id..
- * Similarly a device that must be matched based on SDB cores must
- * fill the entry point and the core will scan the bus (FIXME: sdb match)
- */
-struct fmc_device {
-	unsigned long version;
-	unsigned long flags;
-	struct module *owner;		/* char device must pin it */
-	struct fmc_fru_id id;		/* for EEPROM-based match */
-	struct fmc_operations *op;	/* carrier-provided */
-	int irq;			/* according to host bus. 0 == none */
-	int eeprom_len;			/* Usually 8kB, may be less */
-	int eeprom_addr;		/* 0x50, 0x52 etc */
-	uint8_t *eeprom;		/* Full contents or leading part */
-	char *carrier_name;		/* "SPEC" or similar, for special use */
-	void *carrier_data;		/* "struct spec *" or equivalent */
-	__iomem void *fpga_base;	/* May be NULL (Etherbone) */
-	__iomem void *slot_base;	/* Set by the driver */
-	struct fmc_device **devarray;	/* Allocated by the bus */
-	int slot_id;			/* Index in the slot array */
-	int nr_slots;			/* Number of slots in this carrier */
-	unsigned long memlen;		/* Used for the char device */
-	struct device dev;		/* For Linux use */
-	struct device *hwdev;		/* The underlying hardware device */
-	unsigned long sdbfs_entry;
-	struct sdb_array *sdb;
-	uint32_t device_id;		/* Filled by the device */
-	char *mezzanine_name;		/* Defaults to ``fmc'' */
-	void *mezzanine_data;
-
-	struct dentry *dbg_dir;
-	struct dentry *dbg_sdb_dump;
-};
-#define to_fmc_device(x) container_of((x), struct fmc_device, dev)
-
-#define FMC_DEVICE_HAS_GOLDEN		1
-#define FMC_DEVICE_HAS_CUSTOM		2
-#define FMC_DEVICE_NO_MEZZANINE		4
-#define FMC_DEVICE_MATCH_SDB		8 /* fmc-core must scan sdb in fpga */
-
-/*
- * If fpga_base can be used, the carrier offers no readl/writel methods, and
- * this expands to a single, fast, I/O access.
- */
-static inline uint32_t fmc_readl(struct fmc_device *fmc, int offset)
-{
-	if (unlikely(fmc->op->read32))
-		return fmc->op->read32(fmc, offset);
-	return readl(fmc->fpga_base + offset);
-}
-static inline void fmc_writel(struct fmc_device *fmc, uint32_t val, int off)
-{
-	if (unlikely(fmc->op->write32))
-		fmc->op->write32(fmc, val, off);
-	else
-		writel(val, fmc->fpga_base + off);
-}
-
-/* pci-like naming */
-static inline void *fmc_get_drvdata(const struct fmc_device *fmc)
-{
-	return dev_get_drvdata(&fmc->dev);
-}
-
-static inline void fmc_set_drvdata(struct fmc_device *fmc, void *data)
-{
-	dev_set_drvdata(&fmc->dev, data);
-}
-
-struct fmc_gateware {
-	void *bitstream;
-	unsigned long len;
-};
-
-/* The 5 access points */
-extern int fmc_driver_register(struct fmc_driver *drv);
-extern void fmc_driver_unregister(struct fmc_driver *drv);
-extern int fmc_device_register(struct fmc_device *tdev);
-extern int fmc_device_register_gw(struct fmc_device *tdev,
-				  struct fmc_gateware *gw);
-extern void fmc_device_unregister(struct fmc_device *tdev);
-
-/* Three more for device sets, all driven by the same FPGA */
-extern int fmc_device_register_n(struct fmc_device **devs, int n);
-extern int fmc_device_register_n_gw(struct fmc_device **devs, int n,
-				    struct fmc_gateware *gw);
-extern void fmc_device_unregister_n(struct fmc_device **devs, int n);
-
-/* Internal cross-calls between files; not exported to other modules */
-extern int fmc_match(struct device *dev, struct device_driver *drv);
-extern int fmc_fill_id_info(struct fmc_device *fmc);
-extern void fmc_free_id_info(struct fmc_device *fmc);
-extern void fmc_dump_eeprom(const struct fmc_device *fmc);
-
-/* helpers for FMC operations */
-extern int fmc_irq_request(struct fmc_device *fmc, irq_handler_t h,
-			   char *name, int flags);
-extern void fmc_irq_free(struct fmc_device *fmc);
-extern void fmc_irq_ack(struct fmc_device *fmc);
-extern int fmc_validate(struct fmc_device *fmc, struct fmc_driver *drv);
-extern int fmc_gpio_config(struct fmc_device *fmc, struct fmc_gpio *gpio,
-			   int ngpio);
-extern int fmc_read_ee(struct fmc_device *fmc, int pos, void *d, int l);
-extern int fmc_write_ee(struct fmc_device *fmc, int pos, const void *d, int l);
-
-/* helpers for FMC operations */
-extern int fmc_irq_request(struct fmc_device *fmc, irq_handler_t h,
-			   char *name, int flags);
-extern void fmc_irq_free(struct fmc_device *fmc);
-extern void fmc_irq_ack(struct fmc_device *fmc);
-extern int fmc_validate(struct fmc_device *fmc, struct fmc_driver *drv);
-
-#endif /* __LINUX_FMC_H__ */
-- 
cgit v1.2.3


From 9a5ed0bac86edce4097abf7595a7de050b2f87fa Mon Sep 17 00:00:00 2001
From: Linus Walleij <linus.walleij@linaro.org>
Date: Wed, 12 Jun 2019 09:42:22 +0200
Subject: regulator: wm831x: Convert to use GPIO descriptors

This converts the Wolfson Micro WM831x DCDC converter to use
a GPIO descriptor for the GPIO driving the DVS pin.

There is just one (non-DT) machine in the kernel using this, and
that is the Wolfson Micro (now Cirrus) Cragganmore 6410 so we
patch this board to pass a descriptor table and fix up the driver
accordingly.

Cc: Charles Keepax <ckeepax@opensource.cirrus.com>
Cc: Richard Fitzgerald <rf@opensource.cirrus.com>
Cc: patches@opensource.cirrus.com
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
Acked-by: Charles Keepax <ckeepax@opensource.cirrus.com>
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 arch/arm/mach-s3c64xx/mach-crag6410.c | 21 ++++++++++++++++++++-
 drivers/regulator/wm831x-dcdc.c       | 29 +++++++++++++----------------
 include/linux/mfd/wm831x/pdata.h      |  1 -
 3 files changed, 33 insertions(+), 18 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm/mach-s3c64xx/mach-crag6410.c b/arch/arm/mach-s3c64xx/mach-crag6410.c
index 379424d72ae7..8ec6a4f5eb05 100644
--- a/arch/arm/mach-s3c64xx/mach-crag6410.c
+++ b/arch/arm/mach-s3c64xx/mach-crag6410.c
@@ -15,6 +15,7 @@
 #include <linux/io.h>
 #include <linux/init.h>
 #include <linux/gpio.h>
+#include <linux/gpio/machine.h>
 #include <linux/leds.h>
 #include <linux/delay.h>
 #include <linux/mmc/host.h>
@@ -398,7 +399,6 @@ static struct pca953x_platform_data crag6410_pca_data = {
 /* VDDARM is controlled by DVS1 connected to GPK(0) */
 static struct wm831x_buckv_pdata vddarm_pdata = {
 	.dvs_control_src = 1,
-	.dvs_gpio = S3C64XX_GPK(0),
 };
 
 static struct regulator_consumer_supply vddarm_consumers[] = {
@@ -596,6 +596,24 @@ static struct wm831x_pdata crag_pmic_pdata = {
 	.touch = &touch_pdata,
 };
 
+/*
+ * VDDARM is eventually ending up as a regulator hanging on the MFD cell device
+ * "wm831x-buckv.1" spawn from drivers/mfd/wm831x-core.c.
+ *
+ * From the note on the platform data we can see that this is clearly DVS1
+ * and assigned as dcdc1 resource to the MFD core which sets .id of the cell
+ * spawning the DVS1 platform device to 1, then the cell platform device
+ * name is calculated from 10*instance + id resulting in the device name
+ * "wm831x-buckv.11"
+ */
+static struct gpiod_lookup_table crag_pmic_gpiod_table = {
+	.dev_id = "wm831x-buckv.11",
+	.table = {
+		GPIO_LOOKUP("GPIOK", 0, "dvs", GPIO_ACTIVE_HIGH),
+		{ },
+	},
+};
+
 static struct i2c_board_info i2c_devs0[] = {
 	{ I2C_BOARD_INFO("24c08", 0x50), },
 	{ I2C_BOARD_INFO("tca6408", 0x20),
@@ -836,6 +854,7 @@ static void __init crag6410_machine_init(void)
 	s3c_fb_set_platdata(&crag6410_lcd_pdata);
 	dwc2_hsotg_set_platdata(&crag6410_hsotg_pdata);
 
+	gpiod_add_lookup_table(&crag_pmic_gpiod_table);
 	i2c_register_board_info(0, i2c_devs0, ARRAY_SIZE(i2c_devs0));
 	i2c_register_board_info(1, i2c_devs1, ARRAY_SIZE(i2c_devs1));
 
diff --git a/drivers/regulator/wm831x-dcdc.c b/drivers/regulator/wm831x-dcdc.c
index b422eef97b77..018dbbd96771 100644
--- a/drivers/regulator/wm831x-dcdc.c
+++ b/drivers/regulator/wm831x-dcdc.c
@@ -15,7 +15,7 @@
 #include <linux/platform_device.h>
 #include <linux/regulator/driver.h>
 #include <linux/regulator/machine.h>
-#include <linux/gpio.h>
+#include <linux/gpio/consumer.h>
 #include <linux/slab.h>
 
 #include <linux/mfd/wm831x/core.h>
@@ -50,7 +50,7 @@ struct wm831x_dcdc {
 	int base;
 	struct wm831x *wm831x;
 	struct regulator_dev *regulator;
-	int dvs_gpio;
+	struct gpio_desc *dvs_gpiod;
 	int dvs_gpio_state;
 	int on_vsel;
 	int dvs_vsel;
@@ -217,7 +217,7 @@ static int wm831x_buckv_set_dvs(struct regulator_dev *rdev, int state)
 		return 0;
 
 	dcdc->dvs_gpio_state = state;
-	gpio_set_value(dcdc->dvs_gpio, state);
+	gpiod_set_value(dcdc->dvs_gpiod, state);
 
 	/* Should wait for DVS state change to be asserted if we have
 	 * a GPIO for it, for now assume the device is configured
@@ -237,10 +237,10 @@ static int wm831x_buckv_set_voltage_sel(struct regulator_dev *rdev,
 	int ret;
 
 	/* If this value is already set then do a GPIO update if we can */
-	if (dcdc->dvs_gpio && dcdc->on_vsel == vsel)
+	if (dcdc->dvs_gpiod && dcdc->on_vsel == vsel)
 		return wm831x_buckv_set_dvs(rdev, 0);
 
-	if (dcdc->dvs_gpio && dcdc->dvs_vsel == vsel)
+	if (dcdc->dvs_gpiod && dcdc->dvs_vsel == vsel)
 		return wm831x_buckv_set_dvs(rdev, 1);
 
 	/* Always set the ON status to the minimum voltage */
@@ -249,7 +249,7 @@ static int wm831x_buckv_set_voltage_sel(struct regulator_dev *rdev,
 		return ret;
 	dcdc->on_vsel = vsel;
 
-	if (!dcdc->dvs_gpio)
+	if (!dcdc->dvs_gpiod)
 		return ret;
 
 	/* Kick the voltage transition now */
@@ -296,7 +296,7 @@ static int wm831x_buckv_get_voltage_sel(struct regulator_dev *rdev)
 {
 	struct wm831x_dcdc *dcdc = rdev_get_drvdata(rdev);
 
-	if (dcdc->dvs_gpio && dcdc->dvs_gpio_state)
+	if (dcdc->dvs_gpiod && dcdc->dvs_gpio_state)
 		return dcdc->dvs_vsel;
 	else
 		return dcdc->on_vsel;
@@ -337,7 +337,7 @@ static void wm831x_buckv_dvs_init(struct platform_device *pdev,
 	int ret;
 	u16 ctrl;
 
-	if (!pdata || !pdata->dvs_gpio)
+	if (!pdata)
 		return;
 
 	/* gpiolib won't let us read the GPIO status so pick the higher
@@ -345,17 +345,14 @@ static void wm831x_buckv_dvs_init(struct platform_device *pdev,
 	 */
 	dcdc->dvs_gpio_state = pdata->dvs_init_state;
 
-	ret = devm_gpio_request_one(&pdev->dev, pdata->dvs_gpio,
-				    dcdc->dvs_gpio_state ? GPIOF_INIT_HIGH : 0,
-				    "DCDC DVS");
-	if (ret < 0) {
-		dev_err(wm831x->dev, "Failed to get %s DVS GPIO: %d\n",
-			dcdc->name, ret);
+	dcdc->dvs_gpiod = devm_gpiod_get(&pdev->dev, "dvs",
+			dcdc->dvs_gpio_state ? GPIOD_OUT_HIGH : GPIOD_OUT_LOW);
+	if (IS_ERR(dcdc->dvs_gpiod)) {
+		dev_err(wm831x->dev, "Failed to get %s DVS GPIO: %ld\n",
+			dcdc->name, PTR_ERR(dcdc->dvs_gpiod));
 		return;
 	}
 
-	dcdc->dvs_gpio = pdata->dvs_gpio;
-
 	switch (pdata->dvs_control_src) {
 	case 1:
 		ctrl = 2 << WM831X_DC1_DVS_SRC_SHIFT;
diff --git a/include/linux/mfd/wm831x/pdata.h b/include/linux/mfd/wm831x/pdata.h
index dcc9631b3052..1b8bb36e13b8 100644
--- a/include/linux/mfd/wm831x/pdata.h
+++ b/include/linux/mfd/wm831x/pdata.h
@@ -52,7 +52,6 @@ struct wm831x_battery_pdata {
  * I2C or SPI buses.
  */
 struct wm831x_buckv_pdata {
-	int dvs_gpio;        /** CPU GPIO to use for DVS switching */
 	int dvs_control_src; /** Hardware DVS source to use (1 or 2) */
 	int dvs_init_state;  /** DVS state to expect on startup */
 	int dvs_state_gpio;  /** CPU GPIO to use for monitoring status */
-- 
cgit v1.2.3


From 5740671e596bdc3986a5391997de194300970201 Mon Sep 17 00:00:00 2001
From: Chris Wilson <chris@chris-wilson.co.uk>
Date: Wed, 12 Jun 2019 14:28:30 +0100
Subject: dma-fence/reservation: Markup rcu protected access for DEBUG_MUTEXES
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Mark the access to reservation_object.fence as being protected to
silence sparse.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Christian König <christian.koenig@amd.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20190612132830.31221-1-chris@chris-wilson.co.uk
---
 include/linux/reservation.h | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/reservation.h b/include/linux/reservation.h
index ee750765cc94..644a22dbe53b 100644
--- a/include/linux/reservation.h
+++ b/include/linux/reservation.h
@@ -216,8 +216,12 @@ reservation_object_unlock(struct reservation_object *obj)
 {
 #ifdef CONFIG_DEBUG_MUTEXES
 	/* Test shared fence slot reservation */
-	if (obj->fence)
-		obj->fence->shared_max = obj->fence->shared_count;
+	if (rcu_access_pointer(obj->fence)) {
+		struct reservation_object_list *fence =
+			reservation_object_get_list(obj);
+
+		fence->shared_max = fence->shared_count;
+	}
 #endif
 	ww_mutex_unlock(&obj->lock);
 }
-- 
cgit v1.2.3


From ddde3c18b70061cc09b84a52624909349c212822 Mon Sep 17 00:00:00 2001
From: Daniel Vetter <daniel.vetter@ffwll.ch>
Date: Tue, 28 May 2019 11:02:35 +0200
Subject: vt: More locking checks

I honestly have no idea what the subtle differences between
con_is_visible, con_is_fg (internal to vt.c) and con_is_bound are. But
it looks like both vc->vc_display_fg and con_driver_map are protected
by the console_lock, so probably better if we hold that when checking
this.

To do that I had to deinline the con_is_visible function.

Signed-off-by: Daniel Vetter <daniel.vetter@intel.com>
Reviewed-by: Sam Ravnborg <sam@ravnborg.org>
Acked-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Reviewed-by: Maarten Lankhorst <maarten.lankhorst@linux.intel.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Nicolas Pitre <nicolas.pitre@linaro.org>
Cc: Martin Hostettler <textshell@uchuujin.de>
Cc: Adam Borowski <kilobyte@angband.pl>
Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
Cc: Mikulas Patocka <mpatocka@redhat.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20190528090304.9388-5-daniel.vetter@ffwll.ch
---
 drivers/tty/vt/vt.c            | 16 ++++++++++++++++
 include/linux/console_struct.h |  5 +----
 2 files changed, 17 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/tty/vt/vt.c b/drivers/tty/vt/vt.c
index 30d29a04bfee..ec92f36ab5c4 100644
--- a/drivers/tty/vt/vt.c
+++ b/drivers/tty/vt/vt.c
@@ -3822,6 +3822,8 @@ int con_is_bound(const struct consw *csw)
 {
 	int i, bound = 0;
 
+	WARN_CONSOLE_UNLOCKED();
+
 	for (i = 0; i < MAX_NR_CONSOLES; i++) {
 		if (con_driver_map[i] == csw) {
 			bound = 1;
@@ -3833,6 +3835,20 @@ int con_is_bound(const struct consw *csw)
 }
 EXPORT_SYMBOL(con_is_bound);
 
+/**
+ * con_is_visible - checks whether the current console is visible
+ * @vc: virtual console
+ *
+ * RETURNS: zero if not visible, nonzero if visible
+ */
+bool con_is_visible(const struct vc_data *vc)
+{
+	WARN_CONSOLE_UNLOCKED();
+
+	return *vc->vc_display_fg == vc;
+}
+EXPORT_SYMBOL(con_is_visible);
+
 /**
  * con_debug_enter - prepare the console for the kernel debugger
  * @sw: console driver
diff --git a/include/linux/console_struct.h b/include/linux/console_struct.h
index ed798e114663..24d4c16e3ae0 100644
--- a/include/linux/console_struct.h
+++ b/include/linux/console_struct.h
@@ -168,9 +168,6 @@ extern void vc_SAK(struct work_struct *work);
 
 #define CUR_DEFAULT CUR_UNDERLINE
 
-static inline bool con_is_visible(const struct vc_data *vc)
-{
-	return *vc->vc_display_fg == vc;
-}
+bool con_is_visible(const struct vc_data *vc);
 
 #endif /* _LINUX_CONSOLE_STRUCT_H */
-- 
cgit v1.2.3


From 97b67986f1451c772b488d597310f95c14547cce Mon Sep 17 00:00:00 2001
From: Daniel Vetter <daniel.vetter@ffwll.ch>
Date: Tue, 28 May 2019 11:02:41 +0200
Subject: fbcon: call fbcon_fb_(un)registered directly
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

With

commit 6104c37094e729f3d4ce65797002112735d49cd1
Author: Daniel Vetter <daniel.vetter@ffwll.ch>
Date:   Tue Aug 1 17:32:07 2017 +0200

    fbcon: Make fbcon a built-time depency for fbdev

we have a static dependency between fbcon and fbdev, and we can
replace the indirection through the notifier chain with a function
call.

v2: Sam Ravnborg noticed that mach-pxa/am200epd.c has a notifier too,
and listens to this.

...

Looking at the code it seems to wait for some fb to show up, so that
it can get the framebuffer base address from the fb_info struct. I
suspect his is some firmware fbdev. Then it uses that information to
let the real fbdev driver (metronomefb.c by the looks) get at the
framebuffer memory.

This doesn't look like it's easy to fix (except by deleting the
entire thing, seems untouched since 2008, we might be able to get away
with that), so let's just stuff a few #ifdef into fb.h and fbmem.c and
cry over them for a bit.

Signed-off-by: Daniel Vetter <daniel.vetter@intel.com>
Reviewed-by: Sam Ravnborg <sam@ravnborg.org>
Reviewed-by: Maarten Lankhorst <maarten.lankhorst@linux.intel.com>
Cc: Bartlomiej Zolnierkiewicz <b.zolnierkie@samsung.com>
Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
Cc: Hans de Goede <hdegoede@redhat.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: "Noralf Trønnes" <noralf@tronnes.org>
Cc: Yisheng Xie <ysxie@foxmail.com>
Cc: Peter Rosin <peda@axentia.se>
Cc: "Michał Mirosław" <mirq-linux@rere.qmqm.pl>
Cc: Thomas Zimmermann <tzimmermann@suse.de>
Cc: Mikulas Patocka <mpatocka@redhat.com>
Cc: linux-fbdev@vger.kernel.org
Cc: Daniel Mack <daniel@zonque.org>
Cc: Haojian Zhuang <haojian.zhuang@gmail.com>
Cc: Robert Jarzmik <robert.jarzmik@free.fr>
Cc: Konstantin Khorenko <khorenko@virtuozzo.com>
Cc: Prarit Bhargava <prarit@redhat.com>
Cc: Gerd Hoffmann <kraxel@redhat.com>
Cc: Steve Sakoman <sakoman@gmail.com>
Cc: Steve Sakoman <steve@sakoman.com>
Cc: linux-arm-kernel@lists.infradead.org
Link: https://patchwork.freedesktop.org/patch/msgid/20190528090304.9388-11-daniel.vetter@ffwll.ch
---
 arch/arm/mach-pxa/am200epd.c     | 13 +++++++++++--
 drivers/video/fbdev/core/fbcon.c | 14 +++-----------
 drivers/video/fbdev/core/fbmem.c | 24 +++++++++++++++++-------
 include/linux/fb.h               |  7 +++++--
 include/linux/fbcon.h            |  4 ++++
 5 files changed, 40 insertions(+), 22 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm/mach-pxa/am200epd.c b/arch/arm/mach-pxa/am200epd.c
index 50e18ed37fa6..cac0bb09db14 100644
--- a/arch/arm/mach-pxa/am200epd.c
+++ b/arch/arm/mach-pxa/am200epd.c
@@ -347,8 +347,17 @@ int __init am200_init(void)
 {
 	int ret;
 
-	/* before anything else, we request notification for any fb
-	 * creation events */
+	/*
+	 * Before anything else, we request notification for any fb
+	 * creation events.
+	 *
+	 * FIXME: This is terrible and needs to be nuked. The notifier is used
+	 * to get at the fb base address from the boot splash fb driver, which
+	 * is then passed to metronomefb. Instead of metronomefb or this board
+	 * support file here figuring this out on their own.
+	 *
+	 * See also the #ifdef in fbmem.c.
+	 */
 	fb_register_client(&am200_fb_notif);
 
 	pxa2xx_mfp_config(ARRAY_AND_SIZE(am200_pin_config));
diff --git a/drivers/video/fbdev/core/fbcon.c b/drivers/video/fbdev/core/fbcon.c
index 9c14eab77d99..c12fc98035e0 100644
--- a/drivers/video/fbdev/core/fbcon.c
+++ b/drivers/video/fbdev/core/fbcon.c
@@ -3119,14 +3119,14 @@ static int fbcon_fb_unbind(int idx)
 }
 
 /* called with console_lock held */
-static int fbcon_fb_unregistered(struct fb_info *info)
+void fbcon_fb_unregistered(struct fb_info *info)
 {
 	int i, idx;
 
 	WARN_CONSOLE_UNLOCKED();
 
 	if (deferred_takeover)
-		return 0;
+		return;
 
 	idx = info->node;
 	for (i = first_fb_vc; i <= last_fb_vc; i++) {
@@ -3155,8 +3155,6 @@ static int fbcon_fb_unregistered(struct fb_info *info)
 
 	if (!num_registered_fb)
 		do_unregister_con_driver(&fb_con);
-
-	return 0;
 }
 
 /* called with console_lock held */
@@ -3215,7 +3213,7 @@ static inline void fbcon_select_primary(struct fb_info *info)
 #endif /* CONFIG_FRAMEBUFFER_DETECT_PRIMARY */
 
 /* called with console_lock held */
-static int fbcon_fb_registered(struct fb_info *info)
+int fbcon_fb_registered(struct fb_info *info)
 {
 	int ret = 0, i, idx;
 
@@ -3359,12 +3357,6 @@ static int fbcon_event_notify(struct notifier_block *self,
 		idx = info->node;
 		ret = fbcon_fb_unbind(idx);
 		break;
-	case FB_EVENT_FB_REGISTERED:
-		ret = fbcon_fb_registered(info);
-		break;
-	case FB_EVENT_FB_UNREGISTERED:
-		ret = fbcon_fb_unregistered(info);
-		break;
 	case FB_EVENT_SET_CONSOLE_MAP:
 		/* called with console lock held */
 		con2fb = event->data;
diff --git a/drivers/video/fbdev/core/fbmem.c b/drivers/video/fbdev/core/fbmem.c
index 8ba674ffb3c9..bed7698ad18a 100644
--- a/drivers/video/fbdev/core/fbmem.c
+++ b/drivers/video/fbdev/core/fbmem.c
@@ -1660,7 +1660,6 @@ MODULE_PARM_DESC(lockless_register_fb,
 static int do_register_framebuffer(struct fb_info *fb_info)
 {
 	int i, ret;
-	struct fb_event event;
 	struct fb_videomode mode;
 
 	if (fb_check_foreignness(fb_info))
@@ -1723,7 +1722,14 @@ static int do_register_framebuffer(struct fb_info *fb_info)
 	fb_add_videomode(&mode, &fb_info->modelist);
 	registered_fb[i] = fb_info;
 
-	event.info = fb_info;
+#ifdef CONFIG_GUMSTIX_AM200EPD
+	{
+		struct fb_event event;
+		event.info = fb_info;
+		fb_notifier_call_chain(FB_EVENT_FB_REGISTERED, &event);
+	}
+#endif
+
 	if (!lockless_register_fb)
 		console_lock();
 	else
@@ -1732,9 +1738,8 @@ static int do_register_framebuffer(struct fb_info *fb_info)
 		ret = -ENODEV;
 		goto unlock_console;
 	}
-	ret = 0;
 
-	fb_notifier_call_chain(FB_EVENT_FB_REGISTERED, &event);
+	ret = fbcon_fb_registered(fb_info);
 	unlock_fb_info(fb_info);
 unlock_console:
 	if (!lockless_register_fb)
@@ -1771,7 +1776,6 @@ static int __unlink_framebuffer(struct fb_info *fb_info);
 
 static int do_unregister_framebuffer(struct fb_info *fb_info)
 {
-	struct fb_event event;
 	int ret;
 
 	ret = unbind_console(fb_info);
@@ -1789,9 +1793,15 @@ static int do_unregister_framebuffer(struct fb_info *fb_info)
 	registered_fb[fb_info->node] = NULL;
 	num_registered_fb--;
 	fb_cleanup_device(fb_info);
-	event.info = fb_info;
+#ifdef CONFIG_GUMSTIX_AM200EPD
+	{
+		struct fb_event event;
+		event.info = fb_info;
+		fb_notifier_call_chain(FB_EVENT_FB_UNREGISTERED, &event);
+	}
+#endif
 	console_lock();
-	fb_notifier_call_chain(FB_EVENT_FB_UNREGISTERED, &event);
+	fbcon_fb_unregistered(fb_info);
 	console_unlock();
 
 	/* this may free fb info */
diff --git a/include/linux/fb.h b/include/linux/fb.h
index f52ef0ad6781..288175fafaf6 100644
--- a/include/linux/fb.h
+++ b/include/linux/fb.h
@@ -136,10 +136,13 @@ struct fb_cursor_user {
 #define FB_EVENT_RESUME			0x03
 /*      An entry from the modelist was removed */
 #define FB_EVENT_MODE_DELETE            0x04
-/*      A driver registered itself */
+
+#ifdef CONFIG_GUMSTIX_AM200EPD
+/* only used by mach-pxa/am200epd.c */
 #define FB_EVENT_FB_REGISTERED          0x05
-/*      A driver unregistered itself */
 #define FB_EVENT_FB_UNREGISTERED        0x06
+#endif
+
 /*      CONSOLE-SPECIFIC: get console to framebuffer mapping */
 #define FB_EVENT_GET_CONSOLE_MAP        0x07
 /*      CONSOLE-SPECIFIC: set console to framebuffer mapping */
diff --git a/include/linux/fbcon.h b/include/linux/fbcon.h
index f68a7db14165..94a71e9e1257 100644
--- a/include/linux/fbcon.h
+++ b/include/linux/fbcon.h
@@ -4,9 +4,13 @@
 #ifdef CONFIG_FRAMEBUFFER_CONSOLE
 void __init fb_console_init(void);
 void __exit fb_console_exit(void);
+int fbcon_fb_registered(struct fb_info *info);
+void fbcon_fb_unregistered(struct fb_info *info);
 #else
 static inline void fb_console_init(void) {}
 static inline void fb_console_exit(void) {}
+static inline int fbcon_fb_registered(struct fb_info *info) { return 0; }
+static inline void fbcon_fb_unregistered(struct fb_info *info) {}
 #endif
 
 #endif /* _LINUX_FBCON_H */
-- 
cgit v1.2.3


From cf4a3ae4ef3399179166a464af1d6b172225bef4 Mon Sep 17 00:00:00 2001
From: Daniel Vetter <daniel.vetter@ffwll.ch>
Date: Tue, 28 May 2019 11:02:47 +0200
Subject: fbdev: lock_fb_info cannot fail
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Ever since

commit c47747fde931c02455683bd00ea43eaa62f35b0e
Author: Linus Torvalds <torvalds@linux-foundation.org>
Date:   Wed May 11 14:58:34 2011 -0700

    fbmem: make read/write/ioctl use the frame buffer at open time

fbdev has gained proper refcounting for the fbinfo attached to any
open files, which means that the backing driver (stored in
fb_info->fbops) cannot untimely disappear anymore.

The only thing that can happen is that the entire device just outright
disappears and gets unregistered, but file_fb_info does check for
that. Except that it's racy - it only checks once at the start of a
file_ops, there's no guarantee that the underlying fbdev won't
untimely disappear. Aside: A proper way to fix that race is probably
to replicate the srcu trickery we've rolled out in drm.

But given that this race has existed since forever it's probably not
one we need to fix right away. do_unregister_framebuffer also nowhere
clears fb_info->fbops, hence the check in lock_fb_info can't possibly
catch a disappearing fbdev later on.

Long story short: Ever since the above commit the fb_info->fbops
checks have essentially become dead code. Remove this all.

Aside from the file_ops callbacks, and stuff called from there
there's only register/unregister code left. If that goes wrong a driver
managed to register/unregister a device instance twice or in the wrong
order.  That's just a driver bug.

v2:
- fb_mmap had an open-coded version of the fbinfo->fops check, because
  it doesn't need the fbinfo->lock. Delete that too.
- Use the wrapper function in fb_open/release now, since no difference
  anymore.

Signed-off-by: Daniel Vetter <daniel.vetter@intel.com>
Reviewed-by: Sam Ravnborg <sam@ravnborg.org>
Reviewed-by: Maarten Lankhorst <maarten.lankhorst@linux.intel.com>
Cc: Bartlomiej Zolnierkiewicz <b.zolnierkie@samsung.com>
Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
Cc: Hans de Goede <hdegoede@redhat.com>
Cc: Yisheng Xie <ysxie@foxmail.com>
Cc: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
Cc: "Noralf Trønnes" <noralf@tronnes.org>
Cc: Peter Rosin <peda@axentia.se>
Cc: "Michał Mirosław" <mirq-linux@rere.qmqm.pl>
Cc: Mikulas Patocka <mpatocka@redhat.com>
Cc: "Gustavo A. R. Silva" <gustavo@embeddedor.com>
Cc: linux-fbdev@vger.kernel.org
Link: https://patchwork.freedesktop.org/patch/msgid/20190528090304.9388-17-daniel.vetter@ffwll.ch
---
 drivers/video/fbdev/core/fbcmap.c |  6 +---
 drivers/video/fbdev/core/fbcon.c  |  3 +-
 drivers/video/fbdev/core/fbmem.c  | 73 +++++++++------------------------------
 include/linux/fb.h                |  5 ++-
 4 files changed, 23 insertions(+), 64 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/video/fbdev/core/fbcmap.c b/drivers/video/fbdev/core/fbcmap.c
index 2811c4afde01..e5ae33c1a8e8 100644
--- a/drivers/video/fbdev/core/fbcmap.c
+++ b/drivers/video/fbdev/core/fbcmap.c
@@ -285,11 +285,7 @@ int fb_set_user_cmap(struct fb_cmap_user *cmap, struct fb_info *info)
 		goto out;
 	}
 	umap.start = cmap->start;
-	if (!lock_fb_info(info)) {
-		rc = -ENODEV;
-		goto out;
-	}
-
+	lock_fb_info(info);
 	rc = fb_set_cmap(&umap, info);
 	unlock_fb_info(info);
 out:
diff --git a/drivers/video/fbdev/core/fbcon.c b/drivers/video/fbdev/core/fbcon.c
index c12fc98035e0..f7f3eb0f1893 100644
--- a/drivers/video/fbdev/core/fbcon.c
+++ b/drivers/video/fbdev/core/fbcon.c
@@ -2364,8 +2364,7 @@ static void fbcon_generic_blank(struct vc_data *vc, struct fb_info *info,
 	}
 
 
-	if (!lock_fb_info(info))
-		return;
+	lock_fb_info(info);
 	event.info = info;
 	event.data = &blank;
 	fb_notifier_call_chain(FB_EVENT_CONBLANK, &event);
diff --git a/drivers/video/fbdev/core/fbmem.c b/drivers/video/fbdev/core/fbmem.c
index bed7698ad18a..d73762324ca2 100644
--- a/drivers/video/fbdev/core/fbmem.c
+++ b/drivers/video/fbdev/core/fbmem.c
@@ -80,17 +80,6 @@ static void put_fb_info(struct fb_info *fb_info)
 		fb_info->fbops->fb_destroy(fb_info);
 }
 
-int lock_fb_info(struct fb_info *info)
-{
-	mutex_lock(&info->lock);
-	if (!info->fbops) {
-		mutex_unlock(&info->lock);
-		return 0;
-	}
-	return 1;
-}
-EXPORT_SYMBOL(lock_fb_info);
-
 /*
  * Helpers
  */
@@ -1121,8 +1110,7 @@ static long do_fb_ioctl(struct fb_info *info, unsigned int cmd,
 
 	switch (cmd) {
 	case FBIOGET_VSCREENINFO:
-		if (!lock_fb_info(info))
-			return -ENODEV;
+		lock_fb_info(info);
 		var = info->var;
 		unlock_fb_info(info);
 
@@ -1132,10 +1120,7 @@ static long do_fb_ioctl(struct fb_info *info, unsigned int cmd,
 		if (copy_from_user(&var, argp, sizeof(var)))
 			return -EFAULT;
 		console_lock();
-		if (!lock_fb_info(info)) {
-			console_unlock();
-			return -ENODEV;
-		}
+		lock_fb_info(info);
 		info->flags |= FBINFO_MISC_USEREVENT;
 		ret = fb_set_var(info, &var);
 		info->flags &= ~FBINFO_MISC_USEREVENT;
@@ -1145,8 +1130,7 @@ static long do_fb_ioctl(struct fb_info *info, unsigned int cmd,
 			ret = -EFAULT;
 		break;
 	case FBIOGET_FSCREENINFO:
-		if (!lock_fb_info(info))
-			return -ENODEV;
+		lock_fb_info(info);
 		fix = info->fix;
 		if (info->flags & FBINFO_HIDE_SMEM_START)
 			fix.smem_start = 0;
@@ -1162,8 +1146,7 @@ static long do_fb_ioctl(struct fb_info *info, unsigned int cmd,
 	case FBIOGETCMAP:
 		if (copy_from_user(&cmap, argp, sizeof(cmap)))
 			return -EFAULT;
-		if (!lock_fb_info(info))
-			return -ENODEV;
+		lock_fb_info(info);
 		cmap_from = info->cmap;
 		unlock_fb_info(info);
 		ret = fb_cmap_to_user(&cmap_from, &cmap);
@@ -1172,10 +1155,7 @@ static long do_fb_ioctl(struct fb_info *info, unsigned int cmd,
 		if (copy_from_user(&var, argp, sizeof(var)))
 			return -EFAULT;
 		console_lock();
-		if (!lock_fb_info(info)) {
-			console_unlock();
-			return -ENODEV;
-		}
+		lock_fb_info(info);
 		ret = fb_pan_display(info, &var);
 		unlock_fb_info(info);
 		console_unlock();
@@ -1192,8 +1172,7 @@ static long do_fb_ioctl(struct fb_info *info, unsigned int cmd,
 			return -EINVAL;
 		con2fb.framebuffer = -1;
 		event.data = &con2fb;
-		if (!lock_fb_info(info))
-			return -ENODEV;
+		lock_fb_info(info);
 		event.info = info;
 		fb_notifier_call_chain(FB_EVENT_GET_CONSOLE_MAP, &event);
 		unlock_fb_info(info);
@@ -1214,10 +1193,7 @@ static long do_fb_ioctl(struct fb_info *info, unsigned int cmd,
 		}
 		event.data = &con2fb;
 		console_lock();
-		if (!lock_fb_info(info)) {
-			console_unlock();
-			return -ENODEV;
-		}
+		lock_fb_info(info);
 		event.info = info;
 		ret = fb_notifier_call_chain(FB_EVENT_SET_CONSOLE_MAP, &event);
 		unlock_fb_info(info);
@@ -1225,10 +1201,7 @@ static long do_fb_ioctl(struct fb_info *info, unsigned int cmd,
 		break;
 	case FBIOBLANK:
 		console_lock();
-		if (!lock_fb_info(info)) {
-			console_unlock();
-			return -ENODEV;
-		}
+		lock_fb_info(info);
 		info->flags |= FBINFO_MISC_USEREVENT;
 		ret = fb_blank(info, arg);
 		info->flags &= ~FBINFO_MISC_USEREVENT;
@@ -1236,8 +1209,7 @@ static long do_fb_ioctl(struct fb_info *info, unsigned int cmd,
 		console_unlock();
 		break;
 	default:
-		if (!lock_fb_info(info))
-			return -ENODEV;
+		lock_fb_info(info);
 		fb = info->fbops;
 		if (fb->fb_ioctl)
 			ret = fb->fb_ioctl(info, cmd, arg);
@@ -1357,8 +1329,7 @@ static int fb_get_fscreeninfo(struct fb_info *info, unsigned int cmd,
 {
 	struct fb_fix_screeninfo fix;
 
-	if (!lock_fb_info(info))
-		return -ENODEV;
+	lock_fb_info(info);
 	fix = info->fix;
 	if (info->flags & FBINFO_HIDE_SMEM_START)
 		fix.smem_start = 0;
@@ -1418,8 +1389,6 @@ fb_mmap(struct file *file, struct vm_area_struct * vma)
 	if (!info)
 		return -ENODEV;
 	fb = info->fbops;
-	if (!fb)
-		return -ENODEV;
 	mutex_lock(&info->mm_lock);
 	if (fb->fb_mmap) {
 		int res;
@@ -1483,7 +1452,7 @@ __releases(&info->lock)
 	if (IS_ERR(info))
 		return PTR_ERR(info);
 
-	mutex_lock(&info->lock);
+	lock_fb_info(info);
 	if (!try_module_get(info->fbops->owner)) {
 		res = -ENODEV;
 		goto out;
@@ -1499,7 +1468,7 @@ __releases(&info->lock)
 		fb_deferred_io_open(info, inode, file);
 #endif
 out:
-	mutex_unlock(&info->lock);
+	unlock_fb_info(info);
 	if (res)
 		put_fb_info(info);
 	return res;
@@ -1512,11 +1481,11 @@ __releases(&info->lock)
 {
 	struct fb_info * const info = file->private_data;
 
-	mutex_lock(&info->lock);
+	lock_fb_info(info);
 	if (info->fbops->fb_release)
 		info->fbops->fb_release(info,1);
 	module_put(info->fbops->owner);
-	mutex_unlock(&info->lock);
+	unlock_fb_info(info);
 	put_fb_info(info);
 	return 0;
 }
@@ -1734,14 +1703,10 @@ static int do_register_framebuffer(struct fb_info *fb_info)
 		console_lock();
 	else
 		atomic_inc(&ignore_console_lock_warning);
-	if (!lock_fb_info(fb_info)) {
-		ret = -ENODEV;
-		goto unlock_console;
-	}
-
+	lock_fb_info(fb_info);
 	ret = fbcon_fb_registered(fb_info);
 	unlock_fb_info(fb_info);
-unlock_console:
+
 	if (!lockless_register_fb)
 		console_unlock();
 	else
@@ -1759,11 +1724,7 @@ static int unbind_console(struct fb_info *fb_info)
 		return -EINVAL;
 
 	console_lock();
-	if (!lock_fb_info(fb_info)) {
-		console_unlock();
-		return -ENODEV;
-	}
-
+	lock_fb_info(fb_info);
 	event.info = fb_info;
 	ret = fb_notifier_call_chain(FB_EVENT_FB_UNBIND, &event);
 	unlock_fb_info(fb_info);
diff --git a/include/linux/fb.h b/include/linux/fb.h
index 288175fafaf6..aa8f18163151 100644
--- a/include/linux/fb.h
+++ b/include/linux/fb.h
@@ -663,7 +663,10 @@ extern struct class *fb_class;
 	for (i = 0; i < FB_MAX; i++)		\
 		if (!registered_fb[i]) {} else
 
-extern int lock_fb_info(struct fb_info *info);
+static inline void lock_fb_info(struct fb_info *info)
+{
+	mutex_lock(&info->lock);
+}
 
 static inline void unlock_fb_info(struct fb_info *info)
 {
-- 
cgit v1.2.3


From 0e0f3250d4402d60f4571d076ab27d5af049853e Mon Sep 17 00:00:00 2001
From: Daniel Vetter <daniel.vetter@ffwll.ch>
Date: Tue, 28 May 2019 11:02:48 +0200
Subject: fbcon: call fbcon_fb_unbind directly
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Also remove the error return value. That's all errors for either
driver bugs (trying to unbind something that isn't bound), or errors
of the new driver that will take over.

There's nothing the outgoing driver can do about this anyway, so
switch over to void.

Signed-off-by: Daniel Vetter <daniel.vetter@intel.com>
Reviewed-by: Sam Ravnborg <sam@ravnborg.org>
Reviewed-by: Maarten Lankhorst <maarten.lankhorst@linux.intel.com>
Cc: Bartlomiej Zolnierkiewicz <b.zolnierkie@samsung.com>
Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
Cc: Hans de Goede <hdegoede@redhat.com>
Cc: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
Cc: Peter Rosin <peda@axentia.se>
Cc: Kees Cook <keescook@chromium.org>
Cc: Konstantin Khorenko <khorenko@virtuozzo.com>
Cc: Yisheng Xie <ysxie@foxmail.com>
Cc: "Michał Mirosław" <mirq-linux@rere.qmqm.pl>
Cc: Mikulas Patocka <mpatocka@redhat.com>
Cc: Thomas Zimmermann <tzimmermann@suse.de>
Cc: linux-fbdev@vger.kernel.org
Link: https://patchwork.freedesktop.org/patch/msgid/20190528090304.9388-18-daniel.vetter@ffwll.ch
---
 drivers/video/fbdev/core/fbcon.c | 24 +++++++-----------------
 drivers/video/fbdev/core/fbmem.c |  7 ++-----
 include/linux/fb.h               |  2 --
 include/linux/fbcon.h            |  2 ++
 4 files changed, 11 insertions(+), 24 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/video/fbdev/core/fbcon.c b/drivers/video/fbdev/core/fbcon.c
index f7f3eb0f1893..d1e37afa6f80 100644
--- a/drivers/video/fbdev/core/fbcon.c
+++ b/drivers/video/fbdev/core/fbcon.c
@@ -3046,7 +3046,7 @@ static int fbcon_mode_deleted(struct fb_info *info,
 }
 
 #ifdef CONFIG_VT_HW_CONSOLE_BINDING
-static int fbcon_unbind(void)
+static void fbcon_unbind(void)
 {
 	int ret;
 
@@ -3055,25 +3055,21 @@ static int fbcon_unbind(void)
 
 	if (!ret)
 		fbcon_has_console_bind = 0;
-
-	return ret;
 }
 #else
-static inline int fbcon_unbind(void)
-{
-	return -EINVAL;
-}
+static inline void fbcon_unbind(void) {}
 #endif /* CONFIG_VT_HW_CONSOLE_BINDING */
 
 /* called with console_lock held */
-static int fbcon_fb_unbind(int idx)
+void fbcon_fb_unbind(struct fb_info *info)
 {
 	int i, new_idx = -1, ret = 0;
+	int idx = info->node;
 
 	WARN_CONSOLE_UNLOCKED();
 
 	if (!fbcon_has_console_bind)
-		return 0;
+		return;
 
 	for (i = first_fb_vc; i <= last_fb_vc; i++) {
 		if (con2fb_map[i] != idx &&
@@ -3106,15 +3102,13 @@ static int fbcon_fb_unbind(int idx)
 								     idx, 0);
 					if (ret) {
 						con2fb_map[i] = idx;
-						return ret;
+						return;
 					}
 				}
 			}
 		}
-		ret = fbcon_unbind();
+		fbcon_unbind();
 	}
-
-	return ret;
 }
 
 /* called with console_lock held */
@@ -3352,10 +3346,6 @@ static int fbcon_event_notify(struct notifier_block *self,
 		mode = event->data;
 		ret = fbcon_mode_deleted(info, mode);
 		break;
-	case FB_EVENT_FB_UNBIND:
-		idx = info->node;
-		ret = fbcon_fb_unbind(idx);
-		break;
 	case FB_EVENT_SET_CONSOLE_MAP:
 		/* called with console lock held */
 		con2fb = event->data;
diff --git a/drivers/video/fbdev/core/fbmem.c b/drivers/video/fbdev/core/fbmem.c
index d73762324ca2..f3fc2e5b193c 100644
--- a/drivers/video/fbdev/core/fbmem.c
+++ b/drivers/video/fbdev/core/fbmem.c
@@ -1716,8 +1716,6 @@ static int do_register_framebuffer(struct fb_info *fb_info)
 
 static int unbind_console(struct fb_info *fb_info)
 {
-	struct fb_event event;
-	int ret;
 	int i = fb_info->node;
 
 	if (i < 0 || i >= FB_MAX || registered_fb[i] != fb_info)
@@ -1725,12 +1723,11 @@ static int unbind_console(struct fb_info *fb_info)
 
 	console_lock();
 	lock_fb_info(fb_info);
-	event.info = fb_info;
-	ret = fb_notifier_call_chain(FB_EVENT_FB_UNBIND, &event);
+	fbcon_fb_unbind(fb_info);
 	unlock_fb_info(fb_info);
 	console_unlock();
 
-	return ret;
+	return 0;
 }
 
 static int __unlink_framebuffer(struct fb_info *fb_info);
diff --git a/include/linux/fb.h b/include/linux/fb.h
index aa8f18163151..b6ce041d9e13 100644
--- a/include/linux/fb.h
+++ b/include/linux/fb.h
@@ -158,8 +158,6 @@ struct fb_cursor_user {
 #define FB_EVENT_CONBLANK               0x0C
 /*      Get drawing requirements        */
 #define FB_EVENT_GET_REQ                0x0D
-/*      Unbind from the console if possible */
-#define FB_EVENT_FB_UNBIND              0x0E
 /*      CONSOLE-SPECIFIC: remap all consoles to new fb - for vga_switcheroo */
 #define FB_EVENT_REMAP_ALL_CONSOLE      0x0F
 /*      A hardware display blank early change occurred */
diff --git a/include/linux/fbcon.h b/include/linux/fbcon.h
index 94a71e9e1257..38d44fdb6d14 100644
--- a/include/linux/fbcon.h
+++ b/include/linux/fbcon.h
@@ -6,11 +6,13 @@ void __init fb_console_init(void);
 void __exit fb_console_exit(void);
 int fbcon_fb_registered(struct fb_info *info);
 void fbcon_fb_unregistered(struct fb_info *info);
+void fbcon_fb_unbind(struct fb_info *info);
 #else
 static inline void fb_console_init(void) {}
 static inline void fb_console_exit(void) {}
 static inline int fbcon_fb_registered(struct fb_info *info) { return 0; }
 static inline void fbcon_fb_unregistered(struct fb_info *info) {}
+static inline void fbcon_fb_unbind(struct fb_info *info) {}
 #endif
 
 #endif /* _LINUX_FBCON_H */
-- 
cgit v1.2.3


From deb00d2785bedd379caa7aaf18c1ffb824580b9d Mon Sep 17 00:00:00 2001
From: Daniel Vetter <daniel.vetter@ffwll.ch>
Date: Tue, 28 May 2019 11:02:49 +0200
Subject: fbdev: make unregister/unlink functions not fail
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Except for driver bugs (which we'll catch with a WARN_ON) this is only
to report failures of the new driver taking over the console. There's
nothing the outgoing driver can do about that, and no one ever
bothered to actually look at these return values. So remove them all.

v2: fixup unregister_framebuffer in savagefb, fbtft, ivtvfb, and neofb
drivers, reported by kbuild.

Signed-off-by: Daniel Vetter <daniel.vetter@intel.com>
Reviewed-by: Sam Ravnborg <sam@ravnborg.org>
Reviewed-by: Maarten Lankhorst <maarten.lankhorst@linux.intel.com>
Cc: Bartlomiej Zolnierkiewicz <b.zolnierkie@samsung.com>
Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
Cc: "Michał Mirosław" <mirq-linux@rere.qmqm.pl>
Cc: Peter Rosin <peda@axentia.se>
Cc: Hans de Goede <hdegoede@redhat.com>
Cc: Mikulas Patocka <mpatocka@redhat.com>
Cc: linux-fbdev@vger.kernel.org
Link: https://patchwork.freedesktop.org/patch/msgid/20190528090304.9388-19-daniel.vetter@ffwll.ch
---
 drivers/media/pci/ivtv/ivtvfb.c              |  6 +--
 drivers/staging/fbtft/fbtft-core.c           |  4 +-
 drivers/video/fbdev/core/fbmem.c             | 73 +++++++++-------------------
 drivers/video/fbdev/neofb.c                  |  9 +---
 drivers/video/fbdev/savage/savagefb_driver.c |  9 +---
 include/linux/fb.h                           |  4 +-
 6 files changed, 31 insertions(+), 74 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/media/pci/ivtv/ivtvfb.c b/drivers/media/pci/ivtv/ivtvfb.c
index 66be490ec563..299ff032f528 100644
--- a/drivers/media/pci/ivtv/ivtvfb.c
+++ b/drivers/media/pci/ivtv/ivtvfb.c
@@ -1246,11 +1246,7 @@ static int ivtvfb_callback_cleanup(struct device *dev, void *p)
 	struct osd_info *oi = itv->osd_info;
 
 	if (itv->v4l2_cap & V4L2_CAP_VIDEO_OUTPUT) {
-		if (unregister_framebuffer(&itv->osd_info->ivtvfb_info)) {
-			IVTVFB_WARN("Framebuffer %d is in use, cannot unload\n",
-				       itv->instance);
-			return 0;
-		}
+		unregister_framebuffer(&itv->osd_info->ivtvfb_info);
 		IVTVFB_INFO("Unregister framebuffer %d\n", itv->instance);
 		itv->ivtvfb_restore = NULL;
 		ivtvfb_blank(FB_BLANK_VSYNC_SUSPEND, &oi->ivtvfb_info);
diff --git a/drivers/staging/fbtft/fbtft-core.c b/drivers/staging/fbtft/fbtft-core.c
index 9b07badf4c6c..7cbc1bdd2d8a 100644
--- a/drivers/staging/fbtft/fbtft-core.c
+++ b/drivers/staging/fbtft/fbtft-core.c
@@ -891,7 +891,9 @@ int fbtft_unregister_framebuffer(struct fb_info *fb_info)
 	if (par->fbtftops.unregister_backlight)
 		par->fbtftops.unregister_backlight(par);
 	fbtft_sysfs_exit(par);
-	return unregister_framebuffer(fb_info);
+	unregister_framebuffer(fb_info);
+
+	return 0;
 }
 EXPORT_SYMBOL(fbtft_unregister_framebuffer);
 
diff --git a/drivers/video/fbdev/core/fbmem.c b/drivers/video/fbdev/core/fbmem.c
index f3fc2e5b193c..f3bcad30d3ba 100644
--- a/drivers/video/fbdev/core/fbmem.c
+++ b/drivers/video/fbdev/core/fbmem.c
@@ -1590,13 +1590,13 @@ static bool fb_do_apertures_overlap(struct apertures_struct *gena,
 	return false;
 }
 
-static int do_unregister_framebuffer(struct fb_info *fb_info);
+static void do_unregister_framebuffer(struct fb_info *fb_info);
 
 #define VGA_FB_PHYS 0xA0000
-static int do_remove_conflicting_framebuffers(struct apertures_struct *a,
-					      const char *name, bool primary)
+static void do_remove_conflicting_framebuffers(struct apertures_struct *a,
+					       const char *name, bool primary)
 {
-	int i, ret;
+	int i;
 
 	/* check all firmware fbs and kick off if the base addr overlaps */
 	for_each_registered_fb(i) {
@@ -1612,13 +1612,9 @@ static int do_remove_conflicting_framebuffers(struct apertures_struct *a,
 
 			printk(KERN_INFO "fb%d: switching to %s from %s\n",
 			       i, name, registered_fb[i]->fix.id);
-			ret = do_unregister_framebuffer(registered_fb[i]);
-			if (ret)
-				return ret;
+			do_unregister_framebuffer(registered_fb[i]);
 		}
 	}
-
-	return 0;
 }
 
 static bool lockless_register_fb;
@@ -1634,11 +1630,9 @@ static int do_register_framebuffer(struct fb_info *fb_info)
 	if (fb_check_foreignness(fb_info))
 		return -ENOSYS;
 
-	ret = do_remove_conflicting_framebuffers(fb_info->apertures,
-						 fb_info->fix.id,
-						 fb_is_primary_device(fb_info));
-	if (ret)
-		return ret;
+	do_remove_conflicting_framebuffers(fb_info->apertures,
+					   fb_info->fix.id,
+					   fb_is_primary_device(fb_info));
 
 	if (num_registered_fb == FB_MAX)
 		return -ENXIO;
@@ -1714,32 +1708,25 @@ static int do_register_framebuffer(struct fb_info *fb_info)
 	return ret;
 }
 
-static int unbind_console(struct fb_info *fb_info)
+static void unbind_console(struct fb_info *fb_info)
 {
 	int i = fb_info->node;
 
-	if (i < 0 || i >= FB_MAX || registered_fb[i] != fb_info)
-		return -EINVAL;
+	if (WARN_ON(i < 0 || i >= FB_MAX || registered_fb[i] != fb_info))
+		return;
 
 	console_lock();
 	lock_fb_info(fb_info);
 	fbcon_fb_unbind(fb_info);
 	unlock_fb_info(fb_info);
 	console_unlock();
-
-	return 0;
 }
 
-static int __unlink_framebuffer(struct fb_info *fb_info);
+static void __unlink_framebuffer(struct fb_info *fb_info);
 
-static int do_unregister_framebuffer(struct fb_info *fb_info)
+static void do_unregister_framebuffer(struct fb_info *fb_info)
 {
-	int ret;
-
-	ret = unbind_console(fb_info);
-
-	if (ret)
-		return -EINVAL;
+	unbind_console(fb_info);
 
 	pm_vt_switch_unregister(fb_info->dev);
 
@@ -1764,36 +1751,27 @@ static int do_unregister_framebuffer(struct fb_info *fb_info)
 
 	/* this may free fb info */
 	put_fb_info(fb_info);
-	return 0;
 }
 
-static int __unlink_framebuffer(struct fb_info *fb_info)
+static void __unlink_framebuffer(struct fb_info *fb_info)
 {
 	int i;
 
 	i = fb_info->node;
-	if (i < 0 || i >= FB_MAX || registered_fb[i] != fb_info)
-		return -EINVAL;
+	if (WARN_ON(i < 0 || i >= FB_MAX || registered_fb[i] != fb_info))
+		return;
 
 	if (fb_info->dev) {
 		device_destroy(fb_class, MKDEV(FB_MAJOR, i));
 		fb_info->dev = NULL;
 	}
-
-	return 0;
 }
 
-int unlink_framebuffer(struct fb_info *fb_info)
+void unlink_framebuffer(struct fb_info *fb_info)
 {
-	int ret;
-
-	ret = __unlink_framebuffer(fb_info);
-	if (ret)
-		return ret;
+	__unlink_framebuffer(fb_info);
 
 	unbind_console(fb_info);
-
-	return 0;
 }
 EXPORT_SYMBOL(unlink_framebuffer);
 
@@ -1810,7 +1788,6 @@ EXPORT_SYMBOL(unlink_framebuffer);
 int remove_conflicting_framebuffers(struct apertures_struct *a,
 				    const char *name, bool primary)
 {
-	int ret;
 	bool do_free = false;
 
 	if (!a) {
@@ -1824,13 +1801,13 @@ int remove_conflicting_framebuffers(struct apertures_struct *a,
 	}
 
 	mutex_lock(&registration_lock);
-	ret = do_remove_conflicting_framebuffers(a, name, primary);
+	do_remove_conflicting_framebuffers(a, name, primary);
 	mutex_unlock(&registration_lock);
 
 	if (do_free)
 		kfree(a);
 
-	return ret;
+	return 0;
 }
 EXPORT_SYMBOL(remove_conflicting_framebuffers);
 
@@ -1927,16 +1904,12 @@ EXPORT_SYMBOL(register_framebuffer);
  *      that the driver implements fb_open() and fb_release() to
  *      check that no processes are using the device.
  */
-int
+void
 unregister_framebuffer(struct fb_info *fb_info)
 {
-	int ret;
-
 	mutex_lock(&registration_lock);
-	ret = do_unregister_framebuffer(fb_info);
+	do_unregister_framebuffer(fb_info);
 	mutex_unlock(&registration_lock);
-
-	return ret;
 }
 EXPORT_SYMBOL(unregister_framebuffer);
 
diff --git a/drivers/video/fbdev/neofb.c b/drivers/video/fbdev/neofb.c
index 5d3a444083f7..b770946a0920 100644
--- a/drivers/video/fbdev/neofb.c
+++ b/drivers/video/fbdev/neofb.c
@@ -2122,14 +2122,7 @@ static void neofb_remove(struct pci_dev *dev)
 	DBG("neofb_remove");
 
 	if (info) {
-		/*
-		 * If unregister_framebuffer fails, then
-		 * we will be leaving hooks that could cause
-		 * oopsen laying around.
-		 */
-		if (unregister_framebuffer(info))
-			printk(KERN_WARNING
-			       "neofb: danger danger!  Oopsen imminent!\n");
+		unregister_framebuffer(info);
 
 		neo_unmap_video(info);
 		fb_destroy_modedb(info->monspecs.modedb);
diff --git a/drivers/video/fbdev/savage/savagefb_driver.c b/drivers/video/fbdev/savage/savagefb_driver.c
index 47b78f0138c3..512789f5f884 100644
--- a/drivers/video/fbdev/savage/savagefb_driver.c
+++ b/drivers/video/fbdev/savage/savagefb_driver.c
@@ -2333,14 +2333,7 @@ static void savagefb_remove(struct pci_dev *dev)
 	DBG("savagefb_remove");
 
 	if (info) {
-		/*
-		 * If unregister_framebuffer fails, then
-		 * we will be leaving hooks that could cause
-		 * oopsen laying around.
-		 */
-		if (unregister_framebuffer(info))
-			printk(KERN_WARNING "savagefb: danger danger! "
-			       "Oopsen imminent!\n");
+		unregister_framebuffer(info);
 
 #ifdef CONFIG_FB_SAVAGE_I2C
 		savagefb_delete_i2c_busses(info);
diff --git a/include/linux/fb.h b/include/linux/fb.h
index b6ce041d9e13..b90cf7d56bd8 100644
--- a/include/linux/fb.h
+++ b/include/linux/fb.h
@@ -634,8 +634,8 @@ extern ssize_t fb_sys_write(struct fb_info *info, const char __user *buf,
 
 /* drivers/video/fbmem.c */
 extern int register_framebuffer(struct fb_info *fb_info);
-extern int unregister_framebuffer(struct fb_info *fb_info);
-extern int unlink_framebuffer(struct fb_info *fb_info);
+extern void unregister_framebuffer(struct fb_info *fb_info);
+extern void unlink_framebuffer(struct fb_info *fb_info);
 extern int remove_conflicting_pci_framebuffers(struct pci_dev *pdev, int res_id,
 					       const char *name);
 extern int remove_conflicting_framebuffers(struct apertures_struct *a,
-- 
cgit v1.2.3


From 50c5056356340c8b5be90440d2f32fec8c47a7c3 Mon Sep 17 00:00:00 2001
From: Daniel Vetter <daniel.vetter@ffwll.ch>
Date: Tue, 28 May 2019 11:02:52 +0200
Subject: fbdev: directly call fbcon_suspended/resumed
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

With the sh_mobile notifier removed we can just directly call the
fbcon code here.

v2: Remove now unused local variable.

v3: fixup !CONFIG_FRAMEBUFFER_CONSOLE, noticed by kbuild

Signed-off-by: Daniel Vetter <daniel.vetter@intel.com>
Reviewed-by: Sam Ravnborg <sam@ravnborg.org>
Reviewed-by: Maarten Lankhorst <maarten.lankhorst@linux.intel.com>
Cc: Bartlomiej Zolnierkiewicz <b.zolnierkie@samsung.com>
Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
Cc: Hans de Goede <hdegoede@redhat.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Prarit Bhargava <prarit@redhat.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Konstantin Khorenko <khorenko@virtuozzo.com>
Cc: Yisheng Xie <ysxie@foxmail.com>
Cc: "Michał Mirosław" <mirq-linux@rere.qmqm.pl>
Cc: Peter Rosin <peda@axentia.se>
Cc: Mikulas Patocka <mpatocka@redhat.com>
Cc: linux-fbdev@vger.kernel.org
Link: https://patchwork.freedesktop.org/patch/msgid/20190528090304.9388-22-daniel.vetter@ffwll.ch
---
 drivers/video/fbdev/core/fbcon.c | 10 ++--------
 drivers/video/fbdev/core/fbmem.c |  7 ++-----
 include/linux/fb.h               |  8 --------
 include/linux/fbcon.h            |  4 ++++
 4 files changed, 8 insertions(+), 21 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/video/fbdev/core/fbcon.c b/drivers/video/fbdev/core/fbcon.c
index d1e37afa6f80..9994111f2563 100644
--- a/drivers/video/fbdev/core/fbcon.c
+++ b/drivers/video/fbdev/core/fbcon.c
@@ -2919,7 +2919,7 @@ static int fbcon_set_origin(struct vc_data *vc)
 	return 0;
 }
 
-static void fbcon_suspended(struct fb_info *info)
+void fbcon_suspended(struct fb_info *info)
 {
 	struct vc_data *vc = NULL;
 	struct fbcon_ops *ops = info->fbcon_par;
@@ -2932,7 +2932,7 @@ static void fbcon_suspended(struct fb_info *info)
 	fbcon_cursor(vc, CM_ERASE);
 }
 
-static void fbcon_resumed(struct fb_info *info)
+void fbcon_resumed(struct fb_info *info)
 {
 	struct vc_data *vc;
 	struct fbcon_ops *ops = info->fbcon_par;
@@ -3330,12 +3330,6 @@ static int fbcon_event_notify(struct notifier_block *self,
 	int idx, ret = 0;
 
 	switch(action) {
-	case FB_EVENT_SUSPEND:
-		fbcon_suspended(info);
-		break;
-	case FB_EVENT_RESUME:
-		fbcon_resumed(info);
-		break;
 	case FB_EVENT_MODE_CHANGE:
 		fbcon_modechanged(info);
 		break;
diff --git a/drivers/video/fbdev/core/fbmem.c b/drivers/video/fbdev/core/fbmem.c
index bee45e9405b8..73269dedcd45 100644
--- a/drivers/video/fbdev/core/fbmem.c
+++ b/drivers/video/fbdev/core/fbmem.c
@@ -1917,17 +1917,14 @@ EXPORT_SYMBOL(unregister_framebuffer);
  */
 void fb_set_suspend(struct fb_info *info, int state)
 {
-	struct fb_event event;
-
 	WARN_CONSOLE_UNLOCKED();
 
-	event.info = info;
 	if (state) {
-		fb_notifier_call_chain(FB_EVENT_SUSPEND, &event);
+		fbcon_suspended(info);
 		info->state = FBINFO_STATE_SUSPENDED;
 	} else {
 		info->state = FBINFO_STATE_RUNNING;
-		fb_notifier_call_chain(FB_EVENT_RESUME, &event);
+		fbcon_resumed(info);
 	}
 }
 EXPORT_SYMBOL(fb_set_suspend);
diff --git a/include/linux/fb.h b/include/linux/fb.h
index b90cf7d56bd8..794b386415b7 100644
--- a/include/linux/fb.h
+++ b/include/linux/fb.h
@@ -126,14 +126,6 @@ struct fb_cursor_user {
 
 /*	The resolution of the passed in fb_info about to change */ 
 #define FB_EVENT_MODE_CHANGE		0x01
-/*	The display on this fb_info is being suspended, no access to the
- *	framebuffer is allowed any more after that call returns
- */
-#define FB_EVENT_SUSPEND		0x02
-/*	The display on this fb_info was resumed, you can restore the display
- *	if you own it
- */
-#define FB_EVENT_RESUME			0x03
 /*      An entry from the modelist was removed */
 #define FB_EVENT_MODE_DELETE            0x04
 
diff --git a/include/linux/fbcon.h b/include/linux/fbcon.h
index 38d44fdb6d14..790c42ec7b5d 100644
--- a/include/linux/fbcon.h
+++ b/include/linux/fbcon.h
@@ -7,12 +7,16 @@ void __exit fb_console_exit(void);
 int fbcon_fb_registered(struct fb_info *info);
 void fbcon_fb_unregistered(struct fb_info *info);
 void fbcon_fb_unbind(struct fb_info *info);
+void fbcon_suspended(struct fb_info *info);
+void fbcon_resumed(struct fb_info *info);
 #else
 static inline void fb_console_init(void) {}
 static inline void fb_console_exit(void) {}
 static inline int fbcon_fb_registered(struct fb_info *info) { return 0; }
 static inline void fbcon_fb_unregistered(struct fb_info *info) {}
 static inline void fbcon_fb_unbind(struct fb_info *info) {}
+static inline void fbcon_suspended(struct fb_info *info) {}
+static inline void fbcon_resumed(struct fb_info *info) {}
 #endif
 
 #endif /* _LINUX_FBCON_H */
-- 
cgit v1.2.3


From 13ff178ccd6d3b8074c542a911300b79c4eec255 Mon Sep 17 00:00:00 2001
From: Daniel Vetter <daniel.vetter@ffwll.ch>
Date: Tue, 28 May 2019 11:02:53 +0200
Subject: fbcon: Call fbcon_mode_deleted/new_modelist directly
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

I'm not entirely clear on what new_modelist actually does, it seems
exclusively for a sysfs interface. Which in the end does amount to a
normal fb_set_par to check the mode, but then takes a different path
in both fbmem.c and fbcon.c.

I have no idea why these 2 paths are different, but then I also don't
really want to find out. So just do the simple conversion to a direct
function call.

v2: static inline for the dummy versions, I forgot.

Signed-off-by: Daniel Vetter <daniel.vetter@intel.com>
Reviewed-by: Sam Ravnborg <sam@ravnborg.org>
Reviewed-by: Maarten Lankhorst <maarten.lankhorst@linux.intel.com>
Cc: Bartlomiej Zolnierkiewicz <b.zolnierkie@samsung.com>
Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
Cc: Hans de Goede <hdegoede@redhat.com>
Cc: Mikulas Patocka <mpatocka@redhat.com>
Cc: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Peter Rosin <peda@axentia.se>
Cc: Yisheng Xie <ysxie@foxmail.com>
Cc: "Michał Mirosław" <mirq-linux@rere.qmqm.pl>
Cc: linux-fbdev@vger.kernel.org
Link: https://patchwork.freedesktop.org/patch/msgid/20190528090304.9388-23-daniel.vetter@ffwll.ch
---
 drivers/video/fbdev/core/fbcon.c | 14 +++-----------
 drivers/video/fbdev/core/fbmem.c | 22 +++++++---------------
 include/linux/fb.h               |  5 -----
 include/linux/fbcon.h            |  6 ++++++
 4 files changed, 16 insertions(+), 31 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/video/fbdev/core/fbcon.c b/drivers/video/fbdev/core/fbcon.c
index 9994111f2563..24bd6cd270d5 100644
--- a/drivers/video/fbdev/core/fbcon.c
+++ b/drivers/video/fbdev/core/fbcon.c
@@ -3019,8 +3019,8 @@ static void fbcon_set_all_vcs(struct fb_info *info)
 		fbcon_modechanged(info);
 }
 
-static int fbcon_mode_deleted(struct fb_info *info,
-			      struct fb_videomode *mode)
+int fbcon_mode_deleted(struct fb_info *info,
+		       struct fb_videomode *mode)
 {
 	struct fb_info *fb_info;
 	struct fbcon_display *p;
@@ -3262,7 +3262,7 @@ static void fbcon_fb_blanked(struct fb_info *info, int blank)
 	ops->blank_state = blank;
 }
 
-static void fbcon_new_modelist(struct fb_info *info)
+void fbcon_new_modelist(struct fb_info *info)
 {
 	int i;
 	struct vc_data *vc;
@@ -3324,7 +3324,6 @@ static int fbcon_event_notify(struct notifier_block *self,
 {
 	struct fb_event *event = data;
 	struct fb_info *info = event->info;
-	struct fb_videomode *mode;
 	struct fb_con2fbmap *con2fb;
 	struct fb_blit_caps *caps;
 	int idx, ret = 0;
@@ -3336,10 +3335,6 @@ static int fbcon_event_notify(struct notifier_block *self,
 	case FB_EVENT_MODE_CHANGE_ALL:
 		fbcon_set_all_vcs(info);
 		break;
-	case FB_EVENT_MODE_DELETE:
-		mode = event->data;
-		ret = fbcon_mode_deleted(info, mode);
-		break;
 	case FB_EVENT_SET_CONSOLE_MAP:
 		/* called with console lock held */
 		con2fb = event->data;
@@ -3353,9 +3348,6 @@ static int fbcon_event_notify(struct notifier_block *self,
 	case FB_EVENT_BLANK:
 		fbcon_fb_blanked(info, *(int *)event->data);
 		break;
-	case FB_EVENT_NEW_MODELIST:
-		fbcon_new_modelist(info);
-		break;
 	case FB_EVENT_GET_REQ:
 		caps = event->data;
 		fbcon_get_requirement(info, caps);
diff --git a/drivers/video/fbdev/core/fbmem.c b/drivers/video/fbdev/core/fbmem.c
index 73269dedcd45..cbdd141e7695 100644
--- a/drivers/video/fbdev/core/fbmem.c
+++ b/drivers/video/fbdev/core/fbmem.c
@@ -966,16 +966,11 @@ fb_set_var(struct fb_info *info, struct fb_var_screeninfo *var)
 		/* make sure we don't delete the videomode of current var */
 		ret = fb_mode_is_equal(&mode1, &mode2);
 
-		if (!ret) {
-		    struct fb_event event;
-
-		    event.info = info;
-		    event.data = &mode1;
-		    ret = fb_notifier_call_chain(FB_EVENT_MODE_DELETE, &event);
-		}
+		if (!ret)
+			fbcon_mode_deleted(info, &mode1);
 
 		if (!ret)
-		    fb_delete_videomode(&mode1, &info->modelist);
+			fb_delete_videomode(&mode1, &info->modelist);
 
 
 		ret = (ret) ? -EINVAL : 0;
@@ -1992,7 +1987,6 @@ subsys_initcall(fbmem_init);
 
 int fb_new_modelist(struct fb_info *info)
 {
-	struct fb_event event;
 	struct fb_var_screeninfo var = info->var;
 	struct list_head *pos, *n;
 	struct fb_modelist *modelist;
@@ -2012,14 +2006,12 @@ int fb_new_modelist(struct fb_info *info)
 		}
 	}
 
-	err = 1;
+	if (list_empty(&info->modelist))
+		return 1;
 
-	if (!list_empty(&info->modelist)) {
-		event.info = info;
-		err = fb_notifier_call_chain(FB_EVENT_NEW_MODELIST, &event);
-	}
+	fbcon_new_modelist(info);
 
-	return err;
+	return 0;
 }
 
 MODULE_LICENSE("GPL");
diff --git a/include/linux/fb.h b/include/linux/fb.h
index 794b386415b7..7a788ed8c7b5 100644
--- a/include/linux/fb.h
+++ b/include/linux/fb.h
@@ -126,8 +126,6 @@ struct fb_cursor_user {
 
 /*	The resolution of the passed in fb_info about to change */ 
 #define FB_EVENT_MODE_CHANGE		0x01
-/*      An entry from the modelist was removed */
-#define FB_EVENT_MODE_DELETE            0x04
 
 #ifdef CONFIG_GUMSTIX_AM200EPD
 /* only used by mach-pxa/am200epd.c */
@@ -142,9 +140,6 @@ struct fb_cursor_user {
 /*      A hardware display blank change occurred */
 #define FB_EVENT_BLANK                  0x09
 /*      Private modelist is to be replaced */
-#define FB_EVENT_NEW_MODELIST           0x0A
-/*	The resolution of the passed in fb_info about to change and
-        all vc's should be changed         */
 #define FB_EVENT_MODE_CHANGE_ALL	0x0B
 /*	A software display blank change occurred */
 #define FB_EVENT_CONBLANK               0x0C
diff --git a/include/linux/fbcon.h b/include/linux/fbcon.h
index 790c42ec7b5d..c139834342f5 100644
--- a/include/linux/fbcon.h
+++ b/include/linux/fbcon.h
@@ -9,6 +9,9 @@ void fbcon_fb_unregistered(struct fb_info *info);
 void fbcon_fb_unbind(struct fb_info *info);
 void fbcon_suspended(struct fb_info *info);
 void fbcon_resumed(struct fb_info *info);
+int fbcon_mode_deleted(struct fb_info *info,
+		       struct fb_videomode *mode);
+void fbcon_new_modelist(struct fb_info *info);
 #else
 static inline void fb_console_init(void) {}
 static inline void fb_console_exit(void) {}
@@ -17,6 +20,9 @@ static inline void fbcon_fb_unregistered(struct fb_info *info) {}
 static inline void fbcon_fb_unbind(struct fb_info *info) {}
 static inline void fbcon_suspended(struct fb_info *info) {}
 static inline void fbcon_resumed(struct fb_info *info) {}
+static inline int fbcon_mode_deleted(struct fb_info *info,
+				     struct fb_videomode *mode) { return 0; }
+static inline void fbcon_new_modelist(struct fb_info *info) {}
 #endif
 
 #endif /* _LINUX_FBCON_H */
-- 
cgit v1.2.3


From 0526c2239ad8ceef98652fe8e059044c24c62ea7 Mon Sep 17 00:00:00 2001
From: Daniel Vetter <daniel.vetter@ffwll.ch>
Date: Tue, 28 May 2019 11:02:54 +0200
Subject: fbdev: Call fbcon_get_requirement directly
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Pretty simple case really.

v2: Forgot to remove a break;

v3: Add static inline to the dummy versions.

Signed-off-by: Daniel Vetter <daniel.vetter@intel.com>
Reviewed-by: Sam Ravnborg <sam@ravnborg.org>
Reviewed-by: Maarten Lankhorst <maarten.lankhorst@linux.intel.com>
Cc: Bartlomiej Zolnierkiewicz <b.zolnierkie@samsung.com>
Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
Cc: Hans de Goede <hdegoede@redhat.com>
Cc: "Steven Rostedt (VMware)" <rostedt@goodmis.org>
Cc: Prarit Bhargava <prarit@redhat.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Yisheng Xie <ysxie@foxmail.com>
Cc: "Michał Mirosław" <mirq-linux@rere.qmqm.pl>
Cc: Peter Rosin <peda@axentia.se>
Cc: Mikulas Patocka <mpatocka@redhat.com>
Cc: linux-fbdev@vger.kernel.org
Link: https://patchwork.freedesktop.org/patch/msgid/20190528090304.9388-24-daniel.vetter@ffwll.ch
---
 drivers/video/fbdev/core/fbcon.c | 9 ++-------
 drivers/video/fbdev/core/fbmem.c | 5 +----
 include/linux/fb.h               | 2 --
 include/linux/fbcon.h            | 4 ++++
 4 files changed, 7 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/video/fbdev/core/fbcon.c b/drivers/video/fbdev/core/fbcon.c
index 24bd6cd270d5..ee0bed9571aa 100644
--- a/drivers/video/fbdev/core/fbcon.c
+++ b/drivers/video/fbdev/core/fbcon.c
@@ -3283,8 +3283,8 @@ void fbcon_new_modelist(struct fb_info *info)
 	}
 }
 
-static void fbcon_get_requirement(struct fb_info *info,
-				  struct fb_blit_caps *caps)
+void fbcon_get_requirement(struct fb_info *info,
+			   struct fb_blit_caps *caps)
 {
 	struct vc_data *vc;
 	struct fbcon_display *p;
@@ -3325,7 +3325,6 @@ static int fbcon_event_notify(struct notifier_block *self,
 	struct fb_event *event = data;
 	struct fb_info *info = event->info;
 	struct fb_con2fbmap *con2fb;
-	struct fb_blit_caps *caps;
 	int idx, ret = 0;
 
 	switch(action) {
@@ -3348,10 +3347,6 @@ static int fbcon_event_notify(struct notifier_block *self,
 	case FB_EVENT_BLANK:
 		fbcon_fb_blanked(info, *(int *)event->data);
 		break;
-	case FB_EVENT_GET_REQ:
-		caps = event->data;
-		fbcon_get_requirement(info, caps);
-		break;
 	case FB_EVENT_REMAP_ALL_CONSOLE:
 		idx = info->node;
 		fbcon_remap_all(idx);
diff --git a/drivers/video/fbdev/core/fbmem.c b/drivers/video/fbdev/core/fbmem.c
index cbdd141e7695..ddc0c16b8bbf 100644
--- a/drivers/video/fbdev/core/fbmem.c
+++ b/drivers/video/fbdev/core/fbmem.c
@@ -932,16 +932,13 @@ EXPORT_SYMBOL(fb_pan_display);
 static int fb_check_caps(struct fb_info *info, struct fb_var_screeninfo *var,
 			 u32 activate)
 {
-	struct fb_event event;
 	struct fb_blit_caps caps, fbcaps;
 	int err = 0;
 
 	memset(&caps, 0, sizeof(caps));
 	memset(&fbcaps, 0, sizeof(fbcaps));
 	caps.flags = (activate & FB_ACTIVATE_ALL) ? 1 : 0;
-	event.info = info;
-	event.data = &caps;
-	fb_notifier_call_chain(FB_EVENT_GET_REQ, &event);
+	fbcon_get_requirement(info, &caps);
 	info->fbops->fb_get_caps(info, &fbcaps, var);
 
 	if (((fbcaps.x ^ caps.x) & caps.x) ||
diff --git a/include/linux/fb.h b/include/linux/fb.h
index 7a788ed8c7b5..0d86aa31bf8d 100644
--- a/include/linux/fb.h
+++ b/include/linux/fb.h
@@ -143,8 +143,6 @@ struct fb_cursor_user {
 #define FB_EVENT_MODE_CHANGE_ALL	0x0B
 /*	A software display blank change occurred */
 #define FB_EVENT_CONBLANK               0x0C
-/*      Get drawing requirements        */
-#define FB_EVENT_GET_REQ                0x0D
 /*      CONSOLE-SPECIFIC: remap all consoles to new fb - for vga_switcheroo */
 #define FB_EVENT_REMAP_ALL_CONSOLE      0x0F
 /*      A hardware display blank early change occurred */
diff --git a/include/linux/fbcon.h b/include/linux/fbcon.h
index c139834342f5..305e4f2eddac 100644
--- a/include/linux/fbcon.h
+++ b/include/linux/fbcon.h
@@ -12,6 +12,8 @@ void fbcon_resumed(struct fb_info *info);
 int fbcon_mode_deleted(struct fb_info *info,
 		       struct fb_videomode *mode);
 void fbcon_new_modelist(struct fb_info *info);
+void fbcon_get_requirement(struct fb_info *info,
+			   struct fb_blit_caps *caps);
 #else
 static inline void fb_console_init(void) {}
 static inline void fb_console_exit(void) {}
@@ -23,6 +25,8 @@ static inline void fbcon_resumed(struct fb_info *info) {}
 static inline int fbcon_mode_deleted(struct fb_info *info,
 				     struct fb_videomode *mode) { return 0; }
 static inline void fbcon_new_modelist(struct fb_info *info) {}
+static inline void fbcon_get_requirement(struct fb_info *info,
+					 struct fb_blit_caps *caps) {}
 #endif
 
 #endif /* _LINUX_FBCON_H */
-- 
cgit v1.2.3


From 7a625549ea8c14be70bc7cfaf30215401bba6da0 Mon Sep 17 00:00:00 2001
From: Daniel Vetter <daniel.vetter@ffwll.ch>
Date: Tue, 28 May 2019 11:02:55 +0200
Subject: Revert "backlight/fbcon: Add FB_EVENT_CONBLANK"

This reverts commit 994efacdf9a087b52f71e620b58dfa526b0cf928.

The justification is that if hw blanking (i.e. fbops->fb_blank)
fails, then we still want to shut down the backlight. Which is exactly
_not_ what fb_blank() does and so rather inconsistent if we end up
with different behaviour between fbcon and direct fbdev usage. Given
that the entire notifier maze is getting in the way anyway I figured
it's simplest to revert this not well justified commit.

v2: Add static inline to the dummy version.

Cc: Richard Purdie <rpurdie@rpsys.net>
Signed-off-by: Daniel Vetter <daniel.vetter@intel.com>
Reviewed-by: Sam Ravnborg <sam@ravnborg.org>
Reviewed-by: Maarten Lankhorst <maarten.lankhorst@linux.intel.com>
Acked-by: Daniel Thompson <daniel.thompson@linaro.org>
Cc: Lee Jones <lee.jones@linaro.org>
Cc: Daniel Thompson <daniel.thompson@linaro.org>
Cc: Jingoo Han <jingoohan1@gmail.com>
Cc: Bartlomiej Zolnierkiewicz <b.zolnierkie@samsung.com>
Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
Cc: Hans de Goede <hdegoede@redhat.com>
Cc: Yisheng Xie <ysxie@foxmail.com>
Cc: linux-fbdev@vger.kernel.org
Link: https://patchwork.freedesktop.org/patch/msgid/20190528090304.9388-25-daniel.vetter@ffwll.ch
---
 drivers/video/backlight/backlight.c |  2 +-
 drivers/video/fbdev/core/fbcon.c    | 14 +-------------
 drivers/video/fbdev/core/fbmem.c    |  1 +
 include/linux/fb.h                  |  4 +---
 include/linux/fbcon.h               |  2 ++
 5 files changed, 6 insertions(+), 17 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/video/backlight/backlight.c b/drivers/video/backlight/backlight.c
index 1ef8b6fd62ac..5dc07106a59e 100644
--- a/drivers/video/backlight/backlight.c
+++ b/drivers/video/backlight/backlight.c
@@ -47,7 +47,7 @@ static int fb_notifier_callback(struct notifier_block *self,
 	int fb_blank = 0;
 
 	/* If we aren't interested in this event, skip it immediately ... */
-	if (event != FB_EVENT_BLANK && event != FB_EVENT_CONBLANK)
+	if (event != FB_EVENT_BLANK)
 		return 0;
 
 	bd = container_of(self, struct backlight_device, fb_notif);
diff --git a/drivers/video/fbdev/core/fbcon.c b/drivers/video/fbdev/core/fbcon.c
index ee0bed9571aa..be179b47d1c6 100644
--- a/drivers/video/fbdev/core/fbcon.c
+++ b/drivers/video/fbdev/core/fbcon.c
@@ -2350,8 +2350,6 @@ static int fbcon_switch(struct vc_data *vc)
 static void fbcon_generic_blank(struct vc_data *vc, struct fb_info *info,
 				int blank)
 {
-	struct fb_event event;
-
 	if (blank) {
 		unsigned short charmask = vc->vc_hi_font_mask ?
 			0x1ff : 0xff;
@@ -2362,13 +2360,6 @@ static void fbcon_generic_blank(struct vc_data *vc, struct fb_info *info,
 		fbcon_clear(vc, 0, 0, vc->vc_rows, vc->vc_cols);
 		vc->vc_video_erase_char = oldc;
 	}
-
-
-	lock_fb_info(info);
-	event.info = info;
-	event.data = &blank;
-	fb_notifier_call_chain(FB_EVENT_CONBLANK, &event);
-	unlock_fb_info(info);
 }
 
 static int fbcon_blank(struct vc_data *vc, int blank, int mode_switch)
@@ -3240,7 +3231,7 @@ int fbcon_fb_registered(struct fb_info *info)
 	return ret;
 }
 
-static void fbcon_fb_blanked(struct fb_info *info, int blank)
+void fbcon_fb_blanked(struct fb_info *info, int blank)
 {
 	struct fbcon_ops *ops = info->fbcon_par;
 	struct vc_data *vc;
@@ -3344,9 +3335,6 @@ static int fbcon_event_notify(struct notifier_block *self,
 		con2fb = event->data;
 		con2fb->framebuffer = con2fb_map[con2fb->console - 1];
 		break;
-	case FB_EVENT_BLANK:
-		fbcon_fb_blanked(info, *(int *)event->data);
-		break;
 	case FB_EVENT_REMAP_ALL_CONSOLE:
 		idx = info->node;
 		fbcon_remap_all(idx);
diff --git a/drivers/video/fbdev/core/fbmem.c b/drivers/video/fbdev/core/fbmem.c
index ddc0c16b8bbf..9366fbe99a58 100644
--- a/drivers/video/fbdev/core/fbmem.c
+++ b/drivers/video/fbdev/core/fbmem.c
@@ -1068,6 +1068,7 @@ fb_blank(struct fb_info *info, int blank)
 	event.data = &blank;
 
 	early_ret = fb_notifier_call_chain(FB_EARLY_EVENT_BLANK, &event);
+	fbcon_fb_blanked(info, blank);
 
 	if (info->fbops->fb_blank)
  		ret = info->fbops->fb_blank(blank, info);
diff --git a/include/linux/fb.h b/include/linux/fb.h
index 0d86aa31bf8d..1e66fac3124f 100644
--- a/include/linux/fb.h
+++ b/include/linux/fb.h
@@ -137,12 +137,10 @@ struct fb_cursor_user {
 #define FB_EVENT_GET_CONSOLE_MAP        0x07
 /*      CONSOLE-SPECIFIC: set console to framebuffer mapping */
 #define FB_EVENT_SET_CONSOLE_MAP        0x08
-/*      A hardware display blank change occurred */
+/*      A display blank is requested       */
 #define FB_EVENT_BLANK                  0x09
 /*      Private modelist is to be replaced */
 #define FB_EVENT_MODE_CHANGE_ALL	0x0B
-/*	A software display blank change occurred */
-#define FB_EVENT_CONBLANK               0x0C
 /*      CONSOLE-SPECIFIC: remap all consoles to new fb - for vga_switcheroo */
 #define FB_EVENT_REMAP_ALL_CONSOLE      0x0F
 /*      A hardware display blank early change occurred */
diff --git a/include/linux/fbcon.h b/include/linux/fbcon.h
index 305e4f2eddac..d67d7ec51ef9 100644
--- a/include/linux/fbcon.h
+++ b/include/linux/fbcon.h
@@ -14,6 +14,7 @@ int fbcon_mode_deleted(struct fb_info *info,
 void fbcon_new_modelist(struct fb_info *info);
 void fbcon_get_requirement(struct fb_info *info,
 			   struct fb_blit_caps *caps);
+void fbcon_fb_blanked(struct fb_info *info, int blank);
 #else
 static inline void fb_console_init(void) {}
 static inline void fb_console_exit(void) {}
@@ -27,6 +28,7 @@ static inline int fbcon_mode_deleted(struct fb_info *info,
 static inline void fbcon_new_modelist(struct fb_info *info) {}
 static inline void fbcon_get_requirement(struct fb_info *info,
 					 struct fb_blit_caps *caps) {}
+static inline void fbcon_fb_blanked(struct fb_info *info, int blank) {}
 #endif
 
 #endif /* _LINUX_FBCON_H */
-- 
cgit v1.2.3


From 9e1467002630065ed86c65ea28bfc9194fff6f0e Mon Sep 17 00:00:00 2001
From: Daniel Vetter <daniel.vetter@ffwll.ch>
Date: Tue, 28 May 2019 11:02:59 +0200
Subject: fbcon: replace FB_EVENT_MODE_CHANGE/_ALL with direct calls
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Create a new wrapper function for this, feels like there's some
refactoring room here between the two modes.

v2: backlight notifier is also interested in the mode change event,
it calls lcd->set_mode, of which there are 3 implementations. Thanks
to Maarten for spotting this. So we keep that. We can ditch the differentiation
between mode change and all mode changes (because backlight notifier
doesn't care), and we can drop the FBINFO_MISC_USEREVENT stuff too,
because that's just to prevent recursion between fbmem.c and fbcon.c.

While at it flatten the control flow a bit.

v3: Need to add a static inline to the dummy function.

v4: Add missing #include <fbcon.h> to sh_mob (Sam).

Cc: Sam Ravnborg <sam@ravnborg.org>
Signed-off-by: Daniel Vetter <daniel.vetter@intel.com>
Reviewed-by: Sam Ravnborg <sam@ravnborg.org>
Reviewed-by: Maarten Lankhorst <maarten.lankhorst@linux.intel.com>
Acked-by: Daniel Thompson <daniel.thompson@linaro.org>
Cc: Maarten Lankhorst <maarten.lankhorst@linux.intel.com>
Cc: Lee Jones <lee.jones@linaro.org>
Cc: Daniel Thompson <daniel.thompson@linaro.org>
Cc: Jingoo Han <jingoohan1@gmail.com>
Cc: Bartlomiej Zolnierkiewicz <b.zolnierkie@samsung.com>
Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
Cc: Hans de Goede <hdegoede@redhat.com>
Cc: Yisheng Xie <ysxie@foxmail.com>
Cc: "Michał Mirosław" <mirq-linux@rere.qmqm.pl>
Cc: Peter Rosin <peda@axentia.se>
Cc: Mikulas Patocka <mpatocka@redhat.com>
Cc: linux-fbdev@vger.kernel.org
Link: https://patchwork.freedesktop.org/patch/msgid/20190528090304.9388-29-daniel.vetter@ffwll.ch
---
 drivers/video/backlight/lcd.c          |  1 -
 drivers/video/fbdev/core/fbcon.c       | 15 +++++++++------
 drivers/video/fbdev/core/fbmem.c       | 21 ++++++++++-----------
 drivers/video/fbdev/sh_mobile_lcdcfb.c | 12 ++----------
 include/linux/fb.h                     |  2 --
 include/linux/fbcon.h                  |  2 ++
 6 files changed, 23 insertions(+), 30 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/video/backlight/lcd.c b/drivers/video/backlight/lcd.c
index 151b18776add..ecdda06989d0 100644
--- a/drivers/video/backlight/lcd.c
+++ b/drivers/video/backlight/lcd.c
@@ -34,7 +34,6 @@ static int fb_notifier_callback(struct notifier_block *self,
 	switch (event) {
 	case FB_EVENT_BLANK:
 	case FB_EVENT_MODE_CHANGE:
-	case FB_EVENT_MODE_CHANGE_ALL:
 	case FB_EARLY_EVENT_BLANK:
 	case FB_R_EARLY_EVENT_BLANK:
 		break;
diff --git a/drivers/video/fbdev/core/fbcon.c b/drivers/video/fbdev/core/fbcon.c
index 4afbc7d8c68c..1837985e1ffb 100644
--- a/drivers/video/fbdev/core/fbcon.c
+++ b/drivers/video/fbdev/core/fbcon.c
@@ -3009,6 +3009,15 @@ static void fbcon_set_all_vcs(struct fb_info *info)
 		fbcon_modechanged(info);
 }
 
+
+void fbcon_update_vcs(struct fb_info *info, bool all)
+{
+	if (all)
+		fbcon_set_all_vcs(info);
+	else
+		fbcon_modechanged(info);
+}
+
 int fbcon_mode_deleted(struct fb_info *info,
 		       struct fb_videomode *mode)
 {
@@ -3318,12 +3327,6 @@ static int fbcon_event_notify(struct notifier_block *self,
 	int idx, ret = 0;
 
 	switch(action) {
-	case FB_EVENT_MODE_CHANGE:
-		fbcon_modechanged(info);
-		break;
-	case FB_EVENT_MODE_CHANGE_ALL:
-		fbcon_set_all_vcs(info);
-		break;
 	case FB_EVENT_SET_CONSOLE_MAP:
 		/* called with console lock held */
 		con2fb = event->data;
diff --git a/drivers/video/fbdev/core/fbmem.c b/drivers/video/fbdev/core/fbmem.c
index 96805fe85332..dd1a708df1a7 100644
--- a/drivers/video/fbdev/core/fbmem.c
+++ b/drivers/video/fbdev/core/fbmem.c
@@ -957,6 +957,7 @@ fb_set_var(struct fb_info *info, struct fb_var_screeninfo *var)
 	u32 activate;
 	struct fb_var_screeninfo old_var;
 	struct fb_videomode mode;
+	struct fb_event event;
 
 	if (var->activate & FB_ACTIVATE_INV_MODE) {
 		struct fb_videomode mode1, mode2;
@@ -1039,19 +1040,17 @@ fb_set_var(struct fb_info *info, struct fb_var_screeninfo *var)
 	    !list_empty(&info->modelist))
 		ret = fb_add_videomode(&mode, &info->modelist);
 
-	if (!ret && (flags & FBINFO_MISC_USEREVENT)) {
-		struct fb_event event;
-		int evnt = (activate & FB_ACTIVATE_ALL) ?
-			FB_EVENT_MODE_CHANGE_ALL :
-			FB_EVENT_MODE_CHANGE;
+	if (ret)
+		return ret;
 
-		info->flags &= ~FBINFO_MISC_USEREVENT;
-		event.info = info;
-		event.data = &mode;
-		fb_notifier_call_chain(evnt, &event);
-	}
+	event.info = info;
+	event.data = &mode;
+	fb_notifier_call_chain(FB_EVENT_MODE_CHANGE, &event);
 
-	return ret;
+	if (flags & FBINFO_MISC_USEREVENT)
+		fbcon_update_vcs(info, activate & FB_ACTIVATE_ALL);
+
+	return 0;
 }
 EXPORT_SYMBOL(fb_set_var);
 
diff --git a/drivers/video/fbdev/sh_mobile_lcdcfb.c b/drivers/video/fbdev/sh_mobile_lcdcfb.c
index 015a02a29d37..b8454424910d 100644
--- a/drivers/video/fbdev/sh_mobile_lcdcfb.c
+++ b/drivers/video/fbdev/sh_mobile_lcdcfb.c
@@ -15,6 +15,7 @@
 #include <linux/ctype.h>
 #include <linux/dma-mapping.h>
 #include <linux/delay.h>
+#include <linux/fbcon.h>
 #include <linux/gpio.h>
 #include <linux/init.h>
 #include <linux/interrupt.h>
@@ -1757,8 +1758,6 @@ static void sh_mobile_fb_reconfig(struct fb_info *info)
 	struct sh_mobile_lcdc_chan *ch = info->par;
 	struct fb_var_screeninfo var;
 	struct fb_videomode mode;
-	struct fb_event event;
-	int evnt = FB_EVENT_MODE_CHANGE_ALL;
 
 	if (ch->use_count > 1 || (ch->use_count == 1 && !info->fbcon_par))
 		/* More framebuffer users are active */
@@ -1780,14 +1779,7 @@ static void sh_mobile_fb_reconfig(struct fb_info *info)
 		/* Couldn't reconfigure, hopefully, can continue as before */
 		return;
 
-	/*
-	 * fb_set_var() calls the notifier change internally, only if
-	 * FBINFO_MISC_USEREVENT flag is set. Since we do not want to fake a
-	 * user event, we have to call the chain ourselves.
-	 */
-	event.info = info;
-	event.data = &ch->display.mode;
-	fb_notifier_call_chain(evnt, &event);
+	fbcon_update_vcs(info, true);
 }
 
 /*
diff --git a/include/linux/fb.h b/include/linux/fb.h
index 1e66fac3124f..f9c212f9b661 100644
--- a/include/linux/fb.h
+++ b/include/linux/fb.h
@@ -139,8 +139,6 @@ struct fb_cursor_user {
 #define FB_EVENT_SET_CONSOLE_MAP        0x08
 /*      A display blank is requested       */
 #define FB_EVENT_BLANK                  0x09
-/*      Private modelist is to be replaced */
-#define FB_EVENT_MODE_CHANGE_ALL	0x0B
 /*      CONSOLE-SPECIFIC: remap all consoles to new fb - for vga_switcheroo */
 #define FB_EVENT_REMAP_ALL_CONSOLE      0x0F
 /*      A hardware display blank early change occurred */
diff --git a/include/linux/fbcon.h b/include/linux/fbcon.h
index d67d7ec51ef9..de31eeb22c97 100644
--- a/include/linux/fbcon.h
+++ b/include/linux/fbcon.h
@@ -15,6 +15,7 @@ void fbcon_new_modelist(struct fb_info *info);
 void fbcon_get_requirement(struct fb_info *info,
 			   struct fb_blit_caps *caps);
 void fbcon_fb_blanked(struct fb_info *info, int blank);
+void fbcon_update_vcs(struct fb_info *info, bool all);
 #else
 static inline void fb_console_init(void) {}
 static inline void fb_console_exit(void) {}
@@ -29,6 +30,7 @@ static inline void fbcon_new_modelist(struct fb_info *info) {}
 static inline void fbcon_get_requirement(struct fb_info *info,
 					 struct fb_blit_caps *caps) {}
 static inline void fbcon_fb_blanked(struct fb_info *info, int blank) {}
+static inline void fbcon_update_vcs(struct fb_info *info, bool all) {}
 #endif
 
 #endif /* _LINUX_FBCON_H */
-- 
cgit v1.2.3


From 1cd51b5d200dec292577a4656803d8aeff54ad51 Mon Sep 17 00:00:00 2001
From: Daniel Vetter <daniel.vetter@ffwll.ch>
Date: Tue, 28 May 2019 11:03:00 +0200
Subject: vgaswitcheroo: call fbcon_remap_all directly

While at it, clean up the interface a bit and push the console locking
into fbcon.c.

v2: Remove now outdated comment (Lukas).

v3: Forgot to add static inline to the dummy function.

Acked-by: Lukas Wunner <lukas@wunner.de>
Signed-off-by: Daniel Vetter <daniel.vetter@intel.com>
Reviewed-by: Sam Ravnborg <sam@ravnborg.org>
Reviewed-by: Maarten Lankhorst <maarten.lankhorst@linux.intel.com>
Cc: Lukas Wunner <lukas@wunner.de>
Cc: David Airlie <airlied@linux.ie>
Cc: Daniel Vetter <daniel@ffwll.ch>
Cc: Maarten Lankhorst <maarten.lankhorst@linux.intel.com>
Cc: Maxime Ripard <maxime.ripard@bootlin.com>
Cc: Sean Paul <sean@poorly.run>
Cc: Bartlomiej Zolnierkiewicz <b.zolnierkie@samsung.com>
Cc: Hans de Goede <hdegoede@redhat.com>
Cc: Yisheng Xie <ysxie@foxmail.com>
Cc: linux-fbdev@vger.kernel.org
Link: https://patchwork.freedesktop.org/patch/msgid/20190528090304.9388-30-daniel.vetter@ffwll.ch
---
 drivers/gpu/vga/vga_switcheroo.c | 11 +++--------
 drivers/video/fbdev/core/fbcon.c | 14 +++++---------
 include/linux/fb.h               |  2 --
 include/linux/fbcon.h            |  2 ++
 4 files changed, 10 insertions(+), 19 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/gpu/vga/vga_switcheroo.c b/drivers/gpu/vga/vga_switcheroo.c
index a132c37d7334..65d7541c413a 100644
--- a/drivers/gpu/vga/vga_switcheroo.c
+++ b/drivers/gpu/vga/vga_switcheroo.c
@@ -35,6 +35,7 @@
 #include <linux/debugfs.h>
 #include <linux/fb.h>
 #include <linux/fs.h>
+#include <linux/fbcon.h>
 #include <linux/module.h>
 #include <linux/pci.h>
 #include <linux/pm_domain.h>
@@ -736,14 +737,8 @@ static int vga_switchto_stage2(struct vga_switcheroo_client *new_client)
 	if (!active->driver_power_control)
 		set_audio_state(active->id, VGA_SWITCHEROO_OFF);
 
-	if (new_client->fb_info) {
-		struct fb_event event;
-
-		console_lock();
-		event.info = new_client->fb_info;
-		fb_notifier_call_chain(FB_EVENT_REMAP_ALL_CONSOLE, &event);
-		console_unlock();
-	}
+	if (new_client->fb_info)
+		fbcon_remap_all(new_client->fb_info);
 
 	mutex_lock(&vgasr_priv.mux_hw_lock);
 	ret = vgasr_priv.handler->switchto(new_client->id);
diff --git a/drivers/video/fbdev/core/fbcon.c b/drivers/video/fbdev/core/fbcon.c
index 1837985e1ffb..44779a4371ee 100644
--- a/drivers/video/fbdev/core/fbcon.c
+++ b/drivers/video/fbdev/core/fbcon.c
@@ -3149,17 +3149,16 @@ void fbcon_fb_unregistered(struct fb_info *info)
 		do_unregister_con_driver(&fb_con);
 }
 
-/* called with console_lock held */
-static void fbcon_remap_all(int idx)
+void fbcon_remap_all(struct fb_info *info)
 {
-	int i;
-
-	WARN_CONSOLE_UNLOCKED();
+	int i, idx = info->node;
 
+	console_lock();
 	if (deferred_takeover) {
 		for (i = first_fb_vc; i <= last_fb_vc; i++)
 			con2fb_map_boot[i] = idx;
 		fbcon_map_override();
+		console_unlock();
 		return;
 	}
 
@@ -3172,6 +3171,7 @@ static void fbcon_remap_all(int idx)
 		       first_fb_vc + 1, last_fb_vc + 1);
 		info_idx = idx;
 	}
+	console_unlock();
 }
 
 #ifdef CONFIG_FRAMEBUFFER_CONSOLE_DETECT_PRIMARY
@@ -3337,10 +3337,6 @@ static int fbcon_event_notify(struct notifier_block *self,
 		con2fb = event->data;
 		con2fb->framebuffer = con2fb_map[con2fb->console - 1];
 		break;
-	case FB_EVENT_REMAP_ALL_CONSOLE:
-		idx = info->node;
-		fbcon_remap_all(idx);
-		break;
 	}
 	return ret;
 }
diff --git a/include/linux/fb.h b/include/linux/fb.h
index f9c212f9b661..25e4b885f5b3 100644
--- a/include/linux/fb.h
+++ b/include/linux/fb.h
@@ -139,8 +139,6 @@ struct fb_cursor_user {
 #define FB_EVENT_SET_CONSOLE_MAP        0x08
 /*      A display blank is requested       */
 #define FB_EVENT_BLANK                  0x09
-/*      CONSOLE-SPECIFIC: remap all consoles to new fb - for vga_switcheroo */
-#define FB_EVENT_REMAP_ALL_CONSOLE      0x0F
 /*      A hardware display blank early change occurred */
 #define FB_EARLY_EVENT_BLANK		0x10
 /*      A hardware display blank revert early change occurred */
diff --git a/include/linux/fbcon.h b/include/linux/fbcon.h
index de31eeb22c97..69f900d289b2 100644
--- a/include/linux/fbcon.h
+++ b/include/linux/fbcon.h
@@ -16,6 +16,7 @@ void fbcon_get_requirement(struct fb_info *info,
 			   struct fb_blit_caps *caps);
 void fbcon_fb_blanked(struct fb_info *info, int blank);
 void fbcon_update_vcs(struct fb_info *info, bool all);
+void fbcon_remap_all(struct fb_info *info);
 #else
 static inline void fb_console_init(void) {}
 static inline void fb_console_exit(void) {}
@@ -31,6 +32,7 @@ static inline void fbcon_get_requirement(struct fb_info *info,
 					 struct fb_blit_caps *caps) {}
 static inline void fbcon_fb_blanked(struct fb_info *info, int blank) {}
 static inline void fbcon_update_vcs(struct fb_info *info, bool all) {}
+static inline void fbcon_remap_all(struct fb_info *info) {}
 #endif
 
 #endif /* _LINUX_FBCON_H */
-- 
cgit v1.2.3


From fe2d70d6f6ff038c20705c34695bd34ac072af14 Mon Sep 17 00:00:00 2001
From: Daniel Vetter <daniel.vetter@ffwll.ch>
Date: Tue, 28 May 2019 11:03:01 +0200
Subject: fbcon: Call con2fb_map functions directly
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

These are actually fbcon ioctls which just happen to be exposed
through /dev/fb*. They completely ignore which fb_info they're called
on, and I think the userspace tool even hardcodes to /dev/fb0.

Hence just forward the entire thing to fbcon.c wholesale.

Note that this patch drops the fb_lock/unlock on the set side. Since
the ioctl can operate on any fb (as passed in through
con2fb.framebuffer) this is bogus. Also note that fbcon.c in general
never calls fb_lock on anything, so this has been badly broken
already.

With this the last user of the fbcon notifier callback is gone, and we
can garbage collect that too.

v2: add missing uaccess.h include (alpha fails to compile otherwise),
reported by kbuild.

v3: Remember to also drop the #defines (Maarten)

v4: Add the static inline to dummy functions.

Signed-off-by: Daniel Vetter <daniel.vetter@intel.com>
Reviewed-by: Sam Ravnborg <sam@ravnborg.org>
Reviewed-by: Maarten Lankhorst <maarten.lankhorst@linux.intel.com>
Cc: Maarten Lankhorst <maarten.lankhorst@linux.intel.com>
Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
Cc: Bartlomiej Zolnierkiewicz <b.zolnierkie@samsung.com>
Cc: Hans de Goede <hdegoede@redhat.com>
Cc: Yisheng Xie <ysxie@foxmail.com>
Cc: "Michał Mirosław" <mirq-linux@rere.qmqm.pl>
Cc: Peter Rosin <peda@axentia.se>
Cc: Mikulas Patocka <mpatocka@redhat.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20190528090304.9388-31-daniel.vetter@ffwll.ch
---
 drivers/video/fbdev/core/fbcon.c | 59 ++++++++++++++++++++++++----------------
 drivers/video/fbdev/core/fbmem.c | 34 ++---------------------
 include/linux/fb.h               |  4 ---
 include/linux/fbcon.h            |  4 +++
 4 files changed, 42 insertions(+), 59 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/video/fbdev/core/fbcon.c b/drivers/video/fbdev/core/fbcon.c
index 44779a4371ee..31d6a4e54436 100644
--- a/drivers/video/fbdev/core/fbcon.c
+++ b/drivers/video/fbdev/core/fbcon.c
@@ -76,6 +76,7 @@
 #include <linux/init.h>
 #include <linux/interrupt.h>
 #include <linux/crc32.h> /* For counting font checksums */
+#include <linux/uaccess.h>
 #include <asm/fb.h>
 #include <asm/irq.h>
 
@@ -3318,29 +3319,47 @@ void fbcon_get_requirement(struct fb_info *info,
 	}
 }
 
-static int fbcon_event_notify(struct notifier_block *self,
-			      unsigned long action, void *data)
+int fbcon_set_con2fb_map_ioctl(void __user *argp)
 {
-	struct fb_event *event = data;
-	struct fb_info *info = event->info;
-	struct fb_con2fbmap *con2fb;
-	int idx, ret = 0;
+	struct fb_con2fbmap con2fb;
+	int ret;
 
-	switch(action) {
-	case FB_EVENT_SET_CONSOLE_MAP:
-		/* called with console lock held */
-		con2fb = event->data;
-		ret = set_con2fb_map(con2fb->console - 1,
-				     con2fb->framebuffer, 1);
-		break;
-	case FB_EVENT_GET_CONSOLE_MAP:
-		con2fb = event->data;
-		con2fb->framebuffer = con2fb_map[con2fb->console - 1];
-		break;
+	if (copy_from_user(&con2fb, argp, sizeof(con2fb)))
+		return -EFAULT;
+	if (con2fb.console < 1 || con2fb.console > MAX_NR_CONSOLES)
+		return -EINVAL;
+	if (con2fb.framebuffer >= FB_MAX)
+		return -EINVAL;
+	if (!registered_fb[con2fb.framebuffer])
+		request_module("fb%d", con2fb.framebuffer);
+	if (!registered_fb[con2fb.framebuffer]) {
+		return -EINVAL;
 	}
+
+	console_lock();
+	ret = set_con2fb_map(con2fb.console - 1,
+			     con2fb.framebuffer, 1);
+	console_unlock();
+
 	return ret;
 }
 
+int fbcon_get_con2fb_map_ioctl(void __user *argp)
+{
+	struct fb_con2fbmap con2fb;
+
+	if (copy_from_user(&con2fb, argp, sizeof(con2fb)))
+		return -EFAULT;
+	if (con2fb.console < 1 || con2fb.console > MAX_NR_CONSOLES)
+		return -EINVAL;
+
+	console_lock();
+	con2fb.framebuffer = con2fb_map[con2fb.console - 1];
+	console_unlock();
+
+	return copy_to_user(argp, &con2fb, sizeof(con2fb)) ? -EFAULT : 0;
+}
+
 /*
  *  The console `switch' structure for the frame buffer based console
  */
@@ -3372,10 +3391,6 @@ static const struct consw fb_con = {
 	.con_debug_leave	= fbcon_debug_leave,
 };
 
-static struct notifier_block fbcon_event_notifier = {
-	.notifier_call	= fbcon_event_notify,
-};
-
 static ssize_t store_rotate(struct device *device,
 			    struct device_attribute *attr, const char *buf,
 			    size_t count)
@@ -3648,7 +3663,6 @@ void __init fb_console_init(void)
 	int i;
 
 	console_lock();
-	fb_register_client(&fbcon_event_notifier);
 	fbcon_device = device_create(fb_class, NULL, MKDEV(0, 0), NULL,
 				     "fbcon");
 
@@ -3684,7 +3698,6 @@ static void __exit fbcon_deinit_device(void)
 void __exit fb_console_exit(void)
 {
 	console_lock();
-	fb_unregister_client(&fbcon_event_notifier);
 	fbcon_deinit_device();
 	device_destroy(fb_class, MKDEV(0, 0));
 	fbcon_exit();
diff --git a/drivers/video/fbdev/core/fbmem.c b/drivers/video/fbdev/core/fbmem.c
index dd1a708df1a7..64dd732021d8 100644
--- a/drivers/video/fbdev/core/fbmem.c
+++ b/drivers/video/fbdev/core/fbmem.c
@@ -1092,10 +1092,8 @@ static long do_fb_ioctl(struct fb_info *info, unsigned int cmd,
 	struct fb_ops *fb;
 	struct fb_var_screeninfo var;
 	struct fb_fix_screeninfo fix;
-	struct fb_con2fbmap con2fb;
 	struct fb_cmap cmap_from;
 	struct fb_cmap_user cmap;
-	struct fb_event event;
 	void __user *argp = (void __user *)arg;
 	long ret = 0;
 
@@ -1157,38 +1155,10 @@ static long do_fb_ioctl(struct fb_info *info, unsigned int cmd,
 		ret = -EINVAL;
 		break;
 	case FBIOGET_CON2FBMAP:
-		if (copy_from_user(&con2fb, argp, sizeof(con2fb)))
-			return -EFAULT;
-		if (con2fb.console < 1 || con2fb.console > MAX_NR_CONSOLES)
-			return -EINVAL;
-		con2fb.framebuffer = -1;
-		event.data = &con2fb;
-		lock_fb_info(info);
-		event.info = info;
-		fb_notifier_call_chain(FB_EVENT_GET_CONSOLE_MAP, &event);
-		unlock_fb_info(info);
-		ret = copy_to_user(argp, &con2fb, sizeof(con2fb)) ? -EFAULT : 0;
+		ret = fbcon_get_con2fb_map_ioctl(argp);
 		break;
 	case FBIOPUT_CON2FBMAP:
-		if (copy_from_user(&con2fb, argp, sizeof(con2fb)))
-			return -EFAULT;
-		if (con2fb.console < 1 || con2fb.console > MAX_NR_CONSOLES)
-			return -EINVAL;
-		if (con2fb.framebuffer >= FB_MAX)
-			return -EINVAL;
-		if (!registered_fb[con2fb.framebuffer])
-			request_module("fb%d", con2fb.framebuffer);
-		if (!registered_fb[con2fb.framebuffer]) {
-			ret = -EINVAL;
-			break;
-		}
-		event.data = &con2fb;
-		console_lock();
-		lock_fb_info(info);
-		event.info = info;
-		ret = fb_notifier_call_chain(FB_EVENT_SET_CONSOLE_MAP, &event);
-		unlock_fb_info(info);
-		console_unlock();
+		ret = fbcon_set_con2fb_map_ioctl(argp);
 		break;
 	case FBIOBLANK:
 		console_lock();
diff --git a/include/linux/fb.h b/include/linux/fb.h
index 25e4b885f5b3..303771264644 100644
--- a/include/linux/fb.h
+++ b/include/linux/fb.h
@@ -133,10 +133,6 @@ struct fb_cursor_user {
 #define FB_EVENT_FB_UNREGISTERED        0x06
 #endif
 
-/*      CONSOLE-SPECIFIC: get console to framebuffer mapping */
-#define FB_EVENT_GET_CONSOLE_MAP        0x07
-/*      CONSOLE-SPECIFIC: set console to framebuffer mapping */
-#define FB_EVENT_SET_CONSOLE_MAP        0x08
 /*      A display blank is requested       */
 #define FB_EVENT_BLANK                  0x09
 /*      A hardware display blank early change occurred */
diff --git a/include/linux/fbcon.h b/include/linux/fbcon.h
index 69f900d289b2..ff5596dd30f8 100644
--- a/include/linux/fbcon.h
+++ b/include/linux/fbcon.h
@@ -17,6 +17,8 @@ void fbcon_get_requirement(struct fb_info *info,
 void fbcon_fb_blanked(struct fb_info *info, int blank);
 void fbcon_update_vcs(struct fb_info *info, bool all);
 void fbcon_remap_all(struct fb_info *info);
+int fbcon_set_con2fb_map_ioctl(void __user *argp);
+int fbcon_get_con2fb_map_ioctl(void __user *argp);
 #else
 static inline void fb_console_init(void) {}
 static inline void fb_console_exit(void) {}
@@ -33,6 +35,8 @@ static inline void fbcon_get_requirement(struct fb_info *info,
 static inline void fbcon_fb_blanked(struct fb_info *info, int blank) {}
 static inline void fbcon_update_vcs(struct fb_info *info, bool all) {}
 static inline void fbcon_remap_all(struct fb_info *info) {}
+static inline int fbcon_set_con2fb_map_ioctl(void __user *argp) { return 0; }
+static inline int fbcon_get_con2fb_map_ioctl(void __user *argp) { return 0; }
 #endif
 
 #endif /* _LINUX_FBCON_H */
-- 
cgit v1.2.3


From a842fe1425cb20f457abd3f8ef98b468f83ca98b Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 12 Jun 2019 11:57:25 -0700
Subject: tcp: add optional per socket transmit delay

Adding delays to TCP flows is crucial for studying behavior
of TCP stacks, including congestion control modules.

Linux offers the netem module, but it has impractical constraints:
- Need root access to change qdisc
- Hard to setup on egress if combined with non trivial qdisc like FQ
- Single delay for all flows.

EDT (Earliest Departure Time) adoption in TCP stack allows us
to enable a per socket delay at a very small cost.

Networking tools can now establish thousands of flows, each of them
with a different delay, simulating real world conditions.

This requires the FQ packet scheduler or an EDT-enabled NIC.

This patch adds the TCP_TX_DELAY socket option, to set a delay in
usec units.

  unsigned int tx_delay = 10000; /* 10 msec */

  setsockopt(fd, SOL_TCP, TCP_TX_DELAY, &tx_delay, sizeof(tx_delay));

Note that FQ packet scheduler limits might need some tweaking :

man tc-fq

PARAMETERS
   limit
       Hard  limit  on  the  real  queue  size. When this limit is
       reached, new packets are dropped. If the value is  lowered,
       packets  are  dropped so that the new limit is met. Default
       is 10000 packets.

   flow_limit
       Hard limit on the maximum  number  of  packets  queued  per
       flow.  Default value is 100.

Use of the TCP_TX_DELAY option will increase the number of skbs in the FQ
qdisc, so packets would be dropped if any of the previous limits is hit.

Use of a jump label makes this support runtime-free, for hosts
never using the option.

Also note that TSQ (TCP Small Queues) limits are slightly changed
with this patch: we need to account for the fact that artificially
delayed skbs won't stop us providing more skbs to feed the pipe (netem
uses skb_orphan_partial() for this purpose, but FQ cannot use this trick).

Because of that, using big delays might very well trigger
old bugs in TSO auto defer logic and/or sndbuf limited detection.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/tcp.h      |  2 ++
 include/net/tcp.h        | 19 +++++++++++++++++++
 include/uapi/linux/tcp.h |  3 +++
 net/ipv4/tcp.c           | 24 ++++++++++++++++++++++++
 net/ipv4/tcp_ipv4.c      | 10 ++++++----
 net/ipv4/tcp_minisocks.c |  2 +-
 net/ipv4/tcp_output.c    | 23 ++++++++++++++++++++---
 net/ipv6/tcp_ipv6.c      |  1 +
 8 files changed, 76 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 711361af9ce0..c23019a3b264 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -245,6 +245,7 @@ struct tcp_sock {
 		syn_smc:1;	/* SYN includes SMC */
 	u32	tlp_high_seq;	/* snd_nxt at the time of TLP retransmit. */
 
+	u32	tcp_tx_delay;	/* delay (in usec) added to TX packets */
 	u64	tcp_wstamp_ns;	/* departure time for next sent data packet */
 	u64	tcp_clock_cache; /* cache last tcp_clock_ns() (see tcp_mstamp_refresh()) */
 
@@ -436,6 +437,7 @@ struct tcp_timewait_sock {
 	u32			  tw_last_oow_ack_time;
 
 	int			  tw_ts_recent_stamp;
+	u32			  tw_tx_delay;
 #ifdef CONFIG_TCP_MD5SIG
 	struct tcp_md5sig_key	  *tw_md5_key;
 #endif
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 204328b88412..49a178b8d5b2 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -2232,4 +2232,23 @@ void clean_acked_data_disable(struct inet_connection_sock *icsk);
 void clean_acked_data_flush(void);
 #endif
 
+DECLARE_STATIC_KEY_FALSE(tcp_tx_delay_enabled);
+static inline void tcp_add_tx_delay(struct sk_buff *skb,
+				    const struct tcp_sock *tp)
+{
+	if (static_branch_unlikely(&tcp_tx_delay_enabled))
+		skb->skb_mstamp_ns += (u64)tp->tcp_tx_delay * NSEC_PER_USEC;
+}
+
+static inline void tcp_set_tx_time(struct sk_buff *skb,
+				   const struct sock *sk)
+{
+	if (static_branch_unlikely(&tcp_tx_delay_enabled)) {
+		u32 delay = (sk->sk_state == TCP_TIME_WAIT) ?
+			tcp_twsk(sk)->tw_tx_delay : tcp_sk(sk)->tcp_tx_delay;
+
+		skb->skb_mstamp_ns = tcp_clock_ns() + (u64)delay * NSEC_PER_USEC;
+	}
+}
+
 #endif	/* _TCP_H */
diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h
index b521464ea962..b3564f85a762 100644
--- a/include/uapi/linux/tcp.h
+++ b/include/uapi/linux/tcp.h
@@ -127,6 +127,9 @@ enum {
 
 #define TCP_CM_INQ		TCP_INQ
 
+#define TCP_TX_DELAY		37	/* delay outgoing packets by XX usec */
+
+
 #define TCP_REPAIR_ON		1
 #define TCP_REPAIR_OFF		0
 #define TCP_REPAIR_OFF_NO_WP	-1	/* Turn off without window probes */
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index bd0856ac680a..5542e3d778e6 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2736,6 +2736,21 @@ static int tcp_repair_options_est(struct sock *sk,
 	return 0;
 }
 
+DEFINE_STATIC_KEY_FALSE(tcp_tx_delay_enabled);
+EXPORT_SYMBOL(tcp_tx_delay_enabled);
+
+static void tcp_enable_tx_delay(void)
+{
+	if (!static_branch_unlikely(&tcp_tx_delay_enabled)) {
+		static int __tcp_tx_delay_enabled = 0;
+
+		if (cmpxchg(&__tcp_tx_delay_enabled, 0, 1) == 0) {
+			static_branch_enable(&tcp_tx_delay_enabled);
+			pr_info("TCP_TX_DELAY enabled\n");
+		}
+	}
+}
+
 /*
  *	Socket option code for TCP.
  */
@@ -3087,6 +3102,11 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
 		else
 			tp->recvmsg_inq = val;
 		break;
+	case TCP_TX_DELAY:
+		if (val)
+			tcp_enable_tx_delay();
+		tp->tcp_tx_delay = val;
+		break;
 	default:
 		err = -ENOPROTOOPT;
 		break;
@@ -3546,6 +3566,10 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
 		val = tp->fastopen_no_cookie;
 		break;
 
+	case TCP_TX_DELAY:
+		val = tp->tcp_tx_delay;
+		break;
+
 	case TCP_TIMESTAMP:
 		val = tcp_time_stamp_raw() + tp->tsoffset;
 		break;
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index f059fbd81a84..1b7e9e1fbd3b 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -767,9 +767,11 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
 	arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
 	local_bh_disable();
 	ctl_sk = this_cpu_read(*net->ipv4.tcp_sk);
-	if (sk)
+	if (sk) {
 		ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
 				   inet_twsk(sk)->tw_mark : sk->sk_mark;
+		tcp_set_tx_time(skb, sk);
+	}
 	ip_send_unicast_reply(ctl_sk,
 			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
 			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
@@ -859,9 +861,9 @@ static void tcp_v4_send_ack(const struct sock *sk,
 	arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
 	local_bh_disable();
 	ctl_sk = this_cpu_read(*net->ipv4.tcp_sk);
-	if (sk)
-		ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
-				   inet_twsk(sk)->tw_mark : sk->sk_mark;
+	ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
+			   inet_twsk(sk)->tw_mark : sk->sk_mark;
+	tcp_set_tx_time(skb, sk);
 	ip_send_unicast_reply(ctl_sk,
 			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
 			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 11011e8386dc..8bcaf2586b68 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -274,7 +274,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
 		tcptw->tw_ts_recent_stamp = tp->rx_opt.ts_recent_stamp;
 		tcptw->tw_ts_offset	= tp->tsoffset;
 		tcptw->tw_last_oow_ack_time = 0;
-
+		tcptw->tw_tx_delay	= tp->tcp_tx_delay;
 #if IS_ENABLED(CONFIG_IPV6)
 		if (tw->tw_family == PF_INET6) {
 			struct ipv6_pinfo *np = inet6_sk(sk);
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index f429e856e263..d954ff9069e8 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -1153,6 +1153,8 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb,
 	memset(skb->cb, 0, max(sizeof(struct inet_skb_parm),
 			       sizeof(struct inet6_skb_parm)));
 
+	tcp_add_tx_delay(skb, tp);
+
 	err = icsk->icsk_af_ops->queue_xmit(sk, skb, &inet->cork.fl);
 
 	if (unlikely(err > 0)) {
@@ -2234,6 +2236,18 @@ static bool tcp_small_queue_check(struct sock *sk, const struct sk_buff *skb,
 			      sock_net(sk)->ipv4.sysctl_tcp_limit_output_bytes);
 	limit <<= factor;
 
+	if (static_branch_unlikely(&tcp_tx_delay_enabled) &&
+	    tcp_sk(sk)->tcp_tx_delay) {
+		u64 extra_bytes = (u64)sk->sk_pacing_rate * tcp_sk(sk)->tcp_tx_delay;
+
+		/* TSQ is based on skb truesize sum (sk_wmem_alloc), so we
+		 * approximate our needs assuming an ~100% skb->truesize overhead.
+		 * USEC_PER_SEC is approximated by 2^20.
+		 * do_div(extra_bytes, USEC_PER_SEC/2) is replaced by a right shift.
+		 */
+		extra_bytes >>= (20 - 1);
+		limit += extra_bytes;
+	}
 	if (refcount_read(&sk->sk_wmem_alloc) > limit) {
 		/* Always send skb if rtx queue is empty.
 		 * No need to wait for TX completion to call us back,
@@ -3212,6 +3226,7 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
 	int tcp_header_size;
 	struct tcphdr *th;
 	int mss;
+	u64 now;
 
 	skb = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC);
 	if (unlikely(!skb)) {
@@ -3243,13 +3258,14 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
 	mss = tcp_mss_clamp(tp, dst_metric_advmss(dst));
 
 	memset(&opts, 0, sizeof(opts));
+	now = tcp_clock_ns();
 #ifdef CONFIG_SYN_COOKIES
 	if (unlikely(req->cookie_ts))
 		skb->skb_mstamp_ns = cookie_init_timestamp(req);
 	else
 #endif
 	{
-		skb->skb_mstamp_ns = tcp_clock_ns();
+		skb->skb_mstamp_ns = now;
 		if (!tcp_rsk(req)->snt_synack) /* Timestamp first SYNACK */
 			tcp_rsk(req)->snt_synack = tcp_skb_timestamp_us(skb);
 	}
@@ -3292,8 +3308,9 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
 	rcu_read_unlock();
 #endif
 
-	/* Do not fool tcpdump (if any), clean our debris */
-	skb->tstamp = 0;
+	skb->skb_mstamp_ns = now;
+	tcp_add_tx_delay(skb, tp);
+
 	return skb;
 }
 EXPORT_SYMBOL(tcp_make_synack);
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index ad7039137a20..5606b2131b65 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -892,6 +892,7 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32
 		} else {
 			mark = sk->sk_mark;
 		}
+		tcp_set_tx_time(buff, sk);
 	}
 	fl6.flowi6_mark = IP6_REPLY_MARK(net, skb->mark) ?: mark;
 	fl6.fl6_dport = t1->dest;
-- 
cgit v1.2.3


From d664c43958e0d9e0b34e23b6f8a8f4cf8ec61a2e Mon Sep 17 00:00:00 2001
From: Enrico Weigelt <info@metux.net>
Date: Wed, 12 Jun 2019 23:59:36 +0200
Subject: gpio: Fix build warnings on undefined struct pinctrl_dev

This fixes the warnings:

* include/linux/gpio.h:254:11: warning: 'struct pinctrl_dev' declared
  inside parameter list will not be visible outside of this definition
  or declaration
* include/linux/gpio/driver.h:602:11: warning: 'struct pinctrl_dev'
  declared inside parameter list will not be visible outside of this
  definition or declaration

Fixes: 78b99577b393 ("pinctrl: remove unused pin_is_valid()")
Reported-by: kbuild test robot <lkp@intel.com>
Signed-off-by: Enrico Weigelt <info@metux.net>
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
---
 include/linux/gpio.h        | 1 +
 include/linux/gpio/driver.h | 2 ++
 2 files changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/gpio.h b/include/linux/gpio.h
index 39745b8bdd65..40915b461f18 100644
--- a/include/linux/gpio.h
+++ b/include/linux/gpio.h
@@ -106,6 +106,7 @@ void devm_gpio_free(struct device *dev, unsigned int gpio);
 
 struct device;
 struct gpio_chip;
+struct pinctrl_dev;
 
 static inline bool gpio_is_valid(int number)
 {
diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h
index a1d273c96016..b58b27c11355 100644
--- a/include/linux/gpio/driver.h
+++ b/include/linux/gpio/driver.h
@@ -590,6 +590,8 @@ void gpiochip_remove_pin_ranges(struct gpio_chip *chip);
 
 #else
 
+struct pinctrl_dev;
+
 static inline int
 gpiochip_add_pin_range(struct gpio_chip *chip, const char *pinctl_name,
 		       unsigned int gpio_offset, unsigned int pin_offset,
-- 
cgit v1.2.3


From 68608b5e5063dd12942f1118286c6f595d0c4a05 Mon Sep 17 00:00:00 2001
From: Peter Ujfalusi <peter.ujfalusi@ti.com>
Date: Mon, 10 Jun 2019 12:18:56 +0300
Subject: firmware: ti_sci: Add resource management APIs for ringacc, psi-l and
 udma

Configuration of NAVSS resources, like rings, UDMAP channels, flows
and PSI-L thread management, needs to be done via TISCI.

Add the needed structures and functions for NAVSS resource configuration of
the following:
Rings from Ring Accelerator
PSI-L thread management
UDMAP tchan, rchan and rflow configuration.

Signed-off-by: Peter Ujfalusi <peter.ujfalusi@ti.com>
Reviewed-by: Lokesh Vutla <lokeshvutla@ti.com>
Signed-off-by: Tero Kristo <t-kristo@ti.com>
Signed-off-by: Santosh Shilimkar <santosh.shilimkar@oracle.com>
---
 drivers/firmware/ti_sci.c              | 488 ++++++++++++++++++++++++
 drivers/firmware/ti_sci.h              | 675 +++++++++++++++++++++++++++++++++
 include/linux/soc/ti/ti_sci_protocol.h | 215 +++++++++++
 3 files changed, 1378 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/firmware/ti_sci.c b/drivers/firmware/ti_sci.c
index 36ce11a67235..02fa196428d8 100644
--- a/drivers/firmware/ti_sci.c
+++ b/drivers/firmware/ti_sci.c
@@ -2004,6 +2004,481 @@ static int ti_sci_cmd_free_event_map(const struct ti_sci_handle *handle,
 			       ia_id, vint, global_event, vint_status_bit, 0);
 }
 
+/**
+ * ti_sci_cmd_ring_config() - configure RA ring
+ * @handle:		Pointer to TI SCI handle.
+ * @valid_params:	Bitfield defining validity of ring configuration
+ *			parameters
+ * @nav_id:		Device ID of Navigator Subsystem from which the ring is
+ *			allocated
+ * @index:		Ring index
+ * @addr_lo:		The ring base address lo 32 bits
+ * @addr_hi:		The ring base address hi 32 bits
+ * @count:		Number of ring elements
+ * @mode:		The mode of the ring
+ * @size:		The ring element size.
+ * @order_id:		Specifies the ring's bus order ID
+ *
+ * Return: 0 if all went well, else returns appropriate error value.
+ *
+ * See @ti_sci_msg_rm_ring_cfg_req for more info.
+ */
+static int ti_sci_cmd_ring_config(const struct ti_sci_handle *handle,
+				  u32 valid_params, u16 nav_id, u16 index,
+				  u32 addr_lo, u32 addr_hi, u32 count,
+				  u8 mode, u8 size, u8 order_id)
+{
+	struct ti_sci_msg_rm_ring_cfg_req *req;
+	struct ti_sci_msg_hdr *resp;
+	struct ti_sci_xfer *xfer;
+	struct ti_sci_info *info;
+	struct device *dev;
+	int ret = 0;
+
+	if (IS_ERR_OR_NULL(handle))
+		return -EINVAL;
+
+	info = handle_to_ti_sci_info(handle);
+	dev = info->dev;
+
+	xfer = ti_sci_get_one_xfer(info, TI_SCI_MSG_RM_RING_CFG,
+				   TI_SCI_FLAG_REQ_ACK_ON_PROCESSED,
+				   sizeof(*req), sizeof(*resp));
+	if (IS_ERR(xfer)) {
+		ret = PTR_ERR(xfer);
+		dev_err(info->dev, "RM_RA:Message config failed(%d)\n", ret);
+		return ret;
+	}
+	req = (struct ti_sci_msg_rm_ring_cfg_req *)xfer->xfer_buf;
+	req->valid_params = valid_params;
+	req->nav_id = nav_id;
+	req->index = index;
+	req->addr_lo = addr_lo;
+	req->addr_hi = addr_hi;
+	req->count = count;
+	req->mode = mode;
+	req->size = size;
+	req->order_id = order_id;
+
+	ret = ti_sci_do_xfer(info, xfer);
+	if (ret) {
+		dev_err(info->dev, "RM_RA:Mbox config send fail %d\n", ret);
+		goto fail;
+	}
+
+	resp = (struct ti_sci_msg_hdr *)xfer->xfer_buf;
+	ret = ti_sci_is_response_ack(resp) ? 0 : -ENODEV;
+
+fail:
+	ti_sci_put_one_xfer(&info->minfo, xfer);
+	dev_dbg(info->dev, "RM_RA:config ring %u ret:%d\n", index, ret);
+	return ret;
+}
+
+/**
+ * ti_sci_cmd_ring_get_config() - get RA ring configuration
+ * @handle:	Pointer to TI SCI handle.
+ * @nav_id:	Device ID of Navigator Subsystem from which the ring is
+ *		allocated
+ * @index:	Ring index
+ * @addr_lo:	Returns ring's base address lo 32 bits
+ * @addr_hi:	Returns ring's base address hi 32 bits
+ * @count:	Returns number of ring elements
+ * @mode:	Returns mode of the ring
+ * @size:	Returns ring element size
+ * @order_id:	Returns ring's bus order ID
+ *
+ * Return: 0 if all went well, else returns appropriate error value.
+ *
+ * See @ti_sci_msg_rm_ring_get_cfg_req for more info.
+ */
+static int ti_sci_cmd_ring_get_config(const struct ti_sci_handle *handle,
+				      u32 nav_id, u32 index, u8 *mode,
+				      u32 *addr_lo, u32 *addr_hi,
+				      u32 *count, u8 *size, u8 *order_id)
+{
+	struct ti_sci_msg_rm_ring_get_cfg_resp *resp;
+	struct ti_sci_msg_rm_ring_get_cfg_req *req;
+	struct ti_sci_xfer *xfer;
+	struct ti_sci_info *info;
+	struct device *dev;
+	int ret = 0;
+
+	if (IS_ERR_OR_NULL(handle))
+		return -EINVAL;
+
+	info = handle_to_ti_sci_info(handle);
+	dev = info->dev;
+
+	xfer = ti_sci_get_one_xfer(info, TI_SCI_MSG_RM_RING_GET_CFG,
+				   TI_SCI_FLAG_REQ_ACK_ON_PROCESSED,
+				   sizeof(*req), sizeof(*resp));
+	if (IS_ERR(xfer)) {
+		ret = PTR_ERR(xfer);
+		dev_err(info->dev,
+			"RM_RA:Message get config failed(%d)\n", ret);
+		return ret;
+	}
+	req = (struct ti_sci_msg_rm_ring_get_cfg_req *)xfer->xfer_buf;
+	req->nav_id = nav_id;
+	req->index = index;
+
+	ret = ti_sci_do_xfer(info, xfer);
+	if (ret) {
+		dev_err(info->dev, "RM_RA:Mbox get config send fail %d\n", ret);
+		goto fail;
+	}
+
+	resp = (struct ti_sci_msg_rm_ring_get_cfg_resp *)xfer->xfer_buf;
+
+	if (!ti_sci_is_response_ack(resp)) {
+		ret = -ENODEV;
+	} else {
+		if (mode)
+			*mode = resp->mode;
+		if (addr_lo)
+			*addr_lo = resp->addr_lo;
+		if (addr_hi)
+			*addr_hi = resp->addr_hi;
+		if (count)
+			*count = resp->count;
+		if (size)
+			*size = resp->size;
+		if (order_id)
+			*order_id = resp->order_id;
+	};
+
+fail:
+	ti_sci_put_one_xfer(&info->minfo, xfer);
+	dev_dbg(info->dev, "RM_RA:get config ring %u ret:%d\n", index, ret);
+	return ret;
+}
+
+/**
+ * ti_sci_cmd_rm_psil_pair() - Pair PSI-L source to destination thread
+ * @handle:	Pointer to TI SCI handle.
+ * @nav_id:	Device ID of Navigator Subsystem which should be used for
+ *		pairing
+ * @src_thread:	Source PSI-L thread ID
+ * @dst_thread: Destination PSI-L thread ID
+ *
+ * Return: 0 if all went well, else returns appropriate error value.
+ */
+static int ti_sci_cmd_rm_psil_pair(const struct ti_sci_handle *handle,
+				   u32 nav_id, u32 src_thread, u32 dst_thread)
+{
+	struct ti_sci_msg_psil_pair *req;
+	struct ti_sci_msg_hdr *resp;
+	struct ti_sci_xfer *xfer;
+	struct ti_sci_info *info;
+	struct device *dev;
+	int ret = 0;
+
+	if (IS_ERR(handle))
+		return PTR_ERR(handle);
+	if (!handle)
+		return -EINVAL;
+
+	info = handle_to_ti_sci_info(handle);
+	dev = info->dev;
+
+	xfer = ti_sci_get_one_xfer(info, TI_SCI_MSG_RM_PSIL_PAIR,
+				   TI_SCI_FLAG_REQ_ACK_ON_PROCESSED,
+				   sizeof(*req), sizeof(*resp));
+	if (IS_ERR(xfer)) {
+		ret = PTR_ERR(xfer);
+		dev_err(dev, "RM_PSIL:Message reconfig failed(%d)\n", ret);
+		return ret;
+	}
+	req = (struct ti_sci_msg_psil_pair *)xfer->xfer_buf;
+	req->nav_id = nav_id;
+	req->src_thread = src_thread;
+	req->dst_thread = dst_thread;
+
+	ret = ti_sci_do_xfer(info, xfer);
+	if (ret) {
+		dev_err(dev, "RM_PSIL:Mbox send fail %d\n", ret);
+		goto fail;
+	}
+
+	resp = (struct ti_sci_msg_hdr *)xfer->xfer_buf;
+	ret = ti_sci_is_response_ack(resp) ? 0 : -EINVAL;
+
+fail:
+	ti_sci_put_one_xfer(&info->minfo, xfer);
+
+	return ret;
+}
+
+/**
+ * ti_sci_cmd_rm_psil_unpair() - Unpair PSI-L source from destination thread
+ * @handle:	Pointer to TI SCI handle.
+ * @nav_id:	Device ID of Navigator Subsystem which should be used for
+ *		unpairing
+ * @src_thread:	Source PSI-L thread ID
+ * @dst_thread:	Destination PSI-L thread ID
+ *
+ * Return: 0 if all went well, else returns appropriate error value.
+ */
+static int ti_sci_cmd_rm_psil_unpair(const struct ti_sci_handle *handle,
+				     u32 nav_id, u32 src_thread, u32 dst_thread)
+{
+	struct ti_sci_msg_psil_unpair *req;
+	struct ti_sci_msg_hdr *resp;
+	struct ti_sci_xfer *xfer;
+	struct ti_sci_info *info;
+	struct device *dev;
+	int ret = 0;
+
+	if (IS_ERR(handle))
+		return PTR_ERR(handle);
+	if (!handle)
+		return -EINVAL;
+
+	info = handle_to_ti_sci_info(handle);
+	dev = info->dev;
+
+	xfer = ti_sci_get_one_xfer(info, TI_SCI_MSG_RM_PSIL_UNPAIR,
+				   TI_SCI_FLAG_REQ_ACK_ON_PROCESSED,
+				   sizeof(*req), sizeof(*resp));
+	if (IS_ERR(xfer)) {
+		ret = PTR_ERR(xfer);
+		dev_err(dev, "RM_PSIL:Message reconfig failed(%d)\n", ret);
+		return ret;
+	}
+	req = (struct ti_sci_msg_psil_unpair *)xfer->xfer_buf;
+	req->nav_id = nav_id;
+	req->src_thread = src_thread;
+	req->dst_thread = dst_thread;
+
+	ret = ti_sci_do_xfer(info, xfer);
+	if (ret) {
+		dev_err(dev, "RM_PSIL:Mbox send fail %d\n", ret);
+		goto fail;
+	}
+
+	resp = (struct ti_sci_msg_hdr *)xfer->xfer_buf;
+	ret = ti_sci_is_response_ack(resp) ? 0 : -EINVAL;
+
+fail:
+	ti_sci_put_one_xfer(&info->minfo, xfer);
+
+	return ret;
+}
+
+/**
+ * ti_sci_cmd_rm_udmap_tx_ch_cfg() - Configure a UDMAP TX channel
+ * @handle:	Pointer to TI SCI handle.
+ * @params:	Pointer to ti_sci_msg_rm_udmap_tx_ch_cfg TX channel config
+ *		structure
+ *
+ * Return: 0 if the response is an ACK, -EINVAL for a NULL/error @handle or a
+ * non-ACK response, else the error from message allocation or transfer.
+ *
+ * See @ti_sci_msg_rm_udmap_tx_ch_cfg_req for the meaning of each field.
+ */
+static int ti_sci_cmd_rm_udmap_tx_ch_cfg(const struct ti_sci_handle *handle,
+			const struct ti_sci_msg_rm_udmap_tx_ch_cfg *params)
+{
+	struct ti_sci_msg_rm_udmap_tx_ch_cfg_req *req;
+	struct ti_sci_msg_hdr *resp;
+	struct ti_sci_xfer *xfer;
+	struct ti_sci_info *info;
+	struct device *dev;
+	int ret = 0;
+
+	if (IS_ERR_OR_NULL(handle))
+		return -EINVAL;
+
+	info = handle_to_ti_sci_info(handle);
+	dev = info->dev;
+
+	xfer = ti_sci_get_one_xfer(info, TISCI_MSG_RM_UDMAP_TX_CH_CFG,
+				   TI_SCI_FLAG_REQ_ACK_ON_PROCESSED,
+				   sizeof(*req), sizeof(*resp));
+	if (IS_ERR(xfer)) {
+		ret = PTR_ERR(xfer);
+		dev_err(dev, "Message TX_CH_CFG alloc failed(%d)\n", ret);
+		return ret;
+	}
+	req = (struct ti_sci_msg_rm_udmap_tx_ch_cfg_req *)xfer->xfer_buf;
+	req->valid_params = params->valid_params;
+	req->nav_id = params->nav_id;
+	req->index = params->index;
+	req->tx_pause_on_err = params->tx_pause_on_err;
+	req->tx_filt_einfo = params->tx_filt_einfo;
+	req->tx_filt_pswords = params->tx_filt_pswords;
+	req->tx_atype = params->tx_atype;
+	req->tx_chan_type = params->tx_chan_type;
+	req->tx_supr_tdpkt = params->tx_supr_tdpkt;
+	req->tx_fetch_size = params->tx_fetch_size;
+	req->tx_credit_count = params->tx_credit_count;
+	req->txcq_qnum = params->txcq_qnum;
+	req->tx_priority = params->tx_priority;
+	req->tx_qos = params->tx_qos;
+	req->tx_orderid = params->tx_orderid;
+	req->fdepth = params->fdepth;
+	req->tx_sched_priority = params->tx_sched_priority;
+	req->tx_burst_size = params->tx_burst_size;
+
+	ret = ti_sci_do_xfer(info, xfer);
+	if (ret) {
+		dev_err(dev, "Mbox send TX_CH_CFG fail %d\n", ret);
+		goto fail;
+	}
+
+	resp = (struct ti_sci_msg_hdr *)xfer->xfer_buf;
+	ret = ti_sci_is_response_ack(resp) ? 0 : -EINVAL;
+
+fail:
+	ti_sci_put_one_xfer(&info->minfo, xfer);
+	dev_dbg(dev, "TX_CH_CFG: chn %u ret:%d\n", params->index, ret);
+	return ret;
+}
+
+/**
+ * ti_sci_cmd_rm_udmap_rx_ch_cfg() - Configure a UDMAP RX channel
+ * @handle:	Pointer to TI SCI handle.
+ * @params:	Pointer to ti_sci_msg_rm_udmap_rx_ch_cfg RX channel config
+ *		structure
+ *
+ * Return: 0 if the response is an ACK, -EINVAL for a NULL/error @handle or a
+ * non-ACK response, else the error from message allocation or transfer.
+ *
+ * See @ti_sci_msg_rm_udmap_rx_ch_cfg_req for the meaning of each field.
+ */
+static int ti_sci_cmd_rm_udmap_rx_ch_cfg(const struct ti_sci_handle *handle,
+			const struct ti_sci_msg_rm_udmap_rx_ch_cfg *params)
+{
+	struct ti_sci_msg_rm_udmap_rx_ch_cfg_req *req;
+	struct ti_sci_msg_hdr *resp;
+	struct ti_sci_xfer *xfer;
+	struct ti_sci_info *info;
+	struct device *dev;
+	int ret = 0;
+
+	if (IS_ERR_OR_NULL(handle))
+		return -EINVAL;
+
+	info = handle_to_ti_sci_info(handle);
+	dev = info->dev;
+
+	xfer = ti_sci_get_one_xfer(info, TISCI_MSG_RM_UDMAP_RX_CH_CFG,
+				   TI_SCI_FLAG_REQ_ACK_ON_PROCESSED,
+				   sizeof(*req), sizeof(*resp));
+	if (IS_ERR(xfer)) {
+		ret = PTR_ERR(xfer);
+		dev_err(dev, "Message RX_CH_CFG alloc failed(%d)\n", ret);
+		return ret;
+	}
+	req = (struct ti_sci_msg_rm_udmap_rx_ch_cfg_req *)xfer->xfer_buf;
+	req->valid_params = params->valid_params;
+	req->nav_id = params->nav_id;
+	req->index = params->index;
+	req->rx_fetch_size = params->rx_fetch_size;
+	req->rxcq_qnum = params->rxcq_qnum;
+	req->rx_priority = params->rx_priority;
+	req->rx_qos = params->rx_qos;
+	req->rx_orderid = params->rx_orderid;
+	req->rx_sched_priority = params->rx_sched_priority;
+	req->flowid_start = params->flowid_start;
+	req->flowid_cnt = params->flowid_cnt;
+	req->rx_pause_on_err = params->rx_pause_on_err;
+	req->rx_atype = params->rx_atype;
+	req->rx_chan_type = params->rx_chan_type;
+	req->rx_ignore_short = params->rx_ignore_short;
+	req->rx_ignore_long = params->rx_ignore_long;
+	req->rx_burst_size = params->rx_burst_size;
+
+	ret = ti_sci_do_xfer(info, xfer);
+	if (ret) {
+		dev_err(dev, "Mbox send RX_CH_CFG fail %d\n", ret);
+		goto fail;
+	}
+
+	resp = (struct ti_sci_msg_hdr *)xfer->xfer_buf;
+	ret = ti_sci_is_response_ack(resp) ? 0 : -EINVAL;
+
+fail:
+	ti_sci_put_one_xfer(&info->minfo, xfer);
+	dev_dbg(dev, "RX_CH_CFG: chn %u ret:%d\n", params->index, ret);
+	return ret;
+}
+
+/**
+ * ti_sci_cmd_rm_udmap_rx_flow_cfg() - Configure UDMAP RX FLOW
+ * @handle:	Pointer to TI SCI handle.
+ * @params:	Pointer to ti_sci_msg_rm_udmap_flow_cfg RX FLOW config
+ *		structure
+ *
+ * Return: 0 if the response is an ACK, -EINVAL for a NULL/error @handle or a
+ * non-ACK response, else the error from message allocation or transfer.
+ *
+ * See @ti_sci_msg_rm_udmap_flow_cfg_req for the meaning of each field.
+ */
+static int ti_sci_cmd_rm_udmap_rx_flow_cfg(const struct ti_sci_handle *handle,
+			const struct ti_sci_msg_rm_udmap_flow_cfg *params)
+{
+	struct ti_sci_msg_rm_udmap_flow_cfg_req *req;
+	struct ti_sci_msg_hdr *resp;
+	struct ti_sci_xfer *xfer;
+	struct ti_sci_info *info;
+	struct device *dev;
+	int ret = 0;
+
+	if (IS_ERR_OR_NULL(handle))
+		return -EINVAL;
+
+	info = handle_to_ti_sci_info(handle);
+	dev = info->dev;
+
+	xfer = ti_sci_get_one_xfer(info, TISCI_MSG_RM_UDMAP_FLOW_CFG,
+				   TI_SCI_FLAG_REQ_ACK_ON_PROCESSED,
+				   sizeof(*req), sizeof(*resp));
+	if (IS_ERR(xfer)) {
+		ret = PTR_ERR(xfer);
+		dev_err(dev, "RX_FL_CFG: Message alloc failed(%d)\n", ret);
+		return ret;
+	}
+	req = (struct ti_sci_msg_rm_udmap_flow_cfg_req *)xfer->xfer_buf;
+	req->valid_params = params->valid_params;
+	req->nav_id = params->nav_id;
+	req->flow_index = params->flow_index;
+	req->rx_einfo_present = params->rx_einfo_present;
+	req->rx_psinfo_present = params->rx_psinfo_present;
+	req->rx_error_handling = params->rx_error_handling;
+	req->rx_desc_type = params->rx_desc_type;
+	req->rx_sop_offset = params->rx_sop_offset;
+	req->rx_dest_qnum = params->rx_dest_qnum;
+	req->rx_src_tag_hi = params->rx_src_tag_hi;
+	req->rx_src_tag_lo = params->rx_src_tag_lo;
+	req->rx_dest_tag_hi = params->rx_dest_tag_hi;
+	req->rx_dest_tag_lo = params->rx_dest_tag_lo;
+	req->rx_src_tag_hi_sel = params->rx_src_tag_hi_sel;
+	req->rx_src_tag_lo_sel = params->rx_src_tag_lo_sel;
+	req->rx_dest_tag_hi_sel = params->rx_dest_tag_hi_sel;
+	req->rx_dest_tag_lo_sel = params->rx_dest_tag_lo_sel;
+	req->rx_fdq0_sz0_qnum = params->rx_fdq0_sz0_qnum;
+	req->rx_fdq1_qnum = params->rx_fdq1_qnum;
+	req->rx_fdq2_qnum = params->rx_fdq2_qnum;
+	req->rx_fdq3_qnum = params->rx_fdq3_qnum;
+	req->rx_ps_location = params->rx_ps_location;
+
+	ret = ti_sci_do_xfer(info, xfer);
+	if (ret) {
+		dev_err(dev, "RX_FL_CFG: Mbox send fail %d\n", ret);
+		goto fail;
+	}
+
+	resp = (struct ti_sci_msg_hdr *)xfer->xfer_buf;
+	ret = ti_sci_is_response_ack(resp) ? 0 : -EINVAL;
+
+fail:
+	ti_sci_put_one_xfer(&info->minfo, xfer);
+	dev_dbg(dev, "RX_FL_CFG: %u ret:%d\n", params->flow_index, ret);
+	return ret;
+}
+
 /*
  * ti_sci_setup_ops() - Setup the operations structures
  * @info:	pointer to TISCI pointer
@@ -2016,6 +2491,9 @@ static void ti_sci_setup_ops(struct ti_sci_info *info)
 	struct ti_sci_clk_ops *cops = &ops->clk_ops;
 	struct ti_sci_rm_core_ops *rm_core_ops = &ops->rm_core_ops;
 	struct ti_sci_rm_irq_ops *iops = &ops->rm_irq_ops;
+	struct ti_sci_rm_ringacc_ops *rops = &ops->rm_ring_ops;
+	struct ti_sci_rm_psil_ops *psilops = &ops->rm_psil_ops;
+	struct ti_sci_rm_udmap_ops *udmap_ops = &ops->rm_udmap_ops;
 
 	core_ops->reboot_device = ti_sci_cmd_core_reboot;
 
@@ -2055,6 +2533,16 @@ static void ti_sci_setup_ops(struct ti_sci_info *info)
 	iops->set_event_map = ti_sci_cmd_set_event_map;
 	iops->free_irq = ti_sci_cmd_free_irq;
 	iops->free_event_map = ti_sci_cmd_free_event_map;
+
+	rops->config = ti_sci_cmd_ring_config;
+	rops->get_config = ti_sci_cmd_ring_get_config;
+
+	psilops->pair = ti_sci_cmd_rm_psil_pair;
+	psilops->unpair = ti_sci_cmd_rm_psil_unpair;
+
+	udmap_ops->tx_ch_cfg = ti_sci_cmd_rm_udmap_tx_ch_cfg;
+	udmap_ops->rx_ch_cfg = ti_sci_cmd_rm_udmap_rx_ch_cfg;
+	udmap_ops->rx_flow_cfg = ti_sci_cmd_rm_udmap_rx_flow_cfg;
 }
 
 /**
diff --git a/drivers/firmware/ti_sci.h b/drivers/firmware/ti_sci.h
index 4983827151bf..2bb81ec7793c 100644
--- a/drivers/firmware/ti_sci.h
+++ b/drivers/firmware/ti_sci.h
@@ -42,6 +42,35 @@
 #define TI_SCI_MSG_SET_IRQ		0x1000
 #define TI_SCI_MSG_FREE_IRQ		0x1001
 
+/* NAVSS resource management */
+/* Ringacc requests */
+#define TI_SCI_MSG_RM_RING_ALLOCATE		0x1100
+#define TI_SCI_MSG_RM_RING_FREE			0x1101
+#define TI_SCI_MSG_RM_RING_RECONFIG		0x1102
+#define TI_SCI_MSG_RM_RING_RESET		0x1103
+#define TI_SCI_MSG_RM_RING_CFG			0x1110
+#define TI_SCI_MSG_RM_RING_GET_CFG		0x1111
+
+/* PSI-L requests */
+#define TI_SCI_MSG_RM_PSIL_PAIR			0x1280
+#define TI_SCI_MSG_RM_PSIL_UNPAIR		0x1281
+
+#define TI_SCI_MSG_RM_UDMAP_TX_ALLOC		0x1200
+#define TI_SCI_MSG_RM_UDMAP_TX_FREE		0x1201
+#define TI_SCI_MSG_RM_UDMAP_RX_ALLOC		0x1210
+#define TI_SCI_MSG_RM_UDMAP_RX_FREE		0x1211
+#define TI_SCI_MSG_RM_UDMAP_FLOW_CFG		0x1220
+#define TI_SCI_MSG_RM_UDMAP_OPT_FLOW_CFG	0x1221
+
+#define TISCI_MSG_RM_UDMAP_TX_CH_CFG		0x1205
+#define TISCI_MSG_RM_UDMAP_TX_CH_GET_CFG	0x1206
+#define TISCI_MSG_RM_UDMAP_RX_CH_CFG		0x1215
+#define TISCI_MSG_RM_UDMAP_RX_CH_GET_CFG	0x1216
+#define TISCI_MSG_RM_UDMAP_FLOW_CFG		0x1230
+#define TISCI_MSG_RM_UDMAP_FLOW_SIZE_THRESH_CFG	0x1231
+#define TISCI_MSG_RM_UDMAP_FLOW_GET_CFG		0x1232
+#define TISCI_MSG_RM_UDMAP_FLOW_SIZE_THRESH_GET_CFG	0x1233
+
 /**
  * struct ti_sci_msg_hdr - Generic Message Header for All messages and responses
  * @type:	Type of messages: One of TI_SCI_MSG* values
@@ -563,4 +592,650 @@ struct ti_sci_msg_req_manage_irq {
 	u8 secondary_host;
 } __packed;
 
+/**
+ * struct ti_sci_msg_rm_ring_cfg_req - Configure a Navigator Subsystem ring
+ *
+ * Configures the non-real-time registers of a Navigator Subsystem ring.
+ * @hdr:	Generic Header
+ * @valid_params: Bitfield defining validity of ring configuration parameters.
+ *	The ring configuration fields are not valid, and will not be used for
+ *	ring configuration, if their corresponding valid bit is zero.
+ *	Valid bit usage:
+ *	0 - Valid bit for @ti_sci_msg_rm_ring_cfg_req addr_lo
+ *	1 - Valid bit for @ti_sci_msg_rm_ring_cfg_req addr_hi
+ *	2 - Valid bit for @ti_sci_msg_rm_ring_cfg_req count
+ *	3 - Valid bit for @ti_sci_msg_rm_ring_cfg_req mode
+ *	4 - Valid bit for @ti_sci_msg_rm_ring_cfg_req size
+ *	5 - Valid bit for @ti_sci_msg_rm_ring_cfg_req order_id
+ * @nav_id: Device ID of Navigator Subsystem from which the ring is allocated
+ * @index: ring index to be configured.
+ * @addr_lo: 32 LSBs of ring base address to be programmed into the ring's
+ *	RING_BA_LO register
+ * @addr_hi: 16 MSBs of ring base address to be programmed into the ring's
+ *	RING_BA_HI register.
+ * @count: Number of ring elements. Must be even if mode is CREDENTIALS or QM
+ *	modes.
+ * @mode: Specifies the mode the ring is to be configured.
+ * @size: Specifies encoded ring element size. To calculate the encoded size use
+ *	the formula (log2(size_bytes) - 2), where size_bytes cannot be
+ *	greater than 256.
+ * @order_id: Specifies the ring's bus order ID.
+ */
+struct ti_sci_msg_rm_ring_cfg_req {
+	struct ti_sci_msg_hdr hdr;
+	u32 valid_params;
+	u16 nav_id;
+	u16 index;
+	u32 addr_lo;
+	u32 addr_hi;
+	u32 count;
+	u8 mode;
+	u8 size;
+	u8 order_id;
+} __packed;
+
+/**
+ * struct ti_sci_msg_rm_ring_get_cfg_req - Get RA ring's configuration
+ *
+ * Gets the configuration of the non-real-time register fields of a ring.  The
+ * host, or a supervisor of the host, who owns the ring must be the requesting
+ * host.  The values of the non-real-time registers are returned in
+ * @ti_sci_msg_rm_ring_get_cfg_resp.
+ *
+ * @hdr: Generic Header
+ * @nav_id: Device ID of Navigator Subsystem from which the ring is allocated
+ * @index: Index of the ring whose configuration is requested.
+ */
+struct ti_sci_msg_rm_ring_get_cfg_req {
+	struct ti_sci_msg_hdr hdr;
+	u16 nav_id;
+	u16 index;
+} __packed;
+
+/**
+ * struct ti_sci_msg_rm_ring_get_cfg_resp - Ring get configuration response
+ *
+ * Response received by host processor after RM has handled
+ * @ti_sci_msg_rm_ring_get_cfg_req. The response contains the ring's
+ * non-real-time register values.
+ *
+ * @hdr: Generic Header
+ * @addr_lo: Ring 32 LSBs of base address
+ * @addr_hi: Ring 16 MSBs of base address.
+ * @count: Ring number of elements.
+ * @mode: Ring mode.
+ * @size: encoded Ring element size
+ * @order_id: Ring order ID.
+ */
+struct ti_sci_msg_rm_ring_get_cfg_resp {
+	struct ti_sci_msg_hdr hdr;
+	u32 addr_lo;
+	u32 addr_hi;
+	u32 count;
+	u8 mode;
+	u8 size;
+	u8 order_id;
+} __packed;
+
+/**
+ * struct ti_sci_msg_psil_pair - Pairs a PSI-L source thread to a destination
+ *				 thread
+ * @hdr:	Generic Header
+ * @nav_id:	SoC Navigator Subsystem device ID whose PSI-L config proxy is
+ *		used to pair the source and destination threads.
+ * @src_thread:	PSI-L source thread ID within the PSI-L System thread map.
+ *
+ * UDMAP transmit channels mapped to source threads will have their
+ * TCHAN_THRD_ID register programmed with the destination thread if the pairing
+ * is successful.
+ *
+ * @dst_thread: PSI-L destination thread ID within the PSI-L System thread map.
+ * PSI-L destination threads start at index 0x8000.  The request is NACK'd if
+ * the destination thread is less than 0x8000.
+ *
+ * UDMAP receive channels mapped to destination threads will have their
+ * RCHAN_THRD_ID register programmed with the source thread if the pairing
+ * is successful.
+ *
+ * Request type is TI_SCI_MSG_RM_PSIL_PAIR, response is a generic ACK or NACK
+ * message.
+ */
+struct ti_sci_msg_psil_pair {
+	struct ti_sci_msg_hdr hdr;
+	u32 nav_id;
+	u32 src_thread;
+	u32 dst_thread;
+} __packed;
+
+/**
+ * struct ti_sci_msg_psil_unpair - Unpairs a PSI-L source thread from a
+ *				   destination thread
+ * @hdr:	Generic Header
+ * @nav_id:	SoC Navigator Subsystem device ID whose PSI-L config proxy is
+ *		used to unpair the source and destination threads.
+ * @src_thread:	PSI-L source thread ID within the PSI-L System thread map.
+ *
+ * UDMAP transmit channels mapped to source threads will have their
+ * TCHAN_THRD_ID register cleared if the unpairing is successful.
+ *
+ * @dst_thread: PSI-L destination thread ID within the PSI-L System thread map.
+ * PSI-L destination threads start at index 0x8000.  The request is NACK'd if
+ * the destination thread is less than 0x8000.
+ *
+ * UDMAP receive channels mapped to destination threads will have their
+ * RCHAN_THRD_ID register cleared if the unpairing is successful.
+ *
+ * Request type is TI_SCI_MSG_RM_PSIL_UNPAIR, response is a generic ACK or NACK
+ * message.
+ */
+struct ti_sci_msg_psil_unpair {
+	struct ti_sci_msg_hdr hdr;
+	u32 nav_id;
+	u32 src_thread;
+	u32 dst_thread;
+} __packed;
+
+/**
+ * struct ti_sci_msg_udmap_rx_flow_cfg - UDMAP receive flow configuration
+ *					 message
+ * @hdr: Generic Header
+ * @nav_id: SoC Navigator Subsystem device ID from which the receive flow is
+ *	allocated
+ * @flow_index: UDMAP receive flow index for non-optional configuration.
+ * @rx_ch_index: Specifies the index of the receive channel using the flow_index
+ * @rx_einfo_present: UDMAP receive flow extended packet info present.
+ * @rx_psinfo_present: UDMAP receive flow PS words present.
+ * @rx_error_handling: UDMAP receive flow error handling configuration. Valid
+ *	values are TI_SCI_RM_UDMAP_RX_FLOW_ERR_DROP/RETRY.
+ * @rx_desc_type: UDMAP receive flow descriptor type. It can be one of
+ *	TI_SCI_RM_UDMAP_RX_FLOW_DESC_HOST/MONO.
+ * @rx_sop_offset: UDMAP receive flow start of packet offset.
+ * @rx_dest_qnum: UDMAP receive flow destination queue number.
+ * @rx_ps_location: UDMAP receive flow PS words location.
+ *	0 - end of packet descriptor
+ *	1 - Beginning of the data buffer
+ * @rx_src_tag_hi: UDMAP receive flow source tag high byte constant
+ * @rx_src_tag_lo: UDMAP receive flow source tag low byte constant
+ * @rx_dest_tag_hi: UDMAP receive flow destination tag high byte constant
+ * @rx_dest_tag_lo: UDMAP receive flow destination tag low byte constant
+ * @rx_src_tag_hi_sel: UDMAP receive flow source tag high byte selector
+ * @rx_src_tag_lo_sel: UDMAP receive flow source tag low byte selector
+ * @rx_dest_tag_hi_sel: UDMAP receive flow destination tag high byte selector
+ * @rx_dest_tag_lo_sel: UDMAP receive flow destination tag low byte selector
+ * @rx_size_thresh_en: UDMAP receive flow packet size based free buffer queue
+ *	enable. If enabled, the ti_sci_rm_udmap_rx_flow_opt_cfg also needs to be
+ *	configured and sent.
+ * @rx_fdq0_sz0_qnum: UDMAP receive flow free descriptor queue 0.
+ * @rx_fdq1_qnum: UDMAP receive flow free descriptor queue 1.
+ * @rx_fdq2_qnum: UDMAP receive flow free descriptor queue 2.
+ * @rx_fdq3_qnum: UDMAP receive flow free descriptor queue 3.
+ *
+ * For detailed information on the settings, see the UDMAP section of the TRM.
+ */
+struct ti_sci_msg_udmap_rx_flow_cfg {
+	struct ti_sci_msg_hdr hdr;
+	u32 nav_id;
+	u32 flow_index;
+	u32 rx_ch_index;
+	u8 rx_einfo_present;
+	u8 rx_psinfo_present;
+	u8 rx_error_handling;
+	u8 rx_desc_type;
+	u16 rx_sop_offset;
+	u16 rx_dest_qnum;
+	u8 rx_ps_location;
+	u8 rx_src_tag_hi;
+	u8 rx_src_tag_lo;
+	u8 rx_dest_tag_hi;
+	u8 rx_dest_tag_lo;
+	u8 rx_src_tag_hi_sel;
+	u8 rx_src_tag_lo_sel;
+	u8 rx_dest_tag_hi_sel;
+	u8 rx_dest_tag_lo_sel;
+	u8 rx_size_thresh_en;
+	u16 rx_fdq0_sz0_qnum;
+	u16 rx_fdq1_qnum;
+	u16 rx_fdq2_qnum;
+	u16 rx_fdq3_qnum;
+} __packed;
+
+/**
+ * struct rm_ti_sci_msg_udmap_rx_flow_opt_cfg - Parameters for UDMAP receive
+ *						flow optional configuration
+ * @hdr: Generic Header
+ * @nav_id: SoC Navigator Subsystem device ID from which the receive flow is
+ *	allocated
+ * @flow_index: UDMAP receive flow index for optional configuration.
+ * @rx_ch_index: Specifies the index of the receive channel using the flow_index
+ * @rx_size_thresh0: UDMAP receive flow packet size threshold 0.
+ * @rx_size_thresh1: UDMAP receive flow packet size threshold 1.
+ * @rx_size_thresh2: UDMAP receive flow packet size threshold 2.
+ * @rx_fdq0_sz1_qnum: UDMAP receive flow free descriptor queue for size
+ *	threshold 1.
+ * @rx_fdq0_sz2_qnum: UDMAP receive flow free descriptor queue for size
+ *	threshold 2.
+ * @rx_fdq0_sz3_qnum: UDMAP receive flow free descriptor queue for size
+ *	threshold 3.
+ *
+ * For detailed information on the settings, see the UDMAP section of the TRM.
+ */
+struct rm_ti_sci_msg_udmap_rx_flow_opt_cfg {
+	struct ti_sci_msg_hdr hdr;
+	u32 nav_id;
+	u32 flow_index;
+	u32 rx_ch_index;
+	u16 rx_size_thresh0;
+	u16 rx_size_thresh1;
+	u16 rx_size_thresh2;
+	u16 rx_fdq0_sz1_qnum;
+	u16 rx_fdq0_sz2_qnum;
+	u16 rx_fdq0_sz3_qnum;
+} __packed;
+
+/**
+ * struct ti_sci_msg_rm_udmap_tx_ch_cfg_req - Configure a UDMAP TX channel
+ *
+ * Configures the non-real-time registers of a Navigator Subsystem UDMAP
+ * transmit channel.  The channel index must be assigned to the host defined
+ * in the TISCI header via the RM board configuration resource assignment
+ * range list.
+ *
+ * @hdr: Generic Header
+ *
+ * @valid_params: Bitfield defining validity of tx channel configuration
+ * parameters. The tx channel configuration fields are not valid, and will not
+ * be used for ch configuration, if their corresponding valid bit is zero.
+ * Valid bit usage:
+ *    0 - Valid bit for @ref ti_sci_msg_rm_udmap_tx_ch_cfg::tx_pause_on_err
+ *    1 - Valid bit for @ref ti_sci_msg_rm_udmap_tx_ch_cfg::tx_atype
+ *    2 - Valid bit for @ref ti_sci_msg_rm_udmap_tx_ch_cfg::tx_chan_type
+ *    3 - Valid bit for @ref ti_sci_msg_rm_udmap_tx_ch_cfg::tx_fetch_size
+ *    4 - Valid bit for @ref ti_sci_msg_rm_udmap_tx_ch_cfg::txcq_qnum
+ *    5 - Valid bit for @ref ti_sci_msg_rm_udmap_tx_ch_cfg::tx_priority
+ *    6 - Valid bit for @ref ti_sci_msg_rm_udmap_tx_ch_cfg::tx_qos
+ *    7 - Valid bit for @ref ti_sci_msg_rm_udmap_tx_ch_cfg::tx_orderid
+ *    8 - Valid bit for @ref ti_sci_msg_rm_udmap_tx_ch_cfg::tx_sched_priority
+ *    9 - Valid bit for @ref ti_sci_msg_rm_udmap_tx_ch_cfg::tx_filt_einfo
+ *   10 - Valid bit for @ref ti_sci_msg_rm_udmap_tx_ch_cfg::tx_filt_pswords
+ *   11 - Valid bit for @ref ti_sci_msg_rm_udmap_tx_ch_cfg::tx_supr_tdpkt
+ *   12 - Valid bit for @ref ti_sci_msg_rm_udmap_tx_ch_cfg::tx_credit_count
+ *   13 - Valid bit for @ref ti_sci_msg_rm_udmap_tx_ch_cfg::fdepth
+ *   14 - Valid bit for @ref ti_sci_msg_rm_udmap_tx_ch_cfg::tx_burst_size
+ *
+ * @nav_id: SoC device ID of Navigator Subsystem where tx channel is located
+ *
+ * @index: UDMAP transmit channel index.
+ *
+ * @tx_pause_on_err: UDMAP transmit channel pause on error configuration to
+ * be programmed into the tx_pause_on_err field of the channel's TCHAN_TCFG
+ * register.
+ *
+ * @tx_filt_einfo: UDMAP transmit channel extended packet information passing
+ * configuration to be programmed into the tx_filt_einfo field of the
+ * channel's TCHAN_TCFG register.
+ *
+ * @tx_filt_pswords: UDMAP transmit channel protocol specific word passing
+ * configuration to be programmed into the tx_filt_pswords field of the
+ * channel's TCHAN_TCFG register.
+ *
+ * @tx_atype: UDMAP transmit channel non Ring Accelerator access pointer
+ * interpretation configuration to be programmed into the tx_atype field of
+ * the channel's TCHAN_TCFG register.
+ *
+ * @tx_chan_type: UDMAP transmit channel functional channel type and work
+ * passing mechanism configuration to be programmed into the tx_chan_type
+ * field of the channel's TCHAN_TCFG register.
+ *
+ * @tx_supr_tdpkt: UDMAP transmit channel teardown packet generation suppression
+ * configuration to be programmed into the tx_supr_tdpkt field of the channel's
+ * TCHAN_TCFG register.
+ *
+ * @tx_fetch_size: UDMAP transmit channel number of 32-bit descriptor words to
+ * fetch configuration to be programmed into the tx_fetch_size field of the
+ * channel's TCHAN_TCFG register.  The user must make sure to set the maximum
+ * word count that can pass through the channel for any allowed descriptor type.
+ *
+ * @tx_credit_count: UDMAP transmit channel transfer request credit count
+ * configuration to be programmed into the count field of the TCHAN_TCREDIT
+ * register.  Specifies how many credits for complete TRs are available.
+ *
+ * @txcq_qnum: UDMAP transmit channel completion queue configuration to be
+ * programmed into the txcq_qnum field of the TCHAN_TCQ register. The specified
+ * completion queue must be assigned to the host, or a subordinate of the host,
+ * requesting configuration of the transmit channel.
+ *
+ * @tx_priority: UDMAP transmit channel transmit priority value to be programmed
+ * into the priority field of the channel's TCHAN_TPRI_CTRL register.
+ *
+ * @tx_qos: UDMAP transmit channel transmit qos value to be programmed into the
+ * qos field of the channel's TCHAN_TPRI_CTRL register.
+ *
+ * @tx_orderid: UDMAP transmit channel bus order id value to be programmed into
+ * the orderid field of the channel's TCHAN_TPRI_CTRL register.
+ *
+ * @fdepth: UDMAP transmit channel FIFO depth configuration to be programmed
+ * into the fdepth field of the TCHAN_TFIFO_DEPTH register. Sets the number of
+ * Tx FIFO bytes which are allowed to be stored for the channel. Check the UDMAP
+ * section of the TRM for restrictions regarding this parameter.
+ *
+ * @tx_sched_priority: UDMAP transmit channel tx scheduling priority
+ * configuration to be programmed into the priority field of the channel's
+ * TCHAN_TST_SCHED register.
+ *
+ * @tx_burst_size: UDMAP transmit channel burst size configuration to be
+ * programmed into the tx_burst_size field of the TCHAN_TCFG register.
+ */
+struct ti_sci_msg_rm_udmap_tx_ch_cfg_req {
+	struct ti_sci_msg_hdr hdr;
+	u32 valid_params;
+	u16 nav_id;
+	u16 index;
+	u8 tx_pause_on_err;
+	u8 tx_filt_einfo;
+	u8 tx_filt_pswords;
+	u8 tx_atype;
+	u8 tx_chan_type;
+	u8 tx_supr_tdpkt;
+	u16 tx_fetch_size;
+	u8 tx_credit_count;
+	u16 txcq_qnum;
+	u8 tx_priority;
+	u8 tx_qos;
+	u8 tx_orderid;
+	u16 fdepth;
+	u8 tx_sched_priority;
+	u8 tx_burst_size;
+} __packed;
+
+/**
+ * struct ti_sci_msg_rm_udmap_rx_ch_cfg_req - Configure a UDMAP RX channel
+ *
+ * Configures the non-real-time registers of a Navigator Subsystem UDMAP
+ * receive channel.  The channel index must be assigned to the host defined
+ * in the TISCI header via the RM board configuration resource assignment
+ * range list.
+ *
+ * @hdr: Generic Header
+ *
+ * @valid_params: Bitfield defining validity of rx channel configuration
+ * parameters.
+ * The rx channel configuration fields are not valid, and will not be used for
+ * ch configuration, if their corresponding valid bit is zero.
+ * Valid bit usage:
+ *    0 - Valid bit for @ti_sci_msg_rm_udmap_rx_ch_cfg_req::rx_pause_on_err
+ *    1 - Valid bit for @ti_sci_msg_rm_udmap_rx_ch_cfg_req::rx_atype
+ *    2 - Valid bit for @ti_sci_msg_rm_udmap_rx_ch_cfg_req::rx_chan_type
+ *    3 - Valid bit for @ti_sci_msg_rm_udmap_rx_ch_cfg_req::rx_fetch_size
+ *    4 - Valid bit for @ti_sci_msg_rm_udmap_rx_ch_cfg_req::rxcq_qnum
+ *    5 - Valid bit for @ti_sci_msg_rm_udmap_rx_ch_cfg_req::rx_priority
+ *    6 - Valid bit for @ti_sci_msg_rm_udmap_rx_ch_cfg_req::rx_qos
+ *    7 - Valid bit for @ti_sci_msg_rm_udmap_rx_ch_cfg_req::rx_orderid
+ *    8 - Valid bit for @ti_sci_msg_rm_udmap_rx_ch_cfg_req::rx_sched_priority
+ *    9 - Valid bit for @ti_sci_msg_rm_udmap_rx_ch_cfg_req::flowid_start
+ *   10 - Valid bit for @ti_sci_msg_rm_udmap_rx_ch_cfg_req::flowid_cnt
+ *   11 - Valid bit for @ti_sci_msg_rm_udmap_rx_ch_cfg_req::rx_ignore_short
+ *   12 - Valid bit for @ti_sci_msg_rm_udmap_rx_ch_cfg_req::rx_ignore_long
+ *   14 - Valid bit for @ti_sci_msg_rm_udmap_rx_ch_cfg_req::rx_burst_size
+ *
+ * @nav_id: SoC device ID of Navigator Subsystem where rx channel is located
+ *
+ * @index: UDMAP receive channel index.
+ *
+ * @rx_fetch_size: UDMAP receive channel number of 32-bit descriptor words to
+ * fetch configuration to be programmed into the rx_fetch_size field of the
+ * channel's RCHAN_RCFG register.
+ *
+ * @rxcq_qnum: UDMAP receive channel completion queue configuration to be
+ * programmed into the rxcq_qnum field of the RCHAN_RCQ register.
+ * The specified completion queue must be assigned to the host, or a subordinate
+ * of the host, requesting configuration of the receive channel.
+ *
+ * @rx_priority: UDMAP receive channel receive priority value to be programmed
+ * into the priority field of the channel's RCHAN_RPRI_CTRL register.
+ *
+ * @rx_qos: UDMAP receive channel receive qos value to be programmed into the
+ * qos field of the channel's RCHAN_RPRI_CTRL register.
+ *
+ * @rx_orderid: UDMAP receive channel bus order id value to be programmed into
+ * the orderid field of the channel's RCHAN_RPRI_CTRL register.
+ *
+ * @rx_sched_priority: UDMAP receive channel rx scheduling priority
+ * configuration to be programmed into the priority field of the channel's
+ * RCHAN_RST_SCHED register.
+ *
+ * @flowid_start: UDMAP receive channel additional flows starting index
+ * configuration to program into the flow_start field of the RCHAN_RFLOW_RNG
+ * register. Specifies the starting index for flow IDs the receive channel is to
+ * make use of beyond the default flow. flowid_start and @ref flowid_cnt must be
+ * set as valid and configured together. The starting flow ID set by
+ * flowid_start must be a flow index within the Navigator Subsystem's subset
+ * of flows beyond the default flows statically mapped to receive channels.
+ * The additional flows must be assigned to the host, or a subordinate of the
+ * host, requesting configuration of the receive channel.
+ *
+ * @flowid_cnt: UDMAP receive channel additional flows count configuration to
+ * program into the flowid_cnt field of the RCHAN_RFLOW_RNG register.
+ * This field specifies how many flow IDs are in the additional contiguous range
+ * of legal flow IDs for the channel.  @ref flowid_start and flowid_cnt must be
+ * set as valid and configured together. Disabling the valid_params field bit
+ * for flowid_cnt indicates no flow IDs other than the default are to be
+ * allocated and used by the receive channel. @ref flowid_start plus flowid_cnt
+ * cannot be greater than the number of receive flows in the receive channel's
+ * Navigator Subsystem.  The additional flows must be assigned to the host, or a
+ * subordinate of the host, requesting configuration of the receive channel.
+ *
+ * @rx_pause_on_err: UDMAP receive channel pause on error configuration to be
+ * programmed into the rx_pause_on_err field of the channel's RCHAN_RCFG
+ * register.
+ *
+ * @rx_atype: UDMAP receive channel non Ring Accelerator access pointer
+ * interpretation configuration to be programmed into the rx_atype field of the
+ * channel's RCHAN_RCFG register.
+ *
+ * @rx_chan_type: UDMAP receive channel functional channel type and work passing
+ * mechanism configuration to be programmed into the rx_chan_type field of the
+ * channel's RCHAN_RCFG register.
+ *
+ * @rx_ignore_short: UDMAP receive channel short packet treatment configuration
+ * to be programmed into the rx_ignore_short field of the RCHAN_RCFG register.
+ *
+ * @rx_ignore_long: UDMAP receive channel long packet treatment configuration to
+ * be programmed into the rx_ignore_long field of the RCHAN_RCFG register.
+ *
+ * @rx_burst_size: UDMAP receive channel burst size configuration to be
+ * programmed into the rx_burst_size field of the RCHAN_RCFG register.
+ */
+struct ti_sci_msg_rm_udmap_rx_ch_cfg_req {
+	struct ti_sci_msg_hdr hdr;
+	u32 valid_params;
+	u16 nav_id;
+	u16 index;
+	u16 rx_fetch_size;
+	u16 rxcq_qnum;
+	u8 rx_priority;
+	u8 rx_qos;
+	u8 rx_orderid;
+	u8 rx_sched_priority;
+	u16 flowid_start;
+	u16 flowid_cnt;
+	u8 rx_pause_on_err;
+	u8 rx_atype;
+	u8 rx_chan_type;
+	u8 rx_ignore_short;
+	u8 rx_ignore_long;
+	u8 rx_burst_size;
+} __packed;
+
+/**
+ * Configures a Navigator Subsystem UDMAP receive flow
+ *
+ * Configures a Navigator Subsystem UDMAP receive flow's registers.
+ * Configuration does not include the flow registers which handle size-based
+ * free descriptor queue routing.
+ *
+ * The flow index must be assigned to the host defined in the TISCI header via
+ * the RM board configuration resource assignment range list.
+ *
+ * @hdr: Standard TISCI header
+ *
+ * @valid_params:
+ * Bitfield defining validity of rx flow configuration parameters.  The
+ * rx flow configuration fields are not valid, and will not be used for flow
+ * configuration, if their corresponding valid bit is zero.  Valid bit usage:
+ *     0 - Valid bit for @tisci_msg_rm_udmap_flow_cfg_req::rx_einfo_present
+ *     1 - Valid bit for @tisci_msg_rm_udmap_flow_cfg_req::rx_psinfo_present
+ *     2 - Valid bit for @tisci_msg_rm_udmap_flow_cfg_req::rx_error_handling
+ *     3 - Valid bit for @tisci_msg_rm_udmap_flow_cfg_req::rx_desc_type
+ *     4 - Valid bit for @tisci_msg_rm_udmap_flow_cfg_req::rx_sop_offset
+ *     5 - Valid bit for @tisci_msg_rm_udmap_flow_cfg_req::rx_dest_qnum
+ *     6 - Valid bit for @tisci_msg_rm_udmap_flow_cfg_req::rx_src_tag_hi
+ *     7 - Valid bit for @tisci_msg_rm_udmap_flow_cfg_req::rx_src_tag_lo
+ *     8 - Valid bit for @tisci_msg_rm_udmap_flow_cfg_req::rx_dest_tag_hi
+ *     9 - Valid bit for @tisci_msg_rm_udmap_flow_cfg_req::rx_dest_tag_lo
+ *    10 - Valid bit for @tisci_msg_rm_udmap_flow_cfg_req::rx_src_tag_hi_sel
+ *    11 - Valid bit for @tisci_msg_rm_udmap_flow_cfg_req::rx_src_tag_lo_sel
+ *    12 - Valid bit for @tisci_msg_rm_udmap_flow_cfg_req::rx_dest_tag_hi_sel
+ *    13 - Valid bit for @tisci_msg_rm_udmap_flow_cfg_req::rx_dest_tag_lo_sel
+ *    14 - Valid bit for @tisci_msg_rm_udmap_flow_cfg_req::rx_fdq0_sz0_qnum
+ *    15 - Valid bit for @tisci_msg_rm_udmap_flow_cfg_req::rx_fdq1_sz0_qnum
+ *    16 - Valid bit for @tisci_msg_rm_udmap_flow_cfg_req::rx_fdq2_sz0_qnum
+ *    17 - Valid bit for @tisci_msg_rm_udmap_flow_cfg_req::rx_fdq3_sz0_qnum
+ *    18 - Valid bit for @tisci_msg_rm_udmap_flow_cfg_req::rx_ps_location
+ *
+ * @nav_id: SoC device ID of Navigator Subsystem from which the receive flow is
+ * allocated
+ *
+ * @flow_index: UDMAP receive flow index for non-optional configuration.
+ *
+ * @rx_einfo_present:
+ * UDMAP receive flow extended packet info present configuration to be
+ * programmed into the rx_einfo_present field of the flow's RFLOW_RFA register.
+ *
+ * @rx_psinfo_present:
+ * UDMAP receive flow PS words present configuration to be programmed into the
+ * rx_psinfo_present field of the flow's RFLOW_RFA register.
+ *
+ * @rx_error_handling:
+ * UDMAP receive flow error handling configuration to be programmed into the
+ * rx_error_handling field of the flow's RFLOW_RFA register.
+ *
+ * @rx_desc_type:
+ * UDMAP receive flow descriptor type configuration to be programmed into the
+ * rx_desc_type field of the flow's RFLOW_RFA register.
+ *
+ * @rx_sop_offset:
+ * UDMAP receive flow start of packet offset configuration to be programmed
+ * into the rx_sop_offset field of the RFLOW_RFA register.  See the UDMAP
+ * section of the TRM for more information on this setting.  Valid values for
+ * this field are 0-255 bytes.
+ *
+ * @rx_dest_qnum:
+ * UDMAP receive flow destination queue configuration to be programmed into the
+ * rx_dest_qnum field of the flow's RFLOW_RFA register.  The specified
+ * destination queue must be valid within the Navigator Subsystem and must be
+ * owned by the host, or a subordinate of the host, requesting allocation and
+ * configuration of the receive flow.
+ *
+ * @rx_src_tag_hi:
+ * UDMAP receive flow source tag high byte constant configuration to be
+ * programmed into the rx_src_tag_hi field of the flow's RFLOW_RFB register.
+ * See the UDMAP section of the TRM for more information on this setting.
+ *
+ * @rx_src_tag_lo:
+ * UDMAP receive flow source tag low byte constant configuration to be
+ * programmed into the rx_src_tag_lo field of the flow's RFLOW_RFB register.
+ * See the UDMAP section of the TRM for more information on this setting.
+ *
+ * @rx_dest_tag_hi:
+ * UDMAP receive flow destination tag high byte constant configuration to be
+ * programmed into the rx_dest_tag_hi field of the flow's RFLOW_RFB register.
+ * See the UDMAP section of the TRM for more information on this setting.
+ *
+ * @rx_dest_tag_lo:
+ * UDMAP receive flow destination tag low byte constant configuration to be
+ * programmed into the rx_dest_tag_lo field of the flow's RFLOW_RFB register.
+ * See the UDMAP section of the TRM for more information on this setting.
+ *
+ * @rx_src_tag_hi_sel:
+ * UDMAP receive flow source tag high byte selector configuration to be
+ * programmed into the rx_src_tag_hi_sel field of the RFLOW_RFC register.  See
+ * the UDMAP section of the TRM for more information on this setting.
+ *
+ * @rx_src_tag_lo_sel:
+ * UDMAP receive flow source tag low byte selector configuration to be
+ * programmed into the rx_src_tag_lo_sel field of the RFLOW_RFC register.  See
+ * the UDMAP section of the TRM for more information on this setting.
+ *
+ * @rx_dest_tag_hi_sel:
+ * UDMAP receive flow destination tag high byte selector configuration to be
+ * programmed into the rx_dest_tag_hi_sel field of the RFLOW_RFC register.  See
+ * the UDMAP section of the TRM for more information on this setting.
+ *
+ * @rx_dest_tag_lo_sel:
+ * UDMAP receive flow destination tag low byte selector configuration to be
+ * programmed into the rx_dest_tag_lo_sel field of the RFLOW_RFC register.  See
+ * the UDMAP section of the TRM for more information on this setting.
+ *
+ * @rx_fdq0_sz0_qnum:
+ * UDMAP receive flow free descriptor queue 0 configuration to be programmed
+ * into the rx_fdq0_sz0_qnum field of the flow's RFLOW_RFD register.  See the
+ * UDMAP section of the TRM for more information on this setting. The specified
+ * free queue must be valid within the Navigator Subsystem and must be owned
+ * by the host, or a subordinate of the host, requesting allocation and
+ * configuration of the receive flow.
+ *
+ * @rx_fdq1_qnum:
+ * UDMAP receive flow free descriptor queue 1 configuration to be programmed
+ * into the rx_fdq1_qnum field of the flow's RFLOW_RFD register.  See the
+ * UDMAP section of the TRM for more information on this setting.  The specified
+ * free queue must be valid within the Navigator Subsystem and must be owned
+ * by the host, or a subordinate of the host, requesting allocation and
+ * configuration of the receive flow.
+ *
+ * @rx_fdq2_qnum:
+ * UDMAP receive flow free descriptor queue 2 configuration to be programmed
+ * into the rx_fdq2_qnum field of the flow's RFLOW_RFE register.  See the
+ * UDMAP section of the TRM for more information on this setting.  The specified
+ * free queue must be valid within the Navigator Subsystem and must be owned
+ * by the host, or a subordinate of the host, requesting allocation and
+ * configuration of the receive flow.
+ *
+ * @rx_fdq3_qnum:
+ * UDMAP receive flow free descriptor queue 3 configuration to be programmed
+ * into the rx_fdq3_qnum field of the flow's RFLOW_RFE register.  See the
+ * UDMAP section of the TRM for more information on this setting.  The specified
+ * free queue must be valid within the Navigator Subsystem and must be owned
+ * by the host, or a subordinate of the host, requesting allocation and
+ * configuration of the receive flow.
+ *
+ * @rx_ps_location:
+ * UDMAP receive flow PS words location configuration to be programmed into the
+ * rx_ps_location field of the flow's RFLOW_RFA register.
+ */
+struct ti_sci_msg_rm_udmap_flow_cfg_req {
+	struct ti_sci_msg_hdr hdr;
+	u32 valid_params;
+	u16 nav_id;
+	u16 flow_index;
+	u8 rx_einfo_present;
+	u8 rx_psinfo_present;
+	u8 rx_error_handling;
+	u8 rx_desc_type;
+	u16 rx_sop_offset;
+	u16 rx_dest_qnum;
+	u8 rx_src_tag_hi;
+	u8 rx_src_tag_lo;
+	u8 rx_dest_tag_hi;
+	u8 rx_dest_tag_lo;
+	u8 rx_src_tag_hi_sel;
+	u8 rx_src_tag_lo_sel;
+	u8 rx_dest_tag_hi_sel;
+	u8 rx_dest_tag_lo_sel;
+	u16 rx_fdq0_sz0_qnum;
+	u16 rx_fdq1_qnum;
+	u16 rx_fdq2_qnum;
+	u16 rx_fdq3_qnum;
+	u8 rx_ps_location;
+} __packed;
+
 #endif /* __TI_SCI_H */
diff --git a/include/linux/soc/ti/ti_sci_protocol.h b/include/linux/soc/ti/ti_sci_protocol.h
index 568722a041bf..4fd9bff5806b 100644
--- a/include/linux/soc/ti/ti_sci_protocol.h
+++ b/include/linux/soc/ti/ti_sci_protocol.h
@@ -241,6 +241,218 @@ struct ti_sci_rm_irq_ops {
 			      u16 global_event, u8 vint_status_bit);
 };
 
+/* RA config.addr_lo parameter is valid for RM ring configure TI_SCI message */
+#define TI_SCI_MSG_VALUE_RM_RING_ADDR_LO_VALID	BIT(0)
+/* RA config.addr_hi parameter is valid for RM ring configure TI_SCI message */
+#define TI_SCI_MSG_VALUE_RM_RING_ADDR_HI_VALID	BIT(1)
+/* RA config.count parameter is valid for RM ring configure TI_SCI message */
+#define TI_SCI_MSG_VALUE_RM_RING_COUNT_VALID	BIT(2)
+/* RA config.mode parameter is valid for RM ring configure TI_SCI message */
+#define TI_SCI_MSG_VALUE_RM_RING_MODE_VALID	BIT(3)
+/* RA config.size parameter is valid for RM ring configure TI_SCI message */
+#define TI_SCI_MSG_VALUE_RM_RING_SIZE_VALID	BIT(4)
+/* RA config.order_id parameter is valid for RM ring configure TI_SCI message */
+#define TI_SCI_MSG_VALUE_RM_RING_ORDER_ID_VALID	BIT(5)
+
+#define TI_SCI_MSG_VALUE_RM_ALL_NO_ORDER \
+	(TI_SCI_MSG_VALUE_RM_RING_ADDR_LO_VALID | \
+	TI_SCI_MSG_VALUE_RM_RING_ADDR_HI_VALID | \
+	TI_SCI_MSG_VALUE_RM_RING_COUNT_VALID | \
+	TI_SCI_MSG_VALUE_RM_RING_MODE_VALID | \
+	TI_SCI_MSG_VALUE_RM_RING_SIZE_VALID)
+
+/**
+ * struct ti_sci_rm_ringacc_ops - Ring Accelerator Management operations
+ * @config: configure the SoC Navigator Subsystem Ring Accelerator ring
+ * @get_config: get the SoC Navigator Subsystem Ring Accelerator ring
+ *		configuration
+ */
+struct ti_sci_rm_ringacc_ops {
+	int (*config)(const struct ti_sci_handle *handle,
+		      u32 valid_params, u16 nav_id, u16 index,
+		      u32 addr_lo, u32 addr_hi, u32 count, u8 mode,
+		      u8 size, u8 order_id
+	);
+	int (*get_config)(const struct ti_sci_handle *handle,
+			  u32 nav_id, u32 index, u8 *mode,
+			  u32 *addr_lo, u32 *addr_hi, u32 *count,
+			  u8 *size, u8 *order_id);
+};
+
+/**
+ * struct ti_sci_rm_psil_ops - PSI-L thread operations
+ * @pair: pair PSI-L source thread to a destination thread.
+ *	If the src_thread is mapped to UDMA tchan, the corresponding channel's
+ *	TCHAN_THRD_ID register is updated.
+ *	If the dst_thread is mapped to UDMA rchan, the corresponding channel's
+ *	RCHAN_THRD_ID register is updated.
+ * @unpair: unpair PSI-L source thread from a destination thread.
+ *	If the src_thread is mapped to UDMA tchan, the corresponding channel's
+ *	TCHAN_THRD_ID register is cleared.
+ *	If the dst_thread is mapped to UDMA rchan, the corresponding channel's
+ *	RCHAN_THRD_ID register is cleared.
+ */
+struct ti_sci_rm_psil_ops {
+	int (*pair)(const struct ti_sci_handle *handle, u32 nav_id,
+		    u32 src_thread, u32 dst_thread);
+	int (*unpair)(const struct ti_sci_handle *handle, u32 nav_id,
+		      u32 src_thread, u32 dst_thread);
+};
+
+/* UDMAP channel types */
+#define TI_SCI_RM_UDMAP_CHAN_TYPE_PKT_PBRR		2
+#define TI_SCI_RM_UDMAP_CHAN_TYPE_PKT_PBRR_SB		3	/* RX only */
+#define TI_SCI_RM_UDMAP_CHAN_TYPE_3RDP_PBRR		10
+#define TI_SCI_RM_UDMAP_CHAN_TYPE_3RDP_PBVR		11
+#define TI_SCI_RM_UDMAP_CHAN_TYPE_3RDP_BCOPY_PBRR	12
+#define TI_SCI_RM_UDMAP_CHAN_TYPE_3RDP_BCOPY_PBVR	13
+
+#define TI_SCI_RM_UDMAP_RX_FLOW_DESC_HOST		0
+#define TI_SCI_RM_UDMAP_RX_FLOW_DESC_MONO		2
+
+#define TI_SCI_RM_UDMAP_CHAN_BURST_SIZE_64_BYTES	1
+#define TI_SCI_RM_UDMAP_CHAN_BURST_SIZE_128_BYTES	2
+#define TI_SCI_RM_UDMAP_CHAN_BURST_SIZE_256_BYTES	3
+
+/* UDMAP TX/RX channel valid_params common declarations */
+#define TI_SCI_MSG_VALUE_RM_UDMAP_CH_PAUSE_ON_ERR_VALID		BIT(0)
+#define TI_SCI_MSG_VALUE_RM_UDMAP_CH_ATYPE_VALID                BIT(1)
+#define TI_SCI_MSG_VALUE_RM_UDMAP_CH_CHAN_TYPE_VALID            BIT(2)
+#define TI_SCI_MSG_VALUE_RM_UDMAP_CH_FETCH_SIZE_VALID           BIT(3)
+#define TI_SCI_MSG_VALUE_RM_UDMAP_CH_CQ_QNUM_VALID              BIT(4)
+#define TI_SCI_MSG_VALUE_RM_UDMAP_CH_PRIORITY_VALID             BIT(5)
+#define TI_SCI_MSG_VALUE_RM_UDMAP_CH_QOS_VALID                  BIT(6)
+#define TI_SCI_MSG_VALUE_RM_UDMAP_CH_ORDER_ID_VALID             BIT(7)
+#define TI_SCI_MSG_VALUE_RM_UDMAP_CH_SCHED_PRIORITY_VALID       BIT(8)
+#define TI_SCI_MSG_VALUE_RM_UDMAP_CH_BURST_SIZE_VALID		BIT(14)
+
+/**
+ * Configures a Navigator Subsystem UDMAP transmit channel
+ *
+ * Configures a Navigator Subsystem UDMAP transmit channel registers.
+ * See @ti_sci_msg_rm_udmap_tx_ch_cfg_req
+ */
+struct ti_sci_msg_rm_udmap_tx_ch_cfg {
+	u32 valid_params;
+#define TI_SCI_MSG_VALUE_RM_UDMAP_CH_TX_FILT_EINFO_VALID        BIT(9)
+#define TI_SCI_MSG_VALUE_RM_UDMAP_CH_TX_FILT_PSWORDS_VALID      BIT(10)
+#define TI_SCI_MSG_VALUE_RM_UDMAP_CH_TX_SUPR_TDPKT_VALID        BIT(11)
+#define TI_SCI_MSG_VALUE_RM_UDMAP_CH_TX_CREDIT_COUNT_VALID      BIT(12)
+#define TI_SCI_MSG_VALUE_RM_UDMAP_CH_TX_FDEPTH_VALID            BIT(13)
+	u16 nav_id;
+	u16 index;
+	u8 tx_pause_on_err;
+	u8 tx_filt_einfo;
+	u8 tx_filt_pswords;
+	u8 tx_atype;
+	u8 tx_chan_type;
+	u8 tx_supr_tdpkt;
+	u16 tx_fetch_size;
+	u8 tx_credit_count;
+	u16 txcq_qnum;
+	u8 tx_priority;
+	u8 tx_qos;
+	u8 tx_orderid;
+	u16 fdepth;
+	u8 tx_sched_priority;
+	u8 tx_burst_size;
+};
+
+/**
+ * Configures a Navigator Subsystem UDMAP receive channel
+ *
+ * Configures a Navigator Subsystem UDMAP receive channel registers.
+ * See @ti_sci_msg_rm_udmap_rx_ch_cfg_req
+ */
+struct ti_sci_msg_rm_udmap_rx_ch_cfg {
+	u32 valid_params;
+#define TI_SCI_MSG_VALUE_RM_UDMAP_CH_RX_FLOWID_START_VALID      BIT(9)
+#define TI_SCI_MSG_VALUE_RM_UDMAP_CH_RX_FLOWID_CNT_VALID        BIT(10)
+#define TI_SCI_MSG_VALUE_RM_UDMAP_CH_RX_IGNORE_SHORT_VALID      BIT(11)
+#define TI_SCI_MSG_VALUE_RM_UDMAP_CH_RX_IGNORE_LONG_VALID       BIT(12)
+	u16 nav_id;
+	u16 index;
+	u16 rx_fetch_size;
+	u16 rxcq_qnum;
+	u8 rx_priority;
+	u8 rx_qos;
+	u8 rx_orderid;
+	u8 rx_sched_priority;
+	u16 flowid_start;
+	u16 flowid_cnt;
+	u8 rx_pause_on_err;
+	u8 rx_atype;
+	u8 rx_chan_type;
+	u8 rx_ignore_short;
+	u8 rx_ignore_long;
+	u8 rx_burst_size;
+};
+
+/**
+ * Configures a Navigator Subsystem UDMAP receive flow
+ *
+ * Configures a Navigator Subsystem UDMAP receive flow's registers.
+ * See @ti_sci_msg_rm_udmap_flow_cfg_req
+ */
+struct ti_sci_msg_rm_udmap_flow_cfg {
+	u32 valid_params;
+#define TI_SCI_MSG_VALUE_RM_UDMAP_FLOW_EINFO_PRESENT_VALID	BIT(0)
+#define TI_SCI_MSG_VALUE_RM_UDMAP_FLOW_PSINFO_PRESENT_VALID     BIT(1)
+#define TI_SCI_MSG_VALUE_RM_UDMAP_FLOW_ERROR_HANDLING_VALID     BIT(2)
+#define TI_SCI_MSG_VALUE_RM_UDMAP_FLOW_DESC_TYPE_VALID          BIT(3)
+#define TI_SCI_MSG_VALUE_RM_UDMAP_FLOW_SOP_OFFSET_VALID         BIT(4)
+#define TI_SCI_MSG_VALUE_RM_UDMAP_FLOW_DEST_QNUM_VALID          BIT(5)
+#define TI_SCI_MSG_VALUE_RM_UDMAP_FLOW_SRC_TAG_HI_VALID         BIT(6)
+#define TI_SCI_MSG_VALUE_RM_UDMAP_FLOW_SRC_TAG_LO_VALID         BIT(7)
+#define TI_SCI_MSG_VALUE_RM_UDMAP_FLOW_DEST_TAG_HI_VALID        BIT(8)
+#define TI_SCI_MSG_VALUE_RM_UDMAP_FLOW_DEST_TAG_LO_VALID        BIT(9)
+#define TI_SCI_MSG_VALUE_RM_UDMAP_FLOW_SRC_TAG_HI_SEL_VALID     BIT(10)
+#define TI_SCI_MSG_VALUE_RM_UDMAP_FLOW_SRC_TAG_LO_SEL_VALID     BIT(11)
+#define TI_SCI_MSG_VALUE_RM_UDMAP_FLOW_DEST_TAG_HI_SEL_VALID    BIT(12)
+#define TI_SCI_MSG_VALUE_RM_UDMAP_FLOW_DEST_TAG_LO_SEL_VALID    BIT(13)
+#define TI_SCI_MSG_VALUE_RM_UDMAP_FLOW_FDQ0_SZ0_QNUM_VALID      BIT(14)
+#define TI_SCI_MSG_VALUE_RM_UDMAP_FLOW_FDQ1_QNUM_VALID          BIT(15)
+#define TI_SCI_MSG_VALUE_RM_UDMAP_FLOW_FDQ2_QNUM_VALID          BIT(16)
+#define TI_SCI_MSG_VALUE_RM_UDMAP_FLOW_FDQ3_QNUM_VALID          BIT(17)
+#define TI_SCI_MSG_VALUE_RM_UDMAP_FLOW_PS_LOCATION_VALID        BIT(18)
+	u16 nav_id;
+	u16 flow_index;
+	u8 rx_einfo_present;
+	u8 rx_psinfo_present;
+	u8 rx_error_handling;
+	u8 rx_desc_type;
+	u16 rx_sop_offset;
+	u16 rx_dest_qnum;
+	u8 rx_src_tag_hi;
+	u8 rx_src_tag_lo;
+	u8 rx_dest_tag_hi;
+	u8 rx_dest_tag_lo;
+	u8 rx_src_tag_hi_sel;
+	u8 rx_src_tag_lo_sel;
+	u8 rx_dest_tag_hi_sel;
+	u8 rx_dest_tag_lo_sel;
+	u16 rx_fdq0_sz0_qnum;
+	u16 rx_fdq1_qnum;
+	u16 rx_fdq2_qnum;
+	u16 rx_fdq3_qnum;
+	u8 rx_ps_location;
+};
+
+/**
+ * struct ti_sci_rm_udmap_ops - UDMA Management operations
+ * @tx_ch_cfg: configure SoC Navigator Subsystem UDMA transmit channel.
+ * @rx_ch_cfg: configure SoC Navigator Subsystem UDMA receive channel.
+ * @rx_flow_cfg: configure SoC Navigator Subsystem UDMA receive flow.
+ */
+struct ti_sci_rm_udmap_ops {
+	int (*tx_ch_cfg)(const struct ti_sci_handle *handle,
+			 const struct ti_sci_msg_rm_udmap_tx_ch_cfg *params);
+	int (*rx_ch_cfg)(const struct ti_sci_handle *handle,
+			 const struct ti_sci_msg_rm_udmap_rx_ch_cfg *params);
+	int (*rx_flow_cfg)(const struct ti_sci_handle *handle,
+			   const struct ti_sci_msg_rm_udmap_flow_cfg *params);
+};
+
 /**
  * struct ti_sci_ops - Function support for TI SCI
  * @dev_ops:	Device specific operations
@@ -254,6 +466,9 @@ struct ti_sci_ops {
 	struct ti_sci_clk_ops clk_ops;
 	struct ti_sci_rm_core_ops rm_core_ops;
 	struct ti_sci_rm_irq_ops rm_irq_ops;
+	struct ti_sci_rm_ringacc_ops rm_ring_ops;
+	struct ti_sci_rm_psil_ops rm_psil_ops;
+	struct ti_sci_rm_udmap_ops rm_udmap_ops;
 };
 
 /**
-- 
cgit v1.2.3


From 1e407f337f4015c8ffc56e7cfd70e06b2e9fc9da Mon Sep 17 00:00:00 2001
From: Suman Anna <s-anna@ti.com>
Date: Wed, 5 Jun 2019 17:33:34 -0500
Subject: firmware: ti_sci: Add support for processor control

Texas Instrument's System Control Interface (TI-SCI) Message Protocol
is used in Texas Instrument's System on Chip (SoC) such as those
in K3 family AM654 SoC to communicate between various compute
processors with a central system controller entity.

The system controller provides various services including the control
of other compute processors within the SoC. Extend the TI-SCI protocol
support to add various TI-SCI commands to invoke services associated
with power and reset control, and boot vector management of the
various compute processors from the Linux kernel.

Signed-off-by: Suman Anna <s-anna@ti.com>
Signed-off-by: Tero Kristo <t-kristo@ti.com>
Signed-off-by: Santosh Shilimkar <santosh.shilimkar@oracle.com>
---
 drivers/firmware/ti_sci.c              | 350 +++++++++++++++++++++++++++++++++
 drivers/firmware/ti_sci.h              | 135 +++++++++++++
 include/linux/soc/ti/ti_sci_protocol.h |  31 +++
 3 files changed, 516 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/firmware/ti_sci.c b/drivers/firmware/ti_sci.c
index 02fa196428d8..b47e33e7411f 100644
--- a/drivers/firmware/ti_sci.c
+++ b/drivers/firmware/ti_sci.c
@@ -2479,6 +2479,348 @@ fail:
 	return ret;
 }
 
+/**
+ * ti_sci_cmd_proc_request() - Command to request a physical processor control
+ * @handle:	Pointer to TI SCI handle
+ * @proc_id:	Processor ID this request is for
+ *
+ * Return: 0 if all went well, else returns appropriate error value.
+ */
+static int ti_sci_cmd_proc_request(const struct ti_sci_handle *handle,
+				   u8 proc_id)
+{
+	struct ti_sci_msg_req_proc_request *req;
+	struct ti_sci_msg_hdr *resp;
+	struct ti_sci_info *info;
+	struct ti_sci_xfer *xfer;
+	struct device *dev;
+	int ret = 0;
+
+	if (!handle)
+		return -EINVAL;
+	if (IS_ERR(handle))
+		return PTR_ERR(handle);
+
+	info = handle_to_ti_sci_info(handle);
+	dev = info->dev;
+
+	xfer = ti_sci_get_one_xfer(info, TI_SCI_MSG_PROC_REQUEST,
+				   TI_SCI_FLAG_REQ_ACK_ON_PROCESSED,
+				   sizeof(*req), sizeof(*resp));
+	if (IS_ERR(xfer)) {
+		ret = PTR_ERR(xfer);
+		dev_err(dev, "Message alloc failed(%d)\n", ret);
+		return ret;
+	}
+	req = (struct ti_sci_msg_req_proc_request *)xfer->xfer_buf;
+	req->processor_id = proc_id;
+
+	ret = ti_sci_do_xfer(info, xfer);
+	if (ret) {
+		dev_err(dev, "Mbox send fail %d\n", ret);
+		goto fail;
+	}
+
+	resp = (struct ti_sci_msg_hdr *)xfer->tx_message.buf;
+
+	ret = ti_sci_is_response_ack(resp) ? 0 : -ENODEV;
+
+fail:
+	ti_sci_put_one_xfer(&info->minfo, xfer);
+
+	return ret;
+}
+
+/**
+ * ti_sci_cmd_proc_release() - Command to release a physical processor control
+ * @handle:	Pointer to TI SCI handle
+ * @proc_id:	Processor ID this request is for
+ *
+ * Return: 0 if all went well, else returns appropriate error value.
+ */
+static int ti_sci_cmd_proc_release(const struct ti_sci_handle *handle,
+				   u8 proc_id)
+{
+	struct ti_sci_msg_req_proc_release *req;
+	struct ti_sci_msg_hdr *resp;
+	struct ti_sci_info *info;
+	struct ti_sci_xfer *xfer;
+	struct device *dev;
+	int ret = 0;
+
+	if (!handle)
+		return -EINVAL;
+	if (IS_ERR(handle))
+		return PTR_ERR(handle);
+
+	info = handle_to_ti_sci_info(handle);
+	dev = info->dev;
+
+	xfer = ti_sci_get_one_xfer(info, TI_SCI_MSG_PROC_RELEASE,
+				   TI_SCI_FLAG_REQ_ACK_ON_PROCESSED,
+				   sizeof(*req), sizeof(*resp));
+	if (IS_ERR(xfer)) {
+		ret = PTR_ERR(xfer);
+		dev_err(dev, "Message alloc failed(%d)\n", ret);
+		return ret;
+	}
+	req = (struct ti_sci_msg_req_proc_release *)xfer->xfer_buf;
+	req->processor_id = proc_id;
+
+	ret = ti_sci_do_xfer(info, xfer);
+	if (ret) {
+		dev_err(dev, "Mbox send fail %d\n", ret);
+		goto fail;
+	}
+
+	resp = (struct ti_sci_msg_hdr *)xfer->tx_message.buf;
+
+	ret = ti_sci_is_response_ack(resp) ? 0 : -ENODEV;
+
+fail:
+	ti_sci_put_one_xfer(&info->minfo, xfer);
+
+	return ret;
+}
+
+/**
+ * ti_sci_cmd_proc_handover() - Command to handover a physical processor
+ *				control to a host in the processor's access
+ *				control list.
+ * @handle:	Pointer to TI SCI handle
+ * @proc_id:	Processor ID this request is for
+ * @host_id:	Host ID to get the control of the processor
+ *
+ * Return: 0 if all went well, else returns appropriate error value.
+ */
+static int ti_sci_cmd_proc_handover(const struct ti_sci_handle *handle,
+				    u8 proc_id, u8 host_id)
+{
+	struct ti_sci_msg_req_proc_handover *req;
+	struct ti_sci_msg_hdr *resp;
+	struct ti_sci_info *info;
+	struct ti_sci_xfer *xfer;
+	struct device *dev;
+	int ret = 0;
+
+	if (!handle)
+		return -EINVAL;
+	if (IS_ERR(handle))
+		return PTR_ERR(handle);
+
+	info = handle_to_ti_sci_info(handle);
+	dev = info->dev;
+
+	xfer = ti_sci_get_one_xfer(info, TI_SCI_MSG_PROC_HANDOVER,
+				   TI_SCI_FLAG_REQ_ACK_ON_PROCESSED,
+				   sizeof(*req), sizeof(*resp));
+	if (IS_ERR(xfer)) {
+		ret = PTR_ERR(xfer);
+		dev_err(dev, "Message alloc failed(%d)\n", ret);
+		return ret;
+	}
+	req = (struct ti_sci_msg_req_proc_handover *)xfer->xfer_buf;
+	req->processor_id = proc_id;
+	req->host_id = host_id;
+
+	ret = ti_sci_do_xfer(info, xfer);
+	if (ret) {
+		dev_err(dev, "Mbox send fail %d\n", ret);
+		goto fail;
+	}
+
+	resp = (struct ti_sci_msg_hdr *)xfer->tx_message.buf;
+
+	ret = ti_sci_is_response_ack(resp) ? 0 : -ENODEV;
+
+fail:
+	ti_sci_put_one_xfer(&info->minfo, xfer);
+
+	return ret;
+}
+
+/**
+ * ti_sci_cmd_proc_set_config() - Command to set the processor boot
+ *				    configuration flags
+ * @handle:		Pointer to TI SCI handle
+ * @proc_id:		Processor ID this request is for
+ * @config_flags_set:	Configuration flags to be set
+ * @config_flags_clear:	Configuration flags to be cleared.
+ *
+ * Return: 0 if all went well, else returns appropriate error value.
+ */
+static int ti_sci_cmd_proc_set_config(const struct ti_sci_handle *handle,
+				      u8 proc_id, u64 bootvector,
+				      u32 config_flags_set,
+				      u32 config_flags_clear)
+{
+	struct ti_sci_msg_req_set_config *req;
+	struct ti_sci_msg_hdr *resp;
+	struct ti_sci_info *info;
+	struct ti_sci_xfer *xfer;
+	struct device *dev;
+	int ret = 0;
+
+	if (!handle)
+		return -EINVAL;
+	if (IS_ERR(handle))
+		return PTR_ERR(handle);
+
+	info = handle_to_ti_sci_info(handle);
+	dev = info->dev;
+
+	xfer = ti_sci_get_one_xfer(info, TI_SCI_MSG_SET_CONFIG,
+				   TI_SCI_FLAG_REQ_ACK_ON_PROCESSED,
+				   sizeof(*req), sizeof(*resp));
+	if (IS_ERR(xfer)) {
+		ret = PTR_ERR(xfer);
+		dev_err(dev, "Message alloc failed(%d)\n", ret);
+		return ret;
+	}
+	req = (struct ti_sci_msg_req_set_config *)xfer->xfer_buf;
+	req->processor_id = proc_id;
+	req->bootvector_low = bootvector & TI_SCI_ADDR_LOW_MASK;
+	req->bootvector_high = (bootvector & TI_SCI_ADDR_HIGH_MASK) >>
+				TI_SCI_ADDR_HIGH_SHIFT;
+	req->config_flags_set = config_flags_set;
+	req->config_flags_clear = config_flags_clear;
+
+	ret = ti_sci_do_xfer(info, xfer);
+	if (ret) {
+		dev_err(dev, "Mbox send fail %d\n", ret);
+		goto fail;
+	}
+
+	resp = (struct ti_sci_msg_hdr *)xfer->tx_message.buf;
+
+	ret = ti_sci_is_response_ack(resp) ? 0 : -ENODEV;
+
+fail:
+	ti_sci_put_one_xfer(&info->minfo, xfer);
+
+	return ret;
+}
+
+/**
+ * ti_sci_cmd_proc_set_control() - Command to set the processor boot
+ *				     control flags
+ * @handle:			Pointer to TI SCI handle
+ * @proc_id:			Processor ID this request is for
+ * @control_flags_set:		Control flags to be set
+ * @control_flags_clear:	Control flags to be cleared
+ *
+ * Return: 0 if all went well, else returns appropriate error value.
+ */
+static int ti_sci_cmd_proc_set_control(const struct ti_sci_handle *handle,
+				       u8 proc_id, u32 control_flags_set,
+				       u32 control_flags_clear)
+{
+	struct ti_sci_msg_req_set_ctrl *req;
+	struct ti_sci_msg_hdr *resp;
+	struct ti_sci_info *info;
+	struct ti_sci_xfer *xfer;
+	struct device *dev;
+	int ret = 0;
+
+	if (!handle)
+		return -EINVAL;
+	if (IS_ERR(handle))
+		return PTR_ERR(handle);
+
+	info = handle_to_ti_sci_info(handle);
+	dev = info->dev;
+
+	xfer = ti_sci_get_one_xfer(info, TI_SCI_MSG_SET_CTRL,
+				   TI_SCI_FLAG_REQ_ACK_ON_PROCESSED,
+				   sizeof(*req), sizeof(*resp));
+	if (IS_ERR(xfer)) {
+		ret = PTR_ERR(xfer);
+		dev_err(dev, "Message alloc failed(%d)\n", ret);
+		return ret;
+	}
+	req = (struct ti_sci_msg_req_set_ctrl *)xfer->xfer_buf;
+	req->processor_id = proc_id;
+	req->control_flags_set = control_flags_set;
+	req->control_flags_clear = control_flags_clear;
+
+	ret = ti_sci_do_xfer(info, xfer);
+	if (ret) {
+		dev_err(dev, "Mbox send fail %d\n", ret);
+		goto fail;
+	}
+
+	resp = (struct ti_sci_msg_hdr *)xfer->tx_message.buf;
+
+	ret = ti_sci_is_response_ack(resp) ? 0 : -ENODEV;
+
+fail:
+	ti_sci_put_one_xfer(&info->minfo, xfer);
+
+	return ret;
+}
+
+/**
+ * ti_sci_cmd_proc_get_status() - Command to get the processor boot status
+ * @handle:	Pointer to TI SCI handle
+ * @proc_id:	Processor ID this request is for
+ *
+ * Return: 0 if all went well, else returns appropriate error value.
+ */
+static int ti_sci_cmd_proc_get_status(const struct ti_sci_handle *handle,
+				      u8 proc_id, u64 *bv, u32 *cfg_flags,
+				      u32 *ctrl_flags, u32 *sts_flags)
+{
+	struct ti_sci_msg_resp_get_status *resp;
+	struct ti_sci_msg_req_get_status *req;
+	struct ti_sci_info *info;
+	struct ti_sci_xfer *xfer;
+	struct device *dev;
+	int ret = 0;
+
+	if (!handle)
+		return -EINVAL;
+	if (IS_ERR(handle))
+		return PTR_ERR(handle);
+
+	info = handle_to_ti_sci_info(handle);
+	dev = info->dev;
+
+	xfer = ti_sci_get_one_xfer(info, TI_SCI_MSG_GET_STATUS,
+				   TI_SCI_FLAG_REQ_ACK_ON_PROCESSED,
+				   sizeof(*req), sizeof(*resp));
+	if (IS_ERR(xfer)) {
+		ret = PTR_ERR(xfer);
+		dev_err(dev, "Message alloc failed(%d)\n", ret);
+		return ret;
+	}
+	req = (struct ti_sci_msg_req_get_status *)xfer->xfer_buf;
+	req->processor_id = proc_id;
+
+	ret = ti_sci_do_xfer(info, xfer);
+	if (ret) {
+		dev_err(dev, "Mbox send fail %d\n", ret);
+		goto fail;
+	}
+
+	resp = (struct ti_sci_msg_resp_get_status *)xfer->tx_message.buf;
+
+	if (!ti_sci_is_response_ack(resp)) {
+		ret = -ENODEV;
+	} else {
+		*bv = (resp->bootvector_low & TI_SCI_ADDR_LOW_MASK) |
+		      (((u64)resp->bootvector_high << TI_SCI_ADDR_HIGH_SHIFT) &
+		       TI_SCI_ADDR_HIGH_MASK);
+		*cfg_flags = resp->config_flags;
+		*ctrl_flags = resp->control_flags;
+		*sts_flags = resp->status_flags;
+	}
+
+fail:
+	ti_sci_put_one_xfer(&info->minfo, xfer);
+
+	return ret;
+}
+
 /*
  * ti_sci_setup_ops() - Setup the operations structures
  * @info:	pointer to TISCI pointer
@@ -2494,6 +2836,7 @@ static void ti_sci_setup_ops(struct ti_sci_info *info)
 	struct ti_sci_rm_ringacc_ops *rops = &ops->rm_ring_ops;
 	struct ti_sci_rm_psil_ops *psilops = &ops->rm_psil_ops;
 	struct ti_sci_rm_udmap_ops *udmap_ops = &ops->rm_udmap_ops;
+	struct ti_sci_proc_ops *pops = &ops->proc_ops;
 
 	core_ops->reboot_device = ti_sci_cmd_core_reboot;
 
@@ -2543,6 +2886,13 @@ static void ti_sci_setup_ops(struct ti_sci_info *info)
 	udmap_ops->tx_ch_cfg = ti_sci_cmd_rm_udmap_tx_ch_cfg;
 	udmap_ops->rx_ch_cfg = ti_sci_cmd_rm_udmap_rx_ch_cfg;
 	udmap_ops->rx_flow_cfg = ti_sci_cmd_rm_udmap_rx_flow_cfg;
+
+	pops->request = ti_sci_cmd_proc_request;
+	pops->release = ti_sci_cmd_proc_release;
+	pops->handover = ti_sci_cmd_proc_handover;
+	pops->set_config = ti_sci_cmd_proc_set_config;
+	pops->set_control = ti_sci_cmd_proc_set_control;
+	pops->get_status = ti_sci_cmd_proc_get_status;
 }
 
 /**
diff --git a/drivers/firmware/ti_sci.h b/drivers/firmware/ti_sci.h
index 2bb81ec7793c..662dcffef311 100644
--- a/drivers/firmware/ti_sci.h
+++ b/drivers/firmware/ti_sci.h
@@ -71,6 +71,14 @@
 #define TISCI_MSG_RM_UDMAP_FLOW_GET_CFG		0x1232
 #define TISCI_MSG_RM_UDMAP_FLOW_SIZE_THRESH_GET_CFG	0x1233
 
+/* Processor Control requests */
+#define TI_SCI_MSG_PROC_REQUEST		0xc000
+#define TI_SCI_MSG_PROC_RELEASE		0xc001
+#define TI_SCI_MSG_PROC_HANDOVER	0xc005
+#define TI_SCI_MSG_SET_CONFIG		0xc100
+#define TI_SCI_MSG_SET_CTRL		0xc101
+#define TI_SCI_MSG_GET_STATUS		0xc400
+
 /**
  * struct ti_sci_msg_hdr - Generic Message Header for All messages and responses
  * @type:	Type of messages: One of TI_SCI_MSG* values
@@ -1238,4 +1246,131 @@ struct ti_sci_msg_rm_udmap_flow_cfg_req {
 	u8 rx_ps_location;
 } __packed;
 
+/**
+ * struct ti_sci_msg_req_proc_request - Request a processor
+ * @hdr:		Generic Header
+ * @processor_id:	ID of processor being requested
+ *
+ * Request type is TI_SCI_MSG_PROC_REQUEST, response is a generic ACK/NACK
+ * message.
+ */
+struct ti_sci_msg_req_proc_request {
+	struct ti_sci_msg_hdr hdr;
+	u8 processor_id;
+} __packed;
+
+/**
+ * struct ti_sci_msg_req_proc_release - Release a processor
+ * @hdr:		Generic Header
+ * @processor_id:	ID of processor being released
+ *
+ * Request type is TI_SCI_MSG_PROC_RELEASE, response is a generic ACK/NACK
+ * message.
+ */
+struct ti_sci_msg_req_proc_release {
+	struct ti_sci_msg_hdr hdr;
+	u8 processor_id;
+} __packed;
+
+/**
+ * struct ti_sci_msg_req_proc_handover - Handover a processor to a host
+ * @hdr:		Generic Header
+ * @processor_id:	ID of processor being handed over
+ * @host_id:		Host ID the control needs to be transferred to
+ *
+ * Request type is TI_SCI_MSG_PROC_HANDOVER, response is a generic ACK/NACK
+ * message.
+ */
+struct ti_sci_msg_req_proc_handover {
+	struct ti_sci_msg_hdr hdr;
+	u8 processor_id;
+	u8 host_id;
+} __packed;
+
+/* Boot Vector masks */
+#define TI_SCI_ADDR_LOW_MASK			GENMASK_ULL(31, 0)
+#define TI_SCI_ADDR_HIGH_MASK			GENMASK_ULL(63, 32)
+#define TI_SCI_ADDR_HIGH_SHIFT			32
+
+/**
+ * struct ti_sci_msg_req_set_config - Set Processor boot configuration
+ * @hdr:		Generic Header
+ * @processor_id:	ID of processor being configured
+ * @bootvector_low:	Lower 32 bit address (Little Endian) of boot vector
+ * @bootvector_high:	Higher 32 bit address (Little Endian) of boot vector
+ * @config_flags_set:	Optional Processor specific Config Flags to set.
+ *			Setting a bit here implies the corresponding mode
+ *			will be set
+ * @config_flags_clear:	Optional Processor specific Config Flags to clear.
+ *			Setting a bit here implies the corresponding mode
+ *			will be cleared
+ *
+ * Request type is TI_SCI_MSG_SET_CONFIG, response is a generic ACK/NACK
+ * message.
+ */
+struct ti_sci_msg_req_set_config {
+	struct ti_sci_msg_hdr hdr;
+	u8 processor_id;
+	u32 bootvector_low;
+	u32 bootvector_high;
+	u32 config_flags_set;
+	u32 config_flags_clear;
+} __packed;
+
+/**
+ * struct ti_sci_msg_req_set_ctrl - Set Processor boot control flags
+ * @hdr:		Generic Header
+ * @processor_id:	ID of processor being configured
+ * @control_flags_set:	Optional Processor specific Control Flags to set.
+ *			Setting a bit here implies the corresponding mode
+ *			will be set
+ * @control_flags_clear:Optional Processor specific Control Flags to clear.
+ *			Setting a bit here implies the corresponding mode
+ *			will be cleared
+ *
+ * Request type is TI_SCI_MSG_SET_CTRL, response is a generic ACK/NACK
+ * message.
+ */
+struct ti_sci_msg_req_set_ctrl {
+	struct ti_sci_msg_hdr hdr;
+	u8 processor_id;
+	u32 control_flags_set;
+	u32 control_flags_clear;
+} __packed;
+
+/**
+ * struct ti_sci_msg_req_get_status - Processor boot status request
+ * @hdr:		Generic Header
+ * @processor_id:	ID of processor whose status is being requested
+ *
+ * Request type is TI_SCI_MSG_GET_STATUS, response is an appropriate
+ * message, or NACK in case of inability to satisfy request.
+ */
+struct ti_sci_msg_req_get_status {
+	struct ti_sci_msg_hdr hdr;
+	u8 processor_id;
+} __packed;
+
+/**
+ * struct ti_sci_msg_resp_get_status - Processor boot status response
+ * @hdr:		Generic Header
+ * @processor_id:	ID of processor whose status is returned
+ * @bootvector_low:	Lower 32 bit address (Little Endian) of boot vector
+ * @bootvector_high:	Higher 32 bit address (Little Endian) of boot vector
+ * @config_flags:	Optional Processor specific Config Flags set currently
+ * @control_flags:	Optional Processor specific Control Flags set currently
+ * @status_flags:	Optional Processor specific Status Flags set currently
+ *
+ * Response structure to a TI_SCI_MSG_GET_STATUS request.
+ */
+struct ti_sci_msg_resp_get_status {
+	struct ti_sci_msg_hdr hdr;
+	u8 processor_id;
+	u32 bootvector_low;
+	u32 bootvector_high;
+	u32 config_flags;
+	u32 control_flags;
+	u32 status_flags;
+} __packed;
+
 #endif /* __TI_SCI_H */
diff --git a/include/linux/soc/ti/ti_sci_protocol.h b/include/linux/soc/ti/ti_sci_protocol.h
index 4fd9bff5806b..7b3762f68df9 100644
--- a/include/linux/soc/ti/ti_sci_protocol.h
+++ b/include/linux/soc/ti/ti_sci_protocol.h
@@ -453,12 +453,42 @@ struct ti_sci_rm_udmap_ops {
 			   const struct ti_sci_msg_rm_udmap_flow_cfg *params);
 };
 
+/**
+ * struct ti_sci_proc_ops - Processor Control operations
+ * @request:	Request to control a physical processor. The requesting host
+ *		should be in the processor access list
+ * @release:	Relinquish a physical processor control
+ * @handover:	Handover a physical processor control to another host
+ *		in the permitted list
+ * @set_config:	Set base configuration of a processor
+ * @set_control: Setup limited control flags in specific cases
+ * @get_status: Get the state of physical processor
+ *
+ * NOTE: The following parameters are generic in nature for all these ops,
+ * -handle:	Pointer to TI SCI handle as retrieved by *ti_sci_get_handle
+ * -pid:	Processor ID
+ * -hid:	Host ID
+ */
+struct ti_sci_proc_ops {
+	int (*request)(const struct ti_sci_handle *handle, u8 pid);
+	int (*release)(const struct ti_sci_handle *handle, u8 pid);
+	int (*handover)(const struct ti_sci_handle *handle, u8 pid, u8 hid);
+	int (*set_config)(const struct ti_sci_handle *handle, u8 pid,
+			  u64 boot_vector, u32 cfg_set, u32 cfg_clr);
+	int (*set_control)(const struct ti_sci_handle *handle, u8 pid,
+			   u32 ctrl_set, u32 ctrl_clr);
+	int (*get_status)(const struct ti_sci_handle *handle, u8 pid,
+			  u64 *boot_vector, u32 *cfg_flags, u32 *ctrl_flags,
+			  u32 *status_flags);
+};
+
 /**
  * struct ti_sci_ops - Function support for TI SCI
  * @dev_ops:	Device specific operations
  * @clk_ops:	Clock specific operations
  * @rm_core_ops:	Resource management core operations.
  * @rm_irq_ops:		IRQ management specific operations
+ * @proc_ops:	Processor Control specific operations
  */
 struct ti_sci_ops {
 	struct ti_sci_core_ops core_ops;
@@ -469,6 +499,7 @@ struct ti_sci_ops {
 	struct ti_sci_rm_ringacc_ops rm_ring_ops;
 	struct ti_sci_rm_psil_ops rm_psil_ops;
 	struct ti_sci_rm_udmap_ops rm_udmap_ops;
+	struct ti_sci_proc_ops proc_ops;
 };
 
 /**
-- 
cgit v1.2.3


From 18c8c0954d15105b02f7d2f556b9eafae426871f Mon Sep 17 00:00:00 2001
From: Wesley Sheng <wesley.sheng@microchip.com>
Date: Tue, 30 Apr 2019 18:04:29 +0800
Subject: NTB: correct ntb_dev_ops and ntb_dev comment typos

The comment for ntb_dev_ops and ntb_dev incorrectly referred to
ntb_ctx_ops and ntb_device.

Signed-off-by: Wesley Sheng <wesley.sheng@microchip.com>
Reviewed-by: Logan Gunthorpe <logang@deltatee.com>
Signed-off-by: Jon Mason <jdmason@kudzu.us>
---
 include/linux/ntb.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ntb.h b/include/linux/ntb.h
index 56a92e3ae3ae..604abc883741 100644
--- a/include/linux/ntb.h
+++ b/include/linux/ntb.h
@@ -205,7 +205,7 @@ static inline int ntb_ctx_ops_is_valid(const struct ntb_ctx_ops *ops)
 }
 
 /**
- * struct ntb_ctx_ops - ntb device operations
+ * struct ntb_dev_ops - ntb device operations
  * @port_number:	See ntb_port_number().
  * @peer_port_count:	See ntb_peer_port_count().
  * @peer_port_number:	See ntb_peer_port_number().
@@ -404,7 +404,7 @@ struct ntb_client {
 #define drv_ntb_client(__drv) container_of((__drv), struct ntb_client, drv)
 
 /**
- * struct ntb_device - ntb device
+ * struct ntb_dev - ntb device
  * @dev:		Linux device object.
  * @pdev:		PCI device entry of the ntb.
  * @topo:		Detected topology of the ntb.
-- 
cgit v1.2.3


From d7cc609fb679e11dc2b00cbe6c50cbd37ec4bfa2 Mon Sep 17 00:00:00 2001
From: Logan Gunthorpe <logang@deltatee.com>
Date: Thu, 23 May 2019 16:30:51 -0600
Subject: PCI/MSI: Support allocating virtual MSI interrupts

For NTB devices, we want to be able to trigger MSI interrupts
through a memory window. In these cases we may want to use
more interrupts than the NTB PCI device has available in its MSI-X
table.

We allow for this by creating a new 'virtual' interrupt. These
interrupts are allocated as usual but are not programmed into the
MSI-X table (as there may not be space for them).

The MSI address and data will then be handled through an NTB MSI library
introduced later in this series.

Signed-off-by: Logan Gunthorpe <logang@deltatee.com>
Acked-by: Bjorn Helgaas <bhelgaas@google.com>
Signed-off-by: Jon Mason <jdmason@kudzu.us>
---
 drivers/pci/msi.c   | 54 ++++++++++++++++++++++++++++++++++++++++++++---------
 include/linux/msi.h |  8 ++++++++
 include/linux/pci.h |  9 +++++++++
 3 files changed, 62 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c
index e039b740fe74..ace978deaf93 100644
--- a/drivers/pci/msi.c
+++ b/drivers/pci/msi.c
@@ -192,6 +192,9 @@ static void msi_mask_irq(struct msi_desc *desc, u32 mask, u32 flag)
 
 static void __iomem *pci_msix_desc_addr(struct msi_desc *desc)
 {
+	if (desc->msi_attrib.is_virtual)
+		return NULL;
+
 	return desc->mask_base +
 		desc->msi_attrib.entry_nr * PCI_MSIX_ENTRY_SIZE;
 }
@@ -206,14 +209,19 @@ static void __iomem *pci_msix_desc_addr(struct msi_desc *desc)
 u32 __pci_msix_desc_mask_irq(struct msi_desc *desc, u32 flag)
 {
 	u32 mask_bits = desc->masked;
+	void __iomem *desc_addr;
 
 	if (pci_msi_ignore_mask)
 		return 0;
+	desc_addr = pci_msix_desc_addr(desc);
+	if (!desc_addr)
+		return 0;
 
 	mask_bits &= ~PCI_MSIX_ENTRY_CTRL_MASKBIT;
 	if (flag)
 		mask_bits |= PCI_MSIX_ENTRY_CTRL_MASKBIT;
-	writel(mask_bits, pci_msix_desc_addr(desc) + PCI_MSIX_ENTRY_VECTOR_CTRL);
+
+	writel(mask_bits, desc_addr + PCI_MSIX_ENTRY_VECTOR_CTRL);
 
 	return mask_bits;
 }
@@ -273,6 +281,11 @@ void __pci_read_msi_msg(struct msi_desc *entry, struct msi_msg *msg)
 	if (entry->msi_attrib.is_msix) {
 		void __iomem *base = pci_msix_desc_addr(entry);
 
+		if (!base) {
+			WARN_ON(1);
+			return;
+		}
+
 		msg->address_lo = readl(base + PCI_MSIX_ENTRY_LOWER_ADDR);
 		msg->address_hi = readl(base + PCI_MSIX_ENTRY_UPPER_ADDR);
 		msg->data = readl(base + PCI_MSIX_ENTRY_DATA);
@@ -303,6 +316,9 @@ void __pci_write_msi_msg(struct msi_desc *entry, struct msi_msg *msg)
 	} else if (entry->msi_attrib.is_msix) {
 		void __iomem *base = pci_msix_desc_addr(entry);
 
+		if (!base)
+			goto skip;
+
 		writel(msg->address_lo, base + PCI_MSIX_ENTRY_LOWER_ADDR);
 		writel(msg->address_hi, base + PCI_MSIX_ENTRY_UPPER_ADDR);
 		writel(msg->data, base + PCI_MSIX_ENTRY_DATA);
@@ -327,7 +343,13 @@ void __pci_write_msi_msg(struct msi_desc *entry, struct msi_msg *msg)
 					      msg->data);
 		}
 	}
+
+skip:
 	entry->msg = *msg;
+
+	if (entry->write_msi_msg)
+		entry->write_msi_msg(entry, entry->write_msi_msg_data);
+
 }
 
 void pci_write_msi_msg(unsigned int irq, struct msi_msg *msg)
@@ -550,6 +572,7 @@ msi_setup_entry(struct pci_dev *dev, int nvec, struct irq_affinity *affd)
 
 	entry->msi_attrib.is_msix	= 0;
 	entry->msi_attrib.is_64		= !!(control & PCI_MSI_FLAGS_64BIT);
+	entry->msi_attrib.is_virtual    = 0;
 	entry->msi_attrib.entry_nr	= 0;
 	entry->msi_attrib.maskbit	= !!(control & PCI_MSI_FLAGS_MASKBIT);
 	entry->msi_attrib.default_irq	= dev->irq;	/* Save IOAPIC IRQ */
@@ -674,6 +697,7 @@ static int msix_setup_entries(struct pci_dev *dev, void __iomem *base,
 	struct irq_affinity_desc *curmsk, *masks = NULL;
 	struct msi_desc *entry;
 	int ret, i;
+	int vec_count = pci_msix_vec_count(dev);
 
 	if (affd)
 		masks = irq_create_affinity_masks(nvec, affd);
@@ -696,6 +720,10 @@ static int msix_setup_entries(struct pci_dev *dev, void __iomem *base,
 			entry->msi_attrib.entry_nr = entries[i].entry;
 		else
 			entry->msi_attrib.entry_nr = i;
+
+		entry->msi_attrib.is_virtual =
+			entry->msi_attrib.entry_nr >= vec_count;
+
 		entry->msi_attrib.default_irq	= dev->irq;
 		entry->mask_base		= base;
 
@@ -714,12 +742,19 @@ static void msix_program_entries(struct pci_dev *dev,
 {
 	struct msi_desc *entry;
 	int i = 0;
+	void __iomem *desc_addr;
 
 	for_each_pci_msi_entry(entry, dev) {
 		if (entries)
 			entries[i++].vector = entry->irq;
-		entry->masked = readl(pci_msix_desc_addr(entry) +
-				PCI_MSIX_ENTRY_VECTOR_CTRL);
+
+		desc_addr = pci_msix_desc_addr(entry);
+		if (desc_addr)
+			entry->masked = readl(desc_addr +
+					      PCI_MSIX_ENTRY_VECTOR_CTRL);
+		else
+			entry->masked = 0;
+
 		msix_mask_irq(entry, 1);
 	}
 }
@@ -932,7 +967,7 @@ int pci_msix_vec_count(struct pci_dev *dev)
 EXPORT_SYMBOL(pci_msix_vec_count);
 
 static int __pci_enable_msix(struct pci_dev *dev, struct msix_entry *entries,
-			     int nvec, struct irq_affinity *affd)
+			     int nvec, struct irq_affinity *affd, int flags)
 {
 	int nr_entries;
 	int i, j;
@@ -943,7 +978,7 @@ static int __pci_enable_msix(struct pci_dev *dev, struct msix_entry *entries,
 	nr_entries = pci_msix_vec_count(dev);
 	if (nr_entries < 0)
 		return nr_entries;
-	if (nvec > nr_entries)
+	if (nvec > nr_entries && !(flags & PCI_IRQ_VIRTUAL))
 		return nr_entries;
 
 	if (entries) {
@@ -1079,7 +1114,8 @@ EXPORT_SYMBOL(pci_enable_msi);
 
 static int __pci_enable_msix_range(struct pci_dev *dev,
 				   struct msix_entry *entries, int minvec,
-				   int maxvec, struct irq_affinity *affd)
+				   int maxvec, struct irq_affinity *affd,
+				   int flags)
 {
 	int rc, nvec = maxvec;
 
@@ -1096,7 +1132,7 @@ static int __pci_enable_msix_range(struct pci_dev *dev,
 				return -ENOSPC;
 		}
 
-		rc = __pci_enable_msix(dev, entries, nvec, affd);
+		rc = __pci_enable_msix(dev, entries, nvec, affd, flags);
 		if (rc == 0)
 			return nvec;
 
@@ -1127,7 +1163,7 @@ static int __pci_enable_msix_range(struct pci_dev *dev,
 int pci_enable_msix_range(struct pci_dev *dev, struct msix_entry *entries,
 		int minvec, int maxvec)
 {
-	return __pci_enable_msix_range(dev, entries, minvec, maxvec, NULL);
+	return __pci_enable_msix_range(dev, entries, minvec, maxvec, NULL, 0);
 }
 EXPORT_SYMBOL(pci_enable_msix_range);
 
@@ -1167,7 +1203,7 @@ int pci_alloc_irq_vectors_affinity(struct pci_dev *dev, unsigned int min_vecs,
 
 	if (flags & PCI_IRQ_MSIX) {
 		msix_vecs = __pci_enable_msix_range(dev, NULL, min_vecs,
-						    max_vecs, affd);
+						    max_vecs, affd, flags);
 		if (msix_vecs > 0)
 			return msix_vecs;
 	}
diff --git a/include/linux/msi.h b/include/linux/msi.h
index d48e919d55ae..8ad679e9d9c0 100644
--- a/include/linux/msi.h
+++ b/include/linux/msi.h
@@ -64,6 +64,10 @@ struct ti_sci_inta_msi_desc {
  * @msg:	The last set MSI message cached for reuse
  * @affinity:	Optional pointer to a cpu affinity mask for this descriptor
  *
+ * @write_msi_msg:	Callback that may be called when the MSI message
+ *			address or data changes
+ * @write_msi_msg_data:	Data parameter for the callback.
+ *
  * @masked:	[PCI MSI/X] Mask bits
  * @is_msix:	[PCI MSI/X] True if MSI-X
  * @multiple:	[PCI MSI/X] log2 num of messages allocated
@@ -90,6 +94,9 @@ struct msi_desc {
 	const void			*iommu_cookie;
 #endif
 
+	void (*write_msi_msg)(struct msi_desc *entry, void *data);
+	void *write_msi_msg_data;
+
 	union {
 		/* PCI MSI/X specific data */
 		struct {
@@ -100,6 +107,7 @@ struct msi_desc {
 				u8	multi_cap	: 3;
 				u8	maskbit		: 1;
 				u8	is_64		: 1;
+				u8	is_virtual	: 1;
 				u16	entry_nr;
 				unsigned default_irq;
 			} msi_attrib;
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 4a5a84d7bdd4..19b5c27c6f63 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -1362,6 +1362,15 @@ int pci_set_vga_state(struct pci_dev *pdev, bool decode,
 #define PCI_IRQ_MSI		(1 << 1) /* Allow MSI interrupts */
 #define PCI_IRQ_MSIX		(1 << 2) /* Allow MSI-X interrupts */
 #define PCI_IRQ_AFFINITY	(1 << 3) /* Auto-assign affinity */
+
+/*
+ * Virtual interrupts allow for more interrupts to be allocated
+ * than the device has interrupts for. These are not programmed
+ * into the device's MSI-X table and must be handled by some
+ * other driver means.
+ */
+#define PCI_IRQ_VIRTUAL		(1 << 4)
+
 #define PCI_IRQ_ALL_TYPES \
 	(PCI_IRQ_LEGACY | PCI_IRQ_MSI | PCI_IRQ_MSIX)
 
-- 
cgit v1.2.3


From 246a42c51bc5dd247629f86c87d5e1b7628343c4 Mon Sep 17 00:00:00 2001
From: Logan Gunthorpe <logang@deltatee.com>
Date: Thu, 23 May 2019 16:30:53 -0600
Subject: NTB: Introduce helper functions to calculate logical port number

This patch introduces the "Logical Port Number" which is similar to the
"Port Number" in that it enumerates the ports in the system.

The original (or Physical) "Port Number" can be any number used by the
hardware to uniquely identify a port in the system. The "Logical Port
Number" enumerates all ports in the system from 0 to the number of
ports minus one.

For example a system with 5 ports might have the following port numbers
which would be enumerated thusly:

Port Number:           1  2  5  7  116
Logical Port Number:   0  1  2  3  4

The logical port number is useful when calculating which resources
to use for which peers. So we thus define two helper functions:
ntb_logical_port_number() and ntb_peer_logical_port_number() which
provide the "Logical Port Number" for the local port and any peer
respectively.

Signed-off-by: Logan Gunthorpe <logang@deltatee.com>
Cc: Dave Jiang <dave.jiang@intel.com>
Cc: Allen Hubbe <allenbh@gmail.com>
Cc: Serge Semin <fancer.lancer@gmail.com>
Signed-off-by: Jon Mason <jdmason@kudzu.us>
---
 include/linux/ntb.h | 53 ++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 52 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/ntb.h b/include/linux/ntb.h
index 604abc883741..2fadd0352683 100644
--- a/include/linux/ntb.h
+++ b/include/linux/ntb.h
@@ -616,7 +616,6 @@ static inline int ntb_port_number(struct ntb_dev *ntb)
 
 	return ntb->ops->port_number(ntb);
 }
-
 /**
  * ntb_peer_port_count() - get the number of peer device ports
  * @ntb:	NTB device context.
@@ -653,6 +652,58 @@ static inline int ntb_peer_port_number(struct ntb_dev *ntb, int pidx)
 	return ntb->ops->peer_port_number(ntb, pidx);
 }
 
+/**
+ * ntb_logical_port_number() - get the logical port number of the local port
+ * @ntb:	NTB device context.
+ *
+ * The Logical Port Number is defined to be a unique number for each
+ * port starting from zero through to the number of ports minus one.
+ * This is in contrast to the Port Number where each port can be assigned
+ * any unique physical number by the hardware.
+ *
+ * The logical port number is useful for calculating the resource indexes
+ * used by peers.
+ *
+ * Return: the logical port number or negative value indicating an error
+ */
+static inline int ntb_logical_port_number(struct ntb_dev *ntb)
+{
+	int lport = ntb_port_number(ntb);
+	int pidx;
+
+	if (lport < 0)
+		return lport;
+
+	for (pidx = 0; pidx < ntb_peer_port_count(ntb); pidx++)
+		if (lport <= ntb_peer_port_number(ntb, pidx))
+			return pidx;
+
+	return pidx;
+}
+
+/**
+ * ntb_peer_logical_port_number() - get the logical peer port by given index
+ * @ntb:	NTB device context.
+ * @pidx:	Peer port index.
+ *
+ * The Logical Port Number is defined to be a unique number for each
+ * port starting from zero through to the number of ports minus one.
+ * This is in contrast to the Port Number where each port can be assigned
+ * any unique physical number by the hardware.
+ *
+ * The logical port number is useful for calculating the resource indexes
+ * used by peers.
+ *
+ * Return: the peer's logical port number or negative value indicating an error
+ */
+static inline int ntb_peer_logical_port_number(struct ntb_dev *ntb, int pidx)
+{
+	if (ntb_peer_port_number(ntb, pidx) < ntb_port_number(ntb))
+		return pidx;
+	else
+		return pidx + 1;
+}
+
 /**
  * ntb_peer_port_idx() - get the peer device port index by given port number
  * @ntb:	NTB device context.
-- 
cgit v1.2.3


From 5f1b1f065c791de71017502ed3ba46779e231d9b Mon Sep 17 00:00:00 2001
From: Logan Gunthorpe <logang@deltatee.com>
Date: Thu, 23 May 2019 16:30:54 -0600
Subject: NTB: Introduce functions to calculate multi-port resource index

When using multi-ports each port uses resources (dbs, msgs, mws, etc)
on every other port. Creating a mapping for these resources such that
each port has a corresponding resource on every other port is a bit
tricky.

Introduce the ntb_peer_resource_idx() function for this purpose.
It returns the peer resource number that will correspond with the
local peer index on the remote peer.

Also, introduce ntb_peer_highest_mw_idx() which will use
ntb_peer_resource_idx() but return the MW index starting with the
highest index and working down.

Signed-off-by: Logan Gunthorpe <logang@deltatee.com>
Cc: Dave Jiang <dave.jiang@intel.com>
Cc: Allen Hubbe <allenbh@gmail.com>
Signed-off-by: Jon Mason <jdmason@kudzu.us>
---
 include/linux/ntb.h | 70 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 70 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/ntb.h b/include/linux/ntb.h
index 2fadd0352683..bed421b9579b 100644
--- a/include/linux/ntb.h
+++ b/include/linux/ntb.h
@@ -1557,4 +1557,74 @@ static inline int ntb_peer_msg_write(struct ntb_dev *ntb, int pidx, int midx,
 	return ntb->ops->peer_msg_write(ntb, pidx, midx, msg);
 }
 
+/**
+ * ntb_peer_resource_idx() - get a resource index for a given peer idx
+ * @ntb:	NTB device context.
+ * @pidx:	Peer port index.
+ *
+ * When constructing a graph of peers, each remote peer must use a different
+ * resource index (mw, doorbell, etc) to communicate with each other
+ * peer.
+ *
+ * In a two peer system, this function should always return 0 such that
+ * resource 0 points to the remote peer on both ports.
+ *
+ * In a 5 peer system, this function will return the following matrix
+ *
+ * pidx \ port    0    1    2    3    4
+ * 0              0    0    1    2    3
+ * 1              0    1    1    2    3
+ * 2              0    1    2    2    3
+ * 3              0    1    2    3    3
+ *
+ * For example, if this function is used to program peer's memory
+ * windows, port 0 will program MW 0 on all its peers to point to itself.
+ * port 1 will program MW 0 in port 0 to point to itself and MW 1 on all
+ * other ports. etc.
+ *
+ * For the legacy two host case, ntb_port_number() and ntb_peer_port_number()
+ * both return zero and therefore this function will always return zero.
+ * So MW 0 on each host would be programmed to point to the other host.
+ *
+ * Return: the resource index to use for that peer.
+ */
+static inline int ntb_peer_resource_idx(struct ntb_dev *ntb, int pidx)
+{
+	int local_port, peer_port;
+
+	if (pidx >= ntb_peer_port_count(ntb))
+		return -EINVAL;
+
+	local_port = ntb_logical_port_number(ntb);
+	peer_port = ntb_peer_logical_port_number(ntb, pidx);
+
+	if (peer_port < local_port)
+		return local_port - 1;
+	else
+		return local_port;
+}
+
+/**
+ * ntb_peer_highest_mw_idx() - get a memory window index for a given peer idx
+ *	using the highest index memory windows first
+ *
+ * @ntb:	NTB device context.
+ * @pidx:	Peer port index.
+ *
+ * Like ntb_peer_resource_idx(), except it returns indexes starting with
+ * last memory window index.
+ *
+ * Return: the resource index to use for that peer.
+ */
+static inline int ntb_peer_highest_mw_idx(struct ntb_dev *ntb, int pidx)
+{
+	int ret;
+
+	ret = ntb_peer_resource_idx(ntb, pidx);
+	if (ret < 0)
+		return ret;
+
+	return ntb_mw_count(ntb, pidx) - ret - 1;
+}
+
 #endif
-- 
cgit v1.2.3


From 26b3a37b928457ba2cd98eaf6d7b0feca5a30fa6 Mon Sep 17 00:00:00 2001
From: Logan Gunthorpe <logang@deltatee.com>
Date: Thu, 23 May 2019 16:30:56 -0600
Subject: NTB: Introduce MSI library

The NTB MSI library allows passing MSI interrupts across a memory
window. This offers similar functionality to doorbells or messages
except will often have much better latency and the client can
potentially use significantly more remote interrupts than typical hardware
provides for doorbells. (Which can be important in high-multiport
setups.)

The library utilizes one memory window per peer and uses the highest
index memory windows. Before any ntb_msi function may be used, the user
must call ntb_msi_init(). It may then setup and tear down the memory
windows when the link state changes using ntb_msi_setup_mws() and
ntb_msi_clear_mws().

The peer which receives the interrupt must call ntbm_msi_request_irq()
to assign the interrupt handler (this function is functionally
similar to devm_request_irq()) and the returned descriptor must be
transferred to the peer which can use it to trigger the interrupt.
The triggering peer, once having received the descriptor, can
trigger the interrupt by calling ntb_msi_peer_trigger().

Signed-off-by: Logan Gunthorpe <logang@deltatee.com>
Cc: Dave Jiang <dave.jiang@intel.com>
Cc: Allen Hubbe <allenbh@gmail.com>
Signed-off-by: Jon Mason <jdmason@kudzu.us>
---
 drivers/ntb/Kconfig  |  11 ++
 drivers/ntb/Makefile |   3 +-
 drivers/ntb/msi.c    | 415 +++++++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/ntb.h  |  73 +++++++++
 4 files changed, 501 insertions(+), 1 deletion(-)
 create mode 100644 drivers/ntb/msi.c

(limited to 'include/linux')

diff --git a/drivers/ntb/Kconfig b/drivers/ntb/Kconfig
index 95944e52fa36..5760764052be 100644
--- a/drivers/ntb/Kconfig
+++ b/drivers/ntb/Kconfig
@@ -12,6 +12,17 @@ menuconfig NTB
 
 if NTB
 
+config NTB_MSI
+	bool "MSI Interrupt Support"
+	depends on PCI_MSI
+	help
+	 Support using MSI interrupt forwarding instead of (or in addition to)
+	 hardware doorbells. MSI interrupts typically offer lower latency
+	 than doorbells and more MSI interrupts can be made available to
+	 clients. However this requires an extra memory window and support
+	 in the hardware driver for creating the MSI interrupts.
+
+	 If unsure, say N.
 source "drivers/ntb/hw/Kconfig"
 
 source "drivers/ntb/test/Kconfig"
diff --git a/drivers/ntb/Makefile b/drivers/ntb/Makefile
index 537226f8e78d..cc27ad2ef150 100644
--- a/drivers/ntb/Makefile
+++ b/drivers/ntb/Makefile
@@ -1,4 +1,5 @@
 obj-$(CONFIG_NTB) += ntb.o hw/ test/
 obj-$(CONFIG_NTB_TRANSPORT) += ntb_transport.o
 
-ntb-y := core.o
+ntb-y			:= core.o
+ntb-$(CONFIG_NTB_MSI)	+= msi.o
diff --git a/drivers/ntb/msi.c b/drivers/ntb/msi.c
new file mode 100644
index 000000000000..9dddf133658f
--- /dev/null
+++ b/drivers/ntb/msi.c
@@ -0,0 +1,415 @@
+// SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause)
+
+#include <linux/irq.h>
+#include <linux/module.h>
+#include <linux/ntb.h>
+#include <linux/msi.h>
+#include <linux/pci.h>
+
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_VERSION("0.1");
+MODULE_AUTHOR("Logan Gunthorpe <logang@deltatee.com>");
+MODULE_DESCRIPTION("NTB MSI Interrupt Library");
+
+struct ntb_msi {
+	u64 base_addr;
+	u64 end_addr;
+
+	void (*desc_changed)(void *ctx);
+
+	u32 __iomem *peer_mws[];
+};
+
+/**
+ * ntb_msi_init() - Initialize the MSI context
+ * @ntb:	NTB device context
+ *
+ * This function must be called before any other ntb_msi function.
+ * It initializes the context for MSI operations and maps
+ * the peer memory windows.
+ *
+ * This function reserves the last N outbound memory windows (where N
+ * is the number of peers).
+ *
+ * Return: Zero on success, otherwise a negative error number.
+ */
+int ntb_msi_init(struct ntb_dev *ntb,
+		 void (*desc_changed)(void *ctx))
+{
+	phys_addr_t mw_phys_addr;
+	resource_size_t mw_size;
+	size_t struct_size;
+	int peer_widx;
+	int peers;
+	int ret;
+	int i;
+
+	peers = ntb_peer_port_count(ntb);
+	if (peers <= 0)
+		return -EINVAL;
+
+	struct_size = sizeof(*ntb->msi) + sizeof(*ntb->msi->peer_mws) * peers;
+
+	ntb->msi = devm_kzalloc(&ntb->dev, struct_size, GFP_KERNEL);
+	if (!ntb->msi)
+		return -ENOMEM;
+
+	ntb->msi->desc_changed = desc_changed;
+
+	for (i = 0; i < peers; i++) {
+		peer_widx = ntb_peer_mw_count(ntb) - 1 - i;
+
+		ret = ntb_peer_mw_get_addr(ntb, peer_widx, &mw_phys_addr,
+					   &mw_size);
+		if (ret)
+			goto unroll;
+
+		ntb->msi->peer_mws[i] = devm_ioremap(&ntb->dev, mw_phys_addr,
+						     mw_size);
+		if (!ntb->msi->peer_mws[i]) {
+			ret = -EFAULT;
+			goto unroll;
+		}
+	}
+
+	return 0;
+
+unroll:
+	for (i = 0; i < peers; i++)
+		if (ntb->msi->peer_mws[i])
+			devm_iounmap(&ntb->dev, ntb->msi->peer_mws[i]);
+
+	devm_kfree(&ntb->dev, ntb->msi);
+	ntb->msi = NULL;
+	return ret;
+}
+EXPORT_SYMBOL(ntb_msi_init);
+
+/**
+ * ntb_msi_setup_mws() - Initialize the MSI inbound memory windows
+ * @ntb:	NTB device context
+ *
+ * This function sets up the required inbound memory windows. It should be
+ * called from a work function after a link up event.
+ *
+ * Over the entire network, this function will reserve the last N
+ * inbound memory windows for each peer (where N is the number of peers).
+ *
+ * ntb_msi_init() must be called before this function.
+ *
+ * Return: Zero on success, otherwise a negative error number.
+ */
+int ntb_msi_setup_mws(struct ntb_dev *ntb)
+{
+	struct msi_desc *desc;
+	u64 addr;
+	int peer, peer_widx;
+	resource_size_t addr_align, size_align, size_max;
+	resource_size_t mw_size = SZ_32K;
+	resource_size_t mw_min_size = mw_size;
+	int i;
+	int ret;
+
+	if (!ntb->msi)
+		return -EINVAL;
+
+	desc = first_msi_entry(&ntb->pdev->dev);
+	addr = desc->msg.address_lo + ((uint64_t)desc->msg.address_hi << 32);
+
+	for (peer = 0; peer < ntb_peer_port_count(ntb); peer++) {
+		peer_widx = ntb_peer_highest_mw_idx(ntb, peer);
+		if (peer_widx < 0)
+			return peer_widx;
+
+		ret = ntb_mw_get_align(ntb, peer, peer_widx, &addr_align,
+				       NULL, NULL);
+		if (ret)
+			return ret;
+
+		addr &= ~(addr_align - 1);
+	}
+
+	for (peer = 0; peer < ntb_peer_port_count(ntb); peer++) {
+		peer_widx = ntb_peer_highest_mw_idx(ntb, peer);
+		if (peer_widx < 0) {
+			ret = peer_widx;
+			goto error_out;
+		}
+
+		ret = ntb_mw_get_align(ntb, peer, peer_widx, NULL,
+				       &size_align, &size_max);
+		if (ret)
+			goto error_out;
+
+		mw_size = round_up(mw_size, size_align);
+		mw_size = max(mw_size, size_max);
+		if (mw_size < mw_min_size)
+			mw_min_size = mw_size;
+
+		ret = ntb_mw_set_trans(ntb, peer, peer_widx,
+				       addr, mw_size);
+		if (ret)
+			goto error_out;
+	}
+
+	ntb->msi->base_addr = addr;
+	ntb->msi->end_addr = addr + mw_min_size;
+
+	return 0;
+
+error_out:
+	for (i = 0; i < peer; i++) {	/* undo peers set up so far */
+		peer_widx = ntb_peer_highest_mw_idx(ntb, i);
+		if (peer_widx < 0)
+			continue;
+
+		ntb_mw_clear_trans(ntb, i, peer_widx);
+	}
+
+	return ret;
+}
+EXPORT_SYMBOL(ntb_msi_setup_mws);
+
+/**
+ * ntb_msi_clear_mws() - Clear all inbound memory windows
+ * @ntb:	NTB device context
+ *
+ * This function tears down the resources used by ntb_msi_setup_mws().
+ */
+void ntb_msi_clear_mws(struct ntb_dev *ntb)
+{
+	int peer;
+	int peer_widx;
+
+	for (peer = 0; peer < ntb_peer_port_count(ntb); peer++) {
+		peer_widx = ntb_peer_highest_mw_idx(ntb, peer);
+		if (peer_widx < 0)
+			continue;
+
+		ntb_mw_clear_trans(ntb, peer, peer_widx);
+	}
+}
+EXPORT_SYMBOL(ntb_msi_clear_mws);
+
+struct ntb_msi_devres {
+	struct ntb_dev *ntb;
+	struct msi_desc *entry;
+	struct ntb_msi_desc *msi_desc;
+};
+
+static int ntb_msi_set_desc(struct ntb_dev *ntb, struct msi_desc *entry,
+			    struct ntb_msi_desc *msi_desc)
+{
+	u64 addr;
+
+	addr = entry->msg.address_lo +
+		((uint64_t)entry->msg.address_hi << 32);
+
+	if (addr < ntb->msi->base_addr || addr >= ntb->msi->end_addr) {
+		dev_warn_once(&ntb->dev,
+			      "IRQ %d: MSI Address not within the memory window (%llx, [%llx %llx])\n",
+			      entry->irq, addr, ntb->msi->base_addr,
+			      ntb->msi->end_addr);
+		return -EFAULT;
+	}
+
+	msi_desc->addr_offset = addr - ntb->msi->base_addr;
+	msi_desc->data = entry->msg.data;
+
+	return 0;
+}
+
+static void ntb_msi_write_msg(struct msi_desc *entry, void *data)
+{
+	struct ntb_msi_devres *dr = data;
+
+	WARN_ON(ntb_msi_set_desc(dr->ntb, entry, dr->msi_desc));
+
+	if (dr->ntb->msi->desc_changed)
+		dr->ntb->msi->desc_changed(dr->ntb->ctx);
+}
+
+static void ntbm_msi_callback_release(struct device *dev, void *res)
+{
+	struct ntb_msi_devres *dr = res;
+
+	dr->entry->write_msi_msg = NULL;
+	dr->entry->write_msi_msg_data = NULL;
+}
+
+static int ntbm_msi_setup_callback(struct ntb_dev *ntb, struct msi_desc *entry,
+				   struct ntb_msi_desc *msi_desc)
+{
+	struct ntb_msi_devres *dr;
+
+	dr = devres_alloc(ntbm_msi_callback_release,
+			  sizeof(struct ntb_msi_devres), GFP_KERNEL);
+	if (!dr)
+		return -ENOMEM;
+
+	dr->ntb = ntb;
+	dr->entry = entry;
+	dr->msi_desc = msi_desc;
+
+	devres_add(&ntb->dev, dr);
+
+	dr->entry->write_msi_msg = ntb_msi_write_msg;
+	dr->entry->write_msi_msg_data = dr;
+
+	return 0;
+}
+
+/**
+ * ntbm_msi_request_threaded_irq() - allocate an MSI interrupt
+ * @ntb:	NTB device context
+ * @handler:	Function to be called when the IRQ occurs
+ * @thread_fn:  Function to be called in a threaded interrupt context. NULL
+ *              for clients which handle everything in @handler
+ * @name:       An ascii name for the claiming device, dev_name(dev) if NULL
+ * @dev_id:     A cookie passed back to the handler function
+ *
+ * This function assigns an interrupt handler to an unused
+ * MSI interrupt and returns the descriptor used to trigger
+ * it. The descriptor can then be sent to a peer to trigger
+ * the interrupt.
+ *
+ * The interrupt resource is managed with devres so it will
+ * be automatically freed when the NTB device is torn down.
+ *
+ * If an IRQ allocated with this function needs to be freed
+ * separately, ntbm_msi_free_irq() must be used.
+ *
+ * Return: IRQ number assigned on success, otherwise a negative error number.
+ */
+int ntbm_msi_request_threaded_irq(struct ntb_dev *ntb, irq_handler_t handler,
+				  irq_handler_t thread_fn,
+				  const char *name, void *dev_id,
+				  struct ntb_msi_desc *msi_desc)
+{
+	struct msi_desc *entry;
+	struct irq_desc *desc;
+	int ret;
+
+	if (!ntb->msi)
+		return -EINVAL;
+
+	for_each_pci_msi_entry(entry, ntb->pdev) {
+		desc = irq_to_desc(entry->irq);
+		if (desc->action)
+			continue;
+
+		ret = devm_request_threaded_irq(&ntb->dev, entry->irq, handler,
+						thread_fn, 0, name, dev_id);
+		if (ret)
+			continue;
+
+		if (ntb_msi_set_desc(ntb, entry, msi_desc)) {
+			devm_free_irq(&ntb->dev, entry->irq, dev_id);
+			continue;
+		}
+
+		ret = ntbm_msi_setup_callback(ntb, entry, msi_desc);
+		if (ret) {
+			devm_free_irq(&ntb->dev, entry->irq, dev_id);
+			return ret;
+		}
+
+
+		return entry->irq;
+	}
+
+	return -ENODEV;
+}
+EXPORT_SYMBOL(ntbm_msi_request_threaded_irq);
+
+static int ntbm_msi_callback_match(struct device *dev, void *res, void *data)
+{
+	struct ntb_dev *ntb = dev_ntb(dev);
+	struct ntb_msi_devres *dr = res;
+
+	return dr->ntb == ntb && dr->entry == data;
+}
+
+/**
+ * ntbm_msi_free_irq() - free an interrupt
+ * @ntb:	NTB device context
+ * @irq:	Interrupt line to free
+ * @dev_id:	Device identity to free
+ *
+ * This function should be used to manually free IRQs allocated with
+ * ntbm_msi_request_[threaded_]irq().
+ */
+void ntbm_msi_free_irq(struct ntb_dev *ntb, unsigned int irq, void *dev_id)
+{
+	struct msi_desc *entry = irq_get_msi_desc(irq);
+
+	entry->write_msi_msg = NULL;
+	entry->write_msi_msg_data = NULL;
+
+	WARN_ON(devres_destroy(&ntb->dev, ntbm_msi_callback_release,
+			       ntbm_msi_callback_match, entry));
+
+	devm_free_irq(&ntb->dev, irq, dev_id);
+}
+EXPORT_SYMBOL(ntbm_msi_free_irq);
+
+/**
+ * ntb_msi_peer_trigger() - Trigger an interrupt handler on a peer
+ * @ntb:	NTB device context
+ * @peer:	Peer index
+ * @desc:	MSI descriptor data which triggers the interrupt
+ *
+ * This function triggers an interrupt on a peer. It requires
+ * the descriptor structure to have been passed from that peer
+ * by some other means.
+ *
+ * Return: Zero on success, otherwise a negative error number.
+ */
+int ntb_msi_peer_trigger(struct ntb_dev *ntb, int peer,
+			 struct ntb_msi_desc *desc)
+{
+	int idx;
+
+	if (!ntb->msi)
+		return -EINVAL;
+
+	idx = desc->addr_offset / sizeof(*ntb->msi->peer_mws[peer]);
+
+	iowrite32(desc->data, &ntb->msi->peer_mws[peer][idx]);
+
+	return 0;
+}
+EXPORT_SYMBOL(ntb_msi_peer_trigger);
+
+/**
+ * ntb_msi_peer_addr() - Get the DMA address to trigger a peer's MSI interrupt
+ * @ntb:	NTB device context
+ * @peer:	Peer index
+ * @desc:	MSI descriptor data which triggers the interrupt
+ * @msi_addr:   Physical address to trigger the interrupt
+ *
+ * This function allows using DMA engines to trigger an interrupt
+ * (for example, trigger an interrupt to process the data after
+ * sending it). To trigger the interrupt, write @desc.data to the address
+ * returned in @msi_addr
+ *
+ * Return: Zero on success, otherwise a negative error number.
+ */
+int ntb_msi_peer_addr(struct ntb_dev *ntb, int peer,
+		      struct ntb_msi_desc *desc,
+		      phys_addr_t *msi_addr)
+{
+	int peer_widx = ntb_peer_mw_count(ntb) - 1 - peer;
+	phys_addr_t mw_phys_addr;
+	int ret;
+
+	ret = ntb_peer_mw_get_addr(ntb, peer_widx, &mw_phys_addr, NULL);
+	if (ret)
+		return ret;
+
+	if (msi_addr)
+		*msi_addr = mw_phys_addr + desc->addr_offset;
+
+	return 0;
+}
+EXPORT_SYMBOL(ntb_msi_peer_addr);
diff --git a/include/linux/ntb.h b/include/linux/ntb.h
index bed421b9579b..8c13538aeffe 100644
--- a/include/linux/ntb.h
+++ b/include/linux/ntb.h
@@ -58,9 +58,11 @@
 
 #include <linux/completion.h>
 #include <linux/device.h>
+#include <linux/interrupt.h>
 
 struct ntb_client;
 struct ntb_dev;
+struct ntb_msi;
 struct pci_dev;
 
 /**
@@ -426,6 +428,10 @@ struct ntb_dev {
 	spinlock_t			ctx_lock;
 	/* block unregister until device is fully released */
 	struct completion		released;
+
+#ifdef CONFIG_NTB_MSI
+	struct ntb_msi *msi;
+#endif
 };
 #define dev_ntb(__dev) container_of((__dev), struct ntb_dev, dev)
 
@@ -1627,4 +1633,71 @@ static inline int ntb_peer_highest_mw_idx(struct ntb_dev *ntb, int pidx)
 	return ntb_mw_count(ntb, pidx) - ret - 1;
 }
 
+struct ntb_msi_desc {
+	u32 addr_offset;
+	u32 data;
+};
+
+#ifdef CONFIG_NTB_MSI
+
+int ntb_msi_init(struct ntb_dev *ntb, void (*desc_changed)(void *ctx));
+int ntb_msi_setup_mws(struct ntb_dev *ntb);
+void ntb_msi_clear_mws(struct ntb_dev *ntb);
+int ntbm_msi_request_threaded_irq(struct ntb_dev *ntb, irq_handler_t handler,
+				  irq_handler_t thread_fn,
+				  const char *name, void *dev_id,
+				  struct ntb_msi_desc *msi_desc);
+void ntbm_msi_free_irq(struct ntb_dev *ntb, unsigned int irq, void *dev_id);
+int ntb_msi_peer_trigger(struct ntb_dev *ntb, int peer,
+			 struct ntb_msi_desc *desc);
+int ntb_msi_peer_addr(struct ntb_dev *ntb, int peer,
+		      struct ntb_msi_desc *desc,
+		      phys_addr_t *msi_addr);
+
+#else /* not CONFIG_NTB_MSI */
+
+static inline int ntb_msi_init(struct ntb_dev *ntb,
+			       void (*desc_changed)(void *ctx))
+{
+	return -EOPNOTSUPP;
+}
+static inline int ntb_msi_setup_mws(struct ntb_dev *ntb)
+{
+	return -EOPNOTSUPP;
+}
+static inline void ntb_msi_clear_mws(struct ntb_dev *ntb) {}
+static inline int ntbm_msi_request_threaded_irq(struct ntb_dev *ntb,
+						irq_handler_t handler,
+						irq_handler_t thread_fn,
+						const char *name, void *dev_id,
+						struct ntb_msi_desc *msi_desc)
+{
+	return -EOPNOTSUPP;
+}
+static inline void ntbm_msi_free_irq(struct ntb_dev *ntb, unsigned int irq,
+				     void *dev_id) {}
+static inline int ntb_msi_peer_trigger(struct ntb_dev *ntb, int peer,
+				       struct ntb_msi_desc *desc)
+{
+	return -EOPNOTSUPP;
+}
+static inline int ntb_msi_peer_addr(struct ntb_dev *ntb, int peer,
+				    struct ntb_msi_desc *desc,
+				    phys_addr_t *msi_addr)
+{
+	return -EOPNOTSUPP;
+
+}
+
+#endif /* CONFIG_NTB_MSI */
+
+static inline int ntbm_msi_request_irq(struct ntb_dev *ntb,
+				       irq_handler_t handler,
+				       const char *name, void *dev_id,
+				       struct ntb_msi_desc *msi_desc)
+{
+	return ntbm_msi_request_threaded_irq(ntb, handler, NULL, name,
+					     dev_id, msi_desc);
+}
+
 #endif
-- 
cgit v1.2.3


From fd742eaab827b47c5f2de6c1811a17075608da60 Mon Sep 17 00:00:00 2001
From: Linus Walleij <linus.walleij@linaro.org>
Date: Sun, 9 Jun 2019 13:48:12 +0200
Subject: regulator: max8952: Convert to use GPIO descriptors

This finalizes the descriptor conversion of the MAX8952 driver
by letting the VID0 and VID1 GPIOs be fetched from descriptors.

Both VID0 and VID1 must be supplied for the VID selection to work;
I add some code to preserve the semantics that if only one of
the two VID gpios is supplied, it will be initialized to low.
This might be a bit overzealous, but I want to preserve any
implicit semantics.

This is currently only used by device tree in-kernel but it is
still also possible to supply the same GPIOs using a machine
descriptor table if a board file is used.

Ideally this should be phased over to using gpio-regulator.c
that does the same thing, but it might require some refactoring
and needs testing on real hardware.

Cc: Tomasz Figa <tfiga@chromium.org>
Cc: MyungJoo Ham <myungjoo.ham@samsung.com>
Cc: Marek Szyprowski <m.szyprowski@samsung.com>
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 drivers/regulator/max8952.c       | 62 ++++++++++++++++++---------------------
 include/linux/regulator/max8952.h |  3 --
 2 files changed, 28 insertions(+), 37 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/regulator/max8952.c b/drivers/regulator/max8952.c
index 451237efb359..8f0e4dc810f0 100644
--- a/drivers/regulator/max8952.c
+++ b/drivers/regulator/max8952.c
@@ -26,11 +26,9 @@
 #include <linux/platform_device.h>
 #include <linux/regulator/driver.h>
 #include <linux/regulator/max8952.h>
-#include <linux/gpio.h>
 #include <linux/gpio/consumer.h>
 #include <linux/io.h>
 #include <linux/of.h>
-#include <linux/of_gpio.h>
 #include <linux/regulator/of_regulator.h>
 #include <linux/slab.h>
 
@@ -50,7 +48,8 @@ enum {
 struct max8952_data {
 	struct i2c_client	*client;
 	struct max8952_platform_data *pdata;
-
+	struct gpio_desc *vid0_gpiod;
+	struct gpio_desc *vid1_gpiod;
 	bool vid0;
 	bool vid1;
 };
@@ -100,16 +99,15 @@ static int max8952_set_voltage_sel(struct regulator_dev *rdev,
 {
 	struct max8952_data *max8952 = rdev_get_drvdata(rdev);
 
-	if (!gpio_is_valid(max8952->pdata->gpio_vid0) ||
-			!gpio_is_valid(max8952->pdata->gpio_vid1)) {
+	if (!max8952->vid0_gpiod || !max8952->vid1_gpiod) {
 		/* DVS not supported */
 		return -EPERM;
 	}
 
 	max8952->vid0 = selector & 0x1;
 	max8952->vid1 = (selector >> 1) & 0x1;
-	gpio_set_value(max8952->pdata->gpio_vid0, max8952->vid0);
-	gpio_set_value(max8952->pdata->gpio_vid1, max8952->vid1);
+	gpiod_set_value(max8952->vid0_gpiod, max8952->vid0);
+	gpiod_set_value(max8952->vid1_gpiod, max8952->vid1);
 
 	return 0;
 }
@@ -147,9 +145,6 @@ static struct max8952_platform_data *max8952_parse_dt(struct device *dev)
 	if (!pd)
 		return NULL;
 
-	pd->gpio_vid0 = of_get_named_gpio(np, "max8952,vid-gpios", 0);
-	pd->gpio_vid1 = of_get_named_gpio(np, "max8952,vid-gpios", 1);
-
 	if (of_property_read_u32(np, "max8952,default-mode", &pd->default_mode))
 		dev_warn(dev, "Default mode not specified, assuming 0\n");
 
@@ -200,7 +195,7 @@ static int max8952_pmic_probe(struct i2c_client *client,
 	struct gpio_desc *gpiod;
 	enum gpiod_flags gflags;
 
-	int ret = 0, err = 0;
+	int ret = 0;
 
 	if (client->dev.of_node)
 		pdata = max8952_parse_dt(&client->dev);
@@ -253,32 +248,31 @@ static int max8952_pmic_probe(struct i2c_client *client,
 	max8952->vid0 = pdata->default_mode & 0x1;
 	max8952->vid1 = (pdata->default_mode >> 1) & 0x1;
 
-	if (gpio_is_valid(pdata->gpio_vid0) &&
-			gpio_is_valid(pdata->gpio_vid1)) {
-		unsigned long gpio_flags;
-
-		gpio_flags = max8952->vid0 ?
-			     GPIOF_OUT_INIT_HIGH : GPIOF_OUT_INIT_LOW;
-		if (devm_gpio_request_one(&client->dev, pdata->gpio_vid0,
-					  gpio_flags, "MAX8952 VID0"))
-			err = 1;
-
-		gpio_flags = max8952->vid1 ?
-			     GPIOF_OUT_INIT_HIGH : GPIOF_OUT_INIT_LOW;
-		if (devm_gpio_request_one(&client->dev, pdata->gpio_vid1,
-					  gpio_flags, "MAX8952 VID1"))
-			err = 2;
-	} else
-		err = 3;
-
-	if (err) {
+	/* Fetch vid0 and vid1 GPIOs if available */
+	gflags = max8952->vid0 ? GPIOD_OUT_HIGH : GPIOD_OUT_LOW;
+	max8952->vid0_gpiod = devm_gpiod_get_index_optional(&client->dev,
+							    "max8952,vid",
+							    0, gflags);
+	if (IS_ERR(max8952->vid0_gpiod))
+		return PTR_ERR(max8952->vid0_gpiod);
+	gflags = max8952->vid1 ? GPIOD_OUT_HIGH : GPIOD_OUT_LOW;
+	max8952->vid1_gpiod = devm_gpiod_get_index_optional(&client->dev,
+							    "max8952,vid",
+							    1, gflags);
+	if (IS_ERR(max8952->vid1_gpiod))
+		return PTR_ERR(max8952->vid1_gpiod);
+
+	/* If either VID GPIO is missing just disable this */
+	if (!max8952->vid0_gpiod || !max8952->vid1_gpiod) {
 		dev_warn(&client->dev, "VID0/1 gpio invalid: "
-				"DVS not available.\n");
+			 "DVS not available.\n");
 		max8952->vid0 = 0;
 		max8952->vid1 = 0;
-		/* Mark invalid */
-		pdata->gpio_vid0 = -1;
-		pdata->gpio_vid1 = -1;
+		/* Make sure if we have any descriptors they get set to low */
+		if (max8952->vid0_gpiod)
+			gpiod_set_value(max8952->vid0_gpiod, 0);
+		if (max8952->vid1_gpiod)
+			gpiod_set_value(max8952->vid1_gpiod, 0);
 
 		/* Disable Pulldown of EN only */
 		max8952_write_reg(max8952, MAX8952_REG_CONTROL, 0x60);
diff --git a/include/linux/regulator/max8952.h b/include/linux/regulator/max8952.h
index 686c42c041b5..33b6e2c09c05 100644
--- a/include/linux/regulator/max8952.h
+++ b/include/linux/regulator/max8952.h
@@ -118,9 +118,6 @@ enum {
 #define MAX8952_NUM_DVS_MODE	4
 
 struct max8952_platform_data {
-	int gpio_vid0;
-	int gpio_vid1;
-
 	u32 default_mode;
 	u32 dvs_mode[MAX8952_NUM_DVS_MODE]; /* MAX8952_DVS_MODEx_XXXXmV */
 
-- 
cgit v1.2.3


From 86eec50beaf3a45f6432d491072fa5c54284dbca Mon Sep 17 00:00:00 2001
From: Bodong Wang <bodong@mellanox.com>
Date: Mon, 10 Jun 2019 23:38:19 +0000
Subject: net/mlx5: Support querying max VFs from device

For ECPF with eswitch manager privilege, query the host max VF count
by querying the device using query_functions command.

With this enhancement:
1. flow steering entries are created only for valid vports based on
   the max VF count of the PF.
2. Driver only queries cap of valid vport.

Eswitch requires the max VFs when doing initialization, so do sr-iov
init before eswitch init.

Signed-off-by: Bodong Wang <bodong@mellanox.com>
Reviewed-by: Parav Pandit <parav@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/main.c  | 18 +++++++++---------
 drivers/net/ethernet/mellanox/mlx5/core/sriov.c | 22 ++++++++++++++++++++++
 include/linux/mlx5/driver.h                     |  7 ++-----
 include/linux/mlx5/mlx5_ifc.h                   |  2 +-
 4 files changed, 34 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c
index 8e96c42d3b84..720f65bfe6a9 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c
@@ -844,32 +844,32 @@ static int mlx5_init_once(struct mlx5_core_dev *dev)
 		goto err_rl_cleanup;
 	}
 
-	err = mlx5_eswitch_init(dev);
+	err = mlx5_sriov_init(dev);
 	if (err) {
-		mlx5_core_err(dev, "Failed to init eswitch %d\n", err);
+		mlx5_core_err(dev, "Failed to init sriov %d\n", err);
 		goto err_mpfs_cleanup;
 	}
 
-	err = mlx5_sriov_init(dev);
+	err = mlx5_eswitch_init(dev);
 	if (err) {
-		mlx5_core_err(dev, "Failed to init sriov %d\n", err);
-		goto err_eswitch_cleanup;
+		mlx5_core_err(dev, "Failed to init eswitch %d\n", err);
+		goto err_sriov_cleanup;
 	}
 
 	err = mlx5_fpga_init(dev);
 	if (err) {
 		mlx5_core_err(dev, "Failed to init fpga device %d\n", err);
-		goto err_sriov_cleanup;
+		goto err_eswitch_cleanup;
 	}
 
 	dev->tracer = mlx5_fw_tracer_create(dev);
 
 	return 0;
 
-err_sriov_cleanup:
-	mlx5_sriov_cleanup(dev);
 err_eswitch_cleanup:
 	mlx5_eswitch_cleanup(dev->priv.eswitch);
+err_sriov_cleanup:
+	mlx5_sriov_cleanup(dev);
 err_mpfs_cleanup:
 	mlx5_mpfs_cleanup(dev);
 err_rl_cleanup:
@@ -893,8 +893,8 @@ static void mlx5_cleanup_once(struct mlx5_core_dev *dev)
 {
 	mlx5_fw_tracer_destroy(dev->tracer);
 	mlx5_fpga_cleanup(dev);
-	mlx5_sriov_cleanup(dev);
 	mlx5_eswitch_cleanup(dev->priv.eswitch);
+	mlx5_sriov_cleanup(dev);
 	mlx5_mpfs_cleanup(dev);
 	mlx5_cleanup_rl_table(dev);
 	mlx5_vxlan_destroy(dev->vxlan);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/sriov.c b/drivers/net/ethernet/mellanox/mlx5/core/sriov.c
index a249b3c3843d..2eecb831c499 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/sriov.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/sriov.c
@@ -208,6 +208,27 @@ void mlx5_sriov_detach(struct mlx5_core_dev *dev)
 	mlx5_device_disable_sriov(dev);
 }
 
+static u16 mlx5_get_max_vfs(struct mlx5_core_dev *dev)
+{
+	u32 out[MLX5_ST_SZ_DW(query_esw_functions_out)] = {};
+	u16 host_total_vfs;
+	int err;
+
+	if (mlx5_core_is_ecpf_esw_manager(dev)) {
+		err = mlx5_esw_query_functions(dev, out, sizeof(out));
+		host_total_vfs = MLX5_GET(query_esw_functions_out, out,
+					  host_params_context.host_total_vfs);
+
+		/* Old FW doesn't support getting total_vfs from esw func
+		 * but supports getting it from pci_sriov.
+		 */
+		if (!err && host_total_vfs)
+			return host_total_vfs;
+	}
+
+	return pci_sriov_get_totalvfs(dev->pdev);
+}
+
 int mlx5_sriov_init(struct mlx5_core_dev *dev)
 {
 	struct mlx5_core_sriov *sriov = &dev->priv.sriov;
@@ -218,6 +239,7 @@ int mlx5_sriov_init(struct mlx5_core_dev *dev)
 		return 0;
 
 	total_vfs = pci_sriov_get_totalvfs(pdev);
+	sriov->max_vfs = mlx5_get_max_vfs(dev);
 	sriov->num_vfs = pci_num_vf(pdev);
 	sriov->vfs_ctx = kcalloc(total_vfs, sizeof(*sriov->vfs_ctx), GFP_KERNEL);
 	if (!sriov->vfs_ctx)
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index b5431f7d97cb..64155fe201ee 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -470,6 +470,7 @@ struct mlx5_core_sriov {
 	struct mlx5_vf_context	*vfs_ctx;
 	int			num_vfs;
 	int			enabled_vfs;
+	u16			max_vfs;
 };
 
 struct mlx5_fc_stats {
@@ -1103,13 +1104,9 @@ static inline bool mlx5_ecpf_vport_exists(struct mlx5_core_dev *dev)
 	return mlx5_core_is_pf(dev) && MLX5_CAP_ESW(dev, ecpf_vport_exists);
 }
 
-#define MLX5_HOST_PF_MAX_VFS	(127u)
 static inline u16 mlx5_core_max_vfs(struct mlx5_core_dev *dev)
 {
-	if (mlx5_core_is_ecpf_esw_manager(dev))
-		return MLX5_HOST_PF_MAX_VFS;
-	else
-		return pci_sriov_get_totalvfs(dev->pdev);
+	return dev->priv.sriov.max_vfs;
 }
 
 static inline int mlx5_get_gid_table_len(u16 param)
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index 6513b985c5e9..e3c154b573a2 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -9711,7 +9711,7 @@ struct mlx5_ifc_host_params_context_bits {
 	u8         reserved_at_8[0x8];
 	u8         host_num_of_vfs[0x10];
 
-	u8         reserved_at_20[0x10];
+	u8         host_total_vfs[0x10];
 	u8         host_pci_bus[0x10];
 
 	u8         reserved_at_40[0x10];
-- 
cgit v1.2.3


From ca390799c2aa03632c294107fa7f647bcbdff428 Mon Sep 17 00:00:00 2001
From: Yuval Avnery <yuvalav@mellanox.com>
Date: Mon, 10 Jun 2019 23:38:23 +0000
Subject: net/mlx5: Change interrupt handler to call chain notifier

Multiple EQs may share the same IRQ in subsequent patches.

Instead of calling the IRQ handler directly, the EQ will register
to an atomic chain notifier.

The Linux built-in shared IRQ is not used because it forces the caller
to disable the IRQ and clear affinity before free_irq() can be called.

This patch is the first step in the separation of IRQ and EQ logic.

Signed-off-by: Yuval Avnery <yuvalav@mellanox.com>
Reviewed-by: Parav Pandit <parav@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/infiniband/hw/mlx5/mlx5_ib.h             |   1 +
 drivers/infiniband/hw/mlx5/odp.c                 |  10 +-
 drivers/net/ethernet/mellanox/mlx5/core/eq.c     | 138 +++++++++++++++--------
 drivers/net/ethernet/mellanox/mlx5/core/lib/eq.h |   9 +-
 include/linux/mlx5/eq.h                          |   3 +-
 5 files changed, 105 insertions(+), 56 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h
index 40eb8be482e4..a043af7ee366 100644
--- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
+++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
@@ -920,6 +920,7 @@ struct mlx5_ib_lb_state {
 };
 
 struct mlx5_ib_pf_eq {
+	struct notifier_block irq_nb;
 	struct mlx5_ib_dev *dev;
 	struct mlx5_eq *core;
 	struct work_struct work;
diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c
index 91507a2e9290..ac40a4fd5598 100644
--- a/drivers/infiniband/hw/mlx5/odp.c
+++ b/drivers/infiniband/hw/mlx5/odp.c
@@ -1488,9 +1488,11 @@ static void mlx5_ib_eq_pf_process(struct mlx5_ib_pf_eq *eq)
 	mlx5_eq_update_ci(eq->core, cc, 1);
 }
 
-static irqreturn_t mlx5_ib_eq_pf_int(int irq, void *eq_ptr)
+static int mlx5_ib_eq_pf_int(struct notifier_block *nb, unsigned long type,
+			     void *data)
 {
-	struct mlx5_ib_pf_eq *eq = eq_ptr;
+	struct mlx5_ib_pf_eq *eq =
+		container_of(nb, struct mlx5_ib_pf_eq, irq_nb);
 	unsigned long flags;
 
 	if (spin_trylock_irqsave(&eq->lock, flags)) {
@@ -1553,12 +1555,12 @@ mlx5_ib_create_pf_eq(struct mlx5_ib_dev *dev, struct mlx5_ib_pf_eq *eq)
 		goto err_mempool;
 	}
 
+	eq->irq_nb.notifier_call = mlx5_ib_eq_pf_int;
 	param = (struct mlx5_eq_param) {
 		.index = MLX5_EQ_PFAULT_IDX,
 		.mask = 1 << MLX5_EVENT_TYPE_PAGE_FAULT,
 		.nent = MLX5_IB_NUM_PF_EQE,
-		.context = eq,
-		.handler = mlx5_ib_eq_pf_int
+		.nb = &eq->irq_nb,
 	};
 	eq->core = mlx5_eq_create_generic(dev->mdev, "mlx5_ib_page_fault_eq", &param);
 	if (IS_ERR(eq->core)) {
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eq.c b/drivers/net/ethernet/mellanox/mlx5/core/eq.c
index 28defeaca80a..590c0fefaa25 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eq.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eq.c
@@ -72,16 +72,16 @@ enum {
 static_assert(MLX5_EQ_POLLING_BUDGET <= MLX5_NUM_SPARE_EQE);
 
 struct mlx5_irq_info {
+	struct atomic_notifier_head nh;
 	cpumask_var_t mask;
 	char name[MLX5_MAX_IRQ_NAME];
-	void *context; /* dev_id provided to request_irq */
 };
 
 struct mlx5_eq_table {
 	struct list_head        comp_eqs_list;
-	struct mlx5_eq          pages_eq;
-	struct mlx5_eq	        cmd_eq;
-	struct mlx5_eq          async_eq;
+	struct mlx5_eq_async    pages_eq;
+	struct mlx5_eq_async    cmd_eq;
+	struct mlx5_eq_async    async_eq;
 
 	struct atomic_notifier_head nh[MLX5_EVENT_TYPE_MAX];
 
@@ -109,6 +109,31 @@ struct mlx5_eq_table {
 			       (1ull << MLX5_EVENT_TYPE_SRQ_LAST_WQE)	    | \
 			       (1ull << MLX5_EVENT_TYPE_SRQ_RQ_LIMIT))
 
+static struct mlx5_irq_info *mlx5_irq_get(struct mlx5_core_dev *dev, int vecidx)
+{
+	struct mlx5_eq_table *eq_table = dev->priv.eq_table;
+
+	return &eq_table->irq_info[vecidx];
+}
+
+static int mlx5_irq_attach_nb(struct mlx5_irq_info *irq,
+			      struct notifier_block *nb)
+{
+	return atomic_notifier_chain_register(&irq->nh, nb);
+}
+
+static int mlx5_irq_detach_nb(struct mlx5_irq_info *irq,
+			      struct notifier_block *nb)
+{
+	return atomic_notifier_chain_unregister(&irq->nh, nb);
+}
+
+static irqreturn_t mlx5_irq_int_handler(int irq, void *nh)
+{
+	atomic_notifier_call_chain(nh, 0, NULL);
+	return IRQ_HANDLED;
+}
+
 static int mlx5_cmd_destroy_eq(struct mlx5_core_dev *dev, u8 eqn)
 {
 	u32 out[MLX5_ST_SZ_DW(destroy_eq_out)] = {0};
@@ -134,10 +159,13 @@ static struct mlx5_core_cq *mlx5_eq_cq_get(struct mlx5_eq *eq, u32 cqn)
 	return cq;
 }
 
-static irqreturn_t mlx5_eq_comp_int(int irq, void *eq_ptr)
+static int mlx5_eq_comp_int(struct notifier_block *nb,
+			    __always_unused unsigned long action,
+			    __always_unused void *data)
 {
-	struct mlx5_eq_comp *eq_comp = eq_ptr;
-	struct mlx5_eq *eq = eq_ptr;
+	struct mlx5_eq_comp *eq_comp =
+		container_of(nb, struct mlx5_eq_comp, irq_nb);
+	struct mlx5_eq *eq = &eq_comp->core;
 	struct mlx5_eqe *eqe;
 	int num_eqes = 0;
 	u32 cqn = -1;
@@ -175,7 +203,7 @@ out:
 	if (cqn != -1)
 		tasklet_schedule(&eq_comp->tasklet_ctx.task);
 
-	return IRQ_HANDLED;
+	return 0;
 }
 
 /* Some architectures don't latch interrupts when they are disabled, so using
@@ -189,16 +217,19 @@ u32 mlx5_eq_poll_irq_disabled(struct mlx5_eq_comp *eq)
 
 	disable_irq(eq->core.irqn);
 	count_eqe = eq->core.cons_index;
-	mlx5_eq_comp_int(eq->core.irqn, eq);
+	mlx5_eq_comp_int(&eq->irq_nb, 0, NULL);
 	count_eqe = eq->core.cons_index - count_eqe;
 	enable_irq(eq->core.irqn);
 
 	return count_eqe;
 }
 
-static irqreturn_t mlx5_eq_async_int(int irq, void *eq_ptr)
+static int mlx5_eq_async_int(struct notifier_block *nb,
+			     unsigned long action, void *data)
 {
-	struct mlx5_eq *eq = eq_ptr;
+	struct mlx5_eq_async *eq_async =
+		container_of(nb, struct mlx5_eq_async, irq_nb);
+	struct mlx5_eq *eq = &eq_async->core;
 	struct mlx5_eq_table *eqt;
 	struct mlx5_core_dev *dev;
 	struct mlx5_eqe *eqe;
@@ -232,7 +263,7 @@ static irqreturn_t mlx5_eq_async_int(int irq, void *eq_ptr)
 out:
 	eq_update_ci(eq, 1);
 
-	return IRQ_HANDLED;
+	return 0;
 }
 
 static void init_eq_buf(struct mlx5_eq *eq)
@@ -254,6 +285,7 @@ create_map_eq(struct mlx5_core_dev *dev, struct mlx5_eq *eq, const char *name,
 	struct mlx5_cq_table *cq_table = &eq->cq_table;
 	u32 out[MLX5_ST_SZ_DW(create_eq_out)] = {0};
 	struct mlx5_priv *priv = &dev->priv;
+	struct mlx5_irq_info *irq_info;
 	u8 vecidx = param->index;
 	__be64 *pas;
 	void *eqc;
@@ -261,9 +293,6 @@ create_map_eq(struct mlx5_core_dev *dev, struct mlx5_eq *eq, const char *name,
 	u32 *in;
 	int err;
 
-	if (eq_table->irq_info[vecidx].context)
-		return -EEXIST;
-
 	/* Init CQ table */
 	memset(cq_table, 0, sizeof(*cq_table));
 	spin_lock_init(&cq_table->lock);
@@ -306,24 +335,31 @@ create_map_eq(struct mlx5_core_dev *dev, struct mlx5_eq *eq, const char *name,
 	if (err)
 		goto err_in;
 
-	snprintf(eq_table->irq_info[vecidx].name, MLX5_MAX_IRQ_NAME, "%s@pci:%s",
-		 name, pci_name(dev->pdev));
-	eq_table->irq_info[vecidx].context = param->context;
+	irq_info = mlx5_irq_get(dev, vecidx);
+	ATOMIC_INIT_NOTIFIER_HEAD(&irq_info->nh);
+	snprintf(irq_info->name, MLX5_MAX_IRQ_NAME,
+		 "%s@pci:%s", name, pci_name(dev->pdev));
 
 	eq->vecidx = vecidx;
 	eq->eqn = MLX5_GET(create_eq_out, out, eq_number);
 	eq->irqn = pci_irq_vector(dev->pdev, vecidx);
 	eq->dev = dev;
 	eq->doorbell = priv->uar->map + MLX5_EQ_DOORBEL_OFFSET;
-	err = request_irq(eq->irqn, param->handler, 0,
-			  eq_table->irq_info[vecidx].name, param->context);
+	eq->irq_nb = param->nb;
+
+	err = request_irq(eq->irqn, mlx5_irq_int_handler, 0, irq_info->name,
+			  &irq_info->nh);
 	if (err)
 		goto err_eq;
 
-	err = mlx5_debug_eq_add(dev, eq);
+	err = mlx5_irq_attach_nb(irq_info, param->nb);
 	if (err)
 		goto err_irq;
 
+	err = mlx5_debug_eq_add(dev, eq);
+	if (err)
+		goto err_detach;
+
 	/* EQs are created in ARMED state
 	 */
 	eq_update_ci(eq, 1);
@@ -331,8 +367,11 @@ create_map_eq(struct mlx5_core_dev *dev, struct mlx5_eq *eq, const char *name,
 	kvfree(in);
 	return 0;
 
+err_detach:
+	mlx5_irq_detach_nb(irq_info, param->nb);
+
 err_irq:
-	free_irq(eq->irqn, eq);
+	free_irq(eq->irqn, &eq_table->irq_info[vecidx].nh);
 
 err_eq:
 	mlx5_cmd_destroy_eq(dev, eq->eqn);
@@ -355,9 +394,11 @@ static int destroy_unmap_eq(struct mlx5_core_dev *dev, struct mlx5_eq *eq)
 
 	mlx5_debug_eq_remove(dev, eq);
 
-	free_irq(eq->irqn, irq_info->context);
-	irq_info->context = NULL;
-
+	err = mlx5_irq_detach_nb(irq_info, eq->irq_nb);
+	if (err)
+		mlx5_core_warn(eq->dev, "eq failed to detach from irq. err %d",
+			       err);
+	free_irq(eq->irqn, &eq_table->irq_info[eq->vecidx].nh);
 	err = mlx5_cmd_destroy_eq(dev, eq->eqn);
 	if (err)
 		mlx5_core_warn(dev, "failed to destroy a previously created eq: eqn %d\n",
@@ -479,7 +520,7 @@ static int cq_err_event_notifier(struct notifier_block *nb,
 	/* type == MLX5_EVENT_TYPE_CQ_ERROR */
 
 	eqt = mlx5_nb_cof(nb, struct mlx5_eq_table, cq_err_nb);
-	eq  = &eqt->async_eq;
+	eq  = &eqt->async_eq.core;
 	eqe = data;
 
 	cqn = be32_to_cpu(eqe->data.cq_err.cqn) & 0xffffff;
@@ -548,14 +589,14 @@ static int create_async_eqs(struct mlx5_core_dev *dev)
 	MLX5_NB_INIT(&table->cq_err_nb, cq_err_event_notifier, CQ_ERROR);
 	mlx5_eq_notifier_register(dev, &table->cq_err_nb);
 
+	table->cmd_eq.irq_nb.notifier_call = mlx5_eq_async_int;
 	param = (struct mlx5_eq_param) {
 		.index = MLX5_EQ_CMD_IDX,
 		.mask = 1ull << MLX5_EVENT_TYPE_CMD,
 		.nent = MLX5_NUM_CMD_EQE,
-		.context = &table->cmd_eq,
-		.handler = mlx5_eq_async_int,
+		.nb = &table->cmd_eq.irq_nb,
 	};
-	err = create_async_eq(dev, "mlx5_cmd_eq", &table->cmd_eq, &param);
+	err = create_async_eq(dev, "mlx5_cmd_eq", &table->cmd_eq.core, &param);
 	if (err) {
 		mlx5_core_warn(dev, "failed to create cmd EQ %d\n", err);
 		goto err0;
@@ -563,27 +604,29 @@ static int create_async_eqs(struct mlx5_core_dev *dev)
 
 	mlx5_cmd_use_events(dev);
 
+	table->async_eq.irq_nb.notifier_call = mlx5_eq_async_int;
 	param = (struct mlx5_eq_param) {
 		.index = MLX5_EQ_ASYNC_IDX,
 		.mask = gather_async_events_mask(dev),
 		.nent = MLX5_NUM_ASYNC_EQE,
-		.context = &table->async_eq,
-		.handler = mlx5_eq_async_int,
+		.nb = &table->async_eq.irq_nb,
 	};
-	err = create_async_eq(dev, "mlx5_async_eq", &table->async_eq, &param);
+	err = create_async_eq(dev, "mlx5_async_eq",
+			      &table->async_eq.core, &param);
 	if (err) {
 		mlx5_core_warn(dev, "failed to create async EQ %d\n", err);
 		goto err1;
 	}
 
+	table->pages_eq.irq_nb.notifier_call = mlx5_eq_async_int;
 	param = (struct mlx5_eq_param) {
 		.index = MLX5_EQ_PAGEREQ_IDX,
 		.mask =  1 << MLX5_EVENT_TYPE_PAGE_REQUEST,
 		.nent = /* TODO: sriov max_vf + */ 1,
-		.context = &table->pages_eq,
-		.handler = mlx5_eq_async_int,
+		.nb = &table->pages_eq.irq_nb,
 	};
-	err = create_async_eq(dev, "mlx5_pages_eq", &table->pages_eq, &param);
+	err = create_async_eq(dev, "mlx5_pages_eq",
+			      &table->pages_eq.core, &param);
 	if (err) {
 		mlx5_core_warn(dev, "failed to create pages EQ %d\n", err);
 		goto err2;
@@ -592,11 +635,11 @@ static int create_async_eqs(struct mlx5_core_dev *dev)
 	return err;
 
 err2:
-	destroy_async_eq(dev, &table->async_eq);
+	destroy_async_eq(dev, &table->async_eq.core);
 
 err1:
 	mlx5_cmd_use_polling(dev);
-	destroy_async_eq(dev, &table->cmd_eq);
+	destroy_async_eq(dev, &table->cmd_eq.core);
 err0:
 	mlx5_eq_notifier_unregister(dev, &table->cq_err_nb);
 	return err;
@@ -607,19 +650,19 @@ static void destroy_async_eqs(struct mlx5_core_dev *dev)
 	struct mlx5_eq_table *table = dev->priv.eq_table;
 	int err;
 
-	err = destroy_async_eq(dev, &table->pages_eq);
+	err = destroy_async_eq(dev, &table->pages_eq.core);
 	if (err)
 		mlx5_core_err(dev, "failed to destroy pages eq, err(%d)\n",
 			      err);
 
-	err = destroy_async_eq(dev, &table->async_eq);
+	err = destroy_async_eq(dev, &table->async_eq.core);
 	if (err)
 		mlx5_core_err(dev, "failed to destroy async eq, err(%d)\n",
 			      err);
 
 	mlx5_cmd_use_polling(dev);
 
-	err = destroy_async_eq(dev, &table->cmd_eq);
+	err = destroy_async_eq(dev, &table->cmd_eq.core);
 	if (err)
 		mlx5_core_err(dev, "failed to destroy command eq, err(%d)\n",
 			      err);
@@ -629,17 +672,17 @@ static void destroy_async_eqs(struct mlx5_core_dev *dev)
 
 struct mlx5_eq *mlx5_get_async_eq(struct mlx5_core_dev *dev)
 {
-	return &dev->priv.eq_table->async_eq;
+	return &dev->priv.eq_table->async_eq.core;
 }
 
 void mlx5_eq_synchronize_async_irq(struct mlx5_core_dev *dev)
 {
-	synchronize_irq(dev->priv.eq_table->async_eq.irqn);
+	synchronize_irq(dev->priv.eq_table->async_eq.core.irqn);
 }
 
 void mlx5_eq_synchronize_cmd_irq(struct mlx5_core_dev *dev)
 {
-	synchronize_irq(dev->priv.eq_table->cmd_eq.irqn);
+	synchronize_irq(dev->priv.eq_table->cmd_eq.core.irqn);
 }
 
 /* Generic EQ API for mlx5_core consumers
@@ -837,12 +880,12 @@ static int create_comp_eqs(struct mlx5_core_dev *dev)
 		irq_cpu_rmap_add(table->rmap, pci_irq_vector(dev->pdev, vecidx));
 #endif
 		snprintf(name, MLX5_MAX_IRQ_NAME, "mlx5_comp%d", i);
+		eq->irq_nb.notifier_call = mlx5_eq_comp_int;
 		param = (struct mlx5_eq_param) {
 			.index = vecidx,
 			.mask = 0,
 			.nent = nent,
-			.context = &eq->core,
-			.handler = mlx5_eq_comp_int
+			.nb = &eq->irq_nb,
 		};
 		err = create_map_eq(dev, &eq->core, name, &param);
 		if (err) {
@@ -940,10 +983,7 @@ void mlx5_core_eq_free_irqs(struct mlx5_core_dev *dev)
 	mutex_lock(&table->lock); /* sync with create/destroy_async_eq */
 	max_eqs = table->num_comp_vectors + MLX5_EQ_VEC_COMP_BASE;
 	for (i = max_eqs - 1; i >= 0; i--) {
-		if (!table->irq_info[i].context)
-			continue;
-		free_irq(pci_irq_vector(dev->pdev, i), table->irq_info[i].context);
-		table->irq_info[i].context = NULL;
+		free_irq(pci_irq_vector(dev->pdev, i), &table->irq_info[i].nh);
 	}
 	mutex_unlock(&table->lock);
 	pci_free_irq_vectors(dev->pdev);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/eq.h b/drivers/net/ethernet/mellanox/mlx5/core/lib/eq.h
index c0fb6d72b695..adbc228bd55d 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/lib/eq.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/eq.h
@@ -34,10 +34,17 @@ struct mlx5_eq {
 	u8                      eqn;
 	int                     nent;
 	struct mlx5_rsc_debug   *dbg;
+	struct notifier_block   *irq_nb; /* For destroy only */
+};
+
+struct mlx5_eq_async {
+	struct mlx5_eq          core;
+	struct notifier_block   irq_nb;
 };
 
 struct mlx5_eq_comp {
-	struct mlx5_eq          core; /* Must be first */
+	struct mlx5_eq          core;
+	struct notifier_block   irq_nb;
 	struct mlx5_eq_tasklet  tasklet_ctx;
 	struct list_head        list;
 };
diff --git a/include/linux/mlx5/eq.h b/include/linux/mlx5/eq.h
index 00045cc4ea11..7909f1ff197c 100644
--- a/include/linux/mlx5/eq.h
+++ b/include/linux/mlx5/eq.h
@@ -26,8 +26,7 @@ struct mlx5_eq_param {
 	u8             index;
 	int            nent;
 	u64            mask;
-	void          *context;
-	irq_handler_t  handler;
+	struct notifier_block *nb;
 };
 
 struct mlx5_eq *
-- 
cgit v1.2.3


From 24163189da487b4caa751eef4e945c9333aae441 Mon Sep 17 00:00:00 2001
From: Yuval Avnery <yuvalav@mellanox.com>
Date: Mon, 10 Jun 2019 23:38:25 +0000
Subject: net/mlx5: Separate IRQ request/free from EQ life cycle

Instead of requesting IRQ with eq creation, IRQs will be requested
before EQ table creation.
Instead of freeing the IRQs after EQ destroy, free IRQs after eq
table destroy.

Signed-off-by: Yuval Avnery <yuvalav@mellanox.com>
Reviewed-by: Parav Pandit <parav@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/infiniband/hw/mlx5/odp.c             |   2 +-
 drivers/net/ethernet/mellanox/mlx5/core/eq.c | 121 ++++++++++++++++++---------
 include/linux/mlx5/eq.h                      |   3 +-
 3 files changed, 84 insertions(+), 42 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c
index ac40a4fd5598..7ce7c5bfe685 100644
--- a/drivers/infiniband/hw/mlx5/odp.c
+++ b/drivers/infiniband/hw/mlx5/odp.c
@@ -1562,7 +1562,7 @@ mlx5_ib_create_pf_eq(struct mlx5_ib_dev *dev, struct mlx5_ib_pf_eq *eq)
 		.nent = MLX5_IB_NUM_PF_EQE,
 		.nb = &eq->irq_nb,
 	};
-	eq->core = mlx5_eq_create_generic(dev->mdev, "mlx5_ib_page_fault_eq", &param);
+	eq->core = mlx5_eq_create_generic(dev->mdev, &param);
 	if (IS_ERR(eq->core)) {
 		err = PTR_ERR(eq->core);
 		goto err_wq;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eq.c b/drivers/net/ethernet/mellanox/mlx5/core/eq.c
index 590c0fefaa25..f187169cbe76 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eq.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eq.c
@@ -134,6 +134,64 @@ static irqreturn_t mlx5_irq_int_handler(int irq, void *nh)
 	return IRQ_HANDLED;
 }
 
+static void irq_set_name(char *name, int vecidx)
+{
+	switch (vecidx) {
+	case MLX5_EQ_CMD_IDX:
+		snprintf(name, MLX5_MAX_IRQ_NAME, "mlx5_cmd_eq");
+		break;
+	case MLX5_EQ_ASYNC_IDX:
+		snprintf(name, MLX5_MAX_IRQ_NAME, "mlx5_async_eq");
+		break;
+	case MLX5_EQ_PAGEREQ_IDX:
+		snprintf(name, MLX5_MAX_IRQ_NAME, "mlx5_pages_eq");
+		break;
+	case MLX5_EQ_PFAULT_IDX:
+		snprintf(name, MLX5_MAX_IRQ_NAME, "mlx5_ib_page_fault_eq");
+		break;
+	default:
+		snprintf(name, MLX5_MAX_IRQ_NAME, "mlx5_comp%d",
+			 vecidx - MLX5_EQ_VEC_COMP_BASE);
+		break;
+	}
+}
+
+static int request_irqs(struct mlx5_core_dev *dev, int nvec)
+{
+	struct mlx5_priv *priv = &dev->priv;
+	struct mlx5_eq_table *eq_table;
+	char name[MLX5_MAX_IRQ_NAME];
+	int err;
+	int i;
+
+	eq_table = priv->eq_table;
+	for (i = 0; i < nvec; i++) {
+		struct mlx5_irq_info *irq_info = &eq_table->irq_info[i];
+		int irqn = pci_irq_vector(dev->pdev, i);
+
+		irq_set_name(name, i);
+		ATOMIC_INIT_NOTIFIER_HEAD(&irq_info->nh);
+		snprintf(irq_info->name, MLX5_MAX_IRQ_NAME,
+			 "%s@pci:%s", name, pci_name(dev->pdev));
+		err = request_irq(irqn, mlx5_irq_int_handler, 0, irq_info->name,
+				  &irq_info->nh);
+		if (err) {
+			mlx5_core_err(dev, "Failed to request irq\n");
+			goto err_request_irq;
+		}
+	}
+	return 0;
+
+err_request_irq:
+	for (; i >= 0; i--) {
+		struct mlx5_irq_info *irq_info = &eq_table->irq_info[i];
+		int irqn = pci_irq_vector(dev->pdev, i);
+
+		free_irq(irqn, &irq_info->nh);
+	}
+	return  err;
+}
+
 static int mlx5_cmd_destroy_eq(struct mlx5_core_dev *dev, u8 eqn)
 {
 	u32 out[MLX5_ST_SZ_DW(destroy_eq_out)] = {0};
@@ -278,14 +336,12 @@ static void init_eq_buf(struct mlx5_eq *eq)
 }
 
 static int
-create_map_eq(struct mlx5_core_dev *dev, struct mlx5_eq *eq, const char *name,
+create_map_eq(struct mlx5_core_dev *dev, struct mlx5_eq *eq,
 	      struct mlx5_eq_param *param)
 {
-	struct mlx5_eq_table *eq_table = dev->priv.eq_table;
 	struct mlx5_cq_table *cq_table = &eq->cq_table;
 	u32 out[MLX5_ST_SZ_DW(create_eq_out)] = {0};
 	struct mlx5_priv *priv = &dev->priv;
-	struct mlx5_irq_info *irq_info;
 	u8 vecidx = param->index;
 	__be64 *pas;
 	void *eqc;
@@ -335,11 +391,6 @@ create_map_eq(struct mlx5_core_dev *dev, struct mlx5_eq *eq, const char *name,
 	if (err)
 		goto err_in;
 
-	irq_info = mlx5_irq_get(dev, vecidx);
-	ATOMIC_INIT_NOTIFIER_HEAD(&irq_info->nh);
-	snprintf(irq_info->name, MLX5_MAX_IRQ_NAME,
-		 "%s@pci:%s", name, pci_name(dev->pdev));
-
 	eq->vecidx = vecidx;
 	eq->eqn = MLX5_GET(create_eq_out, out, eq_number);
 	eq->irqn = pci_irq_vector(dev->pdev, vecidx);
@@ -347,15 +398,10 @@ create_map_eq(struct mlx5_core_dev *dev, struct mlx5_eq *eq, const char *name,
 	eq->doorbell = priv->uar->map + MLX5_EQ_DOORBEL_OFFSET;
 	eq->irq_nb = param->nb;
 
-	err = request_irq(eq->irqn, mlx5_irq_int_handler, 0, irq_info->name,
-			  &irq_info->nh);
+	err = mlx5_irq_attach_nb(mlx5_irq_get(dev, vecidx), param->nb);
 	if (err)
 		goto err_eq;
 
-	err = mlx5_irq_attach_nb(irq_info, param->nb);
-	if (err)
-		goto err_irq;
-
 	err = mlx5_debug_eq_add(dev, eq);
 	if (err)
 		goto err_detach;
@@ -368,10 +414,7 @@ create_map_eq(struct mlx5_core_dev *dev, struct mlx5_eq *eq, const char *name,
 	return 0;
 
 err_detach:
-	mlx5_irq_detach_nb(irq_info, param->nb);
-
-err_irq:
-	free_irq(eq->irqn, &eq_table->irq_info[vecidx].nh);
+	mlx5_irq_detach_nb(mlx5_irq_get(dev, vecidx), eq->irq_nb);
 
 err_eq:
 	mlx5_cmd_destroy_eq(dev, eq->eqn);
@@ -386,19 +429,14 @@ err_buf:
 
 static int destroy_unmap_eq(struct mlx5_core_dev *dev, struct mlx5_eq *eq)
 {
-	struct mlx5_eq_table *eq_table = dev->priv.eq_table;
-	struct mlx5_irq_info *irq_info;
 	int err;
 
-	irq_info = &eq_table->irq_info[eq->vecidx];
-
 	mlx5_debug_eq_remove(dev, eq);
 
-	err = mlx5_irq_detach_nb(irq_info, eq->irq_nb);
+	err = mlx5_irq_detach_nb(mlx5_irq_get(dev, eq->vecidx), eq->irq_nb);
 	if (err)
 		mlx5_core_warn(eq->dev, "eq failed to detach from irq. err %d",
 			       err);
-	free_irq(eq->irqn, &eq_table->irq_info[eq->vecidx].nh);
 	err = mlx5_cmd_destroy_eq(dev, eq->eqn);
 	if (err)
 		mlx5_core_warn(dev, "failed to destroy a previously created eq: eqn %d\n",
@@ -479,7 +517,7 @@ void mlx5_eq_table_cleanup(struct mlx5_core_dev *dev)
 
 /* Async EQs */
 
-static int create_async_eq(struct mlx5_core_dev *dev, const char *name,
+static int create_async_eq(struct mlx5_core_dev *dev,
 			   struct mlx5_eq *eq, struct mlx5_eq_param *param)
 {
 	struct mlx5_eq_table *eq_table = dev->priv.eq_table;
@@ -491,7 +529,7 @@ static int create_async_eq(struct mlx5_core_dev *dev, const char *name,
 		goto unlock;
 	}
 
-	err = create_map_eq(dev, eq, name, param);
+	err = create_map_eq(dev, eq, param);
 unlock:
 	mutex_unlock(&eq_table->lock);
 	return err;
@@ -596,7 +634,7 @@ static int create_async_eqs(struct mlx5_core_dev *dev)
 		.nent = MLX5_NUM_CMD_EQE,
 		.nb = &table->cmd_eq.irq_nb,
 	};
-	err = create_async_eq(dev, "mlx5_cmd_eq", &table->cmd_eq.core, &param);
+	err = create_async_eq(dev, &table->cmd_eq.core, &param);
 	if (err) {
 		mlx5_core_warn(dev, "failed to create cmd EQ %d\n", err);
 		goto err0;
@@ -611,8 +649,7 @@ static int create_async_eqs(struct mlx5_core_dev *dev)
 		.nent = MLX5_NUM_ASYNC_EQE,
 		.nb = &table->async_eq.irq_nb,
 	};
-	err = create_async_eq(dev, "mlx5_async_eq",
-			      &table->async_eq.core, &param);
+	err = create_async_eq(dev, &table->async_eq.core, &param);
 	if (err) {
 		mlx5_core_warn(dev, "failed to create async EQ %d\n", err);
 		goto err1;
@@ -625,8 +662,7 @@ static int create_async_eqs(struct mlx5_core_dev *dev)
 		.nent = /* TODO: sriov max_vf + */ 1,
 		.nb = &table->pages_eq.irq_nb,
 	};
-	err = create_async_eq(dev, "mlx5_pages_eq",
-			      &table->pages_eq.core, &param);
+	err = create_async_eq(dev, &table->pages_eq.core, &param);
 	if (err) {
 		mlx5_core_warn(dev, "failed to create pages EQ %d\n", err);
 		goto err2;
@@ -689,7 +725,7 @@ void mlx5_eq_synchronize_cmd_irq(struct mlx5_core_dev *dev)
  * Needed For RDMA ODP EQ for now
  */
 struct mlx5_eq *
-mlx5_eq_create_generic(struct mlx5_core_dev *dev, const char *name,
+mlx5_eq_create_generic(struct mlx5_core_dev *dev,
 		       struct mlx5_eq_param *param)
 {
 	struct mlx5_eq *eq = kvzalloc(sizeof(*eq), GFP_KERNEL);
@@ -698,7 +734,7 @@ mlx5_eq_create_generic(struct mlx5_core_dev *dev, const char *name,
 	if (!eq)
 		return ERR_PTR(-ENOMEM);
 
-	err = create_async_eq(dev, name, eq, param);
+	err = create_async_eq(dev, eq, param);
 	if (err) {
 		kvfree(eq);
 		eq = ERR_PTR(err);
@@ -845,7 +881,6 @@ static void destroy_comp_eqs(struct mlx5_core_dev *dev)
 static int create_comp_eqs(struct mlx5_core_dev *dev)
 {
 	struct mlx5_eq_table *table = dev->priv.eq_table;
-	char name[MLX5_MAX_IRQ_NAME];
 	struct mlx5_eq_comp *eq;
 	int ncomp_vec;
 	int nent;
@@ -879,7 +914,6 @@ static int create_comp_eqs(struct mlx5_core_dev *dev)
 #ifdef CONFIG_RFS_ACCEL
 		irq_cpu_rmap_add(table->rmap, pci_irq_vector(dev->pdev, vecidx));
 #endif
-		snprintf(name, MLX5_MAX_IRQ_NAME, "mlx5_comp%d", i);
 		eq->irq_nb.notifier_call = mlx5_eq_comp_int;
 		param = (struct mlx5_eq_param) {
 			.index = vecidx,
@@ -887,7 +921,7 @@ static int create_comp_eqs(struct mlx5_core_dev *dev)
 			.nent = nent,
 			.nb = &eq->irq_nb,
 		};
-		err = create_map_eq(dev, &eq->core, name, &param);
+		err = create_map_eq(dev, &eq->core, &param);
 		if (err) {
 			kfree(eq);
 			goto clean;
@@ -1018,8 +1052,14 @@ static int alloc_irq_vectors(struct mlx5_core_dev *dev)
 
 	table->num_comp_vectors = nvec - MLX5_EQ_VEC_COMP_BASE;
 
+	err = request_irqs(dev, nvec);
+	if (err)
+		goto err_free_irqs;
+
 	return 0;
 
+err_free_irqs:
+	pci_free_irq_vectors(dev->pdev);
 err_free_irq_info:
 	kfree(table->irq_info);
 	return err;
@@ -1027,10 +1067,13 @@ err_free_irq_info:
 
 static void free_irq_vectors(struct mlx5_core_dev *dev)
 {
-	struct mlx5_priv *priv = &dev->priv;
+	struct mlx5_eq_table *table = dev->priv.eq_table;
+	int i;
 
+	for (i = 0; i < table->num_comp_vectors + MLX5_EQ_VEC_COMP_BASE; i++)
+		free_irq(pci_irq_vector(dev->pdev, i), &table->irq_info[i].nh);
 	pci_free_irq_vectors(dev->pdev);
-	kfree(priv->eq_table->irq_info);
+	kfree(table->irq_info);
 }
 
 int mlx5_eq_table_create(struct mlx5_core_dev *dev)
@@ -1039,7 +1082,7 @@ int mlx5_eq_table_create(struct mlx5_core_dev *dev)
 
 	err = alloc_irq_vectors(dev);
 	if (err) {
-		mlx5_core_err(dev, "alloc irq vectors failed\n");
+		mlx5_core_err(dev, "Failed to create IRQ vectors\n");
 		return err;
 	}
 
diff --git a/include/linux/mlx5/eq.h b/include/linux/mlx5/eq.h
index 7909f1ff197c..73ab658af764 100644
--- a/include/linux/mlx5/eq.h
+++ b/include/linux/mlx5/eq.h
@@ -30,8 +30,7 @@ struct mlx5_eq_param {
 };
 
 struct mlx5_eq *
-mlx5_eq_create_generic(struct mlx5_core_dev *dev, const char *name,
-		       struct mlx5_eq_param *param);
+mlx5_eq_create_generic(struct mlx5_core_dev *dev, struct mlx5_eq_param *param);
 int
 mlx5_eq_destroy_generic(struct mlx5_core_dev *dev, struct mlx5_eq *eq);
 
-- 
cgit v1.2.3


From 561aa15ad69e9d1e5a8bb277adb3209bf8091ecb Mon Sep 17 00:00:00 2001
From: Yuval Avnery <yuvalav@mellanox.com>
Date: Mon, 10 Jun 2019 23:38:27 +0000
Subject: net/mlx5: Separate IRQ data from EQ table data

The IRQ table should exist only for the mlx5_core_dev of a PF or VF.
The EQ table of a mediated device should hold a pointer to the IRQ table
of its parent PCI device.

Signed-off-by: Yuval Avnery <yuvalav@mellanox.com>
Reviewed-by: Parav Pandit <parav@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/eq.c       | 125 ++++++++++++++-------
 drivers/net/ethernet/mellanox/mlx5/core/main.c     |  11 +-
 .../net/ethernet/mellanox/mlx5/core/mlx5_core.h    |   3 +
 include/linux/mlx5/driver.h                        |   3 +
 4 files changed, 98 insertions(+), 44 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eq.c b/drivers/net/ethernet/mellanox/mlx5/core/eq.c
index f187169cbe76..cdfa35ec02fa 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eq.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eq.c
@@ -77,6 +77,14 @@ struct mlx5_irq_info {
 	char name[MLX5_MAX_IRQ_NAME];
 };
 
+struct mlx5_irq_table {
+	struct mlx5_irq_info *irq_info;
+	int nvec;
+#ifdef CONFIG_RFS_ACCEL
+	struct cpu_rmap *rmap;
+#endif
+};
+
 struct mlx5_eq_table {
 	struct list_head        comp_eqs_list;
 	struct mlx5_eq_async    pages_eq;
@@ -89,11 +97,8 @@ struct mlx5_eq_table {
 	struct mlx5_nb          cq_err_nb;
 
 	struct mutex            lock; /* sync async eqs creations */
-	int			num_comp_vectors;
-	struct mlx5_irq_info	*irq_info;
-#ifdef CONFIG_RFS_ACCEL
-	struct cpu_rmap         *rmap;
-#endif
+	int			num_comp_eqs;
+	struct mlx5_irq_table	*irq_table;
 };
 
 #define MLX5_ASYNC_EVENT_MASK ((1ull << MLX5_EVENT_TYPE_PATH_MIG)	    | \
@@ -109,11 +114,33 @@ struct mlx5_eq_table {
 			       (1ull << MLX5_EVENT_TYPE_SRQ_LAST_WQE)	    | \
 			       (1ull << MLX5_EVENT_TYPE_SRQ_RQ_LIMIT))
 
+int mlx5_irq_table_init(struct mlx5_core_dev *dev)
+{
+	struct mlx5_irq_table *irq_table;
+
+	irq_table = kvzalloc(sizeof(*irq_table), GFP_KERNEL);
+	if (!irq_table)
+		return -ENOMEM;
+
+	dev->priv.irq_table = irq_table;
+	return 0;
+}
+
+void mlx5_irq_table_cleanup(struct mlx5_core_dev *dev)
+{
+	kvfree(dev->priv.irq_table);
+}
+
+static int mlx5_irq_get_num_comp(struct mlx5_irq_table *table)
+{
+	return table->nvec - MLX5_EQ_VEC_COMP_BASE;
+}
+
 static struct mlx5_irq_info *mlx5_irq_get(struct mlx5_core_dev *dev, int vecidx)
 {
-	struct mlx5_eq_table *eq_table = dev->priv.eq_table;
+	struct mlx5_irq_table *irq_table = dev->priv.irq_table;
 
-	return &eq_table->irq_info[vecidx];
+	return &irq_table->irq_info[vecidx];
 }
 
 static int mlx5_irq_attach_nb(struct mlx5_irq_info *irq,
@@ -158,15 +185,12 @@ static void irq_set_name(char *name, int vecidx)
 
 static int request_irqs(struct mlx5_core_dev *dev, int nvec)
 {
-	struct mlx5_priv *priv = &dev->priv;
-	struct mlx5_eq_table *eq_table;
 	char name[MLX5_MAX_IRQ_NAME];
 	int err;
 	int i;
 
-	eq_table = priv->eq_table;
 	for (i = 0; i < nvec; i++) {
-		struct mlx5_irq_info *irq_info = &eq_table->irq_info[i];
+		struct mlx5_irq_info *irq_info = mlx5_irq_get(dev, i);
 		int irqn = pci_irq_vector(dev->pdev, i);
 
 		irq_set_name(name, i);
@@ -184,7 +208,7 @@ static int request_irqs(struct mlx5_core_dev *dev, int nvec)
 
 err_request_irq:
 	for (; i >= 0; i--) {
-		struct mlx5_irq_info *irq_info = &eq_table->irq_info[i];
+		struct mlx5_irq_info *irq_info = mlx5_irq_get(dev, i);
 		int irqn = pci_irq_vector(dev->pdev, i);
 
 		free_irq(irqn, &irq_info->nh);
@@ -501,6 +525,7 @@ int mlx5_eq_table_init(struct mlx5_core_dev *dev)
 	for (i = 0; i < MLX5_EVENT_TYPE_MAX; i++)
 		ATOMIC_INIT_NOTIFIER_HEAD(&eq_table->nh[i]);
 
+	eq_table->irq_table = dev->priv.irq_table;
 	return 0;
 
 kvfree_eq_table:
@@ -796,10 +821,13 @@ EXPORT_SYMBOL(mlx5_eq_update_ci);
 
 static int set_comp_irq_affinity_hint(struct mlx5_core_dev *mdev, int i)
 {
-	struct mlx5_priv *priv  = &mdev->priv;
 	int vecidx = MLX5_EQ_VEC_COMP_BASE + i;
-	int irq = pci_irq_vector(mdev->pdev, vecidx);
-	struct mlx5_irq_info *irq_info = &priv->eq_table->irq_info[vecidx];
+	struct mlx5_priv *priv  = &mdev->priv;
+	struct mlx5_irq_info *irq_info;
+	int irq;
+
+	irq_info = mlx5_irq_get(mdev, vecidx);
+	irq = pci_irq_vector(mdev->pdev, vecidx);
 
 	if (!zalloc_cpumask_var(&irq_info->mask, GFP_KERNEL)) {
 		mlx5_core_warn(mdev, "zalloc_cpumask_var failed");
@@ -819,20 +847,22 @@ static int set_comp_irq_affinity_hint(struct mlx5_core_dev *mdev, int i)
 static void clear_comp_irq_affinity_hint(struct mlx5_core_dev *mdev, int i)
 {
 	int vecidx = MLX5_EQ_VEC_COMP_BASE + i;
-	struct mlx5_priv *priv  = &mdev->priv;
-	int irq = pci_irq_vector(mdev->pdev, vecidx);
-	struct mlx5_irq_info *irq_info = &priv->eq_table->irq_info[vecidx];
+	struct mlx5_irq_info *irq_info;
+	int irq;
 
+	irq_info = mlx5_irq_get(mdev, vecidx);
+	irq = pci_irq_vector(mdev->pdev, vecidx);
 	irq_set_affinity_hint(irq, NULL);
 	free_cpumask_var(irq_info->mask);
 }
 
 static int set_comp_irq_affinity_hints(struct mlx5_core_dev *mdev)
 {
+	int nvec = mlx5_irq_get_num_comp(mdev->priv.irq_table);
 	int err;
 	int i;
 
-	for (i = 0; i < mdev->priv.eq_table->num_comp_vectors; i++) {
+	for (i = 0; i < nvec; i++) {
 		err = set_comp_irq_affinity_hint(mdev, i);
 		if (err)
 			goto err_out;
@@ -849,9 +879,10 @@ err_out:
 
 static void clear_comp_irqs_affinity_hints(struct mlx5_core_dev *mdev)
 {
+	int nvec = mlx5_irq_get_num_comp(mdev->priv.irq_table);
 	int i;
 
-	for (i = 0; i < mdev->priv.eq_table->num_comp_vectors; i++)
+	for (i = 0; i < nvec; i++)
 		clear_comp_irq_affinity_hint(mdev, i);
 }
 
@@ -863,9 +894,9 @@ static void destroy_comp_eqs(struct mlx5_core_dev *dev)
 	clear_comp_irqs_affinity_hints(dev);
 
 #ifdef CONFIG_RFS_ACCEL
-	if (table->rmap) {
-		free_irq_cpu_rmap(table->rmap);
-		table->rmap = NULL;
+	if (table->irq_table->rmap) {
+		free_irq_cpu_rmap(table->irq_table->rmap);
+		table->irq_table->rmap = NULL;
 	}
 #endif
 	list_for_each_entry_safe(eq, n, &table->comp_eqs_list, list) {
@@ -882,20 +913,20 @@ static int create_comp_eqs(struct mlx5_core_dev *dev)
 {
 	struct mlx5_eq_table *table = dev->priv.eq_table;
 	struct mlx5_eq_comp *eq;
-	int ncomp_vec;
+	int ncomp_eqs;
 	int nent;
 	int err;
 	int i;
 
 	INIT_LIST_HEAD(&table->comp_eqs_list);
-	ncomp_vec = table->num_comp_vectors;
+	ncomp_eqs = table->num_comp_eqs;
 	nent = MLX5_COMP_EQ_SIZE;
 #ifdef CONFIG_RFS_ACCEL
-	table->rmap = alloc_irq_cpu_rmap(ncomp_vec);
-	if (!table->rmap)
+	table->irq_table->rmap = alloc_irq_cpu_rmap(ncomp_eqs);
+	if (!table->irq_table->rmap)
 		return -ENOMEM;
 #endif
-	for (i = 0; i < ncomp_vec; i++) {
+	for (i = 0; i < ncomp_eqs; i++) {
 		int vecidx = i + MLX5_EQ_VEC_COMP_BASE;
 		struct mlx5_eq_param param = {};
 
@@ -912,7 +943,8 @@ static int create_comp_eqs(struct mlx5_core_dev *dev)
 			     (unsigned long)&eq->tasklet_ctx);
 
 #ifdef CONFIG_RFS_ACCEL
-		irq_cpu_rmap_add(table->rmap, pci_irq_vector(dev->pdev, vecidx));
+		irq_cpu_rmap_add(table->irq_table->rmap,
+				 pci_irq_vector(dev->pdev, vecidx));
 #endif
 		eq->irq_nb.notifier_call = mlx5_eq_comp_int;
 		param = (struct mlx5_eq_param) {
@@ -967,22 +999,23 @@ EXPORT_SYMBOL(mlx5_vector2eqn);
 
 unsigned int mlx5_comp_vectors_count(struct mlx5_core_dev *dev)
 {
-	return dev->priv.eq_table->num_comp_vectors;
+	return dev->priv.eq_table->num_comp_eqs;
 }
 EXPORT_SYMBOL(mlx5_comp_vectors_count);
 
 struct cpumask *
 mlx5_comp_irq_get_affinity_mask(struct mlx5_core_dev *dev, int vector)
 {
-	/* TODO: consider irq_get_affinity_mask(irq) */
-	return dev->priv.eq_table->irq_info[vector + MLX5_EQ_VEC_COMP_BASE].mask;
+	int vecidx = vector + MLX5_EQ_VEC_COMP_BASE;
+
+	return dev->priv.eq_table->irq_table->irq_info[vecidx].mask;
 }
 EXPORT_SYMBOL(mlx5_comp_irq_get_affinity_mask);
 
 #ifdef CONFIG_RFS_ACCEL
 struct cpu_rmap *mlx5_eq_table_get_rmap(struct mlx5_core_dev *dev)
 {
-	return dev->priv.eq_table->rmap;
+	return dev->priv.eq_table->irq_table->rmap;
 }
 #endif
 
@@ -1008,16 +1041,17 @@ void mlx5_core_eq_free_irqs(struct mlx5_core_dev *dev)
 	clear_comp_irqs_affinity_hints(dev);
 
 #ifdef CONFIG_RFS_ACCEL
-	if (table->rmap) {
-		free_irq_cpu_rmap(table->rmap);
-		table->rmap = NULL;
+	if (table->irq_table->rmap) {
+		free_irq_cpu_rmap(table->irq_table->rmap);
+		table->irq_table->rmap = NULL;
 	}
 #endif
 
 	mutex_lock(&table->lock); /* sync with create/destroy_async_eq */
-	max_eqs = table->num_comp_vectors + MLX5_EQ_VEC_COMP_BASE;
+	max_eqs = table->num_comp_eqs + MLX5_EQ_VEC_COMP_BASE;
 	for (i = max_eqs - 1; i >= 0; i--) {
-		free_irq(pci_irq_vector(dev->pdev, i), &table->irq_info[i].nh);
+		free_irq(pci_irq_vector(dev->pdev, i),
+			 &mlx5_irq_get(dev, i)->nh);
 	}
 	mutex_unlock(&table->lock);
 	pci_free_irq_vectors(dev->pdev);
@@ -1026,7 +1060,7 @@ void mlx5_core_eq_free_irqs(struct mlx5_core_dev *dev)
 static int alloc_irq_vectors(struct mlx5_core_dev *dev)
 {
 	struct mlx5_priv *priv = &dev->priv;
-	struct mlx5_eq_table *table = priv->eq_table;
+	struct mlx5_irq_table *table = priv->irq_table;
 	int num_eqs = MLX5_CAP_GEN(dev, max_num_eqs) ?
 		      MLX5_CAP_GEN(dev, max_num_eqs) :
 		      1 << MLX5_CAP_GEN(dev, log_max_eq);
@@ -1050,7 +1084,7 @@ static int alloc_irq_vectors(struct mlx5_core_dev *dev)
 		goto err_free_irq_info;
 	}
 
-	table->num_comp_vectors = nvec - MLX5_EQ_VEC_COMP_BASE;
+	table->nvec = nvec;
 
 	err = request_irqs(dev, nvec);
 	if (err)
@@ -1067,17 +1101,19 @@ err_free_irq_info:
 
 static void free_irq_vectors(struct mlx5_core_dev *dev)
 {
-	struct mlx5_eq_table *table = dev->priv.eq_table;
+	struct mlx5_irq_table *table = dev->priv.irq_table;
 	int i;
 
-	for (i = 0; i < table->num_comp_vectors + MLX5_EQ_VEC_COMP_BASE; i++)
-		free_irq(pci_irq_vector(dev->pdev, i), &table->irq_info[i].nh);
+	for (i = 0; i < table->nvec; i++)
+		free_irq(pci_irq_vector(dev->pdev, i),
+			 &mlx5_irq_get(dev, i)->nh);
 	pci_free_irq_vectors(dev->pdev);
 	kfree(table->irq_info);
 }
 
 int mlx5_eq_table_create(struct mlx5_core_dev *dev)
 {
+	struct mlx5_eq_table *eq_table = dev->priv.eq_table;
 	int err;
 
 	err = alloc_irq_vectors(dev);
@@ -1086,6 +1122,9 @@ int mlx5_eq_table_create(struct mlx5_core_dev *dev)
 		return err;
 	}
 
+	eq_table->num_comp_eqs =
+		mlx5_irq_get_num_comp(eq_table->irq_table);
+
 	err = create_async_eqs(dev);
 	if (err) {
 		mlx5_core_err(dev, "Failed to create async EQs\n");
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c
index 720f65bfe6a9..be79dceea3c3 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c
@@ -804,10 +804,16 @@ static int mlx5_init_once(struct mlx5_core_dev *dev)
 		goto err_devcom;
 	}
 
+	err = mlx5_irq_table_init(dev);
+	if (err) {
+		mlx5_core_err(dev, "failed to initialize irq table\n");
+		goto err_devcom;
+	}
+
 	err = mlx5_eq_table_init(dev);
 	if (err) {
 		mlx5_core_err(dev, "failed to initialize eq\n");
-		goto err_devcom;
+		goto err_irq_cleanup;
 	}
 
 	err = mlx5_events_init(dev);
@@ -883,6 +889,8 @@ err_events_cleanup:
 	mlx5_events_cleanup(dev);
 err_eq_cleanup:
 	mlx5_eq_table_cleanup(dev);
+err_irq_cleanup:
+	mlx5_irq_table_cleanup(dev);
 err_devcom:
 	mlx5_devcom_unregister_device(dev->priv.devcom);
 
@@ -905,6 +913,7 @@ static void mlx5_cleanup_once(struct mlx5_core_dev *dev)
 	mlx5_cq_debugfs_cleanup(dev);
 	mlx5_events_cleanup(dev);
 	mlx5_eq_table_cleanup(dev);
+	mlx5_irq_table_cleanup(dev);
 	mlx5_devcom_unregister_device(dev->priv.devcom);
 }
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h
index 22e69d4813e4..907515f3bfbb 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h
@@ -153,6 +153,9 @@ int mlx5_query_qcam_reg(struct mlx5_core_dev *mdev, u32 *qcam,
 void mlx5_lag_add(struct mlx5_core_dev *dev, struct net_device *netdev);
 void mlx5_lag_remove(struct mlx5_core_dev *dev);
 
+int mlx5_irq_table_init(struct mlx5_core_dev *dev);
+void mlx5_irq_table_cleanup(struct mlx5_core_dev *dev);
+
 int mlx5_events_init(struct mlx5_core_dev *dev);
 void mlx5_events_cleanup(struct mlx5_core_dev *dev);
 void mlx5_events_start(struct mlx5_core_dev *dev);
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 64155fe201ee..d8ab633406c2 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -492,6 +492,7 @@ struct mlx5_eswitch;
 struct mlx5_lag;
 struct mlx5_devcom;
 struct mlx5_eq_table;
+struct mlx5_irq_table;
 
 struct mlx5_rate_limit {
 	u32			rate;
@@ -521,6 +522,8 @@ struct mlx5_core_roce {
 };
 
 struct mlx5_priv {
+	/* IRQ table valid only for real pci devices PF or VF */
+	struct mlx5_irq_table   *irq_table;
 	struct mlx5_eq_table	*eq_table;
 
 	/* pages stuff */
-- 
cgit v1.2.3


From 81bfa206032a67f0700459a64a5493c246629604 Mon Sep 17 00:00:00 2001
From: Ariel Levkovich <lariel@mellanox.com>
Date: Mon, 10 Jun 2019 23:38:41 +0000
Subject: net/mlx5: Use a single IRQ for all async EQs

The patch modifies the IRQ allocation so that all async EQs are
assigned to the same IRQ resulting in more available IRQs for
completion EQs.

The changes are using the support for IRQ sharing and EQ polling budget
that was introduced in previous patches so when the shared interrupt is
triggered, the kernel will serially call the handler of each of the
sharing EQs with a certain budget of EQEs to poll in order to prevent
starvation.

Signed-off-by: Ariel Levkovich <lariel@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/infiniband/hw/mlx5/odp.c                  |  2 +-
 drivers/net/ethernet/mellanox/mlx5/core/eq.c      | 19 ++++++------
 drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c | 38 +++++++++--------------
 include/linux/mlx5/eq.h                           | 14 ++-------
 4 files changed, 27 insertions(+), 46 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c
index 7ce7c5bfe685..693a0e225093 100644
--- a/drivers/infiniband/hw/mlx5/odp.c
+++ b/drivers/infiniband/hw/mlx5/odp.c
@@ -1557,7 +1557,7 @@ mlx5_ib_create_pf_eq(struct mlx5_ib_dev *dev, struct mlx5_ib_pf_eq *eq)
 
 	eq->irq_nb.notifier_call = mlx5_ib_eq_pf_int;
 	param = (struct mlx5_eq_param) {
-		.index = MLX5_EQ_PFAULT_IDX,
+		.irq_index = 0,
 		.mask = 1 << MLX5_EVENT_TYPE_PAGE_FAULT,
 		.nent = MLX5_IB_NUM_PF_EQE,
 		.nb = &eq->irq_nb,
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eq.c b/drivers/net/ethernet/mellanox/mlx5/core/eq.c
index 0c72c122daef..0f5846a34928 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eq.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eq.c
@@ -250,7 +250,7 @@ create_map_eq(struct mlx5_core_dev *dev, struct mlx5_eq *eq,
 	struct mlx5_cq_table *cq_table = &eq->cq_table;
 	u32 out[MLX5_ST_SZ_DW(create_eq_out)] = {0};
 	struct mlx5_priv *priv = &dev->priv;
-	u8 vecidx = param->index;
+	u8 vecidx = param->irq_index;
 	__be64 *pas;
 	void *eqc;
 	int inlen;
@@ -435,8 +435,9 @@ static int create_async_eq(struct mlx5_core_dev *dev,
 	int err;
 
 	mutex_lock(&eq_table->lock);
-	if (param->index >= MLX5_EQ_MAX_ASYNC_EQS) {
-		err = -ENOSPC;
+	/* Async EQs must share irq index 0 */
+	if (param->irq_index != 0) {
+		err = -EINVAL;
 		goto unlock;
 	}
 
@@ -540,7 +541,7 @@ static int create_async_eqs(struct mlx5_core_dev *dev)
 
 	table->cmd_eq.irq_nb.notifier_call = mlx5_eq_async_int;
 	param = (struct mlx5_eq_param) {
-		.index = MLX5_EQ_CMD_IDX,
+		.irq_index = 0,
 		.mask = 1ull << MLX5_EVENT_TYPE_CMD,
 		.nent = MLX5_NUM_CMD_EQE,
 		.nb = &table->cmd_eq.irq_nb,
@@ -555,7 +556,7 @@ static int create_async_eqs(struct mlx5_core_dev *dev)
 
 	table->async_eq.irq_nb.notifier_call = mlx5_eq_async_int;
 	param = (struct mlx5_eq_param) {
-		.index = MLX5_EQ_ASYNC_IDX,
+		.irq_index = 0,
 		.mask = gather_async_events_mask(dev),
 		.nent = MLX5_NUM_ASYNC_EQE,
 		.nb = &table->async_eq.irq_nb,
@@ -568,7 +569,7 @@ static int create_async_eqs(struct mlx5_core_dev *dev)
 
 	table->pages_eq.irq_nb.notifier_call = mlx5_eq_async_int;
 	param = (struct mlx5_eq_param) {
-		.index = MLX5_EQ_PAGEREQ_IDX,
+		.irq_index = 0,
 		.mask =  1 << MLX5_EVENT_TYPE_PAGE_REQUEST,
 		.nent = /* TODO: sriov max_vf + */ 1,
 		.nb = &table->pages_eq.irq_nb,
@@ -731,7 +732,7 @@ static int create_comp_eqs(struct mlx5_core_dev *dev)
 	ncomp_eqs = table->num_comp_eqs;
 	nent = MLX5_COMP_EQ_SIZE;
 	for (i = 0; i < ncomp_eqs; i++) {
-		int vecidx = i + MLX5_EQ_VEC_COMP_BASE;
+		int vecidx = i + MLX5_IRQ_VEC_COMP_BASE;
 		struct mlx5_eq_param param = {};
 
 		eq = kzalloc(sizeof(*eq), GFP_KERNEL);
@@ -748,7 +749,7 @@ static int create_comp_eqs(struct mlx5_core_dev *dev)
 
 		eq->irq_nb.notifier_call = mlx5_eq_comp_int;
 		param = (struct mlx5_eq_param) {
-			.index = vecidx,
+			.irq_index = vecidx,
 			.mask = 0,
 			.nent = nent,
 			.nb = &eq->irq_nb,
@@ -800,7 +801,7 @@ EXPORT_SYMBOL(mlx5_comp_vectors_count);
 struct cpumask *
 mlx5_comp_irq_get_affinity_mask(struct mlx5_core_dev *dev, int vector)
 {
-	int vecidx = vector + MLX5_EQ_VEC_COMP_BASE;
+	int vecidx = vector + MLX5_IRQ_VEC_COMP_BASE;
 
 	return mlx5_irq_get_affinity_mask(dev->priv.eq_table->irq_table,
 					  vecidx);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c b/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c
index fec861f4fefe..373981a659c7 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c
@@ -45,7 +45,7 @@ void mlx5_irq_table_cleanup(struct mlx5_core_dev *dev)
 
 int mlx5_irq_get_num_comp(struct mlx5_irq_table *table)
 {
-	return table->nvec - MLX5_EQ_VEC_COMP_BASE;
+	return table->nvec - MLX5_IRQ_VEC_COMP_BASE;
 }
 
 static struct mlx5_irq *mlx5_irq_get(struct mlx5_core_dev *dev, int vecidx)
@@ -81,24 +81,14 @@ static irqreturn_t mlx5_irq_int_handler(int irq, void *nh)
 
 static void irq_set_name(char *name, int vecidx)
 {
-	switch (vecidx) {
-	case MLX5_EQ_CMD_IDX:
-		snprintf(name, MLX5_MAX_IRQ_NAME, "mlx5_cmd_eq");
-		break;
-	case MLX5_EQ_ASYNC_IDX:
-		snprintf(name, MLX5_MAX_IRQ_NAME, "mlx5_async_eq");
-		break;
-	case MLX5_EQ_PAGEREQ_IDX:
-		snprintf(name, MLX5_MAX_IRQ_NAME, "mlx5_pages_eq");
-		break;
-	case MLX5_EQ_PFAULT_IDX:
-		snprintf(name, MLX5_MAX_IRQ_NAME, "mlx5_ib_page_fault_eq");
-		break;
-	default:
-		snprintf(name, MLX5_MAX_IRQ_NAME, "mlx5_comp%d",
-			 vecidx - MLX5_EQ_VEC_COMP_BASE);
-		break;
+	if (vecidx == 0) {
+		snprintf(name, MLX5_MAX_IRQ_NAME, "mlx5_async");
+		return;
 	}
+
+	snprintf(name, MLX5_MAX_IRQ_NAME, "mlx5_comp%d",
+		 vecidx - MLX5_IRQ_VEC_COMP_BASE);
+	return;
 }
 
 static int request_irqs(struct mlx5_core_dev *dev, int nvec)
@@ -159,7 +149,7 @@ static int irq_set_rmap(struct mlx5_core_dev *mdev)
 		goto err_out;
 	}
 
-	vecidx = MLX5_EQ_VEC_COMP_BASE;
+	vecidx = MLX5_IRQ_VEC_COMP_BASE;
 	for (; vecidx < irq_table->nvec; vecidx++) {
 		err = irq_cpu_rmap_add(irq_table->rmap,
 				       pci_irq_vector(mdev->pdev, vecidx));
@@ -182,7 +172,7 @@ err_out:
 
 static int set_comp_irq_affinity_hint(struct mlx5_core_dev *mdev, int i)
 {
-	int vecidx = MLX5_EQ_VEC_COMP_BASE + i;
+	int vecidx = MLX5_IRQ_VEC_COMP_BASE + i;
 	struct mlx5_irq *irq;
 	int irqn;
 
@@ -205,7 +195,7 @@ static int set_comp_irq_affinity_hint(struct mlx5_core_dev *mdev, int i)
 
 static void clear_comp_irq_affinity_hint(struct mlx5_core_dev *mdev, int i)
 {
-	int vecidx = MLX5_EQ_VEC_COMP_BASE + i;
+	int vecidx = MLX5_IRQ_VEC_COMP_BASE + i;
 	struct mlx5_irq *irq;
 	int irqn;
 
@@ -279,16 +269,16 @@ int mlx5_irq_table_create(struct mlx5_core_dev *dev)
 	int err;
 
 	nvec = MLX5_CAP_GEN(dev, num_ports) * num_online_cpus() +
-	       MLX5_EQ_VEC_COMP_BASE;
+	       MLX5_IRQ_VEC_COMP_BASE;
 	nvec = min_t(int, nvec, num_eqs);
-	if (nvec <= MLX5_EQ_VEC_COMP_BASE)
+	if (nvec <= MLX5_IRQ_VEC_COMP_BASE)
 		return -ENOMEM;
 
 	table->irq = kcalloc(nvec, sizeof(*table->irq), GFP_KERNEL);
 	if (!table->irq)
 		return -ENOMEM;
 
-	nvec = pci_alloc_irq_vectors(dev->pdev, MLX5_EQ_VEC_COMP_BASE + 1,
+	nvec = pci_alloc_irq_vectors(dev->pdev, MLX5_IRQ_VEC_COMP_BASE + 1,
 				     nvec, PCI_IRQ_MSIX);
 	if (nvec < 0) {
 		err = nvec;
diff --git a/include/linux/mlx5/eq.h b/include/linux/mlx5/eq.h
index 73ab658af764..4a94e04eff0a 100644
--- a/include/linux/mlx5/eq.h
+++ b/include/linux/mlx5/eq.h
@@ -4,17 +4,7 @@
 #ifndef MLX5_CORE_EQ_H
 #define MLX5_CORE_EQ_H
 
-enum {
-	MLX5_EQ_PAGEREQ_IDX        = 0,
-	MLX5_EQ_CMD_IDX            = 1,
-	MLX5_EQ_ASYNC_IDX          = 2,
-	/* reserved to be used by mlx5_core ulps (mlx5e/mlx5_ib) */
-	MLX5_EQ_PFAULT_IDX         = 3,
-	MLX5_EQ_MAX_ASYNC_EQS,
-	/* completion eqs vector indices start here */
-	MLX5_EQ_VEC_COMP_BASE = MLX5_EQ_MAX_ASYNC_EQS,
-};
-
+#define MLX5_IRQ_VEC_COMP_BASE 1
 #define MLX5_NUM_CMD_EQE   (32)
 #define MLX5_NUM_ASYNC_EQE (0x1000)
 #define MLX5_NUM_SPARE_EQE (0x80)
@@ -23,7 +13,7 @@ struct mlx5_eq;
 struct mlx5_core_dev;
 
 struct mlx5_eq_param {
-	u8             index;
+	u8             irq_index;
 	int            nent;
 	u64            mask;
 	struct notifier_block *nb;
-- 
cgit v1.2.3


From 1f8a7bee27e63d7c5287719049941e285e54d370 Mon Sep 17 00:00:00 2001
From: Yuval Avnery <yuvalav@mellanox.com>
Date: Mon, 10 Jun 2019 23:38:42 +0000
Subject: net/mlx5: Add EQ enable/disable API

Previously, an EQ joined the notifier chain on creation.
This forced the caller to be ready to handle events before creating
the EQ through the eq_create_generic interface.

To help the caller control when the created EQ will be attached to the
IRQ, add enable/disable API.

Signed-off-by: Yuval Avnery <yuvalav@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/infiniband/hw/mlx5/odp.c                 |   9 +-
 drivers/net/ethernet/mellanox/mlx5/core/eq.c     | 105 ++++++++++++++++-------
 drivers/net/ethernet/mellanox/mlx5/core/lib/eq.h |   1 -
 include/linux/mlx5/eq.h                          |   5 +-
 4 files changed, 88 insertions(+), 32 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c
index 693a0e225093..12ccee1eb047 100644
--- a/drivers/infiniband/hw/mlx5/odp.c
+++ b/drivers/infiniband/hw/mlx5/odp.c
@@ -1560,15 +1560,21 @@ mlx5_ib_create_pf_eq(struct mlx5_ib_dev *dev, struct mlx5_ib_pf_eq *eq)
 		.irq_index = 0,
 		.mask = 1 << MLX5_EVENT_TYPE_PAGE_FAULT,
 		.nent = MLX5_IB_NUM_PF_EQE,
-		.nb = &eq->irq_nb,
 	};
 	eq->core = mlx5_eq_create_generic(dev->mdev, &param);
 	if (IS_ERR(eq->core)) {
 		err = PTR_ERR(eq->core);
 		goto err_wq;
 	}
+	err = mlx5_eq_enable(dev->mdev, eq->core, &eq->irq_nb);
+	if (err) {
+		mlx5_ib_err(dev, "failed to enable odp EQ %d\n", err);
+		goto err_eq;
+	}
 
 	return 0;
+err_eq:
+	mlx5_eq_destroy_generic(dev->mdev, eq->core);
 err_wq:
 	destroy_workqueue(eq->wq);
 err_mempool:
@@ -1581,6 +1587,7 @@ mlx5_ib_destroy_pf_eq(struct mlx5_ib_dev *dev, struct mlx5_ib_pf_eq *eq)
 {
 	int err;
 
+	mlx5_eq_disable(dev->mdev, eq->core, &eq->irq_nb);
 	err = mlx5_eq_destroy_generic(dev->mdev, eq->core);
 	cancel_work_sync(&eq->work);
 	destroy_workqueue(eq->wq);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eq.c b/drivers/net/ethernet/mellanox/mlx5/core/eq.c
index 0f5846a34928..58fff2f39b38 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eq.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eq.c
@@ -304,27 +304,14 @@ create_map_eq(struct mlx5_core_dev *dev, struct mlx5_eq *eq,
 	eq->irqn = pci_irq_vector(dev->pdev, vecidx);
 	eq->dev = dev;
 	eq->doorbell = priv->uar->map + MLX5_EQ_DOORBEL_OFFSET;
-	eq->irq_nb = param->nb;
-
-	err = mlx5_irq_attach_nb(dev->priv.eq_table->irq_table, vecidx,
-				 param->nb);
-	if (err)
-		goto err_eq;
 
 	err = mlx5_debug_eq_add(dev, eq);
 	if (err)
-		goto err_detach;
-
-	/* EQs are created in ARMED state
-	 */
-	eq_update_ci(eq, 1);
+		goto err_eq;
 
 	kvfree(in);
 	return 0;
 
-err_detach:
-	mlx5_irq_detach_nb(dev->priv.eq_table->irq_table, vecidx, eq->irq_nb);
-
 err_eq:
 	mlx5_cmd_destroy_eq(dev, eq->eqn);
 
@@ -336,17 +323,49 @@ err_buf:
 	return err;
 }
 
+/**
+ * mlx5_eq_enable - Enable EQ for receiving EQEs
+ * @dev: Device which owns the eq
+ * @eq: EQ to enable
+ * @nb: notifier call block
+ * mlx5_eq_enable - must be called after EQ is created in device.
+ */
+int mlx5_eq_enable(struct mlx5_core_dev *dev, struct mlx5_eq *eq,
+		   struct notifier_block *nb)
+{
+	struct mlx5_eq_table *eq_table = dev->priv.eq_table;
+	int err;
+
+	err = mlx5_irq_attach_nb(eq_table->irq_table, eq->vecidx, nb);
+	if (!err)
+		eq_update_ci(eq, 1);
+
+	return err;
+}
+EXPORT_SYMBOL(mlx5_eq_enable);
+
+/**
+ * mlx5_eq_disable - Disable EQ for receiving EQEs
+ * @dev: Device which owns the eq
+ * @eq: EQ to disable
+ * @nb: notifier call block
+ * mlx5_eq_disable - must be called before EQ is destroyed.
+ */
+void mlx5_eq_disable(struct mlx5_core_dev *dev, struct mlx5_eq *eq,
+		     struct notifier_block *nb)
+{
+	struct mlx5_eq_table *eq_table = dev->priv.eq_table;
+
+	mlx5_irq_detach_nb(eq_table->irq_table, eq->vecidx, nb);
+}
+EXPORT_SYMBOL(mlx5_eq_disable);
+
 static int destroy_unmap_eq(struct mlx5_core_dev *dev, struct mlx5_eq *eq)
 {
 	int err;
 
 	mlx5_debug_eq_remove(dev, eq);
 
-	err = mlx5_irq_detach_nb(dev->priv.eq_table->irq_table,
-				 eq->vecidx, eq->irq_nb);
-	if (err)
-		mlx5_core_warn(eq->dev, "eq failed to detach from irq. err %d",
-			       err);
 	err = mlx5_cmd_destroy_eq(dev, eq->eqn);
 	if (err)
 		mlx5_core_warn(dev, "failed to destroy a previously created eq: eqn %d\n",
@@ -544,14 +563,17 @@ static int create_async_eqs(struct mlx5_core_dev *dev)
 		.irq_index = 0,
 		.mask = 1ull << MLX5_EVENT_TYPE_CMD,
 		.nent = MLX5_NUM_CMD_EQE,
-		.nb = &table->cmd_eq.irq_nb,
 	};
 	err = create_async_eq(dev, &table->cmd_eq.core, &param);
 	if (err) {
 		mlx5_core_warn(dev, "failed to create cmd EQ %d\n", err);
 		goto err0;
 	}
-
+	err = mlx5_eq_enable(dev, &table->cmd_eq.core, &table->cmd_eq.irq_nb);
+	if (err) {
+		mlx5_core_warn(dev, "failed to enable cmd EQ %d\n", err);
+		goto err1;
+	}
 	mlx5_cmd_use_events(dev);
 
 	table->async_eq.irq_nb.notifier_call = mlx5_eq_async_int;
@@ -559,12 +581,17 @@ static int create_async_eqs(struct mlx5_core_dev *dev)
 		.irq_index = 0,
 		.mask = gather_async_events_mask(dev),
 		.nent = MLX5_NUM_ASYNC_EQE,
-		.nb = &table->async_eq.irq_nb,
 	};
 	err = create_async_eq(dev, &table->async_eq.core, &param);
 	if (err) {
 		mlx5_core_warn(dev, "failed to create async EQ %d\n", err);
-		goto err1;
+		goto err2;
+	}
+	err = mlx5_eq_enable(dev, &table->async_eq.core,
+			     &table->async_eq.irq_nb);
+	if (err) {
+		mlx5_core_warn(dev, "failed to enable async EQ %d\n", err);
+		goto err3;
 	}
 
 	table->pages_eq.irq_nb.notifier_call = mlx5_eq_async_int;
@@ -572,21 +599,31 @@ static int create_async_eqs(struct mlx5_core_dev *dev)
 		.irq_index = 0,
 		.mask =  1 << MLX5_EVENT_TYPE_PAGE_REQUEST,
 		.nent = /* TODO: sriov max_vf + */ 1,
-		.nb = &table->pages_eq.irq_nb,
 	};
 	err = create_async_eq(dev, &table->pages_eq.core, &param);
 	if (err) {
 		mlx5_core_warn(dev, "failed to create pages EQ %d\n", err);
-		goto err2;
+		goto err4;
+	}
+	err = mlx5_eq_enable(dev, &table->pages_eq.core,
+			     &table->pages_eq.irq_nb);
+	if (err) {
+		mlx5_core_warn(dev, "failed to enable pages EQ %d\n", err);
+		goto err5;
 	}
 
 	return err;
 
-err2:
+err5:
+	destroy_async_eq(dev, &table->pages_eq.core);
+err4:
+	mlx5_eq_disable(dev, &table->async_eq.core, &table->async_eq.irq_nb);
+err3:
 	destroy_async_eq(dev, &table->async_eq.core);
-
-err1:
+err2:
 	mlx5_cmd_use_polling(dev);
+	mlx5_eq_disable(dev, &table->cmd_eq.core, &table->cmd_eq.irq_nb);
+err1:
 	destroy_async_eq(dev, &table->cmd_eq.core);
 err0:
 	mlx5_eq_notifier_unregister(dev, &table->cq_err_nb);
@@ -598,11 +635,13 @@ static void destroy_async_eqs(struct mlx5_core_dev *dev)
 	struct mlx5_eq_table *table = dev->priv.eq_table;
 	int err;
 
+	mlx5_eq_disable(dev, &table->pages_eq.core, &table->pages_eq.irq_nb);
 	err = destroy_async_eq(dev, &table->pages_eq.core);
 	if (err)
 		mlx5_core_err(dev, "failed to destroy pages eq, err(%d)\n",
 			      err);
 
+	mlx5_eq_disable(dev, &table->async_eq.core, &table->async_eq.irq_nb);
 	err = destroy_async_eq(dev, &table->async_eq.core);
 	if (err)
 		mlx5_core_err(dev, "failed to destroy async eq, err(%d)\n",
@@ -610,6 +649,7 @@ static void destroy_async_eqs(struct mlx5_core_dev *dev)
 
 	mlx5_cmd_use_polling(dev);
 
+	mlx5_eq_disable(dev, &table->cmd_eq.core, &table->cmd_eq.irq_nb);
 	err = destroy_async_eq(dev, &table->cmd_eq.core);
 	if (err)
 		mlx5_core_err(dev, "failed to destroy command eq, err(%d)\n",
@@ -711,6 +751,7 @@ static void destroy_comp_eqs(struct mlx5_core_dev *dev)
 
 	list_for_each_entry_safe(eq, n, &table->comp_eqs_list, list) {
 		list_del(&eq->list);
+		mlx5_eq_disable(dev, &eq->core, &eq->irq_nb);
 		if (destroy_unmap_eq(dev, &eq->core))
 			mlx5_core_warn(dev, "failed to destroy comp EQ 0x%x\n",
 				       eq->core.eqn);
@@ -752,13 +793,19 @@ static int create_comp_eqs(struct mlx5_core_dev *dev)
 			.irq_index = vecidx,
 			.mask = 0,
 			.nent = nent,
-			.nb = &eq->irq_nb,
 		};
 		err = create_map_eq(dev, &eq->core, &param);
 		if (err) {
 			kfree(eq);
 			goto clean;
 		}
+		err = mlx5_eq_enable(dev, &eq->core, &eq->irq_nb);
+		if (err) {
+			destroy_unmap_eq(dev, &eq->core);
+			kfree(eq);
+			goto clean;
+		}
+
 		mlx5_core_dbg(dev, "allocated completion EQN %d\n", eq->core.eqn);
 		/* add tail, to keep the list ordered, for mlx5_vector2eqn to work */
 		list_add_tail(&eq->list, &table->comp_eqs_list);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/eq.h b/drivers/net/ethernet/mellanox/mlx5/core/lib/eq.h
index 3836c39b2900..24bd991a727e 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/lib/eq.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/eq.h
@@ -33,7 +33,6 @@ struct mlx5_eq {
 	u8                      eqn;
 	int                     nent;
 	struct mlx5_rsc_debug   *dbg;
-	struct notifier_block   *irq_nb; /* For destroy only */
 };
 
 struct mlx5_eq_async {
diff --git a/include/linux/mlx5/eq.h b/include/linux/mlx5/eq.h
index 4a94e04eff0a..70e16dcfb4c4 100644
--- a/include/linux/mlx5/eq.h
+++ b/include/linux/mlx5/eq.h
@@ -16,13 +16,16 @@ struct mlx5_eq_param {
 	u8             irq_index;
 	int            nent;
 	u64            mask;
-	struct notifier_block *nb;
 };
 
 struct mlx5_eq *
 mlx5_eq_create_generic(struct mlx5_core_dev *dev, struct mlx5_eq_param *param);
 int
 mlx5_eq_destroy_generic(struct mlx5_core_dev *dev, struct mlx5_eq *eq);
+int mlx5_eq_enable(struct mlx5_core_dev *dev, struct mlx5_eq *eq,
+		   struct notifier_block *nb);
+void mlx5_eq_disable(struct mlx5_core_dev *dev, struct mlx5_eq *eq,
+		     struct notifier_block *nb);
 
 struct mlx5_eqe *mlx5_eq_get_eqe(struct mlx5_eq *eq, u32 cc);
 void mlx5_eq_update_ci(struct mlx5_eq *eq, u32 cc, bool arm);
-- 
cgit v1.2.3


From b25bbc2f24dcab9cd186ef4003c39bf51ad0454c Mon Sep 17 00:00:00 2001
From: Alex Vesker <valex@mellanox.com>
Date: Thu, 28 Jun 2018 15:05:58 +0300
Subject: net/mlx5: Add Vendor Specific Capability access gateway

The Vendor Specific Capability (VSC) is used to activate a gateway
interfacing with the device. The gateway is used to read or write
device configurations, which are organized in different domains (spaces).
A configuration access may result in multiple actions, reads, writes.

Example usages are accessing the Crspace domain to read the crspace or
locking a device semaphore using the Semaphore domain.

The configuration access uses pci_cfg_access to prevent parallel access to
the VSC space by the driver and userspace calls.

Signed-off-by: Alex Vesker <valex@mellanox.com>
Signed-off-by: Feras Daoud <ferasda@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/Makefile   |   3 +-
 .../net/ethernet/mellanox/mlx5/core/lib/pci_vsc.c  | 286 +++++++++++++++++++++
 .../net/ethernet/mellanox/mlx5/core/lib/pci_vsc.h  |  24 ++
 drivers/net/ethernet/mellanox/mlx5/core/main.c     |   3 +
 include/linux/mlx5/driver.h                        |   1 +
 5 files changed, 316 insertions(+), 1 deletion(-)
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/lib/pci_vsc.c
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/lib/pci_vsc.h

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Makefile b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
index 9006fda6bd11..8e07354faea1 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/Makefile
+++ b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
@@ -15,7 +15,8 @@ mlx5_core-y :=	main.o cmd.o debugfs.o fw.o eq.o uar.o pagealloc.o \
 		health.o mcg.o cq.o alloc.o qp.o port.o mr.o pd.o \
 		transobj.o vport.o sriov.o fs_cmd.o fs_core.o \
 		fs_counters.o rl.o lag.o dev.o events.o wq.o lib/gid.o \
-		lib/devcom.o diag/fs_tracepoint.o diag/fw_tracer.o devlink.o
+		lib/devcom.o lib/pci_vsc.o diag/fs_tracepoint.o \
+		diag/fw_tracer.o devlink.o
 
 #
 # Netdev basic
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/pci_vsc.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/pci_vsc.c
new file mode 100644
index 000000000000..a27b0119b3d6
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/pci_vsc.c
@@ -0,0 +1,286 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+/* Copyright (c) 2019 Mellanox Technologies */
+
+#include <linux/pci.h>
+#include "mlx5_core.h"
+#include "pci_vsc.h"
+
+#define MLX5_EXTRACT_C(source, offset, size)	\
+	((((u32)(source)) >> (offset)) & MLX5_ONES32(size))
+#define MLX5_EXTRACT(src, start, len)		\
+	(((len) == 32) ? (src) : MLX5_EXTRACT_C(src, start, len))
+#define MLX5_ONES32(size)			\
+	((size) ? (0xffffffff >> (32 - (size))) : 0)
+#define MLX5_MASK32(offset, size)		\
+	(MLX5_ONES32(size) << (offset))
+#define MLX5_MERGE_C(rsrc1, rsrc2, start, len)  \
+	((((rsrc2) << (start)) & (MLX5_MASK32((start), (len)))) | \
+	((rsrc1) & (~MLX5_MASK32((start), (len)))))
+#define MLX5_MERGE(rsrc1, rsrc2, start, len)	\
+	(((len) == 32) ? (rsrc2) : MLX5_MERGE_C(rsrc1, rsrc2, start, len))
+#define vsc_read(dev, offset, val) \
+	pci_read_config_dword((dev)->pdev, (dev)->vsc_addr + (offset), (val))
+#define vsc_write(dev, offset, val) \
+	pci_write_config_dword((dev)->pdev, (dev)->vsc_addr + (offset), (val))
+#define VSC_MAX_RETRIES 2048
+
+enum mlx5_vsc_state {
+	MLX5_VSC_UNLOCK,
+	MLX5_VSC_LOCK,
+};
+
+enum {
+	VSC_CTRL_OFFSET = 0x4,
+	VSC_COUNTER_OFFSET = 0x8,
+	VSC_SEMAPHORE_OFFSET = 0xc,
+	VSC_ADDR_OFFSET = 0x10,
+	VSC_DATA_OFFSET = 0x14,
+
+	VSC_FLAG_BIT_OFFS = 31,
+	VSC_FLAG_BIT_LEN = 1,
+
+	VSC_SYND_BIT_OFFS = 30,
+	VSC_SYND_BIT_LEN = 1,
+
+	VSC_ADDR_BIT_OFFS = 0,
+	VSC_ADDR_BIT_LEN = 30,
+
+	VSC_SPACE_BIT_OFFS = 0,
+	VSC_SPACE_BIT_LEN = 16,
+
+	VSC_SIZE_VLD_BIT_OFFS = 28,
+	VSC_SIZE_VLD_BIT_LEN = 1,
+
+	VSC_STATUS_BIT_OFFS = 29,
+	VSC_STATUS_BIT_LEN = 3,
+};
+
+void mlx5_pci_vsc_init(struct mlx5_core_dev *dev)
+{
+	if (!mlx5_core_is_pf(dev))
+		return;
+
+	dev->vsc_addr = pci_find_capability(dev->pdev,
+					    PCI_CAP_ID_VNDR);
+	if (!dev->vsc_addr)
+		mlx5_core_warn(dev, "Failed to get valid vendor specific ID\n");
+}
+
+int mlx5_vsc_gw_lock(struct mlx5_core_dev *dev)
+{
+	u32 counter = 0;
+	int retries = 0;
+	u32 lock_val;
+	int ret;
+
+	pci_cfg_access_lock(dev->pdev);
+	do {
+		if (retries > VSC_MAX_RETRIES) {
+			ret = -EBUSY;
+			goto pci_unlock;
+		}
+
+		/* Check if semaphore is already locked */
+		ret = vsc_read(dev, VSC_SEMAPHORE_OFFSET, &lock_val);
+		if (ret)
+			goto pci_unlock;
+
+		if (lock_val) {
+			retries++;
+			usleep_range(1000, 2000);
+			continue;
+		}
+
+		/* Read and write counter value, if written value is
+		 * the same, semaphore was acquired successfully.
+		 */
+		ret = vsc_read(dev, VSC_COUNTER_OFFSET, &counter);
+		if (ret)
+			goto pci_unlock;
+
+		ret = vsc_write(dev, VSC_SEMAPHORE_OFFSET, counter);
+		if (ret)
+			goto pci_unlock;
+
+		ret = vsc_read(dev, VSC_SEMAPHORE_OFFSET, &lock_val);
+		if (ret)
+			goto pci_unlock;
+
+		retries++;
+	} while (counter != lock_val);
+
+	return 0;
+
+pci_unlock:
+	pci_cfg_access_unlock(dev->pdev);
+	return ret;
+}
+
+int mlx5_vsc_gw_unlock(struct mlx5_core_dev *dev)
+{
+	int ret;
+
+	ret = vsc_write(dev, VSC_SEMAPHORE_OFFSET, MLX5_VSC_UNLOCK);
+	pci_cfg_access_unlock(dev->pdev);
+	return ret;
+}
+
+int mlx5_vsc_gw_set_space(struct mlx5_core_dev *dev, u16 space,
+			  u32 *ret_space_size)
+{
+	int ret;
+	u32 val = 0;
+
+	if (!mlx5_vsc_accessible(dev))
+		return -EINVAL;
+
+	if (ret_space_size)
+		*ret_space_size = 0;
+
+	/* Get a unique val */
+	ret = vsc_read(dev, VSC_CTRL_OFFSET, &val);
+	if (ret)
+		goto out;
+
+	/* Write the requested space into the control register */
+	val = MLX5_MERGE(val, space, VSC_SPACE_BIT_OFFS, VSC_SPACE_BIT_LEN);
+	ret = vsc_write(dev, VSC_CTRL_OFFSET, val);
+	if (ret)
+		goto out;
+
+	/* Read back the control register to check the status bits */
+	ret = vsc_read(dev, VSC_CTRL_OFFSET, &val);
+	if (ret)
+		goto out;
+
+	if (MLX5_EXTRACT(val, VSC_STATUS_BIT_OFFS, VSC_STATUS_BIT_LEN) == 0)
+		return -EINVAL;
+
+	/* Get space max address if indicated by size valid bit */
+	if (ret_space_size &&
+	    MLX5_EXTRACT(val, VSC_SIZE_VLD_BIT_OFFS, VSC_SIZE_VLD_BIT_LEN)) {
+		ret = vsc_read(dev, VSC_ADDR_OFFSET, &val);
+		if (ret) {
+			mlx5_core_warn(dev, "Failed to get max space size\n");
+			goto out;
+		}
+		*ret_space_size = MLX5_EXTRACT(val, VSC_ADDR_BIT_OFFS,
+					       VSC_ADDR_BIT_LEN);
+	}
+	return 0;
+
+out:
+	return ret;
+}
+
+static int mlx5_vsc_wait_on_flag(struct mlx5_core_dev *dev, u8 expected_val)
+{
+	int retries = 0;
+	u32 flag;
+	int ret;
+
+	do {
+		if (retries > VSC_MAX_RETRIES)
+			return -EBUSY;
+
+		ret = vsc_read(dev, VSC_ADDR_OFFSET, &flag);
+		if (ret)
+			return ret;
+		flag = MLX5_EXTRACT(flag, VSC_FLAG_BIT_OFFS, VSC_FLAG_BIT_LEN);
+		retries++;
+
+		if ((retries & 0xf) == 0)
+			usleep_range(1000, 2000);
+
+	} while (flag != expected_val);
+
+	return 0;
+}
+
+static int mlx5_vsc_gw_write(struct mlx5_core_dev *dev, unsigned int address,
+			     u32 data)
+{
+	int ret;
+
+	if (MLX5_EXTRACT(address, VSC_SYND_BIT_OFFS,
+			 VSC_FLAG_BIT_LEN + VSC_SYND_BIT_LEN))
+		return -EINVAL;
+
+	/* Set flag to 0x1 */
+	address = MLX5_MERGE(address, 1, VSC_FLAG_BIT_OFFS, 1);
+	ret = vsc_write(dev, VSC_DATA_OFFSET, data);
+	if (ret)
+		goto out;
+
+	ret = vsc_write(dev, VSC_ADDR_OFFSET, address);
+	if (ret)
+		goto out;
+
+	/* Wait for the flag to be cleared */
+	ret = mlx5_vsc_wait_on_flag(dev, 0);
+
+out:
+	return ret;
+}
+
+static int mlx5_vsc_gw_read(struct mlx5_core_dev *dev, unsigned int address,
+			    u32 *data)
+{
+	int ret;
+
+	if (MLX5_EXTRACT(address, VSC_SYND_BIT_OFFS,
+			 VSC_FLAG_BIT_LEN + VSC_SYND_BIT_LEN))
+		return -EINVAL;
+
+	ret = vsc_write(dev, VSC_ADDR_OFFSET, address);
+	if (ret)
+		goto out;
+
+	ret = mlx5_vsc_wait_on_flag(dev, 1);
+	if (ret)
+		goto out;
+
+	ret = vsc_read(dev, VSC_DATA_OFFSET, data);
+out:
+	return ret;
+}
+
+static int mlx5_vsc_gw_read_fast(struct mlx5_core_dev *dev,
+				 unsigned int read_addr,
+				 unsigned int *next_read_addr,
+				 u32 *data)
+{
+	int ret;
+
+	ret = mlx5_vsc_gw_read(dev, read_addr, data);
+	if (ret)
+		goto out;
+
+	ret = vsc_read(dev, VSC_ADDR_OFFSET, next_read_addr);
+	if (ret)
+		goto out;
+
+	*next_read_addr = MLX5_EXTRACT(*next_read_addr, VSC_ADDR_BIT_OFFS,
+				       VSC_ADDR_BIT_LEN);
+
+	if (*next_read_addr <= read_addr)
+		ret = -EINVAL;
+out:
+	return ret;
+}
+
+int mlx5_vsc_gw_read_block_fast(struct mlx5_core_dev *dev, u32 *data,
+				int length)
+{
+	unsigned int next_read_addr = 0;
+	unsigned int read_addr = 0;
+
+	while (read_addr < length) {
+		if (mlx5_vsc_gw_read_fast(dev, read_addr, &next_read_addr,
+					  &data[(read_addr >> 2)]))
+			return read_addr;
+
+		read_addr = next_read_addr;
+	}
+	return length;
+}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/pci_vsc.h b/drivers/net/ethernet/mellanox/mlx5/core/lib/pci_vsc.h
new file mode 100644
index 000000000000..28ea6bfa439f
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/pci_vsc.h
@@ -0,0 +1,24 @@
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
+/* Copyright (c) 2019 Mellanox Technologies */
+
+#ifndef __MLX5_PCI_VSC_H__
+#define __MLX5_PCI_VSC_H__
+
+enum {
+	MLX5_VSC_SPACE_SCAN_CRSPACE = 0x7,
+};
+
+void mlx5_pci_vsc_init(struct mlx5_core_dev *dev);
+int mlx5_vsc_gw_lock(struct mlx5_core_dev *dev);
+int mlx5_vsc_gw_unlock(struct mlx5_core_dev *dev);
+int mlx5_vsc_gw_set_space(struct mlx5_core_dev *dev, u16 space,
+			  u32 *ret_space_size);
+int mlx5_vsc_gw_read_block_fast(struct mlx5_core_dev *dev, u32 *data,
+				int length);
+
+static inline bool mlx5_vsc_accessible(struct mlx5_core_dev *dev)
+{
+	return !!dev->vsc_addr;
+}
+
+#endif /* __MLX5_PCI_VSC_H__ */
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c
index 5ea141893b99..3adc09a1a312 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c
@@ -66,6 +66,7 @@
 #include "lib/vxlan.h"
 #include "lib/geneve.h"
 #include "lib/devcom.h"
+#include "lib/pci_vsc.h"
 #include "diag/fw_tracer.h"
 #include "ecpf.h"
 
@@ -763,6 +764,8 @@ static int mlx5_pci_init(struct mlx5_core_dev *dev, struct pci_dev *pdev,
 		goto err_clr_master;
 	}
 
+	mlx5_pci_vsc_init(dev);
+
 	return 0;
 
 err_clr_master:
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 3a810bf043fe..f732445bcbdb 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -693,6 +693,7 @@ struct mlx5_core_dev {
 	struct mlx5_clock        clock;
 	struct mlx5_ib_clock_info  *clock_info;
 	struct mlx5_fw_tracer   *tracer;
+	u32                      vsc_addr;
 };
 
 struct mlx5_db {
-- 
cgit v1.2.3


From 8b9d8baae1de7400f19058020ee8f0f27d436687 Mon Sep 17 00:00:00 2001
From: Alex Vesker <valex@mellanox.com>
Date: Tue, 17 Jul 2018 11:18:26 +0300
Subject: net/mlx5: Add Crdump support

Crdump allows the driver to retrieve a dump of the FW PCI crspace.
This is useful in case of catastrophic issues which may require FW
reset. The crspace dump can be used for later debug.

Signed-off-by: Alex Vesker <valex@mellanox.com>
Signed-off-by: Moshe Shemesh <moshe@mellanox.com>
Reviewed-by: Feras Daoud <ferasda@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/Makefile   |   2 +-
 .../net/ethernet/mellanox/mlx5/core/diag/crdump.c  | 106 +++++++++++++++++++++
 drivers/net/ethernet/mellanox/mlx5/core/lib/mlx5.h |   3 +
 drivers/net/ethernet/mellanox/mlx5/core/main.c     |   5 +
 include/linux/mlx5/driver.h                        |   1 +
 5 files changed, 116 insertions(+), 1 deletion(-)
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/diag/crdump.c

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Makefile b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
index 8e07354faea1..5fe2bf916c06 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/Makefile
+++ b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
@@ -16,7 +16,7 @@ mlx5_core-y :=	main.o cmd.o debugfs.o fw.o eq.o uar.o pagealloc.o \
 		transobj.o vport.o sriov.o fs_cmd.o fs_core.o \
 		fs_counters.o rl.o lag.o dev.o events.o wq.o lib/gid.o \
 		lib/devcom.o lib/pci_vsc.o diag/fs_tracepoint.o \
-		diag/fw_tracer.o devlink.o
+		diag/fw_tracer.o diag/crdump.o devlink.o
 
 #
 # Netdev basic
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/diag/crdump.c b/drivers/net/ethernet/mellanox/mlx5/core/diag/crdump.c
new file mode 100644
index 000000000000..dfb34172c69b
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/diag/crdump.c
@@ -0,0 +1,106 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+/* Copyright (c) 2019 Mellanox Technologies */
+
+#include <linux/mlx5/driver.h>
+#include "mlx5_core.h"
+#include "lib/pci_vsc.h"
+#include "lib/mlx5.h"
+
+#define BAD_ACCESS			0xBADACCE5
+#define MLX5_PROTECTED_CR_SCAN_CRSPACE	0x7
+
+static bool mlx5_crdump_enabled(struct mlx5_core_dev *dev)
+{
+	return !!dev->priv.health.crdump_size;
+}
+
+static int mlx5_crdump_fill(struct mlx5_core_dev *dev, u32 *cr_data)
+{
+	u32 crdump_size = dev->priv.health.crdump_size;
+	int i, ret;
+
+	for (i = 0; i < (crdump_size / 4); i++)
+		cr_data[i] = BAD_ACCESS;
+
+	ret = mlx5_vsc_gw_read_block_fast(dev, cr_data, crdump_size);
+	if (ret <= 0) {
+		if (ret == 0)
+			return -EIO;
+		return ret;
+	}
+
+	if (crdump_size != ret) {
+		mlx5_core_warn(dev, "failed to read full dump, read %d out of %u\n",
+			       ret, crdump_size);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+int mlx5_crdump_collect(struct mlx5_core_dev *dev, u32 *cr_data)
+{
+	int ret;
+
+	if (!mlx5_crdump_enabled(dev))
+		return -ENODEV;
+
+	ret = mlx5_vsc_gw_lock(dev);
+	if (ret) {
+		mlx5_core_warn(dev, "crdump: failed to lock vsc gw err %d\n",
+			       ret);
+		return ret;
+	}
+
+	ret = mlx5_vsc_gw_set_space(dev, MLX5_VSC_SPACE_SCAN_CRSPACE, NULL);
+	if (ret)
+		goto unlock;
+
+	ret = mlx5_crdump_fill(dev, cr_data);
+
+unlock:
+	mlx5_vsc_gw_unlock(dev);
+	return ret;
+}
+
+int mlx5_crdump_enable(struct mlx5_core_dev *dev)
+{
+	struct mlx5_priv *priv = &dev->priv;
+	u32 space_size;
+	int ret;
+
+	if (!mlx5_core_is_pf(dev) || !mlx5_vsc_accessible(dev) ||
+	    mlx5_crdump_enabled(dev))
+		return 0;
+
+	ret = mlx5_vsc_gw_lock(dev);
+	if (ret)
+		return ret;
+
+	/* Check if space is supported and get space size */
+	ret = mlx5_vsc_gw_set_space(dev, MLX5_VSC_SPACE_SCAN_CRSPACE,
+				    &space_size);
+	if (ret) {
+		/* Unlock and mask error since space is not supported */
+		mlx5_vsc_gw_unlock(dev);
+		return 0;
+	}
+
+	if (!space_size) {
+		mlx5_core_warn(dev, "Invalid Crspace size, zero\n");
+		mlx5_vsc_gw_unlock(dev);
+		return -EINVAL;
+	}
+
+	ret = mlx5_vsc_gw_unlock(dev);
+	if (ret)
+		return ret;
+
+	priv->health.crdump_size = space_size;
+	return 0;
+}
+
+void mlx5_crdump_disable(struct mlx5_core_dev *dev)
+{
+	dev->priv.health.crdump_size = 0;
+}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/mlx5.h b/drivers/net/ethernet/mellanox/mlx5/core/lib/mlx5.h
index 397a2847867a..d918e44491f4 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/lib/mlx5.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/mlx5.h
@@ -41,6 +41,9 @@ int  mlx5_core_reserve_gids(struct mlx5_core_dev *dev, unsigned int count);
 void mlx5_core_unreserve_gids(struct mlx5_core_dev *dev, unsigned int count);
 int  mlx5_core_reserved_gid_alloc(struct mlx5_core_dev *dev, int *gid_index);
 void mlx5_core_reserved_gid_free(struct mlx5_core_dev *dev, int gid_index);
+int mlx5_crdump_enable(struct mlx5_core_dev *dev);
+void mlx5_crdump_disable(struct mlx5_core_dev *dev);
+int mlx5_crdump_collect(struct mlx5_core_dev *dev, u32 *cr_data);
 
 /* TODO move to lib/events.h */
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c
index 3adc09a1a312..c70e97071b87 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c
@@ -1313,6 +1313,10 @@ static int init_one(struct pci_dev *pdev, const struct pci_device_id *id)
 	if (err)
 		goto clean_load;
 
+	err = mlx5_crdump_enable(dev);
+	if (err)
+		dev_err(&pdev->dev, "mlx5_crdump_enable failed with error code %d\n", err);
+
 	pci_save_state(pdev);
 	return 0;
 
@@ -1334,6 +1338,7 @@ static void remove_one(struct pci_dev *pdev)
 	struct mlx5_core_dev *dev  = pci_get_drvdata(pdev);
 	struct devlink *devlink = priv_to_devlink(dev);
 
+	mlx5_crdump_disable(dev);
 	mlx5_devlink_unregister(devlink);
 	mlx5_unregister_device(dev);
 
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index f732445bcbdb..4ae533b3da07 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -435,6 +435,7 @@ struct mlx5_core_health {
 	u32				prev;
 	int				miss_counter;
 	bool				sick;
+	u32				crdump_size;
 	/* wq spinlock to synchronize draining */
 	spinlock_t			wq_lock;
 	struct workqueue_struct	       *wq;
-- 
cgit v1.2.3


From 63cbc552eebf08818af2025aef4589a48ef849c0 Mon Sep 17 00:00:00 2001
From: Feras Daoud <ferasda@mellanox.com>
Date: Mon, 12 Nov 2018 15:23:02 +0200
Subject: net/mlx5: Handle SW reset of FW in error flow

New mlx5 adapters allow the driver to reset the FW in the event of an
error; this action is called "SW Reset". When an SW reset is issued on any
PF all PFs enter reset state which is a recoverable condition. The
existing recovery flow was designed to allow the recovery of a VF after
a PF driver reload. This patch adds the sw reset to the NIC states
as a preparation for sw reset handling.

When a software reset is issued the following occurs:
1. The NIC interface mode is set to 7 while the reset is in progress.
2. Once the reset completes the NIC interface mode is set to 1.

Signed-off-by: Feras Daoud <ferasda@mellanox.com>
Signed-off-by: Moshe Shemesh <moshe@mellanox.com>
Signed-off-by: Daniel Jurgens <danielj@mellanox.com>
Reviewed-by: Alex Vesker <valex@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 .../net/ethernet/mellanox/mlx5/core/en_selftest.c  |   2 +-
 drivers/net/ethernet/mellanox/mlx5/core/health.c   | 105 +++++++++------------
 drivers/net/ethernet/mellanox/mlx5/core/main.c     |   2 +-
 .../net/ethernet/mellanox/mlx5/core/mlx5_core.h    |   2 +-
 include/linux/mlx5/driver.h                        |   2 +-
 5 files changed, 48 insertions(+), 65 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_selftest.c b/drivers/net/ethernet/mellanox/mlx5/core/en_selftest.c
index 4382ef85488c..840ec945ccba 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_selftest.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_selftest.c
@@ -64,7 +64,7 @@ static int mlx5e_test_health_info(struct mlx5e_priv *priv)
 {
 	struct mlx5_core_health *health = &priv->mdev->priv.health;
 
-	return health->sick ? 1 : 0;
+	return health->fatal_error ? 1 : 0;
 }
 
 static int mlx5e_test_link_state(struct mlx5e_priv *priv)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/health.c b/drivers/net/ethernet/mellanox/mlx5/core/health.c
index a2656f4008d9..737e6d550775 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/health.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/health.c
@@ -62,12 +62,18 @@ enum {
 
 enum {
 	MLX5_DROP_NEW_HEALTH_WORK,
-	MLX5_DROP_NEW_RECOVERY_WORK,
+};
+
+enum  {
+	MLX5_SENSOR_NO_ERR		= 0,
+	MLX5_SENSOR_PCI_COMM_ERR	= 1,
+	MLX5_SENSOR_NIC_DISABLED	= 2,
+	MLX5_SENSOR_NIC_SW_RESET	= 3,
 };
 
 u8 mlx5_get_nic_state(struct mlx5_core_dev *dev)
 {
-	return (ioread32be(&dev->iseg->cmdq_addr_l_sz) >> 8) & 3;
+	return (ioread32be(&dev->iseg->cmdq_addr_l_sz) >> 8) & 7;
 }
 
 void mlx5_set_nic_state(struct mlx5_core_dev *dev, u8 state)
@@ -80,18 +86,25 @@ void mlx5_set_nic_state(struct mlx5_core_dev *dev, u8 state)
 		    &dev->iseg->cmdq_addr_l_sz);
 }
 
-static int in_fatal(struct mlx5_core_dev *dev)
+static bool sensor_pci_not_working(struct mlx5_core_dev *dev)
 {
 	struct mlx5_core_health *health = &dev->priv.health;
 	struct health_buffer __iomem *h = health->health;
 
-	if (mlx5_get_nic_state(dev) == MLX5_NIC_IFC_DISABLED)
-		return 1;
+	/* Offline PCI reads return 0xffffffff */
+	return (ioread32be(&h->fw_ver) == 0xffffffff);
+}
 
-	if (ioread32be(&h->fw_ver) == 0xffffffff)
-		return 1;
+static u32 check_fatal_sensors(struct mlx5_core_dev *dev)
+{
+	if (sensor_pci_not_working(dev))
+		return MLX5_SENSOR_PCI_COMM_ERR;
+	if (mlx5_get_nic_state(dev) == MLX5_NIC_IFC_DISABLED)
+		return MLX5_SENSOR_NIC_DISABLED;
+	if (mlx5_get_nic_state(dev) == MLX5_NIC_IFC_SW_RESET)
+		return MLX5_SENSOR_NIC_SW_RESET;
 
-	return 0;
+	return MLX5_SENSOR_NO_ERR;
 }
 
 void mlx5_enter_error_state(struct mlx5_core_dev *dev, bool force)
@@ -101,7 +114,8 @@ void mlx5_enter_error_state(struct mlx5_core_dev *dev, bool force)
 		goto unlock;
 
 	mlx5_core_err(dev, "start\n");
-	if (pci_channel_offline(dev->pdev) || in_fatal(dev) || force) {
+	if (pci_channel_offline(dev->pdev) ||
+	    dev->priv.health.fatal_error != MLX5_SENSOR_NO_ERR || force) {
 		dev->state = MLX5_DEVICE_STATE_INTERNAL_ERROR;
 		mlx5_cmd_flush(dev);
 	}
@@ -137,38 +151,14 @@ static void mlx5_handle_bad_state(struct mlx5_core_dev *dev)
 	mlx5_disable_device(dev);
 }
 
-static void health_recover(struct work_struct *work)
-{
-	struct mlx5_core_health *health;
-	struct delayed_work *dwork;
-	struct mlx5_core_dev *dev;
-	struct mlx5_priv *priv;
-	u8 nic_state;
-
-	dwork = container_of(work, struct delayed_work, work);
-	health = container_of(dwork, struct mlx5_core_health, recover_work);
-	priv = container_of(health, struct mlx5_priv, health);
-	dev = container_of(priv, struct mlx5_core_dev, priv);
-
-	nic_state = mlx5_get_nic_state(dev);
-	if (nic_state == MLX5_NIC_IFC_INVALID) {
-		mlx5_core_err(dev, "health recovery flow aborted since the nic state is invalid\n");
-		return;
-	}
-
-	mlx5_core_err(dev, "starting health recovery flow\n");
-	mlx5_recover_device(dev);
-}
-
 /* How much time to wait until health resetting the driver (in msecs) */
-#define MLX5_RECOVERY_DELAY_MSECS 60000
+#define MLX5_RECOVERY_WAIT_MSECS 60000
 static void health_care(struct work_struct *work)
 {
-	unsigned long recover_delay = msecs_to_jiffies(MLX5_RECOVERY_DELAY_MSECS);
 	struct mlx5_core_health *health;
 	struct mlx5_core_dev *dev;
 	struct mlx5_priv *priv;
-	unsigned long flags;
+	unsigned long end;
 
 	health = container_of(work, struct mlx5_core_health, work);
 	priv = container_of(health, struct mlx5_priv, health);
@@ -176,13 +166,18 @@ static void health_care(struct work_struct *work)
 	mlx5_core_warn(dev, "handling bad device here\n");
 	mlx5_handle_bad_state(dev);
 
-	spin_lock_irqsave(&health->wq_lock, flags);
-	if (!test_bit(MLX5_DROP_NEW_RECOVERY_WORK, &health->flags))
-		schedule_delayed_work(&health->recover_work, recover_delay);
-	else
-		mlx5_core_err(dev,
-			      "new health works are not permitted at this stage\n");
-	spin_unlock_irqrestore(&health->wq_lock, flags);
+	end = jiffies + msecs_to_jiffies(MLX5_RECOVERY_WAIT_MSECS);
+	while (sensor_pci_not_working(dev)) {
+		if (time_after(jiffies, end)) {
+			mlx5_core_err(dev,
+				      "health recovery flow aborted, PCI reads still not working\n");
+			return;
+		}
+		msleep(100);
+	}
+
+	mlx5_core_err(dev, "starting health recovery flow\n");
+	mlx5_recover_device(dev);
 }
 
 static const char *hsynd_str(u8 synd)
@@ -274,6 +269,7 @@ static void poll_health(struct timer_list *t)
 {
 	struct mlx5_core_dev *dev = from_timer(dev, t, priv.health.timer);
 	struct mlx5_core_health *health = &dev->priv.health;
+	u32 fatal_error;
 	u32 count;
 
 	if (dev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR)
@@ -291,8 +287,11 @@ static void poll_health(struct timer_list *t)
 		print_health_info(dev);
 	}
 
-	if (in_fatal(dev) && !health->sick) {
-		health->sick = true;
+	fatal_error = check_fatal_sensors(dev);
+
+	if (fatal_error && !health->fatal_error) {
+		mlx5_core_err(dev, "Fatal error %u detected\n", fatal_error);
+		dev->priv.health.fatal_error = fatal_error;
 		print_health_info(dev);
 		mlx5_trigger_health_work(dev);
 	}
@@ -306,9 +305,8 @@ void mlx5_start_health_poll(struct mlx5_core_dev *dev)
 	struct mlx5_core_health *health = &dev->priv.health;
 
 	timer_setup(&health->timer, poll_health, 0);
-	health->sick = 0;
+	health->fatal_error = MLX5_SENSOR_NO_ERR;
 	clear_bit(MLX5_DROP_NEW_HEALTH_WORK, &health->flags);
-	clear_bit(MLX5_DROP_NEW_RECOVERY_WORK, &health->flags);
 	health->health = &dev->iseg->health;
 	health->health_counter = &dev->iseg->health_counter;
 
@@ -324,7 +322,6 @@ void mlx5_stop_health_poll(struct mlx5_core_dev *dev, bool disable_health)
 	if (disable_health) {
 		spin_lock_irqsave(&health->wq_lock, flags);
 		set_bit(MLX5_DROP_NEW_HEALTH_WORK, &health->flags);
-		set_bit(MLX5_DROP_NEW_RECOVERY_WORK, &health->flags);
 		spin_unlock_irqrestore(&health->wq_lock, flags);
 	}
 
@@ -338,23 +335,10 @@ void mlx5_drain_health_wq(struct mlx5_core_dev *dev)
 
 	spin_lock_irqsave(&health->wq_lock, flags);
 	set_bit(MLX5_DROP_NEW_HEALTH_WORK, &health->flags);
-	set_bit(MLX5_DROP_NEW_RECOVERY_WORK, &health->flags);
 	spin_unlock_irqrestore(&health->wq_lock, flags);
-	cancel_delayed_work_sync(&health->recover_work);
 	cancel_work_sync(&health->work);
 }
 
-void mlx5_drain_health_recovery(struct mlx5_core_dev *dev)
-{
-	struct mlx5_core_health *health = &dev->priv.health;
-	unsigned long flags;
-
-	spin_lock_irqsave(&health->wq_lock, flags);
-	set_bit(MLX5_DROP_NEW_RECOVERY_WORK, &health->flags);
-	spin_unlock_irqrestore(&health->wq_lock, flags);
-	cancel_delayed_work_sync(&dev->priv.health.recover_work);
-}
-
 void mlx5_health_flush(struct mlx5_core_dev *dev)
 {
 	struct mlx5_core_health *health = &dev->priv.health;
@@ -387,7 +371,6 @@ int mlx5_health_init(struct mlx5_core_dev *dev)
 		return -ENOMEM;
 	spin_lock_init(&health->wq_lock);
 	INIT_WORK(&health->work, health_care);
-	INIT_DELAYED_WORK(&health->recover_work, health_recover);
 
 	return 0;
 }
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c
index c70e97071b87..fd0e2949c4f2 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c
@@ -1191,7 +1191,7 @@ static int mlx5_unload_one(struct mlx5_core_dev *dev, bool cleanup)
 	int err = 0;
 
 	if (cleanup)
-		mlx5_drain_health_recovery(dev);
+		mlx5_drain_health_wq(dev);
 
 	mutex_lock(&dev->intf_state_mutex);
 	if (!test_bit(MLX5_INTERFACE_STATE_UP, &dev->intf_state)) {
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h
index d4dd8c1ae55c..97f8cf67ced0 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h
@@ -214,7 +214,7 @@ enum {
 	MLX5_NIC_IFC_FULL		= 0,
 	MLX5_NIC_IFC_DISABLED		= 1,
 	MLX5_NIC_IFC_NO_DRAM_NIC	= 2,
-	MLX5_NIC_IFC_INVALID		= 3
+	MLX5_NIC_IFC_SW_RESET		= 7
 };
 
 u8 mlx5_get_nic_state(struct mlx5_core_dev *dev);
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 4ae533b3da07..cc7fd8e62844 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -435,6 +435,7 @@ struct mlx5_core_health {
 	u32				prev;
 	int				miss_counter;
 	bool				sick;
+	u32				fatal_error;
 	u32				crdump_size;
 	/* wq spinlock to synchronize draining */
 	spinlock_t			wq_lock;
@@ -906,7 +907,6 @@ void mlx5_start_health_poll(struct mlx5_core_dev *dev);
 void mlx5_stop_health_poll(struct mlx5_core_dev *dev, bool disable_health);
 void mlx5_drain_health_wq(struct mlx5_core_dev *dev);
 void mlx5_trigger_health_work(struct mlx5_core_dev *dev);
-void mlx5_drain_health_recovery(struct mlx5_core_dev *dev);
 int mlx5_buf_alloc_node(struct mlx5_core_dev *dev, int size,
 			struct mlx5_frag_buf *buf, int node);
 int mlx5_buf_alloc(struct mlx5_core_dev *dev,
-- 
cgit v1.2.3


From 3e5b72ac2f298423902169db7893fef43365e0a6 Mon Sep 17 00:00:00 2001
From: Feras Daoud <ferasda@mellanox.com>
Date: Mon, 12 Nov 2018 16:40:17 +0200
Subject: net/mlx5: Issue SW reset on FW assert

If a FW assert is considered fatal, indicated by a new bit in the health
buffer, reset the FW. After the reset go through the normal recovery
flow. Only one PF needs to issue the reset, so an attempt is made to
prevent the 2nd function from also issuing the reset.
It's not an error if that happens, it just slows recovery.

Signed-off-by: Feras Daoud <ferasda@mellanox.com>
Signed-off-by: Alex Vesker <valex@mellanox.com>
Signed-off-by: Moshe Shemesh <moshe@mellanox.com>
Signed-off-by: Daniel Jurgens <danielj@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 .../net/ethernet/mellanox/mlx5/core/diag/crdump.c  |  13 +-
 drivers/net/ethernet/mellanox/mlx5/core/health.c   | 157 ++++++++++++++++++++-
 drivers/net/ethernet/mellanox/mlx5/core/main.c     |   1 +
 .../net/ethernet/mellanox/mlx5/core/mlx5_core.h    |   2 +
 include/linux/mlx5/device.h                        |  10 +-
 include/linux/mlx5/driver.h                        |   1 +
 6 files changed, 176 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/diag/crdump.c b/drivers/net/ethernet/mellanox/mlx5/core/diag/crdump.c
index dfb34172c69b..28d02749d3c4 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/diag/crdump.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/diag/crdump.c
@@ -51,14 +51,23 @@ int mlx5_crdump_collect(struct mlx5_core_dev *dev, u32 *cr_data)
 			       ret);
 		return ret;
 	}
+	/* Verify no other PF is running cr-dump or sw reset */
+	ret = mlx5_vsc_sem_set_space(dev, MLX5_SEMAPHORE_SW_RESET,
+				     MLX5_VSC_LOCK);
+	if (ret) {
+		mlx5_core_warn(dev, "Failed to lock SW reset semaphore\n");
+		goto unlock_gw;
+	}
 
 	ret = mlx5_vsc_gw_set_space(dev, MLX5_VSC_SPACE_SCAN_CRSPACE, NULL);
 	if (ret)
-		goto unlock;
+		goto unlock_sem;
 
 	ret = mlx5_crdump_fill(dev, cr_data);
 
-unlock:
+unlock_sem:
+	mlx5_vsc_sem_set_space(dev, MLX5_SEMAPHORE_SW_RESET, MLX5_VSC_UNLOCK);
+unlock_gw:
 	mlx5_vsc_gw_unlock(dev);
 	return ret;
 }
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/health.c b/drivers/net/ethernet/mellanox/mlx5/core/health.c
index 737e6d550775..caf54bd7d538 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/health.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/health.c
@@ -40,6 +40,7 @@
 #include "mlx5_core.h"
 #include "lib/eq.h"
 #include "lib/mlx5.h"
+#include "lib/pci_vsc.h"
 
 enum {
 	MLX5_HEALTH_POLL_INTERVAL	= 2 * HZ,
@@ -67,8 +68,10 @@ enum {
 enum  {
 	MLX5_SENSOR_NO_ERR		= 0,
 	MLX5_SENSOR_PCI_COMM_ERR	= 1,
-	MLX5_SENSOR_NIC_DISABLED	= 2,
-	MLX5_SENSOR_NIC_SW_RESET	= 3,
+	MLX5_SENSOR_PCI_ERR		= 2,
+	MLX5_SENSOR_NIC_DISABLED	= 3,
+	MLX5_SENSOR_NIC_SW_RESET	= 4,
+	MLX5_SENSOR_FW_SYND_RFR		= 5,
 };
 
 u8 mlx5_get_nic_state(struct mlx5_core_dev *dev)
@@ -95,32 +98,162 @@ static bool sensor_pci_not_working(struct mlx5_core_dev *dev)
 	return (ioread32be(&h->fw_ver) == 0xffffffff);
 }
 
+static bool sensor_fw_synd_rfr(struct mlx5_core_dev *dev)
+{
+	struct mlx5_core_health *health = &dev->priv.health;
+	struct health_buffer __iomem *h = health->health;
+	u32 rfr = ioread32be(&h->rfr) >> MLX5_RFR_OFFSET;
+	u8 synd = ioread8(&h->synd);
+
+	if (rfr && synd)
+		mlx5_core_dbg(dev, "FW requests reset, synd: %d\n", synd);
+	return rfr && synd;
+}
+
 static u32 check_fatal_sensors(struct mlx5_core_dev *dev)
 {
 	if (sensor_pci_not_working(dev))
 		return MLX5_SENSOR_PCI_COMM_ERR;
+	if (pci_channel_offline(dev->pdev))
+		return MLX5_SENSOR_PCI_ERR;
 	if (mlx5_get_nic_state(dev) == MLX5_NIC_IFC_DISABLED)
 		return MLX5_SENSOR_NIC_DISABLED;
 	if (mlx5_get_nic_state(dev) == MLX5_NIC_IFC_SW_RESET)
 		return MLX5_SENSOR_NIC_SW_RESET;
+	if (sensor_fw_synd_rfr(dev))
+		return MLX5_SENSOR_FW_SYND_RFR;
 
 	return MLX5_SENSOR_NO_ERR;
 }
 
+static int lock_sem_sw_reset(struct mlx5_core_dev *dev, bool lock)
+{
+	enum mlx5_vsc_state state;
+	int ret;
+
+	if (!mlx5_core_is_pf(dev))
+		return -EBUSY;
+
+	/* Try to lock GW access, this stage doesn't return
+	 * EBUSY because locked GW does not mean that other PF
+	 * already started the reset.
+	 */
+	ret = mlx5_vsc_gw_lock(dev);
+	if (ret == -EBUSY)
+		return -EINVAL;
+	if (ret)
+		return ret;
+
+	state = lock ? MLX5_VSC_LOCK : MLX5_VSC_UNLOCK;
+	/* At this stage, if the return status == EBUSY, then we know
+	 * for sure that another PF started the reset, so don't allow
+	 * another reset.
+	 */
+	ret = mlx5_vsc_sem_set_space(dev, MLX5_SEMAPHORE_SW_RESET, state);
+	if (ret)
+		mlx5_core_warn(dev, "Failed to lock SW reset semaphore\n");
+
+	/* Unlock GW access */
+	mlx5_vsc_gw_unlock(dev);
+
+	return ret;
+}
+
+static bool reset_fw_if_needed(struct mlx5_core_dev *dev)
+{
+	bool supported = (ioread32be(&dev->iseg->initializing) >>
+			  MLX5_FW_RESET_SUPPORTED_OFFSET) & 1;
+	u32 fatal_error;
+
+	if (!supported)
+		return false;
+
+	/* The reset only needs to be issued by one PF. The health buffer is
+	 * shared between all functions, and will be cleared during a reset.
+	 * Check again to avoid a redundant 2nd reset. If the fatal error was
+	 * PCI related a reset won't help.
+	 */
+	fatal_error = check_fatal_sensors(dev);
+	if (fatal_error == MLX5_SENSOR_PCI_COMM_ERR ||
+	    fatal_error == MLX5_SENSOR_NIC_DISABLED ||
+	    fatal_error == MLX5_SENSOR_NIC_SW_RESET) {
+		mlx5_core_warn(dev, "Not issuing FW reset. Either it's already done or won't help.");
+		return false;
+	}
+
+	mlx5_core_warn(dev, "Issuing FW Reset\n");
+	/* Write the NIC interface field to initiate the reset, the command
+	 * interface address also resides here, don't overwrite it.
+	 */
+	mlx5_set_nic_state(dev, MLX5_NIC_IFC_SW_RESET);
+
+	return true;
+}
+
 void mlx5_enter_error_state(struct mlx5_core_dev *dev, bool force)
 {
 	mutex_lock(&dev->intf_state_mutex);
 	if (dev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR)
 		goto unlock;
+	if (dev->state == MLX5_DEVICE_STATE_UNINITIALIZED) {
+		dev->state = MLX5_DEVICE_STATE_INTERNAL_ERROR;
+		goto unlock;
+	}
 
-	mlx5_core_err(dev, "start\n");
-	if (pci_channel_offline(dev->pdev) ||
-	    dev->priv.health.fatal_error != MLX5_SENSOR_NO_ERR || force) {
+	if (check_fatal_sensors(dev) || force) {
 		dev->state = MLX5_DEVICE_STATE_INTERNAL_ERROR;
 		mlx5_cmd_flush(dev);
 	}
 
 	mlx5_notifier_call_chain(dev->priv.events, MLX5_DEV_EVENT_SYS_ERROR, (void *)1);
+unlock:
+	mutex_unlock(&dev->intf_state_mutex);
+}
+
+#define MLX5_CRDUMP_WAIT_MS	60000
+#define MLX5_FW_RESET_WAIT_MS	1000
+void mlx5_error_sw_reset(struct mlx5_core_dev *dev)
+{
+	unsigned long end, delay_ms = MLX5_FW_RESET_WAIT_MS;
+	int lock = -EBUSY;
+
+	mutex_lock(&dev->intf_state_mutex);
+	if (dev->state != MLX5_DEVICE_STATE_INTERNAL_ERROR)
+		goto unlock;
+
+	mlx5_core_err(dev, "start\n");
+
+	if (check_fatal_sensors(dev) == MLX5_SENSOR_FW_SYND_RFR) {
+		/* Get cr-dump and reset FW semaphore */
+		lock = lock_sem_sw_reset(dev, true);
+
+		if (lock == -EBUSY) {
+			delay_ms = MLX5_CRDUMP_WAIT_MS;
+			goto recover_from_sw_reset;
+		}
+		/* Execute SW reset */
+		reset_fw_if_needed(dev);
+	}
+
+recover_from_sw_reset:
+	/* Recover from SW reset */
+	end = jiffies + msecs_to_jiffies(delay_ms);
+	do {
+		if (mlx5_get_nic_state(dev) == MLX5_NIC_IFC_DISABLED)
+			break;
+
+		cond_resched();
+	} while (!time_after(jiffies, end));
+
+	if (mlx5_get_nic_state(dev) != MLX5_NIC_IFC_DISABLED) {
+		dev_err(&dev->pdev->dev, "NIC IFC still %d after %lums.\n",
+			mlx5_get_nic_state(dev), delay_ms);
+	}
+
+	/* Release FW semaphore if you are the lock owner */
+	if (!lock)
+		lock_sem_sw_reset(dev, false);
+
 	mlx5_core_err(dev, "end\n");
 
 unlock:
@@ -143,6 +276,20 @@ static void mlx5_handle_bad_state(struct mlx5_core_dev *dev)
 	case MLX5_NIC_IFC_NO_DRAM_NIC:
 		mlx5_core_warn(dev, "Expected to see disabled NIC but it is no dram nic\n");
 		break;
+
+	case MLX5_NIC_IFC_SW_RESET:
+		/* The IFC mode field is 3 bits, so it will read 0x7 in 2 cases:
+		 * 1. PCI has been disabled (ie. PCI-AER, PF driver unloaded
+		 *    and this is a VF), this is not recoverable by SW reset.
+		 *    Logging of this is handled elsewhere.
+		 * 2. FW reset has been issued by another function, driver can
+		 *    be reloaded to recover after the mode switches to
+		 *    MLX5_NIC_IFC_DISABLED.
+		 */
+		if (dev->priv.health.fatal_error != MLX5_SENSOR_PCI_COMM_ERR)
+			mlx5_core_warn(dev, "NIC SW reset in progress\n");
+		break;
+
 	default:
 		mlx5_core_warn(dev, "Expected to see disabled NIC but it is has invalid value %d\n",
 			       nic_interface);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c
index fd0e2949c4f2..ec5287c51825 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c
@@ -1361,6 +1361,7 @@ static pci_ers_result_t mlx5_pci_err_detected(struct pci_dev *pdev,
 	mlx5_core_info(dev, "%s was called\n", __func__);
 
 	mlx5_enter_error_state(dev, false);
+	mlx5_error_sw_reset(dev);
 	mlx5_unload_one(dev, false);
 	/* In case of kernel call drain the health wq */
 	if (state) {
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h
index 8593c8183d87..29bb61a10289 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h
@@ -113,6 +113,7 @@ enum {
 
 enum mlx5_semaphore_space_address {
 	MLX5_SEMAPHORE_SPACE_DOMAIN     = 0xA,
+	MLX5_SEMAPHORE_SW_RESET         = 0x20,
 };
 
 int mlx5_query_hca_caps(struct mlx5_core_dev *dev);
@@ -122,6 +123,7 @@ int mlx5_cmd_teardown_hca(struct mlx5_core_dev *dev);
 int mlx5_cmd_force_teardown_hca(struct mlx5_core_dev *dev);
 int mlx5_cmd_fast_teardown_hca(struct mlx5_core_dev *dev);
 void mlx5_enter_error_state(struct mlx5_core_dev *dev, bool force);
+void mlx5_error_sw_reset(struct mlx5_core_dev *dev);
 void mlx5_disable_device(struct mlx5_core_dev *dev);
 void mlx5_recover_device(struct mlx5_core_dev *dev);
 int mlx5_sriov_init(struct mlx5_core_dev *dev);
diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h
index 5e760067ac41..35ed38c2ae6c 100644
--- a/include/linux/mlx5/device.h
+++ b/include/linux/mlx5/device.h
@@ -510,6 +510,10 @@ struct mlx5_cmd_layout {
 	u8		status_own;
 };
 
+enum mlx5_fatal_assert_bit_offsets {
+	MLX5_RFR_OFFSET = 31,
+};
+
 struct health_buffer {
 	__be32		assert_var[5];
 	__be32		rsvd0[3];
@@ -518,12 +522,16 @@ struct health_buffer {
 	__be32		rsvd1[2];
 	__be32		fw_ver;
 	__be32		hw_id;
-	__be32		rsvd2;
+	__be32		rfr;
 	u8		irisc_index;
 	u8		synd;
 	__be16		ext_synd;
 };
 
+enum mlx5_initializing_bit_offsets {
+	MLX5_FW_RESET_SUPPORTED_OFFSET = 30,
+};
+
 enum mlx5_cmd_addr_l_sz_offset {
 	MLX5_NIC_IFC_OFFSET = 8,
 };
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index cc7fd8e62844..89205b6cc7ef 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -583,6 +583,7 @@ struct mlx5_priv {
 };
 
 enum mlx5_device_state {
+	MLX5_DEVICE_STATE_UNINITIALIZED,
 	MLX5_DEVICE_STATE_UP,
 	MLX5_DEVICE_STATE_INTERNAL_ERROR,
 };
-- 
cgit v1.2.3


From 1e34f3efd413a6318c3edd6e8e7e091f1214b2e6 Mon Sep 17 00:00:00 2001
From: Moshe Shemesh <moshe@mellanox.com>
Date: Tue, 11 Dec 2018 16:09:53 +0200
Subject: net/mlx5: Create FW devlink_health_reporter

Create mlx5_devlink_health_reporter for FW reporter. The FW reporter
implements devlink_health_reporter diagnose callback.

The fw reporter diagnose command can be triggered at any time by the
user to check the current fw status.
In healthy status, it will return a clear (zero) syndrome. Otherwise it
will return the syndrome and a description of the error type.

Command example and output on healthy status:
$ devlink health diagnose pci/0000:82:00.0 reporter fw
Syndrome: 0

Command example and output on non healthy status:
$ devlink health diagnose pci/0000:82:00.0 reporter fw
Syndrome: 8 Description: unrecoverable hardware error

Signed-off-by: Moshe Shemesh <moshe@mellanox.com>
Signed-off-by: Eran Ben Elisha <eranbe@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/health.c | 48 ++++++++++++++++++++++++
 include/linux/mlx5/driver.h                      |  2 +
 2 files changed, 50 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/health.c b/drivers/net/ethernet/mellanox/mlx5/core/health.c
index caf54bd7d538..973cc005ae60 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/health.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/health.c
@@ -388,6 +388,51 @@ static void print_health_info(struct mlx5_core_dev *dev)
 	mlx5_core_err(dev, "raw fw_ver 0x%08x\n", fw);
 }
 
+static int
+mlx5_fw_reporter_diagnose(struct devlink_health_reporter *reporter,
+			  struct devlink_fmsg *fmsg)
+{
+	struct mlx5_core_dev *dev = devlink_health_reporter_priv(reporter);
+	struct mlx5_core_health *health = &dev->priv.health;
+	struct health_buffer __iomem *h = health->health;
+	u8 synd;
+	int err;
+
+	synd = ioread8(&h->synd);
+	err = devlink_fmsg_u8_pair_put(fmsg, "Syndrome", synd);
+	if (err || !synd)
+		return err;
+	return devlink_fmsg_string_pair_put(fmsg, "Description", hsynd_str(synd));
+}
+
+static const struct devlink_health_reporter_ops mlx5_fw_reporter_ops = {
+		.name = "fw",
+		.diagnose = mlx5_fw_reporter_diagnose,
+};
+
+static void mlx5_fw_reporter_create(struct mlx5_core_dev *dev)
+{
+	struct mlx5_core_health *health = &dev->priv.health;
+	struct devlink *devlink = priv_to_devlink(dev);
+
+	health->fw_reporter =
+		devlink_health_reporter_create(devlink, &mlx5_fw_reporter_ops,
+					       0, false, dev);
+	if (IS_ERR(health->fw_reporter))
+		mlx5_core_warn(dev, "Failed to create fw reporter, err = %ld\n",
+			       PTR_ERR(health->fw_reporter));
+}
+
+static void mlx5_fw_reporter_destroy(struct mlx5_core_dev *dev)
+{
+	struct mlx5_core_health *health = &dev->priv.health;
+
+	if (IS_ERR_OR_NULL(health->fw_reporter))
+		return;
+
+	devlink_health_reporter_destroy(health->fw_reporter);
+}
+
 static unsigned long get_next_poll_jiffies(void)
 {
 	unsigned long next;
@@ -498,6 +543,7 @@ void mlx5_health_cleanup(struct mlx5_core_dev *dev)
 	struct mlx5_core_health *health = &dev->priv.health;
 
 	destroy_workqueue(health->wq);
+	mlx5_fw_reporter_destroy(dev);
 }
 
 int mlx5_health_init(struct mlx5_core_dev *dev)
@@ -519,5 +565,7 @@ int mlx5_health_init(struct mlx5_core_dev *dev)
 	spin_lock_init(&health->wq_lock);
 	INIT_WORK(&health->work, health_care);
 
+	mlx5_fw_reporter_create(dev);
+
 	return 0;
 }
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 89205b6cc7ef..8d5d065d1aa6 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -53,6 +53,7 @@
 #include <linux/mlx5/eq.h>
 #include <linux/timecounter.h>
 #include <linux/ptp_clock_kernel.h>
+#include <net/devlink.h>
 
 enum {
 	MLX5_BOARD_ID_LEN = 64,
@@ -443,6 +444,7 @@ struct mlx5_core_health {
 	unsigned long			flags;
 	struct work_struct		work;
 	struct delayed_work		recover_work;
+	struct devlink_health_reporter *fw_reporter;
 };
 
 struct mlx5_qp_table {
-- 
cgit v1.2.3


From d1bf0e2cc4a6e66c2bff48176b8b2930098468ef Mon Sep 17 00:00:00 2001
From: Moshe Shemesh <moshe@mellanox.com>
Date: Tue, 11 Dec 2018 16:09:56 +0200
Subject: net/mlx5: Report devlink health on FW issues

Use devlink_health_report() to report any symptom of a FW issue, such as
a FW counter miss or a new health syndrome.
FW issues are detected in mlx5 during poll_health(), which is called in
timer atomic context, so the health work queue is used to schedule the
reports.

Signed-off-by: Moshe Shemesh <moshe@mellanox.com>
Signed-off-by: Eran Ben Elisha <eranbe@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/health.c | 33 ++++++++++++++++++++++++
 include/linux/mlx5/driver.h                      |  3 ++-
 2 files changed, 35 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/health.c b/drivers/net/ethernet/mellanox/mlx5/core/health.c
index 1c20d3f1d238..5e876f1de114 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/health.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/health.c
@@ -515,6 +515,29 @@ mlx5_fw_reporter_dump(struct devlink_health_reporter *reporter,
 	return mlx5_fw_tracer_get_saved_traces_objects(dev->tracer, fmsg);
 }
 
+static void mlx5_fw_reporter_err_work(struct work_struct *work)
+{
+	struct mlx5_fw_reporter_ctx fw_reporter_ctx;
+	struct mlx5_core_health *health;
+
+	health = container_of(work, struct mlx5_core_health, report_work);
+
+	if (IS_ERR_OR_NULL(health->fw_reporter))
+		return;
+
+	fw_reporter_ctx.err_synd = health->synd;
+	fw_reporter_ctx.miss_counter = health->miss_counter;
+	if (fw_reporter_ctx.err_synd) {
+		devlink_health_report(health->fw_reporter,
+				      "FW syndrom reported", &fw_reporter_ctx);
+		return;
+	}
+	if (fw_reporter_ctx.miss_counter)
+		devlink_health_report(health->fw_reporter,
+				      "FW miss counter reported",
+				      &fw_reporter_ctx);
+}
+
 static const struct devlink_health_reporter_ops mlx5_fw_reporter_ops = {
 		.name = "fw",
 		.diagnose = mlx5_fw_reporter_diagnose,
@@ -572,7 +595,9 @@ static void poll_health(struct timer_list *t)
 {
 	struct mlx5_core_dev *dev = from_timer(dev, t, priv.health.timer);
 	struct mlx5_core_health *health = &dev->priv.health;
+	struct health_buffer __iomem *h = health->health;
 	u32 fatal_error;
+	u8 prev_synd;
 	u32 count;
 
 	if (dev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR)
@@ -588,8 +613,14 @@ static void poll_health(struct timer_list *t)
 	if (health->miss_counter == MAX_MISSES) {
 		mlx5_core_err(dev, "device's health compromised - reached miss count\n");
 		print_health_info(dev);
+		queue_work(health->wq, &health->report_work);
 	}
 
+	prev_synd = health->synd;
+	health->synd = ioread8(&h->synd);
+	if (health->synd && health->synd != prev_synd)
+		queue_work(health->wq, &health->report_work);
+
 	fatal_error = check_fatal_sensors(dev);
 
 	if (fatal_error && !health->fatal_error) {
@@ -639,6 +670,7 @@ void mlx5_drain_health_wq(struct mlx5_core_dev *dev)
 	spin_lock_irqsave(&health->wq_lock, flags);
 	set_bit(MLX5_DROP_NEW_HEALTH_WORK, &health->flags);
 	spin_unlock_irqrestore(&health->wq_lock, flags);
+	cancel_work_sync(&health->report_work);
 	cancel_work_sync(&health->work);
 }
 
@@ -675,6 +707,7 @@ int mlx5_health_init(struct mlx5_core_dev *dev)
 		return -ENOMEM;
 	spin_lock_init(&health->wq_lock);
 	INIT_WORK(&health->work, health_care);
+	INIT_WORK(&health->report_work, mlx5_fw_reporter_err_work);
 
 	mlx5_fw_reporter_create(dev);
 
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 8d5d065d1aa6..1931a4080d78 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -435,7 +435,7 @@ struct mlx5_core_health {
 	struct timer_list		timer;
 	u32				prev;
 	int				miss_counter;
-	bool				sick;
+	u8				synd;
 	u32				fatal_error;
 	u32				crdump_size;
 	/* wq spinlock to synchronize draining */
@@ -443,6 +443,7 @@ struct mlx5_core_health {
 	struct workqueue_struct	       *wq;
 	unsigned long			flags;
 	struct work_struct		work;
+	struct work_struct		report_work;
 	struct delayed_work		recover_work;
 	struct devlink_health_reporter *fw_reporter;
 };
-- 
cgit v1.2.3


From 96c82cdfe77b5e769624af71ec0554434037b82f Mon Sep 17 00:00:00 2001
From: Moshe Shemesh <moshe@mellanox.com>
Date: Tue, 11 Dec 2018 16:09:57 +0200
Subject: net/mlx5: Add fw fatal devlink_health_reporter

Create mlx5_devlink_health_reporter for fw fatal reporter.
The fw fatal reporter is added in addition to the fw reporter and
implements the recover callback.
The point of having two reporters for FW issues is that we
don't want to run FW recovery on every issue, but only on fatal ones.

Signed-off-by: Moshe Shemesh <moshe@mellanox.com>
Signed-off-by: Eran Ben Elisha <eranbe@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/health.c | 81 ++++++++++++++++++------
 include/linux/mlx5/driver.h                      |  1 +
 2 files changed, 62 insertions(+), 20 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/health.c b/drivers/net/ethernet/mellanox/mlx5/core/health.c
index 5e876f1de114..82a658834675 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/health.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/health.c
@@ -301,31 +301,43 @@ static void mlx5_handle_bad_state(struct mlx5_core_dev *dev)
 
 /* How much time to wait until health resetting the driver (in msecs) */
 #define MLX5_RECOVERY_WAIT_MSECS 60000
-static void health_care(struct work_struct *work)
+static int mlx5_health_try_recover(struct mlx5_core_dev *dev)
 {
-	struct mlx5_core_health *health;
-	struct mlx5_core_dev *dev;
-	struct mlx5_priv *priv;
 	unsigned long end;
 
-	health = container_of(work, struct mlx5_core_health, work);
-	priv = container_of(health, struct mlx5_priv, health);
-	dev = container_of(priv, struct mlx5_core_dev, priv);
 	mlx5_core_warn(dev, "handling bad device here\n");
 	mlx5_handle_bad_state(dev);
-
 	end = jiffies + msecs_to_jiffies(MLX5_RECOVERY_WAIT_MSECS);
 	while (sensor_pci_not_working(dev)) {
 		if (time_after(jiffies, end)) {
 			mlx5_core_err(dev,
 				      "health recovery flow aborted, PCI reads still not working\n");
-			return;
+			return -EIO;
 		}
 		msleep(100);
 	}
 
 	mlx5_core_err(dev, "starting health recovery flow\n");
 	mlx5_recover_device(dev);
+	if (!test_bit(MLX5_INTERFACE_STATE_UP, &dev->intf_state) ||
+	    check_fatal_sensors(dev)) {
+		mlx5_core_err(dev, "health recovery failed\n");
+		return -EIO;
+	}
+	return 0;
+}
+
+static void health_recover_work(struct work_struct *work)
+{
+	struct mlx5_core_health *health;
+	struct mlx5_core_dev *dev;
+	struct mlx5_priv *priv;
+
+	health = container_of(work, struct mlx5_core_health, work);
+	priv = container_of(health, struct mlx5_priv, health);
+	dev = container_of(priv, struct mlx5_core_dev, priv);
+
+	mlx5_health_try_recover(dev);
 }
 
 static const char *hsynd_str(u8 synd)
@@ -544,7 +556,22 @@ static const struct devlink_health_reporter_ops mlx5_fw_reporter_ops = {
 		.dump = mlx5_fw_reporter_dump,
 };
 
-static void mlx5_fw_reporter_create(struct mlx5_core_dev *dev)
+static int
+mlx5_fw_fatal_reporter_recover(struct devlink_health_reporter *reporter,
+			       void *priv_ctx)
+{
+	struct mlx5_core_dev *dev = devlink_health_reporter_priv(reporter);
+
+	return mlx5_health_try_recover(dev);
+}
+
+static const struct devlink_health_reporter_ops mlx5_fw_fatal_reporter_ops = {
+		.name = "fw_fatal",
+		.recover = mlx5_fw_fatal_reporter_recover,
+};
+
+#define MLX5_REPORTER_FW_GRACEFUL_PERIOD 1200000
+static void mlx5_fw_reporters_create(struct mlx5_core_dev *dev)
 {
 	struct mlx5_core_health *health = &dev->priv.health;
 	struct devlink *devlink = priv_to_devlink(dev);
@@ -555,16 +582,26 @@ static void mlx5_fw_reporter_create(struct mlx5_core_dev *dev)
 	if (IS_ERR(health->fw_reporter))
 		mlx5_core_warn(dev, "Failed to create fw reporter, err = %ld\n",
 			       PTR_ERR(health->fw_reporter));
+
+	health->fw_fatal_reporter =
+		devlink_health_reporter_create(devlink,
+					       &mlx5_fw_fatal_reporter_ops,
+					       MLX5_REPORTER_FW_GRACEFUL_PERIOD,
+					       true, dev);
+	if (IS_ERR(health->fw_fatal_reporter))
+		mlx5_core_warn(dev, "Failed to create fw fatal reporter, err = %ld\n",
+			       PTR_ERR(health->fw_fatal_reporter));
 }
 
-static void mlx5_fw_reporter_destroy(struct mlx5_core_dev *dev)
+static void mlx5_fw_reporters_destroy(struct mlx5_core_dev *dev)
 {
 	struct mlx5_core_health *health = &dev->priv.health;
 
-	if (IS_ERR_OR_NULL(health->fw_reporter))
-		return;
+	if (!IS_ERR_OR_NULL(health->fw_reporter))
+		devlink_health_reporter_destroy(health->fw_reporter);
 
-	devlink_health_reporter_destroy(health->fw_reporter);
+	if (!IS_ERR_OR_NULL(health->fw_fatal_reporter))
+		devlink_health_reporter_destroy(health->fw_fatal_reporter);
 }
 
 static unsigned long get_next_poll_jiffies(void)
@@ -686,7 +723,7 @@ void mlx5_health_cleanup(struct mlx5_core_dev *dev)
 	struct mlx5_core_health *health = &dev->priv.health;
 
 	destroy_workqueue(health->wq);
-	mlx5_fw_reporter_destroy(dev);
+	mlx5_fw_reporters_destroy(dev);
 }
 
 int mlx5_health_init(struct mlx5_core_dev *dev)
@@ -694,22 +731,26 @@ int mlx5_health_init(struct mlx5_core_dev *dev)
 	struct mlx5_core_health *health;
 	char *name;
 
+	mlx5_fw_reporters_create(dev);
+
 	health = &dev->priv.health;
 	name = kmalloc(64, GFP_KERNEL);
 	if (!name)
-		return -ENOMEM;
+		goto out_err;
 
 	strcpy(name, "mlx5_health");
 	strcat(name, dev_name(dev->device));
 	health->wq = create_singlethread_workqueue(name);
 	kfree(name);
 	if (!health->wq)
-		return -ENOMEM;
+		goto out_err;
 	spin_lock_init(&health->wq_lock);
-	INIT_WORK(&health->work, health_care);
+	INIT_WORK(&health->work, health_recover_work);
 	INIT_WORK(&health->report_work, mlx5_fw_reporter_err_work);
 
-	mlx5_fw_reporter_create(dev);
-
 	return 0;
+
+out_err:
+	mlx5_fw_reporters_destroy(dev);
+	return -ENOMEM;
 }
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 1931a4080d78..caac96bf9c0d 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -446,6 +446,7 @@ struct mlx5_core_health {
 	struct work_struct		report_work;
 	struct delayed_work		recover_work;
 	struct devlink_health_reporter *fw_reporter;
+	struct devlink_health_reporter *fw_fatal_reporter;
 };
 
 struct mlx5_qp_table {
-- 
cgit v1.2.3


From b3bd076f7501afea2871bb4738ab53498fd32cd5 Mon Sep 17 00:00:00 2001
From: Moshe Shemesh <moshe@mellanox.com>
Date: Sun, 27 Jan 2019 18:38:39 +0200
Subject: net/mlx5: Report devlink health on FW fatal issues

Report devlink health on FW fatal issues via fw_fatal_reporter. The
driver recover flow for FW fatal error is now being handled by the
devlink health.

With the recovery controlled by devlink health, the user has the
ability to cancel the auto-recovery for a debug session and run it
manually.

Call mlx5_enter_error_state() before calling devlink_health_report() to
ensure entering device error state even if auto-recovery is off.

Signed-off-by: Moshe Shemesh <moshe@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/health.c | 42 +++++++++++++++---------
 drivers/net/ethernet/mellanox/mlx5/core/main.c   | 10 +++---
 include/linux/mlx5/driver.h                      |  2 +-
 3 files changed, 31 insertions(+), 23 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/health.c b/drivers/net/ethernet/mellanox/mlx5/core/health.c
index 4ef62c6c6424..2fe6923f7ce0 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/health.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/health.c
@@ -327,19 +327,6 @@ static int mlx5_health_try_recover(struct mlx5_core_dev *dev)
 	return 0;
 }
 
-static void health_recover_work(struct work_struct *work)
-{
-	struct mlx5_core_health *health;
-	struct mlx5_core_dev *dev;
-	struct mlx5_priv *priv;
-
-	health = container_of(work, struct mlx5_core_health, work);
-	priv = container_of(health, struct mlx5_priv, health);
-	dev = container_of(priv, struct mlx5_core_dev, priv);
-
-	mlx5_health_try_recover(dev);
-}
-
 static const char *hsynd_str(u8 synd)
 {
 	switch (synd) {
@@ -614,6 +601,29 @@ free_data:
 	return err;
 }
 
+static void mlx5_fw_fatal_reporter_err_work(struct work_struct *work)
+{
+	struct mlx5_fw_reporter_ctx fw_reporter_ctx;
+	struct mlx5_core_health *health;
+	struct mlx5_core_dev *dev;
+	struct mlx5_priv *priv;
+
+	health = container_of(work, struct mlx5_core_health, fatal_report_work);
+	priv = container_of(health, struct mlx5_priv, health);
+	dev = container_of(priv, struct mlx5_core_dev, priv);
+
+	mlx5_enter_error_state(dev, false);
+	if (IS_ERR_OR_NULL(health->fw_fatal_reporter)) {
+		if (mlx5_health_try_recover(dev))
+			mlx5_core_err(dev, "health recovery failed\n");
+		return;
+	}
+	fw_reporter_ctx.err_synd = health->synd;
+	fw_reporter_ctx.miss_counter = health->miss_counter;
+	devlink_health_report(health->fw_fatal_reporter,
+			      "FW fatal error reported", &fw_reporter_ctx);
+}
+
 static const struct devlink_health_reporter_ops mlx5_fw_fatal_reporter_ops = {
 		.name = "fw_fatal",
 		.recover = mlx5_fw_fatal_reporter_recover,
@@ -672,7 +682,7 @@ void mlx5_trigger_health_work(struct mlx5_core_dev *dev)
 
 	spin_lock_irqsave(&health->wq_lock, flags);
 	if (!test_bit(MLX5_DROP_NEW_HEALTH_WORK, &health->flags))
-		queue_work(health->wq, &health->work);
+		queue_work(health->wq, &health->fatal_report_work);
 	else
 		mlx5_core_err(dev, "new health works are not permitted at this stage\n");
 	spin_unlock_irqrestore(&health->wq_lock, flags);
@@ -758,7 +768,7 @@ void mlx5_drain_health_wq(struct mlx5_core_dev *dev)
 	set_bit(MLX5_DROP_NEW_HEALTH_WORK, &health->flags);
 	spin_unlock_irqrestore(&health->wq_lock, flags);
 	cancel_work_sync(&health->report_work);
-	cancel_work_sync(&health->work);
+	cancel_work_sync(&health->fatal_report_work);
 }
 
 void mlx5_health_flush(struct mlx5_core_dev *dev)
@@ -795,7 +805,7 @@ int mlx5_health_init(struct mlx5_core_dev *dev)
 	if (!health->wq)
 		goto out_err;
 	spin_lock_init(&health->wq_lock);
-	INIT_WORK(&health->work, health_recover_work);
+	INIT_WORK(&health->fatal_report_work, mlx5_fw_fatal_reporter_err_work);
 	INIT_WORK(&health->report_work, mlx5_fw_reporter_err_work);
 
 	return 0;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c
index ec5287c51825..998eec938d3c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c
@@ -1363,11 +1363,8 @@ static pci_ers_result_t mlx5_pci_err_detected(struct pci_dev *pdev,
 	mlx5_enter_error_state(dev, false);
 	mlx5_error_sw_reset(dev);
 	mlx5_unload_one(dev, false);
-	/* In case of kernel call drain the health wq */
-	if (state) {
-		mlx5_drain_health_wq(dev);
-		mlx5_pci_disable_device(dev);
-	}
+	mlx5_drain_health_wq(dev);
+	mlx5_pci_disable_device(dev);
 
 	return state == pci_channel_io_perm_failure ?
 		PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_NEED_RESET;
@@ -1535,7 +1532,8 @@ MODULE_DEVICE_TABLE(pci, mlx5_core_pci_table);
 
 void mlx5_disable_device(struct mlx5_core_dev *dev)
 {
-	mlx5_pci_err_detected(dev->pdev, 0);
+	mlx5_error_sw_reset(dev);
+	mlx5_unload_one(dev, false);
 }
 
 void mlx5_recover_device(struct mlx5_core_dev *dev)
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index caac96bf9c0d..25847beabd3f 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -442,7 +442,7 @@ struct mlx5_core_health {
 	spinlock_t			wq_lock;
 	struct workqueue_struct	       *wq;
 	unsigned long			flags;
-	struct work_struct		work;
+	struct work_struct		fatal_report_work;
 	struct work_struct		report_work;
 	struct delayed_work		recover_work;
 	struct devlink_health_reporter *fw_reporter;
-- 
cgit v1.2.3


From 33ee09cd59ce154b64f9df942dfa5456db90d5f9 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Thu, 13 Jun 2019 19:59:51 +0300
Subject: device property: Add helpers to count items in an array

The usual pattern to allocate the necessary space for an array of properties is
to count them first by calling:

  count = device_property_read_uXX_array(dev, propname, NULL, 0);
  if (count < 0)
	return count;

Introduce helpers device_property_count_uXX() to count items by supplying the
hard-coded last two parameters to device_property_read_uXX_array().

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Acked-by: Sakari Ailus <sakari.ailus@linux.intel.com>
Reviewed-by: Heikki Krogerus <heikki.krogerus@linux.intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 include/linux/property.h | 44 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 44 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/property.h b/include/linux/property.h
index 088d4db7e949..dbacf17fff2e 100644
--- a/include/linux/property.h
+++ b/include/linux/property.h
@@ -148,6 +148,26 @@ static inline int device_property_read_u64(struct device *dev,
 	return device_property_read_u64_array(dev, propname, val, 1);
 }
 
+static inline int device_property_count_u8(struct device *dev, const char *propname)
+{
+	return device_property_read_u8_array(dev, propname, NULL, 0);
+}
+
+static inline int device_property_count_u16(struct device *dev, const char *propname)
+{
+	return device_property_read_u16_array(dev, propname, NULL, 0);
+}
+
+static inline int device_property_count_u32(struct device *dev, const char *propname)
+{
+	return device_property_read_u32_array(dev, propname, NULL, 0);
+}
+
+static inline int device_property_count_u64(struct device *dev, const char *propname)
+{
+	return device_property_read_u64_array(dev, propname, NULL, 0);
+}
+
 static inline bool fwnode_property_read_bool(const struct fwnode_handle *fwnode,
 					     const char *propname)
 {
@@ -178,6 +198,30 @@ static inline int fwnode_property_read_u64(const struct fwnode_handle *fwnode,
 	return fwnode_property_read_u64_array(fwnode, propname, val, 1);
 }
 
+static inline int fwnode_property_count_u8(const struct fwnode_handle *fwnode,
+					   const char *propname)
+{
+	return fwnode_property_read_u8_array(fwnode, propname, NULL, 0);
+}
+
+static inline int fwnode_property_count_u16(const struct fwnode_handle *fwnode,
+					    const char *propname)
+{
+	return fwnode_property_read_u16_array(fwnode, propname, NULL, 0);
+}
+
+static inline int fwnode_property_count_u32(const struct fwnode_handle *fwnode,
+					    const char *propname)
+{
+	return fwnode_property_read_u32_array(fwnode, propname, NULL, 0);
+}
+
+static inline int fwnode_property_count_u64(const struct fwnode_handle *fwnode,
+					    const char *propname)
+{
+	return fwnode_property_read_u64_array(fwnode, propname, NULL, 0);
+}
+
 /**
  * struct property_entry - "Built-in" device property representation.
  * @name: Name of the property.
-- 
cgit v1.2.3


From de76cda215d56256ffcda7ffa538b70f9fb301a7 Mon Sep 17 00:00:00 2001
From: Gustavo Pimentel <Gustavo.Pimentel@synopsys.com>
Date: Tue, 4 Jun 2019 18:24:43 +0200
Subject: PCI: Decode PCIe 32 GT/s link speed

PCIe r5.0, sec 7.5.3.18, defines a new 32.0 GT/s bit in the Supported Link
Speeds Vector of Link Capabilities 2.  Decode this new speed.  This does
not affect the speed of the link, which should be negotiated automatically
by the hardware; it only adds decoding when showing the speed to the user.

Previously, reading the speed of a link operating at this speed showed
"Unknown speed" instead of "32.0 GT/s".

Link: https://lore.kernel.org/lkml/92365e3caf0fc559f9ab14bcd053bfc92d4f661c.1559664969.git.gustavo.pimentel@synopsys.com
Signed-off-by: Gustavo Pimentel <gustavo.pimentel@synopsys.com>
[bhelgaas: changelog]
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
---
 drivers/pci/pci-sysfs.c       | 3 +++
 drivers/pci/pci.c             | 4 +++-
 drivers/pci/probe.c           | 2 +-
 drivers/pci/slot.c            | 1 +
 include/linux/pci.h           | 1 +
 include/uapi/linux/pci_regs.h | 4 ++++
 6 files changed, 13 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/pci/pci-sysfs.c b/drivers/pci/pci-sysfs.c
index 6d27475e39b2..d52d30448e41 100644
--- a/drivers/pci/pci-sysfs.c
+++ b/drivers/pci/pci-sysfs.c
@@ -182,6 +182,9 @@ static ssize_t current_link_speed_show(struct device *dev,
 		return -EINVAL;
 
 	switch (linkstat & PCI_EXP_LNKSTA_CLS) {
+	case PCI_EXP_LNKSTA_CLS_32_0GB:
+		speed = "32 GT/s";
+		break;
 	case PCI_EXP_LNKSTA_CLS_16_0GB:
 		speed = "16 GT/s";
 		break;
diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index 8abc843b1615..4729a7c7a9d9 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -5621,7 +5621,9 @@ enum pci_bus_speed pcie_get_speed_cap(struct pci_dev *dev)
 	 */
 	pcie_capability_read_dword(dev, PCI_EXP_LNKCAP2, &lnkcap2);
 	if (lnkcap2) { /* PCIe r3.0-compliant */
-		if (lnkcap2 & PCI_EXP_LNKCAP2_SLS_16_0GB)
+		if (lnkcap2 & PCI_EXP_LNKCAP2_SLS_32_0GB)
+			return PCIE_SPEED_32_0GT;
+		else if (lnkcap2 & PCI_EXP_LNKCAP2_SLS_16_0GB)
 			return PCIE_SPEED_16_0GT;
 		else if (lnkcap2 & PCI_EXP_LNKCAP2_SLS_8_0GB)
 			return PCIE_SPEED_8_0GT;
diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
index 0e8e2c186f50..c5f27c8cd140 100644
--- a/drivers/pci/probe.c
+++ b/drivers/pci/probe.c
@@ -668,7 +668,7 @@ const unsigned char pcie_link_speed[] = {
 	PCIE_SPEED_5_0GT,		/* 2 */
 	PCIE_SPEED_8_0GT,		/* 3 */
 	PCIE_SPEED_16_0GT,		/* 4 */
-	PCI_SPEED_UNKNOWN,		/* 5 */
+	PCIE_SPEED_32_0GT,		/* 5 */
 	PCI_SPEED_UNKNOWN,		/* 6 */
 	PCI_SPEED_UNKNOWN,		/* 7 */
 	PCI_SPEED_UNKNOWN,		/* 8 */
diff --git a/drivers/pci/slot.c b/drivers/pci/slot.c
index f4d92b1afe7b..ae4aa0e1f2f4 100644
--- a/drivers/pci/slot.c
+++ b/drivers/pci/slot.c
@@ -75,6 +75,7 @@ static const char *pci_bus_speed_strings[] = {
 	"5.0 GT/s PCIe",	/* 0x15 */
 	"8.0 GT/s PCIe",	/* 0x16 */
 	"16.0 GT/s PCIe",	/* 0x17 */
+	"32.0 GT/s PCIe",	/* 0x18 */
 };
 
 static ssize_t bus_speed_read(enum pci_bus_speed speed, char *buf)
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 4a5a84d7bdd4..2173e6b75579 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -258,6 +258,7 @@ enum pci_bus_speed {
 	PCIE_SPEED_5_0GT		= 0x15,
 	PCIE_SPEED_8_0GT		= 0x16,
 	PCIE_SPEED_16_0GT		= 0x17,
+	PCIE_SPEED_32_0GT		= 0x18,
 	PCI_SPEED_UNKNOWN		= 0xff,
 };
 
diff --git a/include/uapi/linux/pci_regs.h b/include/uapi/linux/pci_regs.h
index 27164769d184..f28e562d7ca8 100644
--- a/include/uapi/linux/pci_regs.h
+++ b/include/uapi/linux/pci_regs.h
@@ -528,6 +528,7 @@
 #define  PCI_EXP_LNKCAP_SLS_5_0GB 0x00000002 /* LNKCAP2 SLS Vector bit 1 */
 #define  PCI_EXP_LNKCAP_SLS_8_0GB 0x00000003 /* LNKCAP2 SLS Vector bit 2 */
 #define  PCI_EXP_LNKCAP_SLS_16_0GB 0x00000004 /* LNKCAP2 SLS Vector bit 3 */
+#define  PCI_EXP_LNKCAP_SLS_32_0GB 0x00000005 /* LNKCAP2 SLS Vector bit 4 */
 #define  PCI_EXP_LNKCAP_MLW	0x000003f0 /* Maximum Link Width */
 #define  PCI_EXP_LNKCAP_ASPMS	0x00000c00 /* ASPM Support */
 #define  PCI_EXP_LNKCAP_L0SEL	0x00007000 /* L0s Exit Latency */
@@ -556,6 +557,7 @@
 #define  PCI_EXP_LNKSTA_CLS_5_0GB 0x0002 /* Current Link Speed 5.0GT/s */
 #define  PCI_EXP_LNKSTA_CLS_8_0GB 0x0003 /* Current Link Speed 8.0GT/s */
 #define  PCI_EXP_LNKSTA_CLS_16_0GB 0x0004 /* Current Link Speed 16.0GT/s */
+#define  PCI_EXP_LNKSTA_CLS_32_0GB 0x0005 /* Current Link Speed 32.0GT/s */
 #define  PCI_EXP_LNKSTA_NLW	0x03f0	/* Negotiated Link Width */
 #define  PCI_EXP_LNKSTA_NLW_X1	0x0010	/* Current Link Width x1 */
 #define  PCI_EXP_LNKSTA_NLW_X2	0x0020	/* Current Link Width x2 */
@@ -661,6 +663,7 @@
 #define  PCI_EXP_LNKCAP2_SLS_5_0GB	0x00000004 /* Supported Speed 5GT/s */
 #define  PCI_EXP_LNKCAP2_SLS_8_0GB	0x00000008 /* Supported Speed 8GT/s */
 #define  PCI_EXP_LNKCAP2_SLS_16_0GB	0x00000010 /* Supported Speed 16GT/s */
+#define  PCI_EXP_LNKCAP2_SLS_32_0GB	0x00000020 /* Supported Speed 32GT/s */
 #define  PCI_EXP_LNKCAP2_CROSSLINK	0x00000100 /* Crosslink supported */
 #define PCI_EXP_LNKCTL2		48	/* Link Control 2 */
 #define  PCI_EXP_LNKCTL2_TLS		0x000f
@@ -668,6 +671,7 @@
 #define  PCI_EXP_LNKCTL2_TLS_5_0GT	0x0002 /* Supported Speed 5GT/s */
 #define  PCI_EXP_LNKCTL2_TLS_8_0GT	0x0003 /* Supported Speed 8GT/s */
 #define  PCI_EXP_LNKCTL2_TLS_16_0GT	0x0004 /* Supported Speed 16GT/s */
+#define  PCI_EXP_LNKCTL2_TLS_32_0GT	0x0005 /* Supported Speed 32GT/s */
 #define PCI_EXP_LNKSTA2		50	/* Link Status 2 */
 #define PCI_CAP_EXP_ENDPOINT_SIZEOF_V2	52	/* v2 endpoints with link end here */
 #define PCI_EXP_SLTCAP2		52	/* Slot Capabilities 2 */
-- 
cgit v1.2.3


From b3119cde1d70d6df1574b9f26d8e087e8e5116b4 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.ibm.com>
Date: Wed, 22 May 2019 10:07:45 -0700
Subject: rcu: Fix irritating whitespace error in rcu_assign_pointer()

Signed-off-by: Paul E. McKenney <paulmck@linux.ibm.com>
---
 include/linux/rcupdate.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 915460ec0872..534c05d529b5 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -369,7 +369,7 @@ static inline void rcu_preempt_sleep_check(void) { }
 #define rcu_assign_pointer(p, v)					      \
 ({									      \
 	uintptr_t _r_a_p__v = (uintptr_t)(v);				      \
-	rcu_check_sparse(p, __rcu);				      \
+	rcu_check_sparse(p, __rcu);					      \
 									      \
 	if (__builtin_constant_p(v) && (_r_a_p__v) == (uintptr_t)NULL)	      \
 		WRITE_ONCE((p), (typeof(p))(_r_a_p__v));		      \
-- 
cgit v1.2.3


From 6da9f775175e516fc7229ceaa9b54f8f56aa7924 Mon Sep 17 00:00:00 2001
From: Waiman Long <longman@redhat.com>
Date: Tue, 21 May 2019 16:48:43 -0400
Subject: rcu: Force inlining of rcu_read_lock()

When debugging options are turned on, the rcu_read_lock() function
might not be inlined. This results in lockdep's print_lock() function
printing "rcu_read_lock+0x0/0x70" instead of rcu_read_lock()'s caller.
For example:

[   10.579995] =============================
[   10.584033] WARNING: suspicious RCU usage
[   10.588074] 4.18.0.memcg_v2+ #1 Not tainted
[   10.593162] -----------------------------
[   10.597203] include/linux/rcupdate.h:281 Illegal context switch in
RCU read-side critical section!
[   10.606220]
[   10.606220] other info that might help us debug this:
[   10.606220]
[   10.614280]
[   10.614280] rcu_scheduler_active = 2, debug_locks = 1
[   10.620853] 3 locks held by systemd/1:
[   10.624632]  #0: (____ptrval____) (&type->i_mutex_dir_key#5){.+.+}, at: lookup_slow+0x42/0x70
[   10.633232]  #1: (____ptrval____) (rcu_read_lock){....}, at: rcu_read_lock+0x0/0x70
[   10.640954]  #2: (____ptrval____) (rcu_read_lock){....}, at: rcu_read_lock+0x0/0x70

These "rcu_read_lock+0x0/0x70" strings are not providing any useful
information.  This commit therefore forces inlining of the rcu_read_lock()
function so that rcu_read_lock()'s caller is instead shown.

Signed-off-by: Waiman Long <longman@redhat.com>
Signed-off-by: Paul E. McKenney <paulmck@linux.ibm.com>
---
 include/linux/rcupdate.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 534c05d529b5..a8ed624da555 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -588,7 +588,7 @@ static inline void rcu_preempt_sleep_check(void) { }
  * read-side critical sections may be preempted and they may also block, but
  * only when acquiring spinlocks that are subject to priority inheritance.
  */
-static inline void rcu_read_lock(void)
+static __always_inline void rcu_read_lock(void)
 {
 	__rcu_read_lock();
 	__acquire(RCU);
-- 
cgit v1.2.3


From 9129b017b54dab09eb69b7269026243156e5188e Mon Sep 17 00:00:00 2001
From: Andrea Parri <andrea.parri@amarulasolutions.com>
Date: Mon, 27 May 2019 10:49:57 +0200
Subject: rcu: Don't return a value from rcu_assign_pointer()

Quoting Paul [1]:

  "Given that a quick (and perhaps error-prone) search of the uses
   of rcu_assign_pointer() in v5.1 didn't find a single use of the
   return value, let's please instead change the documentation and
   implementation to eliminate the return value."

[1] https://lkml.kernel.org/r/20190523135013.GL28207@linux.ibm.com

Signed-off-by: Andrea Parri <andrea.parri@amarulasolutions.com>
Cc: "Paul E. McKenney" <paulmck@linux.ibm.com>
Cc: Josh Triplett <josh@joshtriplett.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Lai Jiangshan <jiangshanlai@gmail.com>
Cc: Joel Fernandes <joel@joelfernandes.org>
Cc: rcu@vger.kernel.org
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Sasha Levin <sashal@kernel.org>
Signed-off-by: Paul E. McKenney <paulmck@linux.ibm.com>
---
 Documentation/RCU/whatisRCU.txt           | 8 ++++----
 include/linux/rcupdate.h                  | 5 ++---
 tools/include/linux/rcu.h                 | 4 ++--
 tools/testing/radix-tree/linux/rcupdate.h | 2 +-
 4 files changed, 9 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/RCU/whatisRCU.txt b/Documentation/RCU/whatisRCU.txt
index 981651a8b65d..7e1a8721637a 100644
--- a/Documentation/RCU/whatisRCU.txt
+++ b/Documentation/RCU/whatisRCU.txt
@@ -212,7 +212,7 @@ synchronize_rcu()
 
 rcu_assign_pointer()
 
-	typeof(p) rcu_assign_pointer(p, typeof(p) v);
+	void rcu_assign_pointer(p, typeof(p) v);
 
 	Yes, rcu_assign_pointer() -is- implemented as a macro, though it
 	would be cool to be able to declare a function in this manner.
@@ -220,9 +220,9 @@ rcu_assign_pointer()
 
 	The updater uses this function to assign a new value to an
 	RCU-protected pointer, in order to safely communicate the change
-	in value from the updater to the reader.  This function returns
-	the new value, and also executes any memory-barrier instructions
-	required for a given CPU architecture.
+	in value from the updater to the reader.  This macro does not
+	evaluate to an rvalue, but it does execute any memory-barrier
+	instructions required for a given CPU architecture.
 
 	Perhaps just as important, it serves to document (1) which
 	pointers are protected by RCU and (2) the point at which a
diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index a8ed624da555..0c9b92799abc 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -367,7 +367,7 @@ static inline void rcu_preempt_sleep_check(void) { }
  * other macros that it invokes.
  */
 #define rcu_assign_pointer(p, v)					      \
-({									      \
+do {									      \
 	uintptr_t _r_a_p__v = (uintptr_t)(v);				      \
 	rcu_check_sparse(p, __rcu);					      \
 									      \
@@ -375,8 +375,7 @@ static inline void rcu_preempt_sleep_check(void) { }
 		WRITE_ONCE((p), (typeof(p))(_r_a_p__v));		      \
 	else								      \
 		smp_store_release(&p, RCU_INITIALIZER((typeof(p))_r_a_p__v)); \
-	_r_a_p__v;							      \
-})
+} while (0)
 
 /**
  * rcu_swap_protected() - swap an RCU and a regular pointer
diff --git a/tools/include/linux/rcu.h b/tools/include/linux/rcu.h
index 7d02527e5bce..9554d3fa54f3 100644
--- a/tools/include/linux/rcu.h
+++ b/tools/include/linux/rcu.h
@@ -19,7 +19,7 @@ static inline bool rcu_is_watching(void)
 	return false;
 }
 
-#define rcu_assign_pointer(p, v) ((p) = (v))
-#define RCU_INIT_POINTER(p, v) p=(v)
+#define rcu_assign_pointer(p, v)	do { (p) = (v); } while (0)
+#define RCU_INIT_POINTER(p, v)	do { (p) = (v); } while (0)
 
 #endif
diff --git a/tools/testing/radix-tree/linux/rcupdate.h b/tools/testing/radix-tree/linux/rcupdate.h
index fd280b070fdb..fed468fb0c78 100644
--- a/tools/testing/radix-tree/linux/rcupdate.h
+++ b/tools/testing/radix-tree/linux/rcupdate.h
@@ -7,6 +7,6 @@
 #define rcu_dereference_raw(p) rcu_dereference(p)
 #define rcu_dereference_protected(p, cond) rcu_dereference(p)
 #define rcu_dereference_check(p, cond) rcu_dereference(p)
-#define RCU_INIT_POINTER(p, v)	(p) = (v)
+#define RCU_INIT_POINTER(p, v)	do { (p) = (v); } while (0)
 
 #endif
-- 
cgit v1.2.3


From 4368dada5b37e74a13b892ca5cef8a7d558e9a5f Mon Sep 17 00:00:00 2001
From: Shalom Toledo <shalomt@mellanox.com>
Date: Tue, 11 Jun 2019 18:45:09 +0300
Subject: ptp: ptp_clock: Publish scaled_ppm_to_ppb

Publish scaled_ppm_to_ppb to allow drivers to use it.

Signed-off-by: Shalom Toledo <shalomt@mellanox.com>
Reviewed-by: Petr Machata <petrm@mellanox.com>
Signed-off-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/ptp/ptp_clock.c          | 3 ++-
 include/linux/ptp_clock_kernel.h | 8 ++++++++
 2 files changed, 10 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/ptp/ptp_clock.c b/drivers/ptp/ptp_clock.c
index e189fa1be21e..e60eab7f8a61 100644
--- a/drivers/ptp/ptp_clock.c
+++ b/drivers/ptp/ptp_clock.c
@@ -63,7 +63,7 @@ static void enqueue_external_timestamp(struct timestamp_event_queue *queue,
 	spin_unlock_irqrestore(&queue->lock, flags);
 }
 
-static s32 scaled_ppm_to_ppb(long ppm)
+s32 scaled_ppm_to_ppb(long ppm)
 {
 	/*
 	 * The 'freq' field in the 'struct timex' is in parts per
@@ -82,6 +82,7 @@ static s32 scaled_ppm_to_ppb(long ppm)
 	ppb >>= 13;
 	return (s32) ppb;
 }
+EXPORT_SYMBOL(scaled_ppm_to_ppb);
 
 /* posix clock implementation */
 
diff --git a/include/linux/ptp_clock_kernel.h b/include/linux/ptp_clock_kernel.h
index 28eb9c792522..93cc4f1d444a 100644
--- a/include/linux/ptp_clock_kernel.h
+++ b/include/linux/ptp_clock_kernel.h
@@ -212,6 +212,14 @@ extern void ptp_clock_event(struct ptp_clock *ptp,
 
 extern int ptp_clock_index(struct ptp_clock *ptp);
 
+/**
+ * scaled_ppm_to_ppb() - convert scaled ppm to ppb
+ *
+ * @ppm:    Parts per million, but with a 16 bit binary fractional field
+ */
+
+extern s32 scaled_ppm_to_ppb(long ppm);
+
 /**
  * ptp_find_pin() - obtain the pin index of a given auxiliary function
  *
-- 
cgit v1.2.3


From 4892d3a6a009f7eba2e806b9183e5d8790769f41 Mon Sep 17 00:00:00 2001
From: Linus Walleij <linus.walleij@linaro.org>
Date: Fri, 14 Jun 2019 10:12:26 +0200
Subject: gpio: Drop the parent_irq from gpio_irq_chip

We already have an array named "parents" so instead
of letting one point to the other, simply allocate a
dynamic array to hold the parents, just one if desired
and drop the number of members in gpio_irq_chip by
1. Rename gpiochip to gc in the process.

Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
---
 drivers/gpio/gpiolib.c      | 30 +++++++++++++++++++-----------
 include/linux/gpio/driver.h |  7 -------
 2 files changed, 19 insertions(+), 18 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c
index 4561cb39bdb4..71cd685ed6c4 100644
--- a/drivers/gpio/gpiolib.c
+++ b/drivers/gpio/gpiolib.c
@@ -1644,39 +1644,47 @@ EXPORT_SYMBOL_GPL(gpiochip_irqchip_irq_valid);
 
 /**
  * gpiochip_set_cascaded_irqchip() - connects a cascaded irqchip to a gpiochip
- * @gpiochip: the gpiochip to set the irqchip chain to
+ * @gc: the gpiochip to set the irqchip chain to
  * @parent_irq: the irq number corresponding to the parent IRQ for this
  * chained irqchip
  * @parent_handler: the parent interrupt handler for the accumulated IRQ
  * coming out of the gpiochip. If the interrupt is nested rather than
  * cascaded, pass NULL in this handler argument
  */
-static void gpiochip_set_cascaded_irqchip(struct gpio_chip *gpiochip,
+static void gpiochip_set_cascaded_irqchip(struct gpio_chip *gc,
 					  unsigned int parent_irq,
 					  irq_flow_handler_t parent_handler)
 {
-	if (!gpiochip->irq.domain) {
-		chip_err(gpiochip, "called %s before setting up irqchip\n",
+	struct gpio_irq_chip *girq = &gc->irq;
+	struct device *dev = &gc->gpiodev->dev;
+
+	if (!girq->domain) {
+		chip_err(gc, "called %s before setting up irqchip\n",
 			 __func__);
 		return;
 	}
 
 	if (parent_handler) {
-		if (gpiochip->can_sleep) {
-			chip_err(gpiochip,
+		if (gc->can_sleep) {
+			chip_err(gc,
 				 "you cannot have chained interrupts on a chip that may sleep\n");
 			return;
 		}
+		girq->parents = devm_kcalloc(dev, 1,
+					     sizeof(*girq->parents),
+					     GFP_KERNEL);
+		if (!girq->parents) {
+			chip_err(gc, "out of memory allocating parent IRQ\n");
+			return;
+		}
+		girq->parents[0] = parent_irq;
+		girq->num_parents = 1;
 		/*
 		 * The parent irqchip is already using the chip_data for this
 		 * irqchip, so our callbacks simply use the handler_data.
 		 */
 		irq_set_chained_handler_and_data(parent_irq, parent_handler,
-						 gpiochip);
-
-		gpiochip->irq.parent_irq = parent_irq;
-		gpiochip->irq.parents = &gpiochip->irq.parent_irq;
-		gpiochip->irq.num_parents = 1;
+						 gc);
 	}
 }
 
diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h
index 937c40fb61f7..02698c0f34ea 100644
--- a/include/linux/gpio/driver.h
+++ b/include/linux/gpio/driver.h
@@ -102,13 +102,6 @@ struct gpio_irq_chip {
 	 */
 	unsigned int num_parents;
 
-	/**
-	 * @parent_irq:
-	 *
-	 * For use by gpiochip_set_cascaded_irqchip()
-	 */
-	unsigned int parent_irq;
-
 	/**
 	 * @parents:
 	 *
-- 
cgit v1.2.3


From 1ec0cd8286f35988134e05367ab5e66213b84e7c Mon Sep 17 00:00:00 2001
From: Mathieu Malaterre <malat@debian.org>
Date: Fri, 24 May 2019 12:44:18 +0200
Subject: PM: hibernate: powerpc: Expose pfn_is_nosave() prototype

The declaration for pfn_is_nosave() is only available in
kernel/power/power.h. Since this function can be overridden by arch code,
expose it globally. Having a prototype avoids warnings (sometimes treated
as errors with W=1) such as:

  arch/powerpc/kernel/suspend.c:18:5: error: no previous prototype for 'pfn_is_nosave' [-Werror=missing-prototypes]

This moves the declaration into a globally visible header file and adds
the missing include to avoid a warning on powerpc.

Also remove the duplicated prototypes since they are no longer required.

Signed-off-by: Mathieu Malaterre <malat@debian.org>
Acked-by: Michael Ellerman <mpe@ellerman.id.au> (powerpc)
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 arch/powerpc/kernel/suspend.c | 1 +
 arch/s390/kernel/entry.h      | 1 -
 include/linux/suspend.h       | 1 +
 kernel/power/power.h          | 2 --
 4 files changed, 2 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/arch/powerpc/kernel/suspend.c b/arch/powerpc/kernel/suspend.c
index c612d50c9d18..b84992c10854 100644
--- a/arch/powerpc/kernel/suspend.c
+++ b/arch/powerpc/kernel/suspend.c
@@ -7,6 +7,7 @@
  */
 
 #include <linux/mm.h>
+#include <linux/suspend.h>
 #include <asm/page.h>
 #include <asm/sections.h>
 
diff --git a/arch/s390/kernel/entry.h b/arch/s390/kernel/entry.h
index 20420c2b8a14..b2956d49b6ad 100644
--- a/arch/s390/kernel/entry.h
+++ b/arch/s390/kernel/entry.h
@@ -63,7 +63,6 @@ void __init startup_init(void);
 void die(struct pt_regs *regs, const char *str);
 int setup_profiling_timer(unsigned int multiplier);
 void __init time_init(void);
-int pfn_is_nosave(unsigned long);
 void s390_early_resume(void);
 unsigned long prepare_ftrace_return(unsigned long parent, unsigned long sp, unsigned long ip);
 
diff --git a/include/linux/suspend.h b/include/linux/suspend.h
index 8594001e8be8..05645f726815 100644
--- a/include/linux/suspend.h
+++ b/include/linux/suspend.h
@@ -426,6 +426,7 @@ extern bool system_entering_hibernation(void);
 extern bool hibernation_available(void);
 asmlinkage int swsusp_save(void);
 extern struct pbe *restore_pblist;
+int pfn_is_nosave(unsigned long pfn);
 #else /* CONFIG_HIBERNATION */
 static inline void register_nosave_region(unsigned long b, unsigned long e) {}
 static inline void register_nosave_region_late(unsigned long b, unsigned long e) {}
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 9e58bdc8a562..44bee462ff57 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -75,8 +75,6 @@ static inline void hibernate_reserved_size_init(void) {}
 static inline void hibernate_image_size_init(void) {}
 #endif /* !CONFIG_HIBERNATION */
 
-extern int pfn_is_nosave(unsigned long);
-
 #define power_attr(_name) \
 static struct kobj_attribute _name##_attr = {	\
 	.attr	= {				\
-- 
cgit v1.2.3


From bb2bb903042517b8fb17b2bc21e00512f2dcac01 Mon Sep 17 00:00:00 2001
From: Greg Hackmann <ghackmann@google.com>
Date: Thu, 13 Jun 2019 15:34:07 -0700
Subject: dma-buf: add DMA_BUF_SET_NAME ioctls

This patch adds the complementary DMA_BUF_SET_NAME ioctl, which lets
userspace processes attach a free-form name to each buffer.

This information can be extremely helpful for tracking and accounting
shared buffers.  For example, on Android, we know what each buffer will
be used for at allocation time: GL, multimedia, camera, etc.  The
userspace allocator can use DMA_BUF_SET_NAME to associate that
information with the buffer, so we can later give developers a
breakdown of how much memory they're allocating for graphics, camera,
etc.

Signed-off-by: Greg Hackmann <ghackmann@google.com>
Signed-off-by: Chenbo Feng <fengc@google.com>
Signed-off-by: Sumit Semwal <sumit.semwal@linaro.org>
Link: https://patchwork.freedesktop.org/patch/msgid/20190613223408.139221-3-fengc@google.com
---
 drivers/dma-buf/dma-buf.c    | 65 ++++++++++++++++++++++++++++++++++++++++++--
 include/linux/dma-buf.h      |  5 +++-
 include/uapi/linux/dma-buf.h |  3 ++
 3 files changed, 69 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/dma-buf/dma-buf.c b/drivers/dma-buf/dma-buf.c
index 3612ccededd6..ab96410d1dcd 100644
--- a/drivers/dma-buf/dma-buf.c
+++ b/drivers/dma-buf/dma-buf.c
@@ -48,8 +48,24 @@ struct dma_buf_list {
 
 static struct dma_buf_list db_list;
 
+static char *dmabuffs_dname(struct dentry *dentry, char *buffer, int buflen)
+{
+	struct dma_buf *dmabuf;
+	char name[DMA_BUF_NAME_LEN];
+	size_t ret = 0;
+
+	dmabuf = dentry->d_fsdata;
+	mutex_lock(&dmabuf->lock);
+	if (dmabuf->name)
+		ret = strlcpy(name, dmabuf->name, DMA_BUF_NAME_LEN);
+	mutex_unlock(&dmabuf->lock);
+
+	return dynamic_dname(dentry, buffer, buflen, "/%s:%s",
+			     dentry->d_name.name, ret > 0 ? name : "");
+}
+
 static const struct dentry_operations dma_buf_dentry_ops = {
-	.d_dname = simple_dname,
+	.d_dname = dmabuffs_dname,
 };
 
 static struct vfsmount *dma_buf_mnt;
@@ -301,6 +317,43 @@ out:
 	return events;
 }
 
+/**
+ * dma_buf_set_name - Set a name to a specific dma_buf to track the usage.
+ * The name of the dma-buf buffer can only be set when the dma-buf is not
+ * attached to any devices. It could theoretically support changing the
+ * name of the dma-buf if the same piece of memory is used for multiple
+ * purposes between different devices.
+ *
+ * @dmabuf: [in]    dmabuf buffer that will be renamed.
+ * @buf:   [in]     A piece of userspace memory that contains the name of
+ *                  the dma-buf.
+ *
+ * Returns 0 on success. If the dma-buf buffer is already attached to
+ * devices, return -EBUSY.
+ *
+ */
+static long dma_buf_set_name(struct dma_buf *dmabuf, const char __user *buf)
+{
+	char *name = strndup_user(buf, DMA_BUF_NAME_LEN);
+	long ret = 0;
+
+	if (IS_ERR(name))
+		return PTR_ERR(name);
+
+	mutex_lock(&dmabuf->lock);
+	if (!list_empty(&dmabuf->attachments)) {
+		ret = -EBUSY;
+		kfree(name);
+		goto out_unlock;
+	}
+	kfree(dmabuf->name);
+	dmabuf->name = name;
+
+out_unlock:
+	mutex_unlock(&dmabuf->lock);
+	return ret;
+}
+
 static long dma_buf_ioctl(struct file *file,
 			  unsigned int cmd, unsigned long arg)
 {
@@ -339,6 +392,10 @@ static long dma_buf_ioctl(struct file *file,
 			ret = dma_buf_begin_cpu_access(dmabuf, direction);
 
 		return ret;
+
+	case DMA_BUF_SET_NAME:
+		return dma_buf_set_name(dmabuf, (const char __user *)arg);
+
 	default:
 		return -ENOTTY;
 	}
@@ -380,6 +437,7 @@ static struct file *dma_buf_getfile(struct dma_buf *dmabuf, int flags)
 		goto err_alloc_file;
 	file->f_flags = flags & (O_ACCMODE | O_NONBLOCK);
 	file->private_data = dmabuf;
+	file->f_path.dentry->d_fsdata = dmabuf;
 
 	return file;
 
@@ -1112,12 +1170,13 @@ static int dma_buf_debug_show(struct seq_file *s, void *unused)
 			continue;
 		}
 
-		seq_printf(s, "%08zu\t%08x\t%08x\t%08ld\t%s\t%08lu\n",
+		seq_printf(s, "%08zu\t%08x\t%08x\t%08ld\t%s\t%08lu\t%s\n",
 				buf_obj->size,
 				buf_obj->file->f_flags, buf_obj->file->f_mode,
 				file_count(buf_obj->file),
 				buf_obj->exp_name,
-				file_inode(buf_obj->file)->i_ino);
+				file_inode(buf_obj->file)->i_ino,
+				buf_obj->name ?: "");
 
 		robj = buf_obj->resv;
 		while (true) {
diff --git a/include/linux/dma-buf.h b/include/linux/dma-buf.h
index 8a327566d7f4..01ad5b942a6f 100644
--- a/include/linux/dma-buf.h
+++ b/include/linux/dma-buf.h
@@ -280,10 +280,12 @@ struct dma_buf_ops {
  * @file: file pointer used for sharing buffers across, and for refcounting.
  * @attachments: list of dma_buf_attachment that denotes all devices attached.
  * @ops: dma_buf_ops associated with this buffer object.
- * @lock: used internally to serialize list manipulation, attach/detach and vmap/unmap
+ * @lock: used internally to serialize list manipulation, attach/detach and
+ *        vmap/unmap, and accesses to name
  * @vmapping_counter: used internally to refcnt the vmaps
  * @vmap_ptr: the current vmap ptr if vmapping_counter > 0
  * @exp_name: name of the exporter; useful for debugging.
+ * @name: userspace-provided name; useful for accounting and debugging.
  * @owner: pointer to exporter module; used for refcounting when exporter is a
  *         kernel module.
  * @list_node: node for dma_buf accounting and debugging.
@@ -311,6 +313,7 @@ struct dma_buf {
 	unsigned vmapping_counter;
 	void *vmap_ptr;
 	const char *exp_name;
+	const char *name;
 	struct module *owner;
 	struct list_head list_node;
 	void *priv;
diff --git a/include/uapi/linux/dma-buf.h b/include/uapi/linux/dma-buf.h
index d75df5210a4a..dbc7092e04b5 100644
--- a/include/uapi/linux/dma-buf.h
+++ b/include/uapi/linux/dma-buf.h
@@ -35,7 +35,10 @@ struct dma_buf_sync {
 #define DMA_BUF_SYNC_VALID_FLAGS_MASK \
 	(DMA_BUF_SYNC_RW | DMA_BUF_SYNC_END)
 
+#define DMA_BUF_NAME_LEN	32
+
 #define DMA_BUF_BASE		'b'
 #define DMA_BUF_IOCTL_SYNC	_IOW(DMA_BUF_BASE, 0, struct dma_buf_sync)
+#define DMA_BUF_SET_NAME	_IOW(DMA_BUF_BASE, 1, const char *)
 
 #endif
-- 
cgit v1.2.3


From d1609c312d42f3bdfe7df9d4dd9d5b2c7ace90f4 Mon Sep 17 00:00:00 2001
From: Peter Chen <peter.chen@nxp.com>
Date: Sun, 28 Apr 2019 10:35:31 +0800
Subject: usb: chipidea: imx: add imx7ulp support

This commit adds the CI_HDRC_PMQOS flag to keep the system from entering
idle. On imx7ulp, the DMA stops when the system enters idle, so USB
transfers cannot work in that case.

Signed-off-by: Peter Chen <peter.chen@nxp.com>
---
 drivers/usb/chipidea/ci_hdrc_imx.c | 28 +++++++++++++++++++++++++++-
 drivers/usb/chipidea/usbmisc_imx.c |  4 ++++
 include/linux/usb/chipidea.h       |  1 +
 3 files changed, 32 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/usb/chipidea/ci_hdrc_imx.c b/drivers/usb/chipidea/ci_hdrc_imx.c
index ceec8d5985d4..a76708501236 100644
--- a/drivers/usb/chipidea/ci_hdrc_imx.c
+++ b/drivers/usb/chipidea/ci_hdrc_imx.c
@@ -13,6 +13,7 @@
 #include <linux/usb/of.h>
 #include <linux/clk.h>
 #include <linux/pinctrl/consumer.h>
+#include <linux/pm_qos.h>
 
 #include "ci.h"
 #include "ci_hdrc_imx.h"
@@ -63,6 +64,11 @@ static const struct ci_hdrc_imx_platform_flag imx7d_usb_data = {
 	.flags = CI_HDRC_SUPPORTS_RUNTIME_PM,
 };
 
+static const struct ci_hdrc_imx_platform_flag imx7ulp_usb_data = {
+	.flags = CI_HDRC_SUPPORTS_RUNTIME_PM |
+		CI_HDRC_PMQOS,
+};
+
 static const struct of_device_id ci_hdrc_imx_dt_ids[] = {
 	{ .compatible = "fsl,imx23-usb", .data = &imx23_usb_data},
 	{ .compatible = "fsl,imx28-usb", .data = &imx28_usb_data},
@@ -72,6 +78,7 @@ static const struct of_device_id ci_hdrc_imx_dt_ids[] = {
 	{ .compatible = "fsl,imx6sx-usb", .data = &imx6sx_usb_data},
 	{ .compatible = "fsl,imx6ul-usb", .data = &imx6ul_usb_data},
 	{ .compatible = "fsl,imx7d-usb", .data = &imx7d_usb_data},
+	{ .compatible = "fsl,imx7ulp-usb", .data = &imx7ulp_usb_data},
 	{ /* sentinel */ }
 };
 MODULE_DEVICE_TABLE(of, ci_hdrc_imx_dt_ids);
@@ -93,6 +100,8 @@ struct ci_hdrc_imx_data {
 	struct clk *clk_ahb;
 	struct clk *clk_per;
 	/* --------------------------------- */
+	struct pm_qos_request pm_qos_req;
+	const struct ci_hdrc_imx_platform_flag *plat_data;
 };
 
 /* Common functions shared by usbmisc drivers */
@@ -309,6 +318,8 @@ static int ci_hdrc_imx_probe(struct platform_device *pdev)
 	if (!data)
 		return -ENOMEM;
 
+	data->plat_data = imx_platform_flag;
+	pdata.flags |= imx_platform_flag->flags;
 	platform_set_drvdata(pdev, data);
 	data->usbmisc_data = usbmisc_get_init_data(dev);
 	if (IS_ERR(data->usbmisc_data))
@@ -369,6 +380,11 @@ static int ci_hdrc_imx_probe(struct platform_device *pdev)
 			}
 		}
 	}
+
+	if (pdata.flags & CI_HDRC_PMQOS)
+		pm_qos_add_request(&data->pm_qos_req,
+			PM_QOS_CPU_DMA_LATENCY, 0);
+
 	ret = imx_get_clks(dev);
 	if (ret)
 		goto disable_hsic_regulator;
@@ -396,7 +412,6 @@ static int ci_hdrc_imx_probe(struct platform_device *pdev)
 		usb_phy_init(pdata.usb_phy);
 	}
 
-	pdata.flags |= imx_platform_flag->flags;
 	if (pdata.flags & CI_HDRC_SUPPORTS_RUNTIME_PM)
 		data->supports_runtime_pm = true;
 
@@ -439,6 +454,8 @@ err_clk:
 disable_hsic_regulator:
 	if (data->hsic_pad_regulator)
 		ret = regulator_disable(data->hsic_pad_regulator);
+	if (pdata.flags & CI_HDRC_PMQOS)
+		pm_qos_remove_request(&data->pm_qos_req);
 	return ret;
 }
 
@@ -455,6 +472,8 @@ static int ci_hdrc_imx_remove(struct platform_device *pdev)
 	if (data->override_phy_control)
 		usb_phy_shutdown(data->phy);
 	imx_disable_unprepare_clks(&pdev->dev);
+	if (data->plat_data->flags & CI_HDRC_PMQOS)
+		pm_qos_remove_request(&data->pm_qos_req);
 	if (data->hsic_pad_regulator)
 		regulator_disable(data->hsic_pad_regulator);
 
@@ -480,6 +499,9 @@ static int __maybe_unused imx_controller_suspend(struct device *dev)
 	}
 
 	imx_disable_unprepare_clks(dev);
+	if (data->plat_data->flags & CI_HDRC_PMQOS)
+		pm_qos_remove_request(&data->pm_qos_req);
+
 	data->in_lpm = true;
 
 	return 0;
@@ -497,6 +519,10 @@ static int __maybe_unused imx_controller_resume(struct device *dev)
 		return 0;
 	}
 
+	if (data->plat_data->flags & CI_HDRC_PMQOS)
+		pm_qos_add_request(&data->pm_qos_req,
+			PM_QOS_CPU_DMA_LATENCY, 0);
+
 	ret = imx_prepare_enable_clks(dev);
 	if (ret)
 		return ret;
diff --git a/drivers/usb/chipidea/usbmisc_imx.c b/drivers/usb/chipidea/usbmisc_imx.c
index d8b67e150b12..b7a5727d0c8a 100644
--- a/drivers/usb/chipidea/usbmisc_imx.c
+++ b/drivers/usb/chipidea/usbmisc_imx.c
@@ -763,6 +763,10 @@ static const struct of_device_id usbmisc_imx_dt_ids[] = {
 		.compatible = "fsl,imx7d-usbmisc",
 		.data = &imx7d_usbmisc_ops,
 	},
+	{
+		.compatible = "fsl,imx7ulp-usbmisc",
+		.data = &imx7d_usbmisc_ops,
+	},
 	{ /* sentinel */ }
 };
 MODULE_DEVICE_TABLE(of, usbmisc_imx_dt_ids);
diff --git a/include/linux/usb/chipidea.h b/include/linux/usb/chipidea.h
index 911e05af671e..edd89b7c8f18 100644
--- a/include/linux/usb/chipidea.h
+++ b/include/linux/usb/chipidea.h
@@ -61,6 +61,7 @@ struct ci_hdrc_platform_data {
 #define CI_HDRC_OVERRIDE_PHY_CONTROL	BIT(12) /* Glue layer manages phy */
 #define CI_HDRC_REQUIRES_ALIGNED_DMA	BIT(13)
 #define CI_HDRC_IMX_IS_HSIC		BIT(14)
+#define CI_HDRC_PMQOS			BIT(15)
 	enum usb_dr_mode	dr_mode;
 #define CI_HDRC_CONTROLLER_RESET_EVENT		0
 #define CI_HDRC_CONTROLLER_STOPPED_EVENT	1
-- 
cgit v1.2.3


From 26f7044e95042daabcf1c71796a0e804a83c979f Mon Sep 17 00:00:00 2001
From: Chung-Hsien Hsu <stanley.hsu@cypress.com>
Date: Thu, 9 May 2019 09:49:06 +0000
Subject: nl80211: add support for SAE authentication offload

Let drivers advertise support for station-mode SAE authentication
offload with a new NL80211_EXT_FEATURE_SAE_OFFLOAD flag.

Signed-off-by: Chung-Hsien Hsu <stanley.hsu@cypress.com>
Signed-off-by: Chi-Hsien Lin <chi-hsien.lin@cypress.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/linux/ieee80211.h    |  1 +
 include/net/cfg80211.h       |  5 +++++
 include/uapi/linux/nl80211.h | 19 +++++++++++++++++++
 net/wireless/nl80211.c       | 14 ++++++++++++++
 4 files changed, 39 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h
index 61f0a316c6ac..5dfd949ade25 100644
--- a/include/linux/ieee80211.h
+++ b/include/linux/ieee80211.h
@@ -2612,6 +2612,7 @@ enum ieee80211_key_len {
 #define FILS_ERP_MAX_RRK_LEN		64
 
 #define PMK_MAX_LEN			64
+#define SAE_PASSWORD_MAX_LEN		128
 
 /* Public action codes (IEEE Std 802.11-2016, 9.6.8.1, Table 9-307) */
 enum ieee80211_pub_actioncode {
diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index c19687833493..4b45056dbb25 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -742,6 +742,9 @@ struct survey_info {
  *	CFG80211_MAX_WEP_KEYS WEP keys
  * @wep_tx_key: key index (0..3) of the default TX static WEP key
  * @psk: PSK (for devices supporting 4-way-handshake offload)
+ * @sae_pwd: password for SAE authentication (for devices supporting SAE
+ *	offload)
+ * @sae_pwd_len: length of SAE password (for devices supporting SAE offload)
  */
 struct cfg80211_crypto_settings {
 	u32 wpa_versions;
@@ -757,6 +760,8 @@ struct cfg80211_crypto_settings {
 	struct key_params *wep_keys;
 	int wep_tx_key;
 	const u8 *psk;
+	const u8 *sae_pwd;
+	u8 sae_pwd_len;
 };
 
 /**
diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h
index e9bf3d69d847..8b1e43fecd25 100644
--- a/include/uapi/linux/nl80211.h
+++ b/include/uapi/linux/nl80211.h
@@ -234,6 +234,15 @@
  * use in a FILS shared key connection with PMKSA caching.
  */
 
+/**
+ * DOC: SAE authentication offload
+ *
+ * By setting @NL80211_EXT_FEATURE_SAE_OFFLOAD flag drivers can indicate they
+ * support offloading SAE authentication for WPA3-Personal networks. In
+ * %NL80211_CMD_CONNECT the password for SAE should be specified using
+ * %NL80211_ATTR_SAE_PASSWORD.
+ */
+
 /**
  * enum nl80211_commands - supported nl80211 commands
  *
@@ -2341,6 +2350,10 @@ enum nl80211_commands {
  *	should be picking up the lowest tx power, either tx power per-interface
  *	or per-station.
  *
+ * @NL80211_ATTR_SAE_PASSWORD: attribute for passing SAE password material. It
+ *	is used with %NL80211_CMD_CONNECT to provide password for offloading
+ *	SAE authentication for WPA3-Personal networks.
+ *
  * @NUM_NL80211_ATTR: total number of nl80211_attrs available
  * @NL80211_ATTR_MAX: highest attribute number currently defined
  * @__NL80211_ATTR_AFTER_LAST: internal use
@@ -2794,6 +2807,8 @@ enum nl80211_attrs {
 	NL80211_ATTR_STA_TX_POWER_SETTING,
 	NL80211_ATTR_STA_TX_POWER,
 
+	NL80211_ATTR_SAE_PASSWORD,
+
 	/* add attributes here, update the policy in nl80211.c */
 
 	__NL80211_ATTR_AFTER_LAST,
@@ -5423,6 +5438,9 @@ enum nl80211_feature_flags {
  * @NL80211_EXT_FEATURE_STA_TX_PWR: This driver supports controlling tx power
  *	to a station.
  *
+ * @NL80211_EXT_FEATURE_SAE_OFFLOAD: Device wants to do SAE authentication in
+ *	station mode (SAE password is passed as part of the connect command).
+ *
  * @NUM_NL80211_EXT_FEATURES: number of extended features.
  * @MAX_NL80211_EXT_FEATURES: highest extended feature index.
  */
@@ -5467,6 +5485,7 @@ enum nl80211_ext_feature_index {
 	NL80211_EXT_FEATURE_SCHED_SCAN_BAND_SPECIFIC_RSSI_THOLD,
 	NL80211_EXT_FEATURE_EXT_KEY_ID,
 	NL80211_EXT_FEATURE_STA_TX_PWR,
+	NL80211_EXT_FEATURE_SAE_OFFLOAD,
 
 	/* add new features before the definition below */
 	NUM_NL80211_EXT_FEATURES,
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 8332a5731c57..80e514872719 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -571,6 +571,8 @@ const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = {
 	[NL80211_ATTR_PEER_MEASUREMENTS] =
 		NLA_POLICY_NESTED(nl80211_pmsr_attr_policy),
 	[NL80211_ATTR_AIRTIME_WEIGHT] = NLA_POLICY_MIN(NLA_U16, 1),
+	[NL80211_ATTR_SAE_PASSWORD] = { .type = NLA_BINARY,
+					.len = SAE_PASSWORD_MAX_LEN },
 };
 
 /* policy for the key attributes */
@@ -4434,6 +4436,8 @@ static bool nl80211_valid_auth_type(struct cfg80211_registered_device *rdev,
 		return true;
 	case NL80211_CMD_CONNECT:
 		if (!(rdev->wiphy.features & NL80211_FEATURE_SAE) &&
+		    !wiphy_ext_feature_isset(&rdev->wiphy,
+					     NL80211_EXT_FEATURE_SAE_OFFLOAD) &&
 		    auth_type == NL80211_AUTHTYPE_SAE)
 			return false;
 
@@ -8973,6 +8977,16 @@ static int nl80211_crypto_settings(struct cfg80211_registered_device *rdev,
 		settings->psk = nla_data(info->attrs[NL80211_ATTR_PMK]);
 	}
 
+	if (info->attrs[NL80211_ATTR_SAE_PASSWORD]) {
+		if (!wiphy_ext_feature_isset(&rdev->wiphy,
+					     NL80211_EXT_FEATURE_SAE_OFFLOAD))
+			return -EINVAL;
+		settings->sae_pwd =
+			nla_data(info->attrs[NL80211_ATTR_SAE_PASSWORD]);
+		settings->sae_pwd_len =
+			nla_len(info->attrs[NL80211_ATTR_SAE_PASSWORD]);
+	}
+
 	return 0;
 }
 
-- 
cgit v1.2.3


From cd6f34110285742ec5570f07aa2229e29f4d2092 Mon Sep 17 00:00:00 2001
From: Ilan Peer <ilan.peer@intel.com>
Date: Wed, 29 May 2019 15:25:33 +0300
Subject: ieee80211: Add a missing extended capability flag definition

Add the "OBSS Narrow Bandwidth RU In OFDMA Tolerance Support" flag
definition to the definitions of the flags covered by the Extended
Capability IE.

Signed-off-by: Ilan Peer <ilan.peer@intel.com>
Signed-off-by: Luca Coelho <luciano.coelho@intel.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/linux/ieee80211.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h
index 5dfd949ade25..2dbefeffc43c 100644
--- a/include/linux/ieee80211.h
+++ b/include/linux/ieee80211.h
@@ -2713,6 +2713,13 @@ enum ieee80211_tdls_actioncode {
 #define WLAN_EXT_CAPA10_TWT_REQUESTER_SUPPORT	BIT(5)
 #define WLAN_EXT_CAPA10_TWT_RESPONDER_SUPPORT	BIT(6)
 
+/*
+ * When set, indicates that the AP is able to tolerate 26-tone RU UL
+ * OFDMA transmissions using HE TB PPDU from OBSS (not falsely classify the
+ * 26-tone RU UL OFDMA transmissions as radar pulses).
+ */
+#define WLAN_EXT_CAPA10_OBSS_NARROW_BW_RU_TOLERANCE_SUPPORT BIT(7)
+
 /* Defines support for enhanced multi-bssid advertisement*/
 #define WLAN_EXT_CAPA11_EMA_SUPPORT	BIT(1)
 
-- 
cgit v1.2.3


From 42df744c4166af6959eda2df1ee5cde744d4a1c3 Mon Sep 17 00:00:00 2001
From: Janne Karhunen <janne.karhunen@gmail.com>
Date: Fri, 14 Jun 2019 15:20:14 +0300
Subject: LSM: switch to blocking policy update notifiers

Atomic policy updaters are not very useful as they cannot
usually perform the policy updates on their own. Since it
seems that there is no strict need for the atomicity,
switch to the blocking variant. While doing so, rename
the functions accordingly.

Signed-off-by: Janne Karhunen <janne.karhunen@gmail.com>
Acked-by: Paul Moore <paul@paul-moore.com>
Acked-by: James Morris <jamorris@linux.microsoft.com>
Signed-off-by: Mimi Zohar <zohar@linux.ibm.com>
---
 drivers/infiniband/core/device.c |  6 +++---
 include/linux/security.h         | 12 ++++++------
 security/security.c              | 23 +++++++++++++----------
 security/selinux/hooks.c         |  2 +-
 security/selinux/selinuxfs.c     |  2 +-
 5 files changed, 24 insertions(+), 21 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c
index 78dc07c6ac4b..61c0c93a2e73 100644
--- a/drivers/infiniband/core/device.c
+++ b/drivers/infiniband/core/device.c
@@ -2499,7 +2499,7 @@ static int __init ib_core_init(void)
 		goto err_mad;
 	}
 
-	ret = register_lsm_notifier(&ibdev_lsm_nb);
+	ret = register_blocking_lsm_notifier(&ibdev_lsm_nb);
 	if (ret) {
 		pr_warn("Couldn't register LSM notifier. ret %d\n", ret);
 		goto err_sa;
@@ -2518,7 +2518,7 @@ static int __init ib_core_init(void)
 	return 0;
 
 err_compat:
-	unregister_lsm_notifier(&ibdev_lsm_nb);
+	unregister_blocking_lsm_notifier(&ibdev_lsm_nb);
 err_sa:
 	ib_sa_cleanup();
 err_mad:
@@ -2544,7 +2544,7 @@ static void __exit ib_core_cleanup(void)
 	nldev_exit();
 	rdma_nl_unregister(RDMA_NL_LS);
 	unregister_pernet_device(&rdma_dev_net_ops);
-	unregister_lsm_notifier(&ibdev_lsm_nb);
+	unregister_blocking_lsm_notifier(&ibdev_lsm_nb);
 	ib_sa_cleanup();
 	ib_mad_cleanup();
 	addr_cleanup();
diff --git a/include/linux/security.h b/include/linux/security.h
index 659071c2e57c..5f7441abbf42 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -189,9 +189,9 @@ static inline const char *kernel_load_data_id_str(enum kernel_load_data_id id)
 
 #ifdef CONFIG_SECURITY
 
-int call_lsm_notifier(enum lsm_event event, void *data);
-int register_lsm_notifier(struct notifier_block *nb);
-int unregister_lsm_notifier(struct notifier_block *nb);
+int call_blocking_lsm_notifier(enum lsm_event event, void *data);
+int register_blocking_lsm_notifier(struct notifier_block *nb);
+int unregister_blocking_lsm_notifier(struct notifier_block *nb);
 
 /* prototypes */
 extern int security_init(void);
@@ -394,17 +394,17 @@ int security_inode_setsecctx(struct dentry *dentry, void *ctx, u32 ctxlen);
 int security_inode_getsecctx(struct inode *inode, void **ctx, u32 *ctxlen);
 #else /* CONFIG_SECURITY */
 
-static inline int call_lsm_notifier(enum lsm_event event, void *data)
+static inline int call_blocking_lsm_notifier(enum lsm_event event, void *data)
 {
 	return 0;
 }
 
-static inline int register_lsm_notifier(struct notifier_block *nb)
+static inline int register_blocking_lsm_notifier(struct notifier_block *nb)
 {
 	return 0;
 }
 
-static inline  int unregister_lsm_notifier(struct notifier_block *nb)
+static inline  int unregister_blocking_lsm_notifier(struct notifier_block *nb)
 {
 	return 0;
 }
diff --git a/security/security.c b/security/security.c
index 613a5c00e602..47e5849d7557 100644
--- a/security/security.c
+++ b/security/security.c
@@ -39,7 +39,7 @@
 #define LSM_COUNT (__end_lsm_info - __start_lsm_info)
 
 struct security_hook_heads security_hook_heads __lsm_ro_after_init;
-static ATOMIC_NOTIFIER_HEAD(lsm_notifier_chain);
+static BLOCKING_NOTIFIER_HEAD(blocking_lsm_notifier_chain);
 
 static struct kmem_cache *lsm_file_cache;
 static struct kmem_cache *lsm_inode_cache;
@@ -430,23 +430,26 @@ void __init security_add_hooks(struct security_hook_list *hooks, int count,
 		panic("%s - Cannot get early memory.\n", __func__);
 }
 
-int call_lsm_notifier(enum lsm_event event, void *data)
+int call_blocking_lsm_notifier(enum lsm_event event, void *data)
 {
-	return atomic_notifier_call_chain(&lsm_notifier_chain, event, data);
+	return blocking_notifier_call_chain(&blocking_lsm_notifier_chain,
+					    event, data);
 }
-EXPORT_SYMBOL(call_lsm_notifier);
+EXPORT_SYMBOL(call_blocking_lsm_notifier);
 
-int register_lsm_notifier(struct notifier_block *nb)
+int register_blocking_lsm_notifier(struct notifier_block *nb)
 {
-	return atomic_notifier_chain_register(&lsm_notifier_chain, nb);
+	return blocking_notifier_chain_register(&blocking_lsm_notifier_chain,
+						nb);
 }
-EXPORT_SYMBOL(register_lsm_notifier);
+EXPORT_SYMBOL(register_blocking_lsm_notifier);
 
-int unregister_lsm_notifier(struct notifier_block *nb)
+int unregister_blocking_lsm_notifier(struct notifier_block *nb)
 {
-	return atomic_notifier_chain_unregister(&lsm_notifier_chain, nb);
+	return blocking_notifier_chain_unregister(&blocking_lsm_notifier_chain,
+						  nb);
 }
-EXPORT_SYMBOL(unregister_lsm_notifier);
+EXPORT_SYMBOL(unregister_blocking_lsm_notifier);
 
 /**
  * lsm_cred_alloc - allocate a composite cred blob
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index c61787b15f27..c1e37018c8eb 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -197,7 +197,7 @@ static int selinux_lsm_notifier_avc_callback(u32 event)
 {
 	if (event == AVC_CALLBACK_RESET) {
 		sel_ib_pkey_flush();
-		call_lsm_notifier(LSM_POLICY_CHANGE, NULL);
+		call_blocking_lsm_notifier(LSM_POLICY_CHANGE, NULL);
 	}
 
 	return 0;
diff --git a/security/selinux/selinuxfs.c b/security/selinux/selinuxfs.c
index 145ee62f205a..1e2e3e4b5fdb 100644
--- a/security/selinux/selinuxfs.c
+++ b/security/selinux/selinuxfs.c
@@ -180,7 +180,7 @@ static ssize_t sel_write_enforce(struct file *file, const char __user *buf,
 		selnl_notify_setenforce(new_value);
 		selinux_status_update_setenforce(state, new_value);
 		if (!new_value)
-			call_lsm_notifier(LSM_POLICY_CHANGE, NULL);
+			call_blocking_lsm_notifier(LSM_POLICY_CHANGE, NULL);
 	}
 	length = count;
 out:
-- 
cgit v1.2.3


From 10ffebbed5503b1830c7920ef528075785351be6 Mon Sep 17 00:00:00 2001
From: Mauro Carvalho Chehab <mchehab+samsung@kernel.org>
Date: Wed, 12 Jun 2019 14:52:44 -0300
Subject: docs: fault-injection: convert docs to ReST and rename to *.rst

The conversion is actually:
  - add blank lines and indentation in order to identify paragraphs;
  - fix tables markups;
  - add some lists markups;
  - mark literal blocks;
  - adjust title markups.

At its new index.rst, let's add a :orphan: while this is not linked to
the main index.rst file, in order to avoid build warnings.

Signed-off-by: Mauro Carvalho Chehab <mchehab+samsung@kernel.org>
Acked-by: Federico Vaga <federico.vaga@vaga.pv.it>
Signed-off-by: Jonathan Corbet <corbet@lwn.net>
---
 Documentation/fault-injection/fault-injection.rst  | 446 +++++++++++++++++++++
 Documentation/fault-injection/fault-injection.txt  | 435 --------------------
 Documentation/fault-injection/index.rst            |  20 +
 .../fault-injection/notifier-error-inject.rst      |  98 +++++
 .../fault-injection/notifier-error-inject.txt      |  94 -----
 .../fault-injection/nvme-fault-injection.rst       | 120 ++++++
 .../fault-injection/nvme-fault-injection.txt       | 116 ------
 Documentation/fault-injection/provoke-crashes.rst  |  48 +++
 Documentation/fault-injection/provoke-crashes.txt  |  38 --
 Documentation/process/4.Coding.rst                 |   2 +-
 .../translations/it_IT/process/4.Coding.rst        |   2 +-
 .../translations/zh_CN/process/4.Coding.rst        |   2 +-
 drivers/misc/lkdtm/core.c                          |   2 +-
 include/linux/fault-inject.h                       |   2 +-
 lib/Kconfig.debug                                  |   2 +-
 tools/testing/fault-injection/failcmd.sh           |   2 +-
 16 files changed, 739 insertions(+), 690 deletions(-)
 create mode 100644 Documentation/fault-injection/fault-injection.rst
 delete mode 100644 Documentation/fault-injection/fault-injection.txt
 create mode 100644 Documentation/fault-injection/index.rst
 create mode 100644 Documentation/fault-injection/notifier-error-inject.rst
 delete mode 100644 Documentation/fault-injection/notifier-error-inject.txt
 create mode 100644 Documentation/fault-injection/nvme-fault-injection.rst
 delete mode 100644 Documentation/fault-injection/nvme-fault-injection.txt
 create mode 100644 Documentation/fault-injection/provoke-crashes.rst
 delete mode 100644 Documentation/fault-injection/provoke-crashes.txt

(limited to 'include/linux')

diff --git a/Documentation/fault-injection/fault-injection.rst b/Documentation/fault-injection/fault-injection.rst
new file mode 100644
index 000000000000..f51bb21d20e4
--- /dev/null
+++ b/Documentation/fault-injection/fault-injection.rst
@@ -0,0 +1,446 @@
+===========================================
+Fault injection capabilities infrastructure
+===========================================
+
+See also drivers/md/md-faulty.c and "every_nth" module option for scsi_debug.
+
+
+Available fault injection capabilities
+--------------------------------------
+
+- failslab
+
+  injects slab allocation failures. (kmalloc(), kmem_cache_alloc(), ...)
+
+- fail_page_alloc
+
+  injects page allocation failures. (alloc_pages(), get_free_pages(), ...)
+
+- fail_futex
+
+  injects futex deadlock and uaddr fault errors.
+
+- fail_make_request
+
+  injects disk IO errors on devices permitted by setting
+  /sys/block/<device>/make-it-fail or
+  /sys/block/<device>/<partition>/make-it-fail. (generic_make_request())
+
+- fail_mmc_request
+
+  injects MMC data errors on devices permitted by setting
+  debugfs entries under /sys/kernel/debug/mmc0/fail_mmc_request
+
+- fail_function
+
+  injects error return on specific functions, which are marked by
+  ALLOW_ERROR_INJECTION() macro, by setting debugfs entries
+  under /sys/kernel/debug/fail_function. No boot option supported.
+
+- NVMe fault injection
+
+  inject NVMe status code and retry flag on devices permitted by setting
+  debugfs entries under /sys/kernel/debug/nvme*/fault_inject. The default
+  status code is NVME_SC_INVALID_OPCODE with no retry. The status code and
+  retry flag can be set via the debugfs.
+
+
+Configure fault-injection capabilities behavior
+-----------------------------------------------
+
+debugfs entries
+^^^^^^^^^^^^^^^
+
+fault-inject-debugfs kernel module provides some debugfs entries for runtime
+configuration of fault-injection capabilities.
+
+- /sys/kernel/debug/fail*/probability:
+
+	likelihood of failure injection, in percent.
+
+	Format: <percent>
+
+	Note that one-failure-per-hundred is a very high error rate
+	for some testcases.  Consider setting probability=100 and configure
+	/sys/kernel/debug/fail*/interval for such testcases.
+
+- /sys/kernel/debug/fail*/interval:
+
+	specifies the interval between failures, for calls to
+	should_fail() that pass all the other tests.
+
+	Note that if you enable this, by setting interval>1, you will
+	probably want to set probability=100.
+
+- /sys/kernel/debug/fail*/times:
+
+	specifies how many times failures may happen at most.
+	A value of -1 means "no limit".
+
+- /sys/kernel/debug/fail*/space:
+
+	specifies an initial resource "budget", decremented by "size"
+	on each call to should_fail(,size).  Failure injection is
+	suppressed until "space" reaches zero.
+
+- /sys/kernel/debug/fail*/verbose
+
+	Format: { 0 | 1 | 2 }
+
+	specifies the verbosity of the messages when failure is
+	injected.  '0' means no messages; '1' will print only a single
+	log line per failure; '2' will print a call trace too -- useful
+	to debug the problems revealed by fault injection.
+
+- /sys/kernel/debug/fail*/task-filter:
+
+	Format: { 'Y' | 'N' }
+
+	A value of 'N' disables filtering by process (default).
+	Any positive value limits failures to only processes indicated by
+	/proc/<pid>/make-it-fail==1.
+
+- /sys/kernel/debug/fail*/require-start,
+  /sys/kernel/debug/fail*/require-end,
+  /sys/kernel/debug/fail*/reject-start,
+  /sys/kernel/debug/fail*/reject-end:
+
+	specifies the range of virtual addresses tested during
+	stacktrace walking.  Failure is injected only if some caller
+	in the walked stacktrace lies within the required range, and
+	none lies within the rejected range.
+	Default required range is [0,ULONG_MAX) (whole of virtual address space).
+	Default rejected range is [0,0).
+
+- /sys/kernel/debug/fail*/stacktrace-depth:
+
+	specifies the maximum stacktrace depth walked during search
+	for a caller within [require-start,require-end) OR
+	[reject-start,reject-end).
+
+- /sys/kernel/debug/fail_page_alloc/ignore-gfp-highmem:
+
+	Format: { 'Y' | 'N' }
+
+	default is 'N', setting it to 'Y' won't inject failures into
+	highmem/user allocations.
+
+- /sys/kernel/debug/failslab/ignore-gfp-wait:
+- /sys/kernel/debug/fail_page_alloc/ignore-gfp-wait:
+
+	Format: { 'Y' | 'N' }
+
+	default is 'N', setting it to 'Y' will inject failures
+	only into non-sleep allocations (GFP_ATOMIC allocations).
+
+- /sys/kernel/debug/fail_page_alloc/min-order:
+
+	specifies the minimum page allocation order for which failures
+	are injected.
+
+- /sys/kernel/debug/fail_futex/ignore-private:
+
+	Format: { 'Y' | 'N' }
+
+	default is 'N', setting it to 'Y' will disable failure injections
+	when dealing with private (address space) futexes.
+
+- /sys/kernel/debug/fail_function/inject:
+
+	Format: { 'function-name' | '!function-name' | '' }
+
+	specifies the target function of error injection by name.
+	If the function name has a leading '!' prefix, the given function
+	is removed from the injection list. If nothing is specified (''),
+	the injection list is cleared.
+
+- /sys/kernel/debug/fail_function/injectable:
+
+	(read only) shows error injectable functions and what type of
+	error values can be specified. The error type will be one of
+	the following:
+	- NULL:	retval must be 0.
+	- ERRNO: retval must be -1 to -MAX_ERRNO (-4096).
+	- ERR_NULL: retval must be 0 or -1 to -MAX_ERRNO (-4096).
+
+- /sys/kernel/debug/fail_function/<function-name>/retval:
+
+	specifies the "error" return value to inject into the given
+	function. This entry is created when the user specifies a new
+	injection entry.
+
+Boot option
+^^^^^^^^^^^
+
+In order to inject faults while debugfs is not available (early boot time),
+use the boot option::
+
+	failslab=
+	fail_page_alloc=
+	fail_make_request=
+	fail_futex=
+	mmc_core.fail_request=<interval>,<probability>,<space>,<times>
+
+proc entries
+^^^^^^^^^^^^
+
+- /proc/<pid>/fail-nth,
+  /proc/self/task/<tid>/fail-nth:
+
+	Writing an integer N to this file makes the N-th call in the task fail.
+	Reading from this file returns an integer value. A value of '0' indicates
+	that the fault set up by a previous write to this file was injected.
+	A positive integer N indicates that the fault wasn't yet injected.
+	Note that this file enables all types of faults (slab, futex, etc).
+	This setting takes precedence over all other generic debugfs settings
+	like probability, interval, times, etc. But per-capability settings
+	(e.g. fail_futex/ignore-private) take precedence over it.
+
+	This feature is intended for systematic testing of faults in a single
+	system call. See an example below.
+
+How to add new fault injection capability
+-----------------------------------------
+
+- #include <linux/fault-inject.h>
+
+- define the fault attributes
+
+  DECLARE_FAULT_ATTR(name);
+
+  Please see the definition of struct fault_attr in fault-inject.h
+  for details.
+
+- provide a way to configure fault attributes
+
+- boot option
+
+  If you need to enable the fault injection capability from boot time, you can
+  provide boot option to configure it. There is a helper function for it:
+
+	setup_fault_attr(attr, str);
+
+- debugfs entries
+
+  failslab, fail_page_alloc, and fail_make_request use this way.
+  Helper functions:
+
+	fault_create_debugfs_attr(name, parent, attr);
+
+- module parameters
+
+  If the scope of the fault injection capability is limited to a
+  single kernel module, it is better to provide module parameters to
+  configure the fault attributes.
+
+- add a hook to insert failures
+
+  Upon should_fail() returning true, client code should inject a failure:
+
+	should_fail(attr, size);
+
+Application Examples
+--------------------
+
+- Inject slab allocation failures into module init/exit code::
+
+    #!/bin/bash
+
+    FAILTYPE=failslab
+    echo Y > /sys/kernel/debug/$FAILTYPE/task-filter
+    echo 10 > /sys/kernel/debug/$FAILTYPE/probability
+    echo 100 > /sys/kernel/debug/$FAILTYPE/interval
+    echo -1 > /sys/kernel/debug/$FAILTYPE/times
+    echo 0 > /sys/kernel/debug/$FAILTYPE/space
+    echo 2 > /sys/kernel/debug/$FAILTYPE/verbose
+    echo 1 > /sys/kernel/debug/$FAILTYPE/ignore-gfp-wait
+
+    faulty_system()
+    {
+	bash -c "echo 1 > /proc/self/make-it-fail && exec $*"
+    }
+
+    if [ $# -eq 0 ]
+    then
+	echo "Usage: $0 modulename [ modulename ... ]"
+	exit 1
+    fi
+
+    for m in $*
+    do
+	echo inserting $m...
+	faulty_system modprobe $m
+
+	echo removing $m...
+	faulty_system modprobe -r $m
+    done
+
+------------------------------------------------------------------------------
+
+- Inject page allocation failures only for a specific module::
+
+    #!/bin/bash
+
+    FAILTYPE=fail_page_alloc
+    module=$1
+
+    if [ -z $module ]
+    then
+	echo "Usage: $0 <modulename>"
+	exit 1
+    fi
+
+    modprobe $module
+
+    if [ ! -d /sys/module/$module/sections ]
+    then
+	echo Module $module is not loaded
+	exit 1
+    fi
+
+    cat /sys/module/$module/sections/.text > /sys/kernel/debug/$FAILTYPE/require-start
+    cat /sys/module/$module/sections/.data > /sys/kernel/debug/$FAILTYPE/require-end
+
+    echo N > /sys/kernel/debug/$FAILTYPE/task-filter
+    echo 10 > /sys/kernel/debug/$FAILTYPE/probability
+    echo 100 > /sys/kernel/debug/$FAILTYPE/interval
+    echo -1 > /sys/kernel/debug/$FAILTYPE/times
+    echo 0 > /sys/kernel/debug/$FAILTYPE/space
+    echo 2 > /sys/kernel/debug/$FAILTYPE/verbose
+    echo 1 > /sys/kernel/debug/$FAILTYPE/ignore-gfp-wait
+    echo 1 > /sys/kernel/debug/$FAILTYPE/ignore-gfp-highmem
+    echo 10 > /sys/kernel/debug/$FAILTYPE/stacktrace-depth
+
+    trap "echo 0 > /sys/kernel/debug/$FAILTYPE/probability" SIGINT SIGTERM EXIT
+
+    echo "Injecting errors into the module $module... (interrupt to stop)"
+    sleep 1000000
+
+------------------------------------------------------------------------------
+
+- Inject open_ctree error while btrfs mount::
+
+    #!/bin/bash
+
+    rm -f testfile.img
+    dd if=/dev/zero of=testfile.img bs=1M seek=1000 count=1
+    DEVICE=$(losetup --show -f testfile.img)
+    mkfs.btrfs -f $DEVICE
+    mkdir -p tmpmnt
+
+    FAILTYPE=fail_function
+    FAILFUNC=open_ctree
+    echo $FAILFUNC > /sys/kernel/debug/$FAILTYPE/inject
+    echo -12 > /sys/kernel/debug/$FAILTYPE/$FAILFUNC/retval
+    echo N > /sys/kernel/debug/$FAILTYPE/task-filter
+    echo 100 > /sys/kernel/debug/$FAILTYPE/probability
+    echo 0 > /sys/kernel/debug/$FAILTYPE/interval
+    echo -1 > /sys/kernel/debug/$FAILTYPE/times
+    echo 0 > /sys/kernel/debug/$FAILTYPE/space
+    echo 1 > /sys/kernel/debug/$FAILTYPE/verbose
+
+    mount -t btrfs $DEVICE tmpmnt
+    if [ $? -ne 0 ]
+    then
+	echo "SUCCESS!"
+    else
+	echo "FAILED!"
+	umount tmpmnt
+    fi
+
+    echo > /sys/kernel/debug/$FAILTYPE/inject
+
+    rmdir tmpmnt
+    losetup -d $DEVICE
+    rm testfile.img
+
+
+Tool to run command with failslab or fail_page_alloc
+----------------------------------------------------
+In order to make it easier to accomplish the tasks mentioned above, we can use
+tools/testing/fault-injection/failcmd.sh.  Please run a command
+"./tools/testing/fault-injection/failcmd.sh --help" for more information and
+see the following examples.
+
+Examples:
+
+Run a command "make -C tools/testing/selftests/ run_tests" with injecting slab
+allocation failure::
+
+	# ./tools/testing/fault-injection/failcmd.sh \
+		-- make -C tools/testing/selftests/ run_tests
+
+Same as above, except that at most 100 failures are injected instead of the
+default of at most one::
+
+	# ./tools/testing/fault-injection/failcmd.sh --times=100 \
+		-- make -C tools/testing/selftests/ run_tests
+
+Same as above except to inject page allocation failure instead of slab
+allocation failure::
+
+	# env FAILCMD_TYPE=fail_page_alloc \
+		./tools/testing/fault-injection/failcmd.sh --times=100 \
+		-- make -C tools/testing/selftests/ run_tests
+
+Systematic faults using fail-nth
+---------------------------------
+
+The following code systematically faults 0-th, 1-st, 2-nd and so on
+capabilities in the socketpair() system call::
+
+  #include <sys/types.h>
+  #include <sys/stat.h>
+  #include <sys/socket.h>
+  #include <sys/syscall.h>
+  #include <fcntl.h>
+  #include <unistd.h>
+  #include <string.h>
+  #include <stdlib.h>
+  #include <stdio.h>
+  #include <errno.h>
+
+  int main()
+  {
+	int i, err, res, fail_nth, fds[2];
+	char buf[128];
+
+	system("echo N > /sys/kernel/debug/failslab/ignore-gfp-wait");
+	sprintf(buf, "/proc/self/task/%ld/fail-nth", syscall(SYS_gettid));
+	fail_nth = open(buf, O_RDWR);
+	for (i = 1;; i++) {
+		sprintf(buf, "%d", i);
+		write(fail_nth, buf, strlen(buf));
+		res = socketpair(AF_LOCAL, SOCK_STREAM, 0, fds);
+		err = errno;
+		pread(fail_nth, buf, sizeof(buf), 0);
+		if (res == 0) {
+			close(fds[0]);
+			close(fds[1]);
+		}
+		printf("%d-th fault %c: res=%d/%d\n", i, atoi(buf) ? 'N' : 'Y',
+			res, err);
+		if (atoi(buf))
+			break;
+	}
+	return 0;
+  }
+
+An example output::
+
+	1-th fault Y: res=-1/23
+	2-th fault Y: res=-1/23
+	3-th fault Y: res=-1/12
+	4-th fault Y: res=-1/12
+	5-th fault Y: res=-1/23
+	6-th fault Y: res=-1/23
+	7-th fault Y: res=-1/23
+	8-th fault Y: res=-1/12
+	9-th fault Y: res=-1/12
+	10-th fault Y: res=-1/12
+	11-th fault Y: res=-1/12
+	12-th fault Y: res=-1/12
+	13-th fault Y: res=-1/12
+	14-th fault Y: res=-1/12
+	15-th fault Y: res=-1/12
+	16-th fault N: res=0/12
diff --git a/Documentation/fault-injection/fault-injection.txt b/Documentation/fault-injection/fault-injection.txt
deleted file mode 100644
index a17517a083c3..000000000000
--- a/Documentation/fault-injection/fault-injection.txt
+++ /dev/null
@@ -1,435 +0,0 @@
-Fault injection capabilities infrastructure
-===========================================
-
-See also drivers/md/md-faulty.c and "every_nth" module option for scsi_debug.
-
-
-Available fault injection capabilities
---------------------------------------
-
-o failslab
-
-  injects slab allocation failures. (kmalloc(), kmem_cache_alloc(), ...)
-
-o fail_page_alloc
-
-  injects page allocation failures. (alloc_pages(), get_free_pages(), ...)
-
-o fail_futex
-
-  injects futex deadlock and uaddr fault errors.
-
-o fail_make_request
-
-  injects disk IO errors on devices permitted by setting
-  /sys/block/<device>/make-it-fail or
-  /sys/block/<device>/<partition>/make-it-fail. (generic_make_request())
-
-o fail_mmc_request
-
-  injects MMC data errors on devices permitted by setting
-  debugfs entries under /sys/kernel/debug/mmc0/fail_mmc_request
-
-o fail_function
-
-  injects error return on specific functions, which are marked by
-  ALLOW_ERROR_INJECTION() macro, by setting debugfs entries
-  under /sys/kernel/debug/fail_function. No boot option supported.
-
-o NVMe fault injection
-
-  inject NVMe status code and retry flag on devices permitted by setting
-  debugfs entries under /sys/kernel/debug/nvme*/fault_inject. The default
-  status code is NVME_SC_INVALID_OPCODE with no retry. The status code and
-  retry flag can be set via the debugfs.
-
-
-Configure fault-injection capabilities behavior
------------------------------------------------
-
-o debugfs entries
-
-fault-inject-debugfs kernel module provides some debugfs entries for runtime
-configuration of fault-injection capabilities.
-
-- /sys/kernel/debug/fail*/probability:
-
-	likelihood of failure injection, in percent.
-	Format: <percent>
-
-	Note that one-failure-per-hundred is a very high error rate
-	for some testcases.  Consider setting probability=100 and configure
-	/sys/kernel/debug/fail*/interval for such testcases.
-
-- /sys/kernel/debug/fail*/interval:
-
-	specifies the interval between failures, for calls to
-	should_fail() that pass all the other tests.
-
-	Note that if you enable this, by setting interval>1, you will
-	probably want to set probability=100.
-
-- /sys/kernel/debug/fail*/times:
-
-	specifies how many times failures may happen at most.
-	A value of -1 means "no limit".
-
-- /sys/kernel/debug/fail*/space:
-
-	specifies an initial resource "budget", decremented by "size"
-	on each call to should_fail(,size).  Failure injection is
-	suppressed until "space" reaches zero.
-
-- /sys/kernel/debug/fail*/verbose
-
-	Format: { 0 | 1 | 2 }
-	specifies the verbosity of the messages when failure is
-	injected.  '0' means no messages; '1' will print only a single
-	log line per failure; '2' will print a call trace too -- useful
-	to debug the problems revealed by fault injection.
-
-- /sys/kernel/debug/fail*/task-filter:
-
-	Format: { 'Y' | 'N' }
-	A value of 'N' disables filtering by process (default).
-	Any positive value limits failures to only processes indicated by
-	/proc/<pid>/make-it-fail==1.
-
-- /sys/kernel/debug/fail*/require-start:
-- /sys/kernel/debug/fail*/require-end:
-- /sys/kernel/debug/fail*/reject-start:
-- /sys/kernel/debug/fail*/reject-end:
-
-	specifies the range of virtual addresses tested during
-	stacktrace walking.  Failure is injected only if some caller
-	in the walked stacktrace lies within the required range, and
-	none lies within the rejected range.
-	Default required range is [0,ULONG_MAX) (whole of virtual address space).
-	Default rejected range is [0,0).
-
-- /sys/kernel/debug/fail*/stacktrace-depth:
-
-	specifies the maximum stacktrace depth walked during search
-	for a caller within [require-start,require-end) OR
-	[reject-start,reject-end).
-
-- /sys/kernel/debug/fail_page_alloc/ignore-gfp-highmem:
-
-	Format: { 'Y' | 'N' }
-	default is 'N', setting it to 'Y' won't inject failures into
-	highmem/user allocations.
-
-- /sys/kernel/debug/failslab/ignore-gfp-wait:
-- /sys/kernel/debug/fail_page_alloc/ignore-gfp-wait:
-
-	Format: { 'Y' | 'N' }
-	default is 'N', setting it to 'Y' will inject failures
-	only into non-sleep allocations (GFP_ATOMIC allocations).
-
-- /sys/kernel/debug/fail_page_alloc/min-order:
-
-	specifies the minimum page allocation order to be injected
-	failures.
-
-- /sys/kernel/debug/fail_futex/ignore-private:
-
-	Format: { 'Y' | 'N' }
-	default is 'N', setting it to 'Y' will disable failure injections
-	when dealing with private (address space) futexes.
-
-- /sys/kernel/debug/fail_function/inject:
-
-	Format: { 'function-name' | '!function-name' | '' }
-	specifies the target function of error injection by name.
-	If the function name leads '!' prefix, given function is
-	removed from injection list. If nothing specified ('')
-	injection list is cleared.
-
-- /sys/kernel/debug/fail_function/injectable:
-
-	(read only) shows error injectable functions and what type of
-	error values can be specified. The error type will be one of
-	below;
-	- NULL:	retval must be 0.
-	- ERRNO: retval must be -1 to -MAX_ERRNO (-4096).
-	- ERR_NULL: retval must be 0 or -1 to -MAX_ERRNO (-4096).
-
-- /sys/kernel/debug/fail_function/<functiuon-name>/retval:
-
-	specifies the "error" return value to inject to the given
-	function for given function. This will be created when
-	user specifies new injection entry.
-
-o Boot option
-
-In order to inject faults while debugfs is not available (early boot time),
-use the boot option:
-
-	failslab=
-	fail_page_alloc=
-	fail_make_request=
-	fail_futex=
-	mmc_core.fail_request=<interval>,<probability>,<space>,<times>
-
-o proc entries
-
-- /proc/<pid>/fail-nth:
-- /proc/self/task/<tid>/fail-nth:
-
-	Write to this file of integer N makes N-th call in the task fail.
-	Read from this file returns a integer value. A value of '0' indicates
-	that the fault setup with a previous write to this file was injected.
-	A positive integer N indicates that the fault wasn't yet injected.
-	Note that this file enables all types of faults (slab, futex, etc).
-	This setting takes precedence over all other generic debugfs settings
-	like probability, interval, times, etc. But per-capability settings
-	(e.g. fail_futex/ignore-private) take precedence over it.
-
-	This feature is intended for systematic testing of faults in a single
-	system call. See an example below.
-
-How to add new fault injection capability
------------------------------------------
-
-o #include <linux/fault-inject.h>
-
-o define the fault attributes
-
-  DECLARE_FAULT_ATTR(name);
-
-  Please see the definition of struct fault_attr in fault-inject.h
-  for details.
-
-o provide a way to configure fault attributes
-
-- boot option
-
-  If you need to enable the fault injection capability from boot time, you can
-  provide boot option to configure it. There is a helper function for it:
-
-	setup_fault_attr(attr, str);
-
-- debugfs entries
-
-  failslab, fail_page_alloc, and fail_make_request use this way.
-  Helper functions:
-
-	fault_create_debugfs_attr(name, parent, attr);
-
-- module parameters
-
-  If the scope of the fault injection capability is limited to a
-  single kernel module, it is better to provide module parameters to
-  configure the fault attributes.
-
-o add a hook to insert failures
-
-  Upon should_fail() returning true, client code should inject a failure.
-
-	should_fail(attr, size);
-
-Application Examples
---------------------
-
-o Inject slab allocation failures into module init/exit code
-
-#!/bin/bash
-
-FAILTYPE=failslab
-echo Y > /sys/kernel/debug/$FAILTYPE/task-filter
-echo 10 > /sys/kernel/debug/$FAILTYPE/probability
-echo 100 > /sys/kernel/debug/$FAILTYPE/interval
-echo -1 > /sys/kernel/debug/$FAILTYPE/times
-echo 0 > /sys/kernel/debug/$FAILTYPE/space
-echo 2 > /sys/kernel/debug/$FAILTYPE/verbose
-echo 1 > /sys/kernel/debug/$FAILTYPE/ignore-gfp-wait
-
-faulty_system()
-{
-	bash -c "echo 1 > /proc/self/make-it-fail && exec $*"
-}
-
-if [ $# -eq 0 ]
-then
-	echo "Usage: $0 modulename [ modulename ... ]"
-	exit 1
-fi
-
-for m in $*
-do
-	echo inserting $m...
-	faulty_system modprobe $m
-
-	echo removing $m...
-	faulty_system modprobe -r $m
-done
-
-------------------------------------------------------------------------------
-
-o Inject page allocation failures only for a specific module
-
-#!/bin/bash
-
-FAILTYPE=fail_page_alloc
-module=$1
-
-if [ -z $module ]
-then
-	echo "Usage: $0 <modulename>"
-	exit 1
-fi
-
-modprobe $module
-
-if [ ! -d /sys/module/$module/sections ]
-then
-	echo Module $module is not loaded
-	exit 1
-fi
-
-cat /sys/module/$module/sections/.text > /sys/kernel/debug/$FAILTYPE/require-start
-cat /sys/module/$module/sections/.data > /sys/kernel/debug/$FAILTYPE/require-end
-
-echo N > /sys/kernel/debug/$FAILTYPE/task-filter
-echo 10 > /sys/kernel/debug/$FAILTYPE/probability
-echo 100 > /sys/kernel/debug/$FAILTYPE/interval
-echo -1 > /sys/kernel/debug/$FAILTYPE/times
-echo 0 > /sys/kernel/debug/$FAILTYPE/space
-echo 2 > /sys/kernel/debug/$FAILTYPE/verbose
-echo 1 > /sys/kernel/debug/$FAILTYPE/ignore-gfp-wait
-echo 1 > /sys/kernel/debug/$FAILTYPE/ignore-gfp-highmem
-echo 10 > /sys/kernel/debug/$FAILTYPE/stacktrace-depth
-
-trap "echo 0 > /sys/kernel/debug/$FAILTYPE/probability" SIGINT SIGTERM EXIT
-
-echo "Injecting errors into the module $module... (interrupt to stop)"
-sleep 1000000
-
-------------------------------------------------------------------------------
-
-o Inject open_ctree error while btrfs mount
-
-#!/bin/bash
-
-rm -f testfile.img
-dd if=/dev/zero of=testfile.img bs=1M seek=1000 count=1
-DEVICE=$(losetup --show -f testfile.img)
-mkfs.btrfs -f $DEVICE
-mkdir -p tmpmnt
-
-FAILTYPE=fail_function
-FAILFUNC=open_ctree
-echo $FAILFUNC > /sys/kernel/debug/$FAILTYPE/inject
-echo -12 > /sys/kernel/debug/$FAILTYPE/$FAILFUNC/retval
-echo N > /sys/kernel/debug/$FAILTYPE/task-filter
-echo 100 > /sys/kernel/debug/$FAILTYPE/probability
-echo 0 > /sys/kernel/debug/$FAILTYPE/interval
-echo -1 > /sys/kernel/debug/$FAILTYPE/times
-echo 0 > /sys/kernel/debug/$FAILTYPE/space
-echo 1 > /sys/kernel/debug/$FAILTYPE/verbose
-
-mount -t btrfs $DEVICE tmpmnt
-if [ $? -ne 0 ]
-then
-	echo "SUCCESS!"
-else
-	echo "FAILED!"
-	umount tmpmnt
-fi
-
-echo > /sys/kernel/debug/$FAILTYPE/inject
-
-rmdir tmpmnt
-losetup -d $DEVICE
-rm testfile.img
-
-
-Tool to run command with failslab or fail_page_alloc
-----------------------------------------------------
-In order to make it easier to accomplish the tasks mentioned above, we can use
-tools/testing/fault-injection/failcmd.sh.  Please run a command
-"./tools/testing/fault-injection/failcmd.sh --help" for more information and
-see the following examples.
-
-Examples:
-
-Run a command "make -C tools/testing/selftests/ run_tests" with injecting slab
-allocation failure.
-
-	# ./tools/testing/fault-injection/failcmd.sh \
-		-- make -C tools/testing/selftests/ run_tests
-
-Same as above except to specify 100 times failures at most instead of one time
-at most by default.
-
-	# ./tools/testing/fault-injection/failcmd.sh --times=100 \
-		-- make -C tools/testing/selftests/ run_tests
-
-Same as above except to inject page allocation failure instead of slab
-allocation failure.
-
-	# env FAILCMD_TYPE=fail_page_alloc \
-		./tools/testing/fault-injection/failcmd.sh --times=100 \
-                -- make -C tools/testing/selftests/ run_tests
-
-Systematic faults using fail-nth
----------------------------------
-
-The following code systematically faults 0-th, 1-st, 2-nd and so on
-capabilities in the socketpair() system call.
-
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <sys/socket.h>
-#include <sys/syscall.h>
-#include <fcntl.h>
-#include <unistd.h>
-#include <string.h>
-#include <stdlib.h>
-#include <stdio.h>
-#include <errno.h>
-
-int main()
-{
-	int i, err, res, fail_nth, fds[2];
-	char buf[128];
-
-	system("echo N > /sys/kernel/debug/failslab/ignore-gfp-wait");
-	sprintf(buf, "/proc/self/task/%ld/fail-nth", syscall(SYS_gettid));
-	fail_nth = open(buf, O_RDWR);
-	for (i = 1;; i++) {
-		sprintf(buf, "%d", i);
-		write(fail_nth, buf, strlen(buf));
-		res = socketpair(AF_LOCAL, SOCK_STREAM, 0, fds);
-		err = errno;
-		pread(fail_nth, buf, sizeof(buf), 0);
-		if (res == 0) {
-			close(fds[0]);
-			close(fds[1]);
-		}
-		printf("%d-th fault %c: res=%d/%d\n", i, atoi(buf) ? 'N' : 'Y',
-			res, err);
-		if (atoi(buf))
-			break;
-	}
-	return 0;
-}
-
-An example output:
-
-1-th fault Y: res=-1/23
-2-th fault Y: res=-1/23
-3-th fault Y: res=-1/12
-4-th fault Y: res=-1/12
-5-th fault Y: res=-1/23
-6-th fault Y: res=-1/23
-7-th fault Y: res=-1/23
-8-th fault Y: res=-1/12
-9-th fault Y: res=-1/12
-10-th fault Y: res=-1/12
-11-th fault Y: res=-1/12
-12-th fault Y: res=-1/12
-13-th fault Y: res=-1/12
-14-th fault Y: res=-1/12
-15-th fault Y: res=-1/12
-16-th fault N: res=0/12
diff --git a/Documentation/fault-injection/index.rst b/Documentation/fault-injection/index.rst
new file mode 100644
index 000000000000..92b5639ed07a
--- /dev/null
+++ b/Documentation/fault-injection/index.rst
@@ -0,0 +1,20 @@
+:orphan:
+
+===============
+fault-injection
+===============
+
+.. toctree::
+    :maxdepth: 1
+
+    fault-injection
+    notifier-error-inject
+    nvme-fault-injection
+    provoke-crashes
+
+.. only::  subproject and html
+
+   Indices
+   =======
+
+   * :ref:`genindex`
diff --git a/Documentation/fault-injection/notifier-error-inject.rst b/Documentation/fault-injection/notifier-error-inject.rst
new file mode 100644
index 000000000000..1668b6e48d3a
--- /dev/null
+++ b/Documentation/fault-injection/notifier-error-inject.rst
@@ -0,0 +1,98 @@
+Notifier error injection
+========================
+
+Notifier error injection provides the ability to inject artificial errors to
+specified notifier chain callbacks. It is useful to test the error handling of
+notifier call chain failures which is rarely executed.  There are kernel
+modules that can be used to test the following notifiers.
+
+ * PM notifier
+ * Memory hotplug notifier
+ * powerpc pSeries reconfig notifier
+ * Netdevice notifier
+
+PM notifier error injection module
+----------------------------------
+This feature is controlled through the debugfs interface:
+
+  /sys/kernel/debug/notifier-error-inject/pm/actions/<notifier event>/error
+
+Possible PM notifier events to be failed are:
+
+ * PM_HIBERNATION_PREPARE
+ * PM_SUSPEND_PREPARE
+ * PM_RESTORE_PREPARE
+
+Example: Inject PM suspend error (-12 == -ENOMEM)::
+
+	# cd /sys/kernel/debug/notifier-error-inject/pm/
+	# echo -12 > actions/PM_SUSPEND_PREPARE/error
+	# echo mem > /sys/power/state
+	bash: echo: write error: Cannot allocate memory
+
+Memory hotplug notifier error injection module
+----------------------------------------------
+This feature is controlled through the debugfs interface:
+
+  /sys/kernel/debug/notifier-error-inject/memory/actions/<notifier event>/error
+
+Possible memory notifier events to be failed are:
+
+ * MEM_GOING_ONLINE
+ * MEM_GOING_OFFLINE
+
+Example: Inject memory hotplug offline error (-12 == -ENOMEM)::
+
+	# cd /sys/kernel/debug/notifier-error-inject/memory
+	# echo -12 > actions/MEM_GOING_OFFLINE/error
+	# echo offline > /sys/devices/system/memory/memoryXXX/state
+	bash: echo: write error: Cannot allocate memory
+
+powerpc pSeries reconfig notifier error injection module
+--------------------------------------------------------
+This feature is controlled through the debugfs interface:
+
+  /sys/kernel/debug/notifier-error-inject/pSeries-reconfig/actions/<notifier event>/error
+
+Possible pSeries reconfig notifier events to be failed are:
+
+ * PSERIES_RECONFIG_ADD
+ * PSERIES_RECONFIG_REMOVE
+ * PSERIES_DRCONF_MEM_ADD
+ * PSERIES_DRCONF_MEM_REMOVE
+
+Netdevice notifier error injection module
+----------------------------------------------
+This feature is controlled through the debugfs interface:
+
+  /sys/kernel/debug/notifier-error-inject/netdev/actions/<notifier event>/error
+
+Netdevice notifier events which can be failed are:
+
+ * NETDEV_REGISTER
+ * NETDEV_CHANGEMTU
+ * NETDEV_CHANGENAME
+ * NETDEV_PRE_UP
+ * NETDEV_PRE_TYPE_CHANGE
+ * NETDEV_POST_INIT
+ * NETDEV_PRECHANGEMTU
+ * NETDEV_PRECHANGEUPPER
+ * NETDEV_CHANGEUPPER
+
+Example: Inject netdevice mtu change error (-22 == -EINVAL)::
+
+	# cd /sys/kernel/debug/notifier-error-inject/netdev
+	# echo -22 > actions/NETDEV_CHANGEMTU/error
+	# ip link set eth0 mtu 1024
+	RTNETLINK answers: Invalid argument
+
+For more usage examples
+-----------------------
+There are tools/testing/selftests using the notifier error injection features
+for CPU and memory notifiers.
+
+ * tools/testing/selftests/cpu-hotplug/on-off-test.sh
+ * tools/testing/selftests/memory-hotplug/on-off-test.sh
+
+These scripts first do simple online and offline tests and then do fault
+injection tests if notifier error injection module is available.
diff --git a/Documentation/fault-injection/notifier-error-inject.txt b/Documentation/fault-injection/notifier-error-inject.txt
deleted file mode 100644
index e861d761de24..000000000000
--- a/Documentation/fault-injection/notifier-error-inject.txt
+++ /dev/null
@@ -1,94 +0,0 @@
-Notifier error injection
-========================
-
-Notifier error injection provides the ability to inject artificial errors to
-specified notifier chain callbacks. It is useful to test the error handling of
-notifier call chain failures which is rarely executed.  There are kernel
-modules that can be used to test the following notifiers.
-
- * PM notifier
- * Memory hotplug notifier
- * powerpc pSeries reconfig notifier
- * Netdevice notifier
-
-PM notifier error injection module
-----------------------------------
-This feature is controlled through debugfs interface
-/sys/kernel/debug/notifier-error-inject/pm/actions/<notifier event>/error
-
-Possible PM notifier events to be failed are:
-
- * PM_HIBERNATION_PREPARE
- * PM_SUSPEND_PREPARE
- * PM_RESTORE_PREPARE
-
-Example: Inject PM suspend error (-12 = -ENOMEM)
-
-	# cd /sys/kernel/debug/notifier-error-inject/pm/
-	# echo -12 > actions/PM_SUSPEND_PREPARE/error
-	# echo mem > /sys/power/state
-	bash: echo: write error: Cannot allocate memory
-
-Memory hotplug notifier error injection module
-----------------------------------------------
-This feature is controlled through debugfs interface
-/sys/kernel/debug/notifier-error-inject/memory/actions/<notifier event>/error
-
-Possible memory notifier events to be failed are:
-
- * MEM_GOING_ONLINE
- * MEM_GOING_OFFLINE
-
-Example: Inject memory hotplug offline error (-12 == -ENOMEM)
-
-	# cd /sys/kernel/debug/notifier-error-inject/memory
-	# echo -12 > actions/MEM_GOING_OFFLINE/error
-	# echo offline > /sys/devices/system/memory/memoryXXX/state
-	bash: echo: write error: Cannot allocate memory
-
-powerpc pSeries reconfig notifier error injection module
---------------------------------------------------------
-This feature is controlled through debugfs interface
-/sys/kernel/debug/notifier-error-inject/pSeries-reconfig/actions/<notifier event>/error
-
-Possible pSeries reconfig notifier events to be failed are:
-
- * PSERIES_RECONFIG_ADD
- * PSERIES_RECONFIG_REMOVE
- * PSERIES_DRCONF_MEM_ADD
- * PSERIES_DRCONF_MEM_REMOVE
-
-Netdevice notifier error injection module
-----------------------------------------------
-This feature is controlled through debugfs interface
-/sys/kernel/debug/notifier-error-inject/netdev/actions/<notifier event>/error
-
-Netdevice notifier events which can be failed are:
-
- * NETDEV_REGISTER
- * NETDEV_CHANGEMTU
- * NETDEV_CHANGENAME
- * NETDEV_PRE_UP
- * NETDEV_PRE_TYPE_CHANGE
- * NETDEV_POST_INIT
- * NETDEV_PRECHANGEMTU
- * NETDEV_PRECHANGEUPPER
- * NETDEV_CHANGEUPPER
-
-Example: Inject netdevice mtu change error (-22 == -EINVAL)
-
-	# cd /sys/kernel/debug/notifier-error-inject/netdev
-	# echo -22 > actions/NETDEV_CHANGEMTU/error
-	# ip link set eth0 mtu 1024
-	RTNETLINK answers: Invalid argument
-
-For more usage examples
------------------------
-There are tools/testing/selftests using the notifier error injection features
-for CPU and memory notifiers.
-
- * tools/testing/selftests/cpu-hotplug/on-off-test.sh
- * tools/testing/selftests/memory-hotplug/on-off-test.sh
-
-These scripts first do simple online and offline tests and then do fault
-injection tests if notifier error injection module is available.
diff --git a/Documentation/fault-injection/nvme-fault-injection.rst b/Documentation/fault-injection/nvme-fault-injection.rst
new file mode 100644
index 000000000000..bbb1bf3e8650
--- /dev/null
+++ b/Documentation/fault-injection/nvme-fault-injection.rst
@@ -0,0 +1,120 @@
+NVMe Fault Injection
+====================
+Linux's fault injection framework provides a systematic way to support
+error injection via debugfs in the /sys/kernel/debug directory. When
+enabled, the default NVME_SC_INVALID_OPCODE with no retry will be
+injected into the nvme_end_request. Users can change the default status
+code and no retry flag via the debugfs. The list of Generic Command
+Status can be found in include/linux/nvme.h
+
+The following examples show how to inject an error into an NVMe device.
+
+First, enable CONFIG_FAULT_INJECTION_DEBUG_FS kernel config,
+recompile the kernel. After booting up the kernel, do the
+following.
+
+Example 1: Inject default status code with no retry
+---------------------------------------------------
+
+::
+
+  mount /dev/nvme0n1 /mnt
+  echo 1 > /sys/kernel/debug/nvme0n1/fault_inject/times
+  echo 100 > /sys/kernel/debug/nvme0n1/fault_inject/probability
+  cp a.file /mnt
+
+Expected Result::
+
+  cp: cannot stat ‘/mnt/a.file’: Input/output error
+
+Message from dmesg::
+
+  FAULT_INJECTION: forcing a failure.
+  name fault_inject, interval 1, probability 100, space 0, times 1
+  CPU: 0 PID: 0 Comm: swapper/0 Not tainted 4.15.0-rc8+ #2
+  Hardware name: innotek GmbH VirtualBox/VirtualBox,
+  BIOS VirtualBox 12/01/2006
+  Call Trace:
+    <IRQ>
+    dump_stack+0x5c/0x7d
+    should_fail+0x148/0x170
+    nvme_should_fail+0x2f/0x50 [nvme_core]
+    nvme_process_cq+0xe7/0x1d0 [nvme]
+    nvme_irq+0x1e/0x40 [nvme]
+    __handle_irq_event_percpu+0x3a/0x190
+    handle_irq_event_percpu+0x30/0x70
+    handle_irq_event+0x36/0x60
+    handle_fasteoi_irq+0x78/0x120
+    handle_irq+0xa7/0x130
+    ? tick_irq_enter+0xa8/0xc0
+    do_IRQ+0x43/0xc0
+    common_interrupt+0xa2/0xa2
+    </IRQ>
+  RIP: 0010:native_safe_halt+0x2/0x10
+  RSP: 0018:ffffffff82003e90 EFLAGS: 00000246 ORIG_RAX: ffffffffffffffdd
+  RAX: ffffffff817a10c0 RBX: ffffffff82012480 RCX: 0000000000000000
+  RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000000
+  RBP: 0000000000000000 R08: 000000008e38ce64 R09: 0000000000000000
+  R10: 0000000000000000 R11: 0000000000000000 R12: ffffffff82012480
+  R13: ffffffff82012480 R14: 0000000000000000 R15: 0000000000000000
+    ? __sched_text_end+0x4/0x4
+    default_idle+0x18/0xf0
+    do_idle+0x150/0x1d0
+    cpu_startup_entry+0x6f/0x80
+    start_kernel+0x4c4/0x4e4
+    ? set_init_arg+0x55/0x55
+    secondary_startup_64+0xa5/0xb0
+    print_req_error: I/O error, dev nvme0n1, sector 9240
+  EXT4-fs error (device nvme0n1): ext4_find_entry:1436:
+  inode #2: comm cp: reading directory lblock 0
+
+Example 2: Inject default status code with retry
+------------------------------------------------
+
+::
+
+  mount /dev/nvme0n1 /mnt
+  echo 1 > /sys/kernel/debug/nvme0n1/fault_inject/times
+  echo 100 > /sys/kernel/debug/nvme0n1/fault_inject/probability
+  echo 1 > /sys/kernel/debug/nvme0n1/fault_inject/status
+  echo 0 > /sys/kernel/debug/nvme0n1/fault_inject/dont_retry
+
+  cp a.file /mnt
+
+Expected Result::
+
+  command success without error
+
+Message from dmesg::
+
+  FAULT_INJECTION: forcing a failure.
+  name fault_inject, interval 1, probability 100, space 0, times 1
+  CPU: 1 PID: 0 Comm: swapper/1 Not tainted 4.15.0-rc8+ #4
+  Hardware name: innotek GmbH VirtualBox/VirtualBox, BIOS VirtualBox 12/01/2006
+  Call Trace:
+    <IRQ>
+    dump_stack+0x5c/0x7d
+    should_fail+0x148/0x170
+    nvme_should_fail+0x30/0x60 [nvme_core]
+    nvme_loop_queue_response+0x84/0x110 [nvme_loop]
+    nvmet_req_complete+0x11/0x40 [nvmet]
+    nvmet_bio_done+0x28/0x40 [nvmet]
+    blk_update_request+0xb0/0x310
+    blk_mq_end_request+0x18/0x60
+    flush_smp_call_function_queue+0x3d/0xf0
+    smp_call_function_single_interrupt+0x2c/0xc0
+    call_function_single_interrupt+0xa2/0xb0
+    </IRQ>
+  RIP: 0010:native_safe_halt+0x2/0x10
+  RSP: 0018:ffffc9000068bec0 EFLAGS: 00000246 ORIG_RAX: ffffffffffffff04
+  RAX: ffffffff817a10c0 RBX: ffff88011a3c9680 RCX: 0000000000000000
+  RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000000
+  RBP: 0000000000000001 R08: 000000008e38c131 R09: 0000000000000000
+  R10: 0000000000000000 R11: 0000000000000000 R12: ffff88011a3c9680
+  R13: ffff88011a3c9680 R14: 0000000000000000 R15: 0000000000000000
+    ? __sched_text_end+0x4/0x4
+    default_idle+0x18/0xf0
+    do_idle+0x150/0x1d0
+    cpu_startup_entry+0x6f/0x80
+    start_secondary+0x187/0x1e0
+    secondary_startup_64+0xa5/0xb0
diff --git a/Documentation/fault-injection/nvme-fault-injection.txt b/Documentation/fault-injection/nvme-fault-injection.txt
deleted file mode 100644
index 8fbf3bf60b62..000000000000
--- a/Documentation/fault-injection/nvme-fault-injection.txt
+++ /dev/null
@@ -1,116 +0,0 @@
-NVMe Fault Injection
-====================
-Linux's fault injection framework provides a systematic way to support
-error injection via debugfs in the /sys/kernel/debug directory. When
-enabled, the default NVME_SC_INVALID_OPCODE with no retry will be
-injected into the nvme_end_request. Users can change the default status
-code and no retry flag via the debugfs. The list of Generic Command
-Status can be found in include/linux/nvme.h
-
-Following examples show how to inject an error into the nvme.
-
-First, enable CONFIG_FAULT_INJECTION_DEBUG_FS kernel config,
-recompile the kernel. After booting up the kernel, do the
-following.
-
-Example 1: Inject default status code with no retry
----------------------------------------------------
-
-mount /dev/nvme0n1 /mnt
-echo 1 > /sys/kernel/debug/nvme0n1/fault_inject/times
-echo 100 > /sys/kernel/debug/nvme0n1/fault_inject/probability
-cp a.file /mnt
-
-Expected Result:
-
-cp: cannot stat ‘/mnt/a.file’: Input/output error
-
-Message from dmesg:
-
-FAULT_INJECTION: forcing a failure.
-name fault_inject, interval 1, probability 100, space 0, times 1
-CPU: 0 PID: 0 Comm: swapper/0 Not tainted 4.15.0-rc8+ #2
-Hardware name: innotek GmbH VirtualBox/VirtualBox,
-BIOS VirtualBox 12/01/2006
-Call Trace:
-  <IRQ>
-  dump_stack+0x5c/0x7d
-  should_fail+0x148/0x170
-  nvme_should_fail+0x2f/0x50 [nvme_core]
-  nvme_process_cq+0xe7/0x1d0 [nvme]
-  nvme_irq+0x1e/0x40 [nvme]
-  __handle_irq_event_percpu+0x3a/0x190
-  handle_irq_event_percpu+0x30/0x70
-  handle_irq_event+0x36/0x60
-  handle_fasteoi_irq+0x78/0x120
-  handle_irq+0xa7/0x130
-  ? tick_irq_enter+0xa8/0xc0
-  do_IRQ+0x43/0xc0
-  common_interrupt+0xa2/0xa2
-  </IRQ>
-RIP: 0010:native_safe_halt+0x2/0x10
-RSP: 0018:ffffffff82003e90 EFLAGS: 00000246 ORIG_RAX: ffffffffffffffdd
-RAX: ffffffff817a10c0 RBX: ffffffff82012480 RCX: 0000000000000000
-RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000000
-RBP: 0000000000000000 R08: 000000008e38ce64 R09: 0000000000000000
-R10: 0000000000000000 R11: 0000000000000000 R12: ffffffff82012480
-R13: ffffffff82012480 R14: 0000000000000000 R15: 0000000000000000
-  ? __sched_text_end+0x4/0x4
-  default_idle+0x18/0xf0
-  do_idle+0x150/0x1d0
-  cpu_startup_entry+0x6f/0x80
-  start_kernel+0x4c4/0x4e4
-  ? set_init_arg+0x55/0x55
-  secondary_startup_64+0xa5/0xb0
-  print_req_error: I/O error, dev nvme0n1, sector 9240
-EXT4-fs error (device nvme0n1): ext4_find_entry:1436:
-inode #2: comm cp: reading directory lblock 0
-
-Example 2: Inject default status code with retry
-------------------------------------------------
-
-mount /dev/nvme0n1 /mnt
-echo 1 > /sys/kernel/debug/nvme0n1/fault_inject/times
-echo 100 > /sys/kernel/debug/nvme0n1/fault_inject/probability
-echo 1 > /sys/kernel/debug/nvme0n1/fault_inject/status
-echo 0 > /sys/kernel/debug/nvme0n1/fault_inject/dont_retry
-
-cp a.file /mnt
-
-Expected Result:
-
-command success without error
-
-Message from dmesg:
-
-FAULT_INJECTION: forcing a failure.
-name fault_inject, interval 1, probability 100, space 0, times 1
-CPU: 1 PID: 0 Comm: swapper/1 Not tainted 4.15.0-rc8+ #4
-Hardware name: innotek GmbH VirtualBox/VirtualBox, BIOS VirtualBox 12/01/2006
-Call Trace:
-  <IRQ>
-  dump_stack+0x5c/0x7d
-  should_fail+0x148/0x170
-  nvme_should_fail+0x30/0x60 [nvme_core]
-  nvme_loop_queue_response+0x84/0x110 [nvme_loop]
-  nvmet_req_complete+0x11/0x40 [nvmet]
-  nvmet_bio_done+0x28/0x40 [nvmet]
-  blk_update_request+0xb0/0x310
-  blk_mq_end_request+0x18/0x60
-  flush_smp_call_function_queue+0x3d/0xf0
-  smp_call_function_single_interrupt+0x2c/0xc0
-  call_function_single_interrupt+0xa2/0xb0
-  </IRQ>
-RIP: 0010:native_safe_halt+0x2/0x10
-RSP: 0018:ffffc9000068bec0 EFLAGS: 00000246 ORIG_RAX: ffffffffffffff04
-RAX: ffffffff817a10c0 RBX: ffff88011a3c9680 RCX: 0000000000000000
-RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000000
-RBP: 0000000000000001 R08: 000000008e38c131 R09: 0000000000000000
-R10: 0000000000000000 R11: 0000000000000000 R12: ffff88011a3c9680
-R13: ffff88011a3c9680 R14: 0000000000000000 R15: 0000000000000000
-  ? __sched_text_end+0x4/0x4
-  default_idle+0x18/0xf0
-  do_idle+0x150/0x1d0
-  cpu_startup_entry+0x6f/0x80
-  start_secondary+0x187/0x1e0
-  secondary_startup_64+0xa5/0xb0
diff --git a/Documentation/fault-injection/provoke-crashes.rst b/Documentation/fault-injection/provoke-crashes.rst
new file mode 100644
index 000000000000..9279a3e12278
--- /dev/null
+++ b/Documentation/fault-injection/provoke-crashes.rst
@@ -0,0 +1,48 @@
+===============
+Provoke crashes
+===============
+
+The lkdtm module provides an interface to crash or injure the kernel at
+predefined crashpoints to evaluate the reliability of crash dumps obtained
+using different dumping solutions. The module uses KPROBEs to instrument
+crashing points, but can also crash the kernel directly without KPROBE
+support.
+
+
+You can specify the crash point and action either through module arguments
+when inserting the module, or through a debugfs interface.
+
+Usage::
+
+	insmod lkdtm.ko [recur_count={>0}] cpoint_name=<> cpoint_type=<>
+			[cpoint_count={>0}]
+
+recur_count
+	Recursion level for the stack overflow test. Default is 10.
+
+cpoint_name
+	Crash point where the kernel is to be crashed. It can be
+	one of INT_HARDWARE_ENTRY, INT_HW_IRQ_EN, INT_TASKLET_ENTRY,
+	FS_DEVRW, MEM_SWAPOUT, TIMERADD, SCSI_DISPATCH_CMD,
+	IDE_CORE_CP, DIRECT
+
+cpoint_type
+	Indicates the action to be taken on hitting the crash point.
+	It can be one of PANIC, BUG, EXCEPTION, LOOP, OVERFLOW,
+	CORRUPT_STACK, UNALIGNED_LOAD_STORE_WRITE, OVERWRITE_ALLOCATION,
+	WRITE_AFTER_FREE,
+
+cpoint_count
+	Indicates the number of times the crash point is to be hit
+	to trigger an action. The default is 10.
+
+You can also induce failures by mounting debugfs and writing the type to
+<mountpoint>/provoke-crash/<crashpoint>. E.g.::
+
+  mount -t debugfs debugfs /mnt
+  echo EXCEPTION > /mnt/provoke-crash/INT_HARDWARE_ENTRY
+
+
+A special file is `DIRECT` which will induce the crash directly without
+KPROBE instrumentation. This mode is the only one available when the module
+is built on a kernel without KPROBEs support.
diff --git a/Documentation/fault-injection/provoke-crashes.txt b/Documentation/fault-injection/provoke-crashes.txt
deleted file mode 100644
index 7a9d3d81525b..000000000000
--- a/Documentation/fault-injection/provoke-crashes.txt
+++ /dev/null
@@ -1,38 +0,0 @@
-The lkdtm module provides an interface to crash or injure the kernel at
-predefined crashpoints to evaluate the reliability of crash dumps obtained
-using different dumping solutions. The module uses KPROBEs to instrument
-crashing points, but can also crash the kernel directly without KRPOBE
-support.
-
-
-You can provide the way either through module arguments when inserting
-the module, or through a debugfs interface.
-
-Usage: insmod lkdtm.ko [recur_count={>0}] cpoint_name=<> cpoint_type=<>
-				[cpoint_count={>0}]
-
-  recur_count : Recursion level for the stack overflow test. Default is 10.
-
-  cpoint_name : Crash point where the kernel is to be crashed. It can be
-	 one of INT_HARDWARE_ENTRY, INT_HW_IRQ_EN, INT_TASKLET_ENTRY,
-	 FS_DEVRW, MEM_SWAPOUT, TIMERADD, SCSI_DISPATCH_CMD,
-	 IDE_CORE_CP, DIRECT
-
-  cpoint_type : Indicates the action to be taken on hitting the crash point.
-     It can be one of PANIC, BUG, EXCEPTION, LOOP, OVERFLOW,
-     CORRUPT_STACK, UNALIGNED_LOAD_STORE_WRITE, OVERWRITE_ALLOCATION,
-     WRITE_AFTER_FREE,
-
-  cpoint_count : Indicates the number of times the crash point is to be hit
-    to trigger an action. The default is 10.
-
-You can also induce failures by mounting debugfs and writing the type to
-<mountpoint>/provoke-crash/<crashpoint>. E.g.,
-
-  mount -t debugfs debugfs /mnt
-  echo EXCEPTION > /mnt/provoke-crash/INT_HARDWARE_ENTRY
-
-
-A special file is `DIRECT' which will induce the crash directly without
-KPROBE instrumentation. This mode is the only one available when the module
-is built on a kernel without KPROBEs support.
diff --git a/Documentation/process/4.Coding.rst b/Documentation/process/4.Coding.rst
index 4b7a5ab3cec1..13dd893c9f88 100644
--- a/Documentation/process/4.Coding.rst
+++ b/Documentation/process/4.Coding.rst
@@ -298,7 +298,7 @@ enabled, a configurable percentage of memory allocations will be made to
 fail; these failures can be restricted to a specific range of code.
 Running with fault injection enabled allows the programmer to see how the
 code responds when things go badly.  See
-Documentation/fault-injection/fault-injection.txt for more information on
+Documentation/fault-injection/fault-injection.rst for more information on
 how to use this facility.
 
 Other kinds of errors can be found with the "sparse" static analysis tool.
diff --git a/Documentation/translations/it_IT/process/4.Coding.rst b/Documentation/translations/it_IT/process/4.Coding.rst
index c05b89e616dd..a5e36aa60448 100644
--- a/Documentation/translations/it_IT/process/4.Coding.rst
+++ b/Documentation/translations/it_IT/process/4.Coding.rst
@@ -314,7 +314,7 @@ di allocazione di memoria sarà destinata al fallimento; questi fallimenti
 possono essere ridotti ad uno specifico pezzo di codice.  Procedere con
 l'inserimento dei fallimenti attivo permette al programmatore di verificare
 come il codice risponde quando le cose vanno male.  Consultate:
-Documentation/fault-injection/fault-injection.txt per avere maggiori
+Documentation/fault-injection/fault-injection.rst per avere maggiori
 informazioni su come utilizzare questo strumento.
 
 Altre tipologie di errori possono essere riscontrati con lo strumento di
diff --git a/Documentation/translations/zh_CN/process/4.Coding.rst b/Documentation/translations/zh_CN/process/4.Coding.rst
index 8bb777941394..b82b1dde3122 100644
--- a/Documentation/translations/zh_CN/process/4.Coding.rst
+++ b/Documentation/translations/zh_CN/process/4.Coding.rst
@@ -205,7 +205,7 @@ Linus对这个问题给出了最佳答案:
 启用故障注入后，内存分配的可配置百分比将失败；这些失败可以限制在特定的代码
 范围内。在启用了故障注入的情况下运行，程序员可以看到当情况恶化时代码如何响
 应。有关如何使用此工具的详细信息，请参阅
-Documentation/fault-injection/fault-injection.txt。
+Documentation/fault-injection/fault-injection.rst。
 
 使用“sparse”静态分析工具可以发现其他类型的错误。对于sparse，可以警告程序员
 用户空间和内核空间地址之间的混淆、big endian和small endian数量的混合、在需
diff --git a/drivers/misc/lkdtm/core.c b/drivers/misc/lkdtm/core.c
index 8a1428d4f138..bba49abb6750 100644
--- a/drivers/misc/lkdtm/core.c
+++ b/drivers/misc/lkdtm/core.c
@@ -15,7 +15,7 @@
  *
  * Debugfs support added by Simon Kagstrom <simon.kagstrom@netinsight.net>
  *
- * See Documentation/fault-injection/provoke-crashes.txt for instructions
+ * See Documentation/fault-injection/provoke-crashes.rst for instructions
  */
 #include "lkdtm.h"
 #include <linux/fs.h>
diff --git a/include/linux/fault-inject.h b/include/linux/fault-inject.h
index 7e6c77740413..e525f6957c49 100644
--- a/include/linux/fault-inject.h
+++ b/include/linux/fault-inject.h
@@ -11,7 +11,7 @@
 
 /*
  * For explanation of the elements of this struct, see
- * Documentation/fault-injection/fault-injection.txt
+ * Documentation/fault-injection/fault-injection.rst
  */
 struct fault_attr {
 	unsigned long probability;
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index cbdfae379896..4d42a9a6006d 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -1701,7 +1701,7 @@ config LKDTM
 	called lkdtm.
 
 	Documentation on how to use the module can be found in
-	Documentation/fault-injection/provoke-crashes.txt
+	Documentation/fault-injection/provoke-crashes.rst
 
 config TEST_LIST_SORT
 	tristate "Linked list sorting test"
diff --git a/tools/testing/fault-injection/failcmd.sh b/tools/testing/fault-injection/failcmd.sh
index 29a6c63c5a15..78dac34264be 100644
--- a/tools/testing/fault-injection/failcmd.sh
+++ b/tools/testing/fault-injection/failcmd.sh
@@ -42,7 +42,7 @@ OPTIONS
 	--interval=value, --space=value, --verbose=value, --task-filter=value,
 	--stacktrace-depth=value, --require-start=value, --require-end=value,
 	--reject-start=value, --reject-end=value, --ignore-gfp-wait=value
-		See Documentation/fault-injection/fault-injection.txt for more
+		See Documentation/fault-injection/fault-injection.rst for more
 		information
 
 	failslab options:
-- 
cgit v1.2.3


From 99c8b231ae6c6ca4ca2fd1c0b3701071f589661f Mon Sep 17 00:00:00 2001
From: Mauro Carvalho Chehab <mchehab+samsung@kernel.org>
Date: Wed, 12 Jun 2019 14:52:41 -0300
Subject: docs: cgroup-v1: convert docs to ReST and rename to *.rst

Convert the cgroup-v1 files to ReST format, in order to
allow a later addition to the admin-guide.

The conversion is actually:
  - add blank lines and indentation in order to identify paragraphs;
  - fix tables markups;
  - add some lists markups;
  - mark literal blocks;
  - adjust title markups.

At its new index.rst, let's add a :orphan: while this is not linked to
the main index.rst file, in order to avoid build warnings.

Signed-off-by: Mauro Carvalho Chehab <mchehab+samsung@kernel.org>
Acked-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 Documentation/admin-guide/hw-vuln/l1tf.rst         |    2 +-
 Documentation/admin-guide/kernel-parameters.txt    |    4 +-
 .../admin-guide/mm/numa_memory_policy.rst          |    2 +-
 Documentation/block/bfq-iosched.txt                |    2 +-
 Documentation/cgroup-v1/blkio-controller.rst       |  391 ++++++++
 Documentation/cgroup-v1/blkio-controller.txt       |  375 --------
 Documentation/cgroup-v1/cgroups.rst                |  695 ++++++++++++++
 Documentation/cgroup-v1/cgroups.txt                |  677 -------------
 Documentation/cgroup-v1/cpuacct.rst                |   50 +
 Documentation/cgroup-v1/cpuacct.txt                |   49 -
 Documentation/cgroup-v1/cpusets.rst                |  866 +++++++++++++++++
 Documentation/cgroup-v1/cpusets.txt                |  839 ----------------
 Documentation/cgroup-v1/devices.rst                |  132 +++
 Documentation/cgroup-v1/devices.txt                |  116 ---
 Documentation/cgroup-v1/freezer-subsystem.rst      |  127 +++
 Documentation/cgroup-v1/freezer-subsystem.txt      |  123 ---
 Documentation/cgroup-v1/hugetlb.rst                |   50 +
 Documentation/cgroup-v1/hugetlb.txt                |   49 -
 Documentation/cgroup-v1/index.rst                  |   30 +
 Documentation/cgroup-v1/memcg_test.rst             |  355 +++++++
 Documentation/cgroup-v1/memcg_test.txt             |  280 ------
 Documentation/cgroup-v1/memory.rst                 | 1003 ++++++++++++++++++++
 Documentation/cgroup-v1/memory.txt                 |  892 -----------------
 Documentation/cgroup-v1/net_cls.rst                |   44 +
 Documentation/cgroup-v1/net_cls.txt                |   39 -
 Documentation/cgroup-v1/net_prio.rst               |   57 ++
 Documentation/cgroup-v1/net_prio.txt               |   55 --
 Documentation/cgroup-v1/pids.rst                   |   92 ++
 Documentation/cgroup-v1/pids.txt                   |   88 --
 Documentation/cgroup-v1/rdma.rst                   |  117 +++
 Documentation/cgroup-v1/rdma.txt                   |  109 ---
 Documentation/filesystems/tmpfs.txt                |    2 +-
 Documentation/scheduler/sched-deadline.txt         |    2 +-
 Documentation/scheduler/sched-design-CFS.txt       |    2 +-
 Documentation/scheduler/sched-rt-group.txt         |    2 +-
 Documentation/vm/numa.rst                          |    4 +-
 Documentation/vm/page_migration.rst                |    2 +-
 Documentation/vm/unevictable-lru.rst               |    2 +-
 Documentation/x86/x86_64/fake-numa-for-cpusets.rst |    4 +-
 MAINTAINERS                                        |    2 +-
 block/Kconfig                                      |    2 +-
 include/linux/cgroup-defs.h                        |    2 +-
 include/uapi/linux/bpf.h                           |    2 +-
 init/Kconfig                                       |    2 +-
 kernel/cgroup/cpuset.c                             |    2 +-
 security/device_cgroup.c                           |    2 +-
 tools/include/uapi/linux/bpf.h                     |    2 +-
 47 files changed, 4032 insertions(+), 3714 deletions(-)
 create mode 100644 Documentation/cgroup-v1/blkio-controller.rst
 delete mode 100644 Documentation/cgroup-v1/blkio-controller.txt
 create mode 100644 Documentation/cgroup-v1/cgroups.rst
 delete mode 100644 Documentation/cgroup-v1/cgroups.txt
 create mode 100644 Documentation/cgroup-v1/cpuacct.rst
 delete mode 100644 Documentation/cgroup-v1/cpuacct.txt
 create mode 100644 Documentation/cgroup-v1/cpusets.rst
 delete mode 100644 Documentation/cgroup-v1/cpusets.txt
 create mode 100644 Documentation/cgroup-v1/devices.rst
 delete mode 100644 Documentation/cgroup-v1/devices.txt
 create mode 100644 Documentation/cgroup-v1/freezer-subsystem.rst
 delete mode 100644 Documentation/cgroup-v1/freezer-subsystem.txt
 create mode 100644 Documentation/cgroup-v1/hugetlb.rst
 delete mode 100644 Documentation/cgroup-v1/hugetlb.txt
 create mode 100644 Documentation/cgroup-v1/index.rst
 create mode 100644 Documentation/cgroup-v1/memcg_test.rst
 delete mode 100644 Documentation/cgroup-v1/memcg_test.txt
 create mode 100644 Documentation/cgroup-v1/memory.rst
 delete mode 100644 Documentation/cgroup-v1/memory.txt
 create mode 100644 Documentation/cgroup-v1/net_cls.rst
 delete mode 100644 Documentation/cgroup-v1/net_cls.txt
 create mode 100644 Documentation/cgroup-v1/net_prio.rst
 delete mode 100644 Documentation/cgroup-v1/net_prio.txt
 create mode 100644 Documentation/cgroup-v1/pids.rst
 delete mode 100644 Documentation/cgroup-v1/pids.txt
 create mode 100644 Documentation/cgroup-v1/rdma.rst
 delete mode 100644 Documentation/cgroup-v1/rdma.txt

(limited to 'include/linux')

diff --git a/Documentation/admin-guide/hw-vuln/l1tf.rst b/Documentation/admin-guide/hw-vuln/l1tf.rst
index 31653a9f0e1b..656aee262e23 100644
--- a/Documentation/admin-guide/hw-vuln/l1tf.rst
+++ b/Documentation/admin-guide/hw-vuln/l1tf.rst
@@ -241,7 +241,7 @@ Guest mitigation mechanisms
    For further information about confining guests to a single or to a group
    of cores consult the cpusets documentation:
 
-   https://www.kernel.org/doc/Documentation/cgroup-v1/cpusets.txt
+   https://www.kernel.org/doc/Documentation/cgroup-v1/cpusets.rst
 
 .. _interrupt_isolation:
 
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 138f6664b2e2..da0e84ecee32 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -4078,7 +4078,7 @@
 
 	relax_domain_level=
 			[KNL, SMP] Set scheduler's default relax_domain_level.
-			See Documentation/cgroup-v1/cpusets.txt.
+			See Documentation/cgroup-v1/cpusets.rst.
 
 	reserve=	[KNL,BUGS] Force kernel to ignore I/O ports or memory
 			Format: <base1>,<size1>[,<base2>,<size2>,...]
@@ -4588,7 +4588,7 @@
 	swapaccount=[0|1]
 			[KNL] Enable accounting of swap in memory resource
 			controller if no parameter or 1 is given or disable
-			it if 0 is given (See Documentation/cgroup-v1/memory.txt)
+			it if 0 is given (See Documentation/cgroup-v1/memory.rst)
 
 	swiotlb=	[ARM,IA-64,PPC,MIPS,X86]
 			Format: { <int> | force | noforce }
diff --git a/Documentation/admin-guide/mm/numa_memory_policy.rst b/Documentation/admin-guide/mm/numa_memory_policy.rst
index d78c5b315f72..546f174e5d6a 100644
--- a/Documentation/admin-guide/mm/numa_memory_policy.rst
+++ b/Documentation/admin-guide/mm/numa_memory_policy.rst
@@ -15,7 +15,7 @@ document attempts to describe the concepts and APIs of the 2.6 memory policy
 support.
 
 Memory policies should not be confused with cpusets
-(``Documentation/cgroup-v1/cpusets.txt``)
+(``Documentation/cgroup-v1/cpusets.rst``)
 which is an administrative mechanism for restricting the nodes from which
 memory may be allocated by a set of processes. Memory policies are a
 programming interface that a NUMA-aware application can take advantage of.  When
diff --git a/Documentation/block/bfq-iosched.txt b/Documentation/block/bfq-iosched.txt
index 1a0f2ac02eb6..b2265cf6c9c3 100644
--- a/Documentation/block/bfq-iosched.txt
+++ b/Documentation/block/bfq-iosched.txt
@@ -539,7 +539,7 @@ As for cgroups-v1 (blkio controller), the exact set of stat files
 created, and kept up-to-date by bfq, depends on whether
 CONFIG_DEBUG_BLK_CGROUP is set. If it is set, then bfq creates all
 the stat files documented in
-Documentation/cgroup-v1/blkio-controller.txt. If, instead,
+Documentation/cgroup-v1/blkio-controller.rst. If, instead,
 CONFIG_DEBUG_BLK_CGROUP is not set, then bfq creates only the files
 blkio.bfq.io_service_bytes
 blkio.bfq.io_service_bytes_recursive
diff --git a/Documentation/cgroup-v1/blkio-controller.rst b/Documentation/cgroup-v1/blkio-controller.rst
new file mode 100644
index 000000000000..2c1b907afc14
--- /dev/null
+++ b/Documentation/cgroup-v1/blkio-controller.rst
@@ -0,0 +1,391 @@
+===================
+Block IO Controller
+===================
+
+Overview
+========
+cgroup subsys "blkio" implements the block io controller. There seems to be
+a need of various kinds of IO control policies (like proportional BW, max BW)
+both at leaf nodes as well as at intermediate nodes in a storage hierarchy.
+Plan is to use the same cgroup based management interface for blkio controller
+and based on user options switch IO policies in the background.
+
+Currently two IO control policies are implemented. First one is proportional
+weight time based division of disk policy. It is implemented in CFQ. Hence
+this policy takes effect only on leaf nodes when CFQ is being used. The second
+one is throttling policy which can be used to specify upper IO rate limits
+on devices. This policy is implemented in generic block layer and can be
+used on leaf nodes as well as higher level logical devices like device mapper.
+
+HOWTO
+=====
+Proportional Weight division of bandwidth
+-----------------------------------------
+You can do a very simple testing of running two dd threads in two different
+cgroups. Here is what you can do.
+
+- Enable Block IO controller::
+
+	CONFIG_BLK_CGROUP=y
+
+- Enable group scheduling in CFQ::
+
+
+	CONFIG_CFQ_GROUP_IOSCHED=y
+
+- Compile and boot into kernel and mount IO controller (blkio); see
+  cgroups.rst, Why are cgroups needed?.
+
+  ::
+
+	mount -t tmpfs cgroup_root /sys/fs/cgroup
+	mkdir /sys/fs/cgroup/blkio
+	mount -t cgroup -o blkio none /sys/fs/cgroup/blkio
+
+- Create two cgroups::
+
+	mkdir -p /sys/fs/cgroup/blkio/test1/ /sys/fs/cgroup/blkio/test2
+
+- Set weights of group test1 and test2::
+
+	echo 1000 > /sys/fs/cgroup/blkio/test1/blkio.weight
+	echo 500 > /sys/fs/cgroup/blkio/test2/blkio.weight
+
+- Create two same size files (say 512MB each) on same disk (file1, file2) and
+  launch two dd threads in different cgroup to read those files::
+
+	sync
+	echo 3 > /proc/sys/vm/drop_caches
+
+	dd if=/mnt/sdb/zerofile1 of=/dev/null &
+	echo $! > /sys/fs/cgroup/blkio/test1/tasks
+	cat /sys/fs/cgroup/blkio/test1/tasks
+
+	dd if=/mnt/sdb/zerofile2 of=/dev/null &
+	echo $! > /sys/fs/cgroup/blkio/test2/tasks
+	cat /sys/fs/cgroup/blkio/test2/tasks
+
+- At the macro level, the first dd should finish first. To get more precise
+  data, keep watching (with the help of a script) the blkio.disk_time and
+  blkio.disk_sectors files of both the test1 and test2 groups. These tell how
+  much disk time (in milliseconds) each group got and how many sectors each
+  group dispatched to the disk. We provide fairness in terms of disk time, so
+  ideally blkio.disk_time of the cgroups should be in proportion to the weight.
+
+Throttling/Upper Limit policy
+-----------------------------
+- Enable Block IO controller::
+
+	CONFIG_BLK_CGROUP=y
+
+- Enable throttling in block layer::
+
+	CONFIG_BLK_DEV_THROTTLING=y
+
+- Mount blkio controller (see cgroups.rst, Why are cgroups needed?)::
+
+        mount -t cgroup -o blkio none /sys/fs/cgroup/blkio
+
+- Specify a bandwidth rate on particular device for root group. The format
+  for policy is "<major>:<minor>  <bytes_per_second>"::
+
+        echo "8:16  1048576" > /sys/fs/cgroup/blkio/blkio.throttle.read_bps_device
+
+  Above will put a limit of 1MB/second on reads happening for root group
+  on device having major/minor number 8:16.
+
+- Run dd to read a file and see if rate is throttled to 1MB/s or not::
+
+        # dd iflag=direct if=/mnt/common/zerofile of=/dev/null bs=4K count=1024
+        1024+0 records in
+        1024+0 records out
+        4194304 bytes (4.2 MB) copied, 4.0001 s, 1.0 MB/s
+
+Limits for writes can be set using the blkio.throttle.write_bps_device file.
+
+Hierarchical Cgroups
+====================
+
+Both CFQ and throttling implement hierarchy support; however,
+throttling's hierarchy support is enabled iff "sane_behavior" is
+enabled from cgroup side, which currently is a development option and
+not publicly available.
+
+If somebody created a hierarchy as follows::
+
+			root
+			/  \
+		     test1 test2
+			|
+		     test3
+
+CFQ by default and throttling with "sane_behavior" will handle the
+hierarchy correctly.  For details on CFQ hierarchy support, refer to
+Documentation/block/cfq-iosched.txt.  For throttling, all limits apply
+to the whole subtree while all statistics are local to the IOs
+directly generated by tasks in that cgroup.
+
+Throttling without "sane_behavior" enabled from the cgroup side will
+practically treat all groups as being at the same level, as if the
+hierarchy looked like the following::
+
+				pivot
+			     /  /   \  \
+			root  test1 test2  test3
+
+Various user visible config options
+===================================
+CONFIG_BLK_CGROUP
+	- Block IO controller.
+
+CONFIG_DEBUG_BLK_CGROUP
+	- Debug help. Right now some additional stats files show up in cgroup
+	  if this option is enabled.
+
+CONFIG_CFQ_GROUP_IOSCHED
+	- Enables group scheduling in CFQ. Currently only 1 level of group
+	  creation is allowed.
+
+CONFIG_BLK_DEV_THROTTLING
+	- Enable block device throttling support in block layer.
+
+Details of cgroup files
+=======================
+Proportional weight policy files
+--------------------------------
+- blkio.weight
+	- Specifies per cgroup weight. This is default weight of the group
+	  on all the devices until and unless overridden by per device rule.
+	  (See blkio.weight_device).
+	  Currently allowed range of weights is from 10 to 1000.
+
+- blkio.weight_device
+	- One can specify per cgroup per device rules using this interface.
+	  These rules override the default value of group weight as specified
+	  by blkio.weight.
+
+	  Following is the format::
+
+	    # echo dev_maj:dev_minor weight > blkio.weight_device
+
+	  Configure weight=300 on /dev/sdb (8:16) in this cgroup::
+
+	    # echo 8:16 300 > blkio.weight_device
+	    # cat blkio.weight_device
+	    dev     weight
+	    8:16    300
+
+	  Configure weight=500 on /dev/sda (8:0) in this cgroup::
+
+	    # echo 8:0 500 > blkio.weight_device
+	    # cat blkio.weight_device
+	    dev     weight
+	    8:0     500
+	    8:16    300
+
+	  Remove specific weight for /dev/sda in this cgroup::
+
+	    # echo 8:0 0 > blkio.weight_device
+	    # cat blkio.weight_device
+	    dev     weight
+	    8:16    300
+
+- blkio.leaf_weight[_device]
+	- Equivalents of blkio.weight[_device] for the purpose of
+          deciding how much weight tasks in the given cgroup have while
+          competing with the cgroup's child cgroups. For details,
+          please refer to Documentation/block/cfq-iosched.txt.
+
+- blkio.time
+	- disk time allocated to cgroup per device in milliseconds. First
+	  two fields specify the major and minor number of the device and
+	  third field specifies the disk time allocated to group in
+	  milliseconds.
+
+- blkio.sectors
+	- number of sectors transferred to/from disk by the group. First
+	  two fields specify the major and minor number of the device and
+	  third field specifies the number of sectors transferred by the
+	  group to/from the device.
+
+- blkio.io_service_bytes
+	- Number of bytes transferred to/from the disk by the group. These
+	  are further divided by the type of operation - read or write, sync
+	  or async. First two fields specify the major and minor number of the
+	  device, third field specifies the operation type and the fourth field
+	  specifies the number of bytes.
+
+- blkio.io_serviced
+	- Number of IOs (bio) issued to the disk by the group. These
+	  are further divided by the type of operation - read or write, sync
+	  or async. First two fields specify the major and minor number of the
+	  device, third field specifies the operation type and the fourth field
+	  specifies the number of IOs.
+
+- blkio.io_service_time
+	- Total amount of time between request dispatch and request completion
+	  for the IOs done by this cgroup. This is in nanoseconds to make it
+	  meaningful for flash devices too. For devices with queue depth of 1,
+	  this time represents the actual service time. When queue_depth > 1,
+	  that is no longer true as requests may be served out of order. This
+	  may cause the service time for a given IO to include the service time
+	  of multiple IOs when served out of order which may result in total
+	  io_service_time > actual time elapsed. This time is further divided by
+	  the type of operation - read or write, sync or async. First two fields
+	  specify the major and minor number of the device, third field
+	  specifies the operation type and the fourth field specifies the
+	  io_service_time in ns.
+
+- blkio.io_wait_time
+	- Total amount of time the IOs for this cgroup spent waiting in the
+	  scheduler queues for service. This can be greater than the total time
+	  elapsed since it is cumulative io_wait_time for all IOs. It is not a
+	  measure of total time the cgroup spent waiting but rather a measure of
+	  the wait_time for its individual IOs. For devices with queue_depth > 1
+	  this metric does not include the time spent waiting for service once
+	  the IO is dispatched to the device but till it actually gets serviced
+	  (there might be a time lag here due to re-ordering of requests by the
+	  device). This is in nanoseconds to make it meaningful for flash
+	  devices too. This time is further divided by the type of operation -
+	  read or write, sync or async. First two fields specify the major and
+	  minor number of the device, third field specifies the operation type
+	  and the fourth field specifies the io_wait_time in ns.
+
+- blkio.io_merged
+	- Total number of bios/requests merged into requests belonging to this
+	  cgroup. This is further divided by the type of operation - read or
+	  write, sync or async.
+
+- blkio.io_queued
+	- Total number of requests queued up at any given instant for this
+	  cgroup. This is further divided by the type of operation - read or
+	  write, sync or async.
+
+- blkio.avg_queue_size
+	- Debugging aid only enabled if CONFIG_DEBUG_BLK_CGROUP=y.
+	  The average queue size for this cgroup over the entire time of this
+	  cgroup's existence. Queue size samples are taken each time one of the
+	  queues of this cgroup gets a timeslice.
+
+- blkio.group_wait_time
+	- Debugging aid only enabled if CONFIG_DEBUG_BLK_CGROUP=y.
+	  This is the amount of time the cgroup had to wait since it became busy
+	  (i.e., went from 0 to 1 request queued) to get a timeslice for one of
+	  its queues. This is different from the io_wait_time which is the
+	  cumulative total of the amount of time spent by each IO in that cgroup
+	  waiting in the scheduler queue. This is in nanoseconds. If this is
+	  read when the cgroup is in a waiting (for timeslice) state, the stat
+	  will only report the group_wait_time accumulated till the last time it
+	  got a timeslice and will not include the current delta.
+
+- blkio.empty_time
+	- Debugging aid only enabled if CONFIG_DEBUG_BLK_CGROUP=y.
+	  This is the amount of time a cgroup spends without any pending
+	  requests when not being served, i.e., it does not include any time
+	  spent idling for one of the queues of the cgroup. This is in
+	  nanoseconds. If this is read when the cgroup is in an empty state,
+	  the stat will only report the empty_time accumulated till the last
+	  time it had a pending request and will not include the current delta.
+
+- blkio.idle_time
+	- Debugging aid only enabled if CONFIG_DEBUG_BLK_CGROUP=y.
+	  This is the amount of time spent by the IO scheduler idling for a
+	  given cgroup in anticipation of a better request than the existing ones
+	  from other queues/cgroups. This is in nanoseconds. If this is read
+	  when the cgroup is in an idling state, the stat will only report the
+	  idle_time accumulated till the last idle period and will not include
+	  the current delta.
+
+- blkio.dequeue
+	- Debugging aid only enabled if CONFIG_DEBUG_BLK_CGROUP=y. This
+	  gives the statistics about how many times a group was dequeued
+	  from service tree of the device. First two fields specify the major
+	  and minor number of the device and third field specifies the number
+	  of times a group was dequeued from a particular device.
+
+- blkio.*_recursive
+	- Recursive version of various stats. These files show the
+          same information as their non-recursive counterparts but
+          include stats from all the descendant cgroups.
+
+Throttling/Upper limit policy files
+-----------------------------------
+- blkio.throttle.read_bps_device
+	- Specifies upper limit on READ rate from the device. IO rate is
+	  specified in bytes per second. Rules are per device. Following is
+	  the format::
+
+	    echo "<major>:<minor>  <rate_bytes_per_second>" > /cgrp/blkio.throttle.read_bps_device
+
+- blkio.throttle.write_bps_device
+	- Specifies upper limit on WRITE rate to the device. IO rate is
+	  specified in bytes per second. Rules are per device. Following is
+	  the format::
+
+	    echo "<major>:<minor>  <rate_bytes_per_second>" > /cgrp/blkio.throttle.write_bps_device
+
+- blkio.throttle.read_iops_device
+	- Specifies upper limit on READ rate from the device. IO rate is
+	  specified in IO per second. Rules are per device. Following is
+	  the format::
+
+	   echo "<major>:<minor>  <rate_io_per_second>" > /cgrp/blkio.throttle.read_iops_device
+
+- blkio.throttle.write_iops_device
+	- Specifies upper limit on WRITE rate to the device. IO rate is
+	  specified in io per second. Rules are per device. Following is
+	  the format::
+
+	    echo "<major>:<minor>  <rate_io_per_second>" > /cgrp/blkio.throttle.write_iops_device
+
+Note: If both BW and IOPS rules are specified for a device, then IO is
+      subjected to both the constraints.
+
+- blkio.throttle.io_serviced
+	- Number of IOs (bio) issued to the disk by the group. These
+	  are further divided by the type of operation - read or write, sync
+	  or async. First two fields specify the major and minor number of the
+	  device, third field specifies the operation type and the fourth field
+	  specifies the number of IOs.
+
+- blkio.throttle.io_service_bytes
+	- Number of bytes transferred to/from the disk by the group. These
+	  are further divided by the type of operation - read or write, sync
+	  or async. First two fields specify the major and minor number of the
+	  device, third field specifies the operation type and the fourth field
+	  specifies the number of bytes.
+
+Common files among various policies
+-----------------------------------
+- blkio.reset_stats
+	- Writing an int to this file will result in resetting all the stats
+	  for that cgroup.
+
+CFQ sysfs tunable
+=================
+/sys/block/<disk>/queue/iosched/slice_idle
+------------------------------------------
+On faster hardware CFQ can be slow, especially with sequential workload.
+This happens because CFQ idles on a single queue and single queue might not
+drive deeper request queue depths to keep the storage busy. In such scenarios
+one can try setting slice_idle=0 and that would switch CFQ to IOPS
+(IO operations per second) mode on NCQ supporting hardware.
+
+That means CFQ will not idle between cfq queues of a cfq group and hence be
+able to drive higher queue depth and achieve better throughput. That also
+means that cfq provides fairness among groups in terms of IOPS and not in
+terms of disk time.
+
+/sys/block/<disk>/queue/iosched/group_idle
+------------------------------------------
+If one disables idling on individual cfq queues and cfq service trees by
+setting slice_idle=0, group_idle kicks in. That means CFQ will still idle
+on the group in an attempt to provide fairness among groups.
+
+By default group_idle is same as slice_idle and does not do anything if
+slice_idle is enabled.
+
+You can experience an overall throughput drop if you have created multiple
+groups and put applications in those groups which are not driving enough
+IO to keep the disk busy. In that case set group_idle=0, and CFQ will not idle
+on individual groups and throughput should improve.
diff --git a/Documentation/cgroup-v1/blkio-controller.txt b/Documentation/cgroup-v1/blkio-controller.txt
deleted file mode 100644
index 673dc34d3f78..000000000000
--- a/Documentation/cgroup-v1/blkio-controller.txt
+++ /dev/null
@@ -1,375 +0,0 @@
-				Block IO Controller
-				===================
-Overview
-========
-cgroup subsys "blkio" implements the block io controller. There seems to be
-a need of various kinds of IO control policies (like proportional BW, max BW)
-both at leaf nodes as well as at intermediate nodes in a storage hierarchy.
-Plan is to use the same cgroup based management interface for blkio controller
-and based on user options switch IO policies in the background.
-
-Currently two IO control policies are implemented. First one is proportional
-weight time based division of disk policy. It is implemented in CFQ. Hence
-this policy takes effect only on leaf nodes when CFQ is being used. The second
-one is throttling policy which can be used to specify upper IO rate limits
-on devices. This policy is implemented in generic block layer and can be
-used on leaf nodes as well as higher level logical devices like device mapper.
-
-HOWTO
-=====
-Proportional Weight division of bandwidth
------------------------------------------
-You can do a very simple testing of running two dd threads in two different
-cgroups. Here is what you can do.
-
-- Enable Block IO controller
-	CONFIG_BLK_CGROUP=y
-
-- Enable group scheduling in CFQ
-	CONFIG_CFQ_GROUP_IOSCHED=y
-
-- Compile and boot into kernel and mount IO controller (blkio); see
-  cgroups.txt, Why are cgroups needed?.
-
-	mount -t tmpfs cgroup_root /sys/fs/cgroup
-	mkdir /sys/fs/cgroup/blkio
-	mount -t cgroup -o blkio none /sys/fs/cgroup/blkio
-
-- Create two cgroups
-	mkdir -p /sys/fs/cgroup/blkio/test1/ /sys/fs/cgroup/blkio/test2
-
-- Set weights of group test1 and test2
-	echo 1000 > /sys/fs/cgroup/blkio/test1/blkio.weight
-	echo 500 > /sys/fs/cgroup/blkio/test2/blkio.weight
-
-- Create two same size files (say 512MB each) on same disk (file1, file2) and
-  launch two dd threads in different cgroup to read those files.
-
-	sync
-	echo 3 > /proc/sys/vm/drop_caches
-
-	dd if=/mnt/sdb/zerofile1 of=/dev/null &
-	echo $! > /sys/fs/cgroup/blkio/test1/tasks
-	cat /sys/fs/cgroup/blkio/test1/tasks
-
-	dd if=/mnt/sdb/zerofile2 of=/dev/null &
-	echo $! > /sys/fs/cgroup/blkio/test2/tasks
-	cat /sys/fs/cgroup/blkio/test2/tasks
-
-- At macro level, first dd should finish first. To get more precise data, keep
-  on looking at (with the help of script), at blkio.disk_time and
-  blkio.disk_sectors files of both test1 and test2 groups. This will tell how
-  much disk time (in milliseconds), each group got and how many sectors each
-  group dispatched to the disk. We provide fairness in terms of disk time, so
-  ideally io.disk_time of cgroups should be in proportion to the weight.
-
-Throttling/Upper Limit policy
------------------------------
-- Enable Block IO controller
-	CONFIG_BLK_CGROUP=y
-
-- Enable throttling in block layer
-	CONFIG_BLK_DEV_THROTTLING=y
-
-- Mount blkio controller (see cgroups.txt, Why are cgroups needed?)
-        mount -t cgroup -o blkio none /sys/fs/cgroup/blkio
-
-- Specify a bandwidth rate on particular device for root group. The format
-  for policy is "<major>:<minor>  <bytes_per_second>".
-
-        echo "8:16  1048576" > /sys/fs/cgroup/blkio/blkio.throttle.read_bps_device
-
-  Above will put a limit of 1MB/second on reads happening for root group
-  on device having major/minor number 8:16.
-
-- Run dd to read a file and see if rate is throttled to 1MB/s or not.
-
-        # dd iflag=direct if=/mnt/common/zerofile of=/dev/null bs=4K count=1024
-        1024+0 records in
-        1024+0 records out
-        4194304 bytes (4.2 MB) copied, 4.0001 s, 1.0 MB/s
-
- Limits for writes can be put using blkio.throttle.write_bps_device file.
-
-Hierarchical Cgroups
-====================
-
-Both CFQ and throttling implement hierarchy support; however,
-throttling's hierarchy support is enabled iff "sane_behavior" is
-enabled from cgroup side, which currently is a development option and
-not publicly available.
-
-If somebody created a hierarchy like as follows.
-
-			root
-			/  \
-		     test1 test2
-			|
-		     test3
-
-CFQ by default and throttling with "sane_behavior" will handle the
-hierarchy correctly.  For details on CFQ hierarchy support, refer to
-Documentation/block/cfq-iosched.txt.  For throttling, all limits apply
-to the whole subtree while all statistics are local to the IOs
-directly generated by tasks in that cgroup.
-
-Throttling without "sane_behavior" enabled from cgroup side will
-practically treat all groups at same level as if it looks like the
-following.
-
-				pivot
-			     /  /   \  \
-			root  test1 test2  test3
-
-Various user visible config options
-===================================
-CONFIG_BLK_CGROUP
-	- Block IO controller.
-
-CONFIG_DEBUG_BLK_CGROUP
-	- Debug help. Right now some additional stats file show up in cgroup
-	  if this option is enabled.
-
-CONFIG_CFQ_GROUP_IOSCHED
-	- Enables group scheduling in CFQ. Currently only 1 level of group
-	  creation is allowed.
-
-CONFIG_BLK_DEV_THROTTLING
-	- Enable block device throttling support in block layer.
-
-Details of cgroup files
-=======================
-Proportional weight policy files
---------------------------------
-- blkio.weight
-	- Specifies per cgroup weight. This is default weight of the group
-	  on all the devices until and unless overridden by per device rule.
-	  (See blkio.weight_device).
-	  Currently allowed range of weights is from 10 to 1000.
-
-- blkio.weight_device
-	- One can specify per cgroup per device rules using this interface.
-	  These rules override the default value of group weight as specified
-	  by blkio.weight.
-
-	  Following is the format.
-
-	  # echo dev_maj:dev_minor weight > blkio.weight_device
-	  Configure weight=300 on /dev/sdb (8:16) in this cgroup
-	  # echo 8:16 300 > blkio.weight_device
-	  # cat blkio.weight_device
-	  dev     weight
-	  8:16    300
-
-	  Configure weight=500 on /dev/sda (8:0) in this cgroup
-	  # echo 8:0 500 > blkio.weight_device
-	  # cat blkio.weight_device
-	  dev     weight
-	  8:0     500
-	  8:16    300
-
-	  Remove specific weight for /dev/sda in this cgroup
-	  # echo 8:0 0 > blkio.weight_device
-	  # cat blkio.weight_device
-	  dev     weight
-	  8:16    300
-
-- blkio.leaf_weight[_device]
-	- Equivalents of blkio.weight[_device] for the purpose of
-          deciding how much weight tasks in the given cgroup has while
-          competing with the cgroup's child cgroups. For details,
-          please refer to Documentation/block/cfq-iosched.txt.
-
-- blkio.time
-	- disk time allocated to cgroup per device in milliseconds. First
-	  two fields specify the major and minor number of the device and
-	  third field specifies the disk time allocated to group in
-	  milliseconds.
-
-- blkio.sectors
-	- number of sectors transferred to/from disk by the group. First
-	  two fields specify the major and minor number of the device and
-	  third field specifies the number of sectors transferred by the
-	  group to/from the device.
-
-- blkio.io_service_bytes
-	- Number of bytes transferred to/from the disk by the group. These
-	  are further divided by the type of operation - read or write, sync
-	  or async. First two fields specify the major and minor number of the
-	  device, third field specifies the operation type and the fourth field
-	  specifies the number of bytes.
-
-- blkio.io_serviced
-	- Number of IOs (bio) issued to the disk by the group. These
-	  are further divided by the type of operation - read or write, sync
-	  or async. First two fields specify the major and minor number of the
-	  device, third field specifies the operation type and the fourth field
-	  specifies the number of IOs.
-
-- blkio.io_service_time
-	- Total amount of time between request dispatch and request completion
-	  for the IOs done by this cgroup. This is in nanoseconds to make it
-	  meaningful for flash devices too. For devices with queue depth of 1,
-	  this time represents the actual service time. When queue_depth > 1,
-	  that is no longer true as requests may be served out of order. This
-	  may cause the service time for a given IO to include the service time
-	  of multiple IOs when served out of order which may result in total
-	  io_service_time > actual time elapsed. This time is further divided by
-	  the type of operation - read or write, sync or async. First two fields
-	  specify the major and minor number of the device, third field
-	  specifies the operation type and the fourth field specifies the
-	  io_service_time in ns.
-
-- blkio.io_wait_time
-	- Total amount of time the IOs for this cgroup spent waiting in the
-	  scheduler queues for service. This can be greater than the total time
-	  elapsed since it is cumulative io_wait_time for all IOs. It is not a
-	  measure of total time the cgroup spent waiting but rather a measure of
-	  the wait_time for its individual IOs. For devices with queue_depth > 1
-	  this metric does not include the time spent waiting for service once
-	  the IO is dispatched to the device but till it actually gets serviced
-	  (there might be a time lag here due to re-ordering of requests by the
-	  device). This is in nanoseconds to make it meaningful for flash
-	  devices too. This time is further divided by the type of operation -
-	  read or write, sync or async. First two fields specify the major and
-	  minor number of the device, third field specifies the operation type
-	  and the fourth field specifies the io_wait_time in ns.
-
-- blkio.io_merged
-	- Total number of bios/requests merged into requests belonging to this
-	  cgroup. This is further divided by the type of operation - read or
-	  write, sync or async.
-
-- blkio.io_queued
-	- Total number of requests queued up at any given instant for this
-	  cgroup. This is further divided by the type of operation - read or
-	  write, sync or async.
-
-- blkio.avg_queue_size
-	- Debugging aid only enabled if CONFIG_DEBUG_BLK_CGROUP=y.
-	  The average queue size for this cgroup over the entire time of this
-	  cgroup's existence. Queue size samples are taken each time one of the
-	  queues of this cgroup gets a timeslice.
-
-- blkio.group_wait_time
-	- Debugging aid only enabled if CONFIG_DEBUG_BLK_CGROUP=y.
-	  This is the amount of time the cgroup had to wait since it became busy
-	  (i.e., went from 0 to 1 request queued) to get a timeslice for one of
-	  its queues. This is different from the io_wait_time which is the
-	  cumulative total of the amount of time spent by each IO in that cgroup
-	  waiting in the scheduler queue. This is in nanoseconds. If this is
-	  read when the cgroup is in a waiting (for timeslice) state, the stat
-	  will only report the group_wait_time accumulated till the last time it
-	  got a timeslice and will not include the current delta.
-
-- blkio.empty_time
-	- Debugging aid only enabled if CONFIG_DEBUG_BLK_CGROUP=y.
-	  This is the amount of time a cgroup spends without any pending
-	  requests when not being served, i.e., it does not include any time
-	  spent idling for one of the queues of the cgroup. This is in
-	  nanoseconds. If this is read when the cgroup is in an empty state,
-	  the stat will only report the empty_time accumulated till the last
-	  time it had a pending request and will not include the current delta.
-
-- blkio.idle_time
-	- Debugging aid only enabled if CONFIG_DEBUG_BLK_CGROUP=y.
-	  This is the amount of time spent by the IO scheduler idling for a
-	  given cgroup in anticipation of a better request than the existing ones
-	  from other queues/cgroups. This is in nanoseconds. If this is read
-	  when the cgroup is in an idling state, the stat will only report the
-	  idle_time accumulated till the last idle period and will not include
-	  the current delta.
-
-- blkio.dequeue
-	- Debugging aid only enabled if CONFIG_DEBUG_BLK_CGROUP=y. This
-	  gives the statistics about how many a times a group was dequeued
-	  from service tree of the device. First two fields specify the major
-	  and minor number of the device and third field specifies the number
-	  of times a group was dequeued from a particular device.
-
-- blkio.*_recursive
-	- Recursive version of various stats. These files show the
-          same information as their non-recursive counterparts but
-          include stats from all the descendant cgroups.
-
-Throttling/Upper limit policy files
------------------------------------
-- blkio.throttle.read_bps_device
-	- Specifies upper limit on READ rate from the device. IO rate is
-	  specified in bytes per second. Rules are per device. Following is
-	  the format.
-
-  echo "<major>:<minor>  <rate_bytes_per_second>" > /cgrp/blkio.throttle.read_bps_device
-
-- blkio.throttle.write_bps_device
-	- Specifies upper limit on WRITE rate to the device. IO rate is
-	  specified in bytes per second. Rules are per device. Following is
-	  the format.
-
-  echo "<major>:<minor>  <rate_bytes_per_second>" > /cgrp/blkio.throttle.write_bps_device
-
-- blkio.throttle.read_iops_device
-	- Specifies upper limit on READ rate from the device. IO rate is
-	  specified in IO per second. Rules are per device. Following is
-	  the format.
-
-  echo "<major>:<minor>  <rate_io_per_second>" > /cgrp/blkio.throttle.read_iops_device
-
-- blkio.throttle.write_iops_device
-	- Specifies upper limit on WRITE rate to the device. IO rate is
-	  specified in io per second. Rules are per device. Following is
-	  the format.
-
-  echo "<major>:<minor>  <rate_io_per_second>" > /cgrp/blkio.throttle.write_iops_device
-
-Note: If both BW and IOPS rules are specified for a device, then IO is
-      subjected to both the constraints.
-
-- blkio.throttle.io_serviced
-	- Number of IOs (bio) issued to the disk by the group. These
-	  are further divided by the type of operation - read or write, sync
-	  or async. First two fields specify the major and minor number of the
-	  device, third field specifies the operation type and the fourth field
-	  specifies the number of IOs.
-
-- blkio.throttle.io_service_bytes
-	- Number of bytes transferred to/from the disk by the group. These
-	  are further divided by the type of operation - read or write, sync
-	  or async. First two fields specify the major and minor number of the
-	  device, third field specifies the operation type and the fourth field
-	  specifies the number of bytes.
-
-Common files among various policies
------------------------------------
-- blkio.reset_stats
-	- Writing an int to this file will result in resetting all the stats
-	  for that cgroup.
-
-CFQ sysfs tunable
-=================
-/sys/block/<disk>/queue/iosched/slice_idle
-------------------------------------------
-On a faster hardware CFQ can be slow, especially with sequential workload.
-This happens because CFQ idles on a single queue and single queue might not
-drive deeper request queue depths to keep the storage busy. In such scenarios
-one can try setting slice_idle=0 and that would switch CFQ to IOPS
-(IO operations per second) mode on NCQ supporting hardware.
-
-That means CFQ will not idle between cfq queues of a cfq group and hence be
-able to driver higher queue depth and achieve better throughput. That also
-means that cfq provides fairness among groups in terms of IOPS and not in
-terms of disk time.
-
-/sys/block/<disk>/queue/iosched/group_idle
-------------------------------------------
-If one disables idling on individual cfq queues and cfq service trees by
-setting slice_idle=0, group_idle kicks in. That means CFQ will still idle
-on the group in an attempt to provide fairness among groups.
-
-By default group_idle is same as slice_idle and does not do anything if
-slice_idle is enabled.
-
-One can experience an overall throughput drop if you have created multiple
-groups and put applications in that group which are not driving enough
-IO to keep disk busy. In that case set group_idle=0, and CFQ will not idle
-on individual groups and throughput should improve.
diff --git a/Documentation/cgroup-v1/cgroups.rst b/Documentation/cgroup-v1/cgroups.rst
new file mode 100644
index 000000000000..46bbe7e022d4
--- /dev/null
+++ b/Documentation/cgroup-v1/cgroups.rst
@@ -0,0 +1,695 @@
+==============
+Control Groups
+==============
+
+Written by Paul Menage <menage@google.com> based on
+Documentation/cgroup-v1/cpusets.rst
+
+Original copyright statements from cpusets.txt:
+
+Portions Copyright (C) 2004 BULL SA.
+
+Portions Copyright (c) 2004-2006 Silicon Graphics, Inc.
+
+Modified by Paul Jackson <pj@sgi.com>
+
+Modified by Christoph Lameter <cl@linux.com>
+
+.. CONTENTS:
+
+	1. Control Groups
+	1.1 What are cgroups ?
+	1.2 Why are cgroups needed ?
+	1.3 How are cgroups implemented ?
+	1.4 What does notify_on_release do ?
+	1.5 What does clone_children do ?
+	1.6 How do I use cgroups ?
+	2. Usage Examples and Syntax
+	2.1 Basic Usage
+	2.2 Attaching processes
+	2.3 Mounting hierarchies by name
+	3. Kernel API
+	3.1 Overview
+	3.2 Synchronization
+	3.3 Subsystem API
+	4. Extended attributes usage
+	5. Questions
+
+1. Control Groups
+=================
+
+1.1 What are cgroups ?
+----------------------
+
+Control Groups provide a mechanism for aggregating/partitioning sets of
+tasks, and all their future children, into hierarchical groups with
+specialized behaviour.
+
+Definitions:
+
+A *cgroup* associates a set of tasks with a set of parameters for one
+or more subsystems.
+
+A *subsystem* is a module that makes use of the task grouping
+facilities provided by cgroups to treat groups of tasks in
+particular ways. A subsystem is typically a "resource controller" that
+schedules a resource or applies per-cgroup limits, but it may be
+anything that wants to act on a group of processes, e.g. a
+virtualization subsystem.
+
+A *hierarchy* is a set of cgroups arranged in a tree, such that
+every task in the system is in exactly one of the cgroups in the
+hierarchy, and a set of subsystems; each subsystem has system-specific
+state attached to each cgroup in the hierarchy.  Each hierarchy has
+an instance of the cgroup virtual filesystem associated with it.
+
+At any one time there may be multiple active hierarchies of task
+cgroups. Each hierarchy is a partition of all tasks in the system.
+
+User-level code may create and destroy cgroups by name in an
+instance of the cgroup virtual file system, specify and query to
+which cgroup a task is assigned, and list the task PIDs assigned to
+a cgroup. Those creations and assignments only affect the hierarchy
+associated with that instance of the cgroup file system.
+
+On their own, the only use for cgroups is for simple job
+tracking. The intention is that other subsystems hook into the generic
+cgroup support to provide new attributes for cgroups, such as
+accounting/limiting the resources which processes in a cgroup can
+access. For example, cpusets (see Documentation/cgroup-v1/cpusets.rst) allow
+you to associate a set of CPUs and a set of memory nodes with the
+tasks in each cgroup.
+
+1.2 Why are cgroups needed ?
+----------------------------
+
+There are multiple efforts to provide process aggregations in the
+Linux kernel, mainly for resource-tracking purposes. Such efforts
+include cpusets, CKRM/ResGroups, UserBeanCounters, and virtual server
+namespaces. These all require the basic notion of a
+grouping/partitioning of processes, with newly forked processes ending
+up in the same group (cgroup) as their parent process.
+
+The kernel cgroup patch provides the minimum essential kernel
+mechanisms required to efficiently implement such groups. It has
+minimal impact on the system fast paths, and provides hooks for
+specific subsystems such as cpusets to provide additional behaviour as
+desired.
+
+Multiple hierarchy support is provided to allow for situations where
+the division of tasks into cgroups is distinctly different for
+different subsystems - having parallel hierarchies allows each
+hierarchy to be a natural division of tasks, without having to handle
+complex combinations of tasks that would be present if several
+unrelated subsystems needed to be forced into the same tree of
+cgroups.
+
+At one extreme, each resource controller or subsystem could be in a
+separate hierarchy; at the other extreme, all subsystems
+would be attached to the same hierarchy.
+
+As an example of a scenario (originally proposed by vatsa@in.ibm.com)
+that can benefit from multiple hierarchies, consider a large
+university server with various users - students, professors, system
+tasks etc. The resource planning for this server could be along the
+following lines::
+
+       CPU :          "Top cpuset"
+                       /       \
+               CPUSet1         CPUSet2
+                  |               |
+               (Professors)    (Students)
+
+               In addition (system tasks) are attached to topcpuset (so
+               that they can run anywhere) with a limit of 20%
+
+       Memory : Professors (50%), Students (30%), system (20%)
+
+       Disk : Professors (50%), Students (30%), system (20%)
+
+       Network : WWW browsing (20%), Network File System (60%), others (20%)
+                               / \
+               Professors (15%)  students (5%)
+
+Browsers like Firefox/Lynx go into the WWW network class, while (k)nfsd goes
+into the NFS network class.
+
+At the same time Firefox/Lynx will share an appropriate CPU/Memory class
+depending on who launched it (prof/student).
+
+With the ability to classify tasks differently for different resources
+(by putting those resource subsystems in different hierarchies),
+the admin can easily set up a script which receives exec notifications
+and depending on who is launching the browser he can::
+
+    # echo browser_pid > /sys/fs/cgroup/<restype>/<userclass>/tasks
+
+With only a single hierarchy, he now would potentially have to create
+a separate cgroup for every browser launched and associate it with
+appropriate network and other resource class.  This may lead to
+proliferation of such cgroups.
+
+Also let's say that the administrator would like to give enhanced network
+access temporarily to a student's browser (since it is night and the user
+wants to do online gaming :))  OR give one of the student's simulation
+apps enhanced CPU power.
+
+With the ability to write PIDs directly to resource classes, it's just a
+matter of::
+
+       # echo pid > /sys/fs/cgroup/network/<new_class>/tasks
+       (after some time)
+       # echo pid > /sys/fs/cgroup/network/<orig_class>/tasks
+
+Without this ability, the administrator would have to split the cgroup into
+multiple separate ones and then associate the new cgroups with the
+new resource classes.
+
+
+
+1.3 How are cgroups implemented ?
+---------------------------------
+
+Control Groups extends the kernel as follows:
+
+ - Each task in the system has a reference-counted pointer to a
+   css_set.
+
+ - A css_set contains a set of reference-counted pointers to
+   cgroup_subsys_state objects, one for each cgroup subsystem
+   registered in the system. There is no direct link from a task to
+   the cgroup of which it's a member in each hierarchy, but this
+   can be determined by following pointers through the
+   cgroup_subsys_state objects. This is because accessing the
+   subsystem state is something that's expected to happen frequently
+   and in performance-critical code, whereas operations that require a
+   task's actual cgroup assignments (in particular, moving between
+   cgroups) are less common. A linked list runs through the cg_list
+   field of each task_struct using the css_set, anchored at
+   css_set->tasks.
+
+ - A cgroup hierarchy filesystem can be mounted for browsing and
+   manipulation from user space.
+
+ - You can list all the tasks (by PID) attached to any cgroup.
+
+The implementation of cgroups requires a few, simple hooks
+into the rest of the kernel, none in performance-critical paths:
+
+ - in init/main.c, to initialize the root cgroups and initial
+   css_set at system boot.
+
+ - in fork and exit, to attach and detach a task from its css_set.
+
+In addition, a new file system of type "cgroup" may be mounted, to
+enable browsing and modifying the cgroups presently known to the
+kernel.  When mounting a cgroup hierarchy, you may specify a
+comma-separated list of subsystems to mount as the filesystem mount
+options.  By default, mounting the cgroup filesystem attempts to
+mount a hierarchy containing all registered subsystems.
+
+If an active hierarchy with exactly the same set of subsystems already
+exists, it will be reused for the new mount. If no existing hierarchy
+matches, and any of the requested subsystems are in use in an existing
+hierarchy, the mount will fail with -EBUSY. Otherwise, a new hierarchy
+is activated, associated with the requested subsystems.
+
+It's not currently possible to bind a new subsystem to an active
+cgroup hierarchy, or to unbind a subsystem from an active cgroup
+hierarchy. This may be possible in future, but is fraught with nasty
+error-recovery issues.
+
+When a cgroup filesystem is unmounted, if there are any
+child cgroups created below the top-level cgroup, that hierarchy
+will remain active even though unmounted; if there are no
+child cgroups then the hierarchy will be deactivated.
+
+No new system calls are added for cgroups - all support for
+querying and modifying cgroups is via this cgroup file system.
+
+Each task under /proc has an added file named 'cgroup' displaying,
+for each active hierarchy, the subsystem names and the cgroup name
+as the path relative to the root of the cgroup file system.
+
+Each cgroup is represented by a directory in the cgroup file system
+containing the following files describing that cgroup:
+
+ - tasks: list of tasks (by PID) attached to that cgroup.  This list
+   is not guaranteed to be sorted.  Writing a thread ID into this file
+   moves the thread into this cgroup.
+ - cgroup.procs: list of thread group IDs in the cgroup.  This list is
+   not guaranteed to be sorted or free of duplicate TGIDs, and userspace
+   should sort/uniquify the list if this property is required.
+   Writing a thread group ID into this file moves all threads in that
+   group into this cgroup.
+ - notify_on_release flag: run the release agent on exit?
+ - release_agent: the path to use for release notifications (this file
+   exists in the top cgroup only)
+
+Other subsystems such as cpusets may add additional files in each
+cgroup dir.
+
+New cgroups are created using the mkdir system call or shell
+command.  The properties of a cgroup, such as its flags, are
+modified by writing to the appropriate file in that cgroup's
+directory, as listed above.
+
+The named hierarchical structure of nested cgroups allows partitioning
+a large system into nested, dynamically changeable, "soft-partitions".
+
+The attachment of each task, automatically inherited at fork by any
+children of that task, to a cgroup allows organizing the work load
+on a system into related sets of tasks.  A task may be re-attached to
+any other cgroup, if allowed by the permissions on the necessary
+cgroup file system directories.
+
+When a task is moved from one cgroup to another, it gets a new
+css_set pointer - if there's an already existing css_set with the
+desired collection of cgroups then that group is reused, otherwise a new
+css_set is allocated. The appropriate existing css_set is located by
+looking into a hash table.
+
+To allow access from a cgroup to the css_sets (and hence tasks)
+that comprise it, a set of cg_cgroup_link objects form a lattice;
+each cg_cgroup_link is linked into a list of cg_cgroup_links for
+a single cgroup on its cgrp_link_list field, and a list of
+cg_cgroup_links for a single css_set on its cg_link_list.
+
+Thus the set of tasks in a cgroup can be listed by iterating over
+each css_set that references the cgroup, and sub-iterating over
+each css_set's task set.
+
+The use of a Linux virtual file system (vfs) to represent the
+cgroup hierarchy provides for a familiar permission and name space
+for cgroups, with a minimum of additional kernel code.
+
+1.4 What does notify_on_release do ?
+------------------------------------
+
+If the notify_on_release flag is enabled (1) in a cgroup, then
+whenever the last task in the cgroup leaves (exits or attaches to
+some other cgroup) and the last child cgroup of that cgroup
+is removed, then the kernel runs the command specified by the contents
+of the "release_agent" file in that hierarchy's root directory,
+supplying the pathname (relative to the mount point of the cgroup
+file system) of the abandoned cgroup.  This enables automatic
+removal of abandoned cgroups.  The default value of
+notify_on_release in the root cgroup at system boot is disabled
+(0).  The default value of other cgroups at creation is the current
+value of their parents' notify_on_release settings. The default value of
+a cgroup hierarchy's release_agent path is empty.
+
+1.5 What does clone_children do ?
+---------------------------------
+
+This flag only affects the cpuset controller. If the clone_children
+flag is enabled (1) in a cgroup, a new cpuset cgroup will copy its
+configuration from the parent during initialization.
+
+1.6 How do I use cgroups ?
+--------------------------
+
+To start a new job that is to be contained within a cgroup, using
+the "cpuset" cgroup subsystem, the steps are something like::
+
+ 1) mount -t tmpfs cgroup_root /sys/fs/cgroup
+ 2) mkdir /sys/fs/cgroup/cpuset
+ 3) mount -t cgroup -ocpuset cpuset /sys/fs/cgroup/cpuset
+ 4) Create the new cgroup by doing mkdir's and write's (or echo's) in
+    the /sys/fs/cgroup/cpuset virtual file system.
+ 5) Start a task that will be the "founding father" of the new job.
+ 6) Attach that task to the new cgroup by writing its PID to the
+    /sys/fs/cgroup/cpuset tasks file for that cgroup.
+ 7) fork, exec or clone the job tasks from this founding father task.
+
+For example, the following sequence of commands will set up a cgroup
+named "Charlie", containing just CPUs 2 and 3, and Memory Node 1,
+and then start a subshell 'sh' in that cgroup::
+
+  mount -t tmpfs cgroup_root /sys/fs/cgroup
+  mkdir /sys/fs/cgroup/cpuset
+  mount -t cgroup cpuset -ocpuset /sys/fs/cgroup/cpuset
+  cd /sys/fs/cgroup/cpuset
+  mkdir Charlie
+  cd Charlie
+  /bin/echo 2-3 > cpuset.cpus
+  /bin/echo 1 > cpuset.mems
+  /bin/echo $$ > tasks
+  sh
+  # The subshell 'sh' is now running in cgroup Charlie
+  # The next line should display '/Charlie'
+  cat /proc/self/cgroup
+
+2. Usage Examples and Syntax
+============================
+
+2.1 Basic Usage
+---------------
+
+Creating, modifying, using cgroups can be done through the cgroup
+virtual filesystem.
+
+To mount a cgroup hierarchy with all available subsystems, type::
+
+  # mount -t cgroup xxx /sys/fs/cgroup
+
+The "xxx" is not interpreted by the cgroup code, but will appear in
+/proc/mounts so may be any useful identifying string that you like.
+
+Note: Some subsystems do not work without some user input first.  For instance,
+if cpusets are enabled the user will have to populate the cpus and mems files
+for each new cgroup created before that group can be used.
+
+As explained in section `1.2 Why are cgroups needed?` you should create
+different hierarchies of cgroups for each single resource or group of
+resources you want to control. Therefore, you should mount a tmpfs on
+/sys/fs/cgroup and create directories for each cgroup resource or resource
+group::
+
+  # mount -t tmpfs cgroup_root /sys/fs/cgroup
+  # mkdir /sys/fs/cgroup/rg1
+
+To mount a cgroup hierarchy with just the cpuset and memory
+subsystems, type::
+
+  # mount -t cgroup -o cpuset,memory hier1 /sys/fs/cgroup/rg1
+
+While remounting cgroups is currently supported, it is not recommended
+to use it. Remounting allows changing bound subsystems and
+release_agent. Rebinding is hardly useful as it only works when the
+hierarchy is empty and release_agent itself should be replaced with
+conventional fsnotify. The support for remounting will be removed in
+the future.
+
+To specify a hierarchy's release_agent::
+
+  # mount -t cgroup -o cpuset,release_agent="/sbin/cpuset_release_agent" \
+    xxx /sys/fs/cgroup/rg1
+
+Note that specifying 'release_agent' more than once will return failure.
+
+Note that changing the set of subsystems is currently only supported
+when the hierarchy consists of a single (root) cgroup. Supporting
+the ability to arbitrarily bind/unbind subsystems from an existing
+cgroup hierarchy is intended to be implemented in the future.
+
+Then under /sys/fs/cgroup/rg1 you can find a tree that corresponds to the
+tree of the cgroups in the system. For instance, /sys/fs/cgroup/rg1
+is the cgroup that holds the whole system.
+
+If you want to change the value of release_agent::
+
+  # echo "/sbin/new_release_agent" > /sys/fs/cgroup/rg1/release_agent
+
+It can also be changed via remount.
+
+If you want to create a new cgroup under /sys/fs/cgroup/rg1::
+
+  # cd /sys/fs/cgroup/rg1
+  # mkdir my_cgroup
+
+Now you want to do something with this cgroup::
+
+  # cd my_cgroup
+
+In this directory you can find several files::
+
+  # ls
+  cgroup.procs notify_on_release tasks
+  (plus whatever files added by the attached subsystems)
+
+Now attach your shell to this cgroup::
+
+  # /bin/echo $$ > tasks
+
+You can also create cgroups inside your cgroup by using mkdir in this
+directory::
+
+  # mkdir my_sub_cs
+
+To remove a cgroup, just use rmdir::
+
+  # rmdir my_sub_cs
+
+This will fail if the cgroup is in use (has cgroups inside, or
+has processes attached, or is held alive by other subsystem-specific
+reference).
+
+2.2 Attaching processes
+-----------------------
+
+::
+
+  # /bin/echo PID > tasks
+
+Note that it is PID, not PIDs. You can only attach ONE task at a time.
+If you have several tasks to attach, you have to do it one after another::
+
+  # /bin/echo PID1 > tasks
+  # /bin/echo PID2 > tasks
+	  ...
+  # /bin/echo PIDn > tasks
+
+You can attach the current shell task by echoing 0::
+
+  # echo 0 > tasks
+
+You can use the cgroup.procs file instead of the tasks file to move all
+threads in a threadgroup at once. Echoing the PID of any task in a
+threadgroup to cgroup.procs causes all tasks in that threadgroup to be
+attached to the cgroup. Writing 0 to cgroup.procs moves all tasks
+in the writing task's threadgroup.
+
+Note: Since every task is always a member of exactly one cgroup in each
+mounted hierarchy, to remove a task from its current cgroup you must
+move it into a new cgroup (possibly the root cgroup) by writing to the
+new cgroup's tasks file.
+
+Note: Due to some restrictions enforced by some cgroup subsystems, moving
+a process to another cgroup can fail.
+
+2.3 Mounting hierarchies by name
+--------------------------------
+
+Passing the name=<x> option when mounting a cgroups hierarchy
+associates the given name with the hierarchy.  This can be used when
+mounting a pre-existing hierarchy, in order to refer to it by name
+rather than by its set of active subsystems.  Each hierarchy is either
+nameless, or has a unique name.
+
+The name should match ``[\w.-]+``
+
+When passing a name=<x> option for a new hierarchy, you need to
+specify subsystems manually; the legacy behaviour of mounting all
+subsystems when none are explicitly specified is not supported when
+you give a hierarchy a name.
+
+The name of the hierarchy appears as part of the hierarchy description
+in /proc/mounts and /proc/<pid>/cgroup.
+
+
+3. Kernel API
+=============
+
+3.1 Overview
+------------
+
+Each kernel subsystem that wants to hook into the generic cgroup
+system needs to create a cgroup_subsys object. This contains
+various methods, which are callbacks from the cgroup system, along
+with a subsystem ID which will be assigned by the cgroup system.
+
+Other fields in the cgroup_subsys object include:
+
+- subsys_id: a unique array index for the subsystem, indicating which
+  entry in cgroup->subsys[] this subsystem should be managing.
+
+- name: should be initialized to a unique subsystem name. Should be
+  no longer than MAX_CGROUP_TYPE_NAMELEN.
+
+- early_init: indicate if the subsystem needs early initialization
+  at system boot.
+
+Each cgroup object created by the system has an array of pointers,
+indexed by subsystem ID; this pointer is entirely managed by the
+subsystem; the generic cgroup code will never touch this pointer.
+
+3.2 Synchronization
+-------------------
+
+There is a global mutex, cgroup_mutex, used by the cgroup
+system. This should be taken by anything that wants to modify a
+cgroup. It may also be taken to prevent cgroups from being
+modified, but more specific locks may be more appropriate in that
+situation.
+
+See kernel/cgroup.c for more details.
+
+Subsystems can take/release the cgroup_mutex via the functions
+cgroup_lock()/cgroup_unlock().
+
+Accessing a task's cgroup pointer may be done in the following ways:
+- while holding cgroup_mutex
+- while holding the task's alloc_lock (via task_lock())
+- inside an rcu_read_lock() section via rcu_dereference()
+
+3.3 Subsystem API
+-----------------
+
+Each subsystem should:
+
+- add an entry in linux/cgroup_subsys.h
+- define a cgroup_subsys object called <name>_cgrp_subsys
+
+Each subsystem may export the following methods. The only mandatory
+methods are css_alloc/free. Any others that are null are presumed to
+be successful no-ops.
+
+``struct cgroup_subsys_state *css_alloc(struct cgroup *cgrp)``
+(cgroup_mutex held by caller)
+
+Called to allocate a subsystem state object for a cgroup. The
+subsystem should allocate its subsystem state object for the passed
+cgroup, returning a pointer to the new object on success or an
+ERR_PTR() value. On success, the subsystem pointer should point to
+a structure of type cgroup_subsys_state (typically embedded in a
+larger subsystem-specific object), which will be initialized by the
+cgroup system. Note that this will be called at initialization to
+create the root subsystem state for this subsystem; this case can be
+identified by the passed cgroup object having a NULL parent (since
+it's the root of the hierarchy) and may be an appropriate place for
+initialization code.
+
+``int css_online(struct cgroup *cgrp)``
+(cgroup_mutex held by caller)
+
+Called after @cgrp successfully completed all allocations and was made
+visible to cgroup_for_each_child/descendant_*() iterators. The
+subsystem may choose to fail creation by returning -errno. This
+callback can be used to implement reliable state sharing and
+propagation along the hierarchy. See the comment on
+cgroup_for_each_descendant_pre() for details.
+
+``void css_offline(struct cgroup *cgrp)``
+(cgroup_mutex held by caller)
+
+This is the counterpart of css_online() and called iff css_online()
+has succeeded on @cgrp. This signifies the beginning of the end of
+@cgrp. @cgrp is being removed and the subsystem should start dropping
+all references it's holding on @cgrp. When all references are dropped,
+cgroup removal will proceed to the next step - css_free(). After this
+callback, @cgrp should be considered dead to the subsystem.
+
+``void css_free(struct cgroup *cgrp)``
+(cgroup_mutex held by caller)
+
+The cgroup system is about to free @cgrp; the subsystem should free
+its subsystem state object. By the time this method is called, @cgrp
+is completely unused; @cgrp->parent is still valid. (Note - can also
+be called for a newly-created cgroup if an error occurs after this
+subsystem's css_alloc() method has been called for the new cgroup).
+
+``int can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)``
+(cgroup_mutex held by caller)
+
+Called prior to moving one or more tasks into a cgroup; if the
+subsystem returns an error, this will abort the attach operation.
+@tset contains the tasks to be attached and is guaranteed to have at
+least one task in it.
+
+If there are multiple tasks in the taskset, then:
+  - it's guaranteed that all are from the same thread group
+  - @tset contains all tasks from the thread group whether or not
+    they're switching cgroups
+  - the first task is the leader
+
+Each @tset entry also contains the task's old cgroup and tasks which
+aren't switching cgroup can be skipped easily using the
+cgroup_taskset_for_each() iterator. Note that this isn't called on a
+fork. If this method returns 0 (success) then this should remain valid
+while the caller holds cgroup_mutex and it is ensured that either
+attach() or cancel_attach() will be called in future.
+
+``void css_reset(struct cgroup_subsys_state *css)``
+(cgroup_mutex held by caller)
+
+An optional operation which should restore @css's configuration to the
+initial state.  This is currently only used on the unified hierarchy
+when a subsystem is disabled on a cgroup through
+"cgroup.subtree_control" but should remain enabled because other
+subsystems depend on it.  cgroup core makes such a css invisible by
+removing the associated interface files and invokes this callback so
+that the hidden subsystem can return to the initial neutral state.
+This prevents unexpected resource control from a hidden css and
+ensures that the configuration is in the initial state when it is made
+visible again later.
+
+``void cancel_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)``
+(cgroup_mutex held by caller)
+
+Called when a task attach operation has failed after can_attach() has succeeded.
+A subsystem whose can_attach() has some side-effects should provide this
+function, so that the subsystem can implement a rollback; otherwise it is not
+necessary. This will be called only for subsystems whose can_attach() operation
+has succeeded. The parameters are identical to can_attach().
+
+``void attach(struct cgroup *cgrp, struct cgroup_taskset *tset)``
+(cgroup_mutex held by caller)
+
+Called after the task has been attached to the cgroup, to allow any
+post-attachment activity that requires memory allocations or blocking.
+The parameters are identical to can_attach().
+
+``void fork(struct task_struct *task)``
+
+Called when a task is forked into a cgroup.
+
+``void exit(struct task_struct *task)``
+
+Called during task exit.
+
+``void free(struct task_struct *task)``
+
+Called when the task_struct is freed.
+
+``void bind(struct cgroup *root)``
+(cgroup_mutex held by caller)
+
+Called when a cgroup subsystem is rebound to a different hierarchy
+and root cgroup. Currently this will only involve movement between
+the default hierarchy (which never has sub-cgroups) and a hierarchy
+that is being created/destroyed (and hence has no sub-cgroups).
+
+4. Extended attribute usage
+===========================
+
+cgroup filesystem supports certain types of extended attributes in its
+directories and files.  The current supported types are:
+
+	- Trusted (XATTR_TRUSTED)
+	- Security (XATTR_SECURITY)
+
+Both require CAP_SYS_ADMIN capability to set.
+
+Like in tmpfs, the extended attributes in cgroup filesystem are stored
+using kernel memory and it's advised to keep the usage to a minimum.  This
+is the reason why user defined extended attributes are not supported, since
+any user can do it and there's no limit in the value size.
+
+The current known users for this feature are SELinux to limit cgroup usage
+in containers and systemd for assorted meta data like main PID in a cgroup
+(systemd creates a cgroup per service).
+
+5. Questions
+============
+
+::
+
+  Q: what's up with this '/bin/echo' ?
+  A: bash's builtin 'echo' command does not check calls to write() for
+     errors. If you use it in the cgroup file system, you won't be
+     able to tell whether a command succeeded or failed.
+
+  Q: When I attach processes, only the first one on the line really gets attached!
+  A: We can only return one error code per call to write(). So you should also
+     put only ONE PID.
diff --git a/Documentation/cgroup-v1/cgroups.txt b/Documentation/cgroup-v1/cgroups.txt
deleted file mode 100644
index 059f7063eea6..000000000000
--- a/Documentation/cgroup-v1/cgroups.txt
+++ /dev/null
@@ -1,677 +0,0 @@
-				CGROUPS
-				-------
-
-Written by Paul Menage <menage@google.com> based on
-Documentation/cgroup-v1/cpusets.txt
-
-Original copyright statements from cpusets.txt:
-Portions Copyright (C) 2004 BULL SA.
-Portions Copyright (c) 2004-2006 Silicon Graphics, Inc.
-Modified by Paul Jackson <pj@sgi.com>
-Modified by Christoph Lameter <cl@linux.com>
-
-CONTENTS:
-=========
-
-1. Control Groups
-  1.1 What are cgroups ?
-  1.2 Why are cgroups needed ?
-  1.3 How are cgroups implemented ?
-  1.4 What does notify_on_release do ?
-  1.5 What does clone_children do ?
-  1.6 How do I use cgroups ?
-2. Usage Examples and Syntax
-  2.1 Basic Usage
-  2.2 Attaching processes
-  2.3 Mounting hierarchies by name
-3. Kernel API
-  3.1 Overview
-  3.2 Synchronization
-  3.3 Subsystem API
-4. Extended attributes usage
-5. Questions
-
-1. Control Groups
-=================
-
-1.1 What are cgroups ?
-----------------------
-
-Control Groups provide a mechanism for aggregating/partitioning sets of
-tasks, and all their future children, into hierarchical groups with
-specialized behaviour.
-
-Definitions:
-
-A *cgroup* associates a set of tasks with a set of parameters for one
-or more subsystems.
-
-A *subsystem* is a module that makes use of the task grouping
-facilities provided by cgroups to treat groups of tasks in
-particular ways. A subsystem is typically a "resource controller" that
-schedules a resource or applies per-cgroup limits, but it may be
-anything that wants to act on a group of processes, e.g. a
-virtualization subsystem.
-
-A *hierarchy* is a set of cgroups arranged in a tree, such that
-every task in the system is in exactly one of the cgroups in the
-hierarchy, and a set of subsystems; each subsystem has system-specific
-state attached to each cgroup in the hierarchy.  Each hierarchy has
-an instance of the cgroup virtual filesystem associated with it.
-
-At any one time there may be multiple active hierarchies of task
-cgroups. Each hierarchy is a partition of all tasks in the system.
-
-User-level code may create and destroy cgroups by name in an
-instance of the cgroup virtual file system, specify and query to
-which cgroup a task is assigned, and list the task PIDs assigned to
-a cgroup. Those creations and assignments only affect the hierarchy
-associated with that instance of the cgroup file system.
-
-On their own, the only use for cgroups is for simple job
-tracking. The intention is that other subsystems hook into the generic
-cgroup support to provide new attributes for cgroups, such as
-accounting/limiting the resources which processes in a cgroup can
-access. For example, cpusets (see Documentation/cgroup-v1/cpusets.txt) allow
-you to associate a set of CPUs and a set of memory nodes with the
-tasks in each cgroup.
-
-1.2 Why are cgroups needed ?
-----------------------------
-
-There are multiple efforts to provide process aggregations in the
-Linux kernel, mainly for resource-tracking purposes. Such efforts
-include cpusets, CKRM/ResGroups, UserBeanCounters, and virtual server
-namespaces. These all require the basic notion of a
-grouping/partitioning of processes, with newly forked processes ending
-up in the same group (cgroup) as their parent process.
-
-The kernel cgroup patch provides the minimum essential kernel
-mechanisms required to efficiently implement such groups. It has
-minimal impact on the system fast paths, and provides hooks for
-specific subsystems such as cpusets to provide additional behaviour as
-desired.
-
-Multiple hierarchy support is provided to allow for situations where
-the division of tasks into cgroups is distinctly different for
-different subsystems - having parallel hierarchies allows each
-hierarchy to be a natural division of tasks, without having to handle
-complex combinations of tasks that would be present if several
-unrelated subsystems needed to be forced into the same tree of
-cgroups.
-
-At one extreme, each resource controller or subsystem could be in a
-separate hierarchy; at the other extreme, all subsystems
-would be attached to the same hierarchy.
-
-As an example of a scenario (originally proposed by vatsa@in.ibm.com)
-that can benefit from multiple hierarchies, consider a large
-university server with various users - students, professors, system
-tasks etc. The resource planning for this server could be along the
-following lines:
-
-       CPU :          "Top cpuset"
-                       /       \
-               CPUSet1         CPUSet2
-                  |               |
-               (Professors)    (Students)
-
-               In addition (system tasks) are attached to topcpuset (so
-               that they can run anywhere) with a limit of 20%
-
-       Memory : Professors (50%), Students (30%), system (20%)
-
-       Disk : Professors (50%), Students (30%), system (20%)
-
-       Network : WWW browsing (20%), Network File System (60%), others (20%)
-                               / \
-               Professors (15%)  students (5%)
-
-Browsers like Firefox/Lynx go into the WWW network class, while (k)nfsd goes
-into the NFS network class.
-
-At the same time Firefox/Lynx will share an appropriate CPU/Memory class
-depending on who launched it (prof/student).
-
-With the ability to classify tasks differently for different resources
-(by putting those resource subsystems in different hierarchies),
-the admin can easily set up a script which receives exec notifications
-and depending on who is launching the browser he can
-
-    # echo browser_pid > /sys/fs/cgroup/<restype>/<userclass>/tasks
-
-With only a single hierarchy, he now would potentially have to create
-a separate cgroup for every browser launched and associate it with
-appropriate network and other resource class.  This may lead to
-proliferation of such cgroups.
-
-Also let's say that the administrator would like to give enhanced network
-access temporarily to a student's browser (since it is night and the user
-wants to do online gaming :))  OR give one of the student's simulation
-apps enhanced CPU power.
-
-With ability to write PIDs directly to resource classes, it's just a
-matter of:
-
-       # echo pid > /sys/fs/cgroup/network/<new_class>/tasks
-       (after some time)
-       # echo pid > /sys/fs/cgroup/network/<orig_class>/tasks
-
-Without this ability, the administrator would have to split the cgroup into
-multiple separate ones and then associate the new cgroups with the
-new resource classes.
-
-
-
-1.3 How are cgroups implemented ?
----------------------------------
-
-Control Groups extends the kernel as follows:
-
- - Each task in the system has a reference-counted pointer to a
-   css_set.
-
- - A css_set contains a set of reference-counted pointers to
-   cgroup_subsys_state objects, one for each cgroup subsystem
-   registered in the system. There is no direct link from a task to
-   the cgroup of which it's a member in each hierarchy, but this
-   can be determined by following pointers through the
-   cgroup_subsys_state objects. This is because accessing the
-   subsystem state is something that's expected to happen frequently
-   and in performance-critical code, whereas operations that require a
-   task's actual cgroup assignments (in particular, moving between
-   cgroups) are less common. A linked list runs through the cg_list
-   field of each task_struct using the css_set, anchored at
-   css_set->tasks.
-
- - A cgroup hierarchy filesystem can be mounted for browsing and
-   manipulation from user space.
-
- - You can list all the tasks (by PID) attached to any cgroup.
-
-The implementation of cgroups requires a few, simple hooks
-into the rest of the kernel, none in performance-critical paths:
-
- - in init/main.c, to initialize the root cgroups and initial
-   css_set at system boot.
-
- - in fork and exit, to attach and detach a task from its css_set.
-
-In addition, a new file system of type "cgroup" may be mounted, to
-enable browsing and modifying the cgroups presently known to the
-kernel.  When mounting a cgroup hierarchy, you may specify a
-comma-separated list of subsystems to mount as the filesystem mount
-options.  By default, mounting the cgroup filesystem attempts to
-mount a hierarchy containing all registered subsystems.
-
-If an active hierarchy with exactly the same set of subsystems already
-exists, it will be reused for the new mount. If no existing hierarchy
-matches, and any of the requested subsystems are in use in an existing
-hierarchy, the mount will fail with -EBUSY. Otherwise, a new hierarchy
-is activated, associated with the requested subsystems.
-
-It's not currently possible to bind a new subsystem to an active
-cgroup hierarchy, or to unbind a subsystem from an active cgroup
-hierarchy. This may be possible in future, but is fraught with nasty
-error-recovery issues.
-
-When a cgroup filesystem is unmounted, if there are any
-child cgroups created below the top-level cgroup, that hierarchy
-will remain active even though unmounted; if there are no
-child cgroups then the hierarchy will be deactivated.
-
-No new system calls are added for cgroups - all support for
-querying and modifying cgroups is via this cgroup file system.
-
-Each task under /proc has an added file named 'cgroup' displaying,
-for each active hierarchy, the subsystem names and the cgroup name
-as the path relative to the root of the cgroup file system.
-
-Each cgroup is represented by a directory in the cgroup file system
-containing the following files describing that cgroup:
-
- - tasks: list of tasks (by PID) attached to that cgroup.  This list
-   is not guaranteed to be sorted.  Writing a thread ID into this file
-   moves the thread into this cgroup.
- - cgroup.procs: list of thread group IDs in the cgroup.  This list is
-   not guaranteed to be sorted or free of duplicate TGIDs, and userspace
-   should sort/uniquify the list if this property is required.
-   Writing a thread group ID into this file moves all threads in that
-   group into this cgroup.
- - notify_on_release flag: run the release agent on exit?
- - release_agent: the path to use for release notifications (this file
-   exists in the top cgroup only)
-
-Other subsystems such as cpusets may add additional files in each
-cgroup dir.
-
-New cgroups are created using the mkdir system call or shell
-command.  The properties of a cgroup, such as its flags, are
-modified by writing to the appropriate file in that cgroups
-directory, as listed above.
-
-The named hierarchical structure of nested cgroups allows partitioning
-a large system into nested, dynamically changeable, "soft-partitions".
-
-The attachment of each task, automatically inherited at fork by any
-children of that task, to a cgroup allows organizing the work load
-on a system into related sets of tasks.  A task may be re-attached to
-any other cgroup, if allowed by the permissions on the necessary
-cgroup file system directories.
-
-When a task is moved from one cgroup to another, it gets a new
-css_set pointer - if there's an already existing css_set with the
-desired collection of cgroups then that group is reused, otherwise a new
-css_set is allocated. The appropriate existing css_set is located by
-looking into a hash table.
-
-To allow access from a cgroup to the css_sets (and hence tasks)
-that comprise it, a set of cg_cgroup_link objects form a lattice;
-each cg_cgroup_link is linked into a list of cg_cgroup_links for
-a single cgroup on its cgrp_link_list field, and a list of
-cg_cgroup_links for a single css_set on its cg_link_list.
-
-Thus the set of tasks in a cgroup can be listed by iterating over
-each css_set that references the cgroup, and sub-iterating over
-each css_set's task set.
-
-The use of a Linux virtual file system (vfs) to represent the
-cgroup hierarchy provides for a familiar permission and name space
-for cgroups, with a minimum of additional kernel code.
-
-1.4 What does notify_on_release do ?
-------------------------------------
-
-If the notify_on_release flag is enabled (1) in a cgroup, then
-whenever the last task in the cgroup leaves (exits or attaches to
-some other cgroup) and the last child cgroup of that cgroup
-is removed, then the kernel runs the command specified by the contents
-of the "release_agent" file in that hierarchy's root directory,
-supplying the pathname (relative to the mount point of the cgroup
-file system) of the abandoned cgroup.  This enables automatic
-removal of abandoned cgroups.  The default value of
-notify_on_release in the root cgroup at system boot is disabled
-(0).  The default value of other cgroups at creation is the current
-value of their parents' notify_on_release settings. The default value of
-a cgroup hierarchy's release_agent path is empty.
-
-1.5 What does clone_children do ?
----------------------------------
-
-This flag only affects the cpuset controller. If the clone_children
-flag is enabled (1) in a cgroup, a new cpuset cgroup will copy its
-configuration from the parent during initialization.
-
-1.6 How do I use cgroups ?
---------------------------
-
-To start a new job that is to be contained within a cgroup, using
-the "cpuset" cgroup subsystem, the steps are something like:
-
- 1) mount -t tmpfs cgroup_root /sys/fs/cgroup
- 2) mkdir /sys/fs/cgroup/cpuset
- 3) mount -t cgroup -ocpuset cpuset /sys/fs/cgroup/cpuset
- 4) Create the new cgroup by doing mkdir's and write's (or echo's) in
-    the /sys/fs/cgroup/cpuset virtual file system.
- 5) Start a task that will be the "founding father" of the new job.
- 6) Attach that task to the new cgroup by writing its PID to the
-    /sys/fs/cgroup/cpuset tasks file for that cgroup.
- 7) fork, exec or clone the job tasks from this founding father task.
-
-For example, the following sequence of commands will setup a cgroup
-named "Charlie", containing just CPUs 2 and 3, and Memory Node 1,
-and then start a subshell 'sh' in that cgroup:
-
-  mount -t tmpfs cgroup_root /sys/fs/cgroup
-  mkdir /sys/fs/cgroup/cpuset
-  mount -t cgroup cpuset -ocpuset /sys/fs/cgroup/cpuset
-  cd /sys/fs/cgroup/cpuset
-  mkdir Charlie
-  cd Charlie
-  /bin/echo 2-3 > cpuset.cpus
-  /bin/echo 1 > cpuset.mems
-  /bin/echo $$ > tasks
-  sh
-  # The subshell 'sh' is now running in cgroup Charlie
-  # The next line should display '/Charlie'
-  cat /proc/self/cgroup
-
-2. Usage Examples and Syntax
-============================
-
-2.1 Basic Usage
----------------
-
-Creating, modifying, using cgroups can be done through the cgroup
-virtual filesystem.
-
-To mount a cgroup hierarchy with all available subsystems, type:
-# mount -t cgroup xxx /sys/fs/cgroup
-
-The "xxx" is not interpreted by the cgroup code, but will appear in
-/proc/mounts so may be any useful identifying string that you like.
-
-Note: Some subsystems do not work without some user input first.  For instance,
-if cpusets are enabled the user will have to populate the cpus and mems files
-for each new cgroup created before that group can be used.
-
-As explained in section `1.2 Why are cgroups needed?' you should create
-different hierarchies of cgroups for each single resource or group of
-resources you want to control. Therefore, you should mount a tmpfs on
-/sys/fs/cgroup and create directories for each cgroup resource or resource
-group.
-
-# mount -t tmpfs cgroup_root /sys/fs/cgroup
-# mkdir /sys/fs/cgroup/rg1
-
-To mount a cgroup hierarchy with just the cpuset and memory
-subsystems, type:
-# mount -t cgroup -o cpuset,memory hier1 /sys/fs/cgroup/rg1
-
-While remounting cgroups is currently supported, it is not recommend
-to use it. Remounting allows changing bound subsystems and
-release_agent. Rebinding is hardly useful as it only works when the
-hierarchy is empty and release_agent itself should be replaced with
-conventional fsnotify. The support for remounting will be removed in
-the future.
-
-To Specify a hierarchy's release_agent:
-# mount -t cgroup -o cpuset,release_agent="/sbin/cpuset_release_agent" \
-  xxx /sys/fs/cgroup/rg1
-
-Note that specifying 'release_agent' more than once will return failure.
-
-Note that changing the set of subsystems is currently only supported
-when the hierarchy consists of a single (root) cgroup. Supporting
-the ability to arbitrarily bind/unbind subsystems from an existing
-cgroup hierarchy is intended to be implemented in the future.
-
-Then under /sys/fs/cgroup/rg1 you can find a tree that corresponds to the
-tree of the cgroups in the system. For instance, /sys/fs/cgroup/rg1
-is the cgroup that holds the whole system.
-
-If you want to change the value of release_agent:
-# echo "/sbin/new_release_agent" > /sys/fs/cgroup/rg1/release_agent
-
-It can also be changed via remount.
-
-If you want to create a new cgroup under /sys/fs/cgroup/rg1:
-# cd /sys/fs/cgroup/rg1
-# mkdir my_cgroup
-
-Now you want to do something with this cgroup.
-# cd my_cgroup
-
-In this directory you can find several files:
-# ls
-cgroup.procs notify_on_release tasks
-(plus whatever files added by the attached subsystems)
-
-Now attach your shell to this cgroup:
-# /bin/echo $$ > tasks
-
-You can also create cgroups inside your cgroup by using mkdir in this
-directory.
-# mkdir my_sub_cs
-
-To remove a cgroup, just use rmdir:
-# rmdir my_sub_cs
-
-This will fail if the cgroup is in use (has cgroups inside, or
-has processes attached, or is held alive by other subsystem-specific
-reference).
-
-2.2 Attaching processes
------------------------
-
-# /bin/echo PID > tasks
-
-Note that it is PID, not PIDs. You can only attach ONE task at a time.
-If you have several tasks to attach, you have to do it one after another:
-
-# /bin/echo PID1 > tasks
-# /bin/echo PID2 > tasks
-	...
-# /bin/echo PIDn > tasks
-
-You can attach the current shell task by echoing 0:
-
-# echo 0 > tasks
-
-You can use the cgroup.procs file instead of the tasks file to move all
-threads in a threadgroup at once. Echoing the PID of any task in a
-threadgroup to cgroup.procs causes all tasks in that threadgroup to be
-attached to the cgroup. Writing 0 to cgroup.procs moves all tasks
-in the writing task's threadgroup.
-
-Note: Since every task is always a member of exactly one cgroup in each
-mounted hierarchy, to remove a task from its current cgroup you must
-move it into a new cgroup (possibly the root cgroup) by writing to the
-new cgroup's tasks file.
-
-Note: Due to some restrictions enforced by some cgroup subsystems, moving
-a process to another cgroup can fail.
-
-2.3 Mounting hierarchies by name
---------------------------------
-
-Passing the name=<x> option when mounting a cgroups hierarchy
-associates the given name with the hierarchy.  This can be used when
-mounting a pre-existing hierarchy, in order to refer to it by name
-rather than by its set of active subsystems.  Each hierarchy is either
-nameless, or has a unique name.
-
-The name should match [\w.-]+
-
-When passing a name=<x> option for a new hierarchy, you need to
-specify subsystems manually; the legacy behaviour of mounting all
-subsystems when none are explicitly specified is not supported when
-you give a subsystem a name.
-
-The name of the subsystem appears as part of the hierarchy description
-in /proc/mounts and /proc/<pid>/cgroups.
-
-
-3. Kernel API
-=============
-
-3.1 Overview
-------------
-
-Each kernel subsystem that wants to hook into the generic cgroup
-system needs to create a cgroup_subsys object. This contains
-various methods, which are callbacks from the cgroup system, along
-with a subsystem ID which will be assigned by the cgroup system.
-
-Other fields in the cgroup_subsys object include:
-
-- subsys_id: a unique array index for the subsystem, indicating which
-  entry in cgroup->subsys[] this subsystem should be managing.
-
-- name: should be initialized to a unique subsystem name. Should be
-  no longer than MAX_CGROUP_TYPE_NAMELEN.
-
-- early_init: indicate if the subsystem needs early initialization
-  at system boot.
-
-Each cgroup object created by the system has an array of pointers,
-indexed by subsystem ID; this pointer is entirely managed by the
-subsystem; the generic cgroup code will never touch this pointer.
-
-3.2 Synchronization
--------------------
-
-There is a global mutex, cgroup_mutex, used by the cgroup
-system. This should be taken by anything that wants to modify a
-cgroup. It may also be taken to prevent cgroups from being
-modified, but more specific locks may be more appropriate in that
-situation.
-
-See kernel/cgroup.c for more details.
-
-Subsystems can take/release the cgroup_mutex via the functions
-cgroup_lock()/cgroup_unlock().
-
-Accessing a task's cgroup pointer may be done in the following ways:
-- while holding cgroup_mutex
-- while holding the task's alloc_lock (via task_lock())
-- inside an rcu_read_lock() section via rcu_dereference()
-
-3.3 Subsystem API
------------------
-
-Each subsystem should:
-
-- add an entry in linux/cgroup_subsys.h
-- define a cgroup_subsys object called <name>_cgrp_subsys
-
-Each subsystem may export the following methods. The only mandatory
-methods are css_alloc/free. Any others that are null are presumed to
-be successful no-ops.
-
-struct cgroup_subsys_state *css_alloc(struct cgroup *cgrp)
-(cgroup_mutex held by caller)
-
-Called to allocate a subsystem state object for a cgroup. The
-subsystem should allocate its subsystem state object for the passed
-cgroup, returning a pointer to the new object on success or a
-ERR_PTR() value. On success, the subsystem pointer should point to
-a structure of type cgroup_subsys_state (typically embedded in a
-larger subsystem-specific object), which will be initialized by the
-cgroup system. Note that this will be called at initialization to
-create the root subsystem state for this subsystem; this case can be
-identified by the passed cgroup object having a NULL parent (since
-it's the root of the hierarchy) and may be an appropriate place for
-initialization code.
-
-int css_online(struct cgroup *cgrp)
-(cgroup_mutex held by caller)
-
-Called after @cgrp successfully completed all allocations and made
-visible to cgroup_for_each_child/descendant_*() iterators. The
-subsystem may choose to fail creation by returning -errno. This
-callback can be used to implement reliable state sharing and
-propagation along the hierarchy. See the comment on
-cgroup_for_each_descendant_pre() for details.
-
-void css_offline(struct cgroup *cgrp);
-(cgroup_mutex held by caller)
-
-This is the counterpart of css_online() and called iff css_online()
-has succeeded on @cgrp. This signifies the beginning of the end of
-@cgrp. @cgrp is being removed and the subsystem should start dropping
-all references it's holding on @cgrp. When all references are dropped,
-cgroup removal will proceed to the next step - css_free(). After this
-callback, @cgrp should be considered dead to the subsystem.
-
-void css_free(struct cgroup *cgrp)
-(cgroup_mutex held by caller)
-
-The cgroup system is about to free @cgrp; the subsystem should free
-its subsystem state object. By the time this method is called, @cgrp
-is completely unused; @cgrp->parent is still valid. (Note - can also
-be called for a newly-created cgroup if an error occurs after this
-subsystem's create() method has been called for the new cgroup).
-
-int can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
-(cgroup_mutex held by caller)
-
-Called prior to moving one or more tasks into a cgroup; if the
-subsystem returns an error, this will abort the attach operation.
-@tset contains the tasks to be attached and is guaranteed to have at
-least one task in it.
-
-If there are multiple tasks in the taskset, then:
-  - it's guaranteed that all are from the same thread group
-  - @tset contains all tasks from the thread group whether or not
-    they're switching cgroups
-  - the first task is the leader
-
-Each @tset entry also contains the task's old cgroup and tasks which
-aren't switching cgroup can be skipped easily using the
-cgroup_taskset_for_each() iterator. Note that this isn't called on a
-fork. If this method returns 0 (success) then this should remain valid
-while the caller holds cgroup_mutex and it is ensured that either
-attach() or cancel_attach() will be called in future.
-
-void css_reset(struct cgroup_subsys_state *css)
-(cgroup_mutex held by caller)
-
-An optional operation which should restore @css's configuration to the
-initial state.  This is currently only used on the unified hierarchy
-when a subsystem is disabled on a cgroup through
-"cgroup.subtree_control" but should remain enabled because other
-subsystems depend on it.  cgroup core makes such a css invisible by
-removing the associated interface files and invokes this callback so
-that the hidden subsystem can return to the initial neutral state.
-This prevents unexpected resource control from a hidden css and
-ensures that the configuration is in the initial state when it is made
-visible again later.
-
-void cancel_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
-(cgroup_mutex held by caller)
-
-Called when a task attach operation has failed after can_attach() has succeeded.
-A subsystem whose can_attach() has some side-effects should provide this
-function, so that the subsystem can implement a rollback. If not, not necessary.
-This will be called only about subsystems whose can_attach() operation have
-succeeded. The parameters are identical to can_attach().
-
-void attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
-(cgroup_mutex held by caller)
-
-Called after the task has been attached to the cgroup, to allow any
-post-attachment activity that requires memory allocations or blocking.
-The parameters are identical to can_attach().
-
-void fork(struct task_struct *task)
-
-Called when a task is forked into a cgroup.
-
-void exit(struct task_struct *task)
-
-Called during task exit.
-
-void free(struct task_struct *task)
-
-Called when the task_struct is freed.
-
-void bind(struct cgroup *root)
-(cgroup_mutex held by caller)
-
-Called when a cgroup subsystem is rebound to a different hierarchy
-and root cgroup. Currently this will only involve movement between
-the default hierarchy (which never has sub-cgroups) and a hierarchy
-that is being created/destroyed (and hence has no sub-cgroups).
-
-4. Extended attribute usage
-===========================
-
-cgroup filesystem supports certain types of extended attributes in its
-directories and files.  The current supported types are:
-	- Trusted (XATTR_TRUSTED)
-	- Security (XATTR_SECURITY)
-
-Both require CAP_SYS_ADMIN capability to set.
-
-Like in tmpfs, the extended attributes in cgroup filesystem are stored
-using kernel memory and it's advised to keep the usage at minimum.  This
-is the reason why user defined extended attributes are not supported, since
-any user can do it and there's no limit in the value size.
-
-The current known users for this feature are SELinux to limit cgroup usage
-in containers and systemd for assorted meta data like main PID in a cgroup
-(systemd creates a cgroup per service).
-
-5. Questions
-============
-
-Q: what's up with this '/bin/echo' ?
-A: bash's builtin 'echo' command does not check calls to write() against
-   errors. If you use it in the cgroup file system, you won't be
-   able to tell whether a command succeeded or failed.
-
-Q: When I attach processes, only the first of the line gets really attached !
-A: We can only return one error code per call to write(). So you should also
-   put only ONE PID.
-
diff --git a/Documentation/cgroup-v1/cpuacct.rst b/Documentation/cgroup-v1/cpuacct.rst
new file mode 100644
index 000000000000..d30ed81d2ad7
--- /dev/null
+++ b/Documentation/cgroup-v1/cpuacct.rst
@@ -0,0 +1,50 @@
+=========================
+CPU Accounting Controller
+=========================
+
+The CPU accounting controller is used to group tasks using cgroups and
+account the CPU usage of these groups of tasks.
+
+The CPU accounting controller supports multi-hierarchy groups. An accounting
+group accumulates the CPU usage of all of its child groups and the tasks
+directly present in its group.
+
+Accounting groups can be created by first mounting the cgroup filesystem::
+
+  # mount -t cgroup -ocpuacct none /sys/fs/cgroup
+
+With the above step, the initial or the parent accounting group becomes
+visible at /sys/fs/cgroup. At bootup, this group includes all the tasks in
+the system. /sys/fs/cgroup/tasks lists the tasks in this cgroup.
+/sys/fs/cgroup/cpuacct.usage gives the CPU time (in nanoseconds) obtained
+by this group which is essentially the CPU time obtained by all the tasks
+in the system.
+
+New accounting groups can be created under the parent group /sys/fs/cgroup::
+
+  # cd /sys/fs/cgroup
+  # mkdir g1
+  # echo $$ > g1/tasks
+
+The above steps create a new group g1 and move the current shell
+process (bash) into it. CPU time consumed by this bash and its children
+can be obtained from g1/cpuacct.usage and the same is accumulated in
+/sys/fs/cgroup/cpuacct.usage also.
+
+cpuacct.stat file lists a few statistics which further divide the
+CPU time obtained by the cgroup into user and system times. Currently
+the following statistics are supported:
+
+user: Time spent by tasks of the cgroup in user mode.
+system: Time spent by tasks of the cgroup in kernel mode.
+
+user and system are in USER_HZ unit.
+
+cpuacct controller uses percpu_counter interface to collect user and
+system times. This has two side effects:
+
+- It is theoretically possible to see wrong values for user and system times.
+  This is because percpu_counter_read() on 32bit systems isn't safe
+  against concurrent writes.
+- It is possible to see slightly outdated values for user and system times
+  due to the batch processing nature of percpu_counter.
diff --git a/Documentation/cgroup-v1/cpuacct.txt b/Documentation/cgroup-v1/cpuacct.txt
deleted file mode 100644
index 9d73cc0cadb9..000000000000
--- a/Documentation/cgroup-v1/cpuacct.txt
+++ /dev/null
@@ -1,49 +0,0 @@
-CPU Accounting Controller
--------------------------
-
-The CPU accounting controller is used to group tasks using cgroups and
-account the CPU usage of these groups of tasks.
-
-The CPU accounting controller supports multi-hierarchy groups. An accounting
-group accumulates the CPU usage of all of its child groups and the tasks
-directly present in its group.
-
-Accounting groups can be created by first mounting the cgroup filesystem.
-
-# mount -t cgroup -ocpuacct none /sys/fs/cgroup
-
-With the above step, the initial or the parent accounting group becomes
-visible at /sys/fs/cgroup. At bootup, this group includes all the tasks in
-the system. /sys/fs/cgroup/tasks lists the tasks in this cgroup.
-/sys/fs/cgroup/cpuacct.usage gives the CPU time (in nanoseconds) obtained
-by this group which is essentially the CPU time obtained by all the tasks
-in the system.
-
-New accounting groups can be created under the parent group /sys/fs/cgroup.
-
-# cd /sys/fs/cgroup
-# mkdir g1
-# echo $$ > g1/tasks
-
-The above steps create a new group g1 and move the current shell
-process (bash) into it. CPU time consumed by this bash and its children
-can be obtained from g1/cpuacct.usage and the same is accumulated in
-/sys/fs/cgroup/cpuacct.usage also.
-
-cpuacct.stat file lists a few statistics which further divide the
-CPU time obtained by the cgroup into user and system times. Currently
-the following statistics are supported:
-
-user: Time spent by tasks of the cgroup in user mode.
-system: Time spent by tasks of the cgroup in kernel mode.
-
-user and system are in USER_HZ unit.
-
-cpuacct controller uses percpu_counter interface to collect user and
-system times. This has two side effects:
-
-- It is theoretically possible to see wrong values for user and system times.
-  This is because percpu_counter_read() on 32bit systems isn't safe
-  against concurrent writes.
-- It is possible to see slightly outdated values for user and system times
-  due to the batch processing nature of percpu_counter.
diff --git a/Documentation/cgroup-v1/cpusets.rst b/Documentation/cgroup-v1/cpusets.rst
new file mode 100644
index 000000000000..b6a42cdea72b
--- /dev/null
+++ b/Documentation/cgroup-v1/cpusets.rst
@@ -0,0 +1,866 @@
+=======
+CPUSETS
+=======
+
+Copyright (C) 2004 BULL SA.
+
+Written by Simon.Derr@bull.net
+
+- Portions Copyright (c) 2004-2006 Silicon Graphics, Inc.
+- Modified by Paul Jackson <pj@sgi.com>
+- Modified by Christoph Lameter <cl@linux.com>
+- Modified by Paul Menage <menage@google.com>
+- Modified by Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
+
+.. CONTENTS:
+
+   1. Cpusets
+     1.1 What are cpusets ?
+     1.2 Why are cpusets needed ?
+     1.3 How are cpusets implemented ?
+     1.4 What are exclusive cpusets ?
+     1.5 What is memory_pressure ?
+     1.6 What is memory spread ?
+     1.7 What is sched_load_balance ?
+     1.8 What is sched_relax_domain_level ?
+     1.9 How do I use cpusets ?
+   2. Usage Examples and Syntax
+     2.1 Basic Usage
+     2.2 Adding/removing cpus
+     2.3 Setting flags
+     2.4 Attaching processes
+   3. Questions
+   4. Contact
+
+1. Cpusets
+==========
+
+1.1 What are cpusets ?
+----------------------
+
+Cpusets provide a mechanism for assigning a set of CPUs and Memory
+Nodes to a set of tasks.   In this document "Memory Node" refers to
+an on-line node that contains memory.
+
+Cpusets constrain the CPU and Memory placement of tasks to only
+the resources within a task's current cpuset.  They form a nested
+hierarchy visible in a virtual file system.  These are the essential
+hooks, beyond what is already present, required to manage dynamic
+job placement on large systems.
+
+Cpusets use the generic cgroup subsystem described in
+Documentation/cgroup-v1/cgroups.rst.
+
+Requests by a task, using the sched_setaffinity(2) system call to
+include CPUs in its CPU affinity mask, and using the mbind(2) and
+set_mempolicy(2) system calls to include Memory Nodes in its memory
+policy, are both filtered through that task's cpuset, filtering out any
+CPUs or Memory Nodes not in that cpuset.  The scheduler will not
+schedule a task on a CPU that is not allowed in its cpus_allowed
+vector, and the kernel page allocator will not allocate a page on a
+node that is not allowed in the requesting task's mems_allowed vector.
+
+User level code may create and destroy cpusets by name in the cgroup
+virtual file system, manage the attributes and permissions of these
+cpusets and which CPUs and Memory Nodes are assigned to each cpuset,
+specify and query to which cpuset a task is assigned, and list the
+task pids assigned to a cpuset.
+
+
+1.2 Why are cpusets needed ?
+----------------------------
+
+The management of large computer systems, with many processors (CPUs),
+complex memory cache hierarchies and multiple Memory Nodes having
+non-uniform access times (NUMA) presents additional challenges for
+the efficient scheduling and memory placement of processes.
+
+Frequently more modest sized systems can be operated with adequate
+efficiency just by letting the operating system automatically share
+the available CPU and Memory resources amongst the requesting tasks.
+
+But larger systems, which benefit more from careful processor and
+memory placement to reduce memory access times and contention,
+and which typically represent a larger investment for the customer,
+can benefit from explicitly placing jobs on properly sized subsets of
+the system.
+
+This can be especially valuable on:
+
+    * Web Servers running multiple instances of the same web application,
+    * Servers running different applications (for instance, a web server
+      and a database), or
+    * NUMA systems running large HPC applications with demanding
+      performance characteristics.
+
+These subsets, or "soft partitions" must be able to be dynamically
+adjusted, as the job mix changes, without impacting other concurrently
+executing jobs. The location of the running jobs' pages may also be moved
+when the memory locations are changed.
+
+The kernel cpuset patch provides the minimum essential kernel
+mechanisms required to efficiently implement such subsets.  It
+leverages existing CPU and Memory Placement facilities in the Linux
+kernel to avoid any additional impact on the critical scheduler or
+memory allocator code.
+
+
+1.3 How are cpusets implemented ?
+---------------------------------
+
+Cpusets provide a Linux kernel mechanism to constrain which CPUs and
+Memory Nodes are used by a process or set of processes.
+
+The Linux kernel already has a pair of mechanisms to specify on which
+CPUs a task may be scheduled (sched_setaffinity) and on which Memory
+Nodes it may obtain memory (mbind, set_mempolicy).
+
+Cpusets extend these two mechanisms as follows:
+
+ - Cpusets are sets of allowed CPUs and Memory Nodes, known to the
+   kernel.
+ - Each task in the system is attached to a cpuset, via a pointer
+   in the task structure to a reference counted cgroup structure.
+ - Calls to sched_setaffinity are filtered to just those CPUs
+   allowed in that task's cpuset.
+ - Calls to mbind and set_mempolicy are filtered to just
+   those Memory Nodes allowed in that task's cpuset.
+ - The root cpuset contains all the system's CPUs and Memory
+   Nodes.
+ - For any cpuset, one can define child cpusets containing a subset
+   of the parent's CPU and Memory Node resources.
+ - The hierarchy of cpusets can be mounted at /dev/cpuset, for
+   browsing and manipulation from user space.
+ - A cpuset may be marked exclusive, which ensures that no other
+   cpuset (except direct ancestors and descendants) may contain
+   any overlapping CPUs or Memory Nodes.
+ - You can list all the tasks (by pid) attached to any cpuset.
+
+The implementation of cpusets requires a few, simple hooks
+into the rest of the kernel, none in performance critical paths:
+
+ - in init/main.c, to initialize the root cpuset at system boot.
+ - in fork and exit, to attach and detach a task from its cpuset.
+ - in sched_setaffinity, to mask the requested CPUs by what's
+   allowed in that task's cpuset.
+ - in sched.c migrate_live_tasks(), to keep migrating tasks within
+   the CPUs allowed by their cpuset, if possible.
+ - in the mbind and set_mempolicy system calls, to mask the requested
+   Memory Nodes by what's allowed in that task's cpuset.
+ - in page_alloc.c, to restrict memory to allowed nodes.
+ - in vmscan.c, to restrict page recovery to the current cpuset.
+
+You should mount the "cgroup" filesystem type in order to enable
+browsing and modifying the cpusets presently known to the kernel.  No
+new system calls are added for cpusets - all support for querying and
+modifying cpusets is via this cpuset file system.
+
+The /proc/<pid>/status file for each task has four added lines,
+displaying the task's cpus_allowed (on which CPUs it may be scheduled)
+and mems_allowed (on which Memory Nodes it may obtain memory),
+in the two formats seen in the following example::
+
+  Cpus_allowed:   ffffffff,ffffffff,ffffffff,ffffffff
+  Cpus_allowed_list:      0-127
+  Mems_allowed:   ffffffff,ffffffff
+  Mems_allowed_list:      0-63
+
+Each cpuset is represented by a directory in the cgroup file system
+containing (on top of the standard cgroup files) the following
+files describing that cpuset:
+
+ - cpuset.cpus: list of CPUs in that cpuset
+ - cpuset.mems: list of Memory Nodes in that cpuset
+ - cpuset.memory_migrate flag: if set, move pages to cpuset's nodes
+ - cpuset.cpu_exclusive flag: is cpu placement exclusive?
+ - cpuset.mem_exclusive flag: is memory placement exclusive?
+ - cpuset.mem_hardwall flag:  is memory allocation hardwalled
+ - cpuset.memory_pressure: measure of how much paging pressure in cpuset
+ - cpuset.memory_spread_page flag: if set, spread page cache evenly on allowed nodes
+ - cpuset.memory_spread_slab flag: if set, spread slab cache evenly on allowed nodes
+ - cpuset.sched_load_balance flag: if set, load balance within CPUs on that cpuset
+ - cpuset.sched_relax_domain_level: the searching range when migrating tasks
+
+In addition, only the root cpuset has the following file:
+
+ - cpuset.memory_pressure_enabled flag: compute memory_pressure?
+
+New cpusets are created using the mkdir system call or shell
+command.  The properties of a cpuset, such as its flags, allowed
+CPUs and Memory Nodes, and attached tasks, are modified by writing
+to the appropriate file in that cpuset's directory, as listed above.
+
+The named hierarchical structure of nested cpusets allows partitioning
+a large system into nested, dynamically changeable, "soft-partitions".
+
+The attachment of each task, automatically inherited at fork by any
+children of that task, to a cpuset allows organizing the work load
+on a system into related sets of tasks such that each set is constrained
+to using the CPUs and Memory Nodes of a particular cpuset.  A task
+may be re-attached to any other cpuset, if allowed by the permissions
+on the necessary cpuset file system directories.
+
+Such management of a system "in the large" integrates smoothly with
+the detailed placement done on individual tasks and memory regions
+using the sched_setaffinity, mbind and set_mempolicy system calls.
+
+The following rules apply to each cpuset:
+
+ - Its CPUs and Memory Nodes must be a subset of its parent's.
+ - It can't be marked exclusive unless its parent is.
+ - If its cpu or memory is exclusive, they may not overlap any sibling.
+
+These rules, and the natural hierarchy of cpusets, enable efficient
+enforcement of the exclusive guarantee, without having to scan all
+cpusets every time any of them change to ensure nothing overlaps an
+exclusive cpuset.  Also, the use of a Linux virtual file system (vfs)
+to represent the cpuset hierarchy provides for a familiar permission
+and name space for cpusets, with a minimum of additional kernel code.
+
+The cpus and mems files in the root (top_cpuset) cpuset are
+read-only.  The cpus file automatically tracks the value of
+cpu_online_mask using a CPU hotplug notifier, and the mems file
+automatically tracks the value of node_states[N_MEMORY]--i.e.,
+nodes with memory--using the cpuset_track_online_nodes() hook.
+
+
+1.4 What are exclusive cpusets ?
+--------------------------------
+
+If a cpuset is cpu or mem exclusive, no other cpuset, other than
+a direct ancestor or descendant, may share any of the same CPUs or
+Memory Nodes.
+
+A cpuset that is cpuset.mem_exclusive *or* cpuset.mem_hardwall is "hardwalled",
+i.e. it restricts kernel allocations for page, buffer and other data
+commonly shared by the kernel across multiple users.  All cpusets,
+whether hardwalled or not, restrict allocations of memory for user
+space.  This enables configuring a system so that several independent
+jobs can share common kernel data, such as file system pages, while
+isolating each job's user allocation in its own cpuset.  To do this,
+construct a large mem_exclusive cpuset to hold all the jobs, and
+construct child, non-mem_exclusive cpusets for each individual job.
+Only a small amount of typical kernel memory, such as requests from
+interrupt handlers, is allowed to be taken outside even a
+mem_exclusive cpuset.
+
+
+1.5 What is memory_pressure ?
+-----------------------------
+The memory_pressure of a cpuset provides a simple per-cpuset metric
+of the rate that the tasks in a cpuset are attempting to free up
+in-use memory on the nodes of the cpuset to satisfy additional memory
+requests.
+
+This enables batch managers monitoring jobs running in dedicated
+cpusets to efficiently detect what level of memory pressure that job
+is causing.
+
+This is useful both on tightly managed systems running a wide mix of
+submitted jobs, which may choose to terminate or re-prioritize jobs that
+are trying to use more memory than allowed on the nodes assigned to them,
+and with tightly coupled, long running, massively parallel scientific
+computing jobs that will dramatically fail to meet required performance
+goals if they start to use more memory than allowed to them.
+
+This mechanism provides a very economical way for the batch manager
+to monitor a cpuset for signs of memory pressure.  It's up to the
+batch manager or other user code to decide what to do about it and
+take action.
+
+==>
+    Unless this feature is enabled by writing "1" to the special file
+    /dev/cpuset/memory_pressure_enabled, the hook in the rebalance
+    code of __alloc_pages() for this metric reduces to simply noticing
+    that the cpuset_memory_pressure_enabled flag is zero.  So only
+    systems that enable this feature will compute the metric.
+
+Why a per-cpuset, running average:
+
+    Because this meter is per-cpuset, rather than per-task or mm,
+    the system load imposed by a batch scheduler monitoring this
+    metric is sharply reduced on large systems, because a scan of
+    the tasklist can be avoided on each set of queries.
+
+    Because this meter is a running average, instead of an accumulating
+    counter, a batch scheduler can detect memory pressure with a
+    single read, instead of having to read and accumulate results
+    for a period of time.
+
+    Because this meter is per-cpuset rather than per-task or mm,
+    the batch scheduler can obtain the key information, memory
+    pressure in a cpuset, with a single read, rather than having to
+    query and accumulate results over all the (dynamically changing)
+    set of tasks in the cpuset.
+
+A per-cpuset simple digital filter (requires a spinlock and 3 words
+of data per-cpuset) is kept, and updated by any task attached to that
+cpuset, if it enters the synchronous (direct) page reclaim code.
+
+A per-cpuset file provides an integer number representing the recent
+(half-life of 10 seconds) rate of direct page reclaims caused by
+the tasks in the cpuset, in units of reclaims attempted per second,
+times 1000.
+
+
+1.6 What is memory spread ?
+---------------------------
+There are two boolean flag files per cpuset that control where the
+kernel allocates pages for the file system buffers and related
+in-kernel data structures.  They are called 'cpuset.memory_spread_page' and
+'cpuset.memory_spread_slab'.
+
+If the per-cpuset boolean flag file 'cpuset.memory_spread_page' is set, then
+the kernel will spread the file system buffers (page cache) evenly
+over all the nodes that the faulting task is allowed to use, instead
+of preferring to put those pages on the node where the task is running.
+
+If the per-cpuset boolean flag file 'cpuset.memory_spread_slab' is set,
+then the kernel will spread some file system related slab caches,
+such as for inodes and dentries, evenly over all the nodes that the
+faulting task is allowed to use, instead of preferring to put those
+pages on the node where the task is running.
+
+The setting of these flags does not affect anonymous data segment or
+stack segment pages of a task.
+
+By default, both kinds of memory spreading are off, and memory
+pages are allocated on the node local to where the task is running,
+except perhaps as modified by the task's NUMA mempolicy or cpuset
+configuration, so long as sufficient free memory pages are available.
+
+When new cpusets are created, they inherit the memory spread settings
+of their parent.
+
+Setting memory spreading causes allocations for the affected page
+or slab caches to ignore the task's NUMA mempolicy and be spread
+instead.    Tasks using mbind() or set_mempolicy() calls to set NUMA
+mempolicies will not notice any change in these calls as a result of
+their containing task's memory spread settings.  If memory spreading
+is turned off, then the currently specified NUMA mempolicy once again
+applies to memory page allocations.
+
+Both 'cpuset.memory_spread_page' and 'cpuset.memory_spread_slab' are boolean flag
+files.  By default they contain "0", meaning that the feature is off
+for that cpuset.  If a "1" is written to that file, then that turns
+the named feature on.
+
+The implementation is simple.
+
+Setting the flag 'cpuset.memory_spread_page' turns on a per-process flag
+PFA_SPREAD_PAGE for each task that is in that cpuset or subsequently
+joins that cpuset.  The page allocation calls for the page cache
+are modified to perform an inline check for this PFA_SPREAD_PAGE task
+flag, and if set, a call to a new routine cpuset_mem_spread_node()
+returns the node to prefer for the allocation.
+
+Similarly, setting 'cpuset.memory_spread_slab' turns on the flag
+PFA_SPREAD_SLAB, and appropriately marked slab caches will allocate
+pages from the node returned by cpuset_mem_spread_node().
+
+The cpuset_mem_spread_node() routine is also simple.  It uses the
+value of a per-task rotor cpuset_mem_spread_rotor to select the next
+node in the current task's mems_allowed to prefer for the allocation.
+
+This memory placement policy is also known (in other contexts) as
+round-robin or interleave.
+
+This policy can provide substantial improvements for jobs that need
+to place thread local data on the corresponding node, but that need
+to access large file system data sets that need to be spread across
+the several nodes in the job's cpuset in order to fit.  Without this
+policy, especially for jobs that might have one thread reading in the
+data set, the memory allocation across the nodes in the job's cpuset
+can become very uneven.
+
+1.7 What is sched_load_balance ?
+--------------------------------
+
+The kernel scheduler (kernel/sched/core.c) automatically load balances
+tasks.  If one CPU is underutilized, kernel code running on that
+CPU will look for tasks on other more overloaded CPUs and move those
+tasks to itself, within the constraints of such placement mechanisms
+as cpusets and sched_setaffinity.
+
+The algorithmic cost of load balancing and its impact on key shared
+kernel data structures such as the task list increases more than
+linearly with the number of CPUs being balanced.  So the scheduler
+has support to partition the system's CPUs into a number of sched
+domains such that it only load balances within each sched domain.
+Each sched domain covers some subset of the CPUs in the system;
+no two sched domains overlap; some CPUs might not be in any sched
+domain and hence won't be load balanced.
+
+Put simply, it costs less to balance between two smaller sched domains
+than one big one, but doing so means that overloads in one of the
+two domains won't be load balanced to the other one.
+
+By default, there is one sched domain covering all CPUs, including those
+marked isolated using the kernel boot time "isolcpus=" argument. However,
+the isolated CPUs will not participate in load balancing, and will not
+have tasks running on them unless explicitly assigned.
+
+This default load balancing across all CPUs is not well suited for
+the following two situations:
+
+ 1) On large systems, load balancing across many CPUs is expensive.
+    If the system is managed using cpusets to place independent jobs
+    on separate sets of CPUs, full load balancing is unnecessary.
+ 2) Systems supporting realtime on some CPUs need to minimize
+    system overhead on those CPUs, including avoiding task load
+    balancing if that is not needed.
+
+When the per-cpuset flag "cpuset.sched_load_balance" is enabled (the default
+setting), it requests that all the CPUs in that cpuset's allowed 'cpuset.cpus'
+be contained in a single sched domain, ensuring that load balancing
+can move a task (not otherwise pinned, as by sched_setaffinity)
+from any CPU in that cpuset to any other.
+
+When the per-cpuset flag "cpuset.sched_load_balance" is disabled, then the
+scheduler will avoid load balancing across the CPUs in that cpuset,
+--except-- in so far as is necessary because some overlapping cpuset
+has "sched_load_balance" enabled.
+
+So, for example, if the top cpuset has the flag "cpuset.sched_load_balance"
+enabled, then the scheduler will have one sched domain covering all
+CPUs, and the setting of the "cpuset.sched_load_balance" flag in any other
+cpusets won't matter, as we're already fully load balancing.
+
+Therefore in the above two situations, the top cpuset flag
+"cpuset.sched_load_balance" should be disabled, and only some of the smaller,
+child cpusets have this flag enabled.
+
+When doing this, you don't usually want to leave any unpinned tasks in
+the top cpuset that might use non-trivial amounts of CPU, as such tasks
+may be artificially constrained to some subset of CPUs, depending on
+the particulars of this flag setting in descendant cpusets.  Even if
+such a task could use spare CPU cycles in some other CPUs, the kernel
+scheduler might not consider the possibility of load balancing that
+task to that underused CPU.
+
+Of course, tasks pinned to a particular CPU can be left in a cpuset
+that disables "cpuset.sched_load_balance" as those tasks aren't going anywhere
+else anyway.
+
+There is an impedance mismatch here, between cpusets and sched domains.
+Cpusets are hierarchical and nest.  Sched domains are flat; they don't
+overlap and each CPU is in at most one sched domain.
+
+It is necessary for sched domains to be flat because load balancing
+across partially overlapping sets of CPUs would risk unstable dynamics
+that would be beyond our understanding.  So if each of two partially
+overlapping cpusets enables the flag 'cpuset.sched_load_balance', then we
+form a single sched domain that is a superset of both.  We won't move
+a task to a CPU outside its cpuset, but the scheduler load balancing
+code might waste some compute cycles considering that possibility.
+
+This mismatch is why there is not a simple one-to-one relation
+between which cpusets have the flag "cpuset.sched_load_balance" enabled,
+and the sched domain configuration.  If a cpuset enables the flag, it
+will get balancing across all its CPUs, but if it disables the flag,
+it will only be assured of no load balancing if no other overlapping
+cpuset enables the flag.
+
+If two cpusets have partially overlapping 'cpuset.cpus' allowed, and only
+one of them has this flag enabled, then the other may find its
+tasks only partially load balanced, just on the overlapping CPUs.
+This is just the general case of the top_cpuset example given a few
+paragraphs above.  In the general case, as in the top cpuset case,
+don't leave tasks that might use non-trivial amounts of CPU in
+such partially load balanced cpusets, as they may be artificially
+constrained to some subset of the CPUs allowed to them, for lack of
+load balancing to the other CPUs.
+
+CPUs in "cpuset.isolcpus" were excluded from load balancing by the
+isolcpus= kernel boot option, and will never be load balanced regardless
+of the value of "cpuset.sched_load_balance" in any cpuset.
+
+1.7.1 sched_load_balance implementation details.
+------------------------------------------------
+
+The per-cpuset flag 'cpuset.sched_load_balance' defaults to enabled (contrary
+to most cpuset flags.)  When enabled for a cpuset, the kernel will
+ensure that it can load balance across all the CPUs in that cpuset
+(makes sure that all the CPUs in the cpus_allowed of that cpuset are
+in the same sched domain.)
+
+If two overlapping cpusets both have 'cpuset.sched_load_balance' enabled,
+then they will be (must be) both in the same sched domain.
+
+If, as is the default, the top cpuset has 'cpuset.sched_load_balance' enabled,
+then by the above that means there is a single sched domain covering
+the whole system, regardless of any other cpuset settings.
+
+The kernel commits to user space that it will avoid load balancing
+where it can.  It will pick as fine a granularity partition of sched
+domains as it can while still providing load balancing for any set
+of CPUs allowed to a cpuset having 'cpuset.sched_load_balance' enabled.
+
+The internal kernel cpuset to scheduler interface passes from the
+cpuset code to the scheduler code a partition of the load balanced
+CPUs in the system. This partition is a set of subsets (represented
+as an array of struct cpumask) of CPUs, pairwise disjoint, that cover
+all the CPUs that must be load balanced.
+
+The cpuset code builds a new such partition and passes it to the
+scheduler sched domain setup code, to have the sched domains rebuilt
+as necessary, whenever:
+
+ - the 'cpuset.sched_load_balance' flag of a cpuset with non-empty CPUs changes,
+ - or CPUs come or go from a cpuset with this flag enabled,
+ - or 'cpuset.sched_relax_domain_level' value of a cpuset with non-empty CPUs
+   and with this flag enabled changes,
+ - or a cpuset with non-empty CPUs and with this flag enabled is removed,
+ - or a cpu is offlined/onlined.
+
+This partition exactly defines what sched domains the scheduler should
+setup - one sched domain for each element (struct cpumask) in the
+partition.
+
+The scheduler remembers the currently active sched domain partitions.
+When the scheduler routine partition_sched_domains() is invoked from
+the cpuset code to update these sched domains, it compares the new
+partition requested with the current, and updates its sched domains,
+removing the old and adding the new, for each change.
+
+
+1.8 What is sched_relax_domain_level ?
+--------------------------------------
+
+In a sched domain, the scheduler migrates tasks in 2 ways: periodic load
+balance on tick, and at the time of some schedule events.
+
+When a task is woken up, the scheduler tries to move the task to an idle CPU.
+For example, if a task A running on CPU X activates another task B
+on the same CPU X, and if CPU Y is X's sibling and is idle,
+then the scheduler migrates task B to CPU Y so that task B can start on
+CPU Y without waiting for task A on CPU X.
+
+And if a CPU runs out of tasks in its runqueue, the CPU tries to pull
+extra tasks from other busy CPUs to help them before it goes
+idle.
+
+Of course, finding movable tasks and/or idle CPUs has a searching cost,
+so the scheduler might not search all CPUs in the domain
+every time.  In fact, on some architectures, the searching range on
+events is limited to the same socket or node where the CPU is located,
+while the load balance on tick searches all.
+
+For example, assume CPU Z is relatively far from CPU X.  Even if CPU Z
+is idle while CPU X and its siblings are busy, the scheduler can't migrate
+woken task B from X to Z since it is out of its searching range.
+As a result, task B on CPU X needs to wait for task A or wait for load balance
+on the next tick.  For some applications in special situations, waiting
+1 tick may be too long.
+
+The 'cpuset.sched_relax_domain_level' file allows you to request changing
+this searching range as you like.  This file takes an int value which
+indicates the size of the searching range in levels, ideally as follows;
+the initial value -1 indicates that the cpuset has no request.
+
+====== ===========================================================
+  -1   no request. use system default or follow request of others.
+   0   no search.
+   1   search siblings (hyperthreads in a core).
+   2   search cores in a package.
+   3   search cpus in a node [= system wide on non-NUMA system]
+   4   search nodes in a chunk of node [on NUMA system]
+   5   search system wide [on NUMA system]
+====== ===========================================================
+
+The system default is architecture dependent.  The system default
+can be changed using the relax_domain_level= boot parameter.
+
+This file is per-cpuset and affects the sched domain to which the cpuset
+belongs.  Therefore if the flag 'cpuset.sched_load_balance' of a cpuset
+is disabled, then 'cpuset.sched_relax_domain_level' has no effect since
+there is no sched domain belonging to the cpuset.
+
+If multiple cpusets are overlapping and hence they form a single sched
+domain, the largest value among those is used.  Be careful, if one
+requests 0 and others are -1 then 0 is used.
+
+Note that modifying this file will have both good and bad effects,
+and whether it is acceptable or not depends on your situation.
+Don't modify this file if you are not sure.
+
+If your situation is:
+
+ - The migration costs between each cpu can be assumed to be considerably
+   small (for you) due to your special application's behavior or
+   special hardware support for CPU cache etc.
+ - The searching cost doesn't have an impact (for you) or you can make
+   the searching cost small enough by managing cpusets to be compact etc.
+ - Low latency is required even if it sacrifices cache hit rate etc.,
+   then increasing 'sched_relax_domain_level' would benefit you.
+
+
+1.9 How do I use cpusets ?
+--------------------------
+
+In order to minimize the impact of cpusets on critical kernel
+code, such as the scheduler, and due to the fact that the kernel
+does not support one task updating the memory placement of another
+task directly, the impact on a task of changing its cpuset CPU
+or Memory Node placement, or of changing to which cpuset a task
+is attached, is subtle.
+
+If a cpuset has its Memory Nodes modified, then for each task attached
+to that cpuset, the next time that the kernel attempts to allocate
+a page of memory for that task, the kernel will notice the change
+in the task's cpuset, and update its per-task memory placement to
+remain within the new cpuset's memory placement.  If the task was using
+mempolicy MPOL_BIND, and the nodes to which it was bound overlap with
+its new cpuset, then the task will continue to use whatever subset
+of MPOL_BIND nodes are still allowed in the new cpuset.  If the task
+was using MPOL_BIND and now none of its MPOL_BIND nodes are allowed
+in the new cpuset, then the task will be essentially treated as if it
+was MPOL_BIND bound to the new cpuset (even though its NUMA placement,
+as queried by get_mempolicy(), doesn't change).  If a task is moved
+from one cpuset to another, then the kernel will adjust the task's
+memory placement, as above, the next time that the kernel attempts
+to allocate a page of memory for that task.
+
+If a cpuset has its 'cpuset.cpus' modified, then each task in that cpuset
+will have its allowed CPU placement changed immediately.  Similarly,
+if a task's pid is written to another cpuset's 'tasks' file, then its
+allowed CPU placement is changed immediately.  If such a task had been
+bound to some subset of its cpuset using the sched_setaffinity() call,
+the task will be allowed to run on any CPU allowed in its new cpuset,
+negating the effect of the prior sched_setaffinity() call.
+
+In summary, the memory placement of a task whose cpuset is changed is
+updated by the kernel, on the next allocation of a page for that task,
+and the processor placement is updated immediately.
+
+Normally, once a page is allocated (given a physical page
+of main memory) then that page stays on whatever node it
+was allocated on, so long as it remains allocated, even if the
+cpuset's memory placement policy 'cpuset.mems' subsequently changes.
+If the cpuset flag file 'cpuset.memory_migrate' is set true, then when
+tasks are attached to that cpuset, any pages that task had
+allocated to it on nodes in its previous cpuset are migrated
+to the task's new cpuset. The relative placement of the page within
+the cpuset is preserved during these migration operations if possible.
+For example if the page was on the second valid node of the prior cpuset
+then the page will be placed on the second valid node of the new cpuset.
+
+Also if 'cpuset.memory_migrate' is set true, then if that cpuset's
+'cpuset.mems' file is modified, pages allocated to tasks in that
+cpuset, that were on nodes in the previous setting of 'cpuset.mems',
+will be moved to nodes in the new setting of 'cpuset.mems'.
+Pages that were not in the task's prior cpuset, or in the cpuset's
+prior 'cpuset.mems' setting, will not be moved.
+
+There is an exception to the above.  If hotplug functionality is used
+to remove all the CPUs that are currently assigned to a cpuset,
+then all the tasks in that cpuset will be moved to the nearest ancestor
+with non-empty cpus.  But the moving of some (or all) tasks might fail if
+cpuset is bound with another cgroup subsystem which has some restrictions
+on task attaching.  In this failing case, those tasks will stay
+in the original cpuset, and the kernel will automatically update
+their cpus_allowed to allow all online CPUs.  When memory hotplug
+functionality for removing Memory Nodes is available, a similar exception
+is expected to apply there as well.  In general, the kernel prefers to
+violate cpuset placement, over starving a task that has had all
+its allowed CPUs or Memory Nodes taken offline.
+
+There is a second exception to the above.  GFP_ATOMIC requests are
+kernel internal allocations that must be satisfied immediately.
+The kernel may drop some requests, in rare cases even panic, if a
+GFP_ATOMIC alloc fails.  If the request cannot be satisfied within
+the current task's cpuset, then we relax the cpuset, and look for
+memory anywhere we can find it.  It's better to violate the cpuset
+than stress the kernel.
+
+To start a new job that is to be contained within a cpuset, the steps are:
+
+ 1) mkdir /sys/fs/cgroup/cpuset
+ 2) mount -t cgroup -ocpuset cpuset /sys/fs/cgroup/cpuset
+ 3) Create the new cpuset by doing mkdir's and write's (or echo's) in
+    the /sys/fs/cgroup/cpuset virtual file system.
+ 4) Start a task that will be the "founding father" of the new job.
+ 5) Attach that task to the new cpuset by writing its pid to the
+    /sys/fs/cgroup/cpuset tasks file for that cpuset.
+ 6) fork, exec or clone the job tasks from this founding father task.
+
+For example, the following sequence of commands will setup a cpuset
+named "Charlie", containing just CPUs 2 and 3, and Memory Node 1,
+and then start a subshell 'sh' in that cpuset::
+
+  mount -t cgroup -ocpuset cpuset /sys/fs/cgroup/cpuset
+  cd /sys/fs/cgroup/cpuset
+  mkdir Charlie
+  cd Charlie
+  /bin/echo 2-3 > cpuset.cpus
+  /bin/echo 1 > cpuset.mems
+  /bin/echo $$ > tasks
+  sh
+  # The subshell 'sh' is now running in cpuset Charlie
+  # The next line should display '/Charlie'
+  cat /proc/self/cpuset
+
+There are ways to query or modify cpusets:
+
+ - via the cpuset file system directly, using the various cd, mkdir, echo,
+   cat, rmdir commands from the shell, or their equivalent from C.
+ - via the C library libcpuset.
+ - via the C library libcgroup.
+   (http://sourceforge.net/projects/libcg/)
+ - via the python application cset.
+   (http://code.google.com/p/cpuset/)
+
+The sched_setaffinity calls can also be done at the shell prompt using
+SGI's runon or Robert Love's taskset.  The mbind and set_mempolicy
+calls can be done at the shell prompt using the numactl command
+(part of Andi Kleen's numa package).
+
+2. Usage Examples and Syntax
+============================
+
+2.1 Basic Usage
+---------------
+
+Creating, modifying, using the cpusets can be done through the cpuset
+virtual filesystem.
+
+To mount it, type:
+# mount -t cgroup -o cpuset cpuset /sys/fs/cgroup/cpuset
+
+Then under /sys/fs/cgroup/cpuset you can find a tree that corresponds to the
+tree of the cpusets in the system. For instance, /sys/fs/cgroup/cpuset
+is the cpuset that holds the whole system.
+
+If you want to create a new cpuset under /sys/fs/cgroup/cpuset::
+
+  # cd /sys/fs/cgroup/cpuset
+  # mkdir my_cpuset
+
+Now you want to do something with this cpuset::
+
+  # cd my_cpuset
+
+In this directory you can find several files::
+
+  # ls
+  cgroup.clone_children  cpuset.memory_pressure
+  cgroup.event_control   cpuset.memory_spread_page
+  cgroup.procs           cpuset.memory_spread_slab
+  cpuset.cpu_exclusive   cpuset.mems
+  cpuset.cpus            cpuset.sched_load_balance
+  cpuset.mem_exclusive   cpuset.sched_relax_domain_level
+  cpuset.mem_hardwall    notify_on_release
+  cpuset.memory_migrate  tasks
+
+Reading them will give you information about the state of this cpuset:
+the CPUs and Memory Nodes it can use, the processes that are using
+it, its properties.  By writing to these files you can manipulate
+the cpuset.
+
+Set some flags::
+
+  # /bin/echo 1 > cpuset.cpu_exclusive
+
+Add some cpus::
+
+  # /bin/echo 0-7 > cpuset.cpus
+
+Add some mems::
+
+  # /bin/echo 0-7 > cpuset.mems
+
+Now attach your shell to this cpuset::
+
+  # /bin/echo $$ > tasks
+
+You can also create cpusets inside your cpuset by using mkdir in this
+directory::
+
+  # mkdir my_sub_cs
+
+To remove a cpuset, just use rmdir::
+
+  # rmdir my_sub_cs
+
+This will fail if the cpuset is in use (has cpusets inside, or has
+processes attached).
+
+Note that for legacy reasons, the "cpuset" filesystem exists as a
+wrapper around the cgroup filesystem.
+
+The command::
+
+  mount -t cpuset X /sys/fs/cgroup/cpuset
+
+is equivalent to::
+
+  mount -t cgroup -ocpuset,noprefix X /sys/fs/cgroup/cpuset
+  echo "/sbin/cpuset_release_agent" > /sys/fs/cgroup/cpuset/release_agent
+
+2.2 Adding/removing cpus
+------------------------
+
+This is the syntax to use when writing in the cpus or mems files
+in cpuset directories::
+
+  # /bin/echo 1-4 > cpuset.cpus		-> set cpus list to cpus 1,2,3,4
+  # /bin/echo 1,2,3,4 > cpuset.cpus	-> set cpus list to cpus 1,2,3,4
+
+To add a CPU to a cpuset, write the new list of CPUs including the
+CPU to be added. To add 6 to the above cpuset::
+
+  # /bin/echo 1-4,6 > cpuset.cpus	-> set cpus list to cpus 1,2,3,4,6
+
+Similarly to remove a CPU from a cpuset, write the new list of CPUs
+without the CPU to be removed.
+
+To remove all the CPUs::
+
+  # /bin/echo "" > cpuset.cpus		-> clear cpus list
+
+2.3 Setting flags
+-----------------
+
+The syntax is very simple::
+
+  # /bin/echo 1 > cpuset.cpu_exclusive 	-> set flag 'cpuset.cpu_exclusive'
+  # /bin/echo 0 > cpuset.cpu_exclusive 	-> unset flag 'cpuset.cpu_exclusive'
+
+2.4 Attaching processes
+-----------------------
+
+::
+
+  # /bin/echo PID > tasks
+
+Note that it is PID, not PIDs. You can only attach ONE task at a time.
+If you have several tasks to attach, you have to do it one after another::
+
+  # /bin/echo PID1 > tasks
+  # /bin/echo PID2 > tasks
+	...
+  # /bin/echo PIDn > tasks
+
+
+3. Questions
+============
+
+Q:
+   what's up with this '/bin/echo' ?
+
+A:
+   bash's builtin 'echo' command does not check calls to write() against
+   errors. If you use it in the cpuset file system, you won't be
+   able to tell whether a command succeeded or failed.
+
+Q:
+   When I attach processes, only the first pid on the line really gets attached!
+
+A:
+   We can only return one error code per call to write(). So you should also
+   put only ONE pid.
+
+4. Contact
+==========
+
+Web: http://www.bullopensource.org/cpuset
diff --git a/Documentation/cgroup-v1/cpusets.txt b/Documentation/cgroup-v1/cpusets.txt
deleted file mode 100644
index 8402dd6de8df..000000000000
--- a/Documentation/cgroup-v1/cpusets.txt
+++ /dev/null
@@ -1,839 +0,0 @@
-				CPUSETS
-				-------
-
-Copyright (C) 2004 BULL SA.
-Written by Simon.Derr@bull.net
-
-Portions Copyright (c) 2004-2006 Silicon Graphics, Inc.
-Modified by Paul Jackson <pj@sgi.com>
-Modified by Christoph Lameter <cl@linux.com>
-Modified by Paul Menage <menage@google.com>
-Modified by Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
-
-CONTENTS:
-=========
-
-1. Cpusets
-  1.1 What are cpusets ?
-  1.2 Why are cpusets needed ?
-  1.3 How are cpusets implemented ?
-  1.4 What are exclusive cpusets ?
-  1.5 What is memory_pressure ?
-  1.6 What is memory spread ?
-  1.7 What is sched_load_balance ?
-  1.8 What is sched_relax_domain_level ?
-  1.9 How do I use cpusets ?
-2. Usage Examples and Syntax
-  2.1 Basic Usage
-  2.2 Adding/removing cpus
-  2.3 Setting flags
-  2.4 Attaching processes
-3. Questions
-4. Contact
-
-1. Cpusets
-==========
-
-1.1 What are cpusets ?
-----------------------
-
-Cpusets provide a mechanism for assigning a set of CPUs and Memory
-Nodes to a set of tasks.   In this document "Memory Node" refers to
-an on-line node that contains memory.
-
-Cpusets constrain the CPU and Memory placement of tasks to only
-the resources within a task's current cpuset.  They form a nested
-hierarchy visible in a virtual file system.  These are the essential
-hooks, beyond what is already present, required to manage dynamic
-job placement on large systems.
-
-Cpusets use the generic cgroup subsystem described in
-Documentation/cgroup-v1/cgroups.txt.
-
-Requests by a task, using the sched_setaffinity(2) system call to
-include CPUs in its CPU affinity mask, and using the mbind(2) and
-set_mempolicy(2) system calls to include Memory Nodes in its memory
-policy, are both filtered through that task's cpuset, filtering out any
-CPUs or Memory Nodes not in that cpuset.  The scheduler will not
-schedule a task on a CPU that is not allowed in its cpus_allowed
-vector, and the kernel page allocator will not allocate a page on a
-node that is not allowed in the requesting task's mems_allowed vector.
-
-User level code may create and destroy cpusets by name in the cgroup
-virtual file system, manage the attributes and permissions of these
-cpusets and which CPUs and Memory Nodes are assigned to each cpuset,
-specify and query to which cpuset a task is assigned, and list the
-task pids assigned to a cpuset.
-
-
-1.2 Why are cpusets needed ?
-----------------------------
-
-The management of large computer systems, with many processors (CPUs),
-complex memory cache hierarchies and multiple Memory Nodes having
-non-uniform access times (NUMA) presents additional challenges for
-the efficient scheduling and memory placement of processes.
-
-Frequently more modest sized systems can be operated with adequate
-efficiency just by letting the operating system automatically share
-the available CPU and Memory resources amongst the requesting tasks.
-
-But larger systems, which benefit more from careful processor and
-memory placement to reduce memory access times and contention,
-and which typically represent a larger investment for the customer,
-can benefit from explicitly placing jobs on properly sized subsets of
-the system.
-
-This can be especially valuable on:
-
-    * Web Servers running multiple instances of the same web application,
-    * Servers running different applications (for instance, a web server
-      and a database), or
-    * NUMA systems running large HPC applications with demanding
-      performance characteristics.
-
-These subsets, or "soft partitions" must be able to be dynamically
-adjusted, as the job mix changes, without impacting other concurrently
-executing jobs. The location of the running jobs pages may also be moved
-when the memory locations are changed.
-
-The kernel cpuset patch provides the minimum essential kernel
-mechanisms required to efficiently implement such subsets.  It
-leverages existing CPU and Memory Placement facilities in the Linux
-kernel to avoid any additional impact on the critical scheduler or
-memory allocator code.
-
-
-1.3 How are cpusets implemented ?
----------------------------------
-
-Cpusets provide a Linux kernel mechanism to constrain which CPUs and
-Memory Nodes are used by a process or set of processes.
-
-The Linux kernel already has a pair of mechanisms to specify on which
-CPUs a task may be scheduled (sched_setaffinity) and on which Memory
-Nodes it may obtain memory (mbind, set_mempolicy).
-
-Cpusets extends these two mechanisms as follows:
-
- - Cpusets are sets of allowed CPUs and Memory Nodes, known to the
-   kernel.
- - Each task in the system is attached to a cpuset, via a pointer
-   in the task structure to a reference counted cgroup structure.
- - Calls to sched_setaffinity are filtered to just those CPUs
-   allowed in that task's cpuset.
- - Calls to mbind and set_mempolicy are filtered to just
-   those Memory Nodes allowed in that task's cpuset.
- - The root cpuset contains all the systems CPUs and Memory
-   Nodes.
- - For any cpuset, one can define child cpusets containing a subset
-   of the parents CPU and Memory Node resources.
- - The hierarchy of cpusets can be mounted at /dev/cpuset, for
-   browsing and manipulation from user space.
- - A cpuset may be marked exclusive, which ensures that no other
-   cpuset (except direct ancestors and descendants) may contain
-   any overlapping CPUs or Memory Nodes.
- - You can list all the tasks (by pid) attached to any cpuset.
-
-The implementation of cpusets requires a few, simple hooks
-into the rest of the kernel, none in performance critical paths:
-
- - in init/main.c, to initialize the root cpuset at system boot.
- - in fork and exit, to attach and detach a task from its cpuset.
- - in sched_setaffinity, to mask the requested CPUs by what's
-   allowed in that task's cpuset.
- - in sched.c migrate_live_tasks(), to keep migrating tasks within
-   the CPUs allowed by their cpuset, if possible.
- - in the mbind and set_mempolicy system calls, to mask the requested
-   Memory Nodes by what's allowed in that task's cpuset.
- - in page_alloc.c, to restrict memory to allowed nodes.
- - in vmscan.c, to restrict page recovery to the current cpuset.
-
-You should mount the "cgroup" filesystem type in order to enable
-browsing and modifying the cpusets presently known to the kernel.  No
-new system calls are added for cpusets - all support for querying and
-modifying cpusets is via this cpuset file system.
-
-The /proc/<pid>/status file for each task has four added lines,
-displaying the task's cpus_allowed (on which CPUs it may be scheduled)
-and mems_allowed (on which Memory Nodes it may obtain memory),
-in the two formats seen in the following example:
-
-  Cpus_allowed:   ffffffff,ffffffff,ffffffff,ffffffff
-  Cpus_allowed_list:      0-127
-  Mems_allowed:   ffffffff,ffffffff
-  Mems_allowed_list:      0-63
-
-Each cpuset is represented by a directory in the cgroup file system
-containing (on top of the standard cgroup files) the following
-files describing that cpuset:
-
- - cpuset.cpus: list of CPUs in that cpuset
- - cpuset.mems: list of Memory Nodes in that cpuset
- - cpuset.memory_migrate flag: if set, move pages to cpusets nodes
- - cpuset.cpu_exclusive flag: is cpu placement exclusive?
- - cpuset.mem_exclusive flag: is memory placement exclusive?
- - cpuset.mem_hardwall flag:  is memory allocation hardwalled
- - cpuset.memory_pressure: measure of how much paging pressure in cpuset
- - cpuset.memory_spread_page flag: if set, spread page cache evenly on allowed nodes
- - cpuset.memory_spread_slab flag: if set, spread slab cache evenly on allowed nodes
- - cpuset.sched_load_balance flag: if set, load balance within CPUs on that cpuset
- - cpuset.sched_relax_domain_level: the searching range when migrating tasks
-
-In addition, only the root cpuset has the following file:
- - cpuset.memory_pressure_enabled flag: compute memory_pressure?
-
-New cpusets are created using the mkdir system call or shell
-command.  The properties of a cpuset, such as its flags, allowed
-CPUs and Memory Nodes, and attached tasks, are modified by writing
-to the appropriate file in that cpusets directory, as listed above.
-
-The named hierarchical structure of nested cpusets allows partitioning
-a large system into nested, dynamically changeable, "soft-partitions".
-
-The attachment of each task, automatically inherited at fork by any
-children of that task, to a cpuset allows organizing the work load
-on a system into related sets of tasks such that each set is constrained
-to using the CPUs and Memory Nodes of a particular cpuset.  A task
-may be re-attached to any other cpuset, if allowed by the permissions
-on the necessary cpuset file system directories.
-
-Such management of a system "in the large" integrates smoothly with
-the detailed placement done on individual tasks and memory regions
-using the sched_setaffinity, mbind and set_mempolicy system calls.
-
-The following rules apply to each cpuset:
-
- - Its CPUs and Memory Nodes must be a subset of its parents.
- - It can't be marked exclusive unless its parent is.
- - If its cpu or memory is exclusive, they may not overlap any sibling.
-
-These rules, and the natural hierarchy of cpusets, enable efficient
-enforcement of the exclusive guarantee, without having to scan all
-cpusets every time any of them change to ensure nothing overlaps a
-exclusive cpuset.  Also, the use of a Linux virtual file system (vfs)
-to represent the cpuset hierarchy provides for a familiar permission
-and name space for cpusets, with a minimum of additional kernel code.
-
-The cpus and mems files in the root (top_cpuset) cpuset are
-read-only.  The cpus file automatically tracks the value of
-cpu_online_mask using a CPU hotplug notifier, and the mems file
-automatically tracks the value of node_states[N_MEMORY]--i.e.,
-nodes with memory--using the cpuset_track_online_nodes() hook.
-
-
-1.4 What are exclusive cpusets ?
---------------------------------
-
-If a cpuset is cpu or mem exclusive, no other cpuset, other than
-a direct ancestor or descendant, may share any of the same CPUs or
-Memory Nodes.
-
-A cpuset that is cpuset.mem_exclusive *or* cpuset.mem_hardwall is "hardwalled",
-i.e. it restricts kernel allocations for page, buffer and other data
-commonly shared by the kernel across multiple users.  All cpusets,
-whether hardwalled or not, restrict allocations of memory for user
-space.  This enables configuring a system so that several independent
-jobs can share common kernel data, such as file system pages, while
-isolating each job's user allocation in its own cpuset.  To do this,
-construct a large mem_exclusive cpuset to hold all the jobs, and
-construct child, non-mem_exclusive cpusets for each individual job.
-Only a small amount of typical kernel memory, such as requests from
-interrupt handlers, is allowed to be taken outside even a
-mem_exclusive cpuset.
-
-
-1.5 What is memory_pressure ?
------------------------------
-The memory_pressure of a cpuset provides a simple per-cpuset metric
-of the rate that the tasks in a cpuset are attempting to free up in
-use memory on the nodes of the cpuset to satisfy additional memory
-requests.
-
-This enables batch managers monitoring jobs running in dedicated
-cpusets to efficiently detect what level of memory pressure that job
-is causing.
-
-This is useful both on tightly managed systems running a wide mix of
-submitted jobs, which may choose to terminate or re-prioritize jobs that
-are trying to use more memory than allowed on the nodes assigned to them,
-and with tightly coupled, long running, massively parallel scientific
-computing jobs that will dramatically fail to meet required performance
-goals if they start to use more memory than allowed to them.
-
-This mechanism provides a very economical way for the batch manager
-to monitor a cpuset for signs of memory pressure.  It's up to the
-batch manager or other user code to decide what to do about it and
-take action.
-
-==> Unless this feature is enabled by writing "1" to the special file
-    /dev/cpuset/memory_pressure_enabled, the hook in the rebalance
-    code of __alloc_pages() for this metric reduces to simply noticing
-    that the cpuset_memory_pressure_enabled flag is zero.  So only
-    systems that enable this feature will compute the metric.
-
-Why a per-cpuset, running average:
-
-    Because this meter is per-cpuset, rather than per-task or mm,
-    the system load imposed by a batch scheduler monitoring this
-    metric is sharply reduced on large systems, because a scan of
-    the tasklist can be avoided on each set of queries.
-
-    Because this meter is a running average, instead of an accumulating
-    counter, a batch scheduler can detect memory pressure with a
-    single read, instead of having to read and accumulate results
-    for a period of time.
-
-    Because this meter is per-cpuset rather than per-task or mm,
-    the batch scheduler can obtain the key information, memory
-    pressure in a cpuset, with a single read, rather than having to
-    query and accumulate results over all the (dynamically changing)
-    set of tasks in the cpuset.
-
-A per-cpuset simple digital filter (requires a spinlock and 3 words
-of data per-cpuset) is kept, and updated by any task attached to that
-cpuset, if it enters the synchronous (direct) page reclaim code.
-
-A per-cpuset file provides an integer number representing the recent
-(half-life of 10 seconds) rate of direct page reclaims caused by
-the tasks in the cpuset, in units of reclaims attempted per second,
-times 1000.
-
-
-1.6 What is memory spread ?
----------------------------
-There are two boolean flag files per cpuset that control where the
-kernel allocates pages for the file system buffers and related in
-kernel data structures.  They are called 'cpuset.memory_spread_page' and
-'cpuset.memory_spread_slab'.
-
-If the per-cpuset boolean flag file 'cpuset.memory_spread_page' is set, then
-the kernel will spread the file system buffers (page cache) evenly
-over all the nodes that the faulting task is allowed to use, instead
-of preferring to put those pages on the node where the task is running.
-
-If the per-cpuset boolean flag file 'cpuset.memory_spread_slab' is set,
-then the kernel will spread some file system related slab caches,
-such as for inodes and dentries evenly over all the nodes that the
-faulting task is allowed to use, instead of preferring to put those
-pages on the node where the task is running.
-
-The setting of these flags does not affect anonymous data segment or
-stack segment pages of a task.
-
-By default, both kinds of memory spreading are off, and memory
-pages are allocated on the node local to where the task is running,
-except perhaps as modified by the task's NUMA mempolicy or cpuset
-configuration, so long as sufficient free memory pages are available.
-
-When new cpusets are created, they inherit the memory spread settings
-of their parent.
-
-Setting memory spreading causes allocations for the affected page
-or slab caches to ignore the task's NUMA mempolicy and be spread
-instead.    Tasks using mbind() or set_mempolicy() calls to set NUMA
-mempolicies will not notice any change in these calls as a result of
-their containing task's memory spread settings.  If memory spreading
-is turned off, then the currently specified NUMA mempolicy once again
-applies to memory page allocations.
-
-Both 'cpuset.memory_spread_page' and 'cpuset.memory_spread_slab' are boolean flag
-files.  By default they contain "0", meaning that the feature is off
-for that cpuset.  If a "1" is written to that file, then that turns
-the named feature on.
-
-The implementation is simple.
-
-Setting the flag 'cpuset.memory_spread_page' turns on a per-process flag
-PFA_SPREAD_PAGE for each task that is in that cpuset or subsequently
-joins that cpuset.  The page allocation calls for the page cache
-is modified to perform an inline check for this PFA_SPREAD_PAGE task
-flag, and if set, a call to a new routine cpuset_mem_spread_node()
-returns the node to prefer for the allocation.
-
-Similarly, setting 'cpuset.memory_spread_slab' turns on the flag
-PFA_SPREAD_SLAB, and appropriately marked slab caches will allocate
-pages from the node returned by cpuset_mem_spread_node().
-
-The cpuset_mem_spread_node() routine is also simple.  It uses the
-value of a per-task rotor cpuset_mem_spread_rotor to select the next
-node in the current task's mems_allowed to prefer for the allocation.
-
-This memory placement policy is also known (in other contexts) as
-round-robin or interleave.
-
-This policy can provide substantial improvements for jobs that need
-to place thread local data on the corresponding node, but that need
-to access large file system data sets that need to be spread across
-the several nodes in the jobs cpuset in order to fit.  Without this
-policy, especially for jobs that might have one thread reading in the
-data set, the memory allocation across the nodes in the jobs cpuset
-can become very uneven.
-
-1.7 What is sched_load_balance ?
---------------------------------
-
-The kernel scheduler (kernel/sched/core.c) automatically load balances
-tasks.  If one CPU is underutilized, kernel code running on that
-CPU will look for tasks on other more overloaded CPUs and move those
-tasks to itself, within the constraints of such placement mechanisms
-as cpusets and sched_setaffinity.
-
-The algorithmic cost of load balancing and its impact on key shared
-kernel data structures such as the task list increases more than
-linearly with the number of CPUs being balanced.  So the scheduler
-has support to partition the systems CPUs into a number of sched
-domains such that it only load balances within each sched domain.
-Each sched domain covers some subset of the CPUs in the system;
-no two sched domains overlap; some CPUs might not be in any sched
-domain and hence won't be load balanced.
-
-Put simply, it costs less to balance between two smaller sched domains
-than one big one, but doing so means that overloads in one of the
-two domains won't be load balanced to the other one.
-
-By default, there is one sched domain covering all CPUs, including those
-marked isolated using the kernel boot time "isolcpus=" argument. However,
-the isolated CPUs will not participate in load balancing, and will not
-have tasks running on them unless explicitly assigned.
-
-This default load balancing across all CPUs is not well suited for
-the following two situations:
- 1) On large systems, load balancing across many CPUs is expensive.
-    If the system is managed using cpusets to place independent jobs
-    on separate sets of CPUs, full load balancing is unnecessary.
- 2) Systems supporting realtime on some CPUs need to minimize
-    system overhead on those CPUs, including avoiding task load
-    balancing if that is not needed.
-
-When the per-cpuset flag "cpuset.sched_load_balance" is enabled (the default
-setting), it requests that all the CPUs in that cpusets allowed 'cpuset.cpus'
-be contained in a single sched domain, ensuring that load balancing
-can move a task (not otherwised pinned, as by sched_setaffinity)
-from any CPU in that cpuset to any other.
-
-When the per-cpuset flag "cpuset.sched_load_balance" is disabled, then the
-scheduler will avoid load balancing across the CPUs in that cpuset,
---except-- in so far as is necessary because some overlapping cpuset
-has "sched_load_balance" enabled.
-
-So, for example, if the top cpuset has the flag "cpuset.sched_load_balance"
-enabled, then the scheduler will have one sched domain covering all
-CPUs, and the setting of the "cpuset.sched_load_balance" flag in any other
-cpusets won't matter, as we're already fully load balancing.
-
-Therefore in the above two situations, the top cpuset flag
-"cpuset.sched_load_balance" should be disabled, and only some of the smaller,
-child cpusets have this flag enabled.
-
-When doing this, you don't usually want to leave any unpinned tasks in
-the top cpuset that might use non-trivial amounts of CPU, as such tasks
-may be artificially constrained to some subset of CPUs, depending on
-the particulars of this flag setting in descendant cpusets.  Even if
-such a task could use spare CPU cycles in some other CPUs, the kernel
-scheduler might not consider the possibility of load balancing that
-task to that underused CPU.
-
-Of course, tasks pinned to a particular CPU can be left in a cpuset
-that disables "cpuset.sched_load_balance" as those tasks aren't going anywhere
-else anyway.
-
-There is an impedance mismatch here, between cpusets and sched domains.
-Cpusets are hierarchical and nest.  Sched domains are flat; they don't
-overlap and each CPU is in at most one sched domain.
-
-It is necessary for sched domains to be flat because load balancing
-across partially overlapping sets of CPUs would risk unstable dynamics
-that would be beyond our understanding.  So if each of two partially
-overlapping cpusets enables the flag 'cpuset.sched_load_balance', then we
-form a single sched domain that is a superset of both.  We won't move
-a task to a CPU outside its cpuset, but the scheduler load balancing
-code might waste some compute cycles considering that possibility.
-
-This mismatch is why there is not a simple one-to-one relation
-between which cpusets have the flag "cpuset.sched_load_balance" enabled,
-and the sched domain configuration.  If a cpuset enables the flag, it
-will get balancing across all its CPUs, but if it disables the flag,
-it will only be assured of no load balancing if no other overlapping
-cpuset enables the flag.
-
-If two cpusets have partially overlapping 'cpuset.cpus' allowed, and only
-one of them has this flag enabled, then the other may find its
-tasks only partially load balanced, just on the overlapping CPUs.
-This is just the general case of the top_cpuset example given a few
-paragraphs above.  In the general case, as in the top cpuset case,
-don't leave tasks that might use non-trivial amounts of CPU in
-such partially load balanced cpusets, as they may be artificially
-constrained to some subset of the CPUs allowed to them, for lack of
-load balancing to the other CPUs.
-
-CPUs in "cpuset.isolcpus" were excluded from load balancing by the
-isolcpus= kernel boot option, and will never be load balanced regardless
-of the value of "cpuset.sched_load_balance" in any cpuset.
-
-1.7.1 sched_load_balance implementation details.
-------------------------------------------------
-
-The per-cpuset flag 'cpuset.sched_load_balance' defaults to enabled (contrary
-to most cpuset flags.)  When enabled for a cpuset, the kernel will
-ensure that it can load balance across all the CPUs in that cpuset
-(makes sure that all the CPUs in the cpus_allowed of that cpuset are
-in the same sched domain.)
-
-If two overlapping cpusets both have 'cpuset.sched_load_balance' enabled,
-then they will be (must be) both in the same sched domain.
-
-If, as is the default, the top cpuset has 'cpuset.sched_load_balance' enabled,
-then by the above that means there is a single sched domain covering
-the whole system, regardless of any other cpuset settings.
-
-The kernel commits to user space that it will avoid load balancing
-where it can.  It will pick as fine a granularity partition of sched
-domains as it can while still providing load balancing for any set
-of CPUs allowed to a cpuset having 'cpuset.sched_load_balance' enabled.
-
-The internal kernel cpuset to scheduler interface passes from the
-cpuset code to the scheduler code a partition of the load balanced
-CPUs in the system. This partition is a set of subsets (represented
-as an array of struct cpumask) of CPUs, pairwise disjoint, that cover
-all the CPUs that must be load balanced.
-
-The cpuset code builds a new such partition and passes it to the
-scheduler sched domain setup code, to have the sched domains rebuilt
-as necessary, whenever:
- - the 'cpuset.sched_load_balance' flag of a cpuset with non-empty CPUs changes,
- - or CPUs come or go from a cpuset with this flag enabled,
- - or 'cpuset.sched_relax_domain_level' value of a cpuset with non-empty CPUs
-   and with this flag enabled changes,
- - or a cpuset with non-empty CPUs and with this flag enabled is removed,
- - or a cpu is offlined/onlined.
-
-This partition exactly defines what sched domains the scheduler should
-setup - one sched domain for each element (struct cpumask) in the
-partition.
-
-The scheduler remembers the currently active sched domain partitions.
-When the scheduler routine partition_sched_domains() is invoked from
-the cpuset code to update these sched domains, it compares the new
-partition requested with the current, and updates its sched domains,
-removing the old and adding the new, for each change.
-
-
-1.8 What is sched_relax_domain_level ?
---------------------------------------
-
-In sched domain, the scheduler migrates tasks in 2 ways; periodic load
-balance on tick, and at time of some schedule events.
-
-When a task is woken up, scheduler try to move the task on idle CPU.
-For example, if a task A running on CPU X activates another task B
-on the same CPU X, and if CPU Y is X's sibling and performing idle,
-then scheduler migrate task B to CPU Y so that task B can start on
-CPU Y without waiting task A on CPU X.
-
-And if a CPU run out of tasks in its runqueue, the CPU try to pull
-extra tasks from other busy CPUs to help them before it is going to
-be idle.
-
-Of course it takes some searching cost to find movable tasks and/or
-idle CPUs, the scheduler might not search all CPUs in the domain
-every time.  In fact, in some architectures, the searching ranges on
-events are limited in the same socket or node where the CPU locates,
-while the load balance on tick searches all.
-
-For example, assume CPU Z is relatively far from CPU X.  Even if CPU Z
-is idle while CPU X and the siblings are busy, scheduler can't migrate
-woken task B from X to Z since it is out of its searching range.
-As the result, task B on CPU X need to wait task A or wait load balance
-on the next tick.  For some applications in special situation, waiting
-1 tick may be too long.
-
-The 'cpuset.sched_relax_domain_level' file allows you to request changing
-this searching range as you like.  This file takes int value which
-indicates size of searching range in levels ideally as follows,
-otherwise initial value -1 that indicates the cpuset has no request.
-
-  -1  : no request. use system default or follow request of others.
-   0  : no search.
-   1  : search siblings (hyperthreads in a core).
-   2  : search cores in a package.
-   3  : search cpus in a node [= system wide on non-NUMA system]
-   4  : search nodes in a chunk of node [on NUMA system]
-   5  : search system wide [on NUMA system]
-
-The system default is architecture dependent.  The system default
-can be changed using the relax_domain_level= boot parameter.
-
-This file is per-cpuset and affect the sched domain where the cpuset
-belongs to.  Therefore if the flag 'cpuset.sched_load_balance' of a cpuset
-is disabled, then 'cpuset.sched_relax_domain_level' have no effect since
-there is no sched domain belonging the cpuset.
-
-If multiple cpusets are overlapping and hence they form a single sched
-domain, the largest value among those is used.  Be careful, if one
-requests 0 and others are -1 then 0 is used.
-
-Note that modifying this file will have both good and bad effects,
-and whether it is acceptable or not depends on your situation.
-Don't modify this file if you are not sure.
-
-If your situation is:
- - The migration costs between each cpu can be assumed considerably
-   small(for you) due to your special application's behavior or
-   special hardware support for CPU cache etc.
- - The searching cost doesn't have impact(for you) or you can make
-   the searching cost enough small by managing cpuset to compact etc.
- - The latency is required even it sacrifices cache hit rate etc.
-then increasing 'sched_relax_domain_level' would benefit you.
-
-
-1.9 How do I use cpusets ?
---------------------------
-
-In order to minimize the impact of cpusets on critical kernel
-code, such as the scheduler, and due to the fact that the kernel
-does not support one task updating the memory placement of another
-task directly, the impact on a task of changing its cpuset CPU
-or Memory Node placement, or of changing to which cpuset a task
-is attached, is subtle.
-
-If a cpuset has its Memory Nodes modified, then for each task attached
-to that cpuset, the next time that the kernel attempts to allocate
-a page of memory for that task, the kernel will notice the change
-in the task's cpuset, and update its per-task memory placement to
-remain within the new cpusets memory placement.  If the task was using
-mempolicy MPOL_BIND, and the nodes to which it was bound overlap with
-its new cpuset, then the task will continue to use whatever subset
-of MPOL_BIND nodes are still allowed in the new cpuset.  If the task
-was using MPOL_BIND and now none of its MPOL_BIND nodes are allowed
-in the new cpuset, then the task will be essentially treated as if it
-was MPOL_BIND bound to the new cpuset (even though its NUMA placement,
-as queried by get_mempolicy(), doesn't change).  If a task is moved
-from one cpuset to another, then the kernel will adjust the task's
-memory placement, as above, the next time that the kernel attempts
-to allocate a page of memory for that task.
-
-If a cpuset has its 'cpuset.cpus' modified, then each task in that cpuset
-will have its allowed CPU placement changed immediately.  Similarly,
-if a task's pid is written to another cpuset's 'tasks' file, then its
-allowed CPU placement is changed immediately.  If such a task had been
-bound to some subset of its cpuset using the sched_setaffinity() call,
-the task will be allowed to run on any CPU allowed in its new cpuset,
-negating the effect of the prior sched_setaffinity() call.
-
-In summary, the memory placement of a task whose cpuset is changed is
-updated by the kernel, on the next allocation of a page for that task,
-and the processor placement is updated immediately.
-
-Normally, once a page is allocated (given a physical page
-of main memory) then that page stays on whatever node it
-was allocated, so long as it remains allocated, even if the
-cpusets memory placement policy 'cpuset.mems' subsequently changes.
-If the cpuset flag file 'cpuset.memory_migrate' is set true, then when
-tasks are attached to that cpuset, any pages that task had
-allocated to it on nodes in its previous cpuset are migrated
-to the task's new cpuset. The relative placement of the page within
-the cpuset is preserved during these migration operations if possible.
-For example if the page was on the second valid node of the prior cpuset
-then the page will be placed on the second valid node of the new cpuset.
-
-Also if 'cpuset.memory_migrate' is set true, then if that cpuset's
-'cpuset.mems' file is modified, pages allocated to tasks in that
-cpuset, that were on nodes in the previous setting of 'cpuset.mems',
-will be moved to nodes in the new setting of 'mems.'
-Pages that were not in the task's prior cpuset, or in the cpuset's
-prior 'cpuset.mems' setting, will not be moved.
-
-There is an exception to the above.  If hotplug functionality is used
-to remove all the CPUs that are currently assigned to a cpuset,
-then all the tasks in that cpuset will be moved to the nearest ancestor
-with non-empty cpus.  But the moving of some (or all) tasks might fail if
-cpuset is bound with another cgroup subsystem which has some restrictions
-on task attaching.  In this failing case, those tasks will stay
-in the original cpuset, and the kernel will automatically update
-their cpus_allowed to allow all online CPUs.  When memory hotplug
-functionality for removing Memory Nodes is available, a similar exception
-is expected to apply there as well.  In general, the kernel prefers to
-violate cpuset placement, over starving a task that has had all
-its allowed CPUs or Memory Nodes taken offline.
-
-There is a second exception to the above.  GFP_ATOMIC requests are
-kernel internal allocations that must be satisfied, immediately.
-The kernel may drop some request, in rare cases even panic, if a
-GFP_ATOMIC alloc fails.  If the request cannot be satisfied within
-the current task's cpuset, then we relax the cpuset, and look for
-memory anywhere we can find it.  It's better to violate the cpuset
-than stress the kernel.
-
-To start a new job that is to be contained within a cpuset, the steps are:
-
- 1) mkdir /sys/fs/cgroup/cpuset
- 2) mount -t cgroup -ocpuset cpuset /sys/fs/cgroup/cpuset
- 3) Create the new cpuset by doing mkdir's and write's (or echo's) in
-    the /sys/fs/cgroup/cpuset virtual file system.
- 4) Start a task that will be the "founding father" of the new job.
- 5) Attach that task to the new cpuset by writing its pid to the
-    /sys/fs/cgroup/cpuset tasks file for that cpuset.
- 6) fork, exec or clone the job tasks from this founding father task.
-
-For example, the following sequence of commands will setup a cpuset
-named "Charlie", containing just CPUs 2 and 3, and Memory Node 1,
-and then start a subshell 'sh' in that cpuset:
-
-  mount -t cgroup -ocpuset cpuset /sys/fs/cgroup/cpuset
-  cd /sys/fs/cgroup/cpuset
-  mkdir Charlie
-  cd Charlie
-  /bin/echo 2-3 > cpuset.cpus
-  /bin/echo 1 > cpuset.mems
-  /bin/echo $$ > tasks
-  sh
-  # The subshell 'sh' is now running in cpuset Charlie
-  # The next line should display '/Charlie'
-  cat /proc/self/cpuset
-
-There are ways to query or modify cpusets:
- - via the cpuset file system directly, using the various cd, mkdir, echo,
-   cat, rmdir commands from the shell, or their equivalent from C.
- - via the C library libcpuset.
- - via the C library libcgroup.
-   (http://sourceforge.net/projects/libcg/)
- - via the python application cset.
-   (http://code.google.com/p/cpuset/)
-
-The sched_setaffinity calls can also be done at the shell prompt using
-SGI's runon or Robert Love's taskset.  The mbind and set_mempolicy
-calls can be done at the shell prompt using the numactl command
-(part of Andi Kleen's numa package).
-
-2. Usage Examples and Syntax
-============================
-
-2.1 Basic Usage
----------------
-
-Creating, modifying, using the cpusets can be done through the cpuset
-virtual filesystem.
-
-To mount it, type:
-# mount -t cgroup -o cpuset cpuset /sys/fs/cgroup/cpuset
-
-Then under /sys/fs/cgroup/cpuset you can find a tree that corresponds to the
-tree of the cpusets in the system. For instance, /sys/fs/cgroup/cpuset
-is the cpuset that holds the whole system.
-
-If you want to create a new cpuset under /sys/fs/cgroup/cpuset:
-# cd /sys/fs/cgroup/cpuset
-# mkdir my_cpuset
-
-Now you want to do something with this cpuset.
-# cd my_cpuset
-
-In this directory you can find several files:
-# ls
-cgroup.clone_children  cpuset.memory_pressure
-cgroup.event_control   cpuset.memory_spread_page
-cgroup.procs           cpuset.memory_spread_slab
-cpuset.cpu_exclusive   cpuset.mems
-cpuset.cpus            cpuset.sched_load_balance
-cpuset.mem_exclusive   cpuset.sched_relax_domain_level
-cpuset.mem_hardwall    notify_on_release
-cpuset.memory_migrate  tasks
-
-Reading them will give you information about the state of this cpuset:
-the CPUs and Memory Nodes it can use, the processes that are using
-it, its properties.  By writing to these files you can manipulate
-the cpuset.
-
-Set some flags:
-# /bin/echo 1 > cpuset.cpu_exclusive
-
-Add some cpus:
-# /bin/echo 0-7 > cpuset.cpus
-
-Add some mems:
-# /bin/echo 0-7 > cpuset.mems
-
-Now attach your shell to this cpuset:
-# /bin/echo $$ > tasks
-
-You can also create cpusets inside your cpuset by using mkdir in this
-directory.
-# mkdir my_sub_cs
-
-To remove a cpuset, just use rmdir:
-# rmdir my_sub_cs
-This will fail if the cpuset is in use (has cpusets inside, or has
-processes attached).
-
-Note that for legacy reasons, the "cpuset" filesystem exists as a
-wrapper around the cgroup filesystem.
-
-The command
-
-mount -t cpuset X /sys/fs/cgroup/cpuset
-
-is equivalent to
-
-mount -t cgroup -ocpuset,noprefix X /sys/fs/cgroup/cpuset
-echo "/sbin/cpuset_release_agent" > /sys/fs/cgroup/cpuset/release_agent
-
-2.2 Adding/removing cpus
-------------------------
-
-This is the syntax to use when writing in the cpus or mems files
-in cpuset directories:
-
-# /bin/echo 1-4 > cpuset.cpus		-> set cpus list to cpus 1,2,3,4
-# /bin/echo 1,2,3,4 > cpuset.cpus	-> set cpus list to cpus 1,2,3,4
-
-To add a CPU to a cpuset, write the new list of CPUs including the
-CPU to be added. To add 6 to the above cpuset:
-
-# /bin/echo 1-4,6 > cpuset.cpus	-> set cpus list to cpus 1,2,3,4,6
-
-Similarly to remove a CPU from a cpuset, write the new list of CPUs
-without the CPU to be removed.
-
-To remove all the CPUs:
-
-# /bin/echo "" > cpuset.cpus		-> clear cpus list
-
-2.3 Setting flags
------------------
-
-The syntax is very simple:
-
-# /bin/echo 1 > cpuset.cpu_exclusive 	-> set flag 'cpuset.cpu_exclusive'
-# /bin/echo 0 > cpuset.cpu_exclusive 	-> unset flag 'cpuset.cpu_exclusive'
-
-2.4 Attaching processes
------------------------
-
-# /bin/echo PID > tasks
-
-Note that it is PID, not PIDs. You can only attach ONE task at a time.
-If you have several tasks to attach, you have to do it one after another:
-
-# /bin/echo PID1 > tasks
-# /bin/echo PID2 > tasks
-	...
-# /bin/echo PIDn > tasks
-
-
-3. Questions
-============
-
-Q: what's up with this '/bin/echo' ?
-A: bash's builtin 'echo' command does not check calls to write() against
-   errors. If you use it in the cpuset file system, you won't be
-   able to tell whether a command succeeded or failed.
-
-Q: When I attach processes, only the first of the line gets really attached !
-A: We can only return one error code per call to write(). So you should also
-   put only ONE pid.
-
-4. Contact
-==========
-
-Web: http://www.bullopensource.org/cpuset
diff --git a/Documentation/cgroup-v1/devices.rst b/Documentation/cgroup-v1/devices.rst
new file mode 100644
index 000000000000..e1886783961e
--- /dev/null
+++ b/Documentation/cgroup-v1/devices.rst
@@ -0,0 +1,132 @@
+===========================
+Device Whitelist Controller
+===========================
+
+1. Description
+==============
+
+Implement a cgroup to track and enforce open and mknod restrictions
+on device files.  A device cgroup associates a device access
+whitelist with each cgroup.  A whitelist entry has 4 fields.
+'type' is a (all), c (char), or b (block).  'all' means it applies
+to all types and all major and minor numbers.  Major and minor are
+either an integer or * for all.  Access is a composition of r
+(read), w (write), and m (mknod).
+
+The root device cgroup starts with rwm to 'all'.  A child device
+cgroup gets a copy of the parent.  Administrators can then remove
+devices from the whitelist or add new entries.  A child cgroup can
+never receive a device access which is denied by its parent.
+
+2. User Interface
+=================
+
+An entry is added using devices.allow, and removed using
+devices.deny.  For instance::
+
+	echo 'c 1:3 mr' > /sys/fs/cgroup/1/devices.allow
+
+allows cgroup 1 to read and mknod the device usually known as
+/dev/null.  Doing::
+
+	echo a > /sys/fs/cgroup/1/devices.deny
+
+will remove the default 'a *:* rwm' entry. Doing::
+
+	echo a > /sys/fs/cgroup/1/devices.allow
+
+will add the 'a *:* rwm' entry to the whitelist.
+
+3. Security
+===========
+
+Any task can move itself between cgroups.  This clearly won't
+suffice, but we can decide the best way to adequately restrict
+movement as people get some experience with this.  We may just want
+to require CAP_SYS_ADMIN, which at least is a separate bit from
+CAP_MKNOD.  We may want to just refuse moving to a cgroup which
+isn't a descendant of the current one.  Or we may want to use
+CAP_MAC_ADMIN, since we really are trying to lock down root.
+
+CAP_SYS_ADMIN is needed to modify the whitelist or move another
+task to a new cgroup.  (Again we'll probably want to change that).
+
+A cgroup may not be granted more permissions than the cgroup's
+parent has.
+
+4. Hierarchy
+============
+
+Device cgroups maintain hierarchy by making sure a cgroup never has more
+access permissions than its parent.  Every time an entry is written to
+a cgroup's devices.deny file, all its children will have that entry removed
+from their whitelist and all the locally set whitelist entries will be
+re-evaluated.  In case one of the locally set whitelist entries would provide
+more access than the cgroup's parent, it'll be removed from the whitelist.
+
+Example::
+
+      A
+     / \
+        B
+
+    group        behavior	exceptions
+    A            allow		"b 8:* rwm", "c 116:1 rw"
+    B            deny		"c 1:3 rwm", "c 116:2 rwm", "b 3:* rwm"
+
+If a device is denied in group A::
+
+	# echo "c 116:* r" > A/devices.deny
+
+it'll propagate down and after revalidating B's entries, the whitelist entry
+"c 116:2 rwm" will be removed::
+
+    group        whitelist entries                        denied devices
+    A            all                                      "b 8:* rwm", "c 116:* rw"
+    B            "c 1:3 rwm", "b 3:* rwm"                 all the rest
+
+In case the parent's exceptions change and local exceptions are no longer
+allowed, they'll be deleted.
+
+Notice that new whitelist entries will not be propagated::
+
+      A
+     / \
+        B
+
+    group        whitelist entries                        denied devices
+    A            "c 1:3 rwm", "c 1:5 r"                   all the rest
+    B            "c 1:3 rwm", "c 1:5 r"                   all the rest
+
+when adding ``c *:3 rwm``::
+
+	# echo "c *:3 rwm" >A/devices.allow
+
+the result::
+
+    group        whitelist entries                        denied devices
+    A            "c *:3 rwm", "c 1:5 r"                   all the rest
+    B            "c 1:3 rwm", "c 1:5 r"                   all the rest
+
+but now it'll be possible to add new entries to B::
+
+	# echo "c 2:3 rwm" >B/devices.allow
+	# echo "c 50:3 r" >B/devices.allow
+
+or even::
+
+	# echo "c *:3 rwm" >B/devices.allow
+
+Allowing or denying all by writing 'a' to devices.allow or devices.deny will
+not be possible once the device cgroup has children.
+
+4.1 Hierarchy (internal implementation)
+---------------------------------------
+
+Device cgroups are implemented internally using a behavior (ALLOW, DENY) and a
+list of exceptions.  The internal state is controlled using the same user
+interface to preserve compatibility with the previous whitelist-only
+implementation.  Removal or addition of exceptions that will reduce the access
+to devices will be propagated down the hierarchy.
+For every propagated exception, the effective rules will be re-evaluated based
+on the parent's current access rules.
diff --git a/Documentation/cgroup-v1/devices.txt b/Documentation/cgroup-v1/devices.txt
deleted file mode 100644
index 3c1095ca02ea..000000000000
--- a/Documentation/cgroup-v1/devices.txt
+++ /dev/null
@@ -1,116 +0,0 @@
-Device Whitelist Controller
-
-1. Description:
-
-Implement a cgroup to track and enforce open and mknod restrictions
-on device files.  A device cgroup associates a device access
-whitelist with each cgroup.  A whitelist entry has 4 fields.
-'type' is a (all), c (char), or b (block).  'all' means it applies
-to all types and all major and minor numbers.  Major and minor are
-either an integer or * for all.  Access is a composition of r
-(read), w (write), and m (mknod).
-
-The root device cgroup starts with rwm to 'all'.  A child device
-cgroup gets a copy of the parent.  Administrators can then remove
-devices from the whitelist or add new entries.  A child cgroup can
-never receive a device access which is denied by its parent.
-
-2. User Interface
-
-An entry is added using devices.allow, and removed using
-devices.deny.  For instance
-
-	echo 'c 1:3 mr' > /sys/fs/cgroup/1/devices.allow
-
-allows cgroup 1 to read and mknod the device usually known as
-/dev/null.  Doing
-
-	echo a > /sys/fs/cgroup/1/devices.deny
-
-will remove the default 'a *:* rwm' entry. Doing
-
-	echo a > /sys/fs/cgroup/1/devices.allow
-
-will add the 'a *:* rwm' entry to the whitelist.
-
-3. Security
-
-Any task can move itself between cgroups.  This clearly won't
-suffice, but we can decide the best way to adequately restrict
-movement as people get some experience with this.  We may just want
-to require CAP_SYS_ADMIN, which at least is a separate bit from
-CAP_MKNOD.  We may want to just refuse moving to a cgroup which
-isn't a descendant of the current one.  Or we may want to use
-CAP_MAC_ADMIN, since we really are trying to lock down root.
-
-CAP_SYS_ADMIN is needed to modify the whitelist or move another
-task to a new cgroup.  (Again we'll probably want to change that).
-
-A cgroup may not be granted more permissions than the cgroup's
-parent has.
-
-4. Hierarchy
-
-device cgroups maintain hierarchy by making sure a cgroup never has more
-access permissions than its parent.  Every time an entry is written to
-a cgroup's devices.deny file, all its children will have that entry removed
-from their whitelist and all the locally set whitelist entries will be
-re-evaluated.  In case one of the locally set whitelist entries would provide
-more access than the cgroup's parent, it'll be removed from the whitelist.
-
-Example:
-      A
-     / \
-        B
-
-    group        behavior	exceptions
-    A            allow		"b 8:* rwm", "c 116:1 rw"
-    B            deny		"c 1:3 rwm", "c 116:2 rwm", "b 3:* rwm"
-
-If a device is denied in group A:
-	# echo "c 116:* r" > A/devices.deny
-it'll propagate down and after revalidating B's entries, the whitelist entry
-"c 116:2 rwm" will be removed:
-
-    group        whitelist entries                        denied devices
-    A            all                                      "b 8:* rwm", "c 116:* rw"
-    B            "c 1:3 rwm", "b 3:* rwm"                 all the rest
-
-In case parent's exceptions change and local exceptions are not allowed
-anymore, they'll be deleted.
-
-Notice that new whitelist entries will not be propagated:
-      A
-     / \
-        B
-
-    group        whitelist entries                        denied devices
-    A            "c 1:3 rwm", "c 1:5 r"                   all the rest
-    B            "c 1:3 rwm", "c 1:5 r"                   all the rest
-
-when adding "c *:3 rwm":
-	# echo "c *:3 rwm" >A/devices.allow
-
-the result:
-    group        whitelist entries                        denied devices
-    A            "c *:3 rwm", "c 1:5 r"                   all the rest
-    B            "c 1:3 rwm", "c 1:5 r"                   all the rest
-
-but now it'll be possible to add new entries to B:
-	# echo "c 2:3 rwm" >B/devices.allow
-	# echo "c 50:3 r" >B/devices.allow
-or even
-	# echo "c *:3 rwm" >B/devices.allow
-
-Allowing or denying all by writing 'a' to devices.allow or devices.deny will
-not be possible once the device cgroups has children.
-
-4.1 Hierarchy (internal implementation)
-
-device cgroups is implemented internally using a behavior (ALLOW, DENY) and a
-list of exceptions.  The internal state is controlled using the same user
-interface to preserve compatibility with the previous whitelist-only
-implementation.  Removal or addition of exceptions that will reduce the access
-to devices will be propagated down the hierarchy.
-For every propagated exception, the effective rules will be re-evaluated based
-on current parent's access rules.
diff --git a/Documentation/cgroup-v1/freezer-subsystem.rst b/Documentation/cgroup-v1/freezer-subsystem.rst
new file mode 100644
index 000000000000..582d3427de3f
--- /dev/null
+++ b/Documentation/cgroup-v1/freezer-subsystem.rst
@@ -0,0 +1,127 @@
+==============
+Cgroup Freezer
+==============
+
+The cgroup freezer is useful to batch job management systems which start
+and stop sets of tasks in order to schedule the resources of a machine
+according to the desires of a system administrator. This sort of program
+is often used on HPC clusters to schedule access to the cluster as a
+whole. The cgroup freezer uses cgroups to describe the set of tasks to
+be started/stopped by the batch job management system. It also provides
+a means to start and stop the tasks composing the job.
+
+The cgroup freezer will also be useful for checkpointing running groups
+of tasks. The freezer allows the checkpoint code to obtain a consistent
+image of the tasks by attempting to force the tasks in a cgroup into a
+quiescent state. Once the tasks are quiescent another task can
+walk /proc or invoke a kernel interface to gather information about the
+quiesced tasks. Checkpointed tasks can be restarted later should a
+recoverable error occur. This also allows the checkpointed tasks to be
+migrated between nodes in a cluster by copying the gathered information
+to another node and restarting the tasks there.
+
+Sequences of SIGSTOP and SIGCONT are not always sufficient for stopping
+and resuming tasks in userspace. Both of these signals are observable
+from within the tasks we wish to freeze. While SIGSTOP cannot be caught,
+blocked, or ignored it can be seen by waiting or ptracing parent tasks.
+SIGCONT is especially unsuitable since it can be caught by the task. Any
+programs designed to watch for SIGSTOP and SIGCONT could be broken by
+attempting to use SIGSTOP and SIGCONT to stop and resume tasks. We can
+demonstrate this problem using nested bash shells::
+
+	$ echo $$
+	16644
+	$ bash
+	$ echo $$
+	16690
+
+	From a second, unrelated bash shell:
+	$ kill -SIGSTOP 16690
+	$ kill -SIGCONT 16690
+
+	<at this point 16690 exits and causes 16644 to exit too>
+
+This happens because bash can observe both signals and choose how it
+responds to them.
+
+Another example of a program which catches and responds to these
+signals is gdb. In fact any program designed to use ptrace is likely to
+have a problem with this method of stopping and resuming tasks.
+
+In contrast, the cgroup freezer uses the kernel freezer code to
+prevent the freeze/unfreeze cycle from becoming visible to the tasks
+being frozen. This allows the bash example above and gdb to run as
+expected.
+
+The cgroup freezer is hierarchical. Freezing a cgroup freezes all
+tasks belonging to the cgroup and all its descendant cgroups. Each
+cgroup has its own state (self-state) and the state inherited from the
+parent (parent-state). Iff both states are THAWED, the cgroup is
+THAWED.
+
+The following cgroupfs files are created by cgroup freezer.
+
+* freezer.state: Read-write.
+
+  When read, returns the effective state of the cgroup - "THAWED",
+  "FREEZING" or "FROZEN". This is the combined self and parent-states.
+  If any is freezing, the cgroup is freezing (FREEZING or FROZEN).
+
+  FREEZING cgroup transitions into FROZEN state when all tasks
+  belonging to the cgroup and its descendants become frozen. Note that
+  a cgroup reverts to FREEZING from FROZEN after a new task is added
+  to the cgroup or one of its descendant cgroups until the new task is
+  frozen.
+
+  When written, sets the self-state of the cgroup. Two values are
+  allowed - "FROZEN" and "THAWED". If FROZEN is written, the cgroup,
+  if not already freezing, enters FREEZING state along with all its
+  descendant cgroups.
+
+  If THAWED is written, the self-state of the cgroup is changed to
+  THAWED.  Note that the effective state may not change to THAWED if
+  the parent-state is still freezing. If a cgroup's effective state
+  becomes THAWED, all its descendants which are freezing because of
+  the cgroup also leave the freezing state.
+
+* freezer.self_freezing: Read only.
+
+  Shows the self-state. 0 if the self-state is THAWED; otherwise, 1.
+  This value is 1 iff the last write to freezer.state was "FROZEN".
+
+* freezer.parent_freezing: Read only.
+
+  Shows the parent-state.  0 if none of the cgroup's ancestors is
+  frozen; otherwise, 1.
+
+The root cgroup is non-freezable and the above interface files don't
+exist.
+
+* Examples of usage::
+
+   # mkdir /sys/fs/cgroup/freezer
+   # mount -t cgroup -ofreezer freezer /sys/fs/cgroup/freezer
+   # mkdir /sys/fs/cgroup/freezer/0
+   # echo $some_pid > /sys/fs/cgroup/freezer/0/tasks
+
+to get status of the freezer subsystem::
+
+   # cat /sys/fs/cgroup/freezer/0/freezer.state
+   THAWED
+
+to freeze all tasks in the container::
+
+   # echo FROZEN > /sys/fs/cgroup/freezer/0/freezer.state
+   # cat /sys/fs/cgroup/freezer/0/freezer.state
+   FREEZING
+   # cat /sys/fs/cgroup/freezer/0/freezer.state
+   FROZEN
+
+to unfreeze all tasks in the container::
+
+   # echo THAWED > /sys/fs/cgroup/freezer/0/freezer.state
+   # cat /sys/fs/cgroup/freezer/0/freezer.state
+   THAWED
+
+This is the basic mechanism which should do the right thing for user space tasks
+in a simple scenario.
diff --git a/Documentation/cgroup-v1/freezer-subsystem.txt b/Documentation/cgroup-v1/freezer-subsystem.txt
deleted file mode 100644
index e831cb2b8394..000000000000
--- a/Documentation/cgroup-v1/freezer-subsystem.txt
+++ /dev/null
@@ -1,123 +0,0 @@
-The cgroup freezer is useful to batch job management system which start
-and stop sets of tasks in order to schedule the resources of a machine
-according to the desires of a system administrator. This sort of program
-is often used on HPC clusters to schedule access to the cluster as a
-whole. The cgroup freezer uses cgroups to describe the set of tasks to
-be started/stopped by the batch job management system. It also provides
-a means to start and stop the tasks composing the job.
-
-The cgroup freezer will also be useful for checkpointing running groups
-of tasks. The freezer allows the checkpoint code to obtain a consistent
-image of the tasks by attempting to force the tasks in a cgroup into a
-quiescent state. Once the tasks are quiescent another task can
-walk /proc or invoke a kernel interface to gather information about the
-quiesced tasks. Checkpointed tasks can be restarted later should a
-recoverable error occur. This also allows the checkpointed tasks to be
-migrated between nodes in a cluster by copying the gathered information
-to another node and restarting the tasks there.
-
-Sequences of SIGSTOP and SIGCONT are not always sufficient for stopping
-and resuming tasks in userspace. Both of these signals are observable
-from within the tasks we wish to freeze. While SIGSTOP cannot be caught,
-blocked, or ignored it can be seen by waiting or ptracing parent tasks.
-SIGCONT is especially unsuitable since it can be caught by the task. Any
-programs designed to watch for SIGSTOP and SIGCONT could be broken by
-attempting to use SIGSTOP and SIGCONT to stop and resume tasks. We can
-demonstrate this problem using nested bash shells:
-
-	$ echo $$
-	16644
-	$ bash
-	$ echo $$
-	16690
-
-	From a second, unrelated bash shell:
-	$ kill -SIGSTOP 16690
-	$ kill -SIGCONT 16690
-
-	<at this point 16690 exits and causes 16644 to exit too>
-
-This happens because bash can observe both signals and choose how it
-responds to them.
-
-Another example of a program which catches and responds to these
-signals is gdb. In fact any program designed to use ptrace is likely to
-have a problem with this method of stopping and resuming tasks.
-
-In contrast, the cgroup freezer uses the kernel freezer code to
-prevent the freeze/unfreeze cycle from becoming visible to the tasks
-being frozen. This allows the bash example above and gdb to run as
-expected.
-
-The cgroup freezer is hierarchical. Freezing a cgroup freezes all
-tasks belonging to the cgroup and all its descendant cgroups. Each
-cgroup has its own state (self-state) and the state inherited from the
-parent (parent-state). Iff both states are THAWED, the cgroup is
-THAWED.
-
-The following cgroupfs files are created by cgroup freezer.
-
-* freezer.state: Read-write.
-
-  When read, returns the effective state of the cgroup - "THAWED",
-  "FREEZING" or "FROZEN". This is the combined self and parent-states.
-  If any is freezing, the cgroup is freezing (FREEZING or FROZEN).
-
-  FREEZING cgroup transitions into FROZEN state when all tasks
-  belonging to the cgroup and its descendants become frozen. Note that
-  a cgroup reverts to FREEZING from FROZEN after a new task is added
-  to the cgroup or one of its descendant cgroups until the new task is
-  frozen.
-
-  When written, sets the self-state of the cgroup. Two values are
-  allowed - "FROZEN" and "THAWED". If FROZEN is written, the cgroup,
-  if not already freezing, enters FREEZING state along with all its
-  descendant cgroups.
-
-  If THAWED is written, the self-state of the cgroup is changed to
-  THAWED.  Note that the effective state may not change to THAWED if
-  the parent-state is still freezing. If a cgroup's effective state
-  becomes THAWED, all its descendants which are freezing because of
-  the cgroup also leave the freezing state.
-
-* freezer.self_freezing: Read only.
-
-  Shows the self-state. 0 if the self-state is THAWED; otherwise, 1.
-  This value is 1 iff the last write to freezer.state was "FROZEN".
-
-* freezer.parent_freezing: Read only.
-
-  Shows the parent-state.  0 if none of the cgroup's ancestors is
-  frozen; otherwise, 1.
-
-The root cgroup is non-freezable and the above interface files don't
-exist.
-
-* Examples of usage :
-
-   # mkdir /sys/fs/cgroup/freezer
-   # mount -t cgroup -ofreezer freezer /sys/fs/cgroup/freezer
-   # mkdir /sys/fs/cgroup/freezer/0
-   # echo $some_pid > /sys/fs/cgroup/freezer/0/tasks
-
-to get status of the freezer subsystem :
-
-   # cat /sys/fs/cgroup/freezer/0/freezer.state
-   THAWED
-
-to freeze all tasks in the container :
-
-   # echo FROZEN > /sys/fs/cgroup/freezer/0/freezer.state
-   # cat /sys/fs/cgroup/freezer/0/freezer.state
-   FREEZING
-   # cat /sys/fs/cgroup/freezer/0/freezer.state
-   FROZEN
-
-to unfreeze all tasks in the container :
-
-   # echo THAWED > /sys/fs/cgroup/freezer/0/freezer.state
-   # cat /sys/fs/cgroup/freezer/0/freezer.state
-   THAWED
-
-This is the basic mechanism which should do the right thing for user space task
-in a simple scenario.
diff --git a/Documentation/cgroup-v1/hugetlb.rst b/Documentation/cgroup-v1/hugetlb.rst
new file mode 100644
index 000000000000..a3902aa253a9
--- /dev/null
+++ b/Documentation/cgroup-v1/hugetlb.rst
@@ -0,0 +1,50 @@
+==================
+HugeTLB Controller
+==================
+
+The HugeTLB controller allows limiting the HugeTLB usage per control group and
+enforces the controller limit during page fault. Since HugeTLB doesn't
+support page reclaim, enforcing the limit at page fault time implies that
+the application will get a SIGBUS signal if it tries to access HugeTLB pages
+beyond its limit. This requires the application to know beforehand how many
+HugeTLB pages it would require for its use.
+
+HugeTLB controller can be created by first mounting the cgroup filesystem::
+
+  # mount -t cgroup -o hugetlb none /sys/fs/cgroup
+
+With the above step, the initial or the parent HugeTLB group becomes
+visible at /sys/fs/cgroup. At bootup, this group includes all the tasks in
+the system. /sys/fs/cgroup/tasks lists the tasks in this cgroup.
+
+New groups can be created under the parent group /sys/fs/cgroup::
+
+  # cd /sys/fs/cgroup
+  # mkdir g1
+  # echo $$ > g1/tasks
+
+The above steps create a new group g1 and move the current shell
+process (bash) into it.
+
+Brief summary of control files::
+
+ hugetlb.<hugepagesize>.limit_in_bytes     # set/show limit of "hugepagesize" hugetlb usage
+ hugetlb.<hugepagesize>.max_usage_in_bytes # show max "hugepagesize" hugetlb  usage recorded
+ hugetlb.<hugepagesize>.usage_in_bytes     # show current usage for "hugepagesize" hugetlb
+ hugetlb.<hugepagesize>.failcnt		   # show the number of allocation failures due to HugeTLB limit
+
+For a system supporting three hugepage sizes (64k, 32M and 1G), the control
+files include::
+
+  hugetlb.1GB.limit_in_bytes
+  hugetlb.1GB.max_usage_in_bytes
+  hugetlb.1GB.usage_in_bytes
+  hugetlb.1GB.failcnt
+  hugetlb.64KB.limit_in_bytes
+  hugetlb.64KB.max_usage_in_bytes
+  hugetlb.64KB.usage_in_bytes
+  hugetlb.64KB.failcnt
+  hugetlb.32MB.limit_in_bytes
+  hugetlb.32MB.max_usage_in_bytes
+  hugetlb.32MB.usage_in_bytes
+  hugetlb.32MB.failcnt
diff --git a/Documentation/cgroup-v1/hugetlb.txt b/Documentation/cgroup-v1/hugetlb.txt
deleted file mode 100644
index 1260e5369b9b..000000000000
--- a/Documentation/cgroup-v1/hugetlb.txt
+++ /dev/null
@@ -1,49 +0,0 @@
-HugeTLB Controller
--------------------
-
-The HugeTLB controller allows to limit the HugeTLB usage per control group and
-enforces the controller limit during page fault. Since HugeTLB doesn't
-support page reclaim, enforcing the limit at page fault time implies that,
-the application will get SIGBUS signal if it tries to access HugeTLB pages
-beyond its limit. This requires the application to know beforehand how much
-HugeTLB pages it would require for its use.
-
-HugeTLB controller can be created by first mounting the cgroup filesystem.
-
-# mount -t cgroup -o hugetlb none /sys/fs/cgroup
-
-With the above step, the initial or the parent HugeTLB group becomes
-visible at /sys/fs/cgroup. At bootup, this group includes all the tasks in
-the system. /sys/fs/cgroup/tasks lists the tasks in this cgroup.
-
-New groups can be created under the parent group /sys/fs/cgroup.
-
-# cd /sys/fs/cgroup
-# mkdir g1
-# echo $$ > g1/tasks
-
-The above steps create a new group g1 and move the current shell
-process (bash) into it.
-
-Brief summary of control files
-
- hugetlb.<hugepagesize>.limit_in_bytes     # set/show limit of "hugepagesize" hugetlb usage
- hugetlb.<hugepagesize>.max_usage_in_bytes # show max "hugepagesize" hugetlb  usage recorded
- hugetlb.<hugepagesize>.usage_in_bytes     # show current usage for "hugepagesize" hugetlb
- hugetlb.<hugepagesize>.failcnt		   # show the number of allocation failure due to HugeTLB limit
-
-For a system supporting three hugepage sizes (64k, 32M and 1G), the control
-files include:
-
-hugetlb.1GB.limit_in_bytes
-hugetlb.1GB.max_usage_in_bytes
-hugetlb.1GB.usage_in_bytes
-hugetlb.1GB.failcnt
-hugetlb.64KB.limit_in_bytes
-hugetlb.64KB.max_usage_in_bytes
-hugetlb.64KB.usage_in_bytes
-hugetlb.64KB.failcnt
-hugetlb.32MB.limit_in_bytes
-hugetlb.32MB.max_usage_in_bytes
-hugetlb.32MB.usage_in_bytes
-hugetlb.32MB.failcnt
diff --git a/Documentation/cgroup-v1/index.rst b/Documentation/cgroup-v1/index.rst
new file mode 100644
index 000000000000..fe76d42edc11
--- /dev/null
+++ b/Documentation/cgroup-v1/index.rst
@@ -0,0 +1,30 @@
+:orphan:
+
+========================
+Control Groups version 1
+========================
+
+.. toctree::
+    :maxdepth: 1
+
+    cgroups
+
+    blkio-controller
+    cpuacct
+    cpusets
+    devices
+    freezer-subsystem
+    hugetlb
+    memcg_test
+    memory
+    net_cls
+    net_prio
+    pids
+    rdma
+
+.. only::  subproject and html
+
+   Indices
+   =======
+
+   * :ref:`genindex`
diff --git a/Documentation/cgroup-v1/memcg_test.rst b/Documentation/cgroup-v1/memcg_test.rst
new file mode 100644
index 000000000000..91bd18c6a514
--- /dev/null
+++ b/Documentation/cgroup-v1/memcg_test.rst
@@ -0,0 +1,355 @@
+=====================================================
+Memory Resource Controller(Memcg) Implementation Memo
+=====================================================
+
+Last Updated: 2010/2
+
+Base Kernel Version: based on 2.6.33-rc7-mm(candidate for 34).
+
+Because VM is getting complex (one of reasons is memcg...), memcg's behavior
+is complex. This is a document for memcg's internal behavior.
+Please note that implementation details can be changed.
+
+(*) Topics on API should be in Documentation/cgroup-v1/memory.rst
+
+0. How to record usage ?
+========================
+
+   2 objects are used.
+
+   page_cgroup ....an object per page.
+
+	Allocated at boot or memory hotplug. Freed at memory hot removal.
+
+   swap_cgroup ... an entry per swp_entry.
+
+	Allocated at swapon(). Freed at swapoff().
+
+   The page_cgroup has USED bit and double count against a page_cgroup never
+   occurs. swap_cgroup is used only when a charged page is swapped-out.
+
+1. Charge
+=========
+
+   a page/swp_entry may be charged (usage += PAGE_SIZE) at
+
+	mem_cgroup_try_charge()
+
+2. Uncharge
+===========
+
+  a page/swp_entry may be uncharged (usage -= PAGE_SIZE) by
+
+	mem_cgroup_uncharge()
+	  Called when a page's refcount goes down to 0.
+
+	mem_cgroup_uncharge_swap()
+	  Called when swp_entry's refcnt goes down to 0. A charge against swap
+	  disappears.
+
+3. charge-commit-cancel
+=======================
+
+	Memcg pages are charged in two steps:
+
+		- mem_cgroup_try_charge()
+		- mem_cgroup_commit_charge() or mem_cgroup_cancel_charge()
+
+	At try_charge(), there are no flags to say "this page is charged".
+	at this point, usage += PAGE_SIZE.
+
+	At commit(), the page is associated with the memcg.
+
+	At cancel(), simply usage -= PAGE_SIZE.
+
+In the explanation below, we assume CONFIG_MEM_RES_CTRL_SWAP=y.
+
+4. Anonymous
+============
+
+	Anonymous page is newly allocated at
+		  - page fault into MAP_ANONYMOUS mapping.
+		  - Copy-On-Write.
+
+	4.1 Swap-in.
+	At swap-in, the page is taken from swap-cache. There are 2 cases.
+
+	(a) If the SwapCache is newly allocated and read, it has no charges.
+	(b) If the SwapCache has been mapped by processes, it has been
+	    charged already.
+
+	4.2 Swap-out.
+	At swap-out, typical state transition is below.
+
+	(a) add to swap cache. (marked as SwapCache)
+	    swp_entry's refcnt += 1.
+	(b) fully unmapped.
+	    swp_entry's refcnt += # of ptes.
+	(c) write back to swap.
+	(d) delete from swap cache. (remove from SwapCache)
+	    swp_entry's refcnt -= 1.
+
+
+	Finally, at task exit,
+	(e) zap_pte() is called and swp_entry's refcnt -=1 -> 0.
+
+5. Page Cache
+=============
+
+	Page Cache is charged at
+	- add_to_page_cache_locked().
+
+	The logic is very clear. (About migration, see below)
+
+	Note:
+	  __remove_from_page_cache() is called by remove_from_page_cache()
+	  and __remove_mapping().
+
+6. Shmem(tmpfs) Page Cache
+===========================
+
+	The best way to understand shmem's page state transition is to read
+	mm/shmem.c.
+
+	But brief explanation of the behavior of memcg around shmem will be
+	helpful to understand the logic.
+
+	Shmem's page (just leaf page, not direct/indirect block) can be on
+
+		- radix-tree of shmem's inode.
+		- SwapCache.
+		- Both on radix-tree and SwapCache. This happens at swap-in
+		  and swap-out.
+
+	It's charged when...
+
+	- A new page is added to shmem's radix-tree.
+	- A swp page is read. (move a charge from swap_cgroup to page_cgroup)
+
+7. Page Migration
+=================
+
+	mem_cgroup_migrate()
+
+8. LRU
+======
+        Each memcg has its own private LRU. Now, its handling is under global
+	VM's control (means that it's handled under global pgdat->lru_lock).
+	Almost all routines around memcg's LRU are called by global LRU's
+	list management functions under pgdat->lru_lock.
+
+	A special function is mem_cgroup_isolate_pages(). This scans
+	memcg's private LRU and calls __isolate_lru_page() to extract a page
+	from LRU.
+
+	(By __isolate_lru_page(), the page is removed from both of global and
+	private LRU.)
+
+
+9. Typical Tests.
+=================
+
+ Tests for racy cases.
+
+9.1 Small limit to memcg.
+-------------------------
+
+	When testing racy cases, it's a good test to set memcg's limit
+	to be very small rather than GB. Many races were found in tests under
+	xKB or xxMB limits.
+
+	(Memory behavior under GB and Memory behavior under MB shows very
+	different situation.)
+
+9.2 Shmem
+---------
+
+	Historically, memcg's shmem handling was poor and we saw some amount
+	of troubles here. This is because shmem is page-cache but can be
+	SwapCache. Test with shmem/tmpfs is always good test.
+
+9.3 Migration
+-------------
+
+	For NUMA, migration is another special case. To do easy test, cpuset
+	is useful. Following is a sample script to do migration::
+
+		mount -t cgroup -o cpuset none /opt/cpuset
+
+		mkdir /opt/cpuset/01
+		echo 1 > /opt/cpuset/01/cpuset.cpus
+		echo 0 > /opt/cpuset/01/cpuset.mems
+		echo 1 > /opt/cpuset/01/cpuset.memory_migrate
+		mkdir /opt/cpuset/02
+		echo 1 > /opt/cpuset/02/cpuset.cpus
+		echo 1 > /opt/cpuset/02/cpuset.mems
+		echo 1 > /opt/cpuset/02/cpuset.memory_migrate
+
+	In the above setup, when you move a task from 01 to 02, page
+	migration from node 0 to node 1 will occur. Following is a script
+	to migrate all under cpuset::
+
+		--
+		move_task()
+		{
+		for pid in $1
+		do
+			/bin/echo $pid >$2/tasks 2>/dev/null
+			echo -n $pid
+			echo -n " "
+		done
+		echo END
+		}
+
+		G1_TASK=`cat ${G1}/tasks`
+		G2_TASK=`cat ${G2}/tasks`
+		move_task "${G1_TASK}" ${G2} &
+		--
+
+9.4 Memory hotplug
+------------------
+
+	Memory hotplug test is one of the good tests.
+
+	to offline memory, do following::
+
+		# echo offline > /sys/devices/system/memory/memoryXXX/state
+
+	(XXX is the place of memory)
+
+	This is an easy way to test page migration, too.
+
+9.5 mkdir/rmdir
+---------------
+
+	When using hierarchy, mkdir/rmdir test should be done.
+	Use tests like the following::
+
+		echo 1 >/opt/cgroup/01/memory/use_hierarchy
+		mkdir /opt/cgroup/01/child_a
+		mkdir /opt/cgroup/01/child_b
+
+		set limit to 01.
+		add limit to 01/child_b
+		run jobs under child_a and child_b
+
+	create/delete following groups at random while jobs are running::
+
+		/opt/cgroup/01/child_a/child_aa
+		/opt/cgroup/01/child_b/child_bb
+		/opt/cgroup/01/child_c
+
+	running new jobs in new group is also good.
+
+9.6 Mount with other subsystems
+-------------------------------
+
+	Mounting with other subsystems is a good test because there is a
+	race and lock dependency with other cgroup subsystems.
+
+	example::
+
+		# mount -t cgroup none /cgroup -o cpuset,memory,cpu,devices
+
+	and do task move, mkdir, rmdir etc...under this.
+
+9.7 swapoff
+-----------
+
+	Besides management of swap being one of the complicated parts of memcg,
+	the call path of swap-in at swapoff is not the same as the usual swap-in
+	path. It's worth testing explicitly.
+
+	For example, test like following is good:
+
+	(Shell-A)::
+
+		# mount -t cgroup none /cgroup -o memory
+		# mkdir /cgroup/test
+		# echo 40M > /cgroup/test/memory.limit_in_bytes
+		# echo 0 > /cgroup/test/tasks
+
+	Run malloc(100M) program under this. You'll see 60M of swaps.
+
+	(Shell-B)::
+
+		# move all tasks in /cgroup/test to /cgroup
+		# /sbin/swapoff -a
+		# rmdir /cgroup/test
+		# kill malloc task.
+
+	Of course, tmpfs v.s. swapoff test should be tested, too.
+
+9.8 OOM-Killer
+--------------
+
+	Out-of-memory caused by memcg's limit will kill tasks under
+	the memcg. When hierarchy is used, a task under hierarchy
+	will be killed by the kernel.
+
+	In this case, panic_on_oom shouldn't be invoked and tasks
+	in other groups shouldn't be killed.
+
+	It's not difficult to cause OOM under memcg as follows.
+
+	Case A) when you can swapoff::
+
+		#swapoff -a
+		#echo 50M > /memory.limit_in_bytes
+
+	run 51M of malloc
+
+	Case B) when you use mem+swap limitation::
+
+		#echo 50M > memory.limit_in_bytes
+		#echo 50M > memory.memsw.limit_in_bytes
+
+	run 51M of malloc
+
+9.9 Move charges at task migration
+----------------------------------
+
+	Charges associated with a task can be moved along with task migration.
+
+	(Shell-A)::
+
+		#mkdir /cgroup/A
+		#echo $$ >/cgroup/A/tasks
+
+	run some programs which use some amount of memory in /cgroup/A.
+
+	(Shell-B)::
+
+		#mkdir /cgroup/B
+		#echo 1 >/cgroup/B/memory.move_charge_at_immigrate
+		#echo "pid of the program running in group A" >/cgroup/B/tasks
+
+	You can see charges have been moved by reading ``*.usage_in_bytes`` or
+	memory.stat of both A and B.
+
+	See 8.2 of Documentation/cgroup-v1/memory.rst to see what value should
+	be written to move_charge_at_immigrate.
+
+9.10 Memory thresholds
+----------------------
+
+	Memory controller implements memory thresholds using cgroups notification
+	API. You can use tools/cgroup/cgroup_event_listener.c to test it.
+
+	(Shell-A) Create cgroup and run event listener::
+
+		# mkdir /cgroup/A
+		# ./cgroup_event_listener /cgroup/A/memory.usage_in_bytes 5M
+
+	(Shell-B) Add task to cgroup and try to allocate and free memory::
+
+		# echo $$ >/cgroup/A/tasks
+		# a="$(dd if=/dev/zero bs=1M count=10)"
+		# a=
+
+	You will see message from cgroup_event_listener every time you cross
+	the thresholds.
+
+	Use /cgroup/A/memory.memsw.usage_in_bytes to test memsw thresholds.
+
+	It's a good idea to test the root cgroup as well.
diff --git a/Documentation/cgroup-v1/memcg_test.txt b/Documentation/cgroup-v1/memcg_test.txt
deleted file mode 100644
index 621e29ffb358..000000000000
--- a/Documentation/cgroup-v1/memcg_test.txt
+++ /dev/null
@@ -1,280 +0,0 @@
-Memory Resource Controller(Memcg)  Implementation Memo.
-Last Updated: 2010/2
-Base Kernel Version: based on 2.6.33-rc7-mm(candidate for 34).
-
-Because VM is getting complex (one of reasons is memcg...), memcg's behavior
-is complex. This is a document for memcg's internal behavior.
-Please note that implementation details can be changed.
-
-(*) Topics on API should be in Documentation/cgroup-v1/memory.txt)
-
-0. How to record usage ?
-   2 objects are used.
-
-   page_cgroup ....an object per page.
-	Allocated at boot or memory hotplug. Freed at memory hot removal.
-
-   swap_cgroup ... an entry per swp_entry.
-	Allocated at swapon(). Freed at swapoff().
-
-   The page_cgroup has USED bit and double count against a page_cgroup never
-   occurs. swap_cgroup is used only when a charged page is swapped-out.
-
-1. Charge
-
-   a page/swp_entry may be charged (usage += PAGE_SIZE) at
-
-	mem_cgroup_try_charge()
-
-2. Uncharge
-  a page/swp_entry may be uncharged (usage -= PAGE_SIZE) by
-
-	mem_cgroup_uncharge()
-	  Called when a page's refcount goes down to 0.
-
-	mem_cgroup_uncharge_swap()
-	  Called when swp_entry's refcnt goes down to 0. A charge against swap
-	  disappears.
-
-3. charge-commit-cancel
-	Memcg pages are charged in two steps:
-		mem_cgroup_try_charge()
-		mem_cgroup_commit_charge() or mem_cgroup_cancel_charge()
-
-	At try_charge(), there are no flags to say "this page is charged".
-	at this point, usage += PAGE_SIZE.
-
-	At commit(), the page is associated with the memcg.
-
-	At cancel(), simply usage -= PAGE_SIZE.
-
-Under below explanation, we assume CONFIG_MEM_RES_CTRL_SWAP=y.
-
-4. Anonymous
-	Anonymous page is newly allocated at
-		  - page fault into MAP_ANONYMOUS mapping.
-		  - Copy-On-Write.
-
-	4.1 Swap-in.
-	At swap-in, the page is taken from swap-cache. There are 2 cases.
-
-	(a) If the SwapCache is newly allocated and read, it has no charges.
-	(b) If the SwapCache has been mapped by processes, it has been
-	    charged already.
-
-	4.2 Swap-out.
-	At swap-out, typical state transition is below.
-
-	(a) add to swap cache. (marked as SwapCache)
-	    swp_entry's refcnt += 1.
-	(b) fully unmapped.
-	    swp_entry's refcnt += # of ptes.
-	(c) write back to swap.
-	(d) delete from swap cache. (remove from SwapCache)
-	    swp_entry's refcnt -= 1.
-
-
-	Finally, at task exit,
-	(e) zap_pte() is called and swp_entry's refcnt -=1 -> 0.
-
-5. Page Cache
-   	Page Cache is charged at
-	- add_to_page_cache_locked().
-
-	The logic is very clear. (About migration, see below)
-	Note: __remove_from_page_cache() is called by remove_from_page_cache()
-	and __remove_mapping().
-
-6. Shmem(tmpfs) Page Cache
-	The best way to understand shmem's page state transition is to read
-	mm/shmem.c.
-	But brief explanation of the behavior of memcg around shmem will be
-	helpful to understand the logic.
-
-	Shmem's page (just leaf page, not direct/indirect block) can be on
-		- radix-tree of shmem's inode.
-		- SwapCache.
-		- Both on radix-tree and SwapCache. This happens at swap-in
-		  and swap-out,
-
-	It's charged when...
-	- A new page is added to shmem's radix-tree.
-	- A swp page is read. (move a charge from swap_cgroup to page_cgroup)
-
-7. Page Migration
-
-	mem_cgroup_migrate()
-
-8. LRU
-        Each memcg has its own private LRU. Now, its handling is under global
-	VM's control (means that it's handled under global pgdat->lru_lock).
-	Almost all routines around memcg's LRU is called by global LRU's
-	list management functions under pgdat->lru_lock.
-
-	A special function is mem_cgroup_isolate_pages(). This scans
-	memcg's private LRU and call __isolate_lru_page() to extract a page
-	from LRU.
-	(By __isolate_lru_page(), the page is removed from both of global and
-	 private LRU.)
-
-
-9. Typical Tests.
-
- Tests for racy cases.
-
- 9.1 Small limit to memcg.
-	When you do test to do racy case, it's good test to set memcg's limit
-	to be very small rather than GB. Many races found in the test under
-	xKB or xxMB limits.
-	(Memory behavior under GB and Memory behavior under MB shows very
-	 different situation.)
-
- 9.2 Shmem
-	Historically, memcg's shmem handling was poor and we saw some amount
-	of troubles here. This is because shmem is page-cache but can be
-	SwapCache. Test with shmem/tmpfs is always good test.
-
- 9.3 Migration
-	For NUMA, migration is an another special case. To do easy test, cpuset
-	is useful. Following is a sample script to do migration.
-
-	mount -t cgroup -o cpuset none /opt/cpuset
-
-	mkdir /opt/cpuset/01
-	echo 1 > /opt/cpuset/01/cpuset.cpus
-	echo 0 > /opt/cpuset/01/cpuset.mems
-	echo 1 > /opt/cpuset/01/cpuset.memory_migrate
-	mkdir /opt/cpuset/02
-	echo 1 > /opt/cpuset/02/cpuset.cpus
-	echo 1 > /opt/cpuset/02/cpuset.mems
-	echo 1 > /opt/cpuset/02/cpuset.memory_migrate
-
-	In above set, when you moves a task from 01 to 02, page migration to
-	node 0 to node 1 will occur. Following is a script to migrate all
-	under cpuset.
-	--
-	move_task()
-	{
-	for pid in $1
-        do
-                /bin/echo $pid >$2/tasks 2>/dev/null
-		echo -n $pid
-		echo -n " "
-        done
-	echo END
-	}
-
-	G1_TASK=`cat ${G1}/tasks`
-	G2_TASK=`cat ${G2}/tasks`
-	move_task "${G1_TASK}" ${G2} &
-	--
- 9.4 Memory hotplug.
-	memory hotplug test is one of good test.
-	to offline memory, do following.
-	# echo offline > /sys/devices/system/memory/memoryXXX/state
-	(XXX is the place of memory)
-	This is an easy way to test page migration, too.
-
- 9.5 mkdir/rmdir
-	When using hierarchy, mkdir/rmdir test should be done.
-	Use tests like the following.
-
-	echo 1 >/opt/cgroup/01/memory/use_hierarchy
-	mkdir /opt/cgroup/01/child_a
-	mkdir /opt/cgroup/01/child_b
-
-	set limit to 01.
-	add limit to 01/child_b
-	run jobs under child_a and child_b
-
-	create/delete following groups at random while jobs are running.
-	/opt/cgroup/01/child_a/child_aa
-	/opt/cgroup/01/child_b/child_bb
-	/opt/cgroup/01/child_c
-
-	running new jobs in new group is also good.
-
- 9.6 Mount with other subsystems.
-	Mounting with other subsystems is a good test because there is a
-	race and lock dependency with other cgroup subsystems.
-
-	example)
-	# mount -t cgroup none /cgroup -o cpuset,memory,cpu,devices
-
-	and do task move, mkdir, rmdir etc...under this.
-
- 9.7 swapoff.
-	Besides management of swap is one of complicated parts of memcg,
-	call path of swap-in at swapoff is not same as usual swap-in path..
-	It's worth to be tested explicitly.
-
-	For example, test like following is good.
-	(Shell-A)
-	# mount -t cgroup none /cgroup -o memory
-	# mkdir /cgroup/test
-	# echo 40M > /cgroup/test/memory.limit_in_bytes
-	# echo 0 > /cgroup/test/tasks
-	Run malloc(100M) program under this. You'll see 60M of swaps.
-	(Shell-B)
-	# move all tasks in /cgroup/test to /cgroup
-	# /sbin/swapoff -a
-	# rmdir /cgroup/test
-	# kill malloc task.
-
-	Of course, tmpfs v.s. swapoff test should be tested, too.
-
- 9.8 OOM-Killer
-	Out-of-memory caused by memcg's limit will kill tasks under
-	the memcg. When hierarchy is used, a task under hierarchy
-	will be killed by the kernel.
-	In this case, panic_on_oom shouldn't be invoked and tasks
-	in other groups shouldn't be killed.
-
-	It's not difficult to cause OOM under memcg as following.
-	Case A) when you can swapoff
-	#swapoff -a
-	#echo 50M > /memory.limit_in_bytes
-	run 51M of malloc
-
-	Case B) when you use mem+swap limitation.
-	#echo 50M > memory.limit_in_bytes
-	#echo 50M > memory.memsw.limit_in_bytes
-	run 51M of malloc
-
- 9.9 Move charges at task migration
-	Charges associated with a task can be moved along with task migration.
-
-	(Shell-A)
-	#mkdir /cgroup/A
-	#echo $$ >/cgroup/A/tasks
-	run some programs which uses some amount of memory in /cgroup/A.
-
-	(Shell-B)
-	#mkdir /cgroup/B
-	#echo 1 >/cgroup/B/memory.move_charge_at_immigrate
-	#echo "pid of the program running in group A" >/cgroup/B/tasks
-
-	You can see charges have been moved by reading *.usage_in_bytes or
-	memory.stat of both A and B.
-	See 8.2 of Documentation/cgroup-v1/memory.txt to see what value should be
-	written to move_charge_at_immigrate.
-
- 9.10 Memory thresholds
-	Memory controller implements memory thresholds using cgroups notification
-	API. You can use tools/cgroup/cgroup_event_listener.c to test it.
-
-	(Shell-A) Create cgroup and run event listener
-	# mkdir /cgroup/A
-	# ./cgroup_event_listener /cgroup/A/memory.usage_in_bytes 5M
-
-	(Shell-B) Add task to cgroup and try to allocate and free memory
-	# echo $$ >/cgroup/A/tasks
-	# a="$(dd if=/dev/zero bs=1M count=10)"
-	# a=
-
-	You will see message from cgroup_event_listener every time you cross
-	the thresholds.
-
-	Use /cgroup/A/memory.memsw.usage_in_bytes to test memsw thresholds.
-
-	It's good idea to test root cgroup as well.
diff --git a/Documentation/cgroup-v1/memory.rst b/Documentation/cgroup-v1/memory.rst
new file mode 100644
index 000000000000..41bdc038dad9
--- /dev/null
+++ b/Documentation/cgroup-v1/memory.rst
@@ -0,0 +1,1003 @@
+==========================
+Memory Resource Controller
+==========================
+
+NOTE:
+      This document is hopelessly outdated and it asks for a complete
+      rewrite. It still contains useful information so we are keeping it
+      here but make sure to check the current code if you need a deeper
+      understanding.
+
+NOTE:
+      The Memory Resource Controller has generically been referred to as the
+      memory controller in this document. Do not confuse memory controller
+      used here with the memory controller that is used in hardware.
+
+(For editors) In this document:
+      When we mention a cgroup (cgroupfs's directory) with memory controller,
+      we call it "memory cgroup". When you see git-log and source code, you'll
+      see patch's title and function names tend to use "memcg".
+      In this document, we avoid using it.
+
+Benefits and Purpose of the memory controller
+=============================================
+
+The memory controller isolates the memory behaviour of a group of tasks
+from the rest of the system. The article on LWN [12] mentions some probable
+uses of the memory controller. The memory controller can be used to
+
+a. Isolate an application or a group of applications
+   Memory-hungry applications can be isolated and limited to a smaller
+   amount of memory.
+b. Create a cgroup with a limited amount of memory; this can be used
+   as a good alternative to booting with mem=XXXX.
+c. Virtualization solutions can control the amount of memory they want
+   to assign to a virtual machine instance.
+d. A CD/DVD burner could control the amount of memory used by the
+   rest of the system to ensure that burning does not fail due to lack
+   of available memory.
+e. There are several other use cases; find one or use the controller just
+   for fun (to learn and hack on the VM subsystem).
+
+Current Status: linux-2.6.34-mmotm(development version of 2010/April)
+
+Features:
+
+ - accounting anonymous pages, file caches, swap caches usage and limiting them.
+ - pages are linked to per-memcg LRU exclusively, and there is no global LRU.
+ - optionally, memory+swap usage can be accounted and limited.
+ - hierarchical accounting
+ - soft limit
+ - moving (recharging) account at moving a task is selectable.
+ - usage threshold notifier
+ - memory pressure notifier
+ - oom-killer disable knob and oom-notifier
+ - Root cgroup has no limit controls.
+
+ Kernel memory support is a work in progress, and the current version provides
+ basic functionality. (See Section 2.7)
+
+Brief summary of control files.
+
+==================================== ==========================================
+ tasks				     attach a task(thread) and show list of
+				     threads
+ cgroup.procs			     show list of processes
+ cgroup.event_control		     an interface for event_fd()
+ memory.usage_in_bytes		     show current usage for memory
+				     (See 5.5 for details)
+ memory.memsw.usage_in_bytes	     show current usage for memory+Swap
+				     (See 5.5 for details)
+ memory.limit_in_bytes		     set/show limit of memory usage
+ memory.memsw.limit_in_bytes	     set/show limit of memory+Swap usage
+ memory.failcnt			     show the number of memory usage hits limits
+ memory.memsw.failcnt		     show the number of memory+Swap hits limits
+ memory.max_usage_in_bytes	     show max memory usage recorded
+ memory.memsw.max_usage_in_bytes     show max memory+Swap usage recorded
+ memory.soft_limit_in_bytes	     set/show soft limit of memory usage
+ memory.stat			     show various statistics
+ memory.use_hierarchy		     set/show hierarchical account enabled
+ memory.force_empty		     trigger forced page reclaim
+ memory.pressure_level		     set memory pressure notifications
+ memory.swappiness		     set/show swappiness parameter of vmscan
+				     (See sysctl's vm.swappiness)
+ memory.move_charge_at_immigrate     set/show controls of moving charges
+ memory.oom_control		     set/show oom controls.
+ memory.numa_stat		     show the number of memory usage per numa
+				     node
+
+ memory.kmem.limit_in_bytes          set/show hard limit for kernel memory
+ memory.kmem.usage_in_bytes          show current kernel memory allocation
+ memory.kmem.failcnt                 show the number of kernel memory usage
+				     hits limits
+ memory.kmem.max_usage_in_bytes      show max kernel memory usage recorded
+
+ memory.kmem.tcp.limit_in_bytes      set/show hard limit for tcp buf memory
+ memory.kmem.tcp.usage_in_bytes      show current tcp buf memory allocation
+ memory.kmem.tcp.failcnt             show the number of tcp buf memory usage
+				     hits limits
+ memory.kmem.tcp.max_usage_in_bytes  show max tcp buf memory usage recorded
+==================================== ==========================================
+
+1. History
+==========
+
+The memory controller has a long history. A request for comments for the memory
+controller was posted by Balbir Singh [1]. At the time the RFC was posted
+there were several implementations for memory control. The goal of the
+RFC was to build consensus and agreement for the minimal features required
+for memory control. The first RSS controller was posted by Balbir Singh[2]
+in Feb 2007. Pavel Emelianov [3][4][5] has since posted three versions of the
+RSS controller. At OLS, at the resource management BoF, everyone suggested
+that we handle both page cache and RSS together. Another request was raised
+to allow user space handling of OOM. The current memory controller is
+at version 6; it combines both mapped (RSS) and unmapped Page
+Cache Control [11].
+
+2. Memory Control
+=================
+
+Memory is a unique resource in the sense that it is present in a limited
+amount. If a task requires a lot of CPU processing, the task can spread
+its processing over a period of hours, days, months or years, but with
+memory, the same physical memory needs to be reused to accomplish the task.
+
+The memory controller implementation has been divided into phases. These
+are:
+
+1. Memory controller
+2. mlock(2) controller
+3. Kernel user memory accounting and slab control
+4. user mappings length controller
+
+The memory controller is the first controller developed.
+
+2.1. Design
+-----------
+
+The core of the design is a counter called the page_counter. The
+page_counter tracks the current memory usage and limit of the group of
+processes associated with the controller. Each cgroup has a memory controller
+specific data structure (mem_cgroup) associated with it.
+
+2.2. Accounting
+---------------
+
+::
+
+		+--------------------+
+		|  mem_cgroup        |
+		|  (page_counter)    |
+		+--------------------+
+		 /            ^      \
+		/             |       \
+           +---------------+  |        +---------------+
+           | mm_struct     |  |....    | mm_struct     |
+           |               |  |        |               |
+           +---------------+  |        +---------------+
+                              |
+                              + --------------+
+                                              |
+           +---------------+           +------+--------+
+           | page          +---------->| page_cgroup   |
+           |               |           |               |
+           +---------------+           +---------------+
+
+             (Figure 1: Hierarchy of Accounting)
+
+
+Figure 1 shows the important aspects of the controller
+
+1. Accounting happens per cgroup
+2. Each mm_struct knows about which cgroup it belongs to
+3. Each page has a pointer to the page_cgroup, which in turn knows the
+   cgroup it belongs to
+
+The accounting is done as follows: mem_cgroup_charge_common() is invoked to
+set up the necessary data structures and check if the cgroup that is being
+charged is over its limit. If it is, then reclaim is invoked on the cgroup.
+More details can be found in the reclaim section of this document.
+If everything goes well, a page meta-data-structure called page_cgroup is
+updated. page_cgroup has its own LRU on cgroup.
+(*) page_cgroup structure is allocated at boot/memory-hotplug time.
+
+2.2.1 Accounting details
+------------------------
+
+All mapped anon pages (RSS) and cache pages (Page Cache) are accounted.
+Some pages which are never reclaimable and will not be on the LRU
+are not accounted. We just account pages under usual VM management.
+
+RSS pages are accounted at page_fault unless they've already been accounted
+for earlier. A file page will be accounted for as Page Cache when it's
+inserted into inode (radix-tree). While it's mapped into the page tables of
+processes, duplicate accounting is carefully avoided.
+
+An RSS page is unaccounted when it's fully unmapped. A PageCache page is
+unaccounted when it's removed from radix-tree. Even if RSS pages are fully
+unmapped (by kswapd), they may exist as SwapCache in the system until they
+are really freed. Such SwapCaches are also accounted.
+A swapped-in page is not accounted until it's mapped.
+
+Note: The kernel does swapin-readahead and reads multiple swaps at once.
+This means swapped-in pages may contain pages for other tasks than a task
+causing page fault. So, we avoid accounting at swap-in I/O.
+
+At page migration, accounting information is kept.
+
+Note: we just account pages-on-LRU because our purpose is to control amount
+of used pages; not-on-LRU pages tend to be out-of-control from VM view.
+
+2.3 Shared Page Accounting
+--------------------------
+
+Shared pages are accounted on the basis of the first touch approach. The
+cgroup that first touches a page is accounted for the page. The principle
+behind this approach is that a cgroup that aggressively uses a shared
+page will eventually get charged for it (once it is uncharged from
+the cgroup that brought it in -- this will happen on memory pressure).
+
+But see section 8.2: when moving a task to another cgroup, its pages may
+be recharged to the new cgroup, if move_charge_at_immigrate has been chosen.
+
+Exception: If CONFIG_MEMCG_SWAP is not used.
+When you do swapoff and make swapped-out pages of shmem(tmpfs) to
+be backed into memory in force, charges for pages are accounted against the
+caller of swapoff rather than the users of shmem.
+
+2.4 Swap Extension (CONFIG_MEMCG_SWAP)
+--------------------------------------
+
+Swap Extension allows you to record charge for swap. A swapped-in page is
+charged back to original page allocator if possible.
+
+When swap is accounted, following files are added.
+
+ - memory.memsw.usage_in_bytes.
+ - memory.memsw.limit_in_bytes.
+
+memsw means memory+swap. Usage of memory+swap is limited by
+memsw.limit_in_bytes.
+
+Example: Assume a system with 4G of swap. A task which allocates 6G of memory
+(by mistake) under 2G memory limitation will use all swap.
+In this case, setting memsw.limit_in_bytes=3G will prevent bad use of swap.
+By using the memsw limit, you can avoid system OOM which can be caused by swap
+shortage.
+
+**why 'memory+swap' rather than swap**
+
+The global LRU(kswapd) can swap out arbitrary pages. Swap-out means
+to move account from memory to swap...there is no change in usage of
+memory+swap. In other words, when we want to limit the usage of swap without
+affecting global LRU, memory+swap limit is better than just limiting swap from
+an OS point of view.
+
+**What happens when a cgroup hits memory.memsw.limit_in_bytes**
+
+When a cgroup hits memory.memsw.limit_in_bytes, it's useless to do swap-out
+in this cgroup. Then, swap-out will not be done by cgroup routine and file
+caches are dropped. But as mentioned above, global LRU can swap out memory
+from it for sanity of the system's memory management state. You can't forbid
+it by cgroup.
+
+2.5 Reclaim
+-----------
+
+Each cgroup maintains a per cgroup LRU which has the same structure as
+global VM. When a cgroup goes over its limit, we first try
+to reclaim memory from the cgroup so as to make space for the new
+pages that the cgroup has touched. If the reclaim is unsuccessful,
+an OOM routine is invoked to select and kill the bulkiest task in the
+cgroup. (See 10. OOM Control below.)
+
+The reclaim algorithm has not been modified for cgroups, except that
+pages that are selected for reclaiming come from the per-cgroup LRU
+list.
+
+NOTE:
+  Reclaim does not work for the root cgroup, since we cannot set any
+  limits on the root cgroup.
+
+Note2:
+  When panic_on_oom is set to "2", the whole system will panic.
+
+When oom event notifier is registered, event will be delivered.
+(See oom_control section)
+
+2.6 Locking
+-----------
+
+   lock_page_cgroup()/unlock_page_cgroup() should not be called under
+   the i_pages lock.
+
+   Other lock order is following:
+
+   PG_locked.
+     mm->page_table_lock
+         pgdat->lru_lock
+	   lock_page_cgroup.
+
+  In many cases, just lock_page_cgroup() is called.
+
+  per-zone-per-cgroup LRU (cgroup's private LRU) is just guarded by
+  pgdat->lru_lock, it has no lock of its own.
+
+2.7 Kernel Memory Extension (CONFIG_MEMCG_KMEM)
+-----------------------------------------------
+
+With the Kernel memory extension, the Memory Controller is able to limit
+the amount of kernel memory used by the system. Kernel memory is fundamentally
+different than user memory, since it can't be swapped out, which makes it
+possible to DoS the system by consuming too much of this precious resource.
+
+Kernel memory accounting is enabled for all memory cgroups by default. But
+it can be disabled system-wide by passing cgroup.memory=nokmem to the kernel
+at boot time. In this case, kernel memory will not be accounted at all.
+
+Kernel memory limits are not imposed for the root cgroup. Usage for the root
+cgroup may or may not be accounted. The memory used is accumulated into
+memory.kmem.usage_in_bytes, or in a separate counter when it makes sense.
+(currently only for tcp).
+
+The main "kmem" counter is fed into the main counter, so kmem charges will
+also be visible from the user counter.
+
+Currently no soft limit is implemented for kernel memory. It is future work
+to trigger slab reclaim when those limits are reached.
+
+2.7.1 Current Kernel Memory resources accounted
+-----------------------------------------------
+
+stack pages:
+  every process consumes some stack pages. By accounting into
+  kernel memory, we prevent new processes from being created when the kernel
+  memory usage is too high.
+
+slab pages:
+  pages allocated by the SLAB or SLUB allocator are tracked. A copy
+  of each kmem_cache is created the first time the cache is touched
+  from inside the memcg. The creation is done lazily, so some objects can still be
+  skipped while the cache is being created. All objects in a slab page should
+  belong to the same memcg. This only fails to hold when a task is migrated to a
+  different memcg during the page allocation by the cache.
+
+sockets memory pressure:
+  some sockets protocols have memory pressure
+  thresholds. The Memory Controller allows them to be controlled individually
+  per cgroup, instead of globally.
+
+tcp memory pressure:
+  sockets memory pressure for the tcp protocol.
+
+2.7.2 Common use cases
+----------------------
+
+Because the "kmem" counter is fed to the main user counter, kernel memory can
+never be limited completely independently of user memory. Say "U" is the user
+limit, and "K" the kernel limit. There are three possible ways limits can be
+set:
+
+U != 0, K = unlimited:
+    This is the standard memcg limitation mechanism already present before kmem
+    accounting. Kernel memory is completely ignored.
+
+U != 0, K < U:
+    Kernel memory is a subset of the user memory. This setup is useful in
+    deployments where the total amount of memory per-cgroup is overcommitted.
+    Overcommitting kernel memory limits is definitely not recommended, since the
+    box can still run out of non-reclaimable memory.
+    In this case, the admin could set up K so that the sum of all groups is
+    never greater than the total memory, and freely set U at the cost of his
+    QoS.
+
+WARNING:
+    In the current implementation, memory reclaim will NOT be
+    triggered for a cgroup when it hits K while staying below U, which makes
+    this setup impractical.
+
+U != 0, K >= U:
+    Since kmem charges will also be fed to the user counter, reclaim will be
+    triggered for the cgroup for both kinds of memory. This setup gives the
+    admin a unified view of memory, and it is also useful for people who just
+    want to track kernel memory usage.
+
+3. User Interface
+=================
+
+3.0. Configuration
+------------------
+
+a. Enable CONFIG_CGROUPS
+b. Enable CONFIG_MEMCG
+c. Enable CONFIG_MEMCG_SWAP (to use swap extension)
+d. Enable CONFIG_MEMCG_KMEM (to use kmem extension)
+
+3.1. Prepare the cgroups (see cgroups.txt, Why are cgroups needed?)
+-------------------------------------------------------------------
+
+::
+
+	# mount -t tmpfs none /sys/fs/cgroup
+	# mkdir /sys/fs/cgroup/memory
+	# mount -t cgroup none /sys/fs/cgroup/memory -o memory
+
+3.2. Make the new group and move bash into it::
+
+	# mkdir /sys/fs/cgroup/memory/0
+	# echo $$ > /sys/fs/cgroup/memory/0/tasks
+
+Since now we're in the 0 cgroup, we can alter the memory limit::
+
+	# echo 4M > /sys/fs/cgroup/memory/0/memory.limit_in_bytes
+
+NOTE:
+  We can use a suffix (k, K, m, M, g or G) to indicate values in kilo,
+  mega or gigabytes. (Here, Kilo, Mega, Giga are Kibibytes, Mebibytes,
+  Gibibytes.)
+
+NOTE:
+  We can write "-1" to reset the ``*.limit_in_bytes`` (unlimited).
+
+NOTE:
+  We cannot set limits on the root cgroup any more.
+
+::
+
+  # cat /sys/fs/cgroup/memory/0/memory.limit_in_bytes
+  4194304
+
+We can check the usage::
+
+  # cat /sys/fs/cgroup/memory/0/memory.usage_in_bytes
+  1216512
+
+A successful write to this file does not guarantee a successful setting of
+this limit to the value written into the file. This can be due to a
+number of factors, such as rounding up to page boundaries or the total
+availability of memory on the system. The user is required to re-read
+this file after a write to guarantee the value committed by the kernel::
+
+  # echo 1 > memory.limit_in_bytes
+  # cat memory.limit_in_bytes
+  4096
+
+The memory.failcnt field gives the number of times that the cgroup limit was
+exceeded.
+
+The memory.stat file gives accounting information. Now, the number of
+caches, RSS and Active pages/Inactive pages are shown.
+
+4. Testing
+==========
+
+For testing features and implementation, see memcg_test.rst.
+
+Performance test is also important. To see pure memory controller's overhead,
+testing on tmpfs will give you good numbers of small overheads.
+Example: do kernel make on tmpfs.
+
+Page-fault scalability is also important. When measuring parallel
+page faults, a multi-process test may be better than a multi-thread
+test because the latter has noise from shared objects/status.
+
+But the above two are testing extreme situations.
+Trying usual test under memory controller is always helpful.
+
+4.1 Troubleshooting
+-------------------
+
+Sometimes a user might find that the application under a cgroup is
+terminated by the OOM killer. There are several causes for this:
+
+1. The cgroup limit is too low (just too low to do anything useful)
+2. The user is using anonymous memory and swap is turned off or too low
+
+A sync followed by echo 1 > /proc/sys/vm/drop_caches will help get rid of
+some of the pages cached in the cgroup (page cache pages).
+
+To know what happens, disabling OOM_Kill as per "10. OOM Control" (below) and
+seeing what happens will be helpful.
+
+4.2 Task migration
+------------------
+
+When a task migrates from one cgroup to another, its charge is not
+carried forward by default. The pages allocated from the original cgroup still
+remain charged to it, the charge is dropped when the page is freed or
+reclaimed.
+
+You can move charges of a task along with task migration.
+See 8. "Move charges at task migration"
+
+4.3 Removing a cgroup
+---------------------
+
+A cgroup can be removed by rmdir, but as discussed in sections 4.1 and 4.2, a
+cgroup might have some charge associated with it, even though all
+tasks have migrated away from it. (because we charge against pages, not
+against tasks.)
+
+We move the stats to root (if use_hierarchy==0) or parent (if
+use_hierarchy==1), and no change on the charge except uncharging
+from the child.
+
+Charges recorded in swap information are not updated at removal of cgroup.
+Recorded information is discarded and a cgroup which uses swap (swapcache)
+will be charged as a new owner of it.
+
+About use_hierarchy, see Section 6.
+
+5. Misc. interfaces
+===================
+
+5.1 force_empty
+---------------
+  memory.force_empty interface is provided to make cgroup's memory usage empty.
+  When writing anything to this::
+
+    # echo 0 > memory.force_empty
+
+  the cgroup will be reclaimed and as many pages reclaimed as possible.
+
+  The typical use case for this interface is before calling rmdir().
+  Though rmdir() offlines memcg, the memcg may still stay there due to
+  charged file caches. Some out-of-use page caches may keep charged until
+  memory pressure happens. If you want to avoid that, force_empty will be useful.
+
+  Also, note that when memory.kmem.limit_in_bytes is set the charges due to
+  kernel pages will still be seen. This is not considered a failure and the
+  write will still return success. In this case, it is expected that
+  memory.kmem.usage_in_bytes == memory.usage_in_bytes.
+
+  About use_hierarchy, see Section 6.
+
+5.2 stat file
+-------------
+
+memory.stat file includes following statistics
+
+per-memory cgroup local status
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+=============== ===============================================================
+cache		# of bytes of page cache memory.
+rss		# of bytes of anonymous and swap cache memory (includes
+		transparent hugepages).
+rss_huge	# of bytes of anonymous transparent hugepages.
+mapped_file	# of bytes of mapped file (includes tmpfs/shmem)
+pgpgin		# of charging events to the memory cgroup. The charging
+		event happens each time a page is accounted as either mapped
+		anon page(RSS) or cache page(Page Cache) to the cgroup.
+pgpgout		# of uncharging events to the memory cgroup. The uncharging
+		event happens each time a page is unaccounted from the cgroup.
+swap		# of bytes of swap usage
+dirty		# of bytes that are waiting to get written back to the disk.
+writeback	# of bytes of file/anon cache that are queued for syncing to
+		disk.
+inactive_anon	# of bytes of anonymous and swap cache memory on inactive
+		LRU list.
+active_anon	# of bytes of anonymous and swap cache memory on active
+		LRU list.
+inactive_file	# of bytes of file-backed memory on inactive LRU list.
+active_file	# of bytes of file-backed memory on active LRU list.
+unevictable	# of bytes of memory that cannot be reclaimed (mlocked etc).
+=============== ===============================================================
+
+status considering hierarchy (see memory.use_hierarchy settings)
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+========================= ===================================================
+hierarchical_memory_limit # of bytes of memory limit with regard to hierarchy
+			  under which the memory cgroup is
+hierarchical_memsw_limit  # of bytes of memory+swap limit with regard to
+			  hierarchy under which memory cgroup is.
+
+total_<counter>		  # hierarchical version of <counter>, which in
+			  addition to the cgroup's own value includes the
+			  sum of all hierarchical children's values of
+			  <counter>, i.e. total_cache
+========================= ===================================================
+
+The following additional stats are dependent on CONFIG_DEBUG_VM
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+========================= ========================================
+recent_rotated_anon	  VM internal parameter. (see mm/vmscan.c)
+recent_rotated_file	  VM internal parameter. (see mm/vmscan.c)
+recent_scanned_anon	  VM internal parameter. (see mm/vmscan.c)
+recent_scanned_file	  VM internal parameter. (see mm/vmscan.c)
+========================= ========================================
+
+Memo:
+	recent_rotated means recent frequency of LRU rotation.
+	recent_scanned means recent # of scans to LRU.
+	These are shown for easier debugging; please see the code for their meanings.
+
+Note:
+	Only anonymous and swap cache memory is listed as part of 'rss' stat.
+	This should not be confused with the true 'resident set size' or the
+	amount of physical memory used by the cgroup.
+
+	'rss + mapped_file' will give you resident set size of cgroup.
+
+	(Note: file and shmem may be shared among other cgroups. In that case,
+	mapped_file is accounted only when the memory cgroup is owner of page
+	cache.)
+
+5.3 swappiness
+--------------
+
+Overrides /proc/sys/vm/swappiness for the particular group. The tunable
+in the root cgroup corresponds to the global swappiness setting.
+
+Please note that unlike during the global reclaim, limit reclaim
+enforces that 0 swappiness really prevents from any swapping even if
+there is a swap storage available. This might lead to memcg OOM killer
+if there are no file pages to reclaim.
+
+5.4 failcnt
+-----------
+
+A memory cgroup provides memory.failcnt and memory.memsw.failcnt files.
+This failcnt(== failure count) shows the number of times that a usage counter
+hit its limit. When a memory cgroup hits a limit, failcnt increases and
+memory under it will be reclaimed.
+
+You can reset failcnt by writing 0 to failcnt file::
+
+	# echo 0 > .../memory.failcnt
+
+5.5 usage_in_bytes
+------------------
+
+For efficiency, as other kernel components, memory cgroup uses some optimization
+to avoid unnecessary cacheline false sharing. usage_in_bytes is affected by the
+method and doesn't show 'exact' value of memory (and swap) usage, it's a fuzzy
+value for efficient access. (Of course, when necessary, it's synchronized.)
+If you want to know more exact memory usage, you should use RSS+CACHE(+SWAP)
+value in memory.stat(see 5.2).
+
+5.6 numa_stat
+-------------
+
+This is similar to numa_maps but operates on a per-memcg basis.  This is
+useful for providing visibility into the numa locality information within
+a memcg since the pages are allowed to be allocated from any physical
+node.  One of the use cases is evaluating application performance by
+combining this information with the application's CPU allocation.
+
+Each memcg's numa_stat file includes "total", "file", "anon" and "unevictable"
+per-node page counts including "hierarchical_<counter>" which sums up all
+hierarchical children's values in addition to the memcg's own value.
+
+The output format of memory.numa_stat is::
+
+  total=<total pages> N0=<node 0 pages> N1=<node 1 pages> ...
+  file=<total file pages> N0=<node 0 pages> N1=<node 1 pages> ...
+  anon=<total anon pages> N0=<node 0 pages> N1=<node 1 pages> ...
+  unevictable=<total unevictable pages> N0=<node 0 pages> N1=<node 1 pages> ...
+  hierarchical_<counter>=<counter pages> N0=<node 0 pages> N1=<node 1 pages> ...
+
+The "total" count is sum of file + anon + unevictable.
+
+6. Hierarchy support
+====================
+
+The memory controller supports a deep hierarchy and hierarchical accounting.
+The hierarchy is created by creating the appropriate cgroups in the
+cgroup filesystem. Consider for example, the following cgroup filesystem
+hierarchy::
+
+	       root
+	     /  |   \
+            /	|    \
+	   a	b     c
+		      | \
+		      |  \
+		      d   e
+
+In the diagram above, with hierarchical accounting enabled, all memory
+usage of e is accounted to its ancestors up until the root (i.e., c and root),
+that has memory.use_hierarchy enabled. If one of the ancestors goes over its
+limit, the reclaim algorithm reclaims from the tasks in the ancestor and the
+children of the ancestor.
+
+6.1 Enabling hierarchical accounting and reclaim
+------------------------------------------------
+
+A memory cgroup by default disables the hierarchy feature. Support
+can be enabled by writing 1 to memory.use_hierarchy file of the root cgroup::
+
+	# echo 1 > memory.use_hierarchy
+
+The feature can be disabled by::
+
+	# echo 0 > memory.use_hierarchy
+
+NOTE1:
+       Enabling/disabling will fail if either the cgroup already has other
+       cgroups created below it, or if the parent cgroup has use_hierarchy
+       enabled.
+
+NOTE2:
+       When panic_on_oom is set to "2", the whole system will panic in
+       case of an OOM event in any cgroup.
+
+7. Soft limits
+==============
+
+Soft limits allow for greater sharing of memory. The idea behind soft limits
+is to allow control groups to use as much of the memory as needed, provided
+
+a. There is no memory contention
+b. They do not exceed their hard limit
+
+When the system detects memory contention or low memory, control groups
+are pushed back to their soft limits. If the soft limit of each control
+group is very high, they are pushed back as much as possible to make
+sure that one control group does not starve the others of memory.
+
+Please note that soft limits is a best-effort feature; it comes with
+no guarantees, but it does its best to make sure that when memory is
+heavily contended for, memory is allocated based on the soft limit
+hints/setup. Currently soft limit based reclaim is set up such that
+it gets invoked from balance_pgdat (kswapd).
+
+7.1 Interface
+-------------
+
+Soft limits can be setup by using the following commands (in this example we
+assume a soft limit of 256 MiB)::
+
+	# echo 256M > memory.soft_limit_in_bytes
+
+If we want to change this to 1G, we can at any time use::
+
+	# echo 1G > memory.soft_limit_in_bytes
+
+NOTE1:
+       Soft limits take effect over a long period of time, since they involve
+       reclaiming memory for balancing between memory cgroups
+NOTE2:
+       It is recommended to set the soft limit always below the hard limit,
+       otherwise the hard limit will take precedence.
+
+8. Move charges at task migration
+=================================
+
+Users can move charges associated with a task along with task migration, that
+is, uncharge task's pages from the old cgroup and charge them to the new cgroup.
+This feature is not supported in !CONFIG_MMU environments because of lack of
+page tables.
+
+8.1 Interface
+-------------
+
+This feature is disabled by default. It can be enabled (and disabled again) by
+writing to memory.move_charge_at_immigrate of the destination cgroup.
+
+If you want to enable it::
+
+	# echo (some positive value) > memory.move_charge_at_immigrate
+
+Note:
+      Each bit of move_charge_at_immigrate has its own meaning about what type
+      of charges should be moved. See 8.2 for details.
+Note:
+      Charges are moved only when you move mm->owner, in other words,
+      a leader of a thread group.
+Note:
+      If we cannot find enough space for the task in the destination cgroup, we
+      try to make space by reclaiming memory. Task migration may fail if we
+      cannot make enough space.
+Note:
+      It can take several seconds if you move many charges.
+
+And if you want to disable it again::
+
+	# echo 0 > memory.move_charge_at_immigrate
+
+8.2 Type of charges which can be moved
+--------------------------------------
+
+Each bit in move_charge_at_immigrate has its own meaning about what type of
+charges should be moved. But in any case, it must be noted that an account of
+a page or a swap can be moved only when it is charged to the task's current
+(old) memory cgroup.
+
++---+--------------------------------------------------------------------------+
+|bit| what type of charges would be moved ?                                    |
++===+==========================================================================+
+| 0 | A charge of an anonymous page (or swap of it) used by the target task.   |
+|   | You must enable Swap Extension (see 2.4) to enable move of swap charges. |
++---+--------------------------------------------------------------------------+
+| 1 | A charge of file pages (normal file, tmpfs file (e.g. ipc shared memory) |
+|   | and swaps of tmpfs file) mmapped by the target task. Unlike the case of  |
+|   | anonymous pages, file pages (and swaps) in the range mmapped by the task |
+|   | will be moved even if the task hasn't done page fault, i.e. they might   |
+|   | not be the task's "RSS", but other task's "RSS" that maps the same file. |
+|   | And mapcount of the page is ignored (the page can be moved even if       |
+|   | page_mapcount(page) > 1). You must enable Swap Extension (see 2.4) to    |
+|   | enable move of swap charges.                                             |
++---+--------------------------------------------------------------------------+
+
+8.3 TODO
+--------
+
+- All of moving charge operations are done under cgroup_mutex. It's not good
+  behavior to hold the mutex too long, so we may need some trick.
+
+9. Memory thresholds
+====================
+
+Memory cgroup implements memory thresholds using the cgroups notification
+API (see cgroups.txt). It allows registering multiple memory and memsw
+thresholds and getting notifications when they are crossed.
+
+To register a threshold, an application must:
+
+- create an eventfd using eventfd(2);
+- open memory.usage_in_bytes or memory.memsw.usage_in_bytes;
+- write string like "<event_fd> <fd of memory.usage_in_bytes> <threshold>" to
+  cgroup.event_control.
+
+Application will be notified through eventfd when memory usage crosses
+threshold in any direction.
+
+It's applicable for root and non-root cgroup.
+
+10. OOM Control
+===============
+
+memory.oom_control file is for OOM notification and other controls.
+
+Memory cgroup implements OOM notifier using the cgroup notification
+API (See cgroups.txt). It allows registering multiple OOM notification
+deliveries and getting notified when OOM happens.
+
+To register a notifier, an application must:
+
+ - create an eventfd using eventfd(2)
+ - open memory.oom_control file
+ - write string like "<event_fd> <fd of memory.oom_control>" to
+   cgroup.event_control
+
+The application will be notified through eventfd when OOM happens.
+OOM notification doesn't work for the root cgroup.
+
+You can disable the OOM-killer by writing "1" to memory.oom_control file, as::
+
+	#echo 1 > memory.oom_control
+
+If OOM-killer is disabled, tasks under cgroup will hang/sleep
+in memory cgroup's OOM-waitqueue when they request accountable memory.
+
+For running them, you have to relax the memory cgroup's OOM status by
+
+	* enlarge limit or reduce usage.
+
+To reduce usage,
+
+	* kill some tasks.
+	* move some tasks to other group with account migration.
+	* remove some files (on tmpfs?)
+
+Then, stopped tasks will work again.
+
+At reading, current status of OOM is shown.
+
+	- oom_kill_disable 0 or 1
+	  (if 1, oom-killer is disabled)
+	- under_oom	   0 or 1
+	  (if 1, the memory cgroup is under OOM, tasks may be stopped.)
+
+11. Memory Pressure
+===================
+
+The pressure level notifications can be used to monitor the memory
+allocation cost; based on the pressure, applications can implement
+different strategies of managing their memory resources. The pressure
+levels are defined as follows:
+
+The "low" level means that the system is reclaiming memory for new
+allocations. Monitoring this reclaiming activity might be useful for
+maintaining cache level. Upon notification, the program (typically
+"Activity Manager") might analyze vmstat and act in advance (i.e.
+prematurely shutdown unimportant services).
+
+The "medium" level means that the system is experiencing medium memory
+pressure, the system might be making swap, paging out active file caches,
+etc. Upon this event applications may decide to further analyze
+vmstat/zoneinfo/memcg or internal memory usage statistics and free any
+resources that can be easily reconstructed or re-read from a disk.
+
+The "critical" level means that the system is actively thrashing, it is
+about to run out of memory (OOM) or even the in-kernel OOM killer is on its
+way to trigger. Applications should do whatever they can to help the
+system. It might be too late to consult with vmstat or any other
+statistics, so it's advisable to take an immediate action.
+
+By default, events are propagated upward until the event is handled, i.e. the
+events are not pass-through. For example, you have three cgroups: A->B->C. Now
+you set up an event listener on cgroups A, B and C, and suppose group C
+experiences some pressure. In this situation, only group C will receive the
+notification, i.e. groups A and B will not receive it. This is done to avoid
+excessive "broadcasting" of messages, which disturbs the system and which is
+especially bad if we are low on memory or thrashing. Group B will receive
+notification only if there are no event listeners for group C.
+
+There are three optional modes that specify different propagation behavior:
+
+ - "default": this is the default behavior specified above. This mode is the
+   same as omitting the optional mode parameter, preserved for backwards
+   compatibility.
+
+ - "hierarchy": events always propagate up to the root, similar to the default
+   behavior, except that propagation continues regardless of whether there are
+   event listeners at each level, with the "hierarchy" mode. In the above
+   example, groups A, B, and C will receive notification of memory pressure.
+
+ - "local": events are pass-through, i.e. they only receive notifications when
+   memory pressure is experienced in the memcg for which the notification is
+   registered. In the above example, group C will receive notification if
+   registered for "local" notification and the group experiences memory
+   pressure. However, group B will never receive notification, regardless if
+   there is an event listener for group C or not, if group B is registered for
+   local notification.
+
+The level and event notification mode ("hierarchy" or "local", if necessary) are
+specified by a comma-delimited string, i.e. "low,hierarchy" specifies
+hierarchical, pass-through, notification for all ancestor memcgs. Notification
+that is the default, non pass-through behavior, does not specify a mode.
+"medium,local" specifies pass-through notification for the medium level.
+
+The file memory.pressure_level is only used to setup an eventfd. To
+register a notification, an application must:
+
+- create an eventfd using eventfd(2);
+- open memory.pressure_level;
+- write string as "<event_fd> <fd of memory.pressure_level> <level[,mode]>"
+  to cgroup.event_control.
+
+Application will be notified through eventfd when memory pressure is at
+the specific level (or higher). Read/write operations to
+memory.pressure_level are not implemented.
+
+Test:
+
+   Here is a small script example that makes a new cgroup, sets up a
+   memory limit, sets up a notification in the cgroup and then makes child
+   cgroup experience a critical pressure::
+
+	# cd /sys/fs/cgroup/memory/
+	# mkdir foo
+	# cd foo
+	# cgroup_event_listener memory.pressure_level low,hierarchy &
+	# echo 8000000 > memory.limit_in_bytes
+	# echo 8000000 > memory.memsw.limit_in_bytes
+	# echo $$ > tasks
+	# dd if=/dev/zero | read x
+
+   (Expect a bunch of notifications, and eventually, the oom-killer will
+   trigger.)
+
+12. TODO
+========
+
+1. Make per-cgroup scanner reclaim not-shared pages first
+2. Teach controller to account for shared-pages
+3. Start reclamation in the background when the limit is
+   not yet hit but the usage is getting closer
+
+Summary
+=======
+
+Overall, the memory controller has been a stable controller and has been
+commented and discussed quite extensively in the community.
+
+References
+==========
+
+1. Singh, Balbir. RFC: Memory Controller, http://lwn.net/Articles/206697/
+2. Singh, Balbir. Memory Controller (RSS Control),
+   http://lwn.net/Articles/222762/
+3. Emelianov, Pavel. Resource controllers based on process cgroups
+   http://lkml.org/lkml/2007/3/6/198
+4. Emelianov, Pavel. RSS controller based on process cgroups (v2)
+   http://lkml.org/lkml/2007/4/9/78
+5. Emelianov, Pavel. RSS controller based on process cgroups (v3)
+   http://lkml.org/lkml/2007/5/30/244
+6. Menage, Paul. Control Groups v10, http://lwn.net/Articles/236032/
+7. Vaidyanathan, Srinivasan, Control Groups: Pagecache accounting and control
+   subsystem (v3), http://lwn.net/Articles/235534/
+8. Singh, Balbir. RSS controller v2 test results (lmbench),
+   http://lkml.org/lkml/2007/5/17/232
+9. Singh, Balbir. RSS controller v2 AIM9 results
+   http://lkml.org/lkml/2007/5/18/1
+10. Singh, Balbir. Memory controller v6 test results,
+    http://lkml.org/lkml/2007/8/19/36
+11. Singh, Balbir. Memory controller introduction (v6),
+    http://lkml.org/lkml/2007/8/17/69
+12. Corbet, Jonathan, Controlling memory use in cgroups,
+    http://lwn.net/Articles/243795/
diff --git a/Documentation/cgroup-v1/memory.txt b/Documentation/cgroup-v1/memory.txt
deleted file mode 100644
index a33cedf85427..000000000000
--- a/Documentation/cgroup-v1/memory.txt
+++ /dev/null
@@ -1,892 +0,0 @@
-Memory Resource Controller
-
-NOTE: This document is hopelessly outdated and it asks for a complete
-      rewrite. It still contains a useful information so we are keeping it
-      here but make sure to check the current code if you need a deeper
-      understanding.
-
-NOTE: The Memory Resource Controller has generically been referred to as the
-      memory controller in this document. Do not confuse memory controller
-      used here with the memory controller that is used in hardware.
-
-(For editors)
-In this document:
-      When we mention a cgroup (cgroupfs's directory) with memory controller,
-      we call it "memory cgroup". When you see git-log and source code, you'll
-      see patch's title and function names tend to use "memcg".
-      In this document, we avoid using it.
-
-Benefits and Purpose of the memory controller
-
-The memory controller isolates the memory behaviour of a group of tasks
-from the rest of the system. The article on LWN [12] mentions some probable
-uses of the memory controller. The memory controller can be used to
-
-a. Isolate an application or a group of applications
-   Memory-hungry applications can be isolated and limited to a smaller
-   amount of memory.
-b. Create a cgroup with a limited amount of memory; this can be used
-   as a good alternative to booting with mem=XXXX.
-c. Virtualization solutions can control the amount of memory they want
-   to assign to a virtual machine instance.
-d. A CD/DVD burner could control the amount of memory used by the
-   rest of the system to ensure that burning does not fail due to lack
-   of available memory.
-e. There are several other use cases; find one or use the controller just
-   for fun (to learn and hack on the VM subsystem).
-
-Current Status: linux-2.6.34-mmotm(development version of 2010/April)
-
-Features:
- - accounting anonymous pages, file caches, swap caches usage and limiting them.
- - pages are linked to per-memcg LRU exclusively, and there is no global LRU.
- - optionally, memory+swap usage can be accounted and limited.
- - hierarchical accounting
- - soft limit
- - moving (recharging) account at moving a task is selectable.
- - usage threshold notifier
- - memory pressure notifier
- - oom-killer disable knob and oom-notifier
- - Root cgroup has no limit controls.
-
- Kernel memory support is a work in progress, and the current version provides
- basically functionality. (See Section 2.7)
-
-Brief summary of control files.
-
- tasks				 # attach a task(thread) and show list of threads
- cgroup.procs			 # show list of processes
- cgroup.event_control		 # an interface for event_fd()
- memory.usage_in_bytes		 # show current usage for memory
-				 (See 5.5 for details)
- memory.memsw.usage_in_bytes	 # show current usage for memory+Swap
-				 (See 5.5 for details)
- memory.limit_in_bytes		 # set/show limit of memory usage
- memory.memsw.limit_in_bytes	 # set/show limit of memory+Swap usage
- memory.failcnt			 # show the number of memory usage hits limits
- memory.memsw.failcnt		 # show the number of memory+Swap hits limits
- memory.max_usage_in_bytes	 # show max memory usage recorded
- memory.memsw.max_usage_in_bytes # show max memory+Swap usage recorded
- memory.soft_limit_in_bytes	 # set/show soft limit of memory usage
- memory.stat			 # show various statistics
- memory.use_hierarchy		 # set/show hierarchical account enabled
- memory.force_empty		 # trigger forced page reclaim
- memory.pressure_level		 # set memory pressure notifications
- memory.swappiness		 # set/show swappiness parameter of vmscan
-				 (See sysctl's vm.swappiness)
- memory.move_charge_at_immigrate # set/show controls of moving charges
- memory.oom_control		 # set/show oom controls.
- memory.numa_stat		 # show the number of memory usage per numa node
-
- memory.kmem.limit_in_bytes      # set/show hard limit for kernel memory
- memory.kmem.usage_in_bytes      # show current kernel memory allocation
- memory.kmem.failcnt             # show the number of kernel memory usage hits limits
- memory.kmem.max_usage_in_bytes  # show max kernel memory usage recorded
-
- memory.kmem.tcp.limit_in_bytes  # set/show hard limit for tcp buf memory
- memory.kmem.tcp.usage_in_bytes  # show current tcp buf memory allocation
- memory.kmem.tcp.failcnt            # show the number of tcp buf memory usage hits limits
- memory.kmem.tcp.max_usage_in_bytes # show max tcp buf memory usage recorded
-
-1. History
-
-The memory controller has a long history. A request for comments for the memory
-controller was posted by Balbir Singh [1]. At the time the RFC was posted
-there were several implementations for memory control. The goal of the
-RFC was to build consensus and agreement for the minimal features required
-for memory control. The first RSS controller was posted by Balbir Singh[2]
-in Feb 2007. Pavel Emelianov [3][4][5] has since posted three versions of the
-RSS controller. At OLS, at the resource management BoF, everyone suggested
-that we handle both page cache and RSS together. Another request was raised
-to allow user space handling of OOM. The current memory controller is
-at version 6; it combines both mapped (RSS) and unmapped Page
-Cache Control [11].
-
-2. Memory Control
-
-Memory is a unique resource in the sense that it is present in a limited
-amount. If a task requires a lot of CPU processing, the task can spread
-its processing over a period of hours, days, months or years, but with
-memory, the same physical memory needs to be reused to accomplish the task.
-
-The memory controller implementation has been divided into phases. These
-are:
-
-1. Memory controller
-2. mlock(2) controller
-3. Kernel user memory accounting and slab control
-4. user mappings length controller
-
-The memory controller is the first controller developed.
-
-2.1. Design
-
-The core of the design is a counter called the page_counter. The
-page_counter tracks the current memory usage and limit of the group of
-processes associated with the controller. Each cgroup has a memory controller
-specific data structure (mem_cgroup) associated with it.
-
-2.2. Accounting
-
-		+--------------------+
-		|  mem_cgroup        |
-		|  (page_counter)    |
-		+--------------------+
-		 /            ^      \
-		/             |       \
-           +---------------+  |        +---------------+
-           | mm_struct     |  |....    | mm_struct     |
-           |               |  |        |               |
-           +---------------+  |        +---------------+
-                              |
-                              + --------------+
-                                              |
-           +---------------+           +------+--------+
-           | page          +---------->  page_cgroup|
-           |               |           |               |
-           +---------------+           +---------------+
-
-             (Figure 1: Hierarchy of Accounting)
-
-
-Figure 1 shows the important aspects of the controller
-
-1. Accounting happens per cgroup
-2. Each mm_struct knows about which cgroup it belongs to
-3. Each page has a pointer to the page_cgroup, which in turn knows the
-   cgroup it belongs to
-
-The accounting is done as follows: mem_cgroup_charge_common() is invoked to
-set up the necessary data structures and check if the cgroup that is being
-charged is over its limit. If it is, then reclaim is invoked on the cgroup.
-More details can be found in the reclaim section of this document.
-If everything goes well, a page meta-data-structure called page_cgroup is
-updated. page_cgroup has its own LRU on cgroup.
-(*) page_cgroup structure is allocated at boot/memory-hotplug time.
-
-2.2.1 Accounting details
-
-All mapped anon pages (RSS) and cache pages (Page Cache) are accounted.
-Some pages which are never reclaimable and will not be on the LRU
-are not accounted. We just account pages under usual VM management.
-
-RSS pages are accounted at page_fault unless they've already been accounted
-for earlier. A file page will be accounted for as Page Cache when it's
-inserted into inode (radix-tree). While it's mapped into the page tables of
-processes, duplicate accounting is carefully avoided.
-
-An RSS page is unaccounted when it's fully unmapped. A PageCache page is
-unaccounted when it's removed from radix-tree. Even if RSS pages are fully
-unmapped (by kswapd), they may exist as SwapCache in the system until they
-are really freed. Such SwapCaches are also accounted.
-A swapped-in page is not accounted until it's mapped.
-
-Note: The kernel does swapin-readahead and reads multiple swaps at once.
-This means swapped-in pages may contain pages for other tasks than a task
-causing page fault. So, we avoid accounting at swap-in I/O.
-
-At page migration, accounting information is kept.
-
-Note: we just account pages-on-LRU because our purpose is to control amount
-of used pages; not-on-LRU pages tend to be out-of-control from VM view.
-
-2.3 Shared Page Accounting
-
-Shared pages are accounted on the basis of the first touch approach. The
-cgroup that first touches a page is accounted for the page. The principle
-behind this approach is that a cgroup that aggressively uses a shared
-page will eventually get charged for it (once it is uncharged from
-the cgroup that brought it in -- this will happen on memory pressure).
-
-But see section 8.2: when moving a task to another cgroup, its pages may
-be recharged to the new cgroup, if move_charge_at_immigrate has been chosen.
-
-Exception: If CONFIG_MEMCG_SWAP is not used.
-When you do swapoff and make swapped-out pages of shmem(tmpfs) to
-be backed into memory in force, charges for pages are accounted against the
-caller of swapoff rather than the users of shmem.
-
-2.4 Swap Extension (CONFIG_MEMCG_SWAP)
-
-Swap Extension allows you to record charge for swap. A swapped-in page is
-charged back to original page allocator if possible.
-
-When swap is accounted, following files are added.
- - memory.memsw.usage_in_bytes.
- - memory.memsw.limit_in_bytes.
-
-memsw means memory+swap. Usage of memory+swap is limited by
-memsw.limit_in_bytes.
-
-Example: Assume a system with 4G of swap. A task which allocates 6G of memory
-(by mistake) under 2G memory limitation will use all swap.
-In this case, setting memsw.limit_in_bytes=3G will prevent bad use of swap.
-By using the memsw limit, you can avoid system OOM which can be caused by swap
-shortage.
-
-* why 'memory+swap' rather than swap.
-The global LRU(kswapd) can swap out arbitrary pages. Swap-out means
-to move account from memory to swap...there is no change in usage of
-memory+swap. In other words, when we want to limit the usage of swap without
-affecting global LRU, memory+swap limit is better than just limiting swap from
-an OS point of view.
-
-* What happens when a cgroup hits memory.memsw.limit_in_bytes
-When a cgroup hits memory.memsw.limit_in_bytes, it's useless to do swap-out
-in this cgroup. Then, swap-out will not be done by cgroup routine and file
-caches are dropped. But as mentioned above, global LRU can do swapout memory
-from it for sanity of the system's memory management state. You can't forbid
-it by cgroup.
-
-2.5 Reclaim
-
-Each cgroup maintains a per cgroup LRU which has the same structure as
-global VM. When a cgroup goes over its limit, we first try
-to reclaim memory from the cgroup so as to make space for the new
-pages that the cgroup has touched. If the reclaim is unsuccessful,
-an OOM routine is invoked to select and kill the bulkiest task in the
-cgroup. (See 10. OOM Control below.)
-
-The reclaim algorithm has not been modified for cgroups, except that
-pages that are selected for reclaiming come from the per-cgroup LRU
-list.
-
-NOTE: Reclaim does not work for the root cgroup, since we cannot set any
-limits on the root cgroup.
-
-Note2: When panic_on_oom is set to "2", the whole system will panic.
-
-When oom event notifier is registered, event will be delivered.
-(See oom_control section)
-
-2.6 Locking
-
-   lock_page_cgroup()/unlock_page_cgroup() should not be called under
-   the i_pages lock.
-
-   Other lock order is following:
-   PG_locked.
-   mm->page_table_lock
-       pgdat->lru_lock
-	  lock_page_cgroup.
-  In many cases, just lock_page_cgroup() is called.
-  per-zone-per-cgroup LRU (cgroup's private LRU) is just guarded by
-  pgdat->lru_lock, it has no lock of its own.
-
-2.7 Kernel Memory Extension (CONFIG_MEMCG_KMEM)
-
-With the Kernel memory extension, the Memory Controller is able to limit
-the amount of kernel memory used by the system. Kernel memory is fundamentally
-different than user memory, since it can't be swapped out, which makes it
-possible to DoS the system by consuming too much of this precious resource.
-
-Kernel memory accounting is enabled for all memory cgroups by default. But
-it can be disabled system-wide by passing cgroup.memory=nokmem to the kernel
-at boot time. In this case, kernel memory will not be accounted at all.
-
-Kernel memory limits are not imposed for the root cgroup. Usage for the root
-cgroup may or may not be accounted. The memory used is accumulated into
-memory.kmem.usage_in_bytes, or in a separate counter when it makes sense.
-(currently only for tcp).
-The main "kmem" counter is fed into the main counter, so kmem charges will
-also be visible from the user counter.
-
-Currently no soft limit is implemented for kernel memory. It is future work
-to trigger slab reclaim when those limits are reached.
-
-2.7.1 Current Kernel Memory resources accounted
-
-* stack pages: every process consumes some stack pages. By accounting into
-kernel memory, we prevent new processes from being created when the kernel
-memory usage is too high.
-
-* slab pages: pages allocated by the SLAB or SLUB allocator are tracked. A copy
-of each kmem_cache is created every time the cache is touched by the first time
-from inside the memcg. The creation is done lazily, so some objects can still be
-skipped while the cache is being created. All objects in a slab page should
-belong to the same memcg. This only fails to hold when a task is migrated to a
-different memcg during the page allocation by the cache.
-
-* sockets memory pressure: some sockets protocols have memory pressure
-thresholds. The Memory Controller allows them to be controlled individually
-per cgroup, instead of globally.
-
-* tcp memory pressure: sockets memory pressure for the tcp protocol.
-
-2.7.2 Common use cases
-
-Because the "kmem" counter is fed to the main user counter, kernel memory can
-never be limited completely independently of user memory. Say "U" is the user
-limit, and "K" the kernel limit. There are three possible ways limits can be
-set:
-
-    U != 0, K = unlimited:
-    This is the standard memcg limitation mechanism already present before kmem
-    accounting. Kernel memory is completely ignored.
-
-    U != 0, K < U:
-    Kernel memory is a subset of the user memory. This setup is useful in
-    deployments where the total amount of memory per-cgroup is overcommited.
-    Overcommiting kernel memory limits is definitely not recommended, since the
-    box can still run out of non-reclaimable memory.
-    In this case, the admin could set up K so that the sum of all groups is
-    never greater than the total memory, and freely set U at the cost of his
-    QoS.
-    WARNING: In the current implementation, memory reclaim will NOT be
-    triggered for a cgroup when it hits K while staying below U, which makes
-    this setup impractical.
-
-    U != 0, K >= U:
-    Since kmem charges will also be fed to the user counter and reclaim will be
-    triggered for the cgroup for both kinds of memory. This setup gives the
-    admin a unified view of memory, and it is also useful for people who just
-    want to track kernel memory usage.
-
-3. User Interface
-
-3.0. Configuration
-
-a. Enable CONFIG_CGROUPS
-b. Enable CONFIG_MEMCG
-c. Enable CONFIG_MEMCG_SWAP (to use swap extension)
-d. Enable CONFIG_MEMCG_KMEM (to use kmem extension)
-
-3.1. Prepare the cgroups (see cgroups.txt, Why are cgroups needed?)
-# mount -t tmpfs none /sys/fs/cgroup
-# mkdir /sys/fs/cgroup/memory
-# mount -t cgroup none /sys/fs/cgroup/memory -o memory
-
-3.2. Make the new group and move bash into it
-# mkdir /sys/fs/cgroup/memory/0
-# echo $$ > /sys/fs/cgroup/memory/0/tasks
-
-Since now we're in the 0 cgroup, we can alter the memory limit:
-# echo 4M > /sys/fs/cgroup/memory/0/memory.limit_in_bytes
-
-NOTE: We can use a suffix (k, K, m, M, g or G) to indicate values in kilo,
-mega or gigabytes. (Here, Kilo, Mega, Giga are Kibibytes, Mebibytes, Gibibytes.)
-
-NOTE: We can write "-1" to reset the *.limit_in_bytes(unlimited).
-NOTE: We cannot set limits on the root cgroup any more.
-
-# cat /sys/fs/cgroup/memory/0/memory.limit_in_bytes
-4194304
-
-We can check the usage:
-# cat /sys/fs/cgroup/memory/0/memory.usage_in_bytes
-1216512
-
-A successful write to this file does not guarantee a successful setting of
-this limit to the value written into the file. This can be due to a
-number of factors, such as rounding up to page boundaries or the total
-availability of memory on the system. The user is required to re-read
-this file after a write to guarantee the value committed by the kernel.
-
-# echo 1 > memory.limit_in_bytes
-# cat memory.limit_in_bytes
-4096
-
-The memory.failcnt field gives the number of times that the cgroup limit was
-exceeded.
-
-The memory.stat file gives accounting information. Now, the number of
-caches, RSS and Active pages/Inactive pages are shown.
-
-4. Testing
-
-For testing features and implementation, see memcg_test.txt.
-
-Performance test is also important. To see pure memory controller's overhead,
-testing on tmpfs will give you good numbers of small overheads.
-Example: do kernel make on tmpfs.
-
-Page-fault scalability is also important. At measuring parallel
-page fault test, multi-process test may be better than multi-thread
-test because it has noise of shared objects/status.
-
-But the above two are testing extreme situations.
-Trying usual test under memory controller is always helpful.
-
-4.1 Troubleshooting
-
-Sometimes a user might find that the application under a cgroup is
-terminated by the OOM killer. There are several causes for this:
-
-1. The cgroup limit is too low (just too low to do anything useful)
-2. The user is using anonymous memory and swap is turned off or too low
-
-A sync followed by echo 1 > /proc/sys/vm/drop_caches will help get rid of
-some of the pages cached in the cgroup (page cache pages).
-
-To know what happens, disabling OOM_Kill as per "10. OOM Control" (below) and
-seeing what happens will be helpful.
-
-4.2 Task migration
-
-When a task migrates from one cgroup to another, its charge is not
-carried forward by default. The pages allocated from the original cgroup still
-remain charged to it, the charge is dropped when the page is freed or
-reclaimed.
-
-You can move charges of a task along with task migration.
-See 8. "Move charges at task migration"
-
-4.3 Removing a cgroup
-
-A cgroup can be removed by rmdir, but as discussed in sections 4.1 and 4.2, a
-cgroup might have some charge associated with it, even though all
-tasks have migrated away from it. (because we charge against pages, not
-against tasks.)
-
-We move the stats to root (if use_hierarchy==0) or parent (if
-use_hierarchy==1), and no change on the charge except uncharging
-from the child.
-
-Charges recorded in swap information is not updated at removal of cgroup.
-Recorded information is discarded and a cgroup which uses swap (swapcache)
-will be charged as a new owner of it.
-
-About use_hierarchy, see Section 6.
-
-5. Misc. interfaces.
-
-5.1 force_empty
-  memory.force_empty interface is provided to make cgroup's memory usage empty.
-  When writing anything to this
-
-  # echo 0 > memory.force_empty
-
-  the cgroup will be reclaimed and as many pages reclaimed as possible.
-
-  The typical use case for this interface is before calling rmdir().
-  Though rmdir() offlines memcg, but the memcg may still stay there due to
-  charged file caches. Some out-of-use page caches may keep charged until
-  memory pressure happens. If you want to avoid that, force_empty will be useful.
-
-  Also, note that when memory.kmem.limit_in_bytes is set the charges due to
-  kernel pages will still be seen. This is not considered a failure and the
-  write will still return success. In this case, it is expected that
-  memory.kmem.usage_in_bytes == memory.usage_in_bytes.
-
-  About use_hierarchy, see Section 6.
-
-5.2 stat file
-
-memory.stat file includes following statistics
-
-# per-memory cgroup local status
-cache		- # of bytes of page cache memory.
-rss		- # of bytes of anonymous and swap cache memory (includes
-		transparent hugepages).
-rss_huge	- # of bytes of anonymous transparent hugepages.
-mapped_file	- # of bytes of mapped file (includes tmpfs/shmem)
-pgpgin		- # of charging events to the memory cgroup. The charging
-		event happens each time a page is accounted as either mapped
-		anon page(RSS) or cache page(Page Cache) to the cgroup.
-pgpgout		- # of uncharging events to the memory cgroup. The uncharging
-		event happens each time a page is unaccounted from the cgroup.
-swap		- # of bytes of swap usage
-dirty		- # of bytes that are waiting to get written back to the disk.
-writeback	- # of bytes of file/anon cache that are queued for syncing to
-		disk.
-inactive_anon	- # of bytes of anonymous and swap cache memory on inactive
-		LRU list.
-active_anon	- # of bytes of anonymous and swap cache memory on active
-		LRU list.
-inactive_file	- # of bytes of file-backed memory on inactive LRU list.
-active_file	- # of bytes of file-backed memory on active LRU list.
-unevictable	- # of bytes of memory that cannot be reclaimed (mlocked etc).
-
-# status considering hierarchy (see memory.use_hierarchy settings)
-
-hierarchical_memory_limit - # of bytes of memory limit with regard to hierarchy
-			under which the memory cgroup is
-hierarchical_memsw_limit - # of bytes of memory+swap limit with regard to
-			hierarchy under which memory cgroup is.
-
-total_<counter>		- # hierarchical version of <counter>, which in
-			addition to the cgroup's own value includes the
-			sum of all hierarchical children's values of
-			<counter>, i.e. total_cache
-
-# The following additional stats are dependent on CONFIG_DEBUG_VM.
-
-recent_rotated_anon	- VM internal parameter. (see mm/vmscan.c)
-recent_rotated_file	- VM internal parameter. (see mm/vmscan.c)
-recent_scanned_anon	- VM internal parameter. (see mm/vmscan.c)
-recent_scanned_file	- VM internal parameter. (see mm/vmscan.c)
-
-Memo:
-	recent_rotated means recent frequency of LRU rotation.
-	recent_scanned means recent # of scans to LRU.
-	showing for better debug please see the code for meanings.
-
-Note:
-	Only anonymous and swap cache memory is listed as part of 'rss' stat.
-	This should not be confused with the true 'resident set size' or the
-	amount of physical memory used by the cgroup.
-	'rss + mapped_file" will give you resident set size of cgroup.
-	(Note: file and shmem may be shared among other cgroups. In that case,
-	 mapped_file is accounted only when the memory cgroup is owner of page
-	 cache.)
-
-5.3 swappiness
-
-Overrides /proc/sys/vm/swappiness for the particular group. The tunable
-in the root cgroup corresponds to the global swappiness setting.
-
-Please note that unlike during the global reclaim, limit reclaim
-enforces that 0 swappiness really prevents from any swapping even if
-there is a swap storage available. This might lead to memcg OOM killer
-if there are no file pages to reclaim.
-
-5.4 failcnt
-
-A memory cgroup provides memory.failcnt and memory.memsw.failcnt files.
-This failcnt(== failure count) shows the number of times that a usage counter
-hit its limit. When a memory cgroup hits a limit, failcnt increases and
-memory under it will be reclaimed.
-
-You can reset failcnt by writing 0 to failcnt file.
-# echo 0 > .../memory.failcnt
-
-5.5 usage_in_bytes
-
-For efficiency, as other kernel components, memory cgroup uses some optimization
-to avoid unnecessary cacheline false sharing. usage_in_bytes is affected by the
-method and doesn't show 'exact' value of memory (and swap) usage, it's a fuzz
-value for efficient access. (Of course, when necessary, it's synchronized.)
-If you want to know more exact memory usage, you should use RSS+CACHE(+SWAP)
-value in memory.stat(see 5.2).
-
-5.6 numa_stat
-
-This is similar to numa_maps but operates on a per-memcg basis.  This is
-useful for providing visibility into the numa locality information within
-an memcg since the pages are allowed to be allocated from any physical
-node.  One of the use cases is evaluating application performance by
-combining this information with the application's CPU allocation.
-
-Each memcg's numa_stat file includes "total", "file", "anon" and "unevictable"
-per-node page counts including "hierarchical_<counter>" which sums up all
-hierarchical children's values in addition to the memcg's own value.
-
-The output format of memory.numa_stat is:
-
-total=<total pages> N0=<node 0 pages> N1=<node 1 pages> ...
-file=<total file pages> N0=<node 0 pages> N1=<node 1 pages> ...
-anon=<total anon pages> N0=<node 0 pages> N1=<node 1 pages> ...
-unevictable=<total anon pages> N0=<node 0 pages> N1=<node 1 pages> ...
-hierarchical_<counter>=<counter pages> N0=<node 0 pages> N1=<node 1 pages> ...
-
-The "total" count is sum of file + anon + unevictable.
-
-6. Hierarchy support
-
-The memory controller supports a deep hierarchy and hierarchical accounting.
-The hierarchy is created by creating the appropriate cgroups in the
-cgroup filesystem. Consider for example, the following cgroup filesystem
-hierarchy
-
-	       root
-	     /  |   \
-            /	|    \
-	   a	b     c
-		      | \
-		      |  \
-		      d   e
-
-In the diagram above, with hierarchical accounting enabled, all memory
-usage of e, is accounted to its ancestors up until the root (i.e, c and root),
-that has memory.use_hierarchy enabled. If one of the ancestors goes over its
-limit, the reclaim algorithm reclaims from the tasks in the ancestor and the
-children of the ancestor.
-
-6.1 Enabling hierarchical accounting and reclaim
-
-A memory cgroup by default disables the hierarchy feature. Support
-can be enabled by writing 1 to memory.use_hierarchy file of the root cgroup
-
-# echo 1 > memory.use_hierarchy
-
-The feature can be disabled by
-
-# echo 0 > memory.use_hierarchy
-
-NOTE1: Enabling/disabling will fail if either the cgroup already has other
-       cgroups created below it, or if the parent cgroup has use_hierarchy
-       enabled.
-
-NOTE2: When panic_on_oom is set to "2", the whole system will panic in
-       case of an OOM event in any cgroup.
-
-7. Soft limits
-
-Soft limits allow for greater sharing of memory. The idea behind soft limits
-is to allow control groups to use as much of the memory as needed, provided
-
-a. There is no memory contention
-b. They do not exceed their hard limit
-
-When the system detects memory contention or low memory, control groups
-are pushed back to their soft limits. If the soft limit of each control
-group is very high, they are pushed back as much as possible to make
-sure that one control group does not starve the others of memory.
-
-Please note that soft limits is a best-effort feature; it comes with
-no guarantees, but it does its best to make sure that when memory is
-heavily contended for, memory is allocated based on the soft limit
-hints/setup. Currently soft limit based reclaim is set up such that
-it gets invoked from balance_pgdat (kswapd).
-
-7.1 Interface
-
-Soft limits can be setup by using the following commands (in this example we
-assume a soft limit of 256 MiB)
-
-# echo 256M > memory.soft_limit_in_bytes
-
-If we want to change this to 1G, we can at any time use
-
-# echo 1G > memory.soft_limit_in_bytes
-
-NOTE1: Soft limits take effect over a long period of time, since they involve
-       reclaiming memory for balancing between memory cgroups
-NOTE2: It is recommended to set the soft limit always below the hard limit,
-       otherwise the hard limit will take precedence.
-
-8. Move charges at task migration
-
-Users can move charges associated with a task along with task migration, that
-is, uncharge task's pages from the old cgroup and charge them to the new cgroup.
-This feature is not supported in !CONFIG_MMU environments because of lack of
-page tables.
-
-8.1 Interface
-
-This feature is disabled by default. It can be enabled (and disabled again) by
-writing to memory.move_charge_at_immigrate of the destination cgroup.
-
-If you want to enable it:
-
-# echo (some positive value) > memory.move_charge_at_immigrate
-
-Note: Each bits of move_charge_at_immigrate has its own meaning about what type
-      of charges should be moved. See 8.2 for details.
-Note: Charges are moved only when you move mm->owner, in other words,
-      a leader of a thread group.
-Note: If we cannot find enough space for the task in the destination cgroup, we
-      try to make space by reclaiming memory. Task migration may fail if we
-      cannot make enough space.
-Note: It can take several seconds if you move charges much.
-
-And if you want disable it again:
-
-# echo 0 > memory.move_charge_at_immigrate
-
-8.2 Type of charges which can be moved
-
-Each bit in move_charge_at_immigrate has its own meaning about what type of
-charges should be moved. But in any case, it must be noted that an account of
-a page or a swap can be moved only when it is charged to the task's current
-(old) memory cgroup.
-
-  bit | what type of charges would be moved ?
- -----+------------------------------------------------------------------------
-   0  | A charge of an anonymous page (or swap of it) used by the target task.
-      | You must enable Swap Extension (see 2.4) to enable move of swap charges.
- -----+------------------------------------------------------------------------
-   1  | A charge of file pages (normal file, tmpfs file (e.g. ipc shared memory)
-      | and swaps of tmpfs file) mmapped by the target task. Unlike the case of
-      | anonymous pages, file pages (and swaps) in the range mmapped by the task
-      | will be moved even if the task hasn't done page fault, i.e. they might
-      | not be the task's "RSS", but other task's "RSS" that maps the same file.
-      | And mapcount of the page is ignored (the page can be moved even if
-      | page_mapcount(page) > 1). You must enable Swap Extension (see 2.4) to
-      | enable move of swap charges.
-
-8.3 TODO
-
-- All of moving charge operations are done under cgroup_mutex. It's not good
-  behavior to hold the mutex too long, so we may need some trick.
-
-9. Memory thresholds
-
-Memory cgroup implements memory thresholds using the cgroups notification
-API (see cgroups.txt). It allows to register multiple memory and memsw
-thresholds and gets notifications when it crosses.
-
-To register a threshold, an application must:
-- create an eventfd using eventfd(2);
-- open memory.usage_in_bytes or memory.memsw.usage_in_bytes;
-- write string like "<event_fd> <fd of memory.usage_in_bytes> <threshold>" to
-  cgroup.event_control.
-
-Application will be notified through eventfd when memory usage crosses
-threshold in any direction.
-
-It's applicable for root and non-root cgroup.
-
-10. OOM Control
-
-memory.oom_control file is for OOM notification and other controls.
-
-Memory cgroup implements OOM notifier using the cgroup notification
-API (See cgroups.txt). It allows to register multiple OOM notification
-delivery and gets notification when OOM happens.
-
-To register a notifier, an application must:
- - create an eventfd using eventfd(2)
- - open memory.oom_control file
- - write string like "<event_fd> <fd of memory.oom_control>" to
-   cgroup.event_control
-
-The application will be notified through eventfd when OOM happens.
-OOM notification doesn't work for the root cgroup.
-
-You can disable the OOM-killer by writing "1" to memory.oom_control file, as:
-
-	#echo 1 > memory.oom_control
-
-If OOM-killer is disabled, tasks under cgroup will hang/sleep
-in memory cgroup's OOM-waitqueue when they request accountable memory.
-
-For running them, you have to relax the memory cgroup's OOM status by
-	* enlarge limit or reduce usage.
-To reduce usage,
-	* kill some tasks.
-	* move some tasks to other group with account migration.
-	* remove some files (on tmpfs?)
-
-Then, stopped tasks will work again.
-
-At reading, current status of OOM is shown.
-	oom_kill_disable 0 or 1 (if 1, oom-killer is disabled)
-	under_oom	 0 or 1 (if 1, the memory cgroup is under OOM, tasks may
-				 be stopped.)
-
-11. Memory Pressure
-
-The pressure level notifications can be used to monitor the memory
-allocation cost; based on the pressure, applications can implement
-different strategies of managing their memory resources. The pressure
-levels are defined as following:
-
-The "low" level means that the system is reclaiming memory for new
-allocations. Monitoring this reclaiming activity might be useful for
-maintaining cache level. Upon notification, the program (typically
-"Activity Manager") might analyze vmstat and act in advance (i.e.
-prematurely shutdown unimportant services).
-
-The "medium" level means that the system is experiencing medium memory
-pressure, the system might be making swap, paging out active file caches,
-etc. Upon this event applications may decide to further analyze
-vmstat/zoneinfo/memcg or internal memory usage statistics and free any
-resources that can be easily reconstructed or re-read from a disk.
-
-The "critical" level means that the system is actively thrashing, it is
-about to out of memory (OOM) or even the in-kernel OOM killer is on its
-way to trigger. Applications should do whatever they can to help the
-system. It might be too late to consult with vmstat or any other
-statistics, so it's advisable to take an immediate action.
-
-By default, events are propagated upward until the event is handled, i.e. the
-events are not pass-through. For example, you have three cgroups: A->B->C. Now
-you set up an event listener on cgroups A, B and C, and suppose group C
-experiences some pressure. In this situation, only group C will receive the
-notification, i.e. groups A and B will not receive it. This is done to avoid
-excessive "broadcasting" of messages, which disturbs the system and which is
-especially bad if we are low on memory or thrashing. Group B, will receive
-notification only if there are no event listers for group C.
-
-There are three optional modes that specify different propagation behavior:
-
- - "default": this is the default behavior specified above. This mode is the
-   same as omitting the optional mode parameter, preserved by backwards
-   compatibility.
-
- - "hierarchy": events always propagate up to the root, similar to the default
-   behavior, except that propagation continues regardless of whether there are
-   event listeners at each level, with the "hierarchy" mode. In the above
-   example, groups A, B, and C will receive notification of memory pressure.
-
- - "local": events are pass-through, i.e. they only receive notifications when
-   memory pressure is experienced in the memcg for which the notification is
-   registered. In the above example, group C will receive notification if
-   registered for "local" notification and the group experiences memory
-   pressure. However, group B will never receive notification, regardless if
-   there is an event listener for group C or not, if group B is registered for
-   local notification.
-
-The level and event notification mode ("hierarchy" or "local", if necessary) are
-specified by a comma-delimited string, i.e. "low,hierarchy" specifies
-hierarchical, pass-through, notification for all ancestor memcgs. Notification
-that is the default, non pass-through behavior, does not specify a mode.
-"medium,local" specifies pass-through notification for the medium level.
-
-The file memory.pressure_level is only used to setup an eventfd. To
-register a notification, an application must:
-
-- create an eventfd using eventfd(2);
-- open memory.pressure_level;
-- write string as "<event_fd> <fd of memory.pressure_level> <level[,mode]>"
-  to cgroup.event_control.
-
-Application will be notified through eventfd when memory pressure is at
-the specific level (or higher). Read/write operations to
-memory.pressure_level are no implemented.
-
-Test:
-
-   Here is a small script example that makes a new cgroup, sets up a
-   memory limit, sets up a notification in the cgroup and then makes child
-   cgroup experience a critical pressure:
-
-   # cd /sys/fs/cgroup/memory/
-   # mkdir foo
-   # cd foo
-   # cgroup_event_listener memory.pressure_level low,hierarchy &
-   # echo 8000000 > memory.limit_in_bytes
-   # echo 8000000 > memory.memsw.limit_in_bytes
-   # echo $$ > tasks
-   # dd if=/dev/zero | read x
-
-   (Expect a bunch of notifications, and eventually, the oom-killer will
-   trigger.)
-
-12. TODO
-
-1. Make per-cgroup scanner reclaim not-shared pages first
-2. Teach controller to account for shared-pages
-3. Start reclamation in the background when the limit is
-   not yet hit but the usage is getting closer
-
-Summary
-
-Overall, the memory controller has been a stable controller and has been
-commented and discussed quite extensively in the community.
-
-References
-
-1. Singh, Balbir. RFC: Memory Controller, http://lwn.net/Articles/206697/
-2. Singh, Balbir. Memory Controller (RSS Control),
-   http://lwn.net/Articles/222762/
-3. Emelianov, Pavel. Resource controllers based on process cgroups
-   http://lkml.org/lkml/2007/3/6/198
-4. Emelianov, Pavel. RSS controller based on process cgroups (v2)
-   http://lkml.org/lkml/2007/4/9/78
-5. Emelianov, Pavel. RSS controller based on process cgroups (v3)
-   http://lkml.org/lkml/2007/5/30/244
-6. Menage, Paul. Control Groups v10, http://lwn.net/Articles/236032/
-7. Vaidyanathan, Srinivasan, Control Groups: Pagecache accounting and control
-   subsystem (v3), http://lwn.net/Articles/235534/
-8. Singh, Balbir. RSS controller v2 test results (lmbench),
-   http://lkml.org/lkml/2007/5/17/232
-9. Singh, Balbir. RSS controller v2 AIM9 results
-   http://lkml.org/lkml/2007/5/18/1
-10. Singh, Balbir. Memory controller v6 test results,
-    http://lkml.org/lkml/2007/8/19/36
-11. Singh, Balbir. Memory controller introduction (v6),
-    http://lkml.org/lkml/2007/8/17/69
-12. Corbet, Jonathan, Controlling memory use in cgroups,
-    http://lwn.net/Articles/243795/
diff --git a/Documentation/cgroup-v1/net_cls.rst b/Documentation/cgroup-v1/net_cls.rst
new file mode 100644
index 000000000000..a2cf272af7a0
--- /dev/null
+++ b/Documentation/cgroup-v1/net_cls.rst
@@ -0,0 +1,44 @@
+=========================
+Network classifier cgroup
+=========================
+
+The Network classifier cgroup provides an interface to
+tag network packets with a class identifier (classid).
+
+The Traffic Controller (tc) can be used to assign
+different priorities to packets from different cgroups.
+Also, Netfilter (iptables) can use this tag to perform
+actions on such packets.
+
+Creating a net_cls cgroups instance creates a net_cls.classid file.
+This net_cls.classid value is initialized to 0.
+
+You can write hexadecimal values to net_cls.classid; the format for these
+values is 0xAAAABBBB; AAAA is the major handle number and BBBB
+is the minor handle number.
+Reading net_cls.classid yields a decimal result.
+
+Example::
+
+	mkdir /sys/fs/cgroup/net_cls
+	mount -t cgroup -onet_cls net_cls /sys/fs/cgroup/net_cls
+	mkdir /sys/fs/cgroup/net_cls/0
+	echo 0x100001 >  /sys/fs/cgroup/net_cls/0/net_cls.classid
+
+- reading back the 10:1 handle set above (shown in decimal)::
+
+	cat /sys/fs/cgroup/net_cls/0/net_cls.classid
+	1048577
+
+- configuring tc::
+
+	tc qdisc add dev eth0 root handle 10: htb
+	tc class add dev eth0 parent 10: classid 10:1 htb rate 40mbit
+
+- attaching the cgroup filter that classifies packets by classid::
+
+	tc filter add dev eth0 parent 10: protocol ip prio 10 handle 1: cgroup
+
+configuring iptables, basic example::
+
+	iptables -A OUTPUT -m cgroup ! --cgroup 0x100001 -j DROP
diff --git a/Documentation/cgroup-v1/net_cls.txt b/Documentation/cgroup-v1/net_cls.txt
deleted file mode 100644
index ec182346dea2..000000000000
--- a/Documentation/cgroup-v1/net_cls.txt
+++ /dev/null
@@ -1,39 +0,0 @@
-Network classifier cgroup
--------------------------
-
-The Network classifier cgroup provides an interface to
-tag network packets with a class identifier (classid).
-
-The Traffic Controller (tc) can be used to assign
-different priorities to packets from different cgroups.
-Also, Netfilter (iptables) can use this tag to perform
-actions on such packets.
-
-Creating a net_cls cgroups instance creates a net_cls.classid file.
-This net_cls.classid value is initialized to 0.
-
-You can write hexadecimal values to net_cls.classid; the format for these
-values is 0xAAAABBBB; AAAA is the major handle number and BBBB
-is the minor handle number.
-Reading net_cls.classid yields a decimal result.
-
-Example:
-mkdir /sys/fs/cgroup/net_cls
-mount -t cgroup -onet_cls net_cls /sys/fs/cgroup/net_cls
-mkdir /sys/fs/cgroup/net_cls/0
-echo 0x100001 >  /sys/fs/cgroup/net_cls/0/net_cls.classid
-	- setting a 10:1 handle.
-
-cat /sys/fs/cgroup/net_cls/0/net_cls.classid
-1048577
-
-configuring tc:
-tc qdisc add dev eth0 root handle 10: htb
-
-tc class add dev eth0 parent 10: classid 10:1 htb rate 40mbit
- - creating traffic class 10:1
-
-tc filter add dev eth0 parent 10: protocol ip prio 10 handle 1: cgroup
-
-configuring iptables, basic example:
-iptables -A OUTPUT -m cgroup ! --cgroup 0x100001 -j DROP
diff --git a/Documentation/cgroup-v1/net_prio.rst b/Documentation/cgroup-v1/net_prio.rst
new file mode 100644
index 000000000000..b40905871c64
--- /dev/null
+++ b/Documentation/cgroup-v1/net_prio.rst
@@ -0,0 +1,57 @@
+=======================
+Network priority cgroup
+=======================
+
+The Network priority cgroup provides an interface to allow an administrator to
+dynamically set the priority of network traffic generated by various
+applications.
+
+Nominally, an application would set the priority of its traffic via the
+SO_PRIORITY socket option.  This, however, is not always possible because:
+
+1) The application may not have been coded to set this value
+2) The priority of application traffic is often a site-specific administrative
+   decision rather than an application defined one.
+
+This cgroup allows an administrator to assign a process to a group which defines
+the priority of egress traffic on a given interface. Network priority groups can
+be created by first mounting the cgroup filesystem::
+
+	# mount -t cgroup -onet_prio none /sys/fs/cgroup/net_prio
+
+With the above step, the initial group acting as the parent accounting group
+becomes visible at '/sys/fs/cgroup/net_prio'.  This group includes all tasks in
+the system. '/sys/fs/cgroup/net_prio/tasks' lists the tasks in this cgroup.
+
+Each net_prio cgroup contains two files that are subsystem specific:
+
+net_prio.prioidx
+  This file is read-only, and is simply informative.  It contains a unique
+  integer value that the kernel uses as an internal representation of this
+  cgroup.
+
+net_prio.ifpriomap
+  This file contains a map of the priorities assigned to traffic originating
+  from processes in this group and egressing the system on various interfaces.
+  It contains a list of tuples in the form <ifname priority>.  Contents of this
+  file can be modified by echoing a string into the file using the same tuple
+  format. For example::
+
+	echo "eth0 5" > /sys/fs/cgroup/net_prio/iscsi/net_prio.ifpriomap
+
+This command would force any traffic originating from processes belonging to the
+iscsi net_prio cgroup and egressing on interface eth0 to have the priority of
+said traffic set to the value 5. The parent accounting group also has a
+writeable 'net_prio.ifpriomap' file that can be used to set a system default
+priority.
+
+Priorities are set immediately prior to queueing a frame to the device
+queueing discipline (qdisc) so priorities will be assigned prior to the hardware
+queue selection being made.
+
+One usage for the net_prio cgroup is with mqprio qdisc allowing application
+traffic to be steered to hardware/driver based traffic classes. These mappings
+can then be managed by administrators or other networking protocols such as
+DCBX.
+
+A new net_prio cgroup inherits the parent's configuration.
diff --git a/Documentation/cgroup-v1/net_prio.txt b/Documentation/cgroup-v1/net_prio.txt
deleted file mode 100644
index a82cbd28ea8a..000000000000
--- a/Documentation/cgroup-v1/net_prio.txt
+++ /dev/null
@@ -1,55 +0,0 @@
-Network priority cgroup
--------------------------
-
-The Network priority cgroup provides an interface to allow an administrator to
-dynamically set the priority of network traffic generated by various
-applications
-
-Nominally, an application would set the priority of its traffic via the
-SO_PRIORITY socket option.  This however, is not always possible because:
-
-1) The application may not have been coded to set this value
-2) The priority of application traffic is often a site-specific administrative
-   decision rather than an application defined one.
-
-This cgroup allows an administrator to assign a process to a group which defines
-the priority of egress traffic on a given interface. Network priority groups can
-be created by first mounting the cgroup filesystem.
-
-# mount -t cgroup -onet_prio none /sys/fs/cgroup/net_prio
-
-With the above step, the initial group acting as the parent accounting group
-becomes visible at '/sys/fs/cgroup/net_prio'.  This group includes all tasks in
-the system. '/sys/fs/cgroup/net_prio/tasks' lists the tasks in this cgroup.
-
-Each net_prio cgroup contains two files that are subsystem specific
-
-net_prio.prioidx
-This file is read-only, and is simply informative.  It contains a unique integer
-value that the kernel uses as an internal representation of this cgroup.
-
-net_prio.ifpriomap
-This file contains a map of the priorities assigned to traffic originating from
-processes in this group and egressing the system on various interfaces. It
-contains a list of tuples in the form <ifname priority>.  Contents of this file
-can be modified by echoing a string into the file using the same tuple format.
-for example:
-
-echo "eth0 5" > /sys/fs/cgroups/net_prio/iscsi/net_prio.ifpriomap
-
-This command would force any traffic originating from processes belonging to the
-iscsi net_prio cgroup and egressing on interface eth0 to have the priority of
-said traffic set to the value 5. The parent accounting group also has a
-writeable 'net_prio.ifpriomap' file that can be used to set a system default
-priority.
-
-Priorities are set immediately prior to queueing a frame to the device
-queueing discipline (qdisc) so priorities will be assigned prior to the hardware
-queue selection being made.
-
-One usage for the net_prio cgroup is with mqprio qdisc allowing application
-traffic to be steered to hardware/driver based traffic classes. These mappings
-can then be managed by administrators or other networking protocols such as
-DCBX.
-
-A new net_prio cgroup inherits the parent's configuration.
diff --git a/Documentation/cgroup-v1/pids.rst b/Documentation/cgroup-v1/pids.rst
new file mode 100644
index 000000000000..6acebd9e72c8
--- /dev/null
+++ b/Documentation/cgroup-v1/pids.rst
@@ -0,0 +1,92 @@
+=========================
+Process Number Controller
+=========================
+
+Abstract
+--------
+
+The process number controller is used to allow a cgroup hierarchy to stop any
+new tasks from being fork()'d or clone()'d after a certain limit is reached.
+
+Since it is trivial to hit the task limit without hitting any kmemcg limits in
+place, PIDs are a fundamental resource. As such, PID exhaustion must be
+preventable in the scope of a cgroup hierarchy by allowing resource limiting of
+the number of tasks in a cgroup.
+
+Usage
+-----
+
+In order to use the `pids` controller, set the maximum number of tasks in
+pids.max (this is not available in the root cgroup for obvious reasons). The
+number of processes currently in the cgroup is given by pids.current.
+
+Organisational operations are not blocked by cgroup policies, so it is possible
+to have pids.current > pids.max. This can be done by either setting the limit to
+be smaller than pids.current, or attaching enough processes to the cgroup such
+that pids.current > pids.max. However, it is not possible to violate a cgroup
+policy through fork() or clone(). fork() and clone() will return -EAGAIN if the
+creation of a new process would cause a cgroup policy to be violated.
+
+To set a cgroup to have no limit, set pids.max to "max". This is the default for
+all new cgroups (N.B. that PID limits are hierarchical, so the most stringent
+limit in the hierarchy is followed).
+
+pids.current tracks all child cgroup hierarchies, so parent/pids.current is a
+superset of parent/child/pids.current.
+
+The pids.events file contains event counters:
+
+  - max: Number of times fork failed because limit was hit.
+
+Example
+-------
+
+First, we mount the pids controller::
+
+	# mkdir -p /sys/fs/cgroup/pids
+	# mount -t cgroup -o pids none /sys/fs/cgroup/pids
+
+Then we create a hierarchy, set limits and attach processes to it::
+
+	# mkdir -p /sys/fs/cgroup/pids/parent/child
+	# echo 2 > /sys/fs/cgroup/pids/parent/pids.max
+	# echo $$ > /sys/fs/cgroup/pids/parent/cgroup.procs
+	# cat /sys/fs/cgroup/pids/parent/pids.current
+	2
+	#
+
+It should be noted that attempts to overcome the set limit (2 in this case) will
+fail::
+
+	# cat /sys/fs/cgroup/pids/parent/pids.current
+	2
+	# ( /bin/echo "Here's some processes for you." | cat )
+	sh: fork: Resource temporary unavailable
+	#
+
+Even if we migrate to a child cgroup (which doesn't have a set limit), we will
+not be able to overcome the most stringent limit in the hierarchy (in this case,
+parent's)::
+
+	# echo $$ > /sys/fs/cgroup/pids/parent/child/cgroup.procs
+	# cat /sys/fs/cgroup/pids/parent/pids.current
+	2
+	# cat /sys/fs/cgroup/pids/parent/child/pids.current
+	2
+	# cat /sys/fs/cgroup/pids/parent/child/pids.max
+	max
+	# ( /bin/echo "Here's some processes for you." | cat )
+	sh: fork: Resource temporary unavailable
+	#
+
+We can set a limit that is smaller than pids.current, which will stop any new
+processes from being forked at all (note that the shell itself counts towards
+pids.current)::
+
+	# echo 1 > /sys/fs/cgroup/pids/parent/pids.max
+	# /bin/echo "We can't even spawn a single process now."
+	sh: fork: Resource temporary unavailable
+	# echo 0 > /sys/fs/cgroup/pids/parent/pids.max
+	# /bin/echo "We can't even spawn a single process now."
+	sh: fork: Resource temporary unavailable
+	#
diff --git a/Documentation/cgroup-v1/pids.txt b/Documentation/cgroup-v1/pids.txt
deleted file mode 100644
index e105d708ccde..000000000000
--- a/Documentation/cgroup-v1/pids.txt
+++ /dev/null
@@ -1,88 +0,0 @@
-						   Process Number Controller
-						   =========================
-
-Abstract
---------
-
-The process number controller is used to allow a cgroup hierarchy to stop any
-new tasks from being fork()'d or clone()'d after a certain limit is reached.
-
-Since it is trivial to hit the task limit without hitting any kmemcg limits in
-place, PIDs are a fundamental resource. As such, PID exhaustion must be
-preventable in the scope of a cgroup hierarchy by allowing resource limiting of
-the number of tasks in a cgroup.
-
-Usage
------
-
-In order to use the `pids` controller, set the maximum number of tasks in
-pids.max (this is not available in the root cgroup for obvious reasons). The
-number of processes currently in the cgroup is given by pids.current.
-
-Organisational operations are not blocked by cgroup policies, so it is possible
-to have pids.current > pids.max. This can be done by either setting the limit to
-be smaller than pids.current, or attaching enough processes to the cgroup such
-that pids.current > pids.max. However, it is not possible to violate a cgroup
-policy through fork() or clone(). fork() and clone() will return -EAGAIN if the
-creation of a new process would cause a cgroup policy to be violated.
-
-To set a cgroup to have no limit, set pids.max to "max". This is the default for
-all new cgroups (N.B. that PID limits are hierarchical, so the most stringent
-limit in the hierarchy is followed).
-
-pids.current tracks all child cgroup hierarchies, so parent/pids.current is a
-superset of parent/child/pids.current.
-
-The pids.events file contains event counters:
-  - max: Number of times fork failed because limit was hit.
-
-Example
--------
-
-First, we mount the pids controller:
-# mkdir -p /sys/fs/cgroup/pids
-# mount -t cgroup -o pids none /sys/fs/cgroup/pids
-
-Then we create a hierarchy, set limits and attach processes to it:
-# mkdir -p /sys/fs/cgroup/pids/parent/child
-# echo 2 > /sys/fs/cgroup/pids/parent/pids.max
-# echo $$ > /sys/fs/cgroup/pids/parent/cgroup.procs
-# cat /sys/fs/cgroup/pids/parent/pids.current
-2
-#
-
-It should be noted that attempts to overcome the set limit (2 in this case) will
-fail:
-
-# cat /sys/fs/cgroup/pids/parent/pids.current
-2
-# ( /bin/echo "Here's some processes for you." | cat )
-sh: fork: Resource temporary unavailable
-#
-
-Even if we migrate to a child cgroup (which doesn't have a set limit), we will
-not be able to overcome the most stringent limit in the hierarchy (in this case,
-parent's):
-
-# echo $$ > /sys/fs/cgroup/pids/parent/child/cgroup.procs
-# cat /sys/fs/cgroup/pids/parent/pids.current
-2
-# cat /sys/fs/cgroup/pids/parent/child/pids.current
-2
-# cat /sys/fs/cgroup/pids/parent/child/pids.max
-max
-# ( /bin/echo "Here's some processes for you." | cat )
-sh: fork: Resource temporary unavailable
-#
-
-We can set a limit that is smaller than pids.current, which will stop any new
-processes from being forked at all (note that the shell itself counts towards
-pids.current):
-
-# echo 1 > /sys/fs/cgroup/pids/parent/pids.max
-# /bin/echo "We can't even spawn a single process now."
-sh: fork: Resource temporary unavailable
-# echo 0 > /sys/fs/cgroup/pids/parent/pids.max
-# /bin/echo "We can't even spawn a single process now."
-sh: fork: Resource temporary unavailable
-#
diff --git a/Documentation/cgroup-v1/rdma.rst b/Documentation/cgroup-v1/rdma.rst
new file mode 100644
index 000000000000..2fcb0a9bf790
--- /dev/null
+++ b/Documentation/cgroup-v1/rdma.rst
@@ -0,0 +1,117 @@
+===============
+RDMA Controller
+===============
+
+.. Contents
+
+   1. Overview
+     1-1. What is RDMA controller?
+     1-2. Why RDMA controller needed?
+     1-3. How is RDMA controller implemented?
+   2. Usage Examples
+
+1. Overview
+===========
+
+1-1. What is RDMA controller?
+-----------------------------
+
+RDMA controller allows user to limit RDMA/IB specific resources that a given
+set of processes can use. These processes are grouped using RDMA controller.
+
+RDMA controller defines two resources which can be limited for processes of a
+cgroup.
+
+1-2. Why RDMA controller needed?
+--------------------------------
+
+Currently user space applications can easily take away all the rdma verb
+specific resources such as AH, CQ, QP, MR etc. Due to which other applications
+in other cgroup or kernel space ULPs may not even get chance to allocate any
+rdma resources. This can lead to service unavailability.
+
+Therefore RDMA controller is needed through which resource consumption
+of processes can be limited. Through this controller different rdma
+resources can be accounted.
+
+1-3. How is RDMA controller implemented?
+----------------------------------------
+
+RDMA cgroup allows limit configuration of resources. Rdma cgroup maintains
+resource accounting per cgroup, per device using resource pool structure.
+Each such resource pool is limited up to 64 resources in given resource pool
+by rdma cgroup, which can be extended later if required.
+
+This resource pool object is linked to the cgroup css. Typically there
+are 0 to 4 resource pool instances per cgroup, per device in most use cases.
+But nothing limits to have it more. At present hundreds of RDMA devices per
+single cgroup may not be handled optimally, however there is no
+known use case or requirement for such configuration either.
+
+Since RDMA resources can be allocated from any process and can be freed by any
+of the child processes which shares the address space, rdma resources are
+always owned by the creator cgroup css. This allows process migration from one
+to other cgroup without major complexity of transferring resource ownership;
+because such ownership is not really present due to shared nature of
+rdma resources. Linking resources around css also ensures that cgroups can be
+deleted after processes migrated. This allows process migration as well with
+active resources, even though that is not a primary use case.
+
+Whenever RDMA resource charging occurs, owner rdma cgroup is returned to
+the caller. Same rdma cgroup should be passed while uncharging the resource.
+This also allows process migrated with active RDMA resource to charge
+to new owner cgroup for new resource. It also allows to uncharge resource of
+a process from previously charged cgroup which is migrated to new cgroup,
+even though that is not a primary use case.
+
+Resource pool object is created in following situations.
+(a) User sets the limit and no previous resource pool exists for the device
+of interest for the cgroup.
+(b) No resource limits were configured, but IB/RDMA stack tries to
+charge the resource. So that it correctly uncharge them when applications are
+running without limits and later on when limits are enforced during uncharging,
+otherwise usage count will drop to negative.
+
+Resource pool is destroyed if all the resource limits are set to max and
+it is the last resource getting deallocated.
+
+User should set all the limit to max value if it intends to remove/unconfigure
+the resource pool for a particular device.
+
+IB stack honors limits enforced by the rdma controller. When an application
+queries the maximum resource limits of an IB device, it returns the minimum of
+what is configured by user for a given cgroup and what is supported by
+IB device.
+
+Following resources can be accounted by rdma controller.
+
+  ==========    =============================
+  hca_handle	Maximum number of HCA Handles
+  hca_object 	Maximum number of HCA Objects
+  ==========    =============================
+
+2. Usage Examples
+=================
+
+(a) Configure resource limit::
+
+	echo mlx4_0 hca_handle=2 hca_object=2000 > /sys/fs/cgroup/rdma/1/rdma.max
+	echo ocrdma1 hca_handle=3 > /sys/fs/cgroup/rdma/2/rdma.max
+
+(b) Query resource limit::
+
+	cat /sys/fs/cgroup/rdma/2/rdma.max
+	#Output:
+	mlx4_0 hca_handle=2 hca_object=2000
+	ocrdma1 hca_handle=3 hca_object=max
+
+(c) Query current usage::
+
+	cat /sys/fs/cgroup/rdma/2/rdma.current
+	#Output:
+	mlx4_0 hca_handle=1 hca_object=20
+	ocrdma1 hca_handle=1 hca_object=23
+
+(d) Delete resource limit::
+
+	echo mlx4_0 hca_handle=max hca_object=max > /sys/fs/cgroup/rdma/1/rdma.max
diff --git a/Documentation/cgroup-v1/rdma.txt b/Documentation/cgroup-v1/rdma.txt
deleted file mode 100644
index 9bdb7fd03f83..000000000000
--- a/Documentation/cgroup-v1/rdma.txt
+++ /dev/null
@@ -1,109 +0,0 @@
-				RDMA Controller
-				----------------
-
-Contents
---------
-
-1. Overview
-  1-1. What is RDMA controller?
-  1-2. Why RDMA controller needed?
-  1-3. How is RDMA controller implemented?
-2. Usage Examples
-
-1. Overview
-
-1-1. What is RDMA controller?
------------------------------
-
-RDMA controller allows user to limit RDMA/IB specific resources that a given
-set of processes can use. These processes are grouped using RDMA controller.
-
-RDMA controller defines two resources which can be limited for processes of a
-cgroup.
-
-1-2. Why RDMA controller needed?
---------------------------------
-
-Currently user space applications can easily take away all the rdma verb
-specific resources such as AH, CQ, QP, MR etc. Due to which other applications
-in other cgroup or kernel space ULPs may not even get chance to allocate any
-rdma resources. This can lead to service unavailability.
-
-Therefore RDMA controller is needed through which resource consumption
-of processes can be limited. Through this controller different rdma
-resources can be accounted.
-
-1-3. How is RDMA controller implemented?
-----------------------------------------
-
-RDMA cgroup allows limit configuration of resources. Rdma cgroup maintains
-resource accounting per cgroup, per device using resource pool structure.
-Each such resource pool is limited up to 64 resources in given resource pool
-by rdma cgroup, which can be extended later if required.
-
-This resource pool object is linked to the cgroup css. Typically there
-are 0 to 4 resource pool instances per cgroup, per device in most use cases.
-But nothing limits to have it more. At present hundreds of RDMA devices per
-single cgroup may not be handled optimally, however there is no
-known use case or requirement for such configuration either.
-
-Since RDMA resources can be allocated from any process and can be freed by any
-of the child processes which shares the address space, rdma resources are
-always owned by the creator cgroup css. This allows process migration from one
-to other cgroup without major complexity of transferring resource ownership;
-because such ownership is not really present due to shared nature of
-rdma resources. Linking resources around css also ensures that cgroups can be
-deleted after processes migrated. This allow progress migration as well with
-active resources, even though that is not a primary use case.
-
-Whenever RDMA resource charging occurs, owner rdma cgroup is returned to
-the caller. Same rdma cgroup should be passed while uncharging the resource.
-This also allows process migrated with active RDMA resource to charge
-to new owner cgroup for new resource. It also allows to uncharge resource of
-a process from previously charged cgroup which is migrated to new cgroup,
-even though that is not a primary use case.
-
-Resource pool object is created in following situations.
-(a) User sets the limit and no previous resource pool exist for the device
-of interest for the cgroup.
-(b) No resource limits were configured, but IB/RDMA stack tries to
-charge the resource. So that it correctly uncharge them when applications are
-running without limits and later on when limits are enforced during uncharging,
-otherwise usage count will drop to negative.
-
-Resource pool is destroyed if all the resource limits are set to max and
-it is the last resource getting deallocated.
-
-User should set all the limit to max value if it intents to remove/unconfigure
-the resource pool for a particular device.
-
-IB stack honors limits enforced by the rdma controller. When application
-query about maximum resource limits of IB device, it returns minimum of
-what is configured by user for a given cgroup and what is supported by
-IB device.
-
-Following resources can be accounted by rdma controller.
-  hca_handle	Maximum number of HCA Handles
-  hca_object 	Maximum number of HCA Objects
-
-2. Usage Examples
------------------
-
-(a) Configure resource limit:
-echo mlx4_0 hca_handle=2 hca_object=2000 > /sys/fs/cgroup/rdma/1/rdma.max
-echo ocrdma1 hca_handle=3 > /sys/fs/cgroup/rdma/2/rdma.max
-
-(b) Query resource limit:
-cat /sys/fs/cgroup/rdma/2/rdma.max
-#Output:
-mlx4_0 hca_handle=2 hca_object=2000
-ocrdma1 hca_handle=3 hca_object=max
-
-(c) Query current usage:
-cat /sys/fs/cgroup/rdma/2/rdma.current
-#Output:
-mlx4_0 hca_handle=1 hca_object=20
-ocrdma1 hca_handle=1 hca_object=23
-
-(d) Delete resource limit:
-echo echo mlx4_0 hca_handle=max hca_object=max > /sys/fs/cgroup/rdma/1/rdma.max
diff --git a/Documentation/filesystems/tmpfs.txt b/Documentation/filesystems/tmpfs.txt
index d06e9a59a9f4..cad797a8a39e 100644
--- a/Documentation/filesystems/tmpfs.txt
+++ b/Documentation/filesystems/tmpfs.txt
@@ -98,7 +98,7 @@ A memory policy with a valid NodeList will be saved, as specified, for
 use at file creation time.  When a task allocates a file in the file
 system, the mount option memory policy will be applied with a NodeList,
 if any, modified by the calling task's cpuset constraints
-[See Documentation/cgroup-v1/cpusets.txt] and any optional flags, listed
+[See Documentation/cgroup-v1/cpusets.rst] and any optional flags, listed
 below.  If the resulting NodeLists is the empty set, the effective memory
 policy for the file will revert to "default" policy.
 
diff --git a/Documentation/scheduler/sched-deadline.txt b/Documentation/scheduler/sched-deadline.txt
index b14e03ff3528..a7514343b660 100644
--- a/Documentation/scheduler/sched-deadline.txt
+++ b/Documentation/scheduler/sched-deadline.txt
@@ -652,7 +652,7 @@ CONTENTS
 
  -deadline tasks cannot have an affinity mask smaller that the entire
  root_domain they are created on. However, affinities can be specified
- through the cpuset facility (Documentation/cgroup-v1/cpusets.txt).
+ through the cpuset facility (Documentation/cgroup-v1/cpusets.rst).
 
 5.1 SCHED_DEADLINE and cpusets HOWTO
 ------------------------------------
diff --git a/Documentation/scheduler/sched-design-CFS.txt b/Documentation/scheduler/sched-design-CFS.txt
index edd861c94c1b..d1328890ef28 100644
--- a/Documentation/scheduler/sched-design-CFS.txt
+++ b/Documentation/scheduler/sched-design-CFS.txt
@@ -215,7 +215,7 @@ SCHED_BATCH) tasks.
 
    These options need CONFIG_CGROUPS to be defined, and let the administrator
    create arbitrary groups of tasks, using the "cgroup" pseudo filesystem.  See
-   Documentation/cgroup-v1/cgroups.txt for more information about this filesystem.
+   Documentation/cgroup-v1/cgroups.rst for more information about this filesystem.
 
 When CONFIG_FAIR_GROUP_SCHED is defined, a "cpu.shares" file is created for each
 group created using the pseudo filesystem.  See example steps below to create
diff --git a/Documentation/scheduler/sched-rt-group.txt b/Documentation/scheduler/sched-rt-group.txt
index d8fce3e78457..c09f7a3fee66 100644
--- a/Documentation/scheduler/sched-rt-group.txt
+++ b/Documentation/scheduler/sched-rt-group.txt
@@ -133,7 +133,7 @@ This uses the cgroup virtual file system and "<cgroup>/cpu.rt_runtime_us"
 to control the CPU time reserved for each control group.
 
 For more information on working with control groups, you should read
-Documentation/cgroup-v1/cgroups.txt as well.
+Documentation/cgroup-v1/cgroups.rst as well.
 
 Group settings are checked against the following limits in order to keep the
 configuration schedulable:
diff --git a/Documentation/vm/numa.rst b/Documentation/vm/numa.rst
index 5cae13e9a08b..0d830edae8fe 100644
--- a/Documentation/vm/numa.rst
+++ b/Documentation/vm/numa.rst
@@ -67,7 +67,7 @@ nodes.  Each emulated node will manage a fraction of the underlying cells'
 physical memory.  NUMA emluation is useful for testing NUMA kernel and
 application features on non-NUMA platforms, and as a sort of memory resource
 management mechanism when used together with cpusets.
-[see Documentation/cgroup-v1/cpusets.txt]
+[see Documentation/cgroup-v1/cpusets.rst]
 
 For each node with memory, Linux constructs an independent memory management
 subsystem, complete with its own free page lists, in-use page lists, usage
@@ -114,7 +114,7 @@ allocation behavior using Linux NUMA memory policy. [see
 
 System administrators can restrict the CPUs and nodes' memories that a non-
 privileged user can specify in the scheduling or NUMA commands and functions
-using control groups and CPUsets.  [see Documentation/cgroup-v1/cpusets.txt]
+using control groups and CPUsets.  [see Documentation/cgroup-v1/cpusets.rst]
 
 On architectures that do not hide memoryless nodes, Linux will include only
 zones [nodes] with memory in the zonelists.  This means that for a memoryless
diff --git a/Documentation/vm/page_migration.rst b/Documentation/vm/page_migration.rst
index f68d61335abb..35bba27d5fff 100644
--- a/Documentation/vm/page_migration.rst
+++ b/Documentation/vm/page_migration.rst
@@ -41,7 +41,7 @@ locations.
 Larger installations usually partition the system using cpusets into
 sections of nodes. Paul Jackson has equipped cpusets with the ability to
 move pages when a task is moved to another cpuset (See
-Documentation/cgroup-v1/cpusets.txt).
+Documentation/cgroup-v1/cpusets.rst).
 Cpusets allows the automation of process locality. If a task is moved to
 a new cpuset then also all its pages are moved with it so that the
 performance of the process does not sink dramatically. Also the pages
diff --git a/Documentation/vm/unevictable-lru.rst b/Documentation/vm/unevictable-lru.rst
index b8e29f977f2d..c6d94118fbcc 100644
--- a/Documentation/vm/unevictable-lru.rst
+++ b/Documentation/vm/unevictable-lru.rst
@@ -98,7 +98,7 @@ Memory Control Group Interaction
 --------------------------------
 
 The unevictable LRU facility interacts with the memory control group [aka
-memory controller; see Documentation/cgroup-v1/memory.txt] by extending the
+memory controller; see Documentation/cgroup-v1/memory.rst] by extending the
 lru_list enum.
 
 The memory controller data structure automatically gets a per-zone unevictable
diff --git a/Documentation/x86/x86_64/fake-numa-for-cpusets.rst b/Documentation/x86/x86_64/fake-numa-for-cpusets.rst
index 74fbb78b3c67..a6926cd40f70 100644
--- a/Documentation/x86/x86_64/fake-numa-for-cpusets.rst
+++ b/Documentation/x86/x86_64/fake-numa-for-cpusets.rst
@@ -15,7 +15,7 @@ assign them to cpusets and their attached tasks.  This is a way of limiting the
 amount of system memory that are available to a certain class of tasks.
 
 For more information on the features of cpusets, see
-Documentation/cgroup-v1/cpusets.txt.
+Documentation/cgroup-v1/cpusets.rst.
 There are a number of different configurations you can use for your needs.  For
 more information on the numa=fake command line option and its various ways of
 configuring fake nodes, see Documentation/x86/x86_64/boot-options.txt.
@@ -40,7 +40,7 @@ A machine may be split as follows with "numa=fake=4*512," as reported by dmesg::
 	On node 3 totalpages: 131072
 
 Now following the instructions for mounting the cpusets filesystem from
-Documentation/cgroup-v1/cpusets.txt, you can assign fake nodes (i.e. contiguous memory
+Documentation/cgroup-v1/cpusets.rst, you can assign fake nodes (i.e. contiguous memory
 address spaces) to individual cpusets::
 
 	[root@xroads /]# mkdir exampleset
diff --git a/MAINTAINERS b/MAINTAINERS
index 429c6c624861..b8663911779a 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -4094,7 +4094,7 @@ W:	http://www.bullopensource.org/cpuset/
 W:	http://oss.sgi.com/projects/cpusets/
 T:	git git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup.git
 S:	Maintained
-F:	Documentation/cgroup-v1/cpusets.txt
+F:	Documentation/cgroup-v1/cpusets.rst
 F:	include/linux/cpuset.h
 F:	kernel/cgroup/cpuset.c
 
diff --git a/block/Kconfig b/block/Kconfig
index 1b220101a9cb..78374cb03114 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -88,7 +88,7 @@ config BLK_DEV_THROTTLING
 	one needs to mount and use blkio cgroup controller for creating
 	cgroups and specifying per device IO rate policies.
 
-	See Documentation/cgroup-v1/blkio-controller.txt for more information.
+	See Documentation/cgroup-v1/blkio-controller.rst for more information.
 
 config BLK_DEV_THROTTLING_LOW
 	bool "Block throttling .low limit interface support (EXPERIMENTAL)"
diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index 1615b9c17e02..a3699d4d27e0 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -619,7 +619,7 @@ struct cftype {
 
 /*
  * Control Group subsystem type.
- * See Documentation/cgroup-v1/cgroups.txt for details
+ * See Documentation/cgroup-v1/cgroups.rst for details
  */
 struct cgroup_subsys {
 	struct cgroup_subsys_state *(*css_alloc)(struct cgroup_subsys_state *parent_css);
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 63e0cf66f01a..79e080ea71d8 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -783,7 +783,7 @@ union bpf_attr {
  * 		based on a user-provided identifier for all traffic coming from
  * 		the tasks belonging to the related cgroup. See also the related
  * 		kernel documentation, available from the Linux sources in file
- * 		*Documentation/cgroup-v1/net_cls.txt*.
+ * 		*Documentation/cgroup-v1/net_cls.rst*.
  *
  * 		The Linux kernel has two versions for cgroups: there are
  * 		cgroups v1 and cgroups v2. Both are available to users, who can
diff --git a/init/Kconfig b/init/Kconfig
index 36894c9fb420..5d4bf0f676e9 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -798,7 +798,7 @@ config BLK_CGROUP
 	CONFIG_CFQ_GROUP_IOSCHED=y; for enabling throttling policy, set
 	CONFIG_BLK_DEV_THROTTLING=y.
 
-	See Documentation/cgroup-v1/blkio-controller.txt for more information.
+	See Documentation/cgroup-v1/blkio-controller.rst for more information.
 
 config DEBUG_BLK_CGROUP
 	bool "IO controller debugging"
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 6a1942ed781c..fc6668f9db15 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -729,7 +729,7 @@ static inline int nr_cpusets(void)
  * load balancing domains (sched domains) as specified by that partial
  * partition.
  *
- * See "What is sched_load_balance" in Documentation/cgroup-v1/cpusets.txt
+ * See "What is sched_load_balance" in Documentation/cgroup-v1/cpusets.rst
  * for a background explanation of this.
  *
  * Does not return errors, on the theory that the callers of this
diff --git a/security/device_cgroup.c b/security/device_cgroup.c
index dc28914fa72e..c07196502577 100644
--- a/security/device_cgroup.c
+++ b/security/device_cgroup.c
@@ -509,7 +509,7 @@ static inline int may_allow_all(struct dev_cgroup *parent)
  * This is one of the three key functions for hierarchy implementation.
  * This function is responsible for re-evaluating all the cgroup's active
  * exceptions due to a parent's exception change.
- * Refer to Documentation/cgroup-v1/devices.txt for more details.
+ * Refer to Documentation/cgroup-v1/devices.rst for more details.
  */
 static void revalidate_active_exceptions(struct dev_cgroup *devcg)
 {
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 63e0cf66f01a..79e080ea71d8 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -783,7 +783,7 @@ union bpf_attr {
  * 		based on a user-provided identifier for all traffic coming from
  * 		the tasks belonging to the related cgroup. See also the related
  * 		kernel documentation, available from the Linux sources in file
- * 		*Documentation/cgroup-v1/net_cls.txt*.
+ * 		*Documentation/cgroup-v1/net_cls.rst*.
  *
  * 		The Linux kernel has two versions for cgroups: there are
  * 		cgroups v1 and cgroups v2. Both are available to users, who can
-- 
cgit v1.2.3


From 458f69ef36656dc74679667380422dd8063eabfb Mon Sep 17 00:00:00 2001
From: Mauro Carvalho Chehab <mchehab+samsung@kernel.org>
Date: Wed, 12 Jun 2019 14:53:00 -0300
Subject: docs: timers: convert docs to ReST and rename to *.rst

The conversion here is really trivial: just a bunch of title
markups and very few punctual changes are enough for it to
be parsed by Sphinx and generate a nice html.

The conversion is actually:
  - add blank lines and indentation in order to identify paragraphs;
  - fix tables markups;
  - add some lists markups;
  - mark literal blocks;
  - adjust title markups.

At its new index.rst, let's add a :orphan: while this is not linked to
the main index.rst file, in order to avoid build warnings.

Signed-off-by: Mauro Carvalho Chehab <mchehab+samsung@kernel.org>
Acked-by: Mark Brown <broonie@kernel.org>
Signed-off-by: Jonathan Corbet <corbet@lwn.net>
---
 Documentation/timers/NO_HZ.txt        | 318 ---------------------------------
 Documentation/timers/highres.rst      | 250 ++++++++++++++++++++++++++
 Documentation/timers/highres.txt      | 249 --------------------------
 Documentation/timers/hpet.rst         |  30 ++++
 Documentation/timers/hpet.txt         |  28 ---
 Documentation/timers/hrtimers.rst     | 178 +++++++++++++++++++
 Documentation/timers/hrtimers.txt     | 178 -------------------
 Documentation/timers/index.rst        |  22 +++
 Documentation/timers/no_hz.rst        | 326 ++++++++++++++++++++++++++++++++++
 Documentation/timers/timekeeping.rst  | 180 +++++++++++++++++++
 Documentation/timers/timekeeping.txt  | 179 -------------------
 Documentation/timers/timers-howto.rst | 112 ++++++++++++
 Documentation/timers/timers-howto.txt | 105 -----------
 MAINTAINERS                           |   2 +-
 drivers/media/usb/dvb-usb-v2/anysee.c |   2 +-
 drivers/regulator/core.c              |   2 +-
 include/linux/iopoll.h                |   4 +-
 include/linux/regmap.h                |   4 +-
 scripts/checkpatch.pl                 |   8 +-
 sound/soc/sof/ops.h                   |   2 +-
 20 files changed, 1110 insertions(+), 1069 deletions(-)
 delete mode 100644 Documentation/timers/NO_HZ.txt
 create mode 100644 Documentation/timers/highres.rst
 delete mode 100644 Documentation/timers/highres.txt
 create mode 100644 Documentation/timers/hpet.rst
 delete mode 100644 Documentation/timers/hpet.txt
 create mode 100644 Documentation/timers/hrtimers.rst
 delete mode 100644 Documentation/timers/hrtimers.txt
 create mode 100644 Documentation/timers/index.rst
 create mode 100644 Documentation/timers/no_hz.rst
 create mode 100644 Documentation/timers/timekeeping.rst
 delete mode 100644 Documentation/timers/timekeeping.txt
 create mode 100644 Documentation/timers/timers-howto.rst
 delete mode 100644 Documentation/timers/timers-howto.txt

(limited to 'include/linux')

diff --git a/Documentation/timers/NO_HZ.txt b/Documentation/timers/NO_HZ.txt
deleted file mode 100644
index 9591092da5e0..000000000000
--- a/Documentation/timers/NO_HZ.txt
+++ /dev/null
@@ -1,318 +0,0 @@
-		NO_HZ: Reducing Scheduling-Clock Ticks
-
-
-This document describes Kconfig options and boot parameters that can
-reduce the number of scheduling-clock interrupts, thereby improving energy
-efficiency and reducing OS jitter.  Reducing OS jitter is important for
-some types of computationally intensive high-performance computing (HPC)
-applications and for real-time applications.
-
-There are three main ways of managing scheduling-clock interrupts
-(also known as "scheduling-clock ticks" or simply "ticks"):
-
-1.	Never omit scheduling-clock ticks (CONFIG_HZ_PERIODIC=y or
-	CONFIG_NO_HZ=n for older kernels).  You normally will -not-
-	want to choose this option.
-
-2.	Omit scheduling-clock ticks on idle CPUs (CONFIG_NO_HZ_IDLE=y or
-	CONFIG_NO_HZ=y for older kernels).  This is the most common
-	approach, and should be the default.
-
-3.	Omit scheduling-clock ticks on CPUs that are either idle or that
-	have only one runnable task (CONFIG_NO_HZ_FULL=y).  Unless you
-	are running realtime applications or certain types of HPC
-	workloads, you will normally -not- want this option.
-
-These three cases are described in the following three sections, followed
-by a third section on RCU-specific considerations, a fourth section
-discussing testing, and a fifth and final section listing known issues.
-
-
-NEVER OMIT SCHEDULING-CLOCK TICKS
-
-Very old versions of Linux from the 1990s and the very early 2000s
-are incapable of omitting scheduling-clock ticks.  It turns out that
-there are some situations where this old-school approach is still the
-right approach, for example, in heavy workloads with lots of tasks
-that use short bursts of CPU, where there are very frequent idle
-periods, but where these idle periods are also quite short (tens or
-hundreds of microseconds).  For these types of workloads, scheduling
-clock interrupts will normally be delivered any way because there
-will frequently be multiple runnable tasks per CPU.  In these cases,
-attempting to turn off the scheduling clock interrupt will have no effect
-other than increasing the overhead of switching to and from idle and
-transitioning between user and kernel execution.
-
-This mode of operation can be selected using CONFIG_HZ_PERIODIC=y (or
-CONFIG_NO_HZ=n for older kernels).
-
-However, if you are instead running a light workload with long idle
-periods, failing to omit scheduling-clock interrupts will result in
-excessive power consumption.  This is especially bad on battery-powered
-devices, where it results in extremely short battery lifetimes.  If you
-are running light workloads, you should therefore read the following
-section.
-
-In addition, if you are running either a real-time workload or an HPC
-workload with short iterations, the scheduling-clock interrupts can
-degrade your applications performance.  If this describes your workload,
-you should read the following two sections.
-
-
-OMIT SCHEDULING-CLOCK TICKS FOR IDLE CPUs
-
-If a CPU is idle, there is little point in sending it a scheduling-clock
-interrupt.  After all, the primary purpose of a scheduling-clock interrupt
-is to force a busy CPU to shift its attention among multiple duties,
-and an idle CPU has no duties to shift its attention among.
-
-The CONFIG_NO_HZ_IDLE=y Kconfig option causes the kernel to avoid sending
-scheduling-clock interrupts to idle CPUs, which is critically important
-both to battery-powered devices and to highly virtualized mainframes.
-A battery-powered device running a CONFIG_HZ_PERIODIC=y kernel would
-drain its battery very quickly, easily 2-3 times as fast as would the
-same device running a CONFIG_NO_HZ_IDLE=y kernel.  A mainframe running
-1,500 OS instances might find that half of its CPU time was consumed by
-unnecessary scheduling-clock interrupts.  In these situations, there
-is strong motivation to avoid sending scheduling-clock interrupts to
-idle CPUs.  That said, dyntick-idle mode is not free:
-
-1.	It increases the number of instructions executed on the path
-	to and from the idle loop.
-
-2.	On many architectures, dyntick-idle mode also increases the
-	number of expensive clock-reprogramming operations.
-
-Therefore, systems with aggressive real-time response constraints often
-run CONFIG_HZ_PERIODIC=y kernels (or CONFIG_NO_HZ=n for older kernels)
-in order to avoid degrading from-idle transition latencies.
-
-An idle CPU that is not receiving scheduling-clock interrupts is said to
-be "dyntick-idle", "in dyntick-idle mode", "in nohz mode", or "running
-tickless".  The remainder of this document will use "dyntick-idle mode".
-
-There is also a boot parameter "nohz=" that can be used to disable
-dyntick-idle mode in CONFIG_NO_HZ_IDLE=y kernels by specifying "nohz=off".
-By default, CONFIG_NO_HZ_IDLE=y kernels boot with "nohz=on", enabling
-dyntick-idle mode.
-
-
-OMIT SCHEDULING-CLOCK TICKS FOR CPUs WITH ONLY ONE RUNNABLE TASK
-
-If a CPU has only one runnable task, there is little point in sending it
-a scheduling-clock interrupt because there is no other task to switch to.
-Note that omitting scheduling-clock ticks for CPUs with only one runnable
-task implies also omitting them for idle CPUs.
-
-The CONFIG_NO_HZ_FULL=y Kconfig option causes the kernel to avoid
-sending scheduling-clock interrupts to CPUs with a single runnable task,
-and such CPUs are said to be "adaptive-ticks CPUs".  This is important
-for applications with aggressive real-time response constraints because
-it allows them to improve their worst-case response times by the maximum
-duration of a scheduling-clock interrupt.  It is also important for
-computationally intensive short-iteration workloads:  If any CPU is
-delayed during a given iteration, all the other CPUs will be forced to
-wait idle while the delayed CPU finishes.  Thus, the delay is multiplied
-by one less than the number of CPUs.  In these situations, there is
-again strong motivation to avoid sending scheduling-clock interrupts.
-
-By default, no CPU will be an adaptive-ticks CPU.  The "nohz_full="
-boot parameter specifies the adaptive-ticks CPUs.  For example,
-"nohz_full=1,6-8" says that CPUs 1, 6, 7, and 8 are to be adaptive-ticks
-CPUs.  Note that you are prohibited from marking all of the CPUs as
-adaptive-tick CPUs:  At least one non-adaptive-tick CPU must remain
-online to handle timekeeping tasks in order to ensure that system
-calls like gettimeofday() returns accurate values on adaptive-tick CPUs.
-(This is not an issue for CONFIG_NO_HZ_IDLE=y because there are no running
-user processes to observe slight drifts in clock rate.)  Therefore, the
-boot CPU is prohibited from entering adaptive-ticks mode.  Specifying a
-"nohz_full=" mask that includes the boot CPU will result in a boot-time
-error message, and the boot CPU will be removed from the mask.  Note that
-this means that your system must have at least two CPUs in order for
-CONFIG_NO_HZ_FULL=y to do anything for you.
-
-Finally, adaptive-ticks CPUs must have their RCU callbacks offloaded.
-This is covered in the "RCU IMPLICATIONS" section below.
-
-Normally, a CPU remains in adaptive-ticks mode as long as possible.
-In particular, transitioning to kernel mode does not automatically change
-the mode.  Instead, the CPU will exit adaptive-ticks mode only if needed,
-for example, if that CPU enqueues an RCU callback.
-
-Just as with dyntick-idle mode, the benefits of adaptive-tick mode do
-not come for free:
-
-1.	CONFIG_NO_HZ_FULL selects CONFIG_NO_HZ_COMMON, so you cannot run
-	adaptive ticks without also running dyntick idle.  This dependency
-	extends down into the implementation, so that all of the costs
-	of CONFIG_NO_HZ_IDLE are also incurred by CONFIG_NO_HZ_FULL.
-
-2.	The user/kernel transitions are slightly more expensive due
-	to the need to inform kernel subsystems (such as RCU) about
-	the change in mode.
-
-3.	POSIX CPU timers prevent CPUs from entering adaptive-tick mode.
-	Real-time applications needing to take actions based on CPU time
-	consumption need to use other means of doing so.
-
-4.	If there are more perf events pending than the hardware can
-	accommodate, they are normally round-robined so as to collect
-	all of them over time.  Adaptive-tick mode may prevent this
-	round-robining from happening.  This will likely be fixed by
-	preventing CPUs with large numbers of perf events pending from
-	entering adaptive-tick mode.
-
-5.	Scheduler statistics for adaptive-tick CPUs may be computed
-	slightly differently than those for non-adaptive-tick CPUs.
-	This might in turn perturb load-balancing of real-time tasks.
-
-6.	The LB_BIAS scheduler feature is disabled by adaptive ticks.
-
-Although improvements are expected over time, adaptive ticks is quite
-useful for many types of real-time and compute-intensive applications.
-However, the drawbacks listed above mean that adaptive ticks should not
-(yet) be enabled by default.
-
-
-RCU IMPLICATIONS
-
-There are situations in which idle CPUs cannot be permitted to
-enter either dyntick-idle mode or adaptive-tick mode, the most
-common being when that CPU has RCU callbacks pending.
-
-The CONFIG_RCU_FAST_NO_HZ=y Kconfig option may be used to cause such CPUs
-to enter dyntick-idle mode or adaptive-tick mode anyway.  In this case,
-a timer will awaken these CPUs every four jiffies in order to ensure
-that the RCU callbacks are processed in a timely fashion.
-
-Another approach is to offload RCU callback processing to "rcuo" kthreads
-using the CONFIG_RCU_NOCB_CPU=y Kconfig option.  The specific CPUs to
-offload may be selected using The "rcu_nocbs=" kernel boot parameter,
-which takes a comma-separated list of CPUs and CPU ranges, for example,
-"1,3-5" selects CPUs 1, 3, 4, and 5.
-
-The offloaded CPUs will never queue RCU callbacks, and therefore RCU
-never prevents offloaded CPUs from entering either dyntick-idle mode
-or adaptive-tick mode.  That said, note that it is up to userspace to
-pin the "rcuo" kthreads to specific CPUs if desired.  Otherwise, the
-scheduler will decide where to run them, which might or might not be
-where you want them to run.
-
-
-TESTING
-
-So you enable all the OS-jitter features described in this document,
-but do not see any change in your workload's behavior.  Is this because
-your workload isn't affected that much by OS jitter, or is it because
-something else is in the way?  This section helps answer this question
-by providing a simple OS-jitter test suite, which is available on branch
-master of the following git archive:
-
-git://git.kernel.org/pub/scm/linux/kernel/git/frederic/dynticks-testing.git
-
-Clone this archive and follow the instructions in the README file.
-This test procedure will produce a trace that will allow you to evaluate
-whether or not you have succeeded in removing OS jitter from your system.
-If this trace shows that you have removed OS jitter as much as is
-possible, then you can conclude that your workload is not all that
-sensitive to OS jitter.
-
-Note: this test requires that your system have at least two CPUs.
-We do not currently have a good way to remove OS jitter from single-CPU
-systems.
-
-
-KNOWN ISSUES
-
-o	Dyntick-idle slows transitions to and from idle slightly.
-	In practice, this has not been a problem except for the most
-	aggressive real-time workloads, which have the option of disabling
-	dyntick-idle mode, an option that most of them take.  However,
-	some workloads will no doubt want to use adaptive ticks to
-	eliminate scheduling-clock interrupt latencies.  Here are some
-	options for these workloads:
-
-	a.	Use PMQOS from userspace to inform the kernel of your
-		latency requirements (preferred).
-
-	b.	On x86 systems, use the "idle=mwait" boot parameter.
-
-	c.	On x86 systems, use the "intel_idle.max_cstate=" to limit
-	`	the maximum C-state depth.
-
-	d.	On x86 systems, use the "idle=poll" boot parameter.
-		However, please note that use of this parameter can cause
-		your CPU to overheat, which may cause thermal throttling
-		to degrade your latencies -- and that this degradation can
-		be even worse than that of dyntick-idle.  Furthermore,
-		this parameter effectively disables Turbo Mode on Intel
-		CPUs, which can significantly reduce maximum performance.
-
-o	Adaptive-ticks slows user/kernel transitions slightly.
-	This is not expected to be a problem for computationally intensive
-	workloads, which have few such transitions.  Careful benchmarking
-	will be required to determine whether or not other workloads
-	are significantly affected by this effect.
-
-o	Adaptive-ticks does not do anything unless there is only one
-	runnable task for a given CPU, even though there are a number
-	of other situations where the scheduling-clock tick is not
-	needed.  To give but one example, consider a CPU that has one
-	runnable high-priority SCHED_FIFO task and an arbitrary number
-	of low-priority SCHED_OTHER tasks.  In this case, the CPU is
-	required to run the SCHED_FIFO task until it either blocks or
-	some other higher-priority task awakens on (or is assigned to)
-	this CPU, so there is no point in sending a scheduling-clock
-	interrupt to this CPU.	However, the current implementation
-	nevertheless sends scheduling-clock interrupts to CPUs having a
-	single runnable SCHED_FIFO task and multiple runnable SCHED_OTHER
-	tasks, even though these interrupts are unnecessary.
-
-	And even when there are multiple runnable tasks on a given CPU,
-	there is little point in interrupting that CPU until the current
-	running task's timeslice expires, which is almost always way
-	longer than the time of the next scheduling-clock interrupt.
-
-	Better handling of these sorts of situations is future work.
-
-o	A reboot is required to reconfigure both adaptive idle and RCU
-	callback offloading.  Runtime reconfiguration could be provided
-	if needed, however, due to the complexity of reconfiguring RCU at
-	runtime, there would need to be an earthshakingly good reason.
-	Especially given that you have the straightforward option of
-	simply offloading RCU callbacks from all CPUs and pinning them
-	where you want them whenever you want them pinned.
-
-o	Additional configuration is required to deal with other sources
-	of OS jitter, including interrupts and system-utility tasks
-	and processes.  This configuration normally involves binding
-	interrupts and tasks to particular CPUs.
-
-o	Some sources of OS jitter can currently be eliminated only by
-	constraining the workload.  For example, the only way to eliminate
-	OS jitter due to global TLB shootdowns is to avoid the unmapping
-	operations (such as kernel module unload operations) that
-	result in these shootdowns.  For another example, page faults
-	and TLB misses can be reduced (and in some cases eliminated) by
-	using huge pages and by constraining the amount of memory used
-	by the application.  Pre-faulting the working set can also be
-	helpful, especially when combined with the mlock() and mlockall()
-	system calls.
-
-o	Unless all CPUs are idle, at least one CPU must keep the
-	scheduling-clock interrupt going in order to support accurate
-	timekeeping.
-
-o	If there might potentially be some adaptive-ticks CPUs, there
-	will be at least one CPU keeping the scheduling-clock interrupt
-	going, even if all CPUs are otherwise idle.
-
-	Better handling of this situation is ongoing work.
-
-o	Some process-handling operations still require the occasional
-	scheduling-clock tick.	These operations include calculating CPU
-	load, maintaining sched average, computing CFS entity vruntime,
-	computing avenrun, and carrying out load balancing.  They are
-	currently accommodated by scheduling-clock tick every second
-	or so.	On-going work will eliminate the need even for these
-	infrequent scheduling-clock ticks.
diff --git a/Documentation/timers/highres.rst b/Documentation/timers/highres.rst
new file mode 100644
index 000000000000..bde5eb7e5c9e
--- /dev/null
+++ b/Documentation/timers/highres.rst
@@ -0,0 +1,250 @@
+=====================================================
+High resolution timers and dynamic ticks design notes
+=====================================================
+
+Further information can be found in the paper of the OLS 2006 talk "hrtimers
+and beyond". The paper is part of the OLS 2006 Proceedings Volume 1, which can
+be found on the OLS website:
+https://www.kernel.org/doc/ols/2006/ols2006v1-pages-333-346.pdf
+
+The slides to this talk are available from:
+http://www.cs.columbia.edu/~nahum/w6998/papers/ols2006-hrtimers-slides.pdf
+
+The slides contain five figures (pages 2, 15, 18, 20, 22), which illustrate the
+changes in the time(r) related Linux subsystems. Figure #1 (p. 2) shows the
+design of the Linux time(r) system before hrtimers and other building blocks
+got merged into mainline.
+
+Note: the paper and the slides are talking about "clock event source", while we
+switched to the name "clock event devices" in the meantime.
+
+The design contains the following basic building blocks:
+
+- hrtimer base infrastructure
+- timeofday and clock source management
+- clock event management
+- high resolution timer functionality
+- dynamic ticks
+
+
+hrtimer base infrastructure
+---------------------------
+
+The hrtimer base infrastructure was merged into the 2.6.16 kernel. Details of
+the base implementation are covered in Documentation/timers/hrtimers.rst. See
+also figure #2 (OLS slides p. 15).
+
+The main differences to the timer wheel, which holds the armed timer_list type
+timers are:
+
+       - time ordered enqueueing into a rb-tree
+       - independent of ticks (the processing is based on nanoseconds)
+
+
+timeofday and clock source management
+-------------------------------------
+
+John Stultz's Generic Time Of Day (GTOD) framework moves a large portion of
+code out of the architecture-specific areas into a generic management
+framework, as illustrated in figure #3 (OLS slides p. 18). The architecture
+specific portion is reduced to the low level hardware details of the clock
+sources, which are registered in the framework and selected on a quality based
+decision. The low level code provides hardware setup and readout routines and
+initializes data structures, which are used by the generic time keeping code to
+convert the clock ticks to nanosecond based time values. All other time keeping
+related functionality is moved into the generic code. The GTOD base patch got
+merged into the 2.6.18 kernel.
+
+Further information about the Generic Time Of Day framework is available in the
+OLS 2005 Proceedings Volume 1:
+
+	http://www.linuxsymposium.org/2005/linuxsymposium_procv1.pdf
+
+The paper "We Are Not Getting Any Younger: A New Approach to Time and
+Timers" was written by J. Stultz, D.V. Hart, & N. Aravamudan.
+
+Figure #3 (OLS slides p.18) illustrates the transformation.
+
+
+clock event management
+----------------------
+
+While clock sources provide read access to the monotonically increasing time
+value, clock event devices are used to schedule the next event
+interrupt(s). The next event is currently defined to be periodic, with its
+period defined at compile time. The setup and selection of the event device
+for various event driven functionalities is hardwired into the architecture
+dependent code. This results in duplicated code across all architectures and
+makes it extremely difficult to change the configuration of the system to use
+event interrupt devices other than those already built into the
+architecture. Another implication of the current design is that it is necessary
+to touch all the architecture-specific implementations in order to provide new
+functionality like high resolution timers or dynamic ticks.
+
+The clock events subsystem tries to address this problem by providing a generic
+solution to manage clock event devices and their usage for the various clock
+event driven kernel functionalities. The goal of the clock event subsystem is
+to minimize the clock event related architecture dependent code to the pure
+hardware related handling and to allow easy addition and utilization of new
+clock event devices. It also minimizes the duplicated code across the
+architectures as it provides generic functionality down to the interrupt
+service handler, which is almost inherently hardware dependent.
+
+Clock event devices are registered either by the architecture dependent boot
+code or at module insertion time. Each clock event device fills a data
+structure with clock-specific property parameters and callback functions. The
+clock event management decides, by using the specified property parameters, the
+set of system functions a clock event device will be used to support. This
+includes the distinction of per-CPU and per-system global event devices.
+
+System-level global event devices are used for the Linux periodic tick. Per-CPU
+event devices are used to provide local CPU functionality such as process
+accounting, profiling, and high resolution timers.
+
+The management layer assigns one or more of the following functions to a clock
+event device:
+
+      - system global periodic tick (jiffies update)
+      - cpu local update_process_times
+      - cpu local profiling
+      - cpu local next event interrupt (non periodic mode)
+
+The clock event device delegates the selection of those timer interrupt related
+functions completely to the management layer. The clock management layer stores
+a function pointer in the device description structure, which has to be called
+from the hardware level handler. This removes a lot of duplicated code from the
+architecture specific timer interrupt handlers and hands the control over the
+clock event devices and the assignment of timer interrupt related functionality
+to the core code.
+
+The clock event layer API is rather small. Aside from the clock event device
+registration interface it provides functions to schedule the next event
+interrupt, clock event device notification service and support for suspend and
+resume.
+
+The framework adds about 700 lines of code which results in a 2KB increase of
+the kernel binary size. The conversion of i386 removes about 100 lines of
+code. The binary size decrease is in the range of 400 bytes. We believe that the
+increase of flexibility and the avoidance of duplicated code across
+architectures justifies the slight increase of the binary size.
+
+The conversion of an architecture has no functional impact, but allows the
+high resolution and dynamic tick functionalities to be used without any change
+to the clock event device and timer interrupt code. After the conversion the
+enabling of high resolution timers and dynamic ticks is simply provided by
+adding the kernel/time/Kconfig file to the architecture specific Kconfig and
+adding the dynamic tick specific calls to the idle routine (a total of 3 lines
+added to the idle function and the Kconfig file).
+
+Figure #4 (OLS slides p.20) illustrates the transformation.
+
+
+high resolution timer functionality
+-----------------------------------
+
+During system boot it is not possible to use the high resolution timer
+functionality, while making it possible would be difficult and would serve no
+useful function. The initialization of the clock event device framework, the
+clock source framework (GTOD) and hrtimers itself has to be done and
+appropriate clock sources and clock event devices have to be registered before
+the high resolution functionality can work. Up to the point where hrtimers are
+initialized, the system works in the usual low resolution periodic mode. The
+clock source and the clock event device layers provide notification functions
+which inform hrtimers about availability of new hardware. hrtimers validates
+the usability of the registered clock sources and clock event devices before
+switching to high resolution mode. This ensures also that a kernel which is
+configured for high resolution timers can run on a system which lacks the
+necessary hardware support.
+
+The high resolution timer code does not support SMP machines which have only
+global clock event devices. The support of such hardware would involve IPI
+calls when an interrupt happens. The overhead would be much larger than the
+benefit. This is the reason why we currently disable high resolution and
+dynamic ticks on i386 SMP systems which stop the local APIC in C3 power
+state. A workaround is available as an idea, but the problem has not been
+tackled yet.
+
+The time ordered insertion of timers provides all the infrastructure to decide
+whether the event device has to be reprogrammed when a timer is added. The
+decision is made per timer base and synchronized across per-cpu timer bases in
+a support function. The design allows the system to utilize separate per-CPU
+clock event devices for the per-CPU timer bases, but currently only one
+reprogrammable clock event device per-CPU is utilized.
+
+When the timer interrupt happens, the next event interrupt handler is called
+from the clock event distribution code and moves expired timers from the
+red-black tree to a separate double linked list and invokes the softirq
+handler. An additional mode field in the hrtimer structure allows the system to
+execute callback functions directly from the next event interrupt handler. This
+is restricted to code which can safely be executed in the hard interrupt
+context. This applies, for example, to the common case of a wakeup function as
+used by nanosleep. The advantage of executing the handler in the interrupt
+context is the avoidance of up to two context switches - from the interrupted
+context to the softirq and to the task which is woken up by the expired
+timer.
+
+Once a system has switched to high resolution mode, the periodic tick is
+switched off. This disables the per system global periodic clock event device -
+e.g. the PIT on i386 SMP systems.
+
+The periodic tick functionality is provided by a per-cpu hrtimer. The callback
+function is executed in the next event interrupt context and updates jiffies
+and calls update_process_times and profiling. The implementation of the hrtimer
+based periodic tick is designed to be extended with dynamic tick functionality.
+This allows a single clock event device to be used to schedule high resolution
+timer and periodic events (jiffies tick, profiling, process accounting) on UP
+systems. This has been proved to work with the PIT on i386 and the Incrementer
+on PPC.
+
+The softirq for running the hrtimer queues and executing the callbacks has been
+separated from the tick bound timer softirq to allow accurate delivery of high
+resolution timer signals which are used by itimer and POSIX interval
+timers. The execution of this softirq can still be delayed by other softirqs,
+but the overall latencies have been significantly improved by this separation.
+
+Figure #5 (OLS slides p.22) illustrates the transformation.
+
+
+dynamic ticks
+-------------
+
+Dynamic ticks are the logical consequence of the hrtimer based periodic tick
+replacement (sched_tick). The functionality of the sched_tick hrtimer is
+extended by three functions:
+
+- hrtimer_stop_sched_tick
+- hrtimer_restart_sched_tick
+- hrtimer_update_jiffies
+
+hrtimer_stop_sched_tick() is called when a CPU goes into idle state. The code
+evaluates the next scheduled timer event (from both hrtimers and the timer
+wheel) and in case that the next event is further away than the next tick it
+reprograms the sched_tick to this future event, to allow longer idle sleeps
+without worthless interruption by the periodic tick. The function is also
+called when an interrupt happens during the idle period, which does not cause a
+reschedule. The call is necessary as the interrupt handler might have armed a
+new timer whose expiry time is before the time which was identified as the
+nearest event in the previous call to hrtimer_stop_sched_tick.
+
+hrtimer_restart_sched_tick() is called when the CPU leaves the idle state before
+it calls schedule(). hrtimer_restart_sched_tick() resumes the periodic tick,
+which is kept active until the next call to hrtimer_stop_sched_tick().
+
+hrtimer_update_jiffies() is called from irq_enter() when an interrupt happens
+in the idle period to make sure that jiffies are up to date and the interrupt
+handler does not have to deal with a potentially stale jiffy value.
+
+The dynamic tick feature provides statistical values which are exported to
+userspace via /proc/stat and can be made available for enhanced power
+management control.
+
+The implementation leaves room for further development like full tickless
+systems, where the time slice is controlled by the scheduler, variable
+frequency profiling, and a complete removal of jiffies in the future.
+
+
+Aside from the current initial submission of i386 support, the patchset has
+been extended to x86_64 and ARM already. Initial (work in progress) support is
+also available for MIPS and PowerPC.
+
+	  Thomas, Ingo
diff --git a/Documentation/timers/highres.txt b/Documentation/timers/highres.txt
deleted file mode 100644
index 8f9741592123..000000000000
--- a/Documentation/timers/highres.txt
+++ /dev/null
@@ -1,249 +0,0 @@
-High resolution timers and dynamic ticks design notes
------------------------------------------------------
-
-Further information can be found in the paper of the OLS 2006 talk "hrtimers
-and beyond". The paper is part of the OLS 2006 Proceedings Volume 1, which can
-be found on the OLS website:
-https://www.kernel.org/doc/ols/2006/ols2006v1-pages-333-346.pdf
-
-The slides to this talk are available from:
-http://www.cs.columbia.edu/~nahum/w6998/papers/ols2006-hrtimers-slides.pdf
-
-The slides contain five figures (pages 2, 15, 18, 20, 22), which illustrate the
-changes in the time(r) related Linux subsystems. Figure #1 (p. 2) shows the
-design of the Linux time(r) system before hrtimers and other building blocks
-got merged into mainline.
-
-Note: the paper and the slides are talking about "clock event source", while we
-switched to the name "clock event devices" in meantime.
-
-The design contains the following basic building blocks:
-
-- hrtimer base infrastructure
-- timeofday and clock source management
-- clock event management
-- high resolution timer functionality
-- dynamic ticks
-
-
-hrtimer base infrastructure
----------------------------
-
-The hrtimer base infrastructure was merged into the 2.6.16 kernel. Details of
-the base implementation are covered in Documentation/timers/hrtimers.txt. See
-also figure #2 (OLS slides p. 15)
-
-The main differences to the timer wheel, which holds the armed timer_list type
-timers are:
-       - time ordered enqueueing into a rb-tree
-       - independent of ticks (the processing is based on nanoseconds)
-
-
-timeofday and clock source management
--------------------------------------
-
-John Stultz's Generic Time Of Day (GTOD) framework moves a large portion of
-code out of the architecture-specific areas into a generic management
-framework, as illustrated in figure #3 (OLS slides p. 18). The architecture
-specific portion is reduced to the low level hardware details of the clock
-sources, which are registered in the framework and selected on a quality based
-decision. The low level code provides hardware setup and readout routines and
-initializes data structures, which are used by the generic time keeping code to
-convert the clock ticks to nanosecond based time values. All other time keeping
-related functionality is moved into the generic code. The GTOD base patch got
-merged into the 2.6.18 kernel.
-
-Further information about the Generic Time Of Day framework is available in the
-OLS 2005 Proceedings Volume 1:
-http://www.linuxsymposium.org/2005/linuxsymposium_procv1.pdf
-
-The paper "We Are Not Getting Any Younger: A New Approach to Time and
-Timers" was written by J. Stultz, D.V. Hart, & N. Aravamudan.
-
-Figure #3 (OLS slides p.18) illustrates the transformation.
-
-
-clock event management
-----------------------
-
-While clock sources provide read access to the monotonically increasing time
-value, clock event devices are used to schedule the next event
-interrupt(s). The next event is currently defined to be periodic, with its
-period defined at compile time. The setup and selection of the event device
-for various event driven functionalities is hardwired into the architecture
-dependent code. This results in duplicated code across all architectures and
-makes it extremely difficult to change the configuration of the system to use
-event interrupt devices other than those already built into the
-architecture. Another implication of the current design is that it is necessary
-to touch all the architecture-specific implementations in order to provide new
-functionality like high resolution timers or dynamic ticks.
-
-The clock events subsystem tries to address this problem by providing a generic
-solution to manage clock event devices and their usage for the various clock
-event driven kernel functionalities. The goal of the clock event subsystem is
-to minimize the clock event related architecture dependent code to the pure
-hardware related handling and to allow easy addition and utilization of new
-clock event devices. It also minimizes the duplicated code across the
-architectures as it provides generic functionality down to the interrupt
-service handler, which is almost inherently hardware dependent.
-
-Clock event devices are registered either by the architecture dependent boot
-code or at module insertion time. Each clock event device fills a data
-structure with clock-specific property parameters and callback functions. The
-clock event management decides, by using the specified property parameters, the
-set of system functions a clock event device will be used to support. This
-includes the distinction of per-CPU and per-system global event devices.
-
-System-level global event devices are used for the Linux periodic tick. Per-CPU
-event devices are used to provide local CPU functionality such as process
-accounting, profiling, and high resolution timers.
-
-The management layer assigns one or more of the following functions to a clock
-event device:
-      - system global periodic tick (jiffies update)
-      - cpu local update_process_times
-      - cpu local profiling
-      - cpu local next event interrupt (non periodic mode)
-
-The clock event device delegates the selection of those timer interrupt related
-functions completely to the management layer. The clock management layer stores
-a function pointer in the device description structure, which has to be called
-from the hardware level handler. This removes a lot of duplicated code from the
-architecture specific timer interrupt handlers and hands the control over the
-clock event devices and the assignment of timer interrupt related functionality
-to the core code.
-
-The clock event layer API is rather small. Aside from the clock event device
-registration interface it provides functions to schedule the next event
-interrupt, clock event device notification service and support for suspend and
-resume.
-
-The framework adds about 700 lines of code which results in a 2KB increase of
-the kernel binary size. The conversion of i386 removes about 100 lines of
-code. The binary size decrease is in the range of 400 byte. We believe that the
-increase of flexibility and the avoidance of duplicated code across
-architectures justifies the slight increase of the binary size.
-
-The conversion of an architecture has no functional impact, but allows to
-utilize the high resolution and dynamic tick functionalities without any change
-to the clock event device and timer interrupt code. After the conversion the
-enabling of high resolution timers and dynamic ticks is simply provided by
-adding the kernel/time/Kconfig file to the architecture specific Kconfig and
-adding the dynamic tick specific calls to the idle routine (a total of 3 lines
-added to the idle function and the Kconfig file)
-
-Figure #4 (OLS slides p.20) illustrates the transformation.
-
-
-high resolution timer functionality
------------------------------------
-
-During system boot it is not possible to use the high resolution timer
-functionality, while making it possible would be difficult and would serve no
-useful function. The initialization of the clock event device framework, the
-clock source framework (GTOD) and hrtimers itself has to be done and
-appropriate clock sources and clock event devices have to be registered before
-the high resolution functionality can work. Up to the point where hrtimers are
-initialized, the system works in the usual low resolution periodic mode. The
-clock source and the clock event device layers provide notification functions
-which inform hrtimers about availability of new hardware. hrtimers validates
-the usability of the registered clock sources and clock event devices before
-switching to high resolution mode. This ensures also that a kernel which is
-configured for high resolution timers can run on a system which lacks the
-necessary hardware support.
-
-The high resolution timer code does not support SMP machines which have only
-global clock event devices. The support of such hardware would involve IPI
-calls when an interrupt happens. The overhead would be much larger than the
-benefit. This is the reason why we currently disable high resolution and
-dynamic ticks on i386 SMP systems which stop the local APIC in C3 power
-state. A workaround is available as an idea, but the problem has not been
-tackled yet.
-
-The time ordered insertion of timers provides all the infrastructure to decide
-whether the event device has to be reprogrammed when a timer is added. The
-decision is made per timer base and synchronized across per-cpu timer bases in
-a support function. The design allows the system to utilize separate per-CPU
-clock event devices for the per-CPU timer bases, but currently only one
-reprogrammable clock event device per-CPU is utilized.
-
-When the timer interrupt happens, the next event interrupt handler is called
-from the clock event distribution code and moves expired timers from the
-red-black tree to a separate double linked list and invokes the softirq
-handler. An additional mode field in the hrtimer structure allows the system to
-execute callback functions directly from the next event interrupt handler. This
-is restricted to code which can safely be executed in the hard interrupt
-context. This applies, for example, to the common case of a wakeup function as
-used by nanosleep. The advantage of executing the handler in the interrupt
-context is the avoidance of up to two context switches - from the interrupted
-context to the softirq and to the task which is woken up by the expired
-timer.
-
-Once a system has switched to high resolution mode, the periodic tick is
-switched off. This disables the per system global periodic clock event device -
-e.g. the PIT on i386 SMP systems.
-
-The periodic tick functionality is provided by an per-cpu hrtimer. The callback
-function is executed in the next event interrupt context and updates jiffies
-and calls update_process_times and profiling. The implementation of the hrtimer
-based periodic tick is designed to be extended with dynamic tick functionality.
-This allows to use a single clock event device to schedule high resolution
-timer and periodic events (jiffies tick, profiling, process accounting) on UP
-systems. This has been proved to work with the PIT on i386 and the Incrementer
-on PPC.
-
-The softirq for running the hrtimer queues and executing the callbacks has been
-separated from the tick bound timer softirq to allow accurate delivery of high
-resolution timer signals which are used by itimer and POSIX interval
-timers. The execution of this softirq can still be delayed by other softirqs,
-but the overall latencies have been significantly improved by this separation.
-
-Figure #5 (OLS slides p.22) illustrates the transformation.
-
-
-dynamic ticks
--------------
-
-Dynamic ticks are the logical consequence of the hrtimer based periodic tick
-replacement (sched_tick). The functionality of the sched_tick hrtimer is
-extended by three functions:
-
-- hrtimer_stop_sched_tick
-- hrtimer_restart_sched_tick
-- hrtimer_update_jiffies
-
-hrtimer_stop_sched_tick() is called when a CPU goes into idle state. The code
-evaluates the next scheduled timer event (from both hrtimers and the timer
-wheel) and in case that the next event is further away than the next tick it
-reprograms the sched_tick to this future event, to allow longer idle sleeps
-without worthless interruption by the periodic tick. The function is also
-called when an interrupt happens during the idle period, which does not cause a
-reschedule. The call is necessary as the interrupt handler might have armed a
-new timer whose expiry time is before the time which was identified as the
-nearest event in the previous call to hrtimer_stop_sched_tick.
-
-hrtimer_restart_sched_tick() is called when the CPU leaves the idle state before
-it calls schedule(). hrtimer_restart_sched_tick() resumes the periodic tick,
-which is kept active until the next call to hrtimer_stop_sched_tick().
-
-hrtimer_update_jiffies() is called from irq_enter() when an interrupt happens
-in the idle period to make sure that jiffies are up to date and the interrupt
-handler has not to deal with an eventually stale jiffy value.
-
-The dynamic tick feature provides statistical values which are exported to
-userspace via /proc/stat and can be made available for enhanced power
-management control.
-
-The implementation leaves room for further development like full tickless
-systems, where the time slice is controlled by the scheduler, variable
-frequency profiling, and a complete removal of jiffies in the future.
-
-
-Aside the current initial submission of i386 support, the patchset has been
-extended to x86_64 and ARM already. Initial (work in progress) support is also
-available for MIPS and PowerPC.
-
-	  Thomas, Ingo
-
-
-
diff --git a/Documentation/timers/hpet.rst b/Documentation/timers/hpet.rst
new file mode 100644
index 000000000000..c9d05d3caaca
--- /dev/null
+++ b/Documentation/timers/hpet.rst
@@ -0,0 +1,30 @@
+===========================================
+High Precision Event Timer Driver for Linux
+===========================================
+
+The High Precision Event Timer (HPET) hardware follows a specification
+by Intel and Microsoft, revision 1.
+
+Each HPET has one fixed-rate counter (at 10+ MHz, hence "High Precision")
+and up to 32 comparators.  Normally three or more comparators are provided,
+each of which can generate oneshot interrupts and at least one of which has
+additional hardware to support periodic interrupts.  The comparators are
+also called "timers", which can be misleading since usually timers are
+independent of each other ... these share a counter, complicating resets.
+
+HPET devices can support two interrupt routing modes.  In one mode, the
+comparators are additional interrupt sources with no particular system
+role.  Many x86 BIOS writers don't route HPET interrupts at all, which
+prevents use of that mode.  They support the other "legacy replacement"
+mode where the first two comparators block interrupts from 8254 timers
+and from the RTC.
+
+The driver supports detection of HPET driver allocation and initialization
+of the HPET before the driver module_init routine is called.  This enables
+platform code which uses timer 0 or 1 as the main timer to intercept HPET
+initialization.  An example of this initialization can be found in
+arch/x86/kernel/hpet.c.
+
+The driver provides a userspace API which resembles the API found in the
+RTC driver framework.  An example user space program is provided in
+file:samples/timers/hpet_example.c
diff --git a/Documentation/timers/hpet.txt b/Documentation/timers/hpet.txt
deleted file mode 100644
index 895345ec513b..000000000000
--- a/Documentation/timers/hpet.txt
+++ /dev/null
@@ -1,28 +0,0 @@
-		High Precision Event Timer Driver for Linux
-
-The High Precision Event Timer (HPET) hardware follows a specification
-by Intel and Microsoft, revision 1.
-
-Each HPET has one fixed-rate counter (at 10+ MHz, hence "High Precision")
-and up to 32 comparators.  Normally three or more comparators are provided,
-each of which can generate oneshot interrupts and at least one of which has
-additional hardware to support periodic interrupts.  The comparators are
-also called "timers", which can be misleading since usually timers are
-independent of each other ... these share a counter, complicating resets.
-
-HPET devices can support two interrupt routing modes.  In one mode, the
-comparators are additional interrupt sources with no particular system
-role.  Many x86 BIOS writers don't route HPET interrupts at all, which
-prevents use of that mode.  They support the other "legacy replacement"
-mode where the first two comparators block interrupts from 8254 timers
-and from the RTC.
-
-The driver supports detection of HPET driver allocation and initialization
-of the HPET before the driver module_init routine is called.  This enables
-platform code which uses timer 0 or 1 as the main timer to intercept HPET
-initialization.  An example of this initialization can be found in
-arch/x86/kernel/hpet.c.
-
-The driver provides a userspace API which resembles the API found in the
-RTC driver framework.  An example user space program is provided in
-file:samples/timers/hpet_example.c
diff --git a/Documentation/timers/hrtimers.rst b/Documentation/timers/hrtimers.rst
new file mode 100644
index 000000000000..c1c20a693e8f
--- /dev/null
+++ b/Documentation/timers/hrtimers.rst
@@ -0,0 +1,178 @@
+======================================================
+hrtimers - subsystem for high-resolution kernel timers
+======================================================
+
+This patch introduces a new subsystem for high-resolution kernel timers.
+
+One might ask the question: we already have a timer subsystem
+(kernel/timers.c), why do we need two timer subsystems? After a lot of
+back and forth trying to integrate high-resolution and high-precision
+features into the existing timer framework, and after testing various
+such high-resolution timer implementations in practice, we came to the
+conclusion that the timer wheel code is fundamentally not suitable for
+such an approach. We initially didn't believe this ('there must be a way
+to solve this'), and spent a considerable effort trying to integrate
+things into the timer wheel, but we failed. In hindsight, there are
+several reasons why such integration is hard/impossible:
+
+- the forced handling of low-resolution and high-resolution timers in
+  the same way leads to a lot of compromises, macro magic and #ifdef
+  mess. The timers.c code is very "tightly coded" around jiffies and
+  32-bitness assumptions, and has been honed and micro-optimized for a
+  relatively narrow use case (jiffies in a relatively narrow HZ range)
+  for many years - and thus even small extensions to it easily break
+  the wheel concept, leading to even worse compromises. The timer wheel
+  code is very good and tight code, there's zero problems with it in its
+  current usage - but it is simply not suitable to be extended for
+  high-res timers.
+
+- the unpredictable [O(N)] overhead of cascading leads to delays which
+  necessitate a more complex handling of high resolution timers, which
+  in turn decreases robustness. Such a design still leads to rather large
+  timing inaccuracies. Cascading is a fundamental property of the timer
+  wheel concept, it cannot be 'designed out' without inevitably
+  degrading other portions of the timers.c code in an unacceptable way.
+
+- the implementation of the current posix-timer subsystem on top of
+  the timer wheel has already introduced a quite complex handling of
+  the required readjusting of absolute CLOCK_REALTIME timers at
+  settimeofday or NTP time - further underlining our experience by
+  example: that the timer wheel data structure is too rigid for high-res
+  timers.
+
+- the timer wheel code is most optimal for use cases which can be
+  identified as "timeouts". Such timeouts are usually set up to cover
+  error conditions in various I/O paths, such as networking and block
+  I/O. The vast majority of those timers never expire and are rarely
+  recascaded because the expected correct event arrives in time so they
+  can be removed from the timer wheel before any further processing of
+  them becomes necessary. Thus the users of these timeouts can accept
+  the granularity and precision tradeoffs of the timer wheel, and
+  largely expect the timer subsystem to have near-zero overhead.
+  Accurate timing for them is not a core purpose - in fact most of the
+  timeout values used are ad-hoc. For them it is at most a necessary
+  evil to guarantee the processing of actual timeout completions
+  (because most of the timeouts are deleted before completion), which
+  should thus be as cheap and unintrusive as possible.
+
+The primary users of precision timers are user-space applications that
+utilize nanosleep, posix-timers and itimer interfaces. Also, in-kernel
+users like drivers and subsystems which require precise timed events
+(e.g. multimedia) can benefit from the availability of a separate
+high-resolution timer subsystem as well.
+
+While this subsystem does not offer high-resolution clock sources just
+yet, the hrtimer subsystem can be easily extended with high-resolution
+clock capabilities, and patches for that exist and are maturing quickly.
+The increasing demand for realtime and multimedia applications along
+with other potential users for precise timers gives another reason to
+separate the "timeout" and "precise timer" subsystems.
+
+Another potential benefit is that such a separation allows even more
+special-purpose optimization of the existing timer wheel for the low
+resolution and low precision use cases - once the precision-sensitive
+APIs are separated from the timer wheel and are migrated over to
+hrtimers. E.g. we could decrease the frequency of the timeout subsystem
+from 250 Hz to 100 Hz (or even smaller).
+
+hrtimer subsystem implementation details
+----------------------------------------
+
+the basic design considerations were:
+
+- simplicity
+
+- data structure not bound to jiffies or any other granularity. All the
+  kernel logic works at 64-bit nanoseconds resolution - no compromises.
+
+- simplification of existing, timing related kernel code
+
+another basic requirement was the immediate enqueueing and ordering of
+timers at activation time. After looking at several possible solutions
+such as radix trees and hashes, we chose the red black tree as the basic
+data structure. Rbtrees are available as a library in the kernel and are
+used in various performance-critical areas of e.g. memory management and
+file systems. The rbtree is solely used for time sorted ordering, while
+a separate list is used to give the expiry code fast access to the
+queued timers, without having to walk the rbtree.
+
+(This separate list is also useful for later when we'll introduce
+high-resolution clocks, where we need separate pending and expired
+queues while keeping the time-order intact.)
+
+Time-ordered enqueueing is not purely for the purposes of
+high-resolution clocks though, it also simplifies the handling of
+absolute timers based on a low-resolution CLOCK_REALTIME. The existing
+implementation needed to keep an extra list of all armed absolute
+CLOCK_REALTIME timers along with complex locking. In case of
+settimeofday and NTP, all the timers (!) had to be dequeued, the
+time-changing code had to fix them up one by one, and all of them had to
+be enqueued again. The time-ordered enqueueing and the storage of the
+expiry time in absolute time units removes all this complex and poorly
+scaling code from the posix-timer implementation - the clock can simply
+be set without having to touch the rbtree. This also makes the handling
+of posix-timers simpler in general.
+
+The locking and per-CPU behavior of hrtimers was mostly taken from the
+existing timer wheel code, as it is mature and well suited. Sharing code
+was not really a win, due to the different data structures. Also, the
+hrtimer functions now have clearer behavior and clearer names - such as
+hrtimer_try_to_cancel() and hrtimer_cancel() [which are roughly
+equivalent to del_timer() and del_timer_sync()] - so there's no direct
+1:1 mapping between them on the algorithmic level, and thus no real
+potential for code sharing either.
+
+Basic data types: every time value, absolute or relative, is in a
+special nanosecond-resolution type: ktime_t. The kernel-internal
+representation of ktime_t values and operations is implemented via
+macros and inline functions, and can be switched between a "hybrid
+union" type and a plain "scalar" 64bit nanoseconds representation (at
+compile time). The hybrid union type optimizes time conversions on 32bit
+CPUs. This build-time-selectable ktime_t storage format was implemented
+to avoid the performance impact of 64-bit multiplications and divisions
+on 32bit CPUs. Such operations are frequently necessary to convert
+between the storage formats provided by kernel and userspace interfaces
+and the internal time format. (See include/linux/ktime.h for further
+details.)
+
+hrtimers - rounding of timer values
+-----------------------------------
+
+the hrtimer code will round timer events to lower-resolution clocks
+because it has to. Otherwise it will do no artificial rounding at all.
+
+one question is, what resolution value should be returned to the user by
+the clock_getres() interface. This will return whatever real resolution
+a given clock has - be it low-res, high-res, or artificially-low-res.
+
+hrtimers - testing and verification
+-----------------------------------
+
+We used the high-resolution clock subsystem on top of hrtimers to verify
+the hrtimer implementation details in practice, and we also ran the posix
+timer tests in order to ensure specification compliance. We also ran
+tests on low-resolution clocks.
+
+The hrtimer patch converts the following kernel functionality to use
+hrtimers:
+
+ - nanosleep
+ - itimers
+ - posix-timers
+
+The conversion of nanosleep and posix-timers enabled the unification of
+nanosleep and clock_nanosleep.
+
+The code was successfully compiled for the following platforms:
+
+ i386, x86_64, ARM, PPC, PPC64, IA64
+
+The code was run-tested on the following platforms:
+
+ i386(UP/SMP), x86_64(UP/SMP), ARM, PPC
+
+hrtimers were also integrated into the -rt tree, along with a
+hrtimers-based high-resolution clock implementation, so the hrtimers
+code got a healthy amount of testing and use in practice.
+
+	Thomas Gleixner, Ingo Molnar
diff --git a/Documentation/timers/hrtimers.txt b/Documentation/timers/hrtimers.txt
deleted file mode 100644
index 588d85724f10..000000000000
--- a/Documentation/timers/hrtimers.txt
+++ /dev/null
@@ -1,178 +0,0 @@
-
-hrtimers - subsystem for high-resolution kernel timers
-----------------------------------------------------
-
-This patch introduces a new subsystem for high-resolution kernel timers.
-
-One might ask the question: we already have a timer subsystem
-(kernel/timers.c), why do we need two timer subsystems? After a lot of
-back and forth trying to integrate high-resolution and high-precision
-features into the existing timer framework, and after testing various
-such high-resolution timer implementations in practice, we came to the
-conclusion that the timer wheel code is fundamentally not suitable for
-such an approach. We initially didn't believe this ('there must be a way
-to solve this'), and spent a considerable effort trying to integrate
-things into the timer wheel, but we failed. In hindsight, there are
-several reasons why such integration is hard/impossible:
-
-- the forced handling of low-resolution and high-resolution timers in
-  the same way leads to a lot of compromises, macro magic and #ifdef
-  mess. The timers.c code is very "tightly coded" around jiffies and
-  32-bitness assumptions, and has been honed and micro-optimized for a
-  relatively narrow use case (jiffies in a relatively narrow HZ range)
-  for many years - and thus even small extensions to it easily break
-  the wheel concept, leading to even worse compromises. The timer wheel
-  code is very good and tight code, there's zero problems with it in its
-  current usage - but it is simply not suitable to be extended for
-  high-res timers.
-
-- the unpredictable [O(N)] overhead of cascading leads to delays which
-  necessitate a more complex handling of high resolution timers, which
-  in turn decreases robustness. Such a design still leads to rather large
-  timing inaccuracies. Cascading is a fundamental property of the timer
-  wheel concept, it cannot be 'designed out' without inevitably
-  degrading other portions of the timers.c code in an unacceptable way.
-
-- the implementation of the current posix-timer subsystem on top of
-  the timer wheel has already introduced a quite complex handling of
-  the required readjusting of absolute CLOCK_REALTIME timers at
-  settimeofday or NTP time - further underlying our experience by
-  example: that the timer wheel data structure is too rigid for high-res
-  timers.
-
-- the timer wheel code is most optimal for use cases which can be
-  identified as "timeouts". Such timeouts are usually set up to cover
-  error conditions in various I/O paths, such as networking and block
-  I/O. The vast majority of those timers never expire and are rarely
-  recascaded because the expected correct event arrives in time so they
-  can be removed from the timer wheel before any further processing of
-  them becomes necessary. Thus the users of these timeouts can accept
-  the granularity and precision tradeoffs of the timer wheel, and
-  largely expect the timer subsystem to have near-zero overhead.
-  Accurate timing for them is not a core purpose - in fact most of the
-  timeout values used are ad-hoc. For them it is at most a necessary
-  evil to guarantee the processing of actual timeout completions
-  (because most of the timeouts are deleted before completion), which
-  should thus be as cheap and unintrusive as possible.
-
-The primary users of precision timers are user-space applications that
-utilize nanosleep, posix-timers and itimer interfaces. Also, in-kernel
-users like drivers and subsystems which require precise timed events
-(e.g. multimedia) can benefit from the availability of a separate
-high-resolution timer subsystem as well.
-
-While this subsystem does not offer high-resolution clock sources just
-yet, the hrtimer subsystem can be easily extended with high-resolution
-clock capabilities, and patches for that exist and are maturing quickly.
-The increasing demand for realtime and multimedia applications along
-with other potential users for precise timers gives another reason to
-separate the "timeout" and "precise timer" subsystems.
-
-Another potential benefit is that such a separation allows even more
-special-purpose optimization of the existing timer wheel for the low
-resolution and low precision use cases - once the precision-sensitive
-APIs are separated from the timer wheel and are migrated over to
-hrtimers. E.g. we could decrease the frequency of the timeout subsystem
-from 250 Hz to 100 HZ (or even smaller).
-
-hrtimer subsystem implementation details
-----------------------------------------
-
-the basic design considerations were:
-
-- simplicity
-
-- data structure not bound to jiffies or any other granularity. All the
-  kernel logic works at 64-bit nanoseconds resolution - no compromises.
-
-- simplification of existing, timing related kernel code
-
-another basic requirement was the immediate enqueueing and ordering of
-timers at activation time. After looking at several possible solutions
-such as radix trees and hashes, we chose the red black tree as the basic
-data structure. Rbtrees are available as a library in the kernel and are
-used in various performance-critical areas of e.g. memory management and
-file systems. The rbtree is solely used for time sorted ordering, while
-a separate list is used to give the expiry code fast access to the
-queued timers, without having to walk the rbtree.
-
-(This separate list is also useful for later when we'll introduce
-high-resolution clocks, where we need separate pending and expired
-queues while keeping the time-order intact.)
-
-Time-ordered enqueueing is not purely for the purposes of
-high-resolution clocks though, it also simplifies the handling of
-absolute timers based on a low-resolution CLOCK_REALTIME. The existing
-implementation needed to keep an extra list of all armed absolute
-CLOCK_REALTIME timers along with complex locking. In case of
-settimeofday and NTP, all the timers (!) had to be dequeued, the
-time-changing code had to fix them up one by one, and all of them had to
-be enqueued again. The time-ordered enqueueing and the storage of the
-expiry time in absolute time units removes all this complex and poorly
-scaling code from the posix-timer implementation - the clock can simply
-be set without having to touch the rbtree. This also makes the handling
-of posix-timers simpler in general.
-
-The locking and per-CPU behavior of hrtimers was mostly taken from the
-existing timer wheel code, as it is mature and well suited. Sharing code
-was not really a win, due to the different data structures. Also, the
-hrtimer functions now have clearer behavior and clearer names - such as
-hrtimer_try_to_cancel() and hrtimer_cancel() [which are roughly
-equivalent to del_timer() and del_timer_sync()] - so there's no direct
-1:1 mapping between them on the algorithmic level, and thus no real
-potential for code sharing either.
-
-Basic data types: every time value, absolute or relative, is in a
-special nanosecond-resolution type: ktime_t. The kernel-internal
-representation of ktime_t values and operations is implemented via
-macros and inline functions, and can be switched between a "hybrid
-union" type and a plain "scalar" 64bit nanoseconds representation (at
-compile time). The hybrid union type optimizes time conversions on 32bit
-CPUs. This build-time-selectable ktime_t storage format was implemented
-to avoid the performance impact of 64-bit multiplications and divisions
-on 32bit CPUs. Such operations are frequently necessary to convert
-between the storage formats provided by kernel and userspace interfaces
-and the internal time format. (See include/linux/ktime.h for further
-details.)
-
-hrtimers - rounding of timer values
------------------------------------
-
-the hrtimer code will round timer events to lower-resolution clocks
-because it has to. Otherwise it will do no artificial rounding at all.
-
-one question is, what resolution value should be returned to the user by
-the clock_getres() interface. This will return whatever real resolution
-a given clock has - be it low-res, high-res, or artificially-low-res.
-
-hrtimers - testing and verification
-----------------------------------
-
-We used the high-resolution clock subsystem ontop of hrtimers to verify
-the hrtimer implementation details in praxis, and we also ran the posix
-timer tests in order to ensure specification compliance. We also ran
-tests on low-resolution clocks.
-
-The hrtimer patch converts the following kernel functionality to use
-hrtimers:
-
- - nanosleep
- - itimers
- - posix-timers
-
-The conversion of nanosleep and posix-timers enabled the unification of
-nanosleep and clock_nanosleep.
-
-The code was successfully compiled for the following platforms:
-
- i386, x86_64, ARM, PPC, PPC64, IA64
-
-The code was run-tested on the following platforms:
-
- i386(UP/SMP), x86_64(UP/SMP), ARM, PPC
-
-hrtimers were also integrated into the -rt tree, along with a
-hrtimers-based high-resolution clock implementation, so the hrtimers
-code got a healthy amount of testing and use in practice.
-
-	Thomas Gleixner, Ingo Molnar
diff --git a/Documentation/timers/index.rst b/Documentation/timers/index.rst
new file mode 100644
index 000000000000..91f6f8263c48
--- /dev/null
+++ b/Documentation/timers/index.rst
@@ -0,0 +1,22 @@
+:orphan:
+
+======
+timers
+======
+
+.. toctree::
+    :maxdepth: 1
+
+    highres
+    hpet
+    hrtimers
+    no_hz
+    timekeeping
+    timers-howto
+
+.. only::  subproject and html
+
+   Indices
+   =======
+
+   * :ref:`genindex`
diff --git a/Documentation/timers/no_hz.rst b/Documentation/timers/no_hz.rst
new file mode 100644
index 000000000000..065db217cb04
--- /dev/null
+++ b/Documentation/timers/no_hz.rst
@@ -0,0 +1,326 @@
+======================================
+NO_HZ: Reducing Scheduling-Clock Ticks
+======================================
+
+
+This document describes Kconfig options and boot parameters that can
+reduce the number of scheduling-clock interrupts, thereby improving energy
+efficiency and reducing OS jitter.  Reducing OS jitter is important for
+some types of computationally intensive high-performance computing (HPC)
+applications and for real-time applications.
+
+There are three main ways of managing scheduling-clock interrupts
+(also known as "scheduling-clock ticks" or simply "ticks"):
+
+1.	Never omit scheduling-clock ticks (CONFIG_HZ_PERIODIC=y or
+	CONFIG_NO_HZ=n for older kernels).  You normally will -not-
+	want to choose this option.
+
+2.	Omit scheduling-clock ticks on idle CPUs (CONFIG_NO_HZ_IDLE=y or
+	CONFIG_NO_HZ=y for older kernels).  This is the most common
+	approach, and should be the default.
+
+3.	Omit scheduling-clock ticks on CPUs that are either idle or that
+	have only one runnable task (CONFIG_NO_HZ_FULL=y).  Unless you
+	are running realtime applications or certain types of HPC
+	workloads, you will normally -not- want this option.
+
+These three cases are described in the following three sections, followed
+by a fourth section on RCU-specific considerations, a fifth section
+discussing testing, and a sixth and final section listing known issues.
+
+
+Never Omit Scheduling-Clock Ticks
+=================================
+
+Very old versions of Linux from the 1990s and the very early 2000s
+are incapable of omitting scheduling-clock ticks.  It turns out that
+there are some situations where this old-school approach is still the
+right approach, for example, in heavy workloads with lots of tasks
+that use short bursts of CPU, where there are very frequent idle
+periods, but where these idle periods are also quite short (tens or
+hundreds of microseconds).  For these types of workloads, scheduling
+clock interrupts will normally be delivered anyway because there
+will frequently be multiple runnable tasks per CPU.  In these cases,
+attempting to turn off the scheduling clock interrupt will have no effect
+other than increasing the overhead of switching to and from idle and
+transitioning between user and kernel execution.
+
+This mode of operation can be selected using CONFIG_HZ_PERIODIC=y (or
+CONFIG_NO_HZ=n for older kernels).
+
+However, if you are instead running a light workload with long idle
+periods, failing to omit scheduling-clock interrupts will result in
+excessive power consumption.  This is especially bad on battery-powered
+devices, where it results in extremely short battery lifetimes.  If you
+are running light workloads, you should therefore read the following
+section.
+
+In addition, if you are running either a real-time workload or an HPC
+workload with short iterations, the scheduling-clock interrupts can
+degrade your application's performance.  If this describes your workload,
+you should read the following two sections.
+
+
+Omit Scheduling-Clock Ticks For Idle CPUs
+=========================================
+
+If a CPU is idle, there is little point in sending it a scheduling-clock
+interrupt.  After all, the primary purpose of a scheduling-clock interrupt
+is to force a busy CPU to shift its attention among multiple duties,
+and an idle CPU has no duties to shift its attention among.
+
+The CONFIG_NO_HZ_IDLE=y Kconfig option causes the kernel to avoid sending
+scheduling-clock interrupts to idle CPUs, which is critically important
+both to battery-powered devices and to highly virtualized mainframes.
+A battery-powered device running a CONFIG_HZ_PERIODIC=y kernel would
+drain its battery very quickly, easily 2-3 times as fast as would the
+same device running a CONFIG_NO_HZ_IDLE=y kernel.  A mainframe running
+1,500 OS instances might find that half of its CPU time was consumed by
+unnecessary scheduling-clock interrupts.  In these situations, there
+is strong motivation to avoid sending scheduling-clock interrupts to
+idle CPUs.  That said, dyntick-idle mode is not free:
+
+1.	It increases the number of instructions executed on the path
+	to and from the idle loop.
+
+2.	On many architectures, dyntick-idle mode also increases the
+	number of expensive clock-reprogramming operations.
+
+Therefore, systems with aggressive real-time response constraints often
+run CONFIG_HZ_PERIODIC=y kernels (or CONFIG_NO_HZ=n for older kernels)
+in order to avoid degrading from-idle transition latencies.
+
+An idle CPU that is not receiving scheduling-clock interrupts is said to
+be "dyntick-idle", "in dyntick-idle mode", "in nohz mode", or "running
+tickless".  The remainder of this document will use "dyntick-idle mode".
+
+There is also a boot parameter "nohz=" that can be used to disable
+dyntick-idle mode in CONFIG_NO_HZ_IDLE=y kernels by specifying "nohz=off".
+By default, CONFIG_NO_HZ_IDLE=y kernels boot with "nohz=on", enabling
+dyntick-idle mode.
+
+
+Omit Scheduling-Clock Ticks For CPUs With Only One Runnable Task
+================================================================
+
+If a CPU has only one runnable task, there is little point in sending it
+a scheduling-clock interrupt because there is no other task to switch to.
+Note that omitting scheduling-clock ticks for CPUs with only one runnable
+task implies also omitting them for idle CPUs.
+
+The CONFIG_NO_HZ_FULL=y Kconfig option causes the kernel to avoid
+sending scheduling-clock interrupts to CPUs with a single runnable task,
+and such CPUs are said to be "adaptive-ticks CPUs".  This is important
+for applications with aggressive real-time response constraints because
+it allows them to improve their worst-case response times by the maximum
+duration of a scheduling-clock interrupt.  It is also important for
+computationally intensive short-iteration workloads:  If any CPU is
+delayed during a given iteration, all the other CPUs will be forced to
+wait idle while the delayed CPU finishes.  Thus, the delay is multiplied
+by one less than the number of CPUs.  In these situations, there is
+again strong motivation to avoid sending scheduling-clock interrupts.
+
+By default, no CPU will be an adaptive-ticks CPU.  The "nohz_full="
+boot parameter specifies the adaptive-ticks CPUs.  For example,
+"nohz_full=1,6-8" says that CPUs 1, 6, 7, and 8 are to be adaptive-ticks
+CPUs.  Note that you are prohibited from marking all of the CPUs as
+adaptive-tick CPUs:  At least one non-adaptive-tick CPU must remain
+online to handle timekeeping tasks in order to ensure that system
+calls like gettimeofday() return accurate values on adaptive-tick CPUs.
+(This is not an issue for CONFIG_NO_HZ_IDLE=y because there are no running
+user processes to observe slight drifts in clock rate.)  Therefore, the
+boot CPU is prohibited from entering adaptive-ticks mode.  Specifying a
+"nohz_full=" mask that includes the boot CPU will result in a boot-time
+error message, and the boot CPU will be removed from the mask.  Note that
+this means that your system must have at least two CPUs in order for
+CONFIG_NO_HZ_FULL=y to do anything for you.
+
+Finally, adaptive-ticks CPUs must have their RCU callbacks offloaded.
+This is covered in the "RCU Implications" section below.
+
+Normally, a CPU remains in adaptive-ticks mode as long as possible.
+In particular, transitioning to kernel mode does not automatically change
+the mode.  Instead, the CPU will exit adaptive-ticks mode only if needed,
+for example, if that CPU enqueues an RCU callback.
+
+Just as with dyntick-idle mode, the benefits of adaptive-tick mode do
+not come for free:
+
+1.	CONFIG_NO_HZ_FULL selects CONFIG_NO_HZ_COMMON, so you cannot run
+	adaptive ticks without also running dyntick idle.  This dependency
+	extends down into the implementation, so that all of the costs
+	of CONFIG_NO_HZ_IDLE are also incurred by CONFIG_NO_HZ_FULL.
+
+2.	The user/kernel transitions are slightly more expensive due
+	to the need to inform kernel subsystems (such as RCU) about
+	the change in mode.
+
+3.	POSIX CPU timers prevent CPUs from entering adaptive-tick mode.
+	Real-time applications needing to take actions based on CPU time
+	consumption need to use other means of doing so.
+
+4.	If there are more perf events pending than the hardware can
+	accommodate, they are normally round-robined so as to collect
+	all of them over time.  Adaptive-tick mode may prevent this
+	round-robining from happening.  This will likely be fixed by
+	preventing CPUs with large numbers of perf events pending from
+	entering adaptive-tick mode.
+
+5.	Scheduler statistics for adaptive-tick CPUs may be computed
+	slightly differently than those for non-adaptive-tick CPUs.
+	This might in turn perturb load-balancing of real-time tasks.
+
+6.	The LB_BIAS scheduler feature is disabled by adaptive ticks.
+
+Although improvements are expected over time, adaptive ticks is quite
+useful for many types of real-time and compute-intensive applications.
+However, the drawbacks listed above mean that adaptive ticks should not
+(yet) be enabled by default.
+
+
+RCU Implications
+================
+
+There are situations in which idle CPUs cannot be permitted to
+enter either dyntick-idle mode or adaptive-tick mode, the most
+common being when that CPU has RCU callbacks pending.
+
+The CONFIG_RCU_FAST_NO_HZ=y Kconfig option may be used to cause such CPUs
+to enter dyntick-idle mode or adaptive-tick mode anyway.  In this case,
+a timer will awaken these CPUs every four jiffies in order to ensure
+that the RCU callbacks are processed in a timely fashion.
+
+Another approach is to offload RCU callback processing to "rcuo" kthreads
+using the CONFIG_RCU_NOCB_CPU=y Kconfig option.  The specific CPUs to
+offload may be selected using the "rcu_nocbs=" kernel boot parameter,
+which takes a comma-separated list of CPUs and CPU ranges, for example,
+"1,3-5" selects CPUs 1, 3, 4, and 5.
+
+The offloaded CPUs will never queue RCU callbacks, and therefore RCU
+never prevents offloaded CPUs from entering either dyntick-idle mode
+or adaptive-tick mode.  That said, note that it is up to userspace to
+pin the "rcuo" kthreads to specific CPUs if desired.  Otherwise, the
+scheduler will decide where to run them, which might or might not be
+where you want them to run.
+
+
+Testing
+=======
+
+So you enable all the OS-jitter features described in this document,
+but do not see any change in your workload's behavior.  Is this because
+your workload isn't affected that much by OS jitter, or is it because
+something else is in the way?  This section helps answer this question
+by providing a simple OS-jitter test suite, which is available on branch
+master of the following git archive:
+
+git://git.kernel.org/pub/scm/linux/kernel/git/frederic/dynticks-testing.git
+
+Clone this archive and follow the instructions in the README file.
+This test procedure will produce a trace that will allow you to evaluate
+whether or not you have succeeded in removing OS jitter from your system.
+If this trace shows that you have removed OS jitter as much as is
+possible, then you can conclude that your workload is not all that
+sensitive to OS jitter.
+
+Note: this test requires that your system have at least two CPUs.
+We do not currently have a good way to remove OS jitter from single-CPU
+systems.
+
+
+Known Issues
+============
+
+*	Dyntick-idle slows transitions to and from idle slightly.
+	In practice, this has not been a problem except for the most
+	aggressive real-time workloads, which have the option of disabling
+	dyntick-idle mode, an option that most of them take.  However,
+	some workloads will no doubt want to use adaptive ticks to
+	eliminate scheduling-clock interrupt latencies.  Here are some
+	options for these workloads:
+
+	a.	Use PMQOS from userspace to inform the kernel of your
+		latency requirements (preferred).
+
+	b.	On x86 systems, use the "idle=mwait" boot parameter.
+
+	c.	On x86 systems, use the "intel_idle.max_cstate=" boot
+		parameter to limit the maximum C-state depth.
+
+	d.	On x86 systems, use the "idle=poll" boot parameter.
+		However, please note that use of this parameter can cause
+		your CPU to overheat, which may cause thermal throttling
+		to degrade your latencies -- and that this degradation can
+		be even worse than that of dyntick-idle.  Furthermore,
+		this parameter effectively disables Turbo Mode on Intel
+		CPUs, which can significantly reduce maximum performance.
+
+*	Adaptive-ticks slows user/kernel transitions slightly.
+	This is not expected to be a problem for computationally intensive
+	workloads, which have few such transitions.  Careful benchmarking
+	will be required to determine whether or not other workloads
+	are significantly affected by this effect.
+
+*	Adaptive-ticks does not do anything unless there is only one
+	runnable task for a given CPU, even though there are a number
+	of other situations where the scheduling-clock tick is not
+	needed.  To give but one example, consider a CPU that has one
+	runnable high-priority SCHED_FIFO task and an arbitrary number
+	of low-priority SCHED_OTHER tasks.  In this case, the CPU is
+	required to run the SCHED_FIFO task until it either blocks or
+	some other higher-priority task awakens on (or is assigned to)
+	this CPU, so there is no point in sending a scheduling-clock
+	interrupt to this CPU.	However, the current implementation
+	nevertheless sends scheduling-clock interrupts to CPUs having a
+	single runnable SCHED_FIFO task and multiple runnable SCHED_OTHER
+	tasks, even though these interrupts are unnecessary.
+
+	And even when there are multiple runnable tasks on a given CPU,
+	there is little point in interrupting that CPU until the current
+	running task's timeslice expires, which is almost always way
+	longer than the time of the next scheduling-clock interrupt.
+
+	Better handling of these sorts of situations is future work.
+
+*	A reboot is required to reconfigure both adaptive idle and RCU
+	callback offloading.  Runtime reconfiguration could be provided
+	if needed, however, due to the complexity of reconfiguring RCU at
+	runtime, there would need to be an earthshakingly good reason.
+	Especially given that you have the straightforward option of
+	simply offloading RCU callbacks from all CPUs and pinning them
+	where you want them whenever you want them pinned.
+
+*	Additional configuration is required to deal with other sources
+	of OS jitter, including interrupts and system-utility tasks
+	and processes.  This configuration normally involves binding
+	interrupts and tasks to particular CPUs.
+
+*	Some sources of OS jitter can currently be eliminated only by
+	constraining the workload.  For example, the only way to eliminate
+	OS jitter due to global TLB shootdowns is to avoid the unmapping
+	operations (such as kernel module unload operations) that
+	result in these shootdowns.  For another example, page faults
+	and TLB misses can be reduced (and in some cases eliminated) by
+	using huge pages and by constraining the amount of memory used
+	by the application.  Pre-faulting the working set can also be
+	helpful, especially when combined with the mlock() and mlockall()
+	system calls.
+
+*	Unless all CPUs are idle, at least one CPU must keep the
+	scheduling-clock interrupt going in order to support accurate
+	timekeeping.
+
+*	If there might potentially be some adaptive-ticks CPUs, there
+	will be at least one CPU keeping the scheduling-clock interrupt
+	going, even if all CPUs are otherwise idle.
+
+	Better handling of this situation is ongoing work.
+
+*	Some process-handling operations still require the occasional
+	scheduling-clock tick.	These operations include calculating CPU
+	load, maintaining sched average, computing CFS entity vruntime,
+	computing avenrun, and carrying out load balancing.  They are
+	currently accommodated by a scheduling-clock tick every second
+	or so.	On-going work will eliminate the need even for these
+	infrequent scheduling-clock ticks.
diff --git a/Documentation/timers/timekeeping.rst b/Documentation/timers/timekeeping.rst
new file mode 100644
index 000000000000..f83e98852e2c
--- /dev/null
+++ b/Documentation/timers/timekeeping.rst
@@ -0,0 +1,180 @@
+===========================================================
+Clock sources, Clock events, sched_clock() and delay timers
+===========================================================
+
+This document tries to briefly explain some basic kernel timekeeping
+abstractions. It partly pertains to the drivers usually found in
+drivers/clocksource in the kernel tree, but the code may be spread out
+across the kernel.
+
+If you grep through the kernel source you will find a number of architecture-
+specific implementations of clock sources, clockevents and several likewise
+architecture-specific overrides of the sched_clock() function and some
+delay timers.
+
+To provide timekeeping for your platform, the clock source provides
+the basic timeline, whereas clock events shoot interrupts on certain points
+on this timeline, providing facilities such as high-resolution timers.
+sched_clock() is used for scheduling and timestamping, and delay timers
+provide an accurate delay source using hardware counters.
+
+
+Clock sources
+-------------
+
+The purpose of the clock source is to provide a timeline for the system that
+tells you where you are in time. For example issuing the command 'date' on
+a Linux system will eventually read the clock source to determine exactly
+what time it is.
+
+Typically the clock source is a monotonic, atomic counter which will provide
+n bits which count from 0 to (2^n)-1 and then wrap around to 0 and start over.
+It will ideally NEVER stop ticking as long as the system is running. It
+may stop during system suspend.
+
+The clock source shall have as high resolution as possible, and the frequency
+shall be as stable and correct as possible as compared to a real-world wall
+clock. It should not move unpredictably back and forth in time or miss a few
+cycles here and there.
+
+It must be immune to the kind of effects that occur in hardware where e.g.
+the counter register is read in two phases on the bus, lowest 16 bits first
+and the higher 16 bits in a second bus cycle, with the counter bits
+potentially being updated in between, leading to the risk of very strange
+values from the counter.
+
+When the wall-clock accuracy of the clock source isn't satisfactory, there
+are various quirks and layers in the timekeeping code for e.g. synchronizing
+the user-visible time to RTC clocks in the system or against networked time
+servers using NTP, but all they do basically is update an offset against
+the clock source, which provides the fundamental timeline for the system.
+These measures do not affect the clock source per se, they only adapt the
+system to the shortcomings of it.
+
+The clock source struct shall provide means to translate the provided counter
+into a nanosecond value as an unsigned long long (unsigned 64 bit) number.
+Since this operation may be invoked very often, doing this in a strict
+mathematical sense is not desirable: instead the number is taken as close as
+possible to a nanosecond value using only the arithmetic operations
+multiply and shift, so in clocksource_cyc2ns() you find:
+
+  ns ~= (clocksource * mult) >> shift
+
+You will find a number of helper functions in the clock source code intended
+to aid in providing these mult and shift values, such as
+clocksource_khz2mult(), clocksource_hz2mult() that help determine the
+mult factor from a fixed shift, and clocksource_register_hz() and
+clocksource_register_khz() which will help out assigning both shift and mult
+factors using the frequency of the clock source as the only input.
+
+For real simple clock sources accessed from a single I/O memory location
+there is nowadays even clocksource_mmio_init() which will take a memory
+location, bit width, a parameter telling whether the counter in the
+register counts up or down, and the timer clock rate, and then conjure all
+necessary parameters.
+
+Since a 32-bit counter at say 100 MHz will wrap around to zero after some 43
+seconds, the code handling the clock source will have to compensate for this.
+That is the reason why the clock source struct also contains a 'mask'
+member telling how many bits of the source are valid. This way the timekeeping
+code knows when the counter will wrap around and can insert the necessary
+compensation code on both sides of the wrap point so that the system timeline
+remains monotonic.
+
+
+Clock events
+------------
+
+Clock events are the conceptual reverse of clock sources: they take a
+desired time specification value and calculate the values to poke into
+hardware timer registers.
+
+Clock events are orthogonal to clock sources. The same hardware
+and register range may be used for the clock event, but it is essentially
+a different thing. The hardware driving clock events has to be able to
+fire interrupts, so as to trigger events on the system timeline. On an SMP
+system, it is ideal (and customary) to have one such event driving timer per
+CPU core, so that each core can trigger events independently of any other
+core.
+
+You will notice that the clock event device code is based on the same basic
+idea about translating counters to nanoseconds using mult and shift
+arithmetic, and you find the same family of helper functions again for
+assigning these values. The clock event driver does not need a 'mask'
+attribute however: the system will not try to plan events beyond the time
+horizon of the clock event.
+
+
+sched_clock()
+-------------
+
+In addition to the clock sources and clock events there is a special weak
+function in the kernel called sched_clock(). This function shall return the
+number of nanoseconds since the system was started. An architecture may or
+may not provide an implementation of sched_clock() on its own. If a local
+implementation is not provided, the system jiffy counter will be used as
+sched_clock().
+
+As the name suggests, sched_clock() is used for scheduling the system,
+determining the absolute timeslice for a certain process in the CFS scheduler
+for example. It is also used for printk timestamps when you have selected to
+include time information in printk for things like bootcharts.
+
+Compared to clock sources, sched_clock() has to be very fast: it is called
+much more often, especially by the scheduler. If you have to do trade-offs
+between accuracy compared to the clock source, you may sacrifice accuracy
+for speed in sched_clock(). It however requires some of the same basic
+characteristics as the clock source, i.e. it should be monotonic.
+
+The sched_clock() function may wrap only on unsigned long long boundaries,
+i.e. after 64 bits. Since this is a nanosecond value this will mean it wraps
+after circa 585 years. (For most practical systems this means "never".)
+
+If an architecture does not provide its own implementation of this function,
+it will fall back to using jiffies, making its maximum resolution 1/HZ of the
+jiffy frequency for the architecture. This will affect scheduling accuracy
+and will likely show up in system benchmarks.
+
+The clock driving sched_clock() may stop or reset to zero during system
+suspend/sleep. This does not matter to the function it serves of scheduling
+events on the system. However it may result in interesting timestamps in
+printk().
+
+The sched_clock() function should be callable in any context, IRQ- and
+NMI-safe and return a sane value in any context.
+
+Some architectures may have a limited set of time sources and lack a nice
+counter to derive a 64-bit nanosecond value, so for example on the ARM
+architecture, special helper functions have been created to provide a
+sched_clock() nanosecond base from a 16- or 32-bit counter. Sometimes the
+same counter that is also used as clock source is used for this purpose.
+
+On SMP systems, it is crucial for performance that sched_clock() can be called
+independently on each CPU without any synchronization performance hits.
+Some hardware (such as the x86 TSC) will cause the sched_clock() function to
+drift between the CPUs on the system. The kernel can work around this by
+enabling the CONFIG_HAVE_UNSTABLE_SCHED_CLOCK option. This is another aspect
+that makes sched_clock() different from the ordinary clock source.
+
+
+Delay timers (some architectures only)
+--------------------------------------
+
+On systems with variable CPU frequency, the various kernel delay() functions
+will sometimes behave strangely. Basically these delays usually use a hard
+loop to delay a certain number of jiffy fractions using a "lpj" (loops per
+jiffy) value, calibrated on boot.
+
+Let's hope that your system is running on maximum frequency when this value
+is calibrated: as an effect when the frequency is geared down to half the
+full frequency, any delay() will be twice as long. Usually this does not
+hurt, as you're commonly requesting that amount of delay *or more*. But
+basically the semantics are quite unpredictable on such systems.
+
+Enter timer-based delays. Using these, a timer read may be used instead of
+a hard-coded loop for providing the desired delay.
+
+This is done by declaring a struct delay_timer and assigning the appropriate
+function pointers and rate settings for this delay timer.
+
+This is available on some architectures like OpenRISC or ARM.
diff --git a/Documentation/timers/timekeeping.txt b/Documentation/timers/timekeeping.txt
deleted file mode 100644
index 2d1732b0a868..000000000000
--- a/Documentation/timers/timekeeping.txt
+++ /dev/null
@@ -1,179 +0,0 @@
-Clock sources, Clock events, sched_clock() and delay timers
------------------------------------------------------------
-
-This document tries to briefly explain some basic kernel timekeeping
-abstractions. It partly pertains to the drivers usually found in
-drivers/clocksource in the kernel tree, but the code may be spread out
-across the kernel.
-
-If you grep through the kernel source you will find a number of architecture-
-specific implementations of clock sources, clockevents and several likewise
-architecture-specific overrides of the sched_clock() function and some
-delay timers.
-
-To provide timekeeping for your platform, the clock source provides
-the basic timeline, whereas clock events shoot interrupts on certain points
-on this timeline, providing facilities such as high-resolution timers.
-sched_clock() is used for scheduling and timestamping, and delay timers
-provide an accurate delay source using hardware counters.
-
-
-Clock sources
--------------
-
-The purpose of the clock source is to provide a timeline for the system that
-tells you where you are in time. For example issuing the command 'date' on
-a Linux system will eventually read the clock source to determine exactly
-what time it is.
-
-Typically the clock source is a monotonic, atomic counter which will provide
-n bits which count from 0 to (2^n)-1 and then wraps around to 0 and start over.
-It will ideally NEVER stop ticking as long as the system is running. It
-may stop during system suspend.
-
-The clock source shall have as high resolution as possible, and the frequency
-shall be as stable and correct as possible as compared to a real-world wall
-clock. It should not move unpredictably back and forth in time or miss a few
-cycles here and there.
-
-It must be immune to the kind of effects that occur in hardware where e.g.
-the counter register is read in two phases on the bus lowest 16 bits first
-and the higher 16 bits in a second bus cycle with the counter bits
-potentially being updated in between leading to the risk of very strange
-values from the counter.
-
-When the wall-clock accuracy of the clock source isn't satisfactory, there
-are various quirks and layers in the timekeeping code for e.g. synchronizing
-the user-visible time to RTC clocks in the system or against networked time
-servers using NTP, but all they do basically is update an offset against
-the clock source, which provides the fundamental timeline for the system.
-These measures does not affect the clock source per se, they only adapt the
-system to the shortcomings of it.
-
-The clock source struct shall provide means to translate the provided counter
-into a nanosecond value as an unsigned long long (unsigned 64 bit) number.
-Since this operation may be invoked very often, doing this in a strict
-mathematical sense is not desirable: instead the number is taken as close as
-possible to a nanosecond value using only the arithmetic operations
-multiply and shift, so in clocksource_cyc2ns() you find:
-
-  ns ~= (clocksource * mult) >> shift
-
-You will find a number of helper functions in the clock source code intended
-to aid in providing these mult and shift values, such as
-clocksource_khz2mult(), clocksource_hz2mult() that help determine the
-mult factor from a fixed shift, and clocksource_register_hz() and
-clocksource_register_khz() which will help out assigning both shift and mult
-factors using the frequency of the clock source as the only input.
-
-For real simple clock sources accessed from a single I/O memory location
-there is nowadays even clocksource_mmio_init() which will take a memory
-location, bit width, a parameter telling whether the counter in the
-register counts up or down, and the timer clock rate, and then conjure all
-necessary parameters.
-
-Since a 32-bit counter at say 100 MHz will wrap around to zero after some 43
-seconds, the code handling the clock source will have to compensate for this.
-That is the reason why the clock source struct also contains a 'mask'
-member telling how many bits of the source are valid. This way the timekeeping
-code knows when the counter will wrap around and can insert the necessary
-compensation code on both sides of the wrap point so that the system timeline
-remains monotonic.
-
-
-Clock events
-------------
-
-Clock events are the conceptual reverse of clock sources: they take a
-desired time specification value and calculate the values to poke into
-hardware timer registers.
-
-Clock events are orthogonal to clock sources. The same hardware
-and register range may be used for the clock event, but it is essentially
-a different thing. The hardware driving clock events has to be able to
-fire interrupts, so as to trigger events on the system timeline. On an SMP
-system, it is ideal (and customary) to have one such event driving timer per
-CPU core, so that each core can trigger events independently of any other
-core.
-
-You will notice that the clock event device code is based on the same basic
-idea about translating counters to nanoseconds using mult and shift
-arithmetic, and you find the same family of helper functions again for
-assigning these values. The clock event driver does not need a 'mask'
-attribute however: the system will not try to plan events beyond the time
-horizon of the clock event.
-
-
-sched_clock()
--------------
-
-In addition to the clock sources and clock events there is a special weak
-function in the kernel called sched_clock(). This function shall return the
-number of nanoseconds since the system was started. An architecture may or
-may not provide an implementation of sched_clock() on its own. If a local
-implementation is not provided, the system jiffy counter will be used as
-sched_clock().
-
-As the name suggests, sched_clock() is used for scheduling the system,
-determining the absolute timeslice for a certain process in the CFS scheduler
-for example. It is also used for printk timestamps when you have selected to
-include time information in printk for things like bootcharts.
-
-Compared to clock sources, sched_clock() has to be very fast: it is called
-much more often, especially by the scheduler. If you have to do trade-offs
-between accuracy compared to the clock source, you may sacrifice accuracy
-for speed in sched_clock(). It however requires some of the same basic
-characteristics as the clock source, i.e. it should be monotonic.
-
-The sched_clock() function may wrap only on unsigned long long boundaries,
-i.e. after 64 bits. Since this is a nanosecond value this will mean it wraps
-after circa 585 years. (For most practical systems this means "never".)
-
-If an architecture does not provide its own implementation of this function,
-it will fall back to using jiffies, making its maximum resolution 1/HZ of the
-jiffy frequency for the architecture. This will affect scheduling accuracy
-and will likely show up in system benchmarks.
-
-The clock driving sched_clock() may stop or reset to zero during system
-suspend/sleep. This does not matter to the function it serves of scheduling
-events on the system. However it may result in interesting timestamps in
-printk().
-
-The sched_clock() function should be callable in any context, IRQ- and
-NMI-safe and return a sane value in any context.
-
-Some architectures may have a limited set of time sources and lack a nice
-counter to derive a 64-bit nanosecond value, so for example on the ARM
-architecture, special helper functions have been created to provide a
-sched_clock() nanosecond base from a 16- or 32-bit counter. Sometimes the
-same counter that is also used as clock source is used for this purpose.
-
-On SMP systems, it is crucial for performance that sched_clock() can be called
-independently on each CPU without any synchronization performance hits.
-Some hardware (such as the x86 TSC) will cause the sched_clock() function to
-drift between the CPUs on the system. The kernel can work around this by
-enabling the CONFIG_HAVE_UNSTABLE_SCHED_CLOCK option. This is another aspect
-that makes sched_clock() different from the ordinary clock source.
-
-
-Delay timers (some architectures only)
---------------------------------------
-
-On systems with variable CPU frequency, the various kernel delay() functions
-will sometimes behave strangely. Basically these delays usually use a hard
-loop to delay a certain number of jiffy fractions using a "lpj" (loops per
-jiffy) value, calibrated on boot.
-
-Let's hope that your system is running on maximum frequency when this value
-is calibrated: as an effect when the frequency is geared down to half the
-full frequency, any delay() will be twice as long. Usually this does not
-hurt, as you're commonly requesting that amount of delay *or more*. But
-basically the semantics are quite unpredictable on such systems.
-
-Enter timer-based delays. Using these, a timer read may be used instead of
-a hard-coded loop for providing the desired delay.
-
-This is done by declaring a struct delay_timer and assigning the appropriate
-function pointers and rate settings for this delay timer.
-
-This is available on some architectures like OpenRISC or ARM.
diff --git a/Documentation/timers/timers-howto.rst b/Documentation/timers/timers-howto.rst
new file mode 100644
index 000000000000..7e3167bec2b1
--- /dev/null
+++ b/Documentation/timers/timers-howto.rst
@@ -0,0 +1,112 @@
+===================================================================
+delays - Information on the various kernel delay / sleep mechanisms
+===================================================================
+
+This document seeks to answer the common question: "What is the
+RightWay (TM) to insert a delay?"
+
+This question is most often faced by driver writers who have to
+deal with hardware delays and who may not be the most intimately
+familiar with the inner workings of the Linux Kernel.
+
+
+Inserting Delays
+----------------
+
+The first, and most important, question you need to ask is "Is my
+code in an atomic context?"  This should be followed closely by "Does
+it really need to delay in atomic context?" If so...
+
+ATOMIC CONTEXT:
+	You must use the `*delay` family of functions. These
+	functions use the jiffy estimation of clock speed
+	and will busy wait for enough loop cycles to achieve
+	the desired delay:
+
+	ndelay(unsigned long nsecs)
+	udelay(unsigned long usecs)
+	mdelay(unsigned long msecs)
+
+	udelay is the generally preferred API; ndelay-level
+	precision may not actually exist on many non-PC devices.
+
+	mdelay is a macro wrapper around udelay, to account for
+	possible overflow when passing large arguments to udelay.
+	In general, use of mdelay is discouraged and code should
+	be refactored to allow for the use of msleep.
+
+NON-ATOMIC CONTEXT:
+	You should use the `*sleep[_range]` family of functions.
+	There are a few more options here. While any of them may
+	work correctly, using the "right" sleep function will
+	help the scheduler, power management, and just make your
+	driver better :)
+
+	-- Backed by busy-wait loop:
+
+		udelay(unsigned long usecs)
+
+	-- Backed by hrtimers:
+
+		usleep_range(unsigned long min, unsigned long max)
+
+	-- Backed by jiffies / legacy_timers
+
+		msleep(unsigned long msecs)
+		msleep_interruptible(unsigned long msecs)
+
+	Unlike the `*delay` family, the underlying mechanism
+	driving each of these calls varies, thus there are
+	quirks you should be aware of.
+
+
+	SLEEPING FOR "A FEW" USECS ( < ~10us? ):
+		* Use udelay
+
+		- Why not usleep?
+			On slower systems, (embedded, OR perhaps a speed-
+			stepped PC!) the overhead of setting up the hrtimers
+			for usleep *may* not be worth it. Such an evaluation
+			will obviously depend on your specific situation, but
+			it is something to be aware of.
+
+	SLEEPING FOR ~USECS OR SMALL MSECS ( 10us - 20ms):
+		* Use usleep_range
+
+		- Why not msleep for (1ms - 20ms)?
+			Explained originally here:
+				http://lkml.org/lkml/2007/8/3/250
+
+			msleep(1~20) may not do what the caller intends, and
+			will often sleep longer (~20 ms actual sleep for any
+			value given in the 1~20ms range). In many cases this
+			is not the desired behavior.
+
+		- Why is there no "usleep" / What is a good range?
+			Since usleep_range is built on top of hrtimers, the
+			wakeup will be very precise (ish), thus a simple
+			usleep function would likely introduce a large number
+			of undesired interrupts.
+
+			With the introduction of a range, the scheduler is
+			free to coalesce your wakeup with any other wakeup
+			that may have happened for other reasons, or at the
+			worst case, fire an interrupt for your upper bound.
+
+			The larger a range you supply, the greater a chance
+			that you will not trigger an interrupt; this should
+			be balanced with what is an acceptable upper bound on
+			delay / performance for your specific code path. Exact
+			tolerances here are very situation specific, thus it
+			is left to the caller to determine a reasonable range.
+
+	SLEEPING FOR LARGER MSECS ( 10ms+ ):
+		* Use msleep or possibly msleep_interruptible
+
+		- What's the difference?
+			msleep sets the current task to TASK_UNINTERRUPTIBLE
+			whereas msleep_interruptible sets the current task to
+			TASK_INTERRUPTIBLE before scheduling the sleep. In
+			short, the difference is whether the sleep can be ended
+			early by a signal. In general, just use msleep unless
+			you know you have a need for the interruptible variant.
diff --git a/Documentation/timers/timers-howto.txt b/Documentation/timers/timers-howto.txt
deleted file mode 100644
index 038f8c77a076..000000000000
--- a/Documentation/timers/timers-howto.txt
+++ /dev/null
@@ -1,105 +0,0 @@
-delays - Information on the various kernel delay / sleep mechanisms
--------------------------------------------------------------------
-
-This document seeks to answer the common question: "What is the
-RightWay (TM) to insert a delay?"
-
-This question is most often faced by driver writers who have to
-deal with hardware delays and who may not be the most intimately
-familiar with the inner workings of the Linux Kernel.
-
-
-Inserting Delays
-----------------
-
-The first, and most important, question you need to ask is "Is my
-code in an atomic context?"  This should be followed closely by "Does
-it really need to delay in atomic context?" If so...
-
-ATOMIC CONTEXT:
-	You must use the *delay family of functions. These
-	functions use the jiffie estimation of clock speed
-	and will busy wait for enough loop cycles to achieve
-	the desired delay:
-
-	ndelay(unsigned long nsecs)
-	udelay(unsigned long usecs)
-	mdelay(unsigned long msecs)
-
-	udelay is the generally preferred API; ndelay-level
-	precision may not actually exist on many non-PC devices.
-
-	mdelay is macro wrapper around udelay, to account for
-	possible overflow when passing large arguments to udelay.
-	In general, use of mdelay is discouraged and code should
-	be refactored to allow for the use of msleep.
-
-NON-ATOMIC CONTEXT:
-	You should use the *sleep[_range] family of functions.
-	There are a few more options here, while any of them may
-	work correctly, using the "right" sleep function will
-	help the scheduler, power management, and just make your
-	driver better :)
-
-	-- Backed by busy-wait loop:
-		udelay(unsigned long usecs)
-	-- Backed by hrtimers:
-		usleep_range(unsigned long min, unsigned long max)
-	-- Backed by jiffies / legacy_timers
-		msleep(unsigned long msecs)
-		msleep_interruptible(unsigned long msecs)
-
-	Unlike the *delay family, the underlying mechanism
-	driving each of these calls varies, thus there are
-	quirks you should be aware of.
-
-
-	SLEEPING FOR "A FEW" USECS ( < ~10us? ):
-		* Use udelay
-
-		- Why not usleep?
-			On slower systems, (embedded, OR perhaps a speed-
-			stepped PC!) the overhead of setting up the hrtimers
-			for usleep *may* not be worth it. Such an evaluation
-			will obviously depend on your specific situation, but
-			it is something to be aware of.
-
-	SLEEPING FOR ~USECS OR SMALL MSECS ( 10us - 20ms):
-		* Use usleep_range
-
-		- Why not msleep for (1ms - 20ms)?
-			Explained originally here:
-				http://lkml.org/lkml/2007/8/3/250
-			msleep(1~20) may not do what the caller intends, and
-			will often sleep longer (~20 ms actual sleep for any
-			value given in the 1~20ms range). In many cases this
-			is not the desired behavior.
-
-		- Why is there no "usleep" / What is a good range?
-			Since usleep_range is built on top of hrtimers, the
-			wakeup will be very precise (ish), thus a simple
-			usleep function would likely introduce a large number
-			of undesired interrupts.
-
-			With the introduction of a range, the scheduler is
-			free to coalesce your wakeup with any other wakeup
-			that may have happened for other reasons, or at the
-			worst case, fire an interrupt for your upper bound.
-
-			The larger a range you supply, the greater a chance
-			that you will not trigger an interrupt; this should
-			be balanced with what is an acceptable upper bound on
-			delay / performance for your specific code path. Exact
-			tolerances here are very situation specific, thus it
-			is left to the caller to determine a reasonable range.
-
-	SLEEPING FOR LARGER MSECS ( 10ms+ )
-		* Use msleep or possibly msleep_interruptible
-
-		- What's the difference?
-			msleep sets the current task to TASK_UNINTERRUPTIBLE
-			whereas msleep_interruptible sets the current task to
-			TASK_INTERRUPTIBLE before scheduling the sleep. In
-			short, the difference is whether the sleep can be ended
-			early by a signal. In general, just use msleep unless
-			you know you have a need for the interruptible variant.
diff --git a/MAINTAINERS b/MAINTAINERS
index 5fe44d5d82b4..0db7f12439f7 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -7192,7 +7192,7 @@ F:	drivers/net/ethernet/hp/hp100.*
 HPET:	High Precision Event Timers driver
 M:	Clemens Ladisch <clemens@ladisch.de>
 S:	Maintained
-F:	Documentation/timers/hpet.txt
+F:	Documentation/timers/hpet.rst
 F:	drivers/char/hpet.c
 F:	include/linux/hpet.h
 F:	include/uapi/linux/hpet.h
diff --git a/drivers/media/usb/dvb-usb-v2/anysee.c b/drivers/media/usb/dvb-usb-v2/anysee.c
index 48fb0d41e03b..fb6d99dea31a 100644
--- a/drivers/media/usb/dvb-usb-v2/anysee.c
+++ b/drivers/media/usb/dvb-usb-v2/anysee.c
@@ -56,7 +56,7 @@ static int anysee_ctrl_msg(struct dvb_usb_device *d,
 	/* TODO FIXME: dvb_usb_generic_rw() fails rarely with error code -32
 	 * (EPIPE, Broken pipe). Function supports currently msleep() as a
 	 * parameter but I would not like to use it, since according to
-	 * Documentation/timers/timers-howto.txt it should not be used such
+	 * Documentation/timers/timers-howto.rst it should not be used such
 	 * short, under < 20ms, sleeps. Repeating failed message would be
 	 * better choice as not to add unwanted delays...
 	 * Fixing that correctly is one of those or both;
diff --git a/drivers/regulator/core.c b/drivers/regulator/core.c
index c894cf0d8a28..c5d8996d5165 100644
--- a/drivers/regulator/core.c
+++ b/drivers/regulator/core.c
@@ -2304,7 +2304,7 @@ static int regulator_ena_gpio_ctrl(struct regulator_dev *rdev, bool enable)
  *
  * Delay for the requested amount of time as per the guidelines in:
  *
- *     Documentation/timers/timers-howto.txt
+ *     Documentation/timers/timers-howto.rst
  *
  * The assumption here is that regulators will never be enabled in
  * atomic context and therefore sleeping functions can be used.
diff --git a/include/linux/iopoll.h b/include/linux/iopoll.h
index 3908353deec6..35e15dfd4155 100644
--- a/include/linux/iopoll.h
+++ b/include/linux/iopoll.h
@@ -21,7 +21,7 @@
  * @cond: Break condition (usually involving @val)
  * @sleep_us: Maximum time to sleep between reads in us (0
  *            tight-loops).  Should be less than ~20ms since usleep_range
- *            is used (see Documentation/timers/timers-howto.txt).
+ *            is used (see Documentation/timers/timers-howto.rst).
  * @timeout_us: Timeout in us, 0 means never timeout
  *
  * Returns 0 on success and -ETIMEDOUT upon a timeout. In either
@@ -60,7 +60,7 @@
  * @cond: Break condition (usually involving @val)
  * @delay_us: Time to udelay between reads in us (0 tight-loops).  Should
  *            be less than ~10us since udelay is used (see
- *            Documentation/timers/timers-howto.txt).
+ *            Documentation/timers/timers-howto.rst).
  * @timeout_us: Timeout in us, 0 means never timeout
  *
  * Returns 0 on success and -ETIMEDOUT upon a timeout. In either
diff --git a/include/linux/regmap.h b/include/linux/regmap.h
index daeec7dbd65c..ed5e9d0a1285 100644
--- a/include/linux/regmap.h
+++ b/include/linux/regmap.h
@@ -112,7 +112,7 @@ struct reg_sequence {
  * @cond: Break condition (usually involving @val)
  * @sleep_us: Maximum time to sleep between reads in us (0
  *            tight-loops).  Should be less than ~20ms since usleep_range
- *            is used (see Documentation/timers/timers-howto.txt).
+ *            is used (see Documentation/timers/timers-howto.rst).
  * @timeout_us: Timeout in us, 0 means never timeout
  *
  * Returns 0 on success and -ETIMEDOUT upon a timeout or the regmap_read
@@ -154,7 +154,7 @@ struct reg_sequence {
  * @cond: Break condition (usually involving @val)
  * @sleep_us: Maximum time to sleep between reads in us (0
  *            tight-loops).  Should be less than ~20ms since usleep_range
- *            is used (see Documentation/timers/timers-howto.txt).
+ *            is used (see Documentation/timers/timers-howto.rst).
  * @timeout_us: Timeout in us, 0 means never timeout
  *
  * Returns 0 on success and -ETIMEDOUT upon a timeout or the regmap_field_read
diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl
index 342c7c781ba5..a6d436809bf5 100755
--- a/scripts/checkpatch.pl
+++ b/scripts/checkpatch.pl
@@ -5712,7 +5712,7 @@ sub process {
 			# ignore udelay's < 10, however
 			if (! ($delay < 10) ) {
 				CHK("USLEEP_RANGE",
-				    "usleep_range is preferred over udelay; see Documentation/timers/timers-howto.txt\n" . $herecurr);
+				    "usleep_range is preferred over udelay; see Documentation/timers/timers-howto.rst\n" . $herecurr);
 			}
 			if ($delay > 2000) {
 				WARN("LONG_UDELAY",
@@ -5724,7 +5724,7 @@ sub process {
 		if ($line =~ /\bmsleep\s*\((\d+)\);/) {
 			if ($1 < 20) {
 				WARN("MSLEEP",
-				     "msleep < 20ms can sleep for up to 20ms; see Documentation/timers/timers-howto.txt\n" . $herecurr);
+				     "msleep < 20ms can sleep for up to 20ms; see Documentation/timers/timers-howto.rst\n" . $herecurr);
 			}
 		}
 
@@ -6115,11 +6115,11 @@ sub process {
 			my $max = $7;
 			if ($min eq $max) {
 				WARN("USLEEP_RANGE",
-				     "usleep_range should not use min == max args; see Documentation/timers/timers-howto.txt\n" . "$here\n$stat\n");
+				     "usleep_range should not use min == max args; see Documentation/timers/timers-howto.rst\n" . "$here\n$stat\n");
 			} elsif ($min =~ /^\d+$/ && $max =~ /^\d+$/ &&
 				 $min > $max) {
 				WARN("USLEEP_RANGE",
-				     "usleep_range args reversed, use min then max; see Documentation/timers/timers-howto.txt\n" . "$here\n$stat\n");
+				     "usleep_range args reversed, use min then max; see Documentation/timers/timers-howto.rst\n" . "$here\n$stat\n");
 			}
 		}
 
diff --git a/sound/soc/sof/ops.h b/sound/soc/sof/ops.h
index 80fc3b374c2b..8058a6c73082 100644
--- a/sound/soc/sof/ops.h
+++ b/sound/soc/sof/ops.h
@@ -349,7 +349,7 @@ static inline const struct snd_sof_dsp_ops
  * @cond: Break condition (usually involving @val)
  * @sleep_us: Maximum time to sleep between reads in us (0
  *            tight-loops).  Should be less than ~20ms since usleep_range
- *            is used (see Documentation/timers/timers-howto.txt).
+ *            is used (see Documentation/timers/timers-howto.rst).
  * @timeout_us: Timeout in us, 0 means never timeout
  *
  * Returns 0 on success and -ETIMEDOUT upon a timeout. In either
-- 
cgit v1.2.3


From 151f4e2bdc7a04020ae5c533896fb91a16e1f501 Mon Sep 17 00:00:00 2001
From: Mauro Carvalho Chehab <mchehab+samsung@kernel.org>
Date: Thu, 13 Jun 2019 07:10:36 -0300
Subject: docs: power: convert docs to ReST and rename to *.rst

Convert the PM documents to ReST, in order to allow them to
build with Sphinx.

The conversion is actually:
  - add blank lines and indentation in order to identify paragraphs;
  - fix tables markups;
  - add some lists markups;
  - mark literal blocks;
  - adjust title markups.

At its new index.rst, let's add a :orphan: while this is not linked to
the main index.rst file, in order to avoid build warnings.

Signed-off-by: Mauro Carvalho Chehab <mchehab+samsung@kernel.org>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Acked-by: Mark Brown <broonie@kernel.org>
Acked-by: Srivatsa S. Bhat (VMware) <srivatsa@csail.mit.edu>
---
 Documentation/ABI/testing/sysfs-class-powercap     |    2 +-
 Documentation/admin-guide/kernel-parameters.txt    |    6 +-
 Documentation/cpu-freq/core.txt                    |    2 +-
 Documentation/driver-api/pm/devices.rst            |    6 +-
 Documentation/driver-api/usb/power-management.rst  |    2 +-
 Documentation/power/apm-acpi.rst                   |   36 +
 Documentation/power/apm-acpi.txt                   |   32 -
 Documentation/power/basic-pm-debugging.rst         |  269 +++++
 Documentation/power/basic-pm-debugging.txt         |  254 -----
 Documentation/power/charger-manager.rst            |  205 ++++
 Documentation/power/charger-manager.txt            |  200 ----
 Documentation/power/drivers-testing.rst            |   51 +
 Documentation/power/drivers-testing.txt            |   46 -
 Documentation/power/energy-model.rst               |  147 +++
 Documentation/power/energy-model.txt               |  144 ---
 Documentation/power/freezing-of-tasks.rst          |  244 +++++
 Documentation/power/freezing-of-tasks.txt          |  231 ----
 Documentation/power/index.rst                      |   46 +
 Documentation/power/interface.rst                  |   79 ++
 Documentation/power/interface.txt                  |   77 --
 Documentation/power/opp.rst                        |  379 +++++++
 Documentation/power/opp.txt                        |  342 ------
 Documentation/power/pci.rst                        | 1135 ++++++++++++++++++++
 Documentation/power/pci.txt                        | 1094 -------------------
 Documentation/power/pm_qos_interface.rst           |  225 ++++
 Documentation/power/pm_qos_interface.txt           |  212 ----
 Documentation/power/power_supply_class.rst         |  282 +++++
 Documentation/power/power_supply_class.txt         |  231 ----
 Documentation/power/powercap/powercap.rst          |  257 +++++
 Documentation/power/powercap/powercap.txt          |  236 ----
 Documentation/power/regulator/consumer.rst         |  229 ++++
 Documentation/power/regulator/consumer.txt         |  218 ----
 Documentation/power/regulator/design.rst           |   38 +
 Documentation/power/regulator/design.txt           |   33 -
 Documentation/power/regulator/machine.rst          |   97 ++
 Documentation/power/regulator/machine.txt          |   96 --
 Documentation/power/regulator/overview.rst         |  178 +++
 Documentation/power/regulator/overview.txt         |  171 ---
 Documentation/power/regulator/regulator.rst        |   32 +
 Documentation/power/regulator/regulator.txt        |   30 -
 Documentation/power/runtime_pm.rst                 |  940 ++++++++++++++++
 Documentation/power/runtime_pm.txt                 |  928 ----------------
 Documentation/power/s2ram.rst                      |   87 ++
 Documentation/power/s2ram.txt                      |   85 --
 Documentation/power/suspend-and-cpuhotplug.rst     |  286 +++++
 Documentation/power/suspend-and-cpuhotplug.txt     |  274 -----
 Documentation/power/suspend-and-interrupts.rst     |  137 +++
 Documentation/power/suspend-and-interrupts.txt     |  135 ---
 Documentation/power/swsusp-and-swap-files.rst      |   63 ++
 Documentation/power/swsusp-and-swap-files.txt      |   60 --
 Documentation/power/swsusp-dmcrypt.rst             |  140 +++
 Documentation/power/swsusp-dmcrypt.txt             |  138 ---
 Documentation/power/swsusp.rst                     |  501 +++++++++
 Documentation/power/swsusp.txt                     |  446 --------
 Documentation/power/tricks.rst                     |   29 +
 Documentation/power/tricks.txt                     |   27 -
 Documentation/power/userland-swsusp.rst            |  191 ++++
 Documentation/power/userland-swsusp.txt            |  170 ---
 Documentation/power/video.rst                      |  213 ++++
 Documentation/power/video.txt                      |  185 ----
 Documentation/process/submitting-drivers.rst       |    2 +-
 Documentation/scheduler/sched-energy.txt           |    6 +-
 Documentation/trace/coresight-cpu-debug.txt        |    2 +-
 .../zh_CN/process/submitting-drivers.rst           |    2 +-
 MAINTAINERS                                        |    4 +-
 arch/x86/Kconfig                                   |    2 +-
 drivers/gpu/drm/i915/i915_drv.h                    |    2 +-
 drivers/opp/Kconfig                                |    2 +-
 drivers/power/supply/power_supply_core.c           |    2 +-
 include/linux/interrupt.h                          |    2 +-
 include/linux/pci.h                                |    2 +-
 include/linux/pm.h                                 |    2 +-
 kernel/power/Kconfig                               |    6 +-
 net/wireless/Kconfig                               |    2 +-
 74 files changed, 6544 insertions(+), 6123 deletions(-)
 create mode 100644 Documentation/power/apm-acpi.rst
 delete mode 100644 Documentation/power/apm-acpi.txt
 create mode 100644 Documentation/power/basic-pm-debugging.rst
 delete mode 100644 Documentation/power/basic-pm-debugging.txt
 create mode 100644 Documentation/power/charger-manager.rst
 delete mode 100644 Documentation/power/charger-manager.txt
 create mode 100644 Documentation/power/drivers-testing.rst
 delete mode 100644 Documentation/power/drivers-testing.txt
 create mode 100644 Documentation/power/energy-model.rst
 delete mode 100644 Documentation/power/energy-model.txt
 create mode 100644 Documentation/power/freezing-of-tasks.rst
 delete mode 100644 Documentation/power/freezing-of-tasks.txt
 create mode 100644 Documentation/power/index.rst
 create mode 100644 Documentation/power/interface.rst
 delete mode 100644 Documentation/power/interface.txt
 create mode 100644 Documentation/power/opp.rst
 delete mode 100644 Documentation/power/opp.txt
 create mode 100644 Documentation/power/pci.rst
 delete mode 100644 Documentation/power/pci.txt
 create mode 100644 Documentation/power/pm_qos_interface.rst
 delete mode 100644 Documentation/power/pm_qos_interface.txt
 create mode 100644 Documentation/power/power_supply_class.rst
 delete mode 100644 Documentation/power/power_supply_class.txt
 create mode 100644 Documentation/power/powercap/powercap.rst
 delete mode 100644 Documentation/power/powercap/powercap.txt
 create mode 100644 Documentation/power/regulator/consumer.rst
 delete mode 100644 Documentation/power/regulator/consumer.txt
 create mode 100644 Documentation/power/regulator/design.rst
 delete mode 100644 Documentation/power/regulator/design.txt
 create mode 100644 Documentation/power/regulator/machine.rst
 delete mode 100644 Documentation/power/regulator/machine.txt
 create mode 100644 Documentation/power/regulator/overview.rst
 delete mode 100644 Documentation/power/regulator/overview.txt
 create mode 100644 Documentation/power/regulator/regulator.rst
 delete mode 100644 Documentation/power/regulator/regulator.txt
 create mode 100644 Documentation/power/runtime_pm.rst
 delete mode 100644 Documentation/power/runtime_pm.txt
 create mode 100644 Documentation/power/s2ram.rst
 delete mode 100644 Documentation/power/s2ram.txt
 create mode 100644 Documentation/power/suspend-and-cpuhotplug.rst
 delete mode 100644 Documentation/power/suspend-and-cpuhotplug.txt
 create mode 100644 Documentation/power/suspend-and-interrupts.rst
 delete mode 100644 Documentation/power/suspend-and-interrupts.txt
 create mode 100644 Documentation/power/swsusp-and-swap-files.rst
 delete mode 100644 Documentation/power/swsusp-and-swap-files.txt
 create mode 100644 Documentation/power/swsusp-dmcrypt.rst
 delete mode 100644 Documentation/power/swsusp-dmcrypt.txt
 create mode 100644 Documentation/power/swsusp.rst
 delete mode 100644 Documentation/power/swsusp.txt
 create mode 100644 Documentation/power/tricks.rst
 delete mode 100644 Documentation/power/tricks.txt
 create mode 100644 Documentation/power/userland-swsusp.rst
 delete mode 100644 Documentation/power/userland-swsusp.txt
 create mode 100644 Documentation/power/video.rst
 delete mode 100644 Documentation/power/video.txt

(limited to 'include/linux')

diff --git a/Documentation/ABI/testing/sysfs-class-powercap b/Documentation/ABI/testing/sysfs-class-powercap
index db3b3ff70d84..742dfd966592 100644
--- a/Documentation/ABI/testing/sysfs-class-powercap
+++ b/Documentation/ABI/testing/sysfs-class-powercap
@@ -5,7 +5,7 @@ Contact:	linux-pm@vger.kernel.org
 Description:
 		The powercap/ class sub directory belongs to the power cap
 		subsystem. Refer to
-		Documentation/power/powercap/powercap.txt for details.
+		Documentation/power/powercap/powercap.rst for details.
 
 What:		/sys/class/powercap/<control type>
 Date:		September 2013
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 138f6664b2e2..7f5ca6e7c4d3 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -13,7 +13,7 @@
 			For ARM64, ONLY "acpi=off", "acpi=on" or "acpi=force"
 			are available
 
-			See also Documentation/power/runtime_pm.txt, pci=noacpi
+			See also Documentation/power/runtime_pm.rst, pci=noacpi
 
 	acpi_apic_instance=	[ACPI, IOAPIC]
 			Format: <int>
@@ -223,7 +223,7 @@
 	acpi_sleep=	[HW,ACPI] Sleep options
 			Format: { s3_bios, s3_mode, s3_beep, s4_nohwsig,
 				  old_ordering, nonvs, sci_force_enable, nobl }
-			See Documentation/power/video.txt for information on
+			See Documentation/power/video.rst for information on
 			s3_bios and s3_mode.
 			s3_beep is for debugging; it makes the PC's speaker beep
 			as soon as the kernel's real-mode entry point is called.
@@ -4108,7 +4108,7 @@
 			Specify the offset from the beginning of the partition
 			given by "resume=" at which the swap header is located,
 			in <PAGE_SIZE> units (needed only for swap files).
-			See  Documentation/power/swsusp-and-swap-files.txt
+			See  Documentation/power/swsusp-and-swap-files.rst
 
 	resumedelay=	[HIBERNATION] Delay (in seconds) to pause before attempting to
 			read the resume files
diff --git a/Documentation/cpu-freq/core.txt b/Documentation/cpu-freq/core.txt
index 073f128af5a7..55193e680250 100644
--- a/Documentation/cpu-freq/core.txt
+++ b/Documentation/cpu-freq/core.txt
@@ -95,7 +95,7 @@ flags	- flags of the cpufreq driver
 
 3. CPUFreq Table Generation with Operating Performance Point (OPP)
 ==================================================================
-For details about OPP, see Documentation/power/opp.txt
+For details about OPP, see Documentation/power/opp.rst
 
 dev_pm_opp_init_cpufreq_table -
 	This function provides a ready to use conversion routine to translate
diff --git a/Documentation/driver-api/pm/devices.rst b/Documentation/driver-api/pm/devices.rst
index 30835683616a..f66c7b9126ea 100644
--- a/Documentation/driver-api/pm/devices.rst
+++ b/Documentation/driver-api/pm/devices.rst
@@ -225,7 +225,7 @@ system-wide transition to a sleep state even though its :c:member:`runtime_auto`
 flag is clear.
 
 For more information about the runtime power management framework, refer to
-:file:`Documentation/power/runtime_pm.txt`.
+:file:`Documentation/power/runtime_pm.rst`.
 
 
 Calling Drivers to Enter and Leave System Sleep States
@@ -728,7 +728,7 @@ it into account in any way.
 
 Devices may be defined as IRQ-safe which indicates to the PM core that their
 runtime PM callbacks may be invoked with disabled interrupts (see
-:file:`Documentation/power/runtime_pm.txt` for more information).  If an
+:file:`Documentation/power/runtime_pm.rst` for more information).  If an
 IRQ-safe device belongs to a PM domain, the runtime PM of the domain will be
 disallowed, unless the domain itself is defined as IRQ-safe. However, it
 makes sense to define a PM domain as IRQ-safe only if all the devices in it
@@ -795,7 +795,7 @@ so on) and the final state of the device must reflect the "active" runtime PM
 status in that case.
 
 During system-wide resume from a sleep state it's easiest to put devices into
-the full-power state, as explained in :file:`Documentation/power/runtime_pm.txt`.
+the full-power state, as explained in :file:`Documentation/power/runtime_pm.rst`.
 [Refer to that document for more information regarding this particular issue as
 well as for information on the device runtime power management framework in
 general.]
diff --git a/Documentation/driver-api/usb/power-management.rst b/Documentation/driver-api/usb/power-management.rst
index 4a74cf6f2797..2525c3622cae 100644
--- a/Documentation/driver-api/usb/power-management.rst
+++ b/Documentation/driver-api/usb/power-management.rst
@@ -46,7 +46,7 @@ device is turned off while the system as a whole remains running, we
 call it a "dynamic suspend" (also known as a "runtime suspend" or
 "selective suspend").  This document concentrates mostly on how
 dynamic PM is implemented in the USB subsystem, although system PM is
-covered to some extent (see ``Documentation/power/*.txt`` for more
+covered to some extent (see ``Documentation/power/*.rst`` for more
 information about system PM).
 
 System PM support is present only if the kernel was built with
diff --git a/Documentation/power/apm-acpi.rst b/Documentation/power/apm-acpi.rst
new file mode 100644
index 000000000000..5b90d947126d
--- /dev/null
+++ b/Documentation/power/apm-acpi.rst
@@ -0,0 +1,36 @@
+============
+APM or ACPI?
+============
+
+If you have a relatively recent x86 mobile, desktop, or server system,
+odds are it supports either Advanced Power Management (APM) or
+Advanced Configuration and Power Interface (ACPI).  ACPI is the newer
+of the two technologies and puts power management in the hands of the
+operating system, allowing for more intelligent power management than
+is possible with BIOS controlled APM.
+
+The best way to determine which, if either, your system supports is to
+build a kernel with both ACPI and APM enabled (as of 2.3.x ACPI is
+enabled by default).  If a working ACPI implementation is found, the
+ACPI driver will override and disable APM, otherwise the APM driver
+will be used.
+
+No, sorry, you cannot have both ACPI and APM enabled and running at
+once.  Some people with broken ACPI or broken APM implementations
+would like to use both to get a full set of working features, but you
+simply cannot mix and match the two.  Only one power management
+interface can be in control of the machine at once.  Think about it..
+
+User-space Daemons
+------------------
+Both APM and ACPI rely on user-space daemons, apmd and acpid
+respectively, to be completely functional.  Obtain both of these
+daemons from your Linux distribution or from the Internet (see below)
+and be sure that they are started sometime in the system boot process.
+Go ahead and start both.  If ACPI or APM is not available on your
+system the associated daemon will exit gracefully.
+
+  =====  =======================================
+  apmd   http://ftp.debian.org/pool/main/a/apmd/
+  acpid  http://acpid.sf.net/
+  =====  =======================================
diff --git a/Documentation/power/apm-acpi.txt b/Documentation/power/apm-acpi.txt
deleted file mode 100644
index 6cc423d3662e..000000000000
--- a/Documentation/power/apm-acpi.txt
+++ /dev/null
@@ -1,32 +0,0 @@
-APM or ACPI?
-------------
-If you have a relatively recent x86 mobile, desktop, or server system,
-odds are it supports either Advanced Power Management (APM) or
-Advanced Configuration and Power Interface (ACPI).  ACPI is the newer
-of the two technologies and puts power management in the hands of the
-operating system, allowing for more intelligent power management than
-is possible with BIOS controlled APM.
-
-The best way to determine which, if either, your system supports is to
-build a kernel with both ACPI and APM enabled (as of 2.3.x ACPI is
-enabled by default).  If a working ACPI implementation is found, the
-ACPI driver will override and disable APM, otherwise the APM driver
-will be used.
-
-No, sorry, you cannot have both ACPI and APM enabled and running at
-once.  Some people with broken ACPI or broken APM implementations
-would like to use both to get a full set of working features, but you
-simply cannot mix and match the two.  Only one power management
-interface can be in control of the machine at once.  Think about it..
-
-User-space Daemons
-------------------
-Both APM and ACPI rely on user-space daemons, apmd and acpid
-respectively, to be completely functional.  Obtain both of these
-daemons from your Linux distribution or from the Internet (see below)
-and be sure that they are started sometime in the system boot process.
-Go ahead and start both.  If ACPI or APM is not available on your
-system the associated daemon will exit gracefully.
-
-  apmd:   http://ftp.debian.org/pool/main/a/apmd/
-  acpid:  http://acpid.sf.net/
diff --git a/Documentation/power/basic-pm-debugging.rst b/Documentation/power/basic-pm-debugging.rst
new file mode 100644
index 000000000000..69862e759c30
--- /dev/null
+++ b/Documentation/power/basic-pm-debugging.rst
@@ -0,0 +1,269 @@
+=================================
+Debugging hibernation and suspend
+=================================
+
+	(C) 2007 Rafael J. Wysocki <rjw@sisk.pl>, GPL
+
+1. Testing hibernation (aka suspend to disk or STD)
+===================================================
+
+To check if hibernation works, you can try to hibernate in the "reboot" mode::
+
+	# echo reboot > /sys/power/disk
+	# echo disk > /sys/power/state
+
+and the system should create a hibernation image, reboot, resume and get back to
+the command prompt where you have started the transition.  If that happens,
+hibernation is most likely to work correctly.  Still, you need to repeat the
+test at least a couple of times in a row for confidence.  [This is necessary,
+because some problems only show up on a second attempt at suspending and
+resuming the system.]  Moreover, hibernating in the "reboot" and "shutdown"
+modes causes the PM core to skip some platform-related callbacks which on ACPI
+systems might be necessary to make hibernation work.  Thus, if your machine
+fails to hibernate or resume in the "reboot" mode, you should try the
+"platform" mode::
+
+	# echo platform > /sys/power/disk
+	# echo disk > /sys/power/state
+
+which is the default and recommended mode of hibernation.
+
+Unfortunately, the "platform" mode of hibernation does not work on some systems
+with broken BIOSes.  In such cases the "shutdown" mode of hibernation might
+work::
+
+	# echo shutdown > /sys/power/disk
+	# echo disk > /sys/power/state
+
+(it is similar to the "reboot" mode, but it requires you to press the power
+button to make the system resume).
+
+If neither "platform" nor "shutdown" hibernation mode works, you will need to
+identify what goes wrong.
+
+a) Test modes of hibernation
+----------------------------
+
+To find out why hibernation fails on your system, you can use a special testing
+facility available if the kernel is compiled with CONFIG_PM_DEBUG set.  Then,
+there is the file /sys/power/pm_test that can be used to make the hibernation
+core run in a test mode.  There are 5 test modes available:
+
+freezer
+	- test the freezing of processes
+
+devices
+	- test the freezing of processes and suspending of devices
+
+platform
+	- test the freezing of processes, suspending of devices and platform
+	  global control methods [1]_
+
+processors
+	- test the freezing of processes, suspending of devices, platform
+	  global control methods [1]_ and the disabling of nonboot CPUs
+
+core
+	- test the freezing of processes, suspending of devices, platform global
+	  control methods\ [1]_, the disabling of nonboot CPUs and suspending
+	  of platform/system devices
+
+.. [1]
+
+    the platform global control methods are only available on ACPI systems
+    and are only tested if the hibernation mode is set to "platform"
+
+To use one of them it is necessary to write the corresponding string to
+/sys/power/pm_test (eg. "devices" to test the freezing of processes and
+suspending devices) and issue the standard hibernation commands.  For example,
+to use the "devices" test mode along with the "platform" mode of hibernation,
+you should do the following::
+
+	# echo devices > /sys/power/pm_test
+	# echo platform > /sys/power/disk
+	# echo disk > /sys/power/state
+
+Then, the kernel will try to freeze processes, suspend devices, wait a few
+seconds (5 by default, but configurable by the suspend.pm_test_delay module
+parameter), resume devices and thaw processes.  If "platform" is written to
+/sys/power/pm_test , then after suspending devices the kernel will additionally
+invoke the global control methods (eg. ACPI global control methods) used to
+prepare the platform firmware for hibernation.  Next, it will wait a
+configurable number of seconds and invoke the platform (eg. ACPI) global
+methods used to cancel hibernation etc.
+
+Writing "none" to /sys/power/pm_test causes the kernel to switch to the normal
+hibernation/suspend operations.  Also, when open for reading, /sys/power/pm_test
+contains a space-separated list of all available tests (including "none" that
+represents the normal functionality) in which the current test level is
+indicated by square brackets.
+
+Generally, as you can see, each test level is more "invasive" than the previous
+one and the "core" level tests the hardware and drivers as deeply as possible
+without creating a hibernation image.  Obviously, if the "devices" test fails,
+the "platform" test will fail as well and so on.  Thus, as a rule of thumb, you
+should try the test modes starting from "freezer", through "devices", "platform"
+and "processors" up to "core" (repeat the test on each level a couple of times
+to make sure that any random factors are avoided).
+
+If the "freezer" test fails, there is a task that cannot be frozen (in that case
+it usually is possible to identify the offending task by analysing the output of
+dmesg obtained after the failing test).  Failure at this level usually means
+that there is a problem with the tasks freezer subsystem that should be
+reported.
+
+If the "devices" test fails, most likely there is a driver that cannot suspend
+or resume its device (in the latter case the system may hang or become unstable
+after the test, so please take that into consideration).  To find this driver,
+you can carry out a binary search according to the rules:
+
+- if the test fails, unload a half of the drivers currently loaded and repeat
+  (that would probably involve rebooting the system, so always note what drivers
+  have been loaded before the test),
+- if the test succeeds, load a half of the drivers you have unloaded most
+  recently and repeat.
+
+Once you have found the failing driver (there can be more than just one of
+them), you have to unload it every time before hibernation.  In that case please
+make sure to report the problem with the driver.
+
+It is also possible that the "devices" test will still fail after you have
+unloaded all modules. In that case, you may want to look in your kernel
+configuration for the drivers that can be compiled as modules (and test again
+with these drivers compiled as modules).  You may also try to use some special
+kernel command line options such as "noapic", "noacpi" or even "acpi=off".
+
+If the "platform" test fails, there is a problem with the handling of the
+platform (eg. ACPI) firmware on your system.  In that case the "platform" mode
+of hibernation is not likely to work.  You can try the "shutdown" mode, but that
+is rather a poor man's workaround.
+
+If the "processors" test fails, the disabling/enabling of nonboot CPUs does not
+work (of course, this only may be an issue on SMP systems) and the problem
+should be reported.  In that case you can also try to switch the nonboot CPUs
+off and on using the /sys/devices/system/cpu/cpu*/online sysfs attributes and
+see if that works.
+
+If the "core" test fails, which means that suspending of the system/platform
+devices has failed (these devices are suspended on one CPU with interrupts off),
+the problem is most probably hardware-related and serious, so it should be
+reported.
+
+A failure of any of the "platform", "processors" or "core" tests may cause your
+system to hang or become unstable, so please beware.  Such a failure usually
+indicates a serious problem that very well may be related to the hardware, but
+please report it anyway.
+
+b) Testing minimal configuration
+--------------------------------
+
+If all of the hibernation test modes work, you can boot the system with the
+"init=/bin/bash" command line parameter and attempt to hibernate in the
+"reboot", "shutdown" and "platform" modes.  If that does not work, there
+probably is a problem with a driver statically compiled into the kernel and you
+can try to compile more drivers as modules, so that they can be tested
+individually.  Otherwise, there is a problem with a modular driver and you can
+find it by loading a half of the modules you normally use and binary searching
+in accordance with the algorithm:
+- if there are n modules loaded and the attempt to suspend and resume fails,
+unload n/2 of the modules and try again (that would probably involve rebooting
+the system),
+- if there are n modules loaded and the attempt to suspend and resume succeeds,
+load n/2 modules more and try again.
+
+Again, if you find the offending module(s), it(they) must be unloaded every time
+before hibernation, and please report the problem with it(them).
+
+c) Using the "test_resume" hibernation option
+---------------------------------------------
+
+/sys/power/disk generally tells the kernel what to do after creating a
+hibernation image.  One of the available options is "test_resume" which
+causes the just created image to be used for immediate restoration.  Namely,
+after doing::
+
+	# echo test_resume > /sys/power/disk
+	# echo disk > /sys/power/state
+
+a hibernation image will be created and a resume from it will be triggered
+immediately without involving the platform firmware in any way.
+
+That test can be used to check if failures to resume from hibernation are
+related to bad interactions with the platform firmware.  That is, if the above
+works every time, but resume from actual hibernation does not work or is
+unreliable, the platform firmware may be responsible for the failures.
+
+On architectures and platforms that support using different kernels to restore
+hibernation images (that is, the kernel used to read the image from storage and
+load it into memory is different from the one included in the image) or support
+kernel address space randomization, it also can be used to check if failures
+to resume may be related to the differences between the restore and image
+kernels.
+
+d) Advanced debugging
+---------------------
+
+In case that hibernation does not work on your system even in the minimal
+configuration and compiling more drivers as modules is not practical or some
+modules cannot be unloaded, you can use one of the more advanced debugging
+techniques to find the problem.  First, if there is a serial port in your box,
+you can boot the kernel with the 'no_console_suspend' parameter and try to log
+kernel messages using the serial console.  This may provide you with some
+information about the reasons for the suspend (resume) failure.  Alternatively,
+it may be possible to use a FireWire port for debugging with firescope
+(http://v3.sk/~lkundrak/firescope/).  On x86 it is also possible to
+use the PM_TRACE mechanism documented in Documentation/power/s2ram.rst .
+
+2. Testing suspend to RAM (STR)
+===============================
+
+To verify that the STR works, it is generally more convenient to use the s2ram
+tool available from http://suspend.sf.net and documented at
+http://en.opensuse.org/SDB:Suspend_to_RAM (S2RAM_LINK).
+
+Namely, after writing "freezer", "devices", "platform", "processors", or "core"
+into /sys/power/pm_test (available if the kernel is compiled with
+CONFIG_PM_DEBUG set) the suspend code will work in the test mode corresponding
+to the given string.  The STR test modes are defined in the same way as for
+hibernation, so please refer to Section 1 for more information about them.  In
+particular, the "core" test allows you to test everything except for the actual
+invocation of the platform firmware in order to put the system into the sleep
+state.
+
+Among other things, the testing with the help of /sys/power/pm_test may allow
+you to identify drivers that fail to suspend or resume their devices.  They
+should be unloaded every time before an STR transition.
+
+Next, you can follow the instructions at S2RAM_LINK to test the system, but if
+it does not work "out of the box", you may need to boot it with
+"init=/bin/bash" and test s2ram in the minimal configuration.  In that case,
+you may be able to search for failing drivers by following the procedure
+analogous to the one described in section 1.  If you find some failing drivers,
+you will have to unload them every time before an STR transition (ie. before
+you run s2ram), and please report the problems with them.
+
+There is a debugfs entry which shows the suspend to RAM statistics. Here is an
+example of its output::
+
+	# mount -t debugfs none /sys/kernel/debug
+	# cat /sys/kernel/debug/suspend_stats
+	success: 20
+	fail: 5
+	failed_freeze: 0
+	failed_prepare: 0
+	failed_suspend: 5
+	failed_suspend_noirq: 0
+	failed_resume: 0
+	failed_resume_noirq: 0
+	failures:
+	  last_failed_dev:	alarm
+				adc
+	  last_failed_errno:	-16
+				-16
+	  last_failed_step:	suspend
+				suspend
+
+The success field is the number of successful suspend-to-RAM transitions,
+and the fail field is the number of failed ones.  The other fields count
+failures in the individual steps of suspend to RAM.  suspend_stats lists only
+the last 2 failed devices, with their error numbers and failed suspend steps.
diff --git a/Documentation/power/basic-pm-debugging.txt b/Documentation/power/basic-pm-debugging.txt
deleted file mode 100644
index 708f87f78a75..000000000000
--- a/Documentation/power/basic-pm-debugging.txt
+++ /dev/null
@@ -1,254 +0,0 @@
-Debugging hibernation and suspend
-	(C) 2007 Rafael J. Wysocki <rjw@sisk.pl>, GPL
-
-1. Testing hibernation (aka suspend to disk or STD)
-
-To check if hibernation works, you can try to hibernate in the "reboot" mode:
-
-# echo reboot > /sys/power/disk
-# echo disk > /sys/power/state
-
-and the system should create a hibernation image, reboot, resume and get back to
-the command prompt where you have started the transition.  If that happens,
-hibernation is most likely to work correctly.  Still, you need to repeat the
-test at least a couple of times in a row for confidence.  [This is necessary,
-because some problems only show up on a second attempt at suspending and
-resuming the system.]  Moreover, hibernating in the "reboot" and "shutdown"
-modes causes the PM core to skip some platform-related callbacks which on ACPI
-systems might be necessary to make hibernation work.  Thus, if your machine fails
-to hibernate or resume in the "reboot" mode, you should try the "platform" mode:
-
-# echo platform > /sys/power/disk
-# echo disk > /sys/power/state
-
-which is the default and recommended mode of hibernation.
-
-Unfortunately, the "platform" mode of hibernation does not work on some systems
-with broken BIOSes.  In such cases the "shutdown" mode of hibernation might
-work:
-
-# echo shutdown > /sys/power/disk
-# echo disk > /sys/power/state
-
-(it is similar to the "reboot" mode, but it requires you to press the power
-button to make the system resume).
-
-If neither "platform" nor "shutdown" hibernation mode works, you will need to
-identify what goes wrong.
-
-a) Test modes of hibernation
-
-To find out why hibernation fails on your system, you can use a special testing
-facility available if the kernel is compiled with CONFIG_PM_DEBUG set.  Then,
-there is the file /sys/power/pm_test that can be used to make the hibernation
-core run in a test mode.  There are 5 test modes available:
-
-freezer
-- test the freezing of processes
-
-devices
-- test the freezing of processes and suspending of devices
-
-platform
-- test the freezing of processes, suspending of devices and platform
-  global control methods(*)
-
-processors
-- test the freezing of processes, suspending of devices, platform
-  global control methods(*) and the disabling of nonboot CPUs
-
-core
-- test the freezing of processes, suspending of devices, platform global
-  control methods(*), the disabling of nonboot CPUs and suspending of
-  platform/system devices
-
-(*) the platform global control methods are only available on ACPI systems
-    and are only tested if the hibernation mode is set to "platform"
-
-To use one of them it is necessary to write the corresponding string to
-/sys/power/pm_test (eg. "devices" to test the freezing of processes and
-suspending devices) and issue the standard hibernation commands.  For example,
-to use the "devices" test mode along with the "platform" mode of hibernation,
-you should do the following:
-
-# echo devices > /sys/power/pm_test
-# echo platform > /sys/power/disk
-# echo disk > /sys/power/state
-
-Then, the kernel will try to freeze processes, suspend devices, wait a few
-seconds (5 by default, but configurable by the suspend.pm_test_delay module
-parameter), resume devices and thaw processes.  If "platform" is written to
-/sys/power/pm_test , then after suspending devices the kernel will additionally
-invoke the global control methods (eg. ACPI global control methods) used to
-prepare the platform firmware for hibernation.  Next, it will wait a
-configurable number of seconds and invoke the platform (eg. ACPI) global
-methods used to cancel hibernation etc.
-
-Writing "none" to /sys/power/pm_test causes the kernel to switch to the normal
-hibernation/suspend operations.  Also, when open for reading, /sys/power/pm_test
-contains a space-separated list of all available tests (including "none" that
-represents the normal functionality) in which the current test level is
-indicated by square brackets.
-
-Generally, as you can see, each test level is more "invasive" than the previous
-one and the "core" level tests the hardware and drivers as deeply as possible
-without creating a hibernation image.  Obviously, if the "devices" test fails,
-the "platform" test will fail as well and so on.  Thus, as a rule of thumb, you
-should try the test modes starting from "freezer", through "devices", "platform"
-and "processors" up to "core" (repeat the test on each level a couple of times
-to make sure that any random factors are avoided).
-
-If the "freezer" test fails, there is a task that cannot be frozen (in that case
-it usually is possible to identify the offending task by analysing the output of
-dmesg obtained after the failing test).  Failure at this level usually means
-that there is a problem with the tasks freezer subsystem that should be
-reported.
-
-If the "devices" test fails, most likely there is a driver that cannot suspend
-or resume its device (in the latter case the system may hang or become unstable
-after the test, so please take that into consideration).  To find this driver,
-you can carry out a binary search according to the rules:
-- if the test fails, unload a half of the drivers currently loaded and repeat
-(that would probably involve rebooting the system, so always note what drivers
-have been loaded before the test),
-- if the test succeeds, load a half of the drivers you have unloaded most
-recently and repeat.
-
-Once you have found the failing driver (there can be more than just one of
-them), you have to unload it every time before hibernation.  In that case please
-make sure to report the problem with the driver.
-
-It is also possible that the "devices" test will still fail after you have
-unloaded all modules. In that case, you may want to look in your kernel
-configuration for the drivers that can be compiled as modules (and test again
-with these drivers compiled as modules).  You may also try to use some special
-kernel command line options such as "noapic", "noacpi" or even "acpi=off".
-
-If the "platform" test fails, there is a problem with the handling of the
-platform (eg. ACPI) firmware on your system.  In that case the "platform" mode
-of hibernation is not likely to work.  You can try the "shutdown" mode, but that
-is rather a poor man's workaround.
-
-If the "processors" test fails, the disabling/enabling of nonboot CPUs does not
-work (of course, this only may be an issue on SMP systems) and the problem
-should be reported.  In that case you can also try to switch the nonboot CPUs
-off and on using the /sys/devices/system/cpu/cpu*/online sysfs attributes and
-see if that works.
-
-If the "core" test fails, which means that suspending of the system/platform
-devices has failed (these devices are suspended on one CPU with interrupts off),
-the problem is most probably hardware-related and serious, so it should be
-reported.
-
-A failure of any of the "platform", "processors" or "core" tests may cause your
-system to hang or become unstable, so please beware.  Such a failure usually
-indicates a serious problem that very well may be related to the hardware, but
-please report it anyway.
-
-b) Testing minimal configuration
-
-If all of the hibernation test modes work, you can boot the system with the
-"init=/bin/bash" command line parameter and attempt to hibernate in the
-"reboot", "shutdown" and "platform" modes.  If that does not work, there
-probably is a problem with a driver statically compiled into the kernel and you
-can try to compile more drivers as modules, so that they can be tested
-individually.  Otherwise, there is a problem with a modular driver and you can
-find it by loading a half of the modules you normally use and binary searching
-in accordance with the algorithm:
-- if there are n modules loaded and the attempt to suspend and resume fails,
-unload n/2 of the modules and try again (that would probably involve rebooting
-the system),
-- if there are n modules loaded and the attempt to suspend and resume succeeds,
-load n/2 modules more and try again.
-
-Again, if you find the offending module(s), it(they) must be unloaded every time
-before hibernation, and please report the problem with it(them).
-
-c) Using the "test_resume" hibernation option
-
-/sys/power/disk generally tells the kernel what to do after creating a
-hibernation image.  One of the available options is "test_resume" which
-causes the just created image to be used for immediate restoration.  Namely,
-after doing:
-
-# echo test_resume > /sys/power/disk
-# echo disk > /sys/power/state
-
-a hibernation image will be created and a resume from it will be triggered
-immediately without involving the platform firmware in any way.
-
-That test can be used to check if failures to resume from hibernation are
-related to bad interactions with the platform firmware.  That is, if the above
-works every time, but resume from actual hibernation does not work or is
-unreliable, the platform firmware may be responsible for the failures.
-
-On architectures and platforms that support using different kernels to restore
-hibernation images (that is, the kernel used to read the image from storage and
-load it into memory is different from the one included in the image) or support
-kernel address space randomization, it also can be used to check if failures
-to resume may be related to the differences between the restore and image
-kernels.
-
-d) Advanced debugging
-
-In case that hibernation does not work on your system even in the minimal
-configuration and compiling more drivers as modules is not practical or some
-modules cannot be unloaded, you can use one of the more advanced debugging
-techniques to find the problem.  First, if there is a serial port in your box,
-you can boot the kernel with the 'no_console_suspend' parameter and try to log
-kernel messages using the serial console.  This may provide you with some
-information about the reasons of the suspend (resume) failure.  Alternatively,
-it may be possible to use a FireWire port for debugging with firescope
-(http://v3.sk/~lkundrak/firescope/).  On x86 it is also possible to
-use the PM_TRACE mechanism documented in Documentation/power/s2ram.txt .
-
-2. Testing suspend to RAM (STR)
-
-To verify that the STR works, it is generally more convenient to use the s2ram
-tool available from http://suspend.sf.net and documented at
-http://en.opensuse.org/SDB:Suspend_to_RAM (S2RAM_LINK).
-
-Namely, after writing "freezer", "devices", "platform", "processors", or "core"
-into /sys/power/pm_test (available if the kernel is compiled with
-CONFIG_PM_DEBUG set) the suspend code will work in the test mode corresponding
-to given string.  The STR test modes are defined in the same way as for
-hibernation, so please refer to Section 1 for more information about them.  In
-particular, the "core" test allows you to test everything except for the actual
-invocation of the platform firmware in order to put the system into the sleep
-state.
-
-Among other things, the testing with the help of /sys/power/pm_test may allow
-you to identify drivers that fail to suspend or resume their devices.  They
-should be unloaded every time before an STR transition.
-
-Next, you can follow the instructions at S2RAM_LINK to test the system, but if
-it does not work "out of the box", you may need to boot it with
-"init=/bin/bash" and test s2ram in the minimal configuration.  In that case,
-you may be able to search for failing drivers by following the procedure
-analogous to the one described in section 1.  If you find some failing drivers,
-you will have to unload them every time before an STR transition (ie. before
-you run s2ram), and please report the problems with them.
-
-There is a debugfs entry which shows the suspend to RAM statistics. Here is an
-example of its output.
-	# mount -t debugfs none /sys/kernel/debug
-	# cat /sys/kernel/debug/suspend_stats
-	success: 20
-	fail: 5
-	failed_freeze: 0
-	failed_prepare: 0
-	failed_suspend: 5
-	failed_suspend_noirq: 0
-	failed_resume: 0
-	failed_resume_noirq: 0
-	failures:
-	  last_failed_dev:	alarm
-				adc
-	  last_failed_errno:	-16
-				-16
-	  last_failed_step:	suspend
-				suspend
-Field success means the success number of suspend to RAM, and field fail means
-the failure number. Others are the failure number of different steps of suspend
-to RAM. suspend_stats just lists the last 2 failed devices, error number and
-failed step of suspend.
diff --git a/Documentation/power/charger-manager.rst b/Documentation/power/charger-manager.rst
new file mode 100644
index 000000000000..84fab9376792
--- /dev/null
+++ b/Documentation/power/charger-manager.rst
@@ -0,0 +1,205 @@
+===============
+Charger Manager
+===============
+
+	(C) 2011 MyungJoo Ham <myungjoo.ham@samsung.com>, GPL
+
+Charger Manager provides in-kernel battery charger management that
+requires temperature monitoring during suspend-to-RAM state
+and where each battery may have multiple chargers attached and the userland
+wants to look at the aggregated information of the multiple chargers.
+
+Charger Manager is a platform_driver with power-supply-class entries.
+An instance of Charger Manager (a platform-device created with Charger-Manager)
+represents an independent battery with chargers. If there are multiple
+batteries with their own chargers acting independently in a system,
+the system may need multiple instances of Charger Manager.
+
+1. Introduction
+===============
+
+Charger Manager supports the following:
+
+* Support for multiple chargers (e.g., a device with USB, AC, and solar panels)
+	A system may have multiple chargers (or power sources) and some of
+	them may be activated at the same time. Each charger may have its
+	own power-supply-class and each power-supply-class can provide
+	different information about the battery status. This framework
+	aggregates charger-related information from multiple sources and
+	shows combined information as a single power-supply-class.
+
+* Support for in suspend-to-RAM polling (with suspend_again callback)
+	While the battery is being charged and the system is in suspend-to-RAM,
+	we may need to monitor the battery health by looking at the ambient or
+	battery temperature. We can accomplish this by waking up the system
+	periodically. However, such a method wakes up devices unnecessarily for
+	monitoring the battery health and tasks, and user processes that are
+	supposed to be kept suspended. That, in turn, incurs unnecessary power
+	consumption and slows down the charging process. Worse, such peak power
+	consumption can stop chargers in the middle of charging
+	(external power input < device power consumption), which affects not
+	only the charging time, but also the lifespan of the battery.
+
+	Charger Manager provides a function "cm_suspend_again" that can be
+	used as suspend_again callback of platform_suspend_ops. If the platform
+	requires tasks other than cm_suspend_again, it may implement its own
+	suspend_again callback that calls cm_suspend_again in the middle.
+	Normally, the platform will need to resume and suspend some devices
+	that are used by Charger Manager.
+
+* Support for premature full-battery event handling
+	If the battery voltage drops by "fullbatt_vchkdrop_uV" after
+	"fullbatt_vchkdrop_ms" from the full-battery event, the framework
+	restarts charging. This check is also performed while suspended by
+	setting wakeup time accordingly and using suspend_again.
+
+* Support for uevent-notify
+	With the charger-related events, the device sends
+	notification to users with UEVENT.
+
+2. Global Charger-Manager Data related with suspend_again
+=========================================================
+In order to setup Charger Manager with suspend-again feature
+(in-suspend monitoring), the user should provide charger_global_desc
+with setup_charger_manager(`struct charger_global_desc *`).
+This charger_global_desc data for in-suspend monitoring is global
+as the name suggests. Thus, the user needs to provide only once even
+if there are multiple batteries. If there are multiple batteries, the
+multiple instances of Charger Manager share the same charger_global_desc
+and it will manage in-suspend monitoring for all instances of Charger Manager.
+
+The user needs to provide all the three entries to `struct charger_global_desc`
+properly in order to activate in-suspend monitoring:
+
+`char *rtc_name;`
+	The name of rtc (e.g., "rtc0") used to wakeup the system from
+	suspend for Charger Manager. The alarm interrupt (AIE) of the rtc
+	should be able to wake up the system from suspend. Charger Manager
+	saves and restores the alarm value and uses the previously-defined
+	alarm if it is going to go off earlier than Charger Manager so that
+	Charger Manager does not interfere with previously-defined alarms.
+
+`bool (*rtc_only_wakeup)(void);`
+	This callback should let CM know whether
+	the wakeup-from-suspend is caused only by the alarm of "rtc" in the
+	same struct. If any other wakeup source triggered the
+	wakeup, it should return false. If the "rtc" is the only wakeup
+	reason, it should return true.
+
+`bool assume_timer_stops_in_suspend;`
+	if true, Charger Manager assumes that
+	the timer (CM uses jiffies as timer) stops during suspend. Then, CM
+	assumes that the suspend-duration is the same as the alarm length.
+
+
+3. How to setup suspend_again
+=============================
+Charger Manager provides a function "extern bool cm_suspend_again(void)".
+When cm_suspend_again is called, it monitors every battery. The suspend_ops
+callback of the system's platform_suspend_ops can call cm_suspend_again
+function to know whether Charger Manager wants to suspend again or not.
+If there are no other devices or tasks that want to use suspend_again
+feature, the platform_suspend_ops may directly refer to cm_suspend_again
+for its suspend_again callback.
+
+The cm_suspend_again() returns true (meaning "I want to suspend again")
+if the system was woken up by Charger Manager and the polling
+(in-suspend monitoring) results in "normal".
+
+4. Charger-Manager Data (struct charger_desc)
+=============================================
+For each battery charged independently from other batteries (if a series of
+batteries are charged by a single charger, they are counted as one independent
+battery), an instance of Charger Manager is attached to it.
+
+The following are the struct charger_desc elements:
+
+`char *psy_name;`
+	The power-supply-class name of the battery. Default is
+	"battery" if psy_name is NULL. Users can access the psy entries
+	at "/sys/class/power_supply/[psy_name]/".
+
+`enum polling_modes polling_mode;`
+	  CM_POLL_DISABLE:
+		do not poll this battery.
+	  CM_POLL_ALWAYS:
+		always poll this battery.
+	  CM_POLL_EXTERNAL_POWER_ONLY:
+		poll this battery if and only if an external power
+		source is attached.
+	  CM_POLL_CHARGING_ONLY:
+		poll this battery if and only if the battery is being charged.
+
+`unsigned int fullbatt_vchkdrop_ms; / unsigned int fullbatt_vchkdrop_uV;`
+	If both have non-zero values, Charger Manager will check the
+	battery voltage drop fullbatt_vchkdrop_ms after the battery is fully
+	charged. If the voltage drop is over fullbatt_vchkdrop_uV, Charger
+	Manager will try to recharge the battery by disabling and enabling
+	chargers. Recharge with voltage drop condition only (without delay
+	condition) is needed to be implemented with hardware interrupts from
+	fuel gauges or charger devices/chips.
+
+`unsigned int fullbatt_uV;`
+	If specified with a non-zero value, Charger Manager assumes
+	that the battery is full (capacity = 100) if the battery is not being
+	charged and the battery voltage is equal to or greater than
+	fullbatt_uV.
+
+`unsigned int polling_interval_ms;`
+	Required polling interval in ms. Charger Manager will poll
+	this battery every polling_interval_ms or more frequently.
+
+`enum data_source battery_present;`
+	CM_BATTERY_PRESENT:
+		assume that the battery exists.
+	CM_NO_BATTERY:
+		assume that the battery does not exist.
+	CM_FUEL_GAUGE:
+		get battery presence information from fuel gauge.
+	CM_CHARGER_STAT:
+		get battery presence from chargers.
+
+`char **psy_charger_stat;`
+	An array ending with NULL that has power-supply-class names of
+	chargers. Each power-supply-class should provide "PRESENT" (if
+	battery_present is "CM_CHARGER_STAT"), "ONLINE" (shows whether an
+	external power source is attached or not), and "STATUS" (shows whether
+	the battery is {"FULL" or not FULL} or {"FULL", "Charging",
+	"Discharging", "NotCharging"}).
+
+`int num_charger_regulators; / struct regulator_bulk_data *charger_regulators;`
+	Regulators representing the chargers in the form for
+	regulator framework's bulk functions.
+
+`char *psy_fuel_gauge;`
+	Power-supply-class name of the fuel gauge.
+
+`int (*temperature_out_of_range)(int *mC); / bool measure_battery_temp;`
+	This callback returns 0 if the temperature is safe for charging,
+	a positive number if it is too hot to charge, and a negative number
+	if it is too cold to charge. With the variable mC, the callback returns
+	the temperature in 1/1000 of centigrade.
+	The source of temperature can be battery or ambient one according to
+	the value of measure_battery_temp.
+
+
+5. Notify Charger-Manager of charger events: cm_notify_event()
+==============================================================
+If there is a charger event that Charger Manager needs to be notified of,
+a charger device driver that triggers the event can call
+cm_notify_event(psy, type, msg) to notify the corresponding Charger Manager.
+In the function, psy is the charger driver's power_supply pointer, which is
+associated with Charger-Manager. The parameter "type"
+is the same as irq's type (enum cm_event_types). The event message "msg" is
+optional and is effective only if the event type is "UNDESCRIBED" or "OTHERS".
+
+6. Other Considerations
+=======================
+
+At the charger/battery-related events such as battery-pulled-out,
+charger-pulled-out, charger-inserted, DCIN-over/under-voltage, charger-stopped,
+and others critical to chargers, the system should be configured to wake up.
+At least the following should wake up the system from a suspend:
+a) charger-on/off b) external-power-in/out c) battery-in/out (while charging)
+
+It is usually accomplished by configuring the PMIC as a wakeup source.
diff --git a/Documentation/power/charger-manager.txt b/Documentation/power/charger-manager.txt
deleted file mode 100644
index 9ff1105e58d6..000000000000
--- a/Documentation/power/charger-manager.txt
+++ /dev/null
@@ -1,200 +0,0 @@
-Charger Manager
-	(C) 2011 MyungJoo Ham <myungjoo.ham@samsung.com>, GPL
-
-Charger Manager provides in-kernel battery charger management that
-requires temperature monitoring during suspend-to-RAM state
-and where each battery may have multiple chargers attached and the userland
-wants to look at the aggregated information of the multiple chargers.
-
-Charger Manager is a platform_driver with power-supply-class entries.
-An instance of Charger Manager (a platform-device created with Charger-Manager)
-represents an independent battery with chargers. If there are multiple
-batteries with their own chargers acting independently in a system,
-the system may need multiple instances of Charger Manager.
-
-1. Introduction
-===============
-
-Charger Manager supports the following:
-
-* Support for multiple chargers (e.g., a device with USB, AC, and solar panels)
-	A system may have multiple chargers (or power sources) and some of
-	they may be activated at the same time. Each charger may have its
-	own power-supply-class and each power-supply-class can provide
-	different information about the battery status. This framework
-	aggregates charger-related information from multiple sources and
-	shows combined information as a single power-supply-class.
-
-* Support for in suspend-to-RAM polling (with suspend_again callback)
-	While the battery is being charged and the system is in suspend-to-RAM,
-	we may need to monitor the battery health by looking at the ambient or
-	battery temperature. We can accomplish this by waking up the system
-	periodically. However, such a method wakes up devices unnecessarily for
-	monitoring the battery health and tasks, and user processes that are
-	supposed to be kept suspended. That, in turn, incurs unnecessary power
-	consumption and slow down charging process. Or even, such peak power
-	consumption can stop chargers in the middle of charging
-	(external power input < device power consumption), which not
-	only affects the charging time, but the lifespan of the battery.
-
-	Charger Manager provides a function "cm_suspend_again" that can be
-	used as suspend_again callback of platform_suspend_ops. If the platform
-	requires tasks other than cm_suspend_again, it may implement its own
-	suspend_again callback that calls cm_suspend_again in the middle.
-	Normally, the platform will need to resume and suspend some devices
-	that are used by Charger Manager.
-
-* Support for premature full-battery event handling
-	If the battery voltage drops by "fullbatt_vchkdrop_uV" after
-	"fullbatt_vchkdrop_ms" from the full-battery event, the framework
-	restarts charging. This check is also performed while suspended by
-	setting wakeup time accordingly and using suspend_again.
-
-* Support for uevent-notify
-	With the charger-related events, the device sends
-	notification to users with UEVENT.
-
-2. Global Charger-Manager Data related with suspend_again
-========================================================
-In order to setup Charger Manager with suspend-again feature
-(in-suspend monitoring), the user should provide charger_global_desc
-with setup_charger_manager(struct charger_global_desc *).
-This charger_global_desc data for in-suspend monitoring is global
-as the name suggests. Thus, the user needs to provide only once even
-if there are multiple batteries. If there are multiple batteries, the
-multiple instances of Charger Manager share the same charger_global_desc
-and it will manage in-suspend monitoring for all instances of Charger Manager.
-
-The user needs to provide all the three entries properly in order to activate
-in-suspend monitoring:
-
-struct charger_global_desc {
-
-char *rtc_name;
-	: The name of rtc (e.g., "rtc0") used to wakeup the system from
-	suspend for Charger Manager. The alarm interrupt (AIE) of the rtc
-	should be able to wake up the system from suspend. Charger Manager
-	saves and restores the alarm value and use the previously-defined
-	alarm if it is going to go off earlier than Charger Manager so that
-	Charger Manager does not interfere with previously-defined alarms.
-
-bool (*rtc_only_wakeup)(void);
-	: This callback should let CM know whether
-	the wakeup-from-suspend is caused only by the alarm of "rtc" in the
-	same struct. If there is any other wakeup source triggered the
-	wakeup, it should return false. If the "rtc" is the only wakeup
-	reason, it should return true.
-
-bool assume_timer_stops_in_suspend;
-	: if true, Charger Manager assumes that
-	the timer (CM uses jiffies as timer) stops during suspend. Then, CM
-	assumes that the suspend-duration is same as the alarm length.
-};
-
-3. How to setup suspend_again
-=============================
-Charger Manager provides a function "extern bool cm_suspend_again(void)".
-When cm_suspend_again is called, it monitors every battery. The suspend_ops
-callback of the system's platform_suspend_ops can call cm_suspend_again
-function to know whether Charger Manager wants to suspend again or not.
-If there are no other devices or tasks that want to use suspend_again
-feature, the platform_suspend_ops may directly refer to cm_suspend_again
-for its suspend_again callback.
-
-The cm_suspend_again() returns true (meaning "I want to suspend again")
-if the system was woken up by Charger Manager and the polling
-(in-suspend monitoring) results in "normal".
-
-4. Charger-Manager Data (struct charger_desc)
-=============================================
-For each battery charged independently from other batteries (if a series of
-batteries are charged by a single charger, they are counted as one independent
-battery), an instance of Charger Manager is attached to it.
-
-struct charger_desc {
-
-char *psy_name;
-	: The power-supply-class name of the battery. Default is
-	"battery" if psy_name is NULL. Users can access the psy entries
-	at "/sys/class/power_supply/[psy_name]/".
-
-enum polling_modes polling_mode;
-	: CM_POLL_DISABLE: do not poll this battery.
-	  CM_POLL_ALWAYS: always poll this battery.
-	  CM_POLL_EXTERNAL_POWER_ONLY: poll this battery if and only if
-				       an external power source is attached.
-	  CM_POLL_CHARGING_ONLY: poll this battery if and only if the
-				 battery is being charged.
-
-unsigned int fullbatt_vchkdrop_ms;
-unsigned int fullbatt_vchkdrop_uV;
-	: If both have non-zero values, Charger Manager will check the
-	battery voltage drop fullbatt_vchkdrop_ms after the battery is fully
-	charged. If the voltage drop is over fullbatt_vchkdrop_uV, Charger
-	Manager will try to recharge the battery by disabling and enabling
-	chargers. Recharge with voltage drop condition only (without delay
-	condition) is needed to be implemented with hardware interrupts from
-	fuel gauges or charger devices/chips.
-
-unsigned int fullbatt_uV;
-	: If specified with a non-zero value, Charger Manager assumes
-	that the battery is full (capacity = 100) if the battery is not being
-	charged and the battery voltage is equal to or greater than
-	fullbatt_uV.
-
-unsigned int polling_interval_ms;
-	: Required polling interval in ms. Charger Manager will poll
-	this battery every polling_interval_ms or more frequently.
-
-enum data_source battery_present;
-	: CM_BATTERY_PRESENT: assume that the battery exists.
-	CM_NO_BATTERY: assume that the battery does not exists.
-	CM_FUEL_GAUGE: get battery presence information from fuel gauge.
-	CM_CHARGER_STAT: get battery presence from chargers.
-
-char **psy_charger_stat;
-	: An array ending with NULL that has power-supply-class names of
-	chargers. Each power-supply-class should provide "PRESENT" (if
-	battery_present is "CM_CHARGER_STAT"), "ONLINE" (shows whether an
-	external power source is attached or not), and "STATUS" (shows whether
-	the battery is {"FULL" or not FULL} or {"FULL", "Charging",
-	"Discharging", "NotCharging"}).
-
-int num_charger_regulators;
-struct regulator_bulk_data *charger_regulators;
-	: Regulators representing the chargers in the form for
-	regulator framework's bulk functions.
-
-char *psy_fuel_gauge;
-	: Power-supply-class name of the fuel gauge.
-
-int (*temperature_out_of_range)(int *mC);
-bool measure_battery_temp;
-	: This callback returns 0 if the temperature is safe for charging,
-	a positive number if it is too hot to charge, and a negative number
-	if it is too cold to charge. With the variable mC, the callback returns
-	the temperature in 1/1000 of centigrade.
-	The source of temperature can be battery or ambient one according to
-	the value of measure_battery_temp.
-};
-
-5. Notify Charger-Manager of charger events: cm_notify_event()
-=========================================================
-If there is an charger event is required to notify
-Charger Manager, a charger device driver that triggers the event can call
-cm_notify_event(psy, type, msg) to notify the corresponding Charger Manager.
-In the function, psy is the charger driver's power_supply pointer, which is
-associated with Charger-Manager. The parameter "type"
-is the same as irq's type (enum cm_event_types). The event message "msg" is
-optional and is effective only if the event type is "UNDESCRIBED" or "OTHERS".
-
-6. Other Considerations
-=======================
-
-At the charger/battery-related events such as battery-pulled-out,
-charger-pulled-out, charger-inserted, DCIN-over/under-voltage, charger-stopped,
-and others critical to chargers, the system should be configured to wake up.
-At least the following should wake up the system from a suspend:
-a) charger-on/off b) external-power-in/out c) battery-in/out (while charging)
-
-It is usually accomplished by configuring the PMIC as a wakeup source.
diff --git a/Documentation/power/drivers-testing.rst b/Documentation/power/drivers-testing.rst
new file mode 100644
index 000000000000..e53f1999fc39
--- /dev/null
+++ b/Documentation/power/drivers-testing.rst
@@ -0,0 +1,51 @@
+====================================================
+Testing suspend and resume support in device drivers
+====================================================
+
+	(C) 2007 Rafael J. Wysocki <rjw@sisk.pl>, GPL
+
+1. Preparing the test system
+============================
+
+Unfortunately, to effectively test the support for the system-wide suspend and
+resume transitions in a driver, it is necessary to suspend and resume a fully
+functional system with this driver loaded.  Moreover, that should be done
+several times, preferably several times in a row, and separately for hibernation
+(aka suspend to disk or STD) and suspend to RAM (STR), because each of these
+cases involves slightly different operations and different interactions with
+the machine's BIOS.
+
+Of course, for this purpose the test system has to be known to suspend and
+resume without the driver being tested.  Thus, if possible, you should first
+resolve all suspend/resume-related problems in the test system before you start
+testing the new driver.  Please see Documentation/power/basic-pm-debugging.rst
+for more information about the debugging of suspend/resume functionality.
+
+2. Testing the driver
+=====================
+
+Once you have resolved the suspend/resume-related problems with your test system
+without the new driver, you are ready to test it:
+
+a) Build the driver as a module, load it and try the test modes of hibernation
+   (see: Documentation/power/basic-pm-debugging.rst, 1).
+
+b) Load the driver and attempt to hibernate in the "reboot", "shutdown" and
+   "platform" modes (see: Documentation/power/basic-pm-debugging.rst, 1).
+
+c) Compile the driver directly into the kernel and try the test modes of
+   hibernation.
+
+d) Attempt to hibernate with the driver compiled directly into the kernel
+   in the "reboot", "shutdown" and "platform" modes.
+
+e) Try the test modes of suspend (see: Documentation/power/basic-pm-debugging.rst,
+   2).  [As far as the STR tests are concerned, it should not matter whether or
+   not the driver is built as a module.]
+
+f) Attempt to suspend to RAM using the s2ram tool with the driver loaded
+   (see: Documentation/power/basic-pm-debugging.rst, 2).
+
+Each of the above tests should be repeated several times and the STD tests
+should be mixed with the STR tests.  If any of them fails, the driver cannot be
+regarded as suspend/resume-safe.
diff --git a/Documentation/power/drivers-testing.txt b/Documentation/power/drivers-testing.txt
deleted file mode 100644
index 638afdf4d6b8..000000000000
--- a/Documentation/power/drivers-testing.txt
+++ /dev/null
@@ -1,46 +0,0 @@
-Testing suspend and resume support in device drivers
-	(C) 2007 Rafael J. Wysocki <rjw@sisk.pl>, GPL
-
-1. Preparing the test system
-
-Unfortunately, to effectively test the support for the system-wide suspend and
-resume transitions in a driver, it is necessary to suspend and resume a fully
-functional system with this driver loaded.  Moreover, that should be done
-several times, preferably several times in a row, and separately for hibernation
-(aka suspend to disk or STD) and suspend to RAM (STR), because each of these
-cases involves slightly different operations and different interactions with
-the machine's BIOS.
-
-Of course, for this purpose the test system has to be known to suspend and
-resume without the driver being tested.  Thus, if possible, you should first
-resolve all suspend/resume-related problems in the test system before you start
-testing the new driver.  Please see Documentation/power/basic-pm-debugging.txt
-for more information about the debugging of suspend/resume functionality.
-
-2. Testing the driver
-
-Once you have resolved the suspend/resume-related problems with your test system
-without the new driver, you are ready to test it:
-
-a) Build the driver as a module, load it and try the test modes of hibernation
-   (see: Documentation/power/basic-pm-debugging.txt, 1).
-
-b) Load the driver and attempt to hibernate in the "reboot", "shutdown" and
-   "platform" modes (see: Documentation/power/basic-pm-debugging.txt, 1).
-
-c) Compile the driver directly into the kernel and try the test modes of
-   hibernation.
-
-d) Attempt to hibernate with the driver compiled directly into the kernel
-   in the "reboot", "shutdown" and "platform" modes.
-
-e) Try the test modes of suspend (see: Documentation/power/basic-pm-debugging.txt,
-   2).  [As far as the STR tests are concerned, it should not matter whether or
-   not the driver is built as a module.]
-
-f) Attempt to suspend to RAM using the s2ram tool with the driver loaded
-   (see: Documentation/power/basic-pm-debugging.txt, 2).
-
-Each of the above tests should be repeated several times and the STD tests
-should be mixed with the STR tests.  If any of them fails, the driver cannot be
-regarded as suspend/resume-safe.
diff --git a/Documentation/power/energy-model.rst b/Documentation/power/energy-model.rst
new file mode 100644
index 000000000000..90a345d57ae9
--- /dev/null
+++ b/Documentation/power/energy-model.rst
@@ -0,0 +1,147 @@
+====================
+Energy Model of CPUs
+====================
+
+1. Overview
+-----------
+
+The Energy Model (EM) framework serves as an interface between drivers knowing
+the power consumed by CPUs at various performance levels, and the kernel
+subsystems willing to use that information to make energy-aware decisions.
+
+The source of the information about the power consumed by CPUs can vary greatly
+from one platform to another. These power costs can be estimated using
+devicetree data in some cases. In others, the firmware will know better.
+Alternatively, userspace might be best positioned. And so on. To spare
+each and every client subsystem from re-implementing support for each and every
+possible source of information on its own, the EM framework intervenes as an
+abstraction layer which standardizes the format of power cost tables in the
+kernel, thereby avoiding redundant work.
+
+The figure below depicts an example of drivers (Arm-specific here, but the
+approach is applicable to any architecture) providing power costs to the EM
+framework, and interested clients reading the data from it::
+
+       +---------------+  +-----------------+  +---------------+
+       | Thermal (IPA) |  | Scheduler (EAS) |  |     Other     |
+       +---------------+  +-----------------+  +---------------+
+               |                   | em_pd_energy()    |
+               |                   | em_cpu_get()      |
+               +---------+         |         +---------+
+                         |         |         |
+                         v         v         v
+                        +---------------------+
+                        |    Energy Model     |
+                        |     Framework       |
+                        +---------------------+
+                           ^       ^       ^
+                           |       |       | em_register_perf_domain()
+                +----------+       |       +---------+
+                |                  |                 |
+        +---------------+  +---------------+  +--------------+
+        |  cpufreq-dt   |  |   arm_scmi    |  |    Other     |
+        +---------------+  +---------------+  +--------------+
+                ^                  ^                 ^
+                |                  |                 |
+        +--------------+   +---------------+  +--------------+
+        | Device Tree  |   |   Firmware    |  |      ?       |
+        +--------------+   +---------------+  +--------------+
+
+The EM framework manages power cost tables per 'performance domain' in the
+system. A performance domain is a group of CPUs whose performance is scaled
+together. Performance domains generally have a 1-to-1 mapping with CPUFreq
+policies. All CPUs in a performance domain are required to have the same
+micro-architecture. CPUs in different performance domains can have different
+micro-architectures.
+
+
+2. Core APIs
+------------
+
+2.1 Config options
+^^^^^^^^^^^^^^^^^^
+
+CONFIG_ENERGY_MODEL must be enabled to use the EM framework.
+
+
+2.2 Registration of performance domains
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Drivers are expected to register performance domains into the EM framework by
+calling the following API::
+
+  int em_register_perf_domain(cpumask_t *span, unsigned int nr_states,
+			      struct em_data_callback *cb);
+
+Drivers must specify the CPUs of the performance domains using the cpumask
+argument, and provide a callback function returning <frequency, power> tuples
+for each capacity state. The callback function provided by the driver is free
+to fetch data from any relevant location (DT, firmware, ...), and by any means
+deemed necessary. See Section 3. for an example of driver implementing this
+callback, and kernel/power/energy_model.c for further documentation on this
+API.
+
+
+2.3 Accessing performance domains
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Subsystems interested in the energy model of a CPU can retrieve it using the
+em_cpu_get() API. The energy model tables are allocated once upon creation of
+the performance domains, and kept in memory untouched.
+
+The energy consumed by a performance domain can be estimated using the
+em_pd_energy() API. The estimation is performed assuming that the schedutil
+CPUfreq governor is in use.
+
+More details about the above APIs can be found in include/linux/energy_model.h.
+
+
+3. Example driver
+-----------------
+
+This section provides a simple example of a CPUFreq driver registering a
+performance domain in the Energy Model framework using the (fake) 'foo'
+protocol. The driver implements an est_power() function to be provided to the
+EM framework::
+
+  -> drivers/cpufreq/foo_cpufreq.c
+
+  01	static int est_power(unsigned long *mW, unsigned long *KHz, int cpu)
+  02	{
+  03		long freq, power;
+  04
+  05		/* Use the 'foo' protocol to ceil the frequency */
+  06		freq = foo_get_freq_ceil(cpu, *KHz);
+  07		if (freq < 0)
+  08			return freq;
+  09
+  10		/* Estimate the power cost for the CPU at the relevant freq. */
+  11		power = foo_estimate_power(cpu, freq);
+  12		if (power < 0)
+  13			return power;
+  14
+  15		/* Return the values to the EM framework */
+  16		*mW = power;
+  17		*KHz = freq;
+  18
+  19		return 0;
+  20	}
+  21
+  22	static int foo_cpufreq_init(struct cpufreq_policy *policy)
+  23	{
+  24		struct em_data_callback em_cb = EM_DATA_CB(est_power);
+  25		int nr_opp, ret;
+  26
+  27		/* Do the actual CPUFreq init work ... */
+  28		ret = do_foo_cpufreq_init(policy);
+  29		if (ret)
+  30			return ret;
+  31
+  32		/* Find the number of OPPs for this policy */
+  33		nr_opp = foo_get_nr_opp(policy);
+  34
+  35		/* And register the new performance domain */
+  36		em_register_perf_domain(policy->cpus, nr_opp, &em_cb);
+  37
+  38	        return 0;
+  39	}
diff --git a/Documentation/power/energy-model.txt b/Documentation/power/energy-model.txt
deleted file mode 100644
index a2b0ae4c76bd..000000000000
--- a/Documentation/power/energy-model.txt
+++ /dev/null
@@ -1,144 +0,0 @@
-                           ====================
-                           Energy Model of CPUs
-                           ====================
-
-1. Overview
------------
-
-The Energy Model (EM) framework serves as an interface between drivers knowing
-the power consumed by CPUs at various performance levels, and the kernel
-subsystems willing to use that information to make energy-aware decisions.
-
-The source of the information about the power consumed by CPUs can vary greatly
-from one platform to another. These power costs can be estimated using
-devicetree data in some cases. In others, the firmware will know better.
-Alternatively, userspace might be best positioned. And so on. In order to avoid
-each and every client subsystem to re-implement support for each and every
-possible source of information on its own, the EM framework intervenes as an
-abstraction layer which standardizes the format of power cost tables in the
-kernel, hence enabling to avoid redundant work.
-
-The figure below depicts an example of drivers (Arm-specific here, but the
-approach is applicable to any architecture) providing power costs to the EM
-framework, and interested clients reading the data from it.
-
-       +---------------+  +-----------------+  +---------------+
-       | Thermal (IPA) |  | Scheduler (EAS) |  |     Other     |
-       +---------------+  +-----------------+  +---------------+
-               |                   | em_pd_energy()    |
-               |                   | em_cpu_get()      |
-               +---------+         |         +---------+
-                         |         |         |
-                         v         v         v
-                        +---------------------+
-                        |    Energy Model     |
-                        |     Framework       |
-                        +---------------------+
-                           ^       ^       ^
-                           |       |       | em_register_perf_domain()
-                +----------+       |       +---------+
-                |                  |                 |
-        +---------------+  +---------------+  +--------------+
-        |  cpufreq-dt   |  |   arm_scmi    |  |    Other     |
-        +---------------+  +---------------+  +--------------+
-                ^                  ^                 ^
-                |                  |                 |
-        +--------------+   +---------------+  +--------------+
-        | Device Tree  |   |   Firmware    |  |      ?       |
-        +--------------+   +---------------+  +--------------+
-
-The EM framework manages power cost tables per 'performance domain' in the
-system. A performance domain is a group of CPUs whose performance is scaled
-together. Performance domains generally have a 1-to-1 mapping with CPUFreq
-policies. All CPUs in a performance domain are required to have the same
-micro-architecture. CPUs in different performance domains can have different
-micro-architectures.
-
-
-2. Core APIs
-------------
-
-  2.1 Config options
-
-CONFIG_ENERGY_MODEL must be enabled to use the EM framework.
-
-
-  2.2 Registration of performance domains
-
-Drivers are expected to register performance domains into the EM framework by
-calling the following API:
-
-  int em_register_perf_domain(cpumask_t *span, unsigned int nr_states,
-			      struct em_data_callback *cb);
-
-Drivers must specify the CPUs of the performance domains using the cpumask
-argument, and provide a callback function returning <frequency, power> tuples
-for each capacity state. The callback function provided by the driver is free
-to fetch data from any relevant location (DT, firmware, ...), and by any mean
-deemed necessary. See Section 3. for an example of driver implementing this
-callback, and kernel/power/energy_model.c for further documentation on this
-API.
-
-
-  2.3 Accessing performance domains
-
-Subsystems interested in the energy model of a CPU can retrieve it using the
-em_cpu_get() API. The energy model tables are allocated once upon creation of
-the performance domains, and kept in memory untouched.
-
-The energy consumed by a performance domain can be estimated using the
-em_pd_energy() API. The estimation is performed assuming that the schedutil
-CPUfreq governor is in use.
-
-More details about the above APIs can be found in include/linux/energy_model.h.
-
-
-3. Example driver
------------------
-
-This section provides a simple example of a CPUFreq driver registering a
-performance domain in the Energy Model framework using the (fake) 'foo'
-protocol. The driver implements an est_power() function to be provided to the
-EM framework.
-
- -> drivers/cpufreq/foo_cpufreq.c
-
-01	static int est_power(unsigned long *mW, unsigned long *KHz, int cpu)
-02	{
-03		long freq, power;
-04
-05		/* Use the 'foo' protocol to ceil the frequency */
-06		freq = foo_get_freq_ceil(cpu, *KHz);
-07		if (freq < 0);
-08			return freq;
-09
-10		/* Estimate the power cost for the CPU at the relevant freq. */
-11		power = foo_estimate_power(cpu, freq);
-12		if (power < 0);
-13			return power;
-14
-15		/* Return the values to the EM framework */
-16		*mW = power;
-17		*KHz = freq;
-18
-19		return 0;
-20	}
-21
-22	static int foo_cpufreq_init(struct cpufreq_policy *policy)
-23	{
-24		struct em_data_callback em_cb = EM_DATA_CB(est_power);
-25		int nr_opp, ret;
-26
-27		/* Do the actual CPUFreq init work ... */
-28		ret = do_foo_cpufreq_init(policy);
-29		if (ret)
-30			return ret;
-31
-32		/* Find the number of OPPs for this policy */
-33		nr_opp = foo_get_nr_opp(policy);
-34
-35		/* And register the new performance domain */
-36		em_register_perf_domain(policy->cpus, nr_opp, &em_cb);
-37
-38	        return 0;
-39	}
diff --git a/Documentation/power/freezing-of-tasks.rst b/Documentation/power/freezing-of-tasks.rst
new file mode 100644
index 000000000000..ef110fe55e82
--- /dev/null
+++ b/Documentation/power/freezing-of-tasks.rst
@@ -0,0 +1,244 @@
+=================
+Freezing of tasks
+=================
+
+(C) 2007 Rafael J. Wysocki <rjw@sisk.pl>, GPL
+
+I. What is the freezing of tasks?
+=================================
+
+The freezing of tasks is a mechanism by which user space processes and some
+kernel threads are controlled during hibernation or system-wide suspend (on some
+architectures).
+
+II. How does it work?
+=====================
+
+There are three per-task flags used for that, PF_NOFREEZE, PF_FROZEN
+and PF_FREEZER_SKIP (the last one is auxiliary).  The tasks that have
+PF_NOFREEZE unset (all user space processes and some kernel threads) are
+regarded as 'freezable' and treated in a special way before the system enters a
+suspend state as well as before a hibernation image is created (in what follows
+we only consider hibernation, but the description also applies to suspend).
+
+Namely, as the first step of the hibernation procedure the function
+freeze_processes() (defined in kernel/power/process.c) is called.  A system-wide
+variable system_freezing_cnt (as opposed to a per-task flag) is used to indicate
+whether the system is to undergo a freezing operation. And freeze_processes()
+sets this variable.  After this, it executes try_to_freeze_tasks() that sends a
+fake signal to all user space processes, and wakes up all the kernel threads.
+All freezable tasks must react to that by calling try_to_freeze(), which
+results in a call to __refrigerator() (defined in kernel/freezer.c), which sets
+the task's PF_FROZEN flag, changes its state to TASK_UNINTERRUPTIBLE and makes
+it loop until PF_FROZEN is cleared for it. Then, we say that the task is
+'frozen' and therefore the set of functions handling this mechanism is referred
+to as 'the freezer' (these functions are defined in kernel/power/process.c,
+kernel/freezer.c & include/linux/freezer.h). User space processes are generally
+frozen before kernel threads.
+
+__refrigerator() must not be called directly.  Instead, use the
+try_to_freeze() function (defined in include/linux/freezer.h), that checks
+if the task is to be frozen and makes the task enter __refrigerator().
+
+For user space processes try_to_freeze() is called automatically from the
+signal-handling code, but the freezable kernel threads need to call it
+explicitly in suitable places or use the wait_event_freezable() or
+wait_event_freezable_timeout() macros (defined in include/linux/freezer.h)
+that combine interruptible sleep with checking if the task is to be frozen and
+calling try_to_freeze().  The main loop of a freezable kernel thread may look
+like the following one::
+
+	set_freezable();
+	do {
+		hub_events();
+		wait_event_freezable(khubd_wait,
+				!list_empty(&hub_event_list) ||
+				kthread_should_stop());
+	} while (!kthread_should_stop() || !list_empty(&hub_event_list));
+
+(from drivers/usb/core/hub.c::hub_thread()).
+
+If a freezable kernel thread fails to call try_to_freeze() after the freezer has
+initiated a freezing operation, the freezing of tasks will fail and the entire
+hibernation operation will be cancelled.  For this reason, freezable kernel
+threads must call try_to_freeze() somewhere or use one of the
+wait_event_freezable() and wait_event_freezable_timeout() macros.
+
+After the system memory state has been restored from a hibernation image and
+devices have been reinitialized, the function thaw_processes() is called in
+order to clear the PF_FROZEN flag for each frozen task.  Then, the tasks that
+have been frozen leave __refrigerator() and continue running.
+
+
+Rationale behind the functions dealing with freezing and thawing of tasks
+-------------------------------------------------------------------------
+
+freeze_processes():
+  - freezes only userspace tasks
+
+freeze_kernel_threads():
+  - freezes all tasks (including kernel threads) because we can't freeze
+    kernel threads without freezing userspace tasks
+
+thaw_kernel_threads():
+  - thaws only kernel threads; this is particularly useful if we need to do
+    anything special in between thawing of kernel threads and thawing of
+    userspace tasks, or if we want to postpone the thawing of userspace tasks
+
+thaw_processes():
+  - thaws all tasks (including kernel threads) because we can't thaw userspace
+    tasks without thawing kernel threads
+
+
+III. Which kernel threads are freezable?
+========================================
+
+Kernel threads are not freezable by default.  However, a kernel thread may clear
+PF_NOFREEZE for itself by calling set_freezable() (the resetting of PF_NOFREEZE
+directly is not allowed).  From this point it is regarded as freezable
+and must call try_to_freeze() in a suitable place.
+
+IV. Why do we do that?
+======================
+
+Generally speaking, there are a couple of reasons to use the freezing of tasks:
+
+1. The principal reason is to prevent filesystems from being damaged after
+   hibernation.  At the moment we have no simple means of checkpointing
+   filesystems, so if there are any modifications made to filesystem data and/or
+   metadata on disks, we cannot bring them back to the state from before the
+   modifications.  At the same time each hibernation image contains some
+   filesystem-related information that must be consistent with the state of the
+   on-disk data and metadata after the system memory state has been restored
+   from the image (otherwise the filesystems will be damaged in a nasty way,
+   usually making them almost impossible to repair).  We therefore freeze
+   tasks that might cause the on-disk filesystems' data and metadata to be
+   modified after the hibernation image has been created and before the
+   system is finally powered off. The majority of these are user space
+   processes, but if any of the kernel threads may cause something like this
+   to happen, they have to be freezable.
+
+2. Next, to create the hibernation image we need to free a sufficient amount of
+   memory (approximately 50% of available RAM) and we need to do that before
+   devices are deactivated, because we generally need them for swapping out.
+   Then, after the memory for the image has been freed, we don't want tasks
+   to allocate additional memory and we prevent them from doing that by
+   freezing them earlier. [Of course, this also means that device drivers
+   should not allocate substantial amounts of memory from their .suspend()
+   callbacks before hibernation, but this is a separate issue.]
+
+3. The third reason is to prevent user space processes and some kernel threads
+   from interfering with the suspending and resuming of devices.  A user space
+   process running on a second CPU while we are suspending devices may, for
+   example, be troublesome and without the freezing of tasks we would need some
+   safeguards against race conditions that might occur in such a case.
+
+Although Linus Torvalds doesn't like the freezing of tasks, he said this in one
+of the discussions on LKML (http://lkml.org/lkml/2007/4/27/608):
+
+"RJW:> Why we freeze tasks at all or why we freeze kernel threads?
+
+Linus: In many ways, 'at all'.
+
+I **do** realize the IO request queue issues, and that we cannot actually do
+s2ram with some devices in the middle of a DMA.  So we want to be able to
+avoid *that*, there's no question about that.  And I suspect that stopping
+user threads and then waiting for a sync is practically one of the easier
+ways to do so.
+
+So in practice, the 'at all' may become a 'why freeze kernel threads?' and
+freezing user threads I don't find really objectionable."
+
+Still, there are kernel threads that may want to be freezable.  For example, if
+a kernel thread that belongs to a device driver accesses the device directly, it
+in principle needs to know when the device is suspended, so that it doesn't try
+to access it at that time.  However, if the kernel thread is freezable, it will
+be frozen before the driver's .suspend() callback is executed and it will be
+thawed after the driver's .resume() callback has run, so it won't be accessing
+the device while it's suspended.
+
+4. Another reason for freezing tasks is to prevent user space processes from
+   realizing that a hibernation (or suspend) operation takes place.  Ideally, user
+   space processes should not notice that such a system-wide operation has
+   occurred and should continue running without any problems after the restore
+   (or resume from suspend).  Unfortunately, in the most general case this
+   is quite difficult to achieve without the freezing of tasks.  Consider,
+   for example, a process that depends on all CPUs being online while it's
+   running.  Since we need to disable nonboot CPUs during the hibernation,
+   if this process is not frozen, it may notice that the number of CPUs has
+   changed and may start to work incorrectly because of that.
+
+V. Are there any problems related to the freezing of tasks?
+===========================================================
+
+Yes, there are.
+
+First of all, the freezing of kernel threads may be tricky if they depend on
+one another.  For example, if kernel thread A waits for a completion (in the
+TASK_UNINTERRUPTIBLE state) that needs to be done by freezable kernel thread B
+and B is frozen in the meantime, then A will be blocked until B is thawed, which
+may be undesirable.  That's why kernel threads are not freezable by default.
+
+Second, there are the following two problems related to the freezing of user
+space processes:
+
+1. Putting processes into an uninterruptible sleep distorts the load average.
+2. Now that we have FUSE, plus the framework for doing device drivers in
+   userspace, it gets even more complicated because some userspace processes are
+   now doing the sorts of things that kernel threads do
+   (https://lists.linux-foundation.org/pipermail/linux-pm/2007-May/012309.html).
+
+The problem 1. seems to be fixable, although it hasn't been fixed so far.  The
+other one is more serious, but it seems that we can work around it by using
+hibernation (and suspend) notifiers (in that case, though, we won't be able to
+avoid the realization by the user space processes that the hibernation is taking
+place).
+
+There are also problems that the freezing of tasks tends to expose, although
+they are not directly related to it.  For example, if request_firmware() is
+called from a device driver's .resume() routine, it will time out and eventually
+fail, because the user land process that should respond to the request is frozen
+at this point.  So, seemingly, the failure is due to the freezing of tasks.
+Suppose, however, that the firmware file is located on a filesystem accessible
+only through another device that hasn't been resumed yet.  In that case,
+request_firmware() will fail regardless of whether or not the freezing of tasks
+is used.  Consequently, the problem is not really related to the freezing of
+tasks, since it generally exists anyway.
+
+A driver must have all firmwares it may need in RAM before suspend() is called.
+If keeping them is not practical, for example due to their size, they must be
+requested early enough using the suspend notifier API described in
+Documentation/driver-api/pm/notifiers.rst.
+
+VI. Are there any precautions to be taken to prevent freezing failures?
+=======================================================================
+
+Yes, there are.
+
+First of all, grabbing the 'system_transition_mutex' lock to mutually exclude a piece of code
+from system-wide sleep such as suspend/hibernation is not encouraged.
+If possible, that piece of code must instead hook onto the suspend/hibernation
+notifiers to achieve mutual exclusion. Look at the CPU-Hotplug code
+(kernel/cpu.c) for an example.
+
+However, if that is not feasible, and grabbing 'system_transition_mutex' is deemed necessary,
+it is strongly discouraged to directly call mutex_[un]lock(&system_transition_mutex) since
+that could lead to freezing failures, because if the suspend/hibernate code
+successfully acquired the 'system_transition_mutex' lock, and hence that other entity failed
+to acquire the lock, then that task would get blocked in TASK_UNINTERRUPTIBLE
+state. As a consequence, the freezer would not be able to freeze that task,
+leading to freezing failure.
+
+However, the [un]lock_system_sleep() APIs are safe to use in this scenario,
+since they ask the freezer to skip freezing this task, since it is anyway
+"frozen enough" as it is blocked on 'system_transition_mutex', which will be released
+only after the entire suspend/hibernation sequence is complete.
+So, to summarize, use [un]lock_system_sleep() instead of directly using
+mutex_[un]lock(&system_transition_mutex). That would prevent freezing failures.
+
+VII. Miscellaneous
+==================
+
+/sys/power/pm_freeze_timeout controls the maximum time allowed for freezing
+all user space processes or all freezable kernel threads, in milliseconds.
+The default value is 20000, and the valid range is that of an unsigned integer.
diff --git a/Documentation/power/freezing-of-tasks.txt b/Documentation/power/freezing-of-tasks.txt
deleted file mode 100644
index cd283190855a..000000000000
--- a/Documentation/power/freezing-of-tasks.txt
+++ /dev/null
@@ -1,231 +0,0 @@
-Freezing of tasks
-	(C) 2007 Rafael J. Wysocki <rjw@sisk.pl>, GPL
-
-I. What is the freezing of tasks?
-
-The freezing of tasks is a mechanism by which user space processes and some
-kernel threads are controlled during hibernation or system-wide suspend (on some
-architectures).
-
-II. How does it work?
-
-There are three per-task flags used for that, PF_NOFREEZE, PF_FROZEN
-and PF_FREEZER_SKIP (the last one is auxiliary).  The tasks that have
-PF_NOFREEZE unset (all user space processes and some kernel threads) are
-regarded as 'freezable' and treated in a special way before the system enters a
-suspend state as well as before a hibernation image is created (in what follows
-we only consider hibernation, but the description also applies to suspend).
-
-Namely, as the first step of the hibernation procedure the function
-freeze_processes() (defined in kernel/power/process.c) is called.  A system-wide
-variable system_freezing_cnt (as opposed to a per-task flag) is used to indicate
-whether the system is to undergo a freezing operation. And freeze_processes()
-sets this variable.  After this, it executes try_to_freeze_tasks() that sends a
-fake signal to all user space processes, and wakes up all the kernel threads.
-All freezable tasks must react to that by calling try_to_freeze(), which
-results in a call to __refrigerator() (defined in kernel/freezer.c), which sets
-the task's PF_FROZEN flag, changes its state to TASK_UNINTERRUPTIBLE and makes
-it loop until PF_FROZEN is cleared for it. Then, we say that the task is
-'frozen' and therefore the set of functions handling this mechanism is referred
-to as 'the freezer' (these functions are defined in kernel/power/process.c,
-kernel/freezer.c & include/linux/freezer.h). User space processes are generally
-frozen before kernel threads.
-
-__refrigerator() must not be called directly.  Instead, use the
-try_to_freeze() function (defined in include/linux/freezer.h), that checks
-if the task is to be frozen and makes the task enter __refrigerator().
-
-For user space processes try_to_freeze() is called automatically from the
-signal-handling code, but the freezable kernel threads need to call it
-explicitly in suitable places or use the wait_event_freezable() or
-wait_event_freezable_timeout() macros (defined in include/linux/freezer.h)
-that combine interruptible sleep with checking if the task is to be frozen and
-calling try_to_freeze().  The main loop of a freezable kernel thread may look
-like the following one:
-
-	set_freezable();
-	do {
-		hub_events();
-		wait_event_freezable(khubd_wait,
-				!list_empty(&hub_event_list) ||
-				kthread_should_stop());
-	} while (!kthread_should_stop() || !list_empty(&hub_event_list));
-
-(from drivers/usb/core/hub.c::hub_thread()).
-
-If a freezable kernel thread fails to call try_to_freeze() after the freezer has
-initiated a freezing operation, the freezing of tasks will fail and the entire
-hibernation operation will be cancelled.  For this reason, freezable kernel
-threads must call try_to_freeze() somewhere or use one of the
-wait_event_freezable() and wait_event_freezable_timeout() macros.
-
-After the system memory state has been restored from a hibernation image and
-devices have been reinitialized, the function thaw_processes() is called in
-order to clear the PF_FROZEN flag for each frozen task.  Then, the tasks that
-have been frozen leave __refrigerator() and continue running.
-
-
-Rationale behind the functions dealing with freezing and thawing of tasks:
--------------------------------------------------------------------------
-
-freeze_processes():
-  - freezes only userspace tasks
-
-freeze_kernel_threads():
-  - freezes all tasks (including kernel threads) because we can't freeze
-    kernel threads without freezing userspace tasks
-
-thaw_kernel_threads():
-  - thaws only kernel threads; this is particularly useful if we need to do
-    anything special in between thawing of kernel threads and thawing of
-    userspace tasks, or if we want to postpone the thawing of userspace tasks
-
-thaw_processes():
-  - thaws all tasks (including kernel threads) because we can't thaw userspace
-    tasks without thawing kernel threads
-
-
-III. Which kernel threads are freezable?
-
-Kernel threads are not freezable by default.  However, a kernel thread may clear
-PF_NOFREEZE for itself by calling set_freezable() (the resetting of PF_NOFREEZE
-directly is not allowed).  From this point it is regarded as freezable
-and must call try_to_freeze() in a suitable place.
-
-IV. Why do we do that?
-
-Generally speaking, there is a couple of reasons to use the freezing of tasks:
-
-1. The principal reason is to prevent filesystems from being damaged after
-hibernation.  At the moment we have no simple means of checkpointing
-filesystems, so if there are any modifications made to filesystem data and/or
-metadata on disks, we cannot bring them back to the state from before the
-modifications.  At the same time each hibernation image contains some
-filesystem-related information that must be consistent with the state of the
-on-disk data and metadata after the system memory state has been restored from
-the image (otherwise the filesystems will be damaged in a nasty way, usually
-making them almost impossible to repair).  We therefore freeze tasks that might
-cause the on-disk filesystems' data and metadata to be modified after the
-hibernation image has been created and before the system is finally powered off.
-The majority of these are user space processes, but if any of the kernel threads
-may cause something like this to happen, they have to be freezable.
-
-2. Next, to create the hibernation image we need to free a sufficient amount of
-memory (approximately 50% of available RAM) and we need to do that before
-devices are deactivated, because we generally need them for swapping out.  Then,
-after the memory for the image has been freed, we don't want tasks to allocate
-additional memory and we prevent them from doing that by freezing them earlier.
-[Of course, this also means that device drivers should not allocate substantial
-amounts of memory from their .suspend() callbacks before hibernation, but this
-is a separate issue.]
-
-3. The third reason is to prevent user space processes and some kernel threads
-from interfering with the suspending and resuming of devices.  A user space
-process running on a second CPU while we are suspending devices may, for
-example, be troublesome and without the freezing of tasks we would need some
-safeguards against race conditions that might occur in such a case.
-
-Although Linus Torvalds doesn't like the freezing of tasks, he said this in one
-of the discussions on LKML (http://lkml.org/lkml/2007/4/27/608):
-
-"RJW:> Why we freeze tasks at all or why we freeze kernel threads?
-
-Linus: In many ways, 'at all'.
-
-I _do_ realize the IO request queue issues, and that we cannot actually do
-s2ram with some devices in the middle of a DMA.  So we want to be able to
-avoid *that*, there's no question about that.  And I suspect that stopping
-user threads and then waiting for a sync is practically one of the easier
-ways to do so.
-
-So in practice, the 'at all' may become a 'why freeze kernel threads?' and
-freezing user threads I don't find really objectionable."
-
-Still, there are kernel threads that may want to be freezable.  For example, if
-a kernel thread that belongs to a device driver accesses the device directly, it
-in principle needs to know when the device is suspended, so that it doesn't try
-to access it at that time.  However, if the kernel thread is freezable, it will
-be frozen before the driver's .suspend() callback is executed and it will be
-thawed after the driver's .resume() callback has run, so it won't be accessing
-the device while it's suspended.
-
-4. Another reason for freezing tasks is to prevent user space processes from
-realizing that hibernation (or suspend) operation takes place.  Ideally, user
-space processes should not notice that such a system-wide operation has occurred
-and should continue running without any problems after the restore (or resume
-from suspend).  Unfortunately, in the most general case this is quite difficult
-to achieve without the freezing of tasks.  Consider, for example, a process
-that depends on all CPUs being online while it's running.  Since we need to
-disable nonboot CPUs during the hibernation, if this process is not frozen, it
-may notice that the number of CPUs has changed and may start to work incorrectly
-because of that.
-
-V. Are there any problems related to the freezing of tasks?
-
-Yes, there are.
-
-First of all, the freezing of kernel threads may be tricky if they depend one
-on another.  For example, if kernel thread A waits for a completion (in the
-TASK_UNINTERRUPTIBLE state) that needs to be done by freezable kernel thread B
-and B is frozen in the meantime, then A will be blocked until B is thawed, which
-may be undesirable.  That's why kernel threads are not freezable by default.
-
-Second, there are the following two problems related to the freezing of user
-space processes:
-1. Putting processes into an uninterruptible sleep distorts the load average.
-2. Now that we have FUSE, plus the framework for doing device drivers in
-userspace, it gets even more complicated because some userspace processes are
-now doing the sorts of things that kernel threads do
-(https://lists.linux-foundation.org/pipermail/linux-pm/2007-May/012309.html).
-
-The problem 1. seems to be fixable, although it hasn't been fixed so far.  The
-other one is more serious, but it seems that we can work around it by using
-hibernation (and suspend) notifiers (in that case, though, we won't be able to
-avoid the realization by the user space processes that the hibernation is taking
-place).
-
-There are also problems that the freezing of tasks tends to expose, although
-they are not directly related to it.  For example, if request_firmware() is
-called from a device driver's .resume() routine, it will timeout and eventually
-fail, because the user land process that should respond to the request is frozen
-at this point.  So, seemingly, the failure is due to the freezing of tasks.
-Suppose, however, that the firmware file is located on a filesystem accessible
-only through another device that hasn't been resumed yet.  In that case,
-request_firmware() will fail regardless of whether or not the freezing of tasks
-is used.  Consequently, the problem is not really related to the freezing of
-tasks, since it generally exists anyway.
-
-A driver must have all firmwares it may need in RAM before suspend() is called.
-If keeping them is not practical, for example due to their size, they must be
-requested early enough using the suspend notifier API described in
-Documentation/driver-api/pm/notifiers.rst.
-
-VI. Are there any precautions to be taken to prevent freezing failures?
-
-Yes, there are.
-
-First of all, grabbing the 'system_transition_mutex' lock to mutually exclude a piece of code
-from system-wide sleep such as suspend/hibernation is not encouraged.
-If possible, that piece of code must instead hook onto the suspend/hibernation
-notifiers to achieve mutual exclusion. Look at the CPU-Hotplug code
-(kernel/cpu.c) for an example.
-
-However, if that is not feasible, and grabbing 'system_transition_mutex' is deemed necessary,
-it is strongly discouraged to directly call mutex_[un]lock(&system_transition_mutex) since
-that could lead to freezing failures, because if the suspend/hibernate code
-successfully acquired the 'system_transition_mutex' lock, and hence that other entity failed
-to acquire the lock, then that task would get blocked in TASK_UNINTERRUPTIBLE
-state. As a consequence, the freezer would not be able to freeze that task,
-leading to freezing failure.
-
-However, the [un]lock_system_sleep() APIs are safe to use in this scenario,
-since they ask the freezer to skip freezing this task, since it is anyway
-"frozen enough" as it is blocked on 'system_transition_mutex', which will be released
-only after the entire suspend/hibernation sequence is complete.
-So, to summarize, use [un]lock_system_sleep() instead of directly using
-mutex_[un]lock(&system_transition_mutex). That would prevent freezing failures.
-
-V. Miscellaneous
-/sys/power/pm_freeze_timeout controls how long it will cost at most to freeze
-all user space processes or all freezable kernel threads, in unit of millisecond.
-The default value is 20000, with range of unsigned integer.
diff --git a/Documentation/power/index.rst b/Documentation/power/index.rst
new file mode 100644
index 000000000000..20415f21e48a
--- /dev/null
+++ b/Documentation/power/index.rst
@@ -0,0 +1,46 @@
+:orphan:
+
+================
+Power Management
+================
+
+.. toctree::
+    :maxdepth: 1
+
+    apm-acpi
+    basic-pm-debugging
+    charger-manager
+    drivers-testing
+    energy-model
+    freezing-of-tasks
+    interface
+    opp
+    pci
+    pm_qos_interface
+    power_supply_class
+    runtime_pm
+    s2ram
+    suspend-and-cpuhotplug
+    suspend-and-interrupts
+    swsusp-and-swap-files
+    swsusp-dmcrypt
+    swsusp
+    video
+    tricks
+
+    userland-swsusp
+
+    powercap/powercap
+
+    regulator/consumer
+    regulator/design
+    regulator/machine
+    regulator/overview
+    regulator/regulator
+
+.. only::  subproject and html
+
+   Indices
+   =======
+
+   * :ref:`genindex`
diff --git a/Documentation/power/interface.rst b/Documentation/power/interface.rst
new file mode 100644
index 000000000000..8d270ed27228
--- /dev/null
+++ b/Documentation/power/interface.rst
@@ -0,0 +1,79 @@
+===========================================
+Power Management Interface for System Sleep
+===========================================
+
+Copyright (c) 2016 Intel Corp., Rafael J. Wysocki <rafael.j.wysocki@intel.com>
+
+The power management subsystem provides userspace with a unified sysfs interface
+for system sleep regardless of the underlying system architecture or platform.
+The interface is located in the /sys/power/ directory (assuming that sysfs is
+mounted at /sys).
+
+/sys/power/state is the system sleep state control file.
+
+Reading from it returns a list of supported sleep states, encoded as:
+
+- 'freeze' (Suspend-to-Idle)
+- 'standby' (Power-On Suspend)
+- 'mem' (Suspend-to-RAM)
+- 'disk' (Suspend-to-Disk)
+
+Suspend-to-Idle is always supported.  Suspend-to-Disk is always supported
+too as long as the kernel has been configured to support hibernation at all
+(i.e. CONFIG_HIBERNATION is set in the kernel configuration file).  Support
+for Suspend-to-RAM and Power-On Suspend depends on the capabilities of the
+platform.
+
+If one of the strings listed in /sys/power/state is written to it, the system
+will attempt to transition into the corresponding sleep state.  Refer to
+Documentation/admin-guide/pm/sleep-states.rst for a description of each of
+those states.
+
+/sys/power/disk controls the operating mode of hibernation (Suspend-to-Disk).
+Specifically, it tells the kernel what to do after creating a hibernation image.
+
+Reading from it returns a list of supported options encoded as:
+
+- 'platform' (put the system into sleep using a platform-provided method)
+- 'shutdown' (shut the system down)
+- 'reboot' (reboot the system)
+- 'suspend' (trigger a Suspend-to-RAM transition)
+- 'test_resume' (resume-after-hibernation test mode)
+
+The currently selected option is printed in square brackets.
+
+The 'platform' option is only available if the platform provides a special
+mechanism to put the system to sleep after creating a hibernation image (ACPI
+does that, for example).  The 'suspend' option is available if Suspend-to-RAM
+is supported.  Refer to Documentation/power/basic-pm-debugging.rst for the
+description of the 'test_resume' option.
+
+To select an option, write the string representing it to /sys/power/disk.
+
+/sys/power/image_size controls the size of hibernation images.
+
+A string representing a non-negative integer can be written to it and will be
+used as a best-effort upper limit of the image size, in bytes.  The hibernation
+core will do its best to ensure that the image size will not exceed that number.
+However, if that turns out to be impossible to achieve, a hibernation image will
+still be created and its size will be as small as possible.  In particular,
+writing '0' to this file will enforce hibernation images to be as small as
+possible.
+
+Reading from this file returns the current image size limit, which is set to
+around 2/5 of available RAM by default.
+
+/sys/power/pm_trace controls the PM trace mechanism saving the last suspend
+or resume event point in the RTC across reboots.
+
+It helps to debug hard lockups or reboots due to device driver failures that
+occur during system suspend or resume (which is more common) more effectively.
+
+If /sys/power/pm_trace contains '1', the fingerprint of each suspend/resume
+event point in turn will be stored in the RTC memory (overwriting the actual
+RTC information), so it will survive a system crash if one occurs right after
+storing it and it can be used later to identify the driver that caused the crash
+to happen (see Documentation/power/s2ram.rst for more information).
+
+Initially it contains '0' which may be changed to '1' by writing a string
+representing a nonzero integer into it.
diff --git a/Documentation/power/interface.txt b/Documentation/power/interface.txt
deleted file mode 100644
index 27df7f98668a..000000000000
--- a/Documentation/power/interface.txt
+++ /dev/null
@@ -1,77 +0,0 @@
-Power Management Interface for System Sleep
-
-Copyright (c) 2016 Intel Corp., Rafael J. Wysocki <rafael.j.wysocki@intel.com>
-
-The power management subsystem provides userspace with a unified sysfs interface
-for system sleep regardless of the underlying system architecture or platform.
-The interface is located in the /sys/power/ directory (assuming that sysfs is
-mounted at /sys).
-
-/sys/power/state is the system sleep state control file.
-
-Reading from it returns a list of supported sleep states, encoded as:
-
-'freeze' (Suspend-to-Idle)
-'standby' (Power-On Suspend)
-'mem' (Suspend-to-RAM)
-'disk' (Suspend-to-Disk)
-
-Suspend-to-Idle is always supported.  Suspend-to-Disk is always supported
-too as long the kernel has been configured to support hibernation at all
-(ie. CONFIG_HIBERNATION is set in the kernel configuration file).  Support
-for Suspend-to-RAM and Power-On Suspend depends on the capabilities of the
-platform.
-
-If one of the strings listed in /sys/power/state is written to it, the system
-will attempt to transition into the corresponding sleep state.  Refer to
-Documentation/admin-guide/pm/sleep-states.rst for a description of each of
-those states.
-
-/sys/power/disk controls the operating mode of hibernation (Suspend-to-Disk).
-Specifically, it tells the kernel what to do after creating a hibernation image.
-
-Reading from it returns a list of supported options encoded as:
-
-'platform' (put the system into sleep using a platform-provided method)
-'shutdown' (shut the system down)
-'reboot' (reboot the system)
-'suspend' (trigger a Suspend-to-RAM transition)
-'test_resume' (resume-after-hibernation test mode)
-
-The currently selected option is printed in square brackets.
-
-The 'platform' option is only available if the platform provides a special
-mechanism to put the system to sleep after creating a hibernation image (ACPI
-does that, for example).  The 'suspend' option is available if Suspend-to-RAM
-is supported.  Refer to Documentation/power/basic-pm-debugging.txt for the
-description of the 'test_resume' option.
-
-To select an option, write the string representing it to /sys/power/disk.
-
-/sys/power/image_size controls the size of hibernation images.
-
-It can be written a string representing a non-negative integer that will be
-used as a best-effort upper limit of the image size, in bytes.  The hibernation
-core will do its best to ensure that the image size will not exceed that number.
-However, if that turns out to be impossible to achieve, a hibernation image will
-still be created and its size will be as small as possible.  In particular,
-writing '0' to this file will enforce hibernation images to be as small as
-possible.
-
-Reading from this file returns the current image size limit, which is set to
-around 2/5 of available RAM by default.
-
-/sys/power/pm_trace controls the PM trace mechanism saving the last suspend
-or resume event point in the RTC across reboots.
-
-It helps to debug hard lockups or reboots due to device driver failures that
-occur during system suspend or resume (which is more common) more effectively.
-
-If /sys/power/pm_trace contains '1', the fingerprint of each suspend/resume
-event point in turn will be stored in the RTC memory (overwriting the actual
-RTC information), so it will survive a system crash if one occurs right after
-storing it and it can be used later to identify the driver that caused the crash
-to happen (see Documentation/power/s2ram.txt for more information).
-
-Initially it contains '0' which may be changed to '1' by writing a string
-representing a nonzero integer into it.
diff --git a/Documentation/power/opp.rst b/Documentation/power/opp.rst
new file mode 100644
index 000000000000..b3cf1def9dee
--- /dev/null
+++ b/Documentation/power/opp.rst
@@ -0,0 +1,379 @@
+==========================================
+Operating Performance Points (OPP) Library
+==========================================
+
+(C) 2009-2010 Nishanth Menon <nm@ti.com>, Texas Instruments Incorporated
+
+.. Contents
+
+  1. Introduction
+  2. Initial OPP List Registration
+  3. OPP Search Functions
+  4. OPP Availability Control Functions
+  5. OPP Data Retrieval Functions
+  6. Data Structures
+
+1. Introduction
+===============
+
+1.1 What is an Operating Performance Point (OPP)?
+-------------------------------------------------
+
+Complex SoCs of today consist of multiple sub-modules working in conjunction.
+In an operational system executing varied use cases, not all modules in the SoC
+need to function at their highest performing frequency all the time. To
+facilitate this, sub-modules in a SoC are grouped into domains, allowing some
+domains to run at lower voltage and frequency while other domains run at
+voltage/frequency pairs that are higher.
+
+The set of discrete tuples consisting of frequency and voltage pairs that
+the device will support per domain are called Operating Performance Points or
+OPPs.
+
+As an example:
+
+Let us consider an MPU device which supports the following:
+{300MHz at minimum voltage of 1V}, {800MHz at minimum voltage of 1.2V},
+{1GHz at minimum voltage of 1.3V}
+
+We can represent these as three OPPs as the following {Hz, uV} tuples:
+
+- {300000000, 1000000}
+- {800000000, 1200000}
+- {1000000000, 1300000}
+
+1.2 Operating Performance Points Library
+----------------------------------------
+
+OPP library provides a set of helper functions to organize and query the OPP
+information. The library is located in drivers/base/power/opp.c and the header
+is located in include/linux/pm_opp.h. OPP library can be enabled by enabling
+CONFIG_PM_OPP from power management menuconfig menu. OPP library depends on
+CONFIG_PM as certain SoC frameworks, such as Texas Instruments' OMAP framework,
+allow optionally booting at a certain OPP without needing cpufreq.
+
+Typical usage of the OPP library is as follows::
+
+ (users)	-> registers a set of default OPPs		-> (library)
+ SoC framework	-> modifies on required cases certain OPPs	-> OPP layer
+		-> queries to search/retrieve information	->
+
+OPP layer expects each domain to be represented by a unique device pointer. SoC
+framework registers a set of initial OPPs per device with the OPP layer. This
+list is expected to be an optimally small number typically around 5 per device.
+This initial list contains a set of OPPs that the framework expects to be safely
+enabled by default in the system.
+
+Note on OPP Availability
+^^^^^^^^^^^^^^^^^^^^^^^^
+
+As the system proceeds to operate, SoC framework may choose to make certain
+OPPs available or not available on each device based on various external
+factors. Example usage: Thermal management or other exceptional situations where
+SoC framework might choose to disable a higher frequency OPP to safely continue
+operations until that OPP could be re-enabled if possible.
+
+OPP library facilitates this concept in its implementation. The following
+operational functions operate only on available opps:
+dev_pm_opp_find_freq_{ceil, floor}, dev_pm_opp_get_voltage, dev_pm_opp_get_freq, dev_pm_opp_get_opp_count
+
+dev_pm_opp_find_freq_exact is meant to be used to find the opp pointer which can then
+be used for dev_pm_opp_enable/disable functions to make an opp available as required.
+
+WARNING: Users of OPP library should refresh their availability count using
+dev_pm_opp_get_opp_count if dev_pm_opp_enable/disable functions are invoked for
+a device. The exact mechanism to trigger these or the notification mechanism to
+other dependent subsystems such as cpufreq is left to the discretion of the SoC
+specific framework which uses the OPP library. Similar care needs to be taken
+to refresh the cpufreq table in cases of these operations.
+
+2. Initial OPP List Registration
+================================
+The SoC implementation calls dev_pm_opp_add function iteratively to add OPPs per
+device. It is expected that the SoC framework will register the OPP entries
+optimally- typical numbers range to be less than 5. The list generated by
+registering the OPPs is maintained by OPP library throughout the device
+operation. The SoC framework can subsequently control the availability of the
+OPPs dynamically using the dev_pm_opp_enable / disable functions.
+
+dev_pm_opp_add
+	Add a new OPP for a specific domain represented by the device pointer.
+	The OPP is defined using the frequency and voltage. Once added, the OPP
+	is assumed to be available and control of its availability can be done
+	with the dev_pm_opp_enable/disable functions. OPP library internally stores
+	and manages this information in the opp struct. This function may be
+	used by SoC framework to define an optimal list as per the demands of
+	SoC usage environment.
+
+	WARNING:
+		Do not use this function in interrupt context.
+
+	Example::
+
+	 soc_pm_init()
+	 {
+		/* Do things */
+		r = dev_pm_opp_add(mpu_dev, 1000000, 900000);
+		if (!r) {
+			pr_err("%s: unable to register mpu opp(%d)\n", r);
+			goto no_cpufreq;
+		}
+		/* Do cpufreq things */
+	 no_cpufreq:
+		/* Do remaining things */
+	 }
+
+3. OPP Search Functions
+=======================
+High level framework such as cpufreq operates on frequencies. To map the
+frequency back to the corresponding OPP, OPP library provides handy functions
+to search the OPP list that OPP library internally manages. These search
+functions return the matching pointer representing the opp if a match is
+found, else returns error. These errors are expected to be handled by standard
+error checks such as IS_ERR() and appropriate actions taken by the caller.
+
+Callers of these functions shall call dev_pm_opp_put() after they have used the
+OPP. Otherwise the memory for the OPP will never get freed and result in
+memleak.
+
+dev_pm_opp_find_freq_exact
+	Search for an OPP based on an *exact* frequency and
+	availability. This function is especially useful to enable an OPP which
+	is not available by default.
+	Example: In a case when SoC framework detects a situation where a
+	higher frequency could be made available, it can use this function to
+	find the OPP prior to calling dev_pm_opp_enable to actually make
+	it available::
+
+	 opp = dev_pm_opp_find_freq_exact(dev, 1000000000, false);
+	 dev_pm_opp_put(opp);
+	 /* dont operate on the pointer.. just do a sanity check.. */
+	 if (IS_ERR(opp)) {
+		pr_err("frequency not disabled!\n");
+		/* trigger appropriate actions.. */
+	 } else {
+		dev_pm_opp_enable(dev,1000000000);
+	 }
+
+	NOTE:
+	  This is the only search function that operates on OPPs which are
+	  not available.
+
+dev_pm_opp_find_freq_floor
+	Search for an available OPP which is *at most* the
+	provided frequency. This function is useful while searching for a lesser
+	match OR operating on OPP information in the order of decreasing
+	frequency.
+	Example: To find the highest opp for a device::
+
+	 freq = ULONG_MAX;
+	 opp = dev_pm_opp_find_freq_floor(dev, &freq);
+	 dev_pm_opp_put(opp);
+
+dev_pm_opp_find_freq_ceil
+	Search for an available OPP which is *at least* the
+	provided frequency. This function is useful while searching for a
+	higher match OR operating on OPP information in the order of increasing
+	frequency.
+	Example 1: To find the lowest opp for a device::
+
+	 freq = 0;
+	 opp = dev_pm_opp_find_freq_ceil(dev, &freq);
+	 dev_pm_opp_put(opp);
+
+	Example 2: A simplified implementation of a SoC cpufreq_driver->target::
+
+	 soc_cpufreq_target(..)
+	 {
+		/* Do stuff like policy checks etc. */
+		/* Find the best frequency match for the req */
+		opp = dev_pm_opp_find_freq_ceil(dev, &freq);
+		dev_pm_opp_put(opp);
+		if (!IS_ERR(opp))
+			soc_switch_to_freq_voltage(freq);
+		else
+			/* do something when we can't satisfy the req */
+		/* do other stuff */
+	 }
+
+4. OPP Availability Control Functions
+=====================================
+A default OPP list registered with the OPP library may not cater to all possible
+situations. The OPP library provides a set of functions to modify the
+availability of an OPP within the OPP list. This allows SoC frameworks to have
+fine grained dynamic control of which sets of OPPs are operationally available.
+These functions are intended to *temporarily* remove an OPP in conditions such
+as thermal considerations (e.g. don't use OPPx until the temperature drops).
+
+WARNING:
+	Do not use these functions in interrupt context.
+
+dev_pm_opp_enable
+	Make an OPP available for operation.
+	Example: Let's say that 1GHz OPP is to be made available only if the
+	SoC temperature is lower than a certain threshold. The SoC framework
+	implementation might choose to do something as follows::
+
+	 if (cur_temp < temp_low_thresh) {
+		/* Enable 1GHz if it was disabled */
+		opp = dev_pm_opp_find_freq_exact(dev, 1000000000, false);
+		dev_pm_opp_put(opp);
+		/* just error check */
+		if (!IS_ERR(opp))
+			ret = dev_pm_opp_enable(dev, 1000000000);
+		else
+			goto try_something_else;
+	 }
+
+dev_pm_opp_disable
+	Make an OPP unavailable for operation.
+	Example: Let's say that 1GHz OPP is to be disabled if the temperature
+	exceeds a threshold value. The SoC framework implementation might
+	choose to do something as follows::
+
+	 if (cur_temp > temp_high_thresh) {
+		/* Disable 1GHz if it was enabled */
+		opp = dev_pm_opp_find_freq_exact(dev, 1000000000, true);
+		dev_pm_opp_put(opp);
+		/* just error check */
+		if (!IS_ERR(opp))
+			ret = dev_pm_opp_disable(dev, 1000000000);
+		else
+			goto try_something_else;
+	 }
+
+5. OPP Data Retrieval Functions
+===============================
+Since OPP library abstracts away the OPP information, a set of functions to pull
+information from the OPP structure is necessary. Once an OPP pointer is
+retrieved using the search functions, the following functions can be used by SoC
+framework to retrieve the information represented inside the OPP layer.
+
+dev_pm_opp_get_voltage
+	Retrieve the voltage represented by the opp pointer.
+	Example: At a cpufreq transition to a different frequency, SoC
+	framework requires to set the voltage represented by the OPP using
+	the regulator framework to the Power Management chip providing the
+	voltage::
+
+	 soc_switch_to_freq_voltage(freq)
+	 {
+		/* do things */
+		opp = dev_pm_opp_find_freq_ceil(dev, &freq);
+		v = dev_pm_opp_get_voltage(opp);
+		dev_pm_opp_put(opp);
+		if (v)
+			regulator_set_voltage(.., v);
+		/* do other things */
+	 }
+
+dev_pm_opp_get_freq
+	Retrieve the freq represented by the opp pointer.
+	Example: Let's say the SoC framework uses a couple of helper functions;
+	we could pass opp pointers instead of passing additional parameters to
+	handle quite a bit of data::
+
+	 soc_cpufreq_target(..)
+	 {
+		/* do things.. */
+		 max_freq = ULONG_MAX;
+		 max_opp = dev_pm_opp_find_freq_floor(dev,&max_freq);
+		 requested_opp = dev_pm_opp_find_freq_ceil(dev,&freq);
+		 if (!IS_ERR(max_opp) && !IS_ERR(requested_opp))
+			r = soc_test_validity(max_opp, requested_opp);
+		 dev_pm_opp_put(max_opp);
+		 dev_pm_opp_put(requested_opp);
+		/* do other things */
+	 }
+	 soc_test_validity(..)
+	 {
+		 if(dev_pm_opp_get_voltage(max_opp) < dev_pm_opp_get_voltage(requested_opp))
+			 return -EINVAL;
+		 if(dev_pm_opp_get_freq(max_opp) < dev_pm_opp_get_freq(requested_opp))
+			 return -EINVAL;
+		/* do things.. */
+	 }
+
+dev_pm_opp_get_opp_count
+	Retrieve the number of available opps for a device
+	Example: Let's say a co-processor in the SoC needs to know the available
+	frequencies in a table; the main processor can notify it as follows::
+
+	 soc_notify_coproc_available_frequencies()
+	 {
+		/* Do things */
+		num_available = dev_pm_opp_get_opp_count(dev);
+		speeds = kzalloc(sizeof(u32) * num_available, GFP_KERNEL);
+		/* populate the table in increasing order */
+		freq = 0;
+		while (!IS_ERR(opp = dev_pm_opp_find_freq_ceil(dev, &freq))) {
+			speeds[i] = freq;
+			freq++;
+			i++;
+			dev_pm_opp_put(opp);
+		}
+
+		soc_notify_coproc(AVAILABLE_FREQs, speeds, num_available);
+		/* Do other things */
+	 }
+
+6. Data Structures
+==================
+Typically an SoC contains multiple voltage domains which are variable. Each
+domain is represented by a device pointer. The relationship to OPP can be
+represented as follows::
+
+  SoC
+   |- device 1
+   |	|- opp 1 (availability, freq, voltage)
+   |	|- opp 2 ..
+   ...	...
+   |	`- opp n ..
+   |- device 2
+   ...
+   `- device m
+
+OPP library maintains an internal list that the SoC framework populates and
+that is accessed by various functions as described above. However, the
+structures representing the actual OPPs and domains are internal to the OPP
+library itself to allow for suitable abstraction reusable across systems.
+
+struct dev_pm_opp
+	The internal data structure of OPP library which is used to
+	represent an OPP. In addition to the freq, voltage, availability
+	information, it also contains internal bookkeeping information required
+	for the OPP library to operate on.  Pointer to this structure is
+	provided back to the users such as SoC framework to be used as an
+	identifier for OPP in the interactions with OPP layer.
+
+	WARNING:
+	  The struct dev_pm_opp pointer should not be parsed or modified by the
+	  users. The defaults for an instance are populated by
+	  dev_pm_opp_add, but the availability of the OPP can be modified
+	  by dev_pm_opp_enable/disable functions.
+
+struct device
+	This is used to identify a domain to the OPP layer. The
+	nature of the device and its implementation is left to the user of
+	OPP library such as the SoC framework.
+
+Overall, in a simplistic view, the data structure operations are represented as
+follows::
+
+  Initialization / modification:
+              +-----+        /- dev_pm_opp_enable
+  dev_pm_opp_add --> | opp | <-------
+    |         +-----+        \- dev_pm_opp_disable
+    \-------> domain_info(device)
+
+  Search functions:
+               /-- dev_pm_opp_find_freq_ceil  ---\   +-----+
+  domain_info<---- dev_pm_opp_find_freq_exact -----> | opp |
+               \-- dev_pm_opp_find_freq_floor ---/   +-----+
+
+  Retrieval functions:
+  +-----+     /- dev_pm_opp_get_voltage
+  | opp | <---
+  +-----+     \- dev_pm_opp_get_freq
+
+  domain_info <- dev_pm_opp_get_opp_count
diff --git a/Documentation/power/opp.txt b/Documentation/power/opp.txt
deleted file mode 100644
index 0c007e250cd1..000000000000
--- a/Documentation/power/opp.txt
+++ /dev/null
@@ -1,342 +0,0 @@
-Operating Performance Points (OPP) Library
-==========================================
-
-(C) 2009-2010 Nishanth Menon <nm@ti.com>, Texas Instruments Incorporated
-
-Contents
---------
-1. Introduction
-2. Initial OPP List Registration
-3. OPP Search Functions
-4. OPP Availability Control Functions
-5. OPP Data Retrieval Functions
-6. Data Structures
-
-1. Introduction
-===============
-1.1 What is an Operating Performance Point (OPP)?
-
-Complex SoCs of today consists of a multiple sub-modules working in conjunction.
-In an operational system executing varied use cases, not all modules in the SoC
-need to function at their highest performing frequency all the time. To
-facilitate this, sub-modules in a SoC are grouped into domains, allowing some
-domains to run at lower voltage and frequency while other domains run at
-voltage/frequency pairs that are higher.
-
-The set of discrete tuples consisting of frequency and voltage pairs that
-the device will support per domain are called Operating Performance Points or
-OPPs.
-
-As an example:
-Let us consider an MPU device which supports the following:
-{300MHz at minimum voltage of 1V}, {800MHz at minimum voltage of 1.2V},
-{1GHz at minimum voltage of 1.3V}
-
-We can represent these as three OPPs as the following {Hz, uV} tuples:
-{300000000, 1000000}
-{800000000, 1200000}
-{1000000000, 1300000}
-
-1.2 Operating Performance Points Library
-
-OPP library provides a set of helper functions to organize and query the OPP
-information. The library is located in drivers/base/power/opp.c and the header
-is located in include/linux/pm_opp.h. OPP library can be enabled by enabling
-CONFIG_PM_OPP from power management menuconfig menu. OPP library depends on
-CONFIG_PM as certain SoCs such as Texas Instrument's OMAP framework allows to
-optionally boot at a certain OPP without needing cpufreq.
-
-Typical usage of the OPP library is as follows:
-(users)		-> registers a set of default OPPs		-> (library)
-SoC framework	-> modifies on required cases certain OPPs	-> OPP layer
-		-> queries to search/retrieve information	->
-
-OPP layer expects each domain to be represented by a unique device pointer. SoC
-framework registers a set of initial OPPs per device with the OPP layer. This
-list is expected to be an optimally small number typically around 5 per device.
-This initial list contains a set of OPPs that the framework expects to be safely
-enabled by default in the system.
-
-Note on OPP Availability:
-------------------------
-As the system proceeds to operate, SoC framework may choose to make certain
-OPPs available or not available on each device based on various external
-factors. Example usage: Thermal management or other exceptional situations where
-SoC framework might choose to disable a higher frequency OPP to safely continue
-operations until that OPP could be re-enabled if possible.
-
-OPP library facilitates this concept in it's implementation. The following
-operational functions operate only on available opps:
-opp_find_freq_{ceil, floor}, dev_pm_opp_get_voltage, dev_pm_opp_get_freq, dev_pm_opp_get_opp_count
-
-dev_pm_opp_find_freq_exact is meant to be used to find the opp pointer which can then
-be used for dev_pm_opp_enable/disable functions to make an opp available as required.
-
-WARNING: Users of OPP library should refresh their availability count using
-get_opp_count if dev_pm_opp_enable/disable functions are invoked for a device, the
-exact mechanism to trigger these or the notification mechanism to other
-dependent subsystems such as cpufreq are left to the discretion of the SoC
-specific framework which uses the OPP library. Similar care needs to be taken
-care to refresh the cpufreq table in cases of these operations.
-
-2. Initial OPP List Registration
-================================
-The SoC implementation calls dev_pm_opp_add function iteratively to add OPPs per
-device. It is expected that the SoC framework will register the OPP entries
-optimally- typical numbers range to be less than 5. The list generated by
-registering the OPPs is maintained by OPP library throughout the device
-operation. The SoC framework can subsequently control the availability of the
-OPPs dynamically using the dev_pm_opp_enable / disable functions.
-
-dev_pm_opp_add - Add a new OPP for a specific domain represented by the device pointer.
-	The OPP is defined using the frequency and voltage. Once added, the OPP
-	is assumed to be available and control of it's availability can be done
-	with the dev_pm_opp_enable/disable functions. OPP library internally stores
-	and manages this information in the opp struct. This function may be
-	used by SoC framework to define a optimal list as per the demands of
-	SoC usage environment.
-
-	WARNING: Do not use this function in interrupt context.
-
-	Example:
-	 soc_pm_init()
-	 {
-		/* Do things */
-		r = dev_pm_opp_add(mpu_dev, 1000000, 900000);
-		if (!r) {
-			pr_err("%s: unable to register mpu opp(%d)\n", r);
-			goto no_cpufreq;
-		}
-		/* Do cpufreq things */
-	 no_cpufreq:
-		/* Do remaining things */
-	 }
-
-3. OPP Search Functions
-=======================
-High level framework such as cpufreq operates on frequencies. To map the
-frequency back to the corresponding OPP, OPP library provides handy functions
-to search the OPP list that OPP library internally manages. These search
-functions return the matching pointer representing the opp if a match is
-found, else returns error. These errors are expected to be handled by standard
-error checks such as IS_ERR() and appropriate actions taken by the caller.
-
-Callers of these functions shall call dev_pm_opp_put() after they have used the
-OPP. Otherwise the memory for the OPP will never get freed and result in
-memleak.
-
-dev_pm_opp_find_freq_exact - Search for an OPP based on an *exact* frequency and
-	availability. This function is especially useful to enable an OPP which
-	is not available by default.
-	Example: In a case when SoC framework detects a situation where a
-	higher frequency could be made available, it can use this function to
-	find the OPP prior to call the dev_pm_opp_enable to actually make it available.
-	 opp = dev_pm_opp_find_freq_exact(dev, 1000000000, false);
-	 dev_pm_opp_put(opp);
-	 /* dont operate on the pointer.. just do a sanity check.. */
-	 if (IS_ERR(opp)) {
-		pr_err("frequency not disabled!\n");
-		/* trigger appropriate actions.. */
-	 } else {
-		dev_pm_opp_enable(dev,1000000000);
-	 }
-
-	NOTE: This is the only search function that operates on OPPs which are
-	not available.
-
-dev_pm_opp_find_freq_floor - Search for an available OPP which is *at most* the
-	provided frequency. This function is useful while searching for a lesser
-	match OR operating on OPP information in the order of decreasing
-	frequency.
-	Example: To find the highest opp for a device:
-	 freq = ULONG_MAX;
-	 opp = dev_pm_opp_find_freq_floor(dev, &freq);
-	 dev_pm_opp_put(opp);
-
-dev_pm_opp_find_freq_ceil - Search for an available OPP which is *at least* the
-	provided frequency. This function is useful while searching for a
-	higher match OR operating on OPP information in the order of increasing
-	frequency.
-	Example 1: To find the lowest opp for a device:
-	 freq = 0;
-	 opp = dev_pm_opp_find_freq_ceil(dev, &freq);
-	 dev_pm_opp_put(opp);
-	Example 2: A simplified implementation of a SoC cpufreq_driver->target:
-	 soc_cpufreq_target(..)
-	 {
-		/* Do stuff like policy checks etc. */
-		/* Find the best frequency match for the req */
-		opp = dev_pm_opp_find_freq_ceil(dev, &freq);
-		dev_pm_opp_put(opp);
-		if (!IS_ERR(opp))
-			soc_switch_to_freq_voltage(freq);
-		else
-			/* do something when we can't satisfy the req */
-		/* do other stuff */
-	 }
-
-4. OPP Availability Control Functions
-=====================================
-A default OPP list registered with the OPP library may not cater to all possible
-situation. The OPP library provides a set of functions to modify the
-availability of a OPP within the OPP list. This allows SoC frameworks to have
-fine grained dynamic control of which sets of OPPs are operationally available.
-These functions are intended to *temporarily* remove an OPP in conditions such
-as thermal considerations (e.g. don't use OPPx until the temperature drops).
-
-WARNING: Do not use these functions in interrupt context.
-
-dev_pm_opp_enable - Make a OPP available for operation.
-	Example: Lets say that 1GHz OPP is to be made available only if the
-	SoC temperature is lower than a certain threshold. The SoC framework
-	implementation might choose to do something as follows:
-	 if (cur_temp < temp_low_thresh) {
-		/* Enable 1GHz if it was disabled */
-		opp = dev_pm_opp_find_freq_exact(dev, 1000000000, false);
-		dev_pm_opp_put(opp);
-		/* just error check */
-		if (!IS_ERR(opp))
-			ret = dev_pm_opp_enable(dev, 1000000000);
-		else
-			goto try_something_else;
-	 }
-
-dev_pm_opp_disable - Make an OPP to be not available for operation
-	Example: Lets say that 1GHz OPP is to be disabled if the temperature
-	exceeds a threshold value. The SoC framework implementation might
-	choose to do something as follows:
-	 if (cur_temp > temp_high_thresh) {
-		/* Disable 1GHz if it was enabled */
-		opp = dev_pm_opp_find_freq_exact(dev, 1000000000, true);
-		dev_pm_opp_put(opp);
-		/* just error check */
-		if (!IS_ERR(opp))
-			ret = dev_pm_opp_disable(dev, 1000000000);
-		else
-			goto try_something_else;
-	 }
-
-5. OPP Data Retrieval Functions
-===============================
-Since OPP library abstracts away the OPP information, a set of functions to pull
-information from the OPP structure is necessary. Once an OPP pointer is
-retrieved using the search functions, the following functions can be used by SoC
-framework to retrieve the information represented inside the OPP layer.
-
-dev_pm_opp_get_voltage - Retrieve the voltage represented by the opp pointer.
-	Example: At a cpufreq transition to a different frequency, SoC
-	framework requires to set the voltage represented by the OPP using
-	the regulator framework to the Power Management chip providing the
-	voltage.
-	 soc_switch_to_freq_voltage(freq)
-	 {
-		/* do things */
-		opp = dev_pm_opp_find_freq_ceil(dev, &freq);
-		v = dev_pm_opp_get_voltage(opp);
-		dev_pm_opp_put(opp);
-		if (v)
-			regulator_set_voltage(.., v);
-		/* do other things */
-	 }
-
-dev_pm_opp_get_freq - Retrieve the freq represented by the opp pointer.
-	Example: Lets say the SoC framework uses a couple of helper functions
-	we could pass opp pointers instead of doing additional parameters to
-	handle quiet a bit of data parameters.
-	 soc_cpufreq_target(..)
-	 {
-		/* do things.. */
-		 max_freq = ULONG_MAX;
-		 max_opp = dev_pm_opp_find_freq_floor(dev,&max_freq);
-		 requested_opp = dev_pm_opp_find_freq_ceil(dev,&freq);
-		 if (!IS_ERR(max_opp) && !IS_ERR(requested_opp))
-			r = soc_test_validity(max_opp, requested_opp);
-		 dev_pm_opp_put(max_opp);
-		 dev_pm_opp_put(requested_opp);
-		/* do other things */
-	 }
-	 soc_test_validity(..)
-	 {
-		 if(dev_pm_opp_get_voltage(max_opp) < dev_pm_opp_get_voltage(requested_opp))
-			 return -EINVAL;
-		 if(dev_pm_opp_get_freq(max_opp) < dev_pm_opp_get_freq(requested_opp))
-			 return -EINVAL;
-		/* do things.. */
-	 }
-
-dev_pm_opp_get_opp_count - Retrieve the number of available opps for a device
-	Example: Lets say a co-processor in the SoC needs to know the available
-	frequencies in a table, the main processor can notify as following:
-	 soc_notify_coproc_available_frequencies()
-	 {
-		/* Do things */
-		num_available = dev_pm_opp_get_opp_count(dev);
-		speeds = kzalloc(sizeof(u32) * num_available, GFP_KERNEL);
-		/* populate the table in increasing order */
-		freq = 0;
-		while (!IS_ERR(opp = dev_pm_opp_find_freq_ceil(dev, &freq))) {
-			speeds[i] = freq;
-			freq++;
-			i++;
-			dev_pm_opp_put(opp);
-		}
-
-		soc_notify_coproc(AVAILABLE_FREQs, speeds, num_available);
-		/* Do other things */
-	 }
-
-6. Data Structures
-==================
-Typically an SoC contains multiple voltage domains which are variable. Each
-domain is represented by a device pointer. The relationship to OPP can be
-represented as follows:
-SoC
- |- device 1
- |	|- opp 1 (availability, freq, voltage)
- |	|- opp 2 ..
- ...	...
- |	`- opp n ..
- |- device 2
- ...
- `- device m
-
-OPP library maintains a internal list that the SoC framework populates and
-accessed by various functions as described above. However, the structures
-representing the actual OPPs and domains are internal to the OPP library itself
-to allow for suitable abstraction reusable across systems.
-
-struct dev_pm_opp - The internal data structure of OPP library which is used to
-	represent an OPP. In addition to the freq, voltage, availability
-	information, it also contains internal book keeping information required
-	for the OPP library to operate on.  Pointer to this structure is
-	provided back to the users such as SoC framework to be used as a
-	identifier for OPP in the interactions with OPP layer.
-
-	WARNING: The struct dev_pm_opp pointer should not be parsed or modified by the
-	users. The defaults of for an instance is populated by dev_pm_opp_add, but the
-	availability of the OPP can be modified by dev_pm_opp_enable/disable functions.
-
-struct device - This is used to identify a domain to the OPP layer. The
-	nature of the device and it's implementation is left to the user of
-	OPP library such as the SoC framework.
-
-Overall, in a simplistic view, the data structure operations is represented as
-following:
-
-Initialization / modification:
-            +-----+        /- dev_pm_opp_enable
-dev_pm_opp_add --> | opp | <-------
-  |         +-----+        \- dev_pm_opp_disable
-  \-------> domain_info(device)
-
-Search functions:
-             /-- dev_pm_opp_find_freq_ceil  ---\   +-----+
-domain_info<---- dev_pm_opp_find_freq_exact -----> | opp |
-             \-- dev_pm_opp_find_freq_floor ---/   +-----+
-
-Retrieval functions:
-+-----+     /- dev_pm_opp_get_voltage
-| opp | <---
-+-----+     \- dev_pm_opp_get_freq
-
-domain_info <- dev_pm_opp_get_opp_count
diff --git a/Documentation/power/pci.rst b/Documentation/power/pci.rst
new file mode 100644
index 000000000000..0e2ef7429304
--- /dev/null
+++ b/Documentation/power/pci.rst
@@ -0,0 +1,1135 @@
+====================
+PCI Power Management
+====================
+
+Copyright (c) 2010 Rafael J. Wysocki <rjw@sisk.pl>, Novell Inc.
+
+An overview of concepts and the Linux kernel's interfaces related to PCI power
+management.  Based on previous work by Patrick Mochel <mochel@transmeta.com>
+(and others).
+
+This document only covers the aspects of power management specific to PCI
+devices.  For general description of the kernel's interfaces related to device
+power management refer to Documentation/driver-api/pm/devices.rst and
+Documentation/power/runtime_pm.rst.
+
+.. contents:
+
+   1. Hardware and Platform Support for PCI Power Management
+   2. PCI Subsystem and Device Power Management
+   3. PCI Device Drivers and Power Management
+   4. Resources
+
+
+1. Hardware and Platform Support for PCI Power Management
+=========================================================
+
+1.1. Native and Platform-Based Power Management
+-----------------------------------------------
+
+In general, power management is a feature allowing one to save energy by putting
+devices into states in which they draw less power (low-power states) at the
+price of reduced functionality or performance.
+
+Usually, a device is put into a low-power state when it is underutilized or
+completely inactive.  However, when it is necessary to use the device once
+again, it has to be put back into the "fully functional" state (full-power
+state).  This may happen when there are some data for the device to handle or
+as a result of an external event requiring the device to be active, which may
+be signaled by the device itself.
+
+PCI devices may be put into low-power states in two ways, by using the device
+capabilities introduced by the PCI Bus Power Management Interface Specification,
+or with the help of platform firmware, such as an ACPI BIOS.  In the first
+approach, that is referred to as the native PCI power management (native PCI PM)
+in what follows, the device power state is changed as a result of writing a
+specific value into one of its standard configuration registers.  The second
+approach requires the platform firmware to provide special methods that may be
+used by the kernel to change the device's power state.
+
+Devices supporting the native PCI PM usually can generate wakeup signals called
+Power Management Events (PMEs) to let the kernel know about external events
+requiring the device to be active.  After receiving a PME the kernel is supposed
+to put the device that sent it into the full-power state.  However, the PCI Bus
+Power Management Interface Specification doesn't define any standard method of
+delivering the PME from the device to the CPU and the operating system kernel.
+It is assumed that the platform firmware will perform this task and therefore,
+even though a PCI device is set up to generate PMEs, it also may be necessary to
+prepare the platform firmware for notifying the CPU of the PMEs coming from the
+device (e.g. by generating interrupts).
+
+In turn, if the methods provided by the platform firmware are used for changing
+the power state of a device, usually the platform also provides a method for
+preparing the device to generate wakeup signals.  In that case, however, it
+often also is necessary to prepare the device for generating PMEs using the
+native PCI PM mechanism, because the method provided by the platform depends on
+that.
+
+Thus in many situations both the native and the platform-based power management
+mechanisms have to be used simultaneously to obtain the desired result.
+
+1.2. Native PCI Power Management
+--------------------------------
+
+The PCI Bus Power Management Interface Specification (PCI PM Spec) was
+introduced between the PCI 2.1 and PCI 2.2 Specifications.  It defined a
+standard interface for performing various operations related to power
+management.
+
+The implementation of the PCI PM Spec is optional for conventional PCI devices,
+but it is mandatory for PCI Express devices.  If a device supports the PCI PM
+Spec, it has an 8 byte power management capability field in its PCI
+configuration space.  This field is used to describe and control the standard
+features related to the native PCI power management.
+
+The PCI PM Spec defines 4 operating states for devices (D0-D3) and for buses
+(B0-B3).  The higher the number, the less power is drawn by the device or bus
+in that state.  However, the higher the number, the longer the latency for
+the device or bus to return to the full-power state (D0 or B0, respectively).
+
+There are two variants of the D3 state defined by the specification.  The first
+one is D3hot, referred to as the software accessible D3, because devices can be
+programmed to go into it.  The second one, D3cold, is the state that PCI devices
+are in when the supply voltage (Vcc) is removed from them.  It is not possible
+to program a PCI device to go into D3cold, although there may be a programmable
+interface for putting the bus the device is on into a state in which Vcc is
+removed from all devices on the bus.
+
+PCI bus power management, however, is not supported by the Linux kernel at the
+time of this writing and therefore it is not covered by this document.
+
+Note that every PCI device can be in the full-power state (D0) or in D3cold,
+regardless of whether or not it implements the PCI PM Spec.  In addition to
+that, if the PCI PM Spec is implemented by the device, it must support D3hot
+as well as D0.  The support for the D1 and D2 power states is optional.
+
+PCI devices supporting the PCI PM Spec can be programmed to go to any of the
+supported low-power states (except for D3cold).  While in D1-D3hot the
+standard configuration registers of the device must be accessible to software
+(i.e. the device is required to respond to PCI configuration accesses), although
+its I/O and memory spaces are then disabled.  This allows the device to be
+programmatically put into D0.  Thus the kernel can switch the device back and
+forth between D0 and the supported low-power states (except for D3cold) and the
+possible power state transitions the device can undergo are the following:
+
++---------------+------------+
+| Current State | New State  |
++---------------+------------+
+| D0            | D1, D2, D3 |
++---------------+------------+
+| D1            | D2, D3     |
++---------------+------------+
+| D2            | D3         |
++---------------+------------+
+| D1, D2, D3    | D0         |
++---------------+------------+
+
+The transition from D3cold to D0 occurs when the supply voltage is provided to
+the device (i.e. power is restored).  In that case the device returns to D0 with
+a full power-on reset sequence and the power-on defaults are restored to the
+device by hardware just as at initial power up.
+
+PCI devices supporting the PCI PM Spec can be programmed to generate PMEs
+while in a low-power state (D1-D3), but they are not required to be capable
+of generating PMEs from all supported low-power states.  In particular, the
+capability of generating PMEs from D3cold is optional and depends on the
+presence of additional voltage (3.3Vaux) allowing the device to remain
+sufficiently active to generate a wakeup signal.
+
+1.3. ACPI Device Power Management
+---------------------------------
+
+The platform firmware support for the power management of PCI devices is
+system-specific.  However, if the system in question is compliant with the
+Advanced Configuration and Power Interface (ACPI) Specification, like the
+majority of x86-based systems, it is supposed to implement device power
+management interfaces defined by the ACPI standard.
+
+For this purpose the ACPI BIOS provides special functions called "control
+methods" that may be executed by the kernel to perform specific tasks, such as
+putting a device into a low-power state.  These control methods are encoded
+using a special byte-code language called the ACPI Machine Language (AML) and
+stored in the machine's BIOS.  The kernel loads them from the BIOS and executes
+them as needed using an AML interpreter that translates the AML byte code into
+computations and memory or I/O space accesses.  This way, in theory, a BIOS
+writer can provide the kernel with a means to perform actions depending
+on the system design in a system-specific fashion.
+
+ACPI control methods may be divided into global control methods, that are not
+associated with any particular devices, and device control methods, that have
+to be defined separately for each device supposed to be handled with the help of
+the platform.  This means, in particular, that ACPI device control methods can
+only be used to handle devices that the BIOS writer knew about in advance.  The
+ACPI methods used for device power management fall into that category.
+
+The ACPI specification assumes that devices can be in one of four power states
+labeled as D0, D1, D2, and D3 that roughly correspond to the native PCI PM
+D0-D3 states (although the difference between D3hot and D3cold is not taken
+into account by ACPI).  Moreover, for each power state of a device there is a
+set of power resources that have to be enabled for the device to be put into
+that state.  These power resources are controlled (i.e. enabled or disabled)
+with the help of their own control methods, _ON and _OFF, that have to be
+defined individually for each of them.
+
+To put a device into the ACPI power state Dx (where x is a number between 0 and
+3 inclusive) the kernel is supposed to (1) enable the power resources required
+by the device in this state using their _ON control methods and (2) execute the
+_PSx control method defined for the device.  In addition to that, if the device
+is going to be put into a low-power state (D1-D3) and is supposed to generate
+wakeup signals from that state, the _DSW (or _PSW, replaced with _DSW by ACPI
+3.0) control method defined for it has to be executed before _PSx.  Power
+resources that are not required by the device in the target power state and are
+not required any more by any other device should be disabled (by executing their
+_OFF control methods).  If the current power state of the device is D3, it can
+only be put into D0 this way.
+
+However, quite often the power states of devices are changed during a
+system-wide transition into a sleep state or back into the working state.  ACPI
+defines four system sleep states, S1, S2, S3, and S4, and denotes the system
+working state as S0.  In general, the target system sleep (or working) state
+determines the highest power (lowest number) state the device can be put
+into and the kernel is supposed to obtain this information by executing the
+device's _SxD control method (where x is a number between 0 and 4 inclusive).
+If the device is required to wake up the system from the target sleep state, the
+lowest power (highest number) state it can be put into is also determined by the
+target state of the system.  The kernel is then supposed to use the device's
+_SxW control method to obtain the number of that state.  It also is supposed to
+use the device's _PRW control method to learn which power resources need to be
+enabled for the device to be able to generate wakeup signals.
+
+1.4. Wakeup Signaling
+---------------------
+
+Wakeup signals generated by PCI devices, either as native PCI PMEs, or as
+a result of the execution of the _DSW (or _PSW) ACPI control method before
+putting the device into a low-power state, have to be caught and handled as
+appropriate.  If they are sent while the system is in the working state
+(ACPI S0), they should be translated into interrupts so that the kernel can
+put the devices generating them into the full-power state and take care of the
+events that triggered them.  In turn, if they are sent while the system is
+sleeping, they should cause the system's core logic to trigger wakeup.
+
+On ACPI-based systems wakeup signals sent by conventional PCI devices are
+converted into ACPI General-Purpose Events (GPEs) which are hardware signals
+from the system core logic generated in response to various events that need to
+be acted upon.  Every GPE is associated with one or more sources of potentially
+interesting events.  In particular, a GPE may be associated with a PCI device
+capable of signaling wakeup.  The information on the connections between GPEs
+and event sources is recorded in the system's ACPI BIOS from where it can be
+read by the kernel.
+
+If a PCI device known to the system's ACPI BIOS signals wakeup, the GPE
+associated with it (if there is one) is triggered.  The GPEs associated with PCI
+bridges may also be triggered in response to a wakeup signal from one of the
+devices below the bridge (this also is the case for root bridges) and, for
+example, native PCI PMEs from devices unknown to the system's ACPI BIOS may be
+handled this way.
+
+A GPE may be triggered when the system is sleeping (i.e. when it is in one of
+the ACPI S1-S4 states), in which case system wakeup is started by its core logic
+(the device that was the source of the signal causing the system wakeup to occur
+may be identified later).  The GPEs used in such situations are referred to as
+wakeup GPEs.
+
+Usually, however, GPEs are also triggered when the system is in the working
+state (ACPI S0) and in that case the system's core logic generates a System
+Control Interrupt (SCI) to notify the kernel of the event.  Then, the SCI
+handler identifies the GPE that caused the interrupt to be generated which,
+in turn, allows the kernel to identify the source of the event (that may be
+a PCI device signaling wakeup).  The GPEs used for notifying the kernel of
+events occurring while the system is in the working state are referred to as
+runtime GPEs.
+
+Unfortunately, there is no standard way of handling wakeup signals sent by
+conventional PCI devices on systems that are not ACPI-based, but there is one
+for PCI Express devices.  Namely, the PCI Express Base Specification introduced
+a native mechanism for converting native PCI PMEs into interrupts generated by
+root ports.  For conventional PCI devices native PMEs are out-of-band, so they
+are routed separately and they need not pass through bridges (in principle they
+may be routed directly to the system's core logic), but for PCI Express devices
+they are in-band messages that have to pass through the PCI Express hierarchy,
+including the root port on the path from the device to the Root Complex.  Thus
+it was possible to introduce a mechanism by which a root port generates an
+interrupt whenever it receives a PME message from one of the devices below it.
+The PCI Express Requester ID of the device that sent the PME message is then
+recorded in one of the root port's configuration registers from where it may be
+read by the interrupt handler allowing the device to be identified.  [PME
+messages sent by PCI Express endpoints integrated with the Root Complex don't
+pass through root ports, but instead they cause a Root Complex Event Collector
+(if there is one) to generate interrupts.]
+
+In principle the native PCI Express PME signaling may also be used on ACPI-based
+systems along with the GPEs, but to use it the kernel has to ask the system's
+ACPI BIOS to release control of root port configuration registers.  The ACPI
+BIOS, however, is not required to allow the kernel to control these registers
+and if it doesn't do that, the kernel must not modify their contents.  Of course
+the native PCI Express PME signaling cannot be used by the kernel in that case.
+
+
+2. PCI Subsystem and Device Power Management
+============================================
+
+2.1. Device Power Management Callbacks
+--------------------------------------
+
+The PCI Subsystem participates in the power management of PCI devices in a
+number of ways.  First of all, it provides an intermediate code layer between
+the device power management core (PM core) and PCI device drivers.
+Specifically, the pm field of the PCI subsystem's struct bus_type object,
+pci_bus_type, points to a struct dev_pm_ops object, pci_dev_pm_ops, containing
+pointers to several device power management callbacks::
+
+  const struct dev_pm_ops pci_dev_pm_ops = {
+	.prepare = pci_pm_prepare,
+	.complete = pci_pm_complete,
+	.suspend = pci_pm_suspend,
+	.resume = pci_pm_resume,
+	.freeze = pci_pm_freeze,
+	.thaw = pci_pm_thaw,
+	.poweroff = pci_pm_poweroff,
+	.restore = pci_pm_restore,
+	.suspend_noirq = pci_pm_suspend_noirq,
+	.resume_noirq = pci_pm_resume_noirq,
+	.freeze_noirq = pci_pm_freeze_noirq,
+	.thaw_noirq = pci_pm_thaw_noirq,
+	.poweroff_noirq = pci_pm_poweroff_noirq,
+	.restore_noirq = pci_pm_restore_noirq,
+	.runtime_suspend = pci_pm_runtime_suspend,
+	.runtime_resume = pci_pm_runtime_resume,
+	.runtime_idle = pci_pm_runtime_idle,
+  };
+
+These callbacks are executed by the PM core in various situations related to
+device power management and they, in turn, execute power management callbacks
+provided by PCI device drivers.  They also perform power management operations
+involving some standard configuration registers of PCI devices that device
+drivers need not know or care about.
+
+The structure representing a PCI device, struct pci_dev, contains several fields
+that these callbacks operate on::
+
+  struct pci_dev {
+	...
+	pci_power_t     current_state;  /* Current operating state. */
+	int		pm_cap;		/* PM capability offset in the
+					   configuration space */
+	unsigned int	pme_support:5;	/* Bitmask of states from which PME#
+					   can be generated */
+	unsigned int	pme_interrupt:1;/* Is native PCIe PME signaling used? */
+	unsigned int	d1_support:1;	/* Low power state D1 is supported */
+	unsigned int	d2_support:1;	/* Low power state D2 is supported */
+	unsigned int	no_d1d2:1;	/* D1 and D2 are forbidden */
+	unsigned int	wakeup_prepared:1;  /* Device prepared for wake up */
+	unsigned int	d3_delay;	/* D3->D0 transition time in ms */
+	...
+  };
+
+They also indirectly use some fields of the struct device that is embedded in
+struct pci_dev.
+
+2.2. Device Initialization
+--------------------------
+
+The PCI subsystem's first task related to device power management is to
+prepare the device for power management and initialize the fields of struct
+pci_dev used for this purpose.  This happens in two functions defined in
+drivers/pci/pci.c, pci_pm_init() and platform_pci_wakeup_init().
+
+The first of these functions checks if the device supports native PCI PM
+and if that's the case the offset of its power management capability structure
+in the configuration space is stored in the pm_cap field of the device's struct
+pci_dev object.  Next, the function checks which PCI low-power states are
+supported by the device and from which low-power states the device can generate
+native PCI PMEs.  The power management fields of the device's struct pci_dev and
+the struct device embedded in it are updated accordingly and the generation of
+PMEs by the device is disabled.
+
+The second function checks if the device can be prepared to signal wakeup with
+the help of the platform firmware, such as the ACPI BIOS.  If that is the case,
+the function updates the wakeup fields in struct device embedded in the
+device's struct pci_dev and uses the firmware-provided method to prevent the
+device from signaling wakeup.
+
+At this point the device is ready for power management.  For driverless devices,
+however, this functionality is limited to a few basic operations carried out
+during system-wide transitions to a sleep state and back to the working state.
+
+2.3. Runtime Device Power Management
+------------------------------------
+
+The PCI subsystem plays a vital role in the runtime power management of PCI
+devices.  For this purpose it uses the general runtime power management
+(runtime PM) framework described in Documentation/power/runtime_pm.rst.
+Namely, it provides subsystem-level callbacks::
+
+	pci_pm_runtime_suspend()
+	pci_pm_runtime_resume()
+	pci_pm_runtime_idle()
+
+that are executed by the core runtime PM routines.  It also implements the
+entire mechanics necessary for handling runtime wakeup signals from PCI devices
+in low-power states, which at the time of this writing works for both the native
+PCI Express PME signaling and the ACPI GPE-based wakeup signaling described in
+Section 1.
+
+First, a PCI device is put into a low-power state, or suspended, with the help
+of pm_schedule_suspend() or pm_runtime_suspend() which for PCI devices call
+pci_pm_runtime_suspend() to do the actual job.  For this to work, the device's
+driver has to provide a pm->runtime_suspend() callback (see below), which is
+run by pci_pm_runtime_suspend() as the first action.  If the driver's callback
+returns successfully, the device's standard configuration registers are saved,
+the device is prepared to generate wakeup signals and, finally, it is put into
+the target low-power state.
+
+The low-power state to put the device into is the lowest-power (highest number)
+state from which it can signal wakeup.  The exact method of signaling wakeup is
+system-dependent and is determined by the PCI subsystem on the basis of the
+reported capabilities of the device and the platform firmware.  To prepare the
+device for signaling wakeup and put it into the selected low-power state, the
+PCI subsystem can use the platform firmware as well as the device's native PCI
+PM capabilities, if supported.
+
+It is expected that the device driver's pm->runtime_suspend() callback will
+not attempt to prepare the device for signaling wakeup or to put it into a
+low-power state.  The driver ought to leave these tasks to the PCI subsystem
+that has all of the information necessary to perform them.
+
+A suspended device is brought back into the "active" state, or resumed,
+with the help of pm_request_resume() or pm_runtime_resume() which both call
+pci_pm_runtime_resume() for PCI devices.  Again, this only works if the device's
+driver provides a pm->runtime_resume() callback (see below).  However, before
+the driver's callback is executed, pci_pm_runtime_resume() brings the device
+back into the full-power state, prevents it from signaling wakeup while in that
+state and restores its standard configuration registers.  Thus the driver's
+callback need not worry about the PCI-specific aspects of the device resume.
+
+Note that generally pci_pm_runtime_resume() may be called in two different
+situations.  First, it may be called at the request of the device's driver, for
+example if there are some data for it to process.  Second, it may be called
+as a result of a wakeup signal from the device itself (this sometimes is
+referred to as "remote wakeup").  Of course, for this purpose the wakeup signal
+is handled in one of the ways described in Section 1 and finally converted into
+a notification for the PCI subsystem after the source device has been
+identified.
+
+The pci_pm_runtime_idle() function, called for PCI devices by pm_runtime_idle()
+and pm_request_idle(), executes the device driver's pm->runtime_idle()
+callback, if defined, and if that callback doesn't return an error code (or is
+not present at all), suspends the device with the help of pm_runtime_suspend().
+Sometimes pci_pm_runtime_idle() is called automatically by the PM core (for
+example, it is called right after the device has just been resumed), in which
+cases it is expected to suspend the device if that makes sense.  Usually,
+however, the PCI subsystem doesn't really know if the device really can be
+suspended, so it lets the device's driver decide by running its
+pm->runtime_idle() callback.
+
+2.4. System-Wide Power Transitions
+----------------------------------
+There are a few different types of system-wide power transitions, described in
+Documentation/driver-api/pm/devices.rst.  Each of them requires devices to be handled
+in a specific way and the PM core executes subsystem-level power management
+callbacks for this purpose.  They are executed in phases such that each phase
+involves executing the same subsystem-level callback for every device belonging
+to the given subsystem before the next phase begins.  These phases always run
+after tasks have been frozen.
+
+2.4.1. System Suspend
+^^^^^^^^^^^^^^^^^^^^^
+
+When the system is going into a sleep state in which the contents of memory will
+be preserved, such as one of the ACPI sleep states S1-S3, the phases are:
+
+	prepare, suspend, suspend_noirq.
+
+The following PCI bus type's callbacks, respectively, are used in these phases::
+
+	pci_pm_prepare()
+	pci_pm_suspend()
+	pci_pm_suspend_noirq()
+
+The pci_pm_prepare() routine first puts the device into the "fully functional"
+state with the help of pm_runtime_resume().  Then, it executes the device
+driver's pm->prepare() callback if defined (i.e. if the driver's struct
+dev_pm_ops object is present and the prepare pointer in that object is valid).
+
+The pci_pm_suspend() routine first checks if the device's driver implements
+legacy PCI suspend routines (see Section 3), in which case the driver's legacy
+suspend callback is executed, if present, and its result is returned.  Next, if
+the device's driver doesn't provide a struct dev_pm_ops object (containing
+pointers to the driver's callbacks), pci_pm_default_suspend() is called, which
+simply turns off the device's bus master capability and runs
+pcibios_disable_device() to disable it, unless the device is a bridge (PCI
+bridges are ignored by this routine).  Next, the device driver's pm->suspend()
+callback is executed, if defined, and its result is returned if it fails.
+Finally, pci_fixup_device() is called to apply hardware suspend quirks related
+to the device if necessary.
+
+Note that the suspend phase is carried out asynchronously for PCI devices, so
+the pci_pm_suspend() callback may be executed in parallel for any pair of PCI
+devices that don't depend on each other in a known way (i.e. none of the paths
+in the device tree from the root bridge to a leaf device contains both of them).
+
+The pci_pm_suspend_noirq() routine is executed after suspend_device_irqs() has
+been called, which means that the device driver's interrupt handler won't be
+invoked while this routine is running.  It first checks if the device's driver
+implements legacy PCI suspend routines (Section 3), in which case the legacy
+late suspend routine is called and its result is returned (the standard
+configuration registers of the device are saved if the driver's callback hasn't
+done that).  Second, if the device driver's struct dev_pm_ops object is not
+present, the device's standard configuration registers are saved and the routine
+returns success.  Otherwise the device driver's pm->suspend_noirq() callback is
+executed, if present, and its result is returned if it fails.  Next, if the
+device's standard configuration registers haven't been saved yet (one of the
+device driver's callbacks executed before might do that), pci_pm_suspend_noirq()
+saves them, prepares the device to signal wakeup (if necessary) and puts it into
+a low-power state.
+
+The low-power state to put the device into is the lowest-power (highest number)
+state from which it can signal wakeup while the system is in the target sleep
+state.  Just like in the runtime PM case described above, the mechanism of
+signaling wakeup is system-dependent and determined by the PCI subsystem, which
+is also responsible for preparing the device to signal wakeup from the system's
+target sleep state as appropriate.
+
+PCI device drivers (that don't implement legacy power management callbacks) are
+generally not expected to prepare devices for signaling wakeup or to put them
+into low-power states.  However, if one of the driver's suspend callbacks
+(pm->suspend() or pm->suspend_noirq()) saves the device's standard configuration
+registers, pci_pm_suspend_noirq() will assume that the device has been prepared
+to signal wakeup and put into a low-power state by the driver (the driver is
+then assumed to have used the helper functions provided by the PCI subsystem for
+this purpose).  PCI device drivers are not encouraged to do that, but in some
+rare cases doing that in the driver may be the optimum approach.
+
+2.4.2. System Resume
+^^^^^^^^^^^^^^^^^^^^
+
+When the system is undergoing a transition from a sleep state in which the
+contents of memory have been preserved, such as one of the ACPI sleep states
+S1-S3, into the working state (ACPI S0), the phases are:
+
+	resume_noirq, resume, complete.
+
+The following PCI bus type's callbacks, respectively, are executed in these
+phases::
+
+	pci_pm_resume_noirq()
+	pci_pm_resume()
+	pci_pm_complete()
+
+The pci_pm_resume_noirq() routine first puts the device into the full-power
+state, restores its standard configuration registers and applies early resume
+hardware quirks related to the device, if necessary.  This is done
+unconditionally, regardless of whether or not the device's driver implements
+legacy PCI power management callbacks (this way all PCI devices are in the
+full-power state and their standard configuration registers have been restored
+when their interrupt handlers are invoked for the first time during resume,
+which allows the kernel to avoid problems with the handling of shared interrupts
+by drivers whose devices are still suspended).  If legacy PCI power management
+callbacks (see Section 3) are implemented by the device's driver, the legacy
+early resume callback is executed and its result is returned.  Otherwise, the
+device driver's pm->resume_noirq() callback is executed, if defined, and its
+result is returned.
+
+The pci_pm_resume() routine first checks if the device's standard configuration
+registers have been restored and restores them if that's not the case (this
+only is necessary in the error path during a failing suspend).  Next, resume
+hardware quirks related to the device are applied, if necessary, and if the
+device's driver implements legacy PCI power management callbacks (see
+Section 3), the driver's legacy resume callback is executed and its result is
+returned.  Otherwise, the device's wakeup signaling mechanisms are blocked and
+its driver's pm->resume() callback is executed, if defined (the callback's
+result is then returned).
+
+The resume phase is carried out asynchronously for PCI devices, like the
+suspend phase described above, which means that if two PCI devices don't depend
+on each other in a known way, the pci_pm_resume() routine may be executed for
+both of them in parallel.
+
+The pci_pm_complete() routine only executes the device driver's pm->complete()
+callback, if defined.
+
+2.4.3. System Hibernation
+^^^^^^^^^^^^^^^^^^^^^^^^^
+
+System hibernation is more complicated than system suspend, because it requires
+a system image to be created and written into a persistent storage medium.  The
+image is created atomically and all devices are quiesced, or frozen, before that
+happens.
+
+The freezing of devices is carried out after enough memory has been freed (at
+the time of this writing the image creation requires at least 50% of system RAM
+to be free) in the following three phases:
+
+	prepare, freeze, freeze_noirq
+
+that correspond to the PCI bus type's callbacks::
+
+	pci_pm_prepare()
+	pci_pm_freeze()
+	pci_pm_freeze_noirq()
+
+This means that the prepare phase is exactly the same as for system suspend.
+The other two phases, however, are different.
+
+The pci_pm_freeze() routine is quite similar to pci_pm_suspend(), but it runs
+the device driver's pm->freeze() callback, if defined, instead of pm->suspend(),
+and it doesn't apply the suspend-related hardware quirks.  It is executed
+asynchronously for different PCI devices that don't depend on each other in a
+known way.
+
+The pci_pm_freeze_noirq() routine, in turn, is similar to
+pci_pm_suspend_noirq(), but it calls the device driver's pm->freeze_noirq()
+routine instead of pm->suspend_noirq().  It also doesn't attempt to prepare the
+device for signaling wakeup and put it into a low-power state.  Still, it saves
+the device's standard configuration registers if they haven't been saved by one
+of the driver's callbacks.
+
+Once the image has been created, it has to be saved.  However, at this point all
+devices are frozen and they cannot handle I/O, while their ability to handle
+I/O is obviously necessary for the image saving.  Thus they have to be brought
+back to the fully functional state and this is done in the following phases:
+
+	thaw_noirq, thaw, complete
+
+using the following PCI bus type's callbacks::
+
+	pci_pm_thaw_noirq()
+	pci_pm_thaw()
+	pci_pm_complete()
+
+respectively.
+
+The first of them, pci_pm_thaw_noirq(), is analogous to pci_pm_resume_noirq(),
+but it doesn't put the device into the full power state and doesn't attempt to
+restore its standard configuration registers.  It also executes the device
+driver's pm->thaw_noirq() callback, if defined, instead of pm->resume_noirq().
+
+The pci_pm_thaw() routine is similar to pci_pm_resume(), but it runs the device
+driver's pm->thaw() callback instead of pm->resume().  It is executed
+asynchronously for different PCI devices that don't depend on each other in a
+known way.
+
+The complete phase is the same as for system resume.
+
+After saving the image, devices need to be powered down before the system can
+enter the target sleep state (ACPI S4 for ACPI-based systems).  This is done in
+three phases:
+
+	prepare, poweroff, poweroff_noirq
+
+where the prepare phase is exactly the same as for system suspend.  The other
+two phases are analogous to the suspend and suspend_noirq phases, respectively.
+The PCI subsystem-level callbacks they correspond to::
+
+	pci_pm_poweroff()
+	pci_pm_poweroff_noirq()
+
+work in analogy with pci_pm_suspend() and pci_pm_suspend_noirq(), respectively,
+although they don't attempt to save the device's standard configuration
+registers.
+
+2.4.4. System Restore
+^^^^^^^^^^^^^^^^^^^^^
+
+System restore requires a hibernation image to be loaded into memory and the
+pre-hibernation memory contents to be restored before the pre-hibernation system
+activity can be resumed.
+
+As described in Documentation/driver-api/pm/devices.rst, the hibernation image is loaded
+into memory by a fresh instance of the kernel, called the boot kernel, which in
+turn is loaded and run by a boot loader in the usual way.  After the boot kernel
+has loaded the image, it needs to replace its own code and data with the code
+and data of the "hibernated" kernel stored within the image, called the image
+kernel.  For this purpose all devices are frozen just like before creating
+the image during hibernation, in the
+
+	prepare, freeze, freeze_noirq
+
+phases described above.  However, the devices affected by these phases are only
+those having drivers in the boot kernel; other devices will still be in whatever
+state the boot loader left them.
+
+Should the restoration of the pre-hibernation memory contents fail, the boot
+kernel would go through the "thawing" procedure described above, using the
+thaw_noirq, thaw, and complete phases (that will only affect the devices having
+drivers in the boot kernel), and then continue running normally.
+
+If the pre-hibernation memory contents are restored successfully, which is the
+usual situation, control is passed to the image kernel, which then becomes
+responsible for bringing the system back to the working state.  To achieve this,
+it must restore the devices' pre-hibernation functionality, which is done much
+like waking up from the memory sleep state, although it involves different
+phases:
+
+	restore_noirq, restore, complete
+
+The first two of these are analogous to the resume_noirq and resume phases
+described above, respectively, and correspond to the following PCI subsystem
+callbacks::
+
+	pci_pm_restore_noirq()
+	pci_pm_restore()
+
+These callbacks work in analogy with pci_pm_resume_noirq() and pci_pm_resume(),
+respectively, but they execute the device driver's pm->restore_noirq() and
+pm->restore() callbacks, if available.
+
+The complete phase is carried out in exactly the same way as during system
+resume.
+
+
+3. PCI Device Drivers and Power Management
+==========================================
+
+3.1. Power Management Callbacks
+-------------------------------
+
+PCI device drivers participate in power management by providing callbacks to be
+executed by the PCI subsystem's power management routines described above and by
+controlling the runtime power management of their devices.
+
+At the time of this writing there are two ways to define power management
+callbacks for a PCI device driver, the recommended one, based on using a
+dev_pm_ops structure described in Documentation/driver-api/pm/devices.rst, and the
+"legacy" one, in which the .suspend(), .suspend_late(), .resume_early(), and
+.resume() callbacks from struct pci_driver are used.  The legacy approach,
+however, doesn't allow one to define runtime power management callbacks and is
+not really suitable for any new drivers.  Therefore it is not covered by this
+document (refer to the source code to learn more about it).
+
+It is recommended that all PCI device drivers define a struct dev_pm_ops object
+containing pointers to power management (PM) callbacks that will be executed by
+the PCI subsystem's PM routines in various circumstances.  A pointer to the
+driver's struct dev_pm_ops object has to be assigned to the driver.pm field in
+its struct pci_driver object.  Once that has happened, the "legacy" PM callbacks
+in struct pci_driver are ignored (even if they are not NULL).
+
+The PM callbacks in struct dev_pm_ops are not mandatory and if they are not
+defined (i.e. the respective fields of struct dev_pm_ops are unset) the PCI
+subsystem will handle the device in a simplified default manner.  If they are
+defined, though, they are expected to behave as described in the following
+subsections.
+
+3.1.1. prepare()
+^^^^^^^^^^^^^^^^
+
+The prepare() callback is executed during system suspend, during hibernation
+(when a hibernation image is about to be created), during power-off after
+saving a hibernation image and during system restore, when a hibernation image
+has just been loaded into memory.
+
+This callback is only necessary if the driver's device has children that in
+general may be registered at any time.  In that case the role of the prepare()
+callback is to prevent new children of the device from being registered until
+one of the resume_noirq(), thaw_noirq(), or restore_noirq() callbacks is run.
+
+In addition to that the prepare() callback may carry out some operations
+preparing the device to be suspended, although it should not allocate memory
+(if additional memory is required to suspend the device, it has to be
+preallocated earlier, for example in a suspend/hibernate notifier as described
+in Documentation/driver-api/pm/notifiers.rst).
+
+3.1.2. suspend()
+^^^^^^^^^^^^^^^^
+
+The suspend() callback is only executed during system suspend, after prepare()
+callbacks have been executed for all devices in the system.
+
+This callback is expected to quiesce the device and prepare it to be put into a
+low-power state by the PCI subsystem.  It is not required (in fact it even is
+not recommended) that a PCI driver's suspend() callback save the standard
+configuration registers of the device, prepare it for waking up the system, or
+put it into a low-power state.  All of these operations can very well be taken
+care of by the PCI subsystem, without the driver's participation.
+
+However, in some rare cases it is convenient to carry out these operations in
+a PCI driver.  Then, pci_save_state(), pci_prepare_to_sleep(), and
+pci_set_power_state() should be used to save the device's standard configuration
+registers, to prepare it for system wakeup (if necessary), and to put it into a
+low-power state, respectively.  Moreover, if the driver calls pci_save_state(),
+the PCI subsystem will not execute either pci_prepare_to_sleep(), or
+pci_set_power_state() for its device, so the driver is then responsible for
+handling the device as appropriate.
+
+While the suspend() callback is being executed, the driver's interrupt handler
+can be invoked to handle an interrupt from the device, so all suspend-related
+operations relying on the driver's ability to handle interrupts should be
+carried out in this callback.
+
+3.1.3. suspend_noirq()
+^^^^^^^^^^^^^^^^^^^^^^
+
+The suspend_noirq() callback is only executed during system suspend, after
+suspend() callbacks have been executed for all devices in the system and
+after device interrupts have been disabled by the PM core.
+
+The difference between suspend_noirq() and suspend() is that the driver's
+interrupt handler will not be invoked while suspend_noirq() is running.  Thus
+suspend_noirq() can carry out operations that would cause race conditions to
+arise if they were performed in suspend().
+
+3.1.4. freeze()
+^^^^^^^^^^^^^^^
+
+The freeze() callback is hibernation-specific and is executed in two situations,
+during hibernation, after prepare() callbacks have been executed for all devices
+in preparation for the creation of a system image, and during restore,
+after a system image has been loaded into memory from persistent storage and the
+prepare() callbacks have been executed for all devices.
+
+The role of this callback is analogous to the role of the suspend() callback
+described above.  In fact, they only need to be different in the rare cases when
+the driver takes the responsibility for putting the device into a low-power
+state.
+
+In those cases, freeze() should not prepare the device for system wakeup
+or put it into a low-power state.  Still, either it or freeze_noirq() should
+save the device's standard configuration registers using pci_save_state().
+
+3.1.5. freeze_noirq()
+^^^^^^^^^^^^^^^^^^^^^
+
+The freeze_noirq() callback is hibernation-specific.  It is executed during
+hibernation, after prepare() and freeze() callbacks have been executed for all
+devices in preparation for the creation of a system image, and during restore,
+after a system image has been loaded into memory and after prepare() and
+freeze() callbacks have been executed for all devices.  It is always executed
+after device interrupts have been disabled by the PM core.
+
+The role of this callback is analogous to the role of the suspend_noirq()
+callback described above and it very rarely is necessary to define
+freeze_noirq().
+
+The difference between freeze_noirq() and freeze() is analogous to the
+difference between suspend_noirq() and suspend().
+
+3.1.6. poweroff()
+^^^^^^^^^^^^^^^^^
+
+The poweroff() callback is hibernation-specific.  It is executed when the system
+is about to be powered off after saving a hibernation image to a persistent
+storage.  prepare() callbacks are executed for all devices before poweroff() is
+called.
+
+The role of this callback is analogous to the role of the suspend() and freeze()
+callbacks described above, although it does not need to save the contents of
+the device's registers.  In particular, if the driver wants to put the device
+into a low-power state itself instead of allowing the PCI subsystem to do that,
+the poweroff() callback should use pci_prepare_to_sleep() and
+pci_set_power_state() to prepare the device for system wakeup and to put it
+into a low-power state, respectively, but it need not save the device's standard
+configuration registers.
+
+3.1.7. poweroff_noirq()
+^^^^^^^^^^^^^^^^^^^^^^^
+
+The poweroff_noirq() callback is hibernation-specific.  It is executed after
+poweroff() callbacks have been executed for all devices in the system.
+
+The role of this callback is analogous to the role of the suspend_noirq() and
+freeze_noirq() callbacks described above, but it does not need to save the
+contents of the device's registers.
+
+The difference between poweroff_noirq() and poweroff() is analogous to the
+difference between suspend_noirq() and suspend().
+
+3.1.8. resume_noirq()
+^^^^^^^^^^^^^^^^^^^^^
+
+The resume_noirq() callback is only executed during system resume, after the
+PM core has enabled the non-boot CPUs.  The driver's interrupt handler will not
+be invoked while resume_noirq() is running, so this callback can carry out
+operations that might race with the interrupt handler.
+
+Since the PCI subsystem unconditionally puts all devices into the full power
+state in the resume_noirq phase of system resume and restores their standard
+configuration registers, resume_noirq() is usually not necessary.  In general
+it should only be used for performing operations that would lead to race
+conditions if carried out by resume().
+
+3.1.9. resume()
+^^^^^^^^^^^^^^^
+
+The resume() callback is only executed during system resume, after
+resume_noirq() callbacks have been executed for all devices in the system and
+device interrupts have been enabled by the PM core.
+
+This callback is responsible for restoring the pre-suspend configuration of the
+device and bringing it back to the fully functional state.  The device should be
+able to process I/O in a usual way after resume() has returned.
+
+3.1.10. thaw_noirq()
+^^^^^^^^^^^^^^^^^^^^
+
+The thaw_noirq() callback is hibernation-specific.  It is executed after a
+system image has been created and the non-boot CPUs have been enabled by the PM
+core, in the thaw_noirq phase of hibernation.  It also may be executed if the
+loading of a hibernation image fails during system restore (it is then executed
+after enabling the non-boot CPUs).  The driver's interrupt handler will not be
+invoked while thaw_noirq() is running.
+
+The role of this callback is analogous to the role of resume_noirq().  The
+difference between these two callbacks is that thaw_noirq() is executed after
+freeze() and freeze_noirq(), so in general it does not need to modify the
+contents of the device's registers.
+
+3.1.11. thaw()
+^^^^^^^^^^^^^^
+
+The thaw() callback is hibernation-specific.  It is executed after thaw_noirq()
+callbacks have been executed for all devices in the system and after device
+interrupts have been enabled by the PM core.
+
+This callback is responsible for restoring the pre-freeze configuration of
+the device, so that it will work in a usual way after thaw() has returned.
+
+3.1.12. restore_noirq()
+^^^^^^^^^^^^^^^^^^^^^^^
+
+The restore_noirq() callback is hibernation-specific.  It is executed in the
+restore_noirq phase of hibernation, when the boot kernel has passed control to
+the image kernel and the non-boot CPUs have been enabled by the image kernel's
+PM core.
+
+This callback is analogous to resume_noirq() with the exception that it cannot
+make any assumption on the previous state of the device, even if the BIOS (or
+generally the platform firmware) is known to preserve that state over a
+suspend-resume cycle.
+
+For the vast majority of PCI device drivers there is no difference between
+resume_noirq() and restore_noirq().
+
+3.1.13. restore()
+^^^^^^^^^^^^^^^^^
+
+The restore() callback is hibernation-specific.  It is executed after
+restore_noirq() callbacks have been executed for all devices in the system and
+after the PM core has enabled device drivers' interrupt handlers to be invoked.
+
+This callback is analogous to resume(), just like restore_noirq() is analogous
+to resume_noirq().  Consequently, the difference between restore_noirq() and
+restore() is analogous to the difference between resume_noirq() and resume().
+
+For the vast majority of PCI device drivers there is no difference between
+resume() and restore().
+
+3.1.14. complete()
+^^^^^^^^^^^^^^^^^^
+
+The complete() callback is executed in the following situations:
+
+  - during system resume, after resume() callbacks have been executed for all
+    devices,
+  - during hibernation, before saving the system image, after thaw() callbacks
+    have been executed for all devices,
+  - during system restore, when the system is going back to its pre-hibernation
+    state, after restore() callbacks have been executed for all devices.
+
+It also may be executed if the loading of a hibernation image into memory fails
+(in that case it is run after thaw() callbacks have been executed for all
+devices that have drivers in the boot kernel).
+
+This callback is entirely optional, although it may be necessary if the
+prepare() callback performs operations that need to be reversed.
+
+3.1.15. runtime_suspend()
+^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The runtime_suspend() callback is specific to device runtime power management
+(runtime PM).  It is executed by the PM core's runtime PM framework when the
+device is about to be suspended (i.e. quiesced and put into a low-power state)
+at run time.
+
+This callback is responsible for freezing the device and preparing it to be
+put into a low-power state, but it must allow the PCI subsystem to perform all
+of the PCI-specific actions necessary for suspending the device.
+
+3.1.16. runtime_resume()
+^^^^^^^^^^^^^^^^^^^^^^^^
+
+The runtime_resume() callback is specific to device runtime PM.  It is executed
+by the PM core's runtime PM framework when the device is about to be resumed
+(i.e. put into the full-power state and programmed to process I/O normally) at
+run time.
+
+This callback is responsible for restoring the normal functionality of the
+device after it has been put into the full-power state by the PCI subsystem.
+The device is expected to be able to process I/O in the usual way after
+runtime_resume() has returned.
+
+3.1.17. runtime_idle()
+^^^^^^^^^^^^^^^^^^^^^^
+
+The runtime_idle() callback is specific to device runtime PM.  It is executed
+by the PM core's runtime PM framework whenever it may be desirable to suspend
+the device according to the PM core's information.  In particular, it is
+automatically executed right after runtime_resume() has returned in case the
+resume of the device has happened as a result of a spurious event.
+
+This callback is optional, but if it is not implemented or if it returns 0, the
+PCI subsystem will call pm_runtime_suspend() for the device, which in turn will
+cause the driver's runtime_suspend() callback to be executed.
+
+3.1.18. Pointing Multiple Callback Pointers to One Routine
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Although in principle each of the callbacks described in the previous
+subsections can be defined as a separate function, it often is convenient to
+point two or more members of struct dev_pm_ops to the same routine.  There are
+a few convenience macros that can be used for this purpose.
+
+The SIMPLE_DEV_PM_OPS macro declares a struct dev_pm_ops object with one
+suspend routine pointed to by the .suspend(), .freeze(), and .poweroff()
+members and one resume routine pointed to by the .resume(), .thaw(), and
+.restore() members.  The other function pointers in this struct dev_pm_ops are
+unset.
+
+The UNIVERSAL_DEV_PM_OPS macro is similar to SIMPLE_DEV_PM_OPS, but it
+additionally sets the .runtime_resume() pointer to the same value as
+.resume() (and .thaw(), and .restore()) and the .runtime_suspend() pointer to
+the same value as .suspend() (and .freeze() and .poweroff()).
+
+The SET_SYSTEM_SLEEP_PM_OPS macro can be used inside of a declaration of struct
+dev_pm_ops to indicate that one suspend routine is to be pointed to by the
+.suspend(), .freeze(), and .poweroff() members and one resume routine is to
+be pointed to by the .resume(), .thaw(), and .restore() members.
+
+3.1.19. Driver Flags for Power Management
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The PM core allows device drivers to set flags that influence the handling of
+power management for the devices by the core itself and by middle layer code
+including the PCI bus type.  The flags should be set once at the driver probe
+time with the help of the dev_pm_set_driver_flags() function and they should not
+be updated directly afterwards.
+
+The DPM_FLAG_NEVER_SKIP flag prevents the PM core from using the direct-complete
+mechanism allowing device suspend/resume callbacks to be skipped if the device
+is in runtime suspend when the system suspend starts.  That also affects all of
+the ancestors of the device, so this flag should only be used if absolutely
+necessary.
+
+The DPM_FLAG_SMART_PREPARE flag instructs the PCI bus type to only return a
+positive value from pci_pm_prepare() if the ->prepare callback provided by the
+driver of the device returns a positive value.  That allows the driver to opt
+out from using the direct-complete mechanism dynamically.
+
+The DPM_FLAG_SMART_SUSPEND flag tells the PCI bus type that from the driver's
+perspective the device can be safely left in runtime suspend during system
+suspend.  That causes pci_pm_suspend(), pci_pm_freeze() and pci_pm_poweroff()
+to skip resuming the device from runtime suspend unless there are PCI-specific
+reasons for doing that.  Also, it causes pci_pm_suspend_late/noirq(),
+pci_pm_freeze_late/noirq() and pci_pm_poweroff_late/noirq() to return early
+if the device remains in runtime suspend in the beginning of the "late" phase
+of the system-wide transition under way.  Moreover, if the device is in
+runtime suspend in pci_pm_resume_noirq() or pci_pm_restore_noirq(), its runtime
+power management status will be changed to "active" (as it is going to be put
+into D0 going forward), but if it is in runtime suspend in pci_pm_thaw_noirq(),
+the function will set the power.direct_complete flag for it (to make the PM core
+skip the subsequent "thaw" callbacks for it) and return.
+
+Setting the DPM_FLAG_LEAVE_SUSPENDED flag means that the driver prefers the
+device to be left in suspend after system-wide transitions to the working state.
+This flag is checked by the PM core, but the PCI bus type informs the PM core
+which devices may be left in suspend from its perspective (that happens during
+the "noirq" phase of system-wide suspend and analogous transitions) and next it
+uses the dev_pm_may_skip_resume() helper to decide whether or not to return from
+pci_pm_resume_noirq() early, as the PM core will skip the remaining resume
+callbacks for the device during the transition under way and will set its
+runtime PM status to "suspended" if dev_pm_may_skip_resume() returns "true" for
+it.
+
+3.2. Device Runtime Power Management
+------------------------------------
+
+In addition to providing device power management callbacks PCI device drivers
+are responsible for controlling the runtime power management (runtime PM) of
+their devices.
+
+The PCI device runtime PM is optional, but it is recommended that PCI device
+drivers implement it at least in the cases where there is a reliable way of
+verifying that the device is not used (like when the network cable is detached
+from an Ethernet adapter or there are no devices attached to a USB controller).
+
+To support the PCI runtime PM the driver first needs to implement the
+runtime_suspend() and runtime_resume() callbacks.  It also may need to implement
+the runtime_idle() callback to prevent the device from being suspended again
+every time right after the runtime_resume() callback has returned
+(alternatively, the runtime_suspend() callback will have to check if the
+device should really be suspended and return -EAGAIN if that is not the case).
+
+The runtime PM of PCI devices is enabled by default by the PCI core.  PCI
+device drivers do not need to enable it and should not attempt to do so.
+However, it is blocked by pci_pm_init() that runs the pm_runtime_forbid()
+helper function.  In addition to that, the runtime PM usage counter of
+each PCI device is incremented by local_pci_probe() before executing the
+probe callback provided by the device's driver.
+
+If a PCI driver implements the runtime PM callbacks and intends to use the
+runtime PM framework provided by the PM core and the PCI subsystem, it needs
+to decrement the device's runtime PM usage counter in its probe callback
+function.  If it doesn't do that, the counter will always be different from
+zero for the device and it will never be runtime-suspended.  The simplest
+way to do that is by calling pm_runtime_put_noidle(), but if the driver
+wants to schedule an autosuspend right away, for example, it may call
+pm_runtime_put_autosuspend() instead for this purpose.  Generally, it
+just needs to call a function that decrements the device's usage counter
+from its probe routine to make runtime PM work for the device.
+
+It is important to remember that the driver's runtime_suspend() callback
+may be executed right after the usage counter has been decremented, because
+user space may already have caused the pm_runtime_allow() helper function
+unblocking the runtime PM of the device to run via sysfs, so the driver must
+be prepared to cope with that.
+
+The driver itself should not call pm_runtime_allow(), though.  Instead, it
+should let user space or some platform-specific code do that (user space can
+do it via sysfs as stated above), but it must be prepared to handle the
+runtime PM of the device correctly as soon as pm_runtime_allow() is called
+(which may happen at any time, even before the driver is loaded).
+
+When the driver's remove callback runs, it has to balance the decrementation
+of the device's runtime PM usage counter at the probe time.  For this reason,
+if it has decremented the counter in its probe callback, it must run
+pm_runtime_get_noresume() in its remove callback.  [Since the core carries
+out a runtime resume of the device and bumps up the device's usage counter
+before running the driver's remove callback, the runtime PM of the device
+is effectively disabled for the duration of the remove execution and all
+runtime PM helper functions incrementing the device's usage counter are
+then effectively equivalent to pm_runtime_get_noresume().]
+
+The runtime PM framework works by processing requests to suspend or resume
+devices, or to check if they are idle (in which cases it is reasonable to
+subsequently request that they be suspended).  These requests are represented
+by work items put into the power management workqueue, pm_wq.  Although there
+are a few situations in which power management requests are automatically
+queued by the PM core (for example, after processing a request to resume a
+device the PM core automatically queues a request to check if the device is
+idle), device drivers are generally responsible for queuing power management
+requests for their devices.  For this purpose they should use the runtime PM
+helper functions provided by the PM core, discussed in
+Documentation/power/runtime_pm.rst.
+
+Devices can also be suspended and resumed synchronously, without placing a
+request into pm_wq.  In the majority of cases this also is done by their
+drivers that use helper functions provided by the PM core for this purpose.
+
+For more information on the runtime PM of devices refer to
+Documentation/power/runtime_pm.rst.
+
+
+4. Resources
+============
+
+PCI Local Bus Specification, Rev. 3.0
+
+PCI Bus Power Management Interface Specification, Rev. 1.2
+
+Advanced Configuration and Power Interface (ACPI) Specification, Rev. 3.0b
+
+PCI Express Base Specification, Rev. 2.0
+
+Documentation/driver-api/pm/devices.rst
+
+Documentation/power/runtime_pm.rst
diff --git a/Documentation/power/pci.txt b/Documentation/power/pci.txt
deleted file mode 100644
index 8eaf9ee24d43..000000000000
--- a/Documentation/power/pci.txt
+++ /dev/null
@@ -1,1094 +0,0 @@
-PCI Power Management
-
-Copyright (c) 2010 Rafael J. Wysocki <rjw@sisk.pl>, Novell Inc.
-
-An overview of concepts and the Linux kernel's interfaces related to PCI power
-management.  Based on previous work by Patrick Mochel <mochel@transmeta.com>
-(and others).
-
-This document only covers the aspects of power management specific to PCI
-devices.  For general description of the kernel's interfaces related to device
-power management refer to Documentation/driver-api/pm/devices.rst and
-Documentation/power/runtime_pm.txt.
-
----------------------------------------------------------------------------
-
-1. Hardware and Platform Support for PCI Power Management
-2. PCI Subsystem and Device Power Management
-3. PCI Device Drivers and Power Management
-4. Resources
-
-
-1. Hardware and Platform Support for PCI Power Management
-=========================================================
-
-1.1. Native and Platform-Based Power Management
------------------------------------------------
-In general, power management is a feature allowing one to save energy by putting
-devices into states in which they draw less power (low-power states) at the
-price of reduced functionality or performance.
-
-Usually, a device is put into a low-power state when it is underutilized or
-completely inactive.  However, when it is necessary to use the device once
-again, it has to be put back into the "fully functional" state (full-power
-state).  This may happen when there are some data for the device to handle or
-as a result of an external event requiring the device to be active, which may
-be signaled by the device itself.
-
-PCI devices may be put into low-power states in two ways, by using the device
-capabilities introduced by the PCI Bus Power Management Interface Specification,
-or with the help of platform firmware, such as an ACPI BIOS.  In the first
-approach, that is referred to as the native PCI power management (native PCI PM)
-in what follows, the device power state is changed as a result of writing a
-specific value into one of its standard configuration registers.  The second
-approach requires the platform firmware to provide special methods that may be
-used by the kernel to change the device's power state.
-
-Devices supporting the native PCI PM usually can generate wakeup signals called
-Power Management Events (PMEs) to let the kernel know about external events
-requiring the device to be active.  After receiving a PME the kernel is supposed
-to put the device that sent it into the full-power state.  However, the PCI Bus
-Power Management Interface Specification doesn't define any standard method of
-delivering the PME from the device to the CPU and the operating system kernel.
-It is assumed that the platform firmware will perform this task and therefore,
-even though a PCI device is set up to generate PMEs, it also may be necessary to
-prepare the platform firmware for notifying the CPU of the PMEs coming from the
-device (e.g. by generating interrupts).
-
-In turn, if the methods provided by the platform firmware are used for changing
-the power state of a device, usually the platform also provides a method for
-preparing the device to generate wakeup signals.  In that case, however, it
-often also is necessary to prepare the device for generating PMEs using the
-native PCI PM mechanism, because the method provided by the platform depends on
-that.
-
-Thus in many situations both the native and the platform-based power management
-mechanisms have to be used simultaneously to obtain the desired result.
-
-1.2. Native PCI Power Management
---------------------------------
-The PCI Bus Power Management Interface Specification (PCI PM Spec) was
-introduced between the PCI 2.1 and PCI 2.2 Specifications.  It defined a
-standard interface for performing various operations related to power
-management.
-
-The implementation of the PCI PM Spec is optional for conventional PCI devices,
-but it is mandatory for PCI Express devices.  If a device supports the PCI PM
-Spec, it has an 8 byte power management capability field in its PCI
-configuration space.  This field is used to describe and control the standard
-features related to the native PCI power management.
-
-The PCI PM Spec defines 4 operating states for devices (D0-D3) and for buses
-(B0-B3).  The higher the number, the less power is drawn by the device or bus
-in that state.  However, the higher the number, the longer the latency for
-the device or bus to return to the full-power state (D0 or B0, respectively).
-
-There are two variants of the D3 state defined by the specification.  The first
-one is D3hot, referred to as the software accessible D3, because devices can be
-programmed to go into it.  The second one, D3cold, is the state that PCI devices
-are in when the supply voltage (Vcc) is removed from them.  It is not possible
-to program a PCI device to go into D3cold, although there may be a programmable
-interface for putting the bus the device is on into a state in which Vcc is
-removed from all devices on the bus.
-
-PCI bus power management, however, is not supported by the Linux kernel at the
-time of this writing and therefore it is not covered by this document.
-
-Note that every PCI device can be in the full-power state (D0) or in D3cold,
-regardless of whether or not it implements the PCI PM Spec.  In addition to
-that, if the PCI PM Spec is implemented by the device, it must support D3hot
-as well as D0.  The support for the D1 and D2 power states is optional.
-
-PCI devices supporting the PCI PM Spec can be programmed to go to any of the
-supported low-power states (except for D3cold).  While in D1-D3hot the
-standard configuration registers of the device must be accessible to software
-(i.e. the device is required to respond to PCI configuration accesses), although
-its I/O and memory spaces are then disabled.  This allows the device to be
-programmatically put into D0.  Thus the kernel can switch the device back and
-forth between D0 and the supported low-power states (except for D3cold) and the
-possible power state transitions the device can undergo are the following:
-
-+----------------------------+
-| Current State | New State  |
-+----------------------------+
-| D0            | D1, D2, D3 |
-+----------------------------+
-| D1            | D2, D3     |
-+----------------------------+
-| D2            | D3         |
-+----------------------------+
-| D1, D2, D3    | D0         |
-+----------------------------+
-
-The transition from D3cold to D0 occurs when the supply voltage is provided to
-the device (i.e. power is restored).  In that case the device returns to D0 with
-a full power-on reset sequence and the power-on defaults are restored to the
-device by hardware just as at initial power up.
-
-PCI devices supporting the PCI PM Spec can be programmed to generate PMEs
-while in a low-power state (D1-D3), but they are not required to be capable
-of generating PMEs from all supported low-power states.  In particular, the
-capability of generating PMEs from D3cold is optional and depends on the
-presence of additional voltage (3.3Vaux) allowing the device to remain
-sufficiently active to generate a wakeup signal.
-
-1.3. ACPI Device Power Management
----------------------------------
-The platform firmware support for the power management of PCI devices is
-system-specific.  However, if the system in question is compliant with the
-Advanced Configuration and Power Interface (ACPI) Specification, like the
-majority of x86-based systems, it is supposed to implement device power
-management interfaces defined by the ACPI standard.
-
-For this purpose the ACPI BIOS provides special functions called "control
-methods" that may be executed by the kernel to perform specific tasks, such as
-putting a device into a low-power state.  These control methods are encoded
-using special byte-code language called the ACPI Machine Language (AML) and
-stored in the machine's BIOS.  The kernel loads them from the BIOS and executes
-them as needed using an AML interpreter that translates the AML byte code into
-computations and memory or I/O space accesses.  This way, in theory, a BIOS
-writer can provide the kernel with a means to perform actions depending
-on the system design in a system-specific fashion.
-
-ACPI control methods may be divided into global control methods, that are not
-associated with any particular devices, and device control methods, that have
-to be defined separately for each device supposed to be handled with the help of
-the platform.  This means, in particular, that ACPI device control methods can
-only be used to handle devices that the BIOS writer knew about in advance.  The
-ACPI methods used for device power management fall into that category.
-
-The ACPI specification assumes that devices can be in one of four power states
-labeled as D0, D1, D2, and D3 that roughly correspond to the native PCI PM
-D0-D3 states (although the difference between D3hot and D3cold is not taken
-into account by ACPI).  Moreover, for each power state of a device there is a
-set of power resources that have to be enabled for the device to be put into
-that state.  These power resources are controlled (i.e. enabled or disabled)
-with the help of their own control methods, _ON and _OFF, that have to be
-defined individually for each of them.
-
-To put a device into the ACPI power state Dx (where x is a number between 0 and
-3 inclusive) the kernel is supposed to (1) enable the power resources required
-by the device in this state using their _ON control methods and (2) execute the
-_PSx control method defined for the device.  In addition to that, if the device
-is going to be put into a low-power state (D1-D3) and is supposed to generate
-wakeup signals from that state, the _DSW (or _PSW, replaced with _DSW by ACPI
-3.0) control method defined for it has to be executed before _PSx.  Power
-resources that are not required by the device in the target power state and are
-not required any more by any other device should be disabled (by executing their
-_OFF control methods).  If the current power state of the device is D3, it can
-only be put into D0 this way.
-
-However, quite often the power states of devices are changed during a
-system-wide transition into a sleep state or back into the working state.  ACPI
-defines four system sleep states, S1, S2, S3, and S4, and denotes the system
-working state as S0.  In general, the target system sleep (or working) state
-determines the highest power (lowest number) state the device can be put
-into and the kernel is supposed to obtain this information by executing the
-device's _SxD control method (where x is a number between 0 and 4 inclusive).
-If the device is required to wake up the system from the target sleep state, the
-lowest power (highest number) state it can be put into is also determined by the
-target state of the system.  The kernel is then supposed to use the device's
-_SxW control method to obtain the number of that state.  It also is supposed to
-use the device's _PRW control method to learn which power resources need to be
-enabled for the device to be able to generate wakeup signals.
-
-1.4. Wakeup Signaling
----------------------
-Wakeup signals generated by PCI devices, either as native PCI PMEs, or as
-a result of the execution of the _DSW (or _PSW) ACPI control method before
-putting the device into a low-power state, have to be caught and handled as
-appropriate.  If they are sent while the system is in the working state
-(ACPI S0), they should be translated into interrupts so that the kernel can
-put the devices generating them into the full-power state and take care of the
-events that triggered them.  In turn, if they are sent while the system is
-sleeping, they should cause the system's core logic to trigger wakeup.
-
-On ACPI-based systems wakeup signals sent by conventional PCI devices are
-converted into ACPI General-Purpose Events (GPEs) which are hardware signals
-from the system core logic generated in response to various events that need to
-be acted upon.  Every GPE is associated with one or more sources of potentially
-interesting events.  In particular, a GPE may be associated with a PCI device
-capable of signaling wakeup.  The information on the connections between GPEs
-and event sources is recorded in the system's ACPI BIOS from where it can be
-read by the kernel.
-
-If a PCI device known to the system's ACPI BIOS signals wakeup, the GPE
-associated with it (if there is one) is triggered.  The GPEs associated with PCI
-bridges may also be triggered in response to a wakeup signal from one of the
-devices below the bridge (this also is the case for root bridges) and, for
-example, native PCI PMEs from devices unknown to the system's ACPI BIOS may be
-handled this way.
-
-A GPE may be triggered when the system is sleeping (i.e. when it is in one of
-the ACPI S1-S4 states), in which case system wakeup is started by its core logic
-(the device that was the source of the signal causing the system wakeup to occur
-may be identified later).  The GPEs used in such situations are referred to as
-wakeup GPEs.
-
-Usually, however, GPEs are also triggered when the system is in the working
-state (ACPI S0) and in that case the system's core logic generates a System
-Control Interrupt (SCI) to notify the kernel of the event.  Then, the SCI
-handler identifies the GPE that caused the interrupt to be generated which,
-in turn, allows the kernel to identify the source of the event (that may be
-a PCI device signaling wakeup).  The GPEs used for notifying the kernel of
-events occurring while the system is in the working state are referred to as
-runtime GPEs.
-
-Unfortunately, there is no standard way of handling wakeup signals sent by
-conventional PCI devices on systems that are not ACPI-based, but there is one
-for PCI Express devices.  Namely, the PCI Express Base Specification introduced
-a native mechanism for converting native PCI PMEs into interrupts generated by
-root ports.  For conventional PCI devices native PMEs are out-of-band, so they
-are routed separately and they need not pass through bridges (in principle they
-may be routed directly to the system's core logic), but for PCI Express devices
-they are in-band messages that have to pass through the PCI Express hierarchy,
-including the root port on the path from the device to the Root Complex.  Thus
-it was possible to introduce a mechanism by which a root port generates an
-interrupt whenever it receives a PME message from one of the devices below it.
-The PCI Express Requester ID of the device that sent the PME message is then
-recorded in one of the root port's configuration registers from where it may be
-read by the interrupt handler allowing the device to be identified.  [PME
-messages sent by PCI Express endpoints integrated with the Root Complex don't
-pass through root ports, but instead they cause a Root Complex Event Collector
-(if there is one) to generate interrupts.]
-
-In principle the native PCI Express PME signaling may also be used on ACPI-based
-systems along with the GPEs, but to use it the kernel has to ask the system's
-ACPI BIOS to release control of root port configuration registers.  The ACPI
-BIOS, however, is not required to allow the kernel to control these registers
-and if it doesn't do that, the kernel must not modify their contents.  Of course
-the native PCI Express PME signaling cannot be used by the kernel in that case.
-
-
-2. PCI Subsystem and Device Power Management
-============================================
-
-2.1. Device Power Management Callbacks
---------------------------------------
-The PCI Subsystem participates in the power management of PCI devices in a
-number of ways.  First of all, it provides an intermediate code layer between
-the device power management core (PM core) and PCI device drivers.
-Specifically, the pm field of the PCI subsystem's struct bus_type object,
-pci_bus_type, points to a struct dev_pm_ops object, pci_dev_pm_ops, containing
-pointers to several device power management callbacks:
-
-const struct dev_pm_ops pci_dev_pm_ops = {
-	.prepare = pci_pm_prepare,
-	.complete = pci_pm_complete,
-	.suspend = pci_pm_suspend,
-	.resume = pci_pm_resume,
-	.freeze = pci_pm_freeze,
-	.thaw = pci_pm_thaw,
-	.poweroff = pci_pm_poweroff,
-	.restore = pci_pm_restore,
-	.suspend_noirq = pci_pm_suspend_noirq,
-	.resume_noirq = pci_pm_resume_noirq,
-	.freeze_noirq = pci_pm_freeze_noirq,
-	.thaw_noirq = pci_pm_thaw_noirq,
-	.poweroff_noirq = pci_pm_poweroff_noirq,
-	.restore_noirq = pci_pm_restore_noirq,
-	.runtime_suspend = pci_pm_runtime_suspend,
-	.runtime_resume = pci_pm_runtime_resume,
-	.runtime_idle = pci_pm_runtime_idle,
-};
-
-These callbacks are executed by the PM core in various situations related to
-device power management and they, in turn, execute power management callbacks
-provided by PCI device drivers.  They also perform power management operations
-involving some standard configuration registers of PCI devices that device
-drivers need not know or care about.
-
-The structure representing a PCI device, struct pci_dev, contains several fields
-that these callbacks operate on:
-
-struct pci_dev {
-	...
-	pci_power_t     current_state;  /* Current operating state. */
-	int		pm_cap;		/* PM capability offset in the
-					   configuration space */
-	unsigned int	pme_support:5;	/* Bitmask of states from which PME#
-					   can be generated */
-	unsigned int	pme_interrupt:1;/* Is native PCIe PME signaling used? */
-	unsigned int	d1_support:1;	/* Low power state D1 is supported */
-	unsigned int	d2_support:1;	/* Low power state D2 is supported */
-	unsigned int	no_d1d2:1;	/* D1 and D2 are forbidden */
-	unsigned int	wakeup_prepared:1;  /* Device prepared for wake up */
-	unsigned int	d3_delay;	/* D3->D0 transition time in ms */
-	...
-};
-
-They also indirectly use some fields of the struct device that is embedded in
-struct pci_dev.
-
-2.2. Device Initialization
---------------------------
-The PCI subsystem's first task related to device power management is to
-prepare the device for power management and initialize the fields of struct
-pci_dev used for this purpose.  This happens in two functions defined in
-drivers/pci/pci.c, pci_pm_init() and platform_pci_wakeup_init().
-
-The first of these functions checks if the device supports native PCI PM
-and if that's the case the offset of its power management capability structure
-in the configuration space is stored in the pm_cap field of the device's struct
-pci_dev object.  Next, the function checks which PCI low-power states are
-supported by the device and from which low-power states the device can generate
-native PCI PMEs.  The power management fields of the device's struct pci_dev and
-the struct device embedded in it are updated accordingly and the generation of
-PMEs by the device is disabled.
-
-The second function checks if the device can be prepared to signal wakeup with
-the help of the platform firmware, such as the ACPI BIOS.  If that is the case,
-the function updates the wakeup fields in struct device embedded in the
-device's struct pci_dev and uses the firmware-provided method to prevent the
-device from signaling wakeup.
-
-At this point the device is ready for power management.  For driverless devices,
-however, this functionality is limited to a few basic operations carried out
-during system-wide transitions to a sleep state and back to the working state.
-
-2.3. Runtime Device Power Management
-------------------------------------
-The PCI subsystem plays a vital role in the runtime power management of PCI
-devices.  For this purpose it uses the general runtime power management
-(runtime PM) framework described in Documentation/power/runtime_pm.txt.
-Namely, it provides subsystem-level callbacks:
-
-	pci_pm_runtime_suspend()
-	pci_pm_runtime_resume()
-	pci_pm_runtime_idle()
-
-that are executed by the core runtime PM routines.  It also implements the
-entire mechanics necessary for handling runtime wakeup signals from PCI devices
-in low-power states, which at the time of this writing works for both the native
-PCI Express PME signaling and the ACPI GPE-based wakeup signaling described in
-Section 1.
-
-First, a PCI device is put into a low-power state, or suspended, with the help
-of pm_schedule_suspend() or pm_runtime_suspend() which for PCI devices call
-pci_pm_runtime_suspend() to do the actual job.  For this to work, the device's
-driver has to provide a pm->runtime_suspend() callback (see below), which is
-run by pci_pm_runtime_suspend() as the first action.  If the driver's callback
-returns successfully, the device's standard configuration registers are saved,
-the device is prepared to generate wakeup signals and, finally, it is put into
-the target low-power state.
-
-The low-power state to put the device into is the lowest-power (highest number)
-state from which it can signal wakeup.  The exact method of signaling wakeup is
-system-dependent and is determined by the PCI subsystem on the basis of the
-reported capabilities of the device and the platform firmware.  To prepare the
-device for signaling wakeup and put it into the selected low-power state, the
-PCI subsystem can use the platform firmware as well as the device's native PCI
-PM capabilities, if supported.
-
-It is expected that the device driver's pm->runtime_suspend() callback will
-not attempt to prepare the device for signaling wakeup or to put it into a
-low-power state.  The driver ought to leave these tasks to the PCI subsystem
-that has all of the information necessary to perform them.
-
-A suspended device is brought back into the "active" state, or resumed,
-with the help of pm_request_resume() or pm_runtime_resume() which both call
-pci_pm_runtime_resume() for PCI devices.  Again, this only works if the device's
-driver provides a pm->runtime_resume() callback (see below).  However, before
-the driver's callback is executed, pci_pm_runtime_resume() brings the device
-back into the full-power state, prevents it from signaling wakeup while in that
-state and restores its standard configuration registers.  Thus the driver's
-callback need not worry about the PCI-specific aspects of the device resume.
-
-Note that generally pci_pm_runtime_resume() may be called in two different
-situations.  First, it may be called at the request of the device's driver, for
-example if there are some data for it to process.  Second, it may be called
-as a result of a wakeup signal from the device itself (this sometimes is
-referred to as "remote wakeup").  Of course, for this purpose the wakeup signal
-is handled in one of the ways described in Section 1 and finally converted into
-a notification for the PCI subsystem after the source device has been
-identified.
-
-The pci_pm_runtime_idle() function, called for PCI devices by pm_runtime_idle()
-and pm_request_idle(), executes the device driver's pm->runtime_idle()
-callback, if defined, and if that callback doesn't return an error code (or is
-not present at all), suspends the device with the help of pm_runtime_suspend().
-Sometimes pci_pm_runtime_idle() is called automatically by the PM core (for
-example, it is called right after the device has just been resumed), in which
-cases it is expected to suspend the device if that makes sense.  Usually,
-however, the PCI subsystem doesn't really know if the device really can be
-suspended, so it lets the device's driver decide by running its
-pm->runtime_idle() callback.
-
-2.4. System-Wide Power Transitions
-----------------------------------
-There are a few different types of system-wide power transitions, described in
-Documentation/driver-api/pm/devices.rst.  Each of them requires devices to be handled
-in a specific way and the PM core executes subsystem-level power management
-callbacks for this purpose.  They are executed in phases such that each phase
-involves executing the same subsystem-level callback for every device belonging
-to the given subsystem before the next phase begins.  These phases always run
-after tasks have been frozen.
-
-2.4.1. System Suspend
-
-When the system is going into a sleep state in which the contents of memory will
-be preserved, such as one of the ACPI sleep states S1-S3, the phases are:
-
-	prepare, suspend, suspend_noirq.
-
-The following PCI bus type's callbacks, respectively, are used in these phases:
-
-	pci_pm_prepare()
-	pci_pm_suspend()
-	pci_pm_suspend_noirq()
-
-The pci_pm_prepare() routine first puts the device into the "fully functional"
-state with the help of pm_runtime_resume().  Then, it executes the device
-driver's pm->prepare() callback if defined (i.e. if the driver's struct
-dev_pm_ops object is present and the prepare pointer in that object is valid).
-
-The pci_pm_suspend() routine first checks if the device's driver implements
-legacy PCI suspend routines (see Section 3), in which case the driver's legacy
-suspend callback is executed, if present, and its result is returned.  Next, if
-the device's driver doesn't provide a struct dev_pm_ops object (containing
-pointers to the driver's callbacks), pci_pm_default_suspend() is called, which
-simply turns off the device's bus master capability and runs
-pcibios_disable_device() to disable it, unless the device is a bridge (PCI
-bridges are ignored by this routine).  Next, the device driver's pm->suspend()
-callback is executed, if defined, and its result is returned if it fails.
-Finally, pci_fixup_device() is called to apply hardware suspend quirks related
-to the device if necessary.
-
-Note that the suspend phase is carried out asynchronously for PCI devices, so
-the pci_pm_suspend() callback may be executed in parallel for any pair of PCI
-devices that don't depend on each other in a known way (i.e. none of the paths
-in the device tree from the root bridge to a leaf device contains both of them).
-
-The pci_pm_suspend_noirq() routine is executed after suspend_device_irqs() has
-been called, which means that the device driver's interrupt handler won't be
-invoked while this routine is running.  It first checks if the device's driver
-implements legacy PCI suspend routines (Section 3), in which case the legacy
-late suspend routine is called and its result is returned (the standard
-configuration registers of the device are saved if the driver's callback hasn't
-done that).  Second, if the device driver's struct dev_pm_ops object is not
-present, the device's standard configuration registers are saved and the routine
-returns success.  Otherwise the device driver's pm->suspend_noirq() callback is
-executed, if present, and its result is returned if it fails.  Next, if the
-device's standard configuration registers haven't been saved yet (one of the
-device driver's callbacks executed before might do that), pci_pm_suspend_noirq()
-saves them, prepares the device to signal wakeup (if necessary) and puts it into
-a low-power state.
-
-The low-power state to put the device into is the lowest-power (highest number)
-state from which it can signal wakeup while the system is in the target sleep
-state.  Just like in the runtime PM case described above, the mechanism of
-signaling wakeup is system-dependent and determined by the PCI subsystem, which
-is also responsible for preparing the device to signal wakeup from the system's
-target sleep state as appropriate.
-
-PCI device drivers (that don't implement legacy power management callbacks) are
-generally not expected to prepare devices for signaling wakeup or to put them
-into low-power states.  However, if one of the driver's suspend callbacks
-(pm->suspend() or pm->suspend_noirq()) saves the device's standard configuration
-registers, pci_pm_suspend_noirq() will assume that the device has been prepared
-to signal wakeup and put into a low-power state by the driver (the driver is
-then assumed to have used the helper functions provided by the PCI subsystem for
-this purpose).  PCI device drivers are not encouraged to do that, but in some
-rare cases doing that in the driver may be the optimum approach.
-
-2.4.2. System Resume
-
-When the system is undergoing a transition from a sleep state in which the
-contents of memory have been preserved, such as one of the ACPI sleep states
-S1-S3, into the working state (ACPI S0), the phases are:
-
-	resume_noirq, resume, complete.
-
-The following PCI bus type's callbacks, respectively, are executed in these
-phases:
-
-	pci_pm_resume_noirq()
-	pci_pm_resume()
-	pci_pm_complete()
-
-The pci_pm_resume_noirq() routine first puts the device into the full-power
-state, restores its standard configuration registers and applies early resume
-hardware quirks related to the device, if necessary.  This is done
-unconditionally, regardless of whether or not the device's driver implements
-legacy PCI power management callbacks (this way all PCI devices are in the
-full-power state and their standard configuration registers have been restored
-when their interrupt handlers are invoked for the first time during resume,
-which allows the kernel to avoid problems with the handling of shared interrupts
-by drivers whose devices are still suspended).  If legacy PCI power management
-callbacks (see Section 3) are implemented by the device's driver, the legacy
-early resume callback is executed and its result is returned.  Otherwise, the
-device driver's pm->resume_noirq() callback is executed, if defined, and its
-result is returned.
-
-The pci_pm_resume() routine first checks if the device's standard configuration
-registers have been restored and restores them if that's not the case (this
-only is necessary in the error path during a failing suspend).  Next, resume
-hardware quirks related to the device are applied, if necessary, and if the
-device's driver implements legacy PCI power management callbacks (see
-Section 3), the driver's legacy resume callback is executed and its result is
-returned.  Otherwise, the device's wakeup signaling mechanisms are blocked and
-its driver's pm->resume() callback is executed, if defined (the callback's
-result is then returned).
-
-The resume phase is carried out asynchronously for PCI devices, like the
-suspend phase described above, which means that if two PCI devices don't depend
-on each other in a known way, the pci_pm_resume() routine may be executed for
-both of them in parallel.
-
-The pci_pm_complete() routine only executes the device driver's pm->complete()
-callback, if defined.
-
-2.4.3. System Hibernation
-
-System hibernation is more complicated than system suspend, because it requires
-a system image to be created and written into a persistent storage medium.  The
-image is created atomically and all devices are quiesced, or frozen, before that
-happens.
-
-The freezing of devices is carried out after enough memory has been freed (at
-the time of this writing the image creation requires at least 50% of system RAM
-to be free) in the following three phases:
-
-	prepare, freeze, freeze_noirq
-
-that correspond to the PCI bus type's callbacks:
-
-	pci_pm_prepare()
-	pci_pm_freeze()
-	pci_pm_freeze_noirq()
-
-This means that the prepare phase is exactly the same as for system suspend.
-The other two phases, however, are different.
-
-The pci_pm_freeze() routine is quite similar to pci_pm_suspend(), but it runs
-the device driver's pm->freeze() callback, if defined, instead of pm->suspend(),
-and it doesn't apply the suspend-related hardware quirks.  It is executed
-asynchronously for different PCI devices that don't depend on each other in a
-known way.
-
-The pci_pm_freeze_noirq() routine, in turn, is similar to
-pci_pm_suspend_noirq(), but it calls the device driver's pm->freeze_noirq()
-routine instead of pm->suspend_noirq().  It also doesn't attempt to prepare the
-device for signaling wakeup and put it into a low-power state.  Still, it saves
-the device's standard configuration registers if they haven't been saved by one
-of the driver's callbacks.
-
-Once the image has been created, it has to be saved.  However, at this point all
-devices are frozen and they cannot handle I/O, while their ability to handle
-I/O is obviously necessary for the image saving.  Thus they have to be brought
-back to the fully functional state and this is done in the following phases:
-
-	thaw_noirq, thaw, complete
-
-using the following PCI bus type's callbacks:
-
-	pci_pm_thaw_noirq()
-	pci_pm_thaw()
-	pci_pm_complete()
-
-respectively.
-
-The first of them, pci_pm_thaw_noirq(), is analogous to pci_pm_resume_noirq(),
-but it doesn't put the device into the full power state and doesn't attempt to
-restore its standard configuration registers.  It also executes the device
-driver's pm->thaw_noirq() callback, if defined, instead of pm->resume_noirq().
-
-The pci_pm_thaw() routine is similar to pci_pm_resume(), but it runs the device
-driver's pm->thaw() callback instead of pm->resume().  It is executed
-asynchronously for different PCI devices that don't depend on each other in a
-known way.
-
-The complete phase it the same as for system resume.
-
-After saving the image, devices need to be powered down before the system can
-enter the target sleep state (ACPI S4 for ACPI-based systems).  This is done in
-three phases:
-
-	prepare, poweroff, poweroff_noirq
-
-where the prepare phase is exactly the same as for system suspend.  The other
-two phases are analogous to the suspend and suspend_noirq phases, respectively.
-The PCI subsystem-level callbacks they correspond to
-
-	pci_pm_poweroff()
-	pci_pm_poweroff_noirq()
-
-work in analogy with pci_pm_suspend() and pci_pm_poweroff_noirq(), respectively,
-although they don't attempt to save the device's standard configuration
-registers.
-
-2.4.4. System Restore
-
-System restore requires a hibernation image to be loaded into memory and the
-pre-hibernation memory contents to be restored before the pre-hibernation system
-activity can be resumed.
-
-As described in Documentation/driver-api/pm/devices.rst, the hibernation image is loaded
-into memory by a fresh instance of the kernel, called the boot kernel, which in
-turn is loaded and run by a boot loader in the usual way.  After the boot kernel
-has loaded the image, it needs to replace its own code and data with the code
-and data of the "hibernated" kernel stored within the image, called the image
-kernel.  For this purpose all devices are frozen just like before creating
-the image during hibernation, in the
-
-	prepare, freeze, freeze_noirq
-
-phases described above.  However, the devices affected by these phases are only
-those having drivers in the boot kernel; other devices will still be in whatever
-state the boot loader left them.
-
-Should the restoration of the pre-hibernation memory contents fail, the boot
-kernel would go through the "thawing" procedure described above, using the
-thaw_noirq, thaw, and complete phases (that will only affect the devices having
-drivers in the boot kernel), and then continue running normally.
-
-If the pre-hibernation memory contents are restored successfully, which is the
-usual situation, control is passed to the image kernel, which then becomes
-responsible for bringing the system back to the working state.  To achieve this,
-it must restore the devices' pre-hibernation functionality, which is done much
-like waking up from the memory sleep state, although it involves different
-phases:
-
-	restore_noirq, restore, complete
-
-The first two of these are analogous to the resume_noirq and resume phases
-described above, respectively, and correspond to the following PCI subsystem
-callbacks:
-
-	pci_pm_restore_noirq()
-	pci_pm_restore()
-
-These callbacks work in analogy with pci_pm_resume_noirq() and pci_pm_resume(),
-respectively, but they execute the device driver's pm->restore_noirq() and
-pm->restore() callbacks, if available.
-
-The complete phase is carried out in exactly the same way as during system
-resume.
-
-
-3. PCI Device Drivers and Power Management
-==========================================
-
-3.1. Power Management Callbacks
--------------------------------
-PCI device drivers participate in power management by providing callbacks to be
-executed by the PCI subsystem's power management routines described above and by
-controlling the runtime power management of their devices.
-
-At the time of this writing there are two ways to define power management
-callbacks for a PCI device driver, the recommended one, based on using a
-dev_pm_ops structure described in Documentation/driver-api/pm/devices.rst, and the
-"legacy" one, in which the .suspend(), .suspend_late(), .resume_early(), and
-.resume() callbacks from struct pci_driver are used.  The legacy approach,
-however, doesn't allow one to define runtime power management callbacks and is
-not really suitable for any new drivers.  Therefore it is not covered by this
-document (refer to the source code to learn more about it).
-
-It is recommended that all PCI device drivers define a struct dev_pm_ops object
-containing pointers to power management (PM) callbacks that will be executed by
-the PCI subsystem's PM routines in various circumstances.  A pointer to the
-driver's struct dev_pm_ops object has to be assigned to the driver.pm field in
-its struct pci_driver object.  Once that has happened, the "legacy" PM callbacks
-in struct pci_driver are ignored (even if they are not NULL).
-
-The PM callbacks in struct dev_pm_ops are not mandatory and if they are not
-defined (i.e. the respective fields of struct dev_pm_ops are unset) the PCI
-subsystem will handle the device in a simplified default manner.  If they are
-defined, though, they are expected to behave as described in the following
-subsections.
-
-3.1.1. prepare()
-
-The prepare() callback is executed during system suspend, during hibernation
-(when a hibernation image is about to be created), during power-off after
-saving a hibernation image and during system restore, when a hibernation image
-has just been loaded into memory.
-
-This callback is only necessary if the driver's device has children that in
-general may be registered at any time.  In that case the role of the prepare()
-callback is to prevent new children of the device from being registered until
-one of the resume_noirq(), thaw_noirq(), or restore_noirq() callbacks is run.
-
-In addition to that the prepare() callback may carry out some operations
-preparing the device to be suspended, although it should not allocate memory
-(if additional memory is required to suspend the device, it has to be
-preallocated earlier, for example in a suspend/hibernate notifier as described
-in Documentation/driver-api/pm/notifiers.rst).
-
-3.1.2. suspend()
-
-The suspend() callback is only executed during system suspend, after prepare()
-callbacks have been executed for all devices in the system.
-
-This callback is expected to quiesce the device and prepare it to be put into a
-low-power state by the PCI subsystem.  It is not required (in fact it even is
-not recommended) that a PCI driver's suspend() callback save the standard
-configuration registers of the device, prepare it for waking up the system, or
-put it into a low-power state.  All of these operations can very well be taken
-care of by the PCI subsystem, without the driver's participation.
-
-However, in some rare case it is convenient to carry out these operations in
-a PCI driver.  Then, pci_save_state(), pci_prepare_to_sleep(), and
-pci_set_power_state() should be used to save the device's standard configuration
-registers, to prepare it for system wakeup (if necessary), and to put it into a
-low-power state, respectively.  Moreover, if the driver calls pci_save_state(),
-the PCI subsystem will not execute either pci_prepare_to_sleep(), or
-pci_set_power_state() for its device, so the driver is then responsible for
-handling the device as appropriate.
-
-While the suspend() callback is being executed, the driver's interrupt handler
-can be invoked to handle an interrupt from the device, so all suspend-related
-operations relying on the driver's ability to handle interrupts should be
-carried out in this callback.
-
-3.1.3. suspend_noirq()
-
-The suspend_noirq() callback is only executed during system suspend, after
-suspend() callbacks have been executed for all devices in the system and
-after device interrupts have been disabled by the PM core.
-
-The difference between suspend_noirq() and suspend() is that the driver's
-interrupt handler will not be invoked while suspend_noirq() is running.  Thus
-suspend_noirq() can carry out operations that would cause race conditions to
-arise if they were performed in suspend().
-
-3.1.4. freeze()
-
-The freeze() callback is hibernation-specific and is executed in two situations,
-during hibernation, after prepare() callbacks have been executed for all devices
-in preparation for the creation of a system image, and during restore,
-after a system image has been loaded into memory from persistent storage and the
-prepare() callbacks have been executed for all devices.
-
-The role of this callback is analogous to the role of the suspend() callback
-described above.  In fact, they only need to be different in the rare cases when
-the driver takes the responsibility for putting the device into a low-power
-state.
-
-In that cases the freeze() callback should not prepare the device system wakeup
-or put it into a low-power state.  Still, either it or freeze_noirq() should
-save the device's standard configuration registers using pci_save_state().
-
-3.1.5. freeze_noirq()
-
-The freeze_noirq() callback is hibernation-specific.  It is executed during
-hibernation, after prepare() and freeze() callbacks have been executed for all
-devices in preparation for the creation of a system image, and during restore,
-after a system image has been loaded into memory and after prepare() and
-freeze() callbacks have been executed for all devices.  It is always executed
-after device interrupts have been disabled by the PM core.
-
-The role of this callback is analogous to the role of the suspend_noirq()
-callback described above and it very rarely is necessary to define
-freeze_noirq().
-
-The difference between freeze_noirq() and freeze() is analogous to the
-difference between suspend_noirq() and suspend().
-
-3.1.6. poweroff()
-
-The poweroff() callback is hibernation-specific.  It is executed when the system
-is about to be powered off after saving a hibernation image to a persistent
-storage.  prepare() callbacks are executed for all devices before poweroff() is
-called.
-
-The role of this callback is analogous to the role of the suspend() and freeze()
-callbacks described above, although it does not need to save the contents of
-the device's registers.  In particular, if the driver wants to put the device
-into a low-power state itself instead of allowing the PCI subsystem to do that,
-the poweroff() callback should use pci_prepare_to_sleep() and
-pci_set_power_state() to prepare the device for system wakeup and to put it
-into a low-power state, respectively, but it need not save the device's standard
-configuration registers.
-
-3.1.7. poweroff_noirq()
-
-The poweroff_noirq() callback is hibernation-specific.  It is executed after
-poweroff() callbacks have been executed for all devices in the system.
-
-The role of this callback is analogous to the role of the suspend_noirq() and
-freeze_noirq() callbacks described above, but it does not need to save the
-contents of the device's registers.
-
-The difference between poweroff_noirq() and poweroff() is analogous to the
-difference between suspend_noirq() and suspend().
-
-3.1.8. resume_noirq()
-
-The resume_noirq() callback is only executed during system resume, after the
-PM core has enabled the non-boot CPUs.  The driver's interrupt handler will not
-be invoked while resume_noirq() is running, so this callback can carry out
-operations that might race with the interrupt handler.
-
-Since the PCI subsystem unconditionally puts all devices into the full power
-state in the resume_noirq phase of system resume and restores their standard
-configuration registers, resume_noirq() is usually not necessary.  In general
-it should only be used for performing operations that would lead to race
-conditions if carried out by resume().
-
-3.1.9. resume()
-
-The resume() callback is only executed during system resume, after
-resume_noirq() callbacks have been executed for all devices in the system and
-device interrupts have been enabled by the PM core.
-
-This callback is responsible for restoring the pre-suspend configuration of the
-device and bringing it back to the fully functional state.  The device should be
-able to process I/O in a usual way after resume() has returned.
-
-3.1.10. thaw_noirq()
-
-The thaw_noirq() callback is hibernation-specific.  It is executed after a
-system image has been created and the non-boot CPUs have been enabled by the PM
-core, in the thaw_noirq phase of hibernation.  It also may be executed if the
-loading of a hibernation image fails during system restore (it is then executed
-after enabling the non-boot CPUs).  The driver's interrupt handler will not be
-invoked while thaw_noirq() is running.
-
-The role of this callback is analogous to the role of resume_noirq().  The
-difference between these two callbacks is that thaw_noirq() is executed after
-freeze() and freeze_noirq(), so in general it does not need to modify the
-contents of the device's registers.
-
-3.1.11. thaw()
-
-The thaw() callback is hibernation-specific.  It is executed after thaw_noirq()
-callbacks have been executed for all devices in the system and after device
-interrupts have been enabled by the PM core.
-
-This callback is responsible for restoring the pre-freeze configuration of
-the device, so that it will work in a usual way after thaw() has returned.
-
-3.1.12. restore_noirq()
-
-The restore_noirq() callback is hibernation-specific.  It is executed in the
-restore_noirq phase of hibernation, when the boot kernel has passed control to
-the image kernel and the non-boot CPUs have been enabled by the image kernel's
-PM core.
-
-This callback is analogous to resume_noirq() with the exception that it cannot
-make any assumption on the previous state of the device, even if the BIOS (or
-generally the platform firmware) is known to preserve that state over a
-suspend-resume cycle.
-
-For the vast majority of PCI device drivers there is no difference between
-resume_noirq() and restore_noirq().
-
-3.1.13. restore()
-
-The restore() callback is hibernation-specific.  It is executed after
-restore_noirq() callbacks have been executed for all devices in the system and
-after the PM core has enabled device drivers' interrupt handlers to be invoked.
-
-This callback is analogous to resume(), just like restore_noirq() is analogous
-to resume_noirq().  Consequently, the difference between restore_noirq() and
-restore() is analogous to the difference between resume_noirq() and resume().
-
-For the vast majority of PCI device drivers there is no difference between
-resume() and restore().
-
-3.1.14. complete()
-
-The complete() callback is executed in the following situations:
-  - during system resume, after resume() callbacks have been executed for all
-    devices,
-  - during hibernation, before saving the system image, after thaw() callbacks
-    have been executed for all devices,
-  - during system restore, when the system is going back to its pre-hibernation
-    state, after restore() callbacks have been executed for all devices.
-It also may be executed if the loading of a hibernation image into memory fails
-(in that case it is run after thaw() callbacks have been executed for all
-devices that have drivers in the boot kernel).
-
-This callback is entirely optional, although it may be necessary if the
-prepare() callback performs operations that need to be reversed.
-
-3.1.15. runtime_suspend()
-
-The runtime_suspend() callback is specific to device runtime power management
-(runtime PM).  It is executed by the PM core's runtime PM framework when the
-device is about to be suspended (i.e. quiesced and put into a low-power state)
-at run time.
-
-This callback is responsible for freezing the device and preparing it to be
-put into a low-power state, but it must allow the PCI subsystem to perform all
-of the PCI-specific actions necessary for suspending the device.
-
-3.1.16. runtime_resume()
-
-The runtime_resume() callback is specific to device runtime PM.  It is executed
-by the PM core's runtime PM framework when the device is about to be resumed
-(i.e. put into the full-power state and programmed to process I/O normally) at
-run time.
-
-This callback is responsible for restoring the normal functionality of the
-device after it has been put into the full-power state by the PCI subsystem.
-The device is expected to be able to process I/O in the usual way after
-runtime_resume() has returned.
-
-3.1.17. runtime_idle()
-
-The runtime_idle() callback is specific to device runtime PM.  It is executed
-by the PM core's runtime PM framework whenever it may be desirable to suspend
-the device according to the PM core's information.  In particular, it is
-automatically executed right after runtime_resume() has returned in case the
-resume of the device has happened as a result of a spurious event.
-
-This callback is optional, but if it is not implemented or if it returns 0, the
-PCI subsystem will call pm_runtime_suspend() for the device, which in turn will
-cause the driver's runtime_suspend() callback to be executed.
-
-3.1.18. Pointing Multiple Callback Pointers to One Routine
-
-Although in principle each of the callbacks described in the previous
-subsections can be defined as a separate function, it often is convenient to
-point two or more members of struct dev_pm_ops to the same routine.  There are
-a few convenience macros that can be used for this purpose.
-
-The SIMPLE_DEV_PM_OPS macro declares a struct dev_pm_ops object with one
-suspend routine pointed to by the .suspend(), .freeze(), and .poweroff()
-members and one resume routine pointed to by the .resume(), .thaw(), and
-.restore() members.  The other function pointers in this struct dev_pm_ops are
-unset.
-
-The UNIVERSAL_DEV_PM_OPS macro is similar to SIMPLE_DEV_PM_OPS, but it
-additionally sets the .runtime_resume() pointer to the same value as
-.resume() (and .thaw(), and .restore()) and the .runtime_suspend() pointer to
-the same value as .suspend() (and .freeze() and .poweroff()).
-
-The SET_SYSTEM_SLEEP_PM_OPS can be used inside of a declaration of struct
-dev_pm_ops to indicate that one suspend routine is to be pointed to by the
-.suspend(), .freeze(), and .poweroff() members and one resume routine is to
-be pointed to by the .resume(), .thaw(), and .restore() members.
-
-3.1.19. Driver Flags for Power Management
-
-The PM core allows device drivers to set flags that influence the handling of
-power management for the devices by the core itself and by middle layer code
-including the PCI bus type.  The flags should be set once at the driver probe
-time with the help of the dev_pm_set_driver_flags() function and they should not
-be updated directly afterwards.
-
-The DPM_FLAG_NEVER_SKIP flag prevents the PM core from using the direct-complete
-mechanism allowing device suspend/resume callbacks to be skipped if the device
-is in runtime suspend when the system suspend starts.  That also affects all of
-the ancestors of the device, so this flag should only be used if absolutely
-necessary.
-
-The DPM_FLAG_SMART_PREPARE flag instructs the PCI bus type to only return a
-positive value from pci_pm_prepare() if the ->prepare callback provided by the
-driver of the device returns a positive value.  That allows the driver to opt
-out from using the direct-complete mechanism dynamically.
-
-The DPM_FLAG_SMART_SUSPEND flag tells the PCI bus type that from the driver's
-perspective the device can be safely left in runtime suspend during system
-suspend.  That causes pci_pm_suspend(), pci_pm_freeze() and pci_pm_poweroff()
-to skip resuming the device from runtime suspend unless there are PCI-specific
-reasons for doing that.  Also, it causes pci_pm_suspend_late/noirq(),
-pci_pm_freeze_late/noirq() and pci_pm_poweroff_late/noirq() to return early
-if the device remains in runtime suspend in the beginning of the "late" phase
-of the system-wide transition under way.  Moreover, if the device is in
-runtime suspend in pci_pm_resume_noirq() or pci_pm_restore_noirq(), its runtime
-power management status will be changed to "active" (as it is going to be put
-into D0 going forward), but if it is in runtime suspend in pci_pm_thaw_noirq(),
-the function will set the power.direct_complete flag for it (to make the PM core
-skip the subsequent "thaw" callbacks for it) and return.
-
-Setting the DPM_FLAG_LEAVE_SUSPENDED flag means that the driver prefers the
-device to be left in suspend after system-wide transitions to the working state.
-This flag is checked by the PM core, but the PCI bus type informs the PM core
-which devices may be left in suspend from its perspective (that happens during
-the "noirq" phase of system-wide suspend and analogous transitions) and next it
-uses the dev_pm_may_skip_resume() helper to decide whether or not to return from
-pci_pm_resume_noirq() early, as the PM core will skip the remaining resume
-callbacks for the device during the transition under way and will set its
-runtime PM status to "suspended" if dev_pm_may_skip_resume() returns "true" for
-it.
-
-3.2. Device Runtime Power Management
-------------------------------------
-In addition to providing device power management callbacks PCI device drivers
-are responsible for controlling the runtime power management (runtime PM) of
-their devices.
-
-The PCI device runtime PM is optional, but it is recommended that PCI device
-drivers implement it at least in the cases where there is a reliable way of
-verifying that the device is not used (like when the network cable is detached
-from an Ethernet adapter or there are no devices attached to a USB controller).
-
-To support the PCI runtime PM the driver first needs to implement the
-runtime_suspend() and runtime_resume() callbacks.  It also may need to implement
-the runtime_idle() callback to prevent the device from being suspended again
-every time right after the runtime_resume() callback has returned
-(alternatively, the runtime_suspend() callback will have to check if the
-device should really be suspended and return -EAGAIN if that is not the case).
-
-The runtime PM of PCI devices is enabled by default by the PCI core.  PCI
-device drivers do not need to enable it and should not attempt to do so.
-However, it is blocked by pci_pm_init() that runs the pm_runtime_forbid()
-helper function.  In addition to that, the runtime PM usage counter of
-each PCI device is incremented by local_pci_probe() before executing the
-probe callback provided by the device's driver.
-
-If a PCI driver implements the runtime PM callbacks and intends to use the
-runtime PM framework provided by the PM core and the PCI subsystem, it needs
-to decrement the device's runtime PM usage counter in its probe callback
-function.  If it doesn't do that, the counter will always be different from
-zero for the device and it will never be runtime-suspended.  The simplest
-way to do that is by calling pm_runtime_put_noidle(), but if the driver
-wants to schedule an autosuspend right away, for example, it may call
-pm_runtime_put_autosuspend() instead for this purpose.  Generally, it
-just needs to call a function that decrements the devices usage counter
-from its probe routine to make runtime PM work for the device.
-
-It is important to remember that the driver's runtime_suspend() callback
-may be executed right after the usage counter has been decremented, because
-user space may already have caused the pm_runtime_allow() helper function
-unblocking the runtime PM of the device to run via sysfs, so the driver must
-be prepared to cope with that.
-
-The driver itself should not call pm_runtime_allow(), though.  Instead, it
-should let user space or some platform-specific code do that (user space can
-do it via sysfs as stated above), but it must be prepared to handle the
-runtime PM of the device correctly as soon as pm_runtime_allow() is called
-(which may happen at any time, even before the driver is loaded).
-
-When the driver's remove callback runs, it has to balance the decrementation
-of the device's runtime PM usage counter at the probe time.  For this reason,
-if it has decremented the counter in its probe callback, it must run
-pm_runtime_get_noresume() in its remove callback.  [Since the core carries
-out a runtime resume of the device and bumps up the device's usage counter
-before running the driver's remove callback, the runtime PM of the device
-is effectively disabled for the duration of the remove execution and all
-runtime PM helper functions incrementing the device's usage counter are
-then effectively equivalent to pm_runtime_get_noresume().]
-
-The runtime PM framework works by processing requests to suspend or resume
-devices, or to check if they are idle (in which cases it is reasonable to
-subsequently request that they be suspended).  These requests are represented
-by work items put into the power management workqueue, pm_wq.  Although there
-are a few situations in which power management requests are automatically
-queued by the PM core (for example, after processing a request to resume a
-device the PM core automatically queues a request to check if the device is
-idle), device drivers are generally responsible for queuing power management
-requests for their devices.  For this purpose they should use the runtime PM
-helper functions provided by the PM core, discussed in
-Documentation/power/runtime_pm.txt.
-
-Devices can also be suspended and resumed synchronously, without placing a
-request into pm_wq.  In the majority of cases this also is done by their
-drivers that use helper functions provided by the PM core for this purpose.
-
-For more information on the runtime PM of devices refer to
-Documentation/power/runtime_pm.txt.
-
-
-4. Resources
-============
-
-PCI Local Bus Specification, Rev. 3.0
-PCI Bus Power Management Interface Specification, Rev. 1.2
-Advanced Configuration and Power Interface (ACPI) Specification, Rev. 3.0b
-PCI Express Base Specification, Rev. 2.0
-Documentation/driver-api/pm/devices.rst
-Documentation/power/runtime_pm.txt
diff --git a/Documentation/power/pm_qos_interface.rst b/Documentation/power/pm_qos_interface.rst
new file mode 100644
index 000000000000..945fc6d760c9
--- /dev/null
+++ b/Documentation/power/pm_qos_interface.rst
@@ -0,0 +1,225 @@
+===============================
+PM Quality Of Service Interface
+===============================
+
+This interface provides a kernel and user mode interface for registering
+performance expectations by drivers, subsystems and user space applications on
+one of the parameters.
+
+Two different PM QoS frameworks are available:
+1. PM QoS classes for cpu_dma_latency, network_latency, network_throughput,
+memory_bandwidth.
+2. the per-device PM QoS framework provides the API to manage the per-device latency
+constraints and PM QoS flags.
+
+Each parameter has defined units:
+
+ * latency: usec
+ * timeout: usec
+ * throughput: kbs (kilo bit / sec)
+ * memory bandwidth: mbs (mega bit / sec)
+
+
+1. PM QoS framework
+===================
+
+The infrastructure exposes multiple misc device nodes, one per implemented
+parameter.  The set of parameters implemented is defined by pm_qos_power_init()
+and pm_qos_params.h.  This is done because having the available parameters
+being runtime configurable or changeable from a driver was seen as too easy to
+abuse.
+
+For each parameter a list of performance requests is maintained along with
+an aggregated target value.  The aggregated target value is updated with
+changes to the request list or elements of the list.  Typically the
+aggregated target value is simply the max or min of the request values held
+in the parameter list elements.
+Note: the aggregated target value is implemented as an atomic variable so that
+reading the aggregated value does not require any locking mechanism.
+
+
+From kernel mode the use of this interface is simple:
+
+void pm_qos_add_request(handle, param_class, target_value):
+  Will insert an element into the list for that identified PM QoS class with the
+  target value.  Upon change to this list the new target is recomputed and any
+  registered notifiers are called only if the target value is now different.
+  Clients of pm_qos need to save the handle for future use in other
+  pm_qos API functions.
+
+void pm_qos_update_request(handle, new_target_value):
+  Will update the list element pointed to by the handle with the new target value
+  and recompute the new aggregated target, calling the notification tree if the
+  target is changed.
+
+void pm_qos_remove_request(handle):
+  Will remove the element.  After removal it will update the aggregate target and
+  call the notification tree if the target was changed as a result of removing
+  the request.
+
+int pm_qos_request(param_class):
+  Returns the aggregated value for a given PM QoS class.
+
+int pm_qos_request_active(handle):
+  Returns if the request is still active, i.e. it has not been removed from a
+  PM QoS class constraints list.
+
+int pm_qos_add_notifier(param_class, notifier):
+  Adds a notification callback function to the PM QoS class. The callback is
+  called when the aggregated value for the PM QoS class is changed.
+
+int pm_qos_remove_notifier(int param_class, notifier):
+  Removes the notification callback function for the PM QoS class.
+
+
+From user mode:
+
+Only processes can register a pm_qos request.  To provide for automatic
+cleanup of a process, the interface requires the process to register its
+parameter requests in the following way:
+
+To register the default pm_qos target for the specific parameter, the process
+must open one of /dev/[cpu_dma_latency, network_latency, network_throughput]
+
+As long as the device node is held open that process has a registered
+request on the parameter.
+
+To change the requested target value the process needs to write an s32 value to
+the open device node.  Alternatively the user mode program could write a hex
+string for the value using 10 char long format e.g. "0x12345678".  This
+translates to a pm_qos_update_request call.
+
+To remove the user mode request for a target value simply close the device
+node.
+
+
+2. PM QoS per-device latency and flags framework
+================================================
+
+For each device, there are three lists of PM QoS requests. Two of them are
+maintained along with the aggregated targets of resume latency and active
+state latency tolerance (in microseconds) and the third one is for PM QoS flags.
+Values are updated in response to changes of the request list.
+
+The target values of resume latency and active state latency tolerance are
+simply the minimum of the request values held in the parameter list elements.
+The PM QoS flags aggregate value is a gather (bitwise OR) of all list elements'
+values.  One device PM QoS flag is defined currently: PM_QOS_FLAG_NO_POWER_OFF.
+
+Note: The aggregated target values are implemented in such a way that reading
+the aggregated value does not require any locking mechanism.
+
+
+From kernel mode the use of this interface is the following:
+
+int dev_pm_qos_add_request(device, handle, type, value):
+  Will insert an element into the list for that identified device with the
+  target value.  Upon change to this list the new target is recomputed and any
+  registered notifiers are called only if the target value is now different.
+  Clients of dev_pm_qos need to save the handle for future use in other
+  dev_pm_qos API functions.
+
+int dev_pm_qos_update_request(handle, new_value):
+  Will update the list element pointed to by the handle with the new target
+  value and recompute the new aggregated target, calling the notification
+  trees if the target is changed.
+
+int dev_pm_qos_remove_request(handle):
+  Will remove the element.  After removal it will update the aggregate target
+  and call the notification trees if the target was changed as a result of
+  removing the request.
+
+s32 dev_pm_qos_read_value(device):
+  Returns the aggregated value for a given device's constraints list.
+
+enum pm_qos_flags_status dev_pm_qos_flags(device, mask)
+  Check PM QoS flags of the given device against the given mask of flags.
+  The meaning of the return values is as follows:
+
+	PM_QOS_FLAGS_ALL:
+		All flags from the mask are set
+	PM_QOS_FLAGS_SOME:
+		Some flags from the mask are set
+	PM_QOS_FLAGS_NONE:
+		No flags from the mask are set
+	PM_QOS_FLAGS_UNDEFINED:
+		The device's PM QoS structure has not been initialized
+		or the list of requests is empty.
+
+int dev_pm_qos_add_ancestor_request(dev, handle, type, value)
+  Add a PM QoS request for the first direct ancestor of the given device whose
+  power.ignore_children flag is unset (for DEV_PM_QOS_RESUME_LATENCY requests)
+  or whose power.set_latency_tolerance callback pointer is not NULL (for
+  DEV_PM_QOS_LATENCY_TOLERANCE requests).
+
+int dev_pm_qos_expose_latency_limit(device, value)
+  Add a request to the device's PM QoS list of resume latency constraints and
+  create a sysfs attribute pm_qos_resume_latency_us under the device's power
+  directory allowing user space to manipulate that request.
+
+void dev_pm_qos_hide_latency_limit(device)
+  Drop the request added by dev_pm_qos_expose_latency_limit() from the device's
+  PM QoS list of resume latency constraints and remove sysfs attribute
+  pm_qos_resume_latency_us from the device's power directory.
+
+int dev_pm_qos_expose_flags(device, value)
+  Add a request to the device's PM QoS list of flags and create sysfs attribute
+  pm_qos_no_power_off under the device's power directory allowing user space to
+  change the value of the PM_QOS_FLAG_NO_POWER_OFF flag.
+
+void dev_pm_qos_hide_flags(device)
+  Drop the request added by dev_pm_qos_expose_flags() from the device's PM QoS list
+  of flags and remove sysfs attribute pm_qos_no_power_off from the device's power
+  directory.
+
+Notification mechanisms:
+
+The per-device PM QoS framework has a per-device notification tree.
+
+int dev_pm_qos_add_notifier(device, notifier):
+  Adds a notification callback function for the device.
+  The callback is called when the aggregated value of the device constraints list
+  is changed (for resume latency device PM QoS only).
+
+int dev_pm_qos_remove_notifier(device, notifier):
+  Removes the notification callback function for the device.
+
+
+Active state latency tolerance
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+This device PM QoS type is used to support systems in which hardware may switch
+to energy-saving operation modes on the fly.  In those systems, if the operation
+mode chosen by the hardware attempts to save energy in an overly aggressive way,
+it may cause excess latencies to be visible to software, causing it to miss
+certain protocol requirements or target frame or sample rates etc.
+
+If there is a latency tolerance control mechanism for a given device available
+to software, the .set_latency_tolerance callback in that device's dev_pm_info
+structure should be populated.  The routine pointed to by it should implement
+whatever is necessary to transfer the effective requirement value to the
+hardware.
+
+Whenever the effective latency tolerance changes for the device, its
+.set_latency_tolerance() callback will be executed and the effective value will
+be passed to it.  If that value is negative, which means that the list of
+latency tolerance requirements for the device is empty, the callback is expected
+to switch the underlying hardware latency tolerance control mechanism to an
+autonomous mode if available.  If that value is PM_QOS_LATENCY_ANY, in turn, and
+the hardware supports a special "no requirement" setting, the callback is
+expected to use it.  That allows software to prevent the hardware from
+automatically updating the device's latency tolerance in response to its power
+state changes (e.g. during transitions from D3cold to D0), which generally may
+be done in the autonomous latency tolerance control mode.
+
+If .set_latency_tolerance() is present for the device, sysfs attribute
+pm_qos_latency_tolerance_us will be present in the device's power directory.
+Then, user space can use that attribute to specify its latency tolerance
+requirement for the device, if any.  Writing "any" to it means "no requirement,
+but do not let the hardware control latency tolerance" and writing "auto" to it
+allows the hardware to be switched to the autonomous mode if there are no other
+requirements from the kernel side in the device's list.
+
+Kernel code can use the functions described above along with the
+DEV_PM_QOS_LATENCY_TOLERANCE device PM QoS type to add, remove and update
+latency tolerance requirements for devices.
diff --git a/Documentation/power/pm_qos_interface.txt b/Documentation/power/pm_qos_interface.txt
deleted file mode 100644
index 19c5f7b1a7ba..000000000000
--- a/Documentation/power/pm_qos_interface.txt
+++ /dev/null
@@ -1,212 +0,0 @@
-PM Quality Of Service Interface.
-
-This interface provides a kernel and user mode interface for registering
-performance expectations by drivers, subsystems and user space applications on
-one of the parameters.
-
-Two different PM QoS frameworks are available:
-1. PM QoS classes for cpu_dma_latency, network_latency, network_throughput,
-memory_bandwidth.
-2. the per-device PM QoS framework provides the API to manage the per-device latency
-constraints and PM QoS flags.
-
-Each parameters have defined units:
- * latency: usec
- * timeout: usec
- * throughput: kbs (kilo bit / sec)
- * memory bandwidth: mbs (mega bit / sec)
-
-
-1. PM QoS framework
-
-The infrastructure exposes multiple misc device nodes one per implemented
-parameter.  The set of parameters implement is defined by pm_qos_power_init()
-and pm_qos_params.h.  This is done because having the available parameters
-being runtime configurable or changeable from a driver was seen as too easy to
-abuse.
-
-For each parameter a list of performance requests is maintained along with
-an aggregated target value.  The aggregated target value is updated with
-changes to the request list or elements of the list.  Typically the
-aggregated target value is simply the max or min of the request values held
-in the parameter list elements.
-Note: the aggregated target value is implemented as an atomic variable so that
-reading the aggregated value does not require any locking mechanism.
-
-
-From kernel mode the use of this interface is simple:
-
-void pm_qos_add_request(handle, param_class, target_value):
-Will insert an element into the list for that identified PM QoS class with the
-target value.  Upon change to this list the new target is recomputed and any
-registered notifiers are called only if the target value is now different.
-Clients of pm_qos need to save the returned handle for future use in other
-pm_qos API functions.
-
-void pm_qos_update_request(handle, new_target_value):
-Will update the list element pointed to by the handle with the new target value
-and recompute the new aggregated target, calling the notification tree if the
-target is changed.
-
-void pm_qos_remove_request(handle):
-Will remove the element.  After removal it will update the aggregate target and
-call the notification tree if the target was changed as a result of removing
-the request.
-
-int pm_qos_request(param_class):
-Returns the aggregated value for a given PM QoS class.
-
-int pm_qos_request_active(handle):
-Returns if the request is still active, i.e. it has not been removed from a
-PM QoS class constraints list.
-
-int pm_qos_add_notifier(param_class, notifier):
-Adds a notification callback function to the PM QoS class. The callback is
-called when the aggregated value for the PM QoS class is changed.
-
-int pm_qos_remove_notifier(int param_class, notifier):
-Removes the notification callback function for the PM QoS class.
-
-
-From user mode:
-Only processes can register a pm_qos request.  To provide for automatic
-cleanup of a process, the interface requires the process to register its
-parameter requests in the following way:
-
-To register the default pm_qos target for the specific parameter, the process
-must open one of /dev/[cpu_dma_latency, network_latency, network_throughput]
-
-As long as the device node is held open that process has a registered
-request on the parameter.
-
-To change the requested target value the process needs to write an s32 value to
-the open device node.  Alternatively the user mode program could write a hex
-string for the value using 10 char long format e.g. "0x12345678".  This
-translates to a pm_qos_update_request call.
-
-To remove the user mode request for a target value simply close the device
-node.
-
-
-2. PM QoS per-device latency and flags framework
-
-For each device, there are three lists of PM QoS requests. Two of them are
-maintained along with the aggregated targets of resume latency and active
-state latency tolerance (in microseconds) and the third one is for PM QoS flags.
-Values are updated in response to changes of the request list.
-
-The target values of resume latency and active state latency tolerance are
-simply the minimum of the request values held in the parameter list elements.
-The PM QoS flags aggregate value is a gather (bitwise OR) of all list elements'
-values.  One device PM QoS flag is defined currently: PM_QOS_FLAG_NO_POWER_OFF.
-
-Note: The aggregated target values are implemented in such a way that reading
-the aggregated value does not require any locking mechanism.
-
-
-From kernel mode the use of this interface is the following:
-
-int dev_pm_qos_add_request(device, handle, type, value):
-Will insert an element into the list for that identified device with the
-target value.  Upon change to this list the new target is recomputed and any
-registered notifiers are called only if the target value is now different.
-Clients of dev_pm_qos need to save the handle for future use in other
-dev_pm_qos API functions.
-
-int dev_pm_qos_update_request(handle, new_value):
-Will update the list element pointed to by the handle with the new target value
-and recompute the new aggregated target, calling the notification trees if the
-target is changed.
-
-int dev_pm_qos_remove_request(handle):
-Will remove the element.  After removal it will update the aggregate target and
-call the notification trees if the target was changed as a result of removing
-the request.
-
-s32 dev_pm_qos_read_value(device):
-Returns the aggregated value for a given device's constraints list.
-
-enum pm_qos_flags_status dev_pm_qos_flags(device, mask)
-Check PM QoS flags of the given device against the given mask of flags.
-The meaning of the return values is as follows:
-	PM_QOS_FLAGS_ALL: All flags from the mask are set
-	PM_QOS_FLAGS_SOME: Some flags from the mask are set
-	PM_QOS_FLAGS_NONE: No flags from the mask are set
-	PM_QOS_FLAGS_UNDEFINED: The device's PM QoS structure has not been
-			initialized or the list of requests is empty.
-
-int dev_pm_qos_add_ancestor_request(dev, handle, type, value)
-Add a PM QoS request for the first direct ancestor of the given device whose
-power.ignore_children flag is unset (for DEV_PM_QOS_RESUME_LATENCY requests)
-or whose power.set_latency_tolerance callback pointer is not NULL (for
-DEV_PM_QOS_LATENCY_TOLERANCE requests).
-
-int dev_pm_qos_expose_latency_limit(device, value)
-Add a request to the device's PM QoS list of resume latency constraints and
-create a sysfs attribute pm_qos_resume_latency_us under the device's power
-directory allowing user space to manipulate that request.
-
-void dev_pm_qos_hide_latency_limit(device)
-Drop the request added by dev_pm_qos_expose_latency_limit() from the device's
-PM QoS list of resume latency constraints and remove sysfs attribute
-pm_qos_resume_latency_us from the device's power directory.
-
-int dev_pm_qos_expose_flags(device, value)
-Add a request to the device's PM QoS list of flags and create sysfs attribute
-pm_qos_no_power_off under the device's power directory allowing user space to
-change the value of the PM_QOS_FLAG_NO_POWER_OFF flag.
-
-void dev_pm_qos_hide_flags(device)
-Drop the request added by dev_pm_qos_expose_flags() from the device's PM QoS list
-of flags and remove sysfs attribute pm_qos_no_power_off from the device's power
-directory.
-
-Notification mechanisms:
-The per-device PM QoS framework has a per-device notification tree.
-
-int dev_pm_qos_add_notifier(device, notifier):
-Adds a notification callback function for the device.
-The callback is called when the aggregated value of the device constraints list
-is changed (for resume latency device PM QoS only).
-
-int dev_pm_qos_remove_notifier(device, notifier):
-Removes the notification callback function for the device.
-
-
-Active state latency tolerance
-
-This device PM QoS type is used to support systems in which hardware may switch
-to energy-saving operation modes on the fly.  In those systems, if the operation
-mode chosen by the hardware attempts to save energy in an overly aggressive way,
-it may cause excess latencies to be visible to software, causing it to miss
-certain protocol requirements or target frame or sample rates etc.
-
-If there is a latency tolerance control mechanism for a given device available
-to software, the .set_latency_tolerance callback in that device's dev_pm_info
-structure should be populated.  The routine pointed to by it is should implement
-whatever is necessary to transfer the effective requirement value to the
-hardware.
-
-Whenever the effective latency tolerance changes for the device, its
-.set_latency_tolerance() callback will be executed and the effective value will
-be passed to it.  If that value is negative, which means that the list of
-latency tolerance requirements for the device is empty, the callback is expected
-to switch the underlying hardware latency tolerance control mechanism to an
-autonomous mode if available.  If that value is PM_QOS_LATENCY_ANY, in turn, and
-the hardware supports a special "no requirement" setting, the callback is
-expected to use it.  That allows software to prevent the hardware from
-automatically updating the device's latency tolerance in response to its power
-state changes (e.g. during transitions from D3cold to D0), which generally may
-be done in the autonomous latency tolerance control mode.
-
-If .set_latency_tolerance() is present for the device, sysfs attribute
-pm_qos_latency_tolerance_us will be present in the devivce's power directory.
-Then, user space can use that attribute to specify its latency tolerance
-requirement for the device, if any.  Writing "any" to it means "no requirement,
-but do not let the hardware control latency tolerance" and writing "auto" to it
-allows the hardware to be switched to the autonomous mode if there are no other
-requirements from the kernel side in the device's list.
-
-Kernel code can use the functions described above along with the
-DEV_PM_QOS_LATENCY_TOLERANCE device PM QoS type to add, remove and update
-latency tolerance requirements for devices.
diff --git a/Documentation/power/power_supply_class.rst b/Documentation/power/power_supply_class.rst
new file mode 100644
index 000000000000..3f2c3fe38a61
--- /dev/null
+++ b/Documentation/power/power_supply_class.rst
@@ -0,0 +1,282 @@
+========================
+Linux power supply class
+========================
+
+Synopsis
+~~~~~~~~
+Power supply class used to represent battery, UPS, AC or DC power supply
+properties to user-space.
+
+It defines core set of attributes, which should be applicable to (almost)
+every power supply out there. Attributes are available via sysfs and uevent
+interfaces.
+
+Each attribute has well defined meaning, up to unit of measure used. While
+the attributes provided are believed to be universally applicable to any
+power supply, specific monitoring hardware may not be able to provide them
+all, so any of them may be skipped.
+
+Power supply class is extensible, and allows to define drivers own attributes.
+The core attribute set is subject to the standard Linux evolution (i.e.
+if it will be found that some attribute is applicable to many power supply
+types or their drivers, it can be added to the core set).
+
+It also integrates with LED framework, for the purpose of providing
+typically expected feedback of battery charging/fully charged status and
+AC/USB power supply online status. (Note that specific details of the
+indication (including whether to use it at all) are fully controllable by
+user and/or specific machine defaults, per design principles of LED
+framework).
+
+
+Attributes/properties
+~~~~~~~~~~~~~~~~~~~~~
+Power supply class has predefined set of attributes, this eliminates code
+duplication across drivers. Power supply class insists on reusing its
+predefined attributes *and* their units.
+
+So, userspace gets predictable set of attributes and their units for any
+kind of power supply, and can process/present them to a user in consistent
+manner. Results for different power supplies and machines are also directly
+comparable.
+
+See drivers/power/supply/ds2760_battery.c and drivers/power/supply/pda_power.c
+for the example how to declare and handle attributes.
+
+
+Units
+~~~~~
+Quoting include/linux/power_supply.h:
+
+  All voltages, currents, charges, energies, time and temperatures in µV,
+  µA, µAh, µWh, seconds and tenths of degree Celsius unless otherwise
+  stated. It's driver's job to convert its raw values to units in which
+  this class operates.
+
+
+Attributes/properties detailed
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
++--------------------------------------------------------------------------+
+|               **Charge/Energy/Capacity - how to not confuse**            |
++--------------------------------------------------------------------------+
+| **Because both "charge" (µAh) and "energy" (µWh) represent "capacity"    |
+| of battery, this class distinguishes these terms. Don't mix them!**      |
+|                                                                          |
+| - `CHARGE_*`                                                             |
+|	attributes represent capacity in µAh only.                         |
+| - `ENERGY_*`                                                             |
+|	attributes represent capacity in µWh only.                         |
+| - `CAPACITY`                                                             |
+|	attribute represents capacity in *percents*, from 0 to 100.        |
++--------------------------------------------------------------------------+
+
+Postfixes:
+
+_AVG
+  *hardware* averaged value, use it if your hardware is really able to
+  report averaged values.
+_NOW
+  momentary/instantaneous values.
+
+STATUS
+  this attribute represents operating status (charging, full,
+  discharging (i.e. powering a load), etc.). This corresponds to
+  `BATTERY_STATUS_*` values, as defined in battery.h.
+
+CHARGE_TYPE
+  batteries can typically charge at different rates.
+  This defines trickle and fast charges.  For batteries that
+  are already charged or discharging, 'n/a' can be displayed (or
+  'unknown', if the status is not known).
+
+AUTHENTIC
+  indicates the power supply (battery or charger) connected
+  to the platform is authentic(1) or non authentic(0).
+
+HEALTH
+  represents health of the battery, values correspond to
+  POWER_SUPPLY_HEALTH_*, defined in battery.h.
+
+VOLTAGE_OCV
+  open circuit voltage of the battery.
+
+VOLTAGE_MAX_DESIGN, VOLTAGE_MIN_DESIGN
+  design values for maximal and minimal power supply voltages.
+  Maximal/minimal means values of voltages when battery considered
+  "full"/"empty" at normal conditions. Yes, there is no direct relation
+  between voltage and battery capacity, but some dumb
+  batteries use voltage for very approximated calculation of capacity.
+  Battery driver also can use this attribute just to inform userspace
+  about maximal and minimal voltage thresholds of a given battery.
+
+VOLTAGE_MAX, VOLTAGE_MIN
+  same as _DESIGN voltage values except that these ones should be used
+  if hardware could only guess (measure and retain) the thresholds of a
+  given power supply.
+
+VOLTAGE_BOOT
+  Reports the voltage measured during boot
+
+CURRENT_BOOT
+  Reports the current measured during boot
+
+CHARGE_FULL_DESIGN, CHARGE_EMPTY_DESIGN
+  design charge values, when battery considered full/empty.
+
+ENERGY_FULL_DESIGN, ENERGY_EMPTY_DESIGN
+  same as above but for energy.
+
+CHARGE_FULL, CHARGE_EMPTY
+  These attributes mean "last remembered value of charge when battery
+  became full/empty". It also could mean "value of charge when battery
+  considered full/empty at given conditions (temperature, age)".
+  I.e. these attributes represent real thresholds, not design values.
+
+ENERGY_FULL, ENERGY_EMPTY
+  same as above but for energy.
+
+CHARGE_COUNTER
+  the current charge counter (in µAh).  This could easily
+  be negative; there is no empty or full value.  It is only useful for
+  relative, time-based measurements.
+
+PRECHARGE_CURRENT
+  the maximum charge current during precharge phase of charge cycle
+  (typically 20% of battery capacity).
+
+CHARGE_TERM_CURRENT
+  Charge termination current. The charge cycle terminates when battery
+  voltage is above recharge threshold, and charge current is below
+  this setting (typically 10% of battery capacity).
+
+CONSTANT_CHARGE_CURRENT
+  constant charge current programmed by charger.
+
+
+CONSTANT_CHARGE_CURRENT_MAX
+  maximum charge current supported by the power supply object.
+
+CONSTANT_CHARGE_VOLTAGE
+  constant charge voltage programmed by charger.
+CONSTANT_CHARGE_VOLTAGE_MAX
+  maximum charge voltage supported by the power supply object.
+
+INPUT_CURRENT_LIMIT
+  input current limit programmed by charger. Indicates
+  the current drawn from a charging source.
+
+CHARGE_CONTROL_LIMIT
+  current charge control limit setting
+CHARGE_CONTROL_LIMIT_MAX
+  maximum charge control limit setting
+
+CALIBRATE
+  battery or coulomb counter calibration status
+
+CAPACITY
+  capacity in percents.
+CAPACITY_ALERT_MIN
+  minimum capacity alert value in percents.
+CAPACITY_ALERT_MAX
+  maximum capacity alert value in percents.
+CAPACITY_LEVEL
+  capacity level. This corresponds to POWER_SUPPLY_CAPACITY_LEVEL_*.
+
+TEMP
+  temperature of the power supply.
+TEMP_ALERT_MIN
+  minimum battery temperature alert.
+TEMP_ALERT_MAX
+  maximum battery temperature alert.
+TEMP_AMBIENT
+  ambient temperature.
+TEMP_AMBIENT_ALERT_MIN
+  minimum ambient temperature alert.
+TEMP_AMBIENT_ALERT_MAX
+  maximum ambient temperature alert.
+TEMP_MIN
+  minimum operatable temperature
+TEMP_MAX
+  maximum operatable temperature
+
+TIME_TO_EMPTY
+  seconds left for battery to be considered empty
+  (i.e. while battery powers a load)
+TIME_TO_FULL
+  seconds left for battery to be considered full
+  (i.e. while battery is charging)
+
+
+Battery <-> external power supply interaction
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Often power supplies are acting as supplies and supplicants at the same
+time. Batteries are a good example. So, batteries usually care if they're
+externally powered or not.
+
+For that case, power supply class implements notification mechanism for
+batteries.
+
+External power supply (AC) lists supplicants (batteries) names in
+"supplied_to" struct member, and each power_supply_changed() call
+issued by external power supply will notify supplicants via
+external_power_changed callback.
+
+
+Devicetree battery characteristics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Drivers should call power_supply_get_battery_info() to obtain battery
+characteristics from a devicetree battery node, defined in
+Documentation/devicetree/bindings/power/supply/battery.txt. This is
+implemented in drivers/power/supply/bq27xxx_battery.c.
+
+Properties in struct power_supply_battery_info and their counterparts in the
+battery node have names corresponding to elements in enum power_supply_property,
+for naming consistency between sysfs attributes and battery node properties.
+
+
+QA
+~~
+
+Q:
+   Where is POWER_SUPPLY_PROP_XYZ attribute?
+A:
+   If you cannot find attribute suitable for your driver needs, feel free
+   to add it and send patch along with your driver.
+
+   The attributes available currently are the ones currently provided by the
+   drivers written.
+
+   Good candidates to add in future: model/part#, cycle_time, manufacturer,
+   etc.
+
+
+Q:
+   I have some very specific attribute (e.g. battery color), should I add
+   this attribute to standard ones?
+A:
+   Most likely, no. Such attribute can be placed in the driver itself, if
+   it is useful. Of course, if the attribute in question applicable to
+   large set of batteries, provided by many drivers, and/or comes from
+   some general battery specification/standard, it may be a candidate to
+   be added to the core attribute set.
+
+
+Q:
+   Suppose, my battery monitoring chip/firmware does not provide capacity
+   in percents, but provides charge_{now,full,empty}. Should I calculate
+   percentage capacity manually, inside the driver, and register CAPACITY
+   attribute? The same question about time_to_empty/time_to_full.
+A:
+   Most likely, no. This class is designed to export properties which are
+   directly measurable by the specific hardware available.
+
+   Inferring not available properties using some heuristics or mathematical
+   model is not subject of work for a battery driver. Such functionality
+   should be factored out, and in fact, apm_power, the driver to serve
+   legacy APM API on top of power supply class, uses a simple heuristic of
+   approximating remaining battery capacity based on its charge, current,
+   voltage and so on. But full-fledged battery model is likely not subject
+   for kernel at all, as it would require floating point calculation to deal
+   with things like differential equations and Kalman filters. This is
+   better handled by batteryd/libbattery, yet to be written.
diff --git a/Documentation/power/power_supply_class.txt b/Documentation/power/power_supply_class.txt
deleted file mode 100644
index 300d37896e51..000000000000
--- a/Documentation/power/power_supply_class.txt
+++ /dev/null
@@ -1,231 +0,0 @@
-Linux power supply class
-========================
-
-Synopsis
-~~~~~~~~
-Power supply class used to represent battery, UPS, AC or DC power supply
-properties to user-space.
-
-It defines core set of attributes, which should be applicable to (almost)
-every power supply out there. Attributes are available via sysfs and uevent
-interfaces.
-
-Each attribute has well defined meaning, up to unit of measure used. While
-the attributes provided are believed to be universally applicable to any
-power supply, specific monitoring hardware may not be able to provide them
-all, so any of them may be skipped.
-
-Power supply class is extensible, and allows to define drivers own attributes.
-The core attribute set is subject to the standard Linux evolution (i.e.
-if it will be found that some attribute is applicable to many power supply
-types or their drivers, it can be added to the core set).
-
-It also integrates with LED framework, for the purpose of providing
-typically expected feedback of battery charging/fully charged status and
-AC/USB power supply online status. (Note that specific details of the
-indication (including whether to use it at all) are fully controllable by
-user and/or specific machine defaults, per design principles of LED
-framework).
-
-
-Attributes/properties
-~~~~~~~~~~~~~~~~~~~~~
-Power supply class has predefined set of attributes, this eliminates code
-duplication across drivers. Power supply class insist on reusing its
-predefined attributes *and* their units.
-
-So, userspace gets predictable set of attributes and their units for any
-kind of power supply, and can process/present them to a user in consistent
-manner. Results for different power supplies and machines are also directly
-comparable.
-
-See drivers/power/supply/ds2760_battery.c and drivers/power/supply/pda_power.c
-for the example how to declare and handle attributes.
-
-
-Units
-~~~~~
-Quoting include/linux/power_supply.h:
-
-  All voltages, currents, charges, energies, time and temperatures in µV,
-  µA, µAh, µWh, seconds and tenths of degree Celsius unless otherwise
-  stated. It's driver's job to convert its raw values to units in which
-  this class operates.
-
-
-Attributes/properties detailed
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-~ ~ ~ ~ ~ ~ ~  Charge/Energy/Capacity - how to not confuse  ~ ~ ~ ~ ~ ~ ~
-~                                                                       ~
-~ Because both "charge" (µAh) and "energy" (µWh) represents "capacity"  ~
-~ of battery, this class distinguish these terms. Don't mix them!       ~
-~                                                                       ~
-~ CHARGE_* attributes represents capacity in µAh only.                  ~
-~ ENERGY_* attributes represents capacity in µWh only.                  ~
-~ CAPACITY attribute represents capacity in *percents*, from 0 to 100.  ~
-~                                                                       ~
-~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~
-
-Postfixes:
-_AVG - *hardware* averaged value, use it if your hardware is really able to
-report averaged values.
-_NOW - momentary/instantaneous values.
-
-STATUS - this attribute represents operating status (charging, full,
-discharging (i.e. powering a load), etc.). This corresponds to
-BATTERY_STATUS_* values, as defined in battery.h.
-
-CHARGE_TYPE - batteries can typically charge at different rates.
-This defines trickle and fast charges.  For batteries that
-are already charged or discharging, 'n/a' can be displayed (or
-'unknown', if the status is not known).
-
-AUTHENTIC - indicates the power supply (battery or charger) connected
-to the platform is authentic(1) or non authentic(0).
-
-HEALTH - represents health of the battery, values corresponds to
-POWER_SUPPLY_HEALTH_*, defined in battery.h.
-
-VOLTAGE_OCV - open circuit voltage of the battery.
-
-VOLTAGE_MAX_DESIGN, VOLTAGE_MIN_DESIGN - design values for maximal and
-minimal power supply voltages. Maximal/minimal means values of voltages
-when battery considered "full"/"empty" at normal conditions. Yes, there is
-no direct relation between voltage and battery capacity, but some dumb
-batteries use voltage for very approximated calculation of capacity.
-Battery driver also can use this attribute just to inform userspace
-about maximal and minimal voltage thresholds of a given battery.
-
-VOLTAGE_MAX, VOLTAGE_MIN - same as _DESIGN voltage values except that
-these ones should be used if hardware could only guess (measure and
-retain) the thresholds of a given power supply.
-
-VOLTAGE_BOOT - Reports the voltage measured during boot
-
-CURRENT_BOOT - Reports the current measured during boot
-
-CHARGE_FULL_DESIGN, CHARGE_EMPTY_DESIGN - design charge values, when
-battery considered full/empty.
-
-ENERGY_FULL_DESIGN, ENERGY_EMPTY_DESIGN - same as above but for energy.
-
-CHARGE_FULL, CHARGE_EMPTY - These attributes means "last remembered value
-of charge when battery became full/empty". It also could mean "value of
-charge when battery considered full/empty at given conditions (temperature,
-age)". I.e. these attributes represents real thresholds, not design values.
-
-ENERGY_FULL, ENERGY_EMPTY - same as above but for energy.
-
-CHARGE_COUNTER - the current charge counter (in µAh).  This could easily
-be negative; there is no empty or full value.  It is only useful for
-relative, time-based measurements.
-
-PRECHARGE_CURRENT - the maximum charge current during precharge phase
-of charge cycle (typically 20% of battery capacity).
-CHARGE_TERM_CURRENT - Charge termination current. The charge cycle
-terminates when battery voltage is above recharge threshold, and charge
-current is below this setting (typically 10% of battery capacity).
-
-CONSTANT_CHARGE_CURRENT - constant charge current programmed by charger.
-CONSTANT_CHARGE_CURRENT_MAX - maximum charge current supported by the
-power supply object.
-
-CONSTANT_CHARGE_VOLTAGE - constant charge voltage programmed by charger.
-CONSTANT_CHARGE_VOLTAGE_MAX - maximum charge voltage supported by the
-power supply object.
-
-INPUT_CURRENT_LIMIT - input current limit programmed by charger. Indicates
-the current drawn from a charging source.
-
-CHARGE_CONTROL_LIMIT - current charge control limit setting
-CHARGE_CONTROL_LIMIT_MAX - maximum charge control limit setting
-
-CALIBRATE - battery or coulomb counter calibration status
-
-CAPACITY - capacity in percents.
-CAPACITY_ALERT_MIN - minimum capacity alert value in percents.
-CAPACITY_ALERT_MAX - maximum capacity alert value in percents.
-CAPACITY_LEVEL - capacity level. This corresponds to
-POWER_SUPPLY_CAPACITY_LEVEL_*.
-
-TEMP - temperature of the power supply.
-TEMP_ALERT_MIN - minimum battery temperature alert.
-TEMP_ALERT_MAX - maximum battery temperature alert.
-TEMP_AMBIENT - ambient temperature.
-TEMP_AMBIENT_ALERT_MIN - minimum ambient temperature alert.
-TEMP_AMBIENT_ALERT_MAX - maximum ambient temperature alert.
-TEMP_MIN - minimum operatable temperature
-TEMP_MAX - maximum operatable temperature
-
-TIME_TO_EMPTY - seconds left for battery to be considered empty (i.e.
-while battery powers a load)
-TIME_TO_FULL - seconds left for battery to be considered full (i.e.
-while battery is charging)
-
-
-Battery <-> external power supply interaction
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Often power supplies are acting as supplies and supplicants at the same
-time. Batteries are good example. So, batteries usually care if they're
-externally powered or not.
-
-For that case, power supply class implements notification mechanism for
-batteries.
-
-External power supply (AC) lists supplicants (batteries) names in
-"supplied_to" struct member, and each power_supply_changed() call
-issued by external power supply will notify supplicants via
-external_power_changed callback.
-
-
-Devicetree battery characteristics
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Drivers should call power_supply_get_battery_info() to obtain battery
-characteristics from a devicetree battery node, defined in
-Documentation/devicetree/bindings/power/supply/battery.txt. This is
-implemented in drivers/power/supply/bq27xxx_battery.c.
-
-Properties in struct power_supply_battery_info and their counterparts in the
-battery node have names corresponding to elements in enum power_supply_property,
-for naming consistency between sysfs attributes and battery node properties.
-
-
-QA
-~~
-Q: Where is POWER_SUPPLY_PROP_XYZ attribute?
-A: If you cannot find attribute suitable for your driver needs, feel free
-   to add it and send patch along with your driver.
-
-   The attributes available currently are the ones currently provided by the
-   drivers written.
-
-   Good candidates to add in future: model/part#, cycle_time, manufacturer,
-   etc.
-
-
-Q: I have some very specific attribute (e.g. battery color), should I add
-   this attribute to standard ones?
-A: Most likely, no. Such attribute can be placed in the driver itself, if
-   it is useful. Of course, if the attribute in question applicable to
-   large set of batteries, provided by many drivers, and/or comes from
-   some general battery specification/standard, it may be a candidate to
-   be added to the core attribute set.
-
-
-Q: Suppose, my battery monitoring chip/firmware does not provides capacity
-   in percents, but provides charge_{now,full,empty}. Should I calculate
-   percentage capacity manually, inside the driver, and register CAPACITY
-   attribute? The same question about time_to_empty/time_to_full.
-A: Most likely, no. This class is designed to export properties which are
-   directly measurable by the specific hardware available.
-
-   Inferring not available properties using some heuristics or mathematical
-   model is not subject of work for a battery driver. Such functionality
-   should be factored out, and in fact, apm_power, the driver to serve
-   legacy APM API on top of power supply class, uses a simple heuristic of
-   approximating remaining battery capacity based on its charge, current,
-   voltage and so on. But full-fledged battery model is likely not subject
-   for kernel at all, as it would require floating point calculation to deal
-   with things like differential equations and Kalman filters. This is
-   better be handled by batteryd/libbattery, yet to be written.
diff --git a/Documentation/power/powercap/powercap.rst b/Documentation/power/powercap/powercap.rst
new file mode 100644
index 000000000000..7ae3b44c7624
--- /dev/null
+++ b/Documentation/power/powercap/powercap.rst
@@ -0,0 +1,257 @@
+=======================
+Power Capping Framework
+=======================
+
+The power capping framework provides a consistent interface between the kernel
+and the user space that allows power capping drivers to expose the settings to
+user space in a uniform way.
+
+Terminology
+===========
+
+The framework exposes power capping devices to user space via sysfs in the
+form of a tree of objects. The objects at the root level of the tree represent
+'control types', which correspond to different methods of power capping.  For
+example, the intel-rapl control type represents the Intel "Running Average
+Power Limit" (RAPL) technology, whereas the 'idle-injection' control type
+corresponds to the use of idle injection for controlling power.
+
+Power zones represent different parts of the system, which can be controlled and
+monitored using the power capping method determined by the control type the
+given zone belongs to. They each contain attributes for monitoring power, as
+well as controls represented in the form of power constraints.  If the parts of
+the system represented by different power zones are hierarchical (that is, one
+bigger part consists of multiple smaller parts that each have their own power
+controls), those power zones may also be organized in a hierarchy with one
+parent power zone containing multiple subzones and so on to reflect the power
+control topology of the system.  In that case, it is possible to apply power
+capping to a set of devices together using the parent power zone and if more
+fine grained control is required, it can be applied through the subzones.
+
+
+Example sysfs interface tree::
+
+  /sys/devices/virtual/powercap
+  └──intel-rapl
+      ├──intel-rapl:0
+      │   ├──constraint_0_name
+      │   ├──constraint_0_power_limit_uw
+      │   ├──constraint_0_time_window_us
+      │   ├──constraint_1_name
+      │   ├──constraint_1_power_limit_uw
+      │   ├──constraint_1_time_window_us
+      │   ├──device -> ../../intel-rapl
+      │   ├──energy_uj
+      │   ├──intel-rapl:0:0
+      │   │   ├──constraint_0_name
+      │   │   ├──constraint_0_power_limit_uw
+      │   │   ├──constraint_0_time_window_us
+      │   │   ├──constraint_1_name
+      │   │   ├──constraint_1_power_limit_uw
+      │   │   ├──constraint_1_time_window_us
+      │   │   ├──device -> ../../intel-rapl:0
+      │   │   ├──energy_uj
+      │   │   ├──max_energy_range_uj
+      │   │   ├──name
+      │   │   ├──enabled
+      │   │   ├──power
+      │   │   │   ├──async
+      │   │   │   []
+      │   │   ├──subsystem -> ../../../../../../class/power_cap
+      │   │   └──uevent
+      │   ├──intel-rapl:0:1
+      │   │   ├──constraint_0_name
+      │   │   ├──constraint_0_power_limit_uw
+      │   │   ├──constraint_0_time_window_us
+      │   │   ├──constraint_1_name
+      │   │   ├──constraint_1_power_limit_uw
+      │   │   ├──constraint_1_time_window_us
+      │   │   ├──device -> ../../intel-rapl:0
+      │   │   ├──energy_uj
+      │   │   ├──max_energy_range_uj
+      │   │   ├──name
+      │   │   ├──enabled
+      │   │   ├──power
+      │   │   │   ├──async
+      │   │   │   []
+      │   │   ├──subsystem -> ../../../../../../class/power_cap
+      │   │   └──uevent
+      │   ├──max_energy_range_uj
+      │   ├──max_power_range_uw
+      │   ├──name
+      │   ├──enabled
+      │   ├──power
+      │   │   ├──async
+      │   │   []
+      │   ├──subsystem -> ../../../../../class/power_cap
+      │   ├──enabled
+      │   ├──uevent
+      ├──intel-rapl:1
+      │   ├──constraint_0_name
+      │   ├──constraint_0_power_limit_uw
+      │   ├──constraint_0_time_window_us
+      │   ├──constraint_1_name
+      │   ├──constraint_1_power_limit_uw
+      │   ├──constraint_1_time_window_us
+      │   ├──device -> ../../intel-rapl
+      │   ├──energy_uj
+      │   ├──intel-rapl:1:0
+      │   │   ├──constraint_0_name
+      │   │   ├──constraint_0_power_limit_uw
+      │   │   ├──constraint_0_time_window_us
+      │   │   ├──constraint_1_name
+      │   │   ├──constraint_1_power_limit_uw
+      │   │   ├──constraint_1_time_window_us
+      │   │   ├──device -> ../../intel-rapl:1
+      │   │   ├──energy_uj
+      │   │   ├──max_energy_range_uj
+      │   │   ├──name
+      │   │   ├──enabled
+      │   │   ├──power
+      │   │   │   ├──async
+      │   │   │   []
+      │   │   ├──subsystem -> ../../../../../../class/power_cap
+      │   │   └──uevent
+      │   ├──intel-rapl:1:1
+      │   │   ├──constraint_0_name
+      │   │   ├──constraint_0_power_limit_uw
+      │   │   ├──constraint_0_time_window_us
+      │   │   ├──constraint_1_name
+      │   │   ├──constraint_1_power_limit_uw
+      │   │   ├──constraint_1_time_window_us
+      │   │   ├──device -> ../../intel-rapl:1
+      │   │   ├──energy_uj
+      │   │   ├──max_energy_range_uj
+      │   │   ├──name
+      │   │   ├──enabled
+      │   │   ├──power
+      │   │   │   ├──async
+      │   │   │   []
+      │   │   ├──subsystem -> ../../../../../../class/power_cap
+      │   │   └──uevent
+      │   ├──max_energy_range_uj
+      │   ├──max_power_range_uw
+      │   ├──name
+      │   ├──enabled
+      │   ├──power
+      │   │   ├──async
+      │   │   []
+      │   ├──subsystem -> ../../../../../class/power_cap
+      │   ├──uevent
+      ├──power
+      │   ├──async
+      │   []
+      ├──subsystem -> ../../../../class/power_cap
+      ├──enabled
+      └──uevent
+
+The above example illustrates a case in which the Intel RAPL technology,
+available in Intel® IA-64 and IA-32 Processor Architectures, is used. There is one
+control type called intel-rapl which contains two power zones, intel-rapl:0 and
+intel-rapl:1, representing CPU packages.  Each of these power zones contains
+two subzones, intel-rapl:j:0 and intel-rapl:j:1 (j = 0, 1), representing the
+"core" and the "uncore" parts of the given CPU package, respectively.  All of
+the zones and subzones contain energy monitoring attributes (energy_uj,
+max_energy_range_uj) and constraint attributes (constraint_*) allowing controls
+to be applied (the constraints in the 'package' power zones apply to the whole
+CPU packages and the subzone constraints only apply to the respective parts of
+the given package individually). Since Intel RAPL doesn't provide instantaneous
+power value, there is no power_uw attribute.
+
+In addition to that, each power zone contains a name attribute, allowing the
+part of the system represented by that zone to be identified.
+For example::
+
+	cat /sys/class/power_cap/intel-rapl/intel-rapl:0/name
+
+	package-0
+
+
+The Intel RAPL technology allows two constraints, short term and long term,
+with two different time windows to be applied to each power zone.  Thus for
+each zone there are 2 attributes representing the constraint names, 2 power
+limits and 2 attributes representing the sizes of the time windows. The
+constraint_j_* attributes correspond to the jth constraint (j = 0,1).
+
+For example::
+
+	constraint_0_name
+	constraint_0_power_limit_uw
+	constraint_0_time_window_us
+	constraint_1_name
+	constraint_1_power_limit_uw
+	constraint_1_time_window_us
+
+Power Zone Attributes
+=====================
+
+Monitoring attributes
+---------------------
+
+energy_uj (rw)
+	Current energy counter in micro joules. Write "0" to reset.
+	If the counter can not be reset, then this attribute is read only.
+
+max_energy_range_uj (ro)
+	Range of the above energy counter in micro-joules.
+
+power_uw (ro)
+	Current power in micro watts.
+
+max_power_range_uw (ro)
+	Range of the above power value in micro-watts.
+
+name (ro)
+	Name of this power zone.
+
+It is possible that some domains have both power ranges and energy counter ranges;
+however, only one is mandatory.
+
+Constraints
+-----------
+
+constraint_X_power_limit_uw (rw)
+	Power limit in micro watts, which should be applicable for the
+	time window specified by "constraint_X_time_window_us".
+
+constraint_X_time_window_us (rw)
+	Time window in micro seconds.
+
+constraint_X_name (ro)
+	An optional name of the constraint
+
+constraint_X_max_power_uw(ro)
+	Maximum allowed power in micro watts.
+
+constraint_X_min_power_uw(ro)
+	Minimum allowed power in micro watts.
+
+constraint_X_max_time_window_us(ro)
+	Maximum allowed time window in micro seconds.
+
+constraint_X_min_time_window_us(ro)
+	Minimum allowed time window in micro seconds.
+
+Except for power_limit_uw and time_window_us, other fields are optional.
+
+Common zone and control type attributes
+---------------------------------------
+
+enabled (rw): Enable/Disable controls at zone level or for all zones using
+a control type.
+
+Power Cap Client Driver Interface
+=================================
+
+The API summary:
+
+Call powercap_register_control_type() to register control type object.
+Call powercap_register_zone() to register a power zone (under a given
+control type), either as a top-level power zone or as a subzone of another
+power zone registered earlier.
+The number of constraints in a power zone and the corresponding callbacks have
+to be defined prior to calling powercap_register_zone() to register that zone.
+
+To free a power zone, call powercap_unregister_zone().
+To free a control type object call powercap_unregister_control_type().
+Detailed API can be generated using kernel-doc on include/linux/powercap.h.
diff --git a/Documentation/power/powercap/powercap.txt b/Documentation/power/powercap/powercap.txt
deleted file mode 100644
index 1e6ef164e07a..000000000000
--- a/Documentation/power/powercap/powercap.txt
+++ /dev/null
@@ -1,236 +0,0 @@
-Power Capping Framework
-==================================
-
-The power capping framework provides a consistent interface between the kernel
-and the user space that allows power capping drivers to expose the settings to
-user space in a uniform way.
-
-Terminology
-=========================
-The framework exposes power capping devices to user space via sysfs in the
-form of a tree of objects. The objects at the root level of the tree represent
-'control types', which correspond to different methods of power capping.  For
-example, the intel-rapl control type represents the Intel "Running Average
-Power Limit" (RAPL) technology, whereas the 'idle-injection' control type
-corresponds to the use of idle injection for controlling power.
-
-Power zones represent different parts of the system, which can be controlled and
-monitored using the power capping method determined by the control type the
-given zone belongs to. They each contain attributes for monitoring power, as
-well as controls represented in the form of power constraints.  If the parts of
-the system represented by different power zones are hierarchical (that is, one
-bigger part consists of multiple smaller parts that each have their own power
-controls), those power zones may also be organized in a hierarchy with one
-parent power zone containing multiple subzones and so on to reflect the power
-control topology of the system.  In that case, it is possible to apply power
-capping to a set of devices together using the parent power zone and if more
-fine grained control is required, it can be applied through the subzones.
-
-
-Example sysfs interface tree:
-
-/sys/devices/virtual/powercap
-??? intel-rapl
-    ??? intel-rapl:0
-    ?   ??? constraint_0_name
-    ?   ??? constraint_0_power_limit_uw
-    ?   ??? constraint_0_time_window_us
-    ?   ??? constraint_1_name
-    ?   ??? constraint_1_power_limit_uw
-    ?   ??? constraint_1_time_window_us
-    ?   ??? device -> ../../intel-rapl
-    ?   ??? energy_uj
-    ?   ??? intel-rapl:0:0
-    ?   ?   ??? constraint_0_name
-    ?   ?   ??? constraint_0_power_limit_uw
-    ?   ?   ??? constraint_0_time_window_us
-    ?   ?   ??? constraint_1_name
-    ?   ?   ??? constraint_1_power_limit_uw
-    ?   ?   ??? constraint_1_time_window_us
-    ?   ?   ??? device -> ../../intel-rapl:0
-    ?   ?   ??? energy_uj
-    ?   ?   ??? max_energy_range_uj
-    ?   ?   ??? name
-    ?   ?   ??? enabled
-    ?   ?   ??? power
-    ?   ?   ?   ??? async
-    ?   ?   ?   []
-    ?   ?   ??? subsystem -> ../../../../../../class/power_cap
-    ?   ?   ??? uevent
-    ?   ??? intel-rapl:0:1
-    ?   ?   ??? constraint_0_name
-    ?   ?   ??? constraint_0_power_limit_uw
-    ?   ?   ??? constraint_0_time_window_us
-    ?   ?   ??? constraint_1_name
-    ?   ?   ??? constraint_1_power_limit_uw
-    ?   ?   ??? constraint_1_time_window_us
-    ?   ?   ??? device -> ../../intel-rapl:0
-    ?   ?   ??? energy_uj
-    ?   ?   ??? max_energy_range_uj
-    ?   ?   ??? name
-    ?   ?   ??? enabled
-    ?   ?   ??? power
-    ?   ?   ?   ??? async
-    ?   ?   ?   []
-    ?   ?   ??? subsystem -> ../../../../../../class/power_cap
-    ?   ?   ??? uevent
-    ?   ??? max_energy_range_uj
-    ?   ??? max_power_range_uw
-    ?   ??? name
-    ?   ??? enabled
-    ?   ??? power
-    ?   ?   ??? async
-    ?   ?   []
-    ?   ??? subsystem -> ../../../../../class/power_cap
-    ?   ??? enabled
-    ?   ??? uevent
-    ??? intel-rapl:1
-    ?   ??? constraint_0_name
-    ?   ??? constraint_0_power_limit_uw
-    ?   ??? constraint_0_time_window_us
-    ?   ??? constraint_1_name
-    ?   ??? constraint_1_power_limit_uw
-    ?   ??? constraint_1_time_window_us
-    ?   ??? device -> ../../intel-rapl
-    ?   ??? energy_uj
-    ?   ??? intel-rapl:1:0
-    ?   ?   ??? constraint_0_name
-    ?   ?   ??? constraint_0_power_limit_uw
-    ?   ?   ??? constraint_0_time_window_us
-    ?   ?   ??? constraint_1_name
-    ?   ?   ??? constraint_1_power_limit_uw
-    ?   ?   ??? constraint_1_time_window_us
-    ?   ?   ??? device -> ../../intel-rapl:1
-    ?   ?   ??? energy_uj
-    ?   ?   ??? max_energy_range_uj
-    ?   ?   ??? name
-    ?   ?   ??? enabled
-    ?   ?   ??? power
-    ?   ?   ?   ??? async
-    ?   ?   ?   []
-    ?   ?   ??? subsystem -> ../../../../../../class/power_cap
-    ?   ?   ??? uevent
-    ?   ??? intel-rapl:1:1
-    ?   ?   ??? constraint_0_name
-    ?   ?   ??? constraint_0_power_limit_uw
-    ?   ?   ??? constraint_0_time_window_us
-    ?   ?   ??? constraint_1_name
-    ?   ?   ??? constraint_1_power_limit_uw
-    ?   ?   ??? constraint_1_time_window_us
-    ?   ?   ??? device -> ../../intel-rapl:1
-    ?   ?   ??? energy_uj
-    ?   ?   ??? max_energy_range_uj
-    ?   ?   ??? name
-    ?   ?   ??? enabled
-    ?   ?   ??? power
-    ?   ?   ?   ??? async
-    ?   ?   ?   []
-    ?   ?   ??? subsystem -> ../../../../../../class/power_cap
-    ?   ?   ??? uevent
-    ?   ??? max_energy_range_uj
-    ?   ??? max_power_range_uw
-    ?   ??? name
-    ?   ??? enabled
-    ?   ??? power
-    ?   ?   ??? async
-    ?   ?   []
-    ?   ??? subsystem -> ../../../../../class/power_cap
-    ?   ??? uevent
-    ??? power
-    ?   ??? async
-    ?   []
-    ??? subsystem -> ../../../../class/power_cap
-    ??? enabled
-    ??? uevent
-
-The above example illustrates a case in which the Intel RAPL technology,
-available in Intel® IA-64 and IA-32 Processor Architectures, is used. There is one
-control type called intel-rapl which contains two power zones, intel-rapl:0 and
-intel-rapl:1, representing CPU packages.  Each of these power zones contains
-two subzones, intel-rapl:j:0 and intel-rapl:j:1 (j = 0, 1), representing the
-"core" and the "uncore" parts of the given CPU package, respectively.  All of
-the zones and subzones contain energy monitoring attributes (energy_uj,
-max_energy_range_uj) and constraint attributes (constraint_*) allowing controls
-to be applied (the constraints in the 'package' power zones apply to the whole
-CPU packages and the subzone constraints only apply to the respective parts of
-the given package individually). Since Intel RAPL doesn't provide instantaneous
-power value, there is no power_uw attribute.
-
-In addition to that, each power zone contains a name attribute, allowing the
-part of the system represented by that zone to be identified.
-For example:
-
-cat /sys/class/power_cap/intel-rapl/intel-rapl:0/name
-package-0
-
-The Intel RAPL technology allows two constraints, short term and long term,
-with two different time windows to be applied to each power zone.  Thus for
-each zone there are 2 attributes representing the constraint names, 2 power
-limits and 2 attributes representing the sizes of the time windows. Such that,
-constraint_j_* attributes correspond to the jth constraint (j = 0,1).
-
-For example:
-	constraint_0_name
-	constraint_0_power_limit_uw
-	constraint_0_time_window_us
-	constraint_1_name
-	constraint_1_power_limit_uw
-	constraint_1_time_window_us
-
-Power Zone Attributes
-=================================
-Monitoring attributes
-----------------------
-
-energy_uj (rw): Current energy counter in micro joules. Write "0" to reset.
-If the counter can not be reset, then this attribute is read only.
-
-max_energy_range_uj (ro): Range of the above energy counter in micro-joules.
-
-power_uw (ro): Current power in micro watts.
-
-max_power_range_uw (ro): Range of the above power value in micro-watts.
-
-name (ro): Name of this power zone.
-
-It is possible that some domains have both power ranges and energy counter ranges;
-however, only one is mandatory.
-
-Constraints
-----------------
-constraint_X_power_limit_uw (rw): Power limit in micro watts, which should be
-applicable for the time window specified by "constraint_X_time_window_us".
-
-constraint_X_time_window_us (rw): Time window in micro seconds.
-
-constraint_X_name (ro): An optional name of the constraint
-
-constraint_X_max_power_uw(ro): Maximum allowed power in micro watts.
-
-constraint_X_min_power_uw(ro): Minimum allowed power in micro watts.
-
-constraint_X_max_time_window_us(ro): Maximum allowed time window in micro seconds.
-
-constraint_X_min_time_window_us(ro): Minimum allowed time window in micro seconds.
-
-Except power_limit_uw and time_window_us other fields are optional.
-
-Common zone and control type attributes
-----------------------------------------
-enabled (rw): Enable/Disable controls at zone level or for all zones using
-a control type.
-
-Power Cap Client Driver Interface
-==================================
-The API summary:
-
-Call powercap_register_control_type() to register control type object.
-Call powercap_register_zone() to register a power zone (under a given
-control type), either as a top-level power zone or as a subzone of another
-power zone registered earlier.
-The number of constraints in a power zone and the corresponding callbacks have
-to be defined prior to calling powercap_register_zone() to register that zone.
-
-To Free a power zone call powercap_unregister_zone().
-To free a control type object call powercap_unregister_control_type().
-Detailed API can be generated using kernel-doc on include/linux/powercap.h.
diff --git a/Documentation/power/regulator/consumer.rst b/Documentation/power/regulator/consumer.rst
new file mode 100644
index 000000000000..0cd8cc1275a7
--- /dev/null
+++ b/Documentation/power/regulator/consumer.rst
@@ -0,0 +1,229 @@
+===================================
+Regulator Consumer Driver Interface
+===================================
+
+This text describes the regulator interface for consumer device drivers.
+Please see overview.rst for a description of the terms used in this text.
+
+
+1. Consumer Regulator Access (static & dynamic drivers)
+=======================================================
+
+A consumer driver can get access to its supply regulator by calling ::
+
+	regulator = regulator_get(dev, "Vcc");
+
+The consumer passes in its struct device pointer and power supply ID. The core
+then finds the correct regulator by consulting a machine specific lookup table.
+If the lookup is successful then this call will return a pointer to the struct
+regulator that supplies this consumer.
+
+To release the regulator the consumer driver should call ::
+
+	regulator_put(regulator);
+
+Consumers can be supplied by more than one regulator e.g. codec consumer with
+analog and digital supplies ::
+
+	digital = regulator_get(dev, "Vcc");  /* digital core */
+	analog = regulator_get(dev, "Avdd");  /* analog */
+
+The regulator access functions regulator_get() and regulator_put() will
+usually be called in your device driver's probe() and remove() respectively.
+
+
+2. Regulator Output Enable & Disable (static & dynamic drivers)
+===============================================================
+
+
+A consumer can enable its power supply by calling::
+
+	int regulator_enable(regulator);
+
+NOTE:
+  The supply may already be enabled before regulator_enable() is called.
+  This may happen if the consumer shares the regulator or the regulator has been
+  previously enabled by bootloader or kernel board initialization code.
+
+A consumer can determine if a regulator is enabled by calling::
+
+	int regulator_is_enabled(regulator);
+
+This will return > zero when the regulator is enabled.
+
+
+A consumer can disable its supply when no longer needed by calling::
+
+	int regulator_disable(regulator);
+
+NOTE:
+  This may not disable the supply if it's shared with other consumers. The
+  regulator will only be disabled when the enabled reference count is zero.
+
+Finally, a regulator can be forcefully disabled in the case of an emergency::
+
+	int regulator_force_disable(regulator);
+
+NOTE:
+  This will immediately and forcefully shut down the regulator output. All
+  consumers will be powered off.
+
+
+3. Regulator Voltage Control & Status (dynamic drivers)
+=======================================================
+
+Some consumer drivers need to be able to dynamically change their supply
+voltage to match system operating points. e.g. CPUfreq drivers can scale
+voltage along with frequency to save power, SD drivers may need to select the
+correct card voltage, etc.
+
+Consumers can control their supply voltage by calling::
+
+	int regulator_set_voltage(regulator, min_uV, max_uV);
+
+Where min_uV and max_uV are the minimum and maximum acceptable voltages in
+microvolts.
+
+NOTE: this can be called when the regulator is enabled or disabled. If called
+when enabled, then the voltage changes instantly, otherwise the voltage
+configuration changes and the voltage is physically set when the regulator is
+next enabled.
+
+The regulator's configured voltage output can be found by calling::
+
+	int regulator_get_voltage(regulator);
+
+NOTE:
+  get_voltage() will return the configured output voltage whether the
+  regulator is enabled or disabled and should NOT be used to determine regulator
+  output state. However this can be used in conjunction with is_enabled() to
+  determine the regulator physical output voltage.
+
+
+4. Regulator Current Limit Control & Status (dynamic drivers)
+=============================================================
+
+Some consumer drivers need to be able to dynamically change their supply
+current limit to match system operating points. e.g. LCD backlight driver can
+change the current limit to vary the backlight brightness, USB drivers may want
+to set the limit to 500mA when supplying power.
+
+Consumers can control their supply current limit by calling::
+
+	int regulator_set_current_limit(regulator, min_uA, max_uA);
+
+Where min_uA and max_uA are the minimum and maximum acceptable current limit in
+microamps.
+
+NOTE:
+  this can be called when the regulator is enabled or disabled. If called
+  when enabled, then the current limit changes instantly, otherwise the current
+  limit configuration changes and the current limit is physically set when the
+  regulator is next enabled.
+
+A regulator's current limit can be found by calling::
+
+	int regulator_get_current_limit(regulator);
+
+NOTE:
+  get_current_limit() will return the current limit whether the regulator
+  is enabled or disabled and should not be used to determine regulator current
+  load.
+
+
+5. Regulator Operating Mode Control & Status (dynamic drivers)
+==============================================================
+
+Some consumers can further save system power by changing the operating mode of
+their supply regulator to be more efficient when the consumer's operating state
+changes, e.g. when the consumer driver is idle and subsequently draws less current.
+
+Regulator operating mode can be changed indirectly or directly.
+
+Indirect operating mode control.
+--------------------------------
+Consumer drivers can request a change in their supply regulator operating mode
+by calling::
+
+	int regulator_set_load(struct regulator *regulator, int load_uA);
+
+This will cause the core to recalculate the total load on the regulator (based
+on all its consumers) and change operating mode (if necessary and permitted)
+to best match the current operating load.
+
+The load_uA value can be determined from the consumer's datasheet. e.g. most
+datasheets have tables showing the maximum current consumed in certain
+situations.
+
+Most consumers will use indirect operating mode control since they have no
+knowledge of the regulator or whether the regulator is shared with other
+consumers.
+
+Direct operating mode control.
+------------------------------
+
+Bespoke or tightly coupled drivers may want to directly control regulator
+operating mode depending on their operating point. This can be achieved by
+calling::
+
+	int regulator_set_mode(struct regulator *regulator, unsigned int mode);
+	unsigned int regulator_get_mode(struct regulator *regulator);
+
+Direct mode will only be used by consumers that *know* about the regulator and
+are not sharing the regulator with other consumers.
+
+
+6. Regulator Events
+===================
+
+Regulators can notify consumers of external events. Events could be received by
+consumers under regulator stress or failure conditions.
+
+Consumers can register interest in regulator events by calling::
+
+	int regulator_register_notifier(struct regulator *regulator,
+					struct notifier_block *nb);
+
+Consumers can unregister interest by calling::
+
+	int regulator_unregister_notifier(struct regulator *regulator,
+					  struct notifier_block *nb);
+
+Regulators use the kernel notifier framework to send events to their interested
+consumers.
+
+7. Regulator Direct Register Access
+===================================
+
+Some kinds of power management hardware or firmware are designed such that
+they need to do low-level hardware access to regulators, with no involvement
+from the kernel. Examples of such devices are:
+
+- clocksource with a voltage-controlled oscillator and control logic to change
+  the supply voltage over I2C to achieve a desired output clock rate
+- thermal management firmware that can issue an arbitrary I2C transaction to
+  perform system poweroff during overtemperature conditions
+
+To set up such a device/firmware, various parameters like I2C address of the
+regulator, addresses of various regulator registers etc. need to be configured
+to it. The regulator framework provides the following helpers for querying
+these details.
+
+Bus-specific details, like I2C addresses or transfer rates are handled by the
+regmap framework. To get the regulator's regmap (if supported), use::
+
+	struct regmap *regulator_get_regmap(struct regulator *regulator);
+
+To obtain the hardware register offset and bitmask for the regulator's voltage
+selector register, use::
+
+	int regulator_get_hardware_vsel_register(struct regulator *regulator,
+						 unsigned *vsel_reg,
+						 unsigned *vsel_mask);
+
+To convert a regulator framework voltage selector code (used by
+regulator_list_voltage) to a hardware-specific voltage selector that can be
+directly written to the voltage selector register, use::
+
+	int regulator_list_hardware_vsel(struct regulator *regulator,
+					 unsigned selector);
diff --git a/Documentation/power/regulator/consumer.txt b/Documentation/power/regulator/consumer.txt
deleted file mode 100644
index e51564c1a140..000000000000
--- a/Documentation/power/regulator/consumer.txt
+++ /dev/null
@@ -1,218 +0,0 @@
-Regulator Consumer Driver Interface
-===================================
-
-This text describes the regulator interface for consumer device drivers.
-Please see overview.txt for a description of the terms used in this text.
-
-
-1. Consumer Regulator Access (static & dynamic drivers)
-=======================================================
-
-A consumer driver can get access to its supply regulator by calling :-
-
-regulator = regulator_get(dev, "Vcc");
-
-The consumer passes in its struct device pointer and power supply ID. The core
-then finds the correct regulator by consulting a machine specific lookup table.
-If the lookup is successful then this call will return a pointer to the struct
-regulator that supplies this consumer.
-
-To release the regulator the consumer driver should call :-
-
-regulator_put(regulator);
-
-Consumers can be supplied by more than one regulator e.g. codec consumer with
-analog and digital supplies :-
-
-digital = regulator_get(dev, "Vcc");  /* digital core */
-analog = regulator_get(dev, "Avdd");  /* analog */
-
-The regulator access functions regulator_get() and regulator_put() will
-usually be called in your device drivers probe() and remove() respectively.
-
-
-2. Regulator Output Enable & Disable (static & dynamic drivers)
-====================================================================
-
-A consumer can enable its power supply by calling:-
-
-int regulator_enable(regulator);
-
-NOTE: The supply may already be enabled before regulator_enabled() is called.
-This may happen if the consumer shares the regulator or the regulator has been
-previously enabled by bootloader or kernel board initialization code.
-
-A consumer can determine if a regulator is enabled by calling :-
-
-int regulator_is_enabled(regulator);
-
-This will return > zero when the regulator is enabled.
-
-
-A consumer can disable its supply when no longer needed by calling :-
-
-int regulator_disable(regulator);
-
-NOTE: This may not disable the supply if it's shared with other consumers. The
-regulator will only be disabled when the enabled reference count is zero.
-
-Finally, a regulator can be forcefully disabled in the case of an emergency :-
-
-int regulator_force_disable(regulator);
-
-NOTE: this will immediately and forcefully shutdown the regulator output. All
-consumers will be powered off.
-
-
-3. Regulator Voltage Control & Status (dynamic drivers)
-======================================================
-
-Some consumer drivers need to be able to dynamically change their supply
-voltage to match system operating points. e.g. CPUfreq drivers can scale
-voltage along with frequency to save power, SD drivers may need to select the
-correct card voltage, etc.
-
-Consumers can control their supply voltage by calling :-
-
-int regulator_set_voltage(regulator, min_uV, max_uV);
-
-Where min_uV and max_uV are the minimum and maximum acceptable voltages in
-microvolts.
-
-NOTE: this can be called when the regulator is enabled or disabled. If called
-when enabled, then the voltage changes instantly, otherwise the voltage
-configuration changes and the voltage is physically set when the regulator is
-next enabled.
-
-The regulators configured voltage output can be found by calling :-
-
-int regulator_get_voltage(regulator);
-
-NOTE: get_voltage() will return the configured output voltage whether the
-regulator is enabled or disabled and should NOT be used to determine regulator
-output state. However this can be used in conjunction with is_enabled() to
-determine the regulator physical output voltage.
-
-
-4. Regulator Current Limit Control & Status (dynamic drivers)
-===========================================================
-
-Some consumer drivers need to be able to dynamically change their supply
-current limit to match system operating points. e.g. LCD backlight driver can
-change the current limit to vary the backlight brightness, USB drivers may want
-to set the limit to 500mA when supplying power.
-
-Consumers can control their supply current limit by calling :-
-
-int regulator_set_current_limit(regulator, min_uA, max_uA);
-
-Where min_uA and max_uA are the minimum and maximum acceptable current limit in
-microamps.
-
-NOTE: this can be called when the regulator is enabled or disabled. If called
-when enabled, then the current limit changes instantly, otherwise the current
-limit configuration changes and the current limit is physically set when the
-regulator is next enabled.
-
-A regulators current limit can be found by calling :-
-
-int regulator_get_current_limit(regulator);
-
-NOTE: get_current_limit() will return the current limit whether the regulator
-is enabled or disabled and should not be used to determine regulator current
-load.
-
-
-5. Regulator Operating Mode Control & Status (dynamic drivers)
-=============================================================
-
-Some consumers can further save system power by changing the operating mode of
-their supply regulator to be more efficient when the consumers operating state
-changes. e.g. consumer driver is idle and subsequently draws less current
-
-Regulator operating mode can be changed indirectly or directly.
-
-Indirect operating mode control.
---------------------------------
-Consumer drivers can request a change in their supply regulator operating mode
-by calling :-
-
-int regulator_set_load(struct regulator *regulator, int load_uA);
-
-This will cause the core to recalculate the total load on the regulator (based
-on all its consumers) and change operating mode (if necessary and permitted)
-to best match the current operating load.
-
-The load_uA value can be determined from the consumer's datasheet. e.g. most
-datasheets have tables showing the maximum current consumed in certain
-situations.
-
-Most consumers will use indirect operating mode control since they have no
-knowledge of the regulator or whether the regulator is shared with other
-consumers.
-
-Direct operating mode control.
-------------------------------
-Bespoke or tightly coupled drivers may want to directly control regulator
-operating mode depending on their operating point. This can be achieved by
-calling :-
-
-int regulator_set_mode(struct regulator *regulator, unsigned int mode);
-unsigned int regulator_get_mode(struct regulator *regulator);
-
-Direct mode will only be used by consumers that *know* about the regulator and
-are not sharing the regulator with other consumers.
-
-
-6. Regulator Events
-===================
-Regulators can notify consumers of external events. Events could be received by
-consumers under regulator stress or failure conditions.
-
-Consumers can register interest in regulator events by calling :-
-
-int regulator_register_notifier(struct regulator *regulator,
-			      struct notifier_block *nb);
-
-Consumers can unregister interest by calling :-
-
-int regulator_unregister_notifier(struct regulator *regulator,
-				struct notifier_block *nb);
-
-Regulators use the kernel notifier framework to send event to their interested
-consumers.
-
-7. Regulator Direct Register Access
-===================================
-Some kinds of power management hardware or firmware are designed such that
-they need to do low-level hardware access to regulators, with no involvement
-from the kernel. Examples of such devices are:
-
-- clocksource with a voltage-controlled oscillator and control logic to change
-  the supply voltage over I2C to achieve a desired output clock rate
-- thermal management firmware that can issue an arbitrary I2C transaction to
-  perform system poweroff during overtemperature conditions
-
-To set up such a device/firmware, various parameters like I2C address of the
-regulator, addresses of various regulator registers etc. need to be configured
-to it. The regulator framework provides the following helpers for querying
-these details.
-
-Bus-specific details, like I2C addresses or transfer rates are handled by the
-regmap framework. To get the regulator's regmap (if supported), use :-
-
-struct regmap *regulator_get_regmap(struct regulator *regulator);
-
-To obtain the hardware register offset and bitmask for the regulator's voltage
-selector register, use :-
-
-int regulator_get_hardware_vsel_register(struct regulator *regulator,
-					 unsigned *vsel_reg,
-					 unsigned *vsel_mask);
-
-To convert a regulator framework voltage selector code (used by
-regulator_list_voltage) to a hardware-specific voltage selector that can be
-directly written to the voltage selector register, use :-
-
-int regulator_list_hardware_vsel(struct regulator *regulator,
-				 unsigned selector);
diff --git a/Documentation/power/regulator/design.rst b/Documentation/power/regulator/design.rst
new file mode 100644
index 000000000000..3b09c6841dc4
--- /dev/null
+++ b/Documentation/power/regulator/design.rst
@@ -0,0 +1,38 @@
+==========================
+Regulator API design notes
+==========================
+
+This document provides a brief, partially structured, overview of some
+of the design considerations which impact the regulator API design.
+
+Safety
+------
+
+ - Errors in regulator configuration can have very serious consequences
+   for the system, potentially including lasting hardware damage.
+ - It is not possible to automatically determine the power configuration
+   of the system - software-equivalent variants of the same chip may
+   have different power requirements, and not all components with power
+   requirements are visible to software.
+
+.. note::
+
+     The API should make no changes to the hardware state unless it has
+     specific knowledge that these changes are safe to perform on this
+     particular system.
+
+Consumer use cases
+------------------
+
+ - The overwhelming majority of devices in a system will have no
+   requirement to do any runtime configuration of their power beyond
+   being able to turn it on or off.
+
+ - Many of the power supplies in the system will be shared between many
+   different consumers.
+
+.. note::
+
+     The consumer API should be structured so that these use cases are
+     very easy to handle and so that consumers will work with shared
+     supplies without any additional effort.
diff --git a/Documentation/power/regulator/design.txt b/Documentation/power/regulator/design.txt
deleted file mode 100644
index fdd919b96830..000000000000
--- a/Documentation/power/regulator/design.txt
+++ /dev/null
@@ -1,33 +0,0 @@
-Regulator API design notes
-==========================
-
-This document provides a brief, partially structured, overview of some
-of the design considerations which impact the regulator API design.
-
-Safety
-------
-
- - Errors in regulator configuration can have very serious consequences
-   for the system, potentially including lasting hardware damage.
- - It is not possible to automatically determine the power configuration
-   of the system - software-equivalent variants of the same chip may
-   have different power requirements, and not all components with power
-   requirements are visible to software.
-
-  => The API should make no changes to the hardware state unless it has
-     specific knowledge that these changes are safe to perform on this
-     particular system.
-
-Consumer use cases
-------------------
-
- - The overwhelming majority of devices in a system will have no
-   requirement to do any runtime configuration of their power beyond
-   being able to turn it on or off.
-
- - Many of the power supplies in the system will be shared between many
-   different consumers.
-
-  => The consumer API should be structured so that these use cases are
-     very easy to handle and so that consumers will work with shared
-     supplies without any additional effort.
diff --git a/Documentation/power/regulator/machine.rst b/Documentation/power/regulator/machine.rst
new file mode 100644
index 000000000000..22fffefaa3ad
--- /dev/null
+++ b/Documentation/power/regulator/machine.rst
@@ -0,0 +1,97 @@
+==================================
+Regulator Machine Driver Interface
+==================================
+
+The regulator machine driver interface is intended for board/machine specific
+initialisation code to configure the regulator subsystem.
+
+Consider the following machine::
+
+  Regulator-1 -+-> Regulator-2 --> [Consumer A @ 1.8 - 2.0V]
+               |
+               +-> [Consumer B @ 3.3V]
+
+The drivers for consumers A & B must be mapped to the correct regulator in
+order to control their power supplies. This mapping can be achieved in machine
+initialisation code by creating a struct regulator_consumer_supply for
+each regulator::
+
+  struct regulator_consumer_supply {
+	const char *dev_name;	/* consumer dev_name() */
+	const char *supply;	/* consumer supply - e.g. "vcc" */
+  };
+
+e.g. for the machine above::
+
+  static struct regulator_consumer_supply regulator1_consumers[] = {
+	REGULATOR_SUPPLY("Vcc", "consumer B"),
+  };
+
+  static struct regulator_consumer_supply regulator2_consumers[] = {
+	REGULATOR_SUPPLY("Vcc", "consumer A"),
+  };
+
+This maps Regulator-1 to the 'Vcc' supply for Consumer B and maps Regulator-2
+to the 'Vcc' supply for Consumer A.
+
+Constraints can now be registered by defining a struct regulator_init_data
+for each regulator power domain. This structure also maps the consumers
+to their supply regulators::
+
+  static struct regulator_init_data regulator1_data = {
+	.constraints = {
+		.name = "Regulator-1",
+		.min_uV = 3300000,
+		.max_uV = 3300000,
+		.valid_modes_mask = REGULATOR_MODE_NORMAL,
+	},
+	.num_consumer_supplies = ARRAY_SIZE(regulator1_consumers),
+	.consumer_supplies = regulator1_consumers,
+  };
+
+The name field should be set to something that is usefully descriptive
+for the board for configuration of supplies for other regulators and
+for use in logging and other diagnostic output.  Normally the name
+used for the supply rail in the schematic is a good choice.  If no
+name is provided then the subsystem will choose one.
+
+Regulator-1 supplies power to Regulator-2. This relationship must be registered
+with the core so that Regulator-1 is also enabled when Consumer A enables its
+supply (Regulator-2). The supply regulator is set by the supply_regulator
+field below::
+
+  static struct regulator_init_data regulator2_data = {
+	.supply_regulator = "Regulator-1",
+	.constraints = {
+		.min_uV = 1800000,
+		.max_uV = 2000000,
+		.valid_ops_mask = REGULATOR_CHANGE_VOLTAGE,
+		.valid_modes_mask = REGULATOR_MODE_NORMAL,
+	},
+	.num_consumer_supplies = ARRAY_SIZE(regulator2_consumers),
+	.consumer_supplies = regulator2_consumers,
+  };
+
+Finally the regulator devices must be registered in the usual manner::
+
+  static struct platform_device regulator_devices[] = {
+	{
+		.name = "regulator",
+		.id = DCDC_1,
+		.dev = {
+			.platform_data = &regulator1_data,
+		},
+	},
+	{
+		.name = "regulator",
+		.id = DCDC_2,
+		.dev = {
+			.platform_data = &regulator2_data,
+		},
+	},
+  };
+  /* register regulator 1 device */
+  platform_device_register(&regulator_devices[0]);
+
+  /* register regulator 2 device */
+  platform_device_register(&regulator_devices[1]);
diff --git a/Documentation/power/regulator/machine.txt b/Documentation/power/regulator/machine.txt
deleted file mode 100644
index eff4dcaaa252..000000000000
--- a/Documentation/power/regulator/machine.txt
+++ /dev/null
@@ -1,96 +0,0 @@
-Regulator Machine Driver Interface
-===================================
-
-The regulator machine driver interface is intended for board/machine specific
-initialisation code to configure the regulator subsystem.
-
-Consider the following machine :-
-
-  Regulator-1 -+-> Regulator-2 --> [Consumer A @ 1.8 - 2.0V]
-               |
-               +-> [Consumer B @ 3.3V]
-
-The drivers for consumers A & B must be mapped to the correct regulator in
-order to control their power supplies. This mapping can be achieved in machine
-initialisation code by creating a struct regulator_consumer_supply for
-each regulator.
-
-struct regulator_consumer_supply {
-	const char *dev_name;	/* consumer dev_name() */
-	const char *supply;	/* consumer supply - e.g. "vcc" */
-};
-
-e.g. for the machine above
-
-static struct regulator_consumer_supply regulator1_consumers[] = {
-	REGULATOR_SUPPLY("Vcc", "consumer B"),
-};
-
-static struct regulator_consumer_supply regulator2_consumers[] = {
-	REGULATOR_SUPPLY("Vcc", "consumer A"),
-};
-
-This maps Regulator-1 to the 'Vcc' supply for Consumer B and maps Regulator-2
-to the 'Vcc' supply for Consumer A.
-
-Constraints can now be registered by defining a struct regulator_init_data
-for each regulator power domain. This structure also maps the consumers
-to their supply regulators :-
-
-static struct regulator_init_data regulator1_data = {
-	.constraints = {
-		.name = "Regulator-1",
-		.min_uV = 3300000,
-		.max_uV = 3300000,
-		.valid_modes_mask = REGULATOR_MODE_NORMAL,
-	},
-	.num_consumer_supplies = ARRAY_SIZE(regulator1_consumers),
-	.consumer_supplies = regulator1_consumers,
-};
-
-The name field should be set to something that is usefully descriptive
-for the board for configuration of supplies for other regulators and
-for use in logging and other diagnostic output.  Normally the name
-used for the supply rail in the schematic is a good choice.  If no
-name is provided then the subsystem will choose one.
-
-Regulator-1 supplies power to Regulator-2. This relationship must be registered
-with the core so that Regulator-1 is also enabled when Consumer A enables its
-supply (Regulator-2). The supply regulator is set by the supply_regulator
-field below and co:-
-
-static struct regulator_init_data regulator2_data = {
-	.supply_regulator = "Regulator-1",
-	.constraints = {
-		.min_uV = 1800000,
-		.max_uV = 2000000,
-		.valid_ops_mask = REGULATOR_CHANGE_VOLTAGE,
-		.valid_modes_mask = REGULATOR_MODE_NORMAL,
-	},
-	.num_consumer_supplies = ARRAY_SIZE(regulator2_consumers),
-	.consumer_supplies = regulator2_consumers,
-};
-
-Finally the regulator devices must be registered in the usual manner.
-
-static struct platform_device regulator_devices[] = {
-	{
-		.name = "regulator",
-		.id = DCDC_1,
-		.dev = {
-			.platform_data = &regulator1_data,
-		},
-	},
-	{
-		.name = "regulator",
-		.id = DCDC_2,
-		.dev = {
-			.platform_data = &regulator2_data,
-		},
-	},
-};
-/* register regulator 1 device */
-platform_device_register(&regulator_devices[0]);
-
-/* register regulator 2 device */
-platform_device_register(&regulator_devices[1]);
diff --git a/Documentation/power/regulator/overview.rst b/Documentation/power/regulator/overview.rst
new file mode 100644
index 000000000000..ee494c70a7c4
--- /dev/null
+++ b/Documentation/power/regulator/overview.rst
@@ -0,0 +1,178 @@
+=============================================
+Linux voltage and current regulator framework
+=============================================
+
+About
+=====
+
+This framework is designed to provide a standard kernel interface to control
+voltage and current regulators.
+
+The intention is to allow systems to dynamically control regulator power output
+in order to save power and prolong battery life. This applies to both voltage
+regulators (where voltage output is controllable) and current sinks (where
+current limit is controllable).
+
+(C) 2008  Wolfson Microelectronics PLC.
+
+Author: Liam Girdwood <lrg@slimlogic.co.uk>
+
+
+Nomenclature
+============
+
+Some terms used in this document:
+
+  - Regulator
+                 - Electronic device that supplies power to other devices.
+                   Most regulators can enable and disable their output while
+                   some can control their output voltage and/or current.
+
+                   Input Voltage -> Regulator -> Output Voltage
+
+
+  - PMIC
+                 - Power Management IC. An IC that contains numerous
+                   regulators and often contains other subsystems.
+
+
+  - Consumer
+                 - Electronic device that is supplied power by a regulator.
+                   Consumers can be classified into two types:-
+
+                   Static: consumer does not change its supply voltage or
+                   current limit. It only needs to enable or disable its
+                   power supply. Its supply voltage is set by the hardware,
+                   bootloader, firmware or kernel board initialisation code.
+
+                   Dynamic: consumer needs to change its supply voltage or
+                   current limit to meet operation demands.
+
+
+  - Power Domain
+                 - Electronic circuit that is supplied its input power by the
+                   output power of a regulator, switch or by another power
+                   domain.
+
+                   The supply regulator may be behind one or more switches, i.e.::
+
+                     Regulator -+-> Switch-1 -+-> Switch-2 --> [Consumer A]
+                                |             |
+                                |             +-> [Consumer B], [Consumer C]
+                                |
+                                +-> [Consumer D], [Consumer E]
+
+                   That is one regulator and three power domains:
+
+                   - Domain 1: Switch-1, Consumers D & E.
+                   - Domain 2: Switch-2, Consumers B & C.
+                   - Domain 3: Consumer A.
+
+                   and this represents a "supplies" relationship:
+
+                   Domain-1 --> Domain-2 --> Domain-3.
+
+                   A power domain may have regulators that are supplied power
+                   by other regulators. i.e.::
+
+                     Regulator-1 -+-> Regulator-2 -+-> [Consumer A]
+                                  |
+                                  +-> [Consumer B]
+
+                   This gives us two regulators and two power domains:
+
+                   - Domain 1: Regulator-2, Consumer B.
+                   - Domain 2: Consumer A.
+
+                   and a "supplies" relationship:
+
+                   Domain-1 --> Domain-2
+
+
+  - Constraints
+                 - Constraints are used to define power levels for performance
+                   and hardware protection. Constraints exist at three levels:
+
+                   Regulator Level: This is defined by the regulator hardware
+                   operating parameters and is specified in the regulator
+                   datasheet. i.e.
+
+                     - voltage output is in the range 800mV -> 3500mV.
+                     - regulator current output limit is 20mA @ 5V but is
+                       10mA @ 10V.
+
+                   Power Domain Level: This is defined in software by kernel
+                   level board initialisation code. It is used to constrain a
+                   power domain to a particular power range. i.e.
+
+                     - Domain-1 voltage is 3300mV
+                     - Domain-2 voltage is 1400mV -> 1600mV
+                     - Domain-3 current limit is 0mA -> 20mA.
+
+                   Consumer Level: This is defined by consumer drivers
+                   dynamically setting voltage or current limit levels.
+
+                   e.g. a consumer backlight driver asks for a current increase
+                   from 5mA to 10mA to increase LCD illumination. This passes
+                   through the levels as follows :-
+
+                   Consumer: need to increase LCD brightness. Lookup and
+                   request next current mA value in brightness table (the
+                   consumer driver could be used on several different
+                   personalities based upon the same reference device).
+
+                   Power Domain: is the new current limit within the domain
+                   operating limits for this domain and system state (e.g.
+                   battery power, USB power)
+
+                   Regulator Domains: is the new current limit within the
+                   regulator operating parameters for input/output voltage.
+
+                   If the regulator request passes all the constraint tests
+                   then the new regulator value is applied.
+
+
+Design
+======
+
+The framework is designed and targeted at SoC based devices but may also be
+relevant to non-SoC devices and is split into the following four interfaces:-
+
+
+   1. Consumer driver interface.
+
+      This uses a similar API to the kernel clock interface in that consumer
+      drivers can get and put a regulator (like they can with clocks atm) and
+      get/set voltage, current limit, mode, enable and disable. This should
+      allow consumers complete control over their supply voltage and current
+      limit. This also compiles out if not in use so drivers can be reused in
+      systems with no regulator based power control.
+
+        See Documentation/power/regulator/consumer.rst
+
+   2. Regulator driver interface.
+
+      This allows regulator drivers to register their regulators and provide
+      operations to the core. It also has a notifier call chain for propagating
+      regulator events to clients.
+
+        See Documentation/power/regulator/regulator.rst
+
+   3. Machine interface.
+
+      This interface is for machine specific code and allows the creation of
+      voltage/current domains (with constraints) for each regulator. It can
+      provide regulator constraints that will prevent device damage through
+      overvoltage or overcurrent caused by buggy client drivers. It also
+      allows the creation of a regulator tree whereby some regulators are
+      supplied by others (similar to a clock tree).
+
+        See Documentation/power/regulator/machine.rst
+
+   4. Userspace ABI.
+
+      The framework also exports a lot of useful voltage/current/opmode data to
+      userspace via sysfs. This could be used to help monitor device power
+      consumption and status.
+
+        See Documentation/ABI/testing/sysfs-class-regulator
diff --git a/Documentation/power/regulator/overview.txt b/Documentation/power/regulator/overview.txt
deleted file mode 100644
index 721b4739ec32..000000000000
--- a/Documentation/power/regulator/overview.txt
+++ /dev/null
@@ -1,171 +0,0 @@
-Linux voltage and current regulator framework
-=============================================
-
-About
-=====
-
-This framework is designed to provide a standard kernel interface to control
-voltage and current regulators.
-
-The intention is to allow systems to dynamically control regulator power output
-in order to save power and prolong battery life. This applies to both voltage
-regulators (where voltage output is controllable) and current sinks (where
-current limit is controllable).
-
-(C) 2008  Wolfson Microelectronics PLC.
-Author: Liam Girdwood <lrg@slimlogic.co.uk>
-
-
-Nomenclature
-============
-
-Some terms used in this document:-
-
-  o Regulator    - Electronic device that supplies power to other devices.
-                   Most regulators can enable and disable their output while
-                   some can control their output voltage and or current.
-
-                   Input Voltage -> Regulator -> Output Voltage
-
-
-  o PMIC         - Power Management IC. An IC that contains numerous regulators
-                   and often contains other subsystems.
-
-
-  o Consumer     - Electronic device that is supplied power by a regulator.
-                   Consumers can be classified into two types:-
-
-                   Static: consumer does not change its supply voltage or
-                   current limit. It only needs to enable or disable its
-                   power supply. Its supply voltage is set by the hardware,
-                   bootloader, firmware or kernel board initialisation code.
-
-                   Dynamic: consumer needs to change its supply voltage or
-                   current limit to meet operation demands.
-
-
-  o Power Domain - Electronic circuit that is supplied its input power by the
-                   output power of a regulator, switch or by another power
-                   domain.
-
-                   The supply regulator may be behind a switch(s). i.e.
-
-                   Regulator -+-> Switch-1 -+-> Switch-2 --> [Consumer A]
-                              |             |
-                              |             +-> [Consumer B], [Consumer C]
-                              |
-                              +-> [Consumer D], [Consumer E]
-
-                   That is one regulator and three power domains:
-
-                   Domain 1: Switch-1, Consumers D & E.
-                   Domain 2: Switch-2, Consumers B & C.
-                   Domain 3: Consumer A.
-
-                   and this represents a "supplies" relationship:
-
-                   Domain-1 --> Domain-2 --> Domain-3.
-
-                   A power domain may have regulators that are supplied power
-                   by other regulators. i.e.
-
-                   Regulator-1 -+-> Regulator-2 -+-> [Consumer A]
-                                |
-                                +-> [Consumer B]
-
-                   This gives us two regulators and two power domains:
-
-                   Domain 1: Regulator-2, Consumer B.
-                   Domain 2: Consumer A.
-
-                   and a "supplies" relationship:
-
-                   Domain-1 --> Domain-2
-
-
-  o Constraints  - Constraints are used to define power levels for performance
-                   and hardware protection. Constraints exist at three levels:
-
-                   Regulator Level: This is defined by the regulator hardware
-                   operating parameters and is specified in the regulator
-                   datasheet. i.e.
-
-                     - voltage output is in the range 800mV -> 3500mV.
-                     - regulator current output limit is 20mA @ 5V but is
-                       10mA @ 10V.
-
-                   Power Domain Level: This is defined in software by kernel
-                   level board initialisation code. It is used to constrain a
-                   power domain to a particular power range. i.e.
-
-                     - Domain-1 voltage is 3300mV
-                     - Domain-2 voltage is 1400mV -> 1600mV
-                     - Domain-3 current limit is 0mA -> 20mA.
-
-                   Consumer Level: This is defined by consumer drivers
-                   dynamically setting voltage or current limit levels.
-
-                   e.g. a consumer backlight driver asks for a current increase
-                   from 5mA to 10mA to increase LCD illumination. This passes
-                   to through the levels as follows :-
-
-                   Consumer: need to increase LCD brightness. Lookup and
-                   request next current mA value in brightness table (the
-                   consumer driver could be used on several different
-                   personalities based upon the same reference device).
-
-                   Power Domain: is the new current limit within the domain
-                   operating limits for this domain and system state (e.g.
-                   battery power, USB power)
-
-                   Regulator Domains: is the new current limit within the
-                   regulator operating parameters for input/output voltage.
-
-                   If the regulator request passes all the constraint tests
-                   then the new regulator value is applied.
-
-
-Design
-======
-
-The framework is designed and targeted at SoC based devices but may also be
-relevant to non SoC devices and is split into the following four interfaces:-
-
-
-   1. Consumer driver interface.
-
-      This uses a similar API to the kernel clock interface in that consumer
-      drivers can get and put a regulator (like they can with clocks atm) and
-      get/set voltage, current limit, mode, enable and disable. This should
-      allow consumers complete control over their supply voltage and current
-      limit. This also compiles out if not in use so drivers can be reused in
-      systems with no regulator based power control.
-
-        See Documentation/power/regulator/consumer.txt
-
-   2. Regulator driver interface.
-
-      This allows regulator drivers to register their regulators and provide
-      operations to the core. It also has a notifier call chain for propagating
-      regulator events to clients.
-
-        See Documentation/power/regulator/regulator.txt
-
-   3. Machine interface.
-
-      This interface is for machine specific code and allows the creation of
-      voltage/current domains (with constraints) for each regulator. It can
-      provide regulator constraints that will prevent device damage through
-      overvoltage or overcurrent caused by buggy client drivers. It also
-      allows the creation of a regulator tree whereby some regulators are
-      supplied by others (similar to a clock tree).
-
-        See Documentation/power/regulator/machine.txt
-
-   4. Userspace ABI.
-
-      The framework also exports a lot of useful voltage/current/opmode data to
-      userspace via sysfs. This could be used to help monitor device power
-      consumption and status.
-
-        See Documentation/ABI/testing/sysfs-class-regulator
diff --git a/Documentation/power/regulator/regulator.rst b/Documentation/power/regulator/regulator.rst
new file mode 100644
index 000000000000..794b3256fbb9
--- /dev/null
+++ b/Documentation/power/regulator/regulator.rst
@@ -0,0 +1,32 @@
+==========================
+Regulator Driver Interface
+==========================
+
+The regulator driver interface is relatively simple and designed to allow
+regulator drivers to register their services with the core framework.
+
+
+Registration
+============
+
+Drivers can register a regulator by calling::
+
+  struct regulator_dev *regulator_register(struct regulator_desc *regulator_desc,
+					   const struct regulator_config *config);
+
+This will register the regulator's capabilities and operations to the regulator
+core.
+
+Regulators can be unregistered by calling::
+
+  void regulator_unregister(struct regulator_dev *rdev);
+
+
+Regulator Events
+================
+
+Regulators can send events (e.g. overtemperature, undervoltage, etc) to
+consumer drivers by calling::
+
+  int regulator_notifier_call_chain(struct regulator_dev *rdev,
+				    unsigned long event, void *data);
diff --git a/Documentation/power/regulator/regulator.txt b/Documentation/power/regulator/regulator.txt
deleted file mode 100644
index b17e5833ce21..000000000000
--- a/Documentation/power/regulator/regulator.txt
+++ /dev/null
@@ -1,30 +0,0 @@
-Regulator Driver Interface
-==========================
-
-The regulator driver interface is relatively simple and designed to allow
-regulator drivers to register their services with the core framework.
-
-
-Registration
-============
-
-Drivers can register a regulator by calling :-
-
-struct regulator_dev *regulator_register(struct regulator_desc *regulator_desc,
-					 const struct regulator_config *config);
-
-This will register the regulator's capabilities and operations to the regulator
-core.
-
-Regulators can be unregistered by calling :-
-
-void regulator_unregister(struct regulator_dev *rdev);
-
-
-Regulator Events
-================
-Regulators can send events (e.g. overtemperature, undervoltage, etc) to
-consumer drivers by calling :-
-
-int regulator_notifier_call_chain(struct regulator_dev *rdev,
-				  unsigned long event, void *data);
diff --git a/Documentation/power/runtime_pm.rst b/Documentation/power/runtime_pm.rst
new file mode 100644
index 000000000000..2c2ec99b5088
--- /dev/null
+++ b/Documentation/power/runtime_pm.rst
@@ -0,0 +1,940 @@
+==================================================
+Runtime Power Management Framework for I/O Devices
+==================================================
+
+(C) 2009-2011 Rafael J. Wysocki <rjw@sisk.pl>, Novell Inc.
+
+(C) 2010 Alan Stern <stern@rowland.harvard.edu>
+
+(C) 2014 Intel Corp., Rafael J. Wysocki <rafael.j.wysocki@intel.com>
+
+1. Introduction
+===============
+
+Support for runtime power management (runtime PM) of I/O devices is provided
+at the power management core (PM core) level by means of:
+
+* The power management workqueue pm_wq in which bus types and device drivers can
+  put their PM-related work items.  It is strongly recommended that pm_wq be
+  used for queuing all work items related to runtime PM, because this allows
+  them to be synchronized with system-wide power transitions (suspend to RAM,
+  hibernation and resume from system sleep states).  pm_wq is declared in
+  include/linux/pm_runtime.h and defined in kernel/power/main.c.
+
+* A number of runtime PM fields in the 'power' member of 'struct device' (which
+  is of the type 'struct dev_pm_info', defined in include/linux/pm.h) that can
+  be used for synchronizing runtime PM operations with one another.
+
+* Three device runtime PM callbacks in 'struct dev_pm_ops' (defined in
+  include/linux/pm.h).
+
+* A set of helper functions defined in drivers/base/power/runtime.c that can be
+  used for carrying out runtime PM operations in such a way that the
+  synchronization between them is taken care of by the PM core.  Bus types and
+  device drivers are encouraged to use these functions.
+
+The runtime PM callbacks present in 'struct dev_pm_ops', the device runtime PM
+fields of 'struct dev_pm_info' and the core helper functions provided for
+runtime PM are described below.
+
+2. Device Runtime PM Callbacks
+==============================
+
+There are three device runtime PM callbacks defined in 'struct dev_pm_ops'::
+
+  struct dev_pm_ops {
+	...
+	int (*runtime_suspend)(struct device *dev);
+	int (*runtime_resume)(struct device *dev);
+	int (*runtime_idle)(struct device *dev);
+	...
+  };
+
+The ->runtime_suspend(), ->runtime_resume() and ->runtime_idle() callbacks
+are executed by the PM core for the device's subsystem that may be either of
+the following:
+
+  1. PM domain of the device, if the device's PM domain object, dev->pm_domain,
+     is present.
+
+  2. Device type of the device, if both dev->type and dev->type->pm are present.
+
+  3. Device class of the device, if both dev->class and dev->class->pm are
+     present.
+
+  4. Bus type of the device, if both dev->bus and dev->bus->pm are present.
+
+If the subsystem chosen by applying the above rules doesn't provide the relevant
+callback, the PM core will invoke the corresponding driver callback stored in
+dev->driver->pm directly (if present).
+
+The PM core always checks which callback to use in the order given above, so the
+priority order of callbacks from high to low is: PM domain, device type, class
+and bus type.  Moreover, the high-priority one will always take precedence over
+a low-priority one.  The PM domain, bus type, device type and class callbacks
+are referred to as subsystem-level callbacks in what follows.
+
+By default, the callbacks are always invoked in process context with interrupts
+enabled.  However, the pm_runtime_irq_safe() helper function can be used to tell
+the PM core that it is safe to run the ->runtime_suspend(), ->runtime_resume()
+and ->runtime_idle() callbacks for the given device in atomic context with
+interrupts disabled.  This implies that the callback routines in question must
+not block or sleep, but it also means that the synchronous helper functions
+listed at the end of Section 4 may be used for that device within an interrupt
+handler or generally in an atomic context.
+
+The subsystem-level suspend callback, if present, is **entirely responsible**
+for handling the suspend of the device as appropriate, which may, but need not
+include executing the device driver's own ->runtime_suspend() callback (from the
+PM core's point of view it is not necessary to implement a ->runtime_suspend()
+callback in a device driver as long as the subsystem-level suspend callback
+knows what to do to handle the device).
+
+  * Once the subsystem-level suspend callback (or the driver suspend callback,
+    if invoked directly) has completed successfully for the given device, the PM
+    core regards the device as suspended, which need not mean that it has been
+    put into a low power state.  It is supposed to mean, however, that the
+    device will not process data and will not communicate with the CPU(s) and
+    RAM until the appropriate resume callback is executed for it.  The runtime
+    PM status of a device after successful execution of the suspend callback is
+    'suspended'.
+
+  * If the suspend callback returns -EBUSY or -EAGAIN, the device's runtime PM
+    status remains 'active', which means that the device _must_ be fully
+    operational afterwards.
+
+  * If the suspend callback returns an error code different from -EBUSY and
+    -EAGAIN, the PM core regards this as a fatal error and will refuse to run
+    the helper functions described in Section 4 for the device until its status
+    is directly set to either 'active', or 'suspended' (the PM core provides
+    special helper functions for this purpose).
+
+In particular, if the driver requires remote wakeup capability (i.e. hardware
+mechanism allowing the device to request a change of its power state, such as
+PCI PME) for proper functioning and device_can_wakeup() returns 'false' for the
+device, then ->runtime_suspend() should return -EBUSY.  On the other hand, if
+device_can_wakeup() returns 'true' for the device and the device is put into a
+low-power state during the execution of the suspend callback, it is expected
+that remote wakeup will be enabled for the device.  Generally, remote wakeup
+should be enabled for all input devices put into low-power states at run time.
+
+The subsystem-level resume callback, if present, is **entirely responsible** for
+handling the resume of the device as appropriate, which may, but need not
+include executing the device driver's own ->runtime_resume() callback (from the
+PM core's point of view it is not necessary to implement a ->runtime_resume()
+callback in a device driver as long as the subsystem-level resume callback knows
+what to do to handle the device).
+
+  * Once the subsystem-level resume callback (or the driver resume callback, if
+    invoked directly) has completed successfully, the PM core regards the device
+    as fully operational, which means that the device _must_ be able to complete
+    I/O operations as needed.  The runtime PM status of the device is then
+    'active'.
+
+  * If the resume callback returns an error code, the PM core regards this as a
+    fatal error and will refuse to run the helper functions described in Section
+    4 for the device, until its status is directly set to either 'active', or
+    'suspended' (by means of special helper functions provided by the PM core
+    for this purpose).
+
+The idle callback (a subsystem-level one, if present, or the driver one) is
+executed by the PM core whenever the device appears to be idle, which is
+indicated to the PM core by two counters, the device's usage counter and the
+counter of 'active' children of the device.
+
+  * If any of these counters is decreased using a helper function provided by
+    the PM core and it turns out to be equal to zero, the other counter is
+    checked.  If that counter also is equal to zero, the PM core executes the
+    idle callback with the device as its argument.
+
+The action performed by the idle callback is totally dependent on the subsystem
+(or driver) in question, but the expected and recommended action is to check
+if the device can be suspended (i.e. if all of the conditions necessary for
+suspending the device are satisfied) and to queue up a suspend request for the
+device in that case.  If there is no idle callback, or if the callback returns
+0, then the PM core will attempt to carry out a runtime suspend of the device,
+also respecting devices configured for autosuspend.  In essence this means a
+call to pm_runtime_autosuspend() (do note that drivers need to update the
+device last busy mark, pm_runtime_mark_last_busy(), to control the delay under
+this circumstance).  To prevent this (for example, if the callback routine has
+started a delayed suspend), the routine must return a non-zero value.  Negative
+error return codes are ignored by the PM core.
+
+The helper functions provided by the PM core, described in Section 4, guarantee
+that the following constraints are met with respect to runtime PM callbacks for
+one device:
+
+(1) The callbacks are mutually exclusive (e.g. it is forbidden to execute
+    ->runtime_suspend() in parallel with ->runtime_resume() or with another
+    instance of ->runtime_suspend() for the same device) with the exception that
+    ->runtime_suspend() or ->runtime_resume() can be executed in parallel with
+    ->runtime_idle() (although ->runtime_idle() will not be started while any
+    of the other callbacks is being executed for the same device).
+
+(2) ->runtime_idle() and ->runtime_suspend() can only be executed for 'active'
+    devices (i.e. the PM core will only execute ->runtime_idle() or
+    ->runtime_suspend() for the devices the runtime PM status of which is
+    'active').
+
+(3) ->runtime_idle() and ->runtime_suspend() can only be executed for a device
+    the usage counter of which is equal to zero _and_ either the counter of
+    'active' children of which is equal to zero, or the 'power.ignore_children'
+    flag of which is set.
+
+(4) ->runtime_resume() can only be executed for 'suspended' devices (i.e. the
+    PM core will only execute ->runtime_resume() for the devices the runtime
+    PM status of which is 'suspended').
+
+Additionally, the helper functions provided by the PM core obey the following
+rules:
+
+  * If ->runtime_suspend() is about to be executed or there's a pending request
+    to execute it, ->runtime_idle() will not be executed for the same device.
+
+  * A request to execute or to schedule the execution of ->runtime_suspend()
+    will cancel any pending requests to execute ->runtime_idle() for the same
+    device.
+
+  * If ->runtime_resume() is about to be executed or there's a pending request
+    to execute it, the other callbacks will not be executed for the same device.
+
+  * A request to execute ->runtime_resume() will cancel any pending or
+    scheduled requests to execute the other callbacks for the same device,
+    except for scheduled autosuspends.
+
+3. Runtime PM Device Fields
+===========================
+
+The following device runtime PM fields are present in 'struct dev_pm_info', as
+defined in include/linux/pm.h:
+
+  `struct timer_list suspend_timer;`
+    - timer used for scheduling (delayed) suspend and autosuspend requests
+
+  `unsigned long timer_expires;`
+    - timer expiration time, in jiffies (if this is different from zero, the
+      timer is running and will expire at that time, otherwise the timer is not
+      running)
+
+  `struct work_struct work;`
+    - work structure used for queuing up requests (i.e. work items in pm_wq)
+
+  `wait_queue_head_t wait_queue;`
+    - wait queue used if any of the helper functions needs to wait for another
+      one to complete
+
+  `spinlock_t lock;`
+    - lock used for synchronization
+
+  `atomic_t usage_count;`
+    - the usage counter of the device
+
+  `atomic_t child_count;`
+    - the count of 'active' children of the device
+
+  `unsigned int ignore_children;`
+    - if set, the value of child_count is ignored (but still updated)
+
+  `unsigned int disable_depth;`
+    - used for disabling the helper functions (they work normally if this is
+      equal to zero); the initial value of it is 1 (i.e. runtime PM is
+      initially disabled for all devices)
+
+  `int runtime_error;`
+    - if set, there was a fatal error (one of the callbacks returned error code
+      as described in Section 2), so the helper functions will not work until
+      this flag is cleared; this is the error code returned by the failing
+      callback
+
+  `unsigned int idle_notification;`
+    - if set, ->runtime_idle() is being executed
+
+  `unsigned int request_pending;`
+    - if set, there's a pending request (i.e. a work item queued up into pm_wq)
+
+  `enum rpm_request request;`
+    - type of request that's pending (valid if request_pending is set)
+
+  `unsigned int deferred_resume;`
+    - set if ->runtime_resume() is about to be run while ->runtime_suspend() is
+      being executed for that device and it is not practical to wait for the
+      suspend to complete; means "start a resume as soon as you've suspended"
+
+  `enum rpm_status runtime_status;`
+    - the runtime PM status of the device; this field's initial value is
+      RPM_SUSPENDED, which means that each device is initially regarded by the
+      PM core as 'suspended', regardless of its real hardware status
+
+  `unsigned int runtime_auto;`
+    - if set, indicates that the user space has allowed the device driver to
+      power manage the device at run time via the /sys/devices/.../power/control
+      interface; it may only be modified with the help of the pm_runtime_allow()
+      and pm_runtime_forbid() helper functions
+
+  `unsigned int no_callbacks;`
+    - indicates that the device does not use the runtime PM callbacks (see
+      Section 8); it may be modified only by the pm_runtime_no_callbacks()
+      helper function
+
+  `unsigned int irq_safe;`
+    - indicates that the ->runtime_suspend() and ->runtime_resume() callbacks
+      will be invoked with the spinlock held and interrupts disabled
+
+  `unsigned int use_autosuspend;`
+    - indicates that the device's driver supports delayed autosuspend (see
+      Section 9); it may be modified only by the
+      pm_runtime{_dont}_use_autosuspend() helper functions
+
+  `unsigned int timer_autosuspends;`
+    - indicates that the PM core should attempt to carry out an autosuspend
+      when the timer expires rather than a normal suspend
+
+  `int autosuspend_delay;`
+    - the delay time (in milliseconds) to be used for autosuspend
+
+  `unsigned long last_busy;`
+    - the time (in jiffies) when the pm_runtime_mark_last_busy() helper
+      function was last called for this device; used in calculating inactivity
+      periods for autosuspend
+
+All of the above fields are members of the 'power' member of 'struct device'.
+
+4. Runtime PM Device Helper Functions
+=====================================
+
+The following runtime PM helper functions are defined in
+drivers/base/power/runtime.c and include/linux/pm_runtime.h:
+
+  `void pm_runtime_init(struct device *dev);`
+    - initialize the device runtime PM fields in 'struct dev_pm_info'
+
+  `void pm_runtime_remove(struct device *dev);`
+    - make sure that the runtime PM of the device will be disabled after
+      removing the device from device hierarchy
+
+  `int pm_runtime_idle(struct device *dev);`
+    - execute the subsystem-level idle callback for the device; returns an
+      error code on failure, where -EINPROGRESS means that ->runtime_idle() is
+      already being executed; if there is no callback or the callback returns 0
+      then run pm_runtime_autosuspend(dev) and return its result
+
+  `int pm_runtime_suspend(struct device *dev);`
+    - execute the subsystem-level suspend callback for the device; returns 0 on
+      success, 1 if the device's runtime PM status was already 'suspended', or
+      error code on failure, where -EAGAIN or -EBUSY means it is safe to attempt
+      to suspend the device again in future and -EACCES means that
+      'power.disable_depth' is different from 0
+
+  `int pm_runtime_autosuspend(struct device *dev);`
+    - same as pm_runtime_suspend() except that the autosuspend delay is taken
+      into account; if pm_runtime_autosuspend_expiration() says the delay has
+      not yet expired then an autosuspend is scheduled for the appropriate time
+      and 0 is returned
+
+  `int pm_runtime_resume(struct device *dev);`
+    - execute the subsystem-level resume callback for the device; returns 0 on
+      success, 1 if the device's runtime PM status was already 'active' or
+      error code on failure, where -EAGAIN means it may be safe to attempt to
+      resume the device again in future, but 'power.runtime_error' should be
+      checked additionally, and -EACCES means that 'power.disable_depth' is
+      different from 0
+
+  `int pm_request_idle(struct device *dev);`
+    - submit a request to execute the subsystem-level idle callback for the
+      device (the request is represented by a work item in pm_wq); returns 0 on
+      success or error code if the request has not been queued up
+
+  `int pm_request_autosuspend(struct device *dev);`
+    - schedule the execution of the subsystem-level suspend callback for the
+      device when the autosuspend delay has expired; if the delay has already
+      expired then the work item is queued up immediately
+
+  `int pm_schedule_suspend(struct device *dev, unsigned int delay);`
+    - schedule the execution of the subsystem-level suspend callback for the
+      device in future, where 'delay' is the time to wait before queuing up a
+      suspend work item in pm_wq, in milliseconds (if 'delay' is zero, the work
+      item is queued up immediately); returns 0 on success, 1 if the device's PM
+      runtime status was already 'suspended', or error code if the request
+      hasn't been scheduled (or queued up if 'delay' is 0); if the execution of
+      ->runtime_suspend() is already scheduled and not yet expired, the new
+      value of 'delay' will be used as the time to wait
+
+  `int pm_request_resume(struct device *dev);`
+    - submit a request to execute the subsystem-level resume callback for the
+      device (the request is represented by a work item in pm_wq); returns 0 on
+      success, 1 if the device's runtime PM status was already 'active', or
+      error code if the request hasn't been queued up
+
+  `void pm_runtime_get_noresume(struct device *dev);`
+    - increment the device's usage counter
+
+  `int pm_runtime_get(struct device *dev);`
+    - increment the device's usage counter, run pm_request_resume(dev) and
+      return its result
+
+  `int pm_runtime_get_sync(struct device *dev);`
+    - increment the device's usage counter, run pm_runtime_resume(dev) and
+      return its result
+
+  `int pm_runtime_get_if_in_use(struct device *dev);`
+    - return -EINVAL if 'power.disable_depth' is nonzero; otherwise, if the
+      runtime PM status is RPM_ACTIVE and the runtime PM usage counter is
+      nonzero, increment the counter and return 1; otherwise return 0 without
+      changing the counter
+
+  `void pm_runtime_put_noidle(struct device *dev);`
+    - decrement the device's usage counter
+
+  `int pm_runtime_put(struct device *dev);`
+    - decrement the device's usage counter; if the result is 0 then run
+      pm_request_idle(dev) and return its result
+
+  `int pm_runtime_put_autosuspend(struct device *dev);`
+    - decrement the device's usage counter; if the result is 0 then run
+      pm_request_autosuspend(dev) and return its result
+
+  `int pm_runtime_put_sync(struct device *dev);`
+    - decrement the device's usage counter; if the result is 0 then run
+      pm_runtime_idle(dev) and return its result
+
+  `int pm_runtime_put_sync_suspend(struct device *dev);`
+    - decrement the device's usage counter; if the result is 0 then run
+      pm_runtime_suspend(dev) and return its result
+
+  `int pm_runtime_put_sync_autosuspend(struct device *dev);`
+    - decrement the device's usage counter; if the result is 0 then run
+      pm_runtime_autosuspend(dev) and return its result
+
+  `void pm_runtime_enable(struct device *dev);`
+    - decrement the device's 'power.disable_depth' field; if that field is equal
+      to zero, the runtime PM helper functions can execute subsystem-level
+      callbacks described in Section 2 for the device
+
+  `int pm_runtime_disable(struct device *dev);`
+    - increment the device's 'power.disable_depth' field (if the value of that
+      field was previously zero, this prevents subsystem-level runtime PM
+      callbacks from being run for the device), make sure that all of the
+      pending runtime PM operations on the device are either completed or
+      canceled; returns 1 if there was a resume request pending and it was
+      necessary to execute the subsystem-level resume callback for the device
+      to satisfy that request, otherwise 0 is returned
+
+  `int pm_runtime_barrier(struct device *dev);`
+    - check if there's a resume request pending for the device and resume it
+      (synchronously) in that case, cancel any other pending runtime PM requests
+      regarding it and wait for all runtime PM operations on it in progress to
+      complete; returns 1 if there was a resume request pending and it was
+      necessary to execute the subsystem-level resume callback for the device to
+      satisfy that request, otherwise 0 is returned
+
+  `void pm_suspend_ignore_children(struct device *dev, bool enable);`
+    - set/unset the power.ignore_children flag of the device
+
+  `int pm_runtime_set_active(struct device *dev);`
+    - clear the device's 'power.runtime_error' flag, set the device's runtime
+      PM status to 'active' and update its parent's counter of 'active'
+      children as appropriate (it is only valid to use this function if
+      'power.runtime_error' is set or 'power.disable_depth' is greater than
+      zero); it will fail and return error code if the device has a parent
+      which is not active and the 'power.ignore_children' flag of which is unset
+
+  `void pm_runtime_set_suspended(struct device *dev);`
+    - clear the device's 'power.runtime_error' flag, set the device's runtime
+      PM status to 'suspended' and update its parent's counter of 'active'
+      children as appropriate (it is only valid to use this function if
+      'power.runtime_error' is set or 'power.disable_depth' is greater than
+      zero)
+
+  `bool pm_runtime_active(struct device *dev);`
+    - return true if the device's runtime PM status is 'active' or its
+      'power.disable_depth' field is not equal to zero, or false otherwise
+
+  `bool pm_runtime_suspended(struct device *dev);`
+    - return true if the device's runtime PM status is 'suspended' and its
+      'power.disable_depth' field is equal to zero, or false otherwise
+
+  `bool pm_runtime_status_suspended(struct device *dev);`
+    - return true if the device's runtime PM status is 'suspended'
+
+  `void pm_runtime_allow(struct device *dev);`
+    - set the power.runtime_auto flag for the device and decrease its usage
+      counter (used by the /sys/devices/.../power/control interface to
+      effectively allow the device to be power managed at run time)
+
+  `void pm_runtime_forbid(struct device *dev);`
+    - unset the power.runtime_auto flag for the device and increase its usage
+      counter (used by the /sys/devices/.../power/control interface to
+      effectively prevent the device from being power managed at run time)
+
+  `void pm_runtime_no_callbacks(struct device *dev);`
+    - set the power.no_callbacks flag for the device and remove the runtime
+      PM attributes from /sys/devices/.../power (or prevent them from being
+      added when the device is registered)
+
+  `void pm_runtime_irq_safe(struct device *dev);`
+    - set the power.irq_safe flag for the device, causing the runtime-PM
+      callbacks to be invoked with interrupts off
+
+  `bool pm_runtime_is_irq_safe(struct device *dev);`
+    - return true if power.irq_safe flag was set for the device, causing
+      the runtime-PM callbacks to be invoked with interrupts off
+
+  `void pm_runtime_mark_last_busy(struct device *dev);`
+    - set the power.last_busy field to the current time
+
+  `void pm_runtime_use_autosuspend(struct device *dev);`
+    - set the power.use_autosuspend flag, enabling autosuspend delays; call
+      pm_runtime_get_sync if the flag was previously cleared and
+      power.autosuspend_delay is negative
+
+  `void pm_runtime_dont_use_autosuspend(struct device *dev);`
+    - clear the power.use_autosuspend flag, disabling autosuspend delays;
+      decrement the device's usage counter if the flag was previously set and
+      power.autosuspend_delay is negative; call pm_runtime_idle
+
+  `void pm_runtime_set_autosuspend_delay(struct device *dev, int delay);`
+    - set the power.autosuspend_delay value to 'delay' (expressed in
+      milliseconds); if 'delay' is negative then runtime suspends are
+      prevented; if power.use_autosuspend is set, pm_runtime_get_sync may be
+      called or the device's usage counter may be decremented and
+      pm_runtime_idle called depending on if power.autosuspend_delay is
+      changed to or from a negative value; if power.use_autosuspend is clear,
+      pm_runtime_idle is called
+
+  `unsigned long pm_runtime_autosuspend_expiration(struct device *dev);`
+    - calculate the time when the current autosuspend delay period will expire,
+      based on power.last_busy and power.autosuspend_delay; if the delay time
+      is 1000 ms or larger then the expiration time is rounded up to the
+      nearest second; returns 0 if the delay period has already expired or
+      power.use_autosuspend isn't set, otherwise returns the expiration time
+      in jiffies
+
+It is safe to execute the following helper functions from interrupt context:
+
+- pm_request_idle()
+- pm_request_autosuspend()
+- pm_schedule_suspend()
+- pm_request_resume()
+- pm_runtime_get_noresume()
+- pm_runtime_get()
+- pm_runtime_put_noidle()
+- pm_runtime_put()
+- pm_runtime_put_autosuspend()
+- pm_runtime_enable()
+- pm_suspend_ignore_children()
+- pm_runtime_set_active()
+- pm_runtime_set_suspended()
+- pm_runtime_suspended()
+- pm_runtime_mark_last_busy()
+- pm_runtime_autosuspend_expiration()
+
+If pm_runtime_irq_safe() has been called for a device then the following helper
+functions may also be used in interrupt context:
+
+- pm_runtime_idle()
+- pm_runtime_suspend()
+- pm_runtime_autosuspend()
+- pm_runtime_resume()
+- pm_runtime_get_sync()
+- pm_runtime_put_sync()
+- pm_runtime_put_sync_suspend()
+- pm_runtime_put_sync_autosuspend()
+
+5. Runtime PM Initialization, Device Probing and Removal
+========================================================
+
+Initially, the runtime PM is disabled for all devices, which means that the
+majority of the runtime PM helper functions described in Section 4 will return
+-EAGAIN until pm_runtime_enable() is called for the device.
+
+In addition to that, the initial runtime PM status of all devices is
+'suspended', but it need not reflect the actual physical state of the device.
+Thus, if the device is initially active (i.e. it is able to process I/O), its
+runtime PM status must be changed to 'active', with the help of
+pm_runtime_set_active(), before pm_runtime_enable() is called for the device.
+
+However, if the device has a parent and the parent's runtime PM is enabled,
+calling pm_runtime_set_active() for the device will affect the parent, unless
+the parent's 'power.ignore_children' flag is set.  Namely, in that case the
+parent won't be able to suspend at run time, using the PM core's helper
+functions, as long as the child's status is 'active', even if the child's
+runtime PM is still disabled (i.e. pm_runtime_enable() hasn't been called for
+the child yet or pm_runtime_disable() has been called for it).  For this reason,
+once pm_runtime_set_active() has been called for the device, pm_runtime_enable()
+should be called for it too as soon as reasonably possible or its runtime PM
+status should be changed back to 'suspended' with the help of
+pm_runtime_set_suspended().
+
+If the default initial runtime PM status of the device (i.e. 'suspended')
+reflects the actual state of the device, its bus type's or its driver's
+->probe() callback will likely need to wake it up using one of the PM core's
+helper functions described in Section 4.  In that case, pm_runtime_resume()
+should be used.  Of course, for this purpose the device's runtime PM has to be
+enabled earlier by calling pm_runtime_enable().
+
+Note, if the device may execute pm_runtime calls during the probe (such as
+if it is registered with a subsystem that may call back in) then the
+pm_runtime_get_sync() call paired with a pm_runtime_put() call will be
+appropriate to ensure that the device is not put back to sleep during the
+probe. This can happen with systems such as the network device layer.
+
+It may be desirable to suspend the device once ->probe() has finished.
+Therefore the driver core uses the asynchronous pm_request_idle() to submit a
+request to execute the subsystem-level idle callback for the device at that
+time.  A driver that makes use of the runtime autosuspend feature may want to
+update the last busy mark before returning from ->probe().
+
+Moreover, the driver core prevents runtime PM callbacks from racing with the bus
+notifier callback in __device_release_driver(), which is necessary, because the
+notifier is used by some subsystems to carry out operations affecting the
+runtime PM functionality.  It does so by calling pm_runtime_get_sync() before
+driver_sysfs_remove() and the BUS_NOTIFY_UNBIND_DRIVER notifications.  This
+resumes the device if it's in the suspended state and prevents it from
+being suspended again while those routines are being executed.
+
+To allow bus types and drivers to put devices into the suspended state by
+calling pm_runtime_suspend() from their ->remove() routines, the driver core
+executes pm_runtime_put_sync() after running the BUS_NOTIFY_UNBIND_DRIVER
+notifications in __device_release_driver().  This requires bus types and
+drivers to make their ->remove() callbacks avoid races with runtime PM directly,
+but it also allows more flexibility in the handling of devices during the
+removal of their drivers.
+
+Drivers in ->remove() callback should undo the runtime PM changes done
+in ->probe(). Usually this means calling pm_runtime_disable(),
+pm_runtime_dont_use_autosuspend() etc.
+
+The user space can effectively disallow the driver of the device to power manage
+it at run time by changing the value of its /sys/devices/.../power/control
+attribute to "on", which causes pm_runtime_forbid() to be called.  In principle,
+this mechanism may also be used by the driver to effectively turn off the
+runtime power management of the device until the user space turns it on.
+Namely, during the initialization the driver can make sure that the runtime PM
+status of the device is 'active' and call pm_runtime_forbid().  It should be
+noted, however, that if the user space has already intentionally changed the
+value of /sys/devices/.../power/control to "auto" to allow the driver to power
+manage the device at run time, the driver may confuse it by using
+pm_runtime_forbid() this way.
+
+6. Runtime PM and System Sleep
+==============================
+
+Runtime PM and system sleep (i.e., system suspend and hibernation, also known
+as suspend-to-RAM and suspend-to-disk) interact with each other in a couple of
+ways.  If a device is active when a system sleep starts, everything is
+straightforward.  But what should happen if the device is already suspended?
+
+The device may have different wake-up settings for runtime PM and system sleep.
+For example, remote wake-up may be enabled for runtime suspend but disallowed
+for system sleep (device_may_wakeup(dev) returns 'false').  When this happens,
+the subsystem-level system suspend callback is responsible for changing the
+device's wake-up setting (it may leave that to the device driver's system
+suspend routine).  It may be necessary to resume the device and suspend it again
+in order to do so.  The same is true if the driver uses different power levels
+or other settings for runtime suspend and system sleep.
+
+During system resume, the simplest approach is to bring all devices back to full
+power, even if they had been suspended before the system suspend began.  There
+are several reasons for this, including:
+
+  * The device might need to switch power levels, wake-up settings, etc.
+
+  * Remote wake-up events might have been lost by the firmware.
+
+  * The device's children may need the device to be at full power in order
+    to resume themselves.
+
+  * The driver's idea of the device state may not agree with the device's
+    physical state.  This can happen during resume from hibernation.
+
+  * The device might need to be reset.
+
+  * Even though the device was suspended, if its usage counter was > 0 then most
+    likely it would need a runtime resume in the near future anyway.
+
+If the device had been suspended before the system suspend began and it's
+brought back to full power during resume, then its runtime PM status will have
+to be updated to reflect the actual post-system sleep status.  The way to do
+this is:
+
+	 - pm_runtime_disable(dev);
+	 - pm_runtime_set_active(dev);
+	 - pm_runtime_enable(dev);
+
+The PM core always increments the runtime usage counter before calling the
+->suspend() callback and decrements it after calling the ->resume() callback.
+Hence disabling runtime PM temporarily like this will not cause any runtime
+suspend attempts to be permanently lost.  If the usage count goes to zero
+following the return of the ->resume() callback, the ->runtime_idle() callback
+will be invoked as usual.
+
+On some systems, however, system sleep is not entered through a global firmware
+or hardware operation.  Instead, all hardware components are put into low-power
+states directly by the kernel in a coordinated way.  Then, the system sleep
+state effectively follows from the states the hardware components end up in
+and the system is woken up from that state by a hardware interrupt or a similar
+mechanism entirely under the kernel's control.  As a result, the kernel never
+gives control away and the states of all devices during resume are precisely
+known to it.  If that is the case and none of the situations listed above takes
+place (in particular, if the system is not waking up from hibernation), it may
+be more efficient to leave the devices that had been suspended before the system
+suspend began in the suspended state.
+
+To this end, the PM core provides a mechanism allowing some coordination between
+different levels of device hierarchy.  Namely, if a system suspend .prepare()
+callback returns a positive number for a device, that indicates to the PM core
+that the device appears to be runtime-suspended and its state is fine, so it
+may be left in runtime suspend provided that all of its descendants are also
+left in runtime suspend.  If that happens, the PM core will not execute any
+system suspend and resume callbacks for all of those devices, except for the
+complete callback, which is then entirely responsible for handling the device
+as appropriate.  This only applies to system suspend transitions that are not
+related to hibernation (see Documentation/driver-api/pm/devices.rst for more
+information).
+
+The PM core does its best to reduce the probability of race conditions between
+the runtime PM and system suspend/resume (and hibernation) callbacks by carrying
+out the following operations:
+
+  * During system suspend pm_runtime_get_noresume() is called for every device
+    right before executing the subsystem-level .prepare() callback for it and
+    pm_runtime_barrier() is called for every device right before executing the
+    subsystem-level .suspend() callback for it.  In addition to that the PM core
+    calls __pm_runtime_disable() with 'false' as the second argument for every
+    device right before executing the subsystem-level .suspend_late() callback
+    for it.
+
+  * During system resume pm_runtime_enable() and pm_runtime_put() are called for
+    every device right after executing the subsystem-level .resume_early()
+    callback and right after executing the subsystem-level .complete() callback
+    for it, respectively.
+
+7. Generic subsystem callbacks
+==============================
+
+Subsystems may wish to conserve code space by using the set of generic power
+management callbacks provided by the PM core, defined in
+drivers/base/power/generic_ops.c:
+
+  `int pm_generic_runtime_suspend(struct device *dev);`
+    - invoke the ->runtime_suspend() callback provided by the driver of this
+      device and return its result, or return 0 if not defined
+
+  `int pm_generic_runtime_resume(struct device *dev);`
+    - invoke the ->runtime_resume() callback provided by the driver of this
+      device and return its result, or return 0 if not defined
+
+  `int pm_generic_suspend(struct device *dev);`
+    - if the device has not been suspended at run time, invoke the ->suspend()
+      callback provided by its driver and return its result, or return 0 if not
+      defined
+
+  `int pm_generic_suspend_noirq(struct device *dev);`
+    - if pm_runtime_suspended(dev) returns "false", invoke the ->suspend_noirq()
+      callback provided by the device's driver and return its result, or return
+      0 if not defined
+
+  `int pm_generic_resume(struct device *dev);`
+    - invoke the ->resume() callback provided by the driver of this device and,
+      if successful, change the device's runtime PM status to 'active'
+
+  `int pm_generic_resume_noirq(struct device *dev);`
+    - invoke the ->resume_noirq() callback provided by the driver of this device
+
+  `int pm_generic_freeze(struct device *dev);`
+    - if the device has not been suspended at run time, invoke the ->freeze()
+      callback provided by its driver and return its result, or return 0 if not
+      defined
+
+  `int pm_generic_freeze_noirq(struct device *dev);`
+    - if pm_runtime_suspended(dev) returns "false", invoke the ->freeze_noirq()
+      callback provided by the device's driver and return its result, or return
+      0 if not defined
+
+  `int pm_generic_thaw(struct device *dev);`
+    - if the device has not been suspended at run time, invoke the ->thaw()
+      callback provided by its driver and return its result, or return 0 if not
+      defined
+
+  `int pm_generic_thaw_noirq(struct device *dev);`
+    - if pm_runtime_suspended(dev) returns "false", invoke the ->thaw_noirq()
+      callback provided by the device's driver and return its result, or return
+      0 if not defined
+
+  `int pm_generic_poweroff(struct device *dev);`
+    - if the device has not been suspended at run time, invoke the ->poweroff()
+      callback provided by its driver and return its result, or return 0 if not
+      defined
+
+  `int pm_generic_poweroff_noirq(struct device *dev);`
+    - if pm_runtime_suspended(dev) returns "false", run the ->poweroff_noirq()
+      callback provided by the device's driver and return its result, or return
+      0 if not defined
+
+  `int pm_generic_restore(struct device *dev);`
+    - invoke the ->restore() callback provided by the driver of this device and,
+      if successful, change the device's runtime PM status to 'active'
+
+  `int pm_generic_restore_noirq(struct device *dev);`
+    - invoke the ->restore_noirq() callback provided by the device's driver
+
+These functions are the defaults used by the PM core, if a subsystem doesn't
+provide its own callbacks for ->runtime_idle(), ->runtime_suspend(),
+->runtime_resume(), ->suspend(), ->suspend_noirq(), ->resume(),
+->resume_noirq(), ->freeze(), ->freeze_noirq(), ->thaw(), ->thaw_noirq(),
+->poweroff(), ->poweroff_noirq(), ->restore(), ->restore_noirq() in the
+subsystem-level dev_pm_ops structure.
+
+Device drivers that wish to use the same function as a system suspend, freeze,
+poweroff and runtime suspend callback, and similarly for system resume, thaw,
+restore, and runtime resume, can achieve this with the help of the
+UNIVERSAL_DEV_PM_OPS macro defined in include/linux/pm.h (possibly setting its
+last argument to NULL).
+
+8. "No-Callback" Devices
+========================
+
+Some "devices" are only logical sub-devices of their parent and cannot be
+power-managed on their own.  (The prototype example is a USB interface.  Entire
+USB devices can go into low-power mode or send wake-up requests, but neither is
+possible for individual interfaces.)  The drivers for these devices have no
+need of runtime PM callbacks; if the callbacks did exist, ->runtime_suspend()
+and ->runtime_resume() would always return 0 without doing anything else and
+->runtime_idle() would always call pm_runtime_suspend().
+
+Subsystems can tell the PM core about these devices by calling
+pm_runtime_no_callbacks().  This should be done after the device structure is
+initialized and before it is registered (although after device registration is
+also okay).  The routine will set the device's power.no_callbacks flag and
+prevent the non-debugging runtime PM sysfs attributes from being created.
+
+When power.no_callbacks is set, the PM core will not invoke the
+->runtime_idle(), ->runtime_suspend(), or ->runtime_resume() callbacks.
+Instead it will assume that suspends and resumes always succeed and that idle
+devices should be suspended.
+
+As a consequence, the PM core will never directly inform the device's subsystem
+or driver about runtime power changes.  Instead, the driver for the device's
+parent must take responsibility for telling the device's driver when the
+parent's power state changes.
+
+9. Autosuspend, or automatically-delayed suspends
+=================================================
+
+Changing a device's power state isn't free; it requires both time and energy.
+A device should be put in a low-power state only when there's some reason to
+think it will remain in that state for a substantial time.  A common heuristic
+says that a device which hasn't been used for a while is liable to remain
+unused; following this advice, drivers should not allow devices to be suspended
+at runtime until they have been inactive for some minimum period.  Even when
+the heuristic ends up being non-optimal, it will still prevent devices from
+"bouncing" too rapidly between low-power and full-power states.
+
+The term "autosuspend" is an historical remnant.  It doesn't mean that the
+device is automatically suspended (the subsystem or driver still has to call
+the appropriate PM routines); rather it means that runtime suspends will
+automatically be delayed until the desired period of inactivity has elapsed.
+
+Inactivity is determined based on the power.last_busy field.  Drivers should
+call pm_runtime_mark_last_busy() to update this field after carrying out I/O,
+typically just before calling pm_runtime_put_autosuspend().  The desired length
+of the inactivity period is a matter of policy.  Subsystems can set this length
+initially by calling pm_runtime_set_autosuspend_delay(), but after device
+registration the length should be controlled by user space, using the
+/sys/devices/.../power/autosuspend_delay_ms attribute.
+
+In order to use autosuspend, subsystems or drivers must call
+pm_runtime_use_autosuspend() (preferably before registering the device), and
+thereafter they should use the various `*_autosuspend()` helper functions
+instead of the non-autosuspend counterparts::
+
+	Instead of: pm_runtime_suspend    use: pm_runtime_autosuspend;
+	Instead of: pm_schedule_suspend   use: pm_request_autosuspend;
+	Instead of: pm_runtime_put        use: pm_runtime_put_autosuspend;
+	Instead of: pm_runtime_put_sync   use: pm_runtime_put_sync_autosuspend.
+
+Drivers may also continue to use the non-autosuspend helper functions; they
+will behave normally, which means sometimes taking the autosuspend delay into
+account (see pm_runtime_idle).
+
+Under some circumstances a driver or subsystem may want to prevent a device
+from autosuspending immediately, even though the usage counter is zero and the
+autosuspend delay time has expired.  If the ->runtime_suspend() callback
+returns -EAGAIN or -EBUSY, and if the next autosuspend delay expiration time is
+in the future (as it normally would be if the callback invoked
+pm_runtime_mark_last_busy()), the PM core will automatically reschedule the
+autosuspend.  The ->runtime_suspend() callback can't do this rescheduling
+itself because no suspend requests of any kind are accepted while the device is
+suspending (i.e., while the callback is running).
+
+The implementation is well suited for asynchronous use in interrupt contexts.
+However such use inevitably involves races, because the PM core can't
+synchronize ->runtime_suspend() callbacks with the arrival of I/O requests.
+This synchronization must be handled by the driver, using its private lock.
+Here is a schematic pseudo-code example::
+
+	foo_read_or_write(struct foo_priv *foo, void *data)
+	{
+		lock(&foo->private_lock);
+		add_request_to_io_queue(foo, data);
+		if (foo->num_pending_requests++ == 0)
+			pm_runtime_get(&foo->dev);
+		if (!foo->is_suspended)
+			foo_process_next_request(foo);
+		unlock(&foo->private_lock);
+	}
+
+	foo_io_completion(struct foo_priv *foo, void *req)
+	{
+		lock(&foo->private_lock);
+		if (--foo->num_pending_requests == 0) {
+			pm_runtime_mark_last_busy(&foo->dev);
+			pm_runtime_put_autosuspend(&foo->dev);
+		} else {
+			foo_process_next_request(foo);
+		}
+		unlock(&foo->private_lock);
+		/* Send req result back to the user ... */
+	}
+
+	int foo_runtime_suspend(struct device *dev)
+	{
+		struct foo_priv *foo = container_of(dev, ...);
+		int ret = 0;
+
+		lock(&foo->private_lock);
+		if (foo->num_pending_requests > 0) {
+			ret = -EBUSY;
+		} else {
+			/* ... suspend the device ... */
+			foo->is_suspended = 1;
+		}
+		unlock(&foo->private_lock);
+		return ret;
+	}
+
+	int foo_runtime_resume(struct device *dev)
+	{
+		struct foo_priv *foo = container_of(dev, ...);
+
+		lock(&foo->private_lock);
+		/* ... resume the device ... */
+		foo->is_suspended = 0;
+		pm_runtime_mark_last_busy(&foo->dev);
+		if (foo->num_pending_requests > 0)
+			foo_process_next_request(foo);
+		unlock(&foo->private_lock);
+		return 0;
+	}
+
+The important point is that after foo_io_completion() asks for an autosuspend,
+the foo_runtime_suspend() callback may race with foo_read_or_write().
+Therefore foo_runtime_suspend() has to check whether there are any pending I/O
+requests (while holding the private lock) before allowing the suspend to
+proceed.
+
+In addition, the power.autosuspend_delay field can be changed by user space at
+any time.  If a driver cares about this, it can call
+pm_runtime_autosuspend_expiration() from within the ->runtime_suspend()
+callback while holding its private lock.  If the function returns a nonzero
+value then the delay has not yet expired and the callback should return
+-EAGAIN.
diff --git a/Documentation/power/runtime_pm.txt b/Documentation/power/runtime_pm.txt
deleted file mode 100644
index 937e33c46211..000000000000
--- a/Documentation/power/runtime_pm.txt
+++ /dev/null
@@ -1,928 +0,0 @@
-Runtime Power Management Framework for I/O Devices
-
-(C) 2009-2011 Rafael J. Wysocki <rjw@sisk.pl>, Novell Inc.
-(C) 2010 Alan Stern <stern@rowland.harvard.edu>
-(C) 2014 Intel Corp., Rafael J. Wysocki <rafael.j.wysocki@intel.com>
-
-1. Introduction
-
-Support for runtime power management (runtime PM) of I/O devices is provided
-at the power management core (PM core) level by means of:
-
-* The power management workqueue pm_wq in which bus types and device drivers can
-  put their PM-related work items.  It is strongly recommended that pm_wq be
-  used for queuing all work items related to runtime PM, because this allows
-  them to be synchronized with system-wide power transitions (suspend to RAM,
-  hibernation and resume from system sleep states).  pm_wq is declared in
-  include/linux/pm_runtime.h and defined in kernel/power/main.c.
-
-* A number of runtime PM fields in the 'power' member of 'struct device' (which
-  is of the type 'struct dev_pm_info', defined in include/linux/pm.h) that can
-  be used for synchronizing runtime PM operations with one another.
-
-* Three device runtime PM callbacks in 'struct dev_pm_ops' (defined in
-  include/linux/pm.h).
-
-* A set of helper functions defined in drivers/base/power/runtime.c that can be
-  used for carrying out runtime PM operations in such a way that the
-  synchronization between them is taken care of by the PM core.  Bus types and
-  device drivers are encouraged to use these functions.
-
-The runtime PM callbacks present in 'struct dev_pm_ops', the device runtime PM
-fields of 'struct dev_pm_info' and the core helper functions provided for
-runtime PM are described below.
-
-2. Device Runtime PM Callbacks
-
-There are three device runtime PM callbacks defined in 'struct dev_pm_ops':
-
-struct dev_pm_ops {
-	...
-	int (*runtime_suspend)(struct device *dev);
-	int (*runtime_resume)(struct device *dev);
-	int (*runtime_idle)(struct device *dev);
-	...
-};
-
-The ->runtime_suspend(), ->runtime_resume() and ->runtime_idle() callbacks
-are executed by the PM core for the device's subsystem that may be either of
-the following:
-
-  1. PM domain of the device, if the device's PM domain object, dev->pm_domain,
-     is present.
-
-  2. Device type of the device, if both dev->type and dev->type->pm are present.
-
-  3. Device class of the device, if both dev->class and dev->class->pm are
-     present.
-
-  4. Bus type of the device, if both dev->bus and dev->bus->pm are present.
-
-If the subsystem chosen by applying the above rules doesn't provide the relevant
-callback, the PM core will invoke the corresponding driver callback stored in
-dev->driver->pm directly (if present).
-
-The PM core always checks which callback to use in the order given above, so the
-priority order of callbacks from high to low is: PM domain, device type, class
-and bus type.  Moreover, the high-priority one will always take precedence over
-a low-priority one.  The PM domain, bus type, device type and class callbacks
-are referred to as subsystem-level callbacks in what follows.
-
-By default, the callbacks are always invoked in process context with interrupts
-enabled.  However, the pm_runtime_irq_safe() helper function can be used to tell
-the PM core that it is safe to run the ->runtime_suspend(), ->runtime_resume()
-and ->runtime_idle() callbacks for the given device in atomic context with
-interrupts disabled.  This implies that the callback routines in question must
-not block or sleep, but it also means that the synchronous helper functions
-listed at the end of Section 4 may be used for that device within an interrupt
-handler or generally in an atomic context.
-
-The subsystem-level suspend callback, if present, is _entirely_ _responsible_
-for handling the suspend of the device as appropriate, which may, but need not
-include executing the device driver's own ->runtime_suspend() callback (from the
-PM core's point of view it is not necessary to implement a ->runtime_suspend()
-callback in a device driver as long as the subsystem-level suspend callback
-knows what to do to handle the device).
-
-  * Once the subsystem-level suspend callback (or the driver suspend callback,
-    if invoked directly) has completed successfully for the given device, the PM
-    core regards the device as suspended, which need not mean that it has been
-    put into a low power state.  It is supposed to mean, however, that the
-    device will not process data and will not communicate with the CPU(s) and
-    RAM until the appropriate resume callback is executed for it.  The runtime
-    PM status of a device after successful execution of the suspend callback is
-    'suspended'.
-
-  * If the suspend callback returns -EBUSY or -EAGAIN, the device's runtime PM
-    status remains 'active', which means that the device _must_ be fully
-    operational afterwards.
-
-  * If the suspend callback returns an error code different from -EBUSY and
-    -EAGAIN, the PM core regards this as a fatal error and will refuse to run
-    the helper functions described in Section 4 for the device until its status
-    is directly set to  either 'active', or 'suspended' (the PM core provides
-    special helper functions for this purpose).
-
-In particular, if the driver requires remote wakeup capability (i.e. hardware
-mechanism allowing the device to request a change of its power state, such as
-PCI PME) for proper functioning and device_can_wakeup() returns 'false' for the
-device, then ->runtime_suspend() should return -EBUSY.  On the other hand, if
-device_can_wakeup() returns 'true' for the device and the device is put into a
-low-power state during the execution of the suspend callback, it is expected
-that remote wakeup will be enabled for the device.  Generally, remote wakeup
-should be enabled for all input devices put into low-power states at run time.
-
-The subsystem-level resume callback, if present, is _entirely_ _responsible_ for
-handling the resume of the device as appropriate, which may, but need not
-include executing the device driver's own ->runtime_resume() callback (from the
-PM core's point of view it is not necessary to implement a ->runtime_resume()
-callback in a device driver as long as the subsystem-level resume callback knows
-what to do to handle the device).
-
-  * Once the subsystem-level resume callback (or the driver resume callback, if
-    invoked directly) has completed successfully, the PM core regards the device
-    as fully operational, which means that the device _must_ be able to complete
-    I/O operations as needed.  The runtime PM status of the device is then
-    'active'.
-
-  * If the resume callback returns an error code, the PM core regards this as a
-    fatal error and will refuse to run the helper functions described in Section
-    4 for the device, until its status is directly set to either 'active', or
-    'suspended' (by means of special helper functions provided by the PM core
-    for this purpose).
-
-The idle callback (a subsystem-level one, if present, or the driver one) is
-executed by the PM core whenever the device appears to be idle, which is
-indicated to the PM core by two counters, the device's usage counter and the
-counter of 'active' children of the device.
-
-  * If any of these counters is decreased using a helper function provided by
-    the PM core and it turns out to be equal to zero, the other counter is
-    checked.  If that counter also is equal to zero, the PM core executes the
-    idle callback with the device as its argument.
-
-The action performed by the idle callback is totally dependent on the subsystem
-(or driver) in question, but the expected and recommended action is to check
-if the device can be suspended (i.e. if all of the conditions necessary for
-suspending the device are satisfied) and to queue up a suspend request for the
-device in that case.  If there is no idle callback, or if the callback returns
-0, then the PM core will attempt to carry out a runtime suspend of the device,
-also respecting devices configured for autosuspend.  In essence this means a
-call to pm_runtime_autosuspend() (do note that drivers needs to update the
-device last busy mark, pm_runtime_mark_last_busy(), to control the delay under
-this circumstance).  To prevent this (for example, if the callback routine has
-started a delayed suspend), the routine must return a non-zero value.  Negative
-error return codes are ignored by the PM core.
-
-The helper functions provided by the PM core, described in Section 4, guarantee
-that the following constraints are met with respect to runtime PM callbacks for
-one device:
-
-(1) The callbacks are mutually exclusive (e.g. it is forbidden to execute
-    ->runtime_suspend() in parallel with ->runtime_resume() or with another
-    instance of ->runtime_suspend() for the same device) with the exception that
-    ->runtime_suspend() or ->runtime_resume() can be executed in parallel with
-    ->runtime_idle() (although ->runtime_idle() will not be started while any
-    of the other callbacks is being executed for the same device).
-
-(2) ->runtime_idle() and ->runtime_suspend() can only be executed for 'active'
-    devices (i.e. the PM core will only execute ->runtime_idle() or
-    ->runtime_suspend() for the devices the runtime PM status of which is
-    'active').
-
-(3) ->runtime_idle() and ->runtime_suspend() can only be executed for a device
-    the usage counter of which is equal to zero _and_ either the counter of
-    'active' children of which is equal to zero, or the 'power.ignore_children'
-    flag of which is set.
-
-(4) ->runtime_resume() can only be executed for 'suspended' devices  (i.e. the
-    PM core will only execute ->runtime_resume() for the devices the runtime
-    PM status of which is 'suspended').
-
-Additionally, the helper functions provided by the PM core obey the following
-rules:
-
-  * If ->runtime_suspend() is about to be executed or there's a pending request
-    to execute it, ->runtime_idle() will not be executed for the same device.
-
-  * A request to execute or to schedule the execution of ->runtime_suspend()
-    will cancel any pending requests to execute ->runtime_idle() for the same
-    device.
-
-  * If ->runtime_resume() is about to be executed or there's a pending request
-    to execute it, the other callbacks will not be executed for the same device.
-
-  * A request to execute ->runtime_resume() will cancel any pending or
-    scheduled requests to execute the other callbacks for the same device,
-    except for scheduled autosuspends.
-
-3. Runtime PM Device Fields
-
-The following device runtime PM fields are present in 'struct dev_pm_info', as
-defined in include/linux/pm.h:
-
-  struct timer_list suspend_timer;
-    - timer used for scheduling (delayed) suspend and autosuspend requests
-
-  unsigned long timer_expires;
-    - timer expiration time, in jiffies (if this is different from zero, the
-      timer is running and will expire at that time, otherwise the timer is not
-      running)
-
-  struct work_struct work;
-    - work structure used for queuing up requests (i.e. work items in pm_wq)
-
-  wait_queue_head_t wait_queue;
-    - wait queue used if any of the helper functions needs to wait for another
-      one to complete
-
-  spinlock_t lock;
-    - lock used for synchronization
-
-  atomic_t usage_count;
-    - the usage counter of the device
-
-  atomic_t child_count;
-    - the count of 'active' children of the device
-
-  unsigned int ignore_children;
-    - if set, the value of child_count is ignored (but still updated)
-
-  unsigned int disable_depth;
-    - used for disabling the helper functions (they work normally if this is
-      equal to zero); the initial value of it is 1 (i.e. runtime PM is
-      initially disabled for all devices)
-
-  int runtime_error;
-    - if set, there was a fatal error (one of the callbacks returned error code
-      as described in Section 2), so the helper functions will not work until
-      this flag is cleared; this is the error code returned by the failing
-      callback
-
-  unsigned int idle_notification;
-    - if set, ->runtime_idle() is being executed
-
-  unsigned int request_pending;
-    - if set, there's a pending request (i.e. a work item queued up into pm_wq)
-
-  enum rpm_request request;
-    - type of request that's pending (valid if request_pending is set)
-
-  unsigned int deferred_resume;
-    - set if ->runtime_resume() is about to be run while ->runtime_suspend() is
-      being executed for that device and it is not practical to wait for the
-      suspend to complete; means "start a resume as soon as you've suspended"
-
-  enum rpm_status runtime_status;
-    - the runtime PM status of the device; this field's initial value is
-      RPM_SUSPENDED, which means that each device is initially regarded by the
-      PM core as 'suspended', regardless of its real hardware status
-
-  unsigned int runtime_auto;
-    - if set, indicates that the user space has allowed the device driver to
-      power manage the device at run time via the /sys/devices/.../power/control
-      interface; it may only be modified with the help of the pm_runtime_allow()
-      and pm_runtime_forbid() helper functions
-
-  unsigned int no_callbacks;
-    - indicates that the device does not use the runtime PM callbacks (see
-      Section 8); it may be modified only by the pm_runtime_no_callbacks()
-      helper function
-
-  unsigned int irq_safe;
-    - indicates that the ->runtime_suspend() and ->runtime_resume() callbacks
-      will be invoked with the spinlock held and interrupts disabled
-
-  unsigned int use_autosuspend;
-    - indicates that the device's driver supports delayed autosuspend (see
-      Section 9); it may be modified only by the
-      pm_runtime{_dont}_use_autosuspend() helper functions
-
-  unsigned int timer_autosuspends;
-    - indicates that the PM core should attempt to carry out an autosuspend
-      when the timer expires rather than a normal suspend
-
-  int autosuspend_delay;
-    - the delay time (in milliseconds) to be used for autosuspend
-
-  unsigned long last_busy;
-    - the time (in jiffies) when the pm_runtime_mark_last_busy() helper
-      function was last called for this device; used in calculating inactivity
-      periods for autosuspend
-
-All of the above fields are members of the 'power' member of 'struct device'.
-
-4. Runtime PM Device Helper Functions
-
-The following runtime PM helper functions are defined in
-drivers/base/power/runtime.c and include/linux/pm_runtime.h:
-
-  void pm_runtime_init(struct device *dev);
-    - initialize the device runtime PM fields in 'struct dev_pm_info'
-
-  void pm_runtime_remove(struct device *dev);
-    - make sure that the runtime PM of the device will be disabled after
-      removing the device from device hierarchy
-
-  int pm_runtime_idle(struct device *dev);
-    - execute the subsystem-level idle callback for the device; returns an
-      error code on failure, where -EINPROGRESS means that ->runtime_idle() is
-      already being executed; if there is no callback or the callback returns 0
-      then run pm_runtime_autosuspend(dev) and return its result
-
-  int pm_runtime_suspend(struct device *dev);
-    - execute the subsystem-level suspend callback for the device; returns 0 on
-      success, 1 if the device's runtime PM status was already 'suspended', or
-      error code on failure, where -EAGAIN or -EBUSY means it is safe to attempt
-      to suspend the device again in future and -EACCES means that
-      'power.disable_depth' is different from 0
-
-  int pm_runtime_autosuspend(struct device *dev);
-    - same as pm_runtime_suspend() except that the autosuspend delay is taken
-      into account; if pm_runtime_autosuspend_expiration() says the delay has
-      not yet expired then an autosuspend is scheduled for the appropriate time
-      and 0 is returned
-
-  int pm_runtime_resume(struct device *dev);
-    - execute the subsystem-level resume callback for the device; returns 0 on
-      success, 1 if the device's runtime PM status was already 'active' or
-      error code on failure, where -EAGAIN means it may be safe to attempt to
-      resume the device again in future, but 'power.runtime_error' should be
-      checked additionally, and -EACCES means that 'power.disable_depth' is
-      different from 0
-
-  int pm_request_idle(struct device *dev);
-    - submit a request to execute the subsystem-level idle callback for the
-      device (the request is represented by a work item in pm_wq); returns 0 on
-      success or error code if the request has not been queued up
-
-  int pm_request_autosuspend(struct device *dev);
-    - schedule the execution of the subsystem-level suspend callback for the
-      device when the autosuspend delay has expired; if the delay has already
-      expired then the work item is queued up immediately
-
-  int pm_schedule_suspend(struct device *dev, unsigned int delay);
-    - schedule the execution of the subsystem-level suspend callback for the
-      device in future, where 'delay' is the time to wait before queuing up a
-      suspend work item in pm_wq, in milliseconds (if 'delay' is zero, the work
-      item is queued up immediately); returns 0 on success, 1 if the device's PM
-      runtime status was already 'suspended', or error code if the request
-      hasn't been scheduled (or queued up if 'delay' is 0); if the execution of
-      ->runtime_suspend() is already scheduled and not yet expired, the new
-      value of 'delay' will be used as the time to wait
-
-  int pm_request_resume(struct device *dev);
-    - submit a request to execute the subsystem-level resume callback for the
-      device (the request is represented by a work item in pm_wq); returns 0 on
-      success, 1 if the device's runtime PM status was already 'active', or
-      error code if the request hasn't been queued up
-
-  void pm_runtime_get_noresume(struct device *dev);
-    - increment the device's usage counter
-
-  int pm_runtime_get(struct device *dev);
-    - increment the device's usage counter, run pm_request_resume(dev) and
-      return its result
-
-  int pm_runtime_get_sync(struct device *dev);
-    - increment the device's usage counter, run pm_runtime_resume(dev) and
-      return its result
-
-  int pm_runtime_get_if_in_use(struct device *dev);
-    - return -EINVAL if 'power.disable_depth' is nonzero; otherwise, if the
-      runtime PM status is RPM_ACTIVE and the runtime PM usage counter is
-      nonzero, increment the counter and return 1; otherwise return 0 without
-      changing the counter
-
-  void pm_runtime_put_noidle(struct device *dev);
-    - decrement the device's usage counter
-
-  int pm_runtime_put(struct device *dev);
-    - decrement the device's usage counter; if the result is 0 then run
-      pm_request_idle(dev) and return its result
-
-  int pm_runtime_put_autosuspend(struct device *dev);
-    - decrement the device's usage counter; if the result is 0 then run
-      pm_request_autosuspend(dev) and return its result
-
-  int pm_runtime_put_sync(struct device *dev);
-    - decrement the device's usage counter; if the result is 0 then run
-      pm_runtime_idle(dev) and return its result
-
-  int pm_runtime_put_sync_suspend(struct device *dev);
-    - decrement the device's usage counter; if the result is 0 then run
-      pm_runtime_suspend(dev) and return its result
-
-  int pm_runtime_put_sync_autosuspend(struct device *dev);
-    - decrement the device's usage counter; if the result is 0 then run
-      pm_runtime_autosuspend(dev) and return its result
-
-  void pm_runtime_enable(struct device *dev);
-    - decrement the device's 'power.disable_depth' field; if that field is equal
-      to zero, the runtime PM helper functions can execute subsystem-level
-      callbacks described in Section 2 for the device
-
-  int pm_runtime_disable(struct device *dev);
-    - increment the device's 'power.disable_depth' field (if the value of that
-      field was previously zero, this prevents subsystem-level runtime PM
-      callbacks from being run for the device), make sure that all of the
-      pending runtime PM operations on the device are either completed or
-      canceled; returns 1 if there was a resume request pending and it was
-      necessary to execute the subsystem-level resume callback for the device
-      to satisfy that request, otherwise 0 is returned
-
-  int pm_runtime_barrier(struct device *dev);
-    - check if there's a resume request pending for the device and resume it
-      (synchronously) in that case, cancel any other pending runtime PM requests
-      regarding it and wait for all runtime PM operations on it in progress to
-      complete; returns 1 if there was a resume request pending and it was
-      necessary to execute the subsystem-level resume callback for the device to
-      satisfy that request, otherwise 0 is returned
-
-  void pm_suspend_ignore_children(struct device *dev, bool enable);
-    - set/unset the power.ignore_children flag of the device
-
-  int pm_runtime_set_active(struct device *dev);
-    - clear the device's 'power.runtime_error' flag, set the device's runtime
-      PM status to 'active' and update its parent's counter of 'active'
-      children as appropriate (it is only valid to use this function if
-      'power.runtime_error' is set or 'power.disable_depth' is greater than
-      zero); it will fail and return error code if the device has a parent
-      which is not active and the 'power.ignore_children' flag of which is unset
-
-  void pm_runtime_set_suspended(struct device *dev);
-    - clear the device's 'power.runtime_error' flag, set the device's runtime
-      PM status to 'suspended' and update its parent's counter of 'active'
-      children as appropriate (it is only valid to use this function if
-      'power.runtime_error' is set or 'power.disable_depth' is greater than
-      zero)
-
-  bool pm_runtime_active(struct device *dev);
-    - return true if the device's runtime PM status is 'active' or its
-      'power.disable_depth' field is not equal to zero, or false otherwise
-
-  bool pm_runtime_suspended(struct device *dev);
-    - return true if the device's runtime PM status is 'suspended' and its
-      'power.disable_depth' field is equal to zero, or false otherwise
-
-  bool pm_runtime_status_suspended(struct device *dev);
-    - return true if the device's runtime PM status is 'suspended'
-
-  void pm_runtime_allow(struct device *dev);
-    - set the power.runtime_auto flag for the device and decrease its usage
-      counter (used by the /sys/devices/.../power/control interface to
-      effectively allow the device to be power managed at run time)
-
-  void pm_runtime_forbid(struct device *dev);
-    - unset the power.runtime_auto flag for the device and increase its usage
-      counter (used by the /sys/devices/.../power/control interface to
-      effectively prevent the device from being power managed at run time)
-
-  void pm_runtime_no_callbacks(struct device *dev);
-    - set the power.no_callbacks flag for the device and remove the runtime
-      PM attributes from /sys/devices/.../power (or prevent them from being
-      added when the device is registered)
-
-  void pm_runtime_irq_safe(struct device *dev);
-    - set the power.irq_safe flag for the device, causing the runtime-PM
-      callbacks to be invoked with interrupts off
-
-  bool pm_runtime_is_irq_safe(struct device *dev);
-    - return true if power.irq_safe flag was set for the device, causing
-      the runtime-PM callbacks to be invoked with interrupts off
-
-  void pm_runtime_mark_last_busy(struct device *dev);
-    - set the power.last_busy field to the current time
-
-  void pm_runtime_use_autosuspend(struct device *dev);
-    - set the power.use_autosuspend flag, enabling autosuspend delays; call
-      pm_runtime_get_sync if the flag was previously cleared and
-      power.autosuspend_delay is negative
-
-  void pm_runtime_dont_use_autosuspend(struct device *dev);
-    - clear the power.use_autosuspend flag, disabling autosuspend delays;
-      decrement the device's usage counter if the flag was previously set and
-      power.autosuspend_delay is negative; call pm_runtime_idle
-
-  void pm_runtime_set_autosuspend_delay(struct device *dev, int delay);
-    - set the power.autosuspend_delay value to 'delay' (expressed in
-      milliseconds); if 'delay' is negative then runtime suspends are
-      prevented; if power.use_autosuspend is set, pm_runtime_get_sync may be
-      called or the device's usage counter may be decremented and
-      pm_runtime_idle called depending on if power.autosuspend_delay is
-      changed to or from a negative value; if power.use_autosuspend is clear,
-      pm_runtime_idle is called
-
-  unsigned long pm_runtime_autosuspend_expiration(struct device *dev);
-    - calculate the time when the current autosuspend delay period will expire,
-      based on power.last_busy and power.autosuspend_delay; if the delay time
-      is 1000 ms or larger then the expiration time is rounded up to the
-      nearest second; returns 0 if the delay period has already expired or
-      power.use_autosuspend isn't set, otherwise returns the expiration time
-      in jiffies
-
-It is safe to execute the following helper functions from interrupt context:
-
-pm_request_idle()
-pm_request_autosuspend()
-pm_schedule_suspend()
-pm_request_resume()
-pm_runtime_get_noresume()
-pm_runtime_get()
-pm_runtime_put_noidle()
-pm_runtime_put()
-pm_runtime_put_autosuspend()
-pm_runtime_enable()
-pm_suspend_ignore_children()
-pm_runtime_set_active()
-pm_runtime_set_suspended()
-pm_runtime_suspended()
-pm_runtime_mark_last_busy()
-pm_runtime_autosuspend_expiration()
-
-If pm_runtime_irq_safe() has been called for a device then the following helper
-functions may also be used in interrupt context:
-
-pm_runtime_idle()
-pm_runtime_suspend()
-pm_runtime_autosuspend()
-pm_runtime_resume()
-pm_runtime_get_sync()
-pm_runtime_put_sync()
-pm_runtime_put_sync_suspend()
-pm_runtime_put_sync_autosuspend()
-
-5. Runtime PM Initialization, Device Probing and Removal
-
-Initially, the runtime PM is disabled for all devices, which means that the
-majority of the runtime PM helper functions described in Section 4 will return
--EAGAIN until pm_runtime_enable() is called for the device.
-
-In addition to that, the initial runtime PM status of all devices is
-'suspended', but it need not reflect the actual physical state of the device.
-Thus, if the device is initially active (i.e. it is able to process I/O), its
-runtime PM status must be changed to 'active', with the help of
-pm_runtime_set_active(), before pm_runtime_enable() is called for the device.
-
-However, if the device has a parent and the parent's runtime PM is enabled,
-calling pm_runtime_set_active() for the device will affect the parent, unless
-the parent's 'power.ignore_children' flag is set.  Namely, in that case the
-parent won't be able to suspend at run time, using the PM core's helper
-functions, as long as the child's status is 'active', even if the child's
-runtime PM is still disabled (i.e. pm_runtime_enable() hasn't been called for
-the child yet or pm_runtime_disable() has been called for it).  For this reason,
-once pm_runtime_set_active() has been called for the device, pm_runtime_enable()
-should be called for it too as soon as reasonably possible or its runtime PM
-status should be changed back to 'suspended' with the help of
-pm_runtime_set_suspended().
-
-If the default initial runtime PM status of the device (i.e. 'suspended')
-reflects the actual state of the device, its bus type's or its driver's
-->probe() callback will likely need to wake it up using one of the PM core's
-helper functions described in Section 4.  In that case, pm_runtime_resume()
-should be used.  Of course, for this purpose the device's runtime PM has to be
-enabled earlier by calling pm_runtime_enable().
-
-Note, if the device may execute pm_runtime calls during the probe (such as
-if it is registers with a subsystem that may call back in) then the
-pm_runtime_get_sync() call paired with a pm_runtime_put() call will be
-appropriate to ensure that the device is not put back to sleep during the
-probe. This can happen with systems such as the network device layer.
-
-It may be desirable to suspend the device once ->probe() has finished.
-Therefore the driver core uses the asynchronous pm_request_idle() to submit a
-request to execute the subsystem-level idle callback for the device at that
-time.  A driver that makes use of the runtime autosuspend feature, may want to
-update the last busy mark before returning from ->probe().
-
-Moreover, the driver core prevents runtime PM callbacks from racing with the bus
-notifier callback in __device_release_driver(), which is necessary, because the
-notifier is used by some subsystems to carry out operations affecting the
-runtime PM functionality.  It does so by calling pm_runtime_get_sync() before
-driver_sysfs_remove() and the BUS_NOTIFY_UNBIND_DRIVER notifications.  This
-resumes the device if it's in the suspended state and prevents it from
-being suspended again while those routines are being executed.
-
-To allow bus types and drivers to put devices into the suspended state by
-calling pm_runtime_suspend() from their ->remove() routines, the driver core
-executes pm_runtime_put_sync() after running the BUS_NOTIFY_UNBIND_DRIVER
-notifications in __device_release_driver().  This requires bus types and
-drivers to make their ->remove() callbacks avoid races with runtime PM directly,
-but also it allows of more flexibility in the handling of devices during the
-removal of their drivers.
-
-Drivers in ->remove() callback should undo the runtime PM changes done
-in ->probe(). Usually this means calling pm_runtime_disable(),
-pm_runtime_dont_use_autosuspend() etc.
-
-The user space can effectively disallow the driver of the device to power manage
-it at run time by changing the value of its /sys/devices/.../power/control
-attribute to "on", which causes pm_runtime_forbid() to be called.  In principle,
-this mechanism may also be used by the driver to effectively turn off the
-runtime power management of the device until the user space turns it on.
-Namely, during the initialization the driver can make sure that the runtime PM
-status of the device is 'active' and call pm_runtime_forbid().  It should be
-noted, however, that if the user space has already intentionally changed the
-value of /sys/devices/.../power/control to "auto" to allow the driver to power
-manage the device at run time, the driver may confuse it by using
-pm_runtime_forbid() this way.
-
-6. Runtime PM and System Sleep
-
-Runtime PM and system sleep (i.e., system suspend and hibernation, also known
-as suspend-to-RAM and suspend-to-disk) interact with each other in a couple of
-ways.  If a device is active when a system sleep starts, everything is
-straightforward.  But what should happen if the device is already suspended?
-
-The device may have different wake-up settings for runtime PM and system sleep.
-For example, remote wake-up may be enabled for runtime suspend but disallowed
-for system sleep (device_may_wakeup(dev) returns 'false').  When this happens,
-the subsystem-level system suspend callback is responsible for changing the
-device's wake-up setting (it may leave that to the device driver's system
-suspend routine).  It may be necessary to resume the device and suspend it again
-in order to do so.  The same is true if the driver uses different power levels
-or other settings for runtime suspend and system sleep.
-
-During system resume, the simplest approach is to bring all devices back to full
-power, even if they had been suspended before the system suspend began.  There
-are several reasons for this, including:
-
-  * The device might need to switch power levels, wake-up settings, etc.
-
-  * Remote wake-up events might have been lost by the firmware.
-
-  * The device's children may need the device to be at full power in order
-    to resume themselves.
-
-  * The driver's idea of the device state may not agree with the device's
-    physical state.  This can happen during resume from hibernation.
-
-  * The device might need to be reset.
-
-  * Even though the device was suspended, if its usage counter was > 0 then most
-    likely it would need a runtime resume in the near future anyway.
-
-If the device had been suspended before the system suspend began and it's
-brought back to full power during resume, then its runtime PM status will have
-to be updated to reflect the actual post-system sleep status.  The way to do
-this is:
-
-	pm_runtime_disable(dev);
-	pm_runtime_set_active(dev);
-	pm_runtime_enable(dev);
-
-The PM core always increments the runtime usage counter before calling the
-->suspend() callback and decrements it after calling the ->resume() callback.
-Hence disabling runtime PM temporarily like this will not cause any runtime
-suspend attempts to be permanently lost.  If the usage count goes to zero
-following the return of the ->resume() callback, the ->runtime_idle() callback
-will be invoked as usual.
-
-On some systems, however, system sleep is not entered through a global firmware
-or hardware operation.  Instead, all hardware components are put into low-power
-states directly by the kernel in a coordinated way.  Then, the system sleep
-state effectively follows from the states the hardware components end up in
-and the system is woken up from that state by a hardware interrupt or a similar
-mechanism entirely under the kernel's control.  As a result, the kernel never
-gives control away and the states of all devices during resume are precisely
-known to it.  If that is the case and none of the situations listed above takes
-place (in particular, if the system is not waking up from hibernation), it may
-be more efficient to leave the devices that had been suspended before the system
-suspend began in the suspended state.
-
-To this end, the PM core provides a mechanism allowing some coordination between
-different levels of device hierarchy.  Namely, if a system suspend .prepare()
-callback returns a positive number for a device, that indicates to the PM core
-that the device appears to be runtime-suspended and its state is fine, so it
-may be left in runtime suspend provided that all of its descendants are also
-left in runtime suspend.  If that happens, the PM core will not execute any
-system suspend and resume callbacks for all of those devices, except for the
-complete callback, which is then entirely responsible for handling the device
-as appropriate.  This only applies to system suspend transitions that are not
-related to hibernation (see Documentation/driver-api/pm/devices.rst for more
-information).
-
-The PM core does its best to reduce the probability of race conditions between
-the runtime PM and system suspend/resume (and hibernation) callbacks by carrying
-out the following operations:
-
-  * During system suspend pm_runtime_get_noresume() is called for every device
-    right before executing the subsystem-level .prepare() callback for it and
-    pm_runtime_barrier() is called for every device right before executing the
-    subsystem-level .suspend() callback for it.  In addition to that the PM core
-    calls  __pm_runtime_disable() with 'false' as the second argument for every
-    device right before executing the subsystem-level .suspend_late() callback
-    for it.
-
-  * During system resume pm_runtime_enable() and pm_runtime_put() are called for
-    every device right after executing the subsystem-level .resume_early()
-    callback and right after executing the subsystem-level .complete() callback
-    for it, respectively.
-
-7. Generic subsystem callbacks
-
-Subsystems may wish to conserve code space by using the set of generic power
-management callbacks provided by the PM core, defined in
-driver/base/power/generic_ops.c:
-
-  int pm_generic_runtime_suspend(struct device *dev);
-    - invoke the ->runtime_suspend() callback provided by the driver of this
-      device and return its result, or return 0 if not defined
-
-  int pm_generic_runtime_resume(struct device *dev);
-    - invoke the ->runtime_resume() callback provided by the driver of this
-      device and return its result, or return 0 if not defined
-
-  int pm_generic_suspend(struct device *dev);
-    - if the device has not been suspended at run time, invoke the ->suspend()
-      callback provided by its driver and return its result, or return 0 if not
-      defined
-
-  int pm_generic_suspend_noirq(struct device *dev);
-    - if pm_runtime_suspended(dev) returns "false", invoke the ->suspend_noirq()
-      callback provided by the device's driver and return its result, or return
-      0 if not defined
-
-  int pm_generic_resume(struct device *dev);
-    - invoke the ->resume() callback provided by the driver of this device and,
-      if successful, change the device's runtime PM status to 'active'
-
-  int pm_generic_resume_noirq(struct device *dev);
-    - invoke the ->resume_noirq() callback provided by the driver of this device
-
-  int pm_generic_freeze(struct device *dev);
-    - if the device has not been suspended at run time, invoke the ->freeze()
-      callback provided by its driver and return its result, or return 0 if not
-      defined
-
-  int pm_generic_freeze_noirq(struct device *dev);
-    - if pm_runtime_suspended(dev) returns "false", invoke the ->freeze_noirq()
-      callback provided by the device's driver and return its result, or return
-      0 if not defined
-
-  int pm_generic_thaw(struct device *dev);
-    - if the device has not been suspended at run time, invoke the ->thaw()
-      callback provided by its driver and return its result, or return 0 if not
-      defined
-
-  int pm_generic_thaw_noirq(struct device *dev);
-    - if pm_runtime_suspended(dev) returns "false", invoke the ->thaw_noirq()
-      callback provided by the device's driver and return its result, or return
-      0 if not defined
-
-  int pm_generic_poweroff(struct device *dev);
-    - if the device has not been suspended at run time, invoke the ->poweroff()
-      callback provided by its driver and return its result, or return 0 if not
-      defined
-
-  int pm_generic_poweroff_noirq(struct device *dev);
-    - if pm_runtime_suspended(dev) returns "false", run the ->poweroff_noirq()
-      callback provided by the device's driver and return its result, or return
-      0 if not defined
-
-  int pm_generic_restore(struct device *dev);
-    - invoke the ->restore() callback provided by the driver of this device and,
-      if successful, change the device's runtime PM status to 'active'
-
-  int pm_generic_restore_noirq(struct device *dev);
-    - invoke the ->restore_noirq() callback provided by the device's driver
-
-These functions are the defaults used by the PM core, if a subsystem doesn't
-provide its own callbacks for ->runtime_idle(), ->runtime_suspend(),
-->runtime_resume(), ->suspend(), ->suspend_noirq(), ->resume(),
-->resume_noirq(), ->freeze(), ->freeze_noirq(), ->thaw(), ->thaw_noirq(),
-->poweroff(), ->poweroff_noirq(), ->restore(), ->restore_noirq() in the
-subsystem-level dev_pm_ops structure.
-
-Device drivers that wish to use the same function as a system suspend, freeze,
-poweroff and runtime suspend callback, and similarly for system resume, thaw,
-restore, and runtime resume, can achieve this with the help of the
-UNIVERSAL_DEV_PM_OPS macro defined in include/linux/pm.h (possibly setting its
-last argument to NULL).
-
-8. "No-Callback" Devices
-
-Some "devices" are only logical sub-devices of their parent and cannot be
-power-managed on their own.  (The prototype example is a USB interface.  Entire
-USB devices can go into low-power mode or send wake-up requests, but neither is
-possible for individual interfaces.)  The drivers for these devices have no
-need of runtime PM callbacks; if the callbacks did exist, ->runtime_suspend()
-and ->runtime_resume() would always return 0 without doing anything else and
-->runtime_idle() would always call pm_runtime_suspend().
-
-Subsystems can tell the PM core about these devices by calling
-pm_runtime_no_callbacks().  This should be done after the device structure is
-initialized and before it is registered (although after device registration is
-also okay).  The routine will set the device's power.no_callbacks flag and
-prevent the non-debugging runtime PM sysfs attributes from being created.
-
-When power.no_callbacks is set, the PM core will not invoke the
-->runtime_idle(), ->runtime_suspend(), or ->runtime_resume() callbacks.
-Instead it will assume that suspends and resumes always succeed and that idle
-devices should be suspended.
-
-As a consequence, the PM core will never directly inform the device's subsystem
-or driver about runtime power changes.  Instead, the driver for the device's
-parent must take responsibility for telling the device's driver when the
-parent's power state changes.
-
-9. Autosuspend, or automatically-delayed suspends
-
-Changing a device's power state isn't free; it requires both time and energy.
-A device should be put in a low-power state only when there's some reason to
-think it will remain in that state for a substantial time.  A common heuristic
-says that a device which hasn't been used for a while is liable to remain
-unused; following this advice, drivers should not allow devices to be suspended
-at runtime until they have been inactive for some minimum period.  Even when
-the heuristic ends up being non-optimal, it will still prevent devices from
-"bouncing" too rapidly between low-power and full-power states.
-
-The term "autosuspend" is an historical remnant.  It doesn't mean that the
-device is automatically suspended (the subsystem or driver still has to call
-the appropriate PM routines); rather it means that runtime suspends will
-automatically be delayed until the desired period of inactivity has elapsed.
-
-Inactivity is determined based on the power.last_busy field.  Drivers should
-call pm_runtime_mark_last_busy() to update this field after carrying out I/O,
-typically just before calling pm_runtime_put_autosuspend().  The desired length
-of the inactivity period is a matter of policy.  Subsystems can set this length
-initially by calling pm_runtime_set_autosuspend_delay(), but after device
-registration the length should be controlled by user space, using the
-/sys/devices/.../power/autosuspend_delay_ms attribute.
-
-In order to use autosuspend, subsystems or drivers must call
-pm_runtime_use_autosuspend() (preferably before registering the device), and
-thereafter they should use the various *_autosuspend() helper functions instead
-of the non-autosuspend counterparts:
-
-	Instead of: pm_runtime_suspend    use: pm_runtime_autosuspend;
-	Instead of: pm_schedule_suspend   use: pm_request_autosuspend;
-	Instead of: pm_runtime_put        use: pm_runtime_put_autosuspend;
-	Instead of: pm_runtime_put_sync   use: pm_runtime_put_sync_autosuspend.
-
-Drivers may also continue to use the non-autosuspend helper functions; they
-will behave normally, which means sometimes taking the autosuspend delay into
-account (see pm_runtime_idle).
-
-Under some circumstances a driver or subsystem may want to prevent a device
-from autosuspending immediately, even though the usage counter is zero and the
-autosuspend delay time has expired.  If the ->runtime_suspend() callback
-returns -EAGAIN or -EBUSY, and if the next autosuspend delay expiration time is
-in the future (as it normally would be if the callback invoked
-pm_runtime_mark_last_busy()), the PM core will automatically reschedule the
-autosuspend.  The ->runtime_suspend() callback can't do this rescheduling
-itself because no suspend requests of any kind are accepted while the device is
-suspending (i.e., while the callback is running).
-
-The implementation is well suited for asynchronous use in interrupt contexts.
-However such use inevitably involves races, because the PM core can't
-synchronize ->runtime_suspend() callbacks with the arrival of I/O requests.
-This synchronization must be handled by the driver, using its private lock.
-Here is a schematic pseudo-code example:
-
-	foo_read_or_write(struct foo_priv *foo, void *data)
-	{
-		lock(&foo->private_lock);
-		add_request_to_io_queue(foo, data);
-		if (foo->num_pending_requests++ == 0)
-			pm_runtime_get(&foo->dev);
-		if (!foo->is_suspended)
-			foo_process_next_request(foo);
-		unlock(&foo->private_lock);
-	}
-
-	foo_io_completion(struct foo_priv *foo, void *req)
-	{
-		lock(&foo->private_lock);
-		if (--foo->num_pending_requests == 0) {
-			pm_runtime_mark_last_busy(&foo->dev);
-			pm_runtime_put_autosuspend(&foo->dev);
-		} else {
-			foo_process_next_request(foo);
-		}
-		unlock(&foo->private_lock);
-		/* Send req result back to the user ... */
-	}
-
-	int foo_runtime_suspend(struct device *dev)
-	{
-		struct foo_priv foo = container_of(dev, ...);
-		int ret = 0;
-
-		lock(&foo->private_lock);
-		if (foo->num_pending_requests > 0) {
-			ret = -EBUSY;
-		} else {
-			/* ... suspend the device ... */
-			foo->is_suspended = 1;
-		}
-		unlock(&foo->private_lock);
-		return ret;
-	}
-
-	int foo_runtime_resume(struct device *dev)
-	{
-		struct foo_priv foo = container_of(dev, ...);
-
-		lock(&foo->private_lock);
-		/* ... resume the device ... */
-		foo->is_suspended = 0;
-		pm_runtime_mark_last_busy(&foo->dev);
-		if (foo->num_pending_requests > 0)
-			foo_process_next_request(foo);
-		unlock(&foo->private_lock);
-		return 0;
-	}
-
-The important point is that after foo_io_completion() asks for an autosuspend,
-the foo_runtime_suspend() callback may race with foo_read_or_write().
-Therefore foo_runtime_suspend() has to check whether there are any pending I/O
-requests (while holding the private lock) before allowing the suspend to
-proceed.
-
-In addition, the power.autosuspend_delay field can be changed by user space at
-any time.  If a driver cares about this, it can call
-pm_runtime_autosuspend_expiration() from within the ->runtime_suspend()
-callback while holding its private lock.  If the function returns a nonzero
-value then the delay has not yet expired and the callback should return
--EAGAIN.
diff --git a/Documentation/power/s2ram.rst b/Documentation/power/s2ram.rst
new file mode 100644
index 000000000000..d739aa7c742c
--- /dev/null
+++ b/Documentation/power/s2ram.rst
@@ -0,0 +1,87 @@
+========================
+How to get s2ram working
+========================
+
+2006 Linus Torvalds
+2006 Pavel Machek
+
+1) Check suspend.sf.net; the s2ram program there has a long whitelist of
+   "known ok" machines, along with tricks to use on each one.
+
+2) If that does not help, try reading tricks.txt and
+   video.txt. Perhaps the problem is as simple as a broken module, and
+   a simple module unload can fix it.
+
+3) You can use Linus' TRACE_RESUME infrastructure, described below.
+
+Using TRACE_RESUME
+~~~~~~~~~~~~~~~~~~
+
+I've been working at making the machines I have able to STR, and almost
+always it's a driver that is buggy. Thank God for the suspend/resume
+debugging - the thing that Chuck tried to disable. That's often the _only_
+way to debug these things, and it's actually pretty powerful (but
+time-consuming - having to insert TRACE_RESUME() markers into the device
+driver that doesn't resume and recompile and reboot).
+
+Anyway, the way to debug this for people who are interested (have a
+machine that doesn't boot) is:
+
+ - enable PM_DEBUG, and PM_TRACE
+
+ - use a script like this::
+
+	#!/bin/sh
+	sync
+	echo 1 > /sys/power/pm_trace
+	echo mem > /sys/power/state
+
+   to suspend
+
+ - if it doesn't come back up (which is usually the problem), reboot by
+   holding the power button down, and look at the dmesg output for things
+   like::
+
+	Magic number: 4:156:725
+	hash matches drivers/base/power/resume.c:28
+	hash matches device 0000:01:00.0
+
+   which means that the last trace event was just before trying to resume
+   device 0000:01:00.0. Then figure out what driver is controlling that
+   device (lspci and /sys/devices/pci* are your friends), and see if you can
+   fix it, disable it, or trace into its resume function.
+
+   If no device matches the hash (or any matches appear to be false positives),
+   the culprit may be a device from a loadable kernel module that is not loaded
+   until after the hash is checked. You can check the hash against the current
+   devices again after more modules are loaded using sysfs::
+
+	cat /sys/power/pm_trace_dev_match
+
+For example, the above happens to be the VGA device on my EVO, which I
+used to run with "radeonfb" (it's an ATI Radeon mobility). It turns out
+that "radeonfb" simply cannot resume that device - it tries to set the
+PLL's, and it just _hangs_. Using the regular VGA console and letting X
+resume it instead works fine.
+
+NOTE
+====
+pm_trace uses the system's Real Time Clock (RTC) to save the magic number.
+The reason for this is that the RTC is the only reliably available piece of
+hardware during resume operations where a value can be set that will
+survive a reboot.
+
+pm_trace is not compatible with asynchronous suspend, so it turns
+asynchronous suspend off (which may work around timing or
+ordering-sensitive bugs).
+
+A consequence of using the RTC is that after a resume (even if it is
+successful) your system clock will have a value corresponding to the magic
+number instead of the correct date/time! It is therefore advisable to use a
+program like ntpdate or rdate to reset the correct date/time from an external
+time source when using this trace option.
+
+As the clock keeps ticking it is also essential that the reboot is done
+quickly after the resume failure. The trace option does not use the seconds
+or the low order bits of the minutes of the RTC, but too long a delay will
+corrupt the magic value.
diff --git a/Documentation/power/s2ram.txt b/Documentation/power/s2ram.txt
deleted file mode 100644
index 4685aee197fd..000000000000
--- a/Documentation/power/s2ram.txt
+++ /dev/null
@@ -1,85 +0,0 @@
-			How to get s2ram working
-			~~~~~~~~~~~~~~~~~~~~~~~~
-			2006 Linus Torvalds
-			2006 Pavel Machek
-
-1) Check suspend.sf.net, program s2ram there has long whitelist of
-   "known ok" machines, along with tricks to use on each one.
-
-2) If that does not help, try reading tricks.txt and
-   video.txt. Perhaps problem is as simple as broken module, and
-   simple module unload can fix it.
-
-3) You can use Linus' TRACE_RESUME infrastructure, described below.
-
-		      Using TRACE_RESUME
-		      ~~~~~~~~~~~~~~~~~~
-
-I've been working at making the machines I have able to STR, and almost
-always it's a driver that is buggy. Thank God for the suspend/resume
-debugging - the thing that Chuck tried to disable. That's often the _only_
-way to debug these things, and it's actually pretty powerful (but
-time-consuming - having to insert TRACE_RESUME() markers into the device
-driver that doesn't resume and recompile and reboot).
-
-Anyway, the way to debug this for people who are interested (have a
-machine that doesn't boot) is:
-
- - enable PM_DEBUG, and PM_TRACE
-
- - use a script like this:
-
-	#!/bin/sh
-	sync
-	echo 1 > /sys/power/pm_trace
-	echo mem > /sys/power/state
-
-   to suspend
-
- - if it doesn't come back up (which is usually the problem), reboot by
-   holding the power button down, and look at the dmesg output for things
-   like
-
-	Magic number: 4:156:725
-	hash matches drivers/base/power/resume.c:28
-	hash matches device 0000:01:00.0
-
-   which means that the last trace event was just before trying to resume
-   device 0000:01:00.0. Then figure out what driver is controlling that
-   device (lspci and /sys/devices/pci* is your friend), and see if you can
-   fix it, disable it, or trace into its resume function.
-
-   If no device matches the hash (or any matches appear to be false positives),
-   the culprit may be a device from a loadable kernel module that is not loaded
-   until after the hash is checked. You can check the hash against the current
-   devices again after more modules are loaded using sysfs:
-
-	cat /sys/power/pm_trace_dev_match
-
-For example, the above happens to be the VGA device on my EVO, which I
-used to run with "radeonfb" (it's an ATI Radeon mobility). It turns out
-that "radeonfb" simply cannot resume that device - it tries to set the
-PLL's, and it just _hangs_. Using the regular VGA console and letting X
-resume it instead works fine.
-
-NOTE
-====
-pm_trace uses the system's Real Time Clock (RTC) to save the magic number.
-Reason for this is that the RTC is the only reliably available piece of
-hardware during resume operations where a value can be set that will
-survive a reboot.
-
-pm_trace is not compatible with asynchronous suspend, so it turns
-asynchronous suspend off (which may work around timing or
-ordering-sensitive bugs).
-
-Consequence is that after a resume (even if it is successful) your system
-clock will have a value corresponding to the magic number instead of the
-correct date/time! It is therefore advisable to use a program like ntp-date
-or rdate to reset the correct date/time from an external time source when
-using this trace option.
-
-As the clock keeps ticking it is also essential that the reboot is done
-quickly after the resume failure. The trace option does not use the seconds
-or the low order bits of the minutes of the RTC, but a too long delay will
-corrupt the magic value.
diff --git a/Documentation/power/suspend-and-cpuhotplug.rst b/Documentation/power/suspend-and-cpuhotplug.rst
new file mode 100644
index 000000000000..7ac8e1f549f4
--- /dev/null
+++ b/Documentation/power/suspend-and-cpuhotplug.rst
@@ -0,0 +1,286 @@
+====================================================================
+Interaction of Suspend code (S3) with the CPU hotplug infrastructure
+====================================================================
+
+(C) 2011 - 2014 Srivatsa S. Bhat <srivatsa.bhat@linux.vnet.ibm.com>
+
+
+I. Differences between CPU hotplug and Suspend-to-RAM
+======================================================
+
+How does the regular CPU hotplug code differ from how the Suspend-to-RAM
+infrastructure uses it internally? And where do they share common code?
+
+Well, a picture is worth a thousand words... So ASCII art follows :-)
+
+[This depicts the current design in the kernel, and focusses only on the
+interactions involving the freezer and CPU hotplug and also tries to explain
+the locking involved. It outlines the notifications involved as well.
+But please note that here, only the call paths are illustrated, with the aim
+of describing where they take different paths and where they share code.
+What happens when regular CPU hotplug and Suspend-to-RAM race with each other
+is not depicted here.]
+
+On a high level, the suspend-resume cycle goes like this::
+
+  |Freeze| -> |Disable nonboot| -> |Do suspend| -> |Enable nonboot| -> |Thaw |
+  |tasks |    |     cpus      |    |          |    |     cpus     |    |tasks|
+
+
+More details follow::
+
+                                Suspend call path
+                                -----------------
+
+                                  Write 'mem' to
+                                /sys/power/state
+                                    sysfs file
+                                        |
+                                        v
+                               Acquire system_transition_mutex lock
+                                        |
+                                        v
+                             Send PM_SUSPEND_PREPARE
+                                   notifications
+                                        |
+                                        v
+                                   Freeze tasks
+                                        |
+                                        |
+                                        v
+                              disable_nonboot_cpus()
+                                   /* start */
+                                        |
+                                        v
+                            Acquire cpu_add_remove_lock
+                                        |
+                                        v
+                             Iterate over CURRENTLY
+                                   online CPUs
+                                        |
+                                        |
+                                        |                ----------
+                                        v                          | L
+             ======>               _cpu_down()                     |
+            |              [This takes cpuhotplug.lock             |
+  Common    |               before taking down the CPU             |
+   code     |               and releases it when done]             | O
+            |            While it is at it, notifications          |
+            |            are sent when notable events occur,       |
+             ======>     by running all registered callbacks.      |
+                                        |                          | O
+                                        |                          |
+                                        |                          |
+                                        v                          |
+                            Note down these cpus in                | P
+                                frozen_cpus mask         ----------
+                                        |
+                                        v
+                           Disable regular cpu hotplug
+                        by increasing cpu_hotplug_disabled
+                                        |
+                                        v
+                            Release cpu_add_remove_lock
+                                        |
+                                        v
+                       /* disable_nonboot_cpus() complete */
+                                        |
+                                        v
+                                   Do suspend
+
+
+
+Resuming back is likewise, with the counterparts being (in the order of
+execution during resume):
+
+* enable_nonboot_cpus() which involves::
+
+   |  Acquire cpu_add_remove_lock
+   |  Decrease cpu_hotplug_disabled, thereby enabling regular cpu hotplug
+   |  Call _cpu_up() [for all those cpus in the frozen_cpus mask, in a loop]
+   |  Release cpu_add_remove_lock
+   v
+
+* thaw tasks
+* send PM_POST_SUSPEND notifications
+* Release system_transition_mutex lock.
+
+
+It is to be noted here that the system_transition_mutex lock is acquired at the very
+beginning, when we are just starting out to suspend, and then released only
+after the entire cycle is complete (i.e., suspend + resume).
+
+::
+
+
+
+                          Regular CPU hotplug call path
+                          -----------------------------
+
+                                Write 0 (or 1) to
+                       /sys/devices/system/cpu/cpu*/online
+                                    sysfs file
+                                        |
+                                        |
+                                        v
+                                    cpu_down()
+                                        |
+                                        v
+                           Acquire cpu_add_remove_lock
+                                        |
+                                        v
+                          If cpu_hotplug_disabled > 0
+                                return gracefully
+                                        |
+                                        |
+                                        v
+             ======>                _cpu_down()
+            |              [This takes cpuhotplug.lock
+  Common    |               before taking down the CPU
+   code     |               and releases it when done]
+            |            While it is at it, notifications
+            |           are sent when notable events occur,
+             ======>    by running all registered callbacks.
+                                        |
+                                        |
+                                        v
+                          Release cpu_add_remove_lock
+                               [That's it!, for
+                              regular CPU hotplug]
+
+
+
+So, as can be seen from the two diagrams (the parts marked as "Common code"),
+regular CPU hotplug and the suspend code path converge at the _cpu_down() and
+_cpu_up() functions. They differ in the arguments passed to these functions,
+in that during regular CPU hotplug, 0 is passed for the 'tasks_frozen'
+argument. But during suspend, since the tasks are already frozen by the time
+the non-boot CPUs are offlined or onlined, the _cpu_*() functions are called
+with the 'tasks_frozen' argument set to 1.
+[See below for some known issues regarding this.]
+
+
+Important files and functions/entry points:
+-------------------------------------------
+
+- kernel/power/process.c : freeze_processes(), thaw_processes()
+- kernel/power/suspend.c : suspend_prepare(), suspend_enter(), suspend_finish()
+- kernel/cpu.c: cpu_[up|down](), _cpu_[up|down](), [disable|enable]_nonboot_cpus()
+
+
+
+II. What are the issues involved in CPU hotplug?
+================================================
+
+There are some interesting situations involving CPU hotplug and microcode
+update on the CPUs, as discussed below:
+
+[Please bear in mind that the kernel requests the microcode images from
+userspace, using the request_firmware() function defined in
+drivers/base/firmware_loader/main.c]
+
+
+a. When all the CPUs are identical:
+
+   This is the most common situation and it is quite straightforward: we want
+   to apply the same microcode revision to each of the CPUs.
+   To give an example of x86, the collect_cpu_info() function defined in
+   arch/x86/kernel/microcode_core.c helps in discovering the type of the CPU
+   and thereby in applying the correct microcode revision to it.
+   But note that the kernel does not maintain a common microcode image for
+   all the CPUs, in order to handle case 'b' described below.
+
+
+b. When some of the CPUs are different than the rest:
+
+   In this case since we probably need to apply different microcode revisions
+   to different CPUs, the kernel maintains a copy of the correct microcode
+   image for each CPU (after appropriate CPU type/model discovery using
+   functions such as collect_cpu_info()).
+
+
+c. When a CPU is physically hot-unplugged and a new (and possibly different
+   type of) CPU is hot-plugged into the system:
+
+   In the current design of the kernel, whenever a CPU is taken offline during
+   a regular CPU hotplug operation, upon receiving the CPU_DEAD notification
+   (which is sent by the CPU hotplug code), the microcode update driver's
+   callback for that event reacts by freeing the kernel's copy of the
+   microcode image for that CPU.
+
+   Hence, when a new CPU is brought online, since the kernel finds that it
+   doesn't have the microcode image, it does the CPU type/model discovery
+   afresh and then requests the userspace for the appropriate microcode image
+   for that CPU, which is subsequently applied.
+
+   For example, in x86, the mc_cpu_callback() function (which is the microcode
+   update driver's callback registered for CPU hotplug events) calls
+   microcode_update_cpu() which would call microcode_init_cpu() in this case,
+   instead of microcode_resume_cpu() when it finds that the kernel doesn't
+   have a valid microcode image. This ensures that the CPU type/model
+   discovery is performed and the right microcode is applied to the CPU after
+   getting it from userspace.
+
+
+d. Handling microcode update during suspend/hibernate:
+
+   Strictly speaking, during a CPU hotplug operation which does not involve
+   physically removing or inserting CPUs, the CPUs are not actually powered
+   off during a CPU offline. They are just put to the lowest C-states possible.
+   Hence, in such a case, it is not really necessary to re-apply microcode
+   when the CPUs are brought back online, since they wouldn't have lost the
+   image during the CPU offline operation.
+
+   This is the usual scenario encountered during a resume after a suspend.
+   However, in the case of hibernation, since all the CPUs are completely
+   powered off, during restore it becomes necessary to apply the microcode
+   images to all the CPUs.
+
+   [Note that we don't expect someone to physically pull out nodes and insert
+   nodes with a different type of CPUs in-between a suspend-resume or a
+   hibernate/restore cycle.]
+
+   In the current design of the kernel however, during a CPU offline operation
+   as part of the suspend/hibernate cycle (cpuhp_tasks_frozen is set),
+   the existing copy of microcode image in the kernel is not freed up.
+   And during the CPU online operations (during resume/restore), since the
+   kernel finds that it already has copies of the microcode images for all the
+   CPUs, it just applies them to the CPUs, avoiding any re-discovery of CPU
+   type/model and the need for validating whether the microcode revisions are
+   right for the CPUs or not (due to the above assumption that physical CPU
+   hotplug will not be done in-between suspend/resume or hibernate/restore
+   cycles).
+
+
+III. Known problems
+===================
+
+Are there any known problems when regular CPU hotplug and suspend race
+with each other?
+
+Yes, they are listed below:
+
+1. When invoking regular CPU hotplug, the 'tasks_frozen' argument passed to
+   the _cpu_down() and _cpu_up() functions is *always* 0.
+   This might not reflect the true current state of the system, since the
+   tasks could have been frozen by an out-of-band event such as a suspend
+   operation in progress. Hence, the cpuhp_tasks_frozen variable will not
+   reflect the frozen state and the CPU hotplug callbacks which evaluate
+   that variable might execute the wrong code path.
+
+2. If a regular CPU hotplug stress test happens to race with the freezer due
+   to a suspend operation in progress at the same time, then we could hit the
+   situation described below:
+
+    * A regular cpu online operation continues its journey from userspace
+      into the kernel, since the freezing has not yet begun.
+    * Then freezer gets to work and freezes userspace.
+    * If cpu online has not yet completed the microcode update stuff by now,
+      it will now start waiting on the frozen userspace in the
+      TASK_UNINTERRUPTIBLE state, in order to get the microcode image.
+    * Now the freezer continues and tries to freeze the remaining tasks. But
+      due to this wait mentioned above, the freezer won't be able to freeze
+      the cpu online hotplug task and hence freezing of tasks fails.
+
+   As a result of this task freezing failure, the suspend operation gets
+   aborted.
diff --git a/Documentation/power/suspend-and-cpuhotplug.txt b/Documentation/power/suspend-and-cpuhotplug.txt
deleted file mode 100644
index a8751b8df10e..000000000000
--- a/Documentation/power/suspend-and-cpuhotplug.txt
+++ /dev/null
@@ -1,274 +0,0 @@
-Interaction of Suspend code (S3) with the CPU hotplug infrastructure
-
-     (C) 2011 - 2014 Srivatsa S. Bhat <srivatsa.bhat@linux.vnet.ibm.com>
-
-
-I. How does the regular CPU hotplug code differ from how the Suspend-to-RAM
-   infrastructure uses it internally? And where do they share common code?
-
-Well, a picture is worth a thousand words... So ASCII art follows :-)
-
-[This depicts the current design in the kernel, and focusses only on the
-interactions involving the freezer and CPU hotplug and also tries to explain
-the locking involved. It outlines the notifications involved as well.
-But please note that here, only the call paths are illustrated, with the aim
-of describing where they take different paths and where they share code.
-What happens when regular CPU hotplug and Suspend-to-RAM race with each other
-is not depicted here.]
-
-On a high level, the suspend-resume cycle goes like this:
-
-|Freeze| -> |Disable nonboot| -> |Do suspend| -> |Enable nonboot| -> |Thaw |
-|tasks |    |     cpus      |    |          |    |     cpus     |    |tasks|
-
-
-More details follow:
-
-                                Suspend call path
-                                -----------------
-
-                                  Write 'mem' to
-                                /sys/power/state
-                                    sysfs file
-                                        |
-                                        v
-                               Acquire system_transition_mutex lock
-                                        |
-                                        v
-                             Send PM_SUSPEND_PREPARE
-                                   notifications
-                                        |
-                                        v
-                                   Freeze tasks
-                                        |
-                                        |
-                                        v
-                              disable_nonboot_cpus()
-                                   /* start */
-                                        |
-                                        v
-                            Acquire cpu_add_remove_lock
-                                        |
-                                        v
-                             Iterate over CURRENTLY
-                                   online CPUs
-                                        |
-                                        |
-                                        |                ----------
-                                        v                          | L
-             ======>               _cpu_down()                     |
-            |              [This takes cpuhotplug.lock             |
-  Common    |               before taking down the CPU             |
-   code     |               and releases it when done]             | O
-            |            While it is at it, notifications          |
-            |            are sent when notable events occur,       |
-             ======>     by running all registered callbacks.      |
-                                        |                          | O
-                                        |                          |
-                                        |                          |
-                                        v                          |
-                            Note down these cpus in                | P
-                                frozen_cpus mask         ----------
-                                        |
-                                        v
-                           Disable regular cpu hotplug
-                        by increasing cpu_hotplug_disabled
-                                        |
-                                        v
-                            Release cpu_add_remove_lock
-                                        |
-                                        v
-                       /* disable_nonboot_cpus() complete */
-                                        |
-                                        v
-                                   Do suspend
-
-
-
-Resuming back is likewise, with the counterparts being (in the order of
-execution during resume):
-* enable_nonboot_cpus() which involves:
-   |  Acquire cpu_add_remove_lock
-   |  Decrease cpu_hotplug_disabled, thereby enabling regular cpu hotplug
-   |  Call _cpu_up() [for all those cpus in the frozen_cpus mask, in a loop]
-   |  Release cpu_add_remove_lock
-   v
-
-* thaw tasks
-* send PM_POST_SUSPEND notifications
-* Release system_transition_mutex lock.
-
-
-It is to be noted here that the system_transition_mutex lock is acquired at the very
-beginning, when we are just starting out to suspend, and then released only
-after the entire cycle is complete (i.e., suspend + resume).
-
-
-
-                          Regular CPU hotplug call path
-                          -----------------------------
-
-                                Write 0 (or 1) to
-                       /sys/devices/system/cpu/cpu*/online
-                                    sysfs file
-                                        |
-                                        |
-                                        v
-                                    cpu_down()
-                                        |
-                                        v
-                           Acquire cpu_add_remove_lock
-                                        |
-                                        v
-                          If cpu_hotplug_disabled > 0
-                                return gracefully
-                                        |
-                                        |
-                                        v
-             ======>                _cpu_down()
-            |              [This takes cpuhotplug.lock
-  Common    |               before taking down the CPU
-   code     |               and releases it when done]
-            |            While it is at it, notifications
-            |           are sent when notable events occur,
-             ======>    by running all registered callbacks.
-                                        |
-                                        |
-                                        v
-                          Release cpu_add_remove_lock
-                               [That's it!, for
-                              regular CPU hotplug]
-
-
-
-So, as can be seen from the two diagrams (the parts marked as "Common code"),
-regular CPU hotplug and the suspend code path converge at the _cpu_down() and
-_cpu_up() functions. They differ in the arguments passed to these functions,
-in that during regular CPU hotplug, 0 is passed for the 'tasks_frozen'
-argument. But during suspend, since the tasks are already frozen by the time
-the non-boot CPUs are offlined or onlined, the _cpu_*() functions are called
-with the 'tasks_frozen' argument set to 1.
-[See below for some known issues regarding this.]
-
-
-Important files and functions/entry points:
-------------------------------------------
-
-kernel/power/process.c : freeze_processes(), thaw_processes()
-kernel/power/suspend.c : suspend_prepare(), suspend_enter(), suspend_finish()
-kernel/cpu.c: cpu_[up|down](), _cpu_[up|down](), [disable|enable]_nonboot_cpus()
-
-
-
-II. What are the issues involved in CPU hotplug?
-    -------------------------------------------
-
-There are some interesting situations involving CPU hotplug and microcode
-update on the CPUs, as discussed below:
-
-[Please bear in mind that the kernel requests the microcode images from
-userspace, using the request_firmware() function defined in
-drivers/base/firmware_loader/main.c]
-
-
-a. When all the CPUs are identical:
-
-   This is the most common situation and it is quite straightforward: we want
-   to apply the same microcode revision to each of the CPUs.
-   To give an example of x86, the collect_cpu_info() function defined in
-   arch/x86/kernel/microcode_core.c helps in discovering the type of the CPU
-   and thereby in applying the correct microcode revision to it.
-   But note that the kernel does not maintain a common microcode image for the
-   all CPUs, in order to handle case 'b' described below.
-
-
-b. When some of the CPUs are different than the rest:
-
-   In this case since we probably need to apply different microcode revisions
-   to different CPUs, the kernel maintains a copy of the correct microcode
-   image for each CPU (after appropriate CPU type/model discovery using
-   functions such as collect_cpu_info()).
-
-
-c. When a CPU is physically hot-unplugged and a new (and possibly different
-   type of) CPU is hot-plugged into the system:
-
-   In the current design of the kernel, whenever a CPU is taken offline during
-   a regular CPU hotplug operation, upon receiving the CPU_DEAD notification
-   (which is sent by the CPU hotplug code), the microcode update driver's
-   callback for that event reacts by freeing the kernel's copy of the
-   microcode image for that CPU.
-
-   Hence, when a new CPU is brought online, since the kernel finds that it
-   doesn't have the microcode image, it does the CPU type/model discovery
-   afresh and then requests the userspace for the appropriate microcode image
-   for that CPU, which is subsequently applied.
-
-   For example, in x86, the mc_cpu_callback() function (which is the microcode
-   update driver's callback registered for CPU hotplug events) calls
-   microcode_update_cpu() which would call microcode_init_cpu() in this case,
-   instead of microcode_resume_cpu() when it finds that the kernel doesn't
-   have a valid microcode image. This ensures that the CPU type/model
-   discovery is performed and the right microcode is applied to the CPU after
-   getting it from userspace.
-
-
-d. Handling microcode update during suspend/hibernate:
-
-   Strictly speaking, during a CPU hotplug operation which does not involve
-   physically removing or inserting CPUs, the CPUs are not actually powered
-   off during a CPU offline. They are just put to the lowest C-states possible.
-   Hence, in such a case, it is not really necessary to re-apply microcode
-   when the CPUs are brought back online, since they wouldn't have lost the
-   image during the CPU offline operation.
-
-   This is the usual scenario encountered during a resume after a suspend.
-   However, in the case of hibernation, since all the CPUs are completely
-   powered off, during restore it becomes necessary to apply the microcode
-   images to all the CPUs.
-
-   [Note that we don't expect someone to physically pull out nodes and insert
-   nodes with a different type of CPUs in-between a suspend-resume or a
-   hibernate/restore cycle.]
-
-   In the current design of the kernel however, during a CPU offline operation
-   as part of the suspend/hibernate cycle (cpuhp_tasks_frozen is set),
-   the existing copy of microcode image in the kernel is not freed up.
-   And during the CPU online operations (during resume/restore), since the
-   kernel finds that it already has copies of the microcode images for all the
-   CPUs, it just applies them to the CPUs, avoiding any re-discovery of CPU
-   type/model and the need for validating whether the microcode revisions are
-   right for the CPUs or not (due to the above assumption that physical CPU
-   hotplug will not be done in-between suspend/resume or hibernate/restore
-   cycles).
-
-
-III. Are there any known problems when regular CPU hotplug and suspend race
-     with each other?
-
-Yes, they are listed below:
-
-1. When invoking regular CPU hotplug, the 'tasks_frozen' argument passed to
-   the _cpu_down() and _cpu_up() functions is *always* 0.
-   This might not reflect the true current state of the system, since the
-   tasks could have been frozen by an out-of-band event such as a suspend
-   operation in progress. Hence, the cpuhp_tasks_frozen variable will not
-   reflect the frozen state and the CPU hotplug callbacks which evaluate
-   that variable might execute the wrong code path.
-
-2. If a regular CPU hotplug stress test happens to race with the freezer due
-   to a suspend operation in progress at the same time, then we could hit the
-   situation described below:
-
-    * A regular cpu online operation continues its journey from userspace
-      into the kernel, since the freezing has not yet begun.
-    * Then freezer gets to work and freezes userspace.
-    * If cpu online has not yet completed the microcode update stuff by now,
-      it will now start waiting on the frozen userspace in the
-      TASK_UNINTERRUPTIBLE state, in order to get the microcode image.
-    * Now the freezer continues and tries to freeze the remaining tasks. But
-      due to this wait mentioned above, the freezer won't be able to freeze
-      the cpu online hotplug task and hence freezing of tasks fails.
-
-   As a result of this task freezing failure, the suspend operation gets
-   aborted.
diff --git a/Documentation/power/suspend-and-interrupts.rst b/Documentation/power/suspend-and-interrupts.rst
new file mode 100644
index 000000000000..4cda6617709a
--- /dev/null
+++ b/Documentation/power/suspend-and-interrupts.rst
@@ -0,0 +1,137 @@
+====================================
+System Suspend and Device Interrupts
+====================================
+
+Copyright (C) 2014 Intel Corp.
+Author: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
+
+
+Suspending and Resuming Device IRQs
+-----------------------------------
+
+Device interrupt request lines (IRQs) are generally disabled during system
+suspend after the "late" phase of suspending devices (that is, after all of the
+->prepare, ->suspend and ->suspend_late callbacks have been executed for all
+devices).  That is done by suspend_device_irqs().
+
+The rationale for doing so is that after the "late" phase of device suspend
+there is no legitimate reason why any interrupts from suspended devices should
+trigger and if any devices have not been suspended properly yet, it is better to
+block interrupts from them anyway.  Also, in the past we had problems with
+interrupt handlers for shared IRQs that device drivers implementing them were
+not prepared for interrupts triggering after their devices had been suspended.
+In some cases they would attempt to access, for example, memory address spaces
+of suspended devices and cause unpredictable behavior to ensue as a result.
+Unfortunately, such problems are very difficult to debug and the introduction
+of suspend_device_irqs(), along with the "noirq" phase of device suspend and
+resume, was the only practical way to mitigate them.
+
+Device IRQs are re-enabled during system resume, right before the "early" phase
+of resuming devices (that is, before starting to execute ->resume_early
+callbacks for devices).  The function doing that is resume_device_irqs().
+
+
+The IRQF_NO_SUSPEND Flag
+------------------------
+
+There are interrupts that can legitimately trigger during the entire system
+suspend-resume cycle, including the "noirq" phases of suspending and resuming
+devices as well as during the time when nonboot CPUs are taken offline and
+brought back online.  That applies to timer interrupts in the first place,
+but also to IPIs and to some other special-purpose interrupts.
+
+The IRQF_NO_SUSPEND flag is used to indicate that to the IRQ subsystem when
+requesting a special-purpose interrupt.  It causes suspend_device_irqs() to
+leave the corresponding IRQ enabled so as to allow the interrupt to work as
+expected during the suspend-resume cycle, but does not guarantee that the
+interrupt will wake the system from a suspended state -- for such cases it is
+necessary to use enable_irq_wake().
+
+Note that the IRQF_NO_SUSPEND flag affects the entire IRQ and not just one
+user of it.  Thus, if the IRQ is shared, all of the interrupt handlers installed
+for it will be executed as usual after suspend_device_irqs(), even if the
+IRQF_NO_SUSPEND flag was not passed to request_irq() (or equivalent) by some of
+the IRQ's users.  For this reason, using IRQF_NO_SUSPEND and IRQF_SHARED at the
+same time should be avoided.
+
+
+System Wakeup Interrupts, enable_irq_wake() and disable_irq_wake()
+------------------------------------------------------------------
+
+System wakeup interrupts generally need to be configured to wake up the system
+from sleep states, especially if they are used for different purposes (e.g. as
+I/O interrupts) in the working state.
+
+That may involve turning on a special signal handling logic within the platform
+(such as an SoC) so that signals from a given line are routed in a different way
+during system sleep so as to trigger a system wakeup when needed.  For example,
+the platform may include a dedicated interrupt controller used specifically for
+handling system wakeup events.  Then, if a given interrupt line is supposed to
+wake up the system from sleep states, the corresponding input of that interrupt
+controller needs to be enabled to receive signals from the line in question.
+After wakeup, it generally is better to disable that input to prevent the
+dedicated controller from triggering interrupts unnecessarily.
+
+The IRQ subsystem provides two helper functions to be used by device drivers for
+those purposes.  Namely, enable_irq_wake() turns on the platform's logic for
+handling the given IRQ as a system wakeup interrupt line and disable_irq_wake()
+turns that logic off.
+
+Calling enable_irq_wake() causes suspend_device_irqs() to treat the given IRQ
+in a special way.  Namely, the IRQ remains enabled, but on the first interrupt
+it will be disabled, marked as pending and "suspended" so that it will be
+re-enabled by resume_device_irqs() during the subsequent system resume.  Also
+the PM core is notified about the event which causes the system suspend in
+progress to be aborted (that doesn't have to happen immediately, but at one
+of the points where the suspend thread looks for pending wakeup events).
+
+This way every interrupt from a wakeup interrupt source will either cause the
+system suspend currently in progress to be aborted or wake up the system if
+already suspended.  However, after suspend_device_irqs() interrupt handlers are
+not executed for system wakeup IRQs.  They are only executed for IRQF_NO_SUSPEND
+IRQs at that time, but those IRQs should not be configured for system wakeup
+using enable_irq_wake().
+
+
+Interrupts and Suspend-to-Idle
+------------------------------
+
+Suspend-to-idle (also known as the "freeze" sleep state) is a relatively new
+system sleep state that works by idling all of the processors and waiting for
+interrupts right after the "noirq" phase of suspending devices.
+
+Of course, this means that all of the interrupts with the IRQF_NO_SUSPEND flag
+set will bring CPUs out of idle while in that state, but they will not cause the
+IRQ subsystem to trigger a system wakeup.
+
+System wakeup interrupts, in turn, will trigger wakeup from suspend-to-idle in
+analogy with what they do in the full system suspend case.  The only difference
+is that the wakeup from suspend-to-idle is signaled using the usual working
+state interrupt delivery mechanisms and doesn't require the platform to use
+any special interrupt handling logic for it to work.
+
+
+IRQF_NO_SUSPEND and enable_irq_wake()
+-------------------------------------
+
+There are very few valid reasons to use both enable_irq_wake() and the
+IRQF_NO_SUSPEND flag on the same IRQ, and it is never valid to use both for the
+same device.
+
+First of all, if the IRQ is not shared, the rules for handling IRQF_NO_SUSPEND
+interrupts (interrupt handlers are invoked after suspend_device_irqs()) are
+directly at odds with the rules for handling system wakeup interrupts (interrupt
+handlers are not invoked after suspend_device_irqs()).
+
+Second, both enable_irq_wake() and IRQF_NO_SUSPEND apply to entire IRQs and not
+to individual interrupt handlers, so sharing an IRQ between a system wakeup
+interrupt source and an IRQF_NO_SUSPEND interrupt source does not generally
+make sense.
+
+In rare cases an IRQ can be shared between a wakeup device driver and an
+IRQF_NO_SUSPEND user. In order for this to be safe, the wakeup device driver
+must be able to discern spurious IRQs from genuine wakeup events (signalling
+the latter to the core with pm_system_wakeup()), must use enable_irq_wake() to
+ensure that the IRQ will function as a wakeup source, and must request the IRQ
+with IRQF_COND_SUSPEND to tell the core that it meets these requirements. If
+these requirements are not met, it is not valid to use IRQF_COND_SUSPEND.
diff --git a/Documentation/power/suspend-and-interrupts.txt b/Documentation/power/suspend-and-interrupts.txt
deleted file mode 100644
index 8afb29a8604a..000000000000
--- a/Documentation/power/suspend-and-interrupts.txt
+++ /dev/null
@@ -1,135 +0,0 @@
-System Suspend and Device Interrupts
-
-Copyright (C) 2014 Intel Corp.
-Author: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
-
-
-Suspending and Resuming Device IRQs
------------------------------------
-
-Device interrupt request lines (IRQs) are generally disabled during system
-suspend after the "late" phase of suspending devices (that is, after all of the
-->prepare, ->suspend and ->suspend_late callbacks have been executed for all
-devices).  That is done by suspend_device_irqs().
-
-The rationale for doing so is that after the "late" phase of device suspend
-there is no legitimate reason why any interrupts from suspended devices should
-trigger and if any devices have not been suspended properly yet, it is better to
-block interrupts from them anyway.  Also, in the past we had problems with
-interrupt handlers for shared IRQs that device drivers implementing them were
-not prepared for interrupts triggering after their devices had been suspended.
-In some cases they would attempt to access, for example, memory address spaces
-of suspended devices and cause unpredictable behavior to ensue as a result.
-Unfortunately, such problems are very difficult to debug and the introduction
-of suspend_device_irqs(), along with the "noirq" phase of device suspend and
-resume, was the only practical way to mitigate them.
-
-Device IRQs are re-enabled during system resume, right before the "early" phase
-of resuming devices (that is, before starting to execute ->resume_early
-callbacks for devices).  The function doing that is resume_device_irqs().
-
-
-The IRQF_NO_SUSPEND Flag
-------------------------
-
-There are interrupts that can legitimately trigger during the entire system
-suspend-resume cycle, including the "noirq" phases of suspending and resuming
-devices as well as during the time when nonboot CPUs are taken offline and
-brought back online.  That applies to timer interrupts in the first place,
-but also to IPIs and to some other special-purpose interrupts.
-
-The IRQF_NO_SUSPEND flag is used to indicate that to the IRQ subsystem when
-requesting a special-purpose interrupt.  It causes suspend_device_irqs() to
-leave the corresponding IRQ enabled so as to allow the interrupt to work as
-expected during the suspend-resume cycle, but does not guarantee that the
-interrupt will wake the system from a suspended state -- for such cases it is
-necessary to use enable_irq_wake().
-
-Note that the IRQF_NO_SUSPEND flag affects the entire IRQ and not just one
-user of it.  Thus, if the IRQ is shared, all of the interrupt handlers installed
-for it will be executed as usual after suspend_device_irqs(), even if the
-IRQF_NO_SUSPEND flag was not passed to request_irq() (or equivalent) by some of
-the IRQ's users.  For this reason, using IRQF_NO_SUSPEND and IRQF_SHARED at the
-same time should be avoided.
-
-
-System Wakeup Interrupts, enable_irq_wake() and disable_irq_wake()
-------------------------------------------------------------------
-
-System wakeup interrupts generally need to be configured to wake up the system
-from sleep states, especially if they are used for different purposes (e.g. as
-I/O interrupts) in the working state.
-
-That may involve turning on a special signal handling logic within the platform
-(such as an SoC) so that signals from a given line are routed in a different way
-during system sleep so as to trigger a system wakeup when needed.  For example,
-the platform may include a dedicated interrupt controller used specifically for
-handling system wakeup events.  Then, if a given interrupt line is supposed to
-wake up the system from sleep sates, the corresponding input of that interrupt
-controller needs to be enabled to receive signals from the line in question.
-After wakeup, it generally is better to disable that input to prevent the
-dedicated controller from triggering interrupts unnecessarily.
-
-The IRQ subsystem provides two helper functions to be used by device drivers for
-those purposes.  Namely, enable_irq_wake() turns on the platform's logic for
-handling the given IRQ as a system wakeup interrupt line and disable_irq_wake()
-turns that logic off.
-
-Calling enable_irq_wake() causes suspend_device_irqs() to treat the given IRQ
-in a special way.  Namely, the IRQ remains enabled, by on the first interrupt
-it will be disabled, marked as pending and "suspended" so that it will be
-re-enabled by resume_device_irqs() during the subsequent system resume.  Also
-the PM core is notified about the event which causes the system suspend in
-progress to be aborted (that doesn't have to happen immediately, but at one
-of the points where the suspend thread looks for pending wakeup events).
-
-This way every interrupt from a wakeup interrupt source will either cause the
-system suspend currently in progress to be aborted or wake up the system if
-already suspended.  However, after suspend_device_irqs() interrupt handlers are
-not executed for system wakeup IRQs.  They are only executed for IRQF_NO_SUSPEND
-IRQs at that time, but those IRQs should not be configured for system wakeup
-using enable_irq_wake().
-
-
-Interrupts and Suspend-to-Idle
-------------------------------
-
-Suspend-to-idle (also known as the "freeze" sleep state) is a relatively new
-system sleep state that works by idling all of the processors and waiting for
-interrupts right after the "noirq" phase of suspending devices.
-
-Of course, this means that all of the interrupts with the IRQF_NO_SUSPEND flag
-set will bring CPUs out of idle while in that state, but they will not cause the
-IRQ subsystem to trigger a system wakeup.
-
-System wakeup interrupts, in turn, will trigger wakeup from suspend-to-idle in
-analogy with what they do in the full system suspend case.  The only difference
-is that the wakeup from suspend-to-idle is signaled using the usual working
-state interrupt delivery mechanisms and doesn't require the platform to use
-any special interrupt handling logic for it to work.
-
-
-IRQF_NO_SUSPEND and enable_irq_wake()
--------------------------------------
-
-There are very few valid reasons to use both enable_irq_wake() and the
-IRQF_NO_SUSPEND flag on the same IRQ, and it is never valid to use both for the
-same device.
-
-First of all, if the IRQ is not shared, the rules for handling IRQF_NO_SUSPEND
-interrupts (interrupt handlers are invoked after suspend_device_irqs()) are
-directly at odds with the rules for handling system wakeup interrupts (interrupt
-handlers are not invoked after suspend_device_irqs()).
-
-Second, both enable_irq_wake() and IRQF_NO_SUSPEND apply to entire IRQs and not
-to individual interrupt handlers, so sharing an IRQ between a system wakeup
-interrupt source and an IRQF_NO_SUSPEND interrupt source does not generally
-make sense.
-
-In rare cases an IRQ can be shared between a wakeup device driver and an
-IRQF_NO_SUSPEND user. In order for this to be safe, the wakeup device driver
-must be able to discern spurious IRQs from genuine wakeup events (signalling
-the latter to the core with pm_system_wakeup()), must use enable_irq_wake() to
-ensure that the IRQ will function as a wakeup source, and must request the IRQ
-with IRQF_COND_SUSPEND to tell the core that it meets these requirements. If
-these requirements are not met, it is not valid to use IRQF_COND_SUSPEND.
diff --git a/Documentation/power/swsusp-and-swap-files.rst b/Documentation/power/swsusp-and-swap-files.rst
new file mode 100644
index 000000000000..a33a2919dbe4
--- /dev/null
+++ b/Documentation/power/swsusp-and-swap-files.rst
@@ -0,0 +1,63 @@
+===============================================
+Using swap files with software suspend (swsusp)
+===============================================
+
+	(C) 2006 Rafael J. Wysocki <rjw@sisk.pl>
+
+The Linux kernel handles swap files almost in the same way as it handles swap
+partitions and there are only two differences between these two types of swap
+areas:
+(1) swap files need not be contiguous,
+(2) the header of a swap file is not in the first block of the partition that
+holds it.  From the swsusp's point of view (1) is not a problem, because it is
+already taken care of by the swap-handling code, but (2) has to be taken into
+consideration.
+
+In principle the location of a swap file's header may be determined with the
+help of an appropriate filesystem driver.  Unfortunately, however, it requires
+the filesystem holding the swap file to be mounted, and if this filesystem is
+journaled, it cannot be mounted during resume from disk.  For this reason to
+identify a swap file swsusp uses the name of the partition that holds the file
+and the offset from the beginning of the partition at which the swap file's
+header is located.  For convenience, this offset is expressed in <PAGE_SIZE>
+units.
+
+In order to use a swap file with swsusp, you need to:
+
+1) Create the swap file and make it active, e.g.::
+
+    # dd if=/dev/zero of=<swap_file_path> bs=1024 count=<swap_file_size_in_k>
+    # mkswap <swap_file_path>
+    # swapon <swap_file_path>
+
+2) Use an application that will bmap the swap file with the help of the
+FIBMAP ioctl and determine the location of the file's swap header, as the
+offset, in <PAGE_SIZE> units, from the beginning of the partition which
+holds the swap file.
+
+3) Add the following parameters to the kernel command line::
+
+    resume=<swap_file_partition> resume_offset=<swap_file_offset>
+
+where <swap_file_partition> is the partition on which the swap file is located
+and <swap_file_offset> is the offset of the swap header determined by the
+application in 2) (of course, this step may be carried out automatically
+by the same application that determines the swap file's header offset using the
+FIBMAP ioctl)
+
+OR
+
+Use a userland suspend application that will set the partition and offset
+with the help of the SNAPSHOT_SET_SWAP_AREA ioctl described in
+Documentation/power/userland-swsusp.rst (this is the only method to suspend
+to a swap file allowing the resume to be initiated from an initrd or initramfs
+image).
+
+Now, swsusp will use the swap file in the same way in which it would use a swap
+partition.  In particular, the swap file has to be active (i.e. be present in
+/proc/swaps) so that it can be used for suspending.
+
+Note that if the swap file used for suspending is deleted and recreated,
+the location of its header need not be the same as before.  Thus every time
+this happens the value of the "resume_offset=" kernel command line parameter
+has to be updated.
diff --git a/Documentation/power/swsusp-and-swap-files.txt b/Documentation/power/swsusp-and-swap-files.txt
deleted file mode 100644
index f281886de490..000000000000
--- a/Documentation/power/swsusp-and-swap-files.txt
+++ /dev/null
@@ -1,60 +0,0 @@
-Using swap files with software suspend (swsusp)
-	(C) 2006 Rafael J. Wysocki <rjw@sisk.pl>
-
-The Linux kernel handles swap files almost in the same way as it handles swap
-partitions and there are only two differences between these two types of swap
-areas:
-(1) swap files need not be contiguous,
-(2) the header of a swap file is not in the first block of the partition that
-holds it.  From the swsusp's point of view (1) is not a problem, because it is
-already taken care of by the swap-handling code, but (2) has to be taken into
-consideration.
-
-In principle the location of a swap file's header may be determined with the
-help of appropriate filesystem driver.  Unfortunately, however, it requires the
-filesystem holding the swap file to be mounted, and if this filesystem is
-journaled, it cannot be mounted during resume from disk.  For this reason to
-identify a swap file swsusp uses the name of the partition that holds the file
-and the offset from the beginning of the partition at which the swap file's
-header is located.  For convenience, this offset is expressed in <PAGE_SIZE>
-units.
-
-In order to use a swap file with swsusp, you need to:
-
-1) Create the swap file and make it active, eg.
-
-# dd if=/dev/zero of=<swap_file_path> bs=1024 count=<swap_file_size_in_k>
-# mkswap <swap_file_path>
-# swapon <swap_file_path>
-
-2) Use an application that will bmap the swap file with the help of the
-FIBMAP ioctl and determine the location of the file's swap header, as the
-offset, in <PAGE_SIZE> units, from the beginning of the partition which
-holds the swap file.
-
-3) Add the following parameters to the kernel command line:
-
-resume=<swap_file_partition> resume_offset=<swap_file_offset>
-
-where <swap_file_partition> is the partition on which the swap file is located
-and <swap_file_offset> is the offset of the swap header determined by the
-application in 2) (of course, this step may be carried out automatically
-by the same application that determines the swap file's header offset using the
-FIBMAP ioctl)
-
-OR
-
-Use a userland suspend application that will set the partition and offset
-with the help of the SNAPSHOT_SET_SWAP_AREA ioctl described in
-Documentation/power/userland-swsusp.txt (this is the only method to suspend
-to a swap file allowing the resume to be initiated from an initrd or initramfs
-image).
-
-Now, swsusp will use the swap file in the same way in which it would use a swap
-partition.  In particular, the swap file has to be active (ie. be present in
-/proc/swaps) so that it can be used for suspending.
-
-Note that if the swap file used for suspending is deleted and recreated,
-the location of its header need not be the same as before.  Thus every time
-this happens the value of the "resume_offset=" kernel command line parameter
-has to be updated.
diff --git a/Documentation/power/swsusp-dmcrypt.rst b/Documentation/power/swsusp-dmcrypt.rst
new file mode 100644
index 000000000000..426df59172cd
--- /dev/null
+++ b/Documentation/power/swsusp-dmcrypt.rst
@@ -0,0 +1,140 @@
+=======================================
+How to use dm-crypt and swsusp together
+=======================================
+
+Author: Andreas Steinmetz <ast@domdv.de>
+
+
+
+Some prerequisites:
+You know how dm-crypt works. If not, visit the following web page:
+http://www.saout.de/misc/dm-crypt/
+You have read Documentation/power/swsusp.rst and understand it.
+You did read Documentation/admin-guide/initrd.rst and know how an initrd works.
+You know how to create or how to modify an initrd.
+
+Now your system is properly set up, your disk is encrypted except for
+the swap device(s) and the boot partition which may contain a mini
+system for crypto setup and/or rescue purposes. You may even have
+an initrd that does your current crypto setup already.
+
+At this point you want to encrypt your swap, too. Still you want to
+be able to suspend using swsusp. This, however, means that you
+have to be able to either enter a passphrase or that you read
+the key(s) from an external device like a pcmcia flash disk
+or a USB stick prior to resume. So you need an initrd, that sets
+up dm-crypt and then asks swsusp to resume from the encrypted
+swap device.
+
+The most important thing is that you set up dm-crypt in such
+a way that the swap device you suspend to/resume from has
+always the same major/minor within the initrd as well as
+within your running system. The easiest way to achieve this is
+to always set up this swap device first with dmsetup, so that
+it will always look like the following::
+
+  brw-------  1 root root 254, 0 Jul 28 13:37 /dev/mapper/swap0
+
+Now set up your kernel to use /dev/mapper/swap0 as the default
+resume partition, so your kernel .config contains::
+
+  CONFIG_PM_STD_PARTITION="/dev/mapper/swap0"
+
+Prepare your boot loader to use the initrd you will create or
+modify. For lilo the simplest setup looks like the following
+lines::
+
+  image=/boot/vmlinuz
+  initrd=/boot/initrd.gz
+  label=linux
+  append="root=/dev/ram0 init=/linuxrc rw"
+
+Finally you need to create or modify your initrd. Let's assume
+you create an initrd that reads the required dm-crypt setup
+from a pcmcia flash disk card. The card is formatted with an ext2
+fs which resides on /dev/hde1 when the card is inserted. The
+card contains at least the encrypted swap setup in a file
+named "swapkey". /etc/fstab of your initrd contains something
+like the following::
+
+  /dev/hda1   /mnt    ext3      ro                            0 0
+  none        /proc   proc      defaults,noatime,nodiratime   0 0
+  none        /sys    sysfs     defaults,noatime,nodiratime   0 0
+
+/dev/hda1 contains an unencrypted mini system that sets up all
+of your crypto devices, again by reading the setup from the
+pcmcia flash disk. What follows now is a /linuxrc for your
+initrd that allows you to resume from encrypted swap and that
+continues boot with your mini system on /dev/hda1 if resume
+does not happen::
+
+  #!/bin/sh
+  PATH=/sbin:/bin:/usr/sbin:/usr/bin
+  mount /proc
+  mount /sys
+  mapped=0
+  noresume=`grep -c noresume /proc/cmdline`
+  if [ "$*" != "" ]
+  then
+    noresume=1
+  fi
+  dmesg -n 1
+  /sbin/cardmgr -q
+  for i in 1 2 3 4 5 6 7 8 9 0
+  do
+    if [ -f /proc/ide/hde/media ]
+    then
+      usleep 500000
+      mount -t ext2 -o ro /dev/hde1 /mnt
+      if [ -f /mnt/swapkey ]
+      then
+        dmsetup create swap0 /mnt/swapkey > /dev/null 2>&1 && mapped=1
+      fi
+      umount /mnt
+      break
+    fi
+    usleep 500000
+  done
+  killproc /sbin/cardmgr
+  dmesg -n 6
+  if [ $mapped = 1 ]
+  then
+    if [ $noresume != 0 ]
+    then
+      mkswap /dev/mapper/swap0 > /dev/null 2>&1
+    fi
+    echo 254:0 > /sys/power/resume
+    dmsetup remove swap0
+  fi
+  umount /sys
+  mount /mnt
+  umount /proc
+  cd /mnt
+  pivot_root . mnt
+  mount /proc
+  umount -l /mnt
+  umount /proc
+  exec chroot . /sbin/init $* < dev/console > dev/console 2>&1
+
+Please don't mind the weird loop above, busybox's msh doesn't know
+the let statement. Now, what is happening in the script?
+First we have to decide if we want to try to resume, or not.
+We will not resume if booting with "noresume" or any parameters
+for init like "single" or "emergency" as boot parameters.
+
+Then we need to set up dmcrypt with the setup data from the
+pcmcia flash disk. If this succeeds we need to reset the swap
+device if we don't want to resume. The line "echo 254:0 > /sys/power/resume"
+then attempts to resume from the first device mapper device.
+Note that it is important to set the device in /sys/power/resume,
+regardless if resuming or not, otherwise later suspend will fail.
+If resume starts, script execution terminates here.
+
+Otherwise we just remove the encrypted swap device and leave it to the
+mini system on /dev/hda1 to set the whole crypto up (it is up to
+you to modify this to your taste).
+
+What then follows is the well known process to change the root
+file system and continue booting from there. I prefer to unmount
+the initrd prior to continue booting but it is up to you to modify
+this.
diff --git a/Documentation/power/swsusp-dmcrypt.txt b/Documentation/power/swsusp-dmcrypt.txt
deleted file mode 100644
index b802fbfd95ef..000000000000
--- a/Documentation/power/swsusp-dmcrypt.txt
+++ /dev/null
@@ -1,138 +0,0 @@
-Author: Andreas Steinmetz <ast@domdv.de>
-
-
-How to use dm-crypt and swsusp together:
-========================================
-
-Some prerequisites:
-You know how dm-crypt works. If not, visit the following web page:
-http://www.saout.de/misc/dm-crypt/
-You have read Documentation/power/swsusp.txt and understand it.
-You did read Documentation/admin-guide/initrd.rst and know how an initrd works.
-You know how to create or how to modify an initrd.
-
-Now your system is properly set up, your disk is encrypted except for
-the swap device(s) and the boot partition which may contain a mini
-system for crypto setup and/or rescue purposes. You may even have
-an initrd that does your current crypto setup already.
-
-At this point you want to encrypt your swap, too. Still you want to
-be able to suspend using swsusp. This, however, means that you
-have to be able to either enter a passphrase or that you read
-the key(s) from an external device like a pcmcia flash disk
-or an usb stick prior to resume. So you need an initrd, that sets
-up dm-crypt and then asks swsusp to resume from the encrypted
-swap device.
-
-The most important thing is that you set up dm-crypt in such
-a way that the swap device you suspend to/resume from has
-always the same major/minor within the initrd as well as
-within your running system. The easiest way to achieve this is
-to always set up this swap device first with dmsetup, so that
-it will always look like the following:
-
-brw-------  1 root root 254, 0 Jul 28 13:37 /dev/mapper/swap0
-
-Now set up your kernel to use /dev/mapper/swap0 as the default
-resume partition, so your kernel .config contains:
-
-CONFIG_PM_STD_PARTITION="/dev/mapper/swap0"
-
-Prepare your boot loader to use the initrd you will create or
-modify. For lilo the simplest setup looks like the following
-lines:
-
-image=/boot/vmlinuz
-initrd=/boot/initrd.gz
-label=linux
-append="root=/dev/ram0 init=/linuxrc rw"
-
-Finally you need to create or modify your initrd. Lets assume
-you create an initrd that reads the required dm-crypt setup
-from a pcmcia flash disk card. The card is formatted with an ext2
-fs which resides on /dev/hde1 when the card is inserted. The
-card contains at least the encrypted swap setup in a file
-named "swapkey". /etc/fstab of your initrd contains something
-like the following:
-
-/dev/hda1   /mnt    ext3      ro                            0 0
-none        /proc   proc      defaults,noatime,nodiratime   0 0
-none        /sys    sysfs     defaults,noatime,nodiratime   0 0
-
-/dev/hda1 contains an unencrypted mini system that sets up all
-of your crypto devices, again by reading the setup from the
-pcmcia flash disk. What follows now is a /linuxrc for your
-initrd that allows you to resume from encrypted swap and that
-continues boot with your mini system on /dev/hda1 if resume
-does not happen:
-
-#!/bin/sh
-PATH=/sbin:/bin:/usr/sbin:/usr/bin
-mount /proc
-mount /sys
-mapped=0
-noresume=`grep -c noresume /proc/cmdline`
-if [ "$*" != "" ]
-then
-  noresume=1
-fi
-dmesg -n 1
-/sbin/cardmgr -q
-for i in 1 2 3 4 5 6 7 8 9 0
-do
-  if [ -f /proc/ide/hde/media ]
-  then
-    usleep 500000
-    mount -t ext2 -o ro /dev/hde1 /mnt
-    if [ -f /mnt/swapkey ]
-    then
-      dmsetup create swap0 /mnt/swapkey > /dev/null 2>&1 && mapped=1
-    fi
-    umount /mnt
-    break
-  fi
-  usleep 500000
-done
-killproc /sbin/cardmgr
-dmesg -n 6
-if [ $mapped = 1 ]
-then
-  if [ $noresume != 0 ]
-  then
-    mkswap /dev/mapper/swap0 > /dev/null 2>&1
-  fi
-  echo 254:0 > /sys/power/resume
-  dmsetup remove swap0
-fi
-umount /sys
-mount /mnt
-umount /proc
-cd /mnt
-pivot_root . mnt
-mount /proc
-umount -l /mnt
-umount /proc
-exec chroot . /sbin/init $* < dev/console > dev/console 2>&1
-
-Please don't mind the weird loop above, busybox's msh doesn't know
-the let statement. Now, what is happening in the script?
-First we have to decide if we want to try to resume, or not.
-We will not resume if booting with "noresume" or any parameters
-for init like "single" or "emergency" as boot parameters.
-
-Then we need to set up dmcrypt with the setup data from the
-pcmcia flash disk. If this succeeds we need to reset the swap
-device if we don't want to resume. The line "echo 254:0 > /sys/power/resume"
-then attempts to resume from the first device mapper device.
-Note that it is important to set the device in /sys/power/resume,
-regardless if resuming or not, otherwise later suspend will fail.
-If resume starts, script execution terminates here.
-
-Otherwise we just remove the encrypted swap device and leave it to the
-mini system on /dev/hda1 to set the whole crypto up (it is up to
-you to modify this to your taste).
-
-What then follows is the well known process to change the root
-file system and continue booting from there. I prefer to unmount
-the initrd prior to continue booting but it is up to you to modify
-this.
diff --git a/Documentation/power/swsusp.rst b/Documentation/power/swsusp.rst
new file mode 100644
index 000000000000..d000312f6965
--- /dev/null
+++ b/Documentation/power/swsusp.rst
@@ -0,0 +1,501 @@
+============
+Swap suspend
+============
+
+Some warnings, first.
+
+.. warning::
+
+   **BIG FAT WARNING**
+
+   If you touch anything on disk between suspend and resume...
+				...kiss your data goodbye.
+
+   If you do resume from initrd after your filesystems are mounted...
+				...bye bye root partition.
+
+			[this is actually same case as above]
+
+   If you have unsupported (*) devices using DMA, you may have some
+   problems. If your disk driver does not support suspend... (IDE does),
+   it may cause some problems, too. If you change kernel command line
+   between suspend and resume, it may do something wrong. If you change
+   your hardware while system is suspended... well, it was not good idea;
+   but it will probably only crash.
+
+   (*) suspend/resume support is needed to make it safe.
+
+   If you have any filesystems on USB devices mounted before software suspend,
+   they won't be accessible after resume and you may lose data, as though
+   you have unplugged the USB devices with mounted filesystems on them;
+   see the FAQ below for details.  (This is not true for more traditional
+   power states like "standby", which normally don't turn USB off.)
+
+Swap partition:
+  You need to append resume=/dev/your_swap_partition to kernel command
+  line or specify it using /sys/power/resume.
+
+Swap file:
+  If using a swapfile you can also specify a resume offset using
+  resume_offset=<number> on the kernel command line or specify it
+  in /sys/power/resume_offset.
+
+After preparing, you can suspend by::
+
+	echo shutdown > /sys/power/disk; echo disk > /sys/power/state
+
+- If you feel ACPI works pretty well on your system, you might try::
+
+	echo platform > /sys/power/disk; echo disk > /sys/power/state
+
+- If you would like to write hibernation image to swap and then suspend
+  to RAM (provided your platform supports it), you can try::
+
+	echo suspend > /sys/power/disk; echo disk > /sys/power/state
+
+- If you have SATA disks, you'll need recent kernels with SATA suspend
+  support. For suspend and resume to work, make sure your disk drivers
+  are built into the kernel -- not modules. [There's a way to make
+  suspend/resume work with modular disk drivers, see FAQ, but you probably
+  should not do that.]
+
+If you want to limit the suspend image size to N bytes, do::
+
+	echo N > /sys/power/image_size
+
+before suspend (it is limited to around 2/5 of available RAM by default).
+
+- The resume process checks for the presence of the resume device,
+  if found, it then checks the contents for the hibernation image signature.
+  If both are found, it resumes the hibernation image.
+
+- The resume process may be triggered in two ways:
+
+  1) During lateinit:  If resume=/dev/your_swap_partition is specified on
+     the kernel command line, lateinit runs the resume process.  If the
+     resume device has not been probed yet, the resume process fails and
+     bootup continues.
+  2) Manually from an initrd or initramfs:  May be run from
+     the init script by using the /sys/power/resume file.  It is vital
+     that this be done prior to remounting any filesystems (even as
+     read-only) otherwise data may be corrupted.
+
+Article about goals and implementation of Software Suspend for Linux
+====================================================================
+
+Author: Gábor Kuti
+Last revised: 2003-10-20 by Pavel Machek
+
+Idea and goals to achieve
+-------------------------
+
+Nowadays it is common in several laptops that they have a suspend button. It
+saves the state of the machine to a filesystem or to a partition and switches
+to standby mode. Later resuming the machine the saved state is loaded back to
+ram and the machine can continue its work. It has two real benefits. First we
+save ourselves the time machine goes down and later boots up, energy costs
+are real high when running from batteries. The other gain is that we don't have
+to interrupt our programs so processes that are calculating something for a long
+time shouldn't need to be written interruptible.
+
+swsusp saves the state of the machine into active swaps and then reboots or
+powerdowns.  You must explicitly specify the swap partition to resume from with
+`resume=` kernel option. If signature is found it loads and restores saved
+state. If the option `noresume` is specified as a boot parameter, it skips
+the resuming.  If the option `hibernate=nocompress` is specified as a boot
+parameter, it saves hibernation image without compression.
+
+In the meantime while the system is suspended you should not add/remove any
+of the hardware, write to the filesystems, etc.
+
+Sleep states summary
+====================
+
+There are three different interfaces you can use, /proc/acpi should
+work like this:
+
+In a really perfect world::
+
+  echo 1 > /proc/acpi/sleep       # for standby
+  echo 2 > /proc/acpi/sleep       # for suspend to ram
+  echo 3 > /proc/acpi/sleep       # for suspend to ram, but with more power conservative
+  echo 4 > /proc/acpi/sleep       # for suspend to disk
+  echo 5 > /proc/acpi/sleep       # for shutdown unfriendly the system
+
+and perhaps::
+
+  echo 4b > /proc/acpi/sleep      # for suspend to disk via s4bios
+
+Frequently Asked Questions
+==========================
+
+Q:
+  well, suspending a server is IMHO a really stupid thing,
+  but... (Diego Zuccato):
+
+A:
+  You bought new UPS for your server. How do you install it without
+  bringing machine down? Suspend to disk, rearrange power cables,
+  resume.
+
+  You have your server on UPS. Power died, and UPS is indicating 30
+  seconds to failure. What do you do? Suspend to disk.
+
+
+Q:
+  Maybe I'm missing something, but why don't the regular I/O paths work?
+
+A:
+  We do use the regular I/O paths. However we cannot restore the data
+  to its original location as we load it. That would create an
+  inconsistent kernel state which would certainly result in an oops.
+  Instead, we load the image into unused memory and then atomically copy
+  it back to its original location. This implies, of course, a maximum
+  image size of half the amount of memory.
+
+  There are two solutions to this:
+
+  * require half of memory to be free during suspend. That way you can
+    read "new" data onto free spots, then cli and copy
+
+  * assume we had special "polling" ide driver that only uses memory
+    between 0-640KB. That way, I'd have to make sure that 0-640KB is free
+    during suspending, but otherwise it would work...
+
+  suspend2 shares this fundamental limitation, but does not include user
+  data and disk caches into "used memory" by saving them in
+  advance. That means that the limitation goes away in practice.
+
+Q:
+  Does linux support ACPI S4?
+
+A:
+  Yes. That's what echo platform > /sys/power/disk does.
+
+Q:
+  What is 'suspend2'?
+
+A:
+  suspend2 is 'Software Suspend 2', a forked implementation of
+  suspend-to-disk which is available as separate patches for 2.4 and 2.6
+  kernels from swsusp.sourceforge.net. It includes support for SMP, 4GB
+  highmem and preemption. It also has an extensible architecture that
+  allows for arbitrary transformations on the image (compression,
+  encryption) and arbitrary backends for writing the image (eg to swap
+  or an NFS share[Work In Progress]). Questions regarding suspend2
+  should be sent to the mailing list available through the suspend2
+  website, and not to the Linux Kernel Mailing List. We are working
+  toward merging suspend2 into the mainline kernel.
+
+Q:
+  What is the freezing of tasks and why are we using it?
+
+A:
+  The freezing of tasks is a mechanism by which user space processes and some
+  kernel threads are controlled during hibernation or system-wide suspend (on some
+  architectures).  See freezing-of-tasks.rst for details.
+
+Q:
+  What is the difference between "platform" and "shutdown"?
+
+A:
+  shutdown:
+	save state in linux, then tell bios to powerdown
+
+  platform:
+	save state in linux, then tell bios to powerdown and blink
+        "suspended led"
+
+  "platform" is actually right thing to do where supported, but
+  "shutdown" is most reliable (except on ACPI systems).
+
+Q:
+  I do not understand why you have such strong objections to idea of
+  selective suspend.
+
+A:
+  Do selective suspend during runtime power management, that's okay. But
+  it's useless for suspend-to-disk. (And I do not see how you could use
+  it for suspend-to-ram, I hope you do not want that).
+
+  Let's see, so you suggest to
+
+  * SUSPEND all but swap device and parents
+  * Snapshot
+  * Write image to disk
+  * SUSPEND swap device and parents
+  * Powerdown
+
+  Oh no, that does not work, if swap device or its parents uses DMA,
+  you've corrupted data. You'd have to do
+
+  * SUSPEND all but swap device and parents
+  * FREEZE swap device and parents
+  * Snapshot
+  * UNFREEZE swap device and parents
+  * Write
+  * SUSPEND swap device and parents
+
+  Which means that you still need that FREEZE state, and you get more
+  complicated code. (And I have not yet introduced details like system
+  devices).
+
+Q:
+  There don't seem to be any generally useful behavioral
+  distinctions between SUSPEND and FREEZE.
+
+A:
+  Doing SUSPEND when you are asked to do FREEZE is always correct,
+  but it may be unnecessarily slow. If you want your driver to stay simple,
+  slowness may not matter to you. It can always be fixed later.
+
+  For devices like disk it does matter, you do not want to spindown for
+  FREEZE.
+
+Q:
+  After resuming, system is paging heavily, leading to very bad interactivity.
+
+A:
+  Try running::
+
+    cat /proc/[0-9]*/maps | grep / | sed 's:.* /:/:' | sort -u | while read file
+    do
+      test -f "$file" && cat "$file" > /dev/null
+    done
+
+  after resume. swapoff -a; swapon -a may also be useful.
+
+Q:
+  What happens to devices during swsusp? They seem to be resumed
+  during system suspend?
+
+A:
+  That's correct. We need to resume them if we want to write image to
+  disk. Whole sequence goes like
+
+      **Suspend part**
+
+      running system, user asks for suspend-to-disk
+
+      user processes are stopped
+
+      suspend(PMSG_FREEZE): devices are frozen so that they don't interfere
+      with state snapshot
+
+      state snapshot: copy of whole used memory is taken with interrupts disabled
+
+      resume(): devices are woken up so that we can write image to swap
+
+      write image to swap
+
+      suspend(PMSG_SUSPEND): suspend devices so that we can power off
+
+      turn the power off
+
+      **Resume part**
+
+      (is actually pretty similar)
+
+      running system, user asks for suspend-to-disk
+
+      user processes are stopped (in common case there are none,
+      but with resume-from-initrd, no one knows)
+
+      read image from disk
+
+      suspend(PMSG_FREEZE): devices are frozen so that they don't interfere
+      with image restoration
+
+      image restoration: rewrite memory with image
+
+      resume(): devices are woken up so that system can continue
+
+      thaw all user processes
+
+Q:
+  What is this 'Encrypt suspend image' for?
+
+A:
+  First of all: it is not a replacement for dm-crypt encrypted swap.
+  It cannot protect your computer while it is suspended. Instead it does
+  protect from leaking sensitive data after resume from suspend.
+
+  Think of the following: you suspend while an application is running
+  that keeps sensitive data in memory. The application itself prevents
+  the data from being swapped out. Suspend, however, must write these
+  data to swap to be able to resume later on. Without suspend encryption
+  your sensitive data are then stored in plaintext on disk.  This means
+  that after resume your sensitive data are accessible to all
+  applications having direct access to the swap device which was used
+  for suspend. If you don't need swap after resume these data can remain
+  on disk virtually forever. Thus it can happen that your system gets
+  broken into weeks later and sensitive data which you thought were
+  encrypted and protected are retrieved and stolen from the swap device.
+  To prevent this situation you should use 'Encrypt suspend image'.
+
+  During suspend a temporary key is created and this key is used to
+  encrypt the data written to disk. When, during resume, the data was
+  read back into memory the temporary key is destroyed which simply
+  means that all data written to disk during suspend are then
+  inaccessible so they can't be stolen later on.  The only thing that
+  you must then take care of is that you call 'mkswap' for the swap
+  partition used for suspend as early as possible during regular
+  boot. This ensures that any temporary key from an oopsed suspend or
+  from a failed or aborted resume is erased from the swap device.
+
+  As a rule of thumb use encrypted swap to protect your data while your
+  system is shut down or suspended. Additionally use the encrypted
+  suspend image to prevent sensitive data from being stolen after
+  resume.
+
+Q:
+  Can I suspend to a swap file?
+
+A:
+  Generally, yes, you can.  However, it requires you to use the "resume=" and
+  "resume_offset=" kernel command line parameters, so the resume from a swap file
+  cannot be initiated from an initrd or initramfs image.  See
+  swsusp-and-swap-files.rst for details.
+
+Q:
+  Is there a maximum system RAM size that is supported by swsusp?
+
+A:
+  It should work okay with highmem.
+
+Q:
+  Does swsusp (to disk) use only one swap partition or can it use
+  multiple swap partitions (aggregate them into one logical space)?
+
+A:
+  Only one swap partition, sorry.
+
+Q:
+  If my application(s) causes lots of memory & swap space to be used
+  (over half of the total system RAM), is it correct that it is likely
+  to be useless to try to suspend to disk while that app is running?
+
+A:
+  No, it should work okay, as long as your app does not mlock()
+  it. Just prepare big enough swap partition.
+
+Q:
+  What information is useful for debugging suspend-to-disk problems?
+
+A:
+  Well, last messages on the screen are always useful. If something
+  is broken, it is usually some kernel driver, therefore trying with as
+  few modules loaded as possible helps a lot. I also prefer people to
+  suspend from console, preferably without X running. Booting with
+  init=/bin/bash, then swapon and starting suspend sequence manually
+  usually does the trick. Then it is a good idea to try with the latest
+  vanilla kernel.
+
+Q:
+  How can distributions ship a swsusp-supporting kernel with modular
+  disk drivers (especially SATA)?
+
+A:
+  Well, it can be done, load the drivers, then do echo into
+  /sys/power/resume file from initrd. Be sure not to mount
+  anything, not even read-only mount, or you are going to lose your
+  data.
+
+Q:
+  How do I make suspend more verbose?
+
+A:
+  If you want to see any non-error kernel messages on the virtual
+  terminal the kernel switches to during suspend, you have to set the
+  kernel console loglevel to at least 4 (KERN_WARNING), for example by
+  doing::
+
+	# save the old loglevel
+	read LOGLEVEL DUMMY < /proc/sys/kernel/printk
+	# set the loglevel so we see the progress bar.
+	# if the level is higher than needed, we leave it alone.
+	if [ $LOGLEVEL -lt 5 ]; then
+	        echo 5 > /proc/sys/kernel/printk
+		fi
+
+        IMG_SZ=0
+        read IMG_SZ < /sys/power/image_size
+        echo -n disk > /sys/power/state
+        RET=$?
+        #
+        # the logic here is:
+        # if image_size > 0 (without kernel support, IMG_SZ will be zero),
+        # then try again with image_size set to zero.
+	if [ $RET -ne 0 -a $IMG_SZ -ne 0 ]; then # try again with minimal image size
+                echo 0 > /sys/power/image_size
+                echo -n disk > /sys/power/state
+                RET=$?
+        fi
+
+	# restore previous loglevel
+	echo $LOGLEVEL > /proc/sys/kernel/printk
+	exit $RET
+
+Q:
+  Is it true that if I have a mounted filesystem on a USB device and
+  I suspend to disk, I can lose data unless the filesystem has been mounted
+  with "sync"?
+
+A:
+  That's right ... if you disconnect that device, you may lose data.
+  In fact, even with "-o sync" you can lose data if your programs have
+  information in buffers they haven't written out to a disk you disconnect,
+  or if you disconnect before the device finished saving data you wrote.
+
+  Software suspend normally powers down USB controllers, which is equivalent
+  to disconnecting all USB devices attached to your system.
+
+  Your system might well support low-power modes for its USB controllers
+  while the system is asleep, maintaining the connection, using true sleep
+  modes like "suspend-to-RAM" or "standby".  (Don't write "disk" to the
+  /sys/power/state file; write "standby" or "mem".)  We've not seen any
+  hardware that can use these modes through software suspend, although in
+  theory some systems might support "platform" modes that won't break the
+  USB connections.
+
+  Remember that it's always a bad idea to unplug a disk drive containing a
+  mounted filesystem.  That's true even when your system is asleep!  The
+  safest thing is to unmount all filesystems on removable media (such as USB,
+  Firewire, CompactFlash, MMC, external SATA, or even IDE hotplug bays)
+  before suspending; then remount them after resuming.
+
+  There is a work-around for this problem.  For more information, see
+  Documentation/driver-api/usb/persist.rst.
+
+Q:
+  Can I suspend-to-disk using a swap partition under LVM?
+
+A:
+  Yes and No.  You can suspend successfully, but the kernel will not be able
+  to resume on its own.  You need an initramfs that can recognize the resume
+  situation, activate the logical volume containing the swap volume (but not
+  touch any filesystems!), and eventually call::
+
+    echo -n "$major:$minor" > /sys/power/resume
+
+  where $major and $minor are the respective major and minor device numbers of
+  the swap volume.
+
+  uswsusp works with LVM, too.  See http://suspend.sourceforge.net/
+
+Q:
+  I upgraded the kernel from 2.6.15 to 2.6.16. Both kernels were
+  compiled with similar configuration files. Anyway I found that
+  suspend to disk (and resume) is much slower on 2.6.16 compared to
+  2.6.15. Any idea for why that might happen or how can I speed it up?
+
+A:
+  This is because the size of the suspend image is now greater than
+  for 2.6.15 (by saving more data we can get more responsive system
+  after resume).
+
+  There's the /sys/power/image_size knob that controls the size of the
+  image.  If you set it to 0 (eg. by echo 0 > /sys/power/image_size as
+  root), the 2.6.15 behavior should be restored.  If it is still too
+  slow, take a look at suspend.sf.net -- userland suspend is faster and
+  supports LZF compression to speed it up further.
diff --git a/Documentation/power/swsusp.txt b/Documentation/power/swsusp.txt
deleted file mode 100644
index 236d1fb13640..000000000000
--- a/Documentation/power/swsusp.txt
+++ /dev/null
@@ -1,446 +0,0 @@
-Some warnings, first.
-
- * BIG FAT WARNING *********************************************************
- *
- * If you touch anything on disk between suspend and resume...
- *				...kiss your data goodbye.
- *
- * If you do resume from initrd after your filesystems are mounted...
- *				...bye bye root partition.
- *			[this is actually same case as above]
- *
- * If you have unsupported (*) devices using DMA, you may have some
- * problems. If your disk driver does not support suspend... (IDE does),
- * it may cause some problems, too. If you change kernel command line
- * between suspend and resume, it may do something wrong. If you change
- * your hardware while system is suspended... well, it was not good idea;
- * but it will probably only crash.
- *
- * (*) suspend/resume support is needed to make it safe.
- *
- * If you have any filesystems on USB devices mounted before software suspend,
- * they won't be accessible after resume and you may lose data, as though
- * you have unplugged the USB devices with mounted filesystems on them;
- * see the FAQ below for details.  (This is not true for more traditional
- * power states like "standby", which normally don't turn USB off.)
-
-Swap partition:
-You need to append resume=/dev/your_swap_partition to kernel command
-line or specify it using /sys/power/resume.
-
-Swap file:
-If using a swapfile you can also specify a resume offset using
-resume_offset=<number> on the kernel command line or specify it
-in /sys/power/resume_offset.
-
-After preparing then you suspend by
-
-echo shutdown > /sys/power/disk; echo disk > /sys/power/state
-
-. If you feel ACPI works pretty well on your system, you might try
-
-echo platform > /sys/power/disk; echo disk > /sys/power/state
-
-. If you would like to write hibernation image to swap and then suspend
-to RAM (provided your platform supports it), you can try
-
-echo suspend > /sys/power/disk; echo disk > /sys/power/state
-
-. If you have SATA disks, you'll need recent kernels with SATA suspend
-support. For suspend and resume to work, make sure your disk drivers
-are built into kernel -- not modules. [There's way to make
-suspend/resume with modular disk drivers, see FAQ, but you probably
-should not do that.]
-
-If you want to limit the suspend image size to N bytes, do
-
-echo N > /sys/power/image_size
-
-before suspend (it is limited to around 2/5 of available RAM by default).
-
-. The resume process checks for the presence of the resume device,
-if found, it then checks the contents for the hibernation image signature.
-If both are found, it resumes the hibernation image.
-
-. The resume process may be triggered in two ways:
-  1) During lateinit:  If resume=/dev/your_swap_partition is specified on
-     the kernel command line, lateinit runs the resume process.  If the
-     resume device has not been probed yet, the resume process fails and
-     bootup continues.
-  2) Manually from an initrd or initramfs:  May be run from
-     the init script by using the /sys/power/resume file.  It is vital
-     that this be done prior to remounting any filesystems (even as
-     read-only) otherwise data may be corrupted.
-
-Article about goals and implementation of Software Suspend for Linux
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Author: Gábor Kuti
-Last revised: 2003-10-20 by Pavel Machek
-
-Idea and goals to achieve
-
-Nowadays it is common in several laptops that they have a suspend button. It
-saves the state of the machine to a filesystem or to a partition and switches
-to standby mode. Later resuming the machine the saved state is loaded back to
-ram and the machine can continue its work. It has two real benefits. First we
-save ourselves the time machine goes down and later boots up, energy costs
-are real high when running from batteries. The other gain is that we don't have to
-interrupt our programs so processes that are calculating something for a long
-time shouldn't need to be written interruptible.
-
-swsusp saves the state of the machine into active swaps and then reboots or
-powerdowns.  You must explicitly specify the swap partition to resume from with
-``resume='' kernel option. If signature is found it loads and restores saved
-state. If the option ``noresume'' is specified as a boot parameter, it skips
-the resuming.  If the option ``hibernate=nocompress'' is specified as a boot
-parameter, it saves hibernation image without compression.
-
-In the meantime while the system is suspended you should not add/remove any
-of the hardware, write to the filesystems, etc.
-
-Sleep states summary
-====================
-
-There are three different interfaces you can use, /proc/acpi should
-work like this:
-
-In a really perfect world:
-echo 1 > /proc/acpi/sleep       # for standby
-echo 2 > /proc/acpi/sleep       # for suspend to ram
-echo 3 > /proc/acpi/sleep       # for suspend to ram, but with more power conservative
-echo 4 > /proc/acpi/sleep       # for suspend to disk
-echo 5 > /proc/acpi/sleep       # for shutdown unfriendly the system
-
-and perhaps
-echo 4b > /proc/acpi/sleep      # for suspend to disk via s4bios
-
-Frequently Asked Questions
-==========================
-
-Q: well, suspending a server is IMHO a really stupid thing,
-but... (Diego Zuccato):
-
-A: You bought new UPS for your server. How do you install it without
-bringing machine down? Suspend to disk, rearrange power cables,
-resume.
-
-You have your server on UPS. Power died, and UPS is indicating 30
-seconds to failure. What do you do? Suspend to disk.
-
-
-Q: Maybe I'm missing something, but why don't the regular I/O paths work?
-
-A: We do use the regular I/O paths. However we cannot restore the data
-to its original location as we load it. That would create an
-inconsistent kernel state which would certainly result in an oops.
-Instead, we load the image into unused memory and then atomically copy
-it back to it original location. This implies, of course, a maximum
-image size of half the amount of memory.
-
-There are two solutions to this:
-
-* require half of memory to be free during suspend. That way you can
-read "new" data onto free spots, then cli and copy
-
-* assume we had special "polling" ide driver that only uses memory
-between 0-640KB. That way, I'd have to make sure that 0-640KB is free
-during suspending, but otherwise it would work...
-
-suspend2 shares this fundamental limitation, but does not include user
-data and disk caches into "used memory" by saving them in
-advance. That means that the limitation goes away in practice.
-
-Q: Does linux support ACPI S4?
-
-A: Yes. That's what echo platform > /sys/power/disk does.
-
-Q: What is 'suspend2'?
-
-A: suspend2 is 'Software Suspend 2', a forked implementation of
-suspend-to-disk which is available as separate patches for 2.4 and 2.6
-kernels from swsusp.sourceforge.net. It includes support for SMP, 4GB
-highmem and preemption. It also has a extensible architecture that
-allows for arbitrary transformations on the image (compression,
-encryption) and arbitrary backends for writing the image (eg to swap
-or an NFS share[Work In Progress]). Questions regarding suspend2
-should be sent to the mailing list available through the suspend2
-website, and not to the Linux Kernel Mailing List. We are working
-toward merging suspend2 into the mainline kernel.
-
-Q: What is the freezing of tasks and why are we using it?
-
-A: The freezing of tasks is a mechanism by which user space processes and some
-kernel threads are controlled during hibernation or system-wide suspend (on some
-architectures).  See freezing-of-tasks.txt for details.
-
-Q: What is the difference between "platform" and "shutdown"?
-
-A:
-
-shutdown: save state in linux, then tell bios to powerdown
-
-platform: save state in linux, then tell bios to powerdown and blink
-          "suspended led"
-
-"platform" is actually right thing to do where supported, but
-"shutdown" is most reliable (except on ACPI systems).
-
-Q: I do not understand why you have such strong objections to idea of
-selective suspend.
-
-A: Do selective suspend during runtime power management, that's okay. But
-it's useless for suspend-to-disk. (And I do not see how you could use
-it for suspend-to-ram, I hope you do not want that).
-
-Lets see, so you suggest to
-
-* SUSPEND all but swap device and parents
-* Snapshot
-* Write image to disk
-* SUSPEND swap device and parents
-* Powerdown
-
-Oh no, that does not work, if swap device or its parents uses DMA,
-you've corrupted data. You'd have to do
-
-* SUSPEND all but swap device and parents
-* FREEZE swap device and parents
-* Snapshot
-* UNFREEZE swap device and parents
-* Write
-* SUSPEND swap device and parents
-
-Which means that you still need that FREEZE state, and you get more
-complicated code. (And I have not yet introduce details like system
-devices).
-
-Q: There don't seem to be any generally useful behavioral
-distinctions between SUSPEND and FREEZE.
-
-A: Doing SUSPEND when you are asked to do FREEZE is always correct,
-but it may be unnecessarily slow. If you want your driver to stay simple,
-slowness may not matter to you. It can always be fixed later.
-
-For devices like disk it does matter, you do not want to spindown for
-FREEZE.
-
-Q: After resuming, system is paging heavily, leading to very bad interactivity.
-
-A: Try running
-
-cat /proc/[0-9]*/maps | grep / | sed 's:.* /:/:' | sort -u | while read file
-do
-  test -f "$file" && cat "$file" > /dev/null
-done
-
-after resume. swapoff -a; swapon -a may also be useful.
-
-Q: What happens to devices during swsusp? They seem to be resumed
-during system suspend?
-
-A: That's correct. We need to resume them if we want to write image to
-disk. Whole sequence goes like
-
-      Suspend part
-      ~~~~~~~~~~~~
-      running system, user asks for suspend-to-disk
-
-      user processes are stopped
-
-      suspend(PMSG_FREEZE): devices are frozen so that they don't interfere
-      		      with state snapshot
-
-      state snapshot: copy of whole used memory is taken with interrupts disabled
-
-      resume(): devices are woken up so that we can write image to swap
-
-      write image to swap
-
-      suspend(PMSG_SUSPEND): suspend devices so that we can power off
-
-      turn the power off
-
-      Resume part
-      ~~~~~~~~~~~
-      (is actually pretty similar)
-
-      running system, user asks for suspend-to-disk
-
-      user processes are stopped (in common case there are none, but with resume-from-initrd, no one knows)
-
-      read image from disk
-
-      suspend(PMSG_FREEZE): devices are frozen so that they don't interfere
-      		      with image restoration
-
-      image restoration: rewrite memory with image
-
-      resume(): devices are woken up so that system can continue
-
-      thaw all user processes
-
-Q: What is this 'Encrypt suspend image' for?
-
-A: First of all: it is not a replacement for dm-crypt encrypted swap.
-It cannot protect your computer while it is suspended. Instead it does
-protect from leaking sensitive data after resume from suspend.
-
-Think of the following: you suspend while an application is running
-that keeps sensitive data in memory. The application itself prevents
-the data from being swapped out. Suspend, however, must write these
-data to swap to be able to resume later on. Without suspend encryption
-your sensitive data are then stored in plaintext on disk.  This means
-that after resume your sensitive data are accessible to all
-applications having direct access to the swap device which was used
-for suspend. If you don't need swap after resume these data can remain
-on disk virtually forever. Thus it can happen that your system gets
-broken in weeks later and sensitive data which you thought were
-encrypted and protected are retrieved and stolen from the swap device.
-To prevent this situation you should use 'Encrypt suspend image'.
-
-During suspend a temporary key is created and this key is used to
-encrypt the data written to disk. When, during resume, the data was
-read back into memory the temporary key is destroyed which simply
-means that all data written to disk during suspend are then
-inaccessible so they can't be stolen later on.  The only thing that
-you must then take care of is that you call 'mkswap' for the swap
-partition used for suspend as early as possible during regular
-boot. This asserts that any temporary key from an oopsed suspend or
-from a failed or aborted resume is erased from the swap device.
-
-As a rule of thumb use encrypted swap to protect your data while your
-system is shut down or suspended. Additionally use the encrypted
-suspend image to prevent sensitive data from being stolen after
-resume.
-
-Q: Can I suspend to a swap file?
-
-A: Generally, yes, you can.  However, it requires you to use the "resume=" and
-"resume_offset=" kernel command line parameters, so the resume from a swap file
-cannot be initiated from an initrd or initramfs image.  See
-swsusp-and-swap-files.txt for details.
-
-Q: Is there a maximum system RAM size that is supported by swsusp?
-
-A: It should work okay with highmem.
-
-Q: Does swsusp (to disk) use only one swap partition or can it use
-multiple swap partitions (aggregate them into one logical space)?
-
-A: Only one swap partition, sorry.
-
-Q: If my application(s) causes lots of memory & swap space to be used
-(over half of the total system RAM), is it correct that it is likely
-to be useless to try to suspend to disk while that app is running?
-
-A: No, it should work okay, as long as your app does not mlock()
-it. Just prepare big enough swap partition.
-
-Q: What information is useful for debugging suspend-to-disk problems?
-
-A: Well, last messages on the screen are always useful. If something
-is broken, it is usually some kernel driver, therefore trying with as
-little as possible modules loaded helps a lot. I also prefer people to
-suspend from console, preferably without X running. Booting with
-init=/bin/bash, then swapon and starting suspend sequence manually
-usually does the trick. Then it is good idea to try with latest
-vanilla kernel.
-
-Q: How can distributions ship a swsusp-supporting kernel with modular
-disk drivers (especially SATA)?
-
-A: Well, it can be done, load the drivers, then do echo into
-/sys/power/resume file from initrd. Be sure not to mount
-anything, not even read-only mount, or you are going to lose your
-data.
-
-Q: How do I make suspend more verbose?
-
-A: If you want to see any non-error kernel messages on the virtual
-terminal the kernel switches to during suspend, you have to set the
-kernel console loglevel to at least 4 (KERN_WARNING), for example by
-doing
-
-	# save the old loglevel
-	read LOGLEVEL DUMMY < /proc/sys/kernel/printk
-	# set the loglevel so we see the progress bar.
-	# if the level is higher than needed, we leave it alone.
-	if [ $LOGLEVEL -lt 5 ]; then
-	        echo 5 > /proc/sys/kernel/printk
-		fi
-
-        IMG_SZ=0
-        read IMG_SZ < /sys/power/image_size
-        echo -n disk > /sys/power/state
-        RET=$?
-        #
-        # the logic here is:
-        # if image_size > 0 (without kernel support, IMG_SZ will be zero),
-        # then try again with image_size set to zero.
-	if [ $RET -ne 0 -a $IMG_SZ -ne 0 ]; then # try again with minimal image size
-                echo 0 > /sys/power/image_size
-                echo -n disk > /sys/power/state
-                RET=$?
-        fi
-
-	# restore previous loglevel
-	echo $LOGLEVEL > /proc/sys/kernel/printk
-	exit $RET
-
-Q: Is this true that if I have a mounted filesystem on a USB device and
-I suspend to disk, I can lose data unless the filesystem has been mounted
-with "sync"?
-
-A: That's right ... if you disconnect that device, you may lose data.
-In fact, even with "-o sync" you can lose data if your programs have
-information in buffers they haven't written out to a disk you disconnect,
-or if you disconnect before the device finished saving data you wrote.
-
-Software suspend normally powers down USB controllers, which is equivalent
-to disconnecting all USB devices attached to your system.
-
-Your system might well support low-power modes for its USB controllers
-while the system is asleep, maintaining the connection, using true sleep
-modes like "suspend-to-RAM" or "standby".  (Don't write "disk" to the
-/sys/power/state file; write "standby" or "mem".)  We've not seen any
-hardware that can use these modes through software suspend, although in
-theory some systems might support "platform" modes that won't break the
-USB connections.
-
-Remember that it's always a bad idea to unplug a disk drive containing a
-mounted filesystem.  That's true even when your system is asleep!  The
-safest thing is to unmount all filesystems on removable media (such USB,
-Firewire, CompactFlash, MMC, external SATA, or even IDE hotplug bays)
-before suspending; then remount them after resuming.
-
-There is a work-around for this problem.  For more information, see
-Documentation/driver-api/usb/persist.rst.
-
-Q: Can I suspend-to-disk using a swap partition under LVM?
-
-A: Yes and No.  You can suspend successfully, but the kernel will not be able
-to resume on its own.  You need an initramfs that can recognize the resume
-situation, activate the logical volume containing the swap volume (but not
-touch any filesystems!), and eventually call
-
-echo -n "$major:$minor" > /sys/power/resume
-
-where $major and $minor are the respective major and minor device numbers of
-the swap volume.
-
-uswsusp works with LVM, too.  See http://suspend.sourceforge.net/
-
-Q: I upgraded the kernel from 2.6.15 to 2.6.16. Both kernels were
-compiled with the similar configuration files. Anyway I found that
-suspend to disk (and resume) is much slower on 2.6.16 compared to
-2.6.15. Any idea for why that might happen or how can I speed it up?
-
-A: This is because the size of the suspend image is now greater than
-for 2.6.15 (by saving more data we can get more responsive system
-after resume).
-
-There's the /sys/power/image_size knob that controls the size of the
-image.  If you set it to 0 (eg. by echo 0 > /sys/power/image_size as
-root), the 2.6.15 behavior should be restored.  If it is still too
-slow, take a look at suspend.sf.net -- userland suspend is faster and
-supports LZF compression to speed it up further.
diff --git a/Documentation/power/tricks.rst b/Documentation/power/tricks.rst
new file mode 100644
index 000000000000..ca787f142c3f
--- /dev/null
+++ b/Documentation/power/tricks.rst
@@ -0,0 +1,29 @@
+================
+swsusp/S3 tricks
+================
+
+Pavel Machek <pavel@ucw.cz>
+
+If you want to trick swsusp/S3 into working, you might want to try:
+
+* go with minimal config, turn off drivers like USB, AGP you don't
+  really need
+
+* turn off APIC and preempt
+
+* use ext2. At least it has working fsck. [If something seems to go
+  wrong, force fsck when you have a chance]
+
+* turn off modules
+
+* use vga text console, shut down X. [If you really want X, you might
+  want to try vesafb later]
+
+* try running as few processes as possible, preferably go to single
+  user mode.
+
+* due to video issues, swsusp should be easier to get working than
+  S3. Try that first.
+
+When you make it work, try to find out what exactly it was that broke
+suspend, and preferably fix that.
diff --git a/Documentation/power/tricks.txt b/Documentation/power/tricks.txt
deleted file mode 100644
index a1b8f7249f4c..000000000000
--- a/Documentation/power/tricks.txt
+++ /dev/null
@@ -1,27 +0,0 @@
-	swsusp/S3 tricks
-	~~~~~~~~~~~~~~~~
-Pavel Machek <pavel@ucw.cz>
-
-If you want to trick swsusp/S3 into working, you might want to try:
-
-* go with minimal config, turn off drivers like USB, AGP you don't
-  really need
-
-* turn off APIC and preempt
-
-* use ext2. At least it has working fsck. [If something seems to go
-  wrong, force fsck when you have a chance]
-
-* turn off modules
-
-* use vga text console, shut down X. [If you really want X, you might
-  want to try vesafb later]
-
-* try running as few processes as possible, preferably go to single
-  user mode.
-
-* due to video issues, swsusp should be easier to get working than
-  S3. Try that first.
-
-When you make it work, try to find out what exactly was it that broke
-suspend, and preferably fix that.
diff --git a/Documentation/power/userland-swsusp.rst b/Documentation/power/userland-swsusp.rst
new file mode 100644
index 000000000000..a0fa51bb1a4d
--- /dev/null
+++ b/Documentation/power/userland-swsusp.rst
@@ -0,0 +1,191 @@
+=====================================================
+Documentation for userland software suspend interface
+=====================================================
+
+	(C) 2006 Rafael J. Wysocki <rjw@sisk.pl>
+
+First, the warnings at the beginning of swsusp.rst still apply.
+
+Second, you should read the FAQ in swsusp.rst _now_ if you have not
+done it already.
+
+Now, to use the userland interface for software suspend you need special
+utilities that will read/write the system memory snapshot from/to the
+kernel.  Such utilities are available, for example, from
+<http://suspend.sourceforge.net>.  You may want to have a look at them if you
+are going to develop your own suspend/resume utilities.
+
+The interface consists of a character device providing the open(),
+release(), read(), and write() operations as well as several ioctl()
+commands defined in include/uapi/linux/suspend_ioctls.h.  The major and minor
+numbers of the device are, respectively, 10 and 231, and they can
+be read from /sys/class/misc/snapshot/dev.
+
+The device can be opened either for reading or for writing.  If opened for
+reading, it is considered to be in the suspend mode.  Otherwise it is
+assumed to be in the resume mode.  The device cannot be opened for simultaneous
+reading and writing.  It is also impossible to have the device open more than
+once at a time.
+
+Even opening the device has side effects. Data structures are
+allocated, and PM_HIBERNATION_PREPARE / PM_RESTORE_PREPARE chains are
+called.
+
+The ioctl() commands recognized by the device are:
+
+SNAPSHOT_FREEZE
+	freeze user space processes (the current process is
+	not frozen); this is required for SNAPSHOT_CREATE_IMAGE
+	and SNAPSHOT_ATOMIC_RESTORE to succeed
+
+SNAPSHOT_UNFREEZE
+	thaw user space processes frozen by SNAPSHOT_FREEZE
+
+SNAPSHOT_CREATE_IMAGE
+	create a snapshot of the system memory; the
+	last argument of ioctl() should be a pointer to an int variable,
+	the value of which will indicate whether the call returned after
+	creating the snapshot (1) or after restoring the system memory state
+	from it (0) (after resume the system finds itself finishing the
+	SNAPSHOT_CREATE_IMAGE ioctl() again); after the snapshot
+	has been created the read() operation can be used to transfer
+	it out of the kernel
+
+SNAPSHOT_ATOMIC_RESTORE
+	restore the system memory state from the
+	uploaded snapshot image; before calling it you should transfer
+	the system memory snapshot back to the kernel using the write()
+	operation; this call will not succeed if the snapshot
+	image is not available to the kernel
+
+SNAPSHOT_FREE
+	free memory allocated for the snapshot image
+
+SNAPSHOT_PREF_IMAGE_SIZE
+	set the preferred maximum size of the image
+	(the kernel will do its best to ensure the image size will not exceed
+	this number, but if it turns out to be impossible, the kernel will
+	create the smallest image possible)
+
+SNAPSHOT_GET_IMAGE_SIZE
+	return the actual size of the hibernation image
+
+SNAPSHOT_AVAIL_SWAP_SIZE
+	return the amount of available swap in bytes (the
+	last argument should be a pointer to an unsigned int variable that will
+	contain the result if the call is successful).
+
+SNAPSHOT_ALLOC_SWAP_PAGE
+	allocate a swap page from the resume partition
+	(the last argument should be a pointer to a loff_t variable that
+	will contain the swap page offset if the call is successful)
+
+SNAPSHOT_FREE_SWAP_PAGES
+	free all swap pages allocated by
+	SNAPSHOT_ALLOC_SWAP_PAGE
+
+SNAPSHOT_SET_SWAP_AREA
+	set the resume partition and the offset (in <PAGE_SIZE>
+	units) from the beginning of the partition at which the swap header is
+	located (the last ioctl() argument should point to a struct
+	resume_swap_area, as defined in include/uapi/linux/suspend_ioctls.h,
+	containing the resume device specification and the offset); for swap
+	partitions the offset is always 0, but it is different from zero for
+	swap files (see Documentation/power/swsusp-and-swap-files.rst for
+	details).
+
+SNAPSHOT_PLATFORM_SUPPORT
+	enable/disable the hibernation platform support,
+	depending on the argument value (enable, if the argument is nonzero)
+
+SNAPSHOT_POWER_OFF
+	make the kernel transition the system to the hibernation
+	state (eg. ACPI S4) using the platform (eg. ACPI) driver
+
+SNAPSHOT_S2RAM
+	suspend to RAM; using this call causes the kernel to
+	immediately enter the suspend-to-RAM state, so this call must always
+	be preceded by the SNAPSHOT_FREEZE call and it is also necessary
+	to use the SNAPSHOT_UNFREEZE call after the system wakes up.  This call
+	is needed to implement the suspend-to-both mechanism in which the
+	suspend image is first created, as though the system had been suspended
+	to disk, and then the system is suspended to RAM (this makes it possible
+	to resume the system from RAM if there's enough battery power or restore
+	its state on the basis of the saved suspend image otherwise)
+
+The device's read() operation can be used to transfer the snapshot image from
+the kernel.  It has the following limitations:
+
+- you cannot read() more than one virtual memory page at a time
+- read()s across page boundaries are impossible (ie. if you read() 1/2 of
+  a page in the previous call, you will only be able to read()
+  **at most** 1/2 of the page in the next call)
+
+The device's write() operation is used for uploading the system memory snapshot
+into the kernel.  It has the same limitations as the read() operation.
+
+The release() operation frees all memory allocated for the snapshot image
+and all swap pages allocated with SNAPSHOT_ALLOC_SWAP_PAGE (if any).
+Thus it is not necessary to use either SNAPSHOT_FREE or
+SNAPSHOT_FREE_SWAP_PAGES before closing the device (in fact it will also
+unfreeze user space processes frozen by SNAPSHOT_FREEZE if they are
+still frozen when the device is being closed).
+
+Currently it is assumed that the userland utilities reading/writing the
+snapshot image from/to the kernel will use a swap partition, called the resume
+partition, or a swap file as storage space (if a swap file is used, the resume
+partition is the partition that holds this file).  However, this is not really
+required, as they can use, for example, a special (blank) suspend partition or
+a file on a partition that is unmounted before SNAPSHOT_CREATE_IMAGE and
+mounted afterwards.
+
+These utilities MUST NOT make any assumptions regarding the ordering of
+data within the snapshot image.  The contents of the image are entirely owned
+by the kernel and its structure may be changed in future kernel releases.
+
+The snapshot image MUST be written to the kernel unaltered (ie. all of the image
+data, metadata and header MUST be written in _exactly_ the same amount, form
+and order in which they have been read).  Otherwise, the behavior of the
+resumed system may be totally unpredictable.
+
+While executing SNAPSHOT_ATOMIC_RESTORE the kernel checks if the
+structure of the snapshot image is consistent with the information stored
+in the image header.  If any inconsistencies are detected,
+SNAPSHOT_ATOMIC_RESTORE will not succeed.  Still, this is not a fool-proof
+mechanism and the userland utilities using the interface SHOULD use additional
+means, such as checksums, to ensure the integrity of the snapshot image.
+
+The suspending and resuming utilities MUST lock themselves in memory,
+preferably using mlockall(), before calling SNAPSHOT_FREEZE.
+
+The suspending utility MUST check the value stored by SNAPSHOT_CREATE_IMAGE
+in the memory location pointed to by the last argument of ioctl() and proceed
+in accordance with it:
+
+1. 	If the value is 1 (ie. the system memory snapshot has just been
+	created and the system is ready for saving it):
+
+	(a)	The suspending utility MUST NOT close the snapshot device
+		_unless_ the whole suspend procedure is to be cancelled, in
+		which case, if the snapshot image has already been saved, the
+		suspending utility SHOULD destroy it, preferably by zapping
+		its header.  If the suspend is not to be cancelled, the
+		system MUST be powered off or rebooted after the snapshot
+		image has been saved.
+	(b)	The suspending utility SHOULD NOT attempt to perform any
+		file system operations (including reads) on the file systems
+		that were mounted before SNAPSHOT_CREATE_IMAGE has been
+		called.  However, it MAY mount a file system that was not
+		mounted at that time and perform some operations on it (eg.
+		use it for saving the image).
+
+2.	If the value is 0 (ie. the system state has just been restored from
+	the snapshot image), the suspending utility MUST close the snapshot
+	device.  Afterwards it will be treated as a regular userland process,
+	so it need not exit.
+
+The resuming utility SHOULD NOT attempt to mount any file systems that could
+be mounted before suspend and SHOULD NOT attempt to perform any operations
+involving such file systems.
+
+For details, please refer to the source code.
diff --git a/Documentation/power/userland-swsusp.txt b/Documentation/power/userland-swsusp.txt
deleted file mode 100644
index bbfcd1bbedc5..000000000000
--- a/Documentation/power/userland-swsusp.txt
+++ /dev/null
@@ -1,170 +0,0 @@
-Documentation for userland software suspend interface
-	(C) 2006 Rafael J. Wysocki <rjw@sisk.pl>
-
-First, the warnings at the beginning of swsusp.txt still apply.
-
-Second, you should read the FAQ in swsusp.txt _now_ if you have not
-done it already.
-
-Now, to use the userland interface for software suspend you need special
-utilities that will read/write the system memory snapshot from/to the
-kernel.  Such utilities are available, for example, from
-<http://suspend.sourceforge.net>.  You may want to have a look at them if you
-are going to develop your own suspend/resume utilities.
-
-The interface consists of a character device providing the open(),
-release(), read(), and write() operations as well as several ioctl()
-commands defined in include/linux/suspend_ioctls.h .  The major and minor
-numbers of the device are, respectively, 10 and 231, and they can
-be read from /sys/class/misc/snapshot/dev.
-
-The device can be open either for reading or for writing.  If open for
-reading, it is considered to be in the suspend mode.  Otherwise it is
-assumed to be in the resume mode.  The device cannot be open for simultaneous
-reading and writing.  It is also impossible to have the device open more than
-once at a time.
-
-Even opening the device has side effects. Data structures are
-allocated, and PM_HIBERNATION_PREPARE / PM_RESTORE_PREPARE chains are
-called.
-
-The ioctl() commands recognized by the device are:
-
-SNAPSHOT_FREEZE - freeze user space processes (the current process is
-	not frozen); this is required for SNAPSHOT_CREATE_IMAGE
-	and SNAPSHOT_ATOMIC_RESTORE to succeed
-
-SNAPSHOT_UNFREEZE - thaw user space processes frozen by SNAPSHOT_FREEZE
-
-SNAPSHOT_CREATE_IMAGE - create a snapshot of the system memory; the
-	last argument of ioctl() should be a pointer to an int variable,
-	the value of which will indicate whether the call returned after
-	creating the snapshot (1) or after restoring the system memory state
-	from it (0) (after resume the system finds itself finishing the
-	SNAPSHOT_CREATE_IMAGE ioctl() again); after the snapshot
-	has been created the read() operation can be used to transfer
-	it out of the kernel
-
-SNAPSHOT_ATOMIC_RESTORE - restore the system memory state from the
-	uploaded snapshot image; before calling it you should transfer
-	the system memory snapshot back to the kernel using the write()
-	operation; this call will not succeed if the snapshot
-	image is not available to the kernel
-
-SNAPSHOT_FREE - free memory allocated for the snapshot image
-
-SNAPSHOT_PREF_IMAGE_SIZE - set the preferred maximum size of the image
-	(the kernel will do its best to ensure the image size will not exceed
-	this number, but if it turns out to be impossible, the kernel will
-	create the smallest image possible)
-
-SNAPSHOT_GET_IMAGE_SIZE - return the actual size of the hibernation image
-
-SNAPSHOT_AVAIL_SWAP_SIZE - return the amount of available swap in bytes (the
-	last argument should be a pointer to an unsigned int variable that will
-	contain the result if the call is successful).
-
-SNAPSHOT_ALLOC_SWAP_PAGE - allocate a swap page from the resume partition
-	(the last argument should be a pointer to a loff_t variable that
-	will contain the swap page offset if the call is successful)
-
-SNAPSHOT_FREE_SWAP_PAGES - free all swap pages allocated by
-	SNAPSHOT_ALLOC_SWAP_PAGE
-
-SNAPSHOT_SET_SWAP_AREA - set the resume partition and the offset (in <PAGE_SIZE>
-	units) from the beginning of the partition at which the swap header is
-	located (the last ioctl() argument should point to a struct
-	resume_swap_area, as defined in kernel/power/suspend_ioctls.h,
-	containing the resume device specification and the offset); for swap
-	partitions the offset is always 0, but it is different from zero for
-	swap files (see Documentation/power/swsusp-and-swap-files.txt for
-	details).
-
-SNAPSHOT_PLATFORM_SUPPORT - enable/disable the hibernation platform support,
-	depending on the argument value (enable, if the argument is nonzero)
-
-SNAPSHOT_POWER_OFF - make the kernel transition the system to the hibernation
-	state (eg. ACPI S4) using the platform (eg. ACPI) driver
-
-SNAPSHOT_S2RAM - suspend to RAM; using this call causes the kernel to
-	immediately enter the suspend-to-RAM state, so this call must always
-	be preceded by the SNAPSHOT_FREEZE call and it is also necessary
-	to use the SNAPSHOT_UNFREEZE call after the system wakes up.  This call
-	is needed to implement the suspend-to-both mechanism in which the
-	suspend image is first created, as though the system had been suspended
-	to disk, and then the system is suspended to RAM (this makes it possible
-	to resume the system from RAM if there's enough battery power or restore
-	its state on the basis of the saved suspend image otherwise)
-
-The device's read() operation can be used to transfer the snapshot image from
-the kernel.  It has the following limitations:
-- you cannot read() more than one virtual memory page at a time
-- read()s across page boundaries are impossible (ie. if you read() 1/2 of
-	a page in the previous call, you will only be able to read()
-	_at_ _most_ 1/2 of the page in the next call)
-
-The device's write() operation is used for uploading the system memory snapshot
-into the kernel.  It has the same limitations as the read() operation.
-
-The release() operation frees all memory allocated for the snapshot image
-and all swap pages allocated with SNAPSHOT_ALLOC_SWAP_PAGE (if any).
-Thus it is not necessary to use either SNAPSHOT_FREE or
-SNAPSHOT_FREE_SWAP_PAGES before closing the device (in fact it will also
-unfreeze user space processes frozen by SNAPSHOT_UNFREEZE if they are
-still frozen when the device is being closed).
-
-Currently it is assumed that the userland utilities reading/writing the
-snapshot image from/to the kernel will use a swap partition, called the resume
-partition, or a swap file as storage space (if a swap file is used, the resume
-partition is the partition that holds this file).  However, this is not really
-required, as they can use, for example, a special (blank) suspend partition or
-a file on a partition that is unmounted before SNAPSHOT_CREATE_IMAGE and
-mounted afterwards.
-
-These utilities MUST NOT make any assumptions regarding the ordering of
-data within the snapshot image.  The contents of the image are entirely owned
-by the kernel and its structure may be changed in future kernel releases.
-
-The snapshot image MUST be written to the kernel unaltered (ie. all of the image
-data, metadata and header MUST be written in _exactly_ the same amount, form
-and order in which they have been read).  Otherwise, the behavior of the
-resumed system may be totally unpredictable.
-
-While executing SNAPSHOT_ATOMIC_RESTORE the kernel checks if the
-structure of the snapshot image is consistent with the information stored
-in the image header.  If any inconsistencies are detected,
-SNAPSHOT_ATOMIC_RESTORE will not succeed.  Still, this is not a fool-proof
-mechanism and the userland utilities using the interface SHOULD use additional
-means, such as checksums, to ensure the integrity of the snapshot image.
-
-The suspending and resuming utilities MUST lock themselves in memory,
-preferably using mlockall(), before calling SNAPSHOT_FREEZE.
-
-The suspending utility MUST check the value stored by SNAPSHOT_CREATE_IMAGE
-in the memory location pointed to by the last argument of ioctl() and proceed
-in accordance with it:
-1. 	If the value is 1 (ie. the system memory snapshot has just been
-	created and the system is ready for saving it):
-	(a)	The suspending utility MUST NOT close the snapshot device
-		_unless_ the whole suspend procedure is to be cancelled, in
-		which case, if the snapshot image has already been saved, the
-		suspending utility SHOULD destroy it, preferably by zapping
-		its header.  If the suspend is not to be cancelled, the
-		system MUST be powered off or rebooted after the snapshot
-		image has been saved.
-	(b)	The suspending utility SHOULD NOT attempt to perform any
-		file system operations (including reads) on the file systems
-		that were mounted before SNAPSHOT_CREATE_IMAGE has been
-		called.  However, it MAY mount a file system that was not
-		mounted at that time and perform some operations on it (eg.
-		use it for saving the image).
-2.	If the value is 0 (ie. the system state has just been restored from
-	the snapshot image), the suspending utility MUST close the snapshot
-	device.  Afterwards it will be treated as a regular userland process,
-	so it need not exit.
-
-The resuming utility SHOULD NOT attempt to mount any file systems that could
-be mounted before suspend and SHOULD NOT attempt to perform any operations
-involving such file systems.
-
-For details, please refer to the source code.
diff --git a/Documentation/power/video.rst b/Documentation/power/video.rst
new file mode 100644
index 000000000000..337a2ba9f32f
--- /dev/null
+++ b/Documentation/power/video.rst
@@ -0,0 +1,213 @@
+===========================
+Video issues with S3 resume
+===========================
+
+2003-2006, Pavel Machek
+
+During S3 resume, hardware needs to be reinitialized. For most
+devices, this is easy, and kernel driver knows how to do
+it. Unfortunately there's one exception: video card. Those are usually
+initialized by BIOS, and kernel does not have enough information to
+boot video card. (Kernel usually does not even contain video card
+driver -- vesafb and vgacon are widely used).
+
+This is not problem for swsusp, because during swsusp resume, BIOS is
+run normally so video card is normally initialized. It should not be
+problem for S1 standby, because hardware should retain its state over
+that.
+
+We either have to run video BIOS during early resume, or interpret it
+using vbetool later, or maybe nothing is necessary on particular
+system because video state is preserved. Unfortunately different
+methods work on different systems, and no known method suits all of
+them.
+
+Userland application called s2ram has been developed; it contains long
+whitelist of systems, and automatically selects working method for a
+given system. It can be downloaded from CVS at
+www.sf.net/projects/suspend . If you get a system that is not in the
+whitelist, please try to find a working solution, and submit whitelist
+entry so that work does not need to be repeated.
+
+Currently, VBE_SAVE method (6 below) works on most
+systems. Unfortunately, vbetool only runs after userland is resumed,
+so it makes debugging of early resume problems
+hard/impossible. Methods that do not rely on userland are preferable.
+
+Details
+~~~~~~~
+
+There are a few types of systems where video works after S3 resume:
+
+(1) systems where video state is preserved over S3.
+
+(2) systems where it is possible to call the video BIOS during S3
+    resume. Unfortunately, it is not correct to call the video BIOS at
+    that point, but it happens to work on some machines. Use
+    acpi_sleep=s3_bios.
+
+(3) systems that initialize video card into vga text mode and where
+    the BIOS works well enough to be able to set video mode. Use
+    acpi_sleep=s3_mode on these.
+
+(4) on some systems s3_bios kicks video into text mode, and
+    acpi_sleep=s3_bios,s3_mode is needed.
+
+(5) radeon systems, where X can soft-boot your video card. You'll need
+    a new enough X, and a plain text console (no vesafb or radeonfb). See
+    http://www.doesi.gmxhome.de/linux/tm800s3/s3.html for more information.
+    Alternatively, you should use vbetool (6) instead.
+
+(6) other radeon systems, where vbetool is enough to bring system back
+    to life. It needs text console to be working. Do vbetool vbestate
+    save > /tmp/delme; echo 3 > /proc/acpi/sleep; vbetool post; vbetool
+    vbestate restore < /tmp/delme; setfont <whatever>, and your video
+    should work.
+
+(7) on some systems, it is possible to boot most of kernel, and then
+    POSTing bios works. Ole Rohne has patch to do just that at
+    http://dev.gentoo.org/~marineam/patch-radeonfb-2.6.11-rc2-mm2.
+
+(8) on some systems, you can use the video_post utility and/or
+    do echo 3 > /sys/power/state  && /usr/sbin/video_post - which will
+    initialize the display in console mode. If you are in X, you can switch
+    to a virtual terminal and back to X using  CTRL+ALT+F1 - CTRL+ALT+F7 to get
+    the display working in graphical mode again.
+
+Now, if you pass acpi_sleep=something, and it does not work with your
+bios, you'll get a hard crash during resume. Be careful. Also it is
+safest to do your experiments with plain old VGA console. The vesafb
+and radeonfb (etc) drivers have a tendency to crash the machine during
+resume.
+
+You may have a system where none of the above works. At that point you
+either invent another ugly hack that works, or write proper driver for
+your video card (good luck getting docs :-(). Maybe suspending from X
+(proper X, knowing your hardware, not XF68_FBcon) might have better
+chance of working.
+
+Table of known working notebooks:
+
+
+=============================== ===============================================
+Model                           hack (or "how to do it")
+=============================== ===============================================
+Acer Aspire 1406LC		ole's late BIOS init (7), turn off DRI
+Acer TM 230			s3_bios (2)
+Acer TM 242FX			vbetool (6)
+Acer TM C110			video_post (8)
+Acer TM C300                    vga=normal (only suspend on console, not in X),
+				vbetool (6) or video_post (8)
+Acer TM 4052LCi		        s3_bios (2)
+Acer TM 636Lci			s3_bios,s3_mode (4)
+Acer TM 650 (Radeon M7)		vga=normal plus boot-radeon (5) gets text
+				console back
+Acer TM 660			??? [#f1]_
+Acer TM 800			vga=normal, X patches, see webpage (5)
+				or vbetool (6)
+Acer TM 803			vga=normal, X patches, see webpage (5)
+				or vbetool (6)
+Acer TM 803LCi			vga=normal, vbetool (6)
+Arima W730a			vbetool needed (6)
+Asus L2400D                     s3_mode (3) [#f2]_ (S1 also works OK)
+Asus L3350M (SiS 740)           (6)
+Asus L3800C (Radeon M7)		s3_bios (2) (S1 also works OK)
+Asus M6887Ne			vga=normal, s3_bios (2), use radeon driver
+				instead of fglrx in x.org
+Athlon64 desktop prototype	s3_bios (2)
+Compal CL-50			??? [#f1]_
+Compaq Armada E500 - P3-700     none (1) (S1 also works OK)
+Compaq Evo N620c		vga=normal, s3_bios (2)
+Dell 600m, ATI R250 Lf		none (1), but needs xorg-x11-6.8.1.902-1
+Dell D600, ATI RV250            vga=normal and X, or try vbestate (6)
+Dell D610			vga=normal and X (possibly vbestate (6) too,
+				but not tested)
+Dell Inspiron 4000		??? [#f1]_
+Dell Inspiron 500m		??? [#f1]_
+Dell Inspiron 510m		???
+Dell Inspiron 5150		vbetool needed (6)
+Dell Inspiron 600m		??? [#f1]_
+Dell Inspiron 8200		??? [#f1]_
+Dell Inspiron 8500		??? [#f1]_
+Dell Inspiron 8600		??? [#f1]_
+eMachines athlon64 machines	vbetool needed (6) (someone please get
+				me model #s)
+HP NC6000			s3_bios, may not use radeonfb (2);
+				or vbetool (6)
+HP NX7000			??? [#f1]_
+HP Pavilion ZD7000		vbetool post needed, need open-source nv
+				driver for X
+HP Omnibook XE3	athlon version	none (1)
+HP Omnibook XE3GC		none (1), video is S3 Savage/IX-MV
+HP Omnibook XE3L-GF		vbetool (6)
+HP Omnibook 5150		none (1), (S1 also works OK)
+IBM TP T20, model 2647-44G	none (1), video is S3 Inc. 86C270-294
+				Savage/IX-MV, vesafb gets "interesting"
+				but X works.
+IBM TP A31 / Type 2652-M5G      s3_mode (3) [works ok with
+				BIOS 1.04 2002-08-23, but not at all with
+				BIOS 1.11 2004-11-05 :-(]
+IBM TP R32 / Type 2658-MMG      none (1)
+IBM TP R40 2722B3G		??? [#f1]_
+IBM TP R50p / Type 1832-22U     s3_bios (2)
+IBM TP R51			none (1)
+IBM TP T30	236681A		??? [#f1]_
+IBM TP T40 / Type 2373-MU4      none (1)
+IBM TP T40p			none (1)
+IBM TP R40p			s3_bios (2)
+IBM TP T41p			s3_bios (2), switch to X after resume
+IBM TP T42			s3_bios (2)
+IBM ThinkPad T42p (2373-GTG)	s3_bios (2)
+IBM TP X20			??? [#f1]_
+IBM TP X30			s3_bios, s3_mode (4)
+IBM TP X31 / Type 2672-XXH      none (1), use radeontool
+				(http://fdd.com/software/radeon/) to
+				turn off backlight.
+IBM TP X32			none (1), but backlight is on and video is
+				trashed after long suspend. s3_bios,
+				s3_mode (4) works too. Perhaps that gets
+				better results?
+IBM Thinkpad X40 Type 2371-7JG  s3_bios,s3_mode (4)
+IBM TP 600e			none (1), but a switch to console and
+				back to X is needed
+Medion MD4220			??? [#f1]_
+Samsung P35			vbetool needed (6)
+Sharp PC-AR10 (ATI rage)	none (1), backlight does not switch off
+Sony Vaio PCG-C1VRX/K		s3_bios (2)
+Sony Vaio PCG-F403		??? [#f1]_
+Sony Vaio PCG-GRT995MP		none (1), works with 'nv' X driver
+Sony Vaio PCG-GR7/K		none (1), but needs radeonfb, use
+				radeontool (http://fdd.com/software/radeon/)
+				to turn off backlight.
+Sony Vaio PCG-N505SN		??? [#f1]_
+Sony Vaio vgn-s260		X or boot-radeon can init it (5)
+Sony Vaio vgn-S580BH		vga=normal, but suspend from X. Console will
+				be blank unless you return to X.
+Sony Vaio vgn-FS115B		s3_bios (2),s3_mode (4)
+Toshiba Libretto L5		none (1)
+Toshiba Libretto 100CT/110CT    vbetool (6)
+Toshiba Portege 3020CT		s3_mode (3)
+Toshiba Satellite 4030CDT	s3_mode (3) (S1 also works OK)
+Toshiba Satellite 4080XCDT      s3_mode (3) (S1 also works OK)
+Toshiba Satellite 4090XCDT      ??? [#f1]_
+Toshiba Satellite P10-554       s3_bios,s3_mode (4) [#f3]_
+Toshiba M30                     (2) xor X with nvidia driver using internal AGP
+Uniwill 244IIO			??? [#f1]_
+=============================== ===============================================
+
+Known working desktop systems
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+=================== ============================= ========================
+Mainboard	    Graphics card                 hack (or "how to do it")
+=================== ============================= ========================
+Asus A7V8X	    nVidia RIVA TNT2 model 64	  s3_bios,s3_mode (4)
+=================== ============================= ========================
+
+
+.. [#f1] from https://wiki.ubuntu.com/HoaryPMResults, not sure
+         which options to use. If you know, please tell me.
+
+.. [#f2] To be tested with a newer kernel.
+
+.. [#f3] Not with SMP kernel, UP only.
diff --git a/Documentation/power/video.txt b/Documentation/power/video.txt
deleted file mode 100644
index 3e6272bc4472..000000000000
--- a/Documentation/power/video.txt
+++ /dev/null
@@ -1,185 +0,0 @@
-
-		Video issues with S3 resume
-		~~~~~~~~~~~~~~~~~~~~~~~~~~~
-		  2003-2006, Pavel Machek
-
-During S3 resume, hardware needs to be reinitialized. For most
-devices, this is easy, and kernel driver knows how to do
-it. Unfortunately there's one exception: video card. Those are usually
-initialized by BIOS, and kernel does not have enough information to
-boot video card. (Kernel usually does not even contain video card
-driver -- vesafb and vgacon are widely used).
-
-This is not problem for swsusp, because during swsusp resume, BIOS is
-run normally so video card is normally initialized. It should not be
-problem for S1 standby, because hardware should retain its state over
-that.
-
-We either have to run video BIOS during early resume, or interpret it
-using vbetool later, or maybe nothing is necessary on particular
-system because video state is preserved. Unfortunately different
-methods work on different systems, and no known method suits all of
-them.
-
-Userland application called s2ram has been developed; it contains long
-whitelist of systems, and automatically selects working method for a
-given system. It can be downloaded from CVS at
-www.sf.net/projects/suspend . If you get a system that is not in the
-whitelist, please try to find a working solution, and submit whitelist
-entry so that work does not need to be repeated.
-
-Currently, VBE_SAVE method (6 below) works on most
-systems. Unfortunately, vbetool only runs after userland is resumed,
-so it makes debugging of early resume problems
-hard/impossible. Methods that do not rely on userland are preferable.
-
-Details
-~~~~~~~
-
-There are a few types of systems where video works after S3 resume:
-
-(1) systems where video state is preserved over S3.
-
-(2) systems where it is possible to call the video BIOS during S3
-  resume. Unfortunately, it is not correct to call the video BIOS at
-  that point, but it happens to work on some machines. Use
-  acpi_sleep=s3_bios.
-
-(3) systems that initialize video card into vga text mode and where
-  the BIOS works well enough to be able to set video mode. Use
-  acpi_sleep=s3_mode on these.
-
-(4) on some systems s3_bios kicks video into text mode, and
-  acpi_sleep=s3_bios,s3_mode is needed.
-
-(5) radeon systems, where X can soft-boot your video card. You'll need
-  a new enough X, and a plain text console (no vesafb or radeonfb). See
-  http://www.doesi.gmxhome.de/linux/tm800s3/s3.html for more information.
-  Alternatively, you should use vbetool (6) instead.
-
-(6) other radeon systems, where vbetool is enough to bring system back
-  to life. It needs text console to be working. Do vbetool vbestate
-  save > /tmp/delme; echo 3 > /proc/acpi/sleep; vbetool post; vbetool
-  vbestate restore < /tmp/delme; setfont <whatever>, and your video
-  should work.
-
-(7) on some systems, it is possible to boot most of kernel, and then
-  POSTing bios works. Ole Rohne has patch to do just that at
-  http://dev.gentoo.org/~marineam/patch-radeonfb-2.6.11-rc2-mm2.
-
-(8) on some systems, you can use the video_post utility and or 
-  do echo 3 > /sys/power/state  && /usr/sbin/video_post - which will 
-  initialize the display in console mode. If you are in X, you can switch
-  to a virtual terminal and back to X using  CTRL+ALT+F1 - CTRL+ALT+F7 to get
-  the display working in graphical mode again.
-
-Now, if you pass acpi_sleep=something, and it does not work with your
-bios, you'll get a hard crash during resume. Be careful. Also it is
-safest to do your experiments with plain old VGA console. The vesafb
-and radeonfb (etc) drivers have a tendency to crash the machine during
-resume.
-
-You may have a system where none of above works. At that point you
-either invent another ugly hack that works, or write proper driver for
-your video card (good luck getting docs :-(). Maybe suspending from X
-(proper X, knowing your hardware, not XF68_FBcon) might have better
-chance of working.
-
-Table of known working notebooks:
-
-Model                           hack (or "how to do it")
-------------------------------------------------------------------------------
-Acer Aspire 1406LC		ole's late BIOS init (7), turn off DRI
-Acer TM 230			s3_bios (2)
-Acer TM 242FX			vbetool (6)
-Acer TM C110			video_post (8)
-Acer TM C300                    vga=normal (only suspend on console, not in X), vbetool (6) or video_post (8)
-Acer TM 4052LCi		        s3_bios (2)
-Acer TM 636Lci			s3_bios,s3_mode (4)
-Acer TM 650 (Radeon M7)		vga=normal plus boot-radeon (5) gets text console back
-Acer TM 660			??? (*)
-Acer TM 800			vga=normal, X patches, see webpage (5) or vbetool (6)
-Acer TM 803			vga=normal, X patches, see webpage (5) or vbetool (6)
-Acer TM 803LCi			vga=normal, vbetool (6)
-Arima W730a			vbetool needed (6)
-Asus L2400D                     s3_mode (3)(***) (S1 also works OK)
-Asus L3350M (SiS 740)           (6)
-Asus L3800C (Radeon M7)		s3_bios (2) (S1 also works OK)
-Asus M6887Ne			vga=normal, s3_bios (2), use radeon driver instead of fglrx in x.org
-Athlon64 desktop prototype	s3_bios (2)
-Compal CL-50			??? (*)
-Compaq Armada E500 - P3-700     none (1) (S1 also works OK)
-Compaq Evo N620c		vga=normal, s3_bios (2)
-Dell 600m, ATI R250 Lf		none (1), but needs xorg-x11-6.8.1.902-1
-Dell D600, ATI RV250            vga=normal and X, or try vbestate (6)
-Dell D610			vga=normal and X (possibly vbestate (6) too, but not tested)
-Dell Inspiron 4000		??? (*)
-Dell Inspiron 500m		??? (*)
-Dell Inspiron 510m		???
-Dell Inspiron 5150		vbetool needed (6)
-Dell Inspiron 600m		??? (*)
-Dell Inspiron 8200		??? (*)
-Dell Inspiron 8500		??? (*)
-Dell Inspiron 8600		??? (*)
-eMachines athlon64 machines	vbetool needed (6) (someone please get me model #s)
-HP NC6000			s3_bios, may not use radeonfb (2); or vbetool (6)
-HP NX7000			??? (*)
-HP Pavilion ZD7000		vbetool post needed, need open-source nv driver for X
-HP Omnibook XE3	athlon version	none (1)
-HP Omnibook XE3GC		none (1), video is S3 Savage/IX-MV
-HP Omnibook XE3L-GF		vbetool (6)
-HP Omnibook 5150		none (1), (S1 also works OK)
-IBM TP T20, model 2647-44G	none (1), video is S3 Inc. 86C270-294 Savage/IX-MV, vesafb gets "interesting" but X work.
-IBM TP A31 / Type 2652-M5G      s3_mode (3) [works ok with BIOS 1.04 2002-08-23, but not at all with BIOS 1.11 2004-11-05 :-(]
-IBM TP R32 / Type 2658-MMG      none (1)
-IBM TP R40 2722B3G		??? (*)
-IBM TP R50p / Type 1832-22U     s3_bios (2)
-IBM TP R51			none (1)
-IBM TP T30	236681A		??? (*)
-IBM TP T40 / Type 2373-MU4      none (1)
-IBM TP T40p			none (1)
-IBM TP R40p			s3_bios (2)
-IBM TP T41p			s3_bios (2), switch to X after resume
-IBM TP T42			s3_bios (2)
-IBM ThinkPad T42p (2373-GTG)	s3_bios (2)
-IBM TP X20			??? (*)
-IBM TP X30			s3_bios, s3_mode (4)
-IBM TP X31 / Type 2672-XXH      none (1), use radeontool (http://fdd.com/software/radeon/) to turn off backlight.
-IBM TP X32			none (1), but backlight is on and video is trashed after long suspend. s3_bios,s3_mode (4) works too. Perhaps that gets better results?
-IBM Thinkpad X40 Type 2371-7JG  s3_bios,s3_mode (4)
-IBM TP 600e			none(1), but a switch to console and back to X is needed
-Medion MD4220			??? (*)
-Samsung P35			vbetool needed (6)
-Sharp PC-AR10 (ATI rage)	none (1), backlight does not switch off
-Sony Vaio PCG-C1VRX/K		s3_bios (2)
-Sony Vaio PCG-F403		??? (*)
-Sony Vaio PCG-GRT995MP		none (1), works with 'nv' X driver
-Sony Vaio PCG-GR7/K		none (1), but needs radeonfb, use radeontool (http://fdd.com/software/radeon/) to turn off backlight.
-Sony Vaio PCG-N505SN		??? (*)
-Sony Vaio vgn-s260		X or boot-radeon can init it (5)
-Sony Vaio vgn-S580BH		vga=normal, but suspend from X. Console will be blank unless you return to X.
-Sony Vaio vgn-FS115B		s3_bios (2),s3_mode (4)
-Toshiba Libretto L5		none (1)
-Toshiba Libretto 100CT/110CT    vbetool (6)
-Toshiba Portege 3020CT		s3_mode (3)
-Toshiba Satellite 4030CDT	s3_mode (3) (S1 also works OK)
-Toshiba Satellite 4080XCDT      s3_mode (3) (S1 also works OK)
-Toshiba Satellite 4090XCDT      ??? (*)
-Toshiba Satellite P10-554       s3_bios,s3_mode (4)(****)
-Toshiba M30                     (2) xor X with nvidia driver using internal AGP
-Uniwill 244IIO			??? (*)
-
-Known working desktop systems
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Mainboard	    Graphics card                 hack (or "how to do it")
-------------------------------------------------------------------------------
-Asus A7V8X	    nVidia RIVA TNT2 model 64	  s3_bios,s3_mode (4)
-
-
-(*) from https://wiki.ubuntu.com/HoaryPMResults, not sure
-    which options to use. If you know, please tell me.
-
-(***) To be tested with a newer kernel.
-
-(****) Not with SMP kernel, UP only.
diff --git a/Documentation/process/submitting-drivers.rst b/Documentation/process/submitting-drivers.rst
index 58bc047e7b95..1acaa14903d6 100644
--- a/Documentation/process/submitting-drivers.rst
+++ b/Documentation/process/submitting-drivers.rst
@@ -117,7 +117,7 @@ PM support:
 		implemented") error.  You should also try to make sure that your
 		driver uses as little power as possible when it's not doing
 		anything.  For the driver testing instructions see
-		Documentation/power/drivers-testing.txt and for a relatively
+		Documentation/power/drivers-testing.rst and for a relatively
 		complete overview of the power management issues related to
 		drivers see :ref:`Documentation/driver-api/pm/devices.rst <driverapi_pm_devices>`.
 
diff --git a/Documentation/scheduler/sched-energy.txt b/Documentation/scheduler/sched-energy.txt
index 197d81f4b836..d97207b9accb 100644
--- a/Documentation/scheduler/sched-energy.txt
+++ b/Documentation/scheduler/sched-energy.txt
@@ -22,7 +22,7 @@ the highest.
 
 The actual EM used by EAS is _not_ maintained by the scheduler, but by a
 dedicated framework. For details about this framework and what it provides,
-please refer to its documentation (see Documentation/power/energy-model.txt).
+please refer to its documentation (see Documentation/power/energy-model.rst).
 
 
 2. Background and Terminology
@@ -81,7 +81,7 @@ through the arch_scale_cpu_capacity() callback.
 
 The rest of platform knowledge used by EAS is directly read from the Energy
 Model (EM) framework. The EM of a platform is composed of a power cost table
-per 'performance domain' in the system (see Documentation/power/energy-model.txt
+per 'performance domain' in the system (see Documentation/power/energy-model.rst
 for futher details about performance domains).
 
 The scheduler manages references to the EM objects in the topology code when the
@@ -352,7 +352,7 @@ could be amended in the future if proven otherwise.
 EAS uses the EM of a platform to estimate the impact of scheduling decisions on
 energy. So, your platform must provide power cost tables to the EM framework in
 order to make EAS start. To do so, please refer to documentation of the
-independent EM framework in Documentation/power/energy-model.txt.
+independent EM framework in Documentation/power/energy-model.rst.
 
 Please also note that the scheduling domains need to be re-built after the
 EM has been registered in order to start EAS.
diff --git a/Documentation/trace/coresight-cpu-debug.txt b/Documentation/trace/coresight-cpu-debug.txt
index f07e38094b40..1a660a39e3c0 100644
--- a/Documentation/trace/coresight-cpu-debug.txt
+++ b/Documentation/trace/coresight-cpu-debug.txt
@@ -151,7 +151,7 @@ At the runtime you can disable idle states with below methods:
 
 It is possible to disable CPU idle states by way of the PM QoS
 subsystem, more specifically by using the "/dev/cpu_dma_latency"
-interface (see Documentation/power/pm_qos_interface.txt for more
+interface (see Documentation/power/pm_qos_interface.rst for more
 details).  As specified in the PM QoS documentation the requested
 parameter will stay in effect until the file descriptor is released.
 For example:
diff --git a/Documentation/translations/zh_CN/process/submitting-drivers.rst b/Documentation/translations/zh_CN/process/submitting-drivers.rst
index 72c6cd935821..f1c3906c69a8 100644
--- a/Documentation/translations/zh_CN/process/submitting-drivers.rst
+++ b/Documentation/translations/zh_CN/process/submitting-drivers.rst
@@ -97,7 +97,7 @@ Linux 2.6:
 		函数定义成返回 -ENOSYS（功能未实现）错误。你还应该尝试确
 		保你的驱动在什么都不干的情况下将耗电降到最低。要获得驱动
 		程序测试的指导，请参阅
-		Documentation/power/drivers-testing.txt。有关驱动程序电
+		Documentation/power/drivers-testing.rst。有关驱动程序电
 		源管理问题相对全面的概述，请参阅
 		Documentation/driver-api/pm/devices.rst。
 
diff --git a/MAINTAINERS b/MAINTAINERS
index 9c382053ce6a..5a6137df3f0e 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -6446,7 +6446,7 @@ M:	"Rafael J. Wysocki" <rjw@rjwysocki.net>
 M:	Pavel Machek <pavel@ucw.cz>
 L:	linux-pm@vger.kernel.org
 S:	Supported
-F:	Documentation/power/freezing-of-tasks.txt
+F:	Documentation/power/freezing-of-tasks.rst
 F:	include/linux/freezer.h
 F:	kernel/freezer.c
 
@@ -11764,7 +11764,7 @@ S:	Maintained
 T:	git git://git.kernel.org/pub/scm/linux/kernel/git/vireshk/pm.git
 F:	drivers/opp/
 F:	include/linux/pm_opp.h
-F:	Documentation/power/opp.txt
+F:	Documentation/power/opp.rst
 F:	Documentation/devicetree/bindings/opp/
 
 OPL4 DRIVER
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 2bbbd4d1ba31..77a724771dbb 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -2447,7 +2447,7 @@ menuconfig APM
 	  machines with more than one CPU.
 
 	  In order to use APM, you will need supporting software. For location
-	  and more information, read <file:Documentation/power/apm-acpi.txt>
+	  and more information, read <file:Documentation/power/apm-acpi.rst>
 	  and the Battery Powered Linux mini-HOWTO, available from
 	  <http://www.tldp.org/docs.html#howto>.
 
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 066fd2a12851..10d040e2e807 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -1175,7 +1175,7 @@ struct skl_wm_params {
  * to be disabled. This shouldn't happen and we'll print some error messages in
  * case it happens.
  *
- * For more, read the Documentation/power/runtime_pm.txt.
+ * For more, read the Documentation/power/runtime_pm.rst.
  */
 struct i915_runtime_pm {
 	atomic_t wakeref_count;
diff --git a/drivers/opp/Kconfig b/drivers/opp/Kconfig
index a7fbb93f302c..1f64a3d46c8a 100644
--- a/drivers/opp/Kconfig
+++ b/drivers/opp/Kconfig
@@ -10,4 +10,4 @@ config PM_OPP
 	  OPP layer organizes the data internally using device pointers
 	  representing individual voltage domains and provides SOC
 	  implementations a ready to use framework to manage OPPs.
-	  For more information, read <file:Documentation/power/opp.txt>
+	  For more information, read <file:Documentation/power/opp.rst>
diff --git a/drivers/power/supply/power_supply_core.c b/drivers/power/supply/power_supply_core.c
index f7033ecf6d0b..11f9c875b028 100644
--- a/drivers/power/supply/power_supply_core.c
+++ b/drivers/power/supply/power_supply_core.c
@@ -607,7 +607,7 @@ int power_supply_get_battery_info(struct power_supply *psy,
 
 	/* The property and field names below must correspond to elements
 	 * in enum power_supply_property. For reasoning, see
-	 * Documentation/power/power_supply_class.txt.
+	 * Documentation/power/power_supply_class.rst.
 	 */
 
 	of_property_read_u32(battery_np, "energy-full-design-microwatt-hours",
diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index c7eef32e7739..5b8328a99b2a 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -52,7 +52,7 @@
  *                irq line disabled until the threaded handler has been run.
  * IRQF_NO_SUSPEND - Do not disable this IRQ during suspend.  Does not guarantee
  *                   that this interrupt will wake the system from a suspended
- *                   state.  See Documentation/power/suspend-and-interrupts.txt
+ *                   state.  See Documentation/power/suspend-and-interrupts.rst
  * IRQF_FORCE_RESUME - Force enable it on resume even if IRQF_NO_SUSPEND is set
  * IRQF_NO_THREAD - Interrupt cannot be threaded
  * IRQF_EARLY_RESUME - Resume IRQ early during syscore instead of at device
diff --git a/include/linux/pci.h b/include/linux/pci.h
index b74b2a4e6df2..3d9a167ca5c3 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -807,7 +807,7 @@ struct module;
  * @suspend_late: Put device into low power state.
  * @resume_early: Wake device from low power state.
  * @resume:	Wake device from low power state.
- *		(Please see Documentation/power/pci.txt for descriptions
+ *		(Please see Documentation/power/pci.rst for descriptions
  *		of PCI Power Management and the related functions.)
  * @shutdown:	Hook into reboot_notifier_list (kernel/sys.c).
  *		Intended to stop any idling DMA operations.
diff --git a/include/linux/pm.h b/include/linux/pm.h
index 66c19a65a514..c14ad8bc1a41 100644
--- a/include/linux/pm.h
+++ b/include/linux/pm.h
@@ -284,7 +284,7 @@ typedef struct pm_message {
  * actions to be performed by a device driver's callbacks generally depend on
  * the platform and subsystem the device belongs to.
  *
- * Refer to Documentation/power/runtime_pm.txt for more information about the
+ * Refer to Documentation/power/runtime_pm.rst for more information about the
  * role of the @runtime_suspend(), @runtime_resume() and @runtime_idle()
  * callbacks in device runtime power management.
  */
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 9bbaaab14b36..7a4dda9e5309 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -65,7 +65,7 @@ config HIBERNATION
 	  need to run mkswap against the swap partition used for the suspend.
 
 	  It also works with swap files to a limited extent (for details see
-	  <file:Documentation/power/swsusp-and-swap-files.txt>).
+	  <file:Documentation/power/swsusp-and-swap-files.rst>).
 
 	  Right now you may boot without resuming and resume later but in the
 	  meantime you cannot use the swap partition(s)/file(s) involved in
@@ -74,7 +74,7 @@ config HIBERNATION
 	  MOUNT any journaled filesystems mounted before the suspend or they
 	  will get corrupted in a nasty way.
 
-	  For more information take a look at <file:Documentation/power/swsusp.txt>.
+	  For more information take a look at <file:Documentation/power/swsusp.rst>.
 
 config ARCH_SAVE_PAGE_KEYS
 	bool
@@ -255,7 +255,7 @@ config APM_EMULATION
 	  notification of APM "events" (e.g. battery status change).
 
 	  In order to use APM, you will need supporting software. For location
-	  and more information, read <file:Documentation/power/apm-acpi.txt>
+	  and more information, read <file:Documentation/power/apm-acpi.rst>
 	  and the Battery Powered Linux mini-HOWTO, available from
 	  <http://www.tldp.org/docs.html#howto>.
 
diff --git a/net/wireless/Kconfig b/net/wireless/Kconfig
index 41722046b937..0cd26289bfbc 100644
--- a/net/wireless/Kconfig
+++ b/net/wireless/Kconfig
@@ -165,7 +165,7 @@ config CFG80211_DEFAULT_PS
 
 	  If this causes your applications to misbehave you should fix your
 	  applications instead -- they need to register their network
-	  latency requirement, see Documentation/power/pm_qos_interface.txt.
+	  latency requirement, see Documentation/power/pm_qos_interface.rst.
 
 config CFG80211_DEBUGFS
 	bool "cfg80211 DebugFS entries"
-- 
cgit v1.2.3


From e1714daad7cf8fe4d6dd91adcfbbdd0604b0210d Mon Sep 17 00:00:00 2001
From: Wolfram Sang <wsa@the-dreams.de>
Date: Mon, 3 Jun 2019 10:25:31 +0200
Subject: i2c: headers: don't use 'dev' as adapter variable

It is not a struct device, so 'dev' is confusing. Use 'adap', the most
common name.

Signed-off-by: Wolfram Sang <wsa@the-dreams.de>
---
 include/linux/i2c.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/i2c.h b/include/linux/i2c.h
index e982b8913b73..6bd199cfe61f 100644
--- a/include/linux/i2c.h
+++ b/include/linux/i2c.h
@@ -703,14 +703,14 @@ struct i2c_adapter {
 };
 #define to_i2c_adapter(d) container_of(d, struct i2c_adapter, dev)
 
-static inline void *i2c_get_adapdata(const struct i2c_adapter *dev)
+static inline void *i2c_get_adapdata(const struct i2c_adapter *adap)
 {
-	return dev_get_drvdata(&dev->dev);
+	return dev_get_drvdata(&adap->dev);
 }
 
-static inline void i2c_set_adapdata(struct i2c_adapter *dev, void *data)
+static inline void i2c_set_adapdata(struct i2c_adapter *adap, void *data)
 {
-	dev_set_drvdata(&dev->dev, data);
+	dev_set_drvdata(&adap->dev, data);
 }
 
 static inline struct i2c_adapter *
-- 
cgit v1.2.3


From d68222d4d6647611be5a32c80a53a145e7c80ce9 Mon Sep 17 00:00:00 2001
From: Wolfram Sang <wsa@the-dreams.de>
Date: Mon, 3 Jun 2019 10:25:32 +0200
Subject: i2c: headers: always have a named variable in arguments

Named arguments are much easier to read and understand. Naming for
i2c_adapter is not consistent (yet), so use the name which is also used
in core code.

Signed-off-by: Wolfram Sang <wsa@the-dreams.de>
---
 include/linux/i2c.h | 39 ++++++++++++++++++++-------------------
 1 file changed, 20 insertions(+), 19 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/i2c.h b/include/linux/i2c.h
index 6bd199cfe61f..14e04fb4f46f 100644
--- a/include/linux/i2c.h
+++ b/include/linux/i2c.h
@@ -40,7 +40,8 @@ struct i2c_device_identity;
 union i2c_smbus_data;
 struct i2c_board_info;
 enum i2c_slave_event;
-typedef int (*i2c_slave_cb_t)(struct i2c_client *, enum i2c_slave_event, u8 *);
+typedef int (*i2c_slave_cb_t)(struct i2c_client *client,
+			      enum i2c_slave_event event, u8 *val);
 
 struct module;
 struct property_entry;
@@ -257,16 +258,16 @@ struct i2c_driver {
 	unsigned int class;
 
 	/* Standard driver model interfaces */
-	int (*probe)(struct i2c_client *, const struct i2c_device_id *);
-	int (*remove)(struct i2c_client *);
+	int (*probe)(struct i2c_client *client, const struct i2c_device_id *id);
+	int (*remove)(struct i2c_client *client);
 
 	/* New driver model interface to aid the seamless removal of the
 	 * current probe()'s, more commonly unused than used second parameter.
 	 */
-	int (*probe_new)(struct i2c_client *);
+	int (*probe_new)(struct i2c_client *client);
 
 	/* driver model interfaces that don't relate to enumeration  */
-	void (*shutdown)(struct i2c_client *);
+	void (*shutdown)(struct i2c_client *client);
 
 	/* Alert callback, for example for the SMBus alert protocol.
 	 * The format and meaning of the data value depends on the protocol.
@@ -275,7 +276,7 @@ struct i2c_driver {
 	 * For the SMBus Host Notify protocol, the data corresponds to the
 	 * 16-bit payload data reported by the slave device acting as master.
 	 */
-	void (*alert)(struct i2c_client *, enum i2c_alert_protocol protocol,
+	void (*alert)(struct i2c_client *client, enum i2c_alert_protocol protocol,
 		      unsigned int data);
 
 	/* a ioctl like command that can be used to perform specific functions
@@ -287,7 +288,7 @@ struct i2c_driver {
 	const struct i2c_device_id *id_table;
 
 	/* Device detection callback for automatic device creation */
-	int (*detect)(struct i2c_client *, struct i2c_board_info *);
+	int (*detect)(struct i2c_client *client, struct i2c_board_info *info);
 	const unsigned short *address_list;
 	struct list_head clients;
 
@@ -447,10 +448,10 @@ extern struct i2c_client *
 i2c_new_probed_device(struct i2c_adapter *adap,
 		      struct i2c_board_info *info,
 		      unsigned short const *addr_list,
-		      int (*probe)(struct i2c_adapter *, unsigned short addr));
+		      int (*probe)(struct i2c_adapter *adap, unsigned short addr));
 
 /* Common custom probe functions */
-extern int i2c_probe_func_quick_read(struct i2c_adapter *, unsigned short addr);
+extern int i2c_probe_func_quick_read(struct i2c_adapter *adap, unsigned short addr);
 
 /* For devices that use several addresses, use i2c_new_dummy() to make
  * client handles for the extra addresses.
@@ -466,7 +467,7 @@ i2c_new_secondary_device(struct i2c_client *client,
 				const char *name,
 				u16 default_addr);
 
-extern void i2c_unregister_device(struct i2c_client *);
+extern void i2c_unregister_device(struct i2c_client *client);
 #endif /* I2C */
 
 /* Mainboard arch_initcall() code should register all its I2C devices.
@@ -551,9 +552,9 @@ struct i2c_algorithm {
  * The main operations are wrapped by i2c_lock_bus and i2c_unlock_bus.
  */
 struct i2c_lock_operations {
-	void (*lock_bus)(struct i2c_adapter *, unsigned int flags);
-	int (*trylock_bus)(struct i2c_adapter *, unsigned int flags);
-	void (*unlock_bus)(struct i2c_adapter *, unsigned int flags);
+	void (*lock_bus)(struct i2c_adapter *adapter, unsigned int flags);
+	int (*trylock_bus)(struct i2c_adapter *adapter, unsigned int flags);
+	void (*unlock_bus)(struct i2c_adapter *adapter, unsigned int flags);
 };
 
 /**
@@ -726,7 +727,7 @@ i2c_parent_is_i2c_adapter(const struct i2c_adapter *adapter)
 		return NULL;
 }
 
-int i2c_for_each_dev(void *data, int (*fn)(struct device *, void *));
+int i2c_for_each_dev(void *data, int (*fn)(struct device *dev, void *data));
 
 /* Adapter locking functions, exported for shared pin cases */
 #define I2C_LOCK_ROOT_ADAPTER BIT(0)
@@ -832,12 +833,12 @@ static inline void i2c_mark_adapter_resumed(struct i2c_adapter *adap)
 /* administration...
  */
 #if IS_ENABLED(CONFIG_I2C)
-extern int i2c_add_adapter(struct i2c_adapter *);
-extern void i2c_del_adapter(struct i2c_adapter *);
-extern int i2c_add_numbered_adapter(struct i2c_adapter *);
+extern int i2c_add_adapter(struct i2c_adapter *adap);
+extern void i2c_del_adapter(struct i2c_adapter *adap);
+extern int i2c_add_numbered_adapter(struct i2c_adapter *adap);
 
-extern int i2c_register_driver(struct module *, struct i2c_driver *);
-extern void i2c_del_driver(struct i2c_driver *);
+extern int i2c_register_driver(struct module *owner, struct i2c_driver *driver);
+extern void i2c_del_driver(struct i2c_driver *driver);
 
 /* use a define to avoid include chaining to get THIS_MODULE */
 #define i2c_add_driver(driver) \
-- 
cgit v1.2.3


From 2caea56f569ac361fc854f6bf2fe94b70514c917 Mon Sep 17 00:00:00 2001
From: Wolfram Sang <wsa@the-dreams.de>
Date: Mon, 3 Jun 2019 10:25:34 +0200
Subject: i2c: headers: update docs about I2C_CLIENT_*

Update kerneldoc for the i2c client flags because more of them have been
added over time. Also, move them to a position where they can be more
easily found.

Signed-off-by: Wolfram Sang <wsa@the-dreams.de>
---
 include/linux/i2c.h | 22 ++++++++++------------
 1 file changed, 10 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/i2c.h b/include/linux/i2c.h
index 14e04fb4f46f..9853fae9b505 100644
--- a/include/linux/i2c.h
+++ b/include/linux/i2c.h
@@ -298,8 +298,7 @@ struct i2c_driver {
 
 /**
  * struct i2c_client - represent an I2C slave device
- * @flags: I2C_CLIENT_TEN indicates the device uses a ten bit chip address;
- *	I2C_CLIENT_PEC indicates it uses SMBus Packet Error Checking
+ * @flags: see I2C_CLIENT_* for possible flags
  * @addr: Address used on the I2C bus connected to the parent adapter.
  * @name: Indicates the type of the device, usually a chip name that's
  *	generic enough to hide second-sourcing and compatible revisions.
@@ -317,6 +316,15 @@ struct i2c_driver {
  */
 struct i2c_client {
 	unsigned short flags;		/* div., see below		*/
+#define I2C_CLIENT_PEC		0x04	/* Use Packet Error Checking */
+#define I2C_CLIENT_TEN		0x10	/* we have a ten bit chip address */
+					/* Must equal I2C_M_TEN below */
+#define I2C_CLIENT_SLAVE	0x20	/* we are the slave */
+#define I2C_CLIENT_HOST_NOTIFY	0x40	/* We want to use I2C host notify */
+#define I2C_CLIENT_WAKE		0x80	/* for board_info; true iff can wake */
+#define I2C_CLIENT_SCCB		0x9000	/* Use Omnivision SCCB protocol */
+					/* Must match I2C_M_STOP|IGNORE_NAK */
+
 	unsigned short addr;		/* chip address - NOTE: 7bit	*/
 					/* addresses are stored in the	*/
 					/* _LOWER_ 7 bits		*/
@@ -803,16 +811,6 @@ static inline void i2c_mark_adapter_resumed(struct i2c_adapter *adap)
 	i2c_unlock_bus(adap, I2C_LOCK_ROOT_ADAPTER);
 }
 
-/*flags for the client struct: */
-#define I2C_CLIENT_PEC		0x04	/* Use Packet Error Checking */
-#define I2C_CLIENT_TEN		0x10	/* we have a ten bit chip address */
-					/* Must equal I2C_M_TEN below */
-#define I2C_CLIENT_SLAVE	0x20	/* we are the slave */
-#define I2C_CLIENT_HOST_NOTIFY	0x40	/* We want to use I2C host notify */
-#define I2C_CLIENT_WAKE		0x80	/* for board_info; true iff can wake */
-#define I2C_CLIENT_SCCB		0x9000	/* Use Omnivision SCCB protocol */
-					/* Must match I2C_M_STOP|IGNORE_NAK */
-
 /* i2c adapter classes (bitmask) */
 #define I2C_CLASS_HWMON		(1<<0)	/* lm_sensors, ... */
 #define I2C_CLASS_DDC		(1<<3)	/* DDC bus on graphics adapters */
-- 
cgit v1.2.3


From 76cc9f0efd952d376e93e79b1f19fd6fdb8291bc Mon Sep 17 00:00:00 2001
From: Wolfram Sang <wsa@the-dreams.de>
Date: Mon, 3 Jun 2019 10:25:35 +0200
Subject: i2c: headers: reformat header comment and update copyright

Let's stick to coding style.

Signed-off-by: Wolfram Sang <wsa@the-dreams.de>
---
 include/linux/i2c.h | 18 +++++++-----------
 1 file changed, 7 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/i2c.h b/include/linux/i2c.h
index 9853fae9b505..d8f9060179d0 100644
--- a/include/linux/i2c.h
+++ b/include/linux/i2c.h
@@ -1,16 +1,12 @@
 /* SPDX-License-Identifier: GPL-2.0-or-later */
-/* ------------------------------------------------------------------------- */
-/*									     */
-/* i2c.h - definitions for the i2c-bus interface			     */
-/*									     */
-/* ------------------------------------------------------------------------- */
-/*   Copyright (C) 1995-2000 Simon G. Vogl
-
+/*
+ * i2c.h - definitions for the Linux i2c bus interface
+ * Copyright (C) 1995-2000 Simon G. Vogl
+ * Copyright (C) 2013-2019 Wolfram Sang <wsa@the-dreams.de>
+ *
+ * With some changes from Kyösti Mälkki <kmalkki@cc.hut.fi> and
+ * Frodo Looijaard <frodol@dds.nl>
  */
-/* ------------------------------------------------------------------------- */
-
-/* With some changes from Kyösti Mälkki <kmalkki@cc.hut.fi> and
-   Frodo Looijaard <frodol@dds.nl> */
 #ifndef _LINUX_I2C_H
 #define _LINUX_I2C_H
 
-- 
cgit v1.2.3


From 7f94208c8f9a0a6d2ff0e0c0858c00ad8e5c8617 Mon Sep 17 00:00:00 2001
From: YueHaibing <yuehaibing@huawei.com>
Date: Wed, 12 Jun 2019 17:18:47 +0800
Subject: bpf: Fix build error without CONFIG_INET

If CONFIG_INET is not set, building fails:

kernel/bpf/verifier.o: In function `check_mem_access':
verifier.c: undefined reference to `bpf_xdp_sock_is_valid_access'
kernel/bpf/verifier.o: In function `convert_ctx_accesses':
verifier.c: undefined reference to `bpf_xdp_sock_convert_ctx_access'

Reported-by: Hulk Robot <hulkci@huawei.com>
Fixes: fada7fdc83c0 ("bpf: Allow bpf_map_lookup_elem() on an xskmap")
Signed-off-by: YueHaibing <yuehaibing@huawei.com>
Acked-by: Jonathan Lemon <jonathan.lemon@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/linux/bpf.h | 31 ++++++++++++++++++++++++-------
 1 file changed, 24 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 1fe137afa898..b15fb5fcb741 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -728,13 +728,6 @@ void __cpu_map_insert_ctx(struct bpf_map *map, u32 index);
 void __cpu_map_flush(struct bpf_map *map);
 int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp,
 		    struct net_device *dev_rx);
-bool bpf_xdp_sock_is_valid_access(int off, int size, enum bpf_access_type type,
-				  struct bpf_insn_access_aux *info);
-u32 bpf_xdp_sock_convert_ctx_access(enum bpf_access_type type,
-				    const struct bpf_insn *si,
-				    struct bpf_insn *insn_buf,
-				    struct bpf_prog *prog,
-				    u32 *target_size);
 
 /* Return map's numa specified by userspace */
 static inline int bpf_map_attr_numa_node(const union bpf_attr *attr)
@@ -1110,6 +1103,15 @@ u32 bpf_tcp_sock_convert_ctx_access(enum bpf_access_type type,
 				    struct bpf_insn *insn_buf,
 				    struct bpf_prog *prog,
 				    u32 *target_size);
+
+bool bpf_xdp_sock_is_valid_access(int off, int size, enum bpf_access_type type,
+				  struct bpf_insn_access_aux *info);
+
+u32 bpf_xdp_sock_convert_ctx_access(enum bpf_access_type type,
+				    const struct bpf_insn *si,
+				    struct bpf_insn *insn_buf,
+				    struct bpf_prog *prog,
+				    u32 *target_size);
 #else
 static inline bool bpf_tcp_sock_is_valid_access(int off, int size,
 						enum bpf_access_type type,
@@ -1126,6 +1128,21 @@ static inline u32 bpf_tcp_sock_convert_ctx_access(enum bpf_access_type type,
 {
 	return 0;
 }
+static inline bool bpf_xdp_sock_is_valid_access(int off, int size,
+						enum bpf_access_type type,
+						struct bpf_insn_access_aux *info)
+{
+	return false;
+}
+
+static inline u32 bpf_xdp_sock_convert_ctx_access(enum bpf_access_type type,
+						  const struct bpf_insn *si,
+						  struct bpf_insn *insn_buf,
+						  struct bpf_prog *prog,
+						  u32 *target_size)
+{
+	return 0;
+}
 #endif /* CONFIG_INET */
 
 #endif /* _LINUX_BPF_H */
-- 
cgit v1.2.3


From 7c86f20d15b7c1132e0c24358ce240ba4cb002b7 Mon Sep 17 00:00:00 2001
From: Martin Blumenstingl <martin.blumenstingl@googlemail.com>
Date: Wed, 12 Jun 2019 21:31:15 +0200
Subject: net: stmmac: use GPIO descriptors in stmmac_mdio_reset

Switch stmmac_mdio_reset to use GPIO descriptors. GPIO core handles the
"snps,reset-gpio" for GPIO descriptors so we don't need to take care of
it inside the driver anymore.

The advantage of this is that we now preserve the GPIO flags which are
passed via devicetree. This is required on some newer Amlogic boards
which use an Open Drain pin for the reset GPIO. This pin can only output
a LOW signal or switch to input mode but it cannot output a HIGH signal.
There are already devicetree bindings for these special cases and GPIO
core already takes care of them but only if we use GPIO descriptors
instead of GPIO numbers.

Signed-off-by: Martin Blumenstingl <martin.blumenstingl@googlemail.com>
Reviewed-by: Linus Walleij <linus.walleij@linaro.org>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c | 27 +++++++++++------------
 include/linux/stmmac.h                            |  2 +-
 2 files changed, 14 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c
index 093a223fe408..f1c39dd048e7 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c
@@ -20,11 +20,11 @@
   Maintainer: Giuseppe Cavallaro <peppe.cavallaro@st.com>
 *******************************************************************************/
 
+#include <linux/gpio/consumer.h>
 #include <linux/io.h>
 #include <linux/iopoll.h>
 #include <linux/mii.h>
 #include <linux/of.h>
-#include <linux/of_gpio.h>
 #include <linux/of_mdio.h>
 #include <linux/phy.h>
 #include <linux/slab.h>
@@ -251,37 +251,36 @@ int stmmac_mdio_reset(struct mii_bus *bus)
 
 #ifdef CONFIG_OF
 	if (priv->device->of_node) {
+		struct gpio_desc *reset_gpio;
+
 		if (data->reset_gpio < 0) {
 			struct device_node *np = priv->device->of_node;
 
 			if (!np)
 				return 0;
 
-			data->reset_gpio = of_get_named_gpio(np,
-						"snps,reset-gpio", 0);
-			if (data->reset_gpio < 0)
-				return 0;
+			reset_gpio = devm_gpiod_get_optional(priv->device,
+							     "snps,reset",
+							     GPIOD_OUT_LOW);
+			if (IS_ERR(reset_gpio))
+				return PTR_ERR(reset_gpio);
 
-			data->active_low = of_property_read_bool(np,
-						"snps,reset-active-low");
 			of_property_read_u32_array(np,
 				"snps,reset-delays-us", data->delays, 3);
+		} else {
+			reset_gpio = gpio_to_desc(data->reset_gpio);
 
-			if (devm_gpio_request(priv->device, data->reset_gpio,
-					      "mdio-reset"))
-				return 0;
+			gpiod_direction_output(reset_gpio, 0);
 		}
 
-		gpio_direction_output(data->reset_gpio,
-				      data->active_low ? 1 : 0);
 		if (data->delays[0])
 			msleep(DIV_ROUND_UP(data->delays[0], 1000));
 
-		gpio_set_value(data->reset_gpio, data->active_low ? 0 : 1);
+		gpiod_set_value_cansleep(reset_gpio, 1);
 		if (data->delays[1])
 			msleep(DIV_ROUND_UP(data->delays[1], 1000));
 
-		gpio_set_value(data->reset_gpio, data->active_low ? 1 : 0);
+		gpiod_set_value_cansleep(reset_gpio, 0);
 		if (data->delays[2])
 			msleep(DIV_ROUND_UP(data->delays[2], 1000));
 	}
diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h
index 4335bd771ce5..816edb545592 100644
--- a/include/linux/stmmac.h
+++ b/include/linux/stmmac.h
@@ -97,7 +97,7 @@ struct stmmac_mdio_bus_data {
 	int *irqs;
 	int probed_phy_irq;
 #ifdef CONFIG_OF
-	int reset_gpio, active_low;
+	int reset_gpio;
 	u32 delays[3];
 #endif
 };
-- 
cgit v1.2.3


From f01c373fbeed9f5870bb056b65750ccef42f1f20 Mon Sep 17 00:00:00 2001
From: Willem de Bruijn <willemb@google.com>
Date: Thu, 13 Jun 2019 11:08:15 -0400
Subject: locking/static_key: always define static_branch_deferred_inc

This interface is currently only defined if CONFIG_JUMP_LABEL is enabled.
Make it available also when jump labels are off.

Signed-off-by: Willem de Bruijn <willemb@google.com>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/jump_label_ratelimit.h | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/jump_label_ratelimit.h b/include/linux/jump_label_ratelimit.h
index 42710d5949ba..8c3ee291b2d8 100644
--- a/include/linux/jump_label_ratelimit.h
+++ b/include/linux/jump_label_ratelimit.h
@@ -60,8 +60,6 @@ extern void jump_label_update_timeout(struct work_struct *work);
 						   0),			\
 	}
 
-#define static_branch_deferred_inc(x)	static_branch_inc(&(x)->key)
-
 #else	/* !CONFIG_JUMP_LABEL */
 struct static_key_deferred {
 	struct static_key  key;
@@ -95,4 +93,7 @@ jump_label_rate_limit(struct static_key_deferred *key,
 	STATIC_KEY_CHECK_USE(key);
 }
 #endif	/* CONFIG_JUMP_LABEL */
+
+#define static_branch_deferred_inc(x)	static_branch_inc(&(x)->key)
+
 #endif	/* _LINUX_JUMP_LABEL_RATELIMIT_H */
-- 
cgit v1.2.3


From 7928260539f3a13b5b23a3fa0a7c0e4f5255940b Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Sat, 8 Jun 2019 11:39:05 +0200
Subject: processor: remove spin_cpu_yield

spin_cpu_yield is unused, therefore remove it.

Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
---
 arch/powerpc/include/asm/processor.h | 2 --
 include/linux/processor.h            | 9 ---------
 2 files changed, 11 deletions(-)

(limited to 'include/linux')

diff --git a/arch/powerpc/include/asm/processor.h b/arch/powerpc/include/asm/processor.h
index ef573fe9873e..a9993e7a443b 100644
--- a/arch/powerpc/include/asm/processor.h
+++ b/arch/powerpc/include/asm/processor.h
@@ -346,8 +346,6 @@ static inline unsigned long __pack_fe01(unsigned int fpmode)
 
 #define spin_cpu_relax()	barrier()
 
-#define spin_cpu_yield()	spin_cpu_relax()
-
 #define spin_end()	HMT_medium()
 
 #define spin_until_cond(cond)					\
diff --git a/include/linux/processor.h b/include/linux/processor.h
index dbc952eec869..dc78bdc7079a 100644
--- a/include/linux/processor.h
+++ b/include/linux/processor.h
@@ -32,15 +32,6 @@
 #define spin_cpu_relax() cpu_relax()
 #endif
 
-/*
- * spin_cpu_yield may be called to yield (undirected) to the hypervisor if
- * necessary. This should be used if the wait is expected to take longer
- * than context switch overhead, but we can't sleep or do a directed yield.
- */
-#ifndef spin_cpu_yield
-#define spin_cpu_yield() cpu_relax_yield()
-#endif
-
 #ifndef spin_end
 #define spin_end()
 #endif
-- 
cgit v1.2.3


From 38f2c691a4b3e89d476f8e8350d1ca299974b89d Mon Sep 17 00:00:00 2001
From: Martin Schwidefsky <schwidefsky@de.ibm.com>
Date: Fri, 17 May 2019 12:50:42 +0200
Subject: s390: improve wait logic of stop_machine

The stop_machine loop, which advances the state machine and waits for
all affected CPUs to check in, calls cpu_relax_yield in a tight loop
until the last missing CPU has acknowledged the state transition.

On a virtual system where not all logical CPUs are backed by real CPUs
all the time, it can take a while for all CPUs to check in. With the
current definition of cpu_relax_yield, a diagnose 0x44 is done, which
tells the hypervisor to schedule *some* other CPU. That can be any
CPU and not necessarily one of the CPUs that need to run in order to
advance the state machine. This can lead to a pretty bad diagnose 0x44
storm until the last missing CPU has finally checked in.

Replace the undirected cpu_relax_yield based on diagnose 0x44 with a
directed yield. Each CPU in the wait loop will pick up the next CPU
in the cpumask of stop_machine. The diagnose 0x9c is used to tell the
hypervisor to run this next CPU instead of the current one. If there
is only a limited number of real CPUs backing the virtual CPUs we
end up with the real CPUs passed around in a round-robin fashion.

[heiko.carstens@de.ibm.com]:
    Use cpumask_next_wrap as suggested by Peter Zijlstra.

Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
---
 arch/s390/include/asm/processor.h |  3 ++-
 arch/s390/kernel/processor.c      | 17 ++++++++++++-----
 arch/s390/kernel/smp.c            |  2 +-
 include/linux/sched.h             |  2 +-
 kernel/stop_machine.c             | 14 +++++++++-----
 5 files changed, 25 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/arch/s390/include/asm/processor.h b/arch/s390/include/asm/processor.h
index b0fcbc37b637..445ce9ee4404 100644
--- a/arch/s390/include/asm/processor.h
+++ b/arch/s390/include/asm/processor.h
@@ -36,6 +36,7 @@
 
 #ifndef __ASSEMBLY__
 
+#include <linux/cpumask.h>
 #include <linux/linkage.h>
 #include <linux/irqflags.h>
 #include <asm/cpu.h>
@@ -225,7 +226,7 @@ static __no_kasan_or_inline unsigned short stap(void)
  * Give up the time slice of the virtual PU.
  */
 #define cpu_relax_yield cpu_relax_yield
-void cpu_relax_yield(void);
+void cpu_relax_yield(const struct cpumask *cpumask);
 
 #define cpu_relax() barrier()
 
diff --git a/arch/s390/kernel/processor.c b/arch/s390/kernel/processor.c
index 5de13307b703..4cdaefec1b7c 100644
--- a/arch/s390/kernel/processor.c
+++ b/arch/s390/kernel/processor.c
@@ -31,6 +31,7 @@ struct cpu_info {
 };
 
 static DEFINE_PER_CPU(struct cpu_info, cpu_info);
+static DEFINE_PER_CPU(int, cpu_relax_retry);
 
 static bool machine_has_cpu_mhz;
 
@@ -58,13 +59,19 @@ void s390_update_cpu_mhz(void)
 		on_each_cpu(update_cpu_mhz, NULL, 0);
 }
 
-void notrace cpu_relax_yield(void)
+void notrace cpu_relax_yield(const struct cpumask *cpumask)
 {
-	if (!smp_cpu_mtid && MACHINE_HAS_DIAG44) {
-		diag_stat_inc(DIAG_STAT_X044);
-		asm volatile("diag 0,0,0x44");
+	int cpu, this_cpu;
+
+	this_cpu = smp_processor_id();
+	if (__this_cpu_inc_return(cpu_relax_retry) >= spin_retry) {
+		__this_cpu_write(cpu_relax_retry, 0);
+		cpu = cpumask_next_wrap(this_cpu, cpumask, this_cpu, false);
+		if (cpu >= nr_cpu_ids)
+			return;
+		if (arch_vcpu_is_preempted(cpu))
+			smp_yield_cpu(cpu);
 	}
-	barrier();
 }
 EXPORT_SYMBOL(cpu_relax_yield);
 
diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c
index f00955940694..44974654cbd0 100644
--- a/arch/s390/kernel/smp.c
+++ b/arch/s390/kernel/smp.c
@@ -414,7 +414,7 @@ void smp_yield_cpu(int cpu)
 		diag_stat_inc_norecursion(DIAG_STAT_X09C);
 		asm volatile("diag %0,0,0x9c"
 			     : : "d" (pcpu_devices[cpu].address));
-	} else if (MACHINE_HAS_DIAG44) {
+	} else if (MACHINE_HAS_DIAG44 && !smp_cpu_mtid) {
 		diag_stat_inc_norecursion(DIAG_STAT_X044);
 		asm volatile("diag 0,0,0x44");
 	}
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 11837410690f..1f9f3160da7e 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1519,7 +1519,7 @@ static inline int set_cpus_allowed_ptr(struct task_struct *p, const struct cpuma
 #endif
 
 #ifndef cpu_relax_yield
-#define cpu_relax_yield() cpu_relax()
+#define cpu_relax_yield(cpumask) cpu_relax()
 #endif
 
 extern int yield_to(struct task_struct *p, bool preempt);
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 2b5a6754646f..b8b0c5ff8da9 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -183,6 +183,7 @@ static int multi_cpu_stop(void *data)
 	struct multi_stop_data *msdata = data;
 	enum multi_stop_state curstate = MULTI_STOP_NONE;
 	int cpu = smp_processor_id(), err = 0;
+	const struct cpumask *cpumask;
 	unsigned long flags;
 	bool is_active;
 
@@ -192,15 +193,18 @@ static int multi_cpu_stop(void *data)
 	 */
 	local_save_flags(flags);
 
-	if (!msdata->active_cpus)
-		is_active = cpu == cpumask_first(cpu_online_mask);
-	else
-		is_active = cpumask_test_cpu(cpu, msdata->active_cpus);
+	if (!msdata->active_cpus) {
+		cpumask = cpu_online_mask;
+		is_active = cpu == cpumask_first(cpumask);
+	} else {
+		cpumask = msdata->active_cpus;
+		is_active = cpumask_test_cpu(cpu, cpumask);
+	}
 
 	/* Simple state machine */
 	do {
 		/* Chill out and ensure we re-read multi_stop_state. */
-		cpu_relax_yield();
+		cpu_relax_yield(cpumask);
 		if (msdata->state != curstate) {
 			curstate = msdata->state;
 			switch (curstate) {
-- 
cgit v1.2.3


From 4ecf0a43e729a7e641d800c294faabe87378fc05 Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Sat, 8 Jun 2019 12:13:57 +0200
Subject: processor: get rid of cpu_relax_yield

stop_machine is the only user left of cpu_relax_yield. Given that it
now has special semantics which are tied to stop_machine, introduce a
weak stop_machine_yield function which architectures can override, and
get rid of the generic cpu_relax_yield implementation.

Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
---
 arch/s390/include/asm/processor.h | 6 ------
 arch/s390/kernel/processor.c      | 4 ++--
 include/linux/sched.h             | 4 ----
 include/linux/stop_machine.h      | 1 +
 kernel/stop_machine.c             | 7 ++++++-
 5 files changed, 9 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/arch/s390/include/asm/processor.h b/arch/s390/include/asm/processor.h
index 445ce9ee4404..14883b1562e0 100644
--- a/arch/s390/include/asm/processor.h
+++ b/arch/s390/include/asm/processor.h
@@ -222,12 +222,6 @@ static __no_kasan_or_inline unsigned short stap(void)
 	return cpu_address;
 }
 
-/*
- * Give up the time slice of the virtual PU.
- */
-#define cpu_relax_yield cpu_relax_yield
-void cpu_relax_yield(const struct cpumask *cpumask);
-
 #define cpu_relax() barrier()
 
 #define ECAG_CACHE_ATTRIBUTE	0
diff --git a/arch/s390/kernel/processor.c b/arch/s390/kernel/processor.c
index 4cdaefec1b7c..6ebc2117c66c 100644
--- a/arch/s390/kernel/processor.c
+++ b/arch/s390/kernel/processor.c
@@ -7,6 +7,7 @@
 #define KMSG_COMPONENT "cpu"
 #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
 
+#include <linux/stop_machine.h>
 #include <linux/cpufeature.h>
 #include <linux/bitops.h>
 #include <linux/kernel.h>
@@ -59,7 +60,7 @@ void s390_update_cpu_mhz(void)
 		on_each_cpu(update_cpu_mhz, NULL, 0);
 }
 
-void notrace cpu_relax_yield(const struct cpumask *cpumask)
+void notrace stop_machine_yield(const struct cpumask *cpumask)
 {
 	int cpu, this_cpu;
 
@@ -73,7 +74,6 @@ void notrace cpu_relax_yield(const struct cpumask *cpumask)
 			smp_yield_cpu(cpu);
 	}
 }
-EXPORT_SYMBOL(cpu_relax_yield);
 
 /*
  * cpu_init - initializes state that is per-CPU.
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 1f9f3160da7e..911675416b05 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1518,10 +1518,6 @@ static inline int set_cpus_allowed_ptr(struct task_struct *p, const struct cpuma
 }
 #endif
 
-#ifndef cpu_relax_yield
-#define cpu_relax_yield(cpumask) cpu_relax()
-#endif
-
 extern int yield_to(struct task_struct *p, bool preempt);
 extern void set_user_nice(struct task_struct *p, long nice);
 extern int task_prio(const struct task_struct *p);
diff --git a/include/linux/stop_machine.h b/include/linux/stop_machine.h
index 6d3635c86dbe..f9a0c6189852 100644
--- a/include/linux/stop_machine.h
+++ b/include/linux/stop_machine.h
@@ -36,6 +36,7 @@ int stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg);
 int try_stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg);
 void stop_machine_park(int cpu);
 void stop_machine_unpark(int cpu);
+void stop_machine_yield(const struct cpumask *cpumask);
 
 #else	/* CONFIG_SMP */
 
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index b8b0c5ff8da9..b4f83f7bdf86 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -177,6 +177,11 @@ static void ack_state(struct multi_stop_data *msdata)
 		set_state(msdata, msdata->state + 1);
 }
 
+void __weak stop_machine_yield(const struct cpumask *cpumask)
+{
+	cpu_relax();
+}
+
 /* This is the cpu_stop function which stops the CPU. */
 static int multi_cpu_stop(void *data)
 {
@@ -204,7 +209,7 @@ static int multi_cpu_stop(void *data)
 	/* Simple state machine */
 	do {
 		/* Chill out and ensure we re-read multi_stop_state. */
-		cpu_relax_yield(cpumask);
+		stop_machine_yield(cpumask);
 		if (msdata->state != curstate) {
 			curstate = msdata->state;
 			switch (curstate) {
-- 
cgit v1.2.3


From 4838a54050284daac15dfeb1d65677e4dacf1bf5 Mon Sep 17 00:00:00 2001
From: Jose Abreu <Jose.Abreu@synopsys.com>
Date: Fri, 14 Jun 2019 17:06:57 +0200
Subject: net: stmmac: Fix wrapper drivers not detecting PHY

Because of PHYLINK conversion we stopped parsing the phy-handle property
from DT. Unfortunately, some wrapper drivers still rely on this phy
node to configure the PHY.

Let's restore the parsing of PHY handle while these wrapper drivers are
not fully converted to PHYLINK.

Fixes: 74371272f97f ("net: stmmac: Convert to phylink and remove phylib logic")
Reported-by: Corentin Labbe <clabbe.montjoie@gmail.com>
Signed-off-by: Jose Abreu <joabreu@synopsys.com>
Cc: Joao Pinto <jpinto@synopsys.com>
Cc: David S. Miller <davem@davemloft.net>
Cc: Giuseppe Cavallaro <peppe.cavallaro@st.com>
Cc: Alexandre Torgue <alexandre.torgue@st.com>
Tested-by: Corentin Labbe <clabbe.montjoie@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/stmicro/stmmac/stmmac_main.c     | 4 ++--
 drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c | 9 ++++++++-
 include/linux/stmmac.h                                | 1 +
 3 files changed, 11 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
index ad007d8bf9d7..069951590018 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
@@ -958,7 +958,7 @@ static int stmmac_init_phy(struct net_device *dev)
 	struct device_node *node;
 	int ret;
 
-	node = priv->plat->phy_node;
+	node = priv->plat->phylink_node;
 
 	if (node) {
 		ret = phylink_of_phy_connect(priv->phylink, node, 0);
@@ -980,7 +980,7 @@ static int stmmac_init_phy(struct net_device *dev)
 
 static int stmmac_phy_setup(struct stmmac_priv *priv)
 {
-	struct device_node *node = priv->plat->phy_node;
+	struct device_node *node = priv->plat->phylink_node;
 	int mode = priv->plat->interface;
 	struct phylink *phylink;
 
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c
index 898f94aced53..49adda9b0ad8 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c
@@ -381,7 +381,13 @@ stmmac_probe_config_dt(struct platform_device *pdev, const char **mac)
 
 	*mac = of_get_mac_address(np);
 	plat->interface = of_get_phy_mode(np);
-	plat->phy_node = np;
+
+	/* Some wrapper drivers still rely on phy_node. Let's save it while
+	 * they are not converted to phylink. */
+	plat->phy_node = of_parse_phandle(np, "phy-handle", 0);
+
+	/* PHYLINK automatically parses the phy-handle property */
+	plat->phylink_node = np;
 
 	/* Get max speed of operation from device tree */
 	if (of_property_read_u32(np, "max-speed", &plat->max_speed))
@@ -577,6 +583,7 @@ error_pclk_get:
 void stmmac_remove_config_dt(struct platform_device *pdev,
 			     struct plat_stmmacenet_data *plat)
 {
+	of_node_put(plat->phy_node);
 	of_node_put(plat->mdio_node);
 }
 #else
diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h
index 816edb545592..a3c2d9945bcf 100644
--- a/include/linux/stmmac.h
+++ b/include/linux/stmmac.h
@@ -151,6 +151,7 @@ struct plat_stmmacenet_data {
 	int interface;
 	struct stmmac_mdio_bus_data *mdio_bus_data;
 	struct device_node *phy_node;
+	struct device_node *phylink_node;
 	struct device_node *mdio_node;
 	struct stmmac_dma_cfg *dma_cfg;
 	int clk_csr;
-- 
cgit v1.2.3


From 82b11f071936a11094e1c44730030cd3d894e0b4 Mon Sep 17 00:00:00 2001
From: Maor Gottlieb <maorg@mellanox.com>
Date: Wed, 12 Jun 2019 15:20:12 +0300
Subject: net/mlx5: Expose eswitch encap mode

Add API to get the current Eswitch encap mode.
It will be used in downstream patches to check if
flow table can be created with encap support or not.

Signed-off-by: Maor Gottlieb <maorg@mellanox.com>
Reviewed-by: Petr Vorel <pvorel@suse.cz>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
Reviewed-by: Parav Pandit <parav@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/eswitch.c | 11 +++++++++++
 include/linux/mlx5/eswitch.h                      | 12 ++++++++++++
 2 files changed, 23 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
index a4df109fbeb7..12010f85fa35 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
@@ -2457,6 +2457,17 @@ u8 mlx5_eswitch_mode(struct mlx5_eswitch *esw)
 }
 EXPORT_SYMBOL_GPL(mlx5_eswitch_mode);
 
+enum devlink_eswitch_encap_mode
+mlx5_eswitch_get_encap_mode(const struct mlx5_core_dev *dev)
+{
+	struct mlx5_eswitch *esw;
+
+	esw = dev->priv.eswitch;
+	return ESW_ALLOWED(esw) ? esw->offloads.encap :
+		DEVLINK_ESWITCH_ENCAP_MODE_NONE;
+}
+EXPORT_SYMBOL(mlx5_eswitch_get_encap_mode);
+
 bool mlx5_esw_lag_prereq(struct mlx5_core_dev *dev0, struct mlx5_core_dev *dev1)
 {
 	if ((dev0->priv.eswitch->mode == SRIOV_NONE &&
diff --git a/include/linux/mlx5/eswitch.h b/include/linux/mlx5/eswitch.h
index d81ee4df181c..174eec0871d9 100644
--- a/include/linux/mlx5/eswitch.h
+++ b/include/linux/mlx5/eswitch.h
@@ -7,6 +7,7 @@
 #define _MLX5_ESWITCH_
 
 #include <linux/mlx5/driver.h>
+#include <net/devlink.h>
 
 #define MLX5_ESWITCH_MANAGER(mdev) MLX5_CAP_GEN(mdev, eswitch_manager)
 
@@ -62,4 +63,15 @@ u8 mlx5_eswitch_mode(struct mlx5_eswitch *esw);
 struct mlx5_flow_handle *
 mlx5_eswitch_add_send_to_vport_rule(struct mlx5_eswitch *esw,
 				    int vport, u32 sqn);
+
+#ifdef CONFIG_MLX5_ESWITCH
+enum devlink_eswitch_encap_mode
+mlx5_eswitch_get_encap_mode(const struct mlx5_core_dev *dev);
+#else  /* CONFIG_MLX5_ESWITCH */
+static inline enum devlink_eswitch_encap_mode
+mlx5_eswitch_get_encap_mode(const struct mlx5_core_dev *dev)
+{
+	return DEVLINK_ESWITCH_ENCAP_MODE_NONE;
+}
+#endif /* CONFIG_MLX5_ESWITCH */
 #endif
-- 
cgit v1.2.3


From 7e770b252a62e7498cfa9411018100fd86e56d47 Mon Sep 17 00:00:00 2001
From: Martin Blumenstingl <martin.blumenstingl@googlemail.com>
Date: Sat, 15 Jun 2019 12:09:30 +0200
Subject: net: stmmac: drop the reset GPIO from struct stmmac_mdio_bus_data

No platform uses the "reset_gpio" field from stmmac_mdio_bus_data
anymore. Drop it so we don't get any new consumers either.

Plain GPIO numbers are being deprecated in favor of GPIO descriptors. If
needed any new non-OF platform can add a GPIO descriptor lookup table.
devm_gpiod_get_optional() will find the GPIO in that case.

Suggested-by: Linus Walleij <linus.walleij@linaro.org>
Signed-off-by: Martin Blumenstingl <martin.blumenstingl@googlemail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c | 29 +++++++----------------
 include/linux/stmmac.h                            |  1 -
 2 files changed, 9 insertions(+), 21 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c
index 4614f1f2bffb..459ef8afe4fb 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c
@@ -253,21 +253,15 @@ int stmmac_mdio_reset(struct mii_bus *bus)
 	if (priv->device->of_node) {
 		struct gpio_desc *reset_gpio;
 
-		if (data->reset_gpio < 0) {
-			reset_gpio = devm_gpiod_get_optional(priv->device,
-							     "snps,reset",
-							     GPIOD_OUT_LOW);
-			if (IS_ERR(reset_gpio))
-				return PTR_ERR(reset_gpio);
-
-			device_property_read_u32_array(priv->device,
-						       "snps,reset-delays-us",
-						       data->delays, 3);
-		} else {
-			reset_gpio = gpio_to_desc(data->reset_gpio);
-
-			gpiod_direction_output(reset_gpio, 0);
-		}
+		reset_gpio = devm_gpiod_get_optional(priv->device,
+						     "snps,reset",
+						     GPIOD_OUT_LOW);
+		if (IS_ERR(reset_gpio))
+			return PTR_ERR(reset_gpio);
+
+		device_property_read_u32_array(priv->device,
+					       "snps,reset-delays-us",
+					       data->delays, 3);
 
 		if (data->delays[0])
 			msleep(DIV_ROUND_UP(data->delays[0], 1000));
@@ -323,11 +317,6 @@ int stmmac_mdio_register(struct net_device *ndev)
 	if (mdio_bus_data->irqs)
 		memcpy(new_bus->irq, mdio_bus_data->irqs, sizeof(new_bus->irq));
 
-#ifdef CONFIG_OF
-	if (priv->device->of_node)
-		mdio_bus_data->reset_gpio = -1;
-#endif
-
 	new_bus->name = "stmmac";
 
 	if (priv->plat->has_xgmac) {
diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h
index a3c2d9945bcf..a0cc6fa4965b 100644
--- a/include/linux/stmmac.h
+++ b/include/linux/stmmac.h
@@ -97,7 +97,6 @@ struct stmmac_mdio_bus_data {
 	int *irqs;
 	int probed_phy_irq;
 #ifdef CONFIG_OF
-	int reset_gpio;
 	u32 delays[3];
 #endif
 };
-- 
cgit v1.2.3


From ce4ab73ab0c27c6a3853695aa8ec0f453c6329cd Mon Sep 17 00:00:00 2001
From: Martin Blumenstingl <martin.blumenstingl@googlemail.com>
Date: Sat, 15 Jun 2019 12:09:31 +0200
Subject: net: stmmac: drop the reset delays from struct stmmac_mdio_bus_data

Only OF platforms use the reset delays and these delays are only read in
stmmac_mdio_reset(). Move them from struct stmmac_mdio_bus_data to a
stack variable inside stmmac_mdio_reset() because that's the only usage
of these delays.

Signed-off-by: Martin Blumenstingl <martin.blumenstingl@googlemail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c | 15 ++++++++-------
 include/linux/stmmac.h                            |  3 ---
 2 files changed, 8 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c
index 459ef8afe4fb..c9454cf4f189 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c
@@ -252,6 +252,7 @@ int stmmac_mdio_reset(struct mii_bus *bus)
 #ifdef CONFIG_OF
 	if (priv->device->of_node) {
 		struct gpio_desc *reset_gpio;
+		u32 delays[3];
 
 		reset_gpio = devm_gpiod_get_optional(priv->device,
 						     "snps,reset",
@@ -261,18 +262,18 @@ int stmmac_mdio_reset(struct mii_bus *bus)
 
 		device_property_read_u32_array(priv->device,
 					       "snps,reset-delays-us",
-					       data->delays, 3);
+					       delays, ARRAY_SIZE(delays));
 
-		if (data->delays[0])
-			msleep(DIV_ROUND_UP(data->delays[0], 1000));
+		if (delays[0])
+			msleep(DIV_ROUND_UP(delays[0], 1000));
 
 		gpiod_set_value_cansleep(reset_gpio, 1);
-		if (data->delays[1])
-			msleep(DIV_ROUND_UP(data->delays[1], 1000));
+		if (delays[1])
+			msleep(DIV_ROUND_UP(delays[1], 1000));
 
 		gpiod_set_value_cansleep(reset_gpio, 0);
-		if (data->delays[2])
-			msleep(DIV_ROUND_UP(data->delays[2], 1000));
+		if (delays[2])
+			msleep(DIV_ROUND_UP(delays[2], 1000));
 	}
 #endif
 
diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h
index a0cc6fa4965b..7c8328edd501 100644
--- a/include/linux/stmmac.h
+++ b/include/linux/stmmac.h
@@ -96,9 +96,6 @@ struct stmmac_mdio_bus_data {
 	unsigned int phy_mask;
 	int *irqs;
 	int probed_phy_irq;
-#ifdef CONFIG_OF
-	u32 delays[3];
-#endif
 };
 
 struct stmmac_dma_cfg {
-- 
cgit v1.2.3


From fead5b1b5838ba2f231d76e1b8ed31a4e9449382 Mon Sep 17 00:00:00 2001
From: Martin Blumenstingl <martin.blumenstingl@googlemail.com>
Date: Sat, 15 Jun 2019 12:09:32 +0200
Subject: net: stmmac: drop the phy_reset hook from struct stmmac_mdio_bus_data

The phy_reset hook is not set anywhere. Drop it to make
stmmac_mdio_reset() smaller.

Signed-off-by: Martin Blumenstingl <martin.blumenstingl@googlemail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c | 6 ------
 include/linux/stmmac.h                            | 1 -
 2 files changed, 7 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c
index c9454cf4f189..14aa3ee14082 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c
@@ -247,7 +247,6 @@ int stmmac_mdio_reset(struct mii_bus *bus)
 	struct net_device *ndev = bus->priv;
 	struct stmmac_priv *priv = netdev_priv(ndev);
 	unsigned int mii_address = priv->hw->mii.addr;
-	struct stmmac_mdio_bus_data *data = priv->plat->mdio_bus_data;
 
 #ifdef CONFIG_OF
 	if (priv->device->of_node) {
@@ -277,11 +276,6 @@ int stmmac_mdio_reset(struct mii_bus *bus)
 	}
 #endif
 
-	if (data->phy_reset) {
-		netdev_dbg(ndev, "stmmac_mdio_reset: calling phy_reset\n");
-		data->phy_reset(priv->plat->bsp_priv);
-	}
-
 	/* This is a workaround for problems with the STE101P PHY.
 	 * It doesn't complete its reset until at least one clock cycle
 	 * on MDC, so perform a dummy mdio read. To be updated for GMAC4
diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h
index 7c8328edd501..6dfb5aa75b0c 100644
--- a/include/linux/stmmac.h
+++ b/include/linux/stmmac.h
@@ -92,7 +92,6 @@
 /* Platfrom data for platform device structure's platform_data field */
 
 struct stmmac_mdio_bus_data {
-	int (*phy_reset)(void *priv);
 	unsigned int phy_mask;
 	int *irqs;
 	int probed_phy_irq;
-- 
cgit v1.2.3


From 013e868bc9465452c7b667830712ab57de236d08 Mon Sep 17 00:00:00 2001
From: Keerthy <j-keerthy@ti.com>
Date: Wed, 15 May 2019 15:38:47 +0530
Subject: mfd: lp87565: Add support for 4-phase LP87561 combination

Add support for 4-phase LP87561 combination.

Data Sheet: https://www.ti.com/lit/ds/symlink/lp87561-q1.pdf

Signed-off-by: Keerthy <j-keerthy@ti.com>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 drivers/mfd/lp87565.c       | 4 ++++
 include/linux/mfd/lp87565.h | 2 ++
 2 files changed, 6 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/mfd/lp87565.c b/drivers/mfd/lp87565.c
index 32d2a07d4354..8ad688fe75f9 100644
--- a/drivers/mfd/lp87565.c
+++ b/drivers/mfd/lp87565.c
@@ -33,6 +33,10 @@ static const struct of_device_id of_lp87565_match_table[] = {
 		.compatible = "ti,lp87565-q1",
 		.data = (void *)LP87565_DEVICE_TYPE_LP87565_Q1,
 	},
+	{
+		.compatible = "ti,lp87561-q1",
+		.data = (void *)LP87565_DEVICE_TYPE_LP87561_Q1,
+	},
 	{}
 };
 MODULE_DEVICE_TABLE(of, of_lp87565_match_table);
diff --git a/include/linux/mfd/lp87565.h b/include/linux/mfd/lp87565.h
index d0c91ba65525..976447607ea2 100644
--- a/include/linux/mfd/lp87565.h
+++ b/include/linux/mfd/lp87565.h
@@ -17,6 +17,7 @@
 
 enum lp87565_device_type {
 	LP87565_DEVICE_TYPE_UNKNOWN	= 0,
+	LP87565_DEVICE_TYPE_LP87561_Q1,
 	LP87565_DEVICE_TYPE_LP87565_Q1,
 };
 
@@ -249,6 +250,7 @@ enum LP87565_regulator_id {
 	LP87565_BUCK_3,
 	LP87565_BUCK_10,
 	LP87565_BUCK_23,
+	LP87565_BUCK_3210,
 };
 
 /**
-- 
cgit v1.2.3


From c2ba8a15f310d915f8748dd8324c91c82b12b5ff Mon Sep 17 00:00:00 2001
From: Daniel Bristot de Oliveira <bristot@redhat.com>
Date: Wed, 12 Jun 2019 11:57:30 +0200
Subject: jump_label: Batch updates if arch supports it

If the architecture supports the batching of jump label updates, use it!

An easy way to see the benefits of this patch is switching the
schedstats on and off. For instance:

-------------------------- %< ----------------------------
  #!/bin/sh
  while [ true ]; do
      sysctl -w kernel.sched_schedstats=1
      sleep 2
      sysctl -w kernel.sched_schedstats=0
      sleep 2
  done
-------------------------- >% ----------------------------

while watching the IPI count:

-------------------------- %< ----------------------------
  # watch -n1 "cat /proc/interrupts | grep Function"
-------------------------- >% ----------------------------

With the current mode, it is possible to see +- 168 IPIs each 2 seconds,
while with this patch the number of IPIs goes to 3 each 2 seconds.

Regarding the performance impact of this patch set, I made two measurements:

    The time to update a key (the task that is causing the change)
    The time to run the int3 handler (the side effect on a thread that
                                      hits the code being changed)

The schedstats static key was chosen as the key to be switched on and off.
The reason is that it is used in more than 56 places, in a hot path. The
change in the schedstats static key will be done with the following command:

while [ true ]; do
    sysctl -w kernel.sched_schedstats=1
    usleep 500000
    sysctl -w kernel.sched_schedstats=0
    usleep 500000
done

In this way, the key will be updated twice per second. To force the hit of the
int3 handler, the system will also run a kernel compilation with two jobs per
CPU. The test machine is a two nodes/24 CPUs box with an Intel Xeon processor
@2.27GHz.

Regarding the update part, on average, the regular kernel takes 57 ms to update
the schedstats key, while the kernel with the batch updates takes just 1.4 ms
on average. Although it seems to be too good to be true, it makes sense: the
schedstats key is used in 56 places, so it was expected that it would take
around 56 times to update the keys with the current implementation, as the
IPIs are the most expensive part of the update.

Regarding the int3 handler, the non-batch handler takes 45 ns on average, while
the batch version takes around 180 ns. At first glance, it seems to be a high
value. But it is not, considering that it is doing 56 updates, rather than one!
It is taking four times more, only. This gain is possible because the patch
uses a binary search in the vector: log2(56)=5.8. So, it was expected to have
an overhead within four times.

(voice of tv propaganda) But, that is not all! As the int3 handler keeps on for
a shorter period (because the update part is on for a shorter time), the number
of hits in the int3 handler decreased by 10%.

The question then is: Is it worth paying the price of "135 ns" more in the int3
handler?

Considering that, in this test case, we are saving the handling of 53 IPIs,
that takes more than these 135 ns, it seems to be a meager price to be paid.
Moreover, the test case was forcing hits of the int3 handler; in practice, this
does not happen that often, while the IPI takes place on all CPUs, whether they
hit the int3 handler or not!

For instance, on an isolated CPU with a process running in user-space
(nohz_full use-case), the chance of hitting the int3 handler is nearly zero,
while there is no way to avoid the IPIs. By bounding the IPIs, we improve
this scenario a lot.

Signed-off-by: Daniel Bristot de Oliveira <bristot@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Chris von Recklinghausen <crecklin@redhat.com>
Cc: Clark Williams <williams@redhat.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Jason Baron <jbaron@akamai.com>
Cc: Jiri Kosina <jkosina@suse.cz>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Scott Wood <swood@redhat.com>
Cc: Steven Rostedt (VMware) <rostedt@goodmis.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: https://lkml.kernel.org/r/acc891dbc2dbc9fd616dd680529a2337b1d1274c.1560325897.git.bristot@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/jump_label.h |  3 +++
 kernel/jump_label.c        | 23 +++++++++++++++++++++++
 2 files changed, 26 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/jump_label.h b/include/linux/jump_label.h
index 3e113a1fa0f1..3526c0aee954 100644
--- a/include/linux/jump_label.h
+++ b/include/linux/jump_label.h
@@ -215,6 +215,9 @@ extern void arch_jump_label_transform(struct jump_entry *entry,
 				      enum jump_label_type type);
 extern void arch_jump_label_transform_static(struct jump_entry *entry,
 					     enum jump_label_type type);
+extern bool arch_jump_label_transform_queue(struct jump_entry *entry,
+					    enum jump_label_type type);
+extern void arch_jump_label_transform_apply(void);
 extern int jump_label_text_reserved(void *start, void *end);
 extern void static_key_slow_inc(struct static_key *key);
 extern void static_key_slow_dec(struct static_key *key);
diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index ca00ac10d9b9..df3008419a1d 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -414,6 +414,7 @@ static bool jump_label_can_update(struct jump_entry *entry, bool init)
 	return true;
 }
 
+#ifndef HAVE_JUMP_LABEL_BATCH
 static void __jump_label_update(struct static_key *key,
 				struct jump_entry *entry,
 				struct jump_entry *stop,
@@ -424,6 +425,28 @@ static void __jump_label_update(struct static_key *key,
 			arch_jump_label_transform(entry, jump_label_type(entry));
 	}
 }
+#else
+static void __jump_label_update(struct static_key *key,
+				struct jump_entry *entry,
+				struct jump_entry *stop,
+				bool init)
+{
+	for (; (entry < stop) && (jump_entry_key(entry) == key); entry++) {
+
+		if (!jump_label_can_update(entry, init))
+			continue;
+
+		if (!arch_jump_label_transform_queue(entry, jump_label_type(entry))) {
+			/*
+			 * Queue is full: Apply the current queue and try again.
+			 */
+			arch_jump_label_transform_apply();
+			BUG_ON(!arch_jump_label_transform_queue(entry, jump_label_type(entry)));
+		}
+	}
+	arch_jump_label_transform_apply();
+}
+#endif
 
 void __init jump_label_init(void)
 {
-- 
cgit v1.2.3


From 9ffbe8ac05dbb4ab4a4836a55a47fc6be945a38f Mon Sep 17 00:00:00 2001
From: Nikolay Borisov <nborisov@suse.com>
Date: Fri, 31 May 2019 13:06:51 +0300
Subject: locking/lockdep: Rename lockdep_assert_held_exclusive() ->
 lockdep_assert_held_write()

All callers of lockdep_assert_held_exclusive() use it to verify the
correct locking state of either a semaphore (ldisc_sem in tty,
mmap_sem for perf events, i_rwsem of inode for dax) or rwlock by
apparmor. Thus it makes sense to rename _exclusive to _write since
that's the semantics callers care about. Additionally there is already
lockdep_assert_held_read(), which this new naming is more consistent with.

No functional changes.

Signed-off-by: Nikolay Borisov <nborisov@suse.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: https://lkml.kernel.org/r/20190531100651.3969-1-nborisov@suse.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/events/core.c           | 2 +-
 drivers/infiniband/core/device.c | 2 +-
 drivers/tty/tty_ldisc.c          | 8 ++++----
 fs/dax.c                         | 2 +-
 include/linux/lockdep.h          | 4 ++--
 security/apparmor/label.c        | 8 ++++----
 6 files changed, 13 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index f315425d8468..cf91d80b8452 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -2179,7 +2179,7 @@ static void x86_pmu_event_mapped(struct perf_event *event, struct mm_struct *mm)
 	 * For now, this can't happen because all callers hold mmap_sem
 	 * for write.  If this changes, we'll need a different solution.
 	 */
-	lockdep_assert_held_exclusive(&mm->mmap_sem);
+	lockdep_assert_held_write(&mm->mmap_sem);
 
 	if (atomic_inc_return(&mm->context.perf_rdpmc_allowed) == 1)
 		on_each_cpu_mask(mm_cpumask(mm), refresh_pce, NULL, 1);
diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c
index 29f7b15c81d9..d020bb4d03d5 100644
--- a/drivers/infiniband/core/device.c
+++ b/drivers/infiniband/core/device.c
@@ -457,7 +457,7 @@ static int alloc_name(struct ib_device *ibdev, const char *name)
 	int rc;
 	int i;
 
-	lockdep_assert_held_exclusive(&devices_rwsem);
+	lockdep_assert_held_write(&devices_rwsem);
 	ida_init(&inuse);
 	xa_for_each (&devices, index, device) {
 		char buf[IB_DEVICE_NAME_MAX];
diff --git a/drivers/tty/tty_ldisc.c b/drivers/tty/tty_ldisc.c
index e38f104db174..fde8d4073e74 100644
--- a/drivers/tty/tty_ldisc.c
+++ b/drivers/tty/tty_ldisc.c
@@ -487,7 +487,7 @@ static int tty_ldisc_open(struct tty_struct *tty, struct tty_ldisc *ld)
 
 static void tty_ldisc_close(struct tty_struct *tty, struct tty_ldisc *ld)
 {
-	lockdep_assert_held_exclusive(&tty->ldisc_sem);
+	lockdep_assert_held_write(&tty->ldisc_sem);
 	WARN_ON(!test_bit(TTY_LDISC_OPEN, &tty->flags));
 	clear_bit(TTY_LDISC_OPEN, &tty->flags);
 	if (ld->ops->close)
@@ -509,7 +509,7 @@ static int tty_ldisc_failto(struct tty_struct *tty, int ld)
 	struct tty_ldisc *disc = tty_ldisc_get(tty, ld);
 	int r;
 
-	lockdep_assert_held_exclusive(&tty->ldisc_sem);
+	lockdep_assert_held_write(&tty->ldisc_sem);
 	if (IS_ERR(disc))
 		return PTR_ERR(disc);
 	tty->ldisc = disc;
@@ -633,7 +633,7 @@ EXPORT_SYMBOL_GPL(tty_set_ldisc);
  */
 static void tty_ldisc_kill(struct tty_struct *tty)
 {
-	lockdep_assert_held_exclusive(&tty->ldisc_sem);
+	lockdep_assert_held_write(&tty->ldisc_sem);
 	if (!tty->ldisc)
 		return;
 	/*
@@ -681,7 +681,7 @@ int tty_ldisc_reinit(struct tty_struct *tty, int disc)
 	struct tty_ldisc *ld;
 	int retval;
 
-	lockdep_assert_held_exclusive(&tty->ldisc_sem);
+	lockdep_assert_held_write(&tty->ldisc_sem);
 	ld = tty_ldisc_get(tty, disc);
 	if (IS_ERR(ld)) {
 		BUG_ON(disc == N_TTY);
diff --git a/fs/dax.c b/fs/dax.c
index 2e48c7ebb973..bf8686d48b2d 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -1188,7 +1188,7 @@ dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter,
 	unsigned flags = 0;
 
 	if (iov_iter_rw(iter) == WRITE) {
-		lockdep_assert_held_exclusive(&inode->i_rwsem);
+		lockdep_assert_held_write(&inode->i_rwsem);
 		flags |= IOMAP_WRITE;
 	} else {
 		lockdep_assert_held(&inode->i_rwsem);
diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h
index 30a0f81aa130..151d55711082 100644
--- a/include/linux/lockdep.h
+++ b/include/linux/lockdep.h
@@ -394,7 +394,7 @@ extern void lock_unpin_lock(struct lockdep_map *lock, struct pin_cookie);
 		WARN_ON(debug_locks && !lockdep_is_held(l));	\
 	} while (0)
 
-#define lockdep_assert_held_exclusive(l)	do {			\
+#define lockdep_assert_held_write(l)	do {			\
 		WARN_ON(debug_locks && !lockdep_is_held_type(l, 0));	\
 	} while (0)
 
@@ -479,7 +479,7 @@ struct lockdep_map { };
 #define lockdep_is_held_type(l, r)		(1)
 
 #define lockdep_assert_held(l)			do { (void)(l); } while (0)
-#define lockdep_assert_held_exclusive(l)	do { (void)(l); } while (0)
+#define lockdep_assert_held_write(l)	do { (void)(l); } while (0)
 #define lockdep_assert_held_read(l)		do { (void)(l); } while (0)
 #define lockdep_assert_held_once(l)		do { (void)(l); } while (0)
 
diff --git a/security/apparmor/label.c b/security/apparmor/label.c
index 068e93c5d29c..59f1cc2557a7 100644
--- a/security/apparmor/label.c
+++ b/security/apparmor/label.c
@@ -76,7 +76,7 @@ void __aa_proxy_redirect(struct aa_label *orig, struct aa_label *new)
 
 	AA_BUG(!orig);
 	AA_BUG(!new);
-	lockdep_assert_held_exclusive(&labels_set(orig)->lock);
+	lockdep_assert_held_write(&labels_set(orig)->lock);
 
 	tmp = rcu_dereference_protected(orig->proxy->label,
 					&labels_ns(orig)->lock);
@@ -566,7 +566,7 @@ static bool __label_remove(struct aa_label *label, struct aa_label *new)
 
 	AA_BUG(!ls);
 	AA_BUG(!label);
-	lockdep_assert_held_exclusive(&ls->lock);
+	lockdep_assert_held_write(&ls->lock);
 
 	if (new)
 		__aa_proxy_redirect(label, new);
@@ -603,7 +603,7 @@ static bool __label_replace(struct aa_label *old, struct aa_label *new)
 	AA_BUG(!ls);
 	AA_BUG(!old);
 	AA_BUG(!new);
-	lockdep_assert_held_exclusive(&ls->lock);
+	lockdep_assert_held_write(&ls->lock);
 	AA_BUG(new->flags & FLAG_IN_TREE);
 
 	if (!label_is_stale(old))
@@ -640,7 +640,7 @@ static struct aa_label *__label_insert(struct aa_labelset *ls,
 	AA_BUG(!ls);
 	AA_BUG(!label);
 	AA_BUG(labels_set(label) != ls);
-	lockdep_assert_held_exclusive(&ls->lock);
+	lockdep_assert_held_write(&ls->lock);
 	AA_BUG(label->flags & FLAG_IN_TREE);
 
 	/* Figure out where to put new node */
-- 
cgit v1.2.3


From c71fd893f614f205dbc050d60299cc5496491c19 Mon Sep 17 00:00:00 2001
From: Waiman Long <longman@redhat.com>
Date: Mon, 20 May 2019 16:59:00 -0400
Subject: locking/rwsem: Make owner available even if
 !CONFIG_RWSEM_SPIN_ON_OWNER

The owner field in the rw_semaphore structure is used primarily for
optimistic spinning. However, identifying the rwsem owner can also be
helpful in debugging as well as tracing locking related issues when
analyzing a crash dump. The owner field may also store state information
that can be important to the operation of the rwsem.

So the owner field is now made a permanent member of the rw_semaphore
structure irrespective of CONFIG_RWSEM_SPIN_ON_OWNER.

Signed-off-by: Waiman Long <longman@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Tim Chen <tim.c.chen@linux.intel.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: huang ying <huang.ying.caritas@gmail.com>
Link: https://lkml.kernel.org/r/20190520205918.22251-2-longman@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/rwsem.h       |  9 +++++----
 kernel/locking/rwsem-xadd.c |  2 +-
 kernel/locking/rwsem.h      | 23 -----------------------
 lib/Kconfig.debug           |  8 ++++----
 4 files changed, 10 insertions(+), 32 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h
index 2ea18a3def04..148983e21d47 100644
--- a/include/linux/rwsem.h
+++ b/include/linux/rwsem.h
@@ -34,12 +34,12 @@
  */
 struct rw_semaphore {
 	atomic_long_t count;
-#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
 	/*
-	 * Write owner. Used as a speculative check to see
-	 * if the owner is running on the cpu.
+	 * Write owner or one of the read owners. Can be used as a
+	 * speculative check to see if the owner is running on the cpu.
 	 */
 	struct task_struct *owner;
+#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
 	struct optimistic_spin_queue osq; /* spinner MCS lock */
 #endif
 	raw_spinlock_t wait_lock;
@@ -73,13 +73,14 @@ static inline int rwsem_is_locked(struct rw_semaphore *sem)
 #endif
 
 #ifdef CONFIG_RWSEM_SPIN_ON_OWNER
-#define __RWSEM_OPT_INIT(lockname) , .osq = OSQ_LOCK_UNLOCKED, .owner = NULL
+#define __RWSEM_OPT_INIT(lockname) , .osq = OSQ_LOCK_UNLOCKED
 #else
 #define __RWSEM_OPT_INIT(lockname)
 #endif
 
 #define __RWSEM_INITIALIZER(name)				\
 	{ __RWSEM_INIT_COUNT(name),				\
+	  .owner = NULL,					\
 	  .wait_list = LIST_HEAD_INIT((name).wait_list),	\
 	  .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(name.wait_lock)	\
 	  __RWSEM_OPT_INIT(name)				\
diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c
index 0b1f77957240..c0500679fd2f 100644
--- a/kernel/locking/rwsem-xadd.c
+++ b/kernel/locking/rwsem-xadd.c
@@ -86,8 +86,8 @@ void __init_rwsem(struct rw_semaphore *sem, const char *name,
 	atomic_long_set(&sem->count, RWSEM_UNLOCKED_VALUE);
 	raw_spin_lock_init(&sem->wait_lock);
 	INIT_LIST_HEAD(&sem->wait_list);
-#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
 	sem->owner = NULL;
+#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
 	osq_lock_init(&sem->osq);
 #endif
 }
diff --git a/kernel/locking/rwsem.h b/kernel/locking/rwsem.h
index 64877f5294e3..eb9c8534299b 100644
--- a/kernel/locking/rwsem.h
+++ b/kernel/locking/rwsem.h
@@ -61,7 +61,6 @@
 #define RWSEM_ACTIVE_READ_BIAS		RWSEM_ACTIVE_BIAS
 #define RWSEM_ACTIVE_WRITE_BIAS		(RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS)
 
-#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
 /*
  * All writes to owner are protected by WRITE_ONCE() to make sure that
  * store tearing can't happen as optimistic spinners may read and use
@@ -126,7 +125,6 @@ static inline bool rwsem_has_anonymous_owner(struct task_struct *owner)
  * real owner or one of the real owners. The only exception is when the
  * unlock is done by up_read_non_owner().
  */
-#define rwsem_clear_reader_owned rwsem_clear_reader_owned
 static inline void rwsem_clear_reader_owned(struct rw_semaphore *sem)
 {
 	unsigned long val = (unsigned long)current | RWSEM_READER_OWNED
@@ -135,28 +133,7 @@ static inline void rwsem_clear_reader_owned(struct rw_semaphore *sem)
 		cmpxchg_relaxed((unsigned long *)&sem->owner, val,
 				RWSEM_READER_OWNED | RWSEM_ANONYMOUSLY_OWNED);
 }
-#endif
-
 #else
-static inline void rwsem_set_owner(struct rw_semaphore *sem)
-{
-}
-
-static inline void rwsem_clear_owner(struct rw_semaphore *sem)
-{
-}
-
-static inline void __rwsem_set_reader_owned(struct rw_semaphore *sem,
-					   struct task_struct *owner)
-{
-}
-
-static inline void rwsem_set_reader_owned(struct rw_semaphore *sem)
-{
-}
-#endif
-
-#ifndef rwsem_clear_reader_owned
 static inline void rwsem_clear_reader_owned(struct rw_semaphore *sem)
 {
 }
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index cbdfae379896..417bdd9e80fb 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -1095,7 +1095,7 @@ config PROVE_LOCKING
 	select DEBUG_SPINLOCK
 	select DEBUG_MUTEXES
 	select DEBUG_RT_MUTEXES if RT_MUTEXES
-	select DEBUG_RWSEMS if RWSEM_SPIN_ON_OWNER
+	select DEBUG_RWSEMS
 	select DEBUG_WW_MUTEX_SLOWPATH
 	select DEBUG_LOCK_ALLOC
 	select TRACE_IRQFLAGS
@@ -1199,10 +1199,10 @@ config DEBUG_WW_MUTEX_SLOWPATH
 
 config DEBUG_RWSEMS
 	bool "RW Semaphore debugging: basic checks"
-	depends on DEBUG_KERNEL && RWSEM_SPIN_ON_OWNER
+	depends on DEBUG_KERNEL
 	help
-	  This debugging feature allows mismatched rw semaphore locks and unlocks
-	  to be detected and reported.
+	  This debugging feature allows mismatched rw semaphore locks
+	  and unlocks to be detected and reported.
 
 config DEBUG_LOCK_ALLOC
 	bool "Lock debugging: detect incorrect freeing of live locks"
-- 
cgit v1.2.3


From 00f3c5a3df2c1e3dab14d0dd2b71f852d46be97f Mon Sep 17 00:00:00 2001
From: Waiman Long <longman@redhat.com>
Date: Mon, 20 May 2019 16:59:07 -0400
Subject: locking/rwsem: Always release wait_lock before waking up tasks

With the use of wake_q, we can do task wakeups without holding the
wait_lock. There is one exception in the rwsem code, though. It is
when the writer in the slowpath detects that there are waiters ahead
but the rwsem is not held by a writer. This can lead to a long wait_lock
hold time especially when a large number of readers are to be woken up.

Remediate this situation by releasing the wait_lock before waking
up tasks and re-acquiring it afterward. The rwsem_try_write_lock()
function is also modified to read the rwsem count directly to avoid
using a stale count value.

Suggested-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Waiman Long <longman@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Tim Chen <tim.c.chen@linux.intel.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: huang ying <huang.ying.caritas@gmail.com>
Link: https://lkml.kernel.org/r/20190520205918.22251-9-longman@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched/wake_q.h |  5 +++++
 kernel/locking/rwsem.c       | 31 +++++++++++++++----------------
 2 files changed, 20 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched/wake_q.h b/include/linux/sched/wake_q.h
index ad826d2a4557..26a2013ac39c 100644
--- a/include/linux/sched/wake_q.h
+++ b/include/linux/sched/wake_q.h
@@ -51,6 +51,11 @@ static inline void wake_q_init(struct wake_q_head *head)
 	head->lastp = &head->first;
 }
 
+static inline bool wake_q_empty(struct wake_q_head *head)
+{
+	return head->first == WAKE_Q_TAIL;
+}
+
 extern void wake_q_add(struct wake_q_head *head, struct task_struct *task);
 extern void wake_q_add_safe(struct wake_q_head *head, struct task_struct *task);
 extern void wake_up_q(struct wake_q_head *head);
diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index decda9fb8c6d..5532304406f7 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -400,13 +400,14 @@ static void rwsem_mark_wake(struct rw_semaphore *sem,
  * If wstate is WRITER_HANDOFF, it will make sure that either the handoff
  * bit is set or the lock is acquired with handoff bit cleared.
  */
-static inline bool rwsem_try_write_lock(long count, struct rw_semaphore *sem,
+static inline bool rwsem_try_write_lock(struct rw_semaphore *sem,
 					enum writer_wait_state wstate)
 {
-	long new;
+	long count, new;
 
 	lockdep_assert_held(&sem->wait_lock);
 
+	count = atomic_long_read(&sem->count);
 	do {
 		bool has_handoff = !!(count & RWSEM_FLAG_HANDOFF);
 
@@ -751,26 +752,25 @@ rwsem_down_write_slowpath(struct rw_semaphore *sem, int state)
 					? RWSEM_WAKE_READERS
 					: RWSEM_WAKE_ANY, &wake_q);
 
-		/*
-		 * The wakeup is normally called _after_ the wait_lock
-		 * is released, but given that we are proactively waking
-		 * readers we can deal with the wake_q overhead as it is
-		 * similar to releasing and taking the wait_lock again
-		 * for attempting rwsem_try_write_lock().
-		 */
-		wake_up_q(&wake_q);
-
-		/* We need wake_q again below, reinitialize */
-		wake_q_init(&wake_q);
+		if (!wake_q_empty(&wake_q)) {
+			/*
+			 * We want to minimize wait_lock hold time especially
+			 * when a large number of readers are to be woken up.
+			 */
+			raw_spin_unlock_irq(&sem->wait_lock);
+			wake_up_q(&wake_q);
+			wake_q_init(&wake_q);	/* Used again, reinit */
+			raw_spin_lock_irq(&sem->wait_lock);
+		}
 	} else {
-		count = atomic_long_add_return(RWSEM_FLAG_WAITERS, &sem->count);
+		atomic_long_or(RWSEM_FLAG_WAITERS, &sem->count);
 	}
 
 wait:
 	/* wait until we successfully acquire the lock */
 	set_current_state(state);
 	while (true) {
-		if (rwsem_try_write_lock(count, sem, wstate))
+		if (rwsem_try_write_lock(sem, wstate))
 			break;
 
 		raw_spin_unlock_irq(&sem->wait_lock);
@@ -811,7 +811,6 @@ wait:
 		}
 
 		raw_spin_lock_irq(&sem->wait_lock);
-		count = atomic_long_read(&sem->count);
 	}
 	__set_current_state(TASK_RUNNING);
 	list_del(&waiter.list);
-- 
cgit v1.2.3


From 02f1082b003a0cd48f48f12533d969cdbf1c2b63 Mon Sep 17 00:00:00 2001
From: Waiman Long <longman@redhat.com>
Date: Mon, 20 May 2019 16:59:10 -0400
Subject: locking/rwsem: Clarify usage of owner's nonspinnable bit

Bit 1 of sem->owner (RWSEM_ANONYMOUSLY_OWNED) is used to designate an
anonymous owner - readers or an anonymous writer. The setting of this
anonymous bit is used as an indicator that optimistic spinning cannot
be done on this rwsem.

With the upcoming reader optimistic spinning patches, a reader-owned
rwsem can be spun on for a limited period of time. We still need
this bit to indicate that a rwsem is nonspinnable, but a clear bit
no longer implies that the owner is known. So rename the bit
to RWSEM_NONSPINNABLE to clarify its meaning.

This patch also fixes a DEBUG_RWSEMS_WARN_ON() bug in __up_write().

Signed-off-by: Waiman Long <longman@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Tim Chen <tim.c.chen@linux.intel.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: huang ying <huang.ying.caritas@gmail.com>
Link: https://lkml.kernel.org/r/20190520205918.22251-12-longman@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/rwsem.h  |  2 +-
 kernel/locking/rwsem.c | 43 +++++++++++++++++++++----------------------
 2 files changed, 22 insertions(+), 23 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h
index 148983e21d47..bb76e82398b2 100644
--- a/include/linux/rwsem.h
+++ b/include/linux/rwsem.h
@@ -50,7 +50,7 @@ struct rw_semaphore {
 };
 
 /*
- * Setting bit 1 of the owner field but not bit 0 will indicate
+ * Setting all bits of the owner field except bit 0 will indicate
  * that the rwsem is writer-owned with an unknown owner.
  */
 #define RWSEM_OWNER_UNKNOWN	((struct task_struct *)-2L)
diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index ded96023f4dc..180455b6b0d4 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -33,17 +33,18 @@
 /*
  * The least significant 2 bits of the owner value has the following
  * meanings when set.
- *  - RWSEM_READER_OWNED (bit 0): The rwsem is owned by readers
- *  - RWSEM_ANONYMOUSLY_OWNED (bit 1): The rwsem is anonymously owned,
- *    i.e. the owner(s) cannot be readily determined. It can be reader
- *    owned or the owning writer is indeterminate.
+ *  - Bit 0: RWSEM_READER_OWNED - The rwsem is owned by readers
+ *  - Bit 1: RWSEM_NONSPINNABLE - Waiters cannot spin on the rwsem
+ *    The rwsem is anonymously owned, i.e. the owner(s) cannot be
+ *    readily determined. It can be reader owned or the owning writer
+ *    is indeterminate.
  *
  * When a writer acquires a rwsem, it puts its task_struct pointer
  * into the owner field. It is cleared after an unlock.
  *
  * When a reader acquires a rwsem, it will also puts its task_struct
  * pointer into the owner field with both the RWSEM_READER_OWNED and
- * RWSEM_ANONYMOUSLY_OWNED bits set. On unlock, the owner field will
+ * RWSEM_NONSPINNABLE bits set. On unlock, the owner field will
  * largely be left untouched. So for a free or reader-owned rwsem,
  * the owner value may contain information about the last reader that
  * acquires the rwsem. The anonymous bit is set because that particular
@@ -55,7 +56,8 @@
  * a rwsem, but the overhead is simply too big.
  */
 #define RWSEM_READER_OWNED	(1UL << 0)
-#define RWSEM_ANONYMOUSLY_OWNED	(1UL << 1)
+#define RWSEM_NONSPINNABLE	(1UL << 1)
+#define RWSEM_OWNER_FLAGS_MASK	(RWSEM_READER_OWNED | RWSEM_NONSPINNABLE)
 
 #ifdef CONFIG_DEBUG_RWSEMS
 # define DEBUG_RWSEMS_WARN_ON(c, sem)	do {			\
@@ -132,7 +134,7 @@ static inline void __rwsem_set_reader_owned(struct rw_semaphore *sem,
 					    struct task_struct *owner)
 {
 	unsigned long val = (unsigned long)owner | RWSEM_READER_OWNED
-						 | RWSEM_ANONYMOUSLY_OWNED;
+						 | RWSEM_NONSPINNABLE;
 
 	WRITE_ONCE(sem->owner, (struct task_struct *)val);
 }
@@ -144,20 +146,12 @@ static inline void rwsem_set_reader_owned(struct rw_semaphore *sem)
 
 /*
  * Return true if the a rwsem waiter can spin on the rwsem's owner
- * and steal the lock, i.e. the lock is not anonymously owned.
+ * and steal the lock.
  * N.B. !owner is considered spinnable.
  */
 static inline bool is_rwsem_owner_spinnable(struct task_struct *owner)
 {
-	return !((unsigned long)owner & RWSEM_ANONYMOUSLY_OWNED);
-}
-
-/*
- * Return true if rwsem is owned by an anonymous writer or readers.
- */
-static inline bool rwsem_has_anonymous_owner(struct task_struct *owner)
-{
-	return (unsigned long)owner & RWSEM_ANONYMOUSLY_OWNED;
+	return !((unsigned long)owner & RWSEM_NONSPINNABLE);
 }
 
 #ifdef CONFIG_DEBUG_RWSEMS
@@ -170,10 +164,10 @@ static inline bool rwsem_has_anonymous_owner(struct task_struct *owner)
 static inline void rwsem_clear_reader_owned(struct rw_semaphore *sem)
 {
 	unsigned long val = (unsigned long)current | RWSEM_READER_OWNED
-						   | RWSEM_ANONYMOUSLY_OWNED;
+						   | RWSEM_NONSPINNABLE;
 	if (READ_ONCE(sem->owner) == (struct task_struct *)val)
 		cmpxchg_relaxed((unsigned long *)&sem->owner, val,
-				RWSEM_READER_OWNED | RWSEM_ANONYMOUSLY_OWNED);
+				RWSEM_READER_OWNED | RWSEM_NONSPINNABLE);
 }
 #else
 static inline void rwsem_clear_reader_owned(struct rw_semaphore *sem)
@@ -495,7 +489,7 @@ static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem)
 	struct task_struct *owner;
 	bool ret = true;
 
-	BUILD_BUG_ON(!rwsem_has_anonymous_owner(RWSEM_OWNER_UNKNOWN));
+	BUILD_BUG_ON(is_rwsem_owner_spinnable(RWSEM_OWNER_UNKNOWN));
 
 	if (need_resched())
 		return false;
@@ -534,7 +528,7 @@ static inline enum owner_state rwsem_owner_state(unsigned long owner)
 	if (!owner)
 		return OWNER_NULL;
 
-	if (owner & RWSEM_ANONYMOUSLY_OWNED)
+	if (owner & RWSEM_NONSPINNABLE)
 		return OWNER_NONSPINNABLE;
 
 	if (owner & RWSEM_READER_OWNED)
@@ -1043,7 +1037,12 @@ static inline void __up_write(struct rw_semaphore *sem)
 {
 	long tmp;
 
-	DEBUG_RWSEMS_WARN_ON(sem->owner != current, sem);
+	/*
+	 * sem->owner may differ from current if the ownership is transferred
+	 * to an anonymous writer by setting the RWSEM_NONSPINNABLE bits.
+	 */
+	DEBUG_RWSEMS_WARN_ON((sem->owner != current) &&
+			    !((long)sem->owner & RWSEM_NONSPINNABLE), sem);
 	rwsem_clear_owner(sem);
 	tmp = atomic_long_fetch_add_release(-RWSEM_WRITER_LOCKED, &sem->count);
 	if (unlikely(tmp & RWSEM_FLAG_WAITERS))
-- 
cgit v1.2.3


From 94a9717b3c40e77a54e4afacd8f19a9a86bfeead Mon Sep 17 00:00:00 2001
From: Waiman Long <longman@redhat.com>
Date: Mon, 20 May 2019 16:59:12 -0400
Subject: locking/rwsem: Make rwsem->owner an atomic_long_t

The rwsem->owner contains not just the task structure pointer, it also
holds some flags for storing the current state of the rwsem. Some of
the flags may have to be atomically updated. To reflect the new reality,
the owner is now changed to an atomic_long_t type.

New helper functions are added to properly separate out the task
structure pointer and the embedded flags.

Suggested-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Waiman Long <longman@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Tim Chen <tim.c.chen@linux.intel.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: huang ying <huang.ying.caritas@gmail.com>
Link: https://lkml.kernel.org/r/20190520205918.22251-14-longman@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/percpu-rwsem.h |   4 +-
 include/linux/rwsem.h        |  11 ++--
 kernel/locking/rwsem.c       | 125 +++++++++++++++++++++++++++----------------
 3 files changed, 88 insertions(+), 52 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/percpu-rwsem.h b/include/linux/percpu-rwsem.h
index 03cb4b6f842e..0a43830f1932 100644
--- a/include/linux/percpu-rwsem.h
+++ b/include/linux/percpu-rwsem.h
@@ -117,7 +117,7 @@ static inline void percpu_rwsem_release(struct percpu_rw_semaphore *sem,
 	lock_release(&sem->rw_sem.dep_map, 1, ip);
 #ifdef CONFIG_RWSEM_SPIN_ON_OWNER
 	if (!read)
-		sem->rw_sem.owner = RWSEM_OWNER_UNKNOWN;
+		atomic_long_set(&sem->rw_sem.owner, RWSEM_OWNER_UNKNOWN);
 #endif
 }
 
@@ -127,7 +127,7 @@ static inline void percpu_rwsem_acquire(struct percpu_rw_semaphore *sem,
 	lock_acquire(&sem->rw_sem.dep_map, 0, 1, read, 1, NULL, ip);
 #ifdef CONFIG_RWSEM_SPIN_ON_OWNER
 	if (!read)
-		sem->rw_sem.owner = current;
+		atomic_long_set(&sem->rw_sem.owner, (long)current);
 #endif
 }
 
diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h
index bb76e82398b2..e401358c4e7e 100644
--- a/include/linux/rwsem.h
+++ b/include/linux/rwsem.h
@@ -35,10 +35,11 @@
 struct rw_semaphore {
 	atomic_long_t count;
 	/*
-	 * Write owner or one of the read owners. Can be used as a
-	 * speculative check to see if the owner is running on the cpu.
+	 * Write owner or one of the read owners as well flags regarding
+	 * the current state of the rwsem. Can be used as a speculative
+	 * check to see if the write owner is running on the cpu.
 	 */
-	struct task_struct *owner;
+	atomic_long_t owner;
 #ifdef CONFIG_RWSEM_SPIN_ON_OWNER
 	struct optimistic_spin_queue osq; /* spinner MCS lock */
 #endif
@@ -53,7 +54,7 @@ struct rw_semaphore {
  * Setting all bits of the owner field except bit 0 will indicate
  * that the rwsem is writer-owned with an unknown owner.
  */
-#define RWSEM_OWNER_UNKNOWN	((struct task_struct *)-2L)
+#define RWSEM_OWNER_UNKNOWN	(-2L)
 
 /* In all implementations count != 0 means locked */
 static inline int rwsem_is_locked(struct rw_semaphore *sem)
@@ -80,7 +81,7 @@ static inline int rwsem_is_locked(struct rw_semaphore *sem)
 
 #define __RWSEM_INITIALIZER(name)				\
 	{ __RWSEM_INIT_COUNT(name),				\
-	  .owner = NULL,					\
+	  .owner = ATOMIC_LONG_INIT(0),				\
 	  .wait_list = LIST_HEAD_INIT((name).wait_list),	\
 	  .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(name.wait_lock)	\
 	  __RWSEM_OPT_INIT(name)				\
diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index 985a03ad3f8c..fae557be8334 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -64,7 +64,7 @@
 	if (!debug_locks_silent &&				\
 	    WARN_ONCE(c, "DEBUG_RWSEMS_WARN_ON(%s): count = 0x%lx, owner = 0x%lx, curr 0x%lx, list %sempty\n",\
 		#c, atomic_long_read(&(sem)->count),		\
-		(long)((sem)->owner), (long)current,		\
+		atomic_long_read(&(sem)->owner), (long)current,	\
 		list_empty(&(sem)->wait_list) ? "" : "not "))	\
 			debug_locks_off();			\
 	} while (0)
@@ -114,12 +114,20 @@
  */
 static inline void rwsem_set_owner(struct rw_semaphore *sem)
 {
-	WRITE_ONCE(sem->owner, current);
+	atomic_long_set(&sem->owner, (long)current);
 }
 
 static inline void rwsem_clear_owner(struct rw_semaphore *sem)
 {
-	WRITE_ONCE(sem->owner, NULL);
+	atomic_long_set(&sem->owner, 0);
+}
+
+/*
+ * Test the flags in the owner field.
+ */
+static inline bool rwsem_test_oflags(struct rw_semaphore *sem, long flags)
+{
+	return atomic_long_read(&sem->owner) & flags;
 }
 
 /*
@@ -133,10 +141,9 @@ static inline void rwsem_clear_owner(struct rw_semaphore *sem)
 static inline void __rwsem_set_reader_owned(struct rw_semaphore *sem,
 					    struct task_struct *owner)
 {
-	unsigned long val = (unsigned long)owner | RWSEM_READER_OWNED
-						 | RWSEM_NONSPINNABLE;
+	unsigned long val = (unsigned long)owner | RWSEM_READER_OWNED | RWSEM_NONSPINNABLE;
 
-	WRITE_ONCE(sem->owner, (struct task_struct *)val);
+	atomic_long_set(&sem->owner, val);
 }
 
 static inline void rwsem_set_reader_owned(struct rw_semaphore *sem)
@@ -145,13 +152,20 @@ static inline void rwsem_set_reader_owned(struct rw_semaphore *sem)
 }
 
 /*
- * Return true if the a rwsem waiter can spin on the rwsem's owner
- * and steal the lock.
- * N.B. !owner is considered spinnable.
+ * Return true if the rwsem is owned by a reader.
  */
-static inline bool is_rwsem_owner_spinnable(struct task_struct *owner)
+static inline bool is_rwsem_reader_owned(struct rw_semaphore *sem)
 {
-	return !((unsigned long)owner & RWSEM_NONSPINNABLE);
+#ifdef CONFIG_DEBUG_RWSEMS
+	/*
+	 * Check the count to see if it is write-locked.
+	 */
+	long count = atomic_long_read(&sem->count);
+
+	if (count & RWSEM_WRITER_MASK)
+		return false;
+#endif
+	return rwsem_test_oflags(sem, RWSEM_READER_OWNED);
 }
 
 #ifdef CONFIG_DEBUG_RWSEMS
@@ -163,11 +177,13 @@ static inline bool is_rwsem_owner_spinnable(struct task_struct *owner)
  */
 static inline void rwsem_clear_reader_owned(struct rw_semaphore *sem)
 {
-	unsigned long val = (unsigned long)current | RWSEM_READER_OWNED
-						   | RWSEM_NONSPINNABLE;
-	if (READ_ONCE(sem->owner) == (struct task_struct *)val)
-		cmpxchg_relaxed((unsigned long *)&sem->owner, val,
-				RWSEM_READER_OWNED | RWSEM_NONSPINNABLE);
+	unsigned long val = atomic_long_read(&sem->owner);
+
+	while ((val & ~RWSEM_OWNER_FLAGS_MASK) == (unsigned long)current) {
+		if (atomic_long_try_cmpxchg(&sem->owner, &val,
+					    val & RWSEM_OWNER_FLAGS_MASK))
+			return;
+	}
 }
 #else
 static inline void rwsem_clear_reader_owned(struct rw_semaphore *sem)
@@ -175,6 +191,28 @@ static inline void rwsem_clear_reader_owned(struct rw_semaphore *sem)
 }
 #endif
 
+/*
+ * Return just the real task structure pointer of the owner
+ */
+static inline struct task_struct *rwsem_owner(struct rw_semaphore *sem)
+{
+	return (struct task_struct *)
+		(atomic_long_read(&sem->owner) & ~RWSEM_OWNER_FLAGS_MASK);
+}
+
+/*
+ * Return the real task structure pointer of the owner and the embedded
+ * flags in the owner. pflags must be non-NULL.
+ */
+static inline struct task_struct *
+rwsem_owner_flags(struct rw_semaphore *sem, unsigned long *pflags)
+{
+	unsigned long owner = atomic_long_read(&sem->owner);
+
+	*pflags = owner & RWSEM_OWNER_FLAGS_MASK;
+	return (struct task_struct *)(owner & ~RWSEM_OWNER_FLAGS_MASK);
+}
+
 /*
  * Guide to the rw_semaphore's count field.
  *
@@ -208,7 +246,7 @@ void __init_rwsem(struct rw_semaphore *sem, const char *name,
 	atomic_long_set(&sem->count, RWSEM_UNLOCKED_VALUE);
 	raw_spin_lock_init(&sem->wait_lock);
 	INIT_LIST_HEAD(&sem->wait_list);
-	sem->owner = NULL;
+	atomic_long_set(&sem->owner, 0L);
 #ifdef CONFIG_RWSEM_SPIN_ON_OWNER
 	osq_lock_init(&sem->osq);
 #endif
@@ -511,9 +549,10 @@ static inline bool owner_on_cpu(struct task_struct *owner)
 static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem)
 {
 	struct task_struct *owner;
+	unsigned long flags;
 	bool ret = true;
 
-	BUILD_BUG_ON(is_rwsem_owner_spinnable(RWSEM_OWNER_UNKNOWN));
+	BUILD_BUG_ON(!(RWSEM_OWNER_UNKNOWN & RWSEM_NONSPINNABLE));
 
 	if (need_resched()) {
 		lockevent_inc(rwsem_opt_fail);
@@ -522,11 +561,9 @@ static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem)
 
 	preempt_disable();
 	rcu_read_lock();
-	owner = READ_ONCE(sem->owner);
-	if (owner) {
-		ret = is_rwsem_owner_spinnable(owner) &&
-		      owner_on_cpu(owner);
-	}
+	owner = rwsem_owner_flags(sem, &flags);
+	if ((flags & RWSEM_NONSPINNABLE) || (owner && !owner_on_cpu(owner)))
+		ret = false;
 	rcu_read_unlock();
 	preempt_enable();
 
@@ -553,25 +590,26 @@ enum owner_state {
 };
 #define OWNER_SPINNABLE		(OWNER_NULL | OWNER_WRITER)
 
-static inline enum owner_state rwsem_owner_state(unsigned long owner)
+static inline enum owner_state
+rwsem_owner_state(struct task_struct *owner, unsigned long flags)
 {
-	if (!owner)
-		return OWNER_NULL;
-
-	if (owner & RWSEM_NONSPINNABLE)
+	if (flags & RWSEM_NONSPINNABLE)
 		return OWNER_NONSPINNABLE;
 
-	if (owner & RWSEM_READER_OWNED)
+	if (flags & RWSEM_READER_OWNED)
 		return OWNER_READER;
 
-	return OWNER_WRITER;
+	return owner ? OWNER_WRITER : OWNER_NULL;
 }
 
 static noinline enum owner_state rwsem_spin_on_owner(struct rw_semaphore *sem)
 {
-	struct task_struct *tmp, *owner = READ_ONCE(sem->owner);
-	enum owner_state state = rwsem_owner_state((unsigned long)owner);
+	struct task_struct *new, *owner;
+	unsigned long flags, new_flags;
+	enum owner_state state;
 
+	owner = rwsem_owner_flags(sem, &flags);
+	state = rwsem_owner_state(owner, flags);
 	if (state != OWNER_WRITER)
 		return state;
 
@@ -582,9 +620,9 @@ static noinline enum owner_state rwsem_spin_on_owner(struct rw_semaphore *sem)
 			break;
 		}
 
-		tmp = READ_ONCE(sem->owner);
-		if (tmp != owner) {
-			state = rwsem_owner_state((unsigned long)tmp);
+		new = rwsem_owner_flags(sem, &new_flags);
+		if ((new != owner) || (new_flags != flags)) {
+			state = rwsem_owner_state(new, new_flags);
 			break;
 		}
 
@@ -1001,8 +1039,7 @@ inline void __down_read(struct rw_semaphore *sem)
 	if (unlikely(atomic_long_fetch_add_acquire(RWSEM_READER_BIAS,
 			&sem->count) & RWSEM_READ_FAILED_MASK)) {
 		rwsem_down_read_slowpath(sem, TASK_UNINTERRUPTIBLE);
-		DEBUG_RWSEMS_WARN_ON(!((unsigned long)sem->owner &
-					RWSEM_READER_OWNED), sem);
+		DEBUG_RWSEMS_WARN_ON(!is_rwsem_reader_owned(sem), sem);
 	} else {
 		rwsem_set_reader_owned(sem);
 	}
@@ -1014,8 +1051,7 @@ static inline int __down_read_killable(struct rw_semaphore *sem)
 			&sem->count) & RWSEM_READ_FAILED_MASK)) {
 		if (IS_ERR(rwsem_down_read_slowpath(sem, TASK_KILLABLE)))
 			return -EINTR;
-		DEBUG_RWSEMS_WARN_ON(!((unsigned long)sem->owner &
-					RWSEM_READER_OWNED), sem);
+		DEBUG_RWSEMS_WARN_ON(!is_rwsem_reader_owned(sem), sem);
 	} else {
 		rwsem_set_reader_owned(sem);
 	}
@@ -1084,7 +1120,7 @@ inline void __up_read(struct rw_semaphore *sem)
 {
 	long tmp;
 
-	DEBUG_RWSEMS_WARN_ON(!((unsigned long)sem->owner & RWSEM_READER_OWNED), sem);
+	DEBUG_RWSEMS_WARN_ON(!is_rwsem_reader_owned(sem), sem);
 	rwsem_clear_reader_owned(sem);
 	tmp = atomic_long_add_return_release(-RWSEM_READER_BIAS, &sem->count);
 	if (unlikely((tmp & (RWSEM_LOCK_MASK|RWSEM_FLAG_WAITERS)) ==
@@ -1103,8 +1139,8 @@ static inline void __up_write(struct rw_semaphore *sem)
 	 * sem->owner may differ from current if the ownership is transferred
 	 * to an anonymous writer by setting the RWSEM_NONSPINNABLE bits.
 	 */
-	DEBUG_RWSEMS_WARN_ON((sem->owner != current) &&
-			    !((long)sem->owner & RWSEM_NONSPINNABLE), sem);
+	DEBUG_RWSEMS_WARN_ON((rwsem_owner(sem) != current) &&
+			    !rwsem_test_oflags(sem, RWSEM_NONSPINNABLE), sem);
 	rwsem_clear_owner(sem);
 	tmp = atomic_long_fetch_add_release(-RWSEM_WRITER_LOCKED, &sem->count);
 	if (unlikely(tmp & RWSEM_FLAG_WAITERS))
@@ -1125,7 +1161,7 @@ static inline void __downgrade_write(struct rw_semaphore *sem)
 	 * read-locked region is ok to be re-ordered into the
 	 * write side. As such, rely on RELEASE semantics.
 	 */
-	DEBUG_RWSEMS_WARN_ON(sem->owner != current, sem);
+	DEBUG_RWSEMS_WARN_ON(rwsem_owner(sem) != current, sem);
 	tmp = atomic_long_fetch_add_release(
 		-RWSEM_WRITER_LOCKED+RWSEM_READER_BIAS, &sem->count);
 	rwsem_set_reader_owned(sem);
@@ -1296,8 +1332,7 @@ EXPORT_SYMBOL(down_write_killable_nested);
 
 void up_read_non_owner(struct rw_semaphore *sem)
 {
-	DEBUG_RWSEMS_WARN_ON(!((unsigned long)sem->owner & RWSEM_READER_OWNED),
-				sem);
+	DEBUG_RWSEMS_WARN_ON(!is_rwsem_reader_owned(sem), sem);
 	__up_read(sem);
 }
 EXPORT_SYMBOL(up_read_non_owner);
-- 
cgit v1.2.3


From 9ed7d75b2f09d836e71d597cd5879abb1a44e7a9 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 27 Feb 2019 09:48:51 +0100
Subject: x86/percpu: Relax smp_processor_id()

Nadav reported that since this_cpu_read() became asm-volatile, many
smp_processor_id() users generated worse code due to the extra
constraints.

However since smp_processor_id() is reading a stable value, we can use
__this_cpu_read().

While this does reduce text size somewhat, this mostly results in code
movement to .text.unlikely as a result of more/larger .cold.
subfunctions. Less text on the hotpath is good for I$.

  $ ./compare.sh defconfig-build1 defconfig-build2 vmlinux.o
  setup_APIC_ibs                                             90         98   -12,+20
  force_ibs_eilvt_setup                                     400        413   -57,+70
  pci_serr_error                                            109        104   -54,+49
  pci_serr_error                                            109        104   -54,+49
  unknown_nmi_error                                         125        120   -76,+71
  unknown_nmi_error                                         125        120   -76,+71
  io_check_error                                            125        132   -97,+104
  intel_thermal_interrupt                                   730        822   +92,+0
  intel_init_thermal                                        951        945   -6,+0
  generic_get_mtrr                                          301        294   -7,+0
  generic_get_mtrr                                          301        294   -7,+0
  generic_set_all                                           749        754   -44,+49
  get_fixed_ranges                                          352        360   -41,+49
  x86_acpi_suspend_lowlevel                                 369        363   -6,+0
  check_tsc_sync_source                                     412        412   -71,+71
  irq_migrate_all_off_this_cpu                              662        674   -14,+26
  clocksource_watchdog                                      748        748   -113,+113
  __perf_event_account_interrupt                            204        197   -7,+0
  attempt_merge                                            1748       1741   -7,+0
  intel_guc_send_ct                                        1424       1409   -15,+0
  __fini_doorbell                                           235        231   -4,+0
  bdw_set_cdclk                                             928        923   -5,+0
  gen11_dsi_disable                                        1571       1556   -15,+0
  gmbus_wait                                                493        488   -5,+0
  md_make_request                                           376        369   -7,+0
  __split_and_process_bio                                   543        536   -7,+0
  delay_tsc                                                  96         89   -7,+0
  hsw_disable_pc8                                           696        691   -5,+0
  tsc_verify_tsc_adjust                                     215        228   -22,+35
  cpuidle_driver_unref                                       56         49   -7,+0
  blk_account_io_completion                                 159        148   -11,+0
  mtrr_wrmsr                                                 95         99   -29,+33
  __intel_wait_for_register_fw                              401        419   +18,+0
  cpuidle_driver_ref                                         43         36   -7,+0
  cpuidle_get_driver                                         15          8   -7,+0
  blk_account_io_done                                       535        528   -7,+0
  irq_migrate_all_off_this_cpu                              662        674   -14,+26
  check_tsc_sync_source                                     412        412   -71,+71
  irq_wait_for_poll                                         170        163   -7,+0
  generic_end_io_acct                                       329        322   -7,+0
  x86_acpi_suspend_lowlevel                                 369        363   -6,+0
  nohz_balance_enter_idle                                   198        191   -7,+0
  generic_start_io_acct                                     254        247   -7,+0
  blk_account_io_start                                      341        334   -7,+0
  perf_event_task_tick                                      682        675   -7,+0
  intel_init_thermal                                        951        945   -6,+0
  amd_e400_c1e_apic_setup                                    47         51   -28,+32
  setup_APIC_eilvt                                          350        328   -22,+0
  hsw_enable_pc8                                           1611       1605   -6,+0
                                               total   12985947   12985892   -994,+939

Reported-by: Nadav Amit <nadav.amit@gmail.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/smp.h |  3 ++-
 include/linux/smp.h        | 45 +++++++++++++++++++++++++++++++--------------
 2 files changed, 33 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
index da545df207b2..0d3fe060a44f 100644
--- a/arch/x86/include/asm/smp.h
+++ b/arch/x86/include/asm/smp.h
@@ -162,7 +162,8 @@ __visible void smp_call_function_single_interrupt(struct pt_regs *r);
  * from the initial startup. We map APIC_BASE very early in page_setup(),
  * so this is correct in the x86 case.
  */
-#define raw_smp_processor_id() (this_cpu_read(cpu_number))
+#define raw_smp_processor_id()  this_cpu_read(cpu_number)
+#define __smp_processor_id() __this_cpu_read(cpu_number)
 
 #ifdef CONFIG_X86_32
 extern int safe_smp_processor_id(void);
diff --git a/include/linux/smp.h b/include/linux/smp.h
index a56f08ff3097..aa9e5e82d8c3 100644
--- a/include/linux/smp.h
+++ b/include/linux/smp.h
@@ -181,29 +181,46 @@ static inline int get_boot_cpu_id(void)
 
 #endif /* !SMP */
 
-/*
- * smp_processor_id(): get the current CPU ID.
+/**
+ * raw_smp_processor_id() - get the current (unstable) CPU id
+ *
+ * For when you know what you are doing and need an unstable
+ * CPU id.
+ */
+
+/**
+ * smp_processor_id() - get the current (stable) CPU id
+ *
+ * This is the normal accessor to the CPU id and should be used
+ * whenever possible.
+ *
+ * The CPU id is stable when:
  *
- * if DEBUG_PREEMPT is enabled then we check whether it is
- * used in a preemption-safe way. (smp_processor_id() is safe
- * if it's used in a preemption-off critical section, or in
- * a thread that is bound to the current CPU.)
+ *  - IRQs are disabled;
+ *  - preemption is disabled;
+ *  - the task is CPU affine.
  *
- * NOTE: raw_smp_processor_id() is for internal use only
- * (smp_processor_id() is the preferred variant), but in rare
- * instances it might also be used to turn off false positives
- * (i.e. smp_processor_id() use that the debugging code reports but
- * which use for some reason is legal). Don't use this to hack around
- * the warning message, as your code might not work under PREEMPT.
+ * When CONFIG_DEBUG_PREEMPT is enabled, we verify these assumptions and
+ * WARN when smp_processor_id() is used while the CPU id is not stable.
  */
+
+/*
+ * Allow the architecture to differentiate between a stable and unstable read.
+ * For example, x86 uses an IRQ-safe asm-volatile read for the unstable but a
+ * regular asm read for the stable.
+ */
+#ifndef __smp_processor_id
+#define __smp_processor_id(x) raw_smp_processor_id(x)
+#endif
+
 #ifdef CONFIG_DEBUG_PREEMPT
   extern unsigned int debug_smp_processor_id(void);
 # define smp_processor_id() debug_smp_processor_id()
 #else
-# define smp_processor_id() raw_smp_processor_id()
+# define smp_processor_id() __smp_processor_id()
 #endif
 
-#define get_cpu()		({ preempt_disable(); smp_processor_id(); })
+#define get_cpu()		({ preempt_disable(); __smp_processor_id(); })
 #define put_cpu()		preempt_enable()
 
 /*
-- 
cgit v1.2.3


From e7488e58c7cfe4be0c52db68622a0397bb75258e Mon Sep 17 00:00:00 2001
From: Yurii Pavlovskyi <yurii.pavlovskyi@gmail.com>
Date: Tue, 14 May 2019 20:59:01 +0200
Subject: platform/x86: wmi: Add function to get _UID of WMI device

Add a new function to acpi.h / wmi.c that returns _UID of the ACPI WMI
device. For example, it returns "ATK" for the following declaration in
DSDT:
Device (ATKD)
{
    Name (_HID, "PNP0C14" /* Windows Management Instrumentation Device */)
      // _HID: Hardware ID
    Name (_UID, "ATK")  // _UID: Unique ID
    ..

Generally, it is possible that multiple PNP0C14 ACPI devices are present in
the system as mentioned in the commit message of commit bff431e49ff5
("ACPI: WMI: Add ACPI-WMI mapping driver").

Therefore the _UID is returned for a specific ACPI device that declares the
given GUID, to which it is also mapped by other methods of wmi module.

Signed-off-by: Yurii Pavlovskyi <yurii.pavlovskyi@gmail.com>
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
---
 drivers/platform/x86/wmi.c | 19 +++++++++++++++++++
 include/linux/acpi.h       |  1 +
 2 files changed, 20 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/platform/x86/wmi.c b/drivers/platform/x86/wmi.c
index 7b26b6ccf1a0..b08ffb769cbe 100644
--- a/drivers/platform/x86/wmi.c
+++ b/drivers/platform/x86/wmi.c
@@ -635,6 +635,25 @@ bool wmi_has_guid(const char *guid_string)
 }
 EXPORT_SYMBOL_GPL(wmi_has_guid);
 
+/**
+ * wmi_get_acpi_device_uid() - Get _UID name of ACPI device that defines GUID
+ * @guid_string: 36 char string of the form fa50ff2b-f2e8-45de-83fa-65417f2f49ba
+ *
+ * Find the _UID of ACPI device associated with this WMI GUID.
+ *
+ * Return: The ACPI _UID field value or NULL if the WMI GUID was not found
+ */
+char *wmi_get_acpi_device_uid(const char *guid_string)
+{
+	struct wmi_block *wblock = NULL;
+
+	if (!find_guid(guid_string, &wblock))
+		return NULL;
+
+	return acpi_device_uid(wblock->acpi_device);
+}
+EXPORT_SYMBOL_GPL(wmi_get_acpi_device_uid);
+
 static struct wmi_block *dev_to_wblock(struct device *dev)
 {
 	return container_of(dev, struct wmi_block, dev.dev);
diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index 98440df7fe42..d867a9a904f9 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -380,6 +380,7 @@ extern acpi_status wmi_install_notify_handler(const char *guid,
 extern acpi_status wmi_remove_notify_handler(const char *guid);
 extern acpi_status wmi_get_event_data(u32 event, struct acpi_buffer *out);
 extern bool wmi_has_guid(const char *guid);
+extern char *wmi_get_acpi_device_uid(const char *guid);
 
 #endif	/* CONFIG_ACPI_WMI */
 
-- 
cgit v1.2.3


From e0668f28888184f6c633110a37386f2d4a6fa00e Mon Sep 17 00:00:00 2001
From: Yurii Pavlovskyi <yurii.pavlovskyi@gmail.com>
Date: Tue, 14 May 2019 21:00:31 +0200
Subject: platform/x86: asus-wmi: Improve DSTS WMI method ID detection

The DSTS method detection mistakenly selects DCTS instead of DSTS if
nothing is returned when the method ID is not defined in WMNB. As a result,
the control of keyboard backlight is not functional for TUF Gaming series
laptops. Implement detection based on _UID of the WMI device instead.

There is evidence that DCTS is handled by ACPI WMI devices that have _UID
ASUSWMI, whereas none of the devices without ASUSWMI respond to DCTS and
DSTS is used instead [1].

DSDT examples:

FX505GM (_UID ATK):
Method (WMNB, 3, Serialized)
{ ...
    If ((Local0 == 0x53545344))
    {
        ...
        Return (Zero)
    }
    ...
    // No return
}

K54C (_UID ATK):
Method (WMNB, 3, Serialized)
{ ...
    If ((Local0 == 0x53545344))
    {
        ...
        Return (0x02)
    }
    ...
    Return (0xFFFFFFFE)
}

[1] Link: https://lkml.org/lkml/2019/4/11/322

Signed-off-by: Yurii Pavlovskyi <yurii.pavlovskyi@gmail.com>
Suggested-by: Daniel Drake <drake@endlessm.com>
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
---
 drivers/hid/hid-asus.c                     |  2 +-
 drivers/platform/x86/asus-wmi.c            | 23 ++++++++++++++++++++---
 include/linux/platform_data/x86/asus-wmi.h |  4 ++--
 3 files changed, 23 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/hid/hid-asus.c b/drivers/hid/hid-asus.c
index 336aeaed1159..1d01fe23ca0c 100644
--- a/drivers/hid/hid-asus.c
+++ b/drivers/hid/hid-asus.c
@@ -396,7 +396,7 @@ static bool asus_kbd_wmi_led_control_present(struct hid_device *hdev)
 	if (!IS_ENABLED(CONFIG_ASUS_WMI))
 		return false;
 
-	ret = asus_wmi_evaluate_method(ASUS_WMI_METHODID_DSTS2,
+	ret = asus_wmi_evaluate_method(ASUS_WMI_METHODID_DSTS,
 				       ASUS_WMI_DEVID_KBD_BACKLIGHT, 0, &value);
 	hid_dbg(hdev, "WMI backlight check: rc %d value %x", ret, value);
 	if (ret)
diff --git a/drivers/platform/x86/asus-wmi.c b/drivers/platform/x86/asus-wmi.c
index c67f11e0d6e7..ef526dcfeac5 100644
--- a/drivers/platform/x86/asus-wmi.c
+++ b/drivers/platform/x86/asus-wmi.c
@@ -83,6 +83,8 @@ MODULE_LICENSE("GPL");
 #define USB_INTEL_XUSB2PR		0xD0
 #define PCI_DEVICE_ID_INTEL_LYNXPOINT_LP_XHCI	0x9c31
 
+#define ASUS_ACPI_UID_ASUSWMI		"ASUSWMI"
+
 static const char * const ashs_ids[] = { "ATK4001", "ATK4002", NULL };
 
 static bool ashs_present(void)
@@ -1874,6 +1876,8 @@ static int asus_wmi_sysfs_init(struct platform_device *device)
  */
 static int asus_wmi_platform_init(struct asus_wmi *asus)
 {
+	struct device *dev = &asus->platform_device->dev;
+	char *wmi_uid;
 	int rv;
 
 	/* INIT enable hotkeys on some models */
@@ -1903,11 +1907,24 @@ static int asus_wmi_platform_init(struct asus_wmi *asus)
 	 * Note, on most Eeepc, there is no way to check if a method exist
 	 * or note, while on notebooks, they returns 0xFFFFFFFE on failure,
 	 * but once again, SPEC may probably be used for that kind of things.
+	 *
+	 * Additionally at least TUF Gaming series laptops return nothing for
+	 * unknown methods, so the detection in this way is not possible.
+	 *
+	 * There is strong indication that only ACPI WMI devices that have _UID
+	 * equal to "ASUSWMI" use DCTS whereas those with "ATK" use DSTS.
 	 */
-	if (!asus_wmi_evaluate_method(ASUS_WMI_METHODID_DSTS, 0, 0, NULL))
+	wmi_uid = wmi_get_acpi_device_uid(ASUS_WMI_MGMT_GUID);
+	if (!wmi_uid)
+		return -ENODEV;
+
+	if (!strcmp(wmi_uid, ASUS_ACPI_UID_ASUSWMI)) {
+		dev_info(dev, "Detected ASUSWMI, use DCTS\n");
+		asus->dsts_id = ASUS_WMI_METHODID_DCTS;
+	} else {
+		dev_info(dev, "Detected %s, not ASUSWMI, use DSTS\n", wmi_uid);
 		asus->dsts_id = ASUS_WMI_METHODID_DSTS;
-	else
-		asus->dsts_id = ASUS_WMI_METHODID_DSTS2;
+	}
 
 	/* CWAP allow to define the behavior of the Fn+F2 key,
 	 * this method doesn't seems to be present on Eee PCs */
diff --git a/include/linux/platform_data/x86/asus-wmi.h b/include/linux/platform_data/x86/asus-wmi.h
index bfba245636a7..0668f76df921 100644
--- a/include/linux/platform_data/x86/asus-wmi.h
+++ b/include/linux/platform_data/x86/asus-wmi.h
@@ -18,8 +18,8 @@
 #define ASUS_WMI_METHODID_GDSP		0x50534447 /* Get DiSPlay output */
 #define ASUS_WMI_METHODID_DEVP		0x50564544 /* DEVice Policy */
 #define ASUS_WMI_METHODID_OSVR		0x5256534F /* OS VeRsion */
-#define ASUS_WMI_METHODID_DSTS		0x53544344 /* Device STatuS */
-#define ASUS_WMI_METHODID_DSTS2		0x53545344 /* Device STatuS #2*/
+#define ASUS_WMI_METHODID_DCTS		0x53544344 /* Device status (DCTS) */
+#define ASUS_WMI_METHODID_DSTS		0x53545344 /* Device status (DSTS) */
 #define ASUS_WMI_METHODID_BSTS		0x53545342 /* Bios STatuS ? */
 #define ASUS_WMI_METHODID_DEVS		0x53564544 /* DEVice Set */
 #define ASUS_WMI_METHODID_CFVS		0x53564643 /* CPU Frequency Volt Set */
-- 
cgit v1.2.3


From b096f626a6827ad2ced5ebdbdc04e62422d463f6 Mon Sep 17 00:00:00 2001
From: Yurii Pavlovskyi <yurii.pavlovskyi@gmail.com>
Date: Tue, 14 May 2019 21:07:05 +0200
Subject: platform/x86: asus-wmi: Switch fan boost mode

The WMI exposes a write-only device ID where up to three fan modes can be
switched on some laptops (TUF Gaming FX505GM). There is a hotkey
combination Fn-F5 that does have a fan icon, which is designed to toggle
between fan modes. The DSTS of the device ID returns information about the
presence of this capability and the presence of each of the two additional
fan modes as a bitmask (0x01 - overboost present, 0x02 - silent present)
[1].

Add a sysfs entry that reads back the last written value and updates the
value in WMI on write, and a hotkey handler that toggles between the modes,
taking into account their availability according to DSTS.

Modes:
* 0x00 - normal or balanced,
* 0x01 - overboost, increased fan RPM,
* 0x02 - silent, decreased fan RPM

[1] Link: https://lkml.org/lkml/2019/4/12/110

Signed-off-by: Yurii Pavlovskyi <yurii.pavlovskyi@gmail.com>
Suggested-by: Daniel Drake <drake@endlessm.com>
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
---
 Documentation/ABI/testing/sysfs-platform-asus-wmi |  10 ++
 drivers/platform/x86/asus-wmi.c                   | 151 ++++++++++++++++++++--
 include/linux/platform_data/x86/asus-wmi.h        |   1 +
 3 files changed, 154 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/ABI/testing/sysfs-platform-asus-wmi b/Documentation/ABI/testing/sysfs-platform-asus-wmi
index 019e1e29370e..87ae5cc983bf 100644
--- a/Documentation/ABI/testing/sysfs-platform-asus-wmi
+++ b/Documentation/ABI/testing/sysfs-platform-asus-wmi
@@ -36,3 +36,13 @@ KernelVersion:	3.5
 Contact:	"AceLan Kao" <acelan.kao@canonical.com>
 Description:
 		Resume on lid open. 1 means on, 0 means off.
+
+What:		/sys/devices/platform/<platform>/fan_mode
+Date:		Apr 2019
+KernelVersion:	5.2
+Contact:	"Yurii Pavlovskyi" <yurii.pavlovskyi@gmail.com>
+Description:
+		Fan boost mode:
+			* 0 - normal,
+			* 1 - overboost,
+			* 2 - silent
diff --git a/drivers/platform/x86/asus-wmi.c b/drivers/platform/x86/asus-wmi.c
index a1d85667383c..5712bc56fa10 100644
--- a/drivers/platform/x86/asus-wmi.c
+++ b/drivers/platform/x86/asus-wmi.c
@@ -70,6 +70,7 @@ MODULE_LICENSE("GPL");
 #define NOTIFY_KBD_BRTUP		0xc4
 #define NOTIFY_KBD_BRTDWN		0xc5
 #define NOTIFY_KBD_BRTTOGGLE		0xc7
+#define NOTIFY_KBD_FBM			0x99
 
 #define ASUS_WMI_FNLOCK_BIOS_DISABLED	BIT(0)
 
@@ -80,6 +81,13 @@ MODULE_LICENSE("GPL");
 #define ASUS_FAN_CTRL_MANUAL		1
 #define ASUS_FAN_CTRL_AUTO		2
 
+#define ASUS_FAN_MODE_NORMAL		0
+#define ASUS_FAN_MODE_OVERBOOST		1
+#define ASUS_FAN_MODE_OVERBOOST_MASK	0x01
+#define ASUS_FAN_MODE_SILENT		2
+#define ASUS_FAN_MODE_SILENT_MASK	0x02
+#define ASUS_FAN_MODES_MASK		0x03
+
 #define USB_INTEL_XUSB2PR		0xD0
 #define PCI_DEVICE_ID_INTEL_LYNXPOINT_LP_XHCI	0x9c31
 
@@ -187,6 +195,10 @@ struct asus_wmi {
 	int asus_hwmon_num_fans;
 	int asus_hwmon_pwm;
 
+	bool fan_mode_available;
+	u8 fan_mode_mask;
+	u8 fan_mode;
+
 	struct hotplug_slot hotplug_slot;
 	struct mutex hotplug_lock;
 	struct mutex wmi_lock;
@@ -1483,6 +1495,116 @@ static int asus_wmi_fan_init(struct asus_wmi *asus)
 	return 0;
 }
 
+/* Fan mode *******************************************************************/
+
+static int fan_mode_check_present(struct asus_wmi *asus)
+{
+	u32 result;
+	int err;
+
+	asus->fan_mode_available = false;
+
+	err = asus_wmi_get_devstate(asus, ASUS_WMI_DEVID_FAN_MODE, &result);
+	if (err) {
+		if (err == -ENODEV)
+			return 0;
+		else
+			return err;
+	}
+
+	if ((result & ASUS_WMI_DSTS_PRESENCE_BIT) &&
+			(result & ASUS_FAN_MODES_MASK)) {
+		asus->fan_mode_available = true;
+		asus->fan_mode_mask = result & ASUS_FAN_MODES_MASK;
+	}
+
+	return 0;
+}
+
+static int fan_mode_write(struct asus_wmi *asus)
+{
+	int err;
+	u8 value;
+	u32 retval;
+
+	value = asus->fan_mode;
+
+	pr_info("Set fan mode: %u\n", value);
+	err = asus_wmi_set_devstate(ASUS_WMI_DEVID_FAN_MODE, value, &retval);
+
+	if (err) {
+		pr_warn("Failed to set fan mode: %d\n", err);
+		return err;
+	}
+
+	if (retval != 1) {
+		pr_warn("Failed to set fan mode (retval): 0x%x\n", retval);
+		return -EIO;
+	}
+
+	return 0;
+}
+
+static int fan_mode_switch_next(struct asus_wmi *asus)
+{
+	if (asus->fan_mode == ASUS_FAN_MODE_NORMAL) {
+		if (asus->fan_mode_mask & ASUS_FAN_MODE_OVERBOOST_MASK)
+			asus->fan_mode = ASUS_FAN_MODE_OVERBOOST;
+		else if (asus->fan_mode_mask & ASUS_FAN_MODE_SILENT_MASK)
+			asus->fan_mode = ASUS_FAN_MODE_SILENT;
+	} else if (asus->fan_mode == ASUS_FAN_MODE_OVERBOOST) {
+		if (asus->fan_mode_mask & ASUS_FAN_MODE_SILENT_MASK)
+			asus->fan_mode = ASUS_FAN_MODE_SILENT;
+		else
+			asus->fan_mode = ASUS_FAN_MODE_NORMAL;
+	} else {
+		asus->fan_mode = ASUS_FAN_MODE_NORMAL;
+	}
+
+	return fan_mode_write(asus);
+}
+
+static ssize_t fan_mode_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	struct asus_wmi *asus = dev_get_drvdata(dev);
+
+	return scnprintf(buf, PAGE_SIZE, "%d\n", asus->fan_mode);
+}
+
+static ssize_t fan_mode_store(struct device *dev, struct device_attribute *attr,
+		const char *buf, size_t count)
+{
+	int result;
+	u8 new_mode;
+
+	struct asus_wmi *asus = dev_get_drvdata(dev);
+
+	result = kstrtou8(buf, 10, &new_mode);
+	if (result < 0) {
+		pr_warn("Trying to store invalid value\n");
+		return result;
+	}
+
+	if (new_mode == ASUS_FAN_MODE_OVERBOOST) {
+		if (!(asus->fan_mode_mask & ASUS_FAN_MODE_OVERBOOST_MASK))
+			return -EINVAL;
+	} else if (new_mode == ASUS_FAN_MODE_SILENT) {
+		if (!(asus->fan_mode_mask & ASUS_FAN_MODE_SILENT_MASK))
+			return -EINVAL;
+	} else if (new_mode != ASUS_FAN_MODE_NORMAL) {
+		return -EINVAL;
+	}
+
+	asus->fan_mode = new_mode;
+	fan_mode_write(asus);
+
+	return result;
+}
+
+// Fan mode: 0 - normal, 1 - overboost, 2 - silent
+static DEVICE_ATTR_RW(fan_mode);
+
 /* Backlight ******************************************************************/
 
 static int read_backlight_power(struct asus_wmi *asus)
@@ -1761,6 +1883,11 @@ static void asus_wmi_handle_event_code(int code, struct asus_wmi *asus)
 		return;
 	}
 
+	if (asus->fan_mode_available && code == NOTIFY_KBD_FBM) {
+		fan_mode_switch_next(asus);
+		return;
+	}
+
 	if (is_display_toggle(code) && asus->driver->quirks->no_display_toggle)
 		return;
 
@@ -1917,6 +2044,7 @@ static struct attribute *platform_attributes[] = {
 	&dev_attr_touchpad.attr,
 	&dev_attr_lid_resume.attr,
 	&dev_attr_als_enable.attr,
+	&dev_attr_fan_mode.attr,
 	NULL
 };
 
@@ -1938,6 +2066,8 @@ static umode_t asus_sysfs_is_visible(struct kobject *kobj,
 		devid = ASUS_WMI_DEVID_LID_RESUME;
 	else if (attr == &dev_attr_als_enable.attr)
 		devid = ASUS_WMI_DEVID_ALS_ENABLE;
+	else if (attr == &dev_attr_fan_mode.attr)
+		ok = asus->fan_mode_available;
 
 	if (devid != -1)
 		ok = !(asus_wmi_get_devstate_simple(asus, devid) < 0);
@@ -2037,12 +2167,7 @@ static int asus_wmi_platform_init(struct asus_wmi *asus)
 		asus_wmi_set_devstate(ASUS_WMI_DEVID_CWAP,
 				      asus->driver->quirks->wapf, NULL);
 
-	return asus_wmi_sysfs_init(asus->platform_device);
-}
-
-static void asus_wmi_platform_exit(struct asus_wmi *asus)
-{
-	asus_wmi_sysfs_exit(asus->platform_device);
+	return 0;
 }
 
 /* debugfs ********************************************************************/
@@ -2200,6 +2325,14 @@ static int asus_wmi_add(struct platform_device *pdev)
 	if (err)
 		goto fail_platform;
 
+	err = fan_mode_check_present(asus);
+	if (err)
+		goto fail_fan_mode;
+
+	err = asus_wmi_sysfs_init(asus->platform_device);
+	if (err)
+		goto fail_sysfs;
+
 	err = asus_wmi_input_init(asus);
 	if (err)
 		goto fail_input;
@@ -2277,7 +2410,9 @@ fail_leds:
 fail_hwmon:
 	asus_wmi_input_exit(asus);
 fail_input:
-	asus_wmi_platform_exit(asus);
+	asus_wmi_sysfs_exit(asus->platform_device);
+fail_sysfs:
+fail_fan_mode:
 fail_platform:
 	kfree(asus);
 	return err;
@@ -2294,7 +2429,7 @@ static int asus_wmi_remove(struct platform_device *device)
 	asus_wmi_led_exit(asus);
 	asus_wmi_rfkill_exit(asus);
 	asus_wmi_debugfs_exit(asus);
-	asus_wmi_platform_exit(asus);
+	asus_wmi_sysfs_exit(asus->platform_device);
 	asus_hwmon_fan_set_auto(asus);
 
 	kfree(asus);
diff --git a/include/linux/platform_data/x86/asus-wmi.h b/include/linux/platform_data/x86/asus-wmi.h
index 0668f76df921..8551156b8dca 100644
--- a/include/linux/platform_data/x86/asus-wmi.h
+++ b/include/linux/platform_data/x86/asus-wmi.h
@@ -57,6 +57,7 @@
 #define ASUS_WMI_DEVID_KBD_BACKLIGHT	0x00050021
 #define ASUS_WMI_DEVID_LIGHT_SENSOR	0x00050022 /* ?? */
 #define ASUS_WMI_DEVID_LIGHTBAR		0x00050025
+#define ASUS_WMI_DEVID_FAN_MODE		0x00110018
 
 /* Misc */
 #define ASUS_WMI_DEVID_CAMERA		0x00060013
-- 
cgit v1.2.3


From a48e23385fcf397e69e2a75d72a81c545ec8bec2 Mon Sep 17 00:00:00 2001
From: Mattias Jacobsson <2pi@mok.nu>
Date: Mon, 27 May 2019 18:21:29 +0200
Subject: platform/x86: wmi: add context pointer field to struct wmi_device_id

When using wmi_install_notify_handler() to initialize a WMI handler, a
data pointer can be supplied which will be passed on to the notification
handler. No similar feature exists when handling WMI events via struct
wmi_driver.

Add a context field pointer to struct wmi_device_id and add a function
find_guid_context() to retrieve that context pointer.

Signed-off-by: Mattias Jacobsson <2pi@mok.nu>
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
---
 drivers/platform/x86/wmi.c      | 22 ++++++++++++++++++++++
 include/linux/mod_devicetable.h |  1 +
 2 files changed, 23 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/platform/x86/wmi.c b/drivers/platform/x86/wmi.c
index b08ffb769cbe..f3be1c008856 100644
--- a/drivers/platform/x86/wmi.c
+++ b/drivers/platform/x86/wmi.c
@@ -146,6 +146,28 @@ static bool find_guid(const char *guid_string, struct wmi_block **out)
 	return false;
 }
 
+static const void *find_guid_context(struct wmi_block *wblock,
+				      struct wmi_driver *wdriver)
+{
+	const struct wmi_device_id *id;
+	uuid_le guid_input;
+
+	if (wblock == NULL || wdriver == NULL)
+		return NULL;
+	if (wdriver->id_table == NULL)
+		return NULL;
+
+	id = wdriver->id_table;
+	while (*id->guid_string) {
+		if (uuid_le_to_bin(id->guid_string, &guid_input))
+			continue;
+		if (!memcmp(wblock->gblock.guid, &guid_input, 16))
+			return id->context;
+		id++;
+	}
+	return NULL;
+}
+
 static int get_subobj_info(acpi_handle handle, const char *pathname,
 			   struct acpi_device_info **info)
 {
diff --git a/include/linux/mod_devicetable.h b/include/linux/mod_devicetable.h
index 448621c32e4d..09366859aac2 100644
--- a/include/linux/mod_devicetable.h
+++ b/include/linux/mod_devicetable.h
@@ -798,6 +798,7 @@ struct tee_client_device_id {
  */
 struct wmi_device_id {
 	const char guid_string[UUID_STRING_LEN+1];
+	const void *context;
 };
 
 #endif /* LINUX_MOD_DEVICETABLE_H */
-- 
cgit v1.2.3


From 440c4983de262f78033ec58f6abcd199a664327d Mon Sep 17 00:00:00 2001
From: Mattias Jacobsson <2pi@mok.nu>
Date: Mon, 27 May 2019 18:21:30 +0200
Subject: platform/x86: wmi: add context argument to the probe function

The struct wmi_device_id has a context pointer field, forward this
pointer as an argument to the probe function in struct wmi_driver.

Update existing users of the same probe function to accept this new
context argument.

Signed-off-by: Mattias Jacobsson <2pi@mok.nu>
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
---
 drivers/platform/x86/dell-smbios-wmi.c       | 2 +-
 drivers/platform/x86/dell-wmi-descriptor.c   | 3 ++-
 drivers/platform/x86/dell-wmi.c              | 2 +-
 drivers/platform/x86/huawei-wmi.c            | 2 +-
 drivers/platform/x86/intel-wmi-thunderbolt.c | 3 ++-
 drivers/platform/x86/wmi-bmof.c              | 2 +-
 drivers/platform/x86/wmi.c                   | 3 ++-
 include/linux/wmi.h                          | 2 +-
 8 files changed, 11 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/platform/x86/dell-smbios-wmi.c b/drivers/platform/x86/dell-smbios-wmi.c
index c3ed3c8c17b9..add2687079f7 100644
--- a/drivers/platform/x86/dell-smbios-wmi.c
+++ b/drivers/platform/x86/dell-smbios-wmi.c
@@ -146,7 +146,7 @@ fail_smbios_cmd:
 	return ret;
 }
 
-static int dell_smbios_wmi_probe(struct wmi_device *wdev)
+static int dell_smbios_wmi_probe(struct wmi_device *wdev, const void *context)
 {
 	struct wmi_driver *wdriver =
 		container_of(wdev->dev.driver, struct wmi_driver, driver);
diff --git a/drivers/platform/x86/dell-wmi-descriptor.c b/drivers/platform/x86/dell-wmi-descriptor.c
index 14ab250b7d5a..9994fd1a5acf 100644
--- a/drivers/platform/x86/dell-wmi-descriptor.c
+++ b/drivers/platform/x86/dell-wmi-descriptor.c
@@ -106,7 +106,8 @@ EXPORT_SYMBOL_GPL(dell_wmi_get_hotfix);
  * WMI buffer length        12       4    <length>
  * WMI hotfix number        16       4    <hotfix>
  */
-static int dell_wmi_descriptor_probe(struct wmi_device *wdev)
+static int dell_wmi_descriptor_probe(struct wmi_device *wdev,
+				     const void *context)
 {
 	union acpi_object *obj = NULL;
 	struct descriptor_priv *priv;
diff --git a/drivers/platform/x86/dell-wmi.c b/drivers/platform/x86/dell-wmi.c
index d118bb73fcae..72b0a69a6ed0 100644
--- a/drivers/platform/x86/dell-wmi.c
+++ b/drivers/platform/x86/dell-wmi.c
@@ -672,7 +672,7 @@ static int dell_wmi_events_set_enabled(bool enable)
 	return dell_smbios_error(ret);
 }
 
-static int dell_wmi_probe(struct wmi_device *wdev)
+static int dell_wmi_probe(struct wmi_device *wdev, const void *context)
 {
 	struct dell_wmi_priv *priv;
 	int ret;
diff --git a/drivers/platform/x86/huawei-wmi.c b/drivers/platform/x86/huawei-wmi.c
index 52fcac5b393a..195a7f3638cb 100644
--- a/drivers/platform/x86/huawei-wmi.c
+++ b/drivers/platform/x86/huawei-wmi.c
@@ -166,7 +166,7 @@ static int huawei_wmi_input_setup(struct wmi_device *wdev)
 	return input_register_device(priv->idev);
 }
 
-static int huawei_wmi_probe(struct wmi_device *wdev)
+static int huawei_wmi_probe(struct wmi_device *wdev, const void *context)
 {
 	struct huawei_wmi_priv *priv;
 	int err;
diff --git a/drivers/platform/x86/intel-wmi-thunderbolt.c b/drivers/platform/x86/intel-wmi-thunderbolt.c
index 4dfa61434a76..974c22a7ff61 100644
--- a/drivers/platform/x86/intel-wmi-thunderbolt.c
+++ b/drivers/platform/x86/intel-wmi-thunderbolt.c
@@ -56,7 +56,8 @@ static const struct attribute_group tbt_attribute_group = {
 	.attrs = tbt_attrs,
 };
 
-static int intel_wmi_thunderbolt_probe(struct wmi_device *wdev)
+static int intel_wmi_thunderbolt_probe(struct wmi_device *wdev,
+				       const void *context)
 {
 	int ret;
 
diff --git a/drivers/platform/x86/wmi-bmof.c b/drivers/platform/x86/wmi-bmof.c
index 8751a13134be..105a82b6b076 100644
--- a/drivers/platform/x86/wmi-bmof.c
+++ b/drivers/platform/x86/wmi-bmof.c
@@ -54,7 +54,7 @@ read_bmof(struct file *filp, struct kobject *kobj,
 	return count;
 }
 
-static int wmi_bmof_probe(struct wmi_device *wdev)
+static int wmi_bmof_probe(struct wmi_device *wdev, const void *context)
 {
 	struct bmof_priv *priv;
 	int ret;
diff --git a/drivers/platform/x86/wmi.c b/drivers/platform/x86/wmi.c
index f3be1c008856..2163fd8bf9e1 100644
--- a/drivers/platform/x86/wmi.c
+++ b/drivers/platform/x86/wmi.c
@@ -945,7 +945,8 @@ static int wmi_dev_probe(struct device *dev)
 		dev_warn(dev, "failed to enable device -- probing anyway\n");
 
 	if (wdriver->probe) {
-		ret = wdriver->probe(dev_to_wdev(dev));
+		ret = wdriver->probe(dev_to_wdev(dev),
+				find_guid_context(wblock, wdriver));
 		if (ret != 0)
 			goto probe_failure;
 	}
diff --git a/include/linux/wmi.h b/include/linux/wmi.h
index 592f81afecbb..1e84c474a993 100644
--- a/include/linux/wmi.h
+++ b/include/linux/wmi.h
@@ -44,7 +44,7 @@ struct wmi_driver {
 	struct device_driver driver;
 	const struct wmi_device_id *id_table;
 
-	int (*probe)(struct wmi_device *wdev);
+	int (*probe)(struct wmi_device *wdev, const void *context);
 	int (*remove)(struct wmi_device *wdev);
 	void (*notify)(struct wmi_device *device, union acpi_object *data);
 	long (*filter_callback)(struct wmi_device *wdev, unsigned int cmd,
-- 
cgit v1.2.3


From 36f34737ff48f66c8a19b8788311e4b40d4adf80 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Sat, 15 Jun 2019 20:47:37 +0300
Subject: spi: Add a prototype for exported spi_set_cs_timing()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The compiler complains about a missing prototype for spi_set_cs_timing().

drivers/spi/spi.c:3016:6: warning: no previous prototype for ‘spi_set_cs_timing’ [-Wmissing-prototypes]
 void spi_set_cs_timing(struct spi_device *spi, u8 setup, u8 hold,
      ^~~~~~~~~~~~~~~~~

Let's add it to the header.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/linux/spi/spi.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h
index d0c5ba746e01..0ec11f2911af 100644
--- a/include/linux/spi/spi.h
+++ b/include/linux/spi/spi.h
@@ -980,6 +980,8 @@ static inline void spi_message_free(struct spi_message *m)
 	kfree(m);
 }
 
+extern void spi_set_cs_timing(struct spi_device *spi, u8 setup, u8 hold, u8 inactive_dly);
+
 extern int spi_setup(struct spi_device *spi);
 extern int spi_async(struct spi_device *spi, struct spi_message *message);
 extern int spi_async_locked(struct spi_device *spi,
-- 
cgit v1.2.3


From 3006a5224f15cf68edc4878799ac6d6089861518 Mon Sep 17 00:00:00 2001
From: Fernando Fernandez Mancera <ffmancera@riseup.net>
Date: Fri, 7 Jun 2019 02:36:05 +0200
Subject: netfilter: synproxy: remove module dependency on IPv6 SYNPROXY

This is a prerequisite for the infrastructure module NETFILTER_SYNPROXY.
The new module is needed to avoid duplicated code for the SYNPROXY
nftables support.

Signed-off-by: Fernando Fernandez Mancera <ffmancera@riseup.net>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter_ipv6.h | 36 ++++++++++++++++++++++++++++++++++++
 net/ipv6/netfilter.c           |  2 ++
 2 files changed, 38 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/netfilter_ipv6.h b/include/linux/netfilter_ipv6.h
index 3a3dc4b1f0e7..35b12525ee45 100644
--- a/include/linux/netfilter_ipv6.h
+++ b/include/linux/netfilter_ipv6.h
@@ -8,6 +8,7 @@
 #define __LINUX_IP6_NETFILTER_H
 
 #include <uapi/linux/netfilter_ipv6.h>
+#include <net/tcp.h>
 
 /* Extra routing may needed on local out, as the QUEUE target never returns
  * control to the table.
@@ -35,6 +36,10 @@ struct nf_ipv6_ops {
 		       struct in6_addr *saddr);
 	int (*route)(struct net *net, struct dst_entry **dst, struct flowi *fl,
 		     bool strict);
+	u32 (*cookie_init_sequence)(const struct ipv6hdr *iph,
+				    const struct tcphdr *th, u16 *mssp);
+	int (*cookie_v6_check)(const struct ipv6hdr *iph,
+			       const struct tcphdr *th, __u32 cookie);
 #endif
 	void (*route_input)(struct sk_buff *skb);
 	int (*fragment)(struct net *net, struct sock *sk, struct sk_buff *skb,
@@ -154,6 +159,37 @@ static inline int nf_ip6_route_me_harder(struct net *net, struct sk_buff *skb)
 #endif
 }
 
+static inline u32 nf_ipv6_cookie_init_sequence(const struct ipv6hdr *iph,
+					       const struct tcphdr *th,
+					       u16 *mssp)
+{
+#if IS_MODULE(CONFIG_IPV6)
+	const struct nf_ipv6_ops *v6_ops = nf_get_ipv6_ops();
+
+	if (v6_ops)
+		return v6_ops->cookie_init_sequence(iph, th, mssp);
+
+	return 0;
+#else
+	return __cookie_v6_init_sequence(iph, th, mssp);
+#endif
+}
+
+static inline int nf_cookie_v6_check(const struct ipv6hdr *iph,
+				     const struct tcphdr *th, __u32 cookie)
+{
+#if IS_MODULE(CONFIG_IPV6)
+	const struct nf_ipv6_ops *v6_ops = nf_get_ipv6_ops();
+
+	if (v6_ops)
+		return v6_ops->cookie_v6_check(iph, th, cookie);
+
+	return 0;
+#else
+	return __cookie_v6_check(iph, th, cookie);
+#endif
+}
+
 __sum16 nf_ip6_checksum(struct sk_buff *skb, unsigned int hook,
 			unsigned int dataoff, u_int8_t protocol);
 
diff --git a/net/ipv6/netfilter.c b/net/ipv6/netfilter.c
index 86048dce301b..dffb10fdc3e8 100644
--- a/net/ipv6/netfilter.c
+++ b/net/ipv6/netfilter.c
@@ -234,6 +234,8 @@ static const struct nf_ipv6_ops ipv6ops = {
 	.route_me_harder	= ip6_route_me_harder,
 	.dev_get_saddr		= ipv6_dev_get_saddr,
 	.route			= __nf_ip6_route,
+	.cookie_init_sequence	= __cookie_v6_init_sequence,
+	.cookie_v6_check	= __cookie_v6_check,
 #endif
 	.route_input		= ip6_route_input,
 	.fragment		= ip6_fragment,
-- 
cgit v1.2.3


From c681edae33e86ff27be2d6cc717663d91df20b0e Mon Sep 17 00:00:00 2001
From: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Date: Mon, 17 Jun 2019 10:09:33 +0200
Subject: net: ipv4: move tcp_fastopen server side code to SipHash library

Using a bare block cipher in non-crypto code is almost always a bad idea,
not only for security reasons (and we've seen some examples of this in
the kernel in the past), but also for performance reasons.

In the TCP fastopen case, we call into the bare AES block cipher one or
two times (depending on whether the connection is IPv4 or IPv6). On most
systems, this results in a call chain such as

  crypto_cipher_encrypt_one(ctx, dst, src)
    crypto_cipher_crt(tfm)->cit_encrypt_one(crypto_cipher_tfm(tfm), ...);
      aesni_encrypt
        kernel_fpu_begin();
        aesni_enc(ctx, dst, src); // asm routine
        kernel_fpu_end();

It is highly unlikely that the use of special AES instructions has a
benefit in this case, especially since we are doing the above twice
for IPv6 connections, instead of using a transform which can process
the entire input in one go.

We could switch to the cbcmac(aes) shash, which would at least get
rid of the duplicated overhead in *some* cases (i.e., today, only
arm64 has an accelerated implementation of cbcmac(aes), while x86 will
end up using the generic cbcmac template wrapping the AES-NI cipher,
which basically ends up doing exactly the above). However, in the given
context, it makes more sense to use a light-weight MAC algorithm that
is more suitable for the purpose at hand, such as SipHash.

Since the output size of SipHash already matches our chosen value for
TCP_FASTOPEN_COOKIE_SIZE, and given that it accepts arbitrary input
sizes, this greatly simplifies the code as well.

NOTE: Server farms backing a single server IP for load balancing purposes
      and sharing a single fastopen key will be adversely affected by
      this change unless all systems in the pool receive their kernel
      upgrades at the same time.

Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/tcp.h     |  7 +---
 include/net/tcp.h       | 10 ++---
 net/Kconfig             |  2 -
 net/ipv4/tcp_fastopen.c | 97 ++++++++++++++++---------------------------------
 4 files changed, 36 insertions(+), 80 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index c23019a3b264..9ea0e71f5c6a 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -58,12 +58,7 @@ static inline unsigned int tcp_optlen(const struct sk_buff *skb)
 
 /* TCP Fast Open Cookie as stored in memory */
 struct tcp_fastopen_cookie {
-	union {
-		u8	val[TCP_FASTOPEN_COOKIE_MAX];
-#if IS_ENABLED(CONFIG_IPV6)
-		struct in6_addr addr;
-#endif
-	};
+	u64	val[TCP_FASTOPEN_COOKIE_MAX / sizeof(u64)];
 	s8	len;
 	bool	exp;	/* In RFC6994 experimental option format */
 };
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 96e0e53ff440..184930b02779 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1628,9 +1628,9 @@ bool tcp_fastopen_defer_connect(struct sock *sk, int *err);
 
 /* Fastopen key context */
 struct tcp_fastopen_context {
-	struct crypto_cipher	*tfm[TCP_FASTOPEN_KEY_MAX];
-	__u8			key[TCP_FASTOPEN_KEY_BUF_LENGTH];
-	struct rcu_head		rcu;
+	__u8		key[TCP_FASTOPEN_KEY_MAX][TCP_FASTOPEN_KEY_LENGTH];
+	int		num;
+	struct rcu_head	rcu;
 };
 
 extern unsigned int sysctl_tcp_fastopen_blackhole_timeout;
@@ -1665,9 +1665,7 @@ bool tcp_fastopen_cookie_match(const struct tcp_fastopen_cookie *foc,
 static inline
 int tcp_fastopen_context_len(const struct tcp_fastopen_context *ctx)
 {
-	if (ctx->tfm[1])
-		return 2;
-	return 1;
+	return ctx->num;
 }
 
 /* Latencies incurred by various limits for a sender. They are
diff --git a/net/Kconfig b/net/Kconfig
index d122f53c6fa2..57f51a279ad6 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -67,8 +67,6 @@ source "net/xdp/Kconfig"
 
 config INET
 	bool "TCP/IP networking"
-	select CRYPTO
-	select CRYPTO_AES
 	---help---
 	  These are the protocols used on the Internet and on most local
 	  Ethernets. It is highly recommended to say Y here (this will enlarge
diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c
index 7d19fa4c8121..46b67128e1ca 100644
--- a/net/ipv4/tcp_fastopen.c
+++ b/net/ipv4/tcp_fastopen.c
@@ -7,6 +7,7 @@
 #include <linux/tcp.h>
 #include <linux/rcupdate.h>
 #include <linux/rculist.h>
+#include <linux/siphash.h>
 #include <net/inetpeer.h>
 #include <net/tcp.h>
 
@@ -37,14 +38,8 @@ static void tcp_fastopen_ctx_free(struct rcu_head *head)
 {
 	struct tcp_fastopen_context *ctx =
 	    container_of(head, struct tcp_fastopen_context, rcu);
-	int i;
 
-	/* We own ctx, thus no need to hold the Fastopen-lock */
-	for (i = 0; i < TCP_FASTOPEN_KEY_MAX; i++) {
-		if (ctx->tfm[i])
-			crypto_free_cipher(ctx->tfm[i]);
-	}
-	kfree(ctx);
+	kzfree(ctx);
 }
 
 void tcp_fastopen_destroy_cipher(struct sock *sk)
@@ -72,41 +67,6 @@ void tcp_fastopen_ctx_destroy(struct net *net)
 		call_rcu(&ctxt->rcu, tcp_fastopen_ctx_free);
 }
 
-static struct tcp_fastopen_context *tcp_fastopen_alloc_ctx(void *primary_key,
-							   void *backup_key,
-							   unsigned int len)
-{
-	struct tcp_fastopen_context *new_ctx;
-	void *key = primary_key;
-	int err, i;
-
-	new_ctx = kmalloc(sizeof(*new_ctx), GFP_KERNEL);
-	if (!new_ctx)
-		return ERR_PTR(-ENOMEM);
-	for (i = 0; i < TCP_FASTOPEN_KEY_MAX; i++)
-		new_ctx->tfm[i] = NULL;
-	for (i = 0; i < (backup_key ? 2 : 1); i++) {
-		new_ctx->tfm[i] = crypto_alloc_cipher("aes", 0, 0);
-		if (IS_ERR(new_ctx->tfm[i])) {
-			err = PTR_ERR(new_ctx->tfm[i]);
-			new_ctx->tfm[i] = NULL;
-			pr_err("TCP: TFO aes cipher alloc error: %d\n", err);
-			goto out;
-		}
-		err = crypto_cipher_setkey(new_ctx->tfm[i], key, len);
-		if (err) {
-			pr_err("TCP: TFO cipher key error: %d\n", err);
-			goto out;
-		}
-		memcpy(&new_ctx->key[i * TCP_FASTOPEN_KEY_LENGTH], key, len);
-		key = backup_key;
-	}
-	return new_ctx;
-out:
-	tcp_fastopen_ctx_free(&new_ctx->rcu);
-	return ERR_PTR(err);
-}
-
 int tcp_fastopen_reset_cipher(struct net *net, struct sock *sk,
 			      void *primary_key, void *backup_key,
 			      unsigned int len)
@@ -115,11 +75,20 @@ int tcp_fastopen_reset_cipher(struct net *net, struct sock *sk,
 	struct fastopen_queue *q;
 	int err = 0;
 
-	ctx = tcp_fastopen_alloc_ctx(primary_key, backup_key, len);
-	if (IS_ERR(ctx)) {
-		err = PTR_ERR(ctx);
+	ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
+	if (!ctx) {
+		err = -ENOMEM;
 		goto out;
 	}
+
+	memcpy(ctx->key[0], primary_key, len);
+	if (backup_key) {
+		memcpy(ctx->key[1], backup_key, len);
+		ctx->num = 2;
+	} else {
+		ctx->num = 1;
+	}
+
 	spin_lock(&net->ipv4.tcp_fastopen_ctx_lock);
 	if (sk) {
 		q = &inet_csk(sk)->icsk_accept_queue.fastopenq;
@@ -141,31 +110,30 @@ out:
 
 static bool __tcp_fastopen_cookie_gen_cipher(struct request_sock *req,
 					     struct sk_buff *syn,
-					     struct crypto_cipher *tfm,
+					     const u8 *key,
 					     struct tcp_fastopen_cookie *foc)
 {
+	BUILD_BUG_ON(TCP_FASTOPEN_KEY_LENGTH != sizeof(siphash_key_t));
+	BUILD_BUG_ON(TCP_FASTOPEN_COOKIE_SIZE != sizeof(u64));
+
 	if (req->rsk_ops->family == AF_INET) {
 		const struct iphdr *iph = ip_hdr(syn);
-		__be32 path[4] = { iph->saddr, iph->daddr, 0, 0 };
 
-		crypto_cipher_encrypt_one(tfm, foc->val, (void *)path);
+		foc->val[0] = siphash(&iph->saddr,
+				      sizeof(iph->saddr) +
+				      sizeof(iph->daddr),
+				      (const siphash_key_t *)key);
 		foc->len = TCP_FASTOPEN_COOKIE_SIZE;
 		return true;
 	}
-
 #if IS_ENABLED(CONFIG_IPV6)
 	if (req->rsk_ops->family == AF_INET6) {
 		const struct ipv6hdr *ip6h = ipv6_hdr(syn);
-		struct tcp_fastopen_cookie tmp;
-		struct in6_addr *buf;
-		int i;
-
-		crypto_cipher_encrypt_one(tfm, tmp.val,
-					  (void *)&ip6h->saddr);
-		buf = &tmp.addr;
-		for (i = 0; i < 4; i++)
-			buf->s6_addr32[i] ^= ip6h->daddr.s6_addr32[i];
-		crypto_cipher_encrypt_one(tfm, foc->val, (void *)buf);
+
+		foc->val[0] = siphash(&ip6h->saddr,
+				      sizeof(ip6h->saddr) +
+				      sizeof(ip6h->daddr),
+				      (const siphash_key_t *)key);
 		foc->len = TCP_FASTOPEN_COOKIE_SIZE;
 		return true;
 	}
@@ -173,11 +141,8 @@ static bool __tcp_fastopen_cookie_gen_cipher(struct request_sock *req,
 	return false;
 }
 
-/* Generate the fastopen cookie by doing aes128 encryption on both
- * the source and destination addresses. Pad 0s for IPv4 or IPv4-mapped-IPv6
- * addresses. For the longer IPv6 addresses use CBC-MAC.
- *
- * XXX (TFO) - refactor when TCP_FASTOPEN_COOKIE_SIZE != AES_BLOCK_SIZE.
+/* Generate the fastopen cookie by applying SipHash to both the source and
+ * destination addresses.
  */
 static void tcp_fastopen_cookie_gen(struct sock *sk,
 				    struct request_sock *req,
@@ -189,7 +154,7 @@ static void tcp_fastopen_cookie_gen(struct sock *sk,
 	rcu_read_lock();
 	ctx = tcp_fastopen_get_ctx(sk);
 	if (ctx)
-		__tcp_fastopen_cookie_gen_cipher(req, syn, ctx->tfm[0], foc);
+		__tcp_fastopen_cookie_gen_cipher(req, syn, ctx->key[0], foc);
 	rcu_read_unlock();
 }
 
@@ -253,7 +218,7 @@ static int tcp_fastopen_cookie_gen_check(struct sock *sk,
 	if (!ctx)
 		goto out;
 	for (i = 0; i < tcp_fastopen_context_len(ctx); i++) {
-		__tcp_fastopen_cookie_gen_cipher(req, syn, ctx->tfm[i], foc);
+		__tcp_fastopen_cookie_gen_cipher(req, syn, ctx->key[i], foc);
 		if (tcp_fastopen_cookie_match(foc, orig)) {
 			ret = i + 1;
 			goto out;
-- 
cgit v1.2.3


From 4ed9890c4c44d2ead7b57ad65425e3fbe9b9d42a Mon Sep 17 00:00:00 2001
From: Anurag Kumar Vulisha <anurag.kumar.vulisha@xilinx.com>
Date: Fri, 10 May 2019 12:37:27 +0530
Subject: usb: gadget: send usb_gadget as an argument in get_config_params

Passing struct usb_gadget * as an extra argument to get_config_params
allows gadget drivers to easily update the U1DevExitLat & U2DevExitLat
values based on the values passed from the device tree. This patch
adds that argument.

Signed-off-by: Anurag Kumar Vulisha <anurag.kumar.vulisha@xilinx.com>
Signed-off-by: Felipe Balbi <felipe.balbi@linux.intel.com>
---
 drivers/usb/gadget/composite.c | 2 +-
 include/linux/usb/gadget.h     | 3 ++-
 2 files changed, 3 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/usb/gadget/composite.c b/drivers/usb/gadget/composite.c
index b8a15840b4ff..9118b42c70b6 100644
--- a/drivers/usb/gadget/composite.c
+++ b/drivers/usb/gadget/composite.c
@@ -653,7 +653,7 @@ static int bos_desc(struct usb_composite_dev *cdev)
 
 		/* Get Controller configuration */
 		if (cdev->gadget->ops->get_config_params) {
-			cdev->gadget->ops->get_config_params(
+			cdev->gadget->ops->get_config_params(cdev->gadget,
 				&dcd_config_params);
 		} else {
 			dcd_config_params.bU1devExitLat =
diff --git a/include/linux/usb/gadget.h b/include/linux/usb/gadget.h
index 7595056b96c1..fb19141151d8 100644
--- a/include/linux/usb/gadget.h
+++ b/include/linux/usb/gadget.h
@@ -310,7 +310,8 @@ struct usb_gadget_ops {
 	int	(*pullup) (struct usb_gadget *, int is_on);
 	int	(*ioctl)(struct usb_gadget *,
 				unsigned code, unsigned long param);
-	void	(*get_config_params)(struct usb_dcd_config_params *);
+	void	(*get_config_params)(struct usb_gadget *,
+				     struct usb_dcd_config_params *);
 	int	(*udc_start)(struct usb_gadget *,
 			struct usb_gadget_driver *);
 	int	(*udc_stop)(struct usb_gadget *);
-- 
cgit v1.2.3


From 99600fd47eafd20b9ba6e04562bb2fcc48475344 Mon Sep 17 00:00:00 2001
From: Chen-Yu Tsai <wens@csie.org>
Date: Mon, 22 Apr 2019 07:15:05 +0800
Subject: clk: Add CLK_HW_INIT_* macros using .parent_hws

With the new clk parenting code, struct clk_init_data was expanded to
include .parent_hws, for clk drivers to directly list parents by
pointing to their respective struct clk_hw's.

Add macros that can take either one single struct clk_hw *, or an array
of them, for drivers to use.

A special CLK_HW_INIT_HWS macro is included, which takes an array of
struct clk_hw *, but sets .num_parents to 1. This variant is to allow
the reuse of the array, instead of having a compound literal allocated
for each clk sharing the same parent.

Signed-off-by: Chen-Yu Tsai <wens@csie.org>
---
 include/linux/clk-provider.h | 32 ++++++++++++++++++++++++++++++++
 1 file changed, 32 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h
index bb6118f79784..70aad5cefea7 100644
--- a/include/linux/clk-provider.h
+++ b/include/linux/clk-provider.h
@@ -904,6 +904,29 @@ extern struct of_device_id __clk_of_table;
 		.ops		= _ops,				\
 	})
 
+#define CLK_HW_INIT_HW(_name, _parent, _ops, _flags)			\
+	(&(struct clk_init_data) {					\
+		.flags		= _flags,				\
+		.name		= _name,				\
+		.parent_hws	= (const struct clk_hw*[]) { _parent },	\
+		.num_parents	= 1,					\
+		.ops		= _ops,					\
+	})
+
+/*
+ * This macro is intended for drivers to be able to share the otherwise
+ * individual struct clk_hw[] compound literals created by the compiler
+ * when using CLK_HW_INIT_HW. It does NOT support multiple parents.
+ */
+#define CLK_HW_INIT_HWS(_name, _parent, _ops, _flags)			\
+	(&(struct clk_init_data) {					\
+		.flags		= _flags,				\
+		.name		= _name,				\
+		.parent_hws	= _parent,				\
+		.num_parents	= 1,					\
+		.ops		= _ops,					\
+	})
+
 #define CLK_HW_INIT_PARENTS(_name, _parents, _ops, _flags)	\
 	(&(struct clk_init_data) {				\
 		.flags		= _flags,			\
@@ -913,6 +936,15 @@ extern struct of_device_id __clk_of_table;
 		.ops		= _ops,				\
 	})
 
+#define CLK_HW_INIT_PARENTS_HW(_name, _parents, _ops, _flags)	\
+	(&(struct clk_init_data) {				\
+		.flags		= _flags,			\
+		.name		= _name,			\
+		.parent_hws	= _parents,			\
+		.num_parents	= ARRAY_SIZE(_parents),		\
+		.ops		= _ops,				\
+	})
+
 #define CLK_HW_INIT_NO_PARENT(_name, _ops, _flags)	\
 	(&(struct clk_init_data) {			\
 		.flags          = _flags,		\
-- 
cgit v1.2.3


From 2d6b4f33e637bf51c50c536966a19e94a59f3212 Mon Sep 17 00:00:00 2001
From: Chen-Yu Tsai <wens@csie.org>
Date: Fri, 3 May 2019 11:49:03 +0800
Subject: clk: Add CLK_HW_INIT_FW_NAME macro using .fw_name in .parent_data

With the new clk parenting code, clk_init_data was expanded to include
.parent_data, for clk drivers that have parents referenced using a
combination of device tree clock-names, clock indices, and/or clk_hw
pointers.

Add a CLK_HW_INIT_FW_NAME macro for specifying a single parent from the
device tree using .fw_name in struct clk_parent_data.

Signed-off-by: Chen-Yu Tsai <wens@csie.org>
---
 include/linux/clk-provider.h | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h
index 70aad5cefea7..b19063512a29 100644
--- a/include/linux/clk-provider.h
+++ b/include/linux/clk-provider.h
@@ -927,6 +927,17 @@ extern struct of_device_id __clk_of_table;
 		.ops		= _ops,					\
 	})
 
+#define CLK_HW_INIT_FW_NAME(_name, _parent, _ops, _flags)		\
+	(&(struct clk_init_data) {					\
+		.flags		= _flags,				\
+		.name		= _name,				\
+		.parent_data	= (const struct clk_parent_data[]) {	\
+					{ .fw_name = _parent },		\
+				  },					\
+		.num_parents	= 1,					\
+		.ops		= _ops,					\
+	})
+
 #define CLK_HW_INIT_PARENTS(_name, _parents, _ops, _flags)	\
 	(&(struct clk_init_data) {				\
 		.flags		= _flags,			\
-- 
cgit v1.2.3


From 13933109dff0a5abbfc3980304c6c21c90829810 Mon Sep 17 00:00:00 2001
From: Chen-Yu Tsai <wens@csie.org>
Date: Mon, 22 Apr 2019 07:17:50 +0800
Subject: clk: Add CLK_HW_INIT_PARENTS_DATA macro using .parent_data

With the new clk parenting code, struct clk_init_data was expanded to
include .parent_data, for clk drivers that have parents referenced using
a combination of device tree clock-names, clock indices, and/or struct
clk_hw pointers.

Add a new macro that can take a list of struct clk_parent_data for
drivers to use.

Signed-off-by: Chen-Yu Tsai <wens@csie.org>
---
 include/linux/clk-provider.h | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h
index b19063512a29..0fd14c4874d6 100644
--- a/include/linux/clk-provider.h
+++ b/include/linux/clk-provider.h
@@ -956,6 +956,15 @@ extern struct of_device_id __clk_of_table;
 		.ops		= _ops,				\
 	})
 
+#define CLK_HW_INIT_PARENTS_DATA(_name, _parents, _ops, _flags)	\
+	(&(struct clk_init_data) {				\
+		.flags		= _flags,			\
+		.name		= _name,			\
+		.parent_data	= _parents,			\
+		.num_parents	= ARRAY_SIZE(_parents),		\
+		.ops		= _ops,				\
+	})
+
 #define CLK_HW_INIT_NO_PARENT(_name, _ops, _flags)	\
 	(&(struct clk_init_data) {			\
 		.flags          = _flags,		\
-- 
cgit v1.2.3


From d7b15114aba956ca395ec5cc28f68fe861ffc208 Mon Sep 17 00:00:00 2001
From: Chen-Yu Tsai <wens@csie.org>
Date: Mon, 22 Apr 2019 07:19:46 +0800
Subject: clk: fixed-factor: Add CLK_FIXED_FACTOR_HW which takes clk_hw pointer
 as parent

With the new clk parenting code, clk_init_data was expanded to include
.parent_hws, for clk drivers to directly reference parents by clk_hw.

Add a new macro, CLK_FIXED_FACTOR_HW, that can take a struct clk_hw
pointer, instead of a string, as its parent.

Signed-off-by: Chen-Yu Tsai <wens@csie.org>
---
 include/linux/clk-provider.h | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h
index 0fd14c4874d6..c85e9f3809f2 100644
--- a/include/linux/clk-provider.h
+++ b/include/linux/clk-provider.h
@@ -985,6 +985,17 @@ extern struct of_device_id __clk_of_table;
 					      _flags),			\
 	}
 
+#define CLK_FIXED_FACTOR_HW(_struct, _name, _parent,			\
+			    _div, _mult, _flags)			\
+	struct clk_fixed_factor _struct = {				\
+		.div		= _div,					\
+		.mult		= _mult,				\
+		.hw.init	= CLK_HW_INIT_HW(_name,			\
+						 _parent,		\
+						 &clk_fixed_factor_ops,	\
+						 _flags),		\
+	}
+
 #ifdef CONFIG_OF
 int of_clk_add_provider(struct device_node *np,
 			struct clk *(*clk_src_get)(struct of_phandle_args *args,
-- 
cgit v1.2.3


From 1bef004e2680511ecbb6b5db3954fba430501ecb Mon Sep 17 00:00:00 2001
From: Chen-Yu Tsai <wens@csie.org>
Date: Mon, 6 May 2019 10:43:16 +0800
Subject: clk: fixed-factor: Add CLK_FIXED_FACTOR_HWS which takes list of
 struct clk_hw *

With the new clk parenting code, clk_init_data was expanded to include
.parent_hws, for clk drivers to directly reference parents by clk_hw.

Add a new macro, CLK_FIXED_FACTOR_HWS, that can take an array of pointers
to struct clk_hw, instead of a string, as its parent. Taking an array
instead of a direct pointer allows the reuse of the array for multiple
clks, rather than having one compound literal with the same contents
allocated for each clk declaration.

Signed-off-by: Chen-Yu Tsai <wens@csie.org>
---
 include/linux/clk-provider.h | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h
index c85e9f3809f2..146a6859969e 100644
--- a/include/linux/clk-provider.h
+++ b/include/linux/clk-provider.h
@@ -996,6 +996,21 @@ extern struct of_device_id __clk_of_table;
 						 _flags),		\
 	}
 
+/*
+ * This macro allows the driver to reuse the _parent array for multiple
+ * fixed factor clk declarations.
+ */
+#define CLK_FIXED_FACTOR_HWS(_struct, _name, _parent,			\
+			     _div, _mult, _flags)			\
+	struct clk_fixed_factor _struct = {				\
+		.div		= _div,					\
+		.mult		= _mult,				\
+		.hw.init	= CLK_HW_INIT_HWS(_name,		\
+						  _parent,		\
+						  &clk_fixed_factor_ops, \
+						  _flags),	\
+	}
+
 #ifdef CONFIG_OF
 int of_clk_add_provider(struct device_node *np,
 			struct clk *(*clk_src_get)(struct of_phandle_args *args,
-- 
cgit v1.2.3


From 8b13a48b891c7c855e9f3a401d91391a946f4ca7 Mon Sep 17 00:00:00 2001
From: Chen-Yu Tsai <wens@csie.org>
Date: Fri, 3 May 2019 11:58:20 +0800
Subject: clk: fixed-factor: Add CLK_FIXED_FACTOR_FW_NAME for DT clock-names
 parent

With the new clk parenting code, clk_init_data was expanded to include
.parent_data, for clk drivers to specify parents using a combination of
device tree clock-names, pointers to struct clk_hw, device tree clocks,
and/or fallback global clock names.

Add a new macro, CLK_FIXED_FACTOR_FW_NAME, that takes a string to match
a clock-names entry in the device tree to specify the clock parent.

Signed-off-by: Chen-Yu Tsai <wens@csie.org>
---
 include/linux/clk-provider.h | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h
index 146a6859969e..e5c44f6dd897 100644
--- a/include/linux/clk-provider.h
+++ b/include/linux/clk-provider.h
@@ -1011,6 +1011,17 @@ extern struct of_device_id __clk_of_table;
 						  _flags),	\
 	}
 
+#define CLK_FIXED_FACTOR_FW_NAME(_struct, _name, _parent,		\
+				 _div, _mult, _flags)			\
+	struct clk_fixed_factor _struct = {				\
+		.div		= _div,					\
+		.mult		= _mult,				\
+		.hw.init	= CLK_HW_INIT_FW_NAME(_name,		\
+						      _parent,		\
+						      &clk_fixed_factor_ops, \
+						      _flags),		\
+	}
+
 #ifdef CONFIG_OF
 int of_clk_add_provider(struct device_node *np,
 			struct clk *(*clk_src_get)(struct of_phandle_args *args,
-- 
cgit v1.2.3


From 4eb293487d05a69862a4907ee944aa271ed49a4c Mon Sep 17 00:00:00 2001
From: Masahiro Yamada <yamada.masahiro@socionext.com>
Date: Thu, 13 Jun 2019 10:55:32 +0900
Subject: pinctrl: make pinconf.h self-contained

This header uses 'bool', but it does not include any header by itself.

So, it could cause unknown type name error, depending on the header
include order, although probably <linux/types.h> has been included by
someone else.

Include <linux/types.h> to make it self-contained.

Signed-off-by: Masahiro Yamada <yamada.masahiro@socionext.com>
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
---
 include/linux/pinctrl/pinconf.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/pinctrl/pinconf.h b/include/linux/pinctrl/pinconf.h
index 93c9dd133e9d..9bebc3554809 100644
--- a/include/linux/pinctrl/pinconf.h
+++ b/include/linux/pinctrl/pinconf.h
@@ -14,6 +14,8 @@
 
 #ifdef CONFIG_PINCONF
 
+#include <linux/types.h>
+
 struct pinctrl_dev;
 struct seq_file;
 
-- 
cgit v1.2.3


From 29875a52915e09abb9703722054f6443cb492ccc Mon Sep 17 00:00:00 2001
From: Thomas Hellstrom <thellstrom@vmware.com>
Date: Fri, 12 Oct 2018 17:06:06 +0200
Subject: mm: Add an apply_to_pfn_range interface
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This is basically apply_to_page_range with added functionality:
Allocating missing parts of the page table becomes optional, which
means that the function can be guaranteed not to error if allocation
is disabled. Also passing of the closure struct and callback function
becomes different and more in line with how things are done elsewhere.

Finally, we keep apply_to_page_range as a wrapper around apply_to_pfn_range.

The reason for not using the page-walk code is that we want to perform
the page-walk on vmas pointing to an address space without requiring the
mmap_sem to be held rather than on vmas belonging to a process with the
mmap_sem held.

Notable changes since RFC:
Don't export apply_to_pfn_range.

Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@surriel.com>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Huang Ying <ying.huang@intel.com>
Cc: Souptick Joarder <jrdr.linux@gmail.com>
Cc: "Jérôme Glisse" <jglisse@redhat.com>
Cc: linux-mm@kvack.org
Cc: linux-kernel@vger.kernel.org

Signed-off-by: Thomas Hellstrom <thellstrom@vmware.com>
Reviewed-by: Ralph Campbell <rcampbell@nvidia.com> #v1
---
 include/linux/mm.h |  10 ++++
 mm/memory.c        | 135 ++++++++++++++++++++++++++++++++++++++++-------------
 2 files changed, 113 insertions(+), 32 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 0e8834ac32b7..3d06ce2a64af 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2675,6 +2675,16 @@ typedef int (*pte_fn_t)(pte_t *pte, pgtable_t token, unsigned long addr,
 extern int apply_to_page_range(struct mm_struct *mm, unsigned long address,
 			       unsigned long size, pte_fn_t fn, void *data);
 
+struct pfn_range_apply;
+typedef int (*pter_fn_t)(pte_t *pte, pgtable_t token, unsigned long addr,
+			 struct pfn_range_apply *closure);
+struct pfn_range_apply {
+	struct mm_struct *mm;
+	pter_fn_t ptefn;
+	unsigned int alloc;
+};
+extern int apply_to_pfn_range(struct pfn_range_apply *closure,
+			      unsigned long address, unsigned long size);
 
 #ifdef CONFIG_PAGE_POISONING
 extern bool page_poisoning_enabled(void);
diff --git a/mm/memory.c b/mm/memory.c
index 168f546af1ad..462aa47f8878 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2032,18 +2032,17 @@ int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long
 }
 EXPORT_SYMBOL(vm_iomap_memory);
 
-static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
-				     unsigned long addr, unsigned long end,
-				     pte_fn_t fn, void *data)
+static int apply_to_pte_range(struct pfn_range_apply *closure, pmd_t *pmd,
+			      unsigned long addr, unsigned long end)
 {
 	pte_t *pte;
 	int err;
 	pgtable_t token;
 	spinlock_t *uninitialized_var(ptl);
 
-	pte = (mm == &init_mm) ?
+	pte = (closure->mm == &init_mm) ?
 		pte_alloc_kernel(pmd, addr) :
-		pte_alloc_map_lock(mm, pmd, addr, &ptl);
+		pte_alloc_map_lock(closure->mm, pmd, addr, &ptl);
 	if (!pte)
 		return -ENOMEM;
 
@@ -2054,86 +2053,109 @@ static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
 	token = pmd_pgtable(*pmd);
 
 	do {
-		err = fn(pte++, token, addr, data);
+		err = closure->ptefn(pte++, token, addr, closure);
 		if (err)
 			break;
 	} while (addr += PAGE_SIZE, addr != end);
 
 	arch_leave_lazy_mmu_mode();
 
-	if (mm != &init_mm)
+	if (closure->mm != &init_mm)
 		pte_unmap_unlock(pte-1, ptl);
 	return err;
 }
 
-static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud,
-				     unsigned long addr, unsigned long end,
-				     pte_fn_t fn, void *data)
+static int apply_to_pmd_range(struct pfn_range_apply *closure, pud_t *pud,
+			      unsigned long addr, unsigned long end)
 {
 	pmd_t *pmd;
 	unsigned long next;
-	int err;
+	int err = 0;
 
 	BUG_ON(pud_huge(*pud));
 
-	pmd = pmd_alloc(mm, pud, addr);
+	pmd = pmd_alloc(closure->mm, pud, addr);
 	if (!pmd)
 		return -ENOMEM;
+
 	do {
 		next = pmd_addr_end(addr, end);
-		err = apply_to_pte_range(mm, pmd, addr, next, fn, data);
+		if (!closure->alloc && pmd_none_or_clear_bad(pmd))
+			continue;
+		err = apply_to_pte_range(closure, pmd, addr, next);
 		if (err)
 			break;
 	} while (pmd++, addr = next, addr != end);
 	return err;
 }
 
-static int apply_to_pud_range(struct mm_struct *mm, p4d_t *p4d,
-				     unsigned long addr, unsigned long end,
-				     pte_fn_t fn, void *data)
+static int apply_to_pud_range(struct pfn_range_apply *closure, p4d_t *p4d,
+			      unsigned long addr, unsigned long end)
 {
 	pud_t *pud;
 	unsigned long next;
-	int err;
+	int err = 0;
 
-	pud = pud_alloc(mm, p4d, addr);
+	pud = pud_alloc(closure->mm, p4d, addr);
 	if (!pud)
 		return -ENOMEM;
+
 	do {
 		next = pud_addr_end(addr, end);
-		err = apply_to_pmd_range(mm, pud, addr, next, fn, data);
+		if (!closure->alloc && pud_none_or_clear_bad(pud))
+			continue;
+		err = apply_to_pmd_range(closure, pud, addr, next);
 		if (err)
 			break;
 	} while (pud++, addr = next, addr != end);
 	return err;
 }
 
-static int apply_to_p4d_range(struct mm_struct *mm, pgd_t *pgd,
-				     unsigned long addr, unsigned long end,
-				     pte_fn_t fn, void *data)
+static int apply_to_p4d_range(struct pfn_range_apply *closure, pgd_t *pgd,
+			      unsigned long addr, unsigned long end)
 {
 	p4d_t *p4d;
 	unsigned long next;
-	int err;
+	int err = 0;
 
-	p4d = p4d_alloc(mm, pgd, addr);
+	p4d = p4d_alloc(closure->mm, pgd, addr);
 	if (!p4d)
 		return -ENOMEM;
+
 	do {
 		next = p4d_addr_end(addr, end);
-		err = apply_to_pud_range(mm, p4d, addr, next, fn, data);
+		if (!closure->alloc && p4d_none_or_clear_bad(p4d))
+			continue;
+		err = apply_to_pud_range(closure, p4d, addr, next);
 		if (err)
 			break;
 	} while (p4d++, addr = next, addr != end);
 	return err;
 }
 
-/*
- * Scan a region of virtual memory, filling in page tables as necessary
- * and calling a provided function on each leaf page table.
+/**
+ * apply_to_pfn_range - Scan a region of virtual memory, calling a provided
+ * function on each leaf page table entry
+ * @closure: Details about how to scan and what function to apply
+ * @addr: Start virtual address
+ * @size: Size of the region
+ *
+ * If @closure->alloc is set to 1, the function will fill in the page table
+ * as necessary. Otherwise it will skip non-present parts.
+ * Note: The caller must ensure that the range does not contain huge pages.
+ * The caller must also assure that the proper mmu_notifier functions are
+ * called before and after the call to apply_to_pfn_range.
+ *
+ * WARNING: Do not use this function unless you know exactly what you are
+ * doing. It is lacking support for huge pages and transparent huge pages.
+ *
+ * Return: Zero on success. If the provided function returns a non-zero status,
+ * the page table walk will terminate and that status will be returned.
+ * If @closure->alloc is set to 1, then this function may also return memory
+ * allocation errors arising from allocating page table memory.
  */
-int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
-			unsigned long size, pte_fn_t fn, void *data)
+int apply_to_pfn_range(struct pfn_range_apply *closure,
+		       unsigned long addr, unsigned long size)
 {
 	pgd_t *pgd;
 	unsigned long next;
@@ -2143,16 +2165,65 @@ int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
 	if (WARN_ON(addr >= end))
 		return -EINVAL;
 
-	pgd = pgd_offset(mm, addr);
+	pgd = pgd_offset(closure->mm, addr);
 	do {
 		next = pgd_addr_end(addr, end);
-		err = apply_to_p4d_range(mm, pgd, addr, next, fn, data);
+		if (!closure->alloc && pgd_none_or_clear_bad(pgd))
+			continue;
+		err = apply_to_p4d_range(closure, pgd, addr, next);
 		if (err)
 			break;
 	} while (pgd++, addr = next, addr != end);
 
 	return err;
 }
+
+/**
+ * struct page_range_apply - Closure structure for apply_to_page_range()
+ * @pter: The base closure structure we derive from
+ * @fn: The leaf pte function to call
+ * @data: The leaf pte function closure
+ */
+struct page_range_apply {
+	struct pfn_range_apply pter;
+	pte_fn_t fn;
+	void *data;
+};
+
+/*
+ * Callback wrapper to enable use of apply_to_pfn_range for
+ * the apply_to_page_range interface
+ */
+static int apply_to_page_range_wrapper(pte_t *pte, pgtable_t token,
+				       unsigned long addr,
+				       struct pfn_range_apply *pter)
+{
+	struct page_range_apply *pra =
+		container_of(pter, typeof(*pra), pter);
+
+	return pra->fn(pte, token, addr, pra->data);
+}
+
+/*
+ * Scan a region of virtual memory, filling in page tables as necessary
+ * and calling a provided function on each leaf page table.
+ *
+ * WARNING: Do not use this function unless you know exactly what you are
+ * doing. It is lacking support for huge pages and transparent huge pages.
+ */
+int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
+			unsigned long size, pte_fn_t fn, void *data)
+{
+	struct page_range_apply pra = {
+		.pter = {.mm = mm,
+			 .alloc = 1,
+			 .ptefn = apply_to_page_range_wrapper },
+		.fn = fn,
+		.data = data
+	};
+
+	return apply_to_pfn_range(&pra.pter, addr, size);
+}
 EXPORT_SYMBOL_GPL(apply_to_page_range);
 
 /*
-- 
cgit v1.2.3


From 4fe51e9e7902b5724b618dadd9527b1bbf2b55cc Mon Sep 17 00:00:00 2001
From: Thomas Hellstrom <thellstrom@vmware.com>
Date: Tue, 19 Mar 2019 13:12:30 +0100
Subject: mm: Add write-protect and clean utilities for address space ranges
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add two utilities to a) write-protect and b) clean all ptes pointing into
a range of an address space.
The utilities are intended to aid in tracking dirty pages (either
driver-allocated system memory or pci device memory).
The write-protect utility should be used in conjunction with
page_mkwrite() and pfn_mkwrite() to trigger write page-faults on page
accesses. Typically one would want to use this on sparse accesses into
large memory regions. The clean utility should be used to utilize
hardware dirtying functionality and avoid the overhead of page-faults,
typically on large accesses into small memory regions.

The added file "as_dirty_helpers.c" is initially listed as maintained by
VMware under our DRM driver. If somebody would like it elsewhere,
that's of course no problem.

Notable changes since RFC:
- Added comments to help avoid the usage of these functions for VMAs
  they are not intended for. We also do advisory checks on the vm_flags
  and warn on illegal usage.
- Perform the pte modifications the same way softdirty does.
- Add mmu_notifier range invalidation calls.
- Add a config option so that this code is not unconditionally included.
- Tell the mmu_gather code about pending tlb flushes.

Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@surriel.com>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Huang Ying <ying.huang@intel.com>
Cc: Souptick Joarder <jrdr.linux@gmail.com>
Cc: "Jérôme Glisse" <jglisse@redhat.com>
Cc: linux-mm@kvack.org
Cc: linux-kernel@vger.kernel.org

Signed-off-by: Thomas Hellstrom <thellstrom@vmware.com>
Reviewed-by: Ralph Campbell <rcampbell@nvidia.com> #v1
---
 MAINTAINERS           |   1 +
 include/linux/mm.h    |   9 +-
 mm/Kconfig            |   3 +
 mm/Makefile           |   1 +
 mm/as_dirty_helpers.c | 300 ++++++++++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 313 insertions(+), 1 deletion(-)
 create mode 100644 mm/as_dirty_helpers.c

(limited to 'include/linux')

diff --git a/MAINTAINERS b/MAINTAINERS
index 7a2f487ea49a..a55d4ef91b0b 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -5179,6 +5179,7 @@ T:	git git://people.freedesktop.org/~thomash/linux
 S:	Supported
 F:	drivers/gpu/drm/vmwgfx/
 F:	include/uapi/drm/vmwgfx_drm.h
+F:	mm/as_dirty_helpers.c
 
 DRM DRIVERS
 M:	David Airlie <airlied@linux.ie>
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 3d06ce2a64af..a0bc2a82917e 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2685,7 +2685,14 @@ struct pfn_range_apply {
 };
 extern int apply_to_pfn_range(struct pfn_range_apply *closure,
 			      unsigned long address, unsigned long size);
-
+unsigned long apply_as_wrprotect(struct address_space *mapping,
+				 pgoff_t first_index, pgoff_t nr);
+unsigned long apply_as_clean(struct address_space *mapping,
+			     pgoff_t first_index, pgoff_t nr,
+			     pgoff_t bitmap_pgoff,
+			     unsigned long *bitmap,
+			     pgoff_t *start,
+			     pgoff_t *end);
 #ifdef CONFIG_PAGE_POISONING
 extern bool page_poisoning_enabled(void);
 extern void kernel_poison_pages(struct page *page, int numpages, int enable);
diff --git a/mm/Kconfig b/mm/Kconfig
index f0c76ba47695..5006d0e6a5c7 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -765,4 +765,7 @@ config GUP_BENCHMARK
 config ARCH_HAS_PTE_SPECIAL
 	bool
 
+config AS_DIRTY_HELPERS
+        bool
+
 endmenu
diff --git a/mm/Makefile b/mm/Makefile
index ac5e5ba78874..f5d412bbc2f7 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -104,3 +104,4 @@ obj-$(CONFIG_HARDENED_USERCOPY) += usercopy.o
 obj-$(CONFIG_PERCPU_STATS) += percpu-stats.o
 obj-$(CONFIG_HMM) += hmm.o
 obj-$(CONFIG_MEMFD_CREATE) += memfd.o
+obj-$(CONFIG_AS_DIRTY_HELPERS) += as_dirty_helpers.o
diff --git a/mm/as_dirty_helpers.c b/mm/as_dirty_helpers.c
new file mode 100644
index 000000000000..f600e31534fb
--- /dev/null
+++ b/mm/as_dirty_helpers.c
@@ -0,0 +1,300 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/mm.h>
+#include <linux/mm_types.h>
+#include <linux/hugetlb.h>
+#include <linux/bitops.h>
+#include <linux/mmu_notifier.h>
+#include <asm/cacheflush.h>
+#include <asm/tlbflush.h>
+
+/**
+ * struct apply_as - Closure structure for apply_as_range
+ * @base: struct pfn_range_apply we derive from
+ * @start: Address of first modified pte
+ * @end: Address of last modified pte + 1
+ * @total: Total number of modified ptes
+ * @vma: Pointer to the struct vm_area_struct we're currently operating on
+ */
+struct apply_as {
+	struct pfn_range_apply base;
+	unsigned long start;
+	unsigned long end;
+	unsigned long total;
+	struct vm_area_struct *vma;
+};
+
+/**
+ * apply_pt_wrprotect - Leaf pte callback to write-protect a pte
+ * @pte: Pointer to the pte
+ * @token: Page table token, see apply_to_pfn_range()
+ * @addr: The virtual page address
+ * @closure: Pointer to a struct pfn_range_apply embedded in a
+ * struct apply_as
+ *
+ * The function write-protects a pte and records the range in
+ * virtual address space of touched ptes for efficient range TLB flushes.
+ *
+ * Return: Always zero.
+ */
+static int apply_pt_wrprotect(pte_t *pte, pgtable_t token,
+			      unsigned long addr,
+			      struct pfn_range_apply *closure)
+{
+	struct apply_as *aas = container_of(closure, typeof(*aas), base);
+	pte_t ptent = *pte;
+
+	if (pte_write(ptent)) {
+		pte_t old_pte = ptep_modify_prot_start(aas->vma, addr, pte);
+
+		ptent = pte_wrprotect(old_pte);
+		ptep_modify_prot_commit(aas->vma, addr, pte, old_pte, ptent);
+		aas->total++;
+		aas->start = min(aas->start, addr);
+		aas->end = max(aas->end, addr + PAGE_SIZE);
+	}
+
+	return 0;
+}
+
+/**
+ * struct apply_as_clean - Closure structure for apply_as_clean
+ * @base: struct apply_as we derive from
+ * @bitmap_pgoff: Address_space Page offset of the first bit in @bitmap
+ * @bitmap: Bitmap with one bit for each page offset in the address_space range
+ * covered.
+ * @start: Address_space page offset of first modified pte relative
+ * to @bitmap_pgoff
+ * @end: Address_space page offset of last modified pte relative
+ * to @bitmap_pgoff, plus one
+ */
+struct apply_as_clean {
+	struct apply_as base;
+	pgoff_t bitmap_pgoff;
+	unsigned long *bitmap;
+	pgoff_t start;
+	pgoff_t end;
+};
+
+/**
+ * apply_pt_clean - Leaf pte callback to clean a pte
+ * @pte: Pointer to the pte
+ * @token: Page table token, see apply_to_pfn_range()
+ * @addr: The virtual page address
+ * @closure: Pointer to a struct pfn_range_apply embedded in a
+ * struct apply_as_clean
+ *
+ * The function cleans a pte and records the range in
+ * virtual address space of touched ptes for efficient TLB flushes.
+ * It also records dirty ptes in a bitmap representing page offsets
+ * in the address_space, as well as the first and last of the bits
+ * touched.
+ *
+ * Return: Always zero.
+ */
+static int apply_pt_clean(pte_t *pte, pgtable_t token,
+			  unsigned long addr,
+			  struct pfn_range_apply *closure)
+{
+	struct apply_as *aas = container_of(closure, typeof(*aas), base);
+	struct apply_as_clean *clean = container_of(aas, typeof(*clean), base);
+	pte_t ptent = *pte;
+
+	if (pte_dirty(ptent)) {
+		pgoff_t pgoff = ((addr - aas->vma->vm_start) >> PAGE_SHIFT) +
+			aas->vma->vm_pgoff - clean->bitmap_pgoff;
+		pte_t old_pte = ptep_modify_prot_start(aas->vma, addr, pte);
+
+		ptent = pte_mkclean(old_pte);
+		ptep_modify_prot_commit(aas->vma, addr, pte, old_pte, ptent);
+
+		aas->total++;
+		aas->start = min(aas->start, addr);
+		aas->end = max(aas->end, addr + PAGE_SIZE);
+
+		__set_bit(pgoff, clean->bitmap);
+		clean->start = min(clean->start, pgoff);
+		clean->end = max(clean->end, pgoff + 1);
+	}
+
+	return 0;
+}
+
+/**
+ * apply_as_range - Apply a pte callback to all PTEs pointing into a range
+ * of an address_space.
+ * @mapping: Pointer to the struct address_space
+ * @aas: Closure structure
+ * @first_index: First page offset in the address_space
+ * @nr: Number of incremental page offsets to cover
+ *
+ * Return: Number of ptes touched. Note that this number might be larger
+ * than @nr if there are overlapping vmas
+ */
+static unsigned long apply_as_range(struct address_space *mapping,
+				    struct apply_as *aas,
+				    pgoff_t first_index, pgoff_t nr)
+{
+	struct vm_area_struct *vma;
+	pgoff_t vba, vea, cba, cea;
+	unsigned long start_addr, end_addr;
+	struct mmu_notifier_range range;
+
+	i_mmap_lock_read(mapping);
+	vma_interval_tree_foreach(vma, &mapping->i_mmap, first_index,
+				  first_index + nr - 1) {
+		unsigned long vm_flags = READ_ONCE(vma->vm_flags);
+
+		/*
+		 * We can only do advisory flag tests below, since we can't
+		 * require the vm's mmap_sem to be held to protect the flags.
+		 * Therefore, callers that strictly depend on specific mmap
+		 * flags to remain constant throughout the operation must
+		 * either ensure those flags are immutable for all relevant
+		 * vmas or can't use this function. Fixing this properly would
+		 * require the vma::vm_flags to be protected by a separate
+		 * lock taken after the i_mmap_lock
+		 */
+
+		/* Skip non-applicable VMAs */
+		if ((vm_flags & (VM_SHARED | VM_WRITE)) !=
+		    (VM_SHARED | VM_WRITE))
+			continue;
+
+		/* Warn on and skip VMAs whose flags indicate illegal usage */
+		if (WARN_ON((vm_flags & (VM_HUGETLB | VM_IO)) != VM_IO))
+			continue;
+
+		/* Clip to the vma */
+		vba = vma->vm_pgoff;
+		vea = vba + vma_pages(vma);
+		cba = first_index;
+		cba = max(cba, vba);
+		cea = first_index + nr;
+		cea = min(cea, vea);
+
+		/* Translate to virtual address */
+		start_addr = ((cba - vba) << PAGE_SHIFT) + vma->vm_start;
+		end_addr = ((cea - vba) << PAGE_SHIFT) + vma->vm_start;
+		if (start_addr >= end_addr)
+			continue;
+
+		aas->base.mm = vma->vm_mm;
+		aas->vma = vma;
+		aas->start = end_addr;
+		aas->end = start_addr;
+
+		mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_PAGE, 0,
+					vma, vma->vm_mm, start_addr, end_addr);
+		mmu_notifier_invalidate_range_start(&range);
+
+		/* Needed when we only change protection? */
+		flush_cache_range(vma, start_addr, end_addr);
+
+		/*
+		 * We're not using tlb_gather_mmu() since typically
+		 * only a small subrange of PTEs are affected.
+		 */
+		inc_tlb_flush_pending(vma->vm_mm);
+
+		/* Should not error since aas->base.alloc == 0 */
+		WARN_ON(apply_to_pfn_range(&aas->base, start_addr,
+					   end_addr - start_addr));
+		if (aas->end > aas->start)
+			flush_tlb_range(vma, aas->start, aas->end);
+
+		mmu_notifier_invalidate_range_end(&range);
+		dec_tlb_flush_pending(vma->vm_mm);
+	}
+	i_mmap_unlock_read(mapping);
+
+	return aas->total;
+}
+
+/**
+ * apply_as_wrprotect - Write-protect all ptes in an address_space range
+ * @mapping: The address_space we want to write protect
+ * @first_index: The first page offset in the range
+ * @nr: Number of incremental page offsets to cover
+ *
+ * WARNING: This function should only be used for address spaces whose
+ * vmas are marked VM_IO and that do not contain huge pages.
+ * To avoid interference with COW'd pages, vmas not marked VM_SHARED are
+ * simply skipped.
+ *
+ * Return: The number of ptes actually write-protected. Note that
+ * already write-protected ptes are not counted.
+ */
+unsigned long apply_as_wrprotect(struct address_space *mapping,
+				 pgoff_t first_index, pgoff_t nr)
+{
+	struct apply_as aas = {
+		.base = {
+			.alloc = 0,
+			.ptefn = apply_pt_wrprotect,
+		},
+		.total = 0,
+	};
+
+	return apply_as_range(mapping, &aas, first_index, nr);
+}
+EXPORT_SYMBOL_GPL(apply_as_wrprotect);
+
+/**
+ * apply_as_clean - Clean all ptes in an address_space range
+ * @mapping: The address_space we want to clean
+ * @first_index: The first page offset in the range
+ * @nr: Number of incremental page offsets to cover
+ * @bitmap_pgoff: The page offset of the first bit in @bitmap
+ * @bitmap: Pointer to a bitmap of at least @nr bits. The bitmap needs to
+ * cover the whole range @first_index..@first_index + @nr.
+ * @start: Pointer to the number of the first set bit in @bitmap.
+ * The value is modified as new bits are set by the function.
+ * @end: Pointer to the number of the last set bit in @bitmap, plus one;
+ * <= *@start if none set. The value is modified as new bits are set.
+ *
+ * Note: When this function returns there is no guarantee that a CPU has
+ * not already dirtied new ptes. However it will not clean any ptes not
+ * reported in the bitmap.
+ *
+ * If a caller needs to make sure all dirty ptes are picked up and none
+ * additional are added, it first needs to write-protect the address-space
+ * range and make sure new writers are blocked in page_mkwrite() or
+ * pfn_mkwrite(). And then after a TLB flush following the write-protection
+ * pick up all dirty bits.
+ *
+ * WARNING: This function should only be used for address spaces whose
+ * vmas are marked VM_IO and that do not contain huge pages.
+ * To avoid interference with COW'd pages, vmas not marked VM_SHARED are
+ * simply skipped.
+ *
+ * Return: The number of dirty ptes actually cleaned.
+ */
+unsigned long apply_as_clean(struct address_space *mapping,
+			     pgoff_t first_index, pgoff_t nr,
+			     pgoff_t bitmap_pgoff,
+			     unsigned long *bitmap,
+			     pgoff_t *start,
+			     pgoff_t *end)
+{
+	bool none_set = (*start >= *end);
+	struct apply_as_clean clean = {
+		.base = {
+			.base = {
+				.alloc = 0,
+				.ptefn = apply_pt_clean,
+			},
+			.total = 0,
+		},
+		.bitmap_pgoff = bitmap_pgoff,
+		.bitmap = bitmap,
+		.start = none_set ? nr : *start,
+		.end = none_set ? 0 : *end,
+	};
+	unsigned long ret = apply_as_range(mapping, &clean.base, first_index,
+					   nr);
+
+	*start = clean.start;
+	*end = clean.end;
+	return ret;
+}
+EXPORT_SYMBOL_GPL(apply_as_clean);
-- 
cgit v1.2.3


From 378a60406415bd20ec6e845a3d6883d460656537 Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg@mellanox.com>
Date: Thu, 23 May 2019 11:17:22 -0300
Subject: mm/hmm: Remove duplicate condition test before wait_event_timeout

The wait_event_timeout macro already tests the condition as its first
action, so there is no reason to open code another version of this, all
that does is skip the might_sleep() debugging in common cases, which is
not helpful.

Further, based on prior patches, we can now simplify the required condition
test:
 - If range is valid memory then so is range->hmm
 - If hmm_release() has run then range->valid is set to false
   at the same time as dead, so no reason to check both.
 - A valid hmm has a valid hmm->mm.

This allows the return value of wait_event_timeout() (along with its
internal barriers) to compute the result of the function.

Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
Reviewed-by: Ralph Campbell <rcampbell@nvidia.com>
Reviewed-by: John Hubbard <jhubbard@nvidia.com>
Reviewed-by: Ira Weiny <ira.weiny@intel.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Tested-by: Philip Yang <Philip.Yang@amd.com>
---
 include/linux/hmm.h | 13 ++-----------
 1 file changed, 2 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/hmm.h b/include/linux/hmm.h
index 1d97b6d62c5b..26e7c477490c 100644
--- a/include/linux/hmm.h
+++ b/include/linux/hmm.h
@@ -209,17 +209,8 @@ static inline unsigned long hmm_range_page_size(const struct hmm_range *range)
 static inline bool hmm_range_wait_until_valid(struct hmm_range *range,
 					      unsigned long timeout)
 {
-	/* Check if mm is dead ? */
-	if (range->hmm == NULL || range->hmm->dead || range->hmm->mm == NULL) {
-		range->valid = false;
-		return false;
-	}
-	if (range->valid)
-		return true;
-	wait_event_timeout(range->hmm->wq, range->valid || range->hmm->dead,
-			   msecs_to_jiffies(timeout));
-	/* Return current valid status just in case we get lucky */
-	return range->valid;
+	return wait_event_timeout(range->hmm->wq, range->valid,
+				  msecs_to_jiffies(timeout)) != 0;
 }
 
 /*
-- 
cgit v1.2.3


From 47f245985a4f3e270b1e4f28aa49f4c939527981 Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg@mellanox.com>
Date: Thu, 23 May 2019 11:08:28 -0300
Subject: mm/hmm: Hold on to the mmget for the lifetime of the range

Range functions like hmm_range_snapshot() and hmm_range_fault() call
find_vma, which requires holding the mmget() and the mmap_sem for the mm.

Make this simpler for the callers by holding the mmget() inside the range
for the lifetime of the range. Other functions that accept a range should
only be called if the range is registered.

This has the side effect of directly preventing hmm_release() from
happening while a range is registered. That means hmm->dead cannot be
true during the lifetime of the range, so remove dead and
hmm_mirror_mm_is_alive() entirely.

Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
Reviewed-by: John Hubbard <jhubbard@nvidia.com>
Reviewed-by: Ralph Campbell <rcampbell@nvidia.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Tested-by: Philip Yang <Philip.Yang@amd.com>
---
 include/linux/hmm.h | 26 --------------------------
 mm/hmm.c            | 32 +++++++++++---------------------
 2 files changed, 11 insertions(+), 47 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/hmm.h b/include/linux/hmm.h
index 26e7c477490c..bf013e965257 100644
--- a/include/linux/hmm.h
+++ b/include/linux/hmm.h
@@ -82,7 +82,6 @@
  * @mirrors_sem: read/write semaphore protecting the mirrors list
  * @wq: wait queue for user waiting on a range invalidation
  * @notifiers: count of active mmu notifiers
- * @dead: is the mm dead ?
  */
 struct hmm {
 	struct mm_struct	*mm;
@@ -95,7 +94,6 @@ struct hmm {
 	wait_queue_head_t	wq;
 	struct rcu_head		rcu;
 	long			notifiers;
-	bool			dead;
 };
 
 /*
@@ -459,30 +457,6 @@ struct hmm_mirror {
 int hmm_mirror_register(struct hmm_mirror *mirror, struct mm_struct *mm);
 void hmm_mirror_unregister(struct hmm_mirror *mirror);
 
-/*
- * hmm_mirror_mm_is_alive() - test if mm is still alive
- * @mirror: the HMM mm mirror for which we want to lock the mmap_sem
- * Return: false if the mm is dead, true otherwise
- *
- * This is an optimization, it will not always accurately return false if the
- * mm is dead; i.e., there can be false negatives (process is being killed but
- * HMM is not yet informed of that). It is only intended to be used to optimize
- * out cases where the driver is about to do something time consuming and it
- * would be better to skip it if the mm is dead.
- */
-static inline bool hmm_mirror_mm_is_alive(struct hmm_mirror *mirror)
-{
-	struct mm_struct *mm;
-
-	if (!mirror || !mirror->hmm)
-		return false;
-	mm = READ_ONCE(mirror->hmm->mm);
-	if (mirror->hmm->dead || !mm)
-		return false;
-
-	return true;
-}
-
 /*
  * Please see Documentation/vm/hmm.rst for how to use the range API.
  */
diff --git a/mm/hmm.c b/mm/hmm.c
index 73c8af4827fe..1eddda45cefa 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -67,7 +67,6 @@ static struct hmm *hmm_get_or_create(struct mm_struct *mm)
 	mutex_init(&hmm->lock);
 	kref_init(&hmm->kref);
 	hmm->notifiers = 0;
-	hmm->dead = false;
 	hmm->mm = mm;
 
 	hmm->mmu_notifier.ops = &hmm_mmu_notifier_ops;
@@ -120,21 +119,16 @@ static void hmm_release(struct mmu_notifier *mn, struct mm_struct *mm)
 {
 	struct hmm *hmm = container_of(mn, struct hmm, mmu_notifier);
 	struct hmm_mirror *mirror;
-	struct hmm_range *range;
 
 	/* Bail out if hmm is in the process of being freed */
 	if (!kref_get_unless_zero(&hmm->kref))
 		return;
 
-	/* Report this HMM as dying. */
-	hmm->dead = true;
-
-	/* Wake-up everyone waiting on any range. */
-	mutex_lock(&hmm->lock);
-	list_for_each_entry(range, &hmm->ranges, list)
-		range->valid = false;
-	wake_up_all(&hmm->wq);
-	mutex_unlock(&hmm->lock);
+	/*
+	 * Since hmm_range_register() holds the mmget() lock hmm_release() is
+	 * prevented as long as a range exists.
+	 */
+	WARN_ON(!list_empty_careful(&hmm->ranges));
 
 	down_write(&hmm->mirrors_sem);
 	mirror = list_first_entry_or_null(&hmm->mirrors, struct hmm_mirror,
@@ -903,8 +897,8 @@ int hmm_range_register(struct hmm_range *range,
 	range->start = start;
 	range->end = end;
 
-	/* Check if hmm_mm_destroy() was call. */
-	if (hmm->mm == NULL || hmm->dead)
+	/* Prevent hmm_release() from running while the range is valid */
+	if (!mmget_not_zero(hmm->mm))
 		return -EFAULT;
 
 	/* Initialize range to track CPU page table updates. */
@@ -942,11 +936,12 @@ void hmm_range_unregister(struct hmm_range *range)
 		return;
 
 	mutex_lock(&hmm->lock);
-	list_del(&range->list);
+	list_del_init(&range->list);
 	mutex_unlock(&hmm->lock);
 
 	/* Drop reference taken by hmm_range_register() */
 	range->valid = false;
+	mmput(hmm->mm);
 	hmm_put(hmm);
 	range->hmm = NULL;
 }
@@ -974,10 +969,7 @@ long hmm_range_snapshot(struct hmm_range *range)
 	struct vm_area_struct *vma;
 	struct mm_walk mm_walk;
 
-	/* Check if hmm_mm_destroy() was call. */
-	if (hmm->mm == NULL || hmm->dead)
-		return -EFAULT;
-
+	lockdep_assert_held(&hmm->mm->mmap_sem);
 	do {
 		/* If range is no longer valid force retry. */
 		if (!range->valid)
@@ -1072,9 +1064,7 @@ long hmm_range_fault(struct hmm_range *range, bool block)
 	struct mm_walk mm_walk;
 	int ret;
 
-	/* Check if hmm_mm_destroy() was call. */
-	if (hmm->mm == NULL || hmm->dead)
-		return -EFAULT;
+	lockdep_assert_held(&hmm->mm->mmap_sem);
 
 	do {
 		/* If range is no longer valid force retry. */
-- 
cgit v1.2.3


From 90ec7a76cc4ba65bfedeb8621cba09cd5a317d8f Mon Sep 17 00:00:00 2001
From: Vivek Gautam <vivek.gautam@codeaurora.org>
Date: Thu, 16 May 2019 15:00:20 +0530
Subject: iommu/io-pgtable-arm: Add support to use system cache

A few Qualcomm platforms, such as sdm845, have an additional outer
cache called the system cache, a.k.a. last level cache (LLC), that
allows non-coherent devices to upgrade to using caching.
This cache sits right before the DDR, and is tightly coupled
with the memory controller. The clients using this cache request
their slices from this system cache, make it active, and can then
start using it.

There is a fundamental assumption that non-coherent devices can't
access caches. This change adds an exception where they *can* use
some level of cache despite still being non-coherent overall.
The coherent devices that use cacheable memory, and CPU make use of
this system cache by default.

Looking at memory types, we have following -
a) Normal uncached :- MAIR 0x44, inner non-cacheable,
                      outer non-cacheable;
b) Normal cached :-   MAIR 0xff, inner read write-back non-transient,
                      outer read write-back non-transient;
                      attribute setting for coherent I/O devices.
and, for non-coherent i/o devices that can allocate in system cache
another type gets added -
c) Normal sys-cached :- MAIR 0xf4, inner non-cacheable,
                        outer read write-back non-transient

Coherent I/O devices use system cache by marking the memory as
normal cached.
Non-coherent I/O devices should mark the memory as normal
sys-cached in page tables to use system cache.

Acked-by: Robin Murphy <robin.murphy@arm.com>
Signed-off-by: Vivek Gautam <vivek.gautam@codeaurora.org>
Signed-off-by: Will Deacon <will.deacon@arm.com>
---
 drivers/iommu/io-pgtable-arm.c | 9 ++++++++-
 include/linux/iommu.h          | 6 ++++++
 2 files changed, 14 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-arm.c
index 4e21efbc4459..2454ac11aa97 100644
--- a/drivers/iommu/io-pgtable-arm.c
+++ b/drivers/iommu/io-pgtable-arm.c
@@ -167,10 +167,12 @@
 #define ARM_LPAE_MAIR_ATTR_MASK		0xff
 #define ARM_LPAE_MAIR_ATTR_DEVICE	0x04
 #define ARM_LPAE_MAIR_ATTR_NC		0x44
+#define ARM_LPAE_MAIR_ATTR_INC_OWBRWA	0xf4
 #define ARM_LPAE_MAIR_ATTR_WBRWA	0xff
 #define ARM_LPAE_MAIR_ATTR_IDX_NC	0
 #define ARM_LPAE_MAIR_ATTR_IDX_CACHE	1
 #define ARM_LPAE_MAIR_ATTR_IDX_DEV	2
+#define ARM_LPAE_MAIR_ATTR_IDX_INC_OCACHE	3
 
 #define ARM_MALI_LPAE_TTBR_ADRMODE_TABLE (3u << 0)
 #define ARM_MALI_LPAE_TTBR_READ_INNER	BIT(2)
@@ -470,6 +472,9 @@ static arm_lpae_iopte arm_lpae_prot_to_pte(struct arm_lpae_io_pgtable *data,
 		else if (prot & IOMMU_CACHE)
 			pte |= (ARM_LPAE_MAIR_ATTR_IDX_CACHE
 				<< ARM_LPAE_PTE_ATTRINDX_SHIFT);
+		else if (prot & IOMMU_QCOM_SYS_CACHE)
+			pte |= (ARM_LPAE_MAIR_ATTR_IDX_INC_OCACHE
+				<< ARM_LPAE_PTE_ATTRINDX_SHIFT);
 	}
 
 	if (prot & IOMMU_NOEXEC)
@@ -857,7 +862,9 @@ arm_64_lpae_alloc_pgtable_s1(struct io_pgtable_cfg *cfg, void *cookie)
 	      (ARM_LPAE_MAIR_ATTR_WBRWA
 	       << ARM_LPAE_MAIR_ATTR_SHIFT(ARM_LPAE_MAIR_ATTR_IDX_CACHE)) |
 	      (ARM_LPAE_MAIR_ATTR_DEVICE
-	       << ARM_LPAE_MAIR_ATTR_SHIFT(ARM_LPAE_MAIR_ATTR_IDX_DEV));
+	       << ARM_LPAE_MAIR_ATTR_SHIFT(ARM_LPAE_MAIR_ATTR_IDX_DEV)) |
+	      (ARM_LPAE_MAIR_ATTR_INC_OWBRWA
+	       << ARM_LPAE_MAIR_ATTR_SHIFT(ARM_LPAE_MAIR_ATTR_IDX_INC_OCACHE));
 
 	cfg->arm_lpae_s1_cfg.mair[0] = reg;
 	cfg->arm_lpae_s1_cfg.mair[1] = 0;
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index a815cf6f6f47..8ee3fbaf5855 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -41,6 +41,12 @@
  * if the IOMMU page table format is equivalent.
  */
 #define IOMMU_PRIV	(1 << 5)
+/*
+ * Non-coherent masters on a few Qualcomm SoCs can use this page protection flag
+ * to set correct cacheability attributes to use an outer level of cache -
+ * last level cache, aka system cache.
+ */
+#define IOMMU_QCOM_SYS_CACHE	(1 << 6)
 
 struct iommu_ops;
 struct iommu_group;
-- 
cgit v1.2.3


From 2589726d12a1b12eaaa93c7f1ea64287e383c7a5 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@kernel.org>
Date: Sat, 15 Jun 2019 12:12:20 -0700
Subject: bpf: introduce bounded loops

Allow the verifier to validate the loops by simulating their execution.
Existing programs have used '#pragma unroll' to unroll the loops
by the compiler. Instead let the verifier simulate all iterations
of the loop.
In order to do that introduce parentage chain of bpf_verifier_state and
'branches' counter for the number of branches left to explore.
See more detailed algorithm description in bpf_verifier.h

This algorithm borrows the key idea from Edward Cree's approach:
https://patchwork.ozlabs.org/patch/877222/
Additional state pruning heuristics make such brute force loop walk
practical even for large loops.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Andrii Nakryiko <andriin@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/linux/bpf_verifier.h |  51 ++++++++++++++-
 kernel/bpf/verifier.c        | 143 +++++++++++++++++++++++++++++++++++++++----
 2 files changed, 181 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 704ed7971472..03037373b447 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -194,6 +194,53 @@ struct bpf_func_state {
 struct bpf_verifier_state {
 	/* call stack tracking */
 	struct bpf_func_state *frame[MAX_CALL_FRAMES];
+	struct bpf_verifier_state *parent;
+	/*
+	 * 'branches' field is the number of branches left to explore:
+	 * 0 - all possible paths from this state reached bpf_exit or
+	 * were safely pruned
+	 * 1 - at least one path is being explored.
+	 * This state hasn't reached bpf_exit
+	 * 2 - at least two paths are being explored.
+	 * This state is an immediate parent of two children.
+	 * One is fallthrough branch with branches==1 and another
+	 * state is pushed into stack (to be explored later) also with
+	 * branches==1. The parent of this state has branches==1.
+	 * The verifier state tree connected via 'parent' pointer looks like:
+	 * 1
+	 * 1
+	 * 2 -> 1 (first 'if' pushed into stack)
+	 * 1
+	 * 2 -> 1 (second 'if' pushed into stack)
+	 * 1
+	 * 1
+	 * 1 bpf_exit.
+	 *
+	 * Once do_check() reaches bpf_exit, it calls update_branch_counts()
+	 * and the verifier state tree will look:
+	 * 1
+	 * 1
+	 * 2 -> 1 (first 'if' pushed into stack)
+	 * 1
+	 * 1 -> 1 (second 'if' pushed into stack)
+	 * 0
+	 * 0
+	 * 0 bpf_exit.
+	 * After pop_stack() the do_check() will resume at second 'if'.
+	 *
+	 * If is_state_visited() sees a state with branches > 0 it means
+	 * there is a loop. If such state is exactly equal to the current state
+	 * it's an infinite loop. Note states_equal() checks for states
+	 * equivalency, so two states being 'states_equal' does not mean
+	 * infinite loop. The exact comparison is provided by
+	 * states_maybe_looping() function. It's a stronger pre-check and
+	 * much faster than states_equal().
+	 *
+	 * This algorithm may not find all possible infinite loops or
+	 * loop iteration count may be too high.
+	 * In such cases BPF_COMPLEXITY_LIMIT_INSNS limit kicks in.
+	 */
+	u32 branches;
 	u32 insn_idx;
 	u32 curframe;
 	u32 active_spin_lock;
@@ -312,7 +359,9 @@ struct bpf_verifier_env {
 	} cfg;
 	u32 subprog_cnt;
 	/* number of instructions analyzed by the verifier */
-	u32 insn_processed;
+	u32 prev_insn_processed, insn_processed;
+	/* number of jmps, calls, exits analyzed so far */
+	u32 prev_jmps_processed, jmps_processed;
 	/* total verification time */
 	u64 verification_time;
 	/* maximum number of verifier states kept in 'branching' instructions */
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 8d3a4ef1d969..25baa3c8cdd2 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -721,6 +721,8 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state,
 	dst_state->speculative = src->speculative;
 	dst_state->curframe = src->curframe;
 	dst_state->active_spin_lock = src->active_spin_lock;
+	dst_state->branches = src->branches;
+	dst_state->parent = src->parent;
 	for (i = 0; i <= src->curframe; i++) {
 		dst = dst_state->frame[i];
 		if (!dst) {
@@ -736,6 +738,23 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state,
 	return 0;
 }
 
+static void update_branch_counts(struct bpf_verifier_env *env, struct bpf_verifier_state *st)
+{
+	while (st) {
+		u32 br = --st->branches;
+
+		/* WARN_ON(br > 1) technically makes sense here,
+		 * but see comment in push_stack(), hence:
+		 */
+		WARN_ONCE((int)br < 0,
+			  "BUG update_branch_counts:branches_to_explore=%d\n",
+			  br);
+		if (br)
+			break;
+		st = st->parent;
+	}
+}
+
 static int pop_stack(struct bpf_verifier_env *env, int *prev_insn_idx,
 		     int *insn_idx)
 {
@@ -789,6 +808,18 @@ static struct bpf_verifier_state *push_stack(struct bpf_verifier_env *env,
 			env->stack_size);
 		goto err;
 	}
+	if (elem->st.parent) {
+		++elem->st.parent->branches;
+		/* WARN_ON(branches > 2) technically makes sense here,
+		 * but
+		 * 1. speculative states will bump 'branches' for non-branch
+		 * instructions
+		 * 2. is_state_visited() heuristics may decide not to create
+		 * a new state for a sequence of branches and all such current
+		 * and cloned states will be pointing to a single parent state
+		 * which might have large 'branches' count.
+		 */
+	}
 	return &elem->st;
 err:
 	free_verifier_state(env->cur_state, true);
@@ -5682,7 +5713,8 @@ static void init_explored_state(struct bpf_verifier_env *env, int idx)
  * w - next instruction
  * e - edge
  */
-static int push_insn(int t, int w, int e, struct bpf_verifier_env *env)
+static int push_insn(int t, int w, int e, struct bpf_verifier_env *env,
+		     bool loop_ok)
 {
 	int *insn_stack = env->cfg.insn_stack;
 	int *insn_state = env->cfg.insn_state;
@@ -5712,6 +5744,8 @@ static int push_insn(int t, int w, int e, struct bpf_verifier_env *env)
 		insn_stack[env->cfg.cur_stack++] = w;
 		return 1;
 	} else if ((insn_state[w] & 0xF0) == DISCOVERED) {
+		if (loop_ok && env->allow_ptr_leaks)
+			return 0;
 		verbose_linfo(env, t, "%d: ", t);
 		verbose_linfo(env, w, "%d: ", w);
 		verbose(env, "back-edge from insn %d to %d\n", t, w);
@@ -5763,7 +5797,7 @@ peek_stack:
 		if (opcode == BPF_EXIT) {
 			goto mark_explored;
 		} else if (opcode == BPF_CALL) {
-			ret = push_insn(t, t + 1, FALLTHROUGH, env);
+			ret = push_insn(t, t + 1, FALLTHROUGH, env, false);
 			if (ret == 1)
 				goto peek_stack;
 			else if (ret < 0)
@@ -5772,7 +5806,8 @@ peek_stack:
 				init_explored_state(env, t + 1);
 			if (insns[t].src_reg == BPF_PSEUDO_CALL) {
 				init_explored_state(env, t);
-				ret = push_insn(t, t + insns[t].imm + 1, BRANCH, env);
+				ret = push_insn(t, t + insns[t].imm + 1, BRANCH,
+						env, false);
 				if (ret == 1)
 					goto peek_stack;
 				else if (ret < 0)
@@ -5785,7 +5820,7 @@ peek_stack:
 			}
 			/* unconditional jump with single edge */
 			ret = push_insn(t, t + insns[t].off + 1,
-					FALLTHROUGH, env);
+					FALLTHROUGH, env, true);
 			if (ret == 1)
 				goto peek_stack;
 			else if (ret < 0)
@@ -5798,13 +5833,13 @@ peek_stack:
 		} else {
 			/* conditional jump with two edges */
 			init_explored_state(env, t);
-			ret = push_insn(t, t + 1, FALLTHROUGH, env);
+			ret = push_insn(t, t + 1, FALLTHROUGH, env, true);
 			if (ret == 1)
 				goto peek_stack;
 			else if (ret < 0)
 				goto err_free;
 
-			ret = push_insn(t, t + insns[t].off + 1, BRANCH, env);
+			ret = push_insn(t, t + insns[t].off + 1, BRANCH, env, true);
 			if (ret == 1)
 				goto peek_stack;
 			else if (ret < 0)
@@ -5814,7 +5849,7 @@ peek_stack:
 		/* all other non-branch instructions with single
 		 * fall-through edge
 		 */
-		ret = push_insn(t, t + 1, FALLTHROUGH, env);
+		ret = push_insn(t, t + 1, FALLTHROUGH, env, false);
 		if (ret == 1)
 			goto peek_stack;
 		else if (ret < 0)
@@ -6247,6 +6282,8 @@ static void clean_live_states(struct bpf_verifier_env *env, int insn,
 
 	sl = *explored_state(env, insn);
 	while (sl) {
+		if (sl->state.branches)
+			goto next;
 		if (sl->state.insn_idx != insn ||
 		    sl->state.curframe != cur->curframe)
 			goto next;
@@ -6611,12 +6648,32 @@ static int propagate_liveness(struct bpf_verifier_env *env,
 	return 0;
 }
 
+static bool states_maybe_looping(struct bpf_verifier_state *old,
+				 struct bpf_verifier_state *cur)
+{
+	struct bpf_func_state *fold, *fcur;
+	int i, fr = cur->curframe;
+
+	if (old->curframe != fr)
+		return false;
+
+	fold = old->frame[fr];
+	fcur = cur->frame[fr];
+	for (i = 0; i < MAX_BPF_REG; i++)
+		if (memcmp(&fold->regs[i], &fcur->regs[i],
+			   offsetof(struct bpf_reg_state, parent)))
+			return false;
+	return true;
+}
+
+
 static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
 {
 	struct bpf_verifier_state_list *new_sl;
 	struct bpf_verifier_state_list *sl, **pprev;
 	struct bpf_verifier_state *cur = env->cur_state, *new;
 	int i, j, err, states_cnt = 0;
+	bool add_new_state = false;
 
 	if (!env->insn_aux_data[insn_idx].prune_point)
 		/* this 'insn_idx' instruction wasn't marked, so we will not
@@ -6624,6 +6681,18 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
 		 */
 		return 0;
 
+	/* bpf progs typically have pruning point every 4 instructions
+	 * http://vger.kernel.org/bpfconf2019.html#session-1
+	 * Do not add new state for future pruning if the verifier hasn't seen
+	 * at least 2 jumps and at least 8 instructions.
+	 * This heuristic helps decrease 'total_states' and 'peak_states' metrics.
+	 * In tests that amounts to up to 50% reduction in total verifier
+	 * memory consumption and 20% verifier time speedup.
+	 */
+	if (env->jmps_processed - env->prev_jmps_processed >= 2 &&
+	    env->insn_processed - env->prev_insn_processed >= 8)
+		add_new_state = true;
+
 	pprev = explored_state(env, insn_idx);
 	sl = *pprev;
 
@@ -6633,6 +6702,30 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
 		states_cnt++;
 		if (sl->state.insn_idx != insn_idx)
 			goto next;
+		if (sl->state.branches) {
+			if (states_maybe_looping(&sl->state, cur) &&
+			    states_equal(env, &sl->state, cur)) {
+				verbose_linfo(env, insn_idx, "; ");
+				verbose(env, "infinite loop detected at insn %d\n", insn_idx);
+				return -EINVAL;
+			}
+			/* if the verifier is processing a loop, avoid adding new state
+			 * too often, since different loop iterations have distinct
+			 * states and may not help future pruning.
+			 * This threshold shouldn't be too low to make sure that
+			 * a loop with large bound will be rejected quickly.
+			 * The most abusive loop will be:
+			 * r1 += 1
+			 * if r1 < 1000000 goto pc-2
+			 * 1M insn_processed limit / 100 == 10k peak states.
+			 * This threshold shouldn't be too high either, since states
+			 * at the end of the loop are likely to be useful in pruning.
+			 */
+			if (env->jmps_processed - env->prev_jmps_processed < 20 &&
+			    env->insn_processed - env->prev_insn_processed < 100)
+				add_new_state = false;
+			goto miss;
+		}
 		if (states_equal(env, &sl->state, cur)) {
 			sl->hit_cnt++;
 			/* reached equivalent register/stack state,
@@ -6650,7 +6743,15 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
 				return err;
 			return 1;
 		}
-		sl->miss_cnt++;
+miss:
+		/* when new state is not going to be added do not increase miss count.
+		 * Otherwise several loop iterations will remove the state
+		 * recorded earlier. The goal of these heuristics is to have
+		 * states from some iterations of the loop (some in the beginning
+		 * and some at the end) to help pruning.
+		 */
+		if (add_new_state)
+			sl->miss_cnt++;
 		/* heuristic to determine whether this state is beneficial
 		 * to keep checking from state equivalence point of view.
 		 * Higher numbers increase max_states_per_insn and verification time,
@@ -6662,6 +6763,11 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
 			 */
 			*pprev = sl->next;
 			if (sl->state.frame[0]->regs[0].live & REG_LIVE_DONE) {
+				u32 br = sl->state.branches;
+
+				WARN_ONCE(br,
+					  "BUG live_done but branches_to_explore %d\n",
+					  br);
 				free_verifier_state(&sl->state, false);
 				kfree(sl);
 				env->peak_states--;
@@ -6687,18 +6793,25 @@ next:
 	if (!env->allow_ptr_leaks && states_cnt > BPF_COMPLEXITY_LIMIT_STATES)
 		return 0;
 
-	/* there were no equivalent states, remember current one.
-	 * technically the current state is not proven to be safe yet,
+	if (!add_new_state)
+		return 0;
+
+	/* There were no equivalent states, remember the current one.
+	 * Technically the current state is not proven to be safe yet,
 	 * but it will either reach outer most bpf_exit (which means it's safe)
-	 * or it will be rejected. Since there are no loops, we won't be
+	 * or it will be rejected. When there are no loops the verifier won't be
 	 * seeing this tuple (frame[0].callsite, frame[1].callsite, .. insn_idx)
-	 * again on the way to bpf_exit
+	 * again on the way to bpf_exit.
+	 * When looping the sl->state.branches will be > 0 and this state
+	 * will not be considered for equivalence until branches == 0.
 	 */
 	new_sl = kzalloc(sizeof(struct bpf_verifier_state_list), GFP_KERNEL);
 	if (!new_sl)
 		return -ENOMEM;
 	env->total_states++;
 	env->peak_states++;
+	env->prev_jmps_processed = env->jmps_processed;
+	env->prev_insn_processed = env->insn_processed;
 
 	/* add new state to the head of linked list */
 	new = &new_sl->state;
@@ -6709,6 +6822,9 @@ next:
 		return err;
 	}
 	new->insn_idx = insn_idx;
+	WARN_ONCE(new->branches != 1,
+		  "BUG is_state_visited:branches_to_explore=%d insn %d\n", new->branches, insn_idx);
+	cur->parent = new;
 	new_sl->next = *explored_state(env, insn_idx);
 	*explored_state(env, insn_idx) = new_sl;
 	/* connect new state to parentage chain. Current frame needs all
@@ -6795,6 +6911,7 @@ static int do_check(struct bpf_verifier_env *env)
 		return -ENOMEM;
 	state->curframe = 0;
 	state->speculative = false;
+	state->branches = 1;
 	state->frame[0] = kzalloc(sizeof(struct bpf_func_state), GFP_KERNEL);
 	if (!state->frame[0]) {
 		kfree(state);
@@ -7001,6 +7118,7 @@ static int do_check(struct bpf_verifier_env *env)
 		} else if (class == BPF_JMP || class == BPF_JMP32) {
 			u8 opcode = BPF_OP(insn->code);
 
+			env->jmps_processed++;
 			if (opcode == BPF_CALL) {
 				if (BPF_SRC(insn->code) != BPF_K ||
 				    insn->off != 0 ||
@@ -7086,6 +7204,7 @@ static int do_check(struct bpf_verifier_env *env)
 				if (err)
 					return err;
 process_bpf_exit:
+				update_branch_counts(env, env->cur_state);
 				err = pop_stack(env, &env->prev_insn_idx,
 						&env->insn_idx);
 				if (err < 0) {
-- 
cgit v1.2.3


From b5dc0163d8fd78e64a7e21f309cf932fda34353e Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@kernel.org>
Date: Sat, 15 Jun 2019 12:12:25 -0700
Subject: bpf: precise scalar_value tracking

Introduce precision tracking logic that
helps cilium programs the most:
                  old clang  old clang    new clang  new clang
                          with all patches         with all patches
bpf_lb-DLB_L3.o      1838     2283         1923       1863
bpf_lb-DLB_L4.o      3218     2657         3077       2468
bpf_lb-DUNKNOWN.o    1064     545          1062       544
bpf_lxc-DDROP_ALL.o  26935    23045        166729     22629
bpf_lxc-DUNKNOWN.o   34439    35240        174607     28805
bpf_netdev.o         9721     8753         8407       6801
bpf_overlay.o        6184     7901         5420       4754
bpf_lxc_jit.o        39389    50925        39389      50925

Consider code:
654: (85) call bpf_get_hash_recalc#34
655: (bf) r7 = r0
656: (15) if r8 == 0x0 goto pc+29
657: (bf) r2 = r10
658: (07) r2 += -48
659: (18) r1 = 0xffff8881e41e1b00
661: (85) call bpf_map_lookup_elem#1
662: (15) if r0 == 0x0 goto pc+23
663: (69) r1 = *(u16 *)(r0 +0)
664: (15) if r1 == 0x0 goto pc+21
665: (bf) r8 = r7
666: (57) r8 &= 65535
667: (bf) r2 = r8
668: (3f) r2 /= r1
669: (2f) r2 *= r1
670: (bf) r1 = r8
671: (1f) r1 -= r2
672: (57) r1 &= 255
673: (25) if r1 > 0x1e goto pc+12
 R0=map_value(id=0,off=0,ks=20,vs=64,imm=0) R1_w=inv(id=0,umax_value=30,var_off=(0x0; 0x1f))
674: (67) r1 <<= 1
675: (0f) r0 += r1

At this point the verifier will notice that scalar R1 is used in map pointer adjustment.
R1 has to be precise for later operations on R0 to be validated properly.

The verifier will backtrack the above code in the following way:
last_idx 675 first_idx 664
regs=2 stack=0 before 675: (0f) r0 += r1         // started backtracking R1 regs=2 is a bitmask
regs=2 stack=0 before 674: (67) r1 <<= 1
regs=2 stack=0 before 673: (25) if r1 > 0x1e goto pc+12
regs=2 stack=0 before 672: (57) r1 &= 255
regs=2 stack=0 before 671: (1f) r1 -= r2         // now both R1 and R2 has to be precise -> regs=6 mask
regs=6 stack=0 before 670: (bf) r1 = r8          // after this insn R8 and R2 has to be precise
regs=104 stack=0 before 669: (2f) r2 *= r1       // after this one R8, R2, and R1
regs=106 stack=0 before 668: (3f) r2 /= r1
regs=106 stack=0 before 667: (bf) r2 = r8
regs=102 stack=0 before 666: (57) r8 &= 65535
regs=102 stack=0 before 665: (bf) r8 = r7
regs=82 stack=0 before 664: (15) if r1 == 0x0 goto pc+21
 // this is the end of verifier state. The following regs will be marked precise:
 R1_rw=invP(id=0,umax_value=65535,var_off=(0x0; 0xffff)) R7_rw=invP(id=0)
parent didn't have regs=82 stack=0 marks         // so backtracking continues into parent state
last_idx 663 first_idx 655
regs=82 stack=0 before 663: (69) r1 = *(u16 *)(r0 +0)   // R1 was assigned no need to track it further
regs=80 stack=0 before 662: (15) if r0 == 0x0 goto pc+23    // keep tracking R7
regs=80 stack=0 before 661: (85) call bpf_map_lookup_elem#1  // keep tracking R7
regs=80 stack=0 before 659: (18) r1 = 0xffff8881e41e1b00
regs=80 stack=0 before 658: (07) r2 += -48
regs=80 stack=0 before 657: (bf) r2 = r10
regs=80 stack=0 before 656: (15) if r8 == 0x0 goto pc+29
regs=80 stack=0 before 655: (bf) r7 = r0                // here the assignment into R7
 // mark R0 to be precise:
 R0_rw=invP(id=0)
parent didn't have regs=1 stack=0 marks                 // regs=1 -> tracking R0
last_idx 654 first_idx 644
regs=1 stack=0 before 654: (85) call bpf_get_hash_recalc#34 // and in the parent frame it was a return value
  // nothing further to backtrack

Two scalar registers not marked precise are equivalent from state pruning point of view.
More details in the patch comments.

It doesn't support bpf2bpf calls yet and is enabled for root only.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Andrii Nakryiko <andriin@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/linux/bpf_verifier.h |  18 ++
 kernel/bpf/verifier.c        | 491 ++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 498 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 03037373b447..19393b0964a8 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -139,6 +139,8 @@ struct bpf_reg_state {
 	 */
 	s32 subreg_def;
 	enum bpf_reg_liveness live;
+	/* if (!precise && SCALAR_VALUE) min/max/tnum don't affect safety */
+	bool precise;
 };
 
 enum bpf_stack_slot_type {
@@ -190,6 +192,11 @@ struct bpf_func_state {
 	struct bpf_stack_state *stack;
 };
 
+struct bpf_idx_pair {
+	u32 prev_idx;
+	u32 idx;
+};
+
 #define MAX_CALL_FRAMES 8
 struct bpf_verifier_state {
 	/* call stack tracking */
@@ -245,6 +252,17 @@ struct bpf_verifier_state {
 	u32 curframe;
 	u32 active_spin_lock;
 	bool speculative;
+
+	/* first and last insn idx of this verifier state */
+	u32 first_insn_idx;
+	u32 last_insn_idx;
+	/* jmp history recorded from first to last.
+	 * backtracking is using it to go from last to first.
+	 * For most states jmp_history_cnt is [0-3].
+	 * For loops can go up to ~40.
+	 */
+	struct bpf_idx_pair *jmp_history;
+	u32 jmp_history_cnt;
 };
 
 #define bpf_get_spilled_reg(slot, frame)				\
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 870c8f19ce80..709ce4cef8ba 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -455,12 +455,12 @@ static void print_verifier_state(struct bpf_verifier_env *env,
 		verbose(env, " R%d", i);
 		print_liveness(env, reg->live);
 		verbose(env, "=%s", reg_type_str[t]);
+		if (t == SCALAR_VALUE && reg->precise)
+			verbose(env, "P");
 		if ((t == SCALAR_VALUE || t == PTR_TO_STACK) &&
 		    tnum_is_const(reg->var_off)) {
 			/* reg->off should be 0 for SCALAR_VALUE */
 			verbose(env, "%lld", reg->var_off.value + reg->off);
-			if (t == PTR_TO_STACK)
-				verbose(env, ",call_%d", func(env, reg)->callsite);
 		} else {
 			verbose(env, "(id=%d", reg->id);
 			if (reg_type_may_be_refcounted_or_null(t))
@@ -522,11 +522,17 @@ static void print_verifier_state(struct bpf_verifier_env *env,
 			continue;
 		verbose(env, " fp%d", (-i - 1) * BPF_REG_SIZE);
 		print_liveness(env, state->stack[i].spilled_ptr.live);
-		if (state->stack[i].slot_type[0] == STACK_SPILL)
-			verbose(env, "=%s",
-				reg_type_str[state->stack[i].spilled_ptr.type]);
-		else
+		if (state->stack[i].slot_type[0] == STACK_SPILL) {
+			reg = &state->stack[i].spilled_ptr;
+			t = reg->type;
+			verbose(env, "=%s", reg_type_str[t]);
+			if (t == SCALAR_VALUE && reg->precise)
+				verbose(env, "P");
+			if (t == SCALAR_VALUE && tnum_is_const(reg->var_off))
+				verbose(env, "%lld", reg->var_off.value + reg->off);
+		} else {
 			verbose(env, "=%s", types_buf);
+		}
 	}
 	if (state->acquired_refs && state->refs[0].id) {
 		verbose(env, " refs=%d", state->refs[0].id);
@@ -675,6 +681,13 @@ static void free_func_state(struct bpf_func_state *state)
 	kfree(state);
 }
 
+static void clear_jmp_history(struct bpf_verifier_state *state)
+{
+	kfree(state->jmp_history);
+	state->jmp_history = NULL;
+	state->jmp_history_cnt = 0;
+}
+
 static void free_verifier_state(struct bpf_verifier_state *state,
 				bool free_self)
 {
@@ -684,6 +697,7 @@ static void free_verifier_state(struct bpf_verifier_state *state,
 		free_func_state(state->frame[i]);
 		state->frame[i] = NULL;
 	}
+	clear_jmp_history(state);
 	if (free_self)
 		kfree(state);
 }
@@ -711,8 +725,18 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state,
 			       const struct bpf_verifier_state *src)
 {
 	struct bpf_func_state *dst;
+	u32 jmp_sz = sizeof(struct bpf_idx_pair) * src->jmp_history_cnt;
 	int i, err;
 
+	if (dst_state->jmp_history_cnt < src->jmp_history_cnt) {
+		kfree(dst_state->jmp_history);
+		dst_state->jmp_history = kmalloc(jmp_sz, GFP_USER);
+		if (!dst_state->jmp_history)
+			return -ENOMEM;
+	}
+	memcpy(dst_state->jmp_history, src->jmp_history, jmp_sz);
+	dst_state->jmp_history_cnt = src->jmp_history_cnt;
+
 	/* if dst has more stack frames then src frame, free them */
 	for (i = src->curframe + 1; i <= dst_state->curframe; i++) {
 		free_func_state(dst_state->frame[i]);
@@ -723,6 +747,8 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state,
 	dst_state->active_spin_lock = src->active_spin_lock;
 	dst_state->branches = src->branches;
 	dst_state->parent = src->parent;
+	dst_state->first_insn_idx = src->first_insn_idx;
+	dst_state->last_insn_idx = src->last_insn_idx;
 	for (i = 0; i <= src->curframe; i++) {
 		dst = dst_state->frame[i];
 		if (!dst) {
@@ -967,6 +993,9 @@ static void __mark_reg_unbounded(struct bpf_reg_state *reg)
 	reg->smax_value = S64_MAX;
 	reg->umin_value = 0;
 	reg->umax_value = U64_MAX;
+
+	/* constant backtracking is enabled for root only for now */
+	reg->precise = capable(CAP_SYS_ADMIN) ? false : true;
 }
 
 /* Mark a register as having a completely unknown (scalar) value. */
@@ -1378,6 +1407,389 @@ static int check_reg_arg(struct bpf_verifier_env *env, u32 regno,
 	return 0;
 }
 
+/* for any branch, call, exit record the history of jmps in the given state */
+static int push_jmp_history(struct bpf_verifier_env *env,
+			    struct bpf_verifier_state *cur)
+{
+	u32 cnt = cur->jmp_history_cnt;
+	struct bpf_idx_pair *p;
+
+	cnt++;
+	p = krealloc(cur->jmp_history, cnt * sizeof(*p), GFP_USER);
+	if (!p)
+		return -ENOMEM;
+	p[cnt - 1].idx = env->insn_idx;
+	p[cnt - 1].prev_idx = env->prev_insn_idx;
+	cur->jmp_history = p;
+	cur->jmp_history_cnt = cnt;
+	return 0;
+}
+
+/* Backtrack one insn at a time. If idx is not at the top of recorded
+ * history then previous instruction came from straight line execution.
+ */
+static int get_prev_insn_idx(struct bpf_verifier_state *st, int i,
+			     u32 *history)
+{
+	u32 cnt = *history;
+
+	if (cnt && st->jmp_history[cnt - 1].idx == i) {
+		i = st->jmp_history[cnt - 1].prev_idx;
+		(*history)--;
+	} else {
+		i--;
+	}
+	return i;
+}
+
+/* For given verifier state backtrack_insn() is called from the last insn to
+ * the first insn. Its purpose is to compute a bitmask of registers and
+ * stack slots that needs precision in the parent verifier state.
+ */
+static int backtrack_insn(struct bpf_verifier_env *env, int idx,
+			  u32 *reg_mask, u64 *stack_mask)
+{
+	const struct bpf_insn_cbs cbs = {
+		.cb_print	= verbose,
+		.private_data	= env,
+	};
+	struct bpf_insn *insn = env->prog->insnsi + idx;
+	u8 class = BPF_CLASS(insn->code);
+	u8 opcode = BPF_OP(insn->code);
+	u8 mode = BPF_MODE(insn->code);
+	u32 dreg = 1u << insn->dst_reg;
+	u32 sreg = 1u << insn->src_reg;
+	u32 spi;
+
+	if (insn->code == 0)
+		return 0;
+	if (env->log.level & BPF_LOG_LEVEL) {
+		verbose(env, "regs=%x stack=%llx before ", *reg_mask, *stack_mask);
+		verbose(env, "%d: ", idx);
+		print_bpf_insn(&cbs, insn, env->allow_ptr_leaks);
+	}
+
+	if (class == BPF_ALU || class == BPF_ALU64) {
+		if (!(*reg_mask & dreg))
+			return 0;
+		if (opcode == BPF_MOV) {
+			if (BPF_SRC(insn->code) == BPF_X) {
+				/* dreg = sreg
+				 * dreg needs precision after this insn
+				 * sreg needs precision before this insn
+				 */
+				*reg_mask &= ~dreg;
+				*reg_mask |= sreg;
+			} else {
+				/* dreg = K
+				 * dreg needs precision after this insn.
+				 * Corresponding register is already marked
+				 * as precise=true in this verifier state.
+				 * No further markings in parent are necessary
+				 */
+				*reg_mask &= ~dreg;
+			}
+		} else {
+			if (BPF_SRC(insn->code) == BPF_X) {
+				/* dreg += sreg
+				 * both dreg and sreg need precision
+				 * before this insn
+				 */
+				*reg_mask |= sreg;
+			} /* else dreg += K
+			   * dreg still needs precision before this insn
+			   */
+		}
+	} else if (class == BPF_LDX) {
+		if (!(*reg_mask & dreg))
+			return 0;
+		*reg_mask &= ~dreg;
+
+		/* scalars can only be spilled into stack w/o losing precision.
+		 * Load from any other memory can be zero extended.
+		 * The desire to keep that precision is already indicated
+		 * by 'precise' mark in corresponding register of this state.
+		 * No further tracking necessary.
+		 */
+		if (insn->src_reg != BPF_REG_FP)
+			return 0;
+		if (BPF_SIZE(insn->code) != BPF_DW)
+			return 0;
+
+		/* dreg = *(u64 *)[fp - off] was a fill from the stack.
+		 * that [fp - off] slot contains scalar that needs to be
+		 * tracked with precision
+		 */
+		spi = (-insn->off - 1) / BPF_REG_SIZE;
+		if (spi >= 64) {
+			verbose(env, "BUG spi %d\n", spi);
+			WARN_ONCE(1, "verifier backtracking bug");
+			return -EFAULT;
+		}
+		*stack_mask |= 1ull << spi;
+	} else if (class == BPF_STX) {
+		if (*reg_mask & dreg)
+			/* stx shouldn't be using _scalar_ dst_reg
+			 * to access memory. It means backtracking
+			 * encountered a case of pointer subtraction.
+			 */
+			return -ENOTSUPP;
+		/* scalars can only be spilled into stack */
+		if (insn->dst_reg != BPF_REG_FP)
+			return 0;
+		if (BPF_SIZE(insn->code) != BPF_DW)
+			return 0;
+		spi = (-insn->off - 1) / BPF_REG_SIZE;
+		if (spi >= 64) {
+			verbose(env, "BUG spi %d\n", spi);
+			WARN_ONCE(1, "verifier backtracking bug");
+			return -EFAULT;
+		}
+		if (!(*stack_mask & (1ull << spi)))
+			return 0;
+		*stack_mask &= ~(1ull << spi);
+		*reg_mask |= sreg;
+	} else if (class == BPF_JMP || class == BPF_JMP32) {
+		if (opcode == BPF_CALL) {
+			if (insn->src_reg == BPF_PSEUDO_CALL)
+				return -ENOTSUPP;
+			/* regular helper call sets R0 */
+			*reg_mask &= ~1;
+			if (*reg_mask & 0x3f) {
+				/* if backtracking was looking for registers R1-R5
+				 * they should have been found already.
+				 */
+				verbose(env, "BUG regs %x\n", *reg_mask);
+				WARN_ONCE(1, "verifier backtracking bug");
+				return -EFAULT;
+			}
+		} else if (opcode == BPF_EXIT) {
+			return -ENOTSUPP;
+		}
+	} else if (class == BPF_LD) {
+		if (!(*reg_mask & dreg))
+			return 0;
+		*reg_mask &= ~dreg;
+		/* It's ld_imm64 or ld_abs or ld_ind.
+		 * For ld_imm64 no further tracking of precision
+		 * into parent is necessary
+		 */
+		if (mode == BPF_IND || mode == BPF_ABS)
+			/* to be analyzed */
+			return -ENOTSUPP;
+	} else if (class == BPF_ST) {
+		if (*reg_mask & dreg)
+			/* likely pointer subtraction */
+			return -ENOTSUPP;
+	}
+	return 0;
+}
+
+/* the scalar precision tracking algorithm:
+ * . at the start all registers have precise=false.
+ * . scalar ranges are tracked as normal through alu and jmp insns.
+ * . once precise value of the scalar register is used in:
+ *   .  ptr + scalar alu
+ *   . if (scalar cond K|scalar)
+ *   .  helper_call(.., scalar, ...) where ARG_CONST is expected
+ *   backtrack through the verifier states and mark all registers and
+ *   stack slots with spilled constants that these scalar registers
+ *   should be precise.
+ * . during state pruning two registers (or spilled stack slots)
+ *   are equivalent if both are not precise.
+ *
+ * Note the verifier cannot simply walk register parentage chain,
+ * since many different registers and stack slots could have been
+ * used to compute single precise scalar.
+ *
+ * The approach of starting with precise=true for all registers and then
+ * backtrack to mark a register as not precise when the verifier detects
+ * that program doesn't care about specific value (e.g., when helper
+ * takes register as ARG_ANYTHING parameter) is not safe.
+ *
+ * It's ok to walk single parentage chain of the verifier states.
+ * It's possible that this backtracking will go all the way till 1st insn.
+ * All other branches will be explored for needing precision later.
+ *
+ * The backtracking needs to deal with cases like:
+ *   R8=map_value(id=0,off=0,ks=4,vs=1952,imm=0) R9_w=map_value(id=0,off=40,ks=4,vs=1952,imm=0)
+ * r9 -= r8
+ * r5 = r9
+ * if r5 > 0x79f goto pc+7
+ *    R5_w=inv(id=0,umax_value=1951,var_off=(0x0; 0x7ff))
+ * r5 += 1
+ * ...
+ * call bpf_perf_event_output#25
+ *   where .arg5_type = ARG_CONST_SIZE_OR_ZERO
+ *
+ * and this case:
+ * r6 = 1
+ * call foo // uses callee's r6 inside to compute r0
+ * r0 += r6
+ * if r0 == 0 goto
+ *
+ * to track above reg_mask/stack_mask needs to be independent for each frame.
+ *
+ * Also if parent's curframe > frame where backtracking started,
+ * the verifier need to mark registers in both frames, otherwise callees
+ * may incorrectly prune callers. This is similar to
+ * commit 7640ead93924 ("bpf: verifier: make sure callees don't prune with caller differences")
+ *
+ * For now backtracking falls back into conservative marking.
+ */
+static void mark_all_scalars_precise(struct bpf_verifier_env *env,
+				     struct bpf_verifier_state *st)
+{
+	struct bpf_func_state *func;
+	struct bpf_reg_state *reg;
+	int i, j;
+
+	/* big hammer: mark all scalars precise in this path.
+	 * pop_stack may still get !precise scalars.
+	 */
+	for (; st; st = st->parent)
+		for (i = 0; i <= st->curframe; i++) {
+			func = st->frame[i];
+			for (j = 0; j < BPF_REG_FP; j++) {
+				reg = &func->regs[j];
+				if (reg->type != SCALAR_VALUE)
+					continue;
+				reg->precise = true;
+			}
+			for (j = 0; j < func->allocated_stack / BPF_REG_SIZE; j++) {
+				if (func->stack[j].slot_type[0] != STACK_SPILL)
+					continue;
+				reg = &func->stack[j].spilled_ptr;
+				if (reg->type != SCALAR_VALUE)
+					continue;
+				reg->precise = true;
+			}
+		}
+}
+
+static int mark_chain_precision(struct bpf_verifier_env *env, int regno)
+{
+	struct bpf_verifier_state *st = env->cur_state;
+	int first_idx = st->first_insn_idx;
+	int last_idx = env->insn_idx;
+	struct bpf_func_state *func;
+	struct bpf_reg_state *reg;
+	u32 reg_mask = 1u << regno;
+	u64 stack_mask = 0;
+	bool skip_first = true;
+	int i, err;
+
+	if (!env->allow_ptr_leaks)
+		/* backtracking is root only for now */
+		return 0;
+
+	func = st->frame[st->curframe];
+	reg = &func->regs[regno];
+	if (reg->type != SCALAR_VALUE) {
+		WARN_ONCE(1, "backtracing misuse");
+		return -EFAULT;
+	}
+	if (reg->precise)
+		return 0;
+	func->regs[regno].precise = true;
+
+	for (;;) {
+		DECLARE_BITMAP(mask, 64);
+		bool new_marks = false;
+		u32 history = st->jmp_history_cnt;
+
+		if (env->log.level & BPF_LOG_LEVEL)
+			verbose(env, "last_idx %d first_idx %d\n", last_idx, first_idx);
+		for (i = last_idx;;) {
+			if (skip_first) {
+				err = 0;
+				skip_first = false;
+			} else {
+				err = backtrack_insn(env, i, &reg_mask, &stack_mask);
+			}
+			if (err == -ENOTSUPP) {
+				mark_all_scalars_precise(env, st);
+				return 0;
+			} else if (err) {
+				return err;
+			}
+			if (!reg_mask && !stack_mask)
+				/* Found assignment(s) into tracked register in this state.
+				 * Since this state is already marked, just return.
+				 * Nothing to be tracked further in the parent state.
+				 */
+				return 0;
+			if (i == first_idx)
+				break;
+			i = get_prev_insn_idx(st, i, &history);
+			if (i >= env->prog->len) {
+				/* This can happen if backtracking reached insn 0
+				 * and there are still reg_mask or stack_mask
+				 * to backtrack.
+				 * It means the backtracking missed the spot where
+				 * particular register was initialized with a constant.
+				 */
+				verbose(env, "BUG backtracking idx %d\n", i);
+				WARN_ONCE(1, "verifier backtracking bug");
+				return -EFAULT;
+			}
+		}
+		st = st->parent;
+		if (!st)
+			break;
+
+		func = st->frame[st->curframe];
+		bitmap_from_u64(mask, reg_mask);
+		for_each_set_bit(i, mask, 32) {
+			reg = &func->regs[i];
+			if (reg->type != SCALAR_VALUE)
+				continue;
+			if (!reg->precise)
+				new_marks = true;
+			reg->precise = true;
+		}
+
+		bitmap_from_u64(mask, stack_mask);
+		for_each_set_bit(i, mask, 64) {
+			if (i >= func->allocated_stack / BPF_REG_SIZE) {
+				/* This can happen if backtracking
+				 * is propagating stack precision where
+				 * caller has larger stack frame
+				 * than callee, but backtrack_insn() should
+				 * have returned -ENOTSUPP.
+				 */
+				verbose(env, "BUG spi %d stack_size %d\n",
+					i, func->allocated_stack);
+				WARN_ONCE(1, "verifier backtracking bug");
+				return -EFAULT;
+			}
+
+			if (func->stack[i].slot_type[0] != STACK_SPILL)
+				continue;
+			reg = &func->stack[i].spilled_ptr;
+			if (reg->type != SCALAR_VALUE)
+				continue;
+			if (!reg->precise)
+				new_marks = true;
+			reg->precise = true;
+		}
+		if (env->log.level & BPF_LOG_LEVEL) {
+			print_verifier_state(env, func);
+			verbose(env, "parent %s regs=%x stack=%llx marks\n",
+				new_marks ? "didn't have" : "already had",
+				reg_mask, stack_mask);
+		}
+
+		if (!new_marks)
+			break;
+
+		last_idx = st->last_insn_idx;
+		first_idx = st->first_insn_idx;
+	}
+	return 0;
+}
+
+
 static bool is_spillable_regtype(enum bpf_reg_type type)
 {
 	switch (type) {
@@ -1435,6 +1847,7 @@ static int check_stack_write(struct bpf_verifier_env *env,
 {
 	struct bpf_func_state *cur; /* state of the current function */
 	int i, slot = -off - 1, spi = slot / BPF_REG_SIZE, err;
+	u32 dst_reg = env->prog->insnsi[insn_idx].dst_reg;
 	struct bpf_reg_state *reg = NULL;
 
 	err = realloc_func_state(state, round_up(slot + 1, BPF_REG_SIZE),
@@ -1457,6 +1870,17 @@ static int check_stack_write(struct bpf_verifier_env *env,
 
 	if (reg && size == BPF_REG_SIZE && register_is_const(reg) &&
 	    !register_is_null(reg) && env->allow_ptr_leaks) {
+		if (dst_reg != BPF_REG_FP) {
+			/* The backtracking logic can only recognize explicit
+			 * stack slot address like [fp - 8]. Other spill of
+			 * scalar via different register has to be conservative.
+			 * Backtrack from here and mark all registers as precise
+			 * that contributed into 'reg' being a constant.
+			 */
+			err = mark_chain_precision(env, value_regno);
+			if (err)
+				return err;
+		}
 		save_register_state(state, spi, reg);
 	} else if (reg && is_spillable_regtype(reg->type)) {
 		/* register containing pointer is being spilled into stack */
@@ -1529,8 +1953,13 @@ static int check_stack_write(struct bpf_verifier_env *env,
 			state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN;
 
 		/* when we zero initialize stack slots mark them as such */
-		if (reg && register_is_null(reg))
+		if (reg && register_is_null(reg)) {
+			/* backtracking doesn't work for STACK_ZERO yet. */
+			err = mark_chain_precision(env, value_regno);
+			if (err)
+				return err;
 			type = STACK_ZERO;
+		}
 
 		/* Mark slots affected by this stack write. */
 		for (i = 0; i < size; i++)
@@ -1610,6 +2039,17 @@ static int check_stack_read(struct bpf_verifier_env *env,
 				 * so the whole register == const_zero
 				 */
 				__mark_reg_const_zero(&state->regs[value_regno]);
+				/* backtracking doesn't support STACK_ZERO yet,
+				 * so mark it precise here, so that later
+				 * backtracking can stop here.
+				 * Backtracking may not need this if this register
+				 * doesn't participate in pointer adjustment.
+				 * Forward propagation of precise flag is not
+				 * necessary either. This mark is only to stop
+				 * backtracking. Any register that contributed
+				 * to const 0 was marked precise before spill.
+				 */
+				state->regs[value_regno].precise = true;
 			} else {
 				/* have read misc data from the stack */
 				mark_reg_unknown(env, state->regs, value_regno);
@@ -2925,6 +3365,8 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
 		err = check_helper_mem_access(env, regno - 1,
 					      reg->umax_value,
 					      zero_size_allowed, meta);
+		if (!err)
+			err = mark_chain_precision(env, regno);
 	} else if (arg_type_is_int_ptr(arg_type)) {
 		int size = int_ptr_type_to_size(arg_type);
 
@@ -4361,6 +4803,7 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env,
 	struct bpf_reg_state *regs = state->regs, *dst_reg, *src_reg;
 	struct bpf_reg_state *ptr_reg = NULL, off_reg = {0};
 	u8 opcode = BPF_OP(insn->code);
+	int err;
 
 	dst_reg = &regs[insn->dst_reg];
 	src_reg = NULL;
@@ -4387,11 +4830,17 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env,
 				 * This is legal, but we have to reverse our
 				 * src/dest handling in computing the range
 				 */
+				err = mark_chain_precision(env, insn->dst_reg);
+				if (err)
+					return err;
 				return adjust_ptr_min_max_vals(env, insn,
 							       src_reg, dst_reg);
 			}
 		} else if (ptr_reg) {
 			/* pointer += scalar */
+			err = mark_chain_precision(env, insn->src_reg);
+			if (err)
+				return err;
 			return adjust_ptr_min_max_vals(env, insn,
 						       dst_reg, src_reg);
 		}
@@ -5348,6 +5797,13 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
 		 tnum_is_const(src_reg->var_off))
 		pred = is_branch_taken(dst_reg, src_reg->var_off.value,
 				       opcode, is_jmp32);
+	if (pred >= 0) {
+		err = mark_chain_precision(env, insn->dst_reg);
+		if (BPF_SRC(insn->code) == BPF_X && !err)
+			err = mark_chain_precision(env, insn->src_reg);
+		if (err)
+			return err;
+	}
 	if (pred == 1) {
 		/* only follow the goto, ignore fall-through */
 		*insn_idx += insn->off;
@@ -5825,6 +6281,11 @@ peek_stack:
 				goto peek_stack;
 			else if (ret < 0)
 				goto err_free;
+			/* unconditional jmp is not a good pruning point,
+			 * but it's marked, since backtracking needs
+			 * to record jmp history in is_state_visited().
+			 */
+			init_explored_state(env, t + insns[t].off + 1);
 			/* tell verifier to check for equivalent states
 			 * after every call and jump
 			 */
@@ -6325,6 +6786,8 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur,
 	switch (rold->type) {
 	case SCALAR_VALUE:
 		if (rcur->type == SCALAR_VALUE) {
+			if (!rold->precise && !rcur->precise)
+				return true;
 			/* new val must satisfy old val knowledge */
 			return range_within(rold, rcur) &&
 			       tnum_in(rold->var_off, rcur->var_off);
@@ -6675,6 +7138,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
 	int i, j, err, states_cnt = 0;
 	bool add_new_state = false;
 
+	cur->last_insn_idx = env->prev_insn_idx;
 	if (!env->insn_aux_data[insn_idx].prune_point)
 		/* this 'insn_idx' instruction wasn't marked, so we will not
 		 * be doing state search here
@@ -6791,10 +7255,10 @@ next:
 		env->max_states_per_insn = states_cnt;
 
 	if (!env->allow_ptr_leaks && states_cnt > BPF_COMPLEXITY_LIMIT_STATES)
-		return 0;
+		return push_jmp_history(env, cur);
 
 	if (!add_new_state)
-		return 0;
+		return push_jmp_history(env, cur);
 
 	/* There were no equivalent states, remember the current one.
 	 * Technically the current state is not proven to be safe yet,
@@ -6824,7 +7288,10 @@ next:
 	new->insn_idx = insn_idx;
 	WARN_ONCE(new->branches != 1,
 		  "BUG is_state_visited:branches_to_explore=%d insn %d\n", new->branches, insn_idx);
+
 	cur->parent = new;
+	cur->first_insn_idx = insn_idx;
+	clear_jmp_history(cur);
 	new_sl->next = *explored_state(env, insn_idx);
 	*explored_state(env, insn_idx) = new_sl;
 	/* connect new state to parentage chain. Current frame needs all
@@ -6904,6 +7371,7 @@ static int do_check(struct bpf_verifier_env *env)
 	struct bpf_reg_state *regs;
 	int insn_cnt = env->prog->len;
 	bool do_print_state = false;
+	int prev_insn_idx = -1;
 
 	env->prev_linfo = NULL;
 
@@ -6929,6 +7397,7 @@ static int do_check(struct bpf_verifier_env *env)
 		u8 class;
 		int err;
 
+		env->prev_insn_idx = prev_insn_idx;
 		if (env->insn_idx >= insn_cnt) {
 			verbose(env, "invalid insn idx %d insn_cnt %d\n",
 				env->insn_idx, insn_cnt);
@@ -7001,6 +7470,7 @@ static int do_check(struct bpf_verifier_env *env)
 
 		regs = cur_regs(env);
 		env->insn_aux_data[env->insn_idx].seen = true;
+		prev_insn_idx = env->insn_idx;
 
 		if (class == BPF_ALU || class == BPF_ALU64) {
 			err = check_alu_op(env, insn);
@@ -7174,7 +7644,6 @@ static int do_check(struct bpf_verifier_env *env)
 
 				if (state->curframe) {
 					/* exit from nested function */
-					env->prev_insn_idx = env->insn_idx;
 					err = prepare_func_exit(env, &env->insn_idx);
 					if (err)
 						return err;
@@ -7206,7 +7675,7 @@ static int do_check(struct bpf_verifier_env *env)
 					return err;
 process_bpf_exit:
 				update_branch_counts(env, env->cur_state);
-				err = pop_stack(env, &env->prev_insn_idx,
+				err = pop_stack(env, &prev_insn_idx,
 						&env->insn_idx);
 				if (err < 0) {
 					if (err != -ENOENT)
-- 
cgit v1.2.3


From 0b385a0c3bd3f6d1044728b732bfc7dfb01c9fb5 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Tue, 18 Jun 2019 10:18:28 +0200
Subject: PM: suspend: Rename pm_suspend_via_s2idle()

The name of pm_suspend_via_s2idle() is confusing, as it doesn't
reflect the purpose of the function precisely enough and it is
very similar to pm_suspend_via_firmware(), which has a different
purpose, so rename it as pm_suspend_default_s2idle() and update
its only caller, i8042_register_ports(), accordingly.

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Acked-by: Dmitry Torokhov <dmitry.torokhov@gmail.com>
---
 drivers/input/serio/i8042.c | 2 +-
 include/linux/suspend.h     | 4 ++--
 kernel/power/suspend.c      | 6 +++---
 3 files changed, 6 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/input/serio/i8042.c b/drivers/input/serio/i8042.c
index 6462f1798fbb..8384abc41d7f 100644
--- a/drivers/input/serio/i8042.c
+++ b/drivers/input/serio/i8042.c
@@ -1410,7 +1410,7 @@ static void __init i8042_register_ports(void)
 		 * behavior on many platforms using suspend-to-RAM (ACPI S3)
 		 * by default.
 		 */
-		if (pm_suspend_via_s2idle() && i == I8042_KBD_PORT_NO)
+		if (pm_suspend_default_s2idle() && i == I8042_KBD_PORT_NO)
 			device_set_wakeup_enable(&serio->dev, true);
 	}
 }
diff --git a/include/linux/suspend.h b/include/linux/suspend.h
index 05645f726815..d07ae7fb9315 100644
--- a/include/linux/suspend.h
+++ b/include/linux/suspend.h
@@ -282,7 +282,7 @@ static inline bool idle_should_enter_s2idle(void)
 	return unlikely(s2idle_state == S2IDLE_STATE_ENTER);
 }
 
-extern bool pm_suspend_via_s2idle(void);
+extern bool pm_suspend_default_s2idle(void);
 extern void __init pm_states_init(void);
 extern void s2idle_set_ops(const struct platform_s2idle_ops *ops);
 extern void s2idle_wake(void);
@@ -314,7 +314,7 @@ static inline void pm_set_suspend_via_firmware(void) {}
 static inline void pm_set_resume_via_firmware(void) {}
 static inline bool pm_suspend_via_firmware(void) { return false; }
 static inline bool pm_resume_via_firmware(void) { return false; }
-static inline bool pm_suspend_via_s2idle(void) { return false; }
+static inline bool pm_suspend_default_s2idle(void) { return false; }
 
 static inline void suspend_set_ops(const struct platform_suspend_ops *ops) {}
 static inline int pm_suspend(suspend_state_t state) { return -ENOSYS; }
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 9505101ed2bc..8703b0ca4986 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -62,16 +62,16 @@ enum s2idle_states __read_mostly s2idle_state;
 static DEFINE_RAW_SPINLOCK(s2idle_lock);
 
 /**
- * pm_suspend_via_s2idle - Check if suspend-to-idle is the default suspend.
+ * pm_suspend_default_s2idle - Check if suspend-to-idle is the default suspend.
  *
  * Return 'true' if suspend-to-idle has been selected as the default system
  * suspend method.
  */
-bool pm_suspend_via_s2idle(void)
+bool pm_suspend_default_s2idle(void)
 {
 	return mem_sleep_current == PM_SUSPEND_TO_IDLE;
 }
-EXPORT_SYMBOL_GPL(pm_suspend_via_s2idle);
+EXPORT_SYMBOL_GPL(pm_suspend_default_s2idle);
 
 void s2idle_set_ops(const struct platform_s2idle_ops *ops)
 {
-- 
cgit v1.2.3


From 387e3746d01c34457d6a73688acd90428725070b Mon Sep 17 00:00:00 2001
From: Amir Goldstein <amir73il@gmail.com>
Date: Fri, 7 Jun 2019 17:24:38 +0300
Subject: locks: eliminate false positive conflicts for write lease

check_conflicting_open() checks for existing fds open for read or for
write before allowing a write lease to be taken.  The check that was
implemented using i_count and d_count is an approximation that has
several false positives.  For example, overlayfs, since v4.19, takes an
extra reference on the dentry; an open with O_PATH takes a reference on
the dentry although the file can be neither read nor written.

Change the implementation to use i_readcount and i_writecount to
eliminate the false positive conflicts and allow a write lease to be
taken on an overlayfs file.

The change of behavior with existing fd's open with O_PATH is symmetric
w.r.t. current behavior of lease breakers - an open with O_PATH currently
does not break a write lease.

This increases the size of struct inode by 4 bytes on 32bit archs when
CONFIG_FILE_LOCKING is defined and CONFIG_IMA was not already
defined.

Signed-off-by: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: Jeff Layton <jlayton@kernel.org>
---
 fs/locks.c         | 42 +++++++++++++++++++++++++++---------------
 include/linux/fs.h |  4 ++--
 2 files changed, 29 insertions(+), 17 deletions(-)

(limited to 'include/linux')

diff --git a/fs/locks.c b/fs/locks.c
index 0cc2b9f30e22..de87a3231789 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -1763,10 +1763,10 @@ int fcntl_getlease(struct file *filp)
 }
 
 /**
- * check_conflicting_open - see if the given dentry points to a file that has
+ * check_conflicting_open - see if the given file points to an inode that has
  *			    an existing open that would conflict with the
  *			    desired lease.
- * @dentry:	dentry to check
+ * @filp:	file to check
  * @arg:	type of lease that we're trying to acquire
  * @flags:	current lock flags
  *
@@ -1774,30 +1774,42 @@ int fcntl_getlease(struct file *filp)
  * conflict with the lease we're trying to set.
  */
 static int
-check_conflicting_open(const struct dentry *dentry, const long arg, int flags)
+check_conflicting_open(struct file *filp, const long arg, int flags)
 {
-	int ret = 0;
-	struct inode *inode = dentry->d_inode;
+	struct inode *inode = locks_inode(filp);
+	int self_wcount = 0, self_rcount = 0;
 
 	if (flags & FL_LAYOUT)
 		return 0;
 
-	if ((arg == F_RDLCK) && inode_is_open_for_write(inode))
-		return -EAGAIN;
+	if (arg == F_RDLCK)
+		return inode_is_open_for_write(inode) ? -EAGAIN : 0;
+	else if (arg != F_WRLCK)
+		return 0;
+
+	/*
+	 * Make sure that only read/write count is from lease requestor.
+	 * Note that this will result in denying write leases when i_writecount
+	 * is negative, which is what we want.  (We shouldn't grant write leases
+	 * on files open for execution.)
+	 */
+	if (filp->f_mode & FMODE_WRITE)
+		self_wcount = 1;
+	else if (filp->f_mode & FMODE_READ)
+		self_rcount = 1;
 
-	if ((arg == F_WRLCK) && ((d_count(dentry) > 1) ||
-	    (atomic_read(&inode->i_count) > 1)))
-		ret = -EAGAIN;
+	if (atomic_read(&inode->i_writecount) != self_wcount ||
+	    atomic_read(&inode->i_readcount) != self_rcount)
+		return -EAGAIN;
 
-	return ret;
+	return 0;
 }
 
 static int
 generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **priv)
 {
 	struct file_lock *fl, *my_fl = NULL, *lease;
-	struct dentry *dentry = filp->f_path.dentry;
-	struct inode *inode = dentry->d_inode;
+	struct inode *inode = locks_inode(filp);
 	struct file_lock_context *ctx;
 	bool is_deleg = (*flp)->fl_flags & FL_DELEG;
 	int error;
@@ -1832,7 +1844,7 @@ generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **pr
 	percpu_down_read(&file_rwsem);
 	spin_lock(&ctx->flc_lock);
 	time_out_leases(inode, &dispose);
-	error = check_conflicting_open(dentry, arg, lease->fl_flags);
+	error = check_conflicting_open(filp, arg, lease->fl_flags);
 	if (error)
 		goto out;
 
@@ -1889,7 +1901,7 @@ generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **pr
 	 * precedes these checks.
 	 */
 	smp_mb();
-	error = check_conflicting_open(dentry, arg, lease->fl_flags);
+	error = check_conflicting_open(filp, arg, lease->fl_flags);
 	if (error) {
 		locks_unlink_lock_ctx(lease);
 		goto out;
diff --git a/include/linux/fs.h b/include/linux/fs.h
index f7fdfe93e25d..419e327022de 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -694,7 +694,7 @@ struct inode {
 	atomic_t		i_count;
 	atomic_t		i_dio_count;
 	atomic_t		i_writecount;
-#ifdef CONFIG_IMA
+#if defined(CONFIG_IMA) || defined(CONFIG_FILE_LOCKING)
 	atomic_t		i_readcount; /* struct files open RO */
 #endif
 	union {
@@ -2890,7 +2890,7 @@ static inline bool inode_is_open_for_write(const struct inode *inode)
 	return atomic_read(&inode->i_writecount) > 0;
 }
 
-#ifdef CONFIG_IMA
+#if defined(CONFIG_IMA) || defined(CONFIG_FILE_LOCKING)
 static inline void i_readcount_dec(struct inode *inode)
 {
 	BUG_ON(!atomic_read(&inode->i_readcount));
-- 
cgit v1.2.3


From 82828b88f081a0084cd65f90a4a1d3652f5adb66 Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@mellanox.com>
Date: Wed, 19 Jun 2019 09:41:02 +0300
Subject: flow_dissector: add support for ingress ifindex dissection

Add a new key, meta, that contains the ingress ifindex value and add a
function to dissect it from the skb. The key and function are prepared to
cover dissection of other potential skb metadata values.

Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h       |  4 ++++
 include/net/flow_dissector.h |  9 +++++++++
 net/core/flow_dissector.c    | 16 ++++++++++++++++
 3 files changed, 29 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 28bdaf978e72..b5d427b149c9 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -1320,6 +1320,10 @@ skb_flow_dissect_flow_keys_basic(const struct net *net,
 				  data, proto, nhoff, hlen, flags);
 }
 
+void skb_flow_dissect_meta(const struct sk_buff *skb,
+			   struct flow_dissector *flow_dissector,
+			   void *target_container);
+
 void
 skb_flow_dissect_tunnel_info(const struct sk_buff *skb,
 			     struct flow_dissector *flow_dissector,
diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h
index d7ce647a8ca9..02478e48fae4 100644
--- a/include/net/flow_dissector.h
+++ b/include/net/flow_dissector.h
@@ -200,6 +200,14 @@ struct flow_dissector_key_ip {
 	__u8	ttl;
 };
 
+/**
+ * struct flow_dissector_key_meta:
+ * @ingress_ifindex: ingress ifindex
+ */
+struct flow_dissector_key_meta {
+	int ingress_ifindex;
+};
+
 enum flow_dissector_key_id {
 	FLOW_DISSECTOR_KEY_CONTROL, /* struct flow_dissector_key_control */
 	FLOW_DISSECTOR_KEY_BASIC, /* struct flow_dissector_key_basic */
@@ -225,6 +233,7 @@ enum flow_dissector_key_id {
 	FLOW_DISSECTOR_KEY_CVLAN, /* struct flow_dissector_key_vlan */
 	FLOW_DISSECTOR_KEY_ENC_IP, /* struct flow_dissector_key_ip */
 	FLOW_DISSECTOR_KEY_ENC_OPTS, /* struct flow_dissector_key_enc_opts */
+	FLOW_DISSECTOR_KEY_META, /* struct flow_dissector_key_meta */
 
 	FLOW_DISSECTOR_KEY_MAX,
 };
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index c0559af9e5e5..01ad60b5aa75 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -199,6 +199,22 @@ __be32 __skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto,
 }
 EXPORT_SYMBOL(__skb_flow_get_ports);
 
+void skb_flow_dissect_meta(const struct sk_buff *skb,
+			   struct flow_dissector *flow_dissector,
+			   void *target_container)
+{
+	struct flow_dissector_key_meta *meta;
+
+	if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_META))
+		return;
+
+	meta = skb_flow_dissector_target(flow_dissector,
+					 FLOW_DISSECTOR_KEY_META,
+					 target_container);
+	meta->ingress_ifindex = skb->skb_iif;
+}
+EXPORT_SYMBOL(skb_flow_dissect_meta);
+
 static void
 skb_flow_dissect_set_enc_addr_type(enum flow_dissector_key_id type,
 				   struct flow_dissector *flow_dissector,
-- 
cgit v1.2.3


From 896f1950e5944532b971d880a6bae7fba3b6a8d3 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Wed, 19 Jun 2019 16:10:15 +0100
Subject: keys: Provide request_key_rcu()

Provide a request_key_rcu() function that can be used to request a key
under RCU conditions.  It can only search and check permissions; it cannot
allocate a new key, upcall or wait for an upcall to complete.  It may
return a partially constructed key.

Signed-off-by: David Howells <dhowells@redhat.com>
---
 Documentation/security/keys/core.rst        | 10 +++++++
 Documentation/security/keys/request-key.rst |  9 ++++++
 include/linux/key.h                         |  3 ++
 security/keys/request_key.c                 | 44 +++++++++++++++++++++++++++++
 4 files changed, 66 insertions(+)

(limited to 'include/linux')

diff --git a/Documentation/security/keys/core.rst b/Documentation/security/keys/core.rst
index 82dd457ff78d..003f1452a5b7 100644
--- a/Documentation/security/keys/core.rst
+++ b/Documentation/security/keys/core.rst
@@ -1147,6 +1147,16 @@ payload contents" for more information.
     case error ERESTARTSYS will be returned.
 
 
+ *  To search for a key under RCU conditions, call::
+
+	struct key *request_key_rcu(const struct key_type *type,
+				    const char *description);
+
+    which is similar to request_key() except that it does not check for keys
+    that are under construction and it will not call out to userspace to
+    construct a key if it can't find a match.
+
+
  *  When it is no longer required, the key should be released using::
 
 	void key_put(struct key *key);
diff --git a/Documentation/security/keys/request-key.rst b/Documentation/security/keys/request-key.rst
index 07af991463b5..7caedc4d29f1 100644
--- a/Documentation/security/keys/request-key.rst
+++ b/Documentation/security/keys/request-key.rst
@@ -36,6 +36,11 @@ or::
 					     	   size_t callout_len,
 						   void *aux);
 
+or::
+
+	struct key *request_key_rcu(const struct key_type *type,
+				    const char *description);
+
 Or by userspace invoking the request_key system call::
 
 	key_serial_t request_key(const char *type,
@@ -57,6 +62,10 @@ The two async in-kernel calls may return keys that are still in the process of
 being constructed.  The two non-async ones will wait for construction to
 complete first.
 
+The request_key_rcu() call is like the in-kernel request_key() call, except
+that it doesn't check for keys that are under construction and doesn't attempt
+to construct missing keys.
+
 The userspace interface links the key to a keyring associated with the process
 to prevent the key from going away, and returns the serial number of the key to
 the caller.
diff --git a/include/linux/key.h b/include/linux/key.h
index 612e1cf84049..3604a554df99 100644
--- a/include/linux/key.h
+++ b/include/linux/key.h
@@ -274,6 +274,9 @@ extern struct key *request_key(struct key_type *type,
 			       const char *description,
 			       const char *callout_info);
 
+extern struct key *request_key_rcu(struct key_type *type,
+				   const char *description);
+
 extern struct key *request_key_with_auxdata(struct key_type *type,
 					    const char *description,
 					    const void *callout_info,
diff --git a/security/keys/request_key.c b/security/keys/request_key.c
index bf1d223ec21c..b4b3677657d6 100644
--- a/security/keys/request_key.c
+++ b/security/keys/request_key.c
@@ -756,3 +756,47 @@ struct key *request_key_async_with_auxdata(struct key_type *type,
 				    callout_len, aux, NULL, KEY_ALLOC_IN_QUOTA);
 }
 EXPORT_SYMBOL(request_key_async_with_auxdata);
+
+/**
+ * request_key_rcu - Request key from RCU-read-locked context
+ * @type: The type of key we want.
+ * @description: The name of the key we want.
+ *
+ * Request a key from a context that we may not sleep in (such as RCU-mode
+ * pathwalk).  Keys under construction are ignored.
+ *
+ * Return a pointer to the found key if successful, -ENOKEY if we couldn't find
+ * a key or some other error if the key found was unsuitable or inaccessible.
+ */
+struct key *request_key_rcu(struct key_type *type, const char *description)
+{
+	struct keyring_search_context ctx = {
+		.index_key.type		= type,
+		.index_key.description	= description,
+		.index_key.desc_len	= strlen(description),
+		.cred			= current_cred(),
+		.match_data.cmp		= key_default_cmp,
+		.match_data.raw_data	= description,
+		.match_data.lookup_type	= KEYRING_SEARCH_LOOKUP_DIRECT,
+		.flags			= (KEYRING_SEARCH_DO_STATE_CHECK |
+					   KEYRING_SEARCH_SKIP_EXPIRED),
+	};
+	struct key *key;
+	key_ref_t key_ref;
+
+	kenter("%s,%s", type->name, description);
+
+	/* search all the process keyrings for a key */
+	key_ref = search_process_keyrings_rcu(&ctx);
+	if (IS_ERR(key_ref)) {
+		key = ERR_CAST(key_ref);
+		if (PTR_ERR(key_ref) == -EAGAIN)
+			key = ERR_PTR(-ENOKEY);
+	} else {
+		key = key_ref_to_ptr(key_ref);
+	}
+
+	kleave(" = %p", key);
+	return key;
+}
+EXPORT_SYMBOL(request_key_rcu);
-- 
cgit v1.2.3


From 7743c48e54ee9be9c799cbf3b8e3e9f2b8d19e72 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Wed, 19 Jun 2019 16:10:15 +0100
Subject: keys: Cache result of request_key*() temporarily in task_struct

If a filesystem uses keys to hold authentication tokens, then it needs a
token for each VFS operation that might perform an authentication check -
either by passing it to the server, or by using it to perform a check based
on authentication data cached locally.

For open files this isn't a problem, since the key should be cached in the
file struct since it represents the subject performing operations on that
file descriptor.

During pathwalk, however, there isn't anywhere to cache the key, except
perhaps in the nameidata struct - but that isn't exposed to the
filesystems.  Further, a pathwalk can incur a lot of operations, calling
one or more of the following, for instance:

	->lookup()
	->permission()
	->d_revalidate()
	->d_automount()
	->get_acl()
	->getxattr()

on each dentry/inode it encounters - and each one may need to call
request_key().  And then, at the end of pathwalk, it will call the actual
operation:

	->mkdir()
	->mknod()
	->getattr()
	->open()
	...

which may need to go and get the token again.

However, it is very likely that all of the operations on a single
dentry/inode - and quite possibly a sequence of them - will all want to use
the same authentication token, which suggests that caching it would be a
good idea.

To this end:

 (1) Make it so that a positive result of request_key() and co. that didn't
     require upcalling to userspace is cached temporarily in task_struct.

 (2) The cache is 1 deep, so a new result displaces the old one.

 (3) The key is released by exit and by notify-resume.

 (4) The cache is cleared in a newly forked process.

Signed-off-by: David Howells <dhowells@redhat.com>
---
 Documentation/security/keys/request-key.rst |  7 +++++-
 include/linux/sched.h                       |  5 ++++
 include/linux/tracehook.h                   |  7 ++++++
 kernel/cred.c                               |  9 +++++++
 security/keys/Kconfig                       | 18 ++++++++++++++
 security/keys/request_key.c                 | 37 +++++++++++++++++++++++++++++
 6 files changed, 82 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/Documentation/security/keys/request-key.rst b/Documentation/security/keys/request-key.rst
index 7caedc4d29f1..45049abdf290 100644
--- a/Documentation/security/keys/request-key.rst
+++ b/Documentation/security/keys/request-key.rst
@@ -176,6 +176,9 @@ The process stops immediately a valid key is found with permission granted to
 use it.  Any error from a previous match attempt is discarded and the key is
 returned.
 
+When request_key() is invoked, if CONFIG_KEYS_REQUEST_CACHE=y, a per-task
+one-key cache is first checked for a match.
+
 When search_process_keyrings() is invoked, it performs the following searches
 until one succeeds:
 
@@ -195,7 +198,9 @@ until one succeeds:
       c) The calling process's session keyring is searched.
 
 The moment one succeeds, all pending errors are discarded and the found key is
-returned.
+returned.  If CONFIG_KEYS_REQUEST_CACHE=y, then that key is placed in the
+per-task cache, displacing the previous key.  The cache is cleared on exit or
+just prior to resumption of userspace.
 
 Only if all these fail does the whole thing fail with the highest priority
 error.  Note that several errors may have come from LSM.
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 11837410690f..e5f18857dd53 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -831,6 +831,11 @@ struct task_struct {
 	/* Effective (overridable) subjective task credentials (COW): */
 	const struct cred __rcu		*cred;
 
+#ifdef CONFIG_KEYS
+	/* Cached requested key. */
+	struct key			*cached_requested_key;
+#endif
+
 	/*
 	 * executable name, excluding path.
 	 *
diff --git a/include/linux/tracehook.h b/include/linux/tracehook.h
index df20f8bdbfa3..81824467e6a6 100644
--- a/include/linux/tracehook.h
+++ b/include/linux/tracehook.h
@@ -187,6 +187,13 @@ static inline void tracehook_notify_resume(struct pt_regs *regs)
 	if (unlikely(current->task_works))
 		task_work_run();
 
+#ifdef CONFIG_KEYS_REQUEST_CACHE
+	if (unlikely(current->cached_requested_key)) {
+		key_put(current->cached_requested_key);
+		current->cached_requested_key = NULL;
+	}
+#endif
+
 	mem_cgroup_handle_over_high();
 	blkcg_maybe_throttle_current();
 }
diff --git a/kernel/cred.c b/kernel/cred.c
index 3bd40de9e192..26da7e77098f 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -174,6 +174,11 @@ void exit_creds(struct task_struct *tsk)
 	validate_creds(cred);
 	alter_cred_subscribers(cred, -1);
 	put_cred(cred);
+
+#ifdef CONFIG_KEYS_REQUEST_CACHE
+	key_put(tsk->cached_requested_key);
+	tsk->cached_requested_key = NULL;
+#endif
 }
 
 /**
@@ -327,6 +332,10 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags)
 	struct cred *new;
 	int ret;
 
+#ifdef CONFIG_KEYS_REQUEST_CACHE
+	p->cached_requested_key = NULL;
+#endif
+
 	if (
 #ifdef CONFIG_KEYS
 		!p->cred->thread_keyring &&
diff --git a/security/keys/Kconfig b/security/keys/Kconfig
index 6462e6654ccf..12f70b556d09 100644
--- a/security/keys/Kconfig
+++ b/security/keys/Kconfig
@@ -24,6 +24,24 @@ config KEYS_COMPAT
 	def_bool y
 	depends on COMPAT && KEYS
 
+config KEYS_REQUEST_CACHE
+	bool "Enable temporary caching of the last request_key() result"
+	depends on KEYS
+	help
+	  This option causes the result of the last successful request_key()
+	  call that didn't upcall to userspace to be cached temporarily in the
+	  task_struct.  The cache is cleared by exit and just prior to the
+	  resumption of userspace.
+
+	  This allows multi-step processes, in which each step wants to request
+	  a key that is likely the same as the one requested by the previous
+	  step, to save on searching.
+
+	  An example of such a process is a pathwalk through a network
+	  filesystem in which each method needs to request an authentication
+	  key.  Pathwalk will call multiple methods for each dentry traversed
+	  (permission, d_revalidate, lookup, getxattr, getacl, ...).
+
 config PERSISTENT_KEYRINGS
 	bool "Enable register of persistent per-UID keyrings"
 	depends on KEYS
diff --git a/security/keys/request_key.c b/security/keys/request_key.c
index b4b3677657d6..f289982cb5db 100644
--- a/security/keys/request_key.c
+++ b/security/keys/request_key.c
@@ -22,6 +22,31 @@
 
 #define key_negative_timeout	60	/* default timeout on a negative key's existence */
 
+static struct key *check_cached_key(struct keyring_search_context *ctx)
+{
+#ifdef CONFIG_KEYS_REQUEST_CACHE
+	struct key *key = current->cached_requested_key;
+
+	if (key &&
+	    ctx->match_data.cmp(key, &ctx->match_data) &&
+	    !(key->flags & ((1 << KEY_FLAG_INVALIDATED) |
+			    (1 << KEY_FLAG_REVOKED))))
+		return key_get(key);
+#endif
+	return NULL;
+}
+
+static void cache_requested_key(struct key *key)
+{
+#ifdef CONFIG_KEYS_REQUEST_CACHE
+	struct task_struct *t = current;
+
+	key_put(t->cached_requested_key);
+	t->cached_requested_key = key_get(key);
+	set_tsk_thread_flag(t, TIF_NOTIFY_RESUME);
+#endif
+}
+
 /**
  * complete_request_key - Complete the construction of a key.
  * @authkey: The authorisation key.
@@ -562,6 +587,10 @@ struct key *request_key_and_link(struct key_type *type,
 		}
 	}
 
+	key = check_cached_key(&ctx);
+	if (key)
+		return key;
+
 	/* search all the process keyrings for a key */
 	rcu_read_lock();
 	key_ref = search_process_keyrings_rcu(&ctx);
@@ -587,6 +616,9 @@ struct key *request_key_and_link(struct key_type *type,
 				goto error_free;
 			}
 		}
+
+		/* Only cache the key on immediate success */
+		cache_requested_key(key);
 	} else if (PTR_ERR(key_ref) != -EAGAIN) {
 		key = ERR_CAST(key_ref);
 	} else  {
@@ -786,6 +818,10 @@ struct key *request_key_rcu(struct key_type *type, const char *description)
 
 	kenter("%s,%s", type->name, description);
 
+	key = check_cached_key(&ctx);
+	if (key)
+		return key;
+
 	/* search all the process keyrings for a key */
 	key_ref = search_process_keyrings_rcu(&ctx);
 	if (IS_ERR(key_ref)) {
@@ -794,6 +830,7 @@ struct key *request_key_rcu(struct key_type *type, const char *description)
 			key = ERR_PTR(-ENOKEY);
 	} else {
 		key = key_ref_to_ptr(key_ref);
+		cache_requested_key(key);
 	}
 
 	kleave(" = %p", key);
-- 
cgit v1.2.3


From 3ae72562ad917df36a1b1247d749240e3b4865db Mon Sep 17 00:00:00 2001
From: Gabriel Krisman Bertazi <krisman@collabora.com>
Date: Wed, 19 Jun 2019 23:45:09 -0400
Subject: ext4: optimize case-insensitive lookups

Temporarily cache a casefolded version of the file name under lookup in
ext4_filename, to avoid repeatedly casefolding it.  I got up to 30%
speedup on lookups of large directories (>100k entries), depending on
the length of the string under lookup.

Signed-off-by: Gabriel Krisman Bertazi <krisman@collabora.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/ext4/dir.c           |  2 +-
 fs/ext4/ext4.h          | 39 ++++++++++++++++++++++++++++++++++++---
 fs/ext4/namei.c         | 43 ++++++++++++++++++++++++++++++++++++++-----
 fs/unicode/utf8-core.c  | 28 ++++++++++++++++++++++++++++
 include/linux/unicode.h |  3 +++
 5 files changed, 106 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index 1f7784bee42a..770a1e6d4672 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -677,7 +677,7 @@ static int ext4_d_compare(const struct dentry *dentry, unsigned int len,
 		return memcmp(str, name->name, len);
 	}
 
-	return ext4_ci_compare(dentry->d_parent->d_inode, name, &qstr);
+	return ext4_ci_compare(dentry->d_parent->d_inode, name, &qstr, false);
 }
 
 static int ext4_d_hash(const struct dentry *dentry, struct qstr *str)
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index ceb74093e138..7215a2a2a0de 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -2078,6 +2078,9 @@ struct ext4_filename {
 #ifdef CONFIG_FS_ENCRYPTION
 	struct fscrypt_str crypto_buf;
 #endif
+#ifdef CONFIG_UNICODE
+	struct fscrypt_str cf_name;
+#endif
 };
 
 #define fname_name(p) ((p)->disk_name.name)
@@ -2303,6 +2306,12 @@ extern unsigned ext4_free_clusters_after_init(struct super_block *sb,
 					      struct ext4_group_desc *gdp);
 ext4_fsblk_t ext4_inode_to_goal_block(struct inode *);
 
+#ifdef CONFIG_UNICODE
+extern void ext4_fname_setup_ci_filename(struct inode *dir,
+					 const struct qstr *iname,
+					 struct fscrypt_str *fname);
+#endif
+
 #ifdef CONFIG_FS_ENCRYPTION
 static inline void ext4_fname_from_fscrypt_name(struct ext4_filename *dst,
 						const struct fscrypt_name *src)
@@ -2329,6 +2338,10 @@ static inline int ext4_fname_setup_filename(struct inode *dir,
 		return err;
 
 	ext4_fname_from_fscrypt_name(fname, &name);
+
+#ifdef CONFIG_UNICODE
+	ext4_fname_setup_ci_filename(dir, iname, &fname->cf_name);
+#endif
 	return 0;
 }
 
@@ -2344,6 +2357,10 @@ static inline int ext4_fname_prepare_lookup(struct inode *dir,
 		return err;
 
 	ext4_fname_from_fscrypt_name(fname, &name);
+
+#ifdef CONFIG_UNICODE
+	ext4_fname_setup_ci_filename(dir, &dentry->d_name, &fname->cf_name);
+#endif
 	return 0;
 }
 
@@ -2357,6 +2374,11 @@ static inline void ext4_fname_free_filename(struct ext4_filename *fname)
 	fname->crypto_buf.name = NULL;
 	fname->usr_fname = NULL;
 	fname->disk_name.name = NULL;
+
+#ifdef CONFIG_UNICODE
+	kfree(fname->cf_name.name);
+	fname->cf_name.name = NULL;
+#endif
 }
 #else /* !CONFIG_FS_ENCRYPTION */
 static inline int ext4_fname_setup_filename(struct inode *dir,
@@ -2367,6 +2389,11 @@ static inline int ext4_fname_setup_filename(struct inode *dir,
 	fname->usr_fname = iname;
 	fname->disk_name.name = (unsigned char *) iname->name;
 	fname->disk_name.len = iname->len;
+
+#ifdef CONFIG_UNICODE
+	ext4_fname_setup_ci_filename(dir, iname, &fname->cf_name);
+#endif
+
 	return 0;
 }
 
@@ -2377,7 +2404,13 @@ static inline int ext4_fname_prepare_lookup(struct inode *dir,
 	return ext4_fname_setup_filename(dir, &dentry->d_name, 1, fname);
 }
 
-static inline void ext4_fname_free_filename(struct ext4_filename *fname) { }
+static inline void ext4_fname_free_filename(struct ext4_filename *fname)
+{
+#ifdef CONFIG_UNICODE
+	kfree(fname->cf_name.name);
+	fname->cf_name.name = NULL;
+#endif
+}
 #endif /* !CONFIG_FS_ENCRYPTION */
 
 /* dir.c */
@@ -3120,8 +3153,8 @@ extern int ext4_handle_dirty_dirent_node(handle_t *handle,
 					 struct inode *inode,
 					 struct buffer_head *bh);
 extern int ext4_ci_compare(const struct inode *parent,
-			   const struct qstr *name,
-			   const struct qstr *entry);
+			   const struct qstr *fname,
+			   const struct qstr *entry, bool quick);
 
 #define S_SHIFT 12
 static const unsigned char ext4_type_by_mode[(S_IFMT >> S_SHIFT) + 1] = {
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index cd01c4a67ffb..4909ced4e672 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -1259,19 +1259,24 @@ static void dx_insert_block(struct dx_frame *frame, u32 hash, ext4_lblk_t block)
 #ifdef CONFIG_UNICODE
 /*
  * Test whether a case-insensitive directory entry matches the filename
- * being searched for.
+ * being searched for.  If quick is set, assume the name being looked up
+ * is already in the casefolded form.
  *
  * Returns: 0 if the directory entry matches, more than 0 if it
  * doesn't match or less than zero on error.
  */
 int ext4_ci_compare(const struct inode *parent, const struct qstr *name,
-		    const struct qstr *entry)
+		    const struct qstr *entry, bool quick)
 {
 	const struct ext4_sb_info *sbi = EXT4_SB(parent->i_sb);
 	const struct unicode_map *um = sbi->s_encoding;
 	int ret;
 
-	ret = utf8_strncasecmp(um, name, entry);
+	if (quick)
+		ret = utf8_strncasecmp_folded(um, name, entry);
+	else
+		ret = utf8_strncasecmp(um, name, entry);
+
 	if (ret < 0) {
 		/* Handle invalid character sequence as either an error
 		 * or as an opaque byte sequence.
@@ -1287,6 +1292,27 @@ int ext4_ci_compare(const struct inode *parent, const struct qstr *name,
 
 	return ret;
 }
+
+void ext4_fname_setup_ci_filename(struct inode *dir, const struct qstr *iname,
+				  struct fscrypt_str *cf_name)
+{
+	if (!IS_CASEFOLDED(dir)) {
+		cf_name->name = NULL;
+		return;
+	}
+
+	cf_name->name = kmalloc(EXT4_NAME_LEN, GFP_NOFS);
+	if (!cf_name->name)
+		return;
+
+	cf_name->len = utf8_casefold(EXT4_SB(dir->i_sb)->s_encoding,
+				     iname, cf_name->name,
+				     EXT4_NAME_LEN);
+	if (cf_name->len <= 0) {
+		kfree(cf_name->name);
+		cf_name->name = NULL;
+	}
+}
 #endif
 
 /*
@@ -1313,8 +1339,15 @@ static inline bool ext4_match(const struct inode *parent,
 #endif
 
 #ifdef CONFIG_UNICODE
-	if (EXT4_SB(parent->i_sb)->s_encoding && IS_CASEFOLDED(parent))
-		return (ext4_ci_compare(parent, fname->usr_fname, &entry) == 0);
+	if (EXT4_SB(parent->i_sb)->s_encoding && IS_CASEFOLDED(parent)) {
+		if (fname->cf_name.name) {
+			struct qstr cf = {.name = fname->cf_name.name,
+					  .len = fname->cf_name.len};
+			return !ext4_ci_compare(parent, &cf, &entry, true);
+		}
+		return !ext4_ci_compare(parent, fname->usr_fname, &entry,
+					false);
+	}
 #endif
 
 	return fscrypt_match_name(&f, de->name, de->name_len);
diff --git a/fs/unicode/utf8-core.c b/fs/unicode/utf8-core.c
index 6afab4fdce90..71ca4d047d65 100644
--- a/fs/unicode/utf8-core.c
+++ b/fs/unicode/utf8-core.c
@@ -73,6 +73,34 @@ int utf8_strncasecmp(const struct unicode_map *um,
 }
 EXPORT_SYMBOL(utf8_strncasecmp);
 
+/* String cf is expected to be a valid UTF-8 casefolded
+ * string.
+ */
+int utf8_strncasecmp_folded(const struct unicode_map *um,
+			    const struct qstr *cf,
+			    const struct qstr *s1)
+{
+	const struct utf8data *data = utf8nfdicf(um->version);
+	struct utf8cursor cur1;
+	int c1, c2;
+	int i = 0;
+
+	if (utf8ncursor(&cur1, data, s1->name, s1->len) < 0)
+		return -EINVAL;
+
+	do {
+		c1 = utf8byte(&cur1);
+		c2 = cf->name[i++];
+		if (c1 < 0)
+			return -EINVAL;
+		if (c1 != c2)
+			return 1;
+	} while (c1);
+
+	return 0;
+}
+EXPORT_SYMBOL(utf8_strncasecmp_folded);
+
 int utf8_casefold(const struct unicode_map *um, const struct qstr *str,
 		  unsigned char *dest, size_t dlen)
 {
diff --git a/include/linux/unicode.h b/include/linux/unicode.h
index aec2c6d800aa..990aa97d8049 100644
--- a/include/linux/unicode.h
+++ b/include/linux/unicode.h
@@ -17,6 +17,9 @@ int utf8_strncmp(const struct unicode_map *um,
 
 int utf8_strncasecmp(const struct unicode_map *um,
 		 const struct qstr *s1, const struct qstr *s2);
+int utf8_strncasecmp_folded(const struct unicode_map *um,
+			    const struct qstr *cf,
+			    const struct qstr *s1);
 
 int utf8_normalize(const struct unicode_map *um, const struct qstr *str,
 		   unsigned char *dest, size_t dlen);
-- 
cgit v1.2.3


From f03631da4be33219021323630a8cf788fd066267 Mon Sep 17 00:00:00 2001
From: Suzuki K Poulose <suzuki.poulose@arm.com>
Date: Wed, 19 Jun 2019 13:52:54 -0600
Subject: coresight: Introduce generic platform data helper

So far we have hard coded the DT platform parsing code in
every driver. Introduce a generic helper to parse the information
provided by the firmware in a platform agnostic manner, in preparation
for the ACPI support.

Signed-off-by: Suzuki K Poulose <suzuki.poulose@arm.com>
Signed-off-by: Mathieu Poirier <mathieu.poirier@linaro.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/hwtracing/coresight/coresight-catu.c       | 13 ++---
 drivers/hwtracing/coresight/coresight-etb10.c      | 11 ++--
 drivers/hwtracing/coresight/coresight-etm3x.c      | 12 ++---
 drivers/hwtracing/coresight/coresight-etm4x.c      | 11 ++--
 drivers/hwtracing/coresight/coresight-funnel.c     | 11 ++--
 drivers/hwtracing/coresight/coresight-platform.c   | 58 ++++++++++++++++------
 drivers/hwtracing/coresight/coresight-replicator.c | 11 ++--
 drivers/hwtracing/coresight/coresight-stm.c        | 11 ++--
 drivers/hwtracing/coresight/coresight-tmc.c        | 13 ++---
 drivers/hwtracing/coresight/coresight-tpiu.c       | 11 ++--
 include/linux/coresight.h                          |  7 +--
 11 files changed, 83 insertions(+), 86 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/hwtracing/coresight/coresight-catu.c b/drivers/hwtracing/coresight/coresight-catu.c
index 63109c98765c..799ba1dd877e 100644
--- a/drivers/hwtracing/coresight/coresight-catu.c
+++ b/drivers/hwtracing/coresight/coresight-catu.c
@@ -503,17 +503,14 @@ static int catu_probe(struct amba_device *adev, const struct amba_id *id)
 	struct coresight_desc catu_desc;
 	struct coresight_platform_data *pdata = NULL;
 	struct device *dev = &adev->dev;
-	struct device_node *np = dev->of_node;
 	void __iomem *base;
 
-	if (np) {
-		pdata = of_get_coresight_platform_data(dev, np);
-		if (IS_ERR(pdata)) {
-			ret = PTR_ERR(pdata);
-			goto out;
-		}
-		dev->platform_data = pdata;
+	pdata = coresight_get_platform_data(dev);
+	if (IS_ERR(pdata)) {
+		ret = PTR_ERR(pdata);
+		goto out;
 	}
+	dev->platform_data = pdata;
 
 	drvdata = devm_kzalloc(dev, sizeof(*drvdata), GFP_KERNEL);
 	if (!drvdata) {
diff --git a/drivers/hwtracing/coresight/coresight-etb10.c b/drivers/hwtracing/coresight/coresight-etb10.c
index 0c9161fe4233..cb8cb03e0cdb 100644
--- a/drivers/hwtracing/coresight/coresight-etb10.c
+++ b/drivers/hwtracing/coresight/coresight-etb10.c
@@ -732,14 +732,11 @@ static int etb_probe(struct amba_device *adev, const struct amba_id *id)
 	struct etb_drvdata *drvdata;
 	struct resource *res = &adev->res;
 	struct coresight_desc desc = { 0 };
-	struct device_node *np = adev->dev.of_node;
 
-	if (np) {
-		pdata = of_get_coresight_platform_data(dev, np);
-		if (IS_ERR(pdata))
-			return PTR_ERR(pdata);
-		adev->dev.platform_data = pdata;
-	}
+	pdata = coresight_get_platform_data(dev);
+	if (IS_ERR(pdata))
+		return PTR_ERR(pdata);
+	adev->dev.platform_data = pdata;
 
 	drvdata = devm_kzalloc(dev, sizeof(*drvdata), GFP_KERNEL);
 	if (!drvdata)
diff --git a/drivers/hwtracing/coresight/coresight-etm3x.c b/drivers/hwtracing/coresight/coresight-etm3x.c
index fa2f1417cafb..fa2164ff07c2 100644
--- a/drivers/hwtracing/coresight/coresight-etm3x.c
+++ b/drivers/hwtracing/coresight/coresight-etm3x.c
@@ -790,20 +790,16 @@ static int etm_probe(struct amba_device *adev, const struct amba_id *id)
 	struct etm_drvdata *drvdata;
 	struct resource *res = &adev->res;
 	struct coresight_desc desc = { 0 };
-	struct device_node *np = adev->dev.of_node;
 
 	drvdata = devm_kzalloc(dev, sizeof(*drvdata), GFP_KERNEL);
 	if (!drvdata)
 		return -ENOMEM;
 
-	if (np) {
-		pdata = of_get_coresight_platform_data(dev, np);
-		if (IS_ERR(pdata))
-			return PTR_ERR(pdata);
-
-		adev->dev.platform_data = pdata;
-	}
+	pdata = coresight_get_platform_data(dev);
+	if (IS_ERR(pdata))
+		return PTR_ERR(pdata);
 
+	adev->dev.platform_data = pdata;
 	drvdata->use_cp14 = fwnode_property_read_bool(dev->fwnode, "arm,cp14");
 	dev_set_drvdata(dev, drvdata);
 
diff --git a/drivers/hwtracing/coresight/coresight-etm4x.c b/drivers/hwtracing/coresight/coresight-etm4x.c
index 77d1d837da52..4355b2e8c308 100644
--- a/drivers/hwtracing/coresight/coresight-etm4x.c
+++ b/drivers/hwtracing/coresight/coresight-etm4x.c
@@ -1084,18 +1084,15 @@ static int etm4_probe(struct amba_device *adev, const struct amba_id *id)
 	struct etmv4_drvdata *drvdata;
 	struct resource *res = &adev->res;
 	struct coresight_desc desc = { 0 };
-	struct device_node *np = adev->dev.of_node;
 
 	drvdata = devm_kzalloc(dev, sizeof(*drvdata), GFP_KERNEL);
 	if (!drvdata)
 		return -ENOMEM;
 
-	if (np) {
-		pdata = of_get_coresight_platform_data(dev, np);
-		if (IS_ERR(pdata))
-			return PTR_ERR(pdata);
-		adev->dev.platform_data = pdata;
-	}
+	pdata = coresight_get_platform_data(dev);
+	if (IS_ERR(pdata))
+		return PTR_ERR(pdata);
+	adev->dev.platform_data = pdata;
 
 	dev_set_drvdata(dev, drvdata);
 
diff --git a/drivers/hwtracing/coresight/coresight-funnel.c b/drivers/hwtracing/coresight/coresight-funnel.c
index 3423042e7a52..fc033fdb6cd5 100644
--- a/drivers/hwtracing/coresight/coresight-funnel.c
+++ b/drivers/hwtracing/coresight/coresight-funnel.c
@@ -187,14 +187,11 @@ static int funnel_probe(struct device *dev, struct resource *res)
 	struct coresight_platform_data *pdata = NULL;
 	struct funnel_drvdata *drvdata;
 	struct coresight_desc desc = { 0 };
-	struct device_node *np = dev->of_node;
 
-	if (np) {
-		pdata = of_get_coresight_platform_data(dev, np);
-		if (IS_ERR(pdata))
-			return PTR_ERR(pdata);
-		dev->platform_data = pdata;
-	}
+	pdata = coresight_get_platform_data(dev);
+	if (IS_ERR(pdata))
+		return PTR_ERR(pdata);
+	dev->platform_data = pdata;
 
 	if (is_of_node(dev_fwnode(dev)) &&
 	    of_device_is_compatible(dev->of_node, "arm,coresight-funnel"))
diff --git a/drivers/hwtracing/coresight/coresight-platform.c b/drivers/hwtracing/coresight/coresight-platform.c
index 4c31299607cf..5d78f4fbd97d 100644
--- a/drivers/hwtracing/coresight/coresight-platform.c
+++ b/drivers/hwtracing/coresight/coresight-platform.c
@@ -230,23 +230,16 @@ static int of_coresight_parse_endpoint(struct device *dev,
 	return ret;
 }
 
-struct coresight_platform_data *
-of_get_coresight_platform_data(struct device *dev,
-			       const struct device_node *node)
+static int of_get_coresight_platform_data(struct device *dev,
+					  struct coresight_platform_data *pdata)
 {
 	int ret = 0;
-	struct coresight_platform_data *pdata;
 	struct coresight_connection *conn;
 	struct device_node *ep = NULL;
 	const struct device_node *parent = NULL;
 	bool legacy_binding = false;
+	struct device_node *node = dev->of_node;
 
-	pdata = devm_kzalloc(dev, sizeof(*pdata), GFP_KERNEL);
-	if (!pdata)
-		return ERR_PTR(-ENOMEM);
-
-	/* Use device name as sysfs handle */
-	pdata->name = dev_name(dev);
 	pdata->cpu = of_coresight_get_cpu(node);
 
 	/* Get the number of input and output port for this component */
@@ -254,11 +247,11 @@ of_get_coresight_platform_data(struct device *dev,
 
 	/* If there are no output connections, we are done */
 	if (!pdata->nr_outport)
-		return pdata;
+		return 0;
 
 	ret = coresight_alloc_conns(dev, pdata);
 	if (ret)
-		return ERR_PTR(ret);
+		return ret;
 
 	parent = of_coresight_get_output_ports_node(node);
 	/*
@@ -292,11 +285,46 @@ of_get_coresight_platform_data(struct device *dev,
 		case 0:
 			break;
 		default:
-			return ERR_PTR(ret);
+			return ret;
 		}
 	}
 
-	return pdata;
+	return 0;
+}
+#else
+static inline int
+of_get_coresight_platform_data(struct device *dev,
+			       struct coresight_platform_data *pdata)
+{
+	return -ENOENT;
 }
-EXPORT_SYMBOL_GPL(of_get_coresight_platform_data);
 #endif
+
+struct coresight_platform_data *
+coresight_get_platform_data(struct device *dev)
+{
+	int ret = -ENOENT;
+	struct coresight_platform_data *pdata;
+	struct fwnode_handle *fwnode = dev_fwnode(dev);
+
+	if (IS_ERR_OR_NULL(fwnode))
+		goto error;
+
+	pdata = devm_kzalloc(dev, sizeof(*pdata), GFP_KERNEL);
+	if (!pdata) {
+		ret = -ENOMEM;
+		goto error;
+	}
+
+	/* Use device name as sysfs handle */
+	pdata->name = dev_name(dev);
+
+	if (is_of_node(fwnode))
+		ret = of_get_coresight_platform_data(dev, pdata);
+
+	if (!ret)
+		return pdata;
+error:
+	return ERR_PTR(ret);
+}
+EXPORT_SYMBOL_GPL(coresight_get_platform_data);
diff --git a/drivers/hwtracing/coresight/coresight-replicator.c b/drivers/hwtracing/coresight/coresight-replicator.c
index 7e0514557920..054b33554695 100644
--- a/drivers/hwtracing/coresight/coresight-replicator.c
+++ b/drivers/hwtracing/coresight/coresight-replicator.c
@@ -177,15 +177,12 @@ static int replicator_probe(struct device *dev, struct resource *res)
 	struct coresight_platform_data *pdata = NULL;
 	struct replicator_drvdata *drvdata;
 	struct coresight_desc desc = { 0 };
-	struct device_node *np = dev->of_node;
 	void __iomem *base;
 
-	if (np) {
-		pdata = of_get_coresight_platform_data(dev, np);
-		if (IS_ERR(pdata))
-			return PTR_ERR(pdata);
-		dev->platform_data = pdata;
-	}
+	pdata = coresight_get_platform_data(dev);
+	if (IS_ERR(pdata))
+		return PTR_ERR(pdata);
+	dev->platform_data = pdata;
 
 	if (is_of_node(dev_fwnode(dev)) &&
 	    of_device_is_compatible(dev->of_node, "arm,coresight-replicator"))
diff --git a/drivers/hwtracing/coresight/coresight-stm.c b/drivers/hwtracing/coresight/coresight-stm.c
index 3992a35e34e4..9faa1ed01500 100644
--- a/drivers/hwtracing/coresight/coresight-stm.c
+++ b/drivers/hwtracing/coresight/coresight-stm.c
@@ -809,14 +809,11 @@ static int stm_probe(struct amba_device *adev, const struct amba_id *id)
 	struct resource ch_res;
 	size_t bitmap_size;
 	struct coresight_desc desc = { 0 };
-	struct device_node *np = adev->dev.of_node;
 
-	if (np) {
-		pdata = of_get_coresight_platform_data(dev, np);
-		if (IS_ERR(pdata))
-			return PTR_ERR(pdata);
-		adev->dev.platform_data = pdata;
-	}
+	pdata = coresight_get_platform_data(dev);
+	if (IS_ERR(pdata))
+		return PTR_ERR(pdata);
+	adev->dev.platform_data = pdata;
 	drvdata = devm_kzalloc(dev, sizeof(*drvdata), GFP_KERNEL);
 	if (!drvdata)
 		return -ENOMEM;
diff --git a/drivers/hwtracing/coresight/coresight-tmc.c b/drivers/hwtracing/coresight/coresight-tmc.c
index 9c5e615c1486..be0bd98ca8c1 100644
--- a/drivers/hwtracing/coresight/coresight-tmc.c
+++ b/drivers/hwtracing/coresight/coresight-tmc.c
@@ -397,16 +397,13 @@ static int tmc_probe(struct amba_device *adev, const struct amba_id *id)
 	struct tmc_drvdata *drvdata;
 	struct resource *res = &adev->res;
 	struct coresight_desc desc = { 0 };
-	struct device_node *np = adev->dev.of_node;
 
-	if (np) {
-		pdata = of_get_coresight_platform_data(dev, np);
-		if (IS_ERR(pdata)) {
-			ret = PTR_ERR(pdata);
-			goto out;
-		}
-		adev->dev.platform_data = pdata;
+	pdata = coresight_get_platform_data(dev);
+	if (IS_ERR(pdata)) {
+		ret = PTR_ERR(pdata);
+		goto out;
 	}
+	adev->dev.platform_data = pdata;
 
 	ret = -ENOMEM;
 	drvdata = devm_kzalloc(dev, sizeof(*drvdata), GFP_KERNEL);
diff --git a/drivers/hwtracing/coresight/coresight-tpiu.c b/drivers/hwtracing/coresight/coresight-tpiu.c
index 4dd3e7f63050..aec0ed7bf924 100644
--- a/drivers/hwtracing/coresight/coresight-tpiu.c
+++ b/drivers/hwtracing/coresight/coresight-tpiu.c
@@ -124,14 +124,11 @@ static int tpiu_probe(struct amba_device *adev, const struct amba_id *id)
 	struct tpiu_drvdata *drvdata;
 	struct resource *res = &adev->res;
 	struct coresight_desc desc = { 0 };
-	struct device_node *np = adev->dev.of_node;
 
-	if (np) {
-		pdata = of_get_coresight_platform_data(dev, np);
-		if (IS_ERR(pdata))
-			return PTR_ERR(pdata);
-		adev->dev.platform_data = pdata;
-	}
+	pdata = coresight_get_platform_data(dev);
+	if (IS_ERR(pdata))
+		return PTR_ERR(pdata);
+	dev->platform_data = pdata;
 
 	drvdata = devm_kzalloc(dev, sizeof(*drvdata), GFP_KERNEL);
 	if (!drvdata)
diff --git a/include/linux/coresight.h b/include/linux/coresight.h
index 62a520df8add..e2b95e05e0be 100644
--- a/include/linux/coresight.h
+++ b/include/linux/coresight.h
@@ -294,14 +294,11 @@ static inline void coresight_disclaim_device_unlocked(void __iomem *base) {}
 
 #ifdef CONFIG_OF
 extern int of_coresight_get_cpu(const struct device_node *node);
-extern struct coresight_platform_data *
-of_get_coresight_platform_data(struct device *dev,
-			       const struct device_node *node);
 #else
 static inline int of_coresight_get_cpu(const struct device_node *node)
 { return 0; }
-static inline struct coresight_platform_data *of_get_coresight_platform_data(
-	struct device *dev, const struct device_node *node) { return NULL; }
 #endif
 
+struct coresight_platform_data *coresight_get_platform_data(struct device *dev);
+
 #endif
-- 
cgit v1.2.3


From 91824db2ea2d2bacacd54de55a7faba10c63b166 Mon Sep 17 00:00:00 2001
From: Suzuki K Poulose <suzuki.poulose@arm.com>
Date: Wed, 19 Jun 2019 13:52:55 -0600
Subject: coresight: Make device to CPU mapping generic

The CoreSight components ETM and CPU-Debug are always associated
with CPUs. Replace the of_coresight_get_cpu() with a platform
agnostic helper, in preparation to add ACPI support.

Signed-off-by: Suzuki K Poulose <suzuki.poulose@arm.com>
Signed-off-by: Mathieu Poirier <mathieu.poirier@linaro.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/hwtracing/coresight/coresight-cpu-debug.c |  3 +--
 drivers/hwtracing/coresight/coresight-platform.c  | 18 +++++++++++++-----
 include/linux/coresight.h                         |  7 +------
 3 files changed, 15 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/hwtracing/coresight/coresight-cpu-debug.c b/drivers/hwtracing/coresight/coresight-cpu-debug.c
index e8819d750938..07a1367c733f 100644
--- a/drivers/hwtracing/coresight/coresight-cpu-debug.c
+++ b/drivers/hwtracing/coresight/coresight-cpu-debug.c
@@ -572,14 +572,13 @@ static int debug_probe(struct amba_device *adev, const struct amba_id *id)
 	struct device *dev = &adev->dev;
 	struct debug_drvdata *drvdata;
 	struct resource *res = &adev->res;
-	struct device_node *np = adev->dev.of_node;
 	int ret;
 
 	drvdata = devm_kzalloc(dev, sizeof(*drvdata), GFP_KERNEL);
 	if (!drvdata)
 		return -ENOMEM;
 
-	drvdata->cpu = np ? of_coresight_get_cpu(np) : 0;
+	drvdata->cpu = coresight_get_cpu(dev);
 	if (per_cpu(debug_drvdata, drvdata->cpu)) {
 		dev_err(dev, "CPU%d drvdata has already been initialized\n",
 			drvdata->cpu);
diff --git a/drivers/hwtracing/coresight/coresight-platform.c b/drivers/hwtracing/coresight/coresight-platform.c
index 5d78f4fbd97d..ba8c14635c6b 100644
--- a/drivers/hwtracing/coresight/coresight-platform.c
+++ b/drivers/hwtracing/coresight/coresight-platform.c
@@ -151,12 +151,14 @@ static void of_coresight_get_ports(const struct device_node *node,
 	}
 }
 
-int of_coresight_get_cpu(const struct device_node *node)
+static int of_coresight_get_cpu(struct device *dev)
 {
 	int cpu;
 	struct device_node *dn;
 
-	dn = of_parse_phandle(node, "cpu", 0);
+	if (!dev->of_node)
+		return 0;
+	dn = of_parse_phandle(dev->of_node, "cpu", 0);
 	/* Affinity defaults to CPU0 */
 	if (!dn)
 		return 0;
@@ -166,7 +168,6 @@ int of_coresight_get_cpu(const struct device_node *node)
 	/* Affinity to CPU0 if no cpu nodes are found */
 	return (cpu < 0) ? 0 : cpu;
 }
-EXPORT_SYMBOL_GPL(of_coresight_get_cpu);
 
 /*
  * of_coresight_parse_endpoint : Parse the given output endpoint @ep
@@ -240,8 +241,6 @@ static int of_get_coresight_platform_data(struct device *dev,
 	bool legacy_binding = false;
 	struct device_node *node = dev->of_node;
 
-	pdata->cpu = of_coresight_get_cpu(node);
-
 	/* Get the number of input and output port for this component */
 	of_coresight_get_ports(node, &pdata->nr_inport, &pdata->nr_outport);
 
@@ -300,6 +299,14 @@ of_get_coresight_platform_data(struct device *dev,
 }
 #endif
 
+int coresight_get_cpu(struct device *dev)
+{
+	if (is_of_node(dev->fwnode))
+		return of_coresight_get_cpu(dev);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(coresight_get_cpu);
+
 struct coresight_platform_data *
 coresight_get_platform_data(struct device *dev)
 {
@@ -318,6 +325,7 @@ coresight_get_platform_data(struct device *dev)
 
 	/* Use device name as sysfs handle */
 	pdata->name = dev_name(dev);
+	pdata->cpu = coresight_get_cpu(dev);
 
 	if (is_of_node(fwnode))
 		ret = of_get_coresight_platform_data(dev, pdata);
diff --git a/include/linux/coresight.h b/include/linux/coresight.h
index e2b95e05e0be..98a4440dea3e 100644
--- a/include/linux/coresight.h
+++ b/include/linux/coresight.h
@@ -292,12 +292,7 @@ static inline void coresight_disclaim_device_unlocked(void __iomem *base) {}
 
 #endif
 
-#ifdef CONFIG_OF
-extern int of_coresight_get_cpu(const struct device_node *node);
-#else
-static inline int of_coresight_get_cpu(const struct device_node *node)
-{ return 0; }
-#endif
+extern int coresight_get_cpu(struct device *dev);
 
 struct coresight_platform_data *coresight_get_platform_data(struct device *dev);
 
-- 
cgit v1.2.3


From aff70a45fe3120b08ae459a6e3996346d2766b1f Mon Sep 17 00:00:00 2001
From: Suzuki K Poulose <suzuki.poulose@arm.com>
Date: Wed, 19 Jun 2019 13:52:56 -0600
Subject: coresight: Remove cpu field from platform data

CPU field is only used by ETMs and there is a separate API
for fetching the same. So, let us use that instead of using
the common platform probing helper. Also, remove it from the
platform_data.

Signed-off-by: Suzuki K Poulose <suzuki.poulose@arm.com>
Signed-off-by: Mathieu Poirier <mathieu.poirier@linaro.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/hwtracing/coresight/coresight-etm3x.c    | 2 +-
 drivers/hwtracing/coresight/coresight-etm4x.c    | 2 +-
 drivers/hwtracing/coresight/coresight-platform.c | 1 -
 include/linux/coresight.h                        | 2 --
 4 files changed, 2 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/hwtracing/coresight/coresight-etm3x.c b/drivers/hwtracing/coresight/coresight-etm3x.c
index fa2164ff07c2..722fab9632ef 100644
--- a/drivers/hwtracing/coresight/coresight-etm3x.c
+++ b/drivers/hwtracing/coresight/coresight-etm3x.c
@@ -819,7 +819,7 @@ static int etm_probe(struct amba_device *adev, const struct amba_id *id)
 			return ret;
 	}
 
-	drvdata->cpu = pdata ? pdata->cpu : 0;
+	drvdata->cpu = coresight_get_cpu(dev);
 
 	cpus_read_lock();
 	etmdrvdata[drvdata->cpu] = drvdata;
diff --git a/drivers/hwtracing/coresight/coresight-etm4x.c b/drivers/hwtracing/coresight/coresight-etm4x.c
index 4355b2e8c308..03576f3ed22d 100644
--- a/drivers/hwtracing/coresight/coresight-etm4x.c
+++ b/drivers/hwtracing/coresight/coresight-etm4x.c
@@ -1105,7 +1105,7 @@ static int etm4_probe(struct amba_device *adev, const struct amba_id *id)
 
 	spin_lock_init(&drvdata->spinlock);
 
-	drvdata->cpu = pdata ? pdata->cpu : 0;
+	drvdata->cpu = coresight_get_cpu(dev);
 
 	cpus_read_lock();
 	etmdrvdata[drvdata->cpu] = drvdata;
diff --git a/drivers/hwtracing/coresight/coresight-platform.c b/drivers/hwtracing/coresight/coresight-platform.c
index ba8c14635c6b..541e500a83c2 100644
--- a/drivers/hwtracing/coresight/coresight-platform.c
+++ b/drivers/hwtracing/coresight/coresight-platform.c
@@ -325,7 +325,6 @@ coresight_get_platform_data(struct device *dev)
 
 	/* Use device name as sysfs handle */
 	pdata->name = dev_name(dev);
-	pdata->cpu = coresight_get_cpu(dev);
 
 	if (is_of_node(fwnode))
 		ret = of_get_coresight_platform_data(dev, pdata);
diff --git a/include/linux/coresight.h b/include/linux/coresight.h
index 98a4440dea3e..bf241dbf99c5 100644
--- a/include/linux/coresight.h
+++ b/include/linux/coresight.h
@@ -91,14 +91,12 @@ union coresight_dev_subtype {
 
 /**
  * struct coresight_platform_data - data harvested from the DT specification
- * @cpu:	the CPU a source belongs to. Only applicable for ETM/PTMs.
  * @name:	name of the component as shown under sysfs.
  * @nr_inport:	number of input ports for this component.
  * @nr_outport:	number of output ports for this component.
  * @conns:	Array of nr_outport connections from this component
  */
 struct coresight_platform_data {
-	int cpu;
 	const char *name;
 	int nr_inport;
 	int nr_outport;
-- 
cgit v1.2.3


From 2ede79a6e8a541d1bc7c033b1198f05088e7cefb Mon Sep 17 00:00:00 2001
From: Suzuki K Poulose <suzuki.poulose@arm.com>
Date: Wed, 19 Jun 2019 13:52:57 -0600
Subject: coresight: Remove name from platform description

We are about to use a name independent of the parent AMBA device
name. As such, there is no need to have it in the platform description.
Let us move this to coresight description instead.

Signed-off-by: Suzuki K Poulose <suzuki.poulose@arm.com>
Signed-off-by: Mathieu Poirier <mathieu.poirier@linaro.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/hwtracing/coresight/coresight-catu.c       | 2 ++
 drivers/hwtracing/coresight/coresight-etb10.c      | 3 ++-
 drivers/hwtracing/coresight/coresight-etm3x.c      | 1 +
 drivers/hwtracing/coresight/coresight-etm4x.c      | 1 +
 drivers/hwtracing/coresight/coresight-funnel.c     | 1 +
 drivers/hwtracing/coresight/coresight-platform.c   | 3 ---
 drivers/hwtracing/coresight/coresight-replicator.c | 2 ++
 drivers/hwtracing/coresight/coresight-stm.c        | 1 +
 drivers/hwtracing/coresight/coresight-tmc.c        | 5 +++--
 drivers/hwtracing/coresight/coresight-tpiu.c       | 1 +
 drivers/hwtracing/coresight/coresight.c            | 2 +-
 include/linux/coresight.h                          | 8 ++++----
 12 files changed, 19 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/hwtracing/coresight/coresight-catu.c b/drivers/hwtracing/coresight/coresight-catu.c
index 799ba1dd877e..05c73045532a 100644
--- a/drivers/hwtracing/coresight/coresight-catu.c
+++ b/drivers/hwtracing/coresight/coresight-catu.c
@@ -551,6 +551,8 @@ static int catu_probe(struct amba_device *adev, const struct amba_id *id)
 	catu_desc.type = CORESIGHT_DEV_TYPE_HELPER;
 	catu_desc.subtype.helper_subtype = CORESIGHT_DEV_SUBTYPE_HELPER_CATU;
 	catu_desc.ops = &catu_ops;
+	catu_desc.name = dev_name(dev);
+
 	drvdata->csdev = coresight_register(&catu_desc);
 	if (IS_ERR(drvdata->csdev))
 		ret = PTR_ERR(drvdata->csdev);
diff --git a/drivers/hwtracing/coresight/coresight-etb10.c b/drivers/hwtracing/coresight/coresight-etb10.c
index cb8cb03e0cdb..8726d6c7663c 100644
--- a/drivers/hwtracing/coresight/coresight-etb10.c
+++ b/drivers/hwtracing/coresight/coresight-etb10.c
@@ -777,12 +777,13 @@ static int etb_probe(struct amba_device *adev, const struct amba_id *id)
 	desc.ops = &etb_cs_ops;
 	desc.pdata = pdata;
 	desc.dev = dev;
+	desc.name = dev_name(dev);
 	desc.groups = coresight_etb_groups;
 	drvdata->csdev = coresight_register(&desc);
 	if (IS_ERR(drvdata->csdev))
 		return PTR_ERR(drvdata->csdev);
 
-	drvdata->miscdev.name = pdata->name;
+	drvdata->miscdev.name = desc.name;
 	drvdata->miscdev.minor = MISC_DYNAMIC_MINOR;
 	drvdata->miscdev.fops = &etb_fops;
 	ret = misc_register(&drvdata->miscdev);
diff --git a/drivers/hwtracing/coresight/coresight-etm3x.c b/drivers/hwtracing/coresight/coresight-etm3x.c
index 722fab9632ef..101fb01e20de 100644
--- a/drivers/hwtracing/coresight/coresight-etm3x.c
+++ b/drivers/hwtracing/coresight/coresight-etm3x.c
@@ -854,6 +854,7 @@ static int etm_probe(struct amba_device *adev, const struct amba_id *id)
 	desc.ops = &etm_cs_ops;
 	desc.pdata = pdata;
 	desc.dev = dev;
+	desc.name = dev_name(dev);
 	desc.groups = coresight_etm_groups;
 	drvdata->csdev = coresight_register(&desc);
 	if (IS_ERR(drvdata->csdev)) {
diff --git a/drivers/hwtracing/coresight/coresight-etm4x.c b/drivers/hwtracing/coresight/coresight-etm4x.c
index 03576f3ed22d..8adc1485cd89 100644
--- a/drivers/hwtracing/coresight/coresight-etm4x.c
+++ b/drivers/hwtracing/coresight/coresight-etm4x.c
@@ -1142,6 +1142,7 @@ static int etm4_probe(struct amba_device *adev, const struct amba_id *id)
 	desc.pdata = pdata;
 	desc.dev = dev;
 	desc.groups = coresight_etmv4_groups;
+	desc.name = dev_name(dev);
 	drvdata->csdev = coresight_register(&desc);
 	if (IS_ERR(drvdata->csdev)) {
 		ret = PTR_ERR(drvdata->csdev);
diff --git a/drivers/hwtracing/coresight/coresight-funnel.c b/drivers/hwtracing/coresight/coresight-funnel.c
index fc033fdb6cd5..ded33f5e7d43 100644
--- a/drivers/hwtracing/coresight/coresight-funnel.c
+++ b/drivers/hwtracing/coresight/coresight-funnel.c
@@ -229,6 +229,7 @@ static int funnel_probe(struct device *dev, struct resource *res)
 	desc.ops = &funnel_cs_ops;
 	desc.pdata = pdata;
 	desc.dev = dev;
+	desc.name = dev_name(dev);
 	drvdata->csdev = coresight_register(&desc);
 	if (IS_ERR(drvdata->csdev)) {
 		ret = PTR_ERR(drvdata->csdev);
diff --git a/drivers/hwtracing/coresight/coresight-platform.c b/drivers/hwtracing/coresight/coresight-platform.c
index 541e500a83c2..f500de61e7f9 100644
--- a/drivers/hwtracing/coresight/coresight-platform.c
+++ b/drivers/hwtracing/coresight/coresight-platform.c
@@ -323,9 +323,6 @@ coresight_get_platform_data(struct device *dev)
 		goto error;
 	}
 
-	/* Use device name as sysfs handle */
-	pdata->name = dev_name(dev);
-
 	if (is_of_node(fwnode))
 		ret = of_get_coresight_platform_data(dev, pdata);
 
diff --git a/drivers/hwtracing/coresight/coresight-replicator.c b/drivers/hwtracing/coresight/coresight-replicator.c
index 054b33554695..f28bafd98995 100644
--- a/drivers/hwtracing/coresight/coresight-replicator.c
+++ b/drivers/hwtracing/coresight/coresight-replicator.c
@@ -220,6 +220,8 @@ static int replicator_probe(struct device *dev, struct resource *res)
 	desc.ops = &replicator_cs_ops;
 	desc.pdata = dev->platform_data;
 	desc.dev = dev;
+	desc.name = dev_name(dev);
+
 	drvdata->csdev = coresight_register(&desc);
 	if (IS_ERR(drvdata->csdev)) {
 		ret = PTR_ERR(drvdata->csdev);
diff --git a/drivers/hwtracing/coresight/coresight-stm.c b/drivers/hwtracing/coresight/coresight-stm.c
index 9faa1ed01500..02031d93fb8b 100644
--- a/drivers/hwtracing/coresight/coresight-stm.c
+++ b/drivers/hwtracing/coresight/coresight-stm.c
@@ -871,6 +871,7 @@ static int stm_probe(struct amba_device *adev, const struct amba_id *id)
 	desc.ops = &stm_cs_ops;
 	desc.pdata = pdata;
 	desc.dev = dev;
+	desc.name = dev_name(dev);
 	desc.groups = coresight_stm_groups;
 	drvdata->csdev = coresight_register(&desc);
 	if (IS_ERR(drvdata->csdev)) {
diff --git a/drivers/hwtracing/coresight/coresight-tmc.c b/drivers/hwtracing/coresight/coresight-tmc.c
index be0bd98ca8c1..44a571988219 100644
--- a/drivers/hwtracing/coresight/coresight-tmc.c
+++ b/drivers/hwtracing/coresight/coresight-tmc.c
@@ -437,6 +437,7 @@ static int tmc_probe(struct amba_device *adev, const struct amba_id *id)
 	desc.pdata = pdata;
 	desc.dev = dev;
 	desc.groups = coresight_tmc_groups;
+	desc.name = dev_name(dev);
 
 	switch (drvdata->config_type) {
 	case TMC_CONFIG_TYPE_ETB:
@@ -461,7 +462,7 @@ static int tmc_probe(struct amba_device *adev, const struct amba_id *id)
 		desc.ops = &tmc_etf_cs_ops;
 		break;
 	default:
-		pr_err("%s: Unsupported TMC config\n", pdata->name);
+		pr_err("%s: Unsupported TMC config\n", desc.name);
 		ret = -EINVAL;
 		goto out;
 	}
@@ -472,7 +473,7 @@ static int tmc_probe(struct amba_device *adev, const struct amba_id *id)
 		goto out;
 	}
 
-	drvdata->miscdev.name = pdata->name;
+	drvdata->miscdev.name = desc.name;
 	drvdata->miscdev.minor = MISC_DYNAMIC_MINOR;
 	drvdata->miscdev.fops = &tmc_fops;
 	ret = misc_register(&drvdata->miscdev);
diff --git a/drivers/hwtracing/coresight/coresight-tpiu.c b/drivers/hwtracing/coresight/coresight-tpiu.c
index aec0ed7bf924..d8a2e3991c7e 100644
--- a/drivers/hwtracing/coresight/coresight-tpiu.c
+++ b/drivers/hwtracing/coresight/coresight-tpiu.c
@@ -157,6 +157,7 @@ static int tpiu_probe(struct amba_device *adev, const struct amba_id *id)
 	desc.ops = &tpiu_cs_ops;
 	desc.pdata = pdata;
 	desc.dev = dev;
+	desc.name = dev_name(dev);
 	drvdata->csdev = coresight_register(&desc);
 
 	if (!IS_ERR(drvdata->csdev)) {
diff --git a/drivers/hwtracing/coresight/coresight.c b/drivers/hwtracing/coresight/coresight.c
index 4b130281236a..04b5d3c2bb3a 100644
--- a/drivers/hwtracing/coresight/coresight.c
+++ b/drivers/hwtracing/coresight/coresight.c
@@ -1199,7 +1199,7 @@ struct coresight_device *coresight_register(struct coresight_desc *desc)
 	csdev->dev.parent = desc->dev;
 	csdev->dev.release = coresight_device_release;
 	csdev->dev.bus = &coresight_bustype;
-	dev_set_name(&csdev->dev, "%s", desc->pdata->name);
+	dev_set_name(&csdev->dev, "%s", desc->name);
 
 	ret = device_register(&csdev->dev);
 	if (ret) {
diff --git a/include/linux/coresight.h b/include/linux/coresight.h
index bf241dbf99c5..298db20ba8ce 100644
--- a/include/linux/coresight.h
+++ b/include/linux/coresight.h
@@ -91,13 +91,11 @@ union coresight_dev_subtype {
 
 /**
  * struct coresight_platform_data - data harvested from the DT specification
- * @name:	name of the component as shown under sysfs.
  * @nr_inport:	number of input ports for this component.
  * @nr_outport:	number of output ports for this component.
  * @conns:	Array of nr_outport connections from this component
  */
 struct coresight_platform_data {
-	const char *name;
 	int nr_inport;
 	int nr_outport;
 	struct coresight_connection *conns;
@@ -108,11 +106,12 @@ struct coresight_platform_data {
  * @type:	as defined by @coresight_dev_type.
  * @subtype:	as defined by @coresight_dev_subtype.
  * @ops:	generic operations for this component, as defined
-		by @coresight_ops.
+ *		by @coresight_ops.
  * @pdata:	platform data collected from DT.
  * @dev:	The device entity associated to this component.
  * @groups:	operations specific to this component. These will end up
-		in the component's sysfs sub-directory.
+ *		in the component's sysfs sub-directory.
+ * @name:	name for the coresight device, also shown under sysfs.
  */
 struct coresight_desc {
 	enum coresight_dev_type type;
@@ -121,6 +120,7 @@ struct coresight_desc {
 	struct coresight_platform_data *pdata;
 	struct device *dev;
 	const struct attribute_group **groups;
+	const char *name;
 };
 
 /**
-- 
cgit v1.2.3


From b77e3ed038c0d877f6f3b6ad278b931048a48e34 Mon Sep 17 00:00:00 2001
From: Suzuki K Poulose <suzuki.poulose@arm.com>
Date: Wed, 19 Jun 2019 13:52:59 -0600
Subject: coresight: Reuse platform data structure for connection tracking

The platform specific information describes the connections and
the ports of a given coresight device. This information is also
recorded in the coresight device as separate fields. Let us reuse
the original platform description to streamline the handling
of the data.

Signed-off-by: Suzuki K Poulose <suzuki.poulose@arm.com>
Signed-off-by: Mathieu Poirier <mathieu.poirier@linaro.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/hwtracing/coresight/coresight-tmc-etr.c |  4 +--
 drivers/hwtracing/coresight/coresight.c         | 46 ++++++++++++-------------
 include/linux/coresight.h                       |  8 ++---
 3 files changed, 27 insertions(+), 31 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/hwtracing/coresight/coresight-tmc-etr.c b/drivers/hwtracing/coresight/coresight-tmc-etr.c
index 709448cf2dff..ce0114a5435c 100644
--- a/drivers/hwtracing/coresight/coresight-tmc-etr.c
+++ b/drivers/hwtracing/coresight/coresight-tmc-etr.c
@@ -756,8 +756,8 @@ tmc_etr_get_catu_device(struct tmc_drvdata *drvdata)
 	if (!IS_ENABLED(CONFIG_CORESIGHT_CATU))
 		return NULL;
 
-	for (i = 0; i < etr->nr_outport; i++) {
-		tmp = etr->conns[i].child_dev;
+	for (i = 0; i < etr->pdata->nr_outport; i++) {
+		tmp = etr->pdata->conns[i].child_dev;
 		if (tmp && coresight_is_catu_device(tmp))
 			return tmp;
 	}
diff --git a/drivers/hwtracing/coresight/coresight.c b/drivers/hwtracing/coresight/coresight.c
index 068bd2fc0985..96e15154a566 100644
--- a/drivers/hwtracing/coresight/coresight.c
+++ b/drivers/hwtracing/coresight/coresight.c
@@ -100,8 +100,8 @@ static int coresight_find_link_inport(struct coresight_device *csdev,
 	int i;
 	struct coresight_connection *conn;
 
-	for (i = 0; i < parent->nr_outport; i++) {
-		conn = &parent->conns[i];
+	for (i = 0; i < parent->pdata->nr_outport; i++) {
+		conn = &parent->pdata->conns[i];
 		if (conn->child_dev == csdev)
 			return conn->child_port;
 	}
@@ -118,8 +118,8 @@ static int coresight_find_link_outport(struct coresight_device *csdev,
 	int i;
 	struct coresight_connection *conn;
 
-	for (i = 0; i < csdev->nr_outport; i++) {
-		conn = &csdev->conns[i];
+	for (i = 0; i < csdev->pdata->nr_outport; i++) {
+		conn = &csdev->pdata->conns[i];
 		if (conn->child_dev == child)
 			return conn->outport;
 	}
@@ -306,10 +306,10 @@ static void coresight_disable_link(struct coresight_device *csdev,
 
 	if (link_subtype == CORESIGHT_DEV_SUBTYPE_LINK_MERG) {
 		refport = inport;
-		nr_conns = csdev->nr_inport;
+		nr_conns = csdev->pdata->nr_inport;
 	} else if (link_subtype == CORESIGHT_DEV_SUBTYPE_LINK_SPLIT) {
 		refport = outport;
-		nr_conns = csdev->nr_outport;
+		nr_conns = csdev->pdata->nr_outport;
 	} else {
 		refport = 0;
 		nr_conns = 1;
@@ -595,9 +595,10 @@ static void coresight_grab_device(struct coresight_device *csdev)
 {
 	int i;
 
-	for (i = 0; i < csdev->nr_outport; i++) {
-		struct coresight_device *child = csdev->conns[i].child_dev;
+	for (i = 0; i < csdev->pdata->nr_outport; i++) {
+		struct coresight_device *child;
 
+		child  = csdev->pdata->conns[i].child_dev;
 		if (child && child->type == CORESIGHT_DEV_TYPE_HELPER)
 			pm_runtime_get_sync(child->dev.parent);
 	}
@@ -613,9 +614,10 @@ static void coresight_drop_device(struct coresight_device *csdev)
 	int i;
 
 	pm_runtime_put(csdev->dev.parent);
-	for (i = 0; i < csdev->nr_outport; i++) {
-		struct coresight_device *child = csdev->conns[i].child_dev;
+	for (i = 0; i < csdev->pdata->nr_outport; i++) {
+		struct coresight_device *child;
 
+		child  = csdev->pdata->conns[i].child_dev;
 		if (child && child->type == CORESIGHT_DEV_TYPE_HELPER)
 			pm_runtime_put(child->dev.parent);
 	}
@@ -645,9 +647,10 @@ static int _coresight_build_path(struct coresight_device *csdev,
 		goto out;
 
 	/* Not a sink - recursively explore each port found on this element */
-	for (i = 0; i < csdev->nr_outport; i++) {
-		struct coresight_device *child_dev = csdev->conns[i].child_dev;
+	for (i = 0; i < csdev->pdata->nr_outport; i++) {
+		struct coresight_device *child_dev;
 
+		child_dev = csdev->pdata->conns[i].child_dev;
 		if (child_dev &&
 		    _coresight_build_path(child_dev, sink, path) == 0) {
 			found = true;
@@ -1000,8 +1003,8 @@ static int coresight_orphan_match(struct device *dev, void *data)
 	 * Circle throuch all the connection of that component.  If we find
 	 * an orphan connection whose name matches @csdev, link it.
 	 */
-	for (i = 0; i < i_csdev->nr_outport; i++) {
-		conn = &i_csdev->conns[i];
+	for (i = 0; i < i_csdev->pdata->nr_outport; i++) {
+		conn = &i_csdev->pdata->conns[i];
 
 		/* We have found at least one orphan connection */
 		if (conn->child_dev == NULL) {
@@ -1040,8 +1043,8 @@ static void coresight_fixup_device_conns(struct coresight_device *csdev)
 {
 	int i;
 
-	for (i = 0; i < csdev->nr_outport; i++) {
-		struct coresight_connection *conn = &csdev->conns[i];
+	for (i = 0; i < csdev->pdata->nr_outport; i++) {
+		struct coresight_connection *conn = &csdev->pdata->conns[i];
 		struct device *dev = NULL;
 
 		if (conn->child_name)
@@ -1075,8 +1078,8 @@ static int coresight_remove_match(struct device *dev, void *data)
 	 * Circle throuch all the connection of that component.  If we find
 	 * a connection whose name matches @csdev, remove it.
 	 */
-	for (i = 0; i < iterator->nr_outport; i++) {
-		conn = &iterator->conns[i];
+	for (i = 0; i < iterator->pdata->nr_outport; i++) {
+		conn = &iterator->pdata->conns[i];
 
 		if (conn->child_dev == NULL)
 			continue;
@@ -1108,7 +1111,7 @@ static void coresight_remove_conns(struct coresight_device *csdev)
 	 * doesn't have at least one input port, there is no point
 	 * in searching all the devices.
 	 */
-	if (csdev->nr_inport)
+	if (csdev->pdata->nr_inport)
 		bus_for_each_dev(&coresight_bustype, NULL,
 				 csdev, coresight_remove_match);
 }
@@ -1195,10 +1198,7 @@ struct coresight_device *coresight_register(struct coresight_desc *desc)
 
 	csdev->refcnt = refcnts;
 
-	csdev->nr_inport = desc->pdata->nr_inport;
-	csdev->nr_outport = desc->pdata->nr_outport;
-
-	csdev->conns = desc->pdata->conns;
+	csdev->pdata = desc->pdata;
 
 	csdev->type = desc->type;
 	csdev->subtype = desc->subtype;
diff --git a/include/linux/coresight.h b/include/linux/coresight.h
index 298db20ba8ce..b67d5074ece0 100644
--- a/include/linux/coresight.h
+++ b/include/linux/coresight.h
@@ -140,9 +140,7 @@ struct coresight_connection {
 
 /**
  * struct coresight_device - representation of a device as used by the framework
- * @conns:	array of coresight_connections associated to this component.
- * @nr_inport:	number of input port associated to this component.
- * @nr_outport:	number of output port associated to this component.
+ * @pdata:	Platform data with device connections associated to this device.
  * @type:	as defined by @coresight_dev_type.
  * @subtype:	as defined by @coresight_dev_subtype.
  * @ops:	generic operations for this component, as defined
@@ -157,9 +155,7 @@ struct coresight_connection {
  * @ea:		Device attribute for sink representation under PMU directory.
  */
 struct coresight_device {
-	struct coresight_connection *conns;
-	int nr_inport;
-	int nr_outport;
+	struct coresight_platform_data *pdata;
 	enum coresight_dev_type type;
 	union coresight_dev_subtype subtype;
 	const struct coresight_ops *ops;
-- 
cgit v1.2.3


From 37ea1ffddffa63c920ce826786fe610c78f57842 Mon Sep 17 00:00:00 2001
From: Suzuki K Poulose <suzuki.poulose@arm.com>
Date: Wed, 19 Jun 2019 13:53:03 -0600
Subject: coresight: Use fwnode handle instead of device names

We rely on the device names to find a CoreSight device on the
coresight bus. The device name however is obtained from the platform,
which is bound to the real platform/amba device. As we are about
to use different naming scheme for the coresight devices, we can't
rely on the platform device name to find the corresponding
coresight device. Instead we use the platform agnostic
"fwnode handle" of the parent device to find the devices.
We also reuse the same fwnode as the parent for the Coresight
device we create.

Signed-off-by: Suzuki K Poulose <suzuki.poulose@arm.com>
Signed-off-by: Mathieu Poirier <mathieu.poirier@linaro.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/hwtracing/coresight/coresight-platform.c | 14 +++++---
 drivers/hwtracing/coresight/coresight-priv.h     |  6 ++--
 drivers/hwtracing/coresight/coresight.c          | 42 +++++++++++++++++++-----
 include/linux/coresight.h                        |  4 +--
 4 files changed, 49 insertions(+), 17 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/hwtracing/coresight/coresight-platform.c b/drivers/hwtracing/coresight/coresight-platform.c
index 4394095ad224..49112a58478e 100644
--- a/drivers/hwtracing/coresight/coresight-platform.c
+++ b/drivers/hwtracing/coresight/coresight-platform.c
@@ -36,7 +36,7 @@ static int coresight_alloc_conns(struct device *dev,
 	return 0;
 }
 
-static int coresight_device_fwnode_match(struct device *dev, void *fwnode)
+int coresight_device_fwnode_match(struct device *dev, void *fwnode)
 {
 	return dev_fwnode(dev) == fwnode;
 }
@@ -219,9 +219,15 @@ static int of_coresight_parse_endpoint(struct device *dev,
 		}
 
 		conn->outport = endpoint.port;
-		conn->child_name = devm_kstrdup(dev,
-						dev_name(rdev),
-						GFP_KERNEL);
+		/*
+		 * Hold the refcount to the target device. This could be
+		 * released via:
+		 * 1) coresight_release_platform_data() if the probe fails or
+		 *    this device is unregistered.
+		 * 2) While removing the target device via
+		 *    coresight_remove_match()
+		 */
+		conn->child_fwnode = fwnode_handle_get(rdev_fwnode);
 		conn->child_port = rendpoint.port;
 		/* Connection record updated */
 		ret = 1;
diff --git a/drivers/hwtracing/coresight/coresight-priv.h b/drivers/hwtracing/coresight/coresight-priv.h
index c21642114fc3..8b07fe55395a 100644
--- a/drivers/hwtracing/coresight/coresight-priv.h
+++ b/drivers/hwtracing/coresight/coresight-priv.h
@@ -200,8 +200,8 @@ static inline void *coresight_get_uci_data(const struct amba_id *id)
 	return 0;
 }
 
-static inline void
-coresight_release_platform_data(struct coresight_platform_data *pdata)
-{}
+void coresight_release_platform_data(struct coresight_platform_data *pdata);
+
+int coresight_device_fwnode_match(struct device *dev, void *fwnode);
 
 #endif
diff --git a/drivers/hwtracing/coresight/coresight.c b/drivers/hwtracing/coresight/coresight.c
index 526141c2f876..1287778c3be5 100644
--- a/drivers/hwtracing/coresight/coresight.c
+++ b/drivers/hwtracing/coresight/coresight.c
@@ -978,6 +978,7 @@ static void coresight_device_release(struct device *dev)
 {
 	struct coresight_device *csdev = to_coresight_device(dev);
 
+	fwnode_handle_put(csdev->dev.fwnode);
 	kfree(csdev->refcnt);
 	kfree(csdev);
 }
@@ -1009,13 +1010,11 @@ static int coresight_orphan_match(struct device *dev, void *data)
 		/* We have found at least one orphan connection */
 		if (conn->child_dev == NULL) {
 			/* Does it match this newly added device? */
-			if (conn->child_name &&
-			    !strcmp(dev_name(&csdev->dev), conn->child_name)) {
+			if (conn->child_fwnode == csdev->dev.fwnode)
 				conn->child_dev = csdev;
-			} else {
+			else
 				/* This component still has an orphan */
 				still_orphan = true;
-			}
 		}
 	}
 
@@ -1047,9 +1046,9 @@ static void coresight_fixup_device_conns(struct coresight_device *csdev)
 		struct coresight_connection *conn = &csdev->pdata->conns[i];
 		struct device *dev = NULL;
 
-		if (conn->child_name)
-			dev = bus_find_device_by_name(&coresight_bustype, NULL,
-						      conn->child_name);
+		dev = bus_find_device(&coresight_bustype, NULL,
+				      (void *)conn->child_fwnode,
+				      coresight_device_fwnode_match);
 		if (dev) {
 			conn->child_dev = to_coresight_device(dev);
 			/* and put reference from 'bus_find_device()' */
@@ -1084,9 +1083,15 @@ static int coresight_remove_match(struct device *dev, void *data)
 		if (conn->child_dev == NULL)
 			continue;
 
-		if (!strcmp(dev_name(&csdev->dev), conn->child_name)) {
+		if (csdev->dev.fwnode == conn->child_fwnode) {
 			iterator->orphan = true;
 			conn->child_dev = NULL;
+			/*
+			 * Drop the reference to the handle for the remote
+			 * device acquired in parsing the connections from
+			 * platform data.
+			 */
+			fwnode_handle_put(conn->child_fwnode);
 			/* No need to continue */
 			break;
 		}
@@ -1166,6 +1171,22 @@ static int __init coresight_init(void)
 }
 postcore_initcall(coresight_init);
 
+/*
+ * coresight_release_platform_data: Release references to the devices connected
+ * to the output port of this device.
+ */
+void coresight_release_platform_data(struct coresight_platform_data *pdata)
+{
+	int i;
+
+	for (i = 0; i < pdata->nr_outport; i++) {
+		if (pdata->conns[i].child_fwnode) {
+			fwnode_handle_put(pdata->conns[i].child_fwnode);
+			pdata->conns[i].child_fwnode = NULL;
+		}
+	}
+}
+
 struct coresight_device *coresight_register(struct coresight_desc *desc)
 {
 	int ret;
@@ -1210,6 +1231,11 @@ struct coresight_device *coresight_register(struct coresight_desc *desc)
 	csdev->dev.parent = desc->dev;
 	csdev->dev.release = coresight_device_release;
 	csdev->dev.bus = &coresight_bustype;
+	/*
+	 * Hold the reference to our parent device. This will be
+	 * dropped only in coresight_device_release().
+	 */
+	csdev->dev.fwnode = fwnode_handle_get(dev_fwnode(desc->dev));
 	dev_set_name(&csdev->dev, "%s", desc->name);
 
 	ret = device_register(&csdev->dev);
diff --git a/include/linux/coresight.h b/include/linux/coresight.h
index b67d5074ece0..b40544bc06fe 100644
--- a/include/linux/coresight.h
+++ b/include/linux/coresight.h
@@ -126,15 +126,15 @@ struct coresight_desc {
 /**
  * struct coresight_connection - representation of a single connection
  * @outport:	a connection's output port number.
- * @chid_name:	remote component's name.
  * @child_port:	remote component's port number @output is connected to.
+ * @child_fwnode: remote component's fwnode handle.
  * @child_dev:	a @coresight_device representation of the component
 		connected to @outport.
  */
 struct coresight_connection {
 	int outport;
-	const char *child_name;
 	int child_port;
+	struct fwnode_handle *child_fwnode;
 	struct coresight_device *child_dev;
 };
 
-- 
cgit v1.2.3


From 0f5f9b6ba9e1a706f5a3b1bd467e9242ab31b352 Mon Sep 17 00:00:00 2001
From: Suzuki K Poulose <suzuki.poulose@arm.com>
Date: Wed, 19 Jun 2019 13:53:04 -0600
Subject: coresight: Use platform agnostic names

So far we have reused the name of the "platform" device for
the CoreSight device. But this is not very intuitive when
we move to ACPI. Also, the ACPI device names have ":" in them
(e.g, ARMHC97C:01), which the perf tool doesn't like very much.
This patch introduces a generic naming scheme, giving more intuitive
names for the devices that appear on the CoreSight bus.
The names follow the pattern "prefix" followed by "index" (e.g, etm5).
We maintain a list of allocated devices per "prefix" to make sure
we don't allocate a new name when it is reprobed (e.g, due to
unsatisfied device dependencies). So, we maintain the list
of "fwnodes" of the parent devices to allocate a consistent name.
All devices except the ETMs get an index allocated in the order
of probing. ETMs get an index based on the CPU they are attached to.

TMC devices are named using "tmc_etf", "tmc_etb", and "tmc_etr"
prefixes depending on the configuration of the device.

The replicators and funnels are not classified as dynamic/static
anymore. One could easily figure that out by checking the presence
of "mgmt" registers under sysfs.

Signed-off-by: Suzuki K Poulose <suzuki.poulose@arm.com>
Signed-off-by: Mathieu Poirier <mathieu.poirier@linaro.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/hwtracing/coresight/coresight-catu.c       |  7 ++-
 drivers/hwtracing/coresight/coresight-etb10.c      |  7 ++-
 drivers/hwtracing/coresight/coresight-etm3x.c      |  4 +-
 drivers/hwtracing/coresight/coresight-etm4x.c      |  4 +-
 drivers/hwtracing/coresight/coresight-funnel.c     |  7 ++-
 drivers/hwtracing/coresight/coresight-replicator.c |  7 ++-
 drivers/hwtracing/coresight/coresight-stm.c        | 12 +++--
 drivers/hwtracing/coresight/coresight-tmc.c        | 15 +++++-
 drivers/hwtracing/coresight/coresight-tpiu.c       |  7 ++-
 drivers/hwtracing/coresight/coresight.c            | 58 ++++++++++++++++++++++
 include/linux/coresight.h                          | 25 +++++++++-
 11 files changed, 141 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/hwtracing/coresight/coresight-catu.c b/drivers/hwtracing/coresight/coresight-catu.c
index 1c1ad1268b9d..16ebf38a9f66 100644
--- a/drivers/hwtracing/coresight/coresight-catu.c
+++ b/drivers/hwtracing/coresight/coresight-catu.c
@@ -28,6 +28,8 @@
 #define catu_dbg(x, ...) do {} while (0)
 #endif
 
+DEFINE_CORESIGHT_DEVLIST(catu_devs, "catu");
+
 struct catu_etr_buf {
 	struct tmc_sg_table *catu_table;
 	dma_addr_t sladdr;
@@ -505,6 +507,10 @@ static int catu_probe(struct amba_device *adev, const struct amba_id *id)
 	struct device *dev = &adev->dev;
 	void __iomem *base;
 
+	catu_desc.name = coresight_alloc_device_name(&catu_devs, dev);
+	if (!catu_desc.name)
+		return -ENOMEM;
+
 	drvdata = devm_kzalloc(dev, sizeof(*drvdata), GFP_KERNEL);
 	if (!drvdata) {
 		ret = -ENOMEM;
@@ -551,7 +557,6 @@ static int catu_probe(struct amba_device *adev, const struct amba_id *id)
 	catu_desc.type = CORESIGHT_DEV_TYPE_HELPER;
 	catu_desc.subtype.helper_subtype = CORESIGHT_DEV_SUBTYPE_HELPER_CATU;
 	catu_desc.ops = &catu_ops;
-	catu_desc.name = dev_name(dev);
 
 	drvdata->csdev = coresight_register(&catu_desc);
 	if (IS_ERR(drvdata->csdev))
diff --git a/drivers/hwtracing/coresight/coresight-etb10.c b/drivers/hwtracing/coresight/coresight-etb10.c
index fffaac3e3677..d5b9edecf76e 100644
--- a/drivers/hwtracing/coresight/coresight-etb10.c
+++ b/drivers/hwtracing/coresight/coresight-etb10.c
@@ -63,6 +63,8 @@
 #define ETB_FFSR_BIT		1
 #define ETB_FRAME_SIZE_WORDS	4
 
+DEFINE_CORESIGHT_DEVLIST(etb_devs, "etb");
+
 /**
  * struct etb_drvdata - specifics associated to an ETB component
  * @base:	memory mapped base address for this component.
@@ -733,6 +735,10 @@ static int etb_probe(struct amba_device *adev, const struct amba_id *id)
 	struct resource *res = &adev->res;
 	struct coresight_desc desc = { 0 };
 
+	desc.name = coresight_alloc_device_name(&etb_devs, dev);
+	if (!desc.name)
+		return -ENOMEM;
+
 	drvdata = devm_kzalloc(dev, sizeof(*drvdata), GFP_KERNEL);
 	if (!drvdata)
 		return -ENOMEM;
@@ -777,7 +783,6 @@ static int etb_probe(struct amba_device *adev, const struct amba_id *id)
 	desc.ops = &etb_cs_ops;
 	desc.pdata = pdata;
 	desc.dev = dev;
-	desc.name = dev_name(dev);
 	desc.groups = coresight_etb_groups;
 	drvdata->csdev = coresight_register(&desc);
 	if (IS_ERR(drvdata->csdev))
diff --git a/drivers/hwtracing/coresight/coresight-etm3x.c b/drivers/hwtracing/coresight/coresight-etm3x.c
index f2d461610a2f..bed729140718 100644
--- a/drivers/hwtracing/coresight/coresight-etm3x.c
+++ b/drivers/hwtracing/coresight/coresight-etm3x.c
@@ -815,6 +815,9 @@ static int etm_probe(struct amba_device *adev, const struct amba_id *id)
 	}
 
 	drvdata->cpu = coresight_get_cpu(dev);
+	desc.name  = devm_kasprintf(dev, GFP_KERNEL, "etm%d", drvdata->cpu);
+	if (!desc.name)
+		return -ENOMEM;
 
 	cpus_read_lock();
 	etmdrvdata[drvdata->cpu] = drvdata;
@@ -856,7 +859,6 @@ static int etm_probe(struct amba_device *adev, const struct amba_id *id)
 	desc.ops = &etm_cs_ops;
 	desc.pdata = pdata;
 	desc.dev = dev;
-	desc.name = dev_name(dev);
 	desc.groups = coresight_etm_groups;
 	drvdata->csdev = coresight_register(&desc);
 	if (IS_ERR(drvdata->csdev)) {
diff --git a/drivers/hwtracing/coresight/coresight-etm4x.c b/drivers/hwtracing/coresight/coresight-etm4x.c
index 1609da1eaf83..7fe266194ab5 100644
--- a/drivers/hwtracing/coresight/coresight-etm4x.c
+++ b/drivers/hwtracing/coresight/coresight-etm4x.c
@@ -1101,6 +1101,9 @@ static int etm4_probe(struct amba_device *adev, const struct amba_id *id)
 	spin_lock_init(&drvdata->spinlock);
 
 	drvdata->cpu = coresight_get_cpu(dev);
+	desc.name = devm_kasprintf(dev, GFP_KERNEL, "etm%d", drvdata->cpu);
+	if (!desc.name)
+		return -ENOMEM;
 
 	cpus_read_lock();
 	etmdrvdata[drvdata->cpu] = drvdata;
@@ -1144,7 +1147,6 @@ static int etm4_probe(struct amba_device *adev, const struct amba_id *id)
 	desc.pdata = pdata;
 	desc.dev = dev;
 	desc.groups = coresight_etmv4_groups;
-	desc.name = dev_name(dev);
 	drvdata->csdev = coresight_register(&desc);
 	if (IS_ERR(drvdata->csdev)) {
 		ret = PTR_ERR(drvdata->csdev);
diff --git a/drivers/hwtracing/coresight/coresight-funnel.c b/drivers/hwtracing/coresight/coresight-funnel.c
index 75fa2d3ad9b4..5867fcb4503b 100644
--- a/drivers/hwtracing/coresight/coresight-funnel.c
+++ b/drivers/hwtracing/coresight/coresight-funnel.c
@@ -29,6 +29,8 @@
 #define FUNNEL_HOLDTIME		(0x7 << FUNNEL_HOLDTIME_SHFT)
 #define FUNNEL_ENSx_MASK	0xff
 
+DEFINE_CORESIGHT_DEVLIST(funnel_devs, "funnel");
+
 /**
  * struct funnel_drvdata - specifics associated to a funnel component
  * @base:	memory mapped base address for this component.
@@ -192,6 +194,10 @@ static int funnel_probe(struct device *dev, struct resource *res)
 	    of_device_is_compatible(dev->of_node, "arm,coresight-funnel"))
 		pr_warn_once("Uses OBSOLETE CoreSight funnel binding\n");
 
+	desc.name = coresight_alloc_device_name(&funnel_devs, dev);
+	if (!desc.name)
+		return -ENOMEM;
+
 	drvdata = devm_kzalloc(dev, sizeof(*drvdata), GFP_KERNEL);
 	if (!drvdata)
 		return -ENOMEM;
@@ -231,7 +237,6 @@ static int funnel_probe(struct device *dev, struct resource *res)
 	desc.ops = &funnel_cs_ops;
 	desc.pdata = pdata;
 	desc.dev = dev;
-	desc.name = dev_name(dev);
 	drvdata->csdev = coresight_register(&desc);
 	if (IS_ERR(drvdata->csdev)) {
 		ret = PTR_ERR(drvdata->csdev);
diff --git a/drivers/hwtracing/coresight/coresight-replicator.c b/drivers/hwtracing/coresight/coresight-replicator.c
index 64dfde7241c1..c0e42253dfe7 100644
--- a/drivers/hwtracing/coresight/coresight-replicator.c
+++ b/drivers/hwtracing/coresight/coresight-replicator.c
@@ -22,6 +22,8 @@
 #define REPLICATOR_IDFILTER0		0x000
 #define REPLICATOR_IDFILTER1		0x004
 
+DEFINE_CORESIGHT_DEVLIST(replicator_devs, "replicator");
+
 /**
  * struct replicator_drvdata - specifics associated to a replicator component
  * @base:	memory mapped base address for this component. Also indicates
@@ -183,6 +185,10 @@ static int replicator_probe(struct device *dev, struct resource *res)
 	    of_device_is_compatible(dev->of_node, "arm,coresight-replicator"))
 		pr_warn_once("Uses OBSOLETE CoreSight replicator binding\n");
 
+	desc.name = coresight_alloc_device_name(&replicator_devs, dev);
+	if (!desc.name)
+		return -ENOMEM;
+
 	drvdata = devm_kzalloc(dev, sizeof(*drvdata), GFP_KERNEL);
 	if (!drvdata)
 		return -ENOMEM;
@@ -222,7 +228,6 @@ static int replicator_probe(struct device *dev, struct resource *res)
 	desc.ops = &replicator_cs_ops;
 	desc.pdata = dev->platform_data;
 	desc.dev = dev;
-	desc.name = dev_name(dev);
 
 	drvdata->csdev = coresight_register(&desc);
 	if (IS_ERR(drvdata->csdev)) {
diff --git a/drivers/hwtracing/coresight/coresight-stm.c b/drivers/hwtracing/coresight/coresight-stm.c
index 03528f3fa9ff..e3e2b000cfb7 100644
--- a/drivers/hwtracing/coresight/coresight-stm.c
+++ b/drivers/hwtracing/coresight/coresight-stm.c
@@ -107,6 +107,8 @@ struct channel_space {
 	unsigned long		*guaranteed;
 };
 
+DEFINE_CORESIGHT_DEVLIST(stm_devs, "stm");
+
 /**
  * struct stm_drvdata - specifics associated to an STM component
  * @base:		memory mapped base address for this component.
@@ -810,6 +812,10 @@ static int stm_probe(struct amba_device *adev, const struct amba_id *id)
 	size_t bitmap_size;
 	struct coresight_desc desc = { 0 };
 
+	desc.name = coresight_alloc_device_name(&stm_devs, dev);
+	if (!desc.name)
+		return -ENOMEM;
+
 	drvdata = devm_kzalloc(dev, sizeof(*drvdata), GFP_KERNEL);
 	if (!drvdata)
 		return -ENOMEM;
@@ -854,11 +860,12 @@ static int stm_probe(struct amba_device *adev, const struct amba_id *id)
 	spin_lock_init(&drvdata->spinlock);
 
 	stm_init_default_data(drvdata);
-	stm_init_generic_data(drvdata, dev_name(dev));
+	stm_init_generic_data(drvdata, desc.name);
 
 	if (stm_register_device(dev, &drvdata->stm, THIS_MODULE)) {
 		dev_info(dev,
-			 "stm_register_device failed, probing deferred\n");
+			 "%s : stm_register_device failed, probing deferred\n",
+			 desc.name);
 		return -EPROBE_DEFER;
 	}
 
@@ -874,7 +881,6 @@ static int stm_probe(struct amba_device *adev, const struct amba_id *id)
 	desc.ops = &stm_cs_ops;
 	desc.pdata = pdata;
 	desc.dev = dev;
-	desc.name = dev_name(dev);
 	desc.groups = coresight_stm_groups;
 	drvdata->csdev = coresight_register(&desc);
 	if (IS_ERR(drvdata->csdev)) {
diff --git a/drivers/hwtracing/coresight/coresight-tmc.c b/drivers/hwtracing/coresight/coresight-tmc.c
index 212630e65cca..be37aff573b4 100644
--- a/drivers/hwtracing/coresight/coresight-tmc.c
+++ b/drivers/hwtracing/coresight/coresight-tmc.c
@@ -27,6 +27,10 @@
 #include "coresight-priv.h"
 #include "coresight-tmc.h"
 
+DEFINE_CORESIGHT_DEVLIST(etb_devs, "tmc_etb");
+DEFINE_CORESIGHT_DEVLIST(etf_devs, "tmc_etf");
+DEFINE_CORESIGHT_DEVLIST(etr_devs, "tmc_etr");
+
 void tmc_wait_for_tmcready(struct tmc_drvdata *drvdata)
 {
 	/* Ensure formatter, unformatter and hardware fifo are empty */
@@ -397,6 +401,7 @@ static int tmc_probe(struct amba_device *adev, const struct amba_id *id)
 	struct tmc_drvdata *drvdata;
 	struct resource *res = &adev->res;
 	struct coresight_desc desc = { 0 };
+	struct coresight_dev_list *dev_list = NULL;
 
 	ret = -ENOMEM;
 	drvdata = devm_kzalloc(dev, sizeof(*drvdata), GFP_KERNEL);
@@ -429,13 +434,13 @@ static int tmc_probe(struct amba_device *adev, const struct amba_id *id)
 
 	desc.dev = dev;
 	desc.groups = coresight_tmc_groups;
-	desc.name = dev_name(dev);
 
 	switch (drvdata->config_type) {
 	case TMC_CONFIG_TYPE_ETB:
 		desc.type = CORESIGHT_DEV_TYPE_SINK;
 		desc.subtype.sink_subtype = CORESIGHT_DEV_SUBTYPE_SINK_BUFFER;
 		desc.ops = &tmc_etb_cs_ops;
+		dev_list = &etb_devs;
 		break;
 	case TMC_CONFIG_TYPE_ETR:
 		desc.type = CORESIGHT_DEV_TYPE_SINK;
@@ -447,11 +452,13 @@ static int tmc_probe(struct amba_device *adev, const struct amba_id *id)
 			goto out;
 		idr_init(&drvdata->idr);
 		mutex_init(&drvdata->idr_mutex);
+		dev_list = &etr_devs;
 		break;
 	case TMC_CONFIG_TYPE_ETF:
 		desc.type = CORESIGHT_DEV_TYPE_LINKSINK;
 		desc.subtype.link_subtype = CORESIGHT_DEV_SUBTYPE_LINK_FIFO;
 		desc.ops = &tmc_etf_cs_ops;
+		dev_list = &etf_devs;
 		break;
 	default:
 		pr_err("%s: Unsupported TMC config\n", desc.name);
@@ -459,6 +466,12 @@ static int tmc_probe(struct amba_device *adev, const struct amba_id *id)
 		goto out;
 	}
 
+	desc.name = coresight_alloc_device_name(dev_list, dev);
+	if (!desc.name) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
 	pdata = coresight_get_platform_data(dev);
 	if (IS_ERR(pdata)) {
 		ret = PTR_ERR(pdata);
diff --git a/drivers/hwtracing/coresight/coresight-tpiu.c b/drivers/hwtracing/coresight/coresight-tpiu.c
index b699d613425d..f8583e4032a6 100644
--- a/drivers/hwtracing/coresight/coresight-tpiu.c
+++ b/drivers/hwtracing/coresight/coresight-tpiu.c
@@ -47,6 +47,8 @@
 #define FFCR_FON_MAN		BIT(6)
 #define FFCR_STOP_FI		BIT(12)
 
+DEFINE_CORESIGHT_DEVLIST(tpiu_devs, "tpiu");
+
 /**
  * @base:	memory mapped base address for this component.
  * @atclk:	optional clock for the core parts of the TPIU.
@@ -125,6 +127,10 @@ static int tpiu_probe(struct amba_device *adev, const struct amba_id *id)
 	struct resource *res = &adev->res;
 	struct coresight_desc desc = { 0 };
 
+	desc.name = coresight_alloc_device_name(&tpiu_devs, dev);
+	if (!desc.name)
+		return -ENOMEM;
+
 	drvdata = devm_kzalloc(dev, sizeof(*drvdata), GFP_KERNEL);
 	if (!drvdata)
 		return -ENOMEM;
@@ -157,7 +163,6 @@ static int tpiu_probe(struct amba_device *adev, const struct amba_id *id)
 	desc.ops = &tpiu_cs_ops;
 	desc.pdata = pdata;
 	desc.dev = dev;
-	desc.name = dev_name(dev);
 	drvdata->csdev = coresight_register(&desc);
 
 	if (!IS_ERR(drvdata->csdev)) {
diff --git a/drivers/hwtracing/coresight/coresight.c b/drivers/hwtracing/coresight/coresight.c
index 1287778c3be5..86d1fc2c1bd4 100644
--- a/drivers/hwtracing/coresight/coresight.c
+++ b/drivers/hwtracing/coresight/coresight.c
@@ -1291,3 +1291,61 @@ void coresight_unregister(struct coresight_device *csdev)
 	device_unregister(&csdev->dev);
 }
 EXPORT_SYMBOL_GPL(coresight_unregister);
+
+
+/*
+ * coresight_search_device_idx - Search the fwnode handle of a device
+ * in the given dev_idx list. Must be called with the coresight_mutex held.
+ *
+ * Returns the index of the entry, when found. Otherwise, -ENOENT.
+ */
+static inline int coresight_search_device_idx(struct coresight_dev_list *dict,
+					      struct fwnode_handle *fwnode)
+{
+	int i;
+
+	for (i = 0; i < dict->nr_idx; i++)
+		if (dict->fwnode_list[i] == fwnode)
+			return i;
+	return -ENOENT;
+}
+
+/*
+ * coresight_alloc_device_name - Get an index for a given device in the
+ * device index list specific to a driver. An index is allocated for a
+ * device and is tracked with the fwnode_handle to prevent allocating
+ * duplicate indices for the same device (e.g, if we defer probing of
+ * a device due to dependencies), in case the index is requested again.
+ */
+char *coresight_alloc_device_name(struct coresight_dev_list *dict,
+				  struct device *dev)
+{
+	int idx;
+	char *name = NULL;
+	struct fwnode_handle **list;
+
+	mutex_lock(&coresight_mutex);
+
+	idx = coresight_search_device_idx(dict, dev_fwnode(dev));
+	if (idx < 0) {
+		/* Make space for the new entry */
+		idx = dict->nr_idx;
+		list = krealloc(dict->fwnode_list,
+				(idx + 1) * sizeof(*dict->fwnode_list),
+				GFP_KERNEL);
+		if (ZERO_OR_NULL_PTR(list)) {
+			idx = -ENOMEM;
+			goto done;
+		}
+
+		list[idx] = dev_fwnode(dev);
+		dict->fwnode_list = list;
+		dict->nr_idx = idx + 1;
+	}
+
+	name = devm_kasprintf(dev, GFP_KERNEL, "%s%d", dict->pfx, idx);
+done:
+	mutex_unlock(&coresight_mutex);
+	return name;
+}
+EXPORT_SYMBOL_GPL(coresight_alloc_device_name);
diff --git a/include/linux/coresight.h b/include/linux/coresight.h
index b40544bc06fe..a2b68823717b 100644
--- a/include/linux/coresight.h
+++ b/include/linux/coresight.h
@@ -168,6 +168,28 @@ struct coresight_device {
 	struct dev_ext_attribute *ea;
 };
 
+/*
+ * coresight_dev_list - Mapping for devices to "name" index for device
+ * names.
+ *
+ * @nr_idx:		Number of entries already allocated.
+ * @pfx:		Prefix pattern for device name.
+ * @fwnode_list:	Array of fwnode_handles associated with each allocated
+ *			index, upto nr_idx entries.
+ */
+struct coresight_dev_list {
+	int			nr_idx;
+	const char		*pfx;
+	struct fwnode_handle	**fwnode_list;
+};
+
+#define DEFINE_CORESIGHT_DEVLIST(var, dev_pfx)				\
+static struct coresight_dev_list (var) = {				\
+						.pfx = dev_pfx,		\
+						.nr_idx = 0,		\
+						.fwnode_list = NULL,	\
+}
+
 #define to_coresight_device(d) container_of(d, struct coresight_device, dev)
 
 #define source_ops(csdev)	csdev->ops->source_ops
@@ -261,7 +283,8 @@ extern int coresight_claim_device_unlocked(void __iomem *base);
 
 extern void coresight_disclaim_device(void __iomem *base);
 extern void coresight_disclaim_device_unlocked(void __iomem *base);
-
+extern char *coresight_alloc_device_name(struct coresight_dev_list *devs,
+					 struct device *dev);
 #else
 static inline struct coresight_device *
 coresight_register(struct coresight_desc *desc) { return NULL; }
-- 
cgit v1.2.3


From cbf4f7325a638ced1d815580dfed44ea3b76163c Mon Sep 17 00:00:00 2001
From: Vitor Soares <Vitor.Soares@synopsys.com>
Date: Wed, 19 Jun 2019 20:36:32 +0200
Subject: i3c: add mixed limited bus mode

The i3c bus spec defines a bus configuration where i2c devices don't
have a 50ns filter but support SCL running at SDR max rate (12.5MHz).

This patch introduces the limited bus mode so that users can use
a higher speed in the presence of i2c devices with index 1.

Signed-off-by: Vitor Soares <vitor.soares@synopsys.com>
Cc: Boris Brezillon <bbrezillon@kernel.org>
Cc: <linux-kernel@vger.kernel.org>
Signed-off-by: Boris Brezillon <boris.brezillon@collabora.com>
---
 drivers/i3c/master.c       | 5 +++++
 include/linux/i3c/master.h | 5 +++++
 2 files changed, 10 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/i3c/master.c b/drivers/i3c/master.c
index 54026be03998..1318adfe0216 100644
--- a/drivers/i3c/master.c
+++ b/drivers/i3c/master.c
@@ -470,6 +470,7 @@ static int i3c_bus_init(struct i3c_bus *i3cbus)
 static const char * const i3c_bus_mode_strings[] = {
 	[I3C_BUS_MODE_PURE] = "pure",
 	[I3C_BUS_MODE_MIXED_FAST] = "mixed-fast",
+	[I3C_BUS_MODE_MIXED_LIMITED] = "mixed-limited",
 	[I3C_BUS_MODE_MIXED_SLOW] = "mixed-slow",
 };
 
@@ -584,6 +585,7 @@ int i3c_bus_set_mode(struct i3c_bus *i3cbus, enum i3c_bus_mode mode,
 			i3cbus->scl_rate.i3c = I3C_BUS_TYP_I3C_SCL_RATE;
 		break;
 	case I3C_BUS_MODE_MIXED_FAST:
+	case I3C_BUS_MODE_MIXED_LIMITED:
 		if (!i3cbus->scl_rate.i3c)
 			i3cbus->scl_rate.i3c = I3C_BUS_TYP_I3C_SCL_RATE;
 		if (!i3cbus->scl_rate.i2c)
@@ -2487,6 +2489,9 @@ int i3c_master_register(struct i3c_master_controller *master,
 				mode = I3C_BUS_MODE_MIXED_FAST;
 			break;
 		case I3C_LVR_I2C_INDEX(1):
+			if (mode < I3C_BUS_MODE_MIXED_LIMITED)
+				mode = I3C_BUS_MODE_MIXED_LIMITED;
+			break;
 		case I3C_LVR_I2C_INDEX(2):
 			if (mode < I3C_BUS_MODE_MIXED_SLOW)
 				mode = I3C_BUS_MODE_MIXED_SLOW;
diff --git a/include/linux/i3c/master.h b/include/linux/i3c/master.h
index eca8337bdaa5..1f08fa8d69d2 100644
--- a/include/linux/i3c/master.h
+++ b/include/linux/i3c/master.h
@@ -250,12 +250,17 @@ struct i3c_device {
  *			     the bus. The only impact in this mode is that the
  *			     high SCL pulse has to stay below 50ns to trick I2C
  *			     devices when transmitting I3C frames
+ * @I3C_BUS_MODE_MIXED_LIMITED: I2C devices without 50ns spike filter are
+ *				present on the bus. However they allow
+ *				compliance up to the maximum SDR SCL clock
+ *				frequency.
  * @I3C_BUS_MODE_MIXED_SLOW: I2C devices without 50ns spike filter are present
  *			     on the bus
  */
 enum i3c_bus_mode {
 	I3C_BUS_MODE_PURE,
 	I3C_BUS_MODE_MIXED_FAST,
+	I3C_BUS_MODE_MIXED_LIMITED,
 	I3C_BUS_MODE_MIXED_SLOW,
 };
 
-- 
cgit v1.2.3


From ae9e13d621d6795ec1ad6bf10bd2549c6c3feca4 Mon Sep 17 00:00:00 2001
From: Lianbo Jiang <lijiang@redhat.com>
Date: Tue, 23 Apr 2019 09:30:05 +0800
Subject: x86/e820, ioport: Add a new I/O resource descriptor
 IORES_DESC_RESERVED

When executing the kexec_file_load() syscall, the first kernel needs to
pass the e820 reserved ranges to the second kernel because some devices
(PCI, for example) need them present in the kdump kernel for proper
initialization.

But the kernel can not exactly match the e820 reserved ranges when
walking through the iomem resources using the default IORES_DESC_NONE
descriptor, because there are several types of e820 ranges which are
marked IORES_DESC_NONE, see e820_type_to_iores_desc().

Therefore, add a new I/O resource descriptor called IORES_DESC_RESERVED
to mark exactly those ranges. It will be used to match the reserved
resource ranges when walking through iomem resources.

 [ bp: Massage commit message. ]

Suggested-by: Borislav Petkov <bp@suse.de>
Signed-off-by: Lianbo Jiang <lijiang@redhat.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: bhe@redhat.com
Cc: dave.hansen@linux.intel.com
Cc: dyoung@redhat.com
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Huang Zijiang <huang.zijiang@zte.com.cn>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Joe Perches <joe@perches.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: kexec@lists.infradead.org
Cc: Masayoshi Mizuma <m.mizuma@jp.fujitsu.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@linux.ibm.com>
Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Tom Lendacky <thomas.lendacky@amd.com>
Cc: x86-ml <x86@kernel.org>
Link: https://lkml.kernel.org/r/20190423013007.17838-2-lijiang@redhat.com
---
 arch/x86/kernel/e820.c | 2 +-
 include/linux/ioport.h | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 8f32e705a980..e69408bf664b 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -1063,10 +1063,10 @@ static unsigned long __init e820_type_to_iores_desc(struct e820_entry *entry)
 	case E820_TYPE_NVS:		return IORES_DESC_ACPI_NV_STORAGE;
 	case E820_TYPE_PMEM:		return IORES_DESC_PERSISTENT_MEMORY;
 	case E820_TYPE_PRAM:		return IORES_DESC_PERSISTENT_MEMORY_LEGACY;
+	case E820_TYPE_RESERVED:	return IORES_DESC_RESERVED;
 	case E820_TYPE_RESERVED_KERN:	/* Fall-through: */
 	case E820_TYPE_RAM:		/* Fall-through: */
 	case E820_TYPE_UNUSABLE:	/* Fall-through: */
-	case E820_TYPE_RESERVED:	/* Fall-through: */
 	default:			return IORES_DESC_NONE;
 	}
 }
diff --git a/include/linux/ioport.h b/include/linux/ioport.h
index da0ebaec25f0..6ed59de48bd5 100644
--- a/include/linux/ioport.h
+++ b/include/linux/ioport.h
@@ -133,6 +133,7 @@ enum {
 	IORES_DESC_PERSISTENT_MEMORY_LEGACY	= 5,
 	IORES_DESC_DEVICE_PRIVATE_MEMORY	= 6,
 	IORES_DESC_DEVICE_PUBLIC_MEMORY		= 7,
+	IORES_DESC_RESERVED			= 8,
 };
 
 /* helpers to define resources */
-- 
cgit v1.2.3


From 5da04cc86d1215fd9fe0e5c88ead6e8428a75e56 Mon Sep 17 00:00:00 2001
From: Lianbo Jiang <lijiang@redhat.com>
Date: Tue, 23 Apr 2019 09:30:06 +0800
Subject: x86/mm: Rework ioremap resource mapping determination

On ioremap(), __ioremap_check_mem() does a couple of checks on the
supplied memory range to determine how the range should be mapped and in
particular what protection flags should be used.

Generalize the procedure by introducing IORES_MAP_* flags which control
different aspects of the ioremapping and use them in the respective
helpers which determine which descriptor flags should be set per range.

 [ bp:
   - Rewrite commit message.
   - Add/improve comments.
   - Reflow __ioremap_caller()'s args.
   - s/__ioremap_check_desc/__ioremap_check_encrypted/g;
   - s/__ioremap_res_check/__ioremap_collect_map_flags/g;
   - clarify __ioremap_check_ram()'s purpose. ]

Signed-off-by: Lianbo Jiang <lijiang@redhat.com>
Co-developed-by: Borislav Petkov <bp@suse.de>
Signed-off-by: Borislav Petkov <bp@suse.de>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: bhe@redhat.com
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: dyoung@redhat.com
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: kexec@lists.infradead.org
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Tom Lendacky <thomas.lendacky@amd.com>
Cc: x86-ml <x86@kernel.org>
Link: https://lkml.kernel.org/r/20190423013007.17838-3-lijiang@redhat.com
---
 arch/x86/mm/ioremap.c  | 71 ++++++++++++++++++++++++++++++++------------------
 include/linux/ioport.h |  9 +++++++
 2 files changed, 54 insertions(+), 26 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index 4b6423e7bd21..e500f1df1140 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -28,9 +28,11 @@
 
 #include "physaddr.h"
 
-struct ioremap_mem_flags {
-	bool system_ram;
-	bool desc_other;
+/*
+ * Descriptor controlling ioremap() behavior.
+ */
+struct ioremap_desc {
+	unsigned int flags;
 };
 
 /*
@@ -62,13 +64,14 @@ int ioremap_change_attr(unsigned long vaddr, unsigned long size,
 	return err;
 }
 
-static bool __ioremap_check_ram(struct resource *res)
+/* Does the range (or a subset of) contain normal RAM? */
+static unsigned int __ioremap_check_ram(struct resource *res)
 {
 	unsigned long start_pfn, stop_pfn;
 	unsigned long i;
 
 	if ((res->flags & IORESOURCE_SYSTEM_RAM) != IORESOURCE_SYSTEM_RAM)
-		return false;
+		return 0;
 
 	start_pfn = (res->start + PAGE_SIZE - 1) >> PAGE_SHIFT;
 	stop_pfn = (res->end + 1) >> PAGE_SHIFT;
@@ -76,28 +79,44 @@ static bool __ioremap_check_ram(struct resource *res)
 		for (i = 0; i < (stop_pfn - start_pfn); ++i)
 			if (pfn_valid(start_pfn + i) &&
 			    !PageReserved(pfn_to_page(start_pfn + i)))
-				return true;
+				return IORES_MAP_SYSTEM_RAM;
 	}
 
-	return false;
+	return 0;
 }
 
-static int __ioremap_check_desc_other(struct resource *res)
+/*
+ * In a SEV guest, NONE and RESERVED should not be mapped encrypted because
+ * there the whole memory is already encrypted.
+ */
+static unsigned int __ioremap_check_encrypted(struct resource *res)
 {
-	return (res->desc != IORES_DESC_NONE);
+	if (!sev_active())
+		return 0;
+
+	switch (res->desc) {
+	case IORES_DESC_NONE:
+	case IORES_DESC_RESERVED:
+		break;
+	default:
+		return IORES_MAP_ENCRYPTED;
+	}
+
+	return 0;
 }
 
-static int __ioremap_res_check(struct resource *res, void *arg)
+static int __ioremap_collect_map_flags(struct resource *res, void *arg)
 {
-	struct ioremap_mem_flags *flags = arg;
+	struct ioremap_desc *desc = arg;
 
-	if (!flags->system_ram)
-		flags->system_ram = __ioremap_check_ram(res);
+	if (!(desc->flags & IORES_MAP_SYSTEM_RAM))
+		desc->flags |= __ioremap_check_ram(res);
 
-	if (!flags->desc_other)
-		flags->desc_other = __ioremap_check_desc_other(res);
+	if (!(desc->flags & IORES_MAP_ENCRYPTED))
+		desc->flags |= __ioremap_check_encrypted(res);
 
-	return flags->system_ram && flags->desc_other;
+	return ((desc->flags & (IORES_MAP_SYSTEM_RAM | IORES_MAP_ENCRYPTED)) ==
+			       (IORES_MAP_SYSTEM_RAM | IORES_MAP_ENCRYPTED));
 }
 
 /*
@@ -106,15 +125,15 @@ static int __ioremap_res_check(struct resource *res, void *arg)
  * resource described not as IORES_DESC_NONE (e.g. IORES_DESC_ACPI_TABLES).
  */
 static void __ioremap_check_mem(resource_size_t addr, unsigned long size,
-				struct ioremap_mem_flags *flags)
+				struct ioremap_desc *desc)
 {
 	u64 start, end;
 
 	start = (u64)addr;
 	end = start + size - 1;
-	memset(flags, 0, sizeof(*flags));
+	memset(desc, 0, sizeof(struct ioremap_desc));
 
-	walk_mem_res(start, end, flags, __ioremap_res_check);
+	walk_mem_res(start, end, desc, __ioremap_collect_map_flags);
 }
 
 /*
@@ -131,15 +150,15 @@ static void __ioremap_check_mem(resource_size_t addr, unsigned long size,
  * have to convert them into an offset in a page-aligned mapping, but the
  * caller shouldn't need to know that small detail.
  */
-static void __iomem *__ioremap_caller(resource_size_t phys_addr,
-		unsigned long size, enum page_cache_mode pcm,
-		void *caller, bool encrypted)
+static void __iomem *
+__ioremap_caller(resource_size_t phys_addr, unsigned long size,
+		 enum page_cache_mode pcm, void *caller, bool encrypted)
 {
 	unsigned long offset, vaddr;
 	resource_size_t last_addr;
 	const resource_size_t unaligned_phys_addr = phys_addr;
 	const unsigned long unaligned_size = size;
-	struct ioremap_mem_flags mem_flags;
+	struct ioremap_desc io_desc;
 	struct vm_struct *area;
 	enum page_cache_mode new_pcm;
 	pgprot_t prot;
@@ -158,12 +177,12 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr,
 		return NULL;
 	}
 
-	__ioremap_check_mem(phys_addr, size, &mem_flags);
+	__ioremap_check_mem(phys_addr, size, &io_desc);
 
 	/*
 	 * Don't allow anybody to remap normal RAM that we're using..
 	 */
-	if (mem_flags.system_ram) {
+	if (io_desc.flags & IORES_MAP_SYSTEM_RAM) {
 		WARN_ONCE(1, "ioremap on RAM at %pa - %pa\n",
 			  &phys_addr, &last_addr);
 		return NULL;
@@ -201,7 +220,7 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr,
 	 * resulting mapping.
 	 */
 	prot = PAGE_KERNEL_IO;
-	if ((sev_active() && mem_flags.desc_other) || encrypted)
+	if ((io_desc.flags & IORES_MAP_ENCRYPTED) || encrypted)
 		prot = pgprot_encrypted(prot);
 
 	switch (pcm) {
diff --git a/include/linux/ioport.h b/include/linux/ioport.h
index 6ed59de48bd5..5db386cfc2d4 100644
--- a/include/linux/ioport.h
+++ b/include/linux/ioport.h
@@ -12,6 +12,7 @@
 #ifndef __ASSEMBLY__
 #include <linux/compiler.h>
 #include <linux/types.h>
+#include <linux/bits.h>
 /*
  * Resources are tree-like, allowing
  * nesting etc..
@@ -136,6 +137,14 @@ enum {
 	IORES_DESC_RESERVED			= 8,
 };
 
+/*
+ * Flags controlling ioremap() behavior.
+ */
+enum {
+	IORES_MAP_SYSTEM_RAM		= BIT(0),
+	IORES_MAP_ENCRYPTED		= BIT(1),
+};
+
 /* helpers to define resources */
 #define DEFINE_RES_NAMED(_start, _size, _name, _flags)			\
 	{								\
-- 
cgit v1.2.3


From 2f578aaf51624aa6fcff041fc7dc5c2d4dfa447f Mon Sep 17 00:00:00 2001
From: Minwoo Im <minwoo.im.dev@gmail.com>
Date: Sun, 9 Jun 2019 05:15:51 +0900
Subject: block: move tag field position in struct request

__data_len and __sector are internal fields which should not be accessed
directly at driver level, as the comment above them says. The tag field,
however, can be accessed directly at driver level, so move it elsewhere
to make that comment accurate.

Cc: Jens Axboe <axboe@kernel.dk>
Cc: linux-block@vger.kernel.org
Signed-off-by: Minwoo Im <minwoo.im.dev@gmail.com>
Reviewed-by: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blkdev.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 592669bcc536..90e6914bea0c 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -137,11 +137,11 @@ struct request {
 	unsigned int cmd_flags;		/* op and common flags */
 	req_flags_t rq_flags;
 
+	int tag;
 	int internal_tag;
 
 	/* the following two fields are internal, NEVER access directly */
 	unsigned int __data_len;	/* total data len */
-	int tag;
 	sector_t __sector;		/* sector cursor */
 
 	struct bio *bio;
-- 
cgit v1.2.3


From 3a211b71529fdd0a89095b18fb19155db0c8fb5d Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Thu, 23 May 2019 18:43:11 +0300
Subject: blk-core: Remove blk_end_request*() declarations

Commit a1ce35fa49852db60fc6e268 ("block: remove dead elevator code")
deleted blk_end_request() and friends, but some declarations are still
left. Purge them.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-core.c       |  2 +-
 include/linux/blkdev.h | 12 ------------
 2 files changed, 1 insertion(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk-core.c b/block/blk-core.c
index 8340f69670d8..94c6520bc786 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1348,7 +1348,7 @@ EXPORT_SYMBOL_GPL(blk_steal_bios);
  *
  *     This special helper function is only for request stacking drivers
  *     (e.g. request-based dm) so that they can handle partial completion.
- *     Actual device drivers should use blk_end_request instead.
+ *     Actual device drivers should use blk_mq_end_request instead.
  *
  *     Passing the result of blk_rq_bytes() as @nr_bytes guarantees
  *     %false return from this function.
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 90e6914bea0c..ad49a775c54f 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1026,21 +1026,9 @@ void blk_steal_bios(struct bio_list *list, struct request *rq);
  *
  * blk_update_request() completes given number of bytes and updates
  * the request without completing it.
- *
- * blk_end_request() and friends.  __blk_end_request() must be called
- * with the request queue spinlock acquired.
- *
- * Several drivers define their own end_request and call
- * blk_end_request() for parts of the original function.
- * This prevents code duplication in drivers.
  */
 extern bool blk_update_request(struct request *rq, blk_status_t error,
 			       unsigned int nr_bytes);
-extern void blk_end_request_all(struct request *rq, blk_status_t error);
-extern bool __blk_end_request(struct request *rq, blk_status_t error,
-			      unsigned int nr_bytes);
-extern void __blk_end_request_all(struct request *rq, blk_status_t error);
-extern bool __blk_end_request_cur(struct request *rq, blk_status_t error);
 
 extern void __blk_complete_request(struct request *);
 extern void blk_abort_request(struct request *);
-- 
cgit v1.2.3


From 8527fa6cc68a489f735823e61b31ec6cb266274a Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Wed, 19 Jun 2019 14:54:36 +0200
Subject: netfilter: synproxy: fix building syncookie calls

When either CONFIG_IPV6 or CONFIG_SYN_COOKIES is disabled, the kernel
fails to build:

include/linux/netfilter_ipv6.h:180:9: error: implicit declaration of function '__cookie_v6_init_sequence'
      [-Werror,-Wimplicit-function-declaration]
        return __cookie_v6_init_sequence(iph, th, mssp);
include/linux/netfilter_ipv6.h:194:9: error: implicit declaration of function '__cookie_v6_check'
      [-Werror,-Wimplicit-function-declaration]
        return __cookie_v6_check(iph, th, cookie);
net/ipv6/netfilter.c:237:26: error: use of undeclared identifier '__cookie_v6_init_sequence'; did you mean 'cookie_init_sequence'?
net/ipv6/netfilter.c:238:21: error: use of undeclared identifier '__cookie_v6_check'; did you mean '__cookie_v4_check'?

Fix the IS_ENABLED() checks to match the function declaration
and definitions for these.

Fixes: 3006a5224f15 ("netfilter: synproxy: remove module dependency on IPv6 SYNPROXY")
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter_ipv6.h | 14 ++++++++------
 net/ipv6/netfilter.c           |  2 ++
 2 files changed, 10 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter_ipv6.h b/include/linux/netfilter_ipv6.h
index 35b12525ee45..22e6398bc482 100644
--- a/include/linux/netfilter_ipv6.h
+++ b/include/linux/netfilter_ipv6.h
@@ -163,31 +163,33 @@ static inline u32 nf_ipv6_cookie_init_sequence(const struct ipv6hdr *iph,
 					       const struct tcphdr *th,
 					       u16 *mssp)
 {
+#if IS_ENABLED(CONFIG_SYN_COOKIES)
 #if IS_MODULE(CONFIG_IPV6)
 	const struct nf_ipv6_ops *v6_ops = nf_get_ipv6_ops();
 
 	if (v6_ops)
 		return v6_ops->cookie_init_sequence(iph, th, mssp);
-
-	return 0;
-#else
+#elif IS_BUILTIN(CONFIG_IPV6)
 	return __cookie_v6_init_sequence(iph, th, mssp);
 #endif
+#endif
+	return 0;
 }
 
 static inline int nf_cookie_v6_check(const struct ipv6hdr *iph,
 				     const struct tcphdr *th, __u32 cookie)
 {
+#if IS_ENABLED(CONFIG_SYN_COOKIES)
 #if IS_MODULE(CONFIG_IPV6)
 	const struct nf_ipv6_ops *v6_ops = nf_get_ipv6_ops();
 
 	if (v6_ops)
 		return v6_ops->cookie_v6_check(iph, th, cookie);
-
-	return 0;
-#else
+#elif IS_BUILTIN(CONFIG_IPV6)
 	return __cookie_v6_check(iph, th, cookie);
 #endif
+#endif
+	return 0;
 }
 
 __sum16 nf_ip6_checksum(struct sk_buff *skb, unsigned int hook,
diff --git a/net/ipv6/netfilter.c b/net/ipv6/netfilter.c
index dffb10fdc3e8..61819ed858b1 100644
--- a/net/ipv6/netfilter.c
+++ b/net/ipv6/netfilter.c
@@ -234,8 +234,10 @@ static const struct nf_ipv6_ops ipv6ops = {
 	.route_me_harder	= ip6_route_me_harder,
 	.dev_get_saddr		= ipv6_dev_get_saddr,
 	.route			= __nf_ip6_route,
+#if IS_ENABLED(CONFIG_SYN_COOKIES)
 	.cookie_init_sequence	= __cookie_v6_init_sequence,
 	.cookie_v6_check	= __cookie_v6_check,
+#endif
 #endif
 	.route_input		= ip6_route_input,
 	.fragment		= ip6_fragment,
-- 
cgit v1.2.3


From 2735b683e1f284560f7e8e1d1ebf385ab111312d Mon Sep 17 00:00:00 2001
From: Richard Fitzgerald <rf@opensource.cirrus.com>
Date: Wed, 19 Jun 2019 14:41:56 +0100
Subject: ASoC: madera: Add common support for Cirrus Logic Madera codecs

The Cirrus Logic Madera codecs are a family of related codecs with
extensive digital and analogue I/O, digital mixing and routing,
signal processing and programmable DSPs. This patch adds common
support code shared by all Madera codecs.

This patch also adds the pdata to the parent mfd pdata struct.
Since there is a circular build dependency it's convenient to
patch them both atomically.

Signed-off-by: Nariman Poushin <nariman@opensource.cirrus.com>
Signed-off-by: Nikesh Oswal <Nikesh.Oswal@cirrus.com>
Signed-off-by: Piotr Stankiewicz <piotrs@opensource.cirrus.com>
Signed-off-by: Ajit Pandey <ajit.pandey@incubesol.com>
Signed-off-by: Richard Fitzgerald <rf@opensource.cirrus.com>
Signed-off-by: Charles Keepax <ckeepax@opensource.cirrus.com>
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 MAINTAINERS                      |    4 +
 include/linux/mfd/madera/pdata.h |    4 +
 include/sound/madera-pdata.h     |   63 +
 sound/soc/codecs/Kconfig         |    5 +
 sound/soc/codecs/Makefile        |    2 +
 sound/soc/codecs/madera.c        | 4181 ++++++++++++++++++++++++++++++++++++++
 sound/soc/codecs/madera.h        |  446 ++++
 7 files changed, 4705 insertions(+)
 create mode 100644 include/sound/madera-pdata.h
 create mode 100644 sound/soc/codecs/madera.c
 create mode 100644 sound/soc/codecs/madera.h

(limited to 'include/linux')

diff --git a/MAINTAINERS b/MAINTAINERS
index c35d1f72bc73..9ea100957c59 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -3915,14 +3915,18 @@ W:	https://github.com/CirrusLogic/linux-drivers/wiki
 S:	Supported
 F:	Documentation/devicetree/bindings/mfd/madera.txt
 F:	Documentation/devicetree/bindings/pinctrl/cirrus,madera-pinctrl.txt
+F:	Documentation/devicetree/bindings/sound/madera.txt
 F:	include/dt-bindings/sound/madera*
 F:	include/linux/irqchip/irq-madera*
 F:	include/linux/mfd/madera/*
+F:	include/sound/madera*
 F:	drivers/gpio/gpio-madera*
 F:	drivers/irqchip/irq-madera*
 F:	drivers/mfd/madera*
 F:	drivers/mfd/cs47l*
 F:	drivers/pinctrl/cirrus/*
+F:	sound/soc/codecs/cs47l*
+F:	sound/soc/codecs/madera*
 
 CLANG-FORMAT FILE
 M:	Miguel Ojeda <miguel.ojeda.sandonis@gmail.com>
diff --git a/include/linux/mfd/madera/pdata.h b/include/linux/mfd/madera/pdata.h
index 8dc852402dbb..60cd8ec98563 100644
--- a/include/linux/mfd/madera/pdata.h
+++ b/include/linux/mfd/madera/pdata.h
@@ -16,6 +16,7 @@
 #include <linux/regulator/arizona-ldo1.h>
 #include <linux/regulator/arizona-micsupp.h>
 #include <linux/regulator/machine.h>
+#include <sound/madera-pdata.h>
 
 #define MADERA_MAX_MICBIAS		4
 #define MADERA_MAX_CHILD_MICBIAS	4
@@ -39,6 +40,7 @@ struct madera_codec_pdata;
  * @gpsw:	    General purpose switch mode setting. Depends on the external
  *		    hardware connected to the switch. (See the SW1_MODE field
  *		    in the datasheet for the available values for your codec)
+ * @codec:	    Substruct of pdata for the ASoC codec driver
  */
 struct madera_pdata {
 	struct gpio_desc *reset;
@@ -53,6 +55,8 @@ struct madera_pdata {
 	int n_gpio_configs;
 
 	u32 gpsw[MADERA_MAX_GPSW];
+
+	struct madera_codec_pdata codec;
 };
 
 #endif
diff --git a/include/sound/madera-pdata.h b/include/sound/madera-pdata.h
new file mode 100644
index 000000000000..441decefb7f3
--- /dev/null
+++ b/include/sound/madera-pdata.h
@@ -0,0 +1,63 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Platform data for Madera codec driver
+ *
+ * Copyright (C) 2016-2019 Cirrus Logic, Inc. and
+ *                         Cirrus Logic International Semiconductor Ltd.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef MADERA_CODEC_PDATA_H
+#define MADERA_CODEC_PDATA_H
+
+#include <linux/kernel.h>
+
+#define MADERA_MAX_INPUT		6
+#define MADERA_MAX_MUXED_CHANNELS	4
+#define MADERA_MAX_OUTPUT		6
+#define MADERA_MAX_AIF			4
+#define MADERA_MAX_PDM_SPK		2
+#define MADERA_MAX_DSP			7
+
+/**
+ * struct madera_codec_pdata
+ *
+ * @max_channels_clocked: Maximum number of channels that I2S clocks will be
+ *			  generated for. Useful when clock master for systems
+ *			  where the I2S bus has multiple data lines.
+ * @dmic_ref:		  Indicates how the MICBIAS pins have been externally
+ *			  connected to DMICs on each input. A value of 0
+ *			  indicates MICVDD and is the default. Other values are:
+ *			  For CS47L35 one of the CS47L35_DMIC_REF_xxx values
+ *			  For all other codecs one of the MADERA_DMIC_REF_xxx
+ *			  Also see the datasheet for a description of the
+ *			  INn_DMIC_SUP field.
+ * @inmode:		  Mode for the ADC inputs. One of the MADERA_INMODE_xxx
+ *			  values. Two-dimensional array
+ *			  [input_number][channel number], with four slots per
+ *			  input in the order
+ *			  [n][0]=INnAL [n][1]=INnAR [n][2]=INnBL [n][3]=INnBR
+ * @out_mono:		  For each output set the value to TRUE to indicate that
+ *			  the output is mono. [0]=OUT1, [1]=OUT2, ...
+ * @pdm_fmt:		  PDM speaker data format. See the PDM_SPKn_FMT field in
+ *			  the datasheet for a description of this value.
+ * @pdm_mute:		  PDM mute format. See the PDM_SPKn_CTRL_1 register
+ *			  in the datasheet for a description of this value.
+ */
+struct madera_codec_pdata {
+	u32 max_channels_clocked[MADERA_MAX_AIF];
+
+	u32 dmic_ref[MADERA_MAX_INPUT];
+
+	u32 inmode[MADERA_MAX_INPUT][MADERA_MAX_MUXED_CHANNELS];
+
+	bool out_mono[MADERA_MAX_OUTPUT];
+
+	u32 pdm_fmt[MADERA_MAX_PDM_SPK];
+	u32 pdm_mute[MADERA_MAX_PDM_SPK];
+};
+
+#endif
diff --git a/sound/soc/codecs/Kconfig b/sound/soc/codecs/Kconfig
index 1bda52ef0cd0..f3ac661b8845 100644
--- a/sound/soc/codecs/Kconfig
+++ b/sound/soc/codecs/Kconfig
@@ -284,10 +284,12 @@ config SND_SOC_WM_HUBS
 config SND_SOC_WM_ADSP
 	tristate
 	select SND_SOC_COMPRESS
+	default y if SND_SOC_MADERA=y
 	default y if SND_SOC_CS47L24=y
 	default y if SND_SOC_WM5102=y
 	default y if SND_SOC_WM5110=y
 	default y if SND_SOC_WM2200=y
+	default m if SND_SOC_MADERA=m
 	default m if SND_SOC_CS47L24=m
 	default m if SND_SOC_WM5102=m
 	default m if SND_SOC_WM5110=m
@@ -704,6 +706,9 @@ config SND_SOC_LOCHNAGAR_SC
 	  This driver support the sound card functionality of the Cirrus
 	  Logic Lochnagar audio development board.
 
+config SND_SOC_MADERA
+	tristate
+
 config SND_SOC_MAX98088
 	tristate "Maxim MAX98088/9 Low-Power, Stereo Audio Codec"
 	depends on I2C
diff --git a/sound/soc/codecs/Makefile b/sound/soc/codecs/Makefile
index 112701fd44a8..d21e1be3e7a7 100644
--- a/sound/soc/codecs/Makefile
+++ b/sound/soc/codecs/Makefile
@@ -93,6 +93,7 @@ snd-soc-l3-objs := l3.o
 snd-soc-lm4857-objs := lm4857.o
 snd-soc-lm49453-objs := lm49453.o
 snd-soc-lochnagar-sc-objs := lochnagar-sc.o
+snd-soc-madera-objs := madera.o
 snd-soc-max9759-objs := max9759.o
 snd-soc-max9768-objs := max9768.o
 snd-soc-max98088-objs := max98088.o
@@ -369,6 +370,7 @@ obj-$(CONFIG_SND_SOC_L3)	+= snd-soc-l3.o
 obj-$(CONFIG_SND_SOC_LM4857)	+= snd-soc-lm4857.o
 obj-$(CONFIG_SND_SOC_LM49453)   += snd-soc-lm49453.o
 obj-$(CONFIG_SND_SOC_LOCHNAGAR_SC)	+= snd-soc-lochnagar-sc.o
+obj-$(CONFIG_SND_SOC_MADERA)	+= snd-soc-madera.o
 obj-$(CONFIG_SND_SOC_MAX9759)	+= snd-soc-max9759.o
 obj-$(CONFIG_SND_SOC_MAX9768)	+= snd-soc-max9768.o
 obj-$(CONFIG_SND_SOC_MAX98088)	+= snd-soc-max98088.o
diff --git a/sound/soc/codecs/madera.c b/sound/soc/codecs/madera.c
new file mode 100644
index 000000000000..6146c7a070cb
--- /dev/null
+++ b/sound/soc/codecs/madera.c
@@ -0,0 +1,4181 @@
+// SPDX-License-Identifier: GPL-2.0
+//
+// Cirrus Logic Madera class codecs common support
+//
+// Copyright (C) 2015-2019 Cirrus Logic, Inc. and
+//                         Cirrus Logic International Semiconductor Ltd.
+//
+// This program is free software; you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by the
+// Free Software Foundation; version 2.
+//
+
+#include <linux/delay.h>
+#include <linux/gcd.h>
+#include <linux/module.h>
+#include <linux/pm_runtime.h>
+#include <linux/slab.h>
+#include <sound/pcm.h>
+#include <sound/pcm_params.h>
+#include <sound/tlv.h>
+
+#include <linux/irqchip/irq-madera.h>
+#include <linux/mfd/madera/core.h>
+#include <linux/mfd/madera/registers.h>
+#include <linux/mfd/madera/pdata.h>
+#include <sound/madera-pdata.h>
+
+#include <dt-bindings/sound/madera.h>
+
+#include "madera.h"
+
+/*
+ * Offsets of the individual AIF registers. The users are outside this part
+ * of the file; presumably each is added to an AIF's base register address
+ * (TODO confirm).
+ */
+#define MADERA_AIF_BCLK_CTRL			0x00
+#define MADERA_AIF_TX_PIN_CTRL			0x01
+#define MADERA_AIF_RX_PIN_CTRL			0x02
+#define MADERA_AIF_RATE_CTRL			0x03
+#define MADERA_AIF_FORMAT			0x04
+#define MADERA_AIF_RX_BCLK_RATE			0x06
+#define MADERA_AIF_FRAME_CTRL_1			0x07
+#define MADERA_AIF_FRAME_CTRL_2			0x08
+#define MADERA_AIF_FRAME_CTRL_3			0x09
+#define MADERA_AIF_FRAME_CTRL_4			0x0A
+#define MADERA_AIF_FRAME_CTRL_5			0x0B
+#define MADERA_AIF_FRAME_CTRL_6			0x0C
+#define MADERA_AIF_FRAME_CTRL_7			0x0D
+#define MADERA_AIF_FRAME_CTRL_8			0x0E
+#define MADERA_AIF_FRAME_CTRL_9			0x0F
+#define MADERA_AIF_FRAME_CTRL_10		0x10
+#define MADERA_AIF_FRAME_CTRL_11		0x11
+#define MADERA_AIF_FRAME_CTRL_12		0x12
+#define MADERA_AIF_FRAME_CTRL_13		0x13
+#define MADERA_AIF_FRAME_CTRL_14		0x14
+#define MADERA_AIF_FRAME_CTRL_15		0x15
+#define MADERA_AIF_FRAME_CTRL_16		0x16
+#define MADERA_AIF_FRAME_CTRL_17		0x17
+#define MADERA_AIF_FRAME_CTRL_18		0x18
+#define MADERA_AIF_TX_ENABLES			0x19
+#define MADERA_AIF_RX_ENABLES			0x1A
+#define MADERA_AIF_FORCE_WRITE			0x1B
+
+/*
+ * DSP configuration registers, as offsets from a DSP's base address
+ * (dsp->base), and the clock select and rate fields that
+ * madera_write_adsp_clk_setting() updates in the CONFIG_1 register.
+ */
+#define MADERA_DSP_CONFIG_1_OFFS		0x00
+#define MADERA_DSP_CONFIG_2_OFFS		0x02
+
+#define MADERA_DSP_CLK_SEL_MASK			0x70000
+#define MADERA_DSP_CLK_SEL_SHIFT		16
+
+#define MADERA_DSP_RATE_MASK			0x7800
+#define MADERA_DSP_RATE_SHIFT			11
+
+/*
+ * Selector values for the SYSCLK and DSPCLK frequency ranges. The users are
+ * outside this part of the file; presumably these are register field
+ * encodings (TODO confirm).
+ */
+#define MADERA_SYSCLK_6MHZ			0
+#define MADERA_SYSCLK_12MHZ			1
+#define MADERA_SYSCLK_24MHZ			2
+#define MADERA_SYSCLK_49MHZ			3
+#define MADERA_SYSCLK_98MHZ			4
+
+#define MADERA_DSPCLK_9MHZ			0
+#define MADERA_DSPCLK_18MHZ			1
+#define MADERA_DSPCLK_36MHZ			2
+#define MADERA_DSPCLK_73MHZ			3
+#define MADERA_DSPCLK_147MHZ			4
+
+/*
+ * Limits and fixed factors for the FLL and FLLAO configuration code, which
+ * lives outside this part of the file. The frequency values are presumably
+ * in Hz (TODO confirm).
+ */
+#define MADERA_FLL_VCO_CORNER			141900000
+#define MADERA_FLL_MAX_FREF			13500000
+#define MADERA_FLL_MAX_N			1023
+#define MADERA_FLL_MIN_FOUT			90000000
+#define MADERA_FLL_MAX_FOUT			100000000
+#define MADERA_FLL_MAX_FRATIO			16
+#define MADERA_FLL_MAX_REFDIV			8
+#define MADERA_FLL_OUTDIV			3
+#define MADERA_FLL_VCO_MULT			3
+#define MADERA_FLLAO_MAX_FREF			12288000
+#define MADERA_FLLAO_MIN_N			4
+#define MADERA_FLLAO_MAX_N			1023
+#define MADERA_FLLAO_MAX_FBDIV			254
+
+/*
+ * Register offsets for the FLL, its synchroniser block and the FLLAO. The
+ * users are outside this part of the file; presumably the offsets are
+ * relative to each FLL's base register, with the CS47L35 placing the
+ * synchroniser at a different offset (TODO confirm).
+ */
+#define MADERA_FLL_SYNCHRONISER_OFFS		0x10
+#define CS47L35_FLL_SYNCHRONISER_OFFS		0xE
+#define MADERA_FLL_CONTROL_1_OFFS		0x1
+#define MADERA_FLL_CONTROL_2_OFFS		0x2
+#define MADERA_FLL_CONTROL_3_OFFS		0x3
+#define MADERA_FLL_CONTROL_4_OFFS		0x4
+#define MADERA_FLL_CONTROL_5_OFFS		0x5
+#define MADERA_FLL_CONTROL_6_OFFS		0x6
+#define MADERA_FLL_CONTROL_7_OFFS		0x9
+#define MADERA_FLL_EFS_2_OFFS			0xA
+#define MADERA_FLL_SYNCHRONISER_1_OFFS		0x1
+#define MADERA_FLL_SYNCHRONISER_2_OFFS		0x2
+#define MADERA_FLL_SYNCHRONISER_3_OFFS		0x3
+#define MADERA_FLL_SYNCHRONISER_4_OFFS		0x4
+#define MADERA_FLL_SYNCHRONISER_5_OFFS		0x5
+#define MADERA_FLL_SYNCHRONISER_6_OFFS		0x6
+#define MADERA_FLL_SYNCHRONISER_7_OFFS		0x7
+#define MADERA_FLL_SPREAD_SPECTRUM_OFFS		0x9
+#define MADERA_FLL_GPIO_CLOCK_OFFS		0xA
+
+#define MADERA_FLLAO_CONTROL_1_OFFS		0x1
+#define MADERA_FLLAO_CONTROL_2_OFFS		0x2
+#define MADERA_FLLAO_CONTROL_3_OFFS		0x3
+#define MADERA_FLLAO_CONTROL_4_OFFS		0x4
+#define MADERA_FLLAO_CONTROL_5_OFFS		0x5
+#define MADERA_FLLAO_CONTROL_6_OFFS		0x6
+#define MADERA_FLLAO_CONTROL_7_OFFS		0x8
+#define MADERA_FLLAO_CONTROL_8_OFFS		0xA
+#define MADERA_FLLAO_CONTROL_9_OFFS		0xB
+#define MADERA_FLLAO_CONTROL_10_OFFS		0xC
+#define MADERA_FLLAO_CONTROL_11_OFFS		0xD
+
+/*
+ * Audio interface format selector values. The users are outside this part
+ * of the file; presumably written to the AIF format register (TODO confirm).
+ */
+#define MADERA_FMT_DSP_MODE_A			0
+#define MADERA_FMT_DSP_MODE_B			1
+#define MADERA_FMT_I2S_MODE			2
+#define MADERA_FMT_LEFT_JUSTIFIED_MODE		3
+
+/*
+ * Logging helpers that prefix every message with the FLL or AIF number.
+ *
+ * The object argument is parenthesized so that any pointer expression can
+ * be passed safely. Note that it is evaluated twice, so it must not have
+ * side effects.
+ */
+#define madera_fll_err(_fll, fmt, ...) \
+	dev_err((_fll)->madera->dev, "FLL%d: " fmt, (_fll)->id, ##__VA_ARGS__)
+#define madera_fll_warn(_fll, fmt, ...) \
+	dev_warn((_fll)->madera->dev, "FLL%d: " fmt, (_fll)->id, ##__VA_ARGS__)
+#define madera_fll_dbg(_fll, fmt, ...) \
+	dev_dbg((_fll)->madera->dev, "FLL%d: " fmt, (_fll)->id, ##__VA_ARGS__)
+
+#define madera_aif_err(_dai, fmt, ...) \
+	dev_err((_dai)->dev, "AIF%d: " fmt, (_dai)->id, ##__VA_ARGS__)
+#define madera_aif_warn(_dai, fmt, ...) \
+	dev_warn((_dai)->dev, "AIF%d: " fmt, (_dai)->id, ##__VA_ARGS__)
+#define madera_aif_dbg(_dai, fmt, ...) \
+	dev_dbg((_dai)->dev, "AIF%d: " fmt, (_dai)->id, ##__VA_ARGS__)
+
+/*
+ * Bus error interrupt for each DSP core. Indexed by the dsp_num argument of
+ * madera_init_bus_error_irq()/madera_free_bus_error_irq(), so entry 0 holds
+ * the DSP1 interrupt.
+ */
+static const int madera_dsp_bus_error_irqs[MADERA_MAX_ADSP] = {
+	MADERA_IRQ_DSP1_BUS_ERR,
+	MADERA_IRQ_DSP2_BUS_ERR,
+	MADERA_IRQ_DSP3_BUS_ERR,
+	MADERA_IRQ_DSP4_BUS_ERR,
+	MADERA_IRQ_DSP5_BUS_ERR,
+	MADERA_IRQ_DSP6_BUS_ERR,
+	MADERA_IRQ_DSP7_BUS_ERR,
+};
+
+/*
+ * Perform a few dummy register reads followed by a short delay so that the
+ * internal oscillator sends out some clocks. Does nothing while the device
+ * is runtime suspended. Read failures are logged but otherwise ignored.
+ */
+static void madera_spin_sysclk(struct madera_priv *priv)
+{
+	struct madera *madera = priv->madera;
+	unsigned int tmp;
+	int n = 0;
+	int err;
+
+	/* Skip this if the chip is down */
+	if (pm_runtime_suspended(madera->dev))
+		return;
+
+	/* The value read back is irrelevant, only the accesses matter */
+	while (n < 4) {
+		err = regmap_read(madera->regmap, MADERA_SOFTWARE_RESET, &tmp);
+		if (err)
+			dev_err(madera->dev,
+				"Failed to read sysclk spin %d: %d\n", n, err);
+		n++;
+	}
+
+	udelay(300);
+}
+
+/*
+ * Widget event callback that spins SYSCLK for every event it is called
+ * with. Always returns 0.
+ */
+int madera_sysclk_ev(struct snd_soc_dapm_widget *w,
+		     struct snd_kcontrol *kcontrol, int event)
+{
+	struct snd_soc_component *component = snd_soc_dapm_to_component(w->dapm);
+
+	madera_spin_sysclk(snd_soc_component_get_drvdata(component));
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(madera_sysclk_ev);
+
+/*
+ * Read MADERA_IRQ1_RAW_STATUS_15 and report the speaker overheat warning
+ * and overheat status bits through @warn and @shutdown.
+ *
+ * Returns 0 on success. On a read failure the error is logged and returned,
+ * and neither @warn nor @shutdown is written.
+ */
+static int madera_check_speaker_overheat(struct madera *madera,
+					 bool *warn, bool *shutdown)
+{
+	unsigned int status;
+	int err = regmap_read(madera->regmap, MADERA_IRQ1_RAW_STATUS_15,
+			      &status);
+
+	if (err) {
+		dev_err(madera->dev, "Failed to read thermal status: %d\n",
+			err);
+		return err;
+	}
+
+	*warn = !!(status & MADERA_SPK_OVERHEAT_WARN_STS1);
+	*shutdown = !!(status & MADERA_SPK_OVERHEAT_STS1);
+
+	return 0;
+}
+
+/*
+ * Speaker widget event callback.
+ *
+ * POST_PMU: set the widget's enable bit (1 << w->shift) in
+ * MADERA_OUTPUT_ENABLES_1, unless the overheat status reports shutdown, in
+ * which case -EBUSY is returned. A failure to read the status is returned
+ * as is.
+ * PRE_PMD: clear the same bit.
+ * Any other event is ignored. The register writes are not error checked.
+ */
+int madera_spk_ev(struct snd_soc_dapm_widget *w,
+		  struct snd_kcontrol *kcontrol, int event)
+{
+	struct snd_soc_component *component = snd_soc_dapm_to_component(w->dapm);
+	struct madera_priv *priv = snd_soc_component_get_drvdata(component);
+	struct madera *madera = priv->madera;
+	bool overheat_warn, overheat_shutdown;
+	int err;
+
+	if (event == SND_SOC_DAPM_PRE_PMD) {
+		regmap_update_bits(madera->regmap, MADERA_OUTPUT_ENABLES_1,
+				   1 << w->shift, 0);
+		return 0;
+	}
+
+	if (event != SND_SOC_DAPM_POST_PMU)
+		return 0;
+
+	err = madera_check_speaker_overheat(madera, &overheat_warn,
+					    &overheat_shutdown);
+	if (err)
+		return err;
+
+	if (overheat_shutdown) {
+		dev_crit(madera->dev,
+			 "Speaker not enabled due to temperature\n");
+		return -EBUSY;
+	}
+
+	regmap_update_bits(madera->regmap, MADERA_OUTPUT_ENABLES_1,
+			   1 << w->shift, 1 << w->shift);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(madera_spk_ev);
+
+/*
+ * Interrupt handler shared by the thermal warning and thermal shutdown
+ * interrupts; @data is the struct madera.
+ *
+ * If the status read fails or reports shutdown, the OUT4L/OUT4R enables are
+ * cleared. A warning is only logged. When neither status bit is set the
+ * interrupt is reported as spurious with IRQ_NONE.
+ */
+static irqreturn_t madera_thermal_warn(int irq, void *data)
+{
+	struct madera *madera = data;
+	bool warn, shutdown;
+	int err;
+
+	/* for safety attempt to shutdown on error */
+	if (madera_check_speaker_overheat(madera, &warn, &shutdown) ||
+	    shutdown) {
+		dev_crit(madera->dev, "Thermal shutdown\n");
+		err = regmap_update_bits(madera->regmap,
+					 MADERA_OUTPUT_ENABLES_1,
+					 MADERA_OUT4L_ENA | MADERA_OUT4R_ENA,
+					 0);
+		if (err != 0)
+			dev_crit(madera->dev,
+				 "Failed to disable speaker outputs: %d\n",
+				 err);
+		return IRQ_HANDLED;
+	}
+
+	if (!warn) {
+		dev_info(madera->dev, "Spurious thermal warning\n");
+		return IRQ_NONE;
+	}
+
+	dev_alert(madera->dev, "Thermal warning\n");
+
+	return IRQ_HANDLED;
+}
+
+/*
+ * Request the speaker overheat warning and overheat shutdown interrupts,
+ * both handled by madera_thermal_warn(). A failed request is only logged;
+ * the function always returns 0.
+ */
+int madera_init_overheat(struct madera_priv *priv)
+{
+	struct madera *madera = priv->madera;
+	int err;
+
+	err = madera_request_irq(madera, MADERA_IRQ_SPK_OVERHEAT_WARN,
+				 "Thermal warning", madera_thermal_warn,
+				 madera);
+	if (err)
+		dev_err(madera->dev,
+			"Failed to get thermal warning IRQ: %d\n", err);
+
+	err = madera_request_irq(madera, MADERA_IRQ_SPK_OVERHEAT,
+				 "Thermal shutdown", madera_thermal_warn,
+				 madera);
+	if (err)
+		dev_err(madera->dev,
+			"Failed to get thermal shutdown IRQ: %d\n", err);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(madera_init_overheat);
+
+/*
+ * Release the two interrupts requested by madera_init_overheat().
+ * Always returns 0.
+ */
+int madera_free_overheat(struct madera_priv *priv)
+{
+	struct madera *chip = priv->madera;
+
+	madera_free_irq(chip, MADERA_IRQ_SPK_OVERHEAT_WARN, chip);
+	madera_free_irq(chip, MADERA_IRQ_SPK_OVERHEAT, chip);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(madera_free_overheat);
+
+/*
+ * Common initialisation: check the mixer tables at build time, mark the
+ * clamp as set for every headphone output and initialise the rate lock.
+ * Always returns 0.
+ */
+int madera_core_init(struct madera_priv *priv)
+{
+	struct madera *madera = priv->madera;
+	int out;
+
+	/* trap undersized array initializers */
+	BUILD_BUG_ON(!madera_mixer_texts[MADERA_NUM_MIXER_INPUTS - 1]);
+	BUILD_BUG_ON(!madera_mixer_values[MADERA_NUM_MIXER_INPUTS - 1]);
+
+	for (out = 0; out < MADERA_MAX_HP_OUTPUT; out++)
+		madera->out_clamp[out] = true;
+
+	mutex_init(&priv->rate_lock);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(madera_core_init);
+
+/*
+ * Counterpart of madera_core_init(): destroy the rate lock.
+ * Always returns 0.
+ */
+int madera_core_free(struct madera_priv *priv)
+{
+	struct mutex *lock = &priv->rate_lock;
+
+	mutex_destroy(lock);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(madera_core_free);
+
+/* Emit one debug line per domain group with its current reference count */
+static void madera_debug_dump_domain_groups(const struct madera_priv *priv)
+{
+	struct device *dev = priv->madera->dev;
+	int grp;
+
+	for (grp = 0; grp < ARRAY_SIZE(priv->domain_group_ref); grp++)
+		dev_dbg(dev, "domain_grp_ref[%d]=%d\n", grp,
+			priv->domain_group_ref[grp]);
+}
+
+/*
+ * Widget event callback that maintains the per domain group reference
+ * counts. The group is taken from w->shift: PRE_PMU increments its count,
+ * POST_PMD decrements it, other events leave it unchanged.
+ *
+ * Returns -EINVAL (after a WARN) if the group is outside domain_group_ref,
+ * otherwise 0.
+ */
+int madera_domain_clk_ev(struct snd_soc_dapm_widget *w,
+			 struct snd_kcontrol *kcontrol,
+			 int event)
+{
+	struct snd_soc_component *component = snd_soc_dapm_to_component(w->dapm);
+	struct madera_priv *priv = snd_soc_component_get_drvdata(component);
+	struct device *dev = priv->madera->dev;
+	int dom_grp = w->shift;
+
+	if (dom_grp >= ARRAY_SIZE(priv->domain_group_ref)) {
+		WARN(true, "%s dom_grp exceeds array size\n", __func__);
+		return -EINVAL;
+	}
+
+	/*
+	 * We can't rely on the DAPM mutex for locking because we need a lock
+	 * that can safely be called in hw_params
+	 */
+	mutex_lock(&priv->rate_lock);
+
+	if (event == SND_SOC_DAPM_PRE_PMU) {
+		dev_dbg(dev, "Inc ref on domain group %d\n", dom_grp);
+		priv->domain_group_ref[dom_grp]++;
+	} else if (event == SND_SOC_DAPM_POST_PMD) {
+		dev_dbg(dev, "Dec ref on domain group %d\n", dom_grp);
+		priv->domain_group_ref[dom_grp]--;
+	}
+
+	madera_debug_dump_domain_groups(priv);
+
+	mutex_unlock(&priv->rate_lock);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(madera_domain_clk_ev);
+
+/*
+ * Put handler for the OUT1 demux control.
+ *
+ * If the requested selection differs from the EP_SEL field currently in
+ * MADERA_OUTPUT_ENABLES_1, the OUT1 drivers are disabled, EP_SEL is
+ * rewritten (only while madera->out_clamp[0] is set), the mono mode of
+ * output 1 is updated to match and the OUT1 enables are restored from
+ * madera->hp_ena. Failures along the way are logged and do not abort the
+ * sequence.
+ *
+ * Returns -EINVAL for an out of range item, otherwise the result of
+ * snd_soc_dapm_mux_update_power().
+ */
+int madera_out1_demux_put(struct snd_kcontrol *kcontrol,
+			  struct snd_ctl_elem_value *ucontrol)
+{
+	struct snd_soc_component *component =
+		snd_soc_dapm_kcontrol_component(kcontrol);
+	struct snd_soc_dapm_context *dapm =
+		snd_soc_dapm_kcontrol_dapm(kcontrol);
+	struct madera_priv *priv = snd_soc_component_get_drvdata(component);
+	struct madera *madera = priv->madera;
+	struct soc_enum *e = (struct soc_enum *)kcontrol->private_value;
+	unsigned int ep_sel, mux, change;
+	bool out_mono;
+	int ret;
+
+	/* ">=" rather than "> items - 1" so an empty enum cannot wrap */
+	if (ucontrol->value.enumerated.item[0] >= e->items)
+		return -EINVAL;
+
+	mux = ucontrol->value.enumerated.item[0];
+
+	snd_soc_dapm_mutex_lock(dapm);
+
+	ep_sel = mux << MADERA_EP_SEL_SHIFT;
+
+	change = snd_soc_component_test_bits(component, MADERA_OUTPUT_ENABLES_1,
+					     MADERA_EP_SEL_MASK,
+					     ep_sel);
+	if (!change)
+		goto end;
+
+	/* EP_SEL should not be modified while HP or EP driver is enabled */
+	ret = regmap_update_bits(madera->regmap, MADERA_OUTPUT_ENABLES_1,
+				 MADERA_OUT1L_ENA | MADERA_OUT1R_ENA, 0);
+	if (ret)
+		dev_warn(madera->dev, "Failed to disable outputs: %d\n", ret);
+
+	usleep_range(2000, 3000); /* wait for wseq to complete */
+
+	/*
+	 * change demux setting
+	 *
+	 * Start from a clean status: when the clamp is not set nothing is
+	 * written here, and an error left over from disabling the outputs
+	 * must not be reported as a demux failure.
+	 */
+	ret = 0;
+	if (madera->out_clamp[0])
+		ret = regmap_update_bits(madera->regmap,
+					 MADERA_OUTPUT_ENABLES_1,
+					 MADERA_EP_SEL_MASK, ep_sel);
+	if (ret) {
+		dev_err(madera->dev, "Failed to set OUT1 demux: %d\n", ret);
+	} else {
+		/* apply correct setting for mono mode */
+		if (!ep_sel && !madera->pdata.codec.out_mono[0])
+			out_mono = false; /* stereo HP */
+		else
+			out_mono = true; /* EP or mono HP */
+
+		ret = madera_set_output_mode(component, 1, out_mono);
+		if (ret)
+			dev_warn(madera->dev,
+				 "Failed to set output mode: %d\n", ret);
+	}
+
+	/*
+	 * if HPDET has disabled the clamp while switching to HPOUT
+	 * OUT1 should remain disabled
+	 */
+	if (ep_sel ||
+	    (madera->out_clamp[0] && !madera->out_shorted[0])) {
+		ret = regmap_update_bits(madera->regmap,
+					 MADERA_OUTPUT_ENABLES_1,
+					 MADERA_OUT1L_ENA | MADERA_OUT1R_ENA,
+					 madera->hp_ena);
+		if (ret)
+			dev_warn(madera->dev,
+				 "Failed to restore earpiece outputs: %d\n",
+				 ret);
+		else if (madera->hp_ena)
+			msleep(34); /* wait for enable wseq */
+		else
+			usleep_range(2000, 3000); /* wait for disable wseq */
+	}
+
+end:
+	snd_soc_dapm_mutex_unlock(dapm);
+
+	return snd_soc_dapm_mux_update_power(dapm, kcontrol, mux, e, NULL);
+}
+EXPORT_SYMBOL_GPL(madera_out1_demux_put);
+
+/*
+ * Get handler for the OUT1 demux control: report the EP_SEL field of
+ * MADERA_OUTPUT_ENABLES_1 as the selected item. Returns 0 or the read error.
+ */
+int madera_out1_demux_get(struct snd_kcontrol *kcontrol,
+			  struct snd_ctl_elem_value *ucontrol)
+{
+	struct snd_soc_component *component =
+		snd_soc_dapm_kcontrol_component(kcontrol);
+	unsigned int reg_val;
+	int err;
+
+	err = snd_soc_component_read(component, MADERA_OUTPUT_ENABLES_1,
+				     &reg_val);
+	if (err)
+		return err;
+
+	ucontrol->value.enumerated.item[0] =
+		(reg_val & MADERA_EP_SEL_MASK) >> MADERA_EP_SEL_SHIFT;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(madera_out1_demux_get);
+
+/*
+ * Put handler for the IN1/IN2 input mux controls.
+ *
+ * Besides the mux field itself, the single-ended bit of the same register
+ * is written from the pdata inmode entry that belongs to the newly selected
+ * input (item 0 or 1). DAPM is only told about the mux when the register
+ * value actually changed.
+ *
+ * Returns -EINVAL for an item above 1 or an unexpected enum register, a
+ * negative regmap error, 0 if nothing changed, or the result of
+ * snd_soc_dapm_mux_update_power().
+ */
+static int madera_inmux_put(struct snd_kcontrol *kcontrol,
+			    struct snd_ctl_elem_value *ucontrol)
+{
+	struct snd_soc_component *component =
+		snd_soc_dapm_kcontrol_component(kcontrol);
+	struct snd_soc_dapm_context *dapm =
+		snd_soc_dapm_kcontrol_dapm(kcontrol);
+	struct madera_priv *priv = snd_soc_component_get_drvdata(component);
+	struct madera *madera = priv->madera;
+	struct soc_enum *e = (struct soc_enum *)kcontrol->private_value;
+	const unsigned int mux = ucontrol->value.enumerated.item[0];
+	unsigned int in_idx, chan;
+	unsigned int inmode, mask, val;
+	bool changed;
+	int ret;
+
+	if (mux > 1)
+		return -EINVAL;
+
+	/* Which input and which channel (0 = left, 1 = right) is this? */
+	switch (e->reg) {
+	case MADERA_ADC_DIGITAL_VOLUME_1L:
+		in_idx = 0;
+		chan = 0;
+		break;
+	case MADERA_ADC_DIGITAL_VOLUME_1R:
+		in_idx = 0;
+		chan = 1;
+		break;
+	case MADERA_ADC_DIGITAL_VOLUME_2L:
+		in_idx = 1;
+		chan = 0;
+		break;
+	case MADERA_ADC_DIGITAL_VOLUME_2R:
+		in_idx = 1;
+		chan = 1;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	inmode = madera->pdata.codec.inmode[in_idx][chan + (2 * mux)];
+
+	mask = (e->mask << e->shift_l) | MADERA_IN1L_SRC_SE_MASK;
+	val = mux << e->shift_l;
+	if (inmode & MADERA_INMODE_SE)
+		val |= 1 << MADERA_IN1L_SRC_SE_SHIFT;
+
+	dev_dbg(madera->dev, "mux=%u reg=0x%x inmode=0x%x mask=0x%x val=0x%x\n",
+		mux, e->reg, inmode, mask, val);
+
+	ret = regmap_update_bits_check(madera->regmap, e->reg, mask, val,
+				       &changed);
+	if (ret < 0)
+		return ret;
+
+	if (!changed)
+		return 0;
+
+	return snd_soc_dapm_mux_update_power(dapm, kcontrol, mux, e, NULL);
+}
+
+/*
+ * Input mux controls for IN1 and IN2, left and right. Every mux offers the
+ * two items "A" and "B"; madera_inmux_put() relies on this (items above 1
+ * are rejected) and on the enum register being one of the four
+ * ADC_DIGITAL_VOLUME registers used below.
+ */
+static const char * const madera_inmux_texts[] = {
+	"A",
+	"B",
+};
+
+static SOC_ENUM_SINGLE_DECL(madera_in1muxl_enum,
+			    MADERA_ADC_DIGITAL_VOLUME_1L,
+			    MADERA_IN1L_SRC_SHIFT,
+			    madera_inmux_texts);
+
+static SOC_ENUM_SINGLE_DECL(madera_in1muxr_enum,
+			    MADERA_ADC_DIGITAL_VOLUME_1R,
+			    MADERA_IN1R_SRC_SHIFT,
+			    madera_inmux_texts);
+
+static SOC_ENUM_SINGLE_DECL(madera_in2muxl_enum,
+			    MADERA_ADC_DIGITAL_VOLUME_2L,
+			    MADERA_IN2L_SRC_SHIFT,
+			    madera_inmux_texts);
+
+static SOC_ENUM_SINGLE_DECL(madera_in2muxr_enum,
+			    MADERA_ADC_DIGITAL_VOLUME_2R,
+			    MADERA_IN2R_SRC_SHIFT,
+			    madera_inmux_texts);
+
+/* Reads use the generic DAPM enum getter, writes go via madera_inmux_put() */
+const struct snd_kcontrol_new madera_inmux[] = {
+	SOC_DAPM_ENUM_EXT("IN1L Mux", madera_in1muxl_enum,
+			  snd_soc_dapm_get_enum_double, madera_inmux_put),
+	SOC_DAPM_ENUM_EXT("IN1R Mux", madera_in1muxr_enum,
+			  snd_soc_dapm_get_enum_double, madera_inmux_put),
+	SOC_DAPM_ENUM_EXT("IN2L Mux", madera_in2muxl_enum,
+			  snd_soc_dapm_get_enum_double, madera_inmux_put),
+	SOC_DAPM_ENUM_EXT("IN2R Mux", madera_in2muxr_enum,
+			  snd_soc_dapm_get_enum_double, madera_inmux_put),
+};
+EXPORT_SYMBOL_GPL(madera_inmux);
+
+/*
+ * "Analog"/"Digital" mode enums for IN1 to IN3, each backed by the mode
+ * field of the corresponding INnL_CONTROL register, and the DAPM enum
+ * controls built from them.
+ */
+static const char * const madera_dmode_texts[] = {
+	"Analog",
+	"Digital",
+};
+
+static SOC_ENUM_SINGLE_DECL(madera_in1dmode_enum,
+			    MADERA_IN1L_CONTROL,
+			    MADERA_IN1_MODE_SHIFT,
+			    madera_dmode_texts);
+
+static SOC_ENUM_SINGLE_DECL(madera_in2dmode_enum,
+			    MADERA_IN2L_CONTROL,
+			    MADERA_IN2_MODE_SHIFT,
+			    madera_dmode_texts);
+
+static SOC_ENUM_SINGLE_DECL(madera_in3dmode_enum,
+			    MADERA_IN3L_CONTROL,
+			    MADERA_IN3_MODE_SHIFT,
+			    madera_dmode_texts);
+
+const struct snd_kcontrol_new madera_inmode[] = {
+	SOC_DAPM_ENUM("IN1 Mode", madera_in1dmode_enum),
+	SOC_DAPM_ENUM("IN2 Mode", madera_in2dmode_enum),
+	SOC_DAPM_ENUM("IN3 Mode", madera_in3dmode_enum),
+};
+EXPORT_SYMBOL_GPL(madera_inmode);
+
+/*
+ * Tell whether the rate setting held in register @reg may be changed now.
+ *
+ * @reg is mapped to its domain group; the change is allowed only while the
+ * reference count of that group in priv->domain_group_ref is zero. A
+ * register that belongs to no known group is never changeable.
+ */
+static bool madera_can_change_grp_rate(const struct madera_priv *priv,
+				       unsigned int reg)
+{
+	int grp, count;
+
+	switch (reg) {
+	case MADERA_FX_CTRL1:
+		grp = MADERA_DOM_GRP_FX;
+		break;
+	case MADERA_ASRC1_RATE1:
+	case MADERA_ASRC1_RATE2:
+		grp = MADERA_DOM_GRP_ASRC1;
+		break;
+	case MADERA_ASRC2_RATE1:
+	case MADERA_ASRC2_RATE2:
+		grp = MADERA_DOM_GRP_ASRC2;
+		break;
+	case MADERA_ISRC_1_CTRL_1:
+	case MADERA_ISRC_1_CTRL_2:
+		grp = MADERA_DOM_GRP_ISRC1;
+		break;
+	case MADERA_ISRC_2_CTRL_1:
+	case MADERA_ISRC_2_CTRL_2:
+		grp = MADERA_DOM_GRP_ISRC2;
+		break;
+	case MADERA_ISRC_3_CTRL_1:
+	case MADERA_ISRC_3_CTRL_2:
+		grp = MADERA_DOM_GRP_ISRC3;
+		break;
+	case MADERA_ISRC_4_CTRL_1:
+	case MADERA_ISRC_4_CTRL_2:
+		grp = MADERA_DOM_GRP_ISRC4;
+		break;
+	case MADERA_OUTPUT_RATE_1:
+		grp = MADERA_DOM_GRP_OUT;
+		break;
+	case MADERA_SPD1_TX_CONTROL:
+		grp = MADERA_DOM_GRP_SPD;
+		break;
+	case MADERA_DSP1_CONFIG_1:
+	case MADERA_DSP1_CONFIG_2:
+		grp = MADERA_DOM_GRP_DSP1;
+		break;
+	case MADERA_DSP2_CONFIG_1:
+	case MADERA_DSP2_CONFIG_2:
+		grp = MADERA_DOM_GRP_DSP2;
+		break;
+	case MADERA_DSP3_CONFIG_1:
+	case MADERA_DSP3_CONFIG_2:
+		grp = MADERA_DOM_GRP_DSP3;
+		break;
+	case MADERA_DSP4_CONFIG_1:
+	case MADERA_DSP4_CONFIG_2:
+		grp = MADERA_DOM_GRP_DSP4;
+		break;
+	case MADERA_DSP5_CONFIG_1:
+	case MADERA_DSP5_CONFIG_2:
+		grp = MADERA_DOM_GRP_DSP5;
+		break;
+	case MADERA_DSP6_CONFIG_1:
+	case MADERA_DSP6_CONFIG_2:
+		grp = MADERA_DOM_GRP_DSP6;
+		break;
+	case MADERA_DSP7_CONFIG_1:
+	case MADERA_DSP7_CONFIG_2:
+		grp = MADERA_DOM_GRP_DSP7;
+		break;
+	case MADERA_AIF1_RATE_CTRL:
+		grp = MADERA_DOM_GRP_AIF1;
+		break;
+	case MADERA_AIF2_RATE_CTRL:
+		grp = MADERA_DOM_GRP_AIF2;
+		break;
+	case MADERA_AIF3_RATE_CTRL:
+		grp = MADERA_DOM_GRP_AIF3;
+		break;
+	case MADERA_AIF4_RATE_CTRL:
+		grp = MADERA_DOM_GRP_AIF4;
+		break;
+	case MADERA_SLIMBUS_RATES_1:
+	case MADERA_SLIMBUS_RATES_2:
+	case MADERA_SLIMBUS_RATES_3:
+	case MADERA_SLIMBUS_RATES_4:
+	case MADERA_SLIMBUS_RATES_5:
+	case MADERA_SLIMBUS_RATES_6:
+	case MADERA_SLIMBUS_RATES_7:
+	case MADERA_SLIMBUS_RATES_8:
+		grp = MADERA_DOM_GRP_SLIMBUS;
+		break;
+	case MADERA_PWM_DRIVE_1:
+		grp = MADERA_DOM_GRP_PWM;
+		break;
+	default:
+		return false;
+	}
+
+	count = priv->domain_group_ref[grp];
+
+	dev_dbg(priv->madera->dev, "Rate reg 0x%x group ref %d\n", reg, count);
+
+	return count == 0;
+}
+
+/*
+ * Get handler for the DSP rate controls: report the enum item matching the
+ * cached rate of the DSP selected by e->shift_l. The cache is read under
+ * rate_lock. Always returns 0.
+ */
+static int madera_adsp_rate_get(struct snd_kcontrol *kcontrol,
+				struct snd_ctl_elem_value *ucontrol)
+{
+	struct snd_soc_component *component =
+		snd_soc_kcontrol_component(kcontrol);
+	struct madera_priv *priv = snd_soc_component_get_drvdata(component);
+	struct soc_enum *e = (struct soc_enum *)kcontrol->private_value;
+	unsigned int rate;
+
+	mutex_lock(&priv->rate_lock);
+	rate = priv->adsp_rate_cache[e->shift_l];
+	mutex_unlock(&priv->rate_lock);
+
+	ucontrol->value.enumerated.item[0] = snd_soc_enum_val_to_item(e, rate);
+
+	return 0;
+}
+
+/*
+ * Put handler for the DSP rate controls. The new rate value is only stored
+ * in adsp_rate_cache for the DSP selected by e->shift_l; no register is
+ * written here.
+ *
+ * Returns -EINVAL for an out of range item, -EBUSY while the DSP's domain
+ * group is referenced, otherwise 0.
+ */
+static int madera_adsp_rate_put(struct snd_kcontrol *kcontrol,
+				struct snd_ctl_elem_value *ucontrol)
+{
+	struct snd_soc_component *component =
+		snd_soc_kcontrol_component(kcontrol);
+	struct madera_priv *priv = snd_soc_component_get_drvdata(component);
+	struct soc_enum *e = (struct soc_enum *)kcontrol->private_value;
+	const unsigned int item = ucontrol->value.enumerated.item[0];
+	const int adsp_num = e->shift_l;
+	int ret = 0;
+
+	if (item >= e->items)
+		return -EINVAL;
+
+	/*
+	 * We don't directly write the rate register here but we want to
+	 * maintain consistent behaviour that rate domains cannot be changed
+	 * while in use since this is a hardware requirement
+	 */
+	mutex_lock(&priv->rate_lock);
+
+	if (madera_can_change_grp_rate(priv, priv->adsp[adsp_num].base)) {
+		/* Volatile register so defer until the codec is powered up */
+		priv->adsp_rate_cache[adsp_num] = e->values[item];
+	} else {
+		dev_warn(priv->madera->dev,
+			 "Cannot change '%s' while in use by active audio paths\n",
+			 kcontrol->id.name);
+		ret = -EBUSY;
+	}
+
+	mutex_unlock(&priv->rate_lock);
+
+	return ret;
+}
+
+/*
+ * One rate enum per DSP. No register is attached (SND_SOC_NOPM); the second
+ * argument is instead read back by madera_adsp_rate_get()/put() through
+ * e->shift_l and used as the index of the DSP.
+ */
+static const struct soc_enum madera_adsp_rate_enum[] = {
+	SOC_VALUE_ENUM_SINGLE(SND_SOC_NOPM, 0, 0xf, MADERA_RATE_ENUM_SIZE,
+			      madera_rate_text, madera_rate_val),
+	SOC_VALUE_ENUM_SINGLE(SND_SOC_NOPM, 1, 0xf, MADERA_RATE_ENUM_SIZE,
+			      madera_rate_text, madera_rate_val),
+	SOC_VALUE_ENUM_SINGLE(SND_SOC_NOPM, 2, 0xf, MADERA_RATE_ENUM_SIZE,
+			      madera_rate_text, madera_rate_val),
+	SOC_VALUE_ENUM_SINGLE(SND_SOC_NOPM, 3, 0xf, MADERA_RATE_ENUM_SIZE,
+			      madera_rate_text, madera_rate_val),
+	SOC_VALUE_ENUM_SINGLE(SND_SOC_NOPM, 4, 0xf, MADERA_RATE_ENUM_SIZE,
+			      madera_rate_text, madera_rate_val),
+	SOC_VALUE_ENUM_SINGLE(SND_SOC_NOPM, 5, 0xf, MADERA_RATE_ENUM_SIZE,
+			      madera_rate_text, madera_rate_val),
+	SOC_VALUE_ENUM_SINGLE(SND_SOC_NOPM, 6, 0xf, MADERA_RATE_ENUM_SIZE,
+			      madera_rate_text, madera_rate_val),
+};
+
+/* "DSPn Rate" controls; control n uses enum entry n - 1 */
+const struct snd_kcontrol_new madera_adsp_rate_controls[] = {
+	SOC_ENUM_EXT("DSP1 Rate", madera_adsp_rate_enum[0],
+		     madera_adsp_rate_get, madera_adsp_rate_put),
+	SOC_ENUM_EXT("DSP2 Rate", madera_adsp_rate_enum[1],
+		     madera_adsp_rate_get, madera_adsp_rate_put),
+	SOC_ENUM_EXT("DSP3 Rate", madera_adsp_rate_enum[2],
+		     madera_adsp_rate_get, madera_adsp_rate_put),
+	SOC_ENUM_EXT("DSP4 Rate", madera_adsp_rate_enum[3],
+		     madera_adsp_rate_get, madera_adsp_rate_put),
+	SOC_ENUM_EXT("DSP5 Rate", madera_adsp_rate_enum[4],
+		     madera_adsp_rate_get, madera_adsp_rate_put),
+	SOC_ENUM_EXT("DSP6 Rate", madera_adsp_rate_enum[5],
+		     madera_adsp_rate_get, madera_adsp_rate_put),
+	SOC_ENUM_EXT("DSP7 Rate", madera_adsp_rate_enum[6],
+		     madera_adsp_rate_get, madera_adsp_rate_put),
+};
+EXPORT_SYMBOL_GPL(madera_adsp_rate_controls);
+
+/*
+ * Write the cached rate of @dsp, together with @freq, to the DSP clocking
+ * registers.
+ *
+ * On CS47L35, CS47L85 and WM1840 @freq goes into the clock select field of
+ * the CONFIG_1 register along with the rate. On all other types @freq is
+ * written to the CONFIG_2 register first and CONFIG_1 only takes the rate.
+ *
+ * Returns 0 on success or the regmap error, which is also logged.
+ */
+static int madera_write_adsp_clk_setting(struct madera_priv *priv,
+					 struct wm_adsp *dsp,
+					 unsigned int freq)
+{
+	unsigned int mask = MADERA_DSP_RATE_MASK;
+	unsigned int val =
+		priv->adsp_rate_cache[dsp->num - 1] << MADERA_DSP_RATE_SHIFT;
+	int ret;
+
+	switch (priv->madera->type) {
+	case CS47L35:
+	case CS47L85:
+	case WM1840:
+		/* use legacy frequency registers */
+		mask |= MADERA_DSP_CLK_SEL_MASK;
+		val |= (freq << MADERA_DSP_CLK_SEL_SHIFT);
+		ret = 0;
+		break;
+	default:
+		/* Configure exact dsp frequency */
+		dev_dbg(priv->madera->dev, "Set DSP frequency to 0x%x\n", freq);
+
+		ret = regmap_write(dsp->regmap,
+				   dsp->base + MADERA_DSP_CONFIG_2_OFFS, freq);
+		break;
+	}
+
+	/* Only touch CONFIG_1 if the frequency write (if any) succeeded */
+	if (!ret)
+		ret = regmap_update_bits(dsp->regmap,
+					 dsp->base + MADERA_DSP_CONFIG_1_OFFS,
+					 mask, val);
+
+	if (ret) {
+		dev_err(dsp->dev, "Failed to set DSP%d clock: %d\n",
+			dsp->num, ret);
+		return ret;
+	}
+
+	dev_dbg(priv->madera->dev, "Set DSP clocking to 0x%x\n", val);
+
+	return 0;
+}
+
+/*
+ * Apply the cached rate and @freq to the DSP at index @dsp_num.
+ *
+ * The rate field currently in the register at dsp->base is compared with
+ * the cached rate; if they differ the write is bracketed by
+ * madera_spin_sysclk() calls.
+ *
+ * Returns 0 on success or a regmap error.
+ */
+int madera_set_adsp_clk(struct madera_priv *priv, int dsp_num,
+			unsigned int freq)
+{
+	struct wm_adsp *dsp = &priv->adsp[dsp_num];
+	struct madera *madera = priv->madera;
+	unsigned int cur_rate, new_rate;
+	int ret;
+
+	/*
+	 * This is called at a higher DAPM priority than the mux widgets so
+	 * the muxes are still off at this point and it's safe to change
+	 * the rate domain control.
+	 * Also called at a lower DAPM priority than the domain group widgets
+	 * so locking the reads of adsp_rate_cache is not necessary as we know
+	 * changes are locked out by the domain_group_ref reference count.
+	 */
+
+	ret = regmap_read(dsp->regmap, dsp->base, &cur_rate);
+	if (ret) {
+		dev_err(madera->dev,
+			"Failed to read current DSP rate: %d\n", ret);
+		return ret;
+	}
+
+	cur_rate &= MADERA_DSP_RATE_MASK;
+	new_rate = priv->adsp_rate_cache[dsp->num - 1] << MADERA_DSP_RATE_SHIFT;
+
+	if (new_rate != cur_rate) {
+		dev_dbg(madera->dev, "DSP rate changed\n");
+
+		/* The write must be guarded by a number of SYSCLK cycles */
+		madera_spin_sysclk(priv);
+		ret = madera_write_adsp_clk_setting(priv, dsp, freq);
+		madera_spin_sysclk(priv);
+
+		return ret;
+	}
+
+	dev_dbg(madera->dev, "DSP rate not changed\n");
+
+	return madera_write_adsp_clk_setting(priv, dsp, freq);
+}
+EXPORT_SYMBOL_GPL(madera_set_adsp_clk);
+
+/*
+ * Put handler for rate enum controls.
+ *
+ * Nothing is written if the register already holds the requested value.
+ * Otherwise the change is refused with -EBUSY while the domain group of the
+ * register is referenced; if allowed, the generic enum put is bracketed by
+ * madera_spin_sysclk() calls.
+ *
+ * Returns -EINVAL for an out of range item, a negative error from reading
+ * the register, -EBUSY, 0 if unchanged, or the result of
+ * snd_soc_put_enum_double().
+ */
+int madera_rate_put(struct snd_kcontrol *kcontrol,
+		    struct snd_ctl_elem_value *ucontrol)
+{
+	struct snd_soc_component *component =
+		snd_soc_kcontrol_component(kcontrol);
+	struct madera_priv *priv = snd_soc_component_get_drvdata(component);
+	struct soc_enum *e = (struct soc_enum *)kcontrol->private_value;
+	const unsigned int item = ucontrol->value.enumerated.item[0];
+	unsigned int reg_val;
+	int ret;
+
+	if (item >= e->items)
+		return -EINVAL;
+
+	/*
+	 * Prevent the domain powering up while we're checking whether it's
+	 * safe to change rate domain
+	 */
+	mutex_lock(&priv->rate_lock);
+
+	ret = snd_soc_component_read(component, e->reg, &reg_val);
+	if (ret < 0) {
+		dev_warn(priv->madera->dev, "Failed to read 0x%x (%d)\n",
+			 e->reg, ret);
+	} else if (snd_soc_enum_item_to_val(e, item) ==
+		   ((reg_val >> e->shift_l) & e->mask)) {
+		/* Already at the requested rate */
+		ret = 0;
+	} else if (!madera_can_change_grp_rate(priv, e->reg)) {
+		dev_warn(priv->madera->dev,
+			 "Cannot change '%s' while in use by active audio paths\n",
+			 kcontrol->id.name);
+		ret = -EBUSY;
+	} else {
+		/* The write must be guarded by a number of SYSCLK cycles */
+		madera_spin_sysclk(priv);
+		ret = snd_soc_put_enum_double(kcontrol, ucontrol);
+		madera_spin_sysclk(priv);
+	}
+
+	mutex_unlock(&priv->rate_lock);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(madera_rate_put);
+
+/*
+ * Program the input mode registers from pdata.
+ *
+ * For every input up to the per-type DMIC limit the DMIC supply field is
+ * written from dmic_ref[]; for inputs below the per-type analogue limit the
+ * left and right single-ended bits are written from inmode[][0] and
+ * inmode[][1]. An input whose left or right inmode is neither DIFF nor SE
+ * is skipped entirely after a warning.
+ */
+static void madera_configure_input_mode(struct madera *madera)
+{
+	const struct madera_codec_pdata *pdata = &madera->pdata.codec;
+	unsigned int dmic_sup, se_left, se_right;
+	int n_analogue = 2;
+	int n_dmic, in;
+
+	switch (madera->type) {
+	case CS47L85:
+	case WM1840:
+		n_analogue = 3;
+		n_dmic = 3;
+		break;
+	case CS47L35:
+	case CS47L90:
+	case CS47L91:
+		n_dmic = 2;
+		break;
+	default:
+		n_dmic = 4;
+		break;
+	}
+
+	/*
+	 * Initialize input modes from the A settings. For muxed inputs the
+	 * B settings will be applied if the mux is changed
+	 */
+	for (in = 0; in < n_dmic; in++) {
+		dev_dbg(madera->dev, "IN%d mode %u:%u:%u:%u\n", in + 1,
+			pdata->inmode[in][0],
+			pdata->inmode[in][1],
+			pdata->inmode[in][2],
+			pdata->inmode[in][3]);
+
+		dmic_sup = pdata->dmic_ref[in] << MADERA_IN1_DMIC_SUP_SHIFT;
+
+		if (pdata->inmode[in][0] == MADERA_INMODE_DIFF) {
+			se_left = 0;
+		} else if (pdata->inmode[in][0] == MADERA_INMODE_SE) {
+			se_left = 1 << MADERA_IN1L_SRC_SE_SHIFT;
+		} else {
+			dev_warn(madera->dev,
+				 "IN%dAL Illegal inmode %u ignored\n",
+				 in + 1, pdata->inmode[in][0]);
+			continue;
+		}
+
+		if (pdata->inmode[in][1] == MADERA_INMODE_DIFF) {
+			se_right = 0;
+		} else if (pdata->inmode[in][1] == MADERA_INMODE_SE) {
+			se_right = 1 << MADERA_IN1R_SRC_SE_SHIFT;
+		} else {
+			dev_warn(madera->dev,
+				 "IN%dAR Illegal inmode %u ignored\n",
+				 in + 1, pdata->inmode[in][1]);
+			continue;
+		}
+
+		dev_dbg(madera->dev,
+			"IN%dA DMIC mode=0x%x Analogue mode=0x%x,0x%x\n",
+			in + 1, dmic_sup, se_left, se_right);
+
+		regmap_update_bits(madera->regmap,
+				   MADERA_IN1L_CONTROL + (in * 8),
+				   MADERA_IN1_DMIC_SUP_MASK, dmic_sup);
+
+		/* Inputs beyond the analogue limit only get the DMIC setup */
+		if (in < n_analogue) {
+			regmap_update_bits(madera->regmap,
+					   MADERA_ADC_DIGITAL_VOLUME_1L +
+					   (in * 8),
+					   MADERA_IN1L_SRC_SE_MASK, se_left);
+
+			regmap_update_bits(madera->regmap,
+					   MADERA_ADC_DIGITAL_VOLUME_1R +
+					   (in * 8),
+					   MADERA_IN1R_SRC_SE_MASK, se_right);
+		}
+	}
+}
+
+/*
+ * Exported wrapper that configures the input modes of the codec behind
+ * @component. Always returns 0.
+ */
+int madera_init_inputs(struct snd_soc_component *component)
+{
+	struct madera_priv *priv = snd_soc_component_get_drvdata(component);
+
+	madera_configure_input_mode(priv->madera);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(madera_init_inputs);
+
+/*
+ * Routes linking the right channel of each output to its left channel.
+ * madera_init_outputs() adds entry i when pdata out_mono[i] is set.
+ */
+static const struct snd_soc_dapm_route madera_mono_routes[] = {
+	{ "OUT1R", NULL, "OUT1L" },
+	{ "OUT2R", NULL, "OUT2L" },
+	{ "OUT3R", NULL, "OUT3L" },
+	{ "OUT4R", NULL, "OUT4L" },
+	{ "OUT5R", NULL, "OUT5L" },
+	{ "OUT6R", NULL, "OUT6L" },
+};
+
+/*
+ * Apply the pdata output configuration.
+ *
+ * For the first @n_mono_routes outputs (capped at MADERA_MAX_OUTPUT with a
+ * warning) the mono bit is written from out_mono[], and the matching mono
+ * route is added for outputs that are mono. For every PDM speaker the mute
+ * and format settings are written when the pdata value is non-zero.
+ *
+ * Register writes are not error checked. Always returns 0.
+ */
+int madera_init_outputs(struct snd_soc_component *component, int n_mono_routes)
+{
+	struct snd_soc_dapm_context *dapm =
+		snd_soc_component_get_dapm(component);
+	struct madera_priv *priv = snd_soc_component_get_drvdata(component);
+	struct madera *madera = priv->madera;
+	const struct madera_codec_pdata *pdata = &madera->pdata.codec;
+	unsigned int mono;
+	int out, spk;
+
+	if (n_mono_routes > MADERA_MAX_OUTPUT) {
+		dev_warn(madera->dev,
+			 "Requested %d mono outputs, using maximum allowed %d\n",
+			 n_mono_routes, MADERA_MAX_OUTPUT);
+		n_mono_routes = MADERA_MAX_OUTPUT;
+	}
+
+	for (out = 0; out < n_mono_routes; out++) {
+		/* Default is 0 so noop with defaults */
+		mono = 0;
+		if (pdata->out_mono[out]) {
+			mono = MADERA_OUT1_MONO;
+			snd_soc_dapm_add_routes(dapm,
+						&madera_mono_routes[out], 1);
+		}
+
+		regmap_update_bits(madera->regmap,
+				   MADERA_OUTPUT_PATH_CONFIG_1L + (out * 8),
+				   MADERA_OUT1_MONO, mono);
+
+		dev_dbg(madera->dev, "OUT%d mono=0x%x\n", out + 1, mono);
+	}
+
+	for (spk = 0; spk < MADERA_MAX_PDM_SPK; spk++) {
+		dev_dbg(madera->dev, "PDM%d fmt=0x%x mute=0x%x\n", spk + 1,
+			pdata->pdm_fmt[spk], pdata->pdm_mute[spk]);
+
+		if (pdata->pdm_mute[spk])
+			regmap_update_bits(madera->regmap,
+					   MADERA_PDM_SPK1_CTRL_1 + (spk * 2),
+					   MADERA_SPK1_MUTE_ENDIAN_MASK |
+					   MADERA_SPK1_MUTE_SEQ1_MASK,
+					   pdata->pdm_mute[spk]);
+
+		if (pdata->pdm_fmt[spk])
+			regmap_update_bits(madera->regmap,
+					   MADERA_PDM_SPK1_CTRL_2 + (spk * 2),
+					   MADERA_SPK1_FMT_MASK,
+					   pdata->pdm_fmt[spk]);
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(madera_init_outputs);
+
+/*
+ * Request the bus error interrupt of DSP @dsp_num, with @handler as the
+ * handler and the matching priv->adsp[] entry as its data pointer.
+ * Returns the result of madera_request_irq(); a failure is also logged.
+ */
+int madera_init_bus_error_irq(struct madera_priv *priv, int dsp_num,
+			      irq_handler_t handler)
+{
+	int err = madera_request_irq(priv->madera,
+				     madera_dsp_bus_error_irqs[dsp_num],
+				     "ADSP2 bus error", handler,
+				     &priv->adsp[dsp_num]);
+
+	if (err)
+		dev_err(priv->madera->dev,
+			"Failed to request DSP Lock region IRQ: %d\n", err);
+
+	return err;
+}
+EXPORT_SYMBOL_GPL(madera_init_bus_error_irq);
+
+/*
+ * Release the bus error interrupt of DSP @dsp_num, passing the same
+ * priv->adsp[] entry that madera_init_bus_error_irq() registered.
+ */
+void madera_free_bus_error_irq(struct madera_priv *priv, int dsp_num)
+{
+	madera_free_irq(priv->madera, madera_dsp_bus_error_irqs[dsp_num],
+			&priv->adsp[dsp_num]);
+}
+EXPORT_SYMBOL_GPL(madera_free_bus_error_irq);
+
+/*
+ * Names of the selectable mixer input sources. The inline comments in
+ * madera_mixer_values[] refer to these names, so the two tables are
+ * expected to stay the same length and in the same order.
+ */
+const char * const madera_mixer_texts[] = {
+	"None",
+	"Tone Generator 1",
+	"Tone Generator 2",
+	"Haptics",
+	"AEC1",
+	"AEC2",
+	"Mic Mute Mixer",
+	"Noise Generator",
+	"IN1L",
+	"IN1R",
+	"IN2L",
+	"IN2R",
+	"IN3L",
+	"IN3R",
+	"IN4L",
+	"IN4R",
+	"IN5L",
+	"IN5R",
+	"IN6L",
+	"IN6R",
+	"AIF1RX1",
+	"AIF1RX2",
+	"AIF1RX3",
+	"AIF1RX4",
+	"AIF1RX5",
+	"AIF1RX6",
+	"AIF1RX7",
+	"AIF1RX8",
+	"AIF2RX1",
+	"AIF2RX2",
+	"AIF2RX3",
+	"AIF2RX4",
+	"AIF2RX5",
+	"AIF2RX6",
+	"AIF2RX7",
+	"AIF2RX8",
+	"AIF3RX1",
+	"AIF3RX2",
+	"AIF3RX3",
+	"AIF3RX4",
+	"AIF4RX1",
+	"AIF4RX2",
+	"SLIMRX1",
+	"SLIMRX2",
+	"SLIMRX3",
+	"SLIMRX4",
+	"SLIMRX5",
+	"SLIMRX6",
+	"SLIMRX7",
+	"SLIMRX8",
+	"EQ1",
+	"EQ2",
+	"EQ3",
+	"EQ4",
+	"DRC1L",
+	"DRC1R",
+	"DRC2L",
+	"DRC2R",
+	"LHPF1",
+	"LHPF2",
+	"LHPF3",
+	"LHPF4",
+	"DSP1.1",
+	"DSP1.2",
+	"DSP1.3",
+	"DSP1.4",
+	"DSP1.5",
+	"DSP1.6",
+	"DSP2.1",
+	"DSP2.2",
+	"DSP2.3",
+	"DSP2.4",
+	"DSP2.5",
+	"DSP2.6",
+	"DSP3.1",
+	"DSP3.2",
+	"DSP3.3",
+	"DSP3.4",
+	"DSP3.5",
+	"DSP3.6",
+	"DSP4.1",
+	"DSP4.2",
+	"DSP4.3",
+	"DSP4.4",
+	"DSP4.5",
+	"DSP4.6",
+	"DSP5.1",
+	"DSP5.2",
+	"DSP5.3",
+	"DSP5.4",
+	"DSP5.5",
+	"DSP5.6",
+	"DSP6.1",
+	"DSP6.2",
+	"DSP6.3",
+	"DSP6.4",
+	"DSP6.5",
+	"DSP6.6",
+	"DSP7.1",
+	"DSP7.2",
+	"DSP7.3",
+	"DSP7.4",
+	"DSP7.5",
+	"DSP7.6",
+	"ASRC1IN1L",
+	"ASRC1IN1R",
+	"ASRC1IN2L",
+	"ASRC1IN2R",
+	"ASRC2IN1L",
+	"ASRC2IN1R",
+	"ASRC2IN2L",
+	"ASRC2IN2R",
+	"ISRC1INT1",
+	"ISRC1INT2",
+	"ISRC1INT3",
+	"ISRC1INT4",
+	"ISRC1DEC1",
+	"ISRC1DEC2",
+	"ISRC1DEC3",
+	"ISRC1DEC4",
+	"ISRC2INT1",
+	"ISRC2INT2",
+	"ISRC2INT3",
+	"ISRC2INT4",
+	"ISRC2DEC1",
+	"ISRC2DEC2",
+	"ISRC2DEC3",
+	"ISRC2DEC4",
+	"ISRC3INT1",
+	"ISRC3INT2",
+	"ISRC3INT3",
+	"ISRC3INT4",
+	"ISRC3DEC1",
+	"ISRC3DEC2",
+	"ISRC3DEC3",
+	"ISRC3DEC4",
+	"ISRC4INT1",
+	"ISRC4INT2",
+	"ISRC4DEC1",
+	"ISRC4DEC2",
+	"DFC1",
+	"DFC2",
+	"DFC3",
+	"DFC4",
+	"DFC5",
+	"DFC6",
+	"DFC7",
+	"DFC8",
+};
+EXPORT_SYMBOL_GPL(madera_mixer_texts);
+
+/*
+ * Source selector values for the mixer inputs. Each inline comment names
+ * the madera_mixer_texts[] entry found at the same index; only the first
+ * entry of each group is labelled.
+ */
+const unsigned int madera_mixer_values[] = {
+	0x00,	/* None */
+	0x04,	/* Tone Generator 1 */
+	0x05,	/* Tone Generator 2 */
+	0x06,	/* Haptics */
+	0x08,	/* AEC1 */
+	0x09,	/* AEC2 */
+	0x0c,	/* Mic Mute Mixer */
+	0x0d,	/* Noise Generator */
+	0x10,	/* IN1L */
+	0x11,
+	0x12,
+	0x13,
+	0x14,
+	0x15,
+	0x16,
+	0x17,
+	0x18,
+	0x19,
+	0x1A,
+	0x1B,
+	0x20,	/* AIF1RX1 */
+	0x21,
+	0x22,
+	0x23,
+	0x24,
+	0x25,
+	0x26,
+	0x27,
+	0x28,	/* AIF2RX1 */
+	0x29,
+	0x2a,
+	0x2b,
+	0x2c,
+	0x2d,
+	0x2e,
+	0x2f,
+	0x30,	/* AIF3RX1 */
+	0x31,
+	0x32,
+	0x33,
+	0x34,	/* AIF4RX1 */
+	0x35,
+	0x38,	/* SLIMRX1 */
+	0x39,
+	0x3a,
+	0x3b,
+	0x3c,
+	0x3d,
+	0x3e,
+	0x3f,
+	0x50,	/* EQ1 */
+	0x51,
+	0x52,
+	0x53,
+	0x58,	/* DRC1L */
+	0x59,
+	0x5a,
+	0x5b,
+	0x60,	/* LHPF1 */
+	0x61,
+	0x62,
+	0x63,
+	0x68,	/* DSP1.1 */
+	0x69,
+	0x6a,
+	0x6b,
+	0x6c,
+	0x6d,
+	0x70,	/* DSP2.1 */
+	0x71,
+	0x72,
+	0x73,
+	0x74,
+	0x75,
+	0x78,	/* DSP3.1 */
+	0x79,
+	0x7a,
+	0x7b,
+	0x7c,
+	0x7d,
+	0x80,	/* DSP4.1 */
+	0x81,
+	0x82,
+	0x83,
+	0x84,
+	0x85,
+	0x88,	/* DSP5.1 */
+	0x89,
+	0x8a,
+	0x8b,
+	0x8c,
+	0x8d,
+	0xc0,	/* DSP6.1 */
+	0xc1,
+	0xc2,
+	0xc3,
+	0xc4,
+	0xc5,
+	0xc8,	/* DSP7.1 */
+	0xc9,
+	0xca,
+	0xcb,
+	0xcc,
+	0xcd,
+	0x90,	/* ASRC1IN1L */
+	0x91,
+	0x92,
+	0x93,
+	0x94,	/* ASRC2IN1L */
+	0x95,
+	0x96,
+	0x97,
+	0xa0,	/* ISRC1INT1 */
+	0xa1,
+	0xa2,
+	0xa3,
+	0xa4,	/* ISRC1DEC1 */
+	0xa5,
+	0xa6,
+	0xa7,
+	0xa8,	/* ISRC2INT1 */
+	0xa9,
+	0xaa,
+	0xab,
+	0xac,	/* ISRC2DEC1 */
+	0xad,
+	0xae,
+	0xaf,
+	0xb0,	/* ISRC3INT1 */
+	0xb1,
+	0xb2,
+	0xb3,
+	0xb4,	/* ISRC3DEC1 */
+	0xb5,
+	0xb6,
+	0xb7,
+	0xb8,	/* ISRC4INT1 */
+	0xb9,
+	0xbc,	/* ISRC4DEC1 */
+	0xbd,
+	0xf8,	/* DFC1 */
+	0xf9,
+	0xfa,
+	0xfb,
+	0xfc,
+	0xfd,
+	0xfe,
+	0xff,	/* DFC8 */
+};
+EXPORT_SYMBOL_GPL(madera_mixer_values);
+
+/*
+ * TLV dB-scale descriptors, exported so that other modules can attach
+ * them to their volume controls.
+ */
+const DECLARE_TLV_DB_SCALE(madera_ana_tlv, 0, 100, 0);
+EXPORT_SYMBOL_GPL(madera_ana_tlv);
+
+const DECLARE_TLV_DB_SCALE(madera_eq_tlv, -1200, 100, 0);
+EXPORT_SYMBOL_GPL(madera_eq_tlv);
+
+const DECLARE_TLV_DB_SCALE(madera_digital_tlv, -6400, 50, 0);
+EXPORT_SYMBOL_GPL(madera_digital_tlv);
+
+const DECLARE_TLV_DB_SCALE(madera_noise_tlv, -13200, 600, 0);
+EXPORT_SYMBOL_GPL(madera_noise_tlv);
+
+const DECLARE_TLV_DB_SCALE(madera_ng_tlv, -12000, 600, 0);
+EXPORT_SYMBOL_GPL(madera_ng_tlv);
+
+const DECLARE_TLV_DB_SCALE(madera_mixer_tlv, -3200, 100, 0);
+EXPORT_SYMBOL_GPL(madera_mixer_tlv);
+
+/*
+ * Sample rate domain names and their register values, kept in matching
+ * order: the SYNCCLK rates first, then the ASYNCCLK rates. The ASRC rate
+ * enums select one half or the other by offsetting into these tables by
+ * MADERA_SYNC_RATE_ENUM_SIZE.
+ */
+const char * const madera_rate_text[MADERA_RATE_ENUM_SIZE] = {
+	"SYNCCLK rate 1", "SYNCCLK rate 2", "SYNCCLK rate 3",
+	"ASYNCCLK rate 1", "ASYNCCLK rate 2",
+};
+EXPORT_SYMBOL_GPL(madera_rate_text);
+
+const unsigned int madera_rate_val[MADERA_RATE_ENUM_SIZE] = {
+	0x0, 0x1, 0x2, 0x8, 0x9,
+};
+EXPORT_SYMBOL_GPL(madera_rate_val);
+
+/*
+ * DFC data width choices and the register value for each; the value is
+ * the bit width minus one.
+ */
+static const char * const madera_dfc_width_text[MADERA_DFC_WIDTH_ENUM_SIZE] = {
+	"8 bit", "16 bit", "20 bit", "24 bit", "32 bit",
+};
+
+static const unsigned int madera_dfc_width_val[MADERA_DFC_WIDTH_ENUM_SIZE] = {
+	7, 15, 19, 23, 31,
+};
+
+/* DFC data type choices and the register value for each (3 is not used) */
+static const char * const madera_dfc_type_text[MADERA_DFC_TYPE_ENUM_SIZE] = {
+	"Fixed", "Unsigned Fixed", "Single Precision Floating",
+	"Half Precision Floating", "Arm Alternative Floating",
+};
+
+static const unsigned int madera_dfc_type_val[MADERA_DFC_TYPE_ENUM_SIZE] = {
+	0, 1, 2, 4, 5,
+};
+
+/*
+ * Data width enums, two per DFC (RX then TX) for DFC1 through DFC8.
+ * Every entry uses the MADERA_DFC1_* shift and mask macros, whichever
+ * DFC register it targets.
+ * NOTE(review): presumably all DFCn RX/TX registers share the DFC1 field
+ * layout - confirm against the register map.
+ */
+const struct soc_enum madera_dfc_width[] = {
+	SOC_VALUE_ENUM_SINGLE(MADERA_DFC1_RX,
+			      MADERA_DFC1_RX_DATA_WIDTH_SHIFT,
+			      MADERA_DFC1_RX_DATA_WIDTH_MASK >>
+			      MADERA_DFC1_RX_DATA_WIDTH_SHIFT,
+			      ARRAY_SIZE(madera_dfc_width_text),
+			      madera_dfc_width_text,
+			      madera_dfc_width_val),
+	SOC_VALUE_ENUM_SINGLE(MADERA_DFC1_TX,
+			      MADERA_DFC1_TX_DATA_WIDTH_SHIFT,
+			      MADERA_DFC1_TX_DATA_WIDTH_MASK >>
+			      MADERA_DFC1_TX_DATA_WIDTH_SHIFT,
+			      ARRAY_SIZE(madera_dfc_width_text),
+			      madera_dfc_width_text,
+			      madera_dfc_width_val),
+	SOC_VALUE_ENUM_SINGLE(MADERA_DFC2_RX,
+			      MADERA_DFC1_RX_DATA_WIDTH_SHIFT,
+			      MADERA_DFC1_RX_DATA_WIDTH_MASK >>
+			      MADERA_DFC1_RX_DATA_WIDTH_SHIFT,
+			      ARRAY_SIZE(madera_dfc_width_text),
+			      madera_dfc_width_text,
+			      madera_dfc_width_val),
+	SOC_VALUE_ENUM_SINGLE(MADERA_DFC2_TX,
+			      MADERA_DFC1_TX_DATA_WIDTH_SHIFT,
+			      MADERA_DFC1_TX_DATA_WIDTH_MASK >>
+			      MADERA_DFC1_TX_DATA_WIDTH_SHIFT,
+			      ARRAY_SIZE(madera_dfc_width_text),
+			      madera_dfc_width_text,
+			      madera_dfc_width_val),
+	SOC_VALUE_ENUM_SINGLE(MADERA_DFC3_RX,
+			      MADERA_DFC1_RX_DATA_WIDTH_SHIFT,
+			      MADERA_DFC1_RX_DATA_WIDTH_MASK >>
+			      MADERA_DFC1_RX_DATA_WIDTH_SHIFT,
+			      ARRAY_SIZE(madera_dfc_width_text),
+			      madera_dfc_width_text,
+			      madera_dfc_width_val),
+	SOC_VALUE_ENUM_SINGLE(MADERA_DFC3_TX,
+			      MADERA_DFC1_TX_DATA_WIDTH_SHIFT,
+			      MADERA_DFC1_TX_DATA_WIDTH_MASK >>
+			      MADERA_DFC1_TX_DATA_WIDTH_SHIFT,
+			      ARRAY_SIZE(madera_dfc_width_text),
+			      madera_dfc_width_text,
+			      madera_dfc_width_val),
+	SOC_VALUE_ENUM_SINGLE(MADERA_DFC4_RX,
+			      MADERA_DFC1_RX_DATA_WIDTH_SHIFT,
+			      MADERA_DFC1_RX_DATA_WIDTH_MASK >>
+			      MADERA_DFC1_RX_DATA_WIDTH_SHIFT,
+			      ARRAY_SIZE(madera_dfc_width_text),
+			      madera_dfc_width_text,
+			      madera_dfc_width_val),
+	SOC_VALUE_ENUM_SINGLE(MADERA_DFC4_TX,
+			      MADERA_DFC1_TX_DATA_WIDTH_SHIFT,
+			      MADERA_DFC1_TX_DATA_WIDTH_MASK >>
+			      MADERA_DFC1_TX_DATA_WIDTH_SHIFT,
+			      ARRAY_SIZE(madera_dfc_width_text),
+			      madera_dfc_width_text,
+			      madera_dfc_width_val),
+	SOC_VALUE_ENUM_SINGLE(MADERA_DFC5_RX,
+			      MADERA_DFC1_RX_DATA_WIDTH_SHIFT,
+			      MADERA_DFC1_RX_DATA_WIDTH_MASK >>
+			      MADERA_DFC1_RX_DATA_WIDTH_SHIFT,
+			      ARRAY_SIZE(madera_dfc_width_text),
+			      madera_dfc_width_text,
+			      madera_dfc_width_val),
+	SOC_VALUE_ENUM_SINGLE(MADERA_DFC5_TX,
+			      MADERA_DFC1_TX_DATA_WIDTH_SHIFT,
+			      MADERA_DFC1_TX_DATA_WIDTH_MASK >>
+			      MADERA_DFC1_TX_DATA_WIDTH_SHIFT,
+			      ARRAY_SIZE(madera_dfc_width_text),
+			      madera_dfc_width_text,
+			      madera_dfc_width_val),
+	SOC_VALUE_ENUM_SINGLE(MADERA_DFC6_RX,
+			      MADERA_DFC1_RX_DATA_WIDTH_SHIFT,
+			      MADERA_DFC1_RX_DATA_WIDTH_MASK >>
+			      MADERA_DFC1_RX_DATA_WIDTH_SHIFT,
+			      ARRAY_SIZE(madera_dfc_width_text),
+			      madera_dfc_width_text,
+			      madera_dfc_width_val),
+	SOC_VALUE_ENUM_SINGLE(MADERA_DFC6_TX,
+			      MADERA_DFC1_TX_DATA_WIDTH_SHIFT,
+			      MADERA_DFC1_TX_DATA_WIDTH_MASK >>
+			      MADERA_DFC1_TX_DATA_WIDTH_SHIFT,
+			      ARRAY_SIZE(madera_dfc_width_text),
+			      madera_dfc_width_text,
+			      madera_dfc_width_val),
+	SOC_VALUE_ENUM_SINGLE(MADERA_DFC7_RX,
+			      MADERA_DFC1_RX_DATA_WIDTH_SHIFT,
+			      MADERA_DFC1_RX_DATA_WIDTH_MASK >>
+			      MADERA_DFC1_RX_DATA_WIDTH_SHIFT,
+			      ARRAY_SIZE(madera_dfc_width_text),
+			      madera_dfc_width_text,
+			      madera_dfc_width_val),
+	SOC_VALUE_ENUM_SINGLE(MADERA_DFC7_TX,
+			      MADERA_DFC1_TX_DATA_WIDTH_SHIFT,
+			      MADERA_DFC1_TX_DATA_WIDTH_MASK >>
+			      MADERA_DFC1_TX_DATA_WIDTH_SHIFT,
+			      ARRAY_SIZE(madera_dfc_width_text),
+			      madera_dfc_width_text,
+			      madera_dfc_width_val),
+	SOC_VALUE_ENUM_SINGLE(MADERA_DFC8_RX,
+			      MADERA_DFC1_RX_DATA_WIDTH_SHIFT,
+			      MADERA_DFC1_RX_DATA_WIDTH_MASK >>
+			      MADERA_DFC1_RX_DATA_WIDTH_SHIFT,
+			      ARRAY_SIZE(madera_dfc_width_text),
+			      madera_dfc_width_text,
+			      madera_dfc_width_val),
+	SOC_VALUE_ENUM_SINGLE(MADERA_DFC8_TX,
+			      MADERA_DFC1_TX_DATA_WIDTH_SHIFT,
+			      MADERA_DFC1_TX_DATA_WIDTH_MASK >>
+			      MADERA_DFC1_TX_DATA_WIDTH_SHIFT,
+			      ARRAY_SIZE(madera_dfc_width_text),
+			      madera_dfc_width_text,
+			      madera_dfc_width_val),
+};
+EXPORT_SYMBOL_GPL(madera_dfc_width);
+
+/*
+ * Data type enums, two per DFC (RX then TX) for DFC1 through DFC8.
+ * Every entry uses the MADERA_DFC1_* shift and mask macros, whichever
+ * DFC register it targets.
+ * NOTE(review): presumably all DFCn RX/TX registers share the DFC1 field
+ * layout - confirm against the register map.
+ */
+const struct soc_enum madera_dfc_type[] = {
+	SOC_VALUE_ENUM_SINGLE(MADERA_DFC1_RX,
+			      MADERA_DFC1_RX_DATA_TYPE_SHIFT,
+			      MADERA_DFC1_RX_DATA_TYPE_MASK >>
+			      MADERA_DFC1_RX_DATA_TYPE_SHIFT,
+			      ARRAY_SIZE(madera_dfc_type_text),
+			      madera_dfc_type_text,
+			      madera_dfc_type_val),
+	SOC_VALUE_ENUM_SINGLE(MADERA_DFC1_TX,
+			      MADERA_DFC1_TX_DATA_TYPE_SHIFT,
+			      MADERA_DFC1_TX_DATA_TYPE_MASK >>
+			      MADERA_DFC1_TX_DATA_TYPE_SHIFT,
+			      ARRAY_SIZE(madera_dfc_type_text),
+			      madera_dfc_type_text,
+			      madera_dfc_type_val),
+	SOC_VALUE_ENUM_SINGLE(MADERA_DFC2_RX,
+			      MADERA_DFC1_RX_DATA_TYPE_SHIFT,
+			      MADERA_DFC1_RX_DATA_TYPE_MASK >>
+			      MADERA_DFC1_RX_DATA_TYPE_SHIFT,
+			      ARRAY_SIZE(madera_dfc_type_text),
+			      madera_dfc_type_text,
+			      madera_dfc_type_val),
+	SOC_VALUE_ENUM_SINGLE(MADERA_DFC2_TX,
+			      MADERA_DFC1_TX_DATA_TYPE_SHIFT,
+			      MADERA_DFC1_TX_DATA_TYPE_MASK >>
+			      MADERA_DFC1_TX_DATA_TYPE_SHIFT,
+			      ARRAY_SIZE(madera_dfc_type_text),
+			      madera_dfc_type_text,
+			      madera_dfc_type_val),
+	SOC_VALUE_ENUM_SINGLE(MADERA_DFC3_RX,
+			      MADERA_DFC1_RX_DATA_TYPE_SHIFT,
+			      MADERA_DFC1_RX_DATA_TYPE_MASK >>
+			      MADERA_DFC1_RX_DATA_TYPE_SHIFT,
+			      ARRAY_SIZE(madera_dfc_type_text),
+			      madera_dfc_type_text,
+			      madera_dfc_type_val),
+	SOC_VALUE_ENUM_SINGLE(MADERA_DFC3_TX,
+			      MADERA_DFC1_TX_DATA_TYPE_SHIFT,
+			      MADERA_DFC1_TX_DATA_TYPE_MASK >>
+			      MADERA_DFC1_TX_DATA_TYPE_SHIFT,
+			      ARRAY_SIZE(madera_dfc_type_text),
+			      madera_dfc_type_text,
+			      madera_dfc_type_val),
+	SOC_VALUE_ENUM_SINGLE(MADERA_DFC4_RX,
+			      MADERA_DFC1_RX_DATA_TYPE_SHIFT,
+			      MADERA_DFC1_RX_DATA_TYPE_MASK >>
+			      MADERA_DFC1_RX_DATA_TYPE_SHIFT,
+			      ARRAY_SIZE(madera_dfc_type_text),
+			      madera_dfc_type_text,
+			      madera_dfc_type_val),
+	SOC_VALUE_ENUM_SINGLE(MADERA_DFC4_TX,
+			      MADERA_DFC1_TX_DATA_TYPE_SHIFT,
+			      MADERA_DFC1_TX_DATA_TYPE_MASK >>
+			      MADERA_DFC1_TX_DATA_TYPE_SHIFT,
+			      ARRAY_SIZE(madera_dfc_type_text),
+			      madera_dfc_type_text,
+			      madera_dfc_type_val),
+	SOC_VALUE_ENUM_SINGLE(MADERA_DFC5_RX,
+			      MADERA_DFC1_RX_DATA_TYPE_SHIFT,
+			      MADERA_DFC1_RX_DATA_TYPE_MASK >>
+			      MADERA_DFC1_RX_DATA_TYPE_SHIFT,
+			      ARRAY_SIZE(madera_dfc_type_text),
+			      madera_dfc_type_text,
+			      madera_dfc_type_val),
+	SOC_VALUE_ENUM_SINGLE(MADERA_DFC5_TX,
+			      MADERA_DFC1_TX_DATA_TYPE_SHIFT,
+			      MADERA_DFC1_TX_DATA_TYPE_MASK >>
+			      MADERA_DFC1_TX_DATA_TYPE_SHIFT,
+			      ARRAY_SIZE(madera_dfc_type_text),
+			      madera_dfc_type_text,
+			      madera_dfc_type_val),
+	SOC_VALUE_ENUM_SINGLE(MADERA_DFC6_RX,
+			      MADERA_DFC1_RX_DATA_TYPE_SHIFT,
+			      MADERA_DFC1_RX_DATA_TYPE_MASK >>
+			      MADERA_DFC1_RX_DATA_TYPE_SHIFT,
+			      ARRAY_SIZE(madera_dfc_type_text),
+			      madera_dfc_type_text,
+			      madera_dfc_type_val),
+	SOC_VALUE_ENUM_SINGLE(MADERA_DFC6_TX,
+			      MADERA_DFC1_TX_DATA_TYPE_SHIFT,
+			      MADERA_DFC1_TX_DATA_TYPE_MASK >>
+			      MADERA_DFC1_TX_DATA_TYPE_SHIFT,
+			      ARRAY_SIZE(madera_dfc_type_text),
+			      madera_dfc_type_text,
+			      madera_dfc_type_val),
+	SOC_VALUE_ENUM_SINGLE(MADERA_DFC7_RX,
+			      MADERA_DFC1_RX_DATA_TYPE_SHIFT,
+			      MADERA_DFC1_RX_DATA_TYPE_MASK >>
+			      MADERA_DFC1_RX_DATA_TYPE_SHIFT,
+			      ARRAY_SIZE(madera_dfc_type_text),
+			      madera_dfc_type_text,
+			      madera_dfc_type_val),
+	SOC_VALUE_ENUM_SINGLE(MADERA_DFC7_TX,
+			      MADERA_DFC1_TX_DATA_TYPE_SHIFT,
+			      MADERA_DFC1_TX_DATA_TYPE_MASK >>
+			      MADERA_DFC1_TX_DATA_TYPE_SHIFT,
+			      ARRAY_SIZE(madera_dfc_type_text),
+			      madera_dfc_type_text,
+			      madera_dfc_type_val),
+	SOC_VALUE_ENUM_SINGLE(MADERA_DFC8_RX,
+			      MADERA_DFC1_RX_DATA_TYPE_SHIFT,
+			      MADERA_DFC1_RX_DATA_TYPE_MASK >>
+			      MADERA_DFC1_RX_DATA_TYPE_SHIFT,
+			      ARRAY_SIZE(madera_dfc_type_text),
+			      madera_dfc_type_text,
+			      madera_dfc_type_val),
+	SOC_VALUE_ENUM_SINGLE(MADERA_DFC8_TX,
+			      MADERA_DFC1_TX_DATA_TYPE_SHIFT,
+			      MADERA_DFC1_TX_DATA_TYPE_MASK >>
+			      MADERA_DFC1_TX_DATA_TYPE_SHIFT,
+			      ARRAY_SIZE(madera_dfc_type_text),
+			      madera_dfc_type_text,
+			      madera_dfc_type_val),
+};
+EXPORT_SYMBOL_GPL(madera_dfc_type);
+
+/*
+ * FSH rate selection for ISRC1 through ISRC4, one enum per ISRC in its
+ * CTRL_1 register. Each offers the full madera_rate_text/val table.
+ */
+const struct soc_enum madera_isrc_fsh[] = {
+	SOC_VALUE_ENUM_SINGLE(MADERA_ISRC_1_CTRL_1,
+			      MADERA_ISRC1_FSH_SHIFT, 0xf,
+			      MADERA_RATE_ENUM_SIZE,
+			      madera_rate_text, madera_rate_val),
+	SOC_VALUE_ENUM_SINGLE(MADERA_ISRC_2_CTRL_1,
+			      MADERA_ISRC2_FSH_SHIFT, 0xf,
+			      MADERA_RATE_ENUM_SIZE,
+			      madera_rate_text, madera_rate_val),
+	SOC_VALUE_ENUM_SINGLE(MADERA_ISRC_3_CTRL_1,
+			      MADERA_ISRC3_FSH_SHIFT, 0xf,
+			      MADERA_RATE_ENUM_SIZE,
+			      madera_rate_text, madera_rate_val),
+	SOC_VALUE_ENUM_SINGLE(MADERA_ISRC_4_CTRL_1,
+			      MADERA_ISRC4_FSH_SHIFT, 0xf,
+			      MADERA_RATE_ENUM_SIZE,
+			      madera_rate_text, madera_rate_val),
+
+};
+EXPORT_SYMBOL_GPL(madera_isrc_fsh);
+
+/*
+ * FSL rate selection for ISRC1 through ISRC4, one enum per ISRC in its
+ * CTRL_2 register. Each offers the full madera_rate_text/val table.
+ */
+const struct soc_enum madera_isrc_fsl[] = {
+	SOC_VALUE_ENUM_SINGLE(MADERA_ISRC_1_CTRL_2,
+			      MADERA_ISRC1_FSL_SHIFT, 0xf,
+			      MADERA_RATE_ENUM_SIZE,
+			      madera_rate_text, madera_rate_val),
+	SOC_VALUE_ENUM_SINGLE(MADERA_ISRC_2_CTRL_2,
+			      MADERA_ISRC2_FSL_SHIFT, 0xf,
+			      MADERA_RATE_ENUM_SIZE,
+			      madera_rate_text, madera_rate_val),
+	SOC_VALUE_ENUM_SINGLE(MADERA_ISRC_3_CTRL_2,
+			      MADERA_ISRC3_FSL_SHIFT, 0xf,
+			      MADERA_RATE_ENUM_SIZE,
+			      madera_rate_text, madera_rate_val),
+	SOC_VALUE_ENUM_SINGLE(MADERA_ISRC_4_CTRL_2,
+			      MADERA_ISRC4_FSL_SHIFT, 0xf,
+			      MADERA_RATE_ENUM_SIZE,
+			      madera_rate_text, madera_rate_val),
+
+};
+EXPORT_SYMBOL_GPL(madera_isrc_fsl);
+
+/*
+ * ASRC1 rate selectors. Entry 0 drives MADERA_ASRC1_RATE1 and offers the
+ * first MADERA_SYNC_RATE_ENUM_SIZE entries of madera_rate_text/val; entry 1
+ * drives MADERA_ASRC1_RATE2 and offers the entries after them. Each entry
+ * uses the shift macro of its own register, as madera_asrc2_rate[] does.
+ */
+const struct soc_enum madera_asrc1_rate[] = {
+	SOC_VALUE_ENUM_SINGLE(MADERA_ASRC1_RATE1,
+			      MADERA_ASRC1_RATE1_SHIFT, 0xf,
+			      MADERA_SYNC_RATE_ENUM_SIZE,
+			      madera_rate_text, madera_rate_val),
+	SOC_VALUE_ENUM_SINGLE(MADERA_ASRC1_RATE2,
+			      MADERA_ASRC1_RATE2_SHIFT, 0xf,
+			      MADERA_ASYNC_RATE_ENUM_SIZE,
+			      madera_rate_text + MADERA_SYNC_RATE_ENUM_SIZE,
+			      madera_rate_val + MADERA_SYNC_RATE_ENUM_SIZE),
+
+};
+EXPORT_SYMBOL_GPL(madera_asrc1_rate);
+
+/*
+ * ASRC2 rate selectors. Entry 0 drives MADERA_ASRC2_RATE1 and offers the
+ * first MADERA_SYNC_RATE_ENUM_SIZE entries of madera_rate_text/val; entry 1
+ * drives MADERA_ASRC2_RATE2 and offers the entries after them.
+ */
+const struct soc_enum madera_asrc2_rate[] = {
+	SOC_VALUE_ENUM_SINGLE(MADERA_ASRC2_RATE1,
+			      MADERA_ASRC2_RATE1_SHIFT, 0xf,
+			      MADERA_SYNC_RATE_ENUM_SIZE,
+			      madera_rate_text, madera_rate_val),
+	SOC_VALUE_ENUM_SINGLE(MADERA_ASRC2_RATE2,
+			      MADERA_ASRC2_RATE2_SHIFT, 0xf,
+			      MADERA_ASYNC_RATE_ENUM_SIZE,
+			      madera_rate_text + MADERA_SYNC_RATE_ENUM_SIZE,
+			      madera_rate_val + MADERA_SYNC_RATE_ENUM_SIZE),
+
+};
+EXPORT_SYMBOL_GPL(madera_asrc2_rate);
+
+/*
+ * Volume ramp rate choices, shared by the four ramp enums below. The
+ * VD and VI fields exist once for the inputs and once for the outputs.
+ */
+static const char * const madera_vol_ramp_text[] = {
+	"0ms/6dB", "0.5ms/6dB", "1ms/6dB", "2ms/6dB", "4ms/6dB", "8ms/6dB",
+	"15ms/6dB", "30ms/6dB",
+};
+
+SOC_ENUM_SINGLE_DECL(madera_in_vd_ramp,
+		     MADERA_INPUT_VOLUME_RAMP,
+		     MADERA_IN_VD_RAMP_SHIFT,
+		     madera_vol_ramp_text);
+EXPORT_SYMBOL_GPL(madera_in_vd_ramp);
+
+SOC_ENUM_SINGLE_DECL(madera_in_vi_ramp,
+		     MADERA_INPUT_VOLUME_RAMP,
+		     MADERA_IN_VI_RAMP_SHIFT,
+		     madera_vol_ramp_text);
+EXPORT_SYMBOL_GPL(madera_in_vi_ramp);
+
+SOC_ENUM_SINGLE_DECL(madera_out_vd_ramp,
+		     MADERA_OUTPUT_VOLUME_RAMP,
+		     MADERA_OUT_VD_RAMP_SHIFT,
+		     madera_vol_ramp_text);
+EXPORT_SYMBOL_GPL(madera_out_vd_ramp);
+
+SOC_ENUM_SINGLE_DECL(madera_out_vi_ramp,
+		     MADERA_OUTPUT_VOLUME_RAMP,
+		     MADERA_OUT_VI_RAMP_SHIFT,
+		     madera_vol_ramp_text);
+EXPORT_SYMBOL_GPL(madera_out_vi_ramp);
+
+/* Filter mode choice shared by the four LHPF mode enums below */
+static const char * const madera_lhpf_mode_text[] = {
+	"Low-pass", "High-pass"
+};
+
+SOC_ENUM_SINGLE_DECL(madera_lhpf1_mode,
+		     MADERA_HPLPF1_1,
+		     MADERA_LHPF1_MODE_SHIFT,
+		     madera_lhpf_mode_text);
+EXPORT_SYMBOL_GPL(madera_lhpf1_mode);
+
+SOC_ENUM_SINGLE_DECL(madera_lhpf2_mode,
+		     MADERA_HPLPF2_1,
+		     MADERA_LHPF2_MODE_SHIFT,
+		     madera_lhpf_mode_text);
+EXPORT_SYMBOL_GPL(madera_lhpf2_mode);
+
+SOC_ENUM_SINGLE_DECL(madera_lhpf3_mode,
+		     MADERA_HPLPF3_1,
+		     MADERA_LHPF3_MODE_SHIFT,
+		     madera_lhpf_mode_text);
+EXPORT_SYMBOL_GPL(madera_lhpf3_mode);
+
+SOC_ENUM_SINGLE_DECL(madera_lhpf4_mode,
+		     MADERA_HPLPF4_1,
+		     MADERA_LHPF4_MODE_SHIFT,
+		     madera_lhpf_mode_text);
+EXPORT_SYMBOL_GPL(madera_lhpf4_mode);
+
+/* Hold time choices for the NGATE_HOLD field of the noise gate control */
+static const char * const madera_ng_hold_text[] = {
+	"30ms", "120ms", "250ms", "500ms",
+};
+
+SOC_ENUM_SINGLE_DECL(madera_ng_hold,
+		     MADERA_NOISE_GATE_CONTROL,
+		     MADERA_NGATE_HOLD_SHIFT,
+		     madera_ng_hold_text);
+EXPORT_SYMBOL_GPL(madera_ng_hold);
+
+/* Cut-off choices for the IN_HPF_CUT field of the HPF control register */
+static const char * const madera_in_hpf_cut_text[] = {
+	"2.5Hz", "5Hz", "10Hz", "20Hz", "40Hz"
+};
+
+SOC_ENUM_SINGLE_DECL(madera_in_hpf_cut_enum,
+		     MADERA_HPF_CONTROL,
+		     MADERA_IN_HPF_CUT_SHIFT,
+		     madera_in_hpf_cut_text);
+EXPORT_SYMBOL_GPL(madera_in_hpf_cut_enum);
+
+/*
+ * Digital microphone OSR choices and the register value for each. The
+ * register values start at 2, so they are not the same as the index.
+ */
+static const char * const madera_in_dmic_osr_text[MADERA_OSR_ENUM_SIZE] = {
+	"384kHz", "768kHz", "1.536MHz", "3.072MHz", "6.144MHz",
+};
+
+static const unsigned int madera_in_dmic_osr_val[MADERA_OSR_ENUM_SIZE] = {
+	2, 3, 4, 5, 6,
+};
+
+/* One OSR enum per input IN1 through IN6, each in that input's DMICnL register */
+const struct soc_enum madera_in_dmic_osr[] = {
+	SOC_VALUE_ENUM_SINGLE(MADERA_DMIC1L_CONTROL, MADERA_IN1_OSR_SHIFT,
+			      0x7, MADERA_OSR_ENUM_SIZE,
+			      madera_in_dmic_osr_text, madera_in_dmic_osr_val),
+	SOC_VALUE_ENUM_SINGLE(MADERA_DMIC2L_CONTROL, MADERA_IN2_OSR_SHIFT,
+			      0x7, MADERA_OSR_ENUM_SIZE,
+			      madera_in_dmic_osr_text, madera_in_dmic_osr_val),
+	SOC_VALUE_ENUM_SINGLE(MADERA_DMIC3L_CONTROL, MADERA_IN3_OSR_SHIFT,
+			      0x7, MADERA_OSR_ENUM_SIZE,
+			      madera_in_dmic_osr_text, madera_in_dmic_osr_val),
+	SOC_VALUE_ENUM_SINGLE(MADERA_DMIC4L_CONTROL, MADERA_IN4_OSR_SHIFT,
+			      0x7, MADERA_OSR_ENUM_SIZE,
+			      madera_in_dmic_osr_text, madera_in_dmic_osr_val),
+	SOC_VALUE_ENUM_SINGLE(MADERA_DMIC5L_CONTROL, MADERA_IN5_OSR_SHIFT,
+			      0x7, MADERA_OSR_ENUM_SIZE,
+			      madera_in_dmic_osr_text, madera_in_dmic_osr_val),
+	SOC_VALUE_ENUM_SINGLE(MADERA_DMIC6L_CONTROL, MADERA_IN6_OSR_SHIFT,
+			      0x7, MADERA_OSR_ENUM_SIZE,
+			      madera_in_dmic_osr_text, madera_in_dmic_osr_val),
+};
+EXPORT_SYMBOL_GPL(madera_in_dmic_osr);
+
+/* Input choices for the RXANCL/RXANCR select fields */
+static const char * const madera_anc_input_src_text[] = {
+	"None", "IN1", "IN2", "IN3", "IN4", "IN5", "IN6",
+};
+
+/* Channel choices for the FCL/FCR mic mode select fields */
+static const char * const madera_anc_channel_src_text[] = {
+	"None", "Left", "Right", "Combine",
+};
+
+/*
+ * ANC source enums in the order: RXANCL input, FCL channel, RXANCR
+ * input, FCR channel.
+ */
+const struct soc_enum madera_anc_input_src[] = {
+	SOC_ENUM_SINGLE(MADERA_ANC_SRC,
+			MADERA_IN_RXANCL_SEL_SHIFT,
+			ARRAY_SIZE(madera_anc_input_src_text),
+			madera_anc_input_src_text),
+	SOC_ENUM_SINGLE(MADERA_FCL_ADC_REFORMATTER_CONTROL,
+			MADERA_FCL_MIC_MODE_SEL_SHIFT,
+			ARRAY_SIZE(madera_anc_channel_src_text),
+			madera_anc_channel_src_text),
+	SOC_ENUM_SINGLE(MADERA_ANC_SRC,
+			MADERA_IN_RXANCR_SEL_SHIFT,
+			ARRAY_SIZE(madera_anc_input_src_text),
+			madera_anc_input_src_text),
+	SOC_ENUM_SINGLE(MADERA_FCR_ADC_REFORMATTER_CONTROL,
+			MADERA_FCR_MIC_MODE_SEL_SHIFT,
+			ARRAY_SIZE(madera_anc_channel_src_text),
+			madera_anc_channel_src_text),
+};
+EXPORT_SYMBOL_GPL(madera_anc_input_src);
+
+/*
+ * ANC noise gate source choice. The enum is declared on SND_SOC_NOPM
+ * rather than on a codec register.
+ */
+static const char * const madera_anc_ng_texts[] = {
+	"None", "Internal", "External",
+};
+
+SOC_ENUM_SINGLE_DECL(madera_anc_ng_enum, SND_SOC_NOPM, 0, madera_anc_ng_texts);
+EXPORT_SYMBOL_GPL(madera_anc_ng_enum);
+
+/* ANC source choices offered on every output channel */
+static const char * const madera_out_anc_src_text[] = {
+	"None", "RXANCL", "RXANCR",
+};
+
+/*
+ * One ANC source enum per output channel, ordered OUT1L, OUT1R, ...,
+ * OUT6L, OUT6R, each in that channel's output path config register.
+ */
+const struct soc_enum madera_output_anc_src[] = {
+	SOC_ENUM_SINGLE(MADERA_OUTPUT_PATH_CONFIG_1L,
+			MADERA_OUT1L_ANC_SRC_SHIFT,
+			ARRAY_SIZE(madera_out_anc_src_text),
+			madera_out_anc_src_text),
+	SOC_ENUM_SINGLE(MADERA_OUTPUT_PATH_CONFIG_1R,
+			MADERA_OUT1R_ANC_SRC_SHIFT,
+			ARRAY_SIZE(madera_out_anc_src_text),
+			madera_out_anc_src_text),
+	SOC_ENUM_SINGLE(MADERA_OUTPUT_PATH_CONFIG_2L,
+			MADERA_OUT2L_ANC_SRC_SHIFT,
+			ARRAY_SIZE(madera_out_anc_src_text),
+			madera_out_anc_src_text),
+	SOC_ENUM_SINGLE(MADERA_OUTPUT_PATH_CONFIG_2R,
+			MADERA_OUT2R_ANC_SRC_SHIFT,
+			ARRAY_SIZE(madera_out_anc_src_text),
+			madera_out_anc_src_text),
+	SOC_ENUM_SINGLE(MADERA_OUTPUT_PATH_CONFIG_3L,
+			MADERA_OUT3L_ANC_SRC_SHIFT,
+			ARRAY_SIZE(madera_out_anc_src_text),
+			madera_out_anc_src_text),
+	SOC_ENUM_SINGLE(MADERA_OUTPUT_PATH_CONFIG_3R,
+			MADERA_OUT3R_ANC_SRC_SHIFT,
+			ARRAY_SIZE(madera_out_anc_src_text),
+			madera_out_anc_src_text),
+	SOC_ENUM_SINGLE(MADERA_OUTPUT_PATH_CONFIG_4L,
+			MADERA_OUT4L_ANC_SRC_SHIFT,
+			ARRAY_SIZE(madera_out_anc_src_text),
+			madera_out_anc_src_text),
+	SOC_ENUM_SINGLE(MADERA_OUTPUT_PATH_CONFIG_4R,
+			MADERA_OUT4R_ANC_SRC_SHIFT,
+			ARRAY_SIZE(madera_out_anc_src_text),
+			madera_out_anc_src_text),
+	SOC_ENUM_SINGLE(MADERA_OUTPUT_PATH_CONFIG_5L,
+			MADERA_OUT5L_ANC_SRC_SHIFT,
+			ARRAY_SIZE(madera_out_anc_src_text),
+			madera_out_anc_src_text),
+	SOC_ENUM_SINGLE(MADERA_OUTPUT_PATH_CONFIG_5R,
+			MADERA_OUT5R_ANC_SRC_SHIFT,
+			ARRAY_SIZE(madera_out_anc_src_text),
+			madera_out_anc_src_text),
+	SOC_ENUM_SINGLE(MADERA_OUTPUT_PATH_CONFIG_6L,
+			MADERA_OUT6L_ANC_SRC_SHIFT,
+			ARRAY_SIZE(madera_out_anc_src_text),
+			madera_out_anc_src_text),
+	SOC_ENUM_SINGLE(MADERA_OUTPUT_PATH_CONFIG_6R,
+			MADERA_OUT6R_ANC_SRC_SHIFT,
+			ARRAY_SIZE(madera_out_anc_src_text),
+			madera_out_anc_src_text),
+};
+EXPORT_SYMBOL_GPL(madera_output_anc_src);
+
+/*
+ * Put handler for enum controls whose private_value is a struct soc_enum
+ * on a DFC register. The new value is refused with -EBUSY while
+ * MADERA_DFC1_ENA is set in the register derived below; otherwise the
+ * write is passed on to snd_soc_put_enum_double(). The check and the
+ * write are both done with the DAPM mutex held.
+ *
+ * Returns the result of snd_soc_put_enum_double(), -EBUSY, or the error
+ * from reading the register.
+ */
+int madera_dfc_put(struct snd_kcontrol *kcontrol,
+		   struct snd_ctl_elem_value *ucontrol)
+{
+	struct snd_soc_component *component =
+		snd_soc_kcontrol_component(kcontrol);
+	struct snd_soc_dapm_context *dapm =
+		snd_soc_component_get_dapm(component);
+	struct soc_enum *e = (struct soc_enum *)kcontrol->private_value;
+	unsigned int reg = e->reg;
+	unsigned int val;
+	int ret = 0;
+
+	/*
+	 * Round the enum's register down to a multiple of 6, then step
+	 * back by 2.
+	 * NOTE(review): presumably this lands on the control register of
+	 * the DFC that owns e->reg - confirm against the register map.
+	 */
+	reg = ((reg / 6) * 6) - 2;
+
+	snd_soc_dapm_mutex_lock(dapm);
+
+	ret = snd_soc_component_read(component, reg, &val);
+	if (ret)
+		goto exit;
+
+	if (val & MADERA_DFC1_ENA) {
+		ret = -EBUSY;
+		dev_err(component->dev, "Can't change mode on an active DFC\n");
+		goto exit;
+	}
+
+	ret = snd_soc_put_enum_double(kcontrol, ucontrol);
+exit:
+	snd_soc_dapm_mutex_unlock(dapm);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(madera_dfc_put);
+
+/*
+ * Put handler for the input low-power mode switches. The write is passed
+ * on to snd_soc_put_volsw() only while the input that owns the control's
+ * register is disabled in MADERA_INPUT_ENABLES; otherwise -EBUSY is
+ * returned. The check and the write are both done with the DAPM mutex held.
+ */
+int madera_lp_mode_put(struct snd_kcontrol *kcontrol,
+		       struct snd_ctl_elem_value *ucontrol)
+{
+	struct snd_soc_component *component =
+		snd_soc_kcontrol_component(kcontrol);
+	struct snd_soc_dapm_context *dapm =
+		snd_soc_component_get_dapm(component);
+	struct soc_mixer_control *mc =
+		(struct soc_mixer_control *)kcontrol->private_value;
+	unsigned int enables, ena_bit;
+	int ret;
+
+	/*
+	 * Volume registers are 4 apart; the bottom bit is flipped because
+	 * the enable bits use the opposite channel order.
+	 */
+	ena_bit = ((mc->reg - MADERA_ADC_DIGITAL_VOLUME_1L) / 4) ^ 0x1;
+
+	snd_soc_dapm_mutex_lock(dapm);
+
+	/* Cannot change lp mode on an active input */
+	ret = snd_soc_component_read(component, MADERA_INPUT_ENABLES,
+				     &enables);
+	if (ret)
+		goto unlock;
+
+	if (enables & (1 << ena_bit)) {
+		dev_err(component->dev,
+			"Can't change lp mode on an active input\n");
+		ret = -EBUSY;
+		goto unlock;
+	}
+
+	ret = snd_soc_put_volsw(kcontrol, ucontrol);
+
+unlock:
+	snd_soc_dapm_mutex_unlock(dapm);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(madera_lp_mode_put);
+
+/*
+ * Seven identical on/off "Switch" controls for the DSP trigger outputs.
+ * They are declared on SND_SOC_NOPM rather than on a codec register.
+ */
+const struct snd_kcontrol_new madera_dsp_trigger_output_mux[] = {
+	SOC_DAPM_SINGLE("Switch", SND_SOC_NOPM, 0, 1, 0),
+	SOC_DAPM_SINGLE("Switch", SND_SOC_NOPM, 0, 1, 0),
+	SOC_DAPM_SINGLE("Switch", SND_SOC_NOPM, 0, 1, 0),
+	SOC_DAPM_SINGLE("Switch", SND_SOC_NOPM, 0, 1, 0),
+	SOC_DAPM_SINGLE("Switch", SND_SOC_NOPM, 0, 1, 0),
+	SOC_DAPM_SINGLE("Switch", SND_SOC_NOPM, 0, 1, 0),
+	SOC_DAPM_SINGLE("Switch", SND_SOC_NOPM, 0, 1, 0),
+};
+EXPORT_SYMBOL_GPL(madera_dsp_trigger_output_mux);
+
+/*
+ * Two identical on/off "Switch" controls for the DRC activity outputs,
+ * declared on SND_SOC_NOPM rather than on a codec register.
+ */
+const struct snd_kcontrol_new madera_drc_activity_output_mux[] = {
+	SOC_DAPM_SINGLE("Switch", SND_SOC_NOPM, 0, 1, 0),
+	SOC_DAPM_SINGLE("Switch", SND_SOC_NOPM, 0, 1, 0),
+};
+EXPORT_SYMBOL_GPL(madera_drc_activity_output_mux);
+
+/*
+ * Set (@enable true) or clear the MADERA_IN_VU bit in the digital volume
+ * register of each of the priv->num_inputs inputs. A failed write is
+ * logged and the remaining registers are still written.
+ */
+static void madera_in_set_vu(struct madera_priv *priv, bool enable)
+{
+	const unsigned int vu = enable ? MADERA_IN_VU : 0;
+	int ch;
+
+	for (ch = 0; ch < priv->num_inputs; ch++) {
+		int err;
+
+		err = regmap_update_bits(priv->madera->regmap,
+					 MADERA_ADC_DIGITAL_VOLUME_1L + (ch * 4),
+					 MADERA_IN_VU, vu);
+		if (err)
+			dev_warn(priv->madera->dev,
+				 "Failed to modify VU bits: %d\n", err);
+	}
+}
+
+/*
+ * DAPM event handler for the input widgets.
+ *
+ * PRE_PMU counts the input in priv->in_pending. POST_PMU takes it out of
+ * the count and unmutes it; once the count is back to zero, VU is set on
+ * all inputs after a short sleep. PRE_PMD sets the input's mute and VU
+ * bits. POST_PMD clears VU on all inputs if MADERA_INPUT_ENABLES reads
+ * back as zero.
+ *
+ * The results of the register updates are not checked; always returns 0.
+ */
+int madera_in_ev(struct snd_soc_dapm_widget *w, struct snd_kcontrol *kcontrol,
+		 int event)
+{
+	struct snd_soc_component *component = snd_soc_dapm_to_component(w->dapm);
+	struct madera_priv *priv = snd_soc_component_get_drvdata(component);
+	unsigned int reg, val;
+	int ret;
+
+	/*
+	 * An odd shift selects the left volume register of input pair
+	 * shift / 2 and an even shift the right one; pairs are 8 apart.
+	 */
+	if (w->shift % 2)
+		reg = MADERA_ADC_DIGITAL_VOLUME_1L + ((w->shift / 2) * 8);
+	else
+		reg = MADERA_ADC_DIGITAL_VOLUME_1R + ((w->shift / 2) * 8);
+
+	switch (event) {
+	case SND_SOC_DAPM_PRE_PMU:
+		priv->in_pending++;
+		break;
+	case SND_SOC_DAPM_POST_PMU:
+		priv->in_pending--;
+		snd_soc_component_update_bits(component, reg,
+					      MADERA_IN1L_MUTE, 0);
+
+		/* If this is the last input pending then allow VU */
+		if (priv->in_pending == 0) {
+			usleep_range(1000, 3000);
+			madera_in_set_vu(priv, true);
+		}
+		break;
+	case SND_SOC_DAPM_PRE_PMD:
+		snd_soc_component_update_bits(component, reg,
+					      MADERA_IN1L_MUTE | MADERA_IN_VU,
+					      MADERA_IN1L_MUTE | MADERA_IN_VU);
+		break;
+	case SND_SOC_DAPM_POST_PMD:
+		/* Disable volume updates if no inputs are enabled */
+		ret = snd_soc_component_read(component, MADERA_INPUT_ENABLES,
+					     &val);
+		if (!ret && !val)
+			madera_in_set_vu(priv, false);
+		break;
+	default:
+		break;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(madera_in_ev);
+
+/* True for the OUT1L..OUT3R enable shifts, the only ones madera_out_ev() acts on */
+static bool madera_out_ev_has_delay(unsigned int shift)
+{
+	switch (shift) {
+	case MADERA_OUT1L_ENA_SHIFT:
+	case MADERA_OUT1R_ENA_SHIFT:
+	case MADERA_OUT2L_ENA_SHIFT:
+	case MADERA_OUT2R_ENA_SHIFT:
+	case MADERA_OUT3L_ENA_SHIFT:
+	case MADERA_OUT3R_ENA_SHIFT:
+		return true;
+	default:
+		return false;
+	}
+}
+
+/*
+ * DAPM event handler for the output widgets.
+ *
+ * For OUT1..OUT3 the PRE events count the output as pending and add to
+ * the delay total; the POST events take it out of the count and, when
+ * none is left pending, sleep for the total and reset it to zero.
+ * Power-up adds 6ms per output on CS47L90/CS47L91 and 17ms on other
+ * parts; power-down adds 1ms per output. Events for any other output do
+ * nothing. Always returns 0.
+ */
+int madera_out_ev(struct snd_soc_dapm_widget *w,
+		  struct snd_kcontrol *kcontrol, int event)
+{
+	struct snd_soc_component *component = snd_soc_dapm_to_component(w->dapm);
+	struct madera_priv *priv = snd_soc_component_get_drvdata(component);
+	int up_ms;
+
+	if (!madera_out_ev_has_delay(w->shift))
+		return 0;
+
+	if (priv->madera->type == CS47L90 || priv->madera->type == CS47L91)
+		up_ms = 6;
+	else
+		up_ms = 17;
+
+	switch (event) {
+	case SND_SOC_DAPM_PRE_PMU:
+		priv->out_up_pending++;
+		priv->out_up_delay += up_ms;
+		break;
+	case SND_SOC_DAPM_POST_PMU:
+		if (--priv->out_up_pending == 0) {
+			msleep(priv->out_up_delay);
+			priv->out_up_delay = 0;
+		}
+		break;
+	case SND_SOC_DAPM_PRE_PMD:
+		priv->out_down_pending++;
+		priv->out_down_delay++;
+		break;
+	case SND_SOC_DAPM_POST_PMD:
+		if (--priv->out_down_pending == 0) {
+			msleep(priv->out_down_delay);
+			priv->out_down_delay = 0;
+		}
+		break;
+	default:
+		break;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(madera_out_ev);
+
+/*
+ * DAPM event handler for the headphone output widgets.
+ *
+ * POST_PMU and PRE_PMD record the wanted enable state of the output in
+ * madera->hp_ena and then write the output's bit in
+ * MADERA_OUTPUT_ENABLES_1. The bit is forced to 0 unless the EP_SEL field
+ * is set, if the output's out_clamp entry is clear or its out_shorted
+ * entry is set. Every handled event ends by calling madera_out_ev();
+ * PRE_PMU and POST_PMD do nothing else.
+ *
+ * Returns the result of madera_out_ev(), or 0 for an event that is not
+ * handled.
+ */
+int madera_hp_ev(struct snd_soc_dapm_widget *w,
+		 struct snd_kcontrol *kcontrol, int event)
+{
+	struct snd_soc_component *component = snd_soc_dapm_to_component(w->dapm);
+	struct madera_priv *priv = snd_soc_component_get_drvdata(component);
+	struct madera *madera = priv->madera;
+	unsigned int mask = 1 << w->shift;
+	/* Left and right of one output share an out_clamp/out_shorted entry */
+	unsigned int out_num = w->shift / 2;
+	unsigned int val;
+	unsigned int ep_sel = 0;
+
+	switch (event) {
+	case SND_SOC_DAPM_POST_PMU:
+		val = mask;
+		break;
+	case SND_SOC_DAPM_PRE_PMD:
+		val = 0;
+		break;
+	case SND_SOC_DAPM_PRE_PMU:
+	case SND_SOC_DAPM_POST_PMD:
+		return madera_out_ev(w, kcontrol, event);
+	default:
+		return 0;
+	}
+
+	/* Store the desired state for the HP outputs */
+	madera->hp_ena &= ~mask;
+	madera->hp_ena |= val;
+
+	/* if OUT1 is routed to EPOUT, ignore HP clamp and impedance */
+	/* The read's return value is not checked; ep_sel starts out as 0 */
+	regmap_read(madera->regmap, MADERA_OUTPUT_ENABLES_1, &ep_sel);
+	ep_sel &= MADERA_EP_SEL_MASK;
+
+	/* Force off if HPDET has disabled the clamp for this output */
+	if (!ep_sel &&
+	    (!madera->out_clamp[out_num] || madera->out_shorted[out_num]))
+		val = 0;
+
+	regmap_update_bits(madera->regmap, MADERA_OUTPUT_ENABLES_1, mask, val);
+
+	return madera_out_ev(w, kcontrol, event);
+}
+EXPORT_SYMBOL_GPL(madera_hp_ev);
+
+int madera_anc_ev(struct snd_soc_dapm_widget *w, struct snd_kcontrol *kcontrol, /* DAPM event handler: writes one bit to MADERA_CLOCK_CONTROL */
+		  int event)
+{
+	struct snd_soc_component *component = snd_soc_dapm_to_component(w->dapm);
+	unsigned int val;
+
+	switch (event) {
+	case SND_SOC_DAPM_POST_PMU:
+		val = 1 << w->shift; /* bit at w->shift after power-up */
+		break;
+	case SND_SOC_DAPM_PRE_PMD:
+		val = 1 << (w->shift + 1); /* the next bit up before power-down */
+		break;
+	default:
+		return 0; /* all other events are ignored */
+	}
+
+	snd_soc_component_write(component, MADERA_CLOCK_CONTROL, val); /* full write, not update_bits: all other bits are written as 0 */
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(madera_anc_ev);
+
+static const unsigned int madera_opclk_ref_48k_rates[] = { /* OPCLK reference rates used by madera_set_opclk() when refclk is a multiple of 4000; the array index is OR'ed into the register value */
+	6144000,
+	12288000,
+	24576000,
+	49152000,
+};
+
+static const unsigned int madera_opclk_ref_44k1_rates[] = { /* OPCLK reference rates used by madera_set_opclk() when refclk is not a multiple of 4000; must stay the same length as the 48k table */
+	5644800,
+	11289600,
+	22579200,
+	45158400,
+};
+
+static int madera_set_opclk(struct snd_soc_component *component, /* find a reference rate and even divider that give exactly freq, and program them; 0 on success, -EINVAL otherwise */
+			    unsigned int clk, unsigned int freq)
+{
+	struct madera_priv *priv = snd_soc_component_get_drvdata(component);
+	unsigned int mask = MADERA_OPCLK_DIV_MASK | MADERA_OPCLK_SEL_MASK;
+	unsigned int reg, val;
+	const unsigned int *rates;
+	int ref, div, refclk;
+
+	BUILD_BUG_ON(ARRAY_SIZE(madera_opclk_ref_48k_rates) != /* the loop below uses the 48k table size for both tables */
+		     ARRAY_SIZE(madera_opclk_ref_44k1_rates));
+
+	switch (clk) {
+	case MADERA_CLK_OPCLK:
+		reg = MADERA_OUTPUT_SYSTEM_CLOCK;
+		refclk = priv->sysclk; /* derived from the stored SYSCLK rate */
+		break;
+	case MADERA_CLK_ASYNC_OPCLK:
+		reg = MADERA_OUTPUT_ASYNC_CLOCK;
+		refclk = priv->asyncclk; /* derived from the stored ASYNCCLK rate */
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	if (refclk % 4000) /* not a multiple of 4000: use the 44.1k-family table */
+		rates = madera_opclk_ref_44k1_rates;
+	else
+		rates = madera_opclk_ref_48k_rates;
+
+	for (ref = 0; ref < ARRAY_SIZE(madera_opclk_ref_48k_rates); ++ref) {
+		if (rates[ref] > refclk) /* skip reference rates above the current source clock */
+			continue;
+
+		div = 2;
+		while ((rates[ref] / div >= freq) && (div <= 30)) { /* only even dividers 2..30 are tried */
+			if (rates[ref] / div == freq) {
+				dev_dbg(component->dev, "Configured %dHz OPCLK\n",
+					freq);
+
+				val = (div << MADERA_OPCLK_DIV_SHIFT) | ref; /* divider field plus table index as the select value */
+
+				snd_soc_component_update_bits(component, reg,
+							      mask, val);
+				return 0;
+			}
+			div += 2;
+		}
+	}
+
+	dev_err(component->dev, "Unable to generate %dHz OPCLK\n", freq);
+
+	return -EINVAL;
+}
+
+static int madera_get_sysclk_setting(unsigned int freq) /* map a clock rate in Hz to its already-shifted SYSCLK_FREQ field value, or -EINVAL */
+{
+	switch (freq) {
+	case 0:
+	case 5644800:
+	case 6144000:
+		return 0; /* field value 0 for these rates and for "no clock" */
+	case 11289600:
+	case 12288000:
+		return MADERA_SYSCLK_12MHZ << MADERA_SYSCLK_FREQ_SHIFT;
+	case 22579200:
+	case 24576000:
+		return MADERA_SYSCLK_24MHZ << MADERA_SYSCLK_FREQ_SHIFT;
+	case 45158400:
+	case 49152000:
+		return MADERA_SYSCLK_49MHZ << MADERA_SYSCLK_FREQ_SHIFT;
+	case 90316800:
+	case 98304000:
+		return MADERA_SYSCLK_98MHZ << MADERA_SYSCLK_FREQ_SHIFT;
+	default:
+		return -EINVAL; /* any other rate is rejected */
+	}
+}
+
+static int madera_get_legacy_dspclk_setting(struct madera *madera, /* map a DSPCLK rate to a shifted frequency-select value, or -EINVAL */
+					    unsigned int freq)
+{
+	switch (freq) {
+	case 0:
+		return 0;
+	case 45158400:
+	case 49152000:
+		switch (madera->type) {
+		case CS47L85:
+		case WM1840:
+			if (madera->rev < 3) /* these rates are rejected on revisions below 3 */
+				return -EINVAL;
+			else
+				return MADERA_SYSCLK_49MHZ <<
+				       MADERA_SYSCLK_FREQ_SHIFT;
+		default:
+			return -EINVAL; /* other device types do not accept these rates here */
+		}
+	case 135475200:
+	case 147456000:
+		return MADERA_DSPCLK_147MHZ << MADERA_DSP_CLK_FREQ_LEGACY_SHIFT;
+	default:
+		return -EINVAL;
+	}
+}
+
+static int madera_get_dspclk_setting(struct madera *madera, /* returns the frequency-select value (or -EINVAL) and sets *clock_2_val to the MADERA_DSP_CLOCK_2 value, 0 meaning unused */
+				     unsigned int freq,
+				     unsigned int *clock_2_val)
+{
+	switch (madera->type) {
+	case CS47L35:
+	case CS47L85:
+	case WM1840:
+		*clock_2_val = 0; /* don't use MADERA_DSP_CLOCK_2 */
+		return madera_get_legacy_dspclk_setting(madera, freq);
+	default:
+		if (freq > 150000000) /* upper bound accepted for the exact-frequency control */
+			return -EINVAL;
+
+		/* Use new exact frequency control */
+		*clock_2_val = freq / 15625; /* freq * (2^6) / (10^6) */
+		return 0;
+	}
+}
+
+int madera_set_sysclk(struct snd_soc_component *component, int clk_id, /* configure SYSCLK, ASYNCCLK, DSPCLK or an OPCLK; returns 0 or a negative errno */
+		      int source, unsigned int freq, int dir)
+{
+	struct madera_priv *priv = snd_soc_component_get_drvdata(component);
+	struct madera *madera = priv->madera;
+	char *name;
+	unsigned int reg, clock_2_val = 0;
+	unsigned int mask = MADERA_SYSCLK_FREQ_MASK | MADERA_SYSCLK_SRC_MASK;
+	unsigned int val = source << MADERA_SYSCLK_SRC_SHIFT;
+	int clk_freq_sel, *clk;
+	int ret = 0;
+
+	switch (clk_id) {
+	case MADERA_CLK_SYSCLK_1:
+		name = "SYSCLK";
+		reg = MADERA_SYSTEM_CLOCK_1;
+		clk = &priv->sysclk;
+		clk_freq_sel = madera_get_sysclk_setting(freq);
+		mask |= MADERA_SYSCLK_FRAC; /* only this clock has the FRAC bit included in the update mask */
+		break;
+	case MADERA_CLK_ASYNCCLK_1:
+		name = "ASYNCCLK";
+		reg = MADERA_ASYNC_CLOCK_1;
+		clk = &priv->asyncclk;
+		clk_freq_sel = madera_get_sysclk_setting(freq);
+		break;
+	case MADERA_CLK_DSPCLK:
+		name = "DSPCLK";
+		reg = MADERA_DSP_CLOCK_1;
+		clk = &priv->dspclk;
+		clk_freq_sel = madera_get_dspclk_setting(madera, freq,
+							 &clock_2_val);
+		break;
+	case MADERA_CLK_OPCLK:
+	case MADERA_CLK_ASYNC_OPCLK:
+		return madera_set_opclk(component, clk_id, freq); /* output clocks are handled separately; source and dir are unused */
+	default:
+		return -EINVAL;
+	}
+
+	if (clk_freq_sel < 0) {
+		dev_err(madera->dev,
+			"Failed to get clk setting for %dHZ\n", freq);
+		return clk_freq_sel;
+	}
+
+	*clk = freq; /* cached before any register write, so it is kept even if a later write fails */
+
+	if (freq == 0) {
+		dev_dbg(madera->dev, "%s cleared\n", name);
+		return 0; /* no register is touched when clearing */
+	}
+
+	val |= clk_freq_sel;
+
+	if (clock_2_val) {
+		ret = regmap_write(madera->regmap, MADERA_DSP_CLOCK_2,
+				   clock_2_val);
+		if (ret) {
+			dev_err(madera->dev,
+				"Failed to write DSP_CONFIG2: %d\n", ret);
+			return ret;
+		}
+
+		/*
+		 * We're using the frequency setting in MADERA_DSP_CLOCK_2 so
+		 * don't change the frequency select bits in MADERA_DSP_CLOCK_1
+		 */
+		mask = MADERA_SYSCLK_SRC_MASK;
+	}
+
+	if (freq % 6144000) /* set for every clock id, but only applied where mask includes FRAC */
+		val |= MADERA_SYSCLK_FRAC;
+
+	dev_dbg(madera->dev, "%s set to %uHz\n", name, freq);
+
+	return regmap_update_bits(madera->regmap, reg, mask, val);
+}
+EXPORT_SYMBOL_GPL(madera_set_sysclk);
+
+static int madera_set_fmt(struct snd_soc_dai *dai, unsigned int fmt) /* DAI set_fmt op: programs frame format, clock master and clock inversion bits for this AIF */
+{
+	struct snd_soc_component *component = dai->component;
+	struct madera_priv *priv = snd_soc_component_get_drvdata(component);
+	struct madera *madera = priv->madera;
+	int lrclk, bclk, mode, base;
+
+	base = dai->driver->base; /* register base of this AIF */
+
+	lrclk = 0;
+	bclk = 0;
+
+	switch (fmt & SND_SOC_DAIFMT_FORMAT_MASK) {
+	case SND_SOC_DAIFMT_DSP_A:
+		mode = MADERA_FMT_DSP_MODE_A;
+		break;
+	case SND_SOC_DAIFMT_DSP_B:
+		if ((fmt & SND_SOC_DAIFMT_MASTER_MASK) != /* DSP_B is only accepted together with CBM_CFM */
+		    SND_SOC_DAIFMT_CBM_CFM) {
+			madera_aif_err(dai, "DSP_B not valid in slave mode\n");
+			return -EINVAL;
+		}
+		mode = MADERA_FMT_DSP_MODE_B;
+		break;
+	case SND_SOC_DAIFMT_I2S:
+		mode = MADERA_FMT_I2S_MODE;
+		break;
+	case SND_SOC_DAIFMT_LEFT_J:
+		if ((fmt & SND_SOC_DAIFMT_MASTER_MASK) != /* LEFT_J is only accepted together with CBM_CFM */
+		    SND_SOC_DAIFMT_CBM_CFM) {
+			madera_aif_err(dai, "LEFT_J not valid in slave mode\n");
+			return -EINVAL;
+		}
+		mode = MADERA_FMT_LEFT_JUSTIFIED_MODE;
+		break;
+	default:
+		madera_aif_err(dai, "Unsupported DAI format %d\n",
+			       fmt & SND_SOC_DAIFMT_FORMAT_MASK);
+		return -EINVAL;
+	}
+
+	switch (fmt & SND_SOC_DAIFMT_MASTER_MASK) {
+	case SND_SOC_DAIFMT_CBS_CFS:
+		break; /* neither master bit set */
+	case SND_SOC_DAIFMT_CBS_CFM:
+		lrclk |= MADERA_AIF1TX_LRCLK_MSTR;
+		break;
+	case SND_SOC_DAIFMT_CBM_CFS:
+		bclk |= MADERA_AIF1_BCLK_MSTR;
+		break;
+	case SND_SOC_DAIFMT_CBM_CFM:
+		bclk |= MADERA_AIF1_BCLK_MSTR;
+		lrclk |= MADERA_AIF1TX_LRCLK_MSTR;
+		break;
+	default:
+		madera_aif_err(dai, "Unsupported master mode %d\n",
+			       fmt & SND_SOC_DAIFMT_MASTER_MASK);
+		return -EINVAL;
+	}
+
+	switch (fmt & SND_SOC_DAIFMT_INV_MASK) {
+	case SND_SOC_DAIFMT_NB_NF:
+		break; /* no inversion bits set */
+	case SND_SOC_DAIFMT_IB_IF:
+		bclk |= MADERA_AIF1_BCLK_INV;
+		lrclk |= MADERA_AIF1TX_LRCLK_INV;
+		break;
+	case SND_SOC_DAIFMT_IB_NF:
+		bclk |= MADERA_AIF1_BCLK_INV;
+		break;
+	case SND_SOC_DAIFMT_NB_IF:
+		lrclk |= MADERA_AIF1TX_LRCLK_INV;
+		break;
+	default:
+		madera_aif_err(dai, "Unsupported invert mode %d\n",
+			       fmt & SND_SOC_DAIFMT_INV_MASK);
+		return -EINVAL;
+	}
+
+	regmap_update_bits(madera->regmap, base + MADERA_AIF_BCLK_CTRL, /* no register is written until all three fields have validated */
+			   MADERA_AIF1_BCLK_INV | MADERA_AIF1_BCLK_MSTR,
+			   bclk);
+	regmap_update_bits(madera->regmap, base + MADERA_AIF_TX_PIN_CTRL,
+			   MADERA_AIF1TX_LRCLK_INV | MADERA_AIF1TX_LRCLK_MSTR,
+			   lrclk);
+	regmap_update_bits(madera->regmap, base + MADERA_AIF_RX_PIN_CTRL, /* the value built from the TX bit macros is reused for RX */
+			   MADERA_AIF1RX_LRCLK_INV | MADERA_AIF1RX_LRCLK_MSTR,
+			   lrclk);
+	regmap_update_bits(madera->regmap, base + MADERA_AIF_FORMAT,
+			   MADERA_AIF1_FMT_MASK, mode);
+
+	return 0; /* regmap return values are not checked */
+}
+
+static const int madera_48k_bclk_rates[] = { /* BCLK rates in Hz searched by madera_hw_params() when rate is a multiple of 4000; the index is written to the BCLK_FREQ field */
+	-1, /* index 0 can never match: the search needs rates[i] >= bclk_target */
+	48000,
+	64000,
+	96000,
+	128000,
+	192000,
+	256000,
+	384000,
+	512000,
+	768000,
+	1024000,
+	1536000,
+	2048000,
+	3072000,
+	4096000,
+	6144000,
+	8192000,
+	12288000,
+	24576000,
+};
+
+static const int madera_44k1_bclk_rates[] = { /* BCLK rates in Hz searched by madera_hw_params() when rate is not a multiple of 4000; the index is written to the BCLK_FREQ field */
+	-1, /* index 0 can never match: the search needs rates[i] >= bclk_target */
+	44100,
+	58800,
+	88200,
+	117600,
+	176400, /* 4 * 44100; each entry is the matching madera_48k_bclk_rates entry scaled by 441/480 */
+	235200,
+	352800,
+	470400,
+	705600,
+	940800,
+	1411200,
+	1881600,
+	2822400,
+	3763200,
+	5644800,
+	7526400,
+	11289600,
+	22579200,
+};
+
+static const unsigned int madera_sr_vals[] = { /* sample rates in Hz; the index of a rate is the value madera_hw_params_rate() writes to the sample-rate register */
+	0,
+	12000,
+	24000,
+	48000,
+	96000,
+	192000,
+	384000,
+	768000,
+	0,
+	11025,
+	22050,
+	44100,
+	88200,
+	176400,
+	352800,
+	705600,
+	4000,
+	8000,
+	16000,
+	32000,
+	64000,
+	128000,
+	256000,
+	512000,
+};
+
+#define MADERA_192K_48K_RATE_MASK	0x0F003E /* bits 1-5 and 16-19; used as constraint.mask in madera_startup() */
+#define MADERA_192K_44K1_RATE_MASK	0x003E00 /* bits 9-13; used as constraint.mask in madera_startup() */
+#define MADERA_192K_RATE_MASK		(MADERA_192K_48K_RATE_MASK | \
+					 MADERA_192K_44K1_RATE_MASK)
+
+static const struct snd_pcm_hw_constraint_list madera_constraint = { /* template copied into each DAI by madera_init_dai(); the mask is filled in by madera_startup() */
+	.count	= ARRAY_SIZE(madera_sr_vals),
+	.list	= madera_sr_vals,
+};
+
+static int madera_startup(struct snd_pcm_substream *substream, /* DAI startup op: constrains the stream rate according to the clock the DAI is attached to */
+			  struct snd_soc_dai *dai)
+{
+	struct snd_soc_component *component = dai->component;
+	struct madera_priv *priv = snd_soc_component_get_drvdata(component);
+	struct madera_dai_priv *dai_priv = &priv->dai[dai->id - 1]; /* dai[] is indexed by id - 1 here */
+	unsigned int base_rate;
+
+	if (!substream->runtime)
+		return 0; /* nothing to constrain without a runtime */
+
+	switch (dai_priv->clk) {
+	case MADERA_CLK_SYSCLK_1:
+	case MADERA_CLK_SYSCLK_2:
+	case MADERA_CLK_SYSCLK_3:
+		base_rate = priv->sysclk;
+		break;
+	case MADERA_CLK_ASYNCCLK_1:
+	case MADERA_CLK_ASYNCCLK_2:
+		base_rate = priv->asyncclk;
+		break;
+	default:
+		return 0; /* unknown clock: no constraint applied */
+	}
+
+	if (base_rate == 0) /* clock rate not set yet: allow both rate families */
+		dai_priv->constraint.mask = MADERA_192K_RATE_MASK;
+	else if (base_rate % 4000) /* not a multiple of 4000: 44.1k family only */
+		dai_priv->constraint.mask = MADERA_192K_44K1_RATE_MASK;
+	else
+		dai_priv->constraint.mask = MADERA_192K_48K_RATE_MASK;
+
+	return snd_pcm_hw_constraint_list(substream->runtime, 0,
+					  SNDRV_PCM_HW_PARAM_RATE,
+					  &dai_priv->constraint);
+}
+
+static int madera_hw_params_rate(struct snd_pcm_substream *substream, /* program the sample-rate register of the DAI's clock and, for DAIs with a register base, select that rate in AIF_RATE_CTRL */
+				 struct snd_pcm_hw_params *params,
+				 struct snd_soc_dai *dai)
+{
+	struct snd_soc_component *component = dai->component;
+	struct madera_priv *priv = snd_soc_component_get_drvdata(component);
+	struct madera_dai_priv *dai_priv = &priv->dai[dai->id - 1];
+	int base = dai->driver->base;
+	int i, sr_val;
+	unsigned int reg, cur, tar;
+	int ret;
+
+	for (i = 0; i < ARRAY_SIZE(madera_sr_vals); i++) /* the table index is the register encoding of the rate */
+		if (madera_sr_vals[i] == params_rate(params))
+			break;
+
+	if (i == ARRAY_SIZE(madera_sr_vals)) {
+		madera_aif_err(dai, "Unsupported sample rate %dHz\n",
+			       params_rate(params));
+		return -EINVAL;
+	}
+	sr_val = i;
+
+	switch (dai_priv->clk) { /* reg: sample-rate register to program; tar: matching AIF rate-select value */
+	case MADERA_CLK_SYSCLK_1:
+		reg = MADERA_SAMPLE_RATE_1;
+		tar = 0 << MADERA_AIF1_RATE_SHIFT;
+		break;
+	case MADERA_CLK_SYSCLK_2:
+		reg = MADERA_SAMPLE_RATE_2;
+		tar = 1 << MADERA_AIF1_RATE_SHIFT;
+		break;
+	case MADERA_CLK_SYSCLK_3:
+		reg = MADERA_SAMPLE_RATE_3;
+		tar = 2 << MADERA_AIF1_RATE_SHIFT;
+		break;
+	case MADERA_CLK_ASYNCCLK_1:
+		reg = MADERA_ASYNC_SAMPLE_RATE_1;
+		tar = 8 << MADERA_AIF1_RATE_SHIFT;
+		break;
+	case MADERA_CLK_ASYNCCLK_2:
+		reg = MADERA_ASYNC_SAMPLE_RATE_2;
+		tar = 9 << MADERA_AIF1_RATE_SHIFT;
+		break;
+	default:
+		madera_aif_err(dai, "Invalid clock %d\n", dai_priv->clk);
+		return -EINVAL;
+	}
+
+	snd_soc_component_update_bits(component, reg, MADERA_SAMPLE_RATE_1_MASK,
+				      sr_val);
+
+	if (!base) /* DAIs without a register base have no AIF_RATE_CTRL to update */
+		return 0;
+
+	ret = regmap_read(priv->madera->regmap,
+			  base + MADERA_AIF_RATE_CTRL, &cur);
+	if (ret != 0) {
+		madera_aif_err(dai, "Failed to check rate: %d\n", ret);
+		return ret;
+	}
+
+	if ((cur & MADERA_AIF1_RATE_MASK) == (tar & MADERA_AIF1_RATE_MASK)) /* already on the requested rate domain */
+		return 0;
+
+	mutex_lock(&priv->rate_lock);
+
+	if (!madera_can_change_grp_rate(priv, base + MADERA_AIF_RATE_CTRL)) {
+		madera_aif_warn(dai, "Cannot change rate while active\n");
+		ret = -EBUSY;
+		goto out;
+	}
+
+	/* Guard the rate change with SYSCLK cycles */
+	madera_spin_sysclk(priv);
+	snd_soc_component_update_bits(component, base + MADERA_AIF_RATE_CTRL,
+				      MADERA_AIF1_RATE_MASK, tar);
+	madera_spin_sysclk(priv);
+
+out:
+	mutex_unlock(&priv->rate_lock);
+
+	return ret; /* 0 from the regmap_read above unless set to -EBUSY */
+}
+
+static int madera_aif_cfg_changed(struct snd_soc_component *component, /* returns 1 if any of bclk/lrclk/frame differs from the registers, 0 if all match, or a read error */
+				  int base, int bclk, int lrclk, int frame)
+{
+	unsigned int val;
+	int ret;
+
+	ret = snd_soc_component_read(component, base + MADERA_AIF_BCLK_CTRL,
+				     &val);
+	if (ret)
+		return ret;
+	if (bclk != (val & MADERA_AIF1_BCLK_FREQ_MASK))
+		return 1;
+
+	ret = snd_soc_component_read(component, base + MADERA_AIF_RX_BCLK_RATE, /* only the RX BCLK rate register is compared */
+				     &val);
+	if (ret)
+		return ret;
+	if (lrclk != (val & MADERA_AIF1RX_BCPF_MASK))
+		return 1;
+
+	ret = snd_soc_component_read(component, base + MADERA_AIF_FRAME_CTRL_1, /* only FRAME_CTRL_1 (TX fields) is compared */
+				     &val);
+	if (ret)
+		return ret;
+	if (frame != (val & (MADERA_AIF1TX_WL_MASK |
+			     MADERA_AIF1TX_SLOT_LEN_MASK)))
+		return 1;
+
+	return 0;
+}
+
+static int madera_hw_params(struct snd_pcm_substream *substream, /* DAI hw_params op: picks a BCLK rate, then programs sample rate, BCLK, LRCLK and frame format */
+			    struct snd_pcm_hw_params *params,
+			    struct snd_soc_dai *dai)
+{
+	struct snd_soc_component *component = dai->component;
+	struct madera_priv *priv = snd_soc_component_get_drvdata(component);
+	struct madera *madera = priv->madera;
+	int base = dai->driver->base;
+	const int *rates;
+	int i, ret;
+	unsigned int val;
+	unsigned int channels = params_channels(params);
+	unsigned int rate = params_rate(params);
+	unsigned int chan_limit =
+			madera->pdata.codec.max_channels_clocked[dai->id - 1];
+	int tdm_width = priv->tdm_width[dai->id - 1];
+	int tdm_slots = priv->tdm_slots[dai->id - 1];
+	int bclk, lrclk, wl, frame, bclk_target, num_rates;
+	int reconfig;
+	unsigned int aif_tx_state = 0, aif_rx_state = 0;
+
+	if (rate % 4000) { /* not a multiple of 4000: search the 44.1k-family BCLK table */
+		rates = &madera_44k1_bclk_rates[0];
+		num_rates = ARRAY_SIZE(madera_44k1_bclk_rates);
+	} else {
+		rates = &madera_48k_bclk_rates[0];
+		num_rates = ARRAY_SIZE(madera_48k_bclk_rates);
+	}
+
+	wl = snd_pcm_format_width(params_format(params));
+
+	if (tdm_slots) { /* values stored by madera_set_tdm_slot() take precedence over the stream parameters */
+		madera_aif_dbg(dai, "Configuring for %d %d bit TDM slots\n",
+			       tdm_slots, tdm_width);
+		bclk_target = tdm_slots * tdm_width * rate;
+		channels = tdm_slots;
+	} else {
+		bclk_target = snd_soc_params_to_bclk(params);
+		tdm_width = wl;
+	}
+
+	if (chan_limit && chan_limit < channels) { /* scale the target down to the configured channel limit */
+		madera_aif_dbg(dai, "Limiting to %d channels\n", chan_limit);
+		bclk_target /= channels;
+		bclk_target *= chan_limit;
+	}
+
+	/* Force multiple of 2 channels for I2S mode */
+	ret = snd_soc_component_read(component, base + MADERA_AIF_FORMAT, &val);
+	if (ret)
+		return ret;
+
+	val &= MADERA_AIF1_FMT_MASK;
+	if ((channels & 1) && val == MADERA_FMT_I2S_MODE) {
+		madera_aif_dbg(dai, "Forcing stereo mode\n");
+		bclk_target /= channels;
+		bclk_target *= channels + 1;
+	}
+
+	for (i = 0; i < num_rates; i++) { /* first table rate that is >= the target and an exact multiple of the sample rate */
+		if (rates[i] >= bclk_target && rates[i] % rate == 0) {
+			bclk = i;
+			break;
+		}
+	}
+
+	if (i == num_rates) { /* bclk is only left unassigned on this path */
+		madera_aif_err(dai, "Unsupported sample rate %dHz\n", rate);
+		return -EINVAL;
+	}
+
+	lrclk = rates[bclk] / rate; /* BCLK cycles per sample frame */
+
+	madera_aif_dbg(dai, "BCLK %dHz LRCLK %dHz\n",
+		       rates[bclk], rates[bclk] / lrclk);
+
+	frame = wl << MADERA_AIF1TX_WL_SHIFT | tdm_width;
+
+	reconfig = madera_aif_cfg_changed(component, base, bclk, lrclk, frame);
+	if (reconfig < 0)
+		return reconfig;
+
+	if (reconfig) {
+		/* Save AIF TX/RX state */
+		regmap_read(madera->regmap, base + MADERA_AIF_TX_ENABLES,
+			    &aif_tx_state);
+		regmap_read(madera->regmap, base + MADERA_AIF_RX_ENABLES,
+			    &aif_rx_state);
+		/* Disable AIF TX/RX before reconfiguring it */
+		regmap_update_bits(madera->regmap,
+				   base + MADERA_AIF_TX_ENABLES, 0xff, 0x0);
+		regmap_update_bits(madera->regmap,
+				   base + MADERA_AIF_RX_ENABLES, 0xff, 0x0);
+	}
+
+	ret = madera_hw_params_rate(substream, params, dai);
+	if (ret != 0)
+		goto restore_aif; /* skip the reconfiguration but still restore the enables */
+
+	if (reconfig) {
+		regmap_update_bits(madera->regmap,
+				   base + MADERA_AIF_BCLK_CTRL,
+				   MADERA_AIF1_BCLK_FREQ_MASK, bclk);
+		regmap_update_bits(madera->regmap,
+				   base + MADERA_AIF_RX_BCLK_RATE,
+				   MADERA_AIF1RX_BCPF_MASK, lrclk);
+		regmap_update_bits(madera->regmap,
+				   base + MADERA_AIF_FRAME_CTRL_1,
+				   MADERA_AIF1TX_WL_MASK |
+				   MADERA_AIF1TX_SLOT_LEN_MASK, frame);
+		regmap_update_bits(madera->regmap, /* the same frame value is written to the RX fields */
+				   base + MADERA_AIF_FRAME_CTRL_2,
+				   MADERA_AIF1RX_WL_MASK |
+				   MADERA_AIF1RX_SLOT_LEN_MASK, frame);
+	}
+
+restore_aif:
+	if (reconfig) {
+		/* Restore AIF TX/RX state */
+		regmap_update_bits(madera->regmap,
+				   base + MADERA_AIF_TX_ENABLES,
+				   0xff, aif_tx_state);
+		regmap_update_bits(madera->regmap,
+				   base + MADERA_AIF_RX_ENABLES,
+				   0xff, aif_rx_state);
+	}
+
+	return ret;
+}
+
+static int madera_is_syncclk(int clk_id) /* 1 for the SYSCLK ids, 0 for the ASYNCCLK ids, -EINVAL for anything else */
+{
+	switch (clk_id) {
+	case MADERA_CLK_SYSCLK_1:
+	case MADERA_CLK_SYSCLK_2:
+	case MADERA_CLK_SYSCLK_3:
+		return 1;
+	case MADERA_CLK_ASYNCCLK_1:
+	case MADERA_CLK_ASYNCCLK_2:
+		return 0;
+	default:
+		return -EINVAL;
+	}
+}
+
+static int madera_dai_set_sysclk(struct snd_soc_dai *dai, /* DAI set_sysclk op: moves the DAI between the SYSCLK and ASYNCCLK domains; freq and dir are unused */
+				 int clk_id, unsigned int freq, int dir)
+{
+	struct snd_soc_component *component = dai->component;
+	struct snd_soc_dapm_context *dapm =
+		snd_soc_component_get_dapm(component);
+	struct madera_priv *priv = snd_soc_component_get_drvdata(component);
+	struct madera_dai_priv *dai_priv = &priv->dai[dai->id - 1];
+	struct snd_soc_dapm_route routes[2];
+	int is_sync;
+
+	is_sync = madera_is_syncclk(clk_id);
+	if (is_sync < 0) {
+		dev_err(component->dev, "Illegal DAI clock id %d\n", clk_id);
+		return is_sync;
+	}
+
+	if (is_sync == madera_is_syncclk(dai_priv->clk))
+		return 0; /* NOTE(review): dai_priv->clk is not updated on this path, e.g. SYSCLK_1 -> SYSCLK_2 - confirm this is intended */
+
+	if (dai->active) {
+		dev_err(component->dev, "Can't change clock on active DAI %d\n",
+			dai->id);
+		return -EBUSY;
+	}
+
+	dev_dbg(component->dev, "Setting AIF%d to %s\n", dai->id,
+		is_sync ? "SYSCLK" : "ASYNCCLK");
+
+	/*
+	 * A connection to SYSCLK is always required, we only add and remove
+	 * a connection to ASYNCCLK
+	 */
+	memset(&routes, 0, sizeof(routes));
+	routes[0].sink = dai->driver->capture.stream_name;
+	routes[1].sink = dai->driver->playback.stream_name;
+	routes[0].source = "ASYNCCLK";
+	routes[1].source = "ASYNCCLK";
+
+	if (is_sync)
+		snd_soc_dapm_del_routes(dapm, routes, ARRAY_SIZE(routes)); /* return value not checked */
+	else
+		snd_soc_dapm_add_routes(dapm, routes, ARRAY_SIZE(routes)); /* return value not checked */
+
+	dai_priv->clk = clk_id;
+
+	return snd_soc_dapm_sync(dapm);
+}
+
+static int madera_set_tristate(struct snd_soc_dai *dai, int tristate) /* DAI set_tristate op: sets or clears MADERA_AIF1_TRI in this AIF's RATE_CTRL register */
+{
+	struct snd_soc_component *component = dai->component;
+	int base = dai->driver->base;
+	unsigned int reg; /* holds the bit value to write, not a register address */
+	int ret;
+
+	if (tristate)
+		reg = MADERA_AIF1_TRI;
+	else
+		reg = 0;
+
+	ret = snd_soc_component_update_bits(component,
+					    base + MADERA_AIF_RATE_CTRL,
+					    MADERA_AIF1_TRI, reg);
+	if (ret < 0)
+		return ret;
+	else
+		return 0; /* any non-negative result is reported as 0 */
+}
+
+static void madera_set_channels_to_mask(struct snd_soc_dai *dai, /* for each channel i, write the position of the i-th lowest set bit of mask to register base + i */
+					unsigned int base,
+					int channels, unsigned int mask)
+{
+	struct snd_soc_component *component = dai->component;
+	struct madera_priv *priv = snd_soc_component_get_drvdata(component);
+	struct madera *madera = priv->madera;
+	int slot, i;
+
+	for (i = 0; i < channels; ++i) {
+		slot = ffs(mask) - 1; /* lowest set bit still in mask, or -1 when mask is empty */
+		if (slot < 0)
+			return; /* fewer slots than channels: remaining registers are left untouched */
+
+		regmap_write(madera->regmap, base + i, slot);
+
+		mask &= ~(1U << slot); /* unsigned shift: slot may be 31, which is undefined for a signed int */
+	}
+
+	if (mask) /* bits left over after every channel got a slot */
+		madera_aif_warn(dai, "Too many channels in TDM mask\n");
+}
+
+static int madera_set_tdm_slot(struct snd_soc_dai *dai, unsigned int tx_mask, /* DAI set_tdm_slot op: programs the slot of each channel and stores slots/slot_width for madera_hw_params() */
+			       unsigned int rx_mask, int slots, int slot_width)
+{
+	struct snd_soc_component *component = dai->component;
+	struct madera_priv *priv = snd_soc_component_get_drvdata(component);
+	int base = dai->driver->base;
+	int rx_max_chan = dai->driver->playback.channels_max; /* RX side is bounded by the playback channel count */
+	int tx_max_chan = dai->driver->capture.channels_max; /* TX side is bounded by the capture channel count */
+
+	/* Only support TDM for the physical AIFs */
+	if (dai->id > MADERA_MAX_AIF)
+		return -ENOTSUPP;
+
+	if (slots == 0) { /* slots == 0: use one slot per channel, starting at slot 0 */
+		tx_mask = (1 << tx_max_chan) - 1;
+		rx_mask = (1 << rx_max_chan) - 1;
+	}
+
+	madera_set_channels_to_mask(dai, base + MADERA_AIF_FRAME_CTRL_3,
+				    tx_max_chan, tx_mask);
+	madera_set_channels_to_mask(dai, base + MADERA_AIF_FRAME_CTRL_11,
+				    rx_max_chan, rx_mask);
+
+	priv->tdm_width[dai->id - 1] = slot_width;
+	priv->tdm_slots[dai->id - 1] = slots;
+
+	return 0;
+}
+
+const struct snd_soc_dai_ops madera_dai_ops = { /* exported DAI ops with the full callback set, including format, TDM and tristate */
+	.startup = &madera_startup,
+	.set_fmt = &madera_set_fmt,
+	.set_tdm_slot = &madera_set_tdm_slot,
+	.hw_params = &madera_hw_params,
+	.set_sysclk = &madera_dai_set_sysclk,
+	.set_tristate = &madera_set_tristate,
+};
+EXPORT_SYMBOL_GPL(madera_dai_ops);
+
+const struct snd_soc_dai_ops madera_simple_dai_ops = { /* exported reduced DAI ops: hw_params only sets the sample rate; no format, TDM or tristate callbacks */
+	.startup = &madera_startup,
+	.hw_params = &madera_hw_params_rate,
+	.set_sysclk = &madera_dai_set_sysclk,
+};
+EXPORT_SYMBOL_GPL(madera_simple_dai_ops);
+
+int madera_init_dai(struct madera_priv *priv, int id) /* set the defaults of priv->dai[id]; always returns 0 */
+{
+	struct madera_dai_priv *dai_priv = &priv->dai[id]; /* id is used directly as the index here, while the DAI ops index with dai->id - 1 */
+
+	dai_priv->clk = MADERA_CLK_SYSCLK_1;
+	dai_priv->constraint = madera_constraint; /* per-DAI copy, so madera_startup() can set its own mask */
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(madera_init_dai);
+
+static const struct { /* lookup from reference frequency range to FRATIO, used by madera_find_sync_fratio() */
+	unsigned int min; /* inclusive lower bound of fref, Hz */
+	unsigned int max; /* inclusive upper bound of fref, Hz */
+	u16 fratio; /* value stored through the fratio out-parameter */
+	int ratio; /* value returned to the caller */
+} fll_sync_fratios[] = {
+	{       0,    64000, 4, 16 },
+	{   64000,   128000, 3,  8 },
+	{  128000,   256000, 2,  4 },
+	{  256000,  1000000, 1,  2 },
+	{ 1000000, 13500000, 0,  1 },
+};
+
+static const unsigned int pseudo_fref_max[MADERA_FLL_MAX_FRATIO] = { /* upper fref limit in Hz per ratio; madera_calc_fratio() indexes it with ratio - 1 */
+	13500000,
+	 6144000,
+	 6144000,
+	 3072000,
+	 3072000,
+	 2822400,
+	 2822400,
+	 1536000,
+	 1536000,
+	 1536000,
+	 1536000,
+	 1536000,
+	 1536000,
+	 1536000,
+	 1536000,
+	  768000,
+};
+
+struct madera_fll_gains { /* one row of an fref-range to gain lookup table, see madera_find_fll_gain() */
+	unsigned int min; /* inclusive lower bound of fref, Hz */
+	unsigned int max; /* inclusive upper bound of fref, Hz */
+	int gain;		/* main gain */
+	int alt_gain;		/* alternate integer gain */
+};
+
+static const struct madera_fll_gains madera_fll_sync_gains[] = { /* gain table selected by madera_calc_fll() for the sync loop and for some device types; alt_gain is -1 in every row */
+	{       0,   256000, 0, -1 },
+	{  256000,  1000000, 2, -1 },
+	{ 1000000, 13500000, 4, -1 },
+};
+
+static const struct madera_fll_gains madera_fll_main_gains[] = { /* gain table selected by madera_calc_fll() for the main loop; the first matching row wins */
+	{       0,   100000, 0, 2 },
+	{  100000,   375000, 2, 2 },
+	{  375000,   768000, 3, 2 },
+	{  768001,  1500000, 3, 3 }, /* the only row whose min does not repeat the previous max */
+	{ 1500000,  6000000, 4, 3 },
+	{ 6000000, 13500000, 5, 3 },
+};
+
+static int madera_find_sync_fratio(unsigned int fref, int *fratio) /* table lookup: returns the ratio for fref and optionally stores its register value in *fratio */
+{
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(fll_sync_fratios); i++) {
+		if (fll_sync_fratios[i].min <= fref && /* bounds are inclusive, so a boundary value matches the earlier row */
+		    fref <= fll_sync_fratios[i].max) {
+			if (fratio) /* fratio may be NULL when only the ratio is wanted */
+				*fratio = fll_sync_fratios[i].fratio;
+
+			return fll_sync_fratios[i].ratio;
+		}
+	}
+
+	return -EINVAL; /* fref is above the last row's max */
+}
+
+static int madera_find_main_fratio(unsigned int fref, unsigned int fout, /* smallest ratio for which fout / (ratio * fref) does not exceed MADERA_FLL_MAX_N */
+				   int *fratio)
+{
+	int ratio = 1;
+
+	while ((fout / (ratio * fref)) > MADERA_FLL_MAX_N) /* divides by fref: callers must not pass 0 */
+		ratio++;
+
+	if (fratio) /* fratio may be NULL when only the ratio is wanted */
+		*fratio = ratio - 1;
+
+	return ratio;
+}
+
+static int madera_find_fratio(struct madera_fll *fll, unsigned int fref, /* choose the sync or main FRATIO calculation by device type, revision and loop */
+			      bool sync, int *fratio)
+{
+	switch (fll->madera->type) {
+	case CS47L35:
+		switch (fll->madera->rev) {
+		case 0:
+			/* rev A0 uses sync calculation for both loops */
+			return madera_find_sync_fratio(fref, fratio);
+		default:
+			if (sync)
+				return madera_find_sync_fratio(fref, fratio);
+			else
+				return madera_find_main_fratio(fref,
+							       fll->fout,
+							       fratio);
+		}
+		break; /* not reached: every path of the inner switch returns */
+	case CS47L85:
+	case WM1840:
+		/* these use the same calculation for main and sync loops */
+		return madera_find_sync_fratio(fref, fratio);
+	default:
+		if (sync)
+			return madera_find_sync_fratio(fref, fratio);
+		else
+			return madera_find_main_fratio(fref, fll->fout, fratio);
+	}
+}
+
+static int madera_calc_fratio(struct madera_fll *fll, /* fill cfg->refdiv and cfg->fratio for fref; returns the ratio to use or a negative errno */
+			      struct madera_fll_cfg *cfg,
+			      unsigned int fref, bool sync)
+{
+	int init_ratio, ratio;
+	int refdiv, div;
+
+	/* fref must be <=13.5MHz, find initial refdiv */
+	div = 1;
+	cfg->refdiv = 0;
+	while (fref > MADERA_FLL_MAX_FREF) {
+		div *= 2;
+		fref /= 2;
+		cfg->refdiv++; /* refdiv counts the halvings, i.e. log2(div) */
+
+		if (div > MADERA_FLL_MAX_REFDIV)
+			return -EINVAL;
+	}
+
+	/* Find an appropriate FLL_FRATIO */
+	init_ratio = madera_find_fratio(fll, fref, sync, &cfg->fratio);
+	if (init_ratio < 0) {
+		madera_fll_err(fll, "Unable to find FRATIO for fref=%uHz\n",
+			       fref);
+		return init_ratio;
+	}
+
+	if (!sync) /* main loop: overrides whatever the lookup stored in cfg->fratio */
+		cfg->fratio = init_ratio - 1;
+
+	switch (fll->madera->type) { /* every case except those named in the comment below returns init_ratio here */
+	case CS47L35:
+		switch (fll->madera->rev) {
+		case 0:
+			if (sync)
+				return init_ratio;
+			break;
+		default:
+			return init_ratio;
+		}
+		break;
+	case CS47L85:
+	case WM1840:
+		if (sync)
+			return init_ratio;
+		break;
+	default:
+		return init_ratio;
+	}
+
+	/*
+	 * For CS47L35 rev A0, CS47L85 and WM1840 adjust FRATIO/refdiv to avoid
+	 * integer mode if possible
+	 */
+	refdiv = cfg->refdiv;
+
+	while (div <= MADERA_FLL_MAX_REFDIV) {
+		/*
+		 * start from init_ratio because this may already give a
+		 * fractional N.K
+		 */
+		for (ratio = init_ratio; ratio > 0; ratio--) {
+			if (fll->fout % (ratio * fref)) { /* non-zero remainder means N.K is fractional */
+				cfg->refdiv = refdiv;
+				cfg->fratio = ratio - 1;
+				return ratio;
+			}
+		}
+
+		for (ratio = init_ratio + 1; ratio <= MADERA_FLL_MAX_FRATIO;
+		     ratio++) {
+			if ((MADERA_FLL_VCO_CORNER / 2) /
+			    (MADERA_FLL_VCO_MULT * ratio) < fref)
+				break;
+
+			if (fref > pseudo_fref_max[ratio - 1])
+				break;
+
+			if (fll->fout % (ratio * fref)) {
+				cfg->refdiv = refdiv;
+				cfg->fratio = ratio - 1;
+				return ratio;
+			}
+		}
+
+		div *= 2; /* nothing fractional at this refdiv: halve fref and retry */
+		fref /= 2;
+		refdiv++;
+		init_ratio = madera_find_fratio(fll, fref, sync, NULL); /* result is not checked for a negative value */
+	}
+
+	madera_fll_warn(fll, "Falling back to integer mode operation\n");
+
+	return cfg->fratio + 1; /* cfg still holds the initial refdiv/fratio chosen above */
+}
+
+static int madera_find_fll_gain(struct madera_fll *fll, /* copy gain and alt_gain of the first row containing fref into cfg; 0 on success, -EINVAL if none */
+				struct madera_fll_cfg *cfg,
+				unsigned int fref,
+				const struct madera_fll_gains *gains,
+				int n_gains)
+{
+	int i;
+
+	for (i = 0; i < n_gains; i++) {
+		if (gains[i].min <= fref && fref <= gains[i].max) { /* both bounds inclusive */
+			cfg->gain = gains[i].gain;
+			cfg->alt_gain = gains[i].alt_gain;
+			return 0;
+		}
+	}
+
+	madera_fll_err(fll, "Unable to find gain for fref=%uHz\n", fref);
+
+	return -EINVAL;
+}
+
+static int madera_calc_fll(struct madera_fll *fll, /* compute refdiv, fratio, N, theta, lambda and gains for fref into cfg; 0 or a negative errno */
+			   struct madera_fll_cfg *cfg,
+			   unsigned int fref, bool sync)
+{
+	unsigned int gcd_fll;
+	const struct madera_fll_gains *gains;
+	int n_gains;
+	int ratio, ret;
+
+	madera_fll_dbg(fll, "fref=%u Fout=%u fvco=%u\n",
+		       fref, fll->fout, fll->fout * MADERA_FLL_VCO_MULT);
+
+	/* Find an appropriate FLL_FRATIO and refdiv */
+	ratio = madera_calc_fratio(fll, cfg, fref, sync);
+	if (ratio < 0)
+		return ratio;
+
+	/* Apply the division for our remaining calculations */
+	fref = fref / (1 << cfg->refdiv);
+
+	cfg->n = fll->fout / (ratio * fref); /* integer part of fout / (ratio * fref) */
+
+	if (fll->fout % (ratio * fref)) { /* fractional part present: theta / lambda is the remainder in lowest terms */
+		gcd_fll = gcd(fll->fout, ratio * fref);
+		madera_fll_dbg(fll, "GCD=%u\n", gcd_fll);
+
+		cfg->theta = (fll->fout - (cfg->n * ratio * fref))
+			/ gcd_fll;
+		cfg->lambda = (ratio * fref) / gcd_fll;
+	} else {
+		cfg->theta = 0; /* integer mode */
+		cfg->lambda = 0;
+	}
+
+	/*
+	 * Round down to 16bit range with cost of accuracy lost.
+	 * Denominator must be bigger than numerator so we only
+	 * take care of it.
+	 */
+	while (cfg->lambda >= (1 << 16)) {
+		cfg->theta >>= 1;
+		cfg->lambda >>= 1;
+	}
+
+	switch (fll->madera->type) { /* same type/revision/loop split as madera_find_fratio() */
+	case CS47L35:
+		switch (fll->madera->rev) {
+		case 0:
+			/* Rev A0 uses the sync gains for both loops */
+			gains = madera_fll_sync_gains;
+			n_gains = ARRAY_SIZE(madera_fll_sync_gains);
+			break;
+		default:
+			if (sync) {
+				gains = madera_fll_sync_gains;
+				n_gains = ARRAY_SIZE(madera_fll_sync_gains);
+			} else {
+				gains = madera_fll_main_gains;
+				n_gains = ARRAY_SIZE(madera_fll_main_gains);
+			}
+			break;
+		}
+		break;
+	case CS47L85:
+	case WM1840:
+		/* These use the sync gains for both loops */
+		gains = madera_fll_sync_gains;
+		n_gains = ARRAY_SIZE(madera_fll_sync_gains);
+		break;
+	default:
+		if (sync) {
+			gains = madera_fll_sync_gains;
+			n_gains = ARRAY_SIZE(madera_fll_sync_gains);
+		} else {
+			gains = madera_fll_main_gains;
+			n_gains = ARRAY_SIZE(madera_fll_main_gains);
+		}
+		break;
+	}
+
+	ret = madera_find_fll_gain(fll, cfg, fref, gains, n_gains); /* looked up with the divided fref */
+	if (ret)
+		return ret;
+
+	madera_fll_dbg(fll, "N=%d THETA=%d LAMBDA=%d\n",
+		       cfg->n, cfg->theta, cfg->lambda);
+	madera_fll_dbg(fll, "FRATIO=0x%x(%d) REFCLK_DIV=0x%x(%d)\n",
+		       cfg->fratio, ratio, cfg->refdiv, 1 << cfg->refdiv);
+	madera_fll_dbg(fll, "GAIN=0x%x(%d)\n", cfg->gain, 1 << cfg->gain);
+
+	return 0;
+}
+
+static bool madera_write_fll(struct madera *madera, unsigned int base, /* write one loop's cfg to the registers at base; returns true if any register value changed */
+			     struct madera_fll_cfg *cfg, int source,
+			     bool sync, int gain)
+{
+	bool change, fll_change;
+
+	fll_change = false; /* accumulates the change flag of every update below */
+	regmap_update_bits_check(madera->regmap,
+				 base + MADERA_FLL_CONTROL_3_OFFS,
+				 MADERA_FLL1_THETA_MASK,
+				 cfg->theta, &change);
+	fll_change |= change;
+	regmap_update_bits_check(madera->regmap,
+				 base + MADERA_FLL_CONTROL_4_OFFS,
+				 MADERA_FLL1_LAMBDA_MASK,
+				 cfg->lambda, &change);
+	fll_change |= change;
+	regmap_update_bits_check(madera->regmap,
+				 base + MADERA_FLL_CONTROL_5_OFFS,
+				 MADERA_FLL1_FRATIO_MASK,
+				 cfg->fratio << MADERA_FLL1_FRATIO_SHIFT,
+				 &change);
+	fll_change |= change;
+	regmap_update_bits_check(madera->regmap,
+				 base + MADERA_FLL_CONTROL_6_OFFS,
+				 MADERA_FLL1_REFCLK_DIV_MASK |
+				 MADERA_FLL1_REFCLK_SRC_MASK,
+				 cfg->refdiv << MADERA_FLL1_REFCLK_DIV_SHIFT |
+				 source << MADERA_FLL1_REFCLK_SRC_SHIFT,
+				 &change);
+	fll_change |= change;
+
+	if (sync) { /* the gain goes to a different register offset for the sync loop */
+		regmap_update_bits_check(madera->regmap,
+					 base + MADERA_FLL_SYNCHRONISER_7_OFFS,
+					 MADERA_FLL1_GAIN_MASK,
+					 gain << MADERA_FLL1_GAIN_SHIFT,
+					 &change);
+		fll_change |= change;
+	} else {
+		regmap_update_bits_check(madera->regmap,
+					 base + MADERA_FLL_CONTROL_7_OFFS,
+					 MADERA_FLL1_GAIN_MASK,
+					 gain << MADERA_FLL1_GAIN_SHIFT,
+					 &change);
+		fll_change |= change;
+	}
+
+	regmap_update_bits_check(madera->regmap, /* N is written last, together with the CTRL_UPD bit */
+				 base + MADERA_FLL_CONTROL_2_OFFS,
+				 MADERA_FLL1_CTRL_UPD | MADERA_FLL1_N_MASK,
+				 MADERA_FLL1_CTRL_UPD | cfg->n, &change);
+	fll_change |= change;
+
+	return fll_change;
+}
+
+static int madera_is_enabled_fll(struct madera_fll *fll, int base) /* returns the masked MADERA_FLL1_ENA bit (non-zero if enabled) or a negative read error */
+{
+	struct madera *madera = fll->madera;
+	unsigned int reg; /* receives the register value, not an address */
+	int ret;
+
+	ret = regmap_read(madera->regmap,
+			  base + MADERA_FLL_CONTROL_1_OFFS, &reg);
+	if (ret != 0) {
+		madera_fll_err(fll, "Failed to read current state: %d\n", ret);
+		return ret;
+	}
+
+	return reg & MADERA_FLL1_ENA;
+}
+
+static int madera_wait_for_fll(struct madera_fll *fll, bool requested) /* poll the FLL lock status until it equals requested; 0 on match, -ETIMEDOUT after 30 polls */
+{
+	struct madera *madera = fll->madera;
+	unsigned int val = 0;
+	bool status;
+	int i;
+
+	madera_fll_dbg(fll, "Waiting for FLL...\n");
+
+	for (i = 0; i < 30; i++) {
+		regmap_read(madera->regmap, MADERA_IRQ1_RAW_STATUS_2, &val); /* return value not checked */
+		status = val & (MADERA_FLL1_LOCK_STS1 << (fll->id - 1)); /* the status bit position is offset by fll->id - 1 */
+		if (status == requested)
+			return 0;
+
+		switch (i) {
+		case 0 ... 5:
+			usleep_range(75, 125);
+			break;
+		case 11 ... 20:
+			usleep_range(750, 1250);
+			break;
+		default: /* NOTE(review): i = 6..10 match neither range above and also sleep 20 ms here - confirm this is intended */
+			msleep(20);
+			break;
+		}
+	}
+
+	madera_fll_warn(fll, "Timed out waiting for lock\n");
+
+	return -ETIMEDOUT;
+}
+
+static bool madera_set_fll_phase_integrator(struct madera_fll *fll,
+					    struct madera_fll_cfg *ref_cfg,
+					    bool sync)
+{
+	unsigned int val;
+	bool reg_change;
+
+	if (!sync && ref_cfg->theta == 0)
+		val = (1 << MADERA_FLL1_PHASE_ENA_SHIFT) |
+		      (2 << MADERA_FLL1_PHASE_GAIN_SHIFT);
+	else
+		val = 2 << MADERA_FLL1_PHASE_GAIN_SHIFT;
+
+	regmap_update_bits_check(fll->madera->regmap,
+				 fll->base + MADERA_FLL_EFS_2_OFFS,
+				 MADERA_FLL1_PHASE_ENA_MASK |
+				 MADERA_FLL1_PHASE_GAIN_MASK,
+				 val, &reg_change);
+
+	return reg_change;
+}
+
+static void madera_disable_fll(struct madera_fll *fll)
+{
+	struct madera *madera = fll->madera;
+	unsigned int sync_base;
+	bool change;
+
+	switch (madera->type) {
+	case CS47L35:
+		sync_base = fll->base + CS47L35_FLL_SYNCHRONISER_OFFS;
+		break;
+	default:
+		sync_base = fll->base + MADERA_FLL_SYNCHRONISER_OFFS;
+		break;
+	}
+
+	madera_fll_dbg(fll, "Disabling FLL\n");
+
+	regmap_update_bits(madera->regmap,
+			   fll->base + MADERA_FLL_CONTROL_1_OFFS,
+			   MADERA_FLL1_FREERUN, MADERA_FLL1_FREERUN);
+	regmap_update_bits_check(madera->regmap,
+				 fll->base + MADERA_FLL_CONTROL_1_OFFS,
+				 MADERA_FLL1_ENA, 0, &change);
+	regmap_update_bits(madera->regmap,
+			   sync_base + MADERA_FLL_SYNCHRONISER_1_OFFS,
+			   MADERA_FLL1_SYNC_ENA, 0);
+	regmap_update_bits(madera->regmap,
+			   fll->base + MADERA_FLL_CONTROL_1_OFFS,
+			   MADERA_FLL1_FREERUN, 0);
+
+	madera_wait_for_fll(fll, false);
+
+	if (change)
+		pm_runtime_put_autosuspend(madera->dev);
+}
+
+static int madera_enable_fll(struct madera_fll *fll)
+{
+	struct madera *madera = fll->madera;
+	bool have_sync = false;
+	int already_enabled = madera_is_enabled_fll(fll, fll->base);
+	int sync_enabled;
+	struct madera_fll_cfg cfg;
+	unsigned int sync_base;
+	int gain, ret;
+	bool fll_change = false;
+
+	if (already_enabled < 0)
+		return already_enabled;	/* error getting current state */
+
+	if (fll->ref_src < 0 || fll->ref_freq == 0) {
+		madera_fll_err(fll, "No REFCLK\n");
+		ret = -EINVAL;
+		goto err;
+	}
+
+	madera_fll_dbg(fll, "Enabling FLL, initially %s\n",
+		       already_enabled ? "enabled" : "disabled");
+
+	if (fll->fout < MADERA_FLL_MIN_FOUT ||
+	    fll->fout > MADERA_FLL_MAX_FOUT) {
+		madera_fll_err(fll, "invalid fout %uHz\n", fll->fout);
+		ret = -EINVAL;
+		goto err;
+	}
+
+	switch (madera->type) {
+	case CS47L35:
+		sync_base = fll->base + CS47L35_FLL_SYNCHRONISER_OFFS;
+		break;
+	default:
+		sync_base = fll->base + MADERA_FLL_SYNCHRONISER_OFFS;
+		break;
+	}
+
+	sync_enabled = madera_is_enabled_fll(fll, sync_base);
+	if (sync_enabled < 0)
+		return sync_enabled;
+
+	if (already_enabled) {
+		/* Facilitate smooth refclk across the transition */
+		regmap_update_bits(fll->madera->regmap,
+				   fll->base + MADERA_FLL_CONTROL_1_OFFS,
+				   MADERA_FLL1_FREERUN,
+				   MADERA_FLL1_FREERUN);
+		udelay(32);
+		regmap_update_bits(fll->madera->regmap,
+				   fll->base + MADERA_FLL_CONTROL_7_OFFS,
+				   MADERA_FLL1_GAIN_MASK, 0);
+	}
+
+	/* Apply SYNCCLK setting */
+	if (fll->sync_src >= 0) {
+		ret = madera_calc_fll(fll, &cfg, fll->sync_freq, true);
+		if (ret < 0)
+			goto err;
+
+		fll_change |= madera_write_fll(madera, sync_base,
+					       &cfg, fll->sync_src,
+					       true, cfg.gain);
+		have_sync = true;
+	}
+
+	if (already_enabled && !!sync_enabled != have_sync)
+		madera_fll_warn(fll, "Synchroniser changed on active FLL\n");
+
+	/* Apply REFCLK setting */
+	ret = madera_calc_fll(fll, &cfg, fll->ref_freq, false);
+	if (ret < 0)
+		goto err;
+
+	/* Ref path hardcodes lambda to 65536 when sync is on */
+	if (have_sync && cfg.lambda)
+		cfg.theta = (cfg.theta * (1 << 16)) / cfg.lambda;
+
+	switch (fll->madera->type) {
+	case CS47L35:
+		switch (fll->madera->rev) {
+		case 0:
+			gain = cfg.gain;
+			break;
+		default:
+			fll_change |=
+				madera_set_fll_phase_integrator(fll, &cfg,
+								have_sync);
+			if (!have_sync && cfg.theta == 0)
+				gain = cfg.alt_gain;
+			else
+				gain = cfg.gain;
+			break;
+		}
+		break;
+	case CS47L85:
+	case WM1840:
+		gain = cfg.gain;
+		break;
+	default:
+		fll_change |= madera_set_fll_phase_integrator(fll, &cfg,
+							      have_sync);
+		if (!have_sync && cfg.theta == 0)
+			gain = cfg.alt_gain;
+		else
+			gain = cfg.gain;
+		break;
+	}
+
+	fll_change |= madera_write_fll(madera, fll->base,
+				       &cfg, fll->ref_src,
+				       false, gain);
+
+	/*
+	 * Increase the bandwidth if we're not using a low frequency
+	 * sync source.
+	 */
+	if (have_sync && fll->sync_freq > 100000)
+		regmap_update_bits(madera->regmap,
+				   sync_base + MADERA_FLL_SYNCHRONISER_7_OFFS,
+				   MADERA_FLL1_SYNC_DFSAT_MASK, 0);
+	else
+		regmap_update_bits(madera->regmap,
+				   sync_base + MADERA_FLL_SYNCHRONISER_7_OFFS,
+				   MADERA_FLL1_SYNC_DFSAT_MASK,
+				   MADERA_FLL1_SYNC_DFSAT);
+
+	if (!already_enabled)
+		pm_runtime_get_sync(madera->dev);
+
+	if (have_sync)
+		regmap_update_bits(madera->regmap,
+				   sync_base + MADERA_FLL_SYNCHRONISER_1_OFFS,
+				   MADERA_FLL1_SYNC_ENA,
+				   MADERA_FLL1_SYNC_ENA);
+	regmap_update_bits(madera->regmap,
+			   fll->base + MADERA_FLL_CONTROL_1_OFFS,
+			   MADERA_FLL1_ENA, MADERA_FLL1_ENA);
+
+	if (already_enabled)
+		regmap_update_bits(madera->regmap,
+				   fll->base + MADERA_FLL_CONTROL_1_OFFS,
+				   MADERA_FLL1_FREERUN, 0);
+
+	if (fll_change || !already_enabled)
+		madera_wait_for_fll(fll, true);
+
+	return 0;
+
+err:
+	 /* In case of error don't leave the FLL running with an old config */
+	madera_disable_fll(fll);
+
+	return ret;
+}
+
+static int madera_apply_fll(struct madera_fll *fll)
+{
+	if (fll->fout) {
+		return madera_enable_fll(fll);
+	} else {
+		madera_disable_fll(fll);
+		return 0;
+	}
+}
+
+int madera_set_fll_syncclk(struct madera_fll *fll, int source,
+			   unsigned int fref, unsigned int fout)
+{
+	/*
+	 * fout is ignored, since the synchronizer is an optional extra
+	 * constraint on the Fout generated from REFCLK, so the Fout is
+	 * set when configuring REFCLK
+	 */
+
+	if (fll->sync_src == source && fll->sync_freq == fref)
+		return 0;
+
+	fll->sync_src = source;
+	fll->sync_freq = fref;
+
+	return madera_apply_fll(fll);
+}
+EXPORT_SYMBOL_GPL(madera_set_fll_syncclk);
+
+int madera_set_fll_refclk(struct madera_fll *fll, int source,
+			  unsigned int fref, unsigned int fout)
+{
+	int ret;
+
+	if (fll->ref_src == source &&
+	    fll->ref_freq == fref && fll->fout == fout)
+		return 0;
+
+	/*
+	 * Changes of fout on an enabled FLL aren't allowed except when
+	 * setting fout==0 to disable the FLL
+	 */
+	if (fout && fout != fll->fout) {
+		ret = madera_is_enabled_fll(fll, fll->base);
+		if (ret < 0)
+			return ret;
+
+		if (ret) {
+			madera_fll_err(fll, "Can't change Fout on active FLL\n");
+			return -EBUSY;
+		}
+	}
+
+	fll->ref_src = source;
+	fll->ref_freq = fref;
+	fll->fout = fout;
+
+	return madera_apply_fll(fll);
+}
+EXPORT_SYMBOL_GPL(madera_set_fll_refclk);
+
+int madera_init_fll(struct madera *madera, int id, int base,
+		    struct madera_fll *fll)
+{
+	fll->id = id;
+	fll->base = base;
+	fll->madera = madera;
+	fll->ref_src = MADERA_FLL_SRC_NONE;
+	fll->sync_src = MADERA_FLL_SRC_NONE;
+
+	regmap_update_bits(madera->regmap,
+			   fll->base + MADERA_FLL_CONTROL_1_OFFS,
+			   MADERA_FLL1_FREERUN, 0);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(madera_init_fll);
+
+static const struct reg_sequence madera_fll_ao_32K_49M_patch[] = {
+	{ MADERA_FLLAO_CONTROL_2,  0x02EE },
+	{ MADERA_FLLAO_CONTROL_3,  0x0000 },
+	{ MADERA_FLLAO_CONTROL_4,  0x0001 },
+	{ MADERA_FLLAO_CONTROL_5,  0x0002 },
+	{ MADERA_FLLAO_CONTROL_6,  0x8001 },
+	{ MADERA_FLLAO_CONTROL_7,  0x0004 },
+	{ MADERA_FLLAO_CONTROL_8,  0x0077 },
+	{ MADERA_FLLAO_CONTROL_10, 0x06D8 },
+	{ MADERA_FLLAO_CONTROL_11, 0x0085 },
+	{ MADERA_FLLAO_CONTROL_2,  0x82EE },
+};
+
+static const struct reg_sequence madera_fll_ao_32K_45M_patch[] = {
+	{ MADERA_FLLAO_CONTROL_2,  0x02B1 },
+	{ MADERA_FLLAO_CONTROL_3,  0x0001 },
+	{ MADERA_FLLAO_CONTROL_4,  0x0010 },
+	{ MADERA_FLLAO_CONTROL_5,  0x0002 },
+	{ MADERA_FLLAO_CONTROL_6,  0x8001 },
+	{ MADERA_FLLAO_CONTROL_7,  0x0004 },
+	{ MADERA_FLLAO_CONTROL_8,  0x0077 },
+	{ MADERA_FLLAO_CONTROL_10, 0x06D8 },
+	{ MADERA_FLLAO_CONTROL_11, 0x0005 },
+	{ MADERA_FLLAO_CONTROL_2,  0x82B1 },
+};
+
+struct madera_fllao_patch {
+	unsigned int fin;
+	unsigned int fout;
+	const struct reg_sequence *patch;
+	unsigned int patch_size;
+};
+
+static const struct madera_fllao_patch madera_fllao_settings[] = {
+	{
+		.fin = 32768,
+		.fout = 49152000,
+		.patch = madera_fll_ao_32K_49M_patch,
+		.patch_size = ARRAY_SIZE(madera_fll_ao_32K_49M_patch),
+
+	},
+	{
+		.fin = 32768,
+		.fout = 45158400,
+		.patch = madera_fll_ao_32K_45M_patch,
+		.patch_size = ARRAY_SIZE(madera_fll_ao_32K_45M_patch),
+	},
+};
+
+static int madera_enable_fll_ao(struct madera_fll *fll,
+				const struct reg_sequence *patch,
+				unsigned int patch_size)
+{
+	struct madera *madera = fll->madera;
+	int already_enabled = madera_is_enabled_fll(fll, fll->base);
+	unsigned int val;
+	int i;
+
+	if (already_enabled < 0)
+		return already_enabled;
+
+	if (!already_enabled)
+		pm_runtime_get_sync(madera->dev);
+
+	madera_fll_dbg(fll, "Enabling FLL_AO, initially %s\n",
+		       already_enabled ? "enabled" : "disabled");
+
+	/* FLL_AO_HOLD must be set before configuring any registers */
+	regmap_update_bits(fll->madera->regmap,
+			   fll->base + MADERA_FLLAO_CONTROL_1_OFFS,
+			   MADERA_FLL_AO_HOLD, MADERA_FLL_AO_HOLD);
+
+	for (i = 0; i < patch_size; i++) {
+		val = patch[i].def;
+
+		/* modify the patch to apply fll->ref_src as input clock */
+		if (patch[i].reg == MADERA_FLLAO_CONTROL_6) {
+			val &= ~MADERA_FLL_AO_REFCLK_SRC_MASK;
+			val |= (fll->ref_src << MADERA_FLL_AO_REFCLK_SRC_SHIFT)
+				& MADERA_FLL_AO_REFCLK_SRC_MASK;
+		}
+
+		regmap_write(madera->regmap, patch[i].reg, val);
+	}
+
+	regmap_update_bits(madera->regmap,
+			   fll->base + MADERA_FLLAO_CONTROL_1_OFFS,
+			   MADERA_FLL_AO_ENA, MADERA_FLL_AO_ENA);
+
+	/* Release the hold so that fll_ao locks to external frequency */
+	regmap_update_bits(madera->regmap,
+			   fll->base + MADERA_FLLAO_CONTROL_1_OFFS,
+			   MADERA_FLL_AO_HOLD, 0);
+
+	if (!already_enabled)
+		madera_wait_for_fll(fll, true);
+
+	return 0;
+}
+
+static int madera_disable_fll_ao(struct madera_fll *fll)
+{
+	struct madera *madera = fll->madera;
+	bool change;
+
+	madera_fll_dbg(fll, "Disabling FLL_AO\n");
+
+	regmap_update_bits(madera->regmap,
+			   fll->base + MADERA_FLLAO_CONTROL_1_OFFS,
+			   MADERA_FLL_AO_HOLD, MADERA_FLL_AO_HOLD);
+	regmap_update_bits_check(madera->regmap,
+				 fll->base + MADERA_FLLAO_CONTROL_1_OFFS,
+				 MADERA_FLL_AO_ENA, 0, &change);
+
+	madera_wait_for_fll(fll, false);
+
+	/*
+	 * ctrl_up gates the writes to all fll_ao registers. Setting it to 0
+	 * here ensures that, when the fllao is enabled after a runtime
+	 * suspend/resume cycle, ctrl_up is the last bit configured by the
+	 * fllao enable code, rather than being restored much earlier by the
+	 * cache sync operation before all the other fllao registers have
+	 * been written out.
+	 */
+	regmap_update_bits(madera->regmap,
+			   fll->base + MADERA_FLLAO_CONTROL_2_OFFS,
+			   MADERA_FLL_AO_CTRL_UPD_MASK, 0);
+
+	if (change)
+		pm_runtime_put_autosuspend(madera->dev);
+
+	return 0;
+}
+
+int madera_set_fll_ao_refclk(struct madera_fll *fll, int source,
+			     unsigned int fin, unsigned int fout)
+{
+	int ret = 0;
+	const struct reg_sequence *patch = NULL;
+	int patch_size = 0;
+	unsigned int i;
+
+	if (fll->ref_src == source &&
+	    fll->ref_freq == fin && fll->fout == fout)
+		return 0;
+
+	madera_fll_dbg(fll, "Change FLL_AO refclk to fin=%u fout=%u source=%d\n",
+		       fin, fout, source);
+
+	if (fout && (fll->ref_freq != fin || fll->fout != fout)) {
+		for (i = 0; i < ARRAY_SIZE(madera_fllao_settings); i++) {
+			if (madera_fllao_settings[i].fin == fin &&
+			    madera_fllao_settings[i].fout == fout)
+				break;
+		}
+
+		if (i == ARRAY_SIZE(madera_fllao_settings)) {
+			madera_fll_err(fll,
+				       "No matching configuration for FLL_AO\n");
+			return -EINVAL;
+		}
+
+		patch = madera_fllao_settings[i].patch;
+		patch_size = madera_fllao_settings[i].patch_size;
+	}
+
+	fll->ref_src = source;
+	fll->ref_freq = fin;
+	fll->fout = fout;
+
+	if (fout)
+		ret = madera_enable_fll_ao(fll, patch, patch_size);
+	else
+		madera_disable_fll_ao(fll);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(madera_set_fll_ao_refclk);
+
+/**
+ * madera_set_output_mode - Set the mode of the specified output
+ *
+ * @component: Device to configure
+ * @output: Output number
+ * @differential: True to set the output to differential mode
+ *
+ * Some systems use external analogue switches to connect more
+ * analogue devices to the CODEC than are supported by the device.  In
+ * some systems this requires changing the switched output from single
+ * ended to differential mode dynamically at runtime, an operation
+ * supported using this function.
+ *
+ * Most systems have a single static configuration and should use
+ * platform data instead.
+ */
+int madera_set_output_mode(struct snd_soc_component *component, int output,
+			   bool differential)
+{
+	unsigned int reg, val;
+	int ret;
+
+	if (output < 1 || output > MADERA_MAX_OUTPUT)
+		return -EINVAL;
+
+	reg = MADERA_OUTPUT_PATH_CONFIG_1L + (output - 1) * 8;
+
+	if (differential)
+		val = MADERA_OUT1_MONO;
+	else
+		val = 0;
+
+	ret = snd_soc_component_update_bits(component, reg, MADERA_OUT1_MONO,
+					    val);
+	if (ret < 0)
+		return ret;
+	else
+		return 0;
+}
+EXPORT_SYMBOL_GPL(madera_set_output_mode);
+
+/* True if the coefficient pair fails the checks; mode selects which check */
+static bool madera_eq_filter_unstable(bool mode, __be16 _a, __be16 _b)
+{
+	s16 a = be16_to_cpu(_a);
+	s16 b = be16_to_cpu(_b);
+
+	if (!mode)
+		return abs(a) >= 4096;
+	if (abs(b) >= 4096)
+		return true;
+
+	/* Multiply, don't shift: left-shifting a negative a is undefined */
+	return abs((a * (1 << 16)) / (4096 - b)) >= 4096 << 4;
+}
+
+int madera_eq_coeff_put(struct snd_kcontrol *kcontrol,
+			struct snd_ctl_elem_value *ucontrol)
+{
+	struct snd_soc_component *component =
+		snd_soc_kcontrol_component(kcontrol);
+	struct madera_priv *priv = snd_soc_component_get_drvdata(component);
+	struct madera *madera = priv->madera;
+	struct soc_bytes *params = (void *)kcontrol->private_value;
+	unsigned int val;
+	__be16 *data;
+	int len;
+	int ret;
+
+	len = params->num_regs * regmap_get_val_bytes(madera->regmap);
+
+	data = kmemdup(ucontrol->value.bytes.data, len, GFP_KERNEL | GFP_DMA);
+	if (!data)
+		return -ENOMEM;
+
+	data[0] &= cpu_to_be16(MADERA_EQ1_B1_MODE);
+
+	if (madera_eq_filter_unstable(!!data[0], data[1], data[2]) ||
+	    madera_eq_filter_unstable(true, data[4], data[5]) ||
+	    madera_eq_filter_unstable(true, data[8], data[9]) ||
+	    madera_eq_filter_unstable(true, data[12], data[13]) ||
+	    madera_eq_filter_unstable(false, data[16], data[17])) {
+		dev_err(madera->dev, "Rejecting unstable EQ coefficients\n");
+		ret = -EINVAL;
+		goto out;
+	}
+
+	ret = regmap_read(madera->regmap, params->base, &val);
+	if (ret != 0)
+		goto out;
+
+	val &= ~MADERA_EQ1_B1_MODE;
+	data[0] |= cpu_to_be16(val);
+
+	ret = regmap_raw_write(madera->regmap, params->base, data, len);
+
+out:
+	kfree(data);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(madera_eq_coeff_put);
+
+int madera_lhpf_coeff_put(struct snd_kcontrol *kcontrol,
+			  struct snd_ctl_elem_value *ucontrol)
+{
+	struct snd_soc_component *component =
+		snd_soc_kcontrol_component(kcontrol);
+	struct madera_priv *priv = snd_soc_component_get_drvdata(component);
+	struct madera *madera = priv->madera;
+	__be16 *data = (__be16 *)ucontrol->value.bytes.data;
+	s16 val = be16_to_cpu(*data);
+
+	if (abs(val) >= 4096) {
+		dev_err(madera->dev, "Rejecting unstable LHPF coefficients\n");
+		return -EINVAL;
+	}
+
+	return snd_soc_bytes_put(kcontrol, ucontrol);
+}
+EXPORT_SYMBOL_GPL(madera_lhpf_coeff_put);
+
+MODULE_SOFTDEP("pre: madera");
+MODULE_DESCRIPTION("ASoC Cirrus Logic Madera codec support");
+MODULE_AUTHOR("Charles Keepax <ckeepax@opensource.cirrus.com>");
+MODULE_AUTHOR("Richard Fitzgerald <rf@opensource.cirrus.com>");
+MODULE_LICENSE("GPL v2");
diff --git a/sound/soc/codecs/madera.h b/sound/soc/codecs/madera.h
new file mode 100644
index 000000000000..aa2db156582b
--- /dev/null
+++ b/sound/soc/codecs/madera.h
@@ -0,0 +1,446 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Cirrus Logic Madera class codecs common support
+ *
+ * Copyright (C) 2015-2018 Cirrus Logic, Inc. and
+ *                         Cirrus Logic International Semiconductor Ltd.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; version 2.
+ */
+
+#ifndef ASOC_MADERA_H
+#define ASOC_MADERA_H
+
+#include <linux/completion.h>
+#include <sound/soc.h>
+#include <sound/madera-pdata.h>
+
+#include "wm_adsp.h"
+
+#define MADERA_FLL1_REFCLK		1
+#define MADERA_FLL2_REFCLK		2
+#define MADERA_FLL3_REFCLK		3
+#define MADERA_FLLAO_REFCLK		4
+#define MADERA_FLL1_SYNCCLK		5
+#define MADERA_FLL2_SYNCCLK		6
+#define MADERA_FLL3_SYNCCLK		7
+#define MADERA_FLLAO_SYNCCLK		8
+
+#define MADERA_FLL_SRC_NONE		-1
+#define MADERA_FLL_SRC_MCLK1		0
+#define MADERA_FLL_SRC_MCLK2		1
+#define MADERA_FLL_SRC_SLIMCLK		3
+#define MADERA_FLL_SRC_FLL1		4
+#define MADERA_FLL_SRC_FLL2		5
+#define MADERA_FLL_SRC_AIF1BCLK		8
+#define MADERA_FLL_SRC_AIF2BCLK		9
+#define MADERA_FLL_SRC_AIF3BCLK		10
+#define MADERA_FLL_SRC_AIF4BCLK		11
+#define MADERA_FLL_SRC_AIF1LRCLK	12
+#define MADERA_FLL_SRC_AIF2LRCLK	13
+#define MADERA_FLL_SRC_AIF3LRCLK	14
+#define MADERA_FLL_SRC_AIF4LRCLK	15
+
+#define MADERA_CLK_SYSCLK_1		1
+#define MADERA_CLK_ASYNCCLK_1		2
+#define MADERA_CLK_OPCLK		3
+#define MADERA_CLK_ASYNC_OPCLK		4
+#define MADERA_CLK_SYSCLK_2		5
+#define MADERA_CLK_SYSCLK_3		6
+#define MADERA_CLK_ASYNCCLK_2		7
+#define MADERA_CLK_DSPCLK		8
+
+#define MADERA_CLK_SRC_MCLK1		0x0
+#define MADERA_CLK_SRC_MCLK2		0x1
+#define MADERA_CLK_SRC_FLL1		0x4
+#define MADERA_CLK_SRC_FLL2		0x5
+#define MADERA_CLK_SRC_FLL3		0x6
+#define MADERA_CLK_SRC_FLLAO_HI		0x7
+#define MADERA_CLK_SRC_FLL1_DIV6	0x7
+#define MADERA_CLK_SRC_AIF1BCLK		0x8
+#define MADERA_CLK_SRC_AIF2BCLK		0x9
+#define MADERA_CLK_SRC_AIF3BCLK		0xA
+#define MADERA_CLK_SRC_AIF4BCLK		0xB
+#define MADERA_CLK_SRC_FLLAO		0xF
+
+#define MADERA_MIXER_VOL_MASK		0x00FE
+#define MADERA_MIXER_VOL_SHIFT		1
+#define MADERA_MIXER_VOL_WIDTH		7
+
+#define MADERA_DOM_GRP_FX		0
+#define MADERA_DOM_GRP_ASRC1		1
+#define MADERA_DOM_GRP_ASRC2		2
+#define MADERA_DOM_GRP_ISRC1		3
+#define MADERA_DOM_GRP_ISRC2		4
+#define MADERA_DOM_GRP_ISRC3		5
+#define MADERA_DOM_GRP_ISRC4		6
+#define MADERA_DOM_GRP_OUT		7
+#define MADERA_DOM_GRP_SPD		8
+#define MADERA_DOM_GRP_DSP1		9
+#define MADERA_DOM_GRP_DSP2		10
+#define MADERA_DOM_GRP_DSP3		11
+#define MADERA_DOM_GRP_DSP4		12
+#define MADERA_DOM_GRP_DSP5		13
+#define MADERA_DOM_GRP_DSP6		14
+#define MADERA_DOM_GRP_DSP7		15
+#define MADERA_DOM_GRP_AIF1		16
+#define MADERA_DOM_GRP_AIF2		17
+#define MADERA_DOM_GRP_AIF3		18
+#define MADERA_DOM_GRP_AIF4		19
+#define MADERA_DOM_GRP_SLIMBUS		20
+#define MADERA_DOM_GRP_PWM		21
+#define MADERA_DOM_GRP_DFC		22
+#define MADERA_N_DOM_GRPS		23
+
+#define MADERA_MAX_DAI			11
+#define MADERA_MAX_ADSP			7
+
+#define MADERA_NUM_MIXER_INPUTS		148
+
+struct madera;
+struct wm_adsp;
+
+struct madera_voice_trigger_info {
+	/** Which core triggered, 1-based (1 = DSP1, ...) */
+	int core_num;
+};
+
+struct madera_dai_priv {
+	int clk;
+	struct snd_pcm_hw_constraint_list constraint;
+};
+
+struct madera_priv {
+	struct wm_adsp adsp[MADERA_MAX_ADSP];
+	struct madera *madera;
+	struct device *dev;
+	int sysclk;
+	int asyncclk;
+	int dspclk;
+	struct madera_dai_priv dai[MADERA_MAX_DAI];
+
+	int num_inputs;
+
+	unsigned int in_pending;
+
+	unsigned int out_up_pending;
+	unsigned int out_up_delay;
+	unsigned int out_down_pending;
+	unsigned int out_down_delay;
+
+	unsigned int adsp_rate_cache[MADERA_MAX_ADSP];
+
+	struct mutex rate_lock;
+
+	int tdm_width[MADERA_MAX_AIF];
+	int tdm_slots[MADERA_MAX_AIF];
+
+	int domain_group_ref[MADERA_N_DOM_GRPS];
+};
+
+struct madera_fll_cfg {
+	int n;
+	unsigned int theta;
+	unsigned int lambda;
+	int refdiv;
+	int fratio;
+	int gain;
+	int alt_gain;
+};
+
+struct madera_fll {
+	struct madera *madera;
+	int id;
+	unsigned int base;
+
+	unsigned int fout;
+
+	int sync_src;
+	unsigned int sync_freq;
+
+	int ref_src;
+	unsigned int ref_freq;
+	struct madera_fll_cfg ref_cfg;
+};
+
+struct madera_enum {
+	struct soc_enum mixer_enum;
+	int val;
+};
+
+extern const unsigned int madera_ana_tlv[];
+extern const unsigned int madera_eq_tlv[];
+extern const unsigned int madera_digital_tlv[];
+extern const unsigned int madera_noise_tlv[];
+extern const unsigned int madera_ng_tlv[];
+
+extern const unsigned int madera_mixer_tlv[];
+extern const char * const madera_mixer_texts[MADERA_NUM_MIXER_INPUTS];
+extern const unsigned int madera_mixer_values[MADERA_NUM_MIXER_INPUTS];
+
+#define MADERA_GAINMUX_CONTROLS(name, base) \
+	SOC_SINGLE_RANGE_TLV(name " Input Volume", base + 1,		\
+			     MADERA_MIXER_VOL_SHIFT, 0x20, 0x50, 0,	\
+			     madera_mixer_tlv)
+
+#define MADERA_MIXER_CONTROLS(name, base) \
+	SOC_SINGLE_RANGE_TLV(name " Input 1 Volume", base + 1,		\
+			     MADERA_MIXER_VOL_SHIFT, 0x20, 0x50, 0,	\
+			     madera_mixer_tlv),			\
+	SOC_SINGLE_RANGE_TLV(name " Input 2 Volume", base + 3,		\
+			     MADERA_MIXER_VOL_SHIFT, 0x20, 0x50, 0,	\
+			     madera_mixer_tlv),			\
+	SOC_SINGLE_RANGE_TLV(name " Input 3 Volume", base + 5,		\
+			     MADERA_MIXER_VOL_SHIFT, 0x20, 0x50, 0,	\
+			     madera_mixer_tlv),			\
+	SOC_SINGLE_RANGE_TLV(name " Input 4 Volume", base + 7,		\
+			     MADERA_MIXER_VOL_SHIFT, 0x20, 0x50, 0,	\
+			     madera_mixer_tlv)
+
+#define MADERA_MUX_ENUM_DECL(name, reg) \
+	SOC_VALUE_ENUM_SINGLE_AUTODISABLE_DECL( \
+		name, reg, 0, 0xff, madera_mixer_texts, madera_mixer_values)
+
+#define MADERA_MUX_CTL_DECL(name) \
+	const struct snd_kcontrol_new name##_mux =	\
+		SOC_DAPM_ENUM("Route", name##_enum)
+
+#define MADERA_MUX_ENUMS(name, base_reg) \
+	static MADERA_MUX_ENUM_DECL(name##_enum, base_reg);	\
+	static MADERA_MUX_CTL_DECL(name)
+
+#define MADERA_MIXER_ENUMS(name, base_reg) \
+	MADERA_MUX_ENUMS(name##_in1, base_reg);     \
+	MADERA_MUX_ENUMS(name##_in2, base_reg + 2); \
+	MADERA_MUX_ENUMS(name##_in3, base_reg + 4); \
+	MADERA_MUX_ENUMS(name##_in4, base_reg + 6)
+
+#define MADERA_DSP_AUX_ENUMS(name, base_reg) \
+	MADERA_MUX_ENUMS(name##_aux1, base_reg);	\
+	MADERA_MUX_ENUMS(name##_aux2, base_reg + 8);	\
+	MADERA_MUX_ENUMS(name##_aux3, base_reg + 16);	\
+	MADERA_MUX_ENUMS(name##_aux4, base_reg + 24);	\
+	MADERA_MUX_ENUMS(name##_aux5, base_reg + 32);	\
+	MADERA_MUX_ENUMS(name##_aux6, base_reg + 40)
+
+#define MADERA_MUX(name, ctrl) \
+	SND_SOC_DAPM_MUX(name, SND_SOC_NOPM, 0, 0, ctrl)
+
+#define MADERA_MUX_WIDGETS(name, name_str) \
+	MADERA_MUX(name_str " Input 1", &name##_mux)
+
+#define MADERA_MIXER_WIDGETS(name, name_str)	\
+	MADERA_MUX(name_str " Input 1", &name##_in1_mux), \
+	MADERA_MUX(name_str " Input 2", &name##_in2_mux), \
+	MADERA_MUX(name_str " Input 3", &name##_in3_mux), \
+	MADERA_MUX(name_str " Input 4", &name##_in4_mux), \
+	SND_SOC_DAPM_MIXER(name_str " Mixer", SND_SOC_NOPM, 0, 0, NULL, 0)
+
+#define MADERA_DSP_WIDGETS(name, name_str)			\
+	MADERA_MIXER_WIDGETS(name##L, name_str "L"),		\
+	MADERA_MIXER_WIDGETS(name##R, name_str "R"),		\
+	MADERA_MUX(name_str " Aux 1", &name##_aux1_mux),	\
+	MADERA_MUX(name_str " Aux 2", &name##_aux2_mux),	\
+	MADERA_MUX(name_str " Aux 3", &name##_aux3_mux),	\
+	MADERA_MUX(name_str " Aux 4", &name##_aux4_mux),	\
+	MADERA_MUX(name_str " Aux 5", &name##_aux5_mux),	\
+	MADERA_MUX(name_str " Aux 6", &name##_aux6_mux)
+
+#define MADERA_MUX_ROUTES(widget, name) \
+	{ widget, NULL, name " Input 1" }, \
+	MADERA_MIXER_INPUT_ROUTES(name " Input 1")
+
+#define MADERA_MIXER_ROUTES(widget, name)		\
+	{ widget, NULL, name " Mixer" },		\
+	{ name " Mixer", NULL, name " Input 1" },	\
+	{ name " Mixer", NULL, name " Input 2" },	\
+	{ name " Mixer", NULL, name " Input 3" },	\
+	{ name " Mixer", NULL, name " Input 4" },	\
+	MADERA_MIXER_INPUT_ROUTES(name " Input 1"),	\
+	MADERA_MIXER_INPUT_ROUTES(name " Input 2"),	\
+	MADERA_MIXER_INPUT_ROUTES(name " Input 3"),	\
+	MADERA_MIXER_INPUT_ROUTES(name " Input 4")
+
+#define MADERA_DSP_ROUTES(name)				\
+	{ name, NULL, name " Preloader"},		\
+	{ name " Preload", NULL, name " Preloader"},	\
+	{ name, NULL, "SYSCLK"},			\
+	{ name, NULL, "DSPCLK"},			\
+	{ name, NULL, name " Aux 1" },			\
+	{ name, NULL, name " Aux 2" },			\
+	{ name, NULL, name " Aux 3" },			\
+	{ name, NULL, name " Aux 4" },			\
+	{ name, NULL, name " Aux 5" },			\
+	{ name, NULL, name " Aux 6" },			\
+	MADERA_MIXER_INPUT_ROUTES(name " Aux 1"),	\
+	MADERA_MIXER_INPUT_ROUTES(name " Aux 2"),	\
+	MADERA_MIXER_INPUT_ROUTES(name " Aux 3"),	\
+	MADERA_MIXER_INPUT_ROUTES(name " Aux 4"),	\
+	MADERA_MIXER_INPUT_ROUTES(name " Aux 5"),	\
+	MADERA_MIXER_INPUT_ROUTES(name " Aux 6"),	\
+	MADERA_MIXER_ROUTES(name, name "L"),		\
+	MADERA_MIXER_ROUTES(name, name "R")
+
+#define MADERA_RATE_ENUM(xname, xenum) \
+{	.iface = SNDRV_CTL_ELEM_IFACE_MIXER, .name = xname,\
+	.info = snd_soc_info_enum_double, \
+	.get = snd_soc_get_enum_double, .put = madera_rate_put, \
+	.private_value = (unsigned long)&xenum }
+
+#define MADERA_EQ_CONTROL(xname, xbase)				\
+{	.iface = SNDRV_CTL_ELEM_IFACE_MIXER, .name = xname,	\
+	.info = snd_soc_bytes_info, .get = snd_soc_bytes_get,	\
+	.put = madera_eq_coeff_put, .private_value =		\
+	((unsigned long)&(struct soc_bytes) { .base = xbase,	\
+	 .num_regs = 20, .mask = ~MADERA_EQ1_B1_MODE }) }
+
+#define MADERA_LHPF_CONTROL(xname, xbase)			\
+{	.iface = SNDRV_CTL_ELEM_IFACE_MIXER, .name = xname,	\
+	.info = snd_soc_bytes_info, .get = snd_soc_bytes_get,	\
+	.put = madera_lhpf_coeff_put, .private_value =		\
+	((unsigned long)&(struct soc_bytes) { .base = xbase,	\
+	 .num_regs = 1 }) }
+
+#define MADERA_RATES SNDRV_PCM_RATE_KNOT
+
+#define MADERA_FORMATS (SNDRV_PCM_FMTBIT_S16_LE | SNDRV_PCM_FMTBIT_S20_3LE | \
+			SNDRV_PCM_FMTBIT_S24_LE | SNDRV_PCM_FMTBIT_S32_LE)
+
+#define MADERA_OSR_ENUM_SIZE		5
+#define MADERA_SYNC_RATE_ENUM_SIZE	3
+#define MADERA_ASYNC_RATE_ENUM_SIZE	2
+#define MADERA_RATE_ENUM_SIZE \
+		(MADERA_SYNC_RATE_ENUM_SIZE + MADERA_ASYNC_RATE_ENUM_SIZE)
+#define MADERA_SAMPLE_RATE_ENUM_SIZE	16
+#define MADERA_DFC_TYPE_ENUM_SIZE	5
+#define MADERA_DFC_WIDTH_ENUM_SIZE	5
+
+extern const struct snd_soc_dai_ops madera_dai_ops;
+extern const struct snd_soc_dai_ops madera_simple_dai_ops;
+
+extern const struct snd_kcontrol_new madera_inmux[];
+extern const struct snd_kcontrol_new madera_inmode[];
+
+extern const char * const madera_rate_text[MADERA_RATE_ENUM_SIZE];
+extern const unsigned int madera_rate_val[MADERA_RATE_ENUM_SIZE];
+
+extern const struct soc_enum madera_sample_rate[];
+extern const struct soc_enum madera_isrc_fsl[];
+extern const struct soc_enum madera_isrc_fsh[];
+extern const struct soc_enum madera_asrc1_rate[];
+extern const struct soc_enum madera_asrc2_rate[];
+extern const struct soc_enum madera_dfc_width[];
+extern const struct soc_enum madera_dfc_type[];
+
+extern const struct soc_enum madera_in_vi_ramp;
+extern const struct soc_enum madera_in_vd_ramp;
+
+extern const struct soc_enum madera_out_vi_ramp;
+extern const struct soc_enum madera_out_vd_ramp;
+
+extern const struct soc_enum madera_lhpf1_mode;
+extern const struct soc_enum madera_lhpf2_mode;
+extern const struct soc_enum madera_lhpf3_mode;
+extern const struct soc_enum madera_lhpf4_mode;
+
+extern const struct soc_enum madera_ng_hold;
+extern const struct soc_enum madera_in_hpf_cut_enum;
+extern const struct soc_enum madera_in_dmic_osr[];
+
+extern const struct soc_enum madera_output_anc_src[];
+extern const struct soc_enum madera_anc_input_src[];
+extern const struct soc_enum madera_anc_ng_enum;
+
+extern const struct snd_kcontrol_new madera_dsp_trigger_output_mux[];
+extern const struct snd_kcontrol_new madera_drc_activity_output_mux[];
+
+extern const struct snd_kcontrol_new madera_adsp_rate_controls[];
+
+int madera_dfc_put(struct snd_kcontrol *kcontrol,
+		   struct snd_ctl_elem_value *ucontrol);
+
+int madera_lp_mode_put(struct snd_kcontrol *kcontrol,
+		       struct snd_ctl_elem_value *ucontrol);
+
+int madera_out1_demux_put(struct snd_kcontrol *kcontrol,
+			  struct snd_ctl_elem_value *ucontrol);
+int madera_out1_demux_get(struct snd_kcontrol *kcontrol,
+			  struct snd_ctl_elem_value *ucontrol);
+
+int madera_rate_put(struct snd_kcontrol *kcontrol,
+		    struct snd_ctl_elem_value *ucontrol);
+
+int madera_eq_coeff_put(struct snd_kcontrol *kcontrol,
+			struct snd_ctl_elem_value *ucontrol);
+int madera_lhpf_coeff_put(struct snd_kcontrol *kcontrol,
+			  struct snd_ctl_elem_value *ucontrol);
+
+int madera_sysclk_ev(struct snd_soc_dapm_widget *w,
+		     struct snd_kcontrol *kcontrol, int event);
+int madera_spk_ev(struct snd_soc_dapm_widget *w,
+		  struct snd_kcontrol *kcontrol, int event);
+int madera_in_ev(struct snd_soc_dapm_widget *w,
+		 struct snd_kcontrol *kcontrol, int event);
+int madera_out_ev(struct snd_soc_dapm_widget *w,
+		  struct snd_kcontrol *kcontrol, int event);
+int madera_hp_ev(struct snd_soc_dapm_widget *w,
+		 struct snd_kcontrol *kcontrol, int event);
+int madera_anc_ev(struct snd_soc_dapm_widget *w,
+		  struct snd_kcontrol *kcontrol, int event);
+int madera_domain_clk_ev(struct snd_soc_dapm_widget *w,
+			 struct snd_kcontrol *kcontrol,
+			 int event);
+
+int madera_set_adsp_clk(struct madera_priv *priv, int dsp_num,
+			unsigned int freq);
+
+int madera_set_sysclk(struct snd_soc_component *component, int clk_id,
+		      int source, unsigned int freq, int dir);
+
+int madera_init_fll(struct madera *madera, int id, int base,
+		    struct madera_fll *fll);
+int madera_set_fll_refclk(struct madera_fll *fll, int source,
+			  unsigned int fref, unsigned int fout);
+int madera_set_fll_syncclk(struct madera_fll *fll, int source,
+			   unsigned int fref, unsigned int fout);
+int madera_set_fll_ao_refclk(struct madera_fll *fll, int source,
+			     unsigned int fin, unsigned int fout);
+
+int madera_core_init(struct madera_priv *priv);
+int madera_core_free(struct madera_priv *priv);
+int madera_init_overheat(struct madera_priv *priv);
+int madera_free_overheat(struct madera_priv *priv);
+int madera_init_inputs(struct snd_soc_component *component);
+int madera_init_outputs(struct snd_soc_component *component, int n_mono_routes);
+int madera_init_bus_error_irq(struct madera_priv *priv, int dsp_num,
+			      irq_handler_t handler);
+void madera_free_bus_error_irq(struct madera_priv *priv, int dsp_num);
+
+int madera_init_dai(struct madera_priv *priv, int dai);
+
+int madera_set_output_mode(struct snd_soc_component *component, int output,
+			   bool differential);
+
+/* Following functions are for use by machine drivers */
+static inline int madera_register_notifier(struct snd_soc_component *component,
+					   struct notifier_block *nb)
+{
+	struct madera_priv *priv = snd_soc_component_get_drvdata(component);
+	struct madera *madera = priv->madera;
+
+	return blocking_notifier_chain_register(&madera->notifier, nb);
+}
+
+static inline int
+madera_unregister_notifier(struct snd_soc_component *component,
+			   struct notifier_block *nb)
+{
+	struct madera_priv *priv = snd_soc_component_get_drvdata(component);
+	struct madera *madera = priv->madera;
+
+	return blocking_notifier_chain_unregister(&madera->notifier, nb);
+}
+
+#endif
-- 
cgit v1.2.3


From fd5d10059d5ead12dd12f05ae6d96e70d1fac3df Mon Sep 17 00:00:00 2001
From: Felix Riemann <felix.riemann@sma.de>
Date: Thu, 20 Jun 2019 08:45:00 +0100
Subject: regulator: da9061/62: Adjust LDO voltage selection minimum value

According to the DA9061 and DA9062 datasheets the LDO voltage selection
registers have a lower value of 0x02. This applies to voltage registers
VLDO1_A, VLDO2_A, VLDO3_A and VLDO4_A. This linear offset of 0x02 was
previously not observed by the driver, causing the LDO output voltage to
be systematically lower by two steps (= 0.1V).

This patch fixes the minimum linear selector offset by setting it to a
value of 2 and increases the n_voltages by the same amount allowing
voltages in the range 0x02 -> 0.9V to 0x38 -> 3.6V to be correctly
selected. Also fixes an incorrect calculation for the n_voltages value in
the regulator LDO2.

These fixes affect all LDO regulators for DA9061 and DA9062.

Acked-by: Steve Twiss <stwiss.opensource@diasemi.com>
Tested-by: Steve Twiss <stwiss.opensource@diasemi.com>
Signed-off-by: Felix Riemann <felix.riemann@sma.de>
Signed-off-by: Steve Twiss <stwiss.opensource@diasemi.com>
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 drivers/regulator/da9062-regulator.c | 40 +++++++++++++++++++++---------------
 include/linux/mfd/da9062/registers.h |  3 +++
 2 files changed, 27 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/regulator/da9062-regulator.c b/drivers/regulator/da9062-regulator.c
index a02e0488410f..2ffc64622451 100644
--- a/drivers/regulator/da9062-regulator.c
+++ b/drivers/regulator/da9062-regulator.c
@@ -493,12 +493,13 @@ static const struct da9062_regulator_info local_da9061_regulator_info[] = {
 		.desc.ops = &da9062_ldo_ops,
 		.desc.min_uV = (900) * 1000,
 		.desc.uV_step = (50) * 1000,
-		.desc.n_voltages = ((3600) - (900))/(50) + 1,
+		.desc.n_voltages = ((3600) - (900))/(50) + 1
+				+ DA9062AA_VLDO_A_MIN_SEL,
 		.desc.enable_reg = DA9062AA_LDO1_CONT,
 		.desc.enable_mask = DA9062AA_LDO1_EN_MASK,
 		.desc.vsel_reg = DA9062AA_VLDO1_A,
 		.desc.vsel_mask = DA9062AA_VLDO1_A_MASK,
-		.desc.linear_min_sel = 0,
+		.desc.linear_min_sel = DA9062AA_VLDO_A_MIN_SEL,
 		.sleep = REG_FIELD(DA9062AA_VLDO1_A,
 			__builtin_ffs((int)DA9062AA_LDO1_SL_A_MASK) - 1,
 			sizeof(unsigned int) * 8 -
@@ -525,12 +526,13 @@ static const struct da9062_regulator_info local_da9061_regulator_info[] = {
 		.desc.ops = &da9062_ldo_ops,
 		.desc.min_uV = (900) * 1000,
 		.desc.uV_step = (50) * 1000,
-		.desc.n_voltages = ((3600) - (600))/(50) + 1,
+		.desc.n_voltages = ((3600) - (900))/(50) + 1
+				+ DA9062AA_VLDO_A_MIN_SEL,
 		.desc.enable_reg = DA9062AA_LDO2_CONT,
 		.desc.enable_mask = DA9062AA_LDO2_EN_MASK,
 		.desc.vsel_reg = DA9062AA_VLDO2_A,
 		.desc.vsel_mask = DA9062AA_VLDO2_A_MASK,
-		.desc.linear_min_sel = 0,
+		.desc.linear_min_sel = DA9062AA_VLDO_A_MIN_SEL,
 		.sleep = REG_FIELD(DA9062AA_VLDO2_A,
 			__builtin_ffs((int)DA9062AA_LDO2_SL_A_MASK) - 1,
 			sizeof(unsigned int) * 8 -
@@ -557,12 +559,13 @@ static const struct da9062_regulator_info local_da9061_regulator_info[] = {
 		.desc.ops = &da9062_ldo_ops,
 		.desc.min_uV = (900) * 1000,
 		.desc.uV_step = (50) * 1000,
-		.desc.n_voltages = ((3600) - (900))/(50) + 1,
+		.desc.n_voltages = ((3600) - (900))/(50) + 1
+				+ DA9062AA_VLDO_A_MIN_SEL,
 		.desc.enable_reg = DA9062AA_LDO3_CONT,
 		.desc.enable_mask = DA9062AA_LDO3_EN_MASK,
 		.desc.vsel_reg = DA9062AA_VLDO3_A,
 		.desc.vsel_mask = DA9062AA_VLDO3_A_MASK,
-		.desc.linear_min_sel = 0,
+		.desc.linear_min_sel = DA9062AA_VLDO_A_MIN_SEL,
 		.sleep = REG_FIELD(DA9062AA_VLDO3_A,
 			__builtin_ffs((int)DA9062AA_LDO3_SL_A_MASK) - 1,
 			sizeof(unsigned int) * 8 -
@@ -589,12 +592,13 @@ static const struct da9062_regulator_info local_da9061_regulator_info[] = {
 		.desc.ops = &da9062_ldo_ops,
 		.desc.min_uV = (900) * 1000,
 		.desc.uV_step = (50) * 1000,
-		.desc.n_voltages = ((3600) - (900))/(50) + 1,
+		.desc.n_voltages = ((3600) - (900))/(50) + 1
+				+ DA9062AA_VLDO_A_MIN_SEL,
 		.desc.enable_reg = DA9062AA_LDO4_CONT,
 		.desc.enable_mask = DA9062AA_LDO4_EN_MASK,
 		.desc.vsel_reg = DA9062AA_VLDO4_A,
 		.desc.vsel_mask = DA9062AA_VLDO4_A_MASK,
-		.desc.linear_min_sel = 0,
+		.desc.linear_min_sel = DA9062AA_VLDO_A_MIN_SEL,
 		.sleep = REG_FIELD(DA9062AA_VLDO4_A,
 			__builtin_ffs((int)DA9062AA_LDO4_SL_A_MASK) - 1,
 			sizeof(unsigned int) * 8 -
@@ -769,12 +773,13 @@ static const struct da9062_regulator_info local_da9062_regulator_info[] = {
 		.desc.ops = &da9062_ldo_ops,
 		.desc.min_uV = (900) * 1000,
 		.desc.uV_step = (50) * 1000,
-		.desc.n_voltages = ((3600) - (900))/(50) + 1,
+		.desc.n_voltages = ((3600) - (900))/(50) + 1
+				+ DA9062AA_VLDO_A_MIN_SEL,
 		.desc.enable_reg = DA9062AA_LDO1_CONT,
 		.desc.enable_mask = DA9062AA_LDO1_EN_MASK,
 		.desc.vsel_reg = DA9062AA_VLDO1_A,
 		.desc.vsel_mask = DA9062AA_VLDO1_A_MASK,
-		.desc.linear_min_sel = 0,
+		.desc.linear_min_sel = DA9062AA_VLDO_A_MIN_SEL,
 		.sleep = REG_FIELD(DA9062AA_VLDO1_A,
 			__builtin_ffs((int)DA9062AA_LDO1_SL_A_MASK) - 1,
 			sizeof(unsigned int) * 8 -
@@ -801,12 +806,13 @@ static const struct da9062_regulator_info local_da9062_regulator_info[] = {
 		.desc.ops = &da9062_ldo_ops,
 		.desc.min_uV = (900) * 1000,
 		.desc.uV_step = (50) * 1000,
-		.desc.n_voltages = ((3600) - (600))/(50) + 1,
+		.desc.n_voltages = ((3600) - (900))/(50) + 1
+				+ DA9062AA_VLDO_A_MIN_SEL,
 		.desc.enable_reg = DA9062AA_LDO2_CONT,
 		.desc.enable_mask = DA9062AA_LDO2_EN_MASK,
 		.desc.vsel_reg = DA9062AA_VLDO2_A,
 		.desc.vsel_mask = DA9062AA_VLDO2_A_MASK,
-		.desc.linear_min_sel = 0,
+		.desc.linear_min_sel = DA9062AA_VLDO_A_MIN_SEL,
 		.sleep = REG_FIELD(DA9062AA_VLDO2_A,
 			__builtin_ffs((int)DA9062AA_LDO2_SL_A_MASK) - 1,
 			sizeof(unsigned int) * 8 -
@@ -833,12 +839,13 @@ static const struct da9062_regulator_info local_da9062_regulator_info[] = {
 		.desc.ops = &da9062_ldo_ops,
 		.desc.min_uV = (900) * 1000,
 		.desc.uV_step = (50) * 1000,
-		.desc.n_voltages = ((3600) - (900))/(50) + 1,
+		.desc.n_voltages = ((3600) - (900))/(50) + 1
+				+ DA9062AA_VLDO_A_MIN_SEL,
 		.desc.enable_reg = DA9062AA_LDO3_CONT,
 		.desc.enable_mask = DA9062AA_LDO3_EN_MASK,
 		.desc.vsel_reg = DA9062AA_VLDO3_A,
 		.desc.vsel_mask = DA9062AA_VLDO3_A_MASK,
-		.desc.linear_min_sel = 0,
+		.desc.linear_min_sel = DA9062AA_VLDO_A_MIN_SEL,
 		.sleep = REG_FIELD(DA9062AA_VLDO3_A,
 			__builtin_ffs((int)DA9062AA_LDO3_SL_A_MASK) - 1,
 			sizeof(unsigned int) * 8 -
@@ -865,12 +872,13 @@ static const struct da9062_regulator_info local_da9062_regulator_info[] = {
 		.desc.ops = &da9062_ldo_ops,
 		.desc.min_uV = (900) * 1000,
 		.desc.uV_step = (50) * 1000,
-		.desc.n_voltages = ((3600) - (900))/(50) + 1,
+		.desc.n_voltages = ((3600) - (900))/(50) + 1
+				+ DA9062AA_VLDO_A_MIN_SEL,
 		.desc.enable_reg = DA9062AA_LDO4_CONT,
 		.desc.enable_mask = DA9062AA_LDO4_EN_MASK,
 		.desc.vsel_reg = DA9062AA_VLDO4_A,
 		.desc.vsel_mask = DA9062AA_VLDO4_A_MASK,
-		.desc.linear_min_sel = 0,
+		.desc.linear_min_sel = DA9062AA_VLDO_A_MIN_SEL,
 		.sleep = REG_FIELD(DA9062AA_VLDO4_A,
 			__builtin_ffs((int)DA9062AA_LDO4_SL_A_MASK) - 1,
 			sizeof(unsigned int) * 8 -
diff --git a/include/linux/mfd/da9062/registers.h b/include/linux/mfd/da9062/registers.h
index fe04b708742b..2906bf6160fb 100644
--- a/include/linux/mfd/da9062/registers.h
+++ b/include/linux/mfd/da9062/registers.h
@@ -797,6 +797,9 @@
 #define DA9062AA_BUCK3_SL_A_SHIFT	7
 #define DA9062AA_BUCK3_SL_A_MASK	BIT(7)
 
+/* DA9062AA_VLDO[1-4]_A common */
+#define DA9062AA_VLDO_A_MIN_SEL	2
+
 /* DA9062AA_VLDO1_A = 0x0A9 */
 #define DA9062AA_VLDO1_A_SHIFT		0
 #define DA9062AA_VLDO1_A_MASK		0x3f
-- 
cgit v1.2.3


From 116b9731ad7614032a390bb9ad8998a14d6dc752 Mon Sep 17 00:00:00 2001
From: Amir Goldstein <amir73il@gmail.com>
Date: Sun, 26 May 2019 17:34:02 +0300
Subject: fsnotify: add empty fsnotify_{unlink,rmdir}() hooks

We would like to move fsnotify_nameremove() calls from d_delete()
into a higher layer where the hook makes more sense and so we can
consider every d_delete() call site individually.

Start by creating empty hook fsnotify_{unlink,rmdir}() and place
them in the proper VFS call sites.  After all d_delete() call sites
have been converted to use the new hook, the new hook will generate the
delete events and the fsnotify_nameremove() hook will be removed.

Signed-off-by: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/namei.c               |  2 ++
 include/linux/fsnotify.h | 26 ++++++++++++++++++++++++++
 2 files changed, 28 insertions(+)

(limited to 'include/linux')

diff --git a/fs/namei.c b/fs/namei.c
index 20831c2fbb34..209c51a5226c 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -3883,6 +3883,7 @@ int vfs_rmdir(struct inode *dir, struct dentry *dentry)
 	dentry->d_inode->i_flags |= S_DEAD;
 	dont_mount(dentry);
 	detach_mounts(dentry);
+	fsnotify_rmdir(dir, dentry);
 
 out:
 	inode_unlock(dentry->d_inode);
@@ -3999,6 +4000,7 @@ int vfs_unlink(struct inode *dir, struct dentry *dentry, struct inode **delegate
 			if (!error) {
 				dont_mount(dentry);
 				detach_mounts(dentry);
+				fsnotify_unlink(dir, dentry);
 			}
 		}
 	}
diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h
index 94972e8eb6d1..7f23eddefcd0 100644
--- a/include/linux/fsnotify.h
+++ b/include/linux/fsnotify.h
@@ -188,6 +188,19 @@ static inline void fsnotify_link(struct inode *dir, struct inode *inode, struct
 	fsnotify(dir, FS_CREATE, inode, FSNOTIFY_EVENT_INODE, &new_dentry->d_name, 0);
 }
 
+/*
+ * fsnotify_unlink - 'name' was unlinked
+ *
+ * Caller must make sure that dentry->d_name is stable.
+ */
+static inline void fsnotify_unlink(struct inode *dir, struct dentry *dentry)
+{
+	/* Expected to be called before d_delete() */
+	WARN_ON_ONCE(d_is_negative(dentry));
+
+	/* TODO: call fsnotify_dirent() */
+}
+
 /*
  * fsnotify_mkdir - directory 'name' was created
  */
@@ -198,6 +211,19 @@ static inline void fsnotify_mkdir(struct inode *inode, struct dentry *dentry)
 	fsnotify_dirent(inode, dentry, FS_CREATE | FS_ISDIR);
 }
 
+/*
+ * fsnotify_rmdir - directory 'name' was removed
+ *
+ * Caller must make sure that dentry->d_name is stable.
+ */
+static inline void fsnotify_rmdir(struct inode *dir, struct dentry *dentry)
+{
+	/* Expected to be called before d_delete() */
+	WARN_ON_ONCE(d_is_negative(dentry));
+
+	/* TODO: call fsnotify_dirent() */
+}
+
 /*
  * fsnotify_access - file was read
  */
-- 
cgit v1.2.3


From 49246466a98996e78b68a0041807dbd2628c53fe Mon Sep 17 00:00:00 2001
From: Amir Goldstein <amir73il@gmail.com>
Date: Sun, 26 May 2019 17:34:10 +0300
Subject: fsnotify: move fsnotify_nameremove() hook out of d_delete()

d_delete() was piggy backed for the fsnotify_nameremove() hook when
in fact not all callers of d_delete() care about fsnotify events.

For all callers of d_delete() that may be interested in fsnotify events,
we made sure to call one of fsnotify_{unlink,rmdir}() hooks before
calling d_delete().

Now we can move the fsnotify_nameremove() call from d_delete() to the
fsnotify_{unlink,rmdir}() hooks.

Two explicit calls to fsnotify_nameremove() from nfs/afs sillyrename
are also removed. This will cause a change of behavior - nfs/afs will
NOT generate an fsnotify delete event when renaming over a positive
dentry.  This change is desirable, because it is consistent with the
behavior of all other filesystems.

Signed-off-by: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/afs/dir_silly.c       | 5 -----
 fs/dcache.c              | 2 --
 fs/nfs/unlink.c          | 6 ------
 include/linux/fsnotify.h | 2 ++
 4 files changed, 2 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/fs/afs/dir_silly.c b/fs/afs/dir_silly.c
index 057b8d322422..361088a5edb9 100644
--- a/fs/afs/dir_silly.c
+++ b/fs/afs/dir_silly.c
@@ -60,11 +60,6 @@ static int afs_do_silly_rename(struct afs_vnode *dvnode, struct afs_vnode *vnode
 		if (test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags))
 			afs_edit_dir_add(dvnode, &new->d_name,
 					 &vnode->fid, afs_edit_dir_for_silly_1);
-
-		/* vfs_unlink and the like do not issue this when a file is
-		 * sillyrenamed, so do it here.
-		 */
-		fsnotify_nameremove(old, 0);
 	}
 
 	kfree(scb);
diff --git a/fs/dcache.c b/fs/dcache.c
index c435398f2c81..f41121e5d1ec 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -2372,7 +2372,6 @@ EXPORT_SYMBOL(d_hash_and_lookup);
 void d_delete(struct dentry * dentry)
 {
 	struct inode *inode = dentry->d_inode;
-	int isdir = d_is_dir(dentry);
 
 	spin_lock(&inode->i_lock);
 	spin_lock(&dentry->d_lock);
@@ -2387,7 +2386,6 @@ void d_delete(struct dentry * dentry)
 		spin_unlock(&dentry->d_lock);
 		spin_unlock(&inode->i_lock);
 	}
-	fsnotify_nameremove(dentry, isdir);
 }
 EXPORT_SYMBOL(d_delete);
 
diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c
index 52d533967485..0effeee28352 100644
--- a/fs/nfs/unlink.c
+++ b/fs/nfs/unlink.c
@@ -396,12 +396,6 @@ nfs_complete_sillyrename(struct rpc_task *task, struct nfs_renamedata *data)
 		nfs_cancel_async_unlink(dentry);
 		return;
 	}
-
-	/*
-	 * vfs_unlink and the like do not issue this when a file is
-	 * sillyrenamed, so do it here.
-	 */
-	fsnotify_nameremove(dentry, 0);
 }
 
 #define SILLYNAME_PREFIX ".nfs"
diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h
index 7f23eddefcd0..0145073c2b42 100644
--- a/include/linux/fsnotify.h
+++ b/include/linux/fsnotify.h
@@ -199,6 +199,7 @@ static inline void fsnotify_unlink(struct inode *dir, struct dentry *dentry)
 	WARN_ON_ONCE(d_is_negative(dentry));
 
 	/* TODO: call fsnotify_dirent() */
+	fsnotify_nameremove(dentry, 0);
 }
 
 /*
@@ -222,6 +223,7 @@ static inline void fsnotify_rmdir(struct inode *dir, struct dentry *dentry)
 	WARN_ON_ONCE(d_is_negative(dentry));
 
 	/* TODO: call fsnotify_dirent() */
+	fsnotify_nameremove(dentry, 1);
 }
 
 /*
-- 
cgit v1.2.3


From 7377f5bec13332bc470856f337935be6cabbcf24 Mon Sep 17 00:00:00 2001
From: Amir Goldstein <amir73il@gmail.com>
Date: Sun, 26 May 2019 17:34:11 +0300
Subject: fsnotify: get rid of fsnotify_nameremove()

For all callers of fsnotify_{unlink,rmdir}(), we made sure that d_parent
and d_name are stable.  Therefore, fsnotify_{unlink,rmdir}() do not need
the safety measures in fsnotify_nameremove() to stabilize parent and name.
We can now simplify those hooks and get rid of fsnotify_nameremove().

Signed-off-by: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/notify/fsnotify.c             | 41 ----------------------------------------
 include/linux/fsnotify.h         |  6 ++----
 include/linux/fsnotify_backend.h |  4 ----
 3 files changed, 2 insertions(+), 49 deletions(-)

(limited to 'include/linux')

diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index 4eb2ebfac468..2ecef6155fc0 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -94,47 +94,6 @@ void fsnotify_sb_delete(struct super_block *sb)
 	fsnotify_clear_marks_by_sb(sb);
 }
 
-/*
- * fsnotify_nameremove - a filename was removed from a directory
- *
- * This is mostly called under parent vfs inode lock so name and
- * dentry->d_parent should be stable. However there are some corner cases where
- * inode lock is not held. So to be on the safe side and be reselient to future
- * callers and out of tree users of d_delete(), we do not assume that d_parent
- * and d_name are stable and we use dget_parent() and
- * take_dentry_name_snapshot() to grab stable references.
- */
-void fsnotify_nameremove(struct dentry *dentry, int isdir)
-{
-	struct dentry *parent;
-	struct name_snapshot name;
-	__u32 mask = FS_DELETE;
-
-	/* d_delete() of pseudo inode? (e.g. __ns_get_path() playing tricks) */
-	if (IS_ROOT(dentry))
-		return;
-
-	if (isdir)
-		mask |= FS_ISDIR;
-
-	parent = dget_parent(dentry);
-	/* Avoid unneeded take_dentry_name_snapshot() */
-	if (!(d_inode(parent)->i_fsnotify_mask & FS_DELETE) &&
-	    !(dentry->d_sb->s_fsnotify_mask & FS_DELETE))
-		goto out_dput;
-
-	take_dentry_name_snapshot(&name, dentry);
-
-	fsnotify(d_inode(parent), mask, d_inode(dentry), FSNOTIFY_EVENT_INODE,
-		 &name.name, 0);
-
-	release_dentry_name_snapshot(&name);
-
-out_dput:
-	dput(parent);
-}
-EXPORT_SYMBOL(fsnotify_nameremove);
-
 /*
  * Given an inode, first check if we care what happens to our children.  Inotify
  * and dnotify both tell their parents about events.  If we care about any event
diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h
index 0145073c2b42..a2d5d175d3c1 100644
--- a/include/linux/fsnotify.h
+++ b/include/linux/fsnotify.h
@@ -198,8 +198,7 @@ static inline void fsnotify_unlink(struct inode *dir, struct dentry *dentry)
 	/* Expected to be called before d_delete() */
 	WARN_ON_ONCE(d_is_negative(dentry));
 
-	/* TODO: call fsnotify_dirent() */
-	fsnotify_nameremove(dentry, 0);
+	fsnotify_dirent(dir, dentry, FS_DELETE);
 }
 
 /*
@@ -222,8 +221,7 @@ static inline void fsnotify_rmdir(struct inode *dir, struct dentry *dentry)
 	/* Expected to be called before d_delete() */
 	WARN_ON_ONCE(d_is_negative(dentry));
 
-	/* TODO: call fsnotify_dirent() */
-	fsnotify_nameremove(dentry, 1);
+	fsnotify_dirent(dir, dentry, FS_DELETE | FS_ISDIR);
 }
 
 /*
diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
index a9f9dcc1e515..c28f6ed1f59b 100644
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -355,7 +355,6 @@ extern int __fsnotify_parent(const struct path *path, struct dentry *dentry, __u
 extern void __fsnotify_inode_delete(struct inode *inode);
 extern void __fsnotify_vfsmount_delete(struct vfsmount *mnt);
 extern void fsnotify_sb_delete(struct super_block *sb);
-extern void fsnotify_nameremove(struct dentry *dentry, int isdir);
 extern u32 fsnotify_get_cookie(void);
 
 static inline int fsnotify_inode_watches_children(struct inode *inode)
@@ -525,9 +524,6 @@ static inline void __fsnotify_vfsmount_delete(struct vfsmount *mnt)
 static inline void fsnotify_sb_delete(struct super_block *sb)
 {}
 
-static inline void fsnotify_nameremove(struct dentry *dentry, int isdir)
-{}
-
 static inline void fsnotify_update_flags(struct dentry *dentry)
 {}
 
-- 
cgit v1.2.3


From cf4b20ecfa4edc4a0e55d52bc0a735f60bdfe7eb Mon Sep 17 00:00:00 2001
From: Ulf Hansson <ulf.hansson@linaro.org>
Date: Tue, 18 Jun 2019 17:34:42 +0200
Subject: mmc: sdio: Turn sdio_run_irqs() into static

All external users of sdio_run_irqs() have been converted to use the
preferred sdio_signal_irq() interface, and thus no longer call the
function directly. Avoid further new users of it by making it
static.

Suggested-by: Douglas Anderson <dianders@chromium.org>
Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
Reviewed-by: Douglas Anderson <dianders@chromium.org>
Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
---
 drivers/mmc/core/sdio_irq.c | 3 +--
 include/linux/mmc/host.h    | 1 -
 2 files changed, 1 insertion(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mmc/core/sdio_irq.c b/drivers/mmc/core/sdio_irq.c
index 9f54a259a1b3..0bcc5e83bd1a 100644
--- a/drivers/mmc/core/sdio_irq.c
+++ b/drivers/mmc/core/sdio_irq.c
@@ -92,7 +92,7 @@ static int process_sdio_pending_irqs(struct mmc_host *host)
 	return ret;
 }
 
-void sdio_run_irqs(struct mmc_host *host)
+static void sdio_run_irqs(struct mmc_host *host)
 {
 	mmc_claim_host(host);
 	if (host->sdio_irqs) {
@@ -103,7 +103,6 @@ void sdio_run_irqs(struct mmc_host *host)
 	}
 	mmc_release_host(host);
 }
-EXPORT_SYMBOL_GPL(sdio_run_irqs);
 
 void sdio_irq_work(struct work_struct *work)
 {
diff --git a/include/linux/mmc/host.h b/include/linux/mmc/host.h
index ecb7972e2423..a9b12322c775 100644
--- a/include/linux/mmc/host.h
+++ b/include/linux/mmc/host.h
@@ -504,7 +504,6 @@ static inline void mmc_signal_sdio_irq(struct mmc_host *host)
 		wake_up_process(host->sdio_irq_thread);
 }
 
-void sdio_run_irqs(struct mmc_host *host);
 void sdio_signal_irq(struct mmc_host *host);
 
 #ifdef CONFIG_REGULATOR
-- 
cgit v1.2.3


From f924cddebc900f7cb10d5538d69523e558fa681c Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 6 Jun 2019 12:29:00 +0200
Subject: block: remove blk_init_request_from_bio
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

lightnvm should have never used this function, as it is sending
passthrough requests, so switch it to blk_rq_append_bio like all the
other passthrough request users.  Inline blk_init_request_from_bio into
the only remaining caller.

Reviewed-by: Hannes Reinecke <hare@suse.com>
Reviewed-by: Minwoo Im <minwoo.im.dev@gmail.com>
Reviewed-by: Javier González <javier@javigon.com>
Reviewed-by: Matias Bjørling <mb@lightnvm.io>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-core.c             | 11 -----------
 block/blk-mq.c               |  7 ++++++-
 drivers/nvme/host/lightnvm.c |  2 +-
 include/linux/blkdev.h       |  1 -
 4 files changed, 7 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk-core.c b/block/blk-core.c
index b6f22f219389..d1c7c69a20dd 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -687,17 +687,6 @@ bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio,
 	return false;
 }
 
-void blk_init_request_from_bio(struct request *req, struct bio *bio)
-{
-	if (bio->bi_opf & REQ_RAHEAD)
-		req->cmd_flags |= REQ_FAILFAST_MASK;
-
-	req->__sector = bio->bi_iter.bi_sector;
-	req->write_hint = bio->bi_write_hint;
-	blk_rq_bio_prep(req->q, req, bio);
-}
-EXPORT_SYMBOL_GPL(blk_init_request_from_bio);
-
 static void handle_bad_sector(struct bio *bio, sector_t maxsector)
 {
 	char b[BDEVNAME_SIZE];
diff --git a/block/blk-mq.c b/block/blk-mq.c
index ce0f5f4ede70..61457bffa55f 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1766,7 +1766,12 @@ void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule)
 
 static void blk_mq_bio_to_request(struct request *rq, struct bio *bio)
 {
-	blk_init_request_from_bio(rq, bio);
+	if (bio->bi_opf & REQ_RAHEAD)
+		rq->cmd_flags |= REQ_FAILFAST_MASK;
+
+	rq->__sector = bio->bi_iter.bi_sector;
+	rq->write_hint = bio->bi_write_hint;
+	blk_rq_bio_prep(rq->q, rq, bio);
 
 	blk_account_io_start(rq, true);
 }
diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c
index 4f20a10b39d3..ba009d4c9dfa 100644
--- a/drivers/nvme/host/lightnvm.c
+++ b/drivers/nvme/host/lightnvm.c
@@ -660,7 +660,7 @@ static struct request *nvme_nvm_alloc_request(struct request_queue *q,
 	rq->cmd_flags &= ~REQ_FAILFAST_DRIVER;
 
 	if (rqd->bio)
-		blk_init_request_from_bio(rq, rqd->bio);
+		blk_rq_append_bio(rq, &rqd->bio);
 	else
 		rq->ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, IOPRIO_NORM);
 
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index ad49a775c54f..2d4dfe82767a 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -828,7 +828,6 @@ extern void blk_unregister_queue(struct gendisk *disk);
 extern blk_qc_t generic_make_request(struct bio *bio);
 extern blk_qc_t direct_make_request(struct bio *bio);
 extern void blk_rq_init(struct request_queue *q, struct request *rq);
-extern void blk_init_request_from_bio(struct request *req, struct bio *bio);
 extern void blk_put_request(struct request *);
 extern struct request *blk_get_request(struct request_queue *, unsigned int op,
 				       blk_mq_req_flags_t flags);
-- 
cgit v1.2.3


From 14ccb66b3f585b2bc21e7256c96090abed5a512c Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 6 Jun 2019 12:29:01 +0200
Subject: block: remove the bi_phys_segments field in struct bio

We only need the number of segments in the blk-mq submission path.
Remove the field from struct bio, and return it from a variant of
blk_queue_split instead, so that it can be passed as an argument to
those functions that need the value.

This also means we stop recounting segments except for cloning
and partial segments.

To keep the number of arguments in this hot path down, remove
pointless struct request_queue arguments from any of the functions
that had it and grew a nr_segs argument.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 Documentation/block/biodoc.txt |  1 -
 block/bfq-iosched.c            |  5 +--
 block/bio.c                    | 15 +--------
 block/blk-core.c               | 32 ++++++++----------
 block/blk-map.c                | 10 ++++--
 block/blk-merge.c              | 75 +++++++++++++++---------------------------
 block/blk-mq-sched.c           | 26 ++++++++-------
 block/blk-mq-sched.h           | 10 +++---
 block/blk-mq.c                 | 23 ++++++-------
 block/blk.h                    | 23 ++++++-------
 block/kyber-iosched.c          |  5 +--
 block/mq-deadline.c            |  5 +--
 drivers/md/raid5.c             |  1 -
 include/linux/bio.h            |  1 -
 include/linux/blk-mq.h         |  2 +-
 include/linux/blk_types.h      |  6 ----
 include/linux/blkdev.h         |  1 -
 include/linux/elevator.h       |  2 +-
 18 files changed, 106 insertions(+), 137 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/block/biodoc.txt b/Documentation/block/biodoc.txt
index ac18b488cb5e..31c177663ed5 100644
--- a/Documentation/block/biodoc.txt
+++ b/Documentation/block/biodoc.txt
@@ -436,7 +436,6 @@ struct bio {
        struct bvec_iter	bi_iter;	/* current index into bio_vec array */
 
        unsigned int	bi_size;     /* total size in bytes */
-       unsigned short 	bi_phys_segments; /* segments after physaddr coalesce*/
        unsigned short	bi_hw_segments; /* segments after DMA remapping */
        unsigned int	bi_max;	     /* max bio_vecs we can hold
                                         used as index into pool */
diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index f8d430f88d25..a6bf842cbe16 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -2027,7 +2027,8 @@ static void bfq_remove_request(struct request_queue *q,
 
 }
 
-static bool bfq_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio)
+static bool bfq_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio,
+		unsigned int nr_segs)
 {
 	struct request_queue *q = hctx->queue;
 	struct bfq_data *bfqd = q->elevator->elevator_data;
@@ -2050,7 +2051,7 @@ static bool bfq_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio)
 		bfqd->bio_bfqq = NULL;
 	bfqd->bio_bic = bic;
 
-	ret = blk_mq_sched_try_merge(q, bio, &free);
+	ret = blk_mq_sched_try_merge(q, bio, nr_segs, &free);
 
 	if (free)
 		blk_mq_free_request(free);
diff --git a/block/bio.c b/block/bio.c
index 4bcdcd3f63f4..ad9c3aa9bf7d 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -558,14 +558,6 @@ void bio_put(struct bio *bio)
 }
 EXPORT_SYMBOL(bio_put);
 
-int bio_phys_segments(struct request_queue *q, struct bio *bio)
-{
-	if (unlikely(!bio_flagged(bio, BIO_SEG_VALID)))
-		blk_recount_segments(q, bio);
-
-	return bio->bi_phys_segments;
-}
-
 /**
  * 	__bio_clone_fast - clone a bio that shares the original bio's biovec
  * 	@bio: destination bio
@@ -739,7 +731,7 @@ static int __bio_add_pc_page(struct request_queue *q, struct bio *bio,
 	if (bio_full(bio))
 		return 0;
 
-	if (bio->bi_phys_segments >= queue_max_segments(q))
+	if (bio->bi_vcnt >= queue_max_segments(q))
 		return 0;
 
 	bvec = &bio->bi_io_vec[bio->bi_vcnt];
@@ -749,8 +741,6 @@ static int __bio_add_pc_page(struct request_queue *q, struct bio *bio,
 	bio->bi_vcnt++;
  done:
 	bio->bi_iter.bi_size += len;
-	bio->bi_phys_segments = bio->bi_vcnt;
-	bio_set_flag(bio, BIO_SEG_VALID);
 	return len;
 }
 
@@ -1909,10 +1899,7 @@ void bio_trim(struct bio *bio, int offset, int size)
 	if (offset == 0 && size == bio->bi_iter.bi_size)
 		return;
 
-	bio_clear_flag(bio, BIO_SEG_VALID);
-
 	bio_advance(bio, offset << 9);
-
 	bio->bi_iter.bi_size = size;
 
 	if (bio_integrity(bio))
diff --git a/block/blk-core.c b/block/blk-core.c
index d1c7c69a20dd..ef998a724b27 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -550,15 +550,15 @@ void blk_put_request(struct request *req)
 }
 EXPORT_SYMBOL(blk_put_request);
 
-bool bio_attempt_back_merge(struct request_queue *q, struct request *req,
-			    struct bio *bio)
+bool bio_attempt_back_merge(struct request *req, struct bio *bio,
+		unsigned int nr_segs)
 {
 	const int ff = bio->bi_opf & REQ_FAILFAST_MASK;
 
-	if (!ll_back_merge_fn(q, req, bio))
+	if (!ll_back_merge_fn(req, bio, nr_segs))
 		return false;
 
-	trace_block_bio_backmerge(q, req, bio);
+	trace_block_bio_backmerge(req->q, req, bio);
 
 	if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff)
 		blk_rq_set_mixed_merge(req);
@@ -571,15 +571,15 @@ bool bio_attempt_back_merge(struct request_queue *q, struct request *req,
 	return true;
 }
 
-bool bio_attempt_front_merge(struct request_queue *q, struct request *req,
-			     struct bio *bio)
+bool bio_attempt_front_merge(struct request *req, struct bio *bio,
+		unsigned int nr_segs)
 {
 	const int ff = bio->bi_opf & REQ_FAILFAST_MASK;
 
-	if (!ll_front_merge_fn(q, req, bio))
+	if (!ll_front_merge_fn(req, bio, nr_segs))
 		return false;
 
-	trace_block_bio_frontmerge(q, req, bio);
+	trace_block_bio_frontmerge(req->q, req, bio);
 
 	if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff)
 		blk_rq_set_mixed_merge(req);
@@ -621,6 +621,7 @@ no_merge:
  * blk_attempt_plug_merge - try to merge with %current's plugged list
  * @q: request_queue new bio is being queued at
  * @bio: new bio being queued
+ * @nr_segs: number of segments in @bio
  * @same_queue_rq: pointer to &struct request that gets filled in when
  * another request associated with @q is found on the plug list
  * (optional, may be %NULL)
@@ -639,7 +640,7 @@ no_merge:
  * Caller must ensure !blk_queue_nomerges(q) beforehand.
  */
 bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio,
-			    struct request **same_queue_rq)
+		unsigned int nr_segs, struct request **same_queue_rq)
 {
 	struct blk_plug *plug;
 	struct request *rq;
@@ -668,10 +669,10 @@ bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio,
 
 		switch (blk_try_merge(rq, bio)) {
 		case ELEVATOR_BACK_MERGE:
-			merged = bio_attempt_back_merge(q, rq, bio);
+			merged = bio_attempt_back_merge(rq, bio, nr_segs);
 			break;
 		case ELEVATOR_FRONT_MERGE:
-			merged = bio_attempt_front_merge(q, rq, bio);
+			merged = bio_attempt_front_merge(rq, bio, nr_segs);
 			break;
 		case ELEVATOR_DISCARD_MERGE:
 			merged = bio_attempt_discard_merge(q, rq, bio);
@@ -1427,14 +1428,9 @@ bool blk_update_request(struct request *req, blk_status_t error,
 }
 EXPORT_SYMBOL_GPL(blk_update_request);
 
-void blk_rq_bio_prep(struct request_queue *q, struct request *rq,
-		     struct bio *bio)
+void blk_rq_bio_prep(struct request *rq, struct bio *bio, unsigned int nr_segs)
 {
-	if (bio_has_data(bio))
-		rq->nr_phys_segments = bio_phys_segments(q, bio);
-	else if (bio_op(bio) == REQ_OP_DISCARD)
-		rq->nr_phys_segments = 1;
-
+	rq->nr_phys_segments = nr_segs;
 	rq->__data_len = bio->bi_iter.bi_size;
 	rq->bio = rq->biotail = bio;
 	rq->ioprio = bio_prio(bio);
diff --git a/block/blk-map.c b/block/blk-map.c
index db9373bd31ac..3a62e471d81b 100644
--- a/block/blk-map.c
+++ b/block/blk-map.c
@@ -18,13 +18,19 @@
 int blk_rq_append_bio(struct request *rq, struct bio **bio)
 {
 	struct bio *orig_bio = *bio;
+	struct bvec_iter iter;
+	struct bio_vec bv;
+	unsigned int nr_segs = 0;
 
 	blk_queue_bounce(rq->q, bio);
 
+	bio_for_each_bvec(bv, *bio, iter)
+		nr_segs++;
+
 	if (!rq->bio) {
-		blk_rq_bio_prep(rq->q, rq, *bio);
+		blk_rq_bio_prep(rq, *bio, nr_segs);
 	} else {
-		if (!ll_back_merge_fn(rq->q, rq, *bio)) {
+		if (!ll_back_merge_fn(rq, *bio, nr_segs)) {
 			if (orig_bio != *bio) {
 				bio_put(*bio);
 				*bio = orig_bio;
diff --git a/block/blk-merge.c b/block/blk-merge.c
index 17713d7d98d5..72b4fd89a22d 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -258,32 +258,29 @@ split:
 	return do_split ? new : NULL;
 }
 
-void blk_queue_split(struct request_queue *q, struct bio **bio)
+void __blk_queue_split(struct request_queue *q, struct bio **bio,
+		unsigned int *nr_segs)
 {
-	struct bio *split, *res;
-	unsigned nsegs;
+	struct bio *split;
 
 	switch (bio_op(*bio)) {
 	case REQ_OP_DISCARD:
 	case REQ_OP_SECURE_ERASE:
-		split = blk_bio_discard_split(q, *bio, &q->bio_split, &nsegs);
+		split = blk_bio_discard_split(q, *bio, &q->bio_split, nr_segs);
 		break;
 	case REQ_OP_WRITE_ZEROES:
-		split = blk_bio_write_zeroes_split(q, *bio, &q->bio_split, &nsegs);
+		split = blk_bio_write_zeroes_split(q, *bio, &q->bio_split,
+				nr_segs);
 		break;
 	case REQ_OP_WRITE_SAME:
-		split = blk_bio_write_same_split(q, *bio, &q->bio_split, &nsegs);
+		split = blk_bio_write_same_split(q, *bio, &q->bio_split,
+				nr_segs);
 		break;
 	default:
-		split = blk_bio_segment_split(q, *bio, &q->bio_split, &nsegs);
+		split = blk_bio_segment_split(q, *bio, &q->bio_split, nr_segs);
 		break;
 	}
 
-	/* physical segments can be figured out during splitting */
-	res = split ? split : *bio;
-	res->bi_phys_segments = nsegs;
-	bio_set_flag(res, BIO_SEG_VALID);
-
 	if (split) {
 		/* there isn't chance to merge the splitted bio */
 		split->bi_opf |= REQ_NOMERGE;
@@ -304,6 +301,13 @@ void blk_queue_split(struct request_queue *q, struct bio **bio)
 		*bio = split;
 	}
 }
+
+void blk_queue_split(struct request_queue *q, struct bio **bio)
+{
+	unsigned int nr_segs;
+
+	__blk_queue_split(q, bio, &nr_segs);
+}
 EXPORT_SYMBOL(blk_queue_split);
 
 static unsigned int __blk_recalc_rq_segments(struct request_queue *q,
@@ -338,17 +342,6 @@ void blk_recalc_rq_segments(struct request *rq)
 	rq->nr_phys_segments = __blk_recalc_rq_segments(rq->q, rq->bio);
 }
 
-void blk_recount_segments(struct request_queue *q, struct bio *bio)
-{
-	struct bio *nxt = bio->bi_next;
-
-	bio->bi_next = NULL;
-	bio->bi_phys_segments = __blk_recalc_rq_segments(q, bio);
-	bio->bi_next = nxt;
-
-	bio_set_flag(bio, BIO_SEG_VALID);
-}
-
 static inline struct scatterlist *blk_next_sg(struct scatterlist **sg,
 		struct scatterlist *sglist)
 {
@@ -519,16 +512,13 @@ int blk_rq_map_sg(struct request_queue *q, struct request *rq,
 }
 EXPORT_SYMBOL(blk_rq_map_sg);
 
-static inline int ll_new_hw_segment(struct request_queue *q,
-				    struct request *req,
-				    struct bio *bio)
+static inline int ll_new_hw_segment(struct request *req, struct bio *bio,
+		unsigned int nr_phys_segs)
 {
-	int nr_phys_segs = bio_phys_segments(q, bio);
-
-	if (req->nr_phys_segments + nr_phys_segs > queue_max_segments(q))
+	if (req->nr_phys_segments + nr_phys_segs > queue_max_segments(req->q))
 		goto no_merge;
 
-	if (blk_integrity_merge_bio(q, req, bio) == false)
+	if (blk_integrity_merge_bio(req->q, req, bio) == false)
 		goto no_merge;
 
 	/*
@@ -539,12 +529,11 @@ static inline int ll_new_hw_segment(struct request_queue *q,
 	return 1;
 
 no_merge:
-	req_set_nomerge(q, req);
+	req_set_nomerge(req->q, req);
 	return 0;
 }
 
-int ll_back_merge_fn(struct request_queue *q, struct request *req,
-		     struct bio *bio)
+int ll_back_merge_fn(struct request *req, struct bio *bio, unsigned int nr_segs)
 {
 	if (req_gap_back_merge(req, bio))
 		return 0;
@@ -553,21 +542,15 @@ int ll_back_merge_fn(struct request_queue *q, struct request *req,
 		return 0;
 	if (blk_rq_sectors(req) + bio_sectors(bio) >
 	    blk_rq_get_max_sectors(req, blk_rq_pos(req))) {
-		req_set_nomerge(q, req);
+		req_set_nomerge(req->q, req);
 		return 0;
 	}
-	if (!bio_flagged(req->biotail, BIO_SEG_VALID))
-		blk_recount_segments(q, req->biotail);
-	if (!bio_flagged(bio, BIO_SEG_VALID))
-		blk_recount_segments(q, bio);
 
-	return ll_new_hw_segment(q, req, bio);
+	return ll_new_hw_segment(req, bio, nr_segs);
 }
 
-int ll_front_merge_fn(struct request_queue *q, struct request *req,
-		      struct bio *bio)
+int ll_front_merge_fn(struct request *req, struct bio *bio, unsigned int nr_segs)
 {
-
 	if (req_gap_front_merge(req, bio))
 		return 0;
 	if (blk_integrity_rq(req) &&
@@ -575,15 +558,11 @@ int ll_front_merge_fn(struct request_queue *q, struct request *req,
 		return 0;
 	if (blk_rq_sectors(req) + bio_sectors(bio) >
 	    blk_rq_get_max_sectors(req, bio->bi_iter.bi_sector)) {
-		req_set_nomerge(q, req);
+		req_set_nomerge(req->q, req);
 		return 0;
 	}
-	if (!bio_flagged(bio, BIO_SEG_VALID))
-		blk_recount_segments(q, bio);
-	if (!bio_flagged(req->bio, BIO_SEG_VALID))
-		blk_recount_segments(q, req->bio);
 
-	return ll_new_hw_segment(q, req, bio);
+	return ll_new_hw_segment(req, bio, nr_segs);
 }
 
 static bool req_attempt_discard_merge(struct request_queue *q, struct request *req,
diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index 2766066a15db..956a7aa9a637 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -224,7 +224,7 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
 }
 
 bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio,
-			    struct request **merged_request)
+		unsigned int nr_segs, struct request **merged_request)
 {
 	struct request *rq;
 
@@ -232,7 +232,7 @@ bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio,
 	case ELEVATOR_BACK_MERGE:
 		if (!blk_mq_sched_allow_merge(q, rq, bio))
 			return false;
-		if (!bio_attempt_back_merge(q, rq, bio))
+		if (!bio_attempt_back_merge(rq, bio, nr_segs))
 			return false;
 		*merged_request = attempt_back_merge(q, rq);
 		if (!*merged_request)
@@ -241,7 +241,7 @@ bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio,
 	case ELEVATOR_FRONT_MERGE:
 		if (!blk_mq_sched_allow_merge(q, rq, bio))
 			return false;
-		if (!bio_attempt_front_merge(q, rq, bio))
+		if (!bio_attempt_front_merge(rq, bio, nr_segs))
 			return false;
 		*merged_request = attempt_front_merge(q, rq);
 		if (!*merged_request)
@@ -260,7 +260,7 @@ EXPORT_SYMBOL_GPL(blk_mq_sched_try_merge);
  * of them.
  */
 bool blk_mq_bio_list_merge(struct request_queue *q, struct list_head *list,
-			   struct bio *bio)
+			   struct bio *bio, unsigned int nr_segs)
 {
 	struct request *rq;
 	int checked = 8;
@@ -277,11 +277,13 @@ bool blk_mq_bio_list_merge(struct request_queue *q, struct list_head *list,
 		switch (blk_try_merge(rq, bio)) {
 		case ELEVATOR_BACK_MERGE:
 			if (blk_mq_sched_allow_merge(q, rq, bio))
-				merged = bio_attempt_back_merge(q, rq, bio);
+				merged = bio_attempt_back_merge(rq, bio,
+						nr_segs);
 			break;
 		case ELEVATOR_FRONT_MERGE:
 			if (blk_mq_sched_allow_merge(q, rq, bio))
-				merged = bio_attempt_front_merge(q, rq, bio);
+				merged = bio_attempt_front_merge(rq, bio,
+						nr_segs);
 			break;
 		case ELEVATOR_DISCARD_MERGE:
 			merged = bio_attempt_discard_merge(q, rq, bio);
@@ -304,13 +306,14 @@ EXPORT_SYMBOL_GPL(blk_mq_bio_list_merge);
  */
 static bool blk_mq_attempt_merge(struct request_queue *q,
 				 struct blk_mq_hw_ctx *hctx,
-				 struct blk_mq_ctx *ctx, struct bio *bio)
+				 struct blk_mq_ctx *ctx, struct bio *bio,
+				 unsigned int nr_segs)
 {
 	enum hctx_type type = hctx->type;
 
 	lockdep_assert_held(&ctx->lock);
 
-	if (blk_mq_bio_list_merge(q, &ctx->rq_lists[type], bio)) {
+	if (blk_mq_bio_list_merge(q, &ctx->rq_lists[type], bio, nr_segs)) {
 		ctx->rq_merged++;
 		return true;
 	}
@@ -318,7 +321,8 @@ static bool blk_mq_attempt_merge(struct request_queue *q,
 	return false;
 }
 
-bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio)
+bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio,
+		unsigned int nr_segs)
 {
 	struct elevator_queue *e = q->elevator;
 	struct blk_mq_ctx *ctx = blk_mq_get_ctx(q);
@@ -328,7 +332,7 @@ bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio)
 
 	if (e && e->type->ops.bio_merge) {
 		blk_mq_put_ctx(ctx);
-		return e->type->ops.bio_merge(hctx, bio);
+		return e->type->ops.bio_merge(hctx, bio, nr_segs);
 	}
 
 	type = hctx->type;
@@ -336,7 +340,7 @@ bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio)
 			!list_empty_careful(&ctx->rq_lists[type])) {
 		/* default per sw-queue merge */
 		spin_lock(&ctx->lock);
-		ret = blk_mq_attempt_merge(q, hctx, ctx, bio);
+		ret = blk_mq_attempt_merge(q, hctx, ctx, bio, nr_segs);
 		spin_unlock(&ctx->lock);
 	}
 
diff --git a/block/blk-mq-sched.h b/block/blk-mq-sched.h
index 3cf92cbbd8ac..cf22ab00fefb 100644
--- a/block/blk-mq-sched.h
+++ b/block/blk-mq-sched.h
@@ -12,8 +12,9 @@ void blk_mq_sched_assign_ioc(struct request *rq);
 
 void blk_mq_sched_request_inserted(struct request *rq);
 bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio,
-				struct request **merged_request);
-bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio);
+		unsigned int nr_segs, struct request **merged_request);
+bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio,
+		unsigned int nr_segs);
 bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq);
 void blk_mq_sched_mark_restart_hctx(struct blk_mq_hw_ctx *hctx);
 void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx);
@@ -31,12 +32,13 @@ void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e);
 void blk_mq_sched_free_requests(struct request_queue *q);
 
 static inline bool
-blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio)
+blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio,
+		unsigned int nr_segs)
 {
 	if (blk_queue_nomerges(q) || !bio_mergeable(bio))
 		return false;
 
-	return __blk_mq_sched_bio_merge(q, bio);
+	return __blk_mq_sched_bio_merge(q, bio, nr_segs);
 }
 
 static inline bool
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 61457bffa55f..d89383847d09 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1764,14 +1764,15 @@ void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule)
 	}
 }
 
-static void blk_mq_bio_to_request(struct request *rq, struct bio *bio)
+static void blk_mq_bio_to_request(struct request *rq, struct bio *bio,
+		unsigned int nr_segs)
 {
 	if (bio->bi_opf & REQ_RAHEAD)
 		rq->cmd_flags |= REQ_FAILFAST_MASK;
 
 	rq->__sector = bio->bi_iter.bi_sector;
 	rq->write_hint = bio->bi_write_hint;
-	blk_rq_bio_prep(rq->q, rq, bio);
+	blk_rq_bio_prep(rq, bio, nr_segs);
 
 	blk_account_io_start(rq, true);
 }
@@ -1941,20 +1942,20 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
 	struct request *rq;
 	struct blk_plug *plug;
 	struct request *same_queue_rq = NULL;
+	unsigned int nr_segs;
 	blk_qc_t cookie;
 
 	blk_queue_bounce(q, &bio);
-
-	blk_queue_split(q, &bio);
+	__blk_queue_split(q, &bio, &nr_segs);
 
 	if (!bio_integrity_prep(bio))
 		return BLK_QC_T_NONE;
 
 	if (!is_flush_fua && !blk_queue_nomerges(q) &&
-	    blk_attempt_plug_merge(q, bio, &same_queue_rq))
+	    blk_attempt_plug_merge(q, bio, nr_segs, &same_queue_rq))
 		return BLK_QC_T_NONE;
 
-	if (blk_mq_sched_bio_merge(q, bio))
+	if (blk_mq_sched_bio_merge(q, bio, nr_segs))
 		return BLK_QC_T_NONE;
 
 	rq_qos_throttle(q, bio);
@@ -1977,7 +1978,7 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
 	plug = current->plug;
 	if (unlikely(is_flush_fua)) {
 		blk_mq_put_ctx(data.ctx);
-		blk_mq_bio_to_request(rq, bio);
+		blk_mq_bio_to_request(rq, bio, nr_segs);
 
 		/* bypass scheduler for flush rq */
 		blk_insert_flush(rq);
@@ -1991,7 +1992,7 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
 		struct request *last = NULL;
 
 		blk_mq_put_ctx(data.ctx);
-		blk_mq_bio_to_request(rq, bio);
+		blk_mq_bio_to_request(rq, bio, nr_segs);
 
 		if (!request_count)
 			trace_block_plug(q);
@@ -2006,7 +2007,7 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
 
 		blk_add_rq_to_plug(plug, rq);
 	} else if (plug && !blk_queue_nomerges(q)) {
-		blk_mq_bio_to_request(rq, bio);
+		blk_mq_bio_to_request(rq, bio, nr_segs);
 
 		/*
 		 * We do limited plugging. If the bio can be merged, do that.
@@ -2035,11 +2036,11 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
 	} else if ((q->nr_hw_queues > 1 && is_sync) || (!q->elevator &&
 			!data.hctx->dispatch_busy)) {
 		blk_mq_put_ctx(data.ctx);
-		blk_mq_bio_to_request(rq, bio);
+		blk_mq_bio_to_request(rq, bio, nr_segs);
 		blk_mq_try_issue_directly(data.hctx, rq, &cookie);
 	} else {
 		blk_mq_put_ctx(data.ctx);
-		blk_mq_bio_to_request(rq, bio);
+		blk_mq_bio_to_request(rq, bio, nr_segs);
 		blk_mq_sched_insert_request(rq, false, true, true);
 	}
 
diff --git a/block/blk.h b/block/blk.h
index 7814aa207153..a1d33cb65842 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -51,8 +51,7 @@ struct blk_flush_queue *blk_alloc_flush_queue(struct request_queue *q,
 		int node, int cmd_size, gfp_t flags);
 void blk_free_flush_queue(struct blk_flush_queue *q);
 
-void blk_rq_bio_prep(struct request_queue *q, struct request *rq,
-			struct bio *bio);
+void blk_rq_bio_prep(struct request *rq, struct bio *bio, unsigned int nr_segs);
 void blk_freeze_queue(struct request_queue *q);
 
 static inline void blk_queue_enter_live(struct request_queue *q)
@@ -154,14 +153,14 @@ static inline bool bio_integrity_endio(struct bio *bio)
 unsigned long blk_rq_timeout(unsigned long timeout);
 void blk_add_timer(struct request *req);
 
-bool bio_attempt_front_merge(struct request_queue *q, struct request *req,
-			     struct bio *bio);
-bool bio_attempt_back_merge(struct request_queue *q, struct request *req,
-			    struct bio *bio);
+bool bio_attempt_front_merge(struct request *req, struct bio *bio,
+		unsigned int nr_segs);
+bool bio_attempt_back_merge(struct request *req, struct bio *bio,
+		unsigned int nr_segs);
 bool bio_attempt_discard_merge(struct request_queue *q, struct request *req,
 		struct bio *bio);
 bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio,
-			    struct request **same_queue_rq);
+		unsigned int nr_segs, struct request **same_queue_rq);
 
 void blk_account_io_start(struct request *req, bool new_io);
 void blk_account_io_completion(struct request *req, unsigned int bytes);
@@ -202,10 +201,12 @@ static inline int blk_should_fake_timeout(struct request_queue *q)
 }
 #endif
 
-int ll_back_merge_fn(struct request_queue *q, struct request *req,
-		     struct bio *bio);
-int ll_front_merge_fn(struct request_queue *q, struct request *req, 
-		      struct bio *bio);
+void __blk_queue_split(struct request_queue *q, struct bio **bio,
+		unsigned int *nr_segs);
+int ll_back_merge_fn(struct request *req, struct bio *bio,
+		unsigned int nr_segs);
+int ll_front_merge_fn(struct request *req,  struct bio *bio,
+		unsigned int nr_segs);
 struct request *attempt_back_merge(struct request_queue *q, struct request *rq);
 struct request *attempt_front_merge(struct request_queue *q, struct request *rq);
 int blk_attempt_req_merge(struct request_queue *q, struct request *rq,
diff --git a/block/kyber-iosched.c b/block/kyber-iosched.c
index c3b05119cebd..3c2602601741 100644
--- a/block/kyber-iosched.c
+++ b/block/kyber-iosched.c
@@ -562,7 +562,8 @@ static void kyber_limit_depth(unsigned int op, struct blk_mq_alloc_data *data)
 	}
 }
 
-static bool kyber_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio)
+static bool kyber_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio,
+		unsigned int nr_segs)
 {
 	struct kyber_hctx_data *khd = hctx->sched_data;
 	struct blk_mq_ctx *ctx = blk_mq_get_ctx(hctx->queue);
@@ -572,7 +573,7 @@ static bool kyber_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio)
 	bool merged;
 
 	spin_lock(&kcq->lock);
-	merged = blk_mq_bio_list_merge(hctx->queue, rq_list, bio);
+	merged = blk_mq_bio_list_merge(hctx->queue, rq_list, bio, nr_segs);
 	spin_unlock(&kcq->lock);
 	blk_mq_put_ctx(ctx);
 
diff --git a/block/mq-deadline.c b/block/mq-deadline.c
index 1876f5712bfd..b8a682b5a1bb 100644
--- a/block/mq-deadline.c
+++ b/block/mq-deadline.c
@@ -469,7 +469,8 @@ static int dd_request_merge(struct request_queue *q, struct request **rq,
 	return ELEVATOR_NO_MERGE;
 }
 
-static bool dd_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio)
+static bool dd_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio,
+		unsigned int nr_segs)
 {
 	struct request_queue *q = hctx->queue;
 	struct deadline_data *dd = q->elevator->elevator_data;
@@ -477,7 +478,7 @@ static bool dd_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio)
 	bool ret;
 
 	spin_lock(&dd->lock);
-	ret = blk_mq_sched_try_merge(q, bio, &free);
+	ret = blk_mq_sched_try_merge(q, bio, nr_segs, &free);
 	spin_unlock(&dd->lock);
 
 	if (free)
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index da94cbaa1a9e..3de4e13bde98 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -5251,7 +5251,6 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio)
 		rcu_read_unlock();
 		raid_bio->bi_next = (void*)rdev;
 		bio_set_dev(align_bi, rdev->bdev);
-		bio_clear_flag(align_bi, BIO_SEG_VALID);
 
 		if (is_badblock(rdev, align_bi->bi_iter.bi_sector,
 				bio_sectors(align_bi),
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 0f23b5682640..ee11c4324751 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -408,7 +408,6 @@ static inline void bio_wouldblock_error(struct bio *bio)
 }
 
 struct request_queue;
-extern int bio_phys_segments(struct request_queue *, struct bio *);
 
 extern int submit_bio_wait(struct bio *bio);
 extern void bio_advance(struct bio *, unsigned);
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 15d1aa53d96c..3fa1fa59f9b2 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -306,7 +306,7 @@ void blk_mq_delay_kick_requeue_list(struct request_queue *q, unsigned long msecs
 bool blk_mq_complete_request(struct request *rq);
 void blk_mq_complete_request_sync(struct request *rq);
 bool blk_mq_bio_list_merge(struct request_queue *q, struct list_head *list,
-			   struct bio *bio);
+			   struct bio *bio, unsigned int nr_segs);
 bool blk_mq_queue_stopped(struct request_queue *q);
 void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx);
 void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx);
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 95202f80676c..6a53799c3fe2 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -154,11 +154,6 @@ struct bio {
 	blk_status_t		bi_status;
 	u8			bi_partno;
 
-	/* Number of segments in this BIO after
-	 * physical address coalescing is performed.
-	 */
-	unsigned int		bi_phys_segments;
-
 	struct bvec_iter	bi_iter;
 
 	atomic_t		__bi_remaining;
@@ -210,7 +205,6 @@ struct bio {
  */
 enum {
 	BIO_NO_PAGE_REF,	/* don't put release vec pages */
-	BIO_SEG_VALID,		/* bi_phys_segments valid */
 	BIO_CLONED,		/* doesn't own data */
 	BIO_BOUNCED,		/* bio is a bounce bio */
 	BIO_USER_MAPPED,	/* contains user pages */
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 2d4dfe82767a..d5d3bb45dfb6 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -841,7 +841,6 @@ extern blk_status_t blk_insert_cloned_request(struct request_queue *q,
 				     struct request *rq);
 extern int blk_rq_append_bio(struct request *rq, struct bio **bio);
 extern void blk_queue_split(struct request_queue *, struct bio **);
-extern void blk_recount_segments(struct request_queue *, struct bio *);
 extern int scsi_verify_blk_ioctl(struct block_device *, unsigned int);
 extern int scsi_cmd_blk_ioctl(struct block_device *, fmode_t,
 			      unsigned int, void __user *);
diff --git a/include/linux/elevator.h b/include/linux/elevator.h
index 6e8bc53740f0..169bb2e02516 100644
--- a/include/linux/elevator.h
+++ b/include/linux/elevator.h
@@ -34,7 +34,7 @@ struct elevator_mq_ops {
 	void (*depth_updated)(struct blk_mq_hw_ctx *);
 
 	bool (*allow_merge)(struct request_queue *, struct request *, struct bio *);
-	bool (*bio_merge)(struct blk_mq_hw_ctx *, struct bio *);
+	bool (*bio_merge)(struct blk_mq_hw_ctx *, struct bio *, unsigned int);
 	int (*request_merge)(struct request_queue *q, struct request **, struct bio *);
 	void (*request_merged)(struct request_queue *, struct request *, enum elv_merge);
 	void (*requests_merged)(struct request_queue *, struct request *, struct request *);
-- 
cgit v1.2.3


From 239eeb085753d4356f731a773f363eb5bed4fe81 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 6 Jun 2019 12:26:19 +0200
Subject: blk-cgroup: factor out a helper to read rwstat counter

Break up the crazy statements into something readable.
Also switch to an unsigned counter as it can't ever turn negative.

Reviewed-by: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
Acked-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-cgroup.c         | 5 ++---
 include/linux/blk-cgroup.h | 7 +++++++
 2 files changed, 9 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 440797293235..0778e52b1db2 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -745,7 +745,7 @@ struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkcg_gq *blkg,
 	struct blkcg_gq *pos_blkg;
 	struct cgroup_subsys_state *pos_css;
 	struct blkg_rwstat sum = { };
-	int i;
+	unsigned int i;
 
 	lockdep_assert_held(&blkg->q->queue_lock);
 
@@ -762,8 +762,7 @@ struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkcg_gq *blkg,
 			rwstat = (void *)pos_blkg + off;
 
 		for (i = 0; i < BLKG_RWSTAT_NR; i++)
-			atomic64_add(atomic64_read(&rwstat->aux_cnt[i]) +
-				percpu_counter_sum_positive(&rwstat->cpu_cnt[i]),
+			atomic64_add(blkg_rwstat_read_counter(rwstat, i),
 				&sum.aux_cnt[i]);
 	}
 	rcu_read_unlock();
diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h
index 76c61318fda5..06236f56a840 100644
--- a/include/linux/blk-cgroup.h
+++ b/include/linux/blk-cgroup.h
@@ -198,6 +198,13 @@ int blkcg_activate_policy(struct request_queue *q,
 void blkcg_deactivate_policy(struct request_queue *q,
 			     const struct blkcg_policy *pol);
 
+static inline u64 blkg_rwstat_read_counter(struct blkg_rwstat *rwstat,
+		unsigned int idx)
+{
+	return atomic64_read(&rwstat->aux_cnt[idx]) +
+		percpu_counter_sum_positive(&rwstat->cpu_cnt[idx]);
+}
+
 const char *blkg_dev_name(struct blkcg_gq *blkg);
 void blkcg_print_blkgs(struct seq_file *sf, struct blkcg *blkcg,
 		       u64 (*prfill)(struct seq_file *,
-- 
cgit v1.2.3


From 5d0b6e48cbef3219c0ed75e0e746c4ed259303c2 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 6 Jun 2019 12:26:20 +0200
Subject: blk-cgroup: pass blkg_rwstat structures by reference

Returning a structure generates rather bad code, so switch to passing
by reference.  Also, instead of requiring the structure to be zeroed and
adding to the 0-initialized counters, set the counters directly to the
calculated value.

Acked-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/bfq-cgroup.c         | 15 +++++++++------
 block/blk-cgroup.c         | 31 ++++++++++++++++---------------
 include/linux/blk-cgroup.h | 14 +++++++-------
 3 files changed, 32 insertions(+), 28 deletions(-)

(limited to 'include/linux')

diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c
index b3796a40a61a..66abc82179f3 100644
--- a/block/bfq-cgroup.c
+++ b/block/bfq-cgroup.c
@@ -935,9 +935,9 @@ static u64 bfqg_prfill_stat_recursive(struct seq_file *sf,
 static u64 bfqg_prfill_rwstat_recursive(struct seq_file *sf,
 					struct blkg_policy_data *pd, int off)
 {
-	struct blkg_rwstat sum = blkg_rwstat_recursive_sum(pd_to_blkg(pd),
-							   &blkcg_policy_bfq,
-							   off);
+	struct blkg_rwstat sum;
+
+	blkg_rwstat_recursive_sum(pd_to_blkg(pd), &blkcg_policy_bfq, off, &sum);
 	return __blkg_prfill_rwstat(sf, pd, &sum);
 }
 
@@ -975,9 +975,12 @@ static int bfqg_print_stat_sectors(struct seq_file *sf, void *v)
 static u64 bfqg_prfill_sectors_recursive(struct seq_file *sf,
 					 struct blkg_policy_data *pd, int off)
 {
-	struct blkg_rwstat tmp = blkg_rwstat_recursive_sum(pd->blkg, NULL,
-					offsetof(struct blkcg_gq, stat_bytes));
-	u64 sum = atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_READ]) +
+	struct blkg_rwstat tmp;
+	u64 sum;
+
+	blkg_rwstat_recursive_sum(pd->blkg, NULL,
+			offsetof(struct blkcg_gq, stat_bytes), &tmp);
+	sum = atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_READ]) +
 		atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_WRITE]);
 
 	return __blkg_prfill_u64(sf, pd, sum >> 9);
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 0778e52b1db2..db039a869d95 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -597,8 +597,9 @@ EXPORT_SYMBOL_GPL(blkg_prfill_stat);
 u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
 		       int off)
 {
-	struct blkg_rwstat rwstat = blkg_rwstat_read((void *)pd + off);
+	struct blkg_rwstat rwstat = { };
 
+	blkg_rwstat_read((void *)pd + off, &rwstat);
 	return __blkg_prfill_rwstat(sf, pd, &rwstat);
 }
 EXPORT_SYMBOL_GPL(blkg_prfill_rwstat);
@@ -606,8 +607,9 @@ EXPORT_SYMBOL_GPL(blkg_prfill_rwstat);
 static u64 blkg_prfill_rwstat_field(struct seq_file *sf,
 				    struct blkg_policy_data *pd, int off)
 {
-	struct blkg_rwstat rwstat = blkg_rwstat_read((void *)pd->blkg + off);
+	struct blkg_rwstat rwstat = { };
 
+	blkg_rwstat_read((void *)pd->blkg + off, &rwstat);
 	return __blkg_prfill_rwstat(sf, pd, &rwstat);
 }
 
@@ -649,8 +651,9 @@ static u64 blkg_prfill_rwstat_field_recursive(struct seq_file *sf,
 					      struct blkg_policy_data *pd,
 					      int off)
 {
-	struct blkg_rwstat rwstat = blkg_rwstat_recursive_sum(pd->blkg,
-							      NULL, off);
+	struct blkg_rwstat rwstat;
+
+	blkg_rwstat_recursive_sum(pd->blkg, NULL, off, &rwstat);
 	return __blkg_prfill_rwstat(sf, pd, &rwstat);
 }
 
@@ -731,6 +734,7 @@ EXPORT_SYMBOL_GPL(blkg_stat_recursive_sum);
  * @blkg: blkg of interest
  * @pol: blkcg_policy which contains the blkg_rwstat
  * @off: offset to the blkg_rwstat in blkg_policy_data or @blkg
+ * @sum: blkg_rwstat structure containing the results
  *
  * Collect the blkg_rwstat specified by @blkg, @pol and @off and all its
  * online descendants and their aux counts.  The caller must be holding the
@@ -739,12 +743,11 @@ EXPORT_SYMBOL_GPL(blkg_stat_recursive_sum);
  * If @pol is NULL, blkg_rwstat is at @off bytes into @blkg; otherwise, it
  * is at @off bytes into @blkg's blkg_policy_data of the policy.
  */
-struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkcg_gq *blkg,
-					     struct blkcg_policy *pol, int off)
+void blkg_rwstat_recursive_sum(struct blkcg_gq *blkg, struct blkcg_policy *pol,
+		int off, struct blkg_rwstat *sum)
 {
 	struct blkcg_gq *pos_blkg;
 	struct cgroup_subsys_state *pos_css;
-	struct blkg_rwstat sum = { };
 	unsigned int i;
 
 	lockdep_assert_held(&blkg->q->queue_lock);
@@ -762,12 +765,10 @@ struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkcg_gq *blkg,
 			rwstat = (void *)pos_blkg + off;
 
 		for (i = 0; i < BLKG_RWSTAT_NR; i++)
-			atomic64_add(blkg_rwstat_read_counter(rwstat, i),
-				&sum.aux_cnt[i]);
+			atomic64_set(&sum->aux_cnt[i],
+				blkg_rwstat_read_counter(rwstat, i));
 	}
 	rcu_read_unlock();
-
-	return sum;
 }
 EXPORT_SYMBOL_GPL(blkg_rwstat_recursive_sum);
 
@@ -953,14 +954,14 @@ static int blkcg_print_stat(struct seq_file *sf, void *v)
 
 		spin_lock_irq(&blkg->q->queue_lock);
 
-		rwstat = blkg_rwstat_recursive_sum(blkg, NULL,
-					offsetof(struct blkcg_gq, stat_bytes));
+		blkg_rwstat_recursive_sum(blkg, NULL,
+				offsetof(struct blkcg_gq, stat_bytes), &rwstat);
 		rbytes = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_READ]);
 		wbytes = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_WRITE]);
 		dbytes = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_DISCARD]);
 
-		rwstat = blkg_rwstat_recursive_sum(blkg, NULL,
-					offsetof(struct blkcg_gq, stat_ios));
+		blkg_rwstat_recursive_sum(blkg, NULL,
+					offsetof(struct blkcg_gq, stat_ios), &rwstat);
 		rios = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_READ]);
 		wios = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_WRITE]);
 		dios = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_DISCARD]);
diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h
index 06236f56a840..3ee858111274 100644
--- a/include/linux/blk-cgroup.h
+++ b/include/linux/blk-cgroup.h
@@ -224,8 +224,8 @@ int blkg_print_stat_ios_recursive(struct seq_file *sf, void *v);
 
 u64 blkg_stat_recursive_sum(struct blkcg_gq *blkg,
 			    struct blkcg_policy *pol, int off);
-struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkcg_gq *blkg,
-					     struct blkcg_policy *pol, int off);
+void blkg_rwstat_recursive_sum(struct blkcg_gq *blkg, struct blkcg_policy *pol,
+		int off, struct blkg_rwstat *sum);
 
 struct blkg_conf_ctx {
 	struct gendisk			*disk;
@@ -700,15 +700,14 @@ static inline void blkg_rwstat_add(struct blkg_rwstat *rwstat,
  *
  * Read the current snapshot of @rwstat and return it in the aux counts.
  */
-static inline struct blkg_rwstat blkg_rwstat_read(struct blkg_rwstat *rwstat)
+static inline void blkg_rwstat_read(struct blkg_rwstat *rwstat,
+		struct blkg_rwstat *result)
 {
-	struct blkg_rwstat result;
 	int i;
 
 	for (i = 0; i < BLKG_RWSTAT_NR; i++)
-		atomic64_set(&result.aux_cnt[i],
+		atomic64_set(&result->aux_cnt[i],
 			     percpu_counter_sum_positive(&rwstat->cpu_cnt[i]));
-	return result;
 }
 
 /**
@@ -721,8 +720,9 @@ static inline struct blkg_rwstat blkg_rwstat_read(struct blkg_rwstat *rwstat)
  */
 static inline uint64_t blkg_rwstat_total(struct blkg_rwstat *rwstat)
 {
-	struct blkg_rwstat tmp = blkg_rwstat_read(rwstat);
+	struct blkg_rwstat tmp = { };
 
+	blkg_rwstat_read(rwstat, &tmp);
 	return atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_READ]) +
 		atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_WRITE]);
 }
-- 
cgit v1.2.3


From 7af6fd9112ba310a889c60d0606b4b74049cfe14 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 6 Jun 2019 12:26:21 +0200
Subject: blk-cgroup: introduce a new struct blkg_rwstat_sample

When sampling the blkcg counts we don't need atomics or per-cpu
variables.  Introduce a new structure just containing plain u64
counters.

Acked-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/bfq-cgroup.c         | 10 ++++------
 block/blk-cgroup.c         | 39 +++++++++++++++++++--------------------
 include/linux/blk-cgroup.h | 22 ++++++++++++----------
 3 files changed, 35 insertions(+), 36 deletions(-)

(limited to 'include/linux')

diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c
index 66abc82179f3..624374a99c6e 100644
--- a/block/bfq-cgroup.c
+++ b/block/bfq-cgroup.c
@@ -935,7 +935,7 @@ static u64 bfqg_prfill_stat_recursive(struct seq_file *sf,
 static u64 bfqg_prfill_rwstat_recursive(struct seq_file *sf,
 					struct blkg_policy_data *pd, int off)
 {
-	struct blkg_rwstat sum;
+	struct blkg_rwstat_sample sum;
 
 	blkg_rwstat_recursive_sum(pd_to_blkg(pd), &blkcg_policy_bfq, off, &sum);
 	return __blkg_prfill_rwstat(sf, pd, &sum);
@@ -975,15 +975,13 @@ static int bfqg_print_stat_sectors(struct seq_file *sf, void *v)
 static u64 bfqg_prfill_sectors_recursive(struct seq_file *sf,
 					 struct blkg_policy_data *pd, int off)
 {
-	struct blkg_rwstat tmp;
-	u64 sum;
+	struct blkg_rwstat_sample tmp;
 
 	blkg_rwstat_recursive_sum(pd->blkg, NULL,
 			offsetof(struct blkcg_gq, stat_bytes), &tmp);
-	sum = atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_READ]) +
-		atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_WRITE]);
 
-	return __blkg_prfill_u64(sf, pd, sum >> 9);
+	return __blkg_prfill_u64(sf, pd,
+		(tmp.cnt[BLKG_RWSTAT_READ] + tmp.cnt[BLKG_RWSTAT_WRITE]) >> 9);
 }
 
 static int bfqg_print_stat_sectors_recursive(struct seq_file *sf, void *v)
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index db039a869d95..664c09866839 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -544,7 +544,7 @@ EXPORT_SYMBOL_GPL(__blkg_prfill_u64);
  * Print @rwstat to @sf for the device assocaited with @pd.
  */
 u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
-			 const struct blkg_rwstat *rwstat)
+			 const struct blkg_rwstat_sample *rwstat)
 {
 	static const char *rwstr[] = {
 		[BLKG_RWSTAT_READ]	= "Read",
@@ -562,12 +562,12 @@ u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
 
 	for (i = 0; i < BLKG_RWSTAT_NR; i++)
 		seq_printf(sf, "%s %s %llu\n", dname, rwstr[i],
-			   (unsigned long long)atomic64_read(&rwstat->aux_cnt[i]));
+			   rwstat->cnt[i]);
 
-	v = atomic64_read(&rwstat->aux_cnt[BLKG_RWSTAT_READ]) +
-		atomic64_read(&rwstat->aux_cnt[BLKG_RWSTAT_WRITE]) +
-		atomic64_read(&rwstat->aux_cnt[BLKG_RWSTAT_DISCARD]);
-	seq_printf(sf, "%s Total %llu\n", dname, (unsigned long long)v);
+	v = rwstat->cnt[BLKG_RWSTAT_READ] +
+		rwstat->cnt[BLKG_RWSTAT_WRITE] +
+		rwstat->cnt[BLKG_RWSTAT_DISCARD];
+	seq_printf(sf, "%s Total %llu\n", dname, v);
 	return v;
 }
 EXPORT_SYMBOL_GPL(__blkg_prfill_rwstat);
@@ -597,7 +597,7 @@ EXPORT_SYMBOL_GPL(blkg_prfill_stat);
 u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
 		       int off)
 {
-	struct blkg_rwstat rwstat = { };
+	struct blkg_rwstat_sample rwstat = { };
 
 	blkg_rwstat_read((void *)pd + off, &rwstat);
 	return __blkg_prfill_rwstat(sf, pd, &rwstat);
@@ -607,7 +607,7 @@ EXPORT_SYMBOL_GPL(blkg_prfill_rwstat);
 static u64 blkg_prfill_rwstat_field(struct seq_file *sf,
 				    struct blkg_policy_data *pd, int off)
 {
-	struct blkg_rwstat rwstat = { };
+	struct blkg_rwstat_sample rwstat = { };
 
 	blkg_rwstat_read((void *)pd->blkg + off, &rwstat);
 	return __blkg_prfill_rwstat(sf, pd, &rwstat);
@@ -651,7 +651,7 @@ static u64 blkg_prfill_rwstat_field_recursive(struct seq_file *sf,
 					      struct blkg_policy_data *pd,
 					      int off)
 {
-	struct blkg_rwstat rwstat;
+	struct blkg_rwstat_sample rwstat;
 
 	blkg_rwstat_recursive_sum(pd->blkg, NULL, off, &rwstat);
 	return __blkg_prfill_rwstat(sf, pd, &rwstat);
@@ -734,7 +734,7 @@ EXPORT_SYMBOL_GPL(blkg_stat_recursive_sum);
  * @blkg: blkg of interest
  * @pol: blkcg_policy which contains the blkg_rwstat
  * @off: offset to the blkg_rwstat in blkg_policy_data or @blkg
- * @sum: blkg_rwstat structure containing the results
+ * @sum: blkg_rwstat_sample structure containing the results
  *
  * Collect the blkg_rwstat specified by @blkg, @pol and @off and all its
  * online descendants and their aux counts.  The caller must be holding the
@@ -744,7 +744,7 @@ EXPORT_SYMBOL_GPL(blkg_stat_recursive_sum);
  * is at @off bytes into @blkg's blkg_policy_data of the policy.
  */
 void blkg_rwstat_recursive_sum(struct blkcg_gq *blkg, struct blkcg_policy *pol,
-		int off, struct blkg_rwstat *sum)
+		int off, struct blkg_rwstat_sample *sum)
 {
 	struct blkcg_gq *pos_blkg;
 	struct cgroup_subsys_state *pos_css;
@@ -765,8 +765,7 @@ void blkg_rwstat_recursive_sum(struct blkcg_gq *blkg, struct blkcg_policy *pol,
 			rwstat = (void *)pos_blkg + off;
 
 		for (i = 0; i < BLKG_RWSTAT_NR; i++)
-			atomic64_set(&sum->aux_cnt[i],
-				blkg_rwstat_read_counter(rwstat, i));
+			sum->cnt[i] = blkg_rwstat_read_counter(rwstat, i);
 	}
 	rcu_read_unlock();
 }
@@ -934,7 +933,7 @@ static int blkcg_print_stat(struct seq_file *sf, void *v)
 	hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) {
 		const char *dname;
 		char *buf;
-		struct blkg_rwstat rwstat;
+		struct blkg_rwstat_sample rwstat;
 		u64 rbytes, wbytes, rios, wios, dbytes, dios;
 		size_t size = seq_get_buf(sf, &buf), off = 0;
 		int i;
@@ -956,15 +955,15 @@ static int blkcg_print_stat(struct seq_file *sf, void *v)
 
 		blkg_rwstat_recursive_sum(blkg, NULL,
 				offsetof(struct blkcg_gq, stat_bytes), &rwstat);
-		rbytes = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_READ]);
-		wbytes = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_WRITE]);
-		dbytes = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_DISCARD]);
+		rbytes = rwstat.cnt[BLKG_RWSTAT_READ];
+		wbytes = rwstat.cnt[BLKG_RWSTAT_WRITE];
+		dbytes = rwstat.cnt[BLKG_RWSTAT_DISCARD];
 
 		blkg_rwstat_recursive_sum(blkg, NULL,
 					offsetof(struct blkcg_gq, stat_ios), &rwstat);
-		rios = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_READ]);
-		wios = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_WRITE]);
-		dios = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_DISCARD]);
+		rios = rwstat.cnt[BLKG_RWSTAT_READ];
+		wios = rwstat.cnt[BLKG_RWSTAT_WRITE];
+		dios = rwstat.cnt[BLKG_RWSTAT_DISCARD];
 
 		spin_unlock_irq(&blkg->q->queue_lock);
 
diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h
index 3ee858111274..e4a81767e111 100644
--- a/include/linux/blk-cgroup.h
+++ b/include/linux/blk-cgroup.h
@@ -63,8 +63,7 @@ struct blkcg {
 
 /*
  * blkg_[rw]stat->aux_cnt is excluded for local stats but included for
- * recursive.  Used to carry stats of dead children, and, for blkg_rwstat,
- * to carry result values from read and sum operations.
+ * recursive.  Used to carry stats of dead children.
  */
 struct blkg_stat {
 	struct percpu_counter		cpu_cnt;
@@ -76,6 +75,10 @@ struct blkg_rwstat {
 	atomic64_t			aux_cnt[BLKG_RWSTAT_NR];
 };
 
+struct blkg_rwstat_sample {
+	u64				cnt[BLKG_RWSTAT_NR];
+};
+
 /*
  * A blkcg_gq (blkg) is association between a block cgroup (blkcg) and a
  * request_queue (q).  This is used by blkcg policies which need to track
@@ -213,7 +216,7 @@ void blkcg_print_blkgs(struct seq_file *sf, struct blkcg *blkcg,
 		       bool show_total);
 u64 __blkg_prfill_u64(struct seq_file *sf, struct blkg_policy_data *pd, u64 v);
 u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
-			 const struct blkg_rwstat *rwstat);
+			 const struct blkg_rwstat_sample *rwstat);
 u64 blkg_prfill_stat(struct seq_file *sf, struct blkg_policy_data *pd, int off);
 u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
 		       int off);
@@ -225,7 +228,7 @@ int blkg_print_stat_ios_recursive(struct seq_file *sf, void *v);
 u64 blkg_stat_recursive_sum(struct blkcg_gq *blkg,
 			    struct blkcg_policy *pol, int off);
 void blkg_rwstat_recursive_sum(struct blkcg_gq *blkg, struct blkcg_policy *pol,
-		int off, struct blkg_rwstat *sum);
+		int off, struct blkg_rwstat_sample *sum);
 
 struct blkg_conf_ctx {
 	struct gendisk			*disk;
@@ -701,13 +704,13 @@ static inline void blkg_rwstat_add(struct blkg_rwstat *rwstat,
  * Read the current snapshot of @rwstat and return it in the aux counts.
  */
 static inline void blkg_rwstat_read(struct blkg_rwstat *rwstat,
-		struct blkg_rwstat *result)
+		struct blkg_rwstat_sample *result)
 {
 	int i;
 
 	for (i = 0; i < BLKG_RWSTAT_NR; i++)
-		atomic64_set(&result->aux_cnt[i],
-			     percpu_counter_sum_positive(&rwstat->cpu_cnt[i]));
+		result->cnt[i] =
+			percpu_counter_sum_positive(&rwstat->cpu_cnt[i]);
 }
 
 /**
@@ -720,11 +723,10 @@ static inline void blkg_rwstat_read(struct blkg_rwstat *rwstat,
  */
 static inline uint64_t blkg_rwstat_total(struct blkg_rwstat *rwstat)
 {
-	struct blkg_rwstat tmp = { };
+	struct blkg_rwstat_sample tmp = { };
 
 	blkg_rwstat_read(rwstat, &tmp);
-	return atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_READ]) +
-		atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_WRITE]);
+	return tmp.cnt[BLKG_RWSTAT_READ] + tmp.cnt[BLKG_RWSTAT_WRITE];
 }
 
 /**
-- 
cgit v1.2.3


From c0ce79dca5b0e8373a546ebea2af7b3df94c584e Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 6 Jun 2019 12:26:22 +0200
Subject: blk-cgroup: move struct blkg_stat to bfq

This structure and assorted infrastructure is only used by the bfq I/O
scheduler.  Move it there instead of bloating the common code.

Acked-by: Tejun Heo <tj@kernel.org>
Acked-by: Paolo Valente <paolo.valente@linaro.org>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/bfq-cgroup.c         | 192 ++++++++++++++++++++++++++++++++++++---------
 block/bfq-iosched.h        |  19 +++--
 block/blk-cgroup.c         |  56 -------------
 include/linux/blk-cgroup.h |  71 -----------------
 4 files changed, 167 insertions(+), 171 deletions(-)

(limited to 'include/linux')

diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c
index 624374a99c6e..a691dca7e966 100644
--- a/block/bfq-cgroup.c
+++ b/block/bfq-cgroup.c
@@ -17,6 +17,124 @@
 
 #if defined(CONFIG_BFQ_GROUP_IOSCHED) &&  defined(CONFIG_DEBUG_BLK_CGROUP)
 
+static int bfq_stat_init(struct bfq_stat *stat, gfp_t gfp)
+{
+	int ret;
+
+	ret = percpu_counter_init(&stat->cpu_cnt, 0, gfp);
+	if (ret)
+		return ret;
+
+	atomic64_set(&stat->aux_cnt, 0);
+	return 0;
+}
+
+static void bfq_stat_exit(struct bfq_stat *stat)
+{
+	percpu_counter_destroy(&stat->cpu_cnt);
+}
+
+/**
+ * bfq_stat_add - add a value to a bfq_stat
+ * @stat: target bfq_stat
+ * @val: value to add
+ *
+ * Add @val to @stat.  The caller must ensure that IRQ on the same CPU
+ * don't re-enter this function for the same counter.
+ */
+static inline void bfq_stat_add(struct bfq_stat *stat, uint64_t val)
+{
+	percpu_counter_add_batch(&stat->cpu_cnt, val, BLKG_STAT_CPU_BATCH);
+}
+
+/**
+ * bfq_stat_read - read the current value of a bfq_stat
+ * @stat: bfq_stat to read
+ */
+static inline uint64_t bfq_stat_read(struct bfq_stat *stat)
+{
+	return percpu_counter_sum_positive(&stat->cpu_cnt);
+}
+
+/**
+ * bfq_stat_reset - reset a bfq_stat
+ * @stat: bfq_stat to reset
+ */
+static inline void bfq_stat_reset(struct bfq_stat *stat)
+{
+	percpu_counter_set(&stat->cpu_cnt, 0);
+	atomic64_set(&stat->aux_cnt, 0);
+}
+
+/**
+ * bfq_stat_add_aux - add a bfq_stat into another's aux count
+ * @to: the destination bfq_stat
+ * @from: the source
+ *
+ * Add @from's count including the aux one to @to's aux count.
+ */
+static inline void bfq_stat_add_aux(struct bfq_stat *to,
+				     struct bfq_stat *from)
+{
+	atomic64_add(bfq_stat_read(from) + atomic64_read(&from->aux_cnt),
+		     &to->aux_cnt);
+}
+
+/**
+ * bfq_stat_recursive_sum - collect hierarchical bfq_stat
+ * @blkg: blkg of interest
+ * @pol: blkcg_policy which contains the bfq_stat
+ * @off: offset to the bfq_stat in blkg_policy_data or @blkg
+ *
+ * Collect the bfq_stat specified by @blkg, @pol and @off and all its
+ * online descendants and their aux counts.  The caller must be holding the
+ * queue lock for online tests.
+ *
+ * If @pol is NULL, bfq_stat is at @off bytes into @blkg; otherwise, it is
+ * at @off bytes into @blkg's blkg_policy_data of the policy.
+ */
+static u64 bfq_stat_recursive_sum(struct blkcg_gq *blkg,
+			    struct blkcg_policy *pol, int off)
+{
+	struct blkcg_gq *pos_blkg;
+	struct cgroup_subsys_state *pos_css;
+	u64 sum = 0;
+
+	lockdep_assert_held(&blkg->q->queue_lock);
+
+	rcu_read_lock();
+	blkg_for_each_descendant_pre(pos_blkg, pos_css, blkg) {
+		struct bfq_stat *stat;
+
+		if (!pos_blkg->online)
+			continue;
+
+		if (pol)
+			stat = (void *)blkg_to_pd(pos_blkg, pol) + off;
+		else
+			stat = (void *)blkg + off;
+
+		sum += bfq_stat_read(stat) + atomic64_read(&stat->aux_cnt);
+	}
+	rcu_read_unlock();
+
+	return sum;
+}
+
+/**
+ * blkg_prfill_stat - prfill callback for bfq_stat
+ * @sf: seq_file to print to
+ * @pd: policy private data of interest
+ * @off: offset to the bfq_stat in @pd
+ *
+ * prfill callback for printing a bfq_stat.
+ */
+static u64 blkg_prfill_stat(struct seq_file *sf, struct blkg_policy_data *pd,
+		int off)
+{
+	return __blkg_prfill_u64(sf, pd, bfq_stat_read((void *)pd + off));
+}
+
 /* bfqg stats flags */
 enum bfqg_stats_flags {
 	BFQG_stats_waiting = 0,
@@ -53,7 +171,7 @@ static void bfqg_stats_update_group_wait_time(struct bfqg_stats *stats)
 
 	now = ktime_get_ns();
 	if (now > stats->start_group_wait_time)
-		blkg_stat_add(&stats->group_wait_time,
+		bfq_stat_add(&stats->group_wait_time,
 			      now - stats->start_group_wait_time);
 	bfqg_stats_clear_waiting(stats);
 }
@@ -82,14 +200,14 @@ static void bfqg_stats_end_empty_time(struct bfqg_stats *stats)
 
 	now = ktime_get_ns();
 	if (now > stats->start_empty_time)
-		blkg_stat_add(&stats->empty_time,
+		bfq_stat_add(&stats->empty_time,
 			      now - stats->start_empty_time);
 	bfqg_stats_clear_empty(stats);
 }
 
 void bfqg_stats_update_dequeue(struct bfq_group *bfqg)
 {
-	blkg_stat_add(&bfqg->stats.dequeue, 1);
+	bfq_stat_add(&bfqg->stats.dequeue, 1);
 }
 
 void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg)
@@ -119,7 +237,7 @@ void bfqg_stats_update_idle_time(struct bfq_group *bfqg)
 		u64 now = ktime_get_ns();
 
 		if (now > stats->start_idle_time)
-			blkg_stat_add(&stats->idle_time,
+			bfq_stat_add(&stats->idle_time,
 				      now - stats->start_idle_time);
 		bfqg_stats_clear_idling(stats);
 	}
@@ -137,9 +255,9 @@ void bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg)
 {
 	struct bfqg_stats *stats = &bfqg->stats;
 
-	blkg_stat_add(&stats->avg_queue_size_sum,
+	bfq_stat_add(&stats->avg_queue_size_sum,
 		      blkg_rwstat_total(&stats->queued));
-	blkg_stat_add(&stats->avg_queue_size_samples, 1);
+	bfq_stat_add(&stats->avg_queue_size_samples, 1);
 	bfqg_stats_update_group_wait_time(stats);
 }
 
@@ -279,13 +397,13 @@ static void bfqg_stats_reset(struct bfqg_stats *stats)
 	blkg_rwstat_reset(&stats->merged);
 	blkg_rwstat_reset(&stats->service_time);
 	blkg_rwstat_reset(&stats->wait_time);
-	blkg_stat_reset(&stats->time);
-	blkg_stat_reset(&stats->avg_queue_size_sum);
-	blkg_stat_reset(&stats->avg_queue_size_samples);
-	blkg_stat_reset(&stats->dequeue);
-	blkg_stat_reset(&stats->group_wait_time);
-	blkg_stat_reset(&stats->idle_time);
-	blkg_stat_reset(&stats->empty_time);
+	bfq_stat_reset(&stats->time);
+	bfq_stat_reset(&stats->avg_queue_size_sum);
+	bfq_stat_reset(&stats->avg_queue_size_samples);
+	bfq_stat_reset(&stats->dequeue);
+	bfq_stat_reset(&stats->group_wait_time);
+	bfq_stat_reset(&stats->idle_time);
+	bfq_stat_reset(&stats->empty_time);
 #endif
 }
 
@@ -300,14 +418,14 @@ static void bfqg_stats_add_aux(struct bfqg_stats *to, struct bfqg_stats *from)
 	blkg_rwstat_add_aux(&to->merged, &from->merged);
 	blkg_rwstat_add_aux(&to->service_time, &from->service_time);
 	blkg_rwstat_add_aux(&to->wait_time, &from->wait_time);
-	blkg_stat_add_aux(&from->time, &from->time);
-	blkg_stat_add_aux(&to->avg_queue_size_sum, &from->avg_queue_size_sum);
-	blkg_stat_add_aux(&to->avg_queue_size_samples,
+	bfq_stat_add_aux(&from->time, &from->time);
+	bfq_stat_add_aux(&to->avg_queue_size_sum, &from->avg_queue_size_sum);
+	bfq_stat_add_aux(&to->avg_queue_size_samples,
 			  &from->avg_queue_size_samples);
-	blkg_stat_add_aux(&to->dequeue, &from->dequeue);
-	blkg_stat_add_aux(&to->group_wait_time, &from->group_wait_time);
-	blkg_stat_add_aux(&to->idle_time, &from->idle_time);
-	blkg_stat_add_aux(&to->empty_time, &from->empty_time);
+	bfq_stat_add_aux(&to->dequeue, &from->dequeue);
+	bfq_stat_add_aux(&to->group_wait_time, &from->group_wait_time);
+	bfq_stat_add_aux(&to->idle_time, &from->idle_time);
+	bfq_stat_add_aux(&to->empty_time, &from->empty_time);
 #endif
 }
 
@@ -360,13 +478,13 @@ static void bfqg_stats_exit(struct bfqg_stats *stats)
 	blkg_rwstat_exit(&stats->service_time);
 	blkg_rwstat_exit(&stats->wait_time);
 	blkg_rwstat_exit(&stats->queued);
-	blkg_stat_exit(&stats->time);
-	blkg_stat_exit(&stats->avg_queue_size_sum);
-	blkg_stat_exit(&stats->avg_queue_size_samples);
-	blkg_stat_exit(&stats->dequeue);
-	blkg_stat_exit(&stats->group_wait_time);
-	blkg_stat_exit(&stats->idle_time);
-	blkg_stat_exit(&stats->empty_time);
+	bfq_stat_exit(&stats->time);
+	bfq_stat_exit(&stats->avg_queue_size_sum);
+	bfq_stat_exit(&stats->avg_queue_size_samples);
+	bfq_stat_exit(&stats->dequeue);
+	bfq_stat_exit(&stats->group_wait_time);
+	bfq_stat_exit(&stats->idle_time);
+	bfq_stat_exit(&stats->empty_time);
 #endif
 }
 
@@ -377,13 +495,13 @@ static int bfqg_stats_init(struct bfqg_stats *stats, gfp_t gfp)
 	    blkg_rwstat_init(&stats->service_time, gfp) ||
 	    blkg_rwstat_init(&stats->wait_time, gfp) ||
 	    blkg_rwstat_init(&stats->queued, gfp) ||
-	    blkg_stat_init(&stats->time, gfp) ||
-	    blkg_stat_init(&stats->avg_queue_size_sum, gfp) ||
-	    blkg_stat_init(&stats->avg_queue_size_samples, gfp) ||
-	    blkg_stat_init(&stats->dequeue, gfp) ||
-	    blkg_stat_init(&stats->group_wait_time, gfp) ||
-	    blkg_stat_init(&stats->idle_time, gfp) ||
-	    blkg_stat_init(&stats->empty_time, gfp)) {
+	    bfq_stat_init(&stats->time, gfp) ||
+	    bfq_stat_init(&stats->avg_queue_size_sum, gfp) ||
+	    bfq_stat_init(&stats->avg_queue_size_samples, gfp) ||
+	    bfq_stat_init(&stats->dequeue, gfp) ||
+	    bfq_stat_init(&stats->group_wait_time, gfp) ||
+	    bfq_stat_init(&stats->idle_time, gfp) ||
+	    bfq_stat_init(&stats->empty_time, gfp)) {
 		bfqg_stats_exit(stats);
 		return -ENOMEM;
 	}
@@ -927,7 +1045,7 @@ static int bfqg_print_rwstat(struct seq_file *sf, void *v)
 static u64 bfqg_prfill_stat_recursive(struct seq_file *sf,
 				      struct blkg_policy_data *pd, int off)
 {
-	u64 sum = blkg_stat_recursive_sum(pd_to_blkg(pd),
+	u64 sum = bfq_stat_recursive_sum(pd_to_blkg(pd),
 					  &blkcg_policy_bfq, off);
 	return __blkg_prfill_u64(sf, pd, sum);
 }
@@ -996,11 +1114,11 @@ static u64 bfqg_prfill_avg_queue_size(struct seq_file *sf,
 				      struct blkg_policy_data *pd, int off)
 {
 	struct bfq_group *bfqg = pd_to_bfqg(pd);
-	u64 samples = blkg_stat_read(&bfqg->stats.avg_queue_size_samples);
+	u64 samples = bfq_stat_read(&bfqg->stats.avg_queue_size_samples);
 	u64 v = 0;
 
 	if (samples) {
-		v = blkg_stat_read(&bfqg->stats.avg_queue_size_sum);
+		v = bfq_stat_read(&bfqg->stats.avg_queue_size_sum);
 		v = div64_u64(v, samples);
 	}
 	__blkg_prfill_u64(sf, pd, v);
diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h
index c2faa77824f8..aef4fa0046b8 100644
--- a/block/bfq-iosched.h
+++ b/block/bfq-iosched.h
@@ -777,6 +777,11 @@ enum bfqq_expiration {
 	BFQQE_PREEMPTED		/* preemption in progress */
 };
 
+struct bfq_stat {
+	struct percpu_counter		cpu_cnt;
+	atomic64_t			aux_cnt;
+};
+
 struct bfqg_stats {
 #if defined(CONFIG_BFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP)
 	/* number of ios merged */
@@ -788,19 +793,19 @@ struct bfqg_stats {
 	/* number of IOs queued up */
 	struct blkg_rwstat		queued;
 	/* total disk time and nr sectors dispatched by this group */
-	struct blkg_stat		time;
+	struct bfq_stat		time;
 	/* sum of number of ios queued across all samples */
-	struct blkg_stat		avg_queue_size_sum;
+	struct bfq_stat		avg_queue_size_sum;
 	/* count of samples taken for average */
-	struct blkg_stat		avg_queue_size_samples;
+	struct bfq_stat		avg_queue_size_samples;
 	/* how many times this group has been removed from service tree */
-	struct blkg_stat		dequeue;
+	struct bfq_stat		dequeue;
 	/* total time spent waiting for it to be assigned a timeslice. */
-	struct blkg_stat		group_wait_time;
+	struct bfq_stat		group_wait_time;
 	/* time spent idling for this blkcg_gq */
-	struct blkg_stat		idle_time;
+	struct bfq_stat		idle_time;
 	/* total time with empty current active q with other requests queued */
-	struct blkg_stat		empty_time;
+	struct bfq_stat		empty_time;
 	/* fields after this shouldn't be cleared on stat reset */
 	u64				start_group_wait_time;
 	u64				start_idle_time;
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 664c09866839..53b7bd4c7000 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -572,20 +572,6 @@ u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
 }
 EXPORT_SYMBOL_GPL(__blkg_prfill_rwstat);
 
-/**
- * blkg_prfill_stat - prfill callback for blkg_stat
- * @sf: seq_file to print to
- * @pd: policy private data of interest
- * @off: offset to the blkg_stat in @pd
- *
- * prfill callback for printing a blkg_stat.
- */
-u64 blkg_prfill_stat(struct seq_file *sf, struct blkg_policy_data *pd, int off)
-{
-	return __blkg_prfill_u64(sf, pd, blkg_stat_read((void *)pd + off));
-}
-EXPORT_SYMBOL_GPL(blkg_prfill_stat);
-
 /**
  * blkg_prfill_rwstat - prfill callback for blkg_rwstat
  * @sf: seq_file to print to
@@ -687,48 +673,6 @@ int blkg_print_stat_ios_recursive(struct seq_file *sf, void *v)
 }
 EXPORT_SYMBOL_GPL(blkg_print_stat_ios_recursive);
 
-/**
- * blkg_stat_recursive_sum - collect hierarchical blkg_stat
- * @blkg: blkg of interest
- * @pol: blkcg_policy which contains the blkg_stat
- * @off: offset to the blkg_stat in blkg_policy_data or @blkg
- *
- * Collect the blkg_stat specified by @blkg, @pol and @off and all its
- * online descendants and their aux counts.  The caller must be holding the
- * queue lock for online tests.
- *
- * If @pol is NULL, blkg_stat is at @off bytes into @blkg; otherwise, it is
- * at @off bytes into @blkg's blkg_policy_data of the policy.
- */
-u64 blkg_stat_recursive_sum(struct blkcg_gq *blkg,
-			    struct blkcg_policy *pol, int off)
-{
-	struct blkcg_gq *pos_blkg;
-	struct cgroup_subsys_state *pos_css;
-	u64 sum = 0;
-
-	lockdep_assert_held(&blkg->q->queue_lock);
-
-	rcu_read_lock();
-	blkg_for_each_descendant_pre(pos_blkg, pos_css, blkg) {
-		struct blkg_stat *stat;
-
-		if (!pos_blkg->online)
-			continue;
-
-		if (pol)
-			stat = (void *)blkg_to_pd(pos_blkg, pol) + off;
-		else
-			stat = (void *)blkg + off;
-
-		sum += blkg_stat_read(stat) + atomic64_read(&stat->aux_cnt);
-	}
-	rcu_read_unlock();
-
-	return sum;
-}
-EXPORT_SYMBOL_GPL(blkg_stat_recursive_sum);
-
 /**
  * blkg_rwstat_recursive_sum - collect hierarchical blkg_rwstat
  * @blkg: blkg of interest
diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h
index e4a81767e111..33f23a858438 100644
--- a/include/linux/blk-cgroup.h
+++ b/include/linux/blk-cgroup.h
@@ -65,11 +65,6 @@ struct blkcg {
  * blkg_[rw]stat->aux_cnt is excluded for local stats but included for
  * recursive.  Used to carry stats of dead children.
  */
-struct blkg_stat {
-	struct percpu_counter		cpu_cnt;
-	atomic64_t			aux_cnt;
-};
-
 struct blkg_rwstat {
 	struct percpu_counter		cpu_cnt[BLKG_RWSTAT_NR];
 	atomic64_t			aux_cnt[BLKG_RWSTAT_NR];
@@ -217,7 +212,6 @@ void blkcg_print_blkgs(struct seq_file *sf, struct blkcg *blkcg,
 u64 __blkg_prfill_u64(struct seq_file *sf, struct blkg_policy_data *pd, u64 v);
 u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
 			 const struct blkg_rwstat_sample *rwstat);
-u64 blkg_prfill_stat(struct seq_file *sf, struct blkg_policy_data *pd, int off);
 u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
 		       int off);
 int blkg_print_stat_bytes(struct seq_file *sf, void *v);
@@ -225,8 +219,6 @@ int blkg_print_stat_ios(struct seq_file *sf, void *v);
 int blkg_print_stat_bytes_recursive(struct seq_file *sf, void *v);
 int blkg_print_stat_ios_recursive(struct seq_file *sf, void *v);
 
-u64 blkg_stat_recursive_sum(struct blkcg_gq *blkg,
-			    struct blkcg_policy *pol, int off);
 void blkg_rwstat_recursive_sum(struct blkcg_gq *blkg, struct blkcg_policy *pol,
 		int off, struct blkg_rwstat_sample *sum);
 
@@ -579,69 +571,6 @@ static inline void blkg_put(struct blkcg_gq *blkg)
 		if (((d_blkg) = __blkg_lookup(css_to_blkcg(pos_css),	\
 					      (p_blkg)->q, false)))
 
-static inline int blkg_stat_init(struct blkg_stat *stat, gfp_t gfp)
-{
-	int ret;
-
-	ret = percpu_counter_init(&stat->cpu_cnt, 0, gfp);
-	if (ret)
-		return ret;
-
-	atomic64_set(&stat->aux_cnt, 0);
-	return 0;
-}
-
-static inline void blkg_stat_exit(struct blkg_stat *stat)
-{
-	percpu_counter_destroy(&stat->cpu_cnt);
-}
-
-/**
- * blkg_stat_add - add a value to a blkg_stat
- * @stat: target blkg_stat
- * @val: value to add
- *
- * Add @val to @stat.  The caller must ensure that IRQ on the same CPU
- * don't re-enter this function for the same counter.
- */
-static inline void blkg_stat_add(struct blkg_stat *stat, uint64_t val)
-{
-	percpu_counter_add_batch(&stat->cpu_cnt, val, BLKG_STAT_CPU_BATCH);
-}
-
-/**
- * blkg_stat_read - read the current value of a blkg_stat
- * @stat: blkg_stat to read
- */
-static inline uint64_t blkg_stat_read(struct blkg_stat *stat)
-{
-	return percpu_counter_sum_positive(&stat->cpu_cnt);
-}
-
-/**
- * blkg_stat_reset - reset a blkg_stat
- * @stat: blkg_stat to reset
- */
-static inline void blkg_stat_reset(struct blkg_stat *stat)
-{
-	percpu_counter_set(&stat->cpu_cnt, 0);
-	atomic64_set(&stat->aux_cnt, 0);
-}
-
-/**
- * blkg_stat_add_aux - add a blkg_stat into another's aux count
- * @to: the destination blkg_stat
- * @from: the source
- *
- * Add @from's count including the aux one to @to's aux count.
- */
-static inline void blkg_stat_add_aux(struct blkg_stat *to,
-				     struct blkg_stat *from)
-{
-	atomic64_add(blkg_stat_read(from) + atomic64_read(&from->aux_cnt),
-		     &to->aux_cnt);
-}
-
 static inline int blkg_rwstat_init(struct blkg_rwstat *rwstat, gfp_t gfp)
 {
 	int i, ret;
-- 
cgit v1.2.3


From e47bc4eda953928644109101d07c9c95dc29a458 Mon Sep 17 00:00:00 2001
From: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
Date: Thu, 20 Jun 2019 10:59:16 -0700
Subject: block: add centralize REQ_OP_XXX to string helper

In order to centralize the REQ_OP_XXX-to-string conversion, which can be
used in the block layer and in other places in the kernel such as f2fs,
this patch adds a new helper function along with an array similar to the
one present in blk-mq-debugfs.c.

We keep this helper centralized in blk-core.c instead of
blk-mq-debugfs.c, since blk-core.c is configured using CONFIG_BLOCK and
therefore does not depend on blk-mq-debugfs.c, which is configured using
CONFIG_BLK_DEBUG_FS.

The next patch adjusts the code in blk-mq-debugfs.c to use the newly
introduced helper.

Reviewed-by: Bart Van Assche <bvanassche@acm.org>
Signed-off-by: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-core.c       | 36 ++++++++++++++++++++++++++++++++++++
 include/linux/blkdev.h |  3 +++
 2 files changed, 39 insertions(+)

(limited to 'include/linux')

diff --git a/block/blk-core.c b/block/blk-core.c
index c97da29ddc07..129204dd3bae 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -120,6 +120,42 @@ void blk_rq_init(struct request_queue *q, struct request *rq)
 }
 EXPORT_SYMBOL(blk_rq_init);
 
+#define REQ_OP_NAME(name) [REQ_OP_##name] = #name
+static const char *const blk_op_name[] = {
+	REQ_OP_NAME(READ),
+	REQ_OP_NAME(WRITE),
+	REQ_OP_NAME(FLUSH),
+	REQ_OP_NAME(DISCARD),
+	REQ_OP_NAME(SECURE_ERASE),
+	REQ_OP_NAME(ZONE_RESET),
+	REQ_OP_NAME(WRITE_SAME),
+	REQ_OP_NAME(WRITE_ZEROES),
+	REQ_OP_NAME(SCSI_IN),
+	REQ_OP_NAME(SCSI_OUT),
+	REQ_OP_NAME(DRV_IN),
+	REQ_OP_NAME(DRV_OUT),
+};
+#undef REQ_OP_NAME
+
+/**
+ * blk_op_str - Return string XXX in the REQ_OP_XXX.
+ * @op: REQ_OP_XXX.
+ *
+ * Description: Centralize block layer function to convert REQ_OP_XXX into
+ * string format. Useful in the debugging and tracing bio or request. For
+ * invalid REQ_OP_XXX it returns string "UNKNOWN".
+ */
+inline const char *blk_op_str(unsigned int op)
+{
+	const char *op_str = "UNKNOWN";
+
+	if (op < ARRAY_SIZE(blk_op_name) && blk_op_name[op])
+		op_str = blk_op_name[op];
+
+	return op_str;
+}
+EXPORT_SYMBOL_GPL(blk_op_str);
+
 static const struct {
 	int		errno;
 	const char	*name;
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index d5d3bb45dfb6..0c482371c8b3 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -865,6 +865,9 @@ extern void blk_execute_rq(struct request_queue *, struct gendisk *,
 extern void blk_execute_rq_nowait(struct request_queue *, struct gendisk *,
 				  struct request *, int, rq_end_io_fn *);
 
+/* Helper to convert REQ_OP_XXX to its string format XXX */
+extern const char *blk_op_str(unsigned int op);
+
 int blk_status_to_errno(blk_status_t status);
 blk_status_t errno_to_blk_status(int errno);
 
-- 
cgit v1.2.3


From 4635873c561ac57b66adfcc2487c38106b1c916c Mon Sep 17 00:00:00 2001
From: Ming Lei <ming.lei@redhat.com>
Date: Sun, 28 Apr 2019 15:39:30 +0800
Subject: scsi: lib/sg_pool.c: improve APIs for allocating sg pool

sg_alloc_table_chained() currently allows the caller to provide one
preallocated SGL and returns if the requested number isn't bigger than
size of that SGL. This is used to inline an SGL for an IO request.

However, the scatterlist code only allows the size of the first
preallocated SGL to be SG_CHUNK_SIZE (128). This means a substantial amount
of memory (4KB) is claimed for the SGL for each I/O request. If the I/O is
small, it would be prudent to allocate a smaller SGL.

Introduce an extra parameter to sg_alloc_table_chained() and
sg_free_table_chained() for specifying size of the preallocated SGL.

Both __sg_free_table() and __sg_alloc_table() assume that each SGL has the
same size except for the last one.  Change the code to allow both functions
to accept a variable size for the 1st preallocated SGL.

[mkp: attempted to clarify commit desc]

Cc: Christoph Hellwig <hch@lst.de>
Cc: Bart Van Assche <bvanassche@acm.org>
Cc: Ewan D. Milne <emilne@redhat.com>
Cc: Hannes Reinecke <hare@suse.com>
Cc: Sagi Grimberg <sagi@grimberg.me>
Cc: Chuck Lever <chuck.lever@oracle.com>
Cc: netdev@vger.kernel.org
Cc: linux-nvme@lists.infradead.org
Suggested-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Ming Lei <ming.lei@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 drivers/nvme/host/fc.c            |  7 ++++---
 drivers/nvme/host/rdma.c          |  7 ++++---
 drivers/nvme/target/loop.c        |  4 ++--
 drivers/scsi/scsi_lib.c           | 10 ++++++----
 include/linux/scatterlist.h       | 11 +++++++----
 lib/scatterlist.c                 | 36 +++++++++++++++++++++++-------------
 lib/sg_pool.c                     | 37 +++++++++++++++++++++++++++----------
 net/sunrpc/xprtrdma/svc_rdma_rw.c |  5 +++--
 8 files changed, 76 insertions(+), 41 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c
index dd8169bbf0d2..46811caac9d2 100644
--- a/drivers/nvme/host/fc.c
+++ b/drivers/nvme/host/fc.c
@@ -2112,7 +2112,8 @@ nvme_fc_map_data(struct nvme_fc_ctrl *ctrl, struct request *rq,
 
 	freq->sg_table.sgl = freq->first_sgl;
 	ret = sg_alloc_table_chained(&freq->sg_table,
-			blk_rq_nr_phys_segments(rq), freq->sg_table.sgl);
+			blk_rq_nr_phys_segments(rq), freq->sg_table.sgl,
+			SG_CHUNK_SIZE);
 	if (ret)
 		return -ENOMEM;
 
@@ -2122,7 +2123,7 @@ nvme_fc_map_data(struct nvme_fc_ctrl *ctrl, struct request *rq,
 	freq->sg_cnt = fc_dma_map_sg(ctrl->lport->dev, freq->sg_table.sgl,
 				op->nents, dir);
 	if (unlikely(freq->sg_cnt <= 0)) {
-		sg_free_table_chained(&freq->sg_table, true);
+		sg_free_table_chained(&freq->sg_table, SG_CHUNK_SIZE);
 		freq->sg_cnt = 0;
 		return -EFAULT;
 	}
@@ -2148,7 +2149,7 @@ nvme_fc_unmap_data(struct nvme_fc_ctrl *ctrl, struct request *rq,
 
 	nvme_cleanup_cmd(rq);
 
-	sg_free_table_chained(&freq->sg_table, true);
+	sg_free_table_chained(&freq->sg_table, SG_CHUNK_SIZE);
 
 	freq->sg_cnt = 0;
 }
diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index f383146e7d0f..f7ea19b45798 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -1133,7 +1133,7 @@ static void nvme_rdma_unmap_data(struct nvme_rdma_queue *queue,
 				    WRITE ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
 
 	nvme_cleanup_cmd(rq);
-	sg_free_table_chained(&req->sg_table, true);
+	sg_free_table_chained(&req->sg_table, SG_CHUNK_SIZE);
 }
 
 static int nvme_rdma_set_sg_null(struct nvme_command *c)
@@ -1248,7 +1248,8 @@ static int nvme_rdma_map_data(struct nvme_rdma_queue *queue,
 
 	req->sg_table.sgl = req->first_sgl;
 	ret = sg_alloc_table_chained(&req->sg_table,
-			blk_rq_nr_phys_segments(rq), req->sg_table.sgl);
+			blk_rq_nr_phys_segments(rq), req->sg_table.sgl,
+			SG_CHUNK_SIZE);
 	if (ret)
 		return -ENOMEM;
 
@@ -1288,7 +1289,7 @@ out_unmap_sg:
 			req->nents, rq_data_dir(rq) ==
 			WRITE ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
 out_free_table:
-	sg_free_table_chained(&req->sg_table, true);
+	sg_free_table_chained(&req->sg_table, SG_CHUNK_SIZE);
 	return ret;
 }
 
diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c
index 9e211ad6bdd3..b16dc3981c69 100644
--- a/drivers/nvme/target/loop.c
+++ b/drivers/nvme/target/loop.c
@@ -77,7 +77,7 @@ static void nvme_loop_complete_rq(struct request *req)
 	struct nvme_loop_iod *iod = blk_mq_rq_to_pdu(req);
 
 	nvme_cleanup_cmd(req);
-	sg_free_table_chained(&iod->sg_table, true);
+	sg_free_table_chained(&iod->sg_table, SG_CHUNK_SIZE);
 	nvme_complete_rq(req);
 }
 
@@ -157,7 +157,7 @@ static blk_status_t nvme_loop_queue_rq(struct blk_mq_hw_ctx *hctx,
 		iod->sg_table.sgl = iod->first_sgl;
 		if (sg_alloc_table_chained(&iod->sg_table,
 				blk_rq_nr_phys_segments(req),
-				iod->sg_table.sgl))
+				iod->sg_table.sgl, SG_CHUNK_SIZE))
 			return BLK_STS_RESOURCE;
 
 		iod->req.sg = iod->sg_table.sgl;
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index 0916bd6d22b0..acc0f7080f18 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -541,9 +541,9 @@ static void scsi_uninit_cmd(struct scsi_cmnd *cmd)
 static void scsi_mq_free_sgtables(struct scsi_cmnd *cmd)
 {
 	if (cmd->sdb.table.nents)
-		sg_free_table_chained(&cmd->sdb.table, true);
+		sg_free_table_chained(&cmd->sdb.table, SG_CHUNK_SIZE);
 	if (scsi_prot_sg_count(cmd))
-		sg_free_table_chained(&cmd->prot_sdb->table, true);
+		sg_free_table_chained(&cmd->prot_sdb->table, SG_CHUNK_SIZE);
 }
 
 static void scsi_mq_uninit_cmd(struct scsi_cmnd *cmd)
@@ -976,7 +976,8 @@ static blk_status_t scsi_init_sgtable(struct request *req,
 	 * If sg table allocation fails, requeue request later.
 	 */
 	if (unlikely(sg_alloc_table_chained(&sdb->table,
-			blk_rq_nr_phys_segments(req), sdb->table.sgl)))
+			blk_rq_nr_phys_segments(req), sdb->table.sgl,
+			SG_CHUNK_SIZE)))
 		return BLK_STS_RESOURCE;
 
 	/* 
@@ -1030,7 +1031,8 @@ blk_status_t scsi_init_io(struct scsi_cmnd *cmd)
 		ivecs = blk_rq_count_integrity_sg(rq->q, rq->bio);
 
 		if (sg_alloc_table_chained(&prot_sdb->table, ivecs,
-				prot_sdb->table.sgl)) {
+				prot_sdb->table.sgl,
+				SG_CHUNK_SIZE)) {
 			ret = BLK_STS_RESOURCE;
 			goto out_free_sgtables;
 		}
diff --git a/include/linux/scatterlist.h b/include/linux/scatterlist.h
index 30a9a55c28ba..6eec50fb36c8 100644
--- a/include/linux/scatterlist.h
+++ b/include/linux/scatterlist.h
@@ -266,10 +266,11 @@ int sg_split(struct scatterlist *in, const int in_mapped_nents,
 typedef struct scatterlist *(sg_alloc_fn)(unsigned int, gfp_t);
 typedef void (sg_free_fn)(struct scatterlist *, unsigned int);
 
-void __sg_free_table(struct sg_table *, unsigned int, bool, sg_free_fn *);
+void __sg_free_table(struct sg_table *, unsigned int, unsigned int,
+		     sg_free_fn *);
 void sg_free_table(struct sg_table *);
 int __sg_alloc_table(struct sg_table *, unsigned int, unsigned int,
-		     struct scatterlist *, gfp_t, sg_alloc_fn *);
+		     struct scatterlist *, unsigned int, gfp_t, sg_alloc_fn *);
 int sg_alloc_table(struct sg_table *, unsigned int, gfp_t);
 int __sg_alloc_table_from_pages(struct sg_table *sgt, struct page **pages,
 				unsigned int n_pages, unsigned int offset,
@@ -331,9 +332,11 @@ size_t sg_zero_buffer(struct scatterlist *sgl, unsigned int nents,
 #endif
 
 #ifdef CONFIG_SG_POOL
-void sg_free_table_chained(struct sg_table *table, bool first_chunk);
+void sg_free_table_chained(struct sg_table *table,
+			   unsigned nents_first_chunk);
 int sg_alloc_table_chained(struct sg_table *table, int nents,
-			   struct scatterlist *first_chunk);
+			   struct scatterlist *first_chunk,
+			   unsigned nents_first_chunk);
 #endif
 
 /*
diff --git a/lib/scatterlist.c b/lib/scatterlist.c
index 739dc9fe2c55..77ec8eec3fd0 100644
--- a/lib/scatterlist.c
+++ b/lib/scatterlist.c
@@ -181,7 +181,8 @@ static void sg_kfree(struct scatterlist *sg, unsigned int nents)
  * __sg_free_table - Free a previously mapped sg table
  * @table:	The sg table header to use
  * @max_ents:	The maximum number of entries per single scatterlist
- * @skip_first_chunk: don't free the (preallocated) first scatterlist chunk
+ * @nents_first_chunk: Number of entries int the (preallocated) first
+ * 	scatterlist chunk, 0 means no such preallocated first chunk
  * @free_fn:	Free function
  *
  *  Description:
@@ -191,9 +192,10 @@ static void sg_kfree(struct scatterlist *sg, unsigned int nents)
  *
  **/
 void __sg_free_table(struct sg_table *table, unsigned int max_ents,
-		     bool skip_first_chunk, sg_free_fn *free_fn)
+		     unsigned int nents_first_chunk, sg_free_fn *free_fn)
 {
 	struct scatterlist *sgl, *next;
+	unsigned curr_max_ents = nents_first_chunk ?: max_ents;
 
 	if (unlikely(!table->sgl))
 		return;
@@ -209,9 +211,9 @@ void __sg_free_table(struct sg_table *table, unsigned int max_ents,
 		 * sg_size is then one less than alloc size, since the last
 		 * element is the chain pointer.
 		 */
-		if (alloc_size > max_ents) {
-			next = sg_chain_ptr(&sgl[max_ents - 1]);
-			alloc_size = max_ents;
+		if (alloc_size > curr_max_ents) {
+			next = sg_chain_ptr(&sgl[curr_max_ents - 1]);
+			alloc_size = curr_max_ents;
 			sg_size = alloc_size - 1;
 		} else {
 			sg_size = alloc_size;
@@ -219,11 +221,12 @@ void __sg_free_table(struct sg_table *table, unsigned int max_ents,
 		}
 
 		table->orig_nents -= sg_size;
-		if (skip_first_chunk)
-			skip_first_chunk = false;
+		if (nents_first_chunk)
+			nents_first_chunk = 0;
 		else
 			free_fn(sgl, alloc_size);
 		sgl = next;
+		curr_max_ents = max_ents;
 	}
 
 	table->sgl = NULL;
@@ -246,6 +249,8 @@ EXPORT_SYMBOL(sg_free_table);
  * @table:	The sg table header to use
  * @nents:	Number of entries in sg list
  * @max_ents:	The maximum number of entries the allocator returns per call
+ * @nents_first_chunk: Number of entries int the (preallocated) first
+ * 	scatterlist chunk, 0 means no such preallocated chunk provided by user
  * @gfp_mask:	GFP allocation mask
  * @alloc_fn:	Allocator to use
  *
@@ -262,10 +267,13 @@ EXPORT_SYMBOL(sg_free_table);
  **/
 int __sg_alloc_table(struct sg_table *table, unsigned int nents,
 		     unsigned int max_ents, struct scatterlist *first_chunk,
-		     gfp_t gfp_mask, sg_alloc_fn *alloc_fn)
+		     unsigned int nents_first_chunk, gfp_t gfp_mask,
+		     sg_alloc_fn *alloc_fn)
 {
 	struct scatterlist *sg, *prv;
 	unsigned int left;
+	unsigned curr_max_ents = nents_first_chunk ?: max_ents;
+	unsigned prv_max_ents;
 
 	memset(table, 0, sizeof(*table));
 
@@ -281,8 +289,8 @@ int __sg_alloc_table(struct sg_table *table, unsigned int nents,
 	do {
 		unsigned int sg_size, alloc_size = left;
 
-		if (alloc_size > max_ents) {
-			alloc_size = max_ents;
+		if (alloc_size > curr_max_ents) {
+			alloc_size = curr_max_ents;
 			sg_size = alloc_size - 1;
 		} else
 			sg_size = alloc_size;
@@ -316,7 +324,7 @@ int __sg_alloc_table(struct sg_table *table, unsigned int nents,
 		 * If this is not the first mapping, chain previous part.
 		 */
 		if (prv)
-			sg_chain(prv, max_ents, sg);
+			sg_chain(prv, prv_max_ents, sg);
 		else
 			table->sgl = sg;
 
@@ -327,6 +335,8 @@ int __sg_alloc_table(struct sg_table *table, unsigned int nents,
 			sg_mark_end(&sg[sg_size - 1]);
 
 		prv = sg;
+		prv_max_ents = curr_max_ents;
+		curr_max_ents = max_ents;
 	} while (left);
 
 	return 0;
@@ -349,9 +359,9 @@ int sg_alloc_table(struct sg_table *table, unsigned int nents, gfp_t gfp_mask)
 	int ret;
 
 	ret = __sg_alloc_table(table, nents, SG_MAX_SINGLE_ALLOC,
-			       NULL, gfp_mask, sg_kmalloc);
+			       NULL, 0, gfp_mask, sg_kmalloc);
 	if (unlikely(ret))
-		__sg_free_table(table, SG_MAX_SINGLE_ALLOC, false, sg_kfree);
+		__sg_free_table(table, SG_MAX_SINGLE_ALLOC, 0, sg_kfree);
 
 	return ret;
 }
diff --git a/lib/sg_pool.c b/lib/sg_pool.c
index d1c1e6388eaa..b3b8cf62ff49 100644
--- a/lib/sg_pool.c
+++ b/lib/sg_pool.c
@@ -69,18 +69,27 @@ static struct scatterlist *sg_pool_alloc(unsigned int nents, gfp_t gfp_mask)
 /**
  * sg_free_table_chained - Free a previously mapped sg table
  * @table:	The sg table header to use
- * @first_chunk: was first_chunk not NULL in sg_alloc_table_chained?
+ * @nents_first_chunk: size of the first_chunk SGL passed to
+ *		sg_alloc_table_chained
  *
  *  Description:
  *    Free an sg table previously allocated and setup with
  *    sg_alloc_table_chained().
  *
+ *    @nents_first_chunk has to be same with that same parameter passed
+ *    to sg_alloc_table_chained().
+ *
  **/
-void sg_free_table_chained(struct sg_table *table, bool first_chunk)
+void sg_free_table_chained(struct sg_table *table,
+		unsigned nents_first_chunk)
 {
-	if (first_chunk && table->orig_nents <= SG_CHUNK_SIZE)
+	if (table->orig_nents <= nents_first_chunk)
 		return;
-	__sg_free_table(table, SG_CHUNK_SIZE, first_chunk, sg_pool_free);
+
+	if (nents_first_chunk == 1)
+		nents_first_chunk = 0;
+
+	__sg_free_table(table, SG_CHUNK_SIZE, nents_first_chunk, sg_pool_free);
 }
 EXPORT_SYMBOL_GPL(sg_free_table_chained);
 
@@ -89,31 +98,39 @@ EXPORT_SYMBOL_GPL(sg_free_table_chained);
  * @table:	The sg table header to use
  * @nents:	Number of entries in sg list
  * @first_chunk: first SGL
+ * @nents_first_chunk: number of the SGL of @first_chunk
  *
  *  Description:
  *    Allocate and chain SGLs in an sg table. If @nents@ is larger than
- *    SG_CHUNK_SIZE a chained sg table will be setup.
+ *    @nents_first_chunk a chained sg table will be setup.
  *
  **/
 int sg_alloc_table_chained(struct sg_table *table, int nents,
-		struct scatterlist *first_chunk)
+		struct scatterlist *first_chunk, unsigned nents_first_chunk)
 {
 	int ret;
 
 	BUG_ON(!nents);
 
-	if (first_chunk) {
-		if (nents <= SG_CHUNK_SIZE) {
+	if (first_chunk && nents_first_chunk) {
+		if (nents <= nents_first_chunk) {
 			table->nents = table->orig_nents = nents;
 			sg_init_table(table->sgl, nents);
 			return 0;
 		}
 	}
 
+	/* User supposes that the 1st SGL includes real entry */
+	if (nents_first_chunk == 1) {
+		first_chunk = NULL;
+		nents_first_chunk = 0;
+	}
+
 	ret = __sg_alloc_table(table, nents, SG_CHUNK_SIZE,
-			       first_chunk, GFP_ATOMIC, sg_pool_alloc);
+			       first_chunk, nents_first_chunk,
+			       GFP_ATOMIC, sg_pool_alloc);
 	if (unlikely(ret))
-		sg_free_table_chained(table, (bool)first_chunk);
+		sg_free_table_chained(table, nents_first_chunk);
 	return ret;
 }
 EXPORT_SYMBOL_GPL(sg_alloc_table_chained);
diff --git a/net/sunrpc/xprtrdma/svc_rdma_rw.c b/net/sunrpc/xprtrdma/svc_rdma_rw.c
index 2121c9b4d275..48fe3b16b0d9 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_rw.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_rw.c
@@ -73,7 +73,8 @@ svc_rdma_get_rw_ctxt(struct svcxprt_rdma *rdma, unsigned int sges)
 
 	ctxt->rw_sg_table.sgl = ctxt->rw_first_sgl;
 	if (sg_alloc_table_chained(&ctxt->rw_sg_table, sges,
-				   ctxt->rw_sg_table.sgl)) {
+				   ctxt->rw_sg_table.sgl,
+				   SG_CHUNK_SIZE)) {
 		kfree(ctxt);
 		ctxt = NULL;
 	}
@@ -84,7 +85,7 @@ out:
 static void svc_rdma_put_rw_ctxt(struct svcxprt_rdma *rdma,
 				 struct svc_rdma_rw_ctxt *ctxt)
 {
-	sg_free_table_chained(&ctxt->rw_sg_table, true);
+	sg_free_table_chained(&ctxt->rw_sg_table, SG_CHUNK_SIZE);
 
 	spin_lock(&rdma->sc_rw_ctxt_lock);
 	list_add(&ctxt->rw_list, &rdma->sc_rw_ctxts);
-- 
cgit v1.2.3


From aa0bfcd939c30617385ffa28682c062d78050eba Mon Sep 17 00:00:00 2001
From: Ross Zwisler <zwisler@chromium.org>
Date: Thu, 20 Jun 2019 17:05:37 -0400
Subject: mm: add filemap_fdatawait_range_keep_errors()

In the spirit of filemap_fdatawait_range() and
filemap_fdatawait_keep_errors(), introduce
filemap_fdatawait_range_keep_errors() which both takes a range upon
which to wait and does not clear errors from the address space.

Signed-off-by: Ross Zwisler <zwisler@google.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Reviewed-by: Jan Kara <jack@suse.cz>
Cc: stable@vger.kernel.org
---
 include/linux/fs.h |  2 ++
 mm/filemap.c       | 22 ++++++++++++++++++++++
 2 files changed, 24 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/fs.h b/include/linux/fs.h
index f7fdfe93e25d..79fec8a8413f 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2712,6 +2712,8 @@ extern int filemap_flush(struct address_space *);
 extern int filemap_fdatawait_keep_errors(struct address_space *mapping);
 extern int filemap_fdatawait_range(struct address_space *, loff_t lstart,
 				   loff_t lend);
+extern int filemap_fdatawait_range_keep_errors(struct address_space *mapping,
+		loff_t start_byte, loff_t end_byte);
 
 static inline int filemap_fdatawait(struct address_space *mapping)
 {
diff --git a/mm/filemap.c b/mm/filemap.c
index df2006ba0cfa..e87252ca0835 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -553,6 +553,28 @@ int filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte,
 }
 EXPORT_SYMBOL(filemap_fdatawait_range);
 
+/**
+ * filemap_fdatawait_range_keep_errors - wait for writeback to complete
+ * @mapping:		address space structure to wait for
+ * @start_byte:		offset in bytes where the range starts
+ * @end_byte:		offset in bytes where the range ends (inclusive)
+ *
+ * Walk the list of under-writeback pages of the given address space in the
+ * given range and wait for all of them.  Unlike filemap_fdatawait_range(),
+ * this function does not clear error status of the address space.
+ *
+ * Use this function if callers don't handle errors themselves.  Expected
+ * call sites are system-wide / filesystem-wide data flushers: e.g. sync(2),
+ * fsfreeze(8)
+ */
+int filemap_fdatawait_range_keep_errors(struct address_space *mapping,
+		loff_t start_byte, loff_t end_byte)
+{
+	__filemap_fdatawait_range(mapping, start_byte, end_byte);
+	return filemap_check_and_keep_errors(mapping);
+}
+EXPORT_SYMBOL(filemap_fdatawait_range_keep_errors);
+
 /**
  * file_fdatawait_range - wait for writeback to complete
  * @file:		file pointing to address space structure to wait for
-- 
cgit v1.2.3


From 6ba0e7dc64a5adcda2fbe65adc466891795d639e Mon Sep 17 00:00:00 2001
From: Ross Zwisler <zwisler@chromium.org>
Date: Thu, 20 Jun 2019 17:24:56 -0400
Subject: jbd2: introduce jbd2_inode dirty range scoping

Currently both journal_submit_inode_data_buffers() and
journal_finish_inode_data_buffers() operate on the entire address space
of each of the inodes associated with a given journal entry.  The
consequence of this is that if we have an inode where we are constantly
appending dirty pages we can end up waiting for an indefinite amount of
time in journal_finish_inode_data_buffers() while we wait for all the
pages under writeback to be written out.

The easiest way to cause this type of workload is to just dd from
/dev/zero to a file until it fills the entire filesystem.  This can
cause journal_finish_inode_data_buffers() to wait for the duration of
the entire dd operation.

We can improve this situation by scoping each of the inode dirty ranges
associated with a given transaction.  We do this via the jbd2_inode
structure so that the scoping is contained within jbd2 and so that it
follows the lifetime and locking rules for that structure.

This allows us to limit the writeback & wait in
journal_submit_inode_data_buffers() and
journal_finish_inode_data_buffers() respectively to the dirty range for
a given struct jbd2_inode, keeping us from waiting forever if the inode
in question is still being appended to.

Signed-off-by: Ross Zwisler <zwisler@google.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Reviewed-by: Jan Kara <jack@suse.cz>
Cc: stable@vger.kernel.org
---
 fs/jbd2/commit.c      | 23 +++++++++++++++++------
 fs/jbd2/journal.c     |  4 ++++
 fs/jbd2/transaction.c | 49 ++++++++++++++++++++++++++++---------------------
 include/linux/jbd2.h  | 22 ++++++++++++++++++++++
 4 files changed, 71 insertions(+), 27 deletions(-)

(limited to 'include/linux')

diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index c8c1d6cc6e5d..132fb92098c7 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -187,14 +187,15 @@ static int journal_wait_on_commit_record(journal_t *journal,
  * use writepages() because with delayed allocation we may be doing
  * block allocation in writepages().
  */
-static int journal_submit_inode_data_buffers(struct address_space *mapping)
+static int journal_submit_inode_data_buffers(struct address_space *mapping,
+		loff_t dirty_start, loff_t dirty_end)
 {
 	int ret;
 	struct writeback_control wbc = {
 		.sync_mode =  WB_SYNC_ALL,
 		.nr_to_write = mapping->nrpages * 2,
-		.range_start = 0,
-		.range_end = i_size_read(mapping->host),
+		.range_start = dirty_start,
+		.range_end = dirty_end,
 	};
 
 	ret = generic_writepages(mapping, &wbc);
@@ -218,6 +219,9 @@ static int journal_submit_data_buffers(journal_t *journal,
 
 	spin_lock(&journal->j_list_lock);
 	list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
+		loff_t dirty_start = jinode->i_dirty_start;
+		loff_t dirty_end = jinode->i_dirty_end;
+
 		if (!(jinode->i_flags & JI_WRITE_DATA))
 			continue;
 		mapping = jinode->i_vfs_inode->i_mapping;
@@ -230,7 +234,8 @@ static int journal_submit_data_buffers(journal_t *journal,
 		 * only allocated blocks here.
 		 */
 		trace_jbd2_submit_inode_data(jinode->i_vfs_inode);
-		err = journal_submit_inode_data_buffers(mapping);
+		err = journal_submit_inode_data_buffers(mapping, dirty_start,
+				dirty_end);
 		if (!ret)
 			ret = err;
 		spin_lock(&journal->j_list_lock);
@@ -257,12 +262,16 @@ static int journal_finish_inode_data_buffers(journal_t *journal,
 	/* For locking, see the comment in journal_submit_data_buffers() */
 	spin_lock(&journal->j_list_lock);
 	list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
+		loff_t dirty_start = jinode->i_dirty_start;
+		loff_t dirty_end = jinode->i_dirty_end;
+
 		if (!(jinode->i_flags & JI_WAIT_DATA))
 			continue;
 		jinode->i_flags |= JI_COMMIT_RUNNING;
 		spin_unlock(&journal->j_list_lock);
-		err = filemap_fdatawait_keep_errors(
-				jinode->i_vfs_inode->i_mapping);
+		err = filemap_fdatawait_range_keep_errors(
+				jinode->i_vfs_inode->i_mapping, dirty_start,
+				dirty_end);
 		if (!ret)
 			ret = err;
 		spin_lock(&journal->j_list_lock);
@@ -282,6 +291,8 @@ static int journal_finish_inode_data_buffers(journal_t *journal,
 				&jinode->i_transaction->t_inode_list);
 		} else {
 			jinode->i_transaction = NULL;
+			jinode->i_dirty_start = 0;
+			jinode->i_dirty_end = 0;
 		}
 	}
 	spin_unlock(&journal->j_list_lock);
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 38b426c5ed03..17f679aeba7c 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -94,6 +94,8 @@ EXPORT_SYMBOL(jbd2_journal_try_to_free_buffers);
 EXPORT_SYMBOL(jbd2_journal_force_commit);
 EXPORT_SYMBOL(jbd2_journal_inode_add_write);
 EXPORT_SYMBOL(jbd2_journal_inode_add_wait);
+EXPORT_SYMBOL(jbd2_journal_inode_ranged_write);
+EXPORT_SYMBOL(jbd2_journal_inode_ranged_wait);
 EXPORT_SYMBOL(jbd2_journal_init_jbd_inode);
 EXPORT_SYMBOL(jbd2_journal_release_jbd_inode);
 EXPORT_SYMBOL(jbd2_journal_begin_ordered_truncate);
@@ -2574,6 +2576,8 @@ void jbd2_journal_init_jbd_inode(struct jbd2_inode *jinode, struct inode *inode)
 	jinode->i_next_transaction = NULL;
 	jinode->i_vfs_inode = inode;
 	jinode->i_flags = 0;
+	jinode->i_dirty_start = 0;
+	jinode->i_dirty_end = 0;
 	INIT_LIST_HEAD(&jinode->i_list);
 }
 
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 8ca4fddc705f..990e7b5062e7 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -2565,7 +2565,7 @@ void jbd2_journal_refile_buffer(journal_t *journal, struct journal_head *jh)
  * File inode in the inode list of the handle's transaction
  */
 static int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *jinode,
-				   unsigned long flags)
+		unsigned long flags, loff_t start_byte, loff_t end_byte)
 {
 	transaction_t *transaction = handle->h_transaction;
 	journal_t *journal;
@@ -2577,26 +2577,17 @@ static int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *jinode,
 	jbd_debug(4, "Adding inode %lu, tid:%d\n", jinode->i_vfs_inode->i_ino,
 			transaction->t_tid);
 
-	/*
-	 * First check whether inode isn't already on the transaction's
-	 * lists without taking the lock. Note that this check is safe
-	 * without the lock as we cannot race with somebody removing inode
-	 * from the transaction. The reason is that we remove inode from the
-	 * transaction only in journal_release_jbd_inode() and when we commit
-	 * the transaction. We are guarded from the first case by holding
-	 * a reference to the inode. We are safe against the second case
-	 * because if jinode->i_transaction == transaction, commit code
-	 * cannot touch the transaction because we hold reference to it,
-	 * and if jinode->i_next_transaction == transaction, commit code
-	 * will only file the inode where we want it.
-	 */
-	if ((jinode->i_transaction == transaction ||
-	    jinode->i_next_transaction == transaction) &&
-	    (jinode->i_flags & flags) == flags)
-		return 0;
-
 	spin_lock(&journal->j_list_lock);
 	jinode->i_flags |= flags;
+
+	if (jinode->i_dirty_end) {
+		jinode->i_dirty_start = min(jinode->i_dirty_start, start_byte);
+		jinode->i_dirty_end = max(jinode->i_dirty_end, end_byte);
+	} else {
+		jinode->i_dirty_start = start_byte;
+		jinode->i_dirty_end = end_byte;
+	}
+
 	/* Is inode already attached where we need it? */
 	if (jinode->i_transaction == transaction ||
 	    jinode->i_next_transaction == transaction)
@@ -2631,12 +2622,28 @@ done:
 int jbd2_journal_inode_add_write(handle_t *handle, struct jbd2_inode *jinode)
 {
 	return jbd2_journal_file_inode(handle, jinode,
-				       JI_WRITE_DATA | JI_WAIT_DATA);
+			JI_WRITE_DATA | JI_WAIT_DATA, 0, LLONG_MAX);
 }
 
 int jbd2_journal_inode_add_wait(handle_t *handle, struct jbd2_inode *jinode)
 {
-	return jbd2_journal_file_inode(handle, jinode, JI_WAIT_DATA);
+	return jbd2_journal_file_inode(handle, jinode, JI_WAIT_DATA, 0,
+			LLONG_MAX);
+}
+
+int jbd2_journal_inode_ranged_write(handle_t *handle,
+		struct jbd2_inode *jinode, loff_t start_byte, loff_t length)
+{
+	return jbd2_journal_file_inode(handle, jinode,
+			JI_WRITE_DATA | JI_WAIT_DATA, start_byte,
+			start_byte + length - 1);
+}
+
+int jbd2_journal_inode_ranged_wait(handle_t *handle, struct jbd2_inode *jinode,
+		loff_t start_byte, loff_t length)
+{
+	return jbd2_journal_file_inode(handle, jinode, JI_WAIT_DATA,
+			start_byte, start_byte + length - 1);
 }
 
 /*
diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
index 5c04181b7c6d..0e0393e7f41a 100644
--- a/include/linux/jbd2.h
+++ b/include/linux/jbd2.h
@@ -451,6 +451,22 @@ struct jbd2_inode {
 	 * @i_flags: Flags of inode [j_list_lock]
 	 */
 	unsigned long i_flags;
+
+	/**
+	 * @i_dirty_start:
+	 *
+	 * Offset in bytes where the dirty range for this inode starts.
+	 * [j_list_lock]
+	 */
+	loff_t i_dirty_start;
+
+	/**
+	 * @i_dirty_end:
+	 *
+	 * Inclusive offset in bytes where the dirty range for this inode
+	 * ends. [j_list_lock]
+	 */
+	loff_t i_dirty_end;
 };
 
 struct jbd2_revoke_table_s;
@@ -1397,6 +1413,12 @@ extern int	   jbd2_journal_force_commit(journal_t *);
 extern int	   jbd2_journal_force_commit_nested(journal_t *);
 extern int	   jbd2_journal_inode_add_write(handle_t *handle, struct jbd2_inode *inode);
 extern int	   jbd2_journal_inode_add_wait(handle_t *handle, struct jbd2_inode *inode);
+extern int	   jbd2_journal_inode_ranged_write(handle_t *handle,
+			struct jbd2_inode *inode, loff_t start_byte,
+			loff_t length);
+extern int	   jbd2_journal_inode_ranged_wait(handle_t *handle,
+			struct jbd2_inode *inode, loff_t start_byte,
+			loff_t length);
 extern int	   jbd2_journal_begin_ordered_truncate(journal_t *journal,
 				struct jbd2_inode *inode, loff_t new_size);
 extern void	   jbd2_journal_init_jbd_inode(struct jbd2_inode *jinode, struct inode *inode);
-- 
cgit v1.2.3


From 9382cde8cd8fb941fc333b644a5772d02e1ff924 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Thu, 20 Jun 2019 17:32:21 -0400
Subject: jbd2: drop declaration of journal_sync_buffer()

The journal_sync_buffer() function was never carried over from jbd to
jbd2.  So get rid of the vestigial declaration of this (non-existent)
function.

Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/jbd2/journal.c    | 3 ---
 include/linux/jbd2.h | 1 -
 2 files changed, 4 deletions(-)

(limited to 'include/linux')

diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 17f679aeba7c..953990eb70a9 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -66,9 +66,6 @@ EXPORT_SYMBOL(jbd2_journal_get_undo_access);
 EXPORT_SYMBOL(jbd2_journal_set_triggers);
 EXPORT_SYMBOL(jbd2_journal_dirty_metadata);
 EXPORT_SYMBOL(jbd2_journal_forget);
-#if 0
-EXPORT_SYMBOL(journal_sync_buffer);
-#endif
 EXPORT_SYMBOL(jbd2_journal_flush);
 EXPORT_SYMBOL(jbd2_journal_revoke);
 
diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
index 0e0393e7f41a..df03825ad1a1 100644
--- a/include/linux/jbd2.h
+++ b/include/linux/jbd2.h
@@ -1373,7 +1373,6 @@ void		 jbd2_journal_set_triggers(struct buffer_head *,
 					   struct jbd2_buffer_trigger_type *type);
 extern int	 jbd2_journal_dirty_metadata (handle_t *, struct buffer_head *);
 extern int	 jbd2_journal_forget (handle_t *, struct buffer_head *);
-extern void	 journal_sync_buffer (struct buffer_head *);
 extern int	 jbd2_journal_invalidatepage(journal_t *,
 				struct page *, unsigned int, unsigned int);
 extern int	 jbd2_journal_try_to_free_buffers(journal_t *, struct page *, gfp_t);
-- 
cgit v1.2.3


From 4a96895f74c9633b51427fd080ab70fa62b65bc4 Mon Sep 17 00:00:00 2001
From: Yegor Yefremov <yegorslists@googlemail.com>
Date: Thu, 20 Jun 2019 08:24:20 +0200
Subject: tty/serial/8250: use mctrl_gpio helpers

This patch permits the use of GPIOs to control
the CTS/RTS/DTR/DSR/DCD/RI signals.

Changed by Stefan:
Only call mctrl_gpio_init() if the device has no ACPI companion device,
so as not to break existing ACPI-based systems. Also only use the
mctrl_gpio_ functions when "gpios" is available.

Use MSR / MCR <-> TIOCM wrapper functions.

Signed-off-by: Yegor Yefremov <yegorslists@googlemail.com>
Signed-off-by: Stefan Roese <sr@denx.de>
Reviewed-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Reviewed-by: Mika Westerberg <mika.westerberg@linux.intel.com>
Tested-by: Yegor Yefremov <yegorslists@googlemail.com>
Cc: Mika Westerberg <mika.westerberg@linux.intel.com>
Cc: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Cc: Giulio Benetti <giulio.benetti@micronovasrl.com>
Cc: Yegor Yefremov <yegorslists@googlemail.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 Documentation/devicetree/bindings/serial/8250.txt | 19 +++++++++++++++
 drivers/tty/serial/8250/8250.h                    | 18 +++++++++++++-
 drivers/tty/serial/8250/8250_core.c               | 17 +++++++++++++
 drivers/tty/serial/8250/8250_omap.c               | 29 +++++++++++++----------
 drivers/tty/serial/8250/8250_port.c               | 11 ++++++++-
 drivers/tty/serial/8250/Kconfig                   |  1 +
 include/linux/serial_8250.h                       |  1 +
 7 files changed, 81 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/devicetree/bindings/serial/8250.txt b/Documentation/devicetree/bindings/serial/8250.txt
index 3cba12f855b7..20d351f268ef 100644
--- a/Documentation/devicetree/bindings/serial/8250.txt
+++ b/Documentation/devicetree/bindings/serial/8250.txt
@@ -53,6 +53,9 @@ Optional properties:
   programmable TX FIFO thresholds.
 - resets : phandle + reset specifier pairs
 - overrun-throttle-ms : how long to pause uart rx when input overrun is encountered.
+- {rts,cts,dtr,dsr,rng,dcd}-gpios: specify a GPIO for RTS/CTS/DTR/DSR/RI/DCD
+  line respectively. It will use specified GPIO instead of the peripheral
+  function pin for the UART feature. If unsure, don't specify this property.
 
 Note:
 * fsl,ns16550:
@@ -74,3 +77,19 @@ Example:
 		interrupts = <10>;
 		reg-shift = <2>;
 	};
+
+Example for OMAP UART using GPIO-based modem control signals:
+
+	uart4: serial@49042000 {
+		compatible = "ti,omap3-uart";
+		reg = <0x49042000 0x400>;
+		interrupts = <80>;
+		ti,hwmods = "uart4";
+		clock-frequency = <48000000>;
+		cts-gpios = <&gpio3 5 GPIO_ACTIVE_LOW>;
+		rts-gpios = <&gpio3 6 GPIO_ACTIVE_LOW>;
+		dtr-gpios = <&gpio1 12 GPIO_ACTIVE_LOW>;
+		dsr-gpios = <&gpio1 13 GPIO_ACTIVE_LOW>;
+		dcd-gpios = <&gpio1 14 GPIO_ACTIVE_LOW>;
+		rng-gpios = <&gpio1 15 GPIO_ACTIVE_LOW>;
+	};
diff --git a/drivers/tty/serial/8250/8250.h b/drivers/tty/serial/8250/8250.h
index 57db8c1689af..33ad9d6de532 100644
--- a/drivers/tty/serial/8250/8250.h
+++ b/drivers/tty/serial/8250/8250.h
@@ -11,6 +11,8 @@
 #include <linux/serial_reg.h>
 #include <linux/dmaengine.h>
 
+#include "../serial_mctrl_gpio.h"
+
 struct uart_8250_dma {
 	int (*tx_dma)(struct uart_8250_port *p);
 	int (*rx_dma)(struct uart_8250_port *p);
@@ -214,11 +216,25 @@ static inline int serial8250_MSR_to_TIOCM(int msr)
 static inline void serial8250_out_MCR(struct uart_8250_port *up, int value)
 {
 	serial_out(up, UART_MCR, value);
+
+	if (up->gpios)
+		mctrl_gpio_set(up->gpios, serial8250_MCR_to_TIOCM(value));
 }
 
 static inline int serial8250_in_MCR(struct uart_8250_port *up)
 {
-	return serial_in(up, UART_MCR);
+	int mctrl;
+
+	mctrl = serial_in(up, UART_MCR);
+
+	if (up->gpios) {
+		unsigned int mctrl_gpio = 0;
+
+		mctrl_gpio = mctrl_gpio_get_outputs(up->gpios, &mctrl_gpio);
+		mctrl |= serial8250_TIOCM_to_MCR(mctrl_gpio);
+	}
+
+	return mctrl;
 }
 
 #if defined(__alpha__) && !defined(CONFIG_PCI)
diff --git a/drivers/tty/serial/8250/8250_core.c b/drivers/tty/serial/8250/8250_core.c
index e441221e04b9..a4470771005f 100644
--- a/drivers/tty/serial/8250/8250_core.c
+++ b/drivers/tty/serial/8250/8250_core.c
@@ -14,6 +14,7 @@
  *	      serial8250_register_8250_port() ports
  */
 
+#include <linux/acpi.h>
 #include <linux/module.h>
 #include <linux/moduleparam.h>
 #include <linux/ioport.h>
@@ -982,6 +983,8 @@ int serial8250_register_8250_port(struct uart_8250_port *up)
 
 	uart = serial8250_find_match_or_unused(&up->port);
 	if (uart && uart->port.type != PORT_8250_CIR) {
+		struct mctrl_gpios *gpios;
+
 		if (uart->port.dev)
 			uart_remove_one_port(&serial8250_reg, &uart->port);
 
@@ -1016,6 +1019,20 @@ int serial8250_register_8250_port(struct uart_8250_port *up)
 		if (up->port.flags & UPF_FIXED_TYPE)
 			uart->port.type = up->port.type;
 
+		/*
+		 * Only call mctrl_gpio_init(), if the device has no ACPI
+		 * companion device
+		 */
+		if (!has_acpi_companion(uart->port.dev)) {
+			gpios = mctrl_gpio_init(&uart->port, 0);
+			if (IS_ERR(gpios)) {
+				if (PTR_ERR(gpios) != -ENOSYS)
+					return PTR_ERR(gpios);
+			} else {
+				uart->gpios = gpios;
+			}
+		}
+
 		serial8250_set_defaults(uart);
 
 		/* Possibly override default I/O functions.  */
diff --git a/drivers/tty/serial/8250/8250_omap.c b/drivers/tty/serial/8250/8250_omap.c
index ed25cfc3be13..3ef65cbd2478 100644
--- a/drivers/tty/serial/8250/8250_omap.c
+++ b/drivers/tty/serial/8250/8250_omap.c
@@ -141,18 +141,20 @@ static void omap8250_set_mctrl(struct uart_port *port, unsigned int mctrl)
 
 	serial8250_do_set_mctrl(port, mctrl);
 
-	/*
-	 * Turn off autoRTS if RTS is lowered and restore autoRTS setting
-	 * if RTS is raised
-	 */
-	lcr = serial_in(up, UART_LCR);
-	serial_out(up, UART_LCR, UART_LCR_CONF_MODE_B);
-	if ((mctrl & TIOCM_RTS) && (port->status & UPSTAT_AUTORTS))
-		priv->efr |= UART_EFR_RTS;
-	else
-		priv->efr &= ~UART_EFR_RTS;
-	serial_out(up, UART_EFR, priv->efr);
-	serial_out(up, UART_LCR, lcr);
+	if (!up->gpios) {
+		/*
+		 * Turn off autoRTS if RTS is lowered and restore autoRTS
+		 * setting if RTS is raised
+		 */
+		lcr = serial_in(up, UART_LCR);
+		serial_out(up, UART_LCR, UART_LCR_CONF_MODE_B);
+		if ((mctrl & TIOCM_RTS) && (port->status & UPSTAT_AUTORTS))
+			priv->efr |= UART_EFR_RTS;
+		else
+			priv->efr &= ~UART_EFR_RTS;
+		serial_out(up, UART_EFR, priv->efr);
+		serial_out(up, UART_LCR, lcr);
+	}
 }
 
 /*
@@ -453,7 +455,8 @@ static void omap_8250_set_termios(struct uart_port *port,
 	priv->efr = 0;
 	up->port.status &= ~(UPSTAT_AUTOCTS | UPSTAT_AUTORTS | UPSTAT_AUTOXOFF);
 
-	if (termios->c_cflag & CRTSCTS && up->port.flags & UPF_HARD_FLOW) {
+	if (termios->c_cflag & CRTSCTS && up->port.flags & UPF_HARD_FLOW &&
+	    !up->gpios) {
 		/* Enable AUTOCTS (autoRTS is enabled when RTS is raised) */
 		up->port.status |= UPSTAT_AUTOCTS | UPSTAT_AUTORTS;
 		priv->efr |= UART_EFR_CTS;
diff --git a/drivers/tty/serial/8250/8250_port.c b/drivers/tty/serial/8250/8250_port.c
index 74d81f1701ed..a6fabc7e3b13 100644
--- a/drivers/tty/serial/8250/8250_port.c
+++ b/drivers/tty/serial/8250/8250_port.c
@@ -1656,6 +1656,8 @@ static void serial8250_disable_ms(struct uart_port *port)
 	if (up->bugs & UART_BUG_NOMSR)
 		return;
 
+	mctrl_gpio_disable_ms(up->gpios);
+
 	up->ier &= ~UART_IER_MSI;
 	serial_port_out(port, UART_IER, up->ier);
 }
@@ -1668,6 +1670,8 @@ static void serial8250_enable_ms(struct uart_port *port)
 	if (up->bugs & UART_BUG_NOMSR)
 		return;
 
+	mctrl_gpio_enable_ms(up->gpios);
+
 	up->ier |= UART_IER_MSI;
 
 	serial8250_rpm_get(up);
@@ -1939,12 +1943,17 @@ unsigned int serial8250_do_get_mctrl(struct uart_port *port)
 {
 	struct uart_8250_port *up = up_to_u8250p(port);
 	unsigned int status;
+	unsigned int val;
 
 	serial8250_rpm_get(up);
 	status = serial8250_modem_status(up);
 	serial8250_rpm_put(up);
 
-	return serial8250_MSR_to_TIOCM(status);
+	val = serial8250_MSR_to_TIOCM(status);
+	if (up->gpios)
+		return mctrl_gpio_get(up->gpios, &val);
+
+	return val;
 }
 EXPORT_SYMBOL_GPL(serial8250_do_get_mctrl);
 
diff --git a/drivers/tty/serial/8250/Kconfig b/drivers/tty/serial/8250/Kconfig
index 296115f6a4d8..509f6a3bb9ff 100644
--- a/drivers/tty/serial/8250/Kconfig
+++ b/drivers/tty/serial/8250/Kconfig
@@ -8,6 +8,7 @@ config SERIAL_8250
 	tristate "8250/16550 and compatible serial support"
 	depends on !S390
 	select SERIAL_CORE
+	select SERIAL_MCTRL_GPIO if GPIOLIB
 	---help---
 	  This selects whether you want to include the driver for the standard
 	  serial ports.  The standard answer is Y.  People who might say N
diff --git a/include/linux/serial_8250.h b/include/linux/serial_8250.h
index 5e0b59422a68..bb2bc99388ca 100644
--- a/include/linux/serial_8250.h
+++ b/include/linux/serial_8250.h
@@ -110,6 +110,7 @@ struct uart_8250_port {
 						 *   if no_console_suspend
 						 */
 	unsigned char		probe;
+	struct mctrl_gpios	*gpios;
 #define UART_PROBE_RSA	(1 << 0)
 
 	/*
-- 
cgit v1.2.3


From 150d71f725fd2f5a0015b7fa8df0816a207d4e4b Mon Sep 17 00:00:00 2001
From: James Smart <jsmart2021@gmail.com>
Date: Tue, 14 May 2019 14:58:03 -0700
Subject: nvmet-fc: add transport discovery change event callback support

This patch adds support for the nvmet discovery_change transport op.
In turn, the transport adds its own LLDD api callback discovery_event
op to request the LLDD to generate an RSCN for the discovery change.

Signed-off-by: James Smart <jsmart2021@gmail.com>
Reviewed-by: Hannes Reinecke <hare@suse.com>
Reviewed-by: Arun Easi <aeasi@marvell.com>
Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/nvme/target/fc.c       | 11 +++++++++++
 include/linux/nvme-fc-driver.h |  6 ++++++
 2 files changed, 17 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/nvme/target/fc.c b/drivers/nvme/target/fc.c
index 508661af0f50..1f252c9a953a 100644
--- a/drivers/nvme/target/fc.c
+++ b/drivers/nvme/target/fc.c
@@ -2549,6 +2549,16 @@ nvmet_fc_remove_port(struct nvmet_port *port)
 	kfree(pe);
 }
 
+static void
+nvmet_fc_discovery_chg(struct nvmet_port *port)
+{
+	struct nvmet_fc_port_entry *pe = port->priv;
+	struct nvmet_fc_tgtport *tgtport = pe->tgtport;
+
+	if (tgtport && tgtport->ops->discovery_event)
+		tgtport->ops->discovery_event(&tgtport->fc_target_port);
+}
+
 static const struct nvmet_fabrics_ops nvmet_fc_tgt_fcp_ops = {
 	.owner			= THIS_MODULE,
 	.type			= NVMF_TRTYPE_FC,
@@ -2557,6 +2567,7 @@ static const struct nvmet_fabrics_ops nvmet_fc_tgt_fcp_ops = {
 	.remove_port		= nvmet_fc_remove_port,
 	.queue_response		= nvmet_fc_fcp_nvme_cmd_done,
 	.delete_ctrl		= nvmet_fc_delete_ctrl,
+	.discovery_chg		= nvmet_fc_discovery_chg,
 };
 
 static int __init nvmet_fc_init_module(void)
diff --git a/include/linux/nvme-fc-driver.h b/include/linux/nvme-fc-driver.h
index c48e96436f56..98d904961b33 100644
--- a/include/linux/nvme-fc-driver.h
+++ b/include/linux/nvme-fc-driver.h
@@ -791,6 +791,11 @@ struct nvmet_fc_target_port {
  *       nvmefc_tgt_fcp_req.
  *       Entrypoint is Optional.
  *
+ * @discovery_event:  Called by the transport to generate an RSCN
+ *       change notifications to NVME initiators. The RSCN notifications
+ *       should cause the initiator to rescan the discovery controller
+ *       on the targetport.
+ *
  * @max_hw_queues:  indicates the maximum number of hw queues the LLDD
  *       supports for cpu affinitization.
  *       Value is Mandatory. Must be at least 1.
@@ -832,6 +837,7 @@ struct nvmet_fc_target_template {
 				struct nvmefc_tgt_fcp_req *fcpreq);
 	void (*defer_rcv)(struct nvmet_fc_target_port *tgtport,
 				struct nvmefc_tgt_fcp_req *fcpreq);
+	void (*discovery_event)(struct nvmet_fc_target_port *tgtport);
 
 	u32	max_hw_queues;
 	u16	max_sgl_segments;
-- 
cgit v1.2.3


From 7a1f46e3f75cff5042dfa1bb80c9929a0e412abc Mon Sep 17 00:00:00 2001
From: Minwoo Im <minwoo.im.dev@gmail.com>
Date: Thu, 6 Jun 2019 14:30:14 +0900
Subject: nvme: introduce nvme_is_fabrics to check fabrics cmd

This patch introduces a nvme_is_fabrics() inline function to check
whether or not the given command structure is for fabrics.

Signed-off-by: Minwoo Im <minwoo.im.dev@gmail.com>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/nvme/host/fabrics.c       | 2 +-
 drivers/nvme/target/core.c        | 2 +-
 drivers/nvme/target/fabrics-cmd.c | 2 +-
 drivers/nvme/target/fc.c          | 2 +-
 include/linux/nvme.h              | 7 ++++++-
 5 files changed, 10 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c
index 5838f7cd53ac..1994d5b42f94 100644
--- a/drivers/nvme/host/fabrics.c
+++ b/drivers/nvme/host/fabrics.c
@@ -578,7 +578,7 @@ bool __nvmf_check_ready(struct nvme_ctrl *ctrl, struct request *rq,
 	switch (ctrl->state) {
 	case NVME_CTRL_NEW:
 	case NVME_CTRL_CONNECTING:
-		if (req->cmd->common.opcode == nvme_fabrics_command &&
+		if (nvme_is_fabrics(req->cmd) &&
 		    req->cmd->fabrics.fctype == nvme_fabrics_type_connect)
 			return true;
 		break;
diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c
index 43e8c4adc1f4..0587707b1a25 100644
--- a/drivers/nvme/target/core.c
+++ b/drivers/nvme/target/core.c
@@ -873,7 +873,7 @@ bool nvmet_req_init(struct nvmet_req *req, struct nvmet_cq *cq,
 		status = nvmet_parse_connect_cmd(req);
 	else if (likely(req->sq->qid != 0))
 		status = nvmet_parse_io_cmd(req);
-	else if (req->cmd->common.opcode == nvme_fabrics_command)
+	else if (nvme_is_fabrics(req->cmd))
 		status = nvmet_parse_fabrics_cmd(req);
 	else if (req->sq->ctrl->subsys->type == NVME_NQN_DISC)
 		status = nvmet_parse_discovery_cmd(req);
diff --git a/drivers/nvme/target/fabrics-cmd.c b/drivers/nvme/target/fabrics-cmd.c
index 3b9f79aba98f..d16b55ffe79f 100644
--- a/drivers/nvme/target/fabrics-cmd.c
+++ b/drivers/nvme/target/fabrics-cmd.c
@@ -268,7 +268,7 @@ u16 nvmet_parse_connect_cmd(struct nvmet_req *req)
 {
 	struct nvme_command *cmd = req->cmd;
 
-	if (cmd->common.opcode != nvme_fabrics_command) {
+	if (!nvme_is_fabrics(cmd)) {
 		pr_err("invalid command 0x%x on unconnected queue.\n",
 			cmd->fabrics.opcode);
 		req->error_loc = offsetof(struct nvme_common_command, opcode);
diff --git a/drivers/nvme/target/fc.c b/drivers/nvme/target/fc.c
index 1f252c9a953a..ce8d819f86cc 100644
--- a/drivers/nvme/target/fc.c
+++ b/drivers/nvme/target/fc.c
@@ -1806,7 +1806,7 @@ nvmet_fc_prep_fcp_rsp(struct nvmet_fc_tgtport *tgtport,
 	 */
 	rspcnt = atomic_inc_return(&fod->queue->zrspcnt);
 	if (!(rspcnt % fod->queue->ersp_ratio) ||
-	    sqe->opcode == nvme_fabrics_command ||
+	    nvme_is_fabrics((struct nvme_command *) sqe) ||
 	    xfr_length != fod->req.transfer_len ||
 	    (le16_to_cpu(cqe->status) & 0xFFFE) || cqewd[0] || cqewd[1] ||
 	    (sqe->flags & (NVME_CMD_FUSE_FIRST | NVME_CMD_FUSE_SECOND)) ||
diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index 8028adacaff3..7080923e78d1 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -1165,6 +1165,11 @@ struct nvme_command {
 	};
 };
 
+static inline bool nvme_is_fabrics(struct nvme_command *cmd)
+{
+	return cmd->common.opcode == nvme_fabrics_command;
+}
+
 struct nvme_error_slot {
 	__le64		error_count;
 	__le16		sqid;
@@ -1186,7 +1191,7 @@ static inline bool nvme_is_write(struct nvme_command *cmd)
 	 *
 	 * Why can't we simply have a Fabrics In and Fabrics out command?
 	 */
-	if (unlikely(cmd->common.opcode == nvme_fabrics_command))
+	if (unlikely(nvme_is_fabrics(cmd)))
 		return cmd->fabrics.fctype & 1;
 	return cmd->common.opcode & 1;
 }
-- 
cgit v1.2.3


From 26f2990d85838caa650744a0ded9e38988a2bd7f Mon Sep 17 00:00:00 2001
From: Minwoo Im <minwoo.im.dev@gmail.com>
Date: Wed, 12 Jun 2019 21:45:30 +0900
Subject: nvme-trace: move opcode symbol print to nvme.h

The following patches are going to provide target-side tracing, which
might need these kinds of macros.  It would be great if they could be
shared between both the host and target sides.

Signed-off-by: Minwoo Im <minwoo.im.dev@gmail.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/nvme/host/trace.h | 44 --------------------------------------------
 include/linux/nvme.h      | 45 +++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 45 insertions(+), 44 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/nvme/host/trace.h b/drivers/nvme/host/trace.h
index e71502d141ed..62ee29107c32 100644
--- a/drivers/nvme/host/trace.h
+++ b/drivers/nvme/host/trace.h
@@ -16,50 +16,6 @@
 
 #include "nvme.h"
 
-#define nvme_admin_opcode_name(opcode)	{ opcode, #opcode }
-#define show_admin_opcode_name(val)					\
-	__print_symbolic(val,						\
-		nvme_admin_opcode_name(nvme_admin_delete_sq),		\
-		nvme_admin_opcode_name(nvme_admin_create_sq),		\
-		nvme_admin_opcode_name(nvme_admin_get_log_page),	\
-		nvme_admin_opcode_name(nvme_admin_delete_cq),		\
-		nvme_admin_opcode_name(nvme_admin_create_cq),		\
-		nvme_admin_opcode_name(nvme_admin_identify),		\
-		nvme_admin_opcode_name(nvme_admin_abort_cmd),		\
-		nvme_admin_opcode_name(nvme_admin_set_features),	\
-		nvme_admin_opcode_name(nvme_admin_get_features),	\
-		nvme_admin_opcode_name(nvme_admin_async_event),		\
-		nvme_admin_opcode_name(nvme_admin_ns_mgmt),		\
-		nvme_admin_opcode_name(nvme_admin_activate_fw),		\
-		nvme_admin_opcode_name(nvme_admin_download_fw),		\
-		nvme_admin_opcode_name(nvme_admin_ns_attach),		\
-		nvme_admin_opcode_name(nvme_admin_keep_alive),		\
-		nvme_admin_opcode_name(nvme_admin_directive_send),	\
-		nvme_admin_opcode_name(nvme_admin_directive_recv),	\
-		nvme_admin_opcode_name(nvme_admin_dbbuf),		\
-		nvme_admin_opcode_name(nvme_admin_format_nvm),		\
-		nvme_admin_opcode_name(nvme_admin_security_send),	\
-		nvme_admin_opcode_name(nvme_admin_security_recv),	\
-		nvme_admin_opcode_name(nvme_admin_sanitize_nvm))
-
-#define nvme_opcode_name(opcode)	{ opcode, #opcode }
-#define show_nvm_opcode_name(val)				\
-	__print_symbolic(val,					\
-		nvme_opcode_name(nvme_cmd_flush),		\
-		nvme_opcode_name(nvme_cmd_write),		\
-		nvme_opcode_name(nvme_cmd_read),		\
-		nvme_opcode_name(nvme_cmd_write_uncor),		\
-		nvme_opcode_name(nvme_cmd_compare),		\
-		nvme_opcode_name(nvme_cmd_write_zeroes),	\
-		nvme_opcode_name(nvme_cmd_dsm),			\
-		nvme_opcode_name(nvme_cmd_resv_register),	\
-		nvme_opcode_name(nvme_cmd_resv_report),		\
-		nvme_opcode_name(nvme_cmd_resv_acquire),	\
-		nvme_opcode_name(nvme_cmd_resv_release))
-
-#define show_opcode_name(qid, opcode)					\
-	(qid ? show_nvm_opcode_name(opcode) : show_admin_opcode_name(opcode))
-
 const char *nvme_trace_parse_admin_cmd(struct trace_seq *p, u8 opcode,
 		u8 *cdw10);
 const char *nvme_trace_parse_nvm_cmd(struct trace_seq *p, u8 opcode,
diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index 7080923e78d1..86b3d04baf20 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -562,6 +562,22 @@ enum nvme_opcode {
 	nvme_cmd_resv_release	= 0x15,
 };
 
+#define nvme_opcode_name(opcode)	{ opcode, #opcode }
+#define show_nvm_opcode_name(val)				\
+	__print_symbolic(val,					\
+		nvme_opcode_name(nvme_cmd_flush),		\
+		nvme_opcode_name(nvme_cmd_write),		\
+		nvme_opcode_name(nvme_cmd_read),		\
+		nvme_opcode_name(nvme_cmd_write_uncor),		\
+		nvme_opcode_name(nvme_cmd_compare),		\
+		nvme_opcode_name(nvme_cmd_write_zeroes),	\
+		nvme_opcode_name(nvme_cmd_dsm),			\
+		nvme_opcode_name(nvme_cmd_resv_register),	\
+		nvme_opcode_name(nvme_cmd_resv_report),		\
+		nvme_opcode_name(nvme_cmd_resv_acquire),	\
+		nvme_opcode_name(nvme_cmd_resv_release))
+
+
 /*
  * Descriptor subtype - lower 4 bits of nvme_(keyed_)sgl_desc identifier
  *
@@ -794,6 +810,35 @@ enum nvme_admin_opcode {
 	nvme_admin_sanitize_nvm		= 0x84,
 };
 
+#define nvme_admin_opcode_name(opcode)	{ opcode, #opcode }
+#define show_admin_opcode_name(val)					\
+	__print_symbolic(val,						\
+		nvme_admin_opcode_name(nvme_admin_delete_sq),		\
+		nvme_admin_opcode_name(nvme_admin_create_sq),		\
+		nvme_admin_opcode_name(nvme_admin_get_log_page),	\
+		nvme_admin_opcode_name(nvme_admin_delete_cq),		\
+		nvme_admin_opcode_name(nvme_admin_create_cq),		\
+		nvme_admin_opcode_name(nvme_admin_identify),		\
+		nvme_admin_opcode_name(nvme_admin_abort_cmd),		\
+		nvme_admin_opcode_name(nvme_admin_set_features),	\
+		nvme_admin_opcode_name(nvme_admin_get_features),	\
+		nvme_admin_opcode_name(nvme_admin_async_event),		\
+		nvme_admin_opcode_name(nvme_admin_ns_mgmt),		\
+		nvme_admin_opcode_name(nvme_admin_activate_fw),		\
+		nvme_admin_opcode_name(nvme_admin_download_fw),		\
+		nvme_admin_opcode_name(nvme_admin_ns_attach),		\
+		nvme_admin_opcode_name(nvme_admin_keep_alive),		\
+		nvme_admin_opcode_name(nvme_admin_directive_send),	\
+		nvme_admin_opcode_name(nvme_admin_directive_recv),	\
+		nvme_admin_opcode_name(nvme_admin_dbbuf),		\
+		nvme_admin_opcode_name(nvme_admin_format_nvm),		\
+		nvme_admin_opcode_name(nvme_admin_security_send),	\
+		nvme_admin_opcode_name(nvme_admin_security_recv),	\
+		nvme_admin_opcode_name(nvme_admin_sanitize_nvm))
+
+#define show_opcode_name(qid, opcode)					\
+	(qid ? show_nvm_opcode_name(opcode) : show_admin_opcode_name(opcode))
+
 enum {
 	NVME_QUEUE_PHYS_CONTIG	= (1 << 0),
 	NVME_CQ_IRQ_ENABLED	= (1 << 1),
-- 
cgit v1.2.3


From ad795e47cdef078bfd9e48745040d12104005aab Mon Sep 17 00:00:00 2001
From: Minwoo Im <minwoo.im.dev@gmail.com>
Date: Wed, 12 Jun 2019 21:45:31 +0900
Subject: nvme-trace: support for fabrics commands in host-side

This patch introduces a fabrics command tracing feature on the host side.
It does not change the existing host-side tracing; it only adds parsing
of fabrics commands in the cmd=() format.

Signed-off-by: Minwoo Im <minwoo.im.dev@gmail.com>
[hch: fixed some whitespace damage]
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/nvme/host/trace.c | 63 +++++++++++++++++++++++++++++++++++++++++++++++
 drivers/nvme/host/trace.h | 20 ++++++++++-----
 include/linux/nvme.h      | 20 ++++++++++++---
 3 files changed, 94 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/nvme/host/trace.c b/drivers/nvme/host/trace.c
index 14b0d2993cbe..f01ad0fd60bb 100644
--- a/drivers/nvme/host/trace.c
+++ b/drivers/nvme/host/trace.c
@@ -135,6 +135,69 @@ const char *nvme_trace_parse_nvm_cmd(struct trace_seq *p,
 	}
 }
 
+static const char *nvme_trace_fabrics_property_set(struct trace_seq *p, u8 *spc)
+{
+	const char *ret = trace_seq_buffer_ptr(p);
+	u8 attrib = spc[0];
+	u32 ofst = get_unaligned_le32(spc + 4);
+	u64 value = get_unaligned_le64(spc + 8);
+
+	trace_seq_printf(p, "attrib=%u, ofst=0x%x, value=0x%llx",
+			 attrib, ofst, value);
+	trace_seq_putc(p, 0);
+	return ret;
+}
+
+static const char *nvme_trace_fabrics_connect(struct trace_seq *p, u8 *spc)
+{
+	const char *ret = trace_seq_buffer_ptr(p);
+	u16 recfmt = get_unaligned_le16(spc);
+	u16 qid = get_unaligned_le16(spc + 2);
+	u16 sqsize = get_unaligned_le16(spc + 4);
+	u8 cattr = spc[6];
+	u32 kato = get_unaligned_le32(spc + 8);
+
+	trace_seq_printf(p, "recfmt=%u, qid=%u, sqsize=%u, cattr=%u, kato=%u",
+			 recfmt, qid, sqsize, cattr, kato);
+	trace_seq_putc(p, 0);
+	return ret;
+}
+
+static const char *nvme_trace_fabrics_property_get(struct trace_seq *p, u8 *spc)
+{
+	const char *ret = trace_seq_buffer_ptr(p);
+	u8 attrib = spc[0];
+	u32 ofst = get_unaligned_le32(spc + 4);
+
+	trace_seq_printf(p, "attrib=%u, ofst=0x%x", attrib, ofst);
+	trace_seq_putc(p, 0);
+	return ret;
+}
+
+static const char *nvme_trace_fabrics_common(struct trace_seq *p, u8 *spc)
+{
+	const char *ret = trace_seq_buffer_ptr(p);
+
+	trace_seq_printf(p, "spcecific=%*ph", 24, spc);
+	trace_seq_putc(p, 0);
+	return ret;
+}
+
+const char *nvme_trace_parse_fabrics_cmd(struct trace_seq *p,
+		u8 fctype, u8 *spc)
+{
+	switch (fctype) {
+	case nvme_fabrics_type_property_set:
+		return nvme_trace_fabrics_property_set(p, spc);
+	case nvme_fabrics_type_connect:
+		return nvme_trace_fabrics_connect(p, spc);
+	case nvme_fabrics_type_property_get:
+		return nvme_trace_fabrics_property_get(p, spc);
+	default:
+		return nvme_trace_fabrics_common(p, spc);
+	}
+}
+
 const char *nvme_trace_disk_name(struct trace_seq *p, char *name)
 {
 	const char *ret = trace_seq_buffer_ptr(p);
diff --git a/drivers/nvme/host/trace.h b/drivers/nvme/host/trace.h
index 62ee29107c32..19a18c87fb7b 100644
--- a/drivers/nvme/host/trace.h
+++ b/drivers/nvme/host/trace.h
@@ -20,11 +20,15 @@ const char *nvme_trace_parse_admin_cmd(struct trace_seq *p, u8 opcode,
 		u8 *cdw10);
 const char *nvme_trace_parse_nvm_cmd(struct trace_seq *p, u8 opcode,
 		u8 *cdw10);
+const char *nvme_trace_parse_fabrics_cmd(struct trace_seq *p, u8 fctype,
+		u8 *spc);
 
-#define parse_nvme_cmd(qid, opcode, cdw10) 			\
-	(qid ?							\
-	 nvme_trace_parse_nvm_cmd(p, opcode, cdw10) : 		\
-	 nvme_trace_parse_admin_cmd(p, opcode, cdw10))
+#define parse_nvme_cmd(qid, opcode, fctype, cdw10)			\
+	((opcode) == nvme_fabrics_command ?				\
+	 nvme_trace_parse_fabrics_cmd(p, fctype, cdw10) :		\
+	((qid) ?							\
+	 nvme_trace_parse_nvm_cmd(p, opcode, cdw10) :			\
+	 nvme_trace_parse_admin_cmd(p, opcode, cdw10)))
 
 const char *nvme_trace_disk_name(struct trace_seq *p, char *name);
 #define __print_disk_name(name)				\
@@ -49,6 +53,7 @@ TRACE_EVENT(nvme_setup_cmd,
 		__field(int, qid)
 		__field(u8, opcode)
 		__field(u8, flags)
+		__field(u8, fctype)
 		__field(u16, cid)
 		__field(u32, nsid)
 		__field(u64, metadata)
@@ -62,6 +67,7 @@ TRACE_EVENT(nvme_setup_cmd,
 		__entry->cid = cmd->common.command_id;
 		__entry->nsid = le32_to_cpu(cmd->common.nsid);
 		__entry->metadata = le64_to_cpu(cmd->common.metadata);
+		__entry->fctype = cmd->fabrics.fctype;
 		__assign_disk_name(__entry->disk, req->rq_disk);
 		memcpy(__entry->cdw10, &cmd->common.cdw10,
 			sizeof(__entry->cdw10));
@@ -70,8 +76,10 @@ TRACE_EVENT(nvme_setup_cmd,
 		      __entry->ctrl_id, __print_disk_name(__entry->disk),
 		      __entry->qid, __entry->cid, __entry->nsid,
 		      __entry->flags, __entry->metadata,
-		      show_opcode_name(__entry->qid, __entry->opcode),
-		      parse_nvme_cmd(__entry->qid, __entry->opcode, __entry->cdw10))
+		      show_opcode_name(__entry->qid, __entry->opcode,
+				__entry->fctype),
+		      parse_nvme_cmd(__entry->qid, __entry->opcode,
+				__entry->fctype, __entry->cdw10))
 );
 
 TRACE_EVENT(nvme_complete_rq,
diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index 86b3d04baf20..d98b2d8baf4e 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -836,9 +836,6 @@ enum nvme_admin_opcode {
 		nvme_admin_opcode_name(nvme_admin_security_recv),	\
 		nvme_admin_opcode_name(nvme_admin_sanitize_nvm))
 
-#define show_opcode_name(qid, opcode)					\
-	(qid ? show_nvm_opcode_name(opcode) : show_admin_opcode_name(opcode))
-
 enum {
 	NVME_QUEUE_PHYS_CONTIG	= (1 << 0),
 	NVME_CQ_IRQ_ENABLED	= (1 << 1),
@@ -1053,6 +1050,23 @@ enum nvmf_capsule_command {
 	nvme_fabrics_type_property_get	= 0x04,
 };
 
+#define nvme_fabrics_type_name(type)   { type, #type }
+#define show_fabrics_type_name(type)					\
+	__print_symbolic(type,						\
+		nvme_fabrics_type_name(nvme_fabrics_type_property_set),	\
+		nvme_fabrics_type_name(nvme_fabrics_type_connect),	\
+		nvme_fabrics_type_name(nvme_fabrics_type_property_get))
+
+/*
+ * If not fabrics command, fctype will be ignored.
+ */
+#define show_opcode_name(qid, opcode, fctype)			\
+	((opcode) == nvme_fabrics_command ?			\
+	 show_fabrics_type_name(fctype) :			\
+	((qid) ?						\
+	 show_nvm_opcode_name(opcode) :				\
+	 show_admin_opcode_name(opcode)))
+
 struct nvmf_common_command {
 	__u8	opcode;
 	__u8	resv1;
-- 
cgit v1.2.3


From 84705f9f8c64cc3d8409ac63e5dd06ed97886fb7 Mon Sep 17 00:00:00 2001
From: Jolly Shah <jolly.shah@xilinx.com>
Date: Wed, 19 Jun 2019 13:59:34 -0700
Subject: firmware: xilinx: zynqmp: Remove unused macro

ZYNQMP_PM_CAPABILITY_POWER capability is not supported by firmware
and hence needs to be removed

Signed-off-by: Tejas Patel <tejas.patel@xilinx.com>
Signed-off-by: Michal Simek <michal.simek@xilinx.com>
Signed-off-by: Jolly Shah <jolly.shah@xilinx.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/firmware/xlnx-zynqmp.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/firmware/xlnx-zynqmp.h b/include/linux/firmware/xlnx-zynqmp.h
index 1262ea6a1f4b..778abbbc7d94 100644
--- a/include/linux/firmware/xlnx-zynqmp.h
+++ b/include/linux/firmware/xlnx-zynqmp.h
@@ -46,7 +46,6 @@
 #define	ZYNQMP_PM_CAPABILITY_ACCESS	0x1U
 #define	ZYNQMP_PM_CAPABILITY_CONTEXT	0x2U
 #define	ZYNQMP_PM_CAPABILITY_WAKEUP	0x4U
-#define	ZYNQMP_PM_CAPABILITY_POWER	0x8U
 
 /*
  * Firmware FPGA Manager flags
-- 
cgit v1.2.3


From 1c2eb5b2853c9f513690ba6b71072d8eb65da16a Mon Sep 17 00:00:00 2001
From: Vishnu DASA <vdasa@vmware.com>
Date: Fri, 24 May 2019 15:13:10 +0000
Subject: VMCI: Fix integer overflow in VMCI handle arrays

The VMCI handle array has an integer overflow in
vmci_handle_arr_append_entry when it tries to expand the array. This can be
triggered from a guest, since the doorbell link hypercall doesn't impose a
limit on the number of doorbell handles that a VM can create in the
hypervisor, and these handles are stored in a handle array.

In this change, we introduce a mandatory max capacity for handle
arrays/lists to avoid excessive memory usage.

Signed-off-by: Vishnu Dasa <vdasa@vmware.com>
Reviewed-by: Adit Ranadive <aditr@vmware.com>
Reviewed-by: Jorgen Hansen <jhansen@vmware.com>
Cc: stable <stable@vger.kernel.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/misc/vmw_vmci/vmci_context.c      | 80 +++++++++++++++++--------------
 drivers/misc/vmw_vmci/vmci_handle_array.c | 38 ++++++++++-----
 drivers/misc/vmw_vmci/vmci_handle_array.h | 29 +++++++----
 include/linux/vmw_vmci_defs.h             | 11 ++++-
 4 files changed, 99 insertions(+), 59 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/misc/vmw_vmci/vmci_context.c b/drivers/misc/vmw_vmci/vmci_context.c
index 300ed69fe2c7..16695366ec92 100644
--- a/drivers/misc/vmw_vmci/vmci_context.c
+++ b/drivers/misc/vmw_vmci/vmci_context.c
@@ -21,6 +21,9 @@
 #include "vmci_driver.h"
 #include "vmci_event.h"
 
+/* Use a wide upper bound for the maximum contexts. */
+#define VMCI_MAX_CONTEXTS 2000
+
 /*
  * List of current VMCI contexts.  Contexts can be added by
  * vmci_ctx_create() and removed via vmci_ctx_destroy().
@@ -117,19 +120,22 @@ struct vmci_ctx *vmci_ctx_create(u32 cid, u32 priv_flags,
 	/* Initialize host-specific VMCI context. */
 	init_waitqueue_head(&context->host_context.wait_queue);
 
-	context->queue_pair_array = vmci_handle_arr_create(0);
+	context->queue_pair_array =
+		vmci_handle_arr_create(0, VMCI_MAX_GUEST_QP_COUNT);
 	if (!context->queue_pair_array) {
 		error = -ENOMEM;
 		goto err_free_ctx;
 	}
 
-	context->doorbell_array = vmci_handle_arr_create(0);
+	context->doorbell_array =
+		vmci_handle_arr_create(0, VMCI_MAX_GUEST_DOORBELL_COUNT);
 	if (!context->doorbell_array) {
 		error = -ENOMEM;
 		goto err_free_qp_array;
 	}
 
-	context->pending_doorbell_array = vmci_handle_arr_create(0);
+	context->pending_doorbell_array =
+		vmci_handle_arr_create(0, VMCI_MAX_GUEST_DOORBELL_COUNT);
 	if (!context->pending_doorbell_array) {
 		error = -ENOMEM;
 		goto err_free_db_array;
@@ -204,7 +210,7 @@ static int ctx_fire_notification(u32 context_id, u32 priv_flags)
 	 * We create an array to hold the subscribers we find when
 	 * scanning through all contexts.
 	 */
-	subscriber_array = vmci_handle_arr_create(0);
+	subscriber_array = vmci_handle_arr_create(0, VMCI_MAX_CONTEXTS);
 	if (subscriber_array == NULL)
 		return VMCI_ERROR_NO_MEM;
 
@@ -623,20 +629,26 @@ int vmci_ctx_add_notification(u32 context_id, u32 remote_cid)
 
 	spin_lock(&context->lock);
 
-	list_for_each_entry(n, &context->notifier_list, node) {
-		if (vmci_handle_is_equal(n->handle, notifier->handle)) {
-			exists = true;
-			break;
+	if (context->n_notifiers < VMCI_MAX_CONTEXTS) {
+		list_for_each_entry(n, &context->notifier_list, node) {
+			if (vmci_handle_is_equal(n->handle, notifier->handle)) {
+				exists = true;
+				break;
+			}
 		}
-	}
 
-	if (exists) {
-		kfree(notifier);
-		result = VMCI_ERROR_ALREADY_EXISTS;
+		if (exists) {
+			kfree(notifier);
+			result = VMCI_ERROR_ALREADY_EXISTS;
+		} else {
+			list_add_tail_rcu(&notifier->node,
+					  &context->notifier_list);
+			context->n_notifiers++;
+			result = VMCI_SUCCESS;
+		}
 	} else {
-		list_add_tail_rcu(&notifier->node, &context->notifier_list);
-		context->n_notifiers++;
-		result = VMCI_SUCCESS;
+		kfree(notifier);
+		result = VMCI_ERROR_NO_MEM;
 	}
 
 	spin_unlock(&context->lock);
@@ -721,8 +733,7 @@ static int vmci_ctx_get_chkpt_doorbells(struct vmci_ctx *context,
 					u32 *buf_size, void **pbuf)
 {
 	struct dbell_cpt_state *dbells;
-	size_t n_doorbells;
-	int i;
+	u32 i, n_doorbells;
 
 	n_doorbells = vmci_handle_arr_get_size(context->doorbell_array);
 	if (n_doorbells > 0) {
@@ -860,7 +871,8 @@ int vmci_ctx_rcv_notifications_get(u32 context_id,
 	spin_lock(&context->lock);
 
 	*db_handle_array = context->pending_doorbell_array;
-	context->pending_doorbell_array = vmci_handle_arr_create(0);
+	context->pending_doorbell_array =
+		vmci_handle_arr_create(0, VMCI_MAX_GUEST_DOORBELL_COUNT);
 	if (!context->pending_doorbell_array) {
 		context->pending_doorbell_array = *db_handle_array;
 		*db_handle_array = NULL;
@@ -942,12 +954,11 @@ int vmci_ctx_dbell_create(u32 context_id, struct vmci_handle handle)
 		return VMCI_ERROR_NOT_FOUND;
 
 	spin_lock(&context->lock);
-	if (!vmci_handle_arr_has_entry(context->doorbell_array, handle)) {
-		vmci_handle_arr_append_entry(&context->doorbell_array, handle);
-		result = VMCI_SUCCESS;
-	} else {
+	if (!vmci_handle_arr_has_entry(context->doorbell_array, handle))
+		result = vmci_handle_arr_append_entry(&context->doorbell_array,
+						      handle);
+	else
 		result = VMCI_ERROR_DUPLICATE_ENTRY;
-	}
 
 	spin_unlock(&context->lock);
 	vmci_ctx_put(context);
@@ -1083,15 +1094,16 @@ int vmci_ctx_notify_dbell(u32 src_cid,
 			if (!vmci_handle_arr_has_entry(
 					dst_context->pending_doorbell_array,
 					handle)) {
-				vmci_handle_arr_append_entry(
+				result = vmci_handle_arr_append_entry(
 					&dst_context->pending_doorbell_array,
 					handle);
-
-				ctx_signal_notify(dst_context);
-				wake_up(&dst_context->host_context.wait_queue);
-
+				if (result == VMCI_SUCCESS) {
+					ctx_signal_notify(dst_context);
+					wake_up(&dst_context->host_context.wait_queue);
+				}
+			} else {
+				result = VMCI_SUCCESS;
 			}
-			result = VMCI_SUCCESS;
 		}
 		spin_unlock(&dst_context->lock);
 	}
@@ -1118,13 +1130,11 @@ int vmci_ctx_qp_create(struct vmci_ctx *context, struct vmci_handle handle)
 	if (context == NULL || vmci_handle_is_invalid(handle))
 		return VMCI_ERROR_INVALID_ARGS;
 
-	if (!vmci_handle_arr_has_entry(context->queue_pair_array, handle)) {
-		vmci_handle_arr_append_entry(&context->queue_pair_array,
-					     handle);
-		result = VMCI_SUCCESS;
-	} else {
+	if (!vmci_handle_arr_has_entry(context->queue_pair_array, handle))
+		result = vmci_handle_arr_append_entry(
+			&context->queue_pair_array, handle);
+	else
 		result = VMCI_ERROR_DUPLICATE_ENTRY;
-	}
 
 	return result;
 }
diff --git a/drivers/misc/vmw_vmci/vmci_handle_array.c b/drivers/misc/vmw_vmci/vmci_handle_array.c
index c527388f5d7b..de7fee7ead1b 100644
--- a/drivers/misc/vmw_vmci/vmci_handle_array.c
+++ b/drivers/misc/vmw_vmci/vmci_handle_array.c
@@ -8,24 +8,29 @@
 #include <linux/slab.h>
 #include "vmci_handle_array.h"
 
-static size_t handle_arr_calc_size(size_t capacity)
+static size_t handle_arr_calc_size(u32 capacity)
 {
-	return sizeof(struct vmci_handle_arr) +
+	return VMCI_HANDLE_ARRAY_HEADER_SIZE +
 	    capacity * sizeof(struct vmci_handle);
 }
 
-struct vmci_handle_arr *vmci_handle_arr_create(size_t capacity)
+struct vmci_handle_arr *vmci_handle_arr_create(u32 capacity, u32 max_capacity)
 {
 	struct vmci_handle_arr *array;
 
+	if (max_capacity == 0 || capacity > max_capacity)
+		return NULL;
+
 	if (capacity == 0)
-		capacity = VMCI_HANDLE_ARRAY_DEFAULT_SIZE;
+		capacity = min((u32)VMCI_HANDLE_ARRAY_DEFAULT_CAPACITY,
+			       max_capacity);
 
 	array = kmalloc(handle_arr_calc_size(capacity), GFP_ATOMIC);
 	if (!array)
 		return NULL;
 
 	array->capacity = capacity;
+	array->max_capacity = max_capacity;
 	array->size = 0;
 
 	return array;
@@ -36,27 +41,34 @@ void vmci_handle_arr_destroy(struct vmci_handle_arr *array)
 	kfree(array);
 }
 
-void vmci_handle_arr_append_entry(struct vmci_handle_arr **array_ptr,
-				  struct vmci_handle handle)
+int vmci_handle_arr_append_entry(struct vmci_handle_arr **array_ptr,
+				 struct vmci_handle handle)
 {
 	struct vmci_handle_arr *array = *array_ptr;
 
 	if (unlikely(array->size >= array->capacity)) {
 		/* reallocate. */
 		struct vmci_handle_arr *new_array;
-		size_t new_capacity = array->capacity * VMCI_ARR_CAP_MULT;
-		size_t new_size = handle_arr_calc_size(new_capacity);
+		u32 capacity_bump = min(array->max_capacity - array->capacity,
+					array->capacity);
+		size_t new_size = handle_arr_calc_size(array->capacity +
+						       capacity_bump);
+
+		if (array->size >= array->max_capacity)
+			return VMCI_ERROR_NO_MEM;
 
 		new_array = krealloc(array, new_size, GFP_ATOMIC);
 		if (!new_array)
-			return;
+			return VMCI_ERROR_NO_MEM;
 
-		new_array->capacity = new_capacity;
+		new_array->capacity += capacity_bump;
 		*array_ptr = array = new_array;
 	}
 
 	array->entries[array->size] = handle;
 	array->size++;
+
+	return VMCI_SUCCESS;
 }
 
 /*
@@ -66,7 +78,7 @@ struct vmci_handle vmci_handle_arr_remove_entry(struct vmci_handle_arr *array,
 						struct vmci_handle entry_handle)
 {
 	struct vmci_handle handle = VMCI_INVALID_HANDLE;
-	size_t i;
+	u32 i;
 
 	for (i = 0; i < array->size; i++) {
 		if (vmci_handle_is_equal(array->entries[i], entry_handle)) {
@@ -101,7 +113,7 @@ struct vmci_handle vmci_handle_arr_remove_tail(struct vmci_handle_arr *array)
  * Handle at given index, VMCI_INVALID_HANDLE if invalid index.
  */
 struct vmci_handle
-vmci_handle_arr_get_entry(const struct vmci_handle_arr *array, size_t index)
+vmci_handle_arr_get_entry(const struct vmci_handle_arr *array, u32 index)
 {
 	if (unlikely(index >= array->size))
 		return VMCI_INVALID_HANDLE;
@@ -112,7 +124,7 @@ vmci_handle_arr_get_entry(const struct vmci_handle_arr *array, size_t index)
 bool vmci_handle_arr_has_entry(const struct vmci_handle_arr *array,
 			       struct vmci_handle entry_handle)
 {
-	size_t i;
+	u32 i;
 
 	for (i = 0; i < array->size; i++)
 		if (vmci_handle_is_equal(array->entries[i], entry_handle))
diff --git a/drivers/misc/vmw_vmci/vmci_handle_array.h b/drivers/misc/vmw_vmci/vmci_handle_array.h
index bd1559a548e9..96193f85be5b 100644
--- a/drivers/misc/vmw_vmci/vmci_handle_array.h
+++ b/drivers/misc/vmw_vmci/vmci_handle_array.h
@@ -9,32 +9,41 @@
 #define _VMCI_HANDLE_ARRAY_H_
 
 #include <linux/vmw_vmci_defs.h>
+#include <linux/limits.h>
 #include <linux/types.h>
 
-#define VMCI_HANDLE_ARRAY_DEFAULT_SIZE 4
-#define VMCI_ARR_CAP_MULT 2	/* Array capacity multiplier */
-
 struct vmci_handle_arr {
-	size_t capacity;
-	size_t size;
+	u32 capacity;
+	u32 max_capacity;
+	u32 size;
+	u32 pad;
 	struct vmci_handle entries[];
 };
 
-struct vmci_handle_arr *vmci_handle_arr_create(size_t capacity);
+#define VMCI_HANDLE_ARRAY_HEADER_SIZE				\
+	offsetof(struct vmci_handle_arr, entries)
+/* Select a default capacity that results in a 64 byte sized array */
+#define VMCI_HANDLE_ARRAY_DEFAULT_CAPACITY			6
+/* Make sure that the max array size can be expressed by a u32 */
+#define VMCI_HANDLE_ARRAY_MAX_CAPACITY				\
+	((U32_MAX - VMCI_HANDLE_ARRAY_HEADER_SIZE - 1) /	\
+	sizeof(struct vmci_handle))
+
+struct vmci_handle_arr *vmci_handle_arr_create(u32 capacity, u32 max_capacity);
 void vmci_handle_arr_destroy(struct vmci_handle_arr *array);
-void vmci_handle_arr_append_entry(struct vmci_handle_arr **array_ptr,
-				  struct vmci_handle handle);
+int vmci_handle_arr_append_entry(struct vmci_handle_arr **array_ptr,
+				 struct vmci_handle handle);
 struct vmci_handle vmci_handle_arr_remove_entry(struct vmci_handle_arr *array,
 						struct vmci_handle
 						entry_handle);
 struct vmci_handle vmci_handle_arr_remove_tail(struct vmci_handle_arr *array);
 struct vmci_handle
-vmci_handle_arr_get_entry(const struct vmci_handle_arr *array, size_t index);
+vmci_handle_arr_get_entry(const struct vmci_handle_arr *array, u32 index);
 bool vmci_handle_arr_has_entry(const struct vmci_handle_arr *array,
 			       struct vmci_handle entry_handle);
 struct vmci_handle *vmci_handle_arr_get_handles(struct vmci_handle_arr *array);
 
-static inline size_t vmci_handle_arr_get_size(
+static inline u32 vmci_handle_arr_get_size(
 	const struct vmci_handle_arr *array)
 {
 	return array->size;
diff --git a/include/linux/vmw_vmci_defs.h b/include/linux/vmw_vmci_defs.h
index 606504bf376a..fefb5292403b 100644
--- a/include/linux/vmw_vmci_defs.h
+++ b/include/linux/vmw_vmci_defs.h
@@ -62,9 +62,18 @@ enum {
 
 /*
  * A single VMCI device has an upper limit of 128MB on the amount of
- * memory that can be used for queue pairs.
+ * memory that can be used for queue pairs. Since each queue pair
+ * consists of at least two pages, the memory limit also dictates the
+ * number of queue pairs a guest can create.
  */
 #define VMCI_MAX_GUEST_QP_MEMORY (128 * 1024 * 1024)
+#define VMCI_MAX_GUEST_QP_COUNT  (VMCI_MAX_GUEST_QP_MEMORY / PAGE_SIZE / 2)
+
+/*
+ * There can be at most PAGE_SIZE doorbells since there is one doorbell
+ * per byte in the doorbell bitmap page.
+ */
+#define VMCI_MAX_GUEST_DOORBELL_COUNT PAGE_SIZE
 
 /*
  * Queues with pre-mapped data pages must be small, so that we don't pin
-- 
cgit v1.2.3


From 43a38c3f318082839d7e613352d4dae7bbdfcdec Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Mon, 17 Jun 2019 15:15:04 +0200
Subject: netfilter: fix nf_conntrack_bridge/ipv6 link error

When CONFIG_IPV6 is disabled, the bridge netfilter code
produces a link error:

ERROR: "br_ip6_fragment" [net/bridge/netfilter/nf_conntrack_bridge.ko] undefined!
ERROR: "nf_ct_frag6_gather" [net/bridge/netfilter/nf_conntrack_bridge.ko] undefined!

The problem is that it assumes that whenever IPV6 is not a loadable
module, we can call the functions directly. This is clearly
not true when IPV6 is disabled.

There are two other functions defined like this in linux/netfilter_ipv6.h,
so change them all the same way.

Fixes: 764dd163ac92 ("netfilter: nf_conntrack_bridge: add support for IPv6")
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter_ipv6.h | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter_ipv6.h b/include/linux/netfilter_ipv6.h
index 22e6398bc482..7beb681e1ce5 100644
--- a/include/linux/netfilter_ipv6.h
+++ b/include/linux/netfilter_ipv6.h
@@ -75,8 +75,10 @@ static inline int nf_ipv6_chk_addr(struct net *net, const struct in6_addr *addr,
 		return 1;
 
 	return v6_ops->chk_addr(net, addr, dev, strict);
-#else
+#elif IS_BUILTIN(CONFIG_IPV6)
 	return ipv6_chk_addr(net, addr, dev, strict);
+#else
+	return 1;
 #endif
 }
 
@@ -113,8 +115,10 @@ static inline int nf_ipv6_br_defrag(struct net *net, struct sk_buff *skb,
 		return 1;
 
 	return v6_ops->br_defrag(net, skb, user);
-#else
+#elif IS_BUILTIN(CONFIG_IPV6)
 	return nf_ct_frag6_gather(net, skb, user);
+#else
+	return 1;
 #endif
 }
 
@@ -138,8 +142,10 @@ static inline int nf_br_ip6_fragment(struct net *net, struct sock *sk,
 		return 1;
 
 	return v6_ops->br_fragment(net, sk, skb, data, output);
-#else
+#elif IS_BUILTIN(CONFIG_IPV6)
 	return br_ip6_fragment(net, sk, skb, data, output);
+#else
+	return 1;
 #endif
 }
 
@@ -154,8 +160,10 @@ static inline int nf_ip6_route_me_harder(struct net *net, struct sk_buff *skb)
 		return -EHOSTUNREACH;
 
 	return v6_ops->route_me_harder(net, skb);
-#else
+#elif IS_BUILTIN(CONFIG_IPV6)
 	return ip6_route_me_harder(net, skb);
+#else
+	return -EHOSTUNREACH;
 #endif
 }
 
-- 
cgit v1.2.3


From a78cf9657ba5426f54aa93a067c10d097944c082 Mon Sep 17 00:00:00 2001
From: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Date: Sat, 15 Jun 2019 10:23:57 +1000
Subject: PCI/ACPI: Evaluate PCI Boot Configuration _DSM

Evaluate _DSM Function #5, the "PCI Boot Configuration" function.  If the
result is 0, the OS should preserve any resource assignments made by the
firmware.

Link: https://lore.kernel.org/r/20190615002359.29577-2-benh@kernel.crashing.org
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
[bhelgaas: commit log]
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
---
 drivers/acpi/pci_root.c  | 12 ++++++++++++
 include/linux/pci-acpi.h |  7 ++++---
 include/linux/pci.h      |  2 ++
 3 files changed, 18 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/acpi/pci_root.c b/drivers/acpi/pci_root.c
index c36781a9b493..0d57f817ef1e 100644
--- a/drivers/acpi/pci_root.c
+++ b/drivers/acpi/pci_root.c
@@ -894,6 +894,7 @@ struct pci_bus *acpi_pci_root_create(struct acpi_pci_root *root,
 	int node = acpi_get_node(device->handle);
 	struct pci_bus *bus;
 	struct pci_host_bridge *host_bridge;
+	union acpi_object *obj;
 
 	info->root = root;
 	info->bridge = device;
@@ -930,6 +931,17 @@ struct pci_bus *acpi_pci_root_create(struct acpi_pci_root *root,
 	if (!(root->osc_control_set & OSC_PCI_EXPRESS_LTR_CONTROL))
 		host_bridge->native_ltr = 0;
 
+	/*
+	 * Evaluate the "PCI Boot Configuration" _DSM Function.  If it
+	 * exists and returns 0, we must preserve any PCI resource
+	 * assignments made by firmware for this host bridge.
+	 */
+	obj = acpi_evaluate_dsm(ACPI_HANDLE(bus->bridge), &pci_acpi_dsm_guid, 1,
+	                        IGNORE_PCI_BOOT_CONFIG_DSM, NULL);
+	if (obj && obj->type == ACPI_TYPE_INTEGER && obj->integer.value == 0)
+		host_bridge->preserve_config = 1;
+	ACPI_FREE(obj);
+
 	pci_scan_child_bus(bus);
 	pci_set_host_bridge_release(host_bridge, acpi_pci_root_release_info,
 				    info);
diff --git a/include/linux/pci-acpi.h b/include/linux/pci-acpi.h
index 8082b612f561..62b7fdcc661c 100644
--- a/include/linux/pci-acpi.h
+++ b/include/linux/pci-acpi.h
@@ -107,9 +107,10 @@ static inline void acpiphp_check_host_bridge(struct acpi_device *adev) { }
 #endif
 
 extern const guid_t pci_acpi_dsm_guid;
-#define DEVICE_LABEL_DSM	0x07
-#define RESET_DELAY_DSM		0x08
-#define FUNCTION_DELAY_DSM	0x09
+#define IGNORE_PCI_BOOT_CONFIG_DSM	0x05
+#define DEVICE_LABEL_DSM		0x07
+#define RESET_DELAY_DSM			0x08
+#define FUNCTION_DELAY_DSM		0x09
 
 #else	/* CONFIG_ACPI */
 static inline void acpi_pci_add_bus(struct pci_bus *bus) { }
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 4a5a84d7bdd4..5e2b309363a3 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -505,6 +505,8 @@ struct pci_host_bridge {
 	unsigned int	native_shpc_hotplug:1;	/* OS may use SHPC hotplug */
 	unsigned int	native_pme:1;		/* OS may use PCIe PME */
 	unsigned int	native_ltr:1;		/* OS may use PCIe LTR */
+	unsigned int	preserve_config:1;	/* Preserve FW resource setup */
+
 	/* Resource alignment requirements */
 	resource_size_t (*align_resource)(struct pci_dev *dev,
 			const struct resource *res,
-- 
cgit v1.2.3


From 4cfd218855923a07dc02a5bec3d3bb37a118ebc2 Mon Sep 17 00:00:00 2001
From: Heiner Kallweit <hkallweit1@gmail.com>
Date: Tue, 18 Jun 2019 23:13:48 +0200
Subject: PCI: let pci_disable_link_state propagate errors

Drivers may rely on pci_disable_link_state() having disabled certain
ASPM link states. If the OS can't control ASPM then pci_disable_link_state()
turns into a no-op w/o informing the caller. The driver therefore may
falsely assume the respective ASPM link states are disabled.
Let pci_disable_link_state() propagate errors to the caller, enabling
the caller to react accordingly.

Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Acked-by: Bjorn Helgaas <bhelgaas@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/pci/pcie/aspm.c  | 20 +++++++++++---------
 include/linux/pci-aspm.h |  7 ++++---
 2 files changed, 15 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/pci/pcie/aspm.c b/drivers/pci/pcie/aspm.c
index fd4cb75088f9..e44af7f4d37f 100644
--- a/drivers/pci/pcie/aspm.c
+++ b/drivers/pci/pcie/aspm.c
@@ -1062,18 +1062,18 @@ void pcie_aspm_powersave_config_link(struct pci_dev *pdev)
 	up_read(&pci_bus_sem);
 }
 
-static void __pci_disable_link_state(struct pci_dev *pdev, int state, bool sem)
+static int __pci_disable_link_state(struct pci_dev *pdev, int state, bool sem)
 {
 	struct pci_dev *parent = pdev->bus->self;
 	struct pcie_link_state *link;
 
 	if (!pci_is_pcie(pdev))
-		return;
+		return 0;
 
 	if (pdev->has_secondary_link)
 		parent = pdev;
 	if (!parent || !parent->link_state)
-		return;
+		return -EINVAL;
 
 	/*
 	 * A driver requested that ASPM be disabled on this device, but
@@ -1085,7 +1085,7 @@ static void __pci_disable_link_state(struct pci_dev *pdev, int state, bool sem)
 	 */
 	if (aspm_disabled) {
 		pci_warn(pdev, "can't disable ASPM; OS doesn't have ASPM control\n");
-		return;
+		return -EPERM;
 	}
 
 	if (sem)
@@ -1105,11 +1105,13 @@ static void __pci_disable_link_state(struct pci_dev *pdev, int state, bool sem)
 	mutex_unlock(&aspm_lock);
 	if (sem)
 		up_read(&pci_bus_sem);
+
+	return 0;
 }
 
-void pci_disable_link_state_locked(struct pci_dev *pdev, int state)
+int pci_disable_link_state_locked(struct pci_dev *pdev, int state)
 {
-	__pci_disable_link_state(pdev, state, false);
+	return __pci_disable_link_state(pdev, state, false);
 }
 EXPORT_SYMBOL(pci_disable_link_state_locked);
 
@@ -1117,14 +1119,14 @@ EXPORT_SYMBOL(pci_disable_link_state_locked);
  * pci_disable_link_state - Disable device's link state, so the link will
  * never enter specific states.  Note that if the BIOS didn't grant ASPM
  * control to the OS, this does nothing because we can't touch the LNKCTL
- * register.
+ * register. Returns 0 or a negative errno.
  *
  * @pdev: PCI device
  * @state: ASPM link state to disable
  */
-void pci_disable_link_state(struct pci_dev *pdev, int state)
+int pci_disable_link_state(struct pci_dev *pdev, int state)
 {
-	__pci_disable_link_state(pdev, state, true);
+	return __pci_disable_link_state(pdev, state, true);
 }
 EXPORT_SYMBOL(pci_disable_link_state);
 
diff --git a/include/linux/pci-aspm.h b/include/linux/pci-aspm.h
index df28af5cef21..67064145d76e 100644
--- a/include/linux/pci-aspm.h
+++ b/include/linux/pci-aspm.h
@@ -24,11 +24,12 @@
 #define PCIE_LINK_STATE_CLKPM	4
 
 #ifdef CONFIG_PCIEASPM
-void pci_disable_link_state(struct pci_dev *pdev, int state);
-void pci_disable_link_state_locked(struct pci_dev *pdev, int state);
+int pci_disable_link_state(struct pci_dev *pdev, int state);
+int pci_disable_link_state_locked(struct pci_dev *pdev, int state);
 void pcie_no_aspm(void);
 #else
-static inline void pci_disable_link_state(struct pci_dev *pdev, int state) { }
+static inline int pci_disable_link_state(struct pci_dev *pdev, int state)
+{ return 0; }
 static inline void pcie_no_aspm(void) { }
 #endif
 
-- 
cgit v1.2.3


From d308dfbf62eff897d71968d764f21a78678ee0a5 Mon Sep 17 00:00:00 2001
From: Linus Walleij <linus.walleij@linaro.org>
Date: Tue, 18 Jun 2019 12:58:33 +0200
Subject: i2c: mux/i801: Switch to use descriptor passing

This switches the i801 GPIO mux to use GPIO descriptors for
handling the GPIO lines. The previous hack which was reaching
inside the GPIO chips etc cannot live on. We pass descriptors
along with the GPIO mux device at creation instead.

The GPIO mux was only used by way of platform data with a
platform device from one place in the kernel: the i801 i2c bus
driver. Let's just associate the GPIO descriptor table with
the actual device like everyone else and dynamically create
a descriptor table passed along with the GPIO i2c mux.

This enables simplification of the GPIO i2c mux driver to
use only the descriptor API and the OF probe path gets
simplified in the process.

The i801 driver was registering the GPIO i2c mux with
PLATFORM_DEVID_AUTO which would make it hard to predict the
device name and assign the descriptor table properly, but
this seems to be a mistake to begin with: all of the
GPIO mux devices are hardcoded to look up GPIO lines from
the "gpio_ich" GPIO chip. If there is more than one mux,
there is certainly more than one gpio chip as well, and
then we have more serious problems. Switch to
PLATFORM_DEVID_NONE instead. There can be only one.

Cc: Mika Westerberg <mika.westerberg@linux.intel.com>
Cc: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Cc: Peter Rosin <peda@axentia.se>
Cc: Jean Delvare <jdelvare@suse.com>
Signed-off-by: Serge Semin <fancer.lancer@gmail.com>
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
Reviewed-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Reviewed-by: Mika Westerberg <mika.westerberg@linux.intel.com>
[Removed a newline, suggested by Andy. /Peter]
Signed-off-by: Peter Rosin <peda@axentia.se>
---
 drivers/i2c/busses/i2c-i801.c              |  37 +++++++--
 drivers/i2c/muxes/i2c-mux-gpio.c           | 116 ++++++++---------------------
 include/linux/platform_data/i2c-mux-gpio.h |   7 --
 3 files changed, 60 insertions(+), 100 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/i2c/busses/i2c-i801.c b/drivers/i2c/busses/i2c-i801.c
index 679c6c41f64b..bf484cd775ec 100644
--- a/drivers/i2c/busses/i2c-i801.c
+++ b/drivers/i2c/busses/i2c-i801.c
@@ -107,7 +107,7 @@
 #include <linux/pm_runtime.h>
 
 #if IS_ENABLED(CONFIG_I2C_MUX_GPIO) && defined CONFIG_DMI
-#include <linux/gpio.h>
+#include <linux/gpio/machine.h>
 #include <linux/platform_data/i2c-mux-gpio.h>
 #endif
 
@@ -274,6 +274,7 @@ struct i801_priv {
 #if IS_ENABLED(CONFIG_I2C_MUX_GPIO) && defined CONFIG_DMI
 	const struct i801_mux_config *mux_drvdata;
 	struct platform_device *mux_pdev;
+	struct gpiod_lookup_table *lookup;
 #endif
 	struct platform_device *tco_pdev;
 
@@ -1258,7 +1259,8 @@ static int i801_add_mux(struct i801_priv *priv)
 	struct device *dev = &priv->adapter.dev;
 	const struct i801_mux_config *mux_config;
 	struct i2c_mux_gpio_platform_data gpio_data;
-	int err;
+	struct gpiod_lookup_table *lookup;
+	int err, i;
 
 	if (!priv->mux_drvdata)
 		return 0;
@@ -1270,17 +1272,36 @@ static int i801_add_mux(struct i801_priv *priv)
 	gpio_data.values = mux_config->values;
 	gpio_data.n_values = mux_config->n_values;
 	gpio_data.classes = mux_config->classes;
-	gpio_data.gpio_chip = mux_config->gpio_chip;
-	gpio_data.gpios = mux_config->gpios;
-	gpio_data.n_gpios = mux_config->n_gpios;
 	gpio_data.idle = I2C_MUX_GPIO_NO_IDLE;
 
-	/* Register the mux device */
+	/* Register GPIO descriptor lookup table */
+	lookup = devm_kzalloc(dev,
+			      struct_size(lookup, table, mux_config->n_gpios),
+			      GFP_KERNEL);
+	if (!lookup)
+		return -ENOMEM;
+	lookup->dev_id = "i2c-mux-gpio";
+	for (i = 0; i < mux_config->n_gpios; i++) {
+		lookup->table[i].chip_label = mux_config->gpio_chip;
+		lookup->table[i].chip_hwnum = mux_config->gpios[i];
+		lookup->table[i].con_id = "mux";
+	}
+	gpiod_add_lookup_table(lookup);
+	priv->lookup = lookup;
+
+	/*
+	 * Register the mux device, we use PLATFORM_DEVID_NONE here
+	 * because since we are referring to the GPIO chip by name we are
+	 * anyways in deep trouble if there is more than one of these
+	 * devices, and there should likely only be one platform controller
+	 * hub.
+	 */
 	priv->mux_pdev = platform_device_register_data(dev, "i2c-mux-gpio",
-				PLATFORM_DEVID_AUTO, &gpio_data,
+				PLATFORM_DEVID_NONE, &gpio_data,
 				sizeof(struct i2c_mux_gpio_platform_data));
 	if (IS_ERR(priv->mux_pdev)) {
 		err = PTR_ERR(priv->mux_pdev);
+		gpiod_remove_lookup_table(lookup);
 		priv->mux_pdev = NULL;
 		dev_err(dev, "Failed to register i2c-mux-gpio device\n");
 		return err;
@@ -1293,6 +1314,8 @@ static void i801_del_mux(struct i801_priv *priv)
 {
 	if (priv->mux_pdev)
 		platform_device_unregister(priv->mux_pdev);
+	if (priv->lookup)
+		gpiod_remove_lookup_table(priv->lookup);
 }
 
 static unsigned int i801_get_adapter_class(struct i801_priv *priv)
diff --git a/drivers/i2c/muxes/i2c-mux-gpio.c b/drivers/i2c/muxes/i2c-mux-gpio.c
index 13882a2a4f60..fd482feafb19 100644
--- a/drivers/i2c/muxes/i2c-mux-gpio.c
+++ b/drivers/i2c/muxes/i2c-mux-gpio.c
@@ -14,13 +14,14 @@
 #include <linux/platform_device.h>
 #include <linux/module.h>
 #include <linux/slab.h>
-#include <linux/gpio.h>
+#include <linux/bits.h>
+#include <linux/gpio/consumer.h>
+/* FIXME: stop poking around inside gpiolib */
 #include "../../gpio/gpiolib.h"
-#include <linux/of_gpio.h>
 
 struct gpiomux {
 	struct i2c_mux_gpio_platform_data data;
-	unsigned gpio_base;
+	int ngpios;
 	struct gpio_desc **gpios;
 };
 
@@ -30,8 +31,7 @@ static void i2c_mux_gpio_set(const struct gpiomux *mux, unsigned val)
 
 	values[0] = val;
 
-	gpiod_set_array_value_cansleep(mux->data.n_gpios, mux->gpios, NULL,
-				       values);
+	gpiod_set_array_value_cansleep(mux->ngpios, mux->gpios, NULL, values);
 }
 
 static int i2c_mux_gpio_select(struct i2c_mux_core *muxc, u32 chan)
@@ -52,12 +52,6 @@ static int i2c_mux_gpio_deselect(struct i2c_mux_core *muxc, u32 chan)
 	return 0;
 }
 
-static int match_gpio_chip_by_label(struct gpio_chip *chip,
-					      void *data)
-{
-	return !strcmp(chip->label, data);
-}
-
 #ifdef CONFIG_OF
 static int i2c_mux_gpio_probe_dt(struct gpiomux *mux,
 					struct platform_device *pdev)
@@ -65,8 +59,8 @@ static int i2c_mux_gpio_probe_dt(struct gpiomux *mux,
 	struct device_node *np = pdev->dev.of_node;
 	struct device_node *adapter_np, *child;
 	struct i2c_adapter *adapter;
-	unsigned *values, *gpios;
-	int i = 0, ret;
+	unsigned *values;
+	int i = 0;
 
 	if (!np)
 		return -ENODEV;
@@ -103,29 +97,6 @@ static int i2c_mux_gpio_probe_dt(struct gpiomux *mux,
 	if (of_property_read_u32(np, "idle-state", &mux->data.idle))
 		mux->data.idle = I2C_MUX_GPIO_NO_IDLE;
 
-	mux->data.n_gpios = of_gpio_named_count(np, "mux-gpios");
-	if (mux->data.n_gpios < 0) {
-		dev_err(&pdev->dev, "Missing mux-gpios property in the DT.\n");
-		return -EINVAL;
-	}
-
-	gpios = devm_kcalloc(&pdev->dev,
-			     mux->data.n_gpios, sizeof(*mux->data.gpios),
-			     GFP_KERNEL);
-	if (!gpios) {
-		dev_err(&pdev->dev, "Cannot allocate gpios array");
-		return -ENOMEM;
-	}
-
-	for (i = 0; i < mux->data.n_gpios; i++) {
-		ret = of_get_named_gpio(np, "mux-gpios", i);
-		if (ret < 0)
-			return ret;
-		gpios[i] = ret;
-	}
-
-	mux->data.gpios = gpios;
-
 	return 0;
 }
 #else
@@ -142,8 +113,8 @@ static int i2c_mux_gpio_probe(struct platform_device *pdev)
 	struct gpiomux *mux;
 	struct i2c_adapter *parent;
 	struct i2c_adapter *root;
-	unsigned initial_state, gpio_base;
-	int i, ret;
+	unsigned initial_state;
+	int i, ngpios, ret;
 
 	mux = devm_kzalloc(&pdev->dev, sizeof(*mux), GFP_KERNEL);
 	if (!mux)
@@ -158,29 +129,19 @@ static int i2c_mux_gpio_probe(struct platform_device *pdev)
 			sizeof(mux->data));
 	}
 
-	/*
-	 * If a GPIO chip name is provided, the GPIO pin numbers provided are
-	 * relative to its base GPIO number. Otherwise they are absolute.
-	 */
-	if (mux->data.gpio_chip) {
-		struct gpio_chip *gpio;
-
-		gpio = gpiochip_find(mux->data.gpio_chip,
-				     match_gpio_chip_by_label);
-		if (!gpio)
-			return -EPROBE_DEFER;
-
-		gpio_base = gpio->base;
-	} else {
-		gpio_base = 0;
+	ngpios = gpiod_count(&pdev->dev, "mux");
+	if (ngpios <= 0) {
+		dev_err(&pdev->dev, "no valid gpios provided\n");
+		return ngpios ?: -EINVAL;
 	}
+	mux->ngpios = ngpios;
 
 	parent = i2c_get_adapter(mux->data.parent);
 	if (!parent)
 		return -EPROBE_DEFER;
 
 	muxc = i2c_mux_alloc(parent, &pdev->dev, mux->data.n_values,
-			     mux->data.n_gpios * sizeof(*mux->gpios), 0,
+			     ngpios * sizeof(*mux->gpios), 0,
 			     i2c_mux_gpio_select, NULL);
 	if (!muxc) {
 		ret = -ENOMEM;
@@ -194,7 +155,6 @@ static int i2c_mux_gpio_probe(struct platform_device *pdev)
 	root = i2c_root_adapter(&parent->dev);
 
 	muxc->mux_locked = true;
-	mux->gpio_base = gpio_base;
 
 	if (mux->data.idle != I2C_MUX_GPIO_NO_IDLE) {
 		initial_state = mux->data.idle;
@@ -203,34 +163,28 @@ static int i2c_mux_gpio_probe(struct platform_device *pdev)
 		initial_state = mux->data.values[0];
 	}
 
-	for (i = 0; i < mux->data.n_gpios; i++) {
+	for (i = 0; i < ngpios; i++) {
 		struct device *gpio_dev;
-		struct gpio_desc *gpio_desc;
-
-		ret = gpio_request(gpio_base + mux->data.gpios[i], "i2c-mux-gpio");
-		if (ret) {
-			dev_err(&pdev->dev, "Failed to request GPIO %d\n",
-				mux->data.gpios[i]);
-			goto err_request_gpio;
+		struct gpio_desc *gpiod;
+		enum gpiod_flags flag;
+
+		if (initial_state & BIT(i))
+			flag = GPIOD_OUT_HIGH;
+		else
+			flag = GPIOD_OUT_LOW;
+		gpiod = devm_gpiod_get_index(&pdev->dev, "mux", i, flag);
+		if (IS_ERR(gpiod)) {
+			ret = PTR_ERR(gpiod);
+			goto alloc_failed;
 		}
 
-		ret = gpio_direction_output(gpio_base + mux->data.gpios[i],
-					    initial_state & (1 << i));
-		if (ret) {
-			dev_err(&pdev->dev,
-				"Failed to set direction of GPIO %d to output\n",
-				mux->data.gpios[i]);
-			i++;	/* gpio_request above succeeded, so must free */
-			goto err_request_gpio;
-		}
-
-		gpio_desc = gpio_to_desc(gpio_base + mux->data.gpios[i]);
-		mux->gpios[i] = gpio_desc;
+		mux->gpios[i] = gpiod;
 
 		if (!muxc->mux_locked)
 			continue;
 
-		gpio_dev = &gpio_desc->gdev->dev;
+		/* FIXME: find a proper way to access the GPIO device */
+		gpio_dev = &gpiod->gdev->dev;
 		muxc->mux_locked = i2c_root_adapter(gpio_dev) == root;
 	}
 
@@ -253,10 +207,6 @@ static int i2c_mux_gpio_probe(struct platform_device *pdev)
 
 add_adapter_failed:
 	i2c_mux_del_adapters(muxc);
-	i = mux->data.n_gpios;
-err_request_gpio:
-	for (; i > 0; i--)
-		gpio_free(gpio_base + mux->data.gpios[i - 1]);
 alloc_failed:
 	i2c_put_adapter(parent);
 
@@ -266,14 +216,8 @@ alloc_failed:
 static int i2c_mux_gpio_remove(struct platform_device *pdev)
 {
 	struct i2c_mux_core *muxc = platform_get_drvdata(pdev);
-	struct gpiomux *mux = i2c_mux_priv(muxc);
-	int i;
 
 	i2c_mux_del_adapters(muxc);
-
-	for (i = 0; i < mux->data.n_gpios; i++)
-		gpio_free(mux->gpio_base + mux->data.gpios[i]);
-
 	i2c_put_adapter(muxc->parent);
 
 	return 0;
diff --git a/include/linux/platform_data/i2c-mux-gpio.h b/include/linux/platform_data/i2c-mux-gpio.h
index 4406108201fe..28f288eed652 100644
--- a/include/linux/platform_data/i2c-mux-gpio.h
+++ b/include/linux/platform_data/i2c-mux-gpio.h
@@ -22,10 +22,6 @@
  *	position
  * @n_values: Number of multiplexer positions (busses to instantiate)
  * @classes: Optional I2C auto-detection classes
- * @gpio_chip: Optional GPIO chip name; if set, GPIO pin numbers are given
- *	relative to the base GPIO number of that chip
- * @gpios: Array of GPIO numbers used to control MUX
- * @n_gpios: Number of GPIOs used to control MUX
  * @idle: Bitmask to write to MUX when idle or GPIO_I2CMUX_NO_IDLE if not used
  */
 struct i2c_mux_gpio_platform_data {
@@ -34,9 +30,6 @@ struct i2c_mux_gpio_platform_data {
 	const unsigned *values;
 	int n_values;
 	const unsigned *classes;
-	char *gpio_chip;
-	const unsigned *gpios;
-	int n_gpios;
 	unsigned idle;
 };
 
-- 
cgit v1.2.3


From e9bea8f98a539080070e3eff70a1731ce0ffdc8d Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Wed, 19 Jun 2019 00:48:15 +0200
Subject: PM: sleep: Update struct wakeup_source documentation

The kerneldoc comment for struct wakeup_source has become outdated,
so fix that.

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/pm_wakeup.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/pm_wakeup.h b/include/linux/pm_wakeup.h
index ce57771fab9b..91027602d137 100644
--- a/include/linux/pm_wakeup.h
+++ b/include/linux/pm_wakeup.h
@@ -36,7 +36,7 @@ struct wake_irq;
  * @expire_count: Number of times the wakeup source's timeout has expired.
  * @wakeup_count: Number of times the wakeup source might abort suspend.
  * @active: Status of the wakeup source.
- * @has_timeout: The wakeup source has been activated with a timeout.
+ * @autosleep_enabled: Autosleep is active, so update @prevent_sleep_time.
  */
 struct wakeup_source {
 	const char 		*name;
-- 
cgit v1.2.3


From 9285ec4c8b61d4930a575081abeba2cd4f449a74 Mon Sep 17 00:00:00 2001
From: "Jason A. Donenfeld" <Jason@zx2c4.com>
Date: Fri, 21 Jun 2019 22:32:48 +0200
Subject: timekeeping: Use proper clock specifier names in functions

This makes boot uniformly boottime and tai uniformly clocktai, to
address the remaining oversights.

Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Arnd Bergmann <arnd@arndb.de>
Link: https://lkml.kernel.org/r/20190621203249.3909-2-Jason@zx2c4.com
---
 Documentation/core-api/timekeeping.rst                 |  2 +-
 arch/x86/kvm/pmu.c                                     |  4 ++--
 arch/x86/kvm/x86.c                                     | 12 ++++++------
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c               |  2 +-
 drivers/iio/humidity/dht11.c                           |  8 ++++----
 drivers/iio/industrialio-core.c                        |  4 ++--
 drivers/infiniband/hw/mlx4/alias_GUID.c                |  6 +++---
 drivers/leds/trigger/ledtrig-activity.c                |  2 +-
 drivers/net/wireless/intel/iwlwifi/mvm/ftm-initiator.c |  2 +-
 drivers/net/wireless/intel/iwlwifi/mvm/rx.c            |  2 +-
 drivers/net/wireless/intel/iwlwifi/mvm/rxmq.c          |  2 +-
 drivers/net/wireless/intel/iwlwifi/mvm/utils.c         |  2 +-
 drivers/net/wireless/mac80211_hwsim.c                  |  2 +-
 drivers/net/wireless/ti/wlcore/main.c                  |  2 +-
 drivers/net/wireless/ti/wlcore/rx.c                    |  2 +-
 drivers/net/wireless/ti/wlcore/tx.c                    |  2 +-
 drivers/net/wireless/virt_wifi.c                       |  2 +-
 include/linux/timekeeping.h                            |  4 ++--
 include/net/cfg80211.h                                 |  2 +-
 kernel/bpf/syscall.c                                   |  2 +-
 kernel/events/core.c                                   |  4 ++--
 kernel/fork.c                                          |  2 +-
 22 files changed, 36 insertions(+), 36 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/core-api/timekeeping.rst b/Documentation/core-api/timekeeping.rst
index 93cbeb9daec0..4d92b1ac8024 100644
--- a/Documentation/core-api/timekeeping.rst
+++ b/Documentation/core-api/timekeeping.rst
@@ -65,7 +65,7 @@ different format depending on what is required by the user:
 .. c:function:: u64 ktime_get_ns( void )
 		u64 ktime_get_boottime_ns( void )
 		u64 ktime_get_real_ns( void )
-		u64 ktime_get_tai_ns( void )
+		u64 ktime_get_clocktai_ns( void )
 		u64 ktime_get_raw_ns( void )
 
 	Same as the plain ktime_get functions, but returning a u64 number
diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c
index dd745b58ffd8..1aea628ef6b8 100644
--- a/arch/x86/kvm/pmu.c
+++ b/arch/x86/kvm/pmu.c
@@ -264,10 +264,10 @@ static int kvm_pmu_rdpmc_vmware(struct kvm_vcpu *vcpu, unsigned idx, u64 *data)
 		ctr_val = rdtsc();
 		break;
 	case VMWARE_BACKDOOR_PMC_REAL_TIME:
-		ctr_val = ktime_get_boot_ns();
+		ctr_val = ktime_get_boottime_ns();
 		break;
 	case VMWARE_BACKDOOR_PMC_APPARENT_TIME:
-		ctr_val = ktime_get_boot_ns() +
+		ctr_val = ktime_get_boottime_ns() +
 			vcpu->kvm->arch.kvmclock_offset;
 		break;
 	default:
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 83aefd759846..81a0914a1ec1 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1731,7 +1731,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr)
 
 	raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
 	offset = kvm_compute_tsc_offset(vcpu, data);
-	ns = ktime_get_boot_ns();
+	ns = ktime_get_boottime_ns();
 	elapsed = ns - kvm->arch.last_tsc_nsec;
 
 	if (vcpu->arch.virtual_tsc_khz) {
@@ -2073,7 +2073,7 @@ u64 get_kvmclock_ns(struct kvm *kvm)
 	spin_lock(&ka->pvclock_gtod_sync_lock);
 	if (!ka->use_master_clock) {
 		spin_unlock(&ka->pvclock_gtod_sync_lock);
-		return ktime_get_boot_ns() + ka->kvmclock_offset;
+		return ktime_get_boottime_ns() + ka->kvmclock_offset;
 	}
 
 	hv_clock.tsc_timestamp = ka->master_cycle_now;
@@ -2089,7 +2089,7 @@ u64 get_kvmclock_ns(struct kvm *kvm)
 				   &hv_clock.tsc_to_system_mul);
 		ret = __pvclock_read_cycles(&hv_clock, rdtsc());
 	} else
-		ret = ktime_get_boot_ns() + ka->kvmclock_offset;
+		ret = ktime_get_boottime_ns() + ka->kvmclock_offset;
 
 	put_cpu();
 
@@ -2188,7 +2188,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
 	}
 	if (!use_master_clock) {
 		host_tsc = rdtsc();
-		kernel_ns = ktime_get_boot_ns();
+		kernel_ns = ktime_get_boottime_ns();
 	}
 
 	tsc_timestamp = kvm_read_l1_tsc(v, host_tsc);
@@ -9018,7 +9018,7 @@ int kvm_arch_hardware_enable(void)
 	 * before any KVM threads can be running.  Unfortunately, we can't
 	 * bring the TSCs fully up to date with real time, as we aren't yet far
 	 * enough into CPU bringup that we know how much real time has actually
-	 * elapsed; our helper function, ktime_get_boot_ns() will be using boot
+	 * elapsed; our helper function, ktime_get_boottime_ns() will be using boot
 	 * variables that haven't been updated yet.
 	 *
 	 * So we simply find the maximum observed TSC above, then record the
@@ -9246,7 +9246,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
 	mutex_init(&kvm->arch.apic_map_lock);
 	spin_lock_init(&kvm->arch.pvclock_gtod_sync_lock);
 
-	kvm->arch.kvmclock_offset = -ktime_get_boot_ns();
+	kvm->arch.kvmclock_offset = -ktime_get_boottime_ns();
 	pvclock_update_vm_gtod_copy(kvm);
 
 	kvm->arch.guest_can_read_msr_platform_info = true;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index 083bd8114db1..dd6b4b0b5f30 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -837,7 +837,7 @@ static int kfd_ioctl_get_clock_counters(struct file *filep,
 
 	/* No access to rdtsc. Using raw monotonic time */
 	args->cpu_clock_counter = ktime_get_raw_ns();
-	args->system_clock_counter = ktime_get_boot_ns();
+	args->system_clock_counter = ktime_get_boottime_ns();
 
 	/* Since the counter is in nano-seconds we use 1GHz frequency */
 	args->system_clock_freq = 1000000000;
diff --git a/drivers/iio/humidity/dht11.c b/drivers/iio/humidity/dht11.c
index c8159205c77d..4e22b3c3e488 100644
--- a/drivers/iio/humidity/dht11.c
+++ b/drivers/iio/humidity/dht11.c
@@ -149,7 +149,7 @@ static int dht11_decode(struct dht11 *dht11, int offset)
 		return -EIO;
 	}
 
-	dht11->timestamp = ktime_get_boot_ns();
+	dht11->timestamp = ktime_get_boottime_ns();
 	if (hum_int < 4) {  /* DHT22: 100000 = (3*256+232)*100 */
 		dht11->temperature = (((temp_int & 0x7f) << 8) + temp_dec) *
 					((temp_int & 0x80) ? -100 : 100);
@@ -177,7 +177,7 @@ static irqreturn_t dht11_handle_irq(int irq, void *data)
 
 	/* TODO: Consider making the handler safe for IRQ sharing */
 	if (dht11->num_edges < DHT11_EDGES_PER_READ && dht11->num_edges >= 0) {
-		dht11->edges[dht11->num_edges].ts = ktime_get_boot_ns();
+		dht11->edges[dht11->num_edges].ts = ktime_get_boottime_ns();
 		dht11->edges[dht11->num_edges++].value =
 						gpio_get_value(dht11->gpio);
 
@@ -196,7 +196,7 @@ static int dht11_read_raw(struct iio_dev *iio_dev,
 	int ret, timeres, offset;
 
 	mutex_lock(&dht11->lock);
-	if (dht11->timestamp + DHT11_DATA_VALID_TIME < ktime_get_boot_ns()) {
+	if (dht11->timestamp + DHT11_DATA_VALID_TIME < ktime_get_boottime_ns()) {
 		timeres = ktime_get_resolution_ns();
 		dev_dbg(dht11->dev, "current timeresolution: %dns\n", timeres);
 		if (timeres > DHT11_MIN_TIMERES) {
@@ -322,7 +322,7 @@ static int dht11_probe(struct platform_device *pdev)
 		return -EINVAL;
 	}
 
-	dht11->timestamp = ktime_get_boot_ns() - DHT11_DATA_VALID_TIME - 1;
+	dht11->timestamp = ktime_get_boottime_ns() - DHT11_DATA_VALID_TIME - 1;
 	dht11->num_edges = -1;
 
 	platform_set_drvdata(pdev, iio);
diff --git a/drivers/iio/industrialio-core.c b/drivers/iio/industrialio-core.c
index f5a4581302f4..16008f862d19 100644
--- a/drivers/iio/industrialio-core.c
+++ b/drivers/iio/industrialio-core.c
@@ -231,9 +231,9 @@ s64 iio_get_time_ns(const struct iio_dev *indio_dev)
 		ktime_get_coarse_ts64(&tp);
 		return timespec64_to_ns(&tp);
 	case CLOCK_BOOTTIME:
-		return ktime_get_boot_ns();
+		return ktime_get_boottime_ns();
 	case CLOCK_TAI:
-		return ktime_get_tai_ns();
+		return ktime_get_clocktai_ns();
 	default:
 		BUG();
 	}
diff --git a/drivers/infiniband/hw/mlx4/alias_GUID.c b/drivers/infiniband/hw/mlx4/alias_GUID.c
index 2a0b59a4b6eb..cca414ecfcd5 100644
--- a/drivers/infiniband/hw/mlx4/alias_GUID.c
+++ b/drivers/infiniband/hw/mlx4/alias_GUID.c
@@ -310,7 +310,7 @@ static void aliasguid_query_handler(int status,
 	if (status) {
 		pr_debug("(port: %d) failed: status = %d\n",
 			 cb_ctx->port, status);
-		rec->time_to_run = ktime_get_boot_ns() + 1 * NSEC_PER_SEC;
+		rec->time_to_run = ktime_get_boottime_ns() + 1 * NSEC_PER_SEC;
 		goto out;
 	}
 
@@ -416,7 +416,7 @@ next_entry:
 			 be64_to_cpu((__force __be64)rec->guid_indexes),
 			 be64_to_cpu((__force __be64)applied_guid_indexes),
 			 be64_to_cpu((__force __be64)declined_guid_indexes));
-		rec->time_to_run = ktime_get_boot_ns() +
+		rec->time_to_run = ktime_get_boottime_ns() +
 			resched_delay_sec * NSEC_PER_SEC;
 	} else {
 		rec->status = MLX4_GUID_INFO_STATUS_SET;
@@ -709,7 +709,7 @@ static int get_low_record_time_index(struct mlx4_ib_dev *dev, u8 port,
 		}
 	}
 	if (resched_delay_sec) {
-		u64 curr_time = ktime_get_boot_ns();
+		u64 curr_time = ktime_get_boottime_ns();
 
 		*resched_delay_sec = (low_record_time < curr_time) ? 0 :
 			div_u64((low_record_time - curr_time), NSEC_PER_SEC);
diff --git a/drivers/leds/trigger/ledtrig-activity.c b/drivers/leds/trigger/ledtrig-activity.c
index bcbf41c90c30..0f130dd998b3 100644
--- a/drivers/leds/trigger/ledtrig-activity.c
+++ b/drivers/leds/trigger/ledtrig-activity.c
@@ -73,7 +73,7 @@ static void led_activity_function(struct timer_list *t)
 	 * down to 16us, ensuring we won't overflow 32-bit computations below
 	 * even up to 3k CPUs, while keeping divides cheap on smaller systems.
 	 */
-	curr_boot = ktime_get_boot_ns() * cpus;
+	curr_boot = ktime_get_boottime_ns() * cpus;
 	diff_boot = (curr_boot - activity_data->last_boot) >> 16;
 	diff_used = (curr_used - activity_data->last_used) >> 16;
 	activity_data->last_boot = curr_boot;
diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/ftm-initiator.c b/drivers/net/wireless/intel/iwlwifi/mvm/ftm-initiator.c
index fec38a47696e..9f4b117db9d7 100644
--- a/drivers/net/wireless/intel/iwlwifi/mvm/ftm-initiator.c
+++ b/drivers/net/wireless/intel/iwlwifi/mvm/ftm-initiator.c
@@ -93,7 +93,7 @@ void iwl_mvm_ftm_restart(struct iwl_mvm *mvm)
 	struct cfg80211_pmsr_result result = {
 		.status = NL80211_PMSR_STATUS_FAILURE,
 		.final = 1,
-		.host_time = ktime_get_boot_ns(),
+		.host_time = ktime_get_boottime_ns(),
 		.type = NL80211_PMSR_TYPE_FTM,
 	};
 	int i;
diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/rx.c b/drivers/net/wireless/intel/iwlwifi/mvm/rx.c
index fbd3014e8b82..160b0db27103 100644
--- a/drivers/net/wireless/intel/iwlwifi/mvm/rx.c
+++ b/drivers/net/wireless/intel/iwlwifi/mvm/rx.c
@@ -555,7 +555,7 @@ void iwl_mvm_rx_rx_mpdu(struct iwl_mvm *mvm, struct napi_struct *napi,
 
 	if (unlikely(ieee80211_is_beacon(hdr->frame_control) ||
 		     ieee80211_is_probe_resp(hdr->frame_control)))
-		rx_status->boottime_ns = ktime_get_boot_ns();
+		rx_status->boottime_ns = ktime_get_boottime_ns();
 
 	/* Take a reference briefly to kick off a d0i3 entry delay so
 	 * we can handle bursts of RX packets without toggling the
diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/rxmq.c b/drivers/net/wireless/intel/iwlwifi/mvm/rxmq.c
index 1824566d08fc..64f950501287 100644
--- a/drivers/net/wireless/intel/iwlwifi/mvm/rxmq.c
+++ b/drivers/net/wireless/intel/iwlwifi/mvm/rxmq.c
@@ -1684,7 +1684,7 @@ void iwl_mvm_rx_mpdu_mq(struct iwl_mvm *mvm, struct napi_struct *napi,
 
 		if (unlikely(ieee80211_is_beacon(hdr->frame_control) ||
 			     ieee80211_is_probe_resp(hdr->frame_control)))
-			rx_status->boottime_ns = ktime_get_boot_ns();
+			rx_status->boottime_ns = ktime_get_boottime_ns();
 	}
 
 	if (iwl_mvm_create_skb(mvm, skb, hdr, len, crypt_len, rxb)) {
diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/utils.c b/drivers/net/wireless/intel/iwlwifi/mvm/utils.c
index b9914efc55c4..724a25ab32f2 100644
--- a/drivers/net/wireless/intel/iwlwifi/mvm/utils.c
+++ b/drivers/net/wireless/intel/iwlwifi/mvm/utils.c
@@ -1443,7 +1443,7 @@ void iwl_mvm_get_sync_time(struct iwl_mvm *mvm, u32 *gp2, u64 *boottime)
 	}
 
 	*gp2 = iwl_mvm_get_systime(mvm);
-	*boottime = ktime_get_boot_ns();
+	*boottime = ktime_get_boottime_ns();
 
 	if (!ps_disabled) {
 		mvm->ps_disabled = ps_disabled;
diff --git a/drivers/net/wireless/mac80211_hwsim.c b/drivers/net/wireless/mac80211_hwsim.c
index 60ca13e0f15b..52ee165d6f1d 100644
--- a/drivers/net/wireless/mac80211_hwsim.c
+++ b/drivers/net/wireless/mac80211_hwsim.c
@@ -1274,7 +1274,7 @@ static bool mac80211_hwsim_tx_frame_no_nl(struct ieee80211_hw *hw,
 	 */
 	if (ieee80211_is_beacon(hdr->frame_control) ||
 	    ieee80211_is_probe_resp(hdr->frame_control)) {
-		rx_status.boottime_ns = ktime_get_boot_ns();
+		rx_status.boottime_ns = ktime_get_boottime_ns();
 		now = data->abs_bcn_ts;
 	} else {
 		now = mac80211_hwsim_get_tsf_raw();
diff --git a/drivers/net/wireless/ti/wlcore/main.c b/drivers/net/wireless/ti/wlcore/main.c
index c9a485ecee7b..b74dc8bc9755 100644
--- a/drivers/net/wireless/ti/wlcore/main.c
+++ b/drivers/net/wireless/ti/wlcore/main.c
@@ -483,7 +483,7 @@ static int wlcore_fw_status(struct wl1271 *wl, struct wl_fw_status *status)
 	}
 
 	/* update the host-chipset time offset */
-	wl->time_offset = (ktime_get_boot_ns() >> 10) -
+	wl->time_offset = (ktime_get_boottime_ns() >> 10) -
 		(s64)(status->fw_localtime);
 
 	wl->fw_fast_lnk_map = status->link_fast_bitmap;
diff --git a/drivers/net/wireless/ti/wlcore/rx.c b/drivers/net/wireless/ti/wlcore/rx.c
index d96bb602fae6..307fab21050b 100644
--- a/drivers/net/wireless/ti/wlcore/rx.c
+++ b/drivers/net/wireless/ti/wlcore/rx.c
@@ -93,7 +93,7 @@ static void wl1271_rx_status(struct wl1271 *wl,
 	}
 
 	if (beacon || probe_rsp)
-		status->boottime_ns = ktime_get_boot_ns();
+		status->boottime_ns = ktime_get_boottime_ns();
 
 	if (beacon)
 		wlcore_set_pending_regdomain_ch(wl, (u16)desc->channel,
diff --git a/drivers/net/wireless/ti/wlcore/tx.c b/drivers/net/wireless/ti/wlcore/tx.c
index 057c6be330e7..90e56d4c3df3 100644
--- a/drivers/net/wireless/ti/wlcore/tx.c
+++ b/drivers/net/wireless/ti/wlcore/tx.c
@@ -273,7 +273,7 @@ static void wl1271_tx_fill_hdr(struct wl1271 *wl, struct wl12xx_vif *wlvif,
 	}
 
 	/* configure packet life time */
-	hosttime = (ktime_get_boot_ns() >> 10);
+	hosttime = (ktime_get_boottime_ns() >> 10);
 	desc->start_time = cpu_to_le32(hosttime - wl->time_offset);
 
 	is_dummy = wl12xx_is_dummy_packet(wl, skb);
diff --git a/drivers/net/wireless/virt_wifi.c b/drivers/net/wireless/virt_wifi.c
index 606999f102eb..be92e1220284 100644
--- a/drivers/net/wireless/virt_wifi.c
+++ b/drivers/net/wireless/virt_wifi.c
@@ -172,7 +172,7 @@ static void virt_wifi_scan_result(struct work_struct *work)
 	informed_bss = cfg80211_inform_bss(wiphy, &channel_5ghz,
 					   CFG80211_BSS_FTYPE_PRESP,
 					   fake_router_bssid,
-					   ktime_get_boot_ns(),
+					   ktime_get_boottime_ns(),
 					   WLAN_CAPABILITY_ESS, 0,
 					   (void *)&ssid, sizeof(ssid),
 					   DBM_TO_MBM(-50), GFP_KERNEL);
diff --git a/include/linux/timekeeping.h b/include/linux/timekeeping.h
index a8ab0f143ac4..fd6123722ea8 100644
--- a/include/linux/timekeeping.h
+++ b/include/linux/timekeeping.h
@@ -131,12 +131,12 @@ static inline u64 ktime_get_real_ns(void)
 	return ktime_to_ns(ktime_get_real());
 }
 
-static inline u64 ktime_get_boot_ns(void)
+static inline u64 ktime_get_boottime_ns(void)
 {
 	return ktime_to_ns(ktime_get_boottime());
 }
 
-static inline u64 ktime_get_tai_ns(void)
+static inline u64 ktime_get_clocktai_ns(void)
 {
 	return ktime_to_ns(ktime_get_clocktai());
 }
diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 87dae868707e..f8058e92f59d 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -2010,7 +2010,7 @@ enum cfg80211_signal_type {
  *	received by the device (not just by the host, in case it was
  *	buffered on the device) and be accurate to about 10ms.
  *	If the frame isn't buffered, just passing the return value of
- *	ktime_get_boot_ns() is likely appropriate.
+ *	ktime_get_boottime_ns() is likely appropriate.
  * @parent_tsf: the time at the start of reception of the first octet of the
  *	timestamp field of the frame. The time is the TSF of the BSS specified
  *	by %parent_bssid.
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index ef63d26622f2..96c8928b468b 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1666,7 +1666,7 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr)
 	if (err < 0)
 		goto free_prog;
 
-	prog->aux->load_time = ktime_get_boot_ns();
+	prog->aux->load_time = ktime_get_boottime_ns();
 	err = bpf_obj_name_cpy(prog->aux->name, attr->prog_name);
 	if (err)
 		goto free_prog;
diff --git a/kernel/events/core.c b/kernel/events/core.c
index abbd4b3b96c2..e2d014395fc6 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -10680,11 +10680,11 @@ static int perf_event_set_clock(struct perf_event *event, clockid_t clk_id)
 		break;
 
 	case CLOCK_BOOTTIME:
-		event->clock = &ktime_get_boot_ns;
+		event->clock = &ktime_get_boottime_ns;
 		break;
 
 	case CLOCK_TAI:
-		event->clock = &ktime_get_tai_ns;
+		event->clock = &ktime_get_clocktai_ns;
 		break;
 
 	default:
diff --git a/kernel/fork.c b/kernel/fork.c
index 75675b9bf6df..4722f1a320bf 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -2139,7 +2139,7 @@ static __latent_entropy struct task_struct *copy_process(
 	 */
 
 	p->start_time = ktime_get_ns();
-	p->real_start_time = ktime_get_boot_ns();
+	p->real_start_time = ktime_get_boottime_ns();
 
 	/*
 	 * Make it visible to the rest of the system, but dont wake it up yet.
-- 
cgit v1.2.3


From 4c54294d01e605a9f992361b924c5d8b12822a6d Mon Sep 17 00:00:00 2001
From: "Jason A. Donenfeld" <Jason@zx2c4.com>
Date: Fri, 21 Jun 2019 22:32:49 +0200
Subject: timekeeping: Add missing _ns functions for coarse accessors

This further unifies the accessors for the fast and coarse functions, so
that the same types of functions are available for each. There was also
a bit of confusion with the documentation, which previously advertised a
function that has never existed. Finally, the vanilla ktime_get_coarse()
was omitted from the API originally, so this fixes that oversight.

Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Arnd Bergmann <arnd@arndb.de>
Link: https://lkml.kernel.org/r/20190621203249.3909-3-Jason@zx2c4.com
---
 Documentation/core-api/timekeeping.rst | 10 +++++++---
 include/linux/timekeeping.h            | 28 ++++++++++++++++++++++++++++
 2 files changed, 35 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/core-api/timekeeping.rst b/Documentation/core-api/timekeeping.rst
index 4d92b1ac8024..15fc58e85ef9 100644
--- a/Documentation/core-api/timekeeping.rst
+++ b/Documentation/core-api/timekeeping.rst
@@ -99,16 +99,20 @@ Coarse and fast_ns access
 
 Some additional variants exist for more specialized cases:
 
-.. c:function:: ktime_t ktime_get_coarse_boottime( void )
+.. c:function:: ktime_t ktime_get_coarse( void )
+		ktime_t ktime_get_coarse_boottime( void )
 		ktime_t ktime_get_coarse_real( void )
 		ktime_t ktime_get_coarse_clocktai( void )
-		ktime_t ktime_get_coarse_raw( void )
+
+.. c:function:: u64 ktime_get_coarse_ns( void )
+		u64 ktime_get_coarse_boot_ns( void )
+		u64 ktime_get_coarse_real_ns( void )
+		u64 ktime_get_coarse_clocktai_ns( void )
 
 .. c:function:: void ktime_get_coarse_ts64( struct timespec64 * )
 		void ktime_get_coarse_boottime_ts64( struct timespec64 * )
 		void ktime_get_coarse_real_ts64( struct timespec64 * )
 		void ktime_get_coarse_clocktai_ts64( struct timespec64 * )
-		void ktime_get_coarse_raw_ts64( struct timespec64 * )
 
 	These are quicker than the non-coarse versions, but less accurate,
 	corresponding to CLOCK_MONONOTNIC_COARSE and CLOCK_REALTIME_COARSE
diff --git a/include/linux/timekeeping.h b/include/linux/timekeeping.h
index fd6123722ea8..dcffc00755f2 100644
--- a/include/linux/timekeeping.h
+++ b/include/linux/timekeeping.h
@@ -113,6 +113,34 @@ static inline ktime_t ktime_get_coarse_clocktai(void)
 	return ktime_get_coarse_with_offset(TK_OFFS_TAI);
 }
 
+static inline ktime_t ktime_get_coarse(void)
+{
+	struct timespec64 ts;
+
+	ktime_get_coarse_ts64(&ts);
+	return timespec64_to_ktime(ts);
+}
+
+static inline u64 ktime_get_coarse_ns(void)
+{
+	return ktime_to_ns(ktime_get_coarse());
+}
+
+static inline u64 ktime_get_coarse_real_ns(void)
+{
+	return ktime_to_ns(ktime_get_coarse_real());
+}
+
+static inline u64 ktime_get_coarse_boot_ns(void)
+{
+	return ktime_to_ns(ktime_get_coarse_boottime());
+}
+
+static inline u64 ktime_get_coarse_clocktai_ns(void)
+{
+	return ktime_to_ns(ktime_get_coarse_clocktai());
+}
+
 /**
  * ktime_mono_to_real - Convert monotonic time to clock realtime
  */
-- 
cgit v1.2.3


From 62de37da9f382455b983f2f92b10012109005278 Mon Sep 17 00:00:00 2001
From: Mika Westerberg <mika.westerberg@linux.intel.com>
Date: Thu, 20 Jun 2019 15:26:29 +0300
Subject: mtd: spi-nor: intel-spi: Convert to use SPDX identifier

This gets rid of the license boilerplate duplicated in each file.

No functional changes intended.

Signed-off-by: Mika Westerberg <mika.westerberg@linux.intel.com>
Signed-off-by: Tudor Ambarus <tudor.ambarus@microchip.com>
---
 drivers/mtd/spi-nor/intel-spi-pci.c      | 5 +----
 drivers/mtd/spi-nor/intel-spi-platform.c | 5 +----
 drivers/mtd/spi-nor/intel-spi.c          | 5 +----
 drivers/mtd/spi-nor/intel-spi.h          | 5 +----
 include/linux/platform_data/intel-spi.h  | 5 +----
 5 files changed, 5 insertions(+), 20 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mtd/spi-nor/intel-spi-pci.c b/drivers/mtd/spi-nor/intel-spi-pci.c
index 578f0c74e536..1b9c2d99ba38 100644
--- a/drivers/mtd/spi-nor/intel-spi-pci.c
+++ b/drivers/mtd/spi-nor/intel-spi-pci.c
@@ -1,12 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0
 /*
  * Intel PCH/PCU SPI flash PCI driver.
  *
  * Copyright (C) 2016, Intel Corporation
  * Author: Mika Westerberg <mika.westerberg@linux.intel.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
  */
 
 #include <linux/ioport.h>
diff --git a/drivers/mtd/spi-nor/intel-spi-platform.c b/drivers/mtd/spi-nor/intel-spi-platform.c
index 5c943df9398f..25b18804e9bb 100644
--- a/drivers/mtd/spi-nor/intel-spi-platform.c
+++ b/drivers/mtd/spi-nor/intel-spi-platform.c
@@ -1,12 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0
 /*
  * Intel PCH/PCU SPI flash platform driver.
  *
  * Copyright (C) 2016, Intel Corporation
  * Author: Mika Westerberg <mika.westerberg@linux.intel.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
  */
 
 #include <linux/ioport.h>
diff --git a/drivers/mtd/spi-nor/intel-spi.c b/drivers/mtd/spi-nor/intel-spi.c
index d60cbf23d9aa..021cef930f9f 100644
--- a/drivers/mtd/spi-nor/intel-spi.c
+++ b/drivers/mtd/spi-nor/intel-spi.c
@@ -1,12 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0
 /*
  * Intel PCH/PCU SPI flash driver.
  *
  * Copyright (C) 2016, Intel Corporation
  * Author: Mika Westerberg <mika.westerberg@linux.intel.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
  */
 
 #include <linux/err.h>
diff --git a/drivers/mtd/spi-nor/intel-spi.h b/drivers/mtd/spi-nor/intel-spi.h
index 5ab7dc250050..b03bf296fda3 100644
--- a/drivers/mtd/spi-nor/intel-spi.h
+++ b/drivers/mtd/spi-nor/intel-spi.h
@@ -1,12 +1,9 @@
+/* SPDX-License-Identifier: GPL-2.0 */
 /*
  * Intel PCH/PCU SPI flash driver.
  *
  * Copyright (C) 2016, Intel Corporation
  * Author: Mika Westerberg <mika.westerberg@linux.intel.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
  */
 
 #ifndef INTEL_SPI_H
diff --git a/include/linux/platform_data/intel-spi.h b/include/linux/platform_data/intel-spi.h
index 942b0c3f8f08..001f377fb5ef 100644
--- a/include/linux/platform_data/intel-spi.h
+++ b/include/linux/platform_data/intel-spi.h
@@ -1,12 +1,9 @@
+/* SPDX-License-Identifier: GPL-2.0 */
 /*
  * Intel PCH/PCU SPI flash driver.
  *
  * Copyright (C) 2016, Intel Corporation
  * Author: Mika Westerberg <mika.westerberg@linux.intel.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
  */
 
 #ifndef INTEL_SPI_PDATA_H
-- 
cgit v1.2.3


From 32e29396f00e7849ea0b1aeebae097fc1de6e979 Mon Sep 17 00:00:00 2001
From: Vincenzo Frascino <vincenzo.frascino@arm.com>
Date: Sat, 22 Jun 2019 15:02:07 +0200
Subject: hrtimer: Split out hrtimer defines into separate header

To avoid include dependency hell, split out the hrtimer defines which are
required in the upcoming VDSO library into a separate header file.

[ tglx: Split out from the VDSO library patch and included ktime.h as
        the new header depends on it. ]

Signed-off-by: Vincenzo Frascino <vincenzo.frascino@arm.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Shijith Thotton <sthotton@marvell.com>
Tested-by: Andre Przywara <andre.przywara@arm.com>
Cc: linux-arch@vger.kernel.org
Cc: linux-arm-kernel@lists.infradead.org
Cc: linux-mips@vger.kernel.org
Cc: linux-kselftest@vger.kernel.org
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Russell King <linux@armlinux.org.uk>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: Paul Burton <paul.burton@mips.com>
Cc: Daniel Lezcano <daniel.lezcano@linaro.org>
Cc: Mark Salyzyn <salyzyn@android.com>
Cc: Peter Collingbourne <pcc@google.com>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Dmitry Safonov <0x7f454c46@gmail.com>
Cc: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Cc: Huw Davies <huw@codeweavers.com>
Link: https://lkml.kernel.org/r/20190621095252.32307-3-vincenzo.frascino@arm.com
---
 include/linux/hrtimer.h      | 16 +---------------
 include/linux/hrtimer_defs.h | 27 +++++++++++++++++++++++++++
 2 files changed, 28 insertions(+), 15 deletions(-)
 create mode 100644 include/linux/hrtimer_defs.h

(limited to 'include/linux')

diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index 2e8957eac4d4..4971100a8cab 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -12,8 +12,8 @@
 #ifndef _LINUX_HRTIMER_H
 #define _LINUX_HRTIMER_H
 
+#include <linux/hrtimer_defs.h>
 #include <linux/rbtree.h>
-#include <linux/ktime.h>
 #include <linux/init.h>
 #include <linux/list.h>
 #include <linux/percpu.h>
@@ -298,26 +298,12 @@ struct clock_event_device;
 
 extern void hrtimer_interrupt(struct clock_event_device *dev);
 
-/*
- * The resolution of the clocks. The resolution value is returned in
- * the clock_getres() system call to give application programmers an
- * idea of the (in)accuracy of timers. Timer values are rounded up to
- * this resolution values.
- */
-# define HIGH_RES_NSEC		1
-# define KTIME_HIGH_RES		(HIGH_RES_NSEC)
-# define MONOTONIC_RES_NSEC	HIGH_RES_NSEC
-# define KTIME_MONOTONIC_RES	KTIME_HIGH_RES
-
 extern void clock_was_set_delayed(void);
 
 extern unsigned int hrtimer_resolution;
 
 #else
 
-# define MONOTONIC_RES_NSEC	LOW_RES_NSEC
-# define KTIME_MONOTONIC_RES	KTIME_LOW_RES
-
 #define hrtimer_resolution	(unsigned int)LOW_RES_NSEC
 
 static inline void clock_was_set_delayed(void) { }
diff --git a/include/linux/hrtimer_defs.h b/include/linux/hrtimer_defs.h
new file mode 100644
index 000000000000..2d3e3c5fb946
--- /dev/null
+++ b/include/linux/hrtimer_defs.h
@@ -0,0 +1,27 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_HRTIMER_DEFS_H
+#define _LINUX_HRTIMER_DEFS_H
+
+#include <linux/ktime.h>
+
+#ifdef CONFIG_HIGH_RES_TIMERS
+
+/*
+ * The resolution of the clocks. The resolution value is returned in
+ * the clock_getres() system call to give application programmers an
+ * idea of the (in)accuracy of timers. Timer values are rounded up to
+ * this resolution values.
+ */
+# define HIGH_RES_NSEC		1
+# define KTIME_HIGH_RES		(HIGH_RES_NSEC)
+# define MONOTONIC_RES_NSEC	HIGH_RES_NSEC
+# define KTIME_MONOTONIC_RES	KTIME_HIGH_RES
+
+#else
+
+# define MONOTONIC_RES_NSEC	LOW_RES_NSEC
+# define KTIME_MONOTONIC_RES	KTIME_LOW_RES
+
+#endif
+
+#endif
-- 
cgit v1.2.3


From 438ac88009bcb10f9ced07fbb4b32d5377ee936b Mon Sep 17 00:00:00 2001
From: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Date: Wed, 19 Jun 2019 23:46:28 +0200
Subject: net: fastopen: robustness and endianness fixes for SipHash

Some changes to the TCP fastopen code to make it more robust
against future changes in the choice of key/cookie size, etc.

- Instead of keeping the SipHash key in an untyped u8[] buffer
  and casting it to the right type upon use, use the correct
  type directly. This ensures that the key will appear at the
  correct alignment if we ever change the way these data
  structures are allocated. (Currently, they are only allocated
  via kmalloc, so they always appear at the correct alignment.)

- Use DIV_ROUND_UP when sizing the u64[] array to hold the
  cookie, so it is always of sufficient size, even if
  TCP_FASTOPEN_COOKIE_MAX is no longer a multiple of 8.

- Drop the 'len' parameter from the tcp_fastopen_reset_cipher()
  function, which is no longer used.

- Add endian swabbing when setting the keys and calculating the hash,
  to ensure that cookie values are the same for a given key and
  source/destination address pair regardless of the endianness of
  the server.

Note that none of these are functional changes wrt the current
state of the code, with the exception of the swabbing, which only
affects big endian systems.

Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/tcp.h        |  2 +-
 include/net/tcp.h          |  8 ++++----
 net/ipv4/sysctl_net_ipv4.c |  3 +--
 net/ipv4/tcp.c             |  3 +--
 net/ipv4/tcp_fastopen.c    | 35 +++++++++++++++++------------------
 5 files changed, 24 insertions(+), 27 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 2689b0b0b68a..f3a85a7fb4b1 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -58,7 +58,7 @@ static inline unsigned int tcp_optlen(const struct sk_buff *skb)
 
 /* TCP Fast Open Cookie as stored in memory */
 struct tcp_fastopen_cookie {
-	u64	val[TCP_FASTOPEN_COOKIE_MAX / sizeof(u64)];
+	__le64	val[DIV_ROUND_UP(TCP_FASTOPEN_COOKIE_MAX, sizeof(u64))];
 	s8	len;
 	bool	exp;	/* In RFC6994 experimental option format */
 };
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 573c9e9b0d72..9d36cc88d043 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -43,6 +43,7 @@
 #include <linux/seq_file.h>
 #include <linux/memcontrol.h>
 #include <linux/bpf-cgroup.h>
+#include <linux/siphash.h>
 
 extern struct inet_hashinfo tcp_hashinfo;
 
@@ -1612,8 +1613,7 @@ void tcp_free_fastopen_req(struct tcp_sock *tp);
 void tcp_fastopen_destroy_cipher(struct sock *sk);
 void tcp_fastopen_ctx_destroy(struct net *net);
 int tcp_fastopen_reset_cipher(struct net *net, struct sock *sk,
-			      void *primary_key, void *backup_key,
-			      unsigned int len);
+			      void *primary_key, void *backup_key);
 void tcp_fastopen_add_skb(struct sock *sk, struct sk_buff *skb);
 struct sock *tcp_try_fastopen(struct sock *sk, struct sk_buff *skb,
 			      struct request_sock *req,
@@ -1623,14 +1623,14 @@ void tcp_fastopen_init_key_once(struct net *net);
 bool tcp_fastopen_cookie_check(struct sock *sk, u16 *mss,
 			     struct tcp_fastopen_cookie *cookie);
 bool tcp_fastopen_defer_connect(struct sock *sk, int *err);
-#define TCP_FASTOPEN_KEY_LENGTH 16
+#define TCP_FASTOPEN_KEY_LENGTH sizeof(siphash_key_t)
 #define TCP_FASTOPEN_KEY_MAX 2
 #define TCP_FASTOPEN_KEY_BUF_LENGTH \
 	(TCP_FASTOPEN_KEY_LENGTH * TCP_FASTOPEN_KEY_MAX)
 
 /* Fastopen key context */
 struct tcp_fastopen_context {
-	__u8		key[TCP_FASTOPEN_KEY_MAX][TCP_FASTOPEN_KEY_LENGTH];
+	siphash_key_t	key[TCP_FASTOPEN_KEY_MAX];
 	int		num;
 	struct rcu_head	rcu;
 };
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 7d802acde040..7d66306b5f39 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -365,8 +365,7 @@ static int proc_tcp_fastopen_key(struct ctl_table *table, int write,
 			}
 		}
 		tcp_fastopen_reset_cipher(net, NULL, key,
-					  backup_data ? key + 4 : NULL,
-					  TCP_FASTOPEN_KEY_LENGTH);
+					  backup_data ? key + 4 : NULL);
 	}
 
 bad_key:
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index efd7f2b1d1f0..47c217905864 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2822,8 +2822,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
 		if (optlen == TCP_FASTOPEN_KEY_BUF_LENGTH)
 			backup_key = key + TCP_FASTOPEN_KEY_LENGTH;
 
-		return tcp_fastopen_reset_cipher(net, sk, key, backup_key,
-						 TCP_FASTOPEN_KEY_LENGTH);
+		return tcp_fastopen_reset_cipher(net, sk, key, backup_key);
 	}
 	default:
 		/* fallthru */
diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c
index f918599181dd..3fd451271a70 100644
--- a/net/ipv4/tcp_fastopen.c
+++ b/net/ipv4/tcp_fastopen.c
@@ -7,7 +7,6 @@
 #include <linux/tcp.h>
 #include <linux/rcupdate.h>
 #include <linux/rculist.h>
-#include <linux/siphash.h>
 #include <net/inetpeer.h>
 #include <net/tcp.h>
 
@@ -31,7 +30,7 @@ void tcp_fastopen_init_key_once(struct net *net)
 	 * for a valid cookie, so this is an acceptable risk.
 	 */
 	get_random_bytes(key, sizeof(key));
-	tcp_fastopen_reset_cipher(net, NULL, key, NULL, sizeof(key));
+	tcp_fastopen_reset_cipher(net, NULL, key, NULL);
 }
 
 static void tcp_fastopen_ctx_free(struct rcu_head *head)
@@ -68,8 +67,7 @@ void tcp_fastopen_ctx_destroy(struct net *net)
 }
 
 int tcp_fastopen_reset_cipher(struct net *net, struct sock *sk,
-			      void *primary_key, void *backup_key,
-			      unsigned int len)
+			      void *primary_key, void *backup_key)
 {
 	struct tcp_fastopen_context *ctx, *octx;
 	struct fastopen_queue *q;
@@ -81,9 +79,11 @@ int tcp_fastopen_reset_cipher(struct net *net, struct sock *sk,
 		goto out;
 	}
 
-	memcpy(ctx->key[0], primary_key, len);
+	ctx->key[0].key[0] = get_unaligned_le64(primary_key);
+	ctx->key[0].key[1] = get_unaligned_le64(primary_key + 8);
 	if (backup_key) {
-		memcpy(ctx->key[1], backup_key, len);
+		ctx->key[1].key[0] = get_unaligned_le64(backup_key);
+		ctx->key[1].key[1] = get_unaligned_le64(backup_key + 8);
 		ctx->num = 2;
 	} else {
 		ctx->num = 1;
@@ -110,19 +110,18 @@ out:
 
 static bool __tcp_fastopen_cookie_gen_cipher(struct request_sock *req,
 					     struct sk_buff *syn,
-					     const u8 *key,
+					     const siphash_key_t *key,
 					     struct tcp_fastopen_cookie *foc)
 {
-	BUILD_BUG_ON(TCP_FASTOPEN_KEY_LENGTH != sizeof(siphash_key_t));
 	BUILD_BUG_ON(TCP_FASTOPEN_COOKIE_SIZE != sizeof(u64));
 
 	if (req->rsk_ops->family == AF_INET) {
 		const struct iphdr *iph = ip_hdr(syn);
 
-		foc->val[0] = siphash(&iph->saddr,
-				      sizeof(iph->saddr) +
-				      sizeof(iph->daddr),
-				      (const siphash_key_t *)key);
+		foc->val[0] = cpu_to_le64(siphash(&iph->saddr,
+					  sizeof(iph->saddr) +
+					  sizeof(iph->daddr),
+					  key));
 		foc->len = TCP_FASTOPEN_COOKIE_SIZE;
 		return true;
 	}
@@ -130,10 +129,10 @@ static bool __tcp_fastopen_cookie_gen_cipher(struct request_sock *req,
 	if (req->rsk_ops->family == AF_INET6) {
 		const struct ipv6hdr *ip6h = ipv6_hdr(syn);
 
-		foc->val[0] = siphash(&ip6h->saddr,
-				      sizeof(ip6h->saddr) +
-				      sizeof(ip6h->daddr),
-				      (const siphash_key_t *)key);
+		foc->val[0] = cpu_to_le64(siphash(&ip6h->saddr,
+					  sizeof(ip6h->saddr) +
+					  sizeof(ip6h->daddr),
+					  key));
 		foc->len = TCP_FASTOPEN_COOKIE_SIZE;
 		return true;
 	}
@@ -154,7 +153,7 @@ static void tcp_fastopen_cookie_gen(struct sock *sk,
 	rcu_read_lock();
 	ctx = tcp_fastopen_get_ctx(sk);
 	if (ctx)
-		__tcp_fastopen_cookie_gen_cipher(req, syn, ctx->key[0], foc);
+		__tcp_fastopen_cookie_gen_cipher(req, syn, &ctx->key[0], foc);
 	rcu_read_unlock();
 }
 
@@ -218,7 +217,7 @@ static int tcp_fastopen_cookie_gen_check(struct sock *sk,
 	if (!ctx)
 		goto out;
 	for (i = 0; i < tcp_fastopen_context_len(ctx); i++) {
-		__tcp_fastopen_cookie_gen_cipher(req, syn, ctx->key[i], foc);
+		__tcp_fastopen_cookie_gen_cipher(req, syn, &ctx->key[i], foc);
 		if (tcp_fastopen_cookie_match(foc, orig)) {
 			ret = i + 1;
 			goto out;
-- 
cgit v1.2.3


From caa759323c73676b3e48c8d9c86093c88b4aba97 Mon Sep 17 00:00:00 2001
From: Nadav Amit <namit@vmware.com>
Date: Wed, 12 Jun 2019 23:48:05 -0700
Subject: smp: Remove smp_call_function() and on_each_cpu() return values

The return value is fixed. Remove it and amend the callers.

[ tglx: Fixup arm/bL_switcher and powerpc/rtas ]

Signed-off-by: Nadav Amit <namit@vmware.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Richard Henderson <rth@twiddle.net>
Cc: Ivan Kokshaysky <ink@jurassic.park.msu.ru>
Cc: Matt Turner <mattst88@gmail.com>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Link: https://lkml.kernel.org/r/20190613064813.8102-2-namit@vmware.com
---
 arch/alpha/kernel/smp.c       | 19 +++++--------------
 arch/alpha/oprofile/common.c  |  6 +++---
 arch/arm/common/bL_switcher.c |  6 ++----
 arch/ia64/kernel/perfmon.c    | 12 ++----------
 arch/ia64/kernel/uncached.c   |  8 ++++----
 arch/powerpc/kernel/rtas.c    |  3 +--
 arch/x86/lib/cache-smp.c      |  3 ++-
 drivers/char/agp/generic.c    |  3 +--
 include/linux/smp.h           |  7 +++----
 kernel/smp.c                  | 10 +++-------
 kernel/up.c                   |  3 +--
 11 files changed, 27 insertions(+), 53 deletions(-)

(limited to 'include/linux')

diff --git a/arch/alpha/kernel/smp.c b/arch/alpha/kernel/smp.c
index d0dccae53ba9..5f90df30be20 100644
--- a/arch/alpha/kernel/smp.c
+++ b/arch/alpha/kernel/smp.c
@@ -614,8 +614,7 @@ void
 smp_imb(void)
 {
 	/* Must wait other processors to flush their icache before continue. */
-	if (on_each_cpu(ipi_imb, NULL, 1))
-		printk(KERN_CRIT "smp_imb: timed out\n");
+	on_each_cpu(ipi_imb, NULL, 1);
 }
 EXPORT_SYMBOL(smp_imb);
 
@@ -630,9 +629,7 @@ flush_tlb_all(void)
 {
 	/* Although we don't have any data to pass, we do want to
 	   synchronize with the other processors.  */
-	if (on_each_cpu(ipi_flush_tlb_all, NULL, 1)) {
-		printk(KERN_CRIT "flush_tlb_all: timed out\n");
-	}
+	on_each_cpu(ipi_flush_tlb_all, NULL, 1);
 }
 
 #define asn_locked() (cpu_data[smp_processor_id()].asn_lock)
@@ -667,9 +664,7 @@ flush_tlb_mm(struct mm_struct *mm)
 		}
 	}
 
-	if (smp_call_function(ipi_flush_tlb_mm, mm, 1)) {
-		printk(KERN_CRIT "flush_tlb_mm: timed out\n");
-	}
+	smp_call_function(ipi_flush_tlb_mm, mm, 1);
 
 	preempt_enable();
 }
@@ -720,9 +715,7 @@ flush_tlb_page(struct vm_area_struct *vma, unsigned long addr)
 	data.mm = mm;
 	data.addr = addr;
 
-	if (smp_call_function(ipi_flush_tlb_page, &data, 1)) {
-		printk(KERN_CRIT "flush_tlb_page: timed out\n");
-	}
+	smp_call_function(ipi_flush_tlb_page, &data, 1);
 
 	preempt_enable();
 }
@@ -772,9 +765,7 @@ flush_icache_user_range(struct vm_area_struct *vma, struct page *page,
 		}
 	}
 
-	if (smp_call_function(ipi_flush_icache_page, mm, 1)) {
-		printk(KERN_CRIT "flush_icache_page: timed out\n");
-	}
+	smp_call_function(ipi_flush_icache_page, mm, 1);
 
 	preempt_enable();
 }
diff --git a/arch/alpha/oprofile/common.c b/arch/alpha/oprofile/common.c
index 310a4ce1dccc..1b1259c7d7d1 100644
--- a/arch/alpha/oprofile/common.c
+++ b/arch/alpha/oprofile/common.c
@@ -65,7 +65,7 @@ op_axp_setup(void)
 	model->reg_setup(&reg, ctr, &sys);
 
 	/* Configure the registers on all cpus.  */
-	(void)smp_call_function(model->cpu_setup, &reg, 1);
+	smp_call_function(model->cpu_setup, &reg, 1);
 	model->cpu_setup(&reg);
 	return 0;
 }
@@ -86,7 +86,7 @@ op_axp_cpu_start(void *dummy)
 static int
 op_axp_start(void)
 {
-	(void)smp_call_function(op_axp_cpu_start, NULL, 1);
+	smp_call_function(op_axp_cpu_start, NULL, 1);
 	op_axp_cpu_start(NULL);
 	return 0;
 }
@@ -101,7 +101,7 @@ op_axp_cpu_stop(void *dummy)
 static void
 op_axp_stop(void)
 {
-	(void)smp_call_function(op_axp_cpu_stop, NULL, 1);
+	smp_call_function(op_axp_cpu_stop, NULL, 1);
 	op_axp_cpu_stop(NULL);
 }
 
diff --git a/arch/arm/common/bL_switcher.c b/arch/arm/common/bL_switcher.c
index 57f3b7512636..17bc259729e2 100644
--- a/arch/arm/common/bL_switcher.c
+++ b/arch/arm/common/bL_switcher.c
@@ -542,16 +542,14 @@ static void bL_switcher_trace_trigger_cpu(void *__always_unused info)
 
 int bL_switcher_trace_trigger(void)
 {
-	int ret;
-
 	preempt_disable();
 
 	bL_switcher_trace_trigger_cpu(NULL);
-	ret = smp_call_function(bL_switcher_trace_trigger_cpu, NULL, true);
+	smp_call_function(bL_switcher_trace_trigger_cpu, NULL, true);
 
 	preempt_enable();
 
-	return ret;
+	return 0;
 }
 EXPORT_SYMBOL_GPL(bL_switcher_trace_trigger);
 
diff --git a/arch/ia64/kernel/perfmon.c b/arch/ia64/kernel/perfmon.c
index 58a6337c0690..7c52bd2695a2 100644
--- a/arch/ia64/kernel/perfmon.c
+++ b/arch/ia64/kernel/perfmon.c
@@ -6390,11 +6390,7 @@ pfm_install_alt_pmu_interrupt(pfm_intr_handler_desc_t *hdl)
 	}
 
 	/* save the current system wide pmu states */
-	ret = on_each_cpu(pfm_alt_save_pmu_state, NULL, 1);
-	if (ret) {
-		DPRINT(("on_each_cpu() failed: %d\n", ret));
-		goto cleanup_reserve;
-	}
+	on_each_cpu(pfm_alt_save_pmu_state, NULL, 1);
 
 	/* officially change to the alternate interrupt handler */
 	pfm_alt_intr_handler = hdl;
@@ -6421,7 +6417,6 @@ int
 pfm_remove_alt_pmu_interrupt(pfm_intr_handler_desc_t *hdl)
 {
 	int i;
-	int ret;
 
 	if (hdl == NULL) return -EINVAL;
 
@@ -6435,10 +6430,7 @@ pfm_remove_alt_pmu_interrupt(pfm_intr_handler_desc_t *hdl)
 
 	pfm_alt_intr_handler = NULL;
 
-	ret = on_each_cpu(pfm_alt_restore_pmu_state, NULL, 1);
-	if (ret) {
-		DPRINT(("on_each_cpu() failed: %d\n", ret));
-	}
+	on_each_cpu(pfm_alt_restore_pmu_state, NULL, 1);
 
 	for_each_online_cpu(i) {
 		pfm_unreserve_session(NULL, 1, i);
diff --git a/arch/ia64/kernel/uncached.c b/arch/ia64/kernel/uncached.c
index 583f7ff6b589..c618d0745e22 100644
--- a/arch/ia64/kernel/uncached.c
+++ b/arch/ia64/kernel/uncached.c
@@ -124,8 +124,8 @@ static int uncached_add_chunk(struct uncached_pool *uc_pool, int nid)
 	status = ia64_pal_prefetch_visibility(PAL_VISIBILITY_PHYSICAL);
 	if (status == PAL_VISIBILITY_OK_REMOTE_NEEDED) {
 		atomic_set(&uc_pool->status, 0);
-		status = smp_call_function(uncached_ipi_visibility, uc_pool, 1);
-		if (status || atomic_read(&uc_pool->status))
+		smp_call_function(uncached_ipi_visibility, uc_pool, 1);
+		if (atomic_read(&uc_pool->status))
 			goto failed;
 	} else if (status != PAL_VISIBILITY_OK)
 		goto failed;
@@ -146,8 +146,8 @@ static int uncached_add_chunk(struct uncached_pool *uc_pool, int nid)
 	if (status != PAL_STATUS_SUCCESS)
 		goto failed;
 	atomic_set(&uc_pool->status, 0);
-	status = smp_call_function(uncached_ipi_mc_drain, uc_pool, 1);
-	if (status || atomic_read(&uc_pool->status))
+	smp_call_function(uncached_ipi_mc_drain, uc_pool, 1);
+	if (atomic_read(&uc_pool->status))
 		goto failed;
 
 	/*
diff --git a/arch/powerpc/kernel/rtas.c b/arch/powerpc/kernel/rtas.c
index fbc676160adf..64d95eb6ffff 100644
--- a/arch/powerpc/kernel/rtas.c
+++ b/arch/powerpc/kernel/rtas.c
@@ -994,8 +994,7 @@ int rtas_ibm_suspend_me(u64 handle)
 	/* Call function on all CPUs.  One of us will make the
 	 * rtas call
 	 */
-	if (on_each_cpu(rtas_percpu_suspend_me, &data, 0))
-		atomic_set(&data.error, -EINVAL);
+	on_each_cpu(rtas_percpu_suspend_me, &data, 0);
 
 	wait_for_completion(&done);
 
diff --git a/arch/x86/lib/cache-smp.c b/arch/x86/lib/cache-smp.c
index 1811fa4a1b1a..7c48ff4ae8d1 100644
--- a/arch/x86/lib/cache-smp.c
+++ b/arch/x86/lib/cache-smp.c
@@ -15,6 +15,7 @@ EXPORT_SYMBOL(wbinvd_on_cpu);
 
 int wbinvd_on_all_cpus(void)
 {
-	return on_each_cpu(__wbinvd, NULL, 1);
+	on_each_cpu(__wbinvd, NULL, 1);
+	return 0;
 }
 EXPORT_SYMBOL(wbinvd_on_all_cpus);
diff --git a/drivers/char/agp/generic.c b/drivers/char/agp/generic.c
index 658664a5a5aa..df1edb5ec0ad 100644
--- a/drivers/char/agp/generic.c
+++ b/drivers/char/agp/generic.c
@@ -1311,8 +1311,7 @@ static void ipi_handler(void *null)
 
 void global_cache_flush(void)
 {
-	if (on_each_cpu(ipi_handler, NULL, 1) != 0)
-		panic(PFX "timed out waiting for the other CPUs!\n");
+	on_each_cpu(ipi_handler, NULL, 1);
 }
 EXPORT_SYMBOL(global_cache_flush);
 
diff --git a/include/linux/smp.h b/include/linux/smp.h
index a56f08ff3097..bb8b451ab01f 100644
--- a/include/linux/smp.h
+++ b/include/linux/smp.h
@@ -35,7 +35,7 @@ int smp_call_function_single(int cpuid, smp_call_func_t func, void *info,
 /*
  * Call a function on all processors
  */
-int on_each_cpu(smp_call_func_t func, void *info, int wait);
+void on_each_cpu(smp_call_func_t func, void *info, int wait);
 
 /*
  * Call a function on processors specified by mask, which might include
@@ -101,7 +101,7 @@ extern void smp_cpus_done(unsigned int max_cpus);
 /*
  * Call a function on all other processors
  */
-int smp_call_function(smp_call_func_t func, void *info, int wait);
+void smp_call_function(smp_call_func_t func, void *info, int wait);
 void smp_call_function_many(const struct cpumask *mask,
 			    smp_call_func_t func, void *info, bool wait);
 
@@ -144,9 +144,8 @@ static inline void smp_send_stop(void) { }
  *	These macros fold the SMP functionality into a single CPU system
  */
 #define raw_smp_processor_id()			0
-static inline int up_smp_call_function(smp_call_func_t func, void *info)
+static inline void up_smp_call_function(smp_call_func_t func, void *info)
 {
-	return 0;
 }
 #define smp_call_function(func, info, wait) \
 			(up_smp_call_function(func, info))
diff --git a/kernel/smp.c b/kernel/smp.c
index 220ad142f5dd..616d4d114847 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -487,13 +487,11 @@ EXPORT_SYMBOL(smp_call_function_many);
  * You must not call this function with disabled interrupts or from a
  * hardware interrupt handler or from a bottom half handler.
  */
-int smp_call_function(smp_call_func_t func, void *info, int wait)
+void smp_call_function(smp_call_func_t func, void *info, int wait)
 {
 	preempt_disable();
 	smp_call_function_many(cpu_online_mask, func, info, wait);
 	preempt_enable();
-
-	return 0;
 }
 EXPORT_SYMBOL(smp_call_function);
 
@@ -594,18 +592,16 @@ void __init smp_init(void)
  * early_boot_irqs_disabled is set.  Use local_irq_save/restore() instead
  * of local_irq_disable/enable().
  */
-int on_each_cpu(void (*func) (void *info), void *info, int wait)
+void on_each_cpu(void (*func) (void *info), void *info, int wait)
 {
 	unsigned long flags;
-	int ret = 0;
 
 	preempt_disable();
-	ret = smp_call_function(func, info, wait);
+	smp_call_function(func, info, wait);
 	local_irq_save(flags);
 	func(info);
 	local_irq_restore(flags);
 	preempt_enable();
-	return ret;
 }
 EXPORT_SYMBOL(on_each_cpu);
 
diff --git a/kernel/up.c b/kernel/up.c
index 483c9962c999..862b460ab97a 100644
--- a/kernel/up.c
+++ b/kernel/up.c
@@ -35,14 +35,13 @@ int smp_call_function_single_async(int cpu, call_single_data_t *csd)
 }
 EXPORT_SYMBOL(smp_call_function_single_async);
 
-int on_each_cpu(smp_call_func_t func, void *info, int wait)
+void on_each_cpu(smp_call_func_t func, void *info, int wait)
 {
 	unsigned long flags;
 
 	local_irq_save(flags);
 	func(info);
 	local_irq_restore(flags);
-	return 0;
 }
 EXPORT_SYMBOL(on_each_cpu);
 
-- 
cgit v1.2.3


From e67d4dfc9ff19dbe74b29617cf2592ccc50c3920 Mon Sep 17 00:00:00 2001
From: Andrey Smirnov <andrew.smirnov@gmail.com>
Date: Wed, 12 Jun 2019 01:44:04 -0700
Subject: power: supply: Add HWMON compatibility layer

Add code implementing a HWMON adapter/compatibility layer to allow
exposing various sensors present on power supply devices via the HWMON
subsystem. This is done in order to allow userspace to use a single
ABI/library (libsensors) to access/manipulate all of the sensors of the
system.

Signed-off-by: Andrey Smirnov <andrew.smirnov@gmail.com>
Reviewed-by: Guenter Roeck <linux@roeck-us.net>
Tested-by: Chris Healy <cphealy@gmail.com>
Cc: Chris Healy <cphealy@gmail.com>
Cc: Cory Tusar <cory.tusar@zii.aero>
Cc: Lucas Stach <l.stach@pengutronix.de>
Cc: Fabio Estevam <fabio.estevam@nxp.com>
Cc: Guenter Roeck <linux@roeck-us.net>
Cc: Sebastian Reichel <sre@kernel.org>
Cc: linux-kernel@vger.kernel.org
Cc: linux-pm@vger.kernel.org
Signed-off-by: Sebastian Reichel <sebastian.reichel@collabora.com>
---
 drivers/power/supply/Kconfig              |  14 ++
 drivers/power/supply/Makefile             |   1 +
 drivers/power/supply/power_supply_core.c  |   7 +
 drivers/power/supply/power_supply_hwmon.c | 355 ++++++++++++++++++++++++++++++
 include/linux/power_supply.h              |  13 ++
 5 files changed, 390 insertions(+)
 create mode 100644 drivers/power/supply/power_supply_hwmon.c

(limited to 'include/linux')

diff --git a/drivers/power/supply/Kconfig b/drivers/power/supply/Kconfig
index 26dacdab03cc..1f2252cb95fd 100644
--- a/drivers/power/supply/Kconfig
+++ b/drivers/power/supply/Kconfig
@@ -14,6 +14,20 @@ config POWER_SUPPLY_DEBUG
 	  Say Y here to enable debugging messages for power supply class
 	  and drivers.
 
+config POWER_SUPPLY_HWMON
+	bool
+	prompt "Expose power supply sensors as hwmon device"
+	depends on HWMON=y || HWMON=POWER_SUPPLY
+	default y
+	help
+	  This options enables API that allows sensors found on a
+	  power supply device (current, voltage, temperature) to be
+	  exposed as a hwmon device.
+
+	  Say 'Y' here if you want power supplies to
+	  have hwmon sysfs interface too.
+
+
 config PDA_POWER
 	tristate "Generic PDA/phone power driver"
 	depends on !S390
diff --git a/drivers/power/supply/Makefile b/drivers/power/supply/Makefile
index f208273f9686..c47e88ba16b9 100644
--- a/drivers/power/supply/Makefile
+++ b/drivers/power/supply/Makefile
@@ -6,6 +6,7 @@ power_supply-$(CONFIG_SYSFS)		+= power_supply_sysfs.o
 power_supply-$(CONFIG_LEDS_TRIGGERS)	+= power_supply_leds.o
 
 obj-$(CONFIG_POWER_SUPPLY)	+= power_supply.o
+obj-$(CONFIG_POWER_SUPPLY_HWMON) += power_supply_hwmon.o
 obj-$(CONFIG_GENERIC_ADC_BATTERY)	+= generic-adc-battery.o
 
 obj-$(CONFIG_PDA_POWER)		+= pda_power.o
diff --git a/drivers/power/supply/power_supply_core.c b/drivers/power/supply/power_supply_core.c
index f7033ecf6d0b..35624193a346 100644
--- a/drivers/power/supply/power_supply_core.c
+++ b/drivers/power/supply/power_supply_core.c
@@ -1072,6 +1072,10 @@ __power_supply_register(struct device *parent,
 	if (rc)
 		goto create_triggers_failed;
 
+	rc = power_supply_add_hwmon_sysfs(psy);
+	if (rc)
+		goto add_hwmon_sysfs_failed;
+
 	/*
 	 * Update use_cnt after any uevents (most notably from device_add()).
 	 * We are here still during driver's probe but
@@ -1090,6 +1094,8 @@ __power_supply_register(struct device *parent,
 
 	return psy;
 
+add_hwmon_sysfs_failed:
+	power_supply_remove_triggers(psy);
 create_triggers_failed:
 	psy_unregister_cooler(psy);
 register_cooler_failed:
@@ -1242,6 +1248,7 @@ void power_supply_unregister(struct power_supply *psy)
 	cancel_work_sync(&psy->changed_work);
 	cancel_delayed_work_sync(&psy->deferred_register_work);
 	sysfs_remove_link(&psy->dev.kobj, "powers");
+	power_supply_remove_hwmon_sysfs(psy);
 	power_supply_remove_triggers(psy);
 	psy_unregister_cooler(psy);
 	psy_unregister_thermal(psy);
diff --git a/drivers/power/supply/power_supply_hwmon.c b/drivers/power/supply/power_supply_hwmon.c
new file mode 100644
index 000000000000..51fe60440d12
--- /dev/null
+++ b/drivers/power/supply/power_supply_hwmon.c
@@ -0,0 +1,355 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ *  power_supply_hwmon.c - power supply hwmon support.
+ */
+
+#include <linux/err.h>
+#include <linux/hwmon.h>
+#include <linux/power_supply.h>
+#include <linux/slab.h>
+
+struct power_supply_hwmon {
+	struct power_supply *psy;
+	unsigned long *props;
+};
+
+static int power_supply_hwmon_in_to_property(u32 attr)
+{
+	switch (attr) {
+	case hwmon_in_average:
+		return POWER_SUPPLY_PROP_VOLTAGE_AVG;
+	case hwmon_in_min:
+		return POWER_SUPPLY_PROP_VOLTAGE_MIN;
+	case hwmon_in_max:
+		return POWER_SUPPLY_PROP_VOLTAGE_MAX;
+	case hwmon_in_input:
+		return POWER_SUPPLY_PROP_VOLTAGE_NOW;
+	default:
+		return -EINVAL;
+	}
+}
+
+static int power_supply_hwmon_curr_to_property(u32 attr)
+{
+	switch (attr) {
+	case hwmon_curr_average:
+		return POWER_SUPPLY_PROP_CURRENT_AVG;
+	case hwmon_curr_max:
+		return POWER_SUPPLY_PROP_CURRENT_MAX;
+	case hwmon_curr_input:
+		return POWER_SUPPLY_PROP_CURRENT_NOW;
+	default:
+		return -EINVAL;
+	}
+}
+
+static int power_supply_hwmon_temp_to_property(u32 attr, int channel)
+{
+	if (channel) {
+		switch (attr) {
+		case hwmon_temp_input:
+			return POWER_SUPPLY_PROP_TEMP_AMBIENT;
+		case hwmon_temp_min_alarm:
+			return POWER_SUPPLY_PROP_TEMP_AMBIENT_ALERT_MIN;
+		case hwmon_temp_max_alarm:
+			return POWER_SUPPLY_PROP_TEMP_AMBIENT_ALERT_MAX;
+		default:
+			break;
+		}
+	} else {
+		switch (attr) {
+		case hwmon_temp_input:
+			return POWER_SUPPLY_PROP_TEMP;
+		case hwmon_temp_max:
+			return POWER_SUPPLY_PROP_TEMP_MAX;
+		case hwmon_temp_min:
+			return POWER_SUPPLY_PROP_TEMP_MIN;
+		case hwmon_temp_min_alarm:
+			return POWER_SUPPLY_PROP_TEMP_ALERT_MIN;
+		case hwmon_temp_max_alarm:
+			return POWER_SUPPLY_PROP_TEMP_ALERT_MAX;
+		default:
+			break;
+		}
+	}
+
+	return -EINVAL;
+}
+
+static int
+power_supply_hwmon_to_property(enum hwmon_sensor_types type,
+			       u32 attr, int channel)
+{
+	switch (type) {
+	case hwmon_in:
+		return power_supply_hwmon_in_to_property(attr);
+	case hwmon_curr:
+		return power_supply_hwmon_curr_to_property(attr);
+	case hwmon_temp:
+		return power_supply_hwmon_temp_to_property(attr, channel);
+	default:
+		return -EINVAL;
+	}
+}
+
+static bool power_supply_hwmon_is_a_label(enum hwmon_sensor_types type,
+					   u32 attr)
+{
+	return type == hwmon_temp && attr == hwmon_temp_label;
+}
+
+static bool power_supply_hwmon_is_writable(enum hwmon_sensor_types type,
+					   u32 attr)
+{
+	switch (type) {
+	case hwmon_in:
+		return attr == hwmon_in_min ||
+		       attr == hwmon_in_max;
+	case hwmon_curr:
+		return attr == hwmon_curr_max;
+	case hwmon_temp:
+		return attr == hwmon_temp_max ||
+		       attr == hwmon_temp_min ||
+		       attr == hwmon_temp_min_alarm ||
+		       attr == hwmon_temp_max_alarm;
+	default:
+		return false;
+	}
+}
+
+static umode_t power_supply_hwmon_is_visible(const void *data,
+					     enum hwmon_sensor_types type,
+					     u32 attr, int channel)
+{
+	const struct power_supply_hwmon *psyhw = data;
+	int prop;
+
+
+	if (power_supply_hwmon_is_a_label(type, attr))
+		return 0444;
+
+	prop = power_supply_hwmon_to_property(type, attr, channel);
+	if (prop < 0 || !test_bit(prop, psyhw->props))
+		return 0;
+
+	if (power_supply_property_is_writeable(psyhw->psy, prop) > 0 &&
+	    power_supply_hwmon_is_writable(type, attr))
+		return 0644;
+
+	return 0444;
+}
+
+static int power_supply_hwmon_read_string(struct device *dev,
+					  enum hwmon_sensor_types type,
+					  u32 attr, int channel,
+					  const char **str)
+{
+	*str = channel ? "temp ambient" : "temp"; /* non-zero channel maps to TEMP_AMBIENT */
+	return 0;
+}
+
+static int
+power_supply_hwmon_read(struct device *dev, enum hwmon_sensor_types type,
+			u32 attr, int channel, long *val)
+{
+	struct power_supply_hwmon *psyhw = dev_get_drvdata(dev);
+	struct power_supply *psy = psyhw->psy;
+	union power_supply_propval pspval;
+	int ret, prop;
+
+	prop = power_supply_hwmon_to_property(type, attr, channel);
+	if (prop < 0)
+		return prop;
+
+	ret  = power_supply_get_property(psy, prop, &pspval);
+	if (ret)
+		return ret;
+
+	switch (type) {
+	/*
+	 * Both voltage and current is reported in units of
+	 * microvolts/microamps, so we need to adjust it to
+	 * milliamps(volts)
+	 */
+	case hwmon_curr:
+	case hwmon_in:
+		pspval.intval = DIV_ROUND_CLOSEST(pspval.intval, 1000);
+		break;
+	/*
+	 * Temp needs to be converted from 1/10 C to milli-C
+	 */
+	case hwmon_temp:
+		if (check_mul_overflow(pspval.intval, 100,
+				       &pspval.intval))
+			return -EOVERFLOW;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	*val = pspval.intval;
+
+	return 0;
+}
+
+static int
+power_supply_hwmon_write(struct device *dev, enum hwmon_sensor_types type,
+			 u32 attr, int channel, long val)
+{
+	struct power_supply_hwmon *psyhw = dev_get_drvdata(dev);
+	struct power_supply *psy = psyhw->psy;
+	union power_supply_propval pspval;
+	int prop;
+
+	prop = power_supply_hwmon_to_property(type, attr, channel);
+	if (prop < 0)
+		return prop;
+
+	pspval.intval = val;
+
+	switch (type) {
+	/*
+	 * hwmon hands us voltage and current in millivolts/milliamps,
+	 * while the power supply property is kept in
+	 * microvolts/microamps, so scale the value up by 1000
+	 */
+	case hwmon_curr:
+	case hwmon_in:
+		if (check_mul_overflow(pspval.intval, 1000,
+				       &pspval.intval))
+			return -EOVERFLOW;
+		break;
+	/*
+	 * Temp needs to be converted from milli-C to 1/10 C
+	 */
+	case hwmon_temp:
+		pspval.intval = DIV_ROUND_CLOSEST(pspval.intval, 100);
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	return power_supply_set_property(psy, prop, &pspval);
+}
+
+static const struct hwmon_ops power_supply_hwmon_ops = {
+	.is_visible	= power_supply_hwmon_is_visible,
+	.read		= power_supply_hwmon_read,
+	.write		= power_supply_hwmon_write,
+	.read_string	= power_supply_hwmon_read_string,
+};
+
+static const struct hwmon_channel_info *power_supply_hwmon_info[] = {
+	HWMON_CHANNEL_INFO(temp,	/* channel 0: POWER_SUPPLY_PROP_TEMP* */
+			   HWMON_T_LABEL     |
+			   HWMON_T_INPUT     |
+			   HWMON_T_MAX       |
+			   HWMON_T_MIN       |
+			   HWMON_T_MIN_ALARM |
+			   HWMON_T_MAX_ALARM,
+			   /* channel 1: POWER_SUPPLY_PROP_TEMP_AMBIENT* */
+			   HWMON_T_LABEL     |
+			   HWMON_T_INPUT     |
+			   HWMON_T_MIN_ALARM |
+			   HWMON_T_LABEL     |
+			   HWMON_T_MAX_ALARM),
+
+	HWMON_CHANNEL_INFO(curr,
+			   HWMON_C_AVERAGE |
+			   HWMON_C_MAX     |
+			   HWMON_C_INPUT),
+
+	HWMON_CHANNEL_INFO(in,
+			   HWMON_I_AVERAGE |
+			   HWMON_I_MIN     |
+			   HWMON_I_MAX     |
+			   HWMON_I_INPUT),
+	NULL
+};
+
+static const struct hwmon_chip_info power_supply_hwmon_chip_info = {
+	.ops = &power_supply_hwmon_ops,
+	.info = power_supply_hwmon_info,
+};
+
+static void power_supply_hwmon_bitmap_free(void *data)
+{
+	bitmap_free(data);
+}
+
+int power_supply_add_hwmon_sysfs(struct power_supply *psy)
+{
+	const struct power_supply_desc *desc = psy->desc;
+	struct power_supply_hwmon *psyhw;
+	struct device *dev = &psy->dev;
+	struct device *hwmon;
+	int ret, i;
+
+	if (!devres_open_group(dev, power_supply_add_hwmon_sysfs,
+			       GFP_KERNEL))
+		return -ENOMEM;
+
+	psyhw = devm_kzalloc(dev, sizeof(*psyhw), GFP_KERNEL);
+	if (!psyhw) {
+		ret = -ENOMEM;
+		goto error;
+	}
+
+	psyhw->psy = psy;
+	psyhw->props = bitmap_zalloc(POWER_SUPPLY_PROP_TIME_TO_FULL_AVG + 1,
+				     GFP_KERNEL);
+	if (!psyhw->props) {
+		ret = -ENOMEM;
+		goto error;
+	}
+
+	ret = devm_add_action_or_reset(dev, power_supply_hwmon_bitmap_free,
+				       psyhw->props);
+	if (ret)	/* _or_reset already freed the bitmap, nothing leaks */
+		goto error;
+
+	for (i = 0; i < desc->num_properties; i++) {
+		const enum power_supply_property prop = desc->properties[i];
+
+		switch (prop) {
+		case POWER_SUPPLY_PROP_CURRENT_AVG:
+		case POWER_SUPPLY_PROP_CURRENT_MAX:
+		case POWER_SUPPLY_PROP_CURRENT_NOW:
+		case POWER_SUPPLY_PROP_TEMP:
+		case POWER_SUPPLY_PROP_TEMP_MAX:
+		case POWER_SUPPLY_PROP_TEMP_MIN:
+		case POWER_SUPPLY_PROP_TEMP_ALERT_MIN:
+		case POWER_SUPPLY_PROP_TEMP_ALERT_MAX:
+		case POWER_SUPPLY_PROP_TEMP_AMBIENT:
+		case POWER_SUPPLY_PROP_TEMP_AMBIENT_ALERT_MIN:
+		case POWER_SUPPLY_PROP_TEMP_AMBIENT_ALERT_MAX:
+		case POWER_SUPPLY_PROP_VOLTAGE_AVG:
+		case POWER_SUPPLY_PROP_VOLTAGE_MIN:
+		case POWER_SUPPLY_PROP_VOLTAGE_MAX:
+		case POWER_SUPPLY_PROP_VOLTAGE_NOW:
+			set_bit(prop, psyhw->props); /* consulted by is_visible() */
+			break;
+		default:
+			break;
+		}
+	}
+
+	hwmon = devm_hwmon_device_register_with_info(dev, psy->desc->name,
+						psyhw,
+						&power_supply_hwmon_chip_info,
+						NULL);
+	ret = PTR_ERR_OR_ZERO(hwmon);
+	if (ret)
+		goto error;
+
+	devres_close_group(dev, power_supply_add_hwmon_sysfs);
+	return 0;
+error:
+	devres_release_group(dev, NULL);
+	return ret;
+}
+
+void power_supply_remove_hwmon_sysfs(struct power_supply *psy)
+{
+	devres_release_group(&psy->dev, power_supply_add_hwmon_sysfs);
+}
diff --git a/include/linux/power_supply.h b/include/linux/power_supply.h
index d9c0c094f8a0..d5b15e039f4f 100644
--- a/include/linux/power_supply.h
+++ b/include/linux/power_supply.h
@@ -481,4 +481,17 @@ static inline bool power_supply_is_watt_property(enum power_supply_property psp)
 	return 0;
 }
 
+#ifdef CONFIG_POWER_SUPPLY_HWMON
+int power_supply_add_hwmon_sysfs(struct power_supply *psy);
+void power_supply_remove_hwmon_sysfs(struct power_supply *psy);
+#else
+static inline int power_supply_add_hwmon_sysfs(struct power_supply *psy)
+{
+	return 0;
+}
+
+static inline
+void power_supply_remove_hwmon_sysfs(struct power_supply *psy) {}
+#endif
+
 #endif /* __LINUX_POWER_SUPPLY_H__ */
-- 
cgit v1.2.3


From 06d2bfedd147d26af6908e4202466586133e73a7 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 13 Jun 2019 09:08:52 +0200
Subject: binfmt_flat: remove the uapi <linux/flat.h> header

The split between the two flat.h files is completely arbitrary, and the
uapi version even contains CONFIG_ ifdefs that can't work in userspace.
The only userspace program known to use the header is elf2flt, and it
ships with its own version of the combined header.

Use the chance to move the <asm/flat.h> inclusion out of this file, as it
is in no way needed for the format definition, but just for the binfmt
implementation.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Tested-by: Vladimir Murzin <vladimir.murzin@arm.com>
Reviewed-by: Vladimir Murzin <vladimir.murzin@arm.com>
Signed-off-by: Greg Ungerer <gerg@linux-m68k.org>
---
 fs/binfmt_flat.c          |  1 +
 include/linux/flat.h      | 45 ++++++++++++++++++++++++++++++++----
 include/uapi/linux/flat.h | 59 -----------------------------------------------
 3 files changed, 42 insertions(+), 63 deletions(-)
 delete mode 100644 include/uapi/linux/flat.h

(limited to 'include/linux')

diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c
index a15fdd5d95ed..b63c5e63ae3f 100644
--- a/fs/binfmt_flat.c
+++ b/fs/binfmt_flat.c
@@ -42,6 +42,7 @@
 #include <asm/unaligned.h>
 #include <asm/cacheflush.h>
 #include <asm/page.h>
+#include <asm/flat.h>
 
 #ifndef flat_get_relocate_addr
 #define flat_get_relocate_addr(rel)	(rel)
diff --git a/include/linux/flat.h b/include/linux/flat.h
index 569b67d64d5c..21d901ba191b 100644
--- a/include/linux/flat.h
+++ b/include/linux/flat.h
@@ -10,8 +10,47 @@
 #ifndef _LINUX_FLAT_H
 #define _LINUX_FLAT_H
 
-#include <uapi/linux/flat.h>
-#include <asm/flat.h>
+#define	FLAT_VERSION			0x00000004L
+
+#ifdef CONFIG_BINFMT_SHARED_FLAT
+#define	MAX_SHARED_LIBS			(4)
+#else
+#define	MAX_SHARED_LIBS			(1)
+#endif
+
+/*
+ * To make everything easier to port and manage cross platform
+ * development,  all fields are in network byte order.
+ */
+
+struct flat_hdr {
+	char magic[4];
+	unsigned long rev;          /* version (as above) */
+	unsigned long entry;        /* Offset of first executable instruction
+	                               with text segment from beginning of file */
+	unsigned long data_start;   /* Offset of data segment from beginning of
+	                               file */
+	unsigned long data_end;     /* Offset of end of data segment
+	                               from beginning of file */
+	unsigned long bss_end;      /* Offset of end of bss segment from beginning
+	                               of file */
+
+	/* (It is assumed that data_end through bss_end forms the bss segment.) */
+
+	unsigned long stack_size;   /* Size of stack, in bytes */
+	unsigned long reloc_start;  /* Offset of relocation records from
+	                               beginning of file */
+	unsigned long reloc_count;  /* Number of relocation records */
+	unsigned long flags;
+	unsigned long build_date;   /* When the program/library was built */
+	unsigned long filler[5];    /* Reservered, set to zero */
+};
+
+#define FLAT_FLAG_RAM    0x0001 /* load program entirely into RAM */
+#define FLAT_FLAG_GOTPIC 0x0002 /* program is PIC with GOT */
+#define FLAT_FLAG_GZIP   0x0004 /* all but the header is compressed */
+#define FLAT_FLAG_GZDATA 0x0008 /* only data/relocs are compressed (for XIP) */
+#define FLAT_FLAG_KTRACE 0x0010 /* output useful kernel trace for debugging */
 
 /*
  * While it would be nice to keep this header clean,  users of older
@@ -22,8 +61,6 @@
  *        with the format above,  except to fix bugs with old format support.
  */
 
-#include <asm/byteorder.h>
-
 #define	OLD_FLAT_VERSION			0x00000002L
 #define OLD_FLAT_RELOC_TYPE_TEXT	0
 #define OLD_FLAT_RELOC_TYPE_DATA	1
diff --git a/include/uapi/linux/flat.h b/include/uapi/linux/flat.h
deleted file mode 100644
index 27e595e44fb7..000000000000
--- a/include/uapi/linux/flat.h
+++ /dev/null
@@ -1,59 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
-/*
- * Copyright (C) 2002-2003  David McCullough <davidm@snapgear.com>
- * Copyright (C) 1998       Kenneth Albanowski <kjahds@kjahds.com>
- *                          The Silver Hammer Group, Ltd.
- *
- * This file provides the definitions and structures needed to
- * support uClinux flat-format executables.
- */
-
-#ifndef _UAPI_LINUX_FLAT_H
-#define _UAPI_LINUX_FLAT_H
-
-
-#define	FLAT_VERSION			0x00000004L
-
-#ifdef CONFIG_BINFMT_SHARED_FLAT
-#define	MAX_SHARED_LIBS			(4)
-#else
-#define	MAX_SHARED_LIBS			(1)
-#endif
-
-/*
- * To make everything easier to port and manage cross platform
- * development,  all fields are in network byte order.
- */
-
-struct flat_hdr {
-	char magic[4];
-	unsigned long rev;          /* version (as above) */
-	unsigned long entry;        /* Offset of first executable instruction
-	                               with text segment from beginning of file */
-	unsigned long data_start;   /* Offset of data segment from beginning of
-	                               file */
-	unsigned long data_end;     /* Offset of end of data segment
-	                               from beginning of file */
-	unsigned long bss_end;      /* Offset of end of bss segment from beginning
-	                               of file */
-
-	/* (It is assumed that data_end through bss_end forms the bss segment.) */
-
-	unsigned long stack_size;   /* Size of stack, in bytes */
-	unsigned long reloc_start;  /* Offset of relocation records from
-	                               beginning of file */
-	unsigned long reloc_count;  /* Number of relocation records */
-	unsigned long flags;       
-	unsigned long build_date;   /* When the program/library was built */
-	unsigned long filler[5];    /* Reservered, set to zero */
-};
-
-#define FLAT_FLAG_RAM    0x0001 /* load program entirely into RAM */
-#define FLAT_FLAG_GOTPIC 0x0002 /* program is PIC with GOT */
-#define FLAT_FLAG_GZIP   0x0004 /* all but the header is compressed */
-#define FLAT_FLAG_GZDATA 0x0008 /* only data/relocs are compressed (for XIP) */
-#define FLAT_FLAG_KTRACE 0x0010 /* output useful kernel trace for debugging */
-
-
-
-#endif /* _UAPI_LINUX_FLAT_H */
-- 
cgit v1.2.3


From 38e63483a31747ef8a964ba3f0184c1e5b507749 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 13 Jun 2019 09:08:53 +0200
Subject: binfmt_flat: remove the unused OLD_FLAT_FLAG_RAM definition

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Greg Ungerer <gerg@linux-m68k.org>
---
 include/linux/flat.h | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/flat.h b/include/linux/flat.h
index 21d901ba191b..2b7cda6e9c1b 100644
--- a/include/linux/flat.h
+++ b/include/linux/flat.h
@@ -72,15 +72,12 @@ typedef union {
 # if defined(mc68000) && !defined(CONFIG_COLDFIRE)
 		signed long offset : 30;
 		unsigned long type : 2;
-#   	define OLD_FLAT_FLAG_RAM    0x1 /* load program entirely into RAM */
 # elif defined(__BIG_ENDIAN_BITFIELD)
 		unsigned long type : 2;
 		signed long offset : 30;
-#   	define OLD_FLAT_FLAG_RAM    0x1 /* load program entirely into RAM */
 # elif defined(__LITTLE_ENDIAN_BITFIELD)
 		signed long offset : 30;
 		unsigned long type : 2;
-#   	define OLD_FLAT_FLAG_RAM    0x1 /* load program entirely into RAM */
 # else
 #   	error "Unknown bitfield order for flat files."
 # endif
-- 
cgit v1.2.3


From 3f8b76a66e0d49e3afaba595b9762c126448e783 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 13 Jun 2019 09:08:54 +0200
Subject: binfmt_flat: consolidate two version of flat_v2_reloc_t

Two branches of the ifdef maze actually have the same content, so merge
them.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Greg Ungerer <gerg@linux-m68k.org>
---
 include/linux/flat.h | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/flat.h b/include/linux/flat.h
index 2b7cda6e9c1b..19c586b74b99 100644
--- a/include/linux/flat.h
+++ b/include/linux/flat.h
@@ -69,15 +69,13 @@ struct flat_hdr {
 typedef union {
 	unsigned long	value;
 	struct {
-# if defined(mc68000) && !defined(CONFIG_COLDFIRE)
+#if defined(__LITTLE_ENDIAN_BITFIELD) || \
+    (defined(mc68000) && !defined(CONFIG_COLDFIRE))
 		signed long offset : 30;
 		unsigned long type : 2;
 # elif defined(__BIG_ENDIAN_BITFIELD)
 		unsigned long type : 2;
 		signed long offset : 30;
-# elif defined(__LITTLE_ENDIAN_BITFIELD)
-		signed long offset : 30;
-		unsigned long type : 2;
 # else
 #   	error "Unknown bitfield order for flat files."
 # endif
-- 
cgit v1.2.3


From 34b4664ac4824d6c7a8b29db24b18733df07b2f4 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 13 Jun 2019 09:08:55 +0200
Subject: binfmt_flat: use fixed size type for the on-disk format

So far binfmt_flat has only been supported on 32-bit platforms, so the
variable size of the fields didn't matter.  But the upcoming RISC-V
nommu port supports 64-bit CPUs, and we now have a conflict between
the elf2flt creation tool that always uses 32-bit fields and the kernel
that uses (unsigned) long field.  Switch to the userspace view as the
rest of the binfmt_flat format is completely architecture neutral,
and binfmt_flat isn't the right binary format for huge executables to
start with.

While we're at it also ensure these fields are using __be types as
they are big endian and are byte swapped when loaded.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Vladimir Murzin <vladimir.murzin@arm.com>
Signed-off-by: Greg Ungerer <gerg@linux-m68k.org>
---
 include/linux/flat.h | 44 ++++++++++++++++++++++----------------------
 1 file changed, 22 insertions(+), 22 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/flat.h b/include/linux/flat.h
index 19c586b74b99..d586bb6e64a7 100644
--- a/include/linux/flat.h
+++ b/include/linux/flat.h
@@ -24,26 +24,26 @@
  */
 
 struct flat_hdr {
-	char magic[4];
-	unsigned long rev;          /* version (as above) */
-	unsigned long entry;        /* Offset of first executable instruction
-	                               with text segment from beginning of file */
-	unsigned long data_start;   /* Offset of data segment from beginning of
-	                               file */
-	unsigned long data_end;     /* Offset of end of data segment
-	                               from beginning of file */
-	unsigned long bss_end;      /* Offset of end of bss segment from beginning
-	                               of file */
+	char	magic[4];
+	__be32	rev;          /* version (as above) */
+	__be32	entry;        /* Offset of first executable instruction
+				 with text segment from beginning of file */
+	__be32	data_start;   /* Offset of data segment from beginning of
+				 file */
+	__be32	data_end;     /* Offset of end of data segment from beginning
+				 of file */
+	__be32	bss_end;      /* Offset of end of bss segment from beginning
+				 of file */
 
 	/* (It is assumed that data_end through bss_end forms the bss segment.) */
 
-	unsigned long stack_size;   /* Size of stack, in bytes */
-	unsigned long reloc_start;  /* Offset of relocation records from
-	                               beginning of file */
-	unsigned long reloc_count;  /* Number of relocation records */
-	unsigned long flags;
-	unsigned long build_date;   /* When the program/library was built */
-	unsigned long filler[5];    /* Reservered, set to zero */
+	__be32	stack_size;   /* Size of stack, in bytes */
+	__be32	reloc_start;  /* Offset of relocation records from beginning of
+				 file */
+	__be32	reloc_count;  /* Number of relocation records */
+	__be32	flags;
+	__be32	build_date;   /* When the program/library was built */
+	__u32	filler[5];    /* Reservered, set to zero */
 };
 
 #define FLAT_FLAG_RAM    0x0001 /* load program entirely into RAM */
@@ -67,15 +67,15 @@ struct flat_hdr {
 #define OLD_FLAT_RELOC_TYPE_BSS		2
 
 typedef union {
-	unsigned long	value;
+	u32		value;
 	struct {
 #if defined(__LITTLE_ENDIAN_BITFIELD) || \
     (defined(mc68000) && !defined(CONFIG_COLDFIRE))
-		signed long offset : 30;
-		unsigned long type : 2;
+		s32	offset : 30;
+		u32	type : 2;
 # elif defined(__BIG_ENDIAN_BITFIELD)
-		unsigned long type : 2;
-		signed long offset : 30;
+		u32	type : 2;
+		s32	offset : 30;
 # else
 #   	error "Unknown bitfield order for flat files."
 # endif
-- 
cgit v1.2.3


From a445d988b4790e06bb94e927e740017675d7e700 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 13 Jun 2019 09:09:01 +0200
Subject: binfmt_flat: move the MAX_SHARED_LIBS definition to binfmt_flat.c

MAX_SHARED_LIBS is an implementation detail of the kernel loader,
and should be kept away from the file format definition.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Greg Ungerer <gerg@linux-m68k.org>
---
 fs/binfmt_flat.c     | 6 ++++++
 include/linux/flat.h | 6 ------
 2 files changed, 6 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c
index 0ca65d51bb01..ccd9843e979e 100644
--- a/fs/binfmt_flat.c
+++ b/fs/binfmt_flat.c
@@ -68,6 +68,12 @@
 #define RELOC_FAILED 0xff00ff01		/* Relocation incorrect somewhere */
 #define UNLOADED_LIB 0x7ff000ff		/* Placeholder for unused library */
 
+#ifdef CONFIG_BINFMT_SHARED_FLAT
+#define	MAX_SHARED_LIBS			(4)
+#else
+#define	MAX_SHARED_LIBS			(1)
+#endif
+
 struct lib_info {
 	struct {
 		unsigned long start_code;		/* Start of text segment */
diff --git a/include/linux/flat.h b/include/linux/flat.h
index d586bb6e64a7..83977c0ce3de 100644
--- a/include/linux/flat.h
+++ b/include/linux/flat.h
@@ -12,12 +12,6 @@
 
 #define	FLAT_VERSION			0x00000004L
 
-#ifdef CONFIG_BINFMT_SHARED_FLAT
-#define	MAX_SHARED_LIBS			(4)
-#else
-#define	MAX_SHARED_LIBS			(1)
-#endif
-
 /*
  * To make everything easier to port and manage cross platform
  * development,  all fields are in network byte order.
-- 
cgit v1.2.3


From 29d14b668d2f2e7b692525ee3f69bf12b06be0f0 Mon Sep 17 00:00:00 2001
From: Suzuki K Poulose <suzuki.poulose@arm.com>
Date: Fri, 14 Jun 2019 18:53:57 +0100
Subject: mfd: Remove unused helper syscon_regmap_lookup_by_pdevname

Nobody uses the exported helper syscon_regmap_lookup_by_pdevname
to look up a device by name. Let us remove it.

Suggested-by: Arnd Bergman <arnd@arnd.de>
Cc: Arnd Bergman <arnd@arnd.de>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: "Rafael J. Wysocki" <rafael@kernel.org>
Signed-off-by: Suzuki K Poulose <suzuki.poulose@arm.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/mfd/syscon.c       | 21 ---------------------
 include/linux/mfd/syscon.h |  6 ------
 2 files changed, 27 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mfd/syscon.c b/drivers/mfd/syscon.c
index 8ce1e41d632c..b65e585fc8c6 100644
--- a/drivers/mfd/syscon.c
+++ b/drivers/mfd/syscon.c
@@ -190,27 +190,6 @@ struct regmap *syscon_regmap_lookup_by_compatible(const char *s)
 }
 EXPORT_SYMBOL_GPL(syscon_regmap_lookup_by_compatible);
 
-static int syscon_match_pdevname(struct device *dev, void *data)
-{
-	return !strcmp(dev_name(dev), (const char *)data);
-}
-
-struct regmap *syscon_regmap_lookup_by_pdevname(const char *s)
-{
-	struct device *dev;
-	struct syscon *syscon;
-
-	dev = driver_find_device(&syscon_driver.driver, NULL, (void *)s,
-				 syscon_match_pdevname);
-	if (!dev)
-		return ERR_PTR(-EPROBE_DEFER);
-
-	syscon = dev_get_drvdata(dev);
-
-	return syscon->regmap;
-}
-EXPORT_SYMBOL_GPL(syscon_regmap_lookup_by_pdevname);
-
 struct regmap *syscon_regmap_lookup_by_phandle(struct device_node *np,
 					const char *property)
 {
diff --git a/include/linux/mfd/syscon.h b/include/linux/mfd/syscon.h
index f0273c9e972b..8cfda0554381 100644
--- a/include/linux/mfd/syscon.h
+++ b/include/linux/mfd/syscon.h
@@ -19,7 +19,6 @@ struct device_node;
 #ifdef CONFIG_MFD_SYSCON
 extern struct regmap *syscon_node_to_regmap(struct device_node *np);
 extern struct regmap *syscon_regmap_lookup_by_compatible(const char *s);
-extern struct regmap *syscon_regmap_lookup_by_pdevname(const char *s);
 extern struct regmap *syscon_regmap_lookup_by_phandle(
 					struct device_node *np,
 					const char *property);
@@ -34,11 +33,6 @@ static inline struct regmap *syscon_regmap_lookup_by_compatible(const char *s)
 	return ERR_PTR(-ENOTSUPP);
 }
 
-static inline struct regmap *syscon_regmap_lookup_by_pdevname(const char *s)
-{
-	return ERR_PTR(-ENOTSUPP);
-}
-
 static inline struct regmap *syscon_regmap_lookup_by_phandle(
 					struct device_node *np,
 					const char *property)
-- 
cgit v1.2.3


From 418e3ea157efb0eb2c6dd412a8d5f052477c7f5a Mon Sep 17 00:00:00 2001
From: Suzuki K Poulose <suzuki.poulose@arm.com>
Date: Fri, 14 Jun 2019 18:53:59 +0100
Subject: bus_find_device: Unify the match callback with class_find_device

There is an arbitrary difference between the prototypes of
bus_find_device() and class_find_device() preventing their callers
from passing the same pair of data and match() arguments to both of
them, which is the const qualifier used in the prototype of
class_find_device().  If that qualifier is also used in the
bus_find_device() prototype, it will be possible to pass the same
match() callback function to both bus_find_device() and
class_find_device(), which will allow some optimizations to be made in
order to avoid code duplication going forward.  Also with that, constify
the "data" parameter as it is passed as a const to the match function.

For this reason, change the prototype of bus_find_device() to match
the prototype of class_find_device() and adjust its callers to use the
const qualifier in accordance with the new prototype of it.

Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Andrew Lunn <andrew@lunn.ch>
Cc: Andreas Noever <andreas.noever@gmail.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Bjorn Helgaas <bhelgaas@google.com>
Cc: Corey Minyard <minyard@acm.org>
Cc: Christian Borntraeger <borntraeger@de.ibm.com>
Cc: David Kershner <david.kershner@unisys.com>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: David Airlie <airlied@linux.ie>
Cc: Felipe Balbi <balbi@kernel.org>
Cc: Frank Rowand <frowand.list@gmail.com>
Cc: Grygorii Strashko <grygorii.strashko@ti.com>
Cc: Harald Freudenberger <freude@linux.ibm.com>
Cc: Hartmut Knaack <knaack.h@gmx.de>
Cc: Heiko Stuebner <heiko@sntech.de>
Cc: Jason Gunthorpe <jgg@ziepe.ca>
Cc: Jonathan Cameron <jic23@kernel.org>
Cc: "James E.J. Bottomley" <jejb@linux.ibm.com>
Cc: Len Brown <lenb@kernel.org>
Cc: Mark Brown <broonie@kernel.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Michael Jamet <michael.jamet@intel.com>
Cc: "Martin K. Petersen" <martin.petersen@oracle.com>
Cc: Peter Oberparleiter <oberpar@linux.ibm.com>
Cc: Sebastian Ott <sebott@linux.ibm.com>
Cc: Srinivas Kandagatla <srinivas.kandagatla@linaro.org>
Cc: Yehezkel Bernat <YehezkelShB@gmail.com>
Cc: rafael@kernel.org
Acked-by: Corey Minyard <minyard@acm.org>
Acked-by: David Kershner <david.kershner@unisys.com>
Acked-by: Mark Brown <broonie@kernel.org>
Acked-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Acked-by: Srinivas Kandagatla <srinivas.kandagatla@linaro.org>
Acked-by: Wolfram Sang <wsa@the-dreams.de> # for the I2C parts
Acked-by: Rob Herring <robh@kernel.org>
Signed-off-by: Suzuki K Poulose <suzuki.poulose@arm.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 arch/powerpc/platforms/pseries/ibmebus.c           | 4 ++--
 drivers/acpi/acpi_lpss.c                           | 4 ++--
 drivers/acpi/sleep.c                               | 2 +-
 drivers/acpi/utils.c                               | 4 ++--
 drivers/base/bus.c                                 | 6 +++---
 drivers/base/devcon.c                              | 2 +-
 drivers/char/ipmi/ipmi_si_platform.c               | 2 +-
 drivers/firmware/efi/dev-path-parser.c             | 4 ++--
 drivers/gpu/drm/drm_mipi_dsi.c                     | 2 +-
 drivers/hwtracing/coresight/coresight.c            | 6 +++---
 drivers/hwtracing/coresight/of_coresight.c         | 2 +-
 drivers/hwtracing/intel_th/core.c                  | 5 ++---
 drivers/i2c/i2c-core-acpi.c                        | 4 ++--
 drivers/i2c/i2c-core-of.c                          | 4 ++--
 drivers/iio/inkern.c                               | 2 +-
 drivers/infiniband/hw/hns/hns_roce_hw_v1.c         | 2 +-
 drivers/net/ethernet/hisilicon/hns/hns_dsaf_misc.c | 2 +-
 drivers/net/ethernet/ti/cpsw-phy-sel.c             | 4 ++--
 drivers/net/ethernet/ti/davinci_emac.c             | 2 +-
 drivers/net/ethernet/toshiba/tc35815.c             | 4 ++--
 drivers/nvmem/core.c                               | 2 +-
 drivers/of/of_mdio.c                               | 2 +-
 drivers/of/platform.c                              | 2 +-
 drivers/pci/probe.c                                | 2 +-
 drivers/pci/search.c                               | 4 ++--
 drivers/s390/cio/css.c                             | 4 ++--
 drivers/s390/cio/device.c                          | 4 ++--
 drivers/s390/cio/scm.c                             | 4 ++--
 drivers/s390/crypto/ap_bus.c                       | 8 ++++----
 drivers/scsi/scsi_proc.c                           | 2 +-
 drivers/spi/spi.c                                  | 4 ++--
 drivers/thunderbolt/switch.c                       | 4 ++--
 drivers/usb/core/devio.c                           | 4 ++--
 drivers/usb/core/usb.c                             | 4 ++--
 drivers/usb/phy/phy-am335x-control.c               | 4 ++--
 drivers/usb/phy/phy-isp1301.c                      | 4 ++--
 drivers/visorbus/visorbus_main.c                   | 4 ++--
 include/linux/device.h                             | 4 ++--
 sound/soc/rockchip/rk3399_gru_sound.c              | 2 +-
 39 files changed, 67 insertions(+), 68 deletions(-)

(limited to 'include/linux')

diff --git a/arch/powerpc/platforms/pseries/ibmebus.c b/arch/powerpc/platforms/pseries/ibmebus.c
index 84e8ec4011ba..b91eb0929ed1 100644
--- a/arch/powerpc/platforms/pseries/ibmebus.c
+++ b/arch/powerpc/platforms/pseries/ibmebus.c
@@ -147,13 +147,13 @@ static const struct dma_map_ops ibmebus_dma_ops = {
 	.unmap_page         = ibmebus_unmap_page,
 };
 
-static int ibmebus_match_path(struct device *dev, void *data)
+static int ibmebus_match_path(struct device *dev, const void *data)
 {
 	struct device_node *dn = to_platform_device(dev)->dev.of_node;
 	return (of_find_node_by_path(data) == dn);
 }
 
-static int ibmebus_match_node(struct device *dev, void *data)
+static int ibmebus_match_node(struct device *dev, const void *data)
 {
 	return to_platform_device(dev)->dev.of_node == data;
 }
diff --git a/drivers/acpi/acpi_lpss.c b/drivers/acpi/acpi_lpss.c
index cf768608437e..dc2ca78748a2 100644
--- a/drivers/acpi/acpi_lpss.c
+++ b/drivers/acpi/acpi_lpss.c
@@ -511,10 +511,10 @@ struct hid_uid {
 	const char *uid;
 };
 
-static int match_hid_uid(struct device *dev, void *data)
+static int match_hid_uid(struct device *dev, const void *data)
 {
 	struct acpi_device *adev = ACPI_COMPANION(dev);
-	struct hid_uid *id = data;
+	const struct hid_uid *id = data;
 
 	if (!adev)
 		return 0;
diff --git a/drivers/acpi/sleep.c b/drivers/acpi/sleep.c
index a34deccd7317..fcf4386ecc78 100644
--- a/drivers/acpi/sleep.c
+++ b/drivers/acpi/sleep.c
@@ -454,7 +454,7 @@ static int acpi_pm_prepare(void)
 	return error;
 }
 
-static int find_powerf_dev(struct device *dev, void *data)
+static int find_powerf_dev(struct device *dev, const void *data)
 {
 	struct acpi_device *device = to_acpi_device(dev);
 	const char *hid = acpi_device_hid(device);
diff --git a/drivers/acpi/utils.c b/drivers/acpi/utils.c
index 1391b63cadfd..e3974a8f8fd4 100644
--- a/drivers/acpi/utils.c
+++ b/drivers/acpi/utils.c
@@ -730,10 +730,10 @@ struct acpi_dev_match_info {
 	s64 hrv;
 };
 
-static int acpi_dev_match_cb(struct device *dev, void *data)
+static int acpi_dev_match_cb(struct device *dev, const void *data)
 {
 	struct acpi_device *adev = to_acpi_device(dev);
-	struct acpi_dev_match_info *match = data;
+	const struct acpi_dev_match_info *match = data;
 	unsigned long long hrv;
 	acpi_status status;
 
diff --git a/drivers/base/bus.c b/drivers/base/bus.c
index 0a58e969f8b7..df3cac739813 100644
--- a/drivers/base/bus.c
+++ b/drivers/base/bus.c
@@ -323,8 +323,8 @@ EXPORT_SYMBOL_GPL(bus_for_each_dev);
  * return to the caller and not iterate over any more devices.
  */
 struct device *bus_find_device(struct bus_type *bus,
-			       struct device *start, void *data,
-			       int (*match)(struct device *dev, void *data))
+			       struct device *start, const void *data,
+			       int (*match)(struct device *dev, const void *data))
 {
 	struct klist_iter i;
 	struct device *dev;
@@ -342,7 +342,7 @@ struct device *bus_find_device(struct bus_type *bus,
 }
 EXPORT_SYMBOL_GPL(bus_find_device);
 
-static int match_name(struct device *dev, void *data)
+static int match_name(struct device *dev, const void *data)
 {
 	const char *name = data;
 
diff --git a/drivers/base/devcon.c b/drivers/base/devcon.c
index 04db9ae235e4..ac026d5fc672 100644
--- a/drivers/base/devcon.c
+++ b/drivers/base/devcon.c
@@ -107,7 +107,7 @@ static struct bus_type *generic_match_buses[] = {
 	NULL,
 };
 
-static int device_fwnode_match(struct device *dev, void *fwnode)
+static int device_fwnode_match(struct device *dev, const void *fwnode)
 {
 	return dev_fwnode(dev) == fwnode;
 }
diff --git a/drivers/char/ipmi/ipmi_si_platform.c b/drivers/char/ipmi/ipmi_si_platform.c
index f2a91c4d8cab..fd94c4238449 100644
--- a/drivers/char/ipmi/ipmi_si_platform.c
+++ b/drivers/char/ipmi/ipmi_si_platform.c
@@ -426,7 +426,7 @@ static int ipmi_remove(struct platform_device *pdev)
 	return ipmi_si_remove_by_dev(&pdev->dev);
 }
 
-static int pdev_match_name(struct device *dev, void *data)
+static int pdev_match_name(struct device *dev, const void *data)
 {
 	struct platform_device *pdev = to_platform_device(dev);
 	const char *name = data;
diff --git a/drivers/firmware/efi/dev-path-parser.c b/drivers/firmware/efi/dev-path-parser.c
index 85ec99f97841..20123384271c 100644
--- a/drivers/firmware/efi/dev-path-parser.c
+++ b/drivers/firmware/efi/dev-path-parser.c
@@ -17,9 +17,9 @@ struct acpi_hid_uid {
 	char uid[11]; /* UINT_MAX + null byte */
 };
 
-static int __init match_acpi_dev(struct device *dev, void *data)
+static int __init match_acpi_dev(struct device *dev, const void *data)
 {
-	struct acpi_hid_uid hid_uid = *(struct acpi_hid_uid *)data;
+	struct acpi_hid_uid hid_uid = *(const struct acpi_hid_uid *)data;
 	struct acpi_device *adev = to_acpi_device(dev);
 
 	if (acpi_match_device_ids(adev, hid_uid.hid))
diff --git a/drivers/gpu/drm/drm_mipi_dsi.c b/drivers/gpu/drm/drm_mipi_dsi.c
index 80b75501f5c6..ad19df0686c9 100644
--- a/drivers/gpu/drm/drm_mipi_dsi.c
+++ b/drivers/gpu/drm/drm_mipi_dsi.c
@@ -93,7 +93,7 @@ static struct bus_type mipi_dsi_bus_type = {
 	.pm = &mipi_dsi_device_pm_ops,
 };
 
-static int of_device_match(struct device *dev, void *data)
+static int of_device_match(struct device *dev, const void *data)
 {
 	return dev->of_node == data;
 }
diff --git a/drivers/hwtracing/coresight/coresight.c b/drivers/hwtracing/coresight/coresight.c
index 4b130281236a..b67ab6a09587 100644
--- a/drivers/hwtracing/coresight/coresight.c
+++ b/drivers/hwtracing/coresight/coresight.c
@@ -498,9 +498,9 @@ struct coresight_device *coresight_get_sink(struct list_head *path)
 	return csdev;
 }
 
-static int coresight_enabled_sink(struct device *dev, void *data)
+static int coresight_enabled_sink(struct device *dev, const void *data)
 {
-	bool *reset = data;
+	const bool *reset = data;
 	struct coresight_device *csdev = to_coresight_device(dev);
 
 	if ((csdev->type == CORESIGHT_DEV_TYPE_SINK ||
@@ -544,7 +544,7 @@ struct coresight_device *coresight_get_enabled_sink(bool deactivate)
 	return dev ? to_coresight_device(dev) : NULL;
 }
 
-static int coresight_sink_by_id(struct device *dev, void *data)
+static int coresight_sink_by_id(struct device *dev, const void *data)
 {
 	struct coresight_device *csdev = to_coresight_device(dev);
 	unsigned long hash;
diff --git a/drivers/hwtracing/coresight/of_coresight.c b/drivers/hwtracing/coresight/of_coresight.c
index 7045930fc958..3fc200ec1c03 100644
--- a/drivers/hwtracing/coresight/of_coresight.c
+++ b/drivers/hwtracing/coresight/of_coresight.c
@@ -18,7 +18,7 @@
 #include <asm/smp_plat.h>
 
 
-static int of_dev_node_match(struct device *dev, void *data)
+static int of_dev_node_match(struct device *dev, const void *data)
 {
 	return dev->of_node == data;
 }
diff --git a/drivers/hwtracing/intel_th/core.c b/drivers/hwtracing/intel_th/core.c
index 033dce563c99..55922896d862 100644
--- a/drivers/hwtracing/intel_th/core.c
+++ b/drivers/hwtracing/intel_th/core.c
@@ -789,10 +789,9 @@ static int intel_th_populate(struct intel_th *th)
 	return 0;
 }
 
-static int match_devt(struct device *dev, void *data)
+static int match_devt(struct device *dev, const void *data)
 {
-	dev_t devt = (dev_t)(unsigned long)data;
-
+	dev_t devt = (dev_t)(unsigned long)(void *)data;
 	return dev->devt == devt;
 }
 
diff --git a/drivers/i2c/i2c-core-acpi.c b/drivers/i2c/i2c-core-acpi.c
index d84095591e45..8af35f114821 100644
--- a/drivers/i2c/i2c-core-acpi.c
+++ b/drivers/i2c/i2c-core-acpi.c
@@ -318,7 +318,7 @@ u32 i2c_acpi_find_bus_speed(struct device *dev)
 }
 EXPORT_SYMBOL_GPL(i2c_acpi_find_bus_speed);
 
-static int i2c_acpi_find_match_adapter(struct device *dev, void *data)
+static int i2c_acpi_find_match_adapter(struct device *dev, const void *data)
 {
 	struct i2c_adapter *adapter = i2c_verify_adapter(dev);
 
@@ -328,7 +328,7 @@ static int i2c_acpi_find_match_adapter(struct device *dev, void *data)
 	return ACPI_HANDLE(dev) == (acpi_handle)data;
 }
 
-static int i2c_acpi_find_match_device(struct device *dev, void *data)
+static int i2c_acpi_find_match_device(struct device *dev, const void *data)
 {
 	return ACPI_COMPANION(dev) == data;
 }
diff --git a/drivers/i2c/i2c-core-of.c b/drivers/i2c/i2c-core-of.c
index 406e5f695a7e..2eb59a260ad4 100644
--- a/drivers/i2c/i2c-core-of.c
+++ b/drivers/i2c/i2c-core-of.c
@@ -112,12 +112,12 @@ void of_i2c_register_devices(struct i2c_adapter *adap)
 	of_node_put(bus);
 }
 
-static int of_dev_node_match(struct device *dev, void *data)
+static int of_dev_node_match(struct device *dev, const void *data)
 {
 	return dev->of_node == data;
 }
 
-static int of_dev_or_parent_node_match(struct device *dev, void *data)
+static int of_dev_or_parent_node_match(struct device *dev, const void *data)
 {
 	if (dev->of_node == data)
 		return 1;
diff --git a/drivers/iio/inkern.c b/drivers/iio/inkern.c
index 4a5eff3f18bc..c46fb59d92cb 100644
--- a/drivers/iio/inkern.c
+++ b/drivers/iio/inkern.c
@@ -93,7 +93,7 @@ static const struct iio_chan_spec
 
 #ifdef CONFIG_OF
 
-static int iio_dev_node_match(struct device *dev, void *data)
+static int iio_dev_node_match(struct device *dev, const void *data)
 {
 	return dev->of_node == data && dev->type == &iio_device_type;
 }
diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v1.c b/drivers/infiniband/hw/hns/hns_roce_hw_v1.c
index 4c5d0f160c10..fd90b05849c8 100644
--- a/drivers/infiniband/hw/hns/hns_roce_hw_v1.c
+++ b/drivers/infiniband/hw/hns/hns_roce_hw_v1.c
@@ -4497,7 +4497,7 @@ static const struct acpi_device_id hns_roce_acpi_match[] = {
 };
 MODULE_DEVICE_TABLE(acpi, hns_roce_acpi_match);
 
-static int hns_roce_node_match(struct device *dev, void *fwnode)
+static int hns_roce_node_match(struct device *dev, const void *fwnode)
 {
 	return dev->fwnode == fwnode;
 }
diff --git a/drivers/net/ethernet/hisilicon/hns/hns_dsaf_misc.c b/drivers/net/ethernet/hisilicon/hns/hns_dsaf_misc.c
index 09c16d88172e..bb6586d0e5af 100644
--- a/drivers/net/ethernet/hisilicon/hns/hns_dsaf_misc.c
+++ b/drivers/net/ethernet/hisilicon/hns/hns_dsaf_misc.c
@@ -754,7 +754,7 @@ struct dsaf_misc_op *hns_misc_op_get(struct dsaf_device *dsaf_dev)
 	return (void *)misc_op;
 }
 
-static int hns_dsaf_dev_match(struct device *dev, void *fwnode)
+static int hns_dsaf_dev_match(struct device *dev, const void *fwnode)
 {
 	return dev->fwnode == fwnode;
 }
diff --git a/drivers/net/ethernet/ti/cpsw-phy-sel.c b/drivers/net/ethernet/ti/cpsw-phy-sel.c
index 48e0924259f5..4e184eecc8e1 100644
--- a/drivers/net/ethernet/ti/cpsw-phy-sel.c
+++ b/drivers/net/ethernet/ti/cpsw-phy-sel.c
@@ -151,9 +151,9 @@ static void cpsw_gmii_sel_dra7xx(struct cpsw_phy_sel_priv *priv,
 }
 
 static struct platform_driver cpsw_phy_sel_driver;
-static int match(struct device *dev, void *data)
+static int match(struct device *dev, const void *data)
 {
-	struct device_node *node = (struct device_node *)data;
+	const struct device_node *node = (const struct device_node *)data;
 	return dev->of_node == node &&
 		dev->driver == &cpsw_phy_sel_driver.driver;
 }
diff --git a/drivers/net/ethernet/ti/davinci_emac.c b/drivers/net/ethernet/ti/davinci_emac.c
index 4bf65cab79e6..57d131a04db3 100644
--- a/drivers/net/ethernet/ti/davinci_emac.c
+++ b/drivers/net/ethernet/ti/davinci_emac.c
@@ -1371,7 +1371,7 @@ static int emac_devioctl(struct net_device *ndev, struct ifreq *ifrq, int cmd)
 		return -EOPNOTSUPP;
 }
 
-static int match_first_device(struct device *dev, void *data)
+static int match_first_device(struct device *dev, const void *data)
 {
 	if (dev->parent && dev->parent->of_node)
 		return of_device_is_compatible(dev->parent->of_node,
diff --git a/drivers/net/ethernet/toshiba/tc35815.c b/drivers/net/ethernet/toshiba/tc35815.c
index c50a9772f4af..8479a440527b 100644
--- a/drivers/net/ethernet/toshiba/tc35815.c
+++ b/drivers/net/ethernet/toshiba/tc35815.c
@@ -694,10 +694,10 @@ err_out:
  * should provide a "tc35815-mac" device with a MAC address in its
  * platform_data.
  */
-static int tc35815_mac_match(struct device *dev, void *data)
+static int tc35815_mac_match(struct device *dev, const void *data)
 {
 	struct platform_device *plat_dev = to_platform_device(dev);
-	struct pci_dev *pci_dev = data;
+	const struct pci_dev *pci_dev = data;
 	unsigned int id = pci_dev->irq;
 	return !strcmp(plat_dev->name, "tc35815-mac") && plat_dev->id == id;
 }
diff --git a/drivers/nvmem/core.c b/drivers/nvmem/core.c
index c7892c3da91f..ac5d945be88a 100644
--- a/drivers/nvmem/core.c
+++ b/drivers/nvmem/core.c
@@ -76,7 +76,7 @@ static struct bus_type nvmem_bus_type = {
 	.name		= "nvmem",
 };
 
-static int of_nvmem_match(struct device *dev, void *nvmem_np)
+static int of_nvmem_match(struct device *dev, const void *nvmem_np)
 {
 	return dev->of_node == nvmem_np;
 }
diff --git a/drivers/of/of_mdio.c b/drivers/of/of_mdio.c
index de6157357e26..dfe12948c834 100644
--- a/drivers/of/of_mdio.c
+++ b/drivers/of/of_mdio.c
@@ -282,7 +282,7 @@ unregister:
 EXPORT_SYMBOL(of_mdiobus_register);
 
 /* Helper function for of_phy_find_device */
-static int of_phy_match(struct device *dev, void *phy_np)
+static int of_phy_match(struct device *dev, const void *phy_np)
 {
 	return dev->of_node == phy_np;
 }
diff --git a/drivers/of/platform.c b/drivers/of/platform.c
index 04ad312fd85b..008d79e33c2d 100644
--- a/drivers/of/platform.c
+++ b/drivers/of/platform.c
@@ -37,7 +37,7 @@ static const struct of_device_id of_skipped_node_table[] = {
 	{} /* Empty terminated list */
 };
 
-static int of_dev_node_match(struct device *dev, void *data)
+static int of_dev_node_match(struct device *dev, const void *data)
 {
 	return dev->of_node == data;
 }
diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
index 0e8e2c186f50..f9ef7ad3f75d 100644
--- a/drivers/pci/probe.c
+++ b/drivers/pci/probe.c
@@ -64,7 +64,7 @@ static struct resource *get_pci_domain_busn_res(int domain_nr)
 	return &r->res;
 }
 
-static int find_anything(struct device *dev, void *data)
+static int find_anything(struct device *dev, const void *data)
 {
 	return 1;
 }
diff --git a/drivers/pci/search.c b/drivers/pci/search.c
index 5c7922612733..7f4e65872b8d 100644
--- a/drivers/pci/search.c
+++ b/drivers/pci/search.c
@@ -236,10 +236,10 @@ struct pci_dev *pci_get_domain_bus_and_slot(int domain, unsigned int bus,
 }
 EXPORT_SYMBOL(pci_get_domain_bus_and_slot);
 
-static int match_pci_dev_by_id(struct device *dev, void *data)
+static int match_pci_dev_by_id(struct device *dev, const void *data)
 {
 	struct pci_dev *pdev = to_pci_dev(dev);
-	struct pci_device_id *id = data;
+	const struct pci_device_id *id = data;
 
 	if (pci_match_one_device(id, pdev))
 		return 1;
diff --git a/drivers/s390/cio/css.c b/drivers/s390/cio/css.c
index aea502922646..a2c97830efe0 100644
--- a/drivers/s390/cio/css.c
+++ b/drivers/s390/cio/css.c
@@ -434,10 +434,10 @@ static int css_probe_device(struct subchannel_id schid, struct schib *schib)
 }
 
 static int
-check_subchannel(struct device * dev, void * data)
+check_subchannel(struct device *dev, const void *data)
 {
 	struct subchannel *sch;
-	struct subchannel_id *schid = data;
+	struct subchannel_id *schid = (void *)data;
 
 	sch = to_subchannel(dev);
 	return schid_equal(&sch->schid, schid);
diff --git a/drivers/s390/cio/device.c b/drivers/s390/cio/device.c
index 1540229a37bb..d32f373e5bc7 100644
--- a/drivers/s390/cio/device.c
+++ b/drivers/s390/cio/device.c
@@ -642,10 +642,10 @@ static int ccw_device_add(struct ccw_device *cdev)
 	return device_add(dev);
 }
 
-static int match_dev_id(struct device *dev, void *data)
+static int match_dev_id(struct device *dev, const void *data)
 {
 	struct ccw_device *cdev = to_ccwdev(dev);
-	struct ccw_dev_id *dev_id = data;
+	struct ccw_dev_id *dev_id = (void *)data;
 
 	return ccw_dev_id_is_equal(&cdev->private->dev_id, dev_id);
 }
diff --git a/drivers/s390/cio/scm.c b/drivers/s390/cio/scm.c
index 6bca1d5455d4..9f26d4310bb3 100644
--- a/drivers/s390/cio/scm.c
+++ b/drivers/s390/cio/scm.c
@@ -174,10 +174,10 @@ out:
 		kobject_uevent(&scmdev->dev.kobj, KOBJ_CHANGE);
 }
 
-static int check_address(struct device *dev, void *data)
+static int check_address(struct device *dev, const void *data)
 {
 	struct scm_device *scmdev = to_scm_dev(dev);
-	struct sale *sale = data;
+	const struct sale *sale = data;
 
 	return scmdev->address == sale->sa;
 }
diff --git a/drivers/s390/crypto/ap_bus.c b/drivers/s390/crypto/ap_bus.c
index b9fc502c58c2..b7902b643ec8 100644
--- a/drivers/s390/crypto/ap_bus.c
+++ b/drivers/s390/crypto/ap_bus.c
@@ -1356,16 +1356,16 @@ static int ap_get_compatible_type(ap_qid_t qid, int rawtype, unsigned int func)
  * Helper function to be used with bus_find_dev
  * matches for the card device with the given id
  */
-static int __match_card_device_with_id(struct device *dev, void *data)
+static int __match_card_device_with_id(struct device *dev, const void *data)
 {
-	return is_card_dev(dev) && to_ap_card(dev)->id == (int)(long) data;
+	return is_card_dev(dev) && to_ap_card(dev)->id == (int)(long)(void *) data;
 }
 
 /*
  * Helper function to be used with bus_find_dev
  * matches for the queue device with a given qid
  */
-static int __match_queue_device_with_qid(struct device *dev, void *data)
+static int __match_queue_device_with_qid(struct device *dev, const void *data)
 {
 	return is_queue_dev(dev) && to_ap_queue(dev)->qid == (int)(long) data;
 }
@@ -1374,7 +1374,7 @@ static int __match_queue_device_with_qid(struct device *dev, void *data)
  * Helper function to be used with bus_find_dev
  * matches any queue device with given queue id
  */
-static int __match_queue_device_with_queue_id(struct device *dev, void *data)
+static int __match_queue_device_with_queue_id(struct device *dev, const void *data)
 {
 	return is_queue_dev(dev)
 		&& AP_QID_QUEUE(to_ap_queue(dev)->qid) == (int)(long) data;
diff --git a/drivers/scsi/scsi_proc.c b/drivers/scsi/scsi_proc.c
index 7f0ceb65c3f3..c074631086a4 100644
--- a/drivers/scsi/scsi_proc.c
+++ b/drivers/scsi/scsi_proc.c
@@ -372,7 +372,7 @@ static ssize_t proc_scsi_write(struct file *file, const char __user *buf,
 	return err;
 }
 
-static int always_match(struct device *dev, void *data)
+static int always_match(struct device *dev, const void *data)
 {
 	return 1;
 }
diff --git a/drivers/spi/spi.c b/drivers/spi/spi.c
index 5e75944ad5d1..3da1121f7572 100644
--- a/drivers/spi/spi.c
+++ b/drivers/spi/spi.c
@@ -3538,7 +3538,7 @@ EXPORT_SYMBOL_GPL(spi_write_then_read);
 /*-------------------------------------------------------------------------*/
 
 #if IS_ENABLED(CONFIG_OF)
-static int __spi_of_device_match(struct device *dev, void *data)
+static int __spi_of_device_match(struct device *dev, const void *data)
 {
 	return dev->of_node == data;
 }
@@ -3639,7 +3639,7 @@ static int spi_acpi_controller_match(struct device *dev, const void *data)
 	return ACPI_COMPANION(dev->parent) == data;
 }
 
-static int spi_acpi_device_match(struct device *dev, void *data)
+static int spi_acpi_device_match(struct device *dev, const void *data)
 {
 	return ACPI_COMPANION(dev) == data;
 }
diff --git a/drivers/thunderbolt/switch.c b/drivers/thunderbolt/switch.c
index c1b016574fb4..c9a7e4a779cd 100644
--- a/drivers/thunderbolt/switch.c
+++ b/drivers/thunderbolt/switch.c
@@ -1946,10 +1946,10 @@ struct tb_sw_lookup {
 	u64 route;
 };
 
-static int tb_switch_match(struct device *dev, void *data)
+static int tb_switch_match(struct device *dev, const void *data)
 {
 	struct tb_switch *sw = tb_to_switch(dev);
-	struct tb_sw_lookup *lookup = data;
+	const struct tb_sw_lookup *lookup = data;
 
 	if (!sw)
 		return 0;
diff --git a/drivers/usb/core/devio.c b/drivers/usb/core/devio.c
index fa783531ee88..7bd7de7273a3 100644
--- a/drivers/usb/core/devio.c
+++ b/drivers/usb/core/devio.c
@@ -947,9 +947,9 @@ error:
 	return ret;
 }
 
-static int match_devt(struct device *dev, void *data)
+static int match_devt(struct device *dev, const void *data)
 {
-	return dev->devt == (dev_t) (unsigned long) data;
+	return dev->devt == (dev_t)(unsigned long)(void *)data;
 }
 
 static struct usb_device *usbdev_lookup_by_devt(dev_t devt)
diff --git a/drivers/usb/core/usb.c b/drivers/usb/core/usb.c
index 7fcb9f782931..1678e305e037 100644
--- a/drivers/usb/core/usb.c
+++ b/drivers/usb/core/usb.c
@@ -325,9 +325,9 @@ struct find_interface_arg {
 	struct device_driver *drv;
 };
 
-static int __find_interface(struct device *dev, void *data)
+static int __find_interface(struct device *dev, const void *data)
 {
-	struct find_interface_arg *arg = data;
+	const struct find_interface_arg *arg = data;
 	struct usb_interface *intf;
 
 	if (!is_usb_interface(dev))
diff --git a/drivers/usb/phy/phy-am335x-control.c b/drivers/usb/phy/phy-am335x-control.c
index a3cb25cb74f8..d16dfc320faa 100644
--- a/drivers/usb/phy/phy-am335x-control.c
+++ b/drivers/usb/phy/phy-am335x-control.c
@@ -118,9 +118,9 @@ static const struct of_device_id omap_control_usb_id_table[] = {
 MODULE_DEVICE_TABLE(of, omap_control_usb_id_table);
 
 static struct platform_driver am335x_control_driver;
-static int match(struct device *dev, void *data)
+static int match(struct device *dev, const void *data)
 {
-	struct device_node *node = (struct device_node *)data;
+	const struct device_node *node = (const struct device_node *)data;
 	return dev->of_node == node &&
 		dev->driver == &am335x_control_driver.driver;
 }
diff --git a/drivers/usb/phy/phy-isp1301.c b/drivers/usb/phy/phy-isp1301.c
index 93b7d6a30aad..6cf6fbd39237 100644
--- a/drivers/usb/phy/phy-isp1301.c
+++ b/drivers/usb/phy/phy-isp1301.c
@@ -142,9 +142,9 @@ static struct i2c_driver isp1301_driver = {
 
 module_i2c_driver(isp1301_driver);
 
-static int match(struct device *dev, void *data)
+static int match(struct device *dev, const void *data)
 {
-	struct device_node *node = (struct device_node *)data;
+	const struct device_node *node = (const struct device_node *)data;
 	return (dev->of_node == node) &&
 		(dev->driver == &isp1301_driver.driver);
 }
diff --git a/drivers/visorbus/visorbus_main.c b/drivers/visorbus/visorbus_main.c
index 0b2434cc4ecd..152fd29f04f2 100644
--- a/drivers/visorbus/visorbus_main.c
+++ b/drivers/visorbus/visorbus_main.c
@@ -171,10 +171,10 @@ struct visor_busdev {
 	u32 dev_no;
 };
 
-static int match_visorbus_dev_by_id(struct device *dev, void *data)
+static int match_visorbus_dev_by_id(struct device *dev, const void *data)
 {
 	struct visor_device *vdev = to_visor_device(dev);
-	struct visor_busdev *id = data;
+	const struct visor_busdev *id = data;
 
 	if (vdev->chipset_bus_no == id->bus_no &&
 	    vdev->chipset_dev_no == id->dev_no)
diff --git a/include/linux/device.h b/include/linux/device.h
index e85264fb6616..cbbdcadc660e 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -166,8 +166,8 @@ void subsys_dev_iter_exit(struct subsys_dev_iter *iter);
 int bus_for_each_dev(struct bus_type *bus, struct device *start, void *data,
 		     int (*fn)(struct device *dev, void *data));
 struct device *bus_find_device(struct bus_type *bus, struct device *start,
-			       void *data,
-			       int (*match)(struct device *dev, void *data));
+			       const void *data,
+			       int (*match)(struct device *dev, const void *data));
 struct device *bus_find_device_by_name(struct bus_type *bus,
 				       struct device *start,
 				       const char *name);
diff --git a/sound/soc/rockchip/rk3399_gru_sound.c b/sound/soc/rockchip/rk3399_gru_sound.c
index 3d0cc6e90d7b..c04c9ed185b7 100644
--- a/sound/soc/rockchip/rk3399_gru_sound.c
+++ b/sound/soc/rockchip/rk3399_gru_sound.c
@@ -405,7 +405,7 @@ static const struct dailink_match_data dailink_match[] = {
 	},
 };
 
-static int of_dev_node_match(struct device *dev, void *data)
+static int of_dev_node_match(struct device *dev, const void *data)
 {
 	return dev->of_node == data;
 }
-- 
cgit v1.2.3


From 92ce7e83b4e5c86687d748ba53cb755acdce1256 Mon Sep 17 00:00:00 2001
From: Suzuki K Poulose <suzuki.poulose@arm.com>
Date: Fri, 14 Jun 2019 18:54:00 +0100
Subject: driver_find_device: Unify the match function with class_find_device()

The driver_find_device() accepts a match function pointer to
filter the devices for lookup, similar to bus/class_find_device().
However, there is a minor difference in the prototype for the
match parameter for driver_find_device() with the now unified
version accepted by {bus/class}_find_device(), where it doesn't
accept a "const" qualifier for the data argument. This prevents
us from reusing the generic match functions for driver_find_device().

For this reason, change the prototype of the driver_find_device() to
make the "match" parameter in line with {bus/class}_find_device()
and adjust its callers to use the const qualifier. Also, we could
now promote the "data" parameter to const as we pass it down
as a const parameter to the match functions.

Cc: Corey Minyard <minyard@acm.org>
Cc: Russell King <linux@armlinux.org.uk>
Cc: Thierry Reding <thierry.reding@gmail.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: "Rafael J. Wysocki" <rafael@kernel.org>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Joerg Roedel <joro@8bytes.org>
Cc: Peter Oberparleiter <oberpar@linux.ibm.com>
Cc: Sebastian Ott <sebott@linux.ibm.com>
Cc: David Airlie <airlied@linux.ie>
Cc: Daniel Vetter <daniel@ffwll.ch>
Cc: Nehal Shah <nehal-bakulchandra.shah@amd.com>
Cc: Shyam Sundar S K <shyam-sundar.s-k@amd.com>
Cc: Lee Jones <lee.jones@linaro.org>
Cc: Christian Borntraeger <borntraeger@de.ibm.com>
Signed-off-by: Suzuki K Poulose <suzuki.poulose@arm.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/amba/tegra-ahb.c             | 4 ++--
 drivers/base/driver.c                | 4 ++--
 drivers/char/ipmi/ipmi_msghandler.c  | 8 ++++----
 drivers/gpu/drm/tegra/dc.c           | 4 ++--
 drivers/i2c/busses/i2c-amd-mp2-pci.c | 2 +-
 drivers/iommu/arm-smmu-v3.c          | 2 +-
 drivers/iommu/arm-smmu.c             | 2 +-
 drivers/mfd/altera-sysmgr.c          | 4 ++--
 drivers/s390/cio/ccwgroup.c          | 4 ++--
 drivers/s390/cio/chsc_sch.c          | 2 +-
 drivers/s390/cio/device.c            | 2 +-
 include/linux/device.h               | 4 ++--
 12 files changed, 21 insertions(+), 21 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/amba/tegra-ahb.c b/drivers/amba/tegra-ahb.c
index 3751d811be39..42175a67ba0e 100644
--- a/drivers/amba/tegra-ahb.c
+++ b/drivers/amba/tegra-ahb.c
@@ -143,10 +143,10 @@ static inline void gizmo_writel(struct tegra_ahb *ahb, u32 value, u32 offset)
 }
 
 #ifdef CONFIG_TEGRA_IOMMU_SMMU
-static int tegra_ahb_match_by_smmu(struct device *dev, void *data)
+static int tegra_ahb_match_by_smmu(struct device *dev, const void *data)
 {
 	struct tegra_ahb *ahb = dev_get_drvdata(dev);
-	struct device_node *dn = data;
+	const struct device_node *dn = data;
 
 	return (ahb->dev->of_node == dn) ? 1 : 0;
 }
diff --git a/drivers/base/driver.c b/drivers/base/driver.c
index 857c8f1b876e..4e5ca632f35e 100644
--- a/drivers/base/driver.c
+++ b/drivers/base/driver.c
@@ -73,8 +73,8 @@ EXPORT_SYMBOL_GPL(driver_for_each_device);
  * return to the caller and not iterate over any more devices.
  */
 struct device *driver_find_device(struct device_driver *drv,
-				  struct device *start, void *data,
-				  int (*match)(struct device *dev, void *data))
+				  struct device *start, const void *data,
+				  int (*match)(struct device *dev, const void *data))
 {
 	struct klist_iter i;
 	struct device *dev;
diff --git a/drivers/char/ipmi/ipmi_msghandler.c b/drivers/char/ipmi/ipmi_msghandler.c
index 1dc10740fc0f..6707659cffd6 100644
--- a/drivers/char/ipmi/ipmi_msghandler.c
+++ b/drivers/char/ipmi/ipmi_msghandler.c
@@ -2819,9 +2819,9 @@ static const struct device_type bmc_device_type = {
 	.groups		= bmc_dev_attr_groups,
 };
 
-static int __find_bmc_guid(struct device *dev, void *data)
+static int __find_bmc_guid(struct device *dev, const void *data)
 {
-	guid_t *guid = data;
+	const guid_t *guid = data;
 	struct bmc_device *bmc;
 	int rv;
 
@@ -2857,9 +2857,9 @@ struct prod_dev_id {
 	unsigned char device_id;
 };
 
-static int __find_bmc_prod_dev_id(struct device *dev, void *data)
+static int __find_bmc_prod_dev_id(struct device *dev, const void *data)
 {
-	struct prod_dev_id *cid = data;
+	const struct prod_dev_id *cid = data;
 	struct bmc_device *bmc;
 	int rv;
 
diff --git a/drivers/gpu/drm/tegra/dc.c b/drivers/gpu/drm/tegra/dc.c
index 607a6ea17ecc..52109a63e797 100644
--- a/drivers/gpu/drm/tegra/dc.c
+++ b/drivers/gpu/drm/tegra/dc.c
@@ -2375,10 +2375,10 @@ static int tegra_dc_parse_dt(struct tegra_dc *dc)
 	return 0;
 }
 
-static int tegra_dc_match_by_pipe(struct device *dev, void *data)
+static int tegra_dc_match_by_pipe(struct device *dev, const void *data)
 {
 	struct tegra_dc *dc = dev_get_drvdata(dev);
-	unsigned int pipe = (unsigned long)data;
+	unsigned int pipe = (unsigned long)(void *)data;
 
 	return dc->pipe == pipe;
 }
diff --git a/drivers/i2c/busses/i2c-amd-mp2-pci.c b/drivers/i2c/busses/i2c-amd-mp2-pci.c
index 455e1f36a2a3..c7fe3b44a860 100644
--- a/drivers/i2c/busses/i2c-amd-mp2-pci.c
+++ b/drivers/i2c/busses/i2c-amd-mp2-pci.c
@@ -457,7 +457,7 @@ static struct pci_driver amd_mp2_pci_driver = {
 };
 module_pci_driver(amd_mp2_pci_driver);
 
-static int amd_mp2_device_match(struct device *dev, void *data)
+static int amd_mp2_device_match(struct device *dev, const void *data)
 {
 	return 1;
 }
diff --git a/drivers/iommu/arm-smmu-v3.c b/drivers/iommu/arm-smmu-v3.c
index 4d5a694f02c2..d787856f9dcf 100644
--- a/drivers/iommu/arm-smmu-v3.c
+++ b/drivers/iommu/arm-smmu-v3.c
@@ -2023,7 +2023,7 @@ arm_smmu_iova_to_phys(struct iommu_domain *domain, dma_addr_t iova)
 
 static struct platform_driver arm_smmu_driver;
 
-static int arm_smmu_match_node(struct device *dev, void *data)
+static int arm_smmu_match_node(struct device *dev, const void *data)
 {
 	return dev->fwnode == data;
 }
diff --git a/drivers/iommu/arm-smmu.c b/drivers/iommu/arm-smmu.c
index 5e54cc0a28b3..4ce429b74655 100644
--- a/drivers/iommu/arm-smmu.c
+++ b/drivers/iommu/arm-smmu.c
@@ -1431,7 +1431,7 @@ static bool arm_smmu_capable(enum iommu_cap cap)
 	}
 }
 
-static int arm_smmu_match_node(struct device *dev, void *data)
+static int arm_smmu_match_node(struct device *dev, const void *data)
 {
 	return dev->fwnode == data;
 }
diff --git a/drivers/mfd/altera-sysmgr.c b/drivers/mfd/altera-sysmgr.c
index 8976f82785bb..2ee14d8a6d31 100644
--- a/drivers/mfd/altera-sysmgr.c
+++ b/drivers/mfd/altera-sysmgr.c
@@ -92,9 +92,9 @@ static struct regmap_config altr_sysmgr_regmap_cfg = {
  * Matching function used by driver_find_device().
  * Return: True if match is found, otherwise false.
  */
-static int sysmgr_match_phandle(struct device *dev, void *data)
+static int sysmgr_match_phandle(struct device *dev, const void *data)
 {
-	return dev->of_node == (struct device_node *)data;
+	return dev->of_node == (const struct device_node *)data;
 }
 
 /**
diff --git a/drivers/s390/cio/ccwgroup.c b/drivers/s390/cio/ccwgroup.c
index 4ebf6d4fc66c..ea17615789c9 100644
--- a/drivers/s390/cio/ccwgroup.c
+++ b/drivers/s390/cio/ccwgroup.c
@@ -581,7 +581,7 @@ int ccwgroup_driver_register(struct ccwgroup_driver *cdriver)
 }
 EXPORT_SYMBOL(ccwgroup_driver_register);
 
-static int __ccwgroup_match_all(struct device *dev, void *data)
+static int __ccwgroup_match_all(struct device *dev, const void *data)
 {
 	return 1;
 }
@@ -608,7 +608,7 @@ void ccwgroup_driver_unregister(struct ccwgroup_driver *cdriver)
 }
 EXPORT_SYMBOL(ccwgroup_driver_unregister);
 
-static int __ccwgroupdev_check_busid(struct device *dev, void *id)
+static int __ccwgroupdev_check_busid(struct device *dev, const void *id)
 {
 	char *bus_id = id;
 
diff --git a/drivers/s390/cio/chsc_sch.c b/drivers/s390/cio/chsc_sch.c
index 8d9f36625ba5..8f080d3fd380 100644
--- a/drivers/s390/cio/chsc_sch.c
+++ b/drivers/s390/cio/chsc_sch.c
@@ -203,7 +203,7 @@ static void chsc_cleanup_sch_driver(void)
 
 static DEFINE_SPINLOCK(chsc_lock);
 
-static int chsc_subchannel_match_next_free(struct device *dev, void *data)
+static int chsc_subchannel_match_next_free(struct device *dev, const void *data)
 {
 	struct subchannel *sch = to_subchannel(dev);
 
diff --git a/drivers/s390/cio/device.c b/drivers/s390/cio/device.c
index d32f373e5bc7..f27536ba58eb 100644
--- a/drivers/s390/cio/device.c
+++ b/drivers/s390/cio/device.c
@@ -1653,7 +1653,7 @@ EXPORT_SYMBOL_GPL(ccw_device_force_console);
  * get ccw_device matching the busid, but only if owned by cdrv
  */
 static int
-__ccwdev_check_busid(struct device *dev, void *id)
+__ccwdev_check_busid(struct device *dev, const void *id)
 {
 	char *bus_id;
 
diff --git a/include/linux/device.h b/include/linux/device.h
index cbbdcadc660e..4d7c88131a4d 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -336,8 +336,8 @@ extern int __must_check driver_for_each_device(struct device_driver *drv,
 					       int (*fn)(struct device *dev,
 							 void *));
 struct device *driver_find_device(struct device_driver *drv,
-				  struct device *start, void *data,
-				  int (*match)(struct device *dev, void *data));
+				  struct device *start, const void *data,
+				  int (*match)(struct device *dev, const void *data));
 
 void driver_deferred_probe_add(struct device *dev);
 int driver_deferred_probe_check_state(struct device *dev);
-- 
cgit v1.2.3


From 65b66682344a15ba2069d4dd8d0cc39cc3aed7e9 Mon Sep 17 00:00:00 2001
From: Suzuki K Poulose <suzuki.poulose@arm.com>
Date: Fri, 14 Jun 2019 18:54:01 +0100
Subject: drivers: Add generic helper to match by of_node

Add a helper to match device by the of_node. This will be later used
to provide wrappers to the device iterators for {bus/class/driver}_find_device().
Convert other users to reuse this new helper.

Cc: Alan Tull <atull@kernel.org>
Cc: Andrew Lunn <andrew@lunn.ch>
Cc: Daniel Vetter <daniel@ffwll.ch>
Cc: David Airlie <airlied@linux.ie>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: devicetree@vger.kernel.org
Cc: dri-devel@lists.freedesktop.org
Cc: Florian Fainelli <f.fainelli@gmail.com>
Cc: Frank Rowand <frowand.list@gmail.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Heiner Kallweit <hkallweit1@gmail.com>
Cc: Jiri Slaby <jslaby@suse.com>
Cc: Jonathan Hunter <jonathanh@nvidia.com>
Cc: Lee Jones <lee.jones@linaro.org>
Cc: Liam Girdwood <lgirdwood@gmail.com>
Cc: linux-fpga@vger.kernel.org
Cc: linux-i2c@vger.kernel.org
Cc: linux-spi@vger.kernel.org
Cc: Maarten Lankhorst <maarten.lankhorst@linux.intel.com>
Cc: Mark Brown <broonie@kernel.org>
Cc: Mathieu Poirier <mathieu.poirier@linaro.org>
Cc: Maxime Ripard <maxime.ripard@bootlin.com>
Cc: Moritz Fischer <mdf@kernel.org>
Cc: Peter Rosin <peda@axentia.se>
Cc: Rob Herring <robh+dt@kernel.org>
Cc: Srinivas Kandagatla <srinivas.kandagatla@linaro.org>
Cc: Thierry Reding <thierry.reding@gmail.com>
Cc: Thor Thayer <thor.thayer@linux.intel.com>
Cc: Wolfram Sang <wsa@the-dreams.de>
Cc: "Rafael J. Wysocki" <rafael@kernel.org>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Ulf Hansson <ulf.hansson@linaro.org>
Cc: Joe Perches <joe@perches.com>
Signed-off-by: Suzuki K Poulose <suzuki.poulose@arm.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/base/core.c           | 6 ++++++
 drivers/fpga/of-fpga-region.c | 7 +------
 include/linux/device.h        | 2 ++
 3 files changed, 9 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/base/core.c b/drivers/base/core.c
index fd7511e04e62..92119080474c 100644
--- a/drivers/base/core.c
+++ b/drivers/base/core.c
@@ -3328,3 +3328,9 @@ void device_set_of_node_from_dev(struct device *dev, const struct device *dev2)
 	dev->of_node_reused = true;
 }
 EXPORT_SYMBOL_GPL(device_set_of_node_from_dev);
+
+int device_match_of_node(struct device *dev, const void *np)
+{
+	return dev->of_node == np;
+}
+EXPORT_SYMBOL_GPL(device_match_of_node);
diff --git a/drivers/fpga/of-fpga-region.c b/drivers/fpga/of-fpga-region.c
index 75f64abf9c81..e405309baadc 100644
--- a/drivers/fpga/of-fpga-region.c
+++ b/drivers/fpga/of-fpga-region.c
@@ -22,11 +22,6 @@ static const struct of_device_id fpga_region_of_match[] = {
 };
 MODULE_DEVICE_TABLE(of, fpga_region_of_match);
 
-static int fpga_region_of_node_match(struct device *dev, const void *data)
-{
-	return dev->of_node == data;
-}
-
 /**
  * of_fpga_region_find - find FPGA region
  * @np: device node of FPGA Region
@@ -37,7 +32,7 @@ static int fpga_region_of_node_match(struct device *dev, const void *data)
  */
 static struct fpga_region *of_fpga_region_find(struct device_node *np)
 {
-	return fpga_region_class_find(NULL, np, fpga_region_of_node_match);
+	return fpga_region_class_find(NULL, np, device_match_of_node);
 }
 
 /**
diff --git a/include/linux/device.h b/include/linux/device.h
index 4d7c88131a4d..709308560d32 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -163,6 +163,8 @@ void subsys_dev_iter_init(struct subsys_dev_iter *iter,
 struct device *subsys_dev_iter_next(struct subsys_dev_iter *iter);
 void subsys_dev_iter_exit(struct subsys_dev_iter *iter);
 
+int device_match_of_node(struct device *dev, const void *np);
+
 int bus_for_each_dev(struct bus_type *bus, struct device *start, void *data,
 		     int (*fn)(struct device *dev, void *data));
 struct device *bus_find_device(struct bus_type *bus, struct device *start,
-- 
cgit v1.2.3


From 38b37d631aec80da0c65ac03a7ef680b468c7857 Mon Sep 17 00:00:00 2001
From: Matthias Schiffer <matthias.schiffer@ew.tq-group.com>
Date: Fri, 7 Jun 2019 12:49:11 +0200
Subject: module: allow arch overrides for .exit section names

Some archs like ARM store unwind information for .exit.text in sections
with unusual names. As this unwind information refers to .exit.text, it
must not be loaded when .exit.text is not loaded (when CONFIG_MODULE_UNLOAD
is unset); otherwise, loading a module can fail due to relocation failures.

Signed-off-by: Matthias Schiffer <matthias.schiffer@ew.tq-group.com>
Signed-off-by: Jessica Yu <jeyu@kernel.org>
---
 include/linux/moduleloader.h | 5 +++++
 kernel/module.c              | 7 ++++++-
 2 files changed, 11 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/moduleloader.h b/include/linux/moduleloader.h
index 31013c2effd3..5229c18025e9 100644
--- a/include/linux/moduleloader.h
+++ b/include/linux/moduleloader.h
@@ -29,6 +29,11 @@ void *module_alloc(unsigned long size);
 /* Free memory returned from module_alloc. */
 void module_memfree(void *module_region);
 
+/* Determines if the section name is an exit section (that is only used during
+ * module unloading)
+ */
+bool module_exit_section(const char *name);
+
 /*
  * Apply the given relocation to the (simplified) ELF.  Return -error
  * or 0.
diff --git a/kernel/module.c b/kernel/module.c
index 41258bab24f1..537c456ce3ee 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -2748,6 +2748,11 @@ void * __weak module_alloc(unsigned long size)
 	return vmalloc_exec(size);
 }
 
+bool __weak module_exit_section(const char *name)
+{
+	return strstarts(name, ".exit");
+}
+
 #ifdef CONFIG_DEBUG_KMEMLEAK
 static void kmemleak_load_module(const struct module *mod,
 				 const struct load_info *info)
@@ -2937,7 +2942,7 @@ static int rewrite_section_headers(struct load_info *info, int flags)
 
 #ifndef CONFIG_MODULE_UNLOAD
 		/* Don't load .exit sections */
-		if (strstarts(info->secstrings+shdr->sh_name, ".exit"))
+		if (module_exit_section(info->secstrings+shdr->sh_name))
 			shdr->sh_flags &= ~(unsigned long)SHF_ALLOC;
 #endif
 	}
-- 
cgit v1.2.3


From b0935123a18360d19f1dcc779ea33841cdc304cc Mon Sep 17 00:00:00 2001
From: Prakhar Srivastava <prsriva02@gmail.com>
Date: Sun, 23 Jun 2019 23:23:29 -0700
Subject: IMA: Define a new hook to measure the kexec boot command line
 arguments

Currently, during a soft reboot (kexec_file_load), the boot command line
arguments are not measured. Define the hooks needed to measure the kexec
command line arguments during a soft reboot (kexec_file_load).

- A new ima hook ima_kexec_cmdline is defined to be called by the
kexec code.
- A new function process_buffer_measurement is defined to measure
the buffer hash into the IMA measurement list.
- A new func policy KEXEC_CMDLINE is defined to control the
 measurement.

Signed-off-by: Prakhar Srivastava <prsriva02@gmail.com>
Signed-off-by: Mimi Zohar <zohar@linux.ibm.com>
---
 Documentation/ABI/testing/ima_policy |  1 +
 include/linux/ima.h                  |  2 +
 security/integrity/ima/ima.h         |  1 +
 security/integrity/ima/ima_api.c     |  1 +
 security/integrity/ima/ima_main.c    | 72 ++++++++++++++++++++++++++++++++++++
 security/integrity/ima/ima_policy.c  |  7 ++++
 6 files changed, 84 insertions(+)

(limited to 'include/linux')

diff --git a/Documentation/ABI/testing/ima_policy b/Documentation/ABI/testing/ima_policy
index b383c1763610..fc376a323908 100644
--- a/Documentation/ABI/testing/ima_policy
+++ b/Documentation/ABI/testing/ima_policy
@@ -28,6 +28,7 @@ Description:
 		base: 	func:= [BPRM_CHECK][MMAP_CHECK][CREDS_CHECK][FILE_CHECK][MODULE_CHECK]
 				[FIRMWARE_CHECK]
 				[KEXEC_KERNEL_CHECK] [KEXEC_INITRAMFS_CHECK]
+				[KEXEC_CMDLINE]
 			mask:= [[^]MAY_READ] [[^]MAY_WRITE] [[^]MAY_APPEND]
 			       [[^]MAY_EXEC]
 			fsmagic:= hex value
diff --git a/include/linux/ima.h b/include/linux/ima.h
index fd9f7cf4cdf5..b42f5a006042 100644
--- a/include/linux/ima.h
+++ b/include/linux/ima.h
@@ -26,6 +26,7 @@ extern int ima_read_file(struct file *file, enum kernel_read_file_id id);
 extern int ima_post_read_file(struct file *file, void *buf, loff_t size,
 			      enum kernel_read_file_id id);
 extern void ima_post_path_mknod(struct dentry *dentry);
+extern void ima_kexec_cmdline(const void *buf, int size);
 
 #ifdef CONFIG_IMA_KEXEC
 extern void ima_add_kexec_buffer(struct kimage *image);
@@ -92,6 +93,7 @@ static inline void ima_post_path_mknod(struct dentry *dentry)
 	return;
 }
 
+static inline void ima_kexec_cmdline(const void *buf, int size) {}
 #endif /* CONFIG_IMA */
 
 #ifndef CONFIG_IMA_KEXEC
diff --git a/security/integrity/ima/ima.h b/security/integrity/ima/ima.h
index e7b9ea7732d9..bdca641f9e51 100644
--- a/security/integrity/ima/ima.h
+++ b/security/integrity/ima/ima.h
@@ -190,6 +190,7 @@ static inline unsigned long ima_hash_key(u8 *digest)
 	hook(KEXEC_KERNEL_CHECK)	\
 	hook(KEXEC_INITRAMFS_CHECK)	\
 	hook(POLICY_CHECK)		\
+	hook(KEXEC_CMDLINE)		\
 	hook(MAX_CHECK)
 #define __ima_hook_enumify(ENUM)	ENUM,
 
diff --git a/security/integrity/ima/ima_api.c b/security/integrity/ima/ima_api.c
index c0cf4bcfc82f..d426d4d1fe04 100644
--- a/security/integrity/ima/ima_api.c
+++ b/security/integrity/ima/ima_api.c
@@ -178,6 +178,7 @@ err_out:
  *		subj=, obj=, type=, func=, mask=, fsmagic=
  *	subj,obj, and type: are LSM specific.
  *	func: FILE_CHECK | BPRM_CHECK | CREDS_CHECK | MMAP_CHECK | MODULE_CHECK
+ *	| KEXEC_CMDLINE
  *	mask: contains the permission mask
  *	fsmagic: hex value
  *
diff --git a/security/integrity/ima/ima_main.c b/security/integrity/ima/ima_main.c
index a7e7e2d7224c..2507bee1b762 100644
--- a/security/integrity/ima/ima_main.c
+++ b/security/integrity/ima/ima_main.c
@@ -609,6 +609,78 @@ int ima_load_data(enum kernel_load_data_id id)
 	return 0;
 }
 
+/*
+ * process_buffer_measurement - Measure the buffer to ima log.
+ * @buf: pointer to the buffer that needs to be added to the log.
+ * @size: size of buffer(in bytes).
+ * @eventname: event name to be used for the buffer entry.
+ * @cred: a pointer to a credentials structure for user validation.
+ * @secid: the secid of the task to be validated.
+ *
+ * Based on policy, the buffer is measured into the ima log.
+ */
+static void process_buffer_measurement(const void *buf, int size,
+				       const char *eventname,
+				       const struct cred *cred, u32 secid)
+{
+	int ret = 0;
+	struct ima_template_entry *entry = NULL;
+	struct integrity_iint_cache iint = {};
+	struct ima_event_data event_data = {.iint = &iint,
+					    .filename = eventname};
+	struct ima_template_desc *template_desc = NULL;
+	struct {
+		struct ima_digest_data hdr;
+		char digest[IMA_MAX_DIGEST_SIZE];
+	} hash = {};
+	int violation = 0;
+	int pcr = CONFIG_IMA_MEASURE_PCR_IDX;
+	int action = 0;
+
+	action = ima_get_action(NULL, cred, secid, 0, KEXEC_CMDLINE, &pcr,
+				&template_desc);
+	if (!(action & IMA_MEASURE))
+		return;
+
+	iint.ima_hash = &hash.hdr;
+	iint.ima_hash->algo = ima_hash_algo;
+	iint.ima_hash->length = hash_digest_size[ima_hash_algo];
+
+	ret = ima_calc_buffer_hash(buf, size, iint.ima_hash);
+	if (ret < 0)
+		goto out;
+
+	ret = ima_alloc_init_template(&event_data, &entry, template_desc);
+	if (ret < 0)
+		goto out;
+
+	ret = ima_store_template(entry, violation, NULL, buf, pcr);
+
+	if (ret < 0)
+		ima_free_template_entry(entry);
+
+out:
+	return;
+}
+
+/**
+ * ima_kexec_cmdline - measure kexec cmdline boot args
+ * @buf: pointer to buffer
+ * @size: size of buffer
+ *
+ * Buffers can only be measured, not appraised.
+ */
+void ima_kexec_cmdline(const void *buf, int size)
+{
+	u32 secid;
+
+	if (buf && size != 0) {
+		security_task_getsecid(current, &secid);
+		process_buffer_measurement(buf, size, "kexec-cmdline",
+					   current_cred(), secid);
+	}
+}
+
 static int __init init_ima(void)
 {
 	int error;
diff --git a/security/integrity/ima/ima_policy.c b/security/integrity/ima/ima_policy.c
index 98c289559079..a3058b03a955 100644
--- a/security/integrity/ima/ima_policy.c
+++ b/security/integrity/ima/ima_policy.c
@@ -374,6 +374,11 @@ static bool ima_match_rules(struct ima_rule_entry *rule, struct inode *inode,
 {
 	int i;
 
+	if (func == KEXEC_CMDLINE) {
+		if ((rule->flags & IMA_FUNC) && (rule->func == func))
+			return true;
+		return false;
+	}
 	if ((rule->flags & IMA_FUNC) &&
 	    (rule->func != func && func != POST_SETATTR))
 		return false;
@@ -956,6 +961,8 @@ static int ima_parse_rule(char *rule, struct ima_rule_entry *entry)
 				entry->func = KEXEC_INITRAMFS_CHECK;
 			else if (strcmp(args[0].from, "POLICY_CHECK") == 0)
 				entry->func = POLICY_CHECK;
+			else if (strcmp(args[0].from, "KEXEC_CMDLINE") == 0)
+				entry->func = KEXEC_CMDLINE;
 			else
 				result = -EINVAL;
 			if (!result)
-- 
cgit v1.2.3


From 38ca87c6f1e514686d4a385246d1afe1e1f2e482 Mon Sep 17 00:00:00 2001
From: Max Gurtovoy <maxg@mellanox.com>
Date: Tue, 11 Jun 2019 18:52:46 +0300
Subject: RDMA/mlx5: Introduce and implement new IB_WR_REG_MR_INTEGRITY work
 request

This new WR will be used to perform PI (protection information) handover
using the new API. Using the new API, the user will post a single WR that
will internally perform all the needed actions to complete PI operation.
This new WR will use a memory region that was allocated as
IB_MR_TYPE_INTEGRITY and was mapped using ib_map_mr_sg_pi to perform the
registration. In the old API, in order to perform a signature handover
operation, each ULP should perform the following:
1. Map and register the data buffers.
2. Map and register the protection buffers.
3. Post a special reg WR to configure the signature handover operation
   layout.
4. Invalidate the signature memory key.
5. Invalidate protection buffers memory key.
6. Invalidate data buffers memory key.

In the new API, the mapping of both data and protection buffers is
performed using a single call to ib_map_mr_sg_pi function. Also the
registration of the buffers and the configuration of the signature
operation layout is done by a single new work request called
IB_WR_REG_MR_INTEGRITY.
This patch implements this operation for mlx5 devices that are capable of
offloading data integrity generation/validation while performing the actual
buffer transfer.
This patch will not remove the old signature API that is used by the iSER
initiator and target drivers. This will be done in the future.

In the internal implementation, for each IB_WR_REG_MR_INTEGRITY work
request, we are using a single UMR operation to register both data and
protection buffers using KLM's.
Afterwards, another UMR operation will describe the strided block format.
These will be followed by 2 SET_PSV operations to set the memory/wire
domains initial signature parameters passed by the user.
At the end of the whole transaction, only the signature memory key
(the one that is exposed for the RDMA operation) will be invalidated.

Signed-off-by: Max Gurtovoy <maxg@mellanox.com>
Signed-off-by: Israel Rukshin <israelr@mellanox.com>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/hw/mlx5/qp.c | 218 ++++++++++++++++++++++++++++++++++++----
 include/linux/mlx5/qp.h         |   3 +-
 include/rdma/ib_verbs.h         |   1 +
 3 files changed, 201 insertions(+), 21 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c
index ce8fccb04c3c..f6651b93e469 100644
--- a/drivers/infiniband/hw/mlx5/qp.c
+++ b/drivers/infiniband/hw/mlx5/qp.c
@@ -4169,7 +4169,7 @@ static __be64 sig_mkey_mask(void)
 static void set_reg_umr_seg(struct mlx5_wqe_umr_ctrl_seg *umr,
 			    struct mlx5_ib_mr *mr, u8 flags)
 {
-	int size = mr->ndescs * mr->desc_size;
+	int size = (mr->ndescs + mr->meta_ndescs) * mr->desc_size;
 
 	memset(umr, 0, sizeof(*umr));
 
@@ -4300,7 +4300,7 @@ static void set_reg_mkey_seg(struct mlx5_mkey_seg *seg,
 			     struct mlx5_ib_mr *mr,
 			     u32 key, int access)
 {
-	int ndescs = ALIGN(mr->ndescs, 8) >> 1;
+	int ndescs = ALIGN(mr->ndescs + mr->meta_ndescs, 8) >> 1;
 
 	memset(seg, 0, sizeof(*seg));
 
@@ -4351,7 +4351,7 @@ static void set_reg_data_seg(struct mlx5_wqe_data_seg *dseg,
 			     struct mlx5_ib_mr *mr,
 			     struct mlx5_ib_pd *pd)
 {
-	int bcount = mr->desc_size * mr->ndescs;
+	int bcount = mr->desc_size * (mr->ndescs + mr->meta_ndescs);
 
 	dseg->addr = cpu_to_be64(mr->desc_map);
 	dseg->byte_count = cpu_to_be32(ALIGN(bcount, 64));
@@ -4544,23 +4544,52 @@ static int mlx5_set_bsf(struct ib_mr *sig_mr,
 	return 0;
 }
 
-static int set_sig_data_segment(const struct ib_sig_handover_wr *wr,
-				struct mlx5_ib_qp *qp, void **seg,
-				int *size, void **cur_edge)
+static int set_sig_data_segment(const struct ib_send_wr *send_wr,
+				struct ib_mr *sig_mr,
+				struct ib_sig_attrs *sig_attrs,
+				struct mlx5_ib_qp *qp, void **seg, int *size,
+				void **cur_edge)
 {
-	struct ib_sig_attrs *sig_attrs = wr->sig_attrs;
-	struct ib_mr *sig_mr = wr->sig_mr;
 	struct mlx5_bsf *bsf;
-	u32 data_len = wr->wr.sg_list->length;
-	u32 data_key = wr->wr.sg_list->lkey;
-	u64 data_va = wr->wr.sg_list->addr;
+	u32 data_len;
+	u32 data_key;
+	u64 data_va;
+	u32 prot_len = 0;
+	u32 prot_key = 0;
+	u64 prot_va = 0;
+	bool prot = false;
 	int ret;
 	int wqe_size;
 
-	if (!wr->prot ||
-	    (data_key == wr->prot->lkey &&
-	     data_va == wr->prot->addr &&
-	     data_len == wr->prot->length)) {
+	if (send_wr->opcode == IB_WR_REG_SIG_MR) {
+		const struct ib_sig_handover_wr *wr = sig_handover_wr(send_wr);
+
+		data_len = wr->wr.sg_list->length;
+		data_key = wr->wr.sg_list->lkey;
+		data_va = wr->wr.sg_list->addr;
+		if (wr->prot) {
+			prot_len = wr->prot->length;
+			prot_key = wr->prot->lkey;
+			prot_va = wr->prot->addr;
+			prot = true;
+		}
+	} else {
+		struct mlx5_ib_mr *mr = to_mmr(sig_mr);
+		struct mlx5_ib_mr *pi_mr = mr->pi_mr;
+
+		data_len = pi_mr->data_length;
+		data_key = pi_mr->ibmr.lkey;
+		data_va = pi_mr->ibmr.iova;
+		if (pi_mr->meta_ndescs) {
+			prot_len = pi_mr->meta_length;
+			prot_key = pi_mr->ibmr.lkey;
+			prot_va = pi_mr->ibmr.iova + data_len;
+			prot = true;
+		}
+	}
+
+	if (!prot || (data_key == prot_key && data_va == prot_va &&
+		      data_len == prot_len)) {
 		/**
 		 * Source domain doesn't contain signature information
 		 * or data and protection are interleaved in memory.
@@ -4594,8 +4623,6 @@ static int set_sig_data_segment(const struct ib_sig_handover_wr *wr,
 		struct mlx5_stride_block_ctrl_seg *sblock_ctrl;
 		struct mlx5_stride_block_entry *data_sentry;
 		struct mlx5_stride_block_entry *prot_sentry;
-		u32 prot_key = wr->prot->lkey;
-		u64 prot_va = wr->prot->addr;
 		u16 block_size = sig_attrs->mem.sig.dif.pi_interval;
 		int prot_size;
 
@@ -4673,6 +4700,56 @@ static void set_sig_umr_segment(struct mlx5_wqe_umr_ctrl_seg *umr,
 	umr->mkey_mask = sig_mkey_mask();
 }
 
+static int set_pi_umr_wr(const struct ib_send_wr *send_wr,
+			 struct mlx5_ib_qp *qp, void **seg, int *size,
+			 void **cur_edge)
+{
+	const struct ib_reg_wr *wr = reg_wr(send_wr);
+	struct mlx5_ib_mr *sig_mr = to_mmr(wr->mr);
+	struct mlx5_ib_mr *pi_mr = sig_mr->pi_mr;
+	struct ib_sig_attrs *sig_attrs = sig_mr->ibmr.sig_attrs;
+	u32 pdn = get_pd(qp)->pdn;
+	u32 xlt_size;
+	int region_len, ret;
+
+	if (unlikely(send_wr->num_sge != 0) ||
+	    unlikely(wr->access & IB_ACCESS_REMOTE_ATOMIC) ||
+	    unlikely(!sig_mr->sig) || unlikely(!qp->signature_en) ||
+	    unlikely(!sig_mr->sig->sig_status_checked))
+		return -EINVAL;
+
+	/* length of the protected region, data + protection */
+	region_len = pi_mr->ibmr.length;
+
+	/**
+	 * KLM octoword size - if protection was provided
+	 * then we use strided block format (3 octowords),
+	 * else we use single KLM (1 octoword)
+	 **/
+	if (sig_attrs->mem.sig_type != IB_SIG_TYPE_NONE)
+		xlt_size = 0x30;
+	else
+		xlt_size = sizeof(struct mlx5_klm);
+
+	set_sig_umr_segment(*seg, xlt_size);
+	*seg += sizeof(struct mlx5_wqe_umr_ctrl_seg);
+	*size += sizeof(struct mlx5_wqe_umr_ctrl_seg) / 16;
+	handle_post_send_edge(&qp->sq, seg, *size, cur_edge);
+
+	set_sig_mkey_segment(*seg, wr->mr, wr->access, xlt_size, region_len,
+			     pdn);
+	*seg += sizeof(struct mlx5_mkey_seg);
+	*size += sizeof(struct mlx5_mkey_seg) / 16;
+	handle_post_send_edge(&qp->sq, seg, *size, cur_edge);
+
+	ret = set_sig_data_segment(send_wr, wr->mr, sig_attrs, qp, seg, size,
+				   cur_edge);
+	if (ret)
+		return ret;
+
+	sig_mr->sig->sig_status_checked = false;
+	return 0;
+}
 
 static int set_sig_umr_wr(const struct ib_send_wr *send_wr,
 			  struct mlx5_ib_qp *qp, void **seg, int *size,
@@ -4716,7 +4793,8 @@ static int set_sig_umr_wr(const struct ib_send_wr *send_wr,
 	*size += sizeof(struct mlx5_mkey_seg) / 16;
 	handle_post_send_edge(&qp->sq, seg, *size, cur_edge);
 
-	ret = set_sig_data_segment(wr, qp, seg, size, cur_edge);
+	ret = set_sig_data_segment(send_wr, wr->sig_mr, wr->sig_attrs, qp, seg,
+				   size, cur_edge);
 	if (ret)
 		return ret;
 
@@ -4758,7 +4836,7 @@ static int set_reg_wr(struct mlx5_ib_qp *qp,
 {
 	struct mlx5_ib_mr *mr = to_mmr(wr->mr);
 	struct mlx5_ib_pd *pd = to_mpd(qp->ibqp.pd);
-	size_t mr_list_size = mr->ndescs * mr->desc_size;
+	int mr_list_size = (mr->ndescs + mr->meta_ndescs) * mr->desc_size;
 	bool umr_inline = mr_list_size <= MLX5_IB_SQ_UMR_INLINE_THRESHOLD;
 	u8 flags = 0;
 
@@ -4899,8 +4977,11 @@ static int _mlx5_ib_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr,
 	struct mlx5_wqe_ctrl_seg *ctrl = NULL;  /* compiler warning */
 	struct mlx5_ib_dev *dev = to_mdev(ibqp->device);
 	struct mlx5_core_dev *mdev = dev->mdev;
+	struct ib_reg_wr reg_pi_wr;
 	struct mlx5_ib_qp *qp;
 	struct mlx5_ib_mr *mr;
+	struct mlx5_ib_mr *pi_mr;
+	struct ib_sig_attrs *sig_attrs;
 	struct mlx5_wqe_xrc_seg *xrc;
 	struct mlx5_bf *bf;
 	void *cur_edge;
@@ -4954,7 +5035,8 @@ static int _mlx5_ib_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr,
 			goto out;
 		}
 
-		if (wr->opcode == IB_WR_REG_MR) {
+		if (wr->opcode == IB_WR_REG_MR ||
+		    wr->opcode == IB_WR_REG_MR_INTEGRITY) {
 			fence = dev->umr_fence;
 			next_fence = MLX5_FENCE_MODE_INITIATOR_SMALL;
 		} else  {
@@ -5012,6 +5094,102 @@ static int _mlx5_ib_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr,
 				num_sge = 0;
 				break;
 
+			case IB_WR_REG_MR_INTEGRITY:
+				memset(&reg_pi_wr, 0, sizeof(struct ib_reg_wr));
+
+				mr = to_mmr(reg_wr(wr)->mr);
+				pi_mr = mr->pi_mr;
+
+				reg_pi_wr.mr = &pi_mr->ibmr;
+				reg_pi_wr.access = reg_wr(wr)->access;
+				reg_pi_wr.key = pi_mr->ibmr.rkey;
+
+				qp->sq.wr_data[idx] = IB_WR_REG_MR_INTEGRITY;
+				ctrl->imm = cpu_to_be32(reg_pi_wr.key);
+				/* UMR for data + protection registration */
+				err = set_reg_wr(qp, &reg_pi_wr, &seg, &size,
+						 &cur_edge, false);
+				if (err) {
+					*bad_wr = wr;
+					goto out;
+				}
+				finish_wqe(qp, ctrl, seg, size, cur_edge, idx,
+					   wr->wr_id, nreq, fence,
+					   MLX5_OPCODE_UMR);
+
+				err = begin_wqe(qp, &seg, &ctrl, wr, &idx,
+						&size, &cur_edge, nreq);
+				if (err) {
+					mlx5_ib_warn(dev, "\n");
+					err = -ENOMEM;
+					*bad_wr = wr;
+					goto out;
+				}
+				ctrl->imm = cpu_to_be32(mr->ibmr.rkey);
+				/* UMR for sig MR */
+				err = set_pi_umr_wr(wr, qp, &seg, &size,
+						    &cur_edge);
+				if (err) {
+					mlx5_ib_warn(dev, "\n");
+					*bad_wr = wr;
+					goto out;
+				}
+				finish_wqe(qp, ctrl, seg, size, cur_edge, idx,
+					   wr->wr_id, nreq, fence,
+					   MLX5_OPCODE_UMR);
+
+				/*
+				 * SET_PSV WQEs are not signaled and solicited
+				 * on error
+				 */
+				sig_attrs = mr->ibmr.sig_attrs;
+				err = __begin_wqe(qp, &seg, &ctrl, wr, &idx,
+						  &size, &cur_edge, nreq, false,
+						  true);
+				if (err) {
+					mlx5_ib_warn(dev, "\n");
+					err = -ENOMEM;
+					*bad_wr = wr;
+					goto out;
+				}
+				err = set_psv_wr(&sig_attrs->mem,
+						 mr->sig->psv_memory.psv_idx,
+						 &seg, &size);
+				if (err) {
+					mlx5_ib_warn(dev, "\n");
+					*bad_wr = wr;
+					goto out;
+				}
+				finish_wqe(qp, ctrl, seg, size, cur_edge, idx,
+					   wr->wr_id, nreq, next_fence,
+					   MLX5_OPCODE_SET_PSV);
+
+				err = __begin_wqe(qp, &seg, &ctrl, wr, &idx,
+						  &size, &cur_edge, nreq, false,
+						  true);
+				if (err) {
+					mlx5_ib_warn(dev, "\n");
+					err = -ENOMEM;
+					*bad_wr = wr;
+					goto out;
+				}
+				err = set_psv_wr(&sig_attrs->wire,
+						 mr->sig->psv_wire.psv_idx,
+						 &seg, &size);
+				if (err) {
+					mlx5_ib_warn(dev, "\n");
+					*bad_wr = wr;
+					goto out;
+				}
+				finish_wqe(qp, ctrl, seg, size, cur_edge, idx,
+					   wr->wr_id, nreq, next_fence,
+					   MLX5_OPCODE_SET_PSV);
+
+				qp->next_fence =
+					MLX5_FENCE_MODE_INITIATOR_SMALL;
+				num_sge = 0;
+				goto skip_psv;
+
 			case IB_WR_REG_SIG_MR:
 				qp->sq.wr_data[idx] = IB_WR_REG_SIG_MR;
 				mr = to_mmr(sig_handover_wr(wr)->sig_mr);
diff --git a/include/linux/mlx5/qp.h b/include/linux/mlx5/qp.h
index 3ba4edbd17a6..08e43cd9e742 100644
--- a/include/linux/mlx5/qp.h
+++ b/include/linux/mlx5/qp.h
@@ -37,7 +37,8 @@
 #include <linux/mlx5/driver.h>
 
 #define MLX5_INVALID_LKEY	0x100
-#define MLX5_SIG_WQE_SIZE	(MLX5_SEND_WQE_BB * 5)
+/* UMR (3 WQE_BB's) + SIG (3 WQE_BB's) + PSV (mem) + PSV (wire) */
+#define MLX5_SIG_WQE_SIZE	(MLX5_SEND_WQE_BB * 8)
 #define MLX5_DIF_SIZE		8
 #define MLX5_STRIDE_BLOCK_OP	0x400
 #define MLX5_CPY_GRD_MASK	0xc0
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index 9169e798334f..28db256cbdb9 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -1236,6 +1236,7 @@ enum ib_wr_opcode {
 	/* These are kernel only and can not be issued by userspace */
 	IB_WR_REG_MR = 0x20,
 	IB_WR_REG_SIG_MR,
+	IB_WR_REG_MR_INTEGRITY,
 
 	/* reserve values for low level drivers' internal use.
 	 * These values will not be used at all in the ib core layer.
-- 
cgit v1.2.3


From fa49e1d37bbd6d25a11379891ece1e4d5d313036 Mon Sep 17 00:00:00 2001
From: Lubomir Rintel <lkundrak@v3.sk>
Date: Tue, 28 May 2019 05:07:26 -0400
Subject: media: marvell-ccic: drop unused stuff

Remove structure members and headers that are not actually used. Saves
us from some noise in subsequent cleanup commits.

Signed-off-by: Lubomir Rintel <lkundrak@v3.sk>
Acked-by: Pavel Machek <pavel@ucw.cz>
Signed-off-by: Sakari Ailus <sakari.ailus@linux.intel.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab+samsung@kernel.org>
---
 drivers/media/platform/marvell-ccic/mcam-core.c  | 1 -
 drivers/media/platform/marvell-ccic/mcam-core.h  | 2 --
 drivers/media/platform/marvell-ccic/mmp-driver.c | 2 --
 include/linux/platform_data/media/mmp-camera.h   | 1 -
 4 files changed, 6 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/media/platform/marvell-ccic/mcam-core.c b/drivers/media/platform/marvell-ccic/mcam-core.c
index 2494a31de01b..76641d5211ab 100644
--- a/drivers/media/platform/marvell-ccic/mcam-core.c
+++ b/drivers/media/platform/marvell-ccic/mcam-core.c
@@ -1776,7 +1776,6 @@ int mccic_register(struct mcam_camera *cam)
 	 */
 	sensor_cfg.clock_speed = cam->clock_speed;
 	sensor_cfg.use_smbus = cam->use_smbus;
-	cam->sensor_addr = ov7670_info.addr;
 	cam->sensor = v4l2_i2c_new_subdev_board(&cam->v4l2_dev,
 			cam->i2c_adapter, &ov7670_info, NULL);
 	if (cam->sensor == NULL) {
diff --git a/drivers/media/platform/marvell-ccic/mcam-core.h b/drivers/media/platform/marvell-ccic/mcam-core.h
index a3a097a45e78..b828b1bb59d3 100644
--- a/drivers/media/platform/marvell-ccic/mcam-core.h
+++ b/drivers/media/platform/marvell-ccic/mcam-core.h
@@ -112,7 +112,6 @@ struct mcam_camera {
 	short int use_smbus;	/* SMBUS or straight I2c? */
 	enum mcam_buffer_mode buffer_mode;
 
-	int mclk_min;	/* The minimal value of mclk */
 	int mclk_src;	/* which clock source the mclk derives from */
 	int mclk_div;	/* Clock Divider Value for MCLK */
 
@@ -152,7 +151,6 @@ struct mcam_camera {
 	 */
 	struct video_device vdev;
 	struct v4l2_subdev *sensor;
-	unsigned short sensor_addr;
 
 	/* Videobuf2 stuff */
 	struct vb2_queue vb_queue;
diff --git a/drivers/media/platform/marvell-ccic/mmp-driver.c b/drivers/media/platform/marvell-ccic/mmp-driver.c
index 9c4c7d37d0df..25a4e2b580f4 100644
--- a/drivers/media/platform/marvell-ccic/mmp-driver.c
+++ b/drivers/media/platform/marvell-ccic/mmp-driver.c
@@ -10,7 +10,6 @@
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/i2c.h>
-#include <linux/platform_data/i2c-gpio.h>
 #include <linux/interrupt.h>
 #include <linux/spinlock.h>
 #include <linux/slab.h>
@@ -330,7 +329,6 @@ static int mmpcam_probe(struct platform_device *pdev)
 	mcam->calc_dphy = mmpcam_calc_dphy;
 	mcam->dev = &pdev->dev;
 	mcam->use_smbus = 0;
-	mcam->mclk_min = pdata->mclk_min;
 	mcam->mclk_src = pdata->mclk_src;
 	mcam->mclk_div = pdata->mclk_div;
 	mcam->bus_type = pdata->bus_type;
diff --git a/include/linux/platform_data/media/mmp-camera.h b/include/linux/platform_data/media/mmp-camera.h
index d2d3a443eedf..4c3a80a45883 100644
--- a/include/linux/platform_data/media/mmp-camera.h
+++ b/include/linux/platform_data/media/mmp-camera.h
@@ -16,7 +16,6 @@ struct mmp_camera_platform_data {
 	int sensor_power_gpio;
 	int sensor_reset_gpio;
 	enum v4l2_mbus_type bus_type;
-	int mclk_min;	/* The minimal value of MCLK */
 	int mclk_src;	/* which clock source the MCLK derives from */
 	int mclk_div;	/* Clock Divider Value for MCLK */
 	/*
-- 
cgit v1.2.3


From 3eefe36cc00c5391b1ca2a68c5f01e9aa127c2a6 Mon Sep 17 00:00:00 2001
From: Lubomir Rintel <lkundrak@v3.sk>
Date: Tue, 28 May 2019 05:07:30 -0400
Subject: media: marvell-ccic: use async notifier to get the sensor

An instance of a sensor on a DT-based MMP2 platform is always going to be
created asynchronously.

Let's move the manual device creation away from the core to the Cafe
driver (used on OLPC XO-1, not present in DT) and set up appropriate
async matches: I2C on Cafe, FWNODE on MMP (OLPC XO-1.75).

Signed-off-by: Lubomir Rintel <lkundrak@v3.sk>
Signed-off-by: Sakari Ailus <sakari.ailus@linux.intel.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab+samsung@kernel.org>
---
 drivers/media/platform/marvell-ccic/cafe-driver.c |  49 +++++--
 drivers/media/platform/marvell-ccic/mcam-core.c   | 157 +++++++++++++++-------
 drivers/media/platform/marvell-ccic/mcam-core.h   |   5 +-
 drivers/media/platform/marvell-ccic/mmp-driver.c  |  27 ++--
 include/linux/platform_data/media/mmp-camera.h    |   1 -
 5 files changed, 162 insertions(+), 77 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/media/platform/marvell-ccic/cafe-driver.c b/drivers/media/platform/marvell-ccic/cafe-driver.c
index cd108b14b715..fe85368675cb 100644
--- a/drivers/media/platform/marvell-ccic/cafe-driver.c
+++ b/drivers/media/platform/marvell-ccic/cafe-driver.c
@@ -9,6 +9,7 @@
  *
  * Copyright 2006-11 One Laptop Per Child Association, Inc.
  * Copyright 2006-11 Jonathan Corbet <corbet@lwn.net>
+ * Copyright 2018 Lubomir Rintel <lkundrak@v3.sk>
  *
  * Written by Jonathan Corbet, corbet@lwn.net.
  *
@@ -25,6 +26,7 @@
 #include <linux/slab.h>
 #include <linux/videodev2.h>
 #include <media/v4l2-device.h>
+#include <media/i2c/ov7670.h>
 #include <linux/device.h>
 #include <linux/wait.h>
 #include <linux/delay.h>
@@ -50,6 +52,7 @@ struct cafe_camera {
 	int registered;			/* Fully initialized? */
 	struct mcam_camera mcam;
 	struct pci_dev *pdev;
+	struct i2c_adapter *i2c_adapter;
 	wait_queue_head_t smbus_wait;	/* Waiting on i2c events */
 };
 
@@ -349,15 +352,15 @@ static int cafe_smbus_setup(struct cafe_camera *cam)
 		return ret;
 	}
 
-	cam->mcam.i2c_adapter = adap;
+	cam->i2c_adapter = adap;
 	cafe_smbus_enable_irq(cam);
 	return 0;
 }
 
 static void cafe_smbus_shutdown(struct cafe_camera *cam)
 {
-	i2c_del_adapter(cam->mcam.i2c_adapter);
-	kfree(cam->mcam.i2c_adapter);
+	i2c_del_adapter(cam->i2c_adapter);
+	kfree(cam->i2c_adapter);
 }
 
 
@@ -450,6 +453,29 @@ static irqreturn_t cafe_irq(int irq, void *data)
 	return IRQ_RETVAL(handled);
 }
 
+/* -------------------------------------------------------------------------- */
+
+static struct ov7670_config sensor_cfg = {
+	/*
+	 * Exclude QCIF mode, because it only captures a tiny portion
+	 * of the sensor FOV
+	 */
+	.min_width = 320,
+	.min_height = 240,
+
+	/*
+	 * Set the clock speed for the XO 1; I don't believe this
+	 * driver has ever run anywhere else.
+	 */
+	.clock_speed = 45,
+	.use_smbus = 1,
+};
+
+struct i2c_board_info ov7670_info = {
+	.type = "ov7670",
+	.addr = 0x42 >> 1,
+	.platform_data = &sensor_cfg,
+};
 
 /* -------------------------------------------------------------------------- */
 /*
@@ -479,12 +505,6 @@ static int cafe_pci_probe(struct pci_dev *pdev,
 	mcam->plat_power_down = cafe_ctlr_power_down;
 	mcam->dev = &pdev->dev;
 	snprintf(mcam->bus_info, sizeof(mcam->bus_info), "PCI:%s", pci_name(pdev));
-	/*
-	 * Set the clock speed for the XO 1; I don't believe this
-	 * driver has ever run anywhere else.
-	 */
-	mcam->clock_speed = 45;
-	mcam->use_smbus = 1;
 	/*
 	 * Vmalloc mode for buffers is traditional with this driver.
 	 * We *might* be able to run DMA_contig, especially on a system
@@ -525,12 +545,21 @@ static int cafe_pci_probe(struct pci_dev *pdev,
 	if (ret)
 		goto out_pdown;
 
+	mcam->asd.match_type = V4L2_ASYNC_MATCH_I2C;
+	mcam->asd.match.i2c.adapter_id = i2c_adapter_id(cam->i2c_adapter);
+	mcam->asd.match.i2c.address = ov7670_info.addr;
+
 	ret = mccic_register(mcam);
-	if (ret == 0) {
+	if (ret)
+		goto out_smbus_shutdown;
+
+	if (i2c_new_device(cam->i2c_adapter, &ov7670_info)) {
 		cam->registered = 1;
 		return 0;
 	}
 
+	mccic_shutdown(mcam);
+out_smbus_shutdown:
 	cafe_smbus_shutdown(cam);
 out_pdown:
 	cafe_ctlr_power_down(mcam);
diff --git a/drivers/media/platform/marvell-ccic/mcam-core.c b/drivers/media/platform/marvell-ccic/mcam-core.c
index 76641d5211ab..7dc7d9d91782 100644
--- a/drivers/media/platform/marvell-ccic/mcam-core.c
+++ b/drivers/media/platform/marvell-ccic/mcam-core.c
@@ -4,6 +4,7 @@
  * so it needs platform-specific support outside of the core.
  *
  * Copyright 2011 Jonathan Corbet corbet@lwn.net
+ * Copyright 2018 Lubomir Rintel <lkundrak@v3.sk>
  */
 #include <linux/kernel.h>
 #include <linux/module.h>
@@ -26,7 +27,6 @@
 #include <media/v4l2-ioctl.h>
 #include <media/v4l2-ctrls.h>
 #include <media/v4l2-event.h>
-#include <media/i2c/ov7670.h>
 #include <media/videobuf2-vmalloc.h>
 #include <media/videobuf2-dma-contig.h>
 #include <media/videobuf2-dma-sg.h>
@@ -93,6 +93,9 @@ MODULE_PARM_DESC(buffer_mode,
 #define sensor_call(cam, o, f, args...) \
 	v4l2_subdev_call(cam->sensor, o, f, ##args)
 
+#define notifier_to_mcam(notifier) \
+	container_of(notifier, struct mcam_camera, notifier)
+
 static struct mcam_format_struct {
 	__u8 *desc;
 	__u32 pixelformat;
@@ -1715,23 +1718,94 @@ EXPORT_SYMBOL_GPL(mccic_irq);
 /*
  * Registration and such.
  */
-static struct ov7670_config sensor_cfg = {
+
+static int mccic_notify_bound(struct v4l2_async_notifier *notifier,
+	struct v4l2_subdev *subdev, struct v4l2_async_subdev *asd)
+{
+	struct mcam_camera *cam = notifier_to_mcam(notifier);
+	int ret;
+
+	mutex_lock(&cam->s_mutex);
+	if (cam->sensor) {
+		cam_err(cam, "sensor already bound\n");
+		ret = -EBUSY;
+		goto out;
+	}
+
+	v4l2_set_subdev_hostdata(subdev, cam);
+	cam->sensor = subdev;
+
+	ret = mcam_cam_init(cam);
+	if (ret) {
+		cam->sensor = NULL;
+		goto out;
+	}
+
+	ret = mcam_setup_vb2(cam);
+	if (ret) {
+		cam->sensor = NULL;
+		goto out;
+	}
+
+	cam->vdev = mcam_v4l_template;
+	cam->vdev.v4l2_dev = &cam->v4l2_dev;
+	cam->vdev.lock = &cam->s_mutex;
+	cam->vdev.queue = &cam->vb_queue;
+	video_set_drvdata(&cam->vdev, cam);
+	ret = video_register_device(&cam->vdev, VFL_TYPE_GRABBER, -1);
+	if (ret) {
+		cam->sensor = NULL;
+		goto out;
+	}
+
+	cam_dbg(cam, "sensor %s bound\n", subdev->name);
+out:
+	mutex_unlock(&cam->s_mutex);
+	return ret;
+}
+
+static void mccic_notify_unbind(struct v4l2_async_notifier *notifier,
+	struct v4l2_subdev *subdev, struct v4l2_async_subdev *asd)
+{
+	struct mcam_camera *cam = notifier_to_mcam(notifier);
+
+	mutex_lock(&cam->s_mutex);
+	if (cam->sensor != subdev) {
+		cam_err(cam, "sensor %s not bound\n", subdev->name);
+		goto out;
+	}
+
+	video_unregister_device(&cam->vdev);
+	cam->sensor = NULL;
+	cam_dbg(cam, "sensor %s unbound\n", subdev->name);
+
+out:
+	mutex_unlock(&cam->s_mutex);
+}
+
+static int mccic_notify_complete(struct v4l2_async_notifier *notifier)
+{
+	struct mcam_camera *cam = notifier_to_mcam(notifier);
+	int ret;
+
 	/*
-	 * Exclude QCIF mode, because it only captures a tiny portion
-	 * of the sensor FOV
+	 * Get the v4l2 setup done.
 	 */
-	.min_width = 320,
-	.min_height = 240,
-};
+	ret = v4l2_ctrl_handler_init(&cam->ctrl_handler, 10);
+	if (!ret)
+		cam->v4l2_dev.ctrl_handler = &cam->ctrl_handler;
+
+	return ret;
+}
 
+static const struct v4l2_async_notifier_operations mccic_notify_ops = {
+	.bound = mccic_notify_bound,
+	.unbind = mccic_notify_unbind,
+	.complete = mccic_notify_complete,
+};
 
 int mccic_register(struct mcam_camera *cam)
 {
-	struct i2c_board_info ov7670_info = {
-		.type = "ov7670",
-		.addr = 0x42 >> 1,
-		.platform_data = &sensor_cfg,
-	};
 	int ret;
 
 	/*
@@ -1744,17 +1818,20 @@ int mccic_register(struct mcam_camera *cam)
 		printk(KERN_ERR "marvell-cam: Cafe can't do S/G I/O, attempting vmalloc mode instead\n");
 		cam->buffer_mode = B_vmalloc;
 	}
+
 	if (!mcam_buffer_mode_supported(cam->buffer_mode)) {
 		printk(KERN_ERR "marvell-cam: buffer mode %d unsupported\n",
 				cam->buffer_mode);
-		return -EINVAL;
+		ret = -EINVAL;
+		goto out;
 	}
+
 	/*
 	 * Register with V4L
 	 */
 	ret = v4l2_device_register(cam->dev, &cam->v4l2_dev);
 	if (ret)
-		return ret;
+		goto out;
 
 	mutex_init(&cam->s_mutex);
 	cam->state = S_NOTREADY;
@@ -1764,43 +1841,20 @@ int mccic_register(struct mcam_camera *cam)
 	mcam_ctlr_init(cam);
 
 	/*
-	 * Get the v4l2 setup done.
+	 * Register sensor notifier.
 	 */
-	ret = v4l2_ctrl_handler_init(&cam->ctrl_handler, 10);
-	if (ret)
-		goto out_unregister;
-	cam->v4l2_dev.ctrl_handler = &cam->ctrl_handler;
-
-	/*
-	 * Try to find the sensor.
-	 */
-	sensor_cfg.clock_speed = cam->clock_speed;
-	sensor_cfg.use_smbus = cam->use_smbus;
-	cam->sensor = v4l2_i2c_new_subdev_board(&cam->v4l2_dev,
-			cam->i2c_adapter, &ov7670_info, NULL);
-	if (cam->sensor == NULL) {
-		ret = -ENODEV;
-		goto out_unregister;
+	v4l2_async_notifier_init(&cam->notifier);
+	ret = v4l2_async_notifier_add_subdev(&cam->notifier, &cam->asd);
+	if (ret) {
+		cam_warn(cam, "failed to add subdev to a notifier");
+		goto out;
 	}
 
-	ret = mcam_cam_init(cam);
-	if (ret)
-		goto out_unregister;
-
-	ret = mcam_setup_vb2(cam);
-	if (ret)
-		goto out_unregister;
-
-	mutex_lock(&cam->s_mutex);
-	cam->vdev = mcam_v4l_template;
-	cam->vdev.v4l2_dev = &cam->v4l2_dev;
-	cam->vdev.lock = &cam->s_mutex;
-	cam->vdev.queue = &cam->vb_queue;
-	video_set_drvdata(&cam->vdev, cam);
-	ret = video_register_device(&cam->vdev, VFL_TYPE_GRABBER, -1);
-	if (ret) {
-		mutex_unlock(&cam->s_mutex);
-		goto out_unregister;
+	cam->notifier.ops = &mccic_notify_ops;
+	ret = v4l2_async_notifier_register(&cam->v4l2_dev, &cam->notifier);
+	if (ret < 0) {
+		cam_warn(cam, "failed to register a sensor notifier");
+		goto out;
 	}
 
 	/*
@@ -1811,11 +1865,10 @@ int mccic_register(struct mcam_camera *cam)
 			cam_warn(cam, "Unable to alloc DMA buffers at load will try again later.");
 	}
 
-	mutex_unlock(&cam->s_mutex);
 	return 0;
 
-out_unregister:
-	v4l2_ctrl_handler_free(&cam->ctrl_handler);
+out:
+	v4l2_async_notifier_unregister(&cam->notifier);
 	v4l2_device_unregister(&cam->v4l2_dev);
 	return ret;
 }
@@ -1835,8 +1888,8 @@ void mccic_shutdown(struct mcam_camera *cam)
 	}
 	if (cam->buffer_mode == B_vmalloc)
 		mcam_free_dma_bufs(cam);
-	video_unregister_device(&cam->vdev);
 	v4l2_ctrl_handler_free(&cam->ctrl_handler);
+	v4l2_async_notifier_unregister(&cam->notifier);
 	v4l2_device_unregister(&cam->v4l2_dev);
 }
 EXPORT_SYMBOL_GPL(mccic_shutdown);
diff --git a/drivers/media/platform/marvell-ccic/mcam-core.h b/drivers/media/platform/marvell-ccic/mcam-core.h
index b828b1bb59d3..4a72213aca1a 100644
--- a/drivers/media/platform/marvell-ccic/mcam-core.h
+++ b/drivers/media/platform/marvell-ccic/mcam-core.h
@@ -102,14 +102,11 @@ struct mcam_camera {
 	 * These fields should be set by the platform code prior to
 	 * calling mcam_register().
 	 */
-	struct i2c_adapter *i2c_adapter;
 	unsigned char __iomem *regs;
 	unsigned regs_size; /* size in bytes of the register space */
 	spinlock_t dev_lock;
 	struct device *dev; /* For messages, dma alloc */
 	enum mcam_chip_id chip_id;
-	short int clock_speed;	/* Sensor clock speed, default 30 */
-	short int use_smbus;	/* SMBUS or straight I2c? */
 	enum mcam_buffer_mode buffer_mode;
 
 	int mclk_src;	/* which clock source the mclk derives from */
@@ -150,6 +147,8 @@ struct mcam_camera {
 	 * Subsystem structures.
 	 */
 	struct video_device vdev;
+	struct v4l2_async_notifier notifier;
+	struct v4l2_async_subdev asd;
 	struct v4l2_subdev *sensor;
 
 	/* Videobuf2 stuff */
diff --git a/drivers/media/platform/marvell-ccic/mmp-driver.c b/drivers/media/platform/marvell-ccic/mmp-driver.c
index 492663a8a29d..92061e4adbfd 100644
--- a/drivers/media/platform/marvell-ccic/mmp-driver.c
+++ b/drivers/media/platform/marvell-ccic/mmp-driver.c
@@ -4,12 +4,12 @@
  * to work with the Armada 610 as used in the OLPC 1.75 system.
  *
  * Copyright 2011 Jonathan Corbet <corbet@lwn.net>
+ * Copyright 2018 Lubomir Rintel <lkundrak@v3.sk>
  */
 
 #include <linux/init.h>
 #include <linux/kernel.h>
 #include <linux/module.h>
-#include <linux/i2c.h>
 #include <linux/interrupt.h>
 #include <linux/spinlock.h>
 #include <linux/slab.h>
@@ -314,6 +314,7 @@ static int mmpcam_probe(struct platform_device *pdev)
 	struct mmp_camera *cam;
 	struct mcam_camera *mcam;
 	struct resource *res;
+	struct fwnode_handle *ep;
 	struct mmp_camera_platform_data *pdata;
 	int ret;
 
@@ -328,7 +329,6 @@ static int mmpcam_probe(struct platform_device *pdev)
 	mcam->plat_power_down = mmpcam_power_down;
 	mcam->calc_dphy = mmpcam_calc_dphy;
 	mcam->dev = &pdev->dev;
-	mcam->use_smbus = 0;
 	pdata = pdev->dev.platform_data;
 	if (pdata) {
 		mcam->mclk_src = pdata->mclk_src;
@@ -372,15 +372,6 @@ static int mmpcam_probe(struct platform_device *pdev)
 	cam->power_regs = devm_ioremap_resource(&pdev->dev, res);
 	if (IS_ERR(cam->power_regs))
 		return PTR_ERR(cam->power_regs);
-	/*
-	 * Find the i2c adapter.  This assumes, of course, that the
-	 * i2c bus is already up and functioning.
-	 */
-	mcam->i2c_adapter = platform_get_drvdata(pdata->i2c_device);
-	if (mcam->i2c_adapter == NULL) {
-		dev_err(&pdev->dev, "No i2c adapter\n");
-		return -ENODEV;
-	}
 	/*
 	 * Sensor GPIO pins.
 	 */
@@ -403,6 +394,19 @@ static int mmpcam_probe(struct platform_device *pdev)
 
 	mcam_init_clk(mcam);
 
+	/*
+	 * Create a match of the sensor against its OF node.
+	 */
+	ep = fwnode_graph_get_next_endpoint(of_fwnode_handle(pdev->dev.of_node),
+					    NULL);
+	if (!ep)
+		return -ENODEV;
+
+	mcam->asd.match_type = V4L2_ASYNC_MATCH_FWNODE;
+	mcam->asd.match.fwnode = fwnode_graph_get_remote_port_parent(ep);
+
+	fwnode_handle_put(ep);
+
 	/*
 	 * Power the device up and hand it off to the core.
 	 */
@@ -412,6 +416,7 @@ static int mmpcam_probe(struct platform_device *pdev)
 	ret = mccic_register(mcam);
 	if (ret)
 		goto out_power_down;
+
 	/*
 	 * Finally, set up our IRQ now that the core is ready to
 	 * deal with it.
diff --git a/include/linux/platform_data/media/mmp-camera.h b/include/linux/platform_data/media/mmp-camera.h
index 4c3a80a45883..c573ebc40035 100644
--- a/include/linux/platform_data/media/mmp-camera.h
+++ b/include/linux/platform_data/media/mmp-camera.h
@@ -12,7 +12,6 @@ enum dphy3_algo {
 };
 
 struct mmp_camera_platform_data {
-	struct platform_device *i2c_device;
 	int sensor_power_gpio;
 	int sensor_reset_gpio;
 	enum v4l2_mbus_type bus_type;
-- 
cgit v1.2.3


From 81a409bfd5517d537097d3cfdfed7f8bf8ac469c Mon Sep 17 00:00:00 2001
From: Lubomir Rintel <lkundrak@v3.sk>
Date: Tue, 28 May 2019 05:07:31 -0400
Subject: media: marvell-ccic: provide a clock for the sensor

The sensor needs the MCLK clock running when it's being probed. On
platforms where the sensor is instantiated from a DT (MMP2) it is going
to happen asynchronously.

Therefore, the current modus operandi, where the bridge driver fiddles
with the sensor power and clock itself, is not going to fly. As the
comments wisely note, this doesn't even belong there.

Luckily, the ov7670 driver is already able to control its power and
reset lines, so we can just drop the MMP platform glue altogether.

It also requests the clock via the standard clock subsystem. Good -- let's
set up a clock instance so that the sensor can ask us to enable the clock.
Note that this is pretty dumb at the moment: the clock is hardwired to a
particular frequency and parent. It was always the case.

Signed-off-by: Lubomir Rintel <lkundrak@v3.sk>
Signed-off-by: Sakari Ailus <sakari.ailus@linux.intel.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab+samsung@kernel.org>
---
 drivers/media/platform/marvell-ccic/Kconfig       |   2 +
 drivers/media/platform/marvell-ccic/cafe-driver.c |   9 +-
 drivers/media/platform/marvell-ccic/mcam-core.c   | 172 ++++++++++++++++------
 drivers/media/platform/marvell-ccic/mcam-core.h   |   3 +
 drivers/media/platform/marvell-ccic/mmp-driver.c  | 152 ++-----------------
 include/linux/platform_data/media/mmp-camera.h    |   2 -
 6 files changed, 157 insertions(+), 183 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/media/platform/marvell-ccic/Kconfig b/drivers/media/platform/marvell-ccic/Kconfig
index 86b84474dd8c..3e3f86264762 100644
--- a/drivers/media/platform/marvell-ccic/Kconfig
+++ b/drivers/media/platform/marvell-ccic/Kconfig
@@ -2,6 +2,7 @@
 config VIDEO_CAFE_CCIC
 	tristate "Marvell 88ALP01 (Cafe) CMOS Camera Controller support"
 	depends on PCI && I2C && VIDEO_V4L2
+	depends on COMMON_CLK
 	select VIDEO_OV7670
 	select VIDEOBUF2_VMALLOC
 	select VIDEOBUF2_DMA_CONTIG
@@ -15,6 +16,7 @@ config VIDEO_MMP_CAMERA
 	tristate "Marvell Armada 610 integrated camera controller support"
 	depends on I2C && VIDEO_V4L2
 	depends on ARCH_MMP || COMPILE_TEST
+	depends on COMMON_CLK
 	select VIDEO_OV7670
 	select I2C_GPIO
 	select VIDEOBUF2_VMALLOC
diff --git a/drivers/media/platform/marvell-ccic/cafe-driver.c b/drivers/media/platform/marvell-ccic/cafe-driver.c
index fe85368675cb..16602628f895 100644
--- a/drivers/media/platform/marvell-ccic/cafe-driver.c
+++ b/drivers/media/platform/marvell-ccic/cafe-driver.c
@@ -31,6 +31,7 @@
 #include <linux/wait.h>
 #include <linux/delay.h>
 #include <linux/io.h>
+#include <linux/clkdev.h>
 
 #include "mcam-core.h"
 
@@ -531,11 +532,10 @@ static int cafe_pci_probe(struct pci_dev *pdev,
 		goto out_iounmap;
 
 	/*
-	 * Initialize the controller and leave it powered up.  It will
-	 * stay that way until the sensor driver shows up.
+	 * Initialize the controller.
 	 */
 	cafe_ctlr_init(mcam);
-	cafe_ctlr_power_up(mcam);
+
 	/*
 	 * Set up I2C/SMBUS communications.  We have to drop the mutex here
 	 * because the sensor could attach in this call chain, leading to
@@ -553,6 +553,9 @@ static int cafe_pci_probe(struct pci_dev *pdev,
 	if (ret)
 		goto out_smbus_shutdown;
 
+	clkdev_create(mcam->mclk, "xclk", "%d-%04x",
+		i2c_adapter_id(cam->i2c_adapter), ov7670_info.addr);
+
 	if (i2c_new_device(cam->i2c_adapter, &ov7670_info)) {
 		cam->registered = 1;
 		return 0;
diff --git a/drivers/media/platform/marvell-ccic/mcam-core.c b/drivers/media/platform/marvell-ccic/mcam-core.c
index 7dc7d9d91782..f9ac1547d093 100644
--- a/drivers/media/platform/marvell-ccic/mcam-core.c
+++ b/drivers/media/platform/marvell-ccic/mcam-core.c
@@ -22,6 +22,7 @@
 #include <linux/vmalloc.h>
 #include <linux/io.h>
 #include <linux/clk.h>
+#include <linux/clk-provider.h>
 #include <linux/videodev2.h>
 #include <media/v4l2-device.h>
 #include <media/v4l2-ioctl.h>
@@ -303,9 +304,6 @@ static void mcam_enable_mipi(struct mcam_camera *mcam)
 		 */
 		mcam_reg_write(mcam, REG_CSI2_CTRL0,
 			CSI2_C0_MIPI_EN | CSI2_C0_ACT_LANE(mcam->lane));
-		mcam_reg_write(mcam, REG_CLKCTRL,
-			(mcam->mclk_src << 29) | mcam->mclk_div);
-
 		mcam->mipi_enabled = true;
 	}
 }
@@ -830,31 +828,6 @@ static void mcam_ctlr_irq_disable(struct mcam_camera *cam)
 	mcam_reg_clear_bit(cam, REG_IRQMASK, FRAMEIRQS);
 }
 
-
-
-static void mcam_ctlr_init(struct mcam_camera *cam)
-{
-	unsigned long flags;
-
-	spin_lock_irqsave(&cam->dev_lock, flags);
-	/*
-	 * Make sure it's not powered down.
-	 */
-	mcam_reg_clear_bit(cam, REG_CTRL1, C1_PWRDWN);
-	/*
-	 * Turn off the enable bit.  It sure should be off anyway,
-	 * but it's good to be sure.
-	 */
-	mcam_reg_clear_bit(cam, REG_CTRL0, C0_ENABLE);
-	/*
-	 * Clock the sensor appropriately.  Controller clock should
-	 * be 48MHz, sensor "typical" value is half that.
-	 */
-	mcam_reg_write_mask(cam, REG_CLKCTRL, 2, CLK_DIV_MASK);
-	spin_unlock_irqrestore(&cam->dev_lock, flags);
-}
-
-
 /*
  * Stop the controller, and don't return until we're really sure that no
  * further DMA is going on.
@@ -898,14 +871,15 @@ static int mcam_ctlr_power_up(struct mcam_camera *cam)
 	int ret;
 
 	spin_lock_irqsave(&cam->dev_lock, flags);
-	ret = cam->plat_power_up(cam);
-	if (ret) {
-		spin_unlock_irqrestore(&cam->dev_lock, flags);
-		return ret;
+	if (cam->plat_power_up) {
+		ret = cam->plat_power_up(cam);
+		if (ret) {
+			spin_unlock_irqrestore(&cam->dev_lock, flags);
+			return ret;
+		}
 	}
 	mcam_reg_clear_bit(cam, REG_CTRL1, C1_PWRDWN);
 	spin_unlock_irqrestore(&cam->dev_lock, flags);
-	msleep(5); /* Just to be sure */
 	return 0;
 }
 
@@ -920,10 +894,101 @@ static void mcam_ctlr_power_down(struct mcam_camera *cam)
 	 * power down routine.
 	 */
 	mcam_reg_set_bit(cam, REG_CTRL1, C1_PWRDWN);
-	cam->plat_power_down(cam);
+	if (cam->plat_power_down)
+		cam->plat_power_down(cam);
 	spin_unlock_irqrestore(&cam->dev_lock, flags);
 }
 
+/* ---------------------------------------------------------------------- */
+/*
+ * Controller clocks.
+ */
+static void mcam_clk_enable(struct mcam_camera *mcam)
+{
+	unsigned int i;
+
+	for (i = 0; i < NR_MCAM_CLK; i++) {
+		if (!IS_ERR(mcam->clk[i]))
+			clk_prepare_enable(mcam->clk[i]);
+	}
+}
+
+static void mcam_clk_disable(struct mcam_camera *mcam)
+{
+	int i;
+
+	for (i = NR_MCAM_CLK - 1; i >= 0; i--) {
+		if (!IS_ERR(mcam->clk[i]))
+			clk_disable_unprepare(mcam->clk[i]);
+	}
+}
+
+/* ---------------------------------------------------------------------- */
+/*
+ * Master sensor clock.
+ */
+static int mclk_prepare(struct clk_hw *hw)
+{
+	struct mcam_camera *cam = container_of(hw, struct mcam_camera, mclk_hw);
+
+	/* Prepare the first controller clock; report failure to the caller. */
+	return clk_prepare(cam->clk[0]);
+}
+
+static void mclk_unprepare(struct clk_hw *hw)
+{
+	struct mcam_camera *cam = container_of(hw, struct mcam_camera, mclk_hw);
+
+	clk_unprepare(cam->clk[0]);
+}
+
+static int mclk_enable(struct clk_hw *hw)
+{
+	struct mcam_camera *cam = container_of(hw, struct mcam_camera, mclk_hw);
+	int mclk_src = 3, mclk_div = 2, ret;
+
+	/*
+	 * Clock the sensor appropriately.  Controller clock should
+	 * be 48MHz, sensor "typical" value is half that.
+	 */
+	if (cam->bus_type == V4L2_MBUS_CSI2_DPHY) {
+		mclk_src = cam->mclk_src;
+		mclk_div = cam->mclk_div;
+	}
+
+	ret = clk_enable(cam->clk[0]);
+	if (ret)
+		return ret;
+	mcam_reg_write(cam, REG_CLKCTRL, (mclk_src << 29) | mclk_div);
+	ret = mcam_ctlr_power_up(cam);
+	if (ret)
+		clk_disable(cam->clk[0]);	/* undo the enable on failure */
+
+	return ret;
+}
+
+static void mclk_disable(struct clk_hw *hw)
+{
+	struct mcam_camera *cam = container_of(hw, struct mcam_camera, mclk_hw);
+
+	mcam_ctlr_power_down(cam);
+	clk_disable(cam->clk[0]);
+}
+
+static unsigned long mclk_recalc_rate(struct clk_hw *hw,
+				unsigned long parent_rate)
+{
+	return 48000000;
+}
+
+static const struct clk_ops mclk_ops = {
+	.prepare = mclk_prepare,
+	.unprepare = mclk_unprepare,
+	.enable = mclk_enable,
+	.disable = mclk_disable,
+	.recalc_rate = mclk_recalc_rate,
+};
+
 /* -------------------------------------------------------------------- */
 /*
  * Communications with the sensor.
@@ -948,7 +1013,6 @@ static int mcam_cam_init(struct mcam_camera *cam)
 	ret = __mcam_cam_reset(cam);
 	/* Get/set parameters? */
 	cam->state = S_IDLE;
-	mcam_ctlr_power_down(cam);
 	return ret;
 }
 
@@ -1584,9 +1648,10 @@ static int mcam_v4l_open(struct file *filp)
 	if (ret)
 		goto out;
 	if (v4l2_fh_is_singular_file(filp)) {
-		ret = mcam_ctlr_power_up(cam);
+		ret = sensor_call(cam, core, s_power, 1);
 		if (ret)
 			goto out;
+		mcam_clk_enable(cam);
 		__mcam_cam_reset(cam);
 		mcam_set_config_needed(cam, 1);
 	}
@@ -1608,7 +1673,8 @@ static int mcam_v4l_release(struct file *filp)
 	_vb2_fop_release(filp, NULL);
 	if (last_open) {
 		mcam_disable_mipi(cam);
-		mcam_ctlr_power_down(cam);
+		sensor_call(cam, core, s_power, 0);
+		mcam_clk_disable(cam);
 		if (cam->buffer_mode == B_vmalloc && alloc_bufs_at_read)
 			mcam_free_dma_bufs(cam);
 	}
@@ -1806,6 +1872,7 @@ static const struct v4l2_async_notifier_operations mccic_notify_ops = {
 
 int mccic_register(struct mcam_camera *cam)
 {
+	struct clk_init_data mclk_init = { };
 	int ret;
 
 	/*
@@ -1838,7 +1905,6 @@ int mccic_register(struct mcam_camera *cam)
 	mcam_set_config_needed(cam, 1);
 	cam->pix_format = mcam_def_pix_format;
 	cam->mbus_code = mcam_def_mbus_code;
-	mcam_ctlr_init(cam);
 
 	/*
 	 * Register sensor notifier.
@@ -1857,6 +1923,26 @@ int mccic_register(struct mcam_camera *cam)
 		goto out;
 	}
 
+	/*
+	 * Register sensor master clock.
+	 */
+	mclk_init.parent_names = NULL;
+	mclk_init.num_parents = 0;
+	mclk_init.ops = &mclk_ops;
+	mclk_init.name = "mclk";
+
+	of_property_read_string(cam->dev->of_node, "clock-output-names",
+							&mclk_init.name);
+
+	cam->mclk_hw.init = &mclk_init;
+
+	cam->mclk = devm_clk_register(cam->dev, &cam->mclk_hw);
+	if (IS_ERR(cam->mclk)) {
+		ret = PTR_ERR(cam->mclk);
+		dev_err(cam->dev, "can't register clock\n");
+		goto out;
+	}
+
 	/*
 	 * If so requested, try to get our DMA buffers now.
 	 */
@@ -1884,7 +1970,7 @@ void mccic_shutdown(struct mcam_camera *cam)
 	 */
 	if (!list_empty(&cam->vdev.fh_list)) {
 		cam_warn(cam, "Removing a device with users!\n");
-		mcam_ctlr_power_down(cam);
+		sensor_call(cam, core, s_power, 0);
 	}
 	if (cam->buffer_mode == B_vmalloc)
 		mcam_free_dma_bufs(cam);
@@ -1906,7 +1992,8 @@ void mccic_suspend(struct mcam_camera *cam)
 		enum mcam_state cstate = cam->state;
 
 		mcam_ctlr_stop_dma(cam);
-		mcam_ctlr_power_down(cam);
+		sensor_call(cam, core, s_power, 0);
+		mcam_clk_disable(cam);
 		cam->state = cstate;
 	}
 	mutex_unlock(&cam->s_mutex);
@@ -1919,14 +2006,15 @@ int mccic_resume(struct mcam_camera *cam)
 
 	mutex_lock(&cam->s_mutex);
 	if (!list_empty(&cam->vdev.fh_list)) {
-		ret = mcam_ctlr_power_up(cam);
+		mcam_clk_enable(cam);
+		ret = sensor_call(cam, core, s_power, 1);
 		if (ret) {
 			mutex_unlock(&cam->s_mutex);
 			return ret;
 		}
 		__mcam_cam_reset(cam);
 	} else {
-		mcam_ctlr_power_down(cam);
+		sensor_call(cam, core, s_power, 0);
 	}
 	mutex_unlock(&cam->s_mutex);
 
diff --git a/drivers/media/platform/marvell-ccic/mcam-core.h b/drivers/media/platform/marvell-ccic/mcam-core.h
index 4a72213aca1a..2e3a7567a76a 100644
--- a/drivers/media/platform/marvell-ccic/mcam-core.h
+++ b/drivers/media/platform/marvell-ccic/mcam-core.h
@@ -8,6 +8,7 @@
 #define _MCAM_CORE_H
 
 #include <linux/list.h>
+#include <linux/clk-provider.h>
 #include <media/v4l2-common.h>
 #include <media/v4l2-ctrls.h>
 #include <media/v4l2-dev.h>
@@ -125,6 +126,8 @@ struct mcam_camera {
 
 	/* clock tree support */
 	struct clk *clk[NR_MCAM_CLK];
+	struct clk_hw mclk_hw;
+	struct clk *mclk;
 
 	/*
 	 * Callbacks from the core to the platform code.
diff --git a/drivers/media/platform/marvell-ccic/mmp-driver.c b/drivers/media/platform/marvell-ccic/mmp-driver.c
index 92061e4adbfd..450693e6657d 100644
--- a/drivers/media/platform/marvell-ccic/mmp-driver.c
+++ b/drivers/media/platform/marvell-ccic/mmp-driver.c
@@ -20,9 +20,7 @@
 #include <linux/of.h>
 #include <linux/of_platform.h>
 #include <linux/platform_device.h>
-#include <linux/gpio.h>
 #include <linux/io.h>
-#include <linux/delay.h>
 #include <linux/list.h>
 #include <linux/pm.h>
 #include <linux/clk.h>
@@ -36,7 +34,6 @@ MODULE_LICENSE("GPL");
 static char *mcam_clks[] = {"axi", "func", "phy"};
 
 struct mmp_camera {
-	void __iomem *power_regs;
 	struct platform_device *pdev;
 	struct mcam_camera mcam;
 	struct list_head devlist;
@@ -92,94 +89,6 @@ static struct mmp_camera *mmpcam_find_device(struct platform_device *pdev)
 	return NULL;
 }
 
-
-
-
-/*
- * Power-related registers; this almost certainly belongs
- * somewhere else.
- *
- * ARMADA 610 register manual, sec 7.2.1, p1842.
- */
-#define CPU_SUBSYS_PMU_BASE	0xd4282800
-#define REG_CCIC_DCGCR		0x28	/* CCIC dyn clock gate ctrl reg */
-#define REG_CCIC_CRCR		0x50	/* CCIC clk reset ctrl reg	*/
-
-static void mcam_clk_enable(struct mcam_camera *mcam)
-{
-	unsigned int i;
-
-	for (i = 0; i < NR_MCAM_CLK; i++) {
-		if (!IS_ERR(mcam->clk[i]))
-			clk_prepare_enable(mcam->clk[i]);
-	}
-}
-
-static void mcam_clk_disable(struct mcam_camera *mcam)
-{
-	int i;
-
-	for (i = NR_MCAM_CLK - 1; i >= 0; i--) {
-		if (!IS_ERR(mcam->clk[i]))
-			clk_disable_unprepare(mcam->clk[i]);
-	}
-}
-
-/*
- * Power control.
- */
-static void mmpcam_power_up_ctlr(struct mmp_camera *cam)
-{
-	iowrite32(0x3f, cam->power_regs + REG_CCIC_DCGCR);
-	iowrite32(0x3805b, cam->power_regs + REG_CCIC_CRCR);
-	mdelay(1);
-}
-
-static int mmpcam_power_up(struct mcam_camera *mcam)
-{
-	struct mmp_camera *cam = mcam_to_cam(mcam);
-	struct mmp_camera_platform_data *pdata;
-
-/*
- * Turn on power and clocks to the controller.
- */
-	mmpcam_power_up_ctlr(cam);
-	mcam_clk_enable(mcam);
-/*
- * Provide power to the sensor.
- */
-	mcam_reg_write(mcam, REG_CLKCTRL, 0x60000002);
-	pdata = cam->pdev->dev.platform_data;
-	gpio_set_value(pdata->sensor_power_gpio, 1);
-	mdelay(5);
-	mcam_reg_clear_bit(mcam, REG_CTRL1, 0x10000000);
-	gpio_set_value(pdata->sensor_reset_gpio, 0); /* reset is active low */
-	mdelay(5);
-	gpio_set_value(pdata->sensor_reset_gpio, 1); /* reset is active low */
-	mdelay(5);
-
-	return 0;
-}
-
-static void mmpcam_power_down(struct mcam_camera *mcam)
-{
-	struct mmp_camera *cam = mcam_to_cam(mcam);
-	struct mmp_camera_platform_data *pdata;
-/*
- * Turn off clocks and set reset lines
- */
-	iowrite32(0, cam->power_regs + REG_CCIC_DCGCR);
-	iowrite32(0, cam->power_regs + REG_CCIC_CRCR);
-/*
- * Shut down the sensor.
- */
-	pdata = cam->pdev->dev.platform_data;
-	gpio_set_value(pdata->sensor_power_gpio, 0);
-	gpio_set_value(pdata->sensor_reset_gpio, 0);
-
-	mcam_clk_disable(mcam);
-}
-
 /*
  * calc the dphy register values
  * There are three dphy registers being used.
@@ -325,8 +234,6 @@ static int mmpcam_probe(struct platform_device *pdev)
 	INIT_LIST_HEAD(&cam->devlist);
 
 	mcam = &cam->mcam;
-	mcam->plat_power_up = mmpcam_power_up;
-	mcam->plat_power_down = mmpcam_power_down;
 	mcam->calc_dphy = mmpcam_calc_dphy;
 	mcam->dev = &pdev->dev;
 	pdata = pdev->dev.platform_data;
@@ -364,33 +271,6 @@ static int mmpcam_probe(struct platform_device *pdev)
 	if (IS_ERR(mcam->regs))
 		return PTR_ERR(mcam->regs);
 	mcam->regs_size = resource_size(res);
-	/*
-	 * Power/clock memory is elsewhere; get it too.  Perhaps this
-	 * should really be managed outside of this driver?
-	 */
-	res = platform_get_resource(pdev, IORESOURCE_MEM, 1);
-	cam->power_regs = devm_ioremap_resource(&pdev->dev, res);
-	if (IS_ERR(cam->power_regs))
-		return PTR_ERR(cam->power_regs);
-	/*
-	 * Sensor GPIO pins.
-	 */
-	ret = devm_gpio_request(&pdev->dev, pdata->sensor_power_gpio,
-							"cam-power");
-	if (ret) {
-		dev_err(&pdev->dev, "Can't get sensor power gpio %d",
-				pdata->sensor_power_gpio);
-		return ret;
-	}
-	gpio_direction_output(pdata->sensor_power_gpio, 0);
-	ret = devm_gpio_request(&pdev->dev, pdata->sensor_reset_gpio,
-							"cam-reset");
-	if (ret) {
-		dev_err(&pdev->dev, "Can't get sensor reset gpio %d",
-				pdata->sensor_reset_gpio);
-		return ret;
-	}
-	gpio_direction_output(pdata->sensor_reset_gpio, 0);
 
 	mcam_init_clk(mcam);
 
@@ -408,14 +288,21 @@ static int mmpcam_probe(struct platform_device *pdev)
 	fwnode_handle_put(ep);
 
 	/*
-	 * Power the device up and hand it off to the core.
+	 * Register the device with the core.
 	 */
-	ret = mmpcam_power_up(mcam);
-	if (ret)
-		return ret;
 	ret = mccic_register(mcam);
 	if (ret)
-		goto out_power_down;
+		return ret;
+
+	/*
+	 * Add OF clock provider.
+	 */
+	ret = of_clk_add_provider(pdev->dev.of_node, of_clk_src_simple_get,
+								mcam->mclk);
+	if (ret) {
+		dev_err(&pdev->dev, "can't add DT clock provider\n");
+		goto out;
+	}
 
 	/*
 	 * Finally, set up our IRQ now that the core is ready to
@@ -424,7 +311,7 @@ static int mmpcam_probe(struct platform_device *pdev)
 	res = platform_get_resource(pdev, IORESOURCE_IRQ, 0);
 	if (res == NULL) {
 		ret = -ENODEV;
-		goto out_unregister;
+		goto out;
 	}
 	cam->irq = res->start;
 	ret = devm_request_irq(&pdev->dev, cam->irq, mmpcam_irq, IRQF_SHARED,
@@ -434,10 +321,10 @@ static int mmpcam_probe(struct platform_device *pdev)
 		return 0;
 	}
 
-out_unregister:
+out:
+	fwnode_handle_put(mcam->asd.match.fwnode);
 	mccic_shutdown(mcam);
-out_power_down:
-	mmpcam_power_down(mcam);
+
 	return ret;
 }
 
@@ -448,7 +335,6 @@ static int mmpcam_remove(struct mmp_camera *cam)
 
 	mmpcam_remove_device(cam);
 	mccic_shutdown(mcam);
-	mmpcam_power_down(mcam);
 	return 0;
 }
 
@@ -480,12 +366,6 @@ static int mmpcam_resume(struct platform_device *pdev)
 {
 	struct mmp_camera *cam = mmpcam_find_device(pdev);
 
-	/*
-	 * Power up unconditionally just in case the core tries to
-	 * touch a register even if nothing was active before; trust
-	 * me, it's better this way.
-	 */
-	mmpcam_power_up_ctlr(cam);
 	return mccic_resume(&cam->mcam);
 }
 
diff --git a/include/linux/platform_data/media/mmp-camera.h b/include/linux/platform_data/media/mmp-camera.h
index c573ebc40035..53adaab64f28 100644
--- a/include/linux/platform_data/media/mmp-camera.h
+++ b/include/linux/platform_data/media/mmp-camera.h
@@ -12,8 +12,6 @@ enum dphy3_algo {
 };
 
 struct mmp_camera_platform_data {
-	int sensor_power_gpio;
-	int sensor_reset_gpio;
 	enum v4l2_mbus_type bus_type;
 	int mclk_src;	/* which clock source the MCLK derives from */
 	int mclk_div;	/* Clock Divider Value for MCLK */
-- 
cgit v1.2.3


From 65d80db2ee92330269e90313c6af782036f4d23d Mon Sep 17 00:00:00 2001
From: Krzysztof Kozlowski <krzk@kernel.org>
Date: Thu, 20 Jun 2019 20:35:27 +0200
Subject: regulator: s2mps11: Add support for disabling S2MPS11 regulators in
 suspend

The driver supported turning off regulators in suspend only for the S2MPS14
device.  However, this also makes sense for the S2MPS11 and can reduce
power consumption during suspend to RAM.

Signed-off-by: Krzysztof Kozlowski <krzk@kernel.org>
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 drivers/regulator/s2mps11.c         | 210 ++++++++++++++++++++----------------
 include/linux/mfd/samsung/s2mps11.h |   5 +
 2 files changed, 120 insertions(+), 95 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/regulator/s2mps11.c b/drivers/regulator/s2mps11.c
index 93570712eb56..9c06ecd80a90 100644
--- a/drivers/regulator/s2mps11.c
+++ b/drivers/regulator/s2mps11.c
@@ -34,7 +34,7 @@ struct s2mps11_info {
 	enum sec_device_type dev_type;
 
 	/*
-	 * One bit for each S2MPS13/S2MPS14/S2MPU02 regulator whether
+	 * One bit for each S2MPS11/S2MPS13/S2MPS14/S2MPU02 regulator whether
 	 * the suspend mode was enabled.
 	 */
 	DECLARE_BITMAP(suspend_state, S2MPS_REGULATOR_MAX);
@@ -225,27 +225,133 @@ ramp_disable:
 				  1 << enable_shift, 0);
 }
 
+static int s2mps11_regulator_enable(struct regulator_dev *rdev)
+{
+	struct s2mps11_info *s2mps11 = rdev_get_drvdata(rdev);
+	int rdev_id = rdev_get_id(rdev);
+	unsigned int val;
+
+	switch (s2mps11->dev_type) {
+	case S2MPS11X:
+		if (test_bit(rdev_id, s2mps11->suspend_state))
+			val = S2MPS14_ENABLE_SUSPEND;
+		else
+			val = rdev->desc->enable_mask;
+		break;
+	case S2MPS13X:
+	case S2MPS14X:
+		if (test_bit(rdev_id, s2mps11->suspend_state))
+			val = S2MPS14_ENABLE_SUSPEND;
+		else if (s2mps11->ext_control_gpiod[rdev_id])
+			val = S2MPS14_ENABLE_EXT_CONTROL;
+		else
+			val = rdev->desc->enable_mask;
+		break;
+	case S2MPU02:
+		if (test_bit(rdev_id, s2mps11->suspend_state))
+			val = S2MPU02_ENABLE_SUSPEND;
+		else
+			val = rdev->desc->enable_mask;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	return regmap_update_bits(rdev->regmap, rdev->desc->enable_reg,
+			rdev->desc->enable_mask, val);
+}
+
+static int s2mps11_regulator_set_suspend_disable(struct regulator_dev *rdev)
+{
+	int ret;
+	unsigned int val, state;
+	struct s2mps11_info *s2mps11 = rdev_get_drvdata(rdev);
+	int rdev_id = rdev_get_id(rdev);
+
+	/* Below LDO should be always on or does not support suspend mode. */
+	switch (s2mps11->dev_type) {
+	case S2MPS11X:
+		switch (rdev_id) {
+		case S2MPS11_LDO2:
+		case S2MPS11_LDO36:
+		case S2MPS11_LDO37:
+		case S2MPS11_LDO38:
+			return 0;
+		default:
+			state = S2MPS14_ENABLE_SUSPEND;
+			break;
+		}
+		break;
+	case S2MPS13X:
+	case S2MPS14X:
+		switch (rdev_id) {
+		case S2MPS14_LDO3:
+			return 0;
+		default:
+			state = S2MPS14_ENABLE_SUSPEND;
+			break;
+		}
+		break;
+	case S2MPU02:
+		switch (rdev_id) {
+		case S2MPU02_LDO13:
+		case S2MPU02_LDO14:
+		case S2MPU02_LDO15:
+		case S2MPU02_LDO17:
+		case S2MPU02_BUCK7:
+			state = S2MPU02_DISABLE_SUSPEND;
+			break;
+		default:
+			state = S2MPU02_ENABLE_SUSPEND;
+			break;
+		}
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	ret = regmap_read(rdev->regmap, rdev->desc->enable_reg, &val);
+	if (ret < 0)
+		return ret;
+
+	set_bit(rdev_id, s2mps11->suspend_state);
+	/*
+	 * Don't enable suspend mode if the regulator is already disabled,
+	 * because this would effectively turn the regulator on for a short
+	 * time after resuming.
+	 * However, we still want to set the suspend_state bit for the
+	 * regulator in case it gets enabled before suspending the system.
+	 */
+	if (!(val & rdev->desc->enable_mask))
+		return 0;
+
+	return regmap_update_bits(rdev->regmap, rdev->desc->enable_reg,
+				  rdev->desc->enable_mask, state);
+}
+
 static const struct regulator_ops s2mps11_ldo_ops = {
 	.list_voltage		= regulator_list_voltage_linear,
 	.map_voltage		= regulator_map_voltage_linear,
 	.is_enabled		= regulator_is_enabled_regmap,
-	.enable			= regulator_enable_regmap,
+	.enable			= s2mps11_regulator_enable,
 	.disable		= regulator_disable_regmap,
 	.get_voltage_sel	= regulator_get_voltage_sel_regmap,
 	.set_voltage_sel	= regulator_set_voltage_sel_regmap,
 	.set_voltage_time_sel	= regulator_set_voltage_time_sel,
+	.set_suspend_disable	= s2mps11_regulator_set_suspend_disable,
 };
 
 static const struct regulator_ops s2mps11_buck_ops = {
 	.list_voltage		= regulator_list_voltage_linear,
 	.map_voltage		= regulator_map_voltage_linear,
 	.is_enabled		= regulator_is_enabled_regmap,
-	.enable			= regulator_enable_regmap,
+	.enable			= s2mps11_regulator_enable,
 	.disable		= regulator_disable_regmap,
 	.get_voltage_sel	= regulator_get_voltage_sel_regmap,
 	.set_voltage_sel	= regulator_set_voltage_sel_regmap,
 	.set_voltage_time_sel	= s2mps11_regulator_set_voltage_time_sel,
 	.set_ramp_delay		= s2mps11_set_ramp_delay,
+	.set_suspend_disable	= s2mps11_regulator_set_suspend_disable,
 };
 
 #define regulator_desc_s2mps11_ldo(num, step) {		\
@@ -501,102 +607,16 @@ static const struct regulator_desc s2mps13_regulators[] = {
 	regulator_desc_s2mps13_buck8_10(10, MIN_500_MV,  STEP_6_25_MV, 0x10),
 };
 
-static int s2mps14_regulator_enable(struct regulator_dev *rdev)
-{
-	struct s2mps11_info *s2mps11 = rdev_get_drvdata(rdev);
-	int rdev_id = rdev_get_id(rdev);
-	unsigned int val;
-
-	switch (s2mps11->dev_type) {
-	case S2MPS13X:
-	case S2MPS14X:
-		if (test_bit(rdev_id, s2mps11->suspend_state))
-			val = S2MPS14_ENABLE_SUSPEND;
-		else if (s2mps11->ext_control_gpiod[rdev_id])
-			val = S2MPS14_ENABLE_EXT_CONTROL;
-		else
-			val = rdev->desc->enable_mask;
-		break;
-	case S2MPU02:
-		if (test_bit(rdev_id, s2mps11->suspend_state))
-			val = S2MPU02_ENABLE_SUSPEND;
-		else
-			val = rdev->desc->enable_mask;
-		break;
-	default:
-		return -EINVAL;
-	}
-
-	return regmap_update_bits(rdev->regmap, rdev->desc->enable_reg,
-			rdev->desc->enable_mask, val);
-}
-
-static int s2mps14_regulator_set_suspend_disable(struct regulator_dev *rdev)
-{
-	int ret;
-	unsigned int val, state;
-	struct s2mps11_info *s2mps11 = rdev_get_drvdata(rdev);
-	int rdev_id = rdev_get_id(rdev);
-
-	/* Below LDO should be always on or does not support suspend mode. */
-	switch (s2mps11->dev_type) {
-	case S2MPS13X:
-	case S2MPS14X:
-		switch (rdev_id) {
-		case S2MPS14_LDO3:
-			return 0;
-		default:
-			state = S2MPS14_ENABLE_SUSPEND;
-			break;
-		}
-		break;
-	case S2MPU02:
-		switch (rdev_id) {
-		case S2MPU02_LDO13:
-		case S2MPU02_LDO14:
-		case S2MPU02_LDO15:
-		case S2MPU02_LDO17:
-		case S2MPU02_BUCK7:
-			state = S2MPU02_DISABLE_SUSPEND;
-			break;
-		default:
-			state = S2MPU02_ENABLE_SUSPEND;
-			break;
-		}
-		break;
-	default:
-		return -EINVAL;
-	}
-
-	ret = regmap_read(rdev->regmap, rdev->desc->enable_reg, &val);
-	if (ret < 0)
-		return ret;
-
-	set_bit(rdev_id, s2mps11->suspend_state);
-	/*
-	 * Don't enable suspend mode if regulator is already disabled because
-	 * this would effectively for a short time turn on the regulator after
-	 * resuming.
-	 * However we still want to toggle the suspend_state bit for regulator
-	 * in case if it got enabled before suspending the system.
-	 */
-	if (!(val & rdev->desc->enable_mask))
-		return 0;
-
-	return regmap_update_bits(rdev->regmap, rdev->desc->enable_reg,
-			rdev->desc->enable_mask, state);
-}
-
 static const struct regulator_ops s2mps14_reg_ops = {
 	.list_voltage		= regulator_list_voltage_linear,
 	.map_voltage		= regulator_map_voltage_linear,
 	.is_enabled		= regulator_is_enabled_regmap,
-	.enable			= s2mps14_regulator_enable,
+	.enable			= s2mps11_regulator_enable,
 	.disable		= regulator_disable_regmap,
 	.get_voltage_sel	= regulator_get_voltage_sel_regmap,
 	.set_voltage_sel	= regulator_set_voltage_sel_regmap,
 	.set_voltage_time_sel	= regulator_set_voltage_time_sel,
-	.set_suspend_disable	= s2mps14_regulator_set_suspend_disable,
+	.set_suspend_disable	= s2mps11_regulator_set_suspend_disable,
 };
 
 #define regulator_desc_s2mps14_ldo(num, min, step) {	\
@@ -888,24 +908,24 @@ static const struct regulator_ops s2mpu02_ldo_ops = {
 	.list_voltage		= regulator_list_voltage_linear,
 	.map_voltage		= regulator_map_voltage_linear,
 	.is_enabled		= regulator_is_enabled_regmap,
-	.enable			= s2mps14_regulator_enable,
+	.enable			= s2mps11_regulator_enable,
 	.disable		= regulator_disable_regmap,
 	.get_voltage_sel	= regulator_get_voltage_sel_regmap,
 	.set_voltage_sel	= regulator_set_voltage_sel_regmap,
 	.set_voltage_time_sel	= regulator_set_voltage_time_sel,
-	.set_suspend_disable	= s2mps14_regulator_set_suspend_disable,
+	.set_suspend_disable	= s2mps11_regulator_set_suspend_disable,
 };
 
 static const struct regulator_ops s2mpu02_buck_ops = {
 	.list_voltage		= regulator_list_voltage_linear,
 	.map_voltage		= regulator_map_voltage_linear,
 	.is_enabled		= regulator_is_enabled_regmap,
-	.enable			= s2mps14_regulator_enable,
+	.enable			= s2mps11_regulator_enable,
 	.disable		= regulator_disable_regmap,
 	.get_voltage_sel	= regulator_get_voltage_sel_regmap,
 	.set_voltage_sel	= regulator_set_voltage_sel_regmap,
 	.set_voltage_time_sel	= regulator_set_voltage_time_sel,
-	.set_suspend_disable	= s2mps14_regulator_set_suspend_disable,
+	.set_suspend_disable	= s2mps11_regulator_set_suspend_disable,
 	.set_ramp_delay		= s2mpu02_set_ramp_delay,
 };
 
diff --git a/include/linux/mfd/samsung/s2mps11.h b/include/linux/mfd/samsung/s2mps11.h
index 6e7668a389a1..f6c035eb87be 100644
--- a/include/linux/mfd/samsung/s2mps11.h
+++ b/include/linux/mfd/samsung/s2mps11.h
@@ -188,4 +188,9 @@ enum s2mps11_regulators {
 #define S2MPS11_BUCK6_RAMP_EN_SHIFT	0
 #define S2MPS11_PMIC_EN_SHIFT	6
 
+/*
+ * Bits for "enable suspend" (On/Off controlled by PWREN)
+ * are the same as in S2MPS14: S2MPS14_ENABLE_SUSPEND
+ */
+
 #endif /*  __LINUX_MFD_S2MPS11_H */
-- 
cgit v1.2.3


From 8ec59c0f5f4966f89f4e3e3cab81710c7fa959d0 Mon Sep 17 00:00:00 2001
From: Vincent Guittot <vincent.guittot@linaro.org>
Date: Mon, 17 Jun 2019 17:00:17 +0200
Subject: sched/topology: Remove unused 'sd' parameter from
 arch_scale_cpu_capacity()

The 'struct sched_domain *sd' parameter to arch_scale_cpu_capacity() is
unused since commit:

  765d0af19f5f ("sched/topology: Remove the ::smt_gain field from 'struct sched_domain'")

Remove it.

Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Viresh Kumar <viresh.kumar@linaro.org>
Reviewed-by: Valentin Schneider <valentin.schneider@arm.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: gregkh@linuxfoundation.org
Cc: linux@armlinux.org.uk
Cc: quentin.perret@arm.com
Cc: rafael@kernel.org
Link: https://lkml.kernel.org/r/1560783617-5827-1-git-send-email-vincent.guittot@linaro.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/arm/kernel/topology.c       |  2 +-
 drivers/base/arch_topology.c     |  6 +++---
 include/linux/arch_topology.h    |  2 +-
 include/linux/energy_model.h     |  2 +-
 include/linux/sched/topology.h   | 14 +++-----------
 kernel/power/energy_model.c      |  2 +-
 kernel/sched/cpufreq_schedutil.c |  2 +-
 kernel/sched/deadline.c          |  2 +-
 kernel/sched/fair.c              |  6 +++---
 kernel/sched/pelt.c              |  2 +-
 kernel/sched/pelt.h              |  2 +-
 kernel/sched/sched.h             |  2 +-
 kernel/sched/topology.c          |  8 ++++----
 13 files changed, 22 insertions(+), 30 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm/kernel/topology.c b/arch/arm/kernel/topology.c
index 60e375ce1ab2..d17cb1e6d679 100644
--- a/arch/arm/kernel/topology.c
+++ b/arch/arm/kernel/topology.c
@@ -169,7 +169,7 @@ static void update_cpu_capacity(unsigned int cpu)
 	topology_set_cpu_scale(cpu, cpu_capacity(cpu) / middle_capacity);
 
 	pr_info("CPU%u: update cpu_capacity %lu\n",
-		cpu, topology_get_cpu_scale(NULL, cpu));
+		cpu, topology_get_cpu_scale(cpu));
 }
 
 #else
diff --git a/drivers/base/arch_topology.c b/drivers/base/arch_topology.c
index 1739d7e1952a..9b09e31ae82f 100644
--- a/drivers/base/arch_topology.c
+++ b/drivers/base/arch_topology.c
@@ -43,7 +43,7 @@ static ssize_t cpu_capacity_show(struct device *dev,
 {
 	struct cpu *cpu = container_of(dev, struct cpu, dev);
 
-	return sprintf(buf, "%lu\n", topology_get_cpu_scale(NULL, cpu->dev.id));
+	return sprintf(buf, "%lu\n", topology_get_cpu_scale(cpu->dev.id));
 }
 
 static void update_topology_flags_workfn(struct work_struct *work);
@@ -116,7 +116,7 @@ void topology_normalize_cpu_scale(void)
 			/ capacity_scale;
 		topology_set_cpu_scale(cpu, capacity);
 		pr_debug("cpu_capacity: CPU%d cpu_capacity=%lu\n",
-			cpu, topology_get_cpu_scale(NULL, cpu));
+			cpu, topology_get_cpu_scale(cpu));
 	}
 }
 
@@ -185,7 +185,7 @@ init_cpu_capacity_callback(struct notifier_block *nb,
 	cpumask_andnot(cpus_to_visit, cpus_to_visit, policy->related_cpus);
 
 	for_each_cpu(cpu, policy->related_cpus) {
-		raw_capacity[cpu] = topology_get_cpu_scale(NULL, cpu) *
+		raw_capacity[cpu] = topology_get_cpu_scale(cpu) *
 				    policy->cpuinfo.max_freq / 1000UL;
 		capacity_scale = max(raw_capacity[cpu], capacity_scale);
 	}
diff --git a/include/linux/arch_topology.h b/include/linux/arch_topology.h
index d9bdc1a7f4e7..1cfe05ea1d89 100644
--- a/include/linux/arch_topology.h
+++ b/include/linux/arch_topology.h
@@ -18,7 +18,7 @@ DECLARE_PER_CPU(unsigned long, cpu_scale);
 
 struct sched_domain;
 static inline
-unsigned long topology_get_cpu_scale(struct sched_domain *sd, int cpu)
+unsigned long topology_get_cpu_scale(int cpu)
 {
 	return per_cpu(cpu_scale, cpu);
 }
diff --git a/include/linux/energy_model.h b/include/linux/energy_model.h
index aa027f7bcb3e..73f8c3cb9588 100644
--- a/include/linux/energy_model.h
+++ b/include/linux/energy_model.h
@@ -89,7 +89,7 @@ static inline unsigned long em_pd_energy(struct em_perf_domain *pd,
 	 * like schedutil.
 	 */
 	cpu = cpumask_first(to_cpumask(pd->cpus));
-	scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
+	scale_cpu = arch_scale_cpu_capacity(cpu);
 	cs = &pd->table[pd->nr_cap_states - 1];
 	freq = map_util_freq(max_util, cs->frequency, scale_cpu);
 
diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index 53afbe07354a..e445d3767cdd 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -196,14 +196,6 @@ extern void set_sched_topology(struct sched_domain_topology_level *tl);
 # define SD_INIT_NAME(type)
 #endif
 
-#ifndef arch_scale_cpu_capacity
-static __always_inline
-unsigned long arch_scale_cpu_capacity(struct sched_domain *sd, int cpu)
-{
-	return SCHED_CAPACITY_SCALE;
-}
-#endif
-
 #else /* CONFIG_SMP */
 
 struct sched_domain_attr;
@@ -219,16 +211,16 @@ static inline bool cpus_share_cache(int this_cpu, int that_cpu)
 	return true;
 }
 
+#endif	/* !CONFIG_SMP */
+
 #ifndef arch_scale_cpu_capacity
 static __always_inline
-unsigned long arch_scale_cpu_capacity(void __always_unused *sd, int cpu)
+unsigned long arch_scale_cpu_capacity(int cpu)
 {
 	return SCHED_CAPACITY_SCALE;
 }
 #endif
 
-#endif	/* !CONFIG_SMP */
-
 static inline int task_node(const struct task_struct *p)
 {
 	return cpu_to_node(task_cpu(p));
diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c
index 7d66ee68aaaf..0a9326f5f421 100644
--- a/kernel/power/energy_model.c
+++ b/kernel/power/energy_model.c
@@ -223,7 +223,7 @@ int em_register_perf_domain(cpumask_t *span, unsigned int nr_states,
 		 * All CPUs of a domain must have the same micro-architecture
 		 * since they all share the same table.
 		 */
-		cap = arch_scale_cpu_capacity(NULL, cpu);
+		cap = arch_scale_cpu_capacity(cpu);
 		if (prev_cap && prev_cap != cap) {
 			pr_err("CPUs of %*pbl must have the same capacity\n",
 							cpumask_pr_args(span));
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 962cf343f798..7c4ce69067c4 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -276,7 +276,7 @@ static unsigned long sugov_get_util(struct sugov_cpu *sg_cpu)
 {
 	struct rq *rq = cpu_rq(sg_cpu->cpu);
 	unsigned long util = cpu_util_cfs(rq);
-	unsigned long max = arch_scale_cpu_capacity(NULL, sg_cpu->cpu);
+	unsigned long max = arch_scale_cpu_capacity(sg_cpu->cpu);
 
 	sg_cpu->max = max;
 	sg_cpu->bw_dl = cpu_bw_dl(rq);
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index c1ef30861068..8b5bb2ac16e2 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -1195,7 +1195,7 @@ static void update_curr_dl(struct rq *rq)
 						 &curr->dl);
 	} else {
 		unsigned long scale_freq = arch_scale_freq_capacity(cpu);
-		unsigned long scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
+		unsigned long scale_cpu = arch_scale_cpu_capacity(cpu);
 
 		scaled_delta_exec = cap_scale(delta_exec, scale_freq);
 		scaled_delta_exec = cap_scale(scaled_delta_exec, scale_cpu);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 3c11dcdedcbc..4f8754157763 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -764,7 +764,7 @@ void post_init_entity_util_avg(struct task_struct *p)
 	struct sched_entity *se = &p->se;
 	struct cfs_rq *cfs_rq = cfs_rq_of(se);
 	struct sched_avg *sa = &se->avg;
-	long cpu_scale = arch_scale_cpu_capacity(NULL, cpu_of(rq_of(cfs_rq)));
+	long cpu_scale = arch_scale_cpu_capacity(cpu_of(rq_of(cfs_rq)));
 	long cap = (long)(cpu_scale - cfs_rq->avg.util_avg) / 2;
 
 	if (cap > 0) {
@@ -7646,7 +7646,7 @@ static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
 static unsigned long scale_rt_capacity(struct sched_domain *sd, int cpu)
 {
 	struct rq *rq = cpu_rq(cpu);
-	unsigned long max = arch_scale_cpu_capacity(sd, cpu);
+	unsigned long max = arch_scale_cpu_capacity(cpu);
 	unsigned long used, free;
 	unsigned long irq;
 
@@ -7671,7 +7671,7 @@ static void update_cpu_capacity(struct sched_domain *sd, int cpu)
 	unsigned long capacity = scale_rt_capacity(sd, cpu);
 	struct sched_group *sdg = sd->groups;
 
-	cpu_rq(cpu)->cpu_capacity_orig = arch_scale_cpu_capacity(sd, cpu);
+	cpu_rq(cpu)->cpu_capacity_orig = arch_scale_cpu_capacity(cpu);
 
 	if (!capacity)
 		capacity = 1;
diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c
index befce29bd882..42ea66b07b1d 100644
--- a/kernel/sched/pelt.c
+++ b/kernel/sched/pelt.c
@@ -366,7 +366,7 @@ int update_irq_load_avg(struct rq *rq, u64 running)
 	 * reflect the real amount of computation
 	 */
 	running = cap_scale(running, arch_scale_freq_capacity(cpu_of(rq)));
-	running = cap_scale(running, arch_scale_cpu_capacity(NULL, cpu_of(rq)));
+	running = cap_scale(running, arch_scale_cpu_capacity(cpu_of(rq)));
 
 	/*
 	 * We know the time that has been used by interrupt since last update
diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h
index 7489d5f56960..afff644da065 100644
--- a/kernel/sched/pelt.h
+++ b/kernel/sched/pelt.h
@@ -79,7 +79,7 @@ static inline void update_rq_clock_pelt(struct rq *rq, s64 delta)
 	 * Scale the elapsed time to reflect the real amount of
 	 * computation
 	 */
-	delta = cap_scale(delta, arch_scale_cpu_capacity(NULL, cpu_of(rq)));
+	delta = cap_scale(delta, arch_scale_cpu_capacity(cpu_of(rq)));
 	delta = cap_scale(delta, arch_scale_freq_capacity(cpu_of(rq)));
 
 	rq->clock_pelt += delta;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index b08dee29ef5e..e58ab597ec88 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2248,7 +2248,7 @@ unsigned long schedutil_freq_util(int cpu, unsigned long util_cfs,
 
 static inline unsigned long schedutil_energy_util(int cpu, unsigned long cfs)
 {
-	unsigned long max = arch_scale_cpu_capacity(NULL, cpu);
+	unsigned long max = arch_scale_cpu_capacity(cpu);
 
 	return schedutil_freq_util(cpu, cfs, max, ENERGY_UTIL);
 }
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 63184cf0d0d7..f751ce0b783e 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -1874,10 +1874,10 @@ static struct sched_domain_topology_level
 	unsigned long cap;
 
 	/* Is there any asymmetry? */
-	cap = arch_scale_cpu_capacity(NULL, cpumask_first(cpu_map));
+	cap = arch_scale_cpu_capacity(cpumask_first(cpu_map));
 
 	for_each_cpu(i, cpu_map) {
-		if (arch_scale_cpu_capacity(NULL, i) != cap) {
+		if (arch_scale_cpu_capacity(i) != cap) {
 			asym = true;
 			break;
 		}
@@ -1892,7 +1892,7 @@ static struct sched_domain_topology_level
 	 * to everyone.
 	 */
 	for_each_cpu(i, cpu_map) {
-		unsigned long max_capacity = arch_scale_cpu_capacity(NULL, i);
+		unsigned long max_capacity = arch_scale_cpu_capacity(i);
 		int tl_id = 0;
 
 		for_each_sd_topology(tl) {
@@ -1902,7 +1902,7 @@ static struct sched_domain_topology_level
 			for_each_cpu_and(j, tl->mask(i), cpu_map) {
 				unsigned long capacity;
 
-				capacity = arch_scale_cpu_capacity(NULL, j);
+				capacity = arch_scale_cpu_capacity(j);
 
 				if (capacity <= max_capacity)
 					continue;
-- 
cgit v1.2.3


From 3c93a0c04dfdcba199982b53b97488b1b1d90eff Mon Sep 17 00:00:00 2001
From: Qais Yousef <qais.yousef@arm.com>
Date: Tue, 4 Jun 2019 12:14:55 +0100
Subject: sched/debug: Add a new sched_trace_*() helper functions

The new functions allow modules to access internal data structures of
unexported struct cfs_rq and struct rq to extract important information
from the tracepoints to be introduced in later patches.

While at it, fix the alphabetical order of struct declarations in sched.h.

Signed-off-by: Qais Yousef <qais.yousef@arm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Dietmar Eggemann <dietmar.eggemann@arm.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Pavankumar Kondeti <pkondeti@codeaurora.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Quentin Perret <quentin.perret@arm.com>
Cc: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Uwe Kleine-Konig <u.kleine-koenig@pengutronix.de>
Link: https://lkml.kernel.org/r/20190604111459.2862-3-qais.yousef@arm.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched.h | 16 ++++++++-
 kernel/sched/fair.c   | 99 +++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 114 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 1b2590a8d038..044c023875e8 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -35,6 +35,7 @@ struct audit_context;
 struct backing_dev_info;
 struct bio_list;
 struct blk_plug;
+struct capture_control;
 struct cfs_rq;
 struct fs_struct;
 struct futex_pi_state;
@@ -47,8 +48,9 @@ struct pid_namespace;
 struct pipe_inode_info;
 struct rcu_node;
 struct reclaim_state;
-struct capture_control;
 struct robust_list_head;
+struct root_domain;
+struct rq;
 struct sched_attr;
 struct sched_param;
 struct seq_file;
@@ -1920,4 +1922,16 @@ static inline void rseq_syscall(struct pt_regs *regs)
 
 #endif
 
+const struct sched_avg *sched_trace_cfs_rq_avg(struct cfs_rq *cfs_rq);
+char *sched_trace_cfs_rq_path(struct cfs_rq *cfs_rq, char *str, int len);
+int sched_trace_cfs_rq_cpu(struct cfs_rq *cfs_rq);
+
+const struct sched_avg *sched_trace_rq_avg_rt(struct rq *rq);
+const struct sched_avg *sched_trace_rq_avg_dl(struct rq *rq);
+const struct sched_avg *sched_trace_rq_avg_irq(struct rq *rq);
+
+int sched_trace_rq_cpu(struct rq *rq);
+
+const struct cpumask *sched_trace_rd_span(struct root_domain *rd);
+
 #endif
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 4f8754157763..461c3e9a67b2 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -275,6 +275,19 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
 	return grp->my_q;
 }
 
+static inline void cfs_rq_tg_path(struct cfs_rq *cfs_rq, char *path, int len)
+{
+	if (!path)
+		return;
+
+	if (cfs_rq && task_group_is_autogroup(cfs_rq->tg))
+		autogroup_path(cfs_rq->tg, path, len);
+	else if (cfs_rq && cfs_rq->tg->css.cgroup)
+		cgroup_path(cfs_rq->tg->css.cgroup, path, len);
+	else
+		strlcpy(path, "(null)", len);
+}
+
 static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
 {
 	struct rq *rq = rq_of(cfs_rq);
@@ -449,6 +462,12 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
 	return NULL;
 }
 
+static inline void cfs_rq_tg_path(struct cfs_rq *cfs_rq, char *path, int len)
+{
+	if (path)
+		strlcpy(path, "(null)", len);
+}
+
 static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
 {
 	return true;
@@ -10408,3 +10427,83 @@ __init void init_sched_fair_class(void)
 #endif /* SMP */
 
 }
+
+/*
+ * Helper functions to facilitate extracting info from tracepoints.
+ */
+
+const struct sched_avg *sched_trace_cfs_rq_avg(struct cfs_rq *cfs_rq)
+{
+#ifdef CONFIG_SMP
+	return cfs_rq ? &cfs_rq->avg : NULL;
+#else
+	return NULL;
+#endif
+}
+EXPORT_SYMBOL_GPL(sched_trace_cfs_rq_avg);
+
+char *sched_trace_cfs_rq_path(struct cfs_rq *cfs_rq, char *str, int len)
+{
+	if (!cfs_rq) {
+		if (str)
+			strlcpy(str, "(null)", len);
+		else
+			return NULL;
+	}
+
+	cfs_rq_tg_path(cfs_rq, str, len);
+	return str;
+}
+EXPORT_SYMBOL_GPL(sched_trace_cfs_rq_path);
+
+int sched_trace_cfs_rq_cpu(struct cfs_rq *cfs_rq)
+{
+	return cfs_rq ? cpu_of(rq_of(cfs_rq)) : -1;
+}
+EXPORT_SYMBOL_GPL(sched_trace_cfs_rq_cpu);
+
+const struct sched_avg *sched_trace_rq_avg_rt(struct rq *rq)
+{
+#ifdef CONFIG_SMP
+	return rq ? &rq->avg_rt : NULL;
+#else
+	return NULL;
+#endif
+}
+EXPORT_SYMBOL_GPL(sched_trace_rq_avg_rt);
+
+const struct sched_avg *sched_trace_rq_avg_dl(struct rq *rq)
+{
+#ifdef CONFIG_SMP
+	return rq ? &rq->avg_dl : NULL;
+#else
+	return NULL;
+#endif
+}
+EXPORT_SYMBOL_GPL(sched_trace_rq_avg_dl);
+
+const struct sched_avg *sched_trace_rq_avg_irq(struct rq *rq)
+{
+#if defined(CONFIG_SMP) && defined(CONFIG_HAVE_SCHED_AVG_IRQ)
+	return rq ? &rq->avg_irq : NULL;
+#else
+	return NULL;
+#endif
+}
+EXPORT_SYMBOL_GPL(sched_trace_rq_avg_irq);
+
+int sched_trace_rq_cpu(struct rq *rq)
+{
+	return rq ? cpu_of(rq) : -1;
+}
+EXPORT_SYMBOL_GPL(sched_trace_rq_cpu);
+
+const struct cpumask *sched_trace_rd_span(struct root_domain *rd)
+{
+#ifdef CONFIG_SMP
+	return rd ? rd->span : NULL;
+#else
+	return NULL;
+#endif
+}
+EXPORT_SYMBOL_GPL(sched_trace_rd_span);
-- 
cgit v1.2.3


From 69842cba9ace84849bb9b8edcdf2cefccd97901c Mon Sep 17 00:00:00 2001
From: Patrick Bellasi <patrick.bellasi@arm.com>
Date: Fri, 21 Jun 2019 09:42:02 +0100
Subject: sched/uclamp: Add CPU's clamp buckets refcounting

Utilization clamping allows clamping the CPU's utilization within a
[util_min, util_max] range, depending on the set of RUNNABLE tasks on
that CPU. Each task references two "clamp buckets" defining its minimum
and maximum (util_{min,max}) utilization "clamp values". A CPU's clamp
bucket is active if there is at least one RUNNABLE task enqueued on
that CPU and refcounting that bucket.

When a task is {en,de}queued {on,from} a rq, the set of active clamp
buckets on that CPU can change. If the set of active clamp buckets
changes for a CPU a new "aggregated" clamp value is computed for that
CPU. This is because each clamp bucket enforces a different utilization
clamp value.

Clamp values are always MAX aggregated for both util_min and util_max.
This ensures that no task can affect the performance of other
co-scheduled tasks which are more boosted (i.e. with higher util_min
clamp) or less capped (i.e. with higher util_max clamp).

A task has:
   task_struct::uclamp[clamp_id]::bucket_id
to track the "bucket index" of the CPU's clamp bucket it refcounts while
enqueued, for each clamp index (clamp_id).

A runqueue has:
   rq::uclamp[clamp_id]::bucket[bucket_id].tasks
to track how many RUNNABLE tasks on that CPU refcount each
clamp bucket (bucket_id) of a clamp index (clamp_id).
It also has a:
   rq::uclamp[clamp_id]::bucket[bucket_id].value
to track the clamp value of each clamp bucket (bucket_id) of a clamp
index (clamp_id).

The rq::uclamp[clamp_id]::bucket[] array is scanned every time a new
MAX aggregated clamp value has to be found for a clamp_id. This
operation is required only when the last task of a clamp bucket
tracking the current MAX aggregated clamp value is dequeued. In this
case, the CPU is either entering IDLE or going to schedule a less
boosted or more clamped task.
The expected number of different clamp values configured at build time
is small enough to fit the full unordered array into a single cache
line, for configurations of up to 7 buckets.

Add to struct rq the basic data structures required to refcount the
number of RUNNABLE tasks for each clamp bucket. Add also the max
aggregation required to update the rq's clamp value at each
enqueue/dequeue event.

Use a simple linear mapping of clamp values into clamp buckets.
Pre-compute and cache bucket_id to avoid integer divisions at
enqueue/dequeue time.

Signed-off-by: Patrick Bellasi <patrick.bellasi@arm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Alessio Balsini <balsini@android.com>
Cc: Dietmar Eggemann <dietmar.eggemann@arm.com>
Cc: Joel Fernandes <joelaf@google.com>
Cc: Juri Lelli <juri.lelli@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Morten Rasmussen <morten.rasmussen@arm.com>
Cc: Paul Turner <pjt@google.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Quentin Perret <quentin.perret@arm.com>
Cc: Rafael J . Wysocki <rafael.j.wysocki@intel.com>
Cc: Steve Muckle <smuckle@google.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Todd Kjos <tkjos@google.com>
Cc: Vincent Guittot <vincent.guittot@linaro.org>
Cc: Viresh Kumar <viresh.kumar@linaro.org>
Link: https://lkml.kernel.org/r/20190621084217.8167-2-patrick.bellasi@arm.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/log2.h           |  34 +++++++++
 include/linux/sched.h          |  39 ++++++++++
 include/linux/sched/topology.h |   6 --
 init/Kconfig                   |  53 +++++++++++++
 kernel/sched/core.c            | 166 +++++++++++++++++++++++++++++++++++++++++
 kernel/sched/sched.h           |  51 +++++++++++++
 6 files changed, 343 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/log2.h b/include/linux/log2.h
index 1aec01365ed4..83a4a3ca3e8a 100644
--- a/include/linux/log2.h
+++ b/include/linux/log2.h
@@ -220,4 +220,38 @@ int __order_base_2(unsigned long n)
 		ilog2((n) - 1) + 1) :		\
 	__order_base_2(n)			\
 )
+
+static inline __attribute__((const))
+int __bits_per(unsigned long n)
+{
+	if (n < 2)
+		return 1;
+	if (is_power_of_2(n))
+		return order_base_2(n) + 1;
+	return order_base_2(n);
+}
+
+/**
+ * bits_per - calculate the number of bits required for the argument
+ * @n: parameter
+ *
+ * This is constant-capable and can be used for compile time
+ * initializations, e.g. bitfields.
+ *
+ * The first few values calculated by this routine:
+ * bits_per(0) = 1
+ * bits_per(1) = 1
+ * bits_per(2) = 2
+ * bits_per(3) = 2
+ * bits_per(4) = 3
+ * ... and so on.
+ */
+#define bits_per(n)				\
+(						\
+	__builtin_constant_p(n) ? (		\
+		((n) == 0 || (n) == 1)		\
+			? 1 : ilog2(n) + 1	\
+	) :					\
+	__bits_per(n)				\
+)
 #endif /* _LINUX_LOG2_H */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 044c023875e8..80235bcd05f2 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -283,6 +283,18 @@ struct vtime {
 	u64			gtime;
 };
 
+/*
+ * Utilization clamp constraints.
+ * @UCLAMP_MIN:	Minimum utilization
+ * @UCLAMP_MAX:	Maximum utilization
+ * @UCLAMP_CNT:	Utilization clamp constraints count
+ */
+enum uclamp_id {
+	UCLAMP_MIN = 0,
+	UCLAMP_MAX,
+	UCLAMP_CNT
+};
+
 struct sched_info {
 #ifdef CONFIG_SCHED_INFO
 	/* Cumulative counters: */
@@ -314,6 +326,10 @@ struct sched_info {
 # define SCHED_FIXEDPOINT_SHIFT		10
 # define SCHED_FIXEDPOINT_SCALE		(1L << SCHED_FIXEDPOINT_SHIFT)
 
+/* Increase resolution of cpu_capacity calculations */
+# define SCHED_CAPACITY_SHIFT		SCHED_FIXEDPOINT_SHIFT
+# define SCHED_CAPACITY_SCALE		(1L << SCHED_CAPACITY_SHIFT)
+
 struct load_weight {
 	unsigned long			weight;
 	u32				inv_weight;
@@ -562,6 +578,25 @@ struct sched_dl_entity {
 	struct hrtimer inactive_timer;
 };
 
+#ifdef CONFIG_UCLAMP_TASK
+/* Number of utilization clamp buckets (shorter alias) */
+#define UCLAMP_BUCKETS CONFIG_UCLAMP_BUCKETS_COUNT
+
+/*
+ * Utilization clamp for a scheduling entity
+ * @value:		clamp value "assigned" to a se
+ * @bucket_id:		bucket index corresponding to the "assigned" value
+ *
+ * The bucket_id is the index of the clamp bucket matching the clamp value
+ * which is pre-computed and stored to avoid expensive integer divisions from
+ * the fast path.
+ */
+struct uclamp_se {
+	unsigned int value		: bits_per(SCHED_CAPACITY_SCALE);
+	unsigned int bucket_id		: bits_per(UCLAMP_BUCKETS);
+};
+#endif /* CONFIG_UCLAMP_TASK */
+
 union rcu_special {
 	struct {
 		u8			blocked;
@@ -642,6 +677,10 @@ struct task_struct {
 #endif
 	struct sched_dl_entity		dl;
 
+#ifdef CONFIG_UCLAMP_TASK
+	struct uclamp_se		uclamp[UCLAMP_CNT];
+#endif
+
 #ifdef CONFIG_PREEMPT_NOTIFIERS
 	/* List of struct preempt_notifier: */
 	struct hlist_head		preempt_notifiers;
diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index e445d3767cdd..7863bb62d2ab 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -6,12 +6,6 @@
 
 #include <linux/sched/idle.h>
 
-/*
- * Increase resolution of cpu_capacity calculations
- */
-#define SCHED_CAPACITY_SHIFT	SCHED_FIXEDPOINT_SHIFT
-#define SCHED_CAPACITY_SCALE	(1L << SCHED_CAPACITY_SHIFT)
-
 /*
  * sched-domains (multiprocessor balancing) declarations:
  */
diff --git a/init/Kconfig b/init/Kconfig
index 0e2344389501..c88289c18d59 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -677,6 +677,59 @@ config HAVE_UNSTABLE_SCHED_CLOCK
 config GENERIC_SCHED_CLOCK
 	bool
 
+menu "Scheduler features"
+
+config UCLAMP_TASK
+	bool "Enable utilization clamping for RT/FAIR tasks"
+	depends on CPU_FREQ_GOV_SCHEDUTIL
+	help
+	  This feature enables the scheduler to track the clamped utilization
+	  of each CPU based on RUNNABLE tasks scheduled on that CPU.
+
+	  With this option, the user can specify the min and max CPU
+	  utilization allowed for RUNNABLE tasks. The max utilization defines
+	  the maximum frequency a task should use while the min utilization
+	  defines the minimum frequency it should use.
+
+	  Both min and max utilization clamp values are hints to the scheduler,
+	  aiming at improving its frequency selection policy, but they do not
+	  enforce or grant any specific bandwidth for tasks.
+
+	  If in doubt, say N.
+
+config UCLAMP_BUCKETS_COUNT
+	int "Number of supported utilization clamp buckets"
+	range 5 20
+	default 5
+	depends on UCLAMP_TASK
+	help
+	  Defines the number of clamp buckets to use. The range of each bucket
+	  will be SCHED_CAPACITY_SCALE/UCLAMP_BUCKETS_COUNT. The higher the
+	  number of clamp buckets the finer their granularity and the higher
+	  the precision of clamping aggregation and tracking at run-time.
+
+	  For example, with the minimum configuration value we will have 5
+	  clamp buckets tracking 20% utilization each. A 25% boosted task will
+	  be refcounted in the [20..39]% bucket and will set the bucket clamp
+	  effective value to 25%.
+	  If a second 30% boosted task should be co-scheduled on the same CPU,
+	  that task will be refcounted in the same bucket of the first task and
+	  it will boost the bucket clamp effective value to 30%.
+	  The clamp effective value of a bucket is reset to its nominal value
+	  (20% in the example above) when there are no more tasks refcounted in
+	  that bucket.
+
+	  An additional boost/capping margin can be added to some tasks. In the
+	  example above the 25% task will be boosted to 30% until it exits the
+	  CPU. If that should be considered not acceptable on certain systems,
+	  it's always possible to reduce the margin by increasing the number of
+	  clamp buckets to trade off used memory for run-time tracking
+	  precision.
+
+	  If in doubt, use the default value.
+
+endmenu
+
 #
 # For architectures that want to enable the support for NUMA-affine scheduler
 # balancing logic:
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index e5e02d23e693..d8c1e67afd82 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -772,6 +772,168 @@ static void set_load_weight(struct task_struct *p, bool update_load)
 	}
 }
 
+#ifdef CONFIG_UCLAMP_TASK
+
+/* Integer rounded range for each bucket */
+#define UCLAMP_BUCKET_DELTA DIV_ROUND_CLOSEST(SCHED_CAPACITY_SCALE, UCLAMP_BUCKETS)
+
+#define for_each_clamp_id(clamp_id) \
+	for ((clamp_id) = 0; (clamp_id) < UCLAMP_CNT; (clamp_id)++)
+
+static inline unsigned int uclamp_bucket_id(unsigned int clamp_value)
+{
+	return clamp_value / UCLAMP_BUCKET_DELTA;
+}
+
+static inline unsigned int uclamp_none(int clamp_id)
+{
+	if (clamp_id == UCLAMP_MIN)
+		return 0;
+	return SCHED_CAPACITY_SCALE;
+}
+
+static inline void uclamp_se_set(struct uclamp_se *uc_se, unsigned int value)
+{
+	uc_se->value = value;
+	uc_se->bucket_id = uclamp_bucket_id(value);
+}
+
+static inline
+unsigned int uclamp_rq_max_value(struct rq *rq, unsigned int clamp_id)
+{
+	struct uclamp_bucket *bucket = rq->uclamp[clamp_id].bucket;
+	int bucket_id = UCLAMP_BUCKETS - 1;
+
+	/*
+	 * Since both min and max clamps are max aggregated, find the
+	 * top most bucket with tasks in.
+	 */
+	for ( ; bucket_id >= 0; bucket_id--) {
+		if (!bucket[bucket_id].tasks)
+			continue;
+		return bucket[bucket_id].value;
+	}
+
+	/* No tasks -- default clamp values */
+	return uclamp_none(clamp_id);
+}
+
+/*
+ * When a task is enqueued on a rq, the clamp bucket currently defined by the
+ * task's uclamp::bucket_id is refcounted on that rq. This also immediately
+ * updates the rq's clamp value if required.
+ */
+static inline void uclamp_rq_inc_id(struct rq *rq, struct task_struct *p,
+				    unsigned int clamp_id)
+{
+	struct uclamp_rq *uc_rq = &rq->uclamp[clamp_id];
+	struct uclamp_se *uc_se = &p->uclamp[clamp_id];
+	struct uclamp_bucket *bucket;
+
+	lockdep_assert_held(&rq->lock);
+
+	bucket = &uc_rq->bucket[uc_se->bucket_id];
+	bucket->tasks++;
+
+	if (uc_se->value > READ_ONCE(uc_rq->value))
+		WRITE_ONCE(uc_rq->value, bucket->value);
+}
+
+/*
+ * When a task is dequeued from a rq, the clamp bucket refcounted by the task
+ * is released. If this is the last task reference counting the rq's max
+ * active clamp value, then the rq's clamp value is updated.
+ *
+ * Both refcounted tasks and rq's cached clamp values are expected to be
+ * always valid. If it's detected they are not, as defensive programming,
+ * enforce the expected state and warn.
+ */
+static inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p,
+				    unsigned int clamp_id)
+{
+	struct uclamp_rq *uc_rq = &rq->uclamp[clamp_id];
+	struct uclamp_se *uc_se = &p->uclamp[clamp_id];
+	struct uclamp_bucket *bucket;
+	unsigned int rq_clamp;
+
+	lockdep_assert_held(&rq->lock);
+
+	bucket = &uc_rq->bucket[uc_se->bucket_id];
+	SCHED_WARN_ON(!bucket->tasks);
+	if (likely(bucket->tasks))
+		bucket->tasks--;
+
+	if (likely(bucket->tasks))
+		return;
+
+	rq_clamp = READ_ONCE(uc_rq->value);
+	/*
+	 * Defensive programming: this should never happen. If it happens,
+	 * e.g. due to future modification, warn and fixup the expected value.
+	 */
+	SCHED_WARN_ON(bucket->value > rq_clamp);
+	if (bucket->value >= rq_clamp)
+		WRITE_ONCE(uc_rq->value, uclamp_rq_max_value(rq, clamp_id));
+}
+
+static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p)
+{
+	unsigned int clamp_id;
+
+	if (unlikely(!p->sched_class->uclamp_enabled))
+		return;
+
+	for_each_clamp_id(clamp_id)
+		uclamp_rq_inc_id(rq, p, clamp_id);
+}
+
+static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p)
+{
+	unsigned int clamp_id;
+
+	if (unlikely(!p->sched_class->uclamp_enabled))
+		return;
+
+	for_each_clamp_id(clamp_id)
+		uclamp_rq_dec_id(rq, p, clamp_id);
+}
+
+static void __init init_uclamp(void)
+{
+	unsigned int clamp_id;
+	int cpu;
+
+	for_each_possible_cpu(cpu) {
+		struct uclamp_bucket *bucket;
+		struct uclamp_rq *uc_rq;
+		unsigned int bucket_id;
+
+		memset(&cpu_rq(cpu)->uclamp, 0, sizeof(struct uclamp_rq));
+
+		for_each_clamp_id(clamp_id) {
+			uc_rq = &cpu_rq(cpu)->uclamp[clamp_id];
+
+			bucket_id = 1;
+			while (bucket_id < UCLAMP_BUCKETS) {
+				bucket = &uc_rq->bucket[bucket_id];
+				bucket->value = bucket_id * UCLAMP_BUCKET_DELTA;
+				++bucket_id;
+			}
+		}
+	}
+
+	for_each_clamp_id(clamp_id) {
+		uclamp_se_set(&init_task.uclamp[clamp_id],
+			      uclamp_none(clamp_id));
+	}
+}
+
+#else /* CONFIG_UCLAMP_TASK */
+static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p) { }
+static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p) { }
+static inline void init_uclamp(void) { }
+#endif /* CONFIG_UCLAMP_TASK */
+
 static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
 {
 	if (!(flags & ENQUEUE_NOCLOCK))
@@ -782,6 +944,7 @@ static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
 		psi_enqueue(p, flags & ENQUEUE_WAKEUP);
 	}
 
+	uclamp_rq_inc(rq, p);
 	p->sched_class->enqueue_task(rq, p, flags);
 }
 
@@ -795,6 +958,7 @@ static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
 		psi_dequeue(p, flags & DEQUEUE_SLEEP);
 	}
 
+	uclamp_rq_dec(rq, p);
 	p->sched_class->dequeue_task(rq, p, flags);
 }
 
@@ -6093,6 +6257,8 @@ void __init sched_init(void)
 
 	psi_init();
 
+	init_uclamp();
+
 	scheduler_running = 1;
 }
 
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index e58ab597ec88..cecc6baaba93 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -791,6 +791,48 @@ extern void rto_push_irq_work_func(struct irq_work *work);
 #endif
 #endif /* CONFIG_SMP */
 
+#ifdef CONFIG_UCLAMP_TASK
+/*
+ * struct uclamp_bucket - Utilization clamp bucket
+ * @value: utilization clamp value for tasks on this clamp bucket
+ * @tasks: number of RUNNABLE tasks on this clamp bucket
+ *
+ * Keep track of how many tasks are RUNNABLE for a given utilization
+ * clamp value.
+ */
+struct uclamp_bucket {
+	unsigned long value : bits_per(SCHED_CAPACITY_SCALE);
+	unsigned long tasks : BITS_PER_LONG - bits_per(SCHED_CAPACITY_SCALE);
+};
+
+/*
+ * struct uclamp_rq - rq's utilization clamp
+ * @value: currently active clamp values for a rq
+ * @bucket: utilization clamp buckets affecting a rq
+ *
+ * Keep track of RUNNABLE tasks on a rq to aggregate their clamp values.
+ * A clamp value is affecting a rq when there is at least one task RUNNABLE
+ * (or actually running) with that value.
+ *
+ * There are up to UCLAMP_CNT possible different clamp values, currently there
+ * are only two: minimum utilization and maximum utilization.
+ *
+ * All utilization clamping values are MAX aggregated, since:
+ * - for util_min: we want to run the CPU at least at the max of the minimum
+ *   utilization required by its currently RUNNABLE tasks.
+ * - for util_max: we want to allow the CPU to run up to the max of the
+ *   maximum utilization allowed by its currently RUNNABLE tasks.
+ *
+ * Since on each system we expect only a limited number of different
+ * utilization clamp values (UCLAMP_BUCKETS), use a simple array to track
+ * the metrics required to compute all the per-rq utilization clamp values.
+ */
+struct uclamp_rq {
+	unsigned int value;
+	struct uclamp_bucket bucket[UCLAMP_BUCKETS];
+};
+#endif /* CONFIG_UCLAMP_TASK */
+
 /*
  * This is the main, per-CPU runqueue data structure.
  *
@@ -825,6 +867,11 @@ struct rq {
 	unsigned long		nr_load_updates;
 	u64			nr_switches;
 
+#ifdef CONFIG_UCLAMP_TASK
+	/* Utilization clamp values based on CPU's RUNNABLE tasks */
+	struct uclamp_rq	uclamp[UCLAMP_CNT] ____cacheline_aligned;
+#endif
+
 	struct cfs_rq		cfs;
 	struct rt_rq		rt;
 	struct dl_rq		dl;
@@ -1639,6 +1686,10 @@ extern const u32		sched_prio_to_wmult[40];
 struct sched_class {
 	const struct sched_class *next;
 
+#ifdef CONFIG_UCLAMP_TASK
+	int uclamp_enabled;
+#endif
+
 	void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags);
 	void (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags);
 	void (*yield_task)   (struct rq *rq);
-- 
cgit v1.2.3


From e8f14172c6b11e9a86c65532497087f8eb0f91b1 Mon Sep 17 00:00:00 2001
From: Patrick Bellasi <patrick.bellasi@arm.com>
Date: Fri, 21 Jun 2019 09:42:05 +0100
Subject: sched/uclamp: Add system default clamps

Tasks without a user-defined clamp value are considered not clamped
and by default their utilization can have any value in the
[0..SCHED_CAPACITY_SCALE] range.

Tasks with a user-defined clamp value are allowed to request any value
in that range, and the required clamp is unconditionally enforced.
However, a "System Management Software" could be interested in limiting
the range of clamp values allowed for all tasks.

Add a privileged interface to define a system default configuration via:

  /proc/sys/kernel/sched_uclamp_util_{min,max}

which works as an unconditional clamp range restriction for all tasks.

With the default configuration, the full SCHED_CAPACITY_SCALE range of
values is allowed for each clamp index. Otherwise, the task-specific
clamp is capped by the corresponding system default value.

Do that by tracking, for each task, the "effective" clamp value and
bucket the task has been refcounted in at enqueue time. This allows
lazily aggregating the "requested" and "system default" values at
enqueue time and simplifies refcounting updates at dequeue time.

The cached bucket ids are used to avoid (relatively) more expensive
integer divisions every time a task is enqueued.

An active flag is used to report when the "effective" value is valid and
thus the task is actually refcounted in the corresponding rq's bucket.

Signed-off-by: Patrick Bellasi <patrick.bellasi@arm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Alessio Balsini <balsini@android.com>
Cc: Dietmar Eggemann <dietmar.eggemann@arm.com>
Cc: Joel Fernandes <joelaf@google.com>
Cc: Juri Lelli <juri.lelli@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Morten Rasmussen <morten.rasmussen@arm.com>
Cc: Paul Turner <pjt@google.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Quentin Perret <quentin.perret@arm.com>
Cc: Rafael J . Wysocki <rafael.j.wysocki@intel.com>
Cc: Steve Muckle <smuckle@google.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Todd Kjos <tkjos@google.com>
Cc: Vincent Guittot <vincent.guittot@linaro.org>
Cc: Viresh Kumar <viresh.kumar@linaro.org>
Link: https://lkml.kernel.org/r/20190621084217.8167-5-patrick.bellasi@arm.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched.h        | 10 +++++
 include/linux/sched/sysctl.h | 11 +++++
 kernel/sched/core.c          | 99 +++++++++++++++++++++++++++++++++++++++++++-
 kernel/sysctl.c              | 16 +++++++
 4 files changed, 135 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 80235bcd05f2..5485f411e8e1 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -586,14 +586,21 @@ struct sched_dl_entity {
  * Utilization clamp for a scheduling entity
  * @value:		clamp value "assigned" to a se
  * @bucket_id:		bucket index corresponding to the "assigned" value
+ * @active:		the se is currently refcounted in a rq's bucket
  *
  * The bucket_id is the index of the clamp bucket matching the clamp value
  * which is pre-computed and stored to avoid expensive integer divisions from
  * the fast path.
+ *
+ * The active bit is set whenever a task has got an "effective" value assigned,
+ * which can be different from the clamp value "requested" from user-space.
+ * This makes it possible to know that a task is refcounted in the rq's
+ * bucket corresponding to the "effective" bucket_id.
  */
 struct uclamp_se {
 	unsigned int value		: bits_per(SCHED_CAPACITY_SCALE);
 	unsigned int bucket_id		: bits_per(UCLAMP_BUCKETS);
+	unsigned int active		: 1;
 };
 #endif /* CONFIG_UCLAMP_TASK */
 
@@ -678,6 +685,9 @@ struct task_struct {
 	struct sched_dl_entity		dl;
 
 #ifdef CONFIG_UCLAMP_TASK
+	/* Clamp values requested for a scheduling entity */
+	struct uclamp_se		uclamp_req[UCLAMP_CNT];
+	/* Effective clamp values used for a scheduling entity */
 	struct uclamp_se		uclamp[UCLAMP_CNT];
 #endif
 
diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
index 99ce6d728df7..d4f6215ee03f 100644
--- a/include/linux/sched/sysctl.h
+++ b/include/linux/sched/sysctl.h
@@ -56,6 +56,11 @@ int sched_proc_update_handler(struct ctl_table *table, int write,
 extern unsigned int sysctl_sched_rt_period;
 extern int sysctl_sched_rt_runtime;
 
+#ifdef CONFIG_UCLAMP_TASK
+extern unsigned int sysctl_sched_uclamp_util_min;
+extern unsigned int sysctl_sched_uclamp_util_max;
+#endif
+
 #ifdef CONFIG_CFS_BANDWIDTH
 extern unsigned int sysctl_sched_cfs_bandwidth_slice;
 #endif
@@ -75,6 +80,12 @@ extern int sched_rt_handler(struct ctl_table *table, int write,
 		void __user *buffer, size_t *lenp,
 		loff_t *ppos);
 
+#ifdef CONFIG_UCLAMP_TASK
+extern int sysctl_sched_uclamp_handler(struct ctl_table *table, int write,
+				       void __user *buffer, size_t *lenp,
+				       loff_t *ppos);
+#endif
+
 extern int sysctl_numa_balancing(struct ctl_table *table, int write,
 				 void __user *buffer, size_t *lenp,
 				 loff_t *ppos);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 2dde735635ec..b74de86b68c7 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -773,6 +773,14 @@ static void set_load_weight(struct task_struct *p, bool update_load)
 }
 
 #ifdef CONFIG_UCLAMP_TASK
+/* Max allowed minimum utilization */
+unsigned int sysctl_sched_uclamp_util_min = SCHED_CAPACITY_SCALE;
+
+/* Max allowed maximum utilization */
+unsigned int sysctl_sched_uclamp_util_max = SCHED_CAPACITY_SCALE;
+
+/* All clamps are required to be less than or equal to these values */
+static struct uclamp_se uclamp_default[UCLAMP_CNT];
 
 /* Integer rounded range for each bucket */
 #define UCLAMP_BUCKET_DELTA DIV_ROUND_CLOSEST(SCHED_CAPACITY_SCALE, UCLAMP_BUCKETS)
@@ -851,6 +859,25 @@ unsigned int uclamp_rq_max_value(struct rq *rq, unsigned int clamp_id,
 	return uclamp_idle_value(rq, clamp_id, clamp_value);
 }
 
+/*
+ * The effective clamp bucket index of a task depends on, by increasing
+ * priority:
+ * - the task specific clamp value, when explicitly requested from userspace
+ * - the system default clamp value, defined by the sysadmin
+ */
+static inline struct uclamp_se
+uclamp_eff_get(struct task_struct *p, unsigned int clamp_id)
+{
+	struct uclamp_se uc_req = p->uclamp_req[clamp_id];
+	struct uclamp_se uc_max = uclamp_default[clamp_id];
+
+	/* System default restrictions always apply */
+	if (unlikely(uc_req.value > uc_max.value))
+		return uc_max;
+
+	return uc_req;
+}
+
 /*
  * When a task is enqueued on a rq, the clamp bucket currently defined by the
  * task's uclamp::bucket_id is refcounted on that rq. This also immediately
@@ -870,8 +897,12 @@ static inline void uclamp_rq_inc_id(struct rq *rq, struct task_struct *p,
 
 	lockdep_assert_held(&rq->lock);
 
+	/* Update task effective clamp */
+	p->uclamp[clamp_id] = uclamp_eff_get(p, clamp_id);
+
 	bucket = &uc_rq->bucket[uc_se->bucket_id];
 	bucket->tasks++;
+	uc_se->active = true;
 
 	uclamp_idle_reset(rq, clamp_id, uc_se->value);
 
@@ -910,6 +941,7 @@ static inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p,
 	SCHED_WARN_ON(!bucket->tasks);
 	if (likely(bucket->tasks))
 		bucket->tasks--;
+	uc_se->active = false;
 
 	/*
 	 * Keep "local max aggregation" simple and accept to (possibly)
@@ -958,8 +990,65 @@ static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p)
 		uclamp_rq_dec_id(rq, p, clamp_id);
 }
 
+int sysctl_sched_uclamp_handler(struct ctl_table *table, int write,
+				void __user *buffer, size_t *lenp,
+				loff_t *ppos)
+{
+	int old_min, old_max;
+	static DEFINE_MUTEX(mutex);
+	int result;
+
+	mutex_lock(&mutex);
+	old_min = sysctl_sched_uclamp_util_min;
+	old_max = sysctl_sched_uclamp_util_max;
+
+	result = proc_dointvec(table, write, buffer, lenp, ppos);
+	if (result)
+		goto undo;
+	if (!write)
+		goto done;
+
+	if (sysctl_sched_uclamp_util_min > sysctl_sched_uclamp_util_max ||
+	    sysctl_sched_uclamp_util_max > SCHED_CAPACITY_SCALE) {
+		result = -EINVAL;
+		goto undo;
+	}
+
+	if (old_min != sysctl_sched_uclamp_util_min) {
+		uclamp_se_set(&uclamp_default[UCLAMP_MIN],
+			      sysctl_sched_uclamp_util_min);
+	}
+	if (old_max != sysctl_sched_uclamp_util_max) {
+		uclamp_se_set(&uclamp_default[UCLAMP_MAX],
+			      sysctl_sched_uclamp_util_max);
+	}
+
+	/*
+	 * Updating all the RUNNABLE tasks is expensive, keep it simple and do
+	 * just a lazy update at each next enqueue time.
+	 */
+	goto done;
+
+undo:
+	sysctl_sched_uclamp_util_min = old_min;
+	sysctl_sched_uclamp_util_max = old_max;
+done:
+	mutex_unlock(&mutex);
+
+	return result;
+}
+
+static void uclamp_fork(struct task_struct *p)
+{
+	unsigned int clamp_id;
+
+	for_each_clamp_id(clamp_id)
+		p->uclamp[clamp_id].active = false;
+}
+
 static void __init init_uclamp(void)
 {
+	struct uclamp_se uc_max = {};
 	unsigned int clamp_id;
 	int cpu;
 
@@ -969,14 +1058,20 @@ static void __init init_uclamp(void)
 	}
 
 	for_each_clamp_id(clamp_id) {
-		uclamp_se_set(&init_task.uclamp[clamp_id],
+		uclamp_se_set(&init_task.uclamp_req[clamp_id],
 			      uclamp_none(clamp_id));
 	}
+
+	/* System defaults allow max clamp values for both indexes */
+	uclamp_se_set(&uc_max, uclamp_none(UCLAMP_MAX));
+	for_each_clamp_id(clamp_id)
+		uclamp_default[clamp_id] = uc_max;
 }
 
 #else /* CONFIG_UCLAMP_TASK */
 static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p) { }
 static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p) { }
+static inline void uclamp_fork(struct task_struct *p) { }
 static inline void init_uclamp(void) { }
 #endif /* CONFIG_UCLAMP_TASK */
 
@@ -2545,6 +2640,8 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
 	 */
 	p->prio = current->normal_prio;
 
+	uclamp_fork(p);
+
 	/*
 	 * Revert to default priority/policy on fork if requested.
 	 */
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 1beca96fb625..1c1ad1e14f21 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -452,6 +452,22 @@ static struct ctl_table kern_table[] = {
 		.mode		= 0644,
 		.proc_handler	= sched_rr_handler,
 	},
+#ifdef CONFIG_UCLAMP_TASK
+	{
+		.procname	= "sched_util_clamp_min",
+		.data		= &sysctl_sched_uclamp_util_min,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= sysctl_sched_uclamp_handler,
+	},
+	{
+		.procname	= "sched_util_clamp_max",
+		.data		= &sysctl_sched_uclamp_util_max,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= sysctl_sched_uclamp_handler,
+	},
+#endif
 #ifdef CONFIG_SCHED_AUTOGROUP
 	{
 		.procname	= "sched_autogroup_enabled",
-- 
cgit v1.2.3


From a509a7cd79747074a2c018a45bbbc52d1f4aed44 Mon Sep 17 00:00:00 2001
From: Patrick Bellasi <patrick.bellasi@arm.com>
Date: Fri, 21 Jun 2019 09:42:07 +0100
Subject: sched/uclamp: Extend sched_setattr() to support utilization clamping

The SCHED_DEADLINE scheduling class provides an advanced and formal
model to define task requirements that can translate into proper
decisions for both task placement and frequency selection. Other
classes have a simpler model based on the POSIX concept of
priorities.

Such a simple priority-based model, however, does not allow exploiting
the most advanced features of the Linux scheduler such as, for example,
driving frequency selection via the schedutil cpufreq governor. Still,
for non-SCHED_DEADLINE tasks too, it is useful to define task
properties to support scheduler decisions.

Utilization clamping exposes to user-space a new set of per-task
attributes the scheduler can use as hints about the expected/required
utilization for a task. This allows to implement a "proactive" per-task
frequency control policy, a more advanced policy than the current one
based just on "passive" measured task utilization. For example, it's
possible to boost interactive tasks (e.g. to get better performance) or
cap background tasks (e.g. to be more energy/thermal efficient).

Introduce a new API to set utilization clamping values for a specified
task by extending sched_setattr(), a syscall which already allows to
define task specific properties for different scheduling classes. A new
pair of attributes allows to specify a minimum and maximum utilization
the scheduler can consider for a task.

Do that by first validating the requested clamp values and then applying
the required changes using _the_ same pattern already in use for
__setscheduler(). This ensures that the task is re-enqueued with the new
clamp values.

Signed-off-by: Patrick Bellasi <patrick.bellasi@arm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Alessio Balsini <balsini@android.com>
Cc: Dietmar Eggemann <dietmar.eggemann@arm.com>
Cc: Joel Fernandes <joelaf@google.com>
Cc: Juri Lelli <juri.lelli@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Morten Rasmussen <morten.rasmussen@arm.com>
Cc: Paul Turner <pjt@google.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Quentin Perret <quentin.perret@arm.com>
Cc: Rafael J . Wysocki <rafael.j.wysocki@intel.com>
Cc: Steve Muckle <smuckle@google.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Todd Kjos <tkjos@google.com>
Cc: Vincent Guittot <vincent.guittot@linaro.org>
Cc: Viresh Kumar <viresh.kumar@linaro.org>
Link: https://lkml.kernel.org/r/20190621084217.8167-7-patrick.bellasi@arm.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched.h            |  9 ++++
 include/uapi/linux/sched.h       | 12 +++++-
 include/uapi/linux/sched/types.h | 66 +++++++++++++++++++++++++----
 kernel/sched/core.c              | 91 ++++++++++++++++++++++++++++++++++++----
 4 files changed, 161 insertions(+), 17 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 5485f411e8e1..1113dd4706ae 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -587,6 +587,7 @@ struct sched_dl_entity {
  * @value:		clamp value "assigned" to a se
  * @bucket_id:		bucket index corresponding to the "assigned" value
  * @active:		the se is currently refcounted in a rq's bucket
+ * @user_defined:	the requested clamp value comes from user-space
  *
  * The bucket_id is the index of the clamp bucket matching the clamp value
  * which is pre-computed and stored to avoid expensive integer divisions from
@@ -596,11 +597,19 @@ struct sched_dl_entity {
  * which can be different from the clamp value "requested" from user-space.
  * This allows to know a task is refcounted in the rq's bucket corresponding
  * to the "effective" bucket_id.
+ *
+ * The user_defined bit is set whenever a task has got a task-specific clamp
+ * value requested from userspace, i.e. the system defaults apply to this task
+ * just as a restriction. This allows to relax default clamps when a less
+ * restrictive task-specific value has been requested, thus allowing to
+ * implement a "nice" semantic. For example, a task running with a 20%
+ * default boost can still drop its own boosting to 0%.
  */
 struct uclamp_se {
 	unsigned int value		: bits_per(SCHED_CAPACITY_SCALE);
 	unsigned int bucket_id		: bits_per(UCLAMP_BUCKETS);
 	unsigned int active		: 1;
+	unsigned int user_defined	: 1;
 };
 #endif /* CONFIG_UCLAMP_TASK */
 
diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h
index 58b2368d3634..617bb59aa8ba 100644
--- a/include/uapi/linux/sched.h
+++ b/include/uapi/linux/sched.h
@@ -52,10 +52,20 @@
 #define SCHED_FLAG_RECLAIM		0x02
 #define SCHED_FLAG_DL_OVERRUN		0x04
 #define SCHED_FLAG_KEEP_POLICY		0x08
+#define SCHED_FLAG_KEEP_PARAMS		0x10
+#define SCHED_FLAG_UTIL_CLAMP_MIN	0x20
+#define SCHED_FLAG_UTIL_CLAMP_MAX	0x40
+
+#define SCHED_FLAG_KEEP_ALL	(SCHED_FLAG_KEEP_POLICY | \
+				 SCHED_FLAG_KEEP_PARAMS)
+
+#define SCHED_FLAG_UTIL_CLAMP	(SCHED_FLAG_UTIL_CLAMP_MIN | \
+				 SCHED_FLAG_UTIL_CLAMP_MAX)
 
 #define SCHED_FLAG_ALL	(SCHED_FLAG_RESET_ON_FORK	| \
 			 SCHED_FLAG_RECLAIM		| \
 			 SCHED_FLAG_DL_OVERRUN		| \
-			 SCHED_FLAG_KEEP_POLICY)
+			 SCHED_FLAG_KEEP_ALL		| \
+			 SCHED_FLAG_UTIL_CLAMP)
 
 #endif /* _UAPI_LINUX_SCHED_H */
diff --git a/include/uapi/linux/sched/types.h b/include/uapi/linux/sched/types.h
index 10fbb8031930..c852153ddb0d 100644
--- a/include/uapi/linux/sched/types.h
+++ b/include/uapi/linux/sched/types.h
@@ -9,6 +9,7 @@ struct sched_param {
 };
 
 #define SCHED_ATTR_SIZE_VER0	48	/* sizeof first published struct */
+#define SCHED_ATTR_SIZE_VER1	56	/* add: util_{min,max} */
 
 /*
  * Extended scheduling parameters data structure.
@@ -21,8 +22,33 @@ struct sched_param {
  * the tasks may be useful for a wide variety of application fields, e.g.,
  * multimedia, streaming, automation and control, and many others.
  *
- * This variant (sched_attr) is meant at describing a so-called
- * sporadic time-constrained task. In such model a task is specified by:
+ * This variant (sched_attr) allows to define additional attributes to
+ * improve the scheduler knowledge about task requirements.
+ *
+ * Scheduling Class Attributes
+ * ===========================
+ *
+ * A subset of sched_attr attributes specifies the
+ * scheduling policy and relative POSIX attributes:
+ *
+ *  @size		size of the structure, for fwd/bwd compat.
+ *
+ *  @sched_policy	task's scheduling policy
+ *  @sched_nice		task's nice value      (SCHED_NORMAL/BATCH)
+ *  @sched_priority	task's static priority (SCHED_FIFO/RR)
+ *
+ * Certain more advanced scheduling features can be controlled by a
+ * predefined set of flags via the attribute:
+ *
+ *  @sched_flags	for customizing the scheduler behaviour
+ *
+ * Sporadic Time-Constrained Task Attributes
+ * =========================================
+ *
+ * A subset of sched_attr attributes allows to describe a so-called
+ * sporadic time-constrained task.
+ *
+ * In such a model a task is specified by:
  *  - the activation period or minimum instance inter-arrival time;
  *  - the maximum (or average, depending on the actual scheduling
  *    discipline) computation time of all instances, a.k.a. runtime;
@@ -34,14 +60,8 @@ struct sched_param {
  * than the runtime and must be completed by time instant t equal to
  * the instance activation time + the deadline.
  *
- * This is reflected by the actual fields of the sched_attr structure:
+ * This is reflected by the following fields of the sched_attr structure:
  *
- *  @size		size of the structure, for fwd/bwd compat.
- *
- *  @sched_policy	task's scheduling policy
- *  @sched_flags	for customizing the scheduler behaviour
- *  @sched_nice		task's nice value      (SCHED_NORMAL/BATCH)
- *  @sched_priority	task's static priority (SCHED_FIFO/RR)
  *  @sched_deadline	representative of the task's deadline
  *  @sched_runtime	representative of the task's runtime
  *  @sched_period	representative of the task's period
@@ -53,6 +73,29 @@ struct sched_param {
  * As of now, the SCHED_DEADLINE policy (sched_dl scheduling class) is the
  * only user of this new interface. More information about the algorithm
  * available in the scheduling class file or in Documentation/.
+ *
+ * Task Utilization Attributes
+ * ===========================
+ *
+ * A subset of sched_attr attributes allows to specify the utilization
+ * expected for a task. These attributes allow to inform the scheduler about
+ * the utilization boundaries within which it should schedule the task. These
+ * boundaries are valuable hints to support scheduler decisions on both task
+ * placement and frequency selection.
+ *
+ *  @sched_util_min	represents the minimum utilization
+ *  @sched_util_max	represents the maximum utilization
+ *
+ * Utilization is a value in the range [0..SCHED_CAPACITY_SCALE]. It
+ * represents the percentage of CPU time used by a task when running at the
+ * maximum frequency on the highest capacity CPU of the system. For example, a
+ * 20% utilization task is a task running for 2ms every 10ms at maximum
+ * frequency.
+ *
+ * A task with a min utilization value bigger than 0 is more likely scheduled
+ * on a CPU with a capacity big enough to fit the specified value.
+ * A task with a max utilization value smaller than 1024 is more likely
+ * scheduled on a CPU with no more capacity than the specified value.
  */
 struct sched_attr {
 	__u32 size;
@@ -70,6 +113,11 @@ struct sched_attr {
 	__u64 sched_runtime;
 	__u64 sched_deadline;
 	__u64 sched_period;
+
+	/* Utilization hints */
+	__u32 sched_util_min;
+	__u32 sched_util_max;
+
 };
 
 #endif /* _UAPI_LINUX_SCHED_TYPES_H */
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 6d519f3f9789..e9a669266fa9 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -805,10 +805,12 @@ static inline unsigned int uclamp_none(int clamp_id)
 	return SCHED_CAPACITY_SCALE;
 }
 
-static inline void uclamp_se_set(struct uclamp_se *uc_se, unsigned int value)
+static inline void uclamp_se_set(struct uclamp_se *uc_se,
+				 unsigned int value, bool user_defined)
 {
 	uc_se->value = value;
 	uc_se->bucket_id = uclamp_bucket_id(value);
+	uc_se->user_defined = user_defined;
 }
 
 static inline unsigned int
@@ -1016,11 +1018,11 @@ int sysctl_sched_uclamp_handler(struct ctl_table *table, int write,
 
 	if (old_min != sysctl_sched_uclamp_util_min) {
 		uclamp_se_set(&uclamp_default[UCLAMP_MIN],
-			      sysctl_sched_uclamp_util_min);
+			      sysctl_sched_uclamp_util_min, false);
 	}
 	if (old_max != sysctl_sched_uclamp_util_max) {
 		uclamp_se_set(&uclamp_default[UCLAMP_MAX],
-			      sysctl_sched_uclamp_util_max);
+			      sysctl_sched_uclamp_util_max, false);
 	}
 
 	/*
@@ -1038,6 +1040,42 @@ done:
 	return result;
 }
 
+static int uclamp_validate(struct task_struct *p,
+			   const struct sched_attr *attr)
+{
+	unsigned int lower_bound = p->uclamp_req[UCLAMP_MIN].value;
+	unsigned int upper_bound = p->uclamp_req[UCLAMP_MAX].value;
+
+	if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN)
+		lower_bound = attr->sched_util_min;
+	if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX)
+		upper_bound = attr->sched_util_max;
+
+	if (lower_bound > upper_bound)
+		return -EINVAL;
+	if (upper_bound > SCHED_CAPACITY_SCALE)
+		return -EINVAL;
+
+	return 0;
+}
+
+static void __setscheduler_uclamp(struct task_struct *p,
+				  const struct sched_attr *attr)
+{
+	if (likely(!(attr->sched_flags & SCHED_FLAG_UTIL_CLAMP)))
+		return;
+
+	if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN) {
+		uclamp_se_set(&p->uclamp_req[UCLAMP_MIN],
+			      attr->sched_util_min, true);
+	}
+
+	if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX) {
+		uclamp_se_set(&p->uclamp_req[UCLAMP_MAX],
+			      attr->sched_util_max, true);
+	}
+}
+
 static void uclamp_fork(struct task_struct *p)
 {
 	unsigned int clamp_id;
@@ -1059,11 +1097,11 @@ static void __init init_uclamp(void)
 
 	for_each_clamp_id(clamp_id) {
 		uclamp_se_set(&init_task.uclamp_req[clamp_id],
-			      uclamp_none(clamp_id));
+			      uclamp_none(clamp_id), false);
 	}
 
 	/* System defaults allow max clamp values for both indexes */
-	uclamp_se_set(&uc_max, uclamp_none(UCLAMP_MAX));
+	uclamp_se_set(&uc_max, uclamp_none(UCLAMP_MAX), false);
 	for_each_clamp_id(clamp_id)
 		uclamp_default[clamp_id] = uc_max;
 }
@@ -1071,6 +1109,13 @@ static void __init init_uclamp(void)
 #else /* CONFIG_UCLAMP_TASK */
 static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p) { }
 static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p) { }
+static inline int uclamp_validate(struct task_struct *p,
+				  const struct sched_attr *attr)
+{
+	return -EOPNOTSUPP;
+}
+static void __setscheduler_uclamp(struct task_struct *p,
+				  const struct sched_attr *attr) { }
 static inline void uclamp_fork(struct task_struct *p) { }
 static inline void init_uclamp(void) { }
 #endif /* CONFIG_UCLAMP_TASK */
@@ -4412,6 +4457,13 @@ static void __setscheduler_params(struct task_struct *p,
 static void __setscheduler(struct rq *rq, struct task_struct *p,
 			   const struct sched_attr *attr, bool keep_boost)
 {
+	/*
+	 * If params can't change, scheduling class changes aren't allowed
+	 * either.
+	 */
+	if (attr->sched_flags & SCHED_FLAG_KEEP_PARAMS)
+		return;
+
 	__setscheduler_params(p, attr);
 
 	/*
@@ -4549,6 +4601,13 @@ recheck:
 			return retval;
 	}
 
+	/* Update task specific "requested" clamps */
+	if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP) {
+		retval = uclamp_validate(p, attr);
+		if (retval)
+			return retval;
+	}
+
 	/*
 	 * Make sure no PI-waiters arrive (or leave) while we are
 	 * changing the priority of the task:
@@ -4578,6 +4637,8 @@ recheck:
 			goto change;
 		if (dl_policy(policy) && dl_param_changed(p, attr))
 			goto change;
+		if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP)
+			goto change;
 
 		p->sched_reset_on_fork = reset_on_fork;
 		task_rq_unlock(rq, p, &rf);
@@ -4658,7 +4719,9 @@ change:
 		put_prev_task(rq, p);
 
 	prev_class = p->sched_class;
+
 	__setscheduler(rq, p, attr, pi);
+	__setscheduler_uclamp(p, attr);
 
 	if (queued) {
 		/*
@@ -4834,6 +4897,10 @@ static int sched_copy_attr(struct sched_attr __user *uattr, struct sched_attr *a
 	if (ret)
 		return -EFAULT;
 
+	if ((attr->sched_flags & SCHED_FLAG_UTIL_CLAMP) &&
+	    size < SCHED_ATTR_SIZE_VER1)
+		return -EINVAL;
+
 	/*
 	 * XXX: Do we want to be lenient like existing syscalls; or do we want
 	 * to be strict and return an error on out-of-bounds values?
@@ -4903,10 +4970,15 @@ SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr,
 	rcu_read_lock();
 	retval = -ESRCH;
 	p = find_process_by_pid(pid);
-	if (p != NULL)
-		retval = sched_setattr(p, &attr);
+	if (likely(p))
+		get_task_struct(p);
 	rcu_read_unlock();
 
+	if (likely(p)) {
+		retval = sched_setattr(p, &attr);
+		put_task_struct(p);
+	}
+
 	return retval;
 }
 
@@ -5057,6 +5129,11 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
 	else
 		attr.sched_nice = task_nice(p);
 
+#ifdef CONFIG_UCLAMP_TASK
+	attr.sched_util_min = p->uclamp_req[UCLAMP_MIN].value;
+	attr.sched_util_max = p->uclamp_req[UCLAMP_MAX].value;
+#endif
+
 	rcu_read_unlock();
 
 	retval = sched_read_attr(uattr, &attr, size);
-- 
cgit v1.2.3


From fd7d55172d1e2e501e6da0a5c1de25f06612dc2e Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Sat, 1 Jun 2019 01:27:22 -0700
Subject: perf/cgroups: Don't rotate events for cgroups unnecessarily

Currently perf_rotate_context assumes that if the context's nr_events !=
nr_active a rotation is necessary for perf event multiplexing. With
cgroups, nr_events is the total count of events for all cgroups and
nr_active will not include events in a cgroup other than the current
task's. This makes rotation appear necessary for cgroups when it is not.

Add a perf_event_context flag that is set when rotation is necessary.
Clear the flag during sched_out and set it when a flexible sched_in
fails due to resources.

Signed-off-by: Ian Rogers <irogers@google.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Kan Liang <kan.liang@linux.intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Link: https://lkml.kernel.org/r/20190601082722.44543-1-irogers@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/perf_event.h |  5 +++++
 kernel/events/core.c       | 42 ++++++++++++++++++++++--------------------
 2 files changed, 27 insertions(+), 20 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 3dc01cf98e16..2ddae518dce6 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -749,6 +749,11 @@ struct perf_event_context {
 	int				nr_stat;
 	int				nr_freq;
 	int				rotate_disable;
+	/*
+	 * Set when nr_events != nr_active, except tolerant to events not
+	 * necessary to be active due to scheduling constraints, such as cgroups.
+	 */
+	int				rotate_necessary;
 	refcount_t			refcount;
 	struct task_struct		*task;
 
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 118ad1aef6af..23efe6792abc 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -2952,6 +2952,12 @@ static void ctx_sched_out(struct perf_event_context *ctx,
 	if (!ctx->nr_active || !(is_active & EVENT_ALL))
 		return;
 
+	/*
+	 * If we had been multiplexing, no rotations are necessary now that no
+	 * events are active.
+	 */
+	ctx->rotate_necessary = 0;
+
 	perf_pmu_disable(ctx->pmu);
 	if (is_active & EVENT_PINNED) {
 		list_for_each_entry_safe(event, tmp, &ctx->pinned_active, active_list)
@@ -3319,10 +3325,13 @@ static int flexible_sched_in(struct perf_event *event, void *data)
 		return 0;
 
 	if (group_can_go_on(event, sid->cpuctx, sid->can_add_hw)) {
-		if (!group_sched_in(event, sid->cpuctx, sid->ctx))
-			list_add_tail(&event->active_list, &sid->ctx->flexible_active);
-		else
+		int ret = group_sched_in(event, sid->cpuctx, sid->ctx);
+		if (ret) {
 			sid->can_add_hw = 0;
+			sid->ctx->rotate_necessary = 1;
+			return 0;
+		}
+		list_add_tail(&event->active_list, &sid->ctx->flexible_active);
 	}
 
 	return 0;
@@ -3690,24 +3699,17 @@ ctx_first_active(struct perf_event_context *ctx)
 static bool perf_rotate_context(struct perf_cpu_context *cpuctx)
 {
 	struct perf_event *cpu_event = NULL, *task_event = NULL;
-	bool cpu_rotate = false, task_rotate = false;
-	struct perf_event_context *ctx = NULL;
+	struct perf_event_context *task_ctx = NULL;
+	int cpu_rotate, task_rotate;
 
 	/*
 	 * Since we run this from IRQ context, nobody can install new
 	 * events, thus the event count values are stable.
 	 */
 
-	if (cpuctx->ctx.nr_events) {
-		if (cpuctx->ctx.nr_events != cpuctx->ctx.nr_active)
-			cpu_rotate = true;
-	}
-
-	ctx = cpuctx->task_ctx;
-	if (ctx && ctx->nr_events) {
-		if (ctx->nr_events != ctx->nr_active)
-			task_rotate = true;
-	}
+	cpu_rotate = cpuctx->ctx.rotate_necessary;
+	task_ctx = cpuctx->task_ctx;
+	task_rotate = task_ctx ? task_ctx->rotate_necessary : 0;
 
 	if (!(cpu_rotate || task_rotate))
 		return false;
@@ -3716,7 +3718,7 @@ static bool perf_rotate_context(struct perf_cpu_context *cpuctx)
 	perf_pmu_disable(cpuctx->ctx.pmu);
 
 	if (task_rotate)
-		task_event = ctx_first_active(ctx);
+		task_event = ctx_first_active(task_ctx);
 	if (cpu_rotate)
 		cpu_event = ctx_first_active(&cpuctx->ctx);
 
@@ -3724,17 +3726,17 @@ static bool perf_rotate_context(struct perf_cpu_context *cpuctx)
 	 * As per the order given at ctx_resched() first 'pop' task flexible
 	 * and then, if needed CPU flexible.
 	 */
-	if (task_event || (ctx && cpu_event))
-		ctx_sched_out(ctx, cpuctx, EVENT_FLEXIBLE);
+	if (task_event || (task_ctx && cpu_event))
+		ctx_sched_out(task_ctx, cpuctx, EVENT_FLEXIBLE);
 	if (cpu_event)
 		cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
 
 	if (task_event)
-		rotate_ctx(ctx, task_event);
+		rotate_ctx(task_ctx, task_event);
 	if (cpu_event)
 		rotate_ctx(&cpuctx->ctx, cpu_event);
 
-	perf_event_sched_in(cpuctx, ctx, current);
+	perf_event_sched_in(cpuctx, task_ctx, current);
 
 	perf_pmu_enable(cpuctx->ctx.pmu);
 	perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
-- 
cgit v1.2.3


From 44038bc514a244fba9d0d6d698b15970185ac251 Mon Sep 17 00:00:00 2001
From: Matthew Garrett <mjg59@google.com>
Date: Mon, 20 May 2019 13:54:58 -0700
Subject: tpm: Abstract crypto agile event size calculations

We need to calculate the size of crypto agile events in multiple
locations, including in the EFI boot stub. The easiest way to do this is
to put it in a header file as an inline and leave a wrapper to ensure we
don't end up with multiple copies of it embedded in the existing code.

Signed-off-by: Matthew Garrett <mjg59@google.com>
Reviewed-by: Bartosz Szczepanek <bsz@semihalf.com>
Tested-by: Bartosz Szczepanek <bsz@semihalf.com>
Reviewed-by: Jarkko Sakkinen <jarkko.sakkinen@linux.intel.com>
Tested-by: Jarkko Sakkinen <jarkko.sakkinen@linux.intel.com>
Signed-off-by: Jarkko Sakkinen <jarkko.sakkinen@linux.intel.com>
---
 drivers/char/tpm/eventlog/tpm2.c | 47 +--------------------------
 include/linux/tpm_eventlog.h     | 68 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 69 insertions(+), 46 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/char/tpm/eventlog/tpm2.c b/drivers/char/tpm/eventlog/tpm2.c
index d506362e046f..22b3346b9946 100644
--- a/drivers/char/tpm/eventlog/tpm2.c
+++ b/drivers/char/tpm/eventlog/tpm2.c
@@ -36,52 +36,7 @@
 static size_t calc_tpm2_event_size(struct tcg_pcr_event2_head *event,
 				   struct tcg_pcr_event *event_header)
 {
-	struct tcg_efi_specid_event_head *efispecid;
-	struct tcg_event_field *event_field;
-	void *marker;
-	void *marker_start;
-	u32 halg_size;
-	size_t size;
-	u16 halg;
-	int i;
-	int j;
-
-	marker = event;
-	marker_start = marker;
-	marker = marker + sizeof(event->pcr_idx) + sizeof(event->event_type)
-		+ sizeof(event->count);
-
-	efispecid = (struct tcg_efi_specid_event_head *)event_header->event;
-
-	/* Check if event is malformed. */
-	if (event->count > efispecid->num_algs)
-		return 0;
-
-	for (i = 0; i < event->count; i++) {
-		halg_size = sizeof(event->digests[i].alg_id);
-		memcpy(&halg, marker, halg_size);
-		marker = marker + halg_size;
-		for (j = 0; j < efispecid->num_algs; j++) {
-			if (halg == efispecid->digest_sizes[j].alg_id) {
-				marker +=
-					efispecid->digest_sizes[j].digest_size;
-				break;
-			}
-		}
-		/* Algorithm without known length. Such event is unparseable. */
-		if (j == efispecid->num_algs)
-			return 0;
-	}
-
-	event_field = (struct tcg_event_field *)marker;
-	marker = marker + sizeof(event_field->event_size)
-		+ event_field->event_size;
-	size = marker - marker_start;
-
-	if ((event->event_type == 0) && (event_field->event_size == 0))
-		return 0;
-
-	return size;
+	return __calc_tpm2_event_size(event, event_header);
 }
 
 static void *tpm2_bios_measurements_start(struct seq_file *m, loff_t *pos)
diff --git a/include/linux/tpm_eventlog.h b/include/linux/tpm_eventlog.h
index 81519f163211..6a86144e13f1 100644
--- a/include/linux/tpm_eventlog.h
+++ b/include/linux/tpm_eventlog.h
@@ -112,4 +112,72 @@ struct tcg_pcr_event2_head {
 	struct tpm_digest digests[];
 } __packed;
 
+/**
+ * __calc_tpm2_event_size - calculate the size of a TPM2 event log entry
+ * @event:        Pointer to the event whose size should be calculated
+ * @event_header: Pointer to the initial event containing the digest lengths
+ *
+ * The TPM2 event log format can contain multiple digests corresponding to
+ * separate PCR banks, and also contains a variable length of the data that
+ * was measured. This requires knowledge of how long each digest type is,
+ * and this information is contained within the first event in the log.
+ *
+ * We calculate the length by examining the number of digests, and then looking
+ * at each digest in turn to determine how much space is used for digests in
+ * total. Once we've done this we know the offset of the data length field,
+ * and can calculate the total size of the event.
+ *
+ * Return: size of the event on success, 0 on failure
+ */
+
+static inline int __calc_tpm2_event_size(struct tcg_pcr_event2_head *event,
+					 struct tcg_pcr_event *event_header)
+{
+	struct tcg_efi_specid_event_head *efispecid;
+	struct tcg_event_field *event_field;
+	void *marker;
+	void *marker_start;
+	u32 halg_size;
+	size_t size;
+	u16 halg;
+	int i;
+	int j;
+
+	marker = event;
+	marker_start = marker;
+	marker = marker + sizeof(event->pcr_idx) + sizeof(event->event_type)
+		+ sizeof(event->count);
+
+	efispecid = (struct tcg_efi_specid_event_head *)event_header->event;
+
+	/* Check if event is malformed. */
+	if (event->count > efispecid->num_algs)
+		return 0;
+
+	for (i = 0; i < event->count; i++) {
+		halg_size = sizeof(event->digests[i].alg_id);
+		memcpy(&halg, marker, halg_size);
+		marker = marker + halg_size;
+		for (j = 0; j < efispecid->num_algs; j++) {
+			if (halg == efispecid->digest_sizes[j].alg_id) {
+				marker +=
+					efispecid->digest_sizes[j].digest_size;
+				break;
+			}
+		}
+		/* Algorithm without known length. Such event is unparseable. */
+		if (j == efispecid->num_algs)
+			return 0;
+	}
+
+	event_field = (struct tcg_event_field *)marker;
+	marker = marker + sizeof(event_field->event_size)
+		+ event_field->event_size;
+	size = marker - marker_start;
+
+	if ((event->event_type == 0) && (event_field->event_size == 0))
+		return 0;
+
+	return size;
+}
 #endif
-- 
cgit v1.2.3


From c46f3405692de1ac82240d927b9c7a0f9d6a4a36 Mon Sep 17 00:00:00 2001
From: Matthew Garrett <mjg59@google.com>
Date: Mon, 20 May 2019 13:54:59 -0700
Subject: tpm: Reserve the TPM final events table

UEFI systems provide a boot services protocol for obtaining the TPM
event log, but this is unusable after ExitBootServices() is called.
Unfortunately ExitBootServices() itself triggers additional TPM events
that then can't be obtained using this protocol. The platform provides a
mechanism for the OS to obtain these events by recording them to a
separate UEFI configuration table which the OS can then map.

Unfortunately this table isn't self describing in terms of providing its
length, so we need to parse the events inside it to figure out how long
it is. Since the table isn't mapped at this point, we need to extend the
length calculation function to be able to map the event as it goes
along.

(Fixes by Bartosz Szczepanek <bsz@semihalf.com>)

Signed-off-by: Matthew Garrett <mjg59@google.com>
Acked-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Reviewed-by: Bartosz Szczepanek <bsz@semihalf.com>
Tested-by: Bartosz Szczepanek <bsz@semihalf.com>
Reviewed-by: Jarkko Sakkinen <jarkko.sakkinen@linux.intel.com>
Tested-by: Jarkko Sakkinen <jarkko.sakkinen@linux.intel.com>
Signed-off-by: Jarkko Sakkinen <jarkko.sakkinen@linux.intel.com>
---
 drivers/char/tpm/eventlog/tpm2.c |   2 +-
 drivers/firmware/efi/efi.c       |   2 +
 drivers/firmware/efi/tpm.c       |  63 ++++++++++++++++++++++--
 include/linux/efi.h              |   9 ++++
 include/linux/tpm_eventlog.h     | 102 +++++++++++++++++++++++++++++++++++----
 5 files changed, 164 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/char/tpm/eventlog/tpm2.c b/drivers/char/tpm/eventlog/tpm2.c
index 22b3346b9946..b9aeda1cbcd7 100644
--- a/drivers/char/tpm/eventlog/tpm2.c
+++ b/drivers/char/tpm/eventlog/tpm2.c
@@ -36,7 +36,7 @@
 static size_t calc_tpm2_event_size(struct tcg_pcr_event2_head *event,
 				   struct tcg_pcr_event *event_header)
 {
-	return __calc_tpm2_event_size(event, event_header);
+	return __calc_tpm2_event_size(event, event_header, false);
 }
 
 static void *tpm2_bios_measurements_start(struct seq_file *m, loff_t *pos)
diff --git a/drivers/firmware/efi/efi.c b/drivers/firmware/efi/efi.c
index 16b2137d117c..a449e645c44f 100644
--- a/drivers/firmware/efi/efi.c
+++ b/drivers/firmware/efi/efi.c
@@ -52,6 +52,7 @@ struct efi __read_mostly efi = {
 	.mem_attr_table		= EFI_INVALID_TABLE_ADDR,
 	.rng_seed		= EFI_INVALID_TABLE_ADDR,
 	.tpm_log		= EFI_INVALID_TABLE_ADDR,
+	.tpm_final_log		= EFI_INVALID_TABLE_ADDR,
 	.mem_reserve		= EFI_INVALID_TABLE_ADDR,
 };
 EXPORT_SYMBOL(efi);
@@ -484,6 +485,7 @@ static __initdata efi_config_table_type_t common_tables[] = {
 	{EFI_MEMORY_ATTRIBUTES_TABLE_GUID, "MEMATTR", &efi.mem_attr_table},
 	{LINUX_EFI_RANDOM_SEED_TABLE_GUID, "RNG", &efi.rng_seed},
 	{LINUX_EFI_TPM_EVENT_LOG_GUID, "TPMEventLog", &efi.tpm_log},
+	{LINUX_EFI_TPM_FINAL_LOG_GUID, "TPMFinalLog", &efi.tpm_final_log},
 	{LINUX_EFI_MEMRESERVE_TABLE_GUID, "MEMRESERVE", &efi.mem_reserve},
 	{NULL_GUID, NULL, NULL},
 };
diff --git a/drivers/firmware/efi/tpm.c b/drivers/firmware/efi/tpm.c
index 3a689b40ccc0..1d3f5ca3eaaf 100644
--- a/drivers/firmware/efi/tpm.c
+++ b/drivers/firmware/efi/tpm.c
@@ -4,11 +4,34 @@
  *     Thiebaud Weksteen <tweek@google.com>
  */
 
+#define TPM_MEMREMAP(start, size) early_memremap(start, size)
+#define TPM_MEMUNMAP(start, size) early_memunmap(start, size)
+
+#include <asm/early_ioremap.h>
 #include <linux/efi.h>
 #include <linux/init.h>
 #include <linux/memblock.h>
+#include <linux/tpm_eventlog.h>
 
-#include <asm/early_ioremap.h>
+int efi_tpm_final_log_size;
+EXPORT_SYMBOL(efi_tpm_final_log_size);
+
+static int tpm2_calc_event_log_size(void *data, int count, void *size_info)
+{
+	struct tcg_pcr_event2_head *header;
+	int event_size, size = 0;
+
+	while (count > 0) {
+		header = data + size;
+		event_size = __calc_tpm2_event_size(header, size_info, true);
+		if (event_size == 0)
+			return -1;
+		size += event_size;
+		count--;
+	}
+
+	return size;
+}
 
 /*
  * Reserve the memory associated with the TPM Event Log configuration table.
@@ -16,22 +39,54 @@
 int __init efi_tpm_eventlog_init(void)
 {
 	struct linux_efi_tpm_eventlog *log_tbl;
+	struct efi_tcg2_final_events_table *final_tbl;
 	unsigned int tbl_size;
+	int ret = 0;
 
-	if (efi.tpm_log == EFI_INVALID_TABLE_ADDR)
+	if (efi.tpm_log == EFI_INVALID_TABLE_ADDR) {
+		/*
+		 * We can't calculate the size of the final events without the
+		 * first entry in the TPM log, so bail here.
+		 */
 		return 0;
+	}
 
 	log_tbl = early_memremap(efi.tpm_log, sizeof(*log_tbl));
 	if (!log_tbl) {
 		pr_err("Failed to map TPM Event Log table @ 0x%lx\n",
-			efi.tpm_log);
+		       efi.tpm_log);
 		efi.tpm_log = EFI_INVALID_TABLE_ADDR;
 		return -ENOMEM;
 	}
 
 	tbl_size = sizeof(*log_tbl) + log_tbl->size;
 	memblock_reserve(efi.tpm_log, tbl_size);
+
+	if (efi.tpm_final_log == EFI_INVALID_TABLE_ADDR)
+		goto out;
+
+	final_tbl = early_memremap(efi.tpm_final_log, sizeof(*final_tbl));
+
+	if (!final_tbl) {
+		pr_err("Failed to map TPM Final Event Log table @ 0x%lx\n",
+		       efi.tpm_final_log);
+		efi.tpm_final_log = EFI_INVALID_TABLE_ADDR;
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	tbl_size = tpm2_calc_event_log_size((void *)efi.tpm_final_log
+					    + sizeof(final_tbl->version)
+					    + sizeof(final_tbl->nr_events),
+					    final_tbl->nr_events,
+					    log_tbl->log);
+	memblock_reserve((unsigned long)final_tbl,
+			 tbl_size + sizeof(*final_tbl));
+	early_memunmap(final_tbl, sizeof(*final_tbl));
+	efi_tpm_final_log_size = tbl_size;
+
+out:
 	early_memunmap(log_tbl, sizeof(*log_tbl));
-	return 0;
+	return ret;
 }
 
diff --git a/include/linux/efi.h b/include/linux/efi.h
index 6ebc2098cfe1..b391263d8ec6 100644
--- a/include/linux/efi.h
+++ b/include/linux/efi.h
@@ -689,6 +689,7 @@ void efi_native_runtime_setup(void);
 #define LINUX_EFI_LOADER_ENTRY_GUID		EFI_GUID(0x4a67b082, 0x0a4c, 0x41cf,  0xb6, 0xc7, 0x44, 0x0b, 0x29, 0xbb, 0x8c, 0x4f)
 #define LINUX_EFI_RANDOM_SEED_TABLE_GUID	EFI_GUID(0x1ce1e5bc, 0x7ceb, 0x42f2,  0x81, 0xe5, 0x8a, 0xad, 0xf1, 0x80, 0xf5, 0x7b)
 #define LINUX_EFI_TPM_EVENT_LOG_GUID		EFI_GUID(0xb7799cb0, 0xeca2, 0x4943,  0x96, 0x67, 0x1f, 0xae, 0x07, 0xb7, 0x47, 0xfa)
+#define LINUX_EFI_TPM_FINAL_LOG_GUID		EFI_GUID(0x1e2ed096, 0x30e2, 0x4254,  0xbd, 0x89, 0x86, 0x3b, 0xbe, 0xf8, 0x23, 0x25)
 #define LINUX_EFI_MEMRESERVE_TABLE_GUID		EFI_GUID(0x888eb0c6, 0x8ede, 0x4ff5,  0xa8, 0xf0, 0x9a, 0xee, 0x5c, 0xb9, 0x77, 0xc2)
 
 typedef struct {
@@ -996,6 +997,7 @@ extern struct efi {
 	unsigned long mem_attr_table;	/* memory attributes table */
 	unsigned long rng_seed;		/* UEFI firmware random seed */
 	unsigned long tpm_log;		/* TPM2 Event Log table */
+	unsigned long tpm_final_log;	/* TPM2 Final Events Log table */
 	unsigned long mem_reserve;	/* Linux EFI memreserve table */
 	efi_get_time_t *get_time;
 	efi_set_time_t *set_time;
@@ -1712,6 +1714,13 @@ struct linux_efi_tpm_eventlog {
 
 extern int efi_tpm_eventlog_init(void);
 
+struct efi_tcg2_final_events_table {
+	u64 version;
+	u64 nr_events;
+	u8 events[];
+};
+extern int efi_tpm_final_log_size;
+
 /*
  * efi_runtime_service() function identifiers.
  * "NONE" is used by efi_recover_from_page_fault() to check if the page
diff --git a/include/linux/tpm_eventlog.h b/include/linux/tpm_eventlog.h
index 6a86144e13f1..63238c84dc0b 100644
--- a/include/linux/tpm_eventlog.h
+++ b/include/linux/tpm_eventlog.h
@@ -112,10 +112,35 @@ struct tcg_pcr_event2_head {
 	struct tpm_digest digests[];
 } __packed;
 
+struct tcg_algorithm_size {
+	u16 algorithm_id;
+	u16 algorithm_size;
+};
+
+struct tcg_algorithm_info {
+	u8 signature[16];
+	u32 platform_class;
+	u8 spec_version_minor;
+	u8 spec_version_major;
+	u8 spec_errata;
+	u8 uintn_size;
+	u32 number_of_algorithms;
+	struct tcg_algorithm_size digest_sizes[];
+};
+
+#ifndef TPM_MEMREMAP
+#define TPM_MEMREMAP(start, size) NULL
+#endif
+
+#ifndef TPM_MEMUNMAP
+#define TPM_MEMUNMAP(start, size) do{} while(0)
+#endif
+
 /**
  * __calc_tpm2_event_size - calculate the size of a TPM2 event log entry
  * @event:        Pointer to the event whose size should be calculated
  * @event_header: Pointer to the initial event containing the digest lengths
+ * @do_mapping:   Whether or not the event needs to be mapped
  *
  * The TPM2 event log format can contain multiple digests corresponding to
  * separate PCR banks, and also contains a variable length of the data that
@@ -131,10 +156,13 @@ struct tcg_pcr_event2_head {
  */
 
 static inline int __calc_tpm2_event_size(struct tcg_pcr_event2_head *event,
-					 struct tcg_pcr_event *event_header)
+					 struct tcg_pcr_event *event_header,
+					 bool do_mapping)
 {
 	struct tcg_efi_specid_event_head *efispecid;
 	struct tcg_event_field *event_field;
+	void *mapping = NULL;
+	int mapping_size;
 	void *marker;
 	void *marker_start;
 	u32 halg_size;
@@ -148,16 +176,49 @@ static inline int __calc_tpm2_event_size(struct tcg_pcr_event2_head *event,
 	marker = marker + sizeof(event->pcr_idx) + sizeof(event->event_type)
 		+ sizeof(event->count);
 
+	/* Map the event header */
+	if (do_mapping) {
+		mapping_size = marker - marker_start;
+		mapping = TPM_MEMREMAP((unsigned long)marker_start,
+				       mapping_size);
+		if (!mapping) {
+			size = 0;
+			goto out;
+		}
+	} else {
+		mapping = marker_start;
+	}
+
+	event = (struct tcg_pcr_event2_head *)mapping;
+
 	efispecid = (struct tcg_efi_specid_event_head *)event_header->event;
 
 	/* Check if event is malformed. */
-	if (event->count > efispecid->num_algs)
-		return 0;
+	if (event->count > efispecid->num_algs) {
+		size = 0;
+		goto out;
+	}
 
 	for (i = 0; i < event->count; i++) {
 		halg_size = sizeof(event->digests[i].alg_id);
-		memcpy(&halg, marker, halg_size);
+
+		/* Map the digest's algorithm identifier */
+		if (do_mapping) {
+			TPM_MEMUNMAP(mapping, mapping_size);
+			mapping_size = halg_size;
+			mapping = TPM_MEMREMAP((unsigned long)marker,
+					     mapping_size);
+			if (!mapping) {
+				size = 0;
+				goto out;
+			}
+		} else {
+			mapping = marker;
+		}
+
+		memcpy(&halg, mapping, halg_size);
 		marker = marker + halg_size;
+
 		for (j = 0; j < efispecid->num_algs; j++) {
 			if (halg == efispecid->digest_sizes[j].alg_id) {
 				marker +=
@@ -166,18 +227,41 @@ static inline int __calc_tpm2_event_size(struct tcg_pcr_event2_head *event,
 			}
 		}
 		/* Algorithm without known length. Such event is unparseable. */
-		if (j == efispecid->num_algs)
-			return 0;
+		if (j == efispecid->num_algs) {
+			size = 0;
+			goto out;
+		}
+	}
+
+	/*
+	 * Map the event size - we don't read from the event itself, so
+	 * we don't need to map it
+	 */
+	if (do_mapping) {
+		TPM_MEMUNMAP(mapping, mapping_size);
+		mapping_size += sizeof(event_field->event_size);
+		mapping = TPM_MEMREMAP((unsigned long)marker,
+				       mapping_size);
+		if (!mapping) {
+			size = 0;
+			goto out;
+		}
+	} else {
+		mapping = marker;
 	}
 
-	event_field = (struct tcg_event_field *)marker;
+	event_field = (struct tcg_event_field *)mapping;
+
 	marker = marker + sizeof(event_field->event_size)
 		+ event_field->event_size;
 	size = marker - marker_start;
 
 	if ((event->event_type == 0) && (event_field->event_size == 0))
-		return 0;
-
+		size = 0;
+out:
+	if (do_mapping)
+		TPM_MEMUNMAP(mapping, mapping_size);
 	return size;
 }
+
 #endif
-- 
cgit v1.2.3


From 166a2809d65b282272c474835ec22c882a39ca1b Mon Sep 17 00:00:00 2001
From: Matthew Garrett <matthewgarrett@google.com>
Date: Fri, 7 Jun 2019 13:51:47 -0700
Subject: tpm: Don't duplicate events from the final event log in the TCG2 log

After the first call to GetEventLog() on UEFI systems using the TCG2
crypto agile log format, any further log events (other than those
triggered by ExitBootServices()) will be logged in both the main log and
also in the Final Events Log. While the kernel only calls GetEventLog()
immediately before ExitBootServices(), we can't control whether earlier
parts of the boot process have done so. This will result in log entries
that exist in both logs, and so the current approach of simply appending
the Final Event Log to the main log will result in events being
duplicated.

We can avoid this problem by looking at the size of the Final Event Log
just before we call ExitBootServices() and exporting this to the main
kernel. The kernel can then skip over all events that occurred before
ExitBootServices() and only append events that were not also logged to
the main log.

Signed-off-by: Matthew Garrett <mjg59@google.com>
Reported-by: Joe Richey <joerichey@google.com>
Suggested-by: Joe Richey <joerichey@google.com>
Acked-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Reviewed-by: Jarkko Sakkinen <jarkko.sakkinen@linux.intel.com>
Tested-by: Jarkko Sakkinen <jarkko.sakkinen@linux.intel.com>
Signed-off-by: Jarkko Sakkinen <jarkko.sakkinen@linux.intel.com>
---
 drivers/char/tpm/eventlog/efi.c    | 11 ++++++++++-
 drivers/firmware/efi/libstub/tpm.c | 30 ++++++++++++++++++++++++++++++
 include/linux/efi.h                |  1 +
 3 files changed, 41 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/char/tpm/eventlog/efi.c b/drivers/char/tpm/eventlog/efi.c
index 73b981026627..6bb023de17f1 100644
--- a/drivers/char/tpm/eventlog/efi.c
+++ b/drivers/char/tpm/eventlog/efi.c
@@ -75,6 +75,8 @@ int tpm_read_log_efi(struct tpm_chip *chip)
 		goto out;
 	}
 
+	efi_tpm_final_log_size -= log_tbl->final_events_preboot_size;
+
 	tmp = krealloc(log->bios_event_log,
 		       log_size + efi_tpm_final_log_size,
 		       GFP_KERNEL);
@@ -85,8 +87,15 @@ int tpm_read_log_efi(struct tpm_chip *chip)
 	}
 
 	log->bios_event_log = tmp;
+
+	/*
+	 * Copy any of the final events log that didn't also end up in the
+	 * main log. Events can be logged in both if events are generated
+	 * between GetEventLog() and ExitBootServices().
+	 */
 	memcpy((void *)log->bios_event_log + log_size,
-	       final_tbl->events, efi_tpm_final_log_size);
+	       final_tbl->events + log_tbl->final_events_preboot_size,
+	       efi_tpm_final_log_size);
 	log->bios_event_log_end = log->bios_event_log +
 		log_size + efi_tpm_final_log_size;
 
diff --git a/drivers/firmware/efi/libstub/tpm.c b/drivers/firmware/efi/libstub/tpm.c
index 6b3b507a54eb..eb9af83e4d59 100644
--- a/drivers/firmware/efi/libstub/tpm.c
+++ b/drivers/firmware/efi/libstub/tpm.c
@@ -64,11 +64,13 @@ void efi_retrieve_tpm2_eventlog(efi_system_table_t *sys_table_arg)
 	efi_status_t status;
 	efi_physical_addr_t log_location = 0, log_last_entry = 0;
 	struct linux_efi_tpm_eventlog *log_tbl = NULL;
+	struct efi_tcg2_final_events_table *final_events_table;
 	unsigned long first_entry_addr, last_entry_addr;
 	size_t log_size, last_entry_size;
 	efi_bool_t truncated;
 	int version = EFI_TCG2_EVENT_LOG_FORMAT_TCG_2;
 	void *tcg2_protocol = NULL;
+	int final_events_size = 0;
 
 	status = efi_call_early(locate_protocol, &tcg2_guid, NULL,
 				&tcg2_protocol);
@@ -134,8 +136,36 @@ void efi_retrieve_tpm2_eventlog(efi_system_table_t *sys_table_arg)
 		return;
 	}
 
+	/*
+	 * Figure out whether any events have already been logged to the
+	 * final events structure, and if so how much space they take up
+	 */
+	final_events_table = get_efi_config_table(sys_table_arg,
+						LINUX_EFI_TPM_FINAL_LOG_GUID);
+	if (final_events_table && final_events_table->nr_events) {
+		struct tcg_pcr_event2_head *header;
+		int offset;
+		void *data;
+		int event_size;
+		int i = final_events_table->nr_events;
+
+		data = (void *)final_events_table;
+		offset = sizeof(final_events_table->version) +
+			sizeof(final_events_table->nr_events);
+
+		while (i > 0) {
+			header = data + offset + final_events_size;
+			event_size = __calc_tpm2_event_size(header,
+						   (void *)(long)log_location,
+						   false);
+			final_events_size += event_size;
+			i--;
+		}
+	}
+
 	memset(log_tbl, 0, sizeof(*log_tbl) + log_size);
 	log_tbl->size = log_size;
+	log_tbl->final_events_preboot_size = final_events_size;
 	log_tbl->version = version;
 	memcpy(log_tbl->log, (void *) first_entry_addr, log_size);
 
diff --git a/include/linux/efi.h b/include/linux/efi.h
index b391263d8ec6..f87fabea4a85 100644
--- a/include/linux/efi.h
+++ b/include/linux/efi.h
@@ -1708,6 +1708,7 @@ struct linux_efi_random_seed {
 
 struct linux_efi_tpm_eventlog {
 	u32	size;
+	u32	final_events_preboot_size;
 	u8	version;
 	u8	log[];
 };
-- 
cgit v1.2.3


From 792c4e9d0bbb53b34bf1c07c2ef25609d746c57d Mon Sep 17 00:00:00 2001
From: Matthew Wilcox <willy@infradead.org>
Date: Thu, 20 Jun 2019 07:03:47 +0000
Subject: net/mlx5: Convert mkey_table to XArray

The lock protecting the data structure does not need to be an rwlock.  The
only read access to the lock is in an error path, and if that's limiting
your scalability, you have bigger performance problems.

Eliminate mlx5_mkey_table in favour of using the xarray directly.
reg_mr_callback must use GFP_ATOMIC for allocating XArray nodes as it may
be called in interrupt context.

This also fixes a minor bug where SRCU locking was being used on the radix
tree read side, when RCU was needed too.

Signed-off-by: Matthew Wilcox <willy@infradead.org>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/infiniband/hw/mlx5/cq.c              |  8 ++++----
 drivers/infiniband/hw/mlx5/devx.c            | 18 ++++--------------
 drivers/infiniband/hw/mlx5/mr.c              | 10 +++++-----
 drivers/infiniband/hw/mlx5/odp.c             | 10 +++++-----
 drivers/net/ethernet/mellanox/mlx5/core/mr.c | 27 +++++++++++----------------
 include/linux/mlx5/driver.h                  | 13 ++-----------
 include/linux/mlx5/qp.h                      |  5 -----
 7 files changed, 31 insertions(+), 60 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/infiniband/hw/mlx5/cq.c b/drivers/infiniband/hw/mlx5/cq.c
index 2e2e65f00257..0220736b073e 100644
--- a/drivers/infiniband/hw/mlx5/cq.c
+++ b/drivers/infiniband/hw/mlx5/cq.c
@@ -522,9 +522,9 @@ repoll:
 	case MLX5_CQE_SIG_ERR:
 		sig_err_cqe = (struct mlx5_sig_err_cqe *)cqe64;
 
-		read_lock(&dev->mdev->priv.mkey_table.lock);
-		mmkey = __mlx5_mr_lookup(dev->mdev,
-					 mlx5_base_mkey(be32_to_cpu(sig_err_cqe->mkey)));
+		xa_lock(&dev->mdev->priv.mkey_table);
+		mmkey = xa_load(&dev->mdev->priv.mkey_table,
+				mlx5_base_mkey(be32_to_cpu(sig_err_cqe->mkey)));
 		mr = to_mibmr(mmkey);
 		get_sig_err_item(sig_err_cqe, &mr->sig->err_item);
 		mr->sig->sig_err_exists = true;
@@ -537,7 +537,7 @@ repoll:
 			     mr->sig->err_item.expected,
 			     mr->sig->err_item.actual);
 
-		read_unlock(&dev->mdev->priv.mkey_table.lock);
+		xa_unlock(&dev->mdev->priv.mkey_table);
 		goto repoll;
 	}
 
diff --git a/drivers/infiniband/hw/mlx5/devx.c b/drivers/infiniband/hw/mlx5/devx.c
index 80b42d069328..931f587dfb8f 100644
--- a/drivers/infiniband/hw/mlx5/devx.c
+++ b/drivers/infiniband/hw/mlx5/devx.c
@@ -1043,13 +1043,10 @@ static int devx_handle_mkey_indirect(struct devx_obj *obj,
 				     struct mlx5_ib_dev *dev,
 				     void *in, void *out)
 {
-	struct mlx5_mkey_table *table = &dev->mdev->priv.mkey_table;
 	struct mlx5_ib_devx_mr *devx_mr = &obj->devx_mr;
-	unsigned long flags;
 	struct mlx5_core_mkey *mkey;
 	void *mkc;
 	u8 key;
-	int err;
 
 	mkey = &devx_mr->mmkey;
 	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
@@ -1062,11 +1059,8 @@ static int devx_handle_mkey_indirect(struct devx_obj *obj,
 	mkey->pd = MLX5_GET(mkc, mkc, pd);
 	devx_mr->ndescs = MLX5_GET(mkc, mkc, translations_octword_size);
 
-	write_lock_irqsave(&table->lock, flags);
-	err = radix_tree_insert(&table->tree, mlx5_base_mkey(mkey->key),
-				mkey);
-	write_unlock_irqrestore(&table->lock, flags);
-	return err;
+	return xa_err(xa_store(&dev->mdev->priv.mkey_table,
+			       mlx5_base_mkey(mkey->key), mkey, GFP_KERNEL));
 }
 
 static int devx_handle_mkey_create(struct mlx5_ib_dev *dev,
@@ -1117,12 +1111,8 @@ static void devx_free_indirect_mkey(struct rcu_head *rcu)
  */
 static void devx_cleanup_mkey(struct devx_obj *obj)
 {
-	struct mlx5_mkey_table *table = &obj->mdev->priv.mkey_table;
-	unsigned long flags;
-
-	write_lock_irqsave(&table->lock, flags);
-	radix_tree_delete(&table->tree, mlx5_base_mkey(obj->devx_mr.mmkey.key));
-	write_unlock_irqrestore(&table->lock, flags);
+	xa_erase(&obj->mdev->priv.mkey_table,
+		 mlx5_base_mkey(obj->devx_mr.mmkey.key));
 }
 
 static int devx_obj_cleanup(struct ib_uobject *uobject,
diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c
index 5f09699fab98..83b452d977d4 100644
--- a/drivers/infiniband/hw/mlx5/mr.c
+++ b/drivers/infiniband/hw/mlx5/mr.c
@@ -130,7 +130,7 @@ static void reg_mr_callback(int status, struct mlx5_async_work *context)
 	struct mlx5_cache_ent *ent = &cache->ent[c];
 	u8 key;
 	unsigned long flags;
-	struct mlx5_mkey_table *table = &dev->mdev->priv.mkey_table;
+	struct xarray *mkeys = &dev->mdev->priv.mkey_table;
 	int err;
 
 	spin_lock_irqsave(&ent->lock, flags);
@@ -158,12 +158,12 @@ static void reg_mr_callback(int status, struct mlx5_async_work *context)
 	ent->size++;
 	spin_unlock_irqrestore(&ent->lock, flags);
 
-	write_lock_irqsave(&table->lock, flags);
-	err = radix_tree_insert(&table->tree, mlx5_base_mkey(mr->mmkey.key),
-				&mr->mmkey);
+	xa_lock_irqsave(mkeys, flags);
+	err = xa_err(__xa_store(mkeys, mlx5_base_mkey(mr->mmkey.key),
+				&mr->mmkey, GFP_ATOMIC));
+	xa_unlock_irqrestore(mkeys, flags);
 	if (err)
 		pr_err("Error inserting to mkey tree. 0x%x\n", -err);
-	write_unlock_irqrestore(&table->lock, flags);
 
 	if (!completion_done(&ent->compl))
 		complete(&ent->compl);
diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c
index 12ccee1eb047..c594489eb2d7 100644
--- a/drivers/infiniband/hw/mlx5/odp.c
+++ b/drivers/infiniband/hw/mlx5/odp.c
@@ -768,7 +768,7 @@ static int pagefault_single_data_segment(struct mlx5_ib_dev *dev,
 	bcnt -= *bytes_committed;
 
 next_mr:
-	mmkey = __mlx5_mr_lookup(dev->mdev, mlx5_base_mkey(key));
+	mmkey = xa_load(&dev->mdev->priv.mkey_table, mlx5_base_mkey(key));
 	if (!mkey_is_eq(mmkey, key)) {
 		mlx5_ib_dbg(dev, "failed to find mkey %x\n", key);
 		ret = -EFAULT;
@@ -1686,8 +1686,8 @@ static void num_pending_prefetch_dec(struct mlx5_ib_dev *dev,
 		struct mlx5_core_mkey *mmkey;
 		struct mlx5_ib_mr *mr;
 
-		mmkey = __mlx5_mr_lookup(dev->mdev,
-					 mlx5_base_mkey(sg_list[i].lkey));
+		mmkey = xa_load(&dev->mdev->priv.mkey_table,
+				mlx5_base_mkey(sg_list[i].lkey));
 		mr = container_of(mmkey, struct mlx5_ib_mr, mmkey);
 		atomic_dec(&mr->num_pending_prefetch);
 	}
@@ -1706,8 +1706,8 @@ static bool num_pending_prefetch_inc(struct ib_pd *pd,
 		struct mlx5_core_mkey *mmkey;
 		struct mlx5_ib_mr *mr;
 
-		mmkey = __mlx5_mr_lookup(dev->mdev,
-					 mlx5_base_mkey(sg_list[i].lkey));
+		mmkey = xa_load(&dev->mdev->priv.mkey_table,
+				mlx5_base_mkey(sg_list[i].lkey));
 		if (!mmkey || mmkey->key != sg_list[i].lkey) {
 			ret = false;
 			break;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/mr.c b/drivers/net/ethernet/mellanox/mlx5/core/mr.c
index ea744d8466ea..9231b39d18b2 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/mr.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/mr.c
@@ -38,15 +38,12 @@
 
 void mlx5_init_mkey_table(struct mlx5_core_dev *dev)
 {
-	struct mlx5_mkey_table *table = &dev->priv.mkey_table;
-
-	memset(table, 0, sizeof(*table));
-	rwlock_init(&table->lock);
-	INIT_RADIX_TREE(&table->tree, GFP_ATOMIC);
+	xa_init_flags(&dev->priv.mkey_table, XA_FLAGS_LOCK_IRQ);
 }
 
 void mlx5_cleanup_mkey_table(struct mlx5_core_dev *dev)
 {
+	WARN_ON(!xa_empty(&dev->priv.mkey_table));
 }
 
 int mlx5_core_create_mkey_cb(struct mlx5_core_dev *dev,
@@ -56,8 +53,8 @@ int mlx5_core_create_mkey_cb(struct mlx5_core_dev *dev,
 			     mlx5_async_cbk_t callback,
 			     struct mlx5_async_work *context)
 {
-	struct mlx5_mkey_table *table = &dev->priv.mkey_table;
 	u32 lout[MLX5_ST_SZ_DW(create_mkey_out)] = {0};
+	struct xarray *mkeys = &dev->priv.mkey_table;
 	u32 mkey_index;
 	void *mkc;
 	int err;
@@ -88,12 +85,10 @@ int mlx5_core_create_mkey_cb(struct mlx5_core_dev *dev,
 	mlx5_core_dbg(dev, "out 0x%x, key 0x%x, mkey 0x%x\n",
 		      mkey_index, key, mkey->key);
 
-	/* connect to mkey tree */
-	write_lock_irq(&table->lock);
-	err = radix_tree_insert(&table->tree, mlx5_base_mkey(mkey->key), mkey);
-	write_unlock_irq(&table->lock);
+	err = xa_err(xa_store_irq(mkeys, mlx5_base_mkey(mkey->key), mkey,
+				  GFP_KERNEL));
 	if (err) {
-		mlx5_core_warn(dev, "failed radix tree insert of mkey 0x%x, %d\n",
+		mlx5_core_warn(dev, "failed xarray insert of mkey 0x%x, %d\n",
 			       mlx5_base_mkey(mkey->key), err);
 		mlx5_core_destroy_mkey(dev, mkey);
 	}
@@ -114,17 +109,17 @@ EXPORT_SYMBOL(mlx5_core_create_mkey);
 int mlx5_core_destroy_mkey(struct mlx5_core_dev *dev,
 			   struct mlx5_core_mkey *mkey)
 {
-	struct mlx5_mkey_table *table = &dev->priv.mkey_table;
 	u32 out[MLX5_ST_SZ_DW(destroy_mkey_out)] = {0};
 	u32 in[MLX5_ST_SZ_DW(destroy_mkey_in)]   = {0};
+	struct xarray *mkeys = &dev->priv.mkey_table;
 	struct mlx5_core_mkey *deleted_mkey;
 	unsigned long flags;
 
-	write_lock_irqsave(&table->lock, flags);
-	deleted_mkey = radix_tree_delete(&table->tree, mlx5_base_mkey(mkey->key));
-	write_unlock_irqrestore(&table->lock, flags);
+	xa_lock_irqsave(mkeys, flags);
+	deleted_mkey = __xa_erase(mkeys, mlx5_base_mkey(mkey->key));
+	xa_unlock_irqrestore(mkeys, flags);
 	if (!deleted_mkey) {
-		mlx5_core_dbg(dev, "failed radix tree delete of mkey 0x%x\n",
+		mlx5_core_dbg(dev, "failed xarray delete of mkey 0x%x\n",
 			      mlx5_base_mkey(mkey->key));
 		return -ENOENT;
 	}
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index d8ab633406c2..87f77ded78d4 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -41,7 +41,7 @@
 #include <linux/semaphore.h>
 #include <linux/slab.h>
 #include <linux/vmalloc.h>
-#include <linux/radix-tree.h>
+#include <linux/xarray.h>
 #include <linux/workqueue.h>
 #include <linux/mempool.h>
 #include <linux/interrupt.h>
@@ -452,13 +452,6 @@ struct mlx5_qp_table {
 	struct radix_tree_root	tree;
 };
 
-struct mlx5_mkey_table {
-	/* protect radix tree
-	 */
-	rwlock_t		lock;
-	struct radix_tree_root	tree;
-};
-
 struct mlx5_vf_context {
 	int	enabled;
 	u64	port_guid;
@@ -546,9 +539,7 @@ struct mlx5_priv {
 	struct dentry	       *cmdif_debugfs;
 	/* end: qp staff */
 
-	/* start: mkey staff */
-	struct mlx5_mkey_table	mkey_table;
-	/* end: mkey staff */
+	struct xarray           mkey_table;
 
 	/* start: alloc staff */
 	/* protect buffer alocation according to numa node */
diff --git a/include/linux/mlx5/qp.h b/include/linux/mlx5/qp.h
index 3ba4edbd17a6..d1f353c64797 100644
--- a/include/linux/mlx5/qp.h
+++ b/include/linux/mlx5/qp.h
@@ -551,11 +551,6 @@ static inline struct mlx5_core_qp *__mlx5_qp_lookup(struct mlx5_core_dev *dev, u
 	return radix_tree_lookup(&dev->priv.qp_table.tree, qpn);
 }
 
-static inline struct mlx5_core_mkey *__mlx5_mr_lookup(struct mlx5_core_dev *dev, u32 key)
-{
-	return radix_tree_lookup(&dev->priv.mkey_table.tree, key);
-}
-
 int mlx5_core_create_dct(struct mlx5_core_dev *dev,
 			 struct mlx5_core_dct *qp,
 			 u32 *in, int inlen,
-- 
cgit v1.2.3


From 4b85faed211ccfbcc7f3adf1cd62f0b00d1a172b Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 14 Jun 2019 16:06:10 +0200
Subject: dma-mapping: add a dma_alloc_need_uncached helper

Check if we need to allocate uncached memory for a device given the
allocation flags.  Switch over the uncached segment check to this helper
to deal with architectures that do not support the dma_cache_sync
operation and thus should not return cacheable memory for
DMA_ATTR_NON_CONSISTENT allocations.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 include/linux/dma-noncoherent.h | 14 ++++++++++++++
 kernel/dma/direct.c             |  4 ++--
 2 files changed, 16 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/dma-noncoherent.h b/include/linux/dma-noncoherent.h
index 7e0126a04e02..732919ac5c11 100644
--- a/include/linux/dma-noncoherent.h
+++ b/include/linux/dma-noncoherent.h
@@ -20,6 +20,20 @@ static inline bool dev_is_dma_coherent(struct device *dev)
 }
 #endif /* CONFIG_ARCH_HAS_DMA_COHERENCE_H */
 
+/*
+ * Check if an allocation needs to be marked uncached to be coherent.
+ */
+static inline bool dma_alloc_need_uncached(struct device *dev,
+		unsigned long attrs)
+{
+	if (dev_is_dma_coherent(dev))
+		return false;
+	if (IS_ENABLED(CONFIG_DMA_NONCOHERENT_CACHE_SYNC) &&
+	    (attrs & DMA_ATTR_NON_CONSISTENT))
+		return false;
+	return true;
+}
+
 void *arch_dma_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle,
 		gfp_t gfp, unsigned long attrs);
 void arch_dma_free(struct device *dev, size_t size, void *cpu_addr,
diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
index b67f0aa08aa3..c2893713bf80 100644
--- a/kernel/dma/direct.c
+++ b/kernel/dma/direct.c
@@ -160,7 +160,7 @@ void *dma_direct_alloc_pages(struct device *dev, size_t size,
 	memset(ret, 0, size);
 
 	if (IS_ENABLED(CONFIG_ARCH_HAS_UNCACHED_SEGMENT) &&
-	    !dev_is_dma_coherent(dev) && !(attrs & DMA_ATTR_NON_CONSISTENT)) {
+	    dma_alloc_need_uncached(dev, attrs)) {
 		arch_dma_prep_coherent(page, size);
 		ret = uncached_kernel_address(ret);
 	}
@@ -182,7 +182,7 @@ void dma_direct_free_pages(struct device *dev, size_t size, void *cpu_addr,
 		set_memory_encrypted((unsigned long)cpu_addr, 1 << page_order);
 
 	if (IS_ENABLED(CONFIG_ARCH_HAS_UNCACHED_SEGMENT) &&
-	    !dev_is_dma_coherent(dev) && !(attrs & DMA_ATTR_NON_CONSISTENT))
+	    dma_alloc_need_uncached(dev, attrs))
 		cpu_addr = cached_kernel_address(cpu_addr);
 	__dma_direct_free_pages(dev, size, virt_to_page(cpu_addr));
 }
-- 
cgit v1.2.3


From d48e0cd8fcaf314175a15d3076d7a1e71bd4e628 Mon Sep 17 00:00:00 2001
From: "Jason A. Donenfeld" <Jason@zx2c4.com>
Date: Mon, 24 Jun 2019 11:15:39 +0200
Subject: timekeeping: Boot should be boottime for coarse ns accessor

Somewhere in all the patchsets before, this cleanup got lost.

Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Arnd Bergmann <arnd@arndb.de>
Link: https://lkml.kernel.org/r/20190624091539.13512-1-Jason@zx2c4.com
---
 Documentation/core-api/timekeeping.rst | 2 +-
 include/linux/timekeeping.h            | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/core-api/timekeeping.rst b/Documentation/core-api/timekeeping.rst
index 15fc58e85ef9..20ee447a50f3 100644
--- a/Documentation/core-api/timekeeping.rst
+++ b/Documentation/core-api/timekeeping.rst
@@ -105,7 +105,7 @@ Some additional variants exist for more specialized cases:
 		ktime_t ktime_get_coarse_clocktai( void )
 
 .. c:function:: u64 ktime_get_coarse_ns( void )
-		u64 ktime_get_coarse_boot_ns( void )
+		u64 ktime_get_coarse_boottime_ns( void )
 		u64 ktime_get_coarse_real_ns( void )
 		u64 ktime_get_coarse_clocktai_ns( void )
 
diff --git a/include/linux/timekeeping.h b/include/linux/timekeeping.h
index dcffc00755f2..b27e2ffa96c1 100644
--- a/include/linux/timekeeping.h
+++ b/include/linux/timekeeping.h
@@ -131,7 +131,7 @@ static inline u64 ktime_get_coarse_real_ns(void)
 	return ktime_to_ns(ktime_get_coarse_real());
 }
 
-static inline u64 ktime_get_coarse_boot_ns(void)
+static inline u64 ktime_get_coarse_boottime_ns(void)
 {
 	return ktime_to_ns(ktime_get_coarse_boottime());
 }
-- 
cgit v1.2.3


From ec6516bfbaf72e7c81811162b6de96322e32a027 Mon Sep 17 00:00:00 2001
From: Masahiro Yamada <yamada.masahiro@socionext.com>
Date: Thu, 13 Jun 2019 10:55:31 +0900
Subject: pinctrl: remove unneeded #ifdef around declarations

What is the point of surrounding a whole set of declarations with
an ifdef like this?

  #ifdef CONFIG_FOO
  int foo(void);
  #endif

If CONFIG_FOO is not defined, all callers of foo() will fail
with implicit declaration errors since the top Makefile adds
-Werror-implicit-function-declaration to KBUILD_CFLAGS.

This breaks the build earlier when you are doing something wrong.
That's it.

Anyway, it will fail to link since the definition of foo() is not
compiled.

In summary, these ifdef are unneeded.

Signed-off-by: Masahiro Yamada <yamada.masahiro@socionext.com>
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
---
 include/linux/pinctrl/pinconf-generic.h | 20 ++++++--------------
 include/linux/pinctrl/pinconf.h         |  4 ----
 include/linux/pinctrl/pinctrl.h         |  4 ----
 include/linux/pinctrl/pinmux.h          |  4 ----
 4 files changed, 6 insertions(+), 26 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/pinctrl/pinconf-generic.h b/include/linux/pinctrl/pinconf-generic.h
index 72d06d6a3099..673828a52294 100644
--- a/include/linux/pinctrl/pinconf-generic.h
+++ b/include/linux/pinctrl/pinconf-generic.h
@@ -12,6 +12,12 @@
 #ifndef __LINUX_PINCTRL_PINCONF_GENERIC_H
 #define __LINUX_PINCTRL_PINCONF_GENERIC_H
 
+#include <linux/device.h>
+#include <linux/pinctrl/machine.h>
+
+struct pinctrl_dev;
+struct pinctrl_map;
+
 /**
  * enum pin_config_param - possible pin configuration parameters
  * @PIN_CONFIG_BIAS_BUS_HOLD: the pin will be set to weakly latch so that it
@@ -159,9 +165,6 @@ static inline unsigned long pinconf_to_config_packed(enum pin_config_param param
 	return PIN_CONF_PACKED(param, argument);
 }
 
-#ifdef CONFIG_GENERIC_PINCONF
-
-#ifdef CONFIG_DEBUG_FS
 #define PCONFDUMP(a, b, c, d) {					\
 	.param = a, .display = b, .format = c, .has_arg = d	\
 	}
@@ -172,14 +175,6 @@ struct pin_config_item {
 	const char * const format;
 	bool has_arg;
 };
-#endif /* CONFIG_DEBUG_FS */
-
-#ifdef CONFIG_OF
-
-#include <linux/device.h>
-#include <linux/pinctrl/machine.h>
-struct pinctrl_dev;
-struct pinctrl_map;
 
 struct pinconf_generic_params {
 	const char * const property;
@@ -224,8 +219,5 @@ static inline int pinconf_generic_dt_node_to_map_all(
 	return pinconf_generic_dt_node_to_map(pctldev, np_config, map, num_maps,
 			PIN_MAP_TYPE_INVALID);
 }
-#endif
-
-#endif /* CONFIG_GENERIC_PINCONF */
 
 #endif /* __LINUX_PINCTRL_PINCONF_GENERIC_H */
diff --git a/include/linux/pinctrl/pinconf.h b/include/linux/pinctrl/pinconf.h
index 9bebc3554809..513883dcc5a9 100644
--- a/include/linux/pinctrl/pinconf.h
+++ b/include/linux/pinctrl/pinconf.h
@@ -12,8 +12,6 @@
 #ifndef __LINUX_PINCTRL_PINCONF_H
 #define __LINUX_PINCTRL_PINCONF_H
 
-#ifdef CONFIG_PINCONF
-
 #include <linux/types.h>
 
 struct pinctrl_dev;
@@ -67,6 +65,4 @@ struct pinconf_ops {
 					    unsigned long config);
 };
 
-#endif
-
 #endif /* __LINUX_PINCTRL_PINCONF_H */
diff --git a/include/linux/pinctrl/pinctrl.h b/include/linux/pinctrl/pinctrl.h
index 36a79fe7b84f..27738164daa7 100644
--- a/include/linux/pinctrl/pinctrl.h
+++ b/include/linux/pinctrl/pinctrl.h
@@ -12,8 +12,6 @@
 #ifndef __LINUX_PINCTRL_PINCTRL_H
 #define __LINUX_PINCTRL_PINCTRL_H
 
-#ifdef CONFIG_PINCTRL
-
 #include <linux/radix-tree.h>
 #include <linux/list.h>
 #include <linux/seq_file.h>
@@ -203,6 +201,4 @@ extern const char *pinctrl_dev_get_name(struct pinctrl_dev *pctldev);
 extern const char *pinctrl_dev_get_devname(struct pinctrl_dev *pctldev);
 extern void *pinctrl_dev_get_drvdata(struct pinctrl_dev *pctldev);
 
-#endif /* !CONFIG_PINCTRL */
-
 #endif /* __LINUX_PINCTRL_PINCTRL_H */
diff --git a/include/linux/pinctrl/pinmux.h b/include/linux/pinctrl/pinmux.h
index ace60d775b20..566a5fe8eab5 100644
--- a/include/linux/pinctrl/pinmux.h
+++ b/include/linux/pinctrl/pinmux.h
@@ -16,8 +16,6 @@
 #include <linux/seq_file.h>
 #include <linux/pinctrl/pinctrl.h>
 
-#ifdef CONFIG_PINMUX
-
 struct pinctrl_dev;
 
 /**
@@ -85,6 +83,4 @@ struct pinmux_ops {
 	bool strict;
 };
 
-#endif /* CONFIG_PINMUX */
-
 #endif /* __LINUX_PINCTRL_PINMUX_H */
-- 
cgit v1.2.3


From d8ca7d184b33af7913c244900df77c6cad6a5590 Mon Sep 17 00:00:00 2001
From: Dmitry Osipenko <digetx@gmail.com>
Date: Mon, 24 Jun 2019 00:08:31 +0300
Subject: regulator: core: Introduce API for regulators coupling customization

Right now the regulator core supports only one type of regulator coupling,
the "voltage max-spread", which keeps the voltages of coupled regulators
within a given range of each other. A more sophisticated coupling may be
required in practice; one example is the NVIDIA Tegra SoCs, which besides
the max-spreading have other restrictions that must be adhered to.
Introduce an API that allows platforms to provide their own customized
coupling algorithms.

Signed-off-by: Dmitry Osipenko <digetx@gmail.com>
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 drivers/regulator/core.c          | 136 ++++++++++++++++++++++++++++++++------
 drivers/regulator/of_regulator.c  |  63 ++++++++++++------
 include/linux/regulator/coupler.h |  62 +++++++++++++++++
 include/linux/regulator/driver.h  |   6 +-
 include/linux/regulator/machine.h |   2 +-
 5 files changed, 225 insertions(+), 44 deletions(-)
 create mode 100644 include/linux/regulator/coupler.h

(limited to 'include/linux')

diff --git a/drivers/regulator/core.c b/drivers/regulator/core.c
index 955a0a15b9cb..12c870f790f5 100644
--- a/drivers/regulator/core.c
+++ b/drivers/regulator/core.c
@@ -28,6 +28,7 @@
 #include <linux/regmap.h>
 #include <linux/regulator/of_regulator.h>
 #include <linux/regulator/consumer.h>
+#include <linux/regulator/coupler.h>
 #include <linux/regulator/driver.h>
 #include <linux/regulator/machine.h>
 #include <linux/module.h>
@@ -55,6 +56,7 @@ static DEFINE_MUTEX(regulator_list_mutex);
 static LIST_HEAD(regulator_map_list);
 static LIST_HEAD(regulator_ena_gpio_list);
 static LIST_HEAD(regulator_supply_alias_list);
+static LIST_HEAD(regulator_coupler_list);
 static bool has_full_constraints;
 
 static struct dentry *debugfs_root;
@@ -3439,11 +3441,10 @@ static int regulator_get_optimal_voltage(struct regulator_dev *rdev,
 	struct coupling_desc *c_desc = &rdev->coupling_desc;
 	struct regulator_dev **c_rdevs = c_desc->coupled_rdevs;
 	struct regulation_constraints *constraints = rdev->constraints;
-	int max_spread = constraints->max_spread;
 	int desired_min_uV = 0, desired_max_uV = INT_MAX;
 	int max_current_uV = 0, min_current_uV = INT_MAX;
 	int highest_min_uV = 0, target_uV, possible_uV;
-	int i, ret;
+	int i, ret, max_spread;
 	bool done;
 
 	*current_uV = -1;
@@ -3497,6 +3498,8 @@ static int regulator_get_optimal_voltage(struct regulator_dev *rdev,
 		}
 	}
 
+	max_spread = constraints->max_spread[0];
+
 	/*
 	 * Let target_uV be equal to the desired one if possible.
 	 * If not, set it to minimum voltage, allowed by other coupled
@@ -3578,9 +3581,11 @@ static int regulator_balance_voltage(struct regulator_dev *rdev,
 	struct regulator_dev **c_rdevs;
 	struct regulator_dev *best_rdev;
 	struct coupling_desc *c_desc = &rdev->coupling_desc;
+	struct regulator_coupler *coupler = c_desc->coupler;
 	int i, ret, n_coupled, best_min_uV, best_max_uV, best_c_rdev;
-	bool best_c_rdev_done, c_rdev_done[MAX_COUPLED];
 	unsigned int delta, best_delta;
+	unsigned long c_rdev_done = 0;
+	bool best_c_rdev_done;
 
 	c_rdevs = c_desc->coupled_rdevs;
 	n_coupled = c_desc->n_coupled;
@@ -3597,8 +3602,9 @@ static int regulator_balance_voltage(struct regulator_dev *rdev,
 		return -EPERM;
 	}
 
-	for (i = 0; i < n_coupled; i++)
-		c_rdev_done[i] = false;
+	/* Invoke custom balancer for customized couplers */
+	if (coupler && coupler->balance_voltage)
+		return coupler->balance_voltage(coupler, rdev, state);
 
 	/*
 	 * Find the best possible voltage change on each loop. Leave the loop
@@ -3625,7 +3631,7 @@ static int regulator_balance_voltage(struct regulator_dev *rdev,
 			 */
 			int optimal_uV = 0, optimal_max_uV = 0, current_uV = 0;
 
-			if (c_rdev_done[i])
+			if (test_bit(i, &c_rdev_done))
 				continue;
 
 			ret = regulator_get_optimal_voltage(c_rdevs[i],
@@ -3660,7 +3666,8 @@ static int regulator_balance_voltage(struct regulator_dev *rdev,
 		if (ret < 0)
 			goto out;
 
-		c_rdev_done[best_c_rdev] = best_c_rdev_done;
+		if (best_c_rdev_done)
+			set_bit(best_c_rdev, &c_rdev_done);
 
 	} while (n_coupled > 1);
 
@@ -4712,8 +4719,60 @@ static int regulator_register_resolve_supply(struct device *dev, void *data)
 	return 0;
 }
 
+int regulator_coupler_register(struct regulator_coupler *coupler)
+{
+	mutex_lock(&regulator_list_mutex);
+	list_add_tail(&coupler->list, &regulator_coupler_list);
+	mutex_unlock(&regulator_list_mutex);
+
+	return 0;
+}
+
+static struct regulator_coupler *
+regulator_find_coupler(struct regulator_dev *rdev)
+{
+	struct regulator_coupler *coupler;
+	int err;
+
+	/*
+	 * Note that regulators are appended to the list and the generic
+	 * coupler is registered first, hence it will be attached at last
+	 * if nobody cared.
+	 */
+	list_for_each_entry_reverse(coupler, &regulator_coupler_list, list) {
+		err = coupler->attach_regulator(coupler, rdev);
+		if (!err) {
+			if (!coupler->balance_voltage &&
+			    rdev->coupling_desc.n_coupled > 2)
+				goto err_unsupported;
+
+			return coupler;
+		}
+
+		if (err < 0)
+			return ERR_PTR(err);
+
+		if (err == 1)
+			continue;
+
+		break;
+	}
+
+	return ERR_PTR(-EINVAL);
+
+err_unsupported:
+	if (coupler->detach_regulator)
+		coupler->detach_regulator(coupler, rdev);
+
+	rdev_err(rdev,
+		"Voltage balancing for multiple regulator couples is unimplemented\n");
+
+	return ERR_PTR(-EPERM);
+}
+
 static void regulator_resolve_coupling(struct regulator_dev *rdev)
 {
+	struct regulator_coupler *coupler = rdev->coupling_desc.coupler;
 	struct coupling_desc *c_desc = &rdev->coupling_desc;
 	int n_coupled = c_desc->n_coupled;
 	struct regulator_dev *c_rdev;
@@ -4729,6 +4788,12 @@ static void regulator_resolve_coupling(struct regulator_dev *rdev)
 		if (!c_rdev)
 			continue;
 
+		if (c_rdev->coupling_desc.coupler != coupler) {
+			rdev_err(rdev, "coupler mismatch with %s\n",
+				 rdev_get_name(c_rdev));
+			return;
+		}
+
 		regulator_lock(c_rdev);
 
 		c_desc->coupled_rdevs[i] = c_rdev;
@@ -4742,10 +4807,12 @@ static void regulator_resolve_coupling(struct regulator_dev *rdev)
 
 static void regulator_remove_coupling(struct regulator_dev *rdev)
 {
+	struct regulator_coupler *coupler = rdev->coupling_desc.coupler;
 	struct coupling_desc *__c_desc, *c_desc = &rdev->coupling_desc;
 	struct regulator_dev *__c_rdev, *c_rdev;
 	unsigned int __n_coupled, n_coupled;
 	int i, k;
+	int err;
 
 	n_coupled = c_desc->n_coupled;
 
@@ -4775,21 +4842,33 @@ static void regulator_remove_coupling(struct regulator_dev *rdev)
 		c_desc->coupled_rdevs[i] = NULL;
 		c_desc->n_resolved--;
 	}
+
+	if (coupler && coupler->detach_regulator) {
+		err = coupler->detach_regulator(coupler, rdev);
+		if (err)
+			rdev_err(rdev, "failed to detach from coupler: %d\n",
+				 err);
+	}
+
+	kfree(rdev->coupling_desc.coupled_rdevs);
+	rdev->coupling_desc.coupled_rdevs = NULL;
 }
 
 static int regulator_init_coupling(struct regulator_dev *rdev)
 {
-	int n_phandles;
+	int err, n_phandles;
+	size_t alloc_size;
 
 	if (!IS_ENABLED(CONFIG_OF))
 		n_phandles = 0;
 	else
 		n_phandles = of_get_n_coupled(rdev);
 
-	if (n_phandles + 1 > MAX_COUPLED) {
-		rdev_err(rdev, "too many regulators coupled\n");
-		return -EPERM;
-	}
+	alloc_size = sizeof(*rdev) * (n_phandles + 1);
+
+	rdev->coupling_desc.coupled_rdevs = kzalloc(alloc_size, GFP_KERNEL);
+	if (!rdev->coupling_desc.coupled_rdevs)
+		return -ENOMEM;
 
 	/*
 	 * Every regulator should always have coupling descriptor filled with
@@ -4803,23 +4882,35 @@ static int regulator_init_coupling(struct regulator_dev *rdev)
 	if (n_phandles == 0)
 		return 0;
 
-	/* regulator, which can't change its voltage, can't be coupled */
-	if (!regulator_ops_is_valid(rdev, REGULATOR_CHANGE_VOLTAGE)) {
-		rdev_err(rdev, "voltage operation not allowed\n");
+	if (!of_check_coupling_data(rdev))
 		return -EPERM;
-	}
 
-	if (rdev->constraints->max_spread <= 0) {
-		rdev_err(rdev, "wrong max_spread value\n");
-		return -EPERM;
+	rdev->coupling_desc.coupler = regulator_find_coupler(rdev);
+	if (IS_ERR(rdev->coupling_desc.coupler)) {
+		err = PTR_ERR(rdev->coupling_desc.coupler);
+		rdev_err(rdev, "failed to get coupler: %d\n", err);
+		return err;
 	}
 
-	if (!of_check_coupling_data(rdev))
+	return 0;
+}
+
+static int generic_coupler_attach(struct regulator_coupler *coupler,
+				  struct regulator_dev *rdev)
+{
+	if (rdev->coupling_desc.n_coupled > 2) {
+		rdev_err(rdev,
+			 "Voltage balancing for multiple regulator couples is unimplemented\n");
 		return -EPERM;
+	}
 
 	return 0;
 }
 
+static struct regulator_coupler generic_regulator_coupler = {
+	.attach_regulator = generic_coupler_attach,
+};
+
 /**
  * regulator_register - register regulator
  * @regulator_desc: regulator to register
@@ -4981,7 +5072,9 @@ regulator_register(const struct regulator_desc *regulator_desc,
 	if (ret < 0)
 		goto wash;
 
+	mutex_lock(&regulator_list_mutex);
 	ret = regulator_init_coupling(rdev);
+	mutex_unlock(&regulator_list_mutex);
 	if (ret < 0)
 		goto wash;
 
@@ -5030,6 +5123,7 @@ regulator_register(const struct regulator_desc *regulator_desc,
 unset_supplies:
 	mutex_lock(&regulator_list_mutex);
 	unset_regulator_supplies(rdev);
+	regulator_remove_coupling(rdev);
 	mutex_unlock(&regulator_list_mutex);
 wash:
 	kfree(rdev->constraints);
@@ -5485,6 +5579,8 @@ static int __init regulator_init(void)
 #endif
 	regulator_dummy_init();
 
+	regulator_coupler_register(&generic_regulator_coupler);
+
 	return ret;
 }
 
diff --git a/drivers/regulator/of_regulator.c b/drivers/regulator/of_regulator.c
index 6dca0ba044d8..db1cb2714b92 100644
--- a/drivers/regulator/of_regulator.c
+++ b/drivers/regulator/of_regulator.c
@@ -25,7 +25,8 @@ static const char *const regulator_states[PM_SUSPEND_MAX + 1] = {
 	[PM_SUSPEND_MAX]	= "regulator-state-disk",
 };
 
-static void of_get_regulation_constraints(struct device_node *np,
+static int of_get_regulation_constraints(struct device *dev,
+					struct device_node *np,
 					struct regulator_init_data **init_data,
 					const struct regulator_desc *desc)
 {
@@ -34,8 +35,13 @@ static void of_get_regulation_constraints(struct device_node *np,
 	struct device_node *suspend_np;
 	unsigned int mode;
 	int ret, i, len;
+	int n_phandles;
 	u32 pval;
 
+	n_phandles = of_count_phandle_with_args(np, "regulator-coupled-with",
+						NULL);
+	n_phandles = max(n_phandles, 0);
+
 	constraints->name = of_get_property(np, "regulator-name", NULL);
 
 	if (!of_property_read_u32(np, "regulator-min-microvolt", &pval))
@@ -167,9 +173,17 @@ static void of_get_regulation_constraints(struct device_node *np,
 	if (!of_property_read_u32(np, "regulator-system-load", &pval))
 		constraints->system_load = pval;
 
-	if (!of_property_read_u32(np, "regulator-coupled-max-spread",
-				  &pval))
-		constraints->max_spread = pval;
+	if (n_phandles) {
+		constraints->max_spread = devm_kzalloc(dev,
+				sizeof(*constraints->max_spread) * n_phandles,
+				GFP_KERNEL);
+
+		if (!constraints->max_spread)
+			return -ENOMEM;
+
+		of_property_read_u32_array(np, "regulator-coupled-max-spread",
+					   constraints->max_spread, n_phandles);
+	}
 
 	if (!of_property_read_u32(np, "regulator-max-step-microvolt",
 				  &pval))
@@ -246,6 +260,8 @@ static void of_get_regulation_constraints(struct device_node *np,
 		suspend_state = NULL;
 		suspend_np = NULL;
 	}
+
+	return 0;
 }
 
 /**
@@ -271,7 +287,9 @@ struct regulator_init_data *of_get_regulator_init_data(struct device *dev,
 	if (!init_data)
 		return NULL; /* Out of memory? */
 
-	of_get_regulation_constraints(node, &init_data, desc);
+	if (of_get_regulation_constraints(dev, node, &init_data, desc))
+		return NULL;
+
 	return init_data;
 }
 EXPORT_SYMBOL_GPL(of_get_regulator_init_data);
@@ -477,7 +495,8 @@ int of_get_n_coupled(struct regulator_dev *rdev)
 
 /* Looks for "to_find" device_node in src's "regulator-coupled-with" property */
 static bool of_coupling_find_node(struct device_node *src,
-				  struct device_node *to_find)
+				  struct device_node *to_find,
+				  int *index)
 {
 	int n_phandles, i;
 	bool found = false;
@@ -499,8 +518,10 @@ static bool of_coupling_find_node(struct device_node *src,
 
 		of_node_put(tmp);
 
-		if (found)
+		if (found) {
+			*index = i;
 			break;
+		}
 	}
 
 	return found;
@@ -521,22 +542,23 @@ static bool of_coupling_find_node(struct device_node *src,
  */
 bool of_check_coupling_data(struct regulator_dev *rdev)
 {
-	int max_spread = rdev->constraints->max_spread;
 	struct device_node *node = rdev->dev.of_node;
 	int n_phandles = of_get_n_coupled(rdev);
 	struct device_node *c_node;
+	int index;
 	int i;
 	bool ret = true;
 
-	if (max_spread <= 0) {
-		dev_err(&rdev->dev, "max_spread value invalid\n");
-		return false;
-	}
-
 	/* iterate over rdev's phandles */
 	for (i = 0; i < n_phandles; i++) {
+		int max_spread = rdev->constraints->max_spread[i];
 		int c_max_spread, c_n_phandles;
 
+		if (max_spread <= 0) {
+			dev_err(&rdev->dev, "max_spread value invalid\n");
+			return false;
+		}
+
 		c_node = of_parse_phandle(node,
 					  "regulator-coupled-with", i);
 
@@ -553,22 +575,23 @@ bool of_check_coupling_data(struct regulator_dev *rdev)
 			goto clean;
 		}
 
-		if (of_property_read_u32(c_node, "regulator-coupled-max-spread",
-					 &c_max_spread)) {
+		if (!of_coupling_find_node(c_node, node, &index)) {
+			dev_err(&rdev->dev, "missing 2-way linking for coupled regulators\n");
 			ret = false;
 			goto clean;
 		}
 
-		if (c_max_spread != max_spread) {
-			dev_err(&rdev->dev,
-				"coupled regulators max_spread mismatch\n");
+		if (of_property_read_u32_index(c_node, "regulator-coupled-max-spread",
+					       index, &c_max_spread)) {
 			ret = false;
 			goto clean;
 		}
 
-		if (!of_coupling_find_node(c_node, node)) {
-			dev_err(&rdev->dev, "missing 2-way linking for coupled regulators\n");
+		if (c_max_spread != max_spread) {
+			dev_err(&rdev->dev,
+				"coupled regulators max_spread mismatch\n");
 			ret = false;
+			goto clean;
 		}
 
 clean:
diff --git a/include/linux/regulator/coupler.h b/include/linux/regulator/coupler.h
new file mode 100644
index 000000000000..98dd1f74d605
--- /dev/null
+++ b/include/linux/regulator/coupler.h
@@ -0,0 +1,62 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * coupler.h -- SoC Regulator support, coupler API.
+ *
+ * Regulator Coupler Interface.
+ */
+
+#ifndef __LINUX_REGULATOR_COUPLER_H_
+#define __LINUX_REGULATOR_COUPLER_H_
+
+#include <linux/kernel.h>
+#include <linux/suspend.h>
+
+struct regulator_coupler;
+struct regulator_dev;
+
+/**
+ * struct regulator_coupler - customized regulator's coupler
+ *
+ * Regulator's coupler allows to customize coupling algorithm.
+ *
+ * @list: couplers list entry
+ * @attach_regulator: Callback invoked on creation of a coupled regulator,
+ *                    couples are unresolved at this point. The callee should
+ *                    check that it could handle the regulator and return 0 on
+ *                    success, -errno on failure and 1 if given regulator is
+ *                    not suitable for this coupler (case of having multiple
+ *                    regulators in a system). Callback shall be implemented.
+ * @detach_regulator: Callback invoked on destruction of a coupled regulator.
+ *                    This callback is optional and could be NULL.
+ * @balance_voltage: Callback invoked when voltage of a coupled regulator is
+ *                   changing. Called with all of the coupled rdev's being held
+ *                   under "consumer lock". The callee should perform voltage
+ *                   balancing, changing voltage of the coupled regulators as
+ *                   needed. It's up to the coupler to verify the voltage
+ *                   before changing it in hardware, i.e. coupler should
+ *                   check consumer's min/max and etc. This callback is
+ *                   optional and could be NULL, in which case a generic
+ *                   voltage balancer will be used.
+ */
+struct regulator_coupler {
+	struct list_head list;
+
+	int (*attach_regulator)(struct regulator_coupler *coupler,
+				struct regulator_dev *rdev);
+	int (*detach_regulator)(struct regulator_coupler *coupler,
+				struct regulator_dev *rdev);
+	int (*balance_voltage)(struct regulator_coupler *coupler,
+			       struct regulator_dev *rdev,
+			       suspend_state_t state);
+};
+
+#ifdef CONFIG_REGULATOR
+int regulator_coupler_register(struct regulator_coupler *coupler);
+#else
+static inline int regulator_coupler_register(struct regulator_coupler *coupler)
+{
+	return 0;
+}
+#endif
+
+#endif
diff --git a/include/linux/regulator/driver.h b/include/linux/regulator/driver.h
index 377da2357118..31b38a2b6995 100644
--- a/include/linux/regulator/driver.h
+++ b/include/linux/regulator/driver.h
@@ -15,8 +15,6 @@
 #ifndef __LINUX_REGULATOR_DRIVER_H_
 #define __LINUX_REGULATOR_DRIVER_H_
 
-#define MAX_COUPLED		2
-
 #include <linux/device.h>
 #include <linux/notifier.h>
 #include <linux/regulator/consumer.h>
@@ -426,7 +424,8 @@ struct regulator_config {
  * incremented.
  */
 struct coupling_desc {
-	struct regulator_dev *coupled_rdevs[MAX_COUPLED];
+	struct regulator_dev **coupled_rdevs;
+	struct regulator_coupler *coupler;
 	int n_resolved;
 	int n_coupled;
 };
@@ -552,4 +551,5 @@ void regulator_unlock(struct regulator_dev *rdev);
  */
 int regulator_desc_list_voltage_linear_range(const struct regulator_desc *desc,
 					     unsigned int selector);
+
 #endif
diff --git a/include/linux/regulator/machine.h b/include/linux/regulator/machine.h
index 1d34a70ffda2..21db06e5c1ed 100644
--- a/include/linux/regulator/machine.h
+++ b/include/linux/regulator/machine.h
@@ -156,7 +156,7 @@ struct regulation_constraints {
 	int system_load;
 
 	/* used for coupled regulators */
-	int max_spread;
+	u32 *max_spread;
 
 	/* used for changing voltage in steps */
 	int max_uV_step;
-- 
cgit v1.2.3


From d22b85a1b97d12a4940ef9d778f6122546736f78 Mon Sep 17 00:00:00 2001
From: Dmitry Osipenko <digetx@gmail.com>
Date: Mon, 24 Jun 2019 00:08:32 +0300
Subject: regulator: core: Expose some of core functions needed by couplers

Expose some of the internal functions that are required for the
implementation of customized regulator couplers.

Signed-off-by: Dmitry Osipenko <digetx@gmail.com>
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 drivers/regulator/core.c          | 58 ++++++++++++++++++---------------------
 include/linux/regulator/coupler.h | 35 +++++++++++++++++++++++
 2 files changed, 62 insertions(+), 31 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/regulator/core.c b/drivers/regulator/core.c
index 12c870f790f5..b9bc45128b8c 100644
--- a/drivers/regulator/core.c
+++ b/drivers/regulator/core.c
@@ -100,7 +100,6 @@ struct regulator_supply_alias {
 
 static int _regulator_is_enabled(struct regulator_dev *rdev);
 static int _regulator_disable(struct regulator *regulator);
-static int _regulator_get_voltage(struct regulator_dev *rdev);
 static int _regulator_get_current_limit(struct regulator_dev *rdev);
 static unsigned int _regulator_get_mode(struct regulator_dev *rdev);
 static int _notifier_call_chain(struct regulator_dev *rdev,
@@ -109,15 +108,12 @@ static int _regulator_do_set_voltage(struct regulator_dev *rdev,
 				     int min_uV, int max_uV);
 static int regulator_balance_voltage(struct regulator_dev *rdev,
 				     suspend_state_t state);
-static int regulator_set_voltage_rdev(struct regulator_dev *rdev,
-				      int min_uV, int max_uV,
-				      suspend_state_t state);
 static struct regulator *create_regulator(struct regulator_dev *rdev,
 					  struct device *dev,
 					  const char *supply_name);
 static void _regulator_put(struct regulator *regulator);
 
-static const char *rdev_get_name(struct regulator_dev *rdev)
+const char *rdev_get_name(struct regulator_dev *rdev)
 {
 	if (rdev->constraints && rdev->constraints->name)
 		return rdev->constraints->name;
@@ -431,8 +427,8 @@ static struct device_node *of_get_regulator(struct device *dev, const char *supp
 }
 
 /* Platform voltage constraint check */
-static int regulator_check_voltage(struct regulator_dev *rdev,
-				   int *min_uV, int *max_uV)
+int regulator_check_voltage(struct regulator_dev *rdev,
+			    int *min_uV, int *max_uV)
 {
 	BUG_ON(*min_uV > *max_uV);
 
@@ -464,9 +460,9 @@ static int regulator_check_states(suspend_state_t state)
 /* Make sure we select a voltage that suits the needs of all
  * regulator consumers
  */
-static int regulator_check_consumers(struct regulator_dev *rdev,
-				     int *min_uV, int *max_uV,
-				     suspend_state_t state)
+int regulator_check_consumers(struct regulator_dev *rdev,
+			      int *min_uV, int *max_uV,
+			      suspend_state_t state)
 {
 	struct regulator *regulator;
 	struct regulator_voltage *voltage;
@@ -577,7 +573,7 @@ static ssize_t regulator_uV_show(struct device *dev,
 	ssize_t ret;
 
 	regulator_lock(rdev);
-	ret = sprintf(buf, "%d\n", _regulator_get_voltage(rdev));
+	ret = sprintf(buf, "%d\n", regulator_get_voltage_rdev(rdev));
 	regulator_unlock(rdev);
 
 	return ret;
@@ -948,7 +944,7 @@ static int drms_uA_update(struct regulator_dev *rdev)
 			rdev_err(rdev, "failed to set load %d\n", current_uA);
 	} else {
 		/* get output voltage */
-		output_uV = _regulator_get_voltage(rdev);
+		output_uV = regulator_get_voltage_rdev(rdev);
 		if (output_uV <= 0) {
 			rdev_err(rdev, "invalid output voltage found\n");
 			return -EINVAL;
@@ -1061,7 +1057,7 @@ static void print_constraints(struct regulator_dev *rdev)
 
 	if (!constraints->min_uV ||
 	    constraints->min_uV != constraints->max_uV) {
-		ret = _regulator_get_voltage(rdev);
+		ret = regulator_get_voltage_rdev(rdev);
 		if (ret > 0)
 			count += scnprintf(buf + count, len - count,
 					   "at %d mV ", ret / 1000);
@@ -1120,7 +1116,7 @@ static int machine_constraints_voltage(struct regulator_dev *rdev,
 	if (rdev->constraints->apply_uV &&
 	    rdev->constraints->min_uV && rdev->constraints->max_uV) {
 		int target_min, target_max;
-		int current_uV = _regulator_get_voltage(rdev);
+		int current_uV = regulator_get_voltage_rdev(rdev);
 
 		if (current_uV == -ENOTRECOVERABLE) {
 			/* This regulator can't be read and must be initialized */
@@ -1130,7 +1126,7 @@ static int machine_constraints_voltage(struct regulator_dev *rdev,
 			_regulator_do_set_voltage(rdev,
 						  rdev->constraints->min_uV,
 						  rdev->constraints->max_uV);
-			current_uV = _regulator_get_voltage(rdev);
+			current_uV = regulator_get_voltage_rdev(rdev);
 		}
 
 		if (current_uV < 0) {
@@ -3072,7 +3068,7 @@ static int _regulator_call_set_voltage(struct regulator_dev *rdev,
 	struct pre_voltage_change_data data;
 	int ret;
 
-	data.old_uV = _regulator_get_voltage(rdev);
+	data.old_uV = regulator_get_voltage_rdev(rdev);
 	data.min_uV = min_uV;
 	data.max_uV = max_uV;
 	ret = _notifier_call_chain(rdev, REGULATOR_EVENT_PRE_VOLTAGE_CHANGE,
@@ -3096,7 +3092,7 @@ static int _regulator_call_set_voltage_sel(struct regulator_dev *rdev,
 	struct pre_voltage_change_data data;
 	int ret;
 
-	data.old_uV = _regulator_get_voltage(rdev);
+	data.old_uV = regulator_get_voltage_rdev(rdev);
 	data.min_uV = uV;
 	data.max_uV = uV;
 	ret = _notifier_call_chain(rdev, REGULATOR_EVENT_PRE_VOLTAGE_CHANGE,
@@ -3149,7 +3145,7 @@ static int _regulator_do_set_voltage(struct regulator_dev *rdev,
 	unsigned int selector;
 	int old_selector = -1;
 	const struct regulator_ops *ops = rdev->desc->ops;
-	int old_uV = _regulator_get_voltage(rdev);
+	int old_uV = regulator_get_voltage_rdev(rdev);
 
 	trace_regulator_set_voltage(rdev_get_name(rdev), min_uV, max_uV);
 
@@ -3176,7 +3172,7 @@ static int _regulator_do_set_voltage(struct regulator_dev *rdev,
 				best_val = ops->list_voltage(rdev,
 							     selector);
 			else
-				best_val = _regulator_get_voltage(rdev);
+				best_val = regulator_get_voltage_rdev(rdev);
 		}
 
 	} else if (ops->set_voltage_sel) {
@@ -3295,7 +3291,7 @@ static int regulator_set_voltage_unlocked(struct regulator *regulator,
 	 * changing the voltage.
 	 */
 	if (!regulator_ops_is_valid(rdev, REGULATOR_CHANGE_VOLTAGE)) {
-		current_uV = _regulator_get_voltage(rdev);
+		current_uV = regulator_get_voltage_rdev(rdev);
 		if (min_uV <= current_uV && current_uV <= max_uV) {
 			voltage->min_uV = min_uV;
 			voltage->max_uV = max_uV;
@@ -3332,8 +3328,8 @@ out:
 	return ret;
 }
 
-static int regulator_set_voltage_rdev(struct regulator_dev *rdev, int min_uV,
-				      int max_uV, suspend_state_t state)
+int regulator_set_voltage_rdev(struct regulator_dev *rdev, int min_uV,
+			       int max_uV, suspend_state_t state)
 {
 	int best_supply_uV = 0;
 	int supply_change_uV = 0;
@@ -3361,7 +3357,7 @@ static int regulator_set_voltage_rdev(struct regulator_dev *rdev, int min_uV,
 
 		best_supply_uV += rdev->desc->min_dropout_uV;
 
-		current_supply_uV = _regulator_get_voltage(rdev->supply->rdev);
+		current_supply_uV = regulator_get_voltage_rdev(rdev->supply->rdev);
 		if (current_supply_uV < 0) {
 			ret = current_supply_uV;
 			goto out;
@@ -3412,7 +3408,7 @@ static int regulator_limit_voltage_step(struct regulator_dev *rdev,
 		return 1;
 
 	if (*current_uV < 0) {
-		*current_uV = _regulator_get_voltage(rdev);
+		*current_uV = regulator_get_voltage_rdev(rdev);
 
 		if (*current_uV < 0)
 			return *current_uV;
@@ -3517,7 +3513,7 @@ static int regulator_get_optimal_voltage(struct regulator_dev *rdev,
 		if (!_regulator_is_enabled(c_rdevs[i]))
 			continue;
 
-		tmp_act = _regulator_get_voltage(c_rdevs[i]);
+		tmp_act = regulator_get_voltage_rdev(c_rdevs[i]);
 		if (tmp_act < 0)
 			return tmp_act;
 
@@ -3559,7 +3555,7 @@ finish:
 	if (n_coupled > 1 && *current_uV == -1) {
 
 		if (_regulator_is_enabled(rdev)) {
-			ret = _regulator_get_voltage(rdev);
+			ret = regulator_get_voltage_rdev(rdev);
 			if (ret < 0)
 				return ret;
 
@@ -3923,7 +3919,7 @@ out:
 }
 EXPORT_SYMBOL_GPL(regulator_sync_voltage);
 
-static int _regulator_get_voltage(struct regulator_dev *rdev)
+int regulator_get_voltage_rdev(struct regulator_dev *rdev)
 {
 	int sel, ret;
 	bool bypassed;
@@ -3940,7 +3936,7 @@ static int _regulator_get_voltage(struct regulator_dev *rdev)
 				return -EPROBE_DEFER;
 			}
 
-			return _regulator_get_voltage(rdev->supply->rdev);
+			return regulator_get_voltage_rdev(rdev->supply->rdev);
 		}
 	}
 
@@ -3956,7 +3952,7 @@ static int _regulator_get_voltage(struct regulator_dev *rdev)
 	} else if (rdev->desc->fixed_uV && (rdev->desc->n_voltages == 1)) {
 		ret = rdev->desc->fixed_uV;
 	} else if (rdev->supply) {
-		ret = _regulator_get_voltage(rdev->supply->rdev);
+		ret = regulator_get_voltage_rdev(rdev->supply->rdev);
 	} else {
 		return -EINVAL;
 	}
@@ -3981,7 +3977,7 @@ int regulator_get_voltage(struct regulator *regulator)
 	int ret;
 
 	regulator_lock_dependent(regulator->rdev, &ww_ctx);
-	ret = _regulator_get_voltage(regulator->rdev);
+	ret = regulator_get_voltage_rdev(regulator->rdev);
 	regulator_unlock_dependent(regulator->rdev, &ww_ctx);
 
 	return ret;
@@ -5377,7 +5373,7 @@ static void regulator_summary_show_subtree(struct seq_file *s,
 		   rdev->use_count, rdev->open_count, rdev->bypass_count,
 		   regulator_opmode_to_str(opmode));
 
-	seq_printf(s, "%5dmV ", _regulator_get_voltage(rdev) / 1000);
+	seq_printf(s, "%5dmV ", regulator_get_voltage_rdev(rdev) / 1000);
 	seq_printf(s, "%5dmA ",
 		   _regulator_get_current_limit_unlocked(rdev) / 1000);
 
diff --git a/include/linux/regulator/coupler.h b/include/linux/regulator/coupler.h
index 98dd1f74d605..0212d6255e4e 100644
--- a/include/linux/regulator/coupler.h
+++ b/include/linux/regulator/coupler.h
@@ -52,11 +52,46 @@ struct regulator_coupler {
 
 #ifdef CONFIG_REGULATOR
 int regulator_coupler_register(struct regulator_coupler *coupler);
+const char *rdev_get_name(struct regulator_dev *rdev);
+int regulator_check_consumers(struct regulator_dev *rdev,
+			      int *min_uV, int *max_uV,
+			      suspend_state_t state);
+int regulator_check_voltage(struct regulator_dev *rdev,
+			    int *min_uV, int *max_uV);
+int regulator_get_voltage_rdev(struct regulator_dev *rdev);
+int regulator_set_voltage_rdev(struct regulator_dev *rdev,
+			       int min_uV, int max_uV,
+			       suspend_state_t state);
 #else
 static inline int regulator_coupler_register(struct regulator_coupler *coupler)
 {
 	return 0;
 }
+static inline const char *rdev_get_name(struct regulator_dev *rdev)
+{
+	return NULL;
+}
+static inline int regulator_check_consumers(struct regulator_dev *rdev,
+					    int *min_uV, int *max_uV,
+					    suspend_state_t state)
+{
+	return -EINVAL;
+}
+static inline int regulator_check_voltage(struct regulator_dev *rdev,
+					  int *min_uV, int *max_uV)
+{
+	return -EINVAL;
+}
+static inline int regulator_get_voltage_rdev(struct regulator_dev *rdev)
+{
+	return -EINVAL;
+}
+static inline int regulator_set_voltage_rdev(struct regulator_dev *rdev,
+					     int min_uV, int max_uV,
+					     suspend_state_t state)
+{
+	return -EINVAL;
+}
 #endif
 
 #endif
-- 
cgit v1.2.3


From 4f41845b340783eaec9cc2840fe3cb9a00574054 Mon Sep 17 00:00:00 2001
From: Will Deacon <will@kernel.org>
Date: Tue, 25 Jun 2019 12:51:25 +0100
Subject: iommu/io-pgtable: Replace IO_PGTABLE_QUIRK_NO_DMA with specific flag

IO_PGTABLE_QUIRK_NO_DMA is a bit of a misnomer, since it's really just
an indication of whether or not the page-table walker for the IOMMU is
coherent with the CPU caches. Since cache coherency is more than just a
quirk, replace the flag with its own field in the io_pgtable_cfg
structure.

Cc: Bjorn Andersson <bjorn.andersson@linaro.org>
Signed-off-by: Will Deacon <will@kernel.org>
---
 drivers/iommu/arm-smmu-v3.c        |  4 +---
 drivers/iommu/arm-smmu.c           |  4 +---
 drivers/iommu/io-pgtable-arm-v7s.c | 10 +++++-----
 drivers/iommu/io-pgtable-arm.c     | 19 ++++++++-----------
 drivers/iommu/ipmmu-vmsa.c         |  1 +
 include/linux/io-pgtable.h         | 11 ++++-------
 6 files changed, 20 insertions(+), 29 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/iommu/arm-smmu-v3.c b/drivers/iommu/arm-smmu-v3.c
index 65de2458999f..8ff8f61d9e1c 100644
--- a/drivers/iommu/arm-smmu-v3.c
+++ b/drivers/iommu/arm-smmu-v3.c
@@ -1789,13 +1789,11 @@ static int arm_smmu_domain_finalise(struct iommu_domain *domain)
 		.pgsize_bitmap	= smmu->pgsize_bitmap,
 		.ias		= ias,
 		.oas		= oas,
+		.coherent_walk	= smmu->features & ARM_SMMU_FEAT_COHERENCY,
 		.tlb		= &arm_smmu_gather_ops,
 		.iommu_dev	= smmu->dev,
 	};
 
-	if (smmu->features & ARM_SMMU_FEAT_COHERENCY)
-		pgtbl_cfg.quirks = IO_PGTABLE_QUIRK_NO_DMA;
-
 	if (smmu_domain->non_strict)
 		pgtbl_cfg.quirks |= IO_PGTABLE_QUIRK_NON_STRICT;
 
diff --git a/drivers/iommu/arm-smmu.c b/drivers/iommu/arm-smmu.c
index 5e54cc0a28b3..009156bb6d42 100644
--- a/drivers/iommu/arm-smmu.c
+++ b/drivers/iommu/arm-smmu.c
@@ -895,13 +895,11 @@ static int arm_smmu_init_domain_context(struct iommu_domain *domain,
 		.pgsize_bitmap	= smmu->pgsize_bitmap,
 		.ias		= ias,
 		.oas		= oas,
+		.coherent_walk	= smmu->features & ARM_SMMU_FEAT_COHERENT_WALK,
 		.tlb		= smmu_domain->tlb_ops,
 		.iommu_dev	= smmu->dev,
 	};
 
-	if (smmu->features & ARM_SMMU_FEAT_COHERENT_WALK)
-		pgtbl_cfg.quirks = IO_PGTABLE_QUIRK_NO_DMA;
-
 	if (smmu_domain->non_strict)
 		pgtbl_cfg.quirks |= IO_PGTABLE_QUIRK_NON_STRICT;
 
diff --git a/drivers/iommu/io-pgtable-arm-v7s.c b/drivers/iommu/io-pgtable-arm-v7s.c
index 9a8a8870e267..8454de93e356 100644
--- a/drivers/iommu/io-pgtable-arm-v7s.c
+++ b/drivers/iommu/io-pgtable-arm-v7s.c
@@ -215,7 +215,7 @@ static void *__arm_v7s_alloc_table(int lvl, gfp_t gfp,
 		dev_err(dev, "Page table does not fit in PTE: %pa", &phys);
 		goto out_free;
 	}
-	if (table && !(cfg->quirks & IO_PGTABLE_QUIRK_NO_DMA)) {
+	if (table && !cfg->coherent_walk) {
 		dma = dma_map_single(dev, table, size, DMA_TO_DEVICE);
 		if (dma_mapping_error(dev, dma))
 			goto out_free;
@@ -249,7 +249,7 @@ static void __arm_v7s_free_table(void *table, int lvl,
 	struct device *dev = cfg->iommu_dev;
 	size_t size = ARM_V7S_TABLE_SIZE(lvl);
 
-	if (!(cfg->quirks & IO_PGTABLE_QUIRK_NO_DMA))
+	if (!cfg->coherent_walk)
 		dma_unmap_single(dev, __arm_v7s_dma_addr(table), size,
 				 DMA_TO_DEVICE);
 	if (lvl == 1)
@@ -261,7 +261,7 @@ static void __arm_v7s_free_table(void *table, int lvl,
 static void __arm_v7s_pte_sync(arm_v7s_iopte *ptep, int num_entries,
 			       struct io_pgtable_cfg *cfg)
 {
-	if (cfg->quirks & IO_PGTABLE_QUIRK_NO_DMA)
+	if (cfg->coherent_walk)
 		return;
 
 	dma_sync_single_for_device(cfg->iommu_dev, __arm_v7s_dma_addr(ptep),
@@ -727,7 +727,6 @@ static struct io_pgtable *arm_v7s_alloc_pgtable(struct io_pgtable_cfg *cfg,
 			    IO_PGTABLE_QUIRK_NO_PERMS |
 			    IO_PGTABLE_QUIRK_TLBI_ON_MAP |
 			    IO_PGTABLE_QUIRK_ARM_MTK_4GB |
-			    IO_PGTABLE_QUIRK_NO_DMA |
 			    IO_PGTABLE_QUIRK_NON_STRICT))
 		return NULL;
 
@@ -846,7 +845,8 @@ static int __init arm_v7s_do_selftests(void)
 		.tlb = &dummy_tlb_ops,
 		.oas = 32,
 		.ias = 32,
-		.quirks = IO_PGTABLE_QUIRK_ARM_NS | IO_PGTABLE_QUIRK_NO_DMA,
+		.coherent_walk = true,
+		.quirks = IO_PGTABLE_QUIRK_ARM_NS,
 		.pgsize_bitmap = SZ_4K | SZ_64K | SZ_1M | SZ_16M,
 	};
 	unsigned int iova, size, iova_start;
diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-arm.c
index 2454ac11aa97..91d0a4228b58 100644
--- a/drivers/iommu/io-pgtable-arm.c
+++ b/drivers/iommu/io-pgtable-arm.c
@@ -252,7 +252,7 @@ static void *__arm_lpae_alloc_pages(size_t size, gfp_t gfp,
 		return NULL;
 
 	pages = page_address(p);
-	if (!(cfg->quirks & IO_PGTABLE_QUIRK_NO_DMA)) {
+	if (!cfg->coherent_walk) {
 		dma = dma_map_single(dev, pages, size, DMA_TO_DEVICE);
 		if (dma_mapping_error(dev, dma))
 			goto out_free;
@@ -278,7 +278,7 @@ out_free:
 static void __arm_lpae_free_pages(void *pages, size_t size,
 				  struct io_pgtable_cfg *cfg)
 {
-	if (!(cfg->quirks & IO_PGTABLE_QUIRK_NO_DMA))
+	if (!cfg->coherent_walk)
 		dma_unmap_single(cfg->iommu_dev, __arm_lpae_dma_addr(pages),
 				 size, DMA_TO_DEVICE);
 	free_pages((unsigned long)pages, get_order(size));
@@ -296,7 +296,7 @@ static void __arm_lpae_set_pte(arm_lpae_iopte *ptep, arm_lpae_iopte pte,
 {
 	*ptep = pte;
 
-	if (!(cfg->quirks & IO_PGTABLE_QUIRK_NO_DMA))
+	if (!cfg->coherent_walk)
 		__arm_lpae_sync_pte(ptep, cfg);
 }
 
@@ -374,8 +374,7 @@ static arm_lpae_iopte arm_lpae_install_table(arm_lpae_iopte *table,
 
 	old = cmpxchg64_relaxed(ptep, curr, new);
 
-	if ((cfg->quirks & IO_PGTABLE_QUIRK_NO_DMA) ||
-	    (old & ARM_LPAE_PTE_SW_SYNC))
+	if (cfg->coherent_walk || (old & ARM_LPAE_PTE_SW_SYNC))
 		return old;
 
 	/* Even if it's not ours, there's no point waiting; just kick it */
@@ -416,8 +415,7 @@ static int __arm_lpae_map(struct arm_lpae_io_pgtable *data, unsigned long iova,
 		pte = arm_lpae_install_table(cptep, ptep, 0, cfg);
 		if (pte)
 			__arm_lpae_free_pages(cptep, tblsz, cfg);
-	} else if (!(cfg->quirks & IO_PGTABLE_QUIRK_NO_DMA) &&
-		   !(pte & ARM_LPAE_PTE_SW_SYNC)) {
+	} else if (!cfg->coherent_walk && !(pte & ARM_LPAE_PTE_SW_SYNC)) {
 		__arm_lpae_sync_pte(ptep, cfg);
 	}
 
@@ -799,7 +797,7 @@ arm_64_lpae_alloc_pgtable_s1(struct io_pgtable_cfg *cfg, void *cookie)
 	u64 reg;
 	struct arm_lpae_io_pgtable *data;
 
-	if (cfg->quirks & ~(IO_PGTABLE_QUIRK_ARM_NS | IO_PGTABLE_QUIRK_NO_DMA |
+	if (cfg->quirks & ~(IO_PGTABLE_QUIRK_ARM_NS |
 			    IO_PGTABLE_QUIRK_NON_STRICT))
 		return NULL;
 
@@ -894,8 +892,7 @@ arm_64_lpae_alloc_pgtable_s2(struct io_pgtable_cfg *cfg, void *cookie)
 	struct arm_lpae_io_pgtable *data;
 
 	/* The NS quirk doesn't apply at stage 2 */
-	if (cfg->quirks & ~(IO_PGTABLE_QUIRK_NO_DMA |
-			    IO_PGTABLE_QUIRK_NON_STRICT))
+	if (cfg->quirks & ~(IO_PGTABLE_QUIRK_NON_STRICT))
 		return NULL;
 
 	data = arm_lpae_alloc_pgtable(cfg);
@@ -1230,7 +1227,7 @@ static int __init arm_lpae_do_selftests(void)
 	struct io_pgtable_cfg cfg = {
 		.tlb = &dummy_tlb_ops,
 		.oas = 48,
-		.quirks = IO_PGTABLE_QUIRK_NO_DMA,
+		.coherent_walk = true,
 	};
 
 	for (i = 0; i < ARRAY_SIZE(pgsize); ++i) {
diff --git a/drivers/iommu/ipmmu-vmsa.c b/drivers/iommu/ipmmu-vmsa.c
index 9a380c10655e..12bcb95bdaa8 100644
--- a/drivers/iommu/ipmmu-vmsa.c
+++ b/drivers/iommu/ipmmu-vmsa.c
@@ -431,6 +431,7 @@ static int ipmmu_domain_init_context(struct ipmmu_vmsa_domain *domain)
 	 * TODO: Add support for coherent walk through CCI with DVM and remove
 	 * cache handling. For now, delegate it to the io-pgtable code.
 	 */
+	domain->cfg.coherent_walk = false;
 	domain->cfg.iommu_dev = domain->mmu->root->dev;
 
 	/*
diff --git a/include/linux/io-pgtable.h b/include/linux/io-pgtable.h
index 76969a564831..b5a450a3bb47 100644
--- a/include/linux/io-pgtable.h
+++ b/include/linux/io-pgtable.h
@@ -44,6 +44,8 @@ struct iommu_gather_ops {
  *                 tables.
  * @ias:           Input address (iova) size, in bits.
  * @oas:           Output address (paddr) size, in bits.
+ * @coherent_walk: A flag to indicate whether or not page table walks made
+ *                 by the IOMMU are coherent with the CPU caches.
  * @tlb:           TLB management callbacks for this set of tables.
  * @iommu_dev:     The device representing the DMA configuration for the
  *                 page table walker.
@@ -68,11 +70,6 @@ struct io_pgtable_cfg {
 	 *	when the SoC is in "4GB mode" and they can only access the high
 	 *	remap of DRAM (0x1_00000000 to 0x1_ffffffff).
 	 *
-	 * IO_PGTABLE_QUIRK_NO_DMA: Guarantees that the tables will only ever
-	 *	be accessed by a fully cache-coherent IOMMU or CPU (e.g. for a
-	 *	software-emulated IOMMU), such that pagetable updates need not
-	 *	be treated as explicit DMA data.
-	 *
 	 * IO_PGTABLE_QUIRK_NON_STRICT: Skip issuing synchronous leaf TLBIs
 	 *	on unmap, for DMA domains using the flush queue mechanism for
 	 *	delayed invalidation.
@@ -81,12 +78,12 @@ struct io_pgtable_cfg {
 	#define IO_PGTABLE_QUIRK_NO_PERMS	BIT(1)
 	#define IO_PGTABLE_QUIRK_TLBI_ON_MAP	BIT(2)
 	#define IO_PGTABLE_QUIRK_ARM_MTK_4GB	BIT(3)
-	#define IO_PGTABLE_QUIRK_NO_DMA		BIT(4)
-	#define IO_PGTABLE_QUIRK_NON_STRICT	BIT(5)
+	#define IO_PGTABLE_QUIRK_NON_STRICT	BIT(4)
 	unsigned long			quirks;
 	unsigned long			pgsize_bitmap;
 	unsigned int			ias;
 	unsigned int			oas;
+	bool				coherent_walk;
 	const struct iommu_gather_ops	*tlb;
 	struct device			*iommu_dev;
 
-- 
cgit v1.2.3


From d98849aff87911013aadb730138ab728b52fc547 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 14 Jun 2019 16:17:27 +0200
Subject: dma-direct: handle DMA_ATTR_NO_KERNEL_MAPPING in common code

DMA_ATTR_NO_KERNEL_MAPPING is generally implemented by allocating
normal cacheable pages or CMA memory, and then returning the page
pointer as the opaque handle.  Lift that code from the xtensa and
generic dma remapping implementations into the generic dma-direct
code so that we don't even call arch_dma_alloc for these allocations.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 arch/xtensa/kernel/pci-dma.c    |  8 +-------
 include/linux/dma-noncoherent.h |  2 ++
 kernel/dma/direct.c             | 14 ++++++++++++++
 kernel/dma/remap.c              | 13 ++-----------
 4 files changed, 19 insertions(+), 18 deletions(-)

(limited to 'include/linux')

diff --git a/arch/xtensa/kernel/pci-dma.c b/arch/xtensa/kernel/pci-dma.c
index 9171bff76fc4..206771277dff 100644
--- a/arch/xtensa/kernel/pci-dma.c
+++ b/arch/xtensa/kernel/pci-dma.c
@@ -167,10 +167,6 @@ void *arch_dma_alloc(struct device *dev, size_t size, dma_addr_t *handle,
 
 	*handle = phys_to_dma(dev, page_to_phys(page));
 
-	if (attrs & DMA_ATTR_NO_KERNEL_MAPPING) {
-		return page;
-	}
-
 #ifdef CONFIG_MMU
 	if (PageHighMem(page)) {
 		void *p;
@@ -196,9 +192,7 @@ void arch_dma_free(struct device *dev, size_t size, void *vaddr,
 	unsigned long count = PAGE_ALIGN(size) >> PAGE_SHIFT;
 	struct page *page;
 
-	if (attrs & DMA_ATTR_NO_KERNEL_MAPPING) {
-		page = vaddr;
-	} else if (platform_vaddr_uncached(vaddr)) {
+	if (platform_vaddr_uncached(vaddr)) {
 		page = virt_to_page(platform_vaddr_to_cached(vaddr));
 	} else {
 #ifdef CONFIG_MMU
diff --git a/include/linux/dma-noncoherent.h b/include/linux/dma-noncoherent.h
index 732919ac5c11..53ee36ecdf37 100644
--- a/include/linux/dma-noncoherent.h
+++ b/include/linux/dma-noncoherent.h
@@ -28,6 +28,8 @@ static inline bool dma_alloc_need_uncached(struct device *dev,
 {
 	if (dev_is_dma_coherent(dev))
 		return false;
+	if (attrs & DMA_ATTR_NO_KERNEL_MAPPING)
+		return false;
 	if (IS_ENABLED(CONFIG_DMA_NONCOHERENT_CACHE_SYNC) &&
 	    (attrs & DMA_ATTR_NON_CONSISTENT))
 		return false;
diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
index fc354f4f490b..b90e1aede743 100644
--- a/kernel/dma/direct.c
+++ b/kernel/dma/direct.c
@@ -138,6 +138,14 @@ void *dma_direct_alloc_pages(struct device *dev, size_t size,
 	if (!page)
 		return NULL;
 
+	if (attrs & DMA_ATTR_NO_KERNEL_MAPPING) {
+		/* remove any dirty cache lines on the kernel alias */
+		if (!PageHighMem(page))
+			arch_dma_prep_coherent(page, size);
+		/* return the page pointer as the opaque cookie */
+		return page;
+	}
+
 	if (PageHighMem(page)) {
 		/*
 		 * Depending on the cma= arguments and per-arch setup
@@ -178,6 +186,12 @@ void dma_direct_free_pages(struct device *dev, size_t size, void *cpu_addr,
 {
 	unsigned int page_order = get_order(size);
 
+	if (attrs & DMA_ATTR_NO_KERNEL_MAPPING) {
+		/* cpu_addr is a struct page cookie, not a kernel address */
+		__dma_direct_free_pages(dev, size, cpu_addr);
+		return;
+	}
+
 	if (force_dma_unencrypted())
 		set_memory_encrypted((unsigned long)cpu_addr, 1 << page_order);
 
diff --git a/kernel/dma/remap.c b/kernel/dma/remap.c
index 0207e3764d52..a594aec07882 100644
--- a/kernel/dma/remap.c
+++ b/kernel/dma/remap.c
@@ -202,8 +202,7 @@ void *arch_dma_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle,
 
 	size = PAGE_ALIGN(size);
 
-	if (!gfpflags_allow_blocking(flags) &&
-	    !(attrs & DMA_ATTR_NO_KERNEL_MAPPING)) {
+	if (!gfpflags_allow_blocking(flags)) {
 		ret = dma_alloc_from_pool(size, &page, flags);
 		if (!ret)
 			return NULL;
@@ -217,11 +216,6 @@ void *arch_dma_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle,
 	/* remove any dirty cache lines on the kernel alias */
 	arch_dma_prep_coherent(page, size);
 
-	if (attrs & DMA_ATTR_NO_KERNEL_MAPPING) {
-		ret = page; /* opaque cookie */
-		goto done;
-	}
-
 	/* create a coherent mapping */
 	ret = dma_common_contiguous_remap(page, size, VM_USERMAP,
 			arch_dma_mmap_pgprot(dev, PAGE_KERNEL, attrs),
@@ -240,10 +234,7 @@ done:
 void arch_dma_free(struct device *dev, size_t size, void *vaddr,
 		dma_addr_t dma_handle, unsigned long attrs)
 {
-	if (attrs & DMA_ATTR_NO_KERNEL_MAPPING) {
-		/* vaddr is a struct page cookie, not a kernel address */
-		__dma_direct_free_pages(dev, size, vaddr);
-	} else if (!dma_free_from_pool(vaddr, PAGE_ALIGN(size))) {
+	if (!dma_free_from_pool(vaddr, PAGE_ALIGN(size))) {
 		phys_addr_t phys = dma_to_phys(dev, dma_handle);
 		struct page *page = pfn_to_page(__phys_to_pfn(phys));
 
-- 
cgit v1.2.3


From b2c200e3f2fd1158f5f1c93ccb2e0a27d96c4a7a Mon Sep 17 00:00:00 2001
From: Fabrice Gasnier <fabrice.gasnier@st.com>
Date: Thu, 18 Apr 2019 11:37:47 +0200
Subject: pwm: Add consumer device link
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add a device link between the PWM consumer and the PWM provider. This
forces the PWM user to be suspended before the PWM provider. It
allows proper synchronization of suspend/resume sequences: the PWM user
is responsible for properly stopping PWM, before the provider gets
suspended: see [1]. Add the device link in:
- of_pwm_get()
- pwm_get()
- devm_*pwm_get() variants
as it requires a reference to the device for the PWM consumer.

[1] https://lkml.org/lkml/2019/2/5/770

Suggested-by: Thierry Reding <thierry.reding@gmail.com>
Acked-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Signed-off-by: Fabrice Gasnier <fabrice.gasnier@st.com>
Signed-off-by: Thierry Reding <thierry.reding@gmail.com>
---
 drivers/pwm/core.c  | 50 +++++++++++++++++++++++++++++++++++++++++++++++---
 include/linux/pwm.h |  6 ++++--
 2 files changed, 51 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/pwm/core.c b/drivers/pwm/core.c
index 3998ebd51db4..60b8ccc1fd7c 100644
--- a/drivers/pwm/core.c
+++ b/drivers/pwm/core.c
@@ -639,8 +639,35 @@ static struct pwm_chip *of_node_to_pwmchip(struct device_node *np)
 	return ERR_PTR(-EPROBE_DEFER);
 }
 
+static struct device_link *pwm_device_link_add(struct device *dev,
+					       struct pwm_device *pwm)
+{
+	struct device_link *dl;
+
+	if (!dev) {
+		/*
+		 * No device for the PWM consumer has been provided. It may
+		 * impact the PM sequence ordering: the PWM supplier may get
+		 * suspended before the consumer.
+		 */
+		dev_warn(pwm->chip->dev,
+			 "No consumer device specified to create a link to\n");
+		return NULL;
+	}
+
+	dl = device_link_add(dev, pwm->chip->dev, DL_FLAG_AUTOREMOVE_CONSUMER);
+	if (!dl) {
+		dev_err(dev, "failed to create device link to %s\n",
+			dev_name(pwm->chip->dev));
+		return ERR_PTR(-EINVAL);
+	}
+
+	return dl;
+}
+
 /**
  * of_pwm_get() - request a PWM via the PWM framework
+ * @dev: device for PWM consumer
  * @np: device node to get the PWM from
  * @con_id: consumer name
  *
@@ -658,10 +685,12 @@ static struct pwm_chip *of_node_to_pwmchip(struct device_node *np)
  * Returns: A pointer to the requested PWM device or an ERR_PTR()-encoded
  * error code on failure.
  */
-struct pwm_device *of_pwm_get(struct device_node *np, const char *con_id)
+struct pwm_device *of_pwm_get(struct device *dev, struct device_node *np,
+			      const char *con_id)
 {
 	struct pwm_device *pwm = NULL;
 	struct of_phandle_args args;
+	struct device_link *dl;
 	struct pwm_chip *pc;
 	int index = 0;
 	int err;
@@ -692,6 +721,14 @@ struct pwm_device *of_pwm_get(struct device_node *np, const char *con_id)
 	if (IS_ERR(pwm))
 		goto put;
 
+	dl = pwm_device_link_add(dev, pwm);
+	if (IS_ERR(dl)) {
+		/* of_xlate ended up calling pwm_request_from_chip() */
+		pwm_free(pwm);
+		pwm = ERR_CAST(dl);
+		goto put;
+	}
+
 	/*
 	 * If a consumer name was not given, try to look it up from the
 	 * "pwm-names" property if it exists. Otherwise use the name of
@@ -767,6 +804,7 @@ struct pwm_device *pwm_get(struct device *dev, const char *con_id)
 	const char *dev_id = dev ? dev_name(dev) : NULL;
 	struct pwm_device *pwm;
 	struct pwm_chip *chip;
+	struct device_link *dl;
 	unsigned int best = 0;
 	struct pwm_lookup *p, *chosen = NULL;
 	unsigned int match;
@@ -774,7 +812,7 @@ struct pwm_device *pwm_get(struct device *dev, const char *con_id)
 
 	/* look up via DT first */
 	if (IS_ENABLED(CONFIG_OF) && dev && dev->of_node)
-		return of_pwm_get(dev->of_node, con_id);
+		return of_pwm_get(dev, dev->of_node, con_id);
 
 	/*
 	 * We look up the provider in the static table typically provided by
@@ -851,6 +889,12 @@ struct pwm_device *pwm_get(struct device *dev, const char *con_id)
 	if (IS_ERR(pwm))
 		return pwm;
 
+	dl = pwm_device_link_add(dev, pwm);
+	if (IS_ERR(dl)) {
+		pwm_free(pwm);
+		return ERR_CAST(dl);
+	}
+
 	pwm->args.period = chosen->period;
 	pwm->args.polarity = chosen->polarity;
 
@@ -943,7 +987,7 @@ struct pwm_device *devm_of_pwm_get(struct device *dev, struct device_node *np,
 	if (!ptr)
 		return ERR_PTR(-ENOMEM);
 
-	pwm = of_pwm_get(np, con_id);
+	pwm = of_pwm_get(dev, np, con_id);
 	if (!IS_ERR(pwm)) {
 		*ptr = pwm;
 		devres_add(dev, ptr);
diff --git a/include/linux/pwm.h b/include/linux/pwm.h
index eaa5c6e3fc9f..8bf5d5f6267d 100644
--- a/include/linux/pwm.h
+++ b/include/linux/pwm.h
@@ -405,7 +405,8 @@ struct pwm_device *of_pwm_xlate_with_flags(struct pwm_chip *pc,
 		const struct of_phandle_args *args);
 
 struct pwm_device *pwm_get(struct device *dev, const char *con_id);
-struct pwm_device *of_pwm_get(struct device_node *np, const char *con_id);
+struct pwm_device *of_pwm_get(struct device *dev, struct device_node *np,
+			      const char *con_id);
 void pwm_put(struct pwm_device *pwm);
 
 struct pwm_device *devm_pwm_get(struct device *dev, const char *con_id);
@@ -493,7 +494,8 @@ static inline struct pwm_device *pwm_get(struct device *dev,
 	return ERR_PTR(-ENODEV);
 }
 
-static inline struct pwm_device *of_pwm_get(struct device_node *np,
+static inline struct pwm_device *of_pwm_get(struct device *dev,
+					    struct device_node *np,
 					    const char *con_id)
 {
 	return ERR_PTR(-ENODEV);
-- 
cgit v1.2.3


From 6282edb72bed5324352522d732080d4c1b9dfed6 Mon Sep 17 00:00:00 2001
From: Marek Szyprowski <m.szyprowski@samsung.com>
Date: Thu, 30 May 2019 12:50:43 +0200
Subject: clocksource/drivers/exynos_mct: Increase priority over ARM arch timer

Exynos SoCs based on CA7/CA15 have 2 timer interfaces: custom Exynos MCT
(Multi Core Timer) and standard ARM Architected Timers.

There are use cases where both timer interfaces are used simultaneously.
One of such examples is using Exynos MCT for the main system timer and
ARM Architected Timers for the KVM and virtualized guests (KVM requires
arch timers).

Exynos Multi-Core Timer driver (exynos_mct) must be however started
before ARM Architected Timers (arch_timer), because they both share some
common hardware blocks (global system counter) and turning on MCT is
needed to get ARM Architected Timer working properly.

To ensure selecting Exynos MCT as the main system timer, increase MCT
timer rating. To ensure proper starting order of both timers during
suspend/resume cycle, increase MCT hotplug priority over ARM Architected
Timers.

Signed-off-by: Marek Szyprowski <m.szyprowski@samsung.com>
Reviewed-by: Krzysztof Kozlowski <krzk@kernel.org>
Reviewed-by: Chanwoo Choi <cw00.choi@samsung.com>
Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org>
---
 drivers/clocksource/exynos_mct.c | 4 ++--
 include/linux/cpuhotplug.h       | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/clocksource/exynos_mct.c b/drivers/clocksource/exynos_mct.c
index 34bd250d46c6..6aa10cbc1d59 100644
--- a/drivers/clocksource/exynos_mct.c
+++ b/drivers/clocksource/exynos_mct.c
@@ -209,7 +209,7 @@ static void exynos4_frc_resume(struct clocksource *cs)
 
 static struct clocksource mct_frc = {
 	.name		= "mct-frc",
-	.rating		= 400,
+	.rating		= 450,	/* use value higher than ARM arch timer */
 	.read		= exynos4_frc_read,
 	.mask		= CLOCKSOURCE_MASK(32),
 	.flags		= CLOCK_SOURCE_IS_CONTINUOUS,
@@ -464,7 +464,7 @@ static int exynos4_mct_starting_cpu(unsigned int cpu)
 	evt->set_state_oneshot_stopped = set_state_shutdown;
 	evt->tick_resume = set_state_shutdown;
 	evt->features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT;
-	evt->rating = 450;
+	evt->rating = 500;	/* use value higher than ARM arch timer */
 
 	exynos4_mct_write(TICK_BASE_CNT, mevt->base + MCT_L_TCNTB_OFFSET);
 
diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h
index 5c6062206760..87c211adf49e 100644
--- a/include/linux/cpuhotplug.h
+++ b/include/linux/cpuhotplug.h
@@ -116,10 +116,10 @@ enum cpuhp_state {
 	CPUHP_AP_PERF_ARM_ACPI_STARTING,
 	CPUHP_AP_PERF_ARM_STARTING,
 	CPUHP_AP_ARM_L2X0_STARTING,
+	CPUHP_AP_EXYNOS4_MCT_TIMER_STARTING,
 	CPUHP_AP_ARM_ARCH_TIMER_STARTING,
 	CPUHP_AP_ARM_GLOBAL_TIMER_STARTING,
 	CPUHP_AP_JCORE_TIMER_STARTING,
-	CPUHP_AP_EXYNOS4_MCT_TIMER_STARTING,
 	CPUHP_AP_ARM_TWD_STARTING,
 	CPUHP_AP_QCOM_TIMER_STARTING,
 	CPUHP_AP_TEGRA_TIMER_STARTING,
-- 
cgit v1.2.3


From 0e58983de0d89f6ee75daab1b0ea918cfcf6ddbf Mon Sep 17 00:00:00 2001
From: Tal Gilboa <talgi@mellanox.com>
Date: Sun, 4 Nov 2018 19:07:02 +0200
Subject: linux/dim: Move logic to dim.h

In preparation for supporting more implementations of the DIM
algorithm, I'm moving what would become common logic to a common
library. Downstream DIM implementations will use the common lib
for their implementation.

Signed-off-by: Tal Gilboa <talgi@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 MAINTAINERS             |   1 +
 include/linux/dim.h     | 153 ++++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/net_dim.h | 148 +---------------------------------------------
 3 files changed, 156 insertions(+), 146 deletions(-)
 create mode 100644 include/linux/dim.h

(limited to 'include/linux')

diff --git a/MAINTAINERS b/MAINTAINERS
index 429c6c624861..5d4b852d9d39 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -5589,6 +5589,7 @@ DYNAMIC INTERRUPT MODERATION
 M:	Tal Gilboa <talgi@mellanox.com>
 S:	Maintained
 F:	include/linux/net_dim.h
+F:	include/linux/dim.h
 
 DZ DECSTATION DZ11 SERIAL DRIVER
 M:	"Maciej W. Rozycki" <macro@linux-mips.org>
diff --git a/include/linux/dim.h b/include/linux/dim.h
new file mode 100644
index 000000000000..67d7ca40f3dd
--- /dev/null
+++ b/include/linux/dim.h
@@ -0,0 +1,153 @@
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
+/* Copyright (c) 2019 Mellanox Technologies. */
+
+#ifndef DIM_H
+#define DIM_H
+
+#include <linux/module.h>
+
+#define NET_DIM_NEVENTS 64
+
+/* more than 10% difference */
+#define IS_SIGNIFICANT_DIFF(val, ref) \
+	(((100UL * abs((val) - (ref))) / (ref)) > 10)
+#define BIT_GAP(bits, end, start) ((((end) - (start)) + BIT_ULL(bits)) \
+& (BIT_ULL(bits) - 1))
+
+struct net_dim_cq_moder {
+	u16 usec;
+	u16 pkts;
+	u8 cq_period_mode;
+};
+
+struct net_dim_sample {
+	ktime_t time;
+	u32 pkt_ctr;
+	u32 byte_ctr;
+	u16 event_ctr;
+};
+
+struct net_dim_stats {
+	int ppms; /* packets per msec */
+	int bpms; /* bytes per msec */
+	int epms; /* events per msec */
+};
+
+struct net_dim { /* Dynamic Interrupt Moderation */
+	u8 state;
+	struct net_dim_stats prev_stats;
+	struct net_dim_sample start_sample;
+	struct work_struct work;
+	u8 profile_ix;
+	u8 mode;
+	u8 tune_state;
+	u8 steps_right;
+	u8 steps_left;
+	u8 tired;
+};
+
+enum {
+	NET_DIM_CQ_PERIOD_MODE_START_FROM_EQE = 0x0,
+	NET_DIM_CQ_PERIOD_MODE_START_FROM_CQE = 0x1,
+	NET_DIM_CQ_PERIOD_NUM_MODES
+};
+
+enum {
+	NET_DIM_START_MEASURE,
+	NET_DIM_MEASURE_IN_PROGRESS,
+	NET_DIM_APPLY_NEW_PROFILE,
+};
+
+enum {
+	NET_DIM_PARKING_ON_TOP,
+	NET_DIM_PARKING_TIRED,
+	NET_DIM_GOING_RIGHT,
+	NET_DIM_GOING_LEFT,
+};
+
+enum {
+	NET_DIM_STATS_WORSE,
+	NET_DIM_STATS_SAME,
+	NET_DIM_STATS_BETTER,
+};
+
+enum {
+	NET_DIM_STEPPED,
+	NET_DIM_TOO_TIRED,
+	NET_DIM_ON_EDGE,
+};
+
+static inline bool net_dim_on_top(struct net_dim *net_dim)
+{
+	switch (net_dim->tune_state) {
+	case NET_DIM_PARKING_ON_TOP:
+	case NET_DIM_PARKING_TIRED:
+		return true;
+	case NET_DIM_GOING_RIGHT:
+		return (net_dim->steps_left > 1) && (net_dim->steps_right == 1);
+	default: /* NET_DIM_GOING_LEFT */
+		return (net_dim->steps_right > 1) && (net_dim->steps_left == 1);
+	}
+}
+
+static inline void net_dim_turn(struct net_dim *net_dim)
+{
+	switch (net_dim->tune_state) {
+	case NET_DIM_PARKING_ON_TOP:
+	case NET_DIM_PARKING_TIRED:
+		break;
+	case NET_DIM_GOING_RIGHT:
+		net_dim->tune_state = NET_DIM_GOING_LEFT;
+		net_dim->steps_left = 0;
+		break;
+	case NET_DIM_GOING_LEFT:
+		net_dim->tune_state = NET_DIM_GOING_RIGHT;
+		net_dim->steps_right = 0;
+		break;
+	}
+}
+
+static inline void net_dim_park_on_top(struct net_dim *net_dim)
+{
+	net_dim->steps_right  = 0;
+	net_dim->steps_left   = 0;
+	net_dim->tired        = 0;
+	net_dim->tune_state   = NET_DIM_PARKING_ON_TOP;
+}
+
+static inline void net_dim_park_tired(struct net_dim *net_dim)
+{
+	net_dim->steps_right  = 0;
+	net_dim->steps_left   = 0;
+	net_dim->tune_state   = NET_DIM_PARKING_TIRED;
+}
+
+static inline void
+net_dim_sample(u16 event_ctr, u64 packets, u64 bytes, struct net_dim_sample *s)
+{
+	s->time	     = ktime_get();
+	s->pkt_ctr   = packets;
+	s->byte_ctr  = bytes;
+	s->event_ctr = event_ctr;
+}
+
+static inline void
+net_dim_calc_stats(struct net_dim_sample *start, struct net_dim_sample *end,
+		   struct net_dim_stats *curr_stats)
+{
+	/* u32 holds up to 71 minutes, should be enough */
+	u32 delta_us = ktime_us_delta(end->time, start->time);
+	u32 npkts = BIT_GAP(BITS_PER_TYPE(u32), end->pkt_ctr, start->pkt_ctr);
+	u32 nbytes = BIT_GAP(BITS_PER_TYPE(u32), end->byte_ctr,
+			     start->byte_ctr);
+
+	if (!delta_us)
+		return;
+
+	curr_stats->ppms = DIV_ROUND_UP(npkts * USEC_PER_MSEC, delta_us);
+	curr_stats->bpms = DIV_ROUND_UP(nbytes * USEC_PER_MSEC, delta_us);
+	curr_stats->epms = DIV_ROUND_UP(NET_DIM_NEVENTS * USEC_PER_MSEC,
+					delta_us);
+}
+
+#endif /* DIM_H */
diff --git a/include/linux/net_dim.h b/include/linux/net_dim.h
index fd458389f7d1..373cda74b167 100644
--- a/include/linux/net_dim.h
+++ b/include/linux/net_dim.h
@@ -35,73 +35,10 @@
 #define NET_DIM_H
 
 #include <linux/module.h>
-
-struct net_dim_cq_moder {
-	u16 usec;
-	u16 pkts;
-	u8 cq_period_mode;
-};
-
-struct net_dim_sample {
-	ktime_t time;
-	u32     pkt_ctr;
-	u32     byte_ctr;
-	u16     event_ctr;
-};
-
-struct net_dim_stats {
-	int ppms; /* packets per msec */
-	int bpms; /* bytes per msec */
-	int epms; /* events per msec */
-};
-
-struct net_dim { /* Adaptive Moderation */
-	u8                                      state;
-	struct net_dim_stats                    prev_stats;
-	struct net_dim_sample                   start_sample;
-	struct work_struct                      work;
-	u8                                      profile_ix;
-	u8                                      mode;
-	u8                                      tune_state;
-	u8                                      steps_right;
-	u8                                      steps_left;
-	u8                                      tired;
-};
-
-enum {
-	NET_DIM_CQ_PERIOD_MODE_START_FROM_EQE = 0x0,
-	NET_DIM_CQ_PERIOD_MODE_START_FROM_CQE = 0x1,
-	NET_DIM_CQ_PERIOD_NUM_MODES
-};
-
-/* Adaptive moderation logic */
-enum {
-	NET_DIM_START_MEASURE,
-	NET_DIM_MEASURE_IN_PROGRESS,
-	NET_DIM_APPLY_NEW_PROFILE,
-};
-
-enum {
-	NET_DIM_PARKING_ON_TOP,
-	NET_DIM_PARKING_TIRED,
-	NET_DIM_GOING_RIGHT,
-	NET_DIM_GOING_LEFT,
-};
-
-enum {
-	NET_DIM_STATS_WORSE,
-	NET_DIM_STATS_SAME,
-	NET_DIM_STATS_BETTER,
-};
-
-enum {
-	NET_DIM_STEPPED,
-	NET_DIM_TOO_TIRED,
-	NET_DIM_ON_EDGE,
-};
+#include <linux/dim.h>
 
 #define NET_DIM_PARAMS_NUM_PROFILES 5
-/* Adaptive moderation profiles */
+/* Netdev dynamic interrupt moderation profiles */
 #define NET_DIM_DEFAULT_RX_CQ_MODERATION_PKTS_FROM_EQE 256
 #define NET_DIM_DEFAULT_TX_CQ_MODERATION_PKTS_FROM_EQE 128
 #define NET_DIM_DEF_PROFILE_CQE 1
@@ -188,36 +125,6 @@ net_dim_get_def_tx_moderation(u8 cq_period_mode)
 	return net_dim_get_tx_moderation(cq_period_mode, profile_ix);
 }
 
-static inline bool net_dim_on_top(struct net_dim *dim)
-{
-	switch (dim->tune_state) {
-	case NET_DIM_PARKING_ON_TOP:
-	case NET_DIM_PARKING_TIRED:
-		return true;
-	case NET_DIM_GOING_RIGHT:
-		return (dim->steps_left > 1) && (dim->steps_right == 1);
-	default: /* NET_DIM_GOING_LEFT */
-		return (dim->steps_right > 1) && (dim->steps_left == 1);
-	}
-}
-
-static inline void net_dim_turn(struct net_dim *dim)
-{
-	switch (dim->tune_state) {
-	case NET_DIM_PARKING_ON_TOP:
-	case NET_DIM_PARKING_TIRED:
-		break;
-	case NET_DIM_GOING_RIGHT:
-		dim->tune_state = NET_DIM_GOING_LEFT;
-		dim->steps_left = 0;
-		break;
-	case NET_DIM_GOING_LEFT:
-		dim->tune_state = NET_DIM_GOING_RIGHT;
-		dim->steps_right = 0;
-		break;
-	}
-}
-
 static inline int net_dim_step(struct net_dim *dim)
 {
 	if (dim->tired == (NET_DIM_PARAMS_NUM_PROFILES * 2))
@@ -245,21 +152,6 @@ static inline int net_dim_step(struct net_dim *dim)
 	return NET_DIM_STEPPED;
 }
 
-static inline void net_dim_park_on_top(struct net_dim *dim)
-{
-	dim->steps_right  = 0;
-	dim->steps_left   = 0;
-	dim->tired        = 0;
-	dim->tune_state   = NET_DIM_PARKING_ON_TOP;
-}
-
-static inline void net_dim_park_tired(struct net_dim *dim)
-{
-	dim->steps_right  = 0;
-	dim->steps_left   = 0;
-	dim->tune_state   = NET_DIM_PARKING_TIRED;
-}
-
 static inline void net_dim_exit_parking(struct net_dim *dim)
 {
 	dim->tune_state = dim->profile_ix ? NET_DIM_GOING_LEFT :
@@ -267,9 +159,6 @@ static inline void net_dim_exit_parking(struct net_dim *dim)
 	net_dim_step(dim);
 }
 
-#define IS_SIGNIFICANT_DIFF(val, ref) \
-	(((100UL * abs((val) - (ref))) / (ref)) > 10) /* more than 10% difference */
-
 static inline int net_dim_stats_compare(struct net_dim_stats *curr,
 					struct net_dim_stats *prev)
 {
@@ -351,39 +240,6 @@ static inline bool net_dim_decision(struct net_dim_stats *curr_stats,
 	return dim->profile_ix != prev_ix;
 }
 
-static inline void net_dim_sample(u16 event_ctr,
-				  u64 packets,
-				  u64 bytes,
-				  struct net_dim_sample *s)
-{
-	s->time	     = ktime_get();
-	s->pkt_ctr   = packets;
-	s->byte_ctr  = bytes;
-	s->event_ctr = event_ctr;
-}
-
-#define NET_DIM_NEVENTS 64
-#define BIT_GAP(bits, end, start) ((((end) - (start)) + BIT_ULL(bits)) & (BIT_ULL(bits) - 1))
-
-static inline void net_dim_calc_stats(struct net_dim_sample *start,
-				      struct net_dim_sample *end,
-				      struct net_dim_stats *curr_stats)
-{
-	/* u32 holds up to 71 minutes, should be enough */
-	u32 delta_us = ktime_us_delta(end->time, start->time);
-	u32 npkts = BIT_GAP(BITS_PER_TYPE(u32), end->pkt_ctr, start->pkt_ctr);
-	u32 nbytes = BIT_GAP(BITS_PER_TYPE(u32), end->byte_ctr,
-			     start->byte_ctr);
-
-	if (!delta_us)
-		return;
-
-	curr_stats->ppms = DIV_ROUND_UP(npkts * USEC_PER_MSEC, delta_us);
-	curr_stats->bpms = DIV_ROUND_UP(nbytes * USEC_PER_MSEC, delta_us);
-	curr_stats->epms = DIV_ROUND_UP(NET_DIM_NEVENTS * USEC_PER_MSEC,
-					delta_us);
-}
-
 static inline void net_dim(struct net_dim *dim,
 			   struct net_dim_sample end_sample)
 {
-- 
cgit v1.2.3


From 449986ea92412727e8c553eaa5c8d3ed884253c4 Mon Sep 17 00:00:00 2001
From: Tal Gilboa <talgi@mellanox.com>
Date: Mon, 5 Nov 2018 11:57:10 +0200
Subject: linux/dim: Remove "net" prefix from internal DIM members

Only renaming functions and structs which aren't used by external code.

Signed-off-by: Tal Gilboa <talgi@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 include/linux/dim.h     | 86 ++++++++++++++++++++++++------------------------
 include/linux/net_dim.h | 87 ++++++++++++++++++++++++-------------------------
 2 files changed, 86 insertions(+), 87 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/dim.h b/include/linux/dim.h
index 67d7ca40f3dd..6ee991681d62 100644
--- a/include/linux/dim.h
+++ b/include/linux/dim.h
@@ -6,7 +6,7 @@
 
 #include <linux/module.h>
 
-#define NET_DIM_NEVENTS 64
+#define DIM_NEVENTS 64
 
 /* more than 10% difference */
 #define IS_SIGNIFICANT_DIFF(val, ref) \
@@ -27,7 +27,7 @@ struct net_dim_sample {
 	u16 event_ctr;
 };
 
-struct net_dim_stats {
+struct dim_stats {
 	int ppms; /* packets per msec */
 	int bpms; /* bytes per msec */
 	int epms; /* events per msec */
@@ -35,7 +35,7 @@ struct net_dim_stats {
 
 struct net_dim { /* Dynamic Interrupt Moderation */
 	u8 state;
-	struct net_dim_stats prev_stats;
+	struct dim_stats prev_stats;
 	struct net_dim_sample start_sample;
 	struct work_struct work;
 	u8 profile_ix;
@@ -59,67 +59,67 @@ enum {
 };
 
 enum {
-	NET_DIM_PARKING_ON_TOP,
-	NET_DIM_PARKING_TIRED,
-	NET_DIM_GOING_RIGHT,
-	NET_DIM_GOING_LEFT,
+	DIM_PARKING_ON_TOP,
+	DIM_PARKING_TIRED,
+	DIM_GOING_RIGHT,
+	DIM_GOING_LEFT,
 };
 
 enum {
-	NET_DIM_STATS_WORSE,
-	NET_DIM_STATS_SAME,
-	NET_DIM_STATS_BETTER,
+	DIM_STATS_WORSE,
+	DIM_STATS_SAME,
+	DIM_STATS_BETTER,
 };
 
 enum {
-	NET_DIM_STEPPED,
-	NET_DIM_TOO_TIRED,
-	NET_DIM_ON_EDGE,
+	DIM_STEPPED,
+	DIM_TOO_TIRED,
+	DIM_ON_EDGE,
 };
 
-static inline bool net_dim_on_top(struct net_dim *net_dim)
+static inline bool dim_on_top(struct net_dim *dim)
 {
-	switch (net_dim->tune_state) {
-	case NET_DIM_PARKING_ON_TOP:
-	case NET_DIM_PARKING_TIRED:
+	switch (dim->tune_state) {
+	case DIM_PARKING_ON_TOP:
+	case DIM_PARKING_TIRED:
 		return true;
-	case NET_DIM_GOING_RIGHT:
-		return (net_dim->steps_left > 1) && (net_dim->steps_right == 1);
-	default: /* NET_DIM_GOING_LEFT */
-		return (net_dim->steps_right > 1) && (net_dim->steps_left == 1);
+	case DIM_GOING_RIGHT:
+		return (dim->steps_left > 1) && (dim->steps_right == 1);
+	default: /* DIM_GOING_LEFT */
+		return (dim->steps_right > 1) && (dim->steps_left == 1);
 	}
 }
 
-static inline void net_dim_turn(struct net_dim *net_dim)
+static inline void dim_turn(struct net_dim *dim)
 {
-	switch (net_dim->tune_state) {
-	case NET_DIM_PARKING_ON_TOP:
-	case NET_DIM_PARKING_TIRED:
+	switch (dim->tune_state) {
+	case DIM_PARKING_ON_TOP:
+	case DIM_PARKING_TIRED:
 		break;
-	case NET_DIM_GOING_RIGHT:
-		net_dim->tune_state = NET_DIM_GOING_LEFT;
-		net_dim->steps_left = 0;
+	case DIM_GOING_RIGHT:
+		dim->tune_state = DIM_GOING_LEFT;
+		dim->steps_left = 0;
 		break;
-	case NET_DIM_GOING_LEFT:
-		net_dim->tune_state = NET_DIM_GOING_RIGHT;
-		net_dim->steps_right = 0;
+	case DIM_GOING_LEFT:
+		dim->tune_state = DIM_GOING_RIGHT;
+		dim->steps_right = 0;
 		break;
 	}
 }
 
-static inline void net_dim_park_on_top(struct net_dim *net_dim)
+static inline void dim_park_on_top(struct net_dim *dim)
 {
-	net_dim->steps_right  = 0;
-	net_dim->steps_left   = 0;
-	net_dim->tired        = 0;
-	net_dim->tune_state   = NET_DIM_PARKING_ON_TOP;
+	dim->steps_right  = 0;
+	dim->steps_left   = 0;
+	dim->tired        = 0;
+	dim->tune_state   = DIM_PARKING_ON_TOP;
 }
 
-static inline void net_dim_park_tired(struct net_dim *net_dim)
+static inline void dim_park_tired(struct net_dim *dim)
 {
-	net_dim->steps_right  = 0;
-	net_dim->steps_left   = 0;
-	net_dim->tune_state   = NET_DIM_PARKING_TIRED;
+	dim->steps_right  = 0;
+	dim->steps_left   = 0;
+	dim->tune_state   = DIM_PARKING_TIRED;
 }
 
 static inline void
@@ -132,8 +132,8 @@ net_dim_sample(u16 event_ctr, u64 packets, u64 bytes, struct net_dim_sample *s)
 }
 
 static inline void
-net_dim_calc_stats(struct net_dim_sample *start, struct net_dim_sample *end,
-		   struct net_dim_stats *curr_stats)
+dim_calc_stats(struct net_dim_sample *start, struct net_dim_sample *end,
+	       struct dim_stats *curr_stats)
 {
 	/* u32 holds up to 71 minutes, should be enough */
 	u32 delta_us = ktime_us_delta(end->time, start->time);
@@ -146,7 +146,7 @@ net_dim_calc_stats(struct net_dim_sample *start, struct net_dim_sample *end,
 
 	curr_stats->ppms = DIV_ROUND_UP(npkts * USEC_PER_MSEC, delta_us);
 	curr_stats->bpms = DIV_ROUND_UP(nbytes * USEC_PER_MSEC, delta_us);
-	curr_stats->epms = DIV_ROUND_UP(NET_DIM_NEVENTS * USEC_PER_MSEC,
+	curr_stats->epms = DIV_ROUND_UP(DIM_NEVENTS * USEC_PER_MSEC,
 					delta_us);
 }
 
diff --git a/include/linux/net_dim.h b/include/linux/net_dim.h
index 373cda74b167..f89fa4fdfb46 100644
--- a/include/linux/net_dim.h
+++ b/include/linux/net_dim.h
@@ -128,67 +128,67 @@ net_dim_get_def_tx_moderation(u8 cq_period_mode)
 static inline int net_dim_step(struct net_dim *dim)
 {
 	if (dim->tired == (NET_DIM_PARAMS_NUM_PROFILES * 2))
-		return NET_DIM_TOO_TIRED;
+		return DIM_TOO_TIRED;
 
 	switch (dim->tune_state) {
-	case NET_DIM_PARKING_ON_TOP:
-	case NET_DIM_PARKING_TIRED:
+	case DIM_PARKING_ON_TOP:
+	case DIM_PARKING_TIRED:
 		break;
-	case NET_DIM_GOING_RIGHT:
+	case DIM_GOING_RIGHT:
 		if (dim->profile_ix == (NET_DIM_PARAMS_NUM_PROFILES - 1))
-			return NET_DIM_ON_EDGE;
+			return DIM_ON_EDGE;
 		dim->profile_ix++;
 		dim->steps_right++;
 		break;
-	case NET_DIM_GOING_LEFT:
+	case DIM_GOING_LEFT:
 		if (dim->profile_ix == 0)
-			return NET_DIM_ON_EDGE;
+			return DIM_ON_EDGE;
 		dim->profile_ix--;
 		dim->steps_left++;
 		break;
 	}
 
 	dim->tired++;
-	return NET_DIM_STEPPED;
+	return DIM_STEPPED;
 }
 
 static inline void net_dim_exit_parking(struct net_dim *dim)
 {
-	dim->tune_state = dim->profile_ix ? NET_DIM_GOING_LEFT :
-					  NET_DIM_GOING_RIGHT;
+	dim->tune_state = dim->profile_ix ? DIM_GOING_LEFT :
+					  DIM_GOING_RIGHT;
 	net_dim_step(dim);
 }
 
-static inline int net_dim_stats_compare(struct net_dim_stats *curr,
-					struct net_dim_stats *prev)
+static inline int net_dim_stats_compare(struct dim_stats *curr,
+					struct dim_stats *prev)
 {
 	if (!prev->bpms)
-		return curr->bpms ? NET_DIM_STATS_BETTER :
-				    NET_DIM_STATS_SAME;
+		return curr->bpms ? DIM_STATS_BETTER :
+				    DIM_STATS_SAME;
 
 	if (IS_SIGNIFICANT_DIFF(curr->bpms, prev->bpms))
-		return (curr->bpms > prev->bpms) ? NET_DIM_STATS_BETTER :
-						   NET_DIM_STATS_WORSE;
+		return (curr->bpms > prev->bpms) ? DIM_STATS_BETTER :
+						   DIM_STATS_WORSE;
 
 	if (!prev->ppms)
-		return curr->ppms ? NET_DIM_STATS_BETTER :
-				    NET_DIM_STATS_SAME;
+		return curr->ppms ? DIM_STATS_BETTER :
+				    DIM_STATS_SAME;
 
 	if (IS_SIGNIFICANT_DIFF(curr->ppms, prev->ppms))
-		return (curr->ppms > prev->ppms) ? NET_DIM_STATS_BETTER :
-						   NET_DIM_STATS_WORSE;
+		return (curr->ppms > prev->ppms) ? DIM_STATS_BETTER :
+						   DIM_STATS_WORSE;
 
 	if (!prev->epms)
-		return NET_DIM_STATS_SAME;
+		return DIM_STATS_SAME;
 
 	if (IS_SIGNIFICANT_DIFF(curr->epms, prev->epms))
-		return (curr->epms < prev->epms) ? NET_DIM_STATS_BETTER :
-						   NET_DIM_STATS_WORSE;
+		return (curr->epms < prev->epms) ? DIM_STATS_BETTER :
+						   DIM_STATS_WORSE;
 
-	return NET_DIM_STATS_SAME;
+	return DIM_STATS_SAME;
 }
 
-static inline bool net_dim_decision(struct net_dim_stats *curr_stats,
+static inline bool net_dim_decision(struct dim_stats *curr_stats,
 				    struct net_dim *dim)
 {
 	int prev_state = dim->tune_state;
@@ -197,44 +197,44 @@ static inline bool net_dim_decision(struct net_dim_stats *curr_stats,
 	int step_res;
 
 	switch (dim->tune_state) {
-	case NET_DIM_PARKING_ON_TOP:
+	case DIM_PARKING_ON_TOP:
 		stats_res = net_dim_stats_compare(curr_stats, &dim->prev_stats);
-		if (stats_res != NET_DIM_STATS_SAME)
+		if (stats_res != DIM_STATS_SAME)
 			net_dim_exit_parking(dim);
 		break;
 
-	case NET_DIM_PARKING_TIRED:
+	case DIM_PARKING_TIRED:
 		dim->tired--;
 		if (!dim->tired)
 			net_dim_exit_parking(dim);
 		break;
 
-	case NET_DIM_GOING_RIGHT:
-	case NET_DIM_GOING_LEFT:
+	case DIM_GOING_RIGHT:
+	case DIM_GOING_LEFT:
 		stats_res = net_dim_stats_compare(curr_stats, &dim->prev_stats);
-		if (stats_res != NET_DIM_STATS_BETTER)
-			net_dim_turn(dim);
+		if (stats_res != DIM_STATS_BETTER)
+			dim_turn(dim);
 
-		if (net_dim_on_top(dim)) {
-			net_dim_park_on_top(dim);
+		if (dim_on_top(dim)) {
+			dim_park_on_top(dim);
 			break;
 		}
 
 		step_res = net_dim_step(dim);
 		switch (step_res) {
-		case NET_DIM_ON_EDGE:
-			net_dim_park_on_top(dim);
+		case DIM_ON_EDGE:
+			dim_park_on_top(dim);
 			break;
-		case NET_DIM_TOO_TIRED:
-			net_dim_park_tired(dim);
+		case DIM_TOO_TIRED:
+			dim_park_tired(dim);
 			break;
 		}
 
 		break;
 	}
 
-	if ((prev_state      != NET_DIM_PARKING_ON_TOP) ||
-	    (dim->tune_state != NET_DIM_PARKING_ON_TOP))
+	if (prev_state != DIM_PARKING_ON_TOP ||
+	    dim->tune_state != DIM_PARKING_ON_TOP)
 		dim->prev_stats = *curr_stats;
 
 	return dim->profile_ix != prev_ix;
@@ -243,7 +243,7 @@ static inline bool net_dim_decision(struct net_dim_stats *curr_stats,
 static inline void net_dim(struct net_dim *dim,
 			   struct net_dim_sample end_sample)
 {
-	struct net_dim_stats curr_stats;
+	struct dim_stats curr_stats;
 	u16 nevents;
 
 	switch (dim->state) {
@@ -251,10 +251,9 @@ static inline void net_dim(struct net_dim *dim,
 		nevents = BIT_GAP(BITS_PER_TYPE(u16),
 				  end_sample.event_ctr,
 				  dim->start_sample.event_ctr);
-		if (nevents < NET_DIM_NEVENTS)
+		if (nevents < DIM_NEVENTS)
 			break;
-		net_dim_calc_stats(&dim->start_sample, &end_sample,
-				   &curr_stats);
+		dim_calc_stats(&dim->start_sample, &end_sample, &curr_stats);
 		if (net_dim_decision(&curr_stats, dim)) {
 			dim->state = NET_DIM_APPLY_NEW_PROFILE;
 			schedule_work(&dim->work);
-- 
cgit v1.2.3


From c002bd529d719858d4cc233431c88c9efa844053 Mon Sep 17 00:00:00 2001
From: Tal Gilboa <talgi@mellanox.com>
Date: Mon, 5 Nov 2018 12:07:52 +0200
Subject: linux/dim: Rename externally exposed macros

Renamed macros in use by external drivers.

Signed-off-by: Tal Gilboa <talgi@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/broadcom/bcmsysport.c        |  4 ++--
 drivers/net/ethernet/broadcom/bnxt/bnxt.c         |  2 +-
 drivers/net/ethernet/broadcom/bnxt/bnxt_dim.c     |  2 +-
 drivers/net/ethernet/broadcom/genet/bcmgenet.c    |  4 ++--
 drivers/net/ethernet/mellanox/mlx5/core/en_dim.c  |  2 +-
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 10 +++++-----
 include/linux/dim.h                               | 12 ++++++------
 include/linux/net_dim.h                           | 18 +++++++++---------
 8 files changed, 27 insertions(+), 27 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/broadcom/bcmsysport.c b/drivers/net/ethernet/broadcom/bcmsysport.c
index c623896e3ccb..b5e2f9d2cb71 100644
--- a/drivers/net/ethernet/broadcom/bcmsysport.c
+++ b/drivers/net/ethernet/broadcom/bcmsysport.c
@@ -1099,7 +1099,7 @@ static void bcm_sysport_dim_work(struct work_struct *work)
 			net_dim_get_rx_moderation(dim->mode, dim->profile_ix);
 
 	bcm_sysport_set_rx_coalesce(priv, cur_profile.usec, cur_profile.pkts);
-	dim->state = NET_DIM_START_MEASURE;
+	dim->state = DIM_START_MEASURE;
 }
 
 /* RX and misc interrupt routine */
@@ -1440,7 +1440,7 @@ static void bcm_sysport_init_dim(struct bcm_sysport_priv *priv,
 	struct bcm_sysport_net_dim *dim = &priv->dim;
 
 	INIT_WORK(&dim->dim.work, cb);
-	dim->dim.mode = NET_DIM_CQ_PERIOD_MODE_START_FROM_EQE;
+	dim->dim.mode = DIM_CQ_PERIOD_MODE_START_FROM_EQE;
 	dim->event_ctr = 0;
 	dim->packets = 0;
 	dim->bytes = 0;
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
index 8314c00d7537..49de873043c0 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
@@ -7810,7 +7810,7 @@ static void bnxt_enable_napi(struct bnxt *bp)
 
 		if (bp->bnapi[i]->rx_ring) {
 			INIT_WORK(&cpr->dim.work, bnxt_dim_work);
-			cpr->dim.mode = NET_DIM_CQ_PERIOD_MODE_START_FROM_EQE;
+			cpr->dim.mode = DIM_CQ_PERIOD_MODE_START_FROM_EQE;
 		}
 		napi_enable(&bp->bnapi[i]->napi);
 	}
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_dim.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_dim.c
index afa97c8bb081..16a4588709d1 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt_dim.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_dim.c
@@ -28,5 +28,5 @@ void bnxt_dim_work(struct work_struct *work)
 	cpr->rx_ring_coal.coal_bufs = cur_moder.pkts;
 
 	bnxt_hwrm_set_ring_coal(bnapi->bp, bnapi);
-	dim->state = NET_DIM_START_MEASURE;
+	dim->state = DIM_START_MEASURE;
 }
diff --git a/drivers/net/ethernet/broadcom/genet/bcmgenet.c b/drivers/net/ethernet/broadcom/genet/bcmgenet.c
index 374b9ff05c88..5286a46ecfb0 100644
--- a/drivers/net/ethernet/broadcom/genet/bcmgenet.c
+++ b/drivers/net/ethernet/broadcom/genet/bcmgenet.c
@@ -1928,7 +1928,7 @@ static void bcmgenet_dim_work(struct work_struct *work)
 			net_dim_get_rx_moderation(dim->mode, dim->profile_ix);
 
 	bcmgenet_set_rx_coalesce(ring, cur_profile.usec, cur_profile.pkts);
-	dim->state = NET_DIM_START_MEASURE;
+	dim->state = DIM_START_MEASURE;
 }
 
 /* Assign skb to RX DMA descriptor. */
@@ -2085,7 +2085,7 @@ static void bcmgenet_init_dim(struct bcmgenet_rx_ring *ring,
 	struct bcmgenet_net_dim *dim = &ring->dim;
 
 	INIT_WORK(&dim->dim.work, cb);
-	dim->dim.mode = NET_DIM_CQ_PERIOD_MODE_START_FROM_EQE;
+	dim->dim.mode = DIM_CQ_PERIOD_MODE_START_FROM_EQE;
 	dim->event_ctr = 0;
 	dim->packets = 0;
 	dim->bytes = 0;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_dim.c b/drivers/net/ethernet/mellanox/mlx5/core/en_dim.c
index d67adf70a97b..a80303add7c0 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_dim.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_dim.c
@@ -38,7 +38,7 @@ mlx5e_complete_dim_work(struct net_dim *dim, struct net_dim_cq_moder moder,
 			struct mlx5_core_dev *mdev, struct mlx5_core_cq *mcq)
 {
 	mlx5_core_modify_cq_moderation(mdev, mcq, moder.usec, moder.pkts);
-	dim->state = NET_DIM_START_MEASURE;
+	dim->state = DIM_START_MEASURE;
 }
 
 void mlx5e_rx_dim_work(struct work_struct *work)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index 457cc39423f2..5b89e992e482 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -584,11 +584,11 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c,
 
 	switch (params->rx_cq_moderation.cq_period_mode) {
 	case MLX5_CQ_PERIOD_MODE_START_FROM_CQE:
-		rq->dim.mode = NET_DIM_CQ_PERIOD_MODE_START_FROM_CQE;
+		rq->dim.mode = DIM_CQ_PERIOD_MODE_START_FROM_CQE;
 		break;
 	case MLX5_CQ_PERIOD_MODE_START_FROM_EQE:
 	default:
-		rq->dim.mode = NET_DIM_CQ_PERIOD_MODE_START_FROM_EQE;
+		rq->dim.mode = DIM_CQ_PERIOD_MODE_START_FROM_EQE;
 	}
 
 	rq->page_cache.head = 0;
@@ -2151,7 +2151,7 @@ static void mlx5e_build_ico_cq_param(struct mlx5e_priv *priv,
 
 	mlx5e_build_common_cq_param(priv, param);
 
-	param->cq_period_mode = NET_DIM_CQ_PERIOD_MODE_START_FROM_EQE;
+	param->cq_period_mode = DIM_CQ_PERIOD_MODE_START_FROM_EQE;
 }
 
 static void mlx5e_build_icosq_param(struct mlx5e_priv *priv,
@@ -4440,8 +4440,8 @@ static struct net_dim_cq_moder mlx5e_get_def_rx_moderation(u8 cq_period_mode)
 static u8 mlx5_to_net_dim_cq_period_mode(u8 cq_period_mode)
 {
 	return cq_period_mode == MLX5_CQ_PERIOD_MODE_START_FROM_CQE ?
-		NET_DIM_CQ_PERIOD_MODE_START_FROM_CQE :
-		NET_DIM_CQ_PERIOD_MODE_START_FROM_EQE;
+		DIM_CQ_PERIOD_MODE_START_FROM_CQE :
+		DIM_CQ_PERIOD_MODE_START_FROM_EQE;
 }
 
 void mlx5e_set_tx_cq_mode_params(struct mlx5e_params *params, u8 cq_period_mode)
diff --git a/include/linux/dim.h b/include/linux/dim.h
index 6ee991681d62..989dbbdf9d45 100644
--- a/include/linux/dim.h
+++ b/include/linux/dim.h
@@ -47,15 +47,15 @@ struct net_dim { /* Dynamic Interrupt Moderation */
 };
 
 enum {
-	NET_DIM_CQ_PERIOD_MODE_START_FROM_EQE = 0x0,
-	NET_DIM_CQ_PERIOD_MODE_START_FROM_CQE = 0x1,
-	NET_DIM_CQ_PERIOD_NUM_MODES
+	DIM_CQ_PERIOD_MODE_START_FROM_EQE = 0x0,
+	DIM_CQ_PERIOD_MODE_START_FROM_CQE = 0x1,
+	DIM_CQ_PERIOD_NUM_MODES
 };
 
 enum {
-	NET_DIM_START_MEASURE,
-	NET_DIM_MEASURE_IN_PROGRESS,
-	NET_DIM_APPLY_NEW_PROFILE,
+	DIM_START_MEASURE,
+	DIM_MEASURE_IN_PROGRESS,
+	DIM_APPLY_NEW_PROFILE,
 };
 
 enum {
diff --git a/include/linux/net_dim.h b/include/linux/net_dim.h
index f89fa4fdfb46..e0c97f824dd0 100644
--- a/include/linux/net_dim.h
+++ b/include/linux/net_dim.h
@@ -78,13 +78,13 @@
 }
 
 static const struct net_dim_cq_moder
-rx_profile[NET_DIM_CQ_PERIOD_NUM_MODES][NET_DIM_PARAMS_NUM_PROFILES] = {
+rx_profile[DIM_CQ_PERIOD_NUM_MODES][NET_DIM_PARAMS_NUM_PROFILES] = {
 	NET_DIM_RX_EQE_PROFILES,
 	NET_DIM_RX_CQE_PROFILES,
 };
 
 static const struct net_dim_cq_moder
-tx_profile[NET_DIM_CQ_PERIOD_NUM_MODES][NET_DIM_PARAMS_NUM_PROFILES] = {
+tx_profile[DIM_CQ_PERIOD_NUM_MODES][NET_DIM_PARAMS_NUM_PROFILES] = {
 	NET_DIM_TX_EQE_PROFILES,
 	NET_DIM_TX_CQE_PROFILES,
 };
@@ -101,7 +101,7 @@ net_dim_get_rx_moderation(u8 cq_period_mode, int ix)
 static inline struct net_dim_cq_moder
 net_dim_get_def_rx_moderation(u8 cq_period_mode)
 {
-	u8 profile_ix = cq_period_mode == NET_DIM_CQ_PERIOD_MODE_START_FROM_CQE ?
+	u8 profile_ix = cq_period_mode == DIM_CQ_PERIOD_MODE_START_FROM_CQE ?
 			NET_DIM_DEF_PROFILE_CQE : NET_DIM_DEF_PROFILE_EQE;
 
 	return net_dim_get_rx_moderation(cq_period_mode, profile_ix);
@@ -119,7 +119,7 @@ net_dim_get_tx_moderation(u8 cq_period_mode, int ix)
 static inline struct net_dim_cq_moder
 net_dim_get_def_tx_moderation(u8 cq_period_mode)
 {
-	u8 profile_ix = cq_period_mode == NET_DIM_CQ_PERIOD_MODE_START_FROM_CQE ?
+	u8 profile_ix = cq_period_mode == DIM_CQ_PERIOD_MODE_START_FROM_CQE ?
 			NET_DIM_DEF_PROFILE_CQE : NET_DIM_DEF_PROFILE_EQE;
 
 	return net_dim_get_tx_moderation(cq_period_mode, profile_ix);
@@ -247,7 +247,7 @@ static inline void net_dim(struct net_dim *dim,
 	u16 nevents;
 
 	switch (dim->state) {
-	case NET_DIM_MEASURE_IN_PROGRESS:
+	case DIM_MEASURE_IN_PROGRESS:
 		nevents = BIT_GAP(BITS_PER_TYPE(u16),
 				  end_sample.event_ctr,
 				  dim->start_sample.event_ctr);
@@ -255,17 +255,17 @@ static inline void net_dim(struct net_dim *dim,
 			break;
 		dim_calc_stats(&dim->start_sample, &end_sample, &curr_stats);
 		if (net_dim_decision(&curr_stats, dim)) {
-			dim->state = NET_DIM_APPLY_NEW_PROFILE;
+			dim->state = DIM_APPLY_NEW_PROFILE;
 			schedule_work(&dim->work);
 			break;
 		}
 		/* fall through */
-	case NET_DIM_START_MEASURE:
+	case DIM_START_MEASURE:
 		net_dim_sample(end_sample.event_ctr, end_sample.pkt_ctr, end_sample.byte_ctr,
 			       &dim->start_sample);
-		dim->state = NET_DIM_MEASURE_IN_PROGRESS;
+		dim->state = DIM_MEASURE_IN_PROGRESS;
 		break;
-	case NET_DIM_APPLY_NEW_PROFILE:
+	case DIM_APPLY_NEW_PROFILE:
 		break;
 	}
 }
-- 
cgit v1.2.3


From e5b6ab02d7aa4118c9a36491633812dcc442acbe Mon Sep 17 00:00:00 2001
From: Tal Gilboa <talgi@mellanox.com>
Date: Mon, 14 Jan 2019 15:32:49 +0200
Subject: linux/dim: Rename net_dim_sample() to net_dim_update_sample()

Rename the function in order to avoid confusion with the similarly
named struct. This is also in preparation for removing the 'net' prefix
from dim members.

Signed-off-by: Tal Gilboa <talgi@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/broadcom/bcmsysport.c        | 4 ++--
 drivers/net/ethernet/broadcom/bnxt/bnxt.c         | 8 ++++----
 drivers/net/ethernet/broadcom/genet/bcmgenet.c    | 4 ++--
 drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c | 6 ++----
 include/linux/dim.h                               | 3 ++-
 include/linux/net_dim.h                           | 4 ++--
 6 files changed, 14 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/broadcom/bcmsysport.c b/drivers/net/ethernet/broadcom/bcmsysport.c
index b5e2f9d2cb71..faaf8ade15e5 100644
--- a/drivers/net/ethernet/broadcom/bcmsysport.c
+++ b/drivers/net/ethernet/broadcom/bcmsysport.c
@@ -1019,8 +1019,8 @@ static int bcm_sysport_poll(struct napi_struct *napi, int budget)
 	}
 
 	if (priv->dim.use_dim) {
-		net_dim_sample(priv->dim.event_ctr, priv->dim.packets,
-			       priv->dim.bytes, &dim_sample);
+		net_dim_update_sample(priv->dim.event_ctr, priv->dim.packets,
+				      priv->dim.bytes, &dim_sample);
 		net_dim(&priv->dim.dim, dim_sample);
 	}
 
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
index 49de873043c0..eaec949c367a 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
@@ -2130,10 +2130,10 @@ static int bnxt_poll(struct napi_struct *napi, int budget)
 	if (bp->flags & BNXT_FLAG_DIM) {
 		struct net_dim_sample dim_sample;
 
-		net_dim_sample(cpr->event_ctr,
-			       cpr->rx_packets,
-			       cpr->rx_bytes,
-			       &dim_sample);
+		net_dim_update_sample(cpr->event_ctr,
+				      cpr->rx_packets,
+				      cpr->rx_bytes,
+				      &dim_sample);
 		net_dim(&cpr->dim, dim_sample);
 	}
 	return work_done;
diff --git a/drivers/net/ethernet/broadcom/genet/bcmgenet.c b/drivers/net/ethernet/broadcom/genet/bcmgenet.c
index 5286a46ecfb0..297ae786ffed 100644
--- a/drivers/net/ethernet/broadcom/genet/bcmgenet.c
+++ b/drivers/net/ethernet/broadcom/genet/bcmgenet.c
@@ -1909,8 +1909,8 @@ static int bcmgenet_rx_poll(struct napi_struct *napi, int budget)
 	}
 
 	if (ring->dim.use_dim) {
-		net_dim_sample(ring->dim.event_ctr, ring->dim.packets,
-			       ring->dim.bytes, &dim_sample);
+		net_dim_update_sample(ring->dim.event_ctr, ring->dim.packets,
+				      ring->dim.bytes, &dim_sample);
 		net_dim(&ring->dim.dim, dim_sample);
 	}
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c
index f9862bf75491..07432e6428cf 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c
@@ -53,8 +53,7 @@ static void mlx5e_handle_tx_dim(struct mlx5e_txqsq *sq)
 	if (unlikely(!test_bit(MLX5E_SQ_STATE_AM, &sq->state)))
 		return;
 
-	net_dim_sample(sq->cq.event_ctr, stats->packets, stats->bytes,
-		       &dim_sample);
+	net_dim_update_sample(sq->cq.event_ctr, stats->packets, stats->bytes, &dim_sample);
 	net_dim(&sq->dim, dim_sample);
 }
 
@@ -66,8 +65,7 @@ static void mlx5e_handle_rx_dim(struct mlx5e_rq *rq)
 	if (unlikely(!test_bit(MLX5E_RQ_STATE_AM, &rq->state)))
 		return;
 
-	net_dim_sample(rq->cq.event_ctr, stats->packets, stats->bytes,
-		       &dim_sample);
+	net_dim_update_sample(rq->cq.event_ctr, stats->packets, stats->bytes, &dim_sample);
 	net_dim(&rq->dim, dim_sample);
 }
 
diff --git a/include/linux/dim.h b/include/linux/dim.h
index 989dbbdf9d45..f0f20ed25497 100644
--- a/include/linux/dim.h
+++ b/include/linux/dim.h
@@ -123,7 +123,8 @@ static inline void dim_park_tired(struct net_dim *dim)
 }
 
 static inline void
-net_dim_sample(u16 event_ctr, u64 packets, u64 bytes, struct net_dim_sample *s)
+net_dim_update_sample(u16 event_ctr, u64 packets, u64 bytes,
+		      struct net_dim_sample *s)
 {
 	s->time	     = ktime_get();
 	s->pkt_ctr   = packets;
diff --git a/include/linux/net_dim.h b/include/linux/net_dim.h
index e0c97f824dd0..d4b40adc7fa1 100644
--- a/include/linux/net_dim.h
+++ b/include/linux/net_dim.h
@@ -261,8 +261,8 @@ static inline void net_dim(struct net_dim *dim,
 		}
 		/* fall through */
 	case DIM_START_MEASURE:
-		net_dim_sample(end_sample.event_ctr, end_sample.pkt_ctr, end_sample.byte_ctr,
-			       &dim->start_sample);
+		net_dim_update_sample(end_sample.event_ctr, end_sample.pkt_ctr,
+				      end_sample.byte_ctr, &dim->start_sample);
 		dim->state = DIM_MEASURE_IN_PROGRESS;
 		break;
 	case DIM_APPLY_NEW_PROFILE:
-- 
cgit v1.2.3


From 8960b38932bee8db0bc9c4d8c135f21df6cdd297 Mon Sep 17 00:00:00 2001
From: Tal Gilboa <talgi@mellanox.com>
Date: Thu, 31 Jan 2019 16:44:48 +0200
Subject: linux/dim: Rename externally used net_dim members

Removed 'net' prefix from functions and structs used by external drivers.

Signed-off-by: Tal Gilboa <talgi@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/broadcom/bcmsysport.c         | 16 ++++++------
 drivers/net/ethernet/broadcom/bcmsysport.h         |  2 +-
 drivers/net/ethernet/broadcom/bnxt/bnxt.c          | 10 ++++----
 drivers/net/ethernet/broadcom/bnxt/bnxt.h          |  2 +-
 drivers/net/ethernet/broadcom/bnxt/bnxt_debugfs.c  |  4 +--
 drivers/net/ethernet/broadcom/bnxt/bnxt_dim.c      |  5 ++--
 drivers/net/ethernet/broadcom/genet/bcmgenet.c     | 14 +++++-----
 drivers/net/ethernet/broadcom/genet/bcmgenet.h     |  2 +-
 drivers/net/ethernet/mellanox/mlx5/core/en.h       |  8 +++---
 drivers/net/ethernet/mellanox/mlx5/core/en_dim.c   | 10 ++++----
 .../net/ethernet/mellanox/mlx5/core/en_ethtool.c   |  4 +--
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c  | 12 ++++-----
 drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c  |  8 +++---
 include/linux/dim.h                                | 21 ++++++++-------
 include/linux/net_dim.h                            | 30 +++++++++++-----------
 15 files changed, 73 insertions(+), 75 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/broadcom/bcmsysport.c b/drivers/net/ethernet/broadcom/bcmsysport.c
index faaf8ade15e5..c1247b2948ff 100644
--- a/drivers/net/ethernet/broadcom/bcmsysport.c
+++ b/drivers/net/ethernet/broadcom/bcmsysport.c
@@ -612,7 +612,7 @@ static int bcm_sysport_set_coalesce(struct net_device *dev,
 				    struct ethtool_coalesce *ec)
 {
 	struct bcm_sysport_priv *priv = netdev_priv(dev);
-	struct net_dim_cq_moder moder;
+	struct dim_cq_moder moder;
 	u32 usecs, pkts;
 	unsigned int i;
 
@@ -995,7 +995,7 @@ static int bcm_sysport_poll(struct napi_struct *napi, int budget)
 {
 	struct bcm_sysport_priv *priv =
 		container_of(napi, struct bcm_sysport_priv, napi);
-	struct net_dim_sample dim_sample;
+	struct dim_sample dim_sample;
 	unsigned int work_done = 0;
 
 	work_done = bcm_sysport_desc_rx(priv, budget);
@@ -1019,8 +1019,8 @@ static int bcm_sysport_poll(struct napi_struct *napi, int budget)
 	}
 
 	if (priv->dim.use_dim) {
-		net_dim_update_sample(priv->dim.event_ctr, priv->dim.packets,
-				      priv->dim.bytes, &dim_sample);
+		dim_update_sample(priv->dim.event_ctr, priv->dim.packets,
+				  priv->dim.bytes, &dim_sample);
 		net_dim(&priv->dim.dim, dim_sample);
 	}
 
@@ -1090,13 +1090,13 @@ static void bcm_sysport_resume_from_wol(struct bcm_sysport_priv *priv)
 
 static void bcm_sysport_dim_work(struct work_struct *work)
 {
-	struct net_dim *dim = container_of(work, struct net_dim, work);
+	struct dim *dim = container_of(work, struct dim, work);
 	struct bcm_sysport_net_dim *ndim =
 			container_of(dim, struct bcm_sysport_net_dim, dim);
 	struct bcm_sysport_priv *priv =
 			container_of(ndim, struct bcm_sysport_priv, dim);
-	struct net_dim_cq_moder cur_profile =
-			net_dim_get_rx_moderation(dim->mode, dim->profile_ix);
+	struct dim_cq_moder cur_profile = net_dim_get_rx_moderation(dim->mode,
+								    dim->profile_ix);
 
 	bcm_sysport_set_rx_coalesce(priv, cur_profile.usec, cur_profile.pkts);
 	dim->state = DIM_START_MEASURE;
@@ -1449,7 +1449,7 @@ static void bcm_sysport_init_dim(struct bcm_sysport_priv *priv,
 static void bcm_sysport_init_rx_coalesce(struct bcm_sysport_priv *priv)
 {
 	struct bcm_sysport_net_dim *dim = &priv->dim;
-	struct net_dim_cq_moder moder;
+	struct dim_cq_moder moder;
 	u32 usecs, pkts;
 
 	usecs = priv->rx_coalesce_usecs;
diff --git a/drivers/net/ethernet/broadcom/bcmsysport.h b/drivers/net/ethernet/broadcom/bcmsysport.h
index 6f3141c86436..cbe6d559d964 100644
--- a/drivers/net/ethernet/broadcom/bcmsysport.h
+++ b/drivers/net/ethernet/broadcom/bcmsysport.h
@@ -705,7 +705,7 @@ struct bcm_sysport_net_dim {
 	u16			event_ctr;
 	unsigned long		packets;
 	unsigned long		bytes;
-	struct net_dim		dim;
+	struct dim		dim;
 };
 
 /* Software view of the TX ring */
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
index eaec949c367a..c54668004600 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
@@ -2128,12 +2128,12 @@ static int bnxt_poll(struct napi_struct *napi, int budget)
 		}
 	}
 	if (bp->flags & BNXT_FLAG_DIM) {
-		struct net_dim_sample dim_sample;
+		struct dim_sample dim_sample;
 
-		net_dim_update_sample(cpr->event_ctr,
-				      cpr->rx_packets,
-				      cpr->rx_bytes,
-				      &dim_sample);
+		dim_update_sample(cpr->event_ctr,
+				  cpr->rx_packets,
+				  cpr->rx_bytes,
+				  &dim_sample);
 		net_dim(&cpr->dim, dim_sample);
 	}
 	return work_done;
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
index eca36dd6b751..a552c5539cc9 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
@@ -809,7 +809,7 @@ struct bnxt_cp_ring_info {
 	u64			rx_bytes;
 	u64			event_ctr;
 
-	struct net_dim		dim;
+	struct dim		dim;
 
 	union {
 		struct tx_cmp	*cp_desc_ring[MAX_CP_PAGES];
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_debugfs.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_debugfs.c
index 94e208e9789f..3d1d53fbb135 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt_debugfs.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_debugfs.c
@@ -21,7 +21,7 @@ static ssize_t debugfs_dim_read(struct file *filep,
 				char __user *buffer,
 				size_t count, loff_t *ppos)
 {
-	struct net_dim *dim = filep->private_data;
+	struct dim *dim = filep->private_data;
 	int len;
 	char *buf;
 
@@ -61,7 +61,7 @@ static const struct file_operations debugfs_dim_fops = {
 	.read = debugfs_dim_read,
 };
 
-static struct dentry *debugfs_dim_ring_init(struct net_dim *dim, int ring_idx,
+static struct dentry *debugfs_dim_ring_init(struct dim *dim, int ring_idx,
 					    struct dentry *dd)
 {
 	static char qname[16];
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_dim.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_dim.c
index 16a4588709d1..11605f9fa61e 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt_dim.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_dim.c
@@ -13,15 +13,14 @@
 
 void bnxt_dim_work(struct work_struct *work)
 {
-	struct net_dim *dim = container_of(work, struct net_dim,
-					   work);
+	struct dim *dim = container_of(work, struct dim, work);
 	struct bnxt_cp_ring_info *cpr = container_of(dim,
 						     struct bnxt_cp_ring_info,
 						     dim);
 	struct bnxt_napi *bnapi = container_of(cpr,
 					       struct bnxt_napi,
 					       cp_ring);
-	struct net_dim_cq_moder cur_moder =
+	struct dim_cq_moder cur_moder =
 		net_dim_get_rx_moderation(dim->mode, dim->profile_ix);
 
 	cpr->rx_ring_coal.coal_ticks = cur_moder.usec;
diff --git a/drivers/net/ethernet/broadcom/genet/bcmgenet.c b/drivers/net/ethernet/broadcom/genet/bcmgenet.c
index 297ae786ffed..b7f8f4f1088f 100644
--- a/drivers/net/ethernet/broadcom/genet/bcmgenet.c
+++ b/drivers/net/ethernet/broadcom/genet/bcmgenet.c
@@ -643,7 +643,7 @@ static void bcmgenet_set_rx_coalesce(struct bcmgenet_rx_ring *ring,
 static void bcmgenet_set_ring_rx_coalesce(struct bcmgenet_rx_ring *ring,
 					  struct ethtool_coalesce *ec)
 {
-	struct net_dim_cq_moder moder;
+	struct dim_cq_moder moder;
 	u32 usecs, pkts;
 
 	ring->rx_coalesce_usecs = ec->rx_coalesce_usecs;
@@ -1898,7 +1898,7 @@ static int bcmgenet_rx_poll(struct napi_struct *napi, int budget)
 {
 	struct bcmgenet_rx_ring *ring = container_of(napi,
 			struct bcmgenet_rx_ring, napi);
-	struct net_dim_sample dim_sample;
+	struct dim_sample dim_sample;
 	unsigned int work_done;
 
 	work_done = bcmgenet_desc_rx(ring, budget);
@@ -1909,8 +1909,8 @@ static int bcmgenet_rx_poll(struct napi_struct *napi, int budget)
 	}
 
 	if (ring->dim.use_dim) {
-		net_dim_update_sample(ring->dim.event_ctr, ring->dim.packets,
-				      ring->dim.bytes, &dim_sample);
+		dim_update_sample(ring->dim.event_ctr, ring->dim.packets,
+				  ring->dim.bytes, &dim_sample);
 		net_dim(&ring->dim.dim, dim_sample);
 	}
 
@@ -1919,12 +1919,12 @@ static int bcmgenet_rx_poll(struct napi_struct *napi, int budget)
 
 static void bcmgenet_dim_work(struct work_struct *work)
 {
-	struct net_dim *dim = container_of(work, struct net_dim, work);
+	struct dim *dim = container_of(work, struct dim, work);
 	struct bcmgenet_net_dim *ndim =
 			container_of(dim, struct bcmgenet_net_dim, dim);
 	struct bcmgenet_rx_ring *ring =
 			container_of(ndim, struct bcmgenet_rx_ring, dim);
-	struct net_dim_cq_moder cur_profile =
+	struct dim_cq_moder cur_profile =
 			net_dim_get_rx_moderation(dim->mode, dim->profile_ix);
 
 	bcmgenet_set_rx_coalesce(ring, cur_profile.usec, cur_profile.pkts);
@@ -2094,7 +2094,7 @@ static void bcmgenet_init_dim(struct bcmgenet_rx_ring *ring,
 static void bcmgenet_init_rx_coalesce(struct bcmgenet_rx_ring *ring)
 {
 	struct bcmgenet_net_dim *dim = &ring->dim;
-	struct net_dim_cq_moder moder;
+	struct dim_cq_moder moder;
 	u32 usecs, pkts;
 
 	usecs = ring->rx_coalesce_usecs;
diff --git a/drivers/net/ethernet/broadcom/genet/bcmgenet.h b/drivers/net/ethernet/broadcom/genet/bcmgenet.h
index 14b49612aa86..6e418d9c3706 100644
--- a/drivers/net/ethernet/broadcom/genet/bcmgenet.h
+++ b/drivers/net/ethernet/broadcom/genet/bcmgenet.h
@@ -581,7 +581,7 @@ struct bcmgenet_net_dim {
 	u16		event_ctr;
 	unsigned long	packets;
 	unsigned long	bytes;
-	struct net_dim	dim;
+	struct dim	dim;
 };
 
 struct bcmgenet_rx_ring {
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index 3a183d690e23..11efd6e4bdc3 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -238,9 +238,9 @@ struct mlx5e_params {
 	u16 num_channels;
 	u8  num_tc;
 	bool rx_cqe_compress_def;
-	struct net_dim_cq_moder rx_cq_moderation;
-	struct net_dim_cq_moder tx_cq_moderation;
 	bool tunneled_offload_en;
+	struct dim_cq_moder rx_cq_moderation;
+	struct dim_cq_moder tx_cq_moderation;
 	bool lro_en;
 	u8  tx_min_inline_mode;
 	bool vlan_strip_disable;
@@ -356,7 +356,7 @@ struct mlx5e_txqsq {
 	/* dirtied @completion */
 	u16                        cc;
 	u32                        dma_fifo_cc;
-	struct net_dim             dim; /* Adaptive Moderation */
+	struct dim                 dim; /* Adaptive Moderation */
 
 	/* dirtied @xmit */
 	u16                        pc ____cacheline_aligned_in_smp;
@@ -595,7 +595,7 @@ struct mlx5e_rq {
 	int                    ix;
 	unsigned int           hw_mtu;
 
-	struct net_dim         dim; /* Dynamic Interrupt Moderation */
+	struct dim         dim; /* Dynamic Interrupt Moderation */
 
 	/* XDP */
 	struct bpf_prog       *xdp_prog;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_dim.c b/drivers/net/ethernet/mellanox/mlx5/core/en_dim.c
index a80303add7c0..ba3c1be9f2d3 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_dim.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_dim.c
@@ -34,7 +34,7 @@
 #include "en.h"
 
 static void
-mlx5e_complete_dim_work(struct net_dim *dim, struct net_dim_cq_moder moder,
+mlx5e_complete_dim_work(struct dim *dim, struct dim_cq_moder moder,
 			struct mlx5_core_dev *mdev, struct mlx5_core_cq *mcq)
 {
 	mlx5_core_modify_cq_moderation(mdev, mcq, moder.usec, moder.pkts);
@@ -43,9 +43,9 @@ mlx5e_complete_dim_work(struct net_dim *dim, struct net_dim_cq_moder moder,
 
 void mlx5e_rx_dim_work(struct work_struct *work)
 {
-	struct net_dim *dim = container_of(work, struct net_dim, work);
+	struct dim *dim = container_of(work, struct dim, work);
 	struct mlx5e_rq *rq = container_of(dim, struct mlx5e_rq, dim);
-	struct net_dim_cq_moder cur_moder =
+	struct dim_cq_moder cur_moder =
 		net_dim_get_rx_moderation(dim->mode, dim->profile_ix);
 
 	mlx5e_complete_dim_work(dim, cur_moder, rq->mdev, &rq->cq.mcq);
@@ -53,9 +53,9 @@ void mlx5e_rx_dim_work(struct work_struct *work)
 
 void mlx5e_tx_dim_work(struct work_struct *work)
 {
-	struct net_dim *dim = container_of(work, struct net_dim, work);
+	struct dim *dim = container_of(work, struct dim, work);
 	struct mlx5e_txqsq *sq = container_of(dim, struct mlx5e_txqsq, dim);
-	struct net_dim_cq_moder cur_moder =
+	struct dim_cq_moder cur_moder =
 		net_dim_get_tx_moderation(dim->mode, dim->profile_ix);
 
 	mlx5e_complete_dim_work(dim, cur_moder, sq->cq.mdev, &sq->cq.mcq);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
index dd764e0471f2..c853b657739c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
@@ -466,7 +466,7 @@ static int mlx5e_set_channels(struct net_device *dev,
 int mlx5e_ethtool_get_coalesce(struct mlx5e_priv *priv,
 			       struct ethtool_coalesce *coal)
 {
-	struct net_dim_cq_moder *rx_moder, *tx_moder;
+	struct dim_cq_moder *rx_moder, *tx_moder;
 
 	if (!MLX5_CAP_GEN(priv->mdev, cq_moderation))
 		return -EOPNOTSUPP;
@@ -521,7 +521,7 @@ mlx5e_set_priv_channels_coalesce(struct mlx5e_priv *priv, struct ethtool_coalesc
 int mlx5e_ethtool_set_coalesce(struct mlx5e_priv *priv,
 			       struct ethtool_coalesce *coal)
 {
-	struct net_dim_cq_moder *rx_moder, *tx_moder;
+	struct dim_cq_moder *rx_moder, *tx_moder;
 	struct mlx5_core_dev *mdev = priv->mdev;
 	struct mlx5e_channels new_channels = {};
 	int err = 0;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index 5b89e992e482..9705101c0235 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -1569,7 +1569,7 @@ static void mlx5e_destroy_cq(struct mlx5e_cq *cq)
 }
 
 static int mlx5e_open_cq(struct mlx5e_channel *c,
-			 struct net_dim_cq_moder moder,
+			 struct dim_cq_moder moder,
 			 struct mlx5e_cq_param *param,
 			 struct mlx5e_cq *cq)
 {
@@ -1774,7 +1774,7 @@ static int mlx5e_open_channel(struct mlx5e_priv *priv, int ix,
 			      struct mlx5e_channel **cp)
 {
 	int cpu = cpumask_first(mlx5_comp_irq_get_affinity_mask(priv->mdev, ix));
-	struct net_dim_cq_moder icocq_moder = {0, 0};
+	struct dim_cq_moder icocq_moder = {0, 0};
 	struct net_device *netdev = priv->netdev;
 	struct mlx5e_channel *c;
 	unsigned int irq;
@@ -4411,9 +4411,9 @@ static bool slow_pci_heuristic(struct mlx5_core_dev *mdev)
 		link_speed > MLX5E_SLOW_PCI_RATIO * pci_bw;
 }
 
-static struct net_dim_cq_moder mlx5e_get_def_tx_moderation(u8 cq_period_mode)
+static struct dim_cq_moder mlx5e_get_def_tx_moderation(u8 cq_period_mode)
 {
-	struct net_dim_cq_moder moder;
+	struct dim_cq_moder moder;
 
 	moder.cq_period_mode = cq_period_mode;
 	moder.pkts = MLX5E_PARAMS_DEFAULT_TX_CQ_MODERATION_PKTS;
@@ -4424,9 +4424,9 @@ static struct net_dim_cq_moder mlx5e_get_def_tx_moderation(u8 cq_period_mode)
 	return moder;
 }
 
-static struct net_dim_cq_moder mlx5e_get_def_rx_moderation(u8 cq_period_mode)
+static struct dim_cq_moder mlx5e_get_def_rx_moderation(u8 cq_period_mode)
 {
-	struct net_dim_cq_moder moder;
+	struct dim_cq_moder moder;
 
 	moder.cq_period_mode = cq_period_mode;
 	moder.pkts = MLX5E_PARAMS_DEFAULT_RX_CQ_MODERATION_PKTS;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c
index 07432e6428cf..e6c434efbd46 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c
@@ -48,24 +48,24 @@ static inline bool mlx5e_channel_no_affinity_change(struct mlx5e_channel *c)
 static void mlx5e_handle_tx_dim(struct mlx5e_txqsq *sq)
 {
 	struct mlx5e_sq_stats *stats = sq->stats;
-	struct net_dim_sample dim_sample;
+	struct dim_sample dim_sample;
 
 	if (unlikely(!test_bit(MLX5E_SQ_STATE_AM, &sq->state)))
 		return;
 
-	net_dim_update_sample(sq->cq.event_ctr, stats->packets, stats->bytes, &dim_sample);
+	dim_update_sample(sq->cq.event_ctr, stats->packets, stats->bytes, &dim_sample);
 	net_dim(&sq->dim, dim_sample);
 }
 
 static void mlx5e_handle_rx_dim(struct mlx5e_rq *rq)
 {
 	struct mlx5e_rq_stats *stats = rq->stats;
-	struct net_dim_sample dim_sample;
+	struct dim_sample dim_sample;
 
 	if (unlikely(!test_bit(MLX5E_RQ_STATE_AM, &rq->state)))
 		return;
 
-	net_dim_update_sample(rq->cq.event_ctr, stats->packets, stats->bytes, &dim_sample);
+	dim_update_sample(rq->cq.event_ctr, stats->packets, stats->bytes, &dim_sample);
 	net_dim(&rq->dim, dim_sample);
 }
 
diff --git a/include/linux/dim.h b/include/linux/dim.h
index f0f20ed25497..60e5074a7cc0 100644
--- a/include/linux/dim.h
+++ b/include/linux/dim.h
@@ -14,13 +14,13 @@
 #define BIT_GAP(bits, end, start) ((((end) - (start)) + BIT_ULL(bits)) \
 & (BIT_ULL(bits) - 1))
 
-struct net_dim_cq_moder {
+struct dim_cq_moder {
 	u16 usec;
 	u16 pkts;
 	u8 cq_period_mode;
 };
 
-struct net_dim_sample {
+struct dim_sample {
 	ktime_t time;
 	u32 pkt_ctr;
 	u32 byte_ctr;
@@ -33,10 +33,10 @@ struct dim_stats {
 	int epms; /* events per msec */
 };
 
-struct net_dim { /* Dynamic Interrupt Moderation */
+struct dim { /* Dynamic Interrupt Moderation */
 	u8 state;
 	struct dim_stats prev_stats;
-	struct net_dim_sample start_sample;
+	struct dim_sample start_sample;
 	struct work_struct work;
 	u8 profile_ix;
 	u8 mode;
@@ -77,7 +77,7 @@ enum {
 	DIM_ON_EDGE,
 };
 
-static inline bool dim_on_top(struct net_dim *dim)
+static inline bool dim_on_top(struct dim *dim)
 {
 	switch (dim->tune_state) {
 	case DIM_PARKING_ON_TOP:
@@ -90,7 +90,7 @@ static inline bool dim_on_top(struct net_dim *dim)
 	}
 }
 
-static inline void dim_turn(struct net_dim *dim)
+static inline void dim_turn(struct dim *dim)
 {
 	switch (dim->tune_state) {
 	case DIM_PARKING_ON_TOP:
@@ -107,7 +107,7 @@ static inline void dim_turn(struct net_dim *dim)
 	}
 }
 
-static inline void dim_park_on_top(struct net_dim *dim)
+static inline void dim_park_on_top(struct dim *dim)
 {
 	dim->steps_right  = 0;
 	dim->steps_left   = 0;
@@ -115,7 +115,7 @@ static inline void dim_park_on_top(struct net_dim *dim)
 	dim->tune_state   = DIM_PARKING_ON_TOP;
 }
 
-static inline void dim_park_tired(struct net_dim *dim)
+static inline void dim_park_tired(struct dim *dim)
 {
 	dim->steps_right  = 0;
 	dim->steps_left   = 0;
@@ -123,8 +123,7 @@ static inline void dim_park_tired(struct net_dim *dim)
 }
 
 static inline void
-net_dim_update_sample(u16 event_ctr, u64 packets, u64 bytes,
-		      struct net_dim_sample *s)
+dim_update_sample(u16 event_ctr, u64 packets, u64 bytes, struct dim_sample *s)
 {
 	s->time	     = ktime_get();
 	s->pkt_ctr   = packets;
@@ -133,7 +132,7 @@ net_dim_update_sample(u16 event_ctr, u64 packets, u64 bytes,
 }
 
 static inline void
-dim_calc_stats(struct net_dim_sample *start, struct net_dim_sample *end,
+dim_calc_stats(struct dim_sample *start, struct dim_sample *end,
 	       struct dim_stats *curr_stats)
 {
 	/* u32 holds up to 71 minutes, should be enough */
diff --git a/include/linux/net_dim.h b/include/linux/net_dim.h
index d4b40adc7fa1..4e009ec193ef 100644
--- a/include/linux/net_dim.h
+++ b/include/linux/net_dim.h
@@ -77,28 +77,28 @@
 	{64, 32}   \
 }
 
-static const struct net_dim_cq_moder
+static const struct dim_cq_moder
 rx_profile[DIM_CQ_PERIOD_NUM_MODES][NET_DIM_PARAMS_NUM_PROFILES] = {
 	NET_DIM_RX_EQE_PROFILES,
 	NET_DIM_RX_CQE_PROFILES,
 };
 
-static const struct net_dim_cq_moder
+static const struct dim_cq_moder
 tx_profile[DIM_CQ_PERIOD_NUM_MODES][NET_DIM_PARAMS_NUM_PROFILES] = {
 	NET_DIM_TX_EQE_PROFILES,
 	NET_DIM_TX_CQE_PROFILES,
 };
 
-static inline struct net_dim_cq_moder
+static inline struct dim_cq_moder
 net_dim_get_rx_moderation(u8 cq_period_mode, int ix)
 {
-	struct net_dim_cq_moder cq_moder = rx_profile[cq_period_mode][ix];
+	struct dim_cq_moder cq_moder = rx_profile[cq_period_mode][ix];
 
 	cq_moder.cq_period_mode = cq_period_mode;
 	return cq_moder;
 }
 
-static inline struct net_dim_cq_moder
+static inline struct dim_cq_moder
 net_dim_get_def_rx_moderation(u8 cq_period_mode)
 {
 	u8 profile_ix = cq_period_mode == DIM_CQ_PERIOD_MODE_START_FROM_CQE ?
@@ -107,16 +107,16 @@ net_dim_get_def_rx_moderation(u8 cq_period_mode)
 	return net_dim_get_rx_moderation(cq_period_mode, profile_ix);
 }
 
-static inline struct net_dim_cq_moder
+static inline struct dim_cq_moder
 net_dim_get_tx_moderation(u8 cq_period_mode, int ix)
 {
-	struct net_dim_cq_moder cq_moder = tx_profile[cq_period_mode][ix];
+	struct dim_cq_moder cq_moder = tx_profile[cq_period_mode][ix];
 
 	cq_moder.cq_period_mode = cq_period_mode;
 	return cq_moder;
 }
 
-static inline struct net_dim_cq_moder
+static inline struct dim_cq_moder
 net_dim_get_def_tx_moderation(u8 cq_period_mode)
 {
 	u8 profile_ix = cq_period_mode == DIM_CQ_PERIOD_MODE_START_FROM_CQE ?
@@ -125,7 +125,7 @@ net_dim_get_def_tx_moderation(u8 cq_period_mode)
 	return net_dim_get_tx_moderation(cq_period_mode, profile_ix);
 }
 
-static inline int net_dim_step(struct net_dim *dim)
+static inline int net_dim_step(struct dim *dim)
 {
 	if (dim->tired == (NET_DIM_PARAMS_NUM_PROFILES * 2))
 		return DIM_TOO_TIRED;
@@ -152,7 +152,7 @@ static inline int net_dim_step(struct net_dim *dim)
 	return DIM_STEPPED;
 }
 
-static inline void net_dim_exit_parking(struct net_dim *dim)
+static inline void net_dim_exit_parking(struct dim *dim)
 {
 	dim->tune_state = dim->profile_ix ? DIM_GOING_LEFT :
 					  DIM_GOING_RIGHT;
@@ -189,7 +189,7 @@ static inline int net_dim_stats_compare(struct dim_stats *curr,
 }
 
 static inline bool net_dim_decision(struct dim_stats *curr_stats,
-				    struct net_dim *dim)
+				    struct dim *dim)
 {
 	int prev_state = dim->tune_state;
 	int prev_ix = dim->profile_ix;
@@ -240,8 +240,8 @@ static inline bool net_dim_decision(struct dim_stats *curr_stats,
 	return dim->profile_ix != prev_ix;
 }
 
-static inline void net_dim(struct net_dim *dim,
-			   struct net_dim_sample end_sample)
+static inline void net_dim(struct dim *dim,
+			   struct dim_sample end_sample)
 {
 	struct dim_stats curr_stats;
 	u16 nevents;
@@ -261,8 +261,8 @@ static inline void net_dim(struct net_dim *dim,
 		}
 		/* fall through */
 	case DIM_START_MEASURE:
-		net_dim_update_sample(end_sample.event_ctr, end_sample.pkt_ctr,
-				      end_sample.byte_ctr, &dim->start_sample);
+		dim_update_sample(end_sample.event_ctr, end_sample.pkt_ctr,
+				  end_sample.byte_ctr, &dim->start_sample);
 		dim->state = DIM_MEASURE_IN_PROGRESS;
 		break;
 	case DIM_APPLY_NEW_PROFILE:
-- 
cgit v1.2.3


From 4f75da3666c0c572967729a2401ac650be5581b6 Mon Sep 17 00:00:00 2001
From: Tal Gilboa <talgi@mellanox.com>
Date: Thu, 10 Jan 2019 17:33:17 +0200
Subject: linux/dim: Move implementation to .c files

Moved all logic from dim.h and net_dim.h to dim.c and net_dim.c.
This is both more structurally appealing and allows exposing only the
externally used functions.

Signed-off-by: Tal Gilboa <talgi@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 MAINTAINERS                                       |   2 +-
 drivers/net/ethernet/broadcom/Kconfig             |   1 +
 drivers/net/ethernet/broadcom/bcmsysport.h        |   2 +-
 drivers/net/ethernet/broadcom/bnxt/bnxt.h         |   2 +-
 drivers/net/ethernet/broadcom/bnxt/bnxt_debugfs.c |   2 +-
 drivers/net/ethernet/broadcom/bnxt/bnxt_dim.c     |   2 +-
 drivers/net/ethernet/broadcom/genet/bcmgenet.h    |   2 +-
 drivers/net/ethernet/mellanox/mlx5/core/Kconfig   |   1 +
 drivers/net/ethernet/mellanox/mlx5/core/en.h      |   2 +-
 drivers/net/ethernet/mellanox/mlx5/core/en_dim.c  |   2 +-
 include/linux/dim.h                               | 319 +++++++++++++++++-----
 include/linux/net_dim.h                           | 273 ------------------
 lib/Kconfig                                       |   8 +
 lib/Makefile                                      |   1 +
 lib/dim/Makefile                                  |   9 +
 lib/dim/dim.c                                     |  74 +++++
 lib/dim/net_dim.c                                 | 190 +++++++++++++
 17 files changed, 547 insertions(+), 345 deletions(-)
 delete mode 100644 include/linux/net_dim.h
 create mode 100644 lib/dim/Makefile
 create mode 100644 lib/dim/dim.c
 create mode 100644 lib/dim/net_dim.c

(limited to 'include/linux')

diff --git a/MAINTAINERS b/MAINTAINERS
index 5d4b852d9d39..f78dd16195e3 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -5588,8 +5588,8 @@ F:	include/linux/dynamic_debug.h
 DYNAMIC INTERRUPT MODERATION
 M:	Tal Gilboa <talgi@mellanox.com>
 S:	Maintained
-F:	include/linux/net_dim.h
 F:	include/linux/dim.h
+F:	lib/dim/
 
 DZ DECSTATION DZ11 SERIAL DRIVER
 M:	"Maciej W. Rozycki" <macro@linux-mips.org>
diff --git a/drivers/net/ethernet/broadcom/Kconfig b/drivers/net/ethernet/broadcom/Kconfig
index b123509d385f..2e4a8c7237ef 100644
--- a/drivers/net/ethernet/broadcom/Kconfig
+++ b/drivers/net/ethernet/broadcom/Kconfig
@@ -8,6 +8,7 @@ config NET_VENDOR_BROADCOM
 	default y
 	depends on (SSB_POSSIBLE && HAS_DMA) || PCI || BCM63XX || \
 		   SIBYTE_SB1xxx_SOC
+	select DIMLIB
 	---help---
 	  If you have a network (Ethernet) chipset belonging to this class,
 	  say Y.
diff --git a/drivers/net/ethernet/broadcom/bcmsysport.h b/drivers/net/ethernet/broadcom/bcmsysport.h
index cbe6d559d964..f6677a02d811 100644
--- a/drivers/net/ethernet/broadcom/bcmsysport.h
+++ b/drivers/net/ethernet/broadcom/bcmsysport.h
@@ -14,7 +14,7 @@
 #include <linux/bitmap.h>
 #include <linux/ethtool.h>
 #include <linux/if_vlan.h>
-#include <linux/net_dim.h>
+#include <linux/dim.h>
 
 /* Receive/transmit descriptor format */
 #define DESC_ADDR_HI_STATUS_LEN	0x00
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
index a552c5539cc9..54c01705f3bd 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
@@ -23,7 +23,7 @@
 #include <net/devlink.h>
 #include <net/dst_metadata.h>
 #include <net/xdp.h>
-#include <linux/net_dim.h>
+#include <linux/dim.h>
 
 struct tx_bd {
 	__le32 tx_bd_len_flags_type;
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_debugfs.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_debugfs.c
index 3d1d53fbb135..61393f351a77 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt_debugfs.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_debugfs.c
@@ -11,7 +11,7 @@
 #include <linux/module.h>
 #include <linux/pci.h>
 #include "bnxt_hsi.h"
-#include <linux/net_dim.h>
+#include <linux/dim.h>
 #include "bnxt.h"
 #include "bnxt_debugfs.h"
 
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_dim.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_dim.c
index 11605f9fa61e..6f6576dc417a 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt_dim.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_dim.c
@@ -7,7 +7,7 @@
  * the Free Software Foundation.
  */
 
-#include <linux/net_dim.h>
+#include <linux/dim.h>
 #include "bnxt_hsi.h"
 #include "bnxt.h"
 
diff --git a/drivers/net/ethernet/broadcom/genet/bcmgenet.h b/drivers/net/ethernet/broadcom/genet/bcmgenet.h
index 6e418d9c3706..b2f05e47dc65 100644
--- a/drivers/net/ethernet/broadcom/genet/bcmgenet.h
+++ b/drivers/net/ethernet/broadcom/genet/bcmgenet.h
@@ -16,7 +16,7 @@
 #include <linux/mii.h>
 #include <linux/if_vlan.h>
 #include <linux/phy.h>
-#include <linux/net_dim.h>
+#include <linux/dim.h>
 
 /* total number of Buffer Descriptors, same for Rx/Tx */
 #define TOTAL_DESC				256
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Kconfig b/drivers/net/ethernet/mellanox/mlx5/core/Kconfig
index 2391e3cfb56b..7845aa5bf6be 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/Kconfig
+++ b/drivers/net/ethernet/mellanox/mlx5/core/Kconfig
@@ -34,6 +34,7 @@ config MLX5_CORE_EN
 	depends on NETDEVICES && ETHERNET && INET && PCI && MLX5_CORE
 	depends on IPV6=y || IPV6=n || MLX5_CORE=m
 	select PAGE_POOL
+	select DIMLIB
 	default n
 	---help---
 	  Ethernet support in Mellanox Technologies ConnectX-4 NIC.
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index 11efd6e4bdc3..abf42d3aabe9 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -48,7 +48,7 @@
 #include <linux/rhashtable.h>
 #include <net/switchdev.h>
 #include <net/xdp.h>
-#include <linux/net_dim.h>
+#include <linux/dim.h>
 #include <linux/bits.h>
 #include "wq.h"
 #include "mlx5_core.h"
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_dim.c b/drivers/net/ethernet/mellanox/mlx5/core/en_dim.c
index ba3c1be9f2d3..ca9cfbf57d8f 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_dim.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_dim.c
@@ -30,7 +30,7 @@
  * SOFTWARE.
  */
 
-#include <linux/net_dim.h>
+#include <linux/dim.h>
 #include "en.h"
 
 static void
diff --git a/include/linux/dim.h b/include/linux/dim.h
index 60e5074a7cc0..f48ede3e0322 100644
--- a/include/linux/dim.h
+++ b/include/linux/dim.h
@@ -6,20 +6,49 @@
 
 #include <linux/module.h>
 
+/**
+ * Number of events between DIM iterations.
+ * Causes a moderation of the algorithm run.
+ */
 #define DIM_NEVENTS 64
 
-/* more than 10% difference */
+/**
+ * Check whether a difference between values justifies taking an action.
+ * We consider 10% difference as significant.
+ */
 #define IS_SIGNIFICANT_DIFF(val, ref) \
 	(((100UL * abs((val) - (ref))) / (ref)) > 10)
+
+/**
+ * Calculate the gap between two values.
+ * Take wrap-around and variable size into consideration.
+ */
 #define BIT_GAP(bits, end, start) ((((end) - (start)) + BIT_ULL(bits)) \
-& (BIT_ULL(bits) - 1))
+		& (BIT_ULL(bits) - 1))
 
+/**
+ * Structure for CQ moderation values.
+ * Used for communications between DIM and its consumer.
+ *
+ * @usec: CQ timer suggestion (by DIM)
+ * @pkts: CQ packet counter suggestion (by DIM)
+ * @cq_period_mode: CQ period count mode (from CQE/EQE)
+ */
 struct dim_cq_moder {
 	u16 usec;
 	u16 pkts;
 	u8 cq_period_mode;
 };
 
+/**
+ * Structure for DIM sample data.
+ * Used for communications between DIM and its consumer.
+ *
+ * @time: Sample timestamp
+ * @pkt_ctr: Number of packets
+ * @byte_ctr: Number of bytes
+ * @event_ctr: Number of events
+ */
 struct dim_sample {
 	ktime_t time;
 	u32 pkt_ctr;
@@ -27,13 +56,36 @@ struct dim_sample {
 	u16 event_ctr;
 };
 
+/**
+ * Structure for DIM stats.
+ * Used for holding current measured rates.
+ *
+ * @ppms: Packets per msec
+ * @bpms: Bytes per msec
+ * @epms: Events per msec
+ */
 struct dim_stats {
-	int ppms; /* packets per msec */
-	int bpms; /* bytes per msec */
-	int epms; /* events per msec */
+	int ppms;
+	int bpms;
+	int epms;
 };
 
-struct dim { /* Dynamic Interrupt Moderation */
+/**
+ * Main structure for dynamic interrupt moderation (DIM).
+ * Used for holding all information about a specific DIM instance.
+ *
+ * @state: Algorithm state (see below)
+ * @prev_stats: Measured rates from previous iteration (for comparison)
+ * @start_sample: Sampled data at start of current iteration
+ * @work: Work to perform on action required
+ * @profile_ix: Current moderation profile
+ * @mode: CQ period count mode
+ * @tune_state: Algorithm tuning state (see below)
+ * @steps_right: Number of steps taken towards higher moderation
+ * @steps_left: Number of steps taken towards lower moderation
+ * @tired: Parking depth counter
+ */
+struct dim {
 	u8 state;
 	struct dim_stats prev_stats;
 	struct dim_sample start_sample;
@@ -46,18 +98,49 @@ struct dim { /* Dynamic Interrupt Moderation */
 	u8 tired;
 };
 
+/**
+ * enum dim_cq_period_mode
+ *
+ * These are the modes for CQ period count.
+ *
+ * @DIM_CQ_PERIOD_MODE_START_FROM_EQE: Start counting from EQE
+ * @DIM_CQ_PERIOD_MODE_START_FROM_CQE: Start counting from CQE (implies timer reset)
+ * @DIM_CQ_PERIOD_NUM_MODES: Number of modes
+ */
 enum {
 	DIM_CQ_PERIOD_MODE_START_FROM_EQE = 0x0,
 	DIM_CQ_PERIOD_MODE_START_FROM_CQE = 0x1,
 	DIM_CQ_PERIOD_NUM_MODES
 };
 
+/**
+ * enum dim_state
+ *
+ * These are the DIM algorithm states.
+ * These will determine if the algorithm is in a valid state to start an iteration.
+ *
+ * @DIM_START_MEASURE: This is the first iteration (also after applying a new profile)
+ * @DIM_MEASURE_IN_PROGRESS: Algorithm is already in progress - check if
+ * need to perform an action
+ * @DIM_APPLY_NEW_PROFILE: DIM consumer is currently applying a profile - no need to measure
+ */
 enum {
 	DIM_START_MEASURE,
 	DIM_MEASURE_IN_PROGRESS,
 	DIM_APPLY_NEW_PROFILE,
 };
 
+/**
+ * enum dim_tune_state
+ *
+ * These are the DIM algorithm tune states.
+ * These will determine which action the algorithm should perform.
+ *
+ * @DIM_PARKING_ON_TOP: Algorithm found a local top point - exit on significant difference
+ * @DIM_PARKING_TIRED: Algorithm found a deep top point - don't exit if tired > 0
+ * @DIM_GOING_RIGHT: Algorithm is currently trying higher moderation levels
+ * @DIM_GOING_LEFT: Algorithm is currently trying lower moderation levels
+ */
 enum {
 	DIM_PARKING_ON_TOP,
 	DIM_PARKING_TIRED,
@@ -65,63 +148,95 @@ enum {
 	DIM_GOING_LEFT,
 };
 
+/**
+ * enum dim_stats_state
+ *
+ * These are the DIM algorithm statistics states.
+ * These will determine the verdict of current iteration.
+ *
+ * @DIM_STATS_WORSE: Current iteration shows worse performance than before
+ * @DIM_STATS_SAME: Current iteration shows same performance as before
+ * @DIM_STATS_BETTER: Current iteration shows better performance than before
+ */
 enum {
 	DIM_STATS_WORSE,
 	DIM_STATS_SAME,
 	DIM_STATS_BETTER,
 };
 
+/**
+ * enum dim_step_result
+ *
+ * These are the DIM algorithm step results.
+ * These describe the result of a step.
+ *
+ * @DIM_STEPPED: Performed a regular step
+ * @DIM_TOO_TIRED: Same kind of step was done multiple times - should go to
+ * tired parking
+ * @DIM_ON_EDGE: Stepped to the most left/right profile
+ */
 enum {
 	DIM_STEPPED,
 	DIM_TOO_TIRED,
 	DIM_ON_EDGE,
 };
 
-static inline bool dim_on_top(struct dim *dim)
-{
-	switch (dim->tune_state) {
-	case DIM_PARKING_ON_TOP:
-	case DIM_PARKING_TIRED:
-		return true;
-	case DIM_GOING_RIGHT:
-		return (dim->steps_left > 1) && (dim->steps_right == 1);
-	default: /* DIM_GOING_LEFT */
-		return (dim->steps_right > 1) && (dim->steps_left == 1);
-	}
-}
+/**
+ *	dim_on_top - check if current state is a good place to stop (top location)
+ *	@dim: DIM context
+ *
+ * Check if current profile is a good place to park at.
+ * This will result in reducing the DIM checks frequency as we assume we
+ * probably shouldn't change profiles, unless the traffic pattern has changed.
+ */
+bool dim_on_top(struct dim *dim);
 
-static inline void dim_turn(struct dim *dim)
-{
-	switch (dim->tune_state) {
-	case DIM_PARKING_ON_TOP:
-	case DIM_PARKING_TIRED:
-		break;
-	case DIM_GOING_RIGHT:
-		dim->tune_state = DIM_GOING_LEFT;
-		dim->steps_left = 0;
-		break;
-	case DIM_GOING_LEFT:
-		dim->tune_state = DIM_GOING_RIGHT;
-		dim->steps_right = 0;
-		break;
-	}
-}
+/**
+ *	dim_turn - change profile altering direction
+ *	@dim: DIM context
+ *
+ * Go left if we were going right and vice-versa.
+ * Do nothing if currently parking.
+ */
+void dim_turn(struct dim *dim);
 
-static inline void dim_park_on_top(struct dim *dim)
-{
-	dim->steps_right  = 0;
-	dim->steps_left   = 0;
-	dim->tired        = 0;
-	dim->tune_state   = DIM_PARKING_ON_TOP;
-}
+/**
+ *	dim_park_on_top - enter a parking state on a top location
+ *	@dim: DIM context
+ *
+ * Enter parking state.
+ * Clear all movement history.
+ */
+void dim_park_on_top(struct dim *dim);
 
-static inline void dim_park_tired(struct dim *dim)
-{
-	dim->steps_right  = 0;
-	dim->steps_left   = 0;
-	dim->tune_state   = DIM_PARKING_TIRED;
-}
+/**
+ *	dim_park_tired - enter a tired parking state
+ *	@dim: DIM context
+ *
+ * Enter parking state.
+ * Clear all movement history and cause DIM checks frequency to reduce.
+ */
+void dim_park_tired(struct dim *dim);
+
+/**
+ *	dim_calc_stats - calculate the difference between two samples
+ *	@start: start sample
+ *	@end: end sample
+ *	@curr_stats: delta between samples
+ *
+ * Calculate the delta between two samples (in data rates).
+ * Takes into consideration counter wrap-around.
+ */
+void dim_calc_stats(struct dim_sample *start, struct dim_sample *end,
+		    struct dim_stats *curr_stats);
 
+/**
+ *	dim_update_sample - set a sample's fields with given values
+ *	@event_ctr: number of events to set
+ *	@packets: number of packets to set
+ *	@bytes: number of bytes to set
+ *	@s: DIM sample
+ */
 static inline void
 dim_update_sample(u16 event_ctr, u64 packets, u64 bytes, struct dim_sample *s)
 {
@@ -131,23 +246,99 @@ dim_update_sample(u16 event_ctr, u64 packets, u64 bytes, struct dim_sample *s)
 	s->event_ctr = event_ctr;
 }
 
-static inline void
-dim_calc_stats(struct dim_sample *start, struct dim_sample *end,
-	       struct dim_stats *curr_stats)
-{
-	/* u32 holds up to 71 minutes, should be enough */
-	u32 delta_us = ktime_us_delta(end->time, start->time);
-	u32 npkts = BIT_GAP(BITS_PER_TYPE(u32), end->pkt_ctr, start->pkt_ctr);
-	u32 nbytes = BIT_GAP(BITS_PER_TYPE(u32), end->byte_ctr,
-			     start->byte_ctr);
-
-	if (!delta_us)
-		return;
-
-	curr_stats->ppms = DIV_ROUND_UP(npkts * USEC_PER_MSEC, delta_us);
-	curr_stats->bpms = DIV_ROUND_UP(nbytes * USEC_PER_MSEC, delta_us);
-	curr_stats->epms = DIV_ROUND_UP(DIM_NEVENTS * USEC_PER_MSEC,
-					delta_us);
+/* Net DIM */
+
+/*
+ * Net DIM profiles:
+ *        There is a different set of profiles for each CQ period mode.
+ *        There is a different set of profiles for RX/TX CQs.
+ *        Each profile size must be of NET_DIM_PARAMS_NUM_PROFILES
+ */
+#define NET_DIM_PARAMS_NUM_PROFILES 5
+#define NET_DIM_DEFAULT_RX_CQ_MODERATION_PKTS_FROM_EQE 256
+#define NET_DIM_DEFAULT_TX_CQ_MODERATION_PKTS_FROM_EQE 128
+#define NET_DIM_DEF_PROFILE_CQE 1
+#define NET_DIM_DEF_PROFILE_EQE 1
+
+#define NET_DIM_RX_EQE_PROFILES { \
+	{1,   NET_DIM_DEFAULT_RX_CQ_MODERATION_PKTS_FROM_EQE}, \
+	{8,   NET_DIM_DEFAULT_RX_CQ_MODERATION_PKTS_FROM_EQE}, \
+	{64,  NET_DIM_DEFAULT_RX_CQ_MODERATION_PKTS_FROM_EQE}, \
+	{128, NET_DIM_DEFAULT_RX_CQ_MODERATION_PKTS_FROM_EQE}, \
+	{256, NET_DIM_DEFAULT_RX_CQ_MODERATION_PKTS_FROM_EQE}, \
 }
 
+#define NET_DIM_RX_CQE_PROFILES { \
+	{2,  256},             \
+	{8,  128},             \
+	{16, 64},              \
+	{32, 64},              \
+	{64, 64}               \
+}
+
+#define NET_DIM_TX_EQE_PROFILES { \
+	{1,   NET_DIM_DEFAULT_TX_CQ_MODERATION_PKTS_FROM_EQE},  \
+	{8,   NET_DIM_DEFAULT_TX_CQ_MODERATION_PKTS_FROM_EQE},  \
+	{32,  NET_DIM_DEFAULT_TX_CQ_MODERATION_PKTS_FROM_EQE},  \
+	{64,  NET_DIM_DEFAULT_TX_CQ_MODERATION_PKTS_FROM_EQE},  \
+	{128, NET_DIM_DEFAULT_TX_CQ_MODERATION_PKTS_FROM_EQE}   \
+}
+
+#define NET_DIM_TX_CQE_PROFILES { \
+	{5,  128},  \
+	{8,  64},  \
+	{16, 32},  \
+	{32, 32},  \
+	{64, 32}   \
+}
+
+static const struct dim_cq_moder
+rx_profile[DIM_CQ_PERIOD_NUM_MODES][NET_DIM_PARAMS_NUM_PROFILES] = {
+	NET_DIM_RX_EQE_PROFILES,
+	NET_DIM_RX_CQE_PROFILES,
+};
+
+static const struct dim_cq_moder
+tx_profile[DIM_CQ_PERIOD_NUM_MODES][NET_DIM_PARAMS_NUM_PROFILES] = {
+	NET_DIM_TX_EQE_PROFILES,
+	NET_DIM_TX_CQE_PROFILES,
+};
+
+/**
+ *	net_dim_get_rx_moderation - provide a CQ moderation object for the given RX profile
+ *	@cq_period_mode: CQ period mode
+ *	@ix: Profile index
+ */
+struct dim_cq_moder net_dim_get_rx_moderation(u8 cq_period_mode, int ix);
+
+/**
+ *	net_dim_get_def_rx_moderation - provide the default RX moderation
+ *	@cq_period_mode: CQ period mode
+ */
+struct dim_cq_moder net_dim_get_def_rx_moderation(u8 cq_period_mode);
+
+/**
+ *	net_dim_get_tx_moderation - provide a CQ moderation object for the given TX profile
+ *	@cq_period_mode: CQ period mode
+ *	@ix: Profile index
+ */
+struct dim_cq_moder net_dim_get_tx_moderation(u8 cq_period_mode, int ix);
+
+/**
+ *	net_dim_get_def_tx_moderation - provide the default TX moderation
+ *	@cq_period_mode: CQ period mode
+ */
+struct dim_cq_moder net_dim_get_def_tx_moderation(u8 cq_period_mode);
+
+/**
+ *	net_dim - main DIM algorithm entry point
+ *	@dim: DIM instance information
+ *	@end_sample: Current data measurement
+ *
+ * Called by the consumer.
+ * This is the main logic of the algorithm, where data is processed in order to decide on next
+ * required action.
+ */
+void net_dim(struct dim *dim, struct dim_sample end_sample);
+
 #endif /* DIM_H */
diff --git a/include/linux/net_dim.h b/include/linux/net_dim.h
deleted file mode 100644
index 4e009ec193ef..000000000000
--- a/include/linux/net_dim.h
+++ /dev/null
@@ -1,273 +0,0 @@
-/*
- * Copyright (c) 2016, Mellanox Technologies. All rights reserved.
- * Copyright (c) 2017-2018, Broadcom Limited. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef NET_DIM_H
-#define NET_DIM_H
-
-#include <linux/module.h>
-#include <linux/dim.h>
-
-#define NET_DIM_PARAMS_NUM_PROFILES 5
-/* Netdev dynamic interrupt moderation profiles */
-#define NET_DIM_DEFAULT_RX_CQ_MODERATION_PKTS_FROM_EQE 256
-#define NET_DIM_DEFAULT_TX_CQ_MODERATION_PKTS_FROM_EQE 128
-#define NET_DIM_DEF_PROFILE_CQE 1
-#define NET_DIM_DEF_PROFILE_EQE 1
-
-/* All profiles sizes must be NET_PARAMS_DIM_NUM_PROFILES */
-#define NET_DIM_RX_EQE_PROFILES { \
-	{1,   NET_DIM_DEFAULT_RX_CQ_MODERATION_PKTS_FROM_EQE}, \
-	{8,   NET_DIM_DEFAULT_RX_CQ_MODERATION_PKTS_FROM_EQE}, \
-	{64,  NET_DIM_DEFAULT_RX_CQ_MODERATION_PKTS_FROM_EQE}, \
-	{128, NET_DIM_DEFAULT_RX_CQ_MODERATION_PKTS_FROM_EQE}, \
-	{256, NET_DIM_DEFAULT_RX_CQ_MODERATION_PKTS_FROM_EQE}, \
-}
-
-#define NET_DIM_RX_CQE_PROFILES { \
-	{2,  256},             \
-	{8,  128},             \
-	{16, 64},              \
-	{32, 64},              \
-	{64, 64}               \
-}
-
-#define NET_DIM_TX_EQE_PROFILES { \
-	{1,   NET_DIM_DEFAULT_TX_CQ_MODERATION_PKTS_FROM_EQE},  \
-	{8,   NET_DIM_DEFAULT_TX_CQ_MODERATION_PKTS_FROM_EQE},  \
-	{32,  NET_DIM_DEFAULT_TX_CQ_MODERATION_PKTS_FROM_EQE},  \
-	{64,  NET_DIM_DEFAULT_TX_CQ_MODERATION_PKTS_FROM_EQE},  \
-	{128, NET_DIM_DEFAULT_TX_CQ_MODERATION_PKTS_FROM_EQE}   \
-}
-
-#define NET_DIM_TX_CQE_PROFILES { \
-	{5,  128},  \
-	{8,  64},  \
-	{16, 32},  \
-	{32, 32},  \
-	{64, 32}   \
-}
-
-static const struct dim_cq_moder
-rx_profile[DIM_CQ_PERIOD_NUM_MODES][NET_DIM_PARAMS_NUM_PROFILES] = {
-	NET_DIM_RX_EQE_PROFILES,
-	NET_DIM_RX_CQE_PROFILES,
-};
-
-static const struct dim_cq_moder
-tx_profile[DIM_CQ_PERIOD_NUM_MODES][NET_DIM_PARAMS_NUM_PROFILES] = {
-	NET_DIM_TX_EQE_PROFILES,
-	NET_DIM_TX_CQE_PROFILES,
-};
-
-static inline struct dim_cq_moder
-net_dim_get_rx_moderation(u8 cq_period_mode, int ix)
-{
-	struct dim_cq_moder cq_moder = rx_profile[cq_period_mode][ix];
-
-	cq_moder.cq_period_mode = cq_period_mode;
-	return cq_moder;
-}
-
-static inline struct dim_cq_moder
-net_dim_get_def_rx_moderation(u8 cq_period_mode)
-{
-	u8 profile_ix = cq_period_mode == DIM_CQ_PERIOD_MODE_START_FROM_CQE ?
-			NET_DIM_DEF_PROFILE_CQE : NET_DIM_DEF_PROFILE_EQE;
-
-	return net_dim_get_rx_moderation(cq_period_mode, profile_ix);
-}
-
-static inline struct dim_cq_moder
-net_dim_get_tx_moderation(u8 cq_period_mode, int ix)
-{
-	struct dim_cq_moder cq_moder = tx_profile[cq_period_mode][ix];
-
-	cq_moder.cq_period_mode = cq_period_mode;
-	return cq_moder;
-}
-
-static inline struct dim_cq_moder
-net_dim_get_def_tx_moderation(u8 cq_period_mode)
-{
-	u8 profile_ix = cq_period_mode == DIM_CQ_PERIOD_MODE_START_FROM_CQE ?
-			NET_DIM_DEF_PROFILE_CQE : NET_DIM_DEF_PROFILE_EQE;
-
-	return net_dim_get_tx_moderation(cq_period_mode, profile_ix);
-}
-
-static inline int net_dim_step(struct dim *dim)
-{
-	if (dim->tired == (NET_DIM_PARAMS_NUM_PROFILES * 2))
-		return DIM_TOO_TIRED;
-
-	switch (dim->tune_state) {
-	case DIM_PARKING_ON_TOP:
-	case DIM_PARKING_TIRED:
-		break;
-	case DIM_GOING_RIGHT:
-		if (dim->profile_ix == (NET_DIM_PARAMS_NUM_PROFILES - 1))
-			return DIM_ON_EDGE;
-		dim->profile_ix++;
-		dim->steps_right++;
-		break;
-	case DIM_GOING_LEFT:
-		if (dim->profile_ix == 0)
-			return DIM_ON_EDGE;
-		dim->profile_ix--;
-		dim->steps_left++;
-		break;
-	}
-
-	dim->tired++;
-	return DIM_STEPPED;
-}
-
-static inline void net_dim_exit_parking(struct dim *dim)
-{
-	dim->tune_state = dim->profile_ix ? DIM_GOING_LEFT :
-					  DIM_GOING_RIGHT;
-	net_dim_step(dim);
-}
-
-static inline int net_dim_stats_compare(struct dim_stats *curr,
-					struct dim_stats *prev)
-{
-	if (!prev->bpms)
-		return curr->bpms ? DIM_STATS_BETTER :
-				    DIM_STATS_SAME;
-
-	if (IS_SIGNIFICANT_DIFF(curr->bpms, prev->bpms))
-		return (curr->bpms > prev->bpms) ? DIM_STATS_BETTER :
-						   DIM_STATS_WORSE;
-
-	if (!prev->ppms)
-		return curr->ppms ? DIM_STATS_BETTER :
-				    DIM_STATS_SAME;
-
-	if (IS_SIGNIFICANT_DIFF(curr->ppms, prev->ppms))
-		return (curr->ppms > prev->ppms) ? DIM_STATS_BETTER :
-						   DIM_STATS_WORSE;
-
-	if (!prev->epms)
-		return DIM_STATS_SAME;
-
-	if (IS_SIGNIFICANT_DIFF(curr->epms, prev->epms))
-		return (curr->epms < prev->epms) ? DIM_STATS_BETTER :
-						   DIM_STATS_WORSE;
-
-	return DIM_STATS_SAME;
-}
-
-static inline bool net_dim_decision(struct dim_stats *curr_stats,
-				    struct dim *dim)
-{
-	int prev_state = dim->tune_state;
-	int prev_ix = dim->profile_ix;
-	int stats_res;
-	int step_res;
-
-	switch (dim->tune_state) {
-	case DIM_PARKING_ON_TOP:
-		stats_res = net_dim_stats_compare(curr_stats, &dim->prev_stats);
-		if (stats_res != DIM_STATS_SAME)
-			net_dim_exit_parking(dim);
-		break;
-
-	case DIM_PARKING_TIRED:
-		dim->tired--;
-		if (!dim->tired)
-			net_dim_exit_parking(dim);
-		break;
-
-	case DIM_GOING_RIGHT:
-	case DIM_GOING_LEFT:
-		stats_res = net_dim_stats_compare(curr_stats, &dim->prev_stats);
-		if (stats_res != DIM_STATS_BETTER)
-			dim_turn(dim);
-
-		if (dim_on_top(dim)) {
-			dim_park_on_top(dim);
-			break;
-		}
-
-		step_res = net_dim_step(dim);
-		switch (step_res) {
-		case DIM_ON_EDGE:
-			dim_park_on_top(dim);
-			break;
-		case DIM_TOO_TIRED:
-			dim_park_tired(dim);
-			break;
-		}
-
-		break;
-	}
-
-	if (prev_state != DIM_PARKING_ON_TOP ||
-	    dim->tune_state != DIM_PARKING_ON_TOP)
-		dim->prev_stats = *curr_stats;
-
-	return dim->profile_ix != prev_ix;
-}
-
-static inline void net_dim(struct dim *dim,
-			   struct dim_sample end_sample)
-{
-	struct dim_stats curr_stats;
-	u16 nevents;
-
-	switch (dim->state) {
-	case DIM_MEASURE_IN_PROGRESS:
-		nevents = BIT_GAP(BITS_PER_TYPE(u16),
-				  end_sample.event_ctr,
-				  dim->start_sample.event_ctr);
-		if (nevents < DIM_NEVENTS)
-			break;
-		dim_calc_stats(&dim->start_sample, &end_sample, &curr_stats);
-		if (net_dim_decision(&curr_stats, dim)) {
-			dim->state = DIM_APPLY_NEW_PROFILE;
-			schedule_work(&dim->work);
-			break;
-		}
-		/* fall through */
-	case DIM_START_MEASURE:
-		dim_update_sample(end_sample.event_ctr, end_sample.pkt_ctr,
-				  end_sample.byte_ctr, &dim->start_sample);
-		dim->state = DIM_MEASURE_IN_PROGRESS;
-		break;
-	case DIM_APPLY_NEW_PROFILE:
-		break;
-	}
-}
-
-#endif /* NET_DIM_H */
diff --git a/lib/Kconfig b/lib/Kconfig
index 90623a0e1942..78ddb9526b62 100644
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -562,6 +562,14 @@ config SIGNATURE
 	  Digital signature verification. Currently only RSA is supported.
 	  Implementation is done using GnuPG MPI library
 
+config DIMLIB
+	bool "DIM library"
+	default y
+	help
+	  Dynamic Interrupt Moderation library.
+	  Implements an algorithm for dynamically changing CQ moderation values
+	  according to run time performance.
+
 #
 # libfdt files, only selected if needed.
 #
diff --git a/lib/Makefile b/lib/Makefile
index fb7697031a79..dcb558c7554d 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -202,6 +202,7 @@ obj-$(CONFIG_GLOB) += glob.o
 obj-$(CONFIG_GLOB_SELFTEST) += globtest.o
 
 obj-$(CONFIG_MPILIB) += mpi/
+obj-$(CONFIG_DIMLIB) += dim/
 obj-$(CONFIG_SIGNATURE) += digsig.o
 
 lib-$(CONFIG_CLZ_TAB) += clz_tab.o
diff --git a/lib/dim/Makefile b/lib/dim/Makefile
new file mode 100644
index 000000000000..160afe288df0
--- /dev/null
+++ b/lib/dim/Makefile
@@ -0,0 +1,9 @@
+#
+# DIM Dynamic Interrupt Moderation library
+#
+
+obj-$(CONFIG_DIMLIB) = net_dim.o
+
+net_dim-y = \
+	dim.o		\
+	net_dim.o
diff --git a/lib/dim/dim.c b/lib/dim/dim.c
new file mode 100644
index 000000000000..17d5236759bd
--- /dev/null
+++ b/lib/dim/dim.c
@@ -0,0 +1,74 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+/*
+ * Copyright (c) 2019, Mellanox Technologies inc.  All rights reserved.
+ */
+
+#include <linux/dim.h>
+
+bool dim_on_top(struct dim *dim)
+{
+	switch (dim->tune_state) {
+	case DIM_PARKING_ON_TOP:
+	case DIM_PARKING_TIRED:
+		return true;
+	case DIM_GOING_RIGHT:
+		return (dim->steps_left > 1) && (dim->steps_right == 1);
+	default: /* DIM_GOING_LEFT */
+		return (dim->steps_right > 1) && (dim->steps_left == 1);
+	}
+}
+EXPORT_SYMBOL(dim_on_top);
+
+void dim_turn(struct dim *dim)
+{
+	switch (dim->tune_state) {
+	case DIM_PARKING_ON_TOP:
+	case DIM_PARKING_TIRED:
+		break;
+	case DIM_GOING_RIGHT:
+		dim->tune_state = DIM_GOING_LEFT;
+		dim->steps_left = 0;
+		break;
+	case DIM_GOING_LEFT:
+		dim->tune_state = DIM_GOING_RIGHT;
+		dim->steps_right = 0;
+		break;
+	}
+}
+EXPORT_SYMBOL(dim_turn);
+
+void dim_park_on_top(struct dim *dim)
+{
+	dim->steps_right  = 0;
+	dim->steps_left   = 0;
+	dim->tired        = 0;
+	dim->tune_state   = DIM_PARKING_ON_TOP;
+}
+EXPORT_SYMBOL(dim_park_on_top);
+
+void dim_park_tired(struct dim *dim)
+{
+	dim->steps_right  = 0;
+	dim->steps_left   = 0;
+	dim->tune_state   = DIM_PARKING_TIRED;
+}
+EXPORT_SYMBOL(dim_park_tired);
+
+void dim_calc_stats(struct dim_sample *start, struct dim_sample *end,
+		    struct dim_stats *curr_stats)
+{
+	/* u32 holds up to 71 minutes, should be enough */
+	u32 delta_us = ktime_us_delta(end->time, start->time);
+	u32 npkts = BIT_GAP(BITS_PER_TYPE(u32), end->pkt_ctr, start->pkt_ctr);
+	u32 nbytes = BIT_GAP(BITS_PER_TYPE(u32), end->byte_ctr,
+			     start->byte_ctr);
+
+	if (!delta_us)
+		return;
+
+	curr_stats->ppms = DIV_ROUND_UP(npkts * USEC_PER_MSEC, delta_us);
+	curr_stats->bpms = DIV_ROUND_UP(nbytes * USEC_PER_MSEC, delta_us);
+	curr_stats->epms = DIV_ROUND_UP(DIM_NEVENTS * USEC_PER_MSEC,
+					delta_us);
+}
+EXPORT_SYMBOL(dim_calc_stats);
diff --git a/lib/dim/net_dim.c b/lib/dim/net_dim.c
new file mode 100644
index 000000000000..5bcc902c5388
--- /dev/null
+++ b/lib/dim/net_dim.c
@@ -0,0 +1,190 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+/*
+ * Copyright (c) 2018, Mellanox Technologies inc.  All rights reserved.
+ */
+
+#include <linux/dim.h>
+
+struct dim_cq_moder
+net_dim_get_rx_moderation(u8 cq_period_mode, int ix)
+{
+	struct dim_cq_moder cq_moder = rx_profile[cq_period_mode][ix];
+
+	cq_moder.cq_period_mode = cq_period_mode;
+	return cq_moder;
+}
+EXPORT_SYMBOL(net_dim_get_rx_moderation);
+
+struct dim_cq_moder
+net_dim_get_def_rx_moderation(u8 cq_period_mode)
+{
+	u8 profile_ix = cq_period_mode == DIM_CQ_PERIOD_MODE_START_FROM_CQE ?
+			NET_DIM_DEF_PROFILE_CQE : NET_DIM_DEF_PROFILE_EQE;
+
+	return net_dim_get_rx_moderation(cq_period_mode, profile_ix);
+}
+EXPORT_SYMBOL(net_dim_get_def_rx_moderation);
+
+struct dim_cq_moder
+net_dim_get_tx_moderation(u8 cq_period_mode, int ix)
+{
+	struct dim_cq_moder cq_moder = tx_profile[cq_period_mode][ix];
+
+	cq_moder.cq_period_mode = cq_period_mode;
+	return cq_moder;
+}
+EXPORT_SYMBOL(net_dim_get_tx_moderation);
+
+struct dim_cq_moder
+net_dim_get_def_tx_moderation(u8 cq_period_mode)
+{
+	u8 profile_ix = cq_period_mode == DIM_CQ_PERIOD_MODE_START_FROM_CQE ?
+			NET_DIM_DEF_PROFILE_CQE : NET_DIM_DEF_PROFILE_EQE;
+
+	return net_dim_get_tx_moderation(cq_period_mode, profile_ix);
+}
+EXPORT_SYMBOL(net_dim_get_def_tx_moderation);
+
+static int net_dim_step(struct dim *dim)
+{
+	if (dim->tired == (NET_DIM_PARAMS_NUM_PROFILES * 2))
+		return DIM_TOO_TIRED;
+
+	switch (dim->tune_state) {
+	case DIM_PARKING_ON_TOP:
+	case DIM_PARKING_TIRED:
+		break;
+	case DIM_GOING_RIGHT:
+		if (dim->profile_ix == (NET_DIM_PARAMS_NUM_PROFILES - 1))
+			return DIM_ON_EDGE;
+		dim->profile_ix++;
+		dim->steps_right++;
+		break;
+	case DIM_GOING_LEFT:
+		if (dim->profile_ix == 0)
+			return DIM_ON_EDGE;
+		dim->profile_ix--;
+		dim->steps_left++;
+		break;
+	}
+
+	dim->tired++;
+	return DIM_STEPPED;
+}
+
+static void net_dim_exit_parking(struct dim *dim)
+{
+	dim->tune_state = dim->profile_ix ? DIM_GOING_LEFT : DIM_GOING_RIGHT;
+	net_dim_step(dim);
+}
+
+static int net_dim_stats_compare(struct dim_stats *curr,
+				 struct dim_stats *prev)
+{
+	if (!prev->bpms)
+		return curr->bpms ? DIM_STATS_BETTER : DIM_STATS_SAME;
+
+	if (IS_SIGNIFICANT_DIFF(curr->bpms, prev->bpms))
+		return (curr->bpms > prev->bpms) ? DIM_STATS_BETTER :
+						   DIM_STATS_WORSE;
+
+	if (!prev->ppms)
+		return curr->ppms ? DIM_STATS_BETTER :
+				    DIM_STATS_SAME;
+
+	if (IS_SIGNIFICANT_DIFF(curr->ppms, prev->ppms))
+		return (curr->ppms > prev->ppms) ? DIM_STATS_BETTER :
+						   DIM_STATS_WORSE;
+
+	if (!prev->epms)
+		return DIM_STATS_SAME;
+
+	if (IS_SIGNIFICANT_DIFF(curr->epms, prev->epms))
+		return (curr->epms < prev->epms) ? DIM_STATS_BETTER :
+						   DIM_STATS_WORSE;
+
+	return DIM_STATS_SAME;
+}
+
+static bool net_dim_decision(struct dim_stats *curr_stats, struct dim *dim)
+{
+	int prev_state = dim->tune_state;
+	int prev_ix = dim->profile_ix;
+	int stats_res;
+	int step_res;
+
+	switch (dim->tune_state) {
+	case DIM_PARKING_ON_TOP:
+		stats_res = net_dim_stats_compare(curr_stats,
+						  &dim->prev_stats);
+		if (stats_res != DIM_STATS_SAME)
+			net_dim_exit_parking(dim);
+		break;
+
+	case DIM_PARKING_TIRED:
+		dim->tired--;
+		if (!dim->tired)
+			net_dim_exit_parking(dim);
+		break;
+
+	case DIM_GOING_RIGHT:
+	case DIM_GOING_LEFT:
+		stats_res = net_dim_stats_compare(curr_stats,
+						  &dim->prev_stats);
+		if (stats_res != DIM_STATS_BETTER)
+			dim_turn(dim);
+
+		if (dim_on_top(dim)) {
+			dim_park_on_top(dim);
+			break;
+		}
+
+		step_res = net_dim_step(dim);
+		switch (step_res) {
+		case DIM_ON_EDGE:
+			dim_park_on_top(dim);
+			break;
+		case DIM_TOO_TIRED:
+			dim_park_tired(dim);
+			break;
+		}
+
+		break;
+	}
+
+	if (prev_state != DIM_PARKING_ON_TOP ||
+	    dim->tune_state != DIM_PARKING_ON_TOP)
+		dim->prev_stats = *curr_stats;
+
+	return dim->profile_ix != prev_ix;
+}
+
+void net_dim(struct dim *dim, struct dim_sample end_sample)
+{
+	struct dim_stats curr_stats;
+	u16 nevents;
+
+	switch (dim->state) {
+	case DIM_MEASURE_IN_PROGRESS:
+		nevents = BIT_GAP(BITS_PER_TYPE(u16),
+				  end_sample.event_ctr,
+				  dim->start_sample.event_ctr);
+		if (nevents < DIM_NEVENTS)
+			break;
+		dim_calc_stats(&dim->start_sample, &end_sample, &curr_stats);
+		if (net_dim_decision(&curr_stats, dim)) {
+			dim->state = DIM_APPLY_NEW_PROFILE;
+			schedule_work(&dim->work);
+			break;
+		}
+		/* fall through */
+	case DIM_START_MEASURE:
+		dim_update_sample(end_sample.event_ctr, end_sample.pkt_ctr,
+				  end_sample.byte_ctr, &dim->start_sample);
+		dim->state = DIM_MEASURE_IN_PROGRESS;
+		break;
+	case DIM_APPLY_NEW_PROFILE:
+		break;
+	}
+}
+EXPORT_SYMBOL(net_dim);
-- 
cgit v1.2.3


From 398c2b05bbee21cc172dfff017c0351d4d14e04c Mon Sep 17 00:00:00 2001
From: Yamin Friedman <yaminf@mellanox.com>
Date: Thu, 22 Nov 2018 09:51:17 +0200
Subject: linux/dim: Add completions count to dim_sample

Add a measurement of completions per msec to allow for completion-based
DIM algorithms.

In order to use dynamic interrupt moderation with RDMA we need to have a
different measurement than packets per second. This change is meant to
prepare for adding a new DIM method.

All drivers that use net_dim and thus do not need a completion count will
have the completions set to 0.

Signed-off-by: Yamin Friedman <yaminf@mellanox.com>
Reviewed-by: Max Gurtovoy <maxg@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 include/linux/dim.h | 28 +++++++++++++++++++++++++---
 lib/dim/dim.c       |  9 +++++++++
 2 files changed, 34 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/dim.h b/include/linux/dim.h
index f48ede3e0322..aa9bdd47a648 100644
--- a/include/linux/dim.h
+++ b/include/linux/dim.h
@@ -37,6 +37,7 @@
 struct dim_cq_moder {
 	u16 usec;
 	u16 pkts;
+	u16 comps;
 	u8 cq_period_mode;
 };
 
@@ -54,6 +55,7 @@ struct dim_sample {
 	u32 pkt_ctr;
 	u32 byte_ctr;
 	u16 event_ctr;
+	u32 comp_ctr;
 };
 
 /**
@@ -65,9 +67,11 @@ struct dim_sample {
  * @epms: Events per msec
  */
 struct dim_stats {
-	int ppms;
-	int bpms;
-	int epms;
+	int ppms; /* packets per msec */
+	int bpms; /* bytes per msec */
+	int epms; /* events per msec */
+	int cpms; /* completions per msec */
+	int cpe_ratio; /* ratio of completions to events */
 };
 
 /**
@@ -89,6 +93,7 @@ struct dim {
 	u8 state;
 	struct dim_stats prev_stats;
 	struct dim_sample start_sample;
+	struct dim_sample measuring_sample;
 	struct work_struct work;
 	u8 profile_ix;
 	u8 mode;
@@ -246,6 +251,23 @@ dim_update_sample(u16 event_ctr, u64 packets, u64 bytes, struct dim_sample *s)
 	s->event_ctr = event_ctr;
 }
 
+/**
+ *	dim_update_sample_with_comps - set a sample's fields with given
+ *	values including the completion parameter
+ *	@event_ctr: number of events to set
+ *	@packets: number of packets to set
+ *	@bytes: number of bytes to set
+ *	@comps: number of completions to set
+ *	@s: DIM sample
+ */
+static inline void
+dim_update_sample_with_comps(u16 event_ctr, u64 packets, u64 bytes, u64 comps,
+			     struct dim_sample *s)
+{
+	dim_update_sample(event_ctr, packets, bytes, s);
+	s->comp_ctr = comps;
+}
+
 /* Net DIM */
 
 /*
diff --git a/lib/dim/dim.c b/lib/dim/dim.c
index 17d5236759bd..439d641ec796 100644
--- a/lib/dim/dim.c
+++ b/lib/dim/dim.c
@@ -62,6 +62,8 @@ void dim_calc_stats(struct dim_sample *start, struct dim_sample *end,
 	u32 npkts = BIT_GAP(BITS_PER_TYPE(u32), end->pkt_ctr, start->pkt_ctr);
 	u32 nbytes = BIT_GAP(BITS_PER_TYPE(u32), end->byte_ctr,
 			     start->byte_ctr);
+	u32 ncomps = BIT_GAP(BITS_PER_TYPE(u32), end->comp_ctr,
+			     start->comp_ctr);
 
 	if (!delta_us)
 		return;
@@ -70,5 +72,12 @@ void dim_calc_stats(struct dim_sample *start, struct dim_sample *end,
 	curr_stats->bpms = DIV_ROUND_UP(nbytes * USEC_PER_MSEC, delta_us);
 	curr_stats->epms = DIV_ROUND_UP(DIM_NEVENTS * USEC_PER_MSEC,
 					delta_us);
+	curr_stats->cpms = DIV_ROUND_UP(ncomps * USEC_PER_MSEC, delta_us);
+	if (curr_stats->epms != 0)
+		curr_stats->cpe_ratio =
+				(curr_stats->cpms * 100) / curr_stats->epms;
+	else
+		curr_stats->cpe_ratio = 0;
+
 }
 EXPORT_SYMBOL(dim_calc_stats);
-- 
cgit v1.2.3


From 2f25528e4edddc6eddd42c8d41c9c9e341c8b9da Mon Sep 17 00:00:00 2001
From: Sylwester Nawrocki <s.nawrocki@samsung.com>
Date: Wed, 19 Jun 2019 11:39:25 +0200
Subject: clk: Add clk_bulk_get_optional() function

clk_bulk_get_optional() allows getting a group of clocks where one
or more are optional.  For an unavailable clock, e.g. one not specified
in the clock consumer node in DT, the respective struct clk pointer
will be NULL.  This allows for operating on a group of returned
clocks (struct clk_bulk_data array) with existing clk_bulk* APIs.

Signed-off-by: Sylwester Nawrocki <s.nawrocki@samsung.com>
Signed-off-by: Stephen Boyd <sboyd@kernel.org>
---
 drivers/clk/clk-bulk.c | 23 ++++++++++++++++++++---
 include/linux/clk.h    | 19 +++++++++++++++++++
 2 files changed, 39 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/clk/clk-bulk.c b/drivers/clk/clk-bulk.c
index 06499568cf07..524bf9a53098 100644
--- a/drivers/clk/clk-bulk.c
+++ b/drivers/clk/clk-bulk.c
@@ -75,8 +75,8 @@ void clk_bulk_put(int num_clks, struct clk_bulk_data *clks)
 }
 EXPORT_SYMBOL_GPL(clk_bulk_put);
 
-int __must_check clk_bulk_get(struct device *dev, int num_clks,
-			      struct clk_bulk_data *clks)
+static int __clk_bulk_get(struct device *dev, int num_clks,
+			  struct clk_bulk_data *clks, bool optional)
 {
 	int ret;
 	int i;
@@ -88,10 +88,14 @@ int __must_check clk_bulk_get(struct device *dev, int num_clks,
 		clks[i].clk = clk_get(dev, clks[i].id);
 		if (IS_ERR(clks[i].clk)) {
 			ret = PTR_ERR(clks[i].clk);
+			clks[i].clk = NULL;
+
+			if (ret == -ENOENT && optional)
+				continue;
+
 			if (ret != -EPROBE_DEFER)
 				dev_err(dev, "Failed to get clk '%s': %d\n",
 					clks[i].id, ret);
-			clks[i].clk = NULL;
 			goto err;
 		}
 	}
@@ -103,8 +107,21 @@ err:
 
 	return ret;
 }
+
+int __must_check clk_bulk_get(struct device *dev, int num_clks,
+			      struct clk_bulk_data *clks)
+{
+	return __clk_bulk_get(dev, num_clks, clks, false);
+}
 EXPORT_SYMBOL(clk_bulk_get);
 
+int __must_check clk_bulk_get_optional(struct device *dev, int num_clks,
+				       struct clk_bulk_data *clks)
+{
+	return __clk_bulk_get(dev, num_clks, clks, true);
+}
+EXPORT_SYMBOL_GPL(clk_bulk_get_optional);
+
 void clk_bulk_put_all(int num_clks, struct clk_bulk_data *clks)
 {
 	if (IS_ERR_OR_NULL(clks))
diff --git a/include/linux/clk.h b/include/linux/clk.h
index f689fc58d7be..1b50e7d1675c 100644
--- a/include/linux/clk.h
+++ b/include/linux/clk.h
@@ -332,6 +332,19 @@ int __must_check clk_bulk_get(struct device *dev, int num_clks,
  */
 int __must_check clk_bulk_get_all(struct device *dev,
 				  struct clk_bulk_data **clks);
+
+/**
+ * clk_bulk_get_optional - lookup and obtain a number of references to clock producer
+ * @dev: device for clock "consumer"
+ * @num_clks: the number of clk_bulk_data
+ * @clks: the clk_bulk_data table of consumer
+ *
+ * Behaves the same as clk_bulk_get() except where there is no clock producer.
+ * In this case, instead of returning -ENOENT, the function returns 0 and
+ * NULL for a clk for which a clock producer could not be determined.
+ */
+int __must_check clk_bulk_get_optional(struct device *dev, int num_clks,
+				       struct clk_bulk_data *clks);
 /**
  * devm_clk_bulk_get - managed get multiple clk consumers
  * @dev: device for clock "consumer"
@@ -718,6 +731,12 @@ static inline int __must_check clk_bulk_get(struct device *dev, int num_clks,
 	return 0;
 }
 
+static inline int __must_check clk_bulk_get_optional(struct device *dev,
+				int num_clks, struct clk_bulk_data *clks)
+{
+	return 0;
+}
+
 static inline int __must_check clk_bulk_get_all(struct device *dev,
 					 struct clk_bulk_data **clks)
 {
-- 
cgit v1.2.3


From 9bd5ef0bd8743700d9adffb6fbb1baa346575457 Mon Sep 17 00:00:00 2001
From: Sylwester Nawrocki <s.nawrocki@samsung.com>
Date: Wed, 19 Jun 2019 11:39:26 +0200
Subject: clk: Add devm_clk_bulk_get_optional() function

Add managed version of the clk_bulk_get_optional() helper function.

Signed-off-by: Sylwester Nawrocki <s.nawrocki@samsung.com>
[sboyd@kernel.org: Mark __devm_clk_bulk_get() static]
Signed-off-by: Stephen Boyd <sboyd@kernel.org>
---
 drivers/clk/clk-devres.c | 22 +++++++++++++++++++---
 include/linux/clk.h      | 28 ++++++++++++++++++++++++++++
 2 files changed, 47 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/clk/clk-devres.c b/drivers/clk/clk-devres.c
index daa1fc8fba53..be160764911b 100644
--- a/drivers/clk/clk-devres.c
+++ b/drivers/clk/clk-devres.c
@@ -52,8 +52,8 @@ static void devm_clk_bulk_release(struct device *dev, void *res)
 	clk_bulk_put(devres->num_clks, devres->clks);
 }
 
-int __must_check devm_clk_bulk_get(struct device *dev, int num_clks,
-		      struct clk_bulk_data *clks)
+static int __devm_clk_bulk_get(struct device *dev, int num_clks,
+			       struct clk_bulk_data *clks, bool optional)
 {
 	struct clk_bulk_devres *devres;
 	int ret;
@@ -63,7 +63,10 @@ int __must_check devm_clk_bulk_get(struct device *dev, int num_clks,
 	if (!devres)
 		return -ENOMEM;
 
-	ret = clk_bulk_get(dev, num_clks, clks);
+	if (optional)
+		ret = clk_bulk_get_optional(dev, num_clks, clks);
+	else
+		ret = clk_bulk_get(dev, num_clks, clks);
 	if (!ret) {
 		devres->clks = clks;
 		devres->num_clks = num_clks;
@@ -74,8 +77,21 @@ int __must_check devm_clk_bulk_get(struct device *dev, int num_clks,
 
 	return ret;
 }
+
+int __must_check devm_clk_bulk_get(struct device *dev, int num_clks,
+		      struct clk_bulk_data *clks)
+{
+	return __devm_clk_bulk_get(dev, num_clks, clks, false);
+}
 EXPORT_SYMBOL_GPL(devm_clk_bulk_get);
 
+int __must_check devm_clk_bulk_get_optional(struct device *dev, int num_clks,
+		      struct clk_bulk_data *clks)
+{
+	return __devm_clk_bulk_get(dev, num_clks, clks, true);
+}
+EXPORT_SYMBOL_GPL(devm_clk_bulk_get_optional);
+
 int __must_check devm_clk_bulk_get_all(struct device *dev,
 				       struct clk_bulk_data **clks)
 {
diff --git a/include/linux/clk.h b/include/linux/clk.h
index 1b50e7d1675c..5e7b2dd84965 100644
--- a/include/linux/clk.h
+++ b/include/linux/clk.h
@@ -359,6 +359,28 @@ int __must_check clk_bulk_get_optional(struct device *dev, int num_clks,
  */
 int __must_check devm_clk_bulk_get(struct device *dev, int num_clks,
 				   struct clk_bulk_data *clks);
+/**
+ * devm_clk_bulk_get_optional - managed get multiple optional consumer clocks
+ * @dev: device for clock "consumer"
+ * @clks: pointer to the clk_bulk_data table of consumer
+ *
+ * Behaves the same as devm_clk_bulk_get() except where there is no clock
+ * producer.  In this case, instead of returning -ENOENT, the function returns
+ * NULL for given clk. It is assumed all clocks in clk_bulk_data are optional.
+ *
+ * Returns 0 if all clocks specified in clk_bulk_data table are obtained
+ * successfully or for any clk there was no clk provider available, otherwise
+ * returns valid IS_ERR() condition containing errno.
+ * The implementation uses @dev and @clk_bulk_data.id to determine the
+ * clock consumer, and thereby the clock producer.
+ * The clock returned is stored in each @clk_bulk_data.clk field.
+ *
+ * Drivers must assume that the clock source is not enabled.
+ *
+ * clk_bulk_get should not be called from within interrupt context.
+ */
+int __must_check devm_clk_bulk_get_optional(struct device *dev, int num_clks,
+					    struct clk_bulk_data *clks);
 /**
  * devm_clk_bulk_get_all - managed get multiple clk consumers
  * @dev: device for clock "consumer"
@@ -760,6 +782,12 @@ static inline int __must_check devm_clk_bulk_get(struct device *dev, int num_clk
 	return 0;
 }
 
+static inline int __must_check devm_clk_bulk_get_optional(struct device *dev,
+				int num_clks, struct clk_bulk_data *clks)
+{
+	return 0;
+}
+
 static inline int __must_check devm_clk_bulk_get_all(struct device *dev,
 						     struct clk_bulk_data **clks)
 {
-- 
cgit v1.2.3


From 0966648dd5a5f4037d29d233866b7a4db39d07f7 Mon Sep 17 00:00:00 2001
From: Yoshihiro Shimoda <yoshihiro.shimoda.uh@renesas.com>
Date: Tue, 25 Jun 2019 14:38:46 +0900
Subject: usb: renesas_usbhs: remove notify_hotplug callback

The notify_hotplug callback was supported in v3.10, but the last user
(armadillo800eva) was removed by the commit 1fa59bda21c7 ("ARM: shmobile:
Remove legacy board code for Armadillo-800 EVA"). So, this patch
removes it.

Signed-off-by: Yoshihiro Shimoda <yoshihiro.shimoda.uh@renesas.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/usb/renesas_usbhs/common.c     | 16 ++++------------
 drivers/usb/renesas_usbhs/common.h     |  2 ++
 drivers/usb/renesas_usbhs/mod.c        |  3 ++-
 drivers/usb/renesas_usbhs/mod_gadget.c |  3 ++-
 include/linux/usb/renesas_usbhs.h      | 26 +-------------------------
 5 files changed, 11 insertions(+), 39 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/usb/renesas_usbhs/common.c b/drivers/usb/renesas_usbhs/common.c
index ebbe322182bd..f6b136a4f91e 100644
--- a/drivers/usb/renesas_usbhs/common.c
+++ b/drivers/usb/renesas_usbhs/common.c
@@ -3,6 +3,7 @@
  * Renesas USB driver
  *
  * Copyright (C) 2011 Renesas Solutions Corp.
+ * Copyright (C) 2019 Renesas Electronics Corporation
  * Kuninori Morimoto <kuninori.morimoto.gx@renesas.com>
  */
 #include <linux/clk.h>
@@ -513,7 +514,7 @@ static void usbhsc_notify_hotplug(struct work_struct *work)
 	usbhsc_hotplug(priv);
 }
 
-static int usbhsc_drvcllbck_notify_hotplug(struct platform_device *pdev)
+int usbhsc_schedule_notify_hotplug(struct platform_device *pdev)
 {
 	struct usbhs_priv *priv = usbhs_pdev_to_priv(pdev);
 	int delay = usbhs_get_dparam(priv, detection_delay);
@@ -667,7 +668,6 @@ static struct renesas_usbhs_platform_info *usbhs_parse_dt(struct device *dev)
 static int usbhs_probe(struct platform_device *pdev)
 {
 	struct renesas_usbhs_platform_info *info = renesas_usbhs_get_info(pdev);
-	struct renesas_usbhs_driver_callback *dfunc;
 	struct usbhs_priv *priv;
 	struct resource *res, *irq_res;
 	int ret;
@@ -721,10 +721,6 @@ static int usbhs_probe(struct platform_device *pdev)
 	}
 	priv->pfunc = info->platform_callback;
 
-	/* set driver callback functions for platform */
-	dfunc			= &info->driver_callback;
-	dfunc->notify_hotplug	= usbhsc_drvcllbck_notify_hotplug;
-
 	/* set default param if platform doesn't have */
 	if (!priv->dparam.pipe_configs) {
 		priv->dparam.pipe_configs = usbhsc_default_pipe;
@@ -818,7 +814,7 @@ static int usbhs_probe(struct platform_device *pdev)
 	/*
 	 * manual call notify_hotplug for cold plug
 	 */
-	usbhsc_drvcllbck_notify_hotplug(pdev);
+	usbhsc_schedule_notify_hotplug(pdev);
 
 	dev_info(&pdev->dev, "probed\n");
 
@@ -843,13 +839,9 @@ probe_end_pipe_exit:
 static int usbhs_remove(struct platform_device *pdev)
 {
 	struct usbhs_priv *priv = usbhs_pdev_to_priv(pdev);
-	struct renesas_usbhs_platform_info *info = renesas_usbhs_get_info(pdev);
-	struct renesas_usbhs_driver_callback *dfunc = &info->driver_callback;
 
 	dev_dbg(&pdev->dev, "usb remove\n");
 
-	dfunc->notify_hotplug = NULL;
-
 	/* power off */
 	if (!usbhs_get_dparam(priv, runtime_pwctrl))
 		usbhsc_power_ctrl(priv, 0);
@@ -894,7 +886,7 @@ static __maybe_unused int usbhsc_resume(struct device *dev)
 
 	usbhs_platform_call(priv, phy_reset, pdev);
 
-	usbhsc_drvcllbck_notify_hotplug(pdev);
+	usbhsc_schedule_notify_hotplug(pdev);
 
 	return 0;
 }
diff --git a/drivers/usb/renesas_usbhs/common.h b/drivers/usb/renesas_usbhs/common.h
index de74ebd1a347..b2b21fbb7ce2 100644
--- a/drivers/usb/renesas_usbhs/common.h
+++ b/drivers/usb/renesas_usbhs/common.h
@@ -3,6 +3,7 @@
  * Renesas USB driver
  *
  * Copyright (C) 2011 Renesas Solutions Corp.
+ * Copyright (C) 2019 Renesas Electronics Corporation
  * Kuninori Morimoto <kuninori.morimoto.gx@renesas.com>
  */
 #ifndef RENESAS_USB_DRIVER_H
@@ -317,6 +318,7 @@ void usbhs_bus_send_sof_enable(struct usbhs_priv *priv);
 void usbhs_bus_send_reset(struct usbhs_priv *priv);
 int usbhs_bus_get_speed(struct usbhs_priv *priv);
 int usbhs_vbus_ctrl(struct usbhs_priv *priv, int enable);
+int usbhsc_schedule_notify_hotplug(struct platform_device *pdev);
 
 /*
  * frame
diff --git a/drivers/usb/renesas_usbhs/mod.c b/drivers/usb/renesas_usbhs/mod.c
index 7475c4f64724..540472abb23a 100644
--- a/drivers/usb/renesas_usbhs/mod.c
+++ b/drivers/usb/renesas_usbhs/mod.c
@@ -3,6 +3,7 @@
  * Renesas USB driver
  *
  * Copyright (C) 2011 Renesas Solutions Corp.
+ * Copyright (C) 2019 Renesas Electronics Corporation
  * Kuninori Morimoto <kuninori.morimoto.gx@renesas.com>
  */
 #include <linux/interrupt.h>
@@ -41,7 +42,7 @@ static int usbhsm_autonomy_irq_vbus(struct usbhs_priv *priv,
 {
 	struct platform_device *pdev = usbhs_priv_to_pdev(priv);
 
-	renesas_usbhs_call_notify_hotplug(pdev);
+	usbhsc_schedule_notify_hotplug(pdev);
 
 	return 0;
 }
diff --git a/drivers/usb/renesas_usbhs/mod_gadget.c b/drivers/usb/renesas_usbhs/mod_gadget.c
index 59cac40aafcc..0c1e8fa528fc 100644
--- a/drivers/usb/renesas_usbhs/mod_gadget.c
+++ b/drivers/usb/renesas_usbhs/mod_gadget.c
@@ -3,6 +3,7 @@
  * Renesas USB driver
  *
  * Copyright (C) 2011 Renesas Solutions Corp.
+ * Copyright (C) 2019 Renesas Electronics Corporation
  * Kuninori Morimoto <kuninori.morimoto.gx@renesas.com>
  */
 #include <linux/delay.h>
@@ -1023,7 +1024,7 @@ static int usbhsg_vbus_session(struct usb_gadget *gadget, int is_active)
 
 	gpriv->vbus_active = !!is_active;
 
-	renesas_usbhs_call_notify_hotplug(pdev);
+	usbhsc_schedule_notify_hotplug(pdev);
 
 	return 0;
 }
diff --git a/include/linux/usb/renesas_usbhs.h b/include/linux/usb/renesas_usbhs.h
index b2cba7c74444..ac601be95ec0 100644
--- a/include/linux/usb/renesas_usbhs.h
+++ b/include/linux/usb/renesas_usbhs.h
@@ -3,6 +3,7 @@
  * Renesas USB
  *
  * Copyright (C) 2011 Renesas Solutions Corp.
+ * Copyright (C) 2019 Renesas Electronics Corporation
  * Kuninori Morimoto <kuninori.morimoto.gx@renesas.com>
  *
  * This program is distributed in the hope that it will be useful,
@@ -32,17 +33,6 @@ enum {
 	USBHS_MAX,
 };
 
-/*
- * callback functions table for driver
- *
- * These functions are called from platform for driver.
- * Callback function's pointer will be set before
- * renesas_usbhs_platform_callback :: hardware_init was called
- */
-struct renesas_usbhs_driver_callback {
-	int (*notify_hotplug)(struct platform_device *pdev);
-};
-
 /*
  * callback functions for platform
  *
@@ -213,12 +203,6 @@ struct renesas_usbhs_platform_info {
 	 */
 	struct renesas_usbhs_platform_callback	platform_callback;
 
-	/*
-	 * driver set these callback functions pointer.
-	 * platform can use it on callback functions
-	 */
-	struct renesas_usbhs_driver_callback	driver_callback;
-
 	/*
 	 * option:
 	 *
@@ -232,12 +216,4 @@ struct renesas_usbhs_platform_info {
  */
 #define renesas_usbhs_get_info(pdev)\
 	((struct renesas_usbhs_platform_info *)(pdev)->dev.platform_data)
-
-#define renesas_usbhs_call_notify_hotplug(pdev)				\
-	({								\
-		struct renesas_usbhs_driver_callback *dc;		\
-		dc = &(renesas_usbhs_get_info(pdev)->driver_callback);	\
-		if (dc && dc->notify_hotplug)				\
-			dc->notify_hotplug(pdev);			\
-	})
 #endif /* RENESAS_USB_H */
-- 
cgit v1.2.3


From df9f2c278b69fcd8b04c89612310f0036d21ec4c Mon Sep 17 00:00:00 2001
From: Yoshihiro Shimoda <yoshihiro.shimoda.uh@renesas.com>
Date: Tue, 25 Jun 2019 14:38:49 +0900
Subject: usb: renesas_usbhs: Use a specific flag instead of type for
 multi_clks

To remove the type of renesas_usbhs_driver_param in the future, this
patch uses a specific flag "multi_clks".

Signed-off-by: Yoshihiro Shimoda <yoshihiro.shimoda.uh@renesas.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/usb/renesas_usbhs/common.c | 8 +++-----
 include/linux/usb/renesas_usbhs.h  | 1 +
 2 files changed, 4 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/usb/renesas_usbhs/common.c b/drivers/usb/renesas_usbhs/common.c
index 739fe4b4c1d5..530e2eb7ab08 100644
--- a/drivers/usb/renesas_usbhs/common.c
+++ b/drivers/usb/renesas_usbhs/common.c
@@ -288,11 +288,7 @@ static void usbhsc_set_buswait(struct usbhs_priv *priv)
 
 static bool usbhsc_is_multi_clks(struct usbhs_priv *priv)
 {
-	if (priv->dparam.type == USBHS_TYPE_RCAR_GEN3 ||
-	    priv->dparam.type == USBHS_TYPE_RCAR_GEN3_WITH_PLL)
-		return true;
-
-	return false;
+	return priv->dparam.multi_clks;
 }
 
 static int usbhsc_clk_get(struct device *dev, struct usbhs_priv *priv)
@@ -544,6 +540,7 @@ static const struct usbhs_of_data rcar_gen3_data = {
 	.param = {
 		.type = USBHS_TYPE_RCAR_GEN3,
 		.has_usb_dmac = 1,
+		.multi_clks = 1,
 		.pipe_configs = usbhsc_new_pipe,
 		.pipe_size = ARRAY_SIZE(usbhsc_new_pipe),
 	}
@@ -554,6 +551,7 @@ static const struct usbhs_of_data rcar_gen3_with_pll_data = {
 	.param = {
 		.type = USBHS_TYPE_RCAR_GEN3_WITH_PLL,
 		.has_usb_dmac = 1,
+		.multi_clks = 1,
 		.pipe_configs = usbhsc_new_pipe,
 		.pipe_size = ARRAY_SIZE(usbhsc_new_pipe),
 	}
diff --git a/include/linux/usb/renesas_usbhs.h b/include/linux/usb/renesas_usbhs.h
index ac601be95ec0..e249c217cad1 100644
--- a/include/linux/usb/renesas_usbhs.h
+++ b/include/linux/usb/renesas_usbhs.h
@@ -181,6 +181,7 @@ struct renesas_usbhs_driver_param {
 	u32 has_cnen:1;
 	u32 cfifo_byte_addr:1; /* CFIFO is byte addressable */
 #define USBHS_USB_DMAC_XFER_SIZE	32	/* hardcode the xfer size */
+	u32 multi_clks:1;
 };
 
 #define USBHS_TYPE_RCAR_GEN2		1
-- 
cgit v1.2.3


From a4027b409fa98dc47418dacd3dcb5c99c5a76e4d Mon Sep 17 00:00:00 2001
From: Yoshihiro Shimoda <yoshihiro.shimoda.uh@renesas.com>
Date: Tue, 25 Jun 2019 14:38:50 +0900
Subject: usb: renesas_usbhs: Remove type member from
 renesas_usbhs_driver_param

Now that no one uses the type member, this patch removes it.

Signed-off-by: Yoshihiro Shimoda <yoshihiro.shimoda.uh@renesas.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/usb/renesas_usbhs/common.c | 5 -----
 include/linux/usb/renesas_usbhs.h  | 7 -------
 2 files changed, 12 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/usb/renesas_usbhs/common.c b/drivers/usb/renesas_usbhs/common.c
index 530e2eb7ab08..18727561fa65 100644
--- a/drivers/usb/renesas_usbhs/common.c
+++ b/drivers/usb/renesas_usbhs/common.c
@@ -528,7 +528,6 @@ int usbhsc_schedule_notify_hotplug(struct platform_device *pdev)
 static const struct usbhs_of_data rcar_gen2_data = {
 	.platform_callback = &usbhs_rcar2_ops,
 	.param = {
-		.type = USBHS_TYPE_RCAR_GEN2,
 		.has_usb_dmac = 1,
 		.pipe_configs = usbhsc_new_pipe,
 		.pipe_size = ARRAY_SIZE(usbhsc_new_pipe),
@@ -538,7 +537,6 @@ static const struct usbhs_of_data rcar_gen2_data = {
 static const struct usbhs_of_data rcar_gen3_data = {
 	.platform_callback = &usbhs_rcar3_ops,
 	.param = {
-		.type = USBHS_TYPE_RCAR_GEN3,
 		.has_usb_dmac = 1,
 		.multi_clks = 1,
 		.pipe_configs = usbhsc_new_pipe,
@@ -549,7 +547,6 @@ static const struct usbhs_of_data rcar_gen3_data = {
 static const struct usbhs_of_data rcar_gen3_with_pll_data = {
 	.platform_callback = &usbhs_rcar3_with_pll_ops,
 	.param = {
-		.type = USBHS_TYPE_RCAR_GEN3_WITH_PLL,
 		.has_usb_dmac = 1,
 		.multi_clks = 1,
 		.pipe_configs = usbhsc_new_pipe,
@@ -560,7 +557,6 @@ static const struct usbhs_of_data rcar_gen3_with_pll_data = {
 static const struct usbhs_of_data rza1_data = {
 	.platform_callback = &usbhs_rza1_ops,
 	.param = {
-		.type = USBHS_TYPE_RZA1,
 		.pipe_configs = usbhsc_new_pipe,
 		.pipe_size = ARRAY_SIZE(usbhsc_new_pipe),
 	}
@@ -569,7 +565,6 @@ static const struct usbhs_of_data rza1_data = {
 static const struct usbhs_of_data rza2_data = {
 	.platform_callback = &usbhs_rza2_ops,
 	.param = {
-		.type = USBHS_TYPE_RZA2,
 		.has_cnen = 1,
 		.cfifo_byte_addr = 1,
 		.pipe_configs = usbhsc_new_pipe,
diff --git a/include/linux/usb/renesas_usbhs.h b/include/linux/usb/renesas_usbhs.h
index e249c217cad1..fee84b7d4d2a 100644
--- a/include/linux/usb/renesas_usbhs.h
+++ b/include/linux/usb/renesas_usbhs.h
@@ -170,7 +170,6 @@ struct renesas_usbhs_driver_param {
 	 */
 	int pio_dma_border; /* default is 64byte */
 
-	uintptr_t type;
 	u32 enable_gpio;
 
 	/*
@@ -184,12 +183,6 @@ struct renesas_usbhs_driver_param {
 	u32 multi_clks:1;
 };
 
-#define USBHS_TYPE_RCAR_GEN2		1
-#define USBHS_TYPE_RCAR_GEN3		2
-#define USBHS_TYPE_RCAR_GEN3_WITH_PLL	3
-#define USBHS_TYPE_RZA1			4
-#define USBHS_TYPE_RZA2			5
-
 /*
  * option:
  *
-- 
cgit v1.2.3


From 98e86506c24932a30f50ffcfcbc98b04e3c9bc60 Mon Sep 17 00:00:00 2001
From: Yoshihiro Shimoda <yoshihiro.shimoda.uh@renesas.com>
Date: Tue, 25 Jun 2019 14:38:52 +0900
Subject: usb: renesas_usbhs: Add has_new_pipe_configs flag

In the future, each struct renesas_usbhs_driver_param will be stored in
its platform-specific source file (e.g. rcar3.c). So, to simplify the
source code, this patch adds a new flag has_new_pipe_configs.

Signed-off-by: Yoshihiro Shimoda <yoshihiro.shimoda.uh@renesas.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/usb/renesas_usbhs/common.c | 20 +++++++++-----------
 include/linux/usb/renesas_usbhs.h  |  1 +
 2 files changed, 10 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/usb/renesas_usbhs/common.c b/drivers/usb/renesas_usbhs/common.c
index 35b06e7d4eb4..f9476a07b0e9 100644
--- a/drivers/usb/renesas_usbhs/common.c
+++ b/drivers/usb/renesas_usbhs/common.c
@@ -529,8 +529,7 @@ static const struct usbhs_of_data rcar_gen2_data = {
 	.platform_callback = &usbhs_rcar2_ops,
 	.param = {
 		.has_usb_dmac = 1,
-		.pipe_configs = usbhsc_new_pipe,
-		.pipe_size = ARRAY_SIZE(usbhsc_new_pipe),
+		.has_new_pipe_configs = 1,
 	}
 };
 
@@ -539,8 +538,7 @@ static const struct usbhs_of_data rcar_gen3_data = {
 	.param = {
 		.has_usb_dmac = 1,
 		.multi_clks = 1,
-		.pipe_configs = usbhsc_new_pipe,
-		.pipe_size = ARRAY_SIZE(usbhsc_new_pipe),
+		.has_new_pipe_configs = 1,
 	}
 };
 
@@ -549,16 +547,14 @@ static const struct usbhs_of_data rcar_gen3_with_pll_data = {
 	.param = {
 		.has_usb_dmac = 1,
 		.multi_clks = 1,
-		.pipe_configs = usbhsc_new_pipe,
-		.pipe_size = ARRAY_SIZE(usbhsc_new_pipe),
+		.has_new_pipe_configs = 1,
 	}
 };
 
 static const struct usbhs_of_data rza1_data = {
 	.platform_callback = &usbhs_rza1_ops,
 	.param = {
-		.pipe_configs = usbhsc_new_pipe,
-		.pipe_size = ARRAY_SIZE(usbhsc_new_pipe),
+		.has_new_pipe_configs = 1,
 	}
 };
 
@@ -567,8 +563,7 @@ static const struct usbhs_of_data rza2_data = {
 	.param = {
 		.has_cnen = 1,
 		.cfifo_byte_addr = 1,
-		.pipe_configs = usbhsc_new_pipe,
-		.pipe_size = ARRAY_SIZE(usbhsc_new_pipe),
+		.has_new_pipe_configs = 1,
 	}
 };
 
@@ -715,7 +710,10 @@ static int usbhs_probe(struct platform_device *pdev)
 	priv->pfunc = info->platform_callback;
 
 	/* set default param if platform doesn't have */
-	if (!priv->dparam.pipe_configs) {
+	if (usbhs_get_dparam(priv, has_new_pipe_configs)) {
+		priv->dparam.pipe_configs = usbhsc_new_pipe;
+		priv->dparam.pipe_size = ARRAY_SIZE(usbhsc_new_pipe);
+	} else if (!priv->dparam.pipe_configs) {
 		priv->dparam.pipe_configs = usbhsc_default_pipe;
 		priv->dparam.pipe_size = ARRAY_SIZE(usbhsc_default_pipe);
 	}
diff --git a/include/linux/usb/renesas_usbhs.h b/include/linux/usb/renesas_usbhs.h
index fee84b7d4d2a..6914475bbc86 100644
--- a/include/linux/usb/renesas_usbhs.h
+++ b/include/linux/usb/renesas_usbhs.h
@@ -181,6 +181,7 @@ struct renesas_usbhs_driver_param {
 	u32 cfifo_byte_addr:1; /* CFIFO is byte addressable */
 #define USBHS_USB_DMAC_XFER_SIZE	32	/* hardcode the xfer size */
 	u32 multi_clks:1;
+	u32 has_new_pipe_configs:1;
 };
 
 /*
-- 
cgit v1.2.3


From bcc61569997b2188ba89db43b5b991da01ea2d18 Mon Sep 17 00:00:00 2001
From: Daniel Lezcano <daniel.lezcano@linaro.org>
Date: Tue, 25 Jun 2019 13:32:41 +0200
Subject: cpufreq: Move the IS_ENABLED(CPU_THERMAL) macro into a stub

cpufreq_online() and cpufreq_offline() [un]register the driver as
a cooling device. This is done if the driver is flagged as a cooling
device, in addition to an IS_ENABLED() check to compile out the branching
code.

Group this test in a stub function added in the cpufreq header instead
of having the IS_ENABLED() in the code.

Suggested-by: Rafael J. Wysocki <rafael@kernel.org>
Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org>
Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/cpufreq.c | 6 ++----
 include/linux/cpufreq.h   | 6 ++++++
 2 files changed, 8 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index 85ff958e01f1..aee024e42618 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -1378,8 +1378,7 @@ static int cpufreq_online(unsigned int cpu)
 	if (cpufreq_driver->ready)
 		cpufreq_driver->ready(policy);
 
-	if (IS_ENABLED(CONFIG_CPU_THERMAL) &&
-	    cpufreq_driver->flags & CPUFREQ_IS_COOLING_DEV)
+	if (cpufreq_thermal_control_enabled(cpufreq_driver))
 		policy->cdev = of_cpufreq_cooling_register(policy);
 
 	pr_debug("initialization complete\n");
@@ -1469,8 +1468,7 @@ static int cpufreq_offline(unsigned int cpu)
 		goto unlock;
 	}
 
-	if (IS_ENABLED(CONFIG_CPU_THERMAL) &&
-	    cpufreq_driver->flags & CPUFREQ_IS_COOLING_DEV) {
+	if (cpufreq_thermal_control_enabled(cpufreq_driver)) {
 		cpufreq_cooling_unregister(policy->cdev);
 		policy->cdev = NULL;
 	}
diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h
index d01a74fbc4db..a1467aa7f58b 100644
--- a/include/linux/cpufreq.h
+++ b/include/linux/cpufreq.h
@@ -409,6 +409,12 @@ int cpufreq_unregister_driver(struct cpufreq_driver *driver_data);
 const char *cpufreq_get_current_driver(void);
 void *cpufreq_get_driver_data(void);
 
+static inline int cpufreq_thermal_control_enabled(struct cpufreq_driver *drv)
+{
+	return IS_ENABLED(CONFIG_CPU_THERMAL) &&
+		(drv->flags & CPUFREQ_IS_COOLING_DEV);
+}
+
 static inline void cpufreq_verify_within_limits(struct cpufreq_policy *policy,
 		unsigned int min, unsigned int max)
 {
-- 
cgit v1.2.3


From 4a6ef8e37c4d9a40f09438068da1734fd965bd75 Mon Sep 17 00:00:00 2001
From: Nikolaus Voss <nikolaus.voss@loewensteinmedical.de>
Date: Wed, 12 Jun 2019 10:36:07 +0200
Subject: pwm: Add support referencing PWMs from ACPI

In analogy to referencing a GPIO using the "gpios" property from ACPI,
support referencing a PWM using the "pwms" property.

ACPI entries must look like
 Package () {"pwms", Package ()
     { <PWM device reference>, <PWM index>, <PWM period> [, <PWM flags>]}}

In contrast to the DT implementation, only _one_ PWM entry in the "pwms"
property is supported. As a consequence "pwm-names"-property and
con_id lookup aren't supported.

Support for ACPI is added via the firmware-node framework which is an
abstraction layer on top of ACPI/DT. To keep this patch clean, DT and
ACPI paths are kept separate. The firmware-node framework could be used
to unify both paths in a future patch.

To support leds-pwm driver, an additional method devm_fwnode_pwm_get()
which supports both ACPI and DT configuration is exported.

Signed-off-by: Nikolaus Voss <nikolaus.voss@loewensteinmedical.de>
[thierry.reding@gmail.com: fix build failures for !ACPI]
Signed-off-by: Thierry Reding <thierry.reding@gmail.com>
---
 drivers/pwm/core.c  | 122 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/pwm.h |  10 +++++
 2 files changed, 132 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/pwm/core.c b/drivers/pwm/core.c
index 60b8ccc1fd7c..c1dbb5f6ebd2 100644
--- a/drivers/pwm/core.c
+++ b/drivers/pwm/core.c
@@ -19,6 +19,7 @@
  *  the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
  */
 
+#include <linux/acpi.h>
 #include <linux/module.h>
 #include <linux/pwm.h>
 #include <linux/radix-tree.h>
@@ -750,6 +751,85 @@ put:
 }
 EXPORT_SYMBOL_GPL(of_pwm_get);
 
+#if IS_ENABLED(CONFIG_ACPI)
+static struct pwm_chip *device_to_pwmchip(struct device *dev)
+{
+	struct pwm_chip *chip;
+
+	mutex_lock(&pwm_lock);
+
+	list_for_each_entry(chip, &pwm_chips, list) {
+		struct acpi_device *adev = ACPI_COMPANION(chip->dev);
+
+		if ((chip->dev == dev) || (adev && &adev->dev == dev)) {
+			mutex_unlock(&pwm_lock);
+			return chip;
+		}
+	}
+
+	mutex_unlock(&pwm_lock);
+
+	return ERR_PTR(-EPROBE_DEFER);
+}
+#endif
+
+/**
+ * acpi_pwm_get() - request a PWM via parsing "pwms" property in ACPI
+ * @fwnode: firmware node to get the "pwms" property from
+ *
+ * Returns the PWM device parsed from the fwnode and index specified in the
+ * "pwms" property or a negative error-code on failure.
+ * Values parsed from the ACPI property are stored in the returned PWM device
+ * object.
+ *
+ * This is analogous to of_pwm_get() except con_id is not yet supported.
+ * ACPI entries must look like
+ * Package () {"pwms", Package ()
+ *     { <PWM device reference>, <PWM index>, <PWM period> [, <PWM flags>]}}
+ *
+ * Returns: A pointer to the requested PWM device or an ERR_PTR()-encoded
+ * error code on failure.
+ */
+static struct pwm_device *acpi_pwm_get(struct fwnode_handle *fwnode)
+{
+	struct pwm_device *pwm = ERR_PTR(-ENODEV);
+#if IS_ENABLED(CONFIG_ACPI)
+	struct fwnode_reference_args args;
+	struct acpi_device *acpi;
+	struct pwm_chip *chip;
+	int ret;
+
+	memset(&args, 0, sizeof(args));
+
+	ret = __acpi_node_get_property_reference(fwnode, "pwms", 0, 3, &args);
+	if (ret < 0)
+		return ERR_PTR(ret);
+
+	acpi = to_acpi_device_node(args.fwnode);
+	if (!acpi)
+		return ERR_PTR(-EINVAL);
+
+	if (args.nargs < 2)
+		return ERR_PTR(-EPROTO);
+
+	chip = device_to_pwmchip(&acpi->dev);
+	if (IS_ERR(chip))
+		return ERR_CAST(chip);
+
+	pwm = pwm_request_from_chip(chip, args.args[0], NULL);
+	if (IS_ERR(pwm))
+		return pwm;
+
+	pwm->args.period = args.args[1];
+	pwm->args.polarity = PWM_POLARITY_NORMAL;
+
+	if (args.nargs > 2 && args.args[2] & PWM_POLARITY_INVERTED)
+		pwm->args.polarity = PWM_POLARITY_INVERSED;
+#endif
+
+	return pwm;
+}
+
 /**
  * pwm_add_table() - register PWM device consumers
  * @table: array of consumers to register
@@ -814,6 +894,10 @@ struct pwm_device *pwm_get(struct device *dev, const char *con_id)
 	if (IS_ENABLED(CONFIG_OF) && dev && dev->of_node)
 		return of_pwm_get(dev, dev->of_node, con_id);
 
+	/* then lookup via ACPI */
+	if (dev && is_acpi_node(dev->fwnode))
+		return acpi_pwm_get(dev->fwnode);
+
 	/*
 	 * We look up the provider in the static table typically provided by
 	 * board setup code. We first try to lookup the consumer device by
@@ -999,6 +1083,44 @@ struct pwm_device *devm_of_pwm_get(struct device *dev, struct device_node *np,
 }
 EXPORT_SYMBOL_GPL(devm_of_pwm_get);
 
+/**
+ * devm_fwnode_pwm_get() - request a resource managed PWM from firmware node
+ * @dev: device for PWM consumer
+ * @fwnode: firmware node to get the PWM from
+ * @con_id: consumer name
+ *
+ * Returns the PWM device parsed from the firmware node. See of_pwm_get() and
+ * acpi_pwm_get() for a detailed description.
+ *
+ * Returns: A pointer to the requested PWM device or an ERR_PTR()-encoded
+ * error code on failure.
+ */
+struct pwm_device *devm_fwnode_pwm_get(struct device *dev,
+				       struct fwnode_handle *fwnode,
+				       const char *con_id)
+{
+	struct pwm_device **ptr, *pwm = ERR_PTR(-ENODEV);
+
+	ptr = devres_alloc(devm_pwm_release, sizeof(*ptr), GFP_KERNEL);
+	if (!ptr)
+		return ERR_PTR(-ENOMEM);
+
+	if (is_of_node(fwnode))
+		pwm = of_pwm_get(dev, to_of_node(fwnode), con_id);
+	else if (is_acpi_node(fwnode))
+		pwm = acpi_pwm_get(fwnode);
+
+	if (!IS_ERR(pwm)) {
+		*ptr = pwm;
+		devres_add(dev, ptr);
+	} else {
+		devres_free(ptr);
+	}
+
+	return pwm;
+}
+EXPORT_SYMBOL_GPL(devm_fwnode_pwm_get);
+
 static int devm_pwm_match(struct device *dev, void *res, void *data)
 {
 	struct pwm_device **p = res;
diff --git a/include/linux/pwm.h b/include/linux/pwm.h
index 8bf5d5f6267d..24632a7a7d11 100644
--- a/include/linux/pwm.h
+++ b/include/linux/pwm.h
@@ -412,6 +412,9 @@ void pwm_put(struct pwm_device *pwm);
 struct pwm_device *devm_pwm_get(struct device *dev, const char *con_id);
 struct pwm_device *devm_of_pwm_get(struct device *dev, struct device_node *np,
 				   const char *con_id);
+struct pwm_device *devm_fwnode_pwm_get(struct device *dev,
+				       struct fwnode_handle *fwnode,
+				       const char *con_id);
 void devm_pwm_put(struct device *dev, struct pwm_device *pwm);
 #else
 static inline struct pwm_device *pwm_request(int pwm_id, const char *label)
@@ -518,6 +521,13 @@ static inline struct pwm_device *devm_of_pwm_get(struct device *dev,
 	return ERR_PTR(-ENODEV);
 }
 
+static inline struct pwm_device *
+devm_fwnode_pwm_get(struct device *dev, struct fwnode_handle *fwnode,
+		    const char *con_id)
+{
+	return ERR_PTR(-ENODEV);
+}
+
 static inline void devm_pwm_put(struct device *dev, struct pwm_device *pwm)
 {
 }
-- 
cgit v1.2.3


From 550113d4e9f5c7b62be760fc01178c9e0139c1f4 Mon Sep 17 00:00:00 2001
From: Wolfram Sang <wsa+renesas@sang-engineering.com>
Date: Mon, 24 Jun 2019 19:04:02 +0200
Subject: i2c: add newly exported functions to the header, too

Nobody (including me) noticed that these functions were exported but not
added to the header :/

Fixes: 7159dbdae3c5 ("i2c: core: improve return value handling of i2c_new_device and i2c_new_dummy")
Signed-off-by: Wolfram Sang <wsa+renesas@sang-engineering.com>
Reviewed-by: Bartosz Golaszewski <bgolaszewski@baylibre.com>
Reviewed-by: Kieran Bingham <kieran.bingham+renesas@ideasonboard.com>
Signed-off-by: Wolfram Sang <wsa@the-dreams.de>
---
 drivers/i2c/i2c-core-base.c | 5 ++---
 include/linux/i2c.h         | 6 ++++++
 2 files changed, 8 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/i2c/i2c-core-base.c b/drivers/i2c/i2c-core-base.c
index e7d5ada40d48..f1949d1e2b54 100644
--- a/drivers/i2c/i2c-core-base.c
+++ b/drivers/i2c/i2c-core-base.c
@@ -729,7 +729,7 @@ static int i2c_dev_irq_from_resources(const struct resource *resources,
  * This returns the new i2c client, which may be saved for later use with
  * i2c_unregister_device(); or an ERR_PTR to describe the error.
  */
-static struct i2c_client *
+struct i2c_client *
 i2c_new_client_device(struct i2c_adapter *adap, struct i2c_board_info const *info)
 {
 	struct i2c_client	*client;
@@ -895,8 +895,7 @@ static struct i2c_driver dummy_driver = {
  * This returns the new i2c client, which should be saved for later use with
  * i2c_unregister_device(); or an ERR_PTR to describe the error.
  */
-static struct i2c_client *
-i2c_new_dummy_device(struct i2c_adapter *adapter, u16 address)
+struct i2c_client *i2c_new_dummy_device(struct i2c_adapter *adapter, u16 address)
 {
 	struct i2c_board_info info = {
 		I2C_BOARD_INFO("dummy", address),
diff --git a/include/linux/i2c.h b/include/linux/i2c.h
index d8f9060179d0..fa5552c2307b 100644
--- a/include/linux/i2c.h
+++ b/include/linux/i2c.h
@@ -442,6 +442,9 @@ struct i2c_board_info {
 extern struct i2c_client *
 i2c_new_device(struct i2c_adapter *adap, struct i2c_board_info const *info);
 
+extern struct i2c_client *
+i2c_new_client_device(struct i2c_adapter *adap, struct i2c_board_info const *info);
+
 /* If you don't know the exact address of an I2C device, use this variant
  * instead, which can probe for device presence in a list of possible
  * addresses. The "probe" callback function is optional. If it is provided,
@@ -463,6 +466,9 @@ extern int i2c_probe_func_quick_read(struct i2c_adapter *adap, unsigned short ad
 extern struct i2c_client *
 i2c_new_dummy(struct i2c_adapter *adap, u16 address);
 
+extern struct i2c_client *
+i2c_new_dummy_device(struct i2c_adapter *adapter, u16 address);
+
 extern struct i2c_client *
 devm_i2c_new_dummy_device(struct device *dev, struct i2c_adapter *adap, u16 address);
 
-- 
cgit v1.2.3


From 4ae4916b56435d1d5066616120f9ff907bd96b86 Mon Sep 17 00:00:00 2001
From: Aleksandr Loktionov <aleksandr.loktionov@intel.com>
Date: Tue, 28 May 2019 10:59:15 -0700
Subject: i40e: fix 'Unknown bps' in dmesg for 2.5Gb/5Gb speeds

This patch fixes 'NIC Link is Up, Unknown bps' message in dmesg
for 2.5Gb/5Gb speeds. This problem is fixed by adding constants
for VIRTCHNL_LINK_SPEED_2_5GB and VIRTCHNL_LINK_SPEED_5GB cases
in the i40e_virtchnl_link_speed() function.

Signed-off-by: Aleksandr Loktionov <aleksandr.loktionov@intel.com>
Tested-by: Andrew Bowers <andrewx.bowers@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/i40e/i40e_prototype.h | 4 ++++
 include/linux/avf/virtchnl.h                     | 4 ++++
 2 files changed, 8 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/intel/i40e/i40e_prototype.h b/drivers/net/ethernet/intel/i40e/i40e_prototype.h
index 882627073dce..eac88bcc6c06 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_prototype.h
+++ b/drivers/net/ethernet/intel/i40e/i40e_prototype.h
@@ -350,6 +350,10 @@ i40e_virtchnl_link_speed(enum i40e_aq_link_speed link_speed)
 		return VIRTCHNL_LINK_SPEED_100MB;
 	case I40E_LINK_SPEED_1GB:
 		return VIRTCHNL_LINK_SPEED_1GB;
+	case I40E_LINK_SPEED_2_5GB:
+		return VIRTCHNL_LINK_SPEED_2_5GB;
+	case I40E_LINK_SPEED_5GB:
+		return VIRTCHNL_LINK_SPEED_5GB;
 	case I40E_LINK_SPEED_10GB:
 		return VIRTCHNL_LINK_SPEED_10GB;
 	case I40E_LINK_SPEED_40GB:
diff --git a/include/linux/avf/virtchnl.h b/include/linux/avf/virtchnl.h
index 191621ff7594..ca956b672ac0 100644
--- a/include/linux/avf/virtchnl.h
+++ b/include/linux/avf/virtchnl.h
@@ -61,12 +61,14 @@ enum virtchnl_status_code {
 #define VIRTCHNL_ERR_PARAM VIRTCHNL_STATUS_ERR_PARAM
 #define VIRTCHNL_STATUS_NOT_SUPPORTED VIRTCHNL_STATUS_ERR_NOT_SUPPORTED
 
+#define VIRTCHNL_LINK_SPEED_2_5GB_SHIFT		0x0
 #define VIRTCHNL_LINK_SPEED_100MB_SHIFT		0x1
 #define VIRTCHNL_LINK_SPEED_1000MB_SHIFT	0x2
 #define VIRTCHNL_LINK_SPEED_10GB_SHIFT		0x3
 #define VIRTCHNL_LINK_SPEED_40GB_SHIFT		0x4
 #define VIRTCHNL_LINK_SPEED_20GB_SHIFT		0x5
 #define VIRTCHNL_LINK_SPEED_25GB_SHIFT		0x6
+#define VIRTCHNL_LINK_SPEED_5GB_SHIFT		0x7
 
 enum virtchnl_link_speed {
 	VIRTCHNL_LINK_SPEED_UNKNOWN	= 0,
@@ -76,6 +78,8 @@ enum virtchnl_link_speed {
 	VIRTCHNL_LINK_SPEED_40GB	= BIT(VIRTCHNL_LINK_SPEED_40GB_SHIFT),
 	VIRTCHNL_LINK_SPEED_20GB	= BIT(VIRTCHNL_LINK_SPEED_20GB_SHIFT),
 	VIRTCHNL_LINK_SPEED_25GB	= BIT(VIRTCHNL_LINK_SPEED_25GB_SHIFT),
+	VIRTCHNL_LINK_SPEED_2_5GB	= BIT(VIRTCHNL_LINK_SPEED_2_5GB_SHIFT),
+	VIRTCHNL_LINK_SPEED_5GB		= BIT(VIRTCHNL_LINK_SPEED_5GB_SHIFT),
 };
 
 /* for hsplit_0 field of Rx HMC context */
-- 
cgit v1.2.3


From 65c0f2c1663649217455a73d48b1c303f133180a Mon Sep 17 00:00:00 2001
From: Jianbo Liu <jianbol@mellanox.com>
Date: Tue, 25 Jun 2019 17:47:50 +0000
Subject: net/mlx5: Introduce vport metadata matching bits and enum constants

When a dual-port VHCA sends a RoCE packet on its non-native port, and
the packet arrives at its affiliated vport FDB, a mismatch might occur
on the rules that match the packet source vport. So we replace the
match on source port with the match on metadata that was configured in
ingress ACL, and that metadata will be passed further also to the NIC
RX table of the eswitch manager.

Introduce vport metadata matching bits and enum constants as a pre-step
towards metadata matching.
    o metadata type C registers in the misc parameters 2 fields.
    o esw_uplink_ingress_acl bit in esw cap. If it is set, the device supports
      ingress ACL for the uplink vport.
    o fdb_to_vport_reg_* bits in flow table cap and esw vport context, to
      support propagating the metadata to the nic rx through the loopback
      path.
    o flow_source in flow context, to indicate the known origin of packets.
    o enum constants, to support the above bits.

Signed-off-by: Jianbo Liu <jianbol@mellanox.com>
Reviewed-by: Eli Britstein <elibr@mellanox.com>
Reviewed-by: Roi Dayan <roid@mellanox.com>
Reviewed-by: Mark Bloch <markb@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 include/linux/mlx5/mlx5_ifc.h | 56 +++++++++++++++++++++++++++++++++++++------
 1 file changed, 49 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index e3c154b573a2..d4409654f760 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -528,7 +528,21 @@ struct mlx5_ifc_fte_match_set_misc2_bits {
 
 	struct mlx5_ifc_fte_match_mpls_bits outer_first_mpls_over_udp;
 
-	u8         reserved_at_80[0x100];
+	u8         metadata_reg_c_7[0x20];
+
+	u8         metadata_reg_c_6[0x20];
+
+	u8         metadata_reg_c_5[0x20];
+
+	u8         metadata_reg_c_4[0x20];
+
+	u8         metadata_reg_c_3[0x20];
+
+	u8         metadata_reg_c_2[0x20];
+
+	u8         metadata_reg_c_1[0x20];
+
+	u8         metadata_reg_c_0[0x20];
 
 	u8         metadata_reg_a[0x20];
 
@@ -636,8 +650,22 @@ struct mlx5_ifc_flow_table_nic_cap_bits {
 	u8         reserved_at_e00[0x7200];
 };
 
+enum {
+	MLX5_FDB_TO_VPORT_REG_C_0 = 0x01,
+	MLX5_FDB_TO_VPORT_REG_C_1 = 0x02,
+	MLX5_FDB_TO_VPORT_REG_C_2 = 0x04,
+	MLX5_FDB_TO_VPORT_REG_C_3 = 0x08,
+	MLX5_FDB_TO_VPORT_REG_C_4 = 0x10,
+	MLX5_FDB_TO_VPORT_REG_C_5 = 0x20,
+	MLX5_FDB_TO_VPORT_REG_C_6 = 0x40,
+	MLX5_FDB_TO_VPORT_REG_C_7 = 0x80,
+};
+
 struct mlx5_ifc_flow_table_eswitch_cap_bits {
-	u8      reserved_at_0[0x1a];
+	u8      fdb_to_vport_reg_c_id[0x8];
+	u8      reserved_at_8[0xf];
+	u8      flow_source[0x1];
+	u8      reserved_at_18[0x2];
 	u8      multi_fdb_encap[0x1];
 	u8      reserved_at_1b[0x1];
 	u8      fdb_multi_path_to_table[0x1];
@@ -665,7 +693,9 @@ struct mlx5_ifc_e_switch_cap_bits {
 	u8         vport_svlan_insert[0x1];
 	u8         vport_cvlan_insert_if_not_exist[0x1];
 	u8         vport_cvlan_insert_overwrite[0x1];
-	u8         reserved_at_5[0x14];
+	u8         reserved_at_5[0x3];
+	u8         esw_uplink_ingress_acl[0x1];
+	u8         reserved_at_9[0x10];
 	u8         esw_functions_changed[0x1];
 	u8         reserved_at_1a[0x1];
 	u8         ecpf_vport_exists[0x1];
@@ -2555,6 +2585,12 @@ enum {
 	MLX5_FLOW_CONTEXT_ACTION_VLAN_PUSH_2 = 0x800,
 };
 
+enum {
+	MLX5_FLOW_CONTEXT_FLOW_SOURCE_ANY_VPORT         = 0x0,
+	MLX5_FLOW_CONTEXT_FLOW_SOURCE_UPLINK            = 0x1,
+	MLX5_FLOW_CONTEXT_FLOW_SOURCE_LOCAL_VPORT       = 0x2,
+};
+
 struct mlx5_ifc_vlan_bits {
 	u8         ethtype[0x10];
 	u8         prio[0x3];
@@ -2574,7 +2610,9 @@ struct mlx5_ifc_flow_context_bits {
 	u8         action[0x10];
 
 	u8         extended_destination[0x1];
-	u8         reserved_at_80[0x7];
+	u8         reserved_at_81[0x1];
+	u8         flow_source[0x2];
+	u8         reserved_at_84[0x4];
 	u8         destination_list_size[0x18];
 
 	u8         reserved_at_a0[0x8];
@@ -3099,12 +3137,14 @@ struct mlx5_ifc_hca_vport_context_bits {
 };
 
 struct mlx5_ifc_esw_vport_context_bits {
-	u8         reserved_at_0[0x3];
+	u8         fdb_to_vport_reg_c[0x1];
+	u8         reserved_at_1[0x2];
 	u8         vport_svlan_strip[0x1];
 	u8         vport_cvlan_strip[0x1];
 	u8         vport_svlan_insert[0x1];
 	u8         vport_cvlan_insert[0x2];
-	u8         reserved_at_8[0x18];
+	u8         fdb_to_vport_reg_c_id[0x8];
+	u8         reserved_at_10[0x10];
 
 	u8         reserved_at_20[0x20];
 
@@ -4985,7 +5025,8 @@ struct mlx5_ifc_modify_esw_vport_context_out_bits {
 };
 
 struct mlx5_ifc_esw_vport_context_fields_select_bits {
-	u8         reserved_at_0[0x1c];
+	u8         reserved_at_0[0x1b];
+	u8         fdb_to_vport_reg_c_id[0x1];
 	u8         vport_cvlan_insert[0x1];
 	u8         vport_svlan_insert[0x1];
 	u8         vport_cvlan_strip[0x1];
@@ -5182,6 +5223,7 @@ enum {
 	MLX5_ACTION_IN_FIELD_OUT_DIPV4         = 0x16,
 	MLX5_ACTION_IN_FIELD_OUT_FIRST_VID     = 0x17,
 	MLX5_ACTION_IN_FIELD_OUT_IPV6_HOPLIMIT = 0x47,
+	MLX5_ACTION_IN_FIELD_METADATA_REG_C_0  = 0x51,
 };
 
 struct mlx5_ifc_alloc_modify_header_context_out_bits {
-- 
cgit v1.2.3


From bb0ee7dcc4ecd6af39823b80ae3995ddc119c373 Mon Sep 17 00:00:00 2001
From: Jianbo Liu <jianbol@mellanox.com>
Date: Tue, 25 Jun 2019 17:47:58 +0000
Subject: net/mlx5: Add flow context for flow tag

Refactor the flow data structures, add new flow_context and move
flow_tag into it, as flow_tag doesn't belong to the rule action.

Signed-off-by: Jianbo Liu <jianbol@mellanox.com>
Reviewed-by: Mark Bloch <markb@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/infiniband/hw/mlx5/flow.c                  | 13 +++++----
 drivers/infiniband/hw/mlx5/main.c                  | 30 ++++++++++++-------
 drivers/infiniband/hw/mlx5/mlx5_ib.h               |  1 +
 .../mellanox/mlx5/core/diag/fs_tracepoint.h        |  2 +-
 .../ethernet/mellanox/mlx5/core/en_fs_ethtool.c    |  2 +-
 drivers/net/ethernet/mellanox/mlx5/core/en_tc.c    |  7 +++--
 .../net/ethernet/mellanox/mlx5/core/fpga/ipsec.c   |  8 +++--
 drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c   |  3 +-
 drivers/net/ethernet/mellanox/mlx5/core/fs_core.c  | 34 +++++++++++-----------
 drivers/net/ethernet/mellanox/mlx5/core/fs_core.h  |  1 +
 include/linux/mlx5/fs.h                            | 15 +++++++---
 11 files changed, 71 insertions(+), 45 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/infiniband/hw/mlx5/flow.c b/drivers/infiniband/hw/mlx5/flow.c
index 1fc302d41a53..b8841355fcd5 100644
--- a/drivers/infiniband/hw/mlx5/flow.c
+++ b/drivers/infiniband/hw/mlx5/flow.c
@@ -65,11 +65,12 @@ static const struct uverbs_attr_spec mlx5_ib_flow_type[] = {
 static int UVERBS_HANDLER(MLX5_IB_METHOD_CREATE_FLOW)(
 	struct uverbs_attr_bundle *attrs)
 {
-	struct mlx5_flow_act flow_act = {.flow_tag = MLX5_FS_DEFAULT_FLOW_TAG};
+	struct mlx5_flow_context flow_context = {.flow_tag = MLX5_FS_DEFAULT_FLOW_TAG};
 	struct mlx5_ib_flow_handler *flow_handler;
 	struct mlx5_ib_flow_matcher *fs_matcher;
 	struct ib_uobject **arr_flow_actions;
 	struct ib_uflow_resources *uflow_res;
+	struct mlx5_flow_act flow_act = {};
 	void *devx_obj;
 	int dest_id, dest_type;
 	void *cmd_in;
@@ -172,17 +173,19 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_CREATE_FLOW)(
 				   arr_flow_actions[i]->object);
 	}
 
-	ret = uverbs_copy_from(&flow_act.flow_tag, attrs,
+	ret = uverbs_copy_from(&flow_context.flow_tag, attrs,
 			       MLX5_IB_ATTR_CREATE_FLOW_TAG);
 	if (!ret) {
-		if (flow_act.flow_tag >= BIT(24)) {
+		if (flow_context.flow_tag >= BIT(24)) {
 			ret = -EINVAL;
 			goto err_out;
 		}
-		flow_act.flags |= FLOW_ACT_HAS_TAG;
+		flow_context.flags |= FLOW_CONTEXT_HAS_TAG;
 	}
 
-	flow_handler = mlx5_ib_raw_fs_rule_add(dev, fs_matcher, &flow_act,
+	flow_handler = mlx5_ib_raw_fs_rule_add(dev, fs_matcher,
+					       &flow_context,
+					       &flow_act,
 					       counter_id,
 					       cmd_in, inlen,
 					       dest_id, dest_type);
diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c
index abac70ad5c7c..be4c9a687df7 100644
--- a/drivers/infiniband/hw/mlx5/main.c
+++ b/drivers/infiniband/hw/mlx5/main.c
@@ -2666,11 +2666,15 @@ int parse_flow_flow_action(struct mlx5_ib_flow_action *maction,
 	}
 }
 
-static int parse_flow_attr(struct mlx5_core_dev *mdev, u32 *match_c,
-			   u32 *match_v, const union ib_flow_spec *ib_spec,
+static int parse_flow_attr(struct mlx5_core_dev *mdev,
+			   struct mlx5_flow_spec *spec,
+			   const union ib_flow_spec *ib_spec,
 			   const struct ib_flow_attr *flow_attr,
 			   struct mlx5_flow_act *action, u32 prev_type)
 {
+	struct mlx5_flow_context *flow_context = &spec->flow_context;
+	u32 *match_c = spec->match_criteria;
+	u32 *match_v = spec->match_value;
 	void *misc_params_c = MLX5_ADDR_OF(fte_match_param, match_c,
 					   misc_parameters);
 	void *misc_params_v = MLX5_ADDR_OF(fte_match_param, match_v,
@@ -2989,8 +2993,8 @@ static int parse_flow_attr(struct mlx5_core_dev *mdev, u32 *match_c,
 		if (ib_spec->flow_tag.tag_id >= BIT(24))
 			return -EINVAL;
 
-		action->flow_tag = ib_spec->flow_tag.tag_id;
-		action->flags |= FLOW_ACT_HAS_TAG;
+		flow_context->flow_tag = ib_spec->flow_tag.tag_id;
+		flow_context->flags |= FLOW_CONTEXT_HAS_TAG;
 		break;
 	case IB_FLOW_SPEC_ACTION_DROP:
 		if (FIELDS_NOT_SUPPORTED(ib_spec->drop,
@@ -3084,7 +3088,8 @@ is_valid_esp_aes_gcm(struct mlx5_core_dev *mdev,
 		return VALID_SPEC_NA;
 
 	return is_crypto && is_ipsec &&
-		(!egress || (!is_drop && !(flow_act->flags & FLOW_ACT_HAS_TAG))) ?
+		(!egress || (!is_drop &&
+			     !(spec->flow_context.flags & FLOW_CONTEXT_HAS_TAG))) ?
 		VALID_SPEC_VALID : VALID_SPEC_INVALID;
 }
 
@@ -3473,7 +3478,7 @@ static struct mlx5_ib_flow_handler *_create_flow_rule(struct mlx5_ib_dev *dev,
 {
 	struct mlx5_flow_table	*ft = ft_prio->flow_table;
 	struct mlx5_ib_flow_handler *handler;
-	struct mlx5_flow_act flow_act = {.flow_tag = MLX5_FS_DEFAULT_FLOW_TAG};
+	struct mlx5_flow_act flow_act = {};
 	struct mlx5_flow_spec *spec;
 	struct mlx5_flow_destination dest_arr[2] = {};
 	struct mlx5_flow_destination *rule_dst = dest_arr;
@@ -3504,8 +3509,7 @@ static struct mlx5_ib_flow_handler *_create_flow_rule(struct mlx5_ib_dev *dev,
 	}
 
 	for (spec_index = 0; spec_index < flow_attr->num_of_specs; spec_index++) {
-		err = parse_flow_attr(dev->mdev, spec->match_criteria,
-				      spec->match_value,
+		err = parse_flow_attr(dev->mdev, spec,
 				      ib_flow, flow_attr, &flow_act,
 				      prev_type);
 		if (err < 0)
@@ -3572,11 +3576,11 @@ static struct mlx5_ib_flow_handler *_create_flow_rule(struct mlx5_ib_dev *dev,
 					MLX5_FLOW_CONTEXT_ACTION_FWD_NEXT_PRIO;
 	}
 
-	if ((flow_act.flags & FLOW_ACT_HAS_TAG)  &&
+	if ((spec->flow_context.flags & FLOW_CONTEXT_HAS_TAG)  &&
 	    (flow_attr->type == IB_FLOW_ATTR_ALL_DEFAULT ||
 	     flow_attr->type == IB_FLOW_ATTR_MC_DEFAULT)) {
 		mlx5_ib_warn(dev, "Flow tag %u and attribute type %x isn't allowed in leftovers\n",
-			     flow_act.flow_tag, flow_attr->type);
+			     spec->flow_context.flow_tag, flow_attr->type);
 		err = -EINVAL;
 		goto free;
 	}
@@ -3947,6 +3951,7 @@ _create_raw_flow_rule(struct mlx5_ib_dev *dev,
 		      struct mlx5_ib_flow_prio *ft_prio,
 		      struct mlx5_flow_destination *dst,
 		      struct mlx5_ib_flow_matcher  *fs_matcher,
+		      struct mlx5_flow_context *flow_context,
 		      struct mlx5_flow_act *flow_act,
 		      void *cmd_in, int inlen,
 		      int dst_num)
@@ -3969,6 +3974,7 @@ _create_raw_flow_rule(struct mlx5_ib_dev *dev,
 	memcpy(spec->match_criteria, fs_matcher->matcher_mask.match_params,
 	       fs_matcher->mask_len);
 	spec->match_criteria_enable = fs_matcher->match_criteria_enable;
+	spec->flow_context = *flow_context;
 
 	handler->rule = mlx5_add_flow_rules(ft, spec,
 					    flow_act, dst, dst_num);
@@ -4033,6 +4039,7 @@ static bool raw_fs_is_multicast(struct mlx5_ib_flow_matcher *fs_matcher,
 struct mlx5_ib_flow_handler *
 mlx5_ib_raw_fs_rule_add(struct mlx5_ib_dev *dev,
 			struct mlx5_ib_flow_matcher *fs_matcher,
+			struct mlx5_flow_context *flow_context,
 			struct mlx5_flow_act *flow_act,
 			u32 counter_id,
 			void *cmd_in, int inlen, int dest_id,
@@ -4085,7 +4092,8 @@ mlx5_ib_raw_fs_rule_add(struct mlx5_ib_dev *dev,
 		dst_num++;
 	}
 
-	handler = _create_raw_flow_rule(dev, ft_prio, dst, fs_matcher, flow_act,
+	handler = _create_raw_flow_rule(dev, ft_prio, dst, fs_matcher,
+					flow_context, flow_act,
 					cmd_in, inlen, dst_num);
 
 	if (IS_ERR(handler)) {
diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h
index a043af7ee366..1c205c2bd486 100644
--- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
+++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
@@ -1317,6 +1317,7 @@ extern const struct uapi_definition mlx5_ib_devx_defs[];
 extern const struct uapi_definition mlx5_ib_flow_defs[];
 struct mlx5_ib_flow_handler *mlx5_ib_raw_fs_rule_add(
 	struct mlx5_ib_dev *dev, struct mlx5_ib_flow_matcher *fs_matcher,
+	struct mlx5_flow_context *flow_context,
 	struct mlx5_flow_act *flow_act, u32 counter_id,
 	void *cmd_in, int inlen, int dest_id, int dest_type);
 bool mlx5_ib_devx_is_flow_dest(void *obj, int *dest_id, int *dest_type);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/diag/fs_tracepoint.h b/drivers/net/ethernet/mellanox/mlx5/core/diag/fs_tracepoint.h
index a4cf123e3f17..9ec46edf22a6 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/diag/fs_tracepoint.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/diag/fs_tracepoint.h
@@ -204,7 +204,7 @@ TRACE_EVENT(mlx5_fs_set_fte,
 			   __entry->index = fte->index;
 			   __entry->action = fte->action.action;
 			   __entry->mask_enable = __entry->fg->mask.match_criteria_enable;
-			   __entry->flow_tag = fte->action.flow_tag;
+			   __entry->flow_tag = fte->flow_context.flow_tag;
 			   memcpy(__entry->mask_outer,
 				  MLX5_ADDR_OF(fte_match_param,
 					       &__entry->fg->mask.match_criteria,
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_fs_ethtool.c b/drivers/net/ethernet/mellanox/mlx5/core/en_fs_ethtool.c
index 4421c10f58ae..839662644ed3 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_fs_ethtool.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_fs_ethtool.c
@@ -426,7 +426,7 @@ add_ethtool_flow_rule(struct mlx5e_priv *priv,
 	}
 
 	spec->match_criteria_enable = (!outer_header_zero(spec->match_criteria));
-	flow_act.flow_tag = MLX5_FS_DEFAULT_FLOW_TAG;
+	spec->flow_context.flow_tag = MLX5_FS_DEFAULT_FLOW_TAG;
 	rule = mlx5_add_flow_rules(ft, spec, &flow_act, dst, dst ? 1 : 0);
 	if (IS_ERR(rule)) {
 		err = PTR_ERR(rule);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
index 122f457091a2..8ff1ca46d8d3 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
@@ -716,19 +716,22 @@ mlx5e_tc_add_nic_flow(struct mlx5e_priv *priv,
 		      struct mlx5e_tc_flow *flow,
 		      struct netlink_ext_ack *extack)
 {
+	struct mlx5_flow_context *flow_context = &parse_attr->spec.flow_context;
 	struct mlx5_nic_flow_attr *attr = flow->nic_attr;
 	struct mlx5_core_dev *dev = priv->mdev;
 	struct mlx5_flow_destination dest[2] = {};
 	struct mlx5_flow_act flow_act = {
 		.action = attr->action,
-		.flow_tag = attr->flow_tag,
 		.reformat_id = 0,
-		.flags    = FLOW_ACT_HAS_TAG | FLOW_ACT_NO_APPEND,
+		.flags    = FLOW_ACT_NO_APPEND,
 	};
 	struct mlx5_fc *counter = NULL;
 	bool table_created = false;
 	int err, dest_ix = 0;
 
+	flow_context->flags |= FLOW_CONTEXT_HAS_TAG;
+	flow_context->flow_tag = attr->flow_tag;
+
 	if (flow->flags & MLX5E_TC_FLOW_HAIRPIN) {
 		err = mlx5e_hairpin_flow_add(priv, flow, parse_attr, extack);
 		if (err) {
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fpga/ipsec.c b/drivers/net/ethernet/mellanox/mlx5/core/fpga/ipsec.c
index 52c47d3dd5a5..c76da309506b 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fpga/ipsec.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fpga/ipsec.c
@@ -636,7 +636,8 @@ static bool mlx5_is_fpga_egress_ipsec_rule(struct mlx5_core_dev *dev,
 					   u8 match_criteria_enable,
 					   const u32 *match_c,
 					   const u32 *match_v,
-					   struct mlx5_flow_act *flow_act)
+					   struct mlx5_flow_act *flow_act,
+					   struct mlx5_flow_context *flow_context)
 {
 	const void *outer_c = MLX5_ADDR_OF(fte_match_param, match_c,
 					   outer_headers);
@@ -655,7 +656,7 @@ static bool mlx5_is_fpga_egress_ipsec_rule(struct mlx5_core_dev *dev,
 	    (match_criteria_enable &
 	     ~(MLX5_MATCH_OUTER_HEADERS | MLX5_MATCH_MISC_PARAMETERS)) ||
 	    (flow_act->action & ~(MLX5_FLOW_CONTEXT_ACTION_ENCRYPT | MLX5_FLOW_CONTEXT_ACTION_ALLOW)) ||
-	     (flow_act->flags & FLOW_ACT_HAS_TAG))
+	     (flow_context->flags & FLOW_CONTEXT_HAS_TAG))
 		return false;
 
 	return true;
@@ -767,7 +768,8 @@ mlx5_fpga_ipsec_fs_create_sa_ctx(struct mlx5_core_dev *mdev,
 					    fg->mask.match_criteria_enable,
 					    fg->mask.match_criteria,
 					    fte->val,
-					    &fte->action))
+					    &fte->action,
+					    &fte->flow_context))
 		return ERR_PTR(-EINVAL);
 	else if (!mlx5_is_fpga_ipsec_rule(mdev,
 					  fg->mask.match_criteria_enable,
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c
index 4f1d402926f1..fb1335a433ae 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c
@@ -396,7 +396,8 @@ static int mlx5_cmd_set_fte(struct mlx5_core_dev *dev,
 	in_flow_context = MLX5_ADDR_OF(set_fte_in, in, flow_context);
 	MLX5_SET(flow_context, in_flow_context, group_id, group_id);
 
-	MLX5_SET(flow_context, in_flow_context, flow_tag, fte->action.flow_tag);
+	MLX5_SET(flow_context, in_flow_context, flow_tag,
+		 fte->flow_context.flow_tag);
 	MLX5_SET(flow_context, in_flow_context, extended_destination,
 		 extended_dest);
 	if (extended_dest) {
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
index fb5b61727ee7..9f5544ac6b8a 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
@@ -584,7 +584,7 @@ err_ida_remove:
 }
 
 static struct fs_fte *alloc_fte(struct mlx5_flow_table *ft,
-				u32 *match_value,
+				struct mlx5_flow_spec *spec,
 				struct mlx5_flow_act *flow_act)
 {
 	struct mlx5_flow_steering *steering = get_steering(&ft->node);
@@ -594,9 +594,10 @@ static struct fs_fte *alloc_fte(struct mlx5_flow_table *ft,
 	if (!fte)
 		return ERR_PTR(-ENOMEM);
 
-	memcpy(fte->val, match_value, sizeof(fte->val));
+	memcpy(fte->val, &spec->match_value, sizeof(fte->val));
 	fte->node.type =  FS_TYPE_FLOW_ENTRY;
 	fte->action = *flow_act;
+	fte->flow_context = spec->flow_context;
 
 	tree_init_node(&fte->node, NULL, del_sw_fte);
 
@@ -1428,7 +1429,9 @@ static bool check_conflicting_actions(u32 action1, u32 action2)
 	return false;
 }
 
-static int check_conflicting_ftes(struct fs_fte *fte, const struct mlx5_flow_act *flow_act)
+static int check_conflicting_ftes(struct fs_fte *fte,
+				  const struct mlx5_flow_context *flow_context,
+				  const struct mlx5_flow_act *flow_act)
 {
 	if (check_conflicting_actions(flow_act->action, fte->action.action)) {
 		mlx5_core_warn(get_dev(&fte->node),
@@ -1436,12 +1439,12 @@ static int check_conflicting_ftes(struct fs_fte *fte, const struct mlx5_flow_act
 		return -EEXIST;
 	}
 
-	if ((flow_act->flags & FLOW_ACT_HAS_TAG) &&
-	    fte->action.flow_tag != flow_act->flow_tag) {
+	if ((flow_context->flags & FLOW_CONTEXT_HAS_TAG) &&
+	    fte->flow_context.flow_tag != flow_context->flow_tag) {
 		mlx5_core_warn(get_dev(&fte->node),
 			       "FTE flow tag %u already exists with different flow tag %u\n",
-			       fte->action.flow_tag,
-			       flow_act->flow_tag);
+			       fte->flow_context.flow_tag,
+			       flow_context->flow_tag);
 		return -EEXIST;
 	}
 
@@ -1449,7 +1452,7 @@ static int check_conflicting_ftes(struct fs_fte *fte, const struct mlx5_flow_act
 }
 
 static struct mlx5_flow_handle *add_rule_fg(struct mlx5_flow_group *fg,
-					    u32 *match_value,
+					    struct mlx5_flow_spec *spec,
 					    struct mlx5_flow_act *flow_act,
 					    struct mlx5_flow_destination *dest,
 					    int dest_num,
@@ -1460,7 +1463,7 @@ static struct mlx5_flow_handle *add_rule_fg(struct mlx5_flow_group *fg,
 	int i;
 	int ret;
 
-	ret = check_conflicting_ftes(fte, flow_act);
+	ret = check_conflicting_ftes(fte, &spec->flow_context, flow_act);
 	if (ret)
 		return ERR_PTR(ret);
 
@@ -1635,7 +1638,7 @@ try_add_to_existing_fg(struct mlx5_flow_table *ft,
 	u64  version;
 	int err;
 
-	fte = alloc_fte(ft, spec->match_value, flow_act);
+	fte = alloc_fte(ft, spec, flow_act);
 	if (IS_ERR(fte))
 		return  ERR_PTR(-ENOMEM);
 
@@ -1651,8 +1654,7 @@ search_again_locked:
 		fte_tmp = lookup_fte_locked(g, spec->match_value, take_write);
 		if (!fte_tmp)
 			continue;
-		rule = add_rule_fg(g, spec->match_value,
-				   flow_act, dest, dest_num, fte_tmp);
+		rule = add_rule_fg(g, spec, flow_act, dest, dest_num, fte_tmp);
 		up_write_ref_node(&fte_tmp->node, false);
 		tree_put_node(&fte_tmp->node, false);
 		kmem_cache_free(steering->ftes_cache, fte);
@@ -1699,8 +1701,7 @@ skip_search:
 
 		nested_down_write_ref_node(&fte->node, FS_LOCK_CHILD);
 		up_write_ref_node(&g->node, false);
-		rule = add_rule_fg(g, spec->match_value,
-				   flow_act, dest, dest_num, fte);
+		rule = add_rule_fg(g, spec, flow_act, dest, dest_num, fte);
 		up_write_ref_node(&fte->node, false);
 		tree_put_node(&fte->node, false);
 		return rule;
@@ -1786,7 +1787,7 @@ search_again_locked:
 	if (err)
 		goto err_release_fg;
 
-	fte = alloc_fte(ft, spec->match_value, flow_act);
+	fte = alloc_fte(ft, spec, flow_act);
 	if (IS_ERR(fte)) {
 		err = PTR_ERR(fte);
 		goto err_release_fg;
@@ -1800,8 +1801,7 @@ search_again_locked:
 
 	nested_down_write_ref_node(&fte->node, FS_LOCK_CHILD);
 	up_write_ref_node(&g->node, false);
-	rule = add_rule_fg(g, spec->match_value, flow_act, dest,
-			   dest_num, fte);
+	rule = add_rule_fg(g, spec, flow_act, dest, dest_num, fte);
 	up_write_ref_node(&fte->node, false);
 	tree_put_node(&fte->node, false);
 	tree_put_node(&g->node, false);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h
index a08c3d09a50f..c48c382f926f 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h
@@ -170,6 +170,7 @@ struct fs_fte {
 	u32				val[MLX5_ST_SZ_DW_MATCH_PARAM];
 	u32				dests_size;
 	u32				index;
+	struct mlx5_flow_context	flow_context;
 	struct mlx5_flow_act		action;
 	enum fs_fte_status		status;
 	struct mlx5_fc			*counter;
diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h
index 2ddaa97f2179..9bf49ce218fa 100644
--- a/include/linux/mlx5/fs.h
+++ b/include/linux/mlx5/fs.h
@@ -88,10 +88,20 @@ struct mlx5_flow_group;
 struct mlx5_flow_namespace;
 struct mlx5_flow_handle;
 
+enum {
+	FLOW_CONTEXT_HAS_TAG = BIT(0),
+};
+
+struct mlx5_flow_context {
+	u32 flags;
+	u32 flow_tag;
+};
+
 struct mlx5_flow_spec {
 	u8   match_criteria_enable;
 	u32  match_criteria[MLX5_ST_SZ_DW(fte_match_param)];
 	u32  match_value[MLX5_ST_SZ_DW(fte_match_param)];
+	struct mlx5_flow_context flow_context;
 };
 
 enum {
@@ -173,13 +183,11 @@ struct mlx5_fs_vlan {
 #define MLX5_FS_VLAN_DEPTH	2
 
 enum {
-	FLOW_ACT_HAS_TAG   = BIT(0),
-	FLOW_ACT_NO_APPEND = BIT(1),
+	FLOW_ACT_NO_APPEND = BIT(0),
 };
 
 struct mlx5_flow_act {
 	u32 action;
-	u32 flow_tag;
 	u32 reformat_id;
 	u32 modify_id;
 	uintptr_t esp_id;
@@ -190,7 +198,6 @@ struct mlx5_flow_act {
 
 #define MLX5_DECLARE_FLOW_ACT(name) \
 	struct mlx5_flow_act name = { .action = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST,\
-				      .flow_tag = MLX5_FS_DEFAULT_FLOW_TAG, \
 				      .reformat_id = 0, \
 				      .modify_id = 0, \
 				      .flags =  0, }
-- 
cgit v1.2.3


From 7445cfb1169cebf8f79763acf65f85d850850461 Mon Sep 17 00:00:00 2001
From: Jianbo Liu <jianbol@mellanox.com>
Date: Tue, 25 Jun 2019 17:48:00 +0000
Subject: net/mlx5: E-Switch, Tag packet with vport number in VF vports and
 uplink ingress ACLs

When a dual-port VHCA sends a RoCE packet on its non-native port, and the
packet arrives at its affiliated vport FDB, a mismatch might occur on the
rules that match the packet source vport, as it is not represented by a
single VHCA only in this case. So we change to match on metadata instead
of source vport.
To do that, a rule is created in all vports and uplink ingress ACLs, to
save the source vport number and vhca id in the packet's metadata in order
to match on it later.
The metadata register used is the first of the 32-bit type C registers. It
can be used for matching and header modify operations. The upper 16 bits
of this register hold the vhca id, and the lower 16 bits hold the vport
number.
This change is not for dual-port RoCE only. If HW and FW allow, the vport
metadata matching is enabled by default.

Signed-off-by: Jianbo Liu <jianbol@mellanox.com>
Reviewed-by: Eli Britstein <elibr@mellanox.com>
Reviewed-by: Roi Dayan <roid@mellanox.com>
Reviewed-by: Mark Bloch <markb@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/eswitch.c  |   2 +
 drivers/net/ethernet/mellanox/mlx5/core/eswitch.h  |   9 ++
 .../ethernet/mellanox/mlx5/core/eswitch_offloads.c | 180 ++++++++++++++++-----
 include/linux/mlx5/eswitch.h                       |  17 ++
 4 files changed, 172 insertions(+), 36 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
index a42a23e505df..1235fd84ae3a 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
@@ -1168,6 +1168,8 @@ void esw_vport_cleanup_ingress_rules(struct mlx5_eswitch *esw,
 
 	vport->ingress.drop_rule = NULL;
 	vport->ingress.allow_rule = NULL;
+
+	esw_vport_del_ingress_acl_modify_metadata(esw, vport);
 }
 
 void esw_vport_disable_ingress_acl(struct mlx5_eswitch *esw,
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
index 99dc25630629..51e71b824abf 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
@@ -68,6 +68,8 @@ struct vport_ingress {
 	struct mlx5_flow_group *allow_spoofchk_only_grp;
 	struct mlx5_flow_group *allow_untagged_only_grp;
 	struct mlx5_flow_group *drop_grp;
+	int modify_metadata_id;
+	struct mlx5_flow_handle  *modify_metadata_rule;
 	struct mlx5_flow_handle  *allow_rule;
 	struct mlx5_flow_handle  *drop_rule;
 	struct mlx5_fc           *drop_counter;
@@ -196,6 +198,10 @@ struct mlx5_esw_functions {
 	u16			num_vfs;
 };
 
+enum {
+	MLX5_ESWITCH_VPORT_MATCH_METADATA = BIT(0),
+};
+
 struct mlx5_eswitch {
 	struct mlx5_core_dev    *dev;
 	struct mlx5_nb          nb;
@@ -203,6 +209,7 @@ struct mlx5_eswitch {
 	struct hlist_head       mc_table[MLX5_L2_ADDR_HASH_SIZE];
 	struct workqueue_struct *work_queue;
 	struct mlx5_vport       *vports;
+	u32 flags;
 	int                     total_vports;
 	int                     enabled_vports;
 	/* Synchronize between vport change events
@@ -240,6 +247,8 @@ void esw_vport_disable_egress_acl(struct mlx5_eswitch *esw,
 				  struct mlx5_vport *vport);
 void esw_vport_disable_ingress_acl(struct mlx5_eswitch *esw,
 				   struct mlx5_vport *vport);
+void esw_vport_del_ingress_acl_modify_metadata(struct mlx5_eswitch *esw,
+					       struct mlx5_vport *vport);
 
 /* E-Switch API */
 int mlx5_eswitch_init(struct mlx5_core_dev *dev);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
index c1c42c1370b8..4bcbc872cd08 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
@@ -1555,32 +1555,16 @@ static void esw_offloads_devcom_cleanup(struct mlx5_eswitch *esw)
 static int esw_vport_ingress_prio_tag_config(struct mlx5_eswitch *esw,
 					     struct mlx5_vport *vport)
 {
-	struct mlx5_core_dev *dev = esw->dev;
 	struct mlx5_flow_act flow_act = {0};
 	struct mlx5_flow_spec *spec;
 	int err = 0;
 
 	/* For prio tag mode, there is only 1 FTEs:
-	 * 1) Untagged packets - push prio tag VLAN, allow
+	 * 1) Untagged packets - push prio tag VLAN and modify metadata if
+	 * required, allow
 	 * Unmatched traffic is allowed by default
 	 */
 
-	if (!MLX5_CAP_ESW_INGRESS_ACL(dev, ft_support))
-		return -EOPNOTSUPP;
-
-	esw_vport_cleanup_ingress_rules(esw, vport);
-
-	err = esw_vport_enable_ingress_acl(esw, vport);
-	if (err) {
-		mlx5_core_warn(esw->dev,
-			       "failed to enable prio tag ingress acl (%d) on vport[%d]\n",
-			       err, vport->vport);
-		return err;
-	}
-
-	esw_debug(esw->dev,
-		  "vport[%d] configure ingress rules\n", vport->vport);
-
 	spec = kvzalloc(sizeof(*spec), GFP_KERNEL);
 	if (!spec) {
 		err = -ENOMEM;
@@ -1596,6 +1580,12 @@ static int esw_vport_ingress_prio_tag_config(struct mlx5_eswitch *esw,
 	flow_act.vlan[0].ethtype = ETH_P_8021Q;
 	flow_act.vlan[0].vid = 0;
 	flow_act.vlan[0].prio = 0;
+
+	if (vport->ingress.modify_metadata_rule) {
+		flow_act.action |= MLX5_FLOW_CONTEXT_ACTION_MOD_HDR;
+		flow_act.modify_id = vport->ingress.modify_metadata_id;
+	}
+
 	vport->ingress.allow_rule =
 		mlx5_add_flow_rules(vport->ingress.acl, spec,
 				    &flow_act, NULL, 0);
@@ -1616,6 +1606,58 @@ out_no_mem:
 	return err;
 }
 
+static int esw_vport_add_ingress_acl_modify_metadata(struct mlx5_eswitch *esw,
+						     struct mlx5_vport *vport)
+{
+	u8 action[MLX5_UN_SZ_BYTES(set_action_in_add_action_in_auto)] = {};
+	struct mlx5_flow_act flow_act = {};
+	struct mlx5_flow_spec spec = {};
+	int err = 0;
+
+	MLX5_SET(set_action_in, action, action_type, MLX5_ACTION_TYPE_SET);
+	MLX5_SET(set_action_in, action, field, MLX5_ACTION_IN_FIELD_METADATA_REG_C_0);
+	MLX5_SET(set_action_in, action, data,
+		 mlx5_eswitch_get_vport_metadata_for_match(esw, vport->vport));
+
+	err = mlx5_modify_header_alloc(esw->dev, MLX5_FLOW_NAMESPACE_ESW_INGRESS,
+				       1, action, &vport->ingress.modify_metadata_id);
+	if (err) {
+		esw_warn(esw->dev,
+			 "failed to alloc modify header for vport %d ingress acl (%d)\n",
+			 vport->vport, err);
+		return err;
+	}
+
+	flow_act.action = MLX5_FLOW_CONTEXT_ACTION_MOD_HDR | MLX5_FLOW_CONTEXT_ACTION_ALLOW;
+	flow_act.modify_id = vport->ingress.modify_metadata_id;
+	vport->ingress.modify_metadata_rule = mlx5_add_flow_rules(vport->ingress.acl,
+								  &spec, &flow_act, NULL, 0);
+	if (IS_ERR(vport->ingress.modify_metadata_rule)) {
+		err = PTR_ERR(vport->ingress.modify_metadata_rule);
+		esw_warn(esw->dev,
+			 "failed to add setting metadata rule for vport %d ingress acl, err(%d)\n",
+			 vport->vport, err);
+		vport->ingress.modify_metadata_rule = NULL;
+		goto out;
+	}
+
+out:
+	if (err)
+		mlx5_modify_header_dealloc(esw->dev, vport->ingress.modify_metadata_id);
+	return err;
+}
+
+void esw_vport_del_ingress_acl_modify_metadata(struct mlx5_eswitch *esw,
+					       struct mlx5_vport *vport)
+{
+	if (vport->ingress.modify_metadata_rule) {
+		mlx5_del_flow_rules(vport->ingress.modify_metadata_rule);
+		mlx5_modify_header_dealloc(esw->dev, vport->ingress.modify_metadata_id);
+
+		vport->ingress.modify_metadata_rule = NULL;
+	}
+}
+
 static int esw_vport_egress_prio_tag_config(struct mlx5_eswitch *esw,
 					    struct mlx5_vport *vport)
 {
@@ -1623,6 +1665,9 @@ static int esw_vport_egress_prio_tag_config(struct mlx5_eswitch *esw,
 	struct mlx5_flow_spec *spec;
 	int err = 0;
 
+	if (!MLX5_CAP_GEN(esw->dev, prio_tag_required))
+		return 0;
+
 	/* For prio tag mode, there is only 1 FTEs:
 	 * 1) prio tag packets - pop the prio tag VLAN, allow
 	 * Unmatched traffic is allowed by default
@@ -1676,27 +1721,75 @@ out_no_mem:
 	return err;
 }
 
-static int esw_prio_tag_acls_config(struct mlx5_eswitch *esw, int nvports)
+static int esw_vport_ingress_common_config(struct mlx5_eswitch *esw,
+					   struct mlx5_vport *vport)
 {
-	struct mlx5_vport *vport = NULL;
-	int i, j;
 	int err;
 
-	mlx5_esw_for_each_vf_vport(esw, i, vport, nvports) {
+	if (!mlx5_eswitch_vport_match_metadata_enabled(esw) &&
+	    !MLX5_CAP_GEN(esw->dev, prio_tag_required))
+		return 0;
+
+	esw_vport_cleanup_ingress_rules(esw, vport);
+
+	err = esw_vport_enable_ingress_acl(esw, vport);
+	if (err) {
+		esw_warn(esw->dev,
+			 "failed to enable ingress acl (%d) on vport[%d]\n",
+			 err, vport->vport);
+		return err;
+	}
+
+	esw_debug(esw->dev,
+		  "vport[%d] configure ingress rules\n", vport->vport);
+
+	if (mlx5_eswitch_vport_match_metadata_enabled(esw)) {
+		err = esw_vport_add_ingress_acl_modify_metadata(esw, vport);
+		if (err)
+			goto out;
+	}
+
+	if (MLX5_CAP_GEN(esw->dev, prio_tag_required) &&
+	    mlx5_eswitch_is_vf_vport(esw, vport->vport)) {
 		err = esw_vport_ingress_prio_tag_config(esw, vport);
 		if (err)
-			goto err_ingress;
-		err = esw_vport_egress_prio_tag_config(esw, vport);
+			goto out;
+	}
+
+out:
+	if (err)
+		esw_vport_disable_ingress_acl(esw, vport);
+	return err;
+}
+
+static int esw_create_offloads_acl_tables(struct mlx5_eswitch *esw)
+{
+	struct mlx5_vport *vport;
+	int i, j;
+	int err;
+
+	mlx5_esw_for_all_vports(esw, i, vport) {
+		err = esw_vport_ingress_common_config(esw, vport);
 		if (err)
-			goto err_egress;
+			goto err_ingress;
+
+		if (mlx5_eswitch_is_vf_vport(esw, vport->vport)) {
+			err = esw_vport_egress_prio_tag_config(esw, vport);
+			if (err)
+				goto err_egress;
+		}
 	}
 
+	if (mlx5_eswitch_vport_match_metadata_enabled(esw))
+		esw_info(esw->dev, "Use metadata reg_c as source vport to match\n");
+
 	return 0;
 
 err_egress:
 	esw_vport_disable_ingress_acl(esw, vport);
 err_ingress:
-	mlx5_esw_for_each_vf_vport_reverse(esw, j, vport, i - 1) {
+	for (j = MLX5_VPORT_PF; j < i; j++) {
+		vport = &esw->vports[j];
 		esw_vport_disable_egress_acl(esw, vport);
 		esw_vport_disable_ingress_acl(esw, vport);
 	}
@@ -1704,15 +1797,17 @@ err_ingress:
 	return err;
 }
 
-static void esw_prio_tag_acls_cleanup(struct mlx5_eswitch *esw)
+static void esw_destroy_offloads_acl_tables(struct mlx5_eswitch *esw)
 {
 	struct mlx5_vport *vport;
 	int i;
 
-	mlx5_esw_for_each_vf_vport(esw, i, vport, esw->nvports) {
+	mlx5_esw_for_all_vports(esw, i, vport) {
 		esw_vport_disable_egress_acl(esw, vport);
 		esw_vport_disable_ingress_acl(esw, vport);
 	}
+
+	esw->flags &= ~MLX5_ESWITCH_VPORT_MATCH_METADATA;
 }
 
 static int esw_offloads_steering_init(struct mlx5_eswitch *esw, int nvports)
@@ -1722,15 +1817,13 @@ static int esw_offloads_steering_init(struct mlx5_eswitch *esw, int nvports)
 	memset(&esw->fdb_table.offloads, 0, sizeof(struct offloads_fdb));
 	mutex_init(&esw->fdb_table.offloads.fdb_prio_lock);
 
-	if (MLX5_CAP_GEN(esw->dev, prio_tag_required)) {
-		err = esw_prio_tag_acls_config(esw, nvports);
-		if (err)
-			return err;
-	}
+	err = esw_create_offloads_acl_tables(esw);
+	if (err)
+		return err;
 
 	err = esw_create_offloads_fdb_tables(esw, nvports);
 	if (err)
-		return err;
+		goto create_fdb_err;
 
 	err = esw_create_offloads_table(esw, nvports);
 	if (err)
@@ -1748,6 +1841,9 @@ create_fg_err:
 create_ft_err:
 	esw_destroy_offloads_fdb_tables(esw);
 
+create_fdb_err:
+	esw_destroy_offloads_acl_tables(esw);
+
 	return err;
 }
 
@@ -1756,8 +1852,7 @@ static void esw_offloads_steering_cleanup(struct mlx5_eswitch *esw)
 	esw_destroy_vport_rx_group(esw);
 	esw_destroy_offloads_table(esw);
 	esw_destroy_offloads_fdb_tables(esw);
-	if (MLX5_CAP_GEN(esw->dev, prio_tag_required))
-		esw_prio_tag_acls_cleanup(esw);
+	esw_destroy_offloads_acl_tables(esw);
 }
 
 static void esw_functions_changed_event_handler(struct work_struct *work)
@@ -2296,3 +2391,16 @@ bool mlx5_eswitch_is_vf_vport(const struct mlx5_eswitch *esw, u16 vport_num)
 	return vport_num >= MLX5_VPORT_FIRST_VF &&
 	       vport_num <= esw->dev->priv.sriov.max_vfs;
 }
+
+bool mlx5_eswitch_vport_match_metadata_enabled(const struct mlx5_eswitch *esw)
+{
+	return !!(esw->flags & MLX5_ESWITCH_VPORT_MATCH_METADATA);
+}
+EXPORT_SYMBOL(mlx5_eswitch_vport_match_metadata_enabled);
+
+u32 mlx5_eswitch_get_vport_metadata_for_match(const struct mlx5_eswitch *esw,
+					      u16 vport_num)
+{
+	return ((MLX5_CAP_GEN(esw->dev, vhca_id) & 0xffff) << 16) | vport_num;
+}
+EXPORT_SYMBOL(mlx5_eswitch_get_vport_metadata_for_match);
diff --git a/include/linux/mlx5/eswitch.h b/include/linux/mlx5/eswitch.h
index 174eec0871d9..aece3ae1902d 100644
--- a/include/linux/mlx5/eswitch.h
+++ b/include/linux/mlx5/eswitch.h
@@ -67,11 +67,28 @@ mlx5_eswitch_add_send_to_vport_rule(struct mlx5_eswitch *esw,
 #ifdef CONFIG_MLX5_ESWITCH
 enum devlink_eswitch_encap_mode
 mlx5_eswitch_get_encap_mode(const struct mlx5_core_dev *dev);
+
+bool mlx5_eswitch_vport_match_metadata_enabled(const struct mlx5_eswitch *esw);
+u32 mlx5_eswitch_get_vport_metadata_for_match(const struct mlx5_eswitch *esw,
+					      u16 vport_num);
 #else  /* CONFIG_MLX5_ESWITCH */
 static inline enum devlink_eswitch_encap_mode
 mlx5_eswitch_get_encap_mode(const struct mlx5_core_dev *dev)
 {
 	return DEVLINK_ESWITCH_ENCAP_MODE_NONE;
 }
+
+static inline bool
+mlx5_eswitch_vport_match_metadata_enabled(const struct mlx5_eswitch *esw)
+{
+	return false;
+};
+
+static inline u32
+mlx5_eswitch_get_vport_metadata_for_match(const struct mlx5_eswitch *esw,
+					  int vport_num)
+{
+	return 0;
+};
 #endif /* CONFIG_MLX5_ESWITCH */
 #endif
-- 
cgit v1.2.3


From 8d212ff057f8b81ed6ed418874b54ded3bf97ad4 Mon Sep 17 00:00:00 2001
From: Jianbo Liu <jianbol@mellanox.com>
Date: Tue, 25 Jun 2019 17:48:02 +0000
Subject: net/mlx5e: Specifying known origin of packets matching the flow

In vport metadata matching, the source port number is replaced by metadata.
Because FW has no idea what the metadata contains, a syndrome will happen.
Specify a known origin to avoid the syndrome.
However, there is no functional change because ANY_VPORT (0) is filled
in flow_source, the same default value as before, as a pre-step towards
metadata matching for fast path.
Two other values can be filled in flow_source: when set to 0x1, packets
matching this rule are from the uplink, while 0x2 is for packets from
other local vports.

Signed-off-by: Jianbo Liu <jianbol@mellanox.com>
Reviewed-by: Mark Bloch <markb@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/diag/fs_tracepoint.h | 2 ++
 drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c             | 3 +++
 include/linux/mlx5/fs.h                                      | 1 +
 3 files changed, 6 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/diag/fs_tracepoint.h b/drivers/net/ethernet/mellanox/mlx5/core/diag/fs_tracepoint.h
index 9ec46edf22a6..ddf1b87f1bc0 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/diag/fs_tracepoint.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/diag/fs_tracepoint.h
@@ -187,6 +187,7 @@ TRACE_EVENT(mlx5_fs_set_fte,
 		__field(u32, index)
 		__field(u32, action)
 		__field(u32, flow_tag)
+		__field(u32, flow_source)
 		__field(u8,  mask_enable)
 		__field(int, new_fte)
 		__array(u32, mask_outer, MLX5_ST_SZ_DW(fte_match_set_lyr_2_4))
@@ -205,6 +206,7 @@ TRACE_EVENT(mlx5_fs_set_fte,
 			   __entry->action = fte->action.action;
 			   __entry->mask_enable = __entry->fg->mask.match_criteria_enable;
 			   __entry->flow_tag = fte->flow_context.flow_tag;
+			   __entry->flow_source = fte->flow_context.flow_source;
 			   memcpy(__entry->mask_outer,
 				  MLX5_ADDR_OF(fte_match_param,
 					       &__entry->fg->mask.match_criteria,
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c
index fb1335a433ae..7ac1249eadc3 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c
@@ -398,6 +398,9 @@ static int mlx5_cmd_set_fte(struct mlx5_core_dev *dev,
 
 	MLX5_SET(flow_context, in_flow_context, flow_tag,
 		 fte->flow_context.flow_tag);
+	MLX5_SET(flow_context, in_flow_context, flow_source,
+		 fte->flow_context.flow_source);
+
 	MLX5_SET(flow_context, in_flow_context, extended_destination,
 		 extended_dest);
 	if (extended_dest) {
diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h
index 9bf49ce218fa..dc7e7aa53a13 100644
--- a/include/linux/mlx5/fs.h
+++ b/include/linux/mlx5/fs.h
@@ -95,6 +95,7 @@ enum {
 struct mlx5_flow_context {
 	u32 flags;
 	u32 flow_tag;
+	u32 flow_source;
 };
 
 struct mlx5_flow_spec {
-- 
cgit v1.2.3


From 3b8c4a08a471d56ecaaca939c972fdf5b8255629 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Wed, 19 Jun 2019 16:10:16 +0100
Subject: keys: Kill off request_key_async{,_with_auxdata}

Kill off request_key_async{,_with_auxdata}() as they're not currently used.

Signed-off-by: David Howells <dhowells@redhat.com>
---
 Documentation/security/keys/core.rst        | 32 ------------------
 Documentation/security/keys/request-key.rst | 23 ++-----------
 include/linux/key.h                         | 11 -------
 security/keys/request_key.c                 | 50 -----------------------------
 4 files changed, 2 insertions(+), 114 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/security/keys/core.rst b/Documentation/security/keys/core.rst
index 003f1452a5b7..a0e245f9576f 100644
--- a/Documentation/security/keys/core.rst
+++ b/Documentation/security/keys/core.rst
@@ -1115,38 +1115,6 @@ payload contents" for more information.
     is a blob of length callout_len, if given (the length may be 0).
 
 
- *  A key can be requested asynchronously by calling one of::
-
-	struct key *request_key_async(const struct key_type *type,
-				      const char *description,
-				      const void *callout_info,
-				      size_t callout_len);
-
-    or::
-
-	struct key *request_key_async_with_auxdata(const struct key_type *type,
-						   const char *description,
-						   const char *callout_info,
-					     	   size_t callout_len,
-					     	   void *aux);
-
-    which are asynchronous equivalents of request_key() and
-    request_key_with_auxdata() respectively.
-
-    These two functions return with the key potentially still under
-    construction.  To wait for construction completion, the following should be
-    called::
-
-	int wait_for_key_construction(struct key *key, bool intr);
-
-    The function will wait for the key to finish being constructed and then
-    invokes key_validate() to return an appropriate value to indicate the state
-    of the key (0 indicates the key is usable).
-
-    If intr is true, then the wait can be interrupted by a signal, in which
-    case error ERESTARTSYS will be returned.
-
-
  *  To search for a key under RCU conditions, call::
 
 	struct key *request_key_rcu(const struct key_type *type,
diff --git a/Documentation/security/keys/request-key.rst b/Documentation/security/keys/request-key.rst
index 45049abdf290..5a210baa583a 100644
--- a/Documentation/security/keys/request-key.rst
+++ b/Documentation/security/keys/request-key.rst
@@ -21,21 +21,6 @@ or::
 					     size_t callout_len,
 					     void *aux);
 
-or::
-
-	struct key *request_key_async(const struct key_type *type,
-				      const char *description,
-				      const char *callout_info,
-				      size_t callout_len);
-
-or::
-
-	struct key *request_key_async_with_auxdata(const struct key_type *type,
-						   const char *description,
-						   const char *callout_info,
-					     	   size_t callout_len,
-						   void *aux);
-
 or::
 
 	struct key *request_key_rcu(const struct key_type *type,
@@ -53,15 +38,11 @@ does not need to link the key to a keyring to prevent it from being immediately
 destroyed.  The kernel interface returns a pointer directly to the key, and
 it's up to the caller to destroy the key.
 
-The request_key*_with_auxdata() calls are like the in-kernel request_key*()
-calls, except that they permit auxiliary data to be passed to the upcaller (the
+The request_key_with_auxdata() call is like the in-kernel request_key() call,
+except that it permits auxiliary data to be passed to the upcaller (the
 default is NULL).  This is only useful for those key types that define their
 own upcall mechanism rather than using /sbin/request-key.
 
-The two async in-kernel calls may return keys that are still in the process of
-being constructed.  The two non-async ones will wait for construction to
-complete first.
-
 The request_key_rcu() call is like the in-kernel request_key() call, except
 that it doesn't check for keys that are under construction and doesn't attempt
 to construct missing keys.
diff --git a/include/linux/key.h b/include/linux/key.h
index 3604a554df99..4cd5669184f3 100644
--- a/include/linux/key.h
+++ b/include/linux/key.h
@@ -283,17 +283,6 @@ extern struct key *request_key_with_auxdata(struct key_type *type,
 					    size_t callout_len,
 					    void *aux);
 
-extern struct key *request_key_async(struct key_type *type,
-				     const char *description,
-				     const void *callout_info,
-				     size_t callout_len);
-
-extern struct key *request_key_async_with_auxdata(struct key_type *type,
-						  const char *description,
-						  const void *callout_info,
-						  size_t callout_len,
-						  void *aux);
-
 extern int wait_for_key_construction(struct key *key, bool intr);
 
 extern int key_validate(const struct key *key);
diff --git a/security/keys/request_key.c b/security/keys/request_key.c
index f289982cb5db..36c55ef47b9e 100644
--- a/security/keys/request_key.c
+++ b/security/keys/request_key.c
@@ -739,56 +739,6 @@ struct key *request_key_with_auxdata(struct key_type *type,
 }
 EXPORT_SYMBOL(request_key_with_auxdata);
 
-/*
- * request_key_async - Request a key (allow async construction)
- * @type: Type of key.
- * @description: The searchable description of the key.
- * @callout_info: The data to pass to the instantiation upcall (or NULL).
- * @callout_len: The length of callout_info.
- *
- * As for request_key_and_link() except that it does not add the returned key
- * to a keyring if found, new keys are always allocated in the user's quota and
- * no auxiliary data can be passed.
- *
- * The caller should call wait_for_key_construction() to wait for the
- * completion of the returned key if it is still undergoing construction.
- */
-struct key *request_key_async(struct key_type *type,
-			      const char *description,
-			      const void *callout_info,
-			      size_t callout_len)
-{
-	return request_key_and_link(type, description, callout_info,
-				    callout_len, NULL, NULL,
-				    KEY_ALLOC_IN_QUOTA);
-}
-EXPORT_SYMBOL(request_key_async);
-
-/*
- * request a key with auxiliary data for the upcaller (allow async construction)
- * @type: Type of key.
- * @description: The searchable description of the key.
- * @callout_info: The data to pass to the instantiation upcall (or NULL).
- * @callout_len: The length of callout_info.
- * @aux: Auxiliary data for the upcall.
- *
- * As for request_key_and_link() except that it does not add the returned key
- * to a keyring if found and new keys are always allocated in the user's quota.
- *
- * The caller should call wait_for_key_construction() to wait for the
- * completion of the returned key if it is still undergoing construction.
- */
-struct key *request_key_async_with_auxdata(struct key_type *type,
-					   const char *description,
-					   const void *callout_info,
-					   size_t callout_len,
-					   void *aux)
-{
-	return request_key_and_link(type, description, callout_info,
-				    callout_len, aux, NULL, KEY_ALLOC_IN_QUOTA);
-}
-EXPORT_SYMBOL(request_key_async_with_auxdata);
-
 /**
  * request_key_rcu - Request key from RCU-read-locked context
  * @type: The type of key we want.
-- 
cgit v1.2.3


From f771fde82051976a6fc0fd570f8b86de4a92124b Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Wed, 26 Jun 2019 21:02:31 +0100
Subject: keys: Simplify key description management

Simplify key description management by packing the first few chars of the
description into the same word as the description length.  This simplifies
the code that generates the index-key used by assoc_array.  It should speed
up key searching a bit too.

Signed-off-by: David Howells <dhowells@redhat.com>
---
 include/linux/key.h        | 14 +++++++++-
 security/keys/internal.h   |  6 ++++
 security/keys/key.c        |  2 ++
 security/keys/keyring.c    | 70 ++++++++++++++--------------------------------
 security/keys/persistent.c |  1 +
 5 files changed, 43 insertions(+), 50 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/key.h b/include/linux/key.h
index 4cd5669184f3..86ccc2d010f6 100644
--- a/include/linux/key.h
+++ b/include/linux/key.h
@@ -86,9 +86,20 @@ struct keyring_list;
 struct keyring_name;
 
 struct keyring_index_key {
+	union {
+		struct {
+#ifdef __LITTLE_ENDIAN /* Put desc_len at the LSB of x */
+			u16	desc_len;	/* u16: a u8 would truncate lengths above 255 */
+			char	desc[sizeof(long) - 2];	/* First few chars of description */
+#else
+			char	desc[sizeof(long) - 2];	/* First few chars of description */
+			u16	desc_len;	/* u16: a u8 would truncate lengths above 255 */
+#endif
+		};
+		unsigned long x;
+	};
 	struct key_type		*type;
 	const char		*description;
-	size_t			desc_len;
 };
 
 union key_payload {
@@ -202,6 +213,7 @@ struct key {
 	union {
 		struct keyring_index_key index_key;
 		struct {
+			unsigned long	len_desc;
 			struct key_type	*type;		/* type of key */
 			char		*description;
 		};
diff --git a/security/keys/internal.h b/security/keys/internal.h
index 3d5c08db74d2..ee71c72fc5f0 100644
--- a/security/keys/internal.h
+++ b/security/keys/internal.h
@@ -90,6 +90,12 @@ extern struct mutex key_construction_mutex;
 extern wait_queue_head_t request_key_conswq;
 
 
+static inline void key_set_index_key(struct keyring_index_key *index_key)
+{
+	size_t n = min_t(size_t, index_key->desc_len, sizeof(index_key->desc));
+	memcpy(index_key->desc, index_key->description, n);
+}
+
 extern struct key_type *key_type_lookup(const char *type);
 extern void key_type_put(struct key_type *ktype);
 
diff --git a/security/keys/key.c b/security/keys/key.c
index e792d65c0af8..0a3828f15f57 100644
--- a/security/keys/key.c
+++ b/security/keys/key.c
@@ -285,6 +285,7 @@ struct key *key_alloc(struct key_type *type, const char *desc,
 	key->index_key.description = kmemdup(desc, desclen + 1, GFP_KERNEL);
 	if (!key->index_key.description)
 		goto no_memory_3;
+	key_set_index_key(&key->index_key);
 
 	refcount_set(&key->usage, 1);
 	init_rwsem(&key->sem);
@@ -868,6 +869,7 @@ key_ref_t key_create_or_update(key_ref_t keyring_ref,
 			goto error_free_prep;
 	}
 	index_key.desc_len = strlen(index_key.description);
+	key_set_index_key(&index_key);
 
 	ret = __key_link_lock(keyring, &index_key);
 	if (ret < 0) {
diff --git a/security/keys/keyring.c b/security/keys/keyring.c
index afa6d4024c67..ebf52077598f 100644
--- a/security/keys/keyring.c
+++ b/security/keys/keyring.c
@@ -179,9 +179,9 @@ static unsigned long hash_key_type_and_desc(const struct keyring_index_key *inde
 	int n, desc_len = index_key->desc_len;
 
 	type = (unsigned long)index_key->type;
-
 	acc = mult_64x32_and_fold(type, desc_len + 13);
 	acc = mult_64x32_and_fold(acc, 9207);
+
 	for (;;) {
 		n = desc_len;
 		if (n <= 0)
@@ -215,23 +215,13 @@ static unsigned long hash_key_type_and_desc(const struct keyring_index_key *inde
 /*
  * Build the next index key chunk.
  *
- * On 32-bit systems the index key is laid out as:
- *
- *	0	4	5	9...
- *	hash	desclen	typeptr	desc[]
- *
- * On 64-bit systems:
- *
- *	0	8	9	17...
- *	hash	desclen	typeptr	desc[]
- *
  * We return it one word-sized chunk at a time.
  */
 static unsigned long keyring_get_key_chunk(const void *data, int level)
 {
 	const struct keyring_index_key *index_key = data;
 	unsigned long chunk = 0;
-	long offset = 0;
+	const u8 *d;
 	int desc_len = index_key->desc_len, n = sizeof(chunk);
 
 	level /= ASSOC_ARRAY_KEY_CHUNK_SIZE;
@@ -239,33 +229,23 @@ static unsigned long keyring_get_key_chunk(const void *data, int level)
 	case 0:
 		return hash_key_type_and_desc(index_key);
 	case 1:
-		return ((unsigned long)index_key->type << 8) | desc_len;
+		return index_key->x;
 	case 2:
-		if (desc_len == 0)
-			return (u8)((unsigned long)index_key->type >>
-				    (ASSOC_ARRAY_KEY_CHUNK_SIZE - 8));
-		n--;
-		offset = 1;
-		/* fall through */
+		return (unsigned long)index_key->type;
 	default:
-		offset += sizeof(chunk) - 1;
-		offset += (level - 3) * sizeof(chunk);
-		if (offset >= desc_len)
+		level -= 3;
+		if (desc_len <= sizeof(index_key->desc))
 			return 0;
-		desc_len -= offset;
+
+		d = index_key->description + sizeof(index_key->desc);
+		d += level * sizeof(long);
+		desc_len -= sizeof(index_key->desc);
 		if (desc_len > n)
 			desc_len = n;
-		offset += desc_len;
 		do {
 			chunk <<= 8;
-			chunk |= ((u8*)index_key->description)[--offset];
+			chunk |= *d++;
 		} while (--desc_len > 0);
-
-		if (level == 2) {
-			chunk <<= 8;
-			chunk |= (u8)((unsigned long)index_key->type >>
-				      (ASSOC_ARRAY_KEY_CHUNK_SIZE - 8));
-		}
 		return chunk;
 	}
 }
@@ -304,39 +284,28 @@ static int keyring_diff_objects(const void *object, const void *data)
 	seg_b = hash_key_type_and_desc(b);
 	if ((seg_a ^ seg_b) != 0)
 		goto differ;
+	level += ASSOC_ARRAY_KEY_CHUNK_SIZE / 8;
 
 	/* The number of bits contributed by the hash is controlled by a
 	 * constant in the assoc_array headers.  Everything else thereafter we
 	 * can deal with as being machine word-size dependent.
 	 */
-	level += ASSOC_ARRAY_KEY_CHUNK_SIZE / 8;
-	seg_a = a->desc_len;
-	seg_b = b->desc_len;
+	seg_a = a->x;
+	seg_b = b->x;
 	if ((seg_a ^ seg_b) != 0)
 		goto differ;
+	level += sizeof(unsigned long);
 
 	/* The next bit may not work on big endian */
-	level++;
 	seg_a = (unsigned long)a->type;
 	seg_b = (unsigned long)b->type;
 	if ((seg_a ^ seg_b) != 0)
 		goto differ;
-
 	level += sizeof(unsigned long);
-	if (a->desc_len == 0)
-		goto same;
 
-	i = 0;
-	if (((unsigned long)a->description | (unsigned long)b->description) &
-	    (sizeof(unsigned long) - 1)) {
-		do {
-			seg_a = *(unsigned long *)(a->description + i);
-			seg_b = *(unsigned long *)(b->description + i);
-			if ((seg_a ^ seg_b) != 0)
-				goto differ_plus_i;
-			i += sizeof(unsigned long);
-		} while (i < (a->desc_len & (sizeof(unsigned long) - 1)));
-	}
+	i = sizeof(a->desc);
+	if (a->desc_len <= i)
+		goto same;
 
 	for (; i < a->desc_len; i++) {
 		seg_a = *(unsigned char *)(a->description + i);
@@ -662,6 +631,9 @@ static bool search_nested_keyrings(struct key *keyring,
 	BUG_ON((ctx->flags & STATE_CHECKS) == 0 ||
 	       (ctx->flags & STATE_CHECKS) == STATE_CHECKS);
 
+	if (ctx->index_key.description)
+		key_set_index_key(&ctx->index_key);
+
 	/* Check to see if this top-level keyring is what we are looking for
 	 * and whether it is valid or not.
 	 */
diff --git a/security/keys/persistent.c b/security/keys/persistent.c
index d0cb5b32eff7..fc29ec59efa7 100644
--- a/security/keys/persistent.c
+++ b/security/keys/persistent.c
@@ -87,6 +87,7 @@ static long key_get_persistent(struct user_namespace *ns, kuid_t uid,
 	index_key.type = &key_type_keyring;
 	index_key.description = buf;
 	index_key.desc_len = sprintf(buf, "_persistent.%u", from_kuid(ns, uid));
+	key_set_index_key(&index_key);
 
 	if (ns->persistent_keyring_register) {
 		reg_ref = make_key_ref(ns->persistent_keyring_register, true);
-- 
cgit v1.2.3


From 355ef8e15885020da88f5ba2d85ce42b1d01f537 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Wed, 26 Jun 2019 21:02:32 +0100
Subject: keys: Cache the hash value to avoid lots of recalculation

Cache the hash of the key's type and description in the index key so that
we're not recalculating it every time we look at a key during a search.
The hash function does a bunch of multiplications, so evading those is
probably worthwhile - especially as this is done for every key examined
during a search.

This also allows the methods used by assoc_array to get chunks of index-key
to be simplified.

Signed-off-by: David Howells <dhowells@redhat.com>
---
 include/linux/key.h      |  3 +++
 security/keys/internal.h |  8 +-------
 security/keys/key.c      |  2 +-
 security/keys/keyring.c  | 28 ++++++++++++++++++++--------
 4 files changed, 25 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/key.h b/include/linux/key.h
index 86ccc2d010f6..fb2debcacea0 100644
--- a/include/linux/key.h
+++ b/include/linux/key.h
@@ -86,6 +86,8 @@ struct keyring_list;
 struct keyring_name;
 
 struct keyring_index_key {
+	/* [!] If this structure is altered, the union in struct key must change too! */
+	unsigned long		hash;			/* Hash value */
 	union {
 		struct {
 #ifdef __LITTLE_ENDIAN /* Put desc_len at the LSB of x */
@@ -213,6 +215,7 @@ struct key {
 	union {
 		struct keyring_index_key index_key;
 		struct {
+			unsigned long	hash;
 			unsigned long	len_desc;
 			struct key_type	*type;		/* type of key */
 			char		*description;
diff --git a/security/keys/internal.h b/security/keys/internal.h
index ee71c72fc5f0..4305414795ae 100644
--- a/security/keys/internal.h
+++ b/security/keys/internal.h
@@ -89,13 +89,7 @@ extern spinlock_t key_serial_lock;
 extern struct mutex key_construction_mutex;
 extern wait_queue_head_t request_key_conswq;
 
-
-static inline void key_set_index_key(struct keyring_index_key *index_key)
-{
-	size_t n = min_t(size_t, index_key->desc_len, sizeof(index_key->desc));
-	memcpy(index_key->desc, index_key->description, n);
-}
-
+extern void key_set_index_key(struct keyring_index_key *index_key);
 extern struct key_type *key_type_lookup(const char *type);
 extern void key_type_put(struct key_type *ktype);
 
diff --git a/security/keys/key.c b/security/keys/key.c
index 0a3828f15f57..9d52f2472a09 100644
--- a/security/keys/key.c
+++ b/security/keys/key.c
@@ -285,12 +285,12 @@ struct key *key_alloc(struct key_type *type, const char *desc,
 	key->index_key.description = kmemdup(desc, desclen + 1, GFP_KERNEL);
 	if (!key->index_key.description)
 		goto no_memory_3;
+	key->index_key.type = type;
 	key_set_index_key(&key->index_key);
 
 	refcount_set(&key->usage, 1);
 	init_rwsem(&key->sem);
 	lockdep_set_class(&key->sem, &type->lock_class);
-	key->index_key.type = type;
 	key->user = user;
 	key->quotalen = quotalen;
 	key->datalen = type->def_datalen;
diff --git a/security/keys/keyring.c b/security/keys/keyring.c
index ebf52077598f..a5ee3b4d2eb8 100644
--- a/security/keys/keyring.c
+++ b/security/keys/keyring.c
@@ -168,7 +168,7 @@ static u64 mult_64x32_and_fold(u64 x, u32 y)
 /*
  * Hash a key type and description.
  */
-static unsigned long hash_key_type_and_desc(const struct keyring_index_key *index_key)
+static void hash_key_type_and_desc(struct keyring_index_key *index_key)
 {
 	const unsigned level_shift = ASSOC_ARRAY_LEVEL_STEP;
 	const unsigned long fan_mask = ASSOC_ARRAY_FAN_MASK;
@@ -206,10 +206,22 @@ static unsigned long hash_key_type_and_desc(const struct keyring_index_key *inde
 	 * zero for keyrings and non-zero otherwise.
 	 */
 	if (index_key->type != &key_type_keyring && (hash & fan_mask) == 0)
-		return hash | (hash >> (ASSOC_ARRAY_KEY_CHUNK_SIZE - level_shift)) | 1;
-	if (index_key->type == &key_type_keyring && (hash & fan_mask) != 0)
-		return (hash + (hash << level_shift)) & ~fan_mask;
-	return hash;
+		hash |= (hash >> (ASSOC_ARRAY_KEY_CHUNK_SIZE - level_shift)) | 1;
+	else if (index_key->type == &key_type_keyring && (hash & fan_mask) != 0)
+		hash = (hash + (hash << level_shift)) & ~fan_mask;
+	index_key->hash = hash;
+}
+
+/*
+ * Finalise an index key to include a part of the description actually in the
+ * index key and to add in the hash too.
+ */
+void key_set_index_key(struct keyring_index_key *index_key)
+{
+	size_t n = min_t(size_t, index_key->desc_len, sizeof(index_key->desc));
+	memcpy(index_key->desc, index_key->description, n);
+
+	hash_key_type_and_desc(index_key);
 }
 
 /*
@@ -227,7 +239,7 @@ static unsigned long keyring_get_key_chunk(const void *data, int level)
 	level /= ASSOC_ARRAY_KEY_CHUNK_SIZE;
 	switch (level) {
 	case 0:
-		return hash_key_type_and_desc(index_key);
+		return index_key->hash;
 	case 1:
 		return index_key->x;
 	case 2:
@@ -280,8 +292,8 @@ static int keyring_diff_objects(const void *object, const void *data)
 	int level, i;
 
 	level = 0;
-	seg_a = hash_key_type_and_desc(a);
-	seg_b = hash_key_type_and_desc(b);
+	seg_a = a->hash;
+	seg_b = b->hash;
 	if ((seg_a ^ seg_b) != 0)
 		goto differ;
 	level += ASSOC_ARRAY_KEY_CHUNK_SIZE / 8;
-- 
cgit v1.2.3


From dcf49dbc8077e278ddd1bc7298abc781496e8a08 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Wed, 26 Jun 2019 21:02:32 +0100
Subject: keys: Add a 'recurse' flag for keyring searches

Add a 'recurse' flag for keyring searches so that the flag can be omitted
and recursion disabled, thereby allowing just the nominated keyring to be
searched and none of the children.

Signed-off-by: David Howells <dhowells@redhat.com>
---
 Documentation/security/keys/core.rst     | 10 ++++++----
 certs/blacklist.c                        |  2 +-
 crypto/asymmetric_keys/asymmetric_type.c |  2 +-
 include/linux/key.h                      |  3 ++-
 lib/digsig.c                             |  2 +-
 net/rxrpc/security.c                     |  2 +-
 security/integrity/digsig_asymmetric.c   |  4 ++--
 security/keys/internal.h                 |  1 +
 security/keys/keyctl.c                   |  2 +-
 security/keys/keyring.c                  | 12 ++++++++++--
 security/keys/proc.c                     |  3 ++-
 security/keys/process_keys.c             |  3 ++-
 security/keys/request_key.c              |  3 ++-
 security/keys/request_key_auth.c         |  3 ++-
 14 files changed, 34 insertions(+), 18 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/security/keys/core.rst b/Documentation/security/keys/core.rst
index a0e245f9576f..ae930ae9d590 100644
--- a/Documentation/security/keys/core.rst
+++ b/Documentation/security/keys/core.rst
@@ -1162,11 +1162,13 @@ payload contents" for more information.
 
 	key_ref_t keyring_search(key_ref_t keyring_ref,
 				 const struct key_type *type,
-				 const char *description)
+				 const char *description,
+				 bool recurse)
 
-    This searches the keyring tree specified for a matching key. Error ENOKEY
-    is returned upon failure (use IS_ERR/PTR_ERR to determine). If successful,
-    the returned key will need to be released.
+    This searches only the specified keyring (recurse == false) or the keyring
+    tree rooted at it (recurse == true) for a matching key. Error ENOKEY is
+    returned upon failure (use IS_ERR/PTR_ERR to determine). If successful, the
+    returned key will need to be released.
 
     The possession attribute from the keyring reference is used to control
     access through the permissions mask and is propagated to the returned key
diff --git a/certs/blacklist.c b/certs/blacklist.c
index 3a507b9e2568..181cb7fa9540 100644
--- a/certs/blacklist.c
+++ b/certs/blacklist.c
@@ -128,7 +128,7 @@ int is_hash_blacklisted(const u8 *hash, size_t hash_len, const char *type)
 	*p = 0;
 
 	kref = keyring_search(make_key_ref(blacklist_keyring, true),
-			      &key_type_blacklist, buffer);
+			      &key_type_blacklist, buffer, false);
 	if (!IS_ERR(kref)) {
 		key_ref_put(kref);
 		ret = -EKEYREJECTED;
diff --git a/crypto/asymmetric_keys/asymmetric_type.c b/crypto/asymmetric_keys/asymmetric_type.c
index 69a0788a7de5..084027ef3121 100644
--- a/crypto/asymmetric_keys/asymmetric_type.c
+++ b/crypto/asymmetric_keys/asymmetric_type.c
@@ -87,7 +87,7 @@ struct key *find_asymmetric_key(struct key *keyring,
 	pr_debug("Look up: \"%s\"\n", req);
 
 	ref = keyring_search(make_key_ref(keyring, 1),
-			     &key_type_asymmetric, req);
+			     &key_type_asymmetric, req, true);
 	if (IS_ERR(ref))
 		pr_debug("Request for key '%s' err %ld\n", req, PTR_ERR(ref));
 	kfree(req);
diff --git a/include/linux/key.h b/include/linux/key.h
index fb2debcacea0..ff102731b3db 100644
--- a/include/linux/key.h
+++ b/include/linux/key.h
@@ -341,7 +341,8 @@ extern int keyring_clear(struct key *keyring);
 
 extern key_ref_t keyring_search(key_ref_t keyring,
 				struct key_type *type,
-				const char *description);
+				const char *description,
+				bool recurse);
 
 extern int keyring_add_key(struct key *keyring,
 			   struct key *key);
diff --git a/lib/digsig.c b/lib/digsig.c
index 3b0a579bdcdf..3782af401c68 100644
--- a/lib/digsig.c
+++ b/lib/digsig.c
@@ -221,7 +221,7 @@ int digsig_verify(struct key *keyring, const char *sig, int siglen,
 		/* search in specific keyring */
 		key_ref_t kref;
 		kref = keyring_search(make_key_ref(keyring, 1UL),
-						&key_type_user, name);
+				      &key_type_user, name, true);
 		if (IS_ERR(kref))
 			key = ERR_CAST(kref);
 		else
diff --git a/net/rxrpc/security.c b/net/rxrpc/security.c
index c4479afe8ae7..2cfc7125bc41 100644
--- a/net/rxrpc/security.c
+++ b/net/rxrpc/security.c
@@ -148,7 +148,7 @@ found_service:
 
 	/* look through the service's keyring */
 	kref = keyring_search(make_key_ref(rx->securities, 1UL),
-			      &key_type_rxrpc_s, kdesc);
+			      &key_type_rxrpc_s, kdesc, true);
 	if (IS_ERR(kref)) {
 		read_unlock(&local->services_lock);
 		_leave(" = %ld [search]", PTR_ERR(kref));
diff --git a/security/integrity/digsig_asymmetric.c b/security/integrity/digsig_asymmetric.c
index 99080871eb9f..358f614811e8 100644
--- a/security/integrity/digsig_asymmetric.c
+++ b/security/integrity/digsig_asymmetric.c
@@ -39,7 +39,7 @@ static struct key *request_asymmetric_key(struct key *keyring, uint32_t keyid)
 		key_ref_t kref;
 
 		kref = keyring_search(make_key_ref(key, 1),
-				     &key_type_asymmetric, name);
+				      &key_type_asymmetric, name, true);
 		if (!IS_ERR(kref)) {
 			pr_err("Key '%s' is in ima_blacklist_keyring\n", name);
 			return ERR_PTR(-EKEYREJECTED);
@@ -51,7 +51,7 @@ static struct key *request_asymmetric_key(struct key *keyring, uint32_t keyid)
 		key_ref_t kref;
 
 		kref = keyring_search(make_key_ref(keyring, 1),
-				      &key_type_asymmetric, name);
+				      &key_type_asymmetric, name, true);
 		if (IS_ERR(kref))
 			key = ERR_CAST(kref);
 		else
diff --git a/security/keys/internal.h b/security/keys/internal.h
index 4305414795ae..aa361299a3ec 100644
--- a/security/keys/internal.h
+++ b/security/keys/internal.h
@@ -127,6 +127,7 @@ struct keyring_search_context {
 #define KEYRING_SEARCH_NO_CHECK_PERM	0x0008	/* Don't check permissions */
 #define KEYRING_SEARCH_DETECT_TOO_DEEP	0x0010	/* Give an error on excessive depth */
 #define KEYRING_SEARCH_SKIP_EXPIRED	0x0020	/* Ignore expired keys (intention to replace) */
+#define KEYRING_SEARCH_RECURSE		0x0040	/* Search child keyrings also */
 
 	int (*iterator)(const void *object, void *iterator_data);
 
diff --git a/security/keys/keyctl.c b/security/keys/keyctl.c
index 9f418e66f067..169409b611b0 100644
--- a/security/keys/keyctl.c
+++ b/security/keys/keyctl.c
@@ -762,7 +762,7 @@ long keyctl_keyring_search(key_serial_t ringid,
 	}
 
 	/* do the search */
-	key_ref = keyring_search(keyring_ref, ktype, description);
+	key_ref = keyring_search(keyring_ref, ktype, description, true);
 	if (IS_ERR(key_ref)) {
 		ret = PTR_ERR(key_ref);
 
diff --git a/security/keys/keyring.c b/security/keys/keyring.c
index a5ee3b4d2eb8..20891cd198f0 100644
--- a/security/keys/keyring.c
+++ b/security/keys/keyring.c
@@ -685,6 +685,9 @@ descend_to_keyring:
 	 * Non-keyrings avoid the leftmost branch of the root entirely (root
 	 * slots 1-15).
 	 */
+	if (!(ctx->flags & KEYRING_SEARCH_RECURSE))
+		goto not_this_keyring;
+
 	ptr = READ_ONCE(keyring->keys.root);
 	if (!ptr)
 		goto not_this_keyring;
@@ -885,13 +888,15 @@ key_ref_t keyring_search_rcu(key_ref_t keyring_ref,
  * @keyring: The root of the keyring tree to be searched.
  * @type: The type of keyring we want to find.
  * @description: The name of the keyring we want to find.
+ * @recurse: True to search the children of @keyring also
  *
  * As keyring_search_rcu() above, but using the current task's credentials and
  * type's default matching function and preferred search method.
  */
 key_ref_t keyring_search(key_ref_t keyring,
 			 struct key_type *type,
-			 const char *description)
+			 const char *description,
+			 bool recurse)
 {
 	struct keyring_search_context ctx = {
 		.index_key.type		= type,
@@ -906,6 +911,8 @@ key_ref_t keyring_search(key_ref_t keyring,
 	key_ref_t key;
 	int ret;
 
+	if (recurse)
+		ctx.flags |= KEYRING_SEARCH_RECURSE;
 	if (type->match_preparse) {
 		ret = type->match_preparse(&ctx.match_data);
 		if (ret < 0)
@@ -1176,7 +1183,8 @@ static int keyring_detect_cycle(struct key *A, struct key *B)
 		.flags			= (KEYRING_SEARCH_NO_STATE_CHECK |
 					   KEYRING_SEARCH_NO_UPDATE_TIME |
 					   KEYRING_SEARCH_NO_CHECK_PERM |
-					   KEYRING_SEARCH_DETECT_TOO_DEEP),
+					   KEYRING_SEARCH_DETECT_TOO_DEEP |
+					   KEYRING_SEARCH_RECURSE),
 	};
 
 	rcu_read_lock();
diff --git a/security/keys/proc.c b/security/keys/proc.c
index f081dceae3b9..b4f5ba56b9cb 100644
--- a/security/keys/proc.c
+++ b/security/keys/proc.c
@@ -170,7 +170,8 @@ static int proc_keys_show(struct seq_file *m, void *v)
 		.match_data.cmp		= lookup_user_key_possessed,
 		.match_data.raw_data	= key,
 		.match_data.lookup_type	= KEYRING_SEARCH_LOOKUP_DIRECT,
-		.flags			= KEYRING_SEARCH_NO_STATE_CHECK,
+		.flags			= (KEYRING_SEARCH_NO_STATE_CHECK |
+					   KEYRING_SEARCH_RECURSE),
 	};
 
 	key_ref = make_key_ref(key, 0);
diff --git a/security/keys/process_keys.c b/security/keys/process_keys.c
index f8ffb06d0297..b07f768d23dc 100644
--- a/security/keys/process_keys.c
+++ b/security/keys/process_keys.c
@@ -531,7 +531,8 @@ key_ref_t lookup_user_key(key_serial_t id, unsigned long lflags,
 	struct keyring_search_context ctx = {
 		.match_data.cmp		= lookup_user_key_possessed,
 		.match_data.lookup_type	= KEYRING_SEARCH_LOOKUP_DIRECT,
-		.flags			= KEYRING_SEARCH_NO_STATE_CHECK,
+		.flags			= (KEYRING_SEARCH_NO_STATE_CHECK |
+					   KEYRING_SEARCH_RECURSE),
 	};
 	struct request_key_auth *rka;
 	struct key *key;
diff --git a/security/keys/request_key.c b/security/keys/request_key.c
index 36c55ef47b9e..1ffd3803ce29 100644
--- a/security/keys/request_key.c
+++ b/security/keys/request_key.c
@@ -569,7 +569,8 @@ struct key *request_key_and_link(struct key_type *type,
 		.match_data.raw_data	= description,
 		.match_data.lookup_type	= KEYRING_SEARCH_LOOKUP_DIRECT,
 		.flags			= (KEYRING_SEARCH_DO_STATE_CHECK |
-					   KEYRING_SEARCH_SKIP_EXPIRED),
+					   KEYRING_SEARCH_SKIP_EXPIRED |
+					   KEYRING_SEARCH_RECURSE),
 	};
 	struct key *key;
 	key_ref_t key_ref;
diff --git a/security/keys/request_key_auth.c b/security/keys/request_key_auth.c
index 99ed7a8a273d..f613987e8a63 100644
--- a/security/keys/request_key_auth.c
+++ b/security/keys/request_key_auth.c
@@ -252,7 +252,8 @@ struct key *key_get_instantiation_authkey(key_serial_t target_id)
 		.match_data.cmp		= key_default_cmp,
 		.match_data.raw_data	= description,
 		.match_data.lookup_type	= KEYRING_SEARCH_LOOKUP_DIRECT,
-		.flags			= KEYRING_SEARCH_DO_STATE_CHECK,
+		.flags			= (KEYRING_SEARCH_DO_STATE_CHECK |
+					   KEYRING_SEARCH_RECURSE),
 	};
 	struct key *authkey;
 	key_ref_t authkey_ref;
-- 
cgit v1.2.3


From b206f281d0ee14969878469816a69db22d5838e8 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Wed, 26 Jun 2019 21:02:32 +0100
Subject: keys: Namespace keyring names

Keyring names are held in a single global list that any process can pick
from by means of keyctl_join_session_keyring (provided the keyring grants
Search permission).  This isn't very container friendly, however.

Make the following changes:

 (1) Make default session, process and thread keyring names begin with a
     '.' instead of '_'.

 (2) Keyrings whose names begin with a '.' aren't added to the list.  Such
     keyrings are system specials.

 (3) Replace the global list with per-user_namespace lists.  A keyring adds
     its name to the list for the user_namespace that it is currently in.

 (4) When a user_namespace is deleted, it just removes itself from the
     keyring name list.

The global keyring_name_lock is retained for accessing the name lists.
This allows (4) to work.

This can be tested by:

	# keyctl newring foo @s
	995906392
	# unshare -U
	$ keyctl show
	...
	 995906392 --alswrv  65534 65534   \_ keyring: foo
	...
	$ keyctl session foo
	Joined session keyring: 935622349

As can be seen, a new session keyring was created.

The capability bit KEYCTL_CAPS1_NS_KEYRING_NAME is set if the kernel is
employing this feature.

Signed-off-by: David Howells <dhowells@redhat.com>
cc: Eric W. Biederman <ebiederm@xmission.com>
---
 include/linux/key.h            |  2 +
 include/linux/user_namespace.h |  5 +++
 include/uapi/linux/keyctl.h    |  1 +
 kernel/user.c                  |  3 ++
 kernel/user_namespace.c        |  7 +--
 security/keys/keyctl.c         |  3 +-
 security/keys/keyring.c        | 99 ++++++++++++++++++------------------------
 7 files changed, 60 insertions(+), 60 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/key.h b/include/linux/key.h
index ff102731b3db..ae1177302d70 100644
--- a/include/linux/key.h
+++ b/include/linux/key.h
@@ -361,6 +361,7 @@ extern void key_set_timeout(struct key *, unsigned);
 
 extern key_ref_t lookup_user_key(key_serial_t id, unsigned long flags,
 				 key_perm_t perm);
+extern void key_free_user_ns(struct user_namespace *);
 
 /*
  * The permissions required on a key that we're looking up.
@@ -434,6 +435,7 @@ extern void key_init(void);
 #define key_fsuid_changed(c)		do { } while(0)
 #define key_fsgid_changed(c)		do { } while(0)
 #define key_init()			do { } while(0)
+#define key_free_user_ns(ns)		do { } while(0)
 
 #endif /* CONFIG_KEYS */
 #endif /* __KERNEL__ */
diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h
index d6b74b91096b..90457015fa3f 100644
--- a/include/linux/user_namespace.h
+++ b/include/linux/user_namespace.h
@@ -64,6 +64,11 @@ struct user_namespace {
 	struct ns_common	ns;
 	unsigned long		flags;
 
+#ifdef CONFIG_KEYS
+	/* List of joinable keyrings in this namespace */
+	struct list_head	keyring_name_list;
+#endif
+
 	/* Register of per-UID persistent keyrings for this namespace */
 #ifdef CONFIG_PERSISTENT_KEYRINGS
 	struct key		*persistent_keyring_register;
diff --git a/include/uapi/linux/keyctl.h b/include/uapi/linux/keyctl.h
index 551b5814f53e..35b405034674 100644
--- a/include/uapi/linux/keyctl.h
+++ b/include/uapi/linux/keyctl.h
@@ -128,5 +128,6 @@ struct keyctl_pkey_params {
 #define KEYCTL_CAPS0_INVALIDATE		0x20 /* KEYCTL_INVALIDATE supported */
 #define KEYCTL_CAPS0_RESTRICT_KEYRING	0x40 /* KEYCTL_RESTRICT_KEYRING supported */
 #define KEYCTL_CAPS0_MOVE		0x80 /* KEYCTL_MOVE supported */
+#define KEYCTL_CAPS1_NS_KEYRING_NAME	0x01 /* Keyring names are per-user_namespace */
 
 #endif /*  _LINUX_KEYCTL_H */
diff --git a/kernel/user.c b/kernel/user.c
index 88b834f0eebc..50979fd1b7aa 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -62,6 +62,9 @@ struct user_namespace init_user_ns = {
 	.ns.ops = &userns_operations,
 #endif
 	.flags = USERNS_INIT_FLAGS,
+#ifdef CONFIG_KEYS
+	.keyring_name_list = LIST_HEAD_INIT(init_user_ns.keyring_name_list),
+#endif
 #ifdef CONFIG_PERSISTENT_KEYRINGS
 	.persistent_keyring_register_sem =
 	__RWSEM_INITIALIZER(init_user_ns.persistent_keyring_register_sem),
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 923414a246e9..bda6e890ad88 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -133,6 +133,9 @@ int create_user_ns(struct cred *new)
 	ns->flags = parent_ns->flags;
 	mutex_unlock(&userns_state_mutex);
 
+#ifdef CONFIG_KEYS
+	INIT_LIST_HEAD(&ns->keyring_name_list);
+#endif
 #ifdef CONFIG_PERSISTENT_KEYRINGS
 	init_rwsem(&ns->persistent_keyring_register_sem);
 #endif
@@ -196,9 +199,7 @@ static void free_user_ns(struct work_struct *work)
 			kfree(ns->projid_map.reverse);
 		}
 		retire_userns_sysctls(ns);
-#ifdef CONFIG_PERSISTENT_KEYRINGS
-		key_put(ns->persistent_keyring_register);
-#endif
+		key_free_user_ns(ns);
 		ns_free_inum(&ns->ns);
 		kmem_cache_free(user_ns_cachep, ns);
 		dec_user_namespaces(ucounts);
diff --git a/security/keys/keyctl.c b/security/keys/keyctl.c
index 169409b611b0..8a813220f269 100644
--- a/security/keys/keyctl.c
+++ b/security/keys/keyctl.c
@@ -30,7 +30,7 @@
 
 #define KEY_MAX_DESC_SIZE 4096
 
-static const unsigned char keyrings_capabilities[1] = {
+static const unsigned char keyrings_capabilities[2] = {
 	[0] = (KEYCTL_CAPS0_CAPABILITIES |
 	       (IS_ENABLED(CONFIG_PERSISTENT_KEYRINGS)	? KEYCTL_CAPS0_PERSISTENT_KEYRINGS : 0) |
 	       (IS_ENABLED(CONFIG_KEY_DH_OPERATIONS)	? KEYCTL_CAPS0_DIFFIE_HELLMAN : 0) |
@@ -40,6 +40,7 @@ static const unsigned char keyrings_capabilities[1] = {
 	       KEYCTL_CAPS0_RESTRICT_KEYRING |
 	       KEYCTL_CAPS0_MOVE
 	       ),
+	[1] = (KEYCTL_CAPS1_NS_KEYRING_NAME),
 };
 
 static int key_get_type_from_user(char *type,
diff --git a/security/keys/keyring.c b/security/keys/keyring.c
index 20891cd198f0..fe851292509e 100644
--- a/security/keys/keyring.c
+++ b/security/keys/keyring.c
@@ -16,6 +16,7 @@
 #include <linux/security.h>
 #include <linux/seq_file.h>
 #include <linux/err.h>
+#include <linux/user_namespace.h>
 #include <keys/keyring-type.h>
 #include <keys/user-type.h>
 #include <linux/assoc_array_priv.h>
@@ -28,11 +29,6 @@
  */
 #define KEYRING_SEARCH_MAX_DEPTH 6
 
-/*
- * We keep all named keyrings in a hash to speed looking them up.
- */
-#define KEYRING_NAME_HASH_SIZE	(1 << 5)
-
 /*
  * We mark pointers we pass to the associative array with bit 1 set if
  * they're keyrings and clear otherwise.
@@ -55,17 +51,20 @@ static inline void *keyring_key_to_ptr(struct key *key)
 	return key;
 }
 
-static struct list_head	keyring_name_hash[KEYRING_NAME_HASH_SIZE];
 static DEFINE_RWLOCK(keyring_name_lock);
 
-static inline unsigned keyring_hash(const char *desc)
+/*
+ * Clean up the bits of user_namespace that belong to us.
+ */
+void key_free_user_ns(struct user_namespace *ns)
 {
-	unsigned bucket = 0;
-
-	for (; *desc; desc++)
-		bucket += (unsigned char)*desc;
+	write_lock(&keyring_name_lock);
+	list_del_init(&ns->keyring_name_list);
+	write_unlock(&keyring_name_lock);
 
-	return bucket & (KEYRING_NAME_HASH_SIZE - 1);
+#ifdef CONFIG_PERSISTENT_KEYRINGS
+	key_put(ns->persistent_keyring_register);
+#endif
 }
 
 /*
@@ -104,23 +103,17 @@ static DEFINE_MUTEX(keyring_serialise_link_lock);
 
 /*
  * Publish the name of a keyring so that it can be found by name (if it has
- * one).
+ * one and it doesn't begin with a dot).
  */
 static void keyring_publish_name(struct key *keyring)
 {
-	int bucket;
-
-	if (keyring->description) {
-		bucket = keyring_hash(keyring->description);
+	struct user_namespace *ns = current_user_ns();
 
+	if (keyring->description &&
+	    keyring->description[0] &&
+	    keyring->description[0] != '.') {
 		write_lock(&keyring_name_lock);
-
-		if (!keyring_name_hash[bucket].next)
-			INIT_LIST_HEAD(&keyring_name_hash[bucket]);
-
-		list_add_tail(&keyring->name_link,
-			      &keyring_name_hash[bucket]);
-
+		list_add_tail(&keyring->name_link, &ns->keyring_name_list);
 		write_unlock(&keyring_name_lock);
 	}
 }
@@ -1097,50 +1090,44 @@ found:
  */
 struct key *find_keyring_by_name(const char *name, bool uid_keyring)
 {
+	struct user_namespace *ns = current_user_ns();
 	struct key *keyring;
-	int bucket;
 
 	if (!name)
 		return ERR_PTR(-EINVAL);
 
-	bucket = keyring_hash(name);
-
 	read_lock(&keyring_name_lock);
 
-	if (keyring_name_hash[bucket].next) {
-		/* search this hash bucket for a keyring with a matching name
-		 * that's readable and that hasn't been revoked */
-		list_for_each_entry(keyring,
-				    &keyring_name_hash[bucket],
-				    name_link
-				    ) {
-			if (!kuid_has_mapping(current_user_ns(), keyring->user->uid))
-				continue;
-
-			if (test_bit(KEY_FLAG_REVOKED, &keyring->flags))
-				continue;
+	/* Search this namespace's name list for a keyring with a matching name
+	 * that grants Search permission and that hasn't been revoked
+	 */
+	list_for_each_entry(keyring, &ns->keyring_name_list, name_link) {
+		if (!kuid_has_mapping(ns, keyring->user->uid))
+			continue;
 
-			if (strcmp(keyring->description, name) != 0)
-				continue;
+		if (test_bit(KEY_FLAG_REVOKED, &keyring->flags))
+			continue;
 
-			if (uid_keyring) {
-				if (!test_bit(KEY_FLAG_UID_KEYRING,
-					      &keyring->flags))
-					continue;
-			} else {
-				if (key_permission(make_key_ref(keyring, 0),
-						   KEY_NEED_SEARCH) < 0)
-					continue;
-			}
+		if (strcmp(keyring->description, name) != 0)
+			continue;
 
-			/* we've got a match but we might end up racing with
-			 * key_cleanup() if the keyring is currently 'dead'
-			 * (ie. it has a zero usage count) */
-			if (!refcount_inc_not_zero(&keyring->usage))
+		if (uid_keyring) {
+			if (!test_bit(KEY_FLAG_UID_KEYRING,
+				      &keyring->flags))
+				continue;
+		} else {
+			if (key_permission(make_key_ref(keyring, 0),
+					   KEY_NEED_SEARCH) < 0)
 				continue;
-			keyring->last_used_at = ktime_get_real_seconds();
-			goto out;
 		}
+
+		/* we've got a match but we might end up racing with
+		 * key_cleanup() if the keyring is currently 'dead'
+		 * (ie. it has a zero usage count) */
+		if (!refcount_inc_not_zero(&keyring->usage))
+			continue;
+		keyring->last_used_at = ktime_get_real_seconds();
+		goto out;
 	}
 
 	keyring = ERR_PTR(-ENOKEY);
-- 
cgit v1.2.3


From 0f44e4d976f96c6439da0d6717238efa4b91196e Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Wed, 26 Jun 2019 21:02:32 +0100
Subject: keys: Move the user and user-session keyrings to the user_namespace

Move the user and user-session keyrings to the user_namespace struct rather
than pinning them from the user_struct struct.  This prevents these
keyrings from propagating across user-namespace boundaries with regard to
the KEY_SPEC_* flags, thereby making them more useful in a containerised
environment.

The issue is that a single user_struct may represent UIDs in several
different namespaces.

The way the patch does this is by attaching a 'register keyring' in each
user_namespace and then sticking the user and user-session keyrings into
that.  It can then be searched to retrieve them.

Signed-off-by: David Howells <dhowells@redhat.com>
cc: Jann Horn <jannh@google.com>
---
 include/linux/sched/user.h     |  14 ---
 include/linux/user_namespace.h |   9 +-
 kernel/user.c                  |   7 +-
 kernel/user_namespace.c        |   4 +-
 security/keys/internal.h       |   3 +-
 security/keys/keyring.c        |   1 +
 security/keys/persistent.c     |   8 +-
 security/keys/process_keys.c   | 259 ++++++++++++++++++++++++++---------------
 security/keys/request_key.c    |  20 ++--
 9 files changed, 196 insertions(+), 129 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched/user.h b/include/linux/sched/user.h
index 468d2565a9fe..917d88edb7b9 100644
--- a/include/linux/sched/user.h
+++ b/include/linux/sched/user.h
@@ -7,8 +7,6 @@
 #include <linux/refcount.h>
 #include <linux/ratelimit.h>
 
-struct key;
-
 /*
  * Some day this will be a full-fledged user tracking system..
  */
@@ -30,18 +28,6 @@ struct user_struct {
 	unsigned long unix_inflight;	/* How many files in flight in unix sockets */
 	atomic_long_t pipe_bufs;  /* how many pages are allocated in pipe buffers */
 
-#ifdef CONFIG_KEYS
-	/*
-	 * These pointers can only change from NULL to a non-NULL value once.
-	 * Writes are protected by key_user_keyring_mutex.
-	 * Unlocked readers should use READ_ONCE() unless they know that
-	 * install_user_keyrings() has been called successfully (which sets
-	 * these members to non-NULL values, preventing further modifications).
-	 */
-	struct key *uid_keyring;	/* UID specific keyring */
-	struct key *session_keyring;	/* UID's default session keyring */
-#endif
-
 	/* Hash table maintenance information */
 	struct hlist_node uidhash_node;
 	kuid_t uid;
diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h
index 90457015fa3f..fb9f4f799554 100644
--- a/include/linux/user_namespace.h
+++ b/include/linux/user_namespace.h
@@ -65,14 +65,19 @@ struct user_namespace {
 	unsigned long		flags;
 
 #ifdef CONFIG_KEYS
-	/* List of joinable keyrings in this namespace */
+	/* List of joinable keyrings in this namespace.  Modification access of
+	 * these pointers is controlled by keyring_sem.  Once
+	 * user_keyring_register is set, it won't be changed, so it can be
+	 * accessed directly with READ_ONCE().
+	 */
 	struct list_head	keyring_name_list;
+	struct key		*user_keyring_register;
+	struct rw_semaphore	keyring_sem;
 #endif
 
 	/* Register of per-UID persistent keyrings for this namespace */
 #ifdef CONFIG_PERSISTENT_KEYRINGS
 	struct key		*persistent_keyring_register;
-	struct rw_semaphore	persistent_keyring_register_sem;
 #endif
 	struct work_struct	work;
 #ifdef CONFIG_SYSCTL
diff --git a/kernel/user.c b/kernel/user.c
index 50979fd1b7aa..f8519b62cf9a 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -64,10 +64,7 @@ struct user_namespace init_user_ns = {
 	.flags = USERNS_INIT_FLAGS,
 #ifdef CONFIG_KEYS
 	.keyring_name_list = LIST_HEAD_INIT(init_user_ns.keyring_name_list),
-#endif
-#ifdef CONFIG_PERSISTENT_KEYRINGS
-	.persistent_keyring_register_sem =
-	__RWSEM_INITIALIZER(init_user_ns.persistent_keyring_register_sem),
+	.keyring_sem = __RWSEM_INITIALIZER(init_user_ns.keyring_sem),
 #endif
 };
 EXPORT_SYMBOL_GPL(init_user_ns);
@@ -143,8 +140,6 @@ static void free_user(struct user_struct *up, unsigned long flags)
 {
 	uid_hash_remove(up);
 	spin_unlock_irqrestore(&uidhash_lock, flags);
-	key_put(up->uid_keyring);
-	key_put(up->session_keyring);
 	kmem_cache_free(uid_cachep, up);
 }
 
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index bda6e890ad88..c87c2ecc7085 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -135,9 +135,7 @@ int create_user_ns(struct cred *new)
 
 #ifdef CONFIG_KEYS
 	INIT_LIST_HEAD(&ns->keyring_name_list);
-#endif
-#ifdef CONFIG_PERSISTENT_KEYRINGS
-	init_rwsem(&ns->persistent_keyring_register_sem);
+	init_rwsem(&ns->keyring_sem);
 #endif
 	ret = -ENOMEM;
 	if (!setup_userns_sysctls(ns))
diff --git a/security/keys/internal.h b/security/keys/internal.h
index aa361299a3ec..d3a9439e2386 100644
--- a/security/keys/internal.h
+++ b/security/keys/internal.h
@@ -148,7 +148,8 @@ extern key_ref_t search_process_keyrings_rcu(struct keyring_search_context *ctx)
 
 extern struct key *find_keyring_by_name(const char *name, bool uid_keyring);
 
-extern int install_user_keyrings(void);
+extern int look_up_user_keyrings(struct key **, struct key **);
+extern struct key *get_user_session_keyring_rcu(const struct cred *);
 extern int install_thread_keyring_to_cred(struct cred *);
 extern int install_process_keyring_to_cred(struct cred *);
 extern int install_session_keyring_to_cred(struct cred *, struct key *);
diff --git a/security/keys/keyring.c b/security/keys/keyring.c
index fe851292509e..3663e5168583 100644
--- a/security/keys/keyring.c
+++ b/security/keys/keyring.c
@@ -62,6 +62,7 @@ void key_free_user_ns(struct user_namespace *ns)
 	list_del_init(&ns->keyring_name_list);
 	write_unlock(&keyring_name_lock);
 
+	key_put(ns->user_keyring_register);
 #ifdef CONFIG_PERSISTENT_KEYRINGS
 	key_put(ns->persistent_keyring_register);
 #endif
diff --git a/security/keys/persistent.c b/security/keys/persistent.c
index fc29ec59efa7..90303fe4a394 100644
--- a/security/keys/persistent.c
+++ b/security/keys/persistent.c
@@ -91,9 +91,9 @@ static long key_get_persistent(struct user_namespace *ns, kuid_t uid,
 
 	if (ns->persistent_keyring_register) {
 		reg_ref = make_key_ref(ns->persistent_keyring_register, true);
-		down_read(&ns->persistent_keyring_register_sem);
+		down_read(&ns->keyring_sem);
 		persistent_ref = find_key_to_update(reg_ref, &index_key);
-		up_read(&ns->persistent_keyring_register_sem);
+		up_read(&ns->keyring_sem);
 
 		if (persistent_ref)
 			goto found;
@@ -102,9 +102,9 @@ static long key_get_persistent(struct user_namespace *ns, kuid_t uid,
 	/* It wasn't in the register, so we'll need to create it.  We might
 	 * also need to create the register.
 	 */
-	down_write(&ns->persistent_keyring_register_sem);
+	down_write(&ns->keyring_sem);
 	persistent_ref = key_create_persistent(ns, uid, &index_key);
-	up_write(&ns->persistent_keyring_register_sem);
+	up_write(&ns->keyring_sem);
 	if (!IS_ERR(persistent_ref))
 		goto found;
 
diff --git a/security/keys/process_keys.c b/security/keys/process_keys.c
index b07f768d23dc..f74d64215942 100644
--- a/security/keys/process_keys.c
+++ b/security/keys/process_keys.c
@@ -19,15 +19,13 @@
 #include <linux/security.h>
 #include <linux/user_namespace.h>
 #include <linux/uaccess.h>
+#include <linux/init_task.h>
 #include <keys/request_key_auth-type.h>
 #include "internal.h"
 
 /* Session keyring create vs join semaphore */
 static DEFINE_MUTEX(key_session_mutex);
 
-/* User keyring creation semaphore */
-static DEFINE_MUTEX(key_user_keyring_mutex);
-
 /* The root user's tracking struct */
 struct key_user root_key_user = {
 	.usage		= REFCOUNT_INIT(3),
@@ -39,98 +37,185 @@ struct key_user root_key_user = {
 };
 
 /*
- * Install the user and user session keyrings for the current process's UID.
+ * Get or create a user register keyring.
+ */
+static struct key *get_user_register(struct user_namespace *user_ns)
+{
+	struct key *reg_keyring = READ_ONCE(user_ns->user_keyring_register);
+
+	if (reg_keyring)
+		return reg_keyring;
+
+	down_write(&user_ns->keyring_sem);
+
+	/* Make sure there's a register keyring.  It gets owned by the
+	 * user_namespace's owner.
+	 */
+	reg_keyring = user_ns->user_keyring_register;
+	if (!reg_keyring) {
+		reg_keyring = keyring_alloc(".user_reg",
+					    user_ns->owner, INVALID_GID,
+					    &init_cred,
+					    KEY_POS_WRITE | KEY_POS_SEARCH |
+					    KEY_USR_VIEW | KEY_USR_READ,
+					    0,
+					    NULL, NULL);
+		if (!IS_ERR(reg_keyring))
+			smp_store_release(&user_ns->user_keyring_register,
+					  reg_keyring);
+	}
+
+	up_write(&user_ns->keyring_sem);
+
+	/* We don't return a ref since the keyring is pinned by the user_ns */
+	return reg_keyring;
+}
+
+/*
+ * Look up the user and user session keyrings for the current process's UID,
+ * creating them if they don't exist.
  */
-int install_user_keyrings(void)
+int look_up_user_keyrings(struct key **_user_keyring,
+			  struct key **_user_session_keyring)
 {
-	struct user_struct *user;
-	const struct cred *cred;
-	struct key *uid_keyring, *session_keyring;
+	const struct cred *cred = current_cred();
+	struct user_namespace *user_ns = current_user_ns();
+	struct key *reg_keyring, *uid_keyring, *session_keyring;
 	key_perm_t user_keyring_perm;
+	key_ref_t uid_keyring_r, session_keyring_r;
+	uid_t uid = from_kuid(user_ns, cred->user->uid);
 	char buf[20];
 	int ret;
-	uid_t uid;
 
 	user_keyring_perm = (KEY_POS_ALL & ~KEY_POS_SETATTR) | KEY_USR_ALL;
-	cred = current_cred();
-	user = cred->user;
-	uid = from_kuid(cred->user_ns, user->uid);
 
-	kenter("%p{%u}", user, uid);
+	kenter("%u", uid);
 
-	if (READ_ONCE(user->uid_keyring) && READ_ONCE(user->session_keyring)) {
-		kleave(" = 0 [exist]");
-		return 0;
-	}
+	reg_keyring = get_user_register(user_ns);
+	if (IS_ERR(reg_keyring))
+		return PTR_ERR(reg_keyring);
 
-	mutex_lock(&key_user_keyring_mutex);
+	down_write(&user_ns->keyring_sem);
 	ret = 0;
 
-	if (!user->uid_keyring) {
-		/* get the UID-specific keyring
-		 * - there may be one in existence already as it may have been
-		 *   pinned by a session, but the user_struct pointing to it
-		 *   may have been destroyed by setuid */
-		sprintf(buf, "_uid.%u", uid);
-
-		uid_keyring = find_keyring_by_name(buf, true);
+	/* Get the user keyring.  Note that there may be one in existence
+	 * already as it may have been pinned by a session, but the user_struct
+	 * pointing to it may have been destroyed by setuid.
+	 */
+	snprintf(buf, sizeof(buf), "_uid.%u", uid);
+	uid_keyring_r = keyring_search(make_key_ref(reg_keyring, true),
+				       &key_type_keyring, buf, false);
+	kdebug("_uid %p", uid_keyring_r);
+	if (uid_keyring_r == ERR_PTR(-EAGAIN)) {
+		uid_keyring = keyring_alloc(buf, cred->user->uid, INVALID_GID,
+					    cred, user_keyring_perm,
+					    KEY_ALLOC_UID_KEYRING |
+					    KEY_ALLOC_IN_QUOTA,
+					    NULL, reg_keyring);
 		if (IS_ERR(uid_keyring)) {
-			uid_keyring = keyring_alloc(buf, user->uid, INVALID_GID,
-						    cred, user_keyring_perm,
-						    KEY_ALLOC_UID_KEYRING |
-							KEY_ALLOC_IN_QUOTA,
-						    NULL, NULL);
-			if (IS_ERR(uid_keyring)) {
-				ret = PTR_ERR(uid_keyring);
-				goto error;
-			}
+			ret = PTR_ERR(uid_keyring);
+			goto error;
 		}
+	} else if (IS_ERR(uid_keyring_r)) {
+		ret = PTR_ERR(uid_keyring_r);
+		goto error;
+	} else {
+		uid_keyring = key_ref_to_ptr(uid_keyring_r);
+	}
 
-		/* get a default session keyring (which might also exist
-		 * already) */
-		sprintf(buf, "_uid_ses.%u", uid);
-
-		session_keyring = find_keyring_by_name(buf, true);
+	/* Get a default session keyring (which might also exist already) */
+	snprintf(buf, sizeof(buf), "_uid_ses.%u", uid);
+	session_keyring_r = keyring_search(make_key_ref(reg_keyring, true),
+					   &key_type_keyring, buf, false);
+	kdebug("_uid_ses %p", session_keyring_r);
+	if (session_keyring_r == ERR_PTR(-EAGAIN)) {
+		session_keyring = keyring_alloc(buf, cred->user->uid, INVALID_GID,
+						cred, user_keyring_perm,
+						KEY_ALLOC_UID_KEYRING |
+						KEY_ALLOC_IN_QUOTA,
+						NULL, NULL);
 		if (IS_ERR(session_keyring)) {
-			session_keyring =
-				keyring_alloc(buf, user->uid, INVALID_GID,
-					      cred, user_keyring_perm,
-					      KEY_ALLOC_UID_KEYRING |
-						  KEY_ALLOC_IN_QUOTA,
-					      NULL, NULL);
-			if (IS_ERR(session_keyring)) {
-				ret = PTR_ERR(session_keyring);
-				goto error_release;
-			}
-
-			/* we install a link from the user session keyring to
-			 * the user keyring */
-			ret = key_link(session_keyring, uid_keyring);
-			if (ret < 0)
-				goto error_release_both;
+			ret = PTR_ERR(session_keyring);
+			goto error_release;
 		}
 
-		/* install the keyrings */
-		/* paired with READ_ONCE() */
-		smp_store_release(&user->uid_keyring, uid_keyring);
-		/* paired with READ_ONCE() */
-		smp_store_release(&user->session_keyring, session_keyring);
+		/* We install a link from the user session keyring to
+		 * the user keyring.
+		 */
+		ret = key_link(session_keyring, uid_keyring);
+		if (ret < 0)
+			goto error_release_session;
+
+		/* And only then link the user-session keyring to the
+		 * register.
+		 */
+		ret = key_link(reg_keyring, session_keyring);
+		if (ret < 0)
+			goto error_release_session;
+	} else if (IS_ERR(session_keyring_r)) {
+		ret = PTR_ERR(session_keyring_r);
+		goto error_release;
+	} else {
+		session_keyring = key_ref_to_ptr(session_keyring_r);
 	}
 
-	mutex_unlock(&key_user_keyring_mutex);
+	up_write(&user_ns->keyring_sem);
+
+	if (_user_session_keyring)
+		*_user_session_keyring = session_keyring;
+	else
+		key_put(session_keyring);
+	if (_user_keyring)
+		*_user_keyring = uid_keyring;
+	else
+		key_put(uid_keyring);
 	kleave(" = 0");
 	return 0;
 
-error_release_both:
+error_release_session:
 	key_put(session_keyring);
 error_release:
 	key_put(uid_keyring);
 error:
-	mutex_unlock(&key_user_keyring_mutex);
+	up_write(&user_ns->keyring_sem);
 	kleave(" = %d", ret);
 	return ret;
 }
 
+/*
+ * Get the user session keyring if it exists, but don't create it if it
+ * doesn't.
+ */
+struct key *get_user_session_keyring_rcu(const struct cred *cred)
+{
+	struct key *reg_keyring = READ_ONCE(cred->user_ns->user_keyring_register);
+	key_ref_t session_keyring_r;
+	char buf[20];
+
+	struct keyring_search_context ctx = {
+		.index_key.type		= &key_type_keyring,
+		.index_key.description	= buf,
+		.cred			= cred,
+		.match_data.cmp		= key_default_cmp,
+		.match_data.raw_data	= buf,
+		.match_data.lookup_type	= KEYRING_SEARCH_LOOKUP_DIRECT,
+		.flags			= KEYRING_SEARCH_DO_STATE_CHECK,
+	};
+
+	if (!reg_keyring)
+		return NULL;
+
+	ctx.index_key.desc_len = snprintf(buf, sizeof(buf), "_uid_ses.%u",
+					  from_kuid(cred->user_ns,
+						    cred->user->uid));
+
+	session_keyring_r = keyring_search_rcu(make_key_ref(reg_keyring, true),
+					       &ctx);
+	if (IS_ERR(session_keyring_r))
+		return NULL;
+	return key_ref_to_ptr(session_keyring_r);
+}
+
 /*
  * Install a thread keyring to the given credentials struct if it didn't have
  * one already.  This is allowed to overrun the quota.
@@ -340,6 +425,7 @@ void key_fsgid_changed(struct cred *new_cred)
  */
 key_ref_t search_cred_keyrings_rcu(struct keyring_search_context *ctx)
 {
+	struct key *user_session;
 	key_ref_t key_ref, ret, err;
 	const struct cred *cred = ctx->cred;
 
@@ -415,10 +501,11 @@ key_ref_t search_cred_keyrings_rcu(struct keyring_search_context *ctx)
 		}
 	}
 	/* or search the user-session keyring */
-	else if (READ_ONCE(cred->user->session_keyring)) {
-		key_ref = keyring_search_rcu(
-			make_key_ref(READ_ONCE(cred->user->session_keyring), 1),
-			ctx);
+	else if ((user_session = get_user_session_keyring_rcu(cred))) {
+		key_ref = keyring_search_rcu(make_key_ref(user_session, 1),
+					     ctx);
+		key_put(user_session);
+
 		if (!IS_ERR(key_ref))
 			goto found;
 
@@ -535,7 +622,7 @@ key_ref_t lookup_user_key(key_serial_t id, unsigned long lflags,
 					   KEYRING_SEARCH_RECURSE),
 	};
 	struct request_key_auth *rka;
-	struct key *key;
+	struct key *key, *user_session;
 	key_ref_t key_ref, skey_ref;
 	int ret;
 
@@ -584,20 +671,20 @@ try_again:
 		if (!ctx.cred->session_keyring) {
 			/* always install a session keyring upon access if one
 			 * doesn't exist yet */
-			ret = install_user_keyrings();
+			ret = look_up_user_keyrings(NULL, &user_session);
 			if (ret < 0)
 				goto error;
 			if (lflags & KEY_LOOKUP_CREATE)
 				ret = join_session_keyring(NULL);
 			else
-				ret = install_session_keyring(
-					ctx.cred->user->session_keyring);
+				ret = install_session_keyring(user_session);
 
+			key_put(user_session);
 			if (ret < 0)
 				goto error;
 			goto reget_creds;
-		} else if (ctx.cred->session_keyring ==
-			   READ_ONCE(ctx.cred->user->session_keyring) &&
+		} else if (test_bit(KEY_FLAG_UID_KEYRING,
+				    &ctx.cred->session_keyring->flags) &&
 			   lflags & KEY_LOOKUP_CREATE) {
 			ret = join_session_keyring(NULL);
 			if (ret < 0)
@@ -611,26 +698,16 @@ try_again:
 		break;
 
 	case KEY_SPEC_USER_KEYRING:
-		if (!READ_ONCE(ctx.cred->user->uid_keyring)) {
-			ret = install_user_keyrings();
-			if (ret < 0)
-				goto error;
-		}
-
-		key = ctx.cred->user->uid_keyring;
-		__key_get(key);
+		ret = look_up_user_keyrings(&key, NULL);
+		if (ret < 0)
+			goto error;
 		key_ref = make_key_ref(key, 1);
 		break;
 
 	case KEY_SPEC_USER_SESSION_KEYRING:
-		if (!READ_ONCE(ctx.cred->user->session_keyring)) {
-			ret = install_user_keyrings();
-			if (ret < 0)
-				goto error;
-		}
-
-		key = ctx.cred->user->session_keyring;
-		__key_get(key);
+		ret = look_up_user_keyrings(NULL, &key);
+		if (ret < 0)
+			goto error;
 		key_ref = make_key_ref(key, 1);
 		break;
 
@@ -879,7 +956,7 @@ void key_change_session_keyring(struct callback_head *twork)
  */
 static int __init init_root_keyring(void)
 {
-	return install_user_keyrings();
+	return look_up_user_keyrings(NULL, NULL);
 }
 
 late_initcall(init_root_keyring);
diff --git a/security/keys/request_key.c b/security/keys/request_key.c
index 1ffd3803ce29..9201ca96c4df 100644
--- a/security/keys/request_key.c
+++ b/security/keys/request_key.c
@@ -121,7 +121,7 @@ static int call_sbin_request_key(struct key *authkey, void *aux)
 	struct request_key_auth *rka = get_request_key_auth(authkey);
 	const struct cred *cred = current_cred();
 	key_serial_t prkey, sskey;
-	struct key *key = rka->target_key, *keyring, *session;
+	struct key *key = rka->target_key, *keyring, *session, *user_session;
 	char *argv[9], *envp[3], uid_str[12], gid_str[12];
 	char key_str[12], keyring_str[3][12];
 	char desc[20];
@@ -129,9 +129,9 @@ static int call_sbin_request_key(struct key *authkey, void *aux)
 
 	kenter("{%d},{%d},%s", key->serial, authkey->serial, rka->op);
 
-	ret = install_user_keyrings();
+	ret = look_up_user_keyrings(NULL, &user_session);
 	if (ret < 0)
-		goto error_alloc;
+		goto error_us;
 
 	/* allocate a new session keyring */
 	sprintf(desc, "_req.%u", key->serial);
@@ -169,7 +169,7 @@ static int call_sbin_request_key(struct key *authkey, void *aux)
 
 	session = cred->session_keyring;
 	if (!session)
-		session = cred->user->session_keyring;
+		session = user_session;
 	sskey = session->serial;
 
 	sprintf(keyring_str[2], "%d", sskey);
@@ -211,6 +211,8 @@ error_link:
 	key_put(keyring);
 
 error_alloc:
+	key_put(user_session);
+error_us:
 	complete_request_key(authkey, ret);
 	kleave(" = %d", ret);
 	return ret;
@@ -317,13 +319,15 @@ static int construct_get_dest_keyring(struct key **_dest_keyring)
 
 			/* fall through */
 		case KEY_REQKEY_DEFL_USER_SESSION_KEYRING:
-			dest_keyring =
-				key_get(READ_ONCE(cred->user->session_keyring));
+			ret = look_up_user_keyrings(NULL, &dest_keyring);
+			if (ret < 0)
+				return ret;
 			break;
 
 		case KEY_REQKEY_DEFL_USER_KEYRING:
-			dest_keyring =
-				key_get(READ_ONCE(cred->user->uid_keyring));
+			ret = look_up_user_keyrings(&dest_keyring, NULL);
+			if (ret < 0)
+				return ret;
 			break;
 
 		case KEY_REQKEY_DEFL_GROUP_KEYRING:
-- 
cgit v1.2.3


From 3b6e4de05e9ee2e2f94e4a3fe14d945e2418d9a8 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Wed, 26 Jun 2019 21:02:32 +0100
Subject: keys: Include target namespace in match criteria

Currently a key has standard matching criteria of { type, description },
and these are used to only allow keys with unique criteria in a keyring.
This means, however, that you cannot have keys with the same type and
description but a different target namespace in the same keyring.

This is a potential problem for a containerised environment where, say, a
container is made up of some parts of its mount space involving netfs
superblocks from two different network namespaces.

This is also a problem for shared system management keyrings such as the
DNS records keyring or the NFS idmapper keyring that might contain keys
from different network namespaces.

Fix this by including a namespace component in a key's matching criteria.
Keyring types are marked to indicate which, if any, namespace is relevant
to keys of that type, and that namespace is set when the key is created
from the current task's namespace set.

The capability bit KEYCTL_CAPS1_NS_KEY_TAG is set if the kernel is
employing this feature.

Signed-off-by: David Howells <dhowells@redhat.com>
---
 include/linux/key.h         | 10 ++++++++++
 include/uapi/linux/keyctl.h |  1 +
 security/keys/gc.c          |  2 +-
 security/keys/key.c         |  1 +
 security/keys/keyctl.c      |  3 ++-
 security/keys/keyring.c     | 36 ++++++++++++++++++++++++++++++++++--
 security/keys/persistent.c  |  1 +
 7 files changed, 50 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/key.h b/include/linux/key.h
index ae1177302d70..abc68555bac3 100644
--- a/include/linux/key.h
+++ b/include/linux/key.h
@@ -82,9 +82,16 @@ struct cred;
 
 struct key_type;
 struct key_owner;
+struct key_tag;
 struct keyring_list;
 struct keyring_name;
 
+struct key_tag {
+	struct rcu_head		rcu;
+	refcount_t		usage;
+	bool			removed;	/* T when subject removed */
+};
+
 struct keyring_index_key {
 	/* [!] If this structure is altered, the union in struct key must change too! */
 	unsigned long		hash;			/* Hash value */
@@ -101,6 +108,7 @@ struct keyring_index_key {
 		unsigned long x;
 	};
 	struct key_type		*type;
+	struct key_tag		*domain_tag;	/* Domain of operation */
 	const char		*description;
 };
 
@@ -218,6 +226,7 @@ struct key {
 			unsigned long	hash;
 			unsigned long	len_desc;
 			struct key_type	*type;		/* type of key */
+			struct key_tag	*domain_tag;	/* Domain of operation */
 			char		*description;
 		};
 	};
@@ -268,6 +277,7 @@ extern struct key *key_alloc(struct key_type *type,
 extern void key_revoke(struct key *key);
 extern void key_invalidate(struct key *key);
 extern void key_put(struct key *key);
+extern bool key_put_tag(struct key_tag *tag);
 
 static inline struct key *__key_get(struct key *key)
 {
diff --git a/include/uapi/linux/keyctl.h b/include/uapi/linux/keyctl.h
index 35b405034674..ed3d5893830d 100644
--- a/include/uapi/linux/keyctl.h
+++ b/include/uapi/linux/keyctl.h
@@ -129,5 +129,6 @@ struct keyctl_pkey_params {
 #define KEYCTL_CAPS0_RESTRICT_KEYRING	0x40 /* KEYCTL_RESTRICT_KEYRING supported */
 #define KEYCTL_CAPS0_MOVE		0x80 /* KEYCTL_MOVE supported */
 #define KEYCTL_CAPS1_NS_KEYRING_NAME	0x01 /* Keyring names are per-user_namespace */
+#define KEYCTL_CAPS1_NS_KEY_TAG		0x02 /* Key indexing can include a namespace tag */
 
 #endif /*  _LINUX_KEYCTL_H */
diff --git a/security/keys/gc.c b/security/keys/gc.c
index 634e96b380e8..83d279fb7793 100644
--- a/security/keys/gc.c
+++ b/security/keys/gc.c
@@ -154,7 +154,7 @@ static noinline void key_gc_unused_keys(struct list_head *keys)
 			atomic_dec(&key->user->nikeys);
 
 		key_user_put(key->user);
-
+		key_put_tag(key->domain_tag);
 		kfree(key->description);
 
 		memzero_explicit(key, sizeof(*key));
diff --git a/security/keys/key.c b/security/keys/key.c
index 9d52f2472a09..85fdc2ea6c14 100644
--- a/security/keys/key.c
+++ b/security/keys/key.c
@@ -317,6 +317,7 @@ struct key *key_alloc(struct key_type *type, const char *desc,
 		goto security_error;
 
 	/* publish the key by giving it a serial number */
+	refcount_inc(&key->domain_tag->usage);
 	atomic_inc(&user->nkeys);
 	key_alloc_serial(key);
 
diff --git a/security/keys/keyctl.c b/security/keys/keyctl.c
index 8a813220f269..4bb5781d3ddf 100644
--- a/security/keys/keyctl.c
+++ b/security/keys/keyctl.c
@@ -40,7 +40,8 @@ static const unsigned char keyrings_capabilities[2] = {
 	       KEYCTL_CAPS0_RESTRICT_KEYRING |
 	       KEYCTL_CAPS0_MOVE
 	       ),
-	[1] = (KEYCTL_CAPS1_NS_KEYRING_NAME),
+	[1] = (KEYCTL_CAPS1_NS_KEYRING_NAME |
+	       KEYCTL_CAPS1_NS_KEY_TAG),
 };
 
 static int key_get_type_from_user(char *type,
diff --git a/security/keys/keyring.c b/security/keys/keyring.c
index 3663e5168583..0da8fa282d56 100644
--- a/security/keys/keyring.c
+++ b/security/keys/keyring.c
@@ -175,6 +175,9 @@ static void hash_key_type_and_desc(struct keyring_index_key *index_key)
 	type = (unsigned long)index_key->type;
 	acc = mult_64x32_and_fold(type, desc_len + 13);
 	acc = mult_64x32_and_fold(acc, 9207);
+	piece = (unsigned long)index_key->domain_tag;
+	acc = mult_64x32_and_fold(acc, piece);
+	acc = mult_64x32_and_fold(acc, 9207);
 
 	for (;;) {
 		n = desc_len;
@@ -208,16 +211,36 @@ static void hash_key_type_and_desc(struct keyring_index_key *index_key)
 
 /*
  * Finalise an index key to include a part of the description actually in the
- * index key and to add in the hash too.
+ * index key, to set the domain tag and to calculate the hash.
  */
 void key_set_index_key(struct keyring_index_key *index_key)
 {
+	static struct key_tag default_domain_tag = { .usage = REFCOUNT_INIT(1), };
 	size_t n = min_t(size_t, index_key->desc_len, sizeof(index_key->desc));
+
 	memcpy(index_key->desc, index_key->description, n);
 
+	index_key->domain_tag = &default_domain_tag;
 	hash_key_type_and_desc(index_key);
 }
 
+/**
+ * key_put_tag - Release a ref on a tag.
+ * @tag: The tag to release.
+ *
+ * This releases a reference on the given tag and returns true if that ref was
+ * the last one.
+ */
+bool key_put_tag(struct key_tag *tag)
+{
+	if (refcount_dec_and_test(&tag->usage)) {
+		kfree_rcu(tag, rcu);
+		return true;
+	}
+
+	return false;
+}
+
 /*
  * Build the next index key chunk.
  *
@@ -238,8 +261,10 @@ static unsigned long keyring_get_key_chunk(const void *data, int level)
 		return index_key->x;
 	case 2:
 		return (unsigned long)index_key->type;
+	case 3:
+		return (unsigned long)index_key->domain_tag;
 	default:
-		level -= 3;
+		level -= 4;
 		if (desc_len <= sizeof(index_key->desc))
 			return 0;
 
@@ -268,6 +293,7 @@ static bool keyring_compare_object(const void *object, const void *data)
 	const struct key *key = keyring_ptr_to_key(object);
 
 	return key->index_key.type == index_key->type &&
+		key->index_key.domain_tag == index_key->domain_tag &&
 		key->index_key.desc_len == index_key->desc_len &&
 		memcmp(key->index_key.description, index_key->description,
 		       index_key->desc_len) == 0;
@@ -309,6 +335,12 @@ static int keyring_diff_objects(const void *object, const void *data)
 		goto differ;
 	level += sizeof(unsigned long);
 
+	seg_a = (unsigned long)a->domain_tag;
+	seg_b = (unsigned long)b->domain_tag;
+	if ((seg_a ^ seg_b) != 0)
+		goto differ;
+	level += sizeof(unsigned long);
+
 	i = sizeof(a->desc);
 	if (a->desc_len <= i)
 		goto same;
diff --git a/security/keys/persistent.c b/security/keys/persistent.c
index 90303fe4a394..9944d855a28d 100644
--- a/security/keys/persistent.c
+++ b/security/keys/persistent.c
@@ -84,6 +84,7 @@ static long key_get_persistent(struct user_namespace *ns, kuid_t uid,
 	long ret;
 
 	/* Look in the register if it exists */
+	memset(&index_key, 0, sizeof(index_key));
 	index_key.type = &key_type_keyring;
 	index_key.description = buf;
 	index_key.desc_len = sprintf(buf, "_persistent.%u", from_kuid(ns, uid));
-- 
cgit v1.2.3


From 218e6424e711ceee31eeba93212fed8ee92d6a11 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Wed, 26 Jun 2019 21:02:32 +0100
Subject: keys: Garbage collect keys for which the domain has been removed

If a key operation domain (such as a network namespace) has been removed
then attempt to garbage collect all the keys that use it.

Signed-off-by: David Howells <dhowells@redhat.com>
---
 include/linux/key.h      |  2 ++
 security/keys/internal.h |  3 ++-
 security/keys/keyring.c  | 15 +++++++++++++++
 3 files changed, 19 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/key.h b/include/linux/key.h
index abc68555bac3..60c076c6e47f 100644
--- a/include/linux/key.h
+++ b/include/linux/key.h
@@ -278,6 +278,7 @@ extern void key_revoke(struct key *key);
 extern void key_invalidate(struct key *key);
 extern void key_put(struct key *key);
 extern bool key_put_tag(struct key_tag *tag);
+extern void key_remove_domain(struct key_tag *domain_tag);
 
 static inline struct key *__key_get(struct key *key)
 {
@@ -446,6 +447,7 @@ extern void key_init(void);
 #define key_fsgid_changed(c)		do { } while(0)
 #define key_init()			do { } while(0)
 #define key_free_user_ns(ns)		do { } while(0)
+#define key_remove_domain(d)		do { } while(0)
 
 #endif /* CONFIG_KEYS */
 #endif /* __KERNEL__ */
diff --git a/security/keys/internal.h b/security/keys/internal.h
index d3a9439e2386..5a561f5f199e 100644
--- a/security/keys/internal.h
+++ b/security/keys/internal.h
@@ -209,7 +209,8 @@ static inline bool key_is_dead(const struct key *key, time64_t limit)
 	return
 		key->flags & ((1 << KEY_FLAG_DEAD) |
 			      (1 << KEY_FLAG_INVALIDATED)) ||
-		(key->expiry > 0 && key->expiry <= limit);
+		(key->expiry > 0 && key->expiry <= limit) ||
+		key->domain_tag->removed;
 }
 
 /*
diff --git a/security/keys/keyring.c b/security/keys/keyring.c
index 0da8fa282d56..d3c86fda1510 100644
--- a/security/keys/keyring.c
+++ b/security/keys/keyring.c
@@ -241,6 +241,21 @@ bool key_put_tag(struct key_tag *tag)
 	return false;
 }
 
+/**
+ * key_remove_domain - Kill off a key domain and gc its keys
+ * @domain_tag: The domain tag to release.
+ *
+ * This marks a domain tag as being dead and releases a ref on it.  If that
+ * wasn't the last reference, the garbage collector is poked to try and delete
+ * all keys that were in the domain.
+ */
+void key_remove_domain(struct key_tag *domain_tag)
+{
+	domain_tag->removed = true;
+	if (!key_put_tag(domain_tag))
+		key_schedule_gc_links();
+}
+
 /*
  * Build the next index key chunk.
  *
-- 
cgit v1.2.3


From 9b242610514fe387ef957bce05e1fdd3efd60359 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Wed, 26 Jun 2019 21:02:33 +0100
Subject: keys: Network namespace domain tag

Create key domain tags for network namespaces and make it possible to
automatically tag keys that are used by networked services (e.g. AF_RXRPC,
AFS, DNS) with the default network namespace if not set by the caller.

This allows keys with the same description but in different namespaces to
coexist within a keyring.

Signed-off-by: David Howells <dhowells@redhat.com>
cc: netdev@vger.kernel.org
cc: linux-nfs@vger.kernel.org
cc: linux-cifs@vger.kernel.org
cc: linux-afs@lists.infradead.org
---
 include/linux/key-type.h    |  3 +++
 include/net/net_namespace.h |  3 +++
 net/core/net_namespace.c    | 20 ++++++++++++++++++++
 net/dns_resolver/dns_key.c  |  1 +
 net/rxrpc/key.c             |  2 ++
 security/keys/keyring.c     |  7 ++++++-
 6 files changed, 35 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/key-type.h b/include/linux/key-type.h
index e49d1de0614e..2148a6bf58f1 100644
--- a/include/linux/key-type.h
+++ b/include/linux/key-type.h
@@ -74,6 +74,9 @@ struct key_type {
 	 */
 	size_t def_datalen;
 
+	unsigned int flags;
+#define KEY_TYPE_NET_DOMAIN	0x00000001 /* Keys of this type have a net namespace domain */
+
 	/* vet a description */
 	int (*vet_description)(const char *description);
 
diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h
index 12689ddfc24c..a56bf7fc7c2b 100644
--- a/include/net/net_namespace.h
+++ b/include/net/net_namespace.h
@@ -71,6 +71,9 @@ struct net {
 						 */
 	struct llist_node	cleanup_list;	/* namespaces on death row */
 
+#ifdef CONFIG_KEYS
+	struct key_tag		*key_domain;	/* Key domain of operation tag */
+#endif
 	struct user_namespace   *user_ns;	/* Owning user namespace */
 	struct ucounts		*ucounts;
 	spinlock_t		nsid_lock;
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index 711b161505ac..88e603b85ed2 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -38,9 +38,16 @@ EXPORT_SYMBOL_GPL(net_namespace_list);
 DECLARE_RWSEM(net_rwsem);
 EXPORT_SYMBOL_GPL(net_rwsem);
 
+#ifdef CONFIG_KEYS
+static struct key_tag init_net_key_domain = { .usage = REFCOUNT_INIT(1) };
+#endif
+
 struct net init_net = {
 	.count		= REFCOUNT_INIT(1),
 	.dev_base_head	= LIST_HEAD_INIT(init_net.dev_base_head),
+#ifdef CONFIG_KEYS
+	.key_domain	= &init_net_key_domain,
+#endif
 };
 EXPORT_SYMBOL(init_net);
 
@@ -386,10 +393,22 @@ static struct net *net_alloc(void)
 	if (!net)
 		goto out_free;
 
+#ifdef CONFIG_KEYS
+	net->key_domain = kzalloc(sizeof(struct key_tag), GFP_KERNEL);
+	if (!net->key_domain)
+		goto out_free_2;
+	refcount_set(&net->key_domain->usage, 1);
+#endif
+
 	rcu_assign_pointer(net->gen, ng);
 out:
 	return net;
 
+#ifdef CONFIG_KEYS
+out_free_2:
+	kmem_cache_free(net_cachep, net);
+	net = NULL;
+#endif
 out_free:
 	kfree(ng);
 	goto out;
@@ -566,6 +585,7 @@ static void cleanup_net(struct work_struct *work)
 	list_for_each_entry_safe(net, tmp, &net_exit_list, exit_list) {
 		list_del_init(&net->exit_list);
 		dec_net_namespaces(net->ucounts);
+		key_remove_domain(net->key_domain);
 		put_user_ns(net->user_ns);
 		net_drop_ns(net);
 	}
diff --git a/net/dns_resolver/dns_key.c b/net/dns_resolver/dns_key.c
index a65d553e730d..3e1a90669006 100644
--- a/net/dns_resolver/dns_key.c
+++ b/net/dns_resolver/dns_key.c
@@ -314,6 +314,7 @@ static long dns_resolver_read(const struct key *key,
 
 struct key_type key_type_dns_resolver = {
 	.name		= "dns_resolver",
+	.flags		= KEY_TYPE_NET_DOMAIN,
 	.preparse	= dns_resolver_preparse,
 	.free_preparse	= dns_resolver_free_preparse,
 	.instantiate	= generic_key_instantiate,
diff --git a/net/rxrpc/key.c b/net/rxrpc/key.c
index e7f6b8823eb6..2722189ec273 100644
--- a/net/rxrpc/key.c
+++ b/net/rxrpc/key.c
@@ -43,6 +43,7 @@ static long rxrpc_read(const struct key *, char __user *, size_t);
  */
 struct key_type key_type_rxrpc = {
 	.name		= "rxrpc",
+	.flags		= KEY_TYPE_NET_DOMAIN,
 	.preparse	= rxrpc_preparse,
 	.free_preparse	= rxrpc_free_preparse,
 	.instantiate	= generic_key_instantiate,
@@ -58,6 +59,7 @@ EXPORT_SYMBOL(key_type_rxrpc);
  */
 struct key_type key_type_rxrpc_s = {
 	.name		= "rxrpc_s",
+	.flags		= KEY_TYPE_NET_DOMAIN,
 	.vet_description = rxrpc_vet_description_s,
 	.preparse	= rxrpc_preparse_s,
 	.free_preparse	= rxrpc_free_preparse_s,
diff --git a/security/keys/keyring.c b/security/keys/keyring.c
index d3c86fda1510..bca070f6ab46 100644
--- a/security/keys/keyring.c
+++ b/security/keys/keyring.c
@@ -17,10 +17,12 @@
 #include <linux/seq_file.h>
 #include <linux/err.h>
 #include <linux/user_namespace.h>
+#include <linux/nsproxy.h>
 #include <keys/keyring-type.h>
 #include <keys/user-type.h>
 #include <linux/assoc_array_priv.h>
 #include <linux/uaccess.h>
+#include <net/net_namespace.h>
 #include "internal.h"
 
 /*
@@ -220,7 +222,10 @@ void key_set_index_key(struct keyring_index_key *index_key)
 
 	memcpy(index_key->desc, index_key->description, n);
 
-	index_key->domain_tag = &default_domain_tag;
+	if (index_key->type->flags & KEY_TYPE_NET_DOMAIN)
+		index_key->domain_tag = current->nsproxy->net_ns->key_domain;
+	else
+		index_key->domain_tag = &default_domain_tag;
 	hash_key_type_and_desc(index_key);
 }
 
-- 
cgit v1.2.3


From 96125bf9985a75db00496dd2bc9249b777d2b19b Mon Sep 17 00:00:00 2001
From: Dave Taht <dave.taht@gmail.com>
Date: Sat, 22 Jun 2019 10:07:34 -0700
Subject: Allow 0.0.0.0/8 as a valid address range
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The longstanding prohibition against using 0.0.0.0/8 dates back
to two issues with the early internet.

There was an interoperability problem with BSD 4.2 in 1984, fixed in
BSD 4.3 in 1986. BSD 4.2 has long since been retired.

Secondly, addresses of the form 0.x.y.z were initially defined only as
a source address in an ICMP datagram, indicating "node number x.y.z on
this IPv4 network", by nodes that know their address on their local
network, but do not yet know their network prefix, in RFC0792 (page
19).  This usage of 0.x.y.z was later repealed in RFC1122 (section
3.2.2.7), because the original ICMP-based mechanism for learning the
network prefix was unworkable on many networks such as Ethernet (which
have longer addresses that would not fit into the 24 "node number"
bits).  Modern networks use reverse ARP (RFC0903) or BOOTP (RFC0951)
or DHCP (RFC2131) to find their full 32-bit address and CIDR netmask
(and other parameters such as default gateways). 0.x.y.z has had
16,777,215 addresses in 0.0.0.0/8 space left unused and reserved for
future use, since 1989.

This patch allows for these 16m new IPv4 addresses to appear within
a box or on the wire. Layer 2 switches don't care.

0.0.0.0/32 is still prohibited, of course.

Signed-off-by: Dave Taht <dave.taht@gmail.com>
Signed-off-by: John Gilmore <gnu@toad.com>
Acked-by: Toke Høiland-Jørgensen <toke@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/in.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/in.h b/include/linux/in.h
index 4d2fedfb753a..1873ef642605 100644
--- a/include/linux/in.h
+++ b/include/linux/in.h
@@ -63,7 +63,7 @@ static inline bool ipv4_is_all_snoopers(__be32 addr)
 
 static inline bool ipv4_is_zeronet(__be32 addr)
 {
-	return (addr & htonl(0xff000000)) == htonl(0x00000000);
+	return (addr == 0);
 }
 
 /* Special-Use IPv4 Addresses (RFC3330) */
-- 
cgit v1.2.3


From 2a6a7aacd4e557a4c7007f8858bcc9654b098fea Mon Sep 17 00:00:00 2001
From: Matti Vaittinen <matti.vaittinen@fi.rohmeurope.com>
Date: Mon, 3 Jun 2019 10:24:32 +0300
Subject: mfd: regulator: clk: Split rohm-bd718x7.h

Split bd718x7.h into ROHM-common and bd718x7-specific parts
so that we do not need to add the same things to every new ROHM
PMIC header. Please note that this change also requires changes
in the bd718x7 sub-device drivers for regulators and clk.

Signed-off-by: Matti Vaittinen <matti.vaittinen@fi.rohmeurope.com>
Acked-by: Mark Brown <broonie@kernel.org>
Acked-by: Stephen Boyd <sboyd@kernel.org>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 drivers/clk/clk-bd718x7.c             |  6 +++---
 drivers/mfd/rohm-bd718x7.c            | 23 ++++++++++++-----------
 drivers/regulator/bd718x7-regulator.c | 25 +++++++++++++------------
 include/linux/mfd/rohm-bd718x7.h      | 22 ++++++++--------------
 include/linux/mfd/rohm-generic.h      | 20 ++++++++++++++++++++
 5 files changed, 56 insertions(+), 40 deletions(-)
 create mode 100644 include/linux/mfd/rohm-generic.h

(limited to 'include/linux')

diff --git a/drivers/clk/clk-bd718x7.c b/drivers/clk/clk-bd718x7.c
index 60422c72d142..461228ebf703 100644
--- a/drivers/clk/clk-bd718x7.c
+++ b/drivers/clk/clk-bd718x7.c
@@ -17,7 +17,7 @@ struct bd718xx_clk {
 	u8 reg;
 	u8 mask;
 	struct platform_device *pdev;
-	struct bd718xx *mfd;
+	struct rohm_regmap_dev *mfd;
 };
 
 static int bd71837_clk_set(struct clk_hw *hw, int status)
@@ -68,7 +68,7 @@ static int bd71837_clk_probe(struct platform_device *pdev)
 	int rval = -ENOMEM;
 	const char *parent_clk;
 	struct device *parent = pdev->dev.parent;
-	struct bd718xx *mfd = dev_get_drvdata(parent);
+	struct rohm_regmap_dev *mfd = dev_get_drvdata(parent);
 	struct clk_init_data init = {
 		.name = "bd718xx-32k-out",
 		.ops = &bd71837_clk_ops,
@@ -119,5 +119,5 @@ static struct platform_driver bd71837_clk = {
 module_platform_driver(bd71837_clk);
 
 MODULE_AUTHOR("Matti Vaittinen <matti.vaittinen@fi.rohmeurope.com>");
-MODULE_DESCRIPTION("BD71837 chip clk driver");
+MODULE_DESCRIPTION("BD71837/BD71847 chip clk driver");
 MODULE_LICENSE("GPL");
diff --git a/drivers/mfd/rohm-bd718x7.c b/drivers/mfd/rohm-bd718x7.c
index a29d529a96f4..7beb444a57cb 100644
--- a/drivers/mfd/rohm-bd718x7.c
+++ b/drivers/mfd/rohm-bd718x7.c
@@ -98,18 +98,19 @@ static int bd718xx_i2c_probe(struct i2c_client *i2c,
 		return -ENOMEM;
 
 	bd718xx->chip_irq = i2c->irq;
-	bd718xx->chip_type = (unsigned int)(uintptr_t)
+	bd718xx->chip.chip_type = (unsigned int)(uintptr_t)
 				of_device_get_match_data(&i2c->dev);
-	bd718xx->dev = &i2c->dev;
+	bd718xx->chip.dev = &i2c->dev;
 	dev_set_drvdata(&i2c->dev, bd718xx);
 
-	bd718xx->regmap = devm_regmap_init_i2c(i2c, &bd718xx_regmap_config);
-	if (IS_ERR(bd718xx->regmap)) {
+	bd718xx->chip.regmap = devm_regmap_init_i2c(i2c,
+						    &bd718xx_regmap_config);
+	if (IS_ERR(bd718xx->chip.regmap)) {
 		dev_err(&i2c->dev, "regmap initialization failed\n");
-		return PTR_ERR(bd718xx->regmap);
+		return PTR_ERR(bd718xx->chip.regmap);
 	}
 
-	ret = devm_regmap_add_irq_chip(&i2c->dev, bd718xx->regmap,
+	ret = devm_regmap_add_irq_chip(&i2c->dev, bd718xx->chip.regmap,
 				       bd718xx->chip_irq, IRQF_ONESHOT, 0,
 				       &bd718xx_irq_chip, &bd718xx->irq_data);
 	if (ret) {
@@ -118,7 +119,7 @@ static int bd718xx_i2c_probe(struct i2c_client *i2c,
 	}
 
 	/* Configure short press to 10 milliseconds */
-	ret = regmap_update_bits(bd718xx->regmap,
+	ret = regmap_update_bits(bd718xx->chip.regmap,
 				 BD718XX_REG_PWRONCONFIG0,
 				 BD718XX_PWRBTN_PRESS_DURATION_MASK,
 				 BD718XX_PWRBTN_SHORT_PRESS_10MS);
@@ -129,7 +130,7 @@ static int bd718xx_i2c_probe(struct i2c_client *i2c,
 	}
 
 	/* Configure long press to 10 seconds */
-	ret = regmap_update_bits(bd718xx->regmap,
+	ret = regmap_update_bits(bd718xx->chip.regmap,
 				 BD718XX_REG_PWRONCONFIG1,
 				 BD718XX_PWRBTN_PRESS_DURATION_MASK,
 				 BD718XX_PWRBTN_LONG_PRESS_10S);
@@ -149,7 +150,7 @@ static int bd718xx_i2c_probe(struct i2c_client *i2c,
 
 	button.irq = ret;
 
-	ret = devm_mfd_add_devices(bd718xx->dev, PLATFORM_DEVID_AUTO,
+	ret = devm_mfd_add_devices(bd718xx->chip.dev, PLATFORM_DEVID_AUTO,
 				   bd718xx_mfd_cells,
 				   ARRAY_SIZE(bd718xx_mfd_cells), NULL, 0,
 				   regmap_irq_get_domain(bd718xx->irq_data));
@@ -162,11 +163,11 @@ static int bd718xx_i2c_probe(struct i2c_client *i2c,
 static const struct of_device_id bd718xx_of_match[] = {
 	{
 		.compatible = "rohm,bd71837",
-		.data = (void *)BD718XX_TYPE_BD71837,
+		.data = (void *)ROHM_CHIP_TYPE_BD71837,
 	},
 	{
 		.compatible = "rohm,bd71847",
-		.data = (void *)BD718XX_TYPE_BD71847,
+		.data = (void *)ROHM_CHIP_TYPE_BD71847,
 	},
 	{ }
 };
diff --git a/drivers/regulator/bd718x7-regulator.c b/drivers/regulator/bd718x7-regulator.c
index fde4264da6ff..ef2fc175a9ae 100644
--- a/drivers/regulator/bd718x7-regulator.c
+++ b/drivers/regulator/bd718x7-regulator.c
@@ -1152,12 +1152,12 @@ static int bd718xx_probe(struct platform_device *pdev)
 {
 	struct bd718xx *mfd;
 	struct regulator_config config = { 0 };
-	struct bd718xx_pmic_inits pmic_regulators[] = {
-		[BD718XX_TYPE_BD71837] = {
+	struct bd718xx_pmic_inits pmic_regulators[ROHM_CHIP_TYPE_AMOUNT] = {
+		[ROHM_CHIP_TYPE_BD71837] = {
 			.r_datas = bd71837_regulators,
 			.r_amount = ARRAY_SIZE(bd71837_regulators),
 		},
-		[BD718XX_TYPE_BD71847] = {
+		[ROHM_CHIP_TYPE_BD71847] = {
 			.r_datas = bd71847_regulators,
 			.r_amount = ARRAY_SIZE(bd71847_regulators),
 		},
@@ -1173,15 +1173,15 @@ static int bd718xx_probe(struct platform_device *pdev)
 		goto err;
 	}
 
-	if (mfd->chip_type >= BD718XX_TYPE_AMOUNT ||
-	    !pmic_regulators[mfd->chip_type].r_datas) {
+	if (mfd->chip.chip_type >= ROHM_CHIP_TYPE_AMOUNT ||
+	    !pmic_regulators[mfd->chip.chip_type].r_datas) {
 		dev_err(&pdev->dev, "Unsupported chip type\n");
 		err = -EINVAL;
 		goto err;
 	}
 
 	/* Register LOCK release */
-	err = regmap_update_bits(mfd->regmap, BD718XX_REG_REGLOCK,
+	err = regmap_update_bits(mfd->chip.regmap, BD718XX_REG_REGLOCK,
 				 (REGLOCK_PWRSEQ | REGLOCK_VREG), 0);
 	if (err) {
 		dev_err(&pdev->dev, "Failed to unlock PMIC (%d)\n", err);
@@ -1200,7 +1200,8 @@ static int bd718xx_probe(struct platform_device *pdev)
 	 * bit allowing HW defaults for power rails to be used
 	 */
 	if (!use_snvs) {
-		err = regmap_update_bits(mfd->regmap, BD718XX_REG_TRANS_COND1,
+		err = regmap_update_bits(mfd->chip.regmap,
+					 BD718XX_REG_TRANS_COND1,
 					 BD718XX_ON_REQ_POWEROFF_MASK |
 					 BD718XX_SWRESET_POWEROFF_MASK |
 					 BD718XX_WDOG_POWEROFF_MASK |
@@ -1215,17 +1216,17 @@ static int bd718xx_probe(struct platform_device *pdev)
 		}
 	}
 
-	for (i = 0; i < pmic_regulators[mfd->chip_type].r_amount; i++) {
+	for (i = 0; i < pmic_regulators[mfd->chip.chip_type].r_amount; i++) {
 
 		const struct regulator_desc *desc;
 		struct regulator_dev *rdev;
 		const struct bd718xx_regulator_data *r;
 
-		r = &pmic_regulators[mfd->chip_type].r_datas[i];
+		r = &pmic_regulators[mfd->chip.chip_type].r_datas[i];
 		desc = &r->desc;
 
 		config.dev = pdev->dev.parent;
-		config.regmap = mfd->regmap;
+		config.regmap = mfd->chip.regmap;
 
 		rdev = devm_regulator_register(&pdev->dev, desc, &config);
 		if (IS_ERR(rdev)) {
@@ -1254,7 +1255,7 @@ static int bd718xx_probe(struct platform_device *pdev)
 		 */
 		if (!use_snvs || !rdev->constraints->always_on ||
 		    !rdev->constraints->boot_on) {
-			err = regmap_update_bits(mfd->regmap, r->init.reg,
+			err = regmap_update_bits(mfd->chip.regmap, r->init.reg,
 						 r->init.mask, r->init.val);
 			if (err) {
 				dev_err(&pdev->dev,
@@ -1264,7 +1265,7 @@ static int bd718xx_probe(struct platform_device *pdev)
 			}
 		}
 		for (j = 0; j < r->additional_init_amnt; j++) {
-			err = regmap_update_bits(mfd->regmap,
+			err = regmap_update_bits(mfd->chip.regmap,
 						 r->additional_inits[j].reg,
 						 r->additional_inits[j].mask,
 						 r->additional_inits[j].val);
diff --git a/include/linux/mfd/rohm-bd718x7.h b/include/linux/mfd/rohm-bd718x7.h
index fd194bfc836f..7f2dbde402a1 100644
--- a/include/linux/mfd/rohm-bd718x7.h
+++ b/include/linux/mfd/rohm-bd718x7.h
@@ -4,14 +4,9 @@
 #ifndef __LINUX_MFD_BD718XX_H__
 #define __LINUX_MFD_BD718XX_H__
 
+#include <linux/mfd/rohm-generic.h>
 #include <linux/regmap.h>
 
-enum {
-	BD718XX_TYPE_BD71837 = 0,
-	BD718XX_TYPE_BD71847,
-	BD718XX_TYPE_AMOUNT
-};
-
 enum {
 	BD718XX_BUCK1 = 0,
 	BD718XX_BUCK2,
@@ -321,18 +316,17 @@ enum {
 	BD718XX_PWRBTN_LONG_PRESS_15S
 };
 
-struct bd718xx_clk;
-
 struct bd718xx {
-	unsigned int chip_type;
-	struct device *dev;
-	struct regmap *regmap;
-	unsigned long int id;
+	/*
+	 * Please keep this as the first member here as some
+	 * drivers (clk) supporting more than one chip may only know this
+	 * generic struct 'struct rohm_regmap_dev' and assume it is
+	 * the first chunk of parent device's private data.
+	 */
+	struct rohm_regmap_dev chip;
 
 	int chip_irq;
 	struct regmap_irq_chip_data *irq_data;
-
-	struct bd718xx_clk *clk;
 };
 
 #endif /* __LINUX_MFD_BD718XX_H__ */
diff --git a/include/linux/mfd/rohm-generic.h b/include/linux/mfd/rohm-generic.h
new file mode 100644
index 000000000000..bff15ac26f2c
--- /dev/null
+++ b/include/linux/mfd/rohm-generic.h
@@ -0,0 +1,20 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/* Copyright (C) 2018 ROHM Semiconductors */
+
+#ifndef __LINUX_MFD_ROHM_H__
+#define __LINUX_MFD_ROHM_H__
+
+enum {
+	ROHM_CHIP_TYPE_BD71837 = 0,
+	ROHM_CHIP_TYPE_BD71847,
+	ROHM_CHIP_TYPE_BD70528,
+	ROHM_CHIP_TYPE_AMOUNT
+};
+
+struct rohm_regmap_dev {
+	unsigned int chip_type;
+	struct device *dev;
+	struct regmap *regmap;
+};
+
+#endif
-- 
cgit v1.2.3


From 21b7c58fc1943f3aa8c18a994ab9bed4ae5aa72d Mon Sep 17 00:00:00 2001
From: Matti Vaittinen <matti.vaittinen@fi.rohmeurope.com>
Date: Mon, 3 Jun 2019 10:25:08 +0300
Subject: mfd: bd70528: Support ROHM bd70528 PMIC core

ROHM BD70528MWV is an ultra-low quiescent current general
purpose single-chip power management IC for battery-powered
portable devices.

Add an MFD core which enables chip access for the following subdevices:
	- regulators/LED drivers
	- battery-charger
	- gpios
	- 32.768kHz clk
	- RTC
	- watchdog

Signed-off-by: Matti Vaittinen <matti.vaittinen@fi.rohmeurope.com>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 drivers/mfd/Kconfig              |  17 ++
 drivers/mfd/Makefile             |   2 +
 drivers/mfd/rohm-bd70528.c       | 316 ++++++++++++++++++++++++++++++
 include/linux/mfd/rohm-bd70528.h | 408 +++++++++++++++++++++++++++++++++++++++
 4 files changed, 743 insertions(+)
 create mode 100644 drivers/mfd/rohm-bd70528.c
 create mode 100644 include/linux/mfd/rohm-bd70528.h

(limited to 'include/linux')

diff --git a/drivers/mfd/Kconfig b/drivers/mfd/Kconfig
index 294d9567cc71..11fc53d78c5f 100644
--- a/drivers/mfd/Kconfig
+++ b/drivers/mfd/Kconfig
@@ -1890,6 +1890,23 @@ config MFD_ROHM_BD718XX
 	  NXP i.MX8. It contains 8 BUCK outputs and 7 LDOs, voltage monitoring
 	  and emergency shut down as well as 32,768KHz clock output.
 
+config MFD_ROHM_BD70528
+	tristate "ROHM BD70528 Power Management IC"
+	depends on I2C=y
+	depends on OF
+	select REGMAP_I2C
+	select REGMAP_IRQ
+	select MFD_CORE
+	help
+	  Select this option to get support for the ROHM BD70528 Power
+	  Management IC. BD70528 is a general purpose single-chip power
+	  management IC for battery-powered portable devices. It contains
+	  3 ultra-low current consumption buck converters, 3 LDOs and 2 LED
+	  drivers. Also included are 4 GPIOs, a real-time clock (RTC), a 32kHz
+	  crystal oscillator, high-accuracy VREF for use with an external ADC,
+	  10 bits SAR ADC for battery temperature monitor and 1S battery
+	  charger.
+
 config MFD_STM32_LPTIMER
 	tristate "Support for STM32 Low-Power Timer"
 	depends on (ARCH_STM32 && OF) || COMPILE_TEST
diff --git a/drivers/mfd/Makefile b/drivers/mfd/Makefile
index 52b1a90ff515..643d65bcb6ea 100644
--- a/drivers/mfd/Makefile
+++ b/drivers/mfd/Makefile
@@ -247,5 +247,7 @@ obj-$(CONFIG_MFD_STM32_TIMERS) 	+= stm32-timers.o
 obj-$(CONFIG_MFD_MXS_LRADC)     += mxs-lradc.o
 obj-$(CONFIG_MFD_SC27XX_PMIC)	+= sprd-sc27xx-spi.o
 obj-$(CONFIG_RAVE_SP_CORE)	+= rave-sp.o
+obj-$(CONFIG_MFD_ROHM_BD70528)	+= rohm-bd70528.o
 obj-$(CONFIG_MFD_ROHM_BD718XX)	+= rohm-bd718x7.o
 obj-$(CONFIG_MFD_STMFX) 	+= stmfx.o
+
diff --git a/drivers/mfd/rohm-bd70528.c b/drivers/mfd/rohm-bd70528.c
new file mode 100644
index 000000000000..55599d5c5c86
--- /dev/null
+++ b/drivers/mfd/rohm-bd70528.c
@@ -0,0 +1,316 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+//
+// Copyright (C) 2019 ROHM Semiconductors
+//
+// ROHM BD70528 PMIC driver
+
+#include <linux/i2c.h>
+#include <linux/interrupt.h>
+#include <linux/ioport.h>
+#include <linux/irq.h>
+#include <linux/mfd/core.h>
+#include <linux/mfd/rohm-bd70528.h>
+#include <linux/module.h>
+#include <linux/of_device.h>
+#include <linux/regmap.h>
+#include <linux/types.h>
+
+#define BD70528_NUM_OF_GPIOS 4
+
+static const struct resource rtc_irqs[] = { /* for the bd70528-rtc cell */
+	DEFINE_RES_IRQ_NAMED(BD70528_INT_RTC_ALARM, "bd70528-rtc-alm"),
+	DEFINE_RES_IRQ_NAMED(BD70528_INT_ELPS_TIM, "bd70528-elapsed-timer"),
+};
+
+static const struct resource charger_irqs[] = { /* for bd70528-power cell */
+	DEFINE_RES_IRQ_NAMED(BD70528_INT_BAT_OV_RES, "bd70528-bat-ov-res"),
+	DEFINE_RES_IRQ_NAMED(BD70528_INT_BAT_OV_DET, "bd70528-bat-ov-det"),
+	DEFINE_RES_IRQ_NAMED(BD70528_INT_DBAT_DET, "bd70528-bat-dead"),
+	DEFINE_RES_IRQ_NAMED(BD70528_INT_BATTSD_COLD_RES, "bd70528-bat-warmed"),
+	DEFINE_RES_IRQ_NAMED(BD70528_INT_BATTSD_COLD_DET, "bd70528-bat-cold"),
+	DEFINE_RES_IRQ_NAMED(BD70528_INT_BATTSD_HOT_RES, "bd70528-bat-cooled"),
+	DEFINE_RES_IRQ_NAMED(BD70528_INT_BATTSD_HOT_DET, "bd70528-bat-hot"),
+	DEFINE_RES_IRQ_NAMED(BD70528_INT_CHG_TSD, "bd70528-chg-tshd"),
+	DEFINE_RES_IRQ_NAMED(BD70528_INT_BAT_RMV, "bd70528-bat-removed"),
+	DEFINE_RES_IRQ_NAMED(BD70528_INT_BAT_DET, "bd70528-bat-detected"),
+	DEFINE_RES_IRQ_NAMED(BD70528_INT_DCIN2_OV_RES, "bd70528-dcin2-ov-res"),
+	DEFINE_RES_IRQ_NAMED(BD70528_INT_DCIN2_OV_DET, "bd70528-dcin2-ov-det"),
+	DEFINE_RES_IRQ_NAMED(BD70528_INT_DCIN2_RMV, "bd70528-dcin2-removed"),
+	DEFINE_RES_IRQ_NAMED(BD70528_INT_DCIN2_DET, "bd70528-dcin2-detected"),
+	DEFINE_RES_IRQ_NAMED(BD70528_INT_DCIN1_RMV, "bd70528-dcin1-removed"),
+	DEFINE_RES_IRQ_NAMED(BD70528_INT_DCIN1_DET, "bd70528-dcin1-detected"),
+};
+
+static struct mfd_cell bd70528_mfd_cells[] = { /* sub-devices to register */
+	{ .name = "bd70528-pmic", },
+	{ .name = "bd70528-gpio", },
+	/*
+	 * The BD71837 clock driver also drives this clock block; the BD70528
+	 * clock gate differs only in register address and mask.
+	 */
+	{ .name = "bd718xx-clk", },
+	{ .name = "bd70528-wdt", },
+	{
+		.name = "bd70528-power",
+		.resources = charger_irqs,
+		.num_resources = ARRAY_SIZE(charger_irqs),
+	}, {
+		.name = "bd70528-rtc",
+		.resources = rtc_irqs,
+		.num_resources = ARRAY_SIZE(rtc_irqs),
+	},
+};
+
+static const struct regmap_range volatile_ranges[] = {
+	{
+		.range_min = BD70528_REG_INT_MAIN, /* main + sub IRQ status */
+		.range_max = BD70528_REG_INT_OP_FAIL,
+	}, {
+		.range_min = BD70528_REG_RTC_COUNT_H, /* RTC time and alarm */
+		.range_max = BD70528_REG_RTC_ALM_REPEAT,
+	}, {
+		/*
+		 * WDT control reg is special. Magic values must be written to
+		 * it in order to change the control. Should not be cached.
+		 */
+		.range_min = BD70528_REG_WDT_CTRL,
+		.range_max = BD70528_REG_WDT_CTRL,
+	}, {
+		/*
+		 * BD70528 also contains a few other registers which require
+		 * magic sequences to be written in order to update the value.
+		 * At least SHIPMODE, HWRESET, WARMRESET and STANDBY.
+		 */
+		.range_min = BD70528_REG_SHIPMODE,
+		.range_max = BD70528_REG_STANDBY,
+	},
+};
+
+static const struct regmap_access_table volatile_regs = { /* never cached */
+	.yes_ranges = &volatile_ranges[0],
+	.n_yes_ranges = ARRAY_SIZE(volatile_ranges),
+};
+
+static struct regmap_config bd70528_regmap = {
+	.reg_bits = 8,		/* 8-bit register addresses */
+	.val_bits = 8,		/* 8-bit register values */
+	.volatile_table = &volatile_regs,
+	.max_register = BD70528_MAX_REGISTER,
+	.cache_type = REGCACHE_RBTREE,
+};
+
+/*
+ * Mapping of main IRQ register bits to sub-IRQ register offsets so that we can
+ * access correct sub-IRQ registers based on bits that are set in main IRQ
+ * register.
+ */
+
+/* bitN_offsets lists the sub-IRQ register offsets behind main IRQ bit N */
+static unsigned int bit0_offsets[] = {0};	/* Shutdown register */
+static unsigned int bit1_offsets[] = {1};	/* Power failure register */
+static unsigned int bit2_offsets[] = {2};	/* VR FAULT register */
+static unsigned int bit3_offsets[] = {3};	/* PMU register interrupts */
+static unsigned int bit4_offsets[] = {4, 5};	/* Charger 1 and 2 registers */
+static unsigned int bit5_offsets[] = {6};	/* RTC register */
+static unsigned int bit6_offsets[] = {7};	/* GPIO register */
+static unsigned int bit7_offsets[] = {8};	/* Invalid operation register */
+
+static struct regmap_irq_sub_irq_map bd70528_sub_irq_offsets[] = {
+	REGMAP_IRQ_MAIN_REG_OFFSET(bit0_offsets), /* entry N = main bit N */
+	REGMAP_IRQ_MAIN_REG_OFFSET(bit1_offsets),
+	REGMAP_IRQ_MAIN_REG_OFFSET(bit2_offsets),
+	REGMAP_IRQ_MAIN_REG_OFFSET(bit3_offsets),
+	REGMAP_IRQ_MAIN_REG_OFFSET(bit4_offsets),
+	REGMAP_IRQ_MAIN_REG_OFFSET(bit5_offsets),
+	REGMAP_IRQ_MAIN_REG_OFFSET(bit6_offsets),
+	REGMAP_IRQ_MAIN_REG_OFFSET(bit7_offsets),
+};
+
+static struct regmap_irq bd70528_irqs[] = { /* IRQ, sub-reg offset, mask */
+	REGMAP_IRQ_REG(BD70528_INT_LONGPUSH, 0, BD70528_INT_LONGPUSH_MASK),
+	REGMAP_IRQ_REG(BD70528_INT_WDT, 0, BD70528_INT_WDT_MASK),
+	REGMAP_IRQ_REG(BD70528_INT_HWRESET, 0, BD70528_INT_HWRESET_MASK),
+	REGMAP_IRQ_REG(BD70528_INT_RSTB_FAULT, 0, BD70528_INT_RSTB_FAULT_MASK),
+	REGMAP_IRQ_REG(BD70528_INT_VBAT_UVLO, 0, BD70528_INT_VBAT_UVLO_MASK),
+	REGMAP_IRQ_REG(BD70528_INT_TSD, 0, BD70528_INT_TSD_MASK),
+	REGMAP_IRQ_REG(BD70528_INT_RSTIN, 0, BD70528_INT_RSTIN_MASK),
+	REGMAP_IRQ_REG(BD70528_INT_BUCK1_FAULT, 1,
+		       BD70528_INT_BUCK1_FAULT_MASK),
+	REGMAP_IRQ_REG(BD70528_INT_BUCK2_FAULT, 1,
+		       BD70528_INT_BUCK2_FAULT_MASK),
+	REGMAP_IRQ_REG(BD70528_INT_BUCK3_FAULT, 1,
+		       BD70528_INT_BUCK3_FAULT_MASK),
+	REGMAP_IRQ_REG(BD70528_INT_LDO1_FAULT, 1, BD70528_INT_LDO1_FAULT_MASK),
+	REGMAP_IRQ_REG(BD70528_INT_LDO2_FAULT, 1, BD70528_INT_LDO2_FAULT_MASK),
+	REGMAP_IRQ_REG(BD70528_INT_LDO3_FAULT, 1, BD70528_INT_LDO3_FAULT_MASK),
+	REGMAP_IRQ_REG(BD70528_INT_LED1_FAULT, 1, BD70528_INT_LED1_FAULT_MASK),
+	REGMAP_IRQ_REG(BD70528_INT_LED2_FAULT, 1, BD70528_INT_LED2_FAULT_MASK),
+	REGMAP_IRQ_REG(BD70528_INT_BUCK1_OCP, 2, BD70528_INT_BUCK1_OCP_MASK),
+	REGMAP_IRQ_REG(BD70528_INT_BUCK2_OCP, 2, BD70528_INT_BUCK2_OCP_MASK),
+	REGMAP_IRQ_REG(BD70528_INT_BUCK3_OCP, 2, BD70528_INT_BUCK3_OCP_MASK),
+	REGMAP_IRQ_REG(BD70528_INT_LED1_OCP, 2, BD70528_INT_LED1_OCP_MASK),
+	REGMAP_IRQ_REG(BD70528_INT_LED2_OCP, 2, BD70528_INT_LED2_OCP_MASK),
+	REGMAP_IRQ_REG(BD70528_INT_BUCK1_FULLON, 2,
+		       BD70528_INT_BUCK1_FULLON_MASK),
+	REGMAP_IRQ_REG(BD70528_INT_BUCK2_FULLON, 2,
+		       BD70528_INT_BUCK2_FULLON_MASK),
+	REGMAP_IRQ_REG(BD70528_INT_SHORTPUSH, 3, BD70528_INT_SHORTPUSH_MASK),
+	REGMAP_IRQ_REG(BD70528_INT_AUTO_WAKEUP, 3,
+		       BD70528_INT_AUTO_WAKEUP_MASK),
+	REGMAP_IRQ_REG(BD70528_INT_STATE_CHANGE, 3,
+		       BD70528_INT_STATE_CHANGE_MASK),
+	REGMAP_IRQ_REG(BD70528_INT_BAT_OV_RES, 4, BD70528_INT_BAT_OV_RES_MASK),
+	REGMAP_IRQ_REG(BD70528_INT_BAT_OV_DET, 4, BD70528_INT_BAT_OV_DET_MASK),
+	REGMAP_IRQ_REG(BD70528_INT_DBAT_DET, 4, BD70528_INT_DBAT_DET_MASK),
+	REGMAP_IRQ_REG(BD70528_INT_BATTSD_COLD_RES, 4,
+		       BD70528_INT_BATTSD_COLD_RES_MASK),
+	REGMAP_IRQ_REG(BD70528_INT_BATTSD_COLD_DET, 4,
+		       BD70528_INT_BATTSD_COLD_DET_MASK),
+	REGMAP_IRQ_REG(BD70528_INT_BATTSD_HOT_RES, 4,
+		       BD70528_INT_BATTSD_HOT_RES_MASK),
+	REGMAP_IRQ_REG(BD70528_INT_BATTSD_HOT_DET, 4,
+		       BD70528_INT_BATTSD_HOT_DET_MASK),
+	REGMAP_IRQ_REG(BD70528_INT_CHG_TSD, 4, BD70528_INT_CHG_TSD_MASK),
+	REGMAP_IRQ_REG(BD70528_INT_BAT_RMV, 5, BD70528_INT_BAT_RMV_MASK),
+	REGMAP_IRQ_REG(BD70528_INT_BAT_DET, 5, BD70528_INT_BAT_DET_MASK),
+	REGMAP_IRQ_REG(BD70528_INT_DCIN2_OV_RES, 5,
+		       BD70528_INT_DCIN2_OV_RES_MASK),
+	REGMAP_IRQ_REG(BD70528_INT_DCIN2_OV_DET, 5,
+		       BD70528_INT_DCIN2_OV_DET_MASK),
+	REGMAP_IRQ_REG(BD70528_INT_DCIN2_RMV, 5, BD70528_INT_DCIN2_RMV_MASK),
+	REGMAP_IRQ_REG(BD70528_INT_DCIN2_DET, 5, BD70528_INT_DCIN2_DET_MASK),
+	REGMAP_IRQ_REG(BD70528_INT_DCIN1_RMV, 5, BD70528_INT_DCIN1_RMV_MASK),
+	REGMAP_IRQ_REG(BD70528_INT_DCIN1_DET, 5, BD70528_INT_DCIN1_DET_MASK),
+	REGMAP_IRQ_REG(BD70528_INT_RTC_ALARM, 6, BD70528_INT_RTC_ALARM_MASK),
+	REGMAP_IRQ_REG(BD70528_INT_ELPS_TIM, 6, BD70528_INT_ELPS_TIM_MASK),
+	REGMAP_IRQ_REG(BD70528_INT_GPIO0, 7, BD70528_INT_GPIO0_MASK),
+	REGMAP_IRQ_REG(BD70528_INT_GPIO1, 7, BD70528_INT_GPIO1_MASK),
+	REGMAP_IRQ_REG(BD70528_INT_GPIO2, 7, BD70528_INT_GPIO2_MASK),
+	REGMAP_IRQ_REG(BD70528_INT_GPIO3, 7, BD70528_INT_GPIO3_MASK),
+	REGMAP_IRQ_REG(BD70528_INT_BUCK1_DVS_OPFAIL, 8,
+		       BD70528_INT_BUCK1_DVS_OPFAIL_MASK),
+	REGMAP_IRQ_REG(BD70528_INT_BUCK2_DVS_OPFAIL, 8,
+		       BD70528_INT_BUCK2_DVS_OPFAIL_MASK),
+	REGMAP_IRQ_REG(BD70528_INT_BUCK3_DVS_OPFAIL, 8,
+		       BD70528_INT_BUCK3_DVS_OPFAIL_MASK),
+	REGMAP_IRQ_REG(BD70528_INT_LED1_VOLT_OPFAIL, 8,
+		       BD70528_INT_LED1_VOLT_OPFAIL_MASK),
+	REGMAP_IRQ_REG(BD70528_INT_LED2_VOLT_OPFAIL, 8,
+		       BD70528_INT_LED2_VOLT_OPFAIL_MASK),
+};
+
+static struct regmap_irq_chip bd70528_irq_chip = {
+	.name = "bd70528_irq",
+	.main_status = BD70528_REG_INT_MAIN,
+	.irqs = &bd70528_irqs[0],
+	.num_irqs = ARRAY_SIZE(bd70528_irqs),
+	.status_base = BD70528_REG_INT_SHDN,
+	.mask_base = BD70528_REG_INT_SHDN_MASK,
+	.ack_base = BD70528_REG_INT_SHDN,	/* same regs as status_base */
+	.type_base = BD70528_REG_GPIO1_IN,
+	.init_ack_masked = true,
+	.num_regs = 9,		/* INT_SHDN ... INT_OP_FAIL */
+	.num_main_regs = 1,
+	.num_type_reg = 4,	/* NOTE(review): probe uses offsets 0..6 */
+	.sub_reg_offsets = &bd70528_sub_irq_offsets[0],
+	.num_main_status_bits = 8,
+	.irq_reg_stride = 1,
+};
+
+/*
+ * Set up the regmap and the IRQ chip, unmask the GPIO block in the main IRQ
+ * mask register and register the sub-devices. Returns 0 or a negative errno.
+ */
+static int bd70528_i2c_probe(struct i2c_client *i2c,
+			     const struct i2c_device_id *id)
+{
+	struct bd70528_data *bd70528;
+	struct regmap_irq_chip_data *irq_data;
+	int ret, i;
+
+	if (!i2c->irq) {
+		dev_err(&i2c->dev, "No IRQ configured\n");
+		return -EINVAL;
+	}
+
+	bd70528 = devm_kzalloc(&i2c->dev, sizeof(*bd70528), GFP_KERNEL);
+	if (!bd70528)
+		return -ENOMEM;
+
+	mutex_init(&bd70528->rtc_timer_lock);
+	dev_set_drvdata(&i2c->dev, &bd70528->chip);
+	bd70528->chip.chip_type = ROHM_CHIP_TYPE_BD70528;
+	bd70528->chip.regmap = devm_regmap_init_i2c(i2c, &bd70528_regmap);
+	if (IS_ERR(bd70528->chip.regmap)) {
+		dev_err(&i2c->dev, "Failed to initialize Regmap\n");
+		return PTR_ERR(bd70528->chip.regmap);
+	}
+
+	/* Most IRQs do not support type setting, so disallow it by default */
+	for (i = 0; i < ARRAY_SIZE(bd70528_irqs); i++)
+		bd70528_irqs[i].type.types_supported = 0;
+
+	/* Set IRQ typesetting information for GPIO pins 0 - 3 */
+	for (i = 0; i < BD70528_NUM_OF_GPIOS; i++) {
+		struct regmap_irq_type *type;
+
+		type = &bd70528_irqs[BD70528_INT_GPIO0 + i].type;
+		type->type_reg_offset = 2 * i;
+		type->type_rising_val = 0x20;
+		type->type_falling_val = 0x10;
+		type->type_level_high_val = 0x40;
+		type->type_level_low_val = 0x50;
+		type->types_supported = (IRQ_TYPE_EDGE_BOTH |
+				IRQ_TYPE_LEVEL_HIGH | IRQ_TYPE_LEVEL_LOW);
+	}
+
+	ret = devm_regmap_add_irq_chip(&i2c->dev, bd70528->chip.regmap,
+				       i2c->irq, IRQF_ONESHOT, 0,
+				       &bd70528_irq_chip, &irq_data);
+	if (ret) {
+		dev_err(&i2c->dev, "Failed to add IRQ chip\n");
+		return ret;
+	}
+	dev_dbg(&i2c->dev, "Registered %d IRQs for chip\n",
+		bd70528_irq_chip.num_irqs);
+
+	/*
+	 * The IRQ controller does not touch the main mask register, so unmask
+	 * the GPIO block here. Individual IRQs stay masked via sub-registers.
+	 */
+	ret = regmap_update_bits(bd70528->chip.regmap,
+				 BD70528_REG_INT_MAIN_MASK,
+				 BD70528_INT_GPIO_MASK, 0);
+	if (ret) {
+		dev_err(&i2c->dev, "Failed to unmask GPIO IRQs\n");
+		return ret;
+	}
+
+	ret = devm_mfd_add_devices(&i2c->dev, PLATFORM_DEVID_AUTO,
+				   bd70528_mfd_cells,
+				   ARRAY_SIZE(bd70528_mfd_cells), NULL, 0,
+				   regmap_irq_get_domain(irq_data));
+	if (ret)
+		dev_err(&i2c->dev, "Failed to create subdevices\n");
+	return ret;
+}
+
+static const struct of_device_id bd70528_of_match[] = {
+	{ .compatible = "rohm,bd70528", },
+	{ },	/* sentinel */
+};
+MODULE_DEVICE_TABLE(of, bd70528_of_match);
+
+static struct i2c_driver bd70528_drv = { /* no .remove: probe is all devm */
+	.driver = {
+		.name = "rohm-bd70528",
+		.of_match_table = bd70528_of_match,
+	},
+	.probe = &bd70528_i2c_probe,
+};
+
+module_i2c_driver(bd70528_drv);
+
+MODULE_AUTHOR("Matti Vaittinen <matti.vaittinen@fi.rohmeurope.com>");
+MODULE_DESCRIPTION("ROHM BD70528 Power Management IC driver");
+MODULE_LICENSE("GPL");
diff --git a/include/linux/mfd/rohm-bd70528.h b/include/linux/mfd/rohm-bd70528.h
new file mode 100644
index 000000000000..1013e60c5b25
--- /dev/null
+++ b/include/linux/mfd/rohm-bd70528.h
@@ -0,0 +1,408 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/* Copyright (C) 2018 ROHM Semiconductors */
+
+#ifndef __LINUX_MFD_BD70528_H__
+#define __LINUX_MFD_BD70528_H__
+
+#include <linux/bits.h>
+#include <linux/device.h>
+#include <linux/mfd/rohm-generic.h>
+#include <linux/regmap.h>
+
+enum {	/* identifiers for the bucks, LDOs and LED drivers */
+	BD70528_BUCK1,
+	BD70528_BUCK2,
+	BD70528_BUCK3,
+	BD70528_LDO1,
+	BD70528_LDO2,
+	BD70528_LDO3,
+	BD70528_LED1,
+	BD70528_LED2,
+};
+
+struct bd70528_data {
+	struct rohm_regmap_dev chip;	/* its address is the MFD drvdata */
+	struct mutex rtc_timer_lock;	/* initialised by the MFD probe */
+};
+
+/* Presumably the number of selectable buck voltages - TODO confirm */
+#define BD70528_BUCK_VOLTS 17
+/* Presumably the number of selectable LDO voltages - TODO confirm */
+#define BD70528_LDO_VOLTS 0x20
+
+#define BD70528_REG_BUCK1_EN	0x0F
+#define BD70528_REG_BUCK1_VOLT	0x15
+#define BD70528_REG_BUCK2_EN	0x10
+#define BD70528_REG_BUCK2_VOLT	0x16
+#define BD70528_REG_BUCK3_EN	0x11
+#define BD70528_REG_BUCK3_VOLT	0x17
+#define BD70528_REG_LDO1_EN	0x1b
+#define BD70528_REG_LDO1_VOLT	0x1e
+#define BD70528_REG_LDO2_EN	0x1c
+#define BD70528_REG_LDO2_VOLT	0x1f
+#define BD70528_REG_LDO3_EN	0x1d
+#define BD70528_REG_LDO3_VOLT	0x20
+#define BD70528_REG_LED_CTRL	0x2b
+#define BD70528_REG_LED_VOLT	0x29
+#define BD70528_REG_LED_EN	0x2a
+
+/* main irq registers */
+#define BD70528_REG_INT_MAIN	0x7E
+#define BD70528_REG_INT_MAIN_MASK 0x74
+
+/* 'sub irq' registers */
+#define BD70528_REG_INT_SHDN	0x7F
+#define BD70528_REG_INT_PWR_FLT	0x80
+#define BD70528_REG_INT_VR_FLT	0x81
+#define BD70528_REG_INT_MISC	0x82
+#define BD70528_REG_INT_BAT1	0x83
+#define BD70528_REG_INT_BAT2	0x84
+#define BD70528_REG_INT_RTC	0x85
+#define BD70528_REG_INT_GPIO	0x86
+#define BD70528_REG_INT_OP_FAIL	0x87
+
+#define BD70528_REG_INT_SHDN_MASK	0x75
+#define BD70528_REG_INT_PWR_FLT_MASK	0x76
+#define BD70528_REG_INT_VR_FLT_MASK	0x77
+#define BD70528_REG_INT_MISC_MASK	0x78
+#define BD70528_REG_INT_BAT1_MASK	0x79
+#define BD70528_REG_INT_BAT2_MASK	0x7a
+#define BD70528_REG_INT_RTC_MASK	0x7b
+#define BD70528_REG_INT_GPIO_MASK	0x7c
+#define BD70528_REG_INT_OP_FAIL_MASK	0x7d
+
+/* Reset related 'magic' registers */
+#define BD70528_REG_SHIPMODE	0x03
+#define BD70528_REG_HWRESET	0x04
+#define BD70528_REG_WARMRESET	0x05
+#define BD70528_REG_STANDBY	0x06
+
+/* GPIO registers */
+#define BD70528_REG_GPIO_STATE	0x8F
+
+#define BD70528_REG_GPIO1_IN	0x4d
+#define BD70528_REG_GPIO2_IN	0x4f
+#define BD70528_REG_GPIO3_IN	0x51
+#define BD70528_REG_GPIO4_IN	0x53
+#define BD70528_REG_GPIO1_OUT	0x4e
+#define BD70528_REG_GPIO2_OUT	0x50
+#define BD70528_REG_GPIO3_OUT	0x52
+#define BD70528_REG_GPIO4_OUT	0x54
+
+/* clk control */
+
+#define BD70528_REG_CLK_OUT	0x2c
+
+/* RTC */
+
+#define BD70528_REG_RTC_COUNT_H		0x2d
+#define BD70528_REG_RTC_COUNT_L		0x2e
+#define BD70528_REG_RTC_SEC		0x2f
+#define BD70528_REG_RTC_MINUTE		0x30
+#define BD70528_REG_RTC_HOUR		0x31
+#define BD70528_REG_RTC_WEEK		0x32
+#define BD70528_REG_RTC_DAY		0x33
+#define BD70528_REG_RTC_MONTH		0x34
+#define BD70528_REG_RTC_YEAR		0x35
+
+#define BD70528_REG_RTC_ALM_SEC		0x36
+#define BD70528_REG_RTC_ALM_START	BD70528_REG_RTC_ALM_SEC
+#define BD70528_REG_RTC_ALM_MINUTE	0x37
+#define BD70528_REG_RTC_ALM_HOUR	0x38
+#define BD70528_REG_RTC_ALM_WEEK	0x39
+#define BD70528_REG_RTC_ALM_DAY		0x3a
+#define BD70528_REG_RTC_ALM_MONTH	0x3b
+#define BD70528_REG_RTC_ALM_YEAR	0x3c
+#define BD70528_REG_RTC_ALM_MASK	0x3d
+#define BD70528_REG_RTC_ALM_REPEAT	0x3e
+#define BD70528_REG_RTC_START		BD70528_REG_RTC_SEC
+
+#define BD70528_REG_RTC_WAKE_SEC	0x43
+#define BD70528_REG_RTC_WAKE_START	BD70528_REG_RTC_WAKE_SEC
+#define BD70528_REG_RTC_WAKE_MIN	0x44
+#define BD70528_REG_RTC_WAKE_HOUR	0x45
+#define BD70528_REG_RTC_WAKE_CTRL	0x46
+
+#define BD70528_REG_ELAPSED_TIMER_EN	0x42
+#define BD70528_REG_WAKE_EN		0x46
+
+/* WDT registers */
+#define BD70528_REG_WDT_CTRL		0x4A
+#define BD70528_REG_WDT_HOUR		0x49
+#define BD70528_REG_WDT_MINUTE		0x48
+#define BD70528_REG_WDT_SEC		0x47
+
+/* Charger / Battery */
+#define BD70528_REG_CHG_CURR_STAT	0x59
+#define BD70528_REG_CHG_BAT_STAT	0x57
+#define BD70528_REG_CHG_BAT_TEMP	0x58
+#define BD70528_REG_CHG_IN_STAT		0x56
+#define BD70528_REG_CHG_DCIN_ILIM	0x5d
+#define BD70528_REG_CHG_CHG_CURR_WARM	0x61
+#define BD70528_REG_CHG_CHG_CURR_COLD	0x62
+
+/* Main IRQ register bit numbers and the matching masks */
+enum {
+	BD70528_INT_SHDN,
+#define BD70528_INT_SHDN_MASK BIT(BD70528_INT_SHDN)
+	BD70528_INT_PWR_FLT,
+#define BD70528_INT_PWR_FLT_MASK BIT(BD70528_INT_PWR_FLT)
+	BD70528_INT_VR_FLT,
+#define BD70528_INT_VR_FLT_MASK BIT(BD70528_INT_VR_FLT)
+	BD70528_INT_MISC,
+#define BD70528_INT_MISC_MASK BIT(BD70528_INT_MISC)
+	BD70528_INT_BAT1,	/* covers both charger sub-registers */
+#define BD70528_INT_BAT1_MASK BIT(BD70528_INT_BAT1)
+	BD70528_INT_RTC,
+#define BD70528_INT_RTC_MASK BIT(BD70528_INT_RTC)
+	BD70528_INT_GPIO,
+#define BD70528_INT_GPIO_MASK BIT(BD70528_INT_GPIO)
+	BD70528_INT_OP_FAIL,
+#define BD70528_INT_OP_FAIL_MASK BIT(BD70528_INT_OP_FAIL)
+};
+
+/* IRQs - the values index bd70528_irqs[] in the MFD driver */
+enum {
+	/* Shutdown register IRQs */
+	BD70528_INT_LONGPUSH,
+	BD70528_INT_WDT,
+	BD70528_INT_HWRESET,
+	BD70528_INT_RSTB_FAULT,
+	BD70528_INT_VBAT_UVLO,
+	BD70528_INT_TSD,
+	BD70528_INT_RSTIN,
+	/* Power failure register IRQs */
+	BD70528_INT_BUCK1_FAULT,
+	BD70528_INT_BUCK2_FAULT,
+	BD70528_INT_BUCK3_FAULT,
+	BD70528_INT_LDO1_FAULT,
+	BD70528_INT_LDO2_FAULT,
+	BD70528_INT_LDO3_FAULT,
+	BD70528_INT_LED1_FAULT,
+	BD70528_INT_LED2_FAULT,
+	/* VR FAULT register IRQs */
+	BD70528_INT_BUCK1_OCP,
+	BD70528_INT_BUCK2_OCP,
+	BD70528_INT_BUCK3_OCP,
+	BD70528_INT_LED1_OCP,
+	BD70528_INT_LED2_OCP,
+	BD70528_INT_BUCK1_FULLON,
+	BD70528_INT_BUCK2_FULLON,
+	/* PMU register interrupts */
+	BD70528_INT_SHORTPUSH,
+	BD70528_INT_AUTO_WAKEUP,
+	BD70528_INT_STATE_CHANGE,
+	/* Charger 1 register IRQs */
+	BD70528_INT_BAT_OV_RES,
+	BD70528_INT_BAT_OV_DET,
+	BD70528_INT_DBAT_DET,
+	BD70528_INT_BATTSD_COLD_RES,
+	BD70528_INT_BATTSD_COLD_DET,
+	BD70528_INT_BATTSD_HOT_RES,
+	BD70528_INT_BATTSD_HOT_DET,
+	BD70528_INT_CHG_TSD,
+	/* Charger 2 register IRQs */
+	BD70528_INT_BAT_RMV,
+	BD70528_INT_BAT_DET,
+	BD70528_INT_DCIN2_OV_RES,
+	BD70528_INT_DCIN2_OV_DET,
+	BD70528_INT_DCIN2_RMV,
+	BD70528_INT_DCIN2_DET,
+	BD70528_INT_DCIN1_RMV,
+	BD70528_INT_DCIN1_DET,
+	/* RTC register IRQs */
+	BD70528_INT_RTC_ALARM,
+	BD70528_INT_ELPS_TIM,
+	/* GPIO register IRQs, kept consecutive: probe uses GPIO0 + i */
+	BD70528_INT_GPIO0,
+	BD70528_INT_GPIO1,
+	BD70528_INT_GPIO2,
+	BD70528_INT_GPIO3,
+	/* Invalid operation register IRQs */
+	BD70528_INT_BUCK1_DVS_OPFAIL,
+	BD70528_INT_BUCK2_DVS_OPFAIL,
+	BD70528_INT_BUCK3_DVS_OPFAIL,
+	BD70528_INT_LED1_VOLT_OPFAIL,
+	BD70528_INT_LED2_VOLT_OPFAIL,
+};
+
+/* Masks */
+#define BD70528_INT_LONGPUSH_MASK 0x1
+#define BD70528_INT_WDT_MASK 0x2
+#define BD70528_INT_HWRESET_MASK 0x4
+#define BD70528_INT_RSTB_FAULT_MASK 0x8
+#define BD70528_INT_VBAT_UVLO_MASK 0x10
+#define BD70528_INT_TSD_MASK 0x20
+#define BD70528_INT_RSTIN_MASK 0x40
+
+#define BD70528_INT_BUCK1_FAULT_MASK 0x1
+#define BD70528_INT_BUCK2_FAULT_MASK 0x2
+#define BD70528_INT_BUCK3_FAULT_MASK 0x4
+#define BD70528_INT_LDO1_FAULT_MASK 0x8
+#define BD70528_INT_LDO2_FAULT_MASK 0x10
+#define BD70528_INT_LDO3_FAULT_MASK 0x20
+#define BD70528_INT_LED1_FAULT_MASK 0x40
+#define BD70528_INT_LED2_FAULT_MASK 0x80
+
+#define BD70528_INT_BUCK1_OCP_MASK 0x1
+#define BD70528_INT_BUCK2_OCP_MASK 0x2
+#define BD70528_INT_BUCK3_OCP_MASK 0x4
+#define BD70528_INT_LED1_OCP_MASK 0x8
+#define BD70528_INT_LED2_OCP_MASK 0x10
+#define BD70528_INT_BUCK1_FULLON_MASK 0x20
+#define BD70528_INT_BUCK2_FULLON_MASK 0x40
+
+#define BD70528_INT_SHORTPUSH_MASK 0x1
+#define BD70528_INT_AUTO_WAKEUP_MASK 0x2
+#define BD70528_INT_STATE_CHANGE_MASK 0x10
+
+#define BD70528_INT_BAT_OV_RES_MASK 0x1
+#define BD70528_INT_BAT_OV_DET_MASK 0x2
+#define BD70528_INT_DBAT_DET_MASK 0x4
+#define BD70528_INT_BATTSD_COLD_RES_MASK 0x8
+#define BD70528_INT_BATTSD_COLD_DET_MASK 0x10
+#define BD70528_INT_BATTSD_HOT_RES_MASK 0x20
+#define BD70528_INT_BATTSD_HOT_DET_MASK 0x40
+#define BD70528_INT_CHG_TSD_MASK 0x80
+
+#define BD70528_INT_BAT_RMV_MASK 0x1
+#define BD70528_INT_BAT_DET_MASK 0x2
+#define BD70528_INT_DCIN2_OV_RES_MASK 0x4
+#define BD70528_INT_DCIN2_OV_DET_MASK 0x8
+#define BD70528_INT_DCIN2_RMV_MASK 0x10
+#define BD70528_INT_DCIN2_DET_MASK 0x20
+#define BD70528_INT_DCIN1_RMV_MASK 0x40
+#define BD70528_INT_DCIN1_DET_MASK 0x80
+
+#define BD70528_INT_RTC_ALARM_MASK 0x1
+#define BD70528_INT_ELPS_TIM_MASK 0x2
+
+#define BD70528_INT_GPIO0_MASK 0x1
+#define BD70528_INT_GPIO1_MASK 0x2
+#define BD70528_INT_GPIO2_MASK 0x4
+#define BD70528_INT_GPIO3_MASK 0x8
+
+#define BD70528_INT_BUCK1_DVS_OPFAIL_MASK 0x1
+#define BD70528_INT_BUCK2_DVS_OPFAIL_MASK 0x2
+#define BD70528_INT_BUCK3_DVS_OPFAIL_MASK 0x4
+#define BD70528_INT_LED1_VOLT_OPFAIL_MASK 0x10
+#define BD70528_INT_LED2_VOLT_OPFAIL_MASK 0x20
+
+#define BD70528_DEBOUNCE_MASK 0x3
+
+#define BD70528_DEBOUNCE_DISABLE 0
+#define BD70528_DEBOUNCE_15MS 1
+#define BD70528_DEBOUNCE_30MS 2
+#define BD70528_DEBOUNCE_50MS 3
+
+#define BD70528_GPIO_DRIVE_MASK 0x2
+#define BD70528_GPIO_PUSH_PULL 0x0
+#define BD70528_GPIO_OPEN_DRAIN 0x2
+
+#define BD70528_GPIO_OUT_EN_MASK 0x80
+#define BD70528_GPIO_OUT_ENABLE 0x80
+#define BD70528_GPIO_OUT_DISABLE 0x0
+
+#define BD70528_GPIO_OUT_HI 0x1
+#define BD70528_GPIO_OUT_LO 0x0
+#define BD70528_GPIO_OUT_MASK 0x1
+
+#define BD70528_GPIO_IN_STATE_BASE 1
+
+#define BD70528_CLK_OUT_EN_MASK 0x1
+
+/* RTC masks to mask out reserved bits */
+
+#define BD70528_MASK_RTC_SEC		0x7f
+#define BD70528_MASK_RTC_MINUTE		0x7f
+#define BD70528_MASK_RTC_HOUR_24H	0x80
+#define BD70528_MASK_RTC_HOUR_PM	0x20
+#define BD70528_MASK_RTC_HOUR		0x1f
+#define BD70528_MASK_RTC_DAY		0x3f
+#define BD70528_MASK_RTC_WEEK		0x07
+#define BD70528_MASK_RTC_MONTH		0x1f
+#define BD70528_MASK_RTC_YEAR		0xff
+#define BD70528_MASK_RTC_COUNT_L	0x7f
+
+#define BD70528_MASK_ELAPSED_TIMER_EN	0x1
+/* Mask second, min and hour fields
+ * HW would support ALM irq for over 24h
+ * (by setting day, month and year too)
+ * but as we wish to keep this same as for
+ * wake-up we limit ALM to 24H and only
+ * unmask sec, min and hour
+ */
+#define BD70528_MASK_ALM_EN		0x7
+#define BD70528_MASK_WAKE_EN		0x1
+
+/* WDT masks */
+#define BD70528_MASK_WDT_EN		0x1
+#define BD70528_MASK_WDT_HOUR		0x1
+#define BD70528_MASK_WDT_MINUTE		0x7f
+#define BD70528_MASK_WDT_SEC		0x7f
+
+#define BD70528_WDT_STATE_BIT		0x1
+#define BD70528_ELAPSED_STATE_BIT	0x2
+#define BD70528_WAKE_STATE_BIT		0x4
+
+/* Charger masks */
+#define BD70528_MASK_CHG_STAT		0x7f
+#define BD70528_MASK_CHG_BAT_TIMER	0x20
+#define BD70528_MASK_CHG_BAT_OVERVOLT	0x10
+#define BD70528_MASK_CHG_BAT_DETECT	0x1
+#define BD70528_MASK_CHG_DCIN1_UVLO	0x1
+#define BD70528_MASK_CHG_DCIN_ILIM	0x3f
+#define BD70528_MASK_CHG_CHG_CURR	0x1f
+#define BD70528_MASK_CHG_TRICKLE_CURR	0x10
+
+/*
+ * Note, external battery register is the lonely rider at
+ * address 0xc5. See how to stuff that in the regmap
+ */
+#define BD70528_MAX_REGISTER 0x94
+
+/* Buck control masks */
+#define BD70528_MASK_RUN_EN	0x4
+#define BD70528_MASK_STBY_EN	0x2
+#define BD70528_MASK_IDLE_EN	0x1
+#define BD70528_MASK_LED1_EN	0x1
+#define BD70528_MASK_LED2_EN	0x10
+
+#define BD70528_MASK_BUCK_VOLT	0xf
+#define BD70528_MASK_LDO_VOLT	0x1f
+#define BD70528_MASK_LED1_VOLT	0x1
+#define BD70528_MASK_LED2_VOLT	0x10
+
+/* Misc irq masks */
+#define BD70528_INT_MASK_SHORT_PUSH	1
+#define BD70528_INT_MASK_AUTO_WAKE	2
+#define BD70528_INT_MASK_POWER_STATE	4
+
+#define BD70528_MASK_BUCK_RAMP 0x10
+#define BD70528_SIFT_BUCK_RAMP 4
+
+#if IS_ENABLED(CONFIG_BD70528_WATCHDOG)
+/* NOTE(review): presumably implemented by the BD70528 watchdog driver */
+int bd70528_wdt_set(struct rohm_regmap_dev *data, int enable, int *old_state);
+void bd70528_wdt_lock(struct rohm_regmap_dev *data);
+void bd70528_wdt_unlock(struct rohm_regmap_dev *data);
+
+#else /* CONFIG_BD70528_WATCHDOG */
+/* No-op stubs: bd70528_wdt_set() returns 0 and never writes *old_state */
+static inline int bd70528_wdt_set(struct rohm_regmap_dev *data, int enable,
+				  int *old_state)
+{
+	return 0;
+}
+
+static inline void bd70528_wdt_lock(struct rohm_regmap_dev *data)
+{
+}
+
+static inline void bd70528_wdt_unlock(struct rohm_regmap_dev *data)
+{
+}
+
+#endif /* CONFIG_BD70528_WATCHDOG */
+
+#endif /* __LINUX_MFD_BD70528_H__ */
-- 
cgit v1.2.3


From 6bbe6f5732faeabb4bb583726ec2d7f9739532bd Mon Sep 17 00:00:00 2001
From: Mauro Carvalho Chehab <mchehab+samsung@kernel.org>
Date: Tue, 18 Jun 2019 18:05:28 -0300
Subject: docs: thermal: convert to ReST

Rename the thermal documentation files to ReST, add an
index for them and adjust in order to produce a nice html
output via the Sphinx build system.

At its new index.rst, let's add a :orphan: while this is not linked to
the main index.rst file, in order to avoid build warnings.

Signed-off-by: Mauro Carvalho Chehab <mchehab+samsung@kernel.org>
Acked-by: Zhang Rui <rui.zhang@intel.com>
Signed-off-by: Zhang Rui <rui.zhang@intel.com>
---
 Documentation/thermal/cpu-cooling-api.rst          | 107 +++
 Documentation/thermal/cpu-cooling-api.txt          |  92 ---
 Documentation/thermal/exynos_thermal               |  77 --
 Documentation/thermal/exynos_thermal.rst           |  90 +++
 Documentation/thermal/exynos_thermal_emulation     |  53 --
 Documentation/thermal/exynos_thermal_emulation.rst |  61 ++
 Documentation/thermal/index.rst                    |  18 +
 Documentation/thermal/intel_powerclamp.rst         | 320 +++++++++
 Documentation/thermal/intel_powerclamp.txt         | 317 --------
 Documentation/thermal/nouveau_thermal              |  82 ---
 Documentation/thermal/nouveau_thermal.rst          |  96 +++
 Documentation/thermal/power_allocator.rst          | 271 +++++++
 Documentation/thermal/power_allocator.txt          | 247 -------
 Documentation/thermal/sysfs-api.rst                | 798 +++++++++++++++++++++
 Documentation/thermal/sysfs-api.txt                | 636 ----------------
 Documentation/thermal/x86_pkg_temperature_thermal  |  47 --
 .../thermal/x86_pkg_temperature_thermal.rst        |  55 ++
 MAINTAINERS                                        |   2 +-
 include/linux/thermal.h                            |   4 +-
 19 files changed, 1819 insertions(+), 1554 deletions(-)
 create mode 100644 Documentation/thermal/cpu-cooling-api.rst
 delete mode 100644 Documentation/thermal/cpu-cooling-api.txt
 delete mode 100644 Documentation/thermal/exynos_thermal
 create mode 100644 Documentation/thermal/exynos_thermal.rst
 delete mode 100644 Documentation/thermal/exynos_thermal_emulation
 create mode 100644 Documentation/thermal/exynos_thermal_emulation.rst
 create mode 100644 Documentation/thermal/index.rst
 create mode 100644 Documentation/thermal/intel_powerclamp.rst
 delete mode 100644 Documentation/thermal/intel_powerclamp.txt
 delete mode 100644 Documentation/thermal/nouveau_thermal
 create mode 100644 Documentation/thermal/nouveau_thermal.rst
 create mode 100644 Documentation/thermal/power_allocator.rst
 delete mode 100644 Documentation/thermal/power_allocator.txt
 create mode 100644 Documentation/thermal/sysfs-api.rst
 delete mode 100644 Documentation/thermal/sysfs-api.txt
 delete mode 100644 Documentation/thermal/x86_pkg_temperature_thermal
 create mode 100644 Documentation/thermal/x86_pkg_temperature_thermal.rst

(limited to 'include/linux')

diff --git a/Documentation/thermal/cpu-cooling-api.rst b/Documentation/thermal/cpu-cooling-api.rst
new file mode 100644
index 000000000000..645d914c45a6
--- /dev/null
+++ b/Documentation/thermal/cpu-cooling-api.rst
@@ -0,0 +1,107 @@
+=======================
+CPU cooling APIs How To
+=======================
+
+Written by Amit Daniel Kachhap <amit.kachhap@linaro.org>
+
+Updated: 6 Jan 2015
+
+Copyright (c)  2012 Samsung Electronics Co., Ltd(http://www.samsung.com)
+
+0. Introduction
+===============
+
+The generic cpu cooling (freq clipping) provides registration/unregistration
+APIs to the caller. The binding of the cooling devices to the trip point is left
+for the user. The registration APIs return the cooling device pointer.
+
+1. cpu cooling APIs
+===================
+
+1.1 cpufreq registration/unregistration APIs
+--------------------------------------------
+
+    ::
+
+	struct thermal_cooling_device
+	*cpufreq_cooling_register(struct cpumask *clip_cpus)
+
+    This interface function registers the cpufreq cooling device with the name
+    "thermal-cpufreq-%x". This api can support multiple instances of cpufreq
+    cooling devices.
+
+    clip_cpus:
+	cpumask of cpus where the frequency constraints will happen.
+
+    ::
+
+	struct thermal_cooling_device
+	*of_cpufreq_cooling_register(struct cpufreq_policy *policy)
+
+    This interface function registers the cpufreq cooling device with
+    the name "thermal-cpufreq-%x" linking it with a device tree node, in
+    order to bind it via the thermal DT code. This api can support multiple
+    instances of cpufreq cooling devices.
+
+    policy:
+	CPUFreq policy.
+
+
+    ::
+
+	void cpufreq_cooling_unregister(struct thermal_cooling_device *cdev)
+
+    This interface function unregisters the "thermal-cpufreq-%x" cooling device.
+
+    cdev: Cooling device pointer which has to be unregistered.
+
+2. Power models
+===============
+
+The power API registration functions provide a simple power model for
+CPUs.  The current power is calculated as dynamic power (static power isn't
+supported currently).  This power model requires that the operating-points of
+the CPUs are registered using the kernel's opp library and the
+`cpufreq_frequency_table` is assigned to the `struct device` of the
+cpu.  If you are using CONFIG_CPUFREQ_DT then the
+`cpufreq_frequency_table` should already be assigned to the cpu
+device.
+
+The dynamic power consumption of a processor depends on many factors.
+For a given processor implementation the primary factors are:
+
+- The time the processor spends running, consuming dynamic power, as
+  compared to the time in idle states where dynamic consumption is
+  negligible.  Herein we refer to this as 'utilisation'.
+- The voltage and frequency levels as a result of DVFS.  The DVFS
+  level is a dominant factor governing power consumption.
+- In running time the 'execution' behaviour (instruction types, memory
+  access patterns and so forth) causes, in most cases, a second order
+  variation.  In pathological cases this variation can be significant,
+  but typically it is of a much lesser impact than the factors above.
+
+A high level dynamic power consumption model may then be represented as::
+
+	Pdyn = f(run) * Voltage^2 * Frequency * Utilisation
+
+f(run) here represents the described execution behaviour and its
+result has units of Watts/Hz/Volt^2 (this is often expressed in
+mW/MHz/uVolt^2).
+
+The detailed behaviour for f(run) could be modelled on-line.  However,
+in practice, such an on-line model has dependencies on a number of
+implementation specific processor support and characterisation
+factors.  Therefore, in initial implementation that contribution is
+represented as a constant coefficient.  This is a simplification
+consistent with the relative contribution to overall power variation.
+
+In this simplified representation our model becomes::
+
+	Pdyn = Capacitance * Voltage^2 * Frequency * Utilisation
+
+Where `capacitance` is a constant that represents an indicative
+running time dynamic power coefficient in fundamental units of
+mW/MHz/uVolt^2.  Typical values for mobile CPUs might lie in range
+from 100 to 500.  For reference, the approximate values for the SoC in
+ARM's Juno Development Platform are 530 for the Cortex-A57 cluster and
+140 for the Cortex-A53 cluster.
diff --git a/Documentation/thermal/cpu-cooling-api.txt b/Documentation/thermal/cpu-cooling-api.txt
deleted file mode 100644
index 7df567eaea1a..000000000000
--- a/Documentation/thermal/cpu-cooling-api.txt
+++ /dev/null
@@ -1,92 +0,0 @@
-CPU cooling APIs How To
-===================================
-
-Written by Amit Daniel Kachhap <amit.kachhap@linaro.org>
-
-Updated: 6 Jan 2015
-
-Copyright (c)  2012 Samsung Electronics Co., Ltd(http://www.samsung.com)
-
-0. Introduction
-
-The generic cpu cooling(freq clipping) provides registration/unregistration APIs
-to the caller. The binding of the cooling devices to the trip point is left for
-the user. The registration APIs returns the cooling device pointer.
-
-1. cpu cooling APIs
-
-1.1 cpufreq registration/unregistration APIs
-1.1.1 struct thermal_cooling_device *cpufreq_cooling_register(
-	struct cpumask *clip_cpus)
-
-    This interface function registers the cpufreq cooling device with the name
-    "thermal-cpufreq-%x". This api can support multiple instances of cpufreq
-    cooling devices.
-
-   clip_cpus: cpumask of cpus where the frequency constraints will happen.
-
-1.1.2 struct thermal_cooling_device *of_cpufreq_cooling_register(
-					struct cpufreq_policy *policy)
-
-    This interface function registers the cpufreq cooling device with
-    the name "thermal-cpufreq-%x" linking it with a device tree node, in
-    order to bind it via the thermal DT code. This api can support multiple
-    instances of cpufreq cooling devices.
-
-    policy: CPUFreq policy.
-
-1.1.3 void cpufreq_cooling_unregister(struct thermal_cooling_device *cdev)
-
-    This interface function unregisters the "thermal-cpufreq-%x" cooling device.
-
-    cdev: Cooling device pointer which has to be unregistered.
-
-2. Power models
-
-The power API registration functions provide a simple power model for
-CPUs.  The current power is calculated as dynamic power (static power isn't
-supported currently).  This power model requires that the operating-points of
-the CPUs are registered using the kernel's opp library and the
-`cpufreq_frequency_table` is assigned to the `struct device` of the
-cpu.  If you are using CONFIG_CPUFREQ_DT then the
-`cpufreq_frequency_table` should already be assigned to the cpu
-device.
-
-The dynamic power consumption of a processor depends on many factors.
-For a given processor implementation the primary factors are:
-
-- The time the processor spends running, consuming dynamic power, as
-  compared to the time in idle states where dynamic consumption is
-  negligible.  Herein we refer to this as 'utilisation'.
-- The voltage and frequency levels as a result of DVFS.  The DVFS
-  level is a dominant factor governing power consumption.
-- In running time the 'execution' behaviour (instruction types, memory
-  access patterns and so forth) causes, in most cases, a second order
-  variation.  In pathological cases this variation can be significant,
-  but typically it is of a much lesser impact than the factors above.
-
-A high level dynamic power consumption model may then be represented as:
-
-Pdyn = f(run) * Voltage^2 * Frequency * Utilisation
-
-f(run) here represents the described execution behaviour and its
-result has a units of Watts/Hz/Volt^2 (this often expressed in
-mW/MHz/uVolt^2)
-
-The detailed behaviour for f(run) could be modelled on-line.  However,
-in practice, such an on-line model has dependencies on a number of
-implementation specific processor support and characterisation
-factors.  Therefore, in initial implementation that contribution is
-represented as a constant coefficient.  This is a simplification
-consistent with the relative contribution to overall power variation.
-
-In this simplified representation our model becomes:
-
-Pdyn = Capacitance * Voltage^2 * Frequency * Utilisation
-
-Where `capacitance` is a constant that represents an indicative
-running time dynamic power coefficient in fundamental units of
-mW/MHz/uVolt^2.  Typical values for mobile CPUs might lie in range
-from 100 to 500.  For reference, the approximate values for the SoC in
-ARM's Juno Development Platform are 530 for the Cortex-A57 cluster and
-140 for the Cortex-A53 cluster.
diff --git a/Documentation/thermal/exynos_thermal b/Documentation/thermal/exynos_thermal
deleted file mode 100644
index 9010c4416967..000000000000
--- a/Documentation/thermal/exynos_thermal
+++ /dev/null
@@ -1,77 +0,0 @@
-Kernel driver exynos_tmu
-=================
-
-Supported chips:
-* ARM SAMSUNG EXYNOS4, EXYNOS5 series of SoC
-  Datasheet: Not publicly available
-
-Authors: Donggeun Kim <dg77.kim@samsung.com>
-Authors: Amit Daniel <amit.daniel@samsung.com>
-
-TMU controller Description:
----------------------------
-
-This driver allows to read temperature inside SAMSUNG EXYNOS4/5 series of SoC.
-
-The chip only exposes the measured 8-bit temperature code value
-through a register.
-Temperature can be taken from the temperature code.
-There are three equations converting from temperature to temperature code.
-
-The three equations are:
-  1. Two point trimming
-	Tc = (T - 25) * (TI2 - TI1) / (85 - 25) + TI1
-
-  2. One point trimming
-	Tc = T + TI1 - 25
-
-  3. No trimming
-	Tc = T + 50
-
-  Tc: Temperature code, T: Temperature,
-  TI1: Trimming info for 25 degree Celsius (stored at TRIMINFO register)
-       Temperature code measured at 25 degree Celsius which is unchanged
-  TI2: Trimming info for 85 degree Celsius (stored at TRIMINFO register)
-       Temperature code measured at 85 degree Celsius which is unchanged
-
-TMU(Thermal Management Unit) in EXYNOS4/5 generates interrupt
-when temperature exceeds pre-defined levels.
-The maximum number of configurable threshold is five.
-The threshold levels are defined as follows:
-  Level_0: current temperature > trigger_level_0 + threshold
-  Level_1: current temperature > trigger_level_1 + threshold
-  Level_2: current temperature > trigger_level_2 + threshold
-  Level_3: current temperature > trigger_level_3 + threshold
-
-  The threshold and each trigger_level are set
-  through the corresponding registers.
-
-When an interrupt occurs, this driver notify kernel thermal framework
-with the function exynos_report_trigger.
-Although an interrupt condition for level_0 can be set,
-it can be used to synchronize the cooling action.
-
-TMU driver description:
------------------------
-
-The exynos thermal driver is structured as,
-
-					Kernel Core thermal framework
-				(thermal_core.c, step_wise.c, cpu_cooling.c)
-								^
-								|
-								|
-TMU configuration data -------> TMU Driver  <------> Exynos Core thermal wrapper
-(exynos_tmu_data.c)	      (exynos_tmu.c)	   (exynos_thermal_common.c)
-(exynos_tmu_data.h)	      (exynos_tmu.h)	   (exynos_thermal_common.h)
-
-a) TMU configuration data: This consist of TMU register offsets/bitfields
-		described through structure exynos_tmu_registers. Also several
-		other platform data (struct exynos_tmu_platform_data) members
-		are used to configure the TMU.
-b) TMU driver: This component initialises the TMU controller and sets different
-		thresholds. It invokes core thermal implementation with the call
-		exynos_report_trigger.
-c) Exynos Core thermal wrapper: This provides 3 wrapper function to use the
-		Kernel core thermal framework. They are exynos_unregister_thermal,
-		exynos_register_thermal and exynos_report_trigger.
diff --git a/Documentation/thermal/exynos_thermal.rst b/Documentation/thermal/exynos_thermal.rst
new file mode 100644
index 000000000000..5bd556566c70
--- /dev/null
+++ b/Documentation/thermal/exynos_thermal.rst
@@ -0,0 +1,90 @@
+========================
+Kernel driver exynos_tmu
+========================
+
+Supported chips:
+
+* ARM SAMSUNG EXYNOS4, EXYNOS5 series of SoC
+
+  Datasheet: Not publicly available
+
+Authors: Donggeun Kim <dg77.kim@samsung.com>
+Authors: Amit Daniel <amit.daniel@samsung.com>
+
+TMU controller Description:
+---------------------------
+
+This driver allows reading the temperature inside SAMSUNG EXYNOS4/5 series of SoC.
+
+The chip only exposes the measured 8-bit temperature code value
+through a register.
+Temperature can be taken from the temperature code.
+There are three equations converting from temperature to temperature code.
+
+The three equations are:
+  1. Two point trimming::
+
+	Tc = (T - 25) * (TI2 - TI1) / (85 - 25) + TI1
+
+  2. One point trimming::
+
+	Tc = T + TI1 - 25
+
+  3. No trimming::
+
+	Tc = T + 50
+
+  Tc:
+       Temperature code, T: Temperature,
+  TI1:
+       Trimming info for 25 degree Celsius (stored at TRIMINFO register)
+       Temperature code measured at 25 degree Celsius which is unchanged
+  TI2:
+       Trimming info for 85 degree Celsius (stored at TRIMINFO register)
+       Temperature code measured at 85 degree Celsius which is unchanged
+
+TMU(Thermal Management Unit) in EXYNOS4/5 generates interrupt
+when temperature exceeds pre-defined levels.
+The maximum number of configurable thresholds is five.
+The threshold levels are defined as follows::
+
+  Level_0: current temperature > trigger_level_0 + threshold
+  Level_1: current temperature > trigger_level_1 + threshold
+  Level_2: current temperature > trigger_level_2 + threshold
+  Level_3: current temperature > trigger_level_3 + threshold
+
+The threshold and each trigger_level are set
+through the corresponding registers.
+
+When an interrupt occurs, this driver notifies the kernel thermal framework
+with the function exynos_report_trigger.
+Although an interrupt condition for level_0 can be set,
+it can be used to synchronize the cooling action.
+
+TMU driver description:
+-----------------------
+
+The exynos thermal driver is structured as::
+
+					Kernel Core thermal framework
+				(thermal_core.c, step_wise.c, cpu_cooling.c)
+								^
+								|
+								|
+  TMU configuration data -----> TMU Driver  <----> Exynos Core thermal wrapper
+  (exynos_tmu_data.c)	      (exynos_tmu.c)	   (exynos_thermal_common.c)
+  (exynos_tmu_data.h)	      (exynos_tmu.h)	   (exynos_thermal_common.h)
+
+a) TMU configuration data:
+		This consists of TMU register offsets/bitfields
+		described through structure exynos_tmu_registers. Also several
+		other platform data (struct exynos_tmu_platform_data) members
+		are used to configure the TMU.
+b) TMU driver:
+		This component initialises the TMU controller and sets different
+		thresholds. It invokes core thermal implementation with the call
+		exynos_report_trigger.
+c) Exynos Core thermal wrapper:
+		This provides 3 wrapper functions to use the
+		Kernel core thermal framework. They are exynos_unregister_thermal,
+		exynos_register_thermal and exynos_report_trigger.
diff --git a/Documentation/thermal/exynos_thermal_emulation b/Documentation/thermal/exynos_thermal_emulation
deleted file mode 100644
index b15efec6ca28..000000000000
--- a/Documentation/thermal/exynos_thermal_emulation
+++ /dev/null
@@ -1,53 +0,0 @@
-EXYNOS EMULATION MODE
-========================
-
-Copyright (C) 2012 Samsung Electronics
-
-Written by Jonghwa Lee <jonghwa3.lee@samsung.com>
-
-Description
------------
-
-Exynos 4x12 (4212, 4412) and 5 series provide emulation mode for thermal management unit.
-Thermal emulation mode supports software debug for TMU's operation. User can set temperature
-manually with software code and TMU will read current temperature from user value not from
-sensor's value.
-
-Enabling CONFIG_THERMAL_EMULATION option will make this support available.
-When it's enabled, sysfs node will be created as
-/sys/devices/virtual/thermal/thermal_zone'zone id'/emul_temp.
-
-The sysfs node, 'emul_node', will contain value 0 for the initial state. When you input any
-temperature you want to update to sysfs node, it automatically enable emulation mode and
-current temperature will be changed into it.
-(Exynos also supports user changeable delay time which would be used to delay of
- changing temperature. However, this node only uses same delay of real sensing time, 938us.)
-
-Exynos emulation mode requires synchronous of value changing and enabling. It means when you
-want to update the any value of delay or next temperature, then you have to enable emulation
-mode at the same time. (Or you have to keep the mode enabling.) If you don't, it fails to
-change the value to updated one and just use last succeessful value repeatedly. That's why
-this node gives users the right to change termerpature only. Just one interface makes it more
-simply to use.
-
-Disabling emulation mode only requires writing value 0 to sysfs node.
-
-
-TEMP	120 |
-	    |
-	100 |
-	    |
-	 80 |
-	    |		     	 	 +-----------
-	 60 |      		     	 |	    |
-	    |	           +-------------|          |
-	 40 |              |         	 |          |
-	    |		   |	     	 |          |
-	 20 |		   |	     	 |          +----------
-	    |	 	   |	     	 |          |          |
-	  0 |______________|_____________|__________|__________|_________
-		   A	    	 A	    A	   	       A     TIME
-		   |<----->|	 |<----->|  |<----->|	       |
-		   | 938us |  	 |	 |  |       |          |
-emulation    :  0  50	   |  	 70      |  20      |          0
-current temp :   sensor   50		 70         20	      sensor
diff --git a/Documentation/thermal/exynos_thermal_emulation.rst b/Documentation/thermal/exynos_thermal_emulation.rst
new file mode 100644
index 000000000000..c21d10838bc5
--- /dev/null
+++ b/Documentation/thermal/exynos_thermal_emulation.rst
@@ -0,0 +1,61 @@
+=====================
+Exynos Emulation Mode
+=====================
+
+Copyright (C) 2012 Samsung Electronics
+
+Written by Jonghwa Lee <jonghwa3.lee@samsung.com>
+
+Description
+-----------
+
+Exynos 4x12 (4212, 4412) and 5 series provide emulation mode for thermal
+management unit. Thermal emulation mode supports software debug for
+TMU's operation. User can set temperature manually with software code
+and TMU will read current temperature from user value not from sensor's
+value.
+
+Enabling CONFIG_THERMAL_EMULATION option will make this support
+available. When it's enabled, sysfs node will be created as
+/sys/devices/virtual/thermal/thermal_zone'zone id'/emul_temp.
+
+The sysfs node, 'emul_temp', will contain value 0 for the initial state.
+When you input any temperature you want to update to sysfs node, it
+automatically enables emulation mode and current temperature will be
+changed into it.
+
+(Exynos also supports user changeable delay time which would be used to
+delay of changing temperature. However, this node only uses same delay
+of real sensing time, 938us.)
+
+Exynos emulation mode requires that changing the value and enabling
+the mode happen synchronously. It means when you want to update any
+value of delay or next temperature, you have to enable emulation mode at
+the same time. (Or you have to keep the mode enabled.) If you don't, it
+fails to change the value to the updated one and just uses the last
+successful value repeatedly. That's why this node gives users the right to
+change temperature only. Just one interface makes it simpler to use.
+
+Disabling emulation mode only requires writing value 0 to sysfs node.
+
+::
+
+
+  TEMP	120 |
+	    |
+	100 |
+	    |
+	 80 |
+	    |				 +-----------
+	 60 |      			 |	    |
+	    |		   +-------------|          |
+	 40 |              |         	 |          |
+	    |		   |		 |          |
+	 20 |		   |		 |          +----------
+	    |		   |		 |          |          |
+	  0 |______________|_____________|__________|__________|_________
+		   A		 A	    A		       A     TIME
+		   |<----->|	 |<----->|  |<----->|	       |
+		   | 938us |  	 |	 |  |       |          |
+  emulation   : 0  50	   |  	 70      |  20      |          0
+  current temp:   sensor   50		 70         20	      sensor
diff --git a/Documentation/thermal/index.rst b/Documentation/thermal/index.rst
new file mode 100644
index 000000000000..8c1c00146cad
--- /dev/null
+++ b/Documentation/thermal/index.rst
@@ -0,0 +1,18 @@
+:orphan:
+
+=======
+Thermal
+=======
+
+.. toctree::
+   :maxdepth: 1
+
+   cpu-cooling-api
+   sysfs-api
+   power_allocator
+
+   exynos_thermal
+   exynos_thermal_emulation
+   intel_powerclamp
+   nouveau_thermal
+   x86_pkg_temperature_thermal
diff --git a/Documentation/thermal/intel_powerclamp.rst b/Documentation/thermal/intel_powerclamp.rst
new file mode 100644
index 000000000000..3f6dfb0b3ea6
--- /dev/null
+++ b/Documentation/thermal/intel_powerclamp.rst
@@ -0,0 +1,320 @@
+=======================
+Intel Powerclamp Driver
+=======================
+
+By:
+  - Arjan van de Ven <arjan@linux.intel.com>
+  - Jacob Pan <jacob.jun.pan@linux.intel.com>
+
+.. Contents:
+
+	(*) Introduction
+	    - Goals and Objectives
+
+	(*) Theory of Operation
+	    - Idle Injection
+	    - Calibration
+
+	(*) Performance Analysis
+	    - Effectiveness and Limitations
+	    - Power vs Performance
+	    - Scalability
+	    - Calibration
+	    - Comparison with Alternative Techniques
+
+	(*) Usage and Interfaces
+	    - Generic Thermal Layer (sysfs)
+	    - Kernel APIs (TBD)
+
+INTRODUCTION
+============
+
+Consider the situation where a system’s power consumption must be
+reduced at runtime, due to power budget, thermal constraint, or noise
+level, and where active cooling is not preferred. Software managed
+passive power reduction must be performed to prevent the hardware
+actions that are designed for catastrophic scenarios.
+
+Currently, P-states, T-states (clock modulation), and CPU offlining
+are used for CPU throttling.
+
+On Intel CPUs, C-states provide effective power reduction, but so far
+they’re only used opportunistically, based on workload. With the
+development of intel_powerclamp driver, the method of synchronizing
+idle injection across all online CPU threads was introduced. The goal
+is to achieve forced and controllable C-state residency.
+
+Test/Analysis has been made in the areas of power, performance,
+scalability, and user experience. In many cases, clear advantage is
+shown over taking the CPU offline or modulating the CPU clock.
+
+
+THEORY OF OPERATION
+===================
+
+Idle Injection
+--------------
+
+On modern Intel processors (Nehalem or later), package level C-state
+residency is available in MSRs, thus also available to the kernel.
+
+These MSRs are::
+
+      #define MSR_PKG_C2_RESIDENCY      0x60D
+      #define MSR_PKG_C3_RESIDENCY      0x3F8
+      #define MSR_PKG_C6_RESIDENCY      0x3F9
+      #define MSR_PKG_C7_RESIDENCY      0x3FA
+
+If the kernel can also inject idle time to the system, then a
+closed-loop control system can be established that manages package
+level C-state. The intel_powerclamp driver is conceived as such a
+control system, where the target set point is a user-selected idle
+ratio (based on power reduction), and the error is the difference
+between the actual package level C-state residency ratio and the target idle
+ratio.
+
+Injection is controlled by high priority kernel threads, spawned for
+each online CPU.
+
+These kernel threads, with SCHED_FIFO class, are created to perform
+clamping actions of controlled duty ratio and duration. Each per-CPU
+thread synchronizes its idle time and duration, based on the rounding
+of jiffies, so accumulated errors can be prevented to avoid a jittery
+effect. Threads are also bound to the CPU such that they cannot be
+migrated, unless the CPU is taken offline. In this case, threads
+belonging to the offlined CPUs will be terminated immediately.
+
+Running as SCHED_FIFO and relatively high priority, also allows such
+scheme to work for both preemptable and non-preemptable kernels.
+Alignment of idle time around jiffies ensures scalability for HZ
+values. This effect can be better visualized using a Perf timechart.
+The following diagram shows the behavior of kernel thread
+kidle_inject/cpu. During idle injection, it runs monitor/mwait idle
+for a given "duration", then relinquishes the CPU to other tasks,
+until the next time interval.
+
+The NOHZ schedule tick is disabled during idle time, but interrupts
+are not masked. Tests show that the extra wakeups from scheduler tick
+have a dramatic impact on the effectiveness of the powerclamp driver
+on large scale systems (Westmere system with 80 processors).
+
+::
+
+  CPU0
+		    ____________          ____________
+  kidle_inject/0   |   sleep    |  mwait |  sleep     |
+	  _________|            |________|            |_______
+				 duration
+  CPU1
+		    ____________          ____________
+  kidle_inject/1   |   sleep    |  mwait |  sleep     |
+	  _________|            |________|            |_______
+				^
+				|
+				|
+				roundup(jiffies, interval)
+
+Only one CPU is allowed to collect statistics and update global
+control parameters. This CPU is referred to as the controlling CPU in
+this document. The controlling CPU is elected at runtime, with a
+policy that favors BSP, taking into account the possibility of a CPU
+hot-plug.
+
+In terms of dynamics of the idle control system, package level idle
+time is considered largely as a non-causal system where its behavior
+cannot be based on the past or current input. Therefore, the
+intel_powerclamp driver attempts to enforce the desired idle time
+instantly as given input (target idle ratio). After injection,
+powerclamp monitors the actual idle for a given time window and adjusts
+the next injection accordingly to avoid over/under correction.
+
+When used in a causal control system, such as a temperature control,
+it is up to the user of this driver to implement algorithms where
+past samples and outputs are included in the feedback. For example, a
+PID-based thermal controller can use the powerclamp driver to
+maintain a desired target temperature, based on integral and
+derivative gains of the past samples.
+
+
+
+Calibration
+-----------
+During scalability testing, it is observed that synchronized actions
+among CPUs become challenging as the number of cores grows. This is
+also true for the ability of a system to enter package level C-states.
+
+To make sure the intel_powerclamp driver scales well, online
+calibration is implemented. The goals for doing such a calibration
+are:
+
+a) determine the effective range of idle injection ratio
+b) determine the amount of compensation needed at each target ratio
+
+Compensation to each target ratio consists of two parts:
+
+	a) steady state error compensation
+	This is to offset the error occurring when the system can
+	enter idle without extra wakeups (such as external interrupts).
+
+	b) dynamic error compensation
+	When an excessive amount of wakeups occurs during idle, an
+	additional idle ratio can be added to quiet interrupts, by
+	slowing down CPU activities.
+
+A debugfs file is provided for the user to examine compensation
+progress and results, such as on a Westmere system::
+
+  [jacob@nex01 ~]$ cat
+  /sys/kernel/debug/intel_powerclamp/powerclamp_calib
+  controlling cpu: 0
+  pct confidence steady dynamic (compensation)
+  0       0       0       0
+  1       1       0       0
+  2       1       1       0
+  3       3       1       0
+  4       3       1       0
+  5       3       1       0
+  6       3       1       0
+  7       3       1       0
+  8       3       1       0
+  ...
+  30      3       2       0
+  31      3       2       0
+  32      3       1       0
+  33      3       2       0
+  34      3       1       0
+  35      3       2       0
+  36      3       1       0
+  37      3       2       0
+  38      3       1       0
+  39      3       2       0
+  40      3       3       0
+  41      3       1       0
+  42      3       2       0
+  43      3       1       0
+  44      3       1       0
+  45      3       2       0
+  46      3       3       0
+  47      3       0       0
+  48      3       2       0
+  49      3       3       0
+
+Calibration occurs during runtime. No offline method is available.
+Steady state compensation is used only when confidence levels of all
+adjacent ratios have reached satisfactory level. A confidence level
+is accumulated based on clean data collected at runtime. Data
+collected during a period without extra interrupts is considered
+clean.
+
+To compensate for excessive amounts of wakeup during idle, additional
+idle time is injected when such a condition is detected. Currently,
+we have a simple algorithm to double the injection ratio. A possible
+enhancement might be to throttle the offending IRQ, such as delaying
+EOI for level triggered interrupts. But it is a challenge to be
+non-intrusive to the scheduler or the IRQ core code.
+
+
+CPU Online/Offline
+------------------
+Per-CPU kernel threads are started/stopped upon receiving
+notifications of CPU hotplug activities. The intel_powerclamp driver
+keeps track of clamping kernel threads, even after they are migrated
+to other CPUs, after a CPU offline event.
+
+
+Performance Analysis
+====================
+This section describes the general performance data collected on
+multiple systems, including Westmere (80P) and Ivy Bridge (4P, 8P).
+
+Effectiveness and Limitations
+-----------------------------
+The maximum range that idle injection is allowed is capped at 50
+percent. As mentioned earlier, since interrupts are allowed during
+forced idle time, excessive interrupts could result in less
+effectiveness. The extreme case would be doing a ping -f to generate
+flooded network interrupts without much CPU acknowledgement. In this
+case, little can be done from the idle injection threads. In most
+normal cases, such as scp a large file, applications can be throttled
+by the powerclamp driver, since slowing down the CPU also slows down
+network protocol processing, which in turn reduces interrupts.
+
+When control parameters change at runtime by the controlling CPU, it
+may take an additional period for the rest of the CPUs to catch up
+with the changes. During this time, idle injection is out of sync,
+thus not able to enter package C-states at the expected ratio. But
+this effect is minor, in that in most cases change to the target
+ratio is updated much less frequently than the idle injection
+frequency.
+
+Scalability
+-----------
+Tests also show a minor, but measurable, difference between the 4P/8P
+Ivy Bridge system and the 80P Westmere server under 50% idle ratio.
+More compensation is needed on Westmere for the same amount of
+target idle ratio. The compensation also increases as the idle ratio
+gets larger. The above reason constitutes the need for the
+calibration code.
+
+On the IVB 8P system, compared to an offline CPU, powerclamp can
+achieve up to 40% better performance per watt. (measured by a spin
+counter summed over per CPU counting threads spawned for all running
+CPUs).
+
+Usage and Interfaces
+====================
+The powerclamp driver is registered to the generic thermal layer as a
+cooling device. Currently, it’s not bound to any thermal zones::
+
+  jacob@chromoly:/sys/class/thermal/cooling_device14$ grep . *
+  cur_state:0
+  max_state:50
+  type:intel_powerclamp
+
+cur_state allows user to set the desired idle percentage. Writing 0 to
+cur_state will stop idle injection. Writing a value between 1 and
+max_state will start the idle injection. Reading cur_state returns the
+actual and current idle percentage. This may not be the same value
+set by the user in that current idle percentage depends on workload
+and includes natural idle. When idle injection is disabled, reading
+cur_state returns value -1 instead of 0 which is to avoid confusing
+100% busy state with the disabled state.
+
+Example usage:
+- To inject 25% idle time::
+
+	$ sudo sh -c "echo 25 > /sys/class/thermal/cooling_device80/cur_state"
+
+If the system is not busy and has more than 25% idle time already,
+then the powerclamp driver will not start idle injection. Using Top
+will not show idle injection kernel threads.
+
+If the system is busy (spin test below) and has less than 25% natural
+idle time, powerclamp kernel threads will do idle injection. Forced
+idle time is accounted as normal idle in that common code path is
+taken as the idle task.
+
+In this example, 24.1% idle is shown. This helps the system admin or
+user determine the cause of slowdown, when a powerclamp driver is in action::
+
+
+  Tasks: 197 total,   1 running, 196 sleeping,   0 stopped,   0 zombie
+  Cpu(s): 71.2%us,  4.7%sy,  0.0%ni, 24.1%id,  0.0%wa,  0.0%hi,  0.0%si,  0.0%st
+  Mem:   3943228k total,  1689632k used,  2253596k free,    74960k buffers
+  Swap:  4087804k total,        0k used,  4087804k free,   945336k cached
+
+    PID USER      PR  NI  VIRT  RES  SHR S %CPU %MEM    TIME+  COMMAND
+   3352 jacob     20   0  262m  644  428 S  286  0.0   0:17.16 spin
+   3341 root     -51   0     0    0    0 D   25  0.0   0:01.62 kidle_inject/0
+   3344 root     -51   0     0    0    0 D   25  0.0   0:01.60 kidle_inject/3
+   3342 root     -51   0     0    0    0 D   25  0.0   0:01.61 kidle_inject/1
+   3343 root     -51   0     0    0    0 D   25  0.0   0:01.60 kidle_inject/2
+   2935 jacob     20   0  696m 125m  35m S    5  3.3   0:31.11 firefox
+   1546 root      20   0  158m  20m 6640 S    3  0.5   0:26.97 Xorg
+   2100 jacob     20   0 1223m  88m  30m S    3  2.3   0:23.68 compiz
+
+Tests have shown that by using the powerclamp driver as a cooling
+device, a PID based userspace thermal controller can manage to
+control CPU temperature effectively, when no other thermal influence
+is added. For example, an UltraBook user can compile the kernel under
+certain temperature (below most active trip points).
diff --git a/Documentation/thermal/intel_powerclamp.txt b/Documentation/thermal/intel_powerclamp.txt
deleted file mode 100644
index b5df21168fbc..000000000000
--- a/Documentation/thermal/intel_powerclamp.txt
+++ /dev/null
@@ -1,317 +0,0 @@
-			 =======================
-			 INTEL POWERCLAMP DRIVER
-			 =======================
-By: Arjan van de Ven <arjan@linux.intel.com>
-    Jacob Pan <jacob.jun.pan@linux.intel.com>
-
-Contents:
-	(*) Introduction
-	    - Goals and Objectives
-
-	(*) Theory of Operation
-	    - Idle Injection
-	    - Calibration
-
-	(*) Performance Analysis
-	    - Effectiveness and Limitations
-	    - Power vs Performance
-	    - Scalability
-	    - Calibration
-	    - Comparison with Alternative Techniques
-
-	(*) Usage and Interfaces
-	    - Generic Thermal Layer (sysfs)
-	    - Kernel APIs (TBD)
-
-============
-INTRODUCTION
-============
-
-Consider the situation where a system’s power consumption must be
-reduced at runtime, due to power budget, thermal constraint, or noise
-level, and where active cooling is not preferred. Software managed
-passive power reduction must be performed to prevent the hardware
-actions that are designed for catastrophic scenarios.
-
-Currently, P-states, T-states (clock modulation), and CPU offlining
-are used for CPU throttling.
-
-On Intel CPUs, C-states provide effective power reduction, but so far
-they’re only used opportunistically, based on workload. With the
-development of intel_powerclamp driver, the method of synchronizing
-idle injection across all online CPU threads was introduced. The goal
-is to achieve forced and controllable C-state residency.
-
-Test/Analysis has been made in the areas of power, performance,
-scalability, and user experience. In many cases, clear advantage is
-shown over taking the CPU offline or modulating the CPU clock.
-
-
-===================
-THEORY OF OPERATION
-===================
-
-Idle Injection
---------------
-
-On modern Intel processors (Nehalem or later), package level C-state
-residency is available in MSRs, thus also available to the kernel.
-
-These MSRs are:
-      #define MSR_PKG_C2_RESIDENCY	0x60D
-      #define MSR_PKG_C3_RESIDENCY	0x3F8
-      #define MSR_PKG_C6_RESIDENCY	0x3F9
-      #define MSR_PKG_C7_RESIDENCY	0x3FA
-
-If the kernel can also inject idle time to the system, then a
-closed-loop control system can be established that manages package
-level C-state. The intel_powerclamp driver is conceived as such a
-control system, where the target set point is a user-selected idle
-ratio (based on power reduction), and the error is the difference
-between the actual package level C-state residency ratio and the target idle
-ratio.
-
-Injection is controlled by high priority kernel threads, spawned for
-each online CPU.
-
-These kernel threads, with SCHED_FIFO class, are created to perform
-clamping actions of controlled duty ratio and duration. Each per-CPU
-thread synchronizes its idle time and duration, based on the rounding
-of jiffies, so accumulated errors can be prevented to avoid a jittery
-effect. Threads are also bound to the CPU such that they cannot be
-migrated, unless the CPU is taken offline. In this case, threads
-belong to the offlined CPUs will be terminated immediately.
-
-Running as SCHED_FIFO and relatively high priority, also allows such
-scheme to work for both preemptable and non-preemptable kernels.
-Alignment of idle time around jiffies ensures scalability for HZ
-values. This effect can be better visualized using a Perf timechart.
-The following diagram shows the behavior of kernel thread
-kidle_inject/cpu. During idle injection, it runs monitor/mwait idle
-for a given "duration", then relinquishes the CPU to other tasks,
-until the next time interval.
-
-The NOHZ schedule tick is disabled during idle time, but interrupts
-are not masked. Tests show that the extra wakeups from scheduler tick
-have a dramatic impact on the effectiveness of the powerclamp driver
-on large scale systems (Westmere system with 80 processors).
-
-CPU0
-		  ____________          ____________
-kidle_inject/0   |   sleep    |  mwait |  sleep     |
-	_________|            |________|            |_______
-			       duration
-CPU1
-		  ____________          ____________
-kidle_inject/1   |   sleep    |  mwait |  sleep     |
-	_________|            |________|            |_______
-			      ^
-			      |
-			      |
-			      roundup(jiffies, interval)
-
-Only one CPU is allowed to collect statistics and update global
-control parameters. This CPU is referred to as the controlling CPU in
-this document. The controlling CPU is elected at runtime, with a
-policy that favors BSP, taking into account the possibility of a CPU
-hot-plug.
-
-In terms of dynamics of the idle control system, package level idle
-time is considered largely as a non-causal system where its behavior
-cannot be based on the past or current input. Therefore, the
-intel_powerclamp driver attempts to enforce the desired idle time
-instantly as given input (target idle ratio). After injection,
-powerclamp monitors the actual idle for a given time window and adjust
-the next injection accordingly to avoid over/under correction.
-
-When used in a causal control system, such as a temperature control,
-it is up to the user of this driver to implement algorithms where
-past samples and outputs are included in the feedback. For example, a
-PID-based thermal controller can use the powerclamp driver to
-maintain a desired target temperature, based on integral and
-derivative gains of the past samples.
-
-
-
-Calibration
------------
-During scalability testing, it is observed that synchronized actions
-among CPUs become challenging as the number of cores grows. This is
-also true for the ability of a system to enter package level C-states.
-
-To make sure the intel_powerclamp driver scales well, online
-calibration is implemented. The goals for doing such a calibration
-are:
-
-a) determine the effective range of idle injection ratio
-b) determine the amount of compensation needed at each target ratio
-
-Compensation to each target ratio consists of two parts:
-
-        a) steady state error compensation
-	This is to offset the error occurring when the system can
-	enter idle without extra wakeups (such as external interrupts).
-
-	b) dynamic error compensation
-	When an excessive amount of wakeups occurs during idle, an
-	additional idle ratio can be added to quiet interrupts, by
-	slowing down CPU activities.
-
-A debugfs file is provided for the user to examine compensation
-progress and results, such as on a Westmere system.
-[jacob@nex01 ~]$ cat
-/sys/kernel/debug/intel_powerclamp/powerclamp_calib
-controlling cpu: 0
-pct confidence steady dynamic (compensation)
-0	0	0	0
-1	1	0	0
-2	1	1	0
-3	3	1	0
-4	3	1	0
-5	3	1	0
-6	3	1	0
-7	3	1	0
-8	3	1	0
-...
-30	3	2	0
-31	3	2	0
-32	3	1	0
-33	3	2	0
-34	3	1	0
-35	3	2	0
-36	3	1	0
-37	3	2	0
-38	3	1	0
-39	3	2	0
-40	3	3	0
-41	3	1	0
-42	3	2	0
-43	3	1	0
-44	3	1	0
-45	3	2	0
-46	3	3	0
-47	3	0	0
-48	3	2	0
-49	3	3	0
-
-Calibration occurs during runtime. No offline method is available.
-Steady state compensation is used only when confidence levels of all
-adjacent ratios have reached satisfactory level. A confidence level
-is accumulated based on clean data collected at runtime. Data
-collected during a period without extra interrupts is considered
-clean.
-
-To compensate for excessive amounts of wakeup during idle, additional
-idle time is injected when such a condition is detected. Currently,
-we have a simple algorithm to double the injection ratio. A possible
-enhancement might be to throttle the offending IRQ, such as delaying
-EOI for level triggered interrupts. But it is a challenge to be
-non-intrusive to the scheduler or the IRQ core code.
-
-
-CPU Online/Offline
-------------------
-Per-CPU kernel threads are started/stopped upon receiving
-notifications of CPU hotplug activities. The intel_powerclamp driver
-keeps track of clamping kernel threads, even after they are migrated
-to other CPUs, after a CPU offline event.
-
-
-=====================
-Performance Analysis
-=====================
-This section describes the general performance data collected on
-multiple systems, including Westmere (80P) and Ivy Bridge (4P, 8P).
-
-Effectiveness and Limitations
------------------------------
-The maximum range that idle injection is allowed is capped at 50
-percent. As mentioned earlier, since interrupts are allowed during
-forced idle time, excessive interrupts could result in less
-effectiveness. The extreme case would be doing a ping -f to generated
-flooded network interrupts without much CPU acknowledgement. In this
-case, little can be done from the idle injection threads. In most
-normal cases, such as scp a large file, applications can be throttled
-by the powerclamp driver, since slowing down the CPU also slows down
-network protocol processing, which in turn reduces interrupts.
-
-When control parameters change at runtime by the controlling CPU, it
-may take an additional period for the rest of the CPUs to catch up
-with the changes. During this time, idle injection is out of sync,
-thus not able to enter package C- states at the expected ratio. But
-this effect is minor, in that in most cases change to the target
-ratio is updated much less frequently than the idle injection
-frequency.
-
-Scalability
------------
-Tests also show a minor, but measurable, difference between the 4P/8P
-Ivy Bridge system and the 80P Westmere server under 50% idle ratio.
-More compensation is needed on Westmere for the same amount of
-target idle ratio. The compensation also increases as the idle ratio
-gets larger. The above reason constitutes the need for the
-calibration code.
-
-On the IVB 8P system, compared to an offline CPU, powerclamp can
-achieve up to 40% better performance per watt. (measured by a spin
-counter summed over per CPU counting threads spawned for all running
-CPUs).
-
-====================
-Usage and Interfaces
-====================
-The powerclamp driver is registered to the generic thermal layer as a
-cooling device. Currently, it’s not bound to any thermal zones.
-
-jacob@chromoly:/sys/class/thermal/cooling_device14$ grep . *
-cur_state:0
-max_state:50
-type:intel_powerclamp
-
-cur_state allows user to set the desired idle percentage. Writing 0 to
-cur_state will stop idle injection. Writing a value between 1 and
-max_state will start the idle injection. Reading cur_state returns the
-actual and current idle percentage. This may not be the same value
-set by the user in that current idle percentage depends on workload
-and includes natural idle. When idle injection is disabled, reading
-cur_state returns value -1 instead of 0 which is to avoid confusing
-100% busy state with the disabled state.
-
-Example usage:
-- To inject 25% idle time
-$ sudo sh -c "echo 25 > /sys/class/thermal/cooling_device80/cur_state
-"
-
-If the system is not busy and has more than 25% idle time already,
-then the powerclamp driver will not start idle injection. Using Top
-will not show idle injection kernel threads.
-
-If the system is busy (spin test below) and has less than 25% natural
-idle time, powerclamp kernel threads will do idle injection. Forced
-idle time is accounted as normal idle in that common code path is
-taken as the idle task.
-
-In this example, 24.1% idle is shown. This helps the system admin or
-user determine the cause of slowdown, when a powerclamp driver is in action.
-
-
-Tasks: 197 total,   1 running, 196 sleeping,   0 stopped,   0 zombie
-Cpu(s): 71.2%us,  4.7%sy,  0.0%ni, 24.1%id,  0.0%wa,  0.0%hi,  0.0%si,  0.0%st
-Mem:   3943228k total,  1689632k used,  2253596k free,    74960k buffers
-Swap:  4087804k total,        0k used,  4087804k free,   945336k cached
-
-  PID USER      PR  NI  VIRT  RES  SHR S %CPU %MEM    TIME+  COMMAND
- 3352 jacob     20   0  262m  644  428 S  286  0.0   0:17.16 spin
- 3341 root     -51   0     0    0    0 D   25  0.0   0:01.62 kidle_inject/0
- 3344 root     -51   0     0    0    0 D   25  0.0   0:01.60 kidle_inject/3
- 3342 root     -51   0     0    0    0 D   25  0.0   0:01.61 kidle_inject/1
- 3343 root     -51   0     0    0    0 D   25  0.0   0:01.60 kidle_inject/2
- 2935 jacob     20   0  696m 125m  35m S    5  3.3   0:31.11 firefox
- 1546 root      20   0  158m  20m 6640 S    3  0.5   0:26.97 Xorg
- 2100 jacob     20   0 1223m  88m  30m S    3  2.3   0:23.68 compiz
-
-Tests have shown that by using the powerclamp driver as a cooling
-device, a PID based userspace thermal controller can manage to
-control CPU temperature effectively, when no other thermal influence
-is added. For example, a UltraBook user can compile the kernel under
-certain temperature (below most active trip points).
diff --git a/Documentation/thermal/nouveau_thermal b/Documentation/thermal/nouveau_thermal
deleted file mode 100644
index 6e17a11efcb0..000000000000
--- a/Documentation/thermal/nouveau_thermal
+++ /dev/null
@@ -1,82 +0,0 @@
-Kernel driver nouveau
-===================
-
-Supported chips:
-* NV43+
-
-Authors: Martin Peres (mupuf) <martin.peres@free.fr>
-
-Description
----------
-
-This driver allows to read the GPU core temperature, drive the GPU fan and
-set temperature alarms.
-
-Currently, due to the absence of in-kernel API to access HWMON drivers, Nouveau
-cannot access any of the i2c external monitoring chips it may find. If you
-have one of those, temperature and/or fan management through Nouveau's HWMON
-interface is likely not to work. This document may then not cover your situation
-entirely.
-
-Temperature management
---------------------
-
-Temperature is exposed under as a read-only HWMON attribute temp1_input.
-
-In order to protect the GPU from overheating, Nouveau supports 4 configurable
-temperature thresholds:
-
- * Fan_boost: Fan speed is set to 100% when reaching this temperature;
- * Downclock: The GPU will be downclocked to reduce its power dissipation;
- * Critical: The GPU is put on hold to further lower power dissipation;
- * Shutdown: Shut the computer down to protect your GPU.
-
-WARNING: Some of these thresholds may not be used by Nouveau depending
-on your chipset.
-
-The default value for these thresholds comes from the GPU's vbios. These
-thresholds can be configured thanks to the following HWMON attributes:
-
- * Fan_boost: temp1_auto_point1_temp and temp1_auto_point1_temp_hyst;
- * Downclock: temp1_max and temp1_max_hyst;
- * Critical: temp1_crit and temp1_crit_hyst;
- * Shutdown: temp1_emergency and temp1_emergency_hyst.
-
-NOTE: Remember that the values are stored as milli degrees Celsius. Don't forget
-to multiply!
-
-Fan management
-------------
-
-Not all cards have a drivable fan. If you do, then the following HWMON
-attributes should be available:
-
- * pwm1_enable: Current fan management mode (NONE, MANUAL or AUTO);
- * pwm1: Current PWM value (power percentage);
- * pwm1_min: The minimum PWM speed allowed;
- * pwm1_max: The maximum PWM speed allowed (bypassed when hitting Fan_boost);
-
-You may also have the following attribute:
-
- * fan1_input: Speed in RPM of your fan.
-
-Your fan can be driven in different modes:
-
- * 0: The fan is left untouched;
- * 1: The fan can be driven in manual (use pwm1 to change the speed);
- * 2; The fan is driven automatically depending on the temperature.
-
-NOTE: Be sure to use the manual mode if you want to drive the fan speed manually
-
-NOTE2: When operating in manual mode outside the vbios-defined
-[PWM_min, PWM_max] range, the reported fan speed (RPM) may not be accurate
-depending on your hardware.
-
-Bug reports
----------
-
-Thermal management on Nouveau is new and may not work on all cards. If you have
-inquiries, please ping mupuf on IRC (#nouveau, freenode).
-
-Bug reports should be filled on Freedesktop's bug tracker. Please follow
-http://nouveau.freedesktop.org/wiki/Bugs
diff --git a/Documentation/thermal/nouveau_thermal.rst b/Documentation/thermal/nouveau_thermal.rst
new file mode 100644
index 000000000000..37255fd6735d
--- /dev/null
+++ b/Documentation/thermal/nouveau_thermal.rst
@@ -0,0 +1,96 @@
+=====================
+Kernel driver nouveau
+=====================
+
+Supported chips:
+
+* NV43+
+
+Authors: Martin Peres (mupuf) <martin.peres@free.fr>
+
+Description
+-----------
+
+This driver allows reading the GPU core temperature, driving the GPU fan and
+setting temperature alarms.
+
+Currently, due to the absence of in-kernel API to access HWMON drivers, Nouveau
+cannot access any of the i2c external monitoring chips it may find. If you
+have one of those, temperature and/or fan management through Nouveau's HWMON
+interface is likely not to work. This document may then not cover your situation
+entirely.
+
+Temperature management
+----------------------
+
+Temperature is exposed as a read-only HWMON attribute temp1_input.
+
+In order to protect the GPU from overheating, Nouveau supports 4 configurable
+temperature thresholds:
+
+ * Fan_boost:
+	Fan speed is set to 100% when reaching this temperature;
+ * Downclock:
+	The GPU will be downclocked to reduce its power dissipation;
+ * Critical:
+	The GPU is put on hold to further lower power dissipation;
+ * Shutdown:
+	Shut the computer down to protect your GPU.
+
+WARNING:
+	Some of these thresholds may not be used by Nouveau depending
+	on your chipset.
+
+The default value for these thresholds comes from the GPU's vbios. These
+thresholds can be configured thanks to the following HWMON attributes:
+
+ * Fan_boost: temp1_auto_point1_temp and temp1_auto_point1_temp_hyst;
+ * Downclock: temp1_max and temp1_max_hyst;
+ * Critical: temp1_crit and temp1_crit_hyst;
+ * Shutdown: temp1_emergency and temp1_emergency_hyst.
+
+NOTE: Remember that the values are stored as milli degrees Celsius. Don't forget
+to multiply!
+
+Fan management
+--------------
+
+Not all cards have a drivable fan. If you do, then the following HWMON
+attributes should be available:
+
+ * pwm1_enable:
+	Current fan management mode (NONE, MANUAL or AUTO);
+ * pwm1:
+	Current PWM value (power percentage);
+ * pwm1_min:
+	The minimum PWM speed allowed;
+ * pwm1_max:
+	The maximum PWM speed allowed (bypassed when hitting Fan_boost);
+
+You may also have the following attribute:
+
+ * fan1_input:
+	Speed in RPM of your fan.
+
+Your fan can be driven in different modes:
+
+ * 0: The fan is left untouched;
+ * 1: The fan can be driven in manual (use pwm1 to change the speed);
+ * 2: The fan is driven automatically depending on the temperature.
+
+NOTE:
+  Be sure to use the manual mode if you want to drive the fan speed manually
+
+NOTE2:
+  When operating in manual mode outside the vbios-defined
+  [PWM_min, PWM_max] range, the reported fan speed (RPM) may not be accurate
+  depending on your hardware.
+
+Bug reports
+-----------
+
+Thermal management on Nouveau is new and may not work on all cards. If you have
+inquiries, please ping mupuf on IRC (#nouveau, freenode).
+
+Bug reports should be filed on Freedesktop's bug tracker. Please follow
+http://nouveau.freedesktop.org/wiki/Bugs
diff --git a/Documentation/thermal/power_allocator.rst b/Documentation/thermal/power_allocator.rst
new file mode 100644
index 000000000000..67b6a3297238
--- /dev/null
+++ b/Documentation/thermal/power_allocator.rst
@@ -0,0 +1,271 @@
+=================================
+Power allocator governor tunables
+=================================
+
+Trip points
+-----------
+
+The governor works optimally with the following two passive trip points:
+
+1.  "switch on" trip point: temperature above which the governor
+    control loop starts operating.  This is the first passive trip
+    point of the thermal zone.
+
+2.  "desired temperature" trip point: it should be higher than the
+    "switch on" trip point.  This is the target temperature the governor
+    is controlling for.  This is the last passive trip point of the
+    thermal zone.
+
+PID Controller
+--------------
+
+The power allocator governor implements a
+Proportional-Integral-Derivative controller (PID controller) with
+temperature as the control input and power as the controlled output:
+
+    P_max = k_p * e + k_i * err_integral + k_d * diff_err + sustainable_power
+
+where
+   -  e = desired_temperature - current_temperature
+   -  err_integral is the sum of previous errors
+   -  diff_err = e - previous_error
+
+It is similar to the one depicted below::
+
+				      k_d
+				       |
+  current_temp                         |
+       |                               v
+       |              +----------+   +---+
+       |       +----->| diff_err |-->| X |------+
+       |       |      +----------+   +---+      |
+       |       |                                |      tdp        actor
+       |       |                      k_i       |       |  get_requested_power()
+       |       |                       |        |       |        |     |
+       |       |                       |        |       |        |     | ...
+       v       |                       v        v       v        v     v
+     +---+     |      +-------+      +---+    +---+   +---+   +----------+
+     | S |-----+----->| sum e |----->| X |--->| S |-->| S |-->|power     |
+     +---+     |      +-------+      +---+    +---+   +---+   |allocation|
+       ^       |                                ^             +----------+
+       |       |                                |                |     |
+       |       |        +---+                   |                |     |
+       |       +------->| X |-------------------+                v     v
+       |                +---+                               granted performance
+  desired_temperature     ^
+			  |
+			  |
+		      k_po/k_pu
+
+Sustainable power
+-----------------
+
+An estimate of the sustainable dissipatable power (in mW) should be
+provided while registering the thermal zone.  This estimates the
+sustained power that can be dissipated at the desired control
+temperature.  This is the maximum sustained power for allocation at
+the desired maximum temperature.  The actual sustained power can vary
+for a number of reasons.  The closed loop controller will take care of
+variations such as environmental conditions, and some factors related
+to the speed-grade of the silicon.  `sustainable_power` is therefore
+simply an estimate, and may be tuned to affect the aggressiveness of
+the thermal ramp. For reference, the sustainable power of a 4" phone
+is typically 2000mW, while on a 10" tablet is around 4500mW (may vary
+depending on screen size).
+
+If you are using device tree, do add it as a property of the
+thermal-zone.  For example::
+
+	thermal-zones {
+		soc_thermal {
+			polling-delay = <1000>;
+			polling-delay-passive = <100>;
+			sustainable-power = <2500>;
+			...
+
+Instead, if the thermal zone is registered from the platform code, pass a
+`thermal_zone_params` that has a `sustainable_power`.  If no
+`thermal_zone_params` were being passed, then something like below
+will suffice::
+
+	static const struct thermal_zone_params tz_params = {
+		.sustainable_power = 3500,
+	};
+
+and then pass `tz_params` as the 5th parameter to
+`thermal_zone_device_register()`
+
+k_po and k_pu
+-------------
+
+The implementation of the PID controller in the power allocator
+thermal governor allows the configuration of two proportional term
+constants: `k_po` and `k_pu`.  `k_po` is the proportional term
+constant during temperature overshoot periods (current temperature is
+above "desired temperature" trip point).  Conversely, `k_pu` is the
+proportional term constant during temperature undershoot periods
+(current temperature below "desired temperature" trip point).
+
+These controls are intended as the primary mechanism for configuring
+the permitted thermal "ramp" of the system.  For instance, a lower
+`k_pu` value will provide a slower ramp, at the cost of capping
+available capacity at a low temperature.  On the other hand, a high
+value of `k_pu` will result in the governor granting very high power
+while temperature is low, and may lead to temperature overshooting.
+
+The default value for `k_pu` is::
+
+    2 * sustainable_power / (desired_temperature - switch_on_temp)
+
+This means that at `switch_on_temp` the output of the controller's
+proportional term will be 2 * `sustainable_power`.  The default value
+for `k_po` is::
+
+    sustainable_power / (desired_temperature - switch_on_temp)
+
+Focusing on the proportional and feed forward values of the PID
+controller equation we have::
+
+    P_max = k_p * e + sustainable_power
+
+The proportional term is proportional to the difference between the
+desired temperature and the current one.  When the current temperature
+is the desired one, then the proportional component is zero and
+`P_max` = `sustainable_power`.  That is, the system should operate in
+thermal equilibrium under constant load.  `sustainable_power` is only
+an estimate, which is the reason for closed-loop control such as this.
+
+Expanding `k_pu` we get::
+
+    P_max = 2 * sustainable_power * (T_set - T) / (T_set - T_on) +
+	sustainable_power
+
+where:
+
+    - T_set is the desired temperature
+    - T is the current temperature
+    - T_on is the switch on temperature
+
+When the current temperature is the switch_on temperature, the above
+formula becomes::
+
+    P_max = 2 * sustainable_power * (T_set - T_on) / (T_set - T_on) +
+	sustainable_power = 2 * sustainable_power + sustainable_power =
+	3 * sustainable_power
+
+Therefore, the proportional term alone linearly decreases power from
+3 * `sustainable_power` to `sustainable_power` as the temperature
+rises from the switch on temperature to the desired temperature.
+
+k_i and integral_cutoff
+-----------------------
+
+`k_i` configures the PID loop's integral term constant.  This term
+allows the PID controller to compensate for long term drift and for
+the quantized nature of the output control: cooling devices can't set
+the exact power that the governor requests.  When the temperature
+error is below `integral_cutoff`, errors are accumulated in the
+integral term.  This term is then multiplied by `k_i` and the result
+added to the output of the controller.  Typically `k_i` is set low (1
+or 2) and `integral_cutoff` is 0.
+
+k_d
+---
+
+`k_d` configures the PID loop's derivative term constant.  It's
+recommended to leave it as the default: 0.
+
+Cooling device power API
+========================
+
+Cooling devices controlled by this governor must supply the additional
+"power" API in their `cooling_device_ops`.  It consists of three ops:
+
+1. ::
+
+    int get_requested_power(struct thermal_cooling_device *cdev,
+			    struct thermal_zone_device *tz, u32 *power);
+
+
+@cdev:
+	The `struct thermal_cooling_device` pointer
+@tz:
+	thermal zone in which we are currently operating
+@power:
+	pointer in which to store the calculated power
+
+`get_requested_power()` calculates the power requested by the device
+in milliwatts and stores it in @power.  It should return 0 on
+success, -E* on failure.  This is currently used by the power
+allocator governor to calculate how much power to give to each cooling
+device.
+
+2. ::
+
+	int state2power(struct thermal_cooling_device *cdev, struct
+			thermal_zone_device *tz, unsigned long state,
+			u32 *power);
+
+@cdev:
+	The `struct thermal_cooling_device` pointer
+@tz:
+	thermal zone in which we are currently operating
+@state:
+	A cooling device state
+@power:
+	pointer in which to store the equivalent power
+
+Convert cooling device state @state into power consumption in
+milliwatts and store it in @power.  It should return 0 on success, -E*
+on failure.  This is currently used by thermal core to calculate the
+maximum power that an actor can consume.
+
+3. ::
+
+	int power2state(struct thermal_cooling_device *cdev, u32 power,
+			unsigned long *state);
+
+@cdev:
+	The `struct thermal_cooling_device` pointer
+@power:
+	power in milliwatts
+@state:
+	pointer in which to store the resulting state
+
+Calculate a cooling device state that would make the device consume at
+most @power mW and store it in @state.  It should return 0 on success,
+-E* on failure.  This is currently used by the thermal core to convert
+a given power set by the power allocator governor to a state that the
+cooling device can set.  It is a function because this conversion may
+depend on external factors that may change so this function should return the
+best conversion given "current circumstances".
+
+Cooling device weights
+----------------------
+
+Weights are a mechanism to bias the allocation among cooling
+devices.  They express the relative power efficiency of different
+cooling devices.  Higher weight can be used to express higher power
+efficiency.  Weighting is relative such that if each cooling device
+has a weight of one they are considered equal.  This is particularly
+useful in heterogeneous systems where two cooling devices may perform
+the same kind of compute, but with different efficiency.  For example,
+a system with two different types of processors.
+
+If the thermal zone is registered using
+`thermal_zone_device_register()` (i.e., platform code), then weights
+are passed as part of the thermal zone's `thermal_bind_parameters`.
+If the platform is registered using device tree, then they are passed
+as the `contribution` property of each map in the `cooling-maps` node.
+
+Limitations of the power allocator governor
+===========================================
+
+The power allocator governor's PID controller works best if there is a
+periodic tick.  If you have a driver that calls
+`thermal_zone_device_update()` (or anything that ends up calling the
+governor's `throttle()` function) repetitively, the governor response
+won't be very good.  Note that this is not particular to this
+governor, step-wise will also misbehave if you call its throttle()
+faster than the normal thermal framework tick (due to interrupts for
+example) as it will overreact.
diff --git a/Documentation/thermal/power_allocator.txt b/Documentation/thermal/power_allocator.txt
deleted file mode 100644
index 9fb0ff06dca9..000000000000
--- a/Documentation/thermal/power_allocator.txt
+++ /dev/null
@@ -1,247 +0,0 @@
-Power allocator governor tunables
-=================================
-
-Trip points
------------
-
-The governor works optimally with the following two passive trip points:
-
-1.  "switch on" trip point: temperature above which the governor
-    control loop starts operating.  This is the first passive trip
-    point of the thermal zone.
-
-2.  "desired temperature" trip point: it should be higher than the
-    "switch on" trip point.  This the target temperature the governor
-    is controlling for.  This is the last passive trip point of the
-    thermal zone.
-
-PID Controller
---------------
-
-The power allocator governor implements a
-Proportional-Integral-Derivative controller (PID controller) with
-temperature as the control input and power as the controlled output:
-
-    P_max = k_p * e + k_i * err_integral + k_d * diff_err + sustainable_power
-
-where
-    e = desired_temperature - current_temperature
-    err_integral is the sum of previous errors
-    diff_err = e - previous_error
-
-It is similar to the one depicted below:
-
-                                      k_d
-                                       |
-current_temp                           |
-     |                                 v
-     |                +----------+   +---+
-     |         +----->| diff_err |-->| X |------+
-     |         |      +----------+   +---+      |
-     |         |                                |      tdp        actor
-     |         |                      k_i       |       |  get_requested_power()
-     |         |                       |        |       |        |     |
-     |         |                       |        |       |        |     | ...
-     v         |                       v        v       v        v     v
-   +---+       |      +-------+      +---+    +---+   +---+   +----------+
-   | S |-------+----->| sum e |----->| X |--->| S |-->| S |-->|power     |
-   +---+       |      +-------+      +---+    +---+   +---+   |allocation|
-     ^         |                                ^             +----------+
-     |         |                                |                |     |
-     |         |        +---+                   |                |     |
-     |         +------->| X |-------------------+                v     v
-     |                  +---+                               granted performance
-desired_temperature       ^
-                          |
-                          |
-                      k_po/k_pu
-
-Sustainable power
------------------
-
-An estimate of the sustainable dissipatable power (in mW) should be
-provided while registering the thermal zone.  This estimates the
-sustained power that can be dissipated at the desired control
-temperature.  This is the maximum sustained power for allocation at
-the desired maximum temperature.  The actual sustained power can vary
-for a number of reasons.  The closed loop controller will take care of
-variations such as environmental conditions, and some factors related
-to the speed-grade of the silicon.  `sustainable_power` is therefore
-simply an estimate, and may be tuned to affect the aggressiveness of
-the thermal ramp. For reference, the sustainable power of a 4" phone
-is typically 2000mW, while on a 10" tablet is around 4500mW (may vary
-depending on screen size).
-
-If you are using device tree, do add it as a property of the
-thermal-zone.  For example:
-
-	thermal-zones {
-		soc_thermal {
-			polling-delay = <1000>;
-			polling-delay-passive = <100>;
-			sustainable-power = <2500>;
-			...
-
-Instead, if the thermal zone is registered from the platform code, pass a
-`thermal_zone_params` that has a `sustainable_power`.  If no
-`thermal_zone_params` were being passed, then something like below
-will suffice:
-
-	static const struct thermal_zone_params tz_params = {
-		.sustainable_power = 3500,
-	};
-
-and then pass `tz_params` as the 5th parameter to
-`thermal_zone_device_register()`
-
-k_po and k_pu
--------------
-
-The implementation of the PID controller in the power allocator
-thermal governor allows the configuration of two proportional term
-constants: `k_po` and `k_pu`.  `k_po` is the proportional term
-constant during temperature overshoot periods (current temperature is
-above "desired temperature" trip point).  Conversely, `k_pu` is the
-proportional term constant during temperature undershoot periods
-(current temperature below "desired temperature" trip point).
-
-These controls are intended as the primary mechanism for configuring
-the permitted thermal "ramp" of the system.  For instance, a lower
-`k_pu` value will provide a slower ramp, at the cost of capping
-available capacity at a low temperature.  On the other hand, a high
-value of `k_pu` will result in the governor granting very high power
-while temperature is low, and may lead to temperature overshooting.
-
-The default value for `k_pu` is:
-
-    2 * sustainable_power / (desired_temperature - switch_on_temp)
-
-This means that at `switch_on_temp` the output of the controller's
-proportional term will be 2 * `sustainable_power`.  The default value
-for `k_po` is:
-
-    sustainable_power / (desired_temperature - switch_on_temp)
-
-Focusing on the proportional and feed forward values of the PID
-controller equation we have:
-
-    P_max = k_p * e + sustainable_power
-
-The proportional term is proportional to the difference between the
-desired temperature and the current one.  When the current temperature
-is the desired one, then the proportional component is zero and
-`P_max` = `sustainable_power`.  That is, the system should operate in
-thermal equilibrium under constant load.  `sustainable_power` is only
-an estimate, which is the reason for closed-loop control such as this.
-
-Expanding `k_pu` we get:
-    P_max = 2 * sustainable_power * (T_set - T) / (T_set - T_on) +
-        sustainable_power
-
-where
-    T_set is the desired temperature
-    T is the current temperature
-    T_on is the switch on temperature
-
-When the current temperature is the switch_on temperature, the above
-formula becomes:
-
-    P_max = 2 * sustainable_power * (T_set - T_on) / (T_set - T_on) +
-        sustainable_power = 2 * sustainable_power + sustainable_power =
-        3 * sustainable_power
-
-Therefore, the proportional term alone linearly decreases power from
-3 * `sustainable_power` to `sustainable_power` as the temperature
-rises from the switch on temperature to the desired temperature.
-
-k_i and integral_cutoff
------------------------
-
-`k_i` configures the PID loop's integral term constant.  This term
-allows the PID controller to compensate for long term drift and for
-the quantized nature of the output control: cooling devices can't set
-the exact power that the governor requests.  When the temperature
-error is below `integral_cutoff`, errors are accumulated in the
-integral term.  This term is then multiplied by `k_i` and the result
-added to the output of the controller.  Typically `k_i` is set low (1
-or 2) and `integral_cutoff` is 0.
-
-k_d
----
-
-`k_d` configures the PID loop's derivative term constant.  It's
-recommended to leave it as the default: 0.
-
-Cooling device power API
-========================
-
-Cooling devices controlled by this governor must supply the additional
-"power" API in their `cooling_device_ops`.  It consists on three ops:
-
-1. int get_requested_power(struct thermal_cooling_device *cdev,
-	struct thermal_zone_device *tz, u32 *power);
-@cdev: The `struct thermal_cooling_device` pointer
-@tz: thermal zone in which we are currently operating
-@power: pointer in which to store the calculated power
-
-`get_requested_power()` calculates the power requested by the device
-in milliwatts and stores it in @power .  It should return 0 on
-success, -E* on failure.  This is currently used by the power
-allocator governor to calculate how much power to give to each cooling
-device.
-
-2. int state2power(struct thermal_cooling_device *cdev, struct
-        thermal_zone_device *tz, unsigned long state, u32 *power);
-@cdev: The `struct thermal_cooling_device` pointer
-@tz: thermal zone in which we are currently operating
-@state: A cooling device state
-@power: pointer in which to store the equivalent power
-
-Convert cooling device state @state into power consumption in
-milliwatts and store it in @power.  It should return 0 on success, -E*
-on failure.  This is currently used by thermal core to calculate the
-maximum power that an actor can consume.
-
-3. int power2state(struct thermal_cooling_device *cdev, u32 power,
-	unsigned long *state);
-@cdev: The `struct thermal_cooling_device` pointer
-@power: power in milliwatts
-@state: pointer in which to store the resulting state
-
-Calculate a cooling device state that would make the device consume at
-most @power mW and store it in @state.  It should return 0 on success,
--E* on failure.  This is currently used by the thermal core to convert
-a given power set by the power allocator governor to a state that the
-cooling device can set.  It is a function because this conversion may
-depend on external factors that may change so this function should the
-best conversion given "current circumstances".
-
-Cooling device weights
-----------------------
-
-Weights are a mechanism to bias the allocation among cooling
-devices.  They express the relative power efficiency of different
-cooling devices.  Higher weight can be used to express higher power
-efficiency.  Weighting is relative such that if each cooling device
-has a weight of one they are considered equal.  This is particularly
-useful in heterogeneous systems where two cooling devices may perform
-the same kind of compute, but with different efficiency.  For example,
-a system with two different types of processors.
-
-If the thermal zone is registered using
-`thermal_zone_device_register()` (i.e., platform code), then weights
-are passed as part of the thermal zone's `thermal_bind_parameters`.
-If the platform is registered using device tree, then they are passed
-as the `contribution` property of each map in the `cooling-maps` node.
-
-Limitations of the power allocator governor
-===========================================
-
-The power allocator governor's PID controller works best if there is a
-periodic tick.  If you have a driver that calls
-`thermal_zone_device_update()` (or anything that ends up calling the
-governor's `throttle()` function) repetitively, the governor response
-won't be very good.  Note that this is not particular to this
-governor, step-wise will also misbehave if you call its throttle()
-faster than the normal thermal framework tick (due to interrupts for
-example) as it will overreact.
diff --git a/Documentation/thermal/sysfs-api.rst b/Documentation/thermal/sysfs-api.rst
new file mode 100644
index 000000000000..e4930761d3e5
--- /dev/null
+++ b/Documentation/thermal/sysfs-api.rst
@@ -0,0 +1,798 @@
+===================================
+Generic Thermal Sysfs driver How To
+===================================
+
+Written by Sujith Thomas <sujith.thomas@intel.com>, Zhang Rui <rui.zhang@intel.com>
+
+Updated: 2 January 2008
+
+Copyright (c)  2008 Intel Corporation
+
+
+0. Introduction
+===============
+
+The generic thermal sysfs provides a set of interfaces for thermal zone
+devices (sensors) and thermal cooling devices (fan, processor...) to register
+with the thermal management solution and to be a part of it.
+
+This how-to focuses on enabling new thermal zone and cooling devices to
+participate in thermal management.
+This solution is platform independent and any type of thermal zone devices
+and cooling devices should be able to make use of the infrastructure.
+
+The main task of the thermal sysfs driver is to expose thermal zone attributes
+as well as cooling device attributes to the user space.
+An intelligent thermal management application can make decisions based on
+inputs from thermal zone attributes (the current temperature and trip point
+temperature) and throttle appropriate devices.
+
+- `[0-*]`	denotes any positive number starting from 0
+- `[1-*]`	denotes any positive number starting from 1
+
+1. thermal sysfs driver interface functions
+===========================================
+
+1.1 thermal zone device interface
+---------------------------------
+
+    ::
+
+	struct thermal_zone_device
+	*thermal_zone_device_register(char *type,
+				      int trips, int mask, void *devdata,
+				      struct thermal_zone_device_ops *ops,
+				      const struct thermal_zone_params *tzp,
+				      int passive_delay, int polling_delay)
+
+    This interface function adds a new thermal zone device (sensor) to
+    /sys/class/thermal folder as `thermal_zone[0-*]`. It tries to bind all the
+    thermal cooling devices registered at the same time.
+
+    type:
+	the thermal zone type.
+    trips:
+	the total number of trip points this thermal zone supports.
+    mask:
+	Bit string: If 'n'th bit is set, then trip point 'n' is writeable.
+    devdata:
+	device private data
+    ops:
+	thermal zone device call-backs.
+
+	.bind:
+		bind the thermal zone device with a thermal cooling device.
+	.unbind:
+		unbind the thermal zone device from a thermal cooling device.
+	.get_temp:
+		get the current temperature of the thermal zone.
+	.set_trips:
+		    set the trip points window. Whenever the current temperature
+		    is updated, the trip points immediately below and above the
+		    current temperature are found.
+	.get_mode:
+		   get the current mode (enabled/disabled) of the thermal zone.
+
+			- "enabled" means the kernel thermal management is
+			  enabled.
+			- "disabled" will prevent kernel thermal driver action
+			  upon trip points so that user applications can take
+			  charge of thermal management.
+	.set_mode:
+		set the mode (enabled/disabled) of the thermal zone.
+	.get_trip_type:
+		get the type of certain trip point.
+	.get_trip_temp:
+			get the temperature above which the certain trip point
+			will be fired.
+	.set_emul_temp:
+			set the emulation temperature which helps in debugging
+			different threshold temperature points.
+    tzp:
+	thermal zone platform parameters.
+    passive_delay:
+	number of milliseconds to wait between polls when
+	performing passive cooling.
+    polling_delay:
+	number of milliseconds to wait between polls when checking
+	whether trip points have been crossed (0 for interrupt driven systems).
+
+    ::
+
+	void thermal_zone_device_unregister(struct thermal_zone_device *tz)
+
+    This interface function removes the thermal zone device.
+    It deletes the corresponding entry from /sys/class/thermal folder and
+    unbinds all the thermal cooling devices it uses.
+
+	::
+
+	   struct thermal_zone_device
+	   *thermal_zone_of_sensor_register(struct device *dev, int sensor_id,
+				void *data,
+				const struct thermal_zone_of_device_ops *ops)
+
+	This interface adds a new sensor to a DT thermal zone.
+	This function will search the list of thermal zones described in
+	device tree and look for the zone that refers to the sensor device
+	pointed to by dev->of_node as temperature provider. For the zone
+	pointing to the sensor node, the sensor will be added to the DT
+	thermal zone device.
+
+	The parameters for this interface are:
+
+	dev:
+			Device node of sensor containing valid node pointer in
+			dev->of_node.
+	sensor_id:
+			a sensor identifier, in case the sensor IP has more
+			than one sensor
+	data:
+			a private pointer (owned by the caller) that will be
+			passed back, when a temperature reading is needed.
+	ops:
+			`struct thermal_zone_of_device_ops *`.
+
+			==============  =======================================
+			get_temp	a pointer to a function that reads the
+					sensor temperature. This is mandatory
+					callback provided by sensor driver.
+			set_trips	a pointer to a function that sets a
+					temperature window. When this window is
+					left the driver must inform the thermal
+					core via thermal_zone_device_update.
+			get_trend 	a pointer to a function that reads the
+					sensor temperature trend.
+			set_emul_temp	a pointer to a function that sets
+					sensor emulated temperature.
+			==============  =======================================
+
+	The thermal zone temperature is provided by the get_temp() function
+	pointer of thermal_zone_of_device_ops. When called, it will
+	have the private pointer @data back.
+
+	It returns an error pointer on failure, otherwise a valid thermal zone
+	device handle. The caller should check the returned handle with IS_ERR()
+	to determine whether registration succeeded.
+
+	::
+
+	    void thermal_zone_of_sensor_unregister(struct device *dev,
+						   struct thermal_zone_device *tzd)
+
+	This interface unregisters a sensor from a DT thermal zone which was
+	successfully added by interface thermal_zone_of_sensor_register().
+	This function removes the sensor callbacks and private data from the
+	thermal zone device registered with thermal_zone_of_sensor_register()
+	interface. It will also silence the zone by removing the .get_temp() and
+	get_trend() thermal zone device callbacks.
+
+	::
+
+	  struct thermal_zone_device
+	  *devm_thermal_zone_of_sensor_register(struct device *dev,
+				int sensor_id,
+				void *data,
+				const struct thermal_zone_of_device_ops *ops)
+
+	This interface is the resource-managed version of
+	thermal_zone_of_sensor_register().
+
+	All details of thermal_zone_of_sensor_register() described in
+	section 1.1.3 are applicable here.
+
+	The benefit of using this interface to register a sensor is that it
+	is not required to explicitly call thermal_zone_of_sensor_unregister()
+	in error path or during driver unbinding as this is done by driver
+	resource manager.
+
+	::
+
+		void devm_thermal_zone_of_sensor_unregister(struct device *dev,
+						struct thermal_zone_device *tzd)
+
+	This interface is the resource-managed version of
+	thermal_zone_of_sensor_unregister().
+	All details of thermal_zone_of_sensor_unregister() described in
+	section 1.1.4 are applicable here.
+	Normally this function will not need to be called and the resource
+	management code will ensure that the resource is freed.
+
+	::
+
+		int thermal_zone_get_slope(struct thermal_zone_device *tz)
+
+	This interface is used to read the slope attribute value
+	for the thermal zone device, which might be useful for platform
+	drivers for temperature calculations.
+
+	::
+
+		int thermal_zone_get_offset(struct thermal_zone_device *tz)
+
+	This interface is used to read the offset attribute value
+	for the thermal zone device, which might be useful for platform
+	drivers for temperature calculations.
+
+1.2 thermal cooling device interface
+------------------------------------
+
+
+    ::
+
+	struct thermal_cooling_device
+	*thermal_cooling_device_register(char *name,
+			void *devdata, struct thermal_cooling_device_ops *)
+
+    This interface function adds a new thermal cooling device (fan/processor/...)
+    to /sys/class/thermal/ folder as `cooling_device[0-*]`. It tries to bind itself
+    to all the thermal zone devices registered at the same time.
+
+    name:
+	the cooling device name.
+    devdata:
+	device private data.
+    ops:
+	thermal cooling devices call-backs.
+
+	.get_max_state:
+		get the Maximum throttle state of the cooling device.
+	.get_cur_state:
+		get the Currently requested throttle state of the
+		cooling device.
+	.set_cur_state:
+		set the Current throttle state of the cooling device.
+
+    ::
+
+	void thermal_cooling_device_unregister(struct thermal_cooling_device *cdev)
+
+    This interface function removes the thermal cooling device.
+    It deletes the corresponding entry from /sys/class/thermal folder and
+    unbinds itself from all the thermal zone devices using it.
+
+1.3 interface for binding a thermal zone device with a thermal cooling device
+-----------------------------------------------------------------------------
+
+    ::
+
+	int thermal_zone_bind_cooling_device(struct thermal_zone_device *tz,
+		int trip, struct thermal_cooling_device *cdev,
+		unsigned long upper, unsigned long lower, unsigned int weight);
+
+    This interface function binds a thermal cooling device to a particular trip
+    point of a thermal zone device.
+
+    This function is usually called in the thermal zone device .bind callback.
+
+    tz:
+	  the thermal zone device
+    cdev:
+	  thermal cooling device
+    trip:
+	  indicates which trip point in this thermal zone the cooling device
+	  is associated with.
+    upper:
+	  the Maximum cooling state for this trip point.
+	  THERMAL_NO_LIMIT means no upper limit,
+	  and the cooling device can be in max_state.
+    lower:
+	  the Minimum cooling state can be used for this trip point.
+	  THERMAL_NO_LIMIT means no lower limit,
+	  and the cooling device can be in cooling state 0.
+    weight:
+	  the influence of this cooling device in this thermal
+	  zone.  See 1.4.1 below for more information.
+
+    ::
+
+	int thermal_zone_unbind_cooling_device(struct thermal_zone_device *tz,
+				int trip, struct thermal_cooling_device *cdev);
+
+    This interface function unbinds a thermal cooling device from a particular
+    trip point of a thermal zone device. This function is usually called in
+    the thermal zone device .unbind callback.
+
+    tz:
+	the thermal zone device
+    cdev:
+	thermal cooling device
+    trip:
+	indicates which trip point in this thermal zone the cooling device
+	is associated with.
+
+1.4 Thermal Zone Parameters
+---------------------------
+
+    ::
+
+	struct thermal_bind_params
+
+    This structure defines the following parameters that are used to bind
+    a zone with a cooling device for a particular trip point.
+
+    .cdev:
+	     The cooling device pointer
+    .weight:
+	     The 'influence' of a particular cooling device on this
+	     zone. This is relative to the rest of the cooling
+	     devices. For example, if all cooling devices have a
+	     weight of 1, then they all contribute the same. You can
+	     use percentages if you want, but it's not mandatory. A
+	     weight of 0 means that this cooling device doesn't
+	     contribute to the cooling of this zone unless all cooling
+	     devices have a weight of 0. If all weights are 0, then
+	     they all contribute the same.
+    .trip_mask:
+	       This is a bit mask that gives the binding relation between
+	       this thermal zone and cdev, for a particular trip point.
+	       If nth bit is set, then the cdev and thermal zone are bound
+	       for trip point n.
+    .binding_limits:
+		     This is an array of cooling state limits. Must have
+		     exactly 2 * thermal_zone.number_of_trip_points. It is an
+		     array consisting of tuples <lower-state upper-state> of
+		     state limits. Each trip will be associated with one state
+		     limit tuple when binding. A NULL pointer means
+		     <THERMAL_NO_LIMITS THERMAL_NO_LIMITS> on all trips.
+		     These limits are used when binding a cdev to a trip point.
+    .match:
+	    This call back returns success(0) if the 'tz and cdev' need to
+	    be bound, as per platform data.
+
+    ::
+
+	struct thermal_zone_params
+
+    This structure defines the platform level parameters for a thermal zone.
+    This data, for each thermal zone should come from the platform layer.
+    This is an optional feature where some platforms can choose not to
+    provide this data.
+
+    .governor_name:
+	       Name of the thermal governor used for this zone
+    .no_hwmon:
+	       a boolean to indicate if the thermal to hwmon sysfs interface
+	       is required. when no_hwmon == false, a hwmon sysfs interface
+	       will be created. when no_hwmon == true, nothing will be done.
+	       In case the thermal_zone_params is NULL, the hwmon interface
+	       will be created (for backward compatibility).
+    .num_tbps:
+	       Number of thermal_bind_params entries for this zone
+    .tbp:
+	       thermal_bind_params entries
+
+2. sysfs attributes structure
+=============================
+
+==	================
+RO	read only value
+WO	write only value
+RW	read/write value
+==	================
+
+Thermal sysfs attributes will be represented under /sys/class/thermal.
+Hwmon sysfs I/F extension is also available under /sys/class/hwmon
+if hwmon is compiled in or built as a module.
+
+Thermal zone device sys I/F, created once it's registered::
+
+  /sys/class/thermal/thermal_zone[0-*]:
+    |---type:			Type of the thermal zone
+    |---temp:			Current temperature
+    |---mode:			Working mode of the thermal zone
+    |---policy:			Thermal governor used for this zone
+    |---available_policies:	Available thermal governors for this zone
+    |---trip_point_[0-*]_temp:	Trip point temperature
+    |---trip_point_[0-*]_type:	Trip point type
+    |---trip_point_[0-*]_hyst:	Hysteresis value for this trip point
+    |---emul_temp:		Emulated temperature set node
+    |---sustainable_power:      Sustainable dissipatable power
+    |---k_po:                   Proportional term during temperature overshoot
+    |---k_pu:                   Proportional term during temperature undershoot
+    |---k_i:                    PID's integral term in the power allocator gov
+    |---k_d:                    PID's derivative term in the power allocator
+    |---integral_cutoff:        Offset above which errors are accumulated
+    |---slope:                  Slope constant applied as linear extrapolation
+    |---offset:                 Offset constant applied as linear extrapolation
+
+Thermal cooling device sys I/F, created once it's registered::
+
+  /sys/class/thermal/cooling_device[0-*]:
+    |---type:			Type of the cooling device(processor/fan/...)
+    |---max_state:		Maximum cooling state of the cooling device
+    |---cur_state:		Current cooling state of the cooling device
+    |---stats:			Directory containing cooling device's statistics
+    |---stats/reset:		Writing any value resets the statistics
+    |---stats/time_in_state_ms:	Time (msec) spent in various cooling states
+    |---stats/total_trans:	Total number of times cooling state is changed
+    |---stats/trans_table:	Cooling state transition table
+
+
+The next two dynamic attributes are created/removed in pairs. They represent
+the relationship between a thermal zone and its associated cooling device.
+They are created/removed for each successful execution of
+thermal_zone_bind_cooling_device/thermal_zone_unbind_cooling_device.
+
+::
+
+  /sys/class/thermal/thermal_zone[0-*]:
+    |---cdev[0-*]:		[0-*]th cooling device in current thermal zone
+    |---cdev[0-*]_trip_point:	Trip point that cdev[0-*] is associated with
+    |---cdev[0-*]_weight:       Influence of the cooling device in
+				this thermal zone
+
+Besides the thermal zone device sysfs I/F and cooling device sysfs I/F,
+the generic thermal driver also creates a hwmon sysfs I/F for each _type_
+of thermal zone device. E.g. the generic thermal driver registers one hwmon
+class device and build the associated hwmon sysfs I/F for all the registered
+ACPI thermal zones.
+
+::
+
+  /sys/class/hwmon/hwmon[0-*]:
+    |---name:			The type of the thermal zone devices
+    |---temp[1-*]_input:	The current temperature of thermal zone [1-*]
+    |---temp[1-*]_critical:	The critical trip point of thermal zone [1-*]
+
+Please read Documentation/hwmon/sysfs-interface.rst for additional information.
+
+Thermal zone attributes
+-----------------------
+
+type
+	Strings which represent the thermal zone type.
+	This is given by thermal zone driver as part of registration.
+	E.g: "acpitz" indicates it's an ACPI thermal device.
+	In order to keep it consistent with hwmon sys attribute; this should
+	be a short, lowercase string, not containing spaces nor dashes.
+	RO, Required
+
+temp
+	Current temperature as reported by thermal zone (sensor).
+	Unit: millidegree Celsius
+	RO, Required
+
+mode
+	One of the predefined values in [enabled, disabled].
+	This file gives information about the algorithm that is currently
+	managing the thermal zone. It can be either default kernel based
+	algorithm or user space application.
+
+	enabled
+			  enable Kernel Thermal management.
+	disabled
+			  Preventing kernel thermal zone driver actions upon
+			  trip points so that user application can take full
+			  charge of the thermal management.
+
+	RW, Optional
+
+policy
+	One of the various thermal governors used for a particular zone.
+
+	RW, Required
+
+available_policies
+	Available thermal governors which can be used for a particular zone.
+
+	RO, Required
+
+`trip_point_[0-*]_temp`
+	The temperature above which trip point will be fired.
+
+	Unit: millidegree Celsius
+
+	RO, Optional
+
+`trip_point_[0-*]_type`
+	Strings which indicate the type of the trip point.
+
+	E.g. it can be one of critical, hot, passive, `active[0-*]` for ACPI
+	thermal zone.
+
+	RO, Optional
+
+`trip_point_[0-*]_hyst`
+	The hysteresis value for a trip point, represented as an integer
+	Unit: Celsius
+	RW, Optional
+
+`cdev[0-*]`
+	Sysfs link to the thermal cooling device node where the sys I/F
+	for cooling device throttling control represents.
+
+	RO, Optional
+
+`cdev[0-*]_trip_point`
+	The trip point in this thermal zone which `cdev[0-*]` is associated
+	with; -1 means the cooling device is not associated with any trip
+	point.
+
+	RO, Optional
+
+`cdev[0-*]_weight`
+	The influence of `cdev[0-*]` in this thermal zone. This value
+	is relative to the rest of cooling devices in the thermal
+	zone. For example, if a cooling device has a weight double
+	that of another, it's twice as effective in cooling the
+	thermal zone.
+
+	RW, Optional
+
+passive
+	Attribute is only present for zones in which the passive cooling
+	policy is not supported by native thermal driver. Default is zero
+	and can be set to a temperature (in millidegrees) to enable a
+	passive trip point for the zone. Activation is done by polling with
+	an interval of 1 second.
+
+	Unit: millidegrees Celsius
+
+	Valid values: 0 (disabled) or greater than 1000
+
+	RW, Optional
+
+emul_temp
+	Interface to set the emulated temperature method in thermal zone
+	(sensor). After setting this temperature, the thermal zone may pass
+	this temperature to platform emulation function if registered or
+	cache it locally. This is useful in debugging different temperature
+	threshold and its associated cooling action. This is a write-only node
+	and writing 0 on this node should disable emulation.
+	Unit: millidegree Celsius
+
+	WO, Optional
+
+	  WARNING:
+	    Be careful while enabling this option on production systems,
+	    because userland can easily disable the thermal policy by simply
+	    flooding this sysfs node with low temperature values.
+
+sustainable_power
+	An estimate of the sustained power that can be dissipated by
+	the thermal zone. Used by the power allocator governor. For
+	more information see Documentation/thermal/power_allocator.rst
+
+	Unit: milliwatts
+
+	RW, Optional
+
+k_po
+	The proportional term of the power allocator governor's PID
+	controller during temperature overshoot. Temperature overshoot
+	is when the current temperature is above the "desired
+	temperature" trip point. For more information see
+	Documentation/thermal/power_allocator.rst
+
+	RW, Optional
+
+k_pu
+	The proportional term of the power allocator governor's PID
+	controller during temperature undershoot. Temperature undershoot
+	is when the current temperature is below the "desired
+	temperature" trip point. For more information see
+	Documentation/thermal/power_allocator.rst
+
+	RW, Optional
+
+k_i
+	The integral term of the power allocator governor's PID
+	controller. This term allows the PID controller to compensate
+	for long term drift. For more information see
+	Documentation/thermal/power_allocator.rst
+
+	RW, Optional
+
+k_d
+	The derivative term of the power allocator governor's PID
+	controller. For more information see
+	Documentation/thermal/power_allocator.rst
+
+	RW, Optional
+
+integral_cutoff
+	Temperature offset from the desired temperature trip point
+	above which the integral term of the power allocator
+	governor's PID controller starts accumulating errors. For
+	example, if integral_cutoff is 0, then the integral term only
+	accumulates error when temperature is above the desired
+	temperature trip point. For more information see
+	Documentation/thermal/power_allocator.rst
+
+	Unit: millidegree Celsius
+
+	RW, Optional
+
+slope
+	The slope constant used in a linear extrapolation model
+	to determine a hotspot temperature based off the sensor's
+	raw readings. It is up to the device driver to determine
+	the usage of these values.
+
+	RW, Optional
+
+offset
+	The offset constant used in a linear extrapolation model
+	to determine a hotspot temperature based off the sensor's
+	raw readings. It is up to the device driver to determine
+	the usage of these values.
+
+	RW, Optional
+
+Cooling device attributes
+-------------------------
+
+type
+	String which represents the type of device, e.g:
+
+	- for generic ACPI: should be "Fan", "Processor" or "LCD"
+	- for memory controller device on intel_menlow platform:
+	  should be "Memory controller".
+
+	RO, Required
+
+max_state
+	The maximum permissible cooling state of this cooling device.
+
+	RO, Required
+
+cur_state
+	The current cooling state of this cooling device.
+	The value can be any integer between 0 and max_state:
+
+	- cur_state == 0 means no cooling
+	- cur_state == max_state means the maximum cooling.
+
+	RW, Required
+
+stats/reset
+	Writing any value resets the cooling device's statistics.
+	WO, Required
+
+stats/time_in_state_ms:
+	The amount of time spent by the cooling device in various cooling
+	states. The output will have "<state> <time>" pair in each line, which
+	will mean this cooling device spent <time> msec of time at <state>.
+	Output will have one line for each of the supported states.  usertime
+	units here is 10mS (similar to other time exported in /proc).
+	RO, Required
+
+
+stats/total_trans:
+	A single positive value showing the total number of times the state of a
+	cooling device is changed.
+
+	RO, Required
+
+stats/trans_table:
+	This gives fine grained information about all the cooling state
+	transitions. The cat output here is a two dimensional matrix, where an
+	entry <i,j> (row i, column j) represents the number of transitions from
+	State_i to State_j. If the transition table is bigger than PAGE_SIZE,
+	reading this will return an -EFBIG error.
+	RO, Required
+
+3. A simple implementation
+==========================
+
+ACPI thermal zone may support multiple trip points like critical, hot,
+passive, active. If an ACPI thermal zone supports critical, passive,
+active[0] and active[1] at the same time, it may register itself as a
+thermal_zone_device (thermal_zone1) with 4 trip points in all.
+It has one processor and one fan, which are both registered as
+thermal_cooling_device. Both are considered to have the same
+effectiveness in cooling the thermal zone.
+
+If the processor is listed in _PSL method, and the fan is listed in _AL0
+method, the sys I/F structure will be built like this::
+
+ /sys/class/thermal:
+  |thermal_zone1:
+    |---type:			acpitz
+    |---temp:			37000
+    |---mode:			enabled
+    |---policy:			step_wise
+    |---available_policies:	step_wise fair_share
+    |---trip_point_0_temp:	100000
+    |---trip_point_0_type:	critical
+    |---trip_point_1_temp:	80000
+    |---trip_point_1_type:	passive
+    |---trip_point_2_temp:	70000
+    |---trip_point_2_type:	active0
+    |---trip_point_3_temp:	60000
+    |---trip_point_3_type:	active1
+    |---cdev0:			--->/sys/class/thermal/cooling_device0
+    |---cdev0_trip_point:	1	/* cdev0 can be used for passive */
+    |---cdev0_weight:           1024
+    |---cdev1:			--->/sys/class/thermal/cooling_device3
+    |---cdev1_trip_point:	2	/* cdev1 can be used for active[0]*/
+    |---cdev1_weight:           1024
+
+  |cooling_device0:
+    |---type:			Processor
+    |---max_state:		8
+    |---cur_state:		0
+
+  |cooling_device3:
+    |---type:			Fan
+    |---max_state:		2
+    |---cur_state:		0
+
+ /sys/class/hwmon:
+  |hwmon0:
+    |---name:			acpitz
+    |---temp1_input:		37000
+    |---temp1_crit:		100000
+
+4. Event Notification
+=====================
+
+The framework includes a simple notification mechanism, in the form of a
+netlink event. Netlink socket initialization is done during the _init_
+of the framework. Drivers which intend to use the notification mechanism
+just need to call thermal_generate_netlink_event() with two arguments viz
+(originator, event). The originator is a pointer to struct thermal_zone_device
+from where the event has been originated. An integer which represents the
+thermal zone device will be used in the message to identify the zone. The
+event will be one of:{THERMAL_AUX0, THERMAL_AUX1, THERMAL_CRITICAL,
+THERMAL_DEV_FAULT}. Notification can be sent when the current temperature
+crosses any of the configured thresholds.
+
+5. Export Symbol APIs
+=====================
+
+5.1. get_tz_trend
+-----------------
+
+This function returns the trend of a thermal zone, i.e. the rate of change
+of temperature of the thermal zone. Ideally, the thermal sensor drivers
+are supposed to implement the callback. If they don't, the thermal
+framework calculates the trend by comparing the previous and the current
+temperature values.
+
+5.2. get_thermal_instance
+-------------------------
+
+This function returns the thermal_instance corresponding to a given
+{thermal_zone, cooling_device, trip_point} combination. Returns NULL
+if such an instance does not exist.
+
+5.3. thermal_notify_framework
+-----------------------------
+
+This function handles the trip events from sensor drivers. It starts
+throttling the cooling devices according to the policy configured.
+For CRITICAL and HOT trip points, this notifies the respective drivers,
+and does actual throttling for other trip points i.e. ACTIVE and PASSIVE.
+The throttling policy is based on the configured platform data; if no
+platform data is provided, this uses the step_wise throttling policy.
+
+5.4. thermal_cdev_update
+------------------------
+
+This function serves as an arbitrator to set the state of a cooling
+device. It sets the cooling device to the deepest cooling state if
+possible.
+
+6. thermal_emergency_poweroff
+=============================
+
+On an event of critical trip temperature crossing, the thermal framework
+allows the system to shut down gracefully by calling orderly_poweroff().
+In the event of a failure of orderly_poweroff() to shut down the system
+we are in danger of keeping the system alive at undesirably high
+temperatures. To mitigate this high risk scenario we program a work
+queue to fire after a pre-determined number of seconds to start
+an emergency shutdown of the device using the kernel_power_off()
+function. In case kernel_power_off() fails then finally
+emergency_restart() is called in the worst case.
+
+The delay should be carefully profiled so as to give adequate time for
+orderly_poweroff(). In case of failure of an orderly_poweroff() the
+emergency poweroff kicks in after the delay has elapsed and shuts down
+the system.
+
+If set to 0 emergency poweroff will not be supported. So a carefully
+profiled non-zero positive value is a must for emergency poweroff to be
+triggered.
diff --git a/Documentation/thermal/sysfs-api.txt b/Documentation/thermal/sysfs-api.txt
deleted file mode 100644
index c3fa500df92c..000000000000
--- a/Documentation/thermal/sysfs-api.txt
+++ /dev/null
@@ -1,636 +0,0 @@
-Generic Thermal Sysfs driver How To
-===================================
-
-Written by Sujith Thomas <sujith.thomas@intel.com>, Zhang Rui <rui.zhang@intel.com>
-
-Updated: 2 January 2008
-
-Copyright (c)  2008 Intel Corporation
-
-
-0. Introduction
-
-The generic thermal sysfs provides a set of interfaces for thermal zone
-devices (sensors) and thermal cooling devices (fan, processor...) to register
-with the thermal management solution and to be a part of it.
-
-This how-to focuses on enabling new thermal zone and cooling devices to
-participate in thermal management.
-This solution is platform independent and any type of thermal zone devices
-and cooling devices should be able to make use of the infrastructure.
-
-The main task of the thermal sysfs driver is to expose thermal zone attributes
-as well as cooling device attributes to the user space.
-An intelligent thermal management application can make decisions based on
-inputs from thermal zone attributes (the current temperature and trip point
-temperature) and throttle appropriate devices.
-
-[0-*]	denotes any positive number starting from 0
-[1-*]	denotes any positive number starting from 1
-
-1. thermal sysfs driver interface functions
-
-1.1 thermal zone device interface
-1.1.1 struct thermal_zone_device *thermal_zone_device_register(char *type,
-		int trips, int mask, void *devdata,
-		struct thermal_zone_device_ops *ops,
-		const struct thermal_zone_params *tzp,
-		int passive_delay, int polling_delay))
-
-    This interface function adds a new thermal zone device (sensor) to
-    /sys/class/thermal folder as thermal_zone[0-*]. It tries to bind all the
-    thermal cooling devices registered at the same time.
-
-    type: the thermal zone type.
-    trips: the total number of trip points this thermal zone supports.
-    mask: Bit string: If 'n'th bit is set, then trip point 'n' is writeable.
-    devdata: device private data
-    ops: thermal zone device call-backs.
-	.bind: bind the thermal zone device with a thermal cooling device.
-	.unbind: unbind the thermal zone device with a thermal cooling device.
-	.get_temp: get the current temperature of the thermal zone.
-	.set_trips: set the trip points window. Whenever the current temperature
-		    is updated, the trip points immediately below and above the
-		    current temperature are found.
-	.get_mode: get the current mode (enabled/disabled) of the thermal zone.
-	    - "enabled" means the kernel thermal management is enabled.
-	    - "disabled" will prevent kernel thermal driver action upon trip points
-	      so that user applications can take charge of thermal management.
-	.set_mode: set the mode (enabled/disabled) of the thermal zone.
-	.get_trip_type: get the type of certain trip point.
-	.get_trip_temp: get the temperature above which the certain trip point
-			will be fired.
-	.set_emul_temp: set the emulation temperature which helps in debugging
-			different threshold temperature points.
-    tzp: thermal zone platform parameters.
-    passive_delay: number of milliseconds to wait between polls when
-	performing passive cooling.
-    polling_delay: number of milliseconds to wait between polls when checking
-	whether trip points have been crossed (0 for interrupt driven systems).
-
-
-1.1.2 void thermal_zone_device_unregister(struct thermal_zone_device *tz)
-
-    This interface function removes the thermal zone device.
-    It deletes the corresponding entry from /sys/class/thermal folder and
-    unbinds all the thermal cooling devices it uses.
-
-1.1.3 struct thermal_zone_device *thermal_zone_of_sensor_register(
-		struct device *dev, int sensor_id, void *data,
-		const struct thermal_zone_of_device_ops *ops)
-
-	This interface adds a new sensor to a DT thermal zone.
-	This function will search the list of thermal zones described in
-	device tree and look for the zone that refer to the sensor device
-	pointed by dev->of_node as temperature providers. For the zone
-	pointing to the sensor node, the sensor will be added to the DT
-	thermal zone device.
-
-	The parameters for this interface are:
-	dev:		Device node of sensor containing valid node pointer in
-			dev->of_node.
-	sensor_id:	a sensor identifier, in case the sensor IP has more
-			than one sensors
-	data:		a private pointer (owned by the caller) that will be
-			passed back, when a temperature reading is needed.
-	ops:		struct thermal_zone_of_device_ops *.
-
-			get_temp:	a pointer to a function that reads the
-					sensor temperature. This is mandatory
-					callback provided by sensor driver.
-			set_trips:      a pointer to a function that sets a
-					temperature window. When this window is
-					left the driver must inform the thermal
-					core via thermal_zone_device_update.
-			get_trend: 	a pointer to a function that reads the
-					sensor temperature trend.
-			set_emul_temp:	a pointer to a function that sets
-					sensor emulated temperature.
-	The thermal zone temperature is provided by the get_temp() function
-	pointer of thermal_zone_of_device_ops. When called, it will
-	have the private pointer @data back.
-
-	It returns error pointer if fails otherwise valid thermal zone device
-	handle. Caller should check the return handle with IS_ERR() for finding
-	whether success or not.
-
-1.1.4 void thermal_zone_of_sensor_unregister(struct device *dev,
-		struct thermal_zone_device *tzd)
-
-	This interface unregisters a sensor from a DT thermal zone which was
-	successfully added by interface thermal_zone_of_sensor_register().
-	This function removes the sensor callbacks and private data from the
-	thermal zone device registered with thermal_zone_of_sensor_register()
-	interface. It will also silent the zone by remove the .get_temp() and
-	get_trend() thermal zone device callbacks.
-
-1.1.5 struct thermal_zone_device *devm_thermal_zone_of_sensor_register(
-		struct device *dev, int sensor_id,
-		void *data, const struct thermal_zone_of_device_ops *ops)
-
-	This interface is resource managed version of
-	thermal_zone_of_sensor_register().
-	All details of thermal_zone_of_sensor_register() described in
-	section 1.1.3 is applicable here.
-	The benefit of using this interface to register sensor is that it
-	is not require to explicitly call thermal_zone_of_sensor_unregister()
-	in error path or during driver unbinding as this is done by driver
-	resource manager.
-
-1.1.6 void devm_thermal_zone_of_sensor_unregister(struct device *dev,
-		struct thermal_zone_device *tzd)
-
-	This interface is resource managed version of
-	thermal_zone_of_sensor_unregister().
-	All details of thermal_zone_of_sensor_unregister() described in
-	section 1.1.4 is applicable here.
-	Normally this function will not need to be called and the resource
-	management code will ensure that the resource is freed.
-
-1.1.7 int thermal_zone_get_slope(struct thermal_zone_device *tz)
-
-	This interface is used to read the slope attribute value
-	for the thermal zone device, which might be useful for platform
-	drivers for temperature calculations.
-
-1.1.8 int thermal_zone_get_offset(struct thermal_zone_device *tz)
-
-	This interface is used to read the offset attribute value
-	for the thermal zone device, which might be useful for platform
-	drivers for temperature calculations.
-
-1.2 thermal cooling device interface
-1.2.1 struct thermal_cooling_device *thermal_cooling_device_register(char *name,
-		void *devdata, struct thermal_cooling_device_ops *)
-
-    This interface function adds a new thermal cooling device (fan/processor/...)
-    to /sys/class/thermal/ folder as cooling_device[0-*]. It tries to bind itself
-    to all the thermal zone devices registered at the same time.
-    name: the cooling device name.
-    devdata: device private data.
-    ops: thermal cooling devices call-backs.
-	.get_max_state: get the Maximum throttle state of the cooling device.
-	.get_cur_state: get the Currently requested throttle state of the cooling device.
-	.set_cur_state: set the Current throttle state of the cooling device.
-
-1.2.2 void thermal_cooling_device_unregister(struct thermal_cooling_device *cdev)
-
-    This interface function removes the thermal cooling device.
-    It deletes the corresponding entry from /sys/class/thermal folder and
-    unbinds itself from all the thermal zone devices using it.
-
-1.3 interface for binding a thermal zone device with a thermal cooling device
-1.3.1 int thermal_zone_bind_cooling_device(struct thermal_zone_device *tz,
-	int trip, struct thermal_cooling_device *cdev,
-	unsigned long upper, unsigned long lower, unsigned int weight);
-
-    This interface function binds a thermal cooling device to a particular trip
-    point of a thermal zone device.
-    This function is usually called in the thermal zone device .bind callback.
-    tz: the thermal zone device
-    cdev: thermal cooling device
-    trip: indicates which trip point in this thermal zone the cooling device
-          is associated with.
-    upper:the Maximum cooling state for this trip point.
-          THERMAL_NO_LIMIT means no upper limit,
-	  and the cooling device can be in max_state.
-    lower:the Minimum cooling state can be used for this trip point.
-          THERMAL_NO_LIMIT means no lower limit,
-	  and the cooling device can be in cooling state 0.
-    weight: the influence of this cooling device in this thermal
-            zone.  See 1.4.1 below for more information.
-
-1.3.2 int thermal_zone_unbind_cooling_device(struct thermal_zone_device *tz,
-		int trip, struct thermal_cooling_device *cdev);
-
-    This interface function unbinds a thermal cooling device from a particular
-    trip point of a thermal zone device. This function is usually called in
-    the thermal zone device .unbind callback.
-    tz: the thermal zone device
-    cdev: thermal cooling device
-    trip: indicates which trip point in this thermal zone the cooling device
-          is associated with.
-
-1.4 Thermal Zone Parameters
-1.4.1 struct thermal_bind_params
-    This structure defines the following parameters that are used to bind
-    a zone with a cooling device for a particular trip point.
-    .cdev: The cooling device pointer
-    .weight: The 'influence' of a particular cooling device on this
-             zone. This is relative to the rest of the cooling
-             devices. For example, if all cooling devices have a
-             weight of 1, then they all contribute the same. You can
-             use percentages if you want, but it's not mandatory. A
-             weight of 0 means that this cooling device doesn't
-             contribute to the cooling of this zone unless all cooling
-             devices have a weight of 0. If all weights are 0, then
-             they all contribute the same.
-    .trip_mask:This is a bit mask that gives the binding relation between
-               this thermal zone and cdev, for a particular trip point.
-               If nth bit is set, then the cdev and thermal zone are bound
-               for trip point n.
-    .binding_limits: This is an array of cooling state limits. Must have
-                     exactly 2 * thermal_zone.number_of_trip_points. It is an
-                     array consisting of tuples <lower-state upper-state> of
-                     state limits. Each trip will be associated with one state
-                     limit tuple when binding. A NULL pointer means
-                     <THERMAL_NO_LIMITS THERMAL_NO_LIMITS> on all trips.
-                     These limits are used when binding a cdev to a trip point.
-    .match: This call back returns success(0) if the 'tz and cdev' need to
-	    be bound, as per platform data.
-1.4.2 struct thermal_zone_params
-    This structure defines the platform level parameters for a thermal zone.
-    This data, for each thermal zone should come from the platform layer.
-    This is an optional feature where some platforms can choose not to
-    provide this data.
-    .governor_name: Name of the thermal governor used for this zone
-    .no_hwmon: a boolean to indicate if the thermal to hwmon sysfs interface
-               is required. when no_hwmon == false, a hwmon sysfs interface
-               will be created. when no_hwmon == true, nothing will be done.
-               In case the thermal_zone_params is NULL, the hwmon interface
-               will be created (for backward compatibility).
-    .num_tbps: Number of thermal_bind_params entries for this zone
-    .tbp: thermal_bind_params entries
-
-2. sysfs attributes structure
-
-RO	read only value
-WO	write only value
-RW	read/write value
-
-Thermal sysfs attributes will be represented under /sys/class/thermal.
-Hwmon sysfs I/F extension is also available under /sys/class/hwmon
-if hwmon is compiled in or built as a module.
-
-Thermal zone device sys I/F, created once it's registered:
-/sys/class/thermal/thermal_zone[0-*]:
-    |---type:			Type of the thermal zone
-    |---temp:			Current temperature
-    |---mode:			Working mode of the thermal zone
-    |---policy:			Thermal governor used for this zone
-    |---available_policies:	Available thermal governors for this zone
-    |---trip_point_[0-*]_temp:	Trip point temperature
-    |---trip_point_[0-*]_type:	Trip point type
-    |---trip_point_[0-*]_hyst:	Hysteresis value for this trip point
-    |---emul_temp:		Emulated temperature set node
-    |---sustainable_power:      Sustainable dissipatable power
-    |---k_po:                   Proportional term during temperature overshoot
-    |---k_pu:                   Proportional term during temperature undershoot
-    |---k_i:                    PID's integral term in the power allocator gov
-    |---k_d:                    PID's derivative term in the power allocator
-    |---integral_cutoff:        Offset above which errors are accumulated
-    |---slope:                  Slope constant applied as linear extrapolation
-    |---offset:                 Offset constant applied as linear extrapolation
-
-Thermal cooling device sys I/F, created once it's registered:
-/sys/class/thermal/cooling_device[0-*]:
-    |---type:			Type of the cooling device(processor/fan/...)
-    |---max_state:		Maximum cooling state of the cooling device
-    |---cur_state:		Current cooling state of the cooling device
-    |---stats:			Directory containing cooling device's statistics
-    |---stats/reset:		Writing any value resets the statistics
-    |---stats/time_in_state_ms:	Time (msec) spent in various cooling states
-    |---stats/total_trans:	Total number of times cooling state is changed
-    |---stats/trans_table:	Cooing state transition table
-
-
-Then next two dynamic attributes are created/removed in pairs. They represent
-the relationship between a thermal zone and its associated cooling device.
-They are created/removed for each successful execution of
-thermal_zone_bind_cooling_device/thermal_zone_unbind_cooling_device.
-
-/sys/class/thermal/thermal_zone[0-*]:
-    |---cdev[0-*]:		[0-*]th cooling device in current thermal zone
-    |---cdev[0-*]_trip_point:	Trip point that cdev[0-*] is associated with
-    |---cdev[0-*]_weight:       Influence of the cooling device in
-                                this thermal zone
-
-Besides the thermal zone device sysfs I/F and cooling device sysfs I/F,
-the generic thermal driver also creates a hwmon sysfs I/F for each _type_
-of thermal zone device. E.g. the generic thermal driver registers one hwmon
-class device and build the associated hwmon sysfs I/F for all the registered
-ACPI thermal zones.
-
-/sys/class/hwmon/hwmon[0-*]:
-    |---name:			The type of the thermal zone devices
-    |---temp[1-*]_input:	The current temperature of thermal zone [1-*]
-    |---temp[1-*]_critical:	The critical trip point of thermal zone [1-*]
-
-Please read Documentation/hwmon/sysfs-interface.rst for additional information.
-
-***************************
-* Thermal zone attributes *
-***************************
-
-type
-	Strings which represent the thermal zone type.
-	This is given by thermal zone driver as part of registration.
-	E.g: "acpitz" indicates it's an ACPI thermal device.
-	In order to keep it consistent with hwmon sys attribute; this should
-	be a short, lowercase string, not containing spaces nor dashes.
-	RO, Required
-
-temp
-	Current temperature as reported by thermal zone (sensor).
-	Unit: millidegree Celsius
-	RO, Required
-
-mode
-	One of the predefined values in [enabled, disabled].
-	This file gives information about the algorithm that is currently
-	managing the thermal zone. It can be either default kernel based
-	algorithm or user space application.
-	enabled		= enable Kernel Thermal management.
-	disabled	= Preventing kernel thermal zone driver actions upon
-			  trip points so that user application can take full
-			  charge of the thermal management.
-	RW, Optional
-
-policy
-	One of the various thermal governors used for a particular zone.
-	RW, Required
-
-available_policies
-	Available thermal governors which can be used for a particular zone.
-	RO, Required
-
-trip_point_[0-*]_temp
-	The temperature above which trip point will be fired.
-	Unit: millidegree Celsius
-	RO, Optional
-
-trip_point_[0-*]_type
-	Strings which indicate the type of the trip point.
-	E.g. it can be one of critical, hot, passive, active[0-*] for ACPI
-	thermal zone.
-	RO, Optional
-
-trip_point_[0-*]_hyst
-	The hysteresis value for a trip point, represented as an integer
-	Unit: Celsius
-	RW, Optional
-
-cdev[0-*]
-	Sysfs link to the thermal cooling device node where the sys I/F
-	for cooling device throttling control represents.
-	RO, Optional
-
-cdev[0-*]_trip_point
-	The trip point in this thermal zone which cdev[0-*] is associated
-	with; -1 means the cooling device is not associated with any trip
-	point.
-	RO, Optional
-
-cdev[0-*]_weight
-        The influence of cdev[0-*] in this thermal zone. This value
-        is relative to the rest of cooling devices in the thermal
-        zone. For example, if a cooling device has a weight double
-        than that of other, it's twice as effective in cooling the
-        thermal zone.
-        RW, Optional
-
-passive
-	Attribute is only present for zones in which the passive cooling
-	policy is not supported by native thermal driver. Default is zero
-	and can be set to a temperature (in millidegrees) to enable a
-	passive trip point for the zone. Activation is done by polling with
-	an interval of 1 second.
-	Unit: millidegrees Celsius
-	Valid values: 0 (disabled) or greater than 1000
-	RW, Optional
-
-emul_temp
-	Interface to set the emulated temperature method in thermal zone
-	(sensor). After setting this temperature, the thermal zone may pass
-	this temperature to platform emulation function if registered or
-	cache it locally. This is useful in debugging different temperature
-	threshold and its associated cooling action. This is write only node
-	and writing 0 on this node should disable emulation.
-	Unit: millidegree Celsius
-	WO, Optional
-
-	  WARNING: Be careful while enabling this option on production systems,
-	  because userland can easily disable the thermal policy by simply
-	  flooding this sysfs node with low temperature values.
-
-sustainable_power
-	An estimate of the sustained power that can be dissipated by
-	the thermal zone. Used by the power allocator governor. For
-	more information see Documentation/thermal/power_allocator.txt
-	Unit: milliwatts
-	RW, Optional
-
-k_po
-	The proportional term of the power allocator governor's PID
-	controller during temperature overshoot. Temperature overshoot
-	is when the current temperature is above the "desired
-	temperature" trip point. For more information see
-	Documentation/thermal/power_allocator.txt
-	RW, Optional
-
-k_pu
-	The proportional term of the power allocator governor's PID
-	controller during temperature undershoot. Temperature undershoot
-	is when the current temperature is below the "desired
-	temperature" trip point. For more information see
-	Documentation/thermal/power_allocator.txt
-	RW, Optional
-
-k_i
-	The integral term of the power allocator governor's PID
-	controller. This term allows the PID controller to compensate
-	for long term drift. For more information see
-	Documentation/thermal/power_allocator.txt
-	RW, Optional
-
-k_d
-	The derivative term of the power allocator governor's PID
-	controller. For more information see
-	Documentation/thermal/power_allocator.txt
-	RW, Optional
-
-integral_cutoff
-	Temperature offset from the desired temperature trip point
-	above which the integral term of the power allocator
-	governor's PID controller starts accumulating errors. For
-	example, if integral_cutoff is 0, then the integral term only
-	accumulates error when temperature is above the desired
-	temperature trip point. For more information see
-	Documentation/thermal/power_allocator.txt
-	Unit: millidegree Celsius
-	RW, Optional
-
-slope
-	The slope constant used in a linear extrapolation model
-	to determine a hotspot temperature based off the sensor's
-	raw readings. It is up to the device driver to determine
-	the usage of these values.
-	RW, Optional
-
-offset
-	The offset constant used in a linear extrapolation model
-	to determine a hotspot temperature based off the sensor's
-	raw readings. It is up to the device driver to determine
-	the usage of these values.
-	RW, Optional
-
-*****************************
-* Cooling device attributes *
-*****************************
-
-type
-	String which represents the type of device, e.g:
-	- for generic ACPI: should be "Fan", "Processor" or "LCD"
-	- for memory controller device on intel_menlow platform:
-	  should be "Memory controller".
-	RO, Required
-
-max_state
-	The maximum permissible cooling state of this cooling device.
-	RO, Required
-
-cur_state
-	The current cooling state of this cooling device.
-	The value can any integer numbers between 0 and max_state:
-	- cur_state == 0 means no cooling
-	- cur_state == max_state means the maximum cooling.
-	RW, Required
-
-stats/reset
-	Writing any value resets the cooling device's statistics.
-	WO, Required
-
-stats/time_in_state_ms:
-	The amount of time spent by the cooling device in various cooling
-	states. The output will have "<state> <time>" pair in each line, which
-	will mean this cooling device spent <time> msec of time at <state>.
-	Output will have one line for each of the supported states.  usertime
-	units here is 10mS (similar to other time exported in /proc).
-	RO, Required
-
-stats/total_trans:
-	A single positive value showing the total number of times the state of a
-	cooling device is changed.
-	RO, Required
-
-stats/trans_table:
-	This gives fine grained information about all the cooling state
-	transitions. The cat output here is a two dimensional matrix, where an
-	entry <i,j> (row i, column j) represents the number of transitions from
-	State_i to State_j. If the transition table is bigger than PAGE_SIZE,
-	reading this will return an -EFBIG error.
-	RO, Required
-
-3. A simple implementation
-
-ACPI thermal zone may support multiple trip points like critical, hot,
-passive, active. If an ACPI thermal zone supports critical, passive,
-active[0] and active[1] at the same time, it may register itself as a
-thermal_zone_device (thermal_zone1) with 4 trip points in all.
-It has one processor and one fan, which are both registered as
-thermal_cooling_device. Both are considered to have the same
-effectiveness in cooling the thermal zone.
-
-If the processor is listed in _PSL method, and the fan is listed in _AL0
-method, the sys I/F structure will be built like this:
-
-/sys/class/thermal:
-
-|thermal_zone1:
-    |---type:			acpitz
-    |---temp:			37000
-    |---mode:			enabled
-    |---policy:			step_wise
-    |---available_policies:	step_wise fair_share
-    |---trip_point_0_temp:	100000
-    |---trip_point_0_type:	critical
-    |---trip_point_1_temp:	80000
-    |---trip_point_1_type:	passive
-    |---trip_point_2_temp:	70000
-    |---trip_point_2_type:	active0
-    |---trip_point_3_temp:	60000
-    |---trip_point_3_type:	active1
-    |---cdev0:			--->/sys/class/thermal/cooling_device0
-    |---cdev0_trip_point:	1	/* cdev0 can be used for passive */
-    |---cdev0_weight:           1024
-    |---cdev1:			--->/sys/class/thermal/cooling_device3
-    |---cdev1_trip_point:	2	/* cdev1 can be used for active[0]*/
-    |---cdev1_weight:           1024
-
-|cooling_device0:
-    |---type:			Processor
-    |---max_state:		8
-    |---cur_state:		0
-
-|cooling_device3:
-    |---type:			Fan
-    |---max_state:		2
-    |---cur_state:		0
-
-/sys/class/hwmon:
-
-|hwmon0:
-    |---name:			acpitz
-    |---temp1_input:		37000
-    |---temp1_crit:		100000
-
-4. Event Notification
-
-The framework includes a simple notification mechanism, in the form of a
-netlink event. Netlink socket initialization is done during the _init_
-of the framework. Drivers which intend to use the notification mechanism
-just need to call thermal_generate_netlink_event() with two arguments viz
-(originator, event). The originator is a pointer to struct thermal_zone_device
-from where the event has been originated. An integer which represents the
-thermal zone device will be used in the message to identify the zone. The
-event will be one of:{THERMAL_AUX0, THERMAL_AUX1, THERMAL_CRITICAL,
-THERMAL_DEV_FAULT}. Notification can be sent when the current temperature
-crosses any of the configured thresholds.
-
-5. Export Symbol APIs:
-
-5.1: get_tz_trend:
-This function returns the trend of a thermal zone, i.e the rate of change
-of temperature of the thermal zone. Ideally, the thermal sensor drivers
-are supposed to implement the callback. If they don't, the thermal
-framework calculated the trend by comparing the previous and the current
-temperature values.
-
-5.2:get_thermal_instance:
-This function returns the thermal_instance corresponding to a given
-{thermal_zone, cooling_device, trip_point} combination. Returns NULL
-if such an instance does not exist.
-
-5.3:thermal_notify_framework:
-This function handles the trip events from sensor drivers. It starts
-throttling the cooling devices according to the policy configured.
-For CRITICAL and HOT trip points, this notifies the respective drivers,
-and does actual throttling for other trip points i.e ACTIVE and PASSIVE.
-The throttling policy is based on the configured platform data; if no
-platform data is provided, this uses the step_wise throttling policy.
-
-5.4:thermal_cdev_update:
-This function serves as an arbitrator to set the state of a cooling
-device. It sets the cooling device to the deepest cooling state if
-possible.
-
-6. thermal_emergency_poweroff:
-
-On an event of critical trip temperature crossing. Thermal framework
-allows the system to shutdown gracefully by calling orderly_poweroff().
-In the event of a failure of orderly_poweroff() to shut down the system
-we are in danger of keeping the system alive at undesirably high
-temperatures. To mitigate this high risk scenario we program a work
-queue to fire after a pre-determined number of seconds to start
-an emergency shutdown of the device using the kernel_power_off()
-function. In case kernel_power_off() fails then finally
-emergency_restart() is called in the worst case.
-
-The delay should be carefully profiled so as to give adequate time for
-orderly_poweroff(). In case of failure of an orderly_poweroff() the
-emergency poweroff kicks in after the delay has elapsed and shuts down
-the system.
-
-If set to 0 emergency poweroff will not be supported. So a carefully
-profiled non-zero positive value is a must for emergerncy poweroff to be
-triggered.
diff --git a/Documentation/thermal/x86_pkg_temperature_thermal b/Documentation/thermal/x86_pkg_temperature_thermal
deleted file mode 100644
index 17a3a4c0a0ca..000000000000
--- a/Documentation/thermal/x86_pkg_temperature_thermal
+++ /dev/null
@@ -1,47 +0,0 @@
-Kernel driver: x86_pkg_temp_thermal
-===================
-
-Supported chips:
-* x86: with package level thermal management
-(Verify using: CPUID.06H:EAX[bit 6] =1)
-
-Authors: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
-
-Reference
----
-Intel® 64 and IA-32 Architectures Software Developer’s Manual (Jan, 2013):
-Chapter 14.6: PACKAGE LEVEL THERMAL MANAGEMENT
-
-Description
----------
-
-This driver register CPU digital temperature package level sensor as a thermal
-zone with maximum two user mode configurable trip points. Number of trip points
-depends on the capability of the package. Once the trip point is violated,
-user mode can receive notification via thermal notification mechanism and can
-take any action to control temperature.
-
-
-Threshold management
---------------------
-Each package will register as a thermal zone under /sys/class/thermal.
-Example:
-/sys/class/thermal/thermal_zone1
-
-This contains two trip points:
-- trip_point_0_temp
-- trip_point_1_temp
-
-User can set any temperature between 0 to TJ-Max temperature. Temperature units
-are in milli-degree Celsius. Refer to "Documentation/thermal/sysfs-api.txt" for
-thermal sys-fs details.
-
-Any value other than 0 in these trip points, can trigger thermal notifications.
-Setting 0, stops sending thermal notifications.
-
-Thermal notifications: To get kobject-uevent notifications, set the thermal zone
-policy to "user_space". For example: echo -n "user_space" > policy
-
-
-
-
diff --git a/Documentation/thermal/x86_pkg_temperature_thermal.rst b/Documentation/thermal/x86_pkg_temperature_thermal.rst
new file mode 100644
index 000000000000..f134dbd3f5a9
--- /dev/null
+++ b/Documentation/thermal/x86_pkg_temperature_thermal.rst
@@ -0,0 +1,55 @@
+===================================
+Kernel driver: x86_pkg_temp_thermal
+===================================
+
+Supported chips:
+
+* x86: with package level thermal management
+
+(Verify using: CPUID.06H:EAX[bit 6] =1)
+
+Authors: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
+
+Reference
+---------
+
+Intel® 64 and IA-32 Architectures Software Developer’s Manual (Jan, 2013):
+Chapter 14.6: PACKAGE LEVEL THERMAL MANAGEMENT
+
+Description
+-----------
+
+This driver registers the CPU digital temperature package level sensor as a
+thermal zone with a maximum of two user mode configurable trip points. The
+number of trip points depends on the capability of the package. Once a trip
+point is violated, user mode can receive a notification via the thermal
+notification mechanism and can take any action to control the temperature.
+
+
+Threshold management
+--------------------
+Each package will register as a thermal zone under /sys/class/thermal.
+
+Example::
+
+	/sys/class/thermal/thermal_zone1
+
+This contains two trip points:
+
+- trip_point_0_temp
+- trip_point_1_temp
+
+The user can set any temperature between 0 and the TJ-Max temperature.
+Temperature units are in milli-degree Celsius. Refer to
+"Documentation/thermal/sysfs-api.rst" for thermal sys-fs details.
+
+Any value other than 0 in these trip points can trigger thermal notifications.
+Setting 0 stops sending thermal notifications.
+
+Thermal notifications:
+To get kobject-uevent notifications, set the thermal zone
+policy to "user_space".
+
+For example::
+
+	echo -n "user_space" > policy
diff --git a/MAINTAINERS b/MAINTAINERS
index d0ed735994a5..693f2aebbc83 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -15618,7 +15618,7 @@ M:	Viresh Kumar <viresh.kumar@linaro.org>
 M:	Javi Merino <javi.merino@kernel.org>
 L:	linux-pm@vger.kernel.org
 S:	Supported
-F:	Documentation/thermal/cpu-cooling-api.txt
+F:	Documentation/thermal/cpu-cooling-api.rst
 F:	drivers/thermal/cpu_cooling.c
 F:	include/linux/cpu_cooling.h
 
diff --git a/include/linux/thermal.h b/include/linux/thermal.h
index 15a4ca5d7099..681047f8cc05 100644
--- a/include/linux/thermal.h
+++ b/include/linux/thermal.h
@@ -251,7 +251,7 @@ struct thermal_bind_params {
 	 * platform characterization. This value is relative to the
 	 * rest of the weights so a cooling device whose weight is
 	 * double that of another cooling device is twice as
-	 * effective. See Documentation/thermal/sysfs-api.txt for more
+	 * effective. See Documentation/thermal/sysfs-api.rst for more
 	 * information.
 	 */
 	int weight;
@@ -259,7 +259,7 @@ struct thermal_bind_params {
 	/*
 	 * This is a bit mask that gives the binding relation between this
 	 * thermal zone and cdev, for a particular trip point.
-	 * See Documentation/thermal/sysfs-api.txt for more information.
+	 * See Documentation/thermal/sysfs-api.rst for more information.
 	 */
 	int trip_mask;
 
-- 
cgit v1.2.3


From a5b47a40bed8b19e956872fb55097d676a68f59e Mon Sep 17 00:00:00 2001
From: Damien Le Moal <damien.lemoal@wdc.com>
Date: Thu, 27 Jun 2019 11:59:41 +0900
Subject: block: Remove unused code

bio_flush_dcache_pages() is unused. Remove it.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Damien Le Moal <damien.lemoal@wdc.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/bio.c         | 12 ------------
 include/linux/bio.h | 11 -----------
 2 files changed, 23 deletions(-)

(limited to 'include/linux')

diff --git a/block/bio.c b/block/bio.c
index ad9c3aa9bf7d..bb55b94bb361 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -1760,18 +1760,6 @@ void generic_end_io_acct(struct request_queue *q, int req_op,
 }
 EXPORT_SYMBOL(generic_end_io_acct);
 
-#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE
-void bio_flush_dcache_pages(struct bio *bi)
-{
-	struct bio_vec bvec;
-	struct bvec_iter iter;
-
-	bio_for_each_segment(bvec, bi, iter)
-		flush_dcache_page(bvec.bv_page);
-}
-EXPORT_SYMBOL(bio_flush_dcache_pages);
-#endif
-
 static inline bool bio_remaining_done(struct bio *bio)
 {
 	/*
diff --git a/include/linux/bio.h b/include/linux/bio.h
index ee11c4324751..5a8ae56e09ff 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -443,17 +443,6 @@ void generic_end_io_acct(struct request_queue *q, int op,
 				struct hd_struct *part,
 				unsigned long start_time);
 
-#ifndef ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE
-# error	"You should define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE for your platform"
-#endif
-#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE
-extern void bio_flush_dcache_pages(struct bio *bi);
-#else
-static inline void bio_flush_dcache_pages(struct bio *bi)
-{
-}
-#endif
-
 extern void bio_copy_data_iter(struct bio *dst, struct bvec_iter *dst_iter,
 			       struct bio *src, struct bvec_iter *src_iter);
 extern void bio_copy_data(struct bio *dst, struct bio *src);
-- 
cgit v1.2.3


From 586c1b4125b3c7bf5b482fcafab5d568b8a3c285 Mon Sep 17 00:00:00 2001
From: Tony Xie <tony.xie@rock-chips.com>
Date: Fri, 21 Jun 2019 06:32:54 -0400
Subject: mfd: rk808: Add RK817 and RK809 support

The RK809 and RK817 are Power Management ICs (PMICs) for multimedia
and handheld devices. They contain the following components:
  - Regulators
  - RTC
  - Clocking

Both the RK809 and RK817 chips use a similar register map,
so we can reuse the RTC and Clocking functionality.
Most of the regulators share the same implementation as well.

Signed-off-by: Tony Xie <tony.xie@rock-chips.com>
Acked-by: Stephen Boyd <sboyd@kernel.org>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 drivers/mfd/Kconfig       |   6 +-
 drivers/mfd/rk808.c       | 192 +++++++++++++++++++++++++++++++++++++++++++++-
 include/linux/mfd/rk808.h | 172 +++++++++++++++++++++++++++++++++++++++++
 3 files changed, 364 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mfd/Kconfig b/drivers/mfd/Kconfig
index 294d9567cc71..0b7db542e478 100644
--- a/drivers/mfd/Kconfig
+++ b/drivers/mfd/Kconfig
@@ -1030,14 +1030,14 @@ config MFD_RC5T583
 	  different functionality of the device.
 
 config MFD_RK808
-	tristate "Rockchip RK805/RK808/RK818 Power Management Chip"
+	tristate "Rockchip RK805/RK808/RK809/RK817/RK818 Power Management Chip"
 	depends on I2C && OF
 	select MFD_CORE
 	select REGMAP_I2C
 	select REGMAP_IRQ
 	help
-	  If you say yes here you get support for the RK805, RK808 and RK818
-	  Power Management chips.
+	  If you say yes here you get support for the RK805, RK808, RK809,
+	  RK817 and RK818 Power Management chips.
 	  This driver provides common support for accessing the device
 	  through I2C interface. The device supports multiple sub-devices
 	  including interrupts, RTC, LDO & DCDC regulators, and onkey.
diff --git a/drivers/mfd/rk808.c b/drivers/mfd/rk808.c
index 94377782d208..6ee1c461a3bb 100644
--- a/drivers/mfd/rk808.c
+++ b/drivers/mfd/rk808.c
@@ -27,6 +27,7 @@
 #include <linux/module.h>
 #include <linux/of_device.h>
 #include <linux/regmap.h>
+#include <linux/syscore_ops.h>
 
 struct rk808_reg_data {
 	int addr;
@@ -62,6 +63,27 @@ static bool rk808_is_volatile_reg(struct device *dev, unsigned int reg)
 	return false;
 }
 
+static bool rk817_is_volatile_reg(struct device *dev, unsigned int reg)
+{
+	/*
+	 * Only the registers listed below are reported as volatile.
+	 * Notes:
+	 * - Technically the ROUND_30s bit makes RTC_CTRL_REG volatile, but
+	 *   we don't use that feature.  It's better to cache.
+	 */
+	switch (reg) {
+	case RK817_SECONDS_REG ... RK817_WEEKS_REG:
+	case RK817_RTC_STATUS_REG:
+	case RK817_INT_STS_REG0:
+	case RK817_INT_STS_REG1:
+	case RK817_INT_STS_REG2:
+	case RK817_SYS_STS:
+		return true;
+	}
+
+	return false;
+}
+
 static const struct regmap_config rk818_regmap_config = {
 	.reg_bits = 8,
 	.val_bits = 8,
@@ -86,6 +108,14 @@ static const struct regmap_config rk808_regmap_config = {
 	.volatile_reg = rk808_is_volatile_reg,
 };
 
+static const struct regmap_config rk817_regmap_config = {
+	.reg_bits = 8,
+	.val_bits = 8,
+	.max_register = RK817_GPIO_INT_CFG,
+	.cache_type = REGCACHE_NONE,
+	.volatile_reg = rk817_is_volatile_reg,
+};
+
 static struct resource rtc_resources[] = {
 	{
 		.start  = RK808_IRQ_RTC_ALARM,
@@ -94,6 +124,10 @@ static struct resource rtc_resources[] = {
 	}
 };
 
+static struct resource rk817_rtc_resources[] = {
+	DEFINE_RES_IRQ(RK817_IRQ_RTC_ALARM),
+};
+
 static struct resource rk805_key_resources[] = {
 	{
 		.start  = RK805_IRQ_PWRON_FALL,
@@ -107,6 +141,11 @@ static struct resource rk805_key_resources[] = {
 	}
 };
 
+static struct resource rk817_pwrkey_resources[] = {
+	DEFINE_RES_IRQ(RK817_IRQ_PWRON_RISE),
+	DEFINE_RES_IRQ(RK817_IRQ_PWRON_FALL),
+};
+
 static const struct mfd_cell rk805s[] = {
 	{ .name = "rk808-clkout", },
 	{ .name = "rk808-regulator", },
@@ -132,6 +171,21 @@ static const struct mfd_cell rk808s[] = {
 	},
 };
 
+static const struct mfd_cell rk817s[] = {
+	{ .name = "rk808-clkout",},
+	{ .name = "rk808-regulator",},
+	{
+		.name = "rk8xx-pwrkey",
+		.num_resources = ARRAY_SIZE(rk817_pwrkey_resources),
+		.resources = &rk817_pwrkey_resources[0],
+	},
+	{
+		.name = "rk808-rtc",
+		.num_resources = ARRAY_SIZE(rk817_rtc_resources),
+		.resources = &rk817_rtc_resources[0],
+	},
+};
+
 static const struct mfd_cell rk818s[] = {
 	{ .name = "rk808-clkout", },
 	{ .name = "rk808-regulator", },
@@ -167,6 +221,13 @@ static const struct rk808_reg_data rk808_pre_init_reg[] = {
 						    VB_LO_SEL_3500MV },
 };
 
+static const struct rk808_reg_data rk817_pre_init_reg[] = {
+	{RK817_RTC_CTRL_REG, RTC_STOP, RTC_STOP},
+	{RK817_GPIO_INT_CFG, RK817_INT_POL_MSK, RK817_INT_POL_H},
+	{RK817_SYS_CFG(1), RK817_HOTDIE_TEMP_MSK | RK817_TSD_TEMP_MSK,
+					   RK817_HOTDIE_105 | RK817_TSD_140},
+};
+
 static const struct rk808_reg_data rk818_pre_init_reg[] = {
 	/* improve efficiency */
 	{ RK818_BUCK2_CONFIG_REG, BUCK2_RATE_MASK,  BUCK_ILMIN_250MA },
@@ -332,6 +393,33 @@ static const struct regmap_irq rk818_irqs[] = {
 	},
 };
 
+static const struct regmap_irq rk817_irqs[RK817_IRQ_END] = {
+	REGMAP_IRQ_REG_LINE(0, 8),
+	REGMAP_IRQ_REG_LINE(1, 8),
+	REGMAP_IRQ_REG_LINE(2, 8),
+	REGMAP_IRQ_REG_LINE(3, 8),
+	REGMAP_IRQ_REG_LINE(4, 8),
+	REGMAP_IRQ_REG_LINE(5, 8),
+	REGMAP_IRQ_REG_LINE(6, 8),
+	REGMAP_IRQ_REG_LINE(7, 8),
+	REGMAP_IRQ_REG_LINE(8, 8),
+	REGMAP_IRQ_REG_LINE(9, 8),
+	REGMAP_IRQ_REG_LINE(10, 8),
+	REGMAP_IRQ_REG_LINE(11, 8),
+	REGMAP_IRQ_REG_LINE(12, 8),
+	REGMAP_IRQ_REG_LINE(13, 8),
+	REGMAP_IRQ_REG_LINE(14, 8),
+	REGMAP_IRQ_REG_LINE(15, 8),
+	REGMAP_IRQ_REG_LINE(16, 8),
+	REGMAP_IRQ_REG_LINE(17, 8),
+	REGMAP_IRQ_REG_LINE(18, 8),
+	REGMAP_IRQ_REG_LINE(19, 8),
+	REGMAP_IRQ_REG_LINE(20, 8),
+	REGMAP_IRQ_REG_LINE(21, 8),
+	REGMAP_IRQ_REG_LINE(22, 8),
+	REGMAP_IRQ_REG_LINE(23, 8)
+};
+
 static struct regmap_irq_chip rk805_irq_chip = {
 	.name = "rk805",
 	.irqs = rk805_irqs,
@@ -355,6 +443,18 @@ static const struct regmap_irq_chip rk808_irq_chip = {
 	.init_ack_masked = true,
 };
 
+static struct regmap_irq_chip rk817_irq_chip = {
+	.name = "rk817",
+	.irqs = rk817_irqs,
+	.num_irqs = ARRAY_SIZE(rk817_irqs),
+	.num_regs = 3,
+	.irq_reg_stride = 2,
+	.status_base = RK817_INT_STS_REG0,
+	.mask_base = RK817_INT_STS_MSK_REG0,
+	.ack_base = RK817_INT_STS_REG0,
+	.init_ack_masked = true,
+};
+
 static const struct regmap_irq_chip rk818_irq_chip = {
 	.name = "rk818",
 	.irqs = rk818_irqs,
@@ -423,9 +523,33 @@ static void rk818_device_shutdown(void)
 		dev_err(&rk808_i2c_client->dev, "power off error!\n");
 }
 
+static void rk8xx_syscore_shutdown(void)
+{
+	struct rk808 *rk808 = i2c_get_clientdata(rk808_i2c_client);
+	int ret;
+
+	if (system_state == SYSTEM_POWER_OFF &&
+	    (rk808->variant == RK809_ID || rk808->variant == RK817_ID)) {
+		ret = regmap_update_bits(rk808->regmap,
+					 RK817_SYS_CFG(3),
+					 RK817_SLPPIN_FUNC_MSK,
+					 SLPPIN_DN_FUN);
+		if (ret) {
+			dev_warn(&rk808_i2c_client->dev,
+				 "Cannot switch to power down function\n");
+		}
+	}
+}
+
+static struct syscore_ops rk808_syscore_ops = {
+	.shutdown = rk8xx_syscore_shutdown,
+};
+
 static const struct of_device_id rk808_of_match[] = {
 	{ .compatible = "rockchip,rk805" },
 	{ .compatible = "rockchip,rk808" },
+	{ .compatible = "rockchip,rk809" },
+	{ .compatible = "rockchip,rk817" },
 	{ .compatible = "rockchip,rk818" },
 	{ },
 };
@@ -438,10 +562,11 @@ static int rk808_probe(struct i2c_client *client,
 	struct rk808 *rk808;
 	const struct rk808_reg_data *pre_init_reg;
 	const struct mfd_cell *cells;
-	void (*pm_pwroff_fn)(void);
+	void (*pm_pwroff_fn)(void) = NULL;
 	int nr_pre_init_regs;
 	int nr_cells;
 	int pm_off = 0, msb, lsb;
+	unsigned char pmic_id_msb, pmic_id_lsb;
 	int ret;
 	int i;
 
@@ -449,15 +574,24 @@ static int rk808_probe(struct i2c_client *client,
 	if (!rk808)
 		return -ENOMEM;
 
+	if (of_device_is_compatible(np, "rockchip,rk817") ||
+	    of_device_is_compatible(np, "rockchip,rk809")) {
+		pmic_id_msb = RK817_ID_MSB;
+		pmic_id_lsb = RK817_ID_LSB;
+	} else {
+		pmic_id_msb = RK808_ID_MSB;
+		pmic_id_lsb = RK808_ID_LSB;
+	}
+
 	/* Read chip variant */
-	msb = i2c_smbus_read_byte_data(client, RK808_ID_MSB);
+	msb = i2c_smbus_read_byte_data(client, pmic_id_msb);
 	if (msb < 0) {
 		dev_err(&client->dev, "failed to read the chip id at 0x%x\n",
 			RK808_ID_MSB);
 		return msb;
 	}
 
-	lsb = i2c_smbus_read_byte_data(client, RK808_ID_LSB);
+	lsb = i2c_smbus_read_byte_data(client, pmic_id_lsb);
 	if (lsb < 0) {
 		dev_err(&client->dev, "failed to read the chip id at 0x%x\n",
 			RK808_ID_LSB);
@@ -495,6 +629,16 @@ static int rk808_probe(struct i2c_client *client,
 		nr_cells = ARRAY_SIZE(rk818s);
 		pm_pwroff_fn = rk818_device_shutdown;
 		break;
+	case RK809_ID:
+	case RK817_ID:
+		rk808->regmap_cfg = &rk817_regmap_config;
+		rk808->regmap_irq_chip = &rk817_irq_chip;
+		pre_init_reg = rk817_pre_init_reg;
+		nr_pre_init_regs = ARRAY_SIZE(rk817_pre_init_reg);
+		cells = rk817s;
+		nr_cells = ARRAY_SIZE(rk817s);
+		register_syscore_ops(&rk808_syscore_ops);
+		break;
 	default:
 		dev_err(&client->dev, "Unsupported RK8XX ID %lu\n",
 			rk808->variant);
@@ -568,10 +712,52 @@ static int rk808_remove(struct i2c_client *client)
 	return 0;
 }
 
+/* System suspend: RK809/RK817 select SLPPIN_SLP_FUN for the sleep pin. */
+static int __maybe_unused rk8xx_suspend(struct device *dev)
+{
+	struct rk808 *rk808 = i2c_get_clientdata(to_i2c_client(dev));
+	int ret = 0;
+
+	switch (rk808->variant) {
+	case RK809_ID:
+	case RK817_ID:
+		ret = regmap_update_bits(rk808->regmap, RK817_SYS_CFG(3),
+					 RK817_SLPPIN_FUNC_MSK,
+					 SLPPIN_SLP_FUN);
+		break;
+	default:
+		break;
+	}
+
+	return ret;
+}
+
+/* System resume: RK809/RK817 select SLPPIN_NULL_FUN for the sleep pin. */
+static int __maybe_unused rk8xx_resume(struct device *dev)
+{
+	struct rk808 *rk808 = i2c_get_clientdata(to_i2c_client(dev));
+	int ret = 0;
+
+	switch (rk808->variant) {
+	case RK809_ID:
+	case RK817_ID:
+		ret = regmap_update_bits(rk808->regmap, RK817_SYS_CFG(3),
+					 RK817_SLPPIN_FUNC_MSK,
+					 SLPPIN_NULL_FUN);
+		break;
+	default:
+		break;
+	}
+
+	return ret;
+}
+static SIMPLE_DEV_PM_OPS(rk8xx_pm_ops, rk8xx_suspend, rk8xx_resume);
+
 static struct i2c_driver rk808_i2c_driver = {
 	.driver = {
 		.name = "rk808",
 		.of_match_table = rk808_of_match,
+		.pm = &rk8xx_pm_ops,
 	},
 	.probe    = rk808_probe,
 	.remove   = rk808_remove,
diff --git a/include/linux/mfd/rk808.h b/include/linux/mfd/rk808.h
index d3156594674c..0fd9eedf3c20 100644
--- a/include/linux/mfd/rk808.h
+++ b/include/linux/mfd/rk808.h
@@ -382,6 +382,7 @@ enum rk805_reg {
 #define SWITCH1_EN	BIT(5)
 #define DEV_OFF_RST	BIT(3)
 #define DEV_OFF		BIT(0)
+#define RTC_STOP	BIT(0)
 
 #define VB_LO_ACT		BIT(4)
 #define VB_LO_SEL_3500MV	(7 << 0)
@@ -396,6 +397,175 @@ enum rk805_reg {
 #define SLEEP_FUN			(0x1 << 2)
 #define RK8XX_ID_MSK			0xfff0
 #define FPWM_MODE			BIT(7)
+enum rk817_reg_id {
+	RK817_ID_DCDC1 = 0,
+	RK817_ID_DCDC2,
+	RK817_ID_DCDC3,
+	RK817_ID_DCDC4,
+	RK817_ID_LDO1,
+	RK817_ID_LDO2,
+	RK817_ID_LDO3,
+	RK817_ID_LDO4,
+	RK817_ID_LDO5,
+	RK817_ID_LDO6,
+	RK817_ID_LDO7,
+	RK817_ID_LDO8,
+	RK817_ID_LDO9,
+	RK817_ID_BOOST,
+	RK817_ID_BOOST_OTG_SW,
+	RK817_NUM_REGULATORS
+};
+
+enum rk809_reg_id {
+	RK809_ID_DCDC5 = RK817_ID_BOOST,
+	RK809_ID_SW1,
+	RK809_ID_SW2,
+	RK809_NUM_REGULATORS
+};
+
+#define RK817_SECONDS_REG		0x00
+#define RK817_MINUTES_REG		0x01
+#define RK817_HOURS_REG			0x02
+#define RK817_DAYS_REG			0x03
+#define RK817_MONTHS_REG		0x04
+#define RK817_YEARS_REG			0x05
+#define RK817_WEEKS_REG			0x06
+#define RK817_ALARM_SECONDS_REG		0x07
+#define RK817_ALARM_MINUTES_REG		0x08
+#define RK817_ALARM_HOURS_REG		0x09
+#define RK817_ALARM_DAYS_REG		0x0a
+#define RK817_ALARM_MONTHS_REG		0x0b
+#define RK817_ALARM_YEARS_REG		0x0c
+#define RK817_RTC_CTRL_REG		0xd
+#define RK817_RTC_STATUS_REG		0xe
+#define RK817_RTC_INT_REG		0xf
+#define RK817_RTC_COMP_LSB_REG		0x10
+#define RK817_RTC_COMP_MSB_REG		0x11
+
+#define RK817_POWER_EN_REG(i)		(0xb1 + (i))
+#define RK817_POWER_SLP_EN_REG(i)	(0xb5 + (i))
+
+#define RK817_POWER_CONFIG		(0xb9)
+
+#define RK817_BUCK_CONFIG_REG(i)	(0xba + (i) * 3)
+
+#define RK817_BUCK1_ON_VSEL_REG		0xBB
+#define RK817_BUCK1_SLP_VSEL_REG	0xBC
+
+#define RK817_BUCK2_CONFIG_REG		0xBD
+#define RK817_BUCK2_ON_VSEL_REG		0xBE
+#define RK817_BUCK2_SLP_VSEL_REG	0xBF
+
+#define RK817_BUCK3_CONFIG_REG		0xC0
+#define RK817_BUCK3_ON_VSEL_REG		0xC1
+#define RK817_BUCK3_SLP_VSEL_REG	0xC2
+
+#define RK817_BUCK4_CONFIG_REG		0xC3
+#define RK817_BUCK4_ON_VSEL_REG		0xC4
+#define RK817_BUCK4_SLP_VSEL_REG	0xC5
+
+#define RK817_LDO_ON_VSEL_REG(idx)	(0xcc + (idx) * 2)
+#define RK817_BOOST_OTG_CFG		(0xde)
+
+#define RK817_ID_MSB			0xed
+#define RK817_ID_LSB			0xee
+
+#define RK817_SYS_STS			0xf0
+#define RK817_SYS_CFG(i)		(0xf1 + (i))
+
+#define RK817_ON_SOURCE_REG		0xf5
+#define RK817_OFF_SOURCE_REG		0xf6
+
+/* INTERRUPT REGISTER */
+#define RK817_INT_STS_REG0		0xf8
+#define RK817_INT_STS_MSK_REG0		0xf9
+#define RK817_INT_STS_REG1		0xfa
+#define RK817_INT_STS_MSK_REG1		0xfb
+#define RK817_INT_STS_REG2		0xfc
+#define RK817_INT_STS_MSK_REG2		0xfd
+#define RK817_GPIO_INT_CFG		0xfe
+
+/* IRQ Definitions */
+#define RK817_IRQ_PWRON_FALL		0
+#define RK817_IRQ_PWRON_RISE		1
+#define RK817_IRQ_PWRON			2
+#define RK817_IRQ_PWMON_LP		3
+#define RK817_IRQ_HOTDIE		4
+#define RK817_IRQ_RTC_ALARM		5
+#define RK817_IRQ_RTC_PERIOD		6
+#define RK817_IRQ_VB_LO			7
+#define RK817_IRQ_PLUG_IN		8
+#define RK817_IRQ_PLUG_OUT		9
+#define RK817_IRQ_CHRG_TERM		10
+#define RK817_IRQ_CHRG_TIME		11
+#define RK817_IRQ_CHRG_TS		12
+#define RK817_IRQ_USB_OV		13
+#define RK817_IRQ_CHRG_IN_CLMP		14
+#define RK817_IRQ_BAT_DIS_ILIM		15
+#define RK817_IRQ_GATE_GPIO		16
+#define RK817_IRQ_TS_GPIO		17
+#define RK817_IRQ_CODEC_PD		18
+#define RK817_IRQ_CODEC_PO		19
+#define RK817_IRQ_CLASSD_MUTE_DONE	20
+#define RK817_IRQ_CLASSD_OCP		21
+#define RK817_IRQ_BAT_OVP               22
+#define RK817_IRQ_CHRG_BAT_HI		23
+#define RK817_IRQ_END			(RK817_IRQ_CHRG_BAT_HI + 1)
+
+/*
+ * rtc_ctrl 0xd
+ * same as 808, except bit4
+ */
+#define RK817_RTC_CTRL_RSV4		BIT(4)
+
+/* power config 0xb9 */
+#define RK817_BUCK3_FB_RES_MSK		BIT(6)
+#define RK817_BUCK3_FB_RES_INTER	BIT(6)
+#define RK817_BUCK3_FB_RES_EXT		0
+
+/* buck config 0xba */
+#define RK817_RAMP_RATE_OFFSET		6
+#define RK817_RAMP_RATE_MASK		(0x3 << RK817_RAMP_RATE_OFFSET)
+#define RK817_RAMP_RATE_3MV_PER_US	(0x0 << RK817_RAMP_RATE_OFFSET)
+#define RK817_RAMP_RATE_6_3MV_PER_US	(0x1 << RK817_RAMP_RATE_OFFSET)
+#define RK817_RAMP_RATE_12_5MV_PER_US	(0x2 << RK817_RAMP_RATE_OFFSET)
+#define RK817_RAMP_RATE_25MV_PER_US	(0x3 << RK817_RAMP_RATE_OFFSET)
+
+/* sys_cfg1 0xf2 */
+#define RK817_HOTDIE_TEMP_MSK		(0x3 << 4)
+#define RK817_HOTDIE_85			(0x0 << 4)
+#define RK817_HOTDIE_95			(0x1 << 4)
+#define RK817_HOTDIE_105		(0x2 << 4)
+#define RK817_HOTDIE_115		(0x3 << 4)
+
+#define RK817_TSD_TEMP_MSK		BIT(6)
+#define RK817_TSD_140			0
+#define RK817_TSD_160			BIT(6)
+
+#define RK817_CLK32KOUT2_EN		BIT(7)
+
+/* sys_cfg3 0xf4 */
+#define RK817_SLPPIN_FUNC_MSK		(0x3 << 3)
+#define SLPPIN_NULL_FUN			(0x0 << 3)
+#define SLPPIN_SLP_FUN			(0x1 << 3)
+#define SLPPIN_DN_FUN			(0x2 << 3)
+#define SLPPIN_RST_FUN			(0x3 << 3)
+
+#define RK817_RST_FUNC_MSK		(0x3 << 6)
+#define RK817_RST_FUNC_SFT		(6)
+#define RK817_RST_FUNC_CNT		(3)
+#define RK817_RST_FUNC_DEV		(0) /* reset the dev */
+#define RK817_RST_FUNC_REG		(0x1 << 6) /* reset the reg only */
+
+#define RK817_SLPPOL_MSK		BIT(5)
+#define RK817_SLPPOL_H			BIT(5)
+#define RK817_SLPPOL_L			(0)
+
+/* gpio&int 0xfe */
+#define RK817_INT_POL_MSK		BIT(1)
+#define RK817_INT_POL_H			BIT(1)
+#define RK817_INT_POL_L			0
+#define RK809_BUCK5_CONFIG(i)		(RK817_BOOST_OTG_CFG + (i) * 1)
 
 enum {
 	BUCK_ILMIN_50MA,
@@ -443,6 +613,8 @@ enum {
 enum {
 	RK805_ID = 0x8050,
 	RK808_ID = 0x0000,
+	RK809_ID = 0x8090,
+	RK817_ID = 0x8170,
 	RK818_ID = 0x8181,
 };
 
-- 
cgit v1.2.3


From e444f6d68c07bc01a3a3d5905409dbe1ca391d26 Mon Sep 17 00:00:00 2001
From: Heiko Stuebner <heiko@sntech.de>
Date: Wed, 26 Jun 2019 14:29:18 +0200
Subject: regulator: rk808: Add RK809 and RK817 support.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

    Add support for the RK809 and RK817 to the regulator driver.
    Their specifications are as follows:
    1. The RK809 and RK817 consist of 5 DCDCs, 9 LDOs
       and have the same registers for these components except dcdc5.
    2. The dcdc5 is a boost dcdc for RK817 and is a buck for RK809.
    3. The RK817 has one switch but the RK809 has two.

    The output voltages are configurable and are meant to supply power
    to the main processor and other components.

Signed-off-by: Tony Xie <tony.xie@rock-chips.com>
Acked-by: Mark Brown <broonie@kernel.org>
[rebased on top of 5.2-rc1]
Signed-off-by: Heiko Stuebner <heiko@sntech.de>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 drivers/regulator/Kconfig           |   4 +-
 drivers/regulator/rk808-regulator.c | 646 ++++++++++++++++++++++++++++++++++--
 include/linux/mfd/rk808.h           |   3 +
 3 files changed, 625 insertions(+), 28 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/regulator/Kconfig b/drivers/regulator/Kconfig
index 6c37f0df9323..214a958ff3e5 100644
--- a/drivers/regulator/Kconfig
+++ b/drivers/regulator/Kconfig
@@ -762,11 +762,11 @@ config REGULATOR_RC5T583
 	  outputs which can be controlled by i2c communication.
 
 config REGULATOR_RK808
-	tristate "Rockchip RK805/RK808/RK818 Power regulators"
+	tristate "Rockchip RK805/RK808/RK809/RK817/RK818 Power regulators"
 	depends on MFD_RK808
 	help
 	  Select this option to enable the power regulator of ROCKCHIP
-	  PMIC RK805,RK808 and RK818.
+	  PMIC RK805, RK808, RK809, RK817 and RK818.
 	  This driver supports the control of different power rails of device
 	  through regulator interface. The device supports multiple DCDC/LDO
 	  outputs which can be controlled by i2c communication.
diff --git a/drivers/regulator/rk808-regulator.c b/drivers/regulator/rk808-regulator.c
index 23713e16c286..e9b0bb996fc4 100644
--- a/drivers/regulator/rk808-regulator.c
+++ b/drivers/regulator/rk808-regulator.c
@@ -36,6 +36,12 @@
 #define RK808_BUCK4_VSEL_MASK	0xf
 #define RK808_LDO_VSEL_MASK	0x1f
 
+#define RK809_BUCK5_VSEL_MASK		0x7
+
+#define RK817_LDO_VSEL_MASK		0x7f
+#define RK817_BOOST_VSEL_MASK		0x7
+#define RK817_BUCK_VSEL_MASK		0x7f
+
 #define RK818_BUCK_VSEL_MASK		0x3f
 #define RK818_BUCK4_VSEL_MASK		0x1f
 #define RK818_LDO_VSEL_MASK		0x1f
@@ -65,30 +71,36 @@
 /* max steps for increase voltage of Buck1/2, equal 100mv*/
 #define MAX_STEPS_ONE_TIME 8
 
-#define RK805_DESC(_id, _match, _supply, _min, _max, _step, _vreg,      \
-	_vmask, _ereg, _emask, _etime)                                  \
-	[_id] = {                                                       \
-		.name           = (_match),                             \
-		.supply_name    = (_supply),                            \
-		.of_match       = of_match_ptr(_match),                 \
-		.regulators_node = of_match_ptr("regulators"),          \
-		.type           = REGULATOR_VOLTAGE,                    \
-		.id             = (_id),                                \
-		.n_voltages     = (((_max) - (_min)) / (_step) + 1),    \
-		.owner          = THIS_MODULE,                          \
-		.min_uV         = (_min) * 1000,                        \
-		.uV_step        = (_step) * 1000,                       \
-		.vsel_reg       = (_vreg),                              \
-		.vsel_mask      = (_vmask),                             \
-		.enable_reg     = (_ereg),                              \
-		.enable_mask    = (_emask),                             \
-		.enable_time    = (_etime),                             \
-		.ops            = &rk805_reg_ops,                       \
+#define ENABLE_MASK(id)			(BIT(id) | BIT(4 + (id)))
+#define DISABLE_VAL(id)			(BIT(4 + (id)))
+
+#define RK817_BOOST_DESC(_id, _match, _supply, _min, _max, _step, _vreg,\
+	_vmask, _ereg, _emask, _enval, _disval, _etime, m_drop)		\
+	{							\
+		.name		= (_match),				\
+		.supply_name	= (_supply),				\
+		.of_match	= of_match_ptr(_match),			\
+		.regulators_node = of_match_ptr("regulators"),		\
+		.type		= REGULATOR_VOLTAGE,			\
+		.id		= (_id),				\
+		.n_voltages	= (((_max) - (_min)) / (_step) + 1),	\
+		.owner		= THIS_MODULE,				\
+		.min_uV		= (_min) * 1000,			\
+		.uV_step	= (_step) * 1000,			\
+		.vsel_reg	= (_vreg),				\
+		.vsel_mask	= (_vmask),				\
+		.enable_reg	= (_ereg),				\
+		.enable_mask	= (_emask),				\
+		.enable_val     = (_enval),				\
+		.disable_val     = (_disval),				\
+		.enable_time	= (_etime),				\
+		.min_dropout_uV = (m_drop) * 1000,			\
+		.ops		= &rk817_boost_ops,			\
 	}
 
-#define RK8XX_DESC(_id, _match, _supply, _min, _max, _step, _vreg,	\
-	_vmask, _ereg, _emask, _etime)					\
-	[_id] = {							\
+#define RK8XX_DESC_COM(_id, _match, _supply, _min, _max, _step, _vreg,	\
+	_vmask, _ereg, _emask, _enval, _disval, _etime, _ops)		\
+	{								\
 		.name		= (_match),				\
 		.supply_name	= (_supply),				\
 		.of_match	= of_match_ptr(_match),			\
@@ -103,12 +115,30 @@
 		.vsel_mask	= (_vmask),				\
 		.enable_reg	= (_ereg),				\
 		.enable_mask	= (_emask),				\
+		.enable_val     = (_enval),				\
+		.disable_val     = (_disval),				\
 		.enable_time	= (_etime),				\
-		.ops		= &rk808_reg_ops,			\
+		.ops		= _ops,			\
 	}
 
-#define RK8XX_DESC_SWITCH(_id, _match, _supply, _ereg, _emask)		\
-	[_id] = {							\
+#define RK805_DESC(_id, _match, _supply, _min, _max, _step, _vreg,	\
+	_vmask, _ereg, _emask, _etime)					\
+	RK8XX_DESC_COM(_id, _match, _supply, _min, _max, _step, _vreg,	\
+	_vmask, _ereg, _emask, 0, 0, _etime, &rk805_reg_ops)
+
+#define RK8XX_DESC(_id, _match, _supply, _min, _max, _step, _vreg,	\
+	_vmask, _ereg, _emask, _etime)					\
+	RK8XX_DESC_COM(_id, _match, _supply, _min, _max, _step, _vreg,	\
+	_vmask, _ereg, _emask, 0, 0, _etime, &rk808_reg_ops)
+
+#define RK817_DESC(_id, _match, _supply, _min, _max, _step, _vreg,	\
+	_vmask, _ereg, _emask, _disval, _etime)				\
+	RK8XX_DESC_COM(_id, _match, _supply, _min, _max, _step, _vreg,	\
+	_vmask, _ereg, _emask, _emask, _disval, _etime, &rk817_reg_ops)
+
+#define RKXX_DESC_SWITCH_COM(_id, _match, _supply, _ereg, _emask,	\
+	_enval, _disval, _ops)						\
+	{								\
 		.name		= (_match),				\
 		.supply_name	= (_supply),				\
 		.of_match	= of_match_ptr(_match),			\
@@ -117,10 +147,20 @@
 		.id		= (_id),				\
 		.enable_reg	= (_ereg),				\
 		.enable_mask	= (_emask),				\
+		.enable_val     = (_enval),				\
+		.disable_val     = (_disval),				\
 		.owner		= THIS_MODULE,				\
-		.ops		= &rk808_switch_ops			\
+		.ops		= _ops					\
 	}
 
+#define RK817_DESC_SWITCH(_id, _match, _supply, _ereg, _emask,		\
+	_disval)							\
+	RKXX_DESC_SWITCH_COM(_id, _match, _supply, _ereg, _emask,	\
+	_emask, _disval, &rk817_switch_ops)
+
+#define RK8XX_DESC_SWITCH(_id, _match, _supply, _ereg, _emask)		\
+	RKXX_DESC_SWITCH_COM(_id, _match, _supply, _ereg, _emask,	\
+	0, 0, &rk808_switch_ops)
 
 struct rk808_regulator_data {
 	struct gpio_desc *dvs_gpio[2];
@@ -138,6 +178,51 @@ static const struct regulator_linear_range rk808_ldo3_voltage_ranges[] = {
 	REGULATOR_LINEAR_RANGE(2500000, 15, 15, 0),
 };
 
+#define RK809_BUCK5_SEL_CNT		(8)
+
+static const struct regulator_linear_range rk809_buck5_voltage_ranges[] = {
+	REGULATOR_LINEAR_RANGE(1500000, 0, 0, 0),
+	REGULATOR_LINEAR_RANGE(1800000, 1, 3, 200000),
+	REGULATOR_LINEAR_RANGE(2800000, 4, 5, 200000),
+	REGULATOR_LINEAR_RANGE(3300000, 6, 7, 300000),
+};
+
+#define RK817_BUCK1_MIN0 500000
+#define RK817_BUCK1_MAX0 1500000
+
+#define RK817_BUCK1_MIN1 1600000
+#define RK817_BUCK1_MAX1 2400000
+
+#define RK817_BUCK3_MAX1 3400000
+
+#define RK817_BUCK1_STP0 12500
+#define RK817_BUCK1_STP1 100000
+
+#define RK817_BUCK1_SEL0 ((RK817_BUCK1_MAX0 - RK817_BUCK1_MIN0) /\
+						  RK817_BUCK1_STP0)
+#define RK817_BUCK1_SEL1 ((RK817_BUCK1_MAX1 - RK817_BUCK1_MIN1) /\
+						  RK817_BUCK1_STP1)
+
+#define RK817_BUCK3_SEL1 ((RK817_BUCK3_MAX1 - RK817_BUCK1_MIN1) /\
+						  RK817_BUCK1_STP1)
+
+#define RK817_BUCK1_SEL_CNT (RK817_BUCK1_SEL0 + RK817_BUCK1_SEL1 + 1)
+#define RK817_BUCK3_SEL_CNT (RK817_BUCK1_SEL0 + RK817_BUCK3_SEL1 + 1)
+
+static const struct regulator_linear_range rk817_buck1_voltage_ranges[] = {
+	REGULATOR_LINEAR_RANGE(RK817_BUCK1_MIN0, 0,
+			       RK817_BUCK1_SEL0, RK817_BUCK1_STP0),
+	REGULATOR_LINEAR_RANGE(RK817_BUCK1_MIN1, RK817_BUCK1_SEL0 + 1,
+			       RK817_BUCK1_SEL_CNT, RK817_BUCK1_STP1),
+};
+
+static const struct regulator_linear_range rk817_buck3_voltage_ranges[] = {
+	REGULATOR_LINEAR_RANGE(RK817_BUCK1_MIN0, 0,
+			       RK817_BUCK1_SEL0, RK817_BUCK1_STP0),
+	REGULATOR_LINEAR_RANGE(RK817_BUCK1_MIN1, RK817_BUCK1_SEL0 + 1,
+			       RK817_BUCK3_SEL_CNT, RK817_BUCK1_STP1),
+};
+
 static int rk808_buck1_2_get_voltage_sel_regmap(struct regulator_dev *rdev)
 {
 	struct rk808_regulator_data *pdata = rdev_get_drvdata(rdev);
@@ -289,6 +374,36 @@ static int rk808_set_ramp_delay(struct regulator_dev *rdev, int ramp_delay)
 				  RK808_RAMP_RATE_MASK, ramp_value);
 }
 
+/*
+ * RK817/RK809: map ramp_delay onto a ramp-rate field value (25mV/us fallback)
+ */
+static int rk817_set_ramp_delay(struct regulator_dev *rdev, int ramp_delay)
+{
+	unsigned int ramp_value = RK817_RAMP_RATE_25MV_PER_US;
+	unsigned int reg = RK817_BUCK_CONFIG_REG(rdev_get_id(rdev));
+
+	switch (ramp_delay) {
+	case 0 ... 3000:
+		ramp_value = RK817_RAMP_RATE_3MV_PER_US;
+		break;
+	case 3001 ... 6300:
+		ramp_value = RK817_RAMP_RATE_6_3MV_PER_US;
+		break;
+	case 6301 ... 12500:
+		ramp_value = RK817_RAMP_RATE_12_5MV_PER_US;
+		break;
+	case 12501 ... 25000:
+		break;
+	default:
+		dev_warn(&rdev->dev,
+			 "%s ramp_delay: %d not supported, setting 25000\n",
+			 rdev->desc->name, ramp_delay);
+	}
+
+	return regmap_update_bits(rdev->regmap, reg,
+				  RK817_RAMP_RATE_MASK, ramp_value);
+}
+
 static int rk808_set_suspend_voltage(struct regulator_dev *rdev, int uv)
 {
 	unsigned int reg;
@@ -304,6 +419,21 @@ static int rk808_set_suspend_voltage(struct regulator_dev *rdev, int uv)
 				  sel);
 }
 
+static int rk817_set_suspend_voltage(struct regulator_dev *rdev, int uv)
+{
+	unsigned int reg;
+	int sel = regulator_map_voltage_linear(rdev, uv, uv);
+	/* only ldo1~ldo9 */
+	if (sel < 0)
+		return -EINVAL;
+
+	reg = rdev->desc->vsel_reg + RK808_SLP_REG_OFFSET;
+
+	return regmap_update_bits(rdev->regmap, reg,
+				  rdev->desc->vsel_mask,
+				  sel);
+}
+
 static int rk808_set_suspend_voltage_range(struct regulator_dev *rdev, int uv)
 {
 	unsigned int reg;
@@ -363,6 +493,131 @@ static int rk808_set_suspend_disable(struct regulator_dev *rdev)
 				  rdev->desc->enable_mask);
 }
 
+static int rk817_set_suspend_enable_ctrl(struct regulator_dev *rdev,
+					 unsigned int en)
+{
+	unsigned int reg;
+	int id = rdev_get_id(rdev);
+	unsigned int id_slp, msk, val;
+
+	if (id >= RK817_ID_DCDC1 && id <= RK817_ID_DCDC4)
+		id_slp = id;
+	else if (id >= RK817_ID_LDO1 && id <= RK817_ID_LDO8)
+		id_slp = 8 + (id - RK817_ID_LDO1);
+	else if (id >= RK817_ID_LDO9 && id <= RK809_ID_SW2)
+		id_slp = 4 + (id - RK817_ID_LDO9);
+	else
+		return -EINVAL;
+
+	reg = RK817_POWER_SLP_EN_REG(id_slp / 8);
+
+	msk = BIT(id_slp % 8);
+	if (en)
+		val = msk;
+	else
+		val = 0;
+
+	return regmap_update_bits(rdev->regmap, reg, msk, val);
+}
+
+static int rk817_set_suspend_enable(struct regulator_dev *rdev)
+{
+	return rk817_set_suspend_enable_ctrl(rdev, 1);
+}
+
+static int rk817_set_suspend_disable(struct regulator_dev *rdev)
+{
+	return rk817_set_suspend_enable_ctrl(rdev, 0);
+}
+
+static int rk8xx_set_suspend_mode(struct regulator_dev *rdev, unsigned int mode)
+{
+	unsigned int reg;
+
+	reg = rdev->desc->vsel_reg + RK808_SLP_REG_OFFSET;	/* sleep-state copy of vsel_reg */
+
+	switch (mode) {
+	case REGULATOR_MODE_FAST:
+		return regmap_update_bits(rdev->regmap, reg,
+					  PWM_MODE_MSK, FPWM_MODE);
+	case REGULATOR_MODE_NORMAL:
+		return regmap_update_bits(rdev->regmap, reg,
+					  PWM_MODE_MSK, AUTO_PWM_MODE);
+	default:
+		dev_err(&rdev->dev, "do not support this mode\n");
+		return -EINVAL;
+	}
+
+	return 0;	/* not reached: every switch branch above returns */
+}
+
+static int rk8xx_set_mode(struct regulator_dev *rdev, unsigned int mode)
+{
+	switch (mode) {
+	case REGULATOR_MODE_FAST:	/* set the PWM mode bit in vsel_reg */
+		return regmap_update_bits(rdev->regmap, rdev->desc->vsel_reg,
+					  PWM_MODE_MSK, FPWM_MODE);
+	case REGULATOR_MODE_NORMAL:	/* clear the PWM mode bit in vsel_reg */
+		return regmap_update_bits(rdev->regmap, rdev->desc->vsel_reg,
+					  PWM_MODE_MSK, AUTO_PWM_MODE);
+	default:
+		dev_err(&rdev->dev, "do not support this mode\n");
+		return -EINVAL;
+	}
+
+	return 0;	/* not reached: every switch branch above returns */
+}
+
+static unsigned int rk8xx_get_mode(struct regulator_dev *rdev)
+{
+	unsigned int val;
+	int err;
+
+	err = regmap_read(rdev->regmap, rdev->desc->vsel_reg, &val);
+	if (err)
+		return err;	/* NOTE(review): int error code returned as unsigned int */
+
+	if (val & FPWM_MODE)	/* mode bit set: report fast mode */
+		return REGULATOR_MODE_FAST;
+	else
+		return REGULATOR_MODE_NORMAL;
+}
+
+static int rk8xx_is_enabled_wmsk_regmap(struct regulator_dev *rdev)
+{
+	unsigned int val;
+	int ret;
+
+	ret = regmap_read(rdev->regmap, rdev->desc->enable_reg, &val);
+	if (ret != 0)
+		return ret;
+
+	/* add write mask bit */
+	val |= (rdev->desc->enable_mask & 0xf0);
+	val &= rdev->desc->enable_mask;
+
+	if (rdev->desc->enable_is_inverted) {
+		if (rdev->desc->enable_val)
+			return val != rdev->desc->enable_val;
+		return (val == 0);
+	}
+	if (rdev->desc->enable_val)
+		return val == rdev->desc->enable_val;
+	return val != 0;
+}
+
+static unsigned int rk8xx_regulator_of_map_mode(unsigned int mode)
+{
+	switch (mode) {	/* mode: value handed to the of_map_mode callback */
+	case 1:
+		return REGULATOR_MODE_FAST;
+	case 2:
+		return REGULATOR_MODE_NORMAL;
+	default:	/* unsigned return type: report "invalid", not a negative errno */
+		return REGULATOR_MODE_INVALID;
+	}
+}
+
 static const struct regulator_ops rk805_reg_ops = {
 	.list_voltage           = regulator_list_voltage_linear,
 	.map_voltage            = regulator_map_voltage_linear,
@@ -439,6 +694,71 @@ static const struct regulator_linear_range rk805_buck_1_2_voltage_ranges[] = {
 	REGULATOR_LINEAR_RANGE(2300000, 63, 63, 0),
 };
 
+static const struct regulator_ops rk809_buck5_ops_range = {
+	.list_voltage		= regulator_list_voltage_linear_range,
+	.map_voltage		= regulator_map_voltage_linear_range,
+	.get_voltage_sel	= regulator_get_voltage_sel_regmap,
+	.set_voltage_sel	= regulator_set_voltage_sel_regmap,
+	.set_voltage_time_sel	= regulator_set_voltage_time_sel,
+	.enable			= regulator_enable_regmap,
+	.disable		= regulator_disable_regmap,
+	.is_enabled		= rk8xx_is_enabled_wmsk_regmap,
+	.set_suspend_voltage	= rk808_set_suspend_voltage_range,
+	.set_suspend_enable	= rk817_set_suspend_enable,
+	.set_suspend_disable	= rk817_set_suspend_disable,
+};
+
+static const struct regulator_ops rk817_reg_ops = {
+	.list_voltage		= regulator_list_voltage_linear,
+	.map_voltage		= regulator_map_voltage_linear,
+	.get_voltage_sel	= regulator_get_voltage_sel_regmap,
+	.set_voltage_sel	= regulator_set_voltage_sel_regmap,
+	.enable			= regulator_enable_regmap,
+	.disable		= regulator_disable_regmap,
+	.is_enabled		= rk8xx_is_enabled_wmsk_regmap,
+	.set_suspend_voltage	= rk817_set_suspend_voltage,
+	.set_suspend_enable	= rk817_set_suspend_enable,
+	.set_suspend_disable	= rk817_set_suspend_disable,
+};
+
+static const struct regulator_ops rk817_boost_ops = {
+	.list_voltage		= regulator_list_voltage_linear,
+	.map_voltage		= regulator_map_voltage_linear,
+	.get_voltage_sel	= regulator_get_voltage_sel_regmap,
+	.set_voltage_sel	= regulator_set_voltage_sel_regmap,
+	.enable			= regulator_enable_regmap,
+	.disable		= regulator_disable_regmap,
+	.is_enabled		= rk8xx_is_enabled_wmsk_regmap,
+	.set_suspend_enable	= rk817_set_suspend_enable,
+	.set_suspend_disable	= rk817_set_suspend_disable,
+};
+
+static const struct regulator_ops rk817_buck_ops_range = {
+	.list_voltage		= regulator_list_voltage_linear_range,
+	.map_voltage		= regulator_map_voltage_linear_range,
+	.get_voltage_sel	= regulator_get_voltage_sel_regmap,
+	.set_voltage_sel	= regulator_set_voltage_sel_regmap,
+	.set_voltage_time_sel	= regulator_set_voltage_time_sel,
+	.enable			= regulator_enable_regmap,
+	.disable		= regulator_disable_regmap,
+	.is_enabled		= rk8xx_is_enabled_wmsk_regmap,
+	.set_mode		= rk8xx_set_mode,
+	.get_mode		= rk8xx_get_mode,
+	.set_suspend_mode	= rk8xx_set_suspend_mode,
+	.set_ramp_delay		= rk817_set_ramp_delay,
+	.set_suspend_voltage	= rk808_set_suspend_voltage_range,
+	.set_suspend_enable	= rk817_set_suspend_enable,
+	.set_suspend_disable	= rk817_set_suspend_disable,
+};
+
+static const struct regulator_ops rk817_switch_ops = {
+	.enable			= regulator_enable_regmap,
+	.disable		= regulator_disable_regmap,
+	.is_enabled		= rk8xx_is_enabled_wmsk_regmap,
+	.set_suspend_enable	= rk817_set_suspend_enable,
+	.set_suspend_disable	= rk817_set_suspend_disable,
+};
+
 static const struct regulator_desc rk805_reg[] = {
 	{
 		.name = "DCDC_REG1",
@@ -595,6 +915,271 @@ static const struct regulator_desc rk808_reg[] = {
 		RK808_DCDC_EN_REG, BIT(6)),
 };
 
+static const struct regulator_desc rk809_reg[] = {
+	{
+		.name = "DCDC_REG1",
+		.supply_name = "vcc1",
+		.of_match = of_match_ptr("DCDC_REG1"),
+		.regulators_node = of_match_ptr("regulators"),
+		.id = RK817_ID_DCDC1,
+		.ops = &rk817_buck_ops_range,
+		.type = REGULATOR_VOLTAGE,
+		.n_voltages = RK817_BUCK1_SEL_CNT + 1,
+		.linear_ranges = rk817_buck1_voltage_ranges,
+		.n_linear_ranges = ARRAY_SIZE(rk817_buck1_voltage_ranges),
+		.vsel_reg = RK817_BUCK1_ON_VSEL_REG,
+		.vsel_mask = RK817_BUCK_VSEL_MASK,
+		.enable_reg = RK817_POWER_EN_REG(0),
+		.enable_mask = ENABLE_MASK(RK817_ID_DCDC1),
+		.enable_val = ENABLE_MASK(RK817_ID_DCDC1),
+		.disable_val = DISABLE_VAL(RK817_ID_DCDC1),
+		.of_map_mode = rk8xx_regulator_of_map_mode,
+		.owner = THIS_MODULE,
+	}, {
+		.name = "DCDC_REG2",
+		.supply_name = "vcc2",
+		.of_match = of_match_ptr("DCDC_REG2"),
+		.regulators_node = of_match_ptr("regulators"),
+		.id = RK817_ID_DCDC2,
+		.ops = &rk817_buck_ops_range,
+		.type = REGULATOR_VOLTAGE,
+		.n_voltages = RK817_BUCK1_SEL_CNT + 1,
+		.linear_ranges = rk817_buck1_voltage_ranges,
+		.n_linear_ranges = ARRAY_SIZE(rk817_buck1_voltage_ranges),
+		.vsel_reg = RK817_BUCK2_ON_VSEL_REG,
+		.vsel_mask = RK817_BUCK_VSEL_MASK,
+		.enable_reg = RK817_POWER_EN_REG(0),
+		.enable_mask = ENABLE_MASK(RK817_ID_DCDC2),
+		.enable_val = ENABLE_MASK(RK817_ID_DCDC2),
+		.disable_val = DISABLE_VAL(RK817_ID_DCDC2),
+		.of_map_mode = rk8xx_regulator_of_map_mode,
+		.owner = THIS_MODULE,
+	}, {
+		.name = "DCDC_REG3",
+		.supply_name = "vcc3",
+		.of_match = of_match_ptr("DCDC_REG3"),
+		.regulators_node = of_match_ptr("regulators"),
+		.id = RK817_ID_DCDC3,
+		.ops = &rk817_buck_ops_range,
+		.type = REGULATOR_VOLTAGE,
+		.n_voltages = RK817_BUCK1_SEL_CNT + 1,
+		.linear_ranges = rk817_buck1_voltage_ranges,
+		.n_linear_ranges = ARRAY_SIZE(rk817_buck1_voltage_ranges),
+		.vsel_reg = RK817_BUCK3_ON_VSEL_REG,
+		.vsel_mask = RK817_BUCK_VSEL_MASK,
+		.enable_reg = RK817_POWER_EN_REG(0),
+		.enable_mask = ENABLE_MASK(RK817_ID_DCDC3),
+		.enable_val = ENABLE_MASK(RK817_ID_DCDC3),
+		.disable_val = DISABLE_VAL(RK817_ID_DCDC3),
+		.of_map_mode = rk8xx_regulator_of_map_mode,
+		.owner = THIS_MODULE,
+	}, {
+		.name = "DCDC_REG4",
+		.supply_name = "vcc4",
+		.of_match = of_match_ptr("DCDC_REG4"),
+		.regulators_node = of_match_ptr("regulators"),
+		.id = RK817_ID_DCDC4,
+		.ops = &rk817_buck_ops_range,
+		.type = REGULATOR_VOLTAGE,
+		.n_voltages = RK817_BUCK3_SEL_CNT + 1,
+		.linear_ranges = rk817_buck3_voltage_ranges,
+		.n_linear_ranges = ARRAY_SIZE(rk817_buck3_voltage_ranges),
+		.vsel_reg = RK817_BUCK4_ON_VSEL_REG,
+		.vsel_mask = RK817_BUCK_VSEL_MASK,
+		.enable_reg = RK817_POWER_EN_REG(0),
+		.enable_mask = ENABLE_MASK(RK817_ID_DCDC4),
+		.enable_val = ENABLE_MASK(RK817_ID_DCDC4),
+		.disable_val = DISABLE_VAL(RK817_ID_DCDC4),
+		.of_map_mode = rk8xx_regulator_of_map_mode,
+		.owner = THIS_MODULE,
+	},
+	{
+		.name = "DCDC_REG5",
+		.supply_name = "vcc9",
+		.of_match = of_match_ptr("DCDC_REG5"),
+		.regulators_node = of_match_ptr("regulators"),
+		.id = RK809_ID_DCDC5,
+		.ops = &rk809_buck5_ops_range,
+		.type = REGULATOR_VOLTAGE,
+		.n_voltages = RK809_BUCK5_SEL_CNT,
+		.linear_ranges = rk809_buck5_voltage_ranges,
+		.n_linear_ranges = ARRAY_SIZE(rk809_buck5_voltage_ranges),
+		.vsel_reg = RK809_BUCK5_CONFIG(0),
+		.vsel_mask = RK809_BUCK5_VSEL_MASK,
+		.enable_reg = RK817_POWER_EN_REG(3),
+		.enable_mask = ENABLE_MASK(1),
+		.enable_val = ENABLE_MASK(1),
+		.disable_val = DISABLE_VAL(1),
+		.of_map_mode = rk8xx_regulator_of_map_mode,
+		.owner = THIS_MODULE,
+	},
+	RK817_DESC(RK817_ID_LDO1, "LDO_REG1", "vcc5", 600, 3400, 25,
+		   RK817_LDO_ON_VSEL_REG(0), RK817_LDO_VSEL_MASK,
+		   RK817_POWER_EN_REG(1), ENABLE_MASK(0),
+		   DISABLE_VAL(0), 400),
+	RK817_DESC(RK817_ID_LDO2, "LDO_REG2", "vcc5", 600, 3400, 25,
+		   RK817_LDO_ON_VSEL_REG(1), RK817_LDO_VSEL_MASK,
+		   RK817_POWER_EN_REG(1), ENABLE_MASK(1),
+		   DISABLE_VAL(1), 400),
+	RK817_DESC(RK817_ID_LDO3, "LDO_REG3", "vcc5", 600, 3400, 25,
+		   RK817_LDO_ON_VSEL_REG(2), RK817_LDO_VSEL_MASK,
+		   RK817_POWER_EN_REG(1), ENABLE_MASK(2),
+		   DISABLE_VAL(2), 400),
+	RK817_DESC(RK817_ID_LDO4, "LDO_REG4", "vcc6", 600, 3400, 25,
+		   RK817_LDO_ON_VSEL_REG(3), RK817_LDO_VSEL_MASK,
+		   RK817_POWER_EN_REG(1), ENABLE_MASK(3),
+		   DISABLE_VAL(3), 400),
+	RK817_DESC(RK817_ID_LDO5, "LDO_REG5", "vcc6", 600, 3400, 25,
+		   RK817_LDO_ON_VSEL_REG(4), RK817_LDO_VSEL_MASK,
+		   RK817_POWER_EN_REG(2), ENABLE_MASK(0),
+		   DISABLE_VAL(0), 400),
+	RK817_DESC(RK817_ID_LDO6, "LDO_REG6", "vcc6", 600, 3400, 25,
+		   RK817_LDO_ON_VSEL_REG(5), RK817_LDO_VSEL_MASK,
+		   RK817_POWER_EN_REG(2), ENABLE_MASK(1),
+		   DISABLE_VAL(1), 400),
+	RK817_DESC(RK817_ID_LDO7, "LDO_REG7", "vcc7", 600, 3400, 25,
+		   RK817_LDO_ON_VSEL_REG(6), RK817_LDO_VSEL_MASK,
+		   RK817_POWER_EN_REG(2), ENABLE_MASK(2),
+		   DISABLE_VAL(2), 400),
+	RK817_DESC(RK817_ID_LDO8, "LDO_REG8", "vcc7", 600, 3400, 25,
+		   RK817_LDO_ON_VSEL_REG(7), RK817_LDO_VSEL_MASK,
+		   RK817_POWER_EN_REG(2), ENABLE_MASK(3),
+		   DISABLE_VAL(3), 400),
+	RK817_DESC(RK817_ID_LDO9, "LDO_REG9", "vcc7", 600, 3400, 25,
+		   RK817_LDO_ON_VSEL_REG(8), RK817_LDO_VSEL_MASK,
+		   RK817_POWER_EN_REG(3), ENABLE_MASK(0),
+		   DISABLE_VAL(0), 400),
+	RK817_DESC_SWITCH(RK809_ID_SW1, "SWITCH_REG1", "vcc9",
+			  RK817_POWER_EN_REG(3), ENABLE_MASK(2),
+			  DISABLE_VAL(2)),
+	RK817_DESC_SWITCH(RK809_ID_SW2, "SWITCH_REG2", "vcc8",
+			  RK817_POWER_EN_REG(3), ENABLE_MASK(3),
+			  DISABLE_VAL(3)),
+};
+
+static const struct regulator_desc rk817_reg[] = {
+	{
+		.name = "DCDC_REG1",
+		.supply_name = "vcc1",
+		.of_match = of_match_ptr("DCDC_REG1"),
+		.regulators_node = of_match_ptr("regulators"),
+		.id = RK817_ID_DCDC1,
+		.ops = &rk817_buck_ops_range,
+		.type = REGULATOR_VOLTAGE,
+		.n_voltages = RK817_BUCK1_SEL_CNT + 1,
+		.linear_ranges = rk817_buck1_voltage_ranges,
+		.n_linear_ranges = ARRAY_SIZE(rk817_buck1_voltage_ranges),
+		.vsel_reg = RK817_BUCK1_ON_VSEL_REG,
+		.vsel_mask = RK817_BUCK_VSEL_MASK,
+		.enable_reg = RK817_POWER_EN_REG(0),
+		.enable_mask = ENABLE_MASK(RK817_ID_DCDC1),
+		.enable_val = ENABLE_MASK(RK817_ID_DCDC1),
+		.disable_val = DISABLE_VAL(RK817_ID_DCDC1),
+		.of_map_mode = rk8xx_regulator_of_map_mode,
+		.owner = THIS_MODULE,
+	}, {
+		.name = "DCDC_REG2",
+		.supply_name = "vcc2",
+		.of_match = of_match_ptr("DCDC_REG2"),
+		.regulators_node = of_match_ptr("regulators"),
+		.id = RK817_ID_DCDC2,
+		.ops = &rk817_buck_ops_range,
+		.type = REGULATOR_VOLTAGE,
+		.n_voltages = RK817_BUCK1_SEL_CNT + 1,
+		.linear_ranges = rk817_buck1_voltage_ranges,
+		.n_linear_ranges = ARRAY_SIZE(rk817_buck1_voltage_ranges),
+		.vsel_reg = RK817_BUCK2_ON_VSEL_REG,
+		.vsel_mask = RK817_BUCK_VSEL_MASK,
+		.enable_reg = RK817_POWER_EN_REG(0),
+		.enable_mask = ENABLE_MASK(RK817_ID_DCDC2),
+		.enable_val = ENABLE_MASK(RK817_ID_DCDC2),
+		.disable_val = DISABLE_VAL(RK817_ID_DCDC2),
+		.of_map_mode = rk8xx_regulator_of_map_mode,
+		.owner = THIS_MODULE,
+	}, {
+		.name = "DCDC_REG3",
+		.supply_name = "vcc3",
+		.of_match = of_match_ptr("DCDC_REG3"),
+		.regulators_node = of_match_ptr("regulators"),
+		.id = RK817_ID_DCDC3,
+		.ops = &rk817_buck_ops_range,
+		.type = REGULATOR_VOLTAGE,
+		.n_voltages = RK817_BUCK1_SEL_CNT + 1,
+		.linear_ranges = rk817_buck1_voltage_ranges,
+		.n_linear_ranges = ARRAY_SIZE(rk817_buck1_voltage_ranges),
+		.vsel_reg = RK817_BUCK3_ON_VSEL_REG,
+		.vsel_mask = RK817_BUCK_VSEL_MASK,
+		.enable_reg = RK817_POWER_EN_REG(0),
+		.enable_mask = ENABLE_MASK(RK817_ID_DCDC3),
+		.enable_val = ENABLE_MASK(RK817_ID_DCDC3),
+		.disable_val = DISABLE_VAL(RK817_ID_DCDC3),
+		.of_map_mode = rk8xx_regulator_of_map_mode,
+		.owner = THIS_MODULE,
+	}, {
+		.name = "DCDC_REG4",
+		.supply_name = "vcc4",
+		.of_match = of_match_ptr("DCDC_REG4"),
+		.regulators_node = of_match_ptr("regulators"),
+		.id = RK817_ID_DCDC4,
+		.ops = &rk817_buck_ops_range,
+		.type = REGULATOR_VOLTAGE,
+		.n_voltages = RK817_BUCK3_SEL_CNT + 1,
+		.linear_ranges = rk817_buck3_voltage_ranges,
+		.n_linear_ranges = ARRAY_SIZE(rk817_buck3_voltage_ranges),
+		.vsel_reg = RK817_BUCK4_ON_VSEL_REG,
+		.vsel_mask = RK817_BUCK_VSEL_MASK,
+		.enable_reg = RK817_POWER_EN_REG(0),
+		.enable_mask = ENABLE_MASK(RK817_ID_DCDC4),
+		.enable_val = ENABLE_MASK(RK817_ID_DCDC4),
+		.disable_val = DISABLE_VAL(RK817_ID_DCDC4),
+		.of_map_mode = rk8xx_regulator_of_map_mode,
+		.owner = THIS_MODULE,
+	},
+	RK817_DESC(RK817_ID_LDO1, "LDO_REG1", "vcc5", 600, 3400, 25,
+		   RK817_LDO_ON_VSEL_REG(0), RK817_LDO_VSEL_MASK,
+		   RK817_POWER_EN_REG(1), ENABLE_MASK(0),
+		   DISABLE_VAL(0), 400),
+	RK817_DESC(RK817_ID_LDO2, "LDO_REG2", "vcc5", 600, 3400, 25,
+		   RK817_LDO_ON_VSEL_REG(1), RK817_LDO_VSEL_MASK,
+		   RK817_POWER_EN_REG(1), ENABLE_MASK(1),
+		   DISABLE_VAL(1), 400),
+	RK817_DESC(RK817_ID_LDO3, "LDO_REG3", "vcc5", 600, 3400, 25,
+		   RK817_LDO_ON_VSEL_REG(2), RK817_LDO_VSEL_MASK,
+		   RK817_POWER_EN_REG(1), ENABLE_MASK(2),
+		   DISABLE_VAL(2), 400),
+	RK817_DESC(RK817_ID_LDO4, "LDO_REG4", "vcc6", 600, 3400, 25,
+		   RK817_LDO_ON_VSEL_REG(3), RK817_LDO_VSEL_MASK,
+		   RK817_POWER_EN_REG(1), ENABLE_MASK(3),
+		   DISABLE_VAL(3), 400),
+	RK817_DESC(RK817_ID_LDO5, "LDO_REG5", "vcc6", 600, 3400, 25,
+		   RK817_LDO_ON_VSEL_REG(4), RK817_LDO_VSEL_MASK,
+		   RK817_POWER_EN_REG(2), ENABLE_MASK(0),
+		   DISABLE_VAL(0), 400),
+	RK817_DESC(RK817_ID_LDO6, "LDO_REG6", "vcc6", 600, 3400, 25,
+		   RK817_LDO_ON_VSEL_REG(5), RK817_LDO_VSEL_MASK,
+		   RK817_POWER_EN_REG(2), ENABLE_MASK(1),
+		   DISABLE_VAL(1), 400),
+	RK817_DESC(RK817_ID_LDO7, "LDO_REG7", "vcc7", 600, 3400, 25,
+		   RK817_LDO_ON_VSEL_REG(6), RK817_LDO_VSEL_MASK,
+		   RK817_POWER_EN_REG(2), ENABLE_MASK(2),
+		   DISABLE_VAL(2), 400),
+	RK817_DESC(RK817_ID_LDO8, "LDO_REG8", "vcc7", 600, 3400, 25,
+		   RK817_LDO_ON_VSEL_REG(7), RK817_LDO_VSEL_MASK,
+		   RK817_POWER_EN_REG(2), ENABLE_MASK(3),
+		   DISABLE_VAL(3), 400),
+	RK817_DESC(RK817_ID_LDO9, "LDO_REG9", "vcc7", 600, 3400, 25,
+		   RK817_LDO_ON_VSEL_REG(8), RK817_LDO_VSEL_MASK,
+		   RK817_POWER_EN_REG(3), ENABLE_MASK(0),
+		   DISABLE_VAL(0), 400),
+	RK817_BOOST_DESC(RK817_ID_BOOST, "BOOST", "vcc8", 4700, 5400, 100,
+			 RK817_BOOST_OTG_CFG, RK817_BOOST_VSEL_MASK,
+			 RK817_POWER_EN_REG(3), ENABLE_MASK(1), ENABLE_MASK(1),
+		   DISABLE_VAL(1), 400, 3500 - 5400),
+	RK817_DESC_SWITCH(RK817_ID_BOOST_OTG_SW, "OTG_SWITCH", "vcc9",
+			  RK817_POWER_EN_REG(3), ENABLE_MASK(2),
+			  DISABLE_VAL(2)),
+};
+
 static const struct regulator_desc rk818_reg[] = {
 	{
 		.name = "DCDC_REG1",
@@ -765,6 +1350,14 @@ static int rk808_regulator_probe(struct platform_device *pdev)
 		regulators = rk808_reg;
 		nregulators = RK808_NUM_REGULATORS;
 		break;
+	case RK809_ID:
+		regulators = rk809_reg;
+		nregulators = RK809_NUM_REGULATORS;
+		break;
+	case RK817_ID:
+		regulators = rk817_reg;
+		nregulators = RK817_NUM_REGULATORS;
+		break;
 	case RK818_ID:
 		regulators = rk818_reg;
 		nregulators = RK818_NUM_REGULATORS;
@@ -803,6 +1396,7 @@ static struct platform_driver rk808_regulator_driver = {
 module_platform_driver(rk808_regulator_driver);
 
 MODULE_DESCRIPTION("regulator driver for the RK805/RK808/RK818 series PMICs");
+MODULE_AUTHOR("Tony xie <tony.xie@rock-chips.com>");
 MODULE_AUTHOR("Chris Zhong <zyw@rock-chips.com>");
 MODULE_AUTHOR("Zhang Qing <zhangqing@rock-chips.com>");
 MODULE_AUTHOR("Wadim Egorov <w.egorov@phytec.de>");
diff --git a/include/linux/mfd/rk808.h b/include/linux/mfd/rk808.h
index 0fd9eedf3c20..2a9cd01691b2 100644
--- a/include/linux/mfd/rk808.h
+++ b/include/linux/mfd/rk808.h
@@ -396,7 +396,10 @@ enum rk805_reg {
 #define SHUTDOWN_FUN			(0x2 << 2)
 #define SLEEP_FUN			(0x1 << 2)
 #define RK8XX_ID_MSK			0xfff0
+#define PWM_MODE_MSK			BIT(7)
 #define FPWM_MODE			BIT(7)
+#define AUTO_PWM_MODE			0
+
 enum rk817_reg_id {
 	RK817_ID_DCDC1 = 0,
 	RK817_ID_DCDC2,
-- 
cgit v1.2.3


From f310f2eff794f96b4ea87be7f5938e57c34a64f1 Mon Sep 17 00:00:00 2001
From: Enrico Weigelt <info@metux.net>
Date: Mon, 17 Jun 2019 18:45:05 +0200
Subject: gpio: Add comments on #if/#else/#endif

Improve readability a bit by commenting #if/#else/#endif statements
with the checked preprocessor symbols.

Signed-off-by: Enrico Weigelt <info@metux.net>
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
---
 include/linux/gpio/driver.h   | 16 ++++++++--------
 include/linux/gpio/gpio-reg.h |  2 +-
 include/linux/gpio/machine.h  |  4 ++--
 3 files changed, 11 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h
index 02698c0f34ea..5d325fd29d6b 100644
--- a/include/linux/gpio/driver.h
+++ b/include/linux/gpio/driver.h
@@ -161,7 +161,7 @@ struct gpio_irq_chip {
 	 */
 	void		(*irq_disable)(struct irq_data *data);
 };
-#endif
+#endif /* CONFIG_GPIOLIB_IRQCHIP */
 
 /**
  * struct gpio_chip - abstract a GPIO controller
@@ -301,7 +301,7 @@ struct gpio_chip {
 	spinlock_t bgpio_lock;
 	unsigned long bgpio_data;
 	unsigned long bgpio_dir;
-#endif
+#endif /* CONFIG_GPIO_GENERIC */
 
 #ifdef CONFIG_GPIOLIB_IRQCHIP
 	/*
@@ -316,7 +316,7 @@ struct gpio_chip {
 	 * used to handle IRQs for most practical cases.
 	 */
 	struct gpio_irq_chip irq;
-#endif
+#endif /* CONFIG_GPIOLIB_IRQCHIP */
 
 	/**
 	 * @need_valid_mask:
@@ -363,7 +363,7 @@ struct gpio_chip {
 	 */
 	int (*of_xlate)(struct gpio_chip *gc,
 			const struct of_phandle_args *gpiospec, u32 *flags);
-#endif
+#endif /* CONFIG_OF_GPIO */
 };
 
 extern const char *gpiochip_is_requested(struct gpio_chip *chip,
@@ -406,7 +406,7 @@ extern int gpiochip_add_data_with_key(struct gpio_chip *chip, void *data,
 	})
 #else
 #define gpiochip_add_data(chip, data) gpiochip_add_data_with_key(chip, data, NULL, NULL)
-#endif
+#endif /* CONFIG_LOCKDEP */
 
 static inline int gpiochip_add(struct gpio_chip *chip)
 {
@@ -461,7 +461,7 @@ int bgpio_init(struct gpio_chip *gc, struct device *dev,
 #define BGPIOF_READ_OUTPUT_REG_SET	BIT(4) /* reg_set stores output value */
 #define BGPIOF_NO_OUTPUT		BIT(5) /* only input */
 
-#endif
+#endif /* CONFIG_GPIO_GENERIC */
 
 #ifdef CONFIG_GPIOLIB_IRQCHIP
 
@@ -531,7 +531,7 @@ static inline int gpiochip_irqchip_add_nested(struct gpio_chip *gpiochip,
 					handler, type, true,
 					&lock_key, &request_key);
 }
-#else
+#else /* ! CONFIG_LOCKDEP */
 static inline int gpiochip_irqchip_add(struct gpio_chip *gpiochip,
 				       struct irq_chip *irqchip,
 				       unsigned int first_irq,
@@ -582,7 +582,7 @@ int gpiochip_add_pingroup_range(struct gpio_chip *chip,
 			unsigned int gpio_offset, const char *pin_group);
 void gpiochip_remove_pin_ranges(struct gpio_chip *chip);
 
-#else
+#else /* ! CONFIG_PINCTRL */
 
 static inline int
 gpiochip_add_pin_range(struct gpio_chip *chip, const char *pinctl_name,
diff --git a/include/linux/gpio/gpio-reg.h b/include/linux/gpio/gpio-reg.h
index 5c6efd394cb0..39b888c40b39 100644
--- a/include/linux/gpio/gpio-reg.h
+++ b/include/linux/gpio/gpio-reg.h
@@ -11,4 +11,4 @@ struct gpio_chip *gpio_reg_init(struct device *dev, void __iomem *reg,
 
 int gpio_reg_resume(struct gpio_chip *gc);
 
-#endif
+#endif /* GPIO_REG_H */
diff --git a/include/linux/gpio/machine.h b/include/linux/gpio/machine.h
index 35f299d1f6a7..1ebe5be05d5f 100644
--- a/include/linux/gpio/machine.h
+++ b/include/linux/gpio/machine.h
@@ -97,7 +97,7 @@ void gpiod_add_lookup_table(struct gpiod_lookup_table *table);
 void gpiod_add_lookup_tables(struct gpiod_lookup_table **tables, size_t n);
 void gpiod_remove_lookup_table(struct gpiod_lookup_table *table);
 void gpiod_add_hogs(struct gpiod_hog *hogs);
-#else
+#else /* ! CONFIG_GPIOLIB */
 static inline
 void gpiod_add_lookup_table(struct gpiod_lookup_table *table) {}
 static inline
@@ -105,6 +105,6 @@ void gpiod_add_lookup_tables(struct gpiod_lookup_table **tables, size_t n) {}
 static inline
 void gpiod_remove_lookup_table(struct gpiod_lookup_table *table) {}
 static inline void gpiod_add_hogs(struct gpiod_hog *hogs) {}
-#endif
+#endif /* CONFIG_GPIOLIB */
 
 #endif /* __LINUX_GPIO_MACHINE_H */
-- 
cgit v1.2.3


From 8f3fd89593377d15df88ad26dcf48318d8c9896f Mon Sep 17 00:00:00 2001
From: Enrico Weigelt <info@metux.net>
Date: Mon, 24 Jun 2019 07:40:33 +0200
Subject: siox: Add helper macro to simplify driver registration
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add more helper macros for trivial driver init cases, similar to the
already existing module_platform_driver() or module_i2c_driver().

This helps to reduce driver init boilerplate.

Signed-off-by: Enrico Weigelt <info@metux.net>
Acked-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
---
 include/linux/siox.h | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/siox.h b/include/linux/siox.h
index d79624e83134..21d770078540 100644
--- a/include/linux/siox.h
+++ b/include/linux/siox.h
@@ -75,3 +75,13 @@ static inline void siox_driver_unregister(struct siox_driver *sdriver)
 {
 	return driver_unregister(&sdriver->driver);
 }
+
+/*
+ * module_siox_driver() - Helper macro for drivers that don't do
+ * anything special in module init/exit.  This eliminates a lot of
+ * boilerplate.  Each module may only use this macro once, and
+ * calling it replaces module_init() and module_exit()
+ */
+#define module_siox_driver(__siox_driver) \
+	module_driver(__siox_driver, siox_driver_register, \
+			siox_driver_unregister)
-- 
cgit v1.2.3


From 56855a99f3d0d1e9f1f4e24f5851f9bf14c83296 Mon Sep 17 00:00:00 2001
From: Jeremy Linton <jeremy.linton@arm.com>
Date: Wed, 26 Jun 2019 16:37:16 -0500
Subject: ACPI/PPTT: Add function to return ACPI 6.3 Identical tokens

ACPI 6.3 adds a flag to indicate that child nodes are all
identical cores. This is useful to authoritatively determine
if a set of (possibly offline) cores are identical or not.

Since the flag doesn't give us a unique id we can generate
one and use it to create bitmaps of sibling nodes, or simply
in a loop to determine if a subset of cores are identical.

Acked-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Tested-by: Hanjun Guo <hanjun.guo@linaro.org>
Reviewed-by: Sudeep Holla <sudeep.holla@arm.com>
Signed-off-by: Jeremy Linton <jeremy.linton@arm.com>
Signed-off-by: Will Deacon <will@kernel.org>
---
 drivers/acpi/pptt.c  | 26 ++++++++++++++++++++++++++
 include/linux/acpi.h |  5 +++++
 2 files changed, 31 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/acpi/pptt.c b/drivers/acpi/pptt.c
index 05344413f199..1e7ac0bd0d3a 100644
--- a/drivers/acpi/pptt.c
+++ b/drivers/acpi/pptt.c
@@ -683,3 +683,29 @@ int find_acpi_cpu_topology_package(unsigned int cpu)
 	return find_acpi_cpu_topology_tag(cpu, PPTT_ABORT_PACKAGE,
 					  ACPI_PPTT_PHYSICAL_PACKAGE);
 }
+
+/**
+ * find_acpi_cpu_topology_hetero_id() - Get a core architecture tag
+ * @cpu: Kernel logical CPU number
+ *
+ * Determine a unique heterogeneous tag for the given CPU. CPUs with the same
+ * implementation should have matching tags.
+ *
+ * The returned tag can be used to group peers with identical implementation.
+ *
+ * The search terminates when a level is found with the identical implementation
+ * flag set or we reach a root node.
+ *
+ * Due to limitations in the PPTT data structure, there may be rare situations
+ * where two cores in a heterogeneous machine may be identical, but won't have
+ * the same tag.
+ *
+ * Return: -ENOENT if the PPTT doesn't exist, or the CPU cannot be found.
+ * Otherwise returns a value which represents a group of identical cores
+ * similar to this CPU.
+ */
+int find_acpi_cpu_topology_hetero_id(unsigned int cpu)
+{
+	return find_acpi_cpu_topology_tag(cpu, PPTT_ABORT_PACKAGE,
+					  ACPI_PPTT_ACPI_IDENTICAL);
+}
diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index d315d86844e4..5bcd23e5ccd6 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -1303,6 +1303,7 @@ static inline int lpit_read_residency_count_address(u64 *address)
 #ifdef CONFIG_ACPI_PPTT
 int find_acpi_cpu_topology(unsigned int cpu, int level);
 int find_acpi_cpu_topology_package(unsigned int cpu);
+int find_acpi_cpu_topology_hetero_id(unsigned int cpu);
 int find_acpi_cpu_cache_topology(unsigned int cpu, int level);
 #else
 static inline int find_acpi_cpu_topology(unsigned int cpu, int level)
@@ -1313,6 +1314,10 @@ static inline int find_acpi_cpu_topology_package(unsigned int cpu)
 {
 	return -EINVAL;
 }
+static inline int find_acpi_cpu_topology_hetero_id(unsigned int cpu)
+{
+	return -EINVAL;
+}
 static inline int find_acpi_cpu_cache_topology(unsigned int cpu, int level)
 {
 	return -EINVAL;
-- 
cgit v1.2.3


From d24a0c7099b32b6981d7f126c45348e381718350 Mon Sep 17 00:00:00 2001
From: Jeremy Linton <jeremy.linton@arm.com>
Date: Wed, 26 Jun 2019 16:37:17 -0500
Subject: arm_pmu: acpi: spe: Add initial MADT/SPE probing

ACPI 6.3 adds additional fields to the MADT GICC
structure to describe SPE PPIs. We pick these out
of the cached reference to the madt_gicc structure
similarly to the core PMU code. We then create a platform
device referring to the IRQ and let the user/module loader
decide whether to load the SPE driver.

Tested-by: Hanjun Guo <hanjun.guo@linaro.org>
Reviewed-by: Sudeep Holla <sudeep.holla@arm.com>
Reviewed-by: Lorenzo Pieralisi <lorenzo.pieralisi@arm.com>
Signed-off-by: Jeremy Linton <jeremy.linton@arm.com>
Signed-off-by: Will Deacon <will@kernel.org>
---
 arch/arm64/include/asm/acpi.h |  3 ++
 drivers/perf/arm_pmu_acpi.c   | 72 +++++++++++++++++++++++++++++++++++++++++++
 include/linux/perf/arm_pmu.h  |  2 ++
 3 files changed, 77 insertions(+)

(limited to 'include/linux')

diff --git a/arch/arm64/include/asm/acpi.h b/arch/arm64/include/asm/acpi.h
index 7628efbe6c12..d10399b9f998 100644
--- a/arch/arm64/include/asm/acpi.h
+++ b/arch/arm64/include/asm/acpi.h
@@ -41,6 +41,9 @@
 	(!(entry) || (entry)->header.length < ACPI_MADT_GICC_MIN_LENGTH || \
 	(unsigned long)(entry) + (entry)->header.length > (end))
 
+#define ACPI_MADT_GICC_SPE  (ACPI_OFFSET(struct acpi_madt_generic_interrupt, \
+	spe_interrupt) + sizeof(u16))
+
 /* Basic configuration for ACPI */
 #ifdef	CONFIG_ACPI
 pgprot_t __acpi_get_mem_attribute(phys_addr_t addr);
diff --git a/drivers/perf/arm_pmu_acpi.c b/drivers/perf/arm_pmu_acpi.c
index 0f197516d708..864d7ebe45e9 100644
--- a/drivers/perf/arm_pmu_acpi.c
+++ b/drivers/perf/arm_pmu_acpi.c
@@ -74,6 +74,76 @@ static void arm_pmu_acpi_unregister_irq(int cpu)
 	acpi_unregister_gsi(gsi);
 }
 
+#if IS_ENABLED(CONFIG_ARM_SPE_PMU)
+static struct resource spe_resources[] = {
+	{
+		/* irq */
+		.flags          = IORESOURCE_IRQ,
+	}
+};
+
+static struct platform_device spe_dev = {
+	.name = ARMV8_SPE_PDEV_NAME,
+	.id = -1,
+	.resource = spe_resources,
+	.num_resources = ARRAY_SIZE(spe_resources)
+};
+
+/*
+ * For lack of a better place, hook the normal PMU MADT walk
+ * and create a SPE device if we detect a recent MADT with
+ * a homogeneous PPI mapping.
+ */
+static void arm_spe_acpi_register_device(void)
+{
+	int cpu, hetid, irq, ret;
+	bool first = true;
+	u16 gsi = 0;
+
+	/*
+	 * Sanity check all the GICC tables for the same interrupt number.
+	 * For now, we only support homogeneous ACPI/SPE machines.
+	 */
+	for_each_possible_cpu(cpu) {
+		struct acpi_madt_generic_interrupt *gicc;
+
+		gicc = acpi_cpu_get_madt_gicc(cpu);
+		if (gicc->header.length < ACPI_MADT_GICC_SPE)
+			return;
+
+		if (first) {
+			gsi = gicc->spe_interrupt;
+			if (!gsi)
+				return;
+			hetid = find_acpi_cpu_topology_hetero_id(cpu);
+			first = false;
+		} else if ((gsi != gicc->spe_interrupt) ||
+			   (hetid != find_acpi_cpu_topology_hetero_id(cpu))) {
+			pr_warn("ACPI: SPE must be homogeneous\n");
+			return;
+		}
+	}
+
+	irq = acpi_register_gsi(NULL, gsi, ACPI_LEVEL_SENSITIVE,
+				ACPI_ACTIVE_HIGH);
+	if (irq < 0) {
+		pr_warn("ACPI: SPE Unable to register interrupt: %d\n", gsi);
+		return;
+	}
+
+	spe_resources[0].start = irq;
+	ret = platform_device_register(&spe_dev);
+	if (ret < 0) {
+		pr_warn("ACPI: SPE: Unable to register device\n");
+		acpi_unregister_gsi(gsi);
+	}
+}
+#else
+static inline void arm_spe_acpi_register_device(void)
+{
+}
+#endif /* CONFIG_ARM_SPE_PMU */
+
 static int arm_pmu_acpi_parse_irqs(void)
 {
 	int irq, cpu, irq_cpu, err;
@@ -279,6 +349,8 @@ static int arm_pmu_acpi_init(void)
 	if (acpi_disabled)
 		return 0;
 
+	arm_spe_acpi_register_device();
+
 	ret = arm_pmu_acpi_parse_irqs();
 	if (ret)
 		return ret;
diff --git a/include/linux/perf/arm_pmu.h b/include/linux/perf/arm_pmu.h
index 4641e850b204..784bc58f165a 100644
--- a/include/linux/perf/arm_pmu.h
+++ b/include/linux/perf/arm_pmu.h
@@ -175,4 +175,6 @@ void armpmu_free_irq(int irq, int cpu);
 
 #endif /* CONFIG_ARM_PMU */
 
+#define ARMV8_SPE_PDEV_NAME "arm,spe-v1"
+
 #endif /* __ARM_PMU_H__ */
-- 
cgit v1.2.3


From 5a136b4ae327e7f6be9c984a010df8d7ea5a4f83 Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg@mellanox.com>
Date: Fri, 7 Jun 2019 12:10:33 -0300
Subject: mm/hmm: Fix error flows in hmm_invalidate_range_start

If the trylock on the hmm->mirrors_sem fails the function will return
without decrementing the notifiers that were previously incremented. Since
the caller will not call invalidate_range_end() on EAGAIN this will result
in notifiers becoming permanently incremented and deadlock.

If the sync_cpu_device_pagetables() required blocking the function will
not return EAGAIN even though the device continues to touch the
pages. This is a violation of the mmu notifier contract.

Switch, and rename, the ranges_lock to a spin lock so we can reliably
obtain it without blocking during error unwind.

The error unwind is necessary since the notifiers count must be held
incremented across the call to sync_cpu_device_pagetables() as we cannot
allow the range to become marked valid by a parallel
invalidate_start/end() pair while doing sync_cpu_device_pagetables().

Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
Reviewed-by: Ralph Campbell <rcampbell@nvidia.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Tested-by: Philip Yang <Philip.Yang@amd.com>
---
 include/linux/hmm.h |  2 +-
 mm/hmm.c            | 69 +++++++++++++++++++++++++++++++----------------------
 2 files changed, 41 insertions(+), 30 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/hmm.h b/include/linux/hmm.h
index bf013e965257..0fa8ea34ccef 100644
--- a/include/linux/hmm.h
+++ b/include/linux/hmm.h
@@ -86,7 +86,7 @@
 struct hmm {
 	struct mm_struct	*mm;
 	struct kref		kref;
-	struct mutex		lock;
+	spinlock_t		ranges_lock;
 	struct list_head	ranges;
 	struct list_head	mirrors;
 	struct mmu_notifier	mmu_notifier;
diff --git a/mm/hmm.c b/mm/hmm.c
index b224ea635a77..de35289df20d 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -64,7 +64,7 @@ static struct hmm *hmm_get_or_create(struct mm_struct *mm)
 	init_rwsem(&hmm->mirrors_sem);
 	hmm->mmu_notifier.ops = NULL;
 	INIT_LIST_HEAD(&hmm->ranges);
-	mutex_init(&hmm->lock);
+	spin_lock_init(&hmm->ranges_lock);
 	kref_init(&hmm->kref);
 	hmm->notifiers = 0;
 	hmm->mm = mm;
@@ -144,6 +144,25 @@ static void hmm_release(struct mmu_notifier *mn, struct mm_struct *mm)
 	hmm_put(hmm);
 }
 
+static void notifiers_decrement(struct hmm *hmm)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&hmm->ranges_lock, flags);
+	hmm->notifiers--;
+	if (!hmm->notifiers) {
+		struct hmm_range *range;
+
+		list_for_each_entry(range, &hmm->ranges, list) {
+			if (range->valid)
+				continue;
+			range->valid = true;
+		}
+		wake_up_all(&hmm->wq);
+	}
+	spin_unlock_irqrestore(&hmm->ranges_lock, flags);
+}
+
 static int hmm_invalidate_range_start(struct mmu_notifier *mn,
 			const struct mmu_notifier_range *nrange)
 {
@@ -151,6 +170,7 @@ static int hmm_invalidate_range_start(struct mmu_notifier *mn,
 	struct hmm_mirror *mirror;
 	struct hmm_update update;
 	struct hmm_range *range;
+	unsigned long flags;
 	int ret = 0;
 
 	if (!kref_get_unless_zero(&hmm->kref))
@@ -161,12 +181,7 @@ static int hmm_invalidate_range_start(struct mmu_notifier *mn,
 	update.event = HMM_UPDATE_INVALIDATE;
 	update.blockable = mmu_notifier_range_blockable(nrange);
 
-	if (mmu_notifier_range_blockable(nrange))
-		mutex_lock(&hmm->lock);
-	else if (!mutex_trylock(&hmm->lock)) {
-		ret = -EAGAIN;
-		goto out;
-	}
+	spin_lock_irqsave(&hmm->ranges_lock, flags);
 	hmm->notifiers++;
 	list_for_each_entry(range, &hmm->ranges, list) {
 		if (update.end < range->start || update.start >= range->end)
@@ -174,7 +189,7 @@ static int hmm_invalidate_range_start(struct mmu_notifier *mn,
 
 		range->valid = false;
 	}
-	mutex_unlock(&hmm->lock);
+	spin_unlock_irqrestore(&hmm->ranges_lock, flags);
 
 	if (mmu_notifier_range_blockable(nrange))
 		down_read(&hmm->mirrors_sem);
@@ -182,16 +197,23 @@ static int hmm_invalidate_range_start(struct mmu_notifier *mn,
 		ret = -EAGAIN;
 		goto out;
 	}
+
 	list_for_each_entry(mirror, &hmm->mirrors, list) {
-		int ret;
+		int rc;
 
-		ret = mirror->ops->sync_cpu_device_pagetables(mirror, &update);
-		if (!update.blockable && ret == -EAGAIN)
+		rc = mirror->ops->sync_cpu_device_pagetables(mirror, &update);
+		if (rc) {
+			if (WARN_ON(update.blockable || rc != -EAGAIN))
+				continue;
+			ret = -EAGAIN;
 			break;
+		}
 	}
 	up_read(&hmm->mirrors_sem);
 
 out:
+	if (ret)
+		notifiers_decrement(hmm);
 	hmm_put(hmm);
 	return ret;
 }
@@ -204,20 +226,7 @@ static void hmm_invalidate_range_end(struct mmu_notifier *mn,
 	if (!kref_get_unless_zero(&hmm->kref))
 		return;
 
-	mutex_lock(&hmm->lock);
-	hmm->notifiers--;
-	if (!hmm->notifiers) {
-		struct hmm_range *range;
-
-		list_for_each_entry(range, &hmm->ranges, list) {
-			if (range->valid)
-				continue;
-			range->valid = true;
-		}
-		wake_up_all(&hmm->wq);
-	}
-	mutex_unlock(&hmm->lock);
-
+	notifiers_decrement(hmm);
 	hmm_put(hmm);
 }
 
@@ -868,6 +877,7 @@ int hmm_range_register(struct hmm_range *range,
 {
 	unsigned long mask = ((1UL << page_shift) - 1UL);
 	struct hmm *hmm = mirror->hmm;
+	unsigned long flags;
 
 	range->valid = false;
 	range->hmm = NULL;
@@ -886,7 +896,7 @@ int hmm_range_register(struct hmm_range *range,
 		return -EFAULT;
 
 	/* Initialize range to track CPU page table updates. */
-	mutex_lock(&hmm->lock);
+	spin_lock_irqsave(&hmm->ranges_lock, flags);
 
 	range->hmm = hmm;
 	kref_get(&hmm->kref);
@@ -898,7 +908,7 @@ int hmm_range_register(struct hmm_range *range,
 	 */
 	if (!hmm->notifiers)
 		range->valid = true;
-	mutex_unlock(&hmm->lock);
+	spin_unlock_irqrestore(&hmm->ranges_lock, flags);
 
 	return 0;
 }
@@ -914,10 +924,11 @@ EXPORT_SYMBOL(hmm_range_register);
 void hmm_range_unregister(struct hmm_range *range)
 {
 	struct hmm *hmm = range->hmm;
+	unsigned long flags;
 
-	mutex_lock(&hmm->lock);
+	spin_lock_irqsave(&hmm->ranges_lock, flags);
 	list_del_init(&range->list);
-	mutex_unlock(&hmm->lock);
+	spin_unlock_irqrestore(&hmm->ranges_lock, flags);
 
 	/* Drop reference taken by hmm_range_register() */
 	mmput(hmm->mm);
-- 
cgit v1.2.3


From 4844ef80305d0180051d0787cd91c63573255dc2 Mon Sep 17 00:00:00 2001
From: Vignesh Raghavendra <vigneshr@ti.com>
Date: Tue, 25 Jun 2019 13:27:42 +0530
Subject: mtd: cfi_cmdset_0002: Add support for polling status register

HyperFlash devices are compliant with CFI AMD/Fujitsu Extended Command
Set (0x0002) for flash operations, therefore
drivers/mtd/chips/cfi_cmdset_0002.c can be used as is. But these devices
do not support DQ polling method of determining chip ready/good status.
These flashes provide Status Register whose bits can be polled to know
status of flash operation.

Cypress HyperFlash datasheet here[1], talks about CFI Amd/Fujitsu
Extended Query version 1.5. Bit 0 of "Software Features supported" field
of CFI Primary Vendor-Specific Extended Query table indicates
presence/absence of status register and Bit 1 indicates whether or not
DQ polling is supported. Using these bits, it's possible to determine
whether flash supports DQ polling or needs to use the Status Register.

Add support for polling Status Register to know device ready/status of
erase/write operations when DQ polling is not supported.
Print error messages on erase/program failure by looking at related
Status Register bits.

[1] https://www.cypress.com/file/213346/download

Signed-off-by: Vignesh Raghavendra <vigneshr@ti.com>
Reviewed-by: Tokunori Ikegami <ikegami.t@gmail.com>
Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
---
 drivers/mtd/chips/cfi_cmdset_0002.c | 130 +++++++++++++++++++++++++++++++-----
 include/linux/mtd/cfi.h             |   7 ++
 2 files changed, 120 insertions(+), 17 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mtd/chips/cfi_cmdset_0002.c b/drivers/mtd/chips/cfi_cmdset_0002.c
index a1a7d334aa82..f4da7bd552e9 100644
--- a/drivers/mtd/chips/cfi_cmdset_0002.c
+++ b/drivers/mtd/chips/cfi_cmdset_0002.c
@@ -49,6 +49,16 @@
 #define SST49LF008A		0x005a
 #define AT49BV6416		0x00d6
 
+/*
+ * Status Register bit description. Used by flash devices that don't
+ * support DQ polling (e.g. HyperFlash)
+ */
+#define CFI_SR_DRB		BIT(7)
+#define CFI_SR_ESB		BIT(5)
+#define CFI_SR_PSB		BIT(4)
+#define CFI_SR_WBASB		BIT(3)
+#define CFI_SR_SLSB		BIT(1)
+
 static int cfi_amdstd_read (struct mtd_info *, loff_t, size_t, size_t *, u_char *);
 static int cfi_amdstd_write_words(struct mtd_info *, loff_t, size_t, size_t *, const u_char *);
 static int cfi_amdstd_write_buffers(struct mtd_info *, loff_t, size_t, size_t *, const u_char *);
@@ -97,6 +107,50 @@ static struct mtd_chip_driver cfi_amdstd_chipdrv = {
 	.module		= THIS_MODULE
 };
 
+/*
+ * Use status register to poll for Erase/write completion when DQ is not
+ * supported. This is indicated by Bit[1:0] of SoftwareFeatures field in
+ * CFI Primary Vendor-Specific Extended Query table 1.5
+ */
+static int cfi_use_status_reg(struct cfi_private *cfi)
+{
+	struct cfi_pri_amdstd *extp = cfi->cmdset_priv;
+	u8 poll_mask = CFI_POLL_STATUS_REG | CFI_POLL_DQ;
+
+	return extp->MinorVersion >= '5' &&
+		(extp->SoftwareFeatures & poll_mask) == CFI_POLL_STATUS_REG;
+}
+
+static void cfi_check_err_status(struct map_info *map, struct flchip *chip,
+				 unsigned long adr)
+{
+	struct cfi_private *cfi = map->fldrv_priv;
+	map_word status;
+
+	if (!cfi_use_status_reg(cfi))
+		return;
+
+	cfi_send_gen_cmd(0x70, cfi->addr_unlock1, chip->start, map, cfi,
+			 cfi->device_type, NULL);
+	status = map_read(map, adr);
+
+	if (map_word_bitsset(map, status, CMD(0x3a))) {
+		unsigned long chipstatus = MERGESTATUS(status);
+
+		if (chipstatus & CFI_SR_ESB)
+			pr_err("%s erase operation failed, status %lx\n",
+			       map->name, chipstatus);
+		if (chipstatus & CFI_SR_PSB)
+			pr_err("%s program operation failed, status %lx\n",
+			       map->name, chipstatus);
+		if (chipstatus & CFI_SR_WBASB)
+			pr_err("%s buffer program command aborted, status %lx\n",
+			       map->name, chipstatus);
+		if (chipstatus & CFI_SR_SLSB)
+			pr_err("%s sector write protected, status %lx\n",
+			       map->name, chipstatus);
+	}
+}
 
 /* #define DEBUG_CFI_FEATURES */
 
@@ -742,10 +796,25 @@ static struct mtd_info *cfi_amdstd_setup(struct mtd_info *mtd)
  * correctly and is therefore not done	(particularly with interleaved chips
  * as each chip must be checked independently of the others).
  */
-static int __xipram chip_ready(struct map_info *map, unsigned long addr)
+static int __xipram chip_ready(struct map_info *map, struct flchip *chip,
+			       unsigned long addr)
 {
+	struct cfi_private *cfi = map->fldrv_priv;
 	map_word d, t;
 
+	if (cfi_use_status_reg(cfi)) {
+		map_word ready = CMD(CFI_SR_DRB);
+		/*
+		 * For chips that support status register, check device
+		 * ready bit
+		 */
+		cfi_send_gen_cmd(0x70, cfi->addr_unlock1, chip->start, map, cfi,
+				 cfi->device_type, NULL);
+		d = map_read(map, addr);
+
+		return map_word_andequal(map, d, ready, ready);
+	}
+
 	d = map_read(map, addr);
 	t = map_read(map, addr);
 
@@ -767,10 +836,30 @@ static int __xipram chip_ready(struct map_info *map, unsigned long addr)
  * as each chip must be checked independently of the others).
  *
  */
-static int __xipram chip_good(struct map_info *map, unsigned long addr, map_word expected)
+static int __xipram chip_good(struct map_info *map, struct flchip *chip,
+			      unsigned long addr, map_word expected)
 {
+	struct cfi_private *cfi = map->fldrv_priv;
 	map_word oldd, curd;
 
+	if (cfi_use_status_reg(cfi)) {
+		map_word ready = CMD(CFI_SR_DRB);
+		map_word err = CMD(CFI_SR_PSB | CFI_SR_ESB);
+		/*
+		 * For chips that support status register, check device
+		 * ready bit and Erase/Program status bit to know if
+		 * operation succeeded.
+		 */
+		cfi_send_gen_cmd(0x70, cfi->addr_unlock1, chip->start, map, cfi,
+				 cfi->device_type, NULL);
+		curd = map_read(map, addr);
+
+		if (map_word_andequal(map, curd, ready, ready))
+			return !map_word_bitsset(map, curd, err);
+
+		return 0;
+	}
+
 	oldd = map_read(map, addr);
 	curd = map_read(map, addr);
 
@@ -792,7 +881,7 @@ static int get_chip(struct map_info *map, struct flchip *chip, unsigned long adr
 
 	case FL_STATUS:
 		for (;;) {
-			if (chip_ready(map, adr))
+			if (chip_ready(map, chip, adr))
 				break;
 
 			if (time_after(jiffies, timeo)) {
@@ -830,7 +919,7 @@ static int get_chip(struct map_info *map, struct flchip *chip, unsigned long adr
 		chip->state = FL_ERASE_SUSPENDING;
 		chip->erase_suspended = 1;
 		for (;;) {
-			if (chip_ready(map, adr))
+			if (chip_ready(map, chip, adr))
 				break;
 
 			if (time_after(jiffies, timeo)) {
@@ -1362,7 +1451,7 @@ static int do_otp_lock(struct map_info *map, struct flchip *chip, loff_t adr,
 	/* wait for chip to become ready */
 	timeo = jiffies + msecs_to_jiffies(2);
 	for (;;) {
-		if (chip_ready(map, adr))
+		if (chip_ready(map, chip, adr))
 			break;
 
 		if (time_after(jiffies, timeo)) {
@@ -1628,22 +1717,24 @@ static int __xipram do_write_oneword(struct map_info *map, struct flchip *chip,
 			continue;
 		}
 
-		if (time_after(jiffies, timeo) && !chip_ready(map, adr)){
+		if (time_after(jiffies, timeo) &&
+		    !chip_ready(map, chip, adr)) {
 			xip_enable(map, chip, adr);
 			printk(KERN_WARNING "MTD %s(): software timeout\n", __func__);
 			xip_disable(map, chip, adr);
 			break;
 		}
 
-		if (chip_ready(map, adr))
+		if (chip_ready(map, chip, adr))
 			break;
 
 		/* Latency issues. Drop the lock, wait a while and retry */
 		UDELAY(map, chip, adr, 1);
 	}
 	/* Did we succeed? */
-	if (!chip_good(map, adr, datum)) {
+	if (!chip_good(map, chip, adr, datum)) {
 		/* reset on all failures. */
+		cfi_check_err_status(map, chip, adr);
 		map_write(map, CMD(0xF0), chip->start);
 		/* FIXME - should have reset delay before continuing */
 
@@ -1881,10 +1972,11 @@ static int __xipram do_write_buffer(struct map_info *map, struct flchip *chip,
 		 * We check "time_after" and "!chip_good" before checking "chip_good" to avoid
 		 * the failure due to scheduling.
 		 */
-		if (time_after(jiffies, timeo) && !chip_good(map, adr, datum))
+		if (time_after(jiffies, timeo) &&
+		    !chip_good(map, chip, adr, datum))
 			break;
 
-		if (chip_good(map, adr, datum)) {
+		if (chip_good(map, chip, adr, datum)) {
 			xip_enable(map, chip, adr);
 			goto op_done;
 		}
@@ -1901,6 +1993,7 @@ static int __xipram do_write_buffer(struct map_info *map, struct flchip *chip,
 	 * See e.g.
 	 * http://www.spansion.com/Support/Application%20Notes/MirrorBit_Write_Buffer_Prog_Page_Buffer_Read_AN.pdf
 	 */
+	cfi_check_err_status(map, chip, adr);
 	cfi_send_gen_cmd(0xAA, cfi->addr_unlock1, chip->start, map, cfi,
 			 cfi->device_type, NULL);
 	cfi_send_gen_cmd(0x55, cfi->addr_unlock2, chip->start, map, cfi,
@@ -2018,7 +2111,7 @@ static int cfi_amdstd_panic_wait(struct map_info *map, struct flchip *chip,
 	 * If the driver thinks the chip is idle, and no toggle bits
 	 * are changing, then the chip is actually idle for sure.
 	 */
-	if (chip->state == FL_READY && chip_ready(map, adr))
+	if (chip->state == FL_READY && chip_ready(map, chip, adr))
 		return 0;
 
 	/*
@@ -2035,7 +2128,7 @@ static int cfi_amdstd_panic_wait(struct map_info *map, struct flchip *chip,
 
 		/* wait for the chip to become ready */
 		for (i = 0; i < jiffies_to_usecs(timeo); i++) {
-			if (chip_ready(map, adr))
+			if (chip_ready(map, chip, adr))
 				return 0;
 
 			udelay(1);
@@ -2099,14 +2192,15 @@ retry:
 	map_write(map, datum, adr);
 
 	for (i = 0; i < jiffies_to_usecs(uWriteTimeout); i++) {
-		if (chip_ready(map, adr))
+		if (chip_ready(map, chip, adr))
 			break;
 
 		udelay(1);
 	}
 
-	if (!chip_good(map, adr, datum)) {
+	if (!chip_good(map, chip, adr, datum)) {
 		/* reset on all failures. */
+		cfi_check_err_status(map, chip, adr);
 		map_write(map, CMD(0xF0), chip->start);
 		/* FIXME - should have reset delay before continuing */
 
@@ -2300,7 +2394,7 @@ static int __xipram do_erase_chip(struct map_info *map, struct flchip *chip)
 			chip->erase_suspended = 0;
 		}
 
-		if (chip_good(map, adr, map_word_ff(map)))
+		if (chip_good(map, chip, adr, map_word_ff(map)))
 			break;
 
 		if (time_after(jiffies, timeo)) {
@@ -2316,6 +2410,7 @@ static int __xipram do_erase_chip(struct map_info *map, struct flchip *chip)
 	/* Did we succeed? */
 	if (ret) {
 		/* reset on all failures. */
+		cfi_check_err_status(map, chip, adr);
 		map_write(map, CMD(0xF0), chip->start);
 		/* FIXME - should have reset delay before continuing */
 
@@ -2396,7 +2491,7 @@ static int __xipram do_erase_oneblock(struct map_info *map, struct flchip *chip,
 			chip->erase_suspended = 0;
 		}
 
-		if (chip_good(map, adr, map_word_ff(map)))
+		if (chip_good(map, chip, adr, map_word_ff(map)))
 			break;
 
 		if (time_after(jiffies, timeo)) {
@@ -2412,6 +2507,7 @@ static int __xipram do_erase_oneblock(struct map_info *map, struct flchip *chip,
 	/* Did we succeed? */
 	if (ret) {
 		/* reset on all failures. */
+		cfi_check_err_status(map, chip, adr);
 		map_write(map, CMD(0xF0), chip->start);
 		/* FIXME - should have reset delay before continuing */
 
@@ -2587,7 +2683,7 @@ static int __maybe_unused do_ppb_xxlock(struct map_info *map,
 	 */
 	timeo = jiffies + msecs_to_jiffies(2000);	/* 2s max (un)locking */
 	for (;;) {
-		if (chip_ready(map, adr))
+		if (chip_ready(map, chip, adr))
 			break;
 
 		if (time_after(jiffies, timeo)) {
diff --git a/include/linux/mtd/cfi.h b/include/linux/mtd/cfi.h
index cbf77168658c..7fdbc1ff6527 100644
--- a/include/linux/mtd/cfi.h
+++ b/include/linux/mtd/cfi.h
@@ -233,6 +233,13 @@ struct cfi_pri_amdstd {
 	uint8_t  VppMin;
 	uint8_t  VppMax;
 	uint8_t  TopBottom;
+	/* Below field are added from version 1.5 */
+	uint8_t  ProgramSuspend;
+	uint8_t  UnlockBypass;
+	uint8_t  SecureSiliconSector;
+	uint8_t  SoftwareFeatures;
+#define CFI_POLL_STATUS_REG	BIT(0)
+#define CFI_POLL_DQ		BIT(1)
 } __packed;
 
 /* Vendor-Specific PRI for Atmel chips (command set 0x0002) */
-- 
cgit v1.2.3


From dcc7d3446a0fa19bd7e8074920b8f9ef3b7ec00c Mon Sep 17 00:00:00 2001
From: Vignesh Raghavendra <vigneshr@ti.com>
Date: Tue, 25 Jun 2019 13:27:44 +0530
Subject: mtd: Add support for HyperBus memory devices

Cypress' HyperBus is Low Signal Count, High Performance Double Data Rate
Bus interface between a host system master and one or more slave
interfaces. HyperBus is used to connect microprocessor, microcontroller,
or ASIC devices with random access NOR flash memory (called HyperFlash)
or self refresh DRAM (called HyperRAM).

It's an 8-bit data bus (DQ[7:0]) with a Read-Write Data Strobe (RWDS)
signal and either Single-ended clock(3.0V parts) or Differential clock
(1.8V parts). It uses ChipSelect lines to select b/w multiple slaves.
At bus level, it follows a separate protocol described in HyperBus
specification[1].

HyperFlash follows CFI AMD/Fujitsu Extended Command Set (0x0002) similar
to that of existing parallel NORs. Since HyperBus is x8 DDR bus,
it's equivalent to x16 parallel NOR flash with respect to bits per clock
cycle. But HyperBus operates at >166MHz frequencies.
HyperRAM provides direct random read/write access to flash memory
array.

But, HyperBus memory controllers seem to abstract implementation details
and expose a simple MMIO interface to access connected flash.

Add support for registering HyperFlash devices with MTD framework. MTD
maps framework along with CFI chip support framework are used to support
communicating with flash.

Framework is modelled along the lines of spi-nor framework. HyperBus
memory controller (HBMC) drivers call hyperbus_register_device() to
register a single HyperFlash device. HyperFlash core parses MMIO access
information from DT, sets up the map_info struct, probes CFI flash and
registers it with MTD framework.

Some HBMC masters need calibration/training sequence[3] to be carried
out, in order for DLL inside the controller to lock, by reading a known
string/pattern. This is done by repeatedly reading CFI Query
Identification String. Calibration needs to be done before trying to detect
flash as part of CFI flash probe.

HyperRAM is not supported at the moment.

HyperBus specification can be found at[1]
HyperFlash datasheet can be found at[2]

[1] https://www.cypress.com/file/213356/download
[2] https://www.cypress.com/file/213346/download
[3] http://www.ti.com/lit/ug/spruid7b/spruid7b.pdf
    Table 12-5741. HyperFlash Access Sequence

Signed-off-by: Vignesh Raghavendra <vigneshr@ti.com>
Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
---
 MAINTAINERS                          |   7 ++
 drivers/mtd/Kconfig                  |   2 +
 drivers/mtd/Makefile                 |   1 +
 drivers/mtd/hyperbus/Kconfig         |  11 +++
 drivers/mtd/hyperbus/Makefile        |   3 +
 drivers/mtd/hyperbus/hyperbus-core.c | 153 +++++++++++++++++++++++++++++++++++
 include/linux/mtd/hyperbus.h         |  84 +++++++++++++++++++
 7 files changed, 261 insertions(+)
 create mode 100644 drivers/mtd/hyperbus/Kconfig
 create mode 100644 drivers/mtd/hyperbus/Makefile
 create mode 100644 drivers/mtd/hyperbus/hyperbus-core.c
 create mode 100644 include/linux/mtd/hyperbus.h

(limited to 'include/linux')

diff --git a/MAINTAINERS b/MAINTAINERS
index 5cfbea4ce575..f1253adb8cf6 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -7305,6 +7305,13 @@ F:	include/uapi/linux/hyperv.h
 F:	tools/hv/
 F:	Documentation/ABI/stable/sysfs-bus-vmbus
 
+HYPERBUS SUPPORT
+M:	Vignesh Raghavendra <vigneshr@ti.com>
+S:	Supported
+F:	drivers/mtd/hyperbus/
+F:	include/linux/mtd/hyperbus.h
+F:	Documentation/devicetree/bindings/mtd/cypress,hyperflash.txt
+
 HYPERVISOR VIRTUAL CONSOLE DRIVER
 L:	linuxppc-dev@lists.ozlabs.org
 S:	Odd Fixes
diff --git a/drivers/mtd/Kconfig b/drivers/mtd/Kconfig
index fb31a7f649a3..80a6e2dcd085 100644
--- a/drivers/mtd/Kconfig
+++ b/drivers/mtd/Kconfig
@@ -274,4 +274,6 @@ source "drivers/mtd/spi-nor/Kconfig"
 
 source "drivers/mtd/ubi/Kconfig"
 
+source "drivers/mtd/hyperbus/Kconfig"
+
 endif # MTD
diff --git a/drivers/mtd/Makefile b/drivers/mtd/Makefile
index 806287e80e84..62d649a959e2 100644
--- a/drivers/mtd/Makefile
+++ b/drivers/mtd/Makefile
@@ -34,3 +34,4 @@ obj-y		+= chips/ lpddr/ maps/ devices/ nand/ tests/
 
 obj-$(CONFIG_MTD_SPI_NOR)	+= spi-nor/
 obj-$(CONFIG_MTD_UBI)		+= ubi/
+obj-$(CONFIG_MTD_HYPERBUS)	+= hyperbus/
diff --git a/drivers/mtd/hyperbus/Kconfig b/drivers/mtd/hyperbus/Kconfig
new file mode 100644
index 000000000000..98147e28caa0
--- /dev/null
+++ b/drivers/mtd/hyperbus/Kconfig
@@ -0,0 +1,11 @@
+menuconfig MTD_HYPERBUS
+	tristate "HyperBus support"
+	select MTD_CFI
+	select MTD_MAP_BANK_WIDTH_2
+	select MTD_CFI_AMDSTD
+	select MTD_COMPLEX_MAPPINGS
+	help
+	  This is the framework for the HyperBus which can be used by
+	  the HyperBus Controller driver to communicate with
+	  HyperFlash. See Cypress HyperBus specification for more
+	  details
diff --git a/drivers/mtd/hyperbus/Makefile b/drivers/mtd/hyperbus/Makefile
new file mode 100644
index 000000000000..ca61dedd730d
--- /dev/null
+++ b/drivers/mtd/hyperbus/Makefile
@@ -0,0 +1,3 @@
+# SPDX-License-Identifier: GPL-2.0
+
+obj-$(CONFIG_MTD_HYPERBUS)	+= hyperbus-core.o
diff --git a/drivers/mtd/hyperbus/hyperbus-core.c b/drivers/mtd/hyperbus/hyperbus-core.c
new file mode 100644
index 000000000000..6af9ea34117d
--- /dev/null
+++ b/drivers/mtd/hyperbus/hyperbus-core.c
@@ -0,0 +1,153 @@
+// SPDX-License-Identifier: GPL-2.0
+//
+// Copyright (C) 2019 Texas Instruments Incorporated - http://www.ti.com/
+// Author: Vignesh Raghavendra <vigneshr@ti.com>
+
+#include <linux/err.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/mtd/hyperbus.h>
+#include <linux/mtd/map.h>
+#include <linux/mtd/mtd.h>
+#include <linux/of.h>
+#include <linux/of_address.h>
+#include <linux/types.h>
+
+static struct hyperbus_device *map_to_hbdev(struct map_info *map)
+{
+	return container_of(map, struct hyperbus_device, map);
+}
+
+static map_word hyperbus_read16(struct map_info *map, unsigned long addr)
+{
+	struct hyperbus_device *hbdev = map_to_hbdev(map);
+	struct hyperbus_ctlr *ctlr = hbdev->ctlr;
+	map_word read_data;
+
+	read_data.x[0] = ctlr->ops->read16(hbdev, addr);
+
+	return read_data;
+}
+
+static void hyperbus_write16(struct map_info *map, map_word d,
+			     unsigned long addr)
+{
+	struct hyperbus_device *hbdev = map_to_hbdev(map);
+	struct hyperbus_ctlr *ctlr = hbdev->ctlr;
+
+	ctlr->ops->write16(hbdev, addr, d.x[0]);
+}
+
+static void hyperbus_copy_from(struct map_info *map, void *to,
+			       unsigned long from, ssize_t len)
+{
+	struct hyperbus_device *hbdev = map_to_hbdev(map);
+	struct hyperbus_ctlr *ctlr = hbdev->ctlr;
+
+	ctlr->ops->copy_from(hbdev, to, from, len);
+}
+
+static void hyperbus_copy_to(struct map_info *map, unsigned long to,
+			     const void *from, ssize_t len)
+{
+	struct hyperbus_device *hbdev = map_to_hbdev(map);
+	struct hyperbus_ctlr *ctlr = hbdev->ctlr;
+
+	ctlr->ops->copy_to(hbdev, to, from, len);
+}
+
+int hyperbus_register_device(struct hyperbus_device *hbdev)
+{
+	const struct hyperbus_ops *ops;
+	struct hyperbus_ctlr *ctlr;
+	struct device_node *np;
+	struct map_info *map;
+	struct resource res;
+	struct device *dev;
+	int ret;
+
+	if (!hbdev || !hbdev->np || !hbdev->ctlr || !hbdev->ctlr->dev) {
+		pr_err("hyperbus: please fill all the necessary fields!\n");
+		return -EINVAL;
+	}
+
+	np = hbdev->np;
+	ctlr = hbdev->ctlr;
+	if (!of_device_is_compatible(np, "cypress,hyperflash"))
+		return -ENODEV;
+
+	hbdev->memtype = HYPERFLASH;
+
+	ret = of_address_to_resource(np, 0, &res);
+	if (ret)
+		return ret;
+
+	dev = ctlr->dev;
+	map = &hbdev->map;
+	map->size = resource_size(&res);
+	map->virt = devm_ioremap_resource(dev, &res);
+	if (IS_ERR(map->virt))
+		return PTR_ERR(map->virt);
+
+	map->name = dev_name(dev);
+	map->bankwidth = 2;
+	map->device_node = np;
+
+	simple_map_init(map);
+	ops = ctlr->ops;
+	if (ops) {
+		if (ops->read16)
+			map->read = hyperbus_read16;
+		if (ops->write16)
+			map->write = hyperbus_write16;
+		if (ops->copy_to)
+			map->copy_to = hyperbus_copy_to;
+		if (ops->copy_from)
+			map->copy_from = hyperbus_copy_from;
+
+		if (ops->calibrate && !ctlr->calibrated) {
+			ret = ops->calibrate(hbdev);
+			if (!ret) {
+				dev_err(dev, "Calibration failed\n");
+				return -ENODEV;
+			}
+			ctlr->calibrated = true;
+		}
+	}
+
+	hbdev->mtd = do_map_probe("cfi_probe", map);
+	if (!hbdev->mtd) {
+		dev_err(dev, "probing of hyperbus device failed\n");
+		return -ENODEV;
+	}
+
+	hbdev->mtd->dev.parent = dev;
+	mtd_set_of_node(hbdev->mtd, np);
+
+	ret = mtd_device_register(hbdev->mtd, NULL, 0);
+	if (ret) {
+		dev_err(dev, "failed to register mtd device\n");
+		map_destroy(hbdev->mtd);
+		return ret;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(hyperbus_register_device);
+
+int hyperbus_unregister_device(struct hyperbus_device *hbdev)
+{
+	int ret = 0;
+
+	if (hbdev && hbdev->mtd) {
+		ret = mtd_device_unregister(hbdev->mtd);
+		map_destroy(hbdev->mtd);
+	}
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(hyperbus_unregister_device);
+
+MODULE_DESCRIPTION("HyperBus Framework");
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Vignesh Raghavendra <vigneshr@ti.com>");
diff --git a/include/linux/mtd/hyperbus.h b/include/linux/mtd/hyperbus.h
new file mode 100644
index 000000000000..2dfe65964f6e
--- /dev/null
+++ b/include/linux/mtd/hyperbus.h
@@ -0,0 +1,84 @@
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * Copyright (C) 2019 Texas Instruments Incorporated - http://www.ti.com/
+ */
+
+#ifndef __LINUX_MTD_HYPERBUS_H__
+#define __LINUX_MTD_HYPERBUS_H__
+
+#include <linux/mtd/map.h>
+
+enum hyperbus_memtype {
+	HYPERFLASH,
+	HYPERRAM,
+};
+
+/**
+ * struct hyperbus_device - struct representing HyperBus slave device
+ * @map: map_info struct for accessing MMIO HyperBus flash memory
+ * @np: pointer to HyperBus slave device node
+ * @mtd: pointer to MTD struct
+ * @ctlr: pointer to HyperBus controller struct
+ * @memtype: type of memory device: HyperFlash or HyperRAM
+ */
+
+struct hyperbus_device {
+	struct map_info map;
+	struct device_node *np;
+	struct mtd_info *mtd;
+	struct hyperbus_ctlr *ctlr;
+	enum hyperbus_memtype memtype;
+};
+
+/**
+ * struct hyperbus_ops - struct representing custom HyperBus operations
+ * @read16: read 16 bit of data from flash in a single burst. Used to read
+ *          from non default address space, such as ID/CFI space
+ * @write16: write 16 bit of data to flash in a single burst. Used to
+ *           send cmd to flash or write single 16 bit word at a time.
+ * @copy_from: copy data from flash memory
+ * @copy_to: copy data to flash memory
+ * @calibrate: calibrate HyperBus controller
+ */
+
+struct hyperbus_ops {
+	u16 (*read16)(struct hyperbus_device *hbdev, unsigned long addr);
+	void (*write16)(struct hyperbus_device *hbdev,
+			unsigned long addr, u16 val);
+	void (*copy_from)(struct hyperbus_device *hbdev, void *to,
+			  unsigned long from, ssize_t len);
+	void (*copy_to)(struct hyperbus_device *dev, unsigned long to,
+			const void *from, ssize_t len);
+	int (*calibrate)(struct hyperbus_device *dev);
+};
+
+/**
+ * struct hyperbus_ctlr - struct representing HyperBus controller
+ * @dev: pointer to HyperBus controller device
+ * @calibrated: flag to indicate ctlr calibration sequence is complete
+ * @ops: HyperBus controller ops
+ */
+struct hyperbus_ctlr {
+	struct device *dev;
+	bool calibrated;
+
+	const struct hyperbus_ops *ops;
+};
+
+/**
+ * hyperbus_register_device - probe and register a HyperBus slave memory device
+ * @hbdev: hyperbus_device struct with dev, np and ctlr field populated
+ *
+ * Return: 0 for success, others for failure.
+ */
+int hyperbus_register_device(struct hyperbus_device *hbdev);
+
+/**
+ * hyperbus_unregister_device - deregister HyperBus slave memory device
+ * @hbdev: hyperbus_device to be unregistered
+ *
+ * Return: 0 for success, others for failure.
+ */
+int hyperbus_unregister_device(struct hyperbus_device *hbdev);
+
+#endif /* __LINUX_MTD_HYPERBUS_H__ */
-- 
cgit v1.2.3


From 855eff216a97afa4a2233b792cb3c812b5ebd876 Mon Sep 17 00:00:00 2001
From: Jonathan Bakker <xc-racer2@live.ca>
Date: Fri, 26 Apr 2019 17:06:34 +0200
Subject: mtd: onenand: Add support for 8Gb datasize onenand
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Used in several S5PV210-based Galaxy S devices, among them SGH-T959V,
SGH-T959P, SGH-T839, and SPH-D700.

Signed-off-by: Jonathan Bakker <xc-racer2@live.ca>
Signed-off-by: Paweł Chmiel <pawel.mikolaj.chmiel@gmail.com>
Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
---
 drivers/mtd/nand/onenand/onenand_base.c | 2 ++
 include/linux/mtd/onenand_regs.h        | 1 +
 2 files changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/mtd/nand/onenand/onenand_base.c b/drivers/mtd/nand/onenand/onenand_base.c
index f41d76248550..492c0059673d 100644
--- a/drivers/mtd/nand/onenand/onenand_base.c
+++ b/drivers/mtd/nand/onenand/onenand_base.c
@@ -3260,6 +3260,8 @@ static void onenand_check_features(struct mtd_info *mtd)
 
 	/* Lock scheme */
 	switch (density) {
+	case ONENAND_DEVICE_DENSITY_8Gb:
+		this->options |= ONENAND_HAS_NOP_1;
 	case ONENAND_DEVICE_DENSITY_4Gb:
 		if (ONENAND_IS_DDP(this))
 			this->options |= ONENAND_HAS_2PLANE;
diff --git a/include/linux/mtd/onenand_regs.h b/include/linux/mtd/onenand_regs.h
index d60130f88eed..9640d707cbf8 100644
--- a/include/linux/mtd/onenand_regs.h
+++ b/include/linux/mtd/onenand_regs.h
@@ -80,6 +80,7 @@
 #define ONENAND_DEVICE_DENSITY_1Gb	(0x003)
 #define ONENAND_DEVICE_DENSITY_2Gb	(0x004)
 #define ONENAND_DEVICE_DENSITY_4Gb	(0x005)
+#define ONENAND_DEVICE_DENSITY_8Gb	(0x006)
 
 /*
  * Version ID Register F002h (R)
-- 
cgit v1.2.3


From 14a82ea7e1682645d942d9fb41fcb6126fd1645e Mon Sep 17 00:00:00 2001
From: Sascha Hauer <s.hauer@pengutronix.de>
Date: Tue, 21 May 2019 09:06:30 +0200
Subject: mtd: rawnand: export NAND operation tracer

The NAND core has a NAND operation tracing function, but it can only
be used by drivers using the generic operation parser from the NAND core.
Export the tracing function as a static inline function in rawnand.h
so that drivers implementing exec_op directly do not have to write their
own operation tracing.

Signed-off-by: Sascha Hauer <s.hauer@pengutronix.de>
Reviewed-by: Miquel Raynal <miquel.raynal@bootlin.com>
Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
---
 drivers/mtd/nand/raw/nand_base.c | 30 +-----------------------------
 include/linux/mtd/rawnand.h      | 36 ++++++++++++++++++++++++++++++++++++
 2 files changed, 37 insertions(+), 29 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mtd/nand/raw/nand_base.c b/drivers/mtd/nand/raw/nand_base.c
index d565b4a9dce1..6ecd1c496ce3 100644
--- a/drivers/mtd/nand/raw/nand_base.c
+++ b/drivers/mtd/nand/raw/nand_base.c
@@ -2115,35 +2115,7 @@ static void nand_op_parser_trace(const struct nand_op_parser_ctx *ctx)
 		if (instr == &ctx->subop.instrs[0])
 			prefix = "    ->";
 
-		switch (instr->type) {
-		case NAND_OP_CMD_INSTR:
-			pr_debug("%sCMD      [0x%02x]\n", prefix,
-				 instr->ctx.cmd.opcode);
-			break;
-		case NAND_OP_ADDR_INSTR:
-			pr_debug("%sADDR     [%d cyc: %*ph]\n", prefix,
-				 instr->ctx.addr.naddrs,
-				 instr->ctx.addr.naddrs < 64 ?
-				 instr->ctx.addr.naddrs : 64,
-				 instr->ctx.addr.addrs);
-			break;
-		case NAND_OP_DATA_IN_INSTR:
-			pr_debug("%sDATA_IN  [%d B%s]\n", prefix,
-				 instr->ctx.data.len,
-				 instr->ctx.data.force_8bit ?
-				 ", force 8-bit" : "");
-			break;
-		case NAND_OP_DATA_OUT_INSTR:
-			pr_debug("%sDATA_OUT [%d B%s]\n", prefix,
-				 instr->ctx.data.len,
-				 instr->ctx.data.force_8bit ?
-				 ", force 8-bit" : "");
-			break;
-		case NAND_OP_WAITRDY_INSTR:
-			pr_debug("%sWAITRDY  [max %d ms]\n", prefix,
-				 instr->ctx.waitrdy.timeout_ms);
-			break;
-		}
+		nand_op_trace(prefix, instr);
 
 		if (instr == &ctx->subop.instrs[ctx->subop.ninstrs - 1])
 			prefix = "      ";
diff --git a/include/linux/mtd/rawnand.h b/include/linux/mtd/rawnand.h
index dbfffa5bec7b..f5bb6f11c36b 100644
--- a/include/linux/mtd/rawnand.h
+++ b/include/linux/mtd/rawnand.h
@@ -877,6 +877,42 @@ int nand_op_parser_exec_op(struct nand_chip *chip,
 			   const struct nand_op_parser *parser,
 			   const struct nand_operation *op, bool check_only);
 
+static inline void nand_op_trace(const char *prefix,
+				 const struct nand_op_instr *instr)
+{
+#if IS_ENABLED(CONFIG_DYNAMIC_DEBUG) || defined(DEBUG)
+	switch (instr->type) {
+	case NAND_OP_CMD_INSTR:
+		pr_debug("%sCMD      [0x%02x]\n", prefix,
+			 instr->ctx.cmd.opcode);
+		break;
+	case NAND_OP_ADDR_INSTR:
+		pr_debug("%sADDR     [%d cyc: %*ph]\n", prefix,
+			 instr->ctx.addr.naddrs,
+			 instr->ctx.addr.naddrs < 64 ?
+			 instr->ctx.addr.naddrs : 64,
+			 instr->ctx.addr.addrs);
+		break;
+	case NAND_OP_DATA_IN_INSTR:
+		pr_debug("%sDATA_IN  [%d B%s]\n", prefix,
+			 instr->ctx.data.len,
+			 instr->ctx.data.force_8bit ?
+			 ", force 8-bit" : "");
+		break;
+	case NAND_OP_DATA_OUT_INSTR:
+		pr_debug("%sDATA_OUT [%d B%s]\n", prefix,
+			 instr->ctx.data.len,
+			 instr->ctx.data.force_8bit ?
+			 ", force 8-bit" : "");
+		break;
+	case NAND_OP_WAITRDY_INSTR:
+		pr_debug("%sWAITRDY  [max %d ms]\n", prefix,
+			 instr->ctx.waitrdy.timeout_ms);
+		break;
+	}
+#endif
+}
+
 /**
  * struct nand_controller_ops - Controller operations
  *
-- 
cgit v1.2.3


From e0ddaab76802d3179013f4864535043e2aea6c69 Mon Sep 17 00:00:00 2001
From: Sascha Hauer <s.hauer@pengutronix.de>
Date: Tue, 21 May 2019 09:06:41 +0200
Subject: dmaengine: mxs: Add header file to be shared with gpmi nand driver

The mxs dma driver can do PIO transfers. A pointer to the PIO words
to transfer is passed in the struct scatterlist * argument of
dmaengine_prep_slave_sg(). It's quite ugly and non-obvious to cast
u32 * to struct scatterlist * each time when calling
dmaengine_prep_slave_sg(), so add a static inline wrapper function
to be called by the user along with a description of what is going on.

Signed-off-by: Sascha Hauer <s.hauer@pengutronix.de>
Acked-by: Vinod Koul <vkoul@kernel.org>
Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
---
 drivers/dma/mxs-dma.c                      |  1 +
 drivers/mtd/nand/raw/gpmi-nand/gpmi-nand.c | 36 ++++++++++++------------------
 include/linux/dma/mxs-dma.h                | 21 +++++++++++++++++
 3 files changed, 36 insertions(+), 22 deletions(-)
 create mode 100644 include/linux/dma/mxs-dma.h

(limited to 'include/linux')

diff --git a/drivers/dma/mxs-dma.c b/drivers/dma/mxs-dma.c
index ce92a3626ea4..62ee9328aea1 100644
--- a/drivers/dma/mxs-dma.c
+++ b/drivers/dma/mxs-dma.c
@@ -24,6 +24,7 @@
 #include <linux/of_device.h>
 #include <linux/of_dma.h>
 #include <linux/list.h>
+#include <linux/dma/mxs-dma.h>
 
 #include <asm/irq.h>
 
diff --git a/drivers/mtd/nand/raw/gpmi-nand/gpmi-nand.c b/drivers/mtd/nand/raw/gpmi-nand/gpmi-nand.c
index 74c4de0b1a3d..45c7b91aae23 100644
--- a/drivers/mtd/nand/raw/gpmi-nand/gpmi-nand.c
+++ b/drivers/mtd/nand/raw/gpmi-nand/gpmi-nand.c
@@ -15,6 +15,7 @@
 #include <linux/of.h>
 #include <linux/of_device.h>
 #include <linux/pm_runtime.h>
+#include <linux/dma/mxs-dma.h>
 #include "gpmi-nand.h"
 #include "gpmi-regs.h"
 #include "bch-regs.h"
@@ -914,9 +915,8 @@ static int gpmi_send_command(struct gpmi_nand_data *this)
 		| BM_GPMI_CTRL0_ADDRESS_INCREMENT
 		| BF_GPMI_CTRL0_XFER_COUNT(this->command_length);
 	pio[1] = pio[2] = 0;
-	desc = dmaengine_prep_slave_sg(channel,
-					(struct scatterlist *)pio,
-					ARRAY_SIZE(pio), DMA_TRANS_NONE, 0);
+	desc = mxs_dmaengine_prep_pio(channel, pio, ARRAY_SIZE(pio),
+				      DMA_TRANS_NONE, 0);
 	if (!desc)
 		return -EINVAL;
 
@@ -988,8 +988,8 @@ static int gpmi_send_data(struct gpmi_nand_data *this, const void *buf, int len)
 		| BF_GPMI_CTRL0_ADDRESS(address)
 		| BF_GPMI_CTRL0_XFER_COUNT(len);
 	pio[1] = 0;
-	desc = dmaengine_prep_slave_sg(channel, (struct scatterlist *)pio,
-					ARRAY_SIZE(pio), DMA_TRANS_NONE, 0);
+	desc = mxs_dmaengine_prep_pio(channel, pio, ARRAY_SIZE(pio),
+				      DMA_TRANS_NONE, 0);
 	if (!desc)
 		return -EINVAL;
 
@@ -1025,9 +1025,8 @@ static int gpmi_read_data(struct gpmi_nand_data *this, void *buf, int len)
 		| BF_GPMI_CTRL0_ADDRESS(BV_GPMI_CTRL0_ADDRESS__NAND_DATA)
 		| BF_GPMI_CTRL0_XFER_COUNT(len);
 	pio[1] = 0;
-	desc = dmaengine_prep_slave_sg(channel,
-					(struct scatterlist *)pio,
-					ARRAY_SIZE(pio), DMA_TRANS_NONE, 0);
+	desc = mxs_dmaengine_prep_pio(channel, pio, ARRAY_SIZE(pio),
+				      DMA_TRANS_NONE, 0);
 	if (!desc)
 		return -EINVAL;
 
@@ -1083,10 +1082,8 @@ static int gpmi_send_page(struct gpmi_nand_data *this, dma_addr_t payload,
 	pio[4] = payload;
 	pio[5] = auxiliary;
 
-	desc = dmaengine_prep_slave_sg(channel,
-					(struct scatterlist *)pio,
-					ARRAY_SIZE(pio), DMA_TRANS_NONE,
-					DMA_CTRL_ACK);
+	desc = mxs_dmaengine_prep_pio(channel, pio, ARRAY_SIZE(pio),
+				      DMA_TRANS_NONE, DMA_CTRL_ACK);
 	if (!desc)
 		return -EINVAL;
 
@@ -1117,9 +1114,7 @@ static int gpmi_read_page(struct gpmi_nand_data *this, dma_addr_t payload,
 		| BF_GPMI_CTRL0_ADDRESS(address)
 		| BF_GPMI_CTRL0_XFER_COUNT(0);
 	pio[1] = 0;
-	desc = dmaengine_prep_slave_sg(channel,
-				(struct scatterlist *)pio, 2,
-				DMA_TRANS_NONE, 0);
+	desc = mxs_dmaengine_prep_pio(channel, pio, 2, DMA_TRANS_NONE, 0);
 	if (!desc)
 		return -EINVAL;
 
@@ -1144,10 +1139,8 @@ static int gpmi_read_page(struct gpmi_nand_data *this, dma_addr_t payload,
 	pio[3] = geo->page_size;
 	pio[4] = payload;
 	pio[5] = auxiliary;
-	desc = dmaengine_prep_slave_sg(channel,
-					(struct scatterlist *)pio,
-					ARRAY_SIZE(pio), DMA_TRANS_NONE,
-					DMA_CTRL_ACK);
+	desc = mxs_dmaengine_prep_pio(channel, pio, ARRAY_SIZE(pio),
+				      DMA_TRANS_NONE, DMA_CTRL_ACK);
 	if (!desc)
 		return -EINVAL;
 
@@ -1163,9 +1156,8 @@ static int gpmi_read_page(struct gpmi_nand_data *this, dma_addr_t payload,
 		| BF_GPMI_CTRL0_XFER_COUNT(geo->page_size);
 	pio[1] = 0;
 	pio[2] = 0; /* clear GPMI_HW_GPMI_ECCCTRL, disable the BCH. */
-	desc = dmaengine_prep_slave_sg(channel,
-				(struct scatterlist *)pio, 3,
-				DMA_TRANS_NONE, DMA_CTRL_ACK);
+	desc = mxs_dmaengine_prep_pio(channel, pio, 3, DMA_TRANS_NONE,
+				      DMA_CTRL_ACK);
 	if (!desc)
 		return -EINVAL;
 
diff --git a/include/linux/dma/mxs-dma.h b/include/linux/dma/mxs-dma.h
new file mode 100644
index 000000000000..092b2a7b92ac
--- /dev/null
+++ b/include/linux/dma/mxs-dma.h
@@ -0,0 +1,21 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _MXS_DMA_H_
+#define _MXS_DMA_H_
+
+#include <linux/dmaengine.h>
+
+/*
+ * The mxs dmaengine can do PIO transfers. We pass a pointer to the PIO words
+ * in the second argument to dmaengine_prep_slave_sg when the direction is
+ * set to DMA_TRANS_NONE. To make this clear and to prevent users from doing
+ * the error prone casting we have this wrapper function
+ */
+static inline struct dma_async_tx_descriptor *mxs_dmaengine_prep_pio(
+        struct dma_chan *chan, u32 *pio, unsigned int npio,
+        enum dma_transfer_direction dir, unsigned long flags)
+{
+	return dmaengine_prep_slave_sg(chan, (struct scatterlist *)pio, npio,
+				       dir, flags);
+}
+
+#endif /* _MXS_DMA_H_ */
-- 
cgit v1.2.3


From ceeeb99cd821a2f7493e1e0e1eca5afc7a205213 Mon Sep 17 00:00:00 2001
From: Sascha Hauer <s.hauer@pengutronix.de>
Date: Tue, 21 May 2019 09:06:42 +0200
Subject: dmaengine: mxs: rename custom flag

The mxs dma driver uses the flags parameter in dmaengine_prep_slave_sg() for
custom flags, but still uses the dmaengine specific names of the flags.
Do a little bit better and at least give the flag a custom name.

Signed-off-by: Sascha Hauer <s.hauer@pengutronix.de>
Acked-by: Vinod Koul <vkoul@kernel.org>
Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
---
 drivers/dma/mxs-dma.c                      |  4 ++--
 drivers/mtd/nand/raw/gpmi-nand/gpmi-nand.c | 17 +++++++++++------
 include/linux/dma/mxs-dma.h                |  2 ++
 3 files changed, 15 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/dma/mxs-dma.c b/drivers/dma/mxs-dma.c
index 62ee9328aea1..c622bee7eb12 100644
--- a/drivers/dma/mxs-dma.c
+++ b/drivers/dma/mxs-dma.c
@@ -541,7 +541,7 @@ static struct dma_async_tx_descriptor *mxs_dma_prep_slave_sg(
 		ccw->bits = 0;
 		ccw->bits |= CCW_IRQ;
 		ccw->bits |= CCW_DEC_SEM;
-		if (flags & DMA_CTRL_ACK)
+		if (flags & MXS_DMA_CTRL_WAIT4END)
 			ccw->bits |= CCW_WAIT4END;
 		ccw->bits |= CCW_HALT_ON_TERM;
 		ccw->bits |= CCW_TERM_FLUSH;
@@ -573,7 +573,7 @@ static struct dma_async_tx_descriptor *mxs_dma_prep_slave_sg(
 				ccw->bits &= ~CCW_CHAIN;
 				ccw->bits |= CCW_IRQ;
 				ccw->bits |= CCW_DEC_SEM;
-				if (flags & DMA_CTRL_ACK)
+				if (flags & MXS_DMA_CTRL_WAIT4END)
 					ccw->bits |= CCW_WAIT4END;
 			}
 		}
diff --git a/drivers/mtd/nand/raw/gpmi-nand/gpmi-nand.c b/drivers/mtd/nand/raw/gpmi-nand/gpmi-nand.c
index 45c7b91aae23..d088b3e77fef 100644
--- a/drivers/mtd/nand/raw/gpmi-nand/gpmi-nand.c
+++ b/drivers/mtd/nand/raw/gpmi-nand/gpmi-nand.c
@@ -926,7 +926,8 @@ static int gpmi_send_command(struct gpmi_nand_data *this)
 	sg_init_one(sgl, this->cmd_buffer, this->command_length);
 	dma_map_sg(this->dev, sgl, 1, DMA_TO_DEVICE);
 	desc = dmaengine_prep_slave_sg(channel,
-				sgl, 1, DMA_MEM_TO_DEV, DMA_CTRL_ACK);
+				sgl, 1, DMA_MEM_TO_DEV,
+				MXS_DMA_CTRL_WAIT4END);
 	if (!desc)
 		return -EINVAL;
 
@@ -996,7 +997,8 @@ static int gpmi_send_data(struct gpmi_nand_data *this, const void *buf, int len)
 	/* [2] send DMA request */
 	prepare_data_dma(this, buf, len, DMA_TO_DEVICE);
 	desc = dmaengine_prep_slave_sg(channel, &this->data_sgl,
-					1, DMA_MEM_TO_DEV, DMA_CTRL_ACK);
+					1, DMA_MEM_TO_DEV,
+					MXS_DMA_CTRL_WAIT4END);
 	if (!desc)
 		return -EINVAL;
 
@@ -1033,7 +1035,8 @@ static int gpmi_read_data(struct gpmi_nand_data *this, void *buf, int len)
 	/* [2] : send DMA request */
 	direct = prepare_data_dma(this, buf, len, DMA_FROM_DEVICE);
 	desc = dmaengine_prep_slave_sg(channel, &this->data_sgl,
-					1, DMA_DEV_TO_MEM, DMA_CTRL_ACK);
+					1, DMA_DEV_TO_MEM,
+					MXS_DMA_CTRL_WAIT4END);
 	if (!desc)
 		return -EINVAL;
 
@@ -1083,7 +1086,8 @@ static int gpmi_send_page(struct gpmi_nand_data *this, dma_addr_t payload,
 	pio[5] = auxiliary;
 
 	desc = mxs_dmaengine_prep_pio(channel, pio, ARRAY_SIZE(pio),
-				      DMA_TRANS_NONE, DMA_CTRL_ACK);
+				      DMA_TRANS_NONE,
+				      MXS_DMA_CTRL_WAIT4END);
 	if (!desc)
 		return -EINVAL;
 
@@ -1140,7 +1144,8 @@ static int gpmi_read_page(struct gpmi_nand_data *this, dma_addr_t payload,
 	pio[4] = payload;
 	pio[5] = auxiliary;
 	desc = mxs_dmaengine_prep_pio(channel, pio, ARRAY_SIZE(pio),
-				      DMA_TRANS_NONE, DMA_CTRL_ACK);
+				      DMA_TRANS_NONE,
+				      MXS_DMA_CTRL_WAIT4END);
 	if (!desc)
 		return -EINVAL;
 
@@ -1157,7 +1162,7 @@ static int gpmi_read_page(struct gpmi_nand_data *this, dma_addr_t payload,
 	pio[1] = 0;
 	pio[2] = 0; /* clear GPMI_HW_GPMI_ECCCTRL, disable the BCH. */
 	desc = mxs_dmaengine_prep_pio(channel, pio, 3, DMA_TRANS_NONE,
-				      DMA_CTRL_ACK);
+				      MXS_DMA_CTRL_WAIT4END);
 	if (!desc)
 		return -EINVAL;
 
diff --git a/include/linux/dma/mxs-dma.h b/include/linux/dma/mxs-dma.h
index 092b2a7b92ac..4a33f2c8a682 100644
--- a/include/linux/dma/mxs-dma.h
+++ b/include/linux/dma/mxs-dma.h
@@ -4,6 +4,8 @@
 
 #include <linux/dmaengine.h>
 
+#define MXS_DMA_CTRL_WAIT4END	BIT(31)
+
 /*
  * The mxs dmaengine can do PIO transfers. We pass a pointer to the PIO words
  * in the second argument to dmaengine_prep_slave_sg when the direction is
-- 
cgit v1.2.3


From ef347c0cfd619a9251e5a2f9ff72e33650a9bccb Mon Sep 17 00:00:00 2001
From: Sascha Hauer <s.hauer@pengutronix.de>
Date: Tue, 21 May 2019 09:06:43 +0200
Subject: mtd: rawnand: gpmi: Implement exec_op

The gpmi driver performance suffers from NAND operations being split
into multiple small DMA transfers. This was forced by the NAND layer
in former days, but now with exec_op we can use the controller as
intended.

With this patch gpmi_nfc_exec_op becomes the main entry point to NAND
operations. Here all instructions are collected and chained as separate
DMA transfers. At the end the whole chain is fired and we wait for it
to finish. gpmi_nfc_exec_op only does the hardware operations; bad block
marker swapping and buffer scrambling are done by the callers. It's worth
noting that the nand_*_op functions always take the buffer lengths for
the data that the NAND chip actually transfers. When doing BCH we have
to calculate the net data size from the raw data size in some places.

This patch has been tested with 2048/64 and 2048/128 byte NAND on
i.MX6q. mtd_oobtest, mtd_subpagetest and mtd_speedtest run without
errors. nandbiterrs, nandpagetest and nandsubpagetest userspace tests
from mtdutils run without errors and UBIFS can successfully be mounted.

Signed-off-by: Sascha Hauer <s.hauer@pengutronix.de>
Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
---
 drivers/dma/mxs-dma.c                      |    3 +
 drivers/mtd/nand/raw/gpmi-nand/gpmi-nand.c | 1105 +++++++++++-----------------
 drivers/mtd/nand/raw/gpmi-nand/gpmi-nand.h |   25 +-
 include/linux/dma/mxs-dma.h                |    1 +
 4 files changed, 444 insertions(+), 690 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/dma/mxs-dma.c b/drivers/dma/mxs-dma.c
index c622bee7eb12..20a9cb7cb6d3 100644
--- a/drivers/dma/mxs-dma.c
+++ b/drivers/dma/mxs-dma.c
@@ -78,6 +78,7 @@
 #define BM_CCW_COMMAND		(3 << 0)
 #define CCW_CHAIN		(1 << 2)
 #define CCW_IRQ			(1 << 3)
+#define CCW_WAIT4RDY		(1 << 5)
 #define CCW_DEC_SEM		(1 << 6)
 #define CCW_WAIT4END		(1 << 7)
 #define CCW_HALT_ON_TERM	(1 << 8)
@@ -547,6 +548,8 @@ static struct dma_async_tx_descriptor *mxs_dma_prep_slave_sg(
 		ccw->bits |= CCW_TERM_FLUSH;
 		ccw->bits |= BF_CCW(sg_len, PIO_NUM);
 		ccw->bits |= BF_CCW(MXS_DMA_CMD_NO_XFER, COMMAND);
+		if (flags & MXS_DMA_CTRL_WAIT4RDY)
+			ccw->bits |= CCW_WAIT4RDY;
 	} else {
 		for_each_sg(sgl, sg, sg_len, i) {
 			if (sg_dma_len(sg) > MAX_XFER_BYTES) {
diff --git a/drivers/mtd/nand/raw/gpmi-nand/gpmi-nand.c b/drivers/mtd/nand/raw/gpmi-nand/gpmi-nand.c
index d088b3e77fef..5db84178edff 100644
--- a/drivers/mtd/nand/raw/gpmi-nand/gpmi-nand.c
+++ b/drivers/mtd/nand/raw/gpmi-nand/gpmi-nand.c
@@ -528,26 +528,12 @@ static int common_nfc_set_geometry(struct gpmi_nand_data *this)
 static int bch_set_geometry(struct gpmi_nand_data *this)
 {
 	struct resources *r = &this->resources;
-	struct bch_geometry *bch_geo = &this->bch_geometry;
-	unsigned int block_count;
-	unsigned int block_size;
-	unsigned int metadata_size;
-	unsigned int ecc_strength;
-	unsigned int page_size;
-	unsigned int gf_len;
 	int ret;
 
 	ret = common_nfc_set_geometry(this);
 	if (ret)
 		return ret;
 
-	block_count   = bch_geo->ecc_chunk_count - 1;
-	block_size    = bch_geo->ecc_chunk_size;
-	metadata_size = bch_geo->metadata_size;
-	ecc_strength  = bch_geo->ecc_strength >> 1;
-	page_size     = bch_geo->page_size;
-	gf_len        = bch_geo->gf_len;
-
 	ret = pm_runtime_get_sync(this->dev);
 	if (ret < 0)
 		return ret;
@@ -561,27 +547,9 @@ static int bch_set_geometry(struct gpmi_nand_data *this)
 	if (ret)
 		goto err_out;
 
-	/* Configure layout 0. */
-	writel(BF_BCH_FLASH0LAYOUT0_NBLOCKS(block_count)
-			| BF_BCH_FLASH0LAYOUT0_META_SIZE(metadata_size)
-			| BF_BCH_FLASH0LAYOUT0_ECC0(ecc_strength, this)
-			| BF_BCH_FLASH0LAYOUT0_GF(gf_len, this)
-			| BF_BCH_FLASH0LAYOUT0_DATA0_SIZE(block_size, this),
-			r->bch_regs + HW_BCH_FLASH0LAYOUT0);
-
-	writel(BF_BCH_FLASH0LAYOUT1_PAGE_SIZE(page_size)
-			| BF_BCH_FLASH0LAYOUT1_ECCN(ecc_strength, this)
-			| BF_BCH_FLASH0LAYOUT1_GF(gf_len, this)
-			| BF_BCH_FLASH0LAYOUT1_DATAN_SIZE(block_size, this),
-			r->bch_regs + HW_BCH_FLASH0LAYOUT1);
-
 	/* Set *all* chip selects to use layout 0. */
 	writel(0, r->bch_regs + HW_BCH_LAYOUTSELECT);
 
-	/* Enable interrupts. */
-	writel(BM_BCH_CTRL_COMPLETE_IRQ_EN,
-				r->bch_regs + HW_BCH_CTRL_SET);
-
 	ret = 0;
 err_out:
 	pm_runtime_mark_last_busy(this->dev);
@@ -795,32 +763,6 @@ static void gpmi_clear_bch(struct gpmi_nand_data *this)
 	writel(BM_BCH_CTRL_COMPLETE_IRQ, r->bch_regs + HW_BCH_CTRL_CLR);
 }
 
-/* Returns the Ready/Busy status of the given chip. */
-static int gpmi_is_ready(struct gpmi_nand_data *this, unsigned chip)
-{
-	struct resources *r = &this->resources;
-	uint32_t mask = 0;
-	uint32_t reg = 0;
-
-	if (GPMI_IS_MX23(this)) {
-		mask = MX23_BM_GPMI_DEBUG_READY0 << chip;
-		reg = readl(r->gpmi_regs + HW_GPMI_DEBUG);
-	} else if (GPMI_IS_MX28(this) || GPMI_IS_MX6(this)) {
-		/*
-		 * In the imx6, all the ready/busy pins are bound
-		 * together. So we only need to check chip 0.
-		 */
-		if (GPMI_IS_MX6(this))
-			chip = 0;
-
-		/* MX28 shares the same R/B register as MX6Q. */
-		mask = MX28_BF_GPMI_STAT_READY_BUSY(1 << chip);
-		reg = readl(r->gpmi_regs + HW_GPMI_STAT);
-	} else
-		dev_err(this->dev, "unknown arch.\n");
-	return reg & mask;
-}
-
 static struct dma_chan *get_dma_chan(struct gpmi_nand_data *this)
 {
 	/* We use the DMA channel 0 to access all the nand chips. */
@@ -836,29 +778,6 @@ static void dma_irq_callback(void *param)
 	complete(dma_c);
 }
 
-static int start_dma_without_bch_irq(struct gpmi_nand_data *this,
-				     struct dma_async_tx_descriptor *desc)
-{
-	struct completion *dma_c = &this->dma_done;
-	unsigned long timeout;
-
-	init_completion(dma_c);
-
-	desc->callback		= dma_irq_callback;
-	desc->callback_param	= this;
-	dmaengine_submit(desc);
-	dma_async_issue_pending(get_dma_chan(this));
-
-	/* Wait for the interrupt from the DMA block. */
-	timeout = wait_for_completion_timeout(dma_c, msecs_to_jiffies(1000));
-	if (!timeout) {
-		dev_err(this->dev, "DMA timeout, last DMA\n");
-		gpmi_dump_info(this);
-		return -ETIMEDOUT;
-	}
-	return 0;
-}
-
 static irqreturn_t bch_irq(int irq, void *cookie)
 {
 	struct gpmi_nand_data *this = cookie;
@@ -868,83 +787,25 @@ static irqreturn_t bch_irq(int irq, void *cookie)
 	return IRQ_HANDLED;
 }
 
-/*
- * This function is used in BCH reading or BCH writing pages.
- * It will wait for the BCH interrupt as long as ONE second.
- * Actually, we must wait for two interrupts :
- *	[1] firstly the DMA interrupt and
- *	[2] secondly the BCH interrupt.
- */
-static int start_dma_with_bch_irq(struct gpmi_nand_data *this,
-				  struct dma_async_tx_descriptor *desc)
+static int gpmi_raw_len_to_len(struct gpmi_nand_data *this, int raw_len)
 {
-	struct completion *bch_c = &this->bch_done;
-	unsigned long timeout;
-
-	/* Prepare to receive an interrupt from the BCH block. */
-	init_completion(bch_c);
-
-	/* start the DMA */
-	start_dma_without_bch_irq(this, desc);
-
-	/* Wait for the interrupt from the BCH block. */
-	timeout = wait_for_completion_timeout(bch_c, msecs_to_jiffies(1000));
-	if (!timeout) {
-		dev_err(this->dev, "BCH timeout\n");
-		gpmi_dump_info(this);
-		return -ETIMEDOUT;
-	}
-	return 0;
-}
-
-static int gpmi_send_command(struct gpmi_nand_data *this)
-{
-	struct dma_chan *channel = get_dma_chan(this);
-	struct dma_async_tx_descriptor *desc;
-	struct scatterlist *sgl;
-	int chip = this->current_chip;
-	int ret;
-	u32 pio[3];
-
-	/* [1] send out the PIO words */
-	pio[0] = BF_GPMI_CTRL0_COMMAND_MODE(BV_GPMI_CTRL0_COMMAND_MODE__WRITE)
-		| BM_GPMI_CTRL0_WORD_LENGTH
-		| BF_GPMI_CTRL0_CS(chip, this)
-		| BF_GPMI_CTRL0_LOCK_CS(LOCK_CS_ENABLE, this)
-		| BF_GPMI_CTRL0_ADDRESS(BV_GPMI_CTRL0_ADDRESS__NAND_CLE)
-		| BM_GPMI_CTRL0_ADDRESS_INCREMENT
-		| BF_GPMI_CTRL0_XFER_COUNT(this->command_length);
-	pio[1] = pio[2] = 0;
-	desc = mxs_dmaengine_prep_pio(channel, pio, ARRAY_SIZE(pio),
-				      DMA_TRANS_NONE, 0);
-	if (!desc)
-		return -EINVAL;
-
-	/* [2] send out the COMMAND + ADDRESS string stored in @buffer */
-	sgl = &this->cmd_sgl;
-
-	sg_init_one(sgl, this->cmd_buffer, this->command_length);
-	dma_map_sg(this->dev, sgl, 1, DMA_TO_DEVICE);
-	desc = dmaengine_prep_slave_sg(channel,
-				sgl, 1, DMA_MEM_TO_DEV,
-				MXS_DMA_CTRL_WAIT4END);
-	if (!desc)
-		return -EINVAL;
-
-	/* [3] submit the DMA */
-	ret = start_dma_without_bch_irq(this, desc);
-
-	dma_unmap_sg(this->dev, sgl, 1, DMA_TO_DEVICE);
-
-	return ret;
+	/*
+	 * raw_len is the length to read/write including bch data which
+	 * we are passed in exec_op. Calculate the data length from it.
+	 */
+	if (this->bch)
+		return ALIGN_DOWN(raw_len, this->bch_geometry.ecc_chunk_size);
+	else
+		return raw_len;
 }
 
 /* Can we use the upper's buffer directly for DMA? */
 static bool prepare_data_dma(struct gpmi_nand_data *this, const void *buf,
-			     int len, enum dma_data_direction dr)
+			     int raw_len, struct scatterlist *sgl,
+			     enum dma_data_direction dr)
 {
-	struct scatterlist *sgl = &this->data_sgl;
 	int ret;
+	int len = gpmi_raw_len_to_len(this, raw_len);
 
 	/* first try to map the upper buffer directly */
 	if (virt_addr_valid(buf) && !object_is_on_stack(buf)) {
@@ -960,7 +821,7 @@ map_fail:
 	/* We have to use our own DMA buffer. */
 	sg_init_one(sgl, this->data_buffer_dma, len);
 
-	if (dr == DMA_TO_DEVICE)
+	if (dr == DMA_TO_DEVICE && buf != this->data_buffer_dma)
 		memcpy(this->data_buffer_dma, buf, len);
 
 	dma_map_sg(this->dev, sgl, 1, dr);
@@ -968,208 +829,6 @@ map_fail:
 	return false;
 }
 
-static int gpmi_send_data(struct gpmi_nand_data *this, const void *buf, int len)
-{
-	struct dma_async_tx_descriptor *desc;
-	struct dma_chan *channel = get_dma_chan(this);
-	int chip = this->current_chip;
-	int ret;
-	uint32_t command_mode;
-	uint32_t address;
-	u32 pio[2];
-
-	/* [1] PIO */
-	command_mode = BV_GPMI_CTRL0_COMMAND_MODE__WRITE;
-	address      = BV_GPMI_CTRL0_ADDRESS__NAND_DATA;
-
-	pio[0] = BF_GPMI_CTRL0_COMMAND_MODE(command_mode)
-		| BM_GPMI_CTRL0_WORD_LENGTH
-		| BF_GPMI_CTRL0_CS(chip, this)
-		| BF_GPMI_CTRL0_LOCK_CS(LOCK_CS_ENABLE, this)
-		| BF_GPMI_CTRL0_ADDRESS(address)
-		| BF_GPMI_CTRL0_XFER_COUNT(len);
-	pio[1] = 0;
-	desc = mxs_dmaengine_prep_pio(channel, pio, ARRAY_SIZE(pio),
-				      DMA_TRANS_NONE, 0);
-	if (!desc)
-		return -EINVAL;
-
-	/* [2] send DMA request */
-	prepare_data_dma(this, buf, len, DMA_TO_DEVICE);
-	desc = dmaengine_prep_slave_sg(channel, &this->data_sgl,
-					1, DMA_MEM_TO_DEV,
-					MXS_DMA_CTRL_WAIT4END);
-	if (!desc)
-		return -EINVAL;
-
-	/* [3] submit the DMA */
-	ret = start_dma_without_bch_irq(this, desc);
-
-	dma_unmap_sg(this->dev, &this->data_sgl, 1, DMA_TO_DEVICE);
-
-	return ret;
-}
-
-static int gpmi_read_data(struct gpmi_nand_data *this, void *buf, int len)
-{
-	struct dma_async_tx_descriptor *desc;
-	struct dma_chan *channel = get_dma_chan(this);
-	int chip = this->current_chip;
-	int ret;
-	u32 pio[2];
-	bool direct;
-
-	/* [1] : send PIO */
-	pio[0] = BF_GPMI_CTRL0_COMMAND_MODE(BV_GPMI_CTRL0_COMMAND_MODE__READ)
-		| BM_GPMI_CTRL0_WORD_LENGTH
-		| BF_GPMI_CTRL0_CS(chip, this)
-		| BF_GPMI_CTRL0_LOCK_CS(LOCK_CS_ENABLE, this)
-		| BF_GPMI_CTRL0_ADDRESS(BV_GPMI_CTRL0_ADDRESS__NAND_DATA)
-		| BF_GPMI_CTRL0_XFER_COUNT(len);
-	pio[1] = 0;
-	desc = mxs_dmaengine_prep_pio(channel, pio, ARRAY_SIZE(pio),
-				      DMA_TRANS_NONE, 0);
-	if (!desc)
-		return -EINVAL;
-
-	/* [2] : send DMA request */
-	direct = prepare_data_dma(this, buf, len, DMA_FROM_DEVICE);
-	desc = dmaengine_prep_slave_sg(channel, &this->data_sgl,
-					1, DMA_DEV_TO_MEM,
-					MXS_DMA_CTRL_WAIT4END);
-	if (!desc)
-		return -EINVAL;
-
-	/* [3] : submit the DMA */
-
-	ret = start_dma_without_bch_irq(this, desc);
-
-	dma_unmap_sg(this->dev, &this->data_sgl, 1, DMA_FROM_DEVICE);
-	if (!direct)
-		memcpy(buf, this->data_buffer_dma, len);
-
-	return ret;
-}
-
-static int gpmi_send_page(struct gpmi_nand_data *this, dma_addr_t payload,
-			  dma_addr_t auxiliary)
-{
-	struct bch_geometry *geo = &this->bch_geometry;
-	uint32_t command_mode;
-	uint32_t address;
-	uint32_t ecc_command;
-	uint32_t buffer_mask;
-	struct dma_async_tx_descriptor *desc;
-	struct dma_chan *channel = get_dma_chan(this);
-	int chip = this->current_chip;
-	u32 pio[6];
-
-	/* A DMA descriptor that does an ECC page read. */
-	command_mode = BV_GPMI_CTRL0_COMMAND_MODE__WRITE;
-	address      = BV_GPMI_CTRL0_ADDRESS__NAND_DATA;
-	ecc_command  = BV_GPMI_ECCCTRL_ECC_CMD__BCH_ENCODE;
-	buffer_mask  = BV_GPMI_ECCCTRL_BUFFER_MASK__BCH_PAGE |
-				BV_GPMI_ECCCTRL_BUFFER_MASK__BCH_AUXONLY;
-
-	pio[0] = BF_GPMI_CTRL0_COMMAND_MODE(command_mode)
-		| BM_GPMI_CTRL0_WORD_LENGTH
-		| BF_GPMI_CTRL0_CS(chip, this)
-		| BF_GPMI_CTRL0_LOCK_CS(LOCK_CS_ENABLE, this)
-		| BF_GPMI_CTRL0_ADDRESS(address)
-		| BF_GPMI_CTRL0_XFER_COUNT(0);
-	pio[1] = 0;
-	pio[2] = BM_GPMI_ECCCTRL_ENABLE_ECC
-		| BF_GPMI_ECCCTRL_ECC_CMD(ecc_command)
-		| BF_GPMI_ECCCTRL_BUFFER_MASK(buffer_mask);
-	pio[3] = geo->page_size;
-	pio[4] = payload;
-	pio[5] = auxiliary;
-
-	desc = mxs_dmaengine_prep_pio(channel, pio, ARRAY_SIZE(pio),
-				      DMA_TRANS_NONE,
-				      MXS_DMA_CTRL_WAIT4END);
-	if (!desc)
-		return -EINVAL;
-
-	return start_dma_with_bch_irq(this, desc);
-}
-
-static int gpmi_read_page(struct gpmi_nand_data *this, dma_addr_t payload,
-			  dma_addr_t auxiliary)
-{
-	struct bch_geometry *geo = &this->bch_geometry;
-	uint32_t command_mode;
-	uint32_t address;
-	uint32_t ecc_command;
-	uint32_t buffer_mask;
-	struct dma_async_tx_descriptor *desc;
-	struct dma_chan *channel = get_dma_chan(this);
-	int chip = this->current_chip;
-	u32 pio[6];
-
-	/* [1] Wait for the chip to report ready. */
-	command_mode = BV_GPMI_CTRL0_COMMAND_MODE__WAIT_FOR_READY;
-	address      = BV_GPMI_CTRL0_ADDRESS__NAND_DATA;
-
-	pio[0] =  BF_GPMI_CTRL0_COMMAND_MODE(command_mode)
-		| BM_GPMI_CTRL0_WORD_LENGTH
-		| BF_GPMI_CTRL0_CS(chip, this)
-		| BF_GPMI_CTRL0_LOCK_CS(LOCK_CS_ENABLE, this)
-		| BF_GPMI_CTRL0_ADDRESS(address)
-		| BF_GPMI_CTRL0_XFER_COUNT(0);
-	pio[1] = 0;
-	desc = mxs_dmaengine_prep_pio(channel, pio, 2, DMA_TRANS_NONE, 0);
-	if (!desc)
-		return -EINVAL;
-
-	/* [2] Enable the BCH block and read. */
-	command_mode = BV_GPMI_CTRL0_COMMAND_MODE__READ;
-	address      = BV_GPMI_CTRL0_ADDRESS__NAND_DATA;
-	ecc_command  = BV_GPMI_ECCCTRL_ECC_CMD__BCH_DECODE;
-	buffer_mask  = BV_GPMI_ECCCTRL_BUFFER_MASK__BCH_PAGE
-			| BV_GPMI_ECCCTRL_BUFFER_MASK__BCH_AUXONLY;
-
-	pio[0] =  BF_GPMI_CTRL0_COMMAND_MODE(command_mode)
-		| BM_GPMI_CTRL0_WORD_LENGTH
-		| BF_GPMI_CTRL0_CS(chip, this)
-		| BF_GPMI_CTRL0_LOCK_CS(LOCK_CS_ENABLE, this)
-		| BF_GPMI_CTRL0_ADDRESS(address)
-		| BF_GPMI_CTRL0_XFER_COUNT(geo->page_size);
-
-	pio[1] = 0;
-	pio[2] =  BM_GPMI_ECCCTRL_ENABLE_ECC
-		| BF_GPMI_ECCCTRL_ECC_CMD(ecc_command)
-		| BF_GPMI_ECCCTRL_BUFFER_MASK(buffer_mask);
-	pio[3] = geo->page_size;
-	pio[4] = payload;
-	pio[5] = auxiliary;
-	desc = mxs_dmaengine_prep_pio(channel, pio, ARRAY_SIZE(pio),
-				      DMA_TRANS_NONE,
-				      MXS_DMA_CTRL_WAIT4END);
-	if (!desc)
-		return -EINVAL;
-
-	/* [3] Disable the BCH block */
-	command_mode = BV_GPMI_CTRL0_COMMAND_MODE__WAIT_FOR_READY;
-	address      = BV_GPMI_CTRL0_ADDRESS__NAND_DATA;
-
-	pio[0] = BF_GPMI_CTRL0_COMMAND_MODE(command_mode)
-		| BM_GPMI_CTRL0_WORD_LENGTH
-		| BF_GPMI_CTRL0_CS(chip, this)
-		| BF_GPMI_CTRL0_LOCK_CS(LOCK_CS_ENABLE, this)
-		| BF_GPMI_CTRL0_ADDRESS(address)
-		| BF_GPMI_CTRL0_XFER_COUNT(geo->page_size);
-	pio[1] = 0;
-	pio[2] = 0; /* clear GPMI_HW_GPMI_ECCCTRL, disable the BCH. */
-	desc = mxs_dmaengine_prep_pio(channel, pio, 3, DMA_TRANS_NONE,
-				      MXS_DMA_CTRL_WAIT4END);
-	if (!desc)
-		return -EINVAL;
-
-	/* [4] submit the DMA */
-	return start_dma_with_bch_irq(this, desc);
-}
-
 /**
  * gpmi_copy_bits - copy bits from one memory region to another
  * @dst: destination buffer
@@ -1568,67 +1227,20 @@ static void release_resources(struct gpmi_nand_data *this)
 	release_dma_channels(this);
 }
 
-static int send_page_prepare(struct gpmi_nand_data *this,
-			const void *source, unsigned length,
-			void *alt_virt, dma_addr_t alt_phys, unsigned alt_size,
-			const void **use_virt, dma_addr_t *use_phys)
-{
-	struct device *dev = this->dev;
-
-	if (virt_addr_valid(source)) {
-		dma_addr_t source_phys;
-
-		source_phys = dma_map_single(dev, (void *)source, length,
-						DMA_TO_DEVICE);
-		if (dma_mapping_error(dev, source_phys)) {
-			if (alt_size < length) {
-				dev_err(dev, "Alternate buffer is too small\n");
-				return -ENOMEM;
-			}
-			goto map_failed;
-		}
-		*use_virt = source;
-		*use_phys = source_phys;
-		return 0;
-	}
-map_failed:
-	/*
-	 * Copy the content of the source buffer into the alternate
-	 * buffer and set up the return values accordingly.
-	 */
-	memcpy(alt_virt, source, length);
-
-	*use_virt = alt_virt;
-	*use_phys = alt_phys;
-	return 0;
-}
-
-static void send_page_end(struct gpmi_nand_data *this,
-			const void *source, unsigned length,
-			void *alt_virt, dma_addr_t alt_phys, unsigned alt_size,
-			const void *used_virt, dma_addr_t used_phys)
-{
-	struct device *dev = this->dev;
-	if (used_virt == source)
-		dma_unmap_single(dev, used_phys, length, DMA_TO_DEVICE);
-}
-
 static void gpmi_free_dma_buffer(struct gpmi_nand_data *this)
 {
 	struct device *dev = this->dev;
+	struct bch_geometry *geo = &this->bch_geometry;
 
-	if (this->payload_virt && virt_addr_valid(this->payload_virt))
-		dma_free_coherent(dev, this->page_buffer_size,
-					this->payload_virt,
-					this->payload_phys);
-	kfree(this->cmd_buffer);
+	if (this->auxiliary_virt && virt_addr_valid(this->auxiliary_virt))
+		dma_free_coherent(dev, geo->auxiliary_size,
+					this->auxiliary_virt,
+					this->auxiliary_phys);
 	kfree(this->data_buffer_dma);
 	kfree(this->raw_buffer);
 
-	this->cmd_buffer	= NULL;
 	this->data_buffer_dma	= NULL;
 	this->raw_buffer	= NULL;
-	this->page_buffer_size	=  0;
 }
 
 /* Allocate the DMA buffers */
@@ -1638,11 +1250,6 @@ static int gpmi_alloc_dma_buffer(struct gpmi_nand_data *this)
 	struct device *dev = this->dev;
 	struct mtd_info *mtd = nand_to_mtd(&this->nand);
 
-	/* [1] Allocate a command buffer. PAGE_SIZE is enough. */
-	this->cmd_buffer = kzalloc(PAGE_SIZE, GFP_DMA | GFP_KERNEL);
-	if (this->cmd_buffer == NULL)
-		goto error_alloc;
-
 	/*
 	 * [2] Allocate a read/write data buffer.
 	 *     The gpmi_alloc_dma_buffer can be called twice.
@@ -1656,27 +1263,15 @@ static int gpmi_alloc_dma_buffer(struct gpmi_nand_data *this)
 	if (this->data_buffer_dma == NULL)
 		goto error_alloc;
 
-	/*
-	 * [3] Allocate the page buffer.
-	 *
-	 * Both the payload buffer and the auxiliary buffer must appear on
-	 * 32-bit boundaries. We presume the size of the payload buffer is a
-	 * power of two and is much larger than four, which guarantees the
-	 * auxiliary buffer will appear on a 32-bit boundary.
-	 */
-	this->page_buffer_size = geo->payload_size + geo->auxiliary_size;
-	this->payload_virt = dma_alloc_coherent(dev, this->page_buffer_size,
-					&this->payload_phys, GFP_DMA);
-	if (!this->payload_virt)
+	this->auxiliary_virt = dma_alloc_coherent(dev, geo->auxiliary_size,
+					&this->auxiliary_phys, GFP_DMA);
+	if (!this->auxiliary_virt)
 		goto error_alloc;
 
-	this->raw_buffer = kzalloc(mtd->writesize + mtd->oobsize, GFP_KERNEL);
+	this->raw_buffer = kzalloc((mtd->writesize ?: PAGE_SIZE) + mtd->oobsize, GFP_KERNEL);
 	if (!this->raw_buffer)
 		goto error_alloc;
 
-	/* Slice up the page buffer. */
-	this->auxiliary_virt = this->payload_virt + geo->payload_size;
-	this->auxiliary_phys = this->payload_phys + geo->payload_size;
 	return 0;
 
 error_alloc:
@@ -1684,105 +1279,6 @@ error_alloc:
 	return -ENOMEM;
 }
 
-static void gpmi_cmd_ctrl(struct nand_chip *chip, int data, unsigned int ctrl)
-{
-	struct gpmi_nand_data *this = nand_get_controller_data(chip);
-	int ret;
-
-	/*
-	 * Every operation begins with a command byte and a series of zero or
-	 * more address bytes. These are distinguished by either the Address
-	 * Latch Enable (ALE) or Command Latch Enable (CLE) signals being
-	 * asserted. When MTD is ready to execute the command, it will deassert
-	 * both latch enables.
-	 *
-	 * Rather than run a separate DMA operation for every single byte, we
-	 * queue them up and run a single DMA operation for the entire series
-	 * of command and data bytes. NAND_CMD_NONE means the END of the queue.
-	 */
-	if ((ctrl & (NAND_ALE | NAND_CLE))) {
-		if (data != NAND_CMD_NONE)
-			this->cmd_buffer[this->command_length++] = data;
-		return;
-	}
-
-	if (!this->command_length)
-		return;
-
-	ret = gpmi_send_command(this);
-	if (ret)
-		dev_err(this->dev, "Chip: %u, Error %d\n",
-			this->current_chip, ret);
-
-	this->command_length = 0;
-}
-
-static int gpmi_dev_ready(struct nand_chip *chip)
-{
-	struct gpmi_nand_data *this = nand_get_controller_data(chip);
-
-	return gpmi_is_ready(this, this->current_chip);
-}
-
-static void gpmi_select_chip(struct nand_chip *chip, int chipnr)
-{
-	struct gpmi_nand_data *this = nand_get_controller_data(chip);
-	int ret;
-
-	/*
-	 * For power consumption matters, disable/enable the clock each time a
-	 * die is selected/unselected.
-	 */
-	if (this->current_chip < 0 && chipnr >= 0) {
-		ret = pm_runtime_get_sync(this->dev);
-		if (ret < 0)
-			dev_err(this->dev, "Failed to enable the clock\n");
-	} else if (this->current_chip >= 0 && chipnr < 0) {
-		pm_runtime_mark_last_busy(this->dev);
-		pm_runtime_put_autosuspend(this->dev);
-	}
-
-	/*
-	 * This driver currently supports only one NAND chip. Plus, dies share
-	 * the same configuration. So once timings have been applied on the
-	 * controller side, they will not change anymore. When the time will
-	 * come, the check on must_apply_timings will have to be dropped.
-	 */
-	if (chipnr >= 0 && this->hw.must_apply_timings) {
-		this->hw.must_apply_timings = false;
-		gpmi_nfc_apply_timings(this);
-	}
-
-	this->current_chip = chipnr;
-}
-
-static void gpmi_read_buf(struct nand_chip *chip, uint8_t *buf, int len)
-{
-	struct gpmi_nand_data *this = nand_get_controller_data(chip);
-
-	dev_dbg(this->dev, "len is %d\n", len);
-
-	gpmi_read_data(this, buf, len);
-}
-
-static void gpmi_write_buf(struct nand_chip *chip, const uint8_t *buf, int len)
-{
-	struct gpmi_nand_data *this = nand_get_controller_data(chip);
-
-	dev_dbg(this->dev, "len is %d\n", len);
-
-	gpmi_send_data(this, buf, len);
-}
-
-static uint8_t gpmi_read_byte(struct nand_chip *chip)
-{
-	struct gpmi_nand_data *this = nand_get_controller_data(chip);
-	uint8_t *buf = this->data_buffer_dma;
-
-	gpmi_read_buf(chip, buf, 1);
-	return buf[0];
-}
-
 /*
  * Handles block mark swapping.
  * It can be called in swapping the block mark, or swapping it back,
@@ -1831,50 +1327,20 @@ static void block_mark_swapping(struct gpmi_nand_data *this,
 	p[1] = (p[1] & mask) | (from_oob >> (8 - bit));
 }
 
-static int gpmi_ecc_read_page_data(struct nand_chip *chip, uint8_t *buf)
+static int gpmi_count_bitflips(struct nand_chip *chip, void *buf, int first,
+			       int last, int meta)
 {
 	struct gpmi_nand_data *this = nand_get_controller_data(chip);
 	struct bch_geometry *nfc_geo = &this->bch_geometry;
 	struct mtd_info *mtd = nand_to_mtd(chip);
-	dma_addr_t    payload_phys;
-	unsigned int  i;
+	int i;
 	unsigned char *status;
-	unsigned int  max_bitflips = 0;
-	int           ret;
-	bool          direct = false;
-
-	payload_phys = this->payload_phys;
-
-	if (virt_addr_valid(buf)) {
-		dma_addr_t dest_phys;
-
-		dest_phys = dma_map_single(this->dev, buf, nfc_geo->payload_size,
-					   DMA_FROM_DEVICE);
-		if (!dma_mapping_error(this->dev, dest_phys)) {
-			payload_phys = dest_phys;
-			direct = true;
-		}
-	}
-
-	/* go! */
-	ret = gpmi_read_page(this, payload_phys, this->auxiliary_phys);
-
-	if (direct)
-		dma_unmap_single(this->dev, payload_phys, nfc_geo->payload_size,
-				 DMA_FROM_DEVICE);
-
-	if (ret) {
-		dev_err(this->dev, "Error in ECC-based read: %d\n", ret);
-		return ret;
-	}
+	unsigned int max_bitflips = 0;
 
 	/* Loop over status bytes, accumulating ECC status. */
-	status = this->auxiliary_virt + nfc_geo->auxiliary_status_offset;
+	status = this->auxiliary_virt + ALIGN(meta, 4);
 
-	if (!direct)
-		memcpy(buf, this->payload_virt, nfc_geo->payload_size);
-
-	for (i = 0; i < nfc_geo->ecc_chunk_count; i++, status++) {
+	for (i = first; i < last; i++, status++) {
 		if ((*status == STATUS_GOOD) || (*status == STATUS_ERASED))
 			continue;
 
@@ -1954,25 +1420,53 @@ static int gpmi_ecc_read_page_data(struct nand_chip *chip, uint8_t *buf)
 		max_bitflips = max_t(unsigned int, max_bitflips, *status);
 	}
 
-	/* handle the block mark swapping */
-	block_mark_swapping(this, buf, this->auxiliary_virt);
-
 	return max_bitflips;
 }
 
+static void gpmi_bch_layout_std(struct gpmi_nand_data *this)
+{
+	struct bch_geometry *geo = &this->bch_geometry;
+	unsigned int ecc_strength = geo->ecc_strength >> 1;
+	unsigned int gf_len = geo->gf_len;
+	unsigned int block_size = block_size = geo->ecc_chunk_size;
+
+	this->bch_flashlayout0 =
+		BF_BCH_FLASH0LAYOUT0_NBLOCKS(geo->ecc_chunk_count - 1) |
+		BF_BCH_FLASH0LAYOUT0_META_SIZE(geo->metadata_size) |
+		BF_BCH_FLASH0LAYOUT0_ECC0(ecc_strength, this) |
+		BF_BCH_FLASH0LAYOUT0_GF(gf_len, this) |
+		BF_BCH_FLASH0LAYOUT0_DATA0_SIZE(block_size, this);
+
+	this->bch_flashlayout1 =
+		BF_BCH_FLASH0LAYOUT1_PAGE_SIZE(geo->page_size) |
+		BF_BCH_FLASH0LAYOUT1_ECCN(ecc_strength, this) |
+		BF_BCH_FLASH0LAYOUT1_GF(gf_len, this) |
+		BF_BCH_FLASH0LAYOUT1_DATAN_SIZE(block_size, this);
+}
+
 static int gpmi_ecc_read_page(struct nand_chip *chip, uint8_t *buf,
 			      int oob_required, int page)
 {
 	struct gpmi_nand_data *this = nand_get_controller_data(chip);
 	struct mtd_info *mtd = nand_to_mtd(chip);
+	struct bch_geometry *geo = &this->bch_geometry;
+	unsigned int max_bitflips;
 	int ret;
 
-	nand_read_page_op(chip, page, 0, NULL, 0);
+	gpmi_bch_layout_std(this);
+	this->bch = true;
 
-	ret = gpmi_ecc_read_page_data(chip, buf);
-	if (ret < 0)
+	ret = nand_read_page_op(chip, page, 0, buf, geo->page_size);
+	if (ret)
 		return ret;
 
+	max_bitflips = gpmi_count_bitflips(chip, buf, 0,
+					   geo->ecc_chunk_count,
+					   geo->auxiliary_status_offset);
+
+	/* handle the block mark swapping */
+	block_mark_swapping(this, buf, this->auxiliary_virt);
+
 	if (oob_required) {
 		/*
 		 * It's time to deliver the OOB bytes. See gpmi_ecc_read_oob()
@@ -1988,7 +1482,7 @@ static int gpmi_ecc_read_page(struct nand_chip *chip, uint8_t *buf,
 		chip->oob_poi[0] = ((uint8_t *)this->auxiliary_virt)[0];
 	}
 
-	return ret;
+	return max_bitflips;
 }
 
 /* Fake a virtual small page for the subpage read */
@@ -1996,17 +1490,15 @@ static int gpmi_ecc_read_subpage(struct nand_chip *chip, uint32_t offs,
 				 uint32_t len, uint8_t *buf, int page)
 {
 	struct gpmi_nand_data *this = nand_get_controller_data(chip);
-	void __iomem *bch_regs = this->resources.bch_regs;
-	struct bch_geometry old_geo = this->bch_geometry;
 	struct bch_geometry *geo = &this->bch_geometry;
 	int size = chip->ecc.size; /* ECC chunk size */
 	int meta, n, page_size;
-	u32 r1_old, r2_old, r1_new, r2_new;
 	unsigned int max_bitflips;
+	unsigned int ecc_strength;
 	int first, last, marker_pos;
 	int ecc_parity_size;
 	int col = 0;
-	int old_swap_block_mark = this->swap_block_mark;
+	int ret;
 
 	/* The size of ECC parity */
 	ecc_parity_size = geo->gf_len * geo->ecc_strength / 8;
@@ -2039,43 +1531,33 @@ static int gpmi_ecc_read_subpage(struct nand_chip *chip, uint32_t offs,
 		buf = buf + first * size;
 	}
 
-	nand_read_page_op(chip, page, col, NULL, 0);
-
-	/* Save the old environment */
-	r1_old = r1_new = readl(bch_regs + HW_BCH_FLASH0LAYOUT0);
-	r2_old = r2_new = readl(bch_regs + HW_BCH_FLASH0LAYOUT1);
+	ecc_parity_size = geo->gf_len * geo->ecc_strength / 8;
 
-	/* change the BCH registers and bch_geometry{} */
 	n = last - first + 1;
 	page_size = meta + (size + ecc_parity_size) * n;
+	ecc_strength = geo->ecc_strength >> 1;
+
+	this->bch_flashlayout0 = BF_BCH_FLASH0LAYOUT0_NBLOCKS(n - 1) |
+		BF_BCH_FLASH0LAYOUT0_META_SIZE(meta) |
+		BF_BCH_FLASH0LAYOUT0_ECC0(ecc_strength, this) |
+		BF_BCH_FLASH0LAYOUT0_GF(geo->gf_len, this) |
+		BF_BCH_FLASH0LAYOUT0_DATA0_SIZE(geo->ecc_chunk_size, this);
 
-	r1_new &= ~(BM_BCH_FLASH0LAYOUT0_NBLOCKS |
-			BM_BCH_FLASH0LAYOUT0_META_SIZE);
-	r1_new |= BF_BCH_FLASH0LAYOUT0_NBLOCKS(n - 1)
-			| BF_BCH_FLASH0LAYOUT0_META_SIZE(meta);
-	writel(r1_new, bch_regs + HW_BCH_FLASH0LAYOUT0);
+	this->bch_flashlayout1 = BF_BCH_FLASH0LAYOUT1_PAGE_SIZE(page_size) |
+		BF_BCH_FLASH0LAYOUT1_ECCN(ecc_strength, this) |
+		BF_BCH_FLASH0LAYOUT1_GF(geo->gf_len, this) |
+		BF_BCH_FLASH0LAYOUT1_DATAN_SIZE(geo->ecc_chunk_size, this);
 
-	r2_new &= ~BM_BCH_FLASH0LAYOUT1_PAGE_SIZE;
-	r2_new |= BF_BCH_FLASH0LAYOUT1_PAGE_SIZE(page_size);
-	writel(r2_new, bch_regs + HW_BCH_FLASH0LAYOUT1);
+	this->bch = true;
 
-	geo->ecc_chunk_count = n;
-	geo->payload_size = n * size;
-	geo->page_size = page_size;
-	geo->auxiliary_status_offset = ALIGN(meta, 4);
+	ret = nand_read_page_op(chip, page, col, buf, page_size);
+	if (ret)
+		return ret;
 
 	dev_dbg(this->dev, "page:%d(%d:%d)%d, chunk:(%d:%d), BCH PG size:%d\n",
 		page, offs, len, col, first, n, page_size);
 
-	/* Read the subpage now */
-	this->swap_block_mark = false;
-	max_bitflips = gpmi_ecc_read_page_data(chip, buf);
-
-	/* Restore */
-	writel(r1_old, bch_regs + HW_BCH_FLASH0LAYOUT0);
-	writel(r2_old, bch_regs + HW_BCH_FLASH0LAYOUT1);
-	this->bch_geometry = old_geo;
-	this->swap_block_mark = old_swap_block_mark;
+	max_bitflips = gpmi_count_bitflips(chip, buf, first, last, meta);
 
 	return max_bitflips;
 }
@@ -2086,81 +1568,29 @@ static int gpmi_ecc_write_page(struct nand_chip *chip, const uint8_t *buf,
 	struct mtd_info *mtd = nand_to_mtd(chip);
 	struct gpmi_nand_data *this = nand_get_controller_data(chip);
 	struct bch_geometry *nfc_geo = &this->bch_geometry;
-	const void *payload_virt;
-	dma_addr_t payload_phys;
-	const void *auxiliary_virt;
-	dma_addr_t auxiliary_phys;
-	int        ret;
+	int ret;
 
 	dev_dbg(this->dev, "ecc write page.\n");
 
-	nand_prog_page_begin_op(chip, page, 0, NULL, 0);
+	gpmi_bch_layout_std(this);
+	this->bch = true;
+
+	memcpy(this->auxiliary_virt, chip->oob_poi, nfc_geo->auxiliary_size);
 
 	if (this->swap_block_mark) {
 		/*
-		 * If control arrives here, we're doing block mark swapping.
-		 * Since we can't modify the caller's buffers, we must copy them
-		 * into our own.
-		 */
-		memcpy(this->payload_virt, buf, mtd->writesize);
-		payload_virt = this->payload_virt;
-		payload_phys = this->payload_phys;
-
-		memcpy(this->auxiliary_virt, chip->oob_poi,
-				nfc_geo->auxiliary_size);
-		auxiliary_virt = this->auxiliary_virt;
-		auxiliary_phys = this->auxiliary_phys;
-
-		/* Handle block mark swapping. */
-		block_mark_swapping(this,
-				(void *)payload_virt, (void *)auxiliary_virt);
-	} else {
-		/*
-		 * If control arrives here, we're not doing block mark swapping,
-		 * so we can to try and use the caller's buffers.
+		 * When doing bad block marker swapping we must always copy the
+		 * input buffer as we can't modify the const buffer.
 		 */
-		ret = send_page_prepare(this,
-				buf, mtd->writesize,
-				this->payload_virt, this->payload_phys,
-				nfc_geo->payload_size,
-				&payload_virt, &payload_phys);
-		if (ret) {
-			dev_err(this->dev, "Inadequate payload DMA buffer\n");
-			return 0;
-		}
-
-		ret = send_page_prepare(this,
-				chip->oob_poi, mtd->oobsize,
-				this->auxiliary_virt, this->auxiliary_phys,
-				nfc_geo->auxiliary_size,
-				&auxiliary_virt, &auxiliary_phys);
-		if (ret) {
-			dev_err(this->dev, "Inadequate auxiliary DMA buffer\n");
-			goto exit_auxiliary;
-		}
+		memcpy(this->data_buffer_dma, buf, mtd->writesize);
+		buf = this->data_buffer_dma;
+		block_mark_swapping(this, this->data_buffer_dma,
+				    this->auxiliary_virt);
 	}
 
-	/* Ask the NFC. */
-	ret = gpmi_send_page(this, payload_phys, auxiliary_phys);
-	if (ret)
-		dev_err(this->dev, "Error in ECC-based write: %d\n", ret);
-
-	if (!this->swap_block_mark) {
-		send_page_end(this, chip->oob_poi, mtd->oobsize,
-				this->auxiliary_virt, this->auxiliary_phys,
-				nfc_geo->auxiliary_size,
-				auxiliary_virt, auxiliary_phys);
-exit_auxiliary:
-		send_page_end(this, buf, mtd->writesize,
-				this->payload_virt, this->payload_phys,
-				nfc_geo->payload_size,
-				payload_virt, payload_phys);
-	}
+	ret = nand_prog_page_op(chip, page, 0, buf, nfc_geo->page_size);
 
-	if (ret)
-		return ret;
-
-	return nand_prog_page_end_op(chip);
+	return ret;
 }
 
 /*
@@ -2229,7 +1659,6 @@ static int gpmi_ecc_read_oob(struct nand_chip *chip, int page)
 	struct gpmi_nand_data *this = nand_get_controller_data(chip);
 	int ret;
 
-	dev_dbg(this->dev, "page number is %d\n", page);
 	/* clear the OOB buffer */
 	memset(chip->oob_poi, ~0, mtd->oobsize);
 
@@ -2297,9 +1726,12 @@ static int gpmi_ecc_read_page_raw(struct nand_chip *chip, uint8_t *buf,
 	size_t oob_byte_off;
 	uint8_t *oob = chip->oob_poi;
 	int step;
+	int ret;
 
-	nand_read_page_op(chip, page, 0, tmp_buf,
-			  mtd->writesize + mtd->oobsize);
+	ret = nand_read_page_op(chip, page, 0, tmp_buf,
+				mtd->writesize + mtd->oobsize);
+	if (ret)
+		return ret;
 
 	/*
 	 * If required, swap the bad block marker and the data stored in the
@@ -2789,9 +2221,330 @@ static int gpmi_nand_attach_chip(struct nand_chip *chip)
 	return 0;
 }
 
+static struct gpmi_transfer *get_next_transfer(struct gpmi_nand_data *this)
+{
+	struct gpmi_transfer *transfer = &this->transfers[this->ntransfers];
+
+	this->ntransfers++;
+
+	if (this->ntransfers == GPMI_MAX_TRANSFERS)
+		return NULL;
+
+	return transfer;
+}
+
+static struct dma_async_tx_descriptor *gpmi_chain_command(
+	struct gpmi_nand_data *this, u8 cmd, const u8 *addr, int naddr)
+{
+	struct dma_chan *channel = get_dma_chan(this);
+	struct dma_async_tx_descriptor *desc;
+	struct gpmi_transfer *transfer;
+	int chip = this->nand.cur_cs;
+	u32 pio[3];
+
+	/* [1] send out the PIO words */
+	pio[0] = BF_GPMI_CTRL0_COMMAND_MODE(BV_GPMI_CTRL0_COMMAND_MODE__WRITE)
+		| BM_GPMI_CTRL0_WORD_LENGTH
+		| BF_GPMI_CTRL0_CS(chip, this)
+		| BF_GPMI_CTRL0_LOCK_CS(LOCK_CS_ENABLE, this)
+		| BF_GPMI_CTRL0_ADDRESS(BV_GPMI_CTRL0_ADDRESS__NAND_CLE)
+		| BM_GPMI_CTRL0_ADDRESS_INCREMENT
+		| BF_GPMI_CTRL0_XFER_COUNT(naddr + 1);
+	pio[1] = 0;
+	pio[2] = 0;
+	desc = mxs_dmaengine_prep_pio(channel, pio, ARRAY_SIZE(pio),
+				      DMA_TRANS_NONE, 0);
+	if (!desc)
+		return NULL;
+
+	transfer = get_next_transfer(this);
+	if (!transfer)
+		return NULL;
+
+	transfer->cmdbuf[0] = cmd;
+	if (naddr)
+		memcpy(&transfer->cmdbuf[1], addr, naddr);
+
+	sg_init_one(&transfer->sgl, transfer->cmdbuf, naddr + 1);
+	dma_map_sg(this->dev, &transfer->sgl, 1, DMA_TO_DEVICE);
+
+	transfer->direction = DMA_TO_DEVICE;
+
+	desc = dmaengine_prep_slave_sg(channel, &transfer->sgl, 1, DMA_MEM_TO_DEV,
+				       MXS_DMA_CTRL_WAIT4END);
+	return desc;
+}
+
+static struct dma_async_tx_descriptor *gpmi_chain_wait_ready(
+	struct gpmi_nand_data *this)
+{
+	struct dma_chan *channel = get_dma_chan(this);
+	u32 pio[2];
+
+	pio[0] =  BF_GPMI_CTRL0_COMMAND_MODE(BV_GPMI_CTRL0_COMMAND_MODE__WAIT_FOR_READY)
+		| BM_GPMI_CTRL0_WORD_LENGTH
+		| BF_GPMI_CTRL0_CS(this->nand.cur_cs, this)
+		| BF_GPMI_CTRL0_LOCK_CS(LOCK_CS_ENABLE, this)
+		| BF_GPMI_CTRL0_ADDRESS(BV_GPMI_CTRL0_ADDRESS__NAND_DATA)
+		| BF_GPMI_CTRL0_XFER_COUNT(0);
+	pio[1] = 0;
+
+	return mxs_dmaengine_prep_pio(channel, pio, 2, DMA_TRANS_NONE,
+				MXS_DMA_CTRL_WAIT4END | MXS_DMA_CTRL_WAIT4RDY);
+}
+
+static struct dma_async_tx_descriptor *gpmi_chain_data_read(
+	struct gpmi_nand_data *this, void *buf, int raw_len, bool *direct)
+{
+	struct dma_async_tx_descriptor *desc;
+	struct dma_chan *channel = get_dma_chan(this);
+	struct gpmi_transfer *transfer;
+	u32 pio[6] = {};
+
+	transfer = get_next_transfer(this);
+	if (!transfer)
+		return NULL;
+
+	transfer->direction = DMA_FROM_DEVICE;
+
+	*direct = prepare_data_dma(this, buf, raw_len, &transfer->sgl,
+				   DMA_FROM_DEVICE);
+
+	pio[0] =  BF_GPMI_CTRL0_COMMAND_MODE(BV_GPMI_CTRL0_COMMAND_MODE__READ)
+		| BM_GPMI_CTRL0_WORD_LENGTH
+		| BF_GPMI_CTRL0_CS(this->nand.cur_cs, this)
+		| BF_GPMI_CTRL0_LOCK_CS(LOCK_CS_ENABLE, this)
+		| BF_GPMI_CTRL0_ADDRESS(BV_GPMI_CTRL0_ADDRESS__NAND_DATA)
+		| BF_GPMI_CTRL0_XFER_COUNT(raw_len);
+
+	if (this->bch) {
+		pio[2] =  BM_GPMI_ECCCTRL_ENABLE_ECC
+			| BF_GPMI_ECCCTRL_ECC_CMD(BV_GPMI_ECCCTRL_ECC_CMD__BCH_DECODE)
+			| BF_GPMI_ECCCTRL_BUFFER_MASK(BV_GPMI_ECCCTRL_BUFFER_MASK__BCH_PAGE
+				| BV_GPMI_ECCCTRL_BUFFER_MASK__BCH_AUXONLY);
+		pio[3] = raw_len;
+		pio[4] = transfer->sgl.dma_address;
+		pio[5] = this->auxiliary_phys;
+	}
+
+	desc = mxs_dmaengine_prep_pio(channel, pio, ARRAY_SIZE(pio),
+				      DMA_TRANS_NONE, 0);
+	if (!desc)
+		return NULL;
+
+	if (!this->bch)
+		desc = dmaengine_prep_slave_sg(channel, &transfer->sgl, 1,
+					     DMA_DEV_TO_MEM,
+					     MXS_DMA_CTRL_WAIT4END);
+
+	return desc;
+}
+
+static struct dma_async_tx_descriptor *gpmi_chain_data_write(
+	struct gpmi_nand_data *this, const void *buf, int raw_len)
+{
+	struct dma_chan *channel = get_dma_chan(this);
+	struct dma_async_tx_descriptor *desc;
+	struct gpmi_transfer *transfer;
+	u32 pio[6] = {};
+
+	transfer = get_next_transfer(this);
+	if (!transfer)
+		return NULL;
+
+	transfer->direction = DMA_TO_DEVICE;
+
+	prepare_data_dma(this, buf, raw_len, &transfer->sgl, DMA_TO_DEVICE);
+
+	pio[0] = BF_GPMI_CTRL0_COMMAND_MODE(BV_GPMI_CTRL0_COMMAND_MODE__WRITE)
+		| BM_GPMI_CTRL0_WORD_LENGTH
+		| BF_GPMI_CTRL0_CS(this->nand.cur_cs, this)
+		| BF_GPMI_CTRL0_LOCK_CS(LOCK_CS_ENABLE, this)
+		| BF_GPMI_CTRL0_ADDRESS(BV_GPMI_CTRL0_ADDRESS__NAND_DATA)
+		| BF_GPMI_CTRL0_XFER_COUNT(raw_len);
+
+	if (this->bch) {
+		pio[2] = BM_GPMI_ECCCTRL_ENABLE_ECC
+			| BF_GPMI_ECCCTRL_ECC_CMD(BV_GPMI_ECCCTRL_ECC_CMD__BCH_ENCODE)
+			| BF_GPMI_ECCCTRL_BUFFER_MASK(BV_GPMI_ECCCTRL_BUFFER_MASK__BCH_PAGE |
+					BV_GPMI_ECCCTRL_BUFFER_MASK__BCH_AUXONLY);
+		pio[3] = raw_len;
+		pio[4] = transfer->sgl.dma_address;
+		pio[5] = this->auxiliary_phys;
+	}
+
+	desc = mxs_dmaengine_prep_pio(channel, pio, ARRAY_SIZE(pio),
+				      DMA_TRANS_NONE,
+				      (this->bch ? MXS_DMA_CTRL_WAIT4END : 0));
+	if (!desc)
+		return NULL;
+
+	if (!this->bch)
+		desc = dmaengine_prep_slave_sg(channel, &transfer->sgl, 1,
+					       DMA_MEM_TO_DEV,
+					       MXS_DMA_CTRL_WAIT4END);
+
+	return desc;
+}
+
+static int gpmi_nfc_exec_op(struct nand_chip *chip,
+			     const struct nand_operation *op,
+			     bool check_only)
+{
+	const struct nand_op_instr *instr;
+	struct gpmi_nand_data *this = nand_get_controller_data(chip);
+	struct dma_async_tx_descriptor *desc = NULL;
+	int i, ret, buf_len = 0, nbufs = 0;
+	u8 cmd = 0;
+	void *buf_read = NULL;
+	const void *buf_write = NULL;
+	bool direct = false;
+	struct completion *completion;
+	unsigned long to;
+
+	this->ntransfers = 0;
+	for (i = 0; i < GPMI_MAX_TRANSFERS; i++)
+		this->transfers[i].direction = DMA_NONE;
+
+	ret = pm_runtime_get_sync(this->dev);
+	if (ret < 0)
+		return ret;
+
+	/*
+	 * This driver currently supports only one NAND chip. Plus, dies share
+	 * the same configuration. So once timings have been applied on the
+	 * controller side, they will not change anymore. When the time will
+	 * come, the check on must_apply_timings will have to be dropped.
+	 */
+	if (this->hw.must_apply_timings) {
+		this->hw.must_apply_timings = false;
+		gpmi_nfc_apply_timings(this);
+	}
+
+	dev_dbg(this->dev, "%s: %d instructions\n", __func__, op->ninstrs);
+
+	for (i = 0; i < op->ninstrs; i++) {
+		instr = &op->instrs[i];
+
+		nand_op_trace("  ", instr);
+
+		switch (instr->type) {
+		case NAND_OP_WAITRDY_INSTR:
+			desc = gpmi_chain_wait_ready(this);
+			break;
+		case NAND_OP_CMD_INSTR:
+			cmd = instr->ctx.cmd.opcode;
+
+			/*
+			 * When this command has an address cycle chain it
+			 * together with the address cycle
+			 */
+			if (i + 1 != op->ninstrs &&
+			    op->instrs[i + 1].type == NAND_OP_ADDR_INSTR)
+				continue;
+
+			desc = gpmi_chain_command(this, cmd, NULL, 0);
+
+			break;
+		case NAND_OP_ADDR_INSTR:
+			desc = gpmi_chain_command(this, cmd, instr->ctx.addr.addrs,
+						  instr->ctx.addr.naddrs);
+			break;
+		case NAND_OP_DATA_OUT_INSTR:
+			buf_write = instr->ctx.data.buf.out;
+			buf_len = instr->ctx.data.len;
+			nbufs++;
+
+			desc = gpmi_chain_data_write(this, buf_write, buf_len);
+
+			break;
+		case NAND_OP_DATA_IN_INSTR:
+			if (!instr->ctx.data.len)
+				break;
+			buf_read = instr->ctx.data.buf.in;
+			buf_len = instr->ctx.data.len;
+			nbufs++;
+
+			desc = gpmi_chain_data_read(this, buf_read, buf_len,
+						   &direct);
+			break;
+		}
+
+		if (!desc) {
+			ret = -ENXIO;
+			goto unmap;
+		}
+	}
+
+	dev_dbg(this->dev, "%s setup done\n", __func__);
+
+	if (nbufs > 1) {
+		dev_err(this->dev, "Multiple data instructions not supported\n");
+		ret = -EINVAL;
+		goto unmap;
+	}
+
+	if (this->bch) {
+		writel(this->bch_flashlayout0,
+		       this->resources.bch_regs + HW_BCH_FLASH0LAYOUT0);
+		writel(this->bch_flashlayout1,
+		       this->resources.bch_regs + HW_BCH_FLASH0LAYOUT1);
+	}
+
+	if (this->bch && buf_read) {
+		writel(BM_BCH_CTRL_COMPLETE_IRQ_EN,
+		       this->resources.bch_regs + HW_BCH_CTRL_SET);
+		completion = &this->bch_done;
+	} else {
+		desc->callback = dma_irq_callback;
+		desc->callback_param = this;
+		completion = &this->dma_done;
+	}
+
+	init_completion(completion);
+
+	dmaengine_submit(desc);
+	dma_async_issue_pending(get_dma_chan(this));
+
+	to = wait_for_completion_timeout(completion, msecs_to_jiffies(1000));
+	if (!to) {
+		dev_err(this->dev, "DMA timeout, last DMA\n");
+		gpmi_dump_info(this);
+		ret = -ETIMEDOUT;
+		goto unmap;
+	}
+
+	writel(BM_BCH_CTRL_COMPLETE_IRQ_EN,
+	       this->resources.bch_regs + HW_BCH_CTRL_CLR);
+	gpmi_clear_bch(this);
+
+	ret = 0;
+
+unmap:
+	for (i = 0; i < this->ntransfers; i++) {
+		struct gpmi_transfer *transfer = &this->transfers[i];
+
+		if (transfer->direction != DMA_NONE)
+			dma_unmap_sg(this->dev, &transfer->sgl, 1,
+				     transfer->direction);
+	}
+
+	if (!ret && buf_read && !direct)
+		memcpy(buf_read, this->data_buffer_dma,
+		       gpmi_raw_len_to_len(this, buf_len));
+
+	this->bch = false;
+
+	pm_runtime_mark_last_busy(this->dev);
+	pm_runtime_put_autosuspend(this->dev);
+
+	return ret;
+}
+
 static const struct nand_controller_ops gpmi_nand_controller_ops = {
 	.attach_chip = gpmi_nand_attach_chip,
 	.setup_data_interface = gpmi_setup_data_interface,
+	.exec_op = gpmi_nfc_exec_op,
 };
 
 static int gpmi_nand_init(struct gpmi_nand_data *this)
@@ -2800,9 +2553,6 @@ static int gpmi_nand_init(struct gpmi_nand_data *this)
 	struct mtd_info  *mtd = nand_to_mtd(chip);
 	int ret;
 
-	/* init current chip */
-	this->current_chip	= -1;
-
 	/* init the MTD data structures */
 	mtd->name		= "gpmi-nand";
 	mtd->dev.parent		= this->dev;
@@ -2810,14 +2560,8 @@ static int gpmi_nand_init(struct gpmi_nand_data *this)
 	/* init the nand_chip{}, we don't support a 16-bit NAND Flash bus. */
 	nand_set_controller_data(chip, this);
 	nand_set_flash_node(chip, this->pdev->dev.of_node);
-	chip->legacy.select_chip	= gpmi_select_chip;
-	chip->legacy.cmd_ctrl	= gpmi_cmd_ctrl;
-	chip->legacy.dev_ready	= gpmi_dev_ready;
-	chip->legacy.read_byte	= gpmi_read_byte;
-	chip->legacy.read_buf	= gpmi_read_buf;
-	chip->legacy.write_buf	= gpmi_write_buf;
-	chip->badblock_pattern	= &gpmi_bbt_descr;
 	chip->legacy.block_markbad = gpmi_block_markbad;
+	chip->badblock_pattern	= &gpmi_bbt_descr;
 	chip->options		|= NAND_NO_SUBPAGE_WRITE;
 
 	/* Set up swap_block_mark, must be set before the gpmi_set_geometry() */
@@ -2833,7 +2577,10 @@ static int gpmi_nand_init(struct gpmi_nand_data *this)
 	if (ret)
 		goto err_out;
 
-	chip->legacy.dummy_controller.ops = &gpmi_nand_controller_ops;
+	nand_controller_init(&this->base);
+	this->base.ops = &gpmi_nand_controller_ops;
+	chip->controller = &this->base;
+
 	ret = nand_scan(chip, GPMI_IS_MX6(this) ? 2 : 1);
 	if (ret)
 		goto err_out;
diff --git a/drivers/mtd/nand/raw/gpmi-nand/gpmi-nand.h b/drivers/mtd/nand/raw/gpmi-nand/gpmi-nand.h
index 51a070da84ed..fdc5ed7de083 100644
--- a/drivers/mtd/nand/raw/gpmi-nand/gpmi-nand.h
+++ b/drivers/mtd/nand/raw/gpmi-nand/gpmi-nand.h
@@ -103,6 +103,14 @@ struct gpmi_nfc_hardware_timing {
 	u32 ctrl1n;
 };
 
+#define GPMI_MAX_TRANSFERS	8
+
+struct gpmi_transfer {
+	u8 cmdbuf[8];
+	struct scatterlist sgl;
+	enum dma_data_direction direction;
+};
+
 struct gpmi_nand_data {
 	/* Devdata */
 	const struct gpmi_devdata *devdata;
@@ -126,23 +134,18 @@ struct gpmi_nand_data {
 	struct boot_rom_geometry rom_geometry;
 
 	/* MTD / NAND */
+	struct nand_controller	base;
 	struct nand_chip	nand;
 
-	/* General-use Variables */
-	int			current_chip;
-	unsigned int		command_length;
+	struct gpmi_transfer	transfers[GPMI_MAX_TRANSFERS];
+	int			ntransfers;
 
-	struct scatterlist	cmd_sgl;
-	char			*cmd_buffer;
+	bool			bch;
+	uint32_t		bch_flashlayout0;
+	uint32_t		bch_flashlayout1;
 
-	struct scatterlist	data_sgl;
 	char			*data_buffer_dma;
 
-	unsigned int		page_buffer_size;
-
-	void			*payload_virt;
-	dma_addr_t		payload_phys;
-
 	void			*auxiliary_virt;
 	dma_addr_t		auxiliary_phys;
 
diff --git a/include/linux/dma/mxs-dma.h b/include/linux/dma/mxs-dma.h
index 4a33f2c8a682..069d9f5a609e 100644
--- a/include/linux/dma/mxs-dma.h
+++ b/include/linux/dma/mxs-dma.h
@@ -5,6 +5,7 @@
 #include <linux/dmaengine.h>
 
 #define MXS_DMA_CTRL_WAIT4END	BIT(31)
+#define MXS_DMA_CTRL_WAIT4RDY	BIT(30)
 
 /*
  * The mxs dmaengine can do PIO transfers. We pass a pointer to the PIO words
-- 
cgit v1.2.3


From bded033062396e67ffbb3111084cf7ea202473d5 Mon Sep 17 00:00:00 2001
From: Jeff Kletsky <git-commits@allycomm.com>
Date: Wed, 22 May 2019 15:05:53 -0700
Subject: mtd: spinand: Define macros for page-read ops with three-byte
 addresses

The GigaDevice GD5F1GQ4UFxxG SPI NAND utilizes three-byte addresses
for its page-read ops.

http://www.gigadevice.com/datasheet/gd5f1gq4xfxxg/

Signed-off-by: Jeff Kletsky <git-commits@allycomm.com>
Reviewed-by: Frieder Schrempf <frieder.schrempf@kontron.de>
Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
---
 include/linux/mtd/spinand.h | 30 ++++++++++++++++++++++++++++++
 1 file changed, 30 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mtd/spinand.h b/include/linux/mtd/spinand.h
index 507f7e289bd1..8aa39ac41e8e 100644
--- a/include/linux/mtd/spinand.h
+++ b/include/linux/mtd/spinand.h
@@ -68,30 +68,60 @@
 		   SPI_MEM_OP_DUMMY(ndummy, 1),				\
 		   SPI_MEM_OP_DATA_IN(len, buf, 1))
 
+#define SPINAND_PAGE_READ_FROM_CACHE_OP_3A(fast, addr, ndummy, buf, len) \
+	SPI_MEM_OP(SPI_MEM_OP_CMD(fast ? 0x0b : 0x03, 1),		\
+		   SPI_MEM_OP_ADDR(3, addr, 1),				\
+		   SPI_MEM_OP_DUMMY(ndummy, 1),				\
+		   SPI_MEM_OP_DATA_IN(len, buf, 1))
+
 #define SPINAND_PAGE_READ_FROM_CACHE_X2_OP(addr, ndummy, buf, len)	\
 	SPI_MEM_OP(SPI_MEM_OP_CMD(0x3b, 1),				\
 		   SPI_MEM_OP_ADDR(2, addr, 1),				\
 		   SPI_MEM_OP_DUMMY(ndummy, 1),				\
 		   SPI_MEM_OP_DATA_IN(len, buf, 2))
 
+#define SPINAND_PAGE_READ_FROM_CACHE_X2_OP_3A(addr, ndummy, buf, len)	\
+	SPI_MEM_OP(SPI_MEM_OP_CMD(0x3b, 1),				\
+		   SPI_MEM_OP_ADDR(3, addr, 1),				\
+		   SPI_MEM_OP_DUMMY(ndummy, 1),				\
+		   SPI_MEM_OP_DATA_IN(len, buf, 2))
+
 #define SPINAND_PAGE_READ_FROM_CACHE_X4_OP(addr, ndummy, buf, len)	\
 	SPI_MEM_OP(SPI_MEM_OP_CMD(0x6b, 1),				\
 		   SPI_MEM_OP_ADDR(2, addr, 1),				\
 		   SPI_MEM_OP_DUMMY(ndummy, 1),				\
 		   SPI_MEM_OP_DATA_IN(len, buf, 4))
 
+#define SPINAND_PAGE_READ_FROM_CACHE_X4_OP_3A(addr, ndummy, buf, len)	\
+	SPI_MEM_OP(SPI_MEM_OP_CMD(0x6b, 1),				\
+		   SPI_MEM_OP_ADDR(3, addr, 1),				\
+		   SPI_MEM_OP_DUMMY(ndummy, 1),				\
+		   SPI_MEM_OP_DATA_IN(len, buf, 4))
+
 #define SPINAND_PAGE_READ_FROM_CACHE_DUALIO_OP(addr, ndummy, buf, len)	\
 	SPI_MEM_OP(SPI_MEM_OP_CMD(0xbb, 1),				\
 		   SPI_MEM_OP_ADDR(2, addr, 2),				\
 		   SPI_MEM_OP_DUMMY(ndummy, 2),				\
 		   SPI_MEM_OP_DATA_IN(len, buf, 2))
 
+#define SPINAND_PAGE_READ_FROM_CACHE_DUALIO_OP_3A(addr, ndummy, buf, len) \
+	SPI_MEM_OP(SPI_MEM_OP_CMD(0xbb, 1),				\
+		   SPI_MEM_OP_ADDR(3, addr, 2),				\
+		   SPI_MEM_OP_DUMMY(ndummy, 2),				\
+		   SPI_MEM_OP_DATA_IN(len, buf, 2))
+
 #define SPINAND_PAGE_READ_FROM_CACHE_QUADIO_OP(addr, ndummy, buf, len)	\
 	SPI_MEM_OP(SPI_MEM_OP_CMD(0xeb, 1),				\
 		   SPI_MEM_OP_ADDR(2, addr, 4),				\
 		   SPI_MEM_OP_DUMMY(ndummy, 4),				\
 		   SPI_MEM_OP_DATA_IN(len, buf, 4))
 
+#define SPINAND_PAGE_READ_FROM_CACHE_QUADIO_OP_3A(addr, ndummy, buf, len) \
+	SPI_MEM_OP(SPI_MEM_OP_CMD(0xeb, 1),				\
+		   SPI_MEM_OP_ADDR(3, addr, 4),				\
+		   SPI_MEM_OP_DUMMY(ndummy, 4),				\
+		   SPI_MEM_OP_DATA_IN(len, buf, 4))
+
 #define SPINAND_PROG_EXEC_OP(addr)					\
 	SPI_MEM_OP(SPI_MEM_OP_CMD(0x10, 1),				\
 		   SPI_MEM_OP_ADDR(3, addr, 1),				\
-- 
cgit v1.2.3


From 878844908e563a2f02b977bacd221c288e681c47 Mon Sep 17 00:00:00 2001
From: Jeff Kletsky <git-commits@allycomm.com>
Date: Wed, 22 May 2019 15:05:54 -0700
Subject: mtd: spinand: Add support for two-byte device IDs

The GigaDevice GD5F1GQ4UFxxG SPI NAND utilizes two-byte device IDs.

http://www.gigadevice.com/datasheet/gd5f1gq4xfxxg/

Signed-off-by: Jeff Kletsky <git-commits@allycomm.com>
Reviewed-by: Frieder Schrempf <frieder.schrempf@kontron.de>
Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
---
 drivers/mtd/nand/spi/core.c | 2 +-
 include/linux/mtd/spinand.h | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mtd/nand/spi/core.c b/drivers/mtd/nand/spi/core.c
index 4c15bb58c623..556bfdb34455 100644
--- a/drivers/mtd/nand/spi/core.c
+++ b/drivers/mtd/nand/spi/core.c
@@ -845,7 +845,7 @@ spinand_select_op_variant(struct spinand_device *spinand,
  */
 int spinand_match_and_init(struct spinand_device *spinand,
 			   const struct spinand_info *table,
-			   unsigned int table_size, u8 devid)
+			   unsigned int table_size, u16 devid)
 {
 	struct nand_device *nand = spinand_to_nand(spinand);
 	unsigned int i;
diff --git a/include/linux/mtd/spinand.h b/include/linux/mtd/spinand.h
index 8aa39ac41e8e..fbc0423bb4ae 100644
--- a/include/linux/mtd/spinand.h
+++ b/include/linux/mtd/spinand.h
@@ -290,7 +290,7 @@ struct spinand_ecc_info {
  */
 struct spinand_info {
 	const char *model;
-	u8 devid;
+	u16 devid;
 	u32 flags;
 	struct nand_memory_organization memorg;
 	struct nand_ecc_req eccreq;
@@ -452,7 +452,7 @@ static inline void spinand_set_of_node(struct spinand_device *spinand,
 
 int spinand_match_and_init(struct spinand_device *dev,
 			   const struct spinand_info *table,
-			   unsigned int table_size, u8 devid);
+			   unsigned int table_size, u16 devid);
 
 int spinand_upd_cfg(struct spinand_device *spinand, u8 mask, u8 val);
 int spinand_select_target(struct spinand_device *spinand, unsigned int target);
-- 
cgit v1.2.3


From 9f897bfdd89f5f08a12fa263a7f57fbf8ad9292f Mon Sep 17 00:00:00 2001
From: Kamal Dasu <kdasu.kdev@gmail.com>
Date: Thu, 16 May 2019 12:41:46 -0400
Subject: mtd: Add flag to indicate panic_write

Added a flag to indicate a panic_write so that low level drivers can
use it to take required action where applicable, to ensure oops data
gets written to assigned mtd device.

Signed-off-by: Kamal Dasu <kdasu.kdev@gmail.com>
Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
---
 drivers/mtd/mtdcore.c   | 3 +++
 include/linux/mtd/mtd.h | 6 ++++++
 2 files changed, 9 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/mtd/mtdcore.c b/drivers/mtd/mtdcore.c
index 453242d6cf56..408615f29e57 100644
--- a/drivers/mtd/mtdcore.c
+++ b/drivers/mtd/mtdcore.c
@@ -1124,6 +1124,9 @@ int mtd_panic_write(struct mtd_info *mtd, loff_t to, size_t len, size_t *retlen,
 		return -EROFS;
 	if (!len)
 		return 0;
+	if (!mtd->oops_panic_write)
+		mtd->oops_panic_write = true;
+
 	return mtd->_panic_write(mtd, to, len, retlen, buf);
 }
 EXPORT_SYMBOL_GPL(mtd_panic_write);
diff --git a/include/linux/mtd/mtd.h b/include/linux/mtd/mtd.h
index 936a3fdb48b5..4ca8c1c845fb 100644
--- a/include/linux/mtd/mtd.h
+++ b/include/linux/mtd/mtd.h
@@ -316,6 +316,12 @@ struct mtd_info {
 	int (*_get_device) (struct mtd_info *mtd);
 	void (*_put_device) (struct mtd_info *mtd);
 
+	/*
+	 * flag indicates a panic write, low level drivers can take appropriate
+	 * action if required to ensure writes go through
+	 */
+	bool oops_panic_write;
+
 	struct notifier_block reboot_notifier;  /* default mode before reboot */
 
 	/* ECC status information */
-- 
cgit v1.2.3


From 3552691616c940a7c4125c2678ba816653cd725e Mon Sep 17 00:00:00 2001
From: Jeff Kletsky <git-commits@allycomm.com>
Date: Tue, 18 Jun 2019 10:08:05 -0700
Subject: mtd: spinand: Add initial support for Paragon PN26G0xA

Add initial support for Paragon Technology
PN26G01Axxxxx and PN26G02Axxxxx SPI NAND

Datasheets available at
http://www.xtxtech.com/upfile/2016082517274590.pdf
http://www.xtxtech.com/upfile/2016082517282329.pdf

Signed-off-by: Jeff Kletsky <git-commits@allycomm.com>
Reviewed-by: Frieder Schrempf <frieder.schrempf@kontron.de>
Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
---
 drivers/mtd/nand/spi/Makefile  |   2 +-
 drivers/mtd/nand/spi/core.c    |   1 +
 drivers/mtd/nand/spi/paragon.c | 147 +++++++++++++++++++++++++++++++++++++++++
 include/linux/mtd/spinand.h    |   1 +
 4 files changed, 150 insertions(+), 1 deletion(-)
 create mode 100644 drivers/mtd/nand/spi/paragon.c

(limited to 'include/linux')

diff --git a/drivers/mtd/nand/spi/Makefile b/drivers/mtd/nand/spi/Makefile
index 753125082640..9662b9c1d5a9 100644
--- a/drivers/mtd/nand/spi/Makefile
+++ b/drivers/mtd/nand/spi/Makefile
@@ -1,3 +1,3 @@
 # SPDX-License-Identifier: GPL-2.0
-spinand-objs := core.o gigadevice.o macronix.o micron.o toshiba.o winbond.o
+spinand-objs := core.o gigadevice.o macronix.o micron.o paragon.o toshiba.o winbond.o
 obj-$(CONFIG_MTD_SPI_NAND) += spinand.o
diff --git a/drivers/mtd/nand/spi/core.c b/drivers/mtd/nand/spi/core.c
index 556bfdb34455..f0f3528aab8f 100644
--- a/drivers/mtd/nand/spi/core.c
+++ b/drivers/mtd/nand/spi/core.c
@@ -757,6 +757,7 @@ static const struct spinand_manufacturer *spinand_manufacturers[] = {
 	&gigadevice_spinand_manufacturer,
 	&macronix_spinand_manufacturer,
 	&micron_spinand_manufacturer,
+	&paragon_spinand_manufacturer,
 	&toshiba_spinand_manufacturer,
 	&winbond_spinand_manufacturer,
 };
diff --git a/drivers/mtd/nand/spi/paragon.c b/drivers/mtd/nand/spi/paragon.c
new file mode 100644
index 000000000000..52307681cbd0
--- /dev/null
+++ b/drivers/mtd/nand/spi/paragon.c
@@ -0,0 +1,147 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2019 Jeff Kletsky
+ *
+ * Author: Jeff Kletsky <git-commits@allycomm.com>
+ */
+
+#include <linux/device.h>
+#include <linux/kernel.h>
+#include <linux/mtd/spinand.h>
+
+
+#define SPINAND_MFR_PARAGON	0xa1
+
+
+#define PN26G0XA_STATUS_ECC_BITMASK		(3 << 4)
+
+#define PN26G0XA_STATUS_ECC_NONE_DETECTED	(0 << 4)
+#define PN26G0XA_STATUS_ECC_1_7_CORRECTED	(1 << 4)
+#define PN26G0XA_STATUS_ECC_ERRORED		(2 << 4)
+#define PN26G0XA_STATUS_ECC_8_CORRECTED		(3 << 4)
+
+
+static SPINAND_OP_VARIANTS(read_cache_variants,
+		SPINAND_PAGE_READ_FROM_CACHE_QUADIO_OP(0, 2, NULL, 0),
+		SPINAND_PAGE_READ_FROM_CACHE_X4_OP(0, 1, NULL, 0),
+		SPINAND_PAGE_READ_FROM_CACHE_DUALIO_OP(0, 1, NULL, 0),
+		SPINAND_PAGE_READ_FROM_CACHE_X2_OP(0, 1, NULL, 0),
+		SPINAND_PAGE_READ_FROM_CACHE_OP(true, 0, 1, NULL, 0),
+		SPINAND_PAGE_READ_FROM_CACHE_OP(false, 0, 1, NULL, 0));
+
+static SPINAND_OP_VARIANTS(write_cache_variants,
+		SPINAND_PROG_LOAD_X4(true, 0, NULL, 0),
+		SPINAND_PROG_LOAD(true, 0, NULL, 0));
+
+static SPINAND_OP_VARIANTS(update_cache_variants,
+		SPINAND_PROG_LOAD_X4(false, 0, NULL, 0),
+		SPINAND_PROG_LOAD(false, 0, NULL, 0));
+
+
+static int pn26g0xa_ooblayout_ecc(struct mtd_info *mtd, int section,
+				   struct mtd_oob_region *region)
+{
+	if (section > 3)
+		return -ERANGE;
+
+	region->offset = 6 + (15 * section); /* 4 BBM + 2 user bytes */
+	region->length = 13;
+
+	return 0;
+}
+
+static int pn26g0xa_ooblayout_free(struct mtd_info *mtd, int section,
+				   struct mtd_oob_region *region)
+{
+	if (section > 4)
+		return -ERANGE;
+
+	if (section == 4) {
+		region->offset = 64;
+		region->length = 64;
+	} else {
+		region->offset = 4 + (15 * section);
+		region->length = 2;
+	}
+
+	return 0;
+}
+
+static int pn26g0xa_ecc_get_status(struct spinand_device *spinand,
+				   u8 status)
+{
+	switch (status & PN26G0XA_STATUS_ECC_BITMASK) {
+	case PN26G0XA_STATUS_ECC_NONE_DETECTED:
+		return 0;
+
+	case PN26G0XA_STATUS_ECC_1_7_CORRECTED:
+		return 7;	/* Return upper limit by convention */
+
+	case PN26G0XA_STATUS_ECC_8_CORRECTED:
+		return 8;
+
+	case PN26G0XA_STATUS_ECC_ERRORED:
+		return -EBADMSG;
+
+	default:
+		break;
+	}
+
+	return -EINVAL;
+}
+
+static const struct mtd_ooblayout_ops pn26g0xa_ooblayout = {
+	.ecc = pn26g0xa_ooblayout_ecc,
+	.free = pn26g0xa_ooblayout_free,
+};
+
+
+static const struct spinand_info paragon_spinand_table[] = {
+	SPINAND_INFO("PN26G01A", 0xe1,
+		     NAND_MEMORG(1, 2048, 128, 64, 1024, 21, 1, 1, 1),
+		     NAND_ECCREQ(8, 512),
+		     SPINAND_INFO_OP_VARIANTS(&read_cache_variants,
+					      &write_cache_variants,
+					      &update_cache_variants),
+		     0,
+		     SPINAND_ECCINFO(&pn26g0xa_ooblayout,
+				     pn26g0xa_ecc_get_status)),
+	SPINAND_INFO("PN26G02A", 0xe2,
+		     NAND_MEMORG(1, 2048, 128, 64, 2048, 41, 1, 1, 1),
+		     NAND_ECCREQ(8, 512),
+		     SPINAND_INFO_OP_VARIANTS(&read_cache_variants,
+					      &write_cache_variants,
+					      &update_cache_variants),
+		     0,
+		     SPINAND_ECCINFO(&pn26g0xa_ooblayout,
+				     pn26g0xa_ecc_get_status)),
+};
+
+static int paragon_spinand_detect(struct spinand_device *spinand)
+{
+	u8 *id = spinand->id.data;
+	int ret;
+
+	/* Read ID returns [0][MID][DID] */
+
+	if (id[1] != SPINAND_MFR_PARAGON)
+		return 0;
+
+	ret = spinand_match_and_init(spinand, paragon_spinand_table,
+				     ARRAY_SIZE(paragon_spinand_table),
+				     id[2]);
+	if (ret)
+		return ret;
+
+	return 1;
+}
+
+static const struct spinand_manufacturer_ops paragon_spinand_manuf_ops = {
+	.detect = paragon_spinand_detect,
+};
+
+const struct spinand_manufacturer paragon_spinand_manufacturer = {
+	.id = SPINAND_MFR_PARAGON,
+	.name = "Paragon",
+	.ops = &paragon_spinand_manuf_ops,
+};
diff --git a/include/linux/mtd/spinand.h b/include/linux/mtd/spinand.h
index fbc0423bb4ae..4ea558bd3c46 100644
--- a/include/linux/mtd/spinand.h
+++ b/include/linux/mtd/spinand.h
@@ -227,6 +227,7 @@ struct spinand_manufacturer {
 extern const struct spinand_manufacturer gigadevice_spinand_manufacturer;
 extern const struct spinand_manufacturer macronix_spinand_manufacturer;
 extern const struct spinand_manufacturer micron_spinand_manufacturer;
+extern const struct spinand_manufacturer paragon_spinand_manufacturer;
 extern const struct spinand_manufacturer toshiba_spinand_manufacturer;
 extern const struct spinand_manufacturer winbond_spinand_manufacturer;
 
-- 
cgit v1.2.3


From 2c9858ecbeb1e68224290043445990e29337d4c0 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 26 Jun 2019 16:52:37 +0200
Subject: workqueue: Make alloc/apply/free_workqueue_attrs() static

None of those functions have any users outside of workqueue.c. Confine
them.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/linux/workqueue.h | 4 ----
 kernel/workqueue.c        | 7 +++----
 2 files changed, 3 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
index d59525fca4d3..b7c585b5ec1c 100644
--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -435,10 +435,6 @@ struct workqueue_struct *alloc_workqueue(const char *fmt,
 
 extern void destroy_workqueue(struct workqueue_struct *wq);
 
-struct workqueue_attrs *alloc_workqueue_attrs(gfp_t gfp_mask);
-void free_workqueue_attrs(struct workqueue_attrs *attrs);
-int apply_workqueue_attrs(struct workqueue_struct *wq,
-			  const struct workqueue_attrs *attrs);
 int workqueue_set_unbound_cpumask(cpumask_var_t cpumask);
 
 extern bool queue_work_on(int cpu, struct workqueue_struct *wq,
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 95aea04ff722..b8fa7afe6e7d 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -3329,7 +3329,7 @@ EXPORT_SYMBOL_GPL(execute_in_process_context);
  *
  * Undo alloc_workqueue_attrs().
  */
-void free_workqueue_attrs(struct workqueue_attrs *attrs)
+static void free_workqueue_attrs(struct workqueue_attrs *attrs)
 {
 	if (attrs) {
 		free_cpumask_var(attrs->cpumask);
@@ -3346,7 +3346,7 @@ void free_workqueue_attrs(struct workqueue_attrs *attrs)
  *
  * Return: The allocated new workqueue_attr on success. %NULL on failure.
  */
-struct workqueue_attrs *alloc_workqueue_attrs(gfp_t gfp_mask)
+static struct workqueue_attrs *alloc_workqueue_attrs(gfp_t gfp_mask)
 {
 	struct workqueue_attrs *attrs;
 
@@ -4033,7 +4033,7 @@ static int apply_workqueue_attrs_locked(struct workqueue_struct *wq,
  *
  * Return: 0 on success and -errno on failure.
  */
-int apply_workqueue_attrs(struct workqueue_struct *wq,
+static int apply_workqueue_attrs(struct workqueue_struct *wq,
 			  const struct workqueue_attrs *attrs)
 {
 	int ret;
@@ -4044,7 +4044,6 @@ int apply_workqueue_attrs(struct workqueue_struct *wq,
 
 	return ret;
 }
-EXPORT_SYMBOL_GPL(apply_workqueue_attrs);
 
 /**
  * wq_update_unbound_numa - update NUMA affinity of a wq for CPU hot[un]plug
-- 
cgit v1.2.3


From a58946c158a040068e7c94dc1d58bbd273258068 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Wed, 26 Jun 2019 21:02:33 +0100
Subject: keys: Pass the network namespace into request_key mechanism

Create a request_key_net() function and use it to pass the network
namespace domain tag into DNS resolver keys and rxrpc/AFS keys so that keys
for different domains can coexist in the same keyring.

Signed-off-by: David Howells <dhowells@redhat.com>
cc: netdev@vger.kernel.org
cc: linux-nfs@vger.kernel.org
cc: linux-cifs@vger.kernel.org
cc: linux-afs@lists.infradead.org
---
 Documentation/security/keys/core.rst        | 28 +++++++++++++----
 Documentation/security/keys/request-key.rst | 29 +++++++++++++-----
 fs/afs/addr_list.c                          |  4 +--
 fs/afs/dynroot.c                            |  8 +++--
 fs/cifs/dns_resolve.c                       |  3 +-
 fs/nfs/dns_resolve.c                        |  3 +-
 fs/nfs/nfs4idmap.c                          |  2 +-
 include/linux/dns_resolver.h                |  3 +-
 include/linux/key.h                         | 47 ++++++++++++++++++++++++++---
 net/ceph/messenger.c                        |  3 +-
 net/dns_resolver/dns_query.c                |  7 +++--
 net/rxrpc/key.c                             |  4 +--
 security/keys/internal.h                    |  1 +
 security/keys/keyctl.c                      |  2 +-
 security/keys/keyring.c                     | 11 ++++---
 security/keys/request_key.c                 | 39 ++++++++++++++++--------
 16 files changed, 145 insertions(+), 49 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/security/keys/core.rst b/Documentation/security/keys/core.rst
index ae930ae9d590..0e74f372e58c 100644
--- a/Documentation/security/keys/core.rst
+++ b/Documentation/security/keys/core.rst
@@ -1102,26 +1102,42 @@ payload contents" for more information.
     See also Documentation/security/keys/request-key.rst.
 
 
+ *  To search for a key in a specific domain, call::
+
+	struct key *request_key_tag(const struct key_type *type,
+				    const char *description,
+				    struct key_tag *domain_tag,
+				    const char *callout_info);
+
+    This is identical to request_key(), except that a domain tag may be
+    specified that causes the search algorithm to only match keys matching
+    that tag.  The domain_tag may be NULL, specifying a global domain that is
+    separate from any nominated domain.
+
+
  *  To search for a key, passing auxiliary data to the upcaller, call::
 
 	struct key *request_key_with_auxdata(const struct key_type *type,
 					     const char *description,
+					     struct key_tag *domain_tag,
 					     const void *callout_info,
 					     size_t callout_len,
 					     void *aux);
 
-    This is identical to request_key(), except that the auxiliary data is
-    passed to the key_type->request_key() op if it exists, and the callout_info
-    is a blob of length callout_len, if given (the length may be 0).
+    This is identical to request_key_tag(), except that the auxiliary data is
+    passed to the key_type->request_key() op if it exists, and the
+    callout_info is a blob of length callout_len, if given (the length may be
+    0).
 
 
  *  To search for a key under RCU conditions, call::
 
 	struct key *request_key_rcu(const struct key_type *type,
-				    const char *description);
+				    const char *description,
+				    struct key_tag *domain_tag);
 
-    which is similar to request_key() except that it does not check for keys
-    that are under construction and it will not call out to userspace to
+    which is similar to request_key_tag() except that it does not check for
+    keys that are under construction and it will not call out to userspace to
     construct a key if it can't find a match.
 
 
diff --git a/Documentation/security/keys/request-key.rst b/Documentation/security/keys/request-key.rst
index 5a210baa583a..35f2296b704a 100644
--- a/Documentation/security/keys/request-key.rst
+++ b/Documentation/security/keys/request-key.rst
@@ -13,10 +13,18 @@ The process starts by either the kernel requesting a service by calling
 				const char *description,
 				const char *callout_info);
 
+or::
+
+	struct key *request_key_tag(const struct key_type *type,
+				    const char *description,
+				    const struct key_tag *domain_tag,
+				    const char *callout_info);
+
 or::
 
 	struct key *request_key_with_auxdata(const struct key_type *type,
 					     const char *description,
+					     const struct key_tag *domain_tag,
 					     const char *callout_info,
 					     size_t callout_len,
 					     void *aux);
@@ -24,7 +32,8 @@ or::
 or::
 
 	struct key *request_key_rcu(const struct key_type *type,
-				    const char *description);
+				    const char *description,
+				    const struct key_tag *domain_tag);
 
 Or by userspace invoking the request_key system call::
 
@@ -38,14 +47,18 @@ does not need to link the key to a keyring to prevent it from being immediately
 destroyed.  The kernel interface returns a pointer directly to the key, and
 it's up to the caller to destroy the key.
 
-The request_key_with_auxdata() calls is like the in-kernel request_key() call,
-except that they permit auxiliary data to be passed to the upcaller (the
-default is NULL).  This is only useful for those key types that define their
-own upcall mechanism rather than using /sbin/request-key.
+The request_key_tag() call is like the in-kernel request_key(), except that it
+also takes a domain tag that allows keys to be separated by namespace and
+killed off as a group.
+
+The request_key_with_auxdata() call is like the request_key_tag() call, except
+that it permits auxiliary data to be passed to the upcaller (the default is
+NULL).  This is only useful for those key types that define their own upcall
+mechanism rather than using /sbin/request-key.
 
-The request_key_rcu() call is like the in-kernel request_key() call, except
-that it doesn't check for keys that are under construction and doesn't attempt
-to construct missing keys.
+The request_key_rcu() call is like the request_key_tag() call, except that it
+doesn't check for keys that are under construction and doesn't attempt to
+construct missing keys.
 
 The userspace interface links the key to a keyring associated with the process
 to prevent the key from going away, and returns the serial number of the key to
diff --git a/fs/afs/addr_list.c b/fs/afs/addr_list.c
index 9eaff55df7b4..6b1e8fc6c954 100644
--- a/fs/afs/addr_list.c
+++ b/fs/afs/addr_list.c
@@ -250,8 +250,8 @@ struct afs_vlserver_list *afs_dns_query(struct afs_cell *cell, time64_t *_expiry
 
 	_enter("%s", cell->name);
 
-	ret = dns_query("afsdb", cell->name, cell->name_len, "srv=1",
-			&result, _expiry, true);
+	ret = dns_query(cell->net->net, "afsdb", cell->name, cell->name_len,
+			"srv=1", &result, _expiry, true);
 	if (ret < 0) {
 		_leave(" = %d [dns]", ret);
 		return ERR_PTR(ret);
diff --git a/fs/afs/dynroot.c b/fs/afs/dynroot.c
index af1689d1f32e..b075605b0c45 100644
--- a/fs/afs/dynroot.c
+++ b/fs/afs/dynroot.c
@@ -28,6 +28,7 @@ const struct file_operations afs_dynroot_file_operations = {
 static int afs_probe_cell_name(struct dentry *dentry)
 {
 	struct afs_cell *cell;
+	struct afs_net *net = afs_d2net(dentry);
 	const char *name = dentry->d_name.name;
 	size_t len = dentry->d_name.len;
 	int ret;
@@ -40,13 +41,14 @@ static int afs_probe_cell_name(struct dentry *dentry)
 		len--;
 	}
 
-	cell = afs_lookup_cell_rcu(afs_d2net(dentry), name, len);
+	cell = afs_lookup_cell_rcu(net, name, len);
 	if (!IS_ERR(cell)) {
-		afs_put_cell(afs_d2net(dentry), cell);
+		afs_put_cell(net, cell);
 		return 0;
 	}
 
-	ret = dns_query("afsdb", name, len, "srv=1", NULL, NULL, false);
+	ret = dns_query(net->net, "afsdb", name, len, "srv=1",
+			NULL, NULL, false);
 	if (ret == -ENODATA)
 		ret = -EDESTADDRREQ;
 	return ret;
diff --git a/fs/cifs/dns_resolve.c b/fs/cifs/dns_resolve.c
index 1e21b2528cfb..534cbba72789 100644
--- a/fs/cifs/dns_resolve.c
+++ b/fs/cifs/dns_resolve.c
@@ -77,7 +77,8 @@ dns_resolve_server_name_to_ip(const char *unc, char **ip_addr)
 		goto name_is_IP_address;
 
 	/* Perform the upcall */
-	rc = dns_query(NULL, hostname, len, NULL, ip_addr, NULL, false);
+	rc = dns_query(current->nsproxy->net_ns, NULL, hostname, len,
+		       NULL, ip_addr, NULL, false);
 	if (rc < 0)
 		cifs_dbg(FYI, "%s: unable to resolve: %*.*s\n",
 			 __func__, len, len, hostname);
diff --git a/fs/nfs/dns_resolve.c b/fs/nfs/dns_resolve.c
index e6a700f01452..aec769a500a1 100644
--- a/fs/nfs/dns_resolve.c
+++ b/fs/nfs/dns_resolve.c
@@ -22,7 +22,8 @@ ssize_t nfs_dns_resolve_name(struct net *net, char *name, size_t namelen,
 	char *ip_addr = NULL;
 	int ip_len;
 
-	ip_len = dns_query(NULL, name, namelen, NULL, &ip_addr, NULL, false);
+	ip_len = dns_query(net, NULL, name, namelen, NULL, &ip_addr, NULL,
+			   false);
 	if (ip_len > 0)
 		ret = rpc_pton(net, ip_addr, ip_len, sa, salen);
 	else
diff --git a/fs/nfs/nfs4idmap.c b/fs/nfs/nfs4idmap.c
index 4884fdae28fb..1e7296395d71 100644
--- a/fs/nfs/nfs4idmap.c
+++ b/fs/nfs/nfs4idmap.c
@@ -291,7 +291,7 @@ static struct key *nfs_idmap_request_key(const char *name, size_t namelen,
 	if (IS_ERR(rkey)) {
 		mutex_lock(&idmap->idmap_mutex);
 		rkey = request_key_with_auxdata(&key_type_id_resolver_legacy,
-						desc, "", 0, idmap);
+						desc, NULL, "", 0, idmap);
 		mutex_unlock(&idmap->idmap_mutex);
 	}
 	if (!IS_ERR(rkey))
diff --git a/include/linux/dns_resolver.h b/include/linux/dns_resolver.h
index f2b3ae22e6b7..976cbbdb2832 100644
--- a/include/linux/dns_resolver.h
+++ b/include/linux/dns_resolver.h
@@ -26,7 +26,8 @@
 
 #include <uapi/linux/dns_resolver.h>
 
-extern int dns_query(const char *type, const char *name, size_t namelen,
+struct net;
+extern int dns_query(struct net *net, const char *type, const char *name, size_t namelen,
 		     const char *options, char **_result, time64_t *_expiry,
 		     bool invalidate);
 
diff --git a/include/linux/key.h b/include/linux/key.h
index 60c076c6e47f..18d7f62ab6b0 100644
--- a/include/linux/key.h
+++ b/include/linux/key.h
@@ -36,6 +36,7 @@ typedef int32_t key_serial_t;
 typedef uint32_t key_perm_t;
 
 struct key;
+struct net;
 
 #ifdef CONFIG_KEYS
 
@@ -296,19 +297,57 @@ static inline void key_ref_put(key_ref_t key_ref)
 	key_put(key_ref_to_ptr(key_ref));
 }
 
-extern struct key *request_key(struct key_type *type,
-			       const char *description,
-			       const char *callout_info);
+extern struct key *request_key_tag(struct key_type *type,
+				   const char *description,
+				   struct key_tag *domain_tag,
+				   const char *callout_info);
 
 extern struct key *request_key_rcu(struct key_type *type,
-				   const char *description);
+				   const char *description,
+				   struct key_tag *domain_tag);
 
 extern struct key *request_key_with_auxdata(struct key_type *type,
 					    const char *description,
+					    struct key_tag *domain_tag,
 					    const void *callout_info,
 					    size_t callout_len,
 					    void *aux);
 
+/**
+ * request_key - Request a key and wait for construction
+ * @type: Type of key.
+ * @description: The searchable description of the key.
+ * @callout_info: The data to pass to the instantiation upcall (or NULL).
+ *
+ * As for request_key_tag(), but with the default global domain tag.
+ */
+static inline struct key *request_key(struct key_type *type,
+				      const char *description,
+				      const char *callout_info)
+{
+	return request_key_tag(type, description, NULL, callout_info);
+}
+
+#ifdef CONFIG_NET
+/**
+ * request_key_net - Request a key for a net namespace and wait for construction
+ * @type: Type of key.
+ * @description: The searchable description of the key.
+ * @net: The network namespace that is the key's domain of operation.
+ * @callout_info: The data to pass to the instantiation upcall (or NULL).
+ *
+ * As for request_key() except that it does not add the returned key to a
+ * keyring if found, new keys are always allocated in the user's quota, the
+ * callout_info must be a NUL-terminated string and no auxiliary data can be
+ * passed.  Only keys that operate in the specified network namespace are used.
+ *
+ * Furthermore, it then works as wait_for_key_construction() to wait for the
+ * completion of keys undergoing construction with a non-interruptible wait.
+ */
+#define request_key_net(type, description, net, callout_info) \
+	request_key_tag(type, description, (net)->key_domain, callout_info)
+#endif /* CONFIG_NET */
+
 extern int wait_for_key_construction(struct key *key, bool intr);
 
 extern int key_validate(const struct key *key);
diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index cd0b094468b6..a33402c99321 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -1887,7 +1887,8 @@ static int ceph_dns_resolve_name(const char *name, size_t namelen,
 		return -EINVAL;
 
 	/* do dns_resolve upcall */
-	ip_len = dns_query(NULL, name, end - name, NULL, &ip_addr, NULL, false);
+	ip_len = dns_query(current->nsproxy->net_ns,
+			   NULL, name, end - name, NULL, &ip_addr, NULL, false);
 	if (ip_len > 0)
 		ret = ceph_pton(ip_addr, ip_len, addr, -1, NULL);
 	else
diff --git a/net/dns_resolver/dns_query.c b/net/dns_resolver/dns_query.c
index 2d260432b3be..cab4e0df924f 100644
--- a/net/dns_resolver/dns_query.c
+++ b/net/dns_resolver/dns_query.c
@@ -40,6 +40,7 @@
 #include <linux/cred.h>
 #include <linux/dns_resolver.h>
 #include <linux/err.h>
+#include <net/net_namespace.h>
 
 #include <keys/dns_resolver-type.h>
 #include <keys/user-type.h>
@@ -48,6 +49,7 @@
 
 /**
  * dns_query - Query the DNS
+ * @net: The network namespace to operate in.
  * @type: Query type (or NULL for straight host->IP lookup)
  * @name: Name to look up
  * @namelen: Length of name
@@ -69,7 +71,8 @@
  *
  * Returns the size of the result on success, -ve error code otherwise.
  */
-int dns_query(const char *type, const char *name, size_t namelen,
+int dns_query(struct net *net,
+	      const char *type, const char *name, size_t namelen,
 	      const char *options, char **_result, time64_t *_expiry,
 	      bool invalidate)
 {
@@ -122,7 +125,7 @@ int dns_query(const char *type, const char *name, size_t namelen,
 	 * add_key() to preinstall malicious redirections
 	 */
 	saved_cred = override_creds(dns_resolver_cache);
-	rkey = request_key(&key_type_dns_resolver, desc, options);
+	rkey = request_key_net(&key_type_dns_resolver, desc, net, options);
 	revert_creds(saved_cred);
 	kfree(desc);
 	if (IS_ERR(rkey)) {
diff --git a/net/rxrpc/key.c b/net/rxrpc/key.c
index 2722189ec273..1cc6b0c6cc42 100644
--- a/net/rxrpc/key.c
+++ b/net/rxrpc/key.c
@@ -914,7 +914,7 @@ int rxrpc_request_key(struct rxrpc_sock *rx, char __user *optval, int optlen)
 	if (IS_ERR(description))
 		return PTR_ERR(description);
 
-	key = request_key(&key_type_rxrpc, description, NULL);
+	key = request_key_net(&key_type_rxrpc, description, sock_net(&rx->sk), NULL);
 	if (IS_ERR(key)) {
 		kfree(description);
 		_leave(" = %ld", PTR_ERR(key));
@@ -945,7 +945,7 @@ int rxrpc_server_keyring(struct rxrpc_sock *rx, char __user *optval,
 	if (IS_ERR(description))
 		return PTR_ERR(description);
 
-	key = request_key(&key_type_keyring, description, NULL);
+	key = request_key_net(&key_type_keyring, description, sock_net(&rx->sk), NULL);
 	if (IS_ERR(key)) {
 		kfree(description);
 		_leave(" = %ld", PTR_ERR(key));
diff --git a/security/keys/internal.h b/security/keys/internal.h
index 5a561f5f199e..f1f2b076f3a1 100644
--- a/security/keys/internal.h
+++ b/security/keys/internal.h
@@ -156,6 +156,7 @@ extern int install_session_keyring_to_cred(struct cred *, struct key *);
 
 extern struct key *request_key_and_link(struct key_type *type,
 					const char *description,
+					struct key_tag *domain_tag,
 					const void *callout_info,
 					size_t callout_len,
 					void *aux,
diff --git a/security/keys/keyctl.c b/security/keys/keyctl.c
index 4bb5781d3ddf..d2f8eabcbcf4 100644
--- a/security/keys/keyctl.c
+++ b/security/keys/keyctl.c
@@ -224,7 +224,7 @@ SYSCALL_DEFINE4(request_key, const char __user *, _type,
 	}
 
 	/* do the search */
-	key = request_key_and_link(ktype, description, callout_info,
+	key = request_key_and_link(ktype, description, NULL, callout_info,
 				   callout_len, NULL, key_ref_to_ptr(dest_ref),
 				   KEY_ALLOC_IN_QUOTA);
 	if (IS_ERR(key)) {
diff --git a/security/keys/keyring.c b/security/keys/keyring.c
index bca070f6ab46..29c31585ed61 100644
--- a/security/keys/keyring.c
+++ b/security/keys/keyring.c
@@ -222,10 +222,13 @@ void key_set_index_key(struct keyring_index_key *index_key)
 
 	memcpy(index_key->desc, index_key->description, n);
 
-	if (index_key->type->flags & KEY_TYPE_NET_DOMAIN)
-		index_key->domain_tag = current->nsproxy->net_ns->key_domain;
-	else
-		index_key->domain_tag = &default_domain_tag;
+	if (!index_key->domain_tag) {
+		if (index_key->type->flags & KEY_TYPE_NET_DOMAIN)
+			index_key->domain_tag = current->nsproxy->net_ns->key_domain;
+		else
+			index_key->domain_tag = &default_domain_tag;
+	}
+
 	hash_key_type_and_desc(index_key);
 }
 
diff --git a/security/keys/request_key.c b/security/keys/request_key.c
index 9201ca96c4df..aa589d3c90e2 100644
--- a/security/keys/request_key.c
+++ b/security/keys/request_key.c
@@ -17,6 +17,7 @@
 #include <linux/err.h>
 #include <linux/keyctl.h>
 #include <linux/slab.h>
+#include <net/net_namespace.h>
 #include "internal.h"
 #include <keys/request_key_auth-type.h>
 
@@ -533,16 +534,18 @@ error:
  * request_key_and_link - Request a key and cache it in a keyring.
  * @type: The type of key we want.
  * @description: The searchable description of the key.
+ * @domain_tag: The domain in which the key operates.
  * @callout_info: The data to pass to the instantiation upcall (or NULL).
  * @callout_len: The length of callout_info.
  * @aux: Auxiliary data for the upcall.
  * @dest_keyring: Where to cache the key.
  * @flags: Flags to key_alloc().
  *
- * A key matching the specified criteria is searched for in the process's
- * keyrings and returned with its usage count incremented if found.  Otherwise,
- * if callout_info is not NULL, a key will be allocated and some service
- * (probably in userspace) will be asked to instantiate it.
+ * A key matching the specified criteria (type, description, domain_tag) is
+ * searched for in the process's keyrings and returned with its usage count
+ * incremented if found.  Otherwise, if callout_info is not NULL, a key will be
+ * allocated and some service (probably in userspace) will be asked to
+ * instantiate it.
  *
  * If successfully found or created, the key will be linked to the destination
  * keyring if one is provided.
@@ -558,6 +561,7 @@ error:
  */
 struct key *request_key_and_link(struct key_type *type,
 				 const char *description,
+				 struct key_tag *domain_tag,
 				 const void *callout_info,
 				 size_t callout_len,
 				 void *aux,
@@ -566,6 +570,7 @@ struct key *request_key_and_link(struct key_type *type,
 {
 	struct keyring_search_context ctx = {
 		.index_key.type		= type,
+		.index_key.domain_tag	= domain_tag,
 		.index_key.description	= description,
 		.index_key.desc_len	= strlen(description),
 		.cred			= current_cred(),
@@ -672,9 +677,10 @@ int wait_for_key_construction(struct key *key, bool intr)
 EXPORT_SYMBOL(wait_for_key_construction);
 
 /**
- * request_key - Request a key and wait for construction
+ * request_key_tag - Request a key and wait for construction
  * @type: Type of key.
  * @description: The searchable description of the key.
+ * @domain_tag: The domain in which the key operates.
  * @callout_info: The data to pass to the instantiation upcall (or NULL).
  *
  * As for request_key_and_link() except that it does not add the returned key
@@ -685,9 +691,10 @@ EXPORT_SYMBOL(wait_for_key_construction);
  * Furthermore, it then works as wait_for_key_construction() to wait for the
  * completion of keys undergoing construction with a non-interruptible wait.
  */
-struct key *request_key(struct key_type *type,
-			const char *description,
-			const char *callout_info)
+struct key *request_key_tag(struct key_type *type,
+			    const char *description,
+			    struct key_tag *domain_tag,
+			    const char *callout_info)
 {
 	struct key *key;
 	size_t callout_len = 0;
@@ -695,7 +702,8 @@ struct key *request_key(struct key_type *type,
 
 	if (callout_info)
 		callout_len = strlen(callout_info);
-	key = request_key_and_link(type, description, callout_info, callout_len,
+	key = request_key_and_link(type, description, domain_tag,
+				   callout_info, callout_len,
 				   NULL, NULL, KEY_ALLOC_IN_QUOTA);
 	if (!IS_ERR(key)) {
 		ret = wait_for_key_construction(key, false);
@@ -706,12 +714,13 @@ struct key *request_key(struct key_type *type,
 	}
 	return key;
 }
-EXPORT_SYMBOL(request_key);
+EXPORT_SYMBOL(request_key_tag);
 
 /**
  * request_key_with_auxdata - Request a key with auxiliary data for the upcaller
  * @type: The type of key we want.
  * @description: The searchable description of the key.
+ * @domain_tag: The domain in which the key operates.
  * @callout_info: The data to pass to the instantiation upcall (or NULL).
  * @callout_len: The length of callout_info.
  * @aux: Auxiliary data for the upcall.
@@ -724,6 +733,7 @@ EXPORT_SYMBOL(request_key);
  */
 struct key *request_key_with_auxdata(struct key_type *type,
 				     const char *description,
+				     struct key_tag *domain_tag,
 				     const void *callout_info,
 				     size_t callout_len,
 				     void *aux)
@@ -731,7 +741,8 @@ struct key *request_key_with_auxdata(struct key_type *type,
 	struct key *key;
 	int ret;
 
-	key = request_key_and_link(type, description, callout_info, callout_len,
+	key = request_key_and_link(type, description, domain_tag,
+				   callout_info, callout_len,
 				   aux, NULL, KEY_ALLOC_IN_QUOTA);
 	if (!IS_ERR(key)) {
 		ret = wait_for_key_construction(key, false);
@@ -748,6 +759,7 @@ EXPORT_SYMBOL(request_key_with_auxdata);
  * request_key_rcu - Request key from RCU-read-locked context
  * @type: The type of key we want.
  * @description: The name of the key we want.
+ * @domain_tag: The domain in which the key operates.
  *
  * Request a key from a context that we may not sleep in (such as RCU-mode
  * pathwalk).  Keys under construction are ignored.
@@ -755,10 +767,13 @@ EXPORT_SYMBOL(request_key_with_auxdata);
  * Return a pointer to the found key if successful, -ENOKEY if we couldn't find
  * a key or some other error if the key found was unsuitable or inaccessible.
  */
-struct key *request_key_rcu(struct key_type *type, const char *description)
+struct key *request_key_rcu(struct key_type *type,
+			    const char *description,
+			    struct key_tag *domain_tag)
 {
 	struct keyring_search_context ctx = {
 		.index_key.type		= type,
+		.index_key.domain_tag	= domain_tag,
 		.index_key.description	= description,
 		.index_key.desc_len	= strlen(description),
 		.cred			= current_cred(),
-- 
cgit v1.2.3


From 2e12256b9a76584fa3a6da19210509d4775aee36 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Thu, 27 Jun 2019 23:03:07 +0100
Subject: keys: Replace uid/gid/perm permissions checking with an ACL

Replace the uid/gid/perm permissions checking on a key with an ACL to allow
the SETATTR and SEARCH permissions to be split.  This will also allow a
greater range of subjects to be represented.

============
WHY DO THIS?
============

The problem is that SETATTR and SEARCH cover a slew of actions, not all of
which should be grouped together.

For SETATTR, this includes actions that are about controlling access to a
key:

 (1) Changing a key's ownership.

 (2) Changing a key's security information.

 (3) Setting a keyring's restriction.

And actions that are about managing a key's lifetime:

 (4) Setting an expiry time.

 (5) Revoking a key.

and (proposed) managing a key as part of a cache:

 (6) Invalidating a key.

Managing a key's lifetime doesn't really have anything to do with
controlling access to that key.

Expiry time is awkward since it's more about the lifetime of the content
and so, in some ways goes better with WRITE permission.  It can, however,
be set unconditionally by a process with an appropriate authorisation token
for instantiating a key, and can also be set by the key type driver when a
key is instantiated, so lumping it with the access-controlling actions is
probably okay.

As for SEARCH permission, that currently covers:

 (1) Finding keys in a keyring tree during a search.

 (2) Permitting keyrings to be joined.

 (3) Invalidation.

But these don't really belong together either, since these actions really
need to be controlled separately.

Finally, there are a number of special cases to do with granting the
administrator special rights to invalidate or clear keys that I would like
to handle with the ACL rather than key flags and special checks.


===============
WHAT IS CHANGED
===============

The SETATTR permission is split to create two new permissions:

 (1) SET_SECURITY - which allows the key's owner, group and ACL to be
     changed and a restriction to be placed on a keyring.

 (2) REVOKE - which allows a key to be revoked.

The SEARCH permission is split to create:

 (1) SEARCH - which allows a keyring to be searched and a key to be found.

 (2) JOIN - which allows a keyring to be joined as a session keyring.

 (3) INVAL - which allows a key to be invalidated.

The WRITE permission is also split to create:

 (1) WRITE - which allows a key's content to be altered and links to be
     added, removed and replaced in a keyring.

 (2) CLEAR - which allows a keyring to be cleared completely.  This is
     split out to make it possible to give just this to an administrator.

 (3) REVOKE - see above.


Keys acquire ACLs which consist of a series of ACEs, and all that apply are
unioned together.  An ACE specifies a subject, such as:

 (*) Possessor - permitted to anyone who 'possesses' a key
 (*) Owner - permitted to the key owner
 (*) Group - permitted to the key group
 (*) Everyone - permitted to everyone

Note that 'Other' has been replaced with 'Everyone' on the assumption that
you wouldn't grant a permit to 'Other' that you wouldn't also grant to
everyone else.

Further subjects may be made available by later patches.

The ACE also specifies a permissions mask.  The set of permissions is now:

	VIEW		Can view the key metadata
	READ		Can read the key content
	WRITE		Can update/modify the key content
	SEARCH		Can find the key by searching/requesting
	LINK		Can make a link to the key
	SET_SECURITY	Can change owner, ACL, expiry
	INVAL		Can invalidate
	REVOKE		Can revoke
	JOIN		Can join this keyring
	CLEAR		Can clear this keyring


The KEYCTL_SETPERM function is then deprecated.

The KEYCTL_SET_TIMEOUT function then is permitted if SET_SECURITY is set,
or if the caller has a valid instantiation auth token.

The KEYCTL_INVALIDATE function then requires INVAL.

The KEYCTL_REVOKE function then requires REVOKE.

The KEYCTL_JOIN_SESSION_KEYRING function then requires JOIN to join an
existing keyring.

The JOIN permission is enabled by default for session keyrings and manually
created keyrings only.


======================
BACKWARD COMPATIBILITY
======================

To maintain backward compatibility, KEYCTL_SETPERM will translate the
permissions mask it is given into a new ACL for a key - unless
KEYCTL_SET_ACL has been called on that key, in which case an error will be
returned.

It will convert possessor, owner, group and other permissions into separate
ACEs, if each portion of the mask is non-zero.

SETATTR permission turns on all of INVAL, REVOKE and SET_SECURITY.  WRITE
permission turns on WRITE, REVOKE and, if a keyring, CLEAR.  JOIN is turned
on if a keyring is being altered.

The KEYCTL_DESCRIBE function translates the ACL back into a permissions
mask to return depending on possessor, owner, group and everyone ACEs.

It will make the following mappings:

 (1) INVAL, JOIN -> SEARCH

 (2) SET_SECURITY -> SETATTR

 (3) REVOKE -> WRITE if SETATTR isn't already set

 (4) CLEAR -> WRITE

Note that the value subsequently returned by KEYCTL_DESCRIBE may not match
the value set with KEYCTL_SETPERM.


=======
TESTING
=======

This passes the keyutils testsuite for all but a couple of tests:

 (1) tests/keyctl/dh_compute/badargs: The first wrong-key-type test now
     returns EOPNOTSUPP rather than ENOKEY as READ permission isn't removed
     if the type doesn't have ->read().  You still can't actually read the
     key.

 (2) tests/keyctl/permitting/valid: The view-other-permissions test doesn't
     work as Other has been replaced with Everyone in the ACL.

Signed-off-by: David Howells <dhowells@redhat.com>
---
 Documentation/security/keys/core.rst               | 128 ++++++++---
 Documentation/security/keys/request-key.rst        |   9 +-
 certs/blacklist.c                                  |   7 +-
 certs/system_keyring.c                             |  12 +-
 drivers/md/dm-crypt.c                              |   2 +-
 drivers/nvdimm/security.c                          |   2 +-
 fs/afs/security.c                                  |   2 +-
 fs/cifs/cifs_spnego.c                              |  25 ++-
 fs/cifs/cifsacl.c                                  |  28 ++-
 fs/cifs/connect.c                                  |   4 +-
 fs/crypto/keyinfo.c                                |   2 +-
 fs/ecryptfs/ecryptfs_kernel.h                      |   2 +-
 fs/ecryptfs/keystore.c                             |   2 +-
 fs/fscache/object-list.c                           |   2 +-
 fs/nfs/nfs4idmap.c                                 |  30 ++-
 fs/ubifs/auth.c                                    |   2 +-
 include/linux/key.h                                | 121 ++++++-----
 include/uapi/linux/keyctl.h                        |  63 ++++++
 lib/digsig.c                                       |   2 +-
 net/ceph/ceph_common.c                             |   2 +-
 net/dns_resolver/dns_key.c                         |  12 +-
 net/dns_resolver/dns_query.c                       |  15 +-
 net/rxrpc/key.c                                    |  19 +-
 net/wireless/reg.c                                 |   6 +-
 security/integrity/digsig.c                        |  31 ++-
 security/integrity/digsig_asymmetric.c             |   2 +-
 security/integrity/evm/evm_crypto.c                |   2 +-
 security/integrity/ima/ima_mok.c                   |  13 +-
 security/integrity/integrity.h                     |   6 +-
 .../integrity/platform_certs/platform_keyring.c    |  14 +-
 security/keys/encrypted-keys/encrypted.c           |   2 +-
 security/keys/encrypted-keys/masterkey_trusted.c   |   2 +-
 security/keys/gc.c                                 |   2 +-
 security/keys/internal.h                           |  11 +-
 security/keys/key.c                                |  29 +--
 security/keys/keyctl.c                             |  96 +++++---
 security/keys/keyring.c                            |  27 ++-
 security/keys/permission.c                         | 242 ++++++++++++++++++---
 security/keys/persistent.c                         |  27 ++-
 security/keys/proc.c                               |  22 +-
 security/keys/process_keys.c                       |  86 ++++++--
 security/keys/request_key.c                        |  34 ++-
 security/keys/request_key_auth.c                   |  15 +-
 security/selinux/hooks.c                           |  16 +-
 security/smack/smack_lsm.c                         |   3 +-
 45 files changed, 857 insertions(+), 324 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/security/keys/core.rst b/Documentation/security/keys/core.rst
index 0e74f372e58c..1b3c907980ad 100644
--- a/Documentation/security/keys/core.rst
+++ b/Documentation/security/keys/core.rst
@@ -57,9 +57,9 @@ Each key has a number of attributes:
      type provides an operation to perform a match between the description on a
      key and a criterion string.
 
-  *  Each key has an owner user ID, a group ID and a permissions mask. These
-     are used to control what a process may do to a key from userspace, and
-     whether a kernel service will be able to find the key.
+  *  Each key has an owner user ID, a group ID and an ACL.  These are used to
+     control what a process may do to a key from userspace, and whether a
+     kernel service will be able to find the key.
 
   *  Each key can be set to expire at a specific time by the key type's
      instantiation function. Keys can also be immortal.
@@ -198,43 +198,110 @@ The key service provides a number of features besides keys:
 Key Access Permissions
 ======================
 
-Keys have an owner user ID, a group access ID, and a permissions mask. The mask
-has up to eight bits each for possessor, user, group and other access. Only
-six of each set of eight bits are defined. These permissions granted are:
+Keys have an owner user ID, a group ID and an ACL.  The ACL is made up of a
+sequence of ACEs that each contain three elements:
 
-  *  View
+  * The type of subject.
+  * The subject.
 
-     This permits a key or keyring's attributes to be viewed - including key
-     type and description.
+    These two together indicate the subject to whom the permits are granted.
+    The type can be one of:
 
-  *  Read
+     * ``KEY_ACE_SUBJ_STANDARD``
 
-     This permits a key's payload to be viewed or a keyring's list of linked
-     keys.
+       The subject is a standard 'macro' type.  The subject can be one of:
+
+        * ``KEY_ACE_EVERYONE``
+
+	  The permits are granted to everyone.  It replaces the old 'other'
+	  type on the assumption that you wouldn't grant a permission to other
+	  that you wouldn't grant to everyone else.
+
+	* ``KEY_ACE_OWNER``
+
+	  The permits are granted to the owner of the key (key->uid).
+
+	* ``KEY_ACE_GROUP``
+
+	  The permits are granted to the key's group (key->gid).
+
+	* ``KEY_ACE_POSSESSOR``
+
+	  The permits are granted to anyone who possesses the key.
+
+  * The set of permits granted to the subject.  These include:
+
+     * ``KEY_ACE_VIEW``
+
+       This permits a key or keyring's attributes to be viewed - including the
+       key type and description.
+
+     * ``KEY_ACE_READ``
+
+       This permits a key's payload to be viewed or a keyring's list of linked
+       keys.
 
-  *  Write
+     * ``KEY_ACE_WRITE``
 
-     This permits a key's payload to be instantiated or updated, or it allows a
-     link to be added to or removed from a keyring.
+       This permits a key's payload to be instantiated or updated, or it allows
+       a link to be added to or removed from a keyring.
 
-  *  Search
+     * ``KEY_ACE_SEARCH``
 
-     This permits keyrings to be searched and keys to be found. Searches can
-     only recurse into nested keyrings that have search permission set.
+       This permits keyrings to be searched and keys to be found. Searches can
+       only recurse into nested keyrings that have search permission set.
 
-  *  Link
+     * ``KEY_ACE_LINK``
 
-     This permits a key or keyring to be linked to. To create a link from a
-     keyring to a key, a process must have Write permission on the keyring and
-     Link permission on the key.
+       This permits a key or keyring to be linked to. To create a link from a
+       keyring to a key, a process must have Write permission on the keyring
+       and Link permission on the key.
 
-  *  Set Attribute
+     * ``KEY_ACE_SET_SECURITY``
 
-     This permits a key's UID, GID and permissions mask to be changed.
+       This permits a key's UID, GID and permissions mask to be changed.
+
+     * ``KEY_ACE_INVAL``
+
+       This permits a key to be invalidated with KEYCTL_INVALIDATE.
+
+     * ``KEY_ACE_REVOKE``
+
+       This permits a key to be revoked with KEYCTL_REVOKE.
+
+     * ``KEY_ACE_JOIN``
+
+       This permits a keyring to be joined as a session by
+       KEYCTL_JOIN_SESSION_KEYRING or KEYCTL_SESSION_TO_PARENT.
+
+     * ``KEY_ACE_CLEAR``
+
+       This permits a keyring to be cleared.
 
 For changing the ownership, group ID or permissions mask, being the owner of
 the key or having the sysadmin capability is sufficient.
 
+The legacy KEYCTL_SETPERM and KEYCTL_DESCRIBE functions can only see/generate
+View, Read, Write, Search, Link and SetAttr permits, and do this for each of
+possessor, user, group and other permission sets as a 32-bit flag mask.  These
+will be approximated/inferred:
+
+	SETPERM Permit	Implied ACE Permit
+	===============	=======================
+	Search		Inval, Join
+	Write		Revoke, Clear
+	Setattr		Set Security, Revoke
+
+	ACE Permit	Described as
+	===============	=======================
+	Inval		Search
+	Join		Search
+	Revoke		Write (unless Setattr)
+	Clear		write
+	Set Security	Setattr
+
+'Other' will be approximated as/inferred from the 'Everyone' subject.
+
 
 SELinux Support
 ===============
@@ -1084,7 +1151,8 @@ payload contents" for more information.
 
 	struct key *request_key(const struct key_type *type,
 				const char *description,
-				const char *callout_info);
+				const char *callout_info,
+				struct key_acl *acl);
 
     This is used to request a key or keyring with a description that matches
     the description specified according to the key type's match_preparse()
@@ -1099,6 +1167,8 @@ payload contents" for more information.
     If successful, the key will have been attached to the default keyring for
     implicitly obtained request-key keys, as set by KEYCTL_SET_REQKEY_KEYRING.
 
+    If a key is created, it will be given the specified ACL.
+
     See also Documentation/security/keys/request-key.rst.
 
 
@@ -1107,7 +1177,8 @@ payload contents" for more information.
 	struct key *request_key_tag(const struct key_type *type,
 				    const char *description,
 				    struct key_tag *domain_tag,
-				    const char *callout_info);
+				    const char *callout_info,
+				    struct key_acl *acl);
 
     This is identical to request_key(), except that a domain tag may be
     specifies that causes search algorithm to only match keys matching that
@@ -1122,7 +1193,8 @@ payload contents" for more information.
 					     struct key_tag *domain_tag,
 					     const void *callout_info,
 					     size_t callout_len,
-					     void *aux);
+					     void *aux,
+					     struct key_acl *acl);
 
     This is identical to request_key_tag(), except that the auxiliary data is
     passed to the key_type->request_key() op if it exists, and the
@@ -1195,7 +1267,7 @@ payload contents" for more information.
 
 	struct key *keyring_alloc(const char *description, uid_t uid, gid_t gid,
 				  const struct cred *cred,
-				  key_perm_t perm,
+				  struct key_acl *acl,
 				  struct key_restriction *restrict_link,
 				  unsigned long flags,
 				  struct key *dest);
diff --git a/Documentation/security/keys/request-key.rst b/Documentation/security/keys/request-key.rst
index 35f2296b704a..f356fd06c8d5 100644
--- a/Documentation/security/keys/request-key.rst
+++ b/Documentation/security/keys/request-key.rst
@@ -11,14 +11,16 @@ The process starts by either the kernel requesting a service by calling
 
 	struct key *request_key(const struct key_type *type,
 				const char *description,
-				const char *callout_info);
+				const char *callout_info,
+				struct key_acl *acl);
 
 or::
 
 	struct key *request_key_tag(const struct key_type *type,
 				    const char *description,
 				    const struct key_tag *domain_tag,
-				    const char *callout_info);
+				    const char *callout_info,
+				    struct key_acl *acl);
 
 or::
 
@@ -27,7 +29,8 @@ or::
 					     const struct key_tag *domain_tag,
 					     const char *callout_info,
 					     size_t callout_len,
-					     void *aux);
+					     void *aux,
+					     struct key_acl *acl);
 
 or::
 
diff --git a/certs/blacklist.c b/certs/blacklist.c
index 181cb7fa9540..39de9d68b21e 100644
--- a/certs/blacklist.c
+++ b/certs/blacklist.c
@@ -93,8 +93,7 @@ int mark_hash_blacklisted(const char *hash)
 				   hash,
 				   NULL,
 				   0,
-				   ((KEY_POS_ALL & ~KEY_POS_SETATTR) |
-				    KEY_USR_VIEW),
+				   &internal_key_acl,
 				   KEY_ALLOC_NOT_IN_QUOTA |
 				   KEY_ALLOC_BUILT_IN);
 	if (IS_ERR(key)) {
@@ -153,9 +152,7 @@ static int __init blacklist_init(void)
 		keyring_alloc(".blacklist",
 			      KUIDT_INIT(0), KGIDT_INIT(0),
 			      current_cred(),
-			      (KEY_POS_ALL & ~KEY_POS_SETATTR) |
-			      KEY_USR_VIEW | KEY_USR_READ |
-			      KEY_USR_SEARCH,
+			      &internal_keyring_acl,
 			      KEY_ALLOC_NOT_IN_QUOTA |
 			      KEY_FLAG_KEEP,
 			      NULL, NULL);
diff --git a/certs/system_keyring.c b/certs/system_keyring.c
index c05c29ae4d5d..2873a4ce2828 100644
--- a/certs/system_keyring.c
+++ b/certs/system_keyring.c
@@ -103,9 +103,7 @@ static __init int system_trusted_keyring_init(void)
 	builtin_trusted_keys =
 		keyring_alloc(".builtin_trusted_keys",
 			      KUIDT_INIT(0), KGIDT_INIT(0), current_cred(),
-			      ((KEY_POS_ALL & ~KEY_POS_SETATTR) |
-			      KEY_USR_VIEW | KEY_USR_READ | KEY_USR_SEARCH),
-			      KEY_ALLOC_NOT_IN_QUOTA,
+			      &internal_key_acl, KEY_ALLOC_NOT_IN_QUOTA,
 			      NULL, NULL);
 	if (IS_ERR(builtin_trusted_keys))
 		panic("Can't allocate builtin trusted keyring\n");
@@ -114,10 +112,7 @@ static __init int system_trusted_keyring_init(void)
 	secondary_trusted_keys =
 		keyring_alloc(".secondary_trusted_keys",
 			      KUIDT_INIT(0), KGIDT_INIT(0), current_cred(),
-			      ((KEY_POS_ALL & ~KEY_POS_SETATTR) |
-			       KEY_USR_VIEW | KEY_USR_READ | KEY_USR_SEARCH |
-			       KEY_USR_WRITE),
-			      KEY_ALLOC_NOT_IN_QUOTA,
+			      &internal_writable_keyring_acl, KEY_ALLOC_NOT_IN_QUOTA,
 			      get_builtin_and_secondary_restriction(),
 			      NULL);
 	if (IS_ERR(secondary_trusted_keys))
@@ -167,8 +162,7 @@ static __init int load_system_certificate_list(void)
 					   NULL,
 					   p,
 					   plen,
-					   ((KEY_POS_ALL & ~KEY_POS_SETATTR) |
-					   KEY_USR_VIEW | KEY_USR_READ),
+					   &internal_key_acl,
 					   KEY_ALLOC_NOT_IN_QUOTA |
 					   KEY_ALLOC_BUILT_IN |
 					   KEY_ALLOC_BYPASS_RESTRICTION);
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 1b16d34bb785..0fd3ca9bfe54 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -2035,7 +2035,7 @@ static int crypt_set_keyring_key(struct crypt_config *cc, const char *key_string
 		return -ENOMEM;
 
 	key = request_key(key_string[0] == 'l' ? &key_type_logon : &key_type_user,
-			  key_desc + 1, NULL);
+			  key_desc + 1, NULL, NULL);
 	if (IS_ERR(key)) {
 		kzfree(new_key_string);
 		return PTR_ERR(key);
diff --git a/drivers/nvdimm/security.c b/drivers/nvdimm/security.c
index a570f2263a42..99a5708b37e3 100644
--- a/drivers/nvdimm/security.c
+++ b/drivers/nvdimm/security.c
@@ -55,7 +55,7 @@ static struct key *nvdimm_request_key(struct nvdimm *nvdimm)
 	struct device *dev = &nvdimm->dev;
 
 	sprintf(desc, "%s%s", NVDIMM_PREFIX, nvdimm->dimm_id);
-	key = request_key(&key_type_encrypted, desc, "");
+	key = request_key(&key_type_encrypted, desc, "", NULL);
 	if (IS_ERR(key)) {
 		if (PTR_ERR(key) == -ENOKEY)
 			dev_dbg(dev, "request_key() found no key\n");
diff --git a/fs/afs/security.c b/fs/afs/security.c
index 5d8ece98561e..3185898237b2 100644
--- a/fs/afs/security.c
+++ b/fs/afs/security.c
@@ -32,7 +32,7 @@ struct key *afs_request_key(struct afs_cell *cell)
 
 	_debug("key %s", cell->anonymous_key->description);
 	key = request_key(&key_type_rxrpc, cell->anonymous_key->description,
-			  NULL);
+			  NULL, NULL);
 	if (IS_ERR(key)) {
 		if (PTR_ERR(key) != -ENOKEY) {
 			_leave(" = %ld", PTR_ERR(key));
diff --git a/fs/cifs/cifs_spnego.c b/fs/cifs/cifs_spnego.c
index 7f01c6e60791..d1b439ad0f1a 100644
--- a/fs/cifs/cifs_spnego.c
+++ b/fs/cifs/cifs_spnego.c
@@ -32,6 +32,25 @@
 #include "cifsproto.h"
 static const struct cred *spnego_cred;
 
+static struct key_acl cifs_spnego_key_acl = {
+	.usage	= REFCOUNT_INIT(1),
+	.nr_ace	= 2,
+	.possessor_viewable = true,
+	.aces = {
+		KEY_POSSESSOR_ACE(KEY_ACE_VIEW | KEY_ACE_SEARCH | KEY_ACE_READ),
+		KEY_OWNER_ACE(KEY_ACE_VIEW),
+	}
+};
+
+static struct key_acl cifs_spnego_keyring_acl = {
+	.usage	= REFCOUNT_INIT(1),
+	.nr_ace	= 2,
+	.aces = {
+		KEY_POSSESSOR_ACE(KEY_ACE_SEARCH | KEY_ACE_WRITE),
+		KEY_OWNER_ACE(KEY_ACE_VIEW | KEY_ACE_READ | KEY_ACE_CLEAR),
+	}
+};
+
 /* create a new cifs key */
 static int
 cifs_spnego_key_instantiate(struct key *key, struct key_preparsed_payload *prep)
@@ -170,7 +189,8 @@ cifs_get_spnego_key(struct cifs_ses *sesInfo)
 
 	cifs_dbg(FYI, "key description = %s\n", description);
 	saved_cred = override_creds(spnego_cred);
-	spnego_key = request_key(&cifs_spnego_key_type, description, "");
+	spnego_key = request_key(&cifs_spnego_key_type, description, "",
+				 &cifs_spnego_key_acl);
 	revert_creds(saved_cred);
 
 #ifdef CONFIG_CIFS_DEBUG2
@@ -207,8 +227,7 @@ init_cifs_spnego(void)
 
 	keyring = keyring_alloc(".cifs_spnego",
 				GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, cred,
-				(KEY_POS_ALL & ~KEY_POS_SETATTR) |
-				KEY_USR_VIEW | KEY_USR_READ,
+				&cifs_spnego_keyring_acl,
 				KEY_ALLOC_NOT_IN_QUOTA, NULL, NULL);
 	if (IS_ERR(keyring)) {
 		ret = PTR_ERR(keyring);
diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c
index 1d377b7f2860..78eed72f3af0 100644
--- a/fs/cifs/cifsacl.c
+++ b/fs/cifs/cifsacl.c
@@ -33,6 +33,25 @@
 #include "cifsproto.h"
 #include "cifs_debug.h"
 
+static struct key_acl cifs_idmap_key_acl = {
+	.usage	= REFCOUNT_INIT(1),
+	.nr_ace	= 2,
+	.possessor_viewable = true,
+	.aces = {
+		KEY_POSSESSOR_ACE(KEY_ACE_VIEW | KEY_ACE_SEARCH | KEY_ACE_READ),
+		KEY_OWNER_ACE(KEY_ACE_VIEW),
+	}
+};
+
+static struct key_acl cifs_idmap_keyring_acl = {
+	.usage	= REFCOUNT_INIT(1),
+	.nr_ace	= 2,
+	.aces = {
+		KEY_POSSESSOR_ACE(KEY_ACE_SEARCH | KEY_ACE_WRITE),
+		KEY_OWNER_ACE(KEY_ACE_VIEW | KEY_ACE_READ),
+	}
+};
+
 /* security id for everyone/world system group */
 static const struct cifs_sid sid_everyone = {
 	1, 1, {0, 0, 0, 0, 0, 1}, {0} };
@@ -298,7 +317,8 @@ id_to_sid(unsigned int cid, uint sidtype, struct cifs_sid *ssid)
 
 	rc = 0;
 	saved_cred = override_creds(root_cred);
-	sidkey = request_key(&cifs_idmap_key_type, desc, "");
+	sidkey = request_key(&cifs_idmap_key_type, desc, "",
+			     &cifs_idmap_key_acl);
 	if (IS_ERR(sidkey)) {
 		rc = -EINVAL;
 		cifs_dbg(FYI, "%s: Can't map %cid %u to a SID\n",
@@ -403,7 +423,8 @@ try_upcall_to_get_id:
 		return -ENOMEM;
 
 	saved_cred = override_creds(root_cred);
-	sidkey = request_key(&cifs_idmap_key_type, sidstr, "");
+	sidkey = request_key(&cifs_idmap_key_type, sidstr, "",
+			     &cifs_idmap_key_acl);
 	if (IS_ERR(sidkey)) {
 		rc = -EINVAL;
 		cifs_dbg(FYI, "%s: Can't map SID %s to a %cid\n",
@@ -481,8 +502,7 @@ init_cifs_idmap(void)
 
 	keyring = keyring_alloc(".cifs_idmap",
 				GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, cred,
-				(KEY_POS_ALL & ~KEY_POS_SETATTR) |
-				KEY_USR_VIEW | KEY_USR_READ,
+				&cifs_idmap_keyring_acl,
 				KEY_ALLOC_NOT_IN_QUOTA, NULL, NULL);
 	if (IS_ERR(keyring)) {
 		ret = PTR_ERR(keyring);
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 8c4121da624e..6e50d3e87948 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -2990,7 +2990,7 @@ cifs_set_cifscreds(struct smb_vol *vol, struct cifs_ses *ses)
 	}
 
 	cifs_dbg(FYI, "%s: desc=%s\n", __func__, desc);
-	key = request_key(&key_type_logon, desc, "");
+	key = request_key(&key_type_logon, desc, "", NULL);
 	if (IS_ERR(key)) {
 		if (!ses->domainName) {
 			cifs_dbg(FYI, "domainName is NULL\n");
@@ -3001,7 +3001,7 @@ cifs_set_cifscreds(struct smb_vol *vol, struct cifs_ses *ses)
 		/* didn't work, try to find a domain key */
 		sprintf(desc, "cifs:d:%s", ses->domainName);
 		cifs_dbg(FYI, "%s: desc=%s\n", __func__, desc);
-		key = request_key(&key_type_logon, desc, "");
+		key = request_key(&key_type_logon, desc, "", NULL);
 		if (IS_ERR(key)) {
 			rc = PTR_ERR(key);
 			goto out_err;
diff --git a/fs/crypto/keyinfo.c b/fs/crypto/keyinfo.c
index dcd91a3fbe49..4f85af8ab239 100644
--- a/fs/crypto/keyinfo.c
+++ b/fs/crypto/keyinfo.c
@@ -92,7 +92,7 @@ find_and_lock_process_key(const char *prefix,
 	if (!description)
 		return ERR_PTR(-ENOMEM);
 
-	key = request_key(&key_type_logon, description, NULL);
+	key = request_key(&key_type_logon, description, NULL, NULL);
 	kfree(description);
 	if (IS_ERR(key))
 		return key;
diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h
index e74cb2a0b299..6460bd2a4e9d 100644
--- a/fs/ecryptfs/ecryptfs_kernel.h
+++ b/fs/ecryptfs/ecryptfs_kernel.h
@@ -105,7 +105,7 @@ ecryptfs_get_encrypted_key_payload_data(struct key *key)
 
 static inline struct key *ecryptfs_get_encrypted_key(char *sig)
 {
-	return request_key(&key_type_encrypted, sig, NULL);
+	return request_key(&key_type_encrypted, sig, NULL, NULL);
 }
 
 #else
diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c
index 90fbac5d485b..923a6006ccea 100644
--- a/fs/ecryptfs/keystore.c
+++ b/fs/ecryptfs/keystore.c
@@ -1624,7 +1624,7 @@ int ecryptfs_keyring_auth_tok_for_sig(struct key **auth_tok_key,
 {
 	int rc = 0;
 
-	(*auth_tok_key) = request_key(&key_type_user, sig, NULL);
+	(*auth_tok_key) = request_key(&key_type_user, sig, NULL, NULL);
 	if (!(*auth_tok_key) || IS_ERR(*auth_tok_key)) {
 		(*auth_tok_key) = ecryptfs_get_encrypted_key(sig);
 		if (!(*auth_tok_key) || IS_ERR(*auth_tok_key)) {
diff --git a/fs/fscache/object-list.c b/fs/fscache/object-list.c
index 43e6e28c164f..6a672289e5ec 100644
--- a/fs/fscache/object-list.c
+++ b/fs/fscache/object-list.c
@@ -321,7 +321,7 @@ static void fscache_objlist_config(struct fscache_objlist_data *data)
 	const char *buf;
 	int len;
 
-	key = request_key(&key_type_user, "fscache:objlist", NULL);
+	key = request_key(&key_type_user, "fscache:objlist", NULL, NULL);
 	if (IS_ERR(key))
 		goto no_config;
 
diff --git a/fs/nfs/nfs4idmap.c b/fs/nfs/nfs4idmap.c
index 1e7296395d71..69679f4f2e6c 100644
--- a/fs/nfs/nfs4idmap.c
+++ b/fs/nfs/nfs4idmap.c
@@ -72,6 +72,25 @@ struct idmap {
 	const struct cred	*cred;
 };
 
+static struct key_acl nfs_idmap_key_acl = {
+	.usage	= REFCOUNT_INIT(1),
+	.nr_ace	= 2,
+	.possessor_viewable = true,
+	.aces = {
+		KEY_POSSESSOR_ACE(KEY_ACE_VIEW | KEY_ACE_SEARCH | KEY_ACE_READ),
+		KEY_OWNER_ACE(KEY_ACE_VIEW),
+	}
+};
+
+static struct key_acl nfs_idmap_keyring_acl = {
+	.usage	= REFCOUNT_INIT(1),
+	.nr_ace	= 2,
+	.aces = {
+		KEY_POSSESSOR_ACE(KEY_ACE_SEARCH | KEY_ACE_WRITE),
+		KEY_OWNER_ACE(KEY_ACE_VIEW | KEY_ACE_READ),
+	}
+};
+
 static struct user_namespace *idmap_userns(const struct idmap *idmap)
 {
 	if (idmap && idmap->cred)
@@ -208,8 +227,7 @@ int nfs_idmap_init(void)
 
 	keyring = keyring_alloc(".id_resolver",
 				GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, cred,
-				(KEY_POS_ALL & ~KEY_POS_SETATTR) |
-				KEY_USR_VIEW | KEY_USR_READ,
+				&nfs_idmap_keyring_acl,
 				KEY_ALLOC_NOT_IN_QUOTA, NULL, NULL);
 	if (IS_ERR(keyring)) {
 		ret = PTR_ERR(keyring);
@@ -287,11 +305,13 @@ static struct key *nfs_idmap_request_key(const char *name, size_t namelen,
 		return ERR_PTR(ret);
 
 	if (!idmap->cred || idmap->cred->user_ns == &init_user_ns)
-		rkey = request_key(&key_type_id_resolver, desc, "");
+		rkey = request_key(&key_type_id_resolver, desc, "",
+				   &nfs_idmap_key_acl);
 	if (IS_ERR(rkey)) {
 		mutex_lock(&idmap->idmap_mutex);
 		rkey = request_key_with_auxdata(&key_type_id_resolver_legacy,
-						desc, NULL, "", 0, idmap);
+						desc, NULL, "", 0, idmap,
+						&nfs_idmap_key_acl);
 		mutex_unlock(&idmap->idmap_mutex);
 	}
 	if (!IS_ERR(rkey))
@@ -320,8 +340,6 @@ static ssize_t nfs_idmap_get_key(const char *name, size_t namelen,
 	}
 
 	rcu_read_lock();
-	rkey->perm |= KEY_USR_VIEW;
-
 	ret = key_validate(rkey);
 	if (ret < 0)
 		goto out_up;
diff --git a/fs/ubifs/auth.c b/fs/ubifs/auth.c
index 60f43b93d06e..38718026ad0b 100644
--- a/fs/ubifs/auth.c
+++ b/fs/ubifs/auth.c
@@ -227,7 +227,7 @@ int ubifs_init_authentication(struct ubifs_info *c)
 	snprintf(hmac_name, CRYPTO_MAX_ALG_NAME, "hmac(%s)",
 		 c->auth_hash_name);
 
-	keyring_key = request_key(&key_type_logon, c->auth_key_name, NULL);
+	keyring_key = request_key(&key_type_logon, c->auth_key_name, NULL, NULL);
 
 	if (IS_ERR(keyring_key)) {
 		ubifs_err(c, "Failed to request key: %ld",
diff --git a/include/linux/key.h b/include/linux/key.h
index 18d7f62ab6b0..bc4adfd254fe 100644
--- a/include/linux/key.h
+++ b/include/linux/key.h
@@ -32,49 +32,14 @@
 /* key handle serial number */
 typedef int32_t key_serial_t;
 
-/* key handle permissions mask */
-typedef uint32_t key_perm_t;
-
 struct key;
 struct net;
 
 #ifdef CONFIG_KEYS
 
-#undef KEY_DEBUGGING
+#include <linux/keyctl.h>
 
-#define KEY_POS_VIEW	0x01000000	/* possessor can view a key's attributes */
-#define KEY_POS_READ	0x02000000	/* possessor can read key payload / view keyring */
-#define KEY_POS_WRITE	0x04000000	/* possessor can update key payload / add link to keyring */
-#define KEY_POS_SEARCH	0x08000000	/* possessor can find a key in search / search a keyring */
-#define KEY_POS_LINK	0x10000000	/* possessor can create a link to a key/keyring */
-#define KEY_POS_SETATTR	0x20000000	/* possessor can set key attributes */
-#define KEY_POS_ALL	0x3f000000
-
-#define KEY_USR_VIEW	0x00010000	/* user permissions... */
-#define KEY_USR_READ	0x00020000
-#define KEY_USR_WRITE	0x00040000
-#define KEY_USR_SEARCH	0x00080000
-#define KEY_USR_LINK	0x00100000
-#define KEY_USR_SETATTR	0x00200000
-#define KEY_USR_ALL	0x003f0000
-
-#define KEY_GRP_VIEW	0x00000100	/* group permissions... */
-#define KEY_GRP_READ	0x00000200
-#define KEY_GRP_WRITE	0x00000400
-#define KEY_GRP_SEARCH	0x00000800
-#define KEY_GRP_LINK	0x00001000
-#define KEY_GRP_SETATTR	0x00002000
-#define KEY_GRP_ALL	0x00003f00
-
-#define KEY_OTH_VIEW	0x00000001	/* third party permissions... */
-#define KEY_OTH_READ	0x00000002
-#define KEY_OTH_WRITE	0x00000004
-#define KEY_OTH_SEARCH	0x00000008
-#define KEY_OTH_LINK	0x00000010
-#define KEY_OTH_SETATTR	0x00000020
-#define KEY_OTH_ALL	0x0000003f
-
-#define KEY_PERM_UNDEF	0xffffffff
+#undef KEY_DEBUGGING
 
 struct seq_file;
 struct user_struct;
@@ -118,6 +83,36 @@ union key_payload {
 	void			*data[4];
 };
 
+struct key_ace {
+	unsigned int		type;
+	unsigned int		perm;
+	union {
+		kuid_t		uid;
+		kgid_t		gid;
+		unsigned int	subject_id;
+	};
+};
+
+struct key_acl {
+	refcount_t		usage;
+	unsigned short		nr_ace;
+	bool			possessor_viewable;
+	struct rcu_head		rcu;
+	struct key_ace		aces[];
+};
+
+#define KEY_POSSESSOR_ACE(perms) {			\
+		.type = KEY_ACE_SUBJ_STANDARD,		\
+		.perm = perms,				\
+		.subject_id = KEY_ACE_POSSESSOR		\
+	}
+
+#define KEY_OWNER_ACE(perms) {				\
+		.type = KEY_ACE_SUBJ_STANDARD,		\
+		.perm = perms,				\
+		.subject_id = KEY_ACE_OWNER		\
+	}
+
 /*****************************************************************************/
 /*
  * key reference with possession attribute handling
@@ -184,6 +179,7 @@ struct key {
 	struct rw_semaphore	sem;		/* change vs change sem */
 	struct key_user		*user;		/* owner of this key */
 	void			*security;	/* security data for this key */
+	struct key_acl		__rcu *acl;
 	union {
 		time64_t	expiry;		/* time at which key expires (or 0) */
 		time64_t	revoked_at;	/* time at which key was revoked */
@@ -191,7 +187,6 @@ struct key {
 	time64_t		last_used_at;	/* last time used for LRU keyring discard */
 	kuid_t			uid;
 	kgid_t			gid;
-	key_perm_t		perm;		/* access permissions */
 	unsigned short		quotalen;	/* length added to quota */
 	unsigned short		datalen;	/* payload data length
 						 * - may not match RCU dereferenced payload
@@ -215,6 +210,7 @@ struct key {
 #define KEY_FLAG_ROOT_CAN_INVAL	7	/* set if key can be invalidated by root without permission */
 #define KEY_FLAG_KEEP		8	/* set if key should not be removed */
 #define KEY_FLAG_UID_KEYRING	9	/* set if key is a user or user session keyring */
+#define KEY_FLAG_HAS_ACL	10	/* Set if KEYCTL_SETACL called on key */
 
 	/* the key type and key description string
 	 * - the desc is used to match a key against search criteria
@@ -263,7 +259,7 @@ extern struct key *key_alloc(struct key_type *type,
 			     const char *desc,
 			     kuid_t uid, kgid_t gid,
 			     const struct cred *cred,
-			     key_perm_t perm,
+			     struct key_acl *acl,
 			     unsigned long flags,
 			     struct key_restriction *restrict_link);
 
@@ -300,7 +296,8 @@ static inline void key_ref_put(key_ref_t key_ref)
 extern struct key *request_key_tag(struct key_type *type,
 				   const char *description,
 				   struct key_tag *domain_tag,
-				   const char *callout_info);
+				   const char *callout_info,
+				   struct key_acl *acl);
 
 extern struct key *request_key_rcu(struct key_type *type,
 				   const char *description,
@@ -311,21 +308,24 @@ extern struct key *request_key_with_auxdata(struct key_type *type,
 					    struct key_tag *domain_tag,
 					    const void *callout_info,
 					    size_t callout_len,
-					    void *aux);
+					    void *aux,
+					    struct key_acl *acl);
 
 /**
  * request_key - Request a key and wait for construction
  * @type: Type of key.
  * @description: The searchable description of the key.
  * @callout_info: The data to pass to the instantiation upcall (or NULL).
+ * @acl: The ACL to attach to a new key (or NULL).
  *
  * As for request_key_tag(), but with the default global domain tag.
  */
 static inline struct key *request_key(struct key_type *type,
 				      const char *description,
-				      const char *callout_info)
+				      const char *callout_info,
+				      struct key_acl *acl)
 {
-	return request_key_tag(type, description, NULL, callout_info);
+	return request_key_tag(type, description, NULL, callout_info, acl);
 }
 
 #ifdef CONFIG_NET
@@ -335,6 +335,7 @@ static inline struct key *request_key(struct key_type *type,
  * @description: The searchable description of the key.
  * @net: The network namespace that is the key's domain of operation.
  * @callout_info: The data to pass to the instantiation upcall (or NULL).
+ * @acl: The ACL to attach to a new key (or NULL).
  *
  * As for request_key() except that it does not add the returned key to a
  * keyring if found, new keys are always allocated in the user's quota, the
@@ -344,8 +345,8 @@ static inline struct key *request_key(struct key_type *type,
  * Furthermore, it then works as wait_for_key_construction() to wait for the
  * completion of keys undergoing construction with a non-interruptible wait.
  */
-#define request_key_net(type, description, net, callout_info) \
-	request_key_tag(type, description, net->key_domain, callout_info);
+#define request_key_net(type, description, net, callout_info, acl)	\
+	request_key_tag(type, description, net->key_domain, callout_info, acl);
 #endif /* CONFIG_NET */
 
 extern int wait_for_key_construction(struct key *key, bool intr);
@@ -357,7 +358,7 @@ extern key_ref_t key_create_or_update(key_ref_t keyring,
 				      const char *description,
 				      const void *payload,
 				      size_t plen,
-				      key_perm_t perm,
+				      struct key_acl *acl,
 				      unsigned long flags);
 
 extern int key_update(key_ref_t key,
@@ -377,7 +378,7 @@ extern int key_unlink(struct key *keyring,
 
 extern struct key *keyring_alloc(const char *description, kuid_t uid, kgid_t gid,
 				 const struct cred *cred,
-				 key_perm_t perm,
+				 struct key_acl *acl,
 				 unsigned long flags,
 				 struct key_restriction *restrict_link,
 				 struct key *dest);
@@ -410,19 +411,29 @@ static inline key_serial_t key_serial(const struct key *key)
 extern void key_set_timeout(struct key *, unsigned);
 
 extern key_ref_t lookup_user_key(key_serial_t id, unsigned long flags,
-				 key_perm_t perm);
+				 u32 desired_perm);
 extern void key_free_user_ns(struct user_namespace *);
 
 /*
  * The permissions required on a key that we're looking up.
  */
-#define	KEY_NEED_VIEW	0x01	/* Require permission to view attributes */
-#define	KEY_NEED_READ	0x02	/* Require permission to read content */
-#define	KEY_NEED_WRITE	0x04	/* Require permission to update / modify */
-#define	KEY_NEED_SEARCH	0x08	/* Require permission to search (keyring) or find (key) */
-#define	KEY_NEED_LINK	0x10	/* Require permission to link */
-#define	KEY_NEED_SETATTR 0x20	/* Require permission to change attributes */
-#define	KEY_NEED_ALL	0x3f	/* All the above permissions */
+#define	KEY_NEED_VIEW	0x001	/* Require permission to view attributes */
+#define	KEY_NEED_READ	0x002	/* Require permission to read content */
+#define	KEY_NEED_WRITE	0x004	/* Require permission to update / modify */
+#define	KEY_NEED_SEARCH	0x008	/* Require permission to search (keyring) or find (key) */
+#define	KEY_NEED_LINK	0x010	/* Require permission to link */
+#define	KEY_NEED_SETSEC	0x020	/* Require permission to set owner, group, ACL */
+#define	KEY_NEED_INVAL	0x040	/* Require permission to invalidate key */
+#define	KEY_NEED_REVOKE	0x080	/* Require permission to revoke key */
+#define	KEY_NEED_JOIN	0x100	/* Require permission to join keyring as session */
+#define	KEY_NEED_CLEAR	0x200	/* Require permission to clear a keyring */
+#define KEY_NEED_ALL	0x3ff
+
+#define OLD_KEY_NEED_SETATTR 0x20 /* Formerly "require permission to change attributes" */
+
+extern struct key_acl internal_key_acl;
+extern struct key_acl internal_keyring_acl;
+extern struct key_acl internal_writable_keyring_acl;
 
 static inline short key_read_state(const struct key *key)
 {
diff --git a/include/uapi/linux/keyctl.h b/include/uapi/linux/keyctl.h
index ed3d5893830d..e783bf957da8 100644
--- a/include/uapi/linux/keyctl.h
+++ b/include/uapi/linux/keyctl.h
@@ -15,6 +15,69 @@
 
 #include <linux/types.h>
 
+/*
+ * Keyring permission grant definitions
+ */
+enum key_ace_subject_type {
+	KEY_ACE_SUBJ_STANDARD	= 0,	/* subject is one of key_ace_standard_subject */
+	nr__key_ace_subject_type
+};
+
+enum key_ace_standard_subject {
+	KEY_ACE_EVERYONE	= 0,	/* Everyone, including owner and group */
+	KEY_ACE_GROUP		= 1,	/* The key's group */
+	KEY_ACE_OWNER		= 2,	/* The owner of the key */
+	KEY_ACE_POSSESSOR	= 3,	/* Any process that possesses the key */
+	nr__key_ace_standard_subject
+};
+
+#define KEY_ACE_VIEW		0x00000001 /* Can describe the key */
+#define KEY_ACE_READ		0x00000002 /* Can read the key content */
+#define KEY_ACE_WRITE		0x00000004 /* Can update/modify the key content */
+#define KEY_ACE_SEARCH		0x00000008 /* Can find the key by search */
+#define KEY_ACE_LINK		0x00000010 /* Can make a link to the key */
+#define KEY_ACE_SET_SECURITY	0x00000020 /* Can set owner, group, ACL */
+#define KEY_ACE_INVAL		0x00000040 /* Can invalidate the key */
+#define KEY_ACE_REVOKE		0x00000080 /* Can revoke the key */
+#define KEY_ACE_JOIN		0x00000100 /* Can join keyring */
+#define KEY_ACE_CLEAR		0x00000200 /* Can clear keyring */
+#define KEY_ACE__PERMS		0xffffffff
+
+/*
+ * Old-style permissions mask, deprecated in favour of ACL.
+ */
+#define KEY_POS_VIEW	0x01000000	/* possessor can view a key's attributes */
+#define KEY_POS_READ	0x02000000	/* possessor can read key payload / view keyring */
+#define KEY_POS_WRITE	0x04000000	/* possessor can update key payload / add link to keyring */
+#define KEY_POS_SEARCH	0x08000000	/* possessor can find a key in search / search a keyring */
+#define KEY_POS_LINK	0x10000000	/* possessor can create a link to a key/keyring */
+#define KEY_POS_SETATTR	0x20000000	/* possessor can set key attributes */
+#define KEY_POS_ALL	0x3f000000
+
+#define KEY_USR_VIEW	0x00010000	/* user permissions... */
+#define KEY_USR_READ	0x00020000
+#define KEY_USR_WRITE	0x00040000
+#define KEY_USR_SEARCH	0x00080000
+#define KEY_USR_LINK	0x00100000
+#define KEY_USR_SETATTR	0x00200000
+#define KEY_USR_ALL	0x003f0000
+
+#define KEY_GRP_VIEW	0x00000100	/* group permissions... */
+#define KEY_GRP_READ	0x00000200
+#define KEY_GRP_WRITE	0x00000400
+#define KEY_GRP_SEARCH	0x00000800
+#define KEY_GRP_LINK	0x00001000
+#define KEY_GRP_SETATTR	0x00002000
+#define KEY_GRP_ALL	0x00003f00
+
+#define KEY_OTH_VIEW	0x00000001	/* third party permissions... */
+#define KEY_OTH_READ	0x00000002
+#define KEY_OTH_WRITE	0x00000004
+#define KEY_OTH_SEARCH	0x00000008
+#define KEY_OTH_LINK	0x00000010
+#define KEY_OTH_SETATTR	0x00000020
+#define KEY_OTH_ALL	0x0000003f
+
 /* special process keyring shortcut IDs */
 #define KEY_SPEC_THREAD_KEYRING		-1	/* - key ID for thread-specific keyring */
 #define KEY_SPEC_PROCESS_KEYRING	-2	/* - key ID for process-specific keyring */
diff --git a/lib/digsig.c b/lib/digsig.c
index 3782af401c68..ce87ca2e0929 100644
--- a/lib/digsig.c
+++ b/lib/digsig.c
@@ -227,7 +227,7 @@ int digsig_verify(struct key *keyring, const char *sig, int siglen,
 		else
 			key = key_ref_to_ptr(kref);
 	} else {
-		key = request_key(&key_type_user, name, NULL);
+		key = request_key(&key_type_user, name, NULL, NULL);
 	}
 	if (IS_ERR(key)) {
 		pr_err("key not found, id: %s\n", name);
diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c
index 79eac465ec65..d4af93a35e2b 100644
--- a/net/ceph/ceph_common.c
+++ b/net/ceph/ceph_common.c
@@ -305,7 +305,7 @@ static int get_secret(struct ceph_crypto_key *dst, const char *name) {
 	int err = 0;
 	struct ceph_crypto_key *ckey;
 
-	ukey = request_key(&key_type_ceph, name, NULL);
+	ukey = request_key(&key_type_ceph, name, NULL, NULL);
 	if (IS_ERR(ukey)) {
 		/* request_key errors don't map nicely to mount(2)
 		   errors; don't even try, but still printk */
diff --git a/net/dns_resolver/dns_key.c b/net/dns_resolver/dns_key.c
index 3e1a90669006..6b201531b165 100644
--- a/net/dns_resolver/dns_key.c
+++ b/net/dns_resolver/dns_key.c
@@ -46,6 +46,15 @@ const struct cred *dns_resolver_cache;
 
 #define	DNS_ERRORNO_OPTION	"dnserror"
 
+static struct key_acl dns_keyring_acl = {
+	.usage	= REFCOUNT_INIT(1),
+	.nr_ace	= 2,
+	.aces = {
+		KEY_POSSESSOR_ACE(KEY_ACE_SEARCH | KEY_ACE_WRITE),
+		KEY_OWNER_ACE(KEY_ACE_VIEW | KEY_ACE_READ | KEY_ACE_CLEAR),
+	}
+};
+
 /*
  * Preparse instantiation data for a dns_resolver key.
  *
@@ -343,8 +352,7 @@ static int __init init_dns_resolver(void)
 
 	keyring = keyring_alloc(".dns_resolver",
 				GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, cred,
-				(KEY_POS_ALL & ~KEY_POS_SETATTR) |
-				KEY_USR_VIEW | KEY_USR_READ,
+				&dns_keyring_acl,
 				KEY_ALLOC_NOT_IN_QUOTA, NULL, NULL);
 	if (IS_ERR(keyring)) {
 		ret = PTR_ERR(keyring);
diff --git a/net/dns_resolver/dns_query.c b/net/dns_resolver/dns_query.c
index cab4e0df924f..236baf2bfa4c 100644
--- a/net/dns_resolver/dns_query.c
+++ b/net/dns_resolver/dns_query.c
@@ -47,6 +47,16 @@
 
 #include "internal.h"
 
+static struct key_acl dns_key_acl = {
+	.usage	= REFCOUNT_INIT(1),
+	.nr_ace	= 2,
+	.possessor_viewable = true,
+	.aces = {
+		KEY_POSSESSOR_ACE(KEY_ACE_VIEW | KEY_ACE_SEARCH | KEY_ACE_READ),
+		KEY_OWNER_ACE(KEY_ACE_VIEW | KEY_ACE_INVAL),
+	}
+};
+
 /**
  * dns_query - Query the DNS
  * @net: The network namespace to operate in.
@@ -125,7 +135,8 @@ int dns_query(struct net *net,
 	 * add_key() to preinstall malicious redirections
 	 */
 	saved_cred = override_creds(dns_resolver_cache);
-	rkey = request_key_net(&key_type_dns_resolver, desc, net, options);
+	rkey = request_key_net(&key_type_dns_resolver, desc, net, options,
+			       &dns_key_acl);
 	revert_creds(saved_cred);
 	kfree(desc);
 	if (IS_ERR(rkey)) {
@@ -135,8 +146,6 @@ int dns_query(struct net *net,
 
 	down_read(&rkey->sem);
 	set_bit(KEY_FLAG_ROOT_CAN_INVAL, &rkey->flags);
-	rkey->perm |= KEY_USR_VIEW;
-
 	ret = key_validate(rkey);
 	if (ret < 0)
 		goto put;
diff --git a/net/rxrpc/key.c b/net/rxrpc/key.c
index 1cc6b0c6cc42..207d621d18c0 100644
--- a/net/rxrpc/key.c
+++ b/net/rxrpc/key.c
@@ -27,6 +27,14 @@
 #include <keys/user-type.h>
 #include "ar-internal.h"
 
+static struct key_acl rxrpc_null_key_acl = {
+	.usage	= REFCOUNT_INIT(1),
+	.nr_ace	= 1,
+	.aces = {
+		KEY_POSSESSOR_ACE(KEY_ACE_SEARCH | KEY_ACE_READ),
+	}
+};
+
 static int rxrpc_vet_description_s(const char *);
 static int rxrpc_preparse(struct key_preparsed_payload *);
 static int rxrpc_preparse_s(struct key_preparsed_payload *);
@@ -914,7 +922,8 @@ int rxrpc_request_key(struct rxrpc_sock *rx, char __user *optval, int optlen)
 	if (IS_ERR(description))
 		return PTR_ERR(description);
 
-	key = request_key_net(&key_type_rxrpc, description, sock_net(&rx->sk), NULL);
+	key = request_key_net(&key_type_rxrpc, description, sock_net(&rx->sk),
+			      NULL, NULL);
 	if (IS_ERR(key)) {
 		kfree(description);
 		_leave(" = %ld", PTR_ERR(key));
@@ -945,7 +954,8 @@ int rxrpc_server_keyring(struct rxrpc_sock *rx, char __user *optval,
 	if (IS_ERR(description))
 		return PTR_ERR(description);
 
-	key = request_key_net(&key_type_keyring, description, sock_net(&rx->sk), NULL);
+	key = request_key_net(&key_type_keyring, description, sock_net(&rx->sk),
+			      NULL, NULL);
 	if (IS_ERR(key)) {
 		kfree(description);
 		_leave(" = %ld", PTR_ERR(key));
@@ -978,7 +988,8 @@ int rxrpc_get_server_data_key(struct rxrpc_connection *conn,
 	_enter("");
 
 	key = key_alloc(&key_type_rxrpc, "x",
-			GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, cred, 0,
+			GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, cred,
+			&internal_key_acl,
 			KEY_ALLOC_NOT_IN_QUOTA, NULL);
 	if (IS_ERR(key)) {
 		_leave(" = -ENOMEM [alloc %ld]", PTR_ERR(key));
@@ -1026,7 +1037,7 @@ struct key *rxrpc_get_null_key(const char *keyname)
 
 	key = key_alloc(&key_type_rxrpc, keyname,
 			GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, cred,
-			KEY_POS_SEARCH, KEY_ALLOC_NOT_IN_QUOTA, NULL);
+			&rxrpc_null_key_acl, KEY_ALLOC_NOT_IN_QUOTA, NULL);
 	if (IS_ERR(key))
 		return key;
 
diff --git a/net/wireless/reg.c b/net/wireless/reg.c
index 4831ad745f91..298fe91557f7 100644
--- a/net/wireless/reg.c
+++ b/net/wireless/reg.c
@@ -741,8 +741,7 @@ static void __init load_keys_from_buffer(const u8 *p, unsigned int buflen)
 
 		key = key_create_or_update(make_key_ref(builtin_regdb_keys, 1),
 					   "asymmetric", NULL, p, plen,
-					   ((KEY_POS_ALL & ~KEY_POS_SETATTR) |
-					    KEY_USR_VIEW | KEY_USR_READ),
+					   &internal_key_acl, 
 					   KEY_ALLOC_NOT_IN_QUOTA |
 					   KEY_ALLOC_BUILT_IN |
 					   KEY_ALLOC_BYPASS_RESTRICTION);
@@ -768,8 +767,7 @@ static int __init load_builtin_regdb_keys(void)
 	builtin_regdb_keys =
 		keyring_alloc(".builtin_regdb_keys",
 			      KUIDT_INIT(0), KGIDT_INIT(0), current_cred(),
-			      ((KEY_POS_ALL & ~KEY_POS_SETATTR) |
-			      KEY_USR_VIEW | KEY_USR_READ | KEY_USR_SEARCH),
+			      &internal_keyring_acl, 
 			      KEY_ALLOC_NOT_IN_QUOTA, NULL, NULL);
 	if (IS_ERR(builtin_regdb_keys))
 		return PTR_ERR(builtin_regdb_keys);
diff --git a/security/integrity/digsig.c b/security/integrity/digsig.c
index e19c2eb72c51..3bd2cc28f4f5 100644
--- a/security/integrity/digsig.c
+++ b/security/integrity/digsig.c
@@ -51,7 +51,8 @@ int integrity_digsig_verify(const unsigned int id, const char *sig, int siglen,
 
 	if (!keyring[id]) {
 		keyring[id] =
-			request_key(&key_type_keyring, keyring_name[id], NULL);
+			request_key(&key_type_keyring, keyring_name[id],
+				    NULL, NULL);
 		if (IS_ERR(keyring[id])) {
 			int err = PTR_ERR(keyring[id]);
 			pr_err("no %s keyring: %d\n", keyring_name[id], err);
@@ -73,14 +74,14 @@ int integrity_digsig_verify(const unsigned int id, const char *sig, int siglen,
 	return -EOPNOTSUPP;
 }
 
-static int __integrity_init_keyring(const unsigned int id, key_perm_t perm,
+static int __integrity_init_keyring(const unsigned int id, struct key_acl *acl,
 				    struct key_restriction *restriction)
 {
 	const struct cred *cred = current_cred();
 	int err = 0;
 
 	keyring[id] = keyring_alloc(keyring_name[id], KUIDT_INIT(0),
-				    KGIDT_INIT(0), cred, perm,
+				    KGIDT_INIT(0), cred, acl,
 				    KEY_ALLOC_NOT_IN_QUOTA, restriction, NULL);
 	if (IS_ERR(keyring[id])) {
 		err = PTR_ERR(keyring[id]);
@@ -98,10 +99,7 @@ static int __integrity_init_keyring(const unsigned int id, key_perm_t perm,
 int __init integrity_init_keyring(const unsigned int id)
 {
 	struct key_restriction *restriction;
-	key_perm_t perm;
-
-	perm = (KEY_POS_ALL & ~KEY_POS_SETATTR) | KEY_USR_VIEW
-		| KEY_USR_READ | KEY_USR_SEARCH;
+	struct key_acl *acl = &internal_keyring_acl;
 
 	if (id == INTEGRITY_KEYRING_PLATFORM) {
 		restriction = NULL;
@@ -116,14 +114,14 @@ int __init integrity_init_keyring(const unsigned int id)
 		return -ENOMEM;
 
 	restriction->check = restrict_link_to_ima;
-	perm |= KEY_USR_WRITE;
+	acl = &internal_writable_keyring_acl;
 
 out:
-	return __integrity_init_keyring(id, perm, restriction);
+	return __integrity_init_keyring(id, acl, restriction);
 }
 
-int __init integrity_add_key(const unsigned int id, const void *data,
-			     off_t size, key_perm_t perm)
+static int __init integrity_add_key(const unsigned int id, const void *data,
+				    off_t size, struct key_acl *acl)
 {
 	key_ref_t key;
 	int rc = 0;
@@ -132,7 +130,7 @@ int __init integrity_add_key(const unsigned int id, const void *data,
 		return -EINVAL;
 
 	key = key_create_or_update(make_key_ref(keyring[id], 1), "asymmetric",
-				   NULL, data, size, perm,
+				   NULL, data, size, acl ?: &internal_key_acl,
 				   KEY_ALLOC_NOT_IN_QUOTA);
 	if (IS_ERR(key)) {
 		rc = PTR_ERR(key);
@@ -152,7 +150,6 @@ int __init integrity_load_x509(const unsigned int id, const char *path)
 	void *data;
 	loff_t size;
 	int rc;
-	key_perm_t perm;
 
 	rc = kernel_read_file_from_path(path, &data, &size, 0,
 					READING_X509_CERTIFICATE);
@@ -161,21 +158,19 @@ int __init integrity_load_x509(const unsigned int id, const char *path)
 		return rc;
 	}
 
-	perm = (KEY_POS_ALL & ~KEY_POS_SETATTR) | KEY_USR_VIEW | KEY_USR_READ;
-
 	pr_info("Loading X.509 certificate: %s\n", path);
-	rc = integrity_add_key(id, (const void *)data, size, perm);
+	rc = integrity_add_key(id, data, size, NULL);
 
 	vfree(data);
 	return rc;
 }
 
 int __init integrity_load_cert(const unsigned int id, const char *source,
-			       const void *data, size_t len, key_perm_t perm)
+			       const void *data, size_t len, struct key_acl *acl)
 {
 	if (!data)
 		return -EINVAL;
 
 	pr_info("Loading X.509 certificate: %s\n", source);
-	return integrity_add_key(id, data, len, perm);
+	return integrity_add_key(id, data, len, acl);
 }
diff --git a/security/integrity/digsig_asymmetric.c b/security/integrity/digsig_asymmetric.c
index 358f614811e8..a8bd8b2f4fce 100644
--- a/security/integrity/digsig_asymmetric.c
+++ b/security/integrity/digsig_asymmetric.c
@@ -57,7 +57,7 @@ static struct key *request_asymmetric_key(struct key *keyring, uint32_t keyid)
 		else
 			key = key_ref_to_ptr(kref);
 	} else {
-		key = request_key(&key_type_asymmetric, name, NULL);
+		key = request_key(&key_type_asymmetric, name, NULL, NULL);
 	}
 
 	if (IS_ERR(key)) {
diff --git a/security/integrity/evm/evm_crypto.c b/security/integrity/evm/evm_crypto.c
index e11564eb645b..304cb0b21f7a 100644
--- a/security/integrity/evm/evm_crypto.c
+++ b/security/integrity/evm/evm_crypto.c
@@ -356,7 +356,7 @@ int evm_init_key(void)
 	struct encrypted_key_payload *ekp;
 	int rc;
 
-	evm_key = request_key(&key_type_encrypted, EVMKEY, NULL);
+	evm_key = request_key(&key_type_encrypted, EVMKEY, NULL, NULL);
 	if (IS_ERR(evm_key))
 		return -ENOENT;
 
diff --git a/security/integrity/ima/ima_mok.c b/security/integrity/ima/ima_mok.c
index 073ddc9bce5b..ce48303cfacc 100644
--- a/security/integrity/ima/ima_mok.c
+++ b/security/integrity/ima/ima_mok.c
@@ -21,6 +21,15 @@
 #include <keys/system_keyring.h>
 
 
+static struct key_acl integrity_blacklist_keyring_acl = {
+	.usage	= REFCOUNT_INIT(1),
+	.nr_ace	= 2,
+	.aces = {
+		KEY_POSSESSOR_ACE(KEY_ACE_SEARCH | KEY_ACE_WRITE),
+		KEY_OWNER_ACE(KEY_ACE_VIEW | KEY_ACE_READ | KEY_ACE_WRITE | KEY_ACE_SEARCH),
+	}
+};
+
 struct key *ima_blacklist_keyring;
 
 /*
@@ -40,9 +49,7 @@ __init int ima_mok_init(void)
 
 	ima_blacklist_keyring = keyring_alloc(".ima_blacklist",
 				KUIDT_INIT(0), KGIDT_INIT(0), current_cred(),
-				(KEY_POS_ALL & ~KEY_POS_SETATTR) |
-				KEY_USR_VIEW | KEY_USR_READ |
-				KEY_USR_WRITE | KEY_USR_SEARCH,
+			        &integrity_blacklist_keyring_acl,
 				KEY_ALLOC_NOT_IN_QUOTA,
 				restriction, NULL);
 
diff --git a/security/integrity/integrity.h b/security/integrity/integrity.h
index 7de59f44cba3..1c50aff6f65a 100644
--- a/security/integrity/integrity.h
+++ b/security/integrity/integrity.h
@@ -17,6 +17,8 @@
 #include <linux/key.h>
 #include <linux/audit.h>
 
+struct key_acl;
+
 /* iint action cache flags */
 #define IMA_MEASURE		0x00000001
 #define IMA_MEASURED		0x00000002
@@ -154,7 +156,7 @@ int integrity_digsig_verify(const unsigned int id, const char *sig, int siglen,
 int __init integrity_init_keyring(const unsigned int id);
 int __init integrity_load_x509(const unsigned int id, const char *path);
 int __init integrity_load_cert(const unsigned int id, const char *source,
-			       const void *data, size_t len, key_perm_t perm);
+			       const void *data, size_t len, struct key_acl *acl);
 #else
 
 static inline int integrity_digsig_verify(const unsigned int id,
@@ -172,7 +174,7 @@ static inline int integrity_init_keyring(const unsigned int id)
 static inline int __init integrity_load_cert(const unsigned int id,
 					     const char *source,
 					     const void *data, size_t len,
-					     key_perm_t perm)
+					     struct key_acl *acl)
 {
 	return 0;
 }
diff --git a/security/integrity/platform_certs/platform_keyring.c b/security/integrity/platform_certs/platform_keyring.c
index bcafd7387729..7646e35f2d91 100644
--- a/security/integrity/platform_certs/platform_keyring.c
+++ b/security/integrity/platform_certs/platform_keyring.c
@@ -14,6 +14,15 @@
 #include <linux/slab.h>
 #include "../integrity.h"
 
+static struct key_acl platform_key_acl = {
+	.usage	= REFCOUNT_INIT(1),
+	.nr_ace	= 2,
+	.aces = {
+		KEY_POSSESSOR_ACE(KEY_ACE_SEARCH | KEY_ACE_READ),
+		KEY_OWNER_ACE(KEY_ACE_VIEW),
+	}
+};
+
 /**
  * add_to_platform_keyring - Add to platform keyring without validation.
  * @source: Source of key
@@ -26,13 +35,10 @@
 void __init add_to_platform_keyring(const char *source, const void *data,
 				    size_t len)
 {
-	key_perm_t perm;
 	int rc;
 
-	perm = (KEY_POS_ALL & ~KEY_POS_SETATTR) | KEY_USR_VIEW;
-
 	rc = integrity_load_cert(INTEGRITY_KEYRING_PLATFORM, source, data, len,
-				 perm);
+				 &platform_key_acl);
 	if (rc)
 		pr_info("Error adding keys to platform keyring %s\n", source);
 }
diff --git a/security/keys/encrypted-keys/encrypted.c b/security/keys/encrypted-keys/encrypted.c
index 1b1456b21a93..dc76c60a27a6 100644
--- a/security/keys/encrypted-keys/encrypted.c
+++ b/security/keys/encrypted-keys/encrypted.c
@@ -307,7 +307,7 @@ static struct key *request_user_key(const char *master_desc, const u8 **master_k
 	const struct user_key_payload *upayload;
 	struct key *ukey;
 
-	ukey = request_key(&key_type_user, master_desc, NULL);
+	ukey = request_key(&key_type_user, master_desc, NULL, NULL);
 	if (IS_ERR(ukey))
 		goto error;
 
diff --git a/security/keys/encrypted-keys/masterkey_trusted.c b/security/keys/encrypted-keys/masterkey_trusted.c
index dc3d18cae642..3322e7eeafce 100644
--- a/security/keys/encrypted-keys/masterkey_trusted.c
+++ b/security/keys/encrypted-keys/masterkey_trusted.c
@@ -33,7 +33,7 @@ struct key *request_trusted_key(const char *trusted_desc,
 	struct trusted_key_payload *tpayload;
 	struct key *tkey;
 
-	tkey = request_key(&key_type_trusted, trusted_desc, NULL);
+	tkey = request_key(&key_type_trusted, trusted_desc, NULL, NULL);
 	if (IS_ERR(tkey))
 		goto error;
 
diff --git a/security/keys/gc.c b/security/keys/gc.c
index 83d279fb7793..3b13fb62827f 100644
--- a/security/keys/gc.c
+++ b/security/keys/gc.c
@@ -155,6 +155,7 @@ static noinline void key_gc_unused_keys(struct list_head *keys)
 
 		key_user_put(key->user);
 		key_put_tag(key->domain_tag);
+		key_put_acl(rcu_access_pointer(key->acl));
 		kfree(key->description);
 
 		memzero_explicit(key, sizeof(*key));
@@ -224,7 +225,6 @@ continue_scanning:
 			if (key->type == key_gc_dead_keytype) {
 				gc_state |= KEY_GC_FOUND_DEAD_KEY;
 				set_bit(KEY_FLAG_DEAD, &key->flags);
-				key->perm = 0;
 				goto skip_dead_key;
 			} else if (key->type == &key_type_keyring &&
 				   key->restrict_link) {
diff --git a/security/keys/internal.h b/security/keys/internal.h
index f1f2b076f3a1..9375d6289bb9 100644
--- a/security/keys/internal.h
+++ b/security/keys/internal.h
@@ -88,8 +88,11 @@ extern struct rb_root key_serial_tree;
 extern spinlock_t key_serial_lock;
 extern struct mutex key_construction_mutex;
 extern wait_queue_head_t request_key_conswq;
+extern struct key_acl default_key_acl;
+extern struct key_acl joinable_keyring_acl;
 
 extern void key_set_index_key(struct keyring_index_key *index_key);
+
 extern struct key_type *key_type_lookup(const char *type);
 extern void key_type_put(struct key_type *ktype);
 
@@ -160,6 +163,7 @@ extern struct key *request_key_and_link(struct key_type *type,
 					const void *callout_info,
 					size_t callout_len,
 					void *aux,
+					struct key_acl *acl,
 					struct key *dest_keyring,
 					unsigned long flags);
 
@@ -183,7 +187,10 @@ extern void key_gc_keytype(struct key_type *ktype);
 
 extern int key_task_permission(const key_ref_t key_ref,
 			       const struct cred *cred,
-			       key_perm_t perm);
+			       u32 desired_perm);
+extern unsigned int key_acl_to_perm(const struct key_acl *acl);
+extern long key_set_acl(struct key *key, struct key_acl *acl);
+extern void key_put_acl(struct key_acl *acl);
 
 /*
  * Check to see whether permission is granted to use a key in the desired way.
@@ -230,7 +237,7 @@ extern long keyctl_keyring_search(key_serial_t, const char __user *,
 				  const char __user *, key_serial_t);
 extern long keyctl_read_key(key_serial_t, char __user *, size_t);
 extern long keyctl_chown_key(key_serial_t, uid_t, gid_t);
-extern long keyctl_setperm_key(key_serial_t, key_perm_t);
+extern long keyctl_setperm_key(key_serial_t, unsigned int);
 extern long keyctl_instantiate_key(key_serial_t, const void __user *,
 				   size_t, key_serial_t);
 extern long keyctl_negate_key(key_serial_t, unsigned, key_serial_t);
diff --git a/security/keys/key.c b/security/keys/key.c
index 85fdc2ea6c14..bb96d6235ea2 100644
--- a/security/keys/key.c
+++ b/security/keys/key.c
@@ -199,7 +199,7 @@ serial_exists:
  * @uid: The owner of the new key.
  * @gid: The group ID for the new key's group permissions.
  * @cred: The credentials specifying UID namespace.
- * @perm: The permissions mask of the new key.
+ * @acl: The ACL to attach to the new key.
  * @flags: Flags specifying quota properties.
  * @restrict_link: Optional link restriction for new keyrings.
  *
@@ -227,7 +227,7 @@ serial_exists:
  */
 struct key *key_alloc(struct key_type *type, const char *desc,
 		      kuid_t uid, kgid_t gid, const struct cred *cred,
-		      key_perm_t perm, unsigned long flags,
+		      struct key_acl *acl, unsigned long flags,
 		      struct key_restriction *restrict_link)
 {
 	struct key_user *user = NULL;
@@ -250,6 +250,9 @@ struct key *key_alloc(struct key_type *type, const char *desc,
 	desclen = strlen(desc);
 	quotalen = desclen + 1 + type->def_datalen;
 
+	if (!acl)
+		acl = &default_key_acl;
+
 	/* get hold of the key tracking for this user */
 	user = key_user_lookup(uid);
 	if (!user)
@@ -296,7 +299,8 @@ struct key *key_alloc(struct key_type *type, const char *desc,
 	key->datalen = type->def_datalen;
 	key->uid = uid;
 	key->gid = gid;
-	key->perm = perm;
+	refcount_inc(&acl->usage);
+	rcu_assign_pointer(key->acl, acl);
 	key->restrict_link = restrict_link;
 	key->last_used_at = ktime_get_real_seconds();
 
@@ -791,7 +795,7 @@ error:
  * @description: The searchable description for the key.
  * @payload: The data to use to instantiate or update the key.
  * @plen: The length of @payload.
- * @perm: The permissions mask for a new key.
+ * @acl: The ACL to attach if a key is created.
  * @flags: The quota flags for a new key.
  *
  * Search the destination keyring for a key of the same description and if one
@@ -814,7 +818,7 @@ key_ref_t key_create_or_update(key_ref_t keyring_ref,
 			       const char *description,
 			       const void *payload,
 			       size_t plen,
-			       key_perm_t perm,
+			       struct key_acl *acl,
 			       unsigned long flags)
 {
 	struct keyring_index_key index_key = {
@@ -911,22 +915,9 @@ key_ref_t key_create_or_update(key_ref_t keyring_ref,
 			goto found_matching_key;
 	}
 
-	/* if the client doesn't provide, decide on the permissions we want */
-	if (perm == KEY_PERM_UNDEF) {
-		perm = KEY_POS_VIEW | KEY_POS_SEARCH | KEY_POS_LINK | KEY_POS_SETATTR;
-		perm |= KEY_USR_VIEW;
-
-		if (index_key.type->read)
-			perm |= KEY_POS_READ;
-
-		if (index_key.type == &key_type_keyring ||
-		    index_key.type->update)
-			perm |= KEY_POS_WRITE;
-	}
-
 	/* allocate a new key */
 	key = key_alloc(index_key.type, index_key.description,
-			cred->fsuid, cred->fsgid, cred, perm, flags, NULL);
+			cred->fsuid, cred->fsgid, cred, acl, flags, NULL);
 	if (IS_ERR(key)) {
 		key_ref = ERR_CAST(key);
 		goto error_link_end;
diff --git a/security/keys/keyctl.c b/security/keys/keyctl.c
index d2f8eabcbcf4..c8911b430e59 100644
--- a/security/keys/keyctl.c
+++ b/security/keys/keyctl.c
@@ -134,8 +134,7 @@ SYSCALL_DEFINE5(add_key, const char __user *, _type,
 	/* create or update the requested key and add it to the target
 	 * keyring */
 	key_ref = key_create_or_update(keyring_ref, type, description,
-				       payload, plen, KEY_PERM_UNDEF,
-				       KEY_ALLOC_IN_QUOTA);
+				       payload, plen, NULL, KEY_ALLOC_IN_QUOTA);
 	if (!IS_ERR(key_ref)) {
 		ret = key_ref_to_ptr(key_ref)->serial;
 		key_ref_put(key_ref);
@@ -225,7 +224,8 @@ SYSCALL_DEFINE4(request_key, const char __user *, _type,
 
 	/* do the search */
 	key = request_key_and_link(ktype, description, NULL, callout_info,
-				   callout_len, NULL, key_ref_to_ptr(dest_ref),
+				   callout_len, NULL, NULL,
+				   key_ref_to_ptr(dest_ref),
 				   KEY_ALLOC_IN_QUOTA);
 	if (IS_ERR(key)) {
 		ret = PTR_ERR(key);
@@ -387,16 +387,10 @@ long keyctl_revoke_key(key_serial_t id)
 	struct key *key;
 	long ret;
 
-	key_ref = lookup_user_key(id, 0, KEY_NEED_WRITE);
+	key_ref = lookup_user_key(id, 0, KEY_NEED_REVOKE);
 	if (IS_ERR(key_ref)) {
 		ret = PTR_ERR(key_ref);
-		if (ret != -EACCES)
-			goto error;
-		key_ref = lookup_user_key(id, 0, KEY_NEED_SETATTR);
-		if (IS_ERR(key_ref)) {
-			ret = PTR_ERR(key_ref);
-			goto error;
-		}
+		goto error;
 	}
 
 	key = key_ref_to_ptr(key_ref);
@@ -430,7 +424,7 @@ long keyctl_invalidate_key(key_serial_t id)
 
 	kenter("%d", id);
 
-	key_ref = lookup_user_key(id, 0, KEY_NEED_SEARCH);
+	key_ref = lookup_user_key(id, 0, KEY_NEED_INVAL);
 	if (IS_ERR(key_ref)) {
 		ret = PTR_ERR(key_ref);
 
@@ -475,7 +469,7 @@ long keyctl_keyring_clear(key_serial_t ringid)
 	struct key *keyring;
 	long ret;
 
-	keyring_ref = lookup_user_key(ringid, KEY_LOOKUP_CREATE, KEY_NEED_WRITE);
+	keyring_ref = lookup_user_key(ringid, KEY_LOOKUP_CREATE, KEY_NEED_CLEAR);
 	if (IS_ERR(keyring_ref)) {
 		ret = PTR_ERR(keyring_ref);
 
@@ -650,6 +644,7 @@ long keyctl_describe_key(key_serial_t keyid,
 			 size_t buflen)
 {
 	struct key *key, *instkey;
+	unsigned int perm;
 	key_ref_t key_ref;
 	char *infobuf;
 	long ret;
@@ -679,6 +674,10 @@ okay:
 	key = key_ref_to_ptr(key_ref);
 	desclen = strlen(key->description);
 
+	rcu_read_lock();
+	perm = key_acl_to_perm(rcu_dereference(key->acl));
+	rcu_read_unlock();
+
 	/* calculate how much information we're going to return */
 	ret = -ENOMEM;
 	infobuf = kasprintf(GFP_KERNEL,
@@ -686,7 +685,7 @@ okay:
 			    key->type->name,
 			    from_kuid_munged(current_user_ns(), key->uid),
 			    from_kgid_munged(current_user_ns(), key->gid),
-			    key->perm);
+			    perm);
 	if (!infobuf)
 		goto error2;
 	infolen = strlen(infobuf);
@@ -903,7 +902,7 @@ long keyctl_chown_key(key_serial_t id, uid_t user, gid_t group)
 		goto error;
 
 	key_ref = lookup_user_key(id, KEY_LOOKUP_CREATE | KEY_LOOKUP_PARTIAL,
-				  KEY_NEED_SETATTR);
+				  KEY_NEED_SETSEC);
 	if (IS_ERR(key_ref)) {
 		ret = PTR_ERR(key_ref);
 		goto error;
@@ -998,18 +997,25 @@ quota_overrun:
  * the key need not be fully instantiated yet.  If the caller does not have
  * sysadmin capability, it may only change the permission on keys that it owns.
  */
-long keyctl_setperm_key(key_serial_t id, key_perm_t perm)
+long keyctl_setperm_key(key_serial_t id, unsigned int perm)
 {
+	struct key_acl *acl;
 	struct key *key;
 	key_ref_t key_ref;
 	long ret;
+	int nr, i, j;
 
-	ret = -EINVAL;
 	if (perm & ~(KEY_POS_ALL | KEY_USR_ALL | KEY_GRP_ALL | KEY_OTH_ALL))
-		goto error;
+		return -EINVAL;
+
+	nr = 0;
+	if (perm & KEY_POS_ALL) nr++;
+	if (perm & KEY_USR_ALL) nr++;
+	if (perm & KEY_GRP_ALL) nr++;
+	if (perm & KEY_OTH_ALL) nr++;
 
 	key_ref = lookup_user_key(id, KEY_LOOKUP_CREATE | KEY_LOOKUP_PARTIAL,
-				  KEY_NEED_SETATTR);
+				  KEY_NEED_SETSEC);
 	if (IS_ERR(key_ref)) {
 		ret = PTR_ERR(key_ref);
 		goto error;
@@ -1017,17 +1023,45 @@ long keyctl_setperm_key(key_serial_t id, key_perm_t perm)
 
 	key = key_ref_to_ptr(key_ref);
 
-	/* make the changes with the locks held to prevent chown/chmod races */
-	ret = -EACCES;
-	down_write(&key->sem);
+	ret = -EOPNOTSUPP;
+	if (test_bit(KEY_FLAG_HAS_ACL, &key->flags))
+		goto error_key;
 
-	/* if we're not the sysadmin, we can only change a key that we own */
-	if (capable(CAP_SYS_ADMIN) || uid_eq(key->uid, current_fsuid())) {
-		key->perm = perm;
-		ret = 0;
+	ret = -ENOMEM;
+	acl = kzalloc(struct_size(acl, aces, nr), GFP_KERNEL);
+	if (!acl)
+		goto error_key;
+
+	refcount_set(&acl->usage, 1);
+	acl->nr_ace = nr;
+	j = 0;
+	for (i = 0; i < 4; i++) {
+		struct key_ace *ace = &acl->aces[j];
+		unsigned int subset = (perm >> (i * 8)) & KEY_OTH_ALL;
+
+		if (!subset)
+			continue;
+		ace->type = KEY_ACE_SUBJ_STANDARD;
+		ace->subject_id = KEY_ACE_EVERYONE + i;
+		ace->perm = subset;
+		if (subset & (KEY_OTH_WRITE | KEY_OTH_SETATTR))
+			ace->perm |= KEY_ACE_REVOKE;
+		if (subset & KEY_OTH_SEARCH)
+			ace->perm |= KEY_ACE_INVAL;
+		if (key->type == &key_type_keyring) {
+			if (subset & KEY_OTH_SEARCH)
+				ace->perm |= KEY_ACE_JOIN;
+			if (subset & KEY_OTH_WRITE)
+				ace->perm |= KEY_ACE_CLEAR;
+		}
+		j++;
 	}
 
+	/* make the changes with the locks held to prevent chown/chmod races */
+	down_write(&key->sem);
+	ret = key_set_acl(key, acl);
 	up_write(&key->sem);
+error_key:
 	key_put(key);
 error:
 	return ret;
@@ -1392,7 +1426,7 @@ long keyctl_set_timeout(key_serial_t id, unsigned timeout)
 	long ret;
 
 	key_ref = lookup_user_key(id, KEY_LOOKUP_CREATE | KEY_LOOKUP_PARTIAL,
-				  KEY_NEED_SETATTR);
+				  KEY_NEED_SETSEC);
 	if (IS_ERR(key_ref)) {
 		/* setting the timeout on a key under construction is permitted
 		 * if we have the authorisation token handy */
@@ -1543,7 +1577,7 @@ long keyctl_get_security(key_serial_t keyid,
  * Attempt to install the calling process's session keyring on the process's
  * parent process.
  *
- * The keyring must exist and must grant the caller LINK permission, and the
+ * The keyring must exist and must grant the caller JOIN permission, and the
  * parent process must be single-threaded and must have the same effective
  * ownership as this process and mustn't be SUID/SGID.
  *
@@ -1560,7 +1594,7 @@ long keyctl_session_to_parent(void)
 	struct cred *cred;
 	int ret;
 
-	keyring_r = lookup_user_key(KEY_SPEC_SESSION_KEYRING, 0, KEY_NEED_LINK);
+	keyring_r = lookup_user_key(KEY_SPEC_SESSION_KEYRING, 0, KEY_NEED_JOIN);
 	if (IS_ERR(keyring_r))
 		return PTR_ERR(keyring_r);
 
@@ -1662,7 +1696,7 @@ long keyctl_restrict_keyring(key_serial_t id, const char __user *_type,
 	char *restriction = NULL;
 	long ret;
 
-	key_ref = lookup_user_key(id, 0, KEY_NEED_SETATTR);
+	key_ref = lookup_user_key(id, 0, KEY_NEED_SETSEC);
 	if (IS_ERR(key_ref))
 		return PTR_ERR(key_ref);
 
@@ -1768,7 +1802,7 @@ SYSCALL_DEFINE5(keyctl, int, option, unsigned long, arg2, unsigned long, arg3,
 
 	case KEYCTL_SETPERM:
 		return keyctl_setperm_key((key_serial_t) arg2,
-					  (key_perm_t) arg3);
+					  (unsigned int)arg3);
 
 	case KEYCTL_INSTANTIATE:
 		return keyctl_instantiate_key((key_serial_t) arg2,
diff --git a/security/keys/keyring.c b/security/keys/keyring.c
index 29c31585ed61..62fb26c61968 100644
--- a/security/keys/keyring.c
+++ b/security/keys/keyring.c
@@ -519,11 +519,19 @@ static long keyring_read(const struct key *keyring,
 	return ret;
 }
 
-/*
- * Allocate a keyring and link into the destination keyring.
+/**
+ * keyring_alloc - Allocate a keyring and link into the destination
+ * @description: The key description to allow the key to be searched out.
+ * @uid: The owner of the new key.
+ * @gid: The group ID for the new key's group permissions.
+ * @cred: The credentials specifying UID namespace.
+ * @acl: The ACL to attach to the new key.
+ * @flags: Flags specifying quota properties.
+ * @restrict_link: Optional link restriction for new keyrings.
+ * @dest: Destination keyring.
  */
 struct key *keyring_alloc(const char *description, kuid_t uid, kgid_t gid,
-			  const struct cred *cred, key_perm_t perm,
+			  const struct cred *cred, struct key_acl *acl,
 			  unsigned long flags,
 			  struct key_restriction *restrict_link,
 			  struct key *dest)
@@ -532,7 +540,7 @@ struct key *keyring_alloc(const char *description, kuid_t uid, kgid_t gid,
 	int ret;
 
 	keyring = key_alloc(&key_type_keyring, description,
-			    uid, gid, cred, perm, flags, restrict_link);
+			    uid, gid, cred, acl, flags, restrict_link);
 	if (!IS_ERR(keyring)) {
 		ret = key_instantiate_and_link(keyring, NULL, 0, dest, NULL);
 		if (ret < 0) {
@@ -1136,10 +1144,11 @@ found:
 /*
  * Find a keyring with the specified name.
  *
- * Only keyrings that have nonzero refcount, are not revoked, and are owned by a
- * user in the current user namespace are considered.  If @uid_keyring is %true,
- * the keyring additionally must have been allocated as a user or user session
- * keyring; otherwise, it must grant Search permission directly to the caller.
+ * Only keyrings that have nonzero refcount, are not revoked, and are owned by
+ * a user in the current user namespace are considered.  If @uid_keyring is
+ * %true, the keyring additionally must have been allocated as a user or user
+ * session keyring; otherwise, it must grant JOIN permission directly to the
+ * caller (i.e. not through possession).
  *
  * Returns a pointer to the keyring with the keyring's refcount having being
  * incremented on success.  -ENOKEY is returned if a key could not be found.
@@ -1173,7 +1182,7 @@ struct key *find_keyring_by_name(const char *name, bool uid_keyring)
 				continue;
 		} else {
 			if (key_permission(make_key_ref(keyring, 0),
-					   KEY_NEED_SEARCH) < 0)
+					   KEY_NEED_JOIN) < 0)
 				continue;
 		}
 
diff --git a/security/keys/permission.c b/security/keys/permission.c
index 06df9d5e7572..e3237bb2e970 100644
--- a/security/keys/permission.c
+++ b/security/keys/permission.c
@@ -11,13 +11,67 @@
 
 #include <linux/export.h>
 #include <linux/security.h>
+#include <linux/user_namespace.h>
+#include <linux/uaccess.h>
 #include "internal.h"
 
+struct key_acl default_key_acl = {
+	.usage	= REFCOUNT_INIT(1),
+	.nr_ace	= 2,
+	.possessor_viewable = true,
+	.aces = {
+		KEY_POSSESSOR_ACE(KEY_ACE__PERMS & ~KEY_ACE_JOIN),
+		KEY_OWNER_ACE(KEY_ACE_VIEW),
+	}
+};
+EXPORT_SYMBOL(default_key_acl);
+
+struct key_acl joinable_keyring_acl = {
+	.usage	= REFCOUNT_INIT(1),
+	.nr_ace	= 2,
+	.possessor_viewable = true,
+	.aces	= {
+		KEY_POSSESSOR_ACE(KEY_ACE__PERMS & ~KEY_ACE_JOIN),
+		KEY_OWNER_ACE(KEY_ACE_VIEW | KEY_ACE_READ | KEY_ACE_LINK | KEY_ACE_JOIN),
+	}
+};
+EXPORT_SYMBOL(joinable_keyring_acl);
+
+struct key_acl internal_key_acl = {
+	.usage	= REFCOUNT_INIT(1),
+	.nr_ace	= 2,
+	.aces = {
+		KEY_POSSESSOR_ACE(KEY_ACE_SEARCH),
+		KEY_OWNER_ACE(KEY_ACE_VIEW | KEY_ACE_READ | KEY_ACE_SEARCH),
+	}
+};
+EXPORT_SYMBOL(internal_key_acl);
+
+struct key_acl internal_keyring_acl = {
+	.usage	= REFCOUNT_INIT(1),
+	.nr_ace	= 2,
+	.aces = {
+		KEY_POSSESSOR_ACE(KEY_ACE_SEARCH),
+		KEY_OWNER_ACE(KEY_ACE_VIEW | KEY_ACE_READ | KEY_ACE_SEARCH),
+	}
+};
+EXPORT_SYMBOL(internal_keyring_acl);
+
+struct key_acl internal_writable_keyring_acl = {
+	.usage	= REFCOUNT_INIT(1),
+	.nr_ace	= 2,
+	.aces = {
+		KEY_POSSESSOR_ACE(KEY_ACE_SEARCH | KEY_ACE_WRITE),
+		KEY_OWNER_ACE(KEY_ACE_VIEW | KEY_ACE_READ | KEY_ACE_WRITE | KEY_ACE_SEARCH),
+	}
+};
+EXPORT_SYMBOL(internal_writable_keyring_acl);
+
 /**
  * key_task_permission - Check a key can be used
  * @key_ref: The key to check.
  * @cred: The credentials to use.
- * @perm: The permissions to check for.
+ * @desired_perm: The permission to check for.
  *
  * Check to see whether permission is granted to use a key in the desired way,
  * but permit the security modules to override.
@@ -28,53 +82,73 @@
  * permissions bits or the LSM check.
  */
 int key_task_permission(const key_ref_t key_ref, const struct cred *cred,
-			unsigned perm)
+			unsigned int desired_perm)
 {
-	struct key *key;
-	key_perm_t kperm;
-	int ret;
+	const struct key_acl *acl;
+	const struct key *key;
+	unsigned int allow = 0;
+	int i;
+
+	BUILD_BUG_ON(KEY_NEED_VIEW	!= KEY_ACE_VIEW		||
+		     KEY_NEED_READ	!= KEY_ACE_READ		||
+		     KEY_NEED_WRITE	!= KEY_ACE_WRITE	||
+		     KEY_NEED_SEARCH	!= KEY_ACE_SEARCH	||
+		     KEY_NEED_LINK	!= KEY_ACE_LINK		||
+		     KEY_NEED_SETSEC	!= KEY_ACE_SET_SECURITY	||
+		     KEY_NEED_INVAL	!= KEY_ACE_INVAL	||
+		     KEY_NEED_REVOKE	!= KEY_ACE_REVOKE	||
+		     KEY_NEED_JOIN	!= KEY_ACE_JOIN		||
+		     KEY_NEED_CLEAR	!= KEY_ACE_CLEAR);
 
 	key = key_ref_to_ptr(key_ref);
 
-	/* use the second 8-bits of permissions for keys the caller owns */
-	if (uid_eq(key->uid, cred->fsuid)) {
-		kperm = key->perm >> 16;
-		goto use_these_perms;
-	}
+	rcu_read_lock();
 
-	/* use the third 8-bits of permissions for keys the caller has a group
-	 * membership in common with */
-	if (gid_valid(key->gid) && key->perm & KEY_GRP_ALL) {
-		if (gid_eq(key->gid, cred->fsgid)) {
-			kperm = key->perm >> 8;
-			goto use_these_perms;
-		}
+	acl = rcu_dereference(key->acl);
+	if (!acl || acl->nr_ace == 0)
+		goto no_access_rcu;
+
+	for (i = 0; i < acl->nr_ace; i++) {
+		const struct key_ace *ace = &acl->aces[i];
 
-		ret = groups_search(cred->group_info, key->gid);
-		if (ret) {
-			kperm = key->perm >> 8;
-			goto use_these_perms;
+		switch (ace->type) {
+		case KEY_ACE_SUBJ_STANDARD:
+			switch (ace->subject_id) {
+			case KEY_ACE_POSSESSOR:
+				if (is_key_possessed(key_ref))
+					allow |= ace->perm;
+				break;
+			case KEY_ACE_OWNER:
+				if (uid_eq(key->uid, cred->fsuid))
+					allow |= ace->perm;
+				break;
+			case KEY_ACE_GROUP:
+				if (gid_valid(key->gid)) {
+					if (gid_eq(key->gid, cred->fsgid))
+						allow |= ace->perm;
+					else if (groups_search(cred->group_info, key->gid))
+						allow |= ace->perm;
+				}
+				break;
+			case KEY_ACE_EVERYONE:
+				allow |= ace->perm;
+				break;
+			}
+			break;
 		}
 	}
 
-	/* otherwise use the least-significant 8-bits */
-	kperm = key->perm;
-
-use_these_perms:
+	rcu_read_unlock();
 
-	/* use the top 8-bits of permissions for keys the caller possesses
-	 * - possessor permissions are additive with other permissions
-	 */
-	if (is_key_possessed(key_ref))
-		kperm |= key->perm >> 24;
+	if (!(allow & desired_perm))
+		goto no_access;
 
-	kperm = kperm & perm & KEY_NEED_ALL;
+	return security_key_permission(key_ref, cred, desired_perm);
 
-	if (kperm != perm)
-		return -EACCES;
-
-	/* let LSM be the final arbiter */
-	return security_key_permission(key_ref, cred, perm);
+no_access_rcu:
+	rcu_read_unlock();
+no_access:
+	return -EACCES;
 }
 EXPORT_SYMBOL(key_task_permission);
 
@@ -108,3 +182,99 @@ int key_validate(const struct key *key)
 	return 0;
 }
 EXPORT_SYMBOL(key_validate);
+
+/*
+ * Roughly render an ACL to an old-style permissions mask.  We cannot
+ * accurately render the ACL, particularly if it has ACEs that represent
+ * subjects outside of { poss, user, group, other }.
+ */
+unsigned int key_acl_to_perm(const struct key_acl *acl)
+{
+	unsigned int perm = 0, tperm;
+	int i;
+
+	BUILD_BUG_ON(KEY_OTH_VIEW	!= KEY_ACE_VIEW		||
+		     KEY_OTH_READ	!= KEY_ACE_READ		||
+		     KEY_OTH_WRITE	!= KEY_ACE_WRITE	||
+		     KEY_OTH_SEARCH	!= KEY_ACE_SEARCH	||
+		     KEY_OTH_LINK	!= KEY_ACE_LINK		||
+		     KEY_OTH_SETATTR	!= KEY_ACE_SET_SECURITY);
+
+	if (!acl || acl->nr_ace == 0)
+		return 0;
+
+	for (i = 0; i < acl->nr_ace; i++) {
+		const struct key_ace *ace = &acl->aces[i];
+
+		switch (ace->type) {
+		case KEY_ACE_SUBJ_STANDARD:
+			tperm = ace->perm & KEY_OTH_ALL;
+
+			/* Invalidation and joining were allowed by SEARCH */
+			if (ace->perm & (KEY_ACE_INVAL | KEY_ACE_JOIN))
+				tperm |= KEY_OTH_SEARCH;
+
+			/* Revocation was allowed by either SETATTR or WRITE */
+			if ((ace->perm & KEY_ACE_REVOKE) && !(tperm & KEY_OTH_SETATTR))
+				tperm |= KEY_OTH_WRITE;
+
+			/* Clearing was allowed by WRITE */
+			if (ace->perm & KEY_ACE_CLEAR)
+				tperm |= KEY_OTH_WRITE;
+
+			switch (ace->subject_id) {
+			case KEY_ACE_POSSESSOR:
+				perm |= tperm << 24;
+				break;
+			case KEY_ACE_OWNER:
+				perm |= tperm << 16;
+				break;
+			case KEY_ACE_GROUP:
+				perm |= tperm << 8;
+				break;
+			case KEY_ACE_EVERYONE:
+				perm |= tperm << 0;
+				break;
+			}
+		}
+	}
+
+	return perm;
+}
+
+/*
+ * Drop a reference on an ACL; the last put frees it via kfree_rcu().
+ */
+void key_put_acl(struct key_acl *acl)
+{
+	if (acl && refcount_dec_and_test(&acl->usage))
+		kfree_rcu(acl, rcu);
+}
+
+/*
+ * Try to set the ACL.  This either attaches or discards the proposed ACL.
+ */
+long key_set_acl(struct key *key, struct key_acl *acl)
+{
+	int i;
+
+	/* If we're not the sysadmin, we can only change a key that we own. */
+	if (!capable(CAP_SYS_ADMIN) && !uid_eq(key->uid, current_fsuid())) {
+		key_put_acl(acl);
+		return -EACCES;
+	}
+
+	for (i = 0; i < acl->nr_ace; i++) {
+		const struct key_ace *ace = &acl->aces[i];
+		if (ace->type == KEY_ACE_SUBJ_STANDARD &&
+		    ace->subject_id == KEY_ACE_POSSESSOR) {
+			if (ace->perm & KEY_ACE_VIEW)
+				acl->possessor_viewable = true;
+			break;
+		}
+	}
+
+	rcu_swap_protected(key->acl, acl, lockdep_is_held(&key->sem));
+	key_put_acl(acl);
+	return 0;
+}
diff --git a/security/keys/persistent.c b/security/keys/persistent.c
index 9944d855a28d..c4c480f630ea 100644
--- a/security/keys/persistent.c
+++ b/security/keys/persistent.c
@@ -16,6 +16,27 @@
 
 unsigned persistent_keyring_expiry = 3 * 24 * 3600; /* Expire after 3 days of non-use */
 
+static struct key_acl persistent_register_keyring_acl = {
+	.usage	= REFCOUNT_INIT(1),
+	.nr_ace	= 2,
+	.aces = {
+		KEY_POSSESSOR_ACE(KEY_ACE_SEARCH | KEY_ACE_WRITE),
+		KEY_OWNER_ACE(KEY_ACE_VIEW | KEY_ACE_READ),
+	}
+};
+
+static struct key_acl persistent_keyring_acl = {
+	.usage	= REFCOUNT_INIT(1),
+	.nr_ace	= 2,
+	.possessor_viewable = true,
+	.aces = {
+		KEY_POSSESSOR_ACE(KEY_ACE_VIEW | KEY_ACE_READ | KEY_ACE_WRITE |
+				  KEY_ACE_SEARCH | KEY_ACE_LINK |
+				  KEY_ACE_CLEAR | KEY_ACE_INVAL),
+		KEY_OWNER_ACE(KEY_ACE_VIEW | KEY_ACE_READ),
+	}
+};
+
 /*
  * Create the persistent keyring register for the current user namespace.
  *
@@ -26,8 +47,7 @@ static int key_create_persistent_register(struct user_namespace *ns)
 	struct key *reg = keyring_alloc(".persistent_register",
 					KUIDT_INIT(0), KGIDT_INIT(0),
 					current_cred(),
-					((KEY_POS_ALL & ~KEY_POS_SETATTR) |
-					 KEY_USR_VIEW | KEY_USR_READ),
+					&persistent_register_keyring_acl,
 					KEY_ALLOC_NOT_IN_QUOTA, NULL, NULL);
 	if (IS_ERR(reg))
 		return PTR_ERR(reg);
@@ -60,8 +80,7 @@ static key_ref_t key_create_persistent(struct user_namespace *ns, kuid_t uid,
 
 	persistent = keyring_alloc(index_key->description,
 				   uid, INVALID_GID, current_cred(),
-				   ((KEY_POS_ALL & ~KEY_POS_SETATTR) |
-				    KEY_USR_VIEW | KEY_USR_READ),
+				   &persistent_keyring_acl,
 				   KEY_ALLOC_NOT_IN_QUOTA, NULL,
 				   ns->persistent_keyring_register);
 	if (IS_ERR(persistent))
diff --git a/security/keys/proc.c b/security/keys/proc.c
index b4f5ba56b9cb..0056fe2dc39b 100644
--- a/security/keys/proc.c
+++ b/security/keys/proc.c
@@ -114,11 +114,13 @@ static struct key *find_ge_key(struct seq_file *p, key_serial_t id)
 }
 
 static void *proc_keys_start(struct seq_file *p, loff_t *_pos)
+	__acquires(rcu)
 	__acquires(key_serial_lock)
 {
 	key_serial_t pos = *_pos;
 	struct key *key;
 
+	rcu_read_lock();
 	spin_lock(&key_serial_lock);
 
 	if (*_pos > INT_MAX)
@@ -148,12 +150,15 @@ static void *proc_keys_next(struct seq_file *p, void *v, loff_t *_pos)
 
 static void proc_keys_stop(struct seq_file *p, void *v)
 	__releases(key_serial_lock)
+	__releases(rcu)
 {
 	spin_unlock(&key_serial_lock);
+	rcu_read_unlock();
 }
 
 static int proc_keys_show(struct seq_file *m, void *v)
 {
+	const struct key_acl *acl;
 	struct rb_node *_p = v;
 	struct key *key = rb_entry(_p, struct key, serial_node);
 	unsigned long flags;
@@ -161,6 +166,7 @@ static int proc_keys_show(struct seq_file *m, void *v)
 	time64_t now, expiry;
 	char xbuf[16];
 	short state;
+	bool check_pos;
 	u64 timo;
 	int rc;
 
@@ -174,15 +180,15 @@ static int proc_keys_show(struct seq_file *m, void *v)
 					   KEYRING_SEARCH_RECURSE),
 	};
 
-	key_ref = make_key_ref(key, 0);
+	acl = rcu_dereference(key->acl);
+	check_pos = acl->possessor_viewable;
 
 	/* determine if the key is possessed by this process (a test we can
 	 * skip if the key does not indicate the possessor can view it
 	 */
-	if (key->perm & KEY_POS_VIEW) {
-		rcu_read_lock();
+	key_ref = make_key_ref(key, 0);
+	if (check_pos) {
 		skey_ref = search_cred_keyrings_rcu(&ctx);
-		rcu_read_unlock();
 		if (!IS_ERR(skey_ref)) {
 			key_ref_put(skey_ref);
 			key_ref = make_key_ref(key, 1);
@@ -192,12 +198,10 @@ static int proc_keys_show(struct seq_file *m, void *v)
 	/* check whether the current task is allowed to view the key */
 	rc = key_task_permission(key_ref, ctx.cred, KEY_NEED_VIEW);
 	if (rc < 0)
-		return 0;
+		goto out;
 
 	now = ktime_get_real_seconds();
 
-	rcu_read_lock();
-
 	/* come up with a suitable timeout value */
 	expiry = READ_ONCE(key->expiry);
 	if (expiry == 0) {
@@ -236,7 +240,7 @@ static int proc_keys_show(struct seq_file *m, void *v)
 		   showflag(flags, 'i', KEY_FLAG_INVALIDATED),
 		   refcount_read(&key->usage),
 		   xbuf,
-		   key->perm,
+		   key_acl_to_perm(acl),
 		   from_kuid_munged(seq_user_ns(m), key->uid),
 		   from_kgid_munged(seq_user_ns(m), key->gid),
 		   key->type->name);
@@ -247,7 +251,7 @@ static int proc_keys_show(struct seq_file *m, void *v)
 		key->type->describe(key, m);
 	seq_putc(m, '\n');
 
-	rcu_read_unlock();
+out:
 	return 0;
 }
 
diff --git a/security/keys/process_keys.c b/security/keys/process_keys.c
index f74d64215942..ddda8544630d 100644
--- a/security/keys/process_keys.c
+++ b/security/keys/process_keys.c
@@ -36,6 +36,47 @@ struct key_user root_key_user = {
 	.uid		= GLOBAL_ROOT_UID,
 };
 
+static struct key_acl user_reg_keyring_acl = {
+	.usage	= REFCOUNT_INIT(1),
+	.possessor_viewable = true,
+	.nr_ace	= 2,
+	.aces = {
+		KEY_POSSESSOR_ACE(KEY_ACE_WRITE | KEY_ACE_SEARCH),
+		KEY_OWNER_ACE(KEY_ACE_VIEW | KEY_ACE_READ),
+	}
+};
+
+static struct key_acl user_keyring_acl = {
+	.usage	= REFCOUNT_INIT(1),
+	.possessor_viewable = true,
+	.nr_ace	= 2,
+	.aces = {
+		KEY_POSSESSOR_ACE(KEY_ACE_VIEW | KEY_ACE_READ | KEY_ACE_WRITE |
+				  KEY_ACE_SEARCH | KEY_ACE_LINK),
+		KEY_OWNER_ACE(KEY_ACE__PERMS & ~(KEY_ACE_JOIN | KEY_ACE_SET_SECURITY)),
+	}
+};
+
+static struct key_acl session_keyring_acl = {
+	.usage	= REFCOUNT_INIT(1),
+	.possessor_viewable = true,
+	.nr_ace	= 2,
+	.aces = {
+		KEY_POSSESSOR_ACE(KEY_ACE__PERMS & ~KEY_ACE_JOIN),
+		KEY_OWNER_ACE(KEY_ACE_VIEW | KEY_ACE_READ),
+	}
+};
+
+static struct key_acl thread_and_process_keyring_acl = {
+	.usage	= REFCOUNT_INIT(1),
+	.possessor_viewable = true,
+	.nr_ace	= 2,
+	.aces = {
+		KEY_POSSESSOR_ACE(KEY_ACE__PERMS & ~(KEY_ACE_JOIN | KEY_ACE_SET_SECURITY)),
+		KEY_OWNER_ACE(KEY_ACE_VIEW),
+	}
+};
+
 /*
  * Get or create a user register keyring.
  */
@@ -55,11 +96,8 @@ static struct key *get_user_register(struct user_namespace *user_ns)
 	if (!reg_keyring) {
 		reg_keyring = keyring_alloc(".user_reg",
 					    user_ns->owner, INVALID_GID,
-					    &init_cred,
-					    KEY_POS_WRITE | KEY_POS_SEARCH |
-					    KEY_USR_VIEW | KEY_USR_READ,
-					    0,
-					    NULL, NULL);
+					    &init_cred, &user_reg_keyring_acl,
+					    0, NULL, NULL);
 		if (!IS_ERR(reg_keyring))
 			smp_store_release(&user_ns->user_keyring_register,
 					  reg_keyring);
@@ -81,14 +119,11 @@ int look_up_user_keyrings(struct key **_user_keyring,
 	const struct cred *cred = current_cred();
 	struct user_namespace *user_ns = current_user_ns();
 	struct key *reg_keyring, *uid_keyring, *session_keyring;
-	key_perm_t user_keyring_perm;
 	key_ref_t uid_keyring_r, session_keyring_r;
 	uid_t uid = from_kuid(user_ns, cred->user->uid);
 	char buf[20];
 	int ret;
 
-	user_keyring_perm = (KEY_POS_ALL & ~KEY_POS_SETATTR) | KEY_USR_ALL;
-
 	kenter("%u", uid);
 
 	reg_keyring = get_user_register(user_ns);
@@ -108,7 +143,7 @@ int look_up_user_keyrings(struct key **_user_keyring,
 	kdebug("_uid %p", uid_keyring_r);
 	if (uid_keyring_r == ERR_PTR(-EAGAIN)) {
 		uid_keyring = keyring_alloc(buf, cred->user->uid, INVALID_GID,
-					    cred, user_keyring_perm,
+					    cred, &user_keyring_acl,
 					    KEY_ALLOC_UID_KEYRING |
 					    KEY_ALLOC_IN_QUOTA,
 					    NULL, reg_keyring);
@@ -130,7 +165,7 @@ int look_up_user_keyrings(struct key **_user_keyring,
 	kdebug("_uid_ses %p", session_keyring_r);
 	if (session_keyring_r == ERR_PTR(-EAGAIN)) {
 		session_keyring = keyring_alloc(buf, cred->user->uid, INVALID_GID,
-						cred, user_keyring_perm,
+						cred, &user_keyring_acl,
 						KEY_ALLOC_UID_KEYRING |
 						KEY_ALLOC_IN_QUOTA,
 						NULL, NULL);
@@ -230,7 +265,7 @@ int install_thread_keyring_to_cred(struct cred *new)
 		return 0;
 
 	keyring = keyring_alloc("_tid", new->uid, new->gid, new,
-				KEY_POS_ALL | KEY_USR_VIEW,
+				&thread_and_process_keyring_acl,
 				KEY_ALLOC_QUOTA_OVERRUN,
 				NULL, NULL);
 	if (IS_ERR(keyring))
@@ -277,7 +312,7 @@ int install_process_keyring_to_cred(struct cred *new)
 		return 0;
 
 	keyring = keyring_alloc("_pid", new->uid, new->gid, new,
-				KEY_POS_ALL | KEY_USR_VIEW,
+				&thread_and_process_keyring_acl,
 				KEY_ALLOC_QUOTA_OVERRUN,
 				NULL, NULL);
 	if (IS_ERR(keyring))
@@ -332,8 +367,7 @@ int install_session_keyring_to_cred(struct cred *cred, struct key *keyring)
 			flags = KEY_ALLOC_IN_QUOTA;
 
 		keyring = keyring_alloc("_ses", cred->uid, cred->gid, cred,
-					KEY_POS_ALL | KEY_USR_VIEW | KEY_USR_READ,
-					flags, NULL, NULL);
+					&session_keyring_acl, flags, NULL, NULL);
 		if (IS_ERR(keyring))
 			return PTR_ERR(keyring);
 	} else {
@@ -613,7 +647,7 @@ bool lookup_user_key_possessed(const struct key *key,
  * returned key reference.
  */
 key_ref_t lookup_user_key(key_serial_t id, unsigned long lflags,
-			  key_perm_t perm)
+			  unsigned int desired_perm)
 {
 	struct keyring_search_context ctx = {
 		.match_data.cmp		= lookup_user_key_possessed,
@@ -788,12 +822,12 @@ try_again:
 		case -ERESTARTSYS:
 			goto invalid_key;
 		default:
-			if (perm)
+			if (desired_perm)
 				goto invalid_key;
 		case 0:
 			break;
 		}
-	} else if (perm) {
+	} else if (desired_perm) {
 		ret = key_validate(key);
 		if (ret < 0)
 			goto invalid_key;
@@ -805,9 +839,11 @@ try_again:
 		goto invalid_key;
 
 	/* check the permissions */
-	ret = key_task_permission(key_ref, ctx.cred, perm);
-	if (ret < 0)
-		goto invalid_key;
+	if (desired_perm) {
+		ret = key_task_permission(key_ref, ctx.cred, desired_perm);
+		if (ret < 0)
+			goto invalid_key;
+	}
 
 	key->last_used_at = ktime_get_real_seconds();
 
@@ -872,13 +908,13 @@ long join_session_keyring(const char *name)
 	if (PTR_ERR(keyring) == -ENOKEY) {
 		/* not found - try and create a new one */
 		keyring = keyring_alloc(
-			name, old->uid, old->gid, old,
-			KEY_POS_ALL | KEY_USR_VIEW | KEY_USR_READ | KEY_USR_LINK,
+			name, old->uid, old->gid, old, &joinable_keyring_acl,
 			KEY_ALLOC_IN_QUOTA, NULL, NULL);
 		if (IS_ERR(keyring)) {
 			ret = PTR_ERR(keyring);
 			goto error2;
 		}
+		goto no_perm_test;
 	} else if (IS_ERR(keyring)) {
 		ret = PTR_ERR(keyring);
 		goto error2;
@@ -887,6 +923,12 @@ long join_session_keyring(const char *name)
 		goto error3;
 	}
 
+	ret = key_task_permission(make_key_ref(keyring, false), old,
+				  KEY_NEED_JOIN);
+	if (ret < 0)
+		goto error3;
+
+no_perm_test:
 	/* we've got a keyring - now to install it */
 	ret = install_session_keyring_to_cred(new, keyring);
 	if (ret < 0)
diff --git a/security/keys/request_key.c b/security/keys/request_key.c
index aa589d3c90e2..64af697a9126 100644
--- a/security/keys/request_key.c
+++ b/security/keys/request_key.c
@@ -139,8 +139,7 @@ static int call_sbin_request_key(struct key *authkey, void *aux)
 
 	cred = get_current_cred();
 	keyring = keyring_alloc(desc, cred->fsuid, cred->fsgid, cred,
-				KEY_POS_ALL | KEY_USR_VIEW | KEY_USR_READ,
-				KEY_ALLOC_QUOTA_OVERRUN, NULL, NULL);
+				NULL, KEY_ALLOC_QUOTA_OVERRUN, NULL, NULL);
 	put_cred(cred);
 	if (IS_ERR(keyring)) {
 		ret = PTR_ERR(keyring);
@@ -371,11 +370,11 @@ static int construct_alloc_key(struct keyring_search_context *ctx,
 			       struct key *dest_keyring,
 			       unsigned long flags,
 			       struct key_user *user,
+			       struct key_acl *acl,
 			       struct key **_key)
 {
 	struct assoc_array_edit *edit = NULL;
 	struct key *key;
-	key_perm_t perm;
 	key_ref_t key_ref;
 	int ret;
 
@@ -385,17 +384,9 @@ static int construct_alloc_key(struct keyring_search_context *ctx,
 	*_key = NULL;
 	mutex_lock(&user->cons_lock);
 
-	perm = KEY_POS_VIEW | KEY_POS_SEARCH | KEY_POS_LINK | KEY_POS_SETATTR;
-	perm |= KEY_USR_VIEW;
-	if (ctx->index_key.type->read)
-		perm |= KEY_POS_READ;
-	if (ctx->index_key.type == &key_type_keyring ||
-	    ctx->index_key.type->update)
-		perm |= KEY_POS_WRITE;
-
 	key = key_alloc(ctx->index_key.type, ctx->index_key.description,
 			ctx->cred->fsuid, ctx->cred->fsgid, ctx->cred,
-			perm, flags, NULL);
+			acl, flags, NULL);
 	if (IS_ERR(key))
 		goto alloc_failed;
 
@@ -478,6 +469,7 @@ static struct key *construct_key_and_link(struct keyring_search_context *ctx,
 					  const char *callout_info,
 					  size_t callout_len,
 					  void *aux,
+					  struct key_acl *acl,
 					  struct key *dest_keyring,
 					  unsigned long flags)
 {
@@ -500,7 +492,7 @@ static struct key *construct_key_and_link(struct keyring_search_context *ctx,
 		goto error_put_dest_keyring;
 	}
 
-	ret = construct_alloc_key(ctx, dest_keyring, flags, user, &key);
+	ret = construct_alloc_key(ctx, dest_keyring, flags, user, acl, &key);
 	key_user_put(user);
 
 	if (ret == 0) {
@@ -538,6 +530,7 @@ error:
  * @callout_info: The data to pass to the instantiation upcall (or NULL).
  * @callout_len: The length of callout_info.
  * @aux: Auxiliary data for the upcall.
+ * @acl: The ACL to attach if a new key is created.
  * @dest_keyring: Where to cache the key.
  * @flags: Flags to key_alloc().
  *
@@ -565,6 +558,7 @@ struct key *request_key_and_link(struct key_type *type,
 				 const void *callout_info,
 				 size_t callout_len,
 				 void *aux,
+				 struct key_acl *acl,
 				 struct key *dest_keyring,
 				 unsigned long flags)
 {
@@ -639,7 +633,7 @@ struct key *request_key_and_link(struct key_type *type,
 			goto error_free;
 
 		key = construct_key_and_link(&ctx, callout_info, callout_len,
-					     aux, dest_keyring, flags);
+					     aux, acl, dest_keyring, flags);
 	}
 
 error_free:
@@ -682,6 +676,7 @@ EXPORT_SYMBOL(wait_for_key_construction);
  * @description: The searchable description of the key.
  * @domain_tag: The domain in which the key operates.
  * @callout_info: The data to pass to the instantiation upcall (or NULL).
+ * @acl: The ACL to attach if a new key is created.
  *
  * As for request_key_and_link() except that it does not add the returned key
  * to a keyring if found, new keys are always allocated in the user's quota,
@@ -694,7 +689,8 @@ EXPORT_SYMBOL(wait_for_key_construction);
 struct key *request_key_tag(struct key_type *type,
 			    const char *description,
 			    struct key_tag *domain_tag,
-			    const char *callout_info)
+			    const char *callout_info,
+			    struct key_acl *acl)
 {
 	struct key *key;
 	size_t callout_len = 0;
@@ -704,7 +700,7 @@ struct key *request_key_tag(struct key_type *type,
 		callout_len = strlen(callout_info);
 	key = request_key_and_link(type, description, domain_tag,
 				   callout_info, callout_len,
-				   NULL, NULL, KEY_ALLOC_IN_QUOTA);
+				   NULL, acl, NULL, KEY_ALLOC_IN_QUOTA);
 	if (!IS_ERR(key)) {
 		ret = wait_for_key_construction(key, false);
 		if (ret < 0) {
@@ -724,6 +720,7 @@ EXPORT_SYMBOL(request_key_tag);
  * @callout_info: The data to pass to the instantiation upcall (or NULL).
  * @callout_len: The length of callout_info.
  * @aux: Auxiliary data for the upcall.
+ * @acl: The ACL to attach if a new key is created.
  *
  * As for request_key_and_link() except that it does not add the returned key
  * to a keyring if found and new keys are always allocated in the user's quota.
@@ -736,14 +733,15 @@ struct key *request_key_with_auxdata(struct key_type *type,
 				     struct key_tag *domain_tag,
 				     const void *callout_info,
 				     size_t callout_len,
-				     void *aux)
+				     void *aux,
+				     struct key_acl *acl)
 {
 	struct key *key;
 	int ret;
 
 	key = request_key_and_link(type, description, domain_tag,
 				   callout_info, callout_len,
-				   aux, NULL, KEY_ALLOC_IN_QUOTA);
+				   aux, acl, NULL, KEY_ALLOC_IN_QUOTA);
 	if (!IS_ERR(key)) {
 		ret = wait_for_key_construction(key, false);
 		if (ret < 0) {
diff --git a/security/keys/request_key_auth.c b/security/keys/request_key_auth.c
index f613987e8a63..d9146606f54e 100644
--- a/security/keys/request_key_auth.c
+++ b/security/keys/request_key_auth.c
@@ -28,6 +28,17 @@ static void request_key_auth_revoke(struct key *);
 static void request_key_auth_destroy(struct key *);
 static long request_key_auth_read(const struct key *, char __user *, size_t);
 
+static struct key_acl request_key_auth_acl = {
+	.usage	= REFCOUNT_INIT(1),
+	.nr_ace	= 2,
+	.possessor_viewable = true,
+	.aces = {
+		KEY_POSSESSOR_ACE(KEY_ACE_VIEW | KEY_ACE_READ | KEY_ACE_SEARCH |
+				  KEY_ACE_LINK),
+		KEY_OWNER_ACE(KEY_ACE_VIEW),
+	}
+};
+
 /*
  * The request-key authorisation key type definition.
  */
@@ -214,8 +225,8 @@ struct key *request_key_auth_new(struct key *target, const char *op,
 
 	authkey = key_alloc(&key_type_request_key_auth, desc,
 			    cred->fsuid, cred->fsgid, cred,
-			    KEY_POS_VIEW | KEY_POS_READ | KEY_POS_SEARCH | KEY_POS_LINK |
-			    KEY_USR_VIEW, KEY_ALLOC_NOT_IN_QUOTA, NULL);
+			    &request_key_auth_acl,
+			    KEY_ALLOC_NOT_IN_QUOTA, NULL);
 	if (IS_ERR(authkey)) {
 		ret = PTR_ERR(authkey);
 		goto error_free_rka;
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index c61787b15f27..b828401dcb70 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -6481,6 +6481,7 @@ static int selinux_key_permission(key_ref_t key_ref,
 {
 	struct key *key;
 	struct key_security_struct *ksec;
+	unsigned oldstyle_perm;
 	u32 sid;
 
 	/* if no specific permissions are requested, we skip the
@@ -6489,13 +6490,26 @@ static int selinux_key_permission(key_ref_t key_ref,
 	if (perm == 0)
 		return 0;
 
+	oldstyle_perm = perm & (KEY_NEED_VIEW | KEY_NEED_READ | KEY_NEED_WRITE |
+				KEY_NEED_SEARCH | KEY_NEED_LINK);
+	if (perm & KEY_NEED_SETSEC)
+		oldstyle_perm |= OLD_KEY_NEED_SETATTR;
+	if (perm & KEY_NEED_INVAL)
+		oldstyle_perm |= KEY_NEED_SEARCH;
+	if (perm & KEY_NEED_REVOKE && !(perm & OLD_KEY_NEED_SETATTR))
+		oldstyle_perm |= KEY_NEED_WRITE;
+	if (perm & KEY_NEED_JOIN)
+		oldstyle_perm |= KEY_NEED_SEARCH;
+	if (perm & KEY_NEED_CLEAR)
+		oldstyle_perm |= KEY_NEED_WRITE;
+
 	sid = cred_sid(cred);
 
 	key = key_ref_to_ptr(key_ref);
 	ksec = key->security;
 
 	return avc_has_perm(&selinux_state,
-			    sid, ksec->sid, SECCLASS_KEY, perm, NULL);
+			    sid, ksec->sid, SECCLASS_KEY, oldstyle_perm, NULL);
 }
 
 static int selinux_key_getsecurity(struct key *key, char **_buffer)
diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c
index 0de725f88bed..6095dc3565a5 100644
--- a/security/smack/smack_lsm.c
+++ b/security/smack/smack_lsm.c
@@ -4285,7 +4285,8 @@ static int smack_key_permission(key_ref_t key_ref,
 #endif
 	if (perm & (KEY_NEED_READ | KEY_NEED_SEARCH | KEY_NEED_VIEW))
 		request |= MAY_READ;
-	if (perm & (KEY_NEED_WRITE | KEY_NEED_LINK | KEY_NEED_SETATTR))
+	if (perm & (KEY_NEED_WRITE | KEY_NEED_LINK | KEY_NEED_SETSEC |
+		    KEY_NEED_INVAL | KEY_NEED_REVOKE | KEY_NEED_CLEAR))
 		request |= MAY_WRITE;
 	rc = smk_access(tkp, keyp->security, request, &ad);
 	rc = smk_bu_note("key access", tkp, keyp->security, request, rc);
-- 
cgit v1.2.3


From 0d01da6afc5402f60325c5da31b22f7d56689b49 Mon Sep 17 00:00:00 2001
From: Stanislav Fomichev <sdf@google.com>
Date: Thu, 27 Jun 2019 13:38:47 -0700
Subject: bpf: implement getsockopt and setsockopt hooks

Implement new BPF_PROG_TYPE_CGROUP_SOCKOPT program type and
BPF_CGROUP_{G,S}ETSOCKOPT cgroup hooks.

BPF_CGROUP_SETSOCKOPT can modify user setsockopt arguments before
passing them down to the kernel or bypass kernel completely.
BPF_CGROUP_GETSOCKOPT can inspect/modify getsockopt arguments that
kernel returns.
Both hooks reuse existing PTR_TO_PACKET{,_END} infrastructure.

The buffer memory is pre-allocated (because I don't think there is
a precedent for working with __user memory from bpf). This might be
slow to do for each {s,g}etsockopt call, that's why I've added
__cgroup_bpf_prog_array_is_empty that exits early if there is nothing
attached to a cgroup. Note, however, that there is a race between
__cgroup_bpf_prog_array_is_empty and BPF_PROG_RUN_ARRAY where cgroup
program layout might have changed; this should not be a problem
because in general there is a race between multiple calls to
{s,g}etsockopt and user adding/removing bpf progs from a cgroup.

The return code of the BPF program is handled as follows:
* 0: EPERM
* 1: success, continue with next BPF program in the cgroup chain

v9:
* allow overwriting setsockopt arguments (Alexei Starovoitov):
  * use set_fs (same as kernel_setsockopt)
  * buffer is always kzalloc'd (no small on-stack buffer)

v8:
* use s32 for optlen (Andrii Nakryiko)

v7:
* return only 0 or 1 (Alexei Starovoitov)
* always run all progs (Alexei Starovoitov)
* use optval=0 as kernel bypass in setsockopt (Alexei Starovoitov)
  (decided to use optval=-1 instead, optval=0 might be a valid input)
* call getsockopt hook after kernel handlers (Alexei Starovoitov)

v6:
* rework cgroup chaining; stop as soon as bpf program returns
  0 or 2; see patch with the documentation for the details
* drop Andrii's and Martin's Acked-by (not sure they are comfortable
  with the new state of things)

v5:
* skip copy_to_user() and put_user() when ret == 0 (Martin Lau)

v4:
* don't export bpf_sk_fullsock helper (Martin Lau)
* size != sizeof(__u64) for uapi pointers (Martin Lau)
* offsetof instead of bpf_ctx_range when checking ctx access (Martin Lau)

v3:
* typos in BPF_PROG_CGROUP_SOCKOPT_RUN_ARRAY comments (Andrii Nakryiko)
* reverse christmas tree in BPF_PROG_CGROUP_SOCKOPT_RUN_ARRAY (Andrii
  Nakryiko)
* use __bpf_md_ptr instead of __u32 for optval{,_end} (Martin Lau)
* use BPF_FIELD_SIZEOF() for consistency (Martin Lau)
* new CG_SOCKOPT_ACCESS macro to wrap repeated parts

v2:
* moved bpf_sockopt_kern fields around to remove a hole (Martin Lau)
* aligned bpf_sockopt_kern->buf to 8 bytes (Martin Lau)
* bpf_prog_array_is_empty instead of bpf_prog_array_length (Martin Lau)
* added [0,2] return code check to verifier (Martin Lau)
* dropped unused buf[64] from the stack (Martin Lau)
* use PTR_TO_SOCKET for bpf_sockopt->sk (Martin Lau)
* dropped bpf_target_off from ctx rewrites (Martin Lau)
* use return code for kernel bypass (Martin Lau & Andrii Nakryiko)

Cc: Andrii Nakryiko <andriin@fb.com>
Cc: Martin Lau <kafai@fb.com>
Signed-off-by: Stanislav Fomichev <sdf@google.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf-cgroup.h |  45 ++++++
 include/linux/bpf.h        |   2 +
 include/linux/bpf_types.h  |   1 +
 include/linux/filter.h     |  10 ++
 include/uapi/linux/bpf.h   |  14 ++
 kernel/bpf/cgroup.c        | 333 +++++++++++++++++++++++++++++++++++++++++++++
 kernel/bpf/core.c          |   9 ++
 kernel/bpf/syscall.c       |  19 +++
 kernel/bpf/verifier.c      |   8 ++
 net/core/filter.c          |   2 +-
 net/socket.c               |  30 ++++
 11 files changed, 472 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
index bd79ae32909a..169fd25f6bc2 100644
--- a/include/linux/bpf-cgroup.h
+++ b/include/linux/bpf-cgroup.h
@@ -124,6 +124,14 @@ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head,
 				   loff_t *ppos, void **new_buf,
 				   enum bpf_attach_type type);
 
+int __cgroup_bpf_run_filter_setsockopt(struct sock *sock, int *level,
+				       int *optname, char __user *optval,
+				       int *optlen, char **kernel_optval);
+int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,
+				       int optname, char __user *optval,
+				       int __user *optlen, int max_optlen,
+				       int retval);
+
 static inline enum bpf_cgroup_storage_type cgroup_storage_type(
 	struct bpf_map *map)
 {
@@ -286,6 +294,38 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key,
 	__ret;								       \
 })
 
+#define BPF_CGROUP_RUN_PROG_SETSOCKOPT(sock, level, optname, optval, optlen,   \
+				       kernel_optval)			       \
+({									       \
+	int __ret = 0;							       \
+	if (cgroup_bpf_enabled)						       \
+		__ret = __cgroup_bpf_run_filter_setsockopt(sock, level,	       \
+							   optname, optval,    \
+							   optlen,	       \
+							   kernel_optval);     \
+	__ret;								       \
+})
+
+#define BPF_CGROUP_GETSOCKOPT_MAX_OPTLEN(optlen)			       \
+({									       \
+	int __ret = 0;							       \
+	if (cgroup_bpf_enabled)						       \
+		get_user(__ret, optlen);				       \
+	__ret;								       \
+})
+
+#define BPF_CGROUP_RUN_PROG_GETSOCKOPT(sock, level, optname, optval, optlen,   \
+				       max_optlen, retval)		       \
+({									       \
+	int __ret = retval;						       \
+	if (cgroup_bpf_enabled)						       \
+		__ret = __cgroup_bpf_run_filter_getsockopt(sock, level,	       \
+							   optname, optval,    \
+							   optlen, max_optlen, \
+							   retval);	       \
+	__ret;								       \
+})
+
 int cgroup_bpf_prog_attach(const union bpf_attr *attr,
 			   enum bpf_prog_type ptype, struct bpf_prog *prog);
 int cgroup_bpf_prog_detach(const union bpf_attr *attr,
@@ -357,6 +397,11 @@ static inline int bpf_percpu_cgroup_storage_update(struct bpf_map *map,
 #define BPF_CGROUP_RUN_PROG_SOCK_OPS(sock_ops) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_DEVICE_CGROUP(type,major,minor,access) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_SYSCTL(head,table,write,buf,count,pos,nbuf) ({ 0; })
+#define BPF_CGROUP_GETSOCKOPT_MAX_OPTLEN(optlen) ({ 0; })
+#define BPF_CGROUP_RUN_PROG_GETSOCKOPT(sock, level, optname, optval, \
+				       optlen, max_optlen, retval) ({ retval; })
+#define BPF_CGROUP_RUN_PROG_SETSOCKOPT(sock, level, optname, optval, optlen, \
+				       kernel_optval) ({ 0; })
 
 #define for_each_cgroup_storage_type(stype) for (; false; )
 
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index a62e7889b0b6..18f4cc2c6acd 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -518,6 +518,7 @@ struct bpf_prog_array {
 struct bpf_prog_array *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags);
 void bpf_prog_array_free(struct bpf_prog_array *progs);
 int bpf_prog_array_length(struct bpf_prog_array *progs);
+bool bpf_prog_array_is_empty(struct bpf_prog_array *array);
 int bpf_prog_array_copy_to_user(struct bpf_prog_array *progs,
 				__u32 __user *prog_ids, u32 cnt);
 
@@ -1051,6 +1052,7 @@ extern const struct bpf_func_proto bpf_spin_unlock_proto;
 extern const struct bpf_func_proto bpf_get_local_storage_proto;
 extern const struct bpf_func_proto bpf_strtol_proto;
 extern const struct bpf_func_proto bpf_strtoul_proto;
+extern const struct bpf_func_proto bpf_tcp_sock_proto;
 
 /* Shared helpers among cBPF and eBPF. */
 void bpf_user_rnd_init_once(void);
diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
index 5a9975678d6f..eec5aeeeaf92 100644
--- a/include/linux/bpf_types.h
+++ b/include/linux/bpf_types.h
@@ -30,6 +30,7 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE, raw_tracepoint_writable)
 #ifdef CONFIG_CGROUP_BPF
 BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_DEVICE, cg_dev)
 BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SYSCTL, cg_sysctl)
+BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SOCKOPT, cg_sockopt)
 #endif
 #ifdef CONFIG_BPF_LIRC_MODE2
 BPF_PROG_TYPE(BPF_PROG_TYPE_LIRC_MODE2, lirc_mode2)
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 43b45d6db36d..340f7d648974 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -1199,4 +1199,14 @@ struct bpf_sysctl_kern {
 	u64 tmp_reg;
 };
 
+struct bpf_sockopt_kern {
+	struct sock	*sk;
+	u8		*optval;
+	u8		*optval_end;
+	s32		level;
+	s32		optname;
+	s32		optlen;
+	s32		retval;
+};
+
 #endif /* __LINUX_FILTER_H__ */
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index b077507efa3f..a396b516a2b2 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -170,6 +170,7 @@ enum bpf_prog_type {
 	BPF_PROG_TYPE_FLOW_DISSECTOR,
 	BPF_PROG_TYPE_CGROUP_SYSCTL,
 	BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE,
+	BPF_PROG_TYPE_CGROUP_SOCKOPT,
 };
 
 enum bpf_attach_type {
@@ -194,6 +195,8 @@ enum bpf_attach_type {
 	BPF_CGROUP_SYSCTL,
 	BPF_CGROUP_UDP4_RECVMSG,
 	BPF_CGROUP_UDP6_RECVMSG,
+	BPF_CGROUP_GETSOCKOPT,
+	BPF_CGROUP_SETSOCKOPT,
 	__MAX_BPF_ATTACH_TYPE
 };
 
@@ -3541,4 +3544,15 @@ struct bpf_sysctl {
 				 */
 };
 
+struct bpf_sockopt {
+	__bpf_md_ptr(struct bpf_sock *, sk);
+	__bpf_md_ptr(void *, optval);
+	__bpf_md_ptr(void *, optval_end);
+
+	__s32	level;
+	__s32	optname;
+	__s32	optlen;
+	__s32	retval;
+};
+
 #endif /* _UAPI__LINUX_BPF_H__ */
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index 077ed3a19848..76fa0076f20d 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -15,6 +15,7 @@
 #include <linux/bpf.h>
 #include <linux/bpf-cgroup.h>
 #include <net/sock.h>
+#include <net/bpf_sk_storage.h>
 
 #include "../cgroup/cgroup-internal.h"
 
@@ -938,6 +939,188 @@ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head,
 }
 EXPORT_SYMBOL(__cgroup_bpf_run_filter_sysctl);
 
+static bool __cgroup_bpf_prog_array_is_empty(struct cgroup *cgrp,
+					     enum bpf_attach_type attach_type)
+{
+	struct bpf_prog_array *prog_array;
+	bool empty;
+
+	rcu_read_lock();
+	prog_array = rcu_dereference(cgrp->bpf.effective[attach_type]);
+	empty = bpf_prog_array_is_empty(prog_array);
+	rcu_read_unlock();
+
+	return empty;
+}
+
+static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen)
+{
+	if (unlikely(max_optlen > PAGE_SIZE) || max_optlen < 0)
+		return -EINVAL;
+
+	ctx->optval = kzalloc(max_optlen, GFP_USER);
+	if (!ctx->optval)
+		return -ENOMEM;
+
+	ctx->optval_end = ctx->optval + max_optlen;
+	ctx->optlen = max_optlen;
+
+	return 0;
+}
+
+static void sockopt_free_buf(struct bpf_sockopt_kern *ctx)
+{
+	kfree(ctx->optval);
+}
+
+int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,
+				       int *optname, char __user *optval,
+				       int *optlen, char **kernel_optval)
+{
+	struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
+	struct bpf_sockopt_kern ctx = {
+		.sk = sk,
+		.level = *level,
+		.optname = *optname,
+	};
+	int ret;
+
+	/* Opportunistic check to see whether we have any BPF program
+	 * attached to the hook so we don't waste time allocating
+	 * memory and locking the socket.
+	 */
+	if (!cgroup_bpf_enabled ||
+	    __cgroup_bpf_prog_array_is_empty(cgrp, BPF_CGROUP_SETSOCKOPT))
+		return 0;
+
+	ret = sockopt_alloc_buf(&ctx, *optlen);
+	if (ret)
+		return ret;
+
+	if (copy_from_user(ctx.optval, optval, *optlen) != 0) {
+		ret = -EFAULT;
+		goto out;
+	}
+
+	lock_sock(sk);
+	ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[BPF_CGROUP_SETSOCKOPT],
+				 &ctx, BPF_PROG_RUN);
+	release_sock(sk);
+
+	if (!ret) {
+		ret = -EPERM;
+		goto out;
+	}
+
+	if (ctx.optlen == -1) {
+		/* optlen set to -1, bypass kernel */
+		ret = 1;
+	} else if (ctx.optlen > *optlen || ctx.optlen < -1) {
+		/* optlen is out of bounds */
+		ret = -EFAULT;
+	} else {
+		/* optlen within bounds, run kernel handler */
+		ret = 0;
+
+		/* export any potential modifications */
+		*level = ctx.level;
+		*optname = ctx.optname;
+		*optlen = ctx.optlen;
+		*kernel_optval = ctx.optval;
+	}
+
+out:
+	if (ret)
+		sockopt_free_buf(&ctx);
+	return ret;
+}
+EXPORT_SYMBOL(__cgroup_bpf_run_filter_setsockopt);
+
+int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,
+				       int optname, char __user *optval,
+				       int __user *optlen, int max_optlen,
+				       int retval)
+{
+	struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
+	struct bpf_sockopt_kern ctx = {
+		.sk = sk,
+		.level = level,
+		.optname = optname,
+		.retval = retval,
+	};
+	int ret;
+
+	/* Opportunistic check to see whether we have any BPF program
+	 * attached to the hook so we don't waste time allocating
+	 * memory and locking the socket.
+	 */
+	if (!cgroup_bpf_enabled ||
+	    __cgroup_bpf_prog_array_is_empty(cgrp, BPF_CGROUP_GETSOCKOPT))
+		return retval;
+
+	ret = sockopt_alloc_buf(&ctx, max_optlen);
+	if (ret)
+		return ret;
+
+	if (!retval) {
+		/* If kernel getsockopt finished successfully,
+		 * copy whatever was returned to the user back
+		 * into our temporary buffer. Set optlen to the
+		 * one that kernel returned as well to let
+		 * BPF programs inspect the value.
+		 */
+
+		if (get_user(ctx.optlen, optlen)) {
+			ret = -EFAULT;
+			goto out;
+		}
+
+		if (ctx.optlen > max_optlen)
+			ctx.optlen = max_optlen;
+
+		if (copy_from_user(ctx.optval, optval, ctx.optlen) != 0) {
+			ret = -EFAULT;
+			goto out;
+		}
+	}
+
+	lock_sock(sk);
+	ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[BPF_CGROUP_GETSOCKOPT],
+				 &ctx, BPF_PROG_RUN);
+	release_sock(sk);
+
+	if (!ret) {
+		ret = -EPERM;
+		goto out;
+	}
+
+	if (ctx.optlen > max_optlen) {
+		ret = -EFAULT;
+		goto out;
+	}
+
+	/* BPF programs only allowed to set retval to 0, not some
+	 * arbitrary value.
+	 */
+	if (ctx.retval != 0 && ctx.retval != retval) {
+		ret = -EFAULT;
+		goto out;
+	}
+
+	if (copy_to_user(optval, ctx.optval, ctx.optlen) ||
+	    put_user(ctx.optlen, optlen)) {
+		ret = -EFAULT;
+		goto out;
+	}
+
+	ret = ctx.retval;
+
+out:
+	sockopt_free_buf(&ctx);
+	return ret;
+}
+EXPORT_SYMBOL(__cgroup_bpf_run_filter_getsockopt);
+
 static ssize_t sysctl_cpy_dir(const struct ctl_dir *dir, char **bufp,
 			      size_t *lenp)
 {
@@ -1198,3 +1381,153 @@ const struct bpf_verifier_ops cg_sysctl_verifier_ops = {
 
 const struct bpf_prog_ops cg_sysctl_prog_ops = {
 };
+
+static const struct bpf_func_proto *
+cg_sockopt_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+{
+	switch (func_id) {
+	case BPF_FUNC_sk_storage_get:
+		return &bpf_sk_storage_get_proto;
+	case BPF_FUNC_sk_storage_delete:
+		return &bpf_sk_storage_delete_proto;
+#ifdef CONFIG_INET
+	case BPF_FUNC_tcp_sock:
+		return &bpf_tcp_sock_proto;
+#endif
+	default:
+		return cgroup_base_func_proto(func_id, prog);
+	}
+}
+
+static bool cg_sockopt_is_valid_access(int off, int size,
+				       enum bpf_access_type type,
+				       const struct bpf_prog *prog,
+				       struct bpf_insn_access_aux *info)
+{
+	const int size_default = sizeof(__u32);
+
+	if (off < 0 || off >= sizeof(struct bpf_sockopt))
+		return false;
+
+	if (off % size != 0)
+		return false;
+
+	if (type == BPF_WRITE) {
+		switch (off) {
+		case offsetof(struct bpf_sockopt, retval):
+			if (size != size_default)
+				return false;
+			return prog->expected_attach_type ==
+				BPF_CGROUP_GETSOCKOPT;
+		case offsetof(struct bpf_sockopt, optname):
+			/* fallthrough */
+		case offsetof(struct bpf_sockopt, level):
+			if (size != size_default)
+				return false;
+			return prog->expected_attach_type ==
+				BPF_CGROUP_SETSOCKOPT;
+		case offsetof(struct bpf_sockopt, optlen):
+			return size == size_default;
+		default:
+			return false;
+		}
+	}
+
+	switch (off) {
+	case offsetof(struct bpf_sockopt, sk):
+		if (size != sizeof(__u64))
+			return false;
+		info->reg_type = PTR_TO_SOCKET;
+		break;
+	case offsetof(struct bpf_sockopt, optval):
+		if (size != sizeof(__u64))
+			return false;
+		info->reg_type = PTR_TO_PACKET;
+		break;
+	case offsetof(struct bpf_sockopt, optval_end):
+		if (size != sizeof(__u64))
+			return false;
+		info->reg_type = PTR_TO_PACKET_END;
+		break;
+	case offsetof(struct bpf_sockopt, retval):
+		if (size != size_default)
+			return false;
+		return prog->expected_attach_type == BPF_CGROUP_GETSOCKOPT;
+	default:
+		if (size != size_default)
+			return false;
+		break;
+	}
+	return true;
+}
+
+#define CG_SOCKOPT_ACCESS_FIELD(T, F)					\
+	T(BPF_FIELD_SIZEOF(struct bpf_sockopt_kern, F),			\
+	  si->dst_reg, si->src_reg,					\
+	  offsetof(struct bpf_sockopt_kern, F))
+
+static u32 cg_sockopt_convert_ctx_access(enum bpf_access_type type,
+					 const struct bpf_insn *si,
+					 struct bpf_insn *insn_buf,
+					 struct bpf_prog *prog,
+					 u32 *target_size)
+{
+	struct bpf_insn *insn = insn_buf;
+
+	switch (si->off) {
+	case offsetof(struct bpf_sockopt, sk):
+		*insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, sk);
+		break;
+	case offsetof(struct bpf_sockopt, level):
+		if (type == BPF_WRITE)
+			*insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_STX_MEM, level);
+		else
+			*insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, level);
+		break;
+	case offsetof(struct bpf_sockopt, optname):
+		if (type == BPF_WRITE)
+			*insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_STX_MEM, optname);
+		else
+			*insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optname);
+		break;
+	case offsetof(struct bpf_sockopt, optlen):
+		if (type == BPF_WRITE)
+			*insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_STX_MEM, optlen);
+		else
+			*insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optlen);
+		break;
+	case offsetof(struct bpf_sockopt, retval):
+		if (type == BPF_WRITE)
+			*insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_STX_MEM, retval);
+		else
+			*insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, retval);
+		break;
+	case offsetof(struct bpf_sockopt, optval):
+		*insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optval);
+		break;
+	case offsetof(struct bpf_sockopt, optval_end):
+		*insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optval_end);
+		break;
+	}
+
+	return insn - insn_buf;
+}
+
+static int cg_sockopt_get_prologue(struct bpf_insn *insn_buf,
+				   bool direct_write,
+				   const struct bpf_prog *prog)
+{
+	/* Nothing to do for sockopt argument. The data is kzalloc'ated.
+	 */
+	return 0;
+}
+
+const struct bpf_verifier_ops cg_sockopt_verifier_ops = {
+	.get_func_proto		= cg_sockopt_func_proto,
+	.is_valid_access	= cg_sockopt_is_valid_access,
+	.convert_ctx_access	= cg_sockopt_convert_ctx_access,
+	.gen_prologue		= cg_sockopt_get_prologue,
+};
+
+const struct bpf_prog_ops cg_sockopt_prog_ops = {
+};
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 561ed07d3007..e2c1b43728da 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -1809,6 +1809,15 @@ int bpf_prog_array_length(struct bpf_prog_array *array)
 	return cnt;
 }
 
+bool bpf_prog_array_is_empty(struct bpf_prog_array *array)
+{
+	struct bpf_prog_array_item *item;
+
+	for (item = array->items; item->prog; item++)
+		if (item->prog != &dummy_bpf_prog.prog)
+			return false;
+	return true;
+}
 
 static bool bpf_prog_array_copy_core(struct bpf_prog_array *array,
 				     u32 *prog_ids,
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 7713cf39795a..b0f545e07425 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1590,6 +1590,14 @@ bpf_prog_load_check_attach_type(enum bpf_prog_type prog_type,
 		default:
 			return -EINVAL;
 		}
+	case BPF_PROG_TYPE_CGROUP_SOCKOPT:
+		switch (expected_attach_type) {
+		case BPF_CGROUP_SETSOCKOPT:
+		case BPF_CGROUP_GETSOCKOPT:
+			return 0;
+		default:
+			return -EINVAL;
+		}
 	default:
 		return 0;
 	}
@@ -1840,6 +1848,7 @@ static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog,
 	switch (prog->type) {
 	case BPF_PROG_TYPE_CGROUP_SOCK:
 	case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
+	case BPF_PROG_TYPE_CGROUP_SOCKOPT:
 		return attach_type == prog->expected_attach_type ? 0 : -EINVAL;
 	case BPF_PROG_TYPE_CGROUP_SKB:
 		return prog->enforce_expected_attach_type &&
@@ -1912,6 +1921,10 @@ static int bpf_prog_attach(const union bpf_attr *attr)
 	case BPF_CGROUP_SYSCTL:
 		ptype = BPF_PROG_TYPE_CGROUP_SYSCTL;
 		break;
+	case BPF_CGROUP_GETSOCKOPT:
+	case BPF_CGROUP_SETSOCKOPT:
+		ptype = BPF_PROG_TYPE_CGROUP_SOCKOPT;
+		break;
 	default:
 		return -EINVAL;
 	}
@@ -1995,6 +2008,10 @@ static int bpf_prog_detach(const union bpf_attr *attr)
 	case BPF_CGROUP_SYSCTL:
 		ptype = BPF_PROG_TYPE_CGROUP_SYSCTL;
 		break;
+	case BPF_CGROUP_GETSOCKOPT:
+	case BPF_CGROUP_SETSOCKOPT:
+		ptype = BPF_PROG_TYPE_CGROUP_SOCKOPT;
+		break;
 	default:
 		return -EINVAL;
 	}
@@ -2031,6 +2048,8 @@ static int bpf_prog_query(const union bpf_attr *attr,
 	case BPF_CGROUP_SOCK_OPS:
 	case BPF_CGROUP_DEVICE:
 	case BPF_CGROUP_SYSCTL:
+	case BPF_CGROUP_GETSOCKOPT:
+	case BPF_CGROUP_SETSOCKOPT:
 		break;
 	case BPF_LIRC_MODE2:
 		return lirc_prog_query(attr, uattr);
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 0e079b2298f8..6b5623d320f9 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -2215,6 +2215,13 @@ static bool may_access_direct_pkt_data(struct bpf_verifier_env *env,
 
 		env->seen_direct_write = true;
 		return true;
+
+	case BPF_PROG_TYPE_CGROUP_SOCKOPT:
+		if (t == BPF_WRITE)
+			env->seen_direct_write = true;
+
+		return true;
+
 	default:
 		return false;
 	}
@@ -6066,6 +6073,7 @@ static int check_return_code(struct bpf_verifier_env *env)
 	case BPF_PROG_TYPE_SOCK_OPS:
 	case BPF_PROG_TYPE_CGROUP_DEVICE:
 	case BPF_PROG_TYPE_CGROUP_SYSCTL:
+	case BPF_PROG_TYPE_CGROUP_SOCKOPT:
 		break;
 	default:
 		return 0;
diff --git a/net/core/filter.c b/net/core/filter.c
index 2014d76e0d2a..dc8534be12fc 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -5651,7 +5651,7 @@ BPF_CALL_1(bpf_tcp_sock, struct sock *, sk)
 	return (unsigned long)NULL;
 }
 
-static const struct bpf_func_proto bpf_tcp_sock_proto = {
+const struct bpf_func_proto bpf_tcp_sock_proto = {
 	.func		= bpf_tcp_sock,
 	.gpl_only	= false,
 	.ret_type	= RET_PTR_TO_TCP_SOCK_OR_NULL,
diff --git a/net/socket.c b/net/socket.c
index 963df5dbdd54..0ddfbfb761d9 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -2051,6 +2051,8 @@ SYSCALL_DEFINE4(recv, int, fd, void __user *, ubuf, size_t, size,
 static int __sys_setsockopt(int fd, int level, int optname,
 			    char __user *optval, int optlen)
 {
+	mm_segment_t oldfs = get_fs();
+	char *kernel_optval = NULL;
 	int err, fput_needed;
 	struct socket *sock;
 
@@ -2063,6 +2065,22 @@ static int __sys_setsockopt(int fd, int level, int optname,
 		if (err)
 			goto out_put;
 
+		err = BPF_CGROUP_RUN_PROG_SETSOCKOPT(sock->sk, &level,
+						     &optname, optval, &optlen,
+						     &kernel_optval);
+
+		if (err < 0) {
+			goto out_put;
+		} else if (err > 0) {
+			err = 0;
+			goto out_put;
+		}
+
+		if (kernel_optval) {
+			set_fs(KERNEL_DS);
+			optval = (char __user __force *)kernel_optval;
+		}
+
 		if (level == SOL_SOCKET)
 			err =
 			    sock_setsockopt(sock, level, optname, optval,
@@ -2071,6 +2089,11 @@ static int __sys_setsockopt(int fd, int level, int optname,
 			err =
 			    sock->ops->setsockopt(sock, level, optname, optval,
 						  optlen);
+
+		if (kernel_optval) {
+			set_fs(oldfs);
+			kfree(kernel_optval);
+		}
 out_put:
 		fput_light(sock->file, fput_needed);
 	}
@@ -2093,6 +2116,7 @@ static int __sys_getsockopt(int fd, int level, int optname,
 {
 	int err, fput_needed;
 	struct socket *sock;
+	int max_optlen;
 
 	sock = sockfd_lookup_light(fd, &err, &fput_needed);
 	if (sock != NULL) {
@@ -2100,6 +2124,8 @@ static int __sys_getsockopt(int fd, int level, int optname,
 		if (err)
 			goto out_put;
 
+		max_optlen = BPF_CGROUP_GETSOCKOPT_MAX_OPTLEN(optlen);
+
 		if (level == SOL_SOCKET)
 			err =
 			    sock_getsockopt(sock, level, optname, optval,
@@ -2108,6 +2134,10 @@ static int __sys_getsockopt(int fd, int level, int optname,
 			err =
 			    sock->ops->getsockopt(sock, level, optname, optval,
 						  optlen);
+
+		err = BPF_CGROUP_RUN_PROG_GETSOCKOPT(sock->sk, level, optname,
+						     optval, optlen,
+						     max_optlen, err);
 out_put:
 		fput_light(sock->file, fput_needed);
 	}
-- 
cgit v1.2.3


From 8d3e72a180b42c01ec00045e1bb8eb91175adafe Mon Sep 17 00:00:00 2001
From: Andreas Gruenbacher <agruenba@redhat.com>
Date: Thu, 27 Jun 2019 17:28:40 -0700
Subject: iomap: don't mark the inode dirty in iomap_write_end

Marking the inode dirty for each page copied into the page cache can be
very inefficient for file systems that use the VFS dirty inode tracking,
and is completely pointless for those that don't use the VFS dirty inode
tracking.  So instead, only set an iomap flag when changing the in-core
inode size, and open code the rest of __generic_write_end.

Partially based on code from Christoph Hellwig.

Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/gfs2/bmap.c        |  2 ++
 fs/iomap.c            | 15 ++++++++++++++-
 include/linux/iomap.h |  1 +
 3 files changed, 17 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index c78ccaf83ef8..8e8768685264 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -1179,6 +1179,8 @@ static int gfs2_iomap_end(struct inode *inode, loff_t pos, loff_t length,
 
 	if (ip->i_qadata && ip->i_qadata->qa_qd_num)
 		gfs2_quota_unlock(ip);
+	if (iomap->flags & IOMAP_F_SIZE_CHANGED)
+		mark_inode_dirty(inode);
 	gfs2_write_unlock(inode);
 
 out:
diff --git a/fs/iomap.c b/fs/iomap.c
index 23ef63fd1669..c4acf69b7196 100644
--- a/fs/iomap.c
+++ b/fs/iomap.c
@@ -773,6 +773,7 @@ iomap_write_end(struct inode *inode, loff_t pos, unsigned len,
 		unsigned copied, struct page *page, struct iomap *iomap)
 {
 	const struct iomap_page_ops *page_ops = iomap->page_ops;
+	loff_t old_size = inode->i_size;
 	int ret;
 
 	if (iomap->type == IOMAP_INLINE) {
@@ -784,7 +785,19 @@ iomap_write_end(struct inode *inode, loff_t pos, unsigned len,
 		ret = __iomap_write_end(inode, pos, len, copied, page, iomap);
 	}
 
-	__generic_write_end(inode, pos, ret, page);
+	/*
+	 * Update the in-memory inode size after copying the data into the page
+	 * cache.  It's up to the file system to write the updated size to disk,
+	 * preferably after I/O completion so that no stale data is exposed.
+	 */
+	if (pos + ret > old_size) {
+		i_size_write(inode, pos + ret);
+		iomap->flags |= IOMAP_F_SIZE_CHANGED;
+	}
+	unlock_page(page);
+
+	if (old_size < pos)
+		pagecache_isize_extended(inode, old_size, pos);
 	if (page_ops && page_ops->page_done)
 		page_ops->page_done(inode, pos, copied, page, iomap);
 	put_page(page);
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index 2103b94cb1bf..1df9ea187a9a 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -35,6 +35,7 @@ struct vm_fault;
 #define IOMAP_F_NEW		0x01	/* blocks have been newly allocated */
 #define IOMAP_F_DIRTY		0x02	/* uncommitted metadata */
 #define IOMAP_F_BUFFER_HEAD	0x04	/* file system requires buffer heads */
+#define IOMAP_F_SIZE_CHANGED	0x08	/* file size has changed */
 
 /*
  * Flags that only need to be reported for IOMAP_REPORT requests:
-- 
cgit v1.2.3


From 8ff80fbe7e9870078b1cc3c2cdd8f3f223b333a9 Mon Sep 17 00:00:00 2001
From: Baoquan He <bhe@redhat.com>
Date: Fri, 24 May 2019 15:38:10 +0800
Subject: x86/kdump/64: Restrict kdump kernel reservation to <64TB

Restrict kdump to only reserve crashkernel below 64TB.

The reason is that the kdump may jump from a 5-level paging mode to a
4-level paging mode kernel. If a 4-level paging mode kdump kernel is put
above 64TB, then the kdump kernel cannot start.

The 1st kernel reserves the kdump kernel region during bootup. At that
point it is not known whether the kdump kernel has 5-level or 4-level
paging support.

To support both, restrict the kdump kernel reservation to the lower 64TB
address space to ensure that a 4-level paging mode kdump kernel can be
loaded and successfully started.

[ tglx: Massaged changelog ]

Signed-off-by: Baoquan He <bhe@redhat.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Dave Young <dyoung@redhat.com>
Cc: bp@alien8.de
Cc: hpa@zytor.com
Link: https://lkml.kernel.org/r/20190524073810.24298-4-bhe@redhat.com
---
 arch/x86/kernel/setup.c | 15 ++++++++++++---
 include/linux/sizes.h   |  1 +
 2 files changed, 13 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 08a5f4a131f5..dcbdf54fb5c1 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -453,15 +453,24 @@ static void __init memblock_x86_reserve_range_setup_data(void)
 #define CRASH_ALIGN		SZ_16M
 
 /*
- * Keep the crash kernel below this limit.  On 32 bits earlier kernels
- * would limit the kernel to the low 512 MiB due to mapping restrictions.
+ * Keep the crash kernel below this limit.
+ *
+ * On 32 bits earlier kernels would limit the kernel to the low 512 MiB
+ * due to mapping restrictions.
+ *
+ * On 64bit, kdump kernel need be restricted to be under 64TB, which is
+ * the upper limit of system RAM in 4-level paing mode. Since the kdump
+ * jumping could be from 5-level to 4-level, the jumping will fail if
+ * kernel is put above 64TB, and there's no way to detect the paging mode
+ * of the kernel which will be loaded for dumping during the 1st kernel
+ * bootup.
  */
 #ifdef CONFIG_X86_32
 # define CRASH_ADDR_LOW_MAX	SZ_512M
 # define CRASH_ADDR_HIGH_MAX	SZ_512M
 #else
 # define CRASH_ADDR_LOW_MAX	SZ_4G
-# define CRASH_ADDR_HIGH_MAX	MAXMEM
+# define CRASH_ADDR_HIGH_MAX	SZ_64T
 #endif
 
 static int __init reserve_crashkernel_low(void)
diff --git a/include/linux/sizes.h b/include/linux/sizes.h
index fbde0bc7e882..8651269cb46c 100644
--- a/include/linux/sizes.h
+++ b/include/linux/sizes.h
@@ -47,5 +47,6 @@
 #define SZ_2G				0x80000000
 
 #define SZ_4G				_AC(0x100000000, ULL)
+#define SZ_64T				_AC(0x400000000000, ULL)
 
 #endif /* __LINUX_SIZES_H__ */
-- 
cgit v1.2.3


From cf394fc5f7155c24efb584979e81427575ab3539 Mon Sep 17 00:00:00 2001
From: Fredrik Noring <noring@nocrew.org>
Date: Tue, 25 Jun 2019 17:05:58 +0200
Subject: lib/genalloc.c: Add algorithm, align and zeroed family of DMA
 allocators

Provide the algorithm option to DMA allocators as well, along with
convenience variants for zeroed and aligned memory. The following
four functions are added:

- gen_pool_dma_alloc_algo()
- gen_pool_dma_alloc_align()
- gen_pool_dma_zalloc_algo()
- gen_pool_dma_zalloc_align()

Signed-off-by: Fredrik Noring <noring@nocrew.org>
Tested-by: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 include/linux/genalloc.h |  10 ++++-
 lib/genalloc.c           | 100 +++++++++++++++++++++++++++++++++++++++++++++--
 2 files changed, 105 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/genalloc.h b/include/linux/genalloc.h
index 6c62eeca754f..ed641337df87 100644
--- a/include/linux/genalloc.h
+++ b/include/linux/genalloc.h
@@ -121,7 +121,15 @@ extern unsigned long gen_pool_alloc_algo(struct gen_pool *, size_t,
 		genpool_algo_t algo, void *data);
 extern void *gen_pool_dma_alloc(struct gen_pool *pool, size_t size,
 		dma_addr_t *dma);
-void *gen_pool_dma_zalloc(struct gen_pool *pool, size_t size, dma_addr_t *dma);
+extern void *gen_pool_dma_alloc_algo(struct gen_pool *pool, size_t size,
+		dma_addr_t *dma, genpool_algo_t algo, void *data);
+extern void *gen_pool_dma_alloc_align(struct gen_pool *pool, size_t size,
+		dma_addr_t *dma, int align);
+extern void *gen_pool_dma_zalloc(struct gen_pool *pool, size_t size, dma_addr_t *dma);
+extern void *gen_pool_dma_zalloc_algo(struct gen_pool *pool, size_t size,
+		dma_addr_t *dma, genpool_algo_t algo, void *data);
+extern void *gen_pool_dma_zalloc_align(struct gen_pool *pool, size_t size,
+		dma_addr_t *dma, int align);
 extern void gen_pool_free(struct gen_pool *, unsigned long, size_t);
 extern void gen_pool_for_each_chunk(struct gen_pool *,
 	void (*)(struct gen_pool *, struct gen_pool_chunk *, void *), void *);
diff --git a/lib/genalloc.c b/lib/genalloc.c
index 5db43476a19b..512623fbac51 100644
--- a/lib/genalloc.c
+++ b/lib/genalloc.c
@@ -347,13 +347,35 @@ EXPORT_SYMBOL(gen_pool_alloc_algo);
  * Return: virtual address of the allocated memory, or %NULL on failure
  */
 void *gen_pool_dma_alloc(struct gen_pool *pool, size_t size, dma_addr_t *dma)
+{
+	return gen_pool_dma_alloc_algo(pool, size, dma, pool->algo, pool->data);
+}
+EXPORT_SYMBOL(gen_pool_dma_alloc);
+
+/**
+ * gen_pool_dma_alloc_algo - allocate special memory from the pool for DMA
+ * usage with the given pool algorithm
+ * @pool: pool to allocate from
+ * @size: number of bytes to allocate from the pool
+ * @dma: DMA-view physical address return value. Use %NULL if unneeded.
+ * @algo: algorithm passed from caller
+ * @data: data passed to algorithm
+ *
+ * Allocate the requested number of bytes from the specified pool. Uses the
+ * given pool allocation function. Can not be used in NMI handler on
+ * architectures without NMI-safe cmpxchg implementation.
+ *
+ * Return: virtual address of the allocated memory, or %NULL on failure
+ */
+void *gen_pool_dma_alloc_algo(struct gen_pool *pool, size_t size,
+		dma_addr_t *dma, genpool_algo_t algo, void *data)
 {
 	unsigned long vaddr;
 
 	if (!pool)
 		return NULL;
 
-	vaddr = gen_pool_alloc(pool, size);
+	vaddr = gen_pool_alloc_algo(pool, size, algo, data);
 	if (!vaddr)
 		return NULL;
 
@@ -362,7 +384,31 @@ void *gen_pool_dma_alloc(struct gen_pool *pool, size_t size, dma_addr_t *dma)
 
 	return (void *)vaddr;
 }
-EXPORT_SYMBOL(gen_pool_dma_alloc);
+EXPORT_SYMBOL(gen_pool_dma_alloc_algo);
+
+/**
+ * gen_pool_dma_alloc_align - allocate special memory from the pool for DMA
+ * usage with the given alignment
+ * @pool: pool to allocate from
+ * @size: number of bytes to allocate from the pool
+ * @dma: DMA-view physical address return value. Use %NULL if unneeded.
+ * @align: alignment in bytes for starting address
+ *
+ * Allocate the requested number bytes from the specified pool, with the given
+ * alignment restriction. Can not be used in NMI handler on architectures
+ * without NMI-safe cmpxchg implementation.
+ *
+ * Return: virtual address of the allocated memory, or %NULL on failure
+ */
+void *gen_pool_dma_alloc_align(struct gen_pool *pool, size_t size,
+		dma_addr_t *dma, int align)
+{
+	struct genpool_data_align data = { .align = align };
+
+	return gen_pool_dma_alloc_algo(pool, size, dma,
+			gen_pool_first_fit_align, &data);
+}
+EXPORT_SYMBOL(gen_pool_dma_alloc_align);
 
 /**
  * gen_pool_dma_zalloc - allocate special zeroed memory from the pool for
@@ -380,14 +426,60 @@ EXPORT_SYMBOL(gen_pool_dma_alloc);
  */
 void *gen_pool_dma_zalloc(struct gen_pool *pool, size_t size, dma_addr_t *dma)
 {
-	void *vaddr = gen_pool_dma_alloc(pool, size, dma);
+	return gen_pool_dma_zalloc_algo(pool, size, dma, pool->algo, pool->data);
+}
+EXPORT_SYMBOL(gen_pool_dma_zalloc);
+
+/**
+ * gen_pool_dma_zalloc_algo - allocate special zeroed memory from the pool for
+ * DMA usage with the given pool algorithm
+ * @pool: pool to allocate from
+ * @size: number of bytes to allocate from the pool
+ * @dma: DMA-view physical address return value. Use %NULL if unneeded.
+ * @algo: algorithm passed from caller
+ * @data: data passed to algorithm
+ *
+ * Allocate the requested number of zeroed bytes from the specified pool. Uses
+ * the given pool allocation function. Can not be used in NMI handler on
+ * architectures without NMI-safe cmpxchg implementation.
+ *
+ * Return: virtual address of the allocated zeroed memory, or %NULL on failure
+ */
+void *gen_pool_dma_zalloc_algo(struct gen_pool *pool, size_t size,
+		dma_addr_t *dma, genpool_algo_t algo, void *data)
+{
+	void *vaddr = gen_pool_dma_alloc_algo(pool, size, dma, algo, data);
 
 	if (vaddr)
 		memset(vaddr, 0, size);
 
 	return vaddr;
 }
-EXPORT_SYMBOL(gen_pool_dma_zalloc);
+EXPORT_SYMBOL(gen_pool_dma_zalloc_algo);
+
+/**
+ * gen_pool_dma_zalloc_align - allocate special zeroed memory from the pool for
+ * DMA usage with the given alignment
+ * @pool: pool to allocate from
+ * @size: number of bytes to allocate from the pool
+ * @dma: DMA-view physical address return value. Use %NULL if unneeded.
+ * @align: alignment in bytes for starting address
+ *
+ * Allocate the requested number of zeroed bytes from the specified pool,
+ * with the given alignment restriction. Can not be used in NMI handler on
+ * architectures without NMI-safe cmpxchg implementation.
+ *
+ * Return: virtual address of the allocated zeroed memory, or %NULL on failure
+ */
+void *gen_pool_dma_zalloc_align(struct gen_pool *pool, size_t size,
+		dma_addr_t *dma, int align)
+{
+	struct genpool_data_align data = { .align = align };
+
+	return gen_pool_dma_zalloc_algo(pool, size, dma,
+			gen_pool_first_fit_align, &data);
+}
+EXPORT_SYMBOL(gen_pool_dma_zalloc_align);
 
 /**
  * gen_pool_free - free allocated special memory back to the pool
-- 
cgit v1.2.3


From b53b0b9d9a613c418057f6cb921c2f40a6f78c24 Mon Sep 17 00:00:00 2001
From: "Joel Fernandes (Google)" <joel@joelfernandes.org>
Date: Tue, 30 Apr 2019 12:21:53 -0400
Subject: pidfd: add polling support

This patch adds polling support to pidfd.

Android low memory killer (LMK) needs to know when a process dies once
it is sent the kill signal. It does so by checking for the existence of
/proc/pid which is both racy and slow. For example, if a PID is reused
between when LMK sends a kill signal and when it checks for existence of
the PID, the wrong process is possibly checked for existence.
Using the polling support, LMK will be able to get notified when a process
exits in a race-free and fast way, which allows the LMK to do other things
(such as polling on other fds) while awaiting the process being killed
to die.

For notification to polling processes, we follow the same existing
mechanism in the kernel used when the parent of the task group is to be
notified of a child's death (do_notify_parent). This is precisely when the
tasks waiting on a poll of pidfd are also awakened in this patch.

We have decided to include the waitqueue in struct pid for the following
reasons:
1. The wait queue has to survive for the lifetime of the poll. Including
   it in task_struct would not be an option in this case because the task
   can be reaped and destroyed before the poll returns.

2. Including the waitqueue in struct pid means that during
   de_thread(), the new thread group leader automatically gets the new
   waitqueue/pid even though its task_struct is different.

Appropriate test cases are added in the second patch to provide coverage of
all the cases the patch is handling.

Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Daniel Colascione <dancol@google.com>
Cc: Jann Horn <jannh@google.com>
Cc: Tim Murray <timmurray@google.com>
Cc: Jonathan Kowalski <bl0pbl33p@gmail.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Kees Cook <keescook@chromium.org>
Cc: David Howells <dhowells@redhat.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: kernel-team@android.com
Reviewed-by: Oleg Nesterov <oleg@redhat.com>
Co-developed-by: Daniel Colascione <dancol@google.com>
Signed-off-by: Daniel Colascione <dancol@google.com>
Signed-off-by: Joel Fernandes (Google) <joel@joelfernandes.org>
Signed-off-by: Christian Brauner <christian@brauner.io>
---
 include/linux/pid.h |  3 +++
 kernel/fork.c       | 26 ++++++++++++++++++++++++++
 kernel/pid.c        |  2 ++
 kernel/signal.c     | 11 +++++++++++
 4 files changed, 42 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/pid.h b/include/linux/pid.h
index 3c8ef5a199ca..1484db6ca8d1 100644
--- a/include/linux/pid.h
+++ b/include/linux/pid.h
@@ -3,6 +3,7 @@
 #define _LINUX_PID_H
 
 #include <linux/rculist.h>
+#include <linux/wait.h>
 
 enum pid_type
 {
@@ -60,6 +61,8 @@ struct pid
 	unsigned int level;
 	/* lists of tasks that use this pid */
 	struct hlist_head tasks[PIDTYPE_MAX];
+	/* wait queue for pidfd notifications */
+	wait_queue_head_t wait_pidfd;
 	struct rcu_head rcu;
 	struct upid numbers[1];
 };
diff --git a/kernel/fork.c b/kernel/fork.c
index b4cba953040a..2cdf295b72c7 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1704,8 +1704,34 @@ static void pidfd_show_fdinfo(struct seq_file *m, struct file *f)
 }
 #endif
 
+/*
+ * Poll support for process exit notification.
+ */
+static unsigned int pidfd_poll(struct file *file, struct poll_table_struct *pts)
+{
+	struct task_struct *task;
+	struct pid *pid = file->private_data;
+	int poll_flags = 0;
+
+	poll_wait(file, &pid->wait_pidfd, pts);
+
+	rcu_read_lock();
+	task = pid_task(pid, PIDTYPE_PID);
+	/*
+	 * Inform pollers only when the whole thread group exits.
+	 * If the thread group leader exits before all other threads in the
+	 * group, then poll(2) should block, similar to the wait(2) family.
+	 */
+	if (!task || (task->exit_state && thread_group_empty(task)))
+		poll_flags = POLLIN | POLLRDNORM;
+	rcu_read_unlock();
+
+	return poll_flags;
+}
+
 const struct file_operations pidfd_fops = {
 	.release = pidfd_release,
+	.poll = pidfd_poll,
 #ifdef CONFIG_PROC_FS
 	.show_fdinfo = pidfd_show_fdinfo,
 #endif
diff --git a/kernel/pid.c b/kernel/pid.c
index 89548d35eefb..6ce3a95968f7 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -213,6 +213,8 @@ struct pid *alloc_pid(struct pid_namespace *ns)
 	for (type = 0; type < PIDTYPE_MAX; ++type)
 		INIT_HLIST_HEAD(&pid->tasks[type]);
 
+	init_waitqueue_head(&pid->wait_pidfd);
+
 	upid = pid->numbers + ns->level;
 	spin_lock_irq(&pidmap_lock);
 	if (!(ns->pid_allocated & PIDNS_ADDING))
diff --git a/kernel/signal.c b/kernel/signal.c
index a1eb44dc9ff5..1c86b78a7597 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1803,6 +1803,14 @@ ret:
 	return ret;
 }
 
+static void do_notify_pidfd(struct task_struct *task)
+{
+	struct pid *pid;
+
+	pid = task_pid(task);
+	wake_up_all(&pid->wait_pidfd);
+}
+
 /*
  * Let a parent know about the death of a child.
  * For a stopped/continued status change, use do_notify_parent_cldstop instead.
@@ -1826,6 +1834,9 @@ bool do_notify_parent(struct task_struct *tsk, int sig)
 	BUG_ON(!tsk->ptrace &&
 	       (tsk->group_leader != tsk || !thread_group_empty(tsk)));
 
+	/* Wake up all pidfd waiters */
+	do_notify_pidfd(tsk);
+
 	if (sig != SIGCHLD) {
 		/*
 		 * This is only possible if parent == real_parent.
-- 
cgit v1.2.3


From 32fcb426ec001cb6d5a4a195091a8486ea77e2df Mon Sep 17 00:00:00 2001
From: Christian Brauner <christian@brauner.io>
Date: Fri, 24 May 2019 12:43:51 +0200
Subject: pid: add pidfd_open()

This adds the pidfd_open() syscall. It allows a caller to retrieve pollable
pidfds for a process which did not get created via CLONE_PIDFD, i.e. for a
process that is created via traditional fork()/clone() calls that is only
referenced by a PID:

int pidfd = pidfd_open(1234, 0);
ret = pidfd_send_signal(pidfd, SIGSTOP, NULL, 0);

With the introduction of pidfds through CLONE_PIDFD it is possible to
create pidfds at process creation time.
However, a lot of processes get created with traditional PID-based calls
such as fork() or clone() (without CLONE_PIDFD). For these processes a
caller can currently not create a pollable pidfd. This is a problem for
Android's low memory killer (LMK) and service managers such as systemd.
Both are examples of tools that want to make use of pidfds to get reliable
notification of process exit for non-parents (pidfd polling) and race-free
signal sending (pidfd_send_signal()). They intend to switch to this API for
process supervision/management as soon as possible. Having no way to get
pollable pidfds from PID-only processes is one of the biggest blockers for
them in adopting this api. With pidfd_open() making it possible to retrieve
pidfds for PID-based processes we enable them to adopt this api.

In line with Arnd's recent changes to consolidate syscall numbers across
architectures, I have added the pidfd_open() syscall to all architectures
at the same time.

Signed-off-by: Christian Brauner <christian@brauner.io>
Reviewed-by: David Howells <dhowells@redhat.com>
Reviewed-by: Oleg Nesterov <oleg@redhat.com>
Acked-by: Arnd Bergmann <arnd@arndb.de>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Joel Fernandes (Google) <joel@joelfernandes.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Jann Horn <jannh@google.com>
Cc: Andy Lutomirsky <luto@kernel.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Aleksa Sarai <cyphar@cyphar.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: linux-api@vger.kernel.org
---
 include/linux/syscalls.h |  1 +
 kernel/pid.c             | 69 ++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 70 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index e2870fe1be5b..989055e0b501 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -929,6 +929,7 @@ asmlinkage long sys_clock_adjtime32(clockid_t which_clock,
 				struct old_timex32 __user *tx);
 asmlinkage long sys_syncfs(int fd);
 asmlinkage long sys_setns(int fd, int nstype);
+asmlinkage long sys_pidfd_open(pid_t pid, unsigned int flags);
 asmlinkage long sys_sendmmsg(int fd, struct mmsghdr __user *msg,
 			     unsigned int vlen, unsigned flags);
 asmlinkage long sys_process_vm_readv(pid_t pid,
diff --git a/kernel/pid.c b/kernel/pid.c
index 6ce3a95968f7..8e6f50053364 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -37,6 +37,8 @@
 #include <linux/syscalls.h>
 #include <linux/proc_ns.h>
 #include <linux/proc_fs.h>
+#include <linux/anon_inodes.h>
+#include <linux/sched/signal.h>
 #include <linux/sched/task.h>
 #include <linux/idr.h>
 
@@ -452,6 +454,73 @@ struct pid *find_ge_pid(int nr, struct pid_namespace *ns)
 	return idr_get_next(&ns->idr, &nr);
 }
 
+/**
+ * pidfd_create() - Create a new pid file descriptor.
+ *
+ * @pid:  struct pid that the pidfd will reference
+ *
+ * This creates a new pid file descriptor with the O_CLOEXEC flag set.
+ *
+ * Note, that this function can only be called after the fd table has
+ * been unshared to avoid leaking the pidfd to the new process.
+ *
+ * Return: On success, a cloexec pidfd is returned.
+ *         On error, a negative errno number will be returned.
+ */
+static int pidfd_create(struct pid *pid)
+{
+	int fd;
+
+	fd = anon_inode_getfd("[pidfd]", &pidfd_fops, get_pid(pid),
+			      O_RDWR | O_CLOEXEC);
+	if (fd < 0)
+		put_pid(pid);
+
+	return fd;
+}
+
+/**
+ * pidfd_open() - Open new pid file descriptor.
+ *
+ * @pid:   pid for which to retrieve a pidfd
+ * @flags: flags to pass
+ *
+ * This creates a new pid file descriptor with the O_CLOEXEC flag set for
+ * the process identified by @pid. Currently, the process identified by
+ * @pid must be a thread-group leader. This restriction currently exists
+ * for all aspects of pidfds including pidfd creation (CLONE_PIDFD cannot
+ * be used with CLONE_THREAD) and pidfd polling (only supports thread group
+ * leaders).
+ *
+ * Return: On success, a cloexec pidfd is returned.
+ *         On error, a negative errno number will be returned.
+ */
+SYSCALL_DEFINE2(pidfd_open, pid_t, pid, unsigned int, flags)
+{
+	int fd, ret;
+	struct pid *p;
+
+	if (flags)
+		return -EINVAL;
+
+	if (pid <= 0)
+		return -EINVAL;
+
+	p = find_get_pid(pid);
+	if (!p)
+		return -ESRCH;
+
+	ret = 0;
+	rcu_read_lock();
+	if (!pid_task(p, PIDTYPE_TGID))
+		ret = -EINVAL;
+	rcu_read_unlock();
+
+	fd = ret ?: pidfd_create(p);
+	put_pid(p);
+	return fd;
+}
+
 void __init pid_idr_init(void)
 {
 	/* Verify no one has done anything silly: */
-- 
cgit v1.2.3


From a4496d52b3430cb3c4c16d03cdd5f4ee97ad1241 Mon Sep 17 00:00:00 2001
From: Enric Balletbo i Serra <enric.balletbo@collabora.com>
Date: Tue, 7 May 2019 11:52:47 +0200
Subject: power: supply: add input power and voltage limit properties

For thermal management strategy you might be interested in limiting the
input power for a power supply. We already have a current limit, but
what we probably want is to limit power. So, introduce the
input_power_limit property.

Although the common use case is to limit the input power, in some
specific cases it is the voltage that is problematic (i.e. some regulators
have different efficiencies at higher voltage resulting in more heat).
So also introduce the input_voltage_limit property.

This happens in one Chromebook and is used on the Pixel C's thermal
management strategy to effectively limit the input power to 5V 3A when
the screen is on. When the screen is on, the display, the CPU, and the GPU
all contribute more heat to the system than while the screen is off, and
we made a tradeoff to throttle the charger in order to give more of the
thermal budget to those other components.

So there's nothing fundamentally broken about the hardware that would
cause the Pixel C to malfunction if we were charging at 9V or 12V instead
of 5V when the screen is on, i.e. if userspace doesn't change this.

What would happen is that you wouldn't meet Google's skin temperature
targets on the system if the charger was allowed to run at 9V or 12V with
the screen on.

For folks hacking on Pixel Cs (which is now outside of Google's official
support window for Android) and customizing their own kernel and userspace
this would be acceptable, but we wanted to expose this feature in the
power supply properties because the feature does exist in the Embedded
Controller firmware of the Pixel C and all of Google's Chromebooks with
USB-C made since 2015 in case someone running an up to date kernel wanted
to limit the charging power for thermal or other reasons.

This patch exposes a new property, similar to input current limit, to
re-configure the maximum voltage from the external supply at runtime
based on system-level knowledge or user input.

Signed-off-by: Enric Balletbo i Serra <enric.balletbo@collabora.com>
Reviewed-by: Guenter Roeck <groeck@chromium.org>
Acked-by: Adam Thomson <Adam.Thomson.Opensource@diasemi.com>
Reviewed-by: Benson Leung <bleung@chromium.org>
Signed-off-by: Sebastian Reichel <sebastian.reichel@collabora.com>
---
 Documentation/ABI/testing/sysfs-class-power | 32 +++++++++++++++++++++++++++++
 Documentation/power/power_supply_class.txt  |  4 ++++
 drivers/power/supply/power_supply_sysfs.c   |  2 ++
 include/linux/power_supply.h                |  2 ++
 4 files changed, 40 insertions(+)

(limited to 'include/linux')

diff --git a/Documentation/ABI/testing/sysfs-class-power b/Documentation/ABI/testing/sysfs-class-power
index b77e30b9014e..27edc06e2495 100644
--- a/Documentation/ABI/testing/sysfs-class-power
+++ b/Documentation/ABI/testing/sysfs-class-power
@@ -376,10 +376,42 @@ Description:
 		supply. Normally this is configured based on the type of
 		connection made (e.g. A configured SDP should output a maximum
 		of 500mA so the input current limit is set to the same value).
+		Use preferably input_power_limit, and for problems that can be
+		solved using power limit use input_current_limit.
 
 		Access: Read, Write
 		Valid values: Represented in microamps
 
+What:		/sys/class/power_supply/<supply_name>/input_voltage_limit
+Date:		May 2019
+Contact:	linux-pm@vger.kernel.org
+Description:
+		This entry configures the incoming VBUS voltage limit currently
+		set in the supply. Normally this is configured based on
+		system-level knowledge or user input (e.g. This is part of the
+		Pixel C's thermal management strategy to effectively limit the
+		input power to 5V when the screen is on to meet Google's skin
+		temperature targets). Note that this feature should not be
+		used for safety critical things.
+		Use preferably input_power_limit, and for problems that can be
+		solved using power limit use input_voltage_limit.
+
+		Access: Read, Write
+		Valid values: Represented in microvolts
+
+What:		/sys/class/power_supply/<supply_name>/input_power_limit
+Date:		May 2019
+Contact:	linux-pm@vger.kernel.org
+Description:
+		This entry configures the incoming power limit currently set
+		in the supply. Normally this is configured based on
+		system-level knowledge or user input. Use preferably this
+		feature to limit the incoming power and use current/voltage
+		limit only for problems that can be solved using power limit.
+
+		Access: Read, Write
+		Valid values: Represented in microwatts
+
 What:		/sys/class/power_supply/<supply_name>/online,
 Date:		May 2007
 Contact:	linux-pm@vger.kernel.org
diff --git a/Documentation/power/power_supply_class.txt b/Documentation/power/power_supply_class.txt
index 300d37896e51..1e3c705111db 100644
--- a/Documentation/power/power_supply_class.txt
+++ b/Documentation/power/power_supply_class.txt
@@ -137,6 +137,10 @@ power supply object.
 
 INPUT_CURRENT_LIMIT - input current limit programmed by charger. Indicates
 the current drawn from a charging source.
+INPUT_VOLTAGE_LIMIT - input voltage limit programmed by charger. Indicates
+the voltage limit from a charging source.
+INPUT_POWER_LIMIT - input power limit programmed by charger. Indicates
+the power limit from a charging source.
 
 CHARGE_CONTROL_LIMIT - current charge control limit setting
 CHARGE_CONTROL_LIMIT_MAX - maximum charge control limit setting
diff --git a/drivers/power/supply/power_supply_sysfs.c b/drivers/power/supply/power_supply_sysfs.c
index a704a76d7529..829e12c800e5 100644
--- a/drivers/power/supply/power_supply_sysfs.c
+++ b/drivers/power/supply/power_supply_sysfs.c
@@ -277,6 +277,8 @@ static struct device_attribute power_supply_attrs[] = {
 	POWER_SUPPLY_ATTR(charge_control_start_threshold),
 	POWER_SUPPLY_ATTR(charge_control_end_threshold),
 	POWER_SUPPLY_ATTR(input_current_limit),
+	POWER_SUPPLY_ATTR(input_voltage_limit),
+	POWER_SUPPLY_ATTR(input_power_limit),
 	POWER_SUPPLY_ATTR(energy_full_design),
 	POWER_SUPPLY_ATTR(energy_empty_design),
 	POWER_SUPPLY_ATTR(energy_full),
diff --git a/include/linux/power_supply.h b/include/linux/power_supply.h
index d5b15e039f4f..cbb708b57b11 100644
--- a/include/linux/power_supply.h
+++ b/include/linux/power_supply.h
@@ -129,6 +129,8 @@ enum power_supply_property {
 	POWER_SUPPLY_PROP_CHARGE_CONTROL_START_THRESHOLD, /* in percents! */
 	POWER_SUPPLY_PROP_CHARGE_CONTROL_END_THRESHOLD, /* in percents! */
 	POWER_SUPPLY_PROP_INPUT_CURRENT_LIMIT,
+	POWER_SUPPLY_PROP_INPUT_VOLTAGE_LIMIT,
+	POWER_SUPPLY_PROP_INPUT_POWER_LIMIT,
 	POWER_SUPPLY_PROP_ENERGY_FULL_DESIGN,
 	POWER_SUPPLY_PROP_ENERGY_EMPTY_DESIGN,
 	POWER_SUPPLY_PROP_ENERGY_FULL,
-- 
cgit v1.2.3


From 32e454efbb2279b0fa5874abb0944a9d42080ad1 Mon Sep 17 00:00:00 2001
From: Russell King <rmk+kernel@armlinux.org.uk>
Date: Tue, 25 Jun 2019 10:44:33 +0100
Subject: net: phylink: further documentation clarifications

Clarify the validate() behaviour in a few cases which weren't mentioned
in the documentation, but which are necessary for users to get the
correct behaviour.

Signed-off-by: Russell King <rmk+kernel@armlinux.org.uk>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/phylink.h | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/phylink.h b/include/linux/phylink.h
index 2d2e55dfea94..5b130140fb8f 100644
--- a/include/linux/phylink.h
+++ b/include/linux/phylink.h
@@ -93,12 +93,19 @@ struct phylink_mac_ops {
  * Note that the PHY may be able to transform from one connection
  * technology to another, so, eg, don't clear 1000BaseX just
  * because the MAC is unable to BaseX mode. This is more about
- * clearing unsupported speeds and duplex settings.
+ * clearing unsupported speeds and duplex settings. The port modes
+ * should not be cleared; phylink_set_port_modes() will help with this.
  *
  * If the @state->interface mode is %PHY_INTERFACE_MODE_1000BASEX
  * or %PHY_INTERFACE_MODE_2500BASEX, select the appropriate mode
  * based on @state->advertising and/or @state->speed and update
- * @state->interface accordingly.
+ * @state->interface accordingly. See phylink_helper_basex_speed().
+ *
+ * When @state->interface is %PHY_INTERFACE_MODE_NA, phylink expects the
+ * MAC driver to return all supported link modes.
+ *
+ * If the @state->interface mode is not supported, then the @supported
+ * mask must be cleared.
  */
 void validate(struct net_device *ndev, unsigned long *supported,
 	      struct phylink_link_state *state);
-- 
cgit v1.2.3


From 5233794b179136d597b84188c1285148f07012e6 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Tue, 18 Jun 2019 13:15:06 +0200
Subject: net/mlx5e: reduce stack usage in mlx5_eswitch_termtbl_create

Putting an empty 'mlx5_flow_spec' structure on the stack is a bit
wasteful and causes a warning on 32-bit architectures when building
with clang -fsanitize-coverage:

drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads_termtbl.c: In function 'mlx5_eswitch_termtbl_create':
drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads_termtbl.c:90:1: error: the frame size of 1032 bytes is larger than 1024 bytes [-Werror=frame-larger-than=]

Since the structure is never written to, we can statically allocate
it to avoid the stack usage. To be on the safe side, mark all
subsequent function arguments that we pass it into as 'const'
as well.

Fixes: 10caabdaad5a ("net/mlx5e: Use termination table for VLAN push actions")
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Acked-by: Saeed Mahameed <saeedm@mellanox.com>
Acked-by: Mark Bloch <markb@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 .../mellanox/mlx5/core/eswitch_offloads_termtbl.c    |  2 +-
 drivers/net/ethernet/mellanox/mlx5/core/fs_core.c    | 20 ++++++++++----------
 include/linux/mlx5/fs.h                              |  2 +-
 3 files changed, 12 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads_termtbl.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads_termtbl.c
index cb7d8ebe2c95..1d55a324a17e 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads_termtbl.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads_termtbl.c
@@ -49,8 +49,8 @@ mlx5_eswitch_termtbl_create(struct mlx5_core_dev *dev,
 			    struct mlx5_termtbl_handle *tt,
 			    struct mlx5_flow_act *flow_act)
 {
+	static const struct mlx5_flow_spec spec = {};
 	struct mlx5_flow_namespace *root_ns;
-	struct mlx5_flow_spec spec = {};
 	int prio, flags;
 	int err;
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
index 585e7adcbf99..a68a51c5011a 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
@@ -584,7 +584,7 @@ err_ida_remove:
 }
 
 static struct fs_fte *alloc_fte(struct mlx5_flow_table *ft,
-				struct mlx5_flow_spec *spec,
+				const struct mlx5_flow_spec *spec,
 				struct mlx5_flow_act *flow_act)
 {
 	struct mlx5_flow_steering *steering = get_steering(&ft->node);
@@ -613,7 +613,7 @@ static void dealloc_flow_group(struct mlx5_flow_steering *steering,
 
 static struct mlx5_flow_group *alloc_flow_group(struct mlx5_flow_steering *steering,
 						u8 match_criteria_enable,
-						void *match_criteria,
+						const void *match_criteria,
 						int start_index,
 						int end_index)
 {
@@ -643,7 +643,7 @@ static struct mlx5_flow_group *alloc_flow_group(struct mlx5_flow_steering *steer
 
 static struct mlx5_flow_group *alloc_insert_flow_group(struct mlx5_flow_table *ft,
 						       u8 match_criteria_enable,
-						       void *match_criteria,
+						       const void *match_criteria,
 						       int start_index,
 						       int end_index,
 						       struct list_head *prev)
@@ -1286,7 +1286,7 @@ free_handle:
 }
 
 static struct mlx5_flow_group *alloc_auto_flow_group(struct mlx5_flow_table  *ft,
-						     struct mlx5_flow_spec *spec)
+						     const struct mlx5_flow_spec *spec)
 {
 	struct list_head *prev = &ft->node.children;
 	struct mlx5_flow_group *fg;
@@ -1454,7 +1454,7 @@ static int check_conflicting_ftes(struct fs_fte *fte,
 }
 
 static struct mlx5_flow_handle *add_rule_fg(struct mlx5_flow_group *fg,
-					    struct mlx5_flow_spec *spec,
+					    const struct mlx5_flow_spec *spec,
 					    struct mlx5_flow_act *flow_act,
 					    struct mlx5_flow_destination *dest,
 					    int dest_num,
@@ -1539,7 +1539,7 @@ static void free_match_list(struct match_list_head *head)
 
 static int build_match_list(struct match_list_head *match_head,
 			    struct mlx5_flow_table *ft,
-			    struct mlx5_flow_spec *spec)
+			    const struct mlx5_flow_spec *spec)
 {
 	struct rhlist_head *tmp, *list;
 	struct mlx5_flow_group *g;
@@ -1592,7 +1592,7 @@ static u64 matched_fgs_get_version(struct list_head *match_head)
 
 static struct fs_fte *
 lookup_fte_locked(struct mlx5_flow_group *g,
-		  u32 *match_value,
+		  const u32 *match_value,
 		  bool take_write)
 {
 	struct fs_fte *fte_tmp;
@@ -1625,7 +1625,7 @@ out:
 static struct mlx5_flow_handle *
 try_add_to_existing_fg(struct mlx5_flow_table *ft,
 		       struct list_head *match_head,
-		       struct mlx5_flow_spec *spec,
+		       const struct mlx5_flow_spec *spec,
 		       struct mlx5_flow_act *flow_act,
 		       struct mlx5_flow_destination *dest,
 		       int dest_num,
@@ -1716,7 +1716,7 @@ out:
 
 static struct mlx5_flow_handle *
 _mlx5_add_flow_rules(struct mlx5_flow_table *ft,
-		     struct mlx5_flow_spec *spec,
+		     const struct mlx5_flow_spec *spec,
 		     struct mlx5_flow_act *flow_act,
 		     struct mlx5_flow_destination *dest,
 		     int dest_num)
@@ -1823,7 +1823,7 @@ static bool fwd_next_prio_supported(struct mlx5_flow_table *ft)
 
 struct mlx5_flow_handle *
 mlx5_add_flow_rules(struct mlx5_flow_table *ft,
-		    struct mlx5_flow_spec *spec,
+		    const struct mlx5_flow_spec *spec,
 		    struct mlx5_flow_act *flow_act,
 		    struct mlx5_flow_destination *dest,
 		    int num_dest)
diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h
index dc7e7aa53a13..04a569568eac 100644
--- a/include/linux/mlx5/fs.h
+++ b/include/linux/mlx5/fs.h
@@ -208,7 +208,7 @@ struct mlx5_flow_act {
  */
 struct mlx5_flow_handle *
 mlx5_add_flow_rules(struct mlx5_flow_table *ft,
-		    struct mlx5_flow_spec *spec,
+		    const struct mlx5_flow_spec *spec,
 		    struct mlx5_flow_act *flow_act,
 		    struct mlx5_flow_destination *dest,
 		    int num_dest);
-- 
cgit v1.2.3


From c8af5cd75e2411d5a5aacf115f59a5ff6b87f3fa Mon Sep 17 00:00:00 2001
From: Toke Høiland-Jørgensen <toke@redhat.com>
Date: Fri, 28 Jun 2019 11:12:34 +0200
Subject: xskmap: Move non-standard list manipulation to helper
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add a helper in list.h for the non-standard way of clearing a list that is
used in xskmap. This makes it easier to reuse it in the other map types,
and also makes sure this usage is not forgotten in any list refactorings in
the future.

Signed-off-by: Toke Høiland-Jørgensen <toke@redhat.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/linux/list.h | 14 ++++++++++++++
 kernel/bpf/xskmap.c  |  3 +--
 2 files changed, 15 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/list.h b/include/linux/list.h
index e951228db4b2..85c92555e31f 100644
--- a/include/linux/list.h
+++ b/include/linux/list.h
@@ -106,6 +106,20 @@ static inline void __list_del(struct list_head * prev, struct list_head * next)
 	WRITE_ONCE(prev->next, next);
 }
 
+/*
+ * Delete a list entry and clear the 'prev' pointer.
+ *
+ * This is a special-purpose list clearing method used in the networking code
+ * for lists allocated as per-cpu, where we don't want to incur the extra
+ * WRITE_ONCE() overhead of a regular list_del_init(). The code that uses this
+ * needs to check the node 'prev' pointer instead of calling list_empty().
+ */
+static inline void __list_del_clearprev(struct list_head *entry)
+{
+	__list_del(entry->prev, entry->next);
+	entry->prev = NULL;
+}
+
 /**
  * list_del - deletes entry from list.
  * @entry: the element to delete from the list.
diff --git a/kernel/bpf/xskmap.c b/kernel/bpf/xskmap.c
index ef7338cebd18..9bb96ace9fa1 100644
--- a/kernel/bpf/xskmap.c
+++ b/kernel/bpf/xskmap.c
@@ -145,8 +145,7 @@ void __xsk_map_flush(struct bpf_map *map)
 
 	list_for_each_entry_safe(xs, tmp, flush_list, flush_node) {
 		xsk_flush(xs);
-		__list_del(xs->flush_node.prev, xs->flush_node.next);
-		xs->flush_node.prev = NULL;
+		__list_del_clearprev(&xs->flush_node);
 	}
 }
 
-- 
cgit v1.2.3


From 4b55cf290dc6bd3a9e5da26d1ad60e77aa88c8cf Mon Sep 17 00:00:00 2001
From: Toke Høiland-Jørgensen <toke@redhat.com>
Date: Fri, 28 Jun 2019 11:12:34 +0200
Subject: devmap: Rename ifindex member in bpf_redirect_info
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The bpf_redirect_info struct has an 'ifindex' member which was named back
when the redirects could only target egress interfaces. Now that we can
also redirect to sockets and CPUs, this is a bit misleading, so rename the
member to tgt_index.

Reorder the struct members so we can have 'tgt_index' and 'tgt_value' next
to each other in a subsequent patch.

Signed-off-by: Toke Høiland-Jørgensen <toke@redhat.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/linux/filter.h |  2 +-
 net/core/filter.c      | 26 +++++++++++++-------------
 2 files changed, 14 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/filter.h b/include/linux/filter.h
index 340f7d648974..92bd192f7786 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -578,8 +578,8 @@ struct bpf_skb_data_end {
 };
 
 struct bpf_redirect_info {
-	u32 ifindex;
 	u32 flags;
+	u32 tgt_index;
 	struct bpf_map *map;
 	struct bpf_map *map_to_flush;
 	u32 kern_flags;
diff --git a/net/core/filter.c b/net/core/filter.c
index 1e5fd37e9ab5..b4a062379bb9 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -2158,8 +2158,8 @@ BPF_CALL_2(bpf_redirect, u32, ifindex, u64, flags)
 	if (unlikely(flags & ~(BPF_F_INGRESS)))
 		return TC_ACT_SHOT;
 
-	ri->ifindex = ifindex;
 	ri->flags = flags;
+	ri->tgt_index = ifindex;
 
 	return TC_ACT_REDIRECT;
 }
@@ -2169,8 +2169,8 @@ int skb_do_redirect(struct sk_buff *skb)
 	struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
 	struct net_device *dev;
 
-	dev = dev_get_by_index_rcu(dev_net(skb->dev), ri->ifindex);
-	ri->ifindex = 0;
+	dev = dev_get_by_index_rcu(dev_net(skb->dev), ri->tgt_index);
+	ri->tgt_index = 0;
 	if (unlikely(!dev)) {
 		kfree_skb(skb);
 		return -EINVAL;
@@ -3488,11 +3488,11 @@ xdp_do_redirect_slow(struct net_device *dev, struct xdp_buff *xdp,
 		     struct bpf_prog *xdp_prog, struct bpf_redirect_info *ri)
 {
 	struct net_device *fwd;
-	u32 index = ri->ifindex;
+	u32 index = ri->tgt_index;
 	int err;
 
 	fwd = dev_get_by_index_rcu(dev_net(dev), index);
-	ri->ifindex = 0;
+	ri->tgt_index = 0;
 	if (unlikely(!fwd)) {
 		err = -EINVAL;
 		goto err;
@@ -3604,11 +3604,11 @@ static int xdp_do_redirect_map(struct net_device *dev, struct xdp_buff *xdp,
 			       struct bpf_prog *xdp_prog, struct bpf_map *map,
 			       struct bpf_redirect_info *ri)
 {
-	u32 index = ri->ifindex;
+	u32 index = ri->tgt_index;
 	void *fwd = NULL;
 	int err;
 
-	ri->ifindex = 0;
+	ri->tgt_index = 0;
 	WRITE_ONCE(ri->map, NULL);
 
 	fwd = __xdp_map_lookup_elem(map, index);
@@ -3651,11 +3651,11 @@ static int xdp_do_generic_redirect_map(struct net_device *dev,
 				       struct bpf_map *map)
 {
 	struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
-	u32 index = ri->ifindex;
+	u32 index = ri->tgt_index;
 	void *fwd = NULL;
 	int err = 0;
 
-	ri->ifindex = 0;
+	ri->tgt_index = 0;
 	WRITE_ONCE(ri->map, NULL);
 
 	fwd = __xdp_map_lookup_elem(map, index);
@@ -3695,14 +3695,14 @@ int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb,
 {
 	struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
 	struct bpf_map *map = READ_ONCE(ri->map);
-	u32 index = ri->ifindex;
+	u32 index = ri->tgt_index;
 	struct net_device *fwd;
 	int err = 0;
 
 	if (map)
 		return xdp_do_generic_redirect_map(dev, skb, xdp, xdp_prog,
 						   map);
-	ri->ifindex = 0;
+	ri->tgt_index = 0;
 	fwd = dev_get_by_index_rcu(dev_net(dev), index);
 	if (unlikely(!fwd)) {
 		err = -EINVAL;
@@ -3730,8 +3730,8 @@ BPF_CALL_2(bpf_xdp_redirect, u32, ifindex, u64, flags)
 	if (unlikely(flags))
 		return XDP_ABORTED;
 
-	ri->ifindex = ifindex;
 	ri->flags = flags;
+	ri->tgt_index = ifindex;
 	WRITE_ONCE(ri->map, NULL);
 
 	return XDP_REDIRECT;
@@ -3753,8 +3753,8 @@ BPF_CALL_3(bpf_xdp_redirect_map, struct bpf_map *, map, u32, ifindex,
 	if (unlikely(flags))
 		return XDP_ABORTED;
 
-	ri->ifindex = ifindex;
 	ri->flags = flags;
+	ri->tgt_index = ifindex;
 	WRITE_ONCE(ri->map, map);
 
 	return XDP_REDIRECT;
-- 
cgit v1.2.3


From 43e74c0267a35d6f5127218054b2d80c7fe801f5 Mon Sep 17 00:00:00 2001
From: Toke Høiland-Jørgensen <toke@redhat.com>
Date: Fri, 28 Jun 2019 11:12:34 +0200
Subject: bpf_xdp_redirect_map: Perform map lookup in eBPF helper
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The bpf_redirect_map() helper used by XDP programs doesn't return any
indication of whether it can successfully redirect to the map index it was
given. Instead, BPF programs have to track this themselves, leading to
programs using duplicate maps to track which entries are populated in the
devmap.

This patch fixes this by moving the map lookup into the bpf_redirect_map()
helper, which makes it possible to return failure to the eBPF program. The
lower bits of the flags argument are used as the return code, which means
that existing users who pass a '0' flag argument will get XDP_ABORTED.

With this, a BPF program can check the return code from the helper call and
react by, for instance, substituting a different redirect. This works for
any type of map used for redirect.

Signed-off-by: Toke Høiland-Jørgensen <toke@redhat.com>
Acked-by: Jonathan Lemon <jonathan.lemon@gmail.com>
Acked-by: Andrii Nakryiko <andriin@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/linux/filter.h     |  1 +
 include/trace/events/xdp.h |  5 ++---
 include/uapi/linux/bpf.h   |  7 +++++--
 net/core/filter.c          | 32 ++++++++++++++++++--------------
 4 files changed, 26 insertions(+), 19 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/filter.h b/include/linux/filter.h
index 92bd192f7786..1fe53e78c7e3 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -580,6 +580,7 @@ struct bpf_skb_data_end {
 struct bpf_redirect_info {
 	u32 flags;
 	u32 tgt_index;
+	void *tgt_value;
 	struct bpf_map *map;
 	struct bpf_map *map_to_flush;
 	u32 kern_flags;
diff --git a/include/trace/events/xdp.h b/include/trace/events/xdp.h
index 81e708c4b513..68899fdc985b 100644
--- a/include/trace/events/xdp.h
+++ b/include/trace/events/xdp.h
@@ -175,9 +175,8 @@ struct _bpf_dtab_netdev {
 #endif /* __DEVMAP_OBJ_TYPE */
 
 #define devmap_ifindex(fwd, map)				\
-	(!fwd ? 0 :						\
-	 ((map->map_type == BPF_MAP_TYPE_DEVMAP) ?		\
-	  ((struct _bpf_dtab_netdev *)fwd)->dev->ifindex : 0))
+	((map->map_type == BPF_MAP_TYPE_DEVMAP) ?		\
+	  ((struct _bpf_dtab_netdev *)fwd)->dev->ifindex : 0)
 
 #define _trace_xdp_redirect_map(dev, xdp, fwd, map, idx)		\
 	 trace_xdp_redirect_map(dev, xdp, devmap_ifindex(fwd, map),	\
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index a396b516a2b2..cffea1826a1f 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -1571,8 +1571,11 @@ union bpf_attr {
  * 		but this is only implemented for native XDP (with driver
  * 		support) as of this writing).
  *
- * 		All values for *flags* are reserved for future usage, and must
- * 		be left at zero.
+ * 		The lower two bits of *flags* are used as the return code if
+ * 		the map lookup fails. This is so that the return value can be
+ * 		one of the XDP program return codes up to XDP_TX, as chosen by
+ * 		the caller. Any higher bits in the *flags* argument must be
+ * 		unset.
  *
  * 		When used to redirect packets to net devices, this helper
  * 		provides a high performance increase over **bpf_redirect**\ ().
diff --git a/net/core/filter.c b/net/core/filter.c
index b4a062379bb9..4836264f82ee 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -3605,17 +3605,13 @@ static int xdp_do_redirect_map(struct net_device *dev, struct xdp_buff *xdp,
 			       struct bpf_redirect_info *ri)
 {
 	u32 index = ri->tgt_index;
-	void *fwd = NULL;
+	void *fwd = ri->tgt_value;
 	int err;
 
 	ri->tgt_index = 0;
+	ri->tgt_value = NULL;
 	WRITE_ONCE(ri->map, NULL);
 
-	fwd = __xdp_map_lookup_elem(map, index);
-	if (unlikely(!fwd)) {
-		err = -EINVAL;
-		goto err;
-	}
 	if (ri->map_to_flush && unlikely(ri->map_to_flush != map))
 		xdp_do_flush_map();
 
@@ -3652,18 +3648,13 @@ static int xdp_do_generic_redirect_map(struct net_device *dev,
 {
 	struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
 	u32 index = ri->tgt_index;
-	void *fwd = NULL;
+	void *fwd = ri->tgt_value;
 	int err = 0;
 
 	ri->tgt_index = 0;
+	ri->tgt_value = NULL;
 	WRITE_ONCE(ri->map, NULL);
 
-	fwd = __xdp_map_lookup_elem(map, index);
-	if (unlikely(!fwd)) {
-		err = -EINVAL;
-		goto err;
-	}
-
 	if (map->map_type == BPF_MAP_TYPE_DEVMAP) {
 		struct bpf_dtab_netdev *dst = fwd;
 
@@ -3732,6 +3723,7 @@ BPF_CALL_2(bpf_xdp_redirect, u32, ifindex, u64, flags)
 
 	ri->flags = flags;
 	ri->tgt_index = ifindex;
+	ri->tgt_value = NULL;
 	WRITE_ONCE(ri->map, NULL);
 
 	return XDP_REDIRECT;
@@ -3750,9 +3742,21 @@ BPF_CALL_3(bpf_xdp_redirect_map, struct bpf_map *, map, u32, ifindex,
 {
 	struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
 
-	if (unlikely(flags))
+	/* Lower bits of the flags are used as return code on lookup failure */
+	if (unlikely(flags > XDP_TX))
 		return XDP_ABORTED;
 
+	ri->tgt_value = __xdp_map_lookup_elem(map, ifindex);
+	if (unlikely(!ri->tgt_value)) {
+		/* If the lookup fails we want to clear out the state in the
+		 * redirect_info struct completely, so that if an eBPF program
+		 * performs multiple lookups, the last one always takes
+		 * precedence.
+		 */
+		WRITE_ONCE(ri->map, NULL);
+		return flags;
+	}
+
 	ri->flags = flags;
 	ri->tgt_index = ifindex;
 	WRITE_ONCE(ri->map, map);
-- 
cgit v1.2.3


From 5e4c7cf60ec3cad59703c203de1dfb31ea608e6e Mon Sep 17 00:00:00 2001
From: Revanth Rajashekar <revanth.rajashekar@intel.com>
Date: Thu, 27 Jun 2019 16:30:02 -0600
Subject: block: sed-opal: PSID reverttper capability

The PSID is a 32-character password printed on the drive label;
knowing it proves physical access to the drive. The PSID reverttper
function is very useful for regaining control of a drive that is
locked and that the user can no longer access because of some
failure. However, *all the data on the drive is completely
erased*. This method is advisable only when the user has exhausted
all other recovery methods.

PSID capabilities are described in:
https://trustedcomputinggroup.org/wp-content/uploads/TCG_Storage-Opal_Feature_Set_PSID_v1.00_r1.00.pdf

Signed-off-by: Revanth Rajashekar <revanth.rajashekar@intel.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/sed-opal.c              | 33 +++++++++++++++++++++++++++++----
 include/linux/sed-opal.h      |  1 +
 include/uapi/linux/sed-opal.h |  1 +
 3 files changed, 31 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/block/sed-opal.c b/block/sed-opal.c
index a46e8d13e16d..bb8ef7963d11 100644
--- a/block/sed-opal.c
+++ b/block/sed-opal.c
@@ -1307,6 +1307,7 @@ static int start_generic_opal_session(struct opal_dev *dev,
 		break;
 	case OPAL_ADMIN1_UID:
 	case OPAL_SID_UID:
+	case OPAL_PSID_UID:
 		add_token_u8(&err, dev, OPAL_STARTNAME);
 		add_token_u8(&err, dev, 0); /* HostChallenge */
 		add_token_bytestring(&err, dev, key, key_len);
@@ -1367,6 +1368,16 @@ static int start_admin1LSP_opal_session(struct opal_dev *dev, void *data)
 					  key->key, key->key_len);
 }
 
+static int start_PSID_opal_session(struct opal_dev *dev, void *data)
+{
+	const struct opal_key *okey = data;
+
+	return start_generic_opal_session(dev, OPAL_PSID_UID,
+					  OPAL_ADMINSP_UID,
+					  okey->key,
+					  okey->key_len);
+}
+
 static int start_auth_opal_session(struct opal_dev *dev, void *data)
 {
 	struct opal_session_info *session = data;
@@ -2030,17 +2041,28 @@ static int opal_add_user_to_lr(struct opal_dev *dev,
 	return ret;
 }
 
-static int opal_reverttper(struct opal_dev *dev, struct opal_key *opal)
+static int opal_reverttper(struct opal_dev *dev, struct opal_key *opal, bool psid)
 {
+	/* controller will terminate session */
 	const struct opal_step revert_steps[] = {
 		{ start_SIDASP_opal_session, opal },
-		{ revert_tper, } /* controller will terminate session */
+		{ revert_tper, }
 	};
+	const struct opal_step psid_revert_steps[] = {
+		{ start_PSID_opal_session, opal },
+		{ revert_tper, }
+	};
+
 	int ret;
 
 	mutex_lock(&dev->dev_lock);
 	setup_opal_dev(dev);
-	ret = execute_steps(dev, revert_steps, ARRAY_SIZE(revert_steps));
+	if (psid)
+		ret = execute_steps(dev, psid_revert_steps,
+				    ARRAY_SIZE(psid_revert_steps));
+	else
+		ret = execute_steps(dev, revert_steps,
+				    ARRAY_SIZE(revert_steps));
 	mutex_unlock(&dev->dev_lock);
 
 	/*
@@ -2280,7 +2302,7 @@ int sed_ioctl(struct opal_dev *dev, unsigned int cmd, void __user *arg)
 		ret = opal_activate_user(dev, p);
 		break;
 	case IOC_OPAL_REVERT_TPR:
-		ret = opal_reverttper(dev, p);
+		ret = opal_reverttper(dev, p, false);
 		break;
 	case IOC_OPAL_LR_SETUP:
 		ret = opal_setup_locking_range(dev, p);
@@ -2297,6 +2319,9 @@ int sed_ioctl(struct opal_dev *dev, unsigned int cmd, void __user *arg)
 	case IOC_OPAL_SECURE_ERASE_LR:
 		ret = opal_secure_erase_locking_range(dev, p);
 		break;
+	case IOC_OPAL_PSID_REVERT_TPR:
+		ret = opal_reverttper(dev, p, true);
+		break;
 	default:
 		break;
 	}
diff --git a/include/linux/sed-opal.h b/include/linux/sed-opal.h
index 3e76b6d7d97f..f03bbffd3281 100644
--- a/include/linux/sed-opal.h
+++ b/include/linux/sed-opal.h
@@ -39,6 +39,7 @@ static inline bool is_sed_ioctl(unsigned int cmd)
 	case IOC_OPAL_ENABLE_DISABLE_MBR:
 	case IOC_OPAL_ERASE_LR:
 	case IOC_OPAL_SECURE_ERASE_LR:
+	case IOC_OPAL_PSID_REVERT_TPR:
 		return true;
 	}
 	return false;
diff --git a/include/uapi/linux/sed-opal.h b/include/uapi/linux/sed-opal.h
index 33e53b80cd1f..7a03e5b4df6e 100644
--- a/include/uapi/linux/sed-opal.h
+++ b/include/uapi/linux/sed-opal.h
@@ -107,5 +107,6 @@ struct opal_mbr_data {
 #define IOC_OPAL_ENABLE_DISABLE_MBR _IOW('p', 229, struct opal_mbr_data)
 #define IOC_OPAL_ERASE_LR           _IOW('p', 230, struct opal_session_info)
 #define IOC_OPAL_SECURE_ERASE_LR    _IOW('p', 231, struct opal_session_info)
+#define IOC_OPAL_PSID_REVERT_TPR    _IOW('p', 232, struct opal_key)
 
 #endif /* _UAPI_SED_OPAL_H */
-- 
cgit v1.2.3


From b2d0d99135ad145667765cbd27f148c1a4cd50d1 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 26 Jun 2019 15:49:20 +0200
Subject: block: move the BIO_NO_PAGE_REF check into bio_release_pages

Move the BIO_NO_PAGE_REF check into bio_release_pages instead of
duplicating it in both callers.

Also make the function available outside of bio.c so that we can
reuse it in other direct I/O implementations.

Reviewed-by: Minwoo Im <minwoo.im.dev@gmail.com>
Reviewed-by: Johannes Thumshirn <jthumshirn@suse.de>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/bio.c         | 11 ++++++-----
 include/linux/bio.h |  1 +
 2 files changed, 7 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/block/bio.c b/block/bio.c
index bb55b94bb361..b35356c6093b 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -845,11 +845,14 @@ static void bio_get_pages(struct bio *bio)
 		get_page(bvec->bv_page);
 }
 
-static void bio_release_pages(struct bio *bio)
+void bio_release_pages(struct bio *bio)
 {
 	struct bvec_iter_all iter_all;
 	struct bio_vec *bvec;
 
+	if (bio_flagged(bio, BIO_NO_PAGE_REF))
+		return;
+
 	bio_for_each_segment_all(bvec, bio, iter_all)
 		put_page(bvec->bv_page);
 }
@@ -1681,8 +1684,7 @@ static void bio_dirty_fn(struct work_struct *work)
 		next = bio->bi_private;
 
 		bio_set_pages_dirty(bio);
-		if (!bio_flagged(bio, BIO_NO_PAGE_REF))
-			bio_release_pages(bio);
+		bio_release_pages(bio);
 		bio_put(bio);
 	}
 }
@@ -1698,8 +1700,7 @@ void bio_check_pages_dirty(struct bio *bio)
 			goto defer;
 	}
 
-	if (!bio_flagged(bio, BIO_NO_PAGE_REF))
-		bio_release_pages(bio);
+	bio_release_pages(bio);
 	bio_put(bio);
 	return;
 defer:
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 5a8ae56e09ff..6d82b4856282 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -426,6 +426,7 @@ bool __bio_try_merge_page(struct bio *bio, struct page *page,
 void __bio_add_page(struct bio *bio, struct page *page,
 		unsigned int len, unsigned int off);
 int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter);
+void bio_release_pages(struct bio *bio);
 struct rq_map_data;
 extern struct bio *bio_map_user_iov(struct request_queue *,
 				    struct iov_iter *, gfp_t);
-- 
cgit v1.2.3


From d241a95f3514a5eb544dfd8d9d141ffd1c89b707 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 26 Jun 2019 15:49:21 +0200
Subject: block: optionally mark pages dirty in bio_release_pages

A lot of callers of bio_release_pages also want to mark the released
pages as dirty.  Add a mark_dirty parameter to avoid a second
relatively expensive bio_for_each_segment_all loop.

Reviewed-by: Minwoo Im <minwoo.im.dev@gmail.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/bio.c         | 12 +++++++-----
 include/linux/bio.h |  2 +-
 2 files changed, 8 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/block/bio.c b/block/bio.c
index b35356c6093b..8a7b315630ce 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -845,7 +845,7 @@ static void bio_get_pages(struct bio *bio)
 		get_page(bvec->bv_page);
 }
 
-void bio_release_pages(struct bio *bio)
+void bio_release_pages(struct bio *bio, bool mark_dirty)
 {
 	struct bvec_iter_all iter_all;
 	struct bio_vec *bvec;
@@ -853,8 +853,11 @@ void bio_release_pages(struct bio *bio)
 	if (bio_flagged(bio, BIO_NO_PAGE_REF))
 		return;
 
-	bio_for_each_segment_all(bvec, bio, iter_all)
+	bio_for_each_segment_all(bvec, bio, iter_all) {
+		if (mark_dirty && !PageCompound(bvec->bv_page))
+			set_page_dirty_lock(bvec->bv_page);
 		put_page(bvec->bv_page);
+	}
 }
 
 static int __bio_iov_bvec_add_pages(struct bio *bio, struct iov_iter *iter)
@@ -1683,8 +1686,7 @@ static void bio_dirty_fn(struct work_struct *work)
 	while ((bio = next) != NULL) {
 		next = bio->bi_private;
 
-		bio_set_pages_dirty(bio);
-		bio_release_pages(bio);
+		bio_release_pages(bio, true);
 		bio_put(bio);
 	}
 }
@@ -1700,7 +1702,7 @@ void bio_check_pages_dirty(struct bio *bio)
 			goto defer;
 	}
 
-	bio_release_pages(bio);
+	bio_release_pages(bio, false);
 	bio_put(bio);
 	return;
 defer:
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 6d82b4856282..2d8c73f0ecaf 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -426,7 +426,7 @@ bool __bio_try_merge_page(struct bio *bio, struct page *page,
 void __bio_add_page(struct bio *bio, struct page *page,
 		unsigned int len, unsigned int off);
 int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter);
-void bio_release_pages(struct bio *bio);
+void bio_release_pages(struct bio *bio, bool mark_dirty);
 struct rq_map_data;
 extern struct bio *bio_map_user_iov(struct request_queue *,
 				    struct iov_iter *, gfp_t);
-- 
cgit v1.2.3


From b620743077e291ae7d0debd21f50413a8c266229 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 26 Jun 2019 15:49:28 +0200
Subject: block: never take page references for ITER_BVEC

If we pass pages through an iov_iter we always already have a reference
in the caller.  Thus remove ITER_BVEC_FLAG_NO_REF and don't take
references to pages by default for bvec-backed iov_iters.

Reviewed-by: Minwoo Im <minwoo.im.dev@gmail.com>
Reviewed-by: Johannes Thumshirn <jthumshirn@suse.de>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/bio.c          | 14 +-------------
 drivers/block/loop.c | 16 ++++------------
 fs/io_uring.c        |  3 ---
 include/linux/uio.h  | 10 +---------
 4 files changed, 6 insertions(+), 37 deletions(-)

(limited to 'include/linux')

diff --git a/block/bio.c b/block/bio.c
index 1cbf2a7c245e..5733b9426231 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -836,15 +836,6 @@ int bio_add_page(struct bio *bio, struct page *page,
 }
 EXPORT_SYMBOL(bio_add_page);
 
-static void bio_get_pages(struct bio *bio)
-{
-	struct bvec_iter_all iter_all;
-	struct bio_vec *bvec;
-
-	bio_for_each_segment_all(bvec, bio, iter_all)
-		get_page(bvec->bv_page);
-}
-
 void bio_release_pages(struct bio *bio, bool mark_dirty)
 {
 	struct bvec_iter_all iter_all;
@@ -960,11 +951,8 @@ int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
 			ret = __bio_iov_iter_get_pages(bio, iter);
 	} while (!ret && iov_iter_count(iter) && !bio_full(bio));
 
-	if (iov_iter_bvec_no_ref(iter))
+	if (is_bvec)
 		bio_set_flag(bio, BIO_NO_PAGE_REF);
-	else if (is_bvec)
-		bio_get_pages(bio);
-
 	return bio->bi_vcnt ? 0 : ret;
 }
 
diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index f11b7dc16e9d..44c9985f352a 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -264,20 +264,12 @@ lo_do_transfer(struct loop_device *lo, int cmd,
 	return ret;
 }
 
-static inline void loop_iov_iter_bvec(struct iov_iter *i,
-		unsigned int direction, const struct bio_vec *bvec,
-		unsigned long nr_segs, size_t count)
-{
-	iov_iter_bvec(i, direction, bvec, nr_segs, count);
-	i->type |= ITER_BVEC_FLAG_NO_REF;
-}
-
 static int lo_write_bvec(struct file *file, struct bio_vec *bvec, loff_t *ppos)
 {
 	struct iov_iter i;
 	ssize_t bw;
 
-	loop_iov_iter_bvec(&i, WRITE, bvec, 1, bvec->bv_len);
+	iov_iter_bvec(&i, WRITE, bvec, 1, bvec->bv_len);
 
 	file_start_write(file);
 	bw = vfs_iter_write(file, &i, ppos, 0);
@@ -355,7 +347,7 @@ static int lo_read_simple(struct loop_device *lo, struct request *rq,
 	ssize_t len;
 
 	rq_for_each_segment(bvec, rq, iter) {
-		loop_iov_iter_bvec(&i, READ, &bvec, 1, bvec.bv_len);
+		iov_iter_bvec(&i, READ, &bvec, 1, bvec.bv_len);
 		len = vfs_iter_read(lo->lo_backing_file, &i, &pos, 0);
 		if (len < 0)
 			return len;
@@ -396,7 +388,7 @@ static int lo_read_transfer(struct loop_device *lo, struct request *rq,
 		b.bv_offset = 0;
 		b.bv_len = bvec.bv_len;
 
-		loop_iov_iter_bvec(&i, READ, &b, 1, b.bv_len);
+		iov_iter_bvec(&i, READ, &b, 1, b.bv_len);
 		len = vfs_iter_read(lo->lo_backing_file, &i, &pos, 0);
 		if (len < 0) {
 			ret = len;
@@ -563,7 +555,7 @@ static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd,
 	}
 	atomic_set(&cmd->ref, 2);
 
-	loop_iov_iter_bvec(&iter, rw, bvec, nr_bvec, blk_rq_bytes(rq));
+	iov_iter_bvec(&iter, rw, bvec, nr_bvec, blk_rq_bytes(rq));
 	iter.iov_offset = offset;
 
 	cmd->iocb.ki_pos = pos;
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 86a2bd721900..eb6ab1507913 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -997,9 +997,6 @@ static int io_import_fixed(struct io_ring_ctx *ctx, int rw,
 	iov_iter_bvec(iter, rw, imu->bvec, imu->nr_bvecs, offset + len);
 	if (offset)
 		iov_iter_advance(iter, offset);
-
-	/* don't drop a reference to these pages */
-	iter->type |= ITER_BVEC_FLAG_NO_REF;
 	return 0;
 }
 
diff --git a/include/linux/uio.h b/include/linux/uio.h
index 2c90a0842ee8..cea1761c5672 100644
--- a/include/linux/uio.h
+++ b/include/linux/uio.h
@@ -19,9 +19,6 @@ struct kvec {
 };
 
 enum iter_type {
-	/* set if ITER_BVEC doesn't hold a bv_page ref */
-	ITER_BVEC_FLAG_NO_REF = 2,
-
 	/* iter types */
 	ITER_IOVEC = 4,
 	ITER_KVEC = 8,
@@ -56,7 +53,7 @@ struct iov_iter {
 
 static inline enum iter_type iov_iter_type(const struct iov_iter *i)
 {
-	return i->type & ~(READ | WRITE | ITER_BVEC_FLAG_NO_REF);
+	return i->type & ~(READ | WRITE);
 }
 
 static inline bool iter_is_iovec(const struct iov_iter *i)
@@ -89,11 +86,6 @@ static inline unsigned char iov_iter_rw(const struct iov_iter *i)
 	return i->type & (READ | WRITE);
 }
 
-static inline bool iov_iter_bvec_no_ref(const struct iov_iter *i)
-{
-	return (i->type & ITER_BVEC_FLAG_NO_REF) != 0;
-}
-
 /*
  * Total number of bytes covered by an iovec.
  *
-- 
cgit v1.2.3


From c9888443413e4e06013e482fc484dbb9c559c145 Mon Sep 17 00:00:00 2001
From: Jonas Rabenstein <jonas.rabenstein@studium.uni-erlangen.de>
Date: Tue, 21 May 2019 22:46:44 +0200
Subject: block: sed-opal: add ioctl for done-mark of shadow mbr

Enable users to mark the shadow mbr as done without completely
deactivating the shadow mbr feature. This may be useful on reboots,
when the power to the disk is not disconnected in between and the shadow
mbr stores the required boot files. Of course, this also saves the
(few) commands required to enable the feature if it is already enabled
and one only wants to mark the shadow mbr as done.

Co-authored-by: David Kozub <zub@linux.fjfi.cvut.cz>
Signed-off-by: Jonas Rabenstein <jonas.rabenstein@studium.uni-erlangen.de>
Signed-off-by: David Kozub <zub@linux.fjfi.cvut.cz>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Scott Bauer <sbauer@plzdonthack.me>
Reviewed-by: Jon Derrick <jonathan.derrick@intel.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/sed-opal.c              | 27 +++++++++++++++++++++++++++
 include/linux/sed-opal.h      |  1 +
 include/uapi/linux/sed-opal.h | 12 ++++++++++++
 3 files changed, 40 insertions(+)

(limited to 'include/linux')

diff --git a/block/sed-opal.c b/block/sed-opal.c
index c54019c11e91..f94f359dd688 100644
--- a/block/sed-opal.c
+++ b/block/sed-opal.c
@@ -1989,6 +1989,30 @@ static int opal_enable_disable_shadow_mbr(struct opal_dev *dev,
 	return ret;
 }
 
+static int opal_set_mbr_done(struct opal_dev *dev,
+			     struct opal_mbr_done *mbr_done)
+{
+	u8 mbr_done_tf = mbr_done->done_flag == OPAL_MBR_DONE ?
+		OPAL_TRUE : OPAL_FALSE;
+
+	const struct opal_step mbr_steps[] = {
+		{ start_admin1LSP_opal_session, &mbr_done->key },
+		{ set_mbr_done, &mbr_done_tf },
+		{ end_opal_session, }
+	};
+	int ret;
+
+	if (mbr_done->done_flag != OPAL_MBR_DONE &&
+	    mbr_done->done_flag != OPAL_MBR_NOT_DONE)
+		return -EINVAL;
+
+	mutex_lock(&dev->dev_lock);
+	setup_opal_dev(dev);
+	ret = execute_steps(dev, mbr_steps, ARRAY_SIZE(mbr_steps));
+	mutex_unlock(&dev->dev_lock);
+	return ret;
+}
+
 static int opal_save(struct opal_dev *dev, struct opal_lock_unlock *lk_unlk)
 {
 	struct opal_suspend_data *suspend;
@@ -2310,6 +2334,9 @@ int sed_ioctl(struct opal_dev *dev, unsigned int cmd, void __user *arg)
 	case IOC_OPAL_ENABLE_DISABLE_MBR:
 		ret = opal_enable_disable_shadow_mbr(dev, p);
 		break;
+	case IOC_OPAL_MBR_DONE:
+		ret = opal_set_mbr_done(dev, p);
+		break;
 	case IOC_OPAL_ERASE_LR:
 		ret = opal_erase_locking_range(dev, p);
 		break;
diff --git a/include/linux/sed-opal.h b/include/linux/sed-opal.h
index f03bbffd3281..f834e8a1495f 100644
--- a/include/linux/sed-opal.h
+++ b/include/linux/sed-opal.h
@@ -40,6 +40,7 @@ static inline bool is_sed_ioctl(unsigned int cmd)
 	case IOC_OPAL_ERASE_LR:
 	case IOC_OPAL_SECURE_ERASE_LR:
 	case IOC_OPAL_PSID_REVERT_TPR:
+	case IOC_OPAL_MBR_DONE:
 		return true;
 	}
 	return false;
diff --git a/include/uapi/linux/sed-opal.h b/include/uapi/linux/sed-opal.h
index 7a03e5b4df6e..5681f55d334b 100644
--- a/include/uapi/linux/sed-opal.h
+++ b/include/uapi/linux/sed-opal.h
@@ -20,6 +20,11 @@ enum opal_mbr {
 	OPAL_MBR_DISABLE = 0x01,
 };
 
+enum opal_mbr_done_flag {
+	OPAL_MBR_NOT_DONE = 0x0,
+	OPAL_MBR_DONE = 0x01
+};
+
 enum opal_user {
 	OPAL_ADMIN1 = 0x0,
 	OPAL_USER1 = 0x01,
@@ -95,6 +100,12 @@ struct opal_mbr_data {
 	__u8 __align[7];
 };
 
+struct opal_mbr_done {
+	struct opal_key key;
+	__u8 done_flag;
+	__u8 __align[7];
+};
+
 #define IOC_OPAL_SAVE		    _IOW('p', 220, struct opal_lock_unlock)
 #define IOC_OPAL_LOCK_UNLOCK	    _IOW('p', 221, struct opal_lock_unlock)
 #define IOC_OPAL_TAKE_OWNERSHIP	    _IOW('p', 222, struct opal_key)
@@ -108,5 +119,6 @@ struct opal_mbr_data {
 #define IOC_OPAL_ERASE_LR           _IOW('p', 230, struct opal_session_info)
 #define IOC_OPAL_SECURE_ERASE_LR    _IOW('p', 231, struct opal_session_info)
 #define IOC_OPAL_PSID_REVERT_TPR    _IOW('p', 232, struct opal_key)
+#define IOC_OPAL_MBR_DONE           _IOW('p', 233, struct opal_mbr_done)
 
 #endif /* _UAPI_SED_OPAL_H */
-- 
cgit v1.2.3


From a9b25b4cf2b76d320afc999f881ccb805fecdd84 Mon Sep 17 00:00:00 2001
From: Jonas Rabenstein <jonas.rabenstein@studium.uni-erlangen.de>
Date: Tue, 21 May 2019 22:46:45 +0200
Subject: block: sed-opal: ioctl for writing to shadow mbr

Allow modification of the shadow mbr. If the shadow mbr is not marked as
done, this data will be presented read only as the device content. Only
after marking the shadow mbr as done and unlocking a locking range is the
actual content accessible.

Co-authored-by: David Kozub <zub@linux.fjfi.cvut.cz>
Signed-off-by: Jonas Rabenstein <jonas.rabenstein@studium.uni-erlangen.de>
Signed-off-by: David Kozub <zub@linux.fjfi.cvut.cz>
Reviewed-by: Scott Bauer <sbauer@plzdonthack.me>
Reviewed-by: Jon Derrick <jonathan.derrick@intel.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/sed-opal.c              | 91 ++++++++++++++++++++++++++++++++++++++++++-
 include/linux/sed-opal.h      |  1 +
 include/uapi/linux/sed-opal.h |  8 ++++
 3 files changed, 98 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/block/sed-opal.c b/block/sed-opal.c
index f94f359dd688..b02ef2ff0d75 100644
--- a/block/sed-opal.c
+++ b/block/sed-opal.c
@@ -26,6 +26,9 @@
 #define IO_BUFFER_LENGTH 2048
 #define MAX_TOKS 64
 
+/* Number of bytes needed by cmd_finalize. */
+#define CMD_FINALIZE_BYTES_NEEDED 7
+
 struct opal_step {
 	int (*fn)(struct opal_dev *dev, void *data);
 	void *data;
@@ -523,12 +526,17 @@ static int opal_discovery0_step(struct opal_dev *dev)
 	return execute_step(dev, &discovery0_step, 0);
 }
 
+static size_t remaining_size(struct opal_dev *cmd)
+{
+	return IO_BUFFER_LENGTH - cmd->pos;
+}
+
 static bool can_add(int *err, struct opal_dev *cmd, size_t len)
 {
 	if (*err)
 		return false;
 
-	if (len > IO_BUFFER_LENGTH || cmd->pos > IO_BUFFER_LENGTH - len) {
+	if (remaining_size(cmd) < len) {
 		pr_debug("Error adding %zu bytes: end of buffer.\n", len);
 		*err = -ERANGE;
 		return false;
@@ -674,7 +682,11 @@ static int cmd_finalize(struct opal_dev *cmd, u32 hsn, u32 tsn)
 	struct opal_header *hdr;
 	int err = 0;
 
-	/* close the parameter list opened from cmd_start */
+	/*
+	 * Close the parameter list opened from cmd_start.
+	 * The number of bytes added must be equal to
+	 * CMD_FINALIZE_BYTES_NEEDED.
+	 */
 	add_token_u8(&err, cmd, OPAL_ENDLIST);
 
 	add_token_u8(&err, cmd, OPAL_ENDOFDATA);
@@ -1536,6 +1548,58 @@ static int set_mbr_enable_disable(struct opal_dev *dev, void *data)
 	return finalize_and_send(dev, parse_and_check_status);
 }
 
+static int write_shadow_mbr(struct opal_dev *dev, void *data)
+{
+	struct opal_shadow_mbr *shadow = data;
+	const u8 __user *src;
+	u8 *dst;
+	size_t off = 0;
+	u64 len;
+	int err = 0;
+
+	/* do the actual transmission(s) */
+	src = (u8 __user *)(uintptr_t)shadow->data;
+	while (off < shadow->size) {
+		err = cmd_start(dev, opaluid[OPAL_MBR], opalmethod[OPAL_SET]);
+		add_token_u8(&err, dev, OPAL_STARTNAME);
+		add_token_u8(&err, dev, OPAL_WHERE);
+		add_token_u64(&err, dev, shadow->offset + off);
+		add_token_u8(&err, dev, OPAL_ENDNAME);
+
+		add_token_u8(&err, dev, OPAL_STARTNAME);
+		add_token_u8(&err, dev, OPAL_VALUES);
+
+		/*
+		 * The bytestring header is either 1 or 2 bytes, so assume 2.
+		 * There also needs to be enough space to accommodate the
+		 * trailing OPAL_ENDNAME (1 byte) and tokens added by
+		 * cmd_finalize.
+		 */
+		len = min(remaining_size(dev) - (2+1+CMD_FINALIZE_BYTES_NEEDED),
+			  (size_t)(shadow->size - off));
+		pr_debug("MBR: write bytes %zu+%llu/%llu\n",
+			 off, len, shadow->size);
+
+		dst = add_bytestring_header(&err, dev, len);
+		if (!dst)
+			break;
+		if (copy_from_user(dst, src + off, len))
+			err = -EFAULT;
+		dev->pos += len;
+
+		add_token_u8(&err, dev, OPAL_ENDNAME);
+		if (err)
+			break;
+
+		err = finalize_and_send(dev, parse_and_check_status);
+		if (err)
+			break;
+
+		off += len;
+	}
+	return err;
+}
+
 static int generic_pw_cmd(u8 *key, size_t key_len, u8 *cpin_uid,
 			  struct opal_dev *dev)
 {
@@ -2013,6 +2077,26 @@ static int opal_set_mbr_done(struct opal_dev *dev,
 	return ret;
 }
 
+static int opal_write_shadow_mbr(struct opal_dev *dev,
+				 struct opal_shadow_mbr *info)
+{
+	const struct opal_step mbr_steps[] = {
+		{ start_admin1LSP_opal_session, &info->key },
+		{ write_shadow_mbr, info },
+		{ end_opal_session, }
+	};
+	int ret;
+
+	if (info->size == 0)
+		return 0;
+
+	mutex_lock(&dev->dev_lock);
+	setup_opal_dev(dev);
+	ret = execute_steps(dev, mbr_steps, ARRAY_SIZE(mbr_steps));
+	mutex_unlock(&dev->dev_lock);
+	return ret;
+}
+
 static int opal_save(struct opal_dev *dev, struct opal_lock_unlock *lk_unlk)
 {
 	struct opal_suspend_data *suspend;
@@ -2337,6 +2421,9 @@ int sed_ioctl(struct opal_dev *dev, unsigned int cmd, void __user *arg)
 	case IOC_OPAL_MBR_DONE:
 		ret = opal_set_mbr_done(dev, p);
 		break;
+	case IOC_OPAL_WRITE_SHADOW_MBR:
+		ret = opal_write_shadow_mbr(dev, p);
+		break;
 	case IOC_OPAL_ERASE_LR:
 		ret = opal_erase_locking_range(dev, p);
 		break;
diff --git a/include/linux/sed-opal.h b/include/linux/sed-opal.h
index f834e8a1495f..53c28d750a45 100644
--- a/include/linux/sed-opal.h
+++ b/include/linux/sed-opal.h
@@ -41,6 +41,7 @@ static inline bool is_sed_ioctl(unsigned int cmd)
 	case IOC_OPAL_SECURE_ERASE_LR:
 	case IOC_OPAL_PSID_REVERT_TPR:
 	case IOC_OPAL_MBR_DONE:
+	case IOC_OPAL_WRITE_SHADOW_MBR:
 		return true;
 	}
 	return false;
diff --git a/include/uapi/linux/sed-opal.h b/include/uapi/linux/sed-opal.h
index 5681f55d334b..c6d035fa1b6c 100644
--- a/include/uapi/linux/sed-opal.h
+++ b/include/uapi/linux/sed-opal.h
@@ -106,6 +106,13 @@ struct opal_mbr_done {
 	__u8 __align[7];
 };
 
+struct opal_shadow_mbr {
+	struct opal_key key;
+	const __u64 data;
+	__u64 offset;
+	__u64 size;
+};
+
 #define IOC_OPAL_SAVE		    _IOW('p', 220, struct opal_lock_unlock)
 #define IOC_OPAL_LOCK_UNLOCK	    _IOW('p', 221, struct opal_lock_unlock)
 #define IOC_OPAL_TAKE_OWNERSHIP	    _IOW('p', 222, struct opal_key)
@@ -120,5 +127,6 @@ struct opal_mbr_done {
 #define IOC_OPAL_SECURE_ERASE_LR    _IOW('p', 231, struct opal_session_info)
 #define IOC_OPAL_PSID_REVERT_TPR    _IOW('p', 232, struct opal_key)
 #define IOC_OPAL_MBR_DONE           _IOW('p', 233, struct opal_mbr_done)
+#define IOC_OPAL_WRITE_SHADOW_MBR   _IOW('p', 234, struct opal_shadow_mbr)
 
 #endif /* _UAPI_SED_OPAL_H */
-- 
cgit v1.2.3


From b1a17513a2d60f9e933016bed04d0eeb8651a915 Mon Sep 17 00:00:00 2001
From: Clement Leger <cleger@kalray.eu>
Date: Mon, 17 Jun 2019 14:57:30 +0200
Subject: remoteproc: add vendor resources handling

In order to allow rproc backend to handle vendor resources such as in
OpenAMP, add a handle_rsc hook. This hook allows the rproc backends to
handle vendor resources as they like. The hook will be called only for
vendor resources and should return RSC_HANDLED on successful resource
handling, RSC_IGNORED if resource was ignored, or a negative value on
error.

Signed-off-by: Clement Leger <cleger@kalray.eu>
Signed-off-by: Bjorn Andersson <bjorn.andersson@linaro.org>
---
 Documentation/remoteproc.txt             | 14 +++++++++-----
 drivers/remoteproc/remoteproc_core.c     | 14 ++++++++++++++
 drivers/remoteproc/remoteproc_internal.h | 11 +++++++++++
 include/linux/remoteproc.h               | 32 ++++++++++++++++++++++++++------
 4 files changed, 60 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/remoteproc.txt b/Documentation/remoteproc.txt
index 77fb03acdbb4..03c3d2e568b0 100644
--- a/Documentation/remoteproc.txt
+++ b/Documentation/remoteproc.txt
@@ -314,6 +314,8 @@ Here are the various resource types that are currently supported::
    * @RSC_VDEV:       declare support for a virtio device, and serve as its
    *		    virtio header.
    * @RSC_LAST:       just keep this one at the end
+   * @RSC_VENDOR_START:	start of the vendor specific resource types range
+   * @RSC_VENDOR_END:	end of the vendor specific resource types range
    *
    * Please note that these values are used as indices to the rproc_handle_rsc
    * lookup table, so please keep them sane. Moreover, @RSC_LAST is used to
@@ -321,11 +323,13 @@ Here are the various resource types that are currently supported::
    * please update it as needed.
    */
   enum fw_resource_type {
-	RSC_CARVEOUT	= 0,
-	RSC_DEVMEM	= 1,
-	RSC_TRACE	= 2,
-	RSC_VDEV	= 3,
-	RSC_LAST	= 4,
+	RSC_CARVEOUT		= 0,
+	RSC_DEVMEM		= 1,
+	RSC_TRACE		= 2,
+	RSC_VDEV		= 3,
+	RSC_LAST		= 4,
+	RSC_VENDOR_START	= 128,
+	RSC_VENDOR_END		= 512,
   };
 
 For more details regarding a specific resource type, please see its
diff --git a/drivers/remoteproc/remoteproc_core.c b/drivers/remoteproc/remoteproc_core.c
index 48feebd6d0a2..263e9c9614a8 100644
--- a/drivers/remoteproc/remoteproc_core.c
+++ b/drivers/remoteproc/remoteproc_core.c
@@ -1066,6 +1066,20 @@ static int rproc_handle_resources(struct rproc *rproc,
 
 		dev_dbg(dev, "rsc: type %d\n", hdr->type);
 
+		if (hdr->type >= RSC_VENDOR_START &&
+		    hdr->type <= RSC_VENDOR_END) {
+			ret = rproc_handle_rsc(rproc, hdr->type, rsc,
+					       offset + sizeof(*hdr), avail);
+			if (ret == RSC_HANDLED)
+				continue;
+			else if (ret < 0)
+				break;
+
+			dev_warn(dev, "unsupported vendor resource %d\n",
+				 hdr->type);
+			continue;
+		}
+
 		if (hdr->type >= RSC_LAST) {
 			dev_warn(dev, "unsupported resource %d\n", hdr->type);
 			continue;
diff --git a/drivers/remoteproc/remoteproc_internal.h b/drivers/remoteproc/remoteproc_internal.h
index 45ff76a06c72..4c77bdd517b9 100644
--- a/drivers/remoteproc/remoteproc_internal.h
+++ b/drivers/remoteproc/remoteproc_internal.h
@@ -106,6 +106,17 @@ static inline int rproc_parse_fw(struct rproc *rproc, const struct firmware *fw)
 	return 0;
 }
 
+static inline
+int rproc_handle_rsc(struct rproc *rproc, u32 rsc_type, void *rsc, int offset,
+		     int avail)
+{
+	if (rproc->ops->handle_rsc)
+		return rproc->ops->handle_rsc(rproc, rsc_type, rsc, offset,
+					      avail);
+
+	return RSC_IGNORED;
+}
+
 static inline
 struct resource_table *rproc_find_loaded_rsc_table(struct rproc *rproc,
 						   const struct firmware *fw)
diff --git a/include/linux/remoteproc.h b/include/linux/remoteproc.h
index 04d04709f2bd..16ad66683ad0 100644
--- a/include/linux/remoteproc.h
+++ b/include/linux/remoteproc.h
@@ -100,7 +100,9 @@ struct fw_rsc_hdr {
  *		    the remote processor will be writing logs.
  * @RSC_VDEV:       declare support for a virtio device, and serve as its
  *		    virtio header.
- * @RSC_LAST:       just keep this one at the end
+ * @RSC_LAST:       just keep this one at the end of standard resources
+ * @RSC_VENDOR_START:	start of the vendor specific resource types range
+ * @RSC_VENDOR_END:	end of the vendor specific resource types range
  *
  * For more details regarding a specific resource type, please see its
  * dedicated structure below.
@@ -111,11 +113,13 @@ struct fw_rsc_hdr {
  * please update it as needed.
  */
 enum fw_resource_type {
-	RSC_CARVEOUT	= 0,
-	RSC_DEVMEM	= 1,
-	RSC_TRACE	= 2,
-	RSC_VDEV	= 3,
-	RSC_LAST	= 4,
+	RSC_CARVEOUT		= 0,
+	RSC_DEVMEM		= 1,
+	RSC_TRACE		= 2,
+	RSC_VDEV		= 3,
+	RSC_LAST		= 4,
+	RSC_VENDOR_START	= 128,
+	RSC_VENDOR_END		= 512,
 };
 
 #define FW_RSC_ADDR_ANY (-1)
@@ -339,6 +343,16 @@ struct rproc_mem_entry {
 
 struct firmware;
 
+/**
+ * enum rsc_handling_status - return status of rproc_ops handle_rsc hook
+ * @RSC_HANDLED:	resource was handled
+ * @RSC_IGNORED:	resource was ignored
+ */
+enum rsc_handling_status {
+	RSC_HANDLED	= 0,
+	RSC_IGNORED	= 1,
+};
+
 /**
  * struct rproc_ops - platform-specific device handlers
  * @start:	power on the device and boot it
@@ -346,6 +360,10 @@ struct firmware;
  * @kick:	kick a virtqueue (virtqueue id given as a parameter)
  * @da_to_va:	optional platform hook to perform address translations
  * @parse_fw:	parse firmware to extract information (e.g. resource table)
+ * @handle_rsc:	optional platform hook to handle vendor resources. Should return
+ * RSC_HANDLED if resource was handled, RSC_IGNORED if not handled and a
+ * negative value on error
+ * @load_rsc_table:	load resource table from firmware image
  * @find_loaded_rsc_table: find the loaded resouce table
  * @load:		load firmware to memory, where the remote processor
  *			expects to find it
@@ -358,6 +376,8 @@ struct rproc_ops {
 	void (*kick)(struct rproc *rproc, int vqid);
 	void * (*da_to_va)(struct rproc *rproc, u64 da, int len);
 	int (*parse_fw)(struct rproc *rproc, const struct firmware *fw);
+	int (*handle_rsc)(struct rproc *rproc, u32 rsc_type, void *rsc,
+			  int offset, int avail);
 	struct resource_table *(*find_loaded_rsc_table)(
 				struct rproc *rproc, const struct firmware *fw);
 	int (*load)(struct rproc *rproc, const struct firmware *fw);
-- 
cgit v1.2.3


From 360aa640a59f269b784848c0b2d6d462952750d9 Mon Sep 17 00:00:00 2001
From: Fabien Dessenne <fabien.dessenne@st.com>
Date: Thu, 7 Mar 2019 16:58:23 +0100
Subject: hwspinlock: add the 'in_atomic' API

Add the 'in_atomic' mode which can be called from an atomic context.
This mode relies on the existing 'raw' mode (no lock, no preemption/irq
disabling) with the difference that the timeout is not based on jiffies
(jiffies won't increase when irqs are disabled) but handled with
busy-waiting udelay() calls.

Signed-off-by: Fabien Dessenne <fabien.dessenne@st.com>
Signed-off-by: Bjorn Andersson <bjorn.andersson@linaro.org>
---
 Documentation/hwspinlock.txt         | 39 +++++++++++++++++++++++
 drivers/hwspinlock/hwspinlock_core.c | 43 +++++++++++++++++--------
 include/linux/hwspinlock.h           | 61 ++++++++++++++++++++++++++++++++++--
 3 files changed, 127 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/hwspinlock.txt b/Documentation/hwspinlock.txt
index c3403f9ae27a..6f03713b7003 100644
--- a/Documentation/hwspinlock.txt
+++ b/Documentation/hwspinlock.txt
@@ -151,6 +151,22 @@ notably -ETIMEDOUT if the hwspinlock is still busy after timeout msecs).
 
 The function will never sleep.
 
+::
+
+  int hwspin_lock_timeout_in_atomic(struct hwspinlock *hwlock, unsigned int to);
+
+Lock a previously-assigned hwspinlock with a timeout limit (specified in
+msecs). If the hwspinlock is already taken, the function will busy loop
+waiting for it to be released, but give up when the timeout elapses.
+
+This function shall be called only from an atomic context and the timeout
+value shall not exceed a few msecs.
+
+Returns 0 when successful and an appropriate error code otherwise (most
+notably -ETIMEDOUT if the hwspinlock is still busy after timeout msecs).
+
+The function will never sleep.
+
 ::
 
   int hwspin_trylock(struct hwspinlock *hwlock);
@@ -216,6 +232,19 @@ Returns 0 on success and an appropriate error code otherwise (most
 notably -EBUSY if the hwspinlock was already taken).
 The function will never sleep.
 
+::
+
+  int hwspin_trylock_in_atomic(struct hwspinlock *hwlock);
+
+Attempt to lock a previously-assigned hwspinlock, but immediately fail if
+it is already taken.
+
+This function shall be called only from an atomic context.
+
+Returns 0 on success and an appropriate error code otherwise (most
+notably -EBUSY if the hwspinlock was already taken).
+The function will never sleep.
+
 ::
 
   void hwspin_unlock(struct hwspinlock *hwlock);
@@ -262,6 +291,16 @@ The caller should **never** unlock an hwspinlock which is already unlocked.
 Doing so is considered a bug (there is no protection against this).
 This function will never sleep.
 
+::
+
+  void hwspin_unlock_in_atomic(struct hwspinlock *hwlock);
+
+Unlock a previously-locked hwspinlock.
+
+The caller should **never** unlock an hwspinlock which is already unlocked.
+Doing so is considered a bug (there is no protection against this).
+This function will never sleep.
+
 ::
 
   int hwspin_lock_get_id(struct hwspinlock *hwlock);
diff --git a/drivers/hwspinlock/hwspinlock_core.c b/drivers/hwspinlock/hwspinlock_core.c
index d806307f19c2..8862445aa858 100644
--- a/drivers/hwspinlock/hwspinlock_core.c
+++ b/drivers/hwspinlock/hwspinlock_core.c
@@ -9,6 +9,7 @@
 
 #define pr_fmt(fmt)    "%s: " fmt, __func__
 
+#include <linux/delay.h>
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/spinlock.h>
@@ -23,6 +24,9 @@
 
 #include "hwspinlock_internal.h"
 
+/* retry delay used in atomic context */
+#define HWSPINLOCK_RETRY_DELAY_US	100
+
 /* radix tree tags */
 #define HWSPINLOCK_UNUSED	(0) /* tags an hwspinlock as unused */
 
@@ -68,11 +72,11 @@ static DEFINE_MUTEX(hwspinlock_tree_lock);
  * user need some time-consuming or sleepable operations under the hardware
  * lock, they need one sleepable lock (like mutex) to protect the operations.
  *
- * If the mode is not HWLOCK_RAW, upon a successful return from this function,
- * preemption (and possibly interrupts) is disabled, so the caller must not
- * sleep, and is advised to release the hwspinlock as soon as possible. This is
- * required in order to minimize remote cores polling on the hardware
- * interconnect.
+ * If the mode is neither HWLOCK_IN_ATOMIC nor HWLOCK_RAW, upon a successful
+ * return from this function, preemption (and possibly interrupts) is disabled,
+ * so the caller must not sleep, and is advised to release the hwspinlock as
+ * soon as possible. This is required in order to minimize remote cores polling
+ * on the hardware interconnect.
  *
  * The user decides whether local interrupts are disabled or not, and if yes,
  * whether he wants their previous state to be saved. It is up to the user
@@ -112,6 +116,7 @@ int __hwspin_trylock(struct hwspinlock *hwlock, int mode, unsigned long *flags)
 		ret = spin_trylock_irq(&hwlock->lock);
 		break;
 	case HWLOCK_RAW:
+	case HWLOCK_IN_ATOMIC:
 		ret = 1;
 		break;
 	default:
@@ -136,6 +141,7 @@ int __hwspin_trylock(struct hwspinlock *hwlock, int mode, unsigned long *flags)
 			spin_unlock_irq(&hwlock->lock);
 			break;
 		case HWLOCK_RAW:
+		case HWLOCK_IN_ATOMIC:
 			/* Nothing to do */
 			break;
 		default:
@@ -179,11 +185,14 @@ EXPORT_SYMBOL_GPL(__hwspin_trylock);
  * user need some time-consuming or sleepable operations under the hardware
  * lock, they need one sleepable lock (like mutex) to protect the operations.
  *
- * If the mode is not HWLOCK_RAW, upon a successful return from this function,
- * preemption is disabled (and possibly local interrupts, too), so the caller
- * must not sleep, and is advised to release the hwspinlock as soon as possible.
- * This is required in order to minimize remote cores polling on the
- * hardware interconnect.
+ * If the mode is HWLOCK_IN_ATOMIC (called from an atomic context) the timeout
+ * is handled with busy-waiting delays, hence shall not exceed few msecs.
+ *
+ * If the mode is neither HWLOCK_IN_ATOMIC nor HWLOCK_RAW, upon a successful
+ * return from this function, preemption (and possibly interrupts) is disabled,
+ * so the caller must not sleep, and is advised to release the hwspinlock as
+ * soon as possible. This is required in order to minimize remote cores polling
+ * on the hardware interconnect.
  *
  * The user decides whether local interrupts are disabled or not, and if yes,
  * whether he wants their previous state to be saved. It is up to the user
@@ -198,7 +207,7 @@ int __hwspin_lock_timeout(struct hwspinlock *hwlock, unsigned int to,
 					int mode, unsigned long *flags)
 {
 	int ret;
-	unsigned long expire;
+	unsigned long expire, atomic_delay = 0;
 
 	expire = msecs_to_jiffies(to) + jiffies;
 
@@ -212,8 +221,15 @@ int __hwspin_lock_timeout(struct hwspinlock *hwlock, unsigned int to,
 		 * The lock is already taken, let's check if the user wants
 		 * us to try again
 		 */
-		if (time_is_before_eq_jiffies(expire))
-			return -ETIMEDOUT;
+		if (mode == HWLOCK_IN_ATOMIC) {
+			udelay(HWSPINLOCK_RETRY_DELAY_US);
+			atomic_delay += HWSPINLOCK_RETRY_DELAY_US;
+			if (atomic_delay > to * 1000)
+				return -ETIMEDOUT;
+		} else {
+			if (time_is_before_eq_jiffies(expire))
+				return -ETIMEDOUT;
+		}
 
 		/*
 		 * Allow platform-specific relax handlers to prevent
@@ -276,6 +292,7 @@ void __hwspin_unlock(struct hwspinlock *hwlock, int mode, unsigned long *flags)
 		spin_unlock_irq(&hwlock->lock);
 		break;
 	case HWLOCK_RAW:
+	case HWLOCK_IN_ATOMIC:
 		/* Nothing to do */
 		break;
 	default:
diff --git a/include/linux/hwspinlock.h b/include/linux/hwspinlock.h
index 0afe693be5f4..bfe7c1f1ac6d 100644
--- a/include/linux/hwspinlock.h
+++ b/include/linux/hwspinlock.h
@@ -14,9 +14,10 @@
 #include <linux/sched.h>
 
 /* hwspinlock mode argument */
-#define HWLOCK_IRQSTATE	0x01	/* Disable interrupts, save state */
-#define HWLOCK_IRQ	0x02	/* Disable interrupts, don't save state */
-#define HWLOCK_RAW	0x03
+#define HWLOCK_IRQSTATE		0x01 /* Disable interrupts, save state */
+#define HWLOCK_IRQ		0x02 /* Disable interrupts, don't save state */
+#define HWLOCK_RAW		0x03
+#define HWLOCK_IN_ATOMIC	0x04 /* Called while in atomic context */
 
 struct device;
 struct device_node;
@@ -222,6 +223,23 @@ static inline int hwspin_trylock_raw(struct hwspinlock *hwlock)
 	return __hwspin_trylock(hwlock, HWLOCK_RAW, NULL);
 }
 
+/**
+ * hwspin_trylock_in_atomic() - attempt to lock a specific hwspinlock
+ * @hwlock: an hwspinlock which we want to trylock
+ *
+ * This function attempts to lock an hwspinlock, and will immediately fail
+ * if the hwspinlock is already taken.
+ *
+ * This function shall be called only from an atomic context.
+ *
+ * Returns 0 if we successfully locked the hwspinlock, -EBUSY if
+ * the hwspinlock was already taken, and -EINVAL if @hwlock is invalid.
+ */
+static inline int hwspin_trylock_in_atomic(struct hwspinlock *hwlock)
+{
+	return __hwspin_trylock(hwlock, HWLOCK_IN_ATOMIC, NULL);
+}
+
 /**
  * hwspin_trylock() - attempt to lock a specific hwspinlock
  * @hwlock: an hwspinlock which we want to trylock
@@ -312,6 +330,28 @@ int hwspin_lock_timeout_raw(struct hwspinlock *hwlock, unsigned int to)
 	return __hwspin_lock_timeout(hwlock, to, HWLOCK_RAW, NULL);
 }
 
+/**
+ * hwspin_lock_timeout_in_atomic() - lock an hwspinlock with timeout limit
+ * @hwlock: the hwspinlock to be locked
+ * @to: timeout value in msecs
+ *
+ * This function locks the underlying @hwlock. If the @hwlock
+ * is already taken, the function will busy loop waiting for it to
+ * be released, but give up when @timeout msecs have elapsed.
+ *
+ * This function shall be called only from an atomic context and the timeout
+ * value shall not exceed a few msecs.
+ *
+ * Returns 0 when the @hwlock was successfully taken, and an appropriate
+ * error code otherwise (most notably an -ETIMEDOUT if the @hwlock is still
+ * busy after @timeout msecs). The function will never sleep.
+ */
+static inline
+int hwspin_lock_timeout_in_atomic(struct hwspinlock *hwlock, unsigned int to)
+{
+	return __hwspin_lock_timeout(hwlock, to, HWLOCK_IN_ATOMIC, NULL);
+}
+
 /**
  * hwspin_lock_timeout() - lock an hwspinlock with timeout limit
  * @hwlock: the hwspinlock to be locked
@@ -386,6 +426,21 @@ static inline void hwspin_unlock_raw(struct hwspinlock *hwlock)
 	__hwspin_unlock(hwlock, HWLOCK_RAW, NULL);
 }
 
+/**
+ * hwspin_unlock_in_atomic() - unlock hwspinlock
+ * @hwlock: a previously-acquired hwspinlock which we want to unlock
+ *
+ * This function will unlock a specific hwspinlock.
+ *
+ * @hwlock must be already locked (e.g. by hwspin_trylock()) before calling
+ * this function: it is a bug to call unlock on a @hwlock that is already
+ * unlocked.
+ */
+static inline void hwspin_unlock_in_atomic(struct hwspinlock *hwlock)
+{
+	__hwspin_unlock(hwlock, HWLOCK_IN_ATOMIC, NULL);
+}
+
 /**
  * hwspin_unlock() - unlock hwspinlock
  * @hwlock: a previously-acquired hwspinlock which we want to unlock
-- 
cgit v1.2.3


From 2aeac95d1a4cc85aae57ab842d5c3340df0f817f Mon Sep 17 00:00:00 2001
From: Srinivas Kandagatla <srinivas.kandagatla@linaro.org>
Date: Tue, 11 Jun 2019 11:40:41 +0100
Subject: soundwire: add module_sdw_driver helper macro

This helper macro is for SoundWire drivers which do not do anything special in
module init/exit. This eliminates a lot of boilerplate. Each module may only
use this macro once, and calling it replaces module_init() and module_exit().

Signed-off-by: Srinivas Kandagatla <srinivas.kandagatla@linaro.org>
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 include/linux/soundwire/sdw_type.h | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/soundwire/sdw_type.h b/include/linux/soundwire/sdw_type.h
index 9c756b5a0dfe..aaa7f4267c14 100644
--- a/include/linux/soundwire/sdw_type.h
+++ b/include/linux/soundwire/sdw_type.h
@@ -16,4 +16,15 @@ void sdw_unregister_driver(struct sdw_driver *drv);
 
 int sdw_slave_modalias(const struct sdw_slave *slave, char *buf, size_t size);
 
+/**
+ * module_sdw_driver() - Helper macro for registering a Soundwire driver
+ * @__sdw_driver: soundwire slave driver struct
+ *
+ * Helper macro for Soundwire drivers which do not do anything special in
+ * module init/exit. This eliminates a lot of boilerplate. Each module may only
+ * use this macro once, and calling it replaces module_init() and module_exit()
+ */
+#define module_sdw_driver(__sdw_driver) \
+	module_driver(__sdw_driver, sdw_register_driver, \
+			sdw_unregister_driver)
 #endif /* __SOUNDWIRE_TYPES_H */
-- 
cgit v1.2.3


From 8c3166e17cf10161d2871dfb1d017287c7b79ff1 Mon Sep 17 00:00:00 2001
From: Evan Green <evgreen@chromium.org>
Date: Thu, 27 Jun 2019 13:44:45 -0700
Subject: mfd / platform: cros_ec_debugfs: Expose resume result via debugfs

For ECs that support it, the EC returns the number of slp_s0
transitions and whether or not there was a timeout in the resume
response. Expose the last resume result to usermode via debugfs so
that usermode can detect and report S0ix timeouts.

Signed-off-by: Evan Green <evgreen@chromium.org>
Acked-by: Lee Jones <lee.jones@linaro.org>
Signed-off-by: Enric Balletbo i Serra <enric.balletbo@collabora.com>
---
 Documentation/ABI/testing/debugfs-cros-ec | 22 ++++++++++++++++++++++
 drivers/mfd/cros_ec.c                     |  6 +++++-
 drivers/platform/chrome/cros_ec_debugfs.c |  3 +++
 include/linux/mfd/cros_ec.h               |  1 +
 4 files changed, 31 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/Documentation/ABI/testing/debugfs-cros-ec b/Documentation/ABI/testing/debugfs-cros-ec
index 573a82d23c89..1fe0add99a2a 100644
--- a/Documentation/ABI/testing/debugfs-cros-ec
+++ b/Documentation/ABI/testing/debugfs-cros-ec
@@ -32,3 +32,25 @@ Description:
 		is used for synchronizing the AP host time with the EC
 		log. An error is returned if the command is not supported
 		by the EC or there is a communication problem.
+
+What:		/sys/kernel/debug/<cros-ec-device>/last_resume_result
+Date:		June 2019
+KernelVersion:	5.3
+Description:
+		Some ECs have a feature where they will track transitions to
+		the (Intel) processor's SLP_S0 line, in order to detect cases
+		where a system failed to go into S0ix. When the system resumes,
+		an EC with this feature will return a summary of SLP_S0
+		transitions that occurred. The last_resume_result file returns
+		the most recent response from the AP's resume message to the EC.
+
+		The bottom 31 bits contain a count of the number of SLP_S0
+		transitions that occurred since the suspend message was
+		received. Bit 31 is set if the EC attempted to wake the
+		system due to a timeout when watching for SLP_S0 transitions.
+		Callers can use this to detect a wake from the EC due to
+		S0ix timeouts. The result will be zero if no suspend
+		transitions have been attempted, or the EC does not support
+		this feature.
+
+		Output will be in the format: "0x%08x\n".
diff --git a/drivers/mfd/cros_ec.c b/drivers/mfd/cros_ec.c
index bd2bcdd4718b..64a2d3adc729 100644
--- a/drivers/mfd/cros_ec.c
+++ b/drivers/mfd/cros_ec.c
@@ -110,12 +110,16 @@ static int cros_ec_sleep_event(struct cros_ec_device *ec_dev, u8 sleep_event)
 
 	/* For now, report failure to transition to S0ix with a warning. */
 	if (ret >= 0 && ec_dev->host_sleep_v1 &&
-	    (sleep_event == HOST_SLEEP_EVENT_S0IX_RESUME))
+	    (sleep_event == HOST_SLEEP_EVENT_S0IX_RESUME)) {
+		ec_dev->last_resume_result =
+			buf.u.resp1.resume_response.sleep_transitions;
+
 		WARN_ONCE(buf.u.resp1.resume_response.sleep_transitions &
 			  EC_HOST_RESUME_SLEEP_TIMEOUT,
 			  "EC detected sleep transition timeout. Total slp_s0 transitions: %d",
 			  buf.u.resp1.resume_response.sleep_transitions &
 			  EC_HOST_RESUME_SLEEP_TRANSITIONS_MASK);
+	}
 
 	return ret;
 }
diff --git a/drivers/platform/chrome/cros_ec_debugfs.c b/drivers/platform/chrome/cros_ec_debugfs.c
index 7ee060743844..8ec1cc2889f2 100644
--- a/drivers/platform/chrome/cros_ec_debugfs.c
+++ b/drivers/platform/chrome/cros_ec_debugfs.c
@@ -447,6 +447,9 @@ static int cros_ec_debugfs_probe(struct platform_device *pd)
 	debugfs_create_file("uptime", 0444, debug_info->dir, debug_info,
 			    &cros_ec_uptime_fops);
 
+	debugfs_create_x32("last_resume_result", 0444, debug_info->dir,
+			   &ec->ec_dev->last_resume_result);
+
 	ec->debug_info = debug_info;
 
 	dev_set_drvdata(&pd->dev, ec);
diff --git a/include/linux/mfd/cros_ec.h b/include/linux/mfd/cros_ec.h
index cfa78bb4990f..d50ade418a83 100644
--- a/include/linux/mfd/cros_ec.h
+++ b/include/linux/mfd/cros_ec.h
@@ -163,6 +163,7 @@ struct cros_ec_device {
 	struct ec_response_get_next_event_v1 event_data;
 	int event_size;
 	u32 host_event_wake_mask;
+	u32 last_resume_result;
 };
 
 /**
-- 
cgit v1.2.3


From 79d08f89bb1b5c2c1ff90d9bb95497ab9e8aa7e0 Mon Sep 17 00:00:00 2001
From: Ming Lei <ming.lei@redhat.com>
Date: Mon, 1 Jul 2019 15:14:46 +0800
Subject: block: fix .bi_size overflow

'bio->bi_iter.bi_size' is 'unsigned int', which can hold at most 4G - 1
bytes.

Before 07173c3ec276 ("block: enable multipage bvecs"), one bio could
include only a very limited number of pages, usually at most 256, so the
fs bio size wouldn't be bigger than 1M bytes most of the time.

Since we support multi-page bvecs, in theory more than 1M pages can
really be added to one fs bio, especially in case of hugepages, or big
writeback with too many dirty pages. Then there is a chance that
.bi_size overflows.

Fix this issue by using bio_full() to check if the added segment may
overflow .bi_size.

Cc: Liu Yiding <liuyd.fnst@cn.fujitsu.com>
Cc: kernel test robot <rong.a.chen@intel.com>
Cc: "Darrick J. Wong" <darrick.wong@oracle.com>
Cc: linux-xfs@vger.kernel.org
Cc: linux-fsdevel@vger.kernel.org
Cc: stable@vger.kernel.org
Fixes: 07173c3ec276 ("block: enable multipage bvecs")
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/bio.c         | 10 +++++-----
 fs/iomap.c          |  2 +-
 fs/xfs/xfs_aops.c   |  2 +-
 include/linux/bio.h | 18 ++++++++++++++++--
 4 files changed, 23 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/block/bio.c b/block/bio.c
index 933c1e36643b..29cd6cf4da51 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -723,7 +723,7 @@ static int __bio_add_pc_page(struct request_queue *q, struct bio *bio,
 		}
 	}
 
-	if (bio_full(bio))
+	if (bio_full(bio, len))
 		return 0;
 
 	if (bio->bi_vcnt >= queue_max_segments(q))
@@ -797,7 +797,7 @@ void __bio_add_page(struct bio *bio, struct page *page,
 	struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt];
 
 	WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED));
-	WARN_ON_ONCE(bio_full(bio));
+	WARN_ON_ONCE(bio_full(bio, len));
 
 	bv->bv_page = page;
 	bv->bv_offset = off;
@@ -824,7 +824,7 @@ int bio_add_page(struct bio *bio, struct page *page,
 	bool same_page = false;
 
 	if (!__bio_try_merge_page(bio, page, len, offset, &same_page)) {
-		if (bio_full(bio))
+		if (bio_full(bio, len))
 			return 0;
 		__bio_add_page(bio, page, len, offset);
 	}
@@ -909,7 +909,7 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
 			if (same_page)
 				put_page(page);
 		} else {
-			if (WARN_ON_ONCE(bio_full(bio)))
+			if (WARN_ON_ONCE(bio_full(bio, len)))
                                 return -EINVAL;
 			__bio_add_page(bio, page, len, offset);
 		}
@@ -953,7 +953,7 @@ int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
 			ret = __bio_iov_bvec_add_pages(bio, iter);
 		else
 			ret = __bio_iov_iter_get_pages(bio, iter);
-	} while (!ret && iov_iter_count(iter) && !bio_full(bio));
+	} while (!ret && iov_iter_count(iter) && !bio_full(bio, 0));
 
 	if (is_bvec)
 		bio_set_flag(bio, BIO_NO_PAGE_REF);
diff --git a/fs/iomap.c b/fs/iomap.c
index 4f94788db43b..7a147aa0c4d9 100644
--- a/fs/iomap.c
+++ b/fs/iomap.c
@@ -333,7 +333,7 @@ iomap_readpage_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
 	if (iop)
 		atomic_inc(&iop->read_count);
 
-	if (!ctx->bio || !is_contig || bio_full(ctx->bio)) {
+	if (!ctx->bio || !is_contig || bio_full(ctx->bio, plen)) {
 		gfp_t gfp = mapping_gfp_constraint(page->mapping, GFP_KERNEL);
 		int nr_vecs = (length + PAGE_SIZE - 1) >> PAGE_SHIFT;
 
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 8da5e6637771..11f703d4a605 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -782,7 +782,7 @@ xfs_add_to_ioend(
 		atomic_inc(&iop->write_count);
 
 	if (!merged) {
-		if (bio_full(wpc->ioend->io_bio))
+		if (bio_full(wpc->ioend->io_bio, len))
 			xfs_chain_bio(wpc->ioend, wbc, bdev, sector);
 		bio_add_page(wpc->ioend->io_bio, page, len, poff);
 	}
diff --git a/include/linux/bio.h b/include/linux/bio.h
index dc630b05e6e5..3cdb84cdc488 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -102,9 +102,23 @@ static inline void *bio_data(struct bio *bio)
 	return NULL;
 }
 
-static inline bool bio_full(struct bio *bio)
+/**
+ * bio_full - check if the bio is full
+ * @bio:	bio to check
+ * @len:	length of one segment to be added
+ *
+ * Return true if @bio is full and one segment with @len bytes can't be
+ * added to the bio, otherwise return false
+ */
+static inline bool bio_full(struct bio *bio, unsigned len)
 {
-	return bio->bi_vcnt >= bio->bi_max_vecs;
+	if (bio->bi_vcnt >= bio->bi_max_vecs)
+		return true;
+
+	if (bio->bi_iter.bi_size > UINT_MAX - len)
+		return true;
+
+	return false;
 }
 
 static inline bool bio_next_segment(const struct bio *bio,
-- 
cgit v1.2.3


From 5aca284210ce827f780ea2f4f9c6ab8d6e2d6648 Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong" <darrick.wong@oracle.com>
Date: Mon, 1 Jul 2019 08:25:34 -0700
Subject: vfs: create a generic checking and prep function for FS_IOC_SETFLAGS

Create a generic function to check incoming FS_IOC_SETFLAGS flag values
and later prepare the inode for updates so that we can standardize the
implementations that follow ext4's flag values.

Note that the efivarfs implementation no longer fails a no-op SETFLAGS
without CAP_LINUX_IMMUTABLE since that's the behavior in ext*.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Acked-by: David Sterba <dsterba@suse.com>
Reviewed-by: Bob Peterson <rpeterso@redhat.com>
---
 fs/btrfs/ioctl.c    | 13 +++++--------
 fs/efivarfs/file.c  | 26 +++++++++++++++++---------
 fs/ext2/ioctl.c     | 16 ++++------------
 fs/ext4/ioctl.c     | 13 +++----------
 fs/gfs2/file.c      | 42 +++++++++++++++++++++++++++++-------------
 fs/hfsplus/ioctl.c  | 21 ++++++++++++---------
 fs/inode.c          | 24 ++++++++++++++++++++++++
 fs/jfs/ioctl.c      | 22 +++++++---------------
 fs/nilfs2/ioctl.c   |  9 ++-------
 fs/ocfs2/ioctl.c    | 13 +++----------
 fs/orangefs/file.c  | 37 ++++++++++++++++++++++++++++---------
 fs/reiserfs/ioctl.c | 10 ++++------
 fs/ubifs/ioctl.c    | 13 +++----------
 include/linux/fs.h  |  3 +++
 14 files changed, 144 insertions(+), 118 deletions(-)

(limited to 'include/linux')

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 6dafa857bbb9..d3d9b4abb09b 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -187,7 +187,7 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
 	struct btrfs_inode *binode = BTRFS_I(inode);
 	struct btrfs_root *root = binode->root;
 	struct btrfs_trans_handle *trans;
-	unsigned int fsflags;
+	unsigned int fsflags, old_fsflags;
 	int ret;
 	const char *comp = NULL;
 	u32 binode_flags = binode->flags;
@@ -212,13 +212,10 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
 	inode_lock(inode);
 
 	fsflags = btrfs_mask_fsflags_for_type(inode, fsflags);
-	if ((fsflags ^ btrfs_inode_flags_to_fsflags(binode->flags)) &
-	    (FS_APPEND_FL | FS_IMMUTABLE_FL)) {
-		if (!capable(CAP_LINUX_IMMUTABLE)) {
-			ret = -EPERM;
-			goto out_unlock;
-		}
-	}
+	old_fsflags = btrfs_inode_flags_to_fsflags(binode->flags);
+	ret = vfs_ioc_setflags_prepare(inode, old_fsflags, fsflags);
+	if (ret)
+		goto out_unlock;
 
 	if (fsflags & FS_SYNC_FL)
 		binode_flags |= BTRFS_INODE_SYNC;
diff --git a/fs/efivarfs/file.c b/fs/efivarfs/file.c
index 8e568428c88b..a3cc10b1bfe1 100644
--- a/fs/efivarfs/file.c
+++ b/fs/efivarfs/file.c
@@ -110,16 +110,22 @@ out_free:
 	return size;
 }
 
-static int
-efivarfs_ioc_getxflags(struct file *file, void __user *arg)
+static inline unsigned int efivarfs_getflags(struct inode *inode)
 {
-	struct inode *inode = file->f_mapping->host;
 	unsigned int i_flags;
 	unsigned int flags = 0;
 
 	i_flags = inode->i_flags;
 	if (i_flags & S_IMMUTABLE)
 		flags |= FS_IMMUTABLE_FL;
+	return flags;
+}
+
+static int
+efivarfs_ioc_getxflags(struct file *file, void __user *arg)
+{
+	struct inode *inode = file->f_mapping->host;
+	unsigned int flags = efivarfs_getflags(inode);
 
 	if (copy_to_user(arg, &flags, sizeof(flags)))
 		return -EFAULT;
@@ -132,6 +138,7 @@ efivarfs_ioc_setxflags(struct file *file, void __user *arg)
 	struct inode *inode = file->f_mapping->host;
 	unsigned int flags;
 	unsigned int i_flags = 0;
+	unsigned int oldflags = efivarfs_getflags(inode);
 	int error;
 
 	if (!inode_owner_or_capable(inode))
@@ -143,9 +150,6 @@ efivarfs_ioc_setxflags(struct file *file, void __user *arg)
 	if (flags & ~FS_IMMUTABLE_FL)
 		return -EOPNOTSUPP;
 
-	if (!capable(CAP_LINUX_IMMUTABLE))
-		return -EPERM;
-
 	if (flags & FS_IMMUTABLE_FL)
 		i_flags |= S_IMMUTABLE;
 
@@ -155,12 +159,16 @@ efivarfs_ioc_setxflags(struct file *file, void __user *arg)
 		return error;
 
 	inode_lock(inode);
+
+	error = vfs_ioc_setflags_prepare(inode, oldflags, flags);
+	if (error)
+		goto out;
+
 	inode_set_flags(inode, i_flags, S_IMMUTABLE);
+out:
 	inode_unlock(inode);
-
 	mnt_drop_write_file(file);
-
-	return 0;
+	return error;
 }
 
 static long
diff --git a/fs/ext2/ioctl.c b/fs/ext2/ioctl.c
index 0367c0039e68..1b853fb0b163 100644
--- a/fs/ext2/ioctl.c
+++ b/fs/ext2/ioctl.c
@@ -60,18 +60,10 @@ long ext2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 		}
 		oldflags = ei->i_flags;
 
-		/*
-		 * The IMMUTABLE and APPEND_ONLY flags can only be changed by
-		 * the relevant capability.
-		 *
-		 * This test looks nicer. Thanks to Pauline Middelink
-		 */
-		if ((flags ^ oldflags) & (EXT2_APPEND_FL | EXT2_IMMUTABLE_FL)) {
-			if (!capable(CAP_LINUX_IMMUTABLE)) {
-				inode_unlock(inode);
-				ret = -EPERM;
-				goto setflags_out;
-			}
+		ret = vfs_ioc_setflags_prepare(inode, oldflags, flags);
+		if (ret) {
+			inode_unlock(inode);
+			goto setflags_out;
 		}
 
 		flags = flags & EXT2_FL_USER_MODIFIABLE;
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index e486e49b31ed..272b6e44191b 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -289,16 +289,9 @@ static int ext4_ioctl_setflags(struct inode *inode,
 	/* The JOURNAL_DATA flag is modifiable only by root */
 	jflag = flags & EXT4_JOURNAL_DATA_FL;
 
-	/*
-	 * The IMMUTABLE and APPEND_ONLY flags can only be changed by
-	 * the relevant capability.
-	 *
-	 * This test looks nicer. Thanks to Pauline Middelink
-	 */
-	if ((flags ^ oldflags) & (EXT4_APPEND_FL | EXT4_IMMUTABLE_FL)) {
-		if (!capable(CAP_LINUX_IMMUTABLE))
-			goto flags_out;
-	}
+	err = vfs_ioc_setflags_prepare(inode, oldflags, flags);
+	if (err)
+		goto flags_out;
 
 	/*
 	 * The JOURNAL_DATA flag can only be changed by
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index d174b1f8fd08..1cb0c3afd3dc 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -136,27 +136,36 @@ static struct {
 	{FS_JOURNAL_DATA_FL, GFS2_DIF_JDATA | GFS2_DIF_INHERIT_JDATA},
 };
 
+static inline u32 gfs2_gfsflags_to_fsflags(struct inode *inode, u32 gfsflags)
+{
+	int i;
+	u32 fsflags = 0;
+
+	if (S_ISDIR(inode->i_mode))
+		gfsflags &= ~GFS2_DIF_JDATA;
+	else
+		gfsflags &= ~GFS2_DIF_INHERIT_JDATA;
+
+	for (i = 0; i < ARRAY_SIZE(fsflag_gfs2flag); i++)
+		if (gfsflags & fsflag_gfs2flag[i].gfsflag)
+			fsflags |= fsflag_gfs2flag[i].fsflag;
+	return fsflags;
+}
+
 static int gfs2_get_flags(struct file *filp, u32 __user *ptr)
 {
 	struct inode *inode = file_inode(filp);
 	struct gfs2_inode *ip = GFS2_I(inode);
 	struct gfs2_holder gh;
-	int i, error;
-	u32 gfsflags, fsflags = 0;
+	int error;
+	u32 fsflags;
 
 	gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &gh);
 	error = gfs2_glock_nq(&gh);
 	if (error)
 		goto out_uninit;
 
-	gfsflags = ip->i_diskflags;
-	if (S_ISDIR(inode->i_mode))
-		gfsflags &= ~GFS2_DIF_JDATA;
-	else
-		gfsflags &= ~GFS2_DIF_INHERIT_JDATA;
-	for (i = 0; i < ARRAY_SIZE(fsflag_gfs2flag); i++)
-		if (gfsflags & fsflag_gfs2flag[i].gfsflag)
-			fsflags |= fsflag_gfs2flag[i].fsflag;
+	fsflags = gfs2_gfsflags_to_fsflags(inode, ip->i_diskflags);
 
 	if (put_user(fsflags, ptr))
 		error = -EFAULT;
@@ -200,9 +209,11 @@ void gfs2_set_inode_flags(struct inode *inode)
  * @filp: file pointer
  * @reqflags: The flags to set
  * @mask: Indicates which flags are valid
+ * @fsflags: The FS_* inode flags passed in
  *
  */
-static int do_gfs2_set_flags(struct file *filp, u32 reqflags, u32 mask)
+static int do_gfs2_set_flags(struct file *filp, u32 reqflags, u32 mask,
+			     const u32 fsflags)
 {
 	struct inode *inode = file_inode(filp);
 	struct gfs2_inode *ip = GFS2_I(inode);
@@ -210,7 +221,7 @@ static int do_gfs2_set_flags(struct file *filp, u32 reqflags, u32 mask)
 	struct buffer_head *bh;
 	struct gfs2_holder gh;
 	int error;
-	u32 new_flags, flags;
+	u32 new_flags, flags, oldflags;
 
 	error = mnt_want_write_file(filp);
 	if (error)
@@ -220,6 +231,11 @@ static int do_gfs2_set_flags(struct file *filp, u32 reqflags, u32 mask)
 	if (error)
 		goto out_drop_write;
 
+	oldflags = gfs2_gfsflags_to_fsflags(inode, ip->i_diskflags);
+	error = vfs_ioc_setflags_prepare(inode, oldflags, fsflags);
+	if (error)
+		goto out;
+
 	error = -EACCES;
 	if (!inode_owner_or_capable(inode))
 		goto out;
@@ -308,7 +324,7 @@ static int gfs2_set_flags(struct file *filp, u32 __user *ptr)
 		mask &= ~(GFS2_DIF_TOPDIR | GFS2_DIF_INHERIT_JDATA);
 	}
 
-	return do_gfs2_set_flags(filp, gfsflags, mask);
+	return do_gfs2_set_flags(filp, gfsflags, mask, fsflags);
 }
 
 static int gfs2_getlabel(struct file *filp, char __user *label)
diff --git a/fs/hfsplus/ioctl.c b/fs/hfsplus/ioctl.c
index 5e6502ef7415..ce15b9496b77 100644
--- a/fs/hfsplus/ioctl.c
+++ b/fs/hfsplus/ioctl.c
@@ -57,9 +57,8 @@ static int hfsplus_ioctl_bless(struct file *file, int __user *user_flags)
 	return 0;
 }
 
-static int hfsplus_ioctl_getflags(struct file *file, int __user *user_flags)
+static inline unsigned int hfsplus_getflags(struct inode *inode)
 {
-	struct inode *inode = file_inode(file);
 	struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
 	unsigned int flags = 0;
 
@@ -69,6 +68,13 @@ static int hfsplus_ioctl_getflags(struct file *file, int __user *user_flags)
 		flags |= FS_APPEND_FL;
 	if (hip->userflags & HFSPLUS_FLG_NODUMP)
 		flags |= FS_NODUMP_FL;
+	return flags;
+}
+
+static int hfsplus_ioctl_getflags(struct file *file, int __user *user_flags)
+{
+	struct inode *inode = file_inode(file);
+	unsigned int flags = hfsplus_getflags(inode);
 
 	return put_user(flags, user_flags);
 }
@@ -78,6 +84,7 @@ static int hfsplus_ioctl_setflags(struct file *file, int __user *user_flags)
 	struct inode *inode = file_inode(file);
 	struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
 	unsigned int flags, new_fl = 0;
+	unsigned int oldflags = hfsplus_getflags(inode);
 	int err = 0;
 
 	err = mnt_want_write_file(file);
@@ -96,13 +103,9 @@ static int hfsplus_ioctl_setflags(struct file *file, int __user *user_flags)
 
 	inode_lock(inode);
 
-	if ((flags & (FS_IMMUTABLE_FL|FS_APPEND_FL)) ||
-	    inode->i_flags & (S_IMMUTABLE|S_APPEND)) {
-		if (!capable(CAP_LINUX_IMMUTABLE)) {
-			err = -EPERM;
-			goto out_unlock_inode;
-		}
-	}
+	err = vfs_ioc_setflags_prepare(inode, oldflags, flags);
+	if (err)
+		goto out_unlock_inode;
 
 	/* don't silently ignore unsupported ext2 flags */
 	if (flags & ~(FS_IMMUTABLE_FL|FS_APPEND_FL|FS_NODUMP_FL)) {
diff --git a/fs/inode.c b/fs/inode.c
index df6542ec3b88..8072a09fd0b9 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -2170,3 +2170,27 @@ struct timespec64 current_time(struct inode *inode)
 	return timespec64_trunc(now, inode->i_sb->s_time_gran);
 }
 EXPORT_SYMBOL(current_time);
+
+/*
+ * Generic function to check FS_IOC_SETFLAGS values and reject any invalid
+ * configurations.
+ *
+ * Note: the caller should be holding i_mutex, or else be sure that they have
+ * exclusive access to the inode structure.
+ */
+int vfs_ioc_setflags_prepare(struct inode *inode, unsigned int oldflags,
+			     unsigned int flags)
+{
+	/*
+	 * The IMMUTABLE and APPEND_ONLY flags can only be changed by
+	 * the relevant capability.
+	 *
+	 * This test looks nicer. Thanks to Pauline Middelink
+	 */
+	if ((flags ^ oldflags) & (FS_APPEND_FL | FS_IMMUTABLE_FL) &&
+	    !capable(CAP_LINUX_IMMUTABLE))
+		return -EPERM;
+
+	return 0;
+}
+EXPORT_SYMBOL(vfs_ioc_setflags_prepare);
diff --git a/fs/jfs/ioctl.c b/fs/jfs/ioctl.c
index ba34dae8bd9f..10ee0ecca1a8 100644
--- a/fs/jfs/ioctl.c
+++ b/fs/jfs/ioctl.c
@@ -98,24 +98,16 @@ long jfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 		/* Lock against other parallel changes of flags */
 		inode_lock(inode);
 
-		oldflags = jfs_inode->mode2;
-
-		/*
-		 * The IMMUTABLE and APPEND_ONLY flags can only be changed by
-		 * the relevant capability.
-		 */
-		if ((oldflags & JFS_IMMUTABLE_FL) ||
-			((flags ^ oldflags) &
-			(JFS_APPEND_FL | JFS_IMMUTABLE_FL))) {
-			if (!capable(CAP_LINUX_IMMUTABLE)) {
-				inode_unlock(inode);
-				err = -EPERM;
-				goto setflags_out;
-			}
+		oldflags = jfs_map_ext2(jfs_inode->mode2 & JFS_FL_USER_VISIBLE,
+					0);
+		err = vfs_ioc_setflags_prepare(inode, oldflags, flags);
+		if (err) {
+			inode_unlock(inode);
+			goto setflags_out;
 		}
 
 		flags = flags & JFS_FL_USER_MODIFIABLE;
-		flags |= oldflags & ~JFS_FL_USER_MODIFIABLE;
+		flags |= jfs_inode->mode2 & ~JFS_FL_USER_MODIFIABLE;
 		jfs_inode->mode2 = flags;
 
 		jfs_set_inode_flags(inode);
diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c
index 9b96d79eea6c..91b9dac6b2cc 100644
--- a/fs/nilfs2/ioctl.c
+++ b/fs/nilfs2/ioctl.c
@@ -148,13 +148,8 @@ static int nilfs_ioctl_setflags(struct inode *inode, struct file *filp,
 
 	oldflags = NILFS_I(inode)->i_flags;
 
-	/*
-	 * The IMMUTABLE and APPEND_ONLY flags can only be changed by the
-	 * relevant capability.
-	 */
-	ret = -EPERM;
-	if (((flags ^ oldflags) & (FS_APPEND_FL | FS_IMMUTABLE_FL)) &&
-	    !capable(CAP_LINUX_IMMUTABLE))
+	ret = vfs_ioc_setflags_prepare(inode, oldflags, flags);
+	if (ret)
 		goto out;
 
 	ret = nilfs_transaction_begin(inode->i_sb, &ti, 0);
diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c
index 994726ada857..d6f7b299eb23 100644
--- a/fs/ocfs2/ioctl.c
+++ b/fs/ocfs2/ioctl.c
@@ -106,16 +106,9 @@ static int ocfs2_set_inode_attr(struct inode *inode, unsigned flags,
 	flags = flags & mask;
 	flags |= oldflags & ~mask;
 
-	/*
-	 * The IMMUTABLE and APPEND_ONLY flags can only be changed by
-	 * the relevant capability.
-	 */
-	status = -EPERM;
-	if ((oldflags & OCFS2_IMMUTABLE_FL) || ((flags ^ oldflags) &
-		(OCFS2_APPEND_FL | OCFS2_IMMUTABLE_FL))) {
-		if (!capable(CAP_LINUX_IMMUTABLE))
-			goto bail_unlock;
-	}
+	status = vfs_ioc_setflags_prepare(inode, oldflags, flags);
+	if (status)
+		goto bail_unlock;
 
 	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
 	if (IS_ERR(handle)) {
diff --git a/fs/orangefs/file.c b/fs/orangefs/file.c
index a35c17017210..679a3c8e4fb3 100644
--- a/fs/orangefs/file.c
+++ b/fs/orangefs/file.c
@@ -357,11 +357,28 @@ static ssize_t orangefs_file_write_iter(struct kiocb *iocb,
 	return ret;
 }
 
+static int orangefs_getflags(struct inode *inode, unsigned long *uval)
+{
+	__u64 val = 0;
+	int ret;
+
+	ret = orangefs_inode_getxattr(inode,
+				      "user.pvfs2.meta_hint",
+				      &val, sizeof(val));
+	if (ret < 0 && ret != -ENODATA)
+		return ret;
+	else if (ret == -ENODATA)
+		val = 0;
+	*uval = val;
+	return 0;
+}
+
 /*
  * Perform a miscellaneous operation on a file.
  */
 static long orangefs_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 {
+	struct inode *inode = file_inode(file);
 	int ret = -ENOTTY;
 	__u64 val = 0;
 	unsigned long uval;
@@ -375,20 +392,16 @@ static long orangefs_ioctl(struct file *file, unsigned int cmd, unsigned long ar
 	 * and append flags
 	 */
 	if (cmd == FS_IOC_GETFLAGS) {
-		val = 0;
-		ret = orangefs_inode_getxattr(file_inode(file),
-					      "user.pvfs2.meta_hint",
-					      &val, sizeof(val));
-		if (ret < 0 && ret != -ENODATA)
+		ret = orangefs_getflags(inode, &uval);
+		if (ret)
 			return ret;
-		else if (ret == -ENODATA)
-			val = 0;
-		uval = val;
 		gossip_debug(GOSSIP_FILE_DEBUG,
 			     "orangefs_ioctl: FS_IOC_GETFLAGS: %llu\n",
 			     (unsigned long long)uval);
 		return put_user(uval, (int __user *)arg);
 	} else if (cmd == FS_IOC_SETFLAGS) {
+		unsigned long old_uval;
+
 		ret = 0;
 		if (get_user(uval, (int __user *)arg))
 			return -EFAULT;
@@ -404,11 +417,17 @@ static long orangefs_ioctl(struct file *file, unsigned int cmd, unsigned long ar
 			gossip_err("orangefs_ioctl: the FS_IOC_SETFLAGS only supports setting one of FS_IMMUTABLE_FL|FS_APPEND_FL|FS_NOATIME_FL\n");
 			return -EINVAL;
 		}
+		ret = orangefs_getflags(inode, &old_uval);
+		if (ret)
+			return ret;
+		ret = vfs_ioc_setflags_prepare(inode, old_uval, uval);
+		if (ret)
+			return ret;
 		val = uval;
 		gossip_debug(GOSSIP_FILE_DEBUG,
 			     "orangefs_ioctl: FS_IOC_SETFLAGS: %llu\n",
 			     (unsigned long long)val);
-		ret = orangefs_inode_setxattr(file_inode(file),
+		ret = orangefs_inode_setxattr(inode,
 					      "user.pvfs2.meta_hint",
 					      &val, sizeof(val), 0);
 	}
diff --git a/fs/reiserfs/ioctl.c b/fs/reiserfs/ioctl.c
index acbbaf7a0bb2..45e1a5d11af3 100644
--- a/fs/reiserfs/ioctl.c
+++ b/fs/reiserfs/ioctl.c
@@ -74,13 +74,11 @@ long reiserfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 				err = -EPERM;
 				goto setflags_out;
 			}
-			if (((flags ^ REISERFS_I(inode)->
-			      i_attrs) & (REISERFS_IMMUTABLE_FL |
-					  REISERFS_APPEND_FL))
-			    && !capable(CAP_LINUX_IMMUTABLE)) {
-				err = -EPERM;
+			err = vfs_ioc_setflags_prepare(inode,
+						     REISERFS_I(inode)->i_attrs,
+						     flags);
+			if (err)
 				goto setflags_out;
-			}
 			if ((flags & REISERFS_NOTAIL_FL) &&
 			    S_ISREG(inode->i_mode)) {
 				int result;
diff --git a/fs/ubifs/ioctl.c b/fs/ubifs/ioctl.c
index 4f1a397fda69..034ad14710d1 100644
--- a/fs/ubifs/ioctl.c
+++ b/fs/ubifs/ioctl.c
@@ -107,18 +107,11 @@ static int setflags(struct inode *inode, int flags)
 	if (err)
 		return err;
 
-	/*
-	 * The IMMUTABLE and APPEND_ONLY flags can only be changed by
-	 * the relevant capability.
-	 */
 	mutex_lock(&ui->ui_mutex);
 	oldflags = ubifs2ioctl(ui->flags);
-	if ((flags ^ oldflags) & (FS_APPEND_FL | FS_IMMUTABLE_FL)) {
-		if (!capable(CAP_LINUX_IMMUTABLE)) {
-			err = -EPERM;
-			goto out_unlock;
-		}
-	}
+	err = vfs_ioc_setflags_prepare(inode, oldflags, flags);
+	if (err)
+		goto out_unlock;
 
 	ui->flags = ioctl2ubifs(flags);
 	ubifs_set_inode_flags(inode);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index f7fdfe93e25d..41d5175ffdd7 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -3546,4 +3546,7 @@ static inline struct sock *io_uring_get_socket(struct file *file)
 }
 #endif
 
+int vfs_ioc_setflags_prepare(struct inode *inode, unsigned int oldflags,
+			     unsigned int flags);
+
 #endif /* _LINUX_FS_H */
-- 
cgit v1.2.3


From 7b0e492e6b80d51db4156996b248522c7b50d467 Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong" <darrick.wong@oracle.com>
Date: Mon, 1 Jul 2019 08:25:35 -0700
Subject: vfs: create a generic checking function for FS_IOC_FSSETXATTR

Create a generic checking function for the incoming FS_IOC_FSSETXATTR
fsxattr values so that we can standardize some of the implementation
behaviors.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Jan Kara <jack@suse.cz>
---
 fs/btrfs/ioctl.c   | 17 +++++---------
 fs/ext4/ioctl.c    | 25 +++++++++++++-------
 fs/inode.c         | 23 ++++++++++++++++++
 fs/xfs/xfs_ioctl.c | 69 +++++++++++++++++++++++++++++++-----------------------
 include/linux/fs.h |  9 +++++++
 5 files changed, 95 insertions(+), 48 deletions(-)

(limited to 'include/linux')

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index d3d9b4abb09b..3cd66efdb99d 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -375,9 +375,7 @@ static int btrfs_ioctl_fsgetxattr(struct file *file, void __user *arg)
 	struct btrfs_inode *binode = BTRFS_I(file_inode(file));
 	struct fsxattr fa;
 
-	memset(&fa, 0, sizeof(fa));
-	fa.fsx_xflags = btrfs_inode_flags_to_xflags(binode->flags);
-
+	simple_fill_fsxattr(&fa, btrfs_inode_flags_to_xflags(binode->flags));
 	if (copy_to_user(arg, &fa, sizeof(fa)))
 		return -EFAULT;
 
@@ -390,7 +388,7 @@ static int btrfs_ioctl_fssetxattr(struct file *file, void __user *arg)
 	struct btrfs_inode *binode = BTRFS_I(inode);
 	struct btrfs_root *root = binode->root;
 	struct btrfs_trans_handle *trans;
-	struct fsxattr fa;
+	struct fsxattr fa, old_fa;
 	unsigned old_flags;
 	unsigned old_i_flags;
 	int ret = 0;
@@ -401,7 +399,6 @@ static int btrfs_ioctl_fssetxattr(struct file *file, void __user *arg)
 	if (btrfs_root_readonly(root))
 		return -EROFS;
 
-	memset(&fa, 0, sizeof(fa));
 	if (copy_from_user(&fa, arg, sizeof(fa)))
 		return -EFAULT;
 
@@ -421,13 +418,11 @@ static int btrfs_ioctl_fssetxattr(struct file *file, void __user *arg)
 	old_flags = binode->flags;
 	old_i_flags = inode->i_flags;
 
-	/* We need the capabilities to change append-only or immutable inode */
-	if (((old_flags & (BTRFS_INODE_APPEND | BTRFS_INODE_IMMUTABLE)) ||
-	     (fa.fsx_xflags & (FS_XFLAG_APPEND | FS_XFLAG_IMMUTABLE))) &&
-	    !capable(CAP_LINUX_IMMUTABLE)) {
-		ret = -EPERM;
+	simple_fill_fsxattr(&old_fa,
+			    btrfs_inode_flags_to_xflags(binode->flags));
+	ret = vfs_ioc_fssetxattr_check(inode, &old_fa, &fa);
+	if (ret)
 		goto out_unlock;
-	}
 
 	if (fa.fsx_xflags & FS_XFLAG_SYNC)
 		binode->flags |= BTRFS_INODE_SYNC;
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 272b6e44191b..1974cb755d09 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -721,6 +721,17 @@ static int ext4_ioctl_check_project(struct inode *inode, struct fsxattr *fa)
 	return 0;
 }
 
+static void ext4_fill_fsxattr(struct inode *inode, struct fsxattr *fa)
+{
+	struct ext4_inode_info *ei = EXT4_I(inode);
+
+	simple_fill_fsxattr(fa, ext4_iflags_to_xflags(ei->i_flags &
+						      EXT4_FL_USER_VISIBLE));
+
+	if (ext4_has_feature_project(inode->i_sb))
+		fa->fsx_projid = from_kprojid(&init_user_ns, ei->i_projid);
+}
+
 long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 {
 	struct inode *inode = file_inode(filp);
@@ -1089,13 +1100,7 @@ resizefs_out:
 	{
 		struct fsxattr fa;
 
-		memset(&fa, 0, sizeof(struct fsxattr));
-		fa.fsx_xflags = ext4_iflags_to_xflags(ei->i_flags & EXT4_FL_USER_VISIBLE);
-
-		if (ext4_has_feature_project(inode->i_sb)) {
-			fa.fsx_projid = (__u32)from_kprojid(&init_user_ns,
-				EXT4_I(inode)->i_projid);
-		}
+		ext4_fill_fsxattr(inode, &fa);
 
 		if (copy_to_user((struct fsxattr __user *)arg,
 				 &fa, sizeof(fa)))
@@ -1104,7 +1109,7 @@ resizefs_out:
 	}
 	case EXT4_IOC_FSSETXATTR:
 	{
-		struct fsxattr fa;
+		struct fsxattr fa, old_fa;
 		int err;
 
 		if (copy_from_user(&fa, (struct fsxattr __user *)arg,
@@ -1127,7 +1132,11 @@ resizefs_out:
 			return err;
 
 		inode_lock(inode);
+		ext4_fill_fsxattr(inode, &old_fa);
 		err = ext4_ioctl_check_project(inode, &fa);
+		if (err)
+			goto out;
+		err = vfs_ioc_fssetxattr_check(inode, &old_fa, &fa);
 		if (err)
 			goto out;
 		flags = (ei->i_flags & ~EXT4_FL_XFLAG_VISIBLE) |
diff --git a/fs/inode.c b/fs/inode.c
index 8072a09fd0b9..ba2bafa22885 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -2194,3 +2194,26 @@ int vfs_ioc_setflags_prepare(struct inode *inode, unsigned int oldflags,
 	return 0;
 }
 EXPORT_SYMBOL(vfs_ioc_setflags_prepare);
+
+/*
+ * Generic function to check FS_IOC_FSSETXATTR values and reject any invalid
+ * configurations.
+ *
+ * Note: the caller should be holding i_mutex, or else be sure that they have
+ * exclusive access to the inode structure.
+ */
+int vfs_ioc_fssetxattr_check(struct inode *inode, const struct fsxattr *old_fa,
+			     struct fsxattr *fa)
+{
+	/*
+	 * Can't modify an immutable/append-only file unless we have
+	 * appropriate permission.
+	 */
+	if ((old_fa->fsx_xflags ^ fa->fsx_xflags) &
+			(FS_XFLAG_IMMUTABLE | FS_XFLAG_APPEND) &&
+	    !capable(CAP_LINUX_IMMUTABLE))
+		return -EPERM;
+
+	return 0;
+}
+EXPORT_SYMBOL(vfs_ioc_fssetxattr_check);
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index d7dfc13f30f5..458a7043b4d2 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -879,37 +879,44 @@ xfs_di2lxflags(
 	return flags;
 }
 
-STATIC int
-xfs_ioc_fsgetxattr(
-	xfs_inode_t		*ip,
-	int			attr,
-	void			__user *arg)
+static void
+xfs_fill_fsxattr(
+	struct xfs_inode	*ip,
+	bool			attr,
+	struct fsxattr		*fa)
 {
-	struct fsxattr		fa;
-
-	memset(&fa, 0, sizeof(struct fsxattr));
-
-	xfs_ilock(ip, XFS_ILOCK_SHARED);
-	fa.fsx_xflags = xfs_ip2xflags(ip);
-	fa.fsx_extsize = ip->i_d.di_extsize << ip->i_mount->m_sb.sb_blocklog;
-	fa.fsx_cowextsize = ip->i_d.di_cowextsize <<
+	simple_fill_fsxattr(fa, xfs_ip2xflags(ip));
+	fa->fsx_extsize = ip->i_d.di_extsize << ip->i_mount->m_sb.sb_blocklog;
+	fa->fsx_cowextsize = ip->i_d.di_cowextsize <<
 			ip->i_mount->m_sb.sb_blocklog;
-	fa.fsx_projid = xfs_get_projid(ip);
+	fa->fsx_projid = xfs_get_projid(ip);
 
 	if (attr) {
 		if (ip->i_afp) {
 			if (ip->i_afp->if_flags & XFS_IFEXTENTS)
-				fa.fsx_nextents = xfs_iext_count(ip->i_afp);
+				fa->fsx_nextents = xfs_iext_count(ip->i_afp);
 			else
-				fa.fsx_nextents = ip->i_d.di_anextents;
+				fa->fsx_nextents = ip->i_d.di_anextents;
 		} else
-			fa.fsx_nextents = 0;
+			fa->fsx_nextents = 0;
 	} else {
 		if (ip->i_df.if_flags & XFS_IFEXTENTS)
-			fa.fsx_nextents = xfs_iext_count(&ip->i_df);
+			fa->fsx_nextents = xfs_iext_count(&ip->i_df);
 		else
-			fa.fsx_nextents = ip->i_d.di_nextents;
+			fa->fsx_nextents = ip->i_d.di_nextents;
 	}
+}
+
+STATIC int
+xfs_ioc_fsgetxattr(
+	xfs_inode_t		*ip,
+	int			attr,
+	void			__user *arg)
+{
+	struct fsxattr		fa;
+
+	xfs_ilock(ip, XFS_ILOCK_SHARED);
+	xfs_fill_fsxattr(ip, attr, &fa);
 	xfs_iunlock(ip, XFS_ILOCK_SHARED);
 
 	if (copy_to_user(arg, &fa, sizeof(fa)))
@@ -1035,15 +1042,6 @@ xfs_ioctl_setattr_xflags(
 	if ((fa->fsx_xflags & FS_XFLAG_DAX) && xfs_is_reflink_inode(ip))
 		return -EINVAL;
 
-	/*
-	 * Can't modify an immutable/append-only file unless
-	 * we have appropriate permission.
-	 */
-	if (((ip->i_d.di_flags & (XFS_DIFLAG_IMMUTABLE | XFS_DIFLAG_APPEND)) ||
-	     (fa->fsx_xflags & (FS_XFLAG_IMMUTABLE | FS_XFLAG_APPEND))) &&
-	    !capable(CAP_LINUX_IMMUTABLE))
-		return -EPERM;
-
 	/* diflags2 only valid for v3 inodes. */
 	di_flags2 = xfs_flags2diflags2(ip, fa->fsx_xflags);
 	if (di_flags2 && ip->i_d.di_version < 3)
@@ -1323,6 +1321,7 @@ xfs_ioctl_setattr(
 	xfs_inode_t		*ip,
 	struct fsxattr		*fa)
 {
+	struct fsxattr		old_fa;
 	struct xfs_mount	*mp = ip->i_mount;
 	struct xfs_trans	*tp;
 	struct xfs_dquot	*udqp = NULL;
@@ -1370,7 +1369,6 @@ xfs_ioctl_setattr(
 		goto error_free_dquots;
 	}
 
-
 	if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_PQUOTA_ON(mp) &&
 	    xfs_get_projid(ip) != fa->fsx_projid) {
 		code = xfs_qm_vop_chown_reserve(tp, ip, udqp, NULL, pdqp,
@@ -1379,6 +1377,11 @@ xfs_ioctl_setattr(
 			goto error_trans_cancel;
 	}
 
+	xfs_fill_fsxattr(ip, false, &old_fa);
+	code = vfs_ioc_fssetxattr_check(VFS_I(ip), &old_fa, fa);
+	if (code)
+		goto error_trans_cancel;
+
 	code = xfs_ioctl_setattr_check_extsize(ip, fa);
 	if (code)
 		goto error_trans_cancel;
@@ -1489,6 +1492,7 @@ xfs_ioc_setxflags(
 {
 	struct xfs_trans	*tp;
 	struct fsxattr		fa;
+	struct fsxattr		old_fa;
 	unsigned int		flags;
 	int			join_flags = 0;
 	int			error;
@@ -1524,6 +1528,13 @@ xfs_ioc_setxflags(
 		goto out_drop_write;
 	}
 
+	xfs_fill_fsxattr(ip, false, &old_fa);
+	error = vfs_ioc_fssetxattr_check(VFS_I(ip), &old_fa, &fa);
+	if (error) {
+		xfs_trans_cancel(tp);
+		goto out_drop_write;
+	}
+
 	error = xfs_ioctl_setattr_xflags(tp, ip, &fa);
 	if (error) {
 		xfs_trans_cancel(tp);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 41d5175ffdd7..36f9691d7046 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -3549,4 +3549,13 @@ static inline struct sock *io_uring_get_socket(struct file *file)
 int vfs_ioc_setflags_prepare(struct inode *inode, unsigned int oldflags,
 			     unsigned int flags);
 
+int vfs_ioc_fssetxattr_check(struct inode *inode, const struct fsxattr *old_fa,
+			     struct fsxattr *fa);
+
+static inline void simple_fill_fsxattr(struct fsxattr *fa, __u32 xflags)
+{
+	memset(fa, 0, sizeof(*fa));
+	fa->fsx_xflags = xflags;
+}
+
 #endif /* _LINUX_FS_H */
-- 
cgit v1.2.3


From 1759d322f4bad2f82c376856363b725cac12e61d Mon Sep 17 00:00:00 2001
From: Parav Pandit <parav@mellanox.com>
Date: Fri, 28 Jun 2019 22:35:48 +0000
Subject: net/mlx5: Add hardware definitions for sub functions

Update mlx5 device interface data structures for:
1. New command definitions for allocating, deallocating SF
2. Query SF partition
3. Eswitch SF fields
4. HCA CAP SF fields
5. Extend Eswitch functions command for SF

Signed-off-by: Parav Pandit <parav@mellanox.com>
Signed-off-by: Vu Pham <vuhuong@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 include/linux/mlx5/mlx5_ifc.h | 99 +++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 96 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index d4409654f760..db00effaa83a 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -106,6 +106,9 @@ enum {
 	MLX5_CMD_OP_QUERY_ISSI                    = 0x10a,
 	MLX5_CMD_OP_SET_ISSI                      = 0x10b,
 	MLX5_CMD_OP_SET_DRIVER_VERSION            = 0x10d,
+	MLX5_CMD_OP_QUERY_SF_PARTITION            = 0x111,
+	MLX5_CMD_OP_ALLOC_SF                      = 0x113,
+	MLX5_CMD_OP_DEALLOC_SF                    = 0x114,
 	MLX5_CMD_OP_CREATE_MKEY                   = 0x200,
 	MLX5_CMD_OP_QUERY_MKEY                    = 0x201,
 	MLX5_CMD_OP_DESTROY_MKEY                  = 0x202,
@@ -713,7 +716,11 @@ struct mlx5_ifc_e_switch_cap_bits {
 	u8         reserved_2b[0x6];
 	u8         max_encap_header_size[0xa];
 
-	u8         reserved_40[0x7c0];
+	u8         reserved_at_40[0xb];
+	u8         log_max_esw_sf[0x5];
+	u8         esw_sf_base_id[0x10];
+
+	u8         reserved_at_60[0x7a0];
 
 };
 
@@ -1330,13 +1337,24 @@ struct mlx5_ifc_cmd_hca_cap_bits {
 	u8         reserved_at_640[0x10];
 	u8         num_q_monitor_counters[0x10];
 
-	u8         reserved_at_660[0x40];
+	u8         reserved_at_660[0x20];
+
+	u8         sf[0x1];
+	u8         sf_set_partition[0x1];
+	u8         reserved_at_682[0x1];
+	u8         log_max_sf[0x5];
+	u8         reserved_at_688[0x8];
+	u8         log_min_sf_size[0x8];
+	u8         max_num_sf_partitions[0x8];
 
 	u8         uctx_cap[0x20];
 
 	u8         reserved_at_6c0[0x4];
 	u8         flex_parser_id_geneve_tlv_option_0[0x4];
-	u8         reserved_at_6c8[0x138];
+	u8	   reserved_at_6c8[0x28];
+	u8	   sf_base_id[0x10];
+
+	u8	   reserved_at_700[0x100];
 };
 
 enum mlx5_flow_destination_type {
@@ -9786,6 +9804,81 @@ struct mlx5_ifc_query_esw_functions_out_bits {
 	struct mlx5_ifc_host_params_context_bits host_params_context;
 
 	u8         reserved_at_280[0x180];
+	u8         host_sf_enable[0][0x40];
+};
+
+struct mlx5_ifc_sf_partition_bits {
+	u8         reserved_at_0[0x10];
+	u8         log_num_sf[0x8];
+	u8         log_sf_bar_size[0x8];
+};
+
+struct mlx5_ifc_query_sf_partitions_out_bits {
+	u8         status[0x8];
+	u8         reserved_at_8[0x18];
+
+	u8         syndrome[0x20];
+
+	u8         reserved_at_40[0x18];
+	u8         num_sf_partitions[0x8];
+
+	u8         reserved_at_60[0x20];
+
+	struct mlx5_ifc_sf_partition_bits sf_partition[0];
+};
+
+struct mlx5_ifc_query_sf_partitions_in_bits {
+	u8         opcode[0x10];
+	u8         reserved_at_10[0x10];
+
+	u8         reserved_at_20[0x10];
+	u8         op_mod[0x10];
+
+	u8         reserved_at_40[0x40];
+};
+
+struct mlx5_ifc_dealloc_sf_out_bits {
+	u8         status[0x8];
+	u8         reserved_at_8[0x18];
+
+	u8         syndrome[0x20];
+
+	u8         reserved_at_40[0x40];
+};
+
+struct mlx5_ifc_dealloc_sf_in_bits {
+	u8         opcode[0x10];
+	u8         reserved_at_10[0x10];
+
+	u8         reserved_at_20[0x10];
+	u8         op_mod[0x10];
+
+	u8         reserved_at_40[0x10];
+	u8         function_id[0x10];
+
+	u8         reserved_at_60[0x20];
+};
+
+struct mlx5_ifc_alloc_sf_out_bits {
+	u8         status[0x8];
+	u8         reserved_at_8[0x18];
+
+	u8         syndrome[0x20];
+
+	u8         reserved_at_40[0x40];
+};
+
+struct mlx5_ifc_alloc_sf_in_bits {
+	u8         opcode[0x10];
+	u8         reserved_at_10[0x10];
+
+	u8         reserved_at_20[0x10];
+	u8         op_mod[0x10];
+
+	u8         reserved_at_40[0x10];
+	u8         function_id[0x10];
+
+	u8         reserved_at_60[0x20];
 };
 
 #endif /* MLX5_IFC_H */
-- 
cgit v1.2.3


From a82e0b5bdac29d9719d3ca2df01494a7947351aa Mon Sep 17 00:00:00 2001
From: Shay Agroskin <shayag@mellanox.com>
Date: Fri, 28 Jun 2019 22:35:50 +0000
Subject: net/mlx5: Added MCQI and MCQS registers' description to ifc

Given a fw component index, the MCQI register allows us to query
this component's information (e.g. its version and capabilities).

Given a fw component index, the MCQS register allows us to query the
status of a fw component, including its type and state
(e.g. PRESENT/IN_USE).
It can be used to find the index of a component of a specific type, by
sequentially increasing the component index, and querying each time the
type of the returned component.
If max component index is reached, 'last_index_flag' is set by the HCA.

These registers' description was added to query the running and pending
fw version of the HCA.

Signed-off-by: Shay Agroskin <shayag@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 include/linux/mlx5/driver.h   |  1 +
 include/linux/mlx5/mlx5_ifc.h | 59 +++++++++++++++++++++++++++++++++++++++++--
 2 files changed, 58 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 87f77ded78d4..2ff624a91e3d 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -138,6 +138,7 @@ enum {
 	MLX5_REG_MTPPS		 = 0x9053,
 	MLX5_REG_MTPPSE		 = 0x9054,
 	MLX5_REG_MPEGC		 = 0x9056,
+	MLX5_REG_MCQS		 = 0x9060,
 	MLX5_REG_MCQI		 = 0x9061,
 	MLX5_REG_MCC		 = 0x9062,
 	MLX5_REG_MCDA		 = 0x9063,
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index db00effaa83a..e2a77b5152a8 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -8542,7 +8542,7 @@ struct mlx5_ifc_mcam_access_reg_bits {
 	u8         mcda[0x1];
 	u8         mcc[0x1];
 	u8         mcqi[0x1];
-	u8         reserved_at_1f[0x1];
+	u8         mcqs[0x1];
 
 	u8         regs_95_to_87[0x9];
 	u8         mpegc[0x1];
@@ -9034,6 +9034,24 @@ struct mlx5_ifc_mtppse_reg_bits {
 	u8         reserved_at_40[0x40];
 };
 
+struct mlx5_ifc_mcqs_reg_bits {
+	u8         last_index_flag[0x1];
+	u8         reserved_at_1[0x7];
+	u8         fw_device[0x8];
+	u8         component_index[0x10];
+
+	u8         reserved_at_20[0x10];
+	u8         identifier[0x10];
+
+	u8         reserved_at_40[0x17];
+	u8         component_status[0x5];
+	u8         component_update_state[0x4];
+
+	u8         last_update_state_changer_type[0x4];
+	u8         last_update_state_changer_host_id[0x4];
+	u8         reserved_at_68[0x18];
+};
+
 struct mlx5_ifc_mcqi_cap_bits {
 	u8         supported_info_bitmask[0x20];
 
@@ -9054,6 +9072,43 @@ struct mlx5_ifc_mcqi_cap_bits {
 	u8         reserved_at_86[0x1a];
 };
 
+struct mlx5_ifc_mcqi_version_bits {
+	u8         reserved_at_0[0x2];
+	u8         build_time_valid[0x1];
+	u8         user_defined_time_valid[0x1];
+	u8         reserved_at_4[0x14];
+	u8         version_string_length[0x8];
+
+	u8         version[0x20];
+
+	u8         build_time[0x40];
+
+	u8         user_defined_time[0x40];
+
+	u8         build_tool_version[0x20];
+
+	u8         reserved_at_e0[0x20];
+
+	u8         version_string[92][0x8];
+};
+
+struct mlx5_ifc_mcqi_activation_method_bits {
+	u8         pending_server_ac_power_cycle[0x1];
+	u8         pending_server_dc_power_cycle[0x1];
+	u8         pending_server_reboot[0x1];
+	u8         pending_fw_reset[0x1];
+	u8         auto_activate[0x1];
+	u8         all_hosts_sync[0x1];
+	u8         device_hw_reset[0x1];
+	u8         reserved_at_7[0x19];
+};
+
+union mlx5_ifc_mcqi_reg_data_bits {
+	struct mlx5_ifc_mcqi_cap_bits               mcqi_caps;
+	struct mlx5_ifc_mcqi_version_bits           mcqi_version;
+	struct mlx5_ifc_mcqi_activation_method_bits mcqi_activation_mathod;
+};
+
 struct mlx5_ifc_mcqi_reg_bits {
 	u8         read_pending_component[0x1];
 	u8         reserved_at_1[0xf];
@@ -9071,7 +9126,7 @@ struct mlx5_ifc_mcqi_reg_bits {
 	u8         reserved_at_a0[0x10];
 	u8         data_size[0x10];
 
-	u8         data[0][0x20];
+	union mlx5_ifc_mcqi_reg_data_bits data[0];
 };
 
 struct mlx5_ifc_mcc_reg_bits {
-- 
cgit v1.2.3


From 2f69e591e4531d3192841a4eb2bd9b512f5a8b66 Mon Sep 17 00:00:00 2001
From: Bodong Wang <bodong@mellanox.com>
Date: Fri, 28 Jun 2019 22:35:53 +0000
Subject: {IB, net}/mlx5: E-Switch, Use index of rep for vport to IB port
 mapping

In the single IB device mode, the mapping between vport number and
rep relies on a counter. However, for dynamic vport allocation, it is
desirable to keep a consistent map of eswitch vport and IB port.

Hence, simplify code to remove the free running counter and instead
use the available vport index during load/unload sequence from the
eswitch.

Signed-off-by: Bodong Wang <bodong@mellanox.com>
Suggested-by: Parav Pandit <parav@mellanox.com>
Reviewed-by: Parav Pandit <parav@mellanox.com>
Reviewed-by: Mark Bloch <markb@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/infiniband/hw/mlx5/ib_rep.c                        | 4 ++--
 drivers/infiniband/hw/mlx5/mlx5_ib.h                       | 1 -
 drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c | 1 +
 include/linux/mlx5/eswitch.h                               | 2 ++
 4 files changed, 5 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/infiniband/hw/mlx5/ib_rep.c b/drivers/infiniband/hw/mlx5/ib_rep.c
index 22e651cb5534..1de16a93fc64 100644
--- a/drivers/infiniband/hw/mlx5/ib_rep.c
+++ b/drivers/infiniband/hw/mlx5/ib_rep.c
@@ -14,7 +14,7 @@ mlx5_ib_set_vport_rep(struct mlx5_core_dev *dev, struct mlx5_eswitch_rep *rep)
 	int vport_index;
 
 	ibdev = mlx5_ib_get_uplink_ibdev(dev->priv.eswitch);
-	vport_index = ibdev->free_port++;
+	vport_index = rep->vport_index;
 
 	ibdev->port[vport_index].rep = rep;
 	write_lock(&ibdev->port[vport_index].roce.netdev_lock);
@@ -50,7 +50,7 @@ mlx5_ib_vport_rep_load(struct mlx5_core_dev *dev, struct mlx5_eswitch_rep *rep)
 	}
 
 	ibdev->is_rep = true;
-	vport_index = ibdev->free_port++;
+	vport_index = rep->vport_index;
 	ibdev->port[vport_index].rep = rep;
 	ibdev->port[vport_index].roce.netdev =
 		mlx5_ib_get_rep_netdev(dev->priv.eswitch, rep->vport);
diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h
index 1c205c2bd486..ee73dc122d28 100644
--- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
+++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
@@ -978,7 +978,6 @@ struct mlx5_ib_dev {
 	u16			devx_whitelist_uid;
 	struct mlx5_srq_table   srq_table;
 	struct mlx5_async_ctx   async_ctx;
-	int			free_port;
 };
 
 static inline struct mlx5_ib_cq *to_mibcq(struct mlx5_core_cq *mcq)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
index bc639a846714..24af2744453b 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
@@ -1411,6 +1411,7 @@ int esw_offloads_init_reps(struct mlx5_eswitch *esw)
 
 	mlx5_esw_for_all_reps(esw, vport_index, rep) {
 		rep->vport = mlx5_eswitch_index_to_vport_num(esw, vport_index);
+		rep->vport_index = vport_index;
 		ether_addr_copy(rep->hw_id, hw_id);
 
 		for (rep_type = 0; rep_type < NUM_REP_TYPES; rep_type++)
diff --git a/include/linux/mlx5/eswitch.h b/include/linux/mlx5/eswitch.h
index aece3ae1902d..36cb641188b0 100644
--- a/include/linux/mlx5/eswitch.h
+++ b/include/linux/mlx5/eswitch.h
@@ -46,6 +46,8 @@ struct mlx5_eswitch_rep {
 	u16		       vport;
 	u8		       hw_id[ETH_ALEN];
 	u16		       vlan;
+	/* Only IB rep is using vport_index */
+	u16		       vport_index;
 	u32		       vlan_refcount;
 };
 
-- 
cgit v1.2.3


From 386e75af995c3aec475a2185b919bf46af396bfc Mon Sep 17 00:00:00 2001
From: Huy Nguyen <huyn@mellanox.com>
Date: Fri, 28 Jun 2019 22:35:58 +0000
Subject: net/mlx5: Rename mlx5_pci_dev_type to mlx5_coredev_type

Rename mlx5_pci_dev_type to mlx5_coredev_type to distinguish different mlx5
device types.

mlx5_coredev_type represents mlx5_core_dev instance type. Hence keep
mlx5_coredev_type in mlx5_core_dev structure.

Signed-off-by: Huy Nguyen <huyn@mellanox.com>
Signed-off-by: Vu Pham <vuhuong@mellanox.com>
Signed-off-by: Parav Pandit <parav@mellanox.com>
Reviewed-by: Parav Pandit <parav@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/main.c |  5 +++--
 include/linux/mlx5/driver.h                    | 11 ++++++++---
 2 files changed, 11 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c
index bfc8c6faedc2..e5f9df7f7e34 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c
@@ -731,8 +731,6 @@ static int mlx5_pci_init(struct mlx5_core_dev *dev, struct pci_dev *pdev,
 	struct mlx5_priv *priv = &dev->priv;
 	int err = 0;
 
-	priv->pci_dev_data = id->driver_data;
-
 	pci_set_drvdata(dev->pdev, dev);
 
 	dev->bar_addr = pci_resource_start(pdev, 0);
@@ -1320,6 +1318,9 @@ static int init_one(struct pci_dev *pdev, const struct pci_device_id *id)
 	dev->device = &pdev->dev;
 	dev->pdev = pdev;
 
+	dev->coredev_type = id->driver_data & MLX5_PCI_DEV_IS_VF ?
+			 MLX5_COREDEV_VF : MLX5_COREDEV_PF;
+
 	err = mlx5_mdev_init(dev, prof_sel);
 	if (err)
 		goto mdev_init_err;
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 2ff624a91e3d..155b8cbe1cc9 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -182,6 +182,11 @@ enum port_state_policy {
 	MLX5_POLICY_INVALID	= 0xffffffff
 };
 
+enum mlx5_coredev_type {
+	MLX5_COREDEV_PF,
+	MLX5_COREDEV_VF
+};
+
 struct mlx5_field_desc {
 	struct dentry	       *dent;
 	int			i;
@@ -567,7 +572,6 @@ struct mlx5_priv {
 	struct mlx5_core_sriov	sriov;
 	struct mlx5_lag		*lag;
 	struct mlx5_devcom	*devcom;
-	unsigned long		pci_dev_data;
 	struct mlx5_core_roce	roce;
 	struct mlx5_fc_stats		fc_stats;
 	struct mlx5_rl_table            rl_table;
@@ -646,6 +650,7 @@ struct mlx5_vxlan;
 
 struct mlx5_core_dev {
 	struct device *device;
+	enum mlx5_coredev_type coredev_type;
 	struct pci_dev	       *pdev;
 	/* sync pci state */
 	struct mutex		pci_status_mutex;
@@ -1079,9 +1084,9 @@ enum {
 	MLX5_PCI_DEV_IS_VF		= 1 << 0,
 };
 
-static inline int mlx5_core_is_pf(struct mlx5_core_dev *dev)
+static inline bool mlx5_core_is_pf(struct mlx5_core_dev *dev)
 {
-	return !(dev->priv.pci_dev_data & MLX5_PCI_DEV_IS_VF);
+	return dev->coredev_type == MLX5_COREDEV_PF;
 }
 
 static inline bool mlx5_core_is_ecpf(struct mlx5_core_dev *dev)
-- 
cgit v1.2.3


From 5ccf2770e83bf8739f0a7c8bed9186d7e5d2ecbc Mon Sep 17 00:00:00 2001
From: Bodong Wang <bodong@mellanox.com>
Date: Fri, 28 Jun 2019 22:36:04 +0000
Subject: net/mlx5: Don't handle VF func change if host PF is disabled

When ECPF eswitch manager is at offloads mode, it monitors functions
changed event from host PF side and acts according to the number of
VFs enabled/disabled.

As ECPF and host PF work in two independent hosts, it's possible that
host PF OS reboots but ECPF system is still kept on and continues
monitoring events from host PF. When kernel from host PF side is
booting, PCI iov driver does sriov_init and compute_max_vf_buses by
iterating over all valid num of VFs. This triggers FLR and generates
functions changed events, even though host PF HCA is not enabled at
this time. However, ECPF is not aware of this information, and still
handles these events as usual. The ECPF system will see a massive number of
reps being created, only to be destroyed immediately once creation finishes.

To eliminate this noise, a bit is added to host parameter context to
indicate host PF is disabled. ECPF will not handle the VF changed
event if this bit is set.

Signed-off-by: Bodong Wang <bodong@mellanox.com>
Reviewed-by: Daniel Jurgens <danielj@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c | 5 ++++-
 include/linux/mlx5/mlx5_ifc.h                              | 3 ++-
 2 files changed, 6 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
index 24af2744453b..105c21069c0c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
@@ -2026,6 +2026,7 @@ static void esw_functions_changed_event_handler(struct work_struct *work)
 	u32 out[MLX5_ST_SZ_DW(query_esw_functions_out)] = {};
 	struct mlx5_host_work *host_work;
 	struct mlx5_eswitch *esw;
+	bool host_pf_disabled;
 	u16 num_vfs = 0;
 	int err;
 
@@ -2035,7 +2036,9 @@ static void esw_functions_changed_event_handler(struct work_struct *work)
 	err = mlx5_esw_query_functions(esw->dev, out, sizeof(out));
 	num_vfs = MLX5_GET(query_esw_functions_out, out,
 			   host_params_context.host_num_of_vfs);
-	if (err || num_vfs == esw->esw_funcs.num_vfs)
+	host_pf_disabled = MLX5_GET(query_esw_functions_out, out,
+				    host_params_context.host_pf_disabled);
+	if (err || host_pf_disabled || num_vfs == esw->esw_funcs.num_vfs)
 		goto out;
 
 	/* Number of VFs can only change from "0 to x" or "x to 0". */
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index e2a77b5152a8..031db53e94ce 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -9823,7 +9823,8 @@ struct mlx5_ifc_mtrc_ctrl_bits {
 
 struct mlx5_ifc_host_params_context_bits {
 	u8         host_number[0x8];
-	u8         reserved_at_8[0x8];
+	u8         reserved_at_8[0x7];
+	u8         host_pf_disabled[0x1];
 	u8         host_num_of_vfs[0x10];
 
 	u8         host_total_vfs[0x10];
-- 
cgit v1.2.3


From d886aba677a0a75ad7fdb06e08418b481e09b036 Mon Sep 17 00:00:00 2001
From: Parav Pandit <parav@mellanox.com>
Date: Fri, 28 Jun 2019 22:36:06 +0000
Subject: net/mlx5: Reduce dependency on enabled_vfs counter and num_vfs

While enabling SR-IOV, the PCI core already checks whether SR-IOV is
enabled and, if so, returns a failure error code.
Hence, remove the duplicate check from the mlx5_core driver.

While at it, make mlx5_device_disable_sriov() perform cleanup of VFs in
reverse order of mlx5_device_enable_sriov().

Signed-off-by: Parav Pandit <parav@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/sriov.c | 22 ++++------------------
 include/linux/mlx5/driver.h                     |  1 -
 2 files changed, 4 insertions(+), 19 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/sriov.c b/drivers/net/ethernet/mellanox/mlx5/core/sriov.c
index 2eecb831c499..9d9ff4511306 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/sriov.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/sriov.c
@@ -74,13 +74,6 @@ static int mlx5_device_enable_sriov(struct mlx5_core_dev *dev, int num_vfs)
 	int err;
 	int vf;
 
-	if (sriov->enabled_vfs) {
-		mlx5_core_warn(dev,
-			       "failed to enable SRIOV on device, already enabled with %d vfs\n",
-			       sriov->enabled_vfs);
-		return -EBUSY;
-	}
-
 	if (!MLX5_ESWITCH_MANAGER(dev))
 		goto enable_vfs_hca;
 
@@ -99,7 +92,6 @@ enable_vfs_hca:
 			continue;
 		}
 		sriov->vfs_ctx[vf].enabled = 1;
-		sriov->enabled_vfs++;
 		if (MLX5_CAP_GEN(dev, port_type) == MLX5_CAP_PORT_TYPE_IB) {
 			err = sriov_restore_guids(dev, vf);
 			if (err) {
@@ -118,13 +110,11 @@ enable_vfs_hca:
 static void mlx5_device_disable_sriov(struct mlx5_core_dev *dev)
 {
 	struct mlx5_core_sriov *sriov = &dev->priv.sriov;
+	int num_vfs = pci_num_vf(dev->pdev);
 	int err;
 	int vf;
 
-	if (!sriov->enabled_vfs)
-		goto out;
-
-	for (vf = 0; vf < sriov->num_vfs; vf++) {
+	for (vf = num_vfs - 1; vf >= 0; vf--) {
 		if (!sriov->vfs_ctx[vf].enabled)
 			continue;
 		err = mlx5_core_disable_hca(dev, vf + 1);
@@ -133,10 +123,8 @@ static void mlx5_device_disable_sriov(struct mlx5_core_dev *dev)
 			continue;
 		}
 		sriov->vfs_ctx[vf].enabled = 0;
-		sriov->enabled_vfs--;
 	}
 
-out:
 	if (MLX5_ESWITCH_MANAGER(dev))
 		mlx5_eswitch_disable_sriov(dev->priv.eswitch);
 
@@ -191,13 +179,11 @@ int mlx5_core_sriov_configure(struct pci_dev *pdev, int num_vfs)
 
 int mlx5_sriov_attach(struct mlx5_core_dev *dev)
 {
-	struct mlx5_core_sriov *sriov = &dev->priv.sriov;
-
-	if (!mlx5_core_is_pf(dev) || !sriov->num_vfs)
+	if (!mlx5_core_is_pf(dev) || !pci_num_vf(dev->pdev))
 		return 0;
 
 	/* If sriov VFs exist in PCI level, enable them in device level */
-	return mlx5_device_enable_sriov(dev, sriov->num_vfs);
+	return mlx5_device_enable_sriov(dev, pci_num_vf(dev->pdev));
 }
 
 void mlx5_sriov_detach(struct mlx5_core_dev *dev)
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 155b8cbe1cc9..7658a4908431 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -468,7 +468,6 @@ struct mlx5_vf_context {
 struct mlx5_core_sriov {
 	struct mlx5_vf_context	*vfs_ctx;
 	int			num_vfs;
-	int			enabled_vfs;
 	u16			max_vfs;
 };
 
-- 
cgit v1.2.3


From e1d974d03e590cf8370d4820e8b467ee700925c3 Mon Sep 17 00:00:00 2001
From: Bodong Wang <bodong@mellanox.com>
Date: Fri, 28 Jun 2019 22:36:13 +0000
Subject: net/mlx5: Handle host PF vport mac/guid for ECPF

When ECPF is eswitch manager, it has the privilege to query and
configure the mac and node guid of host PF.

While vport number of host PF is 0, the vport command should be
issued with other_vport set in this case as the cmd is issued by
ECPF vport(0xfffe).

Add a specific function to query own vport mac. Low level functions
are used by vport manager to query/modify any vport mac and node guid.

Signed-off-by: Bodong Wang <bodong@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/en_dcbnl.c |  2 +-
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c  |  2 +-
 drivers/net/ethernet/mellanox/mlx5/core/en_rep.c   |  2 +-
 drivers/net/ethernet/mellanox/mlx5/core/eswitch.c  |  2 +-
 .../ethernet/mellanox/mlx5/core/eswitch_offloads.c |  2 +-
 .../net/ethernet/mellanox/mlx5/core/fpga/conn.c    |  2 +-
 drivers/net/ethernet/mellanox/mlx5/core/rdma.c     |  2 +-
 drivers/net/ethernet/mellanox/mlx5/core/vport.c    | 28 ++++++++++++++--------
 include/linux/mlx5/vport.h                         |  3 ++-
 9 files changed, 27 insertions(+), 18 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_dcbnl.c b/drivers/net/ethernet/mellanox/mlx5/core/en_dcbnl.c
index 554672edf8c3..8dd31b5c740c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_dcbnl.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_dcbnl.c
@@ -680,7 +680,7 @@ static void mlx5e_dcbnl_getpermhwaddr(struct net_device *netdev,
 
 	memset(perm_addr, 0xff, MAX_ADDR_LEN);
 
-	mlx5_query_nic_vport_mac_address(priv->mdev, 0, perm_addr);
+	mlx5_query_mac_address(priv->mdev, perm_addr);
 }
 
 static void mlx5e_dcbnl_setpgtccfgtx(struct net_device *netdev,
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index 457cc39423f2..bc9150f18116 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -4581,7 +4581,7 @@ static void mlx5e_set_netdev_dev_addr(struct net_device *netdev)
 {
 	struct mlx5e_priv *priv = netdev_priv(netdev);
 
-	mlx5_query_nic_vport_mac_address(priv->mdev, 0, netdev->dev_addr);
+	mlx5_query_mac_address(priv->mdev, netdev->dev_addr);
 	if (is_zero_ether_addr(netdev->dev_addr) &&
 	    !MLX5_CAP_GEN(priv->mdev, vport_group_manager)) {
 		eth_hw_addr_random(netdev);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
index 33f8f99681a5..abe8540d6879 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
@@ -1394,7 +1394,7 @@ static void mlx5e_build_rep_netdev(struct net_device *netdev)
 		SET_NETDEV_DEV(netdev, mdev->device);
 		netdev->netdev_ops = &mlx5e_netdev_ops_uplink_rep;
 		/* we want a persistent mac for the uplink rep */
-		mlx5_query_nic_vport_mac_address(mdev, 0, netdev->dev_addr);
+		mlx5_query_mac_address(mdev, netdev->dev_addr);
 		netdev->ethtool_ops = &mlx5e_uplink_rep_ethtool_ops;
 #ifdef CONFIG_MLX5_CORE_EN_DCB
 		if (MLX5_CAP_GEN(mdev, qos))
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
index 0c75219d91b5..a758755d7a08 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
@@ -897,7 +897,7 @@ static void esw_vport_change_handle_locked(struct mlx5_vport *vport)
 	struct mlx5_eswitch *esw = dev->priv.eswitch;
 	u8 mac[ETH_ALEN];
 
-	mlx5_query_nic_vport_mac_address(dev, vport->vport, mac);
+	mlx5_query_nic_vport_mac_address(dev, vport->vport, true, mac);
 	esw_debug(dev, "vport[%d] Context Changed: perm mac: %pM\n",
 		  vport->vport, mac);
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
index 105c21069c0c..b253bdf75dd6 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
@@ -1407,7 +1407,7 @@ int esw_offloads_init_reps(struct mlx5_eswitch *esw)
 	if (!esw->offloads.vport_reps)
 		return -ENOMEM;
 
-	mlx5_query_nic_vport_mac_address(dev, 0, hw_id);
+	mlx5_query_mac_address(dev, hw_id);
 
 	mlx5_esw_for_all_reps(esw, vport_index, rep) {
 		rep->vport = mlx5_eswitch_index_to_vport_num(esw, vport_index);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.c b/drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.c
index ca2296a2f9ee..d61d536f4e17 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.c
@@ -867,7 +867,7 @@ struct mlx5_fpga_conn *mlx5_fpga_conn_create(struct mlx5_fpga_device *fdev,
 	conn->cb_arg = attr->cb_arg;
 
 	remote_mac = MLX5_ADDR_OF(fpga_qpc, conn->fpga_qpc, remote_mac_47_32);
-	err = mlx5_query_nic_vport_mac_address(fdev->mdev, 0, remote_mac);
+	err = mlx5_query_mac_address(fdev->mdev, remote_mac);
 	if (err) {
 		mlx5_fpga_err(fdev, "Failed to query local MAC: %d\n", err);
 		ret = ERR_PTR(err);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/rdma.c b/drivers/net/ethernet/mellanox/mlx5/core/rdma.c
index 401441aefbcb..17ce9dd56b13 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/rdma.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/rdma.c
@@ -126,7 +126,7 @@ static void mlx5_rdma_make_default_gid(struct mlx5_core_dev *dev, union ib_gid *
 {
 	u8 hw_id[ETH_ALEN];
 
-	mlx5_query_nic_vport_mac_address(dev, 0, hw_id);
+	mlx5_query_mac_address(dev, hw_id);
 	gid->global.subnet_prefix = cpu_to_be64(0xfe80000000000000LL);
 	addrconf_addr_eui48(&gid->raw[8], hw_id);
 }
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/vport.c b/drivers/net/ethernet/mellanox/mlx5/core/vport.c
index 95cdc8cbcba4..670fa493c5f5 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/vport.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/vport.c
@@ -155,11 +155,12 @@ int mlx5_modify_nic_vport_min_inline(struct mlx5_core_dev *mdev,
 }
 
 int mlx5_query_nic_vport_mac_address(struct mlx5_core_dev *mdev,
-				     u16 vport, u8 *addr)
+				     u16 vport, bool other, u8 *addr)
 {
-	u32 *out;
 	int outlen = MLX5_ST_SZ_BYTES(query_nic_vport_context_out);
+	u32 in[MLX5_ST_SZ_DW(query_nic_vport_context_in)] = {};
 	u8 *out_addr;
+	u32 *out;
 	int err;
 
 	out = kvzalloc(outlen, GFP_KERNEL);
@@ -169,7 +170,12 @@ int mlx5_query_nic_vport_mac_address(struct mlx5_core_dev *mdev,
 	out_addr = MLX5_ADDR_OF(query_nic_vport_context_out, out,
 				nic_vport_context.permanent_address);
 
-	err = mlx5_query_nic_vport_context(mdev, vport, out, outlen);
+	MLX5_SET(query_nic_vport_context_in, in, opcode,
+		 MLX5_CMD_OP_QUERY_NIC_VPORT_CONTEXT);
+	MLX5_SET(query_nic_vport_context_in, in, vport_number, vport);
+	MLX5_SET(query_nic_vport_context_in, in, other_vport, other);
+
+	err = mlx5_cmd_exec(mdev, in, sizeof(in), out, outlen);
 	if (!err)
 		ether_addr_copy(addr, &out_addr[2]);
 
@@ -178,6 +184,12 @@ int mlx5_query_nic_vport_mac_address(struct mlx5_core_dev *mdev,
 }
 EXPORT_SYMBOL_GPL(mlx5_query_nic_vport_mac_address);
 
+int mlx5_query_mac_address(struct mlx5_core_dev *mdev, u8 *addr)
+{
+	return mlx5_query_nic_vport_mac_address(mdev, 0, false, addr);
+}
+EXPORT_SYMBOL_GPL(mlx5_query_mac_address);
+
 int mlx5_modify_nic_vport_mac_address(struct mlx5_core_dev *mdev,
 				      u16 vport, u8 *addr)
 {
@@ -194,9 +206,7 @@ int mlx5_modify_nic_vport_mac_address(struct mlx5_core_dev *mdev,
 	MLX5_SET(modify_nic_vport_context_in, in,
 		 field_select.permanent_address, 1);
 	MLX5_SET(modify_nic_vport_context_in, in, vport_number, vport);
-
-	if (vport)
-		MLX5_SET(modify_nic_vport_context_in, in, other_vport, 1);
+	MLX5_SET(modify_nic_vport_context_in, in, other_vport, 1);
 
 	nic_vport_ctx = MLX5_ADDR_OF(modify_nic_vport_context_in,
 				     in, nic_vport_context);
@@ -291,9 +301,7 @@ int mlx5_query_nic_vport_mac_list(struct mlx5_core_dev *dev,
 		 MLX5_CMD_OP_QUERY_NIC_VPORT_CONTEXT);
 	MLX5_SET(query_nic_vport_context_in, in, allowed_list_type, list_type);
 	MLX5_SET(query_nic_vport_context_in, in, vport_number, vport);
-
-	if (vport)
-		MLX5_SET(query_nic_vport_context_in, in, other_vport, 1);
+	MLX5_SET(query_nic_vport_context_in, in, other_vport, 1);
 
 	err = mlx5_cmd_exec(dev, in, sizeof(in), out, out_sz);
 	if (err)
@@ -483,7 +491,7 @@ int mlx5_modify_nic_vport_node_guid(struct mlx5_core_dev *mdev,
 	MLX5_SET(modify_nic_vport_context_in, in,
 		 field_select.node_guid, 1);
 	MLX5_SET(modify_nic_vport_context_in, in, vport_number, vport);
-	MLX5_SET(modify_nic_vport_context_in, in, other_vport, !!vport);
+	MLX5_SET(modify_nic_vport_context_in, in, other_vport, 1);
 
 	nic_vport_context = MLX5_ADDR_OF(modify_nic_vport_context_in,
 					 in, nic_vport_context);
diff --git a/include/linux/mlx5/vport.h b/include/linux/mlx5/vport.h
index 3d1c6cdbbba7..c147acc7bf70 100644
--- a/include/linux/mlx5/vport.h
+++ b/include/linux/mlx5/vport.h
@@ -69,7 +69,8 @@ u8 mlx5_query_vport_state(struct mlx5_core_dev *mdev, u8 opmod, u16 vport);
 int mlx5_modify_vport_admin_state(struct mlx5_core_dev *mdev, u8 opmod,
 				  u16 vport, u8 other_vport, u8 state);
 int mlx5_query_nic_vport_mac_address(struct mlx5_core_dev *mdev,
-				     u16 vport, u8 *addr);
+				     u16 vport, bool other, u8 *addr);
+int mlx5_query_mac_address(struct mlx5_core_dev *mdev, u8 *addr);
 int mlx5_query_nic_vport_min_inline(struct mlx5_core_dev *mdev,
 				    u16 vport, u8 *min_inline);
 void mlx5_query_min_inline(struct mlx5_core_dev *mdev, u8 *min_inline);
-- 
cgit v1.2.3


From f6455de0b0e52dcb11aeb503151b12ec87f9c5e4 Mon Sep 17 00:00:00 2001
From: Bodong Wang <bodong@mellanox.com>
Date: Fri, 28 Jun 2019 22:36:15 +0000
Subject: net/mlx5: E-Switch, Refactor eswitch SR-IOV interface

Devlink eswitch mode is not necessarily related to SR-IOV, e.g., ECPF
can be in offloads mode when SR-IOV is not enabled.

Rename the interface and eswitch mode names to decouple them from SR-IOV,
and clean up the eswitch messages accordingly.

Signed-off-by: Bodong Wang <bodong@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/infiniband/hw/mlx5/ib_rep.h                |  2 +-
 drivers/infiniband/hw/mlx5/main.c                  |  2 +-
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c  |  2 +-
 drivers/net/ethernet/mellanox/mlx5/core/en_rep.c   |  6 +-
 drivers/net/ethernet/mellanox/mlx5/core/en_tc.c    |  2 +-
 drivers/net/ethernet/mellanox/mlx5/core/eswitch.c  | 85 +++++++++++-----------
 drivers/net/ethernet/mellanox/mlx5/core/eswitch.h  |  8 +-
 .../ethernet/mellanox/mlx5/core/eswitch_offloads.c | 32 ++++----
 drivers/net/ethernet/mellanox/mlx5/core/lag.c      |  4 +-
 drivers/net/ethernet/mellanox/mlx5/core/sriov.c    |  4 +-
 include/linux/mlx5/eswitch.h                       |  6 +-
 11 files changed, 77 insertions(+), 76 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/infiniband/hw/mlx5/ib_rep.h b/drivers/infiniband/hw/mlx5/ib_rep.h
index 22adce2d6795..478503ce20df 100644
--- a/drivers/infiniband/hw/mlx5/ib_rep.h
+++ b/drivers/infiniband/hw/mlx5/ib_rep.h
@@ -28,7 +28,7 @@ struct net_device *mlx5_ib_get_rep_netdev(struct mlx5_eswitch *esw,
 #else /* CONFIG_MLX5_ESWITCH */
 static inline u8 mlx5_ib_eswitch_mode(struct mlx5_eswitch *esw)
 {
-	return SRIOV_NONE;
+	return MLX5_ESWITCH_NONE;
 }
 
 static inline
diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c
index 602ac3feea5d..798aa5e0941e 100644
--- a/drivers/infiniband/hw/mlx5/main.c
+++ b/drivers/infiniband/hw/mlx5/main.c
@@ -6814,7 +6814,7 @@ static void *mlx5_ib_add(struct mlx5_core_dev *mdev)
 	printk_once(KERN_INFO "%s", mlx5_version);
 
 	if (MLX5_ESWITCH_MANAGER(mdev) &&
-	    mlx5_ib_eswitch_mode(mdev->priv.eswitch) == SRIOV_OFFLOADS) {
+	    mlx5_ib_eswitch_mode(mdev->priv.eswitch) == MLX5_ESWITCH_OFFLOADS) {
 		if (!mlx5_core_mp_enabled(mdev))
 			mlx5_ib_register_vport_reps(mdev);
 		return mdev;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index bc9150f18116..f83fdb67e760 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -5115,7 +5115,7 @@ static void *mlx5e_add(struct mlx5_core_dev *mdev)
 
 #ifdef CONFIG_MLX5_ESWITCH
 	if (MLX5_ESWITCH_MANAGER(mdev) &&
-	    mlx5_eswitch_mode(mdev->priv.eswitch) == SRIOV_OFFLOADS) {
+	    mlx5_eswitch_mode(mdev->priv.eswitch) == MLX5_ESWITCH_OFFLOADS) {
 		mlx5e_rep_register_vport_reps(mdev);
 		return mdev;
 	}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
index abe8540d6879..ef6d61c1d886 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
@@ -391,7 +391,7 @@ static int mlx5e_rep_get_port_parent_id(struct net_device *dev,
 	struct mlx5e_priv *uplink_priv = NULL;
 	struct net_device *uplink_dev;
 
-	if (esw->mode == SRIOV_NONE)
+	if (esw->mode == MLX5_ESWITCH_NONE)
 		return -EOPNOTSUPP;
 
 	uplink_dev = mlx5_eswitch_uplink_get_proto_dev(esw, REP_ETH);
@@ -419,7 +419,7 @@ static void mlx5e_sqs2vport_stop(struct mlx5_eswitch *esw,
 	struct mlx5e_rep_sq *rep_sq, *tmp;
 	struct mlx5e_rep_priv *rpriv;
 
-	if (esw->mode != SRIOV_OFFLOADS)
+	if (esw->mode != MLX5_ESWITCH_OFFLOADS)
 		return;
 
 	rpriv = mlx5e_rep_to_rep_priv(rep);
@@ -440,7 +440,7 @@ static int mlx5e_sqs2vport_start(struct mlx5_eswitch *esw,
 	int err;
 	int i;
 
-	if (esw->mode != SRIOV_OFFLOADS)
+	if (esw->mode != MLX5_ESWITCH_OFFLOADS)
 		return 0;
 
 	rpriv = mlx5e_rep_to_rep_priv(rep);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
index 8ff1ca46d8d3..1ff9785c2f83 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
@@ -3342,7 +3342,7 @@ mlx5e_tc_add_flow(struct mlx5e_priv *priv,
 	if (!tc_can_offload_extack(priv->netdev, f->common.extack))
 		return -EOPNOTSUPP;
 
-	if (esw && esw->mode == SRIOV_OFFLOADS)
+	if (esw && esw->mode == MLX5_ESWITCH_OFFLOADS)
 		err = mlx5e_add_fdb_flow(priv, f, flow_flags,
 					 filter_dev, flow);
 	else
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
index a758755d7a08..b42540e1ba6e 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
@@ -497,7 +497,7 @@ static int esw_add_uc_addr(struct mlx5_eswitch *esw, struct vport_addr *vaddr)
 
 fdb_add:
 	/* SRIOV is enabled: Forward UC MAC to vport */
-	if (esw->fdb_table.legacy.fdb && esw->mode == SRIOV_LEGACY)
+	if (esw->fdb_table.legacy.fdb && esw->mode == MLX5_ESWITCH_LEGACY)
 		vaddr->flow_rule = esw_fdb_set_vport_rule(esw, mac, vport);
 
 	esw_debug(esw->dev, "\tADDED UC MAC: vport[%d] %pM fr(%p)\n",
@@ -1577,7 +1577,7 @@ static void esw_apply_vport_conf(struct mlx5_eswitch *esw,
 			       flags);
 
 	/* Only legacy mode needs ACLs */
-	if (esw->mode == SRIOV_LEGACY) {
+	if (esw->mode == MLX5_ESWITCH_LEGACY) {
 		esw_vport_ingress_config(esw, vport);
 		esw_vport_egress_config(esw, vport);
 	}
@@ -1629,7 +1629,7 @@ static void esw_enable_vport(struct mlx5_eswitch *esw, struct mlx5_vport *vport,
 	esw_debug(esw->dev, "Enabling VPORT(%d)\n", vport_num);
 
 	/* Create steering drop counters for ingress and egress ACLs */
-	if (vport_num && esw->mode == SRIOV_LEGACY)
+	if (vport_num && esw->mode == MLX5_ESWITCH_LEGACY)
 		esw_vport_create_drop_counters(vport);
 
 	/* Restore old vport configuration */
@@ -1683,7 +1683,7 @@ static void esw_disable_vport(struct mlx5_eswitch *esw,
 	vport->enabled_events = 0;
 	esw_vport_disable_qos(esw, vport);
 	if (esw->manager_vport != vport_num &&
-	    esw->mode == SRIOV_LEGACY) {
+	    esw->mode == MLX5_ESWITCH_LEGACY) {
 		mlx5_modify_vport_admin_state(esw->dev,
 					      MLX5_VPORT_STATE_OP_MOD_ESW_VPORT,
 					      vport_num, 1,
@@ -1728,7 +1728,7 @@ int mlx5_esw_query_functions(struct mlx5_core_dev *dev, u32 *out, int outlen)
 /* Public E-Switch API */
 #define ESW_ALLOWED(esw) ((esw) && MLX5_ESWITCH_MANAGER((esw)->dev))
 
-int mlx5_eswitch_enable_sriov(struct mlx5_eswitch *esw, int nvfs, int mode)
+int mlx5_eswitch_enable(struct mlx5_eswitch *esw, int nvfs, int mode)
 {
 	struct mlx5_vport *vport;
 	int total_nvports = 0;
@@ -1737,19 +1737,17 @@ int mlx5_eswitch_enable_sriov(struct mlx5_eswitch *esw, int nvfs, int mode)
 
 	if (!ESW_ALLOWED(esw) ||
 	    !MLX5_CAP_ESW_FLOWTABLE_FDB(esw->dev, ft_support)) {
-		esw_warn(esw->dev, "E-Switch FDB is not supported, aborting ...\n");
+		esw_warn(esw->dev, "FDB is not supported, aborting ...\n");
 		return -EOPNOTSUPP;
 	}
 
 	if (!MLX5_CAP_ESW_INGRESS_ACL(esw->dev, ft_support))
-		esw_warn(esw->dev, "E-Switch ingress ACL is not supported by FW\n");
+		esw_warn(esw->dev, "ingress ACL is not supported by FW\n");
 
 	if (!MLX5_CAP_ESW_EGRESS_ACL(esw->dev, ft_support))
-		esw_warn(esw->dev, "E-Switch engress ACL is not supported by FW\n");
+		esw_warn(esw->dev, "engress ACL is not supported by FW\n");
 
-	esw_info(esw->dev, "E-Switch enable SRIOV: nvfs(%d) mode (%d)\n", nvfs, mode);
-
-	if (mode == SRIOV_OFFLOADS) {
+	if (mode == MLX5_ESWITCH_OFFLOADS) {
 		if (mlx5_core_is_ecpf_esw_manager(esw->dev))
 			total_nvports = esw->total_vports;
 		else
@@ -1760,7 +1758,7 @@ int mlx5_eswitch_enable_sriov(struct mlx5_eswitch *esw, int nvfs, int mode)
 
 	mlx5_lag_update(esw->dev);
 
-	if (mode == SRIOV_LEGACY) {
+	if (mode == MLX5_ESWITCH_LEGACY) {
 		err = esw_create_legacy_table(esw);
 		if (err)
 			goto abort;
@@ -1777,11 +1775,11 @@ int mlx5_eswitch_enable_sriov(struct mlx5_eswitch *esw, int nvfs, int mode)
 	if (err)
 		esw_warn(esw->dev, "Failed to create eswitch TSAR");
 
-	/* Don't enable vport events when in SRIOV_OFFLOADS mode, since:
+	/* Don't enable vport events when in MLX5_ESWITCH_OFFLOADS mode, since:
 	 * 1. L2 table (MPFS) is programmed by PF/VF representors netdevs set_rx_mode
 	 * 2. FDB/Eswitch is programmed by user space tools
 	 */
-	enabled_events = (mode == SRIOV_LEGACY) ? SRIOV_VPORT_EVENTS : 0;
+	enabled_events = (mode == MLX5_ESWITCH_LEGACY) ? SRIOV_VPORT_EVENTS : 0;
 
 	/* Enable PF vport */
 	vport = mlx5_eswitch_get_vport(esw, MLX5_VPORT_PF);
@@ -1797,19 +1795,21 @@ int mlx5_eswitch_enable_sriov(struct mlx5_eswitch *esw, int nvfs, int mode)
 	mlx5_esw_for_each_vf_vport(esw, i, vport, nvfs)
 		esw_enable_vport(esw, vport, enabled_events);
 
-	if (mode == SRIOV_LEGACY) {
+	if (mode == MLX5_ESWITCH_LEGACY) {
 		MLX5_NB_INIT(&esw->nb, eswitch_vport_event, NIC_VPORT_CHANGE);
 		mlx5_eq_notifier_register(esw->dev, &esw->nb);
 	}
 
-	esw_info(esw->dev, "SRIOV enabled: active vports(%d)\n",
-		 esw->enabled_vports);
+	esw_info(esw->dev, "Enable: mode(%s), nvfs(%d), active vports(%d)\n",
+		 mode == MLX5_ESWITCH_LEGACY ? "LEGACY" : "OFFLOADS",
+		 nvfs, esw->enabled_vports);
+
 	return 0;
 
 abort:
-	esw->mode = SRIOV_NONE;
+	esw->mode = MLX5_ESWITCH_NONE;
 
-	if (mode == SRIOV_OFFLOADS) {
+	if (mode == MLX5_ESWITCH_OFFLOADS) {
 		mlx5_reload_interface(esw->dev, MLX5_INTERFACE_PROTOCOL_IB);
 		mlx5_reload_interface(esw->dev, MLX5_INTERFACE_PROTOCOL_ETH);
 	}
@@ -1817,22 +1817,23 @@ abort:
 	return err;
 }
 
-void mlx5_eswitch_disable_sriov(struct mlx5_eswitch *esw)
+void mlx5_eswitch_disable(struct mlx5_eswitch *esw)
 {
 	struct esw_mc_addr *mc_promisc;
 	struct mlx5_vport *vport;
 	int old_mode;
 	int i;
 
-	if (!ESW_ALLOWED(esw) || esw->mode == SRIOV_NONE)
+	if (!ESW_ALLOWED(esw) || esw->mode == MLX5_ESWITCH_NONE)
 		return;
 
-	esw_info(esw->dev, "disable SRIOV: active vports(%d) mode(%d)\n",
-		 esw->enabled_vports, esw->mode);
+	esw_info(esw->dev, "Disable: mode(%s), nvfs(%d), active vports(%d)\n",
+		 esw->mode == MLX5_ESWITCH_LEGACY ? "LEGACY" : "OFFLOADS",
+		 esw->dev->priv.sriov.num_vfs, esw->enabled_vports);
 
 	mc_promisc = &esw->mc_promisc;
 
-	if (esw->mode == SRIOV_LEGACY)
+	if (esw->mode == MLX5_ESWITCH_LEGACY)
 		mlx5_eq_notifier_unregister(esw->dev, &esw->nb);
 
 	mlx5_esw_for_all_vports(esw, i, vport)
@@ -1843,17 +1844,17 @@ void mlx5_eswitch_disable_sriov(struct mlx5_eswitch *esw)
 
 	esw_destroy_tsar(esw);
 
-	if (esw->mode == SRIOV_LEGACY)
+	if (esw->mode == MLX5_ESWITCH_LEGACY)
 		esw_destroy_legacy_table(esw);
-	else if (esw->mode == SRIOV_OFFLOADS)
+	else if (esw->mode == MLX5_ESWITCH_OFFLOADS)
 		esw_offloads_cleanup(esw);
 
 	old_mode = esw->mode;
-	esw->mode = SRIOV_NONE;
+	esw->mode = MLX5_ESWITCH_NONE;
 
 	mlx5_lag_update(esw->dev);
 
-	if (old_mode == SRIOV_OFFLOADS) {
+	if (old_mode == MLX5_ESWITCH_OFFLOADS) {
 		mlx5_reload_interface(esw->dev, MLX5_INTERFACE_PROTOCOL_IB);
 		mlx5_reload_interface(esw->dev, MLX5_INTERFACE_PROTOCOL_ETH);
 	}
@@ -1914,7 +1915,7 @@ int mlx5_eswitch_init(struct mlx5_core_dev *dev)
 	}
 
 	esw->enabled_vports = 0;
-	esw->mode = SRIOV_NONE;
+	esw->mode = MLX5_ESWITCH_NONE;
 	esw->offloads.inline_mode = MLX5_INLINE_MODE_NONE;
 	if (MLX5_CAP_ESW_FLOWTABLE_FDB(dev, reformat) &&
 	    MLX5_CAP_ESW_FLOWTABLE_FDB(dev, decap))
@@ -1984,7 +1985,7 @@ int mlx5_eswitch_set_vport_mac(struct mlx5_eswitch *esw,
 
 	ether_addr_copy(evport->info.mac, mac);
 	evport->info.node_guid = node_guid;
-	if (evport->enabled && esw->mode == SRIOV_LEGACY)
+	if (evport->enabled && esw->mode == MLX5_ESWITCH_LEGACY)
 		err = esw_vport_ingress_config(esw, evport);
 
 unlock:
@@ -2068,7 +2069,7 @@ int __mlx5_eswitch_set_vport_vlan(struct mlx5_eswitch *esw,
 
 	evport->info.vlan = vlan;
 	evport->info.qos = qos;
-	if (evport->enabled && esw->mode == SRIOV_LEGACY) {
+	if (evport->enabled && esw->mode == MLX5_ESWITCH_LEGACY) {
 		err = esw_vport_ingress_config(esw, evport);
 		if (err)
 			goto unlock;
@@ -2110,7 +2111,7 @@ int mlx5_eswitch_set_vport_spoofchk(struct mlx5_eswitch *esw,
 		mlx5_core_warn(esw->dev,
 			       "Spoofchk in set while MAC is invalid, vport(%d)\n",
 			       evport->vport);
-	if (evport->enabled && esw->mode == SRIOV_LEGACY)
+	if (evport->enabled && esw->mode == MLX5_ESWITCH_LEGACY)
 		err = esw_vport_ingress_config(esw, evport);
 	if (err)
 		evport->info.spoofchk = pschk;
@@ -2206,7 +2207,7 @@ int mlx5_eswitch_set_vepa(struct mlx5_eswitch *esw, u8 setting)
 		return -EPERM;
 
 	mutex_lock(&esw->state_lock);
-	if (esw->mode != SRIOV_LEGACY) {
+	if (esw->mode != MLX5_ESWITCH_LEGACY) {
 		err = -EOPNOTSUPP;
 		goto out;
 	}
@@ -2229,7 +2230,7 @@ int mlx5_eswitch_get_vepa(struct mlx5_eswitch *esw, u8 *setting)
 		return -EPERM;
 
 	mutex_lock(&esw->state_lock);
-	if (esw->mode != SRIOV_LEGACY) {
+	if (esw->mode != MLX5_ESWITCH_LEGACY) {
 		err = -EOPNOTSUPP;
 		goto out;
 	}
@@ -2372,7 +2373,7 @@ static int mlx5_eswitch_query_vport_drop_stats(struct mlx5_core_dev *dev,
 	u64 bytes = 0;
 	int err = 0;
 
-	if (!vport->enabled || esw->mode != SRIOV_LEGACY)
+	if (!vport->enabled || esw->mode != MLX5_ESWITCH_LEGACY)
 		return 0;
 
 	if (vport->egress.drop_counter)
@@ -2482,7 +2483,7 @@ free_out:
 
 u8 mlx5_eswitch_mode(struct mlx5_eswitch *esw)
 {
-	return ESW_ALLOWED(esw) ? esw->mode : SRIOV_NONE;
+	return ESW_ALLOWED(esw) ? esw->mode : MLX5_ESWITCH_NONE;
 }
 EXPORT_SYMBOL_GPL(mlx5_eswitch_mode);
 
@@ -2499,10 +2500,10 @@ EXPORT_SYMBOL(mlx5_eswitch_get_encap_mode);
 
 bool mlx5_esw_lag_prereq(struct mlx5_core_dev *dev0, struct mlx5_core_dev *dev1)
 {
-	if ((dev0->priv.eswitch->mode == SRIOV_NONE &&
-	     dev1->priv.eswitch->mode == SRIOV_NONE) ||
-	    (dev0->priv.eswitch->mode == SRIOV_OFFLOADS &&
-	     dev1->priv.eswitch->mode == SRIOV_OFFLOADS))
+	if ((dev0->priv.eswitch->mode == MLX5_ESWITCH_NONE &&
+	     dev1->priv.eswitch->mode == MLX5_ESWITCH_NONE) ||
+	    (dev0->priv.eswitch->mode == MLX5_ESWITCH_OFFLOADS &&
+	     dev1->priv.eswitch->mode == MLX5_ESWITCH_OFFLOADS))
 		return true;
 
 	return false;
@@ -2511,6 +2512,6 @@ bool mlx5_esw_lag_prereq(struct mlx5_core_dev *dev0, struct mlx5_core_dev *dev1)
 bool mlx5_esw_multipath_prereq(struct mlx5_core_dev *dev0,
 			       struct mlx5_core_dev *dev1)
 {
-	return (dev0->priv.eswitch->mode == SRIOV_OFFLOADS &&
-		dev1->priv.eswitch->mode == SRIOV_OFFLOADS);
+	return (dev0->priv.eswitch->mode == MLX5_ESWITCH_OFFLOADS &&
+		dev1->priv.eswitch->mode == MLX5_ESWITCH_OFFLOADS);
 }
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
index 335cbeee1b9e..273a17243275 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
@@ -253,8 +253,8 @@ void esw_vport_del_ingress_acl_modify_metadata(struct mlx5_eswitch *esw,
 /* E-Switch API */
 int mlx5_eswitch_init(struct mlx5_core_dev *dev);
 void mlx5_eswitch_cleanup(struct mlx5_eswitch *esw);
-int mlx5_eswitch_enable_sriov(struct mlx5_eswitch *esw, int nvfs, int mode);
-void mlx5_eswitch_disable_sriov(struct mlx5_eswitch *esw);
+int mlx5_eswitch_enable(struct mlx5_eswitch *esw, int nvfs, int mode);
+void mlx5_eswitch_disable(struct mlx5_eswitch *esw);
 int mlx5_eswitch_set_vport_mac(struct mlx5_eswitch *esw,
 			       int vport, u8 mac[ETH_ALEN]);
 int mlx5_eswitch_set_vport_state(struct mlx5_eswitch *esw,
@@ -528,8 +528,8 @@ bool mlx5_eswitch_is_vf_vport(const struct mlx5_eswitch *esw, u16 vport_num);
 /* eswitch API stubs */
 static inline int  mlx5_eswitch_init(struct mlx5_core_dev *dev) { return 0; }
 static inline void mlx5_eswitch_cleanup(struct mlx5_eswitch *esw) {}
-static inline int  mlx5_eswitch_enable_sriov(struct mlx5_eswitch *esw, int nvfs, int mode) { return 0; }
-static inline void mlx5_eswitch_disable_sriov(struct mlx5_eswitch *esw) {}
+static inline int  mlx5_eswitch_enable(struct mlx5_eswitch *esw, int nvfs, int mode) { return 0; }
+static inline void mlx5_eswitch_disable(struct mlx5_eswitch *esw) {}
 static inline bool mlx5_esw_lag_prereq(struct mlx5_core_dev *dev0, struct mlx5_core_dev *dev1) { return true; }
 static inline bool mlx5_eswitch_is_funcs_handler(struct mlx5_core_dev *dev) { return false; }
 static inline int
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
index b253bdf75dd6..a1beada1cdbf 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
@@ -147,7 +147,7 @@ mlx5_eswitch_add_offloaded_rule(struct mlx5_eswitch *esw,
 	struct mlx5_flow_table *fdb;
 	int j, i = 0;
 
-	if (esw->mode != SRIOV_OFFLOADS)
+	if (esw->mode != MLX5_ESWITCH_OFFLOADS)
 		return ERR_PTR(-EOPNOTSUPP);
 
 	flow_act.action = attr->action;
@@ -1358,19 +1358,19 @@ static int esw_offloads_start(struct mlx5_eswitch *esw,
 {
 	int err, err1, num_vfs = esw->dev->priv.sriov.num_vfs;
 
-	if (esw->mode != SRIOV_LEGACY &&
+	if (esw->mode != MLX5_ESWITCH_LEGACY &&
 	    !mlx5_core_is_ecpf_esw_manager(esw->dev)) {
 		NL_SET_ERR_MSG_MOD(extack,
 				   "Can't set offloads mode, SRIOV legacy not enabled");
 		return -EINVAL;
 	}
 
-	mlx5_eswitch_disable_sriov(esw);
-	err = mlx5_eswitch_enable_sriov(esw, num_vfs, SRIOV_OFFLOADS);
+	mlx5_eswitch_disable(esw);
+	err = mlx5_eswitch_enable(esw, num_vfs, MLX5_ESWITCH_OFFLOADS);
 	if (err) {
 		NL_SET_ERR_MSG_MOD(extack,
 				   "Failed setting eswitch to offloads");
-		err1 = mlx5_eswitch_enable_sriov(esw, num_vfs, SRIOV_LEGACY);
+		err1 = mlx5_eswitch_enable(esw, num_vfs, MLX5_ESWITCH_LEGACY);
 		if (err1) {
 			NL_SET_ERR_MSG_MOD(extack,
 					   "Failed setting eswitch back to legacy");
@@ -2174,11 +2174,11 @@ static int esw_offloads_stop(struct mlx5_eswitch *esw,
 {
 	int err, err1, num_vfs = esw->dev->priv.sriov.num_vfs;
 
-	mlx5_eswitch_disable_sriov(esw);
-	err = mlx5_eswitch_enable_sriov(esw, num_vfs, SRIOV_LEGACY);
+	mlx5_eswitch_disable(esw);
+	err = mlx5_eswitch_enable(esw, num_vfs, MLX5_ESWITCH_LEGACY);
 	if (err) {
 		NL_SET_ERR_MSG_MOD(extack, "Failed setting eswitch to legacy");
-		err1 = mlx5_eswitch_enable_sriov(esw, num_vfs, SRIOV_OFFLOADS);
+		err1 = mlx5_eswitch_enable(esw, num_vfs, MLX5_ESWITCH_OFFLOADS);
 		if (err1) {
 			NL_SET_ERR_MSG_MOD(extack,
 					   "Failed setting eswitch back to offloads");
@@ -2203,10 +2203,10 @@ static int esw_mode_from_devlink(u16 mode, u16 *mlx5_mode)
 {
 	switch (mode) {
 	case DEVLINK_ESWITCH_MODE_LEGACY:
-		*mlx5_mode = SRIOV_LEGACY;
+		*mlx5_mode = MLX5_ESWITCH_LEGACY;
 		break;
 	case DEVLINK_ESWITCH_MODE_SWITCHDEV:
-		*mlx5_mode = SRIOV_OFFLOADS;
+		*mlx5_mode = MLX5_ESWITCH_OFFLOADS;
 		break;
 	default:
 		return -EINVAL;
@@ -2218,10 +2218,10 @@ static int esw_mode_from_devlink(u16 mode, u16 *mlx5_mode)
 static int esw_mode_to_devlink(u16 mlx5_mode, u16 *mode)
 {
 	switch (mlx5_mode) {
-	case SRIOV_LEGACY:
+	case MLX5_ESWITCH_LEGACY:
 		*mode = DEVLINK_ESWITCH_MODE_LEGACY;
 		break;
-	case SRIOV_OFFLOADS:
+	case MLX5_ESWITCH_OFFLOADS:
 		*mode = DEVLINK_ESWITCH_MODE_SWITCHDEV;
 		break;
 	default:
@@ -2285,7 +2285,7 @@ static int mlx5_devlink_eswitch_check(struct devlink *devlink)
 	if(!MLX5_ESWITCH_MANAGER(dev))
 		return -EPERM;
 
-	if (dev->priv.eswitch->mode == SRIOV_NONE &&
+	if (dev->priv.eswitch->mode == MLX5_ESWITCH_NONE &&
 	    !mlx5_core_is_ecpf_esw_manager(dev))
 		return -EOPNOTSUPP;
 
@@ -2408,7 +2408,7 @@ int mlx5_eswitch_inline_mode_get(struct mlx5_eswitch *esw, int nvfs, u8 *mode)
 	if (!MLX5_CAP_GEN(dev, vport_group_manager))
 		return -EOPNOTSUPP;
 
-	if (esw->mode == SRIOV_NONE)
+	if (esw->mode == MLX5_ESWITCH_NONE)
 		return -EOPNOTSUPP;
 
 	switch (MLX5_CAP_ETH(dev, wqe_inline_mode)) {
@@ -2455,7 +2455,7 @@ int mlx5_devlink_eswitch_encap_mode_set(struct devlink *devlink,
 	if (encap && encap != DEVLINK_ESWITCH_ENCAP_MODE_BASIC)
 		return -EOPNOTSUPP;
 
-	if (esw->mode == SRIOV_LEGACY) {
+	if (esw->mode == MLX5_ESWITCH_LEGACY) {
 		esw->offloads.encap = encap;
 		return 0;
 	}
@@ -2522,7 +2522,7 @@ void mlx5_eswitch_unregister_vport_reps(struct mlx5_eswitch *esw, u8 rep_type)
 	struct mlx5_eswitch_rep *rep;
 	int i;
 
-	if (esw->mode == SRIOV_OFFLOADS)
+	if (esw->mode == MLX5_ESWITCH_OFFLOADS)
 		__unload_reps_all_vport(esw, max_vf, rep_type);
 
 	mlx5_esw_for_all_reps(esw, i, rep)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag.c b/drivers/net/ethernet/mellanox/mlx5/core/lag.c
index 959605559858..c5ef2ff26465 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/lag.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lag.c
@@ -305,8 +305,8 @@ static void mlx5_do_bond(struct mlx5_lag *ldev)
 			   !mlx5_sriov_is_enabled(dev1);
 
 #ifdef CONFIG_MLX5_ESWITCH
-		roce_lag &= dev0->priv.eswitch->mode == SRIOV_NONE &&
-			    dev1->priv.eswitch->mode == SRIOV_NONE;
+		roce_lag &= dev0->priv.eswitch->mode == MLX5_ESWITCH_NONE &&
+			    dev1->priv.eswitch->mode == MLX5_ESWITCH_NONE;
 #endif
 
 		if (roce_lag)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/sriov.c b/drivers/net/ethernet/mellanox/mlx5/core/sriov.c
index 9d9ff4511306..d4c90f029f49 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/sriov.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/sriov.c
@@ -77,7 +77,7 @@ static int mlx5_device_enable_sriov(struct mlx5_core_dev *dev, int num_vfs)
 	if (!MLX5_ESWITCH_MANAGER(dev))
 		goto enable_vfs_hca;
 
-	err = mlx5_eswitch_enable_sriov(dev->priv.eswitch, num_vfs, SRIOV_LEGACY);
+	err = mlx5_eswitch_enable(dev->priv.eswitch, num_vfs, MLX5_ESWITCH_LEGACY);
 	if (err) {
 		mlx5_core_warn(dev,
 			       "failed to enable eswitch SRIOV (%d)\n", err);
@@ -126,7 +126,7 @@ static void mlx5_device_disable_sriov(struct mlx5_core_dev *dev)
 	}
 
 	if (MLX5_ESWITCH_MANAGER(dev))
-		mlx5_eswitch_disable_sriov(dev->priv.eswitch);
+		mlx5_eswitch_disable(dev->priv.eswitch);
 
 	if (mlx5_wait_for_pages(dev, &dev->priv.vfs_pages))
 		mlx5_core_warn(dev, "timeout reclaiming VFs pages\n");
diff --git a/include/linux/mlx5/eswitch.h b/include/linux/mlx5/eswitch.h
index 36cb641188b0..d4731199edb4 100644
--- a/include/linux/mlx5/eswitch.h
+++ b/include/linux/mlx5/eswitch.h
@@ -12,9 +12,9 @@
 #define MLX5_ESWITCH_MANAGER(mdev) MLX5_CAP_GEN(mdev, eswitch_manager)
 
 enum {
-	SRIOV_NONE,
-	SRIOV_LEGACY,
-	SRIOV_OFFLOADS
+	MLX5_ESWITCH_NONE,
+	MLX5_ESWITCH_LEGACY,
+	MLX5_ESWITCH_OFFLOADS
 };
 
 enum {
-- 
cgit v1.2.3


From 411ec9e0b45792e2ac7c55f94a635d5ce894910b Mon Sep 17 00:00:00 2001
From: Bodong Wang <bodong@mellanox.com>
Date: Fri, 28 Jun 2019 22:36:22 +0000
Subject: net/mlx5: E-Switch, Consider host PF for inline mode and vlan pop

When ECPF is the eswitch manager, the host PF is treated like the VFs.
The driver should do the same for inline mode and vlan pop.

Add new iterators to include host PF if ECPF is the eswitch manager.

Signed-off-by: Bodong Wang <bodong@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/eswitch.c  |  1 +
 drivers/net/ethernet/mellanox/mlx5/core/eswitch.h  | 26 ++++++++++++++++++++++
 .../ethernet/mellanox/mlx5/core/eswitch_offloads.c | 13 ++++++-----
 include/linux/mlx5/vport.h                         |  1 +
 4 files changed, 35 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
index b256f397f112..935b9429bb2a 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
@@ -1891,6 +1891,7 @@ int mlx5_eswitch_init(struct mlx5_core_dev *dev)
 
 	esw->dev = dev;
 	esw->manager_vport = mlx5_eswitch_manager_vport(dev);
+	esw->first_host_vport = mlx5_eswitch_first_host_vport_num(dev);
 
 	esw->work_queue = create_singlethread_workqueue("mlx5_esw_wq");
 	if (!esw->work_queue) {
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
index bfc32bcbf544..f59183440d7f 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
@@ -227,6 +227,7 @@ struct mlx5_eswitch {
 	int                     mode;
 	int                     nvports;
 	u16                     manager_vport;
+	u16                     first_host_vport;
 	struct mlx5_esw_functions esw_funcs;
 };
 
@@ -422,6 +423,12 @@ static inline u16 mlx5_eswitch_manager_vport(struct mlx5_core_dev *dev)
 		MLX5_VPORT_ECPF : MLX5_VPORT_PF;
 }
 
+static inline u16 mlx5_eswitch_first_host_vport_num(struct mlx5_core_dev *dev)
+{
+	return mlx5_core_is_ecpf_esw_manager(dev) ?
+		MLX5_VPORT_PF : MLX5_VPORT_FIRST_VF;
+}
+
 static inline bool mlx5_eswitch_is_funcs_handler(struct mlx5_core_dev *dev)
 {
 	/* Ideally device should have the functions changed supported
@@ -518,6 +525,25 @@ void mlx5e_tc_clean_fdb_peer_flows(struct mlx5_eswitch *esw);
 #define mlx5_esw_for_each_vf_vport_num_reverse(esw, vport, nvfs)	\
 	for ((vport) = (nvfs); (vport) >= MLX5_VPORT_FIRST_VF; (vport)--)
 
+/* Includes host PF (vport 0) if it's not esw manager. */
+#define mlx5_esw_for_each_host_func_rep(esw, i, rep, nvfs)	\
+	for ((i) = (esw)->first_host_vport;			\
+	     (rep) = &(esw)->offloads.vport_reps[i],		\
+	     (i) <= (nvfs); (i)++)
+
+#define mlx5_esw_for_each_host_func_rep_reverse(esw, i, rep, nvfs)	\
+	for ((i) = (nvfs);						\
+	     (rep) = &(esw)->offloads.vport_reps[i],			\
+	     (i) >= (esw)->first_host_vport; (i)--)
+
+#define mlx5_esw_for_each_host_func_vport(esw, vport, nvfs)	\
+	for ((vport) = (esw)->first_host_vport;			\
+	     (vport) <= (nvfs); (vport)++)
+
+#define mlx5_esw_for_each_host_func_vport_reverse(esw, vport, nvfs)	\
+	for ((vport) = (nvfs);						\
+	     (vport) >= (esw)->first_host_vport; (vport)--)
+
 struct mlx5_vport *__must_check
 mlx5_eswitch_get_vport(struct mlx5_eswitch *esw, u16 vport_num);
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
index aecfb636fbc6..50e5841c1698 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
@@ -344,10 +344,10 @@ mlx5_eswitch_del_fwd_rule(struct mlx5_eswitch *esw,
 static int esw_set_global_vlan_pop(struct mlx5_eswitch *esw, u8 val)
 {
 	struct mlx5_eswitch_rep *rep;
-	int vf_vport, err = 0;
+	int i, err = 0;
 
 	esw_debug(esw->dev, "%s applying global %s policy\n", __func__, val ? "pop" : "none");
-	mlx5_esw_for_each_vf_rep(esw, vf_vport, rep, esw->esw_funcs.num_vfs) {
+	mlx5_esw_for_each_host_func_rep(esw, i, rep, esw->esw_funcs.num_vfs) {
 		if (atomic_read(&rep->rep_data[REP_ETH].state) != REP_LOADED)
 			continue;
 
@@ -2330,7 +2330,7 @@ int mlx5_devlink_eswitch_inline_mode_set(struct devlink *devlink, u8 mode,
 	if (err)
 		goto out;
 
-	mlx5_esw_for_each_vf_vport_num(esw, vport, esw->esw_funcs.num_vfs) {
+	mlx5_esw_for_each_host_func_vport(esw, vport, esw->esw_funcs.num_vfs) {
 		err = mlx5_modify_nic_vport_min_inline(dev, vport, mlx5_mode);
 		if (err) {
 			NL_SET_ERR_MSG_MOD(extack,
@@ -2344,7 +2344,7 @@ int mlx5_devlink_eswitch_inline_mode_set(struct devlink *devlink, u8 mode,
 
 revert_inline_mode:
 	num_vport = --vport;
-	mlx5_esw_for_each_vf_vport_num_reverse(esw, vport, num_vport)
+	mlx5_esw_for_each_host_func_vport_reverse(esw, vport, num_vport)
 		mlx5_modify_nic_vport_min_inline(dev,
 						 vport,
 						 esw->offloads.inline_mode);
@@ -2389,9 +2389,10 @@ int mlx5_eswitch_inline_mode_get(struct mlx5_eswitch *esw, u8 *mode)
 	}
 
 query_vports:
-	mlx5_esw_for_each_vf_vport_num(esw, vport, esw->esw_funcs.num_vfs) {
+	mlx5_query_nic_vport_min_inline(dev, esw->first_host_vport, &prev_mlx5_mode);
+	mlx5_esw_for_each_host_func_vport(esw, vport, esw->esw_funcs.num_vfs) {
 		mlx5_query_nic_vport_min_inline(dev, vport, &mlx5_mode);
-		if (vport > 1 && prev_mlx5_mode != mlx5_mode)
+		if (prev_mlx5_mode != mlx5_mode)
 			return -EINVAL;
 		prev_mlx5_mode = mlx5_mode;
 	}
diff --git a/include/linux/mlx5/vport.h b/include/linux/mlx5/vport.h
index c147acc7bf70..6cbf29229749 100644
--- a/include/linux/mlx5/vport.h
+++ b/include/linux/mlx5/vport.h
@@ -58,6 +58,7 @@ enum {
 	MLX5_CAP_INLINE_MODE_NOT_REQUIRED,
 };
 
+/* The vport number of each function must remain unchanged */
 enum {
 	MLX5_VPORT_PF			= 0x0,
 	MLX5_VPORT_FIRST_VF		= 0x1,
-- 
cgit v1.2.3


From 498b98e939007f8bb65094dfa229e84b6bf30e62 Mon Sep 17 00:00:00 2001
From: Bjorn Andersson <bjorn.andersson@linaro.org>
Date: Fri, 21 Jun 2019 18:21:45 -0700
Subject: soc: qcom: mdt_loader: Support loading non-split images

In some software releases the firmware images are not split up with each
loadable segment in its own file. Check the size of the loaded firmware
to see if it still contains each segment to be loaded, before falling
back to the split-out segments.

Acked-by: Andy Gross <agross@kernel.org>
Reviewed-by: Jeffrey Hugo <jeffrey.l.hugo@gmail.com>
Signed-off-by: Bjorn Andersson <bjorn.andersson@linaro.org>
---
 drivers/soc/qcom/mdt_loader.c       | 88 +++++++++++++++++++++++++++++++++++--
 include/linux/soc/qcom/mdt_loader.h |  2 +
 2 files changed, 87 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/soc/qcom/mdt_loader.c b/drivers/soc/qcom/mdt_loader.c
index 1c488024c698..1c970d6dd532 100644
--- a/drivers/soc/qcom/mdt_loader.c
+++ b/drivers/soc/qcom/mdt_loader.c
@@ -74,6 +74,66 @@ ssize_t qcom_mdt_get_size(const struct firmware *fw)
 }
 EXPORT_SYMBOL_GPL(qcom_mdt_get_size);
 
+/**
+ * qcom_mdt_read_metadata() - read header and metadata from mdt or mbn
+ * @fw:		firmware of mdt header or mbn
+ * @data_len:	length of the read metadata blob
+ *
+ * The mechanism that performs the authentication of the loading firmware
+ * expects an ELF header directly followed by the segment of hashes, with no
+ * padding in between. This function allocates a chunk of memory for this
+ * pair and copies the two pieces into the buffer.
+ *
+ * In the case of split firmware the hash is found directly following the ELF
+ * header, rather than at p_offset described by the second program header.
+ *
+ * The caller is responsible for freeing the returned pointer with kfree().
+ *
+ * Return: pointer to data, or ERR_PTR()
+ */
+void *qcom_mdt_read_metadata(const struct firmware *fw, size_t *data_len)
+{
+	const struct elf32_phdr *phdrs;
+	const struct elf32_hdr *ehdr;
+	size_t hash_offset;
+	size_t hash_size;
+	size_t ehdr_size;
+	void *data;
+
+	ehdr = (struct elf32_hdr *)fw->data;
+	phdrs = (struct elf32_phdr *)(ehdr + 1);
+
+	if (ehdr->e_phnum < 2)
+		return ERR_PTR(-EINVAL);
+
+	if (phdrs[0].p_type == PT_LOAD || phdrs[1].p_type == PT_LOAD)
+		return ERR_PTR(-EINVAL);
+
+	if ((phdrs[1].p_flags & QCOM_MDT_TYPE_MASK) != QCOM_MDT_TYPE_HASH)
+		return ERR_PTR(-EINVAL);
+
+	ehdr_size = phdrs[0].p_filesz;
+	hash_size = phdrs[1].p_filesz;
+
+	data = kmalloc(ehdr_size + hash_size, GFP_KERNEL);
+	if (!data)
+		return ERR_PTR(-ENOMEM);
+
+	/* Is the header and hash already packed */
+	if (ehdr_size + hash_size == fw->size)
+		hash_offset = phdrs[0].p_filesz;
+	else
+		hash_offset = phdrs[1].p_offset;
+
+	memcpy(data, fw->data, ehdr_size);
+	memcpy(data + ehdr_size, fw->data + hash_offset, hash_size);
+
+	*data_len = ehdr_size + hash_size;
+
+	return data;
+}
+EXPORT_SYMBOL_GPL(qcom_mdt_read_metadata);
+
 static int __qcom_mdt_load(struct device *dev, const struct firmware *fw,
 			   const char *firmware, int pas_id, void *mem_region,
 			   phys_addr_t mem_phys, size_t mem_size,
@@ -86,12 +146,14 @@ static int __qcom_mdt_load(struct device *dev, const struct firmware *fw,
 	phys_addr_t mem_reloc;
 	phys_addr_t min_addr = PHYS_ADDR_MAX;
 	phys_addr_t max_addr = 0;
+	size_t metadata_len;
 	size_t fw_name_len;
 	ssize_t offset;
+	void *metadata;
 	char *fw_name;
 	bool relocate = false;
 	void *ptr;
-	int ret;
+	int ret = 0;
 	int i;
 
 	if (!fw || !mem_region || !mem_phys || !mem_size)
@@ -109,7 +171,15 @@ static int __qcom_mdt_load(struct device *dev, const struct firmware *fw,
 		return -ENOMEM;
 
 	if (pas_init) {
-		ret = qcom_scm_pas_init_image(pas_id, fw->data, fw->size);
+		metadata = qcom_mdt_read_metadata(fw, &metadata_len);
+		if (IS_ERR(metadata)) {
+			ret = PTR_ERR(metadata);
+			goto out;
+		}
+
+		ret = qcom_scm_pas_init_image(pas_id, metadata, metadata_len);
+
+		kfree(metadata);
 		if (ret) {
 			dev_err(dev, "invalid firmware metadata\n");
 			goto out;
@@ -170,7 +240,19 @@ static int __qcom_mdt_load(struct device *dev, const struct firmware *fw,
 
 		ptr = mem_region + offset;
 
-		if (phdr->p_filesz) {
+		if (phdr->p_filesz && phdr->p_offset < fw->size) {
+			/* Firmware is large enough to be non-split */
+			if (phdr->p_offset + phdr->p_filesz > fw->size) {
+				dev_err(dev,
+					"failed to load segment %d from truncated file %s\n",
+					i, firmware);
+				ret = -EINVAL;
+				break;
+			}
+
+			memcpy(ptr, fw->data + phdr->p_offset, phdr->p_filesz);
+		} else if (phdr->p_filesz) {
+			/* Firmware not large enough, load split-out segments */
 			sprintf(fw_name + fw_name_len - 3, "b%02d", i);
 			ret = request_firmware_into_buf(&seg_fw, fw_name, dev,
 							ptr, phdr->p_filesz);
diff --git a/include/linux/soc/qcom/mdt_loader.h b/include/linux/soc/qcom/mdt_loader.h
index 944b06aefb0f..e600baec6825 100644
--- a/include/linux/soc/qcom/mdt_loader.h
+++ b/include/linux/soc/qcom/mdt_loader.h
@@ -21,4 +21,6 @@ int qcom_mdt_load_no_init(struct device *dev, const struct firmware *fw,
 			  const char *fw_name, int pas_id, void *mem_region,
 			  phys_addr_t mem_phys, size_t mem_size,
 			  phys_addr_t *reloc_base);
+void *qcom_mdt_read_metadata(const struct firmware *fw, size_t *data_len);
+
 #endif
-- 
cgit v1.2.3


From 362b87f5b1c6603b72699e8bb18661ecc4efc0bb Mon Sep 17 00:00:00 2001
From: "Jason A. Donenfeld" <Jason@zx2c4.com>
Date: Fri, 28 Jun 2019 16:40:21 +0200
Subject: netlink: use 48 byte ctx instead of 6 signed longs for callback

People are inclined to stuff random things into cb->args[n] because it
looks like an array of integers. Sometimes people even put u64s in there
with comments noting that a certain member takes up two slots. The
horror! Really this should mirror the usage of skb->cb, which are just
48 opaque bytes suitable for casting a struct. Then people can create
their usual casting macros for accessing strongly typed members of a
struct.

As a plus, this also gives us the same amount of space on 32bit and 64bit.

Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Reviewed-by: Johannes Berg <johannes@sipsolutions.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netlink.h | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/netlink.h b/include/linux/netlink.h
index 593d1b9c33a8..205fa7b1f07a 100644
--- a/include/linux/netlink.h
+++ b/include/linux/netlink.h
@@ -192,7 +192,14 @@ struct netlink_callback {
 	bool			strict_check;
 	u16			answer_flags;
 	unsigned int		prev_seq, seq;
-	long			args[6];
+	union {
+		u8		ctx[48];
+
+		/* args is deprecated. Cast a struct over ctx instead
+		 * for proper type safety.
+		 */
+		long		args[6];
+	};
 };
 
 struct netlink_notify {
-- 
cgit v1.2.3


From e33d2b74d805af0e4c8060f41040595ba105a520 Mon Sep 17 00:00:00 2001
From: Cong Wang <xiyou.wangcong@gmail.com>
Date: Fri, 28 Jun 2019 11:03:41 -0700
Subject: idr: fix overflow case for idr_for_each_entry_ul()

idr_for_each_entry_ul() is buggy as it can't handle the overflow
case correctly. When we have an ID == UINT_MAX, it becomes an
infinite loop. This happens when running on a 32-bit CPU where
unsigned long has the same size as unsigned int.

There is no better way to fix this than casting it to a larger
integer, but we can't just use a 64 bit integer on a 32 bit CPU. Instead
we could just use an additional integer to help us to detect this
overflow case, that is, adding a new parameter to this macro.
Fortunately tc action is its only user right now.

Fixes: 65a206c01e8e ("net/sched: Change act_api and act_xxx modules to use IDR")
Reported-by: Li Shuang <shuali@redhat.com>
Tested-by: Davide Caratti <dcaratti@redhat.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Chris Mi <chrism@mellanox.com>
Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/idr.h | 7 +++++--
 net/sched/act_api.c | 9 ++++++---
 2 files changed, 11 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/idr.h b/include/linux/idr.h
index ee7abae143d3..68528a72d10d 100644
--- a/include/linux/idr.h
+++ b/include/linux/idr.h
@@ -191,14 +191,17 @@ static inline void idr_preload_end(void)
  * idr_for_each_entry_ul() - Iterate over an IDR's elements of a given type.
  * @idr: IDR handle.
  * @entry: The type * to use as cursor.
+ * @tmp: A temporary placeholder for ID.
  * @id: Entry ID.
  *
  * @entry and @id do not need to be initialized before the loop, and
  * after normal termination @entry is left with the value NULL.  This
  * is convenient for a "not found" value.
  */
-#define idr_for_each_entry_ul(idr, entry, id)			\
-	for (id = 0; ((entry) = idr_get_next_ul(idr, &(id))) != NULL; ++id)
+#define idr_for_each_entry_ul(idr, entry, tmp, id)			\
+	for (tmp = 0, id = 0;						\
+	     ((entry) = tmp <= id ? idr_get_next_ul(idr, &(id)) : NULL) != NULL; \
+	     tmp = id, ++id)
 
 /**
  * idr_for_each_entry_continue() - Continue iteration over an IDR's elements of a given type
diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index 4e5d2e9ace5d..339712296164 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -221,12 +221,13 @@ static int tcf_dump_walker(struct tcf_idrinfo *idrinfo, struct sk_buff *skb,
 	struct idr *idr = &idrinfo->action_idr;
 	struct tc_action *p;
 	unsigned long id = 1;
+	unsigned long tmp;
 
 	mutex_lock(&idrinfo->lock);
 
 	s_i = cb->args[0];
 
-	idr_for_each_entry_ul(idr, p, id) {
+	idr_for_each_entry_ul(idr, p, tmp, id) {
 		index++;
 		if (index < s_i)
 			continue;
@@ -292,6 +293,7 @@ static int tcf_del_walker(struct tcf_idrinfo *idrinfo, struct sk_buff *skb,
 	struct idr *idr = &idrinfo->action_idr;
 	struct tc_action *p;
 	unsigned long id = 1;
+	unsigned long tmp;
 
 	nest = nla_nest_start_noflag(skb, 0);
 	if (nest == NULL)
@@ -300,7 +302,7 @@ static int tcf_del_walker(struct tcf_idrinfo *idrinfo, struct sk_buff *skb,
 		goto nla_put_failure;
 
 	mutex_lock(&idrinfo->lock);
-	idr_for_each_entry_ul(idr, p, id) {
+	idr_for_each_entry_ul(idr, p, tmp, id) {
 		ret = tcf_idr_release_unsafe(p);
 		if (ret == ACT_P_DELETED) {
 			module_put(ops->owner);
@@ -533,8 +535,9 @@ void tcf_idrinfo_destroy(const struct tc_action_ops *ops,
 	struct tc_action *p;
 	int ret;
 	unsigned long id = 1;
+	unsigned long tmp;
 
-	idr_for_each_entry_ul(idr, p, id) {
+	idr_for_each_entry_ul(idr, p, tmp, id) {
 		ret = __tcf_idr_release(p, false, true);
 		if (ret == ACT_P_DELETED)
 			module_put(ops->owner);
-- 
cgit v1.2.3


From d39d714969cda5cbda291402c8c6b1fb1047f42e Mon Sep 17 00:00:00 2001
From: Cong Wang <xiyou.wangcong@gmail.com>
Date: Fri, 28 Jun 2019 11:03:42 -0700
Subject: idr: introduce idr_for_each_entry_continue_ul()

Similarly, other callers of idr_get_next_ul() suffer the same
overflow bug as they don't handle it properly either.

Introduce idr_for_each_entry_continue_ul() to help these callers
iterate from a given ID.

cls_flower needs more care here because it still overflows when it
does arg->cookie++, so we have to fold its nested loops into one
and remove the arg->cookie++.

Fixes: 01683a146999 ("net: sched: refactor flower walk to iterate over idr")
Fixes: 12d6066c3b29 ("net/mlx5: Add flow counters idr")
Reported-by: Li Shuang <shuali@redhat.com>
Cc: Davide Caratti <dcaratti@redhat.com>
Cc: Vlad Buslov <vladbu@mellanox.com>
Cc: Chris Mi <chrism@mellanox.com>
Cc: Matthew Wilcox <willy@infradead.org>
Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
Tested-by: Davide Caratti <dcaratti@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 .../net/ethernet/mellanox/mlx5/core/fs_counters.c  | 10 ++++----
 include/linux/idr.h                                | 14 +++++++++++
 net/sched/cls_flower.c                             | 27 ++++++----------------
 3 files changed, 27 insertions(+), 24 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c
index c6c28f56aa29..b3762123a69c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c
@@ -102,13 +102,15 @@ static struct list_head *mlx5_fc_counters_lookup_next(struct mlx5_core_dev *dev,
 	struct mlx5_fc_stats *fc_stats = &dev->priv.fc_stats;
 	unsigned long next_id = (unsigned long)id + 1;
 	struct mlx5_fc *counter;
+	unsigned long tmp;
 
 	rcu_read_lock();
 	/* skip counters that are in idr, but not yet in counters list */
-	while ((counter = idr_get_next_ul(&fc_stats->counters_idr,
-					  &next_id)) != NULL &&
-	       list_empty(&counter->list))
-		next_id++;
+	idr_for_each_entry_continue_ul(&fc_stats->counters_idr,
+				       counter, tmp, next_id) {
+		if (!list_empty(&counter->list))
+			break;
+	}
 	rcu_read_unlock();
 
 	return counter ? &counter->list : &fc_stats->counters;
diff --git a/include/linux/idr.h b/include/linux/idr.h
index 68528a72d10d..4ec8986e5dfb 100644
--- a/include/linux/idr.h
+++ b/include/linux/idr.h
@@ -216,6 +216,20 @@ static inline void idr_preload_end(void)
 	     entry;							\
 	     ++id, (entry) = idr_get_next((idr), &(id)))
 
+/**
+ * idr_for_each_entry_continue_ul() - Continue iteration over an IDR's elements of a given type
+ * @idr: IDR handle.
+ * @entry: The type * to use as a cursor; NULL once the loop ends normally.
+ * @tmp: Previous ID, used to detect wrap-around of @id.
+ * @id: Entry ID.
+ *
+ * Continue to iterate over entries, continuing after the current position.
+ */
+#define idr_for_each_entry_continue_ul(idr, entry, tmp, id)		\
+	for (tmp = id;							\
+	     ((entry) = tmp <= id ? idr_get_next_ul(idr, &(id)) : NULL) != NULL; \
+	     tmp = id, ++id)
+
+
 /*
  * IDA - ID Allocator, use when translation from id to pointer isn't necessary.
  */
diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index eedd5786c084..fdeede3af72e 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c
@@ -524,24 +524,6 @@ static struct cls_fl_filter *__fl_get(struct cls_fl_head *head, u32 handle)
 	return f;
 }
 
-static struct cls_fl_filter *fl_get_next_filter(struct tcf_proto *tp,
-						unsigned long *handle)
-{
-	struct cls_fl_head *head = fl_head_dereference(tp);
-	struct cls_fl_filter *f;
-
-	rcu_read_lock();
-	while ((f = idr_get_next_ul(&head->handle_idr, handle))) {
-		/* don't return filters that are being deleted */
-		if (refcount_inc_not_zero(&f->refcnt))
-			break;
-		++(*handle);
-	}
-	rcu_read_unlock();
-
-	return f;
-}
-
 static int __fl_delete(struct tcf_proto *tp, struct cls_fl_filter *f,
 		       bool *last, bool rtnl_held,
 		       struct netlink_ext_ack *extack)
@@ -1691,20 +1673,25 @@ static int fl_delete(struct tcf_proto *tp, void *arg, bool *last,
 static void fl_walk(struct tcf_proto *tp, struct tcf_walker *arg,
 		    bool rtnl_held)
 {
+	struct cls_fl_head *head = fl_head_dereference(tp);
+	unsigned long id = arg->cookie, tmp;
 	struct cls_fl_filter *f;
 
 	arg->count = arg->skip;
 
-	while ((f = fl_get_next_filter(tp, &arg->cookie)) != NULL) {
+	idr_for_each_entry_continue_ul(&head->handle_idr, f, tmp, id) {
+		/* don't return filters that are being deleted */
+		if (!refcount_inc_not_zero(&f->refcnt))
+			continue;
 		if (arg->fn(tp, f, arg) < 0) {
 			__fl_put(f);
 			arg->stop = 1;
 			break;
 		}
 		__fl_put(f);
-		arg->cookie++;
 		arg->count++;
 	}
+	arg->cookie = id;
 }
 
 static struct cls_fl_filter *
-- 
cgit v1.2.3


From 4de83b88c66a1e4dba426b29766fb68e61d93792 Mon Sep 17 00:00:00 2001
From: Mahesh Bandewar <maheshb@google.com>
Date: Mon, 1 Jul 2019 14:38:49 -0700
Subject: loopback: create blackhole net device similar to loopback.

Create a blackhole net device that can be used for "dead"
dst entries instead of the loopback device. This blackhole device differs
from loopback in a few aspects: (a) It's not per-ns. (b) The MTU on this
device is ETH_MIN_MTU. (c) The xmit function is essentially kfree_skb().
(d) Since it's not registered it won't have an ifindex.

The lower MTU effectively makes the device fail the MTU check during
the route check when a dst associated with the skb is dead.

Signed-off-by: Mahesh Bandewar <maheshb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/loopback.c    | 76 +++++++++++++++++++++++++++++++++++++++++------
 include/linux/netdevice.h |  2 ++
 2 files changed, 69 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/loopback.c b/drivers/net/loopback.c
index 87d361666cdd..3b39def5471e 100644
--- a/drivers/net/loopback.c
+++ b/drivers/net/loopback.c
@@ -55,6 +55,13 @@
 #include <net/net_namespace.h>
 #include <linux/u64_stats_sync.h>
 
+/* blackhole_netdev - a device used for dsts that are marked expired!
+ * This is a global device (instead of per-net-ns) since it does not need
+ * to be per-ns and gets initialized at boot time.
+ */
+struct net_device *blackhole_netdev;
+EXPORT_SYMBOL(blackhole_netdev);
+
 /* The higher levels take care of making this non-reentrant (it's
  * called with bh's disabled).
  */
@@ -150,12 +157,14 @@ static const struct net_device_ops loopback_ops = {
 	.ndo_set_mac_address = eth_mac_addr,
 };
 
-/* The loopback device is special. There is only one instance
- * per network namespace.
- */
-static void loopback_setup(struct net_device *dev)
+static void gen_lo_setup(struct net_device *dev,
+			 unsigned int mtu,
+			 const struct ethtool_ops *eth_ops,
+			 const struct header_ops *hdr_ops,
+			 const struct net_device_ops *dev_ops,
+			 void (*dev_destructor)(struct net_device *dev))
 {
-	dev->mtu		= 64 * 1024;
+	dev->mtu		= mtu;
 	dev->hard_header_len	= ETH_HLEN;	/* 14	*/
 	dev->min_header_len	= ETH_HLEN;	/* 14	*/
 	dev->addr_len		= ETH_ALEN;	/* 6	*/
@@ -174,11 +183,20 @@ static void loopback_setup(struct net_device *dev)
 		| NETIF_F_NETNS_LOCAL
 		| NETIF_F_VLAN_CHALLENGED
 		| NETIF_F_LOOPBACK;
-	dev->ethtool_ops	= &loopback_ethtool_ops;
-	dev->header_ops		= &eth_header_ops;
-	dev->netdev_ops		= &loopback_ops;
+	dev->ethtool_ops	= eth_ops;
+	dev->header_ops		= hdr_ops;
+	dev->netdev_ops		= dev_ops;
 	dev->needs_free_netdev	= true;
-	dev->priv_destructor	= loopback_dev_free;
+	dev->priv_destructor	= dev_destructor;
+}
+
+/* The loopback device is special. There is only one instance
+ * per network namespace.
+ */
+static void loopback_setup(struct net_device *dev)
+{
+	gen_lo_setup(dev, (64 * 1024), &loopback_ethtool_ops, &eth_header_ops,
+		     &loopback_ops, loopback_dev_free);
 }
 
 /* Setup and register the loopback device. */
@@ -213,3 +231,43 @@ out:
 struct pernet_operations __net_initdata loopback_net_ops = {
 	.init = loopback_net_init,
 };
+
+/* blackhole netdevice */
+static netdev_tx_t blackhole_netdev_xmit(struct sk_buff *skb,
+					 struct net_device *dev)
+{
+	kfree_skb(skb);
+	net_warn_ratelimited("%s(): Dropping skb.\n", __func__);
+	return NETDEV_TX_OK;
+}
+
+static const struct net_device_ops blackhole_netdev_ops = {
+	.ndo_start_xmit = blackhole_netdev_xmit,
+};
+
+/* This is a dst-dummy device used specifically for invalidated
+ * DSTs and unlike loopback, this is not per-ns.
+ */
+static void blackhole_netdev_setup(struct net_device *dev)
+{
+	gen_lo_setup(dev, ETH_MIN_MTU, NULL, NULL, &blackhole_netdev_ops, NULL);
+}
+
+/* Allocate and activate the blackhole_netdev; it is not registered here. */
+static int __init blackhole_netdev_init(void)
+{
+	blackhole_netdev = alloc_netdev(0, "blackhole_dev", NET_NAME_UNKNOWN,
+					blackhole_netdev_setup);
+	if (!blackhole_netdev)
+		return -ENOMEM;
+
+	dev_init_scheduler(blackhole_netdev);
+	dev_activate(blackhole_netdev);
+
+	blackhole_netdev->flags |= IFF_UP | IFF_RUNNING;
+	dev_net_set(blackhole_netdev, &init_net);
+
+	return 0;
+}
+
+device_initcall(blackhole_netdev_init);
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index eeacebd7debb..88292953aa6f 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -4870,4 +4870,6 @@ do {								\
 #define PTYPE_HASH_SIZE	(16)
 #define PTYPE_HASH_MASK	(PTYPE_HASH_SIZE - 1)
 
+extern struct net_device *blackhole_netdev;
+
 #endif	/* _LINUX_NETDEVICE_H */
-- 
cgit v1.2.3


From 6987fd42239ad43937166dace2ed8fb260b14d25 Mon Sep 17 00:00:00 2001
From: Otto Sabart <ottosabart@seberm.com>
Date: Mon, 20 May 2019 10:06:26 +0100
Subject: mfd: madera: Fix bad reference to pinctrl.txt file

The pinctrl.txt file was converted into reStructuredText and moved into
driver-api folder. This patch updates the broken reference.

Fixes: 5a9b73832e9e ("pinctrl.txt: move it to the driver-api book")
Signed-off-by: Otto Sabart <ottosabart@seberm.com>
Signed-off-by: Charles Keepax <ckeepax@opensource.cirrus.com>
Reviewed-by: Mauro Carvalho Chehab <mchehab+samsung@kernel.org>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 include/linux/mfd/madera/pdata.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mfd/madera/pdata.h b/include/linux/mfd/madera/pdata.h
index 8dc852402dbb..dd00ab824e5b 100644
--- a/include/linux/mfd/madera/pdata.h
+++ b/include/linux/mfd/madera/pdata.h
@@ -34,7 +34,8 @@ struct madera_codec_pdata;
  * @micvdd:	    Substruct of pdata for the MICVDD regulator
  * @irq_flags:	    Mode for primary IRQ (defaults to active low)
  * @gpio_base:	    Base GPIO number
- * @gpio_configs:   Array of GPIO configurations (See Documentation/pinctrl.txt)
+ * @gpio_configs:   Array of GPIO configurations (See
+ *		    Documentation/driver-api/pinctl.rst)
  * @n_gpio_configs: Number of entries in gpio_configs
  * @gpsw:	    General purpose switch mode setting. Depends on the external
  *		    hardware connected to the switch. (See the SW1_MODE field
-- 
cgit v1.2.3


From 1ef921b6d1b68887be22f02dabc6ae73c112dce4 Mon Sep 17 00:00:00 2001
From: Richard Fitzgerald <rf@opensource.cirrus.com>
Date: Thu, 30 May 2019 15:39:52 +0100
Subject: mfd: madera: Add Madera core support for CS47L15

This patch adds all the core support and defines for the Cirrus
Logic CS47L15 smart audio CODEC.

Registers or fields are named MADERA_* if it is part of the
common hardware platform and does not conflict with any other
Madera codecs. It is named CS47L15_* if it is unique to CS47L15
and conflicts with definitions on other codecs.

Signed-off-by: Richard Fitzgerald <rf@opensource.cirrus.com>
Signed-off-by: Charles Keepax <ckeepax@opensource.cirrus.com>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 drivers/mfd/Kconfig                  |    7 +
 drivers/mfd/Makefile                 |    3 +
 drivers/mfd/cs47l15-tables.c         | 1301 ++++++++++++++++++++++++++++++++++
 drivers/mfd/madera-core.c            |   44 ++
 drivers/mfd/madera-i2c.c             |    7 +
 drivers/mfd/madera-spi.c             |    7 +
 drivers/mfd/madera.h                 |    6 +
 include/linux/mfd/madera/core.h      |    2 +
 include/linux/mfd/madera/registers.h |    5 +
 9 files changed, 1382 insertions(+)
 create mode 100644 drivers/mfd/cs47l15-tables.c

(limited to 'include/linux')

diff --git a/drivers/mfd/Kconfig b/drivers/mfd/Kconfig
index 760b9e29c8e5..86ae0a11f631 100644
--- a/drivers/mfd/Kconfig
+++ b/drivers/mfd/Kconfig
@@ -260,6 +260,13 @@ config MFD_MADERA_SPI
 	  Support for the Cirrus Logic Madera platform audio SoC
 	  core functionality controlled via SPI.
 
+config MFD_CS47L15
+	bool "Cirrus Logic CS47L15"
+	select PINCTRL_CS47L15
+	depends on MFD_MADERA
+	help
+	  Support for Cirrus Logic CS47L15 Smart Codec
+
 config MFD_CS47L35
 	bool "Cirrus Logic CS47L35"
 	select PINCTRL_CS47L35
diff --git a/drivers/mfd/Makefile b/drivers/mfd/Makefile
index 643d65bcb6ea..cc044f38af84 100644
--- a/drivers/mfd/Makefile
+++ b/drivers/mfd/Makefile
@@ -75,6 +75,9 @@ obj-$(CONFIG_MFD_WM8994)	+= wm8994.o
 obj-$(CONFIG_MFD_WM97xx)	+= wm97xx-core.o
 
 madera-objs			:= madera-core.o
+ifeq ($(CONFIG_MFD_CS47L15),y)
+madera-objs			+= cs47l15-tables.o
+endif
 ifeq ($(CONFIG_MFD_CS47L35),y)
 madera-objs			+= cs47l35-tables.o
 endif
diff --git a/drivers/mfd/cs47l15-tables.c b/drivers/mfd/cs47l15-tables.c
new file mode 100644
index 000000000000..1b4f6f79eac2
--- /dev/null
+++ b/drivers/mfd/cs47l15-tables.c
@@ -0,0 +1,1301 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Regmap tables for CS47L15 codec
+ *
+ * Copyright (C) 2016-2019 Cirrus Logic, Inc. and
+ *                         Cirrus Logic International Semiconductor Ltd.
+ */
+
+#include <linux/device.h>
+#include <linux/module.h>
+#include <linux/regmap.h>
+
+#include <linux/mfd/madera/core.h>
+#include <linux/mfd/madera/registers.h>
+
+#include "madera.h"
+
+static const struct reg_sequence cs47l15_reva_16_patch[] = {
+	{ 0x8C, 0x5555 },
+	{ 0x8C, 0xAAAA },
+	{ 0x314, 0x0080 },
+	{ 0x4A8, 0x6023 },
+	{ 0x4A9, 0x6023 },
+	{ 0x4D4, 0x0008 },
+	{ 0x4CF, 0x0F00 },
+	{ 0x4D7, 0x1B2B },
+	{ 0x8C, 0xCCCC },
+	{ 0x8C, 0x3333 },
+};
+
+/* Apply the CS47L15 register patch; returns 0 unless regmap_register_patch()
+ * reports a negative error, which is logged and handed back to the caller.
+ */
+int cs47l15_patch(struct madera *madera)
+{
+	int err = regmap_register_patch(madera->regmap,
+					cs47l15_reva_16_patch,
+					ARRAY_SIZE(cs47l15_reva_16_patch));
+
+	if (err >= 0)
+		return 0;
+
+	dev_err(madera->dev, "Error in applying 16-bit patch: %d\n", err);
+	return err;
+}
+EXPORT_SYMBOL_GPL(cs47l15_patch);
+
+static const struct reg_default cs47l15_reg_default[] = {
+	{ 0x00000020, 0x0000 }, /* R32 (0x20) - Tone Generator 1 */
+	{ 0x00000021, 0x1000 }, /* R33 (0x21) - Tone Generator 2 */
+	{ 0x00000022, 0x0000 }, /* R34 (0x22) - Tone Generator 3 */
+	{ 0x00000023, 0x1000 }, /* R35 (0x23) - Tone Generator 4 */
+	{ 0x00000024, 0x0000 }, /* R36 (0x24) - Tone Generator 5 */
+	{ 0x00000030, 0x0000 }, /* R48 (0x30) - PWM Drive 1 */
+	{ 0x00000031, 0x0100 }, /* R49 (0x31) - PWM Drive 2 */
+	{ 0x00000032, 0x0100 }, /* R50 (0x32) - PWM Drive 3 */
+	{ 0x00000061, 0x01ff }, /* R97 (0x61) - Sample Rate Sequence Select 1 */
+	{ 0x00000062, 0x01ff }, /* R98 (0x62) - Sample Rate Sequence Select 2 */
+	{ 0x00000063, 0x01ff }, /* R99 (0x63) - Sample Rate Sequence Select 3 */
+	{ 0x00000064, 0x01ff }, /* R100 (0x64) - Sample Rate Sequence Select 4 */
+	{ 0x00000066, 0x01ff }, /* R102 (0x66) - Always On Triggers Sequence Select 1 */
+	{ 0x00000067, 0x01ff }, /* R103 (0x67) - Always On Triggers Sequence Select 2 */
+	{ 0x00000090, 0x0000 }, /* R144 (0x90) - Haptics Control 1 */
+	{ 0x00000091, 0x7fff }, /* R145 (0x91) - Haptics Control 2 */
+	{ 0x00000092, 0x0000 }, /* R146 (0x92) - Haptics Phase 1 Intensity */
+	{ 0x00000093, 0x0000 }, /* R147 (0x93) - Haptics Phase 1 Duration */
+	{ 0x00000094, 0x0000 }, /* R148 (0x94) - Haptics Phase 2 Intensity */
+	{ 0x00000095, 0x0000 }, /* R149 (0x95) - Haptics Phase 2 Duration */
+	{ 0x00000096, 0x0000 }, /* R150 (0x96) - Haptics Phase 3 Intensity */
+	{ 0x00000097, 0x0000 }, /* R151 (0x97) - Haptics Phase 3 Duration */
+	{ 0x000000a0, 0x0000 }, /* R160 (0xA0) - Comfort Noise Generator */
+	{ 0x00000100, 0x0002 }, /* R256 (0x100) - Clock 32K 1 */
+	{ 0x00000101, 0x0404 }, /* R257 (0x101) - System Clock 1 */
+	{ 0x00000102, 0x0011 }, /* R258 (0x102) - Sample Rate 1 */
+	{ 0x00000103, 0x0011 }, /* R259 (0x103) - Sample Rate 2 */
+	{ 0x00000104, 0x0011 }, /* R260 (0x104) - Sample Rate 3 */
+	{ 0x00000120, 0x0304 }, /* R288 (0x120) - DSP Clock 1 */
+	{ 0x00000122, 0x0000 }, /* R290 (0x122) - DSP Clock 2 */
+	{ 0x00000149, 0x0000 }, /* R329 (0x149) - Output System Clock */
+	{ 0x00000152, 0x0000 }, /* R338 (0x152) - Rate Estimator 1 */
+	{ 0x00000153, 0x0000 }, /* R339 (0x153) - Rate Estimator 2 */
+	{ 0x00000154, 0x0000 }, /* R340 (0x154) - Rate Estimator 3 */
+	{ 0x00000155, 0x0000 }, /* R341 (0x155) - Rate Estimator 4 */
+	{ 0x00000156, 0x0000 }, /* R342 (0x156) - Rate Estimator 5 */
+	{ 0x00000171, 0x0002 }, /* R369 (0x171) - FLL1 Control 1 */
+	{ 0x00000172, 0x0008 }, /* R370 (0x172) - FLL1 Control 2 */
+	{ 0x00000173, 0x0018 }, /* R371 (0x173) - FLL1 Control 3 */
+	{ 0x00000174, 0x007d }, /* R372 (0x174) - FLL1 Control 4 */
+	{ 0x00000175, 0x0000 }, /* R373 (0x175) - FLL1 Control 5 */
+	{ 0x00000176, 0x0000 }, /* R374 (0x176) - FLL1 Control 6 */
+	{ 0x00000177, 0x0281 }, /* R375 (0x177) - FLL1 Loop Filter Test 1 */
+	{ 0x00000179, 0x0000 }, /* R377 (0x179) - FLL1 Control 7 */
+	{ 0x0000017a, 0x2906 }, /* R378 (0x17A) - FLL1 EFS 2 */
+	{ 0x00000181, 0x0000 }, /* R385 (0x181) - FLL1 Synchroniser 1 */
+	{ 0x00000182, 0x0000 }, /* R386 (0x182) - FLL1 Synchroniser 2 */
+	{ 0x00000183, 0x0000 }, /* R387 (0x183) - FLL1 Synchroniser 3 */
+	{ 0x00000184, 0x0000 }, /* R388 (0x184) - FLL1 Synchroniser 4 */
+	{ 0x00000185, 0x0000 }, /* R389 (0x185) - FLL1 Synchroniser 5 */
+	{ 0x00000186, 0x0000 }, /* R390 (0x186) - FLL1 Synchroniser 6 */
+	{ 0x00000187, 0x0001 }, /* R391 (0x187) - FLL1 Synchroniser 7 */
+	{ 0x00000189, 0x0000 }, /* R393 (0x189) - FLL1 Spread Spectrum */
+	{ 0x0000018a, 0x0004 }, /* R394 (0x18A) - FLL1 GPIO Clock */
+	{ 0x000001d1, 0x0004 }, /* R465 (0x1D1) - FLL AO Control 1 */
+	{ 0x000001d2, 0x0004 }, /* R466 (0x1D2) - FLL AO Control 2 */
+	{ 0x000001d3, 0x0000 }, /* R467 (0x1D3) - FLL AO Control 3 */
+	{ 0x000001d4, 0x0000 }, /* R468 (0x1D4) - FLL AO Control 4 */
+	{ 0x000001d5, 0x0001 }, /* R469 (0x1D5) - FLL AO Control 5 */
+	{ 0x000001d6, 0x8004 }, /* R470 (0x1D6) - FLL AO Control 6 */
+	{ 0x000001d8, 0x0000 }, /* R472 (0x1D8) - FLL AO Control 7 */
+	{ 0x000001da, 0x0077 }, /* R474 (0x1DA) - FLL AO Control 8 */
+	{ 0x000001db, 0x0000 }, /* R475 (0x1DB) - FLL AO Control 9 */
+	{ 0x000001dc, 0x06da }, /* R476 (0x1DC) - FLL AO Control 10 */
+	{ 0x000001dd, 0x0011 }, /* R477 (0x1DD) - FLL AO Control 11 */
+	{ 0x00000218, 0x00e6 }, /* R536 (0x218) - Mic Bias Ctrl 1 */
+	{ 0x0000021c, 0x0222 }, /* R540 (0x21C) - Mic Bias Ctrl 5 */
+	{ 0x00000299, 0x0000 }, /* R665 (0x299) - Headphone Detect 0 */
+	{ 0x0000029b, 0x0000 }, /* R667 (0x29B) - Headphone Detect 1 */
+	{ 0x000002a2, 0x0010 }, /* R674 (0x2A2) - Mic Detect 1 Control 0 */
+	{ 0x000002a3, 0x1102 }, /* R675 (0x2A3) - Mic Detect 1 Control 1 */
+	{ 0x000002a4, 0x009f }, /* R676 (0x2A4) - Mic Detect 1 Control 2 */
+	{ 0x000002a6, 0x3d3d }, /* R678 (0x2A6) - Mic Detect 1 Level 1 */
+	{ 0x000002a7, 0x3d3d }, /* R679 (0x2A7) - Mic Detect 1 Level 2 */
+	{ 0x000002a8, 0x333d }, /* R680 (0x2A8) - Mic Detect 1 Level 3 */
+	{ 0x000002a9, 0x202d }, /* R681 (0x2A9) - Mic Detect 1 Level 4 */
+	{ 0x000002c6, 0x0010 }, /* R710 (0x2C6) - Micd Clamp Control */
+	{ 0x000002c8, 0x0000 }, /* R712 (0x2C8) - GP Switch 1 */
+	{ 0x000002d3, 0x0000 }, /* R723 (0x2D3) - Jack Detect Analogue */
+	{ 0x00000300, 0x0000 }, /* R768 (0x300) - Input Enables */
+	{ 0x00000308, 0x0000 }, /* R776 (0x308) - Input Rate */
+	{ 0x00000309, 0x0022 }, /* R777 (0x309) - Input Volume Ramp */
+	{ 0x0000030c, 0x0002 }, /* R780 (0x30C) - HPF Control */
+	{ 0x00000310, 0x0080 }, /* R784 (0x310) - IN1L Control */
+	{ 0x00000311, 0x0180 }, /* R785 (0x311) - ADC Digital Volume 1L */
+	{ 0x00000312, 0x0500 }, /* R786 (0x312) - DMIC1L Control */
+	{ 0x00000313, 0x0000 }, /* R787 (0x313) - IN1L Rate Control */
+	{ 0x00000314, 0x0080 }, /* R788 (0x314) - IN1R Control */
+	{ 0x00000315, 0x0180 }, /* R789 (0x315) - ADC Digital Volume 1R */
+	{ 0x00000316, 0x0000 }, /* R790 (0x316) - DMIC1R Control */
+	{ 0x00000317, 0x0000 }, /* R791 (0x317) - IN1R Rate Control */
+	{ 0x00000318, 0x0000 }, /* R792 (0x318) - IN2L Control */
+	{ 0x00000319, 0x0180 }, /* R793 (0x319) - ADC Digital Volume 2L */
+	{ 0x0000031a, 0x0500 }, /* R794 (0x31A) - DMIC2L Control */
+	{ 0x0000031b, 0x0000 }, /* R795 (0x31B) - IN2L Rate Control */
+	{ 0x0000031c, 0x0800 }, /* R796 (0x31C) - IN2R Control */
+	{ 0x0000031d, 0x0180 }, /* R797 (0x31D) - ADC Digital Volume 2R */
+	{ 0x0000031e, 0x0000 }, /* R798 (0x31E) - DMIC2R Control */
+	{ 0x0000031f, 0x0000 }, /* R799 (0x31F) - IN2R Rate Control */
+	{ 0x000003a8, 0x2000 }, /* R936 (0x3A8) - CS47L15 ADC Int Bias */
+	{ 0x000003c4, 0x0000 }, /* R964 (0x3C4) - CS47L15 PGA Bias Sel */
+	{ 0x00000400, 0x0000 }, /* R1024 (0x400) - Output Enables 1 */
+	{ 0x00000408, 0x0000 }, /* R1032 (0x408) - Output Rate 1 */
+	{ 0x00000409, 0x0022 }, /* R1033 (0x409) - Output Volume Ramp */
+	{ 0x00000410, 0x0080 }, /* R1040 (0x410) - Output Path Config 1L */
+	{ 0x00000411, 0x0180 }, /* R1041 (0x411) - DAC Digital Volume 1L */
+	{ 0x00000412, 0x0000 }, /* R1042 (0x412) - Output Path Config 1 */
+	{ 0x00000413, 0x0001 }, /* R1043 (0x413) - Noise Gate Select 1L */
+	{ 0x00000414, 0x0080 }, /* R1044 (0x414) - Output Path Config 1R */
+	{ 0x00000415, 0x0180 }, /* R1045 (0x415) - DAC Digital Volume 1R */
+	{ 0x00000417, 0x0002 }, /* R1047 (0x417) - Noise Gate Select 1R */
+	{ 0x0000041a, 0x0600 }, /* R1050 (0x41A) - Output Path Config 2 */
+	{ 0x00000428, 0x0000 }, /* R1064 (0x428) - Output Path Config 4L */
+	{ 0x00000429, 0x0180 }, /* R1065 (0x429) - DAC Digital Volume 4L */
+	{ 0x0000042b, 0x0040 }, /* R1067 (0x42B) - Noise Gate Select 4L */
+	{ 0x00000430, 0x0000 }, /* R1072 (0x430) - Output Path Config 5L */
+	{ 0x00000431, 0x0180 }, /* R1073 (0x431) - DAC Digital Volume 5L */
+	{ 0x00000433, 0x0100 }, /* R1075 (0x433) - Noise Gate Select 5L */
+	{ 0x00000434, 0x0000 }, /* R1076 (0x434) - Output Path Config 5R */
+	{ 0x00000435, 0x0180 }, /* R1077 (0x435) - DAC Digital Volume 5R */
+	{ 0x00000437, 0x0200 }, /* R1079 (0x437) - Noise Gate Select 5R */
+	{ 0x00000450, 0x0000 }, /* R1104 (0x450) - DAC AEC Control 1 */
+	{ 0x00000451, 0x0000 }, /* R1105 (0x451) - DAC AEC Control 2 */
+	{ 0x00000458, 0x0000 }, /* R1112 (0x458) - Noise Gate Control */
+	{ 0x00000490, 0x0069 }, /* R1168 (0x490) - PDM SPK1 Ctrl 1 */
+	{ 0x00000491, 0x0000 }, /* R1169 (0x491) - PDM SPK1 Ctrl 2 */
+	{ 0x000004a0, 0x3080 }, /* R1184 (0x4A0) - HP1 Short Circuit Ctrl */
+	{ 0x000004a8, 0x6023 }, /* R1192 (0x4A8) - HP Test Ctrl 5 */
+	{ 0x000004a9, 0x6023 }, /* R1193 (0x4A9) - HP Test Ctrl 6 */
+	{ 0x00000500, 0x000c }, /* R1280 (0x500) - AIF1 BCLK Ctrl */
+	{ 0x00000501, 0x0000 }, /* R1281 (0x501) - AIF1 Tx Pin Ctrl */
+	{ 0x00000502, 0x0000 }, /* R1282 (0x502) - AIF1 Rx Pin Ctrl */
+	{ 0x00000503, 0x0000 }, /* R1283 (0x503) - AIF1 Rate Ctrl */
+	{ 0x00000504, 0x0000 }, /* R1284 (0x504) - AIF1 Format */
+	{ 0x00000506, 0x0040 }, /* R1286 (0x506) - AIF1 Rx BCLK Rate */
+	{ 0x00000507, 0x1818 }, /* R1287 (0x507) - AIF1 Frame Ctrl 1 */
+	{ 0x00000508, 0x1818 }, /* R1288 (0x508) - AIF1 Frame Ctrl 2 */
+	{ 0x00000509, 0x0000 }, /* R1289 (0x509) - AIF1 Frame Ctrl 3 */
+	{ 0x0000050a, 0x0001 }, /* R1290 (0x50A) - AIF1 Frame Ctrl 4 */
+	{ 0x0000050b, 0x0002 }, /* R1291 (0x50B) - AIF1 Frame Ctrl 5 */
+	{ 0x0000050c, 0x0003 }, /* R1292 (0x50C) - AIF1 Frame Ctrl 6 */
+	{ 0x0000050d, 0x0004 }, /* R1293 (0x50D) - AIF1 Frame Ctrl 7 */
+	{ 0x0000050e, 0x0005 }, /* R1294 (0x50E) - AIF1 Frame Ctrl 8 */
+	{ 0x00000511, 0x0000 }, /* R1297 (0x511) - AIF1 Frame Ctrl 11 */
+	{ 0x00000512, 0x0001 }, /* R1298 (0x512) - AIF1 Frame Ctrl 12 */
+	{ 0x00000513, 0x0002 }, /* R1299 (0x513) - AIF1 Frame Ctrl 13 */
+	{ 0x00000514, 0x0003 }, /* R1300 (0x514) - AIF1 Frame Ctrl 14 */
+	{ 0x00000515, 0x0004 }, /* R1301 (0x515) - AIF1 Frame Ctrl 15 */
+	{ 0x00000516, 0x0005 }, /* R1302 (0x516) - AIF1 Frame Ctrl 16 */
+	{ 0x00000519, 0x0000 }, /* R1305 (0x519) - AIF1 Tx Enables */
+	{ 0x0000051a, 0x0000 }, /* R1306 (0x51A) - AIF1 Rx Enables */
+	{ 0x00000540, 0x000c }, /* R1344 (0x540) - AIF2 BCLK Ctrl */
+	{ 0x00000541, 0x0000 }, /* R1345 (0x541) - AIF2 Tx Pin Ctrl */
+	{ 0x00000542, 0x0000 }, /* R1346 (0x542) - AIF2 Rx Pin Ctrl */
+	{ 0x00000543, 0x0000 }, /* R1347 (0x543) - AIF2 Rate Ctrl */
+	{ 0x00000544, 0x0000 }, /* R1348 (0x544) - AIF2 Format */
+	{ 0x00000546, 0x0040 }, /* R1350 (0x546) - AIF2 Rx BCLK Rate */
+	{ 0x00000547, 0x1818 }, /* R1351 (0x547) - AIF2 Frame Ctrl 1 */
+	{ 0x00000548, 0x1818 }, /* R1352 (0x548) - AIF2 Frame Ctrl 2 */
+	{ 0x00000549, 0x0000 }, /* R1353 (0x549) - AIF2 Frame Ctrl 3 */
+	{ 0x0000054a, 0x0001 }, /* R1354 (0x54A) - AIF2 Frame Ctrl 4 */
+	{ 0x0000054b, 0x0002 }, /* R1355 (0x54B) - AIF2 Frame Ctrl 5 */
+	{ 0x0000054c, 0x0003 }, /* R1356 (0x54C) - AIF2 Frame Ctrl 6 */
+	{ 0x00000551, 0x0000 }, /* R1361 (0x551) - AIF2 Frame Ctrl 11 */
+	{ 0x00000552, 0x0001 }, /* R1362 (0x552) - AIF2 Frame Ctrl 12 */
+	{ 0x00000553, 0x0002 }, /* R1363 (0x553) - AIF2 Frame Ctrl 13 */
+	{ 0x00000554, 0x0003 }, /* R1364 (0x554) - AIF2 Frame Ctrl 14 */
+	{ 0x00000559, 0x0000 }, /* R1369 (0x559) - AIF2 Tx Enables */
+	{ 0x0000055a, 0x0000 }, /* R1370 (0x55A) - AIF2 Rx Enables */
+	{ 0x00000580, 0x000c }, /* R1408 (0x580) - AIF3 BCLK Ctrl */
+	{ 0x00000581, 0x0000 }, /* R1409 (0x581) - AIF3 Tx Pin Ctrl */
+	{ 0x00000582, 0x0000 }, /* R1410 (0x582) - AIF3 Rx Pin Ctrl */
+	{ 0x00000583, 0x0000 }, /* R1411 (0x583) - AIF3 Rate Ctrl */
+	{ 0x00000584, 0x0000 }, /* R1412 (0x584) - AIF3 Format */
+	{ 0x00000586, 0x0040 }, /* R1414 (0x586) - AIF3 Rx BCLK Rate */
+	{ 0x00000587, 0x1818 }, /* R1415 (0x587) - AIF3 Frame Ctrl 1 */
+	{ 0x00000588, 0x1818 }, /* R1416 (0x588) - AIF3 Frame Ctrl 2 */
+	{ 0x00000589, 0x0000 }, /* R1417 (0x589) - AIF3 Frame Ctrl 3 */
+	{ 0x0000058a, 0x0001 }, /* R1418 (0x58A) - AIF3 Frame Ctrl 4 */
+	{ 0x00000591, 0x0000 }, /* R1425 (0x591) - AIF3 Frame Ctrl 11 */
+	{ 0x00000592, 0x0001 }, /* R1426 (0x592) - AIF3 Frame Ctrl 12 */
+	{ 0x00000599, 0x0000 }, /* R1433 (0x599) - AIF3 Tx Enables */
+	{ 0x0000059a, 0x0000 }, /* R1434 (0x59A) - AIF3 Rx Enables */
+	{ 0x000005c2, 0x0000 }, /* R1474 (0x5C2) - SPD1 Tx Control */
+	{ 0x00000640, 0x0000 }, /* R1600 (0x640) - PWM1MIX Input 1 Source */
+	{ 0x00000641, 0x0080 }, /* R1601 (0x641) - PWM1MIX Input 1 Volume */
+	{ 0x00000642, 0x0000 }, /* R1602 (0x642) - PWM1MIX Input 2 Source */
+	{ 0x00000643, 0x0080 }, /* R1603 (0x643) - PWM1MIX Input 2 Volume */
+	{ 0x00000644, 0x0000 }, /* R1604 (0x644) - PWM1MIX Input 3 Source */
+	{ 0x00000645, 0x0080 }, /* R1605 (0x645) - PWM1MIX Input 3 Volume */
+	{ 0x00000646, 0x0000 }, /* R1606 (0x646) - PWM1MIX Input 4 Source */
+	{ 0x00000647, 0x0080 }, /* R1607 (0x647) - PWM1MIX Input 4 Volume */
+	{ 0x00000648, 0x0000 }, /* R1608 (0x648) - PWM2MIX Input 1 Source */
+	{ 0x00000649, 0x0080 }, /* R1609 (0x649) - PWM2MIX Input 1 Volume */
+	{ 0x0000064a, 0x0000 }, /* R1610 (0x64A) - PWM2MIX Input 2 Source */
+	{ 0x0000064b, 0x0080 }, /* R1611 (0x64B) - PWM2MIX Input 2 Volume */
+	{ 0x0000064c, 0x0000 }, /* R1612 (0x64C) - PWM2MIX Input 3 Source */
+	{ 0x0000064d, 0x0080 }, /* R1613 (0x64D) - PWM2MIX Input 3 Volume */
+	{ 0x0000064e, 0x0000 }, /* R1614 (0x64E) - PWM2MIX Input 4 Source */
+	{ 0x0000064f, 0x0080 }, /* R1615 (0x64F) - PWM2MIX Input 4 Volume */
+	{ 0x00000680, 0x0000 }, /* R1664 (0x680) - OUT1LMIX Input 1 Source */
+	{ 0x00000681, 0x0080 }, /* R1665 (0x681) - OUT1LMIX Input 1 Volume */
+	{ 0x00000682, 0x0000 }, /* R1666 (0x682) - OUT1LMIX Input 2 Source */
+	{ 0x00000683, 0x0080 }, /* R1667 (0x683) - OUT1LMIX Input 2 Volume */
+	{ 0x00000684, 0x0000 }, /* R1668 (0x684) - OUT1LMIX Input 3 Source */
+	{ 0x00000685, 0x0080 }, /* R1669 (0x685) - OUT1LMIX Input 3 Volume */
+	{ 0x00000686, 0x0000 }, /* R1670 (0x686) - OUT1LMIX Input 4 Source */
+	{ 0x00000687, 0x0080 }, /* R1671 (0x687) - OUT1LMIX Input 4 Volume */
+	{ 0x00000688, 0x0000 }, /* R1672 (0x688) - OUT1RMIX Input 1 Source */
+	{ 0x00000689, 0x0080 }, /* R1673 (0x689) - OUT1RMIX Input 1 Volume */
+	{ 0x0000068a, 0x0000 }, /* R1674 (0x68A) - OUT1RMIX Input 2 Source */
+	{ 0x0000068b, 0x0080 }, /* R1675 (0x68B) - OUT1RMIX Input 2 Volume */
+	{ 0x0000068c, 0x0000 }, /* R1676 (0x68C) - OUT1RMIX Input 3 Source */
+	{ 0x0000068d, 0x0080 }, /* R1677 (0x68D) - OUT1RMIX Input 3 Volume */
+	{ 0x0000068e, 0x0000 }, /* R1678 (0x68E) - OUT1RMIX Input 4 Source */
+	{ 0x0000068f, 0x0080 }, /* R1679 (0x68F) - OUT1RMIX Input 4 Volume */
+	{ 0x000006b0, 0x0000 }, /* R1712 (0x6B0) - OUT4LMIX Input 1 Source */
+	{ 0x000006b1, 0x0080 }, /* R1713 (0x6B1) - OUT4LMIX Input 1 Volume */
+	{ 0x000006b2, 0x0000 }, /* R1714 (0x6B2) - OUT4LMIX Input 2 Source */
+	{ 0x000006b3, 0x0080 }, /* R1715 (0x6B3) - OUT4LMIX Input 2 Volume */
+	{ 0x000006b4, 0x0000 }, /* R1716 (0x6B4) - OUT4LMIX Input 3 Source */
+	{ 0x000006b5, 0x0080 }, /* R1717 (0x6B5) - OUT4LMIX Input 3 Volume */
+	{ 0x000006b6, 0x0000 }, /* R1718 (0x6B6) - OUT4LMIX Input 4 Source */
+	{ 0x000006b7, 0x0080 }, /* R1719 (0x6B7) - OUT4LMIX Input 4 Volume */
+	{ 0x000006c0, 0x0000 }, /* R1728 (0x6C0) - OUT5LMIX Input 1 Source */
+	{ 0x000006c1, 0x0080 }, /* R1729 (0x6C1) - OUT5LMIX Input 1 Volume */
+	{ 0x000006c2, 0x0000 }, /* R1730 (0x6C2) - OUT5LMIX Input 2 Source */
+	{ 0x000006c3, 0x0080 }, /* R1731 (0x6C3) - OUT5LMIX Input 2 Volume */
+	{ 0x000006c4, 0x0000 }, /* R1732 (0x6C4) - OUT5LMIX Input 3 Source */
+	{ 0x000006c5, 0x0080 }, /* R1733 (0x6C5) - OUT5LMIX Input 3 Volume */
+	{ 0x000006c6, 0x0000 }, /* R1734 (0x6C6) - OUT5LMIX Input 4 Source */
+	{ 0x000006c7, 0x0080 }, /* R1735 (0x6C7) - OUT5LMIX Input 4 Volume */
+	{ 0x000006c8, 0x0000 }, /* R1736 (0x6C8) - OUT5RMIX Input 1 Source */
+	{ 0x000006c9, 0x0080 }, /* R1737 (0x6C9) - OUT5RMIX Input 1 Volume */
+	{ 0x000006ca, 0x0000 }, /* R1738 (0x6CA) - OUT5RMIX Input 2 Source */
+	{ 0x000006cb, 0x0080 }, /* R1739 (0x6CB) - OUT5RMIX Input 2 Volume */
+	{ 0x000006cc, 0x0000 }, /* R1740 (0x6CC) - OUT5RMIX Input 3 Source */
+	{ 0x000006cd, 0x0080 }, /* R1741 (0x6CD) - OUT5RMIX Input 3 Volume */
+	{ 0x000006ce, 0x0000 }, /* R1742 (0x6CE) - OUT5RMIX Input 4 Source */
+	{ 0x000006cf, 0x0080 }, /* R1743 (0x6CF) - OUT5RMIX Input 4 Volume */
+	{ 0x00000700, 0x0000 }, /* R1792 (0x700) - AIF1TX1MIX Input 1 Source */
+	{ 0x00000701, 0x0080 }, /* R1793 (0x701) - AIF1TX1MIX Input 1 Volume */
+	{ 0x00000702, 0x0000 }, /* R1794 (0x702) - AIF1TX1MIX Input 2 Source */
+	{ 0x00000703, 0x0080 }, /* R1795 (0x703) - AIF1TX1MIX Input 2 Volume */
+	{ 0x00000704, 0x0000 }, /* R1796 (0x704) - AIF1TX1MIX Input 3 Source */
+	{ 0x00000705, 0x0080 }, /* R1797 (0x705) - AIF1TX1MIX Input 3 Volume */
+	{ 0x00000706, 0x0000 }, /* R1798 (0x706) - AIF1TX1MIX Input 4 Source */
+	{ 0x00000707, 0x0080 }, /* R1799 (0x707) - AIF1TX1MIX Input 4 Volume */
+	{ 0x00000708, 0x0000 }, /* R1800 (0x708) - AIF1TX2MIX Input 1 Source */
+	{ 0x00000709, 0x0080 }, /* R1801 (0x709) - AIF1TX2MIX Input 1 Volume */
+	{ 0x0000070a, 0x0000 }, /* R1802 (0x70A) - AIF1TX2MIX Input 2 Source */
+	{ 0x0000070b, 0x0080 }, /* R1803 (0x70B) - AIF1TX2MIX Input 2 Volume */
+	{ 0x0000070c, 0x0000 }, /* R1804 (0x70C) - AIF1TX2MIX Input 3 Source */
+	{ 0x0000070d, 0x0080 }, /* R1805 (0x70D) - AIF1TX2MIX Input 3 Volume */
+	{ 0x0000070e, 0x0000 }, /* R1806 (0x70E) - AIF1TX2MIX Input 4 Source */
+	{ 0x0000070f, 0x0080 }, /* R1807 (0x70F) - AIF1TX2MIX Input 4 Volume */
+	{ 0x00000710, 0x0000 }, /* R1808 (0x710) - AIF1TX3MIX Input 1 Source */
+	{ 0x00000711, 0x0080 }, /* R1809 (0x711) - AIF1TX3MIX Input 1 Volume */
+	{ 0x00000712, 0x0000 }, /* R1810 (0x712) - AIF1TX3MIX Input 2 Source */
+	{ 0x00000713, 0x0080 }, /* R1811 (0x713) - AIF1TX3MIX Input 2 Volume */
+	{ 0x00000714, 0x0000 }, /* R1812 (0x714) - AIF1TX3MIX Input 3 Source */
+	{ 0x00000715, 0x0080 }, /* R1813 (0x715) - AIF1TX3MIX Input 3 Volume */
+	{ 0x00000716, 0x0000 }, /* R1814 (0x716) - AIF1TX3MIX Input 4 Source */
+	{ 0x00000717, 0x0080 }, /* R1815 (0x717) - AIF1TX3MIX Input 4 Volume */
+	{ 0x00000718, 0x0000 }, /* R1816 (0x718) - AIF1TX4MIX Input 1 Source */
+	{ 0x00000719, 0x0080 }, /* R1817 (0x719) - AIF1TX4MIX Input 1 Volume */
+	{ 0x0000071a, 0x0000 }, /* R1818 (0x71A) - AIF1TX4MIX Input 2 Source */
+	{ 0x0000071b, 0x0080 }, /* R1819 (0x71B) - AIF1TX4MIX Input 2 Volume */
+	{ 0x0000071c, 0x0000 }, /* R1820 (0x71C) - AIF1TX4MIX Input 3 Source */
+	{ 0x0000071d, 0x0080 }, /* R1821 (0x71D) - AIF1TX4MIX Input 3 Volume */
+	{ 0x0000071e, 0x0000 }, /* R1822 (0x71E) - AIF1TX4MIX Input 4 Source */
+	{ 0x0000071f, 0x0080 }, /* R1823 (0x71F) - AIF1TX4MIX Input 4 Volume */
+	{ 0x00000720, 0x0000 }, /* R1824 (0x720) - AIF1TX5MIX Input 1 Source */
+	{ 0x00000721, 0x0080 }, /* R1825 (0x721) - AIF1TX5MIX Input 1 Volume */
+	{ 0x00000722, 0x0000 }, /* R1826 (0x722) - AIF1TX5MIX Input 2 Source */
+	{ 0x00000723, 0x0080 }, /* R1827 (0x723) - AIF1TX5MIX Input 2 Volume */
+	{ 0x00000724, 0x0000 }, /* R1828 (0x724) - AIF1TX5MIX Input 3 Source */
+	{ 0x00000725, 0x0080 }, /* R1829 (0x725) - AIF1TX5MIX Input 3 Volume */
+	{ 0x00000726, 0x0000 }, /* R1830 (0x726) - AIF1TX5MIX Input 4 Source */
+	{ 0x00000727, 0x0080 }, /* R1831 (0x727) - AIF1TX5MIX Input 4 Volume */
+	{ 0x00000728, 0x0000 }, /* R1832 (0x728) - AIF1TX6MIX Input 1 Source */
+	{ 0x00000729, 0x0080 }, /* R1833 (0x729) - AIF1TX6MIX Input 1 Volume */
+	{ 0x0000072a, 0x0000 }, /* R1834 (0x72A) - AIF1TX6MIX Input 2 Source */
+	{ 0x0000072b, 0x0080 }, /* R1835 (0x72B) - AIF1TX6MIX Input 2 Volume */
+	{ 0x0000072c, 0x0000 }, /* R1836 (0x72C) - AIF1TX6MIX Input 3 Source */
+	{ 0x0000072d, 0x0080 }, /* R1837 (0x72D) - AIF1TX6MIX Input 3 Volume */
+	{ 0x0000072e, 0x0000 }, /* R1838 (0x72E) - AIF1TX6MIX Input 4 Source */
+	{ 0x0000072f, 0x0080 }, /* R1839 (0x72F) - AIF1TX6MIX Input 4 Volume */
+	{ 0x00000740, 0x0000 }, /* R1856 (0x740) - AIF2TX1MIX Input 1 Source */
+	{ 0x00000741, 0x0080 }, /* R1857 (0x741) - AIF2TX1MIX Input 1 Volume */
+	{ 0x00000742, 0x0000 }, /* R1858 (0x742) - AIF2TX1MIX Input 2 Source */
+	{ 0x00000743, 0x0080 }, /* R1859 (0x743) - AIF2TX1MIX Input 2 Volume */
+	{ 0x00000744, 0x0000 }, /* R1860 (0x744) - AIF2TX1MIX Input 3 Source */
+	{ 0x00000745, 0x0080 }, /* R1861 (0x745) - AIF2TX1MIX Input 3 Volume */
+	{ 0x00000746, 0x0000 }, /* R1862 (0x746) - AIF2TX1MIX Input 4 Source */
+	{ 0x00000747, 0x0080 }, /* R1863 (0x747) - AIF2TX1MIX Input 4 Volume */
+	{ 0x00000748, 0x0000 }, /* R1864 (0x748) - AIF2TX2MIX Input 1 Source */
+	{ 0x00000749, 0x0080 }, /* R1865 (0x749) - AIF2TX2MIX Input 1 Volume */
+	{ 0x0000074a, 0x0000 }, /* R1866 (0x74A) - AIF2TX2MIX Input 2 Source */
+	{ 0x0000074b, 0x0080 }, /* R1867 (0x74B) - AIF2TX2MIX Input 2 Volume */
+	{ 0x0000074c, 0x0000 }, /* R1868 (0x74C) - AIF2TX2MIX Input 3 Source */
+	{ 0x0000074d, 0x0080 }, /* R1869 (0x74D) - AIF2TX2MIX Input 3 Volume */
+	{ 0x0000074e, 0x0000 }, /* R1870 (0x74E) - AIF2TX2MIX Input 4 Source */
+	{ 0x0000074f, 0x0080 }, /* R1871 (0x74F) - AIF2TX2MIX Input 4 Volume */
+	{ 0x00000750, 0x0000 }, /* R1872 (0x750) - AIF2TX3MIX Input 1 Source */
+	{ 0x00000751, 0x0080 }, /* R1873 (0x751) - AIF2TX3MIX Input 1 Volume */
+	{ 0x00000752, 0x0000 }, /* R1874 (0x752) - AIF2TX3MIX Input 2 Source */
+	{ 0x00000753, 0x0080 }, /* R1875 (0x753) - AIF2TX3MIX Input 2 Volume */
+	{ 0x00000754, 0x0000 }, /* R1876 (0x754) - AIF2TX3MIX Input 3 Source */
+	{ 0x00000755, 0x0080 }, /* R1877 (0x755) - AIF2TX3MIX Input 3 Volume */
+	{ 0x00000756, 0x0000 }, /* R1878 (0x756) - AIF2TX3MIX Input 4 Source */
+	{ 0x00000757, 0x0080 }, /* R1879 (0x757) - AIF2TX3MIX Input 4 Volume */
+	{ 0x00000758, 0x0000 }, /* R1880 (0x758) - AIF2TX4MIX Input 1 Source */
+	{ 0x00000759, 0x0080 }, /* R1881 (0x759) - AIF2TX4MIX Input 1 Volume */
+	{ 0x0000075a, 0x0000 }, /* R1882 (0x75A) - AIF2TX4MIX Input 2 Source */
+	{ 0x0000075b, 0x0080 }, /* R1883 (0x75B) - AIF2TX4MIX Input 2 Volume */
+	{ 0x0000075c, 0x0000 }, /* R1884 (0x75C) - AIF2TX4MIX Input 3 Source */
+	{ 0x0000075d, 0x0080 }, /* R1885 (0x75D) - AIF2TX4MIX Input 3 Volume */
+	{ 0x0000075e, 0x0000 }, /* R1886 (0x75E) - AIF2TX4MIX Input 4 Source */
+	{ 0x0000075f, 0x0080 }, /* R1887 (0x75F) - AIF2TX4MIX Input 4 Volume */
+	{ 0x00000780, 0x0000 }, /* R1920 (0x780) - AIF3TX1MIX Input 1 Source */
+	{ 0x00000781, 0x0080 }, /* R1921 (0x781) - AIF3TX1MIX Input 1 Volume */
+	{ 0x00000782, 0x0000 }, /* R1922 (0x782) - AIF3TX1MIX Input 2 Source */
+	{ 0x00000783, 0x0080 }, /* R1923 (0x783) - AIF3TX1MIX Input 2 Volume */
+	{ 0x00000784, 0x0000 }, /* R1924 (0x784) - AIF3TX1MIX Input 3 Source */
+	{ 0x00000785, 0x0080 }, /* R1925 (0x785) - AIF3TX1MIX Input 3 Volume */
+	{ 0x00000786, 0x0000 }, /* R1926 (0x786) - AIF3TX1MIX Input 4 Source */
+	{ 0x00000787, 0x0080 }, /* R1927 (0x787) - AIF3TX1MIX Input 4 Volume */
+	{ 0x00000788, 0x0000 }, /* R1928 (0x788) - AIF3TX2MIX Input 1 Source */
+	{ 0x00000789, 0x0080 }, /* R1929 (0x789) - AIF3TX2MIX Input 1 Volume */
+	{ 0x0000078a, 0x0000 }, /* R1930 (0x78A) - AIF3TX2MIX Input 2 Source */
+	{ 0x0000078b, 0x0080 }, /* R1931 (0x78B) - AIF3TX2MIX Input 2 Volume */
+	{ 0x0000078c, 0x0000 }, /* R1932 (0x78C) - AIF3TX2MIX Input 3 Source */
+	{ 0x0000078d, 0x0080 }, /* R1933 (0x78D) - AIF3TX2MIX Input 3 Volume */
+	{ 0x0000078e, 0x0000 }, /* R1934 (0x78E) - AIF3TX2MIX Input 4 Source */
+	{ 0x0000078f, 0x0080 }, /* R1935 (0x78F) - AIF3TX2MIX Input 4 Volume */
+	{ 0x00000800, 0x0000 }, /* R2048 (0x800) - SPDIF1TX1MIX Input 1 Source */
+	{ 0x00000801, 0x0080 }, /* R2049 (0x801) - SPDIF1TX1MIX Input 1 Volume */
+	{ 0x00000808, 0x0000 }, /* R2056 (0x808) - SPDIF1TX2MIX Input 1 Source */
+	{ 0x00000809, 0x0080 }, /* R2057 (0x809) - SPDIF1TX2MIX Input 1 Volume */
+	{ 0x00000880, 0x0000 }, /* R2176 (0x880) - EQ1MIX Input 1 Source */
+	{ 0x00000881, 0x0080 }, /* R2177 (0x881) - EQ1MIX Input 1 Volume */
+	{ 0x00000882, 0x0000 }, /* R2178 (0x882) - EQ1MIX Input 2 Source */
+	{ 0x00000883, 0x0080 }, /* R2179 (0x883) - EQ1MIX Input 2 Volume */
+	{ 0x00000884, 0x0000 }, /* R2180 (0x884) - EQ1MIX Input 3 Source */
+	{ 0x00000885, 0x0080 }, /* R2181 (0x885) - EQ1MIX Input 3 Volume */
+	{ 0x00000886, 0x0000 }, /* R2182 (0x886) - EQ1MIX Input 4 Source */
+	{ 0x00000887, 0x0080 }, /* R2183 (0x887) - EQ1MIX Input 4 Volume */
+	{ 0x00000888, 0x0000 }, /* R2184 (0x888) - EQ2MIX Input 1 Source */
+	{ 0x00000889, 0x0080 }, /* R2185 (0x889) - EQ2MIX Input 1 Volume */
+	{ 0x0000088a, 0x0000 }, /* R2186 (0x88A) - EQ2MIX Input 2 Source */
+	{ 0x0000088b, 0x0080 }, /* R2187 (0x88B) - EQ2MIX Input 2 Volume */
+	{ 0x0000088c, 0x0000 }, /* R2188 (0x88C) - EQ2MIX Input 3 Source */
+	{ 0x0000088d, 0x0080 }, /* R2189 (0x88D) - EQ2MIX Input 3 Volume */
+	{ 0x0000088e, 0x0000 }, /* R2190 (0x88E) - EQ2MIX Input 4 Source */
+	{ 0x0000088f, 0x0080 }, /* R2191 (0x88F) - EQ2MIX Input 4 Volume */
+	{ 0x00000890, 0x0000 }, /* R2192 (0x890) - EQ3MIX Input 1 Source */
+	{ 0x00000891, 0x0080 }, /* R2193 (0x891) - EQ3MIX Input 1 Volume */
+	{ 0x00000892, 0x0000 }, /* R2194 (0x892) - EQ3MIX Input 2 Source */
+	{ 0x00000893, 0x0080 }, /* R2195 (0x893) - EQ3MIX Input 2 Volume */
+	{ 0x00000894, 0x0000 }, /* R2196 (0x894) - EQ3MIX Input 3 Source */
+	{ 0x00000895, 0x0080 }, /* R2197 (0x895) - EQ3MIX Input 3 Volume */
+	{ 0x00000896, 0x0000 }, /* R2198 (0x896) - EQ3MIX Input 4 Source */
+	{ 0x00000897, 0x0080 }, /* R2199 (0x897) - EQ3MIX Input 4 Volume */
+	{ 0x00000898, 0x0000 }, /* R2200 (0x898) - EQ4MIX Input 1 Source */
+	{ 0x00000899, 0x0080 }, /* R2201 (0x899) - EQ4MIX Input 1 Volume */
+	{ 0x0000089a, 0x0000 }, /* R2202 (0x89A) - EQ4MIX Input 2 Source */
+	{ 0x0000089b, 0x0080 }, /* R2203 (0x89B) - EQ4MIX Input 2 Volume */
+	{ 0x0000089c, 0x0000 }, /* R2204 (0x89C) - EQ4MIX Input 3 Source */
+	{ 0x0000089d, 0x0080 }, /* R2205 (0x89D) - EQ4MIX Input 3 Volume */
+	{ 0x0000089e, 0x0000 }, /* R2206 (0x89E) - EQ4MIX Input 4 Source */
+	{ 0x0000089f, 0x0080 }, /* R2207 (0x89F) - EQ4MIX Input 4 Volume */
+	{ 0x000008c0, 0x0000 }, /* R2240 (0x8C0) - DRC1LMIX Input 1 Source */
+	{ 0x000008c1, 0x0080 }, /* R2241 (0x8C1) - DRC1LMIX Input 1 Volume */
+	{ 0x000008c2, 0x0000 }, /* R2242 (0x8C2) - DRC1LMIX Input 2 Source */
+	{ 0x000008c3, 0x0080 }, /* R2243 (0x8C3) - DRC1LMIX Input 2 Volume */
+	{ 0x000008c4, 0x0000 }, /* R2244 (0x8C4) - DRC1LMIX Input 3 Source */
+	{ 0x000008c5, 0x0080 }, /* R2245 (0x8C5) - DRC1LMIX Input 3 Volume */
+	{ 0x000008c6, 0x0000 }, /* R2246 (0x8C6) - DRC1LMIX Input 4 Source */
+	{ 0x000008c7, 0x0080 }, /* R2247 (0x8C7) - DRC1LMIX Input 4 Volume */
+	{ 0x000008c8, 0x0000 }, /* R2248 (0x8C8) - DRC1RMIX Input 1 Source */
+	{ 0x000008c9, 0x0080 }, /* R2249 (0x8C9) - DRC1RMIX Input 1 Volume */
+	{ 0x000008ca, 0x0000 }, /* R2250 (0x8CA) - DRC1RMIX Input 2 Source */
+	{ 0x000008cb, 0x0080 }, /* R2251 (0x8CB) - DRC1RMIX Input 2 Volume */
+	{ 0x000008cc, 0x0000 }, /* R2252 (0x8CC) - DRC1RMIX Input 3 Source */
+	{ 0x000008cd, 0x0080 }, /* R2253 (0x8CD) - DRC1RMIX Input 3 Volume */
+	{ 0x000008ce, 0x0000 }, /* R2254 (0x8CE) - DRC1RMIX Input 4 Source */
+	{ 0x000008cf, 0x0080 }, /* R2255 (0x8CF) - DRC1RMIX Input 4 Volume */
+	{ 0x000008d0, 0x0000 }, /* R2256 (0x8D0) - DRC2LMIX Input 1 Source */
+	{ 0x000008d1, 0x0080 }, /* R2257 (0x8D1) - DRC2LMIX Input 1 Volume */
+	{ 0x000008d2, 0x0000 }, /* R2258 (0x8D2) - DRC2LMIX Input 2 Source */
+	{ 0x000008d3, 0x0080 }, /* R2259 (0x8D3) - DRC2LMIX Input 2 Volume */
+	{ 0x000008d4, 0x0000 }, /* R2260 (0x8D4) - DRC2LMIX Input 3 Source */
+	{ 0x000008d5, 0x0080 }, /* R2261 (0x8D5) - DRC2LMIX Input 3 Volume */
+	{ 0x000008d6, 0x0000 }, /* R2262 (0x8D6) - DRC2LMIX Input 4 Source */
+	{ 0x000008d7, 0x0080 }, /* R2263 (0x8D7) - DRC2LMIX Input 4 Volume */
+	{ 0x000008d8, 0x0000 }, /* R2264 (0x8D8) - DRC2RMIX Input 1 Source */
+	{ 0x000008d9, 0x0080 }, /* R2265 (0x8D9) - DRC2RMIX Input 1 Volume */
+	{ 0x000008da, 0x0000 }, /* R2266 (0x8DA) - DRC2RMIX Input 2 Source */
+	{ 0x000008db, 0x0080 }, /* R2267 (0x8DB) - DRC2RMIX Input 2 Volume */
+	{ 0x000008dc, 0x0000 }, /* R2268 (0x8DC) - DRC2RMIX Input 3 Source */
+	{ 0x000008dd, 0x0080 }, /* R2269 (0x8DD) - DRC2RMIX Input 3 Volume */
+	{ 0x000008de, 0x0000 }, /* R2270 (0x8DE) - DRC2RMIX Input 4 Source */
+	{ 0x000008df, 0x0080 }, /* R2271 (0x8DF) - DRC2RMIX Input 4 Volume */
+	{ 0x00000900, 0x0000 }, /* R2304 (0x900) - HPLP1MIX Input 1 Source */
+	{ 0x00000901, 0x0080 }, /* R2305 (0x901) - HPLP1MIX Input 1 Volume */
+	{ 0x00000902, 0x0000 }, /* R2306 (0x902) - HPLP1MIX Input 2 Source */
+	{ 0x00000903, 0x0080 }, /* R2307 (0x903) - HPLP1MIX Input 2 Volume */
+	{ 0x00000904, 0x0000 }, /* R2308 (0x904) - HPLP1MIX Input 3 Source */
+	{ 0x00000905, 0x0080 }, /* R2309 (0x905) - HPLP1MIX Input 3 Volume */
+	{ 0x00000906, 0x0000 }, /* R2310 (0x906) - HPLP1MIX Input 4 Source */
+	{ 0x00000907, 0x0080 }, /* R2311 (0x907) - HPLP1MIX Input 4 Volume */
+	{ 0x00000908, 0x0000 }, /* R2312 (0x908) - HPLP2MIX Input 1 Source */
+	{ 0x00000909, 0x0080 }, /* R2313 (0x909) - HPLP2MIX Input 1 Volume */
+	{ 0x0000090a, 0x0000 }, /* R2314 (0x90A) - HPLP2MIX Input 2 Source */
+	{ 0x0000090b, 0x0080 }, /* R2315 (0x90B) - HPLP2MIX Input 2 Volume */
+	{ 0x0000090c, 0x0000 }, /* R2316 (0x90C) - HPLP2MIX Input 3 Source */
+	{ 0x0000090d, 0x0080 }, /* R2317 (0x90D) - HPLP2MIX Input 3 Volume */
+	{ 0x0000090e, 0x0000 }, /* R2318 (0x90E) - HPLP2MIX Input 4 Source */
+	{ 0x0000090f, 0x0080 }, /* R2319 (0x90F) - HPLP2MIX Input 4 Volume */
+	{ 0x00000910, 0x0000 }, /* R2320 (0x910) - HPLP3MIX Input 1 Source */
+	{ 0x00000911, 0x0080 }, /* R2321 (0x911) - HPLP3MIX Input 1 Volume */
+	{ 0x00000912, 0x0000 }, /* R2322 (0x912) - HPLP3MIX Input 2 Source */
+	{ 0x00000913, 0x0080 }, /* R2323 (0x913) - HPLP3MIX Input 2 Volume */
+	{ 0x00000914, 0x0000 }, /* R2324 (0x914) - HPLP3MIX Input 3 Source */
+	{ 0x00000915, 0x0080 }, /* R2325 (0x915) - HPLP3MIX Input 3 Volume */
+	{ 0x00000916, 0x0000 }, /* R2326 (0x916) - HPLP3MIX Input 4 Source */
+	{ 0x00000917, 0x0080 }, /* R2327 (0x917) - HPLP3MIX Input 4 Volume */
+	{ 0x00000918, 0x0000 }, /* R2328 (0x918) - HPLP4MIX Input 1 Source */
+	{ 0x00000919, 0x0080 }, /* R2329 (0x919) - HPLP4MIX Input 1 Volume */
+	{ 0x0000091a, 0x0000 }, /* R2330 (0x91A) - HPLP4MIX Input 2 Source */
+	{ 0x0000091b, 0x0080 }, /* R2331 (0x91B) - HPLP4MIX Input 2 Volume */
+	{ 0x0000091c, 0x0000 }, /* R2332 (0x91C) - HPLP4MIX Input 3 Source */
+	{ 0x0000091d, 0x0080 }, /* R2333 (0x91D) - HPLP4MIX Input 3 Volume */
+	{ 0x0000091e, 0x0000 }, /* R2334 (0x91E) - HPLP4MIX Input 4 Source */
+	{ 0x0000091f, 0x0080 }, /* R2335 (0x91F) - HPLP4MIX Input 4 Volume */
+	{ 0x00000940, 0x0000 }, /* R2368 (0x940) - DSP1LMIX Input 1 Source */
+	{ 0x00000941, 0x0080 }, /* R2369 (0x941) - DSP1LMIX Input 1 Volume */
+	{ 0x00000942, 0x0000 }, /* R2370 (0x942) - DSP1LMIX Input 2 Source */
+	{ 0x00000943, 0x0080 }, /* R2371 (0x943) - DSP1LMIX Input 2 Volume */
+	{ 0x00000944, 0x0000 }, /* R2372 (0x944) - DSP1LMIX Input 3 Source */
+	{ 0x00000945, 0x0080 }, /* R2373 (0x945) - DSP1LMIX Input 3 Volume */
+	{ 0x00000946, 0x0000 }, /* R2374 (0x946) - DSP1LMIX Input 4 Source */
+	{ 0x00000947, 0x0080 }, /* R2375 (0x947) - DSP1LMIX Input 4 Volume */
+	{ 0x00000948, 0x0000 }, /* R2376 (0x948) - DSP1RMIX Input 1 Source */
+	{ 0x00000949, 0x0080 }, /* R2377 (0x949) - DSP1RMIX Input 1 Volume */
+	{ 0x0000094a, 0x0000 }, /* R2378 (0x94A) - DSP1RMIX Input 2 Source */
+	{ 0x0000094b, 0x0080 }, /* R2379 (0x94B) - DSP1RMIX Input 2 Volume */
+	{ 0x0000094c, 0x0000 }, /* R2380 (0x94C) - DSP1RMIX Input 3 Source */
+	{ 0x0000094d, 0x0080 }, /* R2381 (0x94D) - DSP1RMIX Input 3 Volume */
+	{ 0x0000094e, 0x0000 }, /* R2382 (0x94E) - DSP1RMIX Input 4 Source */
+	{ 0x0000094f, 0x0080 }, /* R2383 (0x94F) - DSP1RMIX Input 4 Volume */
+	{ 0x00000950, 0x0000 }, /* R2384 (0x950) - DSP1AUX1MIX Input 1 Source */
+	{ 0x00000958, 0x0000 }, /* R2392 (0x958) - DSP1AUX2MIX Input 1 Source */
+	{ 0x00000960, 0x0000 }, /* R2400 (0x960) - DSP1AUX3MIX Input 1 Source */
+	{ 0x00000968, 0x0000 }, /* R2408 (0x968) - DSP1AUX4MIX Input 1 Source */
+	{ 0x00000970, 0x0000 }, /* R2416 (0x970) - DSP1AUX5MIX Input 1 Source */
+	{ 0x00000978, 0x0000 }, /* R2424 (0x978) - DSP1AUX6MIX Input 1 Source */
+	{ 0x00000b00, 0x0000 }, /* R2816 (0xB00) - ISRC1DEC1MIX Input 1 Source */
+	{ 0x00000b08, 0x0000 }, /* R2824 (0xB08) - ISRC1DEC2MIX Input 1 Source */
+	{ 0x00000b10, 0x0000 }, /* R2832 (0xB10) - ISRC1DEC3MIX Input 1 Source */
+	{ 0x00000b18, 0x0000 }, /* R2840 (0xB18) - ISRC1DEC4MIX Input 1 Source */
+	{ 0x00000b20, 0x0000 }, /* R2848 (0xB20) - ISRC1INT1MIX Input 1 Source */
+	{ 0x00000b28, 0x0000 }, /* R2856 (0xB28) - ISRC1INT2MIX Input 1 Source */
+	{ 0x00000b30, 0x0000 }, /* R2864 (0xB30) - ISRC1INT3MIX Input 1 Source */
+	{ 0x00000b38, 0x0000 }, /* R2872 (0xB38) - ISRC1INT4MIX Input 1 Source */
+	{ 0x00000b40, 0x0000 }, /* R2880 (0xB40) - ISRC2DEC1MIX Input 1 Source */
+	{ 0x00000b48, 0x0000 }, /* R2888 (0xB48) - ISRC2DEC2MIX Input 1 Source */
+	{ 0x00000b50, 0x0000 }, /* R2896 (0xB50) - ISRC2DEC3MIX Input 1 Source */
+	{ 0x00000b58, 0x0000 }, /* R2904 (0xB58) - ISRC2DEC4MIX Input 1 Source */
+	{ 0x00000b60, 0x0000 }, /* R2912 (0xB60) - ISRC2INT1MIX Input 1 Source */
+	{ 0x00000b68, 0x0000 }, /* R2920 (0xB68) - ISRC2INT2MIX Input 1 Source */
+	{ 0x00000b70, 0x0000 }, /* R2928 (0xB70) - ISRC2INT3MIX Input 1 Source */
+	{ 0x00000b78, 0x0000 }, /* R2936 (0xB78) - ISRC2INT4MIX Input 1 Source */
+	{ 0x00000e00, 0x0000 }, /* R3584 (0xE00) - FX Ctrl 1 */
+	{ 0x00000e10, 0x6318 }, /* R3600 (0xE10) - EQ1 1 */
+	{ 0x00000e11, 0x6300 }, /* R3601 (0xE11) - EQ1 2 */
+	{ 0x00000e12, 0x0fc8 }, /* R3602 (0xE12) - EQ1 3 */
+	{ 0x00000e13, 0x03fe }, /* R3603 (0xE13) - EQ1 4 */
+	{ 0x00000e14, 0x00e0 }, /* R3604 (0xE14) - EQ1 5 */
+	{ 0x00000e15, 0x1ec4 }, /* R3605 (0xE15) - EQ1 6 */
+	{ 0x00000e16, 0xf136 }, /* R3606 (0xE16) - EQ1 7 */
+	{ 0x00000e17, 0x0409 }, /* R3607 (0xE17) - EQ1 8 */
+	{ 0x00000e18, 0x04cc }, /* R3608 (0xE18) - EQ1 9 */
+	{ 0x00000e19, 0x1c9b }, /* R3609 (0xE19) - EQ1 10 */
+	{ 0x00000e1a, 0xf337 }, /* R3610 (0xE1A) - EQ1 11 */
+	{ 0x00000e1b, 0x040b }, /* R3611 (0xE1B) - EQ1 12 */
+	{ 0x00000e1c, 0x0cbb }, /* R3612 (0xE1C) - EQ1 13 */
+	{ 0x00000e1d, 0x16f8 }, /* R3613 (0xE1D) - EQ1 14 */
+	{ 0x00000e1e, 0xf7d9 }, /* R3614 (0xE1E) - EQ1 15 */
+	{ 0x00000e1f, 0x040a }, /* R3615 (0xE1F) - EQ1 16 */
+	{ 0x00000e20, 0x1f14 }, /* R3616 (0xE20) - EQ1 17 */
+	{ 0x00000e21, 0x058c }, /* R3617 (0xE21) - EQ1 18 */
+	{ 0x00000e22, 0x0563 }, /* R3618 (0xE22) - EQ1 19 */
+	{ 0x00000e23, 0x4000 }, /* R3619 (0xE23) - EQ1 20 */
+	{ 0x00000e24, 0x0b75 }, /* R3620 (0xE24) - EQ1 21 */
+	{ 0x00000e26, 0x6318 }, /* R3622 (0xE26) - EQ2 1 */
+	{ 0x00000e27, 0x6300 }, /* R3623 (0xE27) - EQ2 2 */
+	{ 0x00000e28, 0x0fc8 }, /* R3624 (0xE28) - EQ2 3 */
+	{ 0x00000e29, 0x03fe }, /* R3625 (0xE29) - EQ2 4 */
+	{ 0x00000e2a, 0x00e0 }, /* R3626 (0xE2A) - EQ2 5 */
+	{ 0x00000e2b, 0x1ec4 }, /* R3627 (0xE2B) - EQ2 6 */
+	{ 0x00000e2c, 0xf136 }, /* R3628 (0xE2C) - EQ2 7 */
+	{ 0x00000e2d, 0x0409 }, /* R3629 (0xE2D) - EQ2 8 */
+	{ 0x00000e2e, 0x04cc }, /* R3630 (0xE2E) - EQ2 9 */
+	{ 0x00000e2f, 0x1c9b }, /* R3631 (0xE2F) - EQ2 10 */
+	{ 0x00000e30, 0xf337 }, /* R3632 (0xE30) - EQ2 11 */
+	{ 0x00000e31, 0x040b }, /* R3633 (0xE31) - EQ2 12 */
+	{ 0x00000e32, 0x0cbb }, /* R3634 (0xE32) - EQ2 13 */
+	{ 0x00000e33, 0x16f8 }, /* R3635 (0xE33) - EQ2 14 */
+	{ 0x00000e34, 0xf7d9 }, /* R3636 (0xE34) - EQ2 15 */
+	{ 0x00000e35, 0x040a }, /* R3637 (0xE35) - EQ2 16 */
+	{ 0x00000e36, 0x1f14 }, /* R3638 (0xE36) - EQ2 17 */
+	{ 0x00000e37, 0x058c }, /* R3639 (0xE37) - EQ2 18 */
+	{ 0x00000e38, 0x0563 }, /* R3640 (0xE38) - EQ2 19 */
+	{ 0x00000e39, 0x4000 }, /* R3641 (0xE39) - EQ2 20 */
+	{ 0x00000e3a, 0x0b75 }, /* R3642 (0xE3A) - EQ2 21 */
+	{ 0x00000e3c, 0x6318 }, /* R3644 (0xE3C) - EQ3 1 */
+	{ 0x00000e3d, 0x6300 }, /* R3645 (0xE3D) - EQ3 2 */
+	{ 0x00000e3e, 0x0fc8 }, /* R3646 (0xE3E) - EQ3 3 */
+	{ 0x00000e3f, 0x03fe }, /* R3647 (0xE3F) - EQ3 4 */
+	{ 0x00000e40, 0x00e0 }, /* R3648 (0xE40) - EQ3 5 */
+	{ 0x00000e41, 0x1ec4 }, /* R3649 (0xE41) - EQ3 6 */
+	{ 0x00000e42, 0xf136 }, /* R3650 (0xE42) - EQ3 7 */
+	{ 0x00000e43, 0x0409 }, /* R3651 (0xE43) - EQ3 8 */
+	{ 0x00000e44, 0x04cc }, /* R3652 (0xE44) - EQ3 9 */
+	{ 0x00000e45, 0x1c9b }, /* R3653 (0xE45) - EQ3 10 */
+	{ 0x00000e46, 0xf337 }, /* R3654 (0xE46) - EQ3 11 */
+	{ 0x00000e47, 0x040b }, /* R3655 (0xE47) - EQ3 12 */
+	{ 0x00000e48, 0x0cbb }, /* R3656 (0xE48) - EQ3 13 */
+	{ 0x00000e49, 0x16f8 }, /* R3657 (0xE49) - EQ3 14 */
+	{ 0x00000e4a, 0xf7d9 }, /* R3658 (0xE4A) - EQ3 15 */
+	{ 0x00000e4b, 0x040a }, /* R3659 (0xE4B) - EQ3 16 */
+	{ 0x00000e4c, 0x1f14 }, /* R3660 (0xE4C) - EQ3 17 */
+	{ 0x00000e4d, 0x058c }, /* R3661 (0xE4D) - EQ3 18 */
+	{ 0x00000e4e, 0x0563 }, /* R3662 (0xE4E) - EQ3 19 */
+	{ 0x00000e4f, 0x4000 }, /* R3663 (0xE4F) - EQ3 20 */
+	{ 0x00000e50, 0x0b75 }, /* R3664 (0xE50) - EQ3 21 */
+	{ 0x00000e52, 0x6318 }, /* R3666 (0xE52) - EQ4 1 */
+	{ 0x00000e53, 0x6300 }, /* R3667 (0xE53) - EQ4 2 */
+	{ 0x00000e54, 0x0fc8 }, /* R3668 (0xE54) - EQ4 3 */
+	{ 0x00000e55, 0x03fe }, /* R3669 (0xE55) - EQ4 4 */
+	{ 0x00000e56, 0x00e0 }, /* R3670 (0xE56) - EQ4 5 */
+	{ 0x00000e57, 0x1ec4 }, /* R3671 (0xE57) - EQ4 6 */
+	{ 0x00000e58, 0xf136 }, /* R3672 (0xE58) - EQ4 7 */
+	{ 0x00000e59, 0x0409 }, /* R3673 (0xE59) - EQ4 8 */
+	{ 0x00000e5a, 0x04cc }, /* R3674 (0xE5A) - EQ4 9 */
+	{ 0x00000e5b, 0x1c9b }, /* R3675 (0xE5B) - EQ4 10 */
+	{ 0x00000e5c, 0xf337 }, /* R3676 (0xE5C) - EQ4 11 */
+	{ 0x00000e5d, 0x040b }, /* R3677 (0xE5D) - EQ4 12 */
+	{ 0x00000e5e, 0x0cbb }, /* R3678 (0xE5E) - EQ4 13 */
+	{ 0x00000e5f, 0x16f8 }, /* R3679 (0xE5F) - EQ4 14 */
+	{ 0x00000e60, 0xf7d9 }, /* R3680 (0xE60) - EQ4 15 */
+	{ 0x00000e61, 0x040a }, /* R3681 (0xE61) - EQ4 16 */
+	{ 0x00000e62, 0x1f14 }, /* R3682 (0xE62) - EQ4 17 */
+	{ 0x00000e63, 0x058c }, /* R3683 (0xE63) - EQ4 18 */
+	{ 0x00000e64, 0x0563 }, /* R3684 (0xE64) - EQ4 19 */
+	{ 0x00000e65, 0x4000 }, /* R3685 (0xE65) - EQ4 20 */
+	{ 0x00000e66, 0x0b75 }, /* R3686 (0xE66) - EQ4 21 */
+	{ 0x00000e80, 0x0018 }, /* R3712 (0xE80) - DRC1 Ctrl 1 */
+	{ 0x00000e81, 0x0933 }, /* R3713 (0xE81) - DRC1 Ctrl 2 */
+	{ 0x00000e82, 0x0018 }, /* R3714 (0xE82) - DRC1 Ctrl 3 */
+	{ 0x00000e83, 0x0000 }, /* R3715 (0xE83) - DRC1 Ctrl 4 */
+	{ 0x00000e84, 0x0000 }, /* R3716 (0xE84) - DRC1 Ctrl 5 */
+	{ 0x00000e88, 0x0018 }, /* R3720 (0xE88) - DRC2 Ctrl 1 */
+	{ 0x00000e89, 0x0933 }, /* R3721 (0xE89) - DRC2 Ctrl 2 */
+	{ 0x00000e8a, 0x0018 }, /* R3722 (0xE8A) - DRC2 Ctrl 3 */
+	{ 0x00000e8b, 0x0000 }, /* R3723 (0xE8B) - DRC2 Ctrl 4 */
+	{ 0x00000e8c, 0x0000 }, /* R3724 (0xE8C) - DRC2 Ctrl 5 */
+	{ 0x00000ec0, 0x0000 }, /* R3776 (0xEC0) - HPLPF1 1 */
+	{ 0x00000ec1, 0x0000 }, /* R3777 (0xEC1) - HPLPF1 2 */
+	{ 0x00000ec4, 0x0000 }, /* R3780 (0xEC4) - HPLPF2 1 */
+	{ 0x00000ec5, 0x0000 }, /* R3781 (0xEC5) - HPLPF2 2 */
+	{ 0x00000ec8, 0x0000 }, /* R3784 (0xEC8) - HPLPF3 1 */
+	{ 0x00000ec9, 0x0000 }, /* R3785 (0xEC9) - HPLPF3 2 */
+	{ 0x00000ecc, 0x0000 }, /* R3788 (0xECC) - HPLPF4 1 */
+	{ 0x00000ecd, 0x0000 }, /* R3789 (0xECD) - HPLPF4 2 */
+	{ 0x00000ef0, 0x0000 }, /* R3824 (0xEF0) - ISRC1 Ctrl 1 */
+	{ 0x00000ef1, 0x0001 }, /* R3825 (0xEF1) - ISRC1 Ctrl 2 */
+	{ 0x00000ef2, 0x0000 }, /* R3826 (0xEF2) - ISRC1 Ctrl 3 */
+	{ 0x00000ef3, 0x0000 }, /* R3827 (0xEF3) - ISRC2 Ctrl 1 */
+	{ 0x00000ef4, 0x0001 }, /* R3828 (0xEF4) - ISRC2 Ctrl 2 */
+	{ 0x00000ef5, 0x0000 }, /* R3829 (0xEF5) - ISRC2 Ctrl 3 */
+	{ 0x00001700, 0x2801 }, /* R5888 (0x1700) - GPIO1 Ctrl 1 */
+	{ 0x00001701, 0xe800 }, /* R5889 (0x1701) - GPIO1 Ctrl 2 */
+	{ 0x00001702, 0x2801 }, /* R5890 (0x1702) - GPIO2 Ctrl 1 */
+	{ 0x00001703, 0xe800 }, /* R5891 (0x1703) - GPIO2 Ctrl 2 */
+	{ 0x00001704, 0x2801 }, /* R5892 (0x1704) - GPIO3 Ctrl 1 */
+	{ 0x00001705, 0xe800 }, /* R5893 (0x1705) - GPIO3 Ctrl 2 */
+	{ 0x00001706, 0x2801 }, /* R5894 (0x1706) - GPIO4 Ctrl 1 */
+	{ 0x00001707, 0xe800 }, /* R5895 (0x1707) - GPIO4 Ctrl 2 */
+	{ 0x00001708, 0x2801 }, /* R5896 (0x1708) - GPIO5 Ctrl 1 */
+	{ 0x00001709, 0xe800 }, /* R5897 (0x1709) - GPIO5 Ctrl 2 */
+	{ 0x0000170a, 0x2801 }, /* R5898 (0x170A) - GPIO6 Ctrl 1 */
+	{ 0x0000170b, 0xe800 }, /* R5899 (0x170B) - GPIO6 Ctrl 2 */
+	{ 0x0000170c, 0x2801 }, /* R5900 (0x170C) - GPIO7 Ctrl 1 */
+	{ 0x0000170d, 0xe800 }, /* R5901 (0x170D) - GPIO7 Ctrl 2 */
+	{ 0x0000170e, 0x2801 }, /* R5902 (0x170E) - GPIO8 Ctrl 1 */
+	{ 0x0000170f, 0xe800 }, /* R5903 (0x170F) - GPIO8 Ctrl 2 */
+	{ 0x00001710, 0x2801 }, /* R5904 (0x1710) - GPIO9 Ctrl 1 */
+	{ 0x00001711, 0xe800 }, /* R5905 (0x1711) - GPIO9 Ctrl 2 */
+	{ 0x00001712, 0x2801 }, /* R5906 (0x1712) - GPIO10 Ctrl 1 */
+	{ 0x00001713, 0xe800 }, /* R5907 (0x1713) - GPIO10 Ctrl 2 */
+	{ 0x00001714, 0x2801 }, /* R5908 (0x1714) - GPIO11 Ctrl 1 */
+	{ 0x00001715, 0xe800 }, /* R5909 (0x1715) - GPIO11 Ctrl 2 */
+	{ 0x00001716, 0x2801 }, /* R5910 (0x1716) - GPIO12 Ctrl 1 */
+	{ 0x00001717, 0xe800 }, /* R5911 (0x1717) - GPIO12 Ctrl 2 */
+	{ 0x00001718, 0x2801 }, /* R5912 (0x1718) - GPIO13 Ctrl 1 */
+	{ 0x00001719, 0xe800 }, /* R5913 (0x1719) - GPIO13 Ctrl 2 */
+	{ 0x0000171a, 0x2801 }, /* R5914 (0x171A) - GPIO14 Ctrl 1 */
+	{ 0x0000171b, 0xe800 }, /* R5915 (0x171B) - GPIO14 Ctrl 2 */
+	{ 0x0000171c, 0x2801 }, /* R5916 (0x171C) - GPIO15 Ctrl 1 */
+	{ 0x0000171d, 0xe800 }, /* R5917 (0x171D) - GPIO15 Ctrl 2 */
+	{ 0x00001840, 0xffff }, /* R6208 (0x1840) - IRQ1 Mask 1 */
+	{ 0x00001841, 0xffff }, /* R6209 (0x1841) - IRQ1 Mask 2 */
+	{ 0x00001842, 0xffff }, /* R6210 (0x1842) - IRQ1 Mask 3 */
+	{ 0x00001843, 0xffff }, /* R6211 (0x1843) - IRQ1 Mask 4 */
+	{ 0x00001844, 0xffff }, /* R6212 (0x1844) - IRQ1 Mask 5 */
+	{ 0x00001845, 0xffff }, /* R6213 (0x1845) - IRQ1 Mask 6 */
+	{ 0x00001846, 0xffff }, /* R6214 (0x1846) - IRQ1 Mask 7 */
+	{ 0x00001847, 0xffff }, /* R6215 (0x1847) - IRQ1 Mask 8 */
+	{ 0x00001848, 0xffff }, /* R6216 (0x1848) - IRQ1 Mask 9 */
+	{ 0x00001849, 0xffff }, /* R6217 (0x1849) - IRQ1 Mask 10 */
+	{ 0x0000184a, 0xffff }, /* R6218 (0x184A) - IRQ1 Mask 11 */
+	{ 0x0000184b, 0xffff }, /* R6219 (0x184B) - IRQ1 Mask 12 */
+	{ 0x0000184c, 0xffff }, /* R6220 (0x184C) - IRQ1 Mask 13 */
+	{ 0x0000184d, 0xffff }, /* R6221 (0x184D) - IRQ1 Mask 14 */
+	{ 0x0000184e, 0xffff }, /* R6222 (0x184E) - IRQ1 Mask 15 */
+	{ 0x0000184f, 0xffff }, /* R6223 (0x184F) - IRQ1 Mask 16 */
+	{ 0x00001850, 0xffff }, /* R6224 (0x1850) - IRQ1 Mask 17 */
+	{ 0x00001851, 0xffff }, /* R6225 (0x1851) - IRQ1 Mask 18 */
+	{ 0x00001852, 0xffff }, /* R6226 (0x1852) - IRQ1 Mask 19 */
+	{ 0x00001853, 0xffff }, /* R6227 (0x1853) - IRQ1 Mask 20 */
+	{ 0x00001854, 0xffff }, /* R6228 (0x1854) - IRQ1 Mask 21 */
+	{ 0x00001855, 0xffff }, /* R6229 (0x1855) - IRQ1 Mask 22 */
+	{ 0x00001856, 0xffff }, /* R6230 (0x1856) - IRQ1 Mask 23 */
+	{ 0x00001857, 0xffff }, /* R6231 (0x1857) - IRQ1 Mask 24 */
+	{ 0x00001858, 0xffff }, /* R6232 (0x1858) - IRQ1 Mask 25 */
+	{ 0x00001859, 0xffff }, /* R6233 (0x1859) - IRQ1 Mask 26 */
+	{ 0x0000185a, 0xffff }, /* R6234 (0x185A) - IRQ1 Mask 27 */
+	{ 0x0000185b, 0xffff }, /* R6235 (0x185B) - IRQ1 Mask 28 */
+	{ 0x0000185c, 0xffff }, /* R6236 (0x185C) - IRQ1 Mask 29 */
+	{ 0x0000185d, 0xffff }, /* R6237 (0x185D) - IRQ1 Mask 30 */
+	{ 0x0000185e, 0xffff }, /* R6238 (0x185E) - IRQ1 Mask 31 */
+	{ 0x0000185f, 0xffff }, /* R6239 (0x185F) - IRQ1 Mask 32 */
+	{ 0x00001860, 0xffff }, /* R6240 (0x1860) - IRQ1 Mask 33 */
+	{ 0x00001a06, 0x0000 }, /* R6662 (0x1A06) - Interrupt Debounce 7 */
+	{ 0x00001a80, 0x4400 }, /* R6784 (0x1A80) - IRQ1 Ctrl */
+};
+
+static bool cs47l15_is_adsp_memory(struct device *dev, unsigned int reg)
+{
+	switch (reg) {
+	case 0x080000 ... 0x088ffe:
+	case 0x0a0000 ... 0x0a9ffe:
+	case 0x0c0000 ... 0x0c1ffe:
+	case 0x0e0000 ... 0x0e1ffe:
+		return true;
+	default:
+		return false;
+	}
+}
+
+static bool cs47l15_16bit_readable_register(struct device *dev,
+					    unsigned int reg)
+{
+	switch (reg) {
+	case MADERA_SOFTWARE_RESET:
+	case MADERA_HARDWARE_REVISION:
+	case MADERA_WRITE_SEQUENCER_CTRL_0 ... MADERA_WRITE_SEQUENCER_CTRL_2:
+	case MADERA_TONE_GENERATOR_1 ... MADERA_TONE_GENERATOR_5:
+	case MADERA_PWM_DRIVE_1 ... MADERA_PWM_DRIVE_3:
+	case MADERA_SAMPLE_RATE_SEQUENCE_SELECT_1:
+	case MADERA_SAMPLE_RATE_SEQUENCE_SELECT_2:
+	case MADERA_SAMPLE_RATE_SEQUENCE_SELECT_3:
+	case MADERA_SAMPLE_RATE_SEQUENCE_SELECT_4:
+	case MADERA_ALWAYS_ON_TRIGGERS_SEQUENCE_SELECT_1:
+	case MADERA_ALWAYS_ON_TRIGGERS_SEQUENCE_SELECT_2:
+	case MADERA_HAPTICS_CONTROL_1 ... MADERA_HAPTICS_CONTROL_2:
+	case MADERA_HAPTICS_PHASE_1_INTENSITY:
+	case MADERA_HAPTICS_PHASE_1_DURATION:
+	case MADERA_HAPTICS_PHASE_2_INTENSITY:
+	case MADERA_HAPTICS_PHASE_2_DURATION:
+	case MADERA_HAPTICS_PHASE_3_INTENSITY:
+	case MADERA_HAPTICS_PHASE_3_DURATION:
+	case MADERA_HAPTICS_STATUS:
+	case MADERA_COMFORT_NOISE_GENERATOR:
+	case MADERA_CLOCK_32K_1:
+	case MADERA_SYSTEM_CLOCK_1:
+	case MADERA_SAMPLE_RATE_1 ... MADERA_SAMPLE_RATE_3:
+	case MADERA_SAMPLE_RATE_1_STATUS:
+	case MADERA_SAMPLE_RATE_2_STATUS:
+	case MADERA_SAMPLE_RATE_3_STATUS:
+	case MADERA_DSP_CLOCK_1:
+	case MADERA_DSP_CLOCK_2:
+	case MADERA_OUTPUT_SYSTEM_CLOCK:
+	case MADERA_RATE_ESTIMATOR_1 ... MADERA_RATE_ESTIMATOR_5:
+	case MADERA_FLL1_CONTROL_1 ... MADERA_FLL1_CONTROL_6:
+	case MADERA_FLL1_CONTROL_7:
+	case MADERA_FLL1_EFS_2:
+	case MADERA_FLL1_LOOP_FILTER_TEST_1:
+	case MADERA_FLL1_SYNCHRONISER_1 ... MADERA_FLL1_SYNCHRONISER_7:
+	case MADERA_FLL1_SPREAD_SPECTRUM:
+	case MADERA_FLL1_GPIO_CLOCK:
+	case MADERA_FLLAO_CONTROL_1:
+	case MADERA_FLLAO_CONTROL_2:
+	case MADERA_FLLAO_CONTROL_3:
+	case MADERA_FLLAO_CONTROL_4:
+	case MADERA_FLLAO_CONTROL_5:
+	case MADERA_FLLAO_CONTROL_6:
+	case MADERA_FLLAO_CONTROL_7:
+	case MADERA_FLLAO_CONTROL_8:
+	case MADERA_FLLAO_CONTROL_9:
+	case MADERA_FLLAO_CONTROL_10:
+	case MADERA_FLLAO_CONTROL_11:
+	case MADERA_MIC_BIAS_CTRL_1:
+	case MADERA_MIC_BIAS_CTRL_5:
+	case MADERA_HP_CTRL_1L:
+	case MADERA_HP_CTRL_1R:
+	case MADERA_ACCESSORY_DETECT_MODE_1:
+	case MADERA_HEADPHONE_DETECT_0:
+	case MADERA_HEADPHONE_DETECT_1:
+	case MADERA_HEADPHONE_DETECT_2:
+	case MADERA_HEADPHONE_DETECT_3:
+	case MADERA_HEADPHONE_DETECT_5:
+	case MADERA_MICD_CLAMP_CONTROL:
+	case MADERA_MIC_DETECT_1_CONTROL_0:
+	case MADERA_MIC_DETECT_1_CONTROL_1:
+	case MADERA_MIC_DETECT_1_CONTROL_2:
+	case MADERA_MIC_DETECT_1_CONTROL_3:
+	case MADERA_MIC_DETECT_1_LEVEL_1 ... MADERA_MIC_DETECT_1_LEVEL_4:
+	case MADERA_MIC_DETECT_1_CONTROL_4:
+	case MADERA_GP_SWITCH_1:
+	case MADERA_JACK_DETECT_ANALOGUE:
+	case MADERA_INPUT_ENABLES:
+	case MADERA_INPUT_ENABLES_STATUS:
+	case MADERA_INPUT_RATE:
+	case MADERA_INPUT_VOLUME_RAMP:
+	case MADERA_HPF_CONTROL:
+	case MADERA_IN1L_CONTROL:
+	case MADERA_ADC_DIGITAL_VOLUME_1L:
+	case MADERA_DMIC1L_CONTROL:
+	case MADERA_IN1L_RATE_CONTROL:
+	case MADERA_IN1R_CONTROL:
+	case MADERA_ADC_DIGITAL_VOLUME_1R:
+	case MADERA_DMIC1R_CONTROL:
+	case MADERA_IN1R_RATE_CONTROL:
+	case MADERA_IN2L_CONTROL:
+	case MADERA_ADC_DIGITAL_VOLUME_2L:
+	case MADERA_DMIC2L_CONTROL:
+	case MADERA_IN2L_RATE_CONTROL:
+	case MADERA_IN2R_CONTROL:
+	case MADERA_ADC_DIGITAL_VOLUME_2R:
+	case MADERA_DMIC2R_CONTROL:
+	case MADERA_IN2R_RATE_CONTROL:
+	case CS47L15_ADC_INT_BIAS:
+	case CS47L15_PGA_BIAS_SEL:
+	case MADERA_OUTPUT_ENABLES_1:
+	case MADERA_OUTPUT_STATUS_1:
+	case MADERA_RAW_OUTPUT_STATUS_1:
+	case MADERA_OUTPUT_RATE_1:
+	case MADERA_OUTPUT_VOLUME_RAMP:
+	case MADERA_OUTPUT_PATH_CONFIG_1L:
+	case MADERA_DAC_DIGITAL_VOLUME_1L:
+	case MADERA_OUTPUT_PATH_CONFIG_1:
+	case MADERA_NOISE_GATE_SELECT_1L:
+	case MADERA_OUTPUT_PATH_CONFIG_1R:
+	case MADERA_DAC_DIGITAL_VOLUME_1R:
+	case MADERA_NOISE_GATE_SELECT_1R:
+	case MADERA_OUTPUT_PATH_CONFIG_2:
+	case MADERA_OUTPUT_PATH_CONFIG_4L:
+	case MADERA_DAC_DIGITAL_VOLUME_4L:
+	case MADERA_NOISE_GATE_SELECT_4L:
+	case MADERA_OUTPUT_PATH_CONFIG_5L:
+	case MADERA_DAC_DIGITAL_VOLUME_5L:
+	case MADERA_NOISE_GATE_SELECT_5L:
+	case MADERA_OUTPUT_PATH_CONFIG_5R:
+	case MADERA_DAC_DIGITAL_VOLUME_5R:
+	case MADERA_NOISE_GATE_SELECT_5R:
+	case MADERA_DAC_AEC_CONTROL_1:
+	case MADERA_DAC_AEC_CONTROL_2:
+	case MADERA_NOISE_GATE_CONTROL:
+	case MADERA_PDM_SPK1_CTRL_1 ... MADERA_PDM_SPK1_CTRL_2:
+	case MADERA_HP1_SHORT_CIRCUIT_CTRL:
+	case MADERA_HP_TEST_CTRL_5:
+	case MADERA_HP_TEST_CTRL_6:
+	case MADERA_AIF1_BCLK_CTRL:
+	case MADERA_AIF1_TX_PIN_CTRL:
+	case MADERA_AIF1_RX_PIN_CTRL:
+	case MADERA_AIF1_RATE_CTRL:
+	case MADERA_AIF1_FORMAT:
+	case MADERA_AIF1_RX_BCLK_RATE:
+	case MADERA_AIF1_FRAME_CTRL_1 ... MADERA_AIF1_FRAME_CTRL_8:
+	case MADERA_AIF1_FRAME_CTRL_11 ... MADERA_AIF1_FRAME_CTRL_16:
+	case MADERA_AIF1_TX_ENABLES:
+	case MADERA_AIF1_RX_ENABLES:
+	case MADERA_AIF2_BCLK_CTRL:
+	case MADERA_AIF2_TX_PIN_CTRL:
+	case MADERA_AIF2_RX_PIN_CTRL:
+	case MADERA_AIF2_RATE_CTRL:
+	case MADERA_AIF2_FORMAT:
+	case MADERA_AIF2_RX_BCLK_RATE:
+	case MADERA_AIF2_FRAME_CTRL_1 ... MADERA_AIF2_FRAME_CTRL_6:
+	case MADERA_AIF2_FRAME_CTRL_11 ... MADERA_AIF2_FRAME_CTRL_14:
+	case MADERA_AIF2_TX_ENABLES:
+	case MADERA_AIF2_RX_ENABLES:
+	case MADERA_AIF3_BCLK_CTRL:
+	case MADERA_AIF3_TX_PIN_CTRL:
+	case MADERA_AIF3_RX_PIN_CTRL:
+	case MADERA_AIF3_RATE_CTRL:
+	case MADERA_AIF3_FORMAT:
+	case MADERA_AIF3_RX_BCLK_RATE:
+	case MADERA_AIF3_FRAME_CTRL_1 ... MADERA_AIF3_FRAME_CTRL_4:
+	case MADERA_AIF3_FRAME_CTRL_11 ... MADERA_AIF3_FRAME_CTRL_12:
+	case MADERA_AIF3_TX_ENABLES:
+	case MADERA_AIF3_RX_ENABLES:
+	case MADERA_SPD1_TX_CONTROL:
+	case MADERA_SPD1_TX_CHANNEL_STATUS_1:
+	case MADERA_SPD1_TX_CHANNEL_STATUS_2:
+	case MADERA_SPD1_TX_CHANNEL_STATUS_3:
+	case MADERA_PWM1MIX_INPUT_1_SOURCE:
+	case MADERA_PWM1MIX_INPUT_1_VOLUME:
+	case MADERA_PWM1MIX_INPUT_2_SOURCE:
+	case MADERA_PWM1MIX_INPUT_2_VOLUME:
+	case MADERA_PWM1MIX_INPUT_3_SOURCE:
+	case MADERA_PWM1MIX_INPUT_3_VOLUME:
+	case MADERA_PWM1MIX_INPUT_4_SOURCE:
+	case MADERA_PWM1MIX_INPUT_4_VOLUME:
+	case MADERA_PWM2MIX_INPUT_1_SOURCE:
+	case MADERA_PWM2MIX_INPUT_1_VOLUME:
+	case MADERA_PWM2MIX_INPUT_2_SOURCE:
+	case MADERA_PWM2MIX_INPUT_2_VOLUME:
+	case MADERA_PWM2MIX_INPUT_3_SOURCE:
+	case MADERA_PWM2MIX_INPUT_3_VOLUME:
+	case MADERA_PWM2MIX_INPUT_4_SOURCE:
+	case MADERA_PWM2MIX_INPUT_4_VOLUME:
+	case MADERA_OUT1LMIX_INPUT_1_SOURCE:
+	case MADERA_OUT1LMIX_INPUT_1_VOLUME:
+	case MADERA_OUT1LMIX_INPUT_2_SOURCE:
+	case MADERA_OUT1LMIX_INPUT_2_VOLUME:
+	case MADERA_OUT1LMIX_INPUT_3_SOURCE:
+	case MADERA_OUT1LMIX_INPUT_3_VOLUME:
+	case MADERA_OUT1LMIX_INPUT_4_SOURCE:
+	case MADERA_OUT1LMIX_INPUT_4_VOLUME:
+	case MADERA_OUT1RMIX_INPUT_1_SOURCE:
+	case MADERA_OUT1RMIX_INPUT_1_VOLUME:
+	case MADERA_OUT1RMIX_INPUT_2_SOURCE:
+	case MADERA_OUT1RMIX_INPUT_2_VOLUME:
+	case MADERA_OUT1RMIX_INPUT_3_SOURCE:
+	case MADERA_OUT1RMIX_INPUT_3_VOLUME:
+	case MADERA_OUT1RMIX_INPUT_4_SOURCE:
+	case MADERA_OUT1RMIX_INPUT_4_VOLUME:
+	case MADERA_OUT4LMIX_INPUT_1_SOURCE:
+	case MADERA_OUT4LMIX_INPUT_1_VOLUME:
+	case MADERA_OUT4LMIX_INPUT_2_SOURCE:
+	case MADERA_OUT4LMIX_INPUT_2_VOLUME:
+	case MADERA_OUT4LMIX_INPUT_3_SOURCE:
+	case MADERA_OUT4LMIX_INPUT_3_VOLUME:
+	case MADERA_OUT4LMIX_INPUT_4_SOURCE:
+	case MADERA_OUT4LMIX_INPUT_4_VOLUME:
+	case MADERA_OUT5LMIX_INPUT_1_SOURCE:
+	case MADERA_OUT5LMIX_INPUT_1_VOLUME:
+	case MADERA_OUT5LMIX_INPUT_2_SOURCE:
+	case MADERA_OUT5LMIX_INPUT_2_VOLUME:
+	case MADERA_OUT5LMIX_INPUT_3_SOURCE:
+	case MADERA_OUT5LMIX_INPUT_3_VOLUME:
+	case MADERA_OUT5LMIX_INPUT_4_SOURCE:
+	case MADERA_OUT5LMIX_INPUT_4_VOLUME:
+	case MADERA_OUT5RMIX_INPUT_1_SOURCE:
+	case MADERA_OUT5RMIX_INPUT_1_VOLUME:
+	case MADERA_OUT5RMIX_INPUT_2_SOURCE:
+	case MADERA_OUT5RMIX_INPUT_2_VOLUME:
+	case MADERA_OUT5RMIX_INPUT_3_SOURCE:
+	case MADERA_OUT5RMIX_INPUT_3_VOLUME:
+	case MADERA_OUT5RMIX_INPUT_4_SOURCE:
+	case MADERA_OUT5RMIX_INPUT_4_VOLUME:
+	case MADERA_AIF1TX1MIX_INPUT_1_SOURCE:
+	case MADERA_AIF1TX1MIX_INPUT_1_VOLUME:
+	case MADERA_AIF1TX1MIX_INPUT_2_SOURCE:
+	case MADERA_AIF1TX1MIX_INPUT_2_VOLUME:
+	case MADERA_AIF1TX1MIX_INPUT_3_SOURCE:
+	case MADERA_AIF1TX1MIX_INPUT_3_VOLUME:
+	case MADERA_AIF1TX1MIX_INPUT_4_SOURCE:
+	case MADERA_AIF1TX1MIX_INPUT_4_VOLUME:
+	case MADERA_AIF1TX2MIX_INPUT_1_SOURCE:
+	case MADERA_AIF1TX2MIX_INPUT_1_VOLUME:
+	case MADERA_AIF1TX2MIX_INPUT_2_SOURCE:
+	case MADERA_AIF1TX2MIX_INPUT_2_VOLUME:
+	case MADERA_AIF1TX2MIX_INPUT_3_SOURCE:
+	case MADERA_AIF1TX2MIX_INPUT_3_VOLUME:
+	case MADERA_AIF1TX2MIX_INPUT_4_SOURCE:
+	case MADERA_AIF1TX2MIX_INPUT_4_VOLUME:
+	case MADERA_AIF1TX3MIX_INPUT_1_SOURCE:
+	case MADERA_AIF1TX3MIX_INPUT_1_VOLUME:
+	case MADERA_AIF1TX3MIX_INPUT_2_SOURCE:
+	case MADERA_AIF1TX3MIX_INPUT_2_VOLUME:
+	case MADERA_AIF1TX3MIX_INPUT_3_SOURCE:
+	case MADERA_AIF1TX3MIX_INPUT_3_VOLUME:
+	case MADERA_AIF1TX3MIX_INPUT_4_SOURCE:
+	case MADERA_AIF1TX3MIX_INPUT_4_VOLUME:
+	case MADERA_AIF1TX4MIX_INPUT_1_SOURCE:
+	case MADERA_AIF1TX4MIX_INPUT_1_VOLUME:
+	case MADERA_AIF1TX4MIX_INPUT_2_SOURCE:
+	case MADERA_AIF1TX4MIX_INPUT_2_VOLUME:
+	case MADERA_AIF1TX4MIX_INPUT_3_SOURCE:
+	case MADERA_AIF1TX4MIX_INPUT_3_VOLUME:
+	case MADERA_AIF1TX4MIX_INPUT_4_SOURCE:
+	case MADERA_AIF1TX4MIX_INPUT_4_VOLUME:
+	case MADERA_AIF1TX5MIX_INPUT_1_SOURCE:
+	case MADERA_AIF1TX5MIX_INPUT_1_VOLUME:
+	case MADERA_AIF1TX5MIX_INPUT_2_SOURCE:
+	case MADERA_AIF1TX5MIX_INPUT_2_VOLUME:
+	case MADERA_AIF1TX5MIX_INPUT_3_SOURCE:
+	case MADERA_AIF1TX5MIX_INPUT_3_VOLUME:
+	case MADERA_AIF1TX5MIX_INPUT_4_SOURCE:
+	case MADERA_AIF1TX5MIX_INPUT_4_VOLUME:
+	case MADERA_AIF1TX6MIX_INPUT_1_SOURCE:
+	case MADERA_AIF1TX6MIX_INPUT_1_VOLUME:
+	case MADERA_AIF1TX6MIX_INPUT_2_SOURCE:
+	case MADERA_AIF1TX6MIX_INPUT_2_VOLUME:
+	case MADERA_AIF1TX6MIX_INPUT_3_SOURCE:
+	case MADERA_AIF1TX6MIX_INPUT_3_VOLUME:
+	case MADERA_AIF1TX6MIX_INPUT_4_SOURCE:
+	case MADERA_AIF1TX6MIX_INPUT_4_VOLUME:
+	case MADERA_AIF2TX1MIX_INPUT_1_SOURCE:
+	case MADERA_AIF2TX1MIX_INPUT_1_VOLUME:
+	case MADERA_AIF2TX1MIX_INPUT_2_SOURCE:
+	case MADERA_AIF2TX1MIX_INPUT_2_VOLUME:
+	case MADERA_AIF2TX1MIX_INPUT_3_SOURCE:
+	case MADERA_AIF2TX1MIX_INPUT_3_VOLUME:
+	case MADERA_AIF2TX1MIX_INPUT_4_SOURCE:
+	case MADERA_AIF2TX1MIX_INPUT_4_VOLUME:
+	case MADERA_AIF2TX2MIX_INPUT_1_SOURCE:
+	case MADERA_AIF2TX2MIX_INPUT_1_VOLUME:
+	case MADERA_AIF2TX2MIX_INPUT_2_SOURCE:
+	case MADERA_AIF2TX2MIX_INPUT_2_VOLUME:
+	case MADERA_AIF2TX2MIX_INPUT_3_SOURCE:
+	case MADERA_AIF2TX2MIX_INPUT_3_VOLUME:
+	case MADERA_AIF2TX2MIX_INPUT_4_SOURCE:
+	case MADERA_AIF2TX2MIX_INPUT_4_VOLUME:
+	case MADERA_AIF2TX3MIX_INPUT_1_SOURCE:
+	case MADERA_AIF2TX3MIX_INPUT_1_VOLUME:
+	case MADERA_AIF2TX3MIX_INPUT_2_SOURCE:
+	case MADERA_AIF2TX3MIX_INPUT_2_VOLUME:
+	case MADERA_AIF2TX3MIX_INPUT_3_SOURCE:
+	case MADERA_AIF2TX3MIX_INPUT_3_VOLUME:
+	case MADERA_AIF2TX3MIX_INPUT_4_SOURCE:
+	case MADERA_AIF2TX3MIX_INPUT_4_VOLUME:
+	case MADERA_AIF2TX4MIX_INPUT_1_SOURCE:
+	case MADERA_AIF2TX4MIX_INPUT_1_VOLUME:
+	case MADERA_AIF2TX4MIX_INPUT_2_SOURCE:
+	case MADERA_AIF2TX4MIX_INPUT_2_VOLUME:
+	case MADERA_AIF2TX4MIX_INPUT_3_SOURCE:
+	case MADERA_AIF2TX4MIX_INPUT_3_VOLUME:
+	case MADERA_AIF2TX4MIX_INPUT_4_SOURCE:
+	case MADERA_AIF2TX4MIX_INPUT_4_VOLUME:
+	case MADERA_AIF3TX1MIX_INPUT_1_SOURCE:
+	case MADERA_AIF3TX1MIX_INPUT_1_VOLUME:
+	case MADERA_AIF3TX1MIX_INPUT_2_SOURCE:
+	case MADERA_AIF3TX1MIX_INPUT_2_VOLUME:
+	case MADERA_AIF3TX1MIX_INPUT_3_SOURCE:
+	case MADERA_AIF3TX1MIX_INPUT_3_VOLUME:
+	case MADERA_AIF3TX1MIX_INPUT_4_SOURCE:
+	case MADERA_AIF3TX1MIX_INPUT_4_VOLUME:
+	case MADERA_AIF3TX2MIX_INPUT_1_SOURCE:
+	case MADERA_AIF3TX2MIX_INPUT_1_VOLUME:
+	case MADERA_AIF3TX2MIX_INPUT_2_SOURCE:
+	case MADERA_AIF3TX2MIX_INPUT_2_VOLUME:
+	case MADERA_AIF3TX2MIX_INPUT_3_SOURCE:
+	case MADERA_AIF3TX2MIX_INPUT_3_VOLUME:
+	case MADERA_AIF3TX2MIX_INPUT_4_SOURCE:
+	case MADERA_AIF3TX2MIX_INPUT_4_VOLUME:
+	case MADERA_SPDIF1TX1MIX_INPUT_1_SOURCE:
+	case MADERA_SPDIF1TX1MIX_INPUT_1_VOLUME:
+	case MADERA_SPDIF1TX2MIX_INPUT_1_SOURCE:
+	case MADERA_SPDIF1TX2MIX_INPUT_1_VOLUME:
+	case MADERA_EQ1MIX_INPUT_1_SOURCE:
+	case MADERA_EQ1MIX_INPUT_1_VOLUME:
+	case MADERA_EQ1MIX_INPUT_2_SOURCE:
+	case MADERA_EQ1MIX_INPUT_2_VOLUME:
+	case MADERA_EQ1MIX_INPUT_3_SOURCE:
+	case MADERA_EQ1MIX_INPUT_3_VOLUME:
+	case MADERA_EQ1MIX_INPUT_4_SOURCE:
+	case MADERA_EQ1MIX_INPUT_4_VOLUME:
+	case MADERA_EQ2MIX_INPUT_1_SOURCE:
+	case MADERA_EQ2MIX_INPUT_1_VOLUME:
+	case MADERA_EQ2MIX_INPUT_2_SOURCE:
+	case MADERA_EQ2MIX_INPUT_2_VOLUME:
+	case MADERA_EQ2MIX_INPUT_3_SOURCE:
+	case MADERA_EQ2MIX_INPUT_3_VOLUME:
+	case MADERA_EQ2MIX_INPUT_4_SOURCE:
+	case MADERA_EQ2MIX_INPUT_4_VOLUME:
+	case MADERA_EQ3MIX_INPUT_1_SOURCE:
+	case MADERA_EQ3MIX_INPUT_1_VOLUME:
+	case MADERA_EQ3MIX_INPUT_2_SOURCE:
+	case MADERA_EQ3MIX_INPUT_2_VOLUME:
+	case MADERA_EQ3MIX_INPUT_3_SOURCE:
+	case MADERA_EQ3MIX_INPUT_3_VOLUME:
+	case MADERA_EQ3MIX_INPUT_4_SOURCE:
+	case MADERA_EQ3MIX_INPUT_4_VOLUME:
+	case MADERA_EQ4MIX_INPUT_1_SOURCE:
+	case MADERA_EQ4MIX_INPUT_1_VOLUME:
+	case MADERA_EQ4MIX_INPUT_2_SOURCE:
+	case MADERA_EQ4MIX_INPUT_2_VOLUME:
+	case MADERA_EQ4MIX_INPUT_3_SOURCE:
+	case MADERA_EQ4MIX_INPUT_3_VOLUME:
+	case MADERA_EQ4MIX_INPUT_4_SOURCE:
+	case MADERA_EQ4MIX_INPUT_4_VOLUME:
+	case MADERA_DRC1LMIX_INPUT_1_SOURCE:
+	case MADERA_DRC1LMIX_INPUT_1_VOLUME:
+	case MADERA_DRC1LMIX_INPUT_2_SOURCE:
+	case MADERA_DRC1LMIX_INPUT_2_VOLUME:
+	case MADERA_DRC1LMIX_INPUT_3_SOURCE:
+	case MADERA_DRC1LMIX_INPUT_3_VOLUME:
+	case MADERA_DRC1LMIX_INPUT_4_SOURCE:
+	case MADERA_DRC1LMIX_INPUT_4_VOLUME:
+	case MADERA_DRC1RMIX_INPUT_1_SOURCE:
+	case MADERA_DRC1RMIX_INPUT_1_VOLUME:
+	case MADERA_DRC1RMIX_INPUT_2_SOURCE:
+	case MADERA_DRC1RMIX_INPUT_2_VOLUME:
+	case MADERA_DRC1RMIX_INPUT_3_SOURCE:
+	case MADERA_DRC1RMIX_INPUT_3_VOLUME:
+	case MADERA_DRC1RMIX_INPUT_4_SOURCE:
+	case MADERA_DRC1RMIX_INPUT_4_VOLUME:
+	case MADERA_DRC2LMIX_INPUT_1_SOURCE:
+	case MADERA_DRC2LMIX_INPUT_1_VOLUME:
+	case MADERA_DRC2LMIX_INPUT_2_SOURCE:
+	case MADERA_DRC2LMIX_INPUT_2_VOLUME:
+	case MADERA_DRC2LMIX_INPUT_3_SOURCE:
+	case MADERA_DRC2LMIX_INPUT_3_VOLUME:
+	case MADERA_DRC2LMIX_INPUT_4_SOURCE:
+	case MADERA_DRC2LMIX_INPUT_4_VOLUME:
+	case MADERA_DRC2RMIX_INPUT_1_SOURCE:
+	case MADERA_DRC2RMIX_INPUT_1_VOLUME:
+	case MADERA_DRC2RMIX_INPUT_2_SOURCE:
+	case MADERA_DRC2RMIX_INPUT_2_VOLUME:
+	case MADERA_DRC2RMIX_INPUT_3_SOURCE:
+	case MADERA_DRC2RMIX_INPUT_3_VOLUME:
+	case MADERA_DRC2RMIX_INPUT_4_SOURCE:
+	case MADERA_DRC2RMIX_INPUT_4_VOLUME:
+	case MADERA_HPLP1MIX_INPUT_1_SOURCE:
+	case MADERA_HPLP1MIX_INPUT_1_VOLUME:
+	case MADERA_HPLP1MIX_INPUT_2_SOURCE:
+	case MADERA_HPLP1MIX_INPUT_2_VOLUME:
+	case MADERA_HPLP1MIX_INPUT_3_SOURCE:
+	case MADERA_HPLP1MIX_INPUT_3_VOLUME:
+	case MADERA_HPLP1MIX_INPUT_4_SOURCE:
+	case MADERA_HPLP1MIX_INPUT_4_VOLUME:
+	case MADERA_HPLP2MIX_INPUT_1_SOURCE:
+	case MADERA_HPLP2MIX_INPUT_1_VOLUME:
+	case MADERA_HPLP2MIX_INPUT_2_SOURCE:
+	case MADERA_HPLP2MIX_INPUT_2_VOLUME:
+	case MADERA_HPLP2MIX_INPUT_3_SOURCE:
+	case MADERA_HPLP2MIX_INPUT_3_VOLUME:
+	case MADERA_HPLP2MIX_INPUT_4_SOURCE:
+	case MADERA_HPLP2MIX_INPUT_4_VOLUME:
+	case MADERA_HPLP3MIX_INPUT_1_SOURCE:
+	case MADERA_HPLP3MIX_INPUT_1_VOLUME:
+	case MADERA_HPLP3MIX_INPUT_2_SOURCE:
+	case MADERA_HPLP3MIX_INPUT_2_VOLUME:
+	case MADERA_HPLP3MIX_INPUT_3_SOURCE:
+	case MADERA_HPLP3MIX_INPUT_3_VOLUME:
+	case MADERA_HPLP3MIX_INPUT_4_SOURCE:
+	case MADERA_HPLP3MIX_INPUT_4_VOLUME:
+	case MADERA_HPLP4MIX_INPUT_1_SOURCE:
+	case MADERA_HPLP4MIX_INPUT_1_VOLUME:
+	case MADERA_HPLP4MIX_INPUT_2_SOURCE:
+	case MADERA_HPLP4MIX_INPUT_2_VOLUME:
+	case MADERA_HPLP4MIX_INPUT_3_SOURCE:
+	case MADERA_HPLP4MIX_INPUT_3_VOLUME:
+	case MADERA_HPLP4MIX_INPUT_4_SOURCE:
+	case MADERA_HPLP4MIX_INPUT_4_VOLUME:
+	case MADERA_DSP1LMIX_INPUT_1_SOURCE:
+	case MADERA_DSP1LMIX_INPUT_1_VOLUME:
+	case MADERA_DSP1LMIX_INPUT_2_SOURCE:
+	case MADERA_DSP1LMIX_INPUT_2_VOLUME:
+	case MADERA_DSP1LMIX_INPUT_3_SOURCE:
+	case MADERA_DSP1LMIX_INPUT_3_VOLUME:
+	case MADERA_DSP1LMIX_INPUT_4_SOURCE:
+	case MADERA_DSP1LMIX_INPUT_4_VOLUME:
+	case MADERA_DSP1RMIX_INPUT_1_SOURCE:
+	case MADERA_DSP1RMIX_INPUT_1_VOLUME:
+	case MADERA_DSP1RMIX_INPUT_2_SOURCE:
+	case MADERA_DSP1RMIX_INPUT_2_VOLUME:
+	case MADERA_DSP1RMIX_INPUT_3_SOURCE:
+	case MADERA_DSP1RMIX_INPUT_3_VOLUME:
+	case MADERA_DSP1RMIX_INPUT_4_SOURCE:
+	case MADERA_DSP1RMIX_INPUT_4_VOLUME:
+	case MADERA_DSP1AUX1MIX_INPUT_1_SOURCE:
+	case MADERA_DSP1AUX2MIX_INPUT_1_SOURCE:
+	case MADERA_DSP1AUX3MIX_INPUT_1_SOURCE:
+	case MADERA_DSP1AUX4MIX_INPUT_1_SOURCE:
+	case MADERA_DSP1AUX5MIX_INPUT_1_SOURCE:
+	case MADERA_DSP1AUX6MIX_INPUT_1_SOURCE:
+	case MADERA_ISRC1DEC1MIX_INPUT_1_SOURCE:
+	case MADERA_ISRC1DEC2MIX_INPUT_1_SOURCE:
+	case MADERA_ISRC1DEC3MIX_INPUT_1_SOURCE:
+	case MADERA_ISRC1DEC4MIX_INPUT_1_SOURCE:
+	case MADERA_ISRC1INT1MIX_INPUT_1_SOURCE:
+	case MADERA_ISRC1INT2MIX_INPUT_1_SOURCE:
+	case MADERA_ISRC1INT3MIX_INPUT_1_SOURCE:
+	case MADERA_ISRC1INT4MIX_INPUT_1_SOURCE:
+	case MADERA_ISRC2DEC1MIX_INPUT_1_SOURCE:
+	case MADERA_ISRC2DEC2MIX_INPUT_1_SOURCE:
+	case MADERA_ISRC2DEC3MIX_INPUT_1_SOURCE:
+	case MADERA_ISRC2DEC4MIX_INPUT_1_SOURCE:
+	case MADERA_ISRC2INT1MIX_INPUT_1_SOURCE:
+	case MADERA_ISRC2INT2MIX_INPUT_1_SOURCE:
+	case MADERA_ISRC2INT3MIX_INPUT_1_SOURCE:
+	case MADERA_ISRC2INT4MIX_INPUT_1_SOURCE:
+	case MADERA_FX_CTRL1 ... MADERA_FX_CTRL2:
+	case MADERA_EQ1_1 ... MADERA_EQ1_21:
+	case MADERA_EQ2_1 ... MADERA_EQ2_21:
+	case MADERA_EQ3_1 ... MADERA_EQ3_21:
+	case MADERA_EQ4_1 ... MADERA_EQ4_21:
+	case MADERA_DRC1_CTRL1 ... MADERA_DRC1_CTRL5:
+	case MADERA_DRC2_CTRL1 ... MADERA_DRC2_CTRL5:
+	case MADERA_HPLPF1_1 ... MADERA_HPLPF1_2:
+	case MADERA_HPLPF2_1 ... MADERA_HPLPF2_2:
+	case MADERA_HPLPF3_1 ... MADERA_HPLPF3_2:
+	case MADERA_HPLPF4_1 ... MADERA_HPLPF4_2:
+	case MADERA_ISRC_1_CTRL_1 ... MADERA_ISRC_1_CTRL_3:
+	case MADERA_ISRC_2_CTRL_1 ... MADERA_ISRC_2_CTRL_3:
+	case MADERA_GPIO1_CTRL_1 ... MADERA_GPIO15_CTRL_2:
+	case MADERA_IRQ1_STATUS_1 ... MADERA_IRQ1_STATUS_33:
+	case MADERA_IRQ1_MASK_1 ... MADERA_IRQ1_MASK_33:
+	case MADERA_IRQ1_RAW_STATUS_1 ... MADERA_IRQ1_RAW_STATUS_33:
+	case MADERA_INTERRUPT_DEBOUNCE_7:
+	case MADERA_IRQ1_CTRL:
+		return true;
+	default:
+		return false;
+	}
+}
+
+static bool cs47l15_16bit_volatile_register(struct device *dev,
+					    unsigned int reg)
+{
+	switch (reg) {
+	case MADERA_SOFTWARE_RESET:
+	case MADERA_HARDWARE_REVISION:
+	case MADERA_WRITE_SEQUENCER_CTRL_0 ... MADERA_WRITE_SEQUENCER_CTRL_2:
+	case MADERA_HAPTICS_STATUS:
+	case MADERA_SAMPLE_RATE_1_STATUS:
+	case MADERA_SAMPLE_RATE_2_STATUS:
+	case MADERA_SAMPLE_RATE_3_STATUS:
+	case MADERA_HP_CTRL_1L:
+	case MADERA_HP_CTRL_1R:
+	case MADERA_MIC_DETECT_1_CONTROL_3:
+	case MADERA_MIC_DETECT_1_CONTROL_4:
+	case MADERA_HEADPHONE_DETECT_2:
+	case MADERA_HEADPHONE_DETECT_3:
+	case MADERA_HEADPHONE_DETECT_5:
+	case MADERA_INPUT_ENABLES_STATUS:
+	case MADERA_OUTPUT_STATUS_1:
+	case MADERA_RAW_OUTPUT_STATUS_1:
+	case MADERA_SPD1_TX_CHANNEL_STATUS_1:
+	case MADERA_SPD1_TX_CHANNEL_STATUS_2:
+	case MADERA_SPD1_TX_CHANNEL_STATUS_3:
+	case MADERA_FX_CTRL2:
+	case MADERA_IRQ1_STATUS_1 ... MADERA_IRQ1_STATUS_33:
+	case MADERA_IRQ1_RAW_STATUS_1 ... MADERA_IRQ1_RAW_STATUS_33:
+		return true;
+	default:
+		return false;
+	}
+}
+
+static bool cs47l15_32bit_readable_register(struct device *dev,
+					    unsigned int reg)
+{
+	switch (reg) {
+	case MADERA_WSEQ_SEQUENCE_1 ... MADERA_WSEQ_SEQUENCE_225:
+	case MADERA_OTP_HPDET_CAL_1 ... MADERA_OTP_HPDET_CAL_2:
+	case MADERA_DSP1_CONFIG_1 ... MADERA_DSP1_PMEM_ERR_ADDR___XMEM_ERR_ADDR:
+		return true;
+	default:
+		return cs47l15_is_adsp_memory(dev, reg);
+	}
+}
+
+static bool cs47l15_32bit_volatile_register(struct device *dev,
+					    unsigned int reg)
+{
+	switch (reg) {
+	case MADERA_WSEQ_SEQUENCE_1 ... MADERA_WSEQ_SEQUENCE_225:
+	case MADERA_OTP_HPDET_CAL_1 ... MADERA_OTP_HPDET_CAL_2:
+	case MADERA_DSP1_CONFIG_1 ... MADERA_DSP1_PMEM_ERR_ADDR___XMEM_ERR_ADDR:
+		return true;
+	default:
+		return cs47l15_is_adsp_memory(dev, reg);
+	}
+}
+
+const struct regmap_config cs47l15_16bit_spi_regmap = {
+	.name = "cs47l15_16bit",
+	.reg_bits = 32,
+	.pad_bits = 16,
+	.val_bits = 16,
+
+	.max_register = MADERA_INTERRUPT_RAW_STATUS_1,
+	.readable_reg = &cs47l15_16bit_readable_register,
+	.volatile_reg = &cs47l15_16bit_volatile_register,
+
+	.cache_type = REGCACHE_RBTREE,
+	.reg_defaults = cs47l15_reg_default,
+	.num_reg_defaults = ARRAY_SIZE(cs47l15_reg_default),
+};
+EXPORT_SYMBOL_GPL(cs47l15_16bit_spi_regmap);
+
+const struct regmap_config cs47l15_16bit_i2c_regmap = {
+	.name = "cs47l15_16bit",
+	.reg_bits = 32,
+	.val_bits = 16,
+
+	.max_register = MADERA_INTERRUPT_RAW_STATUS_1,
+	.readable_reg = &cs47l15_16bit_readable_register,
+	.volatile_reg = &cs47l15_16bit_volatile_register,
+
+	.cache_type = REGCACHE_RBTREE,
+	.reg_defaults = cs47l15_reg_default,
+	.num_reg_defaults = ARRAY_SIZE(cs47l15_reg_default),
+};
+EXPORT_SYMBOL_GPL(cs47l15_16bit_i2c_regmap);
+
+const struct regmap_config cs47l15_32bit_spi_regmap = {
+	.name = "cs47l15_32bit",
+	.reg_bits = 32,
+	.reg_stride = 2,
+	.pad_bits = 16,
+	.val_bits = 32,
+
+	.max_register = MADERA_DSP1_PMEM_ERR_ADDR___XMEM_ERR_ADDR,
+	.readable_reg = &cs47l15_32bit_readable_register,
+	.volatile_reg = &cs47l15_32bit_volatile_register,
+
+	.cache_type = REGCACHE_RBTREE,
+};
+EXPORT_SYMBOL_GPL(cs47l15_32bit_spi_regmap);
+
+const struct regmap_config cs47l15_32bit_i2c_regmap = {
+	.name = "cs47l15_32bit",
+	.reg_bits = 32,
+	.reg_stride = 2,
+	.val_bits = 32,
+
+	.max_register = MADERA_DSP1_PMEM_ERR_ADDR___XMEM_ERR_ADDR,
+	.readable_reg = &cs47l15_32bit_readable_register,
+	.volatile_reg = &cs47l15_32bit_volatile_register,
+
+	.cache_type = REGCACHE_RBTREE,
+};
+EXPORT_SYMBOL_GPL(cs47l15_32bit_i2c_regmap);
diff --git a/drivers/mfd/madera-core.c b/drivers/mfd/madera-core.c
index bc4e25b5b97d..a354567ebc86 100644
--- a/drivers/mfd/madera-core.c
+++ b/drivers/mfd/madera-core.c
@@ -31,6 +31,7 @@
 
 #include "madera.h"
 
+#define CS47L15_SILICON_ID	0x6370
 #define CS47L35_SILICON_ID	0x6360
 #define CS47L85_SILICON_ID	0x6338
 #define CS47L90_SILICON_ID	0x6364
@@ -46,6 +47,28 @@ static const struct mfd_cell madera_ldo1_devs[] = {
 	{ .name = "madera-ldo1" },
 };
 
+static const char * const cs47l15_supplies[] = {
+	"MICVDD",
+	"CPVDD1",
+	"SPKVDD",
+};
+
+static const struct mfd_cell cs47l15_devs[] = {
+	{ .name = "madera-pinctrl", },
+	{ .name = "madera-irq" },
+	{ .name = "madera-gpio" },
+	{
+		.name = "madera-extcon",
+		.parent_supplies = cs47l15_supplies,
+		.num_parent_supplies = 1, /* We only need MICVDD */
+	},
+	{
+		.name = "cs47l15-codec",
+		.parent_supplies = cs47l15_supplies,
+		.num_parent_supplies = ARRAY_SIZE(cs47l15_supplies),
+	},
+};
+
 static const char * const cs47l35_supplies[] = {
 	"MICVDD",
 	"DBVDD2",
@@ -129,6 +152,8 @@ static const struct mfd_cell cs47l90_devs[] = {
 const char *madera_name_from_type(enum madera_type type)
 {
 	switch (type) {
+	case CS47L15:
+		return "CS47L15";
 	case CS47L35:
 		return "CS47L35";
 	case CS47L85:
@@ -291,6 +316,7 @@ const struct dev_pm_ops madera_pm_ops = {
 EXPORT_SYMBOL_GPL(madera_pm_ops);
 
 const struct of_device_id madera_of_match[] = {
+	{ .compatible = "cirrus,cs47l15", .data = (void *)CS47L15 },
 	{ .compatible = "cirrus,cs47l35", .data = (void *)CS47L35 },
 	{ .compatible = "cirrus,cs47l85", .data = (void *)CS47L85 },
 	{ .compatible = "cirrus,cs47l90", .data = (void *)CS47L90 },
@@ -339,6 +365,10 @@ static void madera_set_micbias_info(struct madera *madera)
 	 * childbiases for each micbias. Unspecified values default to 0.
 	 */
 	switch (madera->type) {
+	case CS47L15:
+		madera->num_micbias = 1;
+		madera->num_childbias[0] = 3;
+		return;
 	case CS47L35:
 		madera->num_micbias = 2;
 		madera->num_childbias[0] = 2;
@@ -402,6 +432,7 @@ int madera_dev_init(struct madera *madera)
 	 * No devm_ because we need to control shutdown order of children.
 	 */
 	switch (madera->type) {
+	case CS47L15:
 	case CS47L35:
 	case CS47L90:
 	case CS47L91:
@@ -471,6 +502,19 @@ int madera_dev_init(struct madera *madera)
 	}
 
 	switch (hwid) {
+	case CS47L15_SILICON_ID:
+		if (IS_ENABLED(CONFIG_MFD_CS47L15)) {
+			switch (madera->type) {
+			case CS47L15:
+				patch_fn = &cs47l15_patch;
+				mfd_devs = cs47l15_devs;
+				n_devs = ARRAY_SIZE(cs47l15_devs);
+				break;
+			default:
+				break;
+			}
+		}
+		break;
 	case CS47L35_SILICON_ID:
 		if (IS_ENABLED(CONFIG_MFD_CS47L35)) {
 			switch (madera->type) {
diff --git a/drivers/mfd/madera-i2c.c b/drivers/mfd/madera-i2c.c
index 05ae94be01d8..bd868459cedb 100644
--- a/drivers/mfd/madera-i2c.c
+++ b/drivers/mfd/madera-i2c.c
@@ -39,6 +39,12 @@ static int madera_i2c_probe(struct i2c_client *i2c,
 		type = id->driver_data;
 
 	switch (type) {
+	case CS47L15:
+		if (IS_ENABLED(CONFIG_MFD_CS47L15)) {
+			regmap_16bit_config = &cs47l15_16bit_i2c_regmap;
+			regmap_32bit_config = &cs47l15_32bit_i2c_regmap;
+		}
+		break;
 	case CS47L35:
 		if (IS_ENABLED(CONFIG_MFD_CS47L35)) {
 			regmap_16bit_config = &cs47l35_16bit_i2c_regmap;
@@ -113,6 +119,7 @@ static int madera_i2c_remove(struct i2c_client *i2c)
 }
 
 static const struct i2c_device_id madera_i2c_id[] = {
+	{ "cs47l15", CS47L15 },
 	{ "cs47l35", CS47L35 },
 	{ "cs47l85", CS47L85 },
 	{ "cs47l90", CS47L90 },
diff --git a/drivers/mfd/madera-spi.c b/drivers/mfd/madera-spi.c
index 4c398b278bba..a36741b73c25 100644
--- a/drivers/mfd/madera-spi.c
+++ b/drivers/mfd/madera-spi.c
@@ -39,6 +39,12 @@ static int madera_spi_probe(struct spi_device *spi)
 		type = id->driver_data;
 
 	switch (type) {
+	case CS47L15:
+		if (IS_ENABLED(CONFIG_MFD_CS47L15)) {
+			regmap_16bit_config = &cs47l15_16bit_spi_regmap;
+			regmap_32bit_config = &cs47l15_32bit_spi_regmap;
+		}
+		break;
 	case CS47L35:
 		if (IS_ENABLED(CONFIG_MFD_CS47L35)) {
 			regmap_16bit_config = &cs47l35_16bit_spi_regmap;
@@ -112,6 +118,7 @@ static int madera_spi_remove(struct spi_device *spi)
 }
 
 static const struct spi_device_id madera_spi_ids[] = {
+	{ "cs47l15", CS47L15 },
 	{ "cs47l35", CS47L35 },
 	{ "cs47l85", CS47L85 },
 	{ "cs47l90", CS47L90 },
diff --git a/drivers/mfd/madera.h b/drivers/mfd/madera.h
index 891b84efb9a7..ccc16f2a1288 100644
--- a/drivers/mfd/madera.h
+++ b/drivers/mfd/madera.h
@@ -24,6 +24,12 @@ int madera_dev_exit(struct madera *madera);
 
 const char *madera_name_from_type(enum madera_type type);
 
+extern const struct regmap_config cs47l15_16bit_spi_regmap;
+extern const struct regmap_config cs47l15_32bit_spi_regmap;
+extern const struct regmap_config cs47l15_16bit_i2c_regmap;
+extern const struct regmap_config cs47l15_32bit_i2c_regmap;
+int cs47l15_patch(struct madera *madera);
+
 extern const struct regmap_config cs47l35_16bit_spi_regmap;
 extern const struct regmap_config cs47l35_32bit_spi_regmap;
 extern const struct regmap_config cs47l35_16bit_i2c_regmap;
diff --git a/include/linux/mfd/madera/core.h b/include/linux/mfd/madera/core.h
index 4d5d51a9c8a6..98dd3cb5e84d 100644
--- a/include/linux/mfd/madera/core.h
+++ b/include/linux/mfd/madera/core.h
@@ -27,11 +27,13 @@ enum madera_type {
 	CS47L90 = 3,
 	CS47L91 = 4,
 	WM1840 = 7,
+	CS47L15 = 8,
 };
 
 #define MADERA_MAX_CORE_SUPPLIES	2
 #define MADERA_MAX_GPIOS		40
 
+#define CS47L15_NUM_GPIOS		15
 #define CS47L35_NUM_GPIOS		16
 #define CS47L85_NUM_GPIOS		40
 #define CS47L90_NUM_GPIOS		38
diff --git a/include/linux/mfd/madera/registers.h b/include/linux/mfd/madera/registers.h
index 977e06101711..5b054d511c6a 100644
--- a/include/linux/mfd/madera/registers.h
+++ b/include/linux/mfd/madera/registers.h
@@ -244,6 +244,8 @@
 #define MADERA_IN6R_CONTROL				0x33C
 #define MADERA_ADC_DIGITAL_VOLUME_6R			0x33D
 #define MADERA_DMIC6R_CONTROL				0x33E
+#define CS47L15_ADC_INT_BIAS				0x3A8
+#define CS47L15_PGA_BIAS_SEL				0x3C4
 #define MADERA_OUTPUT_ENABLES_1				0x400
 #define MADERA_OUTPUT_STATUS_1				0x401
 #define MADERA_RAW_OUTPUT_STATUS_1			0x406
@@ -1202,6 +1204,8 @@
 #define MADERA_GPIO1_CTRL_2				0x1701
 #define MADERA_GPIO2_CTRL_1				0x1702
 #define MADERA_GPIO2_CTRL_2				0x1703
+#define MADERA_GPIO15_CTRL_1				0x171C
+#define MADERA_GPIO15_CTRL_2				0x171D
 #define MADERA_GPIO16_CTRL_1				0x171E
 #define MADERA_GPIO16_CTRL_2				0x171F
 #define MADERA_GPIO38_CTRL_1				0x174A
@@ -1232,6 +1236,7 @@
 #define MADERA_IRQ2_CTRL				0x1A82
 #define MADERA_INTERRUPT_RAW_STATUS_1			0x1AA0
 #define MADERA_WSEQ_SEQUENCE_1				0x3000
+#define MADERA_WSEQ_SEQUENCE_225			0x31C0
 #define MADERA_WSEQ_SEQUENCE_252			0x31F6
 #define CS47L35_OTP_HPDET_CAL_1				0x31F8
 #define CS47L35_OTP_HPDET_CAL_2				0x31FA
-- 
cgit v1.2.3


From 297939901f382f16ab78a8073cdfb2a6279bb654 Mon Sep 17 00:00:00 2001
From: Richard Fitzgerald <rf@opensource.cirrus.com>
Date: Thu, 30 May 2019 15:39:53 +0100
Subject: mfd: madera: Add Madera core support for CS47L92

This patch adds all the core support and defines for the Cirrus
Logic CS42L92, CS47L92 and CS47L93 smart audio CODECs.

Registers or fields are named MADERA_* if they are part of the
common hardware platform and do not conflict with any other
Madera codecs. They are named CS47L92_* if they are unique to
CS47L92 and conflict with definitions on other codecs.

Signed-off-by: Stuart Henderson <stuarth@opensource.cirrus.com>
Signed-off-by: Richard Fitzgerald <rf@opensource.cirrus.com>
Signed-off-by: Charles Keepax <ckeepax@opensource.cirrus.com>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 drivers/mfd/Kconfig                  |    7 +
 drivers/mfd/Makefile                 |    3 +
 drivers/mfd/cs47l92-tables.c         | 1948 ++++++++++++++++++++++++++++++++++
 drivers/mfd/madera-core.c            |   58 +
 drivers/mfd/madera-i2c.c             |   11 +
 drivers/mfd/madera-spi.c             |   11 +
 drivers/mfd/madera.h                 |    7 +
 include/linux/mfd/madera/core.h      |    4 +
 include/linux/mfd/madera/registers.h |  195 ++++
 9 files changed, 2244 insertions(+)
 create mode 100644 drivers/mfd/cs47l92-tables.c

(limited to 'include/linux')

diff --git a/drivers/mfd/Kconfig b/drivers/mfd/Kconfig
index 86ae0a11f631..760100c7d5f9 100644
--- a/drivers/mfd/Kconfig
+++ b/drivers/mfd/Kconfig
@@ -288,6 +288,13 @@ config MFD_CS47L90
 	help
 	  Support for Cirrus Logic CS47L90 and CS47L91 Smart Codecs
 
+config MFD_CS47L92
+	bool "Cirrus Logic CS47L92/93"
+	select PINCTRL_CS47L92
+	depends on MFD_MADERA
+	help
+	  Support for Cirrus Logic CS42L92, CS47L92 and CS47L93 Smart Codecs
+
 config MFD_ASIC3
 	bool "Compaq ASIC3"
 	depends on GPIOLIB && ARM
diff --git a/drivers/mfd/Makefile b/drivers/mfd/Makefile
index cc044f38af84..f026ada68f6a 100644
--- a/drivers/mfd/Makefile
+++ b/drivers/mfd/Makefile
@@ -87,6 +87,9 @@ endif
 ifeq ($(CONFIG_MFD_CS47L90),y)
 madera-objs			+= cs47l90-tables.o
 endif
+ifeq ($(CONFIG_MFD_CS47L92),y)
+madera-objs			+= cs47l92-tables.o
+endif
 obj-$(CONFIG_MFD_MADERA)	+= madera.o
 obj-$(CONFIG_MFD_MADERA_I2C)	+= madera-i2c.o
 obj-$(CONFIG_MFD_MADERA_SPI)	+= madera-spi.o
diff --git a/drivers/mfd/cs47l92-tables.c b/drivers/mfd/cs47l92-tables.c
new file mode 100644
index 000000000000..3dc1fefe68f5
--- /dev/null
+++ b/drivers/mfd/cs47l92-tables.c
@@ -0,0 +1,1948 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Regmap tables for CS47L92 codec
+ *
+ * Copyright (C) 2016-2019 Cirrus Logic, Inc. and
+ *                         Cirrus Logic International Semiconductor Ltd.
+ *
+ * Author: Stuart Henderson <stuarth@opensource.cirrus.com>
+ */
+
+#include <linux/device.h>
+#include <linux/module.h>
+#include <linux/regmap.h>
+
+#include <linux/mfd/madera/core.h>
+#include <linux/mfd/madera/registers.h>
+
+#include "madera.h"
+
+static const struct reg_sequence cs47l92_reva_16_patch[] = { /* for madera->regmap, see cs47l92_patch() */
+	{ 0x3A2,  0x2C29 },
+	{ 0x3A3,  0x0E00 },
+	{ 0x281,  0x0000 },
+	{ 0x282,  0x0000 },
+	{ 0x4EA,  0x0100 },
+	{ 0x22B,  0x0000 },
+	{ 0x4A0,  0x0080 }, /* HP1 Short Circuit Ctrl */
+	{ 0x4A1,  0x0000 }, /* HP2 Short Circuit Ctrl */
+	{ 0x4A2,  0x0000 }, /* HP3 Short Circuit Ctrl */
+	{ 0x180B, 0x033F },
+	{ 0x190B, 0x033F },
+	{ 0x442,  0x0304 },
+	{ 0x34C,  0x0003 },
+	{ 0x124,  0x0C49 },
+	{ 0x120,  0x0345 }, /* DSP Clock 1: listed twice, 0x0345 first ... */
+	{ 0x120,  0x0305 }, /* ... then 0x0305, its cs47l92_reg_default value */
+	{ 0x4FA,  0x5064 },
+	{ 0x1300, 0x050E },
+	{ 0x1302, 0x0101 },
+	{ 0x1380, 0x02E0 },
+	{ 0x1381, 0xF942 },
+	{ 0x1382, 0x04CE },
+	{ 0x1383, 0xFF06 },
+	{ 0x1390, 0x0304 },
+	{ 0x1391, 0xF8FF },
+	{ 0x1392, 0x04F3 },
+	{ 0x1393, 0xFF00 },
+	{ 0x13A0, 0x02E0 },
+	{ 0x13A1, 0xF942 },
+	{ 0x13A2, 0x04CE },
+	{ 0x13A3, 0xFF06 },
+	{ 0x13B0, 0x0304 },
+	{ 0x13B1, 0xF8FF },
+	{ 0x13B2, 0x04F3 },
+	{ 0x13B3, 0xFF00 },
+	{ 0x412,  0x0005 }, /* Output Path Config 1 */
+	{ 0x41A,  0x0005 }, /* Output Path Config 2 */
+	{ 0x422,  0x0005 }, /* Output Path Config 3 */
+};
+
+static const struct reg_sequence cs47l92_reva_32_patch[] = { /* for madera->regmap_32bit, see cs47l92_patch() */
+	{ 0x3030, 0x04A00C01 },
+	{ 0x3032, 0x0225F501 },
+	{ 0x3044, 0x04A00C00 },
+	{ 0x3046, 0x0225FF01 },
+	{ 0x3080, 0x04A00C01 },
+	{ 0x3082, 0x0226F501 },
+	{ 0x3094, 0x04A00C00 },
+	{ 0x3096, 0x0226FF01 },
+	{ 0x30D1, 0x04A10C01 },
+	{ 0x30D2, 0x0227F501 },
+	{ 0x30E4, 0x04A10C00 },
+	{ 0x30E6, 0x0227FF01 },
+	{ 0x3120, 0x04A10C01 },
+	{ 0x3122, 0x0228F501 },
+	{ 0x3134, 0x04A10C00 },
+	{ 0x3136, 0x0228FF01 },
+	{ 0x3170, 0x04A20C01 },
+	{ 0x3172, 0x022B0101 },
+	{ 0x3174, 0x0229F501 },
+	{ 0x3184, 0x04A20C00 },
+	{ 0x3186, 0x022B0100 },
+	{ 0x3188, 0x0229FF01 },
+	{ 0x31C0, 0x04A20C01 },
+	{ 0x31C2, 0x022B0001 },
+	{ 0x31C4, 0x022AF501 },
+	{ 0x31D4, 0x04A20C00 },
+	{ 0x31D6, 0x022B0000 },
+	{ 0x31D8, 0x022AFF01 },
+};
+
+/*
+ * Register cs47l92_reva_16_patch on madera->regmap, then cs47l92_reva_32_patch
+ * on madera->regmap_32bit; log and return the first negative result, else 0.
+ */
+int cs47l92_patch(struct madera *madera)
+{
+	int err = regmap_register_patch(madera->regmap, cs47l92_reva_16_patch,
+					ARRAY_SIZE(cs47l92_reva_16_patch));
+
+	if (err < 0) {
+		dev_err(madera->dev,
+			"Error in applying 16-bit patch: %d\n", err);
+		return err;
+	}
+
+	err = regmap_register_patch(madera->regmap_32bit,
+				    cs47l92_reva_32_patch,
+				    ARRAY_SIZE(cs47l92_reva_32_patch));
+	if (err >= 0)
+		return 0;
+
+	dev_err(madera->dev, "Error in applying 32-bit patch: %d\n", err);
+	return err;
+}
+EXPORT_SYMBOL_GPL(cs47l92_patch);
+
+static const struct reg_default cs47l92_reg_default[] = {
+	{ 0x00000020, 0x0000 }, /* R32 (0x20) - Tone Generator 1 */
+	{ 0x00000021, 0x1000 }, /* R33 (0x21) - Tone Generator 2 */
+	{ 0x00000022, 0x0000 }, /* R34 (0x22) - Tone Generator 3 */
+	{ 0x00000023, 0x1000 }, /* R35 (0x23) - Tone Generator 4 */
+	{ 0x00000024, 0x0000 }, /* R36 (0x24) - Tone Generator 5 */
+	{ 0x00000030, 0x0000 }, /* R48 (0x30) - PWM Drive 1 */
+	{ 0x00000031, 0x0100 }, /* R49 (0x31) - PWM Drive 2 */
+	{ 0x00000032, 0x0100 }, /* R50 (0x32) - PWM Drive 3 */
+	{ 0x00000061, 0x01ff }, /* R97 (0x61) - Sample Rate Sequence Select 1 */
+	{ 0x00000062, 0x01ff }, /* R98 (0x62) - Sample Rate Sequence Select 2 */
+	{ 0x00000063, 0x01ff }, /* R99 (0x63) - Sample Rate Sequence Select 3 */
+	{ 0x00000064, 0x01ff }, /* R100 (0x64) - Sample Rate Sequence Select 4 */
+	{ 0x00000090, 0x0000 }, /* R144 (0x90) - Haptics Control 1 */
+	{ 0x00000091, 0x7fff }, /* R145 (0x91) - Haptics Control 2 */
+	{ 0x00000092, 0x0000 }, /* R146 (0x92) - Haptics Phase 1 Intensity */
+	{ 0x00000093, 0x0000 }, /* R147 (0x93) - Haptics Phase 1 Duration */
+	{ 0x00000094, 0x0000 }, /* R148 (0x94) - Haptics Phase 2 Intensity */
+	{ 0x00000095, 0x0000 }, /* R149 (0x95) - Haptics Phase 2 Duration */
+	{ 0x00000096, 0x0000 }, /* R150 (0x96) - Haptics Phase 3 Intensity */
+	{ 0x00000097, 0x0000 }, /* R151 (0x97) - Haptics Phase 3 Duration */
+	{ 0x000000a0, 0x0000 }, /* R160 (0xa0) - Comfort Noise Generator */
+	{ 0x00000100, 0x0002 }, /* R256 (0x100) - Clock 32k 1 */
+	{ 0x00000101, 0x0404 }, /* R257 (0x101) - System Clock 1 */
+	{ 0x00000102, 0x0011 }, /* R258 (0x102) - Sample Rate 1 */
+	{ 0x00000103, 0x0011 }, /* R259 (0x103) - Sample Rate 2 */
+	{ 0x00000104, 0x0011 }, /* R260 (0x104) - Sample Rate 3 */
+	{ 0x00000112, 0x0305 }, /* R274 (0x112) - Async Clock 1 */
+	{ 0x00000113, 0x0011 }, /* R275 (0x113) - Async Sample Rate 1 */
+	{ 0x00000114, 0x0011 }, /* R276 (0x114) - Async Sample Rate 2 */
+	{ 0x00000120, 0x0305 }, /* R288 (0x120) - DSP Clock 1 */
+	{ 0x00000122, 0x0000 }, /* R290 (0x122) - DSP Clock 2 */
+	{ 0x00000149, 0x0000 }, /* R329 (0x149) - Output System Clock */
+	{ 0x0000014a, 0x0000 }, /* R330 (0x14a) - Output Async Clock */
+	{ 0x00000152, 0x0000 }, /* R338 (0x152) - Rate Estimator 1 */
+	{ 0x00000153, 0x0000 }, /* R339 (0x153) - Rate Estimator 2 */
+	{ 0x00000154, 0x0000 }, /* R340 (0x154) - Rate Estimator 3 */
+	{ 0x00000155, 0x0000 }, /* R341 (0x155) - Rate Estimator 4 */
+	{ 0x00000156, 0x0000 }, /* R342 (0x156) - Rate Estimator 5 */
+	{ 0x00000171, 0x7004 }, /* R369 (0x171) - FLL1 Control 1 */
+	{ 0x00000172, 0x0004 }, /* R370 (0x172) - FLL1 Control 2 */
+	{ 0x00000173, 0x0000 }, /* R371 (0x173) - FLL1 Control 3 */
+	{ 0x00000174, 0x0000 }, /* R372 (0x174) - FLL1 Control 4 */
+	{ 0x00000175, 0x0001 }, /* R373 (0x175) - FLL1 Control 5 */
+	{ 0x00000176, 0x8000 }, /* R374 (0x176) - FLL1 Control 6 */
+	{ 0x00000177, 0x0680 }, /* R375 (0x177) - FLL1 Control 7 */
+	{ 0x00000178, 0x21f0 }, /* R376 (0x178) - FLL1 Control 8 */
+	{ 0x00000179, 0x0000 }, /* R377 (0x179) - FLL1 Control 9 */
+	{ 0x0000017a, 0x0000 }, /* R378 (0x17a) - FLL1 Control 10 */
+	{ 0x0000017b, 0x0011 }, /* R379 (0x17b) - FLL1 Control 11 */
+	{ 0x0000017d, 0x33e8 }, /* R381 (0x17d) - FLL1 Digital Test 1 */
+	{ 0x00000181, 0x7000 }, /* R385 (0x181) - FLL1 Synchroniser 1 */
+	{ 0x00000182, 0x0004 }, /* R386 (0x182) - FLL1 Synchroniser 2 */
+	{ 0x00000183, 0x0000 }, /* R387 (0x183) - FLL1 Synchroniser 3 */
+	{ 0x00000184, 0x0000 }, /* R388 (0x184) - FLL1 Synchroniser 4 */
+	{ 0x00000185, 0x0001 }, /* R389 (0x185) - FLL1 Synchroniser 5 */
+	{ 0x00000186, 0x0000 }, /* R390 (0x186) - FLL1 Synchroniser 6 */
+	{ 0x0000018e, 0x0c04 }, /* R398 (0x18e) - FLL1 GPIO Clock */
+	{ 0x00000191, 0x7000 }, /* R401 (0x191) - FLL2 Control 1 */
+	{ 0x00000192, 0x0004 }, /* R402 (0x192) - FLL2 Control 2 */
+	{ 0x00000193, 0x0000 }, /* R403 (0x193) - FLL2 Control 3 */
+	{ 0x00000194, 0x0000 }, /* R404 (0x194) - FLL2 Control 4 */
+	{ 0x00000195, 0x0001 }, /* R405 (0x195) - FLL2 Control 5 */
+	{ 0x00000196, 0x8000 }, /* R406 (0x196) - FLL2 Control 6 */
+	{ 0x00000197, 0x0680 }, /* R407 (0x197) - FLL2 Control 7 */
+	{ 0x00000198, 0x21f0 }, /* R408 (0x198) - FLL2 Control 8 */
+	{ 0x00000199, 0x0000 }, /* R409 (0x199) - FLL2 Control 9 */
+	{ 0x0000019a, 0x0000 }, /* R410 (0x19a) - FLL2 Control 10 */
+	{ 0x0000019b, 0x0011 }, /* R411 (0x19b) - FLL2 Control 11 */
+	{ 0x0000019d, 0x33e8 }, /* R413 (0x19d) - FLL2 Digital Test 1 */
+	{ 0x000001a1, 0x7000 }, /* R417 (0x1a1) - FLL2 Synchroniser 1 */
+	{ 0x000001a2, 0x0004 }, /* R418 (0x1a2) - FLL2 Synchroniser 2 */
+	{ 0x000001a3, 0x0000 }, /* R419 (0x1a3) - FLL2 Synchroniser 3 */
+	{ 0x000001a4, 0x0000 }, /* R420 (0x1a4) - FLL2 Synchroniser 4 */
+	{ 0x000001a5, 0x0001 }, /* R421 (0x1a5) - FLL2 Synchroniser 5 */
+	{ 0x000001a6, 0x0000 }, /* R422 (0x1a6) - FLL2 Synchroniser 6 */
+	{ 0x000001ae, 0x0c04 }, /* R430 (0x1ae) - FLL2 GPIO Clock */
+	{ 0x00000200, 0x0006 }, /* R512 (0x200) - Mic Charge Pump 1 */
+	{ 0x00000213, 0x03e4 }, /* R531 (0x213) - LDO2 Control 1 */
+	{ 0x00000218, 0x00e6 }, /* R536 (0x218) - Mic Bias Ctrl 1 */
+	{ 0x00000219, 0x00e6 }, /* R537 (0x219) - Mic Bias Ctrl 2 */
+	{ 0x0000021c, 0x2222 }, /* R540 (0x21c) - Mic Bias Ctrl 5 */
+	{ 0x0000021e, 0x0022 }, /* R542 (0x21e) - Mic Bias Ctrl 6 */
+	{ 0x00000293, 0x0080 }, /* R659 (0x293) - Accessory Detect Mode 1 */
+	{ 0x00000299, 0x0000 }, /* R665 (0x299) - Headphone Detect 0 */
+	{ 0x0000029b, 0x0000 }, /* R667 (0x29b) - Headphone Detect 1 */
+	{ 0x000002a2, 0x0010 }, /* R674 (0x2a2) - Mic Detect 1 Control 0 */
+	{ 0x000002a3, 0x1102 }, /* R675 (0x2a3) - Mic Detect 1 Control 1 */
+	{ 0x000002a4, 0x009f }, /* R676 (0x2a4) - Mic Detect 1 Control 2 */
+	{ 0x000002a6, 0x3d3d }, /* R678 (0x2a6) - Mic Detect 1 Level 1 */
+	{ 0x000002a7, 0x3d3d }, /* R679 (0x2a7) - Mic Detect 1 Level 2 */
+	{ 0x000002a8, 0x333d }, /* R680 (0x2a8) - Mic Detect 1 Level 3 */
+	{ 0x000002a9, 0x202d }, /* R681 (0x2a9) - Mic Detect 1 Level 4 */
+	{ 0x000002b2, 0x0010 }, /* R690 (0x2b2) - Mic Detect 2 Control 0 */
+	{ 0x000002b3, 0x1102 }, /* R691 (0x2b3) - Mic Detect 2 Control 1 */
+	{ 0x000002b4, 0x009f }, /* R692 (0x2b4) - Mic Detect 2 Control 2 */
+	{ 0x000002b6, 0x3d3d }, /* R694 (0x2b6) - Mic Detect 2 Level 1 */
+	{ 0x000002b7, 0x3d3d }, /* R695 (0x2b7) - Mic Detect 2 Level 2 */
+	{ 0x000002b8, 0x333d }, /* R696 (0x2b8) - Mic Detect 2 Level 3 */
+	{ 0x000002b9, 0x202d }, /* R697 (0x2b9) - Mic Detect 2 Level 4 */
+	{ 0x000002c6, 0x0210 }, /* R710 (0x2c6) - Micd Clamp control */
+	{ 0x000002c8, 0x0000 }, /* R712 (0x2c8) - GP Switch 1 */
+	{ 0x000002d3, 0x0000 }, /* R723 (0x2d3) - Jack Detect Analogue */
+	{ 0x00000300, 0x0000 }, /* R768 (0x300) - Input Enables */
+	{ 0x00000308, 0x0400 }, /* R776 (0x308) - Input Rate */
+	{ 0x00000309, 0x0022 }, /* R777 (0x309) - Input Volume Ramp */
+	{ 0x0000030c, 0x0002 }, /* R780 (0x30c) - HPF Control */
+	{ 0x00000310, 0x0080 }, /* R784 (0x310) - IN1L Control */
+	{ 0x00000311, 0x0180 }, /* R785 (0x311) - ADC Digital Volume 1L */
+	{ 0x00000312, 0x0500 }, /* R786 (0x312) - DMIC1L Control */
+	{ 0x00000313, 0x0000 }, /* R787 (0x313) - IN1L Rate Control */
+	{ 0x00000314, 0x0080 }, /* R788 (0x314) - IN1R Control */
+	{ 0x00000315, 0x0180 }, /* R789 (0x315) - ADC Digital Volume 1R */
+	{ 0x00000316, 0x0000 }, /* R790 (0x316) - DMIC1R Control */
+	{ 0x00000317, 0x0000 }, /* R791 (0x317) - IN1R Rate Control */
+	{ 0x00000318, 0x0080 }, /* R792 (0x318) - IN2L Control */
+	{ 0x00000319, 0x0180 }, /* R793 (0x319) - ADC Digital Volume 2L */
+	{ 0x0000031a, 0x0500 }, /* R794 (0x31a) - DMIC2L Control */
+	{ 0x0000031b, 0x0000 }, /* R795 (0x31b) - IN2L Rate Control */
+	{ 0x0000031c, 0x0080 }, /* R796 (0x31c) - IN2R Control */
+	{ 0x0000031d, 0x0180 }, /* R797 (0x31d) - ADC Digital Volume 2R */
+	{ 0x0000031e, 0x0000 }, /* R798 (0x31e) - DMIC2R Control */
+	{ 0x0000031f, 0x0000 }, /* R799 (0x31f) - IN2R Rate Control */
+	{ 0x00000320, 0x0000 }, /* R800 (0x320) - IN3L Control */
+	{ 0x00000321, 0x0180 }, /* R801 (0x321) - ADC Digital Volume 3L */
+	{ 0x00000322, 0x0500 }, /* R802 (0x322) - DMIC3L Control */
+	{ 0x00000323, 0x0000 }, /* R803 (0x323) - IN3L Rate Control */
+	{ 0x00000324, 0x0000 }, /* R804 (0x324) - IN3R Control */
+	{ 0x00000325, 0x0180 }, /* R805 (0x325) - ADC Digital Volume 3R */
+	{ 0x00000326, 0x0000 }, /* R806 (0x326) - DMIC3R Control */
+	{ 0x00000327, 0x0000 }, /* R807 (0x327) - IN3R Rate Control */
+	{ 0x00000328, 0x0000 }, /* R808 (0x328) - IN4L Control */
+	{ 0x00000329, 0x0180 }, /* R809 (0x329) - ADC Digital Volume 4L */
+	{ 0x0000032a, 0x0500 }, /* R810 (0x32a) - DMIC4L Control */
+	{ 0x0000032b, 0x0000 }, /* R811 (0x32b) - IN4L Rate Control */
+	{ 0x0000032c, 0x0000 }, /* R812 (0x32c) - IN4R Control */
+	{ 0x0000032d, 0x0180 }, /* R813 (0x32d) - ADC Digital Volume 4R */
+	{ 0x0000032e, 0x0000 }, /* R814 (0x32e) - DMIC4R Control */
+	{ 0x0000032f, 0x0000 }, /* R815 (0x32f) - IN4R Rate Control */
+	{ 0x00000400, 0x0000 }, /* R1024 (0x400) - Output Enables 1 */
+	{ 0x00000408, 0x0040 }, /* R1032 (0x408) - Output Rate 1 */
+	{ 0x00000409, 0x0022 }, /* R1033 (0x409) - Output Volume Ramp */
+	{ 0x00000410, 0x0080 }, /* R1040 (0x410) - Output Path Config 1L */
+	{ 0x00000411, 0x0180 }, /* R1041 (0x411) - DAC Digital Volume 1L */
+	{ 0x00000412, 0x0005 }, /* R1042 (0x412) - Output Path Config 1 */
+	{ 0x00000413, 0x0001 }, /* R1043 (0x413) - Noise Gate Select 1L */
+	{ 0x00000414, 0x0080 }, /* R1044 (0x414) - Output Path Config 1R */
+	{ 0x00000415, 0x0180 }, /* R1045 (0x415) - DAC Digital Volume 1R */
+	{ 0x00000417, 0x0002 }, /* R1047 (0x417) - Noise Gate Select 1R */
+	{ 0x00000418, 0x0080 }, /* R1048 (0x418) - Output Path Config 2L */
+	{ 0x00000419, 0x0180 }, /* R1049 (0x419) - DAC Digital Volume 2L */
+	{ 0x0000041a, 0x0005 }, /* R1050 (0x41a) - Output Path Config 2 */
+	{ 0x0000041b, 0x0004 }, /* R1051 (0x41b) - Noise Gate Select 2L */
+	{ 0x0000041c, 0x0080 }, /* R1052 (0x41c) - Output Path Config 2R */
+	{ 0x0000041d, 0x0180 }, /* R1053 (0x41d) - DAC Digital Volume 2R */
+	{ 0x0000041f, 0x0008 }, /* R1055 (0x41f) - Noise Gate Select 2R */
+	{ 0x00000420, 0x0080 }, /* R1056 (0x420) - Output Path Config 3L */
+	{ 0x00000421, 0x0180 }, /* R1057 (0x421) - DAC Digital Volume 3L */
+	{ 0x00000422, 0x0005 }, /* R1058 (0x422) - Output Path Config 3 */
+	{ 0x00000423, 0x0010 }, /* R1059 (0x423) - Noise Gate Select 3L */
+	{ 0x00000424, 0x0080 }, /* R1060 (0x424) - Output Path Config 3R */
+	{ 0x00000425, 0x0180 }, /* R1061 (0x425) - DAC Digital Volume 3R */
+	{ 0x00000427, 0x0020 }, /* R1063 (0x427) - Noise Gate Select 3R */
+	{ 0x00000430, 0x0000 }, /* R1072 (0x430) - Output Path Config 5L */
+	{ 0x00000431, 0x0180 }, /* R1073 (0x431) - DAC Digital Volume 5L */
+	{ 0x00000433, 0x0100 }, /* R1075 (0x433) - Noise Gate Select 5L */
+	{ 0x00000434, 0x0000 }, /* R1076 (0x434) - Output Path Config 5R */
+	{ 0x00000435, 0x0180 }, /* R1077 (0x435) - DAC Digital Volume 5R */
+	{ 0x00000437, 0x0200 }, /* R1079 (0x437) - Noise Gate Select 5R */
+	{ 0x00000450, 0x0000 }, /* R1104 (0x450) - DAC AEC Control 1 */
+	{ 0x00000451, 0x0000 }, /* R1105 (0x451) - DAC AEC Control 2 */
+	{ 0x00000458, 0x0000 }, /* R1112 (0x458) - Noise Gate Control */
+	{ 0x00000490, 0x0069 }, /* R1168 (0x490) - PDM SPK1 Ctrl 1 */
+	{ 0x00000491, 0x0000 }, /* R1169 (0x491) - PDM SPK1 Ctrl 2 */
+	{ 0x000004a0, 0x0080 }, /* R1184 (0x4a0) - HP1 Short Circuit Ctrl */
+	{ 0x000004a1, 0x0000 }, /* R1185 (0x4a1) - HP2 Short Circuit Ctrl */
+	{ 0x000004a2, 0x0000 }, /* R1186 (0x4a2) - HP3 Short Circuit Ctrl */
+	{ 0x00000500, 0x000c }, /* R1280 (0x500) - AIF1 BCLK Ctrl */
+	{ 0x00000501, 0x0000 }, /* R1281 (0x501) - AIF1 Tx Pin Ctrl */
+	{ 0x00000502, 0x0000 }, /* R1282 (0x502) - AIF1 Rx Pin Ctrl */
+	{ 0x00000503, 0x0000 }, /* R1283 (0x503) - AIF1 Rate Ctrl */
+	{ 0x00000504, 0x0000 }, /* R1284 (0x504) - AIF1 Format */
+	{ 0x00000506, 0x0040 }, /* R1286 (0x506) - AIF1 Rx BCLK Rate */
+	{ 0x00000507, 0x1818 }, /* R1287 (0x507) - AIF1 Frame Ctrl 1 */
+	{ 0x00000508, 0x1818 }, /* R1288 (0x508) - AIF1 Frame Ctrl 2 */
+	{ 0x00000509, 0x0000 }, /* R1289 (0x509) - AIF1 Frame Ctrl 3 */
+	{ 0x0000050a, 0x0001 }, /* R1290 (0x50a) - AIF1 Frame Ctrl 4 */
+	{ 0x0000050b, 0x0002 }, /* R1291 (0x50b) - AIF1 Frame Ctrl 5 */
+	{ 0x0000050c, 0x0003 }, /* R1292 (0x50c) - AIF1 Frame Ctrl 6 */
+	{ 0x0000050d, 0x0004 }, /* R1293 (0x50d) - AIF1 Frame Ctrl 7 */
+	{ 0x0000050e, 0x0005 }, /* R1294 (0x50e) - AIF1 Frame Ctrl 8 */
+	{ 0x0000050f, 0x0006 }, /* R1295 (0x50f) - AIF1 Frame Ctrl 9 */
+	{ 0x00000510, 0x0007 }, /* R1296 (0x510) - AIF1 Frame Ctrl 10 */
+	{ 0x00000511, 0x0000 }, /* R1297 (0x511) - AIF1 Frame Ctrl 11 */
+	{ 0x00000512, 0x0001 }, /* R1298 (0x512) - AIF1 Frame Ctrl 12 */
+	{ 0x00000513, 0x0002 }, /* R1299 (0x513) - AIF1 Frame Ctrl 13 */
+	{ 0x00000514, 0x0003 }, /* R1300 (0x514) - AIF1 Frame Ctrl 14 */
+	{ 0x00000515, 0x0004 }, /* R1301 (0x515) - AIF1 Frame Ctrl 15 */
+	{ 0x00000516, 0x0005 }, /* R1302 (0x516) - AIF1 Frame Ctrl 16 */
+	{ 0x00000517, 0x0006 }, /* R1303 (0x517) - AIF1 Frame Ctrl 17 */
+	{ 0x00000518, 0x0007 }, /* R1304 (0x518) - AIF1 Frame Ctrl 18 */
+	{ 0x00000519, 0x0000 }, /* R1305 (0x519) - AIF1 Tx Enables */
+	{ 0x0000051a, 0x0000 }, /* R1306 (0x51a) - AIF1 Rx Enables */
+	{ 0x00000540, 0x000c }, /* R1344 (0x540) - AIF2 BCLK Ctrl */
+	{ 0x00000541, 0x0000 }, /* R1345 (0x541) - AIF2 Tx Pin Ctrl */
+	{ 0x00000542, 0x0000 }, /* R1346 (0x542) - AIF2 Rx Pin Ctrl */
+	{ 0x00000543, 0x0000 }, /* R1347 (0x543) - AIF2 Rate Ctrl */
+	{ 0x00000544, 0x0000 }, /* R1348 (0x544) - AIF2 Format */
+	{ 0x00000546, 0x0040 }, /* R1350 (0x546) - AIF2 Rx BCLK Rate */
+	{ 0x00000547, 0x1818 }, /* R1351 (0x547) - AIF2 Frame Ctrl 1 */
+	{ 0x00000548, 0x1818 }, /* R1352 (0x548) - AIF2 Frame Ctrl 2 */
+	{ 0x00000549, 0x0000 }, /* R1353 (0x549) - AIF2 Frame Ctrl 3 */
+	{ 0x0000054a, 0x0001 }, /* R1354 (0x54a) - AIF2 Frame Ctrl 4 */
+	{ 0x0000054b, 0x0002 }, /* R1355 (0x54b) - AIF2 Frame Ctrl 5 */
+	{ 0x0000054c, 0x0003 }, /* R1356 (0x54c) - AIF2 Frame Ctrl 6 */
+	{ 0x0000054d, 0x0004 }, /* R1357 (0x54d) - AIF2 Frame Ctrl 7 */
+	{ 0x0000054e, 0x0005 }, /* R1358 (0x54e) - AIF2 Frame Ctrl 8 */
+	{ 0x0000054f, 0x0006 }, /* R1359 (0x54f) - AIF2 Frame Ctrl 9 */
+	{ 0x00000550, 0x0007 }, /* R1360 (0x550) - AIF2 Frame Ctrl 10 */
+	{ 0x00000551, 0x0000 }, /* R1361 (0x551) - AIF2 Frame Ctrl 11 */
+	{ 0x00000552, 0x0001 }, /* R1362 (0x552) - AIF2 Frame Ctrl 12 */
+	{ 0x00000553, 0x0002 }, /* R1363 (0x553) - AIF2 Frame Ctrl 13 */
+	{ 0x00000554, 0x0003 }, /* R1364 (0x554) - AIF2 Frame Ctrl 14 */
+	{ 0x00000555, 0x0004 }, /* R1365 (0x555) - AIF2 Frame Ctrl 15 */
+	{ 0x00000556, 0x0005 }, /* R1366 (0x556) - AIF2 Frame Ctrl 16 */
+	{ 0x00000557, 0x0006 }, /* R1367 (0x557) - AIF2 Frame Ctrl 17 */
+	{ 0x00000558, 0x0007 }, /* R1368 (0x558) - AIF2 Frame Ctrl 18 */
+	{ 0x00000559, 0x0000 }, /* R1369 (0x559) - AIF2 Tx Enables */
+	{ 0x0000055a, 0x0000 }, /* R1370 (0x55a) - AIF2 Rx Enables */
+	{ 0x00000580, 0x000c }, /* R1408 (0x580) - AIF3 BCLK Ctrl */
+	{ 0x00000581, 0x0000 }, /* R1409 (0x581) - AIF3 Tx Pin Ctrl */
+	{ 0x00000582, 0x0000 }, /* R1410 (0x582) - AIF3 Rx Pin Ctrl */
+	{ 0x00000583, 0x0000 }, /* R1411 (0x583) - AIF3 Rate Ctrl */
+	{ 0x00000584, 0x0000 }, /* R1412 (0x584) - AIF3 Format */
+	{ 0x00000586, 0x0040 }, /* R1414 (0x586) - AIF3 Rx BCLK Rate */
+	{ 0x00000587, 0x1818 }, /* R1415 (0x587) - AIF3 Frame Ctrl 1 */
+	{ 0x00000588, 0x1818 }, /* R1416 (0x588) - AIF3 Frame Ctrl 2 */
+	{ 0x00000589, 0x0000 }, /* R1417 (0x589) - AIF3 Frame Ctrl 3 */
+	{ 0x0000058a, 0x0001 }, /* R1418 (0x58a) - AIF3 Frame Ctrl 4 */
+	{ 0x0000058b, 0x0002 }, /* R1419 (0x58b) - AIF3 Frame Ctrl 5 */
+	{ 0x0000058c, 0x0003 }, /* R1420 (0x58c) - AIF3 Frame Ctrl 6 */
+	{ 0x0000058d, 0x0004 }, /* R1421 (0x58d) - AIF3 Frame Ctrl 7 */
+	{ 0x0000058e, 0x0005 }, /* R1422 (0x58e) - AIF3 Frame Ctrl 8 */
+	{ 0x0000058f, 0x0006 }, /* R1423 (0x58f) - AIF3 Frame Ctrl 9 */
+	{ 0x00000590, 0x0007 }, /* R1424 (0x590) - AIF3 Frame Ctrl 10 */
+	{ 0x00000591, 0x0000 }, /* R1425 (0x591) - AIF3 Frame Ctrl 11 */
+	{ 0x00000592, 0x0001 }, /* R1426 (0x592) - AIF3 Frame Ctrl 12 */
+	{ 0x00000593, 0x0002 }, /* R1427 (0x593) - AIF3 Frame Ctrl 13 */
+	{ 0x00000594, 0x0003 }, /* R1428 (0x594) - AIF3 Frame Ctrl 14 */
+	{ 0x00000595, 0x0004 }, /* R1429 (0x595) - AIF3 Frame Ctrl 15 */
+	{ 0x00000596, 0x0005 }, /* R1430 (0x596) - AIF3 Frame Ctrl 16 */
+	{ 0x00000597, 0x0006 }, /* R1431 (0x597) - AIF3 Frame Ctrl 17 */
+	{ 0x00000598, 0x0007 }, /* R1432 (0x598) - AIF3 Frame Ctrl 18 */
+	{ 0x00000599, 0x0000 }, /* R1433 (0x599) - AIF3 Tx Enables */
+	{ 0x0000059a, 0x0000 }, /* R1434 (0x59a) - AIF3 Rx Enables */
+	{ 0x000005c2, 0x0000 }, /* R1474 (0x5c2) - SPD1 Tx Control */
+	{ 0x000005e3, 0x0000 }, /* R1507 (0x5e3) - SLIMBus Framer Ref Gear */
+	{ 0x000005e5, 0x0000 }, /* R1509 (0x5e5) - SLIMBus Rates 1 */
+	{ 0x000005e6, 0x0000 }, /* R1510 (0x5e6) - SLIMBus Rates 2 */
+	{ 0x000005e7, 0x0000 }, /* R1511 (0x5e7) - SLIMBus Rates 3 */
+	{ 0x000005e8, 0x0000 }, /* R1512 (0x5e8) - SLIMBus Rates 4 */
+	{ 0x000005e9, 0x0000 }, /* R1513 (0x5e9) - SLIMBus Rates 5 */
+	{ 0x000005ea, 0x0000 }, /* R1514 (0x5ea) - SLIMBus Rates 6 */
+	{ 0x000005eb, 0x0000 }, /* R1515 (0x5eb) - SLIMBus Rates 7 */
+	{ 0x000005ec, 0x0000 }, /* R1516 (0x5ec) - SLIMBus Rates 8 */
+	{ 0x000005f5, 0x0000 }, /* R1525 (0x5f5) - SLIMBus RX Channel Enable */
+	{ 0x000005f6, 0x0000 }, /* R1526 (0x5f6) - SLIMBus TX Channel Enable */
+	{ 0x00000640, 0x0000 }, /* R1600 (0x640) - PWM1MIX Input 1 Source */
+	{ 0x00000641, 0x0080 }, /* R1601 (0x641) - PWM1MIX Input 1 Volume */
+	{ 0x00000642, 0x0000 }, /* R1602 (0x642) - PWM1MIX Input 2 Source */
+	{ 0x00000643, 0x0080 }, /* R1603 (0x643) - PWM1MIX Input 2 Volume */
+	{ 0x00000644, 0x0000 }, /* R1604 (0x644) - PWM1MIX Input 3 Source */
+	{ 0x00000645, 0x0080 }, /* R1605 (0x645) - PWM1MIX Input 3 Volume */
+	{ 0x00000646, 0x0000 }, /* R1606 (0x646) - PWM1MIX Input 4 Source */
+	{ 0x00000647, 0x0080 }, /* R1607 (0x647) - PWM1MIX Input 4 Volume */
+	{ 0x00000648, 0x0000 }, /* R1608 (0x648) - PWM2MIX Input 1 Source */
+	{ 0x00000649, 0x0080 }, /* R1609 (0x649) - PWM2MIX Input 1 Volume */
+	{ 0x0000064a, 0x0000 }, /* R1610 (0x64a) - PWM2MIX Input 2 Source */
+	{ 0x0000064b, 0x0080 }, /* R1611 (0x64b) - PWM2MIX Input 2 Volume */
+	{ 0x0000064c, 0x0000 }, /* R1612 (0x64c) - PWM2MIX Input 3 Source */
+	{ 0x0000064d, 0x0080 }, /* R1613 (0x64d) - PWM2MIX Input 3 Volume */
+	{ 0x0000064e, 0x0000 }, /* R1614 (0x64e) - PWM2MIX Input 4 Source */
+	{ 0x0000064f, 0x0080 }, /* R1615 (0x64f) - PWM2MIX Input 4 Volume */
+	{ 0x00000680, 0x0000 }, /* R1664 (0x680) - OUT1LMIX Input 1 Source */
+	{ 0x00000681, 0x0080 }, /* R1665 (0x681) - OUT1LMIX Input 1 Volume */
+	{ 0x00000682, 0x0000 }, /* R1666 (0x682) - OUT1LMIX Input 2 Source */
+	{ 0x00000683, 0x0080 }, /* R1667 (0x683) - OUT1LMIX Input 2 Volume */
+	{ 0x00000684, 0x0000 }, /* R1668 (0x684) - OUT1LMIX Input 3 Source */
+	{ 0x00000685, 0x0080 }, /* R1669 (0x685) - OUT1LMIX Input 3 Volume */
+	{ 0x00000686, 0x0000 }, /* R1670 (0x686) - OUT1LMIX Input 4 Source */
+	{ 0x00000687, 0x0080 }, /* R1671 (0x687) - OUT1LMIX Input 4 Volume */
+	{ 0x00000688, 0x0000 }, /* R1672 (0x688) - OUT1RMIX Input 1 Source */
+	{ 0x00000689, 0x0080 }, /* R1673 (0x689) - OUT1RMIX Input 1 Volume */
+	{ 0x0000068a, 0x0000 }, /* R1674 (0x68a) - OUT1RMIX Input 2 Source */
+	{ 0x0000068b, 0x0080 }, /* R1675 (0x68b) - OUT1RMIX Input 2 Volume */
+	{ 0x0000068c, 0x0000 }, /* R1676 (0x68c) - OUT1RMIX Input 3 Source */
+	{ 0x0000068d, 0x0080 }, /* R1677 (0x68d) - OUT1RMIX Input 3 Volume */
+	{ 0x0000068e, 0x0000 }, /* R1678 (0x68e) - OUT1RMIX Input 4 Source */
+	{ 0x0000068f, 0x0080 }, /* R1679 (0x68f) - OUT1RMIX Input 4 Volume */
+	{ 0x00000690, 0x0000 }, /* R1680 (0x690) - OUT2LMIX Input 1 Source */
+	{ 0x00000691, 0x0080 }, /* R1681 (0x691) - OUT2LMIX Input 1 Volume */
+	{ 0x00000692, 0x0000 }, /* R1682 (0x692) - OUT2LMIX Input 2 Source */
+	{ 0x00000693, 0x0080 }, /* R1683 (0x693) - OUT2LMIX Input 2 Volume */
+	{ 0x00000694, 0x0000 }, /* R1684 (0x694) - OUT2LMIX Input 3 Source */
+	{ 0x00000695, 0x0080 }, /* R1685 (0x695) - OUT2LMIX Input 3 Volume */
+	{ 0x00000696, 0x0000 }, /* R1686 (0x696) - OUT2LMIX Input 4 Source */
+	{ 0x00000697, 0x0080 }, /* R1687 (0x697) - OUT2LMIX Input 4 Volume */
+	{ 0x00000698, 0x0000 }, /* R1688 (0x698) - OUT2RMIX Input 1 Source */
+	{ 0x00000699, 0x0080 }, /* R1689 (0x699) - OUT2RMIX Input 1 Volume */
+	{ 0x0000069a, 0x0000 }, /* R1690 (0x69a) - OUT2RMIX Input 2 Source */
+	{ 0x0000069b, 0x0080 }, /* R1691 (0x69b) - OUT2RMIX Input 2 Volume */
+	{ 0x0000069c, 0x0000 }, /* R1692 (0x69c) - OUT2RMIX Input 3 Source */
+	{ 0x0000069d, 0x0080 }, /* R1693 (0x69d) - OUT2RMIX Input 3 Volume */
+	{ 0x0000069e, 0x0000 }, /* R1694 (0x69e) - OUT2RMIX Input 4 Source */
+	{ 0x0000069f, 0x0080 }, /* R1695 (0x69f) - OUT2RMIX Input 4 Volume */
+	{ 0x000006a0, 0x0000 }, /* R1696 (0x6a0) - OUT3LMIX Input 1 Source */
+	{ 0x000006a1, 0x0080 }, /* R1697 (0x6a1) - OUT3LMIX Input 1 Volume */
+	{ 0x000006a2, 0x0000 }, /* R1698 (0x6a2) - OUT3LMIX Input 2 Source */
+	{ 0x000006a3, 0x0080 }, /* R1699 (0x6a3) - OUT3LMIX Input 2 Volume */
+	{ 0x000006a4, 0x0000 }, /* R1700 (0x6a4) - OUT3LMIX Input 3 Source */
+	{ 0x000006a5, 0x0080 }, /* R1701 (0x6a5) - OUT3LMIX Input 3 Volume */
+	{ 0x000006a6, 0x0000 }, /* R1702 (0x6a6) - OUT3LMIX Input 4 Source */
+	{ 0x000006a7, 0x0080 }, /* R1703 (0x6a7) - OUT3LMIX Input 4 Volume */
+	{ 0x000006a8, 0x0000 }, /* R1704 (0x6a8) - OUT3RMIX Input 1 Source */
+	{ 0x000006a9, 0x0080 }, /* R1705 (0x6a9) - OUT3RMIX Input 1 Volume */
+	{ 0x000006aa, 0x0000 }, /* R1706 (0x6aa) - OUT3RMIX Input 2 Source */
+	{ 0x000006ab, 0x0080 }, /* R1707 (0x6ab) - OUT3RMIX Input 2 Volume */
+	{ 0x000006ac, 0x0000 }, /* R1708 (0x6ac) - OUT3RMIX Input 3 Source */
+	{ 0x000006ad, 0x0080 }, /* R1709 (0x6ad) - OUT3RMIX Input 3 Volume */
+	{ 0x000006ae, 0x0000 }, /* R1710 (0x6ae) - OUT3RMIX Input 4 Source */
+	{ 0x000006af, 0x0080 }, /* R1711 (0x6af) - OUT3RMIX Input 4 Volume */
+	{ 0x000006c0, 0x0000 }, /* R1728 (0x6c0) - OUT5LMIX Input 1 Source */
+	{ 0x000006c1, 0x0080 }, /* R1729 (0x6c1) - OUT5LMIX Input 1 Volume */
+	{ 0x000006c2, 0x0000 }, /* R1730 (0x6c2) - OUT5LMIX Input 2 Source */
+	{ 0x000006c3, 0x0080 }, /* R1731 (0x6c3) - OUT5LMIX Input 2 Volume */
+	{ 0x000006c4, 0x0000 }, /* R1732 (0x6c4) - OUT5LMIX Input 3 Source */
+	{ 0x000006c5, 0x0080 }, /* R1733 (0x6c5) - OUT5LMIX Input 3 Volume */
+	{ 0x000006c6, 0x0000 }, /* R1734 (0x6c6) - OUT5LMIX Input 4 Source */
+	{ 0x000006c7, 0x0080 }, /* R1735 (0x6c7) - OUT5LMIX Input 4 Volume */
+	{ 0x000006c8, 0x0000 }, /* R1736 (0x6c8) - OUT5RMIX Input 1 Source */
+	{ 0x000006c9, 0x0080 }, /* R1737 (0x6c9) - OUT5RMIX Input 1 Volume */
+	{ 0x000006ca, 0x0000 }, /* R1738 (0x6ca) - OUT5RMIX Input 2 Source */
+	{ 0x000006cb, 0x0080 }, /* R1739 (0x6cb) - OUT5RMIX Input 2 Volume */
+	{ 0x000006cc, 0x0000 }, /* R1740 (0x6cc) - OUT5RMIX Input 3 Source */
+	{ 0x000006cd, 0x0080 }, /* R1741 (0x6cd) - OUT5RMIX Input 3 Volume */
+	{ 0x000006ce, 0x0000 }, /* R1742 (0x6ce) - OUT5RMIX Input 4 Source */
+	{ 0x000006cf, 0x0080 }, /* R1743 (0x6cf) - OUT5RMIX Input 4 Volume */
+	{ 0x00000700, 0x0000 }, /* R1792 (0x700) - AIF1TX1MIX Input 1 Source */
+	{ 0x00000701, 0x0080 }, /* R1793 (0x701) - AIF1TX1MIX Input 1 Volume */
+	{ 0x00000702, 0x0000 }, /* R1794 (0x702) - AIF1TX1MIX Input 2 Source */
+	{ 0x00000703, 0x0080 }, /* R1795 (0x703) - AIF1TX1MIX Input 2 Volume */
+	{ 0x00000704, 0x0000 }, /* R1796 (0x704) - AIF1TX1MIX Input 3 Source */
+	{ 0x00000705, 0x0080 }, /* R1797 (0x705) - AIF1TX1MIX Input 3 Volume */
+	{ 0x00000706, 0x0000 }, /* R1798 (0x706) - AIF1TX1MIX Input 4 Source */
+	{ 0x00000707, 0x0080 }, /* R1799 (0x707) - AIF1TX1MIX Input 4 Volume */
+	{ 0x00000708, 0x0000 }, /* R1800 (0x708) - AIF1TX2MIX Input 1 Source */
+	{ 0x00000709, 0x0080 }, /* R1801 (0x709) - AIF1TX2MIX Input 1 Volume */
+	{ 0x0000070a, 0x0000 }, /* R1802 (0x70a) - AIF1TX2MIX Input 2 Source */
+	{ 0x0000070b, 0x0080 }, /* R1803 (0x70b) - AIF1TX2MIX Input 2 Volume */
+	{ 0x0000070c, 0x0000 }, /* R1804 (0x70c) - AIF1TX2MIX Input 3 Source */
+	{ 0x0000070d, 0x0080 }, /* R1805 (0x70d) - AIF1TX2MIX Input 3 Volume */
+	{ 0x0000070e, 0x0000 }, /* R1806 (0x70e) - AIF1TX2MIX Input 4 Source */
+	{ 0x0000070f, 0x0080 }, /* R1807 (0x70f) - AIF1TX2MIX Input 4 Volume */
+	{ 0x00000710, 0x0000 }, /* R1808 (0x710) - AIF1TX3MIX Input 1 Source */
+	{ 0x00000711, 0x0080 }, /* R1809 (0x711) - AIF1TX3MIX Input 1 Volume */
+	{ 0x00000712, 0x0000 }, /* R1810 (0x712) - AIF1TX3MIX Input 2 Source */
+	{ 0x00000713, 0x0080 }, /* R1811 (0x713) - AIF1TX3MIX Input 2 Volume */
+	{ 0x00000714, 0x0000 }, /* R1812 (0x714) - AIF1TX3MIX Input 3 Source */
+	{ 0x00000715, 0x0080 }, /* R1813 (0x715) - AIF1TX3MIX Input 3 Volume */
+	{ 0x00000716, 0x0000 }, /* R1814 (0x716) - AIF1TX3MIX Input 4 Source */
+	{ 0x00000717, 0x0080 }, /* R1815 (0x717) - AIF1TX3MIX Input 4 Volume */
+	{ 0x00000718, 0x0000 }, /* R1816 (0x718) - AIF1TX4MIX Input 1 Source */
+	{ 0x00000719, 0x0080 }, /* R1817 (0x719) - AIF1TX4MIX Input 1 Volume */
+	{ 0x0000071a, 0x0000 }, /* R1818 (0x71a) - AIF1TX4MIX Input 2 Source */
+	{ 0x0000071b, 0x0080 }, /* R1819 (0x71b) - AIF1TX4MIX Input 2 Volume */
+	{ 0x0000071c, 0x0000 }, /* R1820 (0x71c) - AIF1TX4MIX Input 3 Source */
+	{ 0x0000071d, 0x0080 }, /* R1821 (0x71d) - AIF1TX4MIX Input 3 Volume */
+	{ 0x0000071e, 0x0000 }, /* R1822 (0x71e) - AIF1TX4MIX Input 4 Source */
+	{ 0x0000071f, 0x0080 }, /* R1823 (0x71f) - AIF1TX4MIX Input 4 Volume */
+	{ 0x00000720, 0x0000 }, /* R1824 (0x720) - AIF1TX5MIX Input 1 Source */
+	{ 0x00000721, 0x0080 }, /* R1825 (0x721) - AIF1TX5MIX Input 1 Volume */
+	{ 0x00000722, 0x0000 }, /* R1826 (0x722) - AIF1TX5MIX Input 2 Source */
+	{ 0x00000723, 0x0080 }, /* R1827 (0x723) - AIF1TX5MIX Input 2 Volume */
+	{ 0x00000724, 0x0000 }, /* R1828 (0x724) - AIF1TX5MIX Input 3 Source */
+	{ 0x00000725, 0x0080 }, /* R1829 (0x725) - AIF1TX5MIX Input 3 Volume */
+	{ 0x00000726, 0x0000 }, /* R1830 (0x726) - AIF1TX5MIX Input 4 Source */
+	{ 0x00000727, 0x0080 }, /* R1831 (0x727) - AIF1TX5MIX Input 4 Volume */
+	{ 0x00000728, 0x0000 }, /* R1832 (0x728) - AIF1TX6MIX Input 1 Source */
+	{ 0x00000729, 0x0080 }, /* R1833 (0x729) - AIF1TX6MIX Input 1 Volume */
+	{ 0x0000072a, 0x0000 }, /* R1834 (0x72a) - AIF1TX6MIX Input 2 Source */
+	{ 0x0000072b, 0x0080 }, /* R1835 (0x72b) - AIF1TX6MIX Input 2 Volume */
+	{ 0x0000072c, 0x0000 }, /* R1836 (0x72c) - AIF1TX6MIX Input 3 Source */
+	{ 0x0000072d, 0x0080 }, /* R1837 (0x72d) - AIF1TX6MIX Input 3 Volume */
+	{ 0x0000072e, 0x0000 }, /* R1838 (0x72e) - AIF1TX6MIX Input 4 Source */
+	{ 0x0000072f, 0x0080 }, /* R1839 (0x72f) - AIF1TX6MIX Input 4 Volume */
+	{ 0x00000730, 0x0000 }, /* R1840 (0x730) - AIF1TX7MIX Input 1 Source */
+	{ 0x00000731, 0x0080 }, /* R1841 (0x731) - AIF1TX7MIX Input 1 Volume */
+	{ 0x00000732, 0x0000 }, /* R1842 (0x732) - AIF1TX7MIX Input 2 Source */
+	{ 0x00000733, 0x0080 }, /* R1843 (0x733) - AIF1TX7MIX Input 2 Volume */
+	{ 0x00000734, 0x0000 }, /* R1844 (0x734) - AIF1TX7MIX Input 3 Source */
+	{ 0x00000735, 0x0080 }, /* R1845 (0x735) - AIF1TX7MIX Input 3 Volume */
+	{ 0x00000736, 0x0000 }, /* R1846 (0x736) - AIF1TX7MIX Input 4 Source */
+	{ 0x00000737, 0x0080 }, /* R1847 (0x737) - AIF1TX7MIX Input 4 Volume */
+	{ 0x00000738, 0x0000 }, /* R1848 (0x738) - AIF1TX8MIX Input 1 Source */
+	{ 0x00000739, 0x0080 }, /* R1849 (0x739) - AIF1TX8MIX Input 1 Volume */
+	{ 0x0000073a, 0x0000 }, /* R1850 (0x73a) - AIF1TX8MIX Input 2 Source */
+	{ 0x0000073b, 0x0080 }, /* R1851 (0x73b) - AIF1TX8MIX Input 2 Volume */
+	{ 0x0000073c, 0x0000 }, /* R1852 (0x73c) - AIF1TX8MIX Input 3 Source */
+	{ 0x0000073d, 0x0080 }, /* R1853 (0x73d) - AIF1TX8MIX Input 3 Volume */
+	{ 0x0000073e, 0x0000 }, /* R1854 (0x73e) - AIF1TX8MIX Input 4 Source */
+	{ 0x0000073f, 0x0080 }, /* R1855 (0x73f) - AIF1TX8MIX Input 4 Volume */
+	{ 0x00000740, 0x0000 }, /* R1856 (0x740) - AIF2TX1MIX Input 1 Source */
+	{ 0x00000741, 0x0080 }, /* R1857 (0x741) - AIF2TX1MIX Input 1 Volume */
+	{ 0x00000742, 0x0000 }, /* R1858 (0x742) - AIF2TX1MIX Input 2 Source */
+	{ 0x00000743, 0x0080 }, /* R1859 (0x743) - AIF2TX1MIX Input 2 Volume */
+	{ 0x00000744, 0x0000 }, /* R1860 (0x744) - AIF2TX1MIX Input 3 Source */
+	{ 0x00000745, 0x0080 }, /* R1861 (0x745) - AIF2TX1MIX Input 3 Volume */
+	{ 0x00000746, 0x0000 }, /* R1862 (0x746) - AIF2TX1MIX Input 4 Source */
+	{ 0x00000747, 0x0080 }, /* R1863 (0x747) - AIF2TX1MIX Input 4 Volume */
+	{ 0x00000748, 0x0000 }, /* R1864 (0x748) - AIF2TX2MIX Input 1 Source */
+	{ 0x00000749, 0x0080 }, /* R1865 (0x749) - AIF2TX2MIX Input 1 Volume */
+	{ 0x0000074a, 0x0000 }, /* R1866 (0x74a) - AIF2TX2MIX Input 2 Source */
+	{ 0x0000074b, 0x0080 }, /* R1867 (0x74b) - AIF2TX2MIX Input 2 Volume */
+	{ 0x0000074c, 0x0000 }, /* R1868 (0x74c) - AIF2TX2MIX Input 3 Source */
+	{ 0x0000074d, 0x0080 }, /* R1869 (0x74d) - AIF2TX2MIX Input 3 Volume */
+	{ 0x0000074e, 0x0000 }, /* R1870 (0x74e) - AIF2TX2MIX Input 4 Source */
+	{ 0x0000074f, 0x0080 }, /* R1871 (0x74f) - AIF2TX2MIX Input 4 Volume */
+	{ 0x00000750, 0x0000 }, /* R1872 (0x750) - AIF2TX3MIX Input 1 Source */
+	{ 0x00000751, 0x0080 }, /* R1873 (0x751) - AIF2TX3MIX Input 1 Volume */
+	{ 0x00000752, 0x0000 }, /* R1874 (0x752) - AIF2TX3MIX Input 2 Source */
+	{ 0x00000753, 0x0080 }, /* R1875 (0x753) - AIF2TX3MIX Input 2 Volume */
+	{ 0x00000754, 0x0000 }, /* R1876 (0x754) - AIF2TX3MIX Input 3 Source */
+	{ 0x00000755, 0x0080 }, /* R1877 (0x755) - AIF2TX3MIX Input 3 Volume */
+	{ 0x00000756, 0x0000 }, /* R1878 (0x756) - AIF2TX3MIX Input 4 Source */
+	{ 0x00000757, 0x0080 }, /* R1879 (0x757) - AIF2TX3MIX Input 4 Volume */
+	{ 0x00000758, 0x0000 }, /* R1880 (0x758) - AIF2TX4MIX Input 1 Source */
+	{ 0x00000759, 0x0080 }, /* R1881 (0x759) - AIF2TX4MIX Input 1 Volume */
+	{ 0x0000075a, 0x0000 }, /* R1882 (0x75a) - AIF2TX4MIX Input 2 Source */
+	{ 0x0000075b, 0x0080 }, /* R1883 (0x75b) - AIF2TX4MIX Input 2 Volume */
+	{ 0x0000075c, 0x0000 }, /* R1884 (0x75c) - AIF2TX4MIX Input 3 Source */
+	{ 0x0000075d, 0x0080 }, /* R1885 (0x75d) - AIF2TX4MIX Input 3 Volume */
+	{ 0x0000075e, 0x0000 }, /* R1886 (0x75e) - AIF2TX4MIX Input 4 Source */
+	{ 0x0000075f, 0x0080 }, /* R1887 (0x75f) - AIF2TX4MIX Input 4 Volume */
+	{ 0x00000760, 0x0000 }, /* R1888 (0x760) - AIF2TX5MIX Input 1 Source */
+	{ 0x00000761, 0x0080 }, /* R1889 (0x761) - AIF2TX5MIX Input 1 Volume */
+	{ 0x00000762, 0x0000 }, /* R1890 (0x762) - AIF2TX5MIX Input 2 Source */
+	{ 0x00000763, 0x0080 }, /* R1891 (0x763) - AIF2TX5MIX Input 2 Volume */
+	{ 0x00000764, 0x0000 }, /* R1892 (0x764) - AIF2TX5MIX Input 3 Source */
+	{ 0x00000765, 0x0080 }, /* R1893 (0x765) - AIF2TX5MIX Input 3 Volume */
+	{ 0x00000766, 0x0000 }, /* R1894 (0x766) - AIF2TX5MIX Input 4 Source */
+	{ 0x00000767, 0x0080 }, /* R1895 (0x767) - AIF2TX5MIX Input 4 Volume */
+	{ 0x00000768, 0x0000 }, /* R1896 (0x768) - AIF2TX6MIX Input 1 Source */
+	{ 0x00000769, 0x0080 }, /* R1897 (0x769) - AIF2TX6MIX Input 1 Volume */
+	{ 0x0000076a, 0x0000 }, /* R1898 (0x76a) - AIF2TX6MIX Input 2 Source */
+	{ 0x0000076b, 0x0080 }, /* R1899 (0x76b) - AIF2TX6MIX Input 2 Volume */
+	{ 0x0000076c, 0x0000 }, /* R1900 (0x76c) - AIF2TX6MIX Input 3 Source */
+	{ 0x0000076d, 0x0080 }, /* R1901 (0x76d) - AIF2TX6MIX Input 3 Volume */
+	{ 0x0000076e, 0x0000 }, /* R1902 (0x76e) - AIF2TX6MIX Input 4 Source */
+	{ 0x0000076f, 0x0080 }, /* R1903 (0x76f) - AIF2TX6MIX Input 4 Volume */
+	{ 0x00000770, 0x0000 }, /* R1904 (0x770) - AIF2TX7MIX Input 1 Source */
+	{ 0x00000771, 0x0080 }, /* R1905 (0x771) - AIF2TX7MIX Input 1 Volume */
+	{ 0x00000772, 0x0000 }, /* R1906 (0x772) - AIF2TX7MIX Input 2 Source */
+	{ 0x00000773, 0x0080 }, /* R1907 (0x773) - AIF2TX7MIX Input 2 Volume */
+	{ 0x00000774, 0x0000 }, /* R1908 (0x774) - AIF2TX7MIX Input 3 Source */
+	{ 0x00000775, 0x0080 }, /* R1909 (0x775) - AIF2TX7MIX Input 3 Volume */
+	{ 0x00000776, 0x0000 }, /* R1910 (0x776) - AIF2TX7MIX Input 4 Source */
+	{ 0x00000777, 0x0080 }, /* R1911 (0x777) - AIF2TX7MIX Input 4 Volume */
+	{ 0x00000778, 0x0000 }, /* R1912 (0x778) - AIF2TX8MIX Input 1 Source */
+	{ 0x00000779, 0x0080 }, /* R1913 (0x779) - AIF2TX8MIX Input 1 Volume */
+	{ 0x0000077a, 0x0000 }, /* R1914 (0x77a) - AIF2TX8MIX Input 2 Source */
+	{ 0x0000077b, 0x0080 }, /* R1915 (0x77b) - AIF2TX8MIX Input 2 Volume */
+	{ 0x0000077c, 0x0000 }, /* R1916 (0x77c) - AIF2TX8MIX Input 3 Source */
+	{ 0x0000077d, 0x0080 }, /* R1917 (0x77d) - AIF2TX8MIX Input 3 Volume */
+	{ 0x0000077e, 0x0000 }, /* R1918 (0x77e) - AIF2TX8MIX Input 4 Source */
+	{ 0x0000077f, 0x0080 }, /* R1919 (0x77f) - AIF2TX8MIX Input 4 Volume */
+	{ 0x00000780, 0x0000 }, /* R1920 (0x780) - AIF3TX1MIX Input 1 Source */
+	{ 0x00000781, 0x0080 }, /* R1921 (0x781) - AIF3TX1MIX Input 1 Volume */
+	{ 0x00000782, 0x0000 }, /* R1922 (0x782) - AIF3TX1MIX Input 2 Source */
+	{ 0x00000783, 0x0080 }, /* R1923 (0x783) - AIF3TX1MIX Input 2 Volume */
+	{ 0x00000784, 0x0000 }, /* R1924 (0x784) - AIF3TX1MIX Input 3 Source */
+	{ 0x00000785, 0x0080 }, /* R1925 (0x785) - AIF3TX1MIX Input 3 Volume */
+	{ 0x00000786, 0x0000 }, /* R1926 (0x786) - AIF3TX1MIX Input 4 Source */
+	{ 0x00000787, 0x0080 }, /* R1927 (0x787) - AIF3TX1MIX Input 4 Volume */
+	{ 0x00000788, 0x0000 }, /* R1928 (0x788) - AIF3TX2MIX Input 1 Source */
+	{ 0x00000789, 0x0080 }, /* R1929 (0x789) - AIF3TX2MIX Input 1 Volume */
+	{ 0x0000078a, 0x0000 }, /* R1930 (0x78a) - AIF3TX2MIX Input 2 Source */
+	{ 0x0000078b, 0x0080 }, /* R1931 (0x78b) - AIF3TX2MIX Input 2 Volume */
+	{ 0x0000078c, 0x0000 }, /* R1932 (0x78c) - AIF3TX2MIX Input 3 Source */
+	{ 0x0000078d, 0x0080 }, /* R1933 (0x78d) - AIF3TX2MIX Input 3 Volume */
+	{ 0x0000078e, 0x0000 }, /* R1934 (0x78e) - AIF3TX2MIX Input 4 Source */
+	{ 0x0000078f, 0x0080 }, /* R1935 (0x78f) - AIF3TX2MIX Input 4 Volume */
+	{ 0x00000790, 0x0000 }, /* R1936 (0x790) - AIF3TX3MIX Input 1 Source */
+	{ 0x00000791, 0x0080 }, /* R1937 (0x791) - AIF3TX3MIX Input 1 Volume */
+	{ 0x00000792, 0x0000 }, /* R1938 (0x792) - AIF3TX3MIX Input 2 Source */
+	{ 0x00000793, 0x0080 }, /* R1939 (0x793) - AIF3TX3MIX Input 2 Volume */
+	{ 0x00000794, 0x0000 }, /* R1940 (0x794) - AIF3TX3MIX Input 3 Source */
+	{ 0x00000795, 0x0080 }, /* R1941 (0x795) - AIF3TX3MIX Input 3 Volume */
+	{ 0x00000796, 0x0000 }, /* R1942 (0x796) - AIF3TX3MIX Input 4 Source */
+	{ 0x00000797, 0x0080 }, /* R1943 (0x797) - AIF3TX3MIX Input 4 Volume */
+	{ 0x00000798, 0x0000 }, /* R1944 (0x798) - AIF3TX4MIX Input 1 Source */
+	{ 0x00000799, 0x0080 }, /* R1945 (0x799) - AIF3TX4MIX Input 1 Volume */
+	{ 0x0000079a, 0x0000 }, /* R1946 (0x79a) - AIF3TX4MIX Input 2 Source */
+	{ 0x0000079b, 0x0080 }, /* R1947 (0x79b) - AIF3TX4MIX Input 2 Volume */
+	{ 0x0000079c, 0x0000 }, /* R1948 (0x79c) - AIF3TX4MIX Input 3 Source */
+	{ 0x0000079d, 0x0080 }, /* R1949 (0x79d) - AIF3TX4MIX Input 3 Volume */
+	{ 0x0000079e, 0x0000 }, /* R1950 (0x79e) - AIF3TX4MIX Input 4 Source */
+	{ 0x0000079f, 0x0080 }, /* R1951 (0x79f) - AIF3TX4MIX Input 4 Volume */
+	{ 0x000007a0, 0x0000 }, /* R1952 (0x7a0) - AIF3TX5MIX Input 1 Source */
+	{ 0x000007a1, 0x0080 }, /* R1953 (0x7a1) - AIF3TX5MIX Input 1 Volume */
+	{ 0x000007a2, 0x0000 }, /* R1954 (0x7a2) - AIF3TX5MIX Input 2 Source */
+	{ 0x000007a3, 0x0080 }, /* R1955 (0x7a3) - AIF3TX5MIX Input 2 Volume */
+	{ 0x000007a4, 0x0000 }, /* R1956 (0x7a4) - AIF3TX5MIX Input 3 Source */
+	{ 0x000007a5, 0x0080 }, /* R1957 (0x7a5) - AIF3TX5MIX Input 3 Volume */
+	{ 0x000007a6, 0x0000 }, /* R1958 (0x7a6) - AIF3TX5MIX Input 4 Source */
+	{ 0x000007a7, 0x0080 }, /* R1959 (0x7a7) - AIF3TX5MIX Input 4 Volume */
+	{ 0x000007a8, 0x0000 }, /* R1960 (0x7a8) - AIF3TX6MIX Input 1 Source */
+	{ 0x000007a9, 0x0080 }, /* R1961 (0x7a9) - AIF3TX6MIX Input 1 Volume */
+	{ 0x000007aa, 0x0000 }, /* R1962 (0x7aa) - AIF3TX6MIX Input 2 Source */
+	{ 0x000007ab, 0x0080 }, /* R1963 (0x7ab) - AIF3TX6MIX Input 2 Volume */
+	{ 0x000007ac, 0x0000 }, /* R1964 (0x7ac) - AIF3TX6MIX Input 3 Source */
+	{ 0x000007ad, 0x0080 }, /* R1965 (0x7ad) - AIF3TX6MIX Input 3 Volume */
+	{ 0x000007ae, 0x0000 }, /* R1966 (0x7ae) - AIF3TX6MIX Input 4 Source */
+	{ 0x000007af, 0x0080 }, /* R1967 (0x7af) - AIF3TX6MIX Input 4 Volume */
+	{ 0x000007b0, 0x0000 }, /* R1968 (0x7b0) - AIF3TX7MIX Input 1 Source */
+	{ 0x000007b1, 0x0080 }, /* R1969 (0x7b1) - AIF3TX7MIX Input 1 Volume */
+	{ 0x000007b2, 0x0000 }, /* R1970 (0x7b2) - AIF3TX7MIX Input 2 Source */
+	{ 0x000007b3, 0x0080 }, /* R1971 (0x7b3) - AIF3TX7MIX Input 2 Volume */
+	{ 0x000007b4, 0x0000 }, /* R1972 (0x7b4) - AIF3TX7MIX Input 3 Source */
+	{ 0x000007b5, 0x0080 }, /* R1973 (0x7b5) - AIF3TX7MIX Input 3 Volume */
+	{ 0x000007b6, 0x0000 }, /* R1974 (0x7b6) - AIF3TX7MIX Input 4 Source */
+	{ 0x000007b7, 0x0080 }, /* R1975 (0x7b7) - AIF3TX7MIX Input 4 Volume */
+	{ 0x000007b8, 0x0000 }, /* R1976 (0x7b8) - AIF3TX8MIX Input 1 Source */
+	{ 0x000007b9, 0x0080 }, /* R1977 (0x7b9) - AIF3TX8MIX Input 1 Volume */
+	{ 0x000007ba, 0x0000 }, /* R1978 (0x7ba) - AIF3TX8MIX Input 2 Source */
+	{ 0x000007bb, 0x0080 }, /* R1979 (0x7bb) - AIF3TX8MIX Input 2 Volume */
+	{ 0x000007bc, 0x0000 }, /* R1980 (0x7bc) - AIF3TX8MIX Input 3 Source */
+	{ 0x000007bd, 0x0080 }, /* R1981 (0x7bd) - AIF3TX8MIX Input 3 Volume */
+	{ 0x000007be, 0x0000 }, /* R1982 (0x7be) - AIF3TX8MIX Input 4 Source */
+	{ 0x000007bf, 0x0080 }, /* R1983 (0x7bf) - AIF3TX8MIX Input 4 Volume */
+	{ 0x000007c0, 0x0000 }, /* R1984 (0x7c0) - SLIMTX1MIX Input 1 Source */
+	{ 0x000007c1, 0x0080 }, /* R1985 (0x7c1) - SLIMTX1MIX Input 1 Volume */
+	{ 0x000007c2, 0x0000 }, /* R1986 (0x7c2) - SLIMTX1MIX Input 2 Source */
+	{ 0x000007c3, 0x0080 }, /* R1987 (0x7c3) - SLIMTX1MIX Input 2 Volume */
+	{ 0x000007c4, 0x0000 }, /* R1988 (0x7c4) - SLIMTX1MIX Input 3 Source */
+	{ 0x000007c5, 0x0080 }, /* R1989 (0x7c5) - SLIMTX1MIX Input 3 Volume */
+	{ 0x000007c6, 0x0000 }, /* R1990 (0x7c6) - SLIMTX1MIX Input 4 Source */
+	{ 0x000007c7, 0x0080 }, /* R1991 (0x7c7) - SLIMTX1MIX Input 4 Volume */
+	{ 0x000007c8, 0x0000 }, /* R1992 (0x7c8) - SLIMTX2MIX Input 1 Source */
+	{ 0x000007c9, 0x0080 }, /* R1993 (0x7c9) - SLIMTX2MIX Input 1 Volume */
+	{ 0x000007ca, 0x0000 }, /* R1994 (0x7ca) - SLIMTX2MIX Input 2 Source */
+	{ 0x000007cb, 0x0080 }, /* R1995 (0x7cb) - SLIMTX2MIX Input 2 Volume */
+	{ 0x000007cc, 0x0000 }, /* R1996 (0x7cc) - SLIMTX2MIX Input 3 Source */
+	{ 0x000007cd, 0x0080 }, /* R1997 (0x7cd) - SLIMTX2MIX Input 3 Volume */
+	{ 0x000007ce, 0x0000 }, /* R1998 (0x7ce) - SLIMTX2MIX Input 4 Source */
+	{ 0x000007cf, 0x0080 }, /* R1999 (0x7cf) - SLIMTX2MIX Input 4 Volume */
+	{ 0x000007d0, 0x0000 }, /* R2000 (0x7d0) - SLIMTX3MIX Input 1 Source */
+	{ 0x000007d1, 0x0080 }, /* R2001 (0x7d1) - SLIMTX3MIX Input 1 Volume */
+	{ 0x000007d2, 0x0000 }, /* R2002 (0x7d2) - SLIMTX3MIX Input 2 Source */
+	{ 0x000007d3, 0x0080 }, /* R2003 (0x7d3) - SLIMTX3MIX Input 2 Volume */
+	{ 0x000007d4, 0x0000 }, /* R2004 (0x7d4) - SLIMTX3MIX Input 3 Source */
+	{ 0x000007d5, 0x0080 }, /* R2005 (0x7d5) - SLIMTX3MIX Input 3 Volume */
+	{ 0x000007d6, 0x0000 }, /* R2006 (0x7d6) - SLIMTX3MIX Input 4 Source */
+	{ 0x000007d7, 0x0080 }, /* R2007 (0x7d7) - SLIMTX3MIX Input 4 Volume */
+	{ 0x000007d8, 0x0000 }, /* R2008 (0x7d8) - SLIMTX4MIX Input 1 Source */
+	{ 0x000007d9, 0x0080 }, /* R2009 (0x7d9) - SLIMTX4MIX Input 1 Volume */
+	{ 0x000007da, 0x0000 }, /* R2010 (0x7da) - SLIMTX4MIX Input 2 Source */
+	{ 0x000007db, 0x0080 }, /* R2011 (0x7db) - SLIMTX4MIX Input 2 Volume */
+	{ 0x000007dc, 0x0000 }, /* R2012 (0x7dc) - SLIMTX4MIX Input 3 Source */
+	{ 0x000007dd, 0x0080 }, /* R2013 (0x7dd) - SLIMTX4MIX Input 3 Volume */
+	{ 0x000007de, 0x0000 }, /* R2014 (0x7de) - SLIMTX4MIX Input 4 Source */
+	{ 0x000007df, 0x0080 }, /* R2015 (0x7df) - SLIMTX4MIX Input 4 Volume */
+	{ 0x000007e0, 0x0000 }, /* R2016 (0x7e0) - SLIMTX5MIX Input 1 Source */
+	{ 0x000007e1, 0x0080 }, /* R2017 (0x7e1) - SLIMTX5MIX Input 1 Volume */
+	{ 0x000007e2, 0x0000 }, /* R2018 (0x7e2) - SLIMTX5MIX Input 2 Source */
+	{ 0x000007e3, 0x0080 }, /* R2019 (0x7e3) - SLIMTX5MIX Input 2 Volume */
+	{ 0x000007e4, 0x0000 }, /* R2020 (0x7e4) - SLIMTX5MIX Input 3 Source */
+	{ 0x000007e5, 0x0080 }, /* R2021 (0x7e5) - SLIMTX5MIX Input 3 Volume */
+	{ 0x000007e6, 0x0000 }, /* R2022 (0x7e6) - SLIMTX5MIX Input 4 Source */
+	{ 0x000007e7, 0x0080 }, /* R2023 (0x7e7) - SLIMTX5MIX Input 4 Volume */
+	{ 0x000007e8, 0x0000 }, /* R2024 (0x7e8) - SLIMTX6MIX Input 1 Source */
+	{ 0x000007e9, 0x0080 }, /* R2025 (0x7e9) - SLIMTX6MIX Input 1 Volume */
+	{ 0x000007ea, 0x0000 }, /* R2026 (0x7ea) - SLIMTX6MIX Input 2 Source */
+	{ 0x000007eb, 0x0080 }, /* R2027 (0x7eb) - SLIMTX6MIX Input 2 Volume */
+	{ 0x000007ec, 0x0000 }, /* R2028 (0x7ec) - SLIMTX6MIX Input 3 Source */
+	{ 0x000007ed, 0x0080 }, /* R2029 (0x7ed) - SLIMTX6MIX Input 3 Volume */
+	{ 0x000007ee, 0x0000 }, /* R2030 (0x7ee) - SLIMTX6MIX Input 4 Source */
+	{ 0x000007ef, 0x0080 }, /* R2031 (0x7ef) - SLIMTX6MIX Input 4 Volume */
+	{ 0x000007f0, 0x0000 }, /* R2032 (0x7f0) - SLIMTX7MIX Input 1 Source */
+	{ 0x000007f1, 0x0080 }, /* R2033 (0x7f1) - SLIMTX7MIX Input 1 Volume */
+	{ 0x000007f2, 0x0000 }, /* R2034 (0x7f2) - SLIMTX7MIX Input 2 Source */
+	{ 0x000007f3, 0x0080 }, /* R2035 (0x7f3) - SLIMTX7MIX Input 2 Volume */
+	{ 0x000007f4, 0x0000 }, /* R2036 (0x7f4) - SLIMTX7MIX Input 3 Source */
+	{ 0x000007f5, 0x0080 }, /* R2037 (0x7f5) - SLIMTX7MIX Input 3 Volume */
+	{ 0x000007f6, 0x0000 }, /* R2038 (0x7f6) - SLIMTX7MIX Input 4 Source */
+	{ 0x000007f7, 0x0080 }, /* R2039 (0x7f7) - SLIMTX7MIX Input 4 Volume */
+	{ 0x000007f8, 0x0000 }, /* R2040 (0x7f8) - SLIMTX8MIX Input 1 Source */
+	{ 0x000007f9, 0x0080 }, /* R2041 (0x7f9) - SLIMTX8MIX Input 1 Volume */
+	{ 0x000007fa, 0x0000 }, /* R2042 (0x7fa) - SLIMTX8MIX Input 2 Source */
+	{ 0x000007fb, 0x0080 }, /* R2043 (0x7fb) - SLIMTX8MIX Input 2 Volume */
+	{ 0x000007fc, 0x0000 }, /* R2044 (0x7fc) - SLIMTX8MIX Input 3 Source */
+	{ 0x000007fd, 0x0080 }, /* R2045 (0x7fd) - SLIMTX8MIX Input 3 Volume */
+	{ 0x000007fe, 0x0000 }, /* R2046 (0x7fe) - SLIMTX8MIX Input 4 Source */
+	{ 0x000007ff, 0x0080 }, /* R2047 (0x7ff) - SLIMTX8MIX Input 4 Volume */
+	{ 0x00000800, 0x0000 }, /* R2048 (0x800) - SPDIF1TX1MIX Input 1 Source */
+	{ 0x00000801, 0x0080 }, /* R2049 (0x801) - SPDIF1TX1MIX Input 1 Volume */
+	{ 0x00000808, 0x0000 }, /* R2056 (0x808) - SPDIF1TX2MIX Input 1 Source */
+	{ 0x00000809, 0x0080 }, /* R2057 (0x809) - SPDIF1TX2MIX Input 1 Volume */
+	{ 0x00000880, 0x0000 }, /* R2176 (0x880) - EQ1MIX Input 1 Source */
+	{ 0x00000881, 0x0080 }, /* R2177 (0x881) - EQ1MIX Input 1 Volume */
+	{ 0x00000882, 0x0000 }, /* R2178 (0x882) - EQ1MIX Input 2 Source */
+	{ 0x00000883, 0x0080 }, /* R2179 (0x883) - EQ1MIX Input 2 Volume */
+	{ 0x00000884, 0x0000 }, /* R2180 (0x884) - EQ1MIX Input 3 Source */
+	{ 0x00000885, 0x0080 }, /* R2181 (0x885) - EQ1MIX Input 3 Volume */
+	{ 0x00000886, 0x0000 }, /* R2182 (0x886) - EQ1MIX Input 4 Source */
+	{ 0x00000887, 0x0080 }, /* R2183 (0x887) - EQ1MIX Input 4 Volume */
+	{ 0x00000888, 0x0000 }, /* R2184 (0x888) - EQ2MIX Input 1 Source */
+	{ 0x00000889, 0x0080 }, /* R2185 (0x889) - EQ2MIX Input 1 Volume */
+	{ 0x0000088a, 0x0000 }, /* R2186 (0x88a) - EQ2MIX Input 2 Source */
+	{ 0x0000088b, 0x0080 }, /* R2187 (0x88b) - EQ2MIX Input 2 Volume */
+	{ 0x0000088c, 0x0000 }, /* R2188 (0x88c) - EQ2MIX Input 3 Source */
+	{ 0x0000088d, 0x0080 }, /* R2189 (0x88d) - EQ2MIX Input 3 Volume */
+	{ 0x0000088e, 0x0000 }, /* R2190 (0x88e) - EQ2MIX Input 4 Source */
+	{ 0x0000088f, 0x0080 }, /* R2191 (0x88f) - EQ2MIX Input 4 Volume */
+	{ 0x00000890, 0x0000 }, /* R2192 (0x890) - EQ3MIX Input 1 Source */
+	{ 0x00000891, 0x0080 }, /* R2193 (0x891) - EQ3MIX Input 1 Volume */
+	{ 0x00000892, 0x0000 }, /* R2194 (0x892) - EQ3MIX Input 2 Source */
+	{ 0x00000893, 0x0080 }, /* R2195 (0x893) - EQ3MIX Input 2 Volume */
+	{ 0x00000894, 0x0000 }, /* R2196 (0x894) - EQ3MIX Input 3 Source */
+	{ 0x00000895, 0x0080 }, /* R2197 (0x895) - EQ3MIX Input 3 Volume */
+	{ 0x00000896, 0x0000 }, /* R2198 (0x896) - EQ3MIX Input 4 Source */
+	{ 0x00000897, 0x0080 }, /* R2199 (0x897) - EQ3MIX Input 4 Volume */
+	{ 0x00000898, 0x0000 }, /* R2200 (0x898) - EQ4MIX Input 1 Source */
+	{ 0x00000899, 0x0080 }, /* R2201 (0x899) - EQ4MIX Input 1 Volume */
+	{ 0x0000089a, 0x0000 }, /* R2202 (0x89a) - EQ4MIX Input 2 Source */
+	{ 0x0000089b, 0x0080 }, /* R2203 (0x89b) - EQ4MIX Input 2 Volume */
+	{ 0x0000089c, 0x0000 }, /* R2204 (0x89c) - EQ4MIX Input 3 Source */
+	{ 0x0000089d, 0x0080 }, /* R2205 (0x89d) - EQ4MIX Input 3 Volume */
+	{ 0x0000089e, 0x0000 }, /* R2206 (0x89e) - EQ4MIX Input 4 Source */
+	{ 0x0000089f, 0x0080 }, /* R2207 (0x89f) - EQ4MIX Input 4 Volume */
+	{ 0x000008c0, 0x0000 }, /* R2240 (0x8c0) - DRC1LMIX Input 1 Source */
+	{ 0x000008c1, 0x0080 }, /* R2241 (0x8c1) - DRC1LMIX Input 1 Volume */
+	{ 0x000008c2, 0x0000 }, /* R2242 (0x8c2) - DRC1LMIX Input 2 Source */
+	{ 0x000008c3, 0x0080 }, /* R2243 (0x8c3) - DRC1LMIX Input 2 Volume */
+	{ 0x000008c4, 0x0000 }, /* R2244 (0x8c4) - DRC1LMIX Input 3 Source */
+	{ 0x000008c5, 0x0080 }, /* R2245 (0x8c5) - DRC1LMIX Input 3 Volume */
+	{ 0x000008c6, 0x0000 }, /* R2246 (0x8c6) - DRC1LMIX Input 4 Source */
+	{ 0x000008c7, 0x0080 }, /* R2247 (0x8c7) - DRC1LMIX Input 4 Volume */
+	{ 0x000008c8, 0x0000 }, /* R2248 (0x8c8) - DRC1RMIX Input 1 Source */
+	{ 0x000008c9, 0x0080 }, /* R2249 (0x8c9) - DRC1RMIX Input 1 Volume */
+	{ 0x000008ca, 0x0000 }, /* R2250 (0x8ca) - DRC1RMIX Input 2 Source */
+	{ 0x000008cb, 0x0080 }, /* R2251 (0x8cb) - DRC1RMIX Input 2 Volume */
+	{ 0x000008cc, 0x0000 }, /* R2252 (0x8cc) - DRC1RMIX Input 3 Source */
+	{ 0x000008cd, 0x0080 }, /* R2253 (0x8cd) - DRC1RMIX Input 3 Volume */
+	{ 0x000008ce, 0x0000 }, /* R2254 (0x8ce) - DRC1RMIX Input 4 Source */
+	{ 0x000008cf, 0x0080 }, /* R2255 (0x8cf) - DRC1RMIX Input 4 Volume */
+	{ 0x000008d0, 0x0000 }, /* R2256 (0x8d0) - DRC2LMIX Input 1 Source */
+	{ 0x000008d1, 0x0080 }, /* R2257 (0x8d1) - DRC2LMIX Input 1 Volume */
+	{ 0x000008d2, 0x0000 }, /* R2258 (0x8d2) - DRC2LMIX Input 2 Source */
+	{ 0x000008d3, 0x0080 }, /* R2259 (0x8d3) - DRC2LMIX Input 2 Volume */
+	{ 0x000008d4, 0x0000 }, /* R2260 (0x8d4) - DRC2LMIX Input 3 Source */
+	{ 0x000008d5, 0x0080 }, /* R2261 (0x8d5) - DRC2LMIX Input 3 Volume */
+	{ 0x000008d6, 0x0000 }, /* R2262 (0x8d6) - DRC2LMIX Input 4 Source */
+	{ 0x000008d7, 0x0080 }, /* R2263 (0x8d7) - DRC2LMIX Input 4 Volume */
+	{ 0x000008d8, 0x0000 }, /* R2264 (0x8d8) - DRC2RMIX Input 1 Source */
+	{ 0x000008d9, 0x0080 }, /* R2265 (0x8d9) - DRC2RMIX Input 1 Volume */
+	{ 0x000008da, 0x0000 }, /* R2266 (0x8da) - DRC2RMIX Input 2 Source */
+	{ 0x000008db, 0x0080 }, /* R2267 (0x8db) - DRC2RMIX Input 2 Volume */
+	{ 0x000008dc, 0x0000 }, /* R2268 (0x8dc) - DRC2RMIX Input 3 Source */
+	{ 0x000008dd, 0x0080 }, /* R2269 (0x8dd) - DRC2RMIX Input 3 Volume */
+	{ 0x000008de, 0x0000 }, /* R2270 (0x8de) - DRC2RMIX Input 4 Source */
+	{ 0x000008df, 0x0080 }, /* R2271 (0x8df) - DRC2RMIX Input 4 Volume */
+	{ 0x00000900, 0x0000 }, /* R2304 (0x900) - HPLP1MIX Input 1 Source */
+	{ 0x00000901, 0x0080 }, /* R2305 (0x901) - HPLP1MIX Input 1 Volume */
+	{ 0x00000902, 0x0000 }, /* R2306 (0x902) - HPLP1MIX Input 2 Source */
+	{ 0x00000903, 0x0080 }, /* R2307 (0x903) - HPLP1MIX Input 2 Volume */
+	{ 0x00000904, 0x0000 }, /* R2308 (0x904) - HPLP1MIX Input 3 Source */
+	{ 0x00000905, 0x0080 }, /* R2309 (0x905) - HPLP1MIX Input 3 Volume */
+	{ 0x00000906, 0x0000 }, /* R2310 (0x906) - HPLP1MIX Input 4 Source */
+	{ 0x00000907, 0x0080 }, /* R2311 (0x907) - HPLP1MIX Input 4 Volume */
+	{ 0x00000908, 0x0000 }, /* R2312 (0x908) - HPLP2MIX Input 1 Source */
+	{ 0x00000909, 0x0080 }, /* R2313 (0x909) - HPLP2MIX Input 1 Volume */
+	{ 0x0000090a, 0x0000 }, /* R2314 (0x90a) - HPLP2MIX Input 2 Source */
+	{ 0x0000090b, 0x0080 }, /* R2315 (0x90b) - HPLP2MIX Input 2 Volume */
+	{ 0x0000090c, 0x0000 }, /* R2316 (0x90c) - HPLP2MIX Input 3 Source */
+	{ 0x0000090d, 0x0080 }, /* R2317 (0x90d) - HPLP2MIX Input 3 Volume */
+	{ 0x0000090e, 0x0000 }, /* R2318 (0x90e) - HPLP2MIX Input 4 Source */
+	{ 0x0000090f, 0x0080 }, /* R2319 (0x90f) - HPLP2MIX Input 4 Volume */
+	{ 0x00000910, 0x0000 }, /* R2320 (0x910) - HPLP3MIX Input 1 Source */
+	{ 0x00000911, 0x0080 }, /* R2321 (0x911) - HPLP3MIX Input 1 Volume */
+	{ 0x00000912, 0x0000 }, /* R2322 (0x912) - HPLP3MIX Input 2 Source */
+	{ 0x00000913, 0x0080 }, /* R2323 (0x913) - HPLP3MIX Input 2 Volume */
+	{ 0x00000914, 0x0000 }, /* R2324 (0x914) - HPLP3MIX Input 3 Source */
+	{ 0x00000915, 0x0080 }, /* R2325 (0x915) - HPLP3MIX Input 3 Volume */
+	{ 0x00000916, 0x0000 }, /* R2326 (0x916) - HPLP3MIX Input 4 Source */
+	{ 0x00000917, 0x0080 }, /* R2327 (0x917) - HPLP3MIX Input 4 Volume */
+	{ 0x00000918, 0x0000 }, /* R2328 (0x918) - HPLP4MIX Input 1 Source */
+	{ 0x00000919, 0x0080 }, /* R2329 (0x919) - HPLP4MIX Input 1 Volume */
+	{ 0x0000091a, 0x0000 }, /* R2330 (0x91a) - HPLP4MIX Input 2 Source */
+	{ 0x0000091b, 0x0080 }, /* R2331 (0x91b) - HPLP4MIX Input 2 Volume */
+	{ 0x0000091c, 0x0000 }, /* R2332 (0x91c) - HPLP4MIX Input 3 Source */
+	{ 0x0000091d, 0x0080 }, /* R2333 (0x91d) - HPLP4MIX Input 3 Volume */
+	{ 0x0000091e, 0x0000 }, /* R2334 (0x91e) - HPLP4MIX Input 4 Source */
+	{ 0x0000091f, 0x0080 }, /* R2335 (0x91f) - HPLP4MIX Input 4 Volume */
+	{ 0x00000940, 0x0000 }, /* R2368 (0x940) - DSP1LMIX Input 1 Source */
+	{ 0x00000941, 0x0080 }, /* R2369 (0x941) - DSP1LMIX Input 1 Volume */
+	{ 0x00000942, 0x0000 }, /* R2370 (0x942) - DSP1LMIX Input 2 Source */
+	{ 0x00000943, 0x0080 }, /* R2371 (0x943) - DSP1LMIX Input 2 Volume */
+	{ 0x00000944, 0x0000 }, /* R2372 (0x944) - DSP1LMIX Input 3 Source */
+	{ 0x00000945, 0x0080 }, /* R2373 (0x945) - DSP1LMIX Input 3 Volume */
+	{ 0x00000946, 0x0000 }, /* R2374 (0x946) - DSP1LMIX Input 4 Source */
+	{ 0x00000947, 0x0080 }, /* R2375 (0x947) - DSP1LMIX Input 4 Volume */
+	{ 0x00000948, 0x0000 }, /* R2376 (0x948) - DSP1RMIX Input 1 Source */
+	{ 0x00000949, 0x0080 }, /* R2377 (0x949) - DSP1RMIX Input 1 Volume */
+	{ 0x0000094a, 0x0000 }, /* R2378 (0x94a) - DSP1RMIX Input 2 Source */
+	{ 0x0000094b, 0x0080 }, /* R2379 (0x94b) - DSP1RMIX Input 2 Volume */
+	{ 0x0000094c, 0x0000 }, /* R2380 (0x94c) - DSP1RMIX Input 3 Source */
+	{ 0x0000094d, 0x0080 }, /* R2381 (0x94d) - DSP1RMIX Input 3 Volume */
+	{ 0x0000094e, 0x0000 }, /* R2382 (0x94e) - DSP1RMIX Input 4 Source */
+	{ 0x0000094f, 0x0080 }, /* R2383 (0x94f) - DSP1RMIX Input 4 Volume */
+	{ 0x00000950, 0x0000 }, /* R2384 (0x950) - DSP1AUX1MIX Input 1 Source */
+	{ 0x00000958, 0x0000 }, /* R2392 (0x958) - DSP1AUX2MIX Input 1 Source */
+	{ 0x00000960, 0x0000 }, /* R2400 (0x960) - DSP1AUX3MIX Input 1 Source */
+	{ 0x00000968, 0x0000 }, /* R2408 (0x968) - DSP1AUX4MIX Input 1 Source */
+	{ 0x00000970, 0x0000 }, /* R2416 (0x970) - DSP1AUX5MIX Input 1 Source */
+	{ 0x00000978, 0x0000 }, /* R2424 (0x978) - DSP1AUX6MIX Input 1 Source */
+	{ 0x00000a80, 0x0000 }, /* R2688 (0xa80) - ASRC1 1LMIX Input 1 Source */
+	{ 0x00000a88, 0x0000 }, /* R2696 (0xa88) - ASRC1 1RMIX Input 1 Source */
+	{ 0x00000a90, 0x0000 }, /* R2704 (0xa90) - ASRC1 2LMIX Input 1 Source */
+	{ 0x00000a98, 0x0000 }, /* R2712 (0xa98) - ASRC1 2RMIX Input 1 Source */
+	{ 0x00000b00, 0x0000 }, /* R2816 (0xb00) - ISRC1DEC1MIX Input 1 Source */
+	{ 0x00000b08, 0x0000 }, /* R2824 (0xb08) - ISRC1DEC2MIX Input 1 Source */
+	{ 0x00000b20, 0x0000 }, /* R2848 (0xb20) - ISRC1INT1MIX Input 1 Source */
+	{ 0x00000b28, 0x0000 }, /* R2856 (0xb28) - ISRC1INT2MIX Input 1 Source */
+	{ 0x00000b40, 0x0000 }, /* R2880 (0xb40) - ISRC2DEC1MIX Input 1 Source */
+	{ 0x00000b48, 0x0000 }, /* R2888 (0xb48) - ISRC2DEC2MIX Input 1 Source */
+	{ 0x00000b60, 0x0000 }, /* R2912 (0xb60) - ISRC2INT1MIX Input 1 Source */
+	{ 0x00000b68, 0x0000 }, /* R2920 (0xb68) - ISRC2INT2MIX Input 1 Source */
+	{ 0x00000dc0, 0x0000 }, /* R3520 (0xdc0) - DFC1MIX Input 1 Source */
+	{ 0x00000dc8, 0x0000 }, /* R3528 (0xdc8) - DFC2MIX Input 1 Source */
+	{ 0x00000dd0, 0x0000 }, /* R3536 (0xdd0) - DFC3MIX Input 1 Source */
+	{ 0x00000dd8, 0x0000 }, /* R3544 (0xdd8) - DFC4MIX Input 1 Source */
+	{ 0x00000de0, 0x0000 }, /* R3552 (0xde0) - DFC5MIX Input 1 Source */
+	{ 0x00000de8, 0x0000 }, /* R3560 (0xde8) - DFC6MIX Input 1 Source */
+	{ 0x00000df0, 0x0000 }, /* R3568 (0xdf0) - DFC7MIX Input 1 Source */
+	{ 0x00000df8, 0x0000 }, /* R3576 (0xdf8) - DFC8MIX Input 1 Source */
+	{ 0x00000e00, 0x0000 }, /* R3584 (0xe00) - FX Ctrl 1 */
+	{ 0x00000e10, 0x6318 }, /* R3600 (0xe10) - EQ1 1 */
+	{ 0x00000e11, 0x6300 }, /* R3601 (0xe11) - EQ1 2 */
+	{ 0x00000e12, 0x0fc8 }, /* R3602 (0xe12) - EQ1 3 */
+	{ 0x00000e13, 0x03fe }, /* R3603 (0xe13) - EQ1 4 */
+	{ 0x00000e14, 0x00e0 }, /* R3604 (0xe14) - EQ1 5 */
+	{ 0x00000e15, 0x1ec4 }, /* R3605 (0xe15) - EQ1 6 */
+	{ 0x00000e16, 0xf136 }, /* R3606 (0xe16) - EQ1 7 */
+	{ 0x00000e17, 0x0409 }, /* R3607 (0xe17) - EQ1 8 */
+	{ 0x00000e18, 0x04cc }, /* R3608 (0xe18) - EQ1 9 */
+	{ 0x00000e19, 0x1c9b }, /* R3609 (0xe19) - EQ1 10 */
+	{ 0x00000e1a, 0xf337 }, /* R3610 (0xe1a) - EQ1 11 */
+	{ 0x00000e1b, 0x040b }, /* R3611 (0xe1b) - EQ1 12 */
+	{ 0x00000e1c, 0x0cbb }, /* R3612 (0xe1c) - EQ1 13 */
+	{ 0x00000e1d, 0x16f8 }, /* R3613 (0xe1d) - EQ1 14 */
+	{ 0x00000e1e, 0xf7d9 }, /* R3614 (0xe1e) - EQ1 15 */
+	{ 0x00000e1f, 0x040a }, /* R3615 (0xe1f) - EQ1 16 */
+	{ 0x00000e20, 0x1f14 }, /* R3616 (0xe20) - EQ1 17 */
+	{ 0x00000e21, 0x058c }, /* R3617 (0xe21) - EQ1 18 */
+	{ 0x00000e22, 0x0563 }, /* R3618 (0xe22) - EQ1 19 */
+	{ 0x00000e23, 0x4000 }, /* R3619 (0xe23) - EQ1 20 */
+	{ 0x00000e24, 0x0b75 }, /* R3620 (0xe24) - EQ1 21 */
+	{ 0x00000e26, 0x6318 }, /* R3622 (0xe26) - EQ2 1 */
+	{ 0x00000e27, 0x6300 }, /* R3623 (0xe27) - EQ2 2 */
+	{ 0x00000e28, 0x0fc8 }, /* R3624 (0xe28) - EQ2 3 */
+	{ 0x00000e29, 0x03fe }, /* R3625 (0xe29) - EQ2 4 */
+	{ 0x00000e2a, 0x00e0 }, /* R3626 (0xe2a) - EQ2 5 */
+	{ 0x00000e2b, 0x1ec4 }, /* R3627 (0xe2b) - EQ2 6 */
+	{ 0x00000e2c, 0xf136 }, /* R3628 (0xe2c) - EQ2 7 */
+	{ 0x00000e2d, 0x0409 }, /* R3629 (0xe2d) - EQ2 8 */
+	{ 0x00000e2e, 0x04cc }, /* R3630 (0xe2e) - EQ2 9 */
+	{ 0x00000e2f, 0x1c9b }, /* R3631 (0xe2f) - EQ2 10 */
+	{ 0x00000e30, 0xf337 }, /* R3632 (0xe30) - EQ2 11 */
+	{ 0x00000e31, 0x040b }, /* R3633 (0xe31) - EQ2 12 */
+	{ 0x00000e32, 0x0cbb }, /* R3634 (0xe32) - EQ2 13 */
+	{ 0x00000e33, 0x16f8 }, /* R3635 (0xe33) - EQ2 14 */
+	{ 0x00000e34, 0xf7d9 }, /* R3636 (0xe34) - EQ2 15 */
+	{ 0x00000e35, 0x040a }, /* R3637 (0xe35) - EQ2 16 */
+	{ 0x00000e36, 0x1f14 }, /* R3638 (0xe36) - EQ2 17 */
+	{ 0x00000e37, 0x058c }, /* R3639 (0xe37) - EQ2 18 */
+	{ 0x00000e38, 0x0563 }, /* R3640 (0xe38) - EQ2 19 */
+	{ 0x00000e39, 0x4000 }, /* R3641 (0xe39) - EQ2 20 */
+	{ 0x00000e3a, 0x0b75 }, /* R3642 (0xe3a) - EQ2 21 */
+	{ 0x00000e3c, 0x6318 }, /* R3644 (0xe3c) - EQ3 1 */
+	{ 0x00000e3d, 0x6300 }, /* R3645 (0xe3d) - EQ3 2 */
+	{ 0x00000e3e, 0x0fc8 }, /* R3646 (0xe3e) - EQ3 3 */
+	{ 0x00000e3f, 0x03fe }, /* R3647 (0xe3f) - EQ3 4 */
+	{ 0x00000e40, 0x00e0 }, /* R3648 (0xe40) - EQ3 5 */
+	{ 0x00000e41, 0x1ec4 }, /* R3649 (0xe41) - EQ3 6 */
+	{ 0x00000e42, 0xf136 }, /* R3650 (0xe42) - EQ3 7 */
+	{ 0x00000e43, 0x0409 }, /* R3651 (0xe43) - EQ3 8 */
+	{ 0x00000e44, 0x04cc }, /* R3652 (0xe44) - EQ3 9 */
+	{ 0x00000e45, 0x1c9b }, /* R3653 (0xe45) - EQ3 10 */
+	{ 0x00000e46, 0xf337 }, /* R3654 (0xe46) - EQ3 11 */
+	{ 0x00000e47, 0x040b }, /* R3655 (0xe47) - EQ3 12 */
+	{ 0x00000e48, 0x0cbb }, /* R3656 (0xe48) - EQ3 13 */
+	{ 0x00000e49, 0x16f8 }, /* R3657 (0xe49) - EQ3 14 */
+	{ 0x00000e4a, 0xf7d9 }, /* R3658 (0xe4a) - EQ3 15 */
+	{ 0x00000e4b, 0x040a }, /* R3659 (0xe4b) - EQ3 16 */
+	{ 0x00000e4c, 0x1f14 }, /* R3660 (0xe4c) - EQ3 17 */
+	{ 0x00000e4d, 0x058c }, /* R3661 (0xe4d) - EQ3 18 */
+	{ 0x00000e4e, 0x0563 }, /* R3662 (0xe4e) - EQ3 19 */
+	{ 0x00000e4f, 0x4000 }, /* R3663 (0xe4f) - EQ3 20 */
+	{ 0x00000e50, 0x0b75 }, /* R3664 (0xe50) - EQ3 21 */
+	{ 0x00000e52, 0x6318 }, /* R3666 (0xe52) - EQ4 1 */
+	{ 0x00000e53, 0x6300 }, /* R3667 (0xe53) - EQ4 2 */
+	{ 0x00000e54, 0x0fc8 }, /* R3668 (0xe54) - EQ4 3 */
+	{ 0x00000e55, 0x03fe }, /* R3669 (0xe55) - EQ4 4 */
+	{ 0x00000e56, 0x00e0 }, /* R3670 (0xe56) - EQ4 5 */
+	{ 0x00000e57, 0x1ec4 }, /* R3671 (0xe57) - EQ4 6 */
+	{ 0x00000e58, 0xf136 }, /* R3672 (0xe58) - EQ4 7 */
+	{ 0x00000e59, 0x0409 }, /* R3673 (0xe59) - EQ4 8 */
+	{ 0x00000e5a, 0x04cc }, /* R3674 (0xe5a) - EQ4 9 */
+	{ 0x00000e5b, 0x1c9b }, /* R3675 (0xe5b) - EQ4 10 */
+	{ 0x00000e5c, 0xf337 }, /* R3676 (0xe5c) - EQ4 11 */
+	{ 0x00000e5d, 0x040b }, /* R3677 (0xe5d) - EQ4 12 */
+	{ 0x00000e5e, 0x0cbb }, /* R3678 (0xe5e) - EQ4 13 */
+	{ 0x00000e5f, 0x16f8 }, /* R3679 (0xe5f) - EQ4 14 */
+	{ 0x00000e60, 0xf7d9 }, /* R3680 (0xe60) - EQ4 15 */
+	{ 0x00000e61, 0x040a }, /* R3681 (0xe61) - EQ4 16 */
+	{ 0x00000e62, 0x1f14 }, /* R3682 (0xe62) - EQ4 17 */
+	{ 0x00000e63, 0x058c }, /* R3683 (0xe63) - EQ4 18 */
+	{ 0x00000e64, 0x0563 }, /* R3684 (0xe64) - EQ4 19 */
+	{ 0x00000e65, 0x4000 }, /* R3685 (0xe65) - EQ4 20 */
+	{ 0x00000e66, 0x0b75 }, /* R3686 (0xe66) - EQ4 21 */
+	{ 0x00000e80, 0x0018 }, /* R3712 (0xe80) - DRC1 Ctrl 1 */
+	{ 0x00000e81, 0x0933 }, /* R3713 (0xe81) - DRC1 Ctrl 2 */
+	{ 0x00000e82, 0x0018 }, /* R3714 (0xe82) - DRC1 Ctrl 3 */
+	{ 0x00000e83, 0x0000 }, /* R3715 (0xe83) - DRC1 Ctrl 4 */
+	{ 0x00000e84, 0x0000 }, /* R3716 (0xe84) - DRC1 Ctrl 5 */
+	{ 0x00000e88, 0x0018 }, /* R3720 (0xe88) - DRC2 Ctrl 1 */
+	{ 0x00000e89, 0x0933 }, /* R3721 (0xe89) - DRC2 Ctrl 2 */
+	{ 0x00000e8a, 0x0018 }, /* R3722 (0xe8a) - DRC2 Ctrl 3 */
+	{ 0x00000e8b, 0x0000 }, /* R3723 (0xe8b) - DRC2 Ctrl 4 */
+	{ 0x00000e8c, 0x0000 }, /* R3724 (0xe8c) - DRC2 Ctrl 5 */
+	{ 0x00000ec0, 0x0000 }, /* R3776 (0xec0) - HPLPF1 1 */
+	{ 0x00000ec1, 0x0000 }, /* R3777 (0xec1) - HPLPF1 2 */
+	{ 0x00000ec4, 0x0000 }, /* R3780 (0xec4) - HPLPF2 1 */
+	{ 0x00000ec5, 0x0000 }, /* R3781 (0xec5) - HPLPF2 2 */
+	{ 0x00000ec8, 0x0000 }, /* R3784 (0xec8) - HPLPF3 1 */
+	{ 0x00000ec9, 0x0000 }, /* R3785 (0xec9) - HPLPF3 2 */
+	{ 0x00000ecc, 0x0000 }, /* R3788 (0xecc) - HPLPF4 1 */
+	{ 0x00000ecd, 0x0000 }, /* R3789 (0xecd) - HPLPF4 2 */
+	{ 0x00000ee0, 0x0000 }, /* R3808 (0xee0) - ASRC1 Enable */
+	{ 0x00000ee2, 0x0000 }, /* R3810 (0xee2) - ASRC1 Rate 1 */
+	{ 0x00000ee3, 0x4000 }, /* R3811 (0xee3) - ASRC1 Rate 2 */
+	{ 0x00000ef0, 0x0000 }, /* R3824 (0xef0) - ISRC1 Ctrl 1 */
+	{ 0x00000ef1, 0x0001 }, /* R3825 (0xef1) - ISRC1 Ctrl 2 */
+	{ 0x00000ef2, 0x0000 }, /* R3826 (0xef2) - ISRC1 Ctrl 3 */
+	{ 0x00000ef3, 0x0000 }, /* R3827 (0xef3) - ISRC2 Ctrl 1 */
+	{ 0x00000ef4, 0x0001 }, /* R3828 (0xef4) - ISRC2 Ctrl 2 */
+	{ 0x00000ef5, 0x0000 }, /* R3829 (0xef5) - ISRC2 Ctrl 3 */
+	{ 0x000010c0, 0x0008 }, /* R4288 (0x10c0) - AUXPDM1 Ctrl 0 */
+	{ 0x000010c1, 0x4000 }, /* R4289 (0x10c1) - AUXPDM1 Ctrl 1 */
+	{ 0x00001480, 0x0000 }, /* R5248 (0x1480) - DFC1 Ctrl W0 */
+	{ 0x00001482, 0x1f00 }, /* R5250 (0x1482) - DFC1 Rx W0 */
+	{ 0x00001484, 0x1f00 }, /* R5252 (0x1484) - DFC1 Tx W0 */
+	{ 0x00001486, 0x0000 }, /* R5254 (0x1486) - DFC2 Ctrl W0 */
+	{ 0x00001488, 0x1f00 }, /* R5256 (0x1488) - DFC2 Rx W0 */
+	{ 0x0000148a, 0x1f00 }, /* R5258 (0x148a) - DFC2 Tx W0 */
+	{ 0x0000148c, 0x0000 }, /* R5260 (0x148c) - DFC3 Ctrl W0 */
+	{ 0x0000148e, 0x1f00 }, /* R5262 (0x148e) - DFC3 Rx W0 */
+	{ 0x00001490, 0x1f00 }, /* R5264 (0x1490) - DFC3 Tx W0 */
+	{ 0x00001492, 0x0000 }, /* R5266 (0x1492) - DFC4 Ctrl W0 */
+	{ 0x00001494, 0x1f00 }, /* R5268 (0x1494) - DFC4 Rx W0 */
+	{ 0x00001496, 0x1f00 }, /* R5270 (0x1496) - DFC4 Tx W0 */
+	{ 0x00001498, 0x0000 }, /* R5272 (0x1498) - DFC5 Ctrl W0 */
+	{ 0x0000149a, 0x1f00 }, /* R5274 (0x149a) - DFC5 Rx W0 */
+	{ 0x0000149c, 0x1f00 }, /* R5276 (0x149c) - DFC5 Tx W0 */
+	{ 0x0000149e, 0x0000 }, /* R5278 (0x149e) - DFC6 Ctrl W0 */
+	{ 0x000014a0, 0x1f00 }, /* R5280 (0x14a0) - DFC6 Rx W0 */
+	{ 0x000014a2, 0x1f00 }, /* R5282 (0x14a2) - DFC6 Tx W0 */
+	{ 0x000014a4, 0x0000 }, /* R5284 (0x14a4) - DFC7 Ctrl W0 */
+	{ 0x000014a6, 0x1f00 }, /* R5286 (0x14a6) - DFC7 Rx W0 */
+	{ 0x000014a8, 0x1f00 }, /* R5288 (0x14a8) - DFC7 Tx W0 */
+	{ 0x000014aa, 0x0000 }, /* R5290 (0x14aa) - DFC8 Ctrl W0 */
+	{ 0x000014ac, 0x1f00 }, /* R5292 (0x14ac) - DFC8 Rx W0 */
+	{ 0x000014ae, 0x1f00 }, /* R5294 (0x14ae) - DFC8 Tx W0 */
+	{ 0x00001700, 0x2001 }, /* R5888 (0x1700) - GPIO1 Ctrl 1 */
+	{ 0x00001701, 0xf000 }, /* R5889 (0x1701) - GPIO1 Ctrl 2 */
+	{ 0x00001702, 0x2001 }, /* R5890 (0x1702) - GPIO2 Ctrl 1 */
+	{ 0x00001703, 0xf000 }, /* R5891 (0x1703) - GPIO2 Ctrl 2 */
+	{ 0x00001704, 0x2001 }, /* R5892 (0x1704) - GPIO3 Ctrl 1 */
+	{ 0x00001705, 0xf000 }, /* R5893 (0x1705) - GPIO3 Ctrl 2 */
+	{ 0x00001706, 0x2001 }, /* R5894 (0x1706) - GPIO4 Ctrl 1 */
+	{ 0x00001707, 0xf000 }, /* R5895 (0x1707) - GPIO4 Ctrl 2 */
+	{ 0x00001708, 0x2001 }, /* R5896 (0x1708) - GPIO5 Ctrl 1 */
+	{ 0x00001709, 0xf000 }, /* R5897 (0x1709) - GPIO5 Ctrl 2 */
+	{ 0x0000170a, 0x2001 }, /* R5898 (0x170a) - GPIO6 Ctrl 1 */
+	{ 0x0000170b, 0xf000 }, /* R5899 (0x170b) - GPIO6 Ctrl 2 */
+	{ 0x0000170c, 0x2001 }, /* R5900 (0x170c) - GPIO7 Ctrl 1 */
+	{ 0x0000170d, 0xf000 }, /* R5901 (0x170d) - GPIO7 Ctrl 2 */
+	{ 0x0000170e, 0x2001 }, /* R5902 (0x170e) - GPIO8 Ctrl 1 */
+	{ 0x0000170f, 0xf000 }, /* R5903 (0x170f) - GPIO8 Ctrl 2 */
+	{ 0x00001710, 0x2001 }, /* R5904 (0x1710) - GPIO9 Ctrl 1 */
+	{ 0x00001711, 0xf000 }, /* R5905 (0x1711) - GPIO9 Ctrl 2 */
+	{ 0x00001712, 0x2001 }, /* R5906 (0x1712) - GPIO10 Ctrl 1 */
+	{ 0x00001713, 0xf000 }, /* R5907 (0x1713) - GPIO10 Ctrl 2 */
+	{ 0x00001714, 0x2001 }, /* R5908 (0x1714) - GPIO11 Ctrl 1 */
+	{ 0x00001715, 0xf000 }, /* R5909 (0x1715) - GPIO11 Ctrl 2 */
+	{ 0x00001716, 0x2001 }, /* R5910 (0x1716) - GPIO12 Ctrl 1 */
+	{ 0x00001717, 0xf000 }, /* R5911 (0x1717) - GPIO12 Ctrl 2 */
+	{ 0x00001718, 0x2001 }, /* R5912 (0x1718) - GPIO13 Ctrl 1 */
+	{ 0x00001719, 0xf000 }, /* R5913 (0x1719) - GPIO13 Ctrl 2 */
+	{ 0x0000171a, 0x2001 }, /* R5914 (0x171a) - GPIO14 Ctrl 1 */
+	{ 0x0000171b, 0xf000 }, /* R5915 (0x171b) - GPIO14 Ctrl 2 */
+	{ 0x0000171c, 0x2001 }, /* R5916 (0x171c) - GPIO15 Ctrl 1 */
+	{ 0x0000171d, 0xf000 }, /* R5917 (0x171d) - GPIO15 Ctrl 2 */
+	{ 0x0000171e, 0x2001 }, /* R5918 (0x171e) - GPIO16 Ctrl 1 */
+	{ 0x0000171f, 0xf000 }, /* R5919 (0x171f) - GPIO16 Ctrl 2 */
+	{ 0x00001840, 0x1200 }, /* R6208 (0x1840) - IRQ1 Mask 1 */
+	{ 0x00001841, 0x77e0 }, /* R6209 (0x1841) - IRQ1 Mask 2 */
+	{ 0x00001842, 0xffff }, /* R6210 (0x1842) - IRQ1 Mask 3 */
+	{ 0x00001843, 0xffff }, /* R6211 (0x1843) - IRQ1 Mask 4 */
+	{ 0x00001844, 0xffff }, /* R6212 (0x1844) - IRQ1 Mask 5 */
+	{ 0x00001845, 0x0301 }, /* R6213 (0x1845) - IRQ1 Mask 6 */
+	{ 0x00001846, 0x0f3f }, /* R6214 (0x1846) - IRQ1 Mask 7 */
+	{ 0x00001847, 0xffff }, /* R6215 (0x1847) - IRQ1 Mask 8 */
+	{ 0x00001848, 0x031f }, /* R6216 (0x1848) - IRQ1 Mask 9 */
+	{ 0x00001849, 0x031f }, /* R6217 (0x1849) - IRQ1 Mask 10 */
+	{ 0x0000184a, 0xffff }, /* R6218 (0x184a) - IRQ1 Mask 11 */
+	{ 0x0000184b, 0x033f }, /* R6219 (0x184b) - IRQ1 Mask 12 */
+	{ 0x0000184c, 0x003f }, /* R6220 (0x184c) - IRQ1 Mask 13 */
+	{ 0x0000184d, 0x003f }, /* R6221 (0x184d) - IRQ1 Mask 14 */
+	{ 0x0000184e, 0x1000 }, /* R6222 (0x184e) - IRQ1 Mask 15 */
+	{ 0x0000184f, 0xffff }, /* R6223 (0x184f) - IRQ1 Mask 16 */
+	{ 0x00001850, 0xffff }, /* R6224 (0x1850) - IRQ1 Mask 17 */
+	{ 0x00001851, 0xffff }, /* R6225 (0x1851) - IRQ1 Mask 18 */
+	{ 0x00001852, 0xffff }, /* R6226 (0x1852) - IRQ1 Mask 19 */
+	{ 0x00001853, 0xffff }, /* R6227 (0x1853) - IRQ1 Mask 20 */
+	{ 0x00001854, 0x0001 }, /* R6228 (0x1854) - IRQ1 Mask 21 */
+	{ 0x00001855, 0x0001 }, /* R6229 (0x1855) - IRQ1 Mask 22 */
+	{ 0x00001856, 0x0001 }, /* R6230 (0x1856) - IRQ1 Mask 23 */
+	{ 0x00001857, 0x0001 }, /* R6231 (0x1857) - IRQ1 Mask 24 */
+	{ 0x00001858, 0x0001 }, /* R6232 (0x1858) - IRQ1 Mask 25 */
+	{ 0x00001859, 0xffff }, /* R6233 (0x1859) - IRQ1 Mask 26 */
+	{ 0x0000185a, 0x0001 }, /* R6234 (0x185a) - IRQ1 Mask 27 */
+	{ 0x0000185b, 0x0001 }, /* R6235 (0x185b) - IRQ1 Mask 28 */
+	{ 0x0000185c, 0xffff }, /* R6236 (0x185c) - IRQ1 Mask 29 */
+	{ 0x0000185d, 0x0001 }, /* R6237 (0x185d) - IRQ1 Mask 30 */
+	{ 0x0000185e, 0xffff }, /* R6238 (0x185e) - IRQ1 Mask 31 */
+	{ 0x0000185f, 0xffff }, /* R6239 (0x185f) - IRQ1 Mask 32 */
+	{ 0x00001860, 0x0001 }, /* R6240 (0x1860) - IRQ1 Mask 33 */
+	{ 0x00001948, 0x031f }, /* R6472 (0x1948) - IRQ2 Mask 9 */
+	{ 0x00001a06, 0x0000 }, /* R6662 (0x1a06) - Interrupt Debounce 7 */
+	{ 0x00001a80, 0x4400 }, /* R6784 (0x1a80) - IRQ1 Ctrl */
+};
+
+static bool cs47l92_is_adsp_memory(unsigned int reg)
+{	/* classifies reg purely by address; no device access */
+	switch (reg) {	/* true iff reg is inside one of four ranges */
+	case 0x080000 ... 0x082ffe:	/* GNU case range, bounds inclusive */
+	case 0x0a0000 ... 0x0a1ffe:
+	case 0x0c0000 ... 0x0c1ffe:
+	case 0x0e0000 ... 0x0e1ffe:
+		return true;
+	default:	/* every address outside the ranges above */
+		return false;
+	}
+}
+
+static bool cs47l92_16bit_readable_register(struct device *dev,
+					    unsigned int reg)
+{	/* NOTE(review): regmap readable_reg hook? confirm */
+	switch (reg) {	/* dev is unused; allow-list keyed on reg */
+	case MADERA_SOFTWARE_RESET:
+	case MADERA_HARDWARE_REVISION:
+	case MADERA_WRITE_SEQUENCER_CTRL_0 ... MADERA_WRITE_SEQUENCER_CTRL_2:
+	case MADERA_TONE_GENERATOR_1 ... MADERA_TONE_GENERATOR_5:
+	case MADERA_PWM_DRIVE_1 ... MADERA_PWM_DRIVE_3:
+	case MADERA_SAMPLE_RATE_SEQUENCE_SELECT_1:
+	case MADERA_SAMPLE_RATE_SEQUENCE_SELECT_2:
+	case MADERA_SAMPLE_RATE_SEQUENCE_SELECT_3:
+	case MADERA_SAMPLE_RATE_SEQUENCE_SELECT_4:
+	case MADERA_HAPTICS_CONTROL_1 ... MADERA_HAPTICS_CONTROL_2:
+	case MADERA_HAPTICS_PHASE_1_INTENSITY:
+	case MADERA_HAPTICS_PHASE_1_DURATION:
+	case MADERA_HAPTICS_PHASE_2_INTENSITY:
+	case MADERA_HAPTICS_PHASE_2_DURATION:
+	case MADERA_HAPTICS_PHASE_3_INTENSITY:
+	case MADERA_HAPTICS_PHASE_3_DURATION:
+	case MADERA_HAPTICS_STATUS:
+	case MADERA_COMFORT_NOISE_GENERATOR:
+	case MADERA_CLOCK_32K_1:
+	case MADERA_SYSTEM_CLOCK_1:
+	case MADERA_SAMPLE_RATE_1 ... MADERA_SAMPLE_RATE_3:
+	case MADERA_SAMPLE_RATE_1_STATUS:
+	case MADERA_SAMPLE_RATE_2_STATUS:
+	case MADERA_SAMPLE_RATE_3_STATUS:
+	case MADERA_ASYNC_CLOCK_1:
+	case MADERA_ASYNC_SAMPLE_RATE_1:
+	case MADERA_ASYNC_SAMPLE_RATE_1_STATUS:
+	case MADERA_ASYNC_SAMPLE_RATE_2:
+	case MADERA_ASYNC_SAMPLE_RATE_2_STATUS:
+	case MADERA_DSP_CLOCK_1:
+	case MADERA_DSP_CLOCK_2:
+	case MADERA_OUTPUT_SYSTEM_CLOCK:
+	case MADERA_OUTPUT_ASYNC_CLOCK:
+	case MADERA_RATE_ESTIMATOR_1 ...  MADERA_RATE_ESTIMATOR_5:
+	case MADERA_FLL1_CONTROL_1 ... MADERA_FLL1_CONTROL_6:	/* FLL1 */
+	case CS47L92_FLL1_CONTROL_7 ...  CS47L92_FLL1_CONTROL_10:
+	case MADERA_FLL1_CONTROL_11:
+	case MADERA_FLL1_DIGITAL_TEST_1:
+	case MADERA_FLL1_SYNCHRONISER_1 ... MADERA_FLL1_SYNCHRONISER_6:
+	case CS47L92_FLL1_GPIO_CLOCK:
+	case MADERA_FLL2_CONTROL_1 ... MADERA_FLL2_CONTROL_6:	/* FLL2 */
+	case CS47L92_FLL2_CONTROL_7 ... CS47L92_FLL2_CONTROL_10:
+	case MADERA_FLL2_CONTROL_11:
+	case MADERA_FLL2_DIGITAL_TEST_1:
+	case MADERA_FLL2_SYNCHRONISER_1 ... MADERA_FLL2_SYNCHRONISER_6:
+	case CS47L92_FLL2_GPIO_CLOCK:
+	case MADERA_MIC_CHARGE_PUMP_1:
+	case MADERA_LDO2_CONTROL_1:
+	case MADERA_MIC_BIAS_CTRL_1:
+	case MADERA_MIC_BIAS_CTRL_2:
+	case MADERA_MIC_BIAS_CTRL_5:
+	case MADERA_MIC_BIAS_CTRL_6:
+	case MADERA_HP_CTRL_1L:
+	case MADERA_HP_CTRL_1R:
+	case MADERA_HP_CTRL_2L:
+	case MADERA_HP_CTRL_2R:
+	case MADERA_HP_CTRL_3L:
+	case MADERA_HP_CTRL_3R:
+	case MADERA_ACCESSORY_DETECT_MODE_1:
+	case MADERA_HEADPHONE_DETECT_0:
+	case MADERA_HEADPHONE_DETECT_1:
+	case MADERA_HEADPHONE_DETECT_2:
+	case MADERA_HEADPHONE_DETECT_3:
+	case MADERA_HEADPHONE_DETECT_5:
+	case MADERA_MICD_CLAMP_CONTROL:
+	case MADERA_MIC_DETECT_1_CONTROL_0:
+	case MADERA_MIC_DETECT_1_CONTROL_1:
+	case MADERA_MIC_DETECT_1_CONTROL_2:
+	case MADERA_MIC_DETECT_1_CONTROL_3:
+	case MADERA_MIC_DETECT_1_CONTROL_4:
+	case MADERA_MIC_DETECT_1_LEVEL_1 ... MADERA_MIC_DETECT_1_LEVEL_4:
+	case MADERA_MIC_DETECT_2_CONTROL_0:
+	case MADERA_MIC_DETECT_2_CONTROL_1:
+	case MADERA_MIC_DETECT_2_CONTROL_2:
+	case MADERA_MIC_DETECT_2_CONTROL_3:
+	case MADERA_MIC_DETECT_2_CONTROL_4:
+	case MADERA_MIC_DETECT_2_LEVEL_1 ... MADERA_MIC_DETECT_2_LEVEL_4:
+	case MADERA_GP_SWITCH_1:
+	case MADERA_JACK_DETECT_ANALOGUE:
+	case MADERA_INPUT_ENABLES:	/* input path: IN1-IN4, L and R */
+	case MADERA_INPUT_ENABLES_STATUS:
+	case MADERA_INPUT_RATE:
+	case MADERA_INPUT_VOLUME_RAMP:
+	case MADERA_HPF_CONTROL:
+	case MADERA_IN1L_CONTROL:
+	case MADERA_ADC_DIGITAL_VOLUME_1L:
+	case MADERA_DMIC1L_CONTROL:
+	case MADERA_IN1L_RATE_CONTROL:
+	case MADERA_IN1R_CONTROL:
+	case MADERA_ADC_DIGITAL_VOLUME_1R:
+	case MADERA_DMIC1R_CONTROL:
+	case MADERA_IN1R_RATE_CONTROL:
+	case MADERA_IN2L_CONTROL:
+	case MADERA_ADC_DIGITAL_VOLUME_2L:
+	case MADERA_DMIC2L_CONTROL:
+	case MADERA_IN2L_RATE_CONTROL:
+	case MADERA_IN2R_CONTROL:
+	case MADERA_ADC_DIGITAL_VOLUME_2R:
+	case MADERA_DMIC2R_CONTROL:
+	case MADERA_IN2R_RATE_CONTROL:
+	case MADERA_IN3L_CONTROL:
+	case MADERA_ADC_DIGITAL_VOLUME_3L:
+	case MADERA_DMIC3L_CONTROL:
+	case MADERA_IN3L_RATE_CONTROL:
+	case MADERA_IN3R_CONTROL:
+	case MADERA_ADC_DIGITAL_VOLUME_3R:
+	case MADERA_DMIC3R_CONTROL:
+	case MADERA_IN3R_RATE_CONTROL:
+	case MADERA_IN4L_CONTROL:
+	case MADERA_ADC_DIGITAL_VOLUME_4L:
+	case MADERA_DMIC4L_CONTROL:
+	case MADERA_IN4L_RATE_CONTROL:
+	case MADERA_IN4R_CONTROL:
+	case MADERA_ADC_DIGITAL_VOLUME_4R:
+	case MADERA_DMIC4R_CONTROL:
+	case MADERA_IN4R_RATE_CONTROL:
+	case MADERA_OUTPUT_ENABLES_1:	/* output path registers */
+	case MADERA_OUTPUT_STATUS_1:
+	case MADERA_RAW_OUTPUT_STATUS_1:
+	case MADERA_OUTPUT_RATE_1:
+	case MADERA_OUTPUT_VOLUME_RAMP:
+	case MADERA_OUTPUT_PATH_CONFIG_1L:
+	case MADERA_DAC_DIGITAL_VOLUME_1L:
+	case MADERA_OUTPUT_PATH_CONFIG_1:
+	case MADERA_NOISE_GATE_SELECT_1L:
+	case MADERA_OUTPUT_PATH_CONFIG_1R:
+	case MADERA_DAC_DIGITAL_VOLUME_1R:
+	case MADERA_NOISE_GATE_SELECT_1R:
+	case MADERA_OUTPUT_PATH_CONFIG_2L:
+	case MADERA_DAC_DIGITAL_VOLUME_2L:
+	case MADERA_OUTPUT_PATH_CONFIG_2:
+	case MADERA_NOISE_GATE_SELECT_2L:
+	case MADERA_OUTPUT_PATH_CONFIG_2R:
+	case MADERA_DAC_DIGITAL_VOLUME_2R:
+	case MADERA_NOISE_GATE_SELECT_2R:
+	case MADERA_OUTPUT_PATH_CONFIG_3L:
+	case MADERA_DAC_DIGITAL_VOLUME_3L:
+	case MADERA_OUTPUT_PATH_CONFIG_3:
+	case MADERA_NOISE_GATE_SELECT_3L:
+	case MADERA_OUTPUT_PATH_CONFIG_3R:
+	case MADERA_DAC_DIGITAL_VOLUME_3R:
+	case MADERA_NOISE_GATE_SELECT_3R:
+	case MADERA_OUTPUT_PATH_CONFIG_5L:	/* list has no OUT4 labels */
+	case MADERA_DAC_DIGITAL_VOLUME_5L:
+	case MADERA_NOISE_GATE_SELECT_5L:
+	case MADERA_OUTPUT_PATH_CONFIG_5R:
+	case MADERA_DAC_DIGITAL_VOLUME_5R:
+	case MADERA_NOISE_GATE_SELECT_5R:
+	case MADERA_DAC_AEC_CONTROL_1 ...  MADERA_DAC_AEC_CONTROL_2:
+	case MADERA_NOISE_GATE_CONTROL:
+	case MADERA_PDM_SPK1_CTRL_1 ... MADERA_PDM_SPK1_CTRL_2:
+	case MADERA_HP1_SHORT_CIRCUIT_CTRL:
+	case MADERA_HP2_SHORT_CIRCUIT_CTRL:
+	case MADERA_HP3_SHORT_CIRCUIT_CTRL:
+	case MADERA_AIF1_BCLK_CTRL:	/* AIF1 */
+	case MADERA_AIF1_TX_PIN_CTRL:
+	case MADERA_AIF1_RX_PIN_CTRL:
+	case MADERA_AIF1_RATE_CTRL:
+	case MADERA_AIF1_FORMAT:
+	case MADERA_AIF1_RX_BCLK_RATE:
+	case MADERA_AIF1_FRAME_CTRL_1 ... MADERA_AIF1_FRAME_CTRL_18:
+	case MADERA_AIF1_TX_ENABLES:
+	case MADERA_AIF1_RX_ENABLES:
+	case MADERA_AIF2_BCLK_CTRL:	/* AIF2 */
+	case MADERA_AIF2_TX_PIN_CTRL:
+	case MADERA_AIF2_RX_PIN_CTRL:
+	case MADERA_AIF2_RATE_CTRL:
+	case MADERA_AIF2_FORMAT:
+	case MADERA_AIF2_RX_BCLK_RATE:
+	case MADERA_AIF2_FRAME_CTRL_1 ... MADERA_AIF2_FRAME_CTRL_18:
+	case MADERA_AIF2_TX_ENABLES:
+	case MADERA_AIF2_RX_ENABLES:
+	case MADERA_AIF3_BCLK_CTRL:	/* AIF3 */
+	case MADERA_AIF3_TX_PIN_CTRL:
+	case MADERA_AIF3_RX_PIN_CTRL:
+	case MADERA_AIF3_RATE_CTRL:
+	case MADERA_AIF3_FORMAT:
+	case MADERA_AIF3_RX_BCLK_RATE:
+	case MADERA_AIF3_FRAME_CTRL_1 ... MADERA_AIF3_FRAME_CTRL_18:
+	case MADERA_AIF3_TX_ENABLES:
+	case MADERA_AIF3_RX_ENABLES:
+	case MADERA_SPD1_TX_CONTROL:
+	case MADERA_SPD1_TX_CHANNEL_STATUS_1:
+	case MADERA_SPD1_TX_CHANNEL_STATUS_2:
+	case MADERA_SPD1_TX_CHANNEL_STATUS_3:
+	case MADERA_SLIMBUS_FRAMER_REF_GEAR:	/* SLIMbus */
+	case MADERA_SLIMBUS_RATES_1 ... MADERA_SLIMBUS_RATES_8:
+	case MADERA_SLIMBUS_RX_CHANNEL_ENABLE:
+	case MADERA_SLIMBUS_TX_CHANNEL_ENABLE:
+	case MADERA_SLIMBUS_RX_PORT_STATUS:
+	case MADERA_SLIMBUS_TX_PORT_STATUS:
+	case MADERA_PWM1MIX_INPUT_1_SOURCE:	/* *MIX_INPUT_n SOURCE/VOLUME */
+	case MADERA_PWM1MIX_INPUT_1_VOLUME:
+	case MADERA_PWM1MIX_INPUT_2_SOURCE:
+	case MADERA_PWM1MIX_INPUT_2_VOLUME:
+	case MADERA_PWM1MIX_INPUT_3_SOURCE:
+	case MADERA_PWM1MIX_INPUT_3_VOLUME:
+	case MADERA_PWM1MIX_INPUT_4_SOURCE:
+	case MADERA_PWM1MIX_INPUT_4_VOLUME:
+	case MADERA_PWM2MIX_INPUT_1_SOURCE:
+	case MADERA_PWM2MIX_INPUT_1_VOLUME:
+	case MADERA_PWM2MIX_INPUT_2_SOURCE:
+	case MADERA_PWM2MIX_INPUT_2_VOLUME:
+	case MADERA_PWM2MIX_INPUT_3_SOURCE:
+	case MADERA_PWM2MIX_INPUT_3_VOLUME:
+	case MADERA_PWM2MIX_INPUT_4_SOURCE:
+	case MADERA_PWM2MIX_INPUT_4_VOLUME:
+	case MADERA_OUT1LMIX_INPUT_1_SOURCE:
+	case MADERA_OUT1LMIX_INPUT_1_VOLUME:
+	case MADERA_OUT1LMIX_INPUT_2_SOURCE:
+	case MADERA_OUT1LMIX_INPUT_2_VOLUME:
+	case MADERA_OUT1LMIX_INPUT_3_SOURCE:
+	case MADERA_OUT1LMIX_INPUT_3_VOLUME:
+	case MADERA_OUT1LMIX_INPUT_4_SOURCE:
+	case MADERA_OUT1LMIX_INPUT_4_VOLUME:
+	case MADERA_OUT1RMIX_INPUT_1_SOURCE:
+	case MADERA_OUT1RMIX_INPUT_1_VOLUME:
+	case MADERA_OUT1RMIX_INPUT_2_SOURCE:
+	case MADERA_OUT1RMIX_INPUT_2_VOLUME:
+	case MADERA_OUT1RMIX_INPUT_3_SOURCE:
+	case MADERA_OUT1RMIX_INPUT_3_VOLUME:
+	case MADERA_OUT1RMIX_INPUT_4_SOURCE:
+	case MADERA_OUT1RMIX_INPUT_4_VOLUME:
+	case MADERA_OUT2LMIX_INPUT_1_SOURCE:
+	case MADERA_OUT2LMIX_INPUT_1_VOLUME:
+	case MADERA_OUT2LMIX_INPUT_2_SOURCE:
+	case MADERA_OUT2LMIX_INPUT_2_VOLUME:
+	case MADERA_OUT2LMIX_INPUT_3_SOURCE:
+	case MADERA_OUT2LMIX_INPUT_3_VOLUME:
+	case MADERA_OUT2LMIX_INPUT_4_SOURCE:
+	case MADERA_OUT2LMIX_INPUT_4_VOLUME:
+	case MADERA_OUT2RMIX_INPUT_1_SOURCE:
+	case MADERA_OUT2RMIX_INPUT_1_VOLUME:
+	case MADERA_OUT2RMIX_INPUT_2_SOURCE:
+	case MADERA_OUT2RMIX_INPUT_2_VOLUME:
+	case MADERA_OUT2RMIX_INPUT_3_SOURCE:
+	case MADERA_OUT2RMIX_INPUT_3_VOLUME:
+	case MADERA_OUT2RMIX_INPUT_4_SOURCE:
+	case MADERA_OUT2RMIX_INPUT_4_VOLUME:
+	case MADERA_OUT3LMIX_INPUT_1_SOURCE:
+	case MADERA_OUT3LMIX_INPUT_1_VOLUME:
+	case MADERA_OUT3LMIX_INPUT_2_SOURCE:
+	case MADERA_OUT3LMIX_INPUT_2_VOLUME:
+	case MADERA_OUT3LMIX_INPUT_3_SOURCE:
+	case MADERA_OUT3LMIX_INPUT_3_VOLUME:
+	case MADERA_OUT3LMIX_INPUT_4_SOURCE:
+	case MADERA_OUT3LMIX_INPUT_4_VOLUME:
+	case MADERA_OUT3RMIX_INPUT_1_SOURCE:
+	case MADERA_OUT3RMIX_INPUT_1_VOLUME:
+	case MADERA_OUT3RMIX_INPUT_2_SOURCE:
+	case MADERA_OUT3RMIX_INPUT_2_VOLUME:
+	case MADERA_OUT3RMIX_INPUT_3_SOURCE:
+	case MADERA_OUT3RMIX_INPUT_3_VOLUME:
+	case MADERA_OUT3RMIX_INPUT_4_SOURCE:
+	case MADERA_OUT3RMIX_INPUT_4_VOLUME:
+	case MADERA_OUT5LMIX_INPUT_1_SOURCE:
+	case MADERA_OUT5LMIX_INPUT_1_VOLUME:
+	case MADERA_OUT5LMIX_INPUT_2_SOURCE:
+	case MADERA_OUT5LMIX_INPUT_2_VOLUME:
+	case MADERA_OUT5LMIX_INPUT_3_SOURCE:
+	case MADERA_OUT5LMIX_INPUT_3_VOLUME:
+	case MADERA_OUT5LMIX_INPUT_4_SOURCE:
+	case MADERA_OUT5LMIX_INPUT_4_VOLUME:
+	case MADERA_OUT5RMIX_INPUT_1_SOURCE:
+	case MADERA_OUT5RMIX_INPUT_1_VOLUME:
+	case MADERA_OUT5RMIX_INPUT_2_SOURCE:
+	case MADERA_OUT5RMIX_INPUT_2_VOLUME:
+	case MADERA_OUT5RMIX_INPUT_3_SOURCE:
+	case MADERA_OUT5RMIX_INPUT_3_VOLUME:
+	case MADERA_OUT5RMIX_INPUT_4_SOURCE:
+	case MADERA_OUT5RMIX_INPUT_4_VOLUME:
+	case MADERA_AIF1TX1MIX_INPUT_1_SOURCE:
+	case MADERA_AIF1TX1MIX_INPUT_1_VOLUME:
+	case MADERA_AIF1TX1MIX_INPUT_2_SOURCE:
+	case MADERA_AIF1TX1MIX_INPUT_2_VOLUME:
+	case MADERA_AIF1TX1MIX_INPUT_3_SOURCE:
+	case MADERA_AIF1TX1MIX_INPUT_3_VOLUME:
+	case MADERA_AIF1TX1MIX_INPUT_4_SOURCE:
+	case MADERA_AIF1TX1MIX_INPUT_4_VOLUME:
+	case MADERA_AIF1TX2MIX_INPUT_1_SOURCE:
+	case MADERA_AIF1TX2MIX_INPUT_1_VOLUME:
+	case MADERA_AIF1TX2MIX_INPUT_2_SOURCE:
+	case MADERA_AIF1TX2MIX_INPUT_2_VOLUME:
+	case MADERA_AIF1TX2MIX_INPUT_3_SOURCE:
+	case MADERA_AIF1TX2MIX_INPUT_3_VOLUME:
+	case MADERA_AIF1TX2MIX_INPUT_4_SOURCE:
+	case MADERA_AIF1TX2MIX_INPUT_4_VOLUME:
+	case MADERA_AIF1TX3MIX_INPUT_1_SOURCE:
+	case MADERA_AIF1TX3MIX_INPUT_1_VOLUME:
+	case MADERA_AIF1TX3MIX_INPUT_2_SOURCE:
+	case MADERA_AIF1TX3MIX_INPUT_2_VOLUME:
+	case MADERA_AIF1TX3MIX_INPUT_3_SOURCE:
+	case MADERA_AIF1TX3MIX_INPUT_3_VOLUME:
+	case MADERA_AIF1TX3MIX_INPUT_4_SOURCE:
+	case MADERA_AIF1TX3MIX_INPUT_4_VOLUME:
+	case MADERA_AIF1TX4MIX_INPUT_1_SOURCE:
+	case MADERA_AIF1TX4MIX_INPUT_1_VOLUME:
+	case MADERA_AIF1TX4MIX_INPUT_2_SOURCE:
+	case MADERA_AIF1TX4MIX_INPUT_2_VOLUME:
+	case MADERA_AIF1TX4MIX_INPUT_3_SOURCE:
+	case MADERA_AIF1TX4MIX_INPUT_3_VOLUME:
+	case MADERA_AIF1TX4MIX_INPUT_4_SOURCE:
+	case MADERA_AIF1TX4MIX_INPUT_4_VOLUME:
+	case MADERA_AIF1TX5MIX_INPUT_1_SOURCE:
+	case MADERA_AIF1TX5MIX_INPUT_1_VOLUME:
+	case MADERA_AIF1TX5MIX_INPUT_2_SOURCE:
+	case MADERA_AIF1TX5MIX_INPUT_2_VOLUME:
+	case MADERA_AIF1TX5MIX_INPUT_3_SOURCE:
+	case MADERA_AIF1TX5MIX_INPUT_3_VOLUME:
+	case MADERA_AIF1TX5MIX_INPUT_4_SOURCE:
+	case MADERA_AIF1TX5MIX_INPUT_4_VOLUME:
+	case MADERA_AIF1TX6MIX_INPUT_1_SOURCE:
+	case MADERA_AIF1TX6MIX_INPUT_1_VOLUME:
+	case MADERA_AIF1TX6MIX_INPUT_2_SOURCE:
+	case MADERA_AIF1TX6MIX_INPUT_2_VOLUME:
+	case MADERA_AIF1TX6MIX_INPUT_3_SOURCE:
+	case MADERA_AIF1TX6MIX_INPUT_3_VOLUME:
+	case MADERA_AIF1TX6MIX_INPUT_4_SOURCE:
+	case MADERA_AIF1TX6MIX_INPUT_4_VOLUME:
+	case MADERA_AIF1TX7MIX_INPUT_1_SOURCE:
+	case MADERA_AIF1TX7MIX_INPUT_1_VOLUME:
+	case MADERA_AIF1TX7MIX_INPUT_2_SOURCE:
+	case MADERA_AIF1TX7MIX_INPUT_2_VOLUME:
+	case MADERA_AIF1TX7MIX_INPUT_3_SOURCE:
+	case MADERA_AIF1TX7MIX_INPUT_3_VOLUME:
+	case MADERA_AIF1TX7MIX_INPUT_4_SOURCE:
+	case MADERA_AIF1TX7MIX_INPUT_4_VOLUME:
+	case MADERA_AIF1TX8MIX_INPUT_1_SOURCE:
+	case MADERA_AIF1TX8MIX_INPUT_1_VOLUME:
+	case MADERA_AIF1TX8MIX_INPUT_2_SOURCE:
+	case MADERA_AIF1TX8MIX_INPUT_2_VOLUME:
+	case MADERA_AIF1TX8MIX_INPUT_3_SOURCE:
+	case MADERA_AIF1TX8MIX_INPUT_3_VOLUME:
+	case MADERA_AIF1TX8MIX_INPUT_4_SOURCE:
+	case MADERA_AIF1TX8MIX_INPUT_4_VOLUME:
+	case MADERA_AIF2TX1MIX_INPUT_1_SOURCE:
+	case MADERA_AIF2TX1MIX_INPUT_1_VOLUME:
+	case MADERA_AIF2TX1MIX_INPUT_2_SOURCE:
+	case MADERA_AIF2TX1MIX_INPUT_2_VOLUME:
+	case MADERA_AIF2TX1MIX_INPUT_3_SOURCE:
+	case MADERA_AIF2TX1MIX_INPUT_3_VOLUME:
+	case MADERA_AIF2TX1MIX_INPUT_4_SOURCE:
+	case MADERA_AIF2TX1MIX_INPUT_4_VOLUME:
+	case MADERA_AIF2TX2MIX_INPUT_1_SOURCE:
+	case MADERA_AIF2TX2MIX_INPUT_1_VOLUME:
+	case MADERA_AIF2TX2MIX_INPUT_2_SOURCE:
+	case MADERA_AIF2TX2MIX_INPUT_2_VOLUME:
+	case MADERA_AIF2TX2MIX_INPUT_3_SOURCE:
+	case MADERA_AIF2TX2MIX_INPUT_3_VOLUME:
+	case MADERA_AIF2TX2MIX_INPUT_4_SOURCE:
+	case MADERA_AIF2TX2MIX_INPUT_4_VOLUME:
+	case MADERA_AIF2TX3MIX_INPUT_1_SOURCE:
+	case MADERA_AIF2TX3MIX_INPUT_1_VOLUME:
+	case MADERA_AIF2TX3MIX_INPUT_2_SOURCE:
+	case MADERA_AIF2TX3MIX_INPUT_2_VOLUME:
+	case MADERA_AIF2TX3MIX_INPUT_3_SOURCE:
+	case MADERA_AIF2TX3MIX_INPUT_3_VOLUME:
+	case MADERA_AIF2TX3MIX_INPUT_4_SOURCE:
+	case MADERA_AIF2TX3MIX_INPUT_4_VOLUME:
+	case MADERA_AIF2TX4MIX_INPUT_1_SOURCE:
+	case MADERA_AIF2TX4MIX_INPUT_1_VOLUME:
+	case MADERA_AIF2TX4MIX_INPUT_2_SOURCE:
+	case MADERA_AIF2TX4MIX_INPUT_2_VOLUME:
+	case MADERA_AIF2TX4MIX_INPUT_3_SOURCE:
+	case MADERA_AIF2TX4MIX_INPUT_3_VOLUME:
+	case MADERA_AIF2TX4MIX_INPUT_4_SOURCE:
+	case MADERA_AIF2TX4MIX_INPUT_4_VOLUME:
+	case MADERA_AIF2TX5MIX_INPUT_1_SOURCE:
+	case MADERA_AIF2TX5MIX_INPUT_1_VOLUME:
+	case MADERA_AIF2TX5MIX_INPUT_2_SOURCE:
+	case MADERA_AIF2TX5MIX_INPUT_2_VOLUME:
+	case MADERA_AIF2TX5MIX_INPUT_3_SOURCE:
+	case MADERA_AIF2TX5MIX_INPUT_3_VOLUME:
+	case MADERA_AIF2TX5MIX_INPUT_4_SOURCE:
+	case MADERA_AIF2TX5MIX_INPUT_4_VOLUME:
+	case MADERA_AIF2TX6MIX_INPUT_1_SOURCE:
+	case MADERA_AIF2TX6MIX_INPUT_1_VOLUME:
+	case MADERA_AIF2TX6MIX_INPUT_2_SOURCE:
+	case MADERA_AIF2TX6MIX_INPUT_2_VOLUME:
+	case MADERA_AIF2TX6MIX_INPUT_3_SOURCE:
+	case MADERA_AIF2TX6MIX_INPUT_3_VOLUME:
+	case MADERA_AIF2TX6MIX_INPUT_4_SOURCE:
+	case MADERA_AIF2TX6MIX_INPUT_4_VOLUME:
+	case MADERA_AIF2TX7MIX_INPUT_1_SOURCE:
+	case MADERA_AIF2TX7MIX_INPUT_1_VOLUME:
+	case MADERA_AIF2TX7MIX_INPUT_2_SOURCE:
+	case MADERA_AIF2TX7MIX_INPUT_2_VOLUME:
+	case MADERA_AIF2TX7MIX_INPUT_3_SOURCE:
+	case MADERA_AIF2TX7MIX_INPUT_3_VOLUME:
+	case MADERA_AIF2TX7MIX_INPUT_4_SOURCE:
+	case MADERA_AIF2TX7MIX_INPUT_4_VOLUME:
+	case MADERA_AIF2TX8MIX_INPUT_1_SOURCE:
+	case MADERA_AIF2TX8MIX_INPUT_1_VOLUME:
+	case MADERA_AIF2TX8MIX_INPUT_2_SOURCE:
+	case MADERA_AIF2TX8MIX_INPUT_2_VOLUME:
+	case MADERA_AIF2TX8MIX_INPUT_3_SOURCE:
+	case MADERA_AIF2TX8MIX_INPUT_3_VOLUME:
+	case MADERA_AIF2TX8MIX_INPUT_4_SOURCE:
+	case MADERA_AIF2TX8MIX_INPUT_4_VOLUME:
+	case MADERA_AIF3TX1MIX_INPUT_1_SOURCE:
+	case MADERA_AIF3TX1MIX_INPUT_1_VOLUME:
+	case MADERA_AIF3TX1MIX_INPUT_2_SOURCE:
+	case MADERA_AIF3TX1MIX_INPUT_2_VOLUME:
+	case MADERA_AIF3TX1MIX_INPUT_3_SOURCE:
+	case MADERA_AIF3TX1MIX_INPUT_3_VOLUME:
+	case MADERA_AIF3TX1MIX_INPUT_4_SOURCE:
+	case MADERA_AIF3TX1MIX_INPUT_4_VOLUME:
+	case MADERA_AIF3TX2MIX_INPUT_1_SOURCE:
+	case MADERA_AIF3TX2MIX_INPUT_1_VOLUME:
+	case MADERA_AIF3TX2MIX_INPUT_2_SOURCE:
+	case MADERA_AIF3TX2MIX_INPUT_2_VOLUME:
+	case MADERA_AIF3TX2MIX_INPUT_3_SOURCE:
+	case MADERA_AIF3TX2MIX_INPUT_3_VOLUME:
+	case MADERA_AIF3TX2MIX_INPUT_4_SOURCE:
+	case MADERA_AIF3TX2MIX_INPUT_4_VOLUME:
+	case MADERA_AIF3TX3MIX_INPUT_1_SOURCE:
+	case MADERA_AIF3TX3MIX_INPUT_1_VOLUME:
+	case MADERA_AIF3TX3MIX_INPUT_2_SOURCE:
+	case MADERA_AIF3TX3MIX_INPUT_2_VOLUME:
+	case MADERA_AIF3TX3MIX_INPUT_3_SOURCE:
+	case MADERA_AIF3TX3MIX_INPUT_3_VOLUME:
+	case MADERA_AIF3TX3MIX_INPUT_4_SOURCE:
+	case MADERA_AIF3TX3MIX_INPUT_4_VOLUME:
+	case MADERA_AIF3TX4MIX_INPUT_1_SOURCE:
+	case MADERA_AIF3TX4MIX_INPUT_1_VOLUME:
+	case MADERA_AIF3TX4MIX_INPUT_2_SOURCE:
+	case MADERA_AIF3TX4MIX_INPUT_2_VOLUME:
+	case MADERA_AIF3TX4MIX_INPUT_3_SOURCE:
+	case MADERA_AIF3TX4MIX_INPUT_3_VOLUME:
+	case MADERA_AIF3TX4MIX_INPUT_4_SOURCE:
+	case MADERA_AIF3TX4MIX_INPUT_4_VOLUME:
+	case CS47L92_AIF3TX5MIX_INPUT_1_SOURCE:	/* AIF3TX5-8: CS47L92_ names */
+	case CS47L92_AIF3TX5MIX_INPUT_1_VOLUME:
+	case CS47L92_AIF3TX5MIX_INPUT_2_SOURCE:
+	case CS47L92_AIF3TX5MIX_INPUT_2_VOLUME:
+	case CS47L92_AIF3TX5MIX_INPUT_3_SOURCE:
+	case CS47L92_AIF3TX5MIX_INPUT_3_VOLUME:
+	case CS47L92_AIF3TX5MIX_INPUT_4_SOURCE:
+	case CS47L92_AIF3TX5MIX_INPUT_4_VOLUME:
+	case CS47L92_AIF3TX6MIX_INPUT_1_SOURCE:
+	case CS47L92_AIF3TX6MIX_INPUT_1_VOLUME:
+	case CS47L92_AIF3TX6MIX_INPUT_2_SOURCE:
+	case CS47L92_AIF3TX6MIX_INPUT_2_VOLUME:
+	case CS47L92_AIF3TX6MIX_INPUT_3_SOURCE:
+	case CS47L92_AIF3TX6MIX_INPUT_3_VOLUME:
+	case CS47L92_AIF3TX6MIX_INPUT_4_SOURCE:
+	case CS47L92_AIF3TX6MIX_INPUT_4_VOLUME:
+	case CS47L92_AIF3TX7MIX_INPUT_1_SOURCE:
+	case CS47L92_AIF3TX7MIX_INPUT_1_VOLUME:
+	case CS47L92_AIF3TX7MIX_INPUT_2_SOURCE:
+	case CS47L92_AIF3TX7MIX_INPUT_2_VOLUME:
+	case CS47L92_AIF3TX7MIX_INPUT_3_SOURCE:
+	case CS47L92_AIF3TX7MIX_INPUT_3_VOLUME:
+	case CS47L92_AIF3TX7MIX_INPUT_4_SOURCE:
+	case CS47L92_AIF3TX7MIX_INPUT_4_VOLUME:
+	case CS47L92_AIF3TX8MIX_INPUT_1_SOURCE:
+	case CS47L92_AIF3TX8MIX_INPUT_1_VOLUME:
+	case CS47L92_AIF3TX8MIX_INPUT_2_SOURCE:
+	case CS47L92_AIF3TX8MIX_INPUT_2_VOLUME:
+	case CS47L92_AIF3TX8MIX_INPUT_3_SOURCE:
+	case CS47L92_AIF3TX8MIX_INPUT_3_VOLUME:
+	case CS47L92_AIF3TX8MIX_INPUT_4_SOURCE:
+	case CS47L92_AIF3TX8MIX_INPUT_4_VOLUME:
+	case MADERA_SLIMTX1MIX_INPUT_1_SOURCE:
+	case MADERA_SLIMTX1MIX_INPUT_1_VOLUME:
+	case MADERA_SLIMTX1MIX_INPUT_2_SOURCE:
+	case MADERA_SLIMTX1MIX_INPUT_2_VOLUME:
+	case MADERA_SLIMTX1MIX_INPUT_3_SOURCE:
+	case MADERA_SLIMTX1MIX_INPUT_3_VOLUME:
+	case MADERA_SLIMTX1MIX_INPUT_4_SOURCE:
+	case MADERA_SLIMTX1MIX_INPUT_4_VOLUME:
+	case MADERA_SLIMTX2MIX_INPUT_1_SOURCE:
+	case MADERA_SLIMTX2MIX_INPUT_1_VOLUME:
+	case MADERA_SLIMTX2MIX_INPUT_2_SOURCE:
+	case MADERA_SLIMTX2MIX_INPUT_2_VOLUME:
+	case MADERA_SLIMTX2MIX_INPUT_3_SOURCE:
+	case MADERA_SLIMTX2MIX_INPUT_3_VOLUME:
+	case MADERA_SLIMTX2MIX_INPUT_4_SOURCE:
+	case MADERA_SLIMTX2MIX_INPUT_4_VOLUME:
+	case MADERA_SLIMTX3MIX_INPUT_1_SOURCE:
+	case MADERA_SLIMTX3MIX_INPUT_1_VOLUME:
+	case MADERA_SLIMTX3MIX_INPUT_2_SOURCE:
+	case MADERA_SLIMTX3MIX_INPUT_2_VOLUME:
+	case MADERA_SLIMTX3MIX_INPUT_3_SOURCE:
+	case MADERA_SLIMTX3MIX_INPUT_3_VOLUME:
+	case MADERA_SLIMTX3MIX_INPUT_4_SOURCE:
+	case MADERA_SLIMTX3MIX_INPUT_4_VOLUME:
+	case MADERA_SLIMTX4MIX_INPUT_1_SOURCE:
+	case MADERA_SLIMTX4MIX_INPUT_1_VOLUME:
+	case MADERA_SLIMTX4MIX_INPUT_2_SOURCE:
+	case MADERA_SLIMTX4MIX_INPUT_2_VOLUME:
+	case MADERA_SLIMTX4MIX_INPUT_3_SOURCE:
+	case MADERA_SLIMTX4MIX_INPUT_3_VOLUME:
+	case MADERA_SLIMTX4MIX_INPUT_4_SOURCE:
+	case MADERA_SLIMTX4MIX_INPUT_4_VOLUME:
+	case MADERA_SLIMTX5MIX_INPUT_1_SOURCE:
+	case MADERA_SLIMTX5MIX_INPUT_1_VOLUME:
+	case MADERA_SLIMTX5MIX_INPUT_2_SOURCE:
+	case MADERA_SLIMTX5MIX_INPUT_2_VOLUME:
+	case MADERA_SLIMTX5MIX_INPUT_3_SOURCE:
+	case MADERA_SLIMTX5MIX_INPUT_3_VOLUME:
+	case MADERA_SLIMTX5MIX_INPUT_4_SOURCE:
+	case MADERA_SLIMTX5MIX_INPUT_4_VOLUME:
+	case MADERA_SLIMTX6MIX_INPUT_1_SOURCE:
+	case MADERA_SLIMTX6MIX_INPUT_1_VOLUME:
+	case MADERA_SLIMTX6MIX_INPUT_2_SOURCE:
+	case MADERA_SLIMTX6MIX_INPUT_2_VOLUME:
+	case MADERA_SLIMTX6MIX_INPUT_3_SOURCE:
+	case MADERA_SLIMTX6MIX_INPUT_3_VOLUME:
+	case MADERA_SLIMTX6MIX_INPUT_4_SOURCE:
+	case MADERA_SLIMTX6MIX_INPUT_4_VOLUME:
+	case MADERA_SLIMTX7MIX_INPUT_1_SOURCE:
+	case MADERA_SLIMTX7MIX_INPUT_1_VOLUME:
+	case MADERA_SLIMTX7MIX_INPUT_2_SOURCE:
+	case MADERA_SLIMTX7MIX_INPUT_2_VOLUME:
+	case MADERA_SLIMTX7MIX_INPUT_3_SOURCE:
+	case MADERA_SLIMTX7MIX_INPUT_3_VOLUME:
+	case MADERA_SLIMTX7MIX_INPUT_4_SOURCE:
+	case MADERA_SLIMTX7MIX_INPUT_4_VOLUME:
+	case MADERA_SLIMTX8MIX_INPUT_1_SOURCE:
+	case MADERA_SLIMTX8MIX_INPUT_1_VOLUME:
+	case MADERA_SLIMTX8MIX_INPUT_2_SOURCE:
+	case MADERA_SLIMTX8MIX_INPUT_2_VOLUME:
+	case MADERA_SLIMTX8MIX_INPUT_3_SOURCE:
+	case MADERA_SLIMTX8MIX_INPUT_3_VOLUME:
+	case MADERA_SLIMTX8MIX_INPUT_4_SOURCE:
+	case MADERA_SLIMTX8MIX_INPUT_4_VOLUME:
+	case MADERA_SPDIF1TX1MIX_INPUT_1_SOURCE:
+	case MADERA_SPDIF1TX1MIX_INPUT_1_VOLUME:
+	case MADERA_SPDIF1TX2MIX_INPUT_1_SOURCE:
+	case MADERA_SPDIF1TX2MIX_INPUT_1_VOLUME:
+	case MADERA_EQ1MIX_INPUT_1_SOURCE:
+	case MADERA_EQ1MIX_INPUT_1_VOLUME:
+	case MADERA_EQ1MIX_INPUT_2_SOURCE:
+	case MADERA_EQ1MIX_INPUT_2_VOLUME:
+	case MADERA_EQ1MIX_INPUT_3_SOURCE:
+	case MADERA_EQ1MIX_INPUT_3_VOLUME:
+	case MADERA_EQ1MIX_INPUT_4_SOURCE:
+	case MADERA_EQ1MIX_INPUT_4_VOLUME:
+	case MADERA_EQ2MIX_INPUT_1_SOURCE:
+	case MADERA_EQ2MIX_INPUT_1_VOLUME:
+	case MADERA_EQ2MIX_INPUT_2_SOURCE:
+	case MADERA_EQ2MIX_INPUT_2_VOLUME:
+	case MADERA_EQ2MIX_INPUT_3_SOURCE:
+	case MADERA_EQ2MIX_INPUT_3_VOLUME:
+	case MADERA_EQ2MIX_INPUT_4_SOURCE:
+	case MADERA_EQ2MIX_INPUT_4_VOLUME:
+	case MADERA_EQ3MIX_INPUT_1_SOURCE:
+	case MADERA_EQ3MIX_INPUT_1_VOLUME:
+	case MADERA_EQ3MIX_INPUT_2_SOURCE:
+	case MADERA_EQ3MIX_INPUT_2_VOLUME:
+	case MADERA_EQ3MIX_INPUT_3_SOURCE:
+	case MADERA_EQ3MIX_INPUT_3_VOLUME:
+	case MADERA_EQ3MIX_INPUT_4_SOURCE:
+	case MADERA_EQ3MIX_INPUT_4_VOLUME:
+	case MADERA_EQ4MIX_INPUT_1_SOURCE:
+	case MADERA_EQ4MIX_INPUT_1_VOLUME:
+	case MADERA_EQ4MIX_INPUT_2_SOURCE:
+	case MADERA_EQ4MIX_INPUT_2_VOLUME:
+	case MADERA_EQ4MIX_INPUT_3_SOURCE:
+	case MADERA_EQ4MIX_INPUT_3_VOLUME:
+	case MADERA_EQ4MIX_INPUT_4_SOURCE:
+	case MADERA_EQ4MIX_INPUT_4_VOLUME:
+	case MADERA_DRC1LMIX_INPUT_1_SOURCE:
+	case MADERA_DRC1LMIX_INPUT_1_VOLUME:
+	case MADERA_DRC1LMIX_INPUT_2_SOURCE:
+	case MADERA_DRC1LMIX_INPUT_2_VOLUME:
+	case MADERA_DRC1LMIX_INPUT_3_SOURCE:
+	case MADERA_DRC1LMIX_INPUT_3_VOLUME:
+	case MADERA_DRC1LMIX_INPUT_4_SOURCE:
+	case MADERA_DRC1LMIX_INPUT_4_VOLUME:
+	case MADERA_DRC1RMIX_INPUT_1_SOURCE:
+	case MADERA_DRC1RMIX_INPUT_1_VOLUME:
+	case MADERA_DRC1RMIX_INPUT_2_SOURCE:
+	case MADERA_DRC1RMIX_INPUT_2_VOLUME:
+	case MADERA_DRC1RMIX_INPUT_3_SOURCE:
+	case MADERA_DRC1RMIX_INPUT_3_VOLUME:
+	case MADERA_DRC1RMIX_INPUT_4_SOURCE:
+	case MADERA_DRC1RMIX_INPUT_4_VOLUME:
+	case MADERA_DRC2LMIX_INPUT_1_SOURCE:
+	case MADERA_DRC2LMIX_INPUT_1_VOLUME:
+	case MADERA_DRC2LMIX_INPUT_2_SOURCE:
+	case MADERA_DRC2LMIX_INPUT_2_VOLUME:
+	case MADERA_DRC2LMIX_INPUT_3_SOURCE:
+	case MADERA_DRC2LMIX_INPUT_3_VOLUME:
+	case MADERA_DRC2LMIX_INPUT_4_SOURCE:
+	case MADERA_DRC2LMIX_INPUT_4_VOLUME:
+	case MADERA_DRC2RMIX_INPUT_1_SOURCE:
+	case MADERA_DRC2RMIX_INPUT_1_VOLUME:
+	case MADERA_DRC2RMIX_INPUT_2_SOURCE:
+	case MADERA_DRC2RMIX_INPUT_2_VOLUME:
+	case MADERA_DRC2RMIX_INPUT_3_SOURCE:
+	case MADERA_DRC2RMIX_INPUT_3_VOLUME:
+	case MADERA_DRC2RMIX_INPUT_4_SOURCE:
+	case MADERA_DRC2RMIX_INPUT_4_VOLUME:
+	case MADERA_HPLP1MIX_INPUT_1_SOURCE:
+	case MADERA_HPLP1MIX_INPUT_1_VOLUME:
+	case MADERA_HPLP1MIX_INPUT_2_SOURCE:
+	case MADERA_HPLP1MIX_INPUT_2_VOLUME:
+	case MADERA_HPLP1MIX_INPUT_3_SOURCE:
+	case MADERA_HPLP1MIX_INPUT_3_VOLUME:
+	case MADERA_HPLP1MIX_INPUT_4_SOURCE:
+	case MADERA_HPLP1MIX_INPUT_4_VOLUME:
+	case MADERA_HPLP2MIX_INPUT_1_SOURCE:
+	case MADERA_HPLP2MIX_INPUT_1_VOLUME:
+	case MADERA_HPLP2MIX_INPUT_2_SOURCE:
+	case MADERA_HPLP2MIX_INPUT_2_VOLUME:
+	case MADERA_HPLP2MIX_INPUT_3_SOURCE:
+	case MADERA_HPLP2MIX_INPUT_3_VOLUME:
+	case MADERA_HPLP2MIX_INPUT_4_SOURCE:
+	case MADERA_HPLP2MIX_INPUT_4_VOLUME:
+	case MADERA_HPLP3MIX_INPUT_1_SOURCE:
+	case MADERA_HPLP3MIX_INPUT_1_VOLUME:
+	case MADERA_HPLP3MIX_INPUT_2_SOURCE:
+	case MADERA_HPLP3MIX_INPUT_2_VOLUME:
+	case MADERA_HPLP3MIX_INPUT_3_SOURCE:
+	case MADERA_HPLP3MIX_INPUT_3_VOLUME:
+	case MADERA_HPLP3MIX_INPUT_4_SOURCE:
+	case MADERA_HPLP3MIX_INPUT_4_VOLUME:
+	case MADERA_HPLP4MIX_INPUT_1_SOURCE:
+	case MADERA_HPLP4MIX_INPUT_1_VOLUME:
+	case MADERA_HPLP4MIX_INPUT_2_SOURCE:
+	case MADERA_HPLP4MIX_INPUT_2_VOLUME:
+	case MADERA_HPLP4MIX_INPUT_3_SOURCE:
+	case MADERA_HPLP4MIX_INPUT_3_VOLUME:
+	case MADERA_HPLP4MIX_INPUT_4_SOURCE:
+	case MADERA_HPLP4MIX_INPUT_4_VOLUME:
+	case MADERA_DSP1LMIX_INPUT_1_SOURCE:
+	case MADERA_DSP1LMIX_INPUT_1_VOLUME:
+	case MADERA_DSP1LMIX_INPUT_2_SOURCE:
+	case MADERA_DSP1LMIX_INPUT_2_VOLUME:
+	case MADERA_DSP1LMIX_INPUT_3_SOURCE:
+	case MADERA_DSP1LMIX_INPUT_3_VOLUME:
+	case MADERA_DSP1LMIX_INPUT_4_SOURCE:
+	case MADERA_DSP1LMIX_INPUT_4_VOLUME:
+	case MADERA_DSP1RMIX_INPUT_1_SOURCE:
+	case MADERA_DSP1RMIX_INPUT_1_VOLUME:
+	case MADERA_DSP1RMIX_INPUT_2_SOURCE:
+	case MADERA_DSP1RMIX_INPUT_2_VOLUME:
+	case MADERA_DSP1RMIX_INPUT_3_SOURCE:
+	case MADERA_DSP1RMIX_INPUT_3_VOLUME:
+	case MADERA_DSP1RMIX_INPUT_4_SOURCE:
+	case MADERA_DSP1RMIX_INPUT_4_VOLUME:
+	case MADERA_DSP1AUX1MIX_INPUT_1_SOURCE:	/* to DFC8MIX: INPUT_1_SOURCE only */
+	case MADERA_DSP1AUX2MIX_INPUT_1_SOURCE:
+	case MADERA_DSP1AUX3MIX_INPUT_1_SOURCE:
+	case MADERA_DSP1AUX4MIX_INPUT_1_SOURCE:
+	case MADERA_DSP1AUX5MIX_INPUT_1_SOURCE:
+	case MADERA_DSP1AUX6MIX_INPUT_1_SOURCE:
+	case MADERA_ASRC1_1LMIX_INPUT_1_SOURCE:
+	case MADERA_ASRC1_1RMIX_INPUT_1_SOURCE:
+	case MADERA_ASRC1_2LMIX_INPUT_1_SOURCE:
+	case MADERA_ASRC1_2RMIX_INPUT_1_SOURCE:
+	case MADERA_ISRC1DEC1MIX_INPUT_1_SOURCE:
+	case MADERA_ISRC1DEC2MIX_INPUT_1_SOURCE:
+	case MADERA_ISRC1INT1MIX_INPUT_1_SOURCE:
+	case MADERA_ISRC1INT2MIX_INPUT_1_SOURCE:
+	case MADERA_ISRC2DEC1MIX_INPUT_1_SOURCE:
+	case MADERA_ISRC2DEC2MIX_INPUT_1_SOURCE:
+	case MADERA_ISRC2INT1MIX_INPUT_1_SOURCE:
+	case MADERA_ISRC2INT2MIX_INPUT_1_SOURCE:
+	case MADERA_DFC1MIX_INPUT_1_SOURCE:
+	case MADERA_DFC2MIX_INPUT_1_SOURCE:
+	case MADERA_DFC3MIX_INPUT_1_SOURCE:
+	case MADERA_DFC4MIX_INPUT_1_SOURCE:
+	case MADERA_DFC5MIX_INPUT_1_SOURCE:
+	case MADERA_DFC6MIX_INPUT_1_SOURCE:
+	case MADERA_DFC7MIX_INPUT_1_SOURCE:
+	case MADERA_DFC8MIX_INPUT_1_SOURCE:
+	case MADERA_FX_CTRL1 ... MADERA_FX_CTRL2:
+	case MADERA_EQ1_1 ... MADERA_EQ1_21:
+	case MADERA_EQ2_1 ... MADERA_EQ2_21:
+	case MADERA_EQ3_1 ... MADERA_EQ3_21:
+	case MADERA_EQ4_1 ... MADERA_EQ4_21:
+	case MADERA_DRC1_CTRL1 ... MADERA_DRC1_CTRL5:
+	case MADERA_DRC2_CTRL1 ... MADERA_DRC2_CTRL5:
+	case MADERA_HPLPF1_1 ... MADERA_HPLPF1_2:
+	case MADERA_HPLPF2_1 ... MADERA_HPLPF2_2:
+	case MADERA_HPLPF3_1 ... MADERA_HPLPF3_2:
+	case MADERA_HPLPF4_1 ... MADERA_HPLPF4_2:
+	case MADERA_ASRC1_ENABLE:
+	case MADERA_ASRC1_STATUS:
+	case MADERA_ASRC1_RATE1 ... MADERA_ASRC1_RATE2:
+	case MADERA_ISRC_1_CTRL_1 ... MADERA_ISRC_1_CTRL_3:
+	case MADERA_ISRC_2_CTRL_1 ... MADERA_ISRC_2_CTRL_3:
+	case MADERA_AUXPDM1_CTRL_0 ... MADERA_AUXPDM1_CTRL_1:
+	case MADERA_DFC1_CTRL:	/* DFC1-8: CTRL, RX, TX each */
+	case MADERA_DFC1_RX:
+	case MADERA_DFC1_TX:
+	case MADERA_DFC2_CTRL:
+	case MADERA_DFC2_RX:
+	case MADERA_DFC2_TX:
+	case MADERA_DFC3_CTRL:
+	case MADERA_DFC3_RX:
+	case MADERA_DFC3_TX:
+	case MADERA_DFC4_CTRL:
+	case MADERA_DFC4_RX:
+	case MADERA_DFC4_TX:
+	case MADERA_DFC5_CTRL:
+	case MADERA_DFC5_RX:
+	case MADERA_DFC5_TX:
+	case MADERA_DFC6_CTRL:
+	case MADERA_DFC6_RX:
+	case MADERA_DFC6_TX:
+	case MADERA_DFC7_CTRL:
+	case MADERA_DFC7_RX:
+	case MADERA_DFC7_TX:
+	case MADERA_DFC8_CTRL:
+	case MADERA_DFC8_RX:
+	case MADERA_DFC8_TX:
+	case MADERA_DFC_STATUS:
+	case MADERA_GPIO1_CTRL_1 ... MADERA_GPIO16_CTRL_2:
+	case MADERA_IRQ1_STATUS_1 ... MADERA_IRQ1_STATUS_33:	/* IRQ1 only */
+	case MADERA_IRQ1_MASK_1 ... MADERA_IRQ1_MASK_33:
+	case MADERA_IRQ1_RAW_STATUS_1 ... MADERA_IRQ1_RAW_STATUS_33:
+	case MADERA_INTERRUPT_DEBOUNCE_7:
+	case MADERA_IRQ1_CTRL:
+		return true;
+	default:	/* anything not listed above is not readable */
+		return false;
+	}
+}
+
+static bool cs47l92_16bit_volatile_register(struct device *dev,
+					    unsigned int reg)
+{	/* NOTE(review): regmap volatile_reg hook? confirm */
+	switch (reg) {	/* dev is unused; listed registers => true */
+	case MADERA_SOFTWARE_RESET:
+	case MADERA_HARDWARE_REVISION:
+	case MADERA_WRITE_SEQUENCER_CTRL_0 ... MADERA_WRITE_SEQUENCER_CTRL_2:
+	case MADERA_HAPTICS_STATUS:
+	case MADERA_SAMPLE_RATE_1_STATUS:
+	case MADERA_SAMPLE_RATE_2_STATUS:
+	case MADERA_SAMPLE_RATE_3_STATUS:
+	case MADERA_ASYNC_SAMPLE_RATE_1_STATUS:
+	case MADERA_ASYNC_SAMPLE_RATE_2_STATUS:
+	case MADERA_HP_CTRL_1L:
+	case MADERA_HP_CTRL_1R:
+	case MADERA_HP_CTRL_2L:
+	case MADERA_HP_CTRL_2R:
+	case MADERA_HP_CTRL_3L:
+	case MADERA_HP_CTRL_3R:
+	case MADERA_MIC_DETECT_1_CONTROL_3:
+	case MADERA_MIC_DETECT_1_CONTROL_4:
+	case MADERA_MIC_DETECT_2_CONTROL_3:
+	case MADERA_MIC_DETECT_2_CONTROL_4:
+	case MADERA_HEADPHONE_DETECT_2:
+	case MADERA_HEADPHONE_DETECT_3:
+	case MADERA_HEADPHONE_DETECT_5:
+	case MADERA_INPUT_ENABLES_STATUS:
+	case MADERA_OUTPUT_STATUS_1:
+	case MADERA_RAW_OUTPUT_STATUS_1:
+	case MADERA_SPD1_TX_CHANNEL_STATUS_1:
+	case MADERA_SPD1_TX_CHANNEL_STATUS_2:
+	case MADERA_SPD1_TX_CHANNEL_STATUS_3:
+	case MADERA_SLIMBUS_RX_PORT_STATUS:
+	case MADERA_SLIMBUS_TX_PORT_STATUS:
+	case MADERA_FX_CTRL2:	/* FX_CTRL1 is not listed here */
+	case MADERA_ASRC1_STATUS:
+	case MADERA_DFC_STATUS:
+	case MADERA_IRQ1_STATUS_1 ... MADERA_IRQ1_STATUS_33:	/* no IRQ1_MASK */
+	case MADERA_IRQ1_RAW_STATUS_1 ... MADERA_IRQ1_RAW_STATUS_33:
+		return true;
+	default:	/* every other register is reported non-volatile */
+		return false;
+	}
+}
+
+static bool cs47l92_32bit_readable_register(struct device *dev,
+					    unsigned int reg)
+{
+	switch (reg) {
+	case MADERA_WSEQ_SEQUENCE_1 ... MADERA_WSEQ_SEQUENCE_508:
+	case MADERA_OTP_HPDET_CAL_1 ... MADERA_OTP_HPDET_CAL_2:
+	case MADERA_DSP1_CONFIG_1 ... MADERA_DSP1_PMEM_ERR_ADDR___XMEM_ERR_ADDR:
+		return true;
+	default:
+		return cs47l92_is_adsp_memory(reg);
+	}
+}
+
+static bool cs47l92_32bit_volatile_register(struct device *dev,
+					    unsigned int reg)
+{
+	switch (reg) {
+	case MADERA_WSEQ_SEQUENCE_1 ... MADERA_WSEQ_SEQUENCE_508:
+	case MADERA_OTP_HPDET_CAL_1 ... MADERA_OTP_HPDET_CAL_2:
+	case MADERA_DSP1_CONFIG_1 ... MADERA_DSP1_PMEM_ERR_ADDR___XMEM_ERR_ADDR:
+		return true;
+	default:
+		return cs47l92_is_adsp_memory(reg);
+	}
+}
+
+const struct regmap_config cs47l92_16bit_spi_regmap = {
+	.name = "cs47l92_16bit",
+	.reg_bits = 32,
+	.pad_bits = 16,
+	.val_bits = 16,
+	.reg_format_endian = REGMAP_ENDIAN_BIG,
+	.val_format_endian = REGMAP_ENDIAN_BIG,
+
+	.max_register = MADERA_INTERRUPT_RAW_STATUS_1,
+	.readable_reg = &cs47l92_16bit_readable_register,
+	.volatile_reg = &cs47l92_16bit_volatile_register,
+
+	.cache_type = REGCACHE_RBTREE,
+	.reg_defaults = cs47l92_reg_default,
+	.num_reg_defaults = ARRAY_SIZE(cs47l92_reg_default),
+};
+EXPORT_SYMBOL_GPL(cs47l92_16bit_spi_regmap);
+
+const struct regmap_config cs47l92_16bit_i2c_regmap = {
+	.name = "cs47l92_16bit",
+	.reg_bits = 32,
+	.val_bits = 16,
+	.reg_format_endian = REGMAP_ENDIAN_BIG,
+	.val_format_endian = REGMAP_ENDIAN_BIG,
+
+	.max_register = MADERA_INTERRUPT_RAW_STATUS_1,
+	.readable_reg = &cs47l92_16bit_readable_register,
+	.volatile_reg = &cs47l92_16bit_volatile_register,
+
+	.cache_type = REGCACHE_RBTREE,
+	.reg_defaults = cs47l92_reg_default,
+	.num_reg_defaults = ARRAY_SIZE(cs47l92_reg_default),
+};
+EXPORT_SYMBOL_GPL(cs47l92_16bit_i2c_regmap);
+
+const struct regmap_config cs47l92_32bit_spi_regmap = {
+	.name = "cs47l92_32bit",
+	.reg_bits = 32,
+	.reg_stride = 2,
+	.pad_bits = 16,
+	.val_bits = 32,
+	.reg_format_endian = REGMAP_ENDIAN_BIG,
+	.val_format_endian = REGMAP_ENDIAN_BIG,
+
+	.max_register = MADERA_DSP1_PMEM_ERR_ADDR___XMEM_ERR_ADDR,
+	.readable_reg = &cs47l92_32bit_readable_register,
+	.volatile_reg = &cs47l92_32bit_volatile_register,
+
+	.cache_type = REGCACHE_RBTREE,
+};
+EXPORT_SYMBOL_GPL(cs47l92_32bit_spi_regmap);
+
+const struct regmap_config cs47l92_32bit_i2c_regmap = {
+	.name = "cs47l92_32bit",
+	.reg_bits = 32,
+	.reg_stride = 2,
+	.val_bits = 32,
+	.reg_format_endian = REGMAP_ENDIAN_BIG,
+	.val_format_endian = REGMAP_ENDIAN_BIG,
+
+	.max_register = MADERA_DSP1_PMEM_ERR_ADDR___XMEM_ERR_ADDR,
+	.readable_reg = &cs47l92_32bit_readable_register,
+	.volatile_reg = &cs47l92_32bit_volatile_register,
+
+	.cache_type = REGCACHE_RBTREE,
+};
+EXPORT_SYMBOL_GPL(cs47l92_32bit_i2c_regmap);
diff --git a/drivers/mfd/madera-core.c b/drivers/mfd/madera-core.c
index a354567ebc86..b9e9c169c6cc 100644
--- a/drivers/mfd/madera-core.c
+++ b/drivers/mfd/madera-core.c
@@ -35,6 +35,7 @@
 #define CS47L35_SILICON_ID	0x6360
 #define CS47L85_SILICON_ID	0x6338
 #define CS47L90_SILICON_ID	0x6364
+#define CS47L92_SILICON_ID	0x6371
 
 #define MADERA_32KZ_MCLK2	1
 
@@ -148,6 +149,29 @@ static const struct mfd_cell cs47l90_devs[] = {
 	},
 };
 
+static const char * const cs47l92_supplies[] = {
+	"MICVDD",
+	"CPVDD1",
+	"CPVDD2",
+};
+
+static const struct mfd_cell cs47l92_devs[] = {
+	{ .name = "madera-pinctrl" },
+	{ .name = "madera-irq", },
+	{ .name = "madera-micsupp", },
+	{ .name = "madera-gpio" },
+	{
+		.name = "madera-extcon",
+		.parent_supplies = cs47l92_supplies,
+		.num_parent_supplies = 1, /* We only need MICVDD */
+	},
+	{
+		.name = "cs47l92-codec",
+		.parent_supplies = cs47l92_supplies,
+		.num_parent_supplies = ARRAY_SIZE(cs47l92_supplies),
+	},
+};
+
 /* Used by madera-i2c and madera-spi drivers */
 const char *madera_name_from_type(enum madera_type type)
 {
@@ -162,6 +186,12 @@ const char *madera_name_from_type(enum madera_type type)
 		return "CS47L90";
 	case CS47L91:
 		return "CS47L91";
+	case CS42L92:
+		return "CS42L92";
+	case CS47L92:
+		return "CS47L92";
+	case CS47L93:
+		return "CS47L93";
 	case WM1840:
 		return "WM1840";
 	default:
@@ -321,6 +351,9 @@ const struct of_device_id madera_of_match[] = {
 	{ .compatible = "cirrus,cs47l85", .data = (void *)CS47L85 },
 	{ .compatible = "cirrus,cs47l90", .data = (void *)CS47L90 },
 	{ .compatible = "cirrus,cs47l91", .data = (void *)CS47L91 },
+	{ .compatible = "cirrus,cs42l92", .data = (void *)CS42L92 },
+	{ .compatible = "cirrus,cs47l92", .data = (void *)CS47L92 },
+	{ .compatible = "cirrus,cs47l93", .data = (void *)CS47L93 },
 	{ .compatible = "cirrus,wm1840", .data = (void *)WM1840 },
 	{}
 };
@@ -385,6 +418,13 @@ static void madera_set_micbias_info(struct madera *madera)
 		madera->num_childbias[0] = 4;
 		madera->num_childbias[1] = 4;
 		return;
+	case CS42L92:
+	case CS47L92:
+	case CS47L93:
+		madera->num_micbias = 2;
+		madera->num_childbias[0] = 4;
+		madera->num_childbias[1] = 2;
+		return;
 	default:
 		return;
 	}
@@ -436,6 +476,9 @@ int madera_dev_init(struct madera *madera)
 	case CS47L35:
 	case CS47L90:
 	case CS47L91:
+	case CS42L92:
+	case CS47L92:
+	case CS47L93:
 		break;
 	case CS47L85:
 	case WM1840:
@@ -556,6 +599,21 @@ int madera_dev_init(struct madera *madera)
 			}
 		}
 		break;
+	case CS47L92_SILICON_ID:
+		if (IS_ENABLED(CONFIG_MFD_CS47L92)) {
+			switch (madera->type) {
+			case CS42L92:
+			case CS47L92:
+			case CS47L93:
+				patch_fn = cs47l92_patch;
+				mfd_devs = cs47l92_devs;
+				n_devs = ARRAY_SIZE(cs47l92_devs);
+				break;
+			default:
+				break;
+			}
+		}
+		break;
 	default:
 		dev_err(madera->dev, "Unknown device ID: %x\n", hwid);
 		ret = -EINVAL;
diff --git a/drivers/mfd/madera-i2c.c b/drivers/mfd/madera-i2c.c
index bd868459cedb..3f4ab5dcf5c3 100644
--- a/drivers/mfd/madera-i2c.c
+++ b/drivers/mfd/madera-i2c.c
@@ -65,6 +65,14 @@ static int madera_i2c_probe(struct i2c_client *i2c,
 			regmap_32bit_config = &cs47l90_32bit_i2c_regmap;
 		}
 		break;
+	case CS42L92:
+	case CS47L92:
+	case CS47L93:
+		if (IS_ENABLED(CONFIG_MFD_CS47L92)) {
+			regmap_16bit_config = &cs47l92_16bit_i2c_regmap;
+			regmap_32bit_config = &cs47l92_32bit_i2c_regmap;
+		}
+		break;
 	default:
 		dev_err(&i2c->dev,
 			"Unknown Madera I2C device type %ld\n", type);
@@ -124,6 +132,9 @@ static const struct i2c_device_id madera_i2c_id[] = {
 	{ "cs47l85", CS47L85 },
 	{ "cs47l90", CS47L90 },
 	{ "cs47l91", CS47L91 },
+	{ "cs42l92", CS42L92 },
+	{ "cs47l92", CS47L92 },
+	{ "cs47l93", CS47L93 },
 	{ "wm1840", WM1840 },
 	{ }
 };
diff --git a/drivers/mfd/madera-spi.c b/drivers/mfd/madera-spi.c
index a36741b73c25..d76c7e7376d7 100644
--- a/drivers/mfd/madera-spi.c
+++ b/drivers/mfd/madera-spi.c
@@ -65,6 +65,14 @@ static int madera_spi_probe(struct spi_device *spi)
 			regmap_32bit_config = &cs47l90_32bit_spi_regmap;
 		}
 		break;
+	case CS42L92:
+	case CS47L92:
+	case CS47L93:
+		if (IS_ENABLED(CONFIG_MFD_CS47L92)) {
+			regmap_16bit_config = &cs47l92_16bit_spi_regmap;
+			regmap_32bit_config = &cs47l92_32bit_spi_regmap;
+		}
+		break;
 	default:
 		dev_err(&spi->dev,
 			"Unknown Madera SPI device type %ld\n", type);
@@ -123,6 +131,9 @@ static const struct spi_device_id madera_spi_ids[] = {
 	{ "cs47l85", CS47L85 },
 	{ "cs47l90", CS47L90 },
 	{ "cs47l91", CS47L91 },
+	{ "cs42l92", CS42L92 },
+	{ "cs47l92", CS47L92 },
+	{ "cs47l93", CS47L93 },
 	{ "wm1840", WM1840 },
 	{ }
 };
diff --git a/drivers/mfd/madera.h b/drivers/mfd/madera.h
index ccc16f2a1288..69a40aba7e69 100644
--- a/drivers/mfd/madera.h
+++ b/drivers/mfd/madera.h
@@ -47,4 +47,11 @@ extern const struct regmap_config cs47l90_32bit_spi_regmap;
 extern const struct regmap_config cs47l90_16bit_i2c_regmap;
 extern const struct regmap_config cs47l90_32bit_i2c_regmap;
 int cs47l90_patch(struct madera *madera);
+
+extern const struct regmap_config cs47l92_16bit_spi_regmap;
+extern const struct regmap_config cs47l92_32bit_spi_regmap;
+extern const struct regmap_config cs47l92_16bit_i2c_regmap;
+extern const struct regmap_config cs47l92_32bit_i2c_regmap;
+int cs47l92_patch(struct madera *madera);
+
 #endif
diff --git a/include/linux/mfd/madera/core.h b/include/linux/mfd/madera/core.h
index 98dd3cb5e84d..7b87f9a02ecc 100644
--- a/include/linux/mfd/madera/core.h
+++ b/include/linux/mfd/madera/core.h
@@ -26,8 +26,11 @@ enum madera_type {
 	CS47L85 = 2,
 	CS47L90 = 3,
 	CS47L91 = 4,
+	CS47L92 = 5,
+	CS47L93 = 6,
 	WM1840 = 7,
 	CS47L15 = 8,
+	CS42L92 = 9,
 };
 
 #define MADERA_MAX_CORE_SUPPLIES	2
@@ -37,6 +40,7 @@ enum madera_type {
 #define CS47L35_NUM_GPIOS		16
 #define CS47L85_NUM_GPIOS		40
 #define CS47L90_NUM_GPIOS		38
+#define CS47L92_NUM_GPIOS		16
 
 #define MADERA_MAX_MICBIAS		4
 
diff --git a/include/linux/mfd/madera/registers.h b/include/linux/mfd/madera/registers.h
index 5b054d511c6a..6439c0282ac6 100644
--- a/include/linux/mfd/madera/registers.h
+++ b/include/linux/mfd/madera/registers.h
@@ -77,9 +77,15 @@
 #define MADERA_FLL1_CONTROL_5				0x175
 #define MADERA_FLL1_CONTROL_6				0x176
 #define MADERA_FLL1_LOOP_FILTER_TEST_1			0x177
+#define CS47L92_FLL1_CONTROL_7				0x177
 #define MADERA_FLL1_NCO_TEST_0				0x178
+#define CS47L92_FLL1_CONTROL_8				0x178
 #define MADERA_FLL1_CONTROL_7				0x179
+#define CS47L92_FLL1_CONTROL_9				0x179
 #define MADERA_FLL1_EFS_2				0x17A
+#define CS47L92_FLL1_CONTROL_10				0x17A
+#define MADERA_FLL1_CONTROL_11				0x17B
+#define MADERA_FLL1_DIGITAL_TEST_1			0x17D
 #define CS47L35_FLL1_SYNCHRONISER_1			0x17F
 #define CS47L35_FLL1_SYNCHRONISER_2			0x180
 #define CS47L35_FLL1_SYNCHRONISER_3			0x181
@@ -98,6 +104,7 @@
 #define MADERA_FLL1_SYNCHRONISER_7			0x187
 #define MADERA_FLL1_SPREAD_SPECTRUM			0x189
 #define MADERA_FLL1_GPIO_CLOCK				0x18A
+#define CS47L92_FLL1_GPIO_CLOCK				0x18E
 #define MADERA_FLL2_CONTROL_1				0x191
 #define MADERA_FLL2_CONTROL_2				0x192
 #define MADERA_FLL2_CONTROL_3				0x193
@@ -105,9 +112,15 @@
 #define MADERA_FLL2_CONTROL_5				0x195
 #define MADERA_FLL2_CONTROL_6				0x196
 #define MADERA_FLL2_LOOP_FILTER_TEST_1			0x197
+#define CS47L92_FLL2_CONTROL_7				0x197
 #define MADERA_FLL2_NCO_TEST_0				0x198
+#define CS47L92_FLL2_CONTROL_8				0x198
 #define MADERA_FLL2_CONTROL_7				0x199
+#define CS47L92_FLL2_CONTROL_9				0x199
 #define MADERA_FLL2_EFS_2				0x19A
+#define CS47L92_FLL2_CONTROL_10				0x19A
+#define MADERA_FLL2_CONTROL_11				0x19B
+#define MADERA_FLL2_DIGITAL_TEST_1			0x19D
 #define MADERA_FLL2_SYNCHRONISER_1			0x1A1
 #define MADERA_FLL2_SYNCHRONISER_2			0x1A2
 #define MADERA_FLL2_SYNCHRONISER_3			0x1A3
@@ -117,6 +130,7 @@
 #define MADERA_FLL2_SYNCHRONISER_7			0x1A7
 #define MADERA_FLL2_SPREAD_SPECTRUM			0x1A9
 #define MADERA_FLL2_GPIO_CLOCK				0x1AA
+#define CS47L92_FLL2_GPIO_CLOCK				0x1AE
 #define MADERA_FLL3_CONTROL_1				0x1B1
 #define MADERA_FLL3_CONTROL_2				0x1B2
 #define MADERA_FLL3_CONTROL_3				0x1B3
@@ -267,6 +281,7 @@
 #define MADERA_NOISE_GATE_SELECT_2R			0x41F
 #define MADERA_OUTPUT_PATH_CONFIG_3L			0x420
 #define MADERA_DAC_DIGITAL_VOLUME_3L			0x421
+#define MADERA_OUTPUT_PATH_CONFIG_3			0x422
 #define MADERA_NOISE_GATE_SELECT_3L			0x423
 #define MADERA_OUTPUT_PATH_CONFIG_3R			0x424
 #define MADERA_DAC_DIGITAL_VOLUME_3R			0x425
@@ -369,8 +384,20 @@
 #define MADERA_AIF3_FRAME_CTRL_2			0x588
 #define MADERA_AIF3_FRAME_CTRL_3			0x589
 #define MADERA_AIF3_FRAME_CTRL_4			0x58A
+#define MADERA_AIF3_FRAME_CTRL_5			0x58B
+#define MADERA_AIF3_FRAME_CTRL_6			0x58C
+#define MADERA_AIF3_FRAME_CTRL_7			0x58D
+#define MADERA_AIF3_FRAME_CTRL_8			0x58E
+#define MADERA_AIF3_FRAME_CTRL_9			0x58F
+#define MADERA_AIF3_FRAME_CTRL_10			0x590
 #define MADERA_AIF3_FRAME_CTRL_11			0x591
 #define MADERA_AIF3_FRAME_CTRL_12			0x592
+#define MADERA_AIF3_FRAME_CTRL_13			0x593
+#define MADERA_AIF3_FRAME_CTRL_14			0x594
+#define MADERA_AIF3_FRAME_CTRL_15			0x595
+#define MADERA_AIF3_FRAME_CTRL_16			0x596
+#define MADERA_AIF3_FRAME_CTRL_17			0x597
+#define MADERA_AIF3_FRAME_CTRL_18			0x598
 #define MADERA_AIF3_TX_ENABLES				0x599
 #define MADERA_AIF3_RX_ENABLES				0x59A
 #define MADERA_AIF3_FORCE_WRITE				0x59B
@@ -662,6 +689,54 @@
 #define MADERA_AIF3TX2MIX_INPUT_3_VOLUME		0x78D
 #define MADERA_AIF3TX2MIX_INPUT_4_SOURCE		0x78E
 #define MADERA_AIF3TX2MIX_INPUT_4_VOLUME		0x78F
+#define MADERA_AIF3TX3MIX_INPUT_1_SOURCE		0x790
+#define MADERA_AIF3TX3MIX_INPUT_1_VOLUME		0x791
+#define MADERA_AIF3TX3MIX_INPUT_2_SOURCE		0x792
+#define MADERA_AIF3TX3MIX_INPUT_2_VOLUME		0x793
+#define MADERA_AIF3TX3MIX_INPUT_3_SOURCE		0x794
+#define MADERA_AIF3TX3MIX_INPUT_3_VOLUME		0x795
+#define MADERA_AIF3TX3MIX_INPUT_4_SOURCE		0x796
+#define MADERA_AIF3TX3MIX_INPUT_4_VOLUME		0x797
+#define MADERA_AIF3TX4MIX_INPUT_1_SOURCE		0x798
+#define MADERA_AIF3TX4MIX_INPUT_1_VOLUME		0x799
+#define MADERA_AIF3TX4MIX_INPUT_2_SOURCE		0x79A
+#define MADERA_AIF3TX4MIX_INPUT_2_VOLUME		0x79B
+#define MADERA_AIF3TX4MIX_INPUT_3_SOURCE		0x79C
+#define MADERA_AIF3TX4MIX_INPUT_3_VOLUME		0x79D
+#define MADERA_AIF3TX4MIX_INPUT_4_SOURCE		0x79E
+#define MADERA_AIF3TX4MIX_INPUT_4_VOLUME		0x79F
+#define CS47L92_AIF3TX5MIX_INPUT_1_SOURCE		0x7A0
+#define CS47L92_AIF3TX5MIX_INPUT_1_VOLUME		0x7A1
+#define CS47L92_AIF3TX5MIX_INPUT_2_SOURCE		0x7A2
+#define CS47L92_AIF3TX5MIX_INPUT_2_VOLUME		0x7A3
+#define CS47L92_AIF3TX5MIX_INPUT_3_SOURCE		0x7A4
+#define CS47L92_AIF3TX5MIX_INPUT_3_VOLUME		0x7A5
+#define CS47L92_AIF3TX5MIX_INPUT_4_SOURCE		0x7A6
+#define CS47L92_AIF3TX5MIX_INPUT_4_VOLUME		0x7A7
+#define CS47L92_AIF3TX6MIX_INPUT_1_SOURCE		0x7A8
+#define CS47L92_AIF3TX6MIX_INPUT_1_VOLUME		0x7A9
+#define CS47L92_AIF3TX6MIX_INPUT_2_SOURCE		0x7AA
+#define CS47L92_AIF3TX6MIX_INPUT_2_VOLUME		0x7AB
+#define CS47L92_AIF3TX6MIX_INPUT_3_SOURCE		0x7AC
+#define CS47L92_AIF3TX6MIX_INPUT_3_VOLUME		0x7AD
+#define CS47L92_AIF3TX6MIX_INPUT_4_SOURCE		0x7AE
+#define CS47L92_AIF3TX6MIX_INPUT_4_VOLUME		0x7AF
+#define CS47L92_AIF3TX7MIX_INPUT_1_SOURCE		0x7B0
+#define CS47L92_AIF3TX7MIX_INPUT_1_VOLUME		0x7B1
+#define CS47L92_AIF3TX7MIX_INPUT_2_SOURCE		0x7B2
+#define CS47L92_AIF3TX7MIX_INPUT_2_VOLUME		0x7B3
+#define CS47L92_AIF3TX7MIX_INPUT_3_SOURCE		0x7B4
+#define CS47L92_AIF3TX7MIX_INPUT_3_VOLUME		0x7B5
+#define CS47L92_AIF3TX7MIX_INPUT_4_SOURCE		0x7B6
+#define CS47L92_AIF3TX7MIX_INPUT_4_VOLUME		0x7B7
+#define CS47L92_AIF3TX8MIX_INPUT_1_SOURCE		0x7B8
+#define CS47L92_AIF3TX8MIX_INPUT_1_VOLUME		0x7B9
+#define CS47L92_AIF3TX8MIX_INPUT_2_SOURCE		0x7BA
+#define CS47L92_AIF3TX8MIX_INPUT_2_VOLUME		0x7BB
+#define CS47L92_AIF3TX8MIX_INPUT_3_SOURCE		0x7BC
+#define CS47L92_AIF3TX8MIX_INPUT_3_VOLUME		0x7BD
+#define CS47L92_AIF3TX8MIX_INPUT_4_SOURCE		0x7BE
+#define CS47L92_AIF3TX8MIX_INPUT_4_VOLUME		0x7BF
 #define MADERA_AIF4TX1MIX_INPUT_1_SOURCE		0x7A0
 #define MADERA_AIF4TX1MIX_INPUT_1_VOLUME		0x7A1
 #define MADERA_AIF4TX1MIX_INPUT_2_SOURCE		0x7A2
@@ -1105,6 +1180,8 @@
 #define MADERA_FCR_ADC_REFORMATTER_CONTROL		0xF73
 #define MADERA_FCR_COEFF_START				0xF74
 #define MADERA_FCR_COEFF_END				0xFC5
+#define MADERA_AUXPDM1_CTRL_0				0x10C0
+#define MADERA_AUXPDM1_CTRL_1				0x10C1
 #define MADERA_DAC_COMP_1				0x1300
 #define MADERA_DAC_COMP_2				0x1302
 #define MADERA_FRF_COEFFICIENT_1L_1			0x1380
@@ -1446,6 +1523,12 @@
 #define MADERA_OPCLK_ASYNC_SEL_WIDTH			     3
 
 /* (0x0171)  FLL1_Control_1 */
+#define CS47L92_FLL1_REFCLK_SRC_MASK			0xF000
+#define CS47L92_FLL1_REFCLK_SRC_SHIFT			    12
+#define CS47L92_FLL1_REFCLK_SRC_WIDTH			     4
+#define MADERA_FLL1_HOLD_MASK				0x0004
+#define MADERA_FLL1_HOLD_SHIFT				     2
+#define MADERA_FLL1_HOLD_WIDTH				     1
 #define MADERA_FLL1_FREERUN				0x0002
 #define MADERA_FLL1_FREERUN_MASK			0x0002
 #define MADERA_FLL1_FREERUN_SHIFT			     1
@@ -1478,6 +1561,9 @@
 #define MADERA_FLL1_FRATIO_MASK				0x0F00
 #define MADERA_FLL1_FRATIO_SHIFT			     8
 #define MADERA_FLL1_FRATIO_WIDTH			     4
+#define MADERA_FLL1_FB_DIV_MASK				0x03FF
+#define MADERA_FLL1_FB_DIV_SHIFT			     0
+#define MADERA_FLL1_FB_DIV_WIDTH			    10
 
 /* (0x0176)  FLL1_Control_6 */
 #define MADERA_FLL1_REFCLK_DIV_MASK			0x00C0
@@ -1509,6 +1595,30 @@
 #define MADERA_FLL1_PHASE_ENA_SHIFT			    11
 #define MADERA_FLL1_PHASE_ENA_WIDTH			     1
 
+/* (0x017A)  FLL1_Control_10 */
+#define MADERA_FLL1_HP_MASK				0xC000
+#define MADERA_FLL1_HP_SHIFT				    14
+#define MADERA_FLL1_HP_WIDTH				     2
+#define MADERA_FLL1_PHASEDET_ENA_MASK			0x1000
+#define MADERA_FLL1_PHASEDET_ENA_SHIFT			    12
+#define MADERA_FLL1_PHASEDET_ENA_WIDTH			     1
+
+/* (0x017B)  FLL1_Control_11 */
+#define MADERA_FLL1_LOCKDET_THR_MASK			0x001E
+#define MADERA_FLL1_LOCKDET_THR_SHIFT			     1
+#define MADERA_FLL1_LOCKDET_THR_WIDTH			     4
+#define MADERA_FLL1_LOCKDET_MASK			0x0001
+#define MADERA_FLL1_LOCKDET_SHIFT			     0
+#define MADERA_FLL1_LOCKDET_WIDTH			     1
+
+/* (0x017D)  FLL1_Digital_Test_1 */
+#define MADERA_FLL1_SYNC_EFS_ENA_MASK			0x0100
+#define MADERA_FLL1_SYNC_EFS_ENA_SHIFT			     8
+#define MADERA_FLL1_SYNC_EFS_ENA_WIDTH			     1
+#define MADERA_FLL1_CLK_VCO_FAST_SRC_MASK		0x0003
+#define MADERA_FLL1_CLK_VCO_FAST_SRC_SHIFT		     0
+#define MADERA_FLL1_CLK_VCO_FAST_SRC_WIDTH		     2
+
 /* (0x0181)  FLL1_Synchroniser_1 */
 #define MADERA_FLL1_SYNC_ENA				0x0001
 #define MADERA_FLL1_SYNC_ENA_MASK			0x0001
@@ -1630,6 +1740,13 @@
 #define MADERA_LDO2_ENA_WIDTH				     1
 
 /* (0x0218)  Mic_Bias_Ctrl_1 */
+#define MADERA_MICB1_EXT_CAP				0x8000
+#define MADERA_MICB1_EXT_CAP_MASK			0x8000
+#define MADERA_MICB1_EXT_CAP_SHIFT			    15
+#define MADERA_MICB1_EXT_CAP_WIDTH			     1
+#define MADERA_MICB1_LVL_MASK				0x01E0
+#define MADERA_MICB1_LVL_SHIFT				     5
+#define MADERA_MICB1_LVL_WIDTH				     4
 #define MADERA_MICB1_ENA				0x0001
 #define MADERA_MICB1_ENA_MASK				0x0001
 #define MADERA_MICB1_ENA_SHIFT				     0
@@ -2313,6 +2430,17 @@
 #define MADERA_OUT1R_ENA_SHIFT				     0
 #define MADERA_OUT1R_ENA_WIDTH				     1
 
+/* (0x0408)  Output_Rate_1 */
+#define MADERA_CP_DAC_MODE_MASK				0x0040
+#define MADERA_CP_DAC_MODE_SHIFT			     6
+#define MADERA_CP_DAC_MODE_WIDTH			     1
+#define MADERA_OUT_EXT_CLK_DIV_MASK			0x0030
+#define MADERA_OUT_EXT_CLK_DIV_SHIFT			     4
+#define MADERA_OUT_EXT_CLK_DIV_WIDTH			     2
+#define MADERA_OUT_CLK_SRC_MASK				0x0007
+#define MADERA_OUT_CLK_SRC_SHIFT			     0
+#define MADERA_OUT_CLK_SRC_WIDTH			     3
+
 /* (0x0409)  Output_Volume_Ramp */
 #define MADERA_OUT_VD_RAMP_MASK				0x0070
 #define MADERA_OUT_VD_RAMP_SHIFT			     4
@@ -2834,6 +2962,30 @@
 #define MADERA_AIF2RX1_ENA_WIDTH			     1
 
 /* (0x0599)  AIF3_Tx_Enables */
+#define MADERA_AIF3TX8_ENA				0x0080
+#define MADERA_AIF3TX8_ENA_MASK				0x0080
+#define MADERA_AIF3TX8_ENA_SHIFT			     7
+#define MADERA_AIF3TX8_ENA_WIDTH			     1
+#define MADERA_AIF3TX7_ENA				0x0040
+#define MADERA_AIF3TX7_ENA_MASK				0x0040
+#define MADERA_AIF3TX7_ENA_SHIFT			     6
+#define MADERA_AIF3TX7_ENA_WIDTH			     1
+#define MADERA_AIF3TX6_ENA				0x0020
+#define MADERA_AIF3TX6_ENA_MASK				0x0020
+#define MADERA_AIF3TX6_ENA_SHIFT			     5
+#define MADERA_AIF3TX6_ENA_WIDTH			     1
+#define MADERA_AIF3TX5_ENA				0x0010
+#define MADERA_AIF3TX5_ENA_MASK				0x0010
+#define MADERA_AIF3TX5_ENA_SHIFT			     4
+#define MADERA_AIF3TX5_ENA_WIDTH			     1
+#define MADERA_AIF3TX4_ENA				0x0008
+#define MADERA_AIF3TX4_ENA_MASK				0x0008
+#define MADERA_AIF3TX4_ENA_SHIFT			     3
+#define MADERA_AIF3TX4_ENA_WIDTH			     1
+#define MADERA_AIF3TX3_ENA				0x0004
+#define MADERA_AIF3TX3_ENA_MASK				0x0004
+#define MADERA_AIF3TX3_ENA_SHIFT			     2
+#define MADERA_AIF3TX3_ENA_WIDTH			     1
 #define MADERA_AIF3TX2_ENA				0x0002
 #define MADERA_AIF3TX2_ENA_MASK				0x0002
 #define MADERA_AIF3TX2_ENA_SHIFT			     1
@@ -2844,6 +2996,30 @@
 #define MADERA_AIF3TX1_ENA_WIDTH			     1
 
 /* (0x059A)  AIF3_Rx_Enables */
+#define MADERA_AIF3RX8_ENA				0x0080
+#define MADERA_AIF3RX8_ENA_MASK				0x0080
+#define MADERA_AIF3RX8_ENA_SHIFT			     7
+#define MADERA_AIF3RX8_ENA_WIDTH			     1
+#define MADERA_AIF3RX7_ENA				0x0040
+#define MADERA_AIF3RX7_ENA_MASK				0x0040
+#define MADERA_AIF3RX7_ENA_SHIFT			     6
+#define MADERA_AIF3RX7_ENA_WIDTH			     1
+#define MADERA_AIF3RX6_ENA				0x0020
+#define MADERA_AIF3RX6_ENA_MASK				0x0020
+#define MADERA_AIF3RX6_ENA_SHIFT			     5
+#define MADERA_AIF3RX6_ENA_WIDTH			     1
+#define MADERA_AIF3RX5_ENA				0x0010
+#define MADERA_AIF3RX5_ENA_MASK				0x0010
+#define MADERA_AIF3RX5_ENA_SHIFT			     4
+#define MADERA_AIF3RX5_ENA_WIDTH			     1
+#define MADERA_AIF3RX4_ENA				0x0008
+#define MADERA_AIF3RX4_ENA_MASK				0x0008
+#define MADERA_AIF3RX4_ENA_SHIFT			     3
+#define MADERA_AIF3RX4_ENA_WIDTH			     1
+#define MADERA_AIF3RX3_ENA				0x0004
+#define MADERA_AIF3RX3_ENA_MASK				0x0004
+#define MADERA_AIF3RX3_ENA_SHIFT			     2
+#define MADERA_AIF3RX3_ENA_WIDTH			     1
 #define MADERA_AIF3RX2_ENA				0x0002
 #define MADERA_AIF3RX2_ENA_MASK				0x0002
 #define MADERA_AIF3RX2_ENA_SHIFT			     1
@@ -3458,6 +3634,25 @@
 #define MADERA_FCR_MIC_MODE_SEL_SHIFT			     2
 #define MADERA_FCR_MIC_MODE_SEL_WIDTH			     2
 
+/* (0x10C0)  AUXPDM1_CTRL_0 */
+#define MADERA_AUXPDM1_SRC_MASK				0x0F00
+#define MADERA_AUXPDM1_SRC_SHIFT			     8
+#define MADERA_AUXPDM1_SRC_WIDTH			     4
+#define MADERA_AUXPDM1_TXEDGE_MASK			0x0010
+#define MADERA_AUXPDM1_TXEDGE_SHIFT			     4
+#define MADERA_AUXPDM1_TXEDGE_WIDTH			     1
+#define MADERA_AUXPDM1_MSTR_MASK			0x0008
+#define MADERA_AUXPDM1_MSTR_SHIFT			     3
+#define MADERA_AUXPDM1_MSTR_WIDTH			     1
+#define MADERA_AUXPDM1_ENABLE_MASK			0x0001
+#define MADERA_AUXPDM1_ENABLE_SHIFT			     0
+#define MADERA_AUXPDM1_ENABLE_WIDTH			     1
+
+/* (0x10C1)  AUXPDM1_CTRL_1 */
+#define MADERA_AUXPDM1_CLK_FREQ_MASK			0xC000
+#define MADERA_AUXPDM1_CLK_FREQ_SHIFT			    14
+#define MADERA_AUXPDM1_CLK_FREQ_WIDTH			     2
+
 /* (0x1480)  DFC1_CTRL_W0 */
 #define MADERA_DFC1_RATE_MASK				0x007C
 #define MADERA_DFC1_RATE_SHIFT				     2
-- 
cgit v1.2.3


From 554e937ec8d75930bc711612208af27961e3dc61 Mon Sep 17 00:00:00 2001
From: Pi-Hsun Shih <pihsun@chromium.org>
Date: Mon, 3 Jun 2019 11:45:11 +0800
Subject: mfd: cros_ec: differentiate SCP from EC by feature bit

The System Companion Processor (SCP) is a Cortex-M4 co-processor on some
MediaTek platforms that can run EC-style firmware. Since an SCP and an EC
can both exist on a system and both use the cros_ec_dev driver, we need to
differentiate between them for userspace; otherwise they would both be
registered at /dev/cros_ec, causing a conflict.

Signed-off-by: Pi-Hsun Shih <pihsun@chromium.org>
Acked-by: Enric Balletbo i Serra <enric.balletbo@collabora.com>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 drivers/mfd/cros_ec_dev.c   | 10 ++++++++++
 include/linux/mfd/cros_ec.h |  1 +
 2 files changed, 11 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/mfd/cros_ec_dev.c b/drivers/mfd/cros_ec_dev.c
index d992365472b8..a47223d2baf4 100644
--- a/drivers/mfd/cros_ec_dev.c
+++ b/drivers/mfd/cros_ec_dev.c
@@ -459,6 +459,16 @@ static int ec_device_probe(struct platform_device *pdev)
 		ec_platform->ec_name = CROS_EC_DEV_TP_NAME;
 	}
 
+	/* Check whether this is actually a SCP rather than an EC. */
+	if (cros_ec_check_features(ec, EC_FEATURE_SCP)) {
+		dev_info(dev, "CrOS SCP MCU detected.\n");
+		/*
+		 * Help userspace differentiate ECs from SCPs,
+		 * regardless of the probing order.
+		 */
+		ec_platform->ec_name = CROS_EC_DEV_SCP_NAME;
+	}
+
 	/*
 	 * Add the class device
 	 * Link to the character device for creating the /dev entry
diff --git a/include/linux/mfd/cros_ec.h b/include/linux/mfd/cros_ec.h
index cfa78bb4990f..751cb3756d49 100644
--- a/include/linux/mfd/cros_ec.h
+++ b/include/linux/mfd/cros_ec.h
@@ -27,6 +27,7 @@
 #define CROS_EC_DEV_PD_NAME "cros_pd"
 #define CROS_EC_DEV_TP_NAME "cros_tp"
 #define CROS_EC_DEV_ISH_NAME "cros_ish"
+#define CROS_EC_DEV_SCP_NAME "cros_scp"
 
 /*
  * The EC is unresponsive for a time after a reboot command.  Add a
-- 
cgit v1.2.3


From 76304994645028accc0cfe287652344b696f4470 Mon Sep 17 00:00:00 2001
From: Stefan Mavrodiev <stefan@olimex.com>
Date: Fri, 7 Jun 2019 15:42:25 +0300
Subject: mfd: rk808: Check pm_power_off pointer

The function pointer pm_power_off may point to a function from another
module (PSCI for example). If rk808 is removed, pm_power_off is
overwritten with NULL and the system cannot be powered off.

This patch checks whether pm_power_off points to the driver's own
power-off function and only then clears it.

Signed-off-by: Stefan Mavrodiev <stefan@olimex.com>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 drivers/mfd/rk808.c       | 17 +++++++++++------
 include/linux/mfd/rk808.h |  1 +
 2 files changed, 12 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mfd/rk808.c b/drivers/mfd/rk808.c
index 6ee1c461a3bb..e234720fee02 100644
--- a/drivers/mfd/rk808.c
+++ b/drivers/mfd/rk808.c
@@ -562,7 +562,6 @@ static int rk808_probe(struct i2c_client *client,
 	struct rk808 *rk808;
 	const struct rk808_reg_data *pre_init_reg;
 	const struct mfd_cell *cells;
-	void (*pm_pwroff_fn)(void) = NULL;
 	int nr_pre_init_regs;
 	int nr_cells;
 	int pm_off = 0, msb, lsb;
@@ -609,7 +608,7 @@ static int rk808_probe(struct i2c_client *client,
 		nr_pre_init_regs = ARRAY_SIZE(rk805_pre_init_reg);
 		cells = rk805s;
 		nr_cells = ARRAY_SIZE(rk805s);
-		pm_pwroff_fn = rk805_device_shutdown;
+		rk808->pm_pwroff_fn = rk805_device_shutdown;
 		break;
 	case RK808_ID:
 		rk808->regmap_cfg = &rk808_regmap_config;
@@ -618,7 +617,7 @@ static int rk808_probe(struct i2c_client *client,
 		nr_pre_init_regs = ARRAY_SIZE(rk808_pre_init_reg);
 		cells = rk808s;
 		nr_cells = ARRAY_SIZE(rk808s);
-		pm_pwroff_fn = rk808_device_shutdown;
+		rk808->pm_pwroff_fn = rk808_device_shutdown;
 		break;
 	case RK818_ID:
 		rk808->regmap_cfg = &rk818_regmap_config;
@@ -627,7 +626,7 @@ static int rk808_probe(struct i2c_client *client,
 		nr_pre_init_regs = ARRAY_SIZE(rk818_pre_init_reg);
 		cells = rk818s;
 		nr_cells = ARRAY_SIZE(rk818s);
-		pm_pwroff_fn = rk818_device_shutdown;
+		rk808->pm_pwroff_fn = rk818_device_shutdown;
 		break;
 	case RK809_ID:
 	case RK817_ID:
@@ -692,7 +691,7 @@ static int rk808_probe(struct i2c_client *client,
 				"rockchip,system-power-controller");
 	if (pm_off && !pm_power_off) {
 		rk808_i2c_client = client;
-		pm_power_off = pm_pwroff_fn;
+		pm_power_off = rk808->pm_pwroff_fn;
 	}
 
 	return 0;
@@ -707,7 +706,13 @@ static int rk808_remove(struct i2c_client *client)
 	struct rk808 *rk808 = i2c_get_clientdata(client);
 
 	regmap_del_irq_chip(client->irq, rk808->irq_data);
-	pm_power_off = NULL;
+
+	/*
+	 * pm_power_off may point to a function from another module.
+	 * Check if the pointer is set by us and only then overwrite it.
+	 */
+	if (rk808->pm_pwroff_fn && pm_power_off == rk808->pm_pwroff_fn)
+		pm_power_off = NULL;
 
 	return 0;
 }
diff --git a/include/linux/mfd/rk808.h b/include/linux/mfd/rk808.h
index 2a9cd01691b2..286316375636 100644
--- a/include/linux/mfd/rk808.h
+++ b/include/linux/mfd/rk808.h
@@ -628,5 +628,6 @@ struct rk808 {
 	long				variant;
 	const struct regmap_config	*regmap_cfg;
 	const struct regmap_irq_chip	*regmap_irq_chip;
+	void				(*pm_pwroff_fn)(void);
 };
 #endif /* __LINUX_REGULATOR_RK808_H */
-- 
cgit v1.2.3


From ac195d94280a783f030a01ee84998a198b779d99 Mon Sep 17 00:00:00 2001
From: Stefan Mavrodiev <stefan@olimex.com>
Date: Fri, 7 Jun 2019 15:42:26 +0300
Subject: mfd: rk808: Prepare rk805 for poweroff

The RK805 has a SLEEP signal, which can put the device into SLEEP or OFF
mode. The default is SLEEP mode.

However, when the kernel performs power-off (actually the ATF), the
device will not go fully off, which results in higher power
consumption and the inability to wake the device with an RTC alarm.

The solution is to provide a pm_power_off_prepare function, which
configures the SLEEP pin for the OFF function.

Signed-off-by: Stefan Mavrodiev <stefan@olimex.com>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 drivers/mfd/rk808.c       | 50 +++++++++++++++++++++++++++++++++--------------
 include/linux/mfd/rk808.h |  1 +
 2 files changed, 36 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mfd/rk808.c b/drivers/mfd/rk808.c
index e234720fee02..09fc8d3da541 100644
--- a/drivers/mfd/rk808.c
+++ b/drivers/mfd/rk808.c
@@ -474,17 +474,29 @@ static void rk805_device_shutdown(void)
 	int ret;
 	struct rk808 *rk808 = i2c_get_clientdata(rk808_i2c_client);
 
-	if (!rk808) {
-		dev_warn(&rk808_i2c_client->dev,
-			 "have no rk805, so do nothing here\n");
+	if (!rk808)
 		return;
-	}
 
 	ret = regmap_update_bits(rk808->regmap,
 				 RK805_DEV_CTRL_REG,
 				 DEV_OFF, DEV_OFF);
 	if (ret)
-		dev_err(&rk808_i2c_client->dev, "power off error!\n");
+		dev_err(&rk808_i2c_client->dev, "Failed to shutdown device!\n");
+}
+
+/* Power-off prepare hook: set the SLEEP pin to its shutdown function. */
+static void rk805_device_shutdown_prepare(void)
+{
+	int ret;
+	struct rk808 *rk808 = i2c_get_clientdata(rk808_i2c_client);
+
+	if (!rk808)
+		return;
+
+	ret = regmap_update_bits(rk808->regmap, RK805_GPIO_IO_POL_REG,
+				 SLP_SD_MSK, SHUTDOWN_FUN);
+	if (ret)
+		dev_err(&rk808_i2c_client->dev, "Shutdown prepare failed!\n");
+}
 
 static void rk808_device_shutdown(void)
@@ -492,17 +504,14 @@ static void rk808_device_shutdown(void)
 	int ret;
 	struct rk808 *rk808 = i2c_get_clientdata(rk808_i2c_client);
 
-	if (!rk808) {
-		dev_warn(&rk808_i2c_client->dev,
-			 "have no rk808, so do nothing here\n");
+	if (!rk808)
 		return;
-	}
 
 	ret = regmap_update_bits(rk808->regmap,
 				 RK808_DEVCTRL_REG,
 				 DEV_OFF_RST, DEV_OFF_RST);
 	if (ret)
-		dev_err(&rk808_i2c_client->dev, "power off error!\n");
+		dev_err(&rk808_i2c_client->dev, "Failed to shutdown device!\n");
 }
 
 static void rk818_device_shutdown(void)
@@ -510,17 +519,14 @@ static void rk818_device_shutdown(void)
 	int ret;
 	struct rk808 *rk808 = i2c_get_clientdata(rk808_i2c_client);
 
-	if (!rk808) {
-		dev_warn(&rk808_i2c_client->dev,
-			 "have no rk818, so do nothing here\n");
+	if (!rk808)
 		return;
-	}
 
 	ret = regmap_update_bits(rk808->regmap,
 				 RK818_DEVCTRL_REG,
 				 DEV_OFF, DEV_OFF);
 	if (ret)
-		dev_err(&rk808_i2c_client->dev, "power off error!\n");
+		dev_err(&rk808_i2c_client->dev, "Failed to shutdown device!\n");
 }
 
 static void rk8xx_syscore_shutdown(void)
@@ -609,6 +615,7 @@ static int rk808_probe(struct i2c_client *client,
 		cells = rk805s;
 		nr_cells = ARRAY_SIZE(rk805s);
 		rk808->pm_pwroff_fn = rk805_device_shutdown;
+		rk808->pm_pwroff_prep_fn = rk805_device_shutdown_prepare;
 		break;
 	case RK808_ID:
 		rk808->regmap_cfg = &rk808_regmap_config;
@@ -694,6 +701,12 @@ static int rk808_probe(struct i2c_client *client,
 		pm_power_off = rk808->pm_pwroff_fn;
 	}
 
+	if (pm_off && !pm_power_off_prepare) {
+		if (!rk808_i2c_client)
+			rk808_i2c_client = client;
+		pm_power_off_prepare = rk808->pm_pwroff_prep_fn;
+	}
+
 	return 0;
 
 err_irq:
@@ -714,6 +727,13 @@ static int rk808_remove(struct i2c_client *client)
 	if (rk808->pm_pwroff_fn && pm_power_off == rk808->pm_pwroff_fn)
 		pm_power_off = NULL;
 
+	/*
+	 * As above, only clear the pointer if it was set by us.
+	 */
+	if (rk808->pm_pwroff_prep_fn &&
+	    pm_power_off_prepare == rk808->pm_pwroff_prep_fn)
+		pm_power_off_prepare = NULL;
+
 	return 0;
 }
 
diff --git a/include/linux/mfd/rk808.h b/include/linux/mfd/rk808.h
index 286316375636..b264ac794c74 100644
--- a/include/linux/mfd/rk808.h
+++ b/include/linux/mfd/rk808.h
@@ -629,5 +629,6 @@ struct rk808 {
 	const struct regmap_config	*regmap_cfg;
 	const struct regmap_irq_chip	*regmap_irq_chip;
 	void				(*pm_pwroff_fn)(void);
+	void				(*pm_pwroff_prep_fn)(void);
 };
 #endif /* __LINUX_REGULATOR_RK808_H */
-- 
cgit v1.2.3


From b1c83bd84618e5a3ec6395845d11803047a3ef9a Mon Sep 17 00:00:00 2001
From: Nathan Chancellor <natechancellor@gmail.com>
Date: Fri, 10 May 2019 18:23:01 -0700
Subject: mfd: stmfx: Fix macro definition spelling

Clang warns:

In file included from drivers/mfd/stmfx.c:13:
include/linux/mfd/stmfx.h:7:9: warning: 'MFD_STMFX_H' is used as a
header guard here, followed by #define of a different macro
[-Wheader-guard]

Fixes: 06252ade9156 ("mfd: Add ST Multi-Function eXpander (STMFX) core driver")
Link: https://github.com/ClangBuiltLinux/linux/issues/475
Signed-off-by: Nathan Chancellor <natechancellor@gmail.com>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 include/linux/mfd/stmfx.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mfd/stmfx.h b/include/linux/mfd/stmfx.h
index d890595b89b6..3c67983678ec 100644
--- a/include/linux/mfd/stmfx.h
+++ b/include/linux/mfd/stmfx.h
@@ -5,7 +5,7 @@
  */
 
 #ifndef MFD_STMFX_H
-#define MFX_STMFX_H
+#define MFD_STMFX_H
 
 #include <linux/regmap.h>
 
-- 
cgit v1.2.3


From 0772a34bb8a12fcc245074e0f76e96cba2c9a434 Mon Sep 17 00:00:00 2001
From: Charles Keepax <ckeepax@opensource.cirrus.com>
Date: Wed, 26 Jun 2019 14:33:35 +0100
Subject: mfd: madera: Remove some unused registers and fix some defaults

Signed-off-by: Charles Keepax <ckeepax@opensource.cirrus.com>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 drivers/mfd/cs47l15-tables.c         |   2 -
 drivers/mfd/cs47l35-tables.c         |  54 +---------------
 drivers/mfd/cs47l85-tables.c         | 122 ++---------------------------------
 drivers/mfd/cs47l90-tables.c         |  76 ----------------------
 drivers/mfd/cs47l92-tables.c         |   1 -
 include/linux/mfd/madera/registers.h |  80 -----------------------
 6 files changed, 6 insertions(+), 329 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mfd/cs47l15-tables.c b/drivers/mfd/cs47l15-tables.c
index 1b4f6f79eac2..73db8d03b531 100644
--- a/drivers/mfd/cs47l15-tables.c
+++ b/drivers/mfd/cs47l15-tables.c
@@ -88,7 +88,6 @@ static const struct reg_default cs47l15_reg_default[] = {
 	{ 0x00000174, 0x007d }, /* R372 (0x174) - FLL1 Control 4 */
 	{ 0x00000175, 0x0000 }, /* R373 (0x175) - FLL1 Control 5 */
 	{ 0x00000176, 0x0000 }, /* R374 (0x176) - FLL1 Control 6 */
-	{ 0x00000177, 0x0281 }, /* R375 (0x177) - FLL1 Loop Filter Test 1 */
 	{ 0x00000179, 0x0000 }, /* R377 (0x179) - FLL1 Control 7 */
 	{ 0x0000017a, 0x2906 }, /* R378 (0x17A) - FLL1 EFS 2 */
 	{ 0x00000181, 0x0000 }, /* R385 (0x181) - FLL1 Synchroniser 1 */
@@ -746,7 +745,6 @@ static bool cs47l15_16bit_readable_register(struct device *dev,
 	case MADERA_FLL1_CONTROL_1 ... MADERA_FLL1_CONTROL_6:
 	case MADERA_FLL1_CONTROL_7:
 	case MADERA_FLL1_EFS_2:
-	case MADERA_FLL1_LOOP_FILTER_TEST_1:
 	case MADERA_FLL1_SYNCHRONISER_1 ... MADERA_FLL1_SYNCHRONISER_7:
 	case MADERA_FLL1_SPREAD_SPECTRUM:
 	case MADERA_FLL1_GPIO_CLOCK:
diff --git a/drivers/mfd/cs47l35-tables.c b/drivers/mfd/cs47l35-tables.c
index 338b825127f1..fe838cbc2a7e 100644
--- a/drivers/mfd/cs47l35-tables.c
+++ b/drivers/mfd/cs47l35-tables.c
@@ -109,9 +109,8 @@ static const struct reg_default cs47l35_reg_default[] = {
 	{ 0x00000174, 0x007d }, /* R372 (0x174) - FLL1 Control 4 */
 	{ 0x00000175, 0x0000 }, /* R373 (0x175) - FLL1 Control 5 */
 	{ 0x00000176, 0x0000 }, /* R374 (0x176) - FLL1 Control 6 */
-	{ 0x00000177, 0x0281 }, /* R375 (0x177) - FLL1 Loop Filter Test 1 */
 	{ 0x00000179, 0x0000 }, /* R377 (0x179) - FLL1 Control 7 */
-	{ 0x0000017a, 0x0b06 }, /* R378 (0x17a) - FLL1 EFS2 */
+	{ 0x0000017a, 0x2906 }, /* R378 (0x17a) - FLL1 EFS2 */
 	{ 0x0000017f, 0x0000 }, /* R383 (0x17f) - FLL1 Synchroniser 1 */
 	{ 0x00000180, 0x0000 }, /* R384 (0x180) - FLL1 Synchroniser 2 */
 	{ 0x00000181, 0x0000 }, /* R385 (0x181) - FLL1 Synchroniser 3 */
@@ -174,9 +173,6 @@ static const struct reg_default cs47l35_reg_default[] = {
 	{ 0x00000434, 0x0000 }, /* R1076 (0x434) - Output Path Config 5R */
 	{ 0x00000435, 0x0180 }, /* R1077 (0x435) - DAC Digital Volume 5R */
 	{ 0x00000437, 0x0200 }, /* R1079 (0x437) - Noise Gate Select 5R */
-	{ 0x00000440, 0x0003 }, /* R1088 (0x440) - DRE Enable */
-	{ 0x00000448, 0x0a83 }, /* R1096 (0x448) - eDRE Enable */
-	{ 0x0000044a, 0x0000 }, /* R1098 (0x44a) - eDRE Manual */
 	{ 0x00000450, 0x0000 }, /* R1104 (0x450) - DAC AEC Control 1 */
 	{ 0x00000451, 0x0000 }, /* R1105 (0x451) - DAC AEC Control 2 */
 	{ 0x00000458, 0x0000 }, /* R1112 (0x458) - Noise Gate Control */
@@ -720,28 +716,6 @@ static const struct reg_default cs47l35_reg_default[] = {
 	{ 0x00000ef3, 0x0000 }, /* R3827 (0xef3) - ISRC 2 CTRL 1 */
 	{ 0x00000ef4, 0x0001 }, /* R3828 (0xef4) - ISRC 2 CTRL 2 */
 	{ 0x00000ef5, 0x0000 }, /* R3829 (0xef5) - ISRC 2 CTRL 3 */
-	{ 0x00001300, 0x0000 }, /* R4864 (0x1300) - DAC Comp 1 */
-	{ 0x00001302, 0x0000 }, /* R4866 (0x1302) - DAC Comp 2 */
-	{ 0x00001380, 0x0000 }, /* R4992 (0x1380) - FRF Coefficient 1L 1 */
-	{ 0x00001381, 0x0000 }, /* R4993 (0x1381) - FRF Coefficient 1L 2 */
-	{ 0x00001382, 0x0000 }, /* R4994 (0x1382) - FRF Coefficient 1L 3 */
-	{ 0x00001383, 0x0000 }, /* R4995 (0x1383) - FRF Coefficient 1L 4 */
-	{ 0x00001390, 0x0000 }, /* R5008 (0x1390) - FRF Coefficient 1R 1 */
-	{ 0x00001391, 0x0000 }, /* R5009 (0x1391) - FRF Coefficient 1R 2 */
-	{ 0x00001392, 0x0000 }, /* R5010 (0x1392) - FRF Coefficient 1R 3 */
-	{ 0x00001393, 0x0000 }, /* R5011 (0x1393) - FRF Coefficient 1R 4 */
-	{ 0x000013a0, 0x0000 }, /* R5024 (0x13a0) - FRF Coefficient 4L 1 */
-	{ 0x000013a1, 0x0000 }, /* R5025 (0x13a1) - FRF Coefficient 4L 2 */
-	{ 0x000013a2, 0x0000 }, /* R5026 (0x13a2) - FRF Coefficient 4L 3 */
-	{ 0x000013a3, 0x0000 }, /* R5027 (0x13a3) - FRF Coefficient 4L 4 */
-	{ 0x000013b0, 0x0000 }, /* R5040 (0x13b0) - FRF Coefficient 5L 1 */
-	{ 0x000013b1, 0x0000 }, /* R5041 (0x13b1) - FRF Coefficient 5L 2 */
-	{ 0x000013b2, 0x0000 }, /* R5042 (0x13b2) - FRF Coefficient 5L 3 */
-	{ 0x000013b3, 0x0000 }, /* R5043 (0x13b3) - FRF Coefficient 5L 4 */
-	{ 0x000013c0, 0x0000 }, /* R5040 (0x13c0) - FRF Coefficient 5R 1 */
-	{ 0x000013c1, 0x0000 }, /* R5041 (0x13c1) - FRF Coefficient 5R 2 */
-	{ 0x000013c2, 0x0000 }, /* R5042 (0x13c2) - FRF Coefficient 5R 3 */
-	{ 0x000013c3, 0x0000 }, /* R5043 (0x13c3) - FRF Coefficient 5R 4 */
 	{ 0x00001700, 0x2001 }, /* R5888 (0x1700) - GPIO1 Control 1 */
 	{ 0x00001701, 0xf000 }, /* R5889 (0x1701) - GPIO1 Control 2 */
 	{ 0x00001702, 0x2001 }, /* R5890 (0x1702) - GPIO2 Control 1 */
@@ -892,7 +866,6 @@ static bool cs47l35_16bit_readable_register(struct device *dev,
 	case MADERA_FLL1_CONTROL_6:
 	case MADERA_FLL1_CONTROL_7:
 	case MADERA_FLL1_EFS_2:
-	case MADERA_FLL1_LOOP_FILTER_TEST_1:
 	case CS47L35_FLL1_SYNCHRONISER_1:
 	case CS47L35_FLL1_SYNCHRONISER_2:
 	case CS47L35_FLL1_SYNCHRONISER_3:
@@ -967,9 +940,6 @@ static bool cs47l35_16bit_readable_register(struct device *dev,
 	case MADERA_OUTPUT_PATH_CONFIG_5R:
 	case MADERA_DAC_DIGITAL_VOLUME_5R:
 	case MADERA_NOISE_GATE_SELECT_5R:
-	case MADERA_DRE_ENABLE:
-	case MADERA_EDRE_ENABLE:
-	case MADERA_EDRE_MANUAL:
 	case MADERA_DAC_AEC_CONTROL_1:
 	case MADERA_DAC_AEC_CONTROL_2:
 	case MADERA_NOISE_GATE_CONTROL:
@@ -1439,28 +1409,6 @@ static bool cs47l35_16bit_readable_register(struct device *dev,
 	case MADERA_ISRC_2_CTRL_1:
 	case MADERA_ISRC_2_CTRL_2:
 	case MADERA_ISRC_2_CTRL_3:
-	case MADERA_DAC_COMP_1:
-	case MADERA_DAC_COMP_2:
-	case MADERA_FRF_COEFFICIENT_1L_1:
-	case MADERA_FRF_COEFFICIENT_1L_2:
-	case MADERA_FRF_COEFFICIENT_1L_3:
-	case MADERA_FRF_COEFFICIENT_1L_4:
-	case MADERA_FRF_COEFFICIENT_1R_1:
-	case MADERA_FRF_COEFFICIENT_1R_2:
-	case MADERA_FRF_COEFFICIENT_1R_3:
-	case MADERA_FRF_COEFFICIENT_1R_4:
-	case CS47L35_FRF_COEFFICIENT_4L_1:
-	case CS47L35_FRF_COEFFICIENT_4L_2:
-	case CS47L35_FRF_COEFFICIENT_4L_3:
-	case CS47L35_FRF_COEFFICIENT_4L_4:
-	case CS47L35_FRF_COEFFICIENT_5L_1:
-	case CS47L35_FRF_COEFFICIENT_5L_2:
-	case CS47L35_FRF_COEFFICIENT_5L_3:
-	case CS47L35_FRF_COEFFICIENT_5L_4:
-	case CS47L35_FRF_COEFFICIENT_5R_1:
-	case CS47L35_FRF_COEFFICIENT_5R_2:
-	case CS47L35_FRF_COEFFICIENT_5R_3:
-	case CS47L35_FRF_COEFFICIENT_5R_4:
 	case MADERA_GPIO1_CTRL_1 ... MADERA_GPIO16_CTRL_2:
 	case MADERA_IRQ1_STATUS_1 ... MADERA_IRQ1_STATUS_33:
 	case MADERA_IRQ1_MASK_1 ... MADERA_IRQ1_MASK_33:
diff --git a/drivers/mfd/cs47l85-tables.c b/drivers/mfd/cs47l85-tables.c
index 43803145d8e5..d0198b5e86ba 100644
--- a/drivers/mfd/cs47l85-tables.c
+++ b/drivers/mfd/cs47l85-tables.c
@@ -402,7 +402,6 @@ static const struct reg_default cs47l85_reg_default[] = {
 	{ 0x00000174, 0x007d }, /* R372 (0x174) - FLL1 Control 4 */
 	{ 0x00000175, 0x0000 }, /* R373 (0x175) - FLL1 Control 5 */
 	{ 0x00000176, 0x0000 }, /* R374 (0x176) - FLL1 Control 6 */
-	{ 0x00000177, 0x0281 }, /* R375 (0x177) - FLL1 Loop Filter Test 1 */
 	{ 0x00000179, 0x0000 }, /* R377 (0x179) - FLL1 Control 7 */
 	{ 0x00000181, 0x0000 }, /* R385 (0x181) - FLL1 Synchroniser 1 */
 	{ 0x00000182, 0x0000 }, /* R386 (0x182) - FLL1 Synchroniser 2 */
@@ -419,7 +418,6 @@ static const struct reg_default cs47l85_reg_default[] = {
 	{ 0x00000194, 0x007d }, /* R404 (0x194) - FLL2 Control 4 */
 	{ 0x00000195, 0x0000 }, /* R405 (0x195) - FLL2 Control 5 */
 	{ 0x00000196, 0x0000 }, /* R406 (0x196) - FLL2 Control 6 */
-	{ 0x00000197, 0x0281 }, /* R407 (0x197) - FLL2 Loop Filter Test 1 */
 	{ 0x00000199, 0x0000 }, /* R409 (0x199) - FLL2 Control 7 */
 	{ 0x000001a1, 0x0000 }, /* R417 (0x1a1) - FLL2 Synchroniser 1 */
 	{ 0x000001a2, 0x0000 }, /* R418 (0x1a2) - FLL2 Synchroniser 2 */
@@ -436,7 +434,6 @@ static const struct reg_default cs47l85_reg_default[] = {
 	{ 0x000001b4, 0x007d }, /* R436 (0x1b4) - FLL3 Control 4 */
 	{ 0x000001b5, 0x0000 }, /* R437 (0x1b5) - FLL3 Control 5 */
 	{ 0x000001b6, 0x0000 }, /* R438 (0x1b6) - FLL3 Control 6 */
-	{ 0x000001b7, 0x0281 }, /* R439 (0x1b7) - FLL3 Loop Filter Test 1 */
 	{ 0x000001b9, 0x0000 }, /* R441 (0x1b9) - FLL3 Control 7 */
 	{ 0x000001c1, 0x0000 }, /* R449 (0x1c1) - FLL3 Synchroniser 1 */
 	{ 0x000001c2, 0x0000 }, /* R450 (0x1c2) - FLL3 Synchroniser 2 */
@@ -546,9 +543,6 @@ static const struct reg_default cs47l85_reg_default[] = {
 	{ 0x0000043c, 0x0000 }, /* R1084 (0x43c) - Output Path Config 6R */
 	{ 0x0000043d, 0x0180 }, /* R1085 (0x43d) - DAC Digital Volume 6R */
 	{ 0x0000043f, 0x0800 }, /* R1087 (0x43f) - Noise Gate Select 6R */
-	{ 0x00000440, 0x003f }, /* R1088 (0x440) - DRE Enable */
-	{ 0x00000448, 0x003f }, /* R1096 (0x448) - EDRE Enable */
-	{ 0x0000044a, 0x0000 }, /* R1098 (0x44a) - EDRE Manual */
 	{ 0x00000450, 0x0000 }, /* R1104 (0x450) - DAC AEC Control 1 */
 	{ 0x00000451, 0x0000 }, /* R1105 (0x451) - DAC AEC Control 2 */
 	{ 0x00000458, 0x0000 }, /* R1112 (0x458) - Noise Gate Control */
@@ -556,7 +550,7 @@ static const struct reg_default cs47l85_reg_default[] = {
 	{ 0x00000491, 0x0000 }, /* R1169 (0x491) - PDM SPK1 CTRL 2 */
 	{ 0x00000492, 0x0069 }, /* R1170 (0x492) - PDM SPK2 CTRL 1 */
 	{ 0x00000493, 0x0000 }, /* R1171 (0x493) - PDM SPK2 CTRL 2 */
-	{ 0x000004a0, 0x3210 }, /* R1184 (0x4a0) - HP1 Short Circuit Ctrl */
+	{ 0x000004a0, 0x3280 }, /* R1184 (0x4a0) - HP1 Short Circuit Ctrl */
 	{ 0x000004a1, 0x3200 }, /* R1185 (0x4a1) - HP2 Short Circuit Ctrl */
 	{ 0x000004a2, 0x3200 }, /* R1186 (0x4a2) - HP3 Short Circuit Ctrl */
 	{ 0x000004a8, 0x7020 }, /* R1192 (0x4a8) - HP Test Ctrl 5 */
@@ -1365,11 +1359,11 @@ static const struct reg_default cs47l85_reg_default[] = {
 	{ 0x00000e82, 0x0018 }, /* R3714 (0xe82) - DRC1 ctrl3 */
 	{ 0x00000e83, 0x0000 }, /* R3715 (0xe83) - DRC1 ctrl4 */
 	{ 0x00000e84, 0x0000 }, /* R3716 (0xe84) - DRC1 ctrl5 */
-	{ 0x00000e88, 0x0933 }, /* R3720 (0xe88) - DRC2 ctrl1 */
-	{ 0x00000e89, 0x0018 }, /* R3721 (0xe89) - DRC2 ctrl2 */
-	{ 0x00000e8a, 0x0000 }, /* R3722 (0xe8a) - DRC2 ctrl3 */
+	{ 0x00000e88, 0x0018 }, /* R3720 (0xe88) - DRC2 ctrl1 */
+	{ 0x00000e89, 0x0933 }, /* R3721 (0xe89) - DRC2 ctrl2 */
+	{ 0x00000e8a, 0x0018 }, /* R3722 (0xe8a) - DRC2 ctrl3 */
 	{ 0x00000e8b, 0x0000 }, /* R3723 (0xe8b) - DRC2 ctrl4 */
-	{ 0x00000e8c, 0x0040 }, /* R3724 (0xe8c) - DRC2 ctrl5 */
+	{ 0x00000e8c, 0x0000 }, /* R3724 (0xe8c) - DRC2 ctrl5 */
 	{ 0x00000ec0, 0x0000 }, /* R3776 (0xec0) - HPLPF1_1 */
 	{ 0x00000ec1, 0x0000 }, /* R3777 (0xec1) - HPLPF1_2 */
 	{ 0x00000ec4, 0x0000 }, /* R3780 (0xec4) - HPLPF2_1 */
@@ -1577,56 +1571,6 @@ static const struct reg_default cs47l85_reg_default[] = {
 	{ 0x00000fc3, 0x0000 }, /* R4035 (0xfc3) - ANC Coefficient */
 	{ 0x00000fc4, 0x0000 }, /* R4036 (0xfc4) - ANC Coefficient */
 	{ 0x00000fc5, 0x0000 }, /* R4037 (0xfc5) - ANC Coefficient */
-	{ 0x00001300, 0x0000 }, /* R4864 (0x1300) - DAC Comp 1 */
-	{ 0x00001302, 0x0000 }, /* R4866 (0x1302) - DAC Comp 2 */
-	{ 0x00001380, 0x0000 }, /* R4992 (0x1380) - FRF Coefficient 1L 1 */
-	{ 0x00001381, 0x0000 }, /* R4993 (0x1381) - FRF Coefficient 1L 2 */
-	{ 0x00001382, 0x0000 }, /* R4994 (0x1382) - FRF Coefficient 1L 3 */
-	{ 0x00001383, 0x0000 }, /* R4995 (0x1383) - FRF Coefficient 1L 4 */
-	{ 0x00001390, 0x0000 }, /* R5008 (0x1390) - FRF Coefficient 1R 1 */
-	{ 0x00001391, 0x0000 }, /* R5009 (0x1391) - FRF Coefficient 1R 2 */
-	{ 0x00001392, 0x0000 }, /* R5010 (0x1392) - FRF Coefficient 1R 3 */
-	{ 0x00001393, 0x0000 }, /* R5011 (0x1393) - FRF Coefficient 1R 4 */
-	{ 0x000013a0, 0x0000 }, /* R5024 (0x13a0) - FRF Coefficient 2L 1 */
-	{ 0x000013a1, 0x0000 }, /* R5025 (0x13a1) - FRF Coefficient 2L 2 */
-	{ 0x000013a2, 0x0000 }, /* R5026 (0x13a2) - FRF Coefficient 2L 3 */
-	{ 0x000013a3, 0x0000 }, /* R5027 (0x13a3) - FRF Coefficient 2L 4 */
-	{ 0x000013b0, 0x0000 }, /* R5040 (0x13b0) - FRF Coefficient 2R 1 */
-	{ 0x000013b1, 0x0000 }, /* R5041 (0x13b1) - FRF Coefficient 2R 2 */
-	{ 0x000013b2, 0x0000 }, /* R5042 (0x13b2) - FRF Coefficient 2R 3 */
-	{ 0x000013b3, 0x0000 }, /* R5043 (0x13b3) - FRF Coefficient 2R 4 */
-	{ 0x000013c0, 0x0000 }, /* R5040 (0x13c0) - FRF Coefficient 3L 1 */
-	{ 0x000013c1, 0x0000 }, /* R5041 (0x13c1) - FRF Coefficient 3L 2 */
-	{ 0x000013c2, 0x0000 }, /* R5042 (0x13c2) - FRF Coefficient 3L 3 */
-	{ 0x000013c3, 0x0000 }, /* R5043 (0x13c3) - FRF Coefficient 3L 4 */
-	{ 0x000013d0, 0x0000 }, /* R5072 (0x13d0) - FRF Coefficient 3R 1 */
-	{ 0x000013d1, 0x0000 }, /* R5073 (0x13d1) - FRF Coefficient 3R 2 */
-	{ 0x000013d2, 0x0000 }, /* R5074 (0x13d2) - FRF Coefficient 3R 3 */
-	{ 0x000013d3, 0x0000 }, /* R5075 (0x13d3) - FRF Coefficient 3R 4 */
-	{ 0x000013e0, 0x0000 }, /* R5088 (0x13e0) - FRF Coefficient 4L 1 */
-	{ 0x000013e1, 0x0000 }, /* R5089 (0x13e1) - FRF Coefficient 4L 2 */
-	{ 0x000013e2, 0x0000 }, /* R5090 (0x13e2) - FRF Coefficient 4L 3 */
-	{ 0x000013e3, 0x0000 }, /* R5091 (0x13e3) - FRF Coefficient 4L 4 */
-	{ 0x000013f0, 0x0000 }, /* R5104 (0x13f0) - FRF Coefficient 4R 1 */
-	{ 0x000013f1, 0x0000 }, /* R5105 (0x13f1) - FRF Coefficient 4R 2 */
-	{ 0x000013f2, 0x0000 }, /* R5106 (0x13f2) - FRF Coefficient 4R 3 */
-	{ 0x000013f3, 0x0000 }, /* R5107 (0x13f3) - FRF Coefficient 4R 4 */
-	{ 0x00001400, 0x0000 }, /* R5120 (0x1400) - FRF Coefficient 5L 1 */
-	{ 0x00001401, 0x0000 }, /* R5121 (0x1401) - FRF Coefficient 5L 2 */
-	{ 0x00001402, 0x0000 }, /* R5122 (0x1402) - FRF Coefficient 5L 3 */
-	{ 0x00001403, 0x0000 }, /* R5123 (0x1403) - FRF Coefficient 5L 4 */
-	{ 0x00001410, 0x0000 }, /* R5136 (0x1410) - FRF Coefficient 5R 1 */
-	{ 0x00001411, 0x0000 }, /* R5137 (0x1411) - FRF Coefficient 5R 2 */
-	{ 0x00001412, 0x0000 }, /* R5138 (0x1412) - FRF Coefficient 5R 3 */
-	{ 0x00001413, 0x0000 }, /* R5139 (0x1413) - FRF Coefficient 5R 4 */
-	{ 0x00001420, 0x0000 }, /* R5152 (0x1420) - FRF Coefficient 6L 1 */
-	{ 0x00001421, 0x0000 }, /* R5153 (0x1421) - FRF Coefficient 6L 2 */
-	{ 0x00001422, 0x0000 }, /* R5154 (0x1422) - FRF Coefficient 6L 3 */
-	{ 0x00001423, 0x0000 }, /* R5155 (0x1423) - FRF Coefficient 6L 4 */
-	{ 0x00001430, 0x0000 }, /* R5168 (0x1430) - FRF Coefficient 6R 1 */
-	{ 0x00001431, 0x0000 }, /* R5169 (0x1431) - FRF Coefficient 6R 2 */
-	{ 0x00001432, 0x0000 }, /* R5170 (0x1432) - FRF Coefficient 6R 3 */
-	{ 0x00001433, 0x0000 }, /* R5171 (0x1433) - FRF Coefficient 6R 4 */
 	{ 0x00001700, 0x2001 }, /* R5888 (0x1700) - GPIO1 Control 1 */
 	{ 0x00001701, 0xe000 }, /* R5889 (0x1701) - GPIO1 Control 2 */
 	{ 0x00001702, 0x2001 }, /* R5890 (0x1702) - GPIO2 Control 1 */
@@ -1845,7 +1789,6 @@ static bool cs47l85_16bit_readable_register(struct device *dev,
 	case MADERA_FLL1_CONTROL_5:
 	case MADERA_FLL1_CONTROL_6:
 	case MADERA_FLL1_CONTROL_7:
-	case MADERA_FLL1_LOOP_FILTER_TEST_1:
 	case MADERA_FLL1_SYNCHRONISER_1:
 	case MADERA_FLL1_SYNCHRONISER_2:
 	case MADERA_FLL1_SYNCHRONISER_3:
@@ -1862,7 +1805,6 @@ static bool cs47l85_16bit_readable_register(struct device *dev,
 	case MADERA_FLL2_CONTROL_5:
 	case MADERA_FLL2_CONTROL_6:
 	case MADERA_FLL2_CONTROL_7:
-	case MADERA_FLL2_LOOP_FILTER_TEST_1:
 	case MADERA_FLL2_SYNCHRONISER_1:
 	case MADERA_FLL2_SYNCHRONISER_2:
 	case MADERA_FLL2_SYNCHRONISER_3:
@@ -1879,7 +1821,6 @@ static bool cs47l85_16bit_readable_register(struct device *dev,
 	case MADERA_FLL3_CONTROL_5:
 	case MADERA_FLL3_CONTROL_6:
 	case MADERA_FLL3_CONTROL_7:
-	case MADERA_FLL3_LOOP_FILTER_TEST_1:
 	case MADERA_FLL3_SYNCHRONISER_1:
 	case MADERA_FLL3_SYNCHRONISER_2:
 	case MADERA_FLL3_SYNCHRONISER_3:
@@ -2004,9 +1945,6 @@ static bool cs47l85_16bit_readable_register(struct device *dev,
 	case MADERA_OUTPUT_PATH_CONFIG_6R:
 	case MADERA_DAC_DIGITAL_VOLUME_6R:
 	case MADERA_NOISE_GATE_SELECT_6R:
-	case MADERA_DRE_ENABLE:
-	case MADERA_EDRE_ENABLE:
-	case MADERA_EDRE_MANUAL:
 	case MADERA_DAC_AEC_CONTROL_1:
 	case MADERA_DAC_AEC_CONTROL_2:
 	case MADERA_NOISE_GATE_CONTROL:
@@ -2792,56 +2730,6 @@ static bool cs47l85_16bit_readable_register(struct device *dev,
 	case MADERA_FCR_FILTER_CONTROL:
 	case MADERA_FCR_ADC_REFORMATTER_CONTROL:
 	case MADERA_FCR_COEFF_START ... MADERA_FCR_COEFF_END:
-	case MADERA_DAC_COMP_1:
-	case MADERA_DAC_COMP_2:
-	case MADERA_FRF_COEFFICIENT_1L_1:
-	case MADERA_FRF_COEFFICIENT_1L_2:
-	case MADERA_FRF_COEFFICIENT_1L_3:
-	case MADERA_FRF_COEFFICIENT_1L_4:
-	case MADERA_FRF_COEFFICIENT_1R_1:
-	case MADERA_FRF_COEFFICIENT_1R_2:
-	case MADERA_FRF_COEFFICIENT_1R_3:
-	case MADERA_FRF_COEFFICIENT_1R_4:
-	case MADERA_FRF_COEFFICIENT_2L_1:
-	case MADERA_FRF_COEFFICIENT_2L_2:
-	case MADERA_FRF_COEFFICIENT_2L_3:
-	case MADERA_FRF_COEFFICIENT_2L_4:
-	case MADERA_FRF_COEFFICIENT_2R_1:
-	case MADERA_FRF_COEFFICIENT_2R_2:
-	case MADERA_FRF_COEFFICIENT_2R_3:
-	case MADERA_FRF_COEFFICIENT_2R_4:
-	case MADERA_FRF_COEFFICIENT_3L_1:
-	case MADERA_FRF_COEFFICIENT_3L_2:
-	case MADERA_FRF_COEFFICIENT_3L_3:
-	case MADERA_FRF_COEFFICIENT_3L_4:
-	case MADERA_FRF_COEFFICIENT_3R_1:
-	case MADERA_FRF_COEFFICIENT_3R_2:
-	case MADERA_FRF_COEFFICIENT_3R_3:
-	case MADERA_FRF_COEFFICIENT_3R_4:
-	case MADERA_FRF_COEFFICIENT_4L_1:
-	case MADERA_FRF_COEFFICIENT_4L_2:
-	case MADERA_FRF_COEFFICIENT_4L_3:
-	case MADERA_FRF_COEFFICIENT_4L_4:
-	case MADERA_FRF_COEFFICIENT_4R_1:
-	case MADERA_FRF_COEFFICIENT_4R_2:
-	case MADERA_FRF_COEFFICIENT_4R_3:
-	case MADERA_FRF_COEFFICIENT_4R_4:
-	case MADERA_FRF_COEFFICIENT_5L_1:
-	case MADERA_FRF_COEFFICIENT_5L_2:
-	case MADERA_FRF_COEFFICIENT_5L_3:
-	case MADERA_FRF_COEFFICIENT_5L_4:
-	case MADERA_FRF_COEFFICIENT_5R_1:
-	case MADERA_FRF_COEFFICIENT_5R_2:
-	case MADERA_FRF_COEFFICIENT_5R_3:
-	case MADERA_FRF_COEFFICIENT_5R_4:
-	case MADERA_FRF_COEFFICIENT_6L_1:
-	case MADERA_FRF_COEFFICIENT_6L_2:
-	case MADERA_FRF_COEFFICIENT_6L_3:
-	case MADERA_FRF_COEFFICIENT_6L_4:
-	case MADERA_FRF_COEFFICIENT_6R_1:
-	case MADERA_FRF_COEFFICIENT_6R_2:
-	case MADERA_FRF_COEFFICIENT_6R_3:
-	case MADERA_FRF_COEFFICIENT_6R_4:
 	case MADERA_GPIO1_CTRL_1 ... MADERA_GPIO40_CTRL_2:
 	case MADERA_IRQ1_STATUS_1 ... MADERA_IRQ1_STATUS_33:
 	case MADERA_IRQ1_MASK_1 ... MADERA_IRQ1_MASK_33:
diff --git a/drivers/mfd/cs47l90-tables.c b/drivers/mfd/cs47l90-tables.c
index c040d3d7232a..2c761fc241f3 100644
--- a/drivers/mfd/cs47l90-tables.c
+++ b/drivers/mfd/cs47l90-tables.c
@@ -119,7 +119,6 @@ static const struct reg_default cs47l90_reg_default[] = {
 	{ 0x00000174, 0x007d }, /* R372 (0x174) - FLL1 Control 4 */
 	{ 0x00000175, 0x0000 }, /* R373 (0x175) - FLL1 Control 5 */
 	{ 0x00000176, 0x0000 }, /* R374 (0x176) - FLL1 Control 6 */
-	{ 0x00000177, 0x0281 }, /* R375 (0x177) - FLL1 Loop Filter Test 1 */
 	{ 0x00000179, 0x0000 }, /* R377 (0x179) - FLL1 Control 7 */
 	{ 0x0000017a, 0x2906 }, /* R377 (0x17a) - FLL1 Efs 2 */
 	{ 0x00000181, 0x0000 }, /* R385 (0x181) - FLL1 Synchroniser 1 */
@@ -137,7 +136,6 @@ static const struct reg_default cs47l90_reg_default[] = {
 	{ 0x00000194, 0x007d }, /* R404 (0x194) - FLL2 Control 4 */
 	{ 0x00000195, 0x0000 }, /* R405 (0x195) - FLL2 Control 5 */
 	{ 0x00000196, 0x0000 }, /* R406 (0x196) - FLL2 Control 6 */
-	{ 0x00000197, 0x0281 }, /* R407 (0x197) - FLL2 Loop Filter Test 1 */
 	{ 0x00000199, 0x0000 }, /* R409 (0x199) - FLL2 Control 7 */
 	{ 0x0000019a, 0x2906 }, /* R410 (0x19a) - FLL2 Efs 2 */
 	{ 0x000001a1, 0x0000 }, /* R417 (0x1a1) - FLL2 Synchroniser 1 */
@@ -260,8 +258,6 @@ static const struct reg_default cs47l90_reg_default[] = {
 	{ 0x00000434, 0x0000 }, /* R1076 (0x434) - Output Path Config 5R */
 	{ 0x00000435, 0x0180 }, /* R1077 (0x435) - DAC Digital Volume 5R */
 	{ 0x00000437, 0x0200 }, /* R1079 (0x437) - Noise Gate Select 5R */
-	{ 0x00000440, 0x003f }, /* R1088 (0x440) - DRE Enable */
-	{ 0x00000448, 0x003f }, /* R1096 (0x448) - eDRE Enable */
 	{ 0x00000450, 0x0000 }, /* R1104 (0x450) - DAC AEC Control 1 */
 	{ 0x00000451, 0x0000 }, /* R1104 (0x450) - DAC AEC Control 2 */
 	{ 0x00000458, 0x0000 }, /* R1112 (0x458) - Noise Gate Control */
@@ -1262,40 +1258,6 @@ static const struct reg_default cs47l90_reg_default[] = {
 	{ 0x00000fc3, 0x0000 }, /* R4035 (0xfc3) - ANC Coefficient */
 	{ 0x00000fc4, 0x0000 }, /* R4036 (0xfc4) - ANC Coefficient */
 	{ 0x00000fc5, 0x0000 }, /* R4037 (0xfc5) - ANC Coefficient */
-	{ 0x00001300, 0x050E }, /* R4864 (0x1300) - DAC Comp 1 */
-	{ 0x00001302, 0x0101 }, /* R4866 (0x1302) - DAC Comp 2 */
-	{ 0x00001380, 0x0425 }, /* R4992 (0x1380) - FRF Coefficient 1L 1 */
-	{ 0x00001381, 0xF6D8 }, /* R4993 (0x1381) - FRF Coefficient 1L 2 */
-	{ 0x00001382, 0x0632 }, /* R4994 (0x1382) - FRF Coefficient 1L 3 */
-	{ 0x00001383, 0xFEC8 }, /* R4995 (0x1383) - FRF Coefficient 1L 4 */
-	{ 0x00001390, 0x042F }, /* R5008 (0x1390) - FRF Coefficient 1R 1 */
-	{ 0x00001391, 0xF6CA }, /* R5009 (0x1391) - FRF Coefficient 1R 2 */
-	{ 0x00001392, 0x0637 }, /* R5010 (0x1392) - FRF Coefficient 1R 3 */
-	{ 0x00001393, 0xFEC8 }, /* R5011 (0x1393) - FRF Coefficient 1R 4 */
-	{ 0x000013a0, 0x0000 }, /* R5024 (0x13a0) - FRF Coefficient 2L 1 */
-	{ 0x000013a1, 0x0000 }, /* R5025 (0x13a1) - FRF Coefficient 2L 2 */
-	{ 0x000013a2, 0x0000 }, /* R5026 (0x13a2) - FRF Coefficient 2L 3 */
-	{ 0x000013a3, 0x0000 }, /* R5027 (0x13a3) - FRF Coefficient 2L 4 */
-	{ 0x000013b0, 0x0000 }, /* R5040 (0x13b0) - FRF Coefficient 2R 1 */
-	{ 0x000013b1, 0x0000 }, /* R5041 (0x13b1) - FRF Coefficient 2R 2 */
-	{ 0x000013b2, 0x0000 }, /* R5042 (0x13b2) - FRF Coefficient 2R 3 */
-	{ 0x000013b3, 0x0000 }, /* R5043 (0x13b3) - FRF Coefficient 2R 4 */
-	{ 0x000013c0, 0x0000 }, /* R5040 (0x13c0) - FRF Coefficient 3L 1 */
-	{ 0x000013c1, 0x0000 }, /* R5041 (0x13c1) - FRF Coefficient 3L 2 */
-	{ 0x000013c2, 0x0000 }, /* R5042 (0x13c2) - FRF Coefficient 3L 3 */
-	{ 0x000013c3, 0x0000 }, /* R5043 (0x13c3) - FRF Coefficient 3L 4 */
-	{ 0x000013d0, 0x0000 }, /* R5072 (0x13d0) - FRF Coefficient 3R 1 */
-	{ 0x000013d1, 0x0000 }, /* R5073 (0x13d1) - FRF Coefficient 3R 2 */
-	{ 0x000013d2, 0x0000 }, /* R5074 (0x13d2) - FRF Coefficient 3R 3 */
-	{ 0x000013d3, 0x0000 }, /* R5075 (0x13d3) - FRF Coefficient 3R 4 */
-	{ 0x00001400, 0x0000 }, /* R5120 (0x1400) - FRF Coefficient 5L 1 */
-	{ 0x00001401, 0x0000 }, /* R5121 (0x1401) - FRF Coefficient 5L 2 */
-	{ 0x00001402, 0x0000 }, /* R5122 (0x1402) - FRF Coefficient 5L 3 */
-	{ 0x00001403, 0x0000 }, /* R5123 (0x1403) - FRF Coefficient 5L 4 */
-	{ 0x00001410, 0x0000 }, /* R5136 (0x1410) - FRF Coefficient 5R 1 */
-	{ 0x00001411, 0x0000 }, /* R5137 (0x1411) - FRF Coefficient 5R 2 */
-	{ 0x00001412, 0x0000 }, /* R5138 (0x1412) - FRF Coefficient 5R 3 */
-	{ 0x00001413, 0x0000 }, /* R5139 (0x1413) - FRF Coefficient 5R 4 */
 	{ 0x00001480, 0x0000 }, /* R5248 (0x1480) - DFC1_CTRL */
 	{ 0x00001482, 0x1f00 }, /* R5250 (0x1482) - DFC1_RX */
 	{ 0x00001484, 0x1f00 }, /* R5252 (0x1486) - DFC1_TX */
@@ -1535,7 +1497,6 @@ static bool cs47l90_16bit_readable_register(struct device *dev,
 	case MADERA_FLL1_CONTROL_6:
 	case MADERA_FLL1_CONTROL_7:
 	case MADERA_FLL1_EFS_2:
-	case MADERA_FLL1_LOOP_FILTER_TEST_1:
 	case MADERA_FLL1_SYNCHRONISER_1:
 	case MADERA_FLL1_SYNCHRONISER_2:
 	case MADERA_FLL1_SYNCHRONISER_3:
@@ -1553,7 +1514,6 @@ static bool cs47l90_16bit_readable_register(struct device *dev,
 	case MADERA_FLL2_CONTROL_6:
 	case MADERA_FLL2_CONTROL_7:
 	case MADERA_FLL2_EFS_2:
-	case MADERA_FLL2_LOOP_FILTER_TEST_1:
 	case MADERA_FLL2_SYNCHRONISER_1:
 	case MADERA_FLL2_SYNCHRONISER_2:
 	case MADERA_FLL2_SYNCHRONISER_3:
@@ -1690,8 +1650,6 @@ static bool cs47l90_16bit_readable_register(struct device *dev,
 	case MADERA_OUTPUT_PATH_CONFIG_5R:
 	case MADERA_DAC_DIGITAL_VOLUME_5R:
 	case MADERA_NOISE_GATE_SELECT_5R:
-	case MADERA_DRE_ENABLE:
-	case MADERA_EDRE_ENABLE:
 	case MADERA_DAC_AEC_CONTROL_1:
 	case MADERA_DAC_AEC_CONTROL_2:
 	case MADERA_NOISE_GATE_CONTROL:
@@ -2449,40 +2407,6 @@ static bool cs47l90_16bit_readable_register(struct device *dev,
 	case MADERA_FCR_FILTER_CONTROL:
 	case MADERA_FCR_ADC_REFORMATTER_CONTROL:
 	case MADERA_FCR_COEFF_START ... MADERA_FCR_COEFF_END:
-	case MADERA_DAC_COMP_1:
-	case MADERA_DAC_COMP_2:
-	case MADERA_FRF_COEFFICIENT_1L_1:
-	case MADERA_FRF_COEFFICIENT_1L_2:
-	case MADERA_FRF_COEFFICIENT_1L_3:
-	case MADERA_FRF_COEFFICIENT_1L_4:
-	case MADERA_FRF_COEFFICIENT_1R_1:
-	case MADERA_FRF_COEFFICIENT_1R_2:
-	case MADERA_FRF_COEFFICIENT_1R_3:
-	case MADERA_FRF_COEFFICIENT_1R_4:
-	case MADERA_FRF_COEFFICIENT_2L_1:
-	case MADERA_FRF_COEFFICIENT_2L_2:
-	case MADERA_FRF_COEFFICIENT_2L_3:
-	case MADERA_FRF_COEFFICIENT_2L_4:
-	case MADERA_FRF_COEFFICIENT_2R_1:
-	case MADERA_FRF_COEFFICIENT_2R_2:
-	case MADERA_FRF_COEFFICIENT_2R_3:
-	case MADERA_FRF_COEFFICIENT_2R_4:
-	case MADERA_FRF_COEFFICIENT_3L_1:
-	case MADERA_FRF_COEFFICIENT_3L_2:
-	case MADERA_FRF_COEFFICIENT_3L_3:
-	case MADERA_FRF_COEFFICIENT_3L_4:
-	case MADERA_FRF_COEFFICIENT_3R_1:
-	case MADERA_FRF_COEFFICIENT_3R_2:
-	case MADERA_FRF_COEFFICIENT_3R_3:
-	case MADERA_FRF_COEFFICIENT_3R_4:
-	case MADERA_FRF_COEFFICIENT_5L_1:
-	case MADERA_FRF_COEFFICIENT_5L_2:
-	case MADERA_FRF_COEFFICIENT_5L_3:
-	case MADERA_FRF_COEFFICIENT_5L_4:
-	case MADERA_FRF_COEFFICIENT_5R_1:
-	case MADERA_FRF_COEFFICIENT_5R_2:
-	case MADERA_FRF_COEFFICIENT_5R_3:
-	case MADERA_FRF_COEFFICIENT_5R_4:
 	case MADERA_DFC1_CTRL:
 	case MADERA_DFC1_RX:
 	case MADERA_DFC1_TX:
diff --git a/drivers/mfd/cs47l92-tables.c b/drivers/mfd/cs47l92-tables.c
index 3dc1fefe68f5..c8a234381350 100644
--- a/drivers/mfd/cs47l92-tables.c
+++ b/drivers/mfd/cs47l92-tables.c
@@ -1063,7 +1063,6 @@ static const struct reg_default cs47l92_reg_default[] = {
 	{ 0x0000185e, 0xffff }, /* R6238 (0x185e) - IRQ1 Mask 31 */
 	{ 0x0000185f, 0xffff }, /* R6239 (0x185f) - IRQ1 Mask 32 */
 	{ 0x00001860, 0x0001 }, /* R6240 (0x1860) - IRQ1 Mask 33 */
-	{ 0x00001948, 0x031f }, /* R6472 (0x1948) - IRQ2 Mask 9 */
 	{ 0x00001a06, 0x0000 }, /* R6662 (0x1a06) - Interrupt Debounce 7 */
 	{ 0x00001a80, 0x4400 }, /* R6784 (0x1a80) - IRQ1 Ctrl */
 };
diff --git a/include/linux/mfd/madera/registers.h b/include/linux/mfd/madera/registers.h
index 6439c0282ac6..53c2377b54b2 100644
--- a/include/linux/mfd/madera/registers.h
+++ b/include/linux/mfd/madera/registers.h
@@ -76,9 +76,7 @@
 #define MADERA_FLL1_CONTROL_4				0x174
 #define MADERA_FLL1_CONTROL_5				0x175
 #define MADERA_FLL1_CONTROL_6				0x176
-#define MADERA_FLL1_LOOP_FILTER_TEST_1			0x177
 #define CS47L92_FLL1_CONTROL_7				0x177
-#define MADERA_FLL1_NCO_TEST_0				0x178
 #define CS47L92_FLL1_CONTROL_8				0x178
 #define MADERA_FLL1_CONTROL_7				0x179
 #define CS47L92_FLL1_CONTROL_9				0x179
@@ -111,9 +109,7 @@
 #define MADERA_FLL2_CONTROL_4				0x194
 #define MADERA_FLL2_CONTROL_5				0x195
 #define MADERA_FLL2_CONTROL_6				0x196
-#define MADERA_FLL2_LOOP_FILTER_TEST_1			0x197
 #define CS47L92_FLL2_CONTROL_7				0x197
-#define MADERA_FLL2_NCO_TEST_0				0x198
 #define CS47L92_FLL2_CONTROL_8				0x198
 #define MADERA_FLL2_CONTROL_7				0x199
 #define CS47L92_FLL2_CONTROL_9				0x199
@@ -137,8 +133,6 @@
 #define MADERA_FLL3_CONTROL_4				0x1B4
 #define MADERA_FLL3_CONTROL_5				0x1B5
 #define MADERA_FLL3_CONTROL_6				0x1B6
-#define MADERA_FLL3_LOOP_FILTER_TEST_1			0x1B7
-#define MADERA_FLL3_NCO_TEST_0				0x1B8
 #define MADERA_FLL3_CONTROL_7				0x1B9
 #define MADERA_FLL3_SYNCHRONISER_1			0x1C1
 #define MADERA_FLL3_SYNCHRONISER_2			0x1C2
@@ -304,9 +298,6 @@
 #define MADERA_OUTPUT_PATH_CONFIG_6R			0x43C
 #define MADERA_DAC_DIGITAL_VOLUME_6R			0x43D
 #define MADERA_NOISE_GATE_SELECT_6R			0x43F
-#define MADERA_DRE_ENABLE				0x440
-#define MADERA_EDRE_ENABLE				0x448
-#define MADERA_EDRE_MANUAL				0x44A
 #define MADERA_DAC_AEC_CONTROL_1			0x450
 #define MADERA_DAC_AEC_CONTROL_2			0x451
 #define MADERA_NOISE_GATE_CONTROL			0x458
@@ -1182,68 +1173,6 @@
 #define MADERA_FCR_COEFF_END				0xFC5
 #define MADERA_AUXPDM1_CTRL_0				0x10C0
 #define MADERA_AUXPDM1_CTRL_1				0x10C1
-#define MADERA_DAC_COMP_1				0x1300
-#define MADERA_DAC_COMP_2				0x1302
-#define MADERA_FRF_COEFFICIENT_1L_1			0x1380
-#define MADERA_FRF_COEFFICIENT_1L_2			0x1381
-#define MADERA_FRF_COEFFICIENT_1L_3			0x1382
-#define MADERA_FRF_COEFFICIENT_1L_4			0x1383
-#define MADERA_FRF_COEFFICIENT_1R_1			0x1390
-#define MADERA_FRF_COEFFICIENT_1R_2			0x1391
-#define MADERA_FRF_COEFFICIENT_1R_3			0x1392
-#define MADERA_FRF_COEFFICIENT_1R_4			0x1393
-#define MADERA_FRF_COEFFICIENT_2L_1			0x13A0
-#define MADERA_FRF_COEFFICIENT_2L_2			0x13A1
-#define MADERA_FRF_COEFFICIENT_2L_3			0x13A2
-#define MADERA_FRF_COEFFICIENT_2L_4			0x13A3
-#define MADERA_FRF_COEFFICIENT_2R_1			0x13B0
-#define MADERA_FRF_COEFFICIENT_2R_2			0x13B1
-#define MADERA_FRF_COEFFICIENT_2R_3			0x13B2
-#define MADERA_FRF_COEFFICIENT_2R_4			0x13B3
-#define MADERA_FRF_COEFFICIENT_3L_1			0x13C0
-#define MADERA_FRF_COEFFICIENT_3L_2			0x13C1
-#define MADERA_FRF_COEFFICIENT_3L_3			0x13C2
-#define MADERA_FRF_COEFFICIENT_3L_4			0x13C3
-#define MADERA_FRF_COEFFICIENT_3R_1			0x13D0
-#define MADERA_FRF_COEFFICIENT_3R_2			0x13D1
-#define MADERA_FRF_COEFFICIENT_3R_3			0x13D2
-#define MADERA_FRF_COEFFICIENT_3R_4			0x13D3
-#define MADERA_FRF_COEFFICIENT_4L_1			0x13E0
-#define MADERA_FRF_COEFFICIENT_4L_2			0x13E1
-#define MADERA_FRF_COEFFICIENT_4L_3			0x13E2
-#define MADERA_FRF_COEFFICIENT_4L_4			0x13E3
-#define MADERA_FRF_COEFFICIENT_4R_1			0x13F0
-#define MADERA_FRF_COEFFICIENT_4R_2			0x13F1
-#define MADERA_FRF_COEFFICIENT_4R_3			0x13F2
-#define MADERA_FRF_COEFFICIENT_4R_4			0x13F3
-#define CS47L35_FRF_COEFFICIENT_4L_1			0x13A0
-#define CS47L35_FRF_COEFFICIENT_4L_2			0x13A1
-#define CS47L35_FRF_COEFFICIENT_4L_3			0x13A2
-#define CS47L35_FRF_COEFFICIENT_4L_4			0x13A3
-#define CS47L35_FRF_COEFFICIENT_5L_1			0x13B0
-#define CS47L35_FRF_COEFFICIENT_5L_2			0x13B1
-#define CS47L35_FRF_COEFFICIENT_5L_3			0x13B2
-#define CS47L35_FRF_COEFFICIENT_5L_4			0x13B3
-#define CS47L35_FRF_COEFFICIENT_5R_1			0x13C0
-#define CS47L35_FRF_COEFFICIENT_5R_2			0x13C1
-#define CS47L35_FRF_COEFFICIENT_5R_3			0x13C2
-#define CS47L35_FRF_COEFFICIENT_5R_4			0x13C3
-#define MADERA_FRF_COEFFICIENT_5L_1			0x1400
-#define MADERA_FRF_COEFFICIENT_5L_2			0x1401
-#define MADERA_FRF_COEFFICIENT_5L_3			0x1402
-#define MADERA_FRF_COEFFICIENT_5L_4			0x1403
-#define MADERA_FRF_COEFFICIENT_5R_1			0x1410
-#define MADERA_FRF_COEFFICIENT_5R_2			0x1411
-#define MADERA_FRF_COEFFICIENT_5R_3			0x1412
-#define MADERA_FRF_COEFFICIENT_5R_4			0x1413
-#define MADERA_FRF_COEFFICIENT_6L_1			0x1420
-#define MADERA_FRF_COEFFICIENT_6L_2			0x1421
-#define MADERA_FRF_COEFFICIENT_6L_3			0x1422
-#define MADERA_FRF_COEFFICIENT_6L_4			0x1423
-#define MADERA_FRF_COEFFICIENT_6R_1			0x1430
-#define MADERA_FRF_COEFFICIENT_6R_2			0x1431
-#define MADERA_FRF_COEFFICIENT_6R_3			0x1432
-#define MADERA_FRF_COEFFICIENT_6R_4			0x1433
 #define MADERA_DFC1_CTRL				0x1480
 #define MADERA_DFC1_RX					0x1482
 #define MADERA_DFC1_TX					0x1484
@@ -1573,15 +1502,6 @@
 #define MADERA_FLL1_REFCLK_SRC_SHIFT			     0
 #define MADERA_FLL1_REFCLK_SRC_WIDTH			     4
 
-/* (0x0177)  FLL1_Loop_Filter_Test_1 */
-#define MADERA_FLL1_FRC_INTEG_UPD			0x8000
-#define MADERA_FLL1_FRC_INTEG_UPD_MASK			0x8000
-#define MADERA_FLL1_FRC_INTEG_UPD_SHIFT			    15
-#define MADERA_FLL1_FRC_INTEG_UPD_WIDTH			     1
-#define MADERA_FLL1_FRC_INTEG_VAL_MASK			0x0FFF
-#define MADERA_FLL1_FRC_INTEG_VAL_SHIFT			     0
-#define MADERA_FLL1_FRC_INTEG_VAL_WIDTH			    12
-
 /* (0x0179)  FLL1_Control_7 */
 #define MADERA_FLL1_GAIN_MASK				0x003c
 #define MADERA_FLL1_GAIN_SHIFT				     2
-- 
cgit v1.2.3


From d6871a73387d51dfdde6ad1479aea54d3eafcc89 Mon Sep 17 00:00:00 2001
From: Charles Keepax <ckeepax@opensource.cirrus.com>
Date: Wed, 26 Jun 2019 14:33:36 +0100
Subject: mfd: madera: Fixup SPDX headers

GPL-2.0-only is the preferred way of expressing v2 of the GPL, so switch
to that. Remove some redundant copyright notices and correct some
instances where the wrong comment type has been used in header files.

Signed-off-by: Charles Keepax <ckeepax@opensource.cirrus.com>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 drivers/mfd/cs47l15-tables.c         | 2 +-
 drivers/mfd/cs47l35-tables.c         | 6 +-----
 drivers/mfd/cs47l85-tables.c         | 6 +-----
 drivers/mfd/cs47l90-tables.c         | 6 +-----
 drivers/mfd/cs47l92-tables.c         | 2 +-
 drivers/mfd/madera-core.c            | 6 +-----
 drivers/mfd/madera-i2c.c             | 6 +-----
 drivers/mfd/madera-spi.c             | 6 +-----
 include/linux/mfd/madera/core.h      | 6 +-----
 include/linux/mfd/madera/pdata.h     | 6 +-----
 include/linux/mfd/madera/registers.h | 6 +-----
 11 files changed, 11 insertions(+), 47 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mfd/cs47l15-tables.c b/drivers/mfd/cs47l15-tables.c
index 73db8d03b531..f81b45336690 100644
--- a/drivers/mfd/cs47l15-tables.c
+++ b/drivers/mfd/cs47l15-tables.c
@@ -1,4 +1,4 @@
-// SPDX-License-Identifier: GPL-2.0
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * Regmap tables for CS47L15 codec
  *
diff --git a/drivers/mfd/cs47l35-tables.c b/drivers/mfd/cs47l35-tables.c
index fe838cbc2a7e..a0bc6c5100d6 100644
--- a/drivers/mfd/cs47l35-tables.c
+++ b/drivers/mfd/cs47l35-tables.c
@@ -1,12 +1,8 @@
-// SPDX-License-Identifier: GPL-2.0
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * Regmap tables for CS47L35 codec
  *
  * Copyright (C) 2015-2017 Cirrus Logic
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by the
- * Free Software Foundation; version 2.
  */
 
 #include <linux/device.h>
diff --git a/drivers/mfd/cs47l85-tables.c b/drivers/mfd/cs47l85-tables.c
index d0198b5e86ba..270d8eda3f5f 100644
--- a/drivers/mfd/cs47l85-tables.c
+++ b/drivers/mfd/cs47l85-tables.c
@@ -1,12 +1,8 @@
-// SPDX-License-Identifier: GPL-2.0
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * Regmap tables for CS47L85 codec
  *
  * Copyright (C) 2015-2017 Cirrus Logic
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by the
- * Free Software Foundation; version 2.
  */
 
 #include <linux/device.h>
diff --git a/drivers/mfd/cs47l90-tables.c b/drivers/mfd/cs47l90-tables.c
index 2c761fc241f3..7345fc09c0bb 100644
--- a/drivers/mfd/cs47l90-tables.c
+++ b/drivers/mfd/cs47l90-tables.c
@@ -1,12 +1,8 @@
-// SPDX-License-Identifier: GPL-2.0
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * Regmap tables for CS47L90 codec
  *
  * Copyright (C) 2015-2017 Cirrus Logic
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by the
- * Free Software Foundation; version 2.
  */
 
 #include <linux/device.h>
diff --git a/drivers/mfd/cs47l92-tables.c b/drivers/mfd/cs47l92-tables.c
index c8a234381350..f296e355df4d 100644
--- a/drivers/mfd/cs47l92-tables.c
+++ b/drivers/mfd/cs47l92-tables.c
@@ -1,4 +1,4 @@
-// SPDX-License-Identifier: GPL-2.0
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * Regmap tables for CS47L92 codec
  *
diff --git a/drivers/mfd/madera-core.c b/drivers/mfd/madera-core.c
index b9e9c169c6cc..29540cbf7593 100644
--- a/drivers/mfd/madera-core.c
+++ b/drivers/mfd/madera-core.c
@@ -1,12 +1,8 @@
-// SPDX-License-Identifier: GPL-2.0
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * Core MFD support for Cirrus Logic Madera codecs
  *
  * Copyright (C) 2015-2018 Cirrus Logic
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by the
- * Free Software Foundation; version 2.
  */
 
 #include <linux/device.h>
diff --git a/drivers/mfd/madera-i2c.c b/drivers/mfd/madera-i2c.c
index 3f4ab5dcf5c3..6b965eb034b6 100644
--- a/drivers/mfd/madera-i2c.c
+++ b/drivers/mfd/madera-i2c.c
@@ -1,12 +1,8 @@
-// SPDX-License-Identifier: GPL-2.0
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * I2C bus interface to Cirrus Logic Madera codecs
  *
  * Copyright (C) 2015-2018 Cirrus Logic
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by the
- * Free Software Foundation; version 2.
  */
 
 #include <linux/device.h>
diff --git a/drivers/mfd/madera-spi.c b/drivers/mfd/madera-spi.c
index d76c7e7376d7..e860f5ff0933 100644
--- a/drivers/mfd/madera-spi.c
+++ b/drivers/mfd/madera-spi.c
@@ -1,12 +1,8 @@
-// SPDX-License-Identifier: GPL-2.0
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * SPI bus interface to Cirrus Logic Madera codecs
  *
  * Copyright (C) 2015-2018 Cirrus Logic
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by the
- * Free Software Foundation; version 2.
  */
 
 #include <linux/device.h>
diff --git a/include/linux/mfd/madera/core.h b/include/linux/mfd/madera/core.h
index 7b87f9a02ecc..7ffa696cce7c 100644
--- a/include/linux/mfd/madera/core.h
+++ b/include/linux/mfd/madera/core.h
@@ -1,12 +1,8 @@
-// SPDX-License-Identifier: GPL-2.0
+/* SPDX-License-Identifier: GPL-2.0-only */
 /*
  * MFD internals for Cirrus Logic Madera codecs
  *
  * Copyright (C) 2015-2018 Cirrus Logic
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by the
- * Free Software Foundation; version 2.
  */
 
 #ifndef MADERA_CORE_H
diff --git a/include/linux/mfd/madera/pdata.h b/include/linux/mfd/madera/pdata.h
index dd00ab824e5b..ec0711bcad50 100644
--- a/include/linux/mfd/madera/pdata.h
+++ b/include/linux/mfd/madera/pdata.h
@@ -1,12 +1,8 @@
-// SPDX-License-Identifier: GPL-2.0
+/* SPDX-License-Identifier: GPL-2.0-only */
 /*
  * Platform data for Cirrus Logic Madera codecs
  *
  * Copyright (C) 2015-2018 Cirrus Logic
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by the
- * Free Software Foundation; version 2.
  */
 
 #ifndef MADERA_PDATA_H
diff --git a/include/linux/mfd/madera/registers.h b/include/linux/mfd/madera/registers.h
index 53c2377b54b2..fe909d177762 100644
--- a/include/linux/mfd/madera/registers.h
+++ b/include/linux/mfd/madera/registers.h
@@ -1,12 +1,8 @@
-// SPDX-License-Identifier: GPL-2.0
+/* SPDX-License-Identifier: GPL-2.0-only */
 /*
  * Madera register definitions
  *
  * Copyright (C) 2015-2018 Cirrus Logic
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by the
- * Free Software Foundation; version 2.
  */
 
 #ifndef MADERA_REGISTERS_H
-- 
cgit v1.2.3


From 9d83dcb3e4553f34ee1c4f09d65173159f9eb7a7 Mon Sep 17 00:00:00 2001
From: Krzysztof Kozlowski <krzk@kernel.org>
Date: Sat, 29 Jun 2019 13:44:46 +0200
Subject: regulator: s2mps11: Adjust supported buck voltages to real values

The driver was registering buck regulators with an unsupported range of
voltages for S2MPS11 devices.  Basically it assumed that all 256 values
are possible for a single 8-bit I2C register controlling a buck's voltage.
This is not true, as the datasheet describes a subset of these which can
be used.

For example, for buck[12346] the minimum voltage is 650 mV, which
corresponds to a register value of 0x8.  The driver was however
registering the regulator starting at 600 mV, so for a step of 6.25 mV
this gave the same result.  However, this allowed attempts to configure
regulators to unsupported values.

Signed-off-by: Krzysztof Kozlowski <krzk@kernel.org>
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 drivers/regulator/s2mps11.c         | 27 +++++++++++++++++----------
 include/linux/mfd/samsung/core.h    |  1 +
 include/linux/mfd/samsung/s2mps11.h |  4 +++-
 3 files changed, 21 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/regulator/s2mps11.c b/drivers/regulator/s2mps11.c
index 7a89030187a4..5b7ba7c6c4f6 100644
--- a/drivers/regulator/s2mps11.c
+++ b/drivers/regulator/s2mps11.c
@@ -269,9 +269,10 @@ static const struct regulator_ops s2mps11_buck_ops = {
 	.ops		= &s2mps11_buck_ops,			\
 	.type		= REGULATOR_VOLTAGE,			\
 	.owner		= THIS_MODULE,				\
-	.min_uV		= MIN_600_MV,				\
+	.min_uV		= MIN_650_MV,				\
 	.uV_step	= STEP_6_25_MV,				\
-	.n_voltages	= S2MPS11_BUCK_N_VOLTAGES,		\
+	.linear_min_sel	= 8,					\
+	.n_voltages	= S2MPS11_BUCK12346_N_VOLTAGES,		\
 	.ramp_delay	= S2MPS11_RAMP_DELAY,			\
 	.vsel_reg	= S2MPS11_REG_B1CTRL2 + (num - 1) * 2,	\
 	.vsel_mask	= S2MPS11_BUCK_VSEL_MASK,		\
@@ -285,9 +286,10 @@ static const struct regulator_ops s2mps11_buck_ops = {
 	.ops		= &s2mps11_buck_ops,			\
 	.type		= REGULATOR_VOLTAGE,			\
 	.owner		= THIS_MODULE,				\
-	.min_uV		= MIN_600_MV,				\
+	.min_uV		= MIN_650_MV,				\
 	.uV_step	= STEP_6_25_MV,				\
-	.n_voltages	= S2MPS11_BUCK_N_VOLTAGES,		\
+	.linear_min_sel	= 8,					\
+	.n_voltages	= S2MPS11_BUCK5_N_VOLTAGES,		\
 	.ramp_delay	= S2MPS11_RAMP_DELAY,			\
 	.vsel_reg	= S2MPS11_REG_B5CTRL2,			\
 	.vsel_mask	= S2MPS11_BUCK_VSEL_MASK,		\
@@ -295,7 +297,7 @@ static const struct regulator_ops s2mps11_buck_ops = {
 	.enable_mask	= S2MPS11_ENABLE_MASK			\
 }
 
-#define regulator_desc_s2mps11_buck67810(num, min, step) {	\
+#define regulator_desc_s2mps11_buck67810(num, min, step, min_sel, voltages) {	\
 	.name		= "BUCK"#num,				\
 	.id		= S2MPS11_BUCK##num,			\
 	.ops		= &s2mps11_buck_ops,			\
@@ -303,7 +305,8 @@ static const struct regulator_ops s2mps11_buck_ops = {
 	.owner		= THIS_MODULE,				\
 	.min_uV		= min,					\
 	.uV_step	= step,					\
-	.n_voltages	= S2MPS11_BUCK_N_VOLTAGES,		\
+	.linear_min_sel	= min_sel,				\
+	.n_voltages	= voltages,				\
 	.ramp_delay	= S2MPS11_RAMP_DELAY,			\
 	.vsel_reg	= S2MPS11_REG_B6CTRL2 + (num - 6) * 2,	\
 	.vsel_mask	= S2MPS11_BUCK_VSEL_MASK,		\
@@ -371,11 +374,15 @@ static const struct regulator_desc s2mps11_regulators[] = {
 	regulator_desc_s2mps11_buck1_4(3),
 	regulator_desc_s2mps11_buck1_4(4),
 	regulator_desc_s2mps11_buck5,
-	regulator_desc_s2mps11_buck67810(6, MIN_600_MV, STEP_6_25_MV),
-	regulator_desc_s2mps11_buck67810(7, MIN_750_MV, STEP_12_5_MV),
-	regulator_desc_s2mps11_buck67810(8, MIN_750_MV, STEP_12_5_MV),
+	regulator_desc_s2mps11_buck67810(6, MIN_650_MV, STEP_6_25_MV, 8,
+					 S2MPS11_BUCK12346_N_VOLTAGES),
+	regulator_desc_s2mps11_buck67810(7, MIN_750_MV, STEP_12_5_MV, 0,
+					 S2MPS11_BUCK7810_N_VOLTAGES),
+	regulator_desc_s2mps11_buck67810(8, MIN_750_MV, STEP_12_5_MV, 0,
+					 S2MPS11_BUCK7810_N_VOLTAGES),
 	regulator_desc_s2mps11_buck9,
-	regulator_desc_s2mps11_buck67810(10, MIN_750_MV, STEP_12_5_MV),
+	regulator_desc_s2mps11_buck67810(10, MIN_750_MV, STEP_12_5_MV, 0,
+					 S2MPS11_BUCK7810_N_VOLTAGES),
 };
 
 static const struct regulator_ops s2mps14_reg_ops;
diff --git a/include/linux/mfd/samsung/core.h b/include/linux/mfd/samsung/core.h
index 3ca17eb89aa2..f1631a39acfc 100644
--- a/include/linux/mfd/samsung/core.h
+++ b/include/linux/mfd/samsung/core.h
@@ -20,6 +20,7 @@
 #define MIN_850_MV		850000
 #define MIN_800_MV		800000
 #define MIN_750_MV		750000
+#define MIN_650_MV		650000
 #define MIN_600_MV		600000
 #define MIN_500_MV		500000
 
diff --git a/include/linux/mfd/samsung/s2mps11.h b/include/linux/mfd/samsung/s2mps11.h
index 6e7668a389a1..fc67c9e75bba 100644
--- a/include/linux/mfd/samsung/s2mps11.h
+++ b/include/linux/mfd/samsung/s2mps11.h
@@ -170,7 +170,9 @@ enum s2mps11_regulators {
 #define S2MPS11_ENABLE_MASK	(0x03 << S2MPS11_ENABLE_SHIFT)
 #define S2MPS11_ENABLE_SHIFT	0x06
 #define S2MPS11_LDO_N_VOLTAGES	(S2MPS11_LDO_VSEL_MASK + 1)
-#define S2MPS11_BUCK_N_VOLTAGES (S2MPS11_BUCK_VSEL_MASK + 1)
+#define S2MPS11_BUCK12346_N_VOLTAGES	153
+#define S2MPS11_BUCK5_N_VOLTAGES	216
+#define S2MPS11_BUCK7810_N_VOLTAGES	225
 #define S2MPS11_BUCK9_N_VOLTAGES (S2MPS11_BUCK9_VSEL_MASK + 1)
 #define S2MPS11_RAMP_DELAY	25000		/* uV/us */
 
-- 
cgit v1.2.3


From 07ec38917e68f0114b9c8aeeb1c584b5e73e4dd6 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 26 Jun 2019 14:27:01 +0200
Subject: mm: remove the struct hmm_device infrastructure

This code is a trivial wrapper around device model helpers, which
should have been integrated into the driver device model usage from
the start -- assuming it actually had users, which it never had since
the code was added more than 1 1/2 years ago.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Jason Gunthorpe <jgg@mellanox.com>
Reviewed-by: John Hubbard <jhubbard@nvidia.com>
Reviewed-by: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 include/linux/hmm.h | 20 --------------
 mm/hmm.c            | 80 -----------------------------------------------------
 2 files changed, 100 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/hmm.h b/include/linux/hmm.h
index 044a36d7c3f8..99765be3284d 100644
--- a/include/linux/hmm.h
+++ b/include/linux/hmm.h
@@ -751,26 +751,6 @@ static inline unsigned long hmm_devmem_page_get_drvdata(const struct page *page)
 {
 	return page->hmm_data;
 }
-
-
-/*
- * struct hmm_device - fake device to hang device memory onto
- *
- * @device: device struct
- * @minor: device minor number
- */
-struct hmm_device {
-	struct device		device;
-	unsigned int		minor;
-};
-
-/*
- * A device driver that wants to handle multiple devices memory through a
- * single fake device can use hmm_device to do so. This is purely a helper and
- * it is not strictly needed, in order to make use of any HMM functionality.
- */
-struct hmm_device *hmm_device_new(void *drvdata);
-void hmm_device_put(struct hmm_device *hmm_device);
 #endif /* CONFIG_DEVICE_PRIVATE || CONFIG_DEVICE_PUBLIC */
 #else /* IS_ENABLED(CONFIG_HMM) */
 static inline void hmm_mm_destroy(struct mm_struct *mm) {}
diff --git a/mm/hmm.c b/mm/hmm.c
index f702a3895d05..00cc642b3d7e 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -1528,84 +1528,4 @@ struct hmm_devmem *hmm_devmem_add_resource(const struct hmm_devmem_ops *ops,
 	return devmem;
 }
 EXPORT_SYMBOL_GPL(hmm_devmem_add_resource);
-
-/*
- * A device driver that wants to handle multiple devices memory through a
- * single fake device can use hmm_device to do so. This is purely a helper
- * and it is not needed to make use of any HMM functionality.
- */
-#define HMM_DEVICE_MAX 256
-
-static DECLARE_BITMAP(hmm_device_mask, HMM_DEVICE_MAX);
-static DEFINE_SPINLOCK(hmm_device_lock);
-static struct class *hmm_device_class;
-static dev_t hmm_device_devt;
-
-static void hmm_device_release(struct device *device)
-{
-	struct hmm_device *hmm_device;
-
-	hmm_device = container_of(device, struct hmm_device, device);
-	spin_lock(&hmm_device_lock);
-	clear_bit(hmm_device->minor, hmm_device_mask);
-	spin_unlock(&hmm_device_lock);
-
-	kfree(hmm_device);
-}
-
-struct hmm_device *hmm_device_new(void *drvdata)
-{
-	struct hmm_device *hmm_device;
-
-	hmm_device = kzalloc(sizeof(*hmm_device), GFP_KERNEL);
-	if (!hmm_device)
-		return ERR_PTR(-ENOMEM);
-
-	spin_lock(&hmm_device_lock);
-	hmm_device->minor = find_first_zero_bit(hmm_device_mask, HMM_DEVICE_MAX);
-	if (hmm_device->minor >= HMM_DEVICE_MAX) {
-		spin_unlock(&hmm_device_lock);
-		kfree(hmm_device);
-		return ERR_PTR(-EBUSY);
-	}
-	set_bit(hmm_device->minor, hmm_device_mask);
-	spin_unlock(&hmm_device_lock);
-
-	dev_set_name(&hmm_device->device, "hmm_device%d", hmm_device->minor);
-	hmm_device->device.devt = MKDEV(MAJOR(hmm_device_devt),
-					hmm_device->minor);
-	hmm_device->device.release = hmm_device_release;
-	dev_set_drvdata(&hmm_device->device, drvdata);
-	hmm_device->device.class = hmm_device_class;
-	device_initialize(&hmm_device->device);
-
-	return hmm_device;
-}
-EXPORT_SYMBOL(hmm_device_new);
-
-void hmm_device_put(struct hmm_device *hmm_device)
-{
-	put_device(&hmm_device->device);
-}
-EXPORT_SYMBOL(hmm_device_put);
-
-static int __init hmm_init(void)
-{
-	int ret;
-
-	ret = alloc_chrdev_region(&hmm_device_devt, 0,
-				  HMM_DEVICE_MAX,
-				  "hmm_device");
-	if (ret)
-		return ret;
-
-	hmm_device_class = class_create(THIS_MODULE, "hmm_device");
-	if (IS_ERR(hmm_device_class)) {
-		unregister_chrdev_region(hmm_device_devt, HMM_DEVICE_MAX);
-		return PTR_ERR(hmm_device_class);
-	}
-	return 0;
-}
-
-device_initcall(hmm_init);
 #endif /* CONFIG_DEVICE_PRIVATE || CONFIG_DEVICE_PUBLIC */
-- 
cgit v1.2.3


From 25b2995a35b609119cf96f6b62eccd56c0234c7d Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 13 Jun 2019 22:50:49 +0200
Subject: mm: remove MEMORY_DEVICE_PUBLIC support

The code hasn't been used since it was added to the tree, and doesn't
appear to actually be usable.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Jason Gunthorpe <jgg@mellanox.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Reviewed-by: Dan Williams <dan.j.williams@intel.com>
Tested-by: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 fs/proc/task_mmu.c       |  2 +-
 include/linux/hmm.h      |  7 ++-----
 include/linux/ioport.h   |  1 -
 include/linux/memremap.h |  8 -------
 include/linux/mm.h       | 18 ++--------------
 mm/Kconfig               | 11 ----------
 mm/gup.c                 |  7 -------
 mm/hmm.c                 | 54 ++----------------------------------------------
 mm/madvise.c             |  2 +-
 mm/memcontrol.c          | 13 ++++++------
 mm/memory-failure.c      |  6 +-----
 mm/memory.c              | 40 +++--------------------------------
 mm/migrate.c             | 28 ++++---------------------
 mm/swap.c                | 11 ----------
 14 files changed, 22 insertions(+), 186 deletions(-)

(limited to 'include/linux')

diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 01d4eb0e6bd1..74d8f00b3615 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -1279,7 +1279,7 @@ static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm,
 		if (pm->show_pfn)
 			frame = pte_pfn(pte);
 		flags |= PM_PRESENT;
-		page = _vm_normal_page(vma, addr, pte, true);
+		page = vm_normal_page(vma, addr, pte);
 		if (pte_soft_dirty(pte))
 			flags |= PM_SOFT_DIRTY;
 	} else if (is_swap_pte(pte)) {
diff --git a/include/linux/hmm.h b/include/linux/hmm.h
index 99765be3284d..44a5ac738bb5 100644
--- a/include/linux/hmm.h
+++ b/include/linux/hmm.h
@@ -584,7 +584,7 @@ static inline void hmm_mm_destroy(struct mm_struct *mm) {}
 static inline void hmm_mm_init(struct mm_struct *mm) {}
 #endif /* IS_ENABLED(CONFIG_HMM_MIRROR) */
 
-#if IS_ENABLED(CONFIG_DEVICE_PRIVATE) ||  IS_ENABLED(CONFIG_DEVICE_PUBLIC)
+#if IS_ENABLED(CONFIG_DEVICE_PRIVATE)
 struct hmm_devmem;
 
 struct page *hmm_vma_alloc_locked_page(struct vm_area_struct *vma,
@@ -722,9 +722,6 @@ struct hmm_devmem {
 struct hmm_devmem *hmm_devmem_add(const struct hmm_devmem_ops *ops,
 				  struct device *device,
 				  unsigned long size);
-struct hmm_devmem *hmm_devmem_add_resource(const struct hmm_devmem_ops *ops,
-					   struct device *device,
-					   struct resource *res);
 
 /*
  * hmm_devmem_page_set_drvdata - set per-page driver data field
@@ -751,7 +748,7 @@ static inline unsigned long hmm_devmem_page_get_drvdata(const struct page *page)
 {
 	return page->hmm_data;
 }
-#endif /* CONFIG_DEVICE_PRIVATE || CONFIG_DEVICE_PUBLIC */
+#endif /* CONFIG_DEVICE_PRIVATE */
 #else /* IS_ENABLED(CONFIG_HMM) */
 static inline void hmm_mm_destroy(struct mm_struct *mm) {}
 static inline void hmm_mm_init(struct mm_struct *mm) {}
diff --git a/include/linux/ioport.h b/include/linux/ioport.h
index da0ebaec25f0..dd961882bc74 100644
--- a/include/linux/ioport.h
+++ b/include/linux/ioport.h
@@ -132,7 +132,6 @@ enum {
 	IORES_DESC_PERSISTENT_MEMORY		= 4,
 	IORES_DESC_PERSISTENT_MEMORY_LEGACY	= 5,
 	IORES_DESC_DEVICE_PRIVATE_MEMORY	= 6,
-	IORES_DESC_DEVICE_PUBLIC_MEMORY		= 7,
 };
 
 /* helpers to define resources */
diff --git a/include/linux/memremap.h b/include/linux/memremap.h
index 1732dea030b2..995c62c5a48b 100644
--- a/include/linux/memremap.h
+++ b/include/linux/memremap.h
@@ -37,13 +37,6 @@ struct vmem_altmap {
  * A more complete discussion of unaddressable memory may be found in
  * include/linux/hmm.h and Documentation/vm/hmm.rst.
  *
- * MEMORY_DEVICE_PUBLIC:
- * Device memory that is cache coherent from device and CPU point of view. This
- * is use on platform that have an advance system bus (like CAPI or CCIX). A
- * driver can hotplug the device memory using ZONE_DEVICE and with that memory
- * type. Any page of a process can be migrated to such memory. However no one
- * should be allow to pin such memory so that it can always be evicted.
- *
  * MEMORY_DEVICE_FS_DAX:
  * Host memory that has similar access semantics as System RAM i.e. DMA
  * coherent and supports page pinning. In support of coordinating page
@@ -58,7 +51,6 @@ struct vmem_altmap {
  */
 enum memory_type {
 	MEMORY_DEVICE_PRIVATE = 1,
-	MEMORY_DEVICE_PUBLIC,
 	MEMORY_DEVICE_FS_DAX,
 	MEMORY_DEVICE_PCI_P2PDMA,
 };
diff --git a/include/linux/mm.h b/include/linux/mm.h
index dd0b5f4e1e45..7399f9f08de6 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -944,7 +944,6 @@ static inline bool put_devmap_managed_page(struct page *page)
 		return false;
 	switch (page->pgmap->type) {
 	case MEMORY_DEVICE_PRIVATE:
-	case MEMORY_DEVICE_PUBLIC:
 	case MEMORY_DEVICE_FS_DAX:
 		__put_devmap_managed_page(page);
 		return true;
@@ -960,12 +959,6 @@ static inline bool is_device_private_page(const struct page *page)
 		page->pgmap->type == MEMORY_DEVICE_PRIVATE;
 }
 
-static inline bool is_device_public_page(const struct page *page)
-{
-	return is_zone_device_page(page) &&
-		page->pgmap->type == MEMORY_DEVICE_PUBLIC;
-}
-
 #ifdef CONFIG_PCI_P2PDMA
 static inline bool is_pci_p2pdma_page(const struct page *page)
 {
@@ -998,11 +991,6 @@ static inline bool is_device_private_page(const struct page *page)
 	return false;
 }
 
-static inline bool is_device_public_page(const struct page *page)
-{
-	return false;
-}
-
 static inline bool is_pci_p2pdma_page(const struct page *page)
 {
 	return false;
@@ -1431,10 +1419,8 @@ struct zap_details {
 	pgoff_t last_index;			/* Highest page->index to unmap */
 };
 
-struct page *_vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
-			     pte_t pte, bool with_public_device);
-#define vm_normal_page(vma, addr, pte) _vm_normal_page(vma, addr, pte, false)
-
+struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
+			     pte_t pte);
 struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr,
 				pmd_t pmd);
 
diff --git a/mm/Kconfig b/mm/Kconfig
index 0d2ba7e1f43e..6f35b85b3052 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -718,17 +718,6 @@ config DEVICE_PRIVATE
 	  memory; i.e., memory that is only accessible from the device (or
 	  group of devices). You likely also want to select HMM_MIRROR.
 
-config DEVICE_PUBLIC
-	bool "Addressable device memory (like GPU memory)"
-	depends on ARCH_HAS_HMM
-	select HMM
-	select DEV_PAGEMAP_OPS
-
-	help
-	  Allows creation of struct pages to represent addressable device
-	  memory; i.e., memory that is accessible from both the device and
-	  the CPU
-
 config FRAME_VECTOR
 	bool
 
diff --git a/mm/gup.c b/mm/gup.c
index ddde097cf9e4..fe131d879c70 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -605,13 +605,6 @@ static int get_gate_page(struct mm_struct *mm, unsigned long address,
 		if ((gup_flags & FOLL_DUMP) || !is_zero_pfn(pte_pfn(*pte)))
 			goto unmap;
 		*page = pte_page(*pte);
-
-		/*
-		 * This should never happen (a device public page in the gate
-		 * area).
-		 */
-		if (is_device_public_page(*page))
-			goto unmap;
 	}
 	if (unlikely(!try_get_page(*page))) {
 		ret = -ENOMEM;
diff --git a/mm/hmm.c b/mm/hmm.c
index 00cc642b3d7e..376159a769fb 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -1331,7 +1331,7 @@ EXPORT_SYMBOL(hmm_range_dma_unmap);
 #endif /* IS_ENABLED(CONFIG_HMM_MIRROR) */
 
 
-#if IS_ENABLED(CONFIG_DEVICE_PRIVATE) ||  IS_ENABLED(CONFIG_DEVICE_PUBLIC)
+#if IS_ENABLED(CONFIG_DEVICE_PRIVATE)
 struct page *hmm_vma_alloc_locked_page(struct vm_area_struct *vma,
 				       unsigned long addr)
 {
@@ -1478,54 +1478,4 @@ struct hmm_devmem *hmm_devmem_add(const struct hmm_devmem_ops *ops,
 	return devmem;
 }
 EXPORT_SYMBOL_GPL(hmm_devmem_add);
-
-struct hmm_devmem *hmm_devmem_add_resource(const struct hmm_devmem_ops *ops,
-					   struct device *device,
-					   struct resource *res)
-{
-	struct hmm_devmem *devmem;
-	void *result;
-	int ret;
-
-	if (res->desc != IORES_DESC_DEVICE_PUBLIC_MEMORY)
-		return ERR_PTR(-EINVAL);
-
-	dev_pagemap_get_ops();
-
-	devmem = devm_kzalloc(device, sizeof(*devmem), GFP_KERNEL);
-	if (!devmem)
-		return ERR_PTR(-ENOMEM);
-
-	init_completion(&devmem->completion);
-	devmem->pfn_first = -1UL;
-	devmem->pfn_last = -1UL;
-	devmem->resource = res;
-	devmem->device = device;
-	devmem->ops = ops;
-
-	ret = percpu_ref_init(&devmem->ref, &hmm_devmem_ref_release,
-			      0, GFP_KERNEL);
-	if (ret)
-		return ERR_PTR(ret);
-
-	devmem->pfn_first = devmem->resource->start >> PAGE_SHIFT;
-	devmem->pfn_last = devmem->pfn_first +
-			   (resource_size(devmem->resource) >> PAGE_SHIFT);
-	devmem->page_fault = hmm_devmem_fault;
-
-	devmem->pagemap.type = MEMORY_DEVICE_PUBLIC;
-	devmem->pagemap.res = *devmem->resource;
-	devmem->pagemap.page_free = hmm_devmem_free;
-	devmem->pagemap.altmap_valid = false;
-	devmem->pagemap.ref = &devmem->ref;
-	devmem->pagemap.data = devmem;
-	devmem->pagemap.kill = hmm_devmem_ref_kill;
-	devmem->pagemap.cleanup = hmm_devmem_ref_exit;
-
-	result = devm_memremap_pages(devmem->device, &devmem->pagemap);
-	if (IS_ERR(result))
-		return result;
-	return devmem;
-}
-EXPORT_SYMBOL_GPL(hmm_devmem_add_resource);
-#endif /* CONFIG_DEVICE_PRIVATE || CONFIG_DEVICE_PUBLIC */
+#endif /* CONFIG_DEVICE_PRIVATE  */
diff --git a/mm/madvise.c b/mm/madvise.c
index 628022e674a7..968df3aa069f 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -354,7 +354,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
 			continue;
 		}
 
-		page = _vm_normal_page(vma, addr, ptent, true);
+		page = vm_normal_page(vma, addr, ptent);
 		if (!page)
 			continue;
 
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index ba9138a4a1de..d2a6454fa0bd 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -4793,7 +4793,7 @@ enum mc_target_type {
 static struct page *mc_handle_present_pte(struct vm_area_struct *vma,
 						unsigned long addr, pte_t ptent)
 {
-	struct page *page = _vm_normal_page(vma, addr, ptent, true);
+	struct page *page = vm_normal_page(vma, addr, ptent);
 
 	if (!page || !page_mapped(page))
 		return NULL;
@@ -4994,8 +4994,8 @@ out:
  *   2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a
  *     target for charge migration. if @target is not NULL, the entry is stored
  *     in target->ent.
- *   3(MC_TARGET_DEVICE): like MC_TARGET_PAGE  but page is MEMORY_DEVICE_PUBLIC
- *     or MEMORY_DEVICE_PRIVATE (so ZONE_DEVICE page and thus not on the lru).
+ *   3(MC_TARGET_DEVICE): like MC_TARGET_PAGE  but page is MEMORY_DEVICE_PRIVATE
+ *     (so ZONE_DEVICE page and thus not on the lru).
  *     For now we such page is charge like a regular page would be as for all
  *     intent and purposes it is just special memory taking the place of a
  *     regular page.
@@ -5029,8 +5029,7 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
 		 */
 		if (page->mem_cgroup == mc.from) {
 			ret = MC_TARGET_PAGE;
-			if (is_device_private_page(page) ||
-			    is_device_public_page(page))
+			if (is_device_private_page(page))
 				ret = MC_TARGET_DEVICE;
 			if (target)
 				target->page = page;
@@ -5101,8 +5100,8 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
 	if (ptl) {
 		/*
 		 * Note their can not be MC_TARGET_DEVICE for now as we do not
-		 * support transparent huge page with MEMORY_DEVICE_PUBLIC or
-		 * MEMORY_DEVICE_PRIVATE but this might change.
+		 * support transparent huge page with MEMORY_DEVICE_PRIVATE but
+		 * this might change.
 		 */
 		if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE)
 			mc.precharge += HPAGE_PMD_NR;
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index d9cc6606f409..31e7c7b424a1 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1177,16 +1177,12 @@ static int memory_failure_dev_pagemap(unsigned long pfn, int flags,
 		goto unlock;
 	}
 
-	switch (pgmap->type) {
-	case MEMORY_DEVICE_PRIVATE:
-	case MEMORY_DEVICE_PUBLIC:
+	if (pgmap->type == MEMORY_DEVICE_PRIVATE) {
 		/*
 		 * TODO: Handle HMM pages which may need coordination
 		 * with device-side memory.
 		 */
 		goto unlock;
-	default:
-		break;
 	}
 
 	/*
diff --git a/mm/memory.c b/mm/memory.c
index ddf20bd0c317..2d14f4c7e152 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -571,8 +571,8 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
  * PFNMAP mappings in order to support COWable mappings.
  *
  */
-struct page *_vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
-			     pte_t pte, bool with_public_device)
+struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
+			    pte_t pte)
 {
 	unsigned long pfn = pte_pfn(pte);
 
@@ -585,29 +585,6 @@ struct page *_vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
 			return NULL;
 		if (is_zero_pfn(pfn))
 			return NULL;
-
-		/*
-		 * Device public pages are special pages (they are ZONE_DEVICE
-		 * pages but different from persistent memory). They behave
-		 * allmost like normal pages. The difference is that they are
-		 * not on the lru and thus should never be involve with any-
-		 * thing that involve lru manipulation (mlock, numa balancing,
-		 * ...).
-		 *
-		 * This is why we still want to return NULL for such page from
-		 * vm_normal_page() so that we do not have to special case all
-		 * call site of vm_normal_page().
-		 */
-		if (likely(pfn <= highest_memmap_pfn)) {
-			struct page *page = pfn_to_page(pfn);
-
-			if (is_device_public_page(page)) {
-				if (with_public_device)
-					return page;
-				return NULL;
-			}
-		}
-
 		if (pte_devmap(pte))
 			return NULL;
 
@@ -797,17 +774,6 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 		rss[mm_counter(page)]++;
 	} else if (pte_devmap(pte)) {
 		page = pte_page(pte);
-
-		/*
-		 * Cache coherent device memory behave like regular page and
-		 * not like persistent memory page. For more informations see
-		 * MEMORY_DEVICE_CACHE_COHERENT in memory_hotplug.h
-		 */
-		if (is_device_public_page(page)) {
-			get_page(page);
-			page_dup_rmap(page, false);
-			rss[mm_counter(page)]++;
-		}
 	}
 
 out_set_pte:
@@ -1063,7 +1029,7 @@ again:
 		if (pte_present(ptent)) {
 			struct page *page;
 
-			page = _vm_normal_page(vma, addr, ptent, true);
+			page = vm_normal_page(vma, addr, ptent);
 			if (unlikely(details) && page) {
 				/*
 				 * unmap_shared_mapping_pages() wants to
diff --git a/mm/migrate.c b/mm/migrate.c
index f2ecc2855a12..78d45e184457 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -246,8 +246,6 @@ static bool remove_migration_pte(struct page *page, struct vm_area_struct *vma,
 			if (is_device_private_page(new)) {
 				entry = make_device_private_entry(new, pte_write(pte));
 				pte = swp_entry_to_pte(entry);
-			} else if (is_device_public_page(new)) {
-				pte = pte_mkdevmap(pte);
 			}
 		}
 
@@ -381,7 +379,6 @@ static int expected_page_refs(struct address_space *mapping, struct page *page)
 	 * ZONE_DEVICE pages.
 	 */
 	expected_count += is_device_private_page(page);
-	expected_count += is_device_public_page(page);
 	if (mapping)
 		expected_count += hpage_nr_pages(page) + page_has_private(page);
 
@@ -994,10 +991,7 @@ static int move_to_new_page(struct page *newpage, struct page *page,
 		if (!PageMappingFlags(page))
 			page->mapping = NULL;
 
-		if (unlikely(is_zone_device_page(newpage))) {
-			if (is_device_public_page(newpage))
-				flush_dcache_page(newpage);
-		} else
+		if (likely(!is_zone_device_page(newpage)))
 			flush_dcache_page(newpage);
 
 	}
@@ -2265,7 +2259,7 @@ again:
 				pfn = 0;
 				goto next;
 			}
-			page = _vm_normal_page(migrate->vma, addr, pte, true);
+			page = vm_normal_page(migrate->vma, addr, pte);
 			mpfn = migrate_pfn(pfn) | MIGRATE_PFN_MIGRATE;
 			mpfn |= pte_write(pte) ? MIGRATE_PFN_WRITE : 0;
 		}
@@ -2406,16 +2400,7 @@ static bool migrate_vma_check_page(struct page *page)
 		 * FIXME proper solution is to rework migration_entry_wait() so
 		 * it does not need to take a reference on page.
 		 */
-		if (is_device_private_page(page))
-			return true;
-
-		/*
-		 * Only allow device public page to be migrated and account for
-		 * the extra reference count imply by ZONE_DEVICE pages.
-		 */
-		if (!is_device_public_page(page))
-			return false;
-		extra++;
+		return is_device_private_page(page);
 	}
 
 	/* For file back page */
@@ -2665,11 +2650,6 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate,
 
 			swp_entry = make_device_private_entry(page, vma->vm_flags & VM_WRITE);
 			entry = swp_entry_to_pte(swp_entry);
-		} else if (is_device_public_page(page)) {
-			entry = pte_mkold(mk_pte(page, READ_ONCE(vma->vm_page_prot)));
-			if (vma->vm_flags & VM_WRITE)
-				entry = pte_mkwrite(pte_mkdirty(entry));
-			entry = pte_mkdevmap(entry);
 		}
 	} else {
 		entry = mk_pte(page, vma->vm_page_prot);
@@ -2789,7 +2769,7 @@ static void migrate_vma_pages(struct migrate_vma *migrate)
 					migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
 					continue;
 				}
-			} else if (!is_device_public_page(newpage)) {
+			} else {
 				/*
 				 * Other types of ZONE_DEVICE page are not
 				 * supported.
diff --git a/mm/swap.c b/mm/swap.c
index 7ede3eddc12a..83107410d29f 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -740,17 +740,6 @@ void release_pages(struct page **pages, int nr)
 		if (is_huge_zero_page(page))
 			continue;
 
-		/* Device public page can not be huge page */
-		if (is_device_public_page(page)) {
-			if (locked_pgdat) {
-				spin_unlock_irqrestore(&locked_pgdat->lru_lock,
-						       flags);
-				locked_pgdat = NULL;
-			}
-			put_devmap_managed_page(page);
-			continue;
-		}
-
 		page = compound_head(page);
 		if (!put_page_testzero(page))
 			continue;
-- 
cgit v1.2.3


From 0092908d16c604b8207c2141ec64b0fa4473bb03 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 26 Jun 2019 14:27:06 +0200
Subject: mm: factor out a devm_request_free_mem_region helper

Keep the physical address allocation that hmm_devmem_add does with the
rest of the resource code, and allow future reuse of it without the hmm
wrapper.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Jason Gunthorpe <jgg@mellanox.com>
Reviewed-by: John Hubbard <jhubbard@nvidia.com>
Reviewed-by: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 include/linux/ioport.h |  2 ++
 kernel/resource.c      | 39 +++++++++++++++++++++++++++++++++++++++
 mm/hmm.c               | 33 ++++-----------------------------
 3 files changed, 45 insertions(+), 29 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ioport.h b/include/linux/ioport.h
index dd961882bc74..a02b290ca08a 100644
--- a/include/linux/ioport.h
+++ b/include/linux/ioport.h
@@ -285,6 +285,8 @@ static inline bool resource_overlaps(struct resource *r1, struct resource *r2)
        return (r1->start <= r2->end && r1->end >= r2->start);
 }
 
+struct resource *devm_request_free_mem_region(struct device *dev,
+		struct resource *base, unsigned long size);
 
 #endif /* __ASSEMBLY__ */
 #endif	/* _LINUX_IOPORT_H */
diff --git a/kernel/resource.c b/kernel/resource.c
index 158f04ec1d4f..d22423e85cf8 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -1628,6 +1628,45 @@ void resource_list_free(struct list_head *head)
 }
 EXPORT_SYMBOL(resource_list_free);
 
+#ifdef CONFIG_DEVICE_PRIVATE
+/**
+ * devm_request_free_mem_region - find free region for device private memory
+ *
+ * @dev: device struct to bind the resource to
+ * @size: size in bytes of the device memory to add
+ * @base: resource tree to look in
+ *
+ * This function tries to find an empty range of physical address big enough to
+ * contain the new resource, so that it can later be hotplugged as ZONE_DEVICE
+ * memory, which in turn allocates struct pages.
+ */
+struct resource *devm_request_free_mem_region(struct device *dev,
+		struct resource *base, unsigned long size)
+{
+	resource_size_t end, addr;
+	struct resource *res;
+
+	size = ALIGN(size, 1UL << PA_SECTION_SHIFT);
+	end = min_t(unsigned long, base->end, (1UL << MAX_PHYSMEM_BITS) - 1);
+	addr = end - size + 1UL;
+
+	for (; addr > size && addr >= base->start; addr -= size) {
+		if (region_intersects(addr, size, 0, IORES_DESC_NONE) !=
+				REGION_DISJOINT)
+			continue;
+
+		res = devm_request_mem_region(dev, addr, size, dev_name(dev));
+		if (!res)
+			return ERR_PTR(-ENOMEM);
+		res->desc = IORES_DESC_DEVICE_PRIVATE_MEMORY;
+		return res;
+	}
+
+	return ERR_PTR(-ERANGE);
+}
+EXPORT_SYMBOL_GPL(devm_request_free_mem_region);
+#endif /* CONFIG_DEVICE_PRIVATE */
+
 static int __init strict_iomem(char *str)
 {
 	if (strstr(str, "relaxed"))
diff --git a/mm/hmm.c b/mm/hmm.c
index e7dd2ab8f9ab..48574f8485bb 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -25,8 +25,6 @@
 #include <linux/mmu_notifier.h>
 #include <linux/memory_hotplug.h>
 
-#define PA_SECTION_SIZE (1UL << PA_SECTION_SHIFT)
-
 #if IS_ENABLED(CONFIG_HMM_MIRROR)
 static const struct mmu_notifier_ops hmm_mmu_notifier_ops;
 
@@ -1408,7 +1406,6 @@ struct hmm_devmem *hmm_devmem_add(const struct hmm_devmem_ops *ops,
 				  unsigned long size)
 {
 	struct hmm_devmem *devmem;
-	resource_size_t addr;
 	void *result;
 	int ret;
 
@@ -1430,32 +1427,10 @@ struct hmm_devmem *hmm_devmem_add(const struct hmm_devmem_ops *ops,
 	if (ret)
 		return ERR_PTR(ret);
 
-	size = ALIGN(size, PA_SECTION_SIZE);
-	addr = min((unsigned long)iomem_resource.end,
-		   (1UL << MAX_PHYSMEM_BITS) - 1);
-	addr = addr - size + 1UL;
-
-	/*
-	 * FIXME add a new helper to quickly walk resource tree and find free
-	 * range
-	 *
-	 * FIXME what about ioport_resource resource ?
-	 */
-	for (; addr > size && addr >= iomem_resource.start; addr -= size) {
-		ret = region_intersects(addr, size, 0, IORES_DESC_NONE);
-		if (ret != REGION_DISJOINT)
-			continue;
-
-		devmem->resource = devm_request_mem_region(device, addr, size,
-							   dev_name(device));
-		if (!devmem->resource)
-			return ERR_PTR(-ENOMEM);
-		break;
-	}
-	if (!devmem->resource)
-		return ERR_PTR(-ERANGE);
-
-	devmem->resource->desc = IORES_DESC_DEVICE_PRIVATE_MEMORY;
+	devmem->resource = devm_request_free_mem_region(device, &iomem_resource,
+			size);
+	if (IS_ERR(devmem->resource))
+		return ERR_CAST(devmem->resource);
 	devmem->pfn_first = devmem->resource->start >> PAGE_SHIFT;
 	devmem->pfn_last = devmem->pfn_first +
 			   (resource_size(devmem->resource) >> PAGE_SHIFT);
-- 
cgit v1.2.3


From 3ed2dcdf54d5bf1f9823b5faf1a702e7cee53982 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 26 Jun 2019 14:27:07 +0200
Subject: memremap: validate the pagemap type passed to devm_memremap_pages

Most pgmap types are only supported when certain config options are
enabled.  Check for a type that is valid for the current configuration
before setting up the pagemap.  For this the usage of the 0 type for
device dax gets replaced with an explicit MEMORY_DEVICE_DEVDAX type.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Ira Weiny <ira.weiny@intel.com>
Reviewed-by: Dan Williams <dan.j.williams@intel.com>
Tested-by: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/dax/device.c     |  1 +
 include/linux/memremap.h |  8 ++++++++
 kernel/memremap.c        | 22 ++++++++++++++++++++++
 3 files changed, 31 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/dax/device.c b/drivers/dax/device.c
index 8465d12fecba..79014baa782d 100644
--- a/drivers/dax/device.c
+++ b/drivers/dax/device.c
@@ -468,6 +468,7 @@ int dev_dax_probe(struct device *dev)
 	dev_dax->pgmap.ref = &dev_dax->ref;
 	dev_dax->pgmap.kill = dev_dax_percpu_kill;
 	dev_dax->pgmap.cleanup = dev_dax_percpu_exit;
+	dev_dax->pgmap.type = MEMORY_DEVICE_DEVDAX;
 	addr = devm_memremap_pages(dev, &dev_dax->pgmap);
 	if (IS_ERR(addr))
 		return PTR_ERR(addr);
diff --git a/include/linux/memremap.h b/include/linux/memremap.h
index 995c62c5a48b..0c86f2c5ac9c 100644
--- a/include/linux/memremap.h
+++ b/include/linux/memremap.h
@@ -45,13 +45,21 @@ struct vmem_altmap {
  * wakeup is used to coordinate physical address space management (ex:
  * fs truncate/hole punch) vs pinned pages (ex: device dma).
  *
+ * MEMORY_DEVICE_DEVDAX:
+ * Host memory that has similar access semantics as System RAM i.e. DMA
+ * coherent and supports page pinning. In contrast to
+ * MEMORY_DEVICE_FS_DAX, this memory is access via a device-dax
+ * character device.
+ *
  * MEMORY_DEVICE_PCI_P2PDMA:
  * Device memory residing in a PCI BAR intended for use with Peer-to-Peer
  * transactions.
  */
 enum memory_type {
+	/* 0 is reserved to catch uninitialized type fields */
 	MEMORY_DEVICE_PRIVATE = 1,
 	MEMORY_DEVICE_FS_DAX,
+	MEMORY_DEVICE_DEVDAX,
 	MEMORY_DEVICE_PCI_P2PDMA,
 };
 
diff --git a/kernel/memremap.c b/kernel/memremap.c
index 6e1970719dc2..abda62d1e5a3 100644
--- a/kernel/memremap.c
+++ b/kernel/memremap.c
@@ -157,6 +157,28 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap)
 	pgprot_t pgprot = PAGE_KERNEL;
 	int error, nid, is_ram;
 
+	switch (pgmap->type) {
+	case MEMORY_DEVICE_PRIVATE:
+		if (!IS_ENABLED(CONFIG_DEVICE_PRIVATE)) {
+			WARN(1, "Device private memory not supported\n");
+			return ERR_PTR(-EINVAL);
+		}
+		break;
+	case MEMORY_DEVICE_FS_DAX:
+		if (!IS_ENABLED(CONFIG_ZONE_DEVICE) ||
+		    IS_ENABLED(CONFIG_FS_DAX_LIMITED)) {
+			WARN(1, "File system DAX not supported\n");
+			return ERR_PTR(-EINVAL);
+		}
+		break;
+	case MEMORY_DEVICE_DEVDAX:
+	case MEMORY_DEVICE_PCI_P2PDMA:
+		break;
+	default:
+		WARN(1, "Invalid pgmap type %d\n", pgmap->type);
+		break;
+	}
+
 	if (!pgmap->ref || !pgmap->kill || !pgmap->cleanup) {
 		WARN(1, "Missing reference count teardown definition\n");
 		return ERR_PTR(-EINVAL);
-- 
cgit v1.2.3


From 1e240e8d4a7d92232b6214e02a0a4197a53afd6c Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 26 Jun 2019 14:27:08 +0200
Subject: memremap: move dev_pagemap callbacks into a separate structure

The dev_pagemap is growing too many callbacks.  Move them into a
separate ops structure so that they are not duplicated for multiple
instances, and an attacker can't easily overwrite them.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Logan Gunthorpe <logang@deltatee.com>
Reviewed-by: Jason Gunthorpe <jgg@mellanox.com>
Reviewed-by: Dan Williams <dan.j.williams@intel.com>
Tested-by: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/dax/device.c              | 11 +++++++----
 drivers/dax/pmem/core.c           |  2 +-
 drivers/nvdimm/pmem.c             | 19 +++++++++++--------
 drivers/pci/p2pdma.c              |  8 ++++++--
 include/linux/memremap.h          | 36 ++++++++++++++++++++----------------
 kernel/memremap.c                 | 18 +++++++++---------
 mm/hmm.c                          | 10 +++++++---
 tools/testing/nvdimm/test/iomap.c |  7 ++++---
 8 files changed, 65 insertions(+), 46 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/dax/device.c b/drivers/dax/device.c
index 79014baa782d..f390083a64d7 100644
--- a/drivers/dax/device.c
+++ b/drivers/dax/device.c
@@ -36,9 +36,8 @@ static void dev_dax_percpu_exit(struct percpu_ref *ref)
 	percpu_ref_exit(ref);
 }
 
-static void dev_dax_percpu_kill(struct percpu_ref *data)
+static void dev_dax_percpu_kill(struct percpu_ref *ref)
 {
-	struct percpu_ref *ref = data;
 	struct dev_dax *dev_dax = ref_to_dev_dax(ref);
 
 	dev_dbg(&dev_dax->dev, "%s\n", __func__);
@@ -442,6 +441,11 @@ static void dev_dax_kill(void *dev_dax)
 	kill_dev_dax(dev_dax);
 }
 
+static const struct dev_pagemap_ops dev_dax_pagemap_ops = {
+	.kill		= dev_dax_percpu_kill,
+	.cleanup	= dev_dax_percpu_exit,
+};
+
 int dev_dax_probe(struct device *dev)
 {
 	struct dev_dax *dev_dax = to_dev_dax(dev);
@@ -466,9 +470,8 @@ int dev_dax_probe(struct device *dev)
 		return rc;
 
 	dev_dax->pgmap.ref = &dev_dax->ref;
-	dev_dax->pgmap.kill = dev_dax_percpu_kill;
-	dev_dax->pgmap.cleanup = dev_dax_percpu_exit;
 	dev_dax->pgmap.type = MEMORY_DEVICE_DEVDAX;
+	dev_dax->pgmap.ops = &dev_dax_pagemap_ops;
 	addr = devm_memremap_pages(dev, &dev_dax->pgmap);
 	if (IS_ERR(addr))
 		return PTR_ERR(addr);
diff --git a/drivers/dax/pmem/core.c b/drivers/dax/pmem/core.c
index f9f51786d556..6eb6dfdf19bf 100644
--- a/drivers/dax/pmem/core.c
+++ b/drivers/dax/pmem/core.c
@@ -16,7 +16,7 @@ struct dev_dax *__dax_pmem_probe(struct device *dev, enum dev_dax_subsys subsys)
 	struct dev_dax *dev_dax;
 	struct nd_namespace_io *nsio;
 	struct dax_region *dax_region;
-	struct dev_pagemap pgmap = { 0 };
+	struct dev_pagemap pgmap = { };
 	struct nd_namespace_common *ndns;
 	struct nd_dax *nd_dax = to_nd_dax(dev);
 	struct nd_pfn *nd_pfn = &nd_dax->nd_pfn;
diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index 24d7fe7c74ed..c2449af2b388 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -303,7 +303,7 @@ static const struct attribute_group *pmem_attribute_groups[] = {
 	NULL,
 };
 
-static void __pmem_release_queue(struct percpu_ref *ref)
+static void pmem_pagemap_cleanup(struct percpu_ref *ref)
 {
 	struct request_queue *q;
 
@@ -313,10 +313,10 @@ static void __pmem_release_queue(struct percpu_ref *ref)
 
 static void pmem_release_queue(void *ref)
 {
-	__pmem_release_queue(ref);
+	pmem_pagemap_cleanup(ref);
 }
 
-static void pmem_freeze_queue(struct percpu_ref *ref)
+static void pmem_pagemap_kill(struct percpu_ref *ref)
 {
 	struct request_queue *q;
 
@@ -339,19 +339,24 @@ static void pmem_release_pgmap_ops(void *__pgmap)
 	dev_pagemap_put_ops();
 }
 
-static void fsdax_pagefree(struct page *page, void *data)
+static void pmem_pagemap_page_free(struct page *page, void *data)
 {
 	wake_up_var(&page->_refcount);
 }
 
+static const struct dev_pagemap_ops fsdax_pagemap_ops = {
+	.page_free		= pmem_pagemap_page_free,
+	.kill			= pmem_pagemap_kill,
+	.cleanup		= pmem_pagemap_cleanup,
+};
+
 static int setup_pagemap_fsdax(struct device *dev, struct dev_pagemap *pgmap)
 {
 	dev_pagemap_get_ops();
 	if (devm_add_action_or_reset(dev, pmem_release_pgmap_ops, pgmap))
 		return -ENOMEM;
 	pgmap->type = MEMORY_DEVICE_FS_DAX;
-	pgmap->page_free = fsdax_pagefree;
-
+	pgmap->ops = &fsdax_pagemap_ops;
 	return 0;
 }
 
@@ -409,8 +414,6 @@ static int pmem_attach_disk(struct device *dev,
 
 	pmem->pfn_flags = PFN_DEV;
 	pmem->pgmap.ref = &q->q_usage_counter;
-	pmem->pgmap.kill = pmem_freeze_queue;
-	pmem->pgmap.cleanup = __pmem_release_queue;
 	if (is_nd_pfn(dev)) {
 		if (setup_pagemap_fsdax(dev, &pmem->pgmap))
 			return -ENOMEM;
diff --git a/drivers/pci/p2pdma.c b/drivers/pci/p2pdma.c
index a4994aa3acc0..fb039259d463 100644
--- a/drivers/pci/p2pdma.c
+++ b/drivers/pci/p2pdma.c
@@ -153,6 +153,11 @@ out:
 	return error;
 }
 
+static const struct dev_pagemap_ops pci_p2pdma_pagemap_ops = {
+	.kill		= pci_p2pdma_percpu_kill,
+	.cleanup	= pci_p2pdma_percpu_cleanup,
+};
+
 /**
  * pci_p2pdma_add_resource - add memory for use as p2p memory
  * @pdev: the device to add the memory to
@@ -208,8 +213,7 @@ int pci_p2pdma_add_resource(struct pci_dev *pdev, int bar, size_t size,
 	pgmap->type = MEMORY_DEVICE_PCI_P2PDMA;
 	pgmap->pci_p2pdma_bus_offset = pci_bus_address(pdev, bar) -
 		pci_resource_start(pdev, bar);
-	pgmap->kill = pci_p2pdma_percpu_kill;
-	pgmap->cleanup = pci_p2pdma_percpu_cleanup;
+	pgmap->ops = &pci_p2pdma_pagemap_ops;
 
 	addr = devm_memremap_pages(&pdev->dev, pgmap);
 	if (IS_ERR(addr)) {
diff --git a/include/linux/memremap.h b/include/linux/memremap.h
index 0c86f2c5ac9c..919755f48c7e 100644
--- a/include/linux/memremap.h
+++ b/include/linux/memremap.h
@@ -63,41 +63,45 @@ enum memory_type {
 	MEMORY_DEVICE_PCI_P2PDMA,
 };
 
-/*
- * Additional notes about MEMORY_DEVICE_PRIVATE may be found in
- * include/linux/hmm.h and Documentation/vm/hmm.rst. There is also a brief
- * explanation in include/linux/memory_hotplug.h.
- *
- * The page_free() callback is called once the page refcount reaches 1
- * (ZONE_DEVICE pages never reach 0 refcount unless there is a refcount bug.
- * This allows the device driver to implement its own memory management.)
- */
-typedef void (*dev_page_free_t)(struct page *page, void *data);
+struct dev_pagemap_ops {
+	/*
+	 * Called once the page refcount reaches 1.  (ZONE_DEVICE pages never
+	 * reach 0 refcount unless there is a refcount bug. This allows the
+	 * device driver to implement its own memory management.)
+	 */
+	void (*page_free)(struct page *page, void *data);
+
+	/*
+	 * Transition the refcount in struct dev_pagemap to the dead state.
+	 */
+	void (*kill)(struct percpu_ref *ref);
+
+	/*
+	 * Wait for refcount in struct dev_pagemap to be idle and reap it.
+	 */
+	void (*cleanup)(struct percpu_ref *ref);
+};
 
 /**
  * struct dev_pagemap - metadata for ZONE_DEVICE mappings
- * @page_free: free page callback when page refcount reaches 1
  * @altmap: pre-allocated/reserved memory for vmemmap allocations
  * @res: physical address range covered by @ref
  * @ref: reference count that pins the devm_memremap_pages() mapping
- * @kill: callback to transition @ref to the dead state
- * @cleanup: callback to wait for @ref to be idle and reap it
  * @dev: host device of the mapping for debug
  * @data: private data pointer for page_free()
  * @type: memory type: see MEMORY_* in memory_hotplug.h
+ * @ops: method table
  */
 struct dev_pagemap {
-	dev_page_free_t page_free;
 	struct vmem_altmap altmap;
 	bool altmap_valid;
 	struct resource res;
 	struct percpu_ref *ref;
-	void (*kill)(struct percpu_ref *ref);
-	void (*cleanup)(struct percpu_ref *ref);
 	struct device *dev;
 	void *data;
 	enum memory_type type;
 	u64 pci_p2pdma_bus_offset;
+	const struct dev_pagemap_ops *ops;
 };
 
 #ifdef CONFIG_ZONE_DEVICE
diff --git a/kernel/memremap.c b/kernel/memremap.c
index abda62d1e5a3..0824237ef979 100644
--- a/kernel/memremap.c
+++ b/kernel/memremap.c
@@ -92,10 +92,10 @@ static void devm_memremap_pages_release(void *data)
 	unsigned long pfn;
 	int nid;
 
-	pgmap->kill(pgmap->ref);
+	pgmap->ops->kill(pgmap->ref);
 	for_each_device_pfn(pfn, pgmap)
 		put_page(pfn_to_page(pfn));
-	pgmap->cleanup(pgmap->ref);
+	pgmap->ops->cleanup(pgmap->ref);
 
 	/* pages are dead and unused, undo the arch mapping */
 	align_start = res->start & ~(SECTION_SIZE - 1);
@@ -128,8 +128,8 @@ static void devm_memremap_pages_release(void *data)
  * @pgmap: pointer to a struct dev_pagemap
  *
  * Notes:
- * 1/ At a minimum the res, ref and type members of @pgmap must be initialized
- *    by the caller before passing it to this function
+ * 1/ At a minimum the res, ref and type and ops members of @pgmap must be
+ *    initialized by the caller before passing it to this function
  *
  * 2/ The altmap field may optionally be initialized, in which case altmap_valid
  *    must be set to true
@@ -179,7 +179,8 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap)
 		break;
 	}
 
-	if (!pgmap->ref || !pgmap->kill || !pgmap->cleanup) {
+	if (!pgmap->ref || !pgmap->ops || !pgmap->ops->kill ||
+	    !pgmap->ops->cleanup) {
 		WARN(1, "Missing reference count teardown definition\n");
 		return ERR_PTR(-EINVAL);
 	}
@@ -293,9 +294,8 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap)
  err_pfn_remap:
 	pgmap_array_delete(res);
  err_array:
-	pgmap->kill(pgmap->ref);
-	pgmap->cleanup(pgmap->ref);
-
+	pgmap->ops->kill(pgmap->ref);
+	pgmap->ops->cleanup(pgmap->ref);
 	return ERR_PTR(error);
 }
 EXPORT_SYMBOL_GPL(devm_memremap_pages);
@@ -388,7 +388,7 @@ void __put_devmap_managed_page(struct page *page)
 
 		mem_cgroup_uncharge(page);
 
-		page->pgmap->page_free(page, page->pgmap->data);
+		page->pgmap->ops->page_free(page, page->pgmap->data);
 	} else if (!count)
 		__put_page(page);
 }
diff --git a/mm/hmm.c b/mm/hmm.c
index 48574f8485bb..583a02a16872 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -1384,6 +1384,12 @@ static void hmm_devmem_free(struct page *page, void *data)
 	devmem->ops->free(devmem, page);
 }
 
+static const struct dev_pagemap_ops hmm_pagemap_ops = {
+	.page_free		= hmm_devmem_free,
+	.kill			= hmm_devmem_ref_kill,
+	.cleanup		= hmm_devmem_ref_exit,
+};
+
 /*
  * hmm_devmem_add() - hotplug ZONE_DEVICE memory for device memory
  *
@@ -1438,12 +1444,10 @@ struct hmm_devmem *hmm_devmem_add(const struct hmm_devmem_ops *ops,
 
 	devmem->pagemap.type = MEMORY_DEVICE_PRIVATE;
 	devmem->pagemap.res = *devmem->resource;
-	devmem->pagemap.page_free = hmm_devmem_free;
+	devmem->pagemap.ops = &hmm_pagemap_ops;
 	devmem->pagemap.altmap_valid = false;
 	devmem->pagemap.ref = &devmem->ref;
 	devmem->pagemap.data = devmem;
-	devmem->pagemap.kill = hmm_devmem_ref_kill;
-	devmem->pagemap.cleanup = hmm_devmem_ref_exit;
 
 	result = devm_memremap_pages(devmem->device, &devmem->pagemap);
 	if (IS_ERR(result))
diff --git a/tools/testing/nvdimm/test/iomap.c b/tools/testing/nvdimm/test/iomap.c
index 076df22e4bda..cf3f064a697d 100644
--- a/tools/testing/nvdimm/test/iomap.c
+++ b/tools/testing/nvdimm/test/iomap.c
@@ -100,9 +100,10 @@ static void nfit_test_kill(void *_pgmap)
 {
 	struct dev_pagemap *pgmap = _pgmap;
 
-	WARN_ON(!pgmap || !pgmap->ref || !pgmap->kill || !pgmap->cleanup);
-	pgmap->kill(pgmap->ref);
-	pgmap->cleanup(pgmap->ref);
+	WARN_ON(!pgmap || !pgmap->ref || !pgmap->ops || !pgmap->ops->kill ||
+		!pgmap->ops->cleanup);
+	pgmap->ops->kill(pgmap->ref);
+	pgmap->ops->cleanup(pgmap->ref);
 }
 
 void *__wrap_devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap)
-- 
cgit v1.2.3


From d8668bb0451c3c45b59dbcde2654e0539aad1d2a Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 26 Jun 2019 14:27:09 +0200
Subject: memremap: pass a struct dev_pagemap to ->kill and ->cleanup

Passing the actual typed structure leads to more understandable code
vs just passing the ref member.

Reported-by: Logan Gunthorpe <logang@deltatee.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Logan Gunthorpe <logang@deltatee.com>
Reviewed-by: Jason Gunthorpe <jgg@mellanox.com>
Reviewed-by: Dan Williams <dan.j.williams@intel.com>
Tested-by: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/dax/device.c              | 12 ++++++------
 drivers/nvdimm/pmem.c             | 18 +++++++++---------
 drivers/pci/p2pdma.c              |  9 +++++----
 include/linux/memremap.h          |  4 ++--
 kernel/memremap.c                 |  8 ++++----
 mm/hmm.c                          | 10 +++++-----
 tools/testing/nvdimm/test/iomap.c |  4 ++--
 7 files changed, 33 insertions(+), 32 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/dax/device.c b/drivers/dax/device.c
index f390083a64d7..b5257038c188 100644
--- a/drivers/dax/device.c
+++ b/drivers/dax/device.c
@@ -27,21 +27,21 @@ static void dev_dax_percpu_release(struct percpu_ref *ref)
 	complete(&dev_dax->cmp);
 }
 
-static void dev_dax_percpu_exit(struct percpu_ref *ref)
+static void dev_dax_percpu_exit(struct dev_pagemap *pgmap)
 {
-	struct dev_dax *dev_dax = ref_to_dev_dax(ref);
+	struct dev_dax *dev_dax = container_of(pgmap, struct dev_dax, pgmap);
 
 	dev_dbg(&dev_dax->dev, "%s\n", __func__);
 	wait_for_completion(&dev_dax->cmp);
-	percpu_ref_exit(ref);
+	percpu_ref_exit(pgmap->ref);
 }
 
-static void dev_dax_percpu_kill(struct percpu_ref *ref)
+static void dev_dax_percpu_kill(struct dev_pagemap *pgmap)
 {
-	struct dev_dax *dev_dax = ref_to_dev_dax(ref);
+	struct dev_dax *dev_dax = container_of(pgmap, struct dev_dax, pgmap);
 
 	dev_dbg(&dev_dax->dev, "%s\n", __func__);
-	percpu_ref_kill(ref);
+	percpu_ref_kill(pgmap->ref);
 }
 
 static int check_vma(struct dev_dax *dev_dax, struct vm_area_struct *vma,
diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index c2449af2b388..9dac48359353 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -303,24 +303,24 @@ static const struct attribute_group *pmem_attribute_groups[] = {
 	NULL,
 };
 
-static void pmem_pagemap_cleanup(struct percpu_ref *ref)
+static void pmem_pagemap_cleanup(struct dev_pagemap *pgmap)
 {
-	struct request_queue *q;
+	struct request_queue *q =
+		container_of(pgmap->ref, struct request_queue, q_usage_counter);
 
-	q = container_of(ref, typeof(*q), q_usage_counter);
 	blk_cleanup_queue(q);
 }
 
-static void pmem_release_queue(void *ref)
+static void pmem_release_queue(void *pgmap)
 {
-	pmem_pagemap_cleanup(ref);
+	pmem_pagemap_cleanup(pgmap);
 }
 
-static void pmem_pagemap_kill(struct percpu_ref *ref)
+static void pmem_pagemap_kill(struct dev_pagemap *pgmap)
 {
-	struct request_queue *q;
+	struct request_queue *q =
+		container_of(pgmap->ref, struct request_queue, q_usage_counter);
 
-	q = container_of(ref, typeof(*q), q_usage_counter);
 	blk_freeze_queue_start(q);
 }
 
@@ -435,7 +435,7 @@ static int pmem_attach_disk(struct device *dev,
 		memcpy(&bb_res, &pmem->pgmap.res, sizeof(bb_res));
 	} else {
 		if (devm_add_action_or_reset(dev, pmem_release_queue,
-					&q->q_usage_counter))
+					&pmem->pgmap))
 			return -ENOMEM;
 		addr = devm_memremap(dev, pmem->phys_addr,
 				pmem->size, ARCH_MEMREMAP_PMEM);
diff --git a/drivers/pci/p2pdma.c b/drivers/pci/p2pdma.c
index fb039259d463..fa6249e4ed5f 100644
--- a/drivers/pci/p2pdma.c
+++ b/drivers/pci/p2pdma.c
@@ -91,14 +91,15 @@ static void pci_p2pdma_percpu_release(struct percpu_ref *ref)
 	complete(&p2p_pgmap->ref_done);
 }
 
-static void pci_p2pdma_percpu_kill(struct percpu_ref *ref)
+static void pci_p2pdma_percpu_kill(struct dev_pagemap *pgmap)
 {
-	percpu_ref_kill(ref);
+	percpu_ref_kill(pgmap->ref);
 }
 
-static void pci_p2pdma_percpu_cleanup(struct percpu_ref *ref)
+static void pci_p2pdma_percpu_cleanup(struct dev_pagemap *pgmap)
 {
-	struct p2pdma_pagemap *p2p_pgmap = to_p2p_pgmap(ref);
+	struct p2pdma_pagemap *p2p_pgmap =
+		container_of(pgmap, struct p2pdma_pagemap, pgmap);
 
 	wait_for_completion(&p2p_pgmap->ref_done);
 	percpu_ref_exit(&p2p_pgmap->ref);
diff --git a/include/linux/memremap.h b/include/linux/memremap.h
index 919755f48c7e..b8666a0d8665 100644
--- a/include/linux/memremap.h
+++ b/include/linux/memremap.h
@@ -74,12 +74,12 @@ struct dev_pagemap_ops {
 	/*
 	 * Transition the refcount in struct dev_pagemap to the dead state.
 	 */
-	void (*kill)(struct percpu_ref *ref);
+	void (*kill)(struct dev_pagemap *pgmap);
 
 	/*
 	 * Wait for refcount in struct dev_pagemap to be idle and reap it.
 	 */
-	void (*cleanup)(struct percpu_ref *ref);
+	void (*cleanup)(struct dev_pagemap *pgmap);
 };
 
 /**
diff --git a/kernel/memremap.c b/kernel/memremap.c
index 0824237ef979..00c1ceb60c19 100644
--- a/kernel/memremap.c
+++ b/kernel/memremap.c
@@ -92,10 +92,10 @@ static void devm_memremap_pages_release(void *data)
 	unsigned long pfn;
 	int nid;
 
-	pgmap->ops->kill(pgmap->ref);
+	pgmap->ops->kill(pgmap);
 	for_each_device_pfn(pfn, pgmap)
 		put_page(pfn_to_page(pfn));
-	pgmap->ops->cleanup(pgmap->ref);
+	pgmap->ops->cleanup(pgmap);
 
 	/* pages are dead and unused, undo the arch mapping */
 	align_start = res->start & ~(SECTION_SIZE - 1);
@@ -294,8 +294,8 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap)
  err_pfn_remap:
 	pgmap_array_delete(res);
  err_array:
-	pgmap->ops->kill(pgmap->ref);
-	pgmap->ops->cleanup(pgmap->ref);
+	pgmap->ops->kill(pgmap);
+	pgmap->ops->cleanup(pgmap);
 	return ERR_PTR(error);
 }
 EXPORT_SYMBOL_GPL(devm_memremap_pages);
diff --git a/mm/hmm.c b/mm/hmm.c
index 583a02a16872..987793fba923 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -1352,18 +1352,18 @@ static void hmm_devmem_ref_release(struct percpu_ref *ref)
 	complete(&devmem->completion);
 }
 
-static void hmm_devmem_ref_exit(struct percpu_ref *ref)
+static void hmm_devmem_ref_exit(struct dev_pagemap *pgmap)
 {
 	struct hmm_devmem *devmem;
 
-	devmem = container_of(ref, struct hmm_devmem, ref);
+	devmem = container_of(pgmap, struct hmm_devmem, pagemap);
 	wait_for_completion(&devmem->completion);
-	percpu_ref_exit(ref);
+	percpu_ref_exit(pgmap->ref);
 }
 
-static void hmm_devmem_ref_kill(struct percpu_ref *ref)
+static void hmm_devmem_ref_kill(struct dev_pagemap *pgmap)
 {
-	percpu_ref_kill(ref);
+	percpu_ref_kill(pgmap->ref);
 }
 
 static vm_fault_t hmm_devmem_fault(struct vm_area_struct *vma,
diff --git a/tools/testing/nvdimm/test/iomap.c b/tools/testing/nvdimm/test/iomap.c
index cf3f064a697d..82f901569e06 100644
--- a/tools/testing/nvdimm/test/iomap.c
+++ b/tools/testing/nvdimm/test/iomap.c
@@ -102,8 +102,8 @@ static void nfit_test_kill(void *_pgmap)
 
 	WARN_ON(!pgmap || !pgmap->ref || !pgmap->ops || !pgmap->ops->kill ||
 		!pgmap->ops->cleanup);
-	pgmap->ops->kill(pgmap->ref);
-	pgmap->ops->cleanup(pgmap->ref);
+	pgmap->ops->kill(pgmap);
+	pgmap->ops->cleanup(pgmap);
 }
 
 void *__wrap_devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap)
-- 
cgit v1.2.3


From f6a55e1a3fe6b3bb294a80a05437fcf86488d819 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 26 Jun 2019 14:27:10 +0200
Subject: memremap: lift the devmap_enable manipulation into
 devm_memremap_pages

Just check if there is a ->page_free operation set and take care of the
static key enable, as well as the put using device managed resources.
Also check that a ->page_free is provided for the pgmap types that
require it, and check for a valid type as well while we are at it.

Note that this also fixes the fact that hmm never called
dev_pagemap_put_ops and thus would leave the slow path enabled forever,
even after a device driver unload or disable.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Ira Weiny <ira.weiny@intel.com>
Reviewed-by: Dan Williams <dan.j.williams@intel.com>
Tested-by: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/nvdimm/pmem.c | 23 ++++----------------
 include/linux/mm.h    | 10 ---------
 kernel/memremap.c     | 59 ++++++++++++++++++++++++++++++++-------------------
 mm/hmm.c              |  2 --
 4 files changed, 41 insertions(+), 53 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index 9dac48359353..48767171a4df 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -334,11 +334,6 @@ static void pmem_release_disk(void *__pmem)
 	put_disk(pmem->disk);
 }
 
-static void pmem_release_pgmap_ops(void *__pgmap)
-{
-	dev_pagemap_put_ops();
-}
-
 static void pmem_pagemap_page_free(struct page *page, void *data)
 {
 	wake_up_var(&page->_refcount);
@@ -350,16 +345,6 @@ static const struct dev_pagemap_ops fsdax_pagemap_ops = {
 	.cleanup		= pmem_pagemap_cleanup,
 };
 
-static int setup_pagemap_fsdax(struct device *dev, struct dev_pagemap *pgmap)
-{
-	dev_pagemap_get_ops();
-	if (devm_add_action_or_reset(dev, pmem_release_pgmap_ops, pgmap))
-		return -ENOMEM;
-	pgmap->type = MEMORY_DEVICE_FS_DAX;
-	pgmap->ops = &fsdax_pagemap_ops;
-	return 0;
-}
-
 static int pmem_attach_disk(struct device *dev,
 		struct nd_namespace_common *ndns)
 {
@@ -415,8 +400,8 @@ static int pmem_attach_disk(struct device *dev,
 	pmem->pfn_flags = PFN_DEV;
 	pmem->pgmap.ref = &q->q_usage_counter;
 	if (is_nd_pfn(dev)) {
-		if (setup_pagemap_fsdax(dev, &pmem->pgmap))
-			return -ENOMEM;
+		pmem->pgmap.type = MEMORY_DEVICE_FS_DAX;
+		pmem->pgmap.ops = &fsdax_pagemap_ops;
 		addr = devm_memremap_pages(dev, &pmem->pgmap);
 		pfn_sb = nd_pfn->pfn_sb;
 		pmem->data_offset = le64_to_cpu(pfn_sb->dataoff);
@@ -428,8 +413,8 @@ static int pmem_attach_disk(struct device *dev,
 	} else if (pmem_should_map_pages(dev)) {
 		memcpy(&pmem->pgmap.res, &nsio->res, sizeof(pmem->pgmap.res));
 		pmem->pgmap.altmap_valid = false;
-		if (setup_pagemap_fsdax(dev, &pmem->pgmap))
-			return -ENOMEM;
+		pmem->pgmap.type = MEMORY_DEVICE_FS_DAX;
+		pmem->pgmap.ops = &fsdax_pagemap_ops;
 		addr = devm_memremap_pages(dev, &pmem->pgmap);
 		pmem->pfn_flags |= PFN_MAP;
 		memcpy(&bb_res, &pmem->pgmap.res, sizeof(bb_res));
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 7399f9f08de6..2425f4167ec2 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -932,8 +932,6 @@ static inline bool is_zone_device_page(const struct page *page)
 #endif
 
 #ifdef CONFIG_DEV_PAGEMAP_OPS
-void dev_pagemap_get_ops(void);
-void dev_pagemap_put_ops(void);
 void __put_devmap_managed_page(struct page *page);
 DECLARE_STATIC_KEY_FALSE(devmap_managed_key);
 static inline bool put_devmap_managed_page(struct page *page)
@@ -973,14 +971,6 @@ static inline bool is_pci_p2pdma_page(const struct page *page)
 #endif /* CONFIG_PCI_P2PDMA */
 
 #else /* CONFIG_DEV_PAGEMAP_OPS */
-static inline void dev_pagemap_get_ops(void)
-{
-}
-
-static inline void dev_pagemap_put_ops(void)
-{
-}
-
 static inline bool put_devmap_managed_page(struct page *page)
 {
 	return false;
diff --git a/kernel/memremap.c b/kernel/memremap.c
index 00c1ceb60c19..3219a4c91d07 100644
--- a/kernel/memremap.c
+++ b/kernel/memremap.c
@@ -17,6 +17,35 @@ static DEFINE_XARRAY(pgmap_array);
 #define SECTION_MASK ~((1UL << PA_SECTION_SHIFT) - 1)
 #define SECTION_SIZE (1UL << PA_SECTION_SHIFT)
 
+#ifdef CONFIG_DEV_PAGEMAP_OPS
+DEFINE_STATIC_KEY_FALSE(devmap_managed_key);
+EXPORT_SYMBOL(devmap_managed_key);
+static atomic_t devmap_managed_enable;
+
+static void devmap_managed_enable_put(void *data)
+{
+	if (atomic_dec_and_test(&devmap_managed_enable))
+		static_branch_disable(&devmap_managed_key);
+}
+
+static int devmap_managed_enable_get(struct device *dev, struct dev_pagemap *pgmap)
+{
+	if (!pgmap->ops->page_free) {
+		WARN(1, "Missing page_free method\n");
+		return -EINVAL;
+	}
+
+	if (atomic_inc_return(&devmap_managed_enable) == 1)
+		static_branch_enable(&devmap_managed_key);
+	return devm_add_action_or_reset(dev, devmap_managed_enable_put, NULL);
+}
+#else
+static int devmap_managed_enable_get(struct device *dev, struct dev_pagemap *pgmap)
+{
+	return -EINVAL;
+}
+#endif /* CONFIG_DEV_PAGEMAP_OPS */
+
 #if IS_ENABLED(CONFIG_DEVICE_PRIVATE)
 vm_fault_t device_private_entry_fault(struct vm_area_struct *vma,
 		       unsigned long addr,
@@ -156,6 +185,7 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap)
 	};
 	pgprot_t pgprot = PAGE_KERNEL;
 	int error, nid, is_ram;
+	bool need_devmap_managed = true;
 
 	switch (pgmap->type) {
 	case MEMORY_DEVICE_PRIVATE:
@@ -173,6 +203,7 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap)
 		break;
 	case MEMORY_DEVICE_DEVDAX:
 	case MEMORY_DEVICE_PCI_P2PDMA:
+		need_devmap_managed = false;
 		break;
 	default:
 		WARN(1, "Invalid pgmap type %d\n", pgmap->type);
@@ -185,6 +216,12 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap)
 		return ERR_PTR(-EINVAL);
 	}
 
+	if (need_devmap_managed) {
+		error = devmap_managed_enable_get(dev, pgmap);
+		if (error)
+			return ERR_PTR(error);
+	}
+
 	align_start = res->start & ~(SECTION_SIZE - 1);
 	align_size = ALIGN(res->start + resource_size(res), SECTION_SIZE)
 		- align_start;
@@ -351,28 +388,6 @@ struct dev_pagemap *get_dev_pagemap(unsigned long pfn,
 EXPORT_SYMBOL_GPL(get_dev_pagemap);
 
 #ifdef CONFIG_DEV_PAGEMAP_OPS
-DEFINE_STATIC_KEY_FALSE(devmap_managed_key);
-EXPORT_SYMBOL(devmap_managed_key);
-static atomic_t devmap_enable;
-
-/*
- * Toggle the static key for ->page_free() callbacks when dev_pagemap
- * pages go idle.
- */
-void dev_pagemap_get_ops(void)
-{
-	if (atomic_inc_return(&devmap_enable) == 1)
-		static_branch_enable(&devmap_managed_key);
-}
-EXPORT_SYMBOL_GPL(dev_pagemap_get_ops);
-
-void dev_pagemap_put_ops(void)
-{
-	if (atomic_dec_and_test(&devmap_enable))
-		static_branch_disable(&devmap_managed_key);
-}
-EXPORT_SYMBOL_GPL(dev_pagemap_put_ops);
-
 void __put_devmap_managed_page(struct page *page)
 {
 	int count = page_ref_dec_return(page);
diff --git a/mm/hmm.c b/mm/hmm.c
index 987793fba923..5b0bd5f6a74f 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -1415,8 +1415,6 @@ struct hmm_devmem *hmm_devmem_add(const struct hmm_devmem_ops *ops,
 	void *result;
 	int ret;
 
-	dev_pagemap_get_ops();
-
 	devmem = devm_kzalloc(device, sizeof(*devmem), GFP_KERNEL);
 	if (!devmem)
 		return ERR_PTR(-ENOMEM);
-- 
cgit v1.2.3


From 897e6365cda6ba6356e83a3aaa68dec82ef4c548 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 26 Jun 2019 14:27:11 +0200
Subject: memremap: add a migrate_to_ram method to struct dev_pagemap_ops

This replaces the hacky ->fault callback, which is currently directly
called from common code through a hmm specific data structure as an
exercise in layering violations.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Ralph Campbell <rcampbell@nvidia.com>
Reviewed-by: Jason Gunthorpe <jgg@mellanox.com>
Reviewed-by: Dan Williams <dan.j.williams@intel.com>
Tested-by: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 include/linux/hmm.h      |  6 ------
 include/linux/memremap.h |  6 ++++++
 include/linux/swapops.h  | 15 ---------------
 kernel/memremap.c        | 35 ++++-------------------------------
 mm/hmm.c                 | 13 +++++--------
 mm/memory.c              |  9 ++-------
 6 files changed, 17 insertions(+), 67 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/hmm.h b/include/linux/hmm.h
index 44a5ac738bb5..ba19c19e24ed 100644
--- a/include/linux/hmm.h
+++ b/include/linux/hmm.h
@@ -692,11 +692,6 @@ struct hmm_devmem_ops {
  * chunk, as an optimization. It must, however, prioritize the faulting address
  * over all the others.
  */
-typedef vm_fault_t (*dev_page_fault_t)(struct vm_area_struct *vma,
-				unsigned long addr,
-				const struct page *page,
-				unsigned int flags,
-				pmd_t *pmdp);
 
 struct hmm_devmem {
 	struct completion		completion;
@@ -707,7 +702,6 @@ struct hmm_devmem {
 	struct dev_pagemap		pagemap;
 	const struct hmm_devmem_ops	*ops;
 	struct percpu_ref		ref;
-	dev_page_fault_t		page_fault;
 };
 
 /*
diff --git a/include/linux/memremap.h b/include/linux/memremap.h
index b8666a0d8665..ac985bd03a7f 100644
--- a/include/linux/memremap.h
+++ b/include/linux/memremap.h
@@ -80,6 +80,12 @@ struct dev_pagemap_ops {
 	 * Wait for refcount in struct dev_pagemap to be idle and reap it.
 	 */
 	void (*cleanup)(struct dev_pagemap *pgmap);
+
+	/*
+	 * Used for private (un-addressable) device memory only.  Must migrate
+	 * the page back to a CPU accessible page.
+	 */
+	vm_fault_t (*migrate_to_ram)(struct vm_fault *vmf);
 };
 
 /**
diff --git a/include/linux/swapops.h b/include/linux/swapops.h
index 4d961668e5fc..15bdb6fe71e5 100644
--- a/include/linux/swapops.h
+++ b/include/linux/swapops.h
@@ -129,12 +129,6 @@ static inline struct page *device_private_entry_to_page(swp_entry_t entry)
 {
 	return pfn_to_page(swp_offset(entry));
 }
-
-vm_fault_t device_private_entry_fault(struct vm_area_struct *vma,
-		       unsigned long addr,
-		       swp_entry_t entry,
-		       unsigned int flags,
-		       pmd_t *pmdp);
 #else /* CONFIG_DEVICE_PRIVATE */
 static inline swp_entry_t make_device_private_entry(struct page *page, bool write)
 {
@@ -164,15 +158,6 @@ static inline struct page *device_private_entry_to_page(swp_entry_t entry)
 {
 	return NULL;
 }
-
-static inline vm_fault_t device_private_entry_fault(struct vm_area_struct *vma,
-				     unsigned long addr,
-				     swp_entry_t entry,
-				     unsigned int flags,
-				     pmd_t *pmdp)
-{
-	return VM_FAULT_SIGBUS;
-}
 #endif /* CONFIG_DEVICE_PRIVATE */
 
 #ifdef CONFIG_MIGRATION
diff --git a/kernel/memremap.c b/kernel/memremap.c
index 3219a4c91d07..c06a5487dda7 100644
--- a/kernel/memremap.c
+++ b/kernel/memremap.c
@@ -11,7 +11,6 @@
 #include <linux/types.h>
 #include <linux/wait_bit.h>
 #include <linux/xarray.h>
-#include <linux/hmm.h>
 
 static DEFINE_XARRAY(pgmap_array);
 #define SECTION_MASK ~((1UL << PA_SECTION_SHIFT) - 1)
@@ -46,36 +45,6 @@ static int devmap_managed_enable_get(struct device *dev, struct dev_pagemap *pgm
 }
 #endif /* CONFIG_DEV_PAGEMAP_OPS */
 
-#if IS_ENABLED(CONFIG_DEVICE_PRIVATE)
-vm_fault_t device_private_entry_fault(struct vm_area_struct *vma,
-		       unsigned long addr,
-		       swp_entry_t entry,
-		       unsigned int flags,
-		       pmd_t *pmdp)
-{
-	struct page *page = device_private_entry_to_page(entry);
-	struct hmm_devmem *devmem;
-
-	devmem = container_of(page->pgmap, typeof(*devmem), pagemap);
-
-	/*
-	 * The page_fault() callback must migrate page back to system memory
-	 * so that CPU can access it. This might fail for various reasons
-	 * (device issue, device was unsafely unplugged, ...). When such
-	 * error conditions happen, the callback must return VM_FAULT_SIGBUS.
-	 *
-	 * Note that because memory cgroup charges are accounted to the device
-	 * memory, this should never fail because of memory restrictions (but
-	 * allocation of regular system page might still fail because we are
-	 * out of memory).
-	 *
-	 * There is a more in-depth description of what that callback can and
-	 * cannot do, in include/linux/memremap.h
-	 */
-	return devmem->page_fault(vma, addr, page, flags, pmdp);
-}
-#endif /* CONFIG_DEVICE_PRIVATE */
-
 static void pgmap_array_delete(struct resource *res)
 {
 	xa_store_range(&pgmap_array, PHYS_PFN(res->start), PHYS_PFN(res->end),
@@ -193,6 +162,10 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap)
 			WARN(1, "Device private memory not supported\n");
 			return ERR_PTR(-EINVAL);
 		}
+		if (!pgmap->ops || !pgmap->ops->migrate_to_ram) {
+			WARN(1, "Missing migrate_to_ram method\n");
+			return ERR_PTR(-EINVAL);
+		}
 		break;
 	case MEMORY_DEVICE_FS_DAX:
 		if (!IS_ENABLED(CONFIG_ZONE_DEVICE) ||
diff --git a/mm/hmm.c b/mm/hmm.c
index 5b0bd5f6a74f..96633ee066d8 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -1366,15 +1366,12 @@ static void hmm_devmem_ref_kill(struct dev_pagemap *pgmap)
 	percpu_ref_kill(pgmap->ref);
 }
 
-static vm_fault_t hmm_devmem_fault(struct vm_area_struct *vma,
-			    unsigned long addr,
-			    const struct page *page,
-			    unsigned int flags,
-			    pmd_t *pmdp)
+static vm_fault_t hmm_devmem_migrate_to_ram(struct vm_fault *vmf)
 {
-	struct hmm_devmem *devmem = page->pgmap->data;
+	struct hmm_devmem *devmem = vmf->page->pgmap->data;
 
-	return devmem->ops->fault(devmem, vma, addr, page, flags, pmdp);
+	return devmem->ops->fault(devmem, vmf->vma, vmf->address, vmf->page,
+			vmf->flags, vmf->pmd);
 }
 
 static void hmm_devmem_free(struct page *page, void *data)
@@ -1388,6 +1385,7 @@ static const struct dev_pagemap_ops hmm_pagemap_ops = {
 	.page_free		= hmm_devmem_free,
 	.kill			= hmm_devmem_ref_kill,
 	.cleanup		= hmm_devmem_ref_exit,
+	.migrate_to_ram		= hmm_devmem_migrate_to_ram,
 };
 
 /*
@@ -1438,7 +1436,6 @@ struct hmm_devmem *hmm_devmem_add(const struct hmm_devmem_ops *ops,
 	devmem->pfn_first = devmem->resource->start >> PAGE_SHIFT;
 	devmem->pfn_last = devmem->pfn_first +
 			   (resource_size(devmem->resource) >> PAGE_SHIFT);
-	devmem->page_fault = hmm_devmem_fault;
 
 	devmem->pagemap.type = MEMORY_DEVICE_PRIVATE;
 	devmem->pagemap.res = *devmem->resource;
diff --git a/mm/memory.c b/mm/memory.c
index 2d14f4c7e152..d437ccdb210c 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2748,13 +2748,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
 			migration_entry_wait(vma->vm_mm, vmf->pmd,
 					     vmf->address);
 		} else if (is_device_private_entry(entry)) {
-			/*
-			 * For un-addressable device memory we call the pgmap
-			 * fault handler callback. The callback must migrate
-			 * the page back to some CPU accessible page.
-			 */
-			ret = device_private_entry_fault(vma, vmf->address, entry,
-						 vmf->flags, vmf->pmd);
+			vmf->page = device_private_entry_to_page(entry);
+			ret = vmf->page->pgmap->ops->migrate_to_ram(vmf);
 		} else if (is_hwpoison_entry(entry)) {
 			ret = VM_FAULT_HWPOISON;
 		} else {
-- 
cgit v1.2.3


From 80a72d0af05ae97a8b106c172e431072ba587492 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 26 Jun 2019 14:27:12 +0200
Subject: memremap: remove the data field in struct dev_pagemap

struct dev_pagemap is always embedded into a containing structure, so
there is no need for an additional private data field.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Jason Gunthorpe <jgg@mellanox.com>
Reviewed-by: Dan Williams <dan.j.williams@intel.com>
Tested-by: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/nvdimm/pmem.c    | 2 +-
 include/linux/memremap.h | 3 +--
 kernel/memremap.c        | 2 +-
 mm/hmm.c                 | 9 +++++----
 4 files changed, 8 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index 48767171a4df..093408ce40ad 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -334,7 +334,7 @@ static void pmem_release_disk(void *__pmem)
 	put_disk(pmem->disk);
 }
 
-static void pmem_pagemap_page_free(struct page *page, void *data)
+static void pmem_pagemap_page_free(struct page *page)
 {
 	wake_up_var(&page->_refcount);
 }
diff --git a/include/linux/memremap.h b/include/linux/memremap.h
index ac985bd03a7f..336eca601dad 100644
--- a/include/linux/memremap.h
+++ b/include/linux/memremap.h
@@ -69,7 +69,7 @@ struct dev_pagemap_ops {
 	 * reach 0 refcount unless there is a refcount bug. This allows the
 	 * device driver to implement its own memory management.)
 	 */
-	void (*page_free)(struct page *page, void *data);
+	void (*page_free)(struct page *page);
 
 	/*
 	 * Transition the refcount in struct dev_pagemap to the dead state.
@@ -104,7 +104,6 @@ struct dev_pagemap {
 	struct resource res;
 	struct percpu_ref *ref;
 	struct device *dev;
-	void *data;
 	enum memory_type type;
 	u64 pci_p2pdma_bus_offset;
 	const struct dev_pagemap_ops *ops;
diff --git a/kernel/memremap.c b/kernel/memremap.c
index c06a5487dda7..6c3dbb692037 100644
--- a/kernel/memremap.c
+++ b/kernel/memremap.c
@@ -376,7 +376,7 @@ void __put_devmap_managed_page(struct page *page)
 
 		mem_cgroup_uncharge(page);
 
-		page->pgmap->ops->page_free(page, page->pgmap->data);
+		page->pgmap->ops->page_free(page);
 	} else if (!count)
 		__put_page(page);
 }
diff --git a/mm/hmm.c b/mm/hmm.c
index 96633ee066d8..36e25cdbdac1 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -1368,15 +1368,17 @@ static void hmm_devmem_ref_kill(struct dev_pagemap *pgmap)
 
 static vm_fault_t hmm_devmem_migrate_to_ram(struct vm_fault *vmf)
 {
-	struct hmm_devmem *devmem = vmf->page->pgmap->data;
+	struct hmm_devmem *devmem =
+		container_of(vmf->page->pgmap, struct hmm_devmem, pagemap);
 
 	return devmem->ops->fault(devmem, vmf->vma, vmf->address, vmf->page,
 			vmf->flags, vmf->pmd);
 }
 
-static void hmm_devmem_free(struct page *page, void *data)
+static void hmm_devmem_free(struct page *page)
 {
-	struct hmm_devmem *devmem = data;
+	struct hmm_devmem *devmem =
+		container_of(page->pgmap, struct hmm_devmem, pagemap);
 
 	devmem->ops->free(devmem, page);
 }
@@ -1442,7 +1444,6 @@ struct hmm_devmem *hmm_devmem_add(const struct hmm_devmem_ops *ops,
 	devmem->pagemap.ops = &hmm_pagemap_ops;
 	devmem->pagemap.altmap_valid = false;
 	devmem->pagemap.ref = &devmem->ref;
-	devmem->pagemap.data = devmem;
 
 	result = devm_memremap_pages(devmem->device, &devmem->pagemap);
 	if (IS_ERR(result))
-- 
cgit v1.2.3


From 514caf23a70fd697fa2ece238b2cd8dcc73fb16f Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 26 Jun 2019 14:27:13 +0200
Subject: memremap: replace the altmap_valid field with a PGMAP_ALTMAP_VALID
 flag

Add a flags field to struct dev_pagemap to replace the altmap_valid
boolean to be a little more extensible.  Also add a pgmap_altmap() helper
to find the optional altmap, and use it to clean up the code that
accesses the altmap.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Ira Weiny <ira.weiny@intel.com>
Reviewed-by: Dan Williams <dan.j.williams@intel.com>
Tested-by: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 arch/powerpc/mm/mem.c     | 10 +---------
 arch/x86/mm/init_64.c     |  8 ++------
 drivers/nvdimm/pfn_devs.c |  3 +--
 drivers/nvdimm/pmem.c     |  1 -
 include/linux/memremap.h  | 12 +++++++++++-
 kernel/memremap.c         | 26 ++++++++++----------------
 mm/hmm.c                  |  1 -
 mm/memory_hotplug.c       |  6 ++----
 mm/page_alloc.c           |  5 ++---
 9 files changed, 29 insertions(+), 43 deletions(-)

(limited to 'include/linux')

diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
index 2540d3b2588c..a2923c5c1982 100644
--- a/arch/powerpc/mm/mem.c
+++ b/arch/powerpc/mm/mem.c
@@ -131,17 +131,9 @@ void __ref arch_remove_memory(int nid, u64 start, u64 size,
 {
 	unsigned long start_pfn = start >> PAGE_SHIFT;
 	unsigned long nr_pages = size >> PAGE_SHIFT;
-	struct page *page;
+	struct page *page = pfn_to_page(start_pfn) + vmem_altmap_offset(altmap);
 	int ret;
 
-	/*
-	 * If we have an altmap then we need to skip over any reserved PFNs
-	 * when querying the zone.
-	 */
-	page = pfn_to_page(start_pfn);
-	if (altmap)
-		page += vmem_altmap_offset(altmap);
-
 	__remove_pages(page_zone(page), start_pfn, nr_pages, altmap);
 
 	/* Remove htab bolted mappings for this section of memory */
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 0f01c7b1d217..08bbf648827b 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -1213,13 +1213,9 @@ void __ref arch_remove_memory(int nid, u64 start, u64 size,
 {
 	unsigned long start_pfn = start >> PAGE_SHIFT;
 	unsigned long nr_pages = size >> PAGE_SHIFT;
-	struct page *page = pfn_to_page(start_pfn);
-	struct zone *zone;
+	struct page *page = pfn_to_page(start_pfn) + vmem_altmap_offset(altmap);
+	struct zone *zone = page_zone(page);
 
-	/* With altmap the first mapped page is offset from @start */
-	if (altmap)
-		page += vmem_altmap_offset(altmap);
-	zone = page_zone(page);
 	__remove_pages(zone, start_pfn, nr_pages, altmap);
 	kernel_physical_mapping_remove(start, start + size);
 }
diff --git a/drivers/nvdimm/pfn_devs.c b/drivers/nvdimm/pfn_devs.c
index 0f81fc56bbfd..55fb6b7433ed 100644
--- a/drivers/nvdimm/pfn_devs.c
+++ b/drivers/nvdimm/pfn_devs.c
@@ -622,7 +622,6 @@ static int __nvdimm_setup_pfn(struct nd_pfn *nd_pfn, struct dev_pagemap *pgmap)
 		if (offset < reserve)
 			return -EINVAL;
 		nd_pfn->npfns = le64_to_cpu(pfn_sb->npfns);
-		pgmap->altmap_valid = false;
 	} else if (nd_pfn->mode == PFN_MODE_PMEM) {
 		nd_pfn->npfns = PFN_SECTION_ALIGN_UP((resource_size(res)
 					- offset) / PAGE_SIZE);
@@ -634,7 +633,7 @@ static int __nvdimm_setup_pfn(struct nd_pfn *nd_pfn, struct dev_pagemap *pgmap)
 		memcpy(altmap, &__altmap, sizeof(*altmap));
 		altmap->free = PHYS_PFN(offset - reserve);
 		altmap->alloc = 0;
-		pgmap->altmap_valid = true;
+		pgmap->flags |= PGMAP_ALTMAP_VALID;
 	} else
 		return -ENXIO;
 
diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index 093408ce40ad..e7d8cc9f41e8 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -412,7 +412,6 @@ static int pmem_attach_disk(struct device *dev,
 		bb_res.start += pmem->data_offset;
 	} else if (pmem_should_map_pages(dev)) {
 		memcpy(&pmem->pgmap.res, &nsio->res, sizeof(pmem->pgmap.res));
-		pmem->pgmap.altmap_valid = false;
 		pmem->pgmap.type = MEMORY_DEVICE_FS_DAX;
 		pmem->pgmap.ops = &fsdax_pagemap_ops;
 		addr = devm_memremap_pages(dev, &pmem->pgmap);
diff --git a/include/linux/memremap.h b/include/linux/memremap.h
index 336eca601dad..e25685b878e9 100644
--- a/include/linux/memremap.h
+++ b/include/linux/memremap.h
@@ -88,6 +88,8 @@ struct dev_pagemap_ops {
 	vm_fault_t (*migrate_to_ram)(struct vm_fault *vmf);
 };
 
+#define PGMAP_ALTMAP_VALID	(1 << 0)
+
 /**
  * struct dev_pagemap - metadata for ZONE_DEVICE mappings
  * @altmap: pre-allocated/reserved memory for vmemmap allocations
@@ -96,19 +98,27 @@ struct dev_pagemap_ops {
  * @dev: host device of the mapping for debug
  * @data: private data pointer for page_free()
  * @type: memory type: see MEMORY_* in memory_hotplug.h
+ * @flags: PGMAP_* flags to specify detailed behavior
  * @ops: method table
  */
 struct dev_pagemap {
 	struct vmem_altmap altmap;
-	bool altmap_valid;
 	struct resource res;
 	struct percpu_ref *ref;
 	struct device *dev;
 	enum memory_type type;
+	unsigned int flags;
 	u64 pci_p2pdma_bus_offset;
 	const struct dev_pagemap_ops *ops;
 };
 
+static inline struct vmem_altmap *pgmap_altmap(struct dev_pagemap *pgmap)
+{
+	if (pgmap->flags & PGMAP_ALTMAP_VALID)
+		return &pgmap->altmap;
+	return NULL;
+}
+
 #ifdef CONFIG_ZONE_DEVICE
 void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap);
 void devm_memunmap_pages(struct device *dev, struct dev_pagemap *pgmap);
diff --git a/kernel/memremap.c b/kernel/memremap.c
index 6c3dbb692037..eee490e7d7e1 100644
--- a/kernel/memremap.c
+++ b/kernel/memremap.c
@@ -54,14 +54,8 @@ static void pgmap_array_delete(struct resource *res)
 
 static unsigned long pfn_first(struct dev_pagemap *pgmap)
 {
-	const struct resource *res = &pgmap->res;
-	struct vmem_altmap *altmap = &pgmap->altmap;
-	unsigned long pfn;
-
-	pfn = res->start >> PAGE_SHIFT;
-	if (pgmap->altmap_valid)
-		pfn += vmem_altmap_offset(altmap);
-	return pfn;
+	return (pgmap->res.start >> PAGE_SHIFT) +
+		vmem_altmap_offset(pgmap_altmap(pgmap));
 }
 
 static unsigned long pfn_end(struct dev_pagemap *pgmap)
@@ -109,7 +103,7 @@ static void devm_memremap_pages_release(void *data)
 				align_size >> PAGE_SHIFT, NULL);
 	} else {
 		arch_remove_memory(nid, align_start, align_size,
-				pgmap->altmap_valid ? &pgmap->altmap : NULL);
+				pgmap_altmap(pgmap));
 		kasan_remove_zero_shadow(__va(align_start), align_size);
 	}
 	mem_hotplug_done();
@@ -129,8 +123,8 @@ static void devm_memremap_pages_release(void *data)
  * 1/ At a minimum the res, ref and type and ops members of @pgmap must be
  *    initialized by the caller before passing it to this function
  *
- * 2/ The altmap field may optionally be initialized, in which case altmap_valid
- *    must be set to true
+ * 2/ The altmap field may optionally be initialized, in which case
+ *    PGMAP_ALTMAP_VALID must be set in pgmap->flags.
  *
  * 3/ pgmap->ref must be 'live' on entry and will be killed and reaped
  *    at devm_memremap_pages_release() time, or if this routine fails.
@@ -142,15 +136,13 @@ static void devm_memremap_pages_release(void *data)
 void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap)
 {
 	resource_size_t align_start, align_size, align_end;
-	struct vmem_altmap *altmap = pgmap->altmap_valid ?
-			&pgmap->altmap : NULL;
 	struct resource *res = &pgmap->res;
 	struct dev_pagemap *conflict_pgmap;
 	struct mhp_restrictions restrictions = {
 		/*
 		 * We do not want any optional features only our own memmap
 		*/
-		.altmap = altmap,
+		.altmap = pgmap_altmap(pgmap),
 	};
 	pgprot_t pgprot = PAGE_KERNEL;
 	int error, nid, is_ram;
@@ -274,7 +266,7 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap)
 
 		zone = &NODE_DATA(nid)->node_zones[ZONE_DEVICE];
 		move_pfn_range_to_zone(zone, align_start >> PAGE_SHIFT,
-				align_size >> PAGE_SHIFT, altmap);
+				align_size >> PAGE_SHIFT, pgmap_altmap(pgmap));
 	}
 
 	mem_hotplug_done();
@@ -319,7 +311,9 @@ EXPORT_SYMBOL_GPL(devm_memunmap_pages);
 unsigned long vmem_altmap_offset(struct vmem_altmap *altmap)
 {
 	/* number of pfns from base where pfn_to_page() is valid */
-	return altmap->reserve + altmap->free;
+	if (altmap)
+		return altmap->reserve + altmap->free;
+	return 0;
 }
 
 void vmem_altmap_free(struct vmem_altmap *altmap, unsigned long nr_pfns)
diff --git a/mm/hmm.c b/mm/hmm.c
index 36e25cdbdac1..e4470462298f 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -1442,7 +1442,6 @@ struct hmm_devmem *hmm_devmem_add(const struct hmm_devmem_ops *ops,
 	devmem->pagemap.type = MEMORY_DEVICE_PRIVATE;
 	devmem->pagemap.res = *devmem->resource;
 	devmem->pagemap.ops = &hmm_pagemap_ops;
-	devmem->pagemap.altmap_valid = false;
 	devmem->pagemap.ref = &devmem->ref;
 
 	result = devm_memremap_pages(devmem->device, &devmem->pagemap);
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index e096c987d261..6166ba5a15f3 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -557,10 +557,8 @@ void __remove_pages(struct zone *zone, unsigned long phys_start_pfn,
 	int sections_to_remove;
 
 	/* In the ZONE_DEVICE case device driver owns the memory region */
-	if (is_dev_zone(zone)) {
-		if (altmap)
-			map_offset = vmem_altmap_offset(altmap);
-	}
+	if (is_dev_zone(zone))
+		map_offset = vmem_altmap_offset(altmap);
 
 	clear_zone_contiguous(zone);
 
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index d66bc8abe0af..17a39d40a556 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -5853,6 +5853,7 @@ void __ref memmap_init_zone_device(struct zone *zone,
 {
 	unsigned long pfn, end_pfn = start_pfn + size;
 	struct pglist_data *pgdat = zone->zone_pgdat;
+	struct vmem_altmap *altmap = pgmap_altmap(pgmap);
 	unsigned long zone_idx = zone_idx(zone);
 	unsigned long start = jiffies;
 	int nid = pgdat->node_id;
@@ -5865,9 +5866,7 @@ void __ref memmap_init_zone_device(struct zone *zone,
 	 * of the pages reserved for the memmap, so we can just jump to
 	 * the end of that region and start processing the device pages.
 	 */
-	if (pgmap->altmap_valid) {
-		struct vmem_altmap *altmap = &pgmap->altmap;
-
+	if (altmap) {
 		start_pfn = altmap->base_pfn + vmem_altmap_offset(altmap);
 		size = end_pfn - start_pfn;
 	}
-- 
cgit v1.2.3


From 24917f6b1041a73993178920656e13364f847995 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 26 Jun 2019 14:27:14 +0200
Subject: memremap: provide an optional internal refcount in struct dev_pagemap

Provide internal refcounting logic if no ->ref field is provided
in the pagemap passed into devm_memremap_pages so that callers don't
have to reinvent it poorly.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Ira Weiny <ira.weiny@intel.com>
Reviewed-by: Dan Williams <dan.j.williams@intel.com>
Tested-by: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 include/linux/memremap.h          |  4 +++
 kernel/memremap.c                 | 64 +++++++++++++++++++++++++++++++--------
 tools/testing/nvdimm/test/iomap.c | 58 +++++++++++++++++++++++++++--------
 3 files changed, 101 insertions(+), 25 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/memremap.h b/include/linux/memremap.h
index e25685b878e9..f8a5b2a19945 100644
--- a/include/linux/memremap.h
+++ b/include/linux/memremap.h
@@ -95,6 +95,8 @@ struct dev_pagemap_ops {
  * @altmap: pre-allocated/reserved memory for vmemmap allocations
  * @res: physical address range covered by @ref
  * @ref: reference count that pins the devm_memremap_pages() mapping
+ * @internal_ref: internal reference if @ref is not provided by the caller
+ * @done: completion for @internal_ref
  * @dev: host device of the mapping for debug
  * @data: private data pointer for page_free()
  * @type: memory type: see MEMORY_* in memory_hotplug.h
@@ -105,6 +107,8 @@ struct dev_pagemap {
 	struct vmem_altmap altmap;
 	struct resource res;
 	struct percpu_ref *ref;
+	struct percpu_ref internal_ref;
+	struct completion done;
 	struct device *dev;
 	enum memory_type type;
 	unsigned int flags;
diff --git a/kernel/memremap.c b/kernel/memremap.c
index eee490e7d7e1..bea6f887adad 100644
--- a/kernel/memremap.c
+++ b/kernel/memremap.c
@@ -29,7 +29,7 @@ static void devmap_managed_enable_put(void *data)
 
 static int devmap_managed_enable_get(struct device *dev, struct dev_pagemap *pgmap)
 {
-	if (!pgmap->ops->page_free) {
+	if (!pgmap->ops || !pgmap->ops->page_free) {
 		WARN(1, "Missing page_free method\n");
 		return -EINVAL;
 	}
@@ -75,6 +75,24 @@ static unsigned long pfn_next(unsigned long pfn)
 #define for_each_device_pfn(pfn, map) \
 	for (pfn = pfn_first(map); pfn < pfn_end(map); pfn = pfn_next(pfn))
 
+static void dev_pagemap_kill(struct dev_pagemap *pgmap)
+{
+	if (pgmap->ops && pgmap->ops->kill)
+		pgmap->ops->kill(pgmap);
+	else
+		percpu_ref_kill(pgmap->ref);
+}
+
+static void dev_pagemap_cleanup(struct dev_pagemap *pgmap)
+{
+	if (pgmap->ops && pgmap->ops->cleanup) {
+		pgmap->ops->cleanup(pgmap);
+	} else {
+		wait_for_completion(&pgmap->done);
+		percpu_ref_exit(pgmap->ref);
+	}
+}
+
 static void devm_memremap_pages_release(void *data)
 {
 	struct dev_pagemap *pgmap = data;
@@ -84,10 +102,10 @@ static void devm_memremap_pages_release(void *data)
 	unsigned long pfn;
 	int nid;
 
-	pgmap->ops->kill(pgmap);
+	dev_pagemap_kill(pgmap);
 	for_each_device_pfn(pfn, pgmap)
 		put_page(pfn_to_page(pfn));
-	pgmap->ops->cleanup(pgmap);
+	dev_pagemap_cleanup(pgmap);
 
 	/* pages are dead and unused, undo the arch mapping */
 	align_start = res->start & ~(SECTION_SIZE - 1);
@@ -114,20 +132,29 @@ static void devm_memremap_pages_release(void *data)
 		      "%s: failed to free all reserved pages\n", __func__);
 }
 
+static void dev_pagemap_percpu_release(struct percpu_ref *ref)
+{
+	struct dev_pagemap *pgmap =
+		container_of(ref, struct dev_pagemap, internal_ref);
+
+	complete(&pgmap->done);
+}
+
 /**
  * devm_memremap_pages - remap and provide memmap backing for the given resource
  * @dev: hosting device for @res
  * @pgmap: pointer to a struct dev_pagemap
  *
  * Notes:
- * 1/ At a minimum the res, ref and type and ops members of @pgmap must be
- *    initialized by the caller before passing it to this function
+ * 1/ At a minimum the res and type members of @pgmap must be initialized
+ *    by the caller before passing it to this function
  *
  * 2/ The altmap field may optionally be initialized, in which case
  *    PGMAP_ALTMAP_VALID must be set in pgmap->flags.
  *
- * 3/ pgmap->ref must be 'live' on entry and will be killed and reaped
- *    at devm_memremap_pages_release() time, or if this routine fails.
+ * 3/ The ref field may optionally be provided, in which case pgmap->ref
+ *    must be 'live' on entry and will be killed and reaped at
+ *    devm_memremap_pages_release() time, or if this routine fails.
  *
  * 4/ res is expected to be a host memory range that could feasibly be
  *    treated as a "System RAM" range, i.e. not a device mmio range, but
@@ -175,10 +202,21 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap)
 		break;
 	}
 
-	if (!pgmap->ref || !pgmap->ops || !pgmap->ops->kill ||
-	    !pgmap->ops->cleanup) {
-		WARN(1, "Missing reference count teardown definition\n");
-		return ERR_PTR(-EINVAL);
+	if (!pgmap->ref) {
+		if (pgmap->ops && (pgmap->ops->kill || pgmap->ops->cleanup))
+			return ERR_PTR(-EINVAL);
+
+		init_completion(&pgmap->done);
+		error = percpu_ref_init(&pgmap->internal_ref,
+				dev_pagemap_percpu_release, 0, GFP_KERNEL);
+		if (error)
+			return ERR_PTR(error);
+		pgmap->ref = &pgmap->internal_ref;
+	} else {
+		if (!pgmap->ops || !pgmap->ops->kill || !pgmap->ops->cleanup) {
+			WARN(1, "Missing reference count teardown definition\n");
+			return ERR_PTR(-EINVAL);
+		}
 	}
 
 	if (need_devmap_managed) {
@@ -296,8 +334,8 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap)
  err_pfn_remap:
 	pgmap_array_delete(res);
  err_array:
-	pgmap->ops->kill(pgmap);
-	pgmap->ops->cleanup(pgmap);
+	dev_pagemap_kill(pgmap);
+	dev_pagemap_cleanup(pgmap);
 	return ERR_PTR(error);
 }
 EXPORT_SYMBOL_GPL(devm_memremap_pages);
diff --git a/tools/testing/nvdimm/test/iomap.c b/tools/testing/nvdimm/test/iomap.c
index 82f901569e06..cd040b5abffe 100644
--- a/tools/testing/nvdimm/test/iomap.c
+++ b/tools/testing/nvdimm/test/iomap.c
@@ -100,26 +100,60 @@ static void nfit_test_kill(void *_pgmap)
 {
 	struct dev_pagemap *pgmap = _pgmap;
 
-	WARN_ON(!pgmap || !pgmap->ref || !pgmap->ops || !pgmap->ops->kill ||
-		!pgmap->ops->cleanup);
-	pgmap->ops->kill(pgmap);
-	pgmap->ops->cleanup(pgmap);
+	WARN_ON(!pgmap || !pgmap->ref);
+
+	if (pgmap->ops && pgmap->ops->kill)
+		pgmap->ops->kill(pgmap);
+	else
+		percpu_ref_kill(pgmap->ref);
+
+	if (pgmap->ops && pgmap->ops->cleanup) {
+		pgmap->ops->cleanup(pgmap);
+	} else {
+		wait_for_completion(&pgmap->done);
+		percpu_ref_exit(pgmap->ref);
+	}
+}
+
+static void dev_pagemap_percpu_release(struct percpu_ref *ref)
+{
+	struct dev_pagemap *pgmap =
+		container_of(ref, struct dev_pagemap, internal_ref);
+
+	complete(&pgmap->done);
 }
 
 void *__wrap_devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap)
 {
+	int error;
 	resource_size_t offset = pgmap->res.start;
 	struct nfit_test_resource *nfit_res = get_nfit_res(offset);
 
-	if (nfit_res) {
-		int rc;
-
-		rc = devm_add_action_or_reset(dev, nfit_test_kill, pgmap);
-		if (rc)
-			return ERR_PTR(rc);
-		return nfit_res->buf + offset - nfit_res->res.start;
+	if (!nfit_res)
+		return devm_memremap_pages(dev, pgmap);
+
+	pgmap->dev = dev;
+	if (!pgmap->ref) {
+		if (pgmap->ops && (pgmap->ops->kill || pgmap->ops->cleanup))
+			return ERR_PTR(-EINVAL);
+
+		init_completion(&pgmap->done);
+		error = percpu_ref_init(&pgmap->internal_ref,
+				dev_pagemap_percpu_release, 0, GFP_KERNEL);
+		if (error)
+			return ERR_PTR(error);
+		pgmap->ref = &pgmap->internal_ref;
+	} else {
+		if (!pgmap->ops || !pgmap->ops->kill || !pgmap->ops->cleanup) {
+			WARN(1, "Missing reference count teardown definition\n");
+			return ERR_PTR(-EINVAL);
+		}
 	}
-	return devm_memremap_pages(dev, pgmap);
+
+	error = devm_add_action_or_reset(dev, nfit_test_kill, pgmap);
+	if (error)
+		return ERR_PTR(error);
+	return nfit_res->buf + offset - nfit_res->res.start;
 }
 EXPORT_SYMBOL_GPL(__wrap_devm_memremap_pages);
 
-- 
cgit v1.2.3


From 47e9d836a5e827acdaa5cb6175648fbef15b4e84 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 26 Jun 2019 14:27:19 +0200
Subject: mm: remove hmm_vma_alloc_locked_page

The only user of it has just been removed, and there wasn't really any need
to wrap a basic memory allocator to start with.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Jason Gunthorpe <jgg@mellanox.com>
Reviewed-by: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 include/linux/hmm.h |  3 ---
 mm/hmm.c            | 14 --------------
 2 files changed, 17 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/hmm.h b/include/linux/hmm.h
index ba19c19e24ed..1d55b7ea2da6 100644
--- a/include/linux/hmm.h
+++ b/include/linux/hmm.h
@@ -587,9 +587,6 @@ static inline void hmm_mm_init(struct mm_struct *mm) {}
 #if IS_ENABLED(CONFIG_DEVICE_PRIVATE)
 struct hmm_devmem;
 
-struct page *hmm_vma_alloc_locked_page(struct vm_area_struct *vma,
-				       unsigned long addr);
-
 /*
  * struct hmm_devmem_ops - callback for ZONE_DEVICE memory events
  *
diff --git a/mm/hmm.c b/mm/hmm.c
index e4470462298f..fdbd48771292 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -1330,20 +1330,6 @@ EXPORT_SYMBOL(hmm_range_dma_unmap);
 
 
 #if IS_ENABLED(CONFIG_DEVICE_PRIVATE)
-struct page *hmm_vma_alloc_locked_page(struct vm_area_struct *vma,
-				       unsigned long addr)
-{
-	struct page *page;
-
-	page = alloc_page_vma(GFP_HIGHUSER, vma, addr);
-	if (!page)
-		return NULL;
-	lock_page(page);
-	return page;
-}
-EXPORT_SYMBOL(hmm_vma_alloc_locked_page);
-
-
 static void hmm_devmem_ref_release(struct percpu_ref *ref)
 {
 	struct hmm_devmem *devmem;
-- 
cgit v1.2.3


From eee3ae41b153e55e25d6cf7bd5b5098ba0afe705 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 26 Jun 2019 14:27:20 +0200
Subject: mm: remove hmm_devmem_add

There isn't really much value add in the hmm_devmem_add wrapper any
more, as using devm_memremap_pages directly is now just as simple.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Jason Gunthorpe <jgg@mellanox.com>
Reviewed-by: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 Documentation/vm/hmm.rst |  26 ----------
 include/linux/hmm.h      | 129 -----------------------------------------------
 mm/hmm.c                 | 110 ----------------------------------------
 3 files changed, 265 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/vm/hmm.rst b/Documentation/vm/hmm.rst
index 7cdf7282e022..50e1380950a9 100644
--- a/Documentation/vm/hmm.rst
+++ b/Documentation/vm/hmm.rst
@@ -329,32 +329,6 @@ directly using struct page for device memory which left most kernel code paths
 unaware of the difference. We only need to make sure that no one ever tries to
 map those pages from the CPU side.
 
-HMM provides a set of helpers to register and hotplug device memory as a new
-region needing a struct page. This is offered through a very simple API::
-
- struct hmm_devmem *hmm_devmem_add(const struct hmm_devmem_ops *ops,
-                                   struct device *device,
-                                   unsigned long size);
- void hmm_devmem_remove(struct hmm_devmem *devmem);
-
-The hmm_devmem_ops is where most of the important things are::
-
- struct hmm_devmem_ops {
-     void (*free)(struct hmm_devmem *devmem, struct page *page);
-     int (*fault)(struct hmm_devmem *devmem,
-                  struct vm_area_struct *vma,
-                  unsigned long addr,
-                  struct page *page,
-                  unsigned flags,
-                  pmd_t *pmdp);
- };
-
-The first callback (free()) happens when the last reference on a device page is
-dropped. This means the device page is now free and no longer used by anyone.
-The second callback happens whenever the CPU tries to access a device page
-which it cannot do. This second callback must trigger a migration back to
-system memory.
-
 
 Migration to and from device memory
 ===================================
diff --git a/include/linux/hmm.h b/include/linux/hmm.h
index 1d55b7ea2da6..86aa4ec3404c 100644
--- a/include/linux/hmm.h
+++ b/include/linux/hmm.h
@@ -585,135 +585,6 @@ static inline void hmm_mm_init(struct mm_struct *mm) {}
 #endif /* IS_ENABLED(CONFIG_HMM_MIRROR) */
 
 #if IS_ENABLED(CONFIG_DEVICE_PRIVATE)
-struct hmm_devmem;
-
-/*
- * struct hmm_devmem_ops - callback for ZONE_DEVICE memory events
- *
- * @free: call when refcount on page reach 1 and thus is no longer use
- * @fault: call when there is a page fault to unaddressable memory
- *
- * Both callback happens from page_free() and page_fault() callback of struct
- * dev_pagemap respectively. See include/linux/memremap.h for more details on
- * those.
- *
- * The hmm_devmem_ops callback are just here to provide a coherent and
- * uniq API to device driver and device driver should not register their
- * own page_free() or page_fault() but rely on the hmm_devmem_ops call-
- * back.
- */
-struct hmm_devmem_ops {
-	/*
-	 * free() - free a device page
-	 * @devmem: device memory structure (see struct hmm_devmem)
-	 * @page: pointer to struct page being freed
-	 *
-	 * Call back occurs whenever a device page refcount reach 1 which
-	 * means that no one is holding any reference on the page anymore
-	 * (ZONE_DEVICE page have an elevated refcount of 1 as default so
-	 * that they are not release to the general page allocator).
-	 *
-	 * Note that callback has exclusive ownership of the page (as no
-	 * one is holding any reference).
-	 */
-	void (*free)(struct hmm_devmem *devmem, struct page *page);
-	/*
-	 * fault() - CPU page fault or get user page (GUP)
-	 * @devmem: device memory structure (see struct hmm_devmem)
-	 * @vma: virtual memory area containing the virtual address
-	 * @addr: virtual address that faulted or for which there is a GUP
-	 * @page: pointer to struct page backing virtual address (unreliable)
-	 * @flags: FAULT_FLAG_* (see include/linux/mm.h)
-	 * @pmdp: page middle directory
-	 * Returns: VM_FAULT_MINOR/MAJOR on success or one of VM_FAULT_ERROR
-	 *   on error
-	 *
-	 * The callback occurs whenever there is a CPU page fault or GUP on a
-	 * virtual address. This means that the device driver must migrate the
-	 * page back to regular memory (CPU accessible).
-	 *
-	 * The device driver is free to migrate more than one page from the
-	 * fault() callback as an optimization. However if device decide to
-	 * migrate more than one page it must always priotirize the faulting
-	 * address over the others.
-	 *
-	 * The struct page pointer is only given as an hint to allow quick
-	 * lookup of internal device driver data. A concurrent migration
-	 * might have already free that page and the virtual address might
-	 * not longer be back by it. So it should not be modified by the
-	 * callback.
-	 *
-	 * Note that mmap semaphore is held in read mode at least when this
-	 * callback occurs, hence the vma is valid upon callback entry.
-	 */
-	vm_fault_t (*fault)(struct hmm_devmem *devmem,
-		     struct vm_area_struct *vma,
-		     unsigned long addr,
-		     const struct page *page,
-		     unsigned int flags,
-		     pmd_t *pmdp);
-};
-
-/*
- * struct hmm_devmem - track device memory
- *
- * @completion: completion object for device memory
- * @pfn_first: first pfn for this resource (set by hmm_devmem_add())
- * @pfn_last: last pfn for this resource (set by hmm_devmem_add())
- * @resource: IO resource reserved for this chunk of memory
- * @pagemap: device page map for that chunk
- * @device: device to bind resource to
- * @ops: memory operations callback
- * @ref: per CPU refcount
- * @page_fault: callback when CPU fault on an unaddressable device page
- *
- * This an helper structure for device drivers that do not wish to implement
- * the gory details related to hotplugging new memoy and allocating struct
- * pages.
- *
- * Device drivers can directly use ZONE_DEVICE memory on their own if they
- * wish to do so.
- *
- * The page_fault() callback must migrate page back, from device memory to
- * system memory, so that the CPU can access it. This might fail for various
- * reasons (device issues,  device have been unplugged, ...). When such error
- * conditions happen, the page_fault() callback must return VM_FAULT_SIGBUS and
- * set the CPU page table entry to "poisoned".
- *
- * Note that because memory cgroup charges are transferred to the device memory,
- * this should never fail due to memory restrictions. However, allocation
- * of a regular system page might still fail because we are out of memory. If
- * that happens, the page_fault() callback must return VM_FAULT_OOM.
- *
- * The page_fault() callback can also try to migrate back multiple pages in one
- * chunk, as an optimization. It must, however, prioritize the faulting address
- * over all the others.
- */
-
-struct hmm_devmem {
-	struct completion		completion;
-	unsigned long			pfn_first;
-	unsigned long			pfn_last;
-	struct resource			*resource;
-	struct device			*device;
-	struct dev_pagemap		pagemap;
-	const struct hmm_devmem_ops	*ops;
-	struct percpu_ref		ref;
-};
-
-/*
- * To add (hotplug) device memory, HMM assumes that there is no real resource
- * that reserves a range in the physical address space (this is intended to be
- * use by unaddressable device memory). It will reserve a physical range big
- * enough and allocate struct page for it.
- *
- * The device driver can wrap the hmm_devmem struct inside a private device
- * driver struct.
- */
-struct hmm_devmem *hmm_devmem_add(const struct hmm_devmem_ops *ops,
-				  struct device *device,
-				  unsigned long size);
-
 /*
  * hmm_devmem_page_set_drvdata - set per-page driver data field
  *
diff --git a/mm/hmm.c b/mm/hmm.c
index fdbd48771292..90ca0cdab9db 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -1327,113 +1327,3 @@ long hmm_range_dma_unmap(struct hmm_range *range,
 }
 EXPORT_SYMBOL(hmm_range_dma_unmap);
 #endif /* IS_ENABLED(CONFIG_HMM_MIRROR) */
-
-
-#if IS_ENABLED(CONFIG_DEVICE_PRIVATE)
-static void hmm_devmem_ref_release(struct percpu_ref *ref)
-{
-	struct hmm_devmem *devmem;
-
-	devmem = container_of(ref, struct hmm_devmem, ref);
-	complete(&devmem->completion);
-}
-
-static void hmm_devmem_ref_exit(struct dev_pagemap *pgmap)
-{
-	struct hmm_devmem *devmem;
-
-	devmem = container_of(pgmap, struct hmm_devmem, pagemap);
-	wait_for_completion(&devmem->completion);
-	percpu_ref_exit(pgmap->ref);
-}
-
-static void hmm_devmem_ref_kill(struct dev_pagemap *pgmap)
-{
-	percpu_ref_kill(pgmap->ref);
-}
-
-static vm_fault_t hmm_devmem_migrate_to_ram(struct vm_fault *vmf)
-{
-	struct hmm_devmem *devmem =
-		container_of(vmf->page->pgmap, struct hmm_devmem, pagemap);
-
-	return devmem->ops->fault(devmem, vmf->vma, vmf->address, vmf->page,
-			vmf->flags, vmf->pmd);
-}
-
-static void hmm_devmem_free(struct page *page)
-{
-	struct hmm_devmem *devmem =
-		container_of(page->pgmap, struct hmm_devmem, pagemap);
-
-	devmem->ops->free(devmem, page);
-}
-
-static const struct dev_pagemap_ops hmm_pagemap_ops = {
-	.page_free		= hmm_devmem_free,
-	.kill			= hmm_devmem_ref_kill,
-	.cleanup		= hmm_devmem_ref_exit,
-	.migrate_to_ram		= hmm_devmem_migrate_to_ram,
-};
-
-/*
- * hmm_devmem_add() - hotplug ZONE_DEVICE memory for device memory
- *
- * @ops: memory event device driver callback (see struct hmm_devmem_ops)
- * @device: device struct to bind the resource too
- * @size: size in bytes of the device memory to add
- * Returns: pointer to new hmm_devmem struct ERR_PTR otherwise
- *
- * This function first finds an empty range of physical address big enough to
- * contain the new resource, and then hotplugs it as ZONE_DEVICE memory, which
- * in turn allocates struct pages. It does not do anything beyond that; all
- * events affecting the memory will go through the various callbacks provided
- * by hmm_devmem_ops struct.
- *
- * Device driver should call this function during device initialization and
- * is then responsible of memory management. HMM only provides helpers.
- */
-struct hmm_devmem *hmm_devmem_add(const struct hmm_devmem_ops *ops,
-				  struct device *device,
-				  unsigned long size)
-{
-	struct hmm_devmem *devmem;
-	void *result;
-	int ret;
-
-	devmem = devm_kzalloc(device, sizeof(*devmem), GFP_KERNEL);
-	if (!devmem)
-		return ERR_PTR(-ENOMEM);
-
-	init_completion(&devmem->completion);
-	devmem->pfn_first = -1UL;
-	devmem->pfn_last = -1UL;
-	devmem->resource = NULL;
-	devmem->device = device;
-	devmem->ops = ops;
-
-	ret = percpu_ref_init(&devmem->ref, &hmm_devmem_ref_release,
-			      0, GFP_KERNEL);
-	if (ret)
-		return ERR_PTR(ret);
-
-	devmem->resource = devm_request_free_mem_region(device, &iomem_resource,
-			size);
-	if (IS_ERR(devmem->resource))
-		return ERR_CAST(devmem->resource);
-	devmem->pfn_first = devmem->resource->start >> PAGE_SHIFT;
-	devmem->pfn_last = devmem->pfn_first +
-			   (resource_size(devmem->resource) >> PAGE_SHIFT);
-
-	devmem->pagemap.type = MEMORY_DEVICE_PRIVATE;
-	devmem->pagemap.res = *devmem->resource;
-	devmem->pagemap.ops = &hmm_pagemap_ops;
-	devmem->pagemap.ref = &devmem->ref;
-
-	result = devm_memremap_pages(devmem->device, &devmem->pagemap);
-	if (IS_ERR(result))
-		return result;
-	return devmem;
-}
-EXPORT_SYMBOL_GPL(hmm_devmem_add);
-#endif /* CONFIG_DEVICE_PRIVATE  */
-- 
cgit v1.2.3


From 8a164fef9c4ccf6ff7757170397222860e40d192 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 26 Jun 2019 14:27:21 +0200
Subject: mm: simplify ZONE_DEVICE page private data

Remove the clumsy hmm_devmem_page_{get,set}_drvdata helpers, and
instead just access the page directly.  Also make the page data
a void pointer, and thus much easier to use.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/gpu/drm/nouveau/nouveau_dmem.c | 18 +++++++-----------
 include/linux/hmm.h                    | 32 --------------------------------
 include/linux/mm_types.h               |  2 +-
 mm/page_alloc.c                        |  8 ++++----
 4 files changed, 12 insertions(+), 48 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/gpu/drm/nouveau/nouveau_dmem.c b/drivers/gpu/drm/nouveau/nouveau_dmem.c
index 0fb7a44b8bc4..42c026010938 100644
--- a/drivers/gpu/drm/nouveau/nouveau_dmem.c
+++ b/drivers/gpu/drm/nouveau/nouveau_dmem.c
@@ -104,11 +104,8 @@ struct nouveau_migrate {
 
 static void nouveau_dmem_page_free(struct page *page)
 {
-	struct nouveau_dmem_chunk *chunk;
-	unsigned long idx;
-
-	chunk = (void *)hmm_devmem_page_get_drvdata(page);
-	idx = page_to_pfn(page) - chunk->pfn_first;
+	struct nouveau_dmem_chunk *chunk = page->zone_device_data;
+	unsigned long idx = page_to_pfn(page) - chunk->pfn_first;
 
 	/*
 	 * FIXME:
@@ -200,7 +197,7 @@ nouveau_dmem_fault_alloc_and_copy(struct vm_area_struct *vma,
 
 		dst_addr = fault->dma[fault->npages++];
 
-		chunk = (void *)hmm_devmem_page_get_drvdata(spage);
+		chunk = spage->zone_device_data;
 		src_addr = page_to_pfn(spage) - chunk->pfn_first;
 		src_addr = (src_addr << PAGE_SHIFT) + chunk->bo->bo.offset;
 
@@ -633,9 +630,8 @@ nouveau_dmem_init(struct nouveau_drm *drm)
 		list_add_tail(&chunk->list, &drm->dmem->chunk_empty);
 
 		page = pfn_to_page(chunk->pfn_first);
-		for (j = 0; j < DMEM_CHUNK_NPAGES; ++j, ++page) {
-			hmm_devmem_page_set_drvdata(page, (long)chunk);
-		}
+		for (j = 0; j < DMEM_CHUNK_NPAGES; ++j, ++page)
+			page->zone_device_data = chunk;
 	}
 
 	NV_INFO(drm, "DMEM: registered %ldMB of device memory\n", size >> 20);
@@ -698,7 +694,7 @@ nouveau_dmem_migrate_alloc_and_copy(struct vm_area_struct *vma,
 		if (!dpage || dst_pfns[i] == MIGRATE_PFN_ERROR)
 			continue;
 
-		chunk = (void *)hmm_devmem_page_get_drvdata(dpage);
+		chunk = dpage->zone_device_data;
 		dst_addr = page_to_pfn(dpage) - chunk->pfn_first;
 		dst_addr = (dst_addr << PAGE_SHIFT) + chunk->bo->bo.offset;
 
@@ -862,7 +858,7 @@ nouveau_dmem_convert_pfn(struct nouveau_drm *drm,
 			continue;
 		}
 
-		chunk = (void *)hmm_devmem_page_get_drvdata(page);
+		chunk = page->zone_device_data;
 		addr = page_to_pfn(page) - chunk->pfn_first;
 		addr = (addr + chunk->bo->bo.mem.start) << PAGE_SHIFT;
 
diff --git a/include/linux/hmm.h b/include/linux/hmm.h
index 86aa4ec3404c..3d00e9550e77 100644
--- a/include/linux/hmm.h
+++ b/include/linux/hmm.h
@@ -584,36 +584,4 @@ static inline void hmm_mm_destroy(struct mm_struct *mm) {}
 static inline void hmm_mm_init(struct mm_struct *mm) {}
 #endif /* IS_ENABLED(CONFIG_HMM_MIRROR) */
 
-#if IS_ENABLED(CONFIG_DEVICE_PRIVATE)
-/*
- * hmm_devmem_page_set_drvdata - set per-page driver data field
- *
- * @page: pointer to struct page
- * @data: driver data value to set
- *
- * Because page can not be on lru we have an unsigned long that driver can use
- * to store a per page field. This just a simple helper to do that.
- */
-static inline void hmm_devmem_page_set_drvdata(struct page *page,
-					       unsigned long data)
-{
-	page->hmm_data = data;
-}
-
-/*
- * hmm_devmem_page_get_drvdata - get per page driver data field
- *
- * @page: pointer to struct page
- * Return: driver data value
- */
-static inline unsigned long hmm_devmem_page_get_drvdata(const struct page *page)
-{
-	return page->hmm_data;
-}
-#endif /* CONFIG_DEVICE_PRIVATE */
-#else /* IS_ENABLED(CONFIG_HMM) */
-static inline void hmm_mm_destroy(struct mm_struct *mm) {}
-static inline void hmm_mm_init(struct mm_struct *mm) {}
-#endif /* IS_ENABLED(CONFIG_HMM) */
-
 #endif /* LINUX_HMM_H */
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 8ec38b11b361..f33a1289c101 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -158,7 +158,7 @@ struct page {
 		struct {	/* ZONE_DEVICE pages */
 			/** @pgmap: Points to the hosting device page map. */
 			struct dev_pagemap *pgmap;
-			unsigned long hmm_data;
+			void *zone_device_data;
 			unsigned long _zd_pad_1;	/* uses mapping */
 		};
 
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 17a39d40a556..c0e031c52db5 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -5886,12 +5886,12 @@ void __ref memmap_init_zone_device(struct zone *zone,
 		__SetPageReserved(page);
 
 		/*
-		 * ZONE_DEVICE pages union ->lru with a ->pgmap back
-		 * pointer and hmm_data.  It is a bug if a ZONE_DEVICE
-		 * page is ever freed or placed on a driver-private list.
+		 * ZONE_DEVICE pages union ->lru with a ->pgmap back pointer
+		 * and zone_device_data.  It is a bug if a ZONE_DEVICE page is
+		 * ever freed or placed on a driver-private list.
 		 */
 		page->pgmap = pgmap;
-		page->hmm_data = 0;
+		page->zone_device_data = NULL;
 
 		/*
 		 * Mark the block movable so that blocks are reserved for
-- 
cgit v1.2.3


From 43535b0aefab29ea6564e608de4c783ed2ab6c49 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 26 Jun 2019 14:27:23 +0200
Subject: mm: remove the HMM config option

All the mm/hmm.c code is better keyed off HMM_MIRROR.  Also let nouveau
depend on it instead of the mix of a dummy dependency symbol plus the
actually selected one.  Drop various odd dependencies, as the code is
pretty portable.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Ira Weiny <ira.weiny@intel.com>
Reviewed-by: Jason Gunthorpe <jgg@mellanox.com>
Reviewed-by: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/gpu/drm/nouveau/Kconfig |  3 +--
 include/linux/hmm.h             |  5 +----
 include/linux/mm_types.h        |  2 +-
 mm/Kconfig                      | 27 ++++-----------------------
 mm/Makefile                     |  2 +-
 mm/hmm.c                        |  2 --
 6 files changed, 8 insertions(+), 33 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/gpu/drm/nouveau/Kconfig b/drivers/gpu/drm/nouveau/Kconfig
index 6303d203ab1d..66c839d8e9d1 100644
--- a/drivers/gpu/drm/nouveau/Kconfig
+++ b/drivers/gpu/drm/nouveau/Kconfig
@@ -84,11 +84,10 @@ config DRM_NOUVEAU_BACKLIGHT
 
 config DRM_NOUVEAU_SVM
 	bool "(EXPERIMENTAL) Enable SVM (Shared Virtual Memory) support"
-	depends on ARCH_HAS_HMM
 	depends on DEVICE_PRIVATE
 	depends on DRM_NOUVEAU
+	depends on HMM_MIRROR
 	depends on STAGING
-	select HMM_MIRROR
 	default n
 	help
 	  Say Y here if you want to enable experimental support for
diff --git a/include/linux/hmm.h b/include/linux/hmm.h
index 3d00e9550e77..b697496e85ba 100644
--- a/include/linux/hmm.h
+++ b/include/linux/hmm.h
@@ -62,7 +62,7 @@
 #include <linux/kconfig.h>
 #include <asm/pgtable.h>
 
-#if IS_ENABLED(CONFIG_HMM)
+#ifdef CONFIG_HMM_MIRROR
 
 #include <linux/device.h>
 #include <linux/migrate.h>
@@ -332,9 +332,6 @@ static inline uint64_t hmm_pfn_from_pfn(const struct hmm_range *range,
 	return hmm_device_entry_from_pfn(range, pfn);
 }
 
-
-
-#if IS_ENABLED(CONFIG_HMM_MIRROR)
 /*
  * Mirroring: how to synchronize device page table with CPU page table.
  *
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index f33a1289c101..8d37182f8dbe 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -501,7 +501,7 @@ struct mm_struct {
 #endif
 		struct work_struct async_put_work;
 
-#if IS_ENABLED(CONFIG_HMM)
+#ifdef CONFIG_HMM_MIRROR
 		/* HMM needs to track a few things per mm */
 		struct hmm *hmm;
 #endif
diff --git a/mm/Kconfig b/mm/Kconfig
index eecf037a54b3..1e426c26b1d6 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -669,37 +669,18 @@ config ZONE_DEVICE
 
 	  If FS_DAX is enabled, then say Y.
 
-config ARCH_HAS_HMM_MIRROR
-	bool
-	default y
-	depends on (X86_64 || PPC64)
-	depends on MMU && 64BIT
-
-config ARCH_HAS_HMM
-	bool
-	depends on (X86_64 || PPC64)
-	depends on ZONE_DEVICE
-	depends on MMU && 64BIT
-	depends on MEMORY_HOTPLUG
-	depends on MEMORY_HOTREMOVE
-	depends on SPARSEMEM_VMEMMAP
-	default y
-
 config MIGRATE_VMA_HELPER
 	bool
 
 config DEV_PAGEMAP_OPS
 	bool
 
-config HMM
-	bool
-	select MMU_NOTIFIER
-	select MIGRATE_VMA_HELPER
-
 config HMM_MIRROR
 	bool "HMM mirror CPU page table into a device page table"
-	depends on ARCH_HAS_HMM
-	select HMM
+	depends on (X86_64 || PPC64)
+	depends on MMU && 64BIT
+	select MMU_NOTIFIER
+	select MIGRATE_VMA_HELPER
 	help
 	  Select HMM_MIRROR if you want to mirror range of the CPU page table of a
 	  process into a device page table. Here, mirror means "keep synchronized".
diff --git a/mm/Makefile b/mm/Makefile
index ac5e5ba78874..91c99040065c 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -102,5 +102,5 @@ obj-$(CONFIG_FRAME_VECTOR) += frame_vector.o
 obj-$(CONFIG_DEBUG_PAGE_REF) += debug_page_ref.o
 obj-$(CONFIG_HARDENED_USERCOPY) += usercopy.o
 obj-$(CONFIG_PERCPU_STATS) += percpu-stats.o
-obj-$(CONFIG_HMM) += hmm.o
+obj-$(CONFIG_HMM_MIRROR) += hmm.o
 obj-$(CONFIG_MEMFD_CREATE) += memfd.o
diff --git a/mm/hmm.c b/mm/hmm.c
index 90ca0cdab9db..d62ce64d6bca 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -25,7 +25,6 @@
 #include <linux/mmu_notifier.h>
 #include <linux/memory_hotplug.h>
 
-#if IS_ENABLED(CONFIG_HMM_MIRROR)
 static const struct mmu_notifier_ops hmm_mmu_notifier_ops;
 
 static inline struct hmm *mm_get_hmm(struct mm_struct *mm)
@@ -1326,4 +1325,3 @@ long hmm_range_dma_unmap(struct hmm_range *range,
 	return cpages;
 }
 EXPORT_SYMBOL(hmm_range_dma_unmap);
-#endif /* IS_ENABLED(CONFIG_HMM_MIRROR) */
-- 
cgit v1.2.3


From 3cd7957e85e67120bb9f6bfb75d81dcc19af282b Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Mon, 1 Jul 2019 12:54:10 +0200
Subject: ACPI: PM: Simplify and fix PM domain hibernation callbacks

First, after a previous change causing all runtime-suspended devices
in the ACPI PM domain (and ACPI LPSS devices) to be resumed before
creating a snapshot image of memory during hibernation, it is not
necessary to worry about the case in which they might be left in
runtime-suspend any more, so get rid of the code related to that from
ACPI PM domain and ACPI LPSS hibernation callbacks.

Second, it is not correct to use pm_generic_resume_early() and
acpi_subsys_resume_noirq() in hibernation "restore" callbacks (which
currently happens in the ACPI PM domain and ACPI LPSS), so introduce
proper _restore_late and _restore_noirq callbacks for the ACPI PM
domain and ACPI LPSS.

Fixes: 05087360fd7a (ACPI / PM: Take SMART_SUSPEND driver flag into account)
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Reviewed-by: Mika Westerberg <mika.westerberg@linux.intel.com>
Reviewed-by: Hans de Goede <hdegoede@redhat.com>
---
 drivers/acpi/acpi_lpss.c | 61 +++++++++++++++++++++++++++++++++++++++++-------
 drivers/acpi/device_pm.c | 61 +++++++-----------------------------------------
 include/linux/acpi.h     | 10 --------
 3 files changed, 61 insertions(+), 71 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/acpi/acpi_lpss.c b/drivers/acpi/acpi_lpss.c
index cf768608437e..8ea836857691 100644
--- a/drivers/acpi/acpi_lpss.c
+++ b/drivers/acpi/acpi_lpss.c
@@ -1094,16 +1094,62 @@ static int acpi_lpss_resume_noirq(struct device *dev)
 	struct lpss_private_data *pdata = acpi_driver_data(ACPI_COMPANION(dev));
 	int ret;
 
-	ret = acpi_subsys_resume_noirq(dev);
+	/* Follow acpi_subsys_resume_noirq(). */
+	if (dev_pm_may_skip_resume(dev))
+		return 0;
+
+	if (dev_pm_smart_suspend_and_suspended(dev))
+		pm_runtime_set_active(dev);
+
+	ret = pm_generic_resume_noirq(dev);
 	if (ret)
 		return ret;
 
-	if (!dev_pm_may_skip_resume(dev) && pdata->dev_desc->resume_from_noirq)
-		ret = acpi_lpss_do_resume_early(dev);
+	if (!pdata->dev_desc->resume_from_noirq)
+		return 0;
 
-	return ret;
+	/*
+	 * The driver's ->resume_early callback will be invoked by
+	 * acpi_lpss_do_resume_early(), with the assumption that the driver
+	 * really wanted to run that code in ->resume_noirq, but it could not
+	 * run before acpi_dev_resume() and the driver expected the latter to be
+	 * called in the "early" phase.
+	 */
+	return acpi_lpss_do_resume_early(dev);
+}
+
+static int acpi_lpss_do_restore_early(struct device *dev)
+{
+	int ret = acpi_lpss_resume(dev);
+
+	return ret ? ret : pm_generic_restore_early(dev);
+}
+
+static int acpi_lpss_restore_early(struct device *dev)
+{
+	struct lpss_private_data *pdata = acpi_driver_data(ACPI_COMPANION(dev));
+
+	if (pdata->dev_desc->resume_from_noirq)
+		return 0;
+
+	return acpi_lpss_do_restore_early(dev);
 }
 
+static int acpi_lpss_restore_noirq(struct device *dev)
+{
+	struct lpss_private_data *pdata = acpi_driver_data(ACPI_COMPANION(dev));
+	int ret;
+
+	ret = pm_generic_restore_noirq(dev);
+	if (ret)
+		return ret;
+
+	if (!pdata->dev_desc->resume_from_noirq)
+		return 0;
+
+	/* This is analogous to what happens in acpi_lpss_resume_noirq(). */
+	return acpi_lpss_do_restore_early(dev);
+}
 #endif /* CONFIG_PM_SLEEP */
 
 static int acpi_lpss_runtime_suspend(struct device *dev)
@@ -1137,14 +1183,11 @@ static struct dev_pm_domain acpi_lpss_pm_domain = {
 		.resume_noirq = acpi_lpss_resume_noirq,
 		.resume_early = acpi_lpss_resume_early,
 		.freeze = acpi_subsys_freeze,
-		.freeze_late = acpi_subsys_freeze_late,
-		.freeze_noirq = acpi_subsys_freeze_noirq,
-		.thaw_noirq = acpi_subsys_thaw_noirq,
 		.poweroff = acpi_subsys_suspend,
 		.poweroff_late = acpi_lpss_suspend_late,
 		.poweroff_noirq = acpi_lpss_suspend_noirq,
-		.restore_noirq = acpi_lpss_resume_noirq,
-		.restore_early = acpi_lpss_resume_early,
+		.restore_noirq = acpi_lpss_restore_noirq,
+		.restore_early = acpi_lpss_restore_early,
 #endif
 		.runtime_suspend = acpi_lpss_runtime_suspend,
 		.runtime_resume = acpi_lpss_runtime_resume,
diff --git a/drivers/acpi/device_pm.c b/drivers/acpi/device_pm.c
index 44172eb18d6e..52fc9042a107 100644
--- a/drivers/acpi/device_pm.c
+++ b/drivers/acpi/device_pm.c
@@ -1073,7 +1073,7 @@ EXPORT_SYMBOL_GPL(acpi_subsys_suspend_noirq);
  * acpi_subsys_resume_noirq - Run the device driver's "noirq" resume callback.
  * @dev: Device to handle.
  */
-int acpi_subsys_resume_noirq(struct device *dev)
+static int acpi_subsys_resume_noirq(struct device *dev)
 {
 	if (dev_pm_may_skip_resume(dev))
 		return 0;
@@ -1088,7 +1088,6 @@ int acpi_subsys_resume_noirq(struct device *dev)
 
 	return pm_generic_resume_noirq(dev);
 }
-EXPORT_SYMBOL_GPL(acpi_subsys_resume_noirq);
 
 /**
  * acpi_subsys_resume_early - Resume device using ACPI.
@@ -1098,12 +1097,11 @@ EXPORT_SYMBOL_GPL(acpi_subsys_resume_noirq);
  * generic early resume procedure for it during system transition into the
  * working state.
  */
-int acpi_subsys_resume_early(struct device *dev)
+static int acpi_subsys_resume_early(struct device *dev)
 {
 	int ret = acpi_dev_resume(dev);
 	return ret ? ret : pm_generic_resume_early(dev);
 }
-EXPORT_SYMBOL_GPL(acpi_subsys_resume_early);
 
 /**
  * acpi_subsys_freeze - Run the device driver's freeze callback.
@@ -1126,52 +1124,15 @@ int acpi_subsys_freeze(struct device *dev)
 EXPORT_SYMBOL_GPL(acpi_subsys_freeze);
 
 /**
- * acpi_subsys_freeze_late - Run the device driver's "late" freeze callback.
- * @dev: Device to handle.
- */
-int acpi_subsys_freeze_late(struct device *dev)
-{
-
-	if (dev_pm_smart_suspend_and_suspended(dev))
-		return 0;
-
-	return pm_generic_freeze_late(dev);
-}
-EXPORT_SYMBOL_GPL(acpi_subsys_freeze_late);
-
-/**
- * acpi_subsys_freeze_noirq - Run the device driver's "noirq" freeze callback.
- * @dev: Device to handle.
- */
-int acpi_subsys_freeze_noirq(struct device *dev)
-{
-
-	if (dev_pm_smart_suspend_and_suspended(dev))
-		return 0;
-
-	return pm_generic_freeze_noirq(dev);
-}
-EXPORT_SYMBOL_GPL(acpi_subsys_freeze_noirq);
-
-/**
- * acpi_subsys_thaw_noirq - Run the device driver's "noirq" thaw callback.
- * @dev: Device to handle.
+ * acpi_subsys_restore_early - Restore device using ACPI.
+ * @dev: Device to restore.
  */
-int acpi_subsys_thaw_noirq(struct device *dev)
+int acpi_subsys_restore_early(struct device *dev)
 {
-	/*
-	 * If the device is in runtime suspend, the "thaw" code may not work
-	 * correctly with it, so skip the driver callback and make the PM core
-	 * skip all of the subsequent "thaw" callbacks for the device.
-	 */
-	if (dev_pm_smart_suspend_and_suspended(dev)) {
-		dev_pm_skip_next_resume_phases(dev);
-		return 0;
-	}
-
-	return pm_generic_thaw_noirq(dev);
+	int ret = acpi_dev_resume(dev);
+	return ret ? ret : pm_generic_restore_early(dev);
 }
-EXPORT_SYMBOL_GPL(acpi_subsys_thaw_noirq);
+EXPORT_SYMBOL_GPL(acpi_subsys_restore_early);
 #endif /* CONFIG_PM_SLEEP */
 
 static struct dev_pm_domain acpi_general_pm_domain = {
@@ -1187,14 +1148,10 @@ static struct dev_pm_domain acpi_general_pm_domain = {
 		.resume_noirq = acpi_subsys_resume_noirq,
 		.resume_early = acpi_subsys_resume_early,
 		.freeze = acpi_subsys_freeze,
-		.freeze_late = acpi_subsys_freeze_late,
-		.freeze_noirq = acpi_subsys_freeze_noirq,
-		.thaw_noirq = acpi_subsys_thaw_noirq,
 		.poweroff = acpi_subsys_suspend,
 		.poweroff_late = acpi_subsys_suspend_late,
 		.poweroff_noirq = acpi_subsys_suspend_noirq,
-		.restore_noirq = acpi_subsys_resume_noirq,
-		.restore_early = acpi_subsys_resume_early,
+		.restore_early = acpi_subsys_restore_early,
 #endif
 	},
 };
diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index d315d86844e4..ea7415440901 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -918,26 +918,16 @@ int acpi_subsys_prepare(struct device *dev);
 void acpi_subsys_complete(struct device *dev);
 int acpi_subsys_suspend_late(struct device *dev);
 int acpi_subsys_suspend_noirq(struct device *dev);
-int acpi_subsys_resume_noirq(struct device *dev);
-int acpi_subsys_resume_early(struct device *dev);
 int acpi_subsys_suspend(struct device *dev);
 int acpi_subsys_freeze(struct device *dev);
-int acpi_subsys_freeze_late(struct device *dev);
-int acpi_subsys_freeze_noirq(struct device *dev);
-int acpi_subsys_thaw_noirq(struct device *dev);
 #else
 static inline int acpi_dev_resume_early(struct device *dev) { return 0; }
 static inline int acpi_subsys_prepare(struct device *dev) { return 0; }
 static inline void acpi_subsys_complete(struct device *dev) {}
 static inline int acpi_subsys_suspend_late(struct device *dev) { return 0; }
 static inline int acpi_subsys_suspend_noirq(struct device *dev) { return 0; }
-static inline int acpi_subsys_resume_noirq(struct device *dev) { return 0; }
-static inline int acpi_subsys_resume_early(struct device *dev) { return 0; }
 static inline int acpi_subsys_suspend(struct device *dev) { return 0; }
 static inline int acpi_subsys_freeze(struct device *dev) { return 0; }
-static inline int acpi_subsys_freeze_late(struct device *dev) { return 0; }
-static inline int acpi_subsys_freeze_noirq(struct device *dev) { return 0; }
-static inline int acpi_subsys_thaw_noirq(struct device *dev) { return 0; }
 #endif
 
 #ifdef CONFIG_ACPI
-- 
cgit v1.2.3


From c95b7595f85c688d5c569ddbbd6ab6a4bdae2f36 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Mon, 1 Jul 2019 12:54:29 +0200
Subject: ACPI: PM: Introduce "poweroff" callbacks for ACPI PM domain and LPSS

In general, it is not correct to call pm_generic_suspend(),
pm_generic_suspend_late() and pm_generic_suspend_noirq() during the
hibernation's "poweroff" transition, because device drivers may
provide special callbacks to be invoked then and the wrappers in
question cause system suspend callbacks to be run.  Unfortunately,
that happens in the ACPI PM domain and ACPI LPSS.

To address this potential issue, introduce "poweroff" callbacks
for the ACPI PM domain and ACPI LPSS that will use
pm_generic_poweroff(), pm_generic_poweroff_late() and
pm_generic_poweroff_noirq() as appropriate.

Fixes: 05087360fd7a (ACPI / PM: Take SMART_SUSPEND driver flag into account)
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Reviewed-by: Mika Westerberg <mika.westerberg@linux.intel.com>
Reviewed-by: Hans de Goede <hdegoede@redhat.com>
---
 drivers/acpi/acpi_lpss.c | 50 ++++++++++++++++++++++++++++++++++++++---
 drivers/acpi/device_pm.c | 58 +++++++++++++++++++++++++++++++++++++++++++++---
 include/linux/acpi.h     |  2 ++
 3 files changed, 104 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/acpi/acpi_lpss.c b/drivers/acpi/acpi_lpss.c
index 8ea836857691..a7396e18f168 100644
--- a/drivers/acpi/acpi_lpss.c
+++ b/drivers/acpi/acpi_lpss.c
@@ -1064,6 +1064,13 @@ static int acpi_lpss_suspend_noirq(struct device *dev)
 	int ret;
 
 	if (pdata->dev_desc->resume_from_noirq) {
+		/*
+		 * The driver's ->suspend_late callback will be invoked by
+		 * acpi_lpss_do_suspend_late(), with the assumption that the
+		 * driver really wanted to run that code in ->suspend_noirq, but
+		 * it could not run after acpi_dev_suspend() and the driver
+		 * expected the latter to be called in the "late" phase.
+		 */
 		ret = acpi_lpss_do_suspend_late(dev);
 		if (ret)
 			return ret;
@@ -1150,6 +1157,43 @@ static int acpi_lpss_restore_noirq(struct device *dev)
 	/* This is analogous to what happens in acpi_lpss_resume_noirq(). */
 	return acpi_lpss_do_restore_early(dev);
 }
+
+static int acpi_lpss_do_poweroff_late(struct device *dev)
+{
+	int ret = pm_generic_poweroff_late(dev);
+
+	return ret ? ret : acpi_lpss_suspend(dev, device_may_wakeup(dev));
+}
+
+static int acpi_lpss_poweroff_late(struct device *dev)
+{
+	struct lpss_private_data *pdata = acpi_driver_data(ACPI_COMPANION(dev));
+
+	if (dev_pm_smart_suspend_and_suspended(dev))
+		return 0;
+
+	if (pdata->dev_desc->resume_from_noirq)
+		return 0;
+
+	return acpi_lpss_do_poweroff_late(dev);
+}
+
+static int acpi_lpss_poweroff_noirq(struct device *dev)
+{
+	struct lpss_private_data *pdata = acpi_driver_data(ACPI_COMPANION(dev));
+
+	if (dev_pm_smart_suspend_and_suspended(dev))
+		return 0;
+
+	if (pdata->dev_desc->resume_from_noirq) {
+		/* This is analogous to the acpi_lpss_suspend_noirq() case. */
+		int ret = acpi_lpss_do_poweroff_late(dev);
+		if (ret)
+			return ret;
+	}
+
+	return pm_generic_poweroff_noirq(dev);
+}
 #endif /* CONFIG_PM_SLEEP */
 
 static int acpi_lpss_runtime_suspend(struct device *dev)
@@ -1183,9 +1227,9 @@ static struct dev_pm_domain acpi_lpss_pm_domain = {
 		.resume_noirq = acpi_lpss_resume_noirq,
 		.resume_early = acpi_lpss_resume_early,
 		.freeze = acpi_subsys_freeze,
-		.poweroff = acpi_subsys_suspend,
-		.poweroff_late = acpi_lpss_suspend_late,
-		.poweroff_noirq = acpi_lpss_suspend_noirq,
+		.poweroff = acpi_subsys_poweroff,
+		.poweroff_late = acpi_lpss_poweroff_late,
+		.poweroff_noirq = acpi_lpss_poweroff_noirq,
 		.restore_noirq = acpi_lpss_restore_noirq,
 		.restore_early = acpi_lpss_restore_early,
 #endif
diff --git a/drivers/acpi/device_pm.c b/drivers/acpi/device_pm.c
index 52fc9042a107..6a9d41c44b70 100644
--- a/drivers/acpi/device_pm.c
+++ b/drivers/acpi/device_pm.c
@@ -1133,6 +1133,58 @@ int acpi_subsys_restore_early(struct device *dev)
 	return ret ? ret : pm_generic_restore_early(dev);
 }
 EXPORT_SYMBOL_GPL(acpi_subsys_restore_early);
+
+/**
+ * acpi_subsys_poweroff - Run the device driver's poweroff callback.
+ * @dev: Device to handle.
+ *
+ * Follow PCI and resume devices from runtime suspend before running their
+ * system poweroff callbacks, unless the driver can cope with runtime-suspended
+ * devices during system suspend and there are no ACPI-specific reasons for
+ * resuming them.
+ */
+int acpi_subsys_poweroff(struct device *dev)
+{
+	if (!dev_pm_test_driver_flags(dev, DPM_FLAG_SMART_SUSPEND) ||
+	    acpi_dev_needs_resume(dev, ACPI_COMPANION(dev)))
+		pm_runtime_resume(dev);
+
+	return pm_generic_poweroff(dev);
+}
+EXPORT_SYMBOL_GPL(acpi_subsys_poweroff);
+
+/**
+ * acpi_subsys_poweroff_late - Run the device driver's "late" poweroff callback.
+ * @dev: Device to handle.
+ *
+ * Carry out the generic late poweroff procedure for @dev and use ACPI to put
+ * it into a low-power state during system transition into a sleep state.
+ */
+static int acpi_subsys_poweroff_late(struct device *dev)
+{
+	int ret;
+
+	if (dev_pm_smart_suspend_and_suspended(dev))
+		return 0;
+
+	ret = pm_generic_poweroff_late(dev);
+	if (ret)
+		return ret;
+
+	return acpi_dev_suspend(dev, device_may_wakeup(dev));
+}
+
+/**
+ * acpi_subsys_poweroff_noirq - Run the driver's "noirq" poweroff callback.
+ * @dev: Device to suspend.
+ */
+static int acpi_subsys_poweroff_noirq(struct device *dev)
+{
+	if (dev_pm_smart_suspend_and_suspended(dev))
+		return 0;
+
+	return pm_generic_poweroff_noirq(dev);
+}
 #endif /* CONFIG_PM_SLEEP */
 
 static struct dev_pm_domain acpi_general_pm_domain = {
@@ -1148,9 +1200,9 @@ static struct dev_pm_domain acpi_general_pm_domain = {
 		.resume_noirq = acpi_subsys_resume_noirq,
 		.resume_early = acpi_subsys_resume_early,
 		.freeze = acpi_subsys_freeze,
-		.poweroff = acpi_subsys_suspend,
-		.poweroff_late = acpi_subsys_suspend_late,
-		.poweroff_noirq = acpi_subsys_suspend_noirq,
+		.poweroff = acpi_subsys_poweroff,
+		.poweroff_late = acpi_subsys_poweroff_late,
+		.poweroff_noirq = acpi_subsys_poweroff_noirq,
 		.restore_early = acpi_subsys_restore_early,
 #endif
 	},
diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index ea7415440901..22840633c28c 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -920,6 +920,7 @@ int acpi_subsys_suspend_late(struct device *dev);
 int acpi_subsys_suspend_noirq(struct device *dev);
 int acpi_subsys_suspend(struct device *dev);
 int acpi_subsys_freeze(struct device *dev);
+int acpi_subsys_poweroff(struct device *dev);
 #else
 static inline int acpi_dev_resume_early(struct device *dev) { return 0; }
 static inline int acpi_subsys_prepare(struct device *dev) { return 0; }
@@ -928,6 +929,7 @@ static inline int acpi_subsys_suspend_late(struct device *dev) { return 0; }
 static inline int acpi_subsys_suspend_noirq(struct device *dev) { return 0; }
 static inline int acpi_subsys_suspend(struct device *dev) { return 0; }
 static inline int acpi_subsys_freeze(struct device *dev) { return 0; }
+static inline int acpi_subsys_poweroff(struct device *dev) { return 0; }
 #endif
 
 #ifdef CONFIG_ACPI
-- 
cgit v1.2.3


From 99465f12babd4f3d5659b6c147bb5b9976dfe033 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Mon, 1 Jul 2019 12:55:43 +0200
Subject: ACPI: PM: Drop unused function and function header

Remove a leftover function header and a static inline stub with no
users from the ACPI header file.

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Reviewed-by: Mika Westerberg <mika.westerberg@linux.intel.com>
Reviewed-by: Hans de Goede <hdegoede@redhat.com>
---
 include/linux/acpi.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index 22840633c28c..cbbe45e408d9 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -913,7 +913,6 @@ static inline int acpi_dev_pm_attach(struct device *dev, bool power_on)
 #endif
 
 #if defined(CONFIG_ACPI) && defined(CONFIG_PM_SLEEP)
-int acpi_dev_suspend_late(struct device *dev);
 int acpi_subsys_prepare(struct device *dev);
 void acpi_subsys_complete(struct device *dev);
 int acpi_subsys_suspend_late(struct device *dev);
@@ -922,7 +921,6 @@ int acpi_subsys_suspend(struct device *dev);
 int acpi_subsys_freeze(struct device *dev);
 int acpi_subsys_poweroff(struct device *dev);
 #else
-static inline int acpi_dev_resume_early(struct device *dev) { return 0; }
 static inline int acpi_subsys_prepare(struct device *dev) { return 0; }
 static inline void acpi_subsys_complete(struct device *dev) {}
 static inline int acpi_subsys_suspend_late(struct device *dev) { return 0; }
-- 
cgit v1.2.3


From 3db1b636c07e15ff7410db782832dc2e7ffd2bce Mon Sep 17 00:00:00 2001
From: Pawel Laszczak <pawell@cadence.com>
Date: Tue, 2 Jul 2019 14:37:58 +0100
Subject: usb:gadget Separated decoding functions from dwc3 driver.

Patch moves some decoding functions from drivers/usb/dwc3/debug.h to the
new drivers/usb/gadget/debug.c file. The moved functions include:
    dwc3_decode_get_status
    dwc3_decode_set_clear_feature
    dwc3_decode_set_address
    dwc3_decode_get_set_descriptor
    dwc3_decode_get_configuration
    dwc3_decode_set_configuration
    dwc3_decode_get_intf
    dwc3_decode_set_intf
    dwc3_decode_synch_frame
    dwc3_decode_set_sel
    dwc3_decode_set_isoch_delay
    dwc3_decode_ctrl

These functions are also used by the newly introduced cdns3 driver.

All function prefixes were changed from dwc3 to usb.
Also, the functions' parameter names have been expanded to match the names
of the fields in the standard SETUP packet.
Additionally, the patch adds the usb_decode_ctrl() declaration to the
include/linux/usb/gadget.h file.

Signed-off-by: Pawel Laszczak <pawell@cadence.com>
Signed-off-by: Felipe Balbi <felipe.balbi@linux.intel.com>
---
 drivers/usb/dwc3/debug.h    | 252 -----------------------------------------
 drivers/usb/dwc3/trace.h    |   2 +-
 drivers/usb/gadget/Makefile |   1 +
 drivers/usb/gadget/debug.c  | 268 ++++++++++++++++++++++++++++++++++++++++++++
 include/linux/usb/gadget.h  |  26 +++++
 5 files changed, 296 insertions(+), 253 deletions(-)
 create mode 100644 drivers/usb/gadget/debug.c

(limited to 'include/linux')

diff --git a/drivers/usb/dwc3/debug.h b/drivers/usb/dwc3/debug.h
index 068259fdfb0c..9baabed87d61 100644
--- a/drivers/usb/dwc3/debug.h
+++ b/drivers/usb/dwc3/debug.h
@@ -246,258 +246,6 @@ static inline const char *dwc3_gadget_event_string(char *str, size_t size,
 	return str;
 }
 
-static inline void dwc3_decode_get_status(__u8 t, __u16 i, __u16 l, char *str,
-		size_t size)
-{
-	switch (t & USB_RECIP_MASK) {
-	case USB_RECIP_DEVICE:
-		snprintf(str, size, "Get Device Status(Length = %d)", l);
-		break;
-	case USB_RECIP_INTERFACE:
-		snprintf(str, size, "Get Interface Status(Intf = %d, Length = %d)",
-				i, l);
-		break;
-	case USB_RECIP_ENDPOINT:
-		snprintf(str, size, "Get Endpoint Status(ep%d%s)",
-			i & ~USB_DIR_IN,
-			i & USB_DIR_IN ? "in" : "out");
-		break;
-	}
-}
-
-static inline void dwc3_decode_set_clear_feature(__u8 t, __u8 b, __u16 v,
-		__u16 i, char *str, size_t size)
-{
-	switch (t & USB_RECIP_MASK) {
-	case USB_RECIP_DEVICE:
-		snprintf(str, size, "%s Device Feature(%s%s)",
-			b == USB_REQ_CLEAR_FEATURE ? "Clear" : "Set",
-			({char *s;
-				switch (v) {
-				case USB_DEVICE_SELF_POWERED:
-					s = "Self Powered";
-					break;
-				case USB_DEVICE_REMOTE_WAKEUP:
-					s = "Remote Wakeup";
-					break;
-				case USB_DEVICE_TEST_MODE:
-					s = "Test Mode";
-					break;
-				case USB_DEVICE_U1_ENABLE:
-					s = "U1 Enable";
-					break;
-				case USB_DEVICE_U2_ENABLE:
-					s = "U2 Enable";
-					break;
-				case USB_DEVICE_LTM_ENABLE:
-					s = "LTM Enable";
-					break;
-				default:
-					s = "UNKNOWN";
-				} s; }),
-			v == USB_DEVICE_TEST_MODE ?
-			({ char *s;
-				switch (i) {
-				case TEST_J:
-					s = ": TEST_J";
-					break;
-				case TEST_K:
-					s = ": TEST_K";
-					break;
-				case TEST_SE0_NAK:
-					s = ": TEST_SE0_NAK";
-					break;
-				case TEST_PACKET:
-					s = ": TEST_PACKET";
-					break;
-				case TEST_FORCE_EN:
-					s = ": TEST_FORCE_EN";
-					break;
-				default:
-					s = ": UNKNOWN";
-				} s; }) : "");
-		break;
-	case USB_RECIP_INTERFACE:
-		snprintf(str, size, "%s Interface Feature(%s)",
-			b == USB_REQ_CLEAR_FEATURE ? "Clear" : "Set",
-			v == USB_INTRF_FUNC_SUSPEND ?
-			"Function Suspend" : "UNKNOWN");
-		break;
-	case USB_RECIP_ENDPOINT:
-		snprintf(str, size, "%s Endpoint Feature(%s ep%d%s)",
-			b == USB_REQ_CLEAR_FEATURE ? "Clear" : "Set",
-			v == USB_ENDPOINT_HALT ? "Halt" : "UNKNOWN",
-			i & ~USB_DIR_IN,
-			i & USB_DIR_IN ? "in" : "out");
-		break;
-	}
-}
-
-static inline void dwc3_decode_set_address(__u16 v, char *str, size_t size)
-{
-	snprintf(str, size, "Set Address(Addr = %02x)", v);
-}
-
-static inline void dwc3_decode_get_set_descriptor(__u8 t, __u8 b, __u16 v,
-		__u16 i, __u16 l, char *str, size_t size)
-{
-	snprintf(str, size, "%s %s Descriptor(Index = %d, Length = %d)",
-		b == USB_REQ_GET_DESCRIPTOR ? "Get" : "Set",
-		({ char *s;
-			switch (v >> 8) {
-			case USB_DT_DEVICE:
-				s = "Device";
-				break;
-			case USB_DT_CONFIG:
-				s = "Configuration";
-				break;
-			case USB_DT_STRING:
-				s = "String";
-				break;
-			case USB_DT_INTERFACE:
-				s = "Interface";
-				break;
-			case USB_DT_ENDPOINT:
-				s = "Endpoint";
-				break;
-			case USB_DT_DEVICE_QUALIFIER:
-				s = "Device Qualifier";
-				break;
-			case USB_DT_OTHER_SPEED_CONFIG:
-				s = "Other Speed Config";
-				break;
-			case USB_DT_INTERFACE_POWER:
-				s = "Interface Power";
-				break;
-			case USB_DT_OTG:
-				s = "OTG";
-				break;
-			case USB_DT_DEBUG:
-				s = "Debug";
-				break;
-			case USB_DT_INTERFACE_ASSOCIATION:
-				s = "Interface Association";
-				break;
-			case USB_DT_BOS:
-				s = "BOS";
-				break;
-			case USB_DT_DEVICE_CAPABILITY:
-				s = "Device Capability";
-				break;
-			case USB_DT_PIPE_USAGE:
-				s = "Pipe Usage";
-				break;
-			case USB_DT_SS_ENDPOINT_COMP:
-				s = "SS Endpoint Companion";
-				break;
-			case USB_DT_SSP_ISOC_ENDPOINT_COMP:
-				s = "SSP Isochronous Endpoint Companion";
-				break;
-			default:
-				s = "UNKNOWN";
-				break;
-			} s; }), v & 0xff, l);
-}
-
-
-static inline void dwc3_decode_get_configuration(__u16 l, char *str,
-		size_t size)
-{
-	snprintf(str, size, "Get Configuration(Length = %d)", l);
-}
-
-static inline void dwc3_decode_set_configuration(__u8 v, char *str, size_t size)
-{
-	snprintf(str, size, "Set Configuration(Config = %d)", v);
-}
-
-static inline void dwc3_decode_get_intf(__u16 i, __u16 l, char *str,
-		size_t size)
-{
-	snprintf(str, size, "Get Interface(Intf = %d, Length = %d)", i, l);
-}
-
-static inline void dwc3_decode_set_intf(__u8 v, __u16 i, char *str, size_t size)
-{
-	snprintf(str, size, "Set Interface(Intf = %d, Alt.Setting = %d)", i, v);
-}
-
-static inline void dwc3_decode_synch_frame(__u16 i, __u16 l, char *str,
-		size_t size)
-{
-	snprintf(str, size, "Synch Frame(Endpoint = %d, Length = %d)", i, l);
-}
-
-static inline void dwc3_decode_set_sel(__u16 l, char *str, size_t size)
-{
-	snprintf(str, size, "Set SEL(Length = %d)", l);
-}
-
-static inline void dwc3_decode_set_isoch_delay(__u8 v, char *str, size_t size)
-{
-	snprintf(str, size, "Set Isochronous Delay(Delay = %d ns)", v);
-}
-
-/**
- * dwc3_decode_ctrl - returns a string represetion of ctrl request
- */
-static inline const char *dwc3_decode_ctrl(char *str, size_t size,
-		__u8 bRequestType, __u8 bRequest, __u16 wValue, __u16 wIndex,
-		__u16 wLength)
-{
-	switch (bRequest) {
-	case USB_REQ_GET_STATUS:
-		dwc3_decode_get_status(bRequestType, wIndex, wLength, str,
-				size);
-		break;
-	case USB_REQ_CLEAR_FEATURE:
-	case USB_REQ_SET_FEATURE:
-		dwc3_decode_set_clear_feature(bRequestType, bRequest, wValue,
-				wIndex, str, size);
-		break;
-	case USB_REQ_SET_ADDRESS:
-		dwc3_decode_set_address(wValue, str, size);
-		break;
-	case USB_REQ_GET_DESCRIPTOR:
-	case USB_REQ_SET_DESCRIPTOR:
-		dwc3_decode_get_set_descriptor(bRequestType, bRequest, wValue,
-				wIndex, wLength, str, size);
-		break;
-	case USB_REQ_GET_CONFIGURATION:
-		dwc3_decode_get_configuration(wLength, str, size);
-		break;
-	case USB_REQ_SET_CONFIGURATION:
-		dwc3_decode_set_configuration(wValue, str, size);
-		break;
-	case USB_REQ_GET_INTERFACE:
-		dwc3_decode_get_intf(wIndex, wLength, str, size);
-		break;
-	case USB_REQ_SET_INTERFACE:
-		dwc3_decode_set_intf(wValue, wIndex, str, size);
-		break;
-	case USB_REQ_SYNCH_FRAME:
-		dwc3_decode_synch_frame(wIndex, wLength, str, size);
-		break;
-	case USB_REQ_SET_SEL:
-		dwc3_decode_set_sel(wLength, str, size);
-		break;
-	case USB_REQ_SET_ISOCH_DELAY:
-		dwc3_decode_set_isoch_delay(wValue, str, size);
-		break;
-	default:
-		snprintf(str, size, "%02x %02x %02x %02x %02x %02x %02x %02x",
-			bRequestType, bRequest,
-			cpu_to_le16(wValue) & 0xff,
-			cpu_to_le16(wValue) >> 8,
-			cpu_to_le16(wIndex) & 0xff,
-			cpu_to_le16(wIndex) >> 8,
-			cpu_to_le16(wLength) & 0xff,
-			cpu_to_le16(wLength) >> 8);
-	}
-
-	return str;
-}
-
 /**
  * dwc3_ep_event_string - returns event name
  * @event: then event code
diff --git a/drivers/usb/dwc3/trace.h b/drivers/usb/dwc3/trace.h
index 818a63da1a44..9edff17111f7 100644
--- a/drivers/usb/dwc3/trace.h
+++ b/drivers/usb/dwc3/trace.h
@@ -86,7 +86,7 @@ DECLARE_EVENT_CLASS(dwc3_log_ctrl,
 		__entry->wIndex = le16_to_cpu(ctrl->wIndex);
 		__entry->wLength = le16_to_cpu(ctrl->wLength);
 	),
-	TP_printk("%s", dwc3_decode_ctrl(__get_str(str), DWC3_MSG_MAX,
+	TP_printk("%s", usb_decode_ctrl(__get_str(str), DWC3_MSG_MAX,
 					__entry->bRequestType,
 					__entry->bRequest, __entry->wValue,
 					__entry->wIndex, __entry->wLength)
diff --git a/drivers/usb/gadget/Makefile b/drivers/usb/gadget/Makefile
index 130dad7130b6..500a5a592abe 100644
--- a/drivers/usb/gadget/Makefile
+++ b/drivers/usb/gadget/Makefile
@@ -9,5 +9,6 @@ ccflags-y				+= -I$(srctree)/drivers/usb/gadget/udc
 obj-$(CONFIG_USB_LIBCOMPOSITE)	+= libcomposite.o
 libcomposite-y			:= usbstring.o config.o epautoconf.o
 libcomposite-y			+= composite.o functions.o configfs.o u_f.o
+libcomposite-y			+= debug.o
 
 obj-$(CONFIG_USB_GADGET)	+= udc/ function/ legacy/
diff --git a/drivers/usb/gadget/debug.c b/drivers/usb/gadget/debug.c
new file mode 100644
index 000000000000..d5a469bc67a3
--- /dev/null
+++ b/drivers/usb/gadget/debug.c
@@ -0,0 +1,268 @@
+// SPDX-License-Identifier: GPL-2.0
+/**
+ * Common USB debugging functions
+ *
+ * Copyright (C) 2010-2011 Texas Instruments Incorporated - http://www.ti.com
+ *
+ * Authors: Felipe Balbi <balbi@ti.com>,
+ *	    Sebastian Andrzej Siewior <bigeasy@linutronix.de>
+ */
+
+#include <linux/usb/ch9.h>
+
+static void usb_decode_get_status(__u8 bRequestType, __u16 wIndex,
+				  __u16 wLength, char *str, size_t size)
+{
+	switch (bRequestType & USB_RECIP_MASK) {
+	case USB_RECIP_DEVICE:
+		snprintf(str, size, "Get Device Status(Length = %d)", wLength);
+		break;
+	case USB_RECIP_INTERFACE:
+		snprintf(str, size,
+			 "Get Interface Status(Intf = %d, Length = %d)",
+			 wIndex, wLength);
+		break;
+	case USB_RECIP_ENDPOINT:
+		snprintf(str, size, "Get Endpoint Status(ep%d%s)",
+			 wIndex & ~USB_DIR_IN,
+			 wIndex & USB_DIR_IN ? "in" : "out");
+		break;
+	}
+}
+
+static void usb_decode_set_clear_feature(__u8 bRequestType, __u8 bRequest,
+					 __u16 wValue, __u16 wIndex,
+					 char *str, size_t size)
+{
+	switch (bRequestType & USB_RECIP_MASK) {
+	case USB_RECIP_DEVICE:
+		snprintf(str, size, "%s Device Feature(%s%s)",
+			 bRequest == USB_REQ_CLEAR_FEATURE ? "Clear" : "Set",
+			 ({char *s;
+				switch (wValue) {
+				case USB_DEVICE_SELF_POWERED:
+					s = "Self Powered";
+					break;
+				case USB_DEVICE_REMOTE_WAKEUP:
+					s = "Remote Wakeup";
+					break;
+				case USB_DEVICE_TEST_MODE:
+					s = "Test Mode";
+					break;
+				case USB_DEVICE_U1_ENABLE:
+					s = "U1 Enable";
+					break;
+				case USB_DEVICE_U2_ENABLE:
+					s = "U2 Enable";
+					break;
+				case USB_DEVICE_LTM_ENABLE:
+					s = "LTM Enable";
+					break;
+				default:
+					s = "UNKNOWN";
+				} s; }),
+			 wValue == USB_DEVICE_TEST_MODE ?
+			 ({ char *s;
+				switch (wIndex) {
+				case TEST_J:
+					s = ": TEST_J";
+					break;
+				case TEST_K:
+					s = ": TEST_K";
+					break;
+				case TEST_SE0_NAK:
+					s = ": TEST_SE0_NAK";
+					break;
+				case TEST_PACKET:
+					s = ": TEST_PACKET";
+					break;
+				case TEST_FORCE_EN:
+					s = ": TEST_FORCE_EN";
+					break;
+				default:
+					s = ": UNKNOWN";
+				} s; }) : "");
+		break;
+	case USB_RECIP_INTERFACE:
+		snprintf(str, size, "%s Interface Feature(%s)",
+			 bRequest == USB_REQ_CLEAR_FEATURE ? "Clear" : "Set",
+			 wValue == USB_INTRF_FUNC_SUSPEND ?
+			 "Function Suspend" : "UNKNOWN");
+		break;
+	case USB_RECIP_ENDPOINT:
+		snprintf(str, size, "%s Endpoint Feature(%s ep%d%s)",
+			 bRequest == USB_REQ_CLEAR_FEATURE ? "Clear" : "Set",
+			 wValue == USB_ENDPOINT_HALT ? "Halt" : "UNKNOWN",
+			 wIndex & ~USB_DIR_IN,
+			 wIndex & USB_DIR_IN ? "in" : "out");
+		break;
+	}
+}
+
+static void usb_decode_set_address(__u16 wValue, char *str, size_t size)
+{
+	snprintf(str, size, "Set Address(Addr = %02x)", wValue);
+}
+
+static void usb_decode_get_set_descriptor(__u8 bRequestType, __u8 bRequest,
+					  __u16 wValue, __u16 wIndex,
+					  __u16 wLength, char *str, size_t size)
+{
+	snprintf(str, size, "%s %s Descriptor(Index = %d, Length = %d)",
+		 bRequest == USB_REQ_GET_DESCRIPTOR ? "Get" : "Set",
+		 ({ char *s;
+			switch (wValue >> 8) {
+			case USB_DT_DEVICE:
+				s = "Device";
+				break;
+			case USB_DT_CONFIG:
+				s = "Configuration";
+				break;
+			case USB_DT_STRING:
+				s = "String";
+				break;
+			case USB_DT_INTERFACE:
+				s = "Interface";
+				break;
+			case USB_DT_ENDPOINT:
+				s = "Endpoint";
+				break;
+			case USB_DT_DEVICE_QUALIFIER:
+				s = "Device Qualifier";
+				break;
+			case USB_DT_OTHER_SPEED_CONFIG:
+				s = "Other Speed Config";
+				break;
+			case USB_DT_INTERFACE_POWER:
+				s = "Interface Power";
+				break;
+			case USB_DT_OTG:
+				s = "OTG";
+				break;
+			case USB_DT_DEBUG:
+				s = "Debug";
+				break;
+			case USB_DT_INTERFACE_ASSOCIATION:
+				s = "Interface Association";
+				break;
+			case USB_DT_BOS:
+				s = "BOS";
+				break;
+			case USB_DT_DEVICE_CAPABILITY:
+				s = "Device Capability";
+				break;
+			case USB_DT_PIPE_USAGE:
+				s = "Pipe Usage";
+				break;
+			case USB_DT_SS_ENDPOINT_COMP:
+				s = "SS Endpoint Companion";
+				break;
+			case USB_DT_SSP_ISOC_ENDPOINT_COMP:
+				s = "SSP Isochronous Endpoint Companion";
+				break;
+			default:
+				s = "UNKNOWN";
+				break;
+			} s; }), wValue & 0xff, wLength);
+}
+
+static void usb_decode_get_configuration(__u16 wLength, char *str, size_t size)
+{
+	snprintf(str, size, "Get Configuration(Length = %d)", wLength);
+}
+
+static void usb_decode_set_configuration(__u8 wValue, char *str, size_t size)
+{
+	snprintf(str, size, "Set Configuration(Config = %d)", wValue);
+}
+
+static void usb_decode_get_intf(__u16 wIndex, __u16 wLength, char *str,
+				size_t size)
+{
+	snprintf(str, size, "Get Interface(Intf = %d, Length = %d)",
+		 wIndex, wLength);
+}
+
+static void usb_decode_set_intf(__u8 wValue, __u16 wIndex, char *str,
+				size_t size)
+{
+	snprintf(str, size, "Set Interface(Intf = %d, Alt.Setting = %d)",
+		 wIndex, wValue);
+}
+
+static void usb_decode_synch_frame(__u16 wIndex, __u16 wLength,
+				   char *str, size_t size)
+{
+	snprintf(str, size, "Synch Frame(Endpoint = %d, Length = %d)",
+		 wIndex, wLength);
+}
+
+static void usb_decode_set_sel(__u16 wLength, char *str, size_t size)
+{
+	snprintf(str, size, "Set SEL(Length = %d)", wLength);
+}
+
+static void usb_decode_set_isoch_delay(__u8 wValue, char *str, size_t size)
+{
+	snprintf(str, size, "Set Isochronous Delay(Delay = %d ns)", wValue);
+}
+
+/**
+ * usb_decode_ctrl - returns a string representation of ctrl request
+ */
+const char *usb_decode_ctrl(char *str, size_t size, __u8 bRequestType,
+			    __u8 bRequest, __u16 wValue, __u16 wIndex,
+			    __u16 wLength)
+{
+	switch (bRequest) {
+	case USB_REQ_GET_STATUS:
+		usb_decode_get_status(bRequestType, wIndex, wLength, str, size);
+		break;
+	case USB_REQ_CLEAR_FEATURE:
+	case USB_REQ_SET_FEATURE:
+		usb_decode_set_clear_feature(bRequestType, bRequest, wValue,
+					     wIndex, str, size);
+		break;
+	case USB_REQ_SET_ADDRESS:
+		usb_decode_set_address(wValue, str, size);
+		break;
+	case USB_REQ_GET_DESCRIPTOR:
+	case USB_REQ_SET_DESCRIPTOR:
+		usb_decode_get_set_descriptor(bRequestType, bRequest, wValue,
+					      wIndex, wLength, str, size);
+		break;
+	case USB_REQ_GET_CONFIGURATION:
+		usb_decode_get_configuration(wLength, str, size);
+		break;
+	case USB_REQ_SET_CONFIGURATION:
+		usb_decode_set_configuration(wValue, str, size);
+		break;
+	case USB_REQ_GET_INTERFACE:
+		usb_decode_get_intf(wIndex, wLength, str, size);
+		break;
+	case USB_REQ_SET_INTERFACE:
+		usb_decode_set_intf(wValue, wIndex, str, size);
+		break;
+	case USB_REQ_SYNCH_FRAME:
+		usb_decode_synch_frame(wIndex, wLength, str, size);
+		break;
+	case USB_REQ_SET_SEL:
+		usb_decode_set_sel(wLength, str, size);
+		break;
+	case USB_REQ_SET_ISOCH_DELAY:
+		usb_decode_set_isoch_delay(wValue, str, size);
+		break;
+	default:
+		snprintf(str, size, "%02x %02x %02x %02x %02x %02x %02x %02x",
+			 bRequestType, bRequest,
+			 (u8)(cpu_to_le16(wValue) & 0xff),
+			 (u8)(cpu_to_le16(wValue) >> 8),
+			 (u8)(cpu_to_le16(wIndex) & 0xff),
+			 (u8)(cpu_to_le16(wIndex) >> 8),
+			 (u8)(cpu_to_le16(wLength) & 0xff),
+			 (u8)(cpu_to_le16(wLength) >> 8));
+	}
+
+	return str;
+}
+EXPORT_SYMBOL_GPL(usb_decode_ctrl);
diff --git a/include/linux/usb/gadget.h b/include/linux/usb/gadget.h
index fb19141151d8..42902fcc8696 100644
--- a/include/linux/usb/gadget.h
+++ b/include/linux/usb/gadget.h
@@ -889,4 +889,30 @@ extern void usb_ep_autoconfig_release(struct usb_ep *);
 
 extern void usb_ep_autoconfig_reset(struct usb_gadget *);
 
+/*-------------------------------------------------------------------------*/
+/**
+ * usb_decode_ctrl - Returns human readable representation of control request.
+ * @str: buffer to return a human-readable representation of control request.
+ *       This buffer should have about 200 bytes.
+ * @size: size of str buffer.
+ * @bRequestType: matches the USB bmRequestType field
+ * @bRequest: matches the USB bRequest field
+ * @wValue: matches the USB wValue field (CPU byte order)
+ * @wIndex: matches the USB wIndex field (CPU byte order)
+ * @wLength: matches the USB wLength field (CPU byte order)
+ *
+ * Function returns decoded, formatted and human-readable description of
+ * control request packet.
+ *
+ * The intended usage scenario is tracepoints, so the function returns the
+ * same @str pointer that was passed in. This allows the function to be used
+ * directly in TP_printk.
+ *
+ * Important: wValue, wIndex, wLength parameters before invoking this function
+ * should be processed by le16_to_cpu macro.
+ */
+extern const char *usb_decode_ctrl(char *str, size_t size, __u8 bRequestType,
+			__u8 bRequest, __u16 wValue, __u16 wIndex,
+			__u16 wLength);
+
 #endif /* __LINUX_USB_GADGET_H */
-- 
cgit v1.2.3


From f50dfaf772db187deb562764e7aa3b988d6bc538 Mon Sep 17 00:00:00 2001
From: Linus Walleij <linus.walleij@linaro.org>
Date: Sun, 30 Jun 2019 16:03:02 +0200
Subject: misc: fsa9480: Delete this driver

The FSA9480 has a new driver more appropriately located
in the drivers/extcon subsystem. It is also more complete
and includes device tree support. Delete the old misc
driver.

Cc: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
Reviewed-by: Chanwoo Choi <cw00.choi@samsung.com>
Reviewed-by: Paweł Chmiel <pawel.mikolaj.chmiel@gmail.com>
Link: https://lore.kernel.org/r/20190630140302.16245-1-linus.walleij@linaro.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/misc/Kconfig                  |   9 -
 drivers/misc/Makefile                 |   1 -
 drivers/misc/fsa9480.c                | 547 ----------------------------------
 include/linux/platform_data/fsa9480.h |  24 --
 4 files changed, 581 deletions(-)
 delete mode 100644 drivers/misc/fsa9480.c
 delete mode 100644 include/linux/platform_data/fsa9480.h

(limited to 'include/linux')

diff --git a/drivers/misc/Kconfig b/drivers/misc/Kconfig
index 8110d6a00c86..6abfc8e92fcc 100644
--- a/drivers/misc/Kconfig
+++ b/drivers/misc/Kconfig
@@ -422,15 +422,6 @@ config PCH_PHUB
 	  To compile this driver as a module, choose M here: the module will
 	  be called pch_phub.
 
-config USB_SWITCH_FSA9480
-	tristate "FSA9480 USB Switch"
-	depends on I2C
-	help
-	  The FSA9480 is a USB port accessory detector and switch.
-	  The FSA9480 is fully controlled using I2C and enables USB data,
-	  stereo and mono audio, video, microphone and UART data to use
-	  a common connector port.
-
 config LATTICE_ECP3_CONFIG
 	tristate "Lattice ECP3 FPGA bitstream configuration via SPI"
 	depends on SPI && SYSFS
diff --git a/drivers/misc/Makefile b/drivers/misc/Makefile
index 0cb35466c578..abd8ae249746 100644
--- a/drivers/misc/Makefile
+++ b/drivers/misc/Makefile
@@ -42,7 +42,6 @@ obj-$(CONFIG_VMWARE_BALLOON)	+= vmw_balloon.o
 obj-$(CONFIG_PCH_PHUB)		+= pch_phub.o
 obj-y				+= ti-st/
 obj-y				+= lis3lv02d/
-obj-$(CONFIG_USB_SWITCH_FSA9480) += fsa9480.o
 obj-$(CONFIG_ALTERA_STAPL)	+=altera-stapl/
 obj-$(CONFIG_INTEL_MEI)		+= mei/
 obj-$(CONFIG_VMWARE_VMCI)	+= vmw_vmci/
diff --git a/drivers/misc/fsa9480.c b/drivers/misc/fsa9480.c
deleted file mode 100644
index 4e11807040d3..000000000000
--- a/drivers/misc/fsa9480.c
+++ /dev/null
@@ -1,547 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * fsa9480.c - FSA9480 micro USB switch device driver
- *
- * Copyright (C) 2010 Samsung Electronics
- * Minkyu Kang <mk7.kang@samsung.com>
- * Wonguk Jeong <wonguk.jeong@samsung.com>
- */
-
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/err.h>
-#include <linux/i2c.h>
-#include <linux/platform_data/fsa9480.h>
-#include <linux/irq.h>
-#include <linux/interrupt.h>
-#include <linux/workqueue.h>
-#include <linux/platform_device.h>
-#include <linux/slab.h>
-#include <linux/pm_runtime.h>
-
-/* FSA9480 I2C registers */
-#define FSA9480_REG_DEVID		0x01
-#define FSA9480_REG_CTRL		0x02
-#define FSA9480_REG_INT1		0x03
-#define FSA9480_REG_INT2		0x04
-#define FSA9480_REG_INT1_MASK		0x05
-#define FSA9480_REG_INT2_MASK		0x06
-#define FSA9480_REG_ADC			0x07
-#define FSA9480_REG_TIMING1		0x08
-#define FSA9480_REG_TIMING2		0x09
-#define FSA9480_REG_DEV_T1		0x0a
-#define FSA9480_REG_DEV_T2		0x0b
-#define FSA9480_REG_BTN1		0x0c
-#define FSA9480_REG_BTN2		0x0d
-#define FSA9480_REG_CK			0x0e
-#define FSA9480_REG_CK_INT1		0x0f
-#define FSA9480_REG_CK_INT2		0x10
-#define FSA9480_REG_CK_INTMASK1		0x11
-#define FSA9480_REG_CK_INTMASK2		0x12
-#define FSA9480_REG_MANSW1		0x13
-#define FSA9480_REG_MANSW2		0x14
-
-/* Control */
-#define CON_SWITCH_OPEN		(1 << 4)
-#define CON_RAW_DATA		(1 << 3)
-#define CON_MANUAL_SW		(1 << 2)
-#define CON_WAIT		(1 << 1)
-#define CON_INT_MASK		(1 << 0)
-#define CON_MASK		(CON_SWITCH_OPEN | CON_RAW_DATA | \
-				CON_MANUAL_SW | CON_WAIT)
-
-/* Device Type 1 */
-#define DEV_USB_OTG		(1 << 7)
-#define DEV_DEDICATED_CHG	(1 << 6)
-#define DEV_USB_CHG		(1 << 5)
-#define DEV_CAR_KIT		(1 << 4)
-#define DEV_UART		(1 << 3)
-#define DEV_USB			(1 << 2)
-#define DEV_AUDIO_2		(1 << 1)
-#define DEV_AUDIO_1		(1 << 0)
-
-#define DEV_T1_USB_MASK		(DEV_USB_OTG | DEV_USB)
-#define DEV_T1_UART_MASK	(DEV_UART)
-#define DEV_T1_CHARGER_MASK	(DEV_DEDICATED_CHG | DEV_USB_CHG)
-
-/* Device Type 2 */
-#define DEV_AV			(1 << 6)
-#define DEV_TTY			(1 << 5)
-#define DEV_PPD			(1 << 4)
-#define DEV_JIG_UART_OFF	(1 << 3)
-#define DEV_JIG_UART_ON		(1 << 2)
-#define DEV_JIG_USB_OFF		(1 << 1)
-#define DEV_JIG_USB_ON		(1 << 0)
-
-#define DEV_T2_USB_MASK		(DEV_JIG_USB_OFF | DEV_JIG_USB_ON)
-#define DEV_T2_UART_MASK	(DEV_JIG_UART_OFF | DEV_JIG_UART_ON)
-#define DEV_T2_JIG_MASK		(DEV_JIG_USB_OFF | DEV_JIG_USB_ON | \
-				DEV_JIG_UART_OFF | DEV_JIG_UART_ON)
-
-/*
- * Manual Switch
- * D- [7:5] / D+ [4:2]
- * 000: Open all / 001: USB / 010: AUDIO / 011: UART / 100: V_AUDIO
- */
-#define SW_VAUDIO		((4 << 5) | (4 << 2))
-#define SW_UART			((3 << 5) | (3 << 2))
-#define SW_AUDIO		((2 << 5) | (2 << 2))
-#define SW_DHOST		((1 << 5) | (1 << 2))
-#define SW_AUTO			((0 << 5) | (0 << 2))
-
-/* Interrupt 1 */
-#define INT_DETACH		(1 << 1)
-#define INT_ATTACH		(1 << 0)
-
-struct fsa9480_usbsw {
-	struct i2c_client		*client;
-	struct fsa9480_platform_data	*pdata;
-	int				dev1;
-	int				dev2;
-	int				mansw;
-};
-
-static struct fsa9480_usbsw *chip;
-
-static int fsa9480_write_reg(struct i2c_client *client,
-		int reg, int value)
-{
-	int ret;
-
-	ret = i2c_smbus_write_byte_data(client, reg, value);
-
-	if (ret < 0)
-		dev_err(&client->dev, "%s: err %d\n", __func__, ret);
-
-	return ret;
-}
-
-static int fsa9480_read_reg(struct i2c_client *client, int reg)
-{
-	int ret;
-
-	ret = i2c_smbus_read_byte_data(client, reg);
-
-	if (ret < 0)
-		dev_err(&client->dev, "%s: err %d\n", __func__, ret);
-
-	return ret;
-}
-
-static int fsa9480_read_irq(struct i2c_client *client, int *value)
-{
-	int ret;
-
-	ret = i2c_smbus_read_i2c_block_data(client,
-			FSA9480_REG_INT1, 2, (u8 *)value);
-	*value &= 0xffff;
-
-	if (ret < 0)
-		dev_err(&client->dev, "%s: err %d\n", __func__, ret);
-
-	return ret;
-}
-
-static void fsa9480_set_switch(const char *buf)
-{
-	struct fsa9480_usbsw *usbsw = chip;
-	struct i2c_client *client = usbsw->client;
-	unsigned int value;
-	unsigned int path = 0;
-
-	value = fsa9480_read_reg(client, FSA9480_REG_CTRL);
-
-	if (!strncmp(buf, "VAUDIO", 6)) {
-		path = SW_VAUDIO;
-		value &= ~CON_MANUAL_SW;
-	} else if (!strncmp(buf, "UART", 4)) {
-		path = SW_UART;
-		value &= ~CON_MANUAL_SW;
-	} else if (!strncmp(buf, "AUDIO", 5)) {
-		path = SW_AUDIO;
-		value &= ~CON_MANUAL_SW;
-	} else if (!strncmp(buf, "DHOST", 5)) {
-		path = SW_DHOST;
-		value &= ~CON_MANUAL_SW;
-	} else if (!strncmp(buf, "AUTO", 4)) {
-		path = SW_AUTO;
-		value |= CON_MANUAL_SW;
-	} else {
-		printk(KERN_ERR "Wrong command\n");
-		return;
-	}
-
-	usbsw->mansw = path;
-	fsa9480_write_reg(client, FSA9480_REG_MANSW1, path);
-	fsa9480_write_reg(client, FSA9480_REG_CTRL, value);
-}
-
-static ssize_t fsa9480_get_switch(char *buf)
-{
-	struct fsa9480_usbsw *usbsw = chip;
-	struct i2c_client *client = usbsw->client;
-	unsigned int value;
-
-	value = fsa9480_read_reg(client, FSA9480_REG_MANSW1);
-
-	if (value == SW_VAUDIO)
-		return sprintf(buf, "VAUDIO\n");
-	else if (value == SW_UART)
-		return sprintf(buf, "UART\n");
-	else if (value == SW_AUDIO)
-		return sprintf(buf, "AUDIO\n");
-	else if (value == SW_DHOST)
-		return sprintf(buf, "DHOST\n");
-	else if (value == SW_AUTO)
-		return sprintf(buf, "AUTO\n");
-	else
-		return sprintf(buf, "%x", value);
-}
-
-static ssize_t fsa9480_show_device(struct device *dev,
-				   struct device_attribute *attr,
-				   char *buf)
-{
-	struct fsa9480_usbsw *usbsw = dev_get_drvdata(dev);
-	struct i2c_client *client = usbsw->client;
-	int dev1, dev2;
-
-	dev1 = fsa9480_read_reg(client, FSA9480_REG_DEV_T1);
-	dev2 = fsa9480_read_reg(client, FSA9480_REG_DEV_T2);
-
-	if (!dev1 && !dev2)
-		return sprintf(buf, "NONE\n");
-
-	/* USB */
-	if (dev1 & DEV_T1_USB_MASK || dev2 & DEV_T2_USB_MASK)
-		return sprintf(buf, "USB\n");
-
-	/* UART */
-	if (dev1 & DEV_T1_UART_MASK || dev2 & DEV_T2_UART_MASK)
-		return sprintf(buf, "UART\n");
-
-	/* CHARGER */
-	if (dev1 & DEV_T1_CHARGER_MASK)
-		return sprintf(buf, "CHARGER\n");
-
-	/* JIG */
-	if (dev2 & DEV_T2_JIG_MASK)
-		return sprintf(buf, "JIG\n");
-
-	return sprintf(buf, "UNKNOWN\n");
-}
-
-static ssize_t fsa9480_show_manualsw(struct device *dev,
-		struct device_attribute *attr, char *buf)
-{
-	return fsa9480_get_switch(buf);
-
-}
-
-static ssize_t fsa9480_set_manualsw(struct device *dev,
-				    struct device_attribute *attr,
-				    const char *buf, size_t count)
-{
-	fsa9480_set_switch(buf);
-
-	return count;
-}
-
-static DEVICE_ATTR(device, S_IRUGO, fsa9480_show_device, NULL);
-static DEVICE_ATTR(switch, S_IRUGO | S_IWUSR,
-		fsa9480_show_manualsw, fsa9480_set_manualsw);
-
-static struct attribute *fsa9480_attributes[] = {
-	&dev_attr_device.attr,
-	&dev_attr_switch.attr,
-	NULL
-};
-
-static const struct attribute_group fsa9480_group = {
-	.attrs = fsa9480_attributes,
-};
-
-static void fsa9480_detect_dev(struct fsa9480_usbsw *usbsw, int intr)
-{
-	int val1, val2, ctrl;
-	struct fsa9480_platform_data *pdata = usbsw->pdata;
-	struct i2c_client *client = usbsw->client;
-
-	val1 = fsa9480_read_reg(client, FSA9480_REG_DEV_T1);
-	val2 = fsa9480_read_reg(client, FSA9480_REG_DEV_T2);
-	ctrl = fsa9480_read_reg(client, FSA9480_REG_CTRL);
-
-	dev_info(&client->dev, "intr: 0x%x, dev1: 0x%x, dev2: 0x%x\n",
-			intr, val1, val2);
-
-	if (!intr)
-		goto out;
-
-	if (intr & INT_ATTACH) {	/* Attached */
-		/* USB */
-		if (val1 & DEV_T1_USB_MASK || val2 & DEV_T2_USB_MASK) {
-			if (pdata->usb_cb)
-				pdata->usb_cb(FSA9480_ATTACHED);
-
-			if (usbsw->mansw) {
-				fsa9480_write_reg(client,
-					FSA9480_REG_MANSW1, usbsw->mansw);
-			}
-		}
-
-		/* UART */
-		if (val1 & DEV_T1_UART_MASK || val2 & DEV_T2_UART_MASK) {
-			if (pdata->uart_cb)
-				pdata->uart_cb(FSA9480_ATTACHED);
-
-			if (!(ctrl & CON_MANUAL_SW)) {
-				fsa9480_write_reg(client,
-					FSA9480_REG_MANSW1, SW_UART);
-			}
-		}
-
-		/* CHARGER */
-		if (val1 & DEV_T1_CHARGER_MASK) {
-			if (pdata->charger_cb)
-				pdata->charger_cb(FSA9480_ATTACHED);
-		}
-
-		/* JIG */
-		if (val2 & DEV_T2_JIG_MASK) {
-			if (pdata->jig_cb)
-				pdata->jig_cb(FSA9480_ATTACHED);
-		}
-	} else if (intr & INT_DETACH) {	/* Detached */
-		/* USB */
-		if (usbsw->dev1 & DEV_T1_USB_MASK ||
-			usbsw->dev2 & DEV_T2_USB_MASK) {
-			if (pdata->usb_cb)
-				pdata->usb_cb(FSA9480_DETACHED);
-		}
-
-		/* UART */
-		if (usbsw->dev1 & DEV_T1_UART_MASK ||
-			usbsw->dev2 & DEV_T2_UART_MASK) {
-			if (pdata->uart_cb)
-				pdata->uart_cb(FSA9480_DETACHED);
-		}
-
-		/* CHARGER */
-		if (usbsw->dev1 & DEV_T1_CHARGER_MASK) {
-			if (pdata->charger_cb)
-				pdata->charger_cb(FSA9480_DETACHED);
-		}
-
-		/* JIG */
-		if (usbsw->dev2 & DEV_T2_JIG_MASK) {
-			if (pdata->jig_cb)
-				pdata->jig_cb(FSA9480_DETACHED);
-		}
-	}
-
-	usbsw->dev1 = val1;
-	usbsw->dev2 = val2;
-
-out:
-	ctrl &= ~CON_INT_MASK;
-	fsa9480_write_reg(client, FSA9480_REG_CTRL, ctrl);
-}
-
-static irqreturn_t fsa9480_irq_handler(int irq, void *data)
-{
-	struct fsa9480_usbsw *usbsw = data;
-	struct i2c_client *client = usbsw->client;
-	int intr;
-
-	/* clear interrupt */
-	fsa9480_read_irq(client, &intr);
-
-	/* device detection */
-	fsa9480_detect_dev(usbsw, intr);
-
-	return IRQ_HANDLED;
-}
-
-static int fsa9480_irq_init(struct fsa9480_usbsw *usbsw)
-{
-	struct fsa9480_platform_data *pdata = usbsw->pdata;
-	struct i2c_client *client = usbsw->client;
-	int ret;
-	int intr;
-	unsigned int ctrl = CON_MASK;
-
-	/* clear interrupt */
-	fsa9480_read_irq(client, &intr);
-
-	/* unmask interrupt (attach/detach only) */
-	fsa9480_write_reg(client, FSA9480_REG_INT1_MASK, 0xfc);
-	fsa9480_write_reg(client, FSA9480_REG_INT2_MASK, 0x1f);
-
-	usbsw->mansw = fsa9480_read_reg(client, FSA9480_REG_MANSW1);
-
-	if (usbsw->mansw)
-		ctrl &= ~CON_MANUAL_SW;	/* Manual Switching Mode */
-
-	fsa9480_write_reg(client, FSA9480_REG_CTRL, ctrl);
-
-	if (pdata && pdata->cfg_gpio)
-		pdata->cfg_gpio();
-
-	if (client->irq) {
-		ret = request_threaded_irq(client->irq, NULL,
-				fsa9480_irq_handler,
-				IRQF_TRIGGER_FALLING | IRQF_ONESHOT,
-				"fsa9480 micro USB", usbsw);
-		if (ret) {
-			dev_err(&client->dev, "failed to request IRQ\n");
-			return ret;
-		}
-
-		if (pdata)
-			device_init_wakeup(&client->dev, pdata->wakeup);
-	}
-
-	return 0;
-}
-
-static int fsa9480_probe(struct i2c_client *client,
-			 const struct i2c_device_id *id)
-{
-	struct i2c_adapter *adapter = client->adapter;
-	struct fsa9480_usbsw *usbsw;
-	int ret = 0;
-
-	if (!i2c_check_functionality(adapter, I2C_FUNC_SMBUS_BYTE_DATA))
-		return -EIO;
-
-	usbsw = kzalloc(sizeof(struct fsa9480_usbsw), GFP_KERNEL);
-	if (!usbsw) {
-		dev_err(&client->dev, "failed to allocate driver data\n");
-		return -ENOMEM;
-	}
-
-	usbsw->client = client;
-	usbsw->pdata = client->dev.platform_data;
-
-	chip = usbsw;
-
-	i2c_set_clientdata(client, usbsw);
-
-	ret = fsa9480_irq_init(usbsw);
-	if (ret)
-		goto fail1;
-
-	ret = sysfs_create_group(&client->dev.kobj, &fsa9480_group);
-	if (ret) {
-		dev_err(&client->dev,
-				"failed to create fsa9480 attribute group\n");
-		goto fail2;
-	}
-
-	/* ADC Detect Time: 500ms */
-	fsa9480_write_reg(client, FSA9480_REG_TIMING1, 0x6);
-
-	if (chip->pdata->reset_cb)
-		chip->pdata->reset_cb();
-
-	/* device detection */
-	fsa9480_detect_dev(usbsw, INT_ATTACH);
-
-	pm_runtime_set_active(&client->dev);
-
-	return 0;
-
-fail2:
-	if (client->irq)
-		free_irq(client->irq, usbsw);
-fail1:
-	kfree(usbsw);
-	return ret;
-}
-
-static int fsa9480_remove(struct i2c_client *client)
-{
-	struct fsa9480_usbsw *usbsw = i2c_get_clientdata(client);
-
-	if (client->irq)
-		free_irq(client->irq, usbsw);
-
-	sysfs_remove_group(&client->dev.kobj, &fsa9480_group);
-	device_init_wakeup(&client->dev, 0);
-	kfree(usbsw);
-	return 0;
-}
-
-#ifdef CONFIG_PM_SLEEP
-
-static int fsa9480_suspend(struct device *dev)
-{
-	struct i2c_client *client = to_i2c_client(dev);
-	struct fsa9480_usbsw *usbsw = i2c_get_clientdata(client);
-	struct fsa9480_platform_data *pdata = usbsw->pdata;
-
-	if (device_may_wakeup(&client->dev) && client->irq)
-		enable_irq_wake(client->irq);
-
-	if (pdata->usb_power)
-		pdata->usb_power(0);
-
-	return 0;
-}
-
-static int fsa9480_resume(struct device *dev)
-{
-	struct i2c_client *client = to_i2c_client(dev);
-	struct fsa9480_usbsw *usbsw = i2c_get_clientdata(client);
-	int dev1, dev2;
-
-	if (device_may_wakeup(&client->dev) && client->irq)
-		disable_irq_wake(client->irq);
-
-	/*
-	 * Clear Pending interrupt. Note that detect_dev does what
-	 * the interrupt handler does. So, we don't miss pending and
-	 * we reenable interrupt if there is one.
-	 */
-	fsa9480_read_reg(client, FSA9480_REG_INT1);
-	fsa9480_read_reg(client, FSA9480_REG_INT2);
-
-	dev1 = fsa9480_read_reg(client, FSA9480_REG_DEV_T1);
-	dev2 = fsa9480_read_reg(client, FSA9480_REG_DEV_T2);
-
-	/* device detection */
-	fsa9480_detect_dev(usbsw, (dev1 || dev2) ? INT_ATTACH : INT_DETACH);
-
-	return 0;
-}
-
-static SIMPLE_DEV_PM_OPS(fsa9480_pm_ops, fsa9480_suspend, fsa9480_resume);
-#define FSA9480_PM_OPS (&fsa9480_pm_ops)
-
-#else
-
-#define FSA9480_PM_OPS NULL
-
-#endif /* CONFIG_PM_SLEEP */
-
-static const struct i2c_device_id fsa9480_id[] = {
-	{"fsa9480", 0},
-	{}
-};
-MODULE_DEVICE_TABLE(i2c, fsa9480_id);
-
-static struct i2c_driver fsa9480_i2c_driver = {
-	.driver = {
-		.name = "fsa9480",
-		.pm = FSA9480_PM_OPS,
-	},
-	.probe = fsa9480_probe,
-	.remove = fsa9480_remove,
-	.id_table = fsa9480_id,
-};
-
-module_i2c_driver(fsa9480_i2c_driver);
-
-MODULE_AUTHOR("Minkyu Kang <mk7.kang@samsung.com>");
-MODULE_DESCRIPTION("FSA9480 USB Switch driver");
-MODULE_LICENSE("GPL");
diff --git a/include/linux/platform_data/fsa9480.h b/include/linux/platform_data/fsa9480.h
deleted file mode 100644
index dea8d84448ec..000000000000
--- a/include/linux/platform_data/fsa9480.h
+++ /dev/null
@@ -1,24 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * Copyright (C) 2010 Samsung Electronics
- * Minkyu Kang <mk7.kang@samsung.com>
- */
-
-#ifndef _FSA9480_H_
-#define _FSA9480_H_
-
-#define FSA9480_ATTACHED	1
-#define FSA9480_DETACHED	0
-
-struct fsa9480_platform_data {
-	void (*cfg_gpio) (void);
-	void (*usb_cb) (u8 attached);
-	void (*uart_cb) (u8 attached);
-	void (*charger_cb) (u8 attached);
-	void (*jig_cb) (u8 attached);
-	void (*reset_cb) (void);
-	void (*usb_power) (u8 on);
-	int wakeup;
-};
-
-#endif /* _FSA9480_H_ */
-- 
cgit v1.2.3


From 9dea44c91469512d346e638694c22c30a5273992 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Fri, 28 Jun 2019 16:59:45 +0200
Subject: devres: allow const resource arguments

devm_ioremap_resource() does not currently take 'const' arguments,
which results in a warning from the first driver trying to do it
anyway:

drivers/gpio/gpio-amd-fch.c: In function 'amd_fch_gpio_probe':
drivers/gpio/gpio-amd-fch.c:171:49: error: passing argument 2 of 'devm_ioremap_resource' discards 'const' qualifier from pointer target type [-Werror=discarded-qualifiers]
  priv->base = devm_ioremap_resource(&pdev->dev, &amd_fch_gpio_iores);
                                                 ^~~~~~~~~~~~~~~~~~~

Change the prototype to allow it, as there is no real reason not to.

Fixes: 9bb2e0452508 ("gpio: amd: Make resource struct const")
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Link: https://lore.kernel.org/r/20190628150049.1108048-1-arnd@arndb.de
Acked-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Reviewed-by: Enrico Weigelt <info@metux.net>
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
---
 include/linux/device.h | 3 ++-
 lib/devres.c           | 3 ++-
 2 files changed, 4 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/device.h b/include/linux/device.h
index e85264fb6616..5c37f0acc419 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -704,7 +704,8 @@ extern unsigned long devm_get_free_pages(struct device *dev,
 					 gfp_t gfp_mask, unsigned int order);
 extern void devm_free_pages(struct device *dev, unsigned long addr);
 
-void __iomem *devm_ioremap_resource(struct device *dev, struct resource *res);
+void __iomem *devm_ioremap_resource(struct device *dev,
+				    const struct resource *res);
 
 void __iomem *devm_of_iomap(struct device *dev,
 			    struct device_node *node, int index,
diff --git a/lib/devres.c b/lib/devres.c
index 69bed2f38306..6a0e9bd6524a 100644
--- a/lib/devres.c
+++ b/lib/devres.c
@@ -131,7 +131,8 @@ EXPORT_SYMBOL(devm_iounmap);
  *	if (IS_ERR(base))
  *		return PTR_ERR(base);
  */
-void __iomem *devm_ioremap_resource(struct device *dev, struct resource *res)
+void __iomem *devm_ioremap_resource(struct device *dev,
+				    const struct resource *res)
 {
 	resource_size_t size;
 	void __iomem *dest_ptr;
-- 
cgit v1.2.3


From 5a35316d97914d56d6be8b3748b2437785e74790 Mon Sep 17 00:00:00 2001
From: "Hook, Gary" <Gary.Hook@amd.com>
Date: Tue, 25 Jun 2019 23:43:43 +0000
Subject: crypto: doc - Add parameter documentation

Fill in missing parameter descriptions for the compression algorithm,
then pick them up to document the compress_alg structure.

Signed-off-by: Gary R Hook <gary.hook@amd.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 Documentation/crypto/api-skcipher.rst |  2 +-
 include/linux/crypto.h                | 11 +++++++++++
 2 files changed, 12 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/Documentation/crypto/api-skcipher.rst b/Documentation/crypto/api-skcipher.rst
index 4eec4a93f7e3..20ba08dddf2e 100644
--- a/Documentation/crypto/api-skcipher.rst
+++ b/Documentation/crypto/api-skcipher.rst
@@ -5,7 +5,7 @@ Block Cipher Algorithm Definitions
    :doc: Block Cipher Algorithm Definitions
 
 .. kernel-doc:: include/linux/crypto.h
-   :functions: crypto_alg ablkcipher_alg blkcipher_alg cipher_alg
+   :functions: crypto_alg ablkcipher_alg blkcipher_alg cipher_alg compress_alg
 
 Symmetric Key Cipher API
 ------------------------
diff --git a/include/linux/crypto.h b/include/linux/crypto.h
index 311237b1dab0..4b4e2ffbee74 100644
--- a/include/linux/crypto.h
+++ b/include/linux/crypto.h
@@ -327,6 +327,17 @@ struct cipher_alg {
 	void (*cia_decrypt)(struct crypto_tfm *tfm, u8 *dst, const u8 *src);
 };
 
+/**
+ * struct compress_alg - compression/decompression algorithm
+ * @coa_compress: Compress a buffer of specified length, storing the resulting
+ *		  data in the specified buffer. Return the length of the
+ *		  compressed data in dlen.
+ * @coa_decompress: Decompress the source buffer, storing the uncompressed
+ *		    data in the specified buffer. The length of the data is
+ *		    returned in dlen.
+ *
+ * All fields are mandatory.
+ */
 struct compress_alg {
 	int (*coa_compress)(struct crypto_tfm *tfm, const u8 *src,
 			    unsigned int slen, u8 *dst, unsigned int *dlen);
-- 
cgit v1.2.3


From 1a829ff2a6c37187ff8020488e84ec392cb94854 Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Date: Wed, 12 Jun 2019 16:55:38 +0200
Subject: ceph: no need to check return value of debugfs_create functions

When calling debugfs functions, there is no need to ever check the
return value.  The function can work or not, but the code logic should
never do something different based on this.

This cleanup allows the return value of the functions to be made void,
as no logic should care if these files succeed or not.

Cc: "Yan, Zheng" <zyan@redhat.com>
Cc: Sage Weil <sage@redhat.com>
Cc: Ilya Dryomov <idryomov@gmail.com>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: ceph-devel@vger.kernel.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Link: https://lore.kernel.org/r/20190612145538.GA18772@kroah.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/ceph/debugfs.c            | 24 ++----------------------
 fs/ceph/super.c              |  4 +---
 fs/ceph/super.h              |  2 +-
 include/linux/ceph/debugfs.h |  4 ++--
 net/ceph/ceph_common.c       |  5 +----
 net/ceph/debugfs.c           | 33 ++++-----------------------------
 6 files changed, 11 insertions(+), 61 deletions(-)

(limited to 'include/linux')

diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
index b3fc5fe26a1a..83cd41fa2b01 100644
--- a/fs/ceph/debugfs.c
+++ b/fs/ceph/debugfs.c
@@ -245,21 +245,17 @@ void ceph_fs_debugfs_cleanup(struct ceph_fs_client *fsc)
 	debugfs_remove(fsc->debugfs_mdsc);
 }
 
-int ceph_fs_debugfs_init(struct ceph_fs_client *fsc)
+void ceph_fs_debugfs_init(struct ceph_fs_client *fsc)
 {
 	char name[100];
-	int err = -ENOMEM;
 
 	dout("ceph_fs_debugfs_init\n");
-	BUG_ON(!fsc->client->debugfs_dir);
 	fsc->debugfs_congestion_kb =
 		debugfs_create_file("writeback_congestion_kb",
 				    0600,
 				    fsc->client->debugfs_dir,
 				    fsc,
 				    &congestion_kb_fops);
-	if (!fsc->debugfs_congestion_kb)
-		goto out;
 
 	snprintf(name, sizeof(name), "../../bdi/%s",
 		 dev_name(fsc->sb->s_bdi->dev));
@@ -267,52 +263,36 @@ int ceph_fs_debugfs_init(struct ceph_fs_client *fsc)
 		debugfs_create_symlink("bdi",
 				       fsc->client->debugfs_dir,
 				       name);
-	if (!fsc->debugfs_bdi)
-		goto out;
 
 	fsc->debugfs_mdsmap = debugfs_create_file("mdsmap",
 					0400,
 					fsc->client->debugfs_dir,
 					fsc,
 					&mdsmap_show_fops);
-	if (!fsc->debugfs_mdsmap)
-		goto out;
 
 	fsc->debugfs_mds_sessions = debugfs_create_file("mds_sessions",
 					0400,
 					fsc->client->debugfs_dir,
 					fsc,
 					&mds_sessions_show_fops);
-	if (!fsc->debugfs_mds_sessions)
-		goto out;
 
 	fsc->debugfs_mdsc = debugfs_create_file("mdsc",
 						0400,
 						fsc->client->debugfs_dir,
 						fsc,
 						&mdsc_show_fops);
-	if (!fsc->debugfs_mdsc)
-		goto out;
 
 	fsc->debugfs_caps = debugfs_create_file("caps",
 						   0400,
 						   fsc->client->debugfs_dir,
 						   fsc,
 						   &caps_show_fops);
-	if (!fsc->debugfs_caps)
-		goto out;
-
-	return 0;
-
-out:
-	ceph_fs_debugfs_cleanup(fsc);
-	return err;
 }
 
 
 #else  /* CONFIG_DEBUG_FS */
 
-int ceph_fs_debugfs_init(struct ceph_fs_client *fsc)
+void ceph_fs_debugfs_init(struct ceph_fs_client *fsc)
 {
 	return 0;
 }
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 01be7c1bc4c6..273c94b61a3d 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -951,9 +951,7 @@ static struct dentry *ceph_real_mount(struct ceph_fs_client *fsc)
 			dout("mount opening path %s\n", path);
 		}
 
-		err = ceph_fs_debugfs_init(fsc);
-		if (err < 0)
-			goto out;
+		ceph_fs_debugfs_init(fsc);
 
 		root = open_root_dentry(fsc, path, started);
 		if (IS_ERR(root)) {
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 6edab9a750f8..ac1e17853278 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -1099,7 +1099,7 @@ extern int ceph_locks_to_pagelist(struct ceph_filelock *flocks,
 				  int num_fcntl_locks, int num_flock_locks);
 
 /* debugfs.c */
-extern int ceph_fs_debugfs_init(struct ceph_fs_client *client);
+extern void ceph_fs_debugfs_init(struct ceph_fs_client *client);
 extern void ceph_fs_debugfs_cleanup(struct ceph_fs_client *client);
 
 /* quota.c */
diff --git a/include/linux/ceph/debugfs.h b/include/linux/ceph/debugfs.h
index fa5f9b7f5dbb..cf5e840eec71 100644
--- a/include/linux/ceph/debugfs.h
+++ b/include/linux/ceph/debugfs.h
@@ -19,9 +19,9 @@ static const struct file_operations name##_fops = {			\
 };
 
 /* debugfs.c */
-extern int ceph_debugfs_init(void);
+extern void ceph_debugfs_init(void);
 extern void ceph_debugfs_cleanup(void);
-extern int ceph_debugfs_client_init(struct ceph_client *client);
+extern void ceph_debugfs_client_init(struct ceph_client *client);
 extern void ceph_debugfs_client_cleanup(struct ceph_client *client);
 
 #endif
diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c
index 1c811c74bfc0..4eeea4d5c3ef 100644
--- a/net/ceph/ceph_common.c
+++ b/net/ceph/ceph_common.c
@@ -776,9 +776,7 @@ static int __init init_ceph_lib(void)
 {
 	int ret = 0;
 
-	ret = ceph_debugfs_init();
-	if (ret < 0)
-		goto out;
+	ceph_debugfs_init();
 
 	ret = ceph_crypto_init();
 	if (ret < 0)
@@ -803,7 +801,6 @@ out_crypto:
 	ceph_crypto_shutdown();
 out_debugfs:
 	ceph_debugfs_cleanup();
-out:
 	return ret;
 }
 
diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c
index 63aef9915f75..7cb992e55475 100644
--- a/net/ceph/debugfs.c
+++ b/net/ceph/debugfs.c
@@ -389,12 +389,9 @@ CEPH_DEFINE_SHOW_FUNC(monc_show)
 CEPH_DEFINE_SHOW_FUNC(osdc_show)
 CEPH_DEFINE_SHOW_FUNC(client_options_show)
 
-int __init ceph_debugfs_init(void)
+void __init ceph_debugfs_init(void)
 {
 	ceph_debugfs_dir = debugfs_create_dir("ceph", NULL);
-	if (!ceph_debugfs_dir)
-		return -ENOMEM;
-	return 0;
 }
 
 void ceph_debugfs_cleanup(void)
@@ -402,9 +399,8 @@ void ceph_debugfs_cleanup(void)
 	debugfs_remove(ceph_debugfs_dir);
 }
 
-int ceph_debugfs_client_init(struct ceph_client *client)
+void ceph_debugfs_client_init(struct ceph_client *client)
 {
-	int ret = -ENOMEM;
 	char name[80];
 
 	snprintf(name, sizeof(name), "%pU.client%lld", &client->fsid,
@@ -412,56 +408,37 @@ int ceph_debugfs_client_init(struct ceph_client *client)
 
 	dout("ceph_debugfs_client_init %p %s\n", client, name);
 
-	BUG_ON(client->debugfs_dir);
 	client->debugfs_dir = debugfs_create_dir(name, ceph_debugfs_dir);
-	if (!client->debugfs_dir)
-		goto out;
 
 	client->monc.debugfs_file = debugfs_create_file("monc",
 						      0400,
 						      client->debugfs_dir,
 						      client,
 						      &monc_show_fops);
-	if (!client->monc.debugfs_file)
-		goto out;
 
 	client->osdc.debugfs_file = debugfs_create_file("osdc",
 						      0400,
 						      client->debugfs_dir,
 						      client,
 						      &osdc_show_fops);
-	if (!client->osdc.debugfs_file)
-		goto out;
 
 	client->debugfs_monmap = debugfs_create_file("monmap",
 					0400,
 					client->debugfs_dir,
 					client,
 					&monmap_show_fops);
-	if (!client->debugfs_monmap)
-		goto out;
 
 	client->debugfs_osdmap = debugfs_create_file("osdmap",
 					0400,
 					client->debugfs_dir,
 					client,
 					&osdmap_show_fops);
-	if (!client->debugfs_osdmap)
-		goto out;
 
 	client->debugfs_options = debugfs_create_file("client_options",
 					0400,
 					client->debugfs_dir,
 					client,
 					&client_options_show_fops);
-	if (!client->debugfs_options)
-		goto out;
-
-	return 0;
-
-out:
-	ceph_debugfs_client_cleanup(client);
-	return ret;
 }
 
 void ceph_debugfs_client_cleanup(struct ceph_client *client)
@@ -477,18 +454,16 @@ void ceph_debugfs_client_cleanup(struct ceph_client *client)
 
 #else  /* CONFIG_DEBUG_FS */
 
-int __init ceph_debugfs_init(void)
+void __init ceph_debugfs_init(void)
 {
-	return 0;
 }
 
 void ceph_debugfs_cleanup(void)
 {
 }
 
-int ceph_debugfs_client_init(struct ceph_client *client)
+void ceph_debugfs_client_init(struct ceph_client *client)
 {
-	return 0;
 }
 
 void ceph_debugfs_client_cleanup(struct ceph_client *client)
-- 
cgit v1.2.3


From 1a4dcb8aed681c426954b1cf7e4b78aab465690e Mon Sep 17 00:00:00 2001
From: Yinbo Zhu <yinbo.zhu@nxp.com>
Date: Mon, 24 Jun 2019 15:22:17 +0800
Subject: usb: linux/fsl_device: Add platform member has_fsl_erratum_a006918

This patch is to add member has_fsl_erratum_a006918 in platform data

Signed-off-by: Yinbo Zhu <yinbo.zhu@nxp.com>
Link: https://lore.kernel.org/r/20190624072219.15258-3-yinbo.zhu@nxp.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/fsl_devices.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/fsl_devices.h b/include/linux/fsl_devices.h
index cb2b46f57af3..5d231ce8709b 100644
--- a/include/linux/fsl_devices.h
+++ b/include/linux/fsl_devices.h
@@ -98,6 +98,7 @@ struct fsl_usb2_platform_data {
 	unsigned	has_fsl_erratum_14:1;
 	unsigned	has_fsl_erratum_a005275:1;
 	unsigned	has_fsl_erratum_a005697:1;
+	unsigned        has_fsl_erratum_a006918:1;
 	unsigned	check_phy_clk_valid:1;
 
 	/* register save area for suspend/resume */
-- 
cgit v1.2.3


From ecd6bf67da3126e8ec731c2dd8cb6c2f17d9563a Mon Sep 17 00:00:00 2001
From: Mark Greer <mgreer@animalcreek.com>
Date: Wed, 26 Jun 2019 09:05:53 -0700
Subject: serial: mpsc: Remove obsolete MPSC driver

Support for the Marvell MV64x60 line of bridge chips that contained
MPSC controllers has been removed and there are no other components
that have that controller so remove its driver.

Signed-off-by: Mark Greer <mgreer@animalcreek.com>
Link: https://lore.kernel.org/r/20190626160553.28518-1-mgreer@animalcreek.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 Documentation/admin-guide/devices.txt |    4 +-
 drivers/tty/serial/Kconfig            |   14 -
 drivers/tty/serial/Makefile           |    1 -
 drivers/tty/serial/mpsc.c             | 2138 ---------------------------------
 include/linux/mv643xx.h               |   46 -
 include/uapi/linux/serial_core.h      |    2 +-
 6 files changed, 3 insertions(+), 2202 deletions(-)
 delete mode 100644 drivers/tty/serial/mpsc.c

(limited to 'include/linux')

diff --git a/Documentation/admin-guide/devices.txt b/Documentation/admin-guide/devices.txt
index 1649117e6087..e56e00655153 100644
--- a/Documentation/admin-guide/devices.txt
+++ b/Documentation/admin-guide/devices.txt
@@ -2693,8 +2693,8 @@
 		 41 = /dev/ttySMX0		Motorola i.MX - port 0
 		 42 = /dev/ttySMX1		Motorola i.MX - port 1
 		 43 = /dev/ttySMX2		Motorola i.MX - port 2
-		 44 = /dev/ttyMM0		Marvell MPSC - port 0
-		 45 = /dev/ttyMM1		Marvell MPSC - port 1
+		 44 = /dev/ttyMM0		Marvell MPSC - port 0 (obsolete unused)
+		 45 = /dev/ttyMM1		Marvell MPSC - port 1 (obsolete unused)
 		 46 = /dev/ttyCPM0		PPC CPM (SCC or SMC) - port 0
 		    ...
 		 47 = /dev/ttyCPM5		PPC CPM (SCC or SMC) - port 5
diff --git a/drivers/tty/serial/Kconfig b/drivers/tty/serial/Kconfig
index 0d31251e04cc..b416c7b33f49 100644
--- a/drivers/tty/serial/Kconfig
+++ b/drivers/tty/serial/Kconfig
@@ -457,20 +457,6 @@ config SERIAL_21285_CONSOLE
 	  your boot loader (lilo or loadlin) about how to pass options to the
 	  kernel at boot time.)
 
-config SERIAL_MPSC
-	bool "Marvell MPSC serial port support"
-	depends on MV64X60
-	select SERIAL_CORE
-	help
-	  Say Y here if you want to use the Marvell MPSC serial controller.
-
-config SERIAL_MPSC_CONSOLE
-	bool "Support for console on Marvell MPSC serial port"
-	depends on SERIAL_MPSC
-	select SERIAL_CORE_CONSOLE
-	help
-	  Say Y here if you want to support a serial console on a Marvell MPSC.
-
 config SERIAL_PXA
 	bool "PXA serial port support (DEPRECATED)"
 	depends on ARCH_PXA || ARCH_MMP
diff --git a/drivers/tty/serial/Makefile b/drivers/tty/serial/Makefile
index 79c3d513db7e..7cd7cabfa6c4 100644
--- a/drivers/tty/serial/Makefile
+++ b/drivers/tty/serial/Makefile
@@ -46,7 +46,6 @@ obj-$(CONFIG_SERIAL_CPM) += cpm_uart/
 obj-$(CONFIG_SERIAL_IMX) += imx.o
 obj-$(CONFIG_SERIAL_MPC52xx) += mpc52xx_uart.o
 obj-$(CONFIG_SERIAL_ICOM) += icom.o
-obj-$(CONFIG_SERIAL_MPSC) += mpsc.o
 obj-$(CONFIG_SERIAL_MESON) += meson_uart.o
 obj-$(CONFIG_SERIAL_SB1250_DUART) += sb1250-duart.o
 obj-$(CONFIG_SERIAL_SCCNXP) += sccnxp.o
diff --git a/drivers/tty/serial/mpsc.c b/drivers/tty/serial/mpsc.c
deleted file mode 100644
index 1f60d6fe4ff2..000000000000
--- a/drivers/tty/serial/mpsc.c
+++ /dev/null
@@ -1,2138 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Generic driver for the MPSC (UART mode) on Marvell parts (e.g., GT64240,
- * GT64260, MV64340, MV64360, GT96100, ... ).
- *
- * Author: Mark A. Greer <mgreer@mvista.com>
- *
- * Based on an old MPSC driver that was in the linuxppc tree.  It appears to
- * have been created by Chris Zankel (formerly of MontaVista) but there
- * is no proper Copyright so I'm not sure.  Apparently, parts were also
- * taken from PPCBoot (now U-Boot).  Also based on drivers/serial/8250.c
- * by Russell King.
- *
- * 2004 (c) MontaVista, Software, Inc.
- */
-/*
- * The MPSC interface is much like a typical network controller's interface.
- * That is, you set up separate rings of descriptors for transmitting and
- * receiving data.  There is also a pool of buffers with (one buffer per
- * descriptor) that incoming data are dma'd into or outgoing data are dma'd
- * out of.
- *
- * The MPSC requires two other controllers to be able to work.  The Baud Rate
- * Generator (BRG) provides a clock at programmable frequencies which determines
- * the baud rate.  The Serial DMA Controller (SDMA) takes incoming data from the
- * MPSC and DMA's it into memory or DMA's outgoing data and passes it to the
- * MPSC.  It is actually the SDMA interrupt that the driver uses to keep the
- * transmit and receive "engines" going (i.e., indicate data has been
- * transmitted or received).
- *
- * NOTES:
- *
- * 1) Some chips have an erratum where several regs cannot be
- * read.  To work around that, we keep a local copy of those regs in
- * 'mpsc_port_info'.
- *
- * 2) Some chips have an erratum where the ctlr will hang when the SDMA ctlr
- * accesses system mem with coherency enabled.  For that reason, the driver
- * assumes that coherency for that ctlr has been disabled.  This means
- * that when in a cache coherent system, the driver has to manually manage
- * the data cache on the areas that it touches because the dma_* macro are
- * basically no-ops.
- *
- * 3) There is an erratum (on PPC) where you can't use the instruction to do
- * a DMA_TO_DEVICE/cache clean so DMA_BIDIRECTIONAL/flushes are used in places
- * where a DMA_TO_DEVICE/clean would have [otherwise] sufficed.
- *
- * 4) AFAICT, hardware flow control isn't supported by the controller --MAG.
- */
-
-
-#if defined(CONFIG_SERIAL_MPSC_CONSOLE) && defined(CONFIG_MAGIC_SYSRQ)
-#define SUPPORT_SYSRQ
-#endif
-
-#include <linux/tty.h>
-#include <linux/tty_flip.h>
-#include <linux/ioport.h>
-#include <linux/init.h>
-#include <linux/console.h>
-#include <linux/sysrq.h>
-#include <linux/serial.h>
-#include <linux/serial_core.h>
-#include <linux/delay.h>
-#include <linux/device.h>
-#include <linux/dma-mapping.h>
-#include <linux/mv643xx.h>
-#include <linux/platform_device.h>
-#include <linux/gfp.h>
-
-#include <asm/io.h>
-#include <asm/irq.h>
-
-#define	MPSC_NUM_CTLRS		2
-
-/*
- * Descriptors and buffers must be cache line aligned.
- * Buffers lengths must be multiple of cache line size.
- * Number of Tx & Rx descriptors must be powers of 2.
- */
-#define	MPSC_RXR_ENTRIES	32
-#define	MPSC_RXRE_SIZE		dma_get_cache_alignment()
-#define	MPSC_RXR_SIZE		(MPSC_RXR_ENTRIES * MPSC_RXRE_SIZE)
-#define	MPSC_RXBE_SIZE		dma_get_cache_alignment()
-#define	MPSC_RXB_SIZE		(MPSC_RXR_ENTRIES * MPSC_RXBE_SIZE)
-
-#define	MPSC_TXR_ENTRIES	32
-#define	MPSC_TXRE_SIZE		dma_get_cache_alignment()
-#define	MPSC_TXR_SIZE		(MPSC_TXR_ENTRIES * MPSC_TXRE_SIZE)
-#define	MPSC_TXBE_SIZE		dma_get_cache_alignment()
-#define	MPSC_TXB_SIZE		(MPSC_TXR_ENTRIES * MPSC_TXBE_SIZE)
-
-#define	MPSC_DMA_ALLOC_SIZE	(MPSC_RXR_SIZE + MPSC_RXB_SIZE + MPSC_TXR_SIZE \
-		+ MPSC_TXB_SIZE + dma_get_cache_alignment() /* for alignment */)
-
-/* Rx and Tx Ring entry descriptors -- assume entry size is <= cacheline size */
-struct mpsc_rx_desc {
-	u16 bufsize;
-	u16 bytecnt;
-	u32 cmdstat;
-	u32 link;
-	u32 buf_ptr;
-} __attribute((packed));
-
-struct mpsc_tx_desc {
-	u16 bytecnt;
-	u16 shadow;
-	u32 cmdstat;
-	u32 link;
-	u32 buf_ptr;
-} __attribute((packed));
-
-/*
- * Some regs that have the erratum that you can't read them are are shared
- * between the two MPSC controllers.  This struct contains those shared regs.
- */
-struct mpsc_shared_regs {
-	phys_addr_t mpsc_routing_base_p;
-	phys_addr_t sdma_intr_base_p;
-
-	void __iomem *mpsc_routing_base;
-	void __iomem *sdma_intr_base;
-
-	u32 MPSC_MRR_m;
-	u32 MPSC_RCRR_m;
-	u32 MPSC_TCRR_m;
-	u32 SDMA_INTR_CAUSE_m;
-	u32 SDMA_INTR_MASK_m;
-};
-
-/* The main driver data structure */
-struct mpsc_port_info {
-	struct uart_port port;	/* Overlay uart_port structure */
-
-	/* Internal driver state for this ctlr */
-	u8 ready;
-	u8 rcv_data;
-
-	/* Info passed in from platform */
-	u8 mirror_regs;		/* Need to mirror regs? */
-	u8 cache_mgmt;		/* Need manual cache mgmt? */
-	u8 brg_can_tune;	/* BRG has baud tuning? */
-	u32 brg_clk_src;
-	u16 mpsc_max_idle;
-	int default_baud;
-	int default_bits;
-	int default_parity;
-	int default_flow;
-
-	/* Physical addresses of various blocks of registers (from platform) */
-	phys_addr_t mpsc_base_p;
-	phys_addr_t sdma_base_p;
-	phys_addr_t brg_base_p;
-
-	/* Virtual addresses of various blocks of registers (from platform) */
-	void __iomem *mpsc_base;
-	void __iomem *sdma_base;
-	void __iomem *brg_base;
-
-	/* Descriptor ring and buffer allocations */
-	void *dma_region;
-	dma_addr_t dma_region_p;
-
-	dma_addr_t rxr;		/* Rx descriptor ring */
-	dma_addr_t rxr_p;	/* Phys addr of rxr */
-	u8 *rxb;		/* Rx Ring I/O buf */
-	u8 *rxb_p;		/* Phys addr of rxb */
-	u32 rxr_posn;		/* First desc w/ Rx data */
-
-	dma_addr_t txr;		/* Tx descriptor ring */
-	dma_addr_t txr_p;	/* Phys addr of txr */
-	u8 *txb;		/* Tx Ring I/O buf */
-	u8 *txb_p;		/* Phys addr of txb */
-	int txr_head;		/* Where new data goes */
-	int txr_tail;		/* Where sent data comes off */
-	spinlock_t tx_lock;	/* transmit lock */
-
-	/* Mirrored values of regs we can't read (if 'mirror_regs' set) */
-	u32 MPSC_MPCR_m;
-	u32 MPSC_CHR_1_m;
-	u32 MPSC_CHR_2_m;
-	u32 MPSC_CHR_10_m;
-	u32 BRG_BCR_m;
-	struct mpsc_shared_regs *shared_regs;
-};
-
-/* Hooks to platform-specific code */
-int mpsc_platform_register_driver(void);
-void mpsc_platform_unregister_driver(void);
-
-/* Hooks back in to mpsc common to be called by platform-specific code */
-struct mpsc_port_info *mpsc_device_probe(int index);
-struct mpsc_port_info *mpsc_device_remove(int index);
-
-/* Main MPSC Configuration Register Offsets */
-#define	MPSC_MMCRL			0x0000
-#define	MPSC_MMCRH			0x0004
-#define	MPSC_MPCR			0x0008
-#define	MPSC_CHR_1			0x000c
-#define	MPSC_CHR_2			0x0010
-#define	MPSC_CHR_3			0x0014
-#define	MPSC_CHR_4			0x0018
-#define	MPSC_CHR_5			0x001c
-#define	MPSC_CHR_6			0x0020
-#define	MPSC_CHR_7			0x0024
-#define	MPSC_CHR_8			0x0028
-#define	MPSC_CHR_9			0x002c
-#define	MPSC_CHR_10			0x0030
-#define	MPSC_CHR_11			0x0034
-
-#define	MPSC_MPCR_FRZ			(1 << 9)
-#define	MPSC_MPCR_CL_5			0
-#define	MPSC_MPCR_CL_6			1
-#define	MPSC_MPCR_CL_7			2
-#define	MPSC_MPCR_CL_8			3
-#define	MPSC_MPCR_SBL_1			0
-#define	MPSC_MPCR_SBL_2			1
-
-#define	MPSC_CHR_2_TEV			(1<<1)
-#define	MPSC_CHR_2_TA			(1<<7)
-#define	MPSC_CHR_2_TTCS			(1<<9)
-#define	MPSC_CHR_2_REV			(1<<17)
-#define	MPSC_CHR_2_RA			(1<<23)
-#define	MPSC_CHR_2_CRD			(1<<25)
-#define	MPSC_CHR_2_EH			(1<<31)
-#define	MPSC_CHR_2_PAR_ODD		0
-#define	MPSC_CHR_2_PAR_SPACE		1
-#define	MPSC_CHR_2_PAR_EVEN		2
-#define	MPSC_CHR_2_PAR_MARK		3
-
-/* MPSC Signal Routing */
-#define	MPSC_MRR			0x0000
-#define	MPSC_RCRR			0x0004
-#define	MPSC_TCRR			0x0008
-
-/* Serial DMA Controller Interface Registers */
-#define	SDMA_SDC			0x0000
-#define	SDMA_SDCM			0x0008
-#define	SDMA_RX_DESC			0x0800
-#define	SDMA_RX_BUF_PTR			0x0808
-#define	SDMA_SCRDP			0x0810
-#define	SDMA_TX_DESC			0x0c00
-#define	SDMA_SCTDP			0x0c10
-#define	SDMA_SFTDP			0x0c14
-
-#define	SDMA_DESC_CMDSTAT_PE		(1<<0)
-#define	SDMA_DESC_CMDSTAT_CDL		(1<<1)
-#define	SDMA_DESC_CMDSTAT_FR		(1<<3)
-#define	SDMA_DESC_CMDSTAT_OR		(1<<6)
-#define	SDMA_DESC_CMDSTAT_BR		(1<<9)
-#define	SDMA_DESC_CMDSTAT_MI		(1<<10)
-#define	SDMA_DESC_CMDSTAT_A		(1<<11)
-#define	SDMA_DESC_CMDSTAT_AM		(1<<12)
-#define	SDMA_DESC_CMDSTAT_CT		(1<<13)
-#define	SDMA_DESC_CMDSTAT_C		(1<<14)
-#define	SDMA_DESC_CMDSTAT_ES		(1<<15)
-#define	SDMA_DESC_CMDSTAT_L		(1<<16)
-#define	SDMA_DESC_CMDSTAT_F		(1<<17)
-#define	SDMA_DESC_CMDSTAT_P		(1<<18)
-#define	SDMA_DESC_CMDSTAT_EI		(1<<23)
-#define	SDMA_DESC_CMDSTAT_O		(1<<31)
-
-#define SDMA_DESC_DFLT			(SDMA_DESC_CMDSTAT_O \
-		| SDMA_DESC_CMDSTAT_EI)
-
-#define	SDMA_SDC_RFT			(1<<0)
-#define	SDMA_SDC_SFM			(1<<1)
-#define	SDMA_SDC_BLMR			(1<<6)
-#define	SDMA_SDC_BLMT			(1<<7)
-#define	SDMA_SDC_POVR			(1<<8)
-#define	SDMA_SDC_RIFB			(1<<9)
-
-#define	SDMA_SDCM_ERD			(1<<7)
-#define	SDMA_SDCM_AR			(1<<15)
-#define	SDMA_SDCM_STD			(1<<16)
-#define	SDMA_SDCM_TXD			(1<<23)
-#define	SDMA_SDCM_AT			(1<<31)
-
-#define	SDMA_0_CAUSE_RXBUF		(1<<0)
-#define	SDMA_0_CAUSE_RXERR		(1<<1)
-#define	SDMA_0_CAUSE_TXBUF		(1<<2)
-#define	SDMA_0_CAUSE_TXEND		(1<<3)
-#define	SDMA_1_CAUSE_RXBUF		(1<<8)
-#define	SDMA_1_CAUSE_RXERR		(1<<9)
-#define	SDMA_1_CAUSE_TXBUF		(1<<10)
-#define	SDMA_1_CAUSE_TXEND		(1<<11)
-
-#define	SDMA_CAUSE_RX_MASK	(SDMA_0_CAUSE_RXBUF | SDMA_0_CAUSE_RXERR \
-		| SDMA_1_CAUSE_RXBUF | SDMA_1_CAUSE_RXERR)
-#define	SDMA_CAUSE_TX_MASK	(SDMA_0_CAUSE_TXBUF | SDMA_0_CAUSE_TXEND \
-		| SDMA_1_CAUSE_TXBUF | SDMA_1_CAUSE_TXEND)
-
-/* SDMA Interrupt registers */
-#define	SDMA_INTR_CAUSE			0x0000
-#define	SDMA_INTR_MASK			0x0080
-
-/* Baud Rate Generator Interface Registers */
-#define	BRG_BCR				0x0000
-#define	BRG_BTR				0x0004
-
-/*
- * Define how this driver is known to the outside (we've been assigned a
- * range on the "Low-density serial ports" major).
- */
-#define MPSC_MAJOR			204
-#define MPSC_MINOR_START		44
-#define	MPSC_DRIVER_NAME		"MPSC"
-#define	MPSC_DEV_NAME			"ttyMM"
-#define	MPSC_VERSION			"1.00"
-
-static struct mpsc_port_info mpsc_ports[MPSC_NUM_CTLRS];
-static struct mpsc_shared_regs mpsc_shared_regs;
-static struct uart_driver mpsc_reg;
-
-static void mpsc_start_rx(struct mpsc_port_info *pi);
-static void mpsc_free_ring_mem(struct mpsc_port_info *pi);
-static void mpsc_release_port(struct uart_port *port);
-/*
- ******************************************************************************
- *
- * Baud Rate Generator Routines (BRG)
- *
- ******************************************************************************
- */
-static void mpsc_brg_init(struct mpsc_port_info *pi, u32 clk_src)
-{
-	u32	v;
-
-	v = (pi->mirror_regs) ? pi->BRG_BCR_m : readl(pi->brg_base + BRG_BCR);
-	v = (v & ~(0xf << 18)) | ((clk_src & 0xf) << 18);
-
-	if (pi->brg_can_tune)
-		v &= ~(1 << 25);
-
-	if (pi->mirror_regs)
-		pi->BRG_BCR_m = v;
-	writel(v, pi->brg_base + BRG_BCR);
-
-	writel(readl(pi->brg_base + BRG_BTR) & 0xffff0000,
-		pi->brg_base + BRG_BTR);
-}
-
-static void mpsc_brg_enable(struct mpsc_port_info *pi)
-{
-	u32	v;
-
-	v = (pi->mirror_regs) ? pi->BRG_BCR_m : readl(pi->brg_base + BRG_BCR);
-	v |= (1 << 16);
-
-	if (pi->mirror_regs)
-		pi->BRG_BCR_m = v;
-	writel(v, pi->brg_base + BRG_BCR);
-}
-
-static void mpsc_brg_disable(struct mpsc_port_info *pi)
-{
-	u32	v;
-
-	v = (pi->mirror_regs) ? pi->BRG_BCR_m : readl(pi->brg_base + BRG_BCR);
-	v &= ~(1 << 16);
-
-	if (pi->mirror_regs)
-		pi->BRG_BCR_m = v;
-	writel(v, pi->brg_base + BRG_BCR);
-}
-
-/*
- * To set the baud, we adjust the CDV field in the BRG_BCR reg.
- * From manual: Baud = clk / ((CDV+1)*2) ==> CDV = (clk / (baud*2)) - 1.
- * However, the input clock is divided by 16 in the MPSC b/c of how
- * 'MPSC_MMCRH' was set up so we have to divide the 'clk' used in our
- * calculation by 16 to account for that.  So the real calculation
- * that accounts for the way the mpsc is set up is:
- * CDV = (clk / (baud*2*16)) - 1 ==> CDV = (clk / (baud << 5)) - 1.
- */
-static void mpsc_set_baudrate(struct mpsc_port_info *pi, u32 baud)
-{
-	u32	cdv = (pi->port.uartclk / (baud << 5)) - 1;
-	u32	v;
-
-	mpsc_brg_disable(pi);
-	v = (pi->mirror_regs) ? pi->BRG_BCR_m : readl(pi->brg_base + BRG_BCR);
-	v = (v & 0xffff0000) | (cdv & 0xffff);
-
-	if (pi->mirror_regs)
-		pi->BRG_BCR_m = v;
-	writel(v, pi->brg_base + BRG_BCR);
-	mpsc_brg_enable(pi);
-}
-
-/*
- ******************************************************************************
- *
- * Serial DMA Routines (SDMA)
- *
- ******************************************************************************
- */
-
-static void mpsc_sdma_burstsize(struct mpsc_port_info *pi, u32 burst_size)
-{
-	u32	v;
-
-	pr_debug("mpsc_sdma_burstsize[%d]: burst_size: %d\n",
-			pi->port.line, burst_size);
-
-	burst_size >>= 3; /* Divide by 8 b/c reg values are 8-byte chunks */
-
-	if (burst_size < 2)
-		v = 0x0;	/* 1 64-bit word */
-	else if (burst_size < 4)
-		v = 0x1;	/* 2 64-bit words */
-	else if (burst_size < 8)
-		v = 0x2;	/* 4 64-bit words */
-	else
-		v = 0x3;	/* 8 64-bit words */
-
-	writel((readl(pi->sdma_base + SDMA_SDC) & (0x3 << 12)) | (v << 12),
-		pi->sdma_base + SDMA_SDC);
-}
-
-static void mpsc_sdma_init(struct mpsc_port_info *pi, u32 burst_size)
-{
-	pr_debug("mpsc_sdma_init[%d]: burst_size: %d\n", pi->port.line,
-		burst_size);
-
-	writel((readl(pi->sdma_base + SDMA_SDC) & 0x3ff) | 0x03f,
-		pi->sdma_base + SDMA_SDC);
-	mpsc_sdma_burstsize(pi, burst_size);
-}
-
-static u32 mpsc_sdma_intr_mask(struct mpsc_port_info *pi, u32 mask)
-{
-	u32	old, v;
-
-	pr_debug("mpsc_sdma_intr_mask[%d]: mask: 0x%x\n", pi->port.line, mask);
-
-	old = v = (pi->mirror_regs) ? pi->shared_regs->SDMA_INTR_MASK_m :
-		readl(pi->shared_regs->sdma_intr_base + SDMA_INTR_MASK);
-
-	mask &= 0xf;
-	if (pi->port.line)
-		mask <<= 8;
-	v &= ~mask;
-
-	if (pi->mirror_regs)
-		pi->shared_regs->SDMA_INTR_MASK_m = v;
-	writel(v, pi->shared_regs->sdma_intr_base + SDMA_INTR_MASK);
-
-	if (pi->port.line)
-		old >>= 8;
-	return old & 0xf;
-}
-
-static void mpsc_sdma_intr_unmask(struct mpsc_port_info *pi, u32 mask)
-{
-	u32	v;
-
-	pr_debug("mpsc_sdma_intr_unmask[%d]: mask: 0x%x\n", pi->port.line,mask);
-
-	v = (pi->mirror_regs) ? pi->shared_regs->SDMA_INTR_MASK_m
-		: readl(pi->shared_regs->sdma_intr_base + SDMA_INTR_MASK);
-
-	mask &= 0xf;
-	if (pi->port.line)
-		mask <<= 8;
-	v |= mask;
-
-	if (pi->mirror_regs)
-		pi->shared_regs->SDMA_INTR_MASK_m = v;
-	writel(v, pi->shared_regs->sdma_intr_base + SDMA_INTR_MASK);
-}
-
-static void mpsc_sdma_intr_ack(struct mpsc_port_info *pi)
-{
-	pr_debug("mpsc_sdma_intr_ack[%d]: Acknowledging IRQ\n", pi->port.line);
-
-	if (pi->mirror_regs)
-		pi->shared_regs->SDMA_INTR_CAUSE_m = 0;
-	writeb(0x00, pi->shared_regs->sdma_intr_base + SDMA_INTR_CAUSE
-			+ pi->port.line);
-}
-
-static void mpsc_sdma_set_rx_ring(struct mpsc_port_info *pi,
-		struct mpsc_rx_desc *rxre_p)
-{
-	pr_debug("mpsc_sdma_set_rx_ring[%d]: rxre_p: 0x%x\n",
-		pi->port.line, (u32)rxre_p);
-
-	writel((u32)rxre_p, pi->sdma_base + SDMA_SCRDP);
-}
-
-static void mpsc_sdma_set_tx_ring(struct mpsc_port_info *pi,
-		struct mpsc_tx_desc *txre_p)
-{
-	writel((u32)txre_p, pi->sdma_base + SDMA_SFTDP);
-	writel((u32)txre_p, pi->sdma_base + SDMA_SCTDP);
-}
-
-static void mpsc_sdma_cmd(struct mpsc_port_info *pi, u32 val)
-{
-	u32	v;
-
-	v = readl(pi->sdma_base + SDMA_SDCM);
-	if (val)
-		v |= val;
-	else
-		v = 0;
-	wmb();
-	writel(v, pi->sdma_base + SDMA_SDCM);
-	wmb();
-}
-
-static uint mpsc_sdma_tx_active(struct mpsc_port_info *pi)
-{
-	return readl(pi->sdma_base + SDMA_SDCM) & SDMA_SDCM_TXD;
-}
-
-static void mpsc_sdma_start_tx(struct mpsc_port_info *pi)
-{
-	struct mpsc_tx_desc *txre, *txre_p;
-
-	/* If tx isn't running & there's a desc ready to go, start it */
-	if (!mpsc_sdma_tx_active(pi)) {
-		txre = (struct mpsc_tx_desc *)(pi->txr
-				+ (pi->txr_tail * MPSC_TXRE_SIZE));
-		dma_cache_sync(pi->port.dev, (void *)txre, MPSC_TXRE_SIZE,
-				DMA_FROM_DEVICE);
-#if defined(CONFIG_PPC32) && !defined(CONFIG_NOT_COHERENT_CACHE)
-		if (pi->cache_mgmt) /* GT642[46]0 Res #COMM-2 */
-			invalidate_dcache_range((ulong)txre,
-					(ulong)txre + MPSC_TXRE_SIZE);
-#endif
-
-		if (be32_to_cpu(txre->cmdstat) & SDMA_DESC_CMDSTAT_O) {
-			txre_p = (struct mpsc_tx_desc *)
-				(pi->txr_p + (pi->txr_tail * MPSC_TXRE_SIZE));
-
-			mpsc_sdma_set_tx_ring(pi, txre_p);
-			mpsc_sdma_cmd(pi, SDMA_SDCM_STD | SDMA_SDCM_TXD);
-		}
-	}
-}
-
-static void mpsc_sdma_stop(struct mpsc_port_info *pi)
-{
-	pr_debug("mpsc_sdma_stop[%d]: Stopping SDMA\n", pi->port.line);
-
-	/* Abort any SDMA transfers */
-	mpsc_sdma_cmd(pi, 0);
-	mpsc_sdma_cmd(pi, SDMA_SDCM_AR | SDMA_SDCM_AT);
-
-	/* Clear the SDMA current and first TX and RX pointers */
-	mpsc_sdma_set_tx_ring(pi, NULL);
-	mpsc_sdma_set_rx_ring(pi, NULL);
-
-	/* Disable interrupts */
-	mpsc_sdma_intr_mask(pi, 0xf);
-	mpsc_sdma_intr_ack(pi);
-}
-
-/*
- ******************************************************************************
- *
- * Multi-Protocol Serial Controller Routines (MPSC)
- *
- ******************************************************************************
- */
-
-static void mpsc_hw_init(struct mpsc_port_info *pi)
-{
-	u32	v;
-
-	pr_debug("mpsc_hw_init[%d]: Initializing hardware\n", pi->port.line);
-
-	/* Set up clock routing */
-	if (pi->mirror_regs) {
-		v = pi->shared_regs->MPSC_MRR_m;
-		v &= ~0x1c7;
-		pi->shared_regs->MPSC_MRR_m = v;
-		writel(v, pi->shared_regs->mpsc_routing_base + MPSC_MRR);
-
-		v = pi->shared_regs->MPSC_RCRR_m;
-		v = (v & ~0xf0f) | 0x100;
-		pi->shared_regs->MPSC_RCRR_m = v;
-		writel(v, pi->shared_regs->mpsc_routing_base + MPSC_RCRR);
-
-		v = pi->shared_regs->MPSC_TCRR_m;
-		v = (v & ~0xf0f) | 0x100;
-		pi->shared_regs->MPSC_TCRR_m = v;
-		writel(v, pi->shared_regs->mpsc_routing_base + MPSC_TCRR);
-	} else {
-		v = readl(pi->shared_regs->mpsc_routing_base + MPSC_MRR);
-		v &= ~0x1c7;
-		writel(v, pi->shared_regs->mpsc_routing_base + MPSC_MRR);
-
-		v = readl(pi->shared_regs->mpsc_routing_base + MPSC_RCRR);
-		v = (v & ~0xf0f) | 0x100;
-		writel(v, pi->shared_regs->mpsc_routing_base + MPSC_RCRR);
-
-		v = readl(pi->shared_regs->mpsc_routing_base + MPSC_TCRR);
-		v = (v & ~0xf0f) | 0x100;
-		writel(v, pi->shared_regs->mpsc_routing_base + MPSC_TCRR);
-	}
-
-	/* Put MPSC in UART mode & enabel Tx/Rx egines */
-	writel(0x000004c4, pi->mpsc_base + MPSC_MMCRL);
-
-	/* No preamble, 16x divider, low-latency, */
-	writel(0x04400400, pi->mpsc_base + MPSC_MMCRH);
-	mpsc_set_baudrate(pi, pi->default_baud);
-
-	if (pi->mirror_regs) {
-		pi->MPSC_CHR_1_m = 0;
-		pi->MPSC_CHR_2_m = 0;
-	}
-	writel(0, pi->mpsc_base + MPSC_CHR_1);
-	writel(0, pi->mpsc_base + MPSC_CHR_2);
-	writel(pi->mpsc_max_idle, pi->mpsc_base + MPSC_CHR_3);
-	writel(0, pi->mpsc_base + MPSC_CHR_4);
-	writel(0, pi->mpsc_base + MPSC_CHR_5);
-	writel(0, pi->mpsc_base + MPSC_CHR_6);
-	writel(0, pi->mpsc_base + MPSC_CHR_7);
-	writel(0, pi->mpsc_base + MPSC_CHR_8);
-	writel(0, pi->mpsc_base + MPSC_CHR_9);
-	writel(0, pi->mpsc_base + MPSC_CHR_10);
-}
-
-static void mpsc_enter_hunt(struct mpsc_port_info *pi)
-{
-	pr_debug("mpsc_enter_hunt[%d]: Hunting...\n", pi->port.line);
-
-	if (pi->mirror_regs) {
-		writel(pi->MPSC_CHR_2_m | MPSC_CHR_2_EH,
-			pi->mpsc_base + MPSC_CHR_2);
-		/* Erratum prevents reading CHR_2 so just delay for a while */
-		udelay(100);
-	} else {
-		writel(readl(pi->mpsc_base + MPSC_CHR_2) | MPSC_CHR_2_EH,
-				pi->mpsc_base + MPSC_CHR_2);
-
-		while (readl(pi->mpsc_base + MPSC_CHR_2) & MPSC_CHR_2_EH)
-			udelay(10);
-	}
-}
-
-static void mpsc_freeze(struct mpsc_port_info *pi)
-{
-	u32	v;
-
-	pr_debug("mpsc_freeze[%d]: Freezing\n", pi->port.line);
-
-	v = (pi->mirror_regs) ? pi->MPSC_MPCR_m :
-		readl(pi->mpsc_base + MPSC_MPCR);
-	v |= MPSC_MPCR_FRZ;
-
-	if (pi->mirror_regs)
-		pi->MPSC_MPCR_m = v;
-	writel(v, pi->mpsc_base + MPSC_MPCR);
-}
-
-static void mpsc_unfreeze(struct mpsc_port_info *pi)
-{
-	u32	v;
-
-	v = (pi->mirror_regs) ? pi->MPSC_MPCR_m :
-		readl(pi->mpsc_base + MPSC_MPCR);
-	v &= ~MPSC_MPCR_FRZ;
-
-	if (pi->mirror_regs)
-		pi->MPSC_MPCR_m = v;
-	writel(v, pi->mpsc_base + MPSC_MPCR);
-
-	pr_debug("mpsc_unfreeze[%d]: Unfrozen\n", pi->port.line);
-}
-
-static void mpsc_set_char_length(struct mpsc_port_info *pi, u32 len)
-{
-	u32	v;
-
-	pr_debug("mpsc_set_char_length[%d]: char len: %d\n", pi->port.line,len);
-
-	v = (pi->mirror_regs) ? pi->MPSC_MPCR_m :
-		readl(pi->mpsc_base + MPSC_MPCR);
-	v = (v & ~(0x3 << 12)) | ((len & 0x3) << 12);
-
-	if (pi->mirror_regs)
-		pi->MPSC_MPCR_m = v;
-	writel(v, pi->mpsc_base + MPSC_MPCR);
-}
-
-static void mpsc_set_stop_bit_length(struct mpsc_port_info *pi, u32 len)
-{
-	u32	v;
-
-	pr_debug("mpsc_set_stop_bit_length[%d]: stop bits: %d\n",
-		pi->port.line, len);
-
-	v = (pi->mirror_regs) ? pi->MPSC_MPCR_m :
-		readl(pi->mpsc_base + MPSC_MPCR);
-
-	v = (v & ~(1 << 14)) | ((len & 0x1) << 14);
-
-	if (pi->mirror_regs)
-		pi->MPSC_MPCR_m = v;
-	writel(v, pi->mpsc_base + MPSC_MPCR);
-}
-
-static void mpsc_set_parity(struct mpsc_port_info *pi, u32 p)
-{
-	u32	v;
-
-	pr_debug("mpsc_set_parity[%d]: parity bits: 0x%x\n", pi->port.line, p);
-
-	v = (pi->mirror_regs) ? pi->MPSC_CHR_2_m :
-		readl(pi->mpsc_base + MPSC_CHR_2);
-
-	p &= 0x3;
-	v = (v & ~0xc000c) | (p << 18) | (p << 2);
-
-	if (pi->mirror_regs)
-		pi->MPSC_CHR_2_m = v;
-	writel(v, pi->mpsc_base + MPSC_CHR_2);
-}
-
-/*
- ******************************************************************************
- *
- * Driver Init Routines
- *
- ******************************************************************************
- */
-
-static void mpsc_init_hw(struct mpsc_port_info *pi)
-{
-	pr_debug("mpsc_init_hw[%d]: Initializing\n", pi->port.line);
-
-	mpsc_brg_init(pi, pi->brg_clk_src);
-	mpsc_brg_enable(pi);
-	mpsc_sdma_init(pi, dma_get_cache_alignment());	/* burst a cacheline */
-	mpsc_sdma_stop(pi);
-	mpsc_hw_init(pi);
-}
-
-static int mpsc_alloc_ring_mem(struct mpsc_port_info *pi)
-{
-	int rc = 0;
-
-	pr_debug("mpsc_alloc_ring_mem[%d]: Allocating ring mem\n",
-		pi->port.line);
-
-	if (!pi->dma_region) {
-		if (!dma_set_mask(pi->port.dev, 0xffffffff)) {
-			printk(KERN_ERR "MPSC: Inadequate DMA support\n");
-			rc = -ENXIO;
-		} else if ((pi->dma_region = dma_alloc_attrs(pi->port.dev,
-						MPSC_DMA_ALLOC_SIZE,
-						&pi->dma_region_p, GFP_KERNEL,
-						DMA_ATTR_NON_CONSISTENT))
-				== NULL) {
-			printk(KERN_ERR "MPSC: Can't alloc Desc region\n");
-			rc = -ENOMEM;
-		}
-	}
-
-	return rc;
-}
-
-static void mpsc_free_ring_mem(struct mpsc_port_info *pi)
-{
-	pr_debug("mpsc_free_ring_mem[%d]: Freeing ring mem\n", pi->port.line);
-
-	if (pi->dma_region) {
-		dma_free_attrs(pi->port.dev, MPSC_DMA_ALLOC_SIZE,
-				pi->dma_region, pi->dma_region_p,
-				DMA_ATTR_NON_CONSISTENT);
-		pi->dma_region = NULL;
-		pi->dma_region_p = (dma_addr_t)NULL;
-	}
-}
-
-static void mpsc_init_rings(struct mpsc_port_info *pi)
-{
-	struct mpsc_rx_desc *rxre;
-	struct mpsc_tx_desc *txre;
-	dma_addr_t dp, dp_p;
-	u8 *bp, *bp_p;
-	int i;
-
-	pr_debug("mpsc_init_rings[%d]: Initializing rings\n", pi->port.line);
-
-	BUG_ON(pi->dma_region == NULL);
-
-	memset(pi->dma_region, 0, MPSC_DMA_ALLOC_SIZE);
-
-	/*
-	 * Descriptors & buffers are multiples of cacheline size and must be
-	 * cacheline aligned.
-	 */
-	dp = ALIGN((u32)pi->dma_region, dma_get_cache_alignment());
-	dp_p = ALIGN((u32)pi->dma_region_p, dma_get_cache_alignment());
-
-	/*
-	 * Partition dma region into rx ring descriptor, rx buffers,
-	 * tx ring descriptors, and tx buffers.
-	 */
-	pi->rxr = dp;
-	pi->rxr_p = dp_p;
-	dp += MPSC_RXR_SIZE;
-	dp_p += MPSC_RXR_SIZE;
-
-	pi->rxb = (u8 *)dp;
-	pi->rxb_p = (u8 *)dp_p;
-	dp += MPSC_RXB_SIZE;
-	dp_p += MPSC_RXB_SIZE;
-
-	pi->rxr_posn = 0;
-
-	pi->txr = dp;
-	pi->txr_p = dp_p;
-	dp += MPSC_TXR_SIZE;
-	dp_p += MPSC_TXR_SIZE;
-
-	pi->txb = (u8 *)dp;
-	pi->txb_p = (u8 *)dp_p;
-
-	pi->txr_head = 0;
-	pi->txr_tail = 0;
-
-	/* Init rx ring descriptors */
-	dp = pi->rxr;
-	dp_p = pi->rxr_p;
-	bp = pi->rxb;
-	bp_p = pi->rxb_p;
-
-	for (i = 0; i < MPSC_RXR_ENTRIES; i++) {
-		rxre = (struct mpsc_rx_desc *)dp;
-
-		rxre->bufsize = cpu_to_be16(MPSC_RXBE_SIZE);
-		rxre->bytecnt = cpu_to_be16(0);
-		rxre->cmdstat = cpu_to_be32(SDMA_DESC_CMDSTAT_O
-				| SDMA_DESC_CMDSTAT_EI | SDMA_DESC_CMDSTAT_F
-				| SDMA_DESC_CMDSTAT_L);
-		rxre->link = cpu_to_be32(dp_p + MPSC_RXRE_SIZE);
-		rxre->buf_ptr = cpu_to_be32(bp_p);
-
-		dp += MPSC_RXRE_SIZE;
-		dp_p += MPSC_RXRE_SIZE;
-		bp += MPSC_RXBE_SIZE;
-		bp_p += MPSC_RXBE_SIZE;
-	}
-	rxre->link = cpu_to_be32(pi->rxr_p);	/* Wrap last back to first */
-
-	/* Init tx ring descriptors */
-	dp = pi->txr;
-	dp_p = pi->txr_p;
-	bp = pi->txb;
-	bp_p = pi->txb_p;
-
-	for (i = 0; i < MPSC_TXR_ENTRIES; i++) {
-		txre = (struct mpsc_tx_desc *)dp;
-
-		txre->link = cpu_to_be32(dp_p + MPSC_TXRE_SIZE);
-		txre->buf_ptr = cpu_to_be32(bp_p);
-
-		dp += MPSC_TXRE_SIZE;
-		dp_p += MPSC_TXRE_SIZE;
-		bp += MPSC_TXBE_SIZE;
-		bp_p += MPSC_TXBE_SIZE;
-	}
-	txre->link = cpu_to_be32(pi->txr_p);	/* Wrap last back to first */
-
-	dma_cache_sync(pi->port.dev, (void *)pi->dma_region,
-			MPSC_DMA_ALLOC_SIZE, DMA_BIDIRECTIONAL);
-#if defined(CONFIG_PPC32) && !defined(CONFIG_NOT_COHERENT_CACHE)
-		if (pi->cache_mgmt) /* GT642[46]0 Res #COMM-2 */
-			flush_dcache_range((ulong)pi->dma_region,
-					(ulong)pi->dma_region
-					+ MPSC_DMA_ALLOC_SIZE);
-#endif
-
-	return;
-}
-
-static void mpsc_uninit_rings(struct mpsc_port_info *pi)
-{
-	pr_debug("mpsc_uninit_rings[%d]: Uninitializing rings\n",pi->port.line);
-
-	BUG_ON(pi->dma_region == NULL);
-
-	pi->rxr = 0;
-	pi->rxr_p = 0;
-	pi->rxb = NULL;
-	pi->rxb_p = NULL;
-	pi->rxr_posn = 0;
-
-	pi->txr = 0;
-	pi->txr_p = 0;
-	pi->txb = NULL;
-	pi->txb_p = NULL;
-	pi->txr_head = 0;
-	pi->txr_tail = 0;
-}
-
-static int mpsc_make_ready(struct mpsc_port_info *pi)
-{
-	int rc;
-
-	pr_debug("mpsc_make_ready[%d]: Making cltr ready\n", pi->port.line);
-
-	if (!pi->ready) {
-		mpsc_init_hw(pi);
-		rc = mpsc_alloc_ring_mem(pi);
-		if (rc)
-			return rc;
-		mpsc_init_rings(pi);
-		pi->ready = 1;
-	}
-
-	return 0;
-}
-
-#ifdef CONFIG_CONSOLE_POLL
-static int serial_polled;
-#endif
-
-/*
- ******************************************************************************
- *
- * Interrupt Handling Routines
- *
- ******************************************************************************
- */
-
-static int mpsc_rx_intr(struct mpsc_port_info *pi, unsigned long *flags)
-{
-	struct mpsc_rx_desc *rxre;
-	struct tty_port *port = &pi->port.state->port;
-	u32	cmdstat, bytes_in, i;
-	int	rc = 0;
-	u8	*bp;
-	char	flag = TTY_NORMAL;
-
-	pr_debug("mpsc_rx_intr[%d]: Handling Rx intr\n", pi->port.line);
-
-	rxre = (struct mpsc_rx_desc *)(pi->rxr + (pi->rxr_posn*MPSC_RXRE_SIZE));
-
-	dma_cache_sync(pi->port.dev, (void *)rxre, MPSC_RXRE_SIZE,
-			DMA_FROM_DEVICE);
-#if defined(CONFIG_PPC32) && !defined(CONFIG_NOT_COHERENT_CACHE)
-	if (pi->cache_mgmt) /* GT642[46]0 Res #COMM-2 */
-		invalidate_dcache_range((ulong)rxre,
-				(ulong)rxre + MPSC_RXRE_SIZE);
-#endif
-
-	/*
-	 * Loop through Rx descriptors handling ones that have been completed.
-	 */
-	while (!((cmdstat = be32_to_cpu(rxre->cmdstat))
-				& SDMA_DESC_CMDSTAT_O)) {
-		bytes_in = be16_to_cpu(rxre->bytecnt);
-#ifdef CONFIG_CONSOLE_POLL
-		if (unlikely(serial_polled)) {
-			serial_polled = 0;
-			return 0;
-		}
-#endif
-		/* Following use of tty struct directly is deprecated */
-		if (tty_buffer_request_room(port, bytes_in) < bytes_in) {
-			if (port->low_latency) {
-				spin_unlock_irqrestore(&pi->port.lock, *flags);
-				tty_flip_buffer_push(port);
-				spin_lock_irqsave(&pi->port.lock, *flags);
-			}
-			/*
-			 * If this failed then we will throw away the bytes
-			 * but must do so to clear interrupts.
-			 */
-		}
-
-		bp = pi->rxb + (pi->rxr_posn * MPSC_RXBE_SIZE);
-		dma_cache_sync(pi->port.dev, (void *)bp, MPSC_RXBE_SIZE,
-				DMA_FROM_DEVICE);
-#if defined(CONFIG_PPC32) && !defined(CONFIG_NOT_COHERENT_CACHE)
-		if (pi->cache_mgmt) /* GT642[46]0 Res #COMM-2 */
-			invalidate_dcache_range((ulong)bp,
-					(ulong)bp + MPSC_RXBE_SIZE);
-#endif
-
-		/*
-		 * Other than for parity error, the manual provides little
-		 * info on what data will be in a frame flagged by any of
-		 * these errors.  For parity error, it is the last byte in
-		 * the buffer that had the error.  As for the rest, I guess
-		 * we'll assume there is no data in the buffer.
-		 * If there is...it gets lost.
-		 */
-		if (unlikely(cmdstat & (SDMA_DESC_CMDSTAT_BR
-						| SDMA_DESC_CMDSTAT_FR
-						| SDMA_DESC_CMDSTAT_OR))) {
-
-			pi->port.icount.rx++;
-
-			if (cmdstat & SDMA_DESC_CMDSTAT_BR) {	/* Break */
-				pi->port.icount.brk++;
-
-				if (uart_handle_break(&pi->port))
-					goto next_frame;
-			} else if (cmdstat & SDMA_DESC_CMDSTAT_FR) {
-				pi->port.icount.frame++;
-			} else if (cmdstat & SDMA_DESC_CMDSTAT_OR) {
-				pi->port.icount.overrun++;
-			}
-
-			cmdstat &= pi->port.read_status_mask;
-
-			if (cmdstat & SDMA_DESC_CMDSTAT_BR)
-				flag = TTY_BREAK;
-			else if (cmdstat & SDMA_DESC_CMDSTAT_FR)
-				flag = TTY_FRAME;
-			else if (cmdstat & SDMA_DESC_CMDSTAT_OR)
-				flag = TTY_OVERRUN;
-			else if (cmdstat & SDMA_DESC_CMDSTAT_PE)
-				flag = TTY_PARITY;
-		}
-
-		if (uart_handle_sysrq_char(&pi->port, *bp)) {
-			bp++;
-			bytes_in--;
-#ifdef CONFIG_CONSOLE_POLL
-			if (unlikely(serial_polled)) {
-				serial_polled = 0;
-				return 0;
-			}
-#endif
-			goto next_frame;
-		}
-
-		if ((unlikely(cmdstat & (SDMA_DESC_CMDSTAT_BR
-						| SDMA_DESC_CMDSTAT_FR
-						| SDMA_DESC_CMDSTAT_OR)))
-				&& !(cmdstat & pi->port.ignore_status_mask)) {
-			tty_insert_flip_char(port, *bp, flag);
-		} else {
-			for (i=0; i<bytes_in; i++)
-				tty_insert_flip_char(port, *bp++, TTY_NORMAL);
-
-			pi->port.icount.rx += bytes_in;
-		}
-
-next_frame:
-		rxre->bytecnt = cpu_to_be16(0);
-		wmb();
-		rxre->cmdstat = cpu_to_be32(SDMA_DESC_CMDSTAT_O
-				| SDMA_DESC_CMDSTAT_EI | SDMA_DESC_CMDSTAT_F
-				| SDMA_DESC_CMDSTAT_L);
-		wmb();
-		dma_cache_sync(pi->port.dev, (void *)rxre, MPSC_RXRE_SIZE,
-				DMA_BIDIRECTIONAL);
-#if defined(CONFIG_PPC32) && !defined(CONFIG_NOT_COHERENT_CACHE)
-		if (pi->cache_mgmt) /* GT642[46]0 Res #COMM-2 */
-			flush_dcache_range((ulong)rxre,
-					(ulong)rxre + MPSC_RXRE_SIZE);
-#endif
-
-		/* Advance to next descriptor */
-		pi->rxr_posn = (pi->rxr_posn + 1) & (MPSC_RXR_ENTRIES - 1);
-		rxre = (struct mpsc_rx_desc *)
-			(pi->rxr + (pi->rxr_posn * MPSC_RXRE_SIZE));
-		dma_cache_sync(pi->port.dev, (void *)rxre, MPSC_RXRE_SIZE,
-				DMA_FROM_DEVICE);
-#if defined(CONFIG_PPC32) && !defined(CONFIG_NOT_COHERENT_CACHE)
-		if (pi->cache_mgmt) /* GT642[46]0 Res #COMM-2 */
-			invalidate_dcache_range((ulong)rxre,
-					(ulong)rxre + MPSC_RXRE_SIZE);
-#endif
-		rc = 1;
-	}
-
-	/* Restart rx engine, if its stopped */
-	if ((readl(pi->sdma_base + SDMA_SDCM) & SDMA_SDCM_ERD) == 0)
-		mpsc_start_rx(pi);
-
-	spin_unlock_irqrestore(&pi->port.lock, *flags);
-	tty_flip_buffer_push(port);
-	spin_lock_irqsave(&pi->port.lock, *flags);
-	return rc;
-}
-
-static void mpsc_setup_tx_desc(struct mpsc_port_info *pi, u32 count, u32 intr)
-{
-	struct mpsc_tx_desc *txre;
-
-	txre = (struct mpsc_tx_desc *)(pi->txr
-			+ (pi->txr_head * MPSC_TXRE_SIZE));
-
-	txre->bytecnt = cpu_to_be16(count);
-	txre->shadow = txre->bytecnt;
-	wmb();			/* ensure cmdstat is last field updated */
-	txre->cmdstat = cpu_to_be32(SDMA_DESC_CMDSTAT_O | SDMA_DESC_CMDSTAT_F
-			| SDMA_DESC_CMDSTAT_L
-			| ((intr) ? SDMA_DESC_CMDSTAT_EI : 0));
-	wmb();
-	dma_cache_sync(pi->port.dev, (void *)txre, MPSC_TXRE_SIZE,
-			DMA_BIDIRECTIONAL);
-#if defined(CONFIG_PPC32) && !defined(CONFIG_NOT_COHERENT_CACHE)
-	if (pi->cache_mgmt) /* GT642[46]0 Res #COMM-2 */
-		flush_dcache_range((ulong)txre,
-				(ulong)txre + MPSC_TXRE_SIZE);
-#endif
-}
-
-static void mpsc_copy_tx_data(struct mpsc_port_info *pi)
-{
-	struct circ_buf *xmit = &pi->port.state->xmit;
-	u8 *bp;
-	u32 i;
-
-	/* Make sure the desc ring isn't full */
-	while (CIRC_CNT(pi->txr_head, pi->txr_tail, MPSC_TXR_ENTRIES)
-			< (MPSC_TXR_ENTRIES - 1)) {
-		if (pi->port.x_char) {
-			/*
-			 * Ideally, we should use the TCS field in
-			 * CHR_1 to put the x_char out immediately but
-			 * errata prevents us from being able to read
-			 * CHR_2 to know that its safe to write to
-			 * CHR_1.  Instead, just put it in-band with
-			 * all the other Tx data.
-			 */
-			bp = pi->txb + (pi->txr_head * MPSC_TXBE_SIZE);
-			*bp = pi->port.x_char;
-			pi->port.x_char = 0;
-			i = 1;
-		} else if (!uart_circ_empty(xmit)
-				&& !uart_tx_stopped(&pi->port)) {
-			i = min((u32)MPSC_TXBE_SIZE,
-				(u32)uart_circ_chars_pending(xmit));
-			i = min(i, (u32)CIRC_CNT_TO_END(xmit->head, xmit->tail,
-				UART_XMIT_SIZE));
-			bp = pi->txb + (pi->txr_head * MPSC_TXBE_SIZE);
-			memcpy(bp, &xmit->buf[xmit->tail], i);
-			xmit->tail = (xmit->tail + i) & (UART_XMIT_SIZE - 1);
-
-			if (uart_circ_chars_pending(xmit) < WAKEUP_CHARS)
-				uart_write_wakeup(&pi->port);
-		} else { /* All tx data copied into ring bufs */
-			return;
-		}
-
-		dma_cache_sync(pi->port.dev, (void *)bp, MPSC_TXBE_SIZE,
-				DMA_BIDIRECTIONAL);
-#if defined(CONFIG_PPC32) && !defined(CONFIG_NOT_COHERENT_CACHE)
-		if (pi->cache_mgmt) /* GT642[46]0 Res #COMM-2 */
-			flush_dcache_range((ulong)bp,
-					(ulong)bp + MPSC_TXBE_SIZE);
-#endif
-		mpsc_setup_tx_desc(pi, i, 1);
-
-		/* Advance to next descriptor */
-		pi->txr_head = (pi->txr_head + 1) & (MPSC_TXR_ENTRIES - 1);
-	}
-}
-
-static int mpsc_tx_intr(struct mpsc_port_info *pi)
-{
-	struct mpsc_tx_desc *txre;
-	int rc = 0;
-	unsigned long iflags;
-
-	spin_lock_irqsave(&pi->tx_lock, iflags);
-
-	if (!mpsc_sdma_tx_active(pi)) {
-		txre = (struct mpsc_tx_desc *)(pi->txr
-				+ (pi->txr_tail * MPSC_TXRE_SIZE));
-
-		dma_cache_sync(pi->port.dev, (void *)txre, MPSC_TXRE_SIZE,
-				DMA_FROM_DEVICE);
-#if defined(CONFIG_PPC32) && !defined(CONFIG_NOT_COHERENT_CACHE)
-		if (pi->cache_mgmt) /* GT642[46]0 Res #COMM-2 */
-			invalidate_dcache_range((ulong)txre,
-					(ulong)txre + MPSC_TXRE_SIZE);
-#endif
-
-		while (!(be32_to_cpu(txre->cmdstat) & SDMA_DESC_CMDSTAT_O)) {
-			rc = 1;
-			pi->port.icount.tx += be16_to_cpu(txre->bytecnt);
-			pi->txr_tail = (pi->txr_tail+1) & (MPSC_TXR_ENTRIES-1);
-
-			/* If no more data to tx, fall out of loop */
-			if (pi->txr_head == pi->txr_tail)
-				break;
-
-			txre = (struct mpsc_tx_desc *)(pi->txr
-					+ (pi->txr_tail * MPSC_TXRE_SIZE));
-			dma_cache_sync(pi->port.dev, (void *)txre,
-					MPSC_TXRE_SIZE, DMA_FROM_DEVICE);
-#if defined(CONFIG_PPC32) && !defined(CONFIG_NOT_COHERENT_CACHE)
-			if (pi->cache_mgmt) /* GT642[46]0 Res #COMM-2 */
-				invalidate_dcache_range((ulong)txre,
-						(ulong)txre + MPSC_TXRE_SIZE);
-#endif
-		}
-
-		mpsc_copy_tx_data(pi);
-		mpsc_sdma_start_tx(pi);	/* start next desc if ready */
-	}
-
-	spin_unlock_irqrestore(&pi->tx_lock, iflags);
-	return rc;
-}
-
-/*
- * This is the driver's interrupt handler.  To avoid a race, we first clear
- * the interrupt, then handle any completed Rx/Tx descriptors.  When done
- * handling those descriptors, we restart the Rx/Tx engines if they're stopped.
- */
-static irqreturn_t mpsc_sdma_intr(int irq, void *dev_id)
-{
-	struct mpsc_port_info *pi = dev_id;
-	ulong iflags;
-	int rc = IRQ_NONE;
-
-	pr_debug("mpsc_sdma_intr[%d]: SDMA Interrupt Received\n",pi->port.line);
-
-	spin_lock_irqsave(&pi->port.lock, iflags);
-	mpsc_sdma_intr_ack(pi);
-	if (mpsc_rx_intr(pi, &iflags))
-		rc = IRQ_HANDLED;
-	if (mpsc_tx_intr(pi))
-		rc = IRQ_HANDLED;
-	spin_unlock_irqrestore(&pi->port.lock, iflags);
-
-	pr_debug("mpsc_sdma_intr[%d]: SDMA Interrupt Handled\n", pi->port.line);
-	return rc;
-}
-
-/*
- ******************************************************************************
- *
- * serial_core.c Interface routines
- *
- ******************************************************************************
- */
-static uint mpsc_tx_empty(struct uart_port *port)
-{
-	struct mpsc_port_info *pi =
-		container_of(port, struct mpsc_port_info, port);
-	ulong iflags;
-	uint rc;
-
-	spin_lock_irqsave(&pi->port.lock, iflags);
-	rc = mpsc_sdma_tx_active(pi) ? 0 : TIOCSER_TEMT;
-	spin_unlock_irqrestore(&pi->port.lock, iflags);
-
-	return rc;
-}
-
-static void mpsc_set_mctrl(struct uart_port *port, uint mctrl)
-{
-	/* Have no way to set modem control lines AFAICT */
-}
-
-static uint mpsc_get_mctrl(struct uart_port *port)
-{
-	struct mpsc_port_info *pi =
-		container_of(port, struct mpsc_port_info, port);
-	u32 mflags, status;
-
-	status = (pi->mirror_regs) ? pi->MPSC_CHR_10_m
-		: readl(pi->mpsc_base + MPSC_CHR_10);
-
-	mflags = 0;
-	if (status & 0x1)
-		mflags |= TIOCM_CTS;
-	if (status & 0x2)
-		mflags |= TIOCM_CAR;
-
-	return mflags | TIOCM_DSR;	/* No way to tell if DSR asserted */
-}
-
-static void mpsc_stop_tx(struct uart_port *port)
-{
-	struct mpsc_port_info *pi =
-		container_of(port, struct mpsc_port_info, port);
-
-	pr_debug("mpsc_stop_tx[%d]\n", port->line);
-
-	mpsc_freeze(pi);
-}
-
-static void mpsc_start_tx(struct uart_port *port)
-{
-	struct mpsc_port_info *pi =
-		container_of(port, struct mpsc_port_info, port);
-	unsigned long iflags;
-
-	spin_lock_irqsave(&pi->tx_lock, iflags);
-
-	mpsc_unfreeze(pi);
-	mpsc_copy_tx_data(pi);
-	mpsc_sdma_start_tx(pi);
-
-	spin_unlock_irqrestore(&pi->tx_lock, iflags);
-
-	pr_debug("mpsc_start_tx[%d]\n", port->line);
-}
-
-static void mpsc_start_rx(struct mpsc_port_info *pi)
-{
-	pr_debug("mpsc_start_rx[%d]: Starting...\n", pi->port.line);
-
-	if (pi->rcv_data) {
-		mpsc_enter_hunt(pi);
-		mpsc_sdma_cmd(pi, SDMA_SDCM_ERD);
-	}
-}
-
-static void mpsc_stop_rx(struct uart_port *port)
-{
-	struct mpsc_port_info *pi =
-		container_of(port, struct mpsc_port_info, port);
-
-	pr_debug("mpsc_stop_rx[%d]: Stopping...\n", port->line);
-
-	if (pi->mirror_regs) {
-		writel(pi->MPSC_CHR_2_m | MPSC_CHR_2_RA,
-				pi->mpsc_base + MPSC_CHR_2);
-		/* Erratum prevents reading CHR_2 so just delay for a while */
-		udelay(100);
-	} else {
-		writel(readl(pi->mpsc_base + MPSC_CHR_2) | MPSC_CHR_2_RA,
-				pi->mpsc_base + MPSC_CHR_2);
-
-		while (readl(pi->mpsc_base + MPSC_CHR_2) & MPSC_CHR_2_RA)
-			udelay(10);
-	}
-
-	mpsc_sdma_cmd(pi, SDMA_SDCM_AR);
-}
-
-static void mpsc_break_ctl(struct uart_port *port, int ctl)
-{
-	struct mpsc_port_info *pi =
-		container_of(port, struct mpsc_port_info, port);
-	ulong	flags;
-	u32	v;
-
-	v = ctl ? 0x00ff0000 : 0;
-
-	spin_lock_irqsave(&pi->port.lock, flags);
-	if (pi->mirror_regs)
-		pi->MPSC_CHR_1_m = v;
-	writel(v, pi->mpsc_base + MPSC_CHR_1);
-	spin_unlock_irqrestore(&pi->port.lock, flags);
-}
-
-static int mpsc_startup(struct uart_port *port)
-{
-	struct mpsc_port_info *pi =
-		container_of(port, struct mpsc_port_info, port);
-	u32 flag = 0;
-	int rc;
-
-	pr_debug("mpsc_startup[%d]: Starting up MPSC, irq: %d\n",
-		port->line, pi->port.irq);
-
-	if ((rc = mpsc_make_ready(pi)) == 0) {
-		/* Setup IRQ handler */
-		mpsc_sdma_intr_ack(pi);
-
-		/* If irq's are shared, need to set flag */
-		if (mpsc_ports[0].port.irq == mpsc_ports[1].port.irq)
-			flag = IRQF_SHARED;
-
-		if (request_irq(pi->port.irq, mpsc_sdma_intr, flag,
-					"mpsc-sdma", pi))
-			printk(KERN_ERR "MPSC: Can't get SDMA IRQ %d\n",
-					pi->port.irq);
-
-		mpsc_sdma_intr_unmask(pi, 0xf);
-		mpsc_sdma_set_rx_ring(pi, (struct mpsc_rx_desc *)(pi->rxr_p
-					+ (pi->rxr_posn * MPSC_RXRE_SIZE)));
-	}
-
-	return rc;
-}
-
-static void mpsc_shutdown(struct uart_port *port)
-{
-	struct mpsc_port_info *pi =
-		container_of(port, struct mpsc_port_info, port);
-
-	pr_debug("mpsc_shutdown[%d]: Shutting down MPSC\n", port->line);
-
-	mpsc_sdma_stop(pi);
-	free_irq(pi->port.irq, pi);
-}
-
-static void mpsc_set_termios(struct uart_port *port, struct ktermios *termios,
-		 struct ktermios *old)
-{
-	struct mpsc_port_info *pi =
-		container_of(port, struct mpsc_port_info, port);
-	u32 baud;
-	ulong flags;
-	u32 chr_bits, stop_bits, par;
-
-	switch (termios->c_cflag & CSIZE) {
-	case CS5:
-		chr_bits = MPSC_MPCR_CL_5;
-		break;
-	case CS6:
-		chr_bits = MPSC_MPCR_CL_6;
-		break;
-	case CS7:
-		chr_bits = MPSC_MPCR_CL_7;
-		break;
-	case CS8:
-	default:
-		chr_bits = MPSC_MPCR_CL_8;
-		break;
-	}
-
-	if (termios->c_cflag & CSTOPB)
-		stop_bits = MPSC_MPCR_SBL_2;
-	else
-		stop_bits = MPSC_MPCR_SBL_1;
-
-	par = MPSC_CHR_2_PAR_EVEN;
-	if (termios->c_cflag & PARENB)
-		if (termios->c_cflag & PARODD)
-			par = MPSC_CHR_2_PAR_ODD;
-#ifdef	CMSPAR
-		if (termios->c_cflag & CMSPAR) {
-			if (termios->c_cflag & PARODD)
-				par = MPSC_CHR_2_PAR_MARK;
-			else
-				par = MPSC_CHR_2_PAR_SPACE;
-		}
-#endif
-
-	baud = uart_get_baud_rate(port, termios, old, 0, port->uartclk);
-
-	spin_lock_irqsave(&pi->port.lock, flags);
-
-	uart_update_timeout(port, termios->c_cflag, baud);
-
-	mpsc_set_char_length(pi, chr_bits);
-	mpsc_set_stop_bit_length(pi, stop_bits);
-	mpsc_set_parity(pi, par);
-	mpsc_set_baudrate(pi, baud);
-
-	/* Characters/events to read */
-	pi->port.read_status_mask = SDMA_DESC_CMDSTAT_OR;
-
-	if (termios->c_iflag & INPCK)
-		pi->port.read_status_mask |= SDMA_DESC_CMDSTAT_PE
-			| SDMA_DESC_CMDSTAT_FR;
-
-	if (termios->c_iflag & (IGNBRK | BRKINT | PARMRK))
-		pi->port.read_status_mask |= SDMA_DESC_CMDSTAT_BR;
-
-	/* Characters/events to ignore */
-	pi->port.ignore_status_mask = 0;
-
-	if (termios->c_iflag & IGNPAR)
-		pi->port.ignore_status_mask |= SDMA_DESC_CMDSTAT_PE
-			| SDMA_DESC_CMDSTAT_FR;
-
-	if (termios->c_iflag & IGNBRK) {
-		pi->port.ignore_status_mask |= SDMA_DESC_CMDSTAT_BR;
-
-		if (termios->c_iflag & IGNPAR)
-			pi->port.ignore_status_mask |= SDMA_DESC_CMDSTAT_OR;
-	}
-
-	if ((termios->c_cflag & CREAD)) {
-		if (!pi->rcv_data) {
-			pi->rcv_data = 1;
-			mpsc_start_rx(pi);
-		}
-	} else if (pi->rcv_data) {
-		mpsc_stop_rx(port);
-		pi->rcv_data = 0;
-	}
-
-	spin_unlock_irqrestore(&pi->port.lock, flags);
-}
-
-static const char *mpsc_type(struct uart_port *port)
-{
-	pr_debug("mpsc_type[%d]: port type: %s\n", port->line,MPSC_DRIVER_NAME);
-	return MPSC_DRIVER_NAME;
-}
-
-static int mpsc_request_port(struct uart_port *port)
-{
-	/* Should make chip/platform specific call */
-	return 0;
-}
-
-static void mpsc_release_port(struct uart_port *port)
-{
-	struct mpsc_port_info *pi =
-		container_of(port, struct mpsc_port_info, port);
-
-	if (pi->ready) {
-		mpsc_uninit_rings(pi);
-		mpsc_free_ring_mem(pi);
-		pi->ready = 0;
-	}
-}
-
-static void mpsc_config_port(struct uart_port *port, int flags)
-{
-}
-
-static int mpsc_verify_port(struct uart_port *port, struct serial_struct *ser)
-{
-	struct mpsc_port_info *pi =
-		container_of(port, struct mpsc_port_info, port);
-	int rc = 0;
-
-	pr_debug("mpsc_verify_port[%d]: Verifying port data\n", pi->port.line);
-
-	if (ser->type != PORT_UNKNOWN && ser->type != PORT_MPSC)
-		rc = -EINVAL;
-	else if (pi->port.irq != ser->irq)
-		rc = -EINVAL;
-	else if (ser->io_type != SERIAL_IO_MEM)
-		rc = -EINVAL;
-	else if (pi->port.uartclk / 16 != ser->baud_base) /* Not sure */
-		rc = -EINVAL;
-	else if ((void *)pi->port.mapbase != ser->iomem_base)
-		rc = -EINVAL;
-	else if (pi->port.iobase != ser->port)
-		rc = -EINVAL;
-	else if (ser->hub6 != 0)
-		rc = -EINVAL;
-
-	return rc;
-}
-#ifdef CONFIG_CONSOLE_POLL
-/* Serial polling routines for writing and reading from the uart while
- * in an interrupt or debug context.
- */
-
-static char poll_buf[2048];
-static int poll_ptr;
-static int poll_cnt;
-static void mpsc_put_poll_char(struct uart_port *port,
-							   unsigned char c);
-
-static int mpsc_get_poll_char(struct uart_port *port)
-{
-	struct mpsc_port_info *pi =
-		container_of(port, struct mpsc_port_info, port);
-	struct mpsc_rx_desc *rxre;
-	u32	cmdstat, bytes_in, i;
-	u8	*bp;
-
-	if (!serial_polled)
-		serial_polled = 1;
-
-	pr_debug("mpsc_rx_intr[%d]: Handling Rx intr\n", pi->port.line);
-
-	if (poll_cnt) {
-		poll_cnt--;
-		return poll_buf[poll_ptr++];
-	}
-	poll_ptr = 0;
-	poll_cnt = 0;
-
-	while (poll_cnt == 0) {
-		rxre = (struct mpsc_rx_desc *)(pi->rxr +
-		       (pi->rxr_posn*MPSC_RXRE_SIZE));
-		dma_cache_sync(pi->port.dev, (void *)rxre,
-			       MPSC_RXRE_SIZE, DMA_FROM_DEVICE);
-#if defined(CONFIG_PPC32) && !defined(CONFIG_NOT_COHERENT_CACHE)
-		if (pi->cache_mgmt) /* GT642[46]0 Res #COMM-2 */
-			invalidate_dcache_range((ulong)rxre,
-			(ulong)rxre + MPSC_RXRE_SIZE);
-#endif
-		/*
-		 * Loop through Rx descriptors handling ones that have
-		 * been completed.
-		 */
-		while (poll_cnt == 0 &&
-		       !((cmdstat = be32_to_cpu(rxre->cmdstat)) &
-			 SDMA_DESC_CMDSTAT_O)){
-			bytes_in = be16_to_cpu(rxre->bytecnt);
-			bp = pi->rxb + (pi->rxr_posn * MPSC_RXBE_SIZE);
-			dma_cache_sync(pi->port.dev, (void *) bp,
-				       MPSC_RXBE_SIZE, DMA_FROM_DEVICE);
-#if defined(CONFIG_PPC32) && !defined(CONFIG_NOT_COHERENT_CACHE)
-			if (pi->cache_mgmt) /* GT642[46]0 Res #COMM-2 */
-				invalidate_dcache_range((ulong)bp,
-					(ulong)bp + MPSC_RXBE_SIZE);
-#endif
-			if ((unlikely(cmdstat & (SDMA_DESC_CMDSTAT_BR |
-			 SDMA_DESC_CMDSTAT_FR | SDMA_DESC_CMDSTAT_OR))) &&
-				!(cmdstat & pi->port.ignore_status_mask)) {
-				poll_buf[poll_cnt] = *bp;
-				poll_cnt++;
-			} else {
-				for (i = 0; i < bytes_in; i++) {
-					poll_buf[poll_cnt] = *bp++;
-					poll_cnt++;
-				}
-				pi->port.icount.rx += bytes_in;
-			}
-			rxre->bytecnt = cpu_to_be16(0);
-			wmb();
-			rxre->cmdstat = cpu_to_be32(SDMA_DESC_CMDSTAT_O |
-						    SDMA_DESC_CMDSTAT_EI |
-						    SDMA_DESC_CMDSTAT_F |
-						    SDMA_DESC_CMDSTAT_L);
-			wmb();
-			dma_cache_sync(pi->port.dev, (void *)rxre,
-				       MPSC_RXRE_SIZE, DMA_BIDIRECTIONAL);
-#if defined(CONFIG_PPC32) && !defined(CONFIG_NOT_COHERENT_CACHE)
-			if (pi->cache_mgmt) /* GT642[46]0 Res #COMM-2 */
-				flush_dcache_range((ulong)rxre,
-					   (ulong)rxre + MPSC_RXRE_SIZE);
-#endif
-
-			/* Advance to next descriptor */
-			pi->rxr_posn = (pi->rxr_posn + 1) &
-				(MPSC_RXR_ENTRIES - 1);
-			rxre = (struct mpsc_rx_desc *)(pi->rxr +
-				       (pi->rxr_posn * MPSC_RXRE_SIZE));
-			dma_cache_sync(pi->port.dev, (void *)rxre,
-				       MPSC_RXRE_SIZE, DMA_FROM_DEVICE);
-#if defined(CONFIG_PPC32) && !defined(CONFIG_NOT_COHERENT_CACHE)
-			if (pi->cache_mgmt) /* GT642[46]0 Res #COMM-2 */
-				invalidate_dcache_range((ulong)rxre,
-						(ulong)rxre + MPSC_RXRE_SIZE);
-#endif
-		}
-
-		/* Restart rx engine, if its stopped */
-		if ((readl(pi->sdma_base + SDMA_SDCM) & SDMA_SDCM_ERD) == 0)
-			mpsc_start_rx(pi);
-	}
-	if (poll_cnt) {
-		poll_cnt--;
-		return poll_buf[poll_ptr++];
-	}
-
-	return 0;
-}
-
-
-static void mpsc_put_poll_char(struct uart_port *port,
-			 unsigned char c)
-{
-	struct mpsc_port_info *pi =
-		container_of(port, struct mpsc_port_info, port);
-	u32 data;
-
-	data = readl(pi->mpsc_base + MPSC_MPCR);
-	writeb(c, pi->mpsc_base + MPSC_CHR_1);
-	mb();
-	data = readl(pi->mpsc_base + MPSC_CHR_2);
-	data |= MPSC_CHR_2_TTCS;
-	writel(data, pi->mpsc_base + MPSC_CHR_2);
-	mb();
-
-	while (readl(pi->mpsc_base + MPSC_CHR_2) & MPSC_CHR_2_TTCS);
-}
-#endif
-
-static const struct uart_ops mpsc_pops = {
-	.tx_empty	= mpsc_tx_empty,
-	.set_mctrl	= mpsc_set_mctrl,
-	.get_mctrl	= mpsc_get_mctrl,
-	.stop_tx	= mpsc_stop_tx,
-	.start_tx	= mpsc_start_tx,
-	.stop_rx	= mpsc_stop_rx,
-	.break_ctl	= mpsc_break_ctl,
-	.startup	= mpsc_startup,
-	.shutdown	= mpsc_shutdown,
-	.set_termios	= mpsc_set_termios,
-	.type		= mpsc_type,
-	.release_port	= mpsc_release_port,
-	.request_port	= mpsc_request_port,
-	.config_port	= mpsc_config_port,
-	.verify_port	= mpsc_verify_port,
-#ifdef CONFIG_CONSOLE_POLL
-	.poll_get_char = mpsc_get_poll_char,
-	.poll_put_char = mpsc_put_poll_char,
-#endif
-};
-
-/*
- ******************************************************************************
- *
- * Console Interface Routines
- *
- ******************************************************************************
- */
-
-#ifdef CONFIG_SERIAL_MPSC_CONSOLE
-static void mpsc_console_write(struct console *co, const char *s, uint count)
-{
-	struct mpsc_port_info *pi = &mpsc_ports[co->index];
-	u8 *bp, *dp, add_cr = 0;
-	int i;
-	unsigned long iflags;
-
-	spin_lock_irqsave(&pi->tx_lock, iflags);
-
-	while (pi->txr_head != pi->txr_tail) {
-		while (mpsc_sdma_tx_active(pi))
-			udelay(100);
-		mpsc_sdma_intr_ack(pi);
-		mpsc_tx_intr(pi);
-	}
-
-	while (mpsc_sdma_tx_active(pi))
-		udelay(100);
-
-	while (count > 0) {
-		bp = dp = pi->txb + (pi->txr_head * MPSC_TXBE_SIZE);
-
-		for (i = 0; i < MPSC_TXBE_SIZE; i++) {
-			if (count == 0)
-				break;
-
-			if (add_cr) {
-				*(dp++) = '\r';
-				add_cr = 0;
-			} else {
-				*(dp++) = *s;
-
-				if (*(s++) == '\n') { /* add '\r' after '\n' */
-					add_cr = 1;
-					count++;
-				}
-			}
-
-			count--;
-		}
-
-		dma_cache_sync(pi->port.dev, (void *)bp, MPSC_TXBE_SIZE,
-				DMA_BIDIRECTIONAL);
-#if defined(CONFIG_PPC32) && !defined(CONFIG_NOT_COHERENT_CACHE)
-		if (pi->cache_mgmt) /* GT642[46]0 Res #COMM-2 */
-			flush_dcache_range((ulong)bp,
-					(ulong)bp + MPSC_TXBE_SIZE);
-#endif
-		mpsc_setup_tx_desc(pi, i, 0);
-		pi->txr_head = (pi->txr_head + 1) & (MPSC_TXR_ENTRIES - 1);
-		mpsc_sdma_start_tx(pi);
-
-		while (mpsc_sdma_tx_active(pi))
-			udelay(100);
-
-		pi->txr_tail = (pi->txr_tail + 1) & (MPSC_TXR_ENTRIES - 1);
-	}
-
-	spin_unlock_irqrestore(&pi->tx_lock, iflags);
-}
-
-static int __init mpsc_console_setup(struct console *co, char *options)
-{
-	struct mpsc_port_info *pi;
-	int baud, bits, parity, flow;
-
-	pr_debug("mpsc_console_setup[%d]: options: %s\n", co->index, options);
-
-	if (co->index >= MPSC_NUM_CTLRS)
-		co->index = 0;
-
-	pi = &mpsc_ports[co->index];
-
-	baud = pi->default_baud;
-	bits = pi->default_bits;
-	parity = pi->default_parity;
-	flow = pi->default_flow;
-
-	if (!pi->port.ops)
-		return -ENODEV;
-
-	spin_lock_init(&pi->port.lock);	/* Temporary fix--copied from 8250.c */
-
-	if (options)
-		uart_parse_options(options, &baud, &parity, &bits, &flow);
-
-	return uart_set_options(&pi->port, co, baud, parity, bits, flow);
-}
-
-static struct console mpsc_console = {
-	.name	= MPSC_DEV_NAME,
-	.write	= mpsc_console_write,
-	.device	= uart_console_device,
-	.setup	= mpsc_console_setup,
-	.flags	= CON_PRINTBUFFER,
-	.index	= -1,
-	.data	= &mpsc_reg,
-};
-
-static int __init mpsc_late_console_init(void)
-{
-	pr_debug("mpsc_late_console_init: Enter\n");
-
-	if (!(mpsc_console.flags & CON_ENABLED))
-		register_console(&mpsc_console);
-	return 0;
-}
-
-late_initcall(mpsc_late_console_init);
-
-#define MPSC_CONSOLE	&mpsc_console
-#else
-#define MPSC_CONSOLE	NULL
-#endif
-/*
- ******************************************************************************
- *
- * Dummy Platform Driver to extract & map shared register regions
- *
- ******************************************************************************
- */
-static void mpsc_resource_err(char *s)
-{
-	printk(KERN_WARNING "MPSC: Platform device resource error in %s\n", s);
-}
-
-static int mpsc_shared_map_regs(struct platform_device *pd)
-{
-	struct resource	*r;
-
-	if ((r = platform_get_resource(pd, IORESOURCE_MEM,
-					MPSC_ROUTING_BASE_ORDER))
-			&& request_mem_region(r->start,
-				MPSC_ROUTING_REG_BLOCK_SIZE,
-				"mpsc_routing_regs")) {
-		mpsc_shared_regs.mpsc_routing_base = ioremap(r->start,
-				MPSC_ROUTING_REG_BLOCK_SIZE);
-		mpsc_shared_regs.mpsc_routing_base_p = r->start;
-	} else {
-		mpsc_resource_err("MPSC routing base");
-		return -ENOMEM;
-	}
-
-	if ((r = platform_get_resource(pd, IORESOURCE_MEM,
-					MPSC_SDMA_INTR_BASE_ORDER))
-			&& request_mem_region(r->start,
-				MPSC_SDMA_INTR_REG_BLOCK_SIZE,
-				"sdma_intr_regs")) {
-		mpsc_shared_regs.sdma_intr_base = ioremap(r->start,
-			MPSC_SDMA_INTR_REG_BLOCK_SIZE);
-		mpsc_shared_regs.sdma_intr_base_p = r->start;
-	} else {
-		iounmap(mpsc_shared_regs.mpsc_routing_base);
-		release_mem_region(mpsc_shared_regs.mpsc_routing_base_p,
-				MPSC_ROUTING_REG_BLOCK_SIZE);
-		mpsc_resource_err("SDMA intr base");
-		return -ENOMEM;
-	}
-
-	return 0;
-}
-
-static void mpsc_shared_unmap_regs(void)
-{
-	if (mpsc_shared_regs.mpsc_routing_base) {
-		iounmap(mpsc_shared_regs.mpsc_routing_base);
-		release_mem_region(mpsc_shared_regs.mpsc_routing_base_p,
-				MPSC_ROUTING_REG_BLOCK_SIZE);
-	}
-	if (mpsc_shared_regs.sdma_intr_base) {
-		iounmap(mpsc_shared_regs.sdma_intr_base);
-		release_mem_region(mpsc_shared_regs.sdma_intr_base_p,
-				MPSC_SDMA_INTR_REG_BLOCK_SIZE);
-	}
-
-	mpsc_shared_regs.mpsc_routing_base = NULL;
-	mpsc_shared_regs.sdma_intr_base = NULL;
-
-	mpsc_shared_regs.mpsc_routing_base_p = 0;
-	mpsc_shared_regs.sdma_intr_base_p = 0;
-}
-
-static int mpsc_shared_drv_probe(struct platform_device *dev)
-{
-	struct mpsc_shared_pdata	*pdata;
-	int rc;
-
-	if (dev->id != 0)
-		return -ENODEV;
-
-	rc = mpsc_shared_map_regs(dev);
-	if (rc)
-		return rc;
-
-	pdata = dev_get_platdata(&dev->dev);
-
-	mpsc_shared_regs.MPSC_MRR_m = pdata->mrr_val;
-	mpsc_shared_regs.MPSC_RCRR_m= pdata->rcrr_val;
-	mpsc_shared_regs.MPSC_TCRR_m= pdata->tcrr_val;
-	mpsc_shared_regs.SDMA_INTR_CAUSE_m = pdata->intr_cause_val;
-	mpsc_shared_regs.SDMA_INTR_MASK_m = pdata->intr_mask_val;
-
-	return 0;
-}
-
-static int mpsc_shared_drv_remove(struct platform_device *dev)
-{
-	if (dev->id != 0)
-		return -ENODEV;
-
-	mpsc_shared_unmap_regs();
-	mpsc_shared_regs.MPSC_MRR_m = 0;
-	mpsc_shared_regs.MPSC_RCRR_m = 0;
-	mpsc_shared_regs.MPSC_TCRR_m = 0;
-	mpsc_shared_regs.SDMA_INTR_CAUSE_m = 0;
-	mpsc_shared_regs.SDMA_INTR_MASK_m = 0;
-
-	return 0;
-}
-
-static struct platform_driver mpsc_shared_driver = {
-	.probe	= mpsc_shared_drv_probe,
-	.remove	= mpsc_shared_drv_remove,
-	.driver	= {
-		.name	= MPSC_SHARED_NAME,
-	},
-};
-
-/*
- ******************************************************************************
- *
- * Driver Interface Routines
- *
- ******************************************************************************
- */
-static struct uart_driver mpsc_reg = {
-	.owner		= THIS_MODULE,
-	.driver_name	= MPSC_DRIVER_NAME,
-	.dev_name	= MPSC_DEV_NAME,
-	.major		= MPSC_MAJOR,
-	.minor		= MPSC_MINOR_START,
-	.nr		= MPSC_NUM_CTLRS,
-	.cons		= MPSC_CONSOLE,
-};
-
-static int mpsc_drv_map_regs(struct mpsc_port_info *pi,
-		struct platform_device *pd)
-{
-	struct resource	*r;
-
-	if ((r = platform_get_resource(pd, IORESOURCE_MEM, MPSC_BASE_ORDER))
-			&& request_mem_region(r->start, MPSC_REG_BLOCK_SIZE,
-			"mpsc_regs")) {
-		pi->mpsc_base = ioremap(r->start, MPSC_REG_BLOCK_SIZE);
-		pi->mpsc_base_p = r->start;
-	} else {
-		mpsc_resource_err("MPSC base");
-		goto err;
-	}
-
-	if ((r = platform_get_resource(pd, IORESOURCE_MEM,
-					MPSC_SDMA_BASE_ORDER))
-			&& request_mem_region(r->start,
-				MPSC_SDMA_REG_BLOCK_SIZE, "sdma_regs")) {
-		pi->sdma_base = ioremap(r->start,MPSC_SDMA_REG_BLOCK_SIZE);
-		pi->sdma_base_p = r->start;
-	} else {
-		mpsc_resource_err("SDMA base");
-		goto err;
-	}
-
-	if ((r = platform_get_resource(pd,IORESOURCE_MEM,MPSC_BRG_BASE_ORDER))
-			&& request_mem_region(r->start,
-				MPSC_BRG_REG_BLOCK_SIZE, "brg_regs")) {
-		pi->brg_base = ioremap(r->start, MPSC_BRG_REG_BLOCK_SIZE);
-		pi->brg_base_p = r->start;
-	} else {
-		mpsc_resource_err("BRG base");
-		goto err;
-	}
-	return 0;
-
-err:
-	if (pi->sdma_base) {
-		iounmap(pi->sdma_base);
-		pi->sdma_base = NULL;
-	}
-	if (pi->mpsc_base) {
-		iounmap(pi->mpsc_base);
-		pi->mpsc_base = NULL;
-	}
-	return -ENOMEM;
-}
-
-static void mpsc_drv_unmap_regs(struct mpsc_port_info *pi)
-{
-	if (pi->mpsc_base) {
-		iounmap(pi->mpsc_base);
-		release_mem_region(pi->mpsc_base_p, MPSC_REG_BLOCK_SIZE);
-	}
-	if (pi->sdma_base) {
-		iounmap(pi->sdma_base);
-		release_mem_region(pi->sdma_base_p, MPSC_SDMA_REG_BLOCK_SIZE);
-	}
-	if (pi->brg_base) {
-		iounmap(pi->brg_base);
-		release_mem_region(pi->brg_base_p, MPSC_BRG_REG_BLOCK_SIZE);
-	}
-
-	pi->mpsc_base = NULL;
-	pi->sdma_base = NULL;
-	pi->brg_base = NULL;
-
-	pi->mpsc_base_p = 0;
-	pi->sdma_base_p = 0;
-	pi->brg_base_p = 0;
-}
-
-static void mpsc_drv_get_platform_data(struct mpsc_port_info *pi,
-		struct platform_device *pd, int num)
-{
-	struct mpsc_pdata	*pdata;
-
-	pdata = dev_get_platdata(&pd->dev);
-
-	pi->port.uartclk = pdata->brg_clk_freq;
-	pi->port.iotype = UPIO_MEM;
-	pi->port.line = num;
-	pi->port.type = PORT_MPSC;
-	pi->port.fifosize = MPSC_TXBE_SIZE;
-	pi->port.membase = pi->mpsc_base;
-	pi->port.mapbase = (ulong)pi->mpsc_base;
-	pi->port.ops = &mpsc_pops;
-
-	pi->mirror_regs = pdata->mirror_regs;
-	pi->cache_mgmt = pdata->cache_mgmt;
-	pi->brg_can_tune = pdata->brg_can_tune;
-	pi->brg_clk_src = pdata->brg_clk_src;
-	pi->mpsc_max_idle = pdata->max_idle;
-	pi->default_baud = pdata->default_baud;
-	pi->default_bits = pdata->default_bits;
-	pi->default_parity = pdata->default_parity;
-	pi->default_flow = pdata->default_flow;
-
-	/* Initial values of mirrored regs */
-	pi->MPSC_CHR_1_m = pdata->chr_1_val;
-	pi->MPSC_CHR_2_m = pdata->chr_2_val;
-	pi->MPSC_CHR_10_m = pdata->chr_10_val;
-	pi->MPSC_MPCR_m = pdata->mpcr_val;
-	pi->BRG_BCR_m = pdata->bcr_val;
-
-	pi->shared_regs = &mpsc_shared_regs;
-
-	pi->port.irq = platform_get_irq(pd, 0);
-}
-
-static int mpsc_drv_probe(struct platform_device *dev)
-{
-	struct mpsc_port_info *pi;
-	int rc;
-
-	dev_dbg(&dev->dev, "mpsc_drv_probe: Adding MPSC %d\n", dev->id);
-
-	if (dev->id >= MPSC_NUM_CTLRS)
-		return -ENODEV;
-
-	pi = &mpsc_ports[dev->id];
-
-	rc = mpsc_drv_map_regs(pi, dev);
-	if (rc)
-		return rc;
-
-	mpsc_drv_get_platform_data(pi, dev, dev->id);
-	pi->port.dev = &dev->dev;
-
-	rc = mpsc_make_ready(pi);
-	if (rc)
-		goto err_unmap;
-
-	spin_lock_init(&pi->tx_lock);
-	rc = uart_add_one_port(&mpsc_reg, &pi->port);
-	if (rc)
-		goto err_relport;
-
-	return 0;
-err_relport:
-	mpsc_release_port(&pi->port);
-err_unmap:
-	mpsc_drv_unmap_regs(pi);
-	return rc;
-}
-
-static struct platform_driver mpsc_driver = {
-	.probe	= mpsc_drv_probe,
-	.driver	= {
-		.name			= MPSC_CTLR_NAME,
-		.suppress_bind_attrs	= true,
-	},
-};
-
-static int __init mpsc_drv_init(void)
-{
-	int	rc;
-
-	printk(KERN_INFO "Serial: MPSC driver\n");
-
-	memset(mpsc_ports, 0, sizeof(mpsc_ports));
-	memset(&mpsc_shared_regs, 0, sizeof(mpsc_shared_regs));
-
-	rc = uart_register_driver(&mpsc_reg);
-	if (rc)
-		return rc;
-
-	rc = platform_driver_register(&mpsc_shared_driver);
-	if (rc)
-		goto err_unreg_uart;
-
-	rc = platform_driver_register(&mpsc_driver);
-	if (rc)
-		goto err_unreg_plat;
-
-	return 0;
-err_unreg_plat:
-	platform_driver_unregister(&mpsc_shared_driver);
-err_unreg_uart:
-	uart_unregister_driver(&mpsc_reg);
-	return rc;
-}
-device_initcall(mpsc_drv_init);
-
-/*
-MODULE_AUTHOR("Mark A. Greer <mgreer@mvista.com>");
-MODULE_DESCRIPTION("Generic Marvell MPSC serial/UART driver");
-MODULE_LICENSE("GPL");
-*/
diff --git a/include/linux/mv643xx.h b/include/linux/mv643xx.h
index 4471cf96ef69..47e5679b48e1 100644
--- a/include/linux/mv643xx.h
+++ b/include/linux/mv643xx.h
@@ -918,52 +918,6 @@
 
 extern void mv64340_irq_init(unsigned int base);
 
-/* MPSC Platform Device, Driver Data (Shared register regions) */
-#define	MPSC_SHARED_NAME		"mpsc_shared"
-
-#define	MPSC_ROUTING_BASE_ORDER		0
-#define	MPSC_SDMA_INTR_BASE_ORDER	1
-
-#define MPSC_ROUTING_REG_BLOCK_SIZE	0x000c
-#define MPSC_SDMA_INTR_REG_BLOCK_SIZE	0x0084
-
-struct mpsc_shared_pdata {
-	u32	mrr_val;
-	u32	rcrr_val;
-	u32	tcrr_val;
-	u32	intr_cause_val;
-	u32	intr_mask_val;
-};
-
-/* MPSC Platform Device, Driver Data */
-#define	MPSC_CTLR_NAME			"mpsc"
-
-#define	MPSC_BASE_ORDER			0
-#define	MPSC_SDMA_BASE_ORDER		1
-#define	MPSC_BRG_BASE_ORDER		2
-
-#define MPSC_REG_BLOCK_SIZE		0x0038
-#define MPSC_SDMA_REG_BLOCK_SIZE	0x0c18
-#define MPSC_BRG_REG_BLOCK_SIZE		0x0008
-
-struct mpsc_pdata {
-	u8	mirror_regs;
-	u8	cache_mgmt;
-	u8	max_idle;
-	int	default_baud;
-	int	default_bits;
-	int	default_parity;
-	int	default_flow;
-	u32	chr_1_val;
-	u32	chr_2_val;
-	u32	chr_10_val;
-	u32	mpcr_val;
-	u32	bcr_val;
-	u8	brg_can_tune;
-	u8	brg_clk_src;
-	u32	brg_clk_freq;
-};
-
 /* Watchdog Platform Device, Driver Data */
 #define	MV64x60_WDT_NAME			"mv64x60_wdt"
 
diff --git a/include/uapi/linux/serial_core.h b/include/uapi/linux/serial_core.h
index 67c4aaaa2308..5642c05e0da0 100644
--- a/include/uapi/linux/serial_core.h
+++ b/include/uapi/linux/serial_core.h
@@ -129,7 +129,7 @@
 /* Motorola i.MX SoC */
 #define PORT_IMX	62
 
-/* Marvell MPSC */
+/* Marvell MPSC (obsolete unused) */
 #define PORT_MPSC	63
 
 /* TXX9 type number */
-- 
cgit v1.2.3


From b9a7ba5562074855e8a3f92ea7e1174b61a3e87d Mon Sep 17 00:00:00 2001
From: Yishai Hadas <yishaih@mellanox.com>
Date: Sun, 30 Jun 2019 19:23:23 +0300
Subject: net/mlx5: Use event mask based on device capabilities

Use the reported device capabilities for the supported user events (i.e.
affiliated and un-affiliated) to set the EQ mask.

As the event mask can be up to 256 bits wide, defined by 4 entries of u64,
change the applicable code to work accordingly.

Signed-off-by: Yishai Hadas <yishaih@mellanox.com>
Acked-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
---
 drivers/infiniband/hw/mlx5/odp.c             |  2 +-
 drivers/net/ethernet/mellanox/mlx5/core/eq.c | 40 ++++++++++++++++++++++------
 drivers/net/ethernet/mellanox/mlx5/core/fw.c |  6 +++++
 include/linux/mlx5/device.h                  |  6 ++++-
 include/linux/mlx5/eq.h                      |  2 +-
 include/linux/mlx5/mlx5_ifc.h                | 13 ++++++---
 6 files changed, 55 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c
index c594489eb2d7..831c450b271a 100644
--- a/drivers/infiniband/hw/mlx5/odp.c
+++ b/drivers/infiniband/hw/mlx5/odp.c
@@ -1558,9 +1558,9 @@ mlx5_ib_create_pf_eq(struct mlx5_ib_dev *dev, struct mlx5_ib_pf_eq *eq)
 	eq->irq_nb.notifier_call = mlx5_ib_eq_pf_int;
 	param = (struct mlx5_eq_param) {
 		.irq_index = 0,
-		.mask = 1 << MLX5_EVENT_TYPE_PAGE_FAULT,
 		.nent = MLX5_IB_NUM_PF_EQE,
 	};
+	param.mask[0] = 1ull << MLX5_EVENT_TYPE_PAGE_FAULT;
 	eq->core = mlx5_eq_create_generic(dev->mdev, &param);
 	if (IS_ERR(eq->core)) {
 		err = PTR_ERR(eq->core);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eq.c b/drivers/net/ethernet/mellanox/mlx5/core/eq.c
index 8000d2a4a7e2..33f78d4d3724 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eq.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eq.c
@@ -256,6 +256,7 @@ create_map_eq(struct mlx5_core_dev *dev, struct mlx5_eq *eq,
 	int inlen;
 	u32 *in;
 	int err;
+	int i;
 
 	/* Init CQ table */
 	memset(cq_table, 0, sizeof(*cq_table));
@@ -283,10 +284,12 @@ create_map_eq(struct mlx5_core_dev *dev, struct mlx5_eq *eq,
 	mlx5_fill_page_array(&eq->buf, pas);
 
 	MLX5_SET(create_eq_in, in, opcode, MLX5_CMD_OP_CREATE_EQ);
-	if (!param->mask && MLX5_CAP_GEN(dev, log_max_uctx))
+	if (!param->mask[0] && MLX5_CAP_GEN(dev, log_max_uctx))
 		MLX5_SET(create_eq_in, in, uid, MLX5_SHARED_RESOURCE_UID);
 
-	MLX5_SET64(create_eq_in, in, event_bitmask, param->mask);
+	for (i = 0; i < 4; i++)
+		MLX5_ARRAY_SET64(create_eq_in, in, event_bitmask, i,
+				 param->mask[i]);
 
 	eqc = MLX5_ADDR_OF(create_eq_in, in, eq_context_entry);
 	MLX5_SET(eqc, eqc, log_eq_size, ilog2(eq->nent));
@@ -507,7 +510,23 @@ static int cq_err_event_notifier(struct notifier_block *nb,
 	return NOTIFY_OK;
 }
 
-static u64 gather_async_events_mask(struct mlx5_core_dev *dev)
+static void gather_user_async_events(struct mlx5_core_dev *dev, u64 mask[4])
+{
+	__be64 *user_unaffiliated_events;
+	__be64 *user_affiliated_events;
+	int i;
+
+	user_affiliated_events =
+		MLX5_CAP_DEV_EVENT(dev, user_affiliated_events);
+	user_unaffiliated_events =
+		MLX5_CAP_DEV_EVENT(dev, user_unaffiliated_events);
+
+	for (i = 0; i < 4; i++)
+		mask[i] |= be64_to_cpu(user_affiliated_events[i] |
+				       user_unaffiliated_events[i]);
+}
+
+static void gather_async_events_mask(struct mlx5_core_dev *dev, u64 mask[4])
 {
 	u64 async_event_mask = MLX5_ASYNC_EVENT_MASK;
 
@@ -544,7 +563,10 @@ static u64 gather_async_events_mask(struct mlx5_core_dev *dev)
 		async_event_mask |=
 			(1ull << MLX5_EVENT_TYPE_ESW_FUNCTIONS_CHANGED);
 
-	return async_event_mask;
+	mask[0] = async_event_mask;
+
+	if (MLX5_CAP_GEN(dev, event_cap))
+		gather_user_async_events(dev, mask);
 }
 
 static int create_async_eqs(struct mlx5_core_dev *dev)
@@ -559,9 +581,10 @@ static int create_async_eqs(struct mlx5_core_dev *dev)
 	table->cmd_eq.irq_nb.notifier_call = mlx5_eq_async_int;
 	param = (struct mlx5_eq_param) {
 		.irq_index = 0,
-		.mask = 1ull << MLX5_EVENT_TYPE_CMD,
 		.nent = MLX5_NUM_CMD_EQE,
 	};
+
+	param.mask[0] = 1ull << MLX5_EVENT_TYPE_CMD;
 	err = create_async_eq(dev, &table->cmd_eq.core, &param);
 	if (err) {
 		mlx5_core_warn(dev, "failed to create cmd EQ %d\n", err);
@@ -577,9 +600,10 @@ static int create_async_eqs(struct mlx5_core_dev *dev)
 	table->async_eq.irq_nb.notifier_call = mlx5_eq_async_int;
 	param = (struct mlx5_eq_param) {
 		.irq_index = 0,
-		.mask = gather_async_events_mask(dev),
 		.nent = MLX5_NUM_ASYNC_EQE,
 	};
+
+	gather_async_events_mask(dev, param.mask);
 	err = create_async_eq(dev, &table->async_eq.core, &param);
 	if (err) {
 		mlx5_core_warn(dev, "failed to create async EQ %d\n", err);
@@ -595,9 +619,10 @@ static int create_async_eqs(struct mlx5_core_dev *dev)
 	table->pages_eq.irq_nb.notifier_call = mlx5_eq_async_int;
 	param = (struct mlx5_eq_param) {
 		.irq_index = 0,
-		.mask =  1 << MLX5_EVENT_TYPE_PAGE_REQUEST,
 		.nent = /* TODO: sriov max_vf + */ 1,
 	};
+
+	param.mask[0] = 1ull << MLX5_EVENT_TYPE_PAGE_REQUEST;
 	err = create_async_eq(dev, &table->pages_eq.core, &param);
 	if (err) {
 		mlx5_core_warn(dev, "failed to create pages EQ %d\n", err);
@@ -789,7 +814,6 @@ static int create_comp_eqs(struct mlx5_core_dev *dev)
 		eq->irq_nb.notifier_call = mlx5_eq_comp_int;
 		param = (struct mlx5_eq_param) {
 			.irq_index = vecidx,
-			.mask = 0,
 			.nent = nent,
 		};
 		err = create_map_eq(dev, &eq->core, &param);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fw.c b/drivers/net/ethernet/mellanox/mlx5/core/fw.c
index 1ab6f7e3bec6..05367f15c3a7 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fw.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fw.c
@@ -202,6 +202,12 @@ int mlx5_query_hca_caps(struct mlx5_core_dev *dev)
 			return err;
 	}
 
+	if (MLX5_CAP_GEN(dev, event_cap)) {
+		err = mlx5_core_get_caps(dev, MLX5_CAP_DEV_EVENT);
+		if (err)
+			return err;
+	}
+
 	return 0;
 }
 
diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h
index 5e760067ac41..0d1abe097627 100644
--- a/include/linux/mlx5/device.h
+++ b/include/linux/mlx5/device.h
@@ -351,7 +351,7 @@ enum mlx5_event {
 
 	MLX5_EVENT_TYPE_DEVICE_TRACER      = 0x26,
 
-	MLX5_EVENT_TYPE_MAX                = MLX5_EVENT_TYPE_DEVICE_TRACER + 1,
+	MLX5_EVENT_TYPE_MAX                = 0x100,
 };
 
 enum {
@@ -1077,6 +1077,7 @@ enum mlx5_cap_type {
 	MLX5_CAP_DEBUG,
 	MLX5_CAP_RESERVED_14,
 	MLX5_CAP_DEV_MEM,
+	MLX5_CAP_DEV_EVENT = 0x14,
 	/* NUM OF CAP Types */
 	MLX5_CAP_NUM
 };
@@ -1255,6 +1256,9 @@ enum mlx5_qcam_feature_groups {
 #define MLX5_CAP64_DEV_MEM(mdev, cap)\
 	MLX5_GET64(device_mem_cap, mdev->caps.hca_cur[MLX5_CAP_DEV_MEM], cap)
 
+#define MLX5_CAP_DEV_EVENT(mdev, cap)\
+	MLX5_ADDR_OF(device_event_cap, (mdev)->caps.hca_cur[MLX5_CAP_DEV_EVENT], cap)
+
 enum {
 	MLX5_CMD_STAT_OK			= 0x0,
 	MLX5_CMD_STAT_INT_ERR			= 0x1,
diff --git a/include/linux/mlx5/eq.h b/include/linux/mlx5/eq.h
index 70e16dcfb4c4..e49d8c0d4f26 100644
--- a/include/linux/mlx5/eq.h
+++ b/include/linux/mlx5/eq.h
@@ -15,7 +15,7 @@ struct mlx5_core_dev;
 struct mlx5_eq_param {
 	u8             irq_index;
 	int            nent;
-	u64            mask;
+	u64            mask[4];
 };
 
 struct mlx5_eq *
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index 031db53e94ce..4148c47a65ed 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -860,6 +860,12 @@ struct mlx5_ifc_device_mem_cap_bits {
 	u8         reserved_at_180[0x680];
 };
 
+struct mlx5_ifc_device_event_cap_bits {
+	u8         user_affiliated_events[4][0x40];
+
+	u8         user_unaffiliated_events[4][0x40];
+};
+
 enum {
 	MLX5_ATOMIC_CAPS_ATOMIC_SIZE_QP_1_BYTE     = 0x0,
 	MLX5_ATOMIC_CAPS_ATOMIC_SIZE_QP_2_BYTES    = 0x2,
@@ -1017,7 +1023,8 @@ struct mlx5_ifc_cmd_hca_cap_bits {
 
 	u8         log_max_srq_sz[0x8];
 	u8         log_max_qp_sz[0x8];
-	u8         reserved_at_90[0x8];
+	u8         event_cap[0x1];
+	u8         reserved_at_91[0x7];
 	u8         prio_tag_required[0x1];
 	u8         reserved_at_99[0x2];
 	u8         log_max_qp[0x5];
@@ -7422,9 +7429,9 @@ struct mlx5_ifc_create_eq_in_bits {
 
 	u8         reserved_at_280[0x40];
 
-	u8         event_bitmask[0x40];
+	u8         event_bitmask[4][0x40];
 
-	u8         reserved_at_300[0x580];
+	u8         reserved_at_3c0[0x4c0];
 
 	u8         pas[0][0x40];
 };
-- 
cgit v1.2.3


From c0670781f54839fb9d0b2c0eaee58862601981bf Mon Sep 17 00:00:00 2001
From: Yishai Hadas <yishaih@mellanox.com>
Date: Sun, 30 Jun 2019 19:23:24 +0300
Subject: net/mlx5: Expose the API to register for ANY event

Expose the API to register for ANY event so that mlx5_ib can use this
functionality for its needs.

Signed-off-by: Yishai Hadas <yishaih@mellanox.com>
Acked-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/eq.c     | 2 ++
 drivers/net/ethernet/mellanox/mlx5/core/lib/eq.h | 3 ---
 include/linux/mlx5/driver.h                      | 2 ++
 3 files changed, 4 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eq.c b/drivers/net/ethernet/mellanox/mlx5/core/eq.c
index 33f78d4d3724..c634a78d5cdd 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eq.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eq.c
@@ -949,6 +949,7 @@ int mlx5_eq_notifier_register(struct mlx5_core_dev *dev, struct mlx5_nb *nb)
 
 	return atomic_notifier_chain_register(&eqt->nh[nb->event_type], &nb->nb);
 }
+EXPORT_SYMBOL(mlx5_eq_notifier_register);
 
 int mlx5_eq_notifier_unregister(struct mlx5_core_dev *dev, struct mlx5_nb *nb)
 {
@@ -959,3 +960,4 @@ int mlx5_eq_notifier_unregister(struct mlx5_core_dev *dev, struct mlx5_nb *nb)
 
 	return atomic_notifier_chain_unregister(&eqt->nh[nb->event_type], &nb->nb);
 }
+EXPORT_SYMBOL(mlx5_eq_notifier_unregister);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/eq.h b/drivers/net/ethernet/mellanox/mlx5/core/lib/eq.h
index d826e63d5a17..3dfab91ae5f2 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/lib/eq.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/eq.h
@@ -97,7 +97,4 @@ void mlx5_core_eq_free_irqs(struct mlx5_core_dev *dev);
 struct cpu_rmap *mlx5_eq_table_get_rmap(struct mlx5_core_dev *dev);
 #endif
 
-int mlx5_eq_notifier_register(struct mlx5_core_dev *dev, struct mlx5_nb *nb);
-int mlx5_eq_notifier_unregister(struct mlx5_core_dev *dev, struct mlx5_nb *nb);
-
 #endif
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 7658a4908431..24b02ab206c3 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -1043,6 +1043,8 @@ int mlx5_register_interface(struct mlx5_interface *intf);
 void mlx5_unregister_interface(struct mlx5_interface *intf);
 int mlx5_notifier_register(struct mlx5_core_dev *dev, struct notifier_block *nb);
 int mlx5_notifier_unregister(struct mlx5_core_dev *dev, struct notifier_block *nb);
+int mlx5_eq_notifier_register(struct mlx5_core_dev *dev, struct mlx5_nb *nb);
+int mlx5_eq_notifier_unregister(struct mlx5_core_dev *dev, struct mlx5_nb *nb);
 
 int mlx5_core_query_vendor_id(struct mlx5_core_dev *mdev, u32 *vendor_id);
 
-- 
cgit v1.2.3


From 38164b771947be9baf06e78ffdfb650f8f3e908e Mon Sep 17 00:00:00 2001
From: Yishai Hadas <yishaih@mellanox.com>
Date: Sun, 30 Jun 2019 19:23:25 +0300
Subject: net/mlx5: mlx5_core_create_cq() enhancements

Enhance mlx5_core_create_cq() to take the command output buffer from its
callers so that they can use the output.

Signed-off-by: Yishai Hadas <yishaih@mellanox.com>
Acked-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
---
 drivers/infiniband/hw/mlx5/cq.c                     | 3 ++-
 drivers/net/ethernet/mellanox/mlx5/core/cq.c        | 7 +++----
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c   | 3 ++-
 drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.c | 3 ++-
 include/linux/mlx5/cq.h                             | 2 +-
 5 files changed, 10 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/infiniband/hw/mlx5/cq.c b/drivers/infiniband/hw/mlx5/cq.c
index 0220736b073e..d323b822b694 100644
--- a/drivers/infiniband/hw/mlx5/cq.c
+++ b/drivers/infiniband/hw/mlx5/cq.c
@@ -891,6 +891,7 @@ struct ib_cq *mlx5_ib_create_cq(struct ib_device *ibdev,
 	int entries = attr->cqe;
 	int vector = attr->comp_vector;
 	struct mlx5_ib_dev *dev = to_mdev(ibdev);
+	u32 out[MLX5_ST_SZ_DW(create_cq_out)];
 	struct mlx5_ib_cq *cq;
 	int uninitialized_var(index);
 	int uninitialized_var(inlen);
@@ -958,7 +959,7 @@ struct ib_cq *mlx5_ib_create_cq(struct ib_device *ibdev,
 	if (cq->create_flags & IB_UVERBS_CQ_FLAGS_IGNORE_OVERRUN)
 		MLX5_SET(cqc, cqc, oi, 1);
 
-	err = mlx5_core_create_cq(dev->mdev, &cq->mcq, cqb, inlen);
+	err = mlx5_core_create_cq(dev->mdev, &cq->mcq, cqb, inlen, out, sizeof(out));
 	if (err)
 		goto err_cqb;
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/cq.c b/drivers/net/ethernet/mellanox/mlx5/core/cq.c
index 703d88332bc6..1bd4336392a2 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/cq.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/cq.c
@@ -87,11 +87,10 @@ static void mlx5_add_cq_to_tasklet(struct mlx5_core_cq *cq)
 }
 
 int mlx5_core_create_cq(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq,
-			u32 *in, int inlen)
+			u32 *in, int inlen, u32 *out, int outlen)
 {
 	int eqn = MLX5_GET(cqc, MLX5_ADDR_OF(create_cq_in, in, cq_context), c_eqn);
 	u32 dout[MLX5_ST_SZ_DW(destroy_cq_out)];
-	u32 out[MLX5_ST_SZ_DW(create_cq_out)];
 	u32 din[MLX5_ST_SZ_DW(destroy_cq_in)];
 	struct mlx5_eq_comp *eq;
 	int err;
@@ -100,9 +99,9 @@ int mlx5_core_create_cq(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq,
 	if (IS_ERR(eq))
 		return PTR_ERR(eq);
 
-	memset(out, 0, sizeof(out));
+	memset(out, 0, outlen);
 	MLX5_SET(create_cq_in, in, opcode, MLX5_CMD_OP_CREATE_CQ);
-	err = mlx5_cmd_exec(dev, in, inlen, out, sizeof(out));
+	err = mlx5_cmd_exec(dev, in, inlen, out, outlen);
 	if (err)
 		return err;
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index f83fdb67e760..9ae55e93286d 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -1517,6 +1517,7 @@ static void mlx5e_free_cq(struct mlx5e_cq *cq)
 
 static int mlx5e_create_cq(struct mlx5e_cq *cq, struct mlx5e_cq_param *param)
 {
+	u32 out[MLX5_ST_SZ_DW(create_cq_out)];
 	struct mlx5_core_dev *mdev = cq->mdev;
 	struct mlx5_core_cq *mcq = &cq->mcq;
 
@@ -1551,7 +1552,7 @@ static int mlx5e_create_cq(struct mlx5e_cq *cq, struct mlx5e_cq_param *param)
 					    MLX5_ADAPTER_PAGE_SHIFT);
 	MLX5_SET64(cqc, cqc, dbr_addr,      cq->wq_ctrl.db.dma);
 
-	err = mlx5_core_create_cq(mdev, mcq, in, inlen);
+	err = mlx5_core_create_cq(mdev, mcq, in, inlen, out, sizeof(out));
 
 	kvfree(in);
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.c b/drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.c
index d61d536f4e17..1fa922698a88 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.c
@@ -429,6 +429,7 @@ static int mlx5_fpga_conn_create_cq(struct mlx5_fpga_conn *conn, int cq_size)
 	struct mlx5_fpga_device *fdev = conn->fdev;
 	struct mlx5_core_dev *mdev = fdev->mdev;
 	u32 temp_cqc[MLX5_ST_SZ_DW(cqc)] = {0};
+	u32 out[MLX5_ST_SZ_DW(create_cq_out)];
 	struct mlx5_wq_param wqp;
 	struct mlx5_cqe64 *cqe;
 	int inlen, err, eqn;
@@ -476,7 +477,7 @@ static int mlx5_fpga_conn_create_cq(struct mlx5_fpga_conn *conn, int cq_size)
 	pas = (__be64 *)MLX5_ADDR_OF(create_cq_in, in, pas);
 	mlx5_fill_page_frag_array(&conn->cq.wq_ctrl.buf, pas);
 
-	err = mlx5_core_create_cq(mdev, &conn->cq.mcq, in, inlen);
+	err = mlx5_core_create_cq(mdev, &conn->cq.mcq, in, inlen, out, sizeof(out));
 	kvfree(in);
 
 	if (err)
diff --git a/include/linux/mlx5/cq.h b/include/linux/mlx5/cq.h
index 769326ea1d9b..e44157a2b7db 100644
--- a/include/linux/mlx5/cq.h
+++ b/include/linux/mlx5/cq.h
@@ -185,7 +185,7 @@ static inline void mlx5_cq_put(struct mlx5_core_cq *cq)
 }
 
 int mlx5_core_create_cq(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq,
-			u32 *in, int inlen);
+			u32 *in, int inlen, u32 *out, int outlen);
 int mlx5_core_destroy_cq(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq);
 int mlx5_core_query_cq(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq,
 		       u32 *out, int outlen);
-- 
cgit v1.2.3


From 4e0e2ea1886afe8c001971ff767f6670312a9b04 Mon Sep 17 00:00:00 2001
From: Yishai Hadas <yishaih@mellanox.com>
Date: Sun, 30 Jun 2019 19:23:27 +0300
Subject: net/mlx5: Report EQE data upon CQ completion

Report EQE data upon CQ completion to let upper layers use this data.

Signed-off-by: Yishai Hadas <yishaih@mellanox.com>
Acked-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
---
 drivers/infiniband/hw/mlx5/cq.c                     | 2 +-
 drivers/infiniband/hw/mlx5/main.c                   | 2 +-
 drivers/infiniband/hw/mlx5/qp.c                     | 2 +-
 drivers/net/ethernet/mellanox/mlx5/core/cq.c        | 5 +++--
 drivers/net/ethernet/mellanox/mlx5/core/en.h        | 2 +-
 drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c   | 2 +-
 drivers/net/ethernet/mellanox/mlx5/core/eq.c        | 2 +-
 drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.c | 3 ++-
 include/linux/mlx5/cq.h                             | 4 ++--
 9 files changed, 13 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/infiniband/hw/mlx5/cq.c b/drivers/infiniband/hw/mlx5/cq.c
index d323b822b694..4efbbd2fce0c 100644
--- a/drivers/infiniband/hw/mlx5/cq.c
+++ b/drivers/infiniband/hw/mlx5/cq.c
@@ -37,7 +37,7 @@
 #include "mlx5_ib.h"
 #include "srq.h"
 
-static void mlx5_ib_cq_comp(struct mlx5_core_cq *cq)
+static void mlx5_ib_cq_comp(struct mlx5_core_cq *cq, struct mlx5_eqe *eqe)
 {
 	struct ib_cq *ibcq = &to_mibcq(cq)->ibcq;
 
diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c
index 798aa5e0941e..26b1ce2359ba 100644
--- a/drivers/infiniband/hw/mlx5/main.c
+++ b/drivers/infiniband/hw/mlx5/main.c
@@ -4492,7 +4492,7 @@ static void mlx5_ib_handle_internal_error(struct mlx5_ib_dev *ibdev)
 	 * lock/unlock above locks Now need to arm all involved CQs.
 	 */
 	list_for_each_entry(mcq, &cq_armed_list, reset_notify) {
-		mcq->comp(mcq);
+		mcq->comp(mcq, NULL);
 	}
 	spin_unlock_irqrestore(&ibdev->reset_flow_resource_lock, flags);
 }
diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c
index f6623c77443a..768c7e81f688 100644
--- a/drivers/infiniband/hw/mlx5/qp.c
+++ b/drivers/infiniband/hw/mlx5/qp.c
@@ -6297,7 +6297,7 @@ static void handle_drain_completion(struct ib_cq *cq,
 		/* Run the CQ handler - this makes sure that the drain WR will
 		 * be processed if wasn't processed yet.
 		 */
-		mcq->mcq.comp(&mcq->mcq);
+		mcq->mcq.comp(&mcq->mcq, NULL);
 	}
 
 	wait_for_completion(&sdrain->done);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/cq.c b/drivers/net/ethernet/mellanox/mlx5/core/cq.c
index 1bd4336392a2..818edc63e428 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/cq.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/cq.c
@@ -58,7 +58,7 @@ void mlx5_cq_tasklet_cb(unsigned long data)
 	list_for_each_entry_safe(mcq, temp, &ctx->process_list,
 				 tasklet_ctx.list) {
 		list_del_init(&mcq->tasklet_ctx.list);
-		mcq->tasklet_ctx.comp(mcq);
+		mcq->tasklet_ctx.comp(mcq, NULL);
 		mlx5_cq_put(mcq);
 		if (time_after(jiffies, end))
 			break;
@@ -68,7 +68,8 @@ void mlx5_cq_tasklet_cb(unsigned long data)
 		tasklet_schedule(&ctx->task);
 }
 
-static void mlx5_add_cq_to_tasklet(struct mlx5_core_cq *cq)
+static void mlx5_add_cq_to_tasklet(struct mlx5_core_cq *cq,
+				   struct mlx5_eqe *eqe)
 {
 	unsigned long flags;
 	struct mlx5_eq_tasklet *tasklet_ctx = cq->tasklet_ctx.priv;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index 3a183d690e23..16753f263079 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -780,7 +780,7 @@ netdev_tx_t mlx5e_sq_xmit(struct mlx5e_txqsq *sq, struct sk_buff *skb,
 			  struct mlx5e_tx_wqe *wqe, u16 pi, bool xmit_more);
 
 void mlx5e_trigger_irq(struct mlx5e_icosq *sq);
-void mlx5e_completion_event(struct mlx5_core_cq *mcq);
+void mlx5e_completion_event(struct mlx5_core_cq *mcq, struct mlx5_eqe *eqe);
 void mlx5e_cq_error_event(struct mlx5_core_cq *mcq, enum mlx5_event event);
 int mlx5e_napi_poll(struct napi_struct *napi, int budget);
 bool mlx5e_poll_tx_cq(struct mlx5e_cq *cq, int napi_budget);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c
index f9862bf75491..c665ae0f22bd 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c
@@ -136,7 +136,7 @@ int mlx5e_napi_poll(struct napi_struct *napi, int budget)
 	return work_done;
 }
 
-void mlx5e_completion_event(struct mlx5_core_cq *mcq)
+void mlx5e_completion_event(struct mlx5_core_cq *mcq, struct mlx5_eqe *eqe)
 {
 	struct mlx5e_cq *cq = container_of(mcq, struct mlx5e_cq, mcq);
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eq.c b/drivers/net/ethernet/mellanox/mlx5/core/eq.c
index 678454535460..41f25ea2e8d9 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eq.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eq.c
@@ -153,7 +153,7 @@ static int mlx5_eq_comp_int(struct notifier_block *nb,
 		cq = mlx5_eq_cq_get(eq, cqn);
 		if (likely(cq)) {
 			++cq->arm_sn;
-			cq->comp(cq);
+			cq->comp(cq, eqe);
 			mlx5_cq_put(cq);
 		} else {
 			mlx5_core_warn(eq->dev, "Completion event for bogus CQ 0x%x\n", cqn);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.c b/drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.c
index 1fa922698a88..4c50efe4e7f1 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.c
@@ -414,7 +414,8 @@ static void mlx5_fpga_conn_cq_tasklet(unsigned long data)
 	mlx5_fpga_conn_cqes(conn, MLX5_FPGA_CQ_BUDGET);
 }
 
-static void mlx5_fpga_conn_cq_complete(struct mlx5_core_cq *mcq)
+static void mlx5_fpga_conn_cq_complete(struct mlx5_core_cq *mcq,
+				       struct mlx5_eqe *eqe)
 {
 	struct mlx5_fpga_conn *conn;
 
diff --git a/include/linux/mlx5/cq.h b/include/linux/mlx5/cq.h
index e44157a2b7db..40748fc1b11b 100644
--- a/include/linux/mlx5/cq.h
+++ b/include/linux/mlx5/cq.h
@@ -47,7 +47,7 @@ struct mlx5_core_cq {
 	struct completion	free;
 	unsigned		vector;
 	unsigned int		irqn;
-	void (*comp)		(struct mlx5_core_cq *);
+	void (*comp)(struct mlx5_core_cq *cq, struct mlx5_eqe *eqe);
 	void (*event)		(struct mlx5_core_cq *, enum mlx5_event);
 	u32			cons_index;
 	unsigned		arm_sn;
@@ -55,7 +55,7 @@ struct mlx5_core_cq {
 	int			pid;
 	struct {
 		struct list_head list;
-		void (*comp)(struct mlx5_core_cq *);
+		void (*comp)(struct mlx5_core_cq *cq, struct mlx5_eqe *eqe);
 		void		*priv;
 	} tasklet_ctx;
 	int			reset_notify_added;
-- 
cgit v1.2.3


From e4075c44287638b9a99430fea79a2d1468fbc27d Mon Sep 17 00:00:00 2001
From: Yishai Hadas <yishaih@mellanox.com>
Date: Sun, 30 Jun 2019 19:23:28 +0300
Subject: net/mlx5: Expose device definitions for object events

Expose extra device definitions for object events.

This includes object_type values for legacy objects and a generic data
header for any other object.

Signed-off-by: Yishai Hadas <yishaih@mellanox.com>
Acked-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
---
 include/linux/mlx5/mlx5_ifc.h | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index 4148c47a65ed..be92401a25a0 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -91,6 +91,20 @@ enum {
 
 enum {
 	MLX5_OBJ_TYPE_GENEVE_TLV_OPT = 0x000b,
+	MLX5_OBJ_TYPE_MKEY = 0xff01,
+	MLX5_OBJ_TYPE_QP = 0xff02,
+	MLX5_OBJ_TYPE_PSV = 0xff03,
+	MLX5_OBJ_TYPE_RMP = 0xff04,
+	MLX5_OBJ_TYPE_XRC_SRQ = 0xff05,
+	MLX5_OBJ_TYPE_RQ = 0xff06,
+	MLX5_OBJ_TYPE_SQ = 0xff07,
+	MLX5_OBJ_TYPE_TIR = 0xff08,
+	MLX5_OBJ_TYPE_TIS = 0xff09,
+	MLX5_OBJ_TYPE_DCT = 0xff0a,
+	MLX5_OBJ_TYPE_XRQ = 0xff0b,
+	MLX5_OBJ_TYPE_RQT = 0xff0e,
+	MLX5_OBJ_TYPE_FLOW_COUNTER = 0xff0f,
+	MLX5_OBJ_TYPE_CQ = 0xff10,
 };
 
 enum {
@@ -9944,4 +9958,11 @@ struct mlx5_ifc_alloc_sf_in_bits {
 	u8         reserved_at_60[0x20];
 };
 
+struct mlx5_ifc_affiliated_event_header_bits {
+	u8         reserved_at_0[0x10];
+	u8         obj_type[0x10];
+
+	u8         obj_id[0x20];
+};
+
 #endif /* MLX5_IFC_H */
-- 
cgit v1.2.3


From 83b44fe343b5abfcb1b2261289bd0cfcfcfd60a8 Mon Sep 17 00:00:00 2001
From: James Morse <james.morse@arm.com>
Date: Mon, 24 Jun 2019 18:36:56 +0100
Subject: drivers: base: cacheinfo: Ensure cpu hotplug work is done before
 Intel RDT

The cacheinfo structures are allocated/freed by cpu online/offline
callbacks. Originally these were only used by sysfs to expose the
cache topology to user space. Without any in-kernel dependencies
CPUHP_AP_ONLINE_DYN was an appropriate choice.

resctrl has started using these structures to identify CPUs that
share a cache. It updates its 'domain' structures from cpu
online/offline callbacks. These depend on the cacheinfo structures
(resctrl_online_cpu()->domain_add_cpu()->get_cache_id()->
 get_cpu_cacheinfo()).
These also run as CPUHP_AP_ONLINE_DYN.

Now that there is an in-kernel dependency, move the cacheinfo
work earlier so we know it's done before resctrl's CPUHP_AP_ONLINE_DYN
work runs.

Fixes: 2264d9c74dda1 ("x86/intel_rdt: Build structures for each resource based on cache topology")
Cc: <stable@vger.kernel.org>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: Reinette Chatre <reinette.chatre@intel.com>
Signed-off-by: James Morse <james.morse@arm.com>
Link: https://lore.kernel.org/r/20190624173656.202407-1-james.morse@arm.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/base/cacheinfo.c   | 3 ++-
 include/linux/cpuhotplug.h | 1 +
 2 files changed, 3 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/base/cacheinfo.c b/drivers/base/cacheinfo.c
index a7359535caf5..b444f89a2041 100644
--- a/drivers/base/cacheinfo.c
+++ b/drivers/base/cacheinfo.c
@@ -655,7 +655,8 @@ static int cacheinfo_cpu_pre_down(unsigned int cpu)
 
 static int __init cacheinfo_sysfs_init(void)
 {
-	return cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "base/cacheinfo:online",
+	return cpuhp_setup_state(CPUHP_AP_BASE_CACHEINFO_ONLINE,
+				 "base/cacheinfo:online",
 				 cacheinfo_cpu_online, cacheinfo_cpu_pre_down);
 }
 device_initcall(cacheinfo_sysfs_init);
diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h
index 6a381594608c..50c893f03c21 100644
--- a/include/linux/cpuhotplug.h
+++ b/include/linux/cpuhotplug.h
@@ -175,6 +175,7 @@ enum cpuhp_state {
 	CPUHP_AP_WATCHDOG_ONLINE,
 	CPUHP_AP_WORKQUEUE_ONLINE,
 	CPUHP_AP_RCUTREE_ONLINE,
+	CPUHP_AP_BASE_CACHEINFO_ONLINE,
 	CPUHP_AP_ONLINE_DYN,
 	CPUHP_AP_ONLINE_DYN_END		= CPUHP_AP_ONLINE_DYN + 30,
 	CPUHP_AP_X86_HPET_ONLINE,
-- 
cgit v1.2.3


From 62a6bc3a1e4f4ee9ae0076fa295f9af1c3725ce3 Mon Sep 17 00:00:00 2001
From: Thierry Reding <treding@nvidia.com>
Date: Fri, 21 Jun 2019 17:17:25 +0200
Subject: driver: core: Allow subsystems to continue deferring probe

Some subsystems, such as pinctrl, allow continuing to defer probe
indefinitely. This is useful for devices that depend on resources
provided by devices that are only probed after the init stage.

One example of this can be seen on Tegra, where the DPAUX hardware
contains pinmuxing controls for pins that it shares with an I2C
controller. The I2C controller is typically used for communication
with a monitor over HDMI (DDC). However, other instances of the I2C
controller are used to access system critical components, such as a
PMIC. The I2C controller driver will therefore usually be a builtin
driver, whereas the DPAUX driver is part of the display driver that
is loaded from a module to avoid bloating the kernel image with all
of the DRM/KMS subsystem.

In this particular case the pins used by this I2C/DDC controller
become accessible very late in the boot process. However, since the
controller is only used in conjunction with display, that's not an
issue.

Unfortunately the driver core currently outputs a warning message
when a device fails to get the pinctrl before the end of the init
stage. That can be confusing for the user because it may sound like
an unwanted error occurred, whereas it's really an expected and
harmless situation.

In order to eliminate this warning, this patch adds a new helper,
driver_deferred_probe_check_state_continue(), for callers that want
to continue deferring probe, regardless of whether we're past the
init stage or not. The existing driver_deferred_probe_check_state()
keeps its signature and behavior, and only the pinctrl subsystem calls
the new helper, when appropriate.

Signed-off-by: Thierry Reding <treding@nvidia.com>
Reviewed-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Reviewed-by: Linus Walleij <linus.walleij@linaro.org>
Link: https://lore.kernel.org/r/20190621151725.20414-1-thierry.reding@gmail.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/base/dd.c            | 55 +++++++++++++++++++++++++++++++++++++-------
 drivers/pinctrl/devicetree.c |  7 +++---
 include/linux/device.h       |  1 +
 3 files changed, 51 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/base/dd.c b/drivers/base/dd.c
index 0df9b4461766..994a90747420 100644
--- a/drivers/base/dd.c
+++ b/drivers/base/dd.c
@@ -235,6 +235,19 @@ static int __init deferred_probe_timeout_setup(char *str)
 }
 __setup("deferred_probe_timeout=", deferred_probe_timeout_setup);
 
+static int __driver_deferred_probe_check_state(struct device *dev)
+{
+	if (!initcalls_done)
+		return -EPROBE_DEFER;
+
+	if (!deferred_probe_timeout) {
+		dev_WARN(dev, "deferred probe timeout, ignoring dependency");
+		return -ETIMEDOUT;
+	}
+
+	return 0;
+}
+
 /**
  * driver_deferred_probe_check_state() - Check deferred probe state
  * @dev: device to check
@@ -248,14 +261,40 @@ __setup("deferred_probe_timeout=", deferred_probe_timeout_setup);
  */
 int driver_deferred_probe_check_state(struct device *dev)
 {
-	if (initcalls_done) {
-		if (!deferred_probe_timeout) {
-			dev_WARN(dev, "deferred probe timeout, ignoring dependency");
-			return -ETIMEDOUT;
-		}
-		dev_warn(dev, "ignoring dependency for device, assuming no driver");
-		return -ENODEV;
-	}
+	int ret;
+
+	ret = __driver_deferred_probe_check_state(dev);
+	if (ret < 0)
+		return ret;
+
+	dev_warn(dev, "ignoring dependency for device, assuming no driver");
+
+	return -ENODEV;
+}
+
+/**
+ * driver_deferred_probe_check_state_continue() - check deferred probe state
+ * @dev: device to check
+ *
+ * Returns -ETIMEDOUT if deferred probe debug timeout has expired, or
+ * -EPROBE_DEFER otherwise.
+ *
+ * Drivers or subsystems can opt-in to calling this function instead of
+ * directly returning -EPROBE_DEFER.
+ *
+ * This is similar to driver_deferred_probe_check_state(), but it allows the
+ * subsystem to keep deferring probe after built-in drivers have had a chance
+ * to probe. One scenario where that is useful is if built-in drivers rely on
+ * resources that are provided by modular drivers.
+ */
+int driver_deferred_probe_check_state_continue(struct device *dev)
+{
+	int ret;
+
+	ret = __driver_deferred_probe_check_state(dev);
+	if (ret < 0)
+		return ret;
+
 	return -EPROBE_DEFER;
 }
 
diff --git a/drivers/pinctrl/devicetree.c b/drivers/pinctrl/devicetree.c
index f7e354f85518..88ddbb2e30de 100644
--- a/drivers/pinctrl/devicetree.c
+++ b/drivers/pinctrl/devicetree.c
@@ -112,12 +112,11 @@ static int dt_to_map_one_config(struct pinctrl *p,
 		np_pctldev = of_get_next_parent(np_pctldev);
 		if (!np_pctldev || of_node_is_root(np_pctldev)) {
 			of_node_put(np_pctldev);
-			ret = driver_deferred_probe_check_state(p->dev);
 			/* keep deferring if modules are enabled unless we've timed out */
-			if (IS_ENABLED(CONFIG_MODULES) && !allow_default && ret == -ENODEV)
-				ret = -EPROBE_DEFER;
+			if (IS_ENABLED(CONFIG_MODULES) && !allow_default)
+				return driver_deferred_probe_check_state_continue(p->dev);
 
-			return ret;
+			return driver_deferred_probe_check_state(p->dev);
 		}
 		/* If we're creating a hog we can use the passed pctldev */
 		if (hog_pctldev && (np_pctldev == p->dev->of_node)) {
diff --git a/include/linux/device.h b/include/linux/device.h
index 709308560d32..ef61e2d50ecc 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -343,6 +343,7 @@ struct device *driver_find_device(struct device_driver *drv,
 
 void driver_deferred_probe_add(struct device *dev);
 int driver_deferred_probe_check_state(struct device *dev);
+int driver_deferred_probe_check_state_continue(struct device *dev);
 
 /**
  * struct subsys_interface - interfaces to device functions
-- 
cgit v1.2.3


From 2752b823169b216db142c4466b43269281962dcf Mon Sep 17 00:00:00 2001
From: Parav Pandit <parav@mellanox.com>
Date: Wed, 15 May 2019 00:04:27 -0500
Subject: net/mlx5: Introduce and use mlx5_eswitch_get_total_vports()

Instead of MLX5_TOTAL_VPORTS, use mlx5_eswitch_get_total_vports().
mlx5_eswitch_get_total_vports() in a subsequent patch accounts for SF
vports as well.
Expanding the MLX5_TOTAL_VPORTS macro would require exposing SF internals to
the more generic vport.h header file. Such exposure is not desired.
Hence mlx5_eswitch_get_total_vports() is introduced.

Given that mlx5_eswitch_get_total_vports() API wants to work on const
mlx5_core_dev*, change its helper functions also to accept const *dev.

Signed-off-by: Parav Pandit <parav@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/infiniband/hw/mlx5/ib_rep.c                |  2 +-
 drivers/net/ethernet/mellanox/mlx5/core/eswitch.c  |  4 +++-
 .../ethernet/mellanox/mlx5/core/eswitch_offloads.c |  2 +-
 drivers/net/ethernet/mellanox/mlx5/core/fs_core.c  | 26 +++++++++++++---------
 drivers/net/ethernet/mellanox/mlx5/core/vport.c    | 15 +++++++++++++
 include/linux/mlx5/driver.h                        |  9 ++++----
 include/linux/mlx5/eswitch.h                       |  3 +++
 include/linux/mlx5/vport.h                         |  3 ---
 8 files changed, 43 insertions(+), 21 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/infiniband/hw/mlx5/ib_rep.c b/drivers/infiniband/hw/mlx5/ib_rep.c
index 3065c5d0ee96..f2cb789d2331 100644
--- a/drivers/infiniband/hw/mlx5/ib_rep.c
+++ b/drivers/infiniband/hw/mlx5/ib_rep.c
@@ -29,7 +29,7 @@ mlx5_ib_set_vport_rep(struct mlx5_core_dev *dev, struct mlx5_eswitch_rep *rep)
 static int
 mlx5_ib_vport_rep_load(struct mlx5_core_dev *dev, struct mlx5_eswitch_rep *rep)
 {
-	int num_ports = MLX5_TOTAL_VPORTS(dev);
+	int num_ports = mlx5_eswitch_get_total_vports(dev);
 	const struct mlx5_ib_profile *profile;
 	struct mlx5_ib_dev *ibdev;
 	int vport_index;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
index 89f52370e770..9137a8390216 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
@@ -1868,14 +1868,16 @@ void mlx5_eswitch_disable(struct mlx5_eswitch *esw)
 
 int mlx5_eswitch_init(struct mlx5_core_dev *dev)
 {
-	int total_vports = MLX5_TOTAL_VPORTS(dev);
 	struct mlx5_eswitch *esw;
 	struct mlx5_vport *vport;
+	int total_vports;
 	int err, i;
 
 	if (!MLX5_VPORT_MANAGER(dev))
 		return 0;
 
+	total_vports = mlx5_eswitch_get_total_vports(dev);
+
 	esw_info(dev,
 		 "Total vports %d, per vport: max uc(%d) max mc(%d)\n",
 		 total_vports,
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
index 50e5841c1698..5c8fb2597bfa 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
@@ -1394,7 +1394,7 @@ void esw_offloads_cleanup_reps(struct mlx5_eswitch *esw)
 
 int esw_offloads_init_reps(struct mlx5_eswitch *esw)
 {
-	int total_vports = MLX5_TOTAL_VPORTS(esw->dev);
+	int total_vports = esw->total_vports;
 	struct mlx5_core_dev *dev = esw->dev;
 	struct mlx5_eswitch_rep *rep;
 	u8 hw_id[ETH_ALEN], rep_type;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
index 9f5544ac6b8a..8162252585ad 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
@@ -2090,7 +2090,7 @@ struct mlx5_flow_namespace *mlx5_get_flow_vport_acl_namespace(struct mlx5_core_d
 {
 	struct mlx5_flow_steering *steering = dev->priv.steering;
 
-	if (!steering || vport >= MLX5_TOTAL_VPORTS(dev))
+	if (!steering || vport >= mlx5_eswitch_get_total_vports(dev))
 		return NULL;
 
 	switch (type) {
@@ -2421,7 +2421,7 @@ static void cleanup_egress_acls_root_ns(struct mlx5_core_dev *dev)
 	if (!steering->esw_egress_root_ns)
 		return;
 
-	for (i = 0; i < MLX5_TOTAL_VPORTS(dev); i++)
+	for (i = 0; i < mlx5_eswitch_get_total_vports(dev); i++)
 		cleanup_root_ns(steering->esw_egress_root_ns[i]);
 
 	kfree(steering->esw_egress_root_ns);
@@ -2435,7 +2435,7 @@ static void cleanup_ingress_acls_root_ns(struct mlx5_core_dev *dev)
 	if (!steering->esw_ingress_root_ns)
 		return;
 
-	for (i = 0; i < MLX5_TOTAL_VPORTS(dev); i++)
+	for (i = 0; i < mlx5_eswitch_get_total_vports(dev); i++)
 		cleanup_root_ns(steering->esw_ingress_root_ns[i]);
 
 	kfree(steering->esw_ingress_root_ns);
@@ -2614,16 +2614,18 @@ static int init_ingress_acl_root_ns(struct mlx5_flow_steering *steering, int vpo
 static int init_egress_acls_root_ns(struct mlx5_core_dev *dev)
 {
 	struct mlx5_flow_steering *steering = dev->priv.steering;
+	int total_vports = mlx5_eswitch_get_total_vports(dev);
 	int err;
 	int i;
 
-	steering->esw_egress_root_ns = kcalloc(MLX5_TOTAL_VPORTS(dev),
-					       sizeof(*steering->esw_egress_root_ns),
-					       GFP_KERNEL);
+	steering->esw_egress_root_ns =
+			kcalloc(total_vports,
+				sizeof(*steering->esw_egress_root_ns),
+				GFP_KERNEL);
 	if (!steering->esw_egress_root_ns)
 		return -ENOMEM;
 
-	for (i = 0; i < MLX5_TOTAL_VPORTS(dev); i++) {
+	for (i = 0; i < total_vports; i++) {
 		err = init_egress_acl_root_ns(steering, i);
 		if (err)
 			goto cleanup_root_ns;
@@ -2641,16 +2643,18 @@ cleanup_root_ns:
 static int init_ingress_acls_root_ns(struct mlx5_core_dev *dev)
 {
 	struct mlx5_flow_steering *steering = dev->priv.steering;
+	int total_vports = mlx5_eswitch_get_total_vports(dev);
 	int err;
 	int i;
 
-	steering->esw_ingress_root_ns = kcalloc(MLX5_TOTAL_VPORTS(dev),
-						sizeof(*steering->esw_ingress_root_ns),
-						GFP_KERNEL);
+	steering->esw_ingress_root_ns =
+			kcalloc(total_vports,
+				sizeof(*steering->esw_ingress_root_ns),
+				GFP_KERNEL);
 	if (!steering->esw_ingress_root_ns)
 		return -ENOMEM;
 
-	for (i = 0; i < MLX5_TOTAL_VPORTS(dev); i++) {
+	for (i = 0; i < total_vports; i++) {
 		err = init_ingress_acl_root_ns(steering, i);
 		if (err)
 			goto cleanup_root_ns;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/vport.c b/drivers/net/ethernet/mellanox/mlx5/core/vport.c
index 670fa493c5f5..c912d82ca64b 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/vport.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/vport.c
@@ -34,6 +34,7 @@
 #include <linux/etherdevice.h>
 #include <linux/mlx5/driver.h>
 #include <linux/mlx5/vport.h>
+#include <linux/mlx5/eswitch.h>
 #include "mlx5_core.h"
 
 /* Mutex to hold while enabling or disabling RoCE */
@@ -1165,3 +1166,17 @@ u64 mlx5_query_nic_system_image_guid(struct mlx5_core_dev *mdev)
 	return tmp;
 }
 EXPORT_SYMBOL_GPL(mlx5_query_nic_system_image_guid);
+
+/**
+ * mlx5_eswitch_get_total_vports - Get total vports of the eswitch
+ *
+ * @dev:	Pointer to core device
+ *
+ * mlx5_eswitch_get_total_vports returns total number of vports for
+ * the eswitch.
+ */
+u16 mlx5_eswitch_get_total_vports(const struct mlx5_core_dev *dev)
+{
+	return MLX5_SPECIAL_VPORTS(dev) + mlx5_core_max_vfs(dev);
+}
+EXPORT_SYMBOL(mlx5_eswitch_get_total_vports);
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 24b02ab206c3..031043341ed5 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -1085,7 +1085,7 @@ enum {
 	MLX5_PCI_DEV_IS_VF		= 1 << 0,
 };
 
-static inline bool mlx5_core_is_pf(struct mlx5_core_dev *dev)
+static inline bool mlx5_core_is_pf(const struct mlx5_core_dev *dev)
 {
 	return dev->coredev_type == MLX5_COREDEV_PF;
 }
@@ -1095,17 +1095,18 @@ static inline bool mlx5_core_is_ecpf(struct mlx5_core_dev *dev)
 	return dev->caps.embedded_cpu;
 }
 
-static inline bool mlx5_core_is_ecpf_esw_manager(struct mlx5_core_dev *dev)
+static inline bool
+mlx5_core_is_ecpf_esw_manager(const struct mlx5_core_dev *dev)
 {
 	return dev->caps.embedded_cpu && MLX5_CAP_GEN(dev, eswitch_manager);
 }
 
-static inline bool mlx5_ecpf_vport_exists(struct mlx5_core_dev *dev)
+static inline bool mlx5_ecpf_vport_exists(const struct mlx5_core_dev *dev)
 {
 	return mlx5_core_is_pf(dev) && MLX5_CAP_ESW(dev, ecpf_vport_exists);
 }
 
-static inline u16 mlx5_core_max_vfs(struct mlx5_core_dev *dev)
+static inline u16 mlx5_core_max_vfs(const struct mlx5_core_dev *dev)
 {
 	return dev->priv.sriov.max_vfs;
 }
diff --git a/include/linux/mlx5/eswitch.h b/include/linux/mlx5/eswitch.h
index d4731199edb4..61db37aa9642 100644
--- a/include/linux/mlx5/eswitch.h
+++ b/include/linux/mlx5/eswitch.h
@@ -66,6 +66,8 @@ struct mlx5_flow_handle *
 mlx5_eswitch_add_send_to_vport_rule(struct mlx5_eswitch *esw,
 				    int vport, u32 sqn);
 
+u16 mlx5_eswitch_get_total_vports(const struct mlx5_core_dev *dev);
+
 #ifdef CONFIG_MLX5_ESWITCH
 enum devlink_eswitch_encap_mode
 mlx5_eswitch_get_encap_mode(const struct mlx5_core_dev *dev);
@@ -93,4 +95,5 @@ mlx5_eswitch_get_vport_metadata_for_match(const struct mlx5_eswitch *esw,
 	return 0;
 };
 #endif /* CONFIG_MLX5_ESWITCH */
+
 #endif
diff --git a/include/linux/mlx5/vport.h b/include/linux/mlx5/vport.h
index 6cbf29229749..16060fb9b5e5 100644
--- a/include/linux/mlx5/vport.h
+++ b/include/linux/mlx5/vport.h
@@ -44,9 +44,6 @@
 				   MLX5_VPORT_UPLINK_PLACEHOLDER +	\
 				   MLX5_VPORT_ECPF_PLACEHOLDER(mdev))
 
-#define MLX5_TOTAL_VPORTS(mdev)	(MLX5_SPECIAL_VPORTS(mdev) +		\
-				 mlx5_core_max_vfs(mdev))
-
 #define MLX5_VPORT_MANAGER(mdev)					\
 	(MLX5_CAP_GEN(mdev, vport_group_manager) &&			\
 	 (MLX5_CAP_GEN(mdev, port_type) == MLX5_CAP_PORT_TYPE_ETH) &&	\
-- 
cgit v1.2.3


From a12ff35e0fb770b4d060298be147189313ec002c Mon Sep 17 00:00:00 2001
From: Eran Ben Elisha <eranbe@mellanox.com>
Date: Wed, 3 Apr 2019 13:05:50 +0300
Subject: net/mlx5: Introduce TLS TX offload hardware bits and structures

Add TLS offload related IFC structs, layouts and enumerations.

Signed-off-by: Eran Ben Elisha <eranbe@mellanox.com>
Signed-off-by: Tariq Toukan <tariqt@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 include/linux/mlx5/device.h   |  14 ++++++
 include/linux/mlx5/mlx5_ifc.h | 104 ++++++++++++++++++++++++++++++++++++++++--
 2 files changed, 114 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h
index 0d1abe097627..7358d64e76fa 100644
--- a/include/linux/mlx5/device.h
+++ b/include/linux/mlx5/device.h
@@ -437,6 +437,7 @@ enum {
 	MLX5_OPCODE_SET_PSV		= 0x20,
 	MLX5_OPCODE_GET_PSV		= 0x21,
 	MLX5_OPCODE_CHECK_PSV		= 0x22,
+	MLX5_OPCODE_DUMP		= 0x23,
 	MLX5_OPCODE_RGET_PSV		= 0x26,
 	MLX5_OPCODE_RCHECK_PSV		= 0x27,
 
@@ -444,6 +445,14 @@ enum {
 
 };
 
+enum {
+	MLX5_OPC_MOD_TLS_TIS_STATIC_PARAMS = 0x20,
+};
+
+enum {
+	MLX5_OPC_MOD_TLS_TIS_PROGRESS_PARAMS = 0x20,
+};
+
 enum {
 	MLX5_SET_PORT_RESET_QKEY	= 0,
 	MLX5_SET_PORT_GUID0		= 16,
@@ -1077,6 +1086,8 @@ enum mlx5_cap_type {
 	MLX5_CAP_DEBUG,
 	MLX5_CAP_RESERVED_14,
 	MLX5_CAP_DEV_MEM,
+	MLX5_CAP_RESERVED_16,
+	MLX5_CAP_TLS,
 	MLX5_CAP_DEV_EVENT = 0x14,
 	/* NUM OF CAP Types */
 	MLX5_CAP_NUM
@@ -1256,6 +1267,9 @@ enum mlx5_qcam_feature_groups {
 #define MLX5_CAP64_DEV_MEM(mdev, cap)\
 	MLX5_GET64(device_mem_cap, mdev->caps.hca_cur[MLX5_CAP_DEV_MEM], cap)
 
+#define MLX5_CAP_TLS(mdev, cap) \
+	MLX5_GET(tls_cap, (mdev)->caps.hca_cur[MLX5_CAP_TLS], cap)
+
 #define MLX5_CAP_DEV_EVENT(mdev, cap)\
 	MLX5_ADDR_OF(device_event_cap, (mdev)->caps.hca_cur[MLX5_CAP_DEV_EVENT], cap)
 
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index be92401a25a0..f03ec31e3232 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -973,6 +973,16 @@ struct mlx5_ifc_vector_calc_cap_bits {
 	u8         reserved_at_c0[0x720];
 };
 
+struct mlx5_ifc_tls_cap_bits {
+	u8         tls_1_2_aes_gcm_128[0x1];
+	u8         tls_1_3_aes_gcm_128[0x1];
+	u8         tls_1_2_aes_gcm_256[0x1];
+	u8         tls_1_3_aes_gcm_256[0x1];
+	u8         reserved_at_4[0x1c];
+
+	u8         reserved_at_20[0x7e0];
+};
+
 enum {
 	MLX5_WQ_TYPE_LINKED_LIST  = 0x0,
 	MLX5_WQ_TYPE_CYCLIC       = 0x1,
@@ -1303,7 +1313,8 @@ struct mlx5_ifc_cmd_hca_cap_bits {
 
 	u8         reserved_at_440[0x20];
 
-	u8         reserved_at_460[0x3];
+	u8         tls[0x1];
+	u8         reserved_at_461[0x2];
 	u8         log_max_uctx[0x5];
 	u8         reserved_at_468[0x3];
 	u8         log_max_umem[0x5];
@@ -1328,7 +1339,9 @@ struct mlx5_ifc_cmd_hca_cap_bits {
 	u8         max_geneve_tlv_option_data_len[0x5];
 	u8         reserved_at_570[0x10];
 
-	u8         reserved_at_580[0x3c];
+	u8         reserved_at_580[0x33];
+	u8         log_max_dek[0x5];
+	u8         reserved_at_5b8[0x4];
 	u8         mini_cqe_resp_stride_index[0x1];
 	u8         cqe_128_always[0x1];
 	u8         cqe_compression_128[0x1];
@@ -2607,6 +2620,7 @@ union mlx5_ifc_hca_cap_union_bits {
 	struct mlx5_ifc_qos_cap_bits qos_cap;
 	struct mlx5_ifc_debug_cap_bits debug_cap;
 	struct mlx5_ifc_fpga_cap_bits fpga_cap;
+	struct mlx5_ifc_tls_cap_bits tls_cap;
 	u8         reserved_at_0[0x8000];
 };
 
@@ -2746,7 +2760,8 @@ struct mlx5_ifc_traffic_counter_bits {
 
 struct mlx5_ifc_tisc_bits {
 	u8         strict_lag_tx_port_affinity[0x1];
-	u8         reserved_at_1[0x3];
+	u8         tls_en[0x1];
+	u8         reserved_at_1[0x2];
 	u8         lag_tx_port_affinity[0x04];
 
 	u8         reserved_at_8[0x4];
@@ -2760,7 +2775,11 @@ struct mlx5_ifc_tisc_bits {
 
 	u8         reserved_at_140[0x8];
 	u8         underlay_qpn[0x18];
-	u8         reserved_at_160[0x3a0];
+
+	u8         reserved_at_160[0x8];
+	u8         pd[0x18];
+
+	u8         reserved_at_180[0x380];
 };
 
 enum {
@@ -9965,4 +9984,81 @@ struct mlx5_ifc_affiliated_event_header_bits {
 	u8         obj_id[0x20];
 };
 
+enum {
+	MLX5_HCA_CAP_GENERAL_OBJECT_TYPES_ENCRYPTION_KEY = BIT(0xc),
+};
+
+enum {
+	MLX5_GENERAL_OBJECT_TYPES_ENCRYPTION_KEY = 0xc,
+};
+
+struct mlx5_ifc_encryption_key_obj_bits {
+	u8         modify_field_select[0x40];
+
+	u8         reserved_at_40[0x14];
+	u8         key_size[0x4];
+	u8         reserved_at_58[0x4];
+	u8         key_type[0x4];
+
+	u8         reserved_at_60[0x8];
+	u8         pd[0x18];
+
+	u8         reserved_at_80[0x180];
+	u8         key[8][0x20];
+
+	u8         reserved_at_300[0x500];
+};
+
+struct mlx5_ifc_create_encryption_key_in_bits {
+	struct mlx5_ifc_general_obj_in_cmd_hdr_bits general_obj_in_cmd_hdr;
+	struct mlx5_ifc_encryption_key_obj_bits encryption_key_object;
+};
+
+enum {
+	MLX5_GENERAL_OBJECT_TYPE_ENCRYPTION_KEY_KEY_SIZE_128 = 0x0,
+	MLX5_GENERAL_OBJECT_TYPE_ENCRYPTION_KEY_KEY_SIZE_256 = 0x1,
+};
+
+enum {
+	MLX5_GENERAL_OBJECT_TYPE_ENCRYPTION_KEY_TYPE_DEK = 0x1,
+};
+
+struct mlx5_ifc_tls_static_params_bits {
+	u8         const_2[0x2];
+	u8         tls_version[0x4];
+	u8         const_1[0x2];
+	u8         reserved_at_8[0x14];
+	u8         encryption_standard[0x4];
+
+	u8         reserved_at_20[0x20];
+
+	u8         initial_record_number[0x40];
+
+	u8         resync_tcp_sn[0x20];
+
+	u8         gcm_iv[0x20];
+
+	u8         implicit_iv[0x40];
+
+	u8         reserved_at_100[0x8];
+	u8         dek_index[0x18];
+
+	u8         reserved_at_120[0xe0];
+};
+
+struct mlx5_ifc_tls_progress_params_bits {
+	u8         valid[0x1];
+	u8         reserved_at_1[0x7];
+	u8         pd[0x18];
+
+	u8         next_record_tcp_sn[0x20];
+
+	u8         hw_resync_tcp_sn[0x20];
+
+	u8         record_tracker_state[0x2];
+	u8         auth_state[0x2];
+	u8         reserved_at_64[0x4];
+	u8         hw_offset_record_number[0x18];
+};
+
 #endif /* MLX5_IFC_H */
-- 
cgit v1.2.3


From 0718edf528c552c66a5dc3525ffb145971efa766 Mon Sep 17 00:00:00 2001
From: Tariq Toukan <tariqt@mellanox.com>
Date: Tue, 2 Jul 2019 17:12:09 +0300
Subject: net/mlx5: Properly name the generic WQE control field

A generic WQE control field is used for different purposes
in different cases.
Use union to allow using the proper name in each case.

Signed-off-by: Tariq Toukan <tariqt@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 include/linux/mlx5/qp.h | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mlx5/qp.h b/include/linux/mlx5/qp.h
index d1f353c64797..127d224443e3 100644
--- a/include/linux/mlx5/qp.h
+++ b/include/linux/mlx5/qp.h
@@ -202,7 +202,12 @@ struct mlx5_wqe_ctrl_seg {
 	u8			signature;
 	u8			rsvd[2];
 	u8			fm_ce_se;
-	__be32			imm;
+	union {
+		__be32		general_id;
+		__be32		imm;
+		__be32		umr_mkey;
+		__be32		tisn;
+	};
 };
 
 #define MLX5_WQE_CTRL_DS_MASK 0x3f
-- 
cgit v1.2.3


From 89e0edfbea103d9b274efa10a8fc7a88bdac8f76 Mon Sep 17 00:00:00 2001
From: Benjamin Coddington <bcodding@redhat.com>
Date: Thu, 23 May 2019 10:45:45 -0400
Subject: lockd: Convert NLM service fl_owner to nlm_lockowner

Do as the NLM client: allocate and track a struct nlm_lockowner for use as
the fl_owner for locks created by the NLM server.  This allows us to keep
the svid within this structure for matching locks, and will allow us to
track the pid of lockd in a future patch.  It should also allow easier
reference of the nlm_host in conflicting locks, and simplify lock hashing
and comparison.

Signed-off-by: Benjamin Coddington <bcodding@redhat.com>
[bfields@redhat.com: fix type of some error returns]
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/lockd/svc4proc.c         | 13 +++++-
 fs/lockd/svclock.c          | 96 +++++++++++++++++++++++++++++++++++++++++++++
 fs/lockd/svcproc.c          | 13 +++++-
 fs/lockd/svcsubs.c          |  2 +-
 fs/lockd/xdr.c              |  1 -
 fs/lockd/xdr4.c             |  1 -
 include/linux/lockd/lockd.h |  2 +
 7 files changed, 123 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c
index 1bddf70d9656..a00134fd8956 100644
--- a/fs/lockd/svc4proc.c
+++ b/fs/lockd/svc4proc.c
@@ -46,8 +46,13 @@ nlm4svc_retrieve_args(struct svc_rqst *rqstp, struct nlm_args *argp,
 
 		/* Set up the missing parts of the file_lock structure */
 		lock->fl.fl_file  = file->f_file;
-		lock->fl.fl_owner = (fl_owner_t) host;
 		lock->fl.fl_lmops = &nlmsvc_lock_operations;
+		nlmsvc_locks_init_private(&lock->fl, host, (pid_t)lock->svid);
+		if (!lock->fl.fl_owner) {
+			/* lockowner allocation has failed */
+			nlmsvc_release_host(host);
+			return nlm_lck_denied_nolocks;
+		}
 	}
 
 	return 0;
@@ -94,6 +99,7 @@ __nlm4svc_proc_test(struct svc_rqst *rqstp, struct nlm_res *resp)
 	else
 		dprintk("lockd: TEST4        status %d\n", ntohl(resp->status));
 
+	nlmsvc_release_lockowner(&argp->lock);
 	nlmsvc_release_host(host);
 	nlm_release_file(file);
 	return rc;
@@ -142,6 +148,7 @@ __nlm4svc_proc_lock(struct svc_rqst *rqstp, struct nlm_res *resp)
 	else
 		dprintk("lockd: LOCK         status %d\n", ntohl(resp->status));
 
+	nlmsvc_release_lockowner(&argp->lock);
 	nlmsvc_release_host(host);
 	nlm_release_file(file);
 	return rc;
@@ -178,6 +185,7 @@ __nlm4svc_proc_cancel(struct svc_rqst *rqstp, struct nlm_res *resp)
 	resp->status = nlmsvc_cancel_blocked(SVC_NET(rqstp), file, &argp->lock);
 
 	dprintk("lockd: CANCEL        status %d\n", ntohl(resp->status));
+	nlmsvc_release_lockowner(&argp->lock);
 	nlmsvc_release_host(host);
 	nlm_release_file(file);
 	return rpc_success;
@@ -217,6 +225,7 @@ __nlm4svc_proc_unlock(struct svc_rqst *rqstp, struct nlm_res *resp)
 	resp->status = nlmsvc_unlock(SVC_NET(rqstp), file, &argp->lock);
 
 	dprintk("lockd: UNLOCK        status %d\n", ntohl(resp->status));
+	nlmsvc_release_lockowner(&argp->lock);
 	nlmsvc_release_host(host);
 	nlm_release_file(file);
 	return rpc_success;
@@ -365,6 +374,7 @@ nlm4svc_proc_share(struct svc_rqst *rqstp)
 	resp->status = nlmsvc_share_file(host, file, argp);
 
 	dprintk("lockd: SHARE         status %d\n", ntohl(resp->status));
+	nlmsvc_release_lockowner(&argp->lock);
 	nlmsvc_release_host(host);
 	nlm_release_file(file);
 	return rpc_success;
@@ -399,6 +409,7 @@ nlm4svc_proc_unshare(struct svc_rqst *rqstp)
 	resp->status = nlmsvc_unshare_file(host, file, argp);
 
 	dprintk("lockd: UNSHARE       status %d\n", ntohl(resp->status));
+	nlmsvc_release_lockowner(&argp->lock);
 	nlmsvc_release_host(host);
 	nlm_release_file(file);
 	return rpc_success;
diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c
index ea719cdd6a36..34c6ee85274e 100644
--- a/fs/lockd/svclock.c
+++ b/fs/lockd/svclock.c
@@ -332,6 +332,93 @@ restart:
 	mutex_unlock(&file->f_mutex);
 }
 
+static struct nlm_lockowner *
+nlmsvc_get_lockowner(struct nlm_lockowner *lockowner)
+{
+	refcount_inc(&lockowner->count);
+	return lockowner;
+}
+
+static void nlmsvc_put_lockowner(struct nlm_lockowner *lockowner)
+{
+	if (!refcount_dec_and_lock(&lockowner->count, &lockowner->host->h_lock))
+		return;
+	list_del(&lockowner->list);
+	spin_unlock(&lockowner->host->h_lock);
+	nlmsvc_release_host(lockowner->host);
+	kfree(lockowner);
+}
+
+static struct nlm_lockowner *__nlmsvc_find_lockowner(struct nlm_host *host, pid_t pid)
+{
+	struct nlm_lockowner *lockowner;
+	list_for_each_entry(lockowner, &host->h_lockowners, list) {
+		if (lockowner->pid != pid)
+			continue;
+		return nlmsvc_get_lockowner(lockowner);
+	}
+	return NULL;
+}
+
+static struct nlm_lockowner *nlmsvc_find_lockowner(struct nlm_host *host, pid_t pid)
+{
+	struct nlm_lockowner *res, *new = NULL;
+
+	spin_lock(&host->h_lock);
+	res = __nlmsvc_find_lockowner(host, pid);
+
+	if (res == NULL) {
+		spin_unlock(&host->h_lock);
+		new = kmalloc(sizeof(*res), GFP_KERNEL);
+		spin_lock(&host->h_lock);
+		res = __nlmsvc_find_lockowner(host, pid);
+		if (res == NULL && new != NULL) {
+			res = new;
+			/* fs/locks.c will manage the refcount through lock_ops */
+			refcount_set(&new->count, 1);
+			new->pid = pid;
+			new->host = nlm_get_host(host);
+			list_add(&new->list, &host->h_lockowners);
+			new = NULL;
+		}
+	}
+
+	spin_unlock(&host->h_lock);
+	kfree(new);
+	return res;
+}
+
+void
+nlmsvc_release_lockowner(struct nlm_lock *lock)
+{
+	if (lock->fl.fl_owner)
+		nlmsvc_put_lockowner(lock->fl.fl_owner);
+}
+
+static void nlmsvc_locks_copy_lock(struct file_lock *new, struct file_lock *fl)
+{
+	struct nlm_lockowner *nlm_lo = (struct nlm_lockowner *)fl->fl_owner;
+	new->fl_owner = nlmsvc_get_lockowner(nlm_lo);
+}
+
+static void nlmsvc_locks_release_private(struct file_lock *fl)
+{
+	nlmsvc_put_lockowner((struct nlm_lockowner *)fl->fl_owner);
+}
+
+const struct file_lock_operations nlmsvc_lock_ops = {
+	.fl_copy_lock = nlmsvc_locks_copy_lock,
+	.fl_release_private = nlmsvc_locks_release_private,
+};
+
+void nlmsvc_locks_init_private(struct file_lock *fl, struct nlm_host *host,
+						pid_t pid)
+{
+	fl->fl_owner = nlmsvc_find_lockowner(host, pid);
+	if (fl->fl_owner != NULL)
+		fl->fl_ops = &nlmsvc_lock_ops;
+}
+
 /*
  * Initialize arguments for GRANTED call. The nlm_rqst structure
  * has been cleared already.
@@ -509,6 +596,7 @@ nlmsvc_testlock(struct svc_rqst *rqstp, struct nlm_file *file,
 {
 	int			error;
 	__be32			ret;
+	struct nlm_lockowner	*test_owner;
 
 	dprintk("lockd: nlmsvc_testlock(%s/%ld, ty=%d, %Ld-%Ld)\n",
 				locks_inode(file->f_file)->i_sb->s_id,
@@ -522,6 +610,9 @@ nlmsvc_testlock(struct svc_rqst *rqstp, struct nlm_file *file,
 		goto out;
 	}
 
+	/* If there's a conflicting lock, remember to clean up the test lock */
+	test_owner = (struct nlm_lockowner *)lock->fl.fl_owner;
+
 	error = vfs_test_lock(file->f_file, &lock->fl);
 	if (error) {
 		/* We can't currently deal with deferred test requests */
@@ -548,6 +639,11 @@ nlmsvc_testlock(struct svc_rqst *rqstp, struct nlm_file *file,
 	conflock->fl.fl_start = lock->fl.fl_start;
 	conflock->fl.fl_end = lock->fl.fl_end;
 	locks_release_private(&lock->fl);
+
+	/* Clean up the test lock */
+	lock->fl.fl_owner = NULL;
+	nlmsvc_put_lockowner(test_owner);
+
 	ret = nlm_lck_denied;
 out:
 	return ret;
diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c
index ea77c66d3cc3..36245ab43ae3 100644
--- a/fs/lockd/svcproc.c
+++ b/fs/lockd/svcproc.c
@@ -76,8 +76,13 @@ nlmsvc_retrieve_args(struct svc_rqst *rqstp, struct nlm_args *argp,
 
 		/* Set up the missing parts of the file_lock structure */
 		lock->fl.fl_file  = file->f_file;
-		lock->fl.fl_owner = (fl_owner_t) host;
 		lock->fl.fl_lmops = &nlmsvc_lock_operations;
+		nlmsvc_locks_init_private(&lock->fl, host, (pid_t)lock->svid);
+		if (!lock->fl.fl_owner) {
+			/* lockowner allocation has failed */
+			nlmsvc_release_host(host);
+			return nlm_lck_denied_nolocks;
+		}
 	}
 
 	return 0;
@@ -125,6 +130,7 @@ __nlmsvc_proc_test(struct svc_rqst *rqstp, struct nlm_res *resp)
 		dprintk("lockd: TEST          status %d vers %d\n",
 			ntohl(resp->status), rqstp->rq_vers);
 
+	nlmsvc_release_lockowner(&argp->lock);
 	nlmsvc_release_host(host);
 	nlm_release_file(file);
 	return rc;
@@ -173,6 +179,7 @@ __nlmsvc_proc_lock(struct svc_rqst *rqstp, struct nlm_res *resp)
 	else
 		dprintk("lockd: LOCK         status %d\n", ntohl(resp->status));
 
+	nlmsvc_release_lockowner(&argp->lock);
 	nlmsvc_release_host(host);
 	nlm_release_file(file);
 	return rc;
@@ -210,6 +217,7 @@ __nlmsvc_proc_cancel(struct svc_rqst *rqstp, struct nlm_res *resp)
 	resp->status = cast_status(nlmsvc_cancel_blocked(net, file, &argp->lock));
 
 	dprintk("lockd: CANCEL        status %d\n", ntohl(resp->status));
+	nlmsvc_release_lockowner(&argp->lock);
 	nlmsvc_release_host(host);
 	nlm_release_file(file);
 	return rpc_success;
@@ -250,6 +258,7 @@ __nlmsvc_proc_unlock(struct svc_rqst *rqstp, struct nlm_res *resp)
 	resp->status = cast_status(nlmsvc_unlock(net, file, &argp->lock));
 
 	dprintk("lockd: UNLOCK        status %d\n", ntohl(resp->status));
+	nlmsvc_release_lockowner(&argp->lock);
 	nlmsvc_release_host(host);
 	nlm_release_file(file);
 	return rpc_success;
@@ -408,6 +417,7 @@ nlmsvc_proc_share(struct svc_rqst *rqstp)
 	resp->status = cast_status(nlmsvc_share_file(host, file, argp));
 
 	dprintk("lockd: SHARE         status %d\n", ntohl(resp->status));
+	nlmsvc_release_lockowner(&argp->lock);
 	nlmsvc_release_host(host);
 	nlm_release_file(file);
 	return rpc_success;
@@ -442,6 +452,7 @@ nlmsvc_proc_unshare(struct svc_rqst *rqstp)
 	resp->status = cast_status(nlmsvc_unshare_file(host, file, argp));
 
 	dprintk("lockd: UNSHARE       status %d\n", ntohl(resp->status));
+	nlmsvc_release_lockowner(&argp->lock);
 	nlmsvc_release_host(host);
 	nlm_release_file(file);
 	return rpc_success;
diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c
index 0e610f422406..028fc152da22 100644
--- a/fs/lockd/svcsubs.c
+++ b/fs/lockd/svcsubs.c
@@ -180,7 +180,7 @@ again:
 		/* update current lock count */
 		file->f_locks++;
 
-		lockhost = (struct nlm_host *) fl->fl_owner;
+		lockhost = ((struct nlm_lockowner *)fl->fl_owner)->host;
 		if (match(lockhost, host)) {
 			struct file_lock lock = *fl;
 
diff --git a/fs/lockd/xdr.c b/fs/lockd/xdr.c
index 7147e4aebecc..ec717ae41ee3 100644
--- a/fs/lockd/xdr.c
+++ b/fs/lockd/xdr.c
@@ -126,7 +126,6 @@ nlm_decode_lock(__be32 *p, struct nlm_lock *lock)
 	lock->svid  = ntohl(*p++);
 
 	locks_init_lock(fl);
-	fl->fl_owner = current->files;
 	fl->fl_pid   = (pid_t)lock->svid;
 	fl->fl_flags = FL_POSIX;
 	fl->fl_type  = F_RDLCK;		/* as good as anything else */
diff --git a/fs/lockd/xdr4.c b/fs/lockd/xdr4.c
index 7ed9edf9aed4..45741adfe041 100644
--- a/fs/lockd/xdr4.c
+++ b/fs/lockd/xdr4.c
@@ -118,7 +118,6 @@ nlm4_decode_lock(__be32 *p, struct nlm_lock *lock)
 	lock->svid  = ntohl(*p++);
 
 	locks_init_lock(fl);
-	fl->fl_owner = current->files;
 	fl->fl_pid   = (pid_t)lock->svid;
 	fl->fl_flags = FL_POSIX;
 	fl->fl_type  = F_RDLCK;		/* as good as anything else */
diff --git a/include/linux/lockd/lockd.h b/include/linux/lockd/lockd.h
index c9b422dde542..d294dde9e546 100644
--- a/include/linux/lockd/lockd.h
+++ b/include/linux/lockd/lockd.h
@@ -282,6 +282,7 @@ void		  nlmsvc_traverse_blocks(struct nlm_host *, struct nlm_file *,
 					nlm_host_match_fn_t match);
 void		  nlmsvc_grant_reply(struct nlm_cookie *, __be32);
 void		  nlmsvc_release_call(struct nlm_rqst *);
+void		  nlmsvc_locks_init_private(struct file_lock *, struct nlm_host *, pid_t);
 
 /*
  * File handling for the server personality
@@ -289,6 +290,7 @@ void		  nlmsvc_release_call(struct nlm_rqst *);
 __be32		  nlm_lookup_file(struct svc_rqst *, struct nlm_file **,
 					struct nfs_fh *);
 void		  nlm_release_file(struct nlm_file *);
+void		  nlmsvc_release_lockowner(struct nlm_lock *);
 void		  nlmsvc_mark_resources(struct net *);
 void		  nlmsvc_free_host_resources(struct nlm_host *);
 void		  nlmsvc_invalidate_all(void);
-- 
cgit v1.2.3


From f85d93385e9fe6886a751f647f6812a89bf6bee3 Mon Sep 17 00:00:00 2001
From: Benjamin Coddington <bcodding@redhat.com>
Date: Thu, 23 May 2019 10:45:48 -0400
Subject: locks: Cleanup lm_compare_owner and lm_owner_key

After the update to use nlm_lockowners for the NLM server, there are no
more users of lm_compare_owner and lm_owner_key.

Signed-off-by: Benjamin Coddington <bcodding@redhat.com>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 Documentation/filesystems/Locking | 14 --------------
 fs/locks.c                        |  5 -----
 include/linux/fs.h                |  2 --
 3 files changed, 21 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking
index dac435575384..204dd3ea36bb 100644
--- a/Documentation/filesystems/Locking
+++ b/Documentation/filesystems/Locking
@@ -361,8 +361,6 @@ so fl_release_private called on a lease should not block.
 
 ----------------------- lock_manager_operations ---------------------------
 prototypes:
-	int (*lm_compare_owner)(struct file_lock *, struct file_lock *);
-	unsigned long (*lm_owner_key)(struct file_lock *);
 	void (*lm_notify)(struct file_lock *);  /* unblock callback */
 	int (*lm_grant)(struct file_lock *, struct file_lock *, int);
 	void (*lm_break)(struct file_lock *); /* break_lease callback */
@@ -371,23 +369,11 @@ prototypes:
 locking rules:
 
 			inode->i_lock	blocked_lock_lock	may block
-lm_compare_owner:	yes[1]		maybe			no
-lm_owner_key		yes[1]		yes			no
 lm_notify:		yes		yes			no
 lm_grant:		no		no			no
 lm_break:		yes		no			no
 lm_change		yes		no			no
 
-[1]:	->lm_compare_owner and ->lm_owner_key are generally called with
-*an* inode->i_lock held. It may not be the i_lock of the inode
-associated with either file_lock argument! This is the case with deadlock
-detection, since the code has to chase down the owners of locks that may
-be entirely unrelated to the one on which the lock is being acquired.
-For deadlock detection however, the blocked_lock_lock is also held. The
-fact that these locks are held ensures that the file_locks do not
-disappear out from under you while doing the comparison or generating an
-owner key.
-
 --------------------------- buffer_head -----------------------------------
 prototypes:
 	void (*b_end_io)(struct buffer_head *bh, int uptodate);
diff --git a/fs/locks.c b/fs/locks.c
index ec1e4a5df629..0f85e840b2c7 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -658,9 +658,6 @@ static inline int locks_overlap(struct file_lock *fl1, struct file_lock *fl2)
  */
 static int posix_same_owner(struct file_lock *fl1, struct file_lock *fl2)
 {
-	if (fl1->fl_lmops && fl1->fl_lmops->lm_compare_owner)
-		return fl2->fl_lmops == fl1->fl_lmops &&
-			fl1->fl_lmops->lm_compare_owner(fl1, fl2);
 	return fl1->fl_owner == fl2->fl_owner;
 }
 
@@ -701,8 +698,6 @@ static void locks_delete_global_locks(struct file_lock *fl)
 static unsigned long
 posix_owner_key(struct file_lock *fl)
 {
-	if (fl->fl_lmops && fl->fl_lmops->lm_owner_key)
-		return fl->fl_lmops->lm_owner_key(fl);
 	return (unsigned long)fl->fl_owner;
 }
 
diff --git a/include/linux/fs.h b/include/linux/fs.h
index f7fdfe93e25d..0fa010bb7b6a 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1019,8 +1019,6 @@ struct file_lock_operations {
 };
 
 struct lock_manager_operations {
-	int (*lm_compare_owner)(struct file_lock *, struct file_lock *);
-	unsigned long (*lm_owner_key)(struct file_lock *);
 	fl_owner_t (*lm_get_owner)(fl_owner_t);
 	void (*lm_put_owner)(fl_owner_t);
 	void (*lm_notify)(struct file_lock *);	/* unblock callback */
-- 
cgit v1.2.3


From ea053e164cc812f0c00a58cbbf8c65e27ceb6148 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Wed, 19 Jun 2019 12:30:13 -0400
Subject: nfsd: escape high characters in binary data

I'm exposing some information about NFS clients in pseudofiles.  I
expect to eventually have simple tools to help read those pseudofiles.

But it's also helpful if the raw files are human-readable to the extent
possible.  It aids debugging and makes them usable on systems that don't
have the latest nfs-utils.

A minor challenge there is opaque client-generated protocol objects like
state owners and client identifiers.  Some clients generate those to
include handy information in plain ascii.  But they may also include
arbitrary byte sequences.

I think the simplest approach is to limit to isprint(c) && isascii(c)
and escape everything else.

That means you can just cat the file and get something that looks OK.
Also, I'm trying to keep these files legal YAML, which requires them to
be UTF-8, and this is a simple way to guarantee that.

Acked-by: Kees Cook <keescook@chromium.org>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/seq_file.c                  | 11 +++++++++++
 include/linux/seq_file.h       |  1 +
 include/linux/string_helpers.h |  3 +++
 lib/string_helpers.c           | 19 +++++++++++++++++++
 4 files changed, 34 insertions(+)

(limited to 'include/linux')

diff --git a/fs/seq_file.c b/fs/seq_file.c
index abe27ec43176..04f09689cd6d 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -384,6 +384,17 @@ void seq_escape(struct seq_file *m, const char *s, const char *esc)
 }
 EXPORT_SYMBOL(seq_escape);
 
+void seq_escape_mem_ascii(struct seq_file *m, const char *src, size_t isz)
+{
+	char *buf;
+	size_t size = seq_get_buf(m, &buf);
+	int ret;
+
+	ret = string_escape_mem_ascii(src, isz, buf, size);
+	seq_commit(m, ret < size ? ret : -1);
+}
+EXPORT_SYMBOL(seq_escape_mem_ascii);
+
 void seq_vprintf(struct seq_file *m, const char *f, va_list args)
 {
 	int len;
diff --git a/include/linux/seq_file.h b/include/linux/seq_file.h
index a121982af0f5..5998e1f4ff06 100644
--- a/include/linux/seq_file.h
+++ b/include/linux/seq_file.h
@@ -127,6 +127,7 @@ void seq_put_hex_ll(struct seq_file *m, const char *delimiter,
 		    unsigned long long v, unsigned int width);
 
 void seq_escape(struct seq_file *m, const char *s, const char *esc);
+void seq_escape_mem_ascii(struct seq_file *m, const char *src, size_t isz);
 
 void seq_hex_dump(struct seq_file *m, const char *prefix_str, int prefix_type,
 		  int rowsize, int groupsize, const void *buf, size_t len,
diff --git a/include/linux/string_helpers.h b/include/linux/string_helpers.h
index d23c5030901a..c28955132234 100644
--- a/include/linux/string_helpers.h
+++ b/include/linux/string_helpers.h
@@ -54,6 +54,9 @@ static inline int string_unescape_any_inplace(char *buf)
 int string_escape_mem(const char *src, size_t isz, char *dst, size_t osz,
 		unsigned int flags, const char *only);
 
+int string_escape_mem_ascii(const char *src, size_t isz, char *dst,
+					size_t osz);
+
 static inline int string_escape_mem_any_np(const char *src, size_t isz,
 		char *dst, size_t osz, const char *only)
 {
diff --git a/lib/string_helpers.c b/lib/string_helpers.c
index 4403e1924f73..3a90a9e2b94a 100644
--- a/lib/string_helpers.c
+++ b/lib/string_helpers.c
@@ -540,6 +540,25 @@ int string_escape_mem(const char *src, size_t isz, char *dst, size_t osz,
 }
 EXPORT_SYMBOL(string_escape_mem);
 
+int string_escape_mem_ascii(const char *src, size_t isz, char *dst,
+					size_t osz)
+{
+	char *p = dst;
+	char *end = p + osz;
+
+	while (isz--) {
+		unsigned char c = *src++;
+
+		if (!isprint(c) || !isascii(c) || c == '"' || c == '\\')
+			escape_hex(c, &p, end);
+		else
+			escape_passthrough(c, &p, end);
+	}
+
+	return p - dst;
+}
+EXPORT_SYMBOL(string_escape_mem_ascii);
+
 /*
  * Return an allocated string that has been escaped of special characters
  * and double quotes, making it safe to log in quotes.
-- 
cgit v1.2.3


From 6f4859b8a72638f60c7051247aac63a761f01933 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Wed, 19 Jun 2019 14:30:33 -0400
Subject: nfsd: create xdr_netobj_dup helper

Move some repeated code to a common helper.  No change in behavior.

Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfs4state.c        | 11 ++++-------
 include/linux/sunrpc/xdr.h |  7 +++++++
 2 files changed, 11 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index e6229bfea2b8..640cd221fc77 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -1857,7 +1857,7 @@ static struct nfs4_client *alloc_client(struct xdr_netobj name)
 	clp = kmem_cache_zalloc(client_slab, GFP_KERNEL);
 	if (clp == NULL)
 		return NULL;
-	clp->cl_name.data = kmemdup(name.data, name.len, GFP_KERNEL);
+	xdr_netobj_dup(&clp->cl_name, &name, GFP_KERNEL);
 	if (clp->cl_name.data == NULL)
 		goto err_no_name;
 	clp->cl_ownerstr_hashtbl = kmalloc_array(OWNER_HASH_SIZE,
@@ -1867,7 +1867,6 @@ static struct nfs4_client *alloc_client(struct xdr_netobj name)
 		goto err_no_hashtbl;
 	for (i = 0; i < OWNER_HASH_SIZE; i++)
 		INIT_LIST_HEAD(&clp->cl_ownerstr_hashtbl[i]);
-	clp->cl_name.len = name.len;
 	INIT_LIST_HEAD(&clp->cl_sessions);
 	idr_init(&clp->cl_stateids);
 	atomic_set(&clp->cl_rpc_users, 0);
@@ -4000,12 +3999,11 @@ static inline void *alloc_stateowner(struct kmem_cache *slab, struct xdr_netobj
 	if (!sop)
 		return NULL;
 
-	sop->so_owner.data = kmemdup(owner->data, owner->len, GFP_KERNEL);
+	xdr_netobj_dup(&sop->so_owner, owner, GFP_KERNEL);
 	if (!sop->so_owner.data) {
 		kmem_cache_free(slab, sop);
 		return NULL;
 	}
-	sop->so_owner.len = owner->len;
 
 	INIT_LIST_HEAD(&sop->so_stateids);
 	sop->so_client = clp;
@@ -6093,12 +6091,11 @@ nfs4_set_lock_denied(struct file_lock *fl, struct nfsd4_lock_denied *deny)
 
 	if (fl->fl_lmops == &nfsd_posix_mng_ops) {
 		lo = (struct nfs4_lockowner *) fl->fl_owner;
-		deny->ld_owner.data = kmemdup(lo->lo_owner.so_owner.data,
-					lo->lo_owner.so_owner.len, GFP_KERNEL);
+		xdr_netobj_dup(&deny->ld_owner, &lo->lo_owner.so_owner,
+						GFP_KERNEL);
 		if (!deny->ld_owner.data)
 			/* We just don't care that much */
 			goto nevermind;
-		deny->ld_owner.len = lo->lo_owner.so_owner.len;
 		deny->ld_clientid = lo->lo_owner.so_client->cl_clientid;
 	} else {
 nevermind:
diff --git a/include/linux/sunrpc/xdr.h b/include/linux/sunrpc/xdr.h
index 9ee3970ba59c..8a87d8bcb197 100644
--- a/include/linux/sunrpc/xdr.h
+++ b/include/linux/sunrpc/xdr.h
@@ -164,6 +164,13 @@ xdr_decode_opaque_fixed(__be32 *p, void *ptr, unsigned int len)
 	return p + XDR_QUADLEN(len);
 }
 
+static inline void xdr_netobj_dup(struct xdr_netobj *dst,
+				  struct xdr_netobj *src, gfp_t gfp_mask)
+{
+	dst->data = kmemdup(src->data, src->len, gfp_mask);
+	dst->len = src->len;
+}
+
 /*
  * Adjust kvec to reflect end of xdr'ed data (RPC client XDR)
  */
-- 
cgit v1.2.3


From f99d479bcb78ecc0243f1fcc53d7081fa150d7eb Mon Sep 17 00:00:00 2001
From: Geert Uytterhoeven <geert+renesas@glider.be>
Date: Mon, 1 Jul 2019 16:26:50 +0200
Subject: gpiolib: Document new gpio_chip.init_valid_mask field

A new field init_valid_mask was added to struct gpio_chip, but it was
not documented.

Fixes: f8ec92a9f63b3b11 ("gpiolib: Add init_valid_mask exported function")
Signed-off-by: Geert Uytterhoeven <geert+renesas@glider.be>
Link: https://lore.kernel.org/r/20190701142650.25122-1-geert+renesas@glider.be
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
---
 include/linux/gpio/driver.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h
index 5d325fd29d6b..8d58386aadd5 100644
--- a/include/linux/gpio/driver.h
+++ b/include/linux/gpio/driver.h
@@ -194,6 +194,8 @@ struct gpio_irq_chip {
  * @dbg_show: optional routine to show contents in debugfs; default code
  *	will be used when this is omitted, but custom code can show extra
  *	state (such as pullup/pulldown configuration).
+ * @init_valid_mask: optional routine to initialize @valid_mask, to be used if
+ *	not all GPIOs are valid.
  * @base: identifies the first GPIO number handled by this chip;
  *	or, if negative during registration, requests dynamic ID allocation.
  *	DEPRECATION: providing anything non-negative and nailing the base
-- 
cgit v1.2.3


From 0b07ee944701dabcddc294d903b5e8e21c2c5d95 Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Thu, 4 Jul 2019 13:06:17 +0530
Subject: PM / QOS: Pass request type to dev_pm_qos_{add|remove}_notifier()

In order to use the same set of routines to register notifiers for
different request types, update the existing
dev_pm_qos_{add|remove}_notifier() routines with an additional
parameter: request-type.

For now, it only supports resume-latency request type but will be
extended to frequency limit (min/max) constraints later on.

Reviewed-by: Matthias Kaehlcke <mka@chromium.org>
Reviewed-by: Ulf Hansson <ulf.hansson@linaro.org>
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 Documentation/power/pm_qos_interface.txt | 10 ++++++----
 drivers/base/power/domain.c              |  8 +++++---
 drivers/base/power/qos.c                 | 14 ++++++++++++--
 include/linux/pm_qos.h                   | 12 ++++++++----
 4 files changed, 31 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/power/pm_qos_interface.txt b/Documentation/power/pm_qos_interface.txt
index 19c5f7b1a7ba..ec7d662d1707 100644
--- a/Documentation/power/pm_qos_interface.txt
+++ b/Documentation/power/pm_qos_interface.txt
@@ -164,12 +164,14 @@ directory.
 Notification mechanisms:
 The per-device PM QoS framework has a per-device notification tree.
 
-int dev_pm_qos_add_notifier(device, notifier):
-Adds a notification callback function for the device.
+int dev_pm_qos_add_notifier(device, notifier, type):
+Adds a notification callback function for the device for a particular request
+type.
+
 The callback is called when the aggregated value of the device constraints list
-is changed (for resume latency device PM QoS only).
+is changed.
 
-int dev_pm_qos_remove_notifier(device, notifier):
+int dev_pm_qos_remove_notifier(device, notifier, type):
 Removes the notification callback function for the device.
 
 
diff --git a/drivers/base/power/domain.c b/drivers/base/power/domain.c
index 33c30c1e6a30..b063bc41b0a9 100644
--- a/drivers/base/power/domain.c
+++ b/drivers/base/power/domain.c
@@ -1536,7 +1536,8 @@ static int genpd_add_device(struct generic_pm_domain *genpd, struct device *dev,
 	if (ret)
 		genpd_free_dev_data(dev, gpd_data);
 	else
-		dev_pm_qos_add_notifier(dev, &gpd_data->nb);
+		dev_pm_qos_add_notifier(dev, &gpd_data->nb,
+					DEV_PM_QOS_RESUME_LATENCY);
 
 	return ret;
 }
@@ -1569,7 +1570,8 @@ static int genpd_remove_device(struct generic_pm_domain *genpd,
 
 	pdd = dev->power.subsys_data->domain_data;
 	gpd_data = to_gpd_data(pdd);
-	dev_pm_qos_remove_notifier(dev, &gpd_data->nb);
+	dev_pm_qos_remove_notifier(dev, &gpd_data->nb,
+				   DEV_PM_QOS_RESUME_LATENCY);
 
 	genpd_lock(genpd);
 
@@ -1597,7 +1599,7 @@ static int genpd_remove_device(struct generic_pm_domain *genpd,
 
  out:
 	genpd_unlock(genpd);
-	dev_pm_qos_add_notifier(dev, &gpd_data->nb);
+	dev_pm_qos_add_notifier(dev, &gpd_data->nb, DEV_PM_QOS_RESUME_LATENCY);
 
 	return ret;
 }
diff --git a/drivers/base/power/qos.c b/drivers/base/power/qos.c
index 6c91f8df1d59..cfd463212513 100644
--- a/drivers/base/power/qos.c
+++ b/drivers/base/power/qos.c
@@ -467,6 +467,7 @@ EXPORT_SYMBOL_GPL(dev_pm_qos_remove_request);
  *
  * @dev: target device for the constraint
  * @notifier: notifier block managed by caller.
+ * @type: request type.
  *
  * Will register the notifier into a notification chain that gets called
  * upon changes to the target value for the device.
@@ -474,10 +475,14 @@ EXPORT_SYMBOL_GPL(dev_pm_qos_remove_request);
  * If the device's constraints object doesn't exist when this routine is called,
  * it will be created (or error code will be returned if that fails).
  */
-int dev_pm_qos_add_notifier(struct device *dev, struct notifier_block *notifier)
+int dev_pm_qos_add_notifier(struct device *dev, struct notifier_block *notifier,
+			    enum dev_pm_qos_req_type type)
 {
 	int ret = 0;
 
+	if (WARN_ON(type != DEV_PM_QOS_RESUME_LATENCY))
+		return -EINVAL;
+
 	mutex_lock(&dev_pm_qos_mtx);
 
 	if (IS_ERR(dev->power.qos))
@@ -500,15 +505,20 @@ EXPORT_SYMBOL_GPL(dev_pm_qos_add_notifier);
  *
  * @dev: target device for the constraint
  * @notifier: notifier block to be removed.
+ * @type: request type.
  *
  * Will remove the notifier from the notification chain that gets called
  * upon changes to the target value.
  */
 int dev_pm_qos_remove_notifier(struct device *dev,
-			       struct notifier_block *notifier)
+			       struct notifier_block *notifier,
+			       enum dev_pm_qos_req_type type)
 {
 	int retval = 0;
 
+	if (WARN_ON(type != DEV_PM_QOS_RESUME_LATENCY))
+		return -EINVAL;
+
 	mutex_lock(&dev_pm_qos_mtx);
 
 	/* Silently return if the constraints object is not present. */
diff --git a/include/linux/pm_qos.h b/include/linux/pm_qos.h
index 6ea1ae373d77..58e8749ceac5 100644
--- a/include/linux/pm_qos.h
+++ b/include/linux/pm_qos.h
@@ -146,9 +146,11 @@ int dev_pm_qos_add_request(struct device *dev, struct dev_pm_qos_request *req,
 int dev_pm_qos_update_request(struct dev_pm_qos_request *req, s32 new_value);
 int dev_pm_qos_remove_request(struct dev_pm_qos_request *req);
 int dev_pm_qos_add_notifier(struct device *dev,
-			    struct notifier_block *notifier);
+			    struct notifier_block *notifier,
+			    enum dev_pm_qos_req_type type);
 int dev_pm_qos_remove_notifier(struct device *dev,
-			       struct notifier_block *notifier);
+			       struct notifier_block *notifier,
+			       enum dev_pm_qos_req_type type);
 void dev_pm_qos_constraints_init(struct device *dev);
 void dev_pm_qos_constraints_destroy(struct device *dev);
 int dev_pm_qos_add_ancestor_request(struct device *dev,
@@ -202,10 +204,12 @@ static inline int dev_pm_qos_update_request(struct dev_pm_qos_request *req,
 static inline int dev_pm_qos_remove_request(struct dev_pm_qos_request *req)
 			{ return 0; }
 static inline int dev_pm_qos_add_notifier(struct device *dev,
-					  struct notifier_block *notifier)
+					  struct notifier_block *notifier,
+					  enum dev_pm_qos_req_type type)
 			{ return 0; }
 static inline int dev_pm_qos_remove_notifier(struct device *dev,
-					     struct notifier_block *notifier)
+					     struct notifier_block *notifier,
+					     enum dev_pm_qos_req_type type)
 			{ return 0; }
 static inline void dev_pm_qos_constraints_init(struct device *dev)
 {
-- 
cgit v1.2.3


From 8262331eaaf751076fb2c781f492bafd8344591d Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Thu, 4 Jul 2019 13:06:18 +0530
Subject: PM / QOS: Rename __dev_pm_qos_read_value() and
 dev_pm_qos_raw_read_value()

dev_pm_qos_read_value() will soon need to support more constraint types
(min/max frequency) and will have another argument to it, i.e. type of
the constraint. While that is fine for the existing users of
dev_pm_qos_read_value(), it is not optimal for the callers of
__dev_pm_qos_read_value() and dev_pm_qos_raw_read_value(), as all the
callers of these two routines are only looking for the resume latency
constraint.

Let's make these two routines care only about the resume latency
constraint and rename them to __dev_pm_qos_resume_latency() and
dev_pm_qos_raw_resume_latency().

Suggested-by: Rafael J. Wysocki <rjw@rjwysocki.net>
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/base/power/domain_governor.c |  2 +-
 drivers/base/power/qos.c             | 13 +++++++++----
 drivers/base/power/runtime.c         |  2 +-
 drivers/cpuidle/governor.c           |  2 +-
 include/linux/pm_qos.h               |  8 ++++----
 5 files changed, 16 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/base/power/domain_governor.c b/drivers/base/power/domain_governor.c
index 3838045c9277..20e56a5be01f 100644
--- a/drivers/base/power/domain_governor.c
+++ b/drivers/base/power/domain_governor.c
@@ -66,7 +66,7 @@ static bool default_suspend_ok(struct device *dev)
 	td->constraint_changed = false;
 	td->cached_suspend_ok = false;
 	td->effective_constraint_ns = 0;
-	constraint_ns = __dev_pm_qos_read_value(dev);
+	constraint_ns = __dev_pm_qos_resume_latency(dev);
 
 	spin_unlock_irqrestore(&dev->power.lock, flags);
 
diff --git a/drivers/base/power/qos.c b/drivers/base/power/qos.c
index cfd463212513..7a0d197f0809 100644
--- a/drivers/base/power/qos.c
+++ b/drivers/base/power/qos.c
@@ -90,16 +90,16 @@ enum pm_qos_flags_status dev_pm_qos_flags(struct device *dev, s32 mask)
 EXPORT_SYMBOL_GPL(dev_pm_qos_flags);
 
 /**
- * __dev_pm_qos_read_value - Get PM QoS constraint for a given device.
+ * __dev_pm_qos_resume_latency - Get resume latency constraint for a given device.
  * @dev: Device to get the PM QoS constraint value for.
  *
  * This routine must be called with dev->power.lock held.
  */
-s32 __dev_pm_qos_read_value(struct device *dev)
+s32 __dev_pm_qos_resume_latency(struct device *dev)
 {
 	lockdep_assert_held(&dev->power.lock);
 
-	return dev_pm_qos_raw_read_value(dev);
+	return dev_pm_qos_raw_resume_latency(dev);
 }
 
 /**
@@ -112,7 +112,12 @@ s32 dev_pm_qos_read_value(struct device *dev)
 	s32 ret;
 
 	spin_lock_irqsave(&dev->power.lock, flags);
-	ret = __dev_pm_qos_read_value(dev);
+
+	if (IS_ERR_OR_NULL(dev->power.qos))
+		ret = PM_QOS_RESUME_LATENCY_NO_CONSTRAINT;
+	else
+		ret = pm_qos_read_value(&dev->power.qos->resume_latency);
+
 	spin_unlock_irqrestore(&dev->power.lock, flags);
 
 	return ret;
diff --git a/drivers/base/power/runtime.c b/drivers/base/power/runtime.c
index 952a1e7057c7..b75335508d2c 100644
--- a/drivers/base/power/runtime.c
+++ b/drivers/base/power/runtime.c
@@ -275,7 +275,7 @@ static int rpm_check_suspend_allowed(struct device *dev)
 	    || (dev->power.request_pending
 			&& dev->power.request == RPM_REQ_RESUME))
 		retval = -EAGAIN;
-	else if (__dev_pm_qos_read_value(dev) == 0)
+	else if (__dev_pm_qos_resume_latency(dev) == 0)
 		retval = -EPERM;
 	else if (dev->power.runtime_status == RPM_SUSPENDED)
 		retval = 1;
diff --git a/drivers/cpuidle/governor.c b/drivers/cpuidle/governor.c
index 9fddf828a76f..2e3e14192bee 100644
--- a/drivers/cpuidle/governor.c
+++ b/drivers/cpuidle/governor.c
@@ -110,7 +110,7 @@ int cpuidle_governor_latency_req(unsigned int cpu)
 {
 	int global_req = pm_qos_request(PM_QOS_CPU_DMA_LATENCY);
 	struct device *device = get_cpu_device(cpu);
-	int device_req = dev_pm_qos_raw_read_value(device);
+	int device_req = dev_pm_qos_raw_resume_latency(device);
 
 	return device_req < global_req ? device_req : global_req;
 }
diff --git a/include/linux/pm_qos.h b/include/linux/pm_qos.h
index 58e8749ceac5..5e09d4980786 100644
--- a/include/linux/pm_qos.h
+++ b/include/linux/pm_qos.h
@@ -139,7 +139,7 @@ s32 pm_qos_read_value(struct pm_qos_constraints *c);
 #ifdef CONFIG_PM
 enum pm_qos_flags_status __dev_pm_qos_flags(struct device *dev, s32 mask);
 enum pm_qos_flags_status dev_pm_qos_flags(struct device *dev, s32 mask);
-s32 __dev_pm_qos_read_value(struct device *dev);
+s32 __dev_pm_qos_resume_latency(struct device *dev);
 s32 dev_pm_qos_read_value(struct device *dev);
 int dev_pm_qos_add_request(struct device *dev, struct dev_pm_qos_request *req,
 			   enum dev_pm_qos_req_type type, s32 value);
@@ -176,7 +176,7 @@ static inline s32 dev_pm_qos_requested_flags(struct device *dev)
 	return dev->power.qos->flags_req->data.flr.flags;
 }
 
-static inline s32 dev_pm_qos_raw_read_value(struct device *dev)
+static inline s32 dev_pm_qos_raw_resume_latency(struct device *dev)
 {
 	return IS_ERR_OR_NULL(dev->power.qos) ?
 		PM_QOS_RESUME_LATENCY_NO_CONSTRAINT :
@@ -189,7 +189,7 @@ static inline enum pm_qos_flags_status __dev_pm_qos_flags(struct device *dev,
 static inline enum pm_qos_flags_status dev_pm_qos_flags(struct device *dev,
 							s32 mask)
 			{ return PM_QOS_FLAGS_UNDEFINED; }
-static inline s32 __dev_pm_qos_read_value(struct device *dev)
+static inline s32 __dev_pm_qos_resume_latency(struct device *dev)
 			{ return PM_QOS_RESUME_LATENCY_NO_CONSTRAINT; }
 static inline s32 dev_pm_qos_read_value(struct device *dev)
 			{ return PM_QOS_RESUME_LATENCY_NO_CONSTRAINT; }
@@ -245,7 +245,7 @@ static inline s32 dev_pm_qos_requested_resume_latency(struct device *dev)
 	return PM_QOS_RESUME_LATENCY_NO_CONSTRAINT;
 }
 static inline s32 dev_pm_qos_requested_flags(struct device *dev) { return 0; }
-static inline s32 dev_pm_qos_raw_read_value(struct device *dev)
+static inline s32 dev_pm_qos_raw_resume_latency(struct device *dev)
 {
 	return PM_QOS_RESUME_LATENCY_NO_CONSTRAINT;
 }
-- 
cgit v1.2.3


From 2a79ea5ec53973c8711b54d33ace5c77659dc8f8 Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Thu, 4 Jul 2019 13:06:19 +0530
Subject: PM / QOS: Pass request type to dev_pm_qos_read_value()

In order to allow dev_pm_qos_read_value() to read values for different
QoS requests, pass request type as a parameter to these routines.

For now, it only supports resume-latency request type but will be
extended to frequency limit (min/max) constraints later on.

Reviewed-by: Matthias Kaehlcke <mka@chromium.org>
Reviewed-by: Ulf Hansson <ulf.hansson@linaro.org>
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 Documentation/power/pm_qos_interface.txt |  2 +-
 drivers/base/power/domain_governor.c     |  2 +-
 drivers/base/power/qos.c                 | 17 ++++++++++++-----
 include/linux/pm_qos.h                   | 16 +++++++++++++---
 4 files changed, 27 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/power/pm_qos_interface.txt b/Documentation/power/pm_qos_interface.txt
index ec7d662d1707..cfcb1df39799 100644
--- a/Documentation/power/pm_qos_interface.txt
+++ b/Documentation/power/pm_qos_interface.txt
@@ -123,7 +123,7 @@ Will remove the element.  After removal it will update the aggregate target and
 call the notification trees if the target was changed as a result of removing
 the request.
 
-s32 dev_pm_qos_read_value(device):
+s32 dev_pm_qos_read_value(device, type):
 Returns the aggregated value for a given device's constraints list.
 
 enum pm_qos_flags_status dev_pm_qos_flags(device, mask)
diff --git a/drivers/base/power/domain_governor.c b/drivers/base/power/domain_governor.c
index 20e56a5be01f..daa8c7689f7e 100644
--- a/drivers/base/power/domain_governor.c
+++ b/drivers/base/power/domain_governor.c
@@ -33,7 +33,7 @@ static int dev_update_qos_constraint(struct device *dev, void *data)
 		 * take its current PM QoS constraint (that's the only thing
 		 * known at this point anyway).
 		 */
-		constraint_ns = dev_pm_qos_read_value(dev);
+		constraint_ns = dev_pm_qos_read_value(dev, DEV_PM_QOS_RESUME_LATENCY);
 		constraint_ns *= NSEC_PER_USEC;
 	}
 
diff --git a/drivers/base/power/qos.c b/drivers/base/power/qos.c
index 7a0d197f0809..2461fed0efa0 100644
--- a/drivers/base/power/qos.c
+++ b/drivers/base/power/qos.c
@@ -105,18 +105,25 @@ s32 __dev_pm_qos_resume_latency(struct device *dev)
 /**
  * dev_pm_qos_read_value - Get PM QoS constraint for a given device (locked).
  * @dev: Device to get the PM QoS constraint value for.
+ * @type: QoS request type.
  */
-s32 dev_pm_qos_read_value(struct device *dev)
+s32 dev_pm_qos_read_value(struct device *dev, enum dev_pm_qos_req_type type)
 {
+	struct dev_pm_qos *qos = dev->power.qos;
 	unsigned long flags;
 	s32 ret;
 
 	spin_lock_irqsave(&dev->power.lock, flags);
 
-	if (IS_ERR_OR_NULL(dev->power.qos))
-		ret = PM_QOS_RESUME_LATENCY_NO_CONSTRAINT;
-	else
-		ret = pm_qos_read_value(&dev->power.qos->resume_latency);
+	switch (type) {
+	case DEV_PM_QOS_RESUME_LATENCY:
+		ret = IS_ERR_OR_NULL(qos) ? PM_QOS_RESUME_LATENCY_NO_CONSTRAINT
+			: pm_qos_read_value(&qos->resume_latency);
+		break;
+	default:
+		WARN_ON(1);
+		ret = 0;
+	}
 
 	spin_unlock_irqrestore(&dev->power.lock, flags);
 
diff --git a/include/linux/pm_qos.h b/include/linux/pm_qos.h
index 5e09d4980786..9a21b7ba72ae 100644
--- a/include/linux/pm_qos.h
+++ b/include/linux/pm_qos.h
@@ -140,7 +140,7 @@ s32 pm_qos_read_value(struct pm_qos_constraints *c);
 enum pm_qos_flags_status __dev_pm_qos_flags(struct device *dev, s32 mask);
 enum pm_qos_flags_status dev_pm_qos_flags(struct device *dev, s32 mask);
 s32 __dev_pm_qos_resume_latency(struct device *dev);
-s32 dev_pm_qos_read_value(struct device *dev);
+s32 dev_pm_qos_read_value(struct device *dev, enum dev_pm_qos_req_type type);
 int dev_pm_qos_add_request(struct device *dev, struct dev_pm_qos_request *req,
 			   enum dev_pm_qos_req_type type, s32 value);
 int dev_pm_qos_update_request(struct dev_pm_qos_request *req, s32 new_value);
@@ -191,8 +191,18 @@ static inline enum pm_qos_flags_status dev_pm_qos_flags(struct device *dev,
 			{ return PM_QOS_FLAGS_UNDEFINED; }
 static inline s32 __dev_pm_qos_resume_latency(struct device *dev)
 			{ return PM_QOS_RESUME_LATENCY_NO_CONSTRAINT; }
-static inline s32 dev_pm_qos_read_value(struct device *dev)
-			{ return PM_QOS_RESUME_LATENCY_NO_CONSTRAINT; }
+static inline s32 dev_pm_qos_read_value(struct device *dev,
+					enum dev_pm_qos_req_type type)
+{
+	switch (type) {
+	case DEV_PM_QOS_RESUME_LATENCY:
+		return PM_QOS_RESUME_LATENCY_NO_CONSTRAINT;
+	default:
+		WARN_ON(1);
+		return 0;
+	}
+}
+
 static inline int dev_pm_qos_add_request(struct device *dev,
 					 struct dev_pm_qos_request *req,
 					 enum dev_pm_qos_req_type type,
-- 
cgit v1.2.3


From 208637b37824c8956fe28d277835a403ee35fa84 Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Thu, 4 Jul 2019 13:06:20 +0530
Subject: PM / QoS: Add support for MIN/MAX frequency constraints

This patch introduces the min-frequency and max-frequency device
constraints, which will be used by the cpufreq core to begin with.

Reviewed-by: Matthias Kaehlcke <mka@chromium.org>
Reviewed-by: Ulf Hansson <ulf.hansson@linaro.org>
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/base/power/qos.c | 111 +++++++++++++++++++++++++++++++++++++++++------
 include/linux/pm_qos.h   |  12 +++++
 2 files changed, 109 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/base/power/qos.c b/drivers/base/power/qos.c
index 2461fed0efa0..6c90fd7e2ff8 100644
--- a/drivers/base/power/qos.c
+++ b/drivers/base/power/qos.c
@@ -120,6 +120,14 @@ s32 dev_pm_qos_read_value(struct device *dev, enum dev_pm_qos_req_type type)
 		ret = IS_ERR_OR_NULL(qos) ? PM_QOS_RESUME_LATENCY_NO_CONSTRAINT
 			: pm_qos_read_value(&qos->resume_latency);
 		break;
+	case DEV_PM_QOS_MIN_FREQUENCY:
+		ret = IS_ERR_OR_NULL(qos) ? PM_QOS_MIN_FREQUENCY_DEFAULT_VALUE
+			: pm_qos_read_value(&qos->min_frequency);
+		break;
+	case DEV_PM_QOS_MAX_FREQUENCY:
+		ret = IS_ERR_OR_NULL(qos) ? PM_QOS_MAX_FREQUENCY_DEFAULT_VALUE
+			: pm_qos_read_value(&qos->max_frequency);
+		break;
 	default:
 		WARN_ON(1);
 		ret = 0;
@@ -161,6 +169,14 @@ static int apply_constraint(struct dev_pm_qos_request *req,
 			req->dev->power.set_latency_tolerance(req->dev, value);
 		}
 		break;
+	case DEV_PM_QOS_MIN_FREQUENCY:
+		ret = pm_qos_update_target(&qos->min_frequency,
+					   &req->data.pnode, action, value);
+		break;
+	case DEV_PM_QOS_MAX_FREQUENCY:
+		ret = pm_qos_update_target(&qos->max_frequency,
+					   &req->data.pnode, action, value);
+		break;
 	case DEV_PM_QOS_FLAGS:
 		ret = pm_qos_update_flags(&qos->flags, &req->data.flr,
 					  action, value);
@@ -189,12 +205,11 @@ static int dev_pm_qos_constraints_allocate(struct device *dev)
 	if (!qos)
 		return -ENOMEM;
 
-	n = kzalloc(sizeof(*n), GFP_KERNEL);
+	n = kzalloc(3 * sizeof(*n), GFP_KERNEL);
 	if (!n) {
 		kfree(qos);
 		return -ENOMEM;
 	}
-	BLOCKING_INIT_NOTIFIER_HEAD(n);
 
 	c = &qos->resume_latency;
 	plist_head_init(&c->list);
@@ -203,6 +218,7 @@ static int dev_pm_qos_constraints_allocate(struct device *dev)
 	c->no_constraint_value = PM_QOS_RESUME_LATENCY_NO_CONSTRAINT;
 	c->type = PM_QOS_MIN;
 	c->notifiers = n;
+	BLOCKING_INIT_NOTIFIER_HEAD(n);
 
 	c = &qos->latency_tolerance;
 	plist_head_init(&c->list);
@@ -211,6 +227,24 @@ static int dev_pm_qos_constraints_allocate(struct device *dev)
 	c->no_constraint_value = PM_QOS_LATENCY_TOLERANCE_NO_CONSTRAINT;
 	c->type = PM_QOS_MIN;
 
+	c = &qos->min_frequency;
+	plist_head_init(&c->list);
+	c->target_value = PM_QOS_MIN_FREQUENCY_DEFAULT_VALUE;
+	c->default_value = PM_QOS_MIN_FREQUENCY_DEFAULT_VALUE;
+	c->no_constraint_value = PM_QOS_MIN_FREQUENCY_DEFAULT_VALUE;
+	c->type = PM_QOS_MAX;
+	c->notifiers = ++n;
+	BLOCKING_INIT_NOTIFIER_HEAD(n);
+
+	c = &qos->max_frequency;
+	plist_head_init(&c->list);
+	c->target_value = PM_QOS_MAX_FREQUENCY_DEFAULT_VALUE;
+	c->default_value = PM_QOS_MAX_FREQUENCY_DEFAULT_VALUE;
+	c->no_constraint_value = PM_QOS_MAX_FREQUENCY_DEFAULT_VALUE;
+	c->type = PM_QOS_MIN;
+	c->notifiers = ++n;
+	BLOCKING_INIT_NOTIFIER_HEAD(n);
+
 	INIT_LIST_HEAD(&qos->flags.list);
 
 	spin_lock_irq(&dev->power.lock);
@@ -264,11 +298,25 @@ void dev_pm_qos_constraints_destroy(struct device *dev)
 		apply_constraint(req, PM_QOS_REMOVE_REQ, PM_QOS_DEFAULT_VALUE);
 		memset(req, 0, sizeof(*req));
 	}
+
 	c = &qos->latency_tolerance;
 	plist_for_each_entry_safe(req, tmp, &c->list, data.pnode) {
 		apply_constraint(req, PM_QOS_REMOVE_REQ, PM_QOS_DEFAULT_VALUE);
 		memset(req, 0, sizeof(*req));
 	}
+
+	c = &qos->min_frequency;
+	plist_for_each_entry_safe(req, tmp, &c->list, data.pnode) {
+		apply_constraint(req, PM_QOS_REMOVE_REQ, PM_QOS_MIN_FREQUENCY_DEFAULT_VALUE);
+		memset(req, 0, sizeof(*req));
+	}
+
+	c = &qos->max_frequency;
+	plist_for_each_entry_safe(req, tmp, &c->list, data.pnode) {
+		apply_constraint(req, PM_QOS_REMOVE_REQ, PM_QOS_MAX_FREQUENCY_DEFAULT_VALUE);
+		memset(req, 0, sizeof(*req));
+	}
+
 	f = &qos->flags;
 	list_for_each_entry_safe(req, tmp, &f->list, data.flr.node) {
 		apply_constraint(req, PM_QOS_REMOVE_REQ, PM_QOS_DEFAULT_VALUE);
@@ -380,6 +428,8 @@ static int __dev_pm_qos_update_request(struct dev_pm_qos_request *req,
 	switch(req->type) {
 	case DEV_PM_QOS_RESUME_LATENCY:
 	case DEV_PM_QOS_LATENCY_TOLERANCE:
+	case DEV_PM_QOS_MIN_FREQUENCY:
+	case DEV_PM_QOS_MAX_FREQUENCY:
 		curr_value = req->data.pnode.prio;
 		break;
 	case DEV_PM_QOS_FLAGS:
@@ -492,9 +542,6 @@ int dev_pm_qos_add_notifier(struct device *dev, struct notifier_block *notifier,
 {
 	int ret = 0;
 
-	if (WARN_ON(type != DEV_PM_QOS_RESUME_LATENCY))
-		return -EINVAL;
-
 	mutex_lock(&dev_pm_qos_mtx);
 
 	if (IS_ERR(dev->power.qos))
@@ -502,10 +549,28 @@ int dev_pm_qos_add_notifier(struct device *dev, struct notifier_block *notifier,
 	else if (!dev->power.qos)
 		ret = dev_pm_qos_constraints_allocate(dev);
 
-	if (!ret)
+	if (ret)
+		goto unlock;
+
+	switch (type) {
+	case DEV_PM_QOS_RESUME_LATENCY:
 		ret = blocking_notifier_chain_register(dev->power.qos->resume_latency.notifiers,
 						       notifier);
+		break;
+	case DEV_PM_QOS_MIN_FREQUENCY:
+		ret = blocking_notifier_chain_register(dev->power.qos->min_frequency.notifiers,
+						       notifier);
+		break;
+	case DEV_PM_QOS_MAX_FREQUENCY:
+		ret = blocking_notifier_chain_register(dev->power.qos->max_frequency.notifiers,
+						       notifier);
+		break;
+	default:
+		WARN_ON(1);
+		ret = -EINVAL;
+	}
 
+unlock:
 	mutex_unlock(&dev_pm_qos_mtx);
 	return ret;
 }
@@ -526,20 +591,35 @@ int dev_pm_qos_remove_notifier(struct device *dev,
 			       struct notifier_block *notifier,
 			       enum dev_pm_qos_req_type type)
 {
-	int retval = 0;
-
-	if (WARN_ON(type != DEV_PM_QOS_RESUME_LATENCY))
-		return -EINVAL;
+	int ret = 0;
 
 	mutex_lock(&dev_pm_qos_mtx);
 
 	/* Silently return if the constraints object is not present. */
-	if (!IS_ERR_OR_NULL(dev->power.qos))
-		retval = blocking_notifier_chain_unregister(dev->power.qos->resume_latency.notifiers,
-							    notifier);
+	if (IS_ERR_OR_NULL(dev->power.qos))
+		goto unlock;
+
+	switch (type) {
+	case DEV_PM_QOS_RESUME_LATENCY:
+		ret = blocking_notifier_chain_unregister(dev->power.qos->resume_latency.notifiers,
+							 notifier);
+		break;
+	case DEV_PM_QOS_MIN_FREQUENCY:
+		ret = blocking_notifier_chain_unregister(dev->power.qos->min_frequency.notifiers,
+							 notifier);
+		break;
+	case DEV_PM_QOS_MAX_FREQUENCY:
+		ret = blocking_notifier_chain_unregister(dev->power.qos->max_frequency.notifiers,
+							 notifier);
+		break;
+	default:
+		WARN_ON(1);
+		ret = -EINVAL;
+	}
 
+unlock:
 	mutex_unlock(&dev_pm_qos_mtx);
-	return retval;
+	return ret;
 }
 EXPORT_SYMBOL_GPL(dev_pm_qos_remove_notifier);
 
@@ -599,6 +679,9 @@ static void __dev_pm_qos_drop_user_request(struct device *dev,
 		req = dev->power.qos->flags_req;
 		dev->power.qos->flags_req = NULL;
 		break;
+	default:
+		WARN_ON(1);
+		return;
 	}
 	__dev_pm_qos_remove_request(req);
 	kfree(req);
diff --git a/include/linux/pm_qos.h b/include/linux/pm_qos.h
index 9a21b7ba72ae..2aebbc5b9950 100644
--- a/include/linux/pm_qos.h
+++ b/include/linux/pm_qos.h
@@ -40,6 +40,8 @@ enum pm_qos_flags_status {
 #define PM_QOS_RESUME_LATENCY_NO_CONSTRAINT	PM_QOS_LATENCY_ANY
 #define PM_QOS_RESUME_LATENCY_NO_CONSTRAINT_NS	PM_QOS_LATENCY_ANY_NS
 #define PM_QOS_LATENCY_TOLERANCE_DEFAULT_VALUE	0
+#define PM_QOS_MIN_FREQUENCY_DEFAULT_VALUE	0
+#define PM_QOS_MAX_FREQUENCY_DEFAULT_VALUE	(-1)
 #define PM_QOS_LATENCY_TOLERANCE_NO_CONSTRAINT	(-1)
 
 #define PM_QOS_FLAG_NO_POWER_OFF	(1 << 0)
@@ -58,6 +60,8 @@ struct pm_qos_flags_request {
 enum dev_pm_qos_req_type {
 	DEV_PM_QOS_RESUME_LATENCY = 1,
 	DEV_PM_QOS_LATENCY_TOLERANCE,
+	DEV_PM_QOS_MIN_FREQUENCY,
+	DEV_PM_QOS_MAX_FREQUENCY,
 	DEV_PM_QOS_FLAGS,
 };
 
@@ -99,10 +103,14 @@ struct pm_qos_flags {
 struct dev_pm_qos {
 	struct pm_qos_constraints resume_latency;
 	struct pm_qos_constraints latency_tolerance;
+	struct pm_qos_constraints min_frequency;
+	struct pm_qos_constraints max_frequency;
 	struct pm_qos_flags flags;
 	struct dev_pm_qos_request *resume_latency_req;
 	struct dev_pm_qos_request *latency_tolerance_req;
 	struct dev_pm_qos_request *flags_req;
+	struct dev_pm_qos_request *min_frequency_req;
+	struct dev_pm_qos_request *max_frequency_req;
 };
 
 /* Action requested to pm_qos_update_target */
@@ -197,6 +205,10 @@ static inline s32 dev_pm_qos_read_value(struct device *dev,
 	switch (type) {
 	case DEV_PM_QOS_RESUME_LATENCY:
 		return PM_QOS_RESUME_LATENCY_NO_CONSTRAINT;
+	case DEV_PM_QOS_MIN_FREQUENCY:
+		return PM_QOS_MIN_FREQUENCY_DEFAULT_VALUE;
+	case DEV_PM_QOS_MAX_FREQUENCY:
+		return PM_QOS_MAX_FREQUENCY_DEFAULT_VALUE;
 	default:
 		WARN_ON(1);
 		return 0;
-- 
cgit v1.2.3


From 02bd45a28bf32993e396fdcfd7d7c7cdc0847ed1 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Thu, 4 Jul 2019 01:05:38 +0200
Subject: PM: sleep: Drop dev_pm_skip_next_resume_phases()

After recent hibernation-related changes, there are no more callers
of dev_pm_skip_next_resume_phases() except for the PM core itself
in which it is more straightforward to run the statements from
that function directly, so do that and drop it.

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Reviewed-by: Mika Westerberg <mika.westerberg@linux.intel.com>
---
 drivers/base/power/main.c | 19 +++----------------
 include/linux/pm.h        |  1 -
 2 files changed, 3 insertions(+), 17 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c
index 1e84b8aa220f..7fb2c39bc725 100644
--- a/drivers/base/power/main.c
+++ b/drivers/base/power/main.c
@@ -529,21 +529,6 @@ static void dpm_watchdog_clear(struct dpm_watchdog *wd)
 
 /*------------------------- Resume routines -------------------------*/
 
-/**
- * dev_pm_skip_next_resume_phases - Skip next system resume phases for device.
- * @dev: Target device.
- *
- * Make the core skip the "early resume" and "resume" phases for @dev.
- *
- * This function can be called by middle-layer code during the "noirq" phase of
- * system resume if necessary, but not by device drivers.
- */
-void dev_pm_skip_next_resume_phases(struct device *dev)
-{
-	dev->power.is_late_suspended = false;
-	dev->power.is_suspended = false;
-}
-
 /**
  * suspend_event - Return a "suspend" message for given "resume" one.
  * @resume_msg: PM message representing a system-wide resume transition.
@@ -681,6 +666,9 @@ Skip:
 	dev->power.is_noirq_suspended = false;
 
 	if (skip_resume) {
+		/* Make the next phases of resume skip the device. */
+		dev->power.is_late_suspended = false;
+		dev->power.is_suspended = false;
 		/*
 		 * The device is going to be left in suspend, but it might not
 		 * have been in runtime suspend before the system suspended, so
@@ -689,7 +677,6 @@ Skip:
 		 * device again.
 		 */
 		pm_runtime_set_suspended(dev);
-		dev_pm_skip_next_resume_phases(dev);
 	}
 
 Out:
diff --git a/include/linux/pm.h b/include/linux/pm.h
index 345d74a727e3..283fb3defe56 100644
--- a/include/linux/pm.h
+++ b/include/linux/pm.h
@@ -760,7 +760,6 @@ extern int pm_generic_poweroff_late(struct device *dev);
 extern int pm_generic_poweroff(struct device *dev);
 extern void pm_generic_complete(struct device *dev);
 
-extern void dev_pm_skip_next_resume_phases(struct device *dev);
 extern bool dev_pm_may_skip_resume(struct device *dev);
 extern bool dev_pm_smart_suspend_and_suspended(struct device *dev);
 
-- 
cgit v1.2.3


From 332694f8a4f7e49b8b7278734d0ce331f954b20e Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Date: Thu, 4 Jul 2019 13:02:09 +0200
Subject: Revert "usb:gadget Separated decoding functions from dwc3 driver."

This reverts commit 3db1b636c07e15ff7410db782832dc2e7ffd2bce.

It's broken.

Reported-by: Stephen Rothwell <sfr@canb.auug.org.au>
Cc: Felipe Balbi <balbi@kernel.org>
Cc: Pawel Laszczak <pawell@cadence.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/usb/dwc3/debug.h    | 252 +++++++++++++++++++++++++++++++++++++++++
 drivers/usb/dwc3/trace.h    |   2 +-
 drivers/usb/gadget/Makefile |   1 -
 drivers/usb/gadget/debug.c  | 268 --------------------------------------------
 include/linux/usb/gadget.h  |  26 -----
 5 files changed, 253 insertions(+), 296 deletions(-)
 delete mode 100644 drivers/usb/gadget/debug.c

(limited to 'include/linux')

diff --git a/drivers/usb/dwc3/debug.h b/drivers/usb/dwc3/debug.h
index 9baabed87d61..068259fdfb0c 100644
--- a/drivers/usb/dwc3/debug.h
+++ b/drivers/usb/dwc3/debug.h
@@ -246,6 +246,258 @@ static inline const char *dwc3_gadget_event_string(char *str, size_t size,
 	return str;
 }
 
+static inline void dwc3_decode_get_status(__u8 t, __u16 i, __u16 l, char *str,
+		size_t size)
+{
+	switch (t & USB_RECIP_MASK) {
+	case USB_RECIP_DEVICE:
+		snprintf(str, size, "Get Device Status(Length = %d)", l);
+		break;
+	case USB_RECIP_INTERFACE:
+		snprintf(str, size, "Get Interface Status(Intf = %d, Length = %d)",
+				i, l);
+		break;
+	case USB_RECIP_ENDPOINT:
+		snprintf(str, size, "Get Endpoint Status(ep%d%s)",
+			i & ~USB_DIR_IN,
+			i & USB_DIR_IN ? "in" : "out");
+		break;
+	}
+}
+
+static inline void dwc3_decode_set_clear_feature(__u8 t, __u8 b, __u16 v,
+		__u16 i, char *str, size_t size)
+{
+	switch (t & USB_RECIP_MASK) {
+	case USB_RECIP_DEVICE:
+		snprintf(str, size, "%s Device Feature(%s%s)",
+			b == USB_REQ_CLEAR_FEATURE ? "Clear" : "Set",
+			({char *s;
+				switch (v) {
+				case USB_DEVICE_SELF_POWERED:
+					s = "Self Powered";
+					break;
+				case USB_DEVICE_REMOTE_WAKEUP:
+					s = "Remote Wakeup";
+					break;
+				case USB_DEVICE_TEST_MODE:
+					s = "Test Mode";
+					break;
+				case USB_DEVICE_U1_ENABLE:
+					s = "U1 Enable";
+					break;
+				case USB_DEVICE_U2_ENABLE:
+					s = "U2 Enable";
+					break;
+				case USB_DEVICE_LTM_ENABLE:
+					s = "LTM Enable";
+					break;
+				default:
+					s = "UNKNOWN";
+				} s; }),
+			v == USB_DEVICE_TEST_MODE ?
+			({ char *s;
+				switch (i) {
+				case TEST_J:
+					s = ": TEST_J";
+					break;
+				case TEST_K:
+					s = ": TEST_K";
+					break;
+				case TEST_SE0_NAK:
+					s = ": TEST_SE0_NAK";
+					break;
+				case TEST_PACKET:
+					s = ": TEST_PACKET";
+					break;
+				case TEST_FORCE_EN:
+					s = ": TEST_FORCE_EN";
+					break;
+				default:
+					s = ": UNKNOWN";
+				} s; }) : "");
+		break;
+	case USB_RECIP_INTERFACE:
+		snprintf(str, size, "%s Interface Feature(%s)",
+			b == USB_REQ_CLEAR_FEATURE ? "Clear" : "Set",
+			v == USB_INTRF_FUNC_SUSPEND ?
+			"Function Suspend" : "UNKNOWN");
+		break;
+	case USB_RECIP_ENDPOINT:
+		snprintf(str, size, "%s Endpoint Feature(%s ep%d%s)",
+			b == USB_REQ_CLEAR_FEATURE ? "Clear" : "Set",
+			v == USB_ENDPOINT_HALT ? "Halt" : "UNKNOWN",
+			i & ~USB_DIR_IN,
+			i & USB_DIR_IN ? "in" : "out");
+		break;
+	}
+}
+
+static inline void dwc3_decode_set_address(__u16 v, char *str, size_t size)
+{
+	snprintf(str, size, "Set Address(Addr = %02x)", v);
+}
+
+static inline void dwc3_decode_get_set_descriptor(__u8 t, __u8 b, __u16 v,
+		__u16 i, __u16 l, char *str, size_t size)
+{
+	snprintf(str, size, "%s %s Descriptor(Index = %d, Length = %d)",
+		b == USB_REQ_GET_DESCRIPTOR ? "Get" : "Set",
+		({ char *s;
+			switch (v >> 8) {
+			case USB_DT_DEVICE:
+				s = "Device";
+				break;
+			case USB_DT_CONFIG:
+				s = "Configuration";
+				break;
+			case USB_DT_STRING:
+				s = "String";
+				break;
+			case USB_DT_INTERFACE:
+				s = "Interface";
+				break;
+			case USB_DT_ENDPOINT:
+				s = "Endpoint";
+				break;
+			case USB_DT_DEVICE_QUALIFIER:
+				s = "Device Qualifier";
+				break;
+			case USB_DT_OTHER_SPEED_CONFIG:
+				s = "Other Speed Config";
+				break;
+			case USB_DT_INTERFACE_POWER:
+				s = "Interface Power";
+				break;
+			case USB_DT_OTG:
+				s = "OTG";
+				break;
+			case USB_DT_DEBUG:
+				s = "Debug";
+				break;
+			case USB_DT_INTERFACE_ASSOCIATION:
+				s = "Interface Association";
+				break;
+			case USB_DT_BOS:
+				s = "BOS";
+				break;
+			case USB_DT_DEVICE_CAPABILITY:
+				s = "Device Capability";
+				break;
+			case USB_DT_PIPE_USAGE:
+				s = "Pipe Usage";
+				break;
+			case USB_DT_SS_ENDPOINT_COMP:
+				s = "SS Endpoint Companion";
+				break;
+			case USB_DT_SSP_ISOC_ENDPOINT_COMP:
+				s = "SSP Isochronous Endpoint Companion";
+				break;
+			default:
+				s = "UNKNOWN";
+				break;
+			} s; }), v & 0xff, l);
+}
+
+
+static inline void dwc3_decode_get_configuration(__u16 l, char *str,
+		size_t size)
+{
+	snprintf(str, size, "Get Configuration(Length = %d)", l);
+}
+
+static inline void dwc3_decode_set_configuration(__u8 v, char *str, size_t size)
+{
+	snprintf(str, size, "Set Configuration(Config = %d)", v);
+}
+
+static inline void dwc3_decode_get_intf(__u16 i, __u16 l, char *str,
+		size_t size)
+{
+	snprintf(str, size, "Get Interface(Intf = %d, Length = %d)", i, l);
+}
+
+static inline void dwc3_decode_set_intf(__u8 v, __u16 i, char *str, size_t size)
+{
+	snprintf(str, size, "Set Interface(Intf = %d, Alt.Setting = %d)", i, v);
+}
+
+static inline void dwc3_decode_synch_frame(__u16 i, __u16 l, char *str,
+		size_t size)
+{
+	snprintf(str, size, "Synch Frame(Endpoint = %d, Length = %d)", i, l);
+}
+
+static inline void dwc3_decode_set_sel(__u16 l, char *str, size_t size)
+{
+	snprintf(str, size, "Set SEL(Length = %d)", l);
+}
+
+static inline void dwc3_decode_set_isoch_delay(__u8 v, char *str, size_t size)
+{
+	snprintf(str, size, "Set Isochronous Delay(Delay = %d ns)", v);
+}
+
+/**
+ * dwc3_decode_ctrl - returns a string representation of ctrl request
+ */
+static inline const char *dwc3_decode_ctrl(char *str, size_t size,
+		__u8 bRequestType, __u8 bRequest, __u16 wValue, __u16 wIndex,
+		__u16 wLength)
+{
+	switch (bRequest) {
+	case USB_REQ_GET_STATUS:
+		dwc3_decode_get_status(bRequestType, wIndex, wLength, str,
+				size);
+		break;
+	case USB_REQ_CLEAR_FEATURE:
+	case USB_REQ_SET_FEATURE:
+		dwc3_decode_set_clear_feature(bRequestType, bRequest, wValue,
+				wIndex, str, size);
+		break;
+	case USB_REQ_SET_ADDRESS:
+		dwc3_decode_set_address(wValue, str, size);
+		break;
+	case USB_REQ_GET_DESCRIPTOR:
+	case USB_REQ_SET_DESCRIPTOR:
+		dwc3_decode_get_set_descriptor(bRequestType, bRequest, wValue,
+				wIndex, wLength, str, size);
+		break;
+	case USB_REQ_GET_CONFIGURATION:
+		dwc3_decode_get_configuration(wLength, str, size);
+		break;
+	case USB_REQ_SET_CONFIGURATION:
+		dwc3_decode_set_configuration(wValue, str, size);
+		break;
+	case USB_REQ_GET_INTERFACE:
+		dwc3_decode_get_intf(wIndex, wLength, str, size);
+		break;
+	case USB_REQ_SET_INTERFACE:
+		dwc3_decode_set_intf(wValue, wIndex, str, size);
+		break;
+	case USB_REQ_SYNCH_FRAME:
+		dwc3_decode_synch_frame(wIndex, wLength, str, size);
+		break;
+	case USB_REQ_SET_SEL:
+		dwc3_decode_set_sel(wLength, str, size);
+		break;
+	case USB_REQ_SET_ISOCH_DELAY:
+		dwc3_decode_set_isoch_delay(wValue, str, size);
+		break;
+	default:
+		snprintf(str, size, "%02x %02x %02x %02x %02x %02x %02x %02x",
+			bRequestType, bRequest,
+			cpu_to_le16(wValue) & 0xff,
+			cpu_to_le16(wValue) >> 8,
+			cpu_to_le16(wIndex) & 0xff,
+			cpu_to_le16(wIndex) >> 8,
+			cpu_to_le16(wLength) & 0xff,
+			cpu_to_le16(wLength) >> 8);
+	}
+
+	return str;
+}
+
 /**
  * dwc3_ep_event_string - returns event name
  * @event: then event code
diff --git a/drivers/usb/dwc3/trace.h b/drivers/usb/dwc3/trace.h
index 9edff17111f7..818a63da1a44 100644
--- a/drivers/usb/dwc3/trace.h
+++ b/drivers/usb/dwc3/trace.h
@@ -86,7 +86,7 @@ DECLARE_EVENT_CLASS(dwc3_log_ctrl,
 		__entry->wIndex = le16_to_cpu(ctrl->wIndex);
 		__entry->wLength = le16_to_cpu(ctrl->wLength);
 	),
-	TP_printk("%s", usb_decode_ctrl(__get_str(str), DWC3_MSG_MAX,
+	TP_printk("%s", dwc3_decode_ctrl(__get_str(str), DWC3_MSG_MAX,
 					__entry->bRequestType,
 					__entry->bRequest, __entry->wValue,
 					__entry->wIndex, __entry->wLength)
diff --git a/drivers/usb/gadget/Makefile b/drivers/usb/gadget/Makefile
index 500a5a592abe..130dad7130b6 100644
--- a/drivers/usb/gadget/Makefile
+++ b/drivers/usb/gadget/Makefile
@@ -9,6 +9,5 @@ ccflags-y				+= -I$(srctree)/drivers/usb/gadget/udc
 obj-$(CONFIG_USB_LIBCOMPOSITE)	+= libcomposite.o
 libcomposite-y			:= usbstring.o config.o epautoconf.o
 libcomposite-y			+= composite.o functions.o configfs.o u_f.o
-libcomposite-y			+= debug.o
 
 obj-$(CONFIG_USB_GADGET)	+= udc/ function/ legacy/
diff --git a/drivers/usb/gadget/debug.c b/drivers/usb/gadget/debug.c
deleted file mode 100644
index d5a469bc67a3..000000000000
--- a/drivers/usb/gadget/debug.c
+++ /dev/null
@@ -1,268 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/**
- * Common USB debugging functions
- *
- * Copyright (C) 2010-2011 Texas Instruments Incorporated - http://www.ti.com
- *
- * Authors: Felipe Balbi <balbi@ti.com>,
- *	    Sebastian Andrzej Siewior <bigeasy@linutronix.de>
- */
-
-#include <linux/usb/ch9.h>
-
-static void usb_decode_get_status(__u8 bRequestType, __u16 wIndex,
-				  __u16 wLength, char *str, size_t size)
-{
-	switch (bRequestType & USB_RECIP_MASK) {
-	case USB_RECIP_DEVICE:
-		snprintf(str, size, "Get Device Status(Length = %d)", wLength);
-		break;
-	case USB_RECIP_INTERFACE:
-		snprintf(str, size,
-			 "Get Interface Status(Intf = %d, Length = %d)",
-			 wIndex, wLength);
-		break;
-	case USB_RECIP_ENDPOINT:
-		snprintf(str, size, "Get Endpoint Status(ep%d%s)",
-			 wIndex & ~USB_DIR_IN,
-			 wIndex & USB_DIR_IN ? "in" : "out");
-		break;
-	}
-}
-
-static void usb_decode_set_clear_feature(__u8 bRequestType, __u8 bRequest,
-					 __u16 wValue, __u16 wIndex,
-					 char *str, size_t size)
-{
-	switch (bRequestType & USB_RECIP_MASK) {
-	case USB_RECIP_DEVICE:
-		snprintf(str, size, "%s Device Feature(%s%s)",
-			 bRequest == USB_REQ_CLEAR_FEATURE ? "Clear" : "Set",
-			 ({char *s;
-				switch (wValue) {
-				case USB_DEVICE_SELF_POWERED:
-					s = "Self Powered";
-					break;
-				case USB_DEVICE_REMOTE_WAKEUP:
-					s = "Remote Wakeup";
-					break;
-				case USB_DEVICE_TEST_MODE:
-					s = "Test Mode";
-					break;
-				case USB_DEVICE_U1_ENABLE:
-					s = "U1 Enable";
-					break;
-				case USB_DEVICE_U2_ENABLE:
-					s = "U2 Enable";
-					break;
-				case USB_DEVICE_LTM_ENABLE:
-					s = "LTM Enable";
-					break;
-				default:
-					s = "UNKNOWN";
-				} s; }),
-			 wValue == USB_DEVICE_TEST_MODE ?
-			 ({ char *s;
-				switch (wIndex) {
-				case TEST_J:
-					s = ": TEST_J";
-					break;
-				case TEST_K:
-					s = ": TEST_K";
-					break;
-				case TEST_SE0_NAK:
-					s = ": TEST_SE0_NAK";
-					break;
-				case TEST_PACKET:
-					s = ": TEST_PACKET";
-					break;
-				case TEST_FORCE_EN:
-					s = ": TEST_FORCE_EN";
-					break;
-				default:
-					s = ": UNKNOWN";
-				} s; }) : "");
-		break;
-	case USB_RECIP_INTERFACE:
-		snprintf(str, size, "%s Interface Feature(%s)",
-			 bRequest == USB_REQ_CLEAR_FEATURE ? "Clear" : "Set",
-			 wValue == USB_INTRF_FUNC_SUSPEND ?
-			 "Function Suspend" : "UNKNOWN");
-		break;
-	case USB_RECIP_ENDPOINT:
-		snprintf(str, size, "%s Endpoint Feature(%s ep%d%s)",
-			 bRequest == USB_REQ_CLEAR_FEATURE ? "Clear" : "Set",
-			 wValue == USB_ENDPOINT_HALT ? "Halt" : "UNKNOWN",
-			 wIndex & ~USB_DIR_IN,
-			 wIndex & USB_DIR_IN ? "in" : "out");
-		break;
-	}
-}
-
-static void usb_decode_set_address(__u16 wValue, char *str, size_t size)
-{
-	snprintf(str, size, "Set Address(Addr = %02x)", wValue);
-}
-
-static void usb_decode_get_set_descriptor(__u8 bRequestType, __u8 bRequest,
-					  __u16 wValue, __u16 wIndex,
-					  __u16 wLength, char *str, size_t size)
-{
-	snprintf(str, size, "%s %s Descriptor(Index = %d, Length = %d)",
-		 bRequest == USB_REQ_GET_DESCRIPTOR ? "Get" : "Set",
-		 ({ char *s;
-			switch (wValue >> 8) {
-			case USB_DT_DEVICE:
-				s = "Device";
-				break;
-			case USB_DT_CONFIG:
-				s = "Configuration";
-				break;
-			case USB_DT_STRING:
-				s = "String";
-				break;
-			case USB_DT_INTERFACE:
-				s = "Interface";
-				break;
-			case USB_DT_ENDPOINT:
-				s = "Endpoint";
-				break;
-			case USB_DT_DEVICE_QUALIFIER:
-				s = "Device Qualifier";
-				break;
-			case USB_DT_OTHER_SPEED_CONFIG:
-				s = "Other Speed Config";
-				break;
-			case USB_DT_INTERFACE_POWER:
-				s = "Interface Power";
-				break;
-			case USB_DT_OTG:
-				s = "OTG";
-				break;
-			case USB_DT_DEBUG:
-				s = "Debug";
-				break;
-			case USB_DT_INTERFACE_ASSOCIATION:
-				s = "Interface Association";
-				break;
-			case USB_DT_BOS:
-				s = "BOS";
-				break;
-			case USB_DT_DEVICE_CAPABILITY:
-				s = "Device Capability";
-				break;
-			case USB_DT_PIPE_USAGE:
-				s = "Pipe Usage";
-				break;
-			case USB_DT_SS_ENDPOINT_COMP:
-				s = "SS Endpoint Companion";
-				break;
-			case USB_DT_SSP_ISOC_ENDPOINT_COMP:
-				s = "SSP Isochronous Endpoint Companion";
-				break;
-			default:
-				s = "UNKNOWN";
-				break;
-			} s; }), wValue & 0xff, wLength);
-}
-
-static void usb_decode_get_configuration(__u16 wLength, char *str, size_t size)
-{
-	snprintf(str, size, "Get Configuration(Length = %d)", wLength);
-}
-
-static void usb_decode_set_configuration(__u8 wValue, char *str, size_t size)
-{
-	snprintf(str, size, "Set Configuration(Config = %d)", wValue);
-}
-
-static void usb_decode_get_intf(__u16 wIndex, __u16 wLength, char *str,
-				size_t size)
-{
-	snprintf(str, size, "Get Interface(Intf = %d, Length = %d)",
-		 wIndex, wLength);
-}
-
-static void usb_decode_set_intf(__u8 wValue, __u16 wIndex, char *str,
-				size_t size)
-{
-	snprintf(str, size, "Set Interface(Intf = %d, Alt.Setting = %d)",
-		 wIndex, wValue);
-}
-
-static void usb_decode_synch_frame(__u16 wIndex, __u16 wLength,
-				   char *str, size_t size)
-{
-	snprintf(str, size, "Synch Frame(Endpoint = %d, Length = %d)",
-		 wIndex, wLength);
-}
-
-static void usb_decode_set_sel(__u16 wLength, char *str, size_t size)
-{
-	snprintf(str, size, "Set SEL(Length = %d)", wLength);
-}
-
-static void usb_decode_set_isoch_delay(__u8 wValue, char *str, size_t size)
-{
-	snprintf(str, size, "Set Isochronous Delay(Delay = %d ns)", wValue);
-}
-
-/**
- * usb_decode_ctrl - returns a string representation of ctrl request
- */
-const char *usb_decode_ctrl(char *str, size_t size, __u8 bRequestType,
-			    __u8 bRequest, __u16 wValue, __u16 wIndex,
-			    __u16 wLength)
-{
-	switch (bRequest) {
-	case USB_REQ_GET_STATUS:
-		usb_decode_get_status(bRequestType, wIndex, wLength, str, size);
-		break;
-	case USB_REQ_CLEAR_FEATURE:
-	case USB_REQ_SET_FEATURE:
-		usb_decode_set_clear_feature(bRequestType, bRequest, wValue,
-					     wIndex, str, size);
-		break;
-	case USB_REQ_SET_ADDRESS:
-		usb_decode_set_address(wValue, str, size);
-		break;
-	case USB_REQ_GET_DESCRIPTOR:
-	case USB_REQ_SET_DESCRIPTOR:
-		usb_decode_get_set_descriptor(bRequestType, bRequest, wValue,
-					      wIndex, wLength, str, size);
-		break;
-	case USB_REQ_GET_CONFIGURATION:
-		usb_decode_get_configuration(wLength, str, size);
-		break;
-	case USB_REQ_SET_CONFIGURATION:
-		usb_decode_set_configuration(wValue, str, size);
-		break;
-	case USB_REQ_GET_INTERFACE:
-		usb_decode_get_intf(wIndex, wLength, str, size);
-		break;
-	case USB_REQ_SET_INTERFACE:
-		usb_decode_set_intf(wValue, wIndex, str, size);
-		break;
-	case USB_REQ_SYNCH_FRAME:
-		usb_decode_synch_frame(wIndex, wLength, str, size);
-		break;
-	case USB_REQ_SET_SEL:
-		usb_decode_set_sel(wLength, str, size);
-		break;
-	case USB_REQ_SET_ISOCH_DELAY:
-		usb_decode_set_isoch_delay(wValue, str, size);
-		break;
-	default:
-		snprintf(str, size, "%02x %02x %02x %02x %02x %02x %02x %02x",
-			 bRequestType, bRequest,
-			 (u8)(cpu_to_le16(wValue) & 0xff),
-			 (u8)(cpu_to_le16(wValue) >> 8),
-			 (u8)(cpu_to_le16(wIndex) & 0xff),
-			 (u8)(cpu_to_le16(wIndex) >> 8),
-			 (u8)(cpu_to_le16(wLength) & 0xff),
-			 (u8)(cpu_to_le16(wLength) >> 8));
-	}
-
-	return str;
-}
-EXPORT_SYMBOL_GPL(usb_decode_ctrl);
diff --git a/include/linux/usb/gadget.h b/include/linux/usb/gadget.h
index 42902fcc8696..fb19141151d8 100644
--- a/include/linux/usb/gadget.h
+++ b/include/linux/usb/gadget.h
@@ -889,30 +889,4 @@ extern void usb_ep_autoconfig_release(struct usb_ep *);
 
 extern void usb_ep_autoconfig_reset(struct usb_gadget *);
 
-/*-------------------------------------------------------------------------*/
-/**
- * usb_decode_ctrl - Returns human readable representation of control request.
- * @str: buffer to return a human-readable representation of control request.
- *       This buffer should have about 200 bytes.
- * @size: size of str buffer.
- * @bRequestType: matches the USB bmRequestType field
- * @bRequest: matches the USB bRequest field
- * @wValue: matches the USB wValue field (CPU byte order)
- * @wIndex: matches the USB wIndex field (CPU byte order)
- * @wLength: matches the USB wLength field (CPU byte order)
- *
- * Function returns decoded, formatted and human-readable description of
- * control request packet.
- *
- * The usage scenario for this is for tracepoints, so function as a return
- * use the same value as in parameters. This approach allows to use this
- * function in TP_printk
- *
- * Important: wValue, wIndex, wLength parameters before invoking this function
- * should be processed by le16_to_cpu macro.
- */
-extern const char *usb_decode_ctrl(char *str, size_t size, __u8 bRequestType,
-			__u8 bRequest, __u16 wValue, __u16 wIndex,
-			__u16 wLength);
-
 #endif /* __LINUX_USB_GADGET_H */
-- 
cgit v1.2.3


From 2da8d9473e20a2f6645dcb0cea4848a2c1e83af9 Mon Sep 17 00:00:00 2001
From: Bartosz Golaszewski <bgolaszewski@baylibre.com>
Date: Wed, 3 Jul 2019 18:10:34 +0200
Subject: regulator: implement selector stepping

Some regulators require that the requested voltage be reached gradually
by setting all or some of the intermediate values. Implement a new field
in the regulator description struct that allows users to specify the
number of selectors by which the regulator API should step when ramping
the voltage up/down.

Signed-off-by: Bartosz Golaszewski <bgolaszewski@baylibre.com>
Link: https://lore.kernel.org/r/20190703161035.31808-2-brgl@bgdev.pl
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 drivers/regulator/core.c         | 63 ++++++++++++++++++++++++++++++++++++++++
 include/linux/regulator/driver.h |  6 ++++
 2 files changed, 69 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/regulator/core.c b/drivers/regulator/core.c
index 9d3ed13b7f12..df82e2a8442a 100644
--- a/drivers/regulator/core.c
+++ b/drivers/regulator/core.c
@@ -3106,6 +3106,66 @@ static int _regulator_call_set_voltage_sel(struct regulator_dev *rdev,
 	return ret;
 }
 
+static int _regulator_set_voltage_sel_step(struct regulator_dev *rdev,
+					   int uV, int new_selector)
+{
+	const struct regulator_ops *ops = rdev->desc->ops;
+	int diff, old_sel, curr_sel, ret;
+
+	/* Stepping is only needed if the regulator is enabled. */
+	if (!_regulator_is_enabled(rdev))
+		goto final_set;
+
+	if (!ops->get_voltage_sel)
+		return -EINVAL;
+
+	old_sel = ops->get_voltage_sel(rdev);
+	if (old_sel < 0)
+		return old_sel;
+
+	diff = new_selector - old_sel;
+	if (diff == 0)
+		return 0; /* No change needed. */
+
+	if (diff > 0) {
+		/* Stepping up. */
+		for (curr_sel = old_sel + rdev->desc->vsel_step;
+		     curr_sel < new_selector;
+		     curr_sel += rdev->desc->vsel_step) {
+			/*
+			 * Call the callback directly instead of using
+			 * _regulator_call_set_voltage_sel() as we don't
+			 * want to notify anyone yet. Same in the branch
+			 * below.
+			 */
+			ret = ops->set_voltage_sel(rdev, curr_sel);
+			if (ret)
+				goto try_revert;
+		}
+	} else {
+		/* Stepping down. */
+		for (curr_sel = old_sel - rdev->desc->vsel_step;
+		     curr_sel > new_selector;
+		     curr_sel -= rdev->desc->vsel_step) {
+			ret = ops->set_voltage_sel(rdev, curr_sel);
+			if (ret)
+				goto try_revert;
+		}
+	}
+
+final_set:
+	/* The final selector will trigger the notifiers. */
+	return _regulator_call_set_voltage_sel(rdev, uV, new_selector);
+
+try_revert:
+	/*
+	 * At least try to return to the previous voltage if setting a new
+	 * one failed.
+	 */
+	(void)ops->set_voltage_sel(rdev, old_sel);
+	return ret;
+}
+
 static int _regulator_set_voltage_time(struct regulator_dev *rdev,
 				       int old_uV, int new_uV)
 {
@@ -3179,6 +3239,9 @@ static int _regulator_do_set_voltage(struct regulator_dev *rdev,
 				selector = ret;
 				if (old_selector == selector)
 					ret = 0;
+				else if (rdev->desc->vsel_step)
+					ret = _regulator_set_voltage_sel_step(
+						rdev, best_val, selector);
 				else
 					ret = _regulator_call_set_voltage_sel(
 						rdev, best_val, selector);
diff --git a/include/linux/regulator/driver.h b/include/linux/regulator/driver.h
index 377da2357118..f0d7b0496e54 100644
--- a/include/linux/regulator/driver.h
+++ b/include/linux/regulator/driver.h
@@ -286,6 +286,11 @@ enum regulator_type {
  * @vsel_range_mask: Mask for register bitfield used for range selector
  * @vsel_reg: Register for selector when using regulator_regmap_X_voltage_
  * @vsel_mask: Mask for register bitfield used for selector
+ * @vsel_step: Specify the resolution of selector stepping when setting
+ *	       voltage. If 0, then no stepping is done (requested selector is
+ *	       set directly); if >0, then the regulator API will ramp the
+ *	       voltage up/down gradually, each time increasing/decreasing the
+ *	       selector by the specified step value.
  * @csel_reg: Register for current limit selector using regmap set_current_limit
  * @csel_mask: Mask for register bitfield used for current limit selector
  * @apply_reg: Register for initiate voltage change on the output when
@@ -360,6 +365,7 @@ struct regulator_desc {
 	unsigned int vsel_range_mask;
 	unsigned int vsel_reg;
 	unsigned int vsel_mask;
+	unsigned int vsel_step;
 	unsigned int csel_reg;
 	unsigned int csel_mask;
 	unsigned int apply_reg;
-- 
cgit v1.2.3


From f8efee08dd9d41ab71010e9b16c9ead51753b7d6 Mon Sep 17 00:00:00 2001
From: Mark Zhang <markz@mellanox.com>
Date: Tue, 2 Jul 2019 13:02:30 +0300
Subject: net/mlx5: Add rts2rts_qp_counters_set_id field in hca cap

Add rts2rts_qp_counters_set_id field in hca cap so that RTS2RTS
qp modification can be used to change the counter of a QP.

Signed-off-by: Mark Zhang <markz@mellanox.com>
Reviewed-by: Majd Dibbiny <majd@mellanox.com>
Acked-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
---
 include/linux/mlx5/mlx5_ifc.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index f03ec31e3232..06881b79167e 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -1096,7 +1096,9 @@ struct mlx5_ifc_cmd_hca_cap_bits {
 	u8         cc_modify_allowed[0x1];
 	u8         start_pad[0x1];
 	u8         cache_line_128byte[0x1];
-	u8         reserved_at_165[0xa];
+	u8         reserved_at_165[0x4];
+	u8         rts2rts_qp_counters_set_id[0x1];
+	u8         reserved_at_16a[0x5];
 	u8         qcam_reg[0x1];
 	u8         gid_table_size[0x10];
 
-- 
cgit v1.2.3


From 2ac295d4f0c095310addbcb03d91d2a4c9f7d435 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sat, 1 Jun 2019 20:48:55 -0400
Subject: convenience helper get_tree_nodev()

counterpart of mount_nodev().  Switch hugetlb and pseudo to it.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/hugetlbfs/inode.c       | 2 +-
 fs/libfs.c                 | 2 +-
 fs/super.c                 | 8 ++++++++
 include/linux/fs_context.h | 3 +++
 4 files changed, 13 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 1dcc57189382..a478df035651 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -1299,7 +1299,7 @@ static int hugetlbfs_get_tree(struct fs_context *fc)
 	int err = hugetlbfs_validate(fc);
 	if (err)
 		return err;
-	return vfs_get_super(fc, vfs_get_independent_super, hugetlbfs_fill_super);
+	return get_tree_nodev(fc, hugetlbfs_fill_super);
 }
 
 static void hugetlbfs_fs_context_free(struct fs_context *fc)
diff --git a/fs/libfs.c b/fs/libfs.c
index 7e6811ba4edd..c9463dc6a5d4 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -270,7 +270,7 @@ static int pseudo_fs_fill_super(struct super_block *s, struct fs_context *fc)
 
 static int pseudo_fs_get_tree(struct fs_context *fc)
 {
-	return vfs_get_super(fc, vfs_get_independent_super, pseudo_fs_fill_super);
+	return get_tree_nodev(fc, pseudo_fs_fill_super);
 }
 
 static void pseudo_fs_free(struct fs_context *fc)
diff --git a/fs/super.c b/fs/super.c
index ca2302501d32..3318225b0878 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -1198,6 +1198,14 @@ int vfs_get_super(struct fs_context *fc,
 }
 EXPORT_SYMBOL(vfs_get_super);
 
+int get_tree_nodev(struct fs_context *fc,
+		  int (*fill_super)(struct super_block *sb,
+				    struct fs_context *fc))
+{
+	return vfs_get_super(fc, vfs_get_independent_super, fill_super);
+}
+EXPORT_SYMBOL(get_tree_nodev);
+
 #ifdef CONFIG_BLOCK
 static int set_bdev_super(struct super_block *s, void *data)
 {
diff --git a/include/linux/fs_context.h b/include/linux/fs_context.h
index c995b852ba40..38b1ec918a4e 100644
--- a/include/linux/fs_context.h
+++ b/include/linux/fs_context.h
@@ -151,6 +151,9 @@ extern int vfs_get_super(struct fs_context *fc,
 			 enum vfs_get_super_keying keying,
 			 int (*fill_super)(struct super_block *sb,
 					   struct fs_context *fc));
+extern int get_tree_nodev(struct fs_context *fc,
+			 int (*fill_super)(struct super_block *sb,
+					   struct fs_context *fc));
 
 extern const struct file_operations fscontext_fops;
 
-- 
cgit v1.2.3


From c23a0bbab30cc1714b6b1d6a1c153a5ccab3f0d8 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 22 May 2019 21:23:39 -0400
Subject: convenience helper: get_tree_single()

counterpart of mount_single(); switch fusectl to it

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/fuse/control.c          | 2 +-
 fs/super.c                 | 8 ++++++++
 include/linux/fs_context.h | 3 +++
 3 files changed, 12 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/fs/fuse/control.c b/fs/fuse/control.c
index 14ce1e47f980..c23f6f243ad4 100644
--- a/fs/fuse/control.c
+++ b/fs/fuse/control.c
@@ -346,7 +346,7 @@ static int fuse_ctl_fill_super(struct super_block *sb, struct fs_context *fctx)
 
 static int fuse_ctl_get_tree(struct fs_context *fc)
 {
-	return vfs_get_super(fc, vfs_get_single_super, fuse_ctl_fill_super);
+	return get_tree_single(fc, fuse_ctl_fill_super);
 }
 
 static const struct fs_context_operations fuse_ctl_context_ops = {
diff --git a/fs/super.c b/fs/super.c
index 3318225b0878..113c58f19425 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -1206,6 +1206,14 @@ int get_tree_nodev(struct fs_context *fc,
 }
 EXPORT_SYMBOL(get_tree_nodev);
 
+int get_tree_single(struct fs_context *fc,
+		  int (*fill_super)(struct super_block *sb,
+				    struct fs_context *fc))
+{
+	return vfs_get_super(fc, vfs_get_single_super, fill_super);
+}
+EXPORT_SYMBOL(get_tree_single);
+
 #ifdef CONFIG_BLOCK
 static int set_bdev_super(struct super_block *s, void *data)
 {
diff --git a/include/linux/fs_context.h b/include/linux/fs_context.h
index 38b1ec918a4e..1775969e000d 100644
--- a/include/linux/fs_context.h
+++ b/include/linux/fs_context.h
@@ -154,6 +154,9 @@ extern int vfs_get_super(struct fs_context *fc,
 extern int get_tree_nodev(struct fs_context *fc,
 			 int (*fill_super)(struct super_block *sb,
 					   struct fs_context *fc));
+extern int get_tree_single(struct fs_context *fc,
+			 int (*fill_super)(struct super_block *sb,
+					   struct fs_context *fc));
 
 extern const struct file_operations fscontext_fops;
 
-- 
cgit v1.2.3


From 14a253ce4210cd2ef133b392062477e9d656db4a Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 30 May 2019 15:59:57 -0400
Subject: init_rootfs(): don't bother with init_ramfs_fs()

the only thing done by the latter is making ramfs visible
to mount(2); we don't need it there - rootfs is separate
and, in fact, made visible to mount(2) in the same init_rootfs().

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/ramfs/inode.c      | 6 +-----
 include/linux/ramfs.h | 1 -
 init/do_mounts.c      | 2 --
 3 files changed, 1 insertion(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c
index 11201b2d06b9..733c6b4193dc 100644
--- a/fs/ramfs/inode.c
+++ b/fs/ramfs/inode.c
@@ -266,12 +266,8 @@ static struct file_system_type ramfs_fs_type = {
 	.fs_flags	= FS_USERNS_MOUNT,
 };
 
-int __init init_ramfs_fs(void)
+static int __init init_ramfs_fs(void)
 {
-	static unsigned long once;
-
-	if (test_and_set_bit(0, &once))
-		return 0;
 	return register_filesystem(&ramfs_fs_type);
 }
 fs_initcall(init_ramfs_fs);
diff --git a/include/linux/ramfs.h b/include/linux/ramfs.h
index 5ef7d54caac2..ee582bdb7fda 100644
--- a/include/linux/ramfs.h
+++ b/include/linux/ramfs.h
@@ -19,7 +19,6 @@ extern int ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize);
 
 extern const struct file_operations ramfs_file_operations;
 extern const struct vm_operations_struct generic_file_vm_ops;
-extern int __init init_ramfs_fs(void);
 
 int ramfs_fill_super(struct super_block *sb, void *data, int silent);
 
diff --git a/init/do_mounts.c b/init/do_mounts.c
index f8c230c77035..c170d8b309b1 100644
--- a/init/do_mounts.c
+++ b/init/do_mounts.c
@@ -658,8 +658,6 @@ int __init init_rootfs(void)
 		(!root_fs_names || strstr(root_fs_names, "tmpfs"))) {
 		err = shmem_init();
 		is_tmpfs = true;
-	} else {
-		err = init_ramfs_fs();
 	}
 
 	if (err)
-- 
cgit v1.2.3


From fd3e007f6c6a0f677e4ee8aca4b9bab8ad6cab9a Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 30 May 2019 17:48:35 -0400
Subject: don't bother with registering rootfs

init_mount_tree() can get to rootfs_fs_type directly and that simplifies
a lot of things.  We don't need to register it, we don't need to look
it up *and* we don't need to bother with preventing subsequent userland
mounts.  That's the way we should've done that from the very beginning.

There is a user-visible change, namely the disappearance of "rootfs"
from /proc/filesystems.  Note that it's been unmountable all along
and it didn't show up in /proc/mounts; however, it *is* a user-visible
change and theoretically some script might've been using its presence
in /proc/filesystems to tell 2.4.11+ from earlier kernels.

*IF* any complaints about behaviour change do show up, we could fake
it in /proc/filesystems.  I very much doubt we'll have to, though.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namespace.c       |  7 +------
 include/linux/init.h |  3 +++
 init/do_mounts.c     | 15 ++-------------
 3 files changed, 6 insertions(+), 19 deletions(-)

(limited to 'include/linux')

diff --git a/fs/namespace.c b/fs/namespace.c
index 1141641dff96..2db2f4c36c50 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -3686,13 +3686,8 @@ static void __init init_mount_tree(void)
 	struct mount *m;
 	struct mnt_namespace *ns;
 	struct path root;
-	struct file_system_type *type;
 
-	type = get_fs_type("rootfs");
-	if (!type)
-		panic("Can't find rootfs type");
-	mnt = vfs_kern_mount(type, 0, "rootfs", NULL);
-	put_filesystem(type);
+	mnt = vfs_kern_mount(&rootfs_fs_type, 0, "rootfs", NULL);
 	if (IS_ERR(mnt))
 		panic("Can't create rootfs");
 
diff --git a/include/linux/init.h b/include/linux/init.h
index 5255069f5a9f..cbe93521397e 100644
--- a/include/linux/init.h
+++ b/include/linux/init.h
@@ -137,6 +137,8 @@ extern initcall_entry_t __con_initcall_start[], __con_initcall_end[];
 /* Used for contructor calls. */
 typedef void (*ctor_fn_t)(void);
 
+struct file_system_type;
+
 /* Defined in init/main.c */
 extern int do_one_initcall(initcall_t fn);
 extern char __initdata boot_command_line[];
@@ -147,6 +149,7 @@ extern unsigned int reset_devices;
 void setup_arch(char **);
 void prepare_namespace(void);
 int __init init_rootfs(void);
+extern struct file_system_type rootfs_fs_type;
 
 #if defined(CONFIG_STRICT_KERNEL_RWX) || defined(CONFIG_STRICT_MODULE_RWX)
 extern bool rodata_enabled;
diff --git a/init/do_mounts.c b/init/do_mounts.c
index c170d8b309b1..e7f0b0f18cce 100644
--- a/init/do_mounts.c
+++ b/init/do_mounts.c
@@ -629,19 +629,15 @@ static bool is_tmpfs;
 static struct dentry *rootfs_mount(struct file_system_type *fs_type,
 	int flags, const char *dev_name, void *data)
 {
-	static unsigned long once;
 	void *fill = ramfs_fill_super;
 
-	if (test_and_set_bit(0, &once))
-		return ERR_PTR(-ENODEV);
-
 	if (IS_ENABLED(CONFIG_TMPFS) && is_tmpfs)
 		fill = shmem_fill_super;
 
 	return mount_nodev(fs_type, flags, data, fill);
 }
 
-static struct file_system_type rootfs_fs_type = {
+struct file_system_type rootfs_fs_type = {
 	.name		= "rootfs",
 	.mount		= rootfs_mount,
 	.kill_sb	= kill_litter_super,
@@ -649,19 +645,12 @@ static struct file_system_type rootfs_fs_type = {
 
 int __init init_rootfs(void)
 {
-	int err = register_filesystem(&rootfs_fs_type);
-
-	if (err)
-		return err;
+	int err = 0;
 
 	if (IS_ENABLED(CONFIG_TMPFS) && !saved_root_name[0] &&
 		(!root_fs_names || strstr(root_fs_names, "tmpfs"))) {
 		err = shmem_init();
 		is_tmpfs = true;
 	}
-
-	if (err)
-		unregister_filesystem(&rootfs_fs_type);
-
 	return err;
 }
-- 
cgit v1.2.3


From 33488845f211afcdb7e5c00a3152890e06cdc78e Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 31 May 2019 20:09:15 -0400
Subject: constify ksys_mount() string arguments

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 drivers/base/devtmpfs.c  | 3 +--
 fs/namespace.c           | 4 ++--
 include/linux/syscalls.h | 4 ++--
 3 files changed, 5 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/base/devtmpfs.c b/drivers/base/devtmpfs.c
index 0dbc43068eeb..ba5c80903efe 100644
--- a/drivers/base/devtmpfs.c
+++ b/drivers/base/devtmpfs.c
@@ -357,8 +357,7 @@ int devtmpfs_mount(const char *mntdir)
 	if (!thread)
 		return 0;
 
-	err = ksys_mount("devtmpfs", (char *)mntdir, "devtmpfs", MS_SILENT,
-			 NULL);
+	err = ksys_mount("devtmpfs", mntdir, "devtmpfs", MS_SILENT, NULL);
 	if (err)
 		printk(KERN_INFO "devtmpfs: error mounting %i\n", err);
 	else
diff --git a/fs/namespace.c b/fs/namespace.c
index 2db2f4c36c50..e272c2403014 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -3295,8 +3295,8 @@ struct dentry *mount_subtree(struct vfsmount *m, const char *name)
 }
 EXPORT_SYMBOL(mount_subtree);
 
-int ksys_mount(char __user *dev_name, char __user *dir_name, char __user *type,
-	       unsigned long flags, void __user *data)
+int ksys_mount(const char __user *dev_name, const char __user *dir_name,
+	       const char __user *type, unsigned long flags, void __user *data)
 {
 	int ret;
 	char *kernel_type;
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index e2870fe1be5b..2a0ac10a6f95 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -1228,8 +1228,8 @@ asmlinkage long sys_ni_syscall(void);
  * the ksys_xyzyyz() functions prototyped below.
  */
 
-int ksys_mount(char __user *dev_name, char __user *dir_name, char __user *type,
-	       unsigned long flags, void __user *data);
+int ksys_mount(const char __user *dev_name, const char __user *dir_name,
+	       const char __user *type, unsigned long flags, void __user *data);
 int ksys_umount(char __user *name, int flags);
 int ksys_dup(unsigned int fildes);
 int ksys_chroot(const char __user *filename);
-- 
cgit v1.2.3


From 037f11b4752f717201143a1dc5d6acf3cb71ddfa Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sat, 1 Jun 2019 18:09:44 -0400
Subject: mnt_init(): call shmem_init() unconditionally

No point having two call sites (earlier in init_rootfs() from
mnt_init() in case we are going to use shmem-style rootfs,
later from do_basic_setup() unconditionally), along with the
logics in shmem_init() itself to make the second call a no-op...

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namespace.c       | 2 ++
 include/linux/init.h | 2 +-
 init/do_mounts.c     | 9 ++-------
 init/main.c          | 1 -
 mm/shmem.c           | 4 ----
 5 files changed, 5 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/fs/namespace.c b/fs/namespace.c
index e272c2403014..e6990f3d526d 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -29,6 +29,7 @@
 #include <linux/sched/task.h>
 #include <uapi/linux/mount.h>
 #include <linux/fs_context.h>
+#include <linux/shmem_fs.h>
 
 #include "pnode.h"
 #include "internal.h"
@@ -3740,6 +3741,7 @@ void __init mnt_init(void)
 	fs_kobj = kobject_create_and_add("fs", NULL);
 	if (!fs_kobj)
 		printk(KERN_WARNING "%s: kobj create error\n", __func__);
+	shmem_init();
 	init_rootfs();
 	init_mount_tree();
 }
diff --git a/include/linux/init.h b/include/linux/init.h
index cbe93521397e..212fc9e2f691 100644
--- a/include/linux/init.h
+++ b/include/linux/init.h
@@ -148,7 +148,7 @@ extern unsigned int reset_devices;
 /* used by init/main.c */
 void setup_arch(char **);
 void prepare_namespace(void);
-int __init init_rootfs(void);
+void __init init_rootfs(void);
 extern struct file_system_type rootfs_fs_type;
 
 #if defined(CONFIG_STRICT_KERNEL_RWX) || defined(CONFIG_STRICT_MODULE_RWX)
diff --git a/init/do_mounts.c b/init/do_mounts.c
index e7f0b0f18cce..864c032e995d 100644
--- a/init/do_mounts.c
+++ b/init/do_mounts.c
@@ -643,14 +643,9 @@ struct file_system_type rootfs_fs_type = {
 	.kill_sb	= kill_litter_super,
 };
 
-int __init init_rootfs(void)
+void __init init_rootfs(void)
 {
-	int err = 0;
-
 	if (IS_ENABLED(CONFIG_TMPFS) && !saved_root_name[0] &&
-		(!root_fs_names || strstr(root_fs_names, "tmpfs"))) {
-		err = shmem_init();
+		(!root_fs_names || strstr(root_fs_names, "tmpfs")))
 		is_tmpfs = true;
-	}
-	return err;
 }
diff --git a/init/main.c b/init/main.c
index 5a2c69b4d7b3..4dbc7243557e 100644
--- a/init/main.c
+++ b/init/main.c
@@ -1000,7 +1000,6 @@ static void __init do_initcalls(void)
 static void __init do_basic_setup(void)
 {
 	cpuset_init_smp();
-	shmem_init();
 	driver_init();
 	init_irq_proc();
 	do_ctors();
diff --git a/mm/shmem.c b/mm/shmem.c
index 1bb3b8dc8bb2..1f67ec9e2062 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -3775,10 +3775,6 @@ int __init shmem_init(void)
 {
 	int error;
 
-	/* If rootfs called this, don't re-init */
-	if (shmem_inode_cachep)
-		return 0;
-
 	shmem_init_inodecache();
 
 	error = register_filesystem(&shmem_fs_type);
-- 
cgit v1.2.3


From d14133dd41614aaaac1fa0505c7dab01f4211d2c Mon Sep 17 00:00:00 2001
From: Mark Zhang <markz@mellanox.com>
Date: Tue, 2 Jul 2019 13:02:36 +0300
Subject: IB/mlx5: Support set qp counter

Support binding a QP to a counter. If the counter is NULL, bind the QP to
the default counter. The operation depends on the QP state:

- RESET: Set the counter field so that it takes effect during the
  RST2INIT change;
- RTS: Issue an RTS2RTS change to update the QP counter;
- Other: Set the counter field and mark the counter_pending flag; when the
  QP is moved to the RTS state and this flag is set, issue an RTS2RTS
  modification to update the counter.

Signed-off-by: Mark Zhang <markz@mellanox.com>
Reviewed-by: Majd Dibbiny <majd@mellanox.com>
Acked-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/hw/mlx5/mlx5_ib.h |  6 +++
 drivers/infiniband/hw/mlx5/qp.c      | 76 +++++++++++++++++++++++++++++++++++-
 include/linux/mlx5/qp.h              |  1 +
 3 files changed, 81 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h
index 7373e9da0919..c482f19958b3 100644
--- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
+++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
@@ -439,6 +439,10 @@ struct mlx5_ib_qp {
 	u32			flags_en;
 	/* storage for qp sub type when core qp type is IB_QPT_DRIVER */
 	enum ib_qp_type		qp_sub_type;
+	/* A flag to indicate that a new counter has been configured
+	 * but has not yet taken effect
+	 */
+	u32                     counter_pending;
 };
 
 struct mlx5_ib_cq_buf {
@@ -1468,4 +1472,6 @@ void mlx5_ib_put_xlt_emergency_page(void);
 int bfregn_to_uar_index(struct mlx5_ib_dev *dev,
 			struct mlx5_bfreg_info *bfregi, u32 bfregn,
 			bool dyn_bfreg);
+
+int mlx5_ib_qp_set_counter(struct ib_qp *qp, struct rdma_counter *counter);
 #endif /* MLX5_IB_H */
diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c
index 8b7a60ada92c..2a97619ed603 100644
--- a/drivers/infiniband/hw/mlx5/qp.c
+++ b/drivers/infiniband/hw/mlx5/qp.c
@@ -34,6 +34,7 @@
 #include <rdma/ib_umem.h>
 #include <rdma/ib_cache.h>
 #include <rdma/ib_user_verbs.h>
+#include <rdma/rdma_counter.h>
 #include <linux/mlx5/fs.h>
 #include "mlx5_ib.h"
 #include "ib_rep.h"
@@ -3380,6 +3381,35 @@ static unsigned int get_tx_affinity(struct mlx5_ib_dev *dev,
 	return tx_port_affinity;
 }
 
+static int __mlx5_ib_qp_set_counter(struct ib_qp *qp,
+				    struct rdma_counter *counter)
+{
+	struct mlx5_ib_dev *dev = to_mdev(qp->device);
+	struct mlx5_ib_qp *mqp = to_mqp(qp);
+	struct mlx5_qp_context context = {};
+	struct mlx5_ib_port *mibport = NULL;
+	struct mlx5_ib_qp_base *base;
+	u32 set_id;
+
+	if (!MLX5_CAP_GEN(dev->mdev, rts2rts_qp_counters_set_id))
+		return 0;
+
+	if (counter) {
+		set_id = counter->id;
+	} else {
+		mibport = &dev->port[mqp->port - 1];
+		set_id = mibport->cnts.set_id;
+	}
+
+	base = &mqp->trans_qp.base;
+	context.qp_counter_set_usr_page &= cpu_to_be32(0xffffff);
+	context.qp_counter_set_usr_page |= cpu_to_be32(set_id << 24);
+	return mlx5_core_qp_modify(dev->mdev,
+				   MLX5_CMD_OP_RTS2RTS_QP,
+				   MLX5_QP_OPTPAR_COUNTER_SET_ID,
+				   &context, &base->mqp);
+}
+
 static int __mlx5_ib_modify_qp(struct ib_qp *ibqp,
 			       const struct ib_qp_attr *attr, int attr_mask,
 			       enum ib_qp_state cur_state,
@@ -3433,6 +3463,7 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp,
 	struct mlx5_ib_port *mibport = NULL;
 	enum mlx5_qp_state mlx5_cur, mlx5_new;
 	enum mlx5_qp_optpar optpar;
+	u32 set_id = 0;
 	int mlx5_st;
 	int err;
 	u16 op;
@@ -3595,8 +3626,12 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp,
 			port_num = 0;
 
 		mibport = &dev->port[port_num];
+		if (ibqp->counter)
+			set_id = ibqp->counter->id;
+		else
+			set_id = mibport->cnts.set_id;
 		context->qp_counter_set_usr_page |=
-			cpu_to_be32((u32)(mibport->cnts.set_id) << 24);
+			cpu_to_be32(set_id << 24);
 	}
 
 	if (!ibqp->uobject && cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT)
@@ -3624,7 +3659,7 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp,
 
 		raw_qp_param.operation = op;
 		if (cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) {
-			raw_qp_param.rq_q_ctr_id = mibport->cnts.set_id;
+			raw_qp_param.rq_q_ctr_id = set_id;
 			raw_qp_param.set_mask |= MLX5_RAW_QP_MOD_SET_RQ_Q_CTR_ID;
 		}
 
@@ -3701,6 +3736,12 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp,
 		qp->db.db[MLX5_SND_DBR] = 0;
 	}
 
+	if ((new_state == IB_QPS_RTS) && qp->counter_pending) {
+		err = __mlx5_ib_qp_set_counter(ibqp, ibqp->counter);
+		if (!err)
+			qp->counter_pending = 0;
+	}
+
 out:
 	kfree(context);
 	return err;
@@ -6435,3 +6476,34 @@ void mlx5_ib_drain_rq(struct ib_qp *qp)
 
 	handle_drain_completion(cq, &rdrain, dev);
 }
+
+/**
+ * Bind a qp to a counter. If @counter is NULL then bind the qp to
+ * the default counter
+ */
+int mlx5_ib_qp_set_counter(struct ib_qp *qp, struct rdma_counter *counter)
+{
+	struct mlx5_ib_qp *mqp = to_mqp(qp);
+	int err = 0;
+
+	mutex_lock(&mqp->mutex);
+	if (mqp->state == IB_QPS_RESET) {
+		qp->counter = counter;
+		goto out;
+	}
+
+	if (mqp->state == IB_QPS_RTS) {
+		err = __mlx5_ib_qp_set_counter(qp, counter);
+		if (!err)
+			qp->counter = counter;
+
+		goto out;
+	}
+
+	mqp->counter_pending = 1;
+	qp->counter = counter;
+
+out:
+	mutex_unlock(&mqp->mutex);
+	return err;
+}
diff --git a/include/linux/mlx5/qp.h b/include/linux/mlx5/qp.h
index 937041101504..ae63b1ae9004 100644
--- a/include/linux/mlx5/qp.h
+++ b/include/linux/mlx5/qp.h
@@ -71,6 +71,7 @@ enum mlx5_qp_optpar {
 	MLX5_QP_OPTPAR_CQN_RCV			= 1 << 19,
 	MLX5_QP_OPTPAR_DC_HS			= 1 << 20,
 	MLX5_QP_OPTPAR_DC_KEY			= 1 << 21,
+	MLX5_QP_OPTPAR_COUNTER_SET_ID		= 1 << 25,
 };
 
 enum mlx5_qp_state {
-- 
cgit v1.2.3


From 7582f5b70f9a2335f3713edb9a2614a50f1f1a90 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Fri, 5 Jul 2019 21:16:34 +0800
Subject: bridge: add br_vlan_get_pvid_rcu()

This new function allows you to fetch the bridge pvid from the packet path.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
Acked-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
---
 include/linux/if_bridge.h |  6 ++++++
 net/bridge/br_vlan.c      | 19 +++++++++++++++----
 2 files changed, 21 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/if_bridge.h b/include/linux/if_bridge.h
index f3fab5d0ea97..950db1dad830 100644
--- a/include/linux/if_bridge.h
+++ b/include/linux/if_bridge.h
@@ -88,6 +88,7 @@ static inline bool br_multicast_router(const struct net_device *dev)
 #if IS_ENABLED(CONFIG_BRIDGE) && IS_ENABLED(CONFIG_BRIDGE_VLAN_FILTERING)
 bool br_vlan_enabled(const struct net_device *dev);
 int br_vlan_get_pvid(const struct net_device *dev, u16 *p_pvid);
+int br_vlan_get_pvid_rcu(const struct net_device *dev, u16 *p_pvid);
 int br_vlan_get_info(const struct net_device *dev, u16 vid,
 		     struct bridge_vlan_info *p_vinfo);
 #else
@@ -101,6 +102,11 @@ static inline int br_vlan_get_pvid(const struct net_device *dev, u16 *p_pvid)
 	return -EINVAL;
 }
 
+static inline int br_vlan_get_pvid_rcu(const struct net_device *dev, u16 *p_pvid)
+{
+	return -EINVAL;
+}
+
 static inline int br_vlan_get_info(const struct net_device *dev, u16 vid,
 				   struct bridge_vlan_info *p_vinfo)
 {
diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c
index f47f526b4f19..8d97b91ad503 100644
--- a/net/bridge/br_vlan.c
+++ b/net/bridge/br_vlan.c
@@ -1227,13 +1227,11 @@ void br_vlan_get_stats(const struct net_bridge_vlan *v,
 	}
 }
 
-int br_vlan_get_pvid(const struct net_device *dev, u16 *p_pvid)
+static int __br_vlan_get_pvid(const struct net_device *dev,
+			      struct net_bridge_port *p, u16 *p_pvid)
 {
 	struct net_bridge_vlan_group *vg;
-	struct net_bridge_port *p;
 
-	ASSERT_RTNL();
-	p = br_port_get_check_rtnl(dev);
 	if (p)
 		vg = nbp_vlan_group(p);
 	else if (netif_is_bridge_master(dev))
@@ -1244,8 +1242,21 @@ int br_vlan_get_pvid(const struct net_device *dev, u16 *p_pvid)
 	*p_pvid = br_get_pvid(vg);
 	return 0;
 }
+
+int br_vlan_get_pvid(const struct net_device *dev, u16 *p_pvid)
+{
+	ASSERT_RTNL();
+
+	return __br_vlan_get_pvid(dev, br_port_get_check_rtnl(dev), p_pvid);
+}
 EXPORT_SYMBOL_GPL(br_vlan_get_pvid);
 
+int br_vlan_get_pvid_rcu(const struct net_device *dev, u16 *p_pvid)
+{
+	return __br_vlan_get_pvid(dev, br_port_get_check_rcu(dev), p_pvid);
+}
+EXPORT_SYMBOL_GPL(br_vlan_get_pvid_rcu);
+
 int br_vlan_get_info(const struct net_device *dev, u16 vid,
 		     struct bridge_vlan_info *p_vinfo)
 {
-- 
cgit v1.2.3


From 31aed46fedbba65abece57e14d24f00b52389c4f Mon Sep 17 00:00:00 2001
From: wenxu <wenxu@ucloud.cn>
Date: Fri, 5 Jul 2019 21:16:36 +0800
Subject: bridge: add br_vlan_get_proto()

This new function allows you to fetch the bridge port vlan protocol.

Signed-off-by: wenxu <wenxu@ucloud.cn>
Acked-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/if_bridge.h |  6 ++++++
 net/bridge/br_vlan.c      | 10 ++++++++++
 2 files changed, 16 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/if_bridge.h b/include/linux/if_bridge.h
index 950db1dad830..9e57c4411734 100644
--- a/include/linux/if_bridge.h
+++ b/include/linux/if_bridge.h
@@ -89,6 +89,7 @@ static inline bool br_multicast_router(const struct net_device *dev)
 bool br_vlan_enabled(const struct net_device *dev);
 int br_vlan_get_pvid(const struct net_device *dev, u16 *p_pvid);
 int br_vlan_get_pvid_rcu(const struct net_device *dev, u16 *p_pvid);
+int br_vlan_get_proto(const struct net_device *dev, u16 *p_proto);
 int br_vlan_get_info(const struct net_device *dev, u16 vid,
 		     struct bridge_vlan_info *p_vinfo);
 #else
@@ -102,6 +103,11 @@ static inline int br_vlan_get_pvid(const struct net_device *dev, u16 *p_pvid)
 	return -EINVAL;
 }
 
+static inline int br_vlan_get_proto(const struct net_device *dev, u16 *p_proto)
+{
+	return -EINVAL;
+}
+
 static inline int br_vlan_get_pvid_rcu(const struct net_device *dev, u16 *p_pvid)
 {
 	return -EINVAL;
diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c
index 8d97b91ad503..021cc9f66804 100644
--- a/net/bridge/br_vlan.c
+++ b/net/bridge/br_vlan.c
@@ -797,6 +797,16 @@ bool br_vlan_enabled(const struct net_device *dev)
 }
 EXPORT_SYMBOL_GPL(br_vlan_enabled);
 
+int br_vlan_get_proto(const struct net_device *dev, u16 *p_proto)
+{
+	struct net_bridge *br = netdev_priv(dev);
+
+	*p_proto = ntohs(br->vlan_proto);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(br_vlan_get_proto);
+
 int __br_vlan_set_proto(struct net_bridge *br, __be16 proto)
 {
 	int err = 0;
-- 
cgit v1.2.3


From c5d4355d10d414a96ca870b731756b89d068d57a Mon Sep 17 00:00:00 2001
From: Pankaj Gupta <pagupta@redhat.com>
Date: Fri, 5 Jul 2019 19:33:22 +0530
Subject: libnvdimm: nd_region flush callback support

This patch adds functionality to perform a flush from guest
to host over VIRTIO. A flush callback is registered based on
the 'nd_region' type; the virtio_pmem driver requires this
special flush function. The rest of the region types keep using
the existing flush function. An error returned by a host fsync
failure is reported to userspace.

Signed-off-by: Pankaj Gupta <pagupta@redhat.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/acpi/nfit/core.c     |  4 ++--
 drivers/nvdimm/claim.c       |  6 ++++--
 drivers/nvdimm/nd.h          |  1 +
 drivers/nvdimm/pmem.c        | 13 ++++++++-----
 drivers/nvdimm/region_devs.c | 26 ++++++++++++++++++++++++--
 include/linux/libnvdimm.h    |  9 ++++++++-
 6 files changed, 47 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/acpi/nfit/core.c b/drivers/acpi/nfit/core.c
index 23022cf20d26..c02fa27dd3f3 100644
--- a/drivers/acpi/nfit/core.c
+++ b/drivers/acpi/nfit/core.c
@@ -2426,7 +2426,7 @@ static void write_blk_ctl(struct nfit_blk *nfit_blk, unsigned int bw,
 		offset = to_interleave_offset(offset, mmio);
 
 	writeq(cmd, mmio->addr.base + offset);
-	nvdimm_flush(nfit_blk->nd_region);
+	nvdimm_flush(nfit_blk->nd_region, NULL);
 
 	if (nfit_blk->dimm_flags & NFIT_BLK_DCR_LATCH)
 		readq(mmio->addr.base + offset);
@@ -2475,7 +2475,7 @@ static int acpi_nfit_blk_single_io(struct nfit_blk *nfit_blk,
 	}
 
 	if (rw)
-		nvdimm_flush(nfit_blk->nd_region);
+		nvdimm_flush(nfit_blk->nd_region, NULL);
 
 	rc = read_blk_stat(nfit_blk, lane) ? -EIO : 0;
 	return rc;
diff --git a/drivers/nvdimm/claim.c b/drivers/nvdimm/claim.c
index 26c1c7618891..2985ca949912 100644
--- a/drivers/nvdimm/claim.c
+++ b/drivers/nvdimm/claim.c
@@ -255,7 +255,7 @@ static int nsio_rw_bytes(struct nd_namespace_common *ndns,
 	struct nd_namespace_io *nsio = to_nd_namespace_io(&ndns->dev);
 	unsigned int sz_align = ALIGN(size + (offset & (512 - 1)), 512);
 	sector_t sector = offset >> 9;
-	int rc = 0;
+	int rc = 0, ret = 0;
 
 	if (unlikely(!size))
 		return 0;
@@ -293,7 +293,9 @@ static int nsio_rw_bytes(struct nd_namespace_common *ndns,
 	}
 
 	memcpy_flushcache(nsio->addr + offset, buf, size);
-	nvdimm_flush(to_nd_region(ndns->dev.parent));
+	ret = nvdimm_flush(to_nd_region(ndns->dev.parent), NULL);
+	if (ret)
+		rc = ret;
 
 	return rc;
 }
diff --git a/drivers/nvdimm/nd.h b/drivers/nvdimm/nd.h
index d24304c0e6d7..1b9955651379 100644
--- a/drivers/nvdimm/nd.h
+++ b/drivers/nvdimm/nd.h
@@ -155,6 +155,7 @@ struct nd_region {
 	struct badblocks bb;
 	struct nd_interleave_set *nd_set;
 	struct nd_percpu_lane __percpu *lane;
+	int (*flush)(struct nd_region *nd_region, struct bio *bio);
 	struct nd_mapping mapping[0];
 };
 
diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index 24d7fe7c74ed..223da63d1bd7 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -184,6 +184,7 @@ static blk_status_t pmem_do_bvec(struct pmem_device *pmem, struct page *page,
 
 static blk_qc_t pmem_make_request(struct request_queue *q, struct bio *bio)
 {
+	int ret = 0;
 	blk_status_t rc = 0;
 	bool do_acct;
 	unsigned long start;
@@ -193,7 +194,7 @@ static blk_qc_t pmem_make_request(struct request_queue *q, struct bio *bio)
 	struct nd_region *nd_region = to_region(pmem);
 
 	if (bio->bi_opf & REQ_PREFLUSH)
-		nvdimm_flush(nd_region);
+		ret = nvdimm_flush(nd_region, bio);
 
 	do_acct = nd_iostat_start(bio, &start);
 	bio_for_each_segment(bvec, bio, iter) {
@@ -208,7 +209,10 @@ static blk_qc_t pmem_make_request(struct request_queue *q, struct bio *bio)
 		nd_iostat_end(bio, start);
 
 	if (bio->bi_opf & REQ_FUA)
-		nvdimm_flush(nd_region);
+		ret = nvdimm_flush(nd_region, bio);
+
+	if (ret)
+		bio->bi_status = errno_to_blk_status(ret);
 
 	bio_endio(bio);
 	return BLK_QC_T_NONE;
@@ -477,7 +481,6 @@ static int pmem_attach_disk(struct device *dev,
 	}
 	dax_write_cache(dax_dev, nvdimm_has_cache(nd_region));
 	pmem->dax_dev = dax_dev;
-
 	gendev = disk_to_dev(disk);
 	gendev->groups = pmem_attribute_groups;
 
@@ -535,14 +538,14 @@ static int nd_pmem_remove(struct device *dev)
 		sysfs_put(pmem->bb_state);
 		pmem->bb_state = NULL;
 	}
-	nvdimm_flush(to_nd_region(dev->parent));
+	nvdimm_flush(to_nd_region(dev->parent), NULL);
 
 	return 0;
 }
 
 static void nd_pmem_shutdown(struct device *dev)
 {
-	nvdimm_flush(to_nd_region(dev->parent));
+	nvdimm_flush(to_nd_region(dev->parent), NULL);
 }
 
 static void nd_pmem_notify(struct device *dev, enum nvdimm_event event)
diff --git a/drivers/nvdimm/region_devs.c b/drivers/nvdimm/region_devs.c
index 4fed9ce9c2fe..eca2e62af134 100644
--- a/drivers/nvdimm/region_devs.c
+++ b/drivers/nvdimm/region_devs.c
@@ -287,7 +287,9 @@ static ssize_t deep_flush_store(struct device *dev, struct device_attribute *att
 		return rc;
 	if (!flush)
 		return -EINVAL;
-	nvdimm_flush(nd_region);
+	rc = nvdimm_flush(nd_region, NULL);
+	if (rc)
+		return rc;
 
 	return len;
 }
@@ -1077,6 +1079,11 @@ static struct nd_region *nd_region_create(struct nvdimm_bus *nvdimm_bus,
 	dev->of_node = ndr_desc->of_node;
 	nd_region->ndr_size = resource_size(ndr_desc->res);
 	nd_region->ndr_start = ndr_desc->res->start;
+	if (ndr_desc->flush)
+		nd_region->flush = ndr_desc->flush;
+	else
+		nd_region->flush = NULL;
+
 	nd_device_register(dev);
 
 	return nd_region;
@@ -1117,11 +1124,24 @@ struct nd_region *nvdimm_volatile_region_create(struct nvdimm_bus *nvdimm_bus,
 }
 EXPORT_SYMBOL_GPL(nvdimm_volatile_region_create);
 
+int nvdimm_flush(struct nd_region *nd_region, struct bio *bio)
+{
+	int rc = 0;
+
+	if (!nd_region->flush)
+		rc = generic_nvdimm_flush(nd_region);
+	else {
+		if (nd_region->flush(nd_region, bio))
+			rc = -EIO;
+	}
+
+	return rc;
+}
 /**
  * nvdimm_flush - flush any posted write queues between the cpu and pmem media
  * @nd_region: blk or interleaved pmem region
  */
-void nvdimm_flush(struct nd_region *nd_region)
+int generic_nvdimm_flush(struct nd_region *nd_region)
 {
 	struct nd_region_data *ndrd = dev_get_drvdata(&nd_region->dev);
 	int i, idx;
@@ -1145,6 +1165,8 @@ void nvdimm_flush(struct nd_region *nd_region)
 		if (ndrd_get_flush_wpq(ndrd, i, 0))
 			writeq(1, ndrd_get_flush_wpq(ndrd, i, idx));
 	wmb();
+
+	return 0;
 }
 EXPORT_SYMBOL_GPL(nvdimm_flush);
 
diff --git a/include/linux/libnvdimm.h b/include/linux/libnvdimm.h
index 03d5c3aece9d..e13100f424c8 100644
--- a/include/linux/libnvdimm.h
+++ b/include/linux/libnvdimm.h
@@ -11,6 +11,7 @@
 #include <linux/types.h>
 #include <linux/uuid.h>
 #include <linux/spinlock.h>
+#include <linux/bio.h>
 
 struct badrange_entry {
 	u64 start;
@@ -57,6 +58,9 @@ enum {
 	 */
 	ND_REGION_PERSIST_MEMCTRL = 2,
 
+	/* Platform provides asynchronous flush mechanism */
+	ND_REGION_ASYNC = 3,
+
 	/* mark newly adjusted resources as requiring a label update */
 	DPA_RESOURCE_ADJUSTED = 1 << 0,
 };
@@ -113,6 +117,7 @@ struct nd_mapping_desc {
 	int position;
 };
 
+struct nd_region;
 struct nd_region_desc {
 	struct resource *res;
 	struct nd_mapping_desc *mapping;
@@ -125,6 +130,7 @@ struct nd_region_desc {
 	int target_node;
 	unsigned long flags;
 	struct device_node *of_node;
+	int (*flush)(struct nd_region *nd_region, struct bio *bio);
 };
 
 struct device;
@@ -252,7 +258,8 @@ unsigned long nd_blk_memremap_flags(struct nd_blk_region *ndbr);
 unsigned int nd_region_acquire_lane(struct nd_region *nd_region);
 void nd_region_release_lane(struct nd_region *nd_region, unsigned int lane);
 u64 nd_fletcher64(void *addr, size_t len, bool le);
-void nvdimm_flush(struct nd_region *nd_region);
+int nvdimm_flush(struct nd_region *nd_region, struct bio *bio);
+int generic_nvdimm_flush(struct nd_region *nd_region);
 int nvdimm_has_flush(struct nd_region *nd_region);
 int nvdimm_has_cache(struct nd_region *nd_region);
 int nvdimm_in_overwrite(struct nvdimm *nvdimm);
-- 
cgit v1.2.3


From fefc1d97fa4b5e016bbe15447dc3edcd9e1bcb9f Mon Sep 17 00:00:00 2001
From: Pankaj Gupta <pagupta@redhat.com>
Date: Fri, 5 Jul 2019 19:33:24 +0530
Subject: libnvdimm: add dax_dev sync flag

This patch adds the 'DAXDEV_SYNC' flag, which is set
for an nd_region doing synchronous flush. It is later
used to disable MAP_SYNC functionality on the
ext4 & xfs filesystems for devices that don't support
synchronous flush.

Signed-off-by: Pankaj Gupta <pagupta@redhat.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/dax/bus.c            |  2 +-
 drivers/dax/super.c          | 19 ++++++++++++++++++-
 drivers/md/dm.c              |  3 ++-
 drivers/nvdimm/pmem.c        |  5 ++++-
 drivers/nvdimm/region_devs.c |  7 +++++++
 drivers/s390/block/dcssblk.c |  2 +-
 include/linux/dax.h          | 24 ++++++++++++++++++++++--
 include/linux/libnvdimm.h    |  1 +
 8 files changed, 56 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/dax/bus.c b/drivers/dax/bus.c
index 2109cfe80219..5f184e751c82 100644
--- a/drivers/dax/bus.c
+++ b/drivers/dax/bus.c
@@ -388,7 +388,7 @@ struct dev_dax *__devm_create_dev_dax(struct dax_region *dax_region, int id,
 	 * No 'host' or dax_operations since there is no access to this
 	 * device outside of mmap of the resulting character device.
 	 */
-	dax_dev = alloc_dax(dev_dax, NULL, NULL);
+	dax_dev = alloc_dax(dev_dax, NULL, NULL, DAXDEV_F_SYNC);
 	if (!dax_dev)
 		goto err;
 
diff --git a/drivers/dax/super.c b/drivers/dax/super.c
index 4e5ae7e8b557..8ab12068eea3 100644
--- a/drivers/dax/super.c
+++ b/drivers/dax/super.c
@@ -195,6 +195,8 @@ enum dax_device_flags {
 	DAXDEV_ALIVE,
 	/* gate whether dax_flush() calls the low level flush routine */
 	DAXDEV_WRITE_CACHE,
+	/* flag to check if device supports synchronous flush */
+	DAXDEV_SYNC,
 };
 
 /**
@@ -372,6 +374,18 @@ bool dax_write_cache_enabled(struct dax_device *dax_dev)
 }
 EXPORT_SYMBOL_GPL(dax_write_cache_enabled);
 
+bool __dax_synchronous(struct dax_device *dax_dev)
+{
+	return test_bit(DAXDEV_SYNC, &dax_dev->flags);
+}
+EXPORT_SYMBOL_GPL(__dax_synchronous);
+
+void __set_dax_synchronous(struct dax_device *dax_dev)
+{
+	set_bit(DAXDEV_SYNC, &dax_dev->flags);
+}
+EXPORT_SYMBOL_GPL(__set_dax_synchronous);
+
 bool dax_alive(struct dax_device *dax_dev)
 {
 	lockdep_assert_held(&dax_srcu);
@@ -526,7 +540,7 @@ static void dax_add_host(struct dax_device *dax_dev, const char *host)
 }
 
 struct dax_device *alloc_dax(void *private, const char *__host,
-		const struct dax_operations *ops)
+		const struct dax_operations *ops, unsigned long flags)
 {
 	struct dax_device *dax_dev;
 	const char *host;
@@ -549,6 +563,9 @@ struct dax_device *alloc_dax(void *private, const char *__host,
 	dax_add_host(dax_dev, host);
 	dax_dev->ops = ops;
 	dax_dev->private = private;
+	if (flags & DAXDEV_F_SYNC)
+		set_dax_synchronous(dax_dev);
+
 	return dax_dev;
 
  err_dev:
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 5475081dcbd6..b1caa7188209 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -1991,7 +1991,8 @@ static struct mapped_device *alloc_dev(int minor)
 	sprintf(md->disk->disk_name, "dm-%d", minor);
 
 	if (IS_ENABLED(CONFIG_DAX_DRIVER)) {
-		md->dax_dev = alloc_dax(md, md->disk->disk_name, &dm_dax_ops);
+		md->dax_dev = alloc_dax(md, md->disk->disk_name,
+					&dm_dax_ops, 0);
 		if (!md->dax_dev)
 			goto bad;
 	}
diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index 223da63d1bd7..8be868e2a18b 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -376,6 +376,7 @@ static int pmem_attach_disk(struct device *dev,
 	struct gendisk *disk;
 	void *addr;
 	int rc;
+	unsigned long flags = 0UL;
 
 	pmem = devm_kzalloc(dev, sizeof(*pmem), GFP_KERNEL);
 	if (!pmem)
@@ -474,7 +475,9 @@ static int pmem_attach_disk(struct device *dev,
 	nvdimm_badblocks_populate(nd_region, &pmem->bb, &bb_res);
 	disk->bb = &pmem->bb;
 
-	dax_dev = alloc_dax(pmem, disk->disk_name, &pmem_dax_ops);
+	if (is_nvdimm_sync(nd_region))
+		flags = DAXDEV_F_SYNC;
+	dax_dev = alloc_dax(pmem, disk->disk_name, &pmem_dax_ops, flags);
 	if (!dax_dev) {
 		put_disk(disk);
 		return -ENOMEM;
diff --git a/drivers/nvdimm/region_devs.c b/drivers/nvdimm/region_devs.c
index eca2e62af134..56f2227f192a 100644
--- a/drivers/nvdimm/region_devs.c
+++ b/drivers/nvdimm/region_devs.c
@@ -1211,6 +1211,13 @@ int nvdimm_has_cache(struct nd_region *nd_region)
 }
 EXPORT_SYMBOL_GPL(nvdimm_has_cache);
 
+bool is_nvdimm_sync(struct nd_region *nd_region)
+{
+	return is_nd_pmem(&nd_region->dev) &&
+		!test_bit(ND_REGION_ASYNC, &nd_region->flags);
+}
+EXPORT_SYMBOL_GPL(is_nvdimm_sync);
+
 struct conflict_context {
 	struct nd_region *nd_region;
 	resource_size_t start, size;
diff --git a/drivers/s390/block/dcssblk.c b/drivers/s390/block/dcssblk.c
index d04d4378ca50..63502ca537eb 100644
--- a/drivers/s390/block/dcssblk.c
+++ b/drivers/s390/block/dcssblk.c
@@ -679,7 +679,7 @@ dcssblk_add_store(struct device *dev, struct device_attribute *attr, const char
 		goto put_dev;
 
 	dev_info->dax_dev = alloc_dax(dev_info, dev_info->gd->disk_name,
-			&dcssblk_dax_ops);
+			&dcssblk_dax_ops, DAXDEV_F_SYNC);
 	if (!dev_info->dax_dev) {
 		rc = -ENOMEM;
 		goto put_dev;
diff --git a/include/linux/dax.h b/include/linux/dax.h
index becaea5f4488..8b535bc4526f 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -7,6 +7,9 @@
 #include <linux/radix-tree.h>
 #include <asm/pgtable.h>
 
+/* Flag for synchronous flush */
+#define DAXDEV_F_SYNC (1UL << 0)
+
 typedef unsigned long dax_entry_t;
 
 struct iomap_ops;
@@ -38,18 +41,28 @@ extern struct attribute_group dax_attribute_group;
 #if IS_ENABLED(CONFIG_DAX)
 struct dax_device *dax_get_by_host(const char *host);
 struct dax_device *alloc_dax(void *private, const char *host,
-		const struct dax_operations *ops);
+		const struct dax_operations *ops, unsigned long flags);
 void put_dax(struct dax_device *dax_dev);
 void kill_dax(struct dax_device *dax_dev);
 void dax_write_cache(struct dax_device *dax_dev, bool wc);
 bool dax_write_cache_enabled(struct dax_device *dax_dev);
+bool __dax_synchronous(struct dax_device *dax_dev);
+static inline bool dax_synchronous(struct dax_device *dax_dev)
+{
+	return  __dax_synchronous(dax_dev);
+}
+void __set_dax_synchronous(struct dax_device *dax_dev);
+static inline void set_dax_synchronous(struct dax_device *dax_dev)
+{
+	__set_dax_synchronous(dax_dev);
+}
 #else
 static inline struct dax_device *dax_get_by_host(const char *host)
 {
 	return NULL;
 }
 static inline struct dax_device *alloc_dax(void *private, const char *host,
-		const struct dax_operations *ops)
+		const struct dax_operations *ops, unsigned long flags)
 {
 	/*
 	 * Callers should check IS_ENABLED(CONFIG_DAX) to know if this
@@ -70,6 +83,13 @@ static inline bool dax_write_cache_enabled(struct dax_device *dax_dev)
 {
 	return false;
 }
+static inline bool dax_synchronous(struct dax_device *dax_dev)
+{
+	return true;
+}
+static inline void set_dax_synchronous(struct dax_device *dax_dev)
+{
+}
 #endif
 
 struct writeback_control;
diff --git a/include/linux/libnvdimm.h b/include/linux/libnvdimm.h
index e13100f424c8..7a64b3ddb408 100644
--- a/include/linux/libnvdimm.h
+++ b/include/linux/libnvdimm.h
@@ -263,6 +263,7 @@ int generic_nvdimm_flush(struct nd_region *nd_region);
 int nvdimm_has_flush(struct nd_region *nd_region);
 int nvdimm_has_cache(struct nd_region *nd_region);
 int nvdimm_in_overwrite(struct nvdimm *nvdimm);
+bool is_nvdimm_sync(struct nd_region *nd_region);
 
 static inline int nvdimm_ctl(struct nvdimm *nvdimm, unsigned int cmd, void *buf,
 		unsigned int buf_len, int *cmd_rc)
-- 
cgit v1.2.3


From 32de1484648a837db5dea0a7007fe7136804e392 Mon Sep 17 00:00:00 2001
From: Pankaj Gupta <pagupta@redhat.com>
Date: Fri, 5 Jul 2019 19:33:26 +0530
Subject: dax: check synchronous mapping is supported

This patch introduces the 'daxdev_mapping_supported' helper,
which checks if 'MAP_SYNC' is supported with filesystem
mapping. It also checks if the corresponding dax_device is
synchronous. The virtio pmem device is asynchronous and
does not support VM_SYNC.

Suggested-by: Jan Kara <jack@suse.cz>
Signed-off-by: Pankaj Gupta <pagupta@redhat.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 include/linux/dax.h | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/dax.h b/include/linux/dax.h
index 8b535bc4526f..9bd8528bd305 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -56,6 +56,18 @@ static inline void set_dax_synchronous(struct dax_device *dax_dev)
 {
 	__set_dax_synchronous(dax_dev);
 }
+/*
+ * Check if given mapping is supported by the file / underlying device.
+ */
+static inline bool daxdev_mapping_supported(struct vm_area_struct *vma,
+					     struct dax_device *dax_dev)
+{
+	if (!(vma->vm_flags & VM_SYNC))
+		return true;
+	if (!IS_DAX(file_inode(vma->vm_file)))
+		return false;
+	return dax_synchronous(dax_dev);
+}
 #else
 static inline struct dax_device *dax_get_by_host(const char *host)
 {
@@ -90,6 +102,11 @@ static inline bool dax_synchronous(struct dax_device *dax_dev)
 static inline void set_dax_synchronous(struct dax_device *dax_dev)
 {
 }
+static inline bool daxdev_mapping_supported(struct vm_area_struct *vma,
+				struct dax_device *dax_dev)
+{
+	return !(vma->vm_flags & VM_SYNC);
+}
 #endif
 
 struct writeback_control;
-- 
cgit v1.2.3


From e4aa33ad595936391f7356f25c0c839011f14ead Mon Sep 17 00:00:00 2001
From: Li RongQing <lirongqing@baidu.com>
Date: Thu, 4 Jul 2019 17:03:26 +0800
Subject: net: remove unused parameter from skb_checksum_try_convert

The check parameter is never used.

Signed-off-by: Li RongQing <lirongqing@baidu.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h | 8 +++-----
 net/ipv4/gre_demux.c   | 2 +-
 net/ipv4/udp.c         | 3 +--
 net/ipv6/udp.c         | 3 +--
 4 files changed, 6 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index b5d427b149c9..7ece49d5f8ef 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -3919,18 +3919,16 @@ static inline bool __skb_checksum_convert_check(struct sk_buff *skb)
 	return (skb->ip_summed == CHECKSUM_NONE && skb->csum_valid);
 }
 
-static inline void __skb_checksum_convert(struct sk_buff *skb,
-					  __sum16 check, __wsum pseudo)
+static inline void __skb_checksum_convert(struct sk_buff *skb, __wsum pseudo)
 {
 	skb->csum = ~pseudo;
 	skb->ip_summed = CHECKSUM_COMPLETE;
 }
 
-#define skb_checksum_try_convert(skb, proto, check, compute_pseudo)	\
+#define skb_checksum_try_convert(skb, proto, compute_pseudo)	\
 do {									\
 	if (__skb_checksum_convert_check(skb))				\
-		__skb_checksum_convert(skb, check,			\
-				       compute_pseudo(skb, proto));	\
+		__skb_checksum_convert(skb, compute_pseudo(skb, proto)); \
 } while (0)
 
 static inline void skb_remcsum_adjust_partial(struct sk_buff *skb, void *ptr,
diff --git a/net/ipv4/gre_demux.c b/net/ipv4/gre_demux.c
index 293acfb36376..44bfeecac33e 100644
--- a/net/ipv4/gre_demux.c
+++ b/net/ipv4/gre_demux.c
@@ -83,7 +83,7 @@ int gre_parse_header(struct sk_buff *skb, struct tnl_ptk_info *tpi,
 	options = (__be32 *)(greh + 1);
 	if (greh->flags & GRE_CSUM) {
 		if (!skb_checksum_simple_validate(skb)) {
-			skb_checksum_try_convert(skb, IPPROTO_GRE, 0,
+			skb_checksum_try_convert(skb, IPPROTO_GRE,
 						 null_compute_pseudo);
 		} else if (csum_err) {
 			*csum_err = true;
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 1b971bd95786..c21862ba9c02 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -2224,8 +2224,7 @@ static int udp_unicast_rcv_skb(struct sock *sk, struct sk_buff *skb,
 	int ret;
 
 	if (inet_get_convert_csum(sk) && uh->check && !IS_UDPLITE(sk))
-		skb_checksum_try_convert(skb, IPPROTO_UDP, uh->check,
-					 inet_compute_pseudo);
+		skb_checksum_try_convert(skb, IPPROTO_UDP, inet_compute_pseudo);
 
 	ret = udp_queue_rcv_skb(sk, skb);
 
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 66ca5a4b17c4..4406e059da68 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -826,8 +826,7 @@ static int udp6_unicast_rcv_skb(struct sock *sk, struct sk_buff *skb,
 	int ret;
 
 	if (inet_get_convert_csum(sk) && uh->check && !IS_UDPLITE(sk))
-		skb_checksum_try_convert(skb, IPPROTO_UDP, uh->check,
-					 ip6_compute_pseudo);
+		skb_checksum_try_convert(skb, IPPROTO_UDP, ip6_compute_pseudo);
 
 	ret = udpv6_queue_rcv_skb(sk, skb);
 
-- 
cgit v1.2.3


From e2869fb2068be603b46cd62bc980b4765948c6ed Mon Sep 17 00:00:00 2001
From: Tariq Toukan <tariqt@mellanox.com>
Date: Fri, 5 Jul 2019 18:30:12 +0300
Subject: net/mlx5: Kconfig, Better organize compilation flags

Always keep all acceleration function declarations in
'accel' files, independent of the flag settings.
For this, introduce new flags CONFIG_MLX5_FPGA_{IPSEC,TLS} and use stubs
where needed.

This obsoletes the need for stubs in 'fpga' files. Remove them.

Also use the new flags in Makefile, to decide whether to compile
TLS-specific or IPSEC-specific objects, or not.

Signed-off-by: Tariq Toukan <tariqt@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlx5/core/Kconfig    | 43 ++++++++++---
 drivers/net/ethernet/mellanox/mlx5/core/Makefile   |  7 +-
 .../net/ethernet/mellanox/mlx5/core/accel/ipsec.c  |  4 ++
 .../net/ethernet/mellanox/mlx5/core/accel/ipsec.h  |  2 +-
 .../net/ethernet/mellanox/mlx5/core/accel/tls.c    |  3 +
 .../net/ethernet/mellanox/mlx5/core/accel/tls.h    |  4 +-
 .../net/ethernet/mellanox/mlx5/core/fpga/ipsec.h   | 75 ----------------------
 include/linux/mlx5/accel.h                         |  2 +-
 8 files changed, 47 insertions(+), 93 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Kconfig b/drivers/net/ethernet/mellanox/mlx5/core/Kconfig
index 7845aa5bf6be..6556490d809c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/Kconfig
+++ b/drivers/net/ethernet/mellanox/mlx5/core/Kconfig
@@ -97,26 +97,49 @@ config MLX5_CORE_IPOIB
 	---help---
 	  MLX5 IPoIB offloads & acceleration support.
 
+config MLX5_FPGA_IPSEC
+	bool "Mellanox Technologies IPsec Innova support"
+	depends on MLX5_CORE
+	depends on MLX5_FPGA
+	default n
+	help
+	Build IPsec support for the Innova family of network cards by Mellanox
+	Technologies. Innova network cards are comprised of a ConnectX chip
+	and an FPGA chip on one board. If you select this option, the
+	mlx5_core driver will include the Innova FPGA core and allow building
+	sandbox-specific client drivers.
+
 config MLX5_EN_IPSEC
 	bool "IPSec XFRM cryptography-offload accelaration"
-	depends on MLX5_ACCEL
 	depends on MLX5_CORE_EN
 	depends on XFRM_OFFLOAD
 	depends on INET_ESP_OFFLOAD || INET6_ESP_OFFLOAD
+	depends on MLX5_FPGA_IPSEC
 	default n
-	---help---
+	help
 	  Build support for IPsec cryptography-offload accelaration in the NIC.
 	  Note: Support for hardware with this capability needs to be selected
 	  for this option to become available.
 
-config MLX5_EN_TLS
-	bool "TLS cryptography-offload accelaration"
-	depends on MLX5_CORE_EN
+config MLX5_FPGA_TLS
+	bool "Mellanox Technologies TLS Innova support"
 	depends on TLS_DEVICE
 	depends on TLS=y || MLX5_CORE=m
-	depends on MLX5_ACCEL
+	depends on MLX5_FPGA
 	default n
-	---help---
-	  Build support for TLS cryptography-offload accelaration in the NIC.
-	  Note: Support for hardware with this capability needs to be selected
-	  for this option to become available.
+	help
+	Build TLS support for the Innova family of network cards by Mellanox
+	Technologies. Innova network cards are comprised of a ConnectX chip
+	and an FPGA chip on one board. If you select this option, the
+	mlx5_core driver will include the Innova FPGA core and allow building
+	sandbox-specific client drivers.
+
+config MLX5_EN_TLS
+	bool "TLS cryptography-offload accelaration"
+	depends on MLX5_CORE_EN
+	depends on MLX5_FPGA_TLS
+	default y
+	help
+	Build support for TLS cryptography-offload accelaration in the NIC.
+	Note: Support for hardware with this capability needs to be selected
+	for this option to become available.
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Makefile b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
index 8456b19d79cd..d3409870646a 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/Makefile
+++ b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
@@ -53,10 +53,11 @@ mlx5_core-$(CONFIG_MLX5_CORE_IPOIB) += ipoib/ipoib.o ipoib/ethtool.o ipoib/ipoib
 #
 # Accelerations & FPGA
 #
-mlx5_core-$(CONFIG_MLX5_ACCEL) += accel/ipsec.o accel/tls.o
+mlx5_core-$(CONFIG_MLX5_FPGA_IPSEC) += fpga/ipsec.o
+mlx5_core-$(CONFIG_MLX5_FPGA_TLS)   += fpga/tls.o
+mlx5_core-$(CONFIG_MLX5_ACCEL)      += accel/tls.o accel/ipsec.o
 
-mlx5_core-$(CONFIG_MLX5_FPGA) += fpga/cmd.o fpga/core.o fpga/conn.o fpga/sdk.o \
-				 fpga/ipsec.o fpga/tls.o
+mlx5_core-$(CONFIG_MLX5_FPGA) += fpga/cmd.o fpga/core.o fpga/conn.o fpga/sdk.o
 
 mlx5_core-$(CONFIG_MLX5_EN_IPSEC) += en_accel/ipsec.o en_accel/ipsec_rxtx.o \
 				     en_accel/ipsec_stats.o
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/accel/ipsec.c b/drivers/net/ethernet/mellanox/mlx5/core/accel/ipsec.c
index d1e76d5a413b..eddc34e4a762 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/accel/ipsec.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/accel/ipsec.c
@@ -31,6 +31,8 @@
  *
  */
 
+#ifdef CONFIG_MLX5_FPGA_IPSEC
+
 #include <linux/mlx5/device.h>
 
 #include "accel/ipsec.h"
@@ -112,3 +114,5 @@ int mlx5_accel_esp_modify_xfrm(struct mlx5_accel_esp_xfrm *xfrm,
 	return mlx5_fpga_esp_modify_xfrm(xfrm, attrs);
 }
 EXPORT_SYMBOL_GPL(mlx5_accel_esp_modify_xfrm);
+
+#endif
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/accel/ipsec.h b/drivers/net/ethernet/mellanox/mlx5/core/accel/ipsec.h
index 93b3f5faddb5..530e428d46ab 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/accel/ipsec.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/accel/ipsec.h
@@ -37,7 +37,7 @@
 #include <linux/mlx5/driver.h>
 #include <linux/mlx5/accel.h>
 
-#ifdef CONFIG_MLX5_ACCEL
+#ifdef CONFIG_MLX5_FPGA_IPSEC
 
 #define MLX5_IPSEC_DEV(mdev) (mlx5_accel_ipsec_device_caps(mdev) & \
 			      MLX5_ACCEL_IPSEC_CAP_DEVICE)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/accel/tls.c b/drivers/net/ethernet/mellanox/mlx5/core/accel/tls.c
index da7bd26368f9..a2c9eda1ebf5 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/accel/tls.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/accel/tls.c
@@ -35,6 +35,8 @@
 
 #include "accel/tls.h"
 #include "mlx5_core.h"
+
+#ifdef CONFIG_MLX5_FPGA_TLS
 #include "fpga/tls.h"
 
 int mlx5_accel_tls_add_flow(struct mlx5_core_dev *mdev, void *flow,
@@ -78,3 +80,4 @@ void mlx5_accel_tls_cleanup(struct mlx5_core_dev *mdev)
 {
 	mlx5_fpga_tls_cleanup(mdev);
 }
+#endif
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/accel/tls.h b/drivers/net/ethernet/mellanox/mlx5/core/accel/tls.h
index def4093ebfae..e5d306ad7f91 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/accel/tls.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/accel/tls.h
@@ -37,8 +37,7 @@
 #include <linux/mlx5/driver.h>
 #include <linux/tls.h>
 
-#ifdef CONFIG_MLX5_ACCEL
-
+#ifdef CONFIG_MLX5_FPGA_TLS
 enum {
 	MLX5_ACCEL_TLS_TX = BIT(0),
 	MLX5_ACCEL_TLS_RX = BIT(1),
@@ -88,7 +87,6 @@ static inline bool mlx5_accel_is_tls_device(struct mlx5_core_dev *mdev) { return
 static inline u32 mlx5_accel_tls_device_caps(struct mlx5_core_dev *mdev) { return 0; }
 static inline int mlx5_accel_tls_init(struct mlx5_core_dev *mdev) { return 0; }
 static inline void mlx5_accel_tls_cleanup(struct mlx5_core_dev *mdev) { }
-
 #endif
 
 #endif	/* __MLX5_ACCEL_TLS_H__ */
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fpga/ipsec.h b/drivers/net/ethernet/mellanox/mlx5/core/fpga/ipsec.h
index 2b5e63b0d4d6..382985e65b48 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fpga/ipsec.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fpga/ipsec.h
@@ -37,8 +37,6 @@
 #include "accel/ipsec.h"
 #include "fs_cmd.h"
 
-#ifdef CONFIG_MLX5_FPGA
-
 u32 mlx5_fpga_ipsec_device_caps(struct mlx5_core_dev *mdev);
 unsigned int mlx5_fpga_ipsec_counters_count(struct mlx5_core_dev *mdev);
 int mlx5_fpga_ipsec_counters_read(struct mlx5_core_dev *mdev, u64 *counters,
@@ -66,77 +64,4 @@ int mlx5_fpga_esp_modify_xfrm(struct mlx5_accel_esp_xfrm *xfrm,
 const struct mlx5_flow_cmds *
 mlx5_fs_cmd_get_default_ipsec_fpga_cmds(enum fs_flow_table_type type);
 
-#else
-
-static inline u32 mlx5_fpga_ipsec_device_caps(struct mlx5_core_dev *mdev)
-{
-	return 0;
-}
-
-static inline unsigned int
-mlx5_fpga_ipsec_counters_count(struct mlx5_core_dev *mdev)
-{
-	return 0;
-}
-
-static inline int mlx5_fpga_ipsec_counters_read(struct mlx5_core_dev *mdev,
-						u64 *counters)
-{
-	return 0;
-}
-
-static inline void *
-mlx5_fpga_ipsec_create_sa_ctx(struct mlx5_core_dev *mdev,
-			      struct mlx5_accel_esp_xfrm *accel_xfrm,
-			      const __be32 saddr[4],
-			      const __be32 daddr[4],
-			      const __be32 spi, bool is_ipv6)
-{
-	return NULL;
-}
-
-static inline void mlx5_fpga_ipsec_delete_sa_ctx(void *context)
-{
-}
-
-static inline int mlx5_fpga_ipsec_init(struct mlx5_core_dev *mdev)
-{
-	return 0;
-}
-
-static inline void mlx5_fpga_ipsec_cleanup(struct mlx5_core_dev *mdev)
-{
-}
-
-static inline void mlx5_fpga_ipsec_build_fs_cmds(void)
-{
-}
-
-static inline struct mlx5_accel_esp_xfrm *
-mlx5_fpga_esp_create_xfrm(struct mlx5_core_dev *mdev,
-			  const struct mlx5_accel_esp_xfrm_attrs *attrs,
-			  u32 flags)
-{
-	return ERR_PTR(-EOPNOTSUPP);
-}
-
-static inline void mlx5_fpga_esp_destroy_xfrm(struct mlx5_accel_esp_xfrm *xfrm)
-{
-}
-
-static inline int
-mlx5_fpga_esp_modify_xfrm(struct mlx5_accel_esp_xfrm *xfrm,
-			  const struct mlx5_accel_esp_xfrm_attrs *attrs)
-{
-	return -EOPNOTSUPP;
-}
-
-static inline const struct mlx5_flow_cmds *
-mlx5_fs_cmd_get_default_ipsec_fpga_cmds(enum fs_flow_table_type type)
-{
-	return mlx5_fs_cmd_get_default(type);
-}
-
-#endif /* CONFIG_MLX5_FPGA */
-
 #endif	/* __MLX5_FPGA_SADB_H__ */
diff --git a/include/linux/mlx5/accel.h b/include/linux/mlx5/accel.h
index 70e7e5673ce9..5613e677a5f9 100644
--- a/include/linux/mlx5/accel.h
+++ b/include/linux/mlx5/accel.h
@@ -114,7 +114,7 @@ enum mlx5_accel_ipsec_cap {
 	MLX5_ACCEL_IPSEC_CAP_TX_IV_IS_ESN	= 1 << 7,
 };
 
-#ifdef CONFIG_MLX5_ACCEL
+#ifdef CONFIG_MLX5_FPGA_IPSEC
 
 u32 mlx5_accel_ipsec_device_caps(struct mlx5_core_dev *mdev);
 
-- 
cgit v1.2.3


From 7e0a0e38fcfea47e74b0ff6da6266f00bcd2af43 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Wed, 1 May 2019 10:49:27 -0400
Subject: SUNRPC: Replace the queue timer with a delayed work function

The queue timer function, which walks the RPC queue in order to locate
candidates for waking up, is one of the current constraints against
removing the bh-safe queue spin locks. Replace it with a delayed
work queue, so that we can do the actual rpc task wake ups from an
ordinary process context.

Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 include/linux/sunrpc/sched.h |  3 ++-
 net/sunrpc/sched.c           | 30 ++++++++++++++++++++----------
 2 files changed, 22 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/sched.h b/include/linux/sunrpc/sched.h
index d0e451868f02..7d8db5dcac04 100644
--- a/include/linux/sunrpc/sched.h
+++ b/include/linux/sunrpc/sched.h
@@ -183,8 +183,9 @@ struct rpc_task_setup {
 #define RPC_NR_PRIORITY		(1 + RPC_PRIORITY_PRIVILEGED - RPC_PRIORITY_LOW)
 
 struct rpc_timer {
-	struct timer_list timer;
 	struct list_head list;
+	unsigned long expires;
+	struct delayed_work dwork;
 };
 
 /*
diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c
index a2c114812717..e0a0cf381eba 100644
--- a/net/sunrpc/sched.c
+++ b/net/sunrpc/sched.c
@@ -46,7 +46,7 @@ static mempool_t	*rpc_buffer_mempool __read_mostly;
 
 static void			rpc_async_schedule(struct work_struct *);
 static void			 rpc_release_task(struct rpc_task *task);
-static void __rpc_queue_timer_fn(struct timer_list *t);
+static void __rpc_queue_timer_fn(struct work_struct *);
 
 /*
  * RPC tasks sit here while waiting for conditions to improve.
@@ -87,13 +87,19 @@ __rpc_disable_timer(struct rpc_wait_queue *queue, struct rpc_task *task)
 	task->tk_timeout = 0;
 	list_del(&task->u.tk_wait.timer_list);
 	if (list_empty(&queue->timer_list.list))
-		del_timer(&queue->timer_list.timer);
+		cancel_delayed_work(&queue->timer_list.dwork);
 }
 
 static void
 rpc_set_queue_timer(struct rpc_wait_queue *queue, unsigned long expires)
 {
-	timer_reduce(&queue->timer_list.timer, expires);
+	unsigned long now = jiffies;
+	queue->timer_list.expires = expires;
+	if (time_before_eq(expires, now))
+		expires = 0;
+	else
+		expires -= now;
+	mod_delayed_work(rpciod_workqueue, &queue->timer_list.dwork, expires);
 }
 
 /*
@@ -107,7 +113,8 @@ __rpc_add_timer(struct rpc_wait_queue *queue, struct rpc_task *task,
 		task->tk_pid, jiffies_to_msecs(timeout - jiffies));
 
 	task->tk_timeout = timeout;
-	rpc_set_queue_timer(queue, timeout);
+	if (list_empty(&queue->timer_list.list) || time_before(timeout, queue->timer_list.expires))
+		rpc_set_queue_timer(queue, timeout);
 	list_add(&task->u.tk_wait.timer_list, &queue->timer_list.list);
 }
 
@@ -250,7 +257,8 @@ static void __rpc_init_priority_wait_queue(struct rpc_wait_queue *queue, const c
 	queue->maxpriority = nr_queues - 1;
 	rpc_reset_waitqueue_priority(queue);
 	queue->qlen = 0;
-	timer_setup(&queue->timer_list.timer, __rpc_queue_timer_fn, 0);
+	queue->timer_list.expires = 0;
+	INIT_DEFERRABLE_WORK(&queue->timer_list.dwork, __rpc_queue_timer_fn);
 	INIT_LIST_HEAD(&queue->timer_list.list);
 	rpc_assign_waitqueue_name(queue, qname);
 }
@@ -269,7 +277,7 @@ EXPORT_SYMBOL_GPL(rpc_init_wait_queue);
 
 void rpc_destroy_wait_queue(struct rpc_wait_queue *queue)
 {
-	del_timer_sync(&queue->timer_list.timer);
+	cancel_delayed_work_sync(&queue->timer_list.dwork);
 }
 EXPORT_SYMBOL_GPL(rpc_destroy_wait_queue);
 
@@ -759,13 +767,15 @@ void rpc_wake_up_status(struct rpc_wait_queue *queue, int status)
 }
 EXPORT_SYMBOL_GPL(rpc_wake_up_status);
 
-static void __rpc_queue_timer_fn(struct timer_list *t)
+static void __rpc_queue_timer_fn(struct work_struct *work)
 {
-	struct rpc_wait_queue *queue = from_timer(queue, t, timer_list.timer);
+	struct rpc_wait_queue *queue = container_of(work,
+			struct rpc_wait_queue,
+			timer_list.dwork.work);
 	struct rpc_task *task, *n;
 	unsigned long expires, now, timeo;
 
-	spin_lock(&queue->lock);
+	spin_lock_bh(&queue->lock);
 	expires = now = jiffies;
 	list_for_each_entry_safe(task, n, &queue->timer_list.list, u.tk_wait.timer_list) {
 		timeo = task->tk_timeout;
@@ -780,7 +790,7 @@ static void __rpc_queue_timer_fn(struct timer_list *t)
 	}
 	if (!list_empty(&queue->timer_list.list))
 		rpc_set_queue_timer(queue, expires);
-	spin_unlock(&queue->lock);
+	spin_unlock_bh(&queue->lock);
 }
 
 static void __rpc_atrun(struct rpc_task *task)
-- 
cgit v1.2.3


From 4f8943f8088348ec01456b075d44ad19dce3d698 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Wed, 1 May 2019 16:28:29 -0400
Subject: SUNRPC: Replace direct task wakeups from softirq context

Replace the direct task wakeups from inside a softirq context with
wakeups from a process context.

Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 include/linux/sunrpc/xprtsock.h |  5 +++
 net/sunrpc/xprtsock.c           | 78 +++++++++++++++++++++++++++++++++++++----
 2 files changed, 77 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/xprtsock.h b/include/linux/sunrpc/xprtsock.h
index b81d0b3e0799..7638dbe7bc50 100644
--- a/include/linux/sunrpc/xprtsock.h
+++ b/include/linux/sunrpc/xprtsock.h
@@ -56,6 +56,7 @@ struct sock_xprt {
 	 */
 	unsigned long		sock_state;
 	struct delayed_work	connect_worker;
+	struct work_struct	error_worker;
 	struct work_struct	recv_worker;
 	struct mutex		recv_mutex;
 	struct sockaddr_storage	srcaddr;
@@ -84,6 +85,10 @@ struct sock_xprt {
 #define XPRT_SOCK_CONNECTING	1U
 #define XPRT_SOCK_DATA_READY	(2)
 #define XPRT_SOCK_UPD_TIMEOUT	(3)
+#define XPRT_SOCK_WAKE_ERROR	(4)
+#define XPRT_SOCK_WAKE_WRITE	(5)
+#define XPRT_SOCK_WAKE_PENDING	(6)
+#define XPRT_SOCK_WAKE_DISCONNECT	(7)
 
 #endif /* __KERNEL__ */
 
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index 36652352a38c..92af57019b96 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -1211,6 +1211,15 @@ static void xs_sock_reset_state_flags(struct rpc_xprt *xprt)
 	struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt);
 
 	clear_bit(XPRT_SOCK_DATA_READY, &transport->sock_state);
+	clear_bit(XPRT_SOCK_WAKE_ERROR, &transport->sock_state);
+	clear_bit(XPRT_SOCK_WAKE_WRITE, &transport->sock_state);
+	clear_bit(XPRT_SOCK_WAKE_DISCONNECT, &transport->sock_state);
+}
+
+static void xs_run_error_worker(struct sock_xprt *transport, unsigned int nr)
+{
+	set_bit(nr, &transport->sock_state);
+	queue_work(xprtiod_workqueue, &transport->error_worker);
 }
 
 static void xs_sock_reset_connection_flags(struct rpc_xprt *xprt)
@@ -1231,6 +1240,7 @@ static void xs_sock_reset_connection_flags(struct rpc_xprt *xprt)
  */
 static void xs_error_report(struct sock *sk)
 {
+	struct sock_xprt *transport;
 	struct rpc_xprt *xprt;
 	int err;
 
@@ -1238,13 +1248,14 @@ static void xs_error_report(struct sock *sk)
 	if (!(xprt = xprt_from_sock(sk)))
 		goto out;
 
+	transport = container_of(xprt, struct sock_xprt, xprt);
 	err = -sk->sk_err;
 	if (err == 0)
 		goto out;
 	dprintk("RPC:       xs_error_report client %p, error=%d...\n",
 			xprt, -err);
 	trace_rpc_socket_error(xprt, sk->sk_socket, err);
-	xprt_wake_pending_tasks(xprt, err);
+	xs_run_error_worker(transport, XPRT_SOCK_WAKE_ERROR);
  out:
 	read_unlock_bh(&sk->sk_callback_lock);
 }
@@ -1507,7 +1518,7 @@ static void xs_tcp_state_change(struct sock *sk)
 			xprt->stat.connect_count++;
 			xprt->stat.connect_time += (long)jiffies -
 						   xprt->stat.connect_start;
-			xprt_wake_pending_tasks(xprt, -EAGAIN);
+			xs_run_error_worker(transport, XPRT_SOCK_WAKE_PENDING);
 		}
 		spin_unlock(&xprt->transport_lock);
 		break;
@@ -1525,7 +1536,7 @@ static void xs_tcp_state_change(struct sock *sk)
 		/* The server initiated a shutdown of the socket */
 		xprt->connect_cookie++;
 		clear_bit(XPRT_CONNECTED, &xprt->state);
-		xs_tcp_force_close(xprt);
+		xs_run_error_worker(transport, XPRT_SOCK_WAKE_DISCONNECT);
 		/* fall through */
 	case TCP_CLOSING:
 		/*
@@ -1547,7 +1558,7 @@ static void xs_tcp_state_change(struct sock *sk)
 			xprt_clear_connecting(xprt);
 		clear_bit(XPRT_CLOSING, &xprt->state);
 		/* Trigger the socket release */
-		xs_tcp_force_close(xprt);
+		xs_run_error_worker(transport, XPRT_SOCK_WAKE_DISCONNECT);
 	}
  out:
 	read_unlock_bh(&sk->sk_callback_lock);
@@ -1556,6 +1567,7 @@ static void xs_tcp_state_change(struct sock *sk)
 static void xs_write_space(struct sock *sk)
 {
 	struct socket_wq *wq;
+	struct sock_xprt *transport;
 	struct rpc_xprt *xprt;
 
 	if (!sk->sk_socket)
@@ -1564,13 +1576,14 @@ static void xs_write_space(struct sock *sk)
 
 	if (unlikely(!(xprt = xprt_from_sock(sk))))
 		return;
+	transport = container_of(xprt, struct sock_xprt, xprt);
 	rcu_read_lock();
 	wq = rcu_dereference(sk->sk_wq);
 	if (!wq || test_and_clear_bit(SOCKWQ_ASYNC_NOSPACE, &wq->flags) == 0)
 		goto out;
 
-	if (xprt_write_space(xprt))
-		sk->sk_write_pending--;
+	xs_run_error_worker(transport, XPRT_SOCK_WAKE_WRITE);
+	sk->sk_write_pending--;
 out:
 	rcu_read_unlock();
 }
@@ -2461,6 +2474,56 @@ static void xs_connect(struct rpc_xprt *xprt, struct rpc_task *task)
 			delay);
 }
 
+static void xs_wake_disconnect(struct sock_xprt *transport)
+{
+	if (test_and_clear_bit(XPRT_SOCK_WAKE_DISCONNECT, &transport->sock_state))
+		xs_tcp_force_close(&transport->xprt);
+}
+
+static void xs_wake_write(struct sock_xprt *transport)
+{
+	if (test_and_clear_bit(XPRT_SOCK_WAKE_WRITE, &transport->sock_state))
+		xprt_write_space(&transport->xprt);
+}
+
+static void xs_wake_error(struct sock_xprt *transport)
+{
+	int sockerr;
+	int sockerr_len = sizeof(sockerr);
+
+	if (!test_bit(XPRT_SOCK_WAKE_ERROR, &transport->sock_state))
+		return;
+	mutex_lock(&transport->recv_mutex);
+	if (transport->sock == NULL)
+		goto out;
+	if (!test_and_clear_bit(XPRT_SOCK_WAKE_ERROR, &transport->sock_state))
+		goto out;
+	if (kernel_getsockopt(transport->sock, SOL_SOCKET, SO_ERROR,
+				(char *)&sockerr, &sockerr_len) != 0)
+		goto out;
+	if (sockerr < 0)
+		xprt_wake_pending_tasks(&transport->xprt, sockerr);
+out:
+	mutex_unlock(&transport->recv_mutex);
+}
+
+static void xs_wake_pending(struct sock_xprt *transport)
+{
+	if (test_and_clear_bit(XPRT_SOCK_WAKE_PENDING, &transport->sock_state))
+		xprt_wake_pending_tasks(&transport->xprt, -EAGAIN);
+}
+
+static void xs_error_handle(struct work_struct *work)
+{
+	struct sock_xprt *transport = container_of(work,
+			struct sock_xprt, error_worker);
+
+	xs_wake_disconnect(transport);
+	xs_wake_write(transport);
+	xs_wake_error(transport);
+	xs_wake_pending(transport);
+}
+
 /**
  * xs_local_print_stats - display AF_LOCAL socket-specifc stats
  * @xprt: rpc_xprt struct containing statistics
@@ -2873,6 +2936,7 @@ static struct rpc_xprt *xs_setup_local(struct xprt_create *args)
 	xprt->timeout = &xs_local_default_timeout;
 
 	INIT_WORK(&transport->recv_worker, xs_stream_data_receive_workfn);
+	INIT_WORK(&transport->error_worker, xs_error_handle);
 	INIT_DELAYED_WORK(&transport->connect_worker, xs_dummy_setup_socket);
 
 	switch (sun->sun_family) {
@@ -2943,6 +3007,7 @@ static struct rpc_xprt *xs_setup_udp(struct xprt_create *args)
 	xprt->timeout = &xs_udp_default_timeout;
 
 	INIT_WORK(&transport->recv_worker, xs_udp_data_receive_workfn);
+	INIT_WORK(&transport->error_worker, xs_error_handle);
 	INIT_DELAYED_WORK(&transport->connect_worker, xs_udp_setup_socket);
 
 	switch (addr->sa_family) {
@@ -3024,6 +3089,7 @@ static struct rpc_xprt *xs_setup_tcp(struct xprt_create *args)
 		(xprt->timeout->to_retries + 1);
 
 	INIT_WORK(&transport->recv_worker, xs_stream_data_receive_workfn);
+	INIT_WORK(&transport->error_worker, xs_error_handle);
 	INIT_DELAYED_WORK(&transport->connect_worker, xs_tcp_setup_socket);
 
 	switch (addr->sa_family) {
-- 
cgit v1.2.3


From 21f0ffaff510b0530bfdf77da7133c0b99dee2fe Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Fri, 28 Apr 2017 10:52:42 -0400
Subject: SUNRPC: Add basic load balancing to the transport switch

For now, just count the queue length. It is less accurate than counting
the number of bytes queued, but easier to implement.

Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 include/linux/sunrpc/xprt.h          |  1 +
 include/linux/sunrpc/xprtmultipath.h |  2 ++
 net/sunrpc/clnt.c                    | 40 +++++++++++++++++++++++++++++++++---
 net/sunrpc/xprtmultipath.c           | 20 +++++++++++++++++-
 4 files changed, 59 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h
index a6d9fce7f20e..15322c1d9c8c 100644
--- a/include/linux/sunrpc/xprt.h
+++ b/include/linux/sunrpc/xprt.h
@@ -238,6 +238,7 @@ struct rpc_xprt {
 	/*
 	 * Send stuff
 	 */
+	atomic_long_t		queuelen;
 	spinlock_t		transport_lock;	/* lock transport info */
 	spinlock_t		reserve_lock;	/* lock slot table */
 	spinlock_t		queue_lock;	/* send/receive queue lock */
diff --git a/include/linux/sunrpc/xprtmultipath.h b/include/linux/sunrpc/xprtmultipath.h
index af1257c030d2..c6cce3fbf29d 100644
--- a/include/linux/sunrpc/xprtmultipath.h
+++ b/include/linux/sunrpc/xprtmultipath.h
@@ -15,6 +15,8 @@ struct rpc_xprt_switch {
 	struct kref		xps_kref;
 
 	unsigned int		xps_nxprts;
+	unsigned int		xps_nactive;
+	atomic_long_t		xps_queuelen;
 	struct list_head	xps_xprt_list;
 
 	struct net *		xps_net;
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index b03bfa055c08..976eab68bb5d 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -968,13 +968,47 @@ out:
 }
 EXPORT_SYMBOL_GPL(rpc_bind_new_program);
 
+static struct rpc_xprt *
+rpc_task_get_xprt(struct rpc_clnt *clnt)
+{
+	struct rpc_xprt_switch *xps;
+	struct rpc_xprt *xprt= xprt_iter_get_next(&clnt->cl_xpi);
+
+	if (!xprt)
+		return NULL;
+	rcu_read_lock();
+	xps = rcu_dereference(clnt->cl_xpi.xpi_xpswitch);
+	atomic_long_inc(&xps->xps_queuelen);
+	rcu_read_unlock();
+	atomic_long_inc(&xprt->queuelen);
+
+	return xprt;
+}
+
+static void
+rpc_task_release_xprt(struct rpc_clnt *clnt, struct rpc_xprt *xprt)
+{
+	struct rpc_xprt_switch *xps;
+
+	atomic_long_dec(&xprt->queuelen);
+	rcu_read_lock();
+	xps = rcu_dereference(clnt->cl_xpi.xpi_xpswitch);
+	atomic_long_dec(&xps->xps_queuelen);
+	rcu_read_unlock();
+
+	xprt_put(xprt);
+}
+
 void rpc_task_release_transport(struct rpc_task *task)
 {
 	struct rpc_xprt *xprt = task->tk_xprt;
 
 	if (xprt) {
 		task->tk_xprt = NULL;
-		xprt_put(xprt);
+		if (task->tk_client)
+			rpc_task_release_xprt(task->tk_client, xprt);
+		else
+			xprt_put(xprt);
 	}
 }
 EXPORT_SYMBOL_GPL(rpc_task_release_transport);
@@ -983,6 +1017,7 @@ void rpc_task_release_client(struct rpc_task *task)
 {
 	struct rpc_clnt *clnt = task->tk_client;
 
+	rpc_task_release_transport(task);
 	if (clnt != NULL) {
 		/* Remove from client task list */
 		spin_lock(&clnt->cl_lock);
@@ -992,14 +1027,13 @@ void rpc_task_release_client(struct rpc_task *task)
 
 		rpc_release_client(clnt);
 	}
-	rpc_task_release_transport(task);
 }
 
 static
 void rpc_task_set_transport(struct rpc_task *task, struct rpc_clnt *clnt)
 {
 	if (!task->tk_xprt)
-		task->tk_xprt = xprt_iter_get_next(&clnt->cl_xpi);
+		task->tk_xprt = rpc_task_get_xprt(clnt);
 }
 
 static
diff --git a/net/sunrpc/xprtmultipath.c b/net/sunrpc/xprtmultipath.c
index 8394124126f8..394e427533be 100644
--- a/net/sunrpc/xprtmultipath.c
+++ b/net/sunrpc/xprtmultipath.c
@@ -36,6 +36,7 @@ static void xprt_switch_add_xprt_locked(struct rpc_xprt_switch *xps,
 	if (xps->xps_nxprts == 0)
 		xps->xps_net = xprt->xprt_net;
 	xps->xps_nxprts++;
+	xps->xps_nactive++;
 }
 
 /**
@@ -62,6 +63,7 @@ static void xprt_switch_remove_xprt_locked(struct rpc_xprt_switch *xps,
 {
 	if (unlikely(xprt == NULL))
 		return;
+	xps->xps_nactive--;
 	xps->xps_nxprts--;
 	if (xps->xps_nxprts == 0)
 		xps->xps_net = NULL;
@@ -317,8 +319,24 @@ struct rpc_xprt *xprt_switch_find_next_entry_roundrobin(struct list_head *head,
 static
 struct rpc_xprt *xprt_iter_next_entry_roundrobin(struct rpc_xprt_iter *xpi)
 {
-	return xprt_iter_next_entry_multiple(xpi,
+	struct rpc_xprt_switch *xps = rcu_dereference(xpi->xpi_xpswitch);
+	struct rpc_xprt *xprt;
+	unsigned long xprt_queuelen;
+	unsigned long xps_queuelen;
+	unsigned long xps_avglen;
+
+	do {
+		xprt = xprt_iter_next_entry_multiple(xpi,
 			xprt_switch_find_next_entry_roundrobin);
+		if (xprt == NULL)
+			break;
+		xprt_queuelen = atomic_long_read(&xprt->queuelen);
+		if (xprt_queuelen <= 2)
+			break;
+		xps_queuelen = atomic_long_read(&xps->xps_queuelen);
+		xps_avglen = DIV_ROUND_UP(xps_queuelen, xps->xps_nactive);
+	} while (xprt_queuelen > xps_avglen);
+	return xprt;
 }
 
 static
-- 
cgit v1.2.3


From 1c341b777501613aad83f9c233a3fe5701cff083 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Wed, 22 May 2019 08:38:57 -0400
Subject: NFS: Add deferred cache invalidation for close-to-open consistency
 violations

If the client detects that close-to-open cache consistency has been
violated, and that the file or directory has been changed on the
server, then do a cache invalidation when we're done working with
the file.
The reason we don't do an immediate cache invalidation is that we
want to avoid performance problems due to false positives. Also,
note that we cannot guarantee cache consistency in this situation
even if we do invalidate the cache.

Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 fs/nfs/dir.c           |  4 ++++
 fs/nfs/inode.c         | 15 +++++++++++----
 include/linux/nfs_fs.h |  2 ++
 3 files changed, 17 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 57b6a45576ad..bd1f9555447b 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -80,6 +80,10 @@ static struct nfs_open_dir_context *alloc_nfs_open_dir_context(struct inode *dir
 		ctx->dup_cookie = 0;
 		ctx->cred = get_cred(cred);
 		spin_lock(&dir->i_lock);
+		if (list_empty(&nfsi->open_files) &&
+		    (nfsi->cache_validity & NFS_INO_DATA_INVAL_DEFER))
+			nfsi->cache_validity |= NFS_INO_INVALID_DATA |
+				NFS_INO_REVAL_FORCED;
 		list_add(&ctx->list, &nfsi->open_files);
 		spin_unlock(&dir->i_lock);
 		return ctx;
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 53777813ca95..ea52c71534b5 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -208,7 +208,7 @@ static void nfs_set_cache_invalid(struct inode *inode, unsigned long flags)
 	}
 
 	if (inode->i_mapping->nrpages == 0)
-		flags &= ~NFS_INO_INVALID_DATA;
+		flags &= ~(NFS_INO_INVALID_DATA|NFS_INO_DATA_INVAL_DEFER);
 	nfsi->cache_validity |= flags;
 	if (flags & NFS_INO_INVALID_DATA)
 		nfs_fscache_invalidate(inode);
@@ -652,7 +652,8 @@ static int nfs_vmtruncate(struct inode * inode, loff_t offset)
 	i_size_write(inode, offset);
 	/* Optimisation */
 	if (offset == 0)
-		NFS_I(inode)->cache_validity &= ~NFS_INO_INVALID_DATA;
+		NFS_I(inode)->cache_validity &= ~(NFS_INO_INVALID_DATA |
+				NFS_INO_DATA_INVAL_DEFER);
 	NFS_I(inode)->cache_validity &= ~NFS_INO_INVALID_SIZE;
 
 	spin_unlock(&inode->i_lock);
@@ -1032,6 +1033,10 @@ void nfs_inode_attach_open_context(struct nfs_open_context *ctx)
 	struct nfs_inode *nfsi = NFS_I(inode);
 
 	spin_lock(&inode->i_lock);
+	if (list_empty(&nfsi->open_files) &&
+	    (nfsi->cache_validity & NFS_INO_DATA_INVAL_DEFER))
+		nfsi->cache_validity |= NFS_INO_INVALID_DATA |
+			NFS_INO_REVAL_FORCED;
 	list_add_tail_rcu(&ctx->list, &nfsi->open_files);
 	spin_unlock(&inode->i_lock);
 }
@@ -1313,7 +1318,8 @@ int nfs_revalidate_mapping(struct inode *inode,
 
 	set_bit(NFS_INO_INVALIDATING, bitlock);
 	smp_wmb();
-	nfsi->cache_validity &= ~NFS_INO_INVALID_DATA;
+	nfsi->cache_validity &= ~(NFS_INO_INVALID_DATA|
+			NFS_INO_DATA_INVAL_DEFER);
 	spin_unlock(&inode->i_lock);
 	trace_nfs_invalidate_mapping_enter(inode);
 	ret = nfs_invalidate_mapping(inode, mapping);
@@ -1871,7 +1877,8 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
 				dprintk("NFS: change_attr change on server for file %s/%ld\n",
 						inode->i_sb->s_id,
 						inode->i_ino);
-			}
+			} else if (!have_delegation)
+				nfsi->cache_validity |= NFS_INO_DATA_INVAL_DEFER;
 			inode_set_iversion_raw(inode, fattr->change_attr);
 			attr_changed = true;
 		}
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index d363d5765cdf..0a11712a80e3 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -223,6 +223,8 @@ struct nfs4_copy_state {
 #define NFS_INO_INVALID_MTIME	BIT(10)		/* cached mtime is invalid */
 #define NFS_INO_INVALID_SIZE	BIT(11)		/* cached size is invalid */
 #define NFS_INO_INVALID_OTHER	BIT(12)		/* other attrs are invalid */
+#define NFS_INO_DATA_INVAL_DEFER	\
+				BIT(13)		/* Deferred cache invalidation */
 
 #define NFS_INO_INVALID_ATTR	(NFS_INO_INVALID_CHANGE \
 		| NFS_INO_INVALID_CTIME \
-- 
cgit v1.2.3


From 612b41f808a98a124b23d72229693c3181733291 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Thu, 27 Apr 2017 08:50:51 -0400
Subject: SUNRPC: Allow creation of RPC clients with multiple connections

Add an argument to struct rpc_create_args that allows the specification
of how many transport connections you want to set up to the server.

Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 include/linux/sunrpc/clnt.h |  1 +
 net/sunrpc/clnt.c           | 17 ++++++++++++++++-
 net/sunrpc/xprtmultipath.c  |  3 +--
 3 files changed, 18 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h
index 6e8073140a5d..4619098affa3 100644
--- a/include/linux/sunrpc/clnt.h
+++ b/include/linux/sunrpc/clnt.h
@@ -124,6 +124,7 @@ struct rpc_create_args {
 	u32			prognumber;	/* overrides program->number */
 	u32			version;
 	rpc_authflavor_t	authflavor;
+	u32			nconnect;
 	unsigned long		flags;
 	char			*client_name;
 	struct svc_xprt		*bc_xprt;	/* NFSv4.1 backchannel */
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 976eab68bb5d..b6aca8cb5ae6 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -528,6 +528,8 @@ struct rpc_clnt *rpc_create(struct rpc_create_args *args)
 		.bc_xprt = args->bc_xprt,
 	};
 	char servername[48];
+	struct rpc_clnt *clnt;
+	int i;
 
 	if (args->bc_xprt) {
 		WARN_ON_ONCE(!(args->protocol & XPRT_TRANSPORT_BC));
@@ -590,7 +592,15 @@ struct rpc_clnt *rpc_create(struct rpc_create_args *args)
 	if (args->flags & RPC_CLNT_CREATE_NONPRIVPORT)
 		xprt->resvport = 0;
 
-	return rpc_create_xprt(args, xprt);
+	clnt = rpc_create_xprt(args, xprt);
+	if (IS_ERR(clnt) || args->nconnect <= 1)
+		return clnt;
+
+	for (i = 0; i < args->nconnect - 1; i++) {
+		if (rpc_clnt_add_xprt(clnt, &xprtargs, NULL, NULL) < 0)
+			break;
+	}
+	return clnt;
 }
 EXPORT_SYMBOL_GPL(rpc_create);
 
@@ -2730,6 +2740,10 @@ int rpc_clnt_test_and_add_xprt(struct rpc_clnt *clnt,
 		return -ENOMEM;
 	data->xps = xprt_switch_get(xps);
 	data->xprt = xprt_get(xprt);
+	if (rpc_xprt_switch_has_addr(data->xps, (struct sockaddr *)&xprt->addr)) {
+		rpc_cb_add_xprt_release(data);
+		goto success;
+	}
 
 	task = rpc_call_null_helper(clnt, xprt, NULL,
 			RPC_TASK_SOFT|RPC_TASK_SOFTCONN|RPC_TASK_ASYNC|RPC_TASK_NULLCREDS,
@@ -2737,6 +2751,7 @@ int rpc_clnt_test_and_add_xprt(struct rpc_clnt *clnt,
 	if (IS_ERR(task))
 		return PTR_ERR(task);
 	rpc_put_task(task);
+success:
 	return 1;
 }
 EXPORT_SYMBOL_GPL(rpc_clnt_test_and_add_xprt);
diff --git a/net/sunrpc/xprtmultipath.c b/net/sunrpc/xprtmultipath.c
index 394e427533be..9d66ce53355d 100644
--- a/net/sunrpc/xprtmultipath.c
+++ b/net/sunrpc/xprtmultipath.c
@@ -52,8 +52,7 @@ void rpc_xprt_switch_add_xprt(struct rpc_xprt_switch *xps,
 	if (xprt == NULL)
 		return;
 	spin_lock(&xps->xps_lock);
-	if ((xps->xps_net == xprt->xprt_net || xps->xps_net == NULL) &&
-	    !rpc_xprt_switch_has_addr(xps, (struct sockaddr *)&xprt->addr))
+	if (xps->xps_net == xprt->xprt_net || xps->xps_net == NULL)
 		xprt_switch_add_xprt_locked(xps, xprt);
 	spin_unlock(&xps->xps_lock);
 }
-- 
cgit v1.2.3


From 6619079d05404cb32be29af329b87ac3b0ad4f96 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Thu, 27 Apr 2017 11:13:40 -0400
Subject: NFSv4: Allow multiple connections to NFSv4.x (x>0) servers

If the user specifies the -onconn=<number> mount option, and the transport
protocol is TCP, then set up <number> connections to the server. The
connections will all go to the same IP address.

Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/client.c           |  2 ++
 fs/nfs/internal.h         |  1 +
 fs/nfs/nfs4client.c       | 11 +++++++++--
 include/linux/nfs_fs_sb.h |  1 +
 4 files changed, 13 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index d7e4f0848e28..fa6953e56a71 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -175,6 +175,7 @@ struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_init)
 	clp->cl_rpcclient = ERR_PTR(-EINVAL);
 
 	clp->cl_proto = cl_init->proto;
+	clp->cl_nconnect = cl_init->nconnect;
 	clp->cl_net = get_net(cl_init->net);
 
 	clp->cl_principal = "*";
@@ -493,6 +494,7 @@ int nfs_create_rpc_client(struct nfs_client *clp,
 	struct rpc_create_args args = {
 		.net		= clp->cl_net,
 		.protocol	= clp->cl_proto,
+		.nconnect	= clp->cl_nconnect,
 		.address	= (struct sockaddr *)&clp->cl_addr,
 		.addrsize	= clp->cl_addrlen,
 		.timeout	= cl_init->timeparms,
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index bba09dace5d6..4a49dc1495c5 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -82,6 +82,7 @@ struct nfs_client_initdata {
 	struct nfs_subversion *nfs_mod;
 	int proto;
 	u32 minorversion;
+	unsigned int nconnect;
 	struct net *net;
 	const struct rpc_timeout *timeparms;
 	const struct cred *cred;
diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c
index 81b9b6d7927a..5c244c440658 100644
--- a/fs/nfs/nfs4client.c
+++ b/fs/nfs/nfs4client.c
@@ -859,7 +859,8 @@ static int nfs4_set_client(struct nfs_server *server,
 		const size_t addrlen,
 		const char *ip_addr,
 		int proto, const struct rpc_timeout *timeparms,
-		u32 minorversion, struct net *net)
+		u32 minorversion, unsigned int nconnect,
+		struct net *net)
 {
 	struct nfs_client_initdata cl_init = {
 		.hostname = hostname,
@@ -875,6 +876,8 @@ static int nfs4_set_client(struct nfs_server *server,
 	};
 	struct nfs_client *clp;
 
+	if (minorversion > 0 && proto == XPRT_TRANSPORT_TCP)
+		cl_init.nconnect = nconnect;
 	if (server->flags & NFS_MOUNT_NORESVPORT)
 		set_bit(NFS_CS_NORESVPORT, &cl_init.init_flags);
 	if (server->options & NFS_OPTION_MIGRATION)
@@ -1074,6 +1077,7 @@ static int nfs4_init_server(struct nfs_server *server,
 			data->nfs_server.protocol,
 			&timeparms,
 			data->minorversion,
+			data->nfs_server.nconnect,
 			data->net);
 	if (error < 0)
 		return error;
@@ -1163,6 +1167,7 @@ struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *data,
 				XPRT_TRANSPORT_RDMA,
 				parent_server->client->cl_timeout,
 				parent_client->cl_mvops->minor_version,
+				parent_client->cl_nconnect,
 				parent_client->cl_net);
 	if (!error)
 		goto init_server;
@@ -1176,6 +1181,7 @@ struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *data,
 				XPRT_TRANSPORT_TCP,
 				parent_server->client->cl_timeout,
 				parent_client->cl_mvops->minor_version,
+				parent_client->cl_nconnect,
 				parent_client->cl_net);
 	if (error < 0)
 		goto error;
@@ -1271,7 +1277,8 @@ int nfs4_update_server(struct nfs_server *server, const char *hostname,
 	set_bit(NFS_MIG_TSM_POSSIBLE, &server->mig_status);
 	error = nfs4_set_client(server, hostname, sap, salen, buf,
 				clp->cl_proto, clnt->cl_timeout,
-				clp->cl_minorversion, net);
+				clp->cl_minorversion,
+				clp->cl_nconnect, net);
 	clear_bit(NFS_MIG_TSM_POSSIBLE, &server->mig_status);
 	if (error != 0) {
 		nfs_server_insert_lists(server);
diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h
index 1e78032a174b..a87fe854f008 100644
--- a/include/linux/nfs_fs_sb.h
+++ b/include/linux/nfs_fs_sb.h
@@ -58,6 +58,7 @@ struct nfs_client {
 	struct nfs_subversion *	cl_nfs_mod;	/* pointer to nfs version module */
 
 	u32			cl_minorversion;/* NFSv4 minorversion */
+	unsigned int		cl_nconnect;	/* Number of connections */
 	const char *		cl_principal;  /* used for machine cred */
 
 #if IS_ENABLED(CONFIG_NFS_V4)
-- 
cgit v1.2.3


From 5a0c257f8e0f4c4b3c33dff545317c21a921303e Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.com>
Date: Thu, 30 May 2019 10:41:28 +1000
Subject: NFS: send state management on a single connection.

With NFSv4.1, different network connections need to be explicitly
bound to a session.  During session startup, this is not possible
so only a single connection must be used for session startup.

So add a task flag to disable the default round-robin choice of
connections (when nconnect > 1) and force the use of a single
connection.
Then use that flag on all requests for session management - for
consistency, include NFSv4.0 management (SETCLIENTID) and session
destruction.

Reported-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: NeilBrown <neilb@suse.com>
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 fs/nfs/nfs4proc.c            | 22 +++++++++++++---------
 include/linux/sunrpc/sched.h |  1 +
 net/sunrpc/clnt.c            | 24 +++++++++++++++++++++++-
 3 files changed, 37 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 26626ea1f197..d115d9973efc 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -5992,7 +5992,7 @@ int nfs4_proc_setclientid(struct nfs_client *clp, u32 program,
 		.rpc_message = &msg,
 		.callback_ops = &nfs4_setclientid_ops,
 		.callback_data = &setclientid,
-		.flags = RPC_TASK_TIMEOUT,
+		.flags = RPC_TASK_TIMEOUT | RPC_TASK_NO_ROUND_ROBIN,
 	};
 	int status;
 
@@ -6058,7 +6058,8 @@ int nfs4_proc_setclientid_confirm(struct nfs_client *clp,
 	dprintk("NFS call  setclientid_confirm auth=%s, (client ID %llx)\n",
 		clp->cl_rpcclient->cl_auth->au_ops->au_name,
 		clp->cl_clientid);
-	status = rpc_call_sync(clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT);
+	status = rpc_call_sync(clp->cl_rpcclient, &msg,
+			       RPC_TASK_TIMEOUT | RPC_TASK_NO_ROUND_ROBIN);
 	trace_nfs4_setclientid_confirm(clp, status);
 	dprintk("NFS reply setclientid_confirm: %d\n", status);
 	return status;
@@ -7639,7 +7640,7 @@ static int _nfs4_proc_secinfo(struct inode *dir, const struct qstr *name, struct
 		NFS_SP4_MACH_CRED_SECINFO, &clnt, &msg);
 
 	status = nfs4_call_sync(clnt, NFS_SERVER(dir), &msg, &args.seq_args,
-				&res.seq_res, 0);
+				&res.seq_res, RPC_TASK_NO_ROUND_ROBIN);
 	dprintk("NFS reply  secinfo: %d\n", status);
 
 	put_cred(cred);
@@ -7977,7 +7978,7 @@ nfs4_run_exchange_id(struct nfs_client *clp, const struct cred *cred,
 		.rpc_client = clp->cl_rpcclient,
 		.callback_ops = &nfs4_exchange_id_call_ops,
 		.rpc_message = &msg,
-		.flags = RPC_TASK_TIMEOUT,
+		.flags = RPC_TASK_TIMEOUT | RPC_TASK_NO_ROUND_ROBIN,
 	};
 	struct nfs41_exchange_id_data *calldata;
 	int status;
@@ -8202,7 +8203,8 @@ static int _nfs4_proc_destroy_clientid(struct nfs_client *clp,
 	};
 	int status;
 
-	status = rpc_call_sync(clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT);
+	status = rpc_call_sync(clp->cl_rpcclient, &msg,
+			       RPC_TASK_TIMEOUT | RPC_TASK_NO_ROUND_ROBIN);
 	trace_nfs4_destroy_clientid(clp, status);
 	if (status)
 		dprintk("NFS: Got error %d from the server %s on "
@@ -8481,7 +8483,8 @@ static int _nfs4_proc_create_session(struct nfs_client *clp,
 	nfs4_init_channel_attrs(&args, clp->cl_rpcclient);
 	args.flags = (SESSION4_PERSIST | SESSION4_BACK_CHAN);
 
-	status = rpc_call_sync(session->clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT);
+	status = rpc_call_sync(session->clp->cl_rpcclient, &msg,
+			       RPC_TASK_TIMEOUT | RPC_TASK_NO_ROUND_ROBIN);
 	trace_nfs4_create_session(clp, status);
 
 	switch (status) {
@@ -8557,7 +8560,8 @@ int nfs4_proc_destroy_session(struct nfs4_session *session,
 	if (!test_and_clear_bit(NFS4_SESSION_ESTABLISHED, &session->session_state))
 		return 0;
 
-	status = rpc_call_sync(session->clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT);
+	status = rpc_call_sync(session->clp->cl_rpcclient, &msg,
+			       RPC_TASK_TIMEOUT | RPC_TASK_NO_ROUND_ROBIN);
 	trace_nfs4_destroy_session(session->clp, status);
 
 	if (status)
@@ -8811,7 +8815,7 @@ static int nfs41_proc_reclaim_complete(struct nfs_client *clp,
 		.rpc_client = clp->cl_rpcclient,
 		.rpc_message = &msg,
 		.callback_ops = &nfs4_reclaim_complete_call_ops,
-		.flags = RPC_TASK_ASYNC,
+		.flags = RPC_TASK_ASYNC | RPC_TASK_NO_ROUND_ROBIN,
 	};
 	int status = -ENOMEM;
 
@@ -9330,7 +9334,7 @@ _nfs41_proc_secinfo_no_name(struct nfs_server *server, struct nfs_fh *fhandle,
 
 	dprintk("--> %s\n", __func__);
 	status = nfs4_call_sync(clnt, server, &msg, &args.seq_args,
-				&res.seq_res, 0);
+				&res.seq_res, RPC_TASK_NO_ROUND_ROBIN);
 	dprintk("<-- %s status=%d\n", __func__, status);
 
 	put_cred(cred);
diff --git a/include/linux/sunrpc/sched.h b/include/linux/sunrpc/sched.h
index d0e451868f02..11424bdf09e6 100644
--- a/include/linux/sunrpc/sched.h
+++ b/include/linux/sunrpc/sched.h
@@ -126,6 +126,7 @@ struct rpc_task_setup {
 #define RPC_CALL_MAJORSEEN	0x0020		/* major timeout seen */
 #define RPC_TASK_ROOTCREDS	0x0040		/* force root creds */
 #define RPC_TASK_DYNAMIC	0x0080		/* task was kmalloc'ed */
+#define	RPC_TASK_NO_ROUND_ROBIN	0x0100		/* send requests on "main" xprt */
 #define RPC_TASK_SOFT		0x0200		/* Use soft timeouts */
 #define RPC_TASK_SOFTCONN	0x0400		/* Fail if can't connect */
 #define RPC_TASK_SENT		0x0800		/* message was sent */
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index b6aca8cb5ae6..d599fab8adcb 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -995,6 +995,24 @@ rpc_task_get_xprt(struct rpc_clnt *clnt)
 	return xprt;
 }
 
+static struct rpc_xprt *
+rpc_task_get_first_xprt(struct rpc_clnt *clnt)
+{
+	struct rpc_xprt_switch *xps;
+	struct rpc_xprt *xprt;
+
+	rcu_read_lock();
+	xprt = xprt_get(rcu_dereference(clnt->cl_xprt));
+	if (xprt) {
+		atomic_long_inc(&xprt->queuelen);
+		xps = rcu_dereference(clnt->cl_xpi.xpi_xpswitch);
+		atomic_long_inc(&xps->xps_queuelen);
+	}
+	rcu_read_unlock();
+
+	return xprt;
+}
+
 static void
 rpc_task_release_xprt(struct rpc_clnt *clnt, struct rpc_xprt *xprt)
 {
@@ -1042,7 +1060,11 @@ void rpc_task_release_client(struct rpc_task *task)
 static
 void rpc_task_set_transport(struct rpc_task *task, struct rpc_clnt *clnt)
 {
-	if (!task->tk_xprt)
+	if (task->tk_xprt)
+		return;
+	if (task->tk_flags & RPC_TASK_NO_ROUND_ROBIN)
+		task->tk_xprt = rpc_task_get_first_xprt(clnt);
+	else
 		task->tk_xprt = rpc_task_get_xprt(clnt);
 }
 
-- 
cgit v1.2.3


From a332518fda4731c07394164b3edcbb6efaf4c4d7 Mon Sep 17 00:00:00 2001
From: Dave Wysochanski <dwysocha@redhat.com>
Date: Thu, 23 May 2019 16:13:50 -0400
Subject: SUNRPC: Count ops completing with tk_status < 0

We often see various error conditions with NFS4.x that show up with
a very high operation count all completing with tk_status < 0 in a
short period of time.  Add a count to rpc_iostats to record on a
per-op basis the ops that complete in this manner, which will
enable lower overhead diagnostics.

Signed-off-by: Dave Wysochanski <dwysocha@redhat.com>
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 include/linux/sunrpc/metrics.h | 7 ++++++-
 net/sunrpc/stats.c             | 8 ++++++--
 2 files changed, 12 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/metrics.h b/include/linux/sunrpc/metrics.h
index 1b3751327575..0ee3f7052846 100644
--- a/include/linux/sunrpc/metrics.h
+++ b/include/linux/sunrpc/metrics.h
@@ -30,7 +30,7 @@
 #include <linux/ktime.h>
 #include <linux/spinlock.h>
 
-#define RPC_IOSTATS_VERS	"1.0"
+#define RPC_IOSTATS_VERS	"1.1"
 
 struct rpc_iostats {
 	spinlock_t		om_lock;
@@ -66,6 +66,11 @@ struct rpc_iostats {
 	ktime_t			om_queue,	/* queued for xmit */
 				om_rtt,		/* RPC RTT */
 				om_execute;	/* RPC execution */
+	/*
+	 * The count of operations that complete with tk_status < 0.
+	 * These statuses usually indicate error conditions.
+	 */
+	unsigned long           om_error_status;
 } ____cacheline_aligned;
 
 struct rpc_task;
diff --git a/net/sunrpc/stats.c b/net/sunrpc/stats.c
index 2f7bde82450b..48ea776364f8 100644
--- a/net/sunrpc/stats.c
+++ b/net/sunrpc/stats.c
@@ -177,6 +177,8 @@ void rpc_count_iostats_metrics(const struct rpc_task *task,
 
 	execute = ktime_sub(now, task->tk_start);
 	op_metrics->om_execute = ktime_add(op_metrics->om_execute, execute);
+	if (task->tk_status < 0)
+		op_metrics->om_error_status++;
 
 	spin_unlock(&op_metrics->om_lock);
 
@@ -219,13 +221,14 @@ static void _add_rpc_iostats(struct rpc_iostats *a, struct rpc_iostats *b)
 	a->om_queue = ktime_add(a->om_queue, b->om_queue);
 	a->om_rtt = ktime_add(a->om_rtt, b->om_rtt);
 	a->om_execute = ktime_add(a->om_execute, b->om_execute);
+	a->om_error_status += b->om_error_status;
 }
 
 static void _print_rpc_iostats(struct seq_file *seq, struct rpc_iostats *stats,
 			       int op, const struct rpc_procinfo *procs)
 {
 	_print_name(seq, op, procs);
-	seq_printf(seq, "%lu %lu %lu %llu %llu %llu %llu %llu\n",
+	seq_printf(seq, "%lu %lu %lu %llu %llu %llu %llu %llu %lu\n",
 		   stats->om_ops,
 		   stats->om_ntrans,
 		   stats->om_timeouts,
@@ -233,7 +236,8 @@ static void _print_rpc_iostats(struct seq_file *seq, struct rpc_iostats *stats,
 		   stats->om_bytes_recv,
 		   ktime_to_ms(stats->om_queue),
 		   ktime_to_ms(stats->om_rtt),
-		   ktime_to_ms(stats->om_execute));
+		   ktime_to_ms(stats->om_execute),
+		   stats->om_error_status);
 }
 
 void rpc_clnt_show_stats(struct seq_file *seq, struct rpc_clnt *clnt)
-- 
cgit v1.2.3


From 1dd7382b1bb608e7ccae3672621eaceca355ae8b Mon Sep 17 00:00:00 2001
From: Max Gurtovoy <maxg@mellanox.com>
Date: Mon, 1 Jul 2019 21:14:01 +0300
Subject: net/mlx5: Introduce VHCA tunnel device capability

When using the device emulation feature (introduced in Bluefield-1 SOC),
a privileged function (the device emulation manager) will be able to
create a channel to execute commands on behalf of the emulated function.

This channel will be a general object of type VHCA_TUNNEL that will have
a unique ID for each emulated function. This ID will be passed in each
cmd that will be issued by the emulation SW in a well known offset in
the command header.

This channel is needed since the emulated function doesn't have a normal
command interface to the HCA HW, but some basic configuration for that
function is needed (e.g. initialize and enable the HCA). For that matter,
a specific command-set was defined and only those commands will be issued
by the HCA.

Signed-off-by: Max Gurtovoy <maxg@mellanox.com>
Reviewed-by: Yishai Hadas <yishaih@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
---
 include/linux/mlx5/mlx5_ifc.h | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index 06881b79167e..ba60bd17a92a 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -1390,7 +1390,9 @@ struct mlx5_ifc_cmd_hca_cap_bits {
 	u8	   reserved_at_6c8[0x28];
 	u8	   sf_base_id[0x10];
 
-	u8	   reserved_at_700[0x100];
+	u8	   reserved_at_700[0x80];
+	u8	   vhca_tunnel_commands[0x40];
+	u8	   reserved_at_7c0[0x40];
 };
 
 enum mlx5_flow_destination_type {
@@ -9694,7 +9696,7 @@ struct mlx5_ifc_general_obj_in_cmd_hdr_bits {
 	u8         opcode[0x10];
 	u8         uid[0x10];
 
-	u8         reserved_at_20[0x10];
+	u8         vhca_tunnel_id[0x10];
 	u8         obj_type[0x10];
 
 	u8         obj_id[0x20];
-- 
cgit v1.2.3


From 28ef2db8866495150e4260773fead8313f1a5625 Mon Sep 17 00:00:00 2001
From: Peter Xu <peterx@redhat.com>
Date: Fri, 28 Jun 2019 18:59:42 +0800
Subject: timer: Document TIMER_PINNED

The flag name suggests to the user that pinned timers will always run on
a static CPU (because that should be what "pinned" means...), but that is
not the case, at least with the current implementation.

For example, if a pinned timer is set up and mod_timer() is later invoked
on it, mod_timer() will currently still try to queue the timer on the
current processor and migrate the timer if necessary.

Document it a bit with the definition of TIMER_PINNED so that all future
users will use it correctly.

Signed-off-by: Peter Xu <peterx@redhat.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Luiz Capitulino <lcapitulino@redhat.com>
Link: https://lkml.kernel.org/r/20190628105942.14131-1-peterx@redhat.com
---
 include/linux/timer.h | 27 +++++++++++++++++++--------
 1 file changed, 19 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/timer.h b/include/linux/timer.h
index 7b066fd38248..282e4f2a532a 100644
--- a/include/linux/timer.h
+++ b/include/linux/timer.h
@@ -36,19 +36,30 @@ struct timer_list {
 #define __TIMER_LOCKDEP_MAP_INITIALIZER(_kn)
 #endif
 
-/*
- * A deferrable timer will work normally when the system is busy, but
- * will not cause a CPU to come out of idle just to service it; instead,
- * the timer will be serviced when the CPU eventually wakes up with a
- * subsequent non-deferrable timer.
+/**
+ * @TIMER_DEFERRABLE: A deferrable timer will work normally when the
+ * system is busy, but will not cause a CPU to come out of idle just
+ * to service it; instead, the timer will be serviced when the CPU
+ * eventually wakes up with a subsequent non-deferrable timer.
  *
- * An irqsafe timer is executed with IRQ disabled and it's safe to wait for
- * the completion of the running instance from IRQ handlers, for example,
- * by calling del_timer_sync().
+ * @TIMER_IRQSAFE: An irqsafe timer is executed with IRQ disabled and
+ * it's safe to wait for the completion of the running instance from
+ * IRQ handlers, for example, by calling del_timer_sync().
  *
  * Note: The irq disabled callback execution is a special case for
  * workqueue locking issues. It's not meant for executing random crap
  * with interrupts disabled. Abuse is monitored!
+ *
+ * @TIMER_PINNED: A pinned timer will not be affected by any timer
+ * placement heuristics (like, NOHZ) and will always expire on the CPU
+ * on which the timer was enqueued.
+ *
+ * Note: Because enqueuing of timers can migrate the timer from one
+ * CPU to another, pinned timers are not guaranteed to stay on the
+ * initially selected CPU.  They move to the CPU on which the enqueue
+ * function is invoked via mod_timer() or add_timer().  If the timer
+ * should be placed on a particular CPU, then add_timer_on() has to be
+ * used.
  */
 #define TIMER_CPUMASK		0x0003FFFF
 #define TIMER_MIGRATING		0x00040000
-- 
cgit v1.2.3


From 97a385e558292ba0851906783642239865670a5f Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 1 May 2019 16:40:32 -0400
Subject: libceph: remove ceph_get_direct_page_vector()

This function is entirely unused.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Ilya Dryomov <idryomov@gmail.com>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 include/linux/ceph/libceph.h |  4 ----
 net/ceph/pagevec.c           | 33 ---------------------------------
 2 files changed, 37 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h
index 337d5049ff93..a3cddf5f0e60 100644
--- a/include/linux/ceph/libceph.h
+++ b/include/linux/ceph/libceph.h
@@ -299,10 +299,6 @@ int ceph_wait_for_latest_osdmap(struct ceph_client *client,
 
 /* pagevec.c */
 extern void ceph_release_page_vector(struct page **pages, int num_pages);
-
-extern struct page **ceph_get_direct_page_vector(const void __user *data,
-						 int num_pages,
-						 bool write_page);
 extern void ceph_put_page_vector(struct page **pages, int num_pages,
 				 bool dirty);
 extern struct page **ceph_alloc_page_vector(int num_pages, gfp_t flags);
diff --git a/net/ceph/pagevec.c b/net/ceph/pagevec.c
index 74cafc0142ea..64305e7056a1 100644
--- a/net/ceph/pagevec.c
+++ b/net/ceph/pagevec.c
@@ -10,39 +10,6 @@
 
 #include <linux/ceph/libceph.h>
 
-/*
- * build a vector of user pages
- */
-struct page **ceph_get_direct_page_vector(const void __user *data,
-					  int num_pages, bool write_page)
-{
-	struct page **pages;
-	int got = 0;
-	int rc = 0;
-
-	pages = kmalloc_array(num_pages, sizeof(*pages), GFP_NOFS);
-	if (!pages)
-		return ERR_PTR(-ENOMEM);
-
-	while (got < num_pages) {
-		rc = get_user_pages_fast(
-		    (unsigned long)data + ((unsigned long)got * PAGE_SIZE),
-		    num_pages - got, write_page ? FOLL_WRITE : 0, pages + got);
-		if (rc < 0)
-			break;
-		BUG_ON(rc == 0);
-		got += rc;
-	}
-	if (rc < 0)
-		goto fail;
-	return pages;
-
-fail:
-	ceph_put_page_vector(pages, got, false);
-	return ERR_PTR(rc);
-}
-EXPORT_SYMBOL(ceph_get_direct_page_vector);
-
 void ceph_put_page_vector(struct page **pages, int num_pages, bool dirty)
 {
 	int i;
-- 
cgit v1.2.3


From 6c37f0e64173571914a443f74d36e5a22dabfc05 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@kernel.org>
Date: Mon, 3 Jun 2019 14:45:16 -0400
Subject: libceph: add ceph_decode_entity_addr

Add a function for decoding an entity_addr_t. Once
CEPH_FEATURE_MSG_ADDR2 is enabled, the server daemons will start
encoding entity_addr_t differently.

Add a new helper function that can handle either format.

Signed-off-by: Jeff Layton <jlayton@kernel.org>
Reviewed-by: "Yan, Zheng" <zyan@redhat.com>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 include/linux/ceph/decode.h |  2 +
 net/ceph/Makefile           |  2 +-
 net/ceph/decode.c           | 90 +++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 93 insertions(+), 1 deletion(-)
 create mode 100644 net/ceph/decode.c

(limited to 'include/linux')

diff --git a/include/linux/ceph/decode.h b/include/linux/ceph/decode.h
index a6c2a48d42e0..1c0a665bfc03 100644
--- a/include/linux/ceph/decode.h
+++ b/include/linux/ceph/decode.h
@@ -230,6 +230,8 @@ static inline void ceph_decode_addr(struct ceph_entity_addr *a)
 	WARN_ON(a->in_addr.ss_family == 512);
 }
 
+extern int ceph_decode_entity_addr(void **p, void *end,
+				   struct ceph_entity_addr *addr);
 /*
  * encoders
  */
diff --git a/net/ceph/Makefile b/net/ceph/Makefile
index db09defe27d0..59d0ba2072de 100644
--- a/net/ceph/Makefile
+++ b/net/ceph/Makefile
@@ -5,7 +5,7 @@
 obj-$(CONFIG_CEPH_LIB) += libceph.o
 
 libceph-y := ceph_common.o messenger.o msgpool.o buffer.o pagelist.o \
-	mon_client.o \
+	mon_client.o decode.o \
 	cls_lock_client.o \
 	osd_client.o osdmap.o crush/crush.o crush/mapper.o crush/hash.o \
 	striper.o \
diff --git a/net/ceph/decode.c b/net/ceph/decode.c
new file mode 100644
index 000000000000..b82981199549
--- /dev/null
+++ b/net/ceph/decode.c
@@ -0,0 +1,90 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/ceph/decode.h>
+
+static int
+ceph_decode_entity_addr_versioned(void **p, void *end,
+				  struct ceph_entity_addr *addr)
+{
+	int ret;
+	u8 struct_v;
+	u32 struct_len, addr_len;
+	void *struct_end;
+
+	ret = ceph_start_decoding(p, end, 1, "entity_addr_t", &struct_v,
+				  &struct_len);
+	if (ret)
+		goto bad;
+
+	ret = -EINVAL;
+	struct_end = *p + struct_len;
+
+	ceph_decode_copy_safe(p, end, &addr->type, sizeof(addr->type), bad);
+
+	/*
+	 * TYPE_NONE == 0
+	 * TYPE_LEGACY == 1
+	 *
+	 * Clients that don't support ADDR2 always send TYPE_NONE.
+	 * For now, since all we support is msgr1, just set this to 0
+	 * when we get a TYPE_LEGACY type.
+	 */
+	if (addr->type == cpu_to_le32(1))
+		addr->type = 0;
+
+	ceph_decode_copy_safe(p, end, &addr->nonce, sizeof(addr->nonce), bad);
+
+	ceph_decode_32_safe(p, end, addr_len, bad);
+	if (addr_len > sizeof(addr->in_addr))
+		goto bad;
+
+	memset(&addr->in_addr, 0, sizeof(addr->in_addr));
+	if (addr_len) {
+		ceph_decode_copy_safe(p, end, &addr->in_addr, addr_len, bad);
+
+		addr->in_addr.ss_family =
+			le16_to_cpu((__force __le16)addr->in_addr.ss_family);
+	}
+
+	/* Advance past anything the client doesn't yet understand */
+	*p = struct_end;
+	ret = 0;
+bad:
+	return ret;
+}
+
+static int
+ceph_decode_entity_addr_legacy(void **p, void *end,
+			       struct ceph_entity_addr *addr)
+{
+	int ret = -EINVAL;
+
+	/* Skip rest of type field */
+	ceph_decode_skip_n(p, end, 3, bad);
+	addr->type = 0;
+	ceph_decode_copy_safe(p, end, &addr->nonce, sizeof(addr->nonce), bad);
+	memset(&addr->in_addr, 0, sizeof(addr->in_addr));
+	ceph_decode_copy_safe(p, end, &addr->in_addr,
+			      sizeof(addr->in_addr), bad);
+	addr->in_addr.ss_family =
+			be16_to_cpu((__force __be16)addr->in_addr.ss_family);
+	ret = 0;
+bad:
+	return ret;
+}
+
+int
+ceph_decode_entity_addr(void **p, void *end, struct ceph_entity_addr *addr)
+{
+	u8 marker;
+
+	ceph_decode_8_safe(p, end, marker, bad);
+	if (marker == 1)
+		return ceph_decode_entity_addr_versioned(p, end, addr);
+	else if (marker == 0)
+		return ceph_decode_entity_addr_legacy(p, end, addr);
+bad:
+	return -EINVAL;
+}
+EXPORT_SYMBOL(ceph_decode_entity_addr);
+
-- 
cgit v1.2.3


From 0bfb0f288992adbf8d1f0d5f22f0fd398b146316 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@kernel.org>
Date: Fri, 31 May 2019 15:32:28 -0400
Subject: libceph: ADDR2 support for monmap

Switch the MonMap decoder to use the new decoding routine for
entity_addr_t's.

Signed-off-by: Jeff Layton <jlayton@kernel.org>
Reviewed-by: "Yan, Zheng" <zyan@redhat.com>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 include/linux/ceph/mon_client.h |  1 -
 net/ceph/mon_client.c           | 21 +++++++++++++--------
 2 files changed, 13 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ceph/mon_client.h b/include/linux/ceph/mon_client.h
index 3a4688af7455..b4d134d3312a 100644
--- a/include/linux/ceph/mon_client.h
+++ b/include/linux/ceph/mon_client.h
@@ -104,7 +104,6 @@ struct ceph_mon_client {
 #endif
 };
 
-extern struct ceph_monmap *ceph_monmap_decode(void *p, void *end);
 extern int ceph_monmap_contains(struct ceph_monmap *m,
 				struct ceph_entity_addr *addr);
 
diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c
index 895679d3529b..0520bf9825aa 100644
--- a/net/ceph/mon_client.c
+++ b/net/ceph/mon_client.c
@@ -39,7 +39,7 @@ static int __validate_auth(struct ceph_mon_client *monc);
 /*
  * Decode a monmap blob (e.g., during mount).
  */
-struct ceph_monmap *ceph_monmap_decode(void *p, void *end)
+static struct ceph_monmap *ceph_monmap_decode(void *p, void *end)
 {
 	struct ceph_monmap *m = NULL;
 	int i, err = -EINVAL;
@@ -50,7 +50,7 @@ struct ceph_monmap *ceph_monmap_decode(void *p, void *end)
 	ceph_decode_32_safe(&p, end, len, bad);
 	ceph_decode_need(&p, end, len, bad);
 
-	dout("monmap_decode %p %p len %d\n", p, end, (int)(end-p));
+	dout("monmap_decode %p %p len %d (%d)\n", p, end, len, (int)(end-p));
 	p += sizeof(u16);  /* skip version */
 
 	ceph_decode_need(&p, end, sizeof(fsid) + 2*sizeof(u32), bad);
@@ -58,7 +58,6 @@ struct ceph_monmap *ceph_monmap_decode(void *p, void *end)
 	epoch = ceph_decode_32(&p);
 
 	num_mon = ceph_decode_32(&p);
-	ceph_decode_need(&p, end, num_mon*sizeof(m->mon_inst[0]), bad);
 
 	if (num_mon > CEPH_MAX_MON)
 		goto bad;
@@ -68,17 +67,22 @@ struct ceph_monmap *ceph_monmap_decode(void *p, void *end)
 	m->fsid = fsid;
 	m->epoch = epoch;
 	m->num_mon = num_mon;
-	ceph_decode_copy(&p, m->mon_inst, num_mon*sizeof(m->mon_inst[0]));
-	for (i = 0; i < num_mon; i++)
-		ceph_decode_addr(&m->mon_inst[i].addr);
-
+	for (i = 0; i < num_mon; ++i) {
+		struct ceph_entity_inst *inst = &m->mon_inst[i];
+
+		/* copy name portion */
+		ceph_decode_copy_safe(&p, end, &inst->name,
+					sizeof(inst->name), bad);
+		err = ceph_decode_entity_addr(&p, end, &inst->addr);
+		if (err)
+			goto bad;
+	}
 	dout("monmap_decode epoch %d, num_mon %d\n", m->epoch,
 	     m->num_mon);
 	for (i = 0; i < m->num_mon; i++)
 		dout("monmap_decode  mon%d is %s\n", i,
 		     ceph_pr_addr(&m->mon_inst[i].addr));
 	return m;
-
 bad:
 	dout("monmap_decode failed with %d\n", err);
 	kfree(m);
@@ -469,6 +473,7 @@ static void ceph_monc_handle_map(struct ceph_mon_client *monc,
 	if (IS_ERR(monmap)) {
 		pr_err("problem decoding monmap, %d\n",
 		       (int)PTR_ERR(monmap));
+		ceph_msg_dump(msg);
 		goto out;
 	}
 
-- 
cgit v1.2.3


From d3c3c0a841d5dafc5395be363996d619255a732f Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@kernel.org>
Date: Mon, 17 Jun 2019 06:57:25 -0400
Subject: libceph: use TYPE_LEGACY for entity addrs instead of TYPE_NONE

Going forward, we'll have different address types so let's use
the addr2 TYPE_LEGACY for internal tracking rather than TYPE_NONE.

Also, make ceph_pr_addr print the address type value as well.

Signed-off-by: Jeff Layton <jlayton@kernel.org>
Reviewed-by: "Yan, Zheng" <zyan@redhat.com>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 include/linux/ceph/decode.h |  7 +++++++
 net/ceph/decode.c           | 18 ++++++------------
 net/ceph/messenger.c        |  7 +++++--
 3 files changed, 18 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ceph/decode.h b/include/linux/ceph/decode.h
index 1c0a665bfc03..ce488d95be89 100644
--- a/include/linux/ceph/decode.h
+++ b/include/linux/ceph/decode.h
@@ -218,16 +218,23 @@ static inline void ceph_encode_timespec64(struct ceph_timespec *tv,
 /*
  * sockaddr_storage <-> ceph_sockaddr
  */
+#define CEPH_ENTITY_ADDR_TYPE_NONE	0
+#define CEPH_ENTITY_ADDR_TYPE_LEGACY	__cpu_to_le32(1)
+
 static inline void ceph_encode_addr(struct ceph_entity_addr *a)
 {
 	__be16 ss_family = htons(a->in_addr.ss_family);
 	a->in_addr.ss_family = *(__u16 *)&ss_family;
+
+	/* Banner addresses require TYPE_NONE */
+	a->type = CEPH_ENTITY_ADDR_TYPE_NONE;
 }
 static inline void ceph_decode_addr(struct ceph_entity_addr *a)
 {
 	__be16 ss_family = *(__be16 *)&a->in_addr.ss_family;
 	a->in_addr.ss_family = ntohs(ss_family);
 	WARN_ON(a->in_addr.ss_family == 512);
+	a->type = CEPH_ENTITY_ADDR_TYPE_LEGACY;
 }
 
 extern int ceph_decode_entity_addr(void **p, void *end,
diff --git a/net/ceph/decode.c b/net/ceph/decode.c
index b82981199549..eea529595a7a 100644
--- a/net/ceph/decode.c
+++ b/net/ceph/decode.c
@@ -21,17 +21,6 @@ ceph_decode_entity_addr_versioned(void **p, void *end,
 
 	ceph_decode_copy_safe(p, end, &addr->type, sizeof(addr->type), bad);
 
-	/*
-	 * TYPE_NONE == 0
-	 * TYPE_LEGACY == 1
-	 *
-	 * Clients that don't support ADDR2 always send TYPE_NONE.
-	 * For now, since all we support is msgr1, just set this to 0
-	 * when we get a TYPE_LEGACY type.
-	 */
-	if (addr->type == cpu_to_le32(1))
-		addr->type = 0;
-
 	ceph_decode_copy_safe(p, end, &addr->nonce, sizeof(addr->nonce), bad);
 
 	ceph_decode_32_safe(p, end, addr_len, bad);
@@ -61,7 +50,12 @@ ceph_decode_entity_addr_legacy(void **p, void *end,
 
 	/* Skip rest of type field */
 	ceph_decode_skip_n(p, end, 3, bad);
-	addr->type = 0;
+
+	/*
+	 * Clients that don't support ADDR2 always send TYPE_NONE, change it
+	 * to TYPE_LEGACY for forward compatibility.
+	 */
+	addr->type = CEPH_ENTITY_ADDR_TYPE_LEGACY;
 	ceph_decode_copy_safe(p, end, &addr->nonce, sizeof(addr->nonce), bad);
 	memset(&addr->in_addr, 0, sizeof(addr->in_addr));
 	ceph_decode_copy_safe(p, end, &addr->in_addr,
diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index 8d0c51dd4666..0a3ef33cf7ac 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -199,12 +199,14 @@ const char *ceph_pr_addr(const struct ceph_entity_addr *addr)
 
 	switch (ss.ss_family) {
 	case AF_INET:
-		snprintf(s, MAX_ADDR_STR_LEN, "%pI4:%hu", &in4->sin_addr,
+		snprintf(s, MAX_ADDR_STR_LEN, "(%d)%pI4:%hu",
+			 le32_to_cpu(addr->type), &in4->sin_addr,
 			 ntohs(in4->sin_port));
 		break;
 
 	case AF_INET6:
-		snprintf(s, MAX_ADDR_STR_LEN, "[%pI6c]:%hu", &in6->sin6_addr,
+		snprintf(s, MAX_ADDR_STR_LEN, "(%d)[%pI6c]:%hu",
+			 le32_to_cpu(addr->type), &in6->sin6_addr,
 			 ntohs(in6->sin6_port));
 		break;
 
@@ -1982,6 +1984,7 @@ int ceph_parse_ips(const char *c, const char *end,
 		}
 
 		addr_set_port(&addr[i], port);
+		addr[i].type = CEPH_ENTITY_ADDR_TYPE_LEGACY;
 
 		dout("parse_ips got %s\n", ceph_pr_addr(&addr[i]));
 
-- 
cgit v1.2.3


From 2c66de560fa2dda0a600e908897116914db8f500 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@kernel.org>
Date: Mon, 17 Jun 2019 09:24:31 -0400
Subject: libceph: rename ceph_encode_addr to ceph_encode_banner_addr

...ditto for the decode function. We only use these functions to fix
up banner addresses now, so let's name them more appropriately.

Signed-off-by: Jeff Layton <jlayton@kernel.org>
Reviewed-by: "Yan, Zheng" <zyan@redhat.com>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 include/linux/ceph/decode.h | 4 ++--
 net/ceph/messenger.c        | 6 +++---
 2 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ceph/decode.h b/include/linux/ceph/decode.h
index ce488d95be89..450384fe487c 100644
--- a/include/linux/ceph/decode.h
+++ b/include/linux/ceph/decode.h
@@ -221,7 +221,7 @@ static inline void ceph_encode_timespec64(struct ceph_timespec *tv,
 #define CEPH_ENTITY_ADDR_TYPE_NONE	0
 #define CEPH_ENTITY_ADDR_TYPE_LEGACY	__cpu_to_le32(1)
 
-static inline void ceph_encode_addr(struct ceph_entity_addr *a)
+static inline void ceph_encode_banner_addr(struct ceph_entity_addr *a)
 {
 	__be16 ss_family = htons(a->in_addr.ss_family);
 	a->in_addr.ss_family = *(__u16 *)&ss_family;
@@ -229,7 +229,7 @@ static inline void ceph_encode_addr(struct ceph_entity_addr *a)
 	/* Banner addresses require TYPE_NONE */
 	a->type = CEPH_ENTITY_ADDR_TYPE_NONE;
 }
-static inline void ceph_decode_addr(struct ceph_entity_addr *a)
+static inline void ceph_decode_banner_addr(struct ceph_entity_addr *a)
 {
 	__be16 ss_family = *(__be16 *)&a->in_addr.ss_family;
 	a->in_addr.ss_family = ntohs(ss_family);
diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index 0a3ef33cf7ac..0473d9a7b1f4 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -222,7 +222,7 @@ EXPORT_SYMBOL(ceph_pr_addr);
 static void encode_my_addr(struct ceph_messenger *msgr)
 {
 	memcpy(&msgr->my_enc_addr, &msgr->inst.addr, sizeof(msgr->my_enc_addr));
-	ceph_encode_addr(&msgr->my_enc_addr);
+	ceph_encode_banner_addr(&msgr->my_enc_addr);
 }
 
 /*
@@ -1734,14 +1734,14 @@ static int read_partial_banner(struct ceph_connection *con)
 	ret = read_partial(con, end, size, &con->actual_peer_addr);
 	if (ret <= 0)
 		goto out;
-	ceph_decode_addr(&con->actual_peer_addr);
+	ceph_decode_banner_addr(&con->actual_peer_addr);
 
 	size = sizeof (con->peer_addr_for_me);
 	end += size;
 	ret = read_partial(con, end, size, &con->peer_addr_for_me);
 	if (ret <= 0)
 		goto out;
-	ceph_decode_addr(&con->peer_addr_for_me);
+	ceph_decode_banner_addr(&con->peer_addr_for_me);
 
 out:
 	return ret;
-- 
cgit v1.2.3


From 6adaaafdd81d5c01875fe233ab73deb81b34caa1 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@kernel.org>
Date: Fri, 31 May 2019 12:24:22 -0400
Subject: libceph: turn on CEPH_FEATURE_MSG_ADDR2

Now that the client can handle either address formatting, advertise to
the peer that we can support it.

Signed-off-by: Jeff Layton <jlayton@kernel.org>
Reviewed-by: "Yan, Zheng" <zyan@redhat.com>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 include/linux/ceph/ceph_features.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/ceph/ceph_features.h b/include/linux/ceph/ceph_features.h
index 65a38c4a02a1..39e6f4c57580 100644
--- a/include/linux/ceph/ceph_features.h
+++ b/include/linux/ceph/ceph_features.h
@@ -211,6 +211,7 @@ DEFINE_CEPH_FEATURE_DEPRECATED(63, 1, RESERVED_BROKEN, LUMINOUS) // client-facin
 	 CEPH_FEATURE_MON_STATEFUL_SUB |	\
 	 CEPH_FEATURE_CRUSH_TUNABLES5 |		\
 	 CEPH_FEATURE_NEW_OSDOPREPLY_ENCODING |	\
+	 CEPH_FEATURE_MSG_ADDR2 |		\
 	 CEPH_FEATURE_CEPHX_V2)
 
 #define CEPH_FEATURES_REQUIRED_DEFAULT	0
-- 
cgit v1.2.3


From 441d367644e2f60b37f36bfc656deee551acba5b Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@kernel.org>
Date: Wed, 5 Jun 2019 17:24:22 -0400
Subject: iversion: add a routine to update a raw value with a larger one

Under ceph, clients can be independently updating iversion themselves,
while working under comprehensive sets of caps on an inode. In that
situation we always want to prefer the largest value of a change
attribute. Add a new function that will update a raw value with a larger
one, but otherwise leave it alone.

Signed-off-by: Jeff Layton <jlayton@kernel.org>
Reviewed-by: "Yan, Zheng" <zyan@redhat.com>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 include/linux/iversion.h | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/iversion.h b/include/linux/iversion.h
index be50ef7cedab..2917ef990d43 100644
--- a/include/linux/iversion.h
+++ b/include/linux/iversion.h
@@ -112,6 +112,30 @@ inode_peek_iversion_raw(const struct inode *inode)
 	return atomic64_read(&inode->i_version);
 }
 
+/**
+ * inode_set_max_iversion_raw - update i_version if the new value is larger
+ * @inode: inode to set
+ * @val: new i_version to set
+ *
+ * Some self-managed filesystems (e.g. Ceph) will only update the i_version
+ * value if the new value is larger than the one we already have.
+ */
+static inline void
+inode_set_max_iversion_raw(struct inode *inode, u64 val)
+{
+	u64 cur, old;
+
+	cur = inode_peek_iversion_raw(inode);
+	for (;;) {
+		if (cur > val)
+			break;
+		old = atomic64_cmpxchg(&inode->i_version, cur, val);
+		if (likely(old == cur))
+			break;
+		cur = old;
+	}
+}
+
 /**
  * inode_set_iversion - set i_version to a particular value
  * @inode: inode to set
-- 
cgit v1.2.3


From 49ada6e8dc9f64ad1e8dd6f7b453c9e584e9f897 Mon Sep 17 00:00:00 2001
From: "Yan, Zheng" <zyan@redhat.com>
Date: Thu, 20 Jun 2019 12:09:08 +0800
Subject: ceph: more precise CEPH_CLIENT_CAPS_PENDING_CAPSNAP

The client uses this flag to tell the MDS whether there are more cap snaps
that need to be flushed. It is mainly for the case where the client needs to
re-send cap/snap flushes after an MDS failover, but CEPH_CAP_ANY_FILE_WR on
the corresponding inodes had already been released before the failover.

Signed-off-by: "Yan, Zheng" <zyan@redhat.com>
Reviewed-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 fs/ceph/caps.c               | 41 ++++++++++++++++++++++++++++++-----------
 include/linux/ceph/ceph_fs.h |  2 +-
 2 files changed, 31 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index f9055cdec3c7..d98dcd976c80 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -1295,7 +1295,7 @@ void __ceph_remove_caps(struct ceph_inode_info *ci)
  * caller should hold snap_rwsem (read), s_mutex.
  */
 static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
-		      int op, bool sync, int used, int want, int retain,
+		      int op, int flags, int used, int want, int retain,
 		      int flushing, u64 flush_tid, u64 oldest_flush_tid)
 	__releases(cap->ci->i_ceph_lock)
 {
@@ -1393,12 +1393,19 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
 	arg.mode = inode->i_mode;
 
 	arg.inline_data = ci->i_inline_version != CEPH_INLINE_NONE;
-	if (list_empty(&ci->i_cap_snaps))
-		arg.flags = CEPH_CLIENT_CAPS_NO_CAPSNAP;
-	else
-		arg.flags = CEPH_CLIENT_CAPS_PENDING_CAPSNAP;
-	if (sync)
-		arg.flags |= CEPH_CLIENT_CAPS_SYNC;
+	if (!(flags & CEPH_CLIENT_CAPS_PENDING_CAPSNAP) &&
+	    !list_empty(&ci->i_cap_snaps)) {
+		struct ceph_cap_snap *capsnap;
+		list_for_each_entry_reverse(capsnap, &ci->i_cap_snaps, ci_item) {
+			if (capsnap->cap_flush.tid)
+				break;
+			if (capsnap->need_flush) {
+				flags |= CEPH_CLIENT_CAPS_PENDING_CAPSNAP;
+				break;
+			}
+		}
+	}
+	arg.flags = flags;
 
 	spin_unlock(&ci->i_ceph_lock);
 
@@ -2085,7 +2092,7 @@ ack:
 		sent++;
 
 		/* __send_cap drops i_ceph_lock */
-		delayed += __send_cap(mdsc, cap, CEPH_CAP_OP_UPDATE, false,
+		delayed += __send_cap(mdsc, cap, CEPH_CAP_OP_UPDATE, 0,
 				cap_used, want, retain, flushing,
 				flush_tid, oldest_flush_tid);
 		goto retry; /* retake i_ceph_lock and restart our cap scan. */
@@ -2155,7 +2162,8 @@ retry_locked:
 						&flush_tid, &oldest_flush_tid);
 
 		/* __send_cap drops i_ceph_lock */
-		delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH, true,
+		delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH,
+				     CEPH_CLIENT_CAPS_SYNC,
 				     __ceph_caps_used(ci),
 				     __ceph_caps_wanted(ci),
 				     (cap->issued | cap->implemented),
@@ -2328,9 +2336,17 @@ static void __kick_flushing_caps(struct ceph_mds_client *mdsc,
 	struct ceph_cap_flush *cf;
 	int ret;
 	u64 first_tid = 0;
+	u64 last_snap_flush = 0;
 
 	ci->i_ceph_flags &= ~CEPH_I_KICK_FLUSH;
 
+	list_for_each_entry_reverse(cf, &ci->i_cap_flush_list, i_list) {
+		if (!cf->caps) {
+			last_snap_flush = cf->tid;
+			break;
+		}
+	}
+
 	list_for_each_entry(cf, &ci->i_cap_flush_list, i_list) {
 		if (cf->tid < first_tid)
 			continue;
@@ -2348,10 +2364,13 @@ static void __kick_flushing_caps(struct ceph_mds_client *mdsc,
 			dout("kick_flushing_caps %p cap %p tid %llu %s\n",
 			     inode, cap, cf->tid, ceph_cap_string(cf->caps));
 			ci->i_ceph_flags |= CEPH_I_NODELAY;
+
 			ret = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH,
-					  false, __ceph_caps_used(ci),
+					 (cf->tid < last_snap_flush ?
+					  CEPH_CLIENT_CAPS_PENDING_CAPSNAP : 0),
+					  __ceph_caps_used(ci),
 					  __ceph_caps_wanted(ci),
-					  cap->issued | cap->implemented,
+					  (cap->issued | cap->implemented),
 					  cf->caps, cf->tid, oldest_flush_tid);
 			if (ret) {
 				pr_err("kick_flushing_caps: error sending "
diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h
index 3ac0feaf2b5e..cb21c5cf12c3 100644
--- a/include/linux/ceph/ceph_fs.h
+++ b/include/linux/ceph/ceph_fs.h
@@ -682,7 +682,7 @@ extern const char *ceph_cap_op_name(int op);
 /* flags field in client cap messages (version >= 10) */
 #define CEPH_CLIENT_CAPS_SYNC			(1<<0)
 #define CEPH_CLIENT_CAPS_NO_CAPSNAP		(1<<1)
-#define CEPH_CLIENT_CAPS_PENDING_CAPSNAP	(1<<2);
+#define CEPH_CLIENT_CAPS_PENDING_CAPSNAP	(1<<2)
 
 /*
  * caps message, used for capability callbacks, acks, requests, etc.
-- 
cgit v1.2.3


From 94e85771881027e62afdddadd31e3eec73025990 Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Mon, 8 Jul 2019 12:50:09 +0200
Subject: libceph: rename r_unsafe_item to r_private_item

This list item remained from when we had safe and unsafe replies
(commit vs ack).  It has since become a private list item for use by
clients.

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 fs/ceph/file.c                  | 6 +++---
 include/linux/ceph/osd_client.h | 2 +-
 net/ceph/osd_client.c           | 4 ++--
 3 files changed, 6 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index a06090c8281e..d5bee928603a 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -1026,7 +1026,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
 			req->r_callback = ceph_aio_complete_req;
 			req->r_inode = inode;
 			req->r_priv = aio_req;
-			list_add_tail(&req->r_unsafe_item, &aio_req->osd_reqs);
+			list_add_tail(&req->r_private_item, &aio_req->osd_reqs);
 
 			pos += len;
 			continue;
@@ -1086,8 +1086,8 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
 		while (!list_empty(&osd_reqs)) {
 			req = list_first_entry(&osd_reqs,
 					       struct ceph_osd_request,
-					       r_unsafe_item);
-			list_del_init(&req->r_unsafe_item);
+					       r_private_item);
+			list_del_init(&req->r_private_item);
 			if (ret >= 0)
 				ret = ceph_osdc_start_request(req->r_osdc,
 							      req, false);
diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
index 2294f963dab7..024f6fed0ac5 100644
--- a/include/linux/ceph/osd_client.h
+++ b/include/linux/ceph/osd_client.h
@@ -198,9 +198,9 @@ struct ceph_osd_request {
 	bool              r_mempool;
 	struct completion r_completion;       /* private to osd_client.c */
 	ceph_osdc_callback_t r_callback;
-	struct list_head  r_unsafe_item;
 
 	struct inode *r_inode;         	      /* for use by callbacks */
+	struct list_head r_private_item;      /* ditto */
 	void *r_priv;			      /* ditto */
 
 	/* set by submitter */
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index 54170a35ecec..6495982c5c07 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -478,7 +478,7 @@ static void request_release_checks(struct ceph_osd_request *req)
 {
 	WARN_ON(!RB_EMPTY_NODE(&req->r_node));
 	WARN_ON(!RB_EMPTY_NODE(&req->r_mc_node));
-	WARN_ON(!list_empty(&req->r_unsafe_item));
+	WARN_ON(!list_empty(&req->r_private_item));
 	WARN_ON(req->r_osd);
 }
 
@@ -538,7 +538,7 @@ static void request_init(struct ceph_osd_request *req)
 	init_completion(&req->r_completion);
 	RB_CLEAR_NODE(&req->r_node);
 	RB_CLEAR_NODE(&req->r_mc_node);
-	INIT_LIST_HEAD(&req->r_unsafe_item);
+	INIT_LIST_HEAD(&req->r_private_item);
 
 	target_init(&req->r_t);
 }
-- 
cgit v1.2.3


From ef83171b49c66d851a1a0dc6da5b4a4d8ee6ce9a Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Mon, 8 Apr 2019 14:16:05 +0200
Subject: libceph: bump CEPH_MSG_MAX_DATA_LEN (again)

This time for rbd object map.  Object maps are limited in size to
256000000 objects, two bits per object.

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
Reviewed-by: Dongsheng Yang <dongsheng.yang@easystack.cn>
---
 include/linux/ceph/libceph.h | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h
index a3cddf5f0e60..82156da3c650 100644
--- a/include/linux/ceph/libceph.h
+++ b/include/linux/ceph/libceph.h
@@ -84,11 +84,13 @@ struct ceph_options {
 #define CEPH_MSG_MAX_MIDDLE_LEN	(16*1024*1024)
 
 /*
- * Handle the largest possible rbd object in one message.
+ * The largest possible rbd data object is 32M.
+ * The largest possible rbd object map object is 64M.
+ *
  * There is no limit on the size of cephfs objects, but it has to obey
  * rsize and wsize mount options anyway.
  */
-#define CEPH_MSG_MAX_DATA_LEN	(32*1024*1024)
+#define CEPH_MSG_MAX_DATA_LEN	(64*1024*1024)
 
 #define CEPH_AUTH_NAME_DEFAULT   "guest"
 
-- 
cgit v1.2.3


From 68ada915eea10f36760ffe414810390a104df093 Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Fri, 14 Jun 2019 18:16:51 +0200
Subject: libceph: change ceph_osdc_call() to take page vector for response

This will be used for loading object map.  rbd_obj_read_sync() isn't
suitable because object map must be accessed through class methods.

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
Reviewed-by: Dongsheng Yang <dongsheng.yang@easystack.cn>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
---
 drivers/block/rbd.c             |  8 ++++----
 include/linux/ceph/osd_client.h |  2 +-
 net/ceph/cls_lock_client.c      |  2 +-
 net/ceph/osd_client.c           | 10 +++++-----
 4 files changed, 11 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 6d1df82eb883..f0814c148b1c 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -4076,7 +4076,7 @@ static int rbd_obj_method_sync(struct rbd_device *rbd_dev,
 
 	ret = ceph_osdc_call(osdc, oid, oloc, RBD_DRV_NAME, method_name,
 			     CEPH_OSD_FLAG_READ, req_page, outbound_size,
-			     reply_page, &inbound_size);
+			     &reply_page, &inbound_size);
 	if (!ret) {
 		memcpy(inbound, page_address(reply_page), inbound_size);
 		ret = inbound_size;
@@ -5102,7 +5102,7 @@ static int __get_parent_info(struct rbd_device *rbd_dev,
 
 	ret = ceph_osdc_call(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
 			     "rbd", "parent_get", CEPH_OSD_FLAG_READ,
-			     req_page, sizeof(u64), reply_page, &reply_len);
+			     req_page, sizeof(u64), &reply_page, &reply_len);
 	if (ret)
 		return ret == -EOPNOTSUPP ? 1 : ret;
 
@@ -5114,7 +5114,7 @@ static int __get_parent_info(struct rbd_device *rbd_dev,
 
 	ret = ceph_osdc_call(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
 			     "rbd", "parent_overlap_get", CEPH_OSD_FLAG_READ,
-			     req_page, sizeof(u64), reply_page, &reply_len);
+			     req_page, sizeof(u64), &reply_page, &reply_len);
 	if (ret)
 		return ret;
 
@@ -5145,7 +5145,7 @@ static int __get_parent_info_legacy(struct rbd_device *rbd_dev,
 
 	ret = ceph_osdc_call(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
 			     "rbd", "get_parent", CEPH_OSD_FLAG_READ,
-			     req_page, sizeof(u64), reply_page, &reply_len);
+			     req_page, sizeof(u64), &reply_page, &reply_len);
 	if (ret)
 		return ret;
 
diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
index 024f6fed0ac5..c567cfa4f107 100644
--- a/include/linux/ceph/osd_client.h
+++ b/include/linux/ceph/osd_client.h
@@ -497,7 +497,7 @@ int ceph_osdc_call(struct ceph_osd_client *osdc,
 		   const char *class, const char *method,
 		   unsigned int flags,
 		   struct page *req_page, size_t req_len,
-		   struct page *resp_page, size_t *resp_len);
+		   struct page **resp_pages, size_t *resp_len);
 
 extern int ceph_osdc_readpages(struct ceph_osd_client *osdc,
 			       struct ceph_vino vino,
diff --git a/net/ceph/cls_lock_client.c b/net/ceph/cls_lock_client.c
index b1d12bf4b83e..fb59094caf13 100644
--- a/net/ceph/cls_lock_client.c
+++ b/net/ceph/cls_lock_client.c
@@ -363,7 +363,7 @@ int ceph_cls_lock_info(struct ceph_osd_client *osdc,
 	dout("%s lock_name %s\n", __func__, lock_name);
 	ret = ceph_osdc_call(osdc, oid, oloc, "lock", "get_info",
 			     CEPH_OSD_FLAG_READ, get_info_op_page,
-			     get_info_op_buf_size, reply_page, &reply_len);
+			     get_info_op_buf_size, &reply_page, &reply_len);
 
 	dout("%s: status %d\n", __func__, ret);
 	if (ret >= 0) {
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index 6495982c5c07..a90fbfce7e93 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -5050,12 +5050,12 @@ int ceph_osdc_call(struct ceph_osd_client *osdc,
 		   const char *class, const char *method,
 		   unsigned int flags,
 		   struct page *req_page, size_t req_len,
-		   struct page *resp_page, size_t *resp_len)
+		   struct page **resp_pages, size_t *resp_len)
 {
 	struct ceph_osd_request *req;
 	int ret;
 
-	if (req_len > PAGE_SIZE || (resp_page && *resp_len > PAGE_SIZE))
+	if (req_len > PAGE_SIZE)
 		return -E2BIG;
 
 	req = ceph_osdc_alloc_request(osdc, NULL, 1, false, GFP_NOIO);
@@ -5073,8 +5073,8 @@ int ceph_osdc_call(struct ceph_osd_client *osdc,
 	if (req_page)
 		osd_req_op_cls_request_data_pages(req, 0, &req_page, req_len,
 						  0, false, false);
-	if (resp_page)
-		osd_req_op_cls_response_data_pages(req, 0, &resp_page,
+	if (resp_pages)
+		osd_req_op_cls_response_data_pages(req, 0, resp_pages,
 						   *resp_len, 0, false, false);
 
 	ret = ceph_osdc_alloc_messages(req, GFP_NOIO);
@@ -5085,7 +5085,7 @@ int ceph_osdc_call(struct ceph_osd_client *osdc,
 	ret = ceph_osdc_wait_request(osdc, req);
 	if (ret >= 0) {
 		ret = req->r_ops[0].rval;
-		if (resp_page)
+		if (resp_pages)
 			*resp_len = req->r_ops[0].outdata_len;
 	}
 
-- 
cgit v1.2.3


From 4cf3e6dff7ea517544e1da7810a0b3ebba380d2c Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Fri, 14 Jun 2019 18:00:19 +0200
Subject: libceph: export osd_req_op_data() macro

We already have one exported wrapper around it for extent.osd_data and
rbd_object_map_update_finish() needs another one for cls.request_data.

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
Reviewed-by: Dongsheng Yang <dongsheng.yang@easystack.cn>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
---
 include/linux/ceph/osd_client.h | 8 ++++++++
 net/ceph/osd_client.c           | 8 --------
 2 files changed, 8 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
index c567cfa4f107..ad7fe5d10dcd 100644
--- a/include/linux/ceph/osd_client.h
+++ b/include/linux/ceph/osd_client.h
@@ -389,6 +389,14 @@ extern void ceph_osdc_handle_map(struct ceph_osd_client *osdc,
 void ceph_osdc_update_epoch_barrier(struct ceph_osd_client *osdc, u32 eb);
 void ceph_osdc_abort_requests(struct ceph_osd_client *osdc, int err);
 
+#define osd_req_op_data(oreq, whch, typ, fld)				\
+({									\
+	struct ceph_osd_request *__oreq = (oreq);			\
+	unsigned int __whch = (whch);					\
+	BUG_ON(__whch >= __oreq->r_num_ops);				\
+	&__oreq->r_ops[__whch].typ.fld;					\
+})
+
 extern void osd_req_op_init(struct ceph_osd_request *osd_req,
 			    unsigned int which, u16 opcode, u32 flags);
 
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index a90fbfce7e93..0b2df09b2554 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -171,14 +171,6 @@ static void ceph_osd_data_bvecs_init(struct ceph_osd_data *osd_data,
 	osd_data->num_bvecs = num_bvecs;
 }
 
-#define osd_req_op_data(oreq, whch, typ, fld)				\
-({									\
-	struct ceph_osd_request *__oreq = (oreq);			\
-	unsigned int __whch = (whch);					\
-	BUG_ON(__whch >= __oreq->r_num_ops);				\
-	&__oreq->r_ops[__whch].typ.fld;					\
-})
-
 static struct ceph_osd_data *
 osd_req_op_raw_data_in(struct ceph_osd_request *osd_req, unsigned int which)
 {
-- 
cgit v1.2.3


From 22e8bd51bb0469d1a524130a057f894ff632376a Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Wed, 5 Jun 2019 19:25:11 +0200
Subject: rbd: support for object-map and fast-diff

Speed up reads, discards and zeroouts through RBD_OBJ_FLAG_MAY_EXIST
and RBD_OBJ_FLAG_NOOP_FOR_NONEXISTENT based on object map.

Invalid object maps are not trusted, but are still updated.  Note that
we never iterate, resize or invalidate object maps.  If the object-map
feature is enabled but the object map fails to load, we just fail the
requester (either "rbd map" or I/O, by way of the post-acquire action).

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 drivers/block/rbd.c                  | 720 ++++++++++++++++++++++++++++++++++-
 drivers/block/rbd_types.h            |  10 +
 include/linux/ceph/cls_lock_client.h |   3 +
 include/linux/ceph/striper.h         |   2 +
 net/ceph/cls_lock_client.c           |  45 +++
 net/ceph/striper.c                   |  17 +
 6 files changed, 794 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 3d861d3013f8..0df91665c4eb 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -115,6 +115,8 @@ static int atomic_dec_return_safe(atomic_t *v)
 #define RBD_FEATURE_LAYERING		(1ULL<<0)
 #define RBD_FEATURE_STRIPINGV2		(1ULL<<1)
 #define RBD_FEATURE_EXCLUSIVE_LOCK	(1ULL<<2)
+#define RBD_FEATURE_OBJECT_MAP		(1ULL<<3)
+#define RBD_FEATURE_FAST_DIFF		(1ULL<<4)
 #define RBD_FEATURE_DEEP_FLATTEN	(1ULL<<5)
 #define RBD_FEATURE_DATA_POOL		(1ULL<<7)
 #define RBD_FEATURE_OPERATIONS		(1ULL<<8)
@@ -122,6 +124,8 @@ static int atomic_dec_return_safe(atomic_t *v)
 #define RBD_FEATURES_ALL	(RBD_FEATURE_LAYERING |		\
 				 RBD_FEATURE_STRIPINGV2 |	\
 				 RBD_FEATURE_EXCLUSIVE_LOCK |	\
+				 RBD_FEATURE_OBJECT_MAP |	\
+				 RBD_FEATURE_FAST_DIFF |	\
 				 RBD_FEATURE_DEEP_FLATTEN |	\
 				 RBD_FEATURE_DATA_POOL |	\
 				 RBD_FEATURE_OPERATIONS)
@@ -227,6 +231,8 @@ enum obj_operation_type {
 #define RBD_OBJ_FLAG_DELETION			(1U << 0)
 #define RBD_OBJ_FLAG_COPYUP_ENABLED		(1U << 1)
 #define RBD_OBJ_FLAG_COPYUP_ZEROS		(1U << 2)
+#define RBD_OBJ_FLAG_MAY_EXIST			(1U << 3)
+#define RBD_OBJ_FLAG_NOOP_FOR_NONEXISTENT	(1U << 4)
 
 enum rbd_obj_read_state {
 	RBD_OBJ_READ_START = 1,
@@ -261,14 +267,18 @@ enum rbd_obj_read_state {
  */
 enum rbd_obj_write_state {
 	RBD_OBJ_WRITE_START = 1,
+	RBD_OBJ_WRITE_PRE_OBJECT_MAP,
 	RBD_OBJ_WRITE_OBJECT,
 	__RBD_OBJ_WRITE_COPYUP,
 	RBD_OBJ_WRITE_COPYUP,
+	RBD_OBJ_WRITE_POST_OBJECT_MAP,
 };
 
 enum rbd_obj_copyup_state {
 	RBD_OBJ_COPYUP_START = 1,
 	RBD_OBJ_COPYUP_READ_PARENT,
+	__RBD_OBJ_COPYUP_OBJECT_MAPS,
+	RBD_OBJ_COPYUP_OBJECT_MAPS,
 	__RBD_OBJ_COPYUP_WRITE_OBJECT,
 	RBD_OBJ_COPYUP_WRITE_OBJECT,
 };
@@ -419,6 +429,11 @@ struct rbd_device {
 	int			acquire_err;
 	struct completion	releasing_wait;
 
+	spinlock_t		object_map_lock;
+	u8			*object_map;
+	u64			object_map_size;	/* in objects */
+	u64			object_map_flags;
+
 	struct workqueue_struct	*task_wq;
 
 	struct rbd_spec		*parent_spec;
@@ -620,6 +635,7 @@ static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
 				u8 *order, u64 *snap_size);
 static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
 		u64 *snap_features);
+static int rbd_dev_v2_get_flags(struct rbd_device *rbd_dev);
 
 static void rbd_obj_handle_request(struct rbd_obj_request *obj_req, int result);
 static void rbd_img_handle_request(struct rbd_img_request *img_req, int result);
@@ -1768,6 +1784,466 @@ static void rbd_img_request_destroy(struct kref *kref)
 	kmem_cache_free(rbd_img_request_cache, img_request);
 }
 
+#define BITS_PER_OBJ	2
+#define OBJS_PER_BYTE	(BITS_PER_BYTE / BITS_PER_OBJ)
+#define OBJ_MASK	((1 << BITS_PER_OBJ) - 1)
+
+static void __rbd_object_map_index(struct rbd_device *rbd_dev, u64 objno,
+				   u64 *index, u8 *shift)
+{
+	u32 off;
+
+	rbd_assert(objno < rbd_dev->object_map_size);
+	*index = div_u64_rem(objno, OBJS_PER_BYTE, &off);
+	*shift = (OBJS_PER_BYTE - off - 1) * BITS_PER_OBJ;
+}
+
+static u8 __rbd_object_map_get(struct rbd_device *rbd_dev, u64 objno)
+{
+	u64 index;
+	u8 shift;
+
+	lockdep_assert_held(&rbd_dev->object_map_lock);
+	__rbd_object_map_index(rbd_dev, objno, &index, &shift);
+	return (rbd_dev->object_map[index] >> shift) & OBJ_MASK;
+}
+
+static void __rbd_object_map_set(struct rbd_device *rbd_dev, u64 objno, u8 val)
+{
+	u64 index;
+	u8 shift;
+	u8 *p;
+
+	lockdep_assert_held(&rbd_dev->object_map_lock);
+	rbd_assert(!(val & ~OBJ_MASK));
+
+	__rbd_object_map_index(rbd_dev, objno, &index, &shift);
+	p = &rbd_dev->object_map[index];
+	*p = (*p & ~(OBJ_MASK << shift)) | (val << shift);
+}
+
+static u8 rbd_object_map_get(struct rbd_device *rbd_dev, u64 objno)
+{
+	u8 state;
+
+	spin_lock(&rbd_dev->object_map_lock);
+	state = __rbd_object_map_get(rbd_dev, objno);
+	spin_unlock(&rbd_dev->object_map_lock);
+	return state;
+}
+
+static bool use_object_map(struct rbd_device *rbd_dev)
+{
+	return ((rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP) &&
+		!(rbd_dev->object_map_flags & RBD_FLAG_OBJECT_MAP_INVALID));
+}
+
+static bool rbd_object_map_may_exist(struct rbd_device *rbd_dev, u64 objno)
+{
+	u8 state;
+
+	/* fall back to default logic if object map is disabled or invalid */
+	if (!use_object_map(rbd_dev))
+		return true;
+
+	state = rbd_object_map_get(rbd_dev, objno);
+	return state != OBJECT_NONEXISTENT;
+}
+
+static void rbd_object_map_name(struct rbd_device *rbd_dev, u64 snap_id,
+				struct ceph_object_id *oid)
+{
+	if (snap_id == CEPH_NOSNAP)
+		ceph_oid_printf(oid, "%s%s", RBD_OBJECT_MAP_PREFIX,
+				rbd_dev->spec->image_id);
+	else
+		ceph_oid_printf(oid, "%s%s.%016llx", RBD_OBJECT_MAP_PREFIX,
+				rbd_dev->spec->image_id, snap_id);
+}
+
+static int rbd_object_map_lock(struct rbd_device *rbd_dev)
+{
+	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
+	CEPH_DEFINE_OID_ONSTACK(oid);
+	u8 lock_type;
+	char *lock_tag;
+	struct ceph_locker *lockers;
+	u32 num_lockers;
+	bool broke_lock = false;
+	int ret;
+
+	rbd_object_map_name(rbd_dev, CEPH_NOSNAP, &oid);
+
+again:
+	ret = ceph_cls_lock(osdc, &oid, &rbd_dev->header_oloc, RBD_LOCK_NAME,
+			    CEPH_CLS_LOCK_EXCLUSIVE, "", "", "", 0);
+	if (ret != -EBUSY || broke_lock) {
+		if (ret == -EEXIST)
+			ret = 0; /* already locked by myself */
+		if (ret)
+			rbd_warn(rbd_dev, "failed to lock object map: %d", ret);
+		return ret;
+	}
+
+	ret = ceph_cls_lock_info(osdc, &oid, &rbd_dev->header_oloc,
+				 RBD_LOCK_NAME, &lock_type, &lock_tag,
+				 &lockers, &num_lockers);
+	if (ret) {
+		if (ret == -ENOENT)
+			goto again;
+
+		rbd_warn(rbd_dev, "failed to get object map lockers: %d", ret);
+		return ret;
+	}
+
+	kfree(lock_tag);
+	if (num_lockers == 0)
+		goto again;
+
+	rbd_warn(rbd_dev, "breaking object map lock owned by %s%llu",
+		 ENTITY_NAME(lockers[0].id.name));
+
+	ret = ceph_cls_break_lock(osdc, &oid, &rbd_dev->header_oloc,
+				  RBD_LOCK_NAME, lockers[0].id.cookie,
+				  &lockers[0].id.name);
+	ceph_free_lockers(lockers, num_lockers);
+	if (ret) {
+		if (ret == -ENOENT)
+			goto again;
+
+		rbd_warn(rbd_dev, "failed to break object map lock: %d", ret);
+		return ret;
+	}
+
+	broke_lock = true;
+	goto again;
+}
+
+static void rbd_object_map_unlock(struct rbd_device *rbd_dev)
+{
+	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
+	CEPH_DEFINE_OID_ONSTACK(oid);
+	int ret;
+
+	rbd_object_map_name(rbd_dev, CEPH_NOSNAP, &oid);
+
+	ret = ceph_cls_unlock(osdc, &oid, &rbd_dev->header_oloc, RBD_LOCK_NAME,
+			      "");
+	if (ret && ret != -ENOENT)
+		rbd_warn(rbd_dev, "failed to unlock object map: %d", ret);
+}
+
+static int decode_object_map_header(void **p, void *end, u64 *object_map_size)
+{
+	u8 struct_v;
+	u32 struct_len;
+	u32 header_len;
+	void *header_end;
+	int ret;
+
+	ceph_decode_32_safe(p, end, header_len, e_inval);
+	header_end = *p + header_len;
+
+	ret = ceph_start_decoding(p, end, 1, "BitVector header", &struct_v,
+				  &struct_len);
+	if (ret)
+		return ret;
+
+	ceph_decode_64_safe(p, end, *object_map_size, e_inval);
+
+	*p = header_end;
+	return 0;
+
+e_inval:
+	return -EINVAL;
+}
+
+static int __rbd_object_map_load(struct rbd_device *rbd_dev)
+{
+	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
+	CEPH_DEFINE_OID_ONSTACK(oid);
+	struct page **pages;
+	void *p, *end;
+	size_t reply_len;
+	u64 num_objects;
+	u64 object_map_bytes;
+	u64 object_map_size;
+	int num_pages;
+	int ret;
+
+	rbd_assert(!rbd_dev->object_map && !rbd_dev->object_map_size);
+
+	num_objects = ceph_get_num_objects(&rbd_dev->layout,
+					   rbd_dev->mapping.size);
+	object_map_bytes = DIV_ROUND_UP_ULL(num_objects * BITS_PER_OBJ,
+					    BITS_PER_BYTE);
+	num_pages = calc_pages_for(0, object_map_bytes) + 1;
+	pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
+	if (IS_ERR(pages))
+		return PTR_ERR(pages);
+
+	reply_len = num_pages * PAGE_SIZE;
+	rbd_object_map_name(rbd_dev, rbd_dev->spec->snap_id, &oid);
+	ret = ceph_osdc_call(osdc, &oid, &rbd_dev->header_oloc,
+			     "rbd", "object_map_load", CEPH_OSD_FLAG_READ,
+			     NULL, 0, pages, &reply_len);
+	if (ret)
+		goto out;
+
+	p = page_address(pages[0]);
+	end = p + min(reply_len, (size_t)PAGE_SIZE);
+	ret = decode_object_map_header(&p, end, &object_map_size);
+	if (ret)
+		goto out;
+
+	if (object_map_size != num_objects) {
+		rbd_warn(rbd_dev, "object map size mismatch: %llu vs %llu",
+			 object_map_size, num_objects);
+		ret = -EINVAL;
+		goto out;
+	}
+
+	if (offset_in_page(p) + object_map_bytes > reply_len) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	rbd_dev->object_map = kvmalloc(object_map_bytes, GFP_KERNEL);
+	if (!rbd_dev->object_map) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	rbd_dev->object_map_size = object_map_size;
+	ceph_copy_from_page_vector(pages, rbd_dev->object_map,
+				   offset_in_page(p), object_map_bytes);
+
+out:
+	ceph_release_page_vector(pages, num_pages);
+	return ret;
+}
+
+static void rbd_object_map_free(struct rbd_device *rbd_dev)
+{
+	kvfree(rbd_dev->object_map);
+	rbd_dev->object_map = NULL;
+	rbd_dev->object_map_size = 0;
+}
+
+static int rbd_object_map_load(struct rbd_device *rbd_dev)
+{
+	int ret;
+
+	ret = __rbd_object_map_load(rbd_dev);
+	if (ret)
+		return ret;
+
+	ret = rbd_dev_v2_get_flags(rbd_dev);
+	if (ret) {
+		rbd_object_map_free(rbd_dev);
+		return ret;
+	}
+
+	if (rbd_dev->object_map_flags & RBD_FLAG_OBJECT_MAP_INVALID)
+		rbd_warn(rbd_dev, "object map is invalid");
+
+	return 0;
+}
+
+static int rbd_object_map_open(struct rbd_device *rbd_dev)
+{
+	int ret;
+
+	ret = rbd_object_map_lock(rbd_dev);
+	if (ret)
+		return ret;
+
+	ret = rbd_object_map_load(rbd_dev);
+	if (ret) {
+		rbd_object_map_unlock(rbd_dev);
+		return ret;
+	}
+
+	return 0;
+}
+
+static void rbd_object_map_close(struct rbd_device *rbd_dev)
+{
+	rbd_object_map_free(rbd_dev);
+	rbd_object_map_unlock(rbd_dev);
+}
+
+/*
+ * This function needs snap_id (or more precisely just something to
+ * distinguish between HEAD and snapshot object maps), new_state and
+ * current_state that were passed to rbd_object_map_update().
+ *
+ * To avoid allocating and stashing a context we piggyback on the OSD
+ * request.  A HEAD update has two ops (assert_locked).  For new_state
+ * and current_state we decode our own object_map_update op, encoded in
+ * rbd_cls_object_map_update().
+ */
+static int rbd_object_map_update_finish(struct rbd_obj_request *obj_req,
+					struct ceph_osd_request *osd_req)
+{
+	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
+	struct ceph_osd_data *osd_data;
+	u64 objno;
+	u8 state, new_state, current_state;
+	bool has_current_state;
+	void *p;
+
+	if (osd_req->r_result)
+		return osd_req->r_result;
+
+	/*
+	 * Nothing to do for a snapshot object map.
+	 */
+	if (osd_req->r_num_ops == 1)
+		return 0;
+
+	/*
+	 * Update in-memory HEAD object map.
+	 */
+	rbd_assert(osd_req->r_num_ops == 2);
+	osd_data = osd_req_op_data(osd_req, 1, cls, request_data);
+	rbd_assert(osd_data->type == CEPH_OSD_DATA_TYPE_PAGES);
+
+	p = page_address(osd_data->pages[0]);
+	objno = ceph_decode_64(&p);
+	rbd_assert(objno == obj_req->ex.oe_objno);
+	rbd_assert(ceph_decode_64(&p) == objno + 1);
+	new_state = ceph_decode_8(&p);
+	has_current_state = ceph_decode_8(&p);
+	if (has_current_state)
+		current_state = ceph_decode_8(&p);
+
+	spin_lock(&rbd_dev->object_map_lock);
+	state = __rbd_object_map_get(rbd_dev, objno);
+	if (!has_current_state || current_state == state ||
+	    (current_state == OBJECT_EXISTS && state == OBJECT_EXISTS_CLEAN))
+		__rbd_object_map_set(rbd_dev, objno, new_state);
+	spin_unlock(&rbd_dev->object_map_lock);
+
+	return 0;
+}
+
+static void rbd_object_map_callback(struct ceph_osd_request *osd_req)
+{
+	struct rbd_obj_request *obj_req = osd_req->r_priv;
+	int result;
+
+	dout("%s osd_req %p result %d for obj_req %p\n", __func__, osd_req,
+	     osd_req->r_result, obj_req);
+
+	result = rbd_object_map_update_finish(obj_req, osd_req);
+	rbd_obj_handle_request(obj_req, result);
+}
+
+static bool update_needed(struct rbd_device *rbd_dev, u64 objno, u8 new_state)
+{
+	u8 state = rbd_object_map_get(rbd_dev, objno);
+
+	if (state == new_state ||
+	    (new_state == OBJECT_PENDING && state == OBJECT_NONEXISTENT) ||
+	    (new_state == OBJECT_NONEXISTENT && state != OBJECT_PENDING))
+		return false;
+
+	return true;
+}
+
+static int rbd_cls_object_map_update(struct ceph_osd_request *req,
+				     int which, u64 objno, u8 new_state,
+				     const u8 *current_state)
+{
+	struct page **pages;
+	void *p, *start;
+	int ret;
+
+	ret = osd_req_op_cls_init(req, which, "rbd", "object_map_update");
+	if (ret)
+		return ret;
+
+	pages = ceph_alloc_page_vector(1, GFP_NOIO);
+	if (IS_ERR(pages))
+		return PTR_ERR(pages);
+
+	p = start = page_address(pages[0]);
+	ceph_encode_64(&p, objno);
+	ceph_encode_64(&p, objno + 1);
+	ceph_encode_8(&p, new_state);
+	if (current_state) {
+		ceph_encode_8(&p, 1);
+		ceph_encode_8(&p, *current_state);
+	} else {
+		ceph_encode_8(&p, 0);
+	}
+
+	osd_req_op_cls_request_data_pages(req, which, pages, p - start, 0,
+					  false, true);
+	return 0;
+}
+
+/*
+ * Return:
+ *   0 - object map update sent
+ *   1 - object map update isn't needed
+ *  <0 - error
+ */
+static int rbd_object_map_update(struct rbd_obj_request *obj_req, u64 snap_id,
+				 u8 new_state, const u8 *current_state)
+{
+	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
+	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
+	struct ceph_osd_request *req;
+	int num_ops = 1;
+	int which = 0;
+	int ret;
+
+	if (snap_id == CEPH_NOSNAP) {
+		if (!update_needed(rbd_dev, obj_req->ex.oe_objno, new_state))
+			return 1;
+
+		num_ops++; /* assert_locked */
+	}
+
+	req = ceph_osdc_alloc_request(osdc, NULL, num_ops, false, GFP_NOIO);
+	if (!req)
+		return -ENOMEM;
+
+	list_add_tail(&req->r_private_item, &obj_req->osd_reqs);
+	req->r_callback = rbd_object_map_callback;
+	req->r_priv = obj_req;
+
+	rbd_object_map_name(rbd_dev, snap_id, &req->r_base_oid);
+	ceph_oloc_copy(&req->r_base_oloc, &rbd_dev->header_oloc);
+	req->r_flags = CEPH_OSD_FLAG_WRITE;
+	ktime_get_real_ts64(&req->r_mtime);
+
+	if (snap_id == CEPH_NOSNAP) {
+		/*
+		 * Protect against possible race conditions during lock
+		 * ownership transitions.
+		 */
+		ret = ceph_cls_assert_locked(req, which++, RBD_LOCK_NAME,
+					     CEPH_CLS_LOCK_EXCLUSIVE, "", "");
+		if (ret)
+			return ret;
+	}
+
+	ret = rbd_cls_object_map_update(req, which, obj_req->ex.oe_objno,
+					new_state, current_state);
+	if (ret)
+		return ret;
+
+	ret = ceph_osdc_alloc_messages(req, GFP_NOIO);
+	if (ret)
+		return ret;
+
+	ceph_osdc_start_request(osdc, req, false);
+	return 0;
+}
+
 static void prune_extents(struct ceph_file_extent *img_extents,
 			  u32 *num_img_extents, u64 overlap)
 {
@@ -1975,6 +2451,7 @@ static int rbd_obj_init_discard(struct rbd_obj_request *obj_req)
 	if (ret)
 		return ret;
 
+	obj_req->flags |= RBD_OBJ_FLAG_NOOP_FOR_NONEXISTENT;
 	if (rbd_obj_is_entire(obj_req) && !obj_req->num_img_extents)
 		obj_req->flags |= RBD_OBJ_FLAG_DELETION;
 
@@ -2022,6 +2499,7 @@ static int rbd_obj_init_zeroout(struct rbd_obj_request *obj_req)
 	if (rbd_obj_copyup_enabled(obj_req))
 		obj_req->flags |= RBD_OBJ_FLAG_COPYUP_ENABLED;
 	if (!obj_req->num_img_extents) {
+		obj_req->flags |= RBD_OBJ_FLAG_NOOP_FOR_NONEXISTENT;
 		if (rbd_obj_is_entire(obj_req))
 			obj_req->flags |= RBD_OBJ_FLAG_DELETION;
 	}
@@ -2407,6 +2885,20 @@ static void rbd_img_schedule(struct rbd_img_request *img_req, int result)
 	queue_work(rbd_wq, &img_req->work);
 }
 
+static bool rbd_obj_may_exist(struct rbd_obj_request *obj_req)
+{
+	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
+
+	if (rbd_object_map_may_exist(rbd_dev, obj_req->ex.oe_objno)) {
+		obj_req->flags |= RBD_OBJ_FLAG_MAY_EXIST;
+		return true;
+	}
+
+	dout("%s %p objno %llu assuming dne\n", __func__, obj_req,
+	     obj_req->ex.oe_objno);
+	return false;
+}
+
 static int rbd_obj_read_object(struct rbd_obj_request *obj_req)
 {
 	struct ceph_osd_request *osd_req;
@@ -2482,10 +2974,17 @@ static bool rbd_obj_advance_read(struct rbd_obj_request *obj_req, int *result)
 	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
 	int ret;
 
+again:
 	switch (obj_req->read_state) {
 	case RBD_OBJ_READ_START:
 		rbd_assert(!*result);
 
+		if (!rbd_obj_may_exist(obj_req)) {
+			*result = -ENOENT;
+			obj_req->read_state = RBD_OBJ_READ_OBJECT;
+			goto again;
+		}
+
 		ret = rbd_obj_read_object(obj_req);
 		if (ret) {
 			*result = ret;
@@ -2536,6 +3035,44 @@ static bool rbd_obj_advance_read(struct rbd_obj_request *obj_req, int *result)
 	}
 }
 
+static bool rbd_obj_write_is_noop(struct rbd_obj_request *obj_req)
+{
+	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
+
+	if (rbd_object_map_may_exist(rbd_dev, obj_req->ex.oe_objno))
+		obj_req->flags |= RBD_OBJ_FLAG_MAY_EXIST;
+
+	if (!(obj_req->flags & RBD_OBJ_FLAG_MAY_EXIST) &&
+	    (obj_req->flags & RBD_OBJ_FLAG_NOOP_FOR_NONEXISTENT)) {
+		dout("%s %p noop for nonexistent\n", __func__, obj_req);
+		return true;
+	}
+
+	return false;
+}
+
+/*
+ * Return:
+ *   0 - object map update sent
+ *   1 - object map update isn't needed
+ *  <0 - error
+ */
+static int rbd_obj_write_pre_object_map(struct rbd_obj_request *obj_req)
+{
+	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
+	u8 new_state;
+
+	if (!(rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP))
+		return 1;
+
+	if (obj_req->flags & RBD_OBJ_FLAG_DELETION)
+		new_state = OBJECT_PENDING;
+	else
+		new_state = OBJECT_EXISTS;
+
+	return rbd_object_map_update(obj_req, CEPH_NOSNAP, new_state, NULL);
+}
+
 static int rbd_obj_write_object(struct rbd_obj_request *obj_req)
 {
 	struct ceph_osd_request *osd_req;
@@ -2706,6 +3243,41 @@ static int rbd_obj_copyup_read_parent(struct rbd_obj_request *obj_req)
 	return rbd_obj_read_from_parent(obj_req);
 }
 
+static void rbd_obj_copyup_object_maps(struct rbd_obj_request *obj_req)
+{
+	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
+	struct ceph_snap_context *snapc = obj_req->img_request->snapc;
+	u8 new_state;
+	u32 i;
+	int ret;
+
+	rbd_assert(!obj_req->pending.result && !obj_req->pending.num_pending);
+
+	if (!(rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP))
+		return;
+
+	if (obj_req->flags & RBD_OBJ_FLAG_COPYUP_ZEROS)
+		return;
+
+	for (i = 0; i < snapc->num_snaps; i++) {
+		if ((rbd_dev->header.features & RBD_FEATURE_FAST_DIFF) &&
+		    i + 1 < snapc->num_snaps)
+			new_state = OBJECT_EXISTS_CLEAN;
+		else
+			new_state = OBJECT_EXISTS;
+
+		ret = rbd_object_map_update(obj_req, snapc->snaps[i],
+					    new_state, NULL);
+		if (ret < 0) {
+			obj_req->pending.result = ret;
+			return;
+		}
+
+		rbd_assert(!ret);
+		obj_req->pending.num_pending++;
+	}
+}
+
 static void rbd_obj_copyup_write_object(struct rbd_obj_request *obj_req)
 {
 	u32 bytes = rbd_obj_img_extents_bytes(obj_req);
@@ -2749,6 +3321,7 @@ static void rbd_obj_copyup_write_object(struct rbd_obj_request *obj_req)
 
 static bool rbd_obj_advance_copyup(struct rbd_obj_request *obj_req, int *result)
 {
+	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
 	int ret;
 
 again:
@@ -2776,6 +3349,25 @@ again:
 			obj_req->flags |= RBD_OBJ_FLAG_COPYUP_ZEROS;
 		}
 
+		rbd_obj_copyup_object_maps(obj_req);
+		if (!obj_req->pending.num_pending) {
+			*result = obj_req->pending.result;
+			obj_req->copyup_state = RBD_OBJ_COPYUP_OBJECT_MAPS;
+			goto again;
+		}
+		obj_req->copyup_state = __RBD_OBJ_COPYUP_OBJECT_MAPS;
+		return false;
+	case __RBD_OBJ_COPYUP_OBJECT_MAPS:
+		if (!pending_result_dec(&obj_req->pending, result))
+			return false;
+		/* fall through */
+	case RBD_OBJ_COPYUP_OBJECT_MAPS:
+		if (*result) {
+			rbd_warn(rbd_dev, "snap object map update failed: %d",
+				 *result);
+			return true;
+		}
+
 		rbd_obj_copyup_write_object(obj_req);
 		if (!obj_req->pending.num_pending) {
 			*result = obj_req->pending.result;
@@ -2795,6 +3387,27 @@ again:
 	}
 }
 
+/*
+ * Return:
+ *   0 - object map update sent
+ *   1 - object map update isn't needed
+ *  <0 - error
+ */
+static int rbd_obj_write_post_object_map(struct rbd_obj_request *obj_req)
+{
+	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
+	u8 current_state = OBJECT_PENDING;
+
+	if (!(rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP))
+		return 1;
+
+	if (!(obj_req->flags & RBD_OBJ_FLAG_DELETION))
+		return 1;
+
+	return rbd_object_map_update(obj_req, CEPH_NOSNAP, OBJECT_NONEXISTENT,
+				     &current_state);
+}
+
 static bool rbd_obj_advance_write(struct rbd_obj_request *obj_req, int *result)
 {
 	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
@@ -2805,6 +3418,24 @@ again:
 	case RBD_OBJ_WRITE_START:
 		rbd_assert(!*result);
 
+		if (rbd_obj_write_is_noop(obj_req))
+			return true;
+
+		ret = rbd_obj_write_pre_object_map(obj_req);
+		if (ret < 0) {
+			*result = ret;
+			return true;
+		}
+		obj_req->write_state = RBD_OBJ_WRITE_PRE_OBJECT_MAP;
+		if (ret > 0)
+			goto again;
+		return false;
+	case RBD_OBJ_WRITE_PRE_OBJECT_MAP:
+		if (*result) {
+			rbd_warn(rbd_dev, "pre object map update failed: %d",
+				 *result);
+			return true;
+		}
 		ret = rbd_obj_write_object(obj_req);
 		if (ret) {
 			*result = ret;
@@ -2837,8 +3468,23 @@ again:
 			return false;
 		/* fall through */
 	case RBD_OBJ_WRITE_COPYUP:
-		if (*result)
+		if (*result) {
 			rbd_warn(rbd_dev, "copyup failed: %d", *result);
+			return true;
+		}
+		ret = rbd_obj_write_post_object_map(obj_req);
+		if (ret < 0) {
+			*result = ret;
+			return true;
+		}
+		obj_req->write_state = RBD_OBJ_WRITE_POST_OBJECT_MAP;
+		if (ret > 0)
+			goto again;
+		return false;
+	case RBD_OBJ_WRITE_POST_OBJECT_MAP:
+		if (*result)
+			rbd_warn(rbd_dev, "post object map update failed: %d",
+				 *result);
 		return true;
 	default:
 		BUG();
@@ -2892,7 +3538,8 @@ static bool need_exclusive_lock(struct rbd_img_request *img_req)
 		return false;
 
 	rbd_assert(!test_bit(IMG_REQ_CHILD, &img_req->flags));
-	if (rbd_dev->opts->lock_on_read)
+	if (rbd_dev->opts->lock_on_read ||
+	    (rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP))
 		return true;
 
 	return rbd_img_is_write(img_req);
@@ -3431,7 +4078,7 @@ static int rbd_try_lock(struct rbd_device *rbd_dev)
 		if (ret)
 			goto out; /* request lock or error */
 
-		rbd_warn(rbd_dev, "%s%llu seems dead, breaking lock",
+		rbd_warn(rbd_dev, "breaking header lock owned by %s%llu",
 			 ENTITY_NAME(lockers[0].id.name));
 
 		ret = ceph_monc_blacklist_add(&client->monc,
@@ -3458,6 +4105,19 @@ out:
 	return ret;
 }
 
+static int rbd_post_acquire_action(struct rbd_device *rbd_dev)
+{
+	int ret;
+
+	if (rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP) {
+		ret = rbd_object_map_open(rbd_dev);
+		if (ret)
+			return ret;
+	}
+
+	return 0;
+}
+
 /*
  * Return:
  *   0 - lock acquired
@@ -3501,6 +4161,17 @@ static int rbd_try_acquire_lock(struct rbd_device *rbd_dev)
 	rbd_assert(rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED);
 	rbd_assert(list_empty(&rbd_dev->running_list));
 
+	ret = rbd_post_acquire_action(rbd_dev);
+	if (ret) {
+		rbd_warn(rbd_dev, "post-acquire action failed: %d", ret);
+		/*
+		 * Can't stay in RBD_LOCK_STATE_LOCKED because
+		 * rbd_lock_add_request() would let the request through,
+		 * assuming that e.g. object map is locked and loaded.
+		 */
+		rbd_unlock(rbd_dev);
+	}
+
 out:
 	wake_lock_waiters(rbd_dev, ret);
 	up_write(&rbd_dev->lock_rwsem);
@@ -3574,10 +4245,17 @@ static bool rbd_quiesce_lock(struct rbd_device *rbd_dev)
 	return true;
 }
 
+static void rbd_pre_release_action(struct rbd_device *rbd_dev)
+{
+	if (rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP)
+		rbd_object_map_close(rbd_dev);
+}
+
 static void __rbd_release_lock(struct rbd_device *rbd_dev)
 {
 	rbd_assert(list_empty(&rbd_dev->running_list));
 
+	rbd_pre_release_action(rbd_dev);
 	rbd_unlock(rbd_dev);
 }
 
@@ -4864,6 +5542,8 @@ static struct rbd_device *__rbd_dev_create(struct rbd_client *rbdc,
 	init_completion(&rbd_dev->acquire_wait);
 	init_completion(&rbd_dev->releasing_wait);
 
+	spin_lock_init(&rbd_dev->object_map_lock);
+
 	rbd_dev->dev.bus = &rbd_bus_type;
 	rbd_dev->dev.type = &rbd_device_type;
 	rbd_dev->dev.parent = &rbd_root_dev;
@@ -5045,6 +5725,32 @@ static int rbd_dev_v2_features(struct rbd_device *rbd_dev)
 						&rbd_dev->header.features);
 }
 
+/*
+ * These are generic image flags, but since they are used only for
+ * object map, store them in rbd_dev->object_map_flags.
+ *
+ * For the same reason, this function is called only on object map
+ * (re)load and not on header refresh.
+ */
+static int rbd_dev_v2_get_flags(struct rbd_device *rbd_dev)
+{
+	__le64 snapid = cpu_to_le64(rbd_dev->spec->snap_id);
+	__le64 flags;
+	int ret;
+
+	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
+				  &rbd_dev->header_oloc, "get_flags",
+				  &snapid, sizeof(snapid),
+				  &flags, sizeof(flags));
+	if (ret < 0)
+		return ret;
+	if (ret < sizeof(flags))
+		return -EBADMSG;
+
+	rbd_dev->object_map_flags = le64_to_cpu(flags);
+	return 0;
+}
+
 struct parent_image_info {
 	u64		pool_id;
 	const char	*pool_ns;
@@ -6018,6 +6724,7 @@ static void rbd_dev_unprobe(struct rbd_device *rbd_dev)
 	struct rbd_image_header	*header;
 
 	rbd_dev_parent_put(rbd_dev);
+	rbd_object_map_free(rbd_dev);
 	rbd_dev_mapping_clear(rbd_dev);
 
 	/* Free dynamic fields from the header, then zero it out */
@@ -6267,6 +6974,13 @@ static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth)
 	if (ret)
 		goto err_out_probe;
 
+	if (rbd_dev->spec->snap_id != CEPH_NOSNAP &&
+	    (rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP)) {
+		ret = rbd_object_map_load(rbd_dev);
+		if (ret)
+			goto err_out_probe;
+	}
+
 	if (rbd_dev->header.features & RBD_FEATURE_LAYERING) {
 		ret = rbd_dev_v2_parent_info(rbd_dev);
 		if (ret)
diff --git a/drivers/block/rbd_types.h b/drivers/block/rbd_types.h
index 62ff50d3e7a6..ac98ab6ccd3b 100644
--- a/drivers/block/rbd_types.h
+++ b/drivers/block/rbd_types.h
@@ -18,6 +18,7 @@
 /* For format version 2, rbd image 'foo' consists of objects
  *   rbd_id.foo		- id of image
  *   rbd_header.<id>	- image metadata
+ *   rbd_object_map.<id> - optional image object map
  *   rbd_data.<id>.0000000000000000
  *   rbd_data.<id>.0000000000000001
  *   ...		- data
@@ -25,6 +26,7 @@
  */
 
 #define RBD_HEADER_PREFIX      "rbd_header."
+#define RBD_OBJECT_MAP_PREFIX  "rbd_object_map."
 #define RBD_ID_PREFIX          "rbd_id."
 #define RBD_V2_DATA_FORMAT     "%s.%016llx"
 
@@ -39,6 +41,14 @@ enum rbd_notify_op {
 	RBD_NOTIFY_OP_HEADER_UPDATE      = 3,
 };
 
+#define OBJECT_NONEXISTENT	0
+#define OBJECT_EXISTS		1
+#define OBJECT_PENDING		2
+#define OBJECT_EXISTS_CLEAN	3
+
+#define RBD_FLAG_OBJECT_MAP_INVALID	(1ULL << 0)
+#define RBD_FLAG_FAST_DIFF_INVALID	(1ULL << 1)
+
 /*
  * For format version 1, rbd image 'foo' consists of objects
  *   foo.rbd		- image metadata
diff --git a/include/linux/ceph/cls_lock_client.h b/include/linux/ceph/cls_lock_client.h
index bea6c77d2093..17bc7584d1fe 100644
--- a/include/linux/ceph/cls_lock_client.h
+++ b/include/linux/ceph/cls_lock_client.h
@@ -52,4 +52,7 @@ int ceph_cls_lock_info(struct ceph_osd_client *osdc,
 		       char *lock_name, u8 *type, char **tag,
 		       struct ceph_locker **lockers, u32 *num_lockers);
 
+int ceph_cls_assert_locked(struct ceph_osd_request *req, int which,
+			   char *lock_name, u8 type, char *cookie, char *tag);
+
 #endif
diff --git a/include/linux/ceph/striper.h b/include/linux/ceph/striper.h
index cbd0d24b7148..3486636c0e6e 100644
--- a/include/linux/ceph/striper.h
+++ b/include/linux/ceph/striper.h
@@ -66,4 +66,6 @@ int ceph_extent_to_file(struct ceph_file_layout *l,
 			struct ceph_file_extent **file_extents,
 			u32 *num_file_extents);
 
+u64 ceph_get_num_objects(struct ceph_file_layout *l, u64 size);
+
 #endif
diff --git a/net/ceph/cls_lock_client.c b/net/ceph/cls_lock_client.c
index fb59094caf13..17447c19d937 100644
--- a/net/ceph/cls_lock_client.c
+++ b/net/ceph/cls_lock_client.c
@@ -6,6 +6,7 @@
 
 #include <linux/ceph/cls_lock_client.h>
 #include <linux/ceph/decode.h>
+#include <linux/ceph/libceph.h>
 
 /**
  * ceph_cls_lock - grab rados lock for object
@@ -378,3 +379,47 @@ int ceph_cls_lock_info(struct ceph_osd_client *osdc,
 	return ret;
 }
 EXPORT_SYMBOL(ceph_cls_lock_info);
+
+int ceph_cls_assert_locked(struct ceph_osd_request *req, int which,
+			   char *lock_name, u8 type, char *cookie, char *tag)
+{
+	int assert_op_buf_size;
+	int name_len = strlen(lock_name);
+	int cookie_len = strlen(cookie);
+	int tag_len = strlen(tag);
+	struct page **pages;
+	void *p, *end;
+	int ret;
+
+	assert_op_buf_size = name_len + sizeof(__le32) +
+			     cookie_len + sizeof(__le32) +
+			     tag_len + sizeof(__le32) +
+			     sizeof(u8) + CEPH_ENCODING_START_BLK_LEN;
+	if (assert_op_buf_size > PAGE_SIZE)
+		return -E2BIG;
+
+	ret = osd_req_op_cls_init(req, which, "lock", "assert_locked");
+	if (ret)
+		return ret;
+
+	pages = ceph_alloc_page_vector(1, GFP_NOIO);
+	if (IS_ERR(pages))
+		return PTR_ERR(pages);
+
+	p = page_address(pages[0]);
+	end = p + assert_op_buf_size;
+
+	/* encode cls_lock_assert_op struct */
+	ceph_start_encoding(&p, 1, 1,
+			    assert_op_buf_size - CEPH_ENCODING_START_BLK_LEN);
+	ceph_encode_string(&p, end, lock_name, name_len);
+	ceph_encode_8(&p, type);
+	ceph_encode_string(&p, end, cookie, cookie_len);
+	ceph_encode_string(&p, end, tag, tag_len);
+	WARN_ON(p != end);
+
+	osd_req_op_cls_request_data_pages(req, which, pages, assert_op_buf_size,
+					  0, false, true);
+	return 0;
+}
+EXPORT_SYMBOL(ceph_cls_assert_locked);
diff --git a/net/ceph/striper.c b/net/ceph/striper.c
index c36462dc86b7..3b3fa75d1189 100644
--- a/net/ceph/striper.c
+++ b/net/ceph/striper.c
@@ -259,3 +259,20 @@ int ceph_extent_to_file(struct ceph_file_layout *l,
 	return 0;
 }
 EXPORT_SYMBOL(ceph_extent_to_file);
+
+u64 ceph_get_num_objects(struct ceph_file_layout *l, u64 size)
+{
+	u64 period = (u64)l->stripe_count * l->object_size;
+	u64 num_periods = DIV64_U64_ROUND_UP(size, period);
+	u64 remainder_bytes;
+	u64 remainder_objs = 0;
+
+	div64_u64_rem(size, period, &remainder_bytes);
+	if (remainder_bytes > 0 &&
+	    remainder_bytes < (u64)l->stripe_count * l->stripe_unit)
+		remainder_objs = l->stripe_count -
+			    DIV_ROUND_UP_ULL(remainder_bytes, l->stripe_unit);
+
+	return num_periods * l->stripe_count - remainder_objs;
+}
+EXPORT_SYMBOL(ceph_get_num_objects);
-- 
cgit v1.2.3


From 600c70bad6594cb124c641ed05355ca134650ea4 Mon Sep 17 00:00:00 2001
From: Stanislav Fomichev <sdf@google.com>
Date: Mon, 1 Jul 2019 10:38:39 -0700
Subject: bpf: allow wide (u64) aligned stores for some fields of bpf_sock_addr

Since commit cd17d7770578 ("bpf/tools: sync bpf.h") clang decided
that it can do a single u64 store into user_ip6[2] instead of two
separate u32 ones:

 #  17: (18) r2 = 0x100000000000000
 #  ; ctx->user_ip6[2] = bpf_htonl(DST_REWRITE_IP6_2);
 #  19: (7b) *(u64 *)(r1 +16) = r2
 #  invalid bpf_context access off=16 size=8

>From the compiler point of view it does look like a correct thing
to do, so let's support it on the kernel side.

Credit to Andrii Nakryiko for a proper implementation of
bpf_ctx_wide_store_ok.

Cc: Andrii Nakryiko <andriin@fb.com>
Cc: Yonghong Song <yhs@fb.com>
Fixes: cd17d7770578 ("bpf/tools: sync bpf.h")
Reported-by: kernel test robot <rong.a.chen@intel.com>
Acked-by: Yonghong Song <yhs@fb.com>
Acked-by: Andrii Nakryiko <andriin@fb.com>
Signed-off-by: Stanislav Fomichev <sdf@google.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/linux/filter.h   |  6 ++++++
 include/uapi/linux/bpf.h |  6 +++---
 net/core/filter.c        | 22 ++++++++++++++--------
 3 files changed, 23 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/filter.h b/include/linux/filter.h
index 1fe53e78c7e3..6d944369ca87 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -747,6 +747,12 @@ bpf_ctx_narrow_access_ok(u32 off, u32 size, u32 size_default)
 	return size <= size_default && (size & (size - 1)) == 0;
 }
 
+#define bpf_ctx_wide_store_ok(off, size, type, field)			\
+	(size == sizeof(__u64) &&					\
+	off >= offsetof(type, field) &&					\
+	off + sizeof(__u64) <= offsetofend(type, field) &&		\
+	off % sizeof(__u64) == 0)
+
 #define bpf_classic_proglen(fprog) (fprog->len * sizeof(fprog->filter[0]))
 
 static inline void bpf_prog_lock_ro(struct bpf_prog *fp)
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index ead27aebf491..c318385aba51 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -3247,7 +3247,7 @@ struct bpf_sock_addr {
 	__u32 user_ip4;		/* Allows 1,2,4-byte read and 4-byte write.
 				 * Stored in network byte order.
 				 */
-	__u32 user_ip6[4];	/* Allows 1,2,4-byte read an 4-byte write.
+	__u32 user_ip6[4];	/* Allows 1,2,4-byte read and 4,8-byte write.
 				 * Stored in network byte order.
 				 */
 	__u32 user_port;	/* Allows 4-byte read and write.
@@ -3256,10 +3256,10 @@ struct bpf_sock_addr {
 	__u32 family;		/* Allows 4-byte read, but no write */
 	__u32 type;		/* Allows 4-byte read, but no write */
 	__u32 protocol;		/* Allows 4-byte read, but no write */
-	__u32 msg_src_ip4;	/* Allows 1,2,4-byte read an 4-byte write.
+	__u32 msg_src_ip4;	/* Allows 1,2,4-byte read and 4-byte write.
 				 * Stored in network byte order.
 				 */
-	__u32 msg_src_ip6[4];	/* Allows 1,2,4-byte read an 4-byte write.
+	__u32 msg_src_ip6[4];	/* Allows 1,2,4-byte read and 4,8-byte write.
 				 * Stored in network byte order.
 				 */
 	__bpf_md_ptr(struct bpf_sock *, sk);
diff --git a/net/core/filter.c b/net/core/filter.c
index 089aaea0ccc6..4481e950f020 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -6890,6 +6890,16 @@ static bool sock_addr_is_valid_access(int off, int size,
 			if (!bpf_ctx_narrow_access_ok(off, size, size_default))
 				return false;
 		} else {
+			if (bpf_ctx_wide_store_ok(off, size,
+						  struct bpf_sock_addr,
+						  user_ip6))
+				return true;
+
+			if (bpf_ctx_wide_store_ok(off, size,
+						  struct bpf_sock_addr,
+						  msg_src_ip6))
+				return true;
+
 			if (size != size_default)
 				return false;
 		}
@@ -7730,9 +7740,6 @@ static u32 xdp_convert_ctx_access(enum bpf_access_type type,
 /* SOCK_ADDR_STORE_NESTED_FIELD_OFF() has semantic similar to
  * SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF() but for store operation.
  *
- * It doesn't support SIZE argument though since narrow stores are not
- * supported for now.
- *
  * In addition it uses Temporary Field TF (member of struct S) as the 3rd
  * "register" since two registers available in convert_ctx_access are not
  * enough: we can't override neither SRC, since it contains value to store, nor
@@ -7740,7 +7747,7 @@ static u32 xdp_convert_ctx_access(enum bpf_access_type type,
  * instructions. But we need a temporary place to save pointer to nested
  * structure whose field we want to store to.
  */
-#define SOCK_ADDR_STORE_NESTED_FIELD_OFF(S, NS, F, NF, OFF, TF)		       \
+#define SOCK_ADDR_STORE_NESTED_FIELD_OFF(S, NS, F, NF, SIZE, OFF, TF)	       \
 	do {								       \
 		int tmp_reg = BPF_REG_9;				       \
 		if (si->src_reg == tmp_reg || si->dst_reg == tmp_reg)	       \
@@ -7751,8 +7758,7 @@ static u32 xdp_convert_ctx_access(enum bpf_access_type type,
 				      offsetof(S, TF));			       \
 		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(S, F), tmp_reg,	       \
 				      si->dst_reg, offsetof(S, F));	       \
-		*insn++ = BPF_STX_MEM(					       \
-			BPF_FIELD_SIZEOF(NS, NF), tmp_reg, si->src_reg,	       \
+		*insn++ = BPF_STX_MEM(SIZE, tmp_reg, si->src_reg,	       \
 			bpf_target_off(NS, NF, FIELD_SIZEOF(NS, NF),	       \
 				       target_size)			       \
 				+ OFF);					       \
@@ -7764,8 +7770,8 @@ static u32 xdp_convert_ctx_access(enum bpf_access_type type,
 						      TF)		       \
 	do {								       \
 		if (type == BPF_WRITE) {				       \
-			SOCK_ADDR_STORE_NESTED_FIELD_OFF(S, NS, F, NF, OFF,    \
-							 TF);		       \
+			SOCK_ADDR_STORE_NESTED_FIELD_OFF(S, NS, F, NF, SIZE,   \
+							 OFF, TF);	       \
 		} else {						       \
 			SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF(		       \
 				S, NS, F, NF, SIZE, OFF);  \
-- 
cgit v1.2.3


From f4915455dcf07c4f237d6160a4b6adb0575d2909 Mon Sep 17 00:00:00 2001
From: Yamin Friedman <yaminf@mellanox.com>
Date: Mon, 8 Jul 2019 13:59:02 +0300
Subject: linux/dim: Implement RDMA adaptive moderation (DIM)

RDMA DIM implements a different algorithm from net DIM and is based on
completions which is how we can implement interrupt moderation in RDMA.

The algorithm optimizes for number of completions and ratio between
completions and events. In order to avoid long latencies, the
implementation performs fast reduction of moderation level when the
traffic changes.

Signed-off-by: Yamin Friedman <yaminf@mellanox.com>
Reviewed-by: Max Gurtovoy <maxg@mellanox.com>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 include/linux/dim.h |  36 ++++++++++++++++++
 lib/dim/Makefile    |   6 +--
 lib/dim/rdma_dim.c  | 108 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 146 insertions(+), 4 deletions(-)
 create mode 100644 lib/dim/rdma_dim.c

(limited to 'include/linux')

diff --git a/include/linux/dim.h b/include/linux/dim.h
index aa9bdd47a648..aa69730c3b8d 100644
--- a/include/linux/dim.h
+++ b/include/linux/dim.h
@@ -82,6 +82,7 @@ struct dim_stats {
  * @prev_stats: Measured rates from previous iteration (for comparison)
  * @start_sample: Sampled data at start of current iteration
  * @work: Work to perform on action required
+ * @priv: A pointer to the struct that points to dim
  * @profile_ix: Current moderation profile
  * @mode: CQ period count mode
  * @tune_state: Algorithm tuning state (see below)
@@ -95,6 +96,7 @@ struct dim {
 	struct dim_sample start_sample;
 	struct dim_sample measuring_sample;
 	struct work_struct work;
+	void *priv;
 	u8 profile_ix;
 	u8 mode;
 	u8 tune_state;
@@ -363,4 +365,38 @@ struct dim_cq_moder net_dim_get_def_tx_moderation(u8 cq_period_mode);
  */
 void net_dim(struct dim *dim, struct dim_sample end_sample);
 
+/* RDMA DIM */
+
+/*
+ * RDMA DIM profile:
+ * profile size must be of RDMA_DIM_PARAMS_NUM_PROFILES.
+ */
+#define RDMA_DIM_PARAMS_NUM_PROFILES 9
+#define RDMA_DIM_START_PROFILE 0
+
+static const struct dim_cq_moder
+rdma_dim_prof[RDMA_DIM_PARAMS_NUM_PROFILES] = {
+	{1,   0, 1,  0},
+	{1,   0, 4,  0},
+	{2,   0, 4,  0},
+	{2,   0, 8,  0},
+	{4,   0, 8,  0},
+	{16,  0, 8,  0},
+	{16,  0, 16, 0},
+	{32,  0, 16, 0},
+	{32,  0, 32, 0},
+};
+
+/**
+ * rdma_dim - Runs the adaptive moderation.
+ * @dim: The moderation struct.
+ * @completions: The number of completions collected in this round.
+ *
+ * Each call to rdma_dim takes the latest amount of completions that
+ * have been collected and counts them as a new event.
+ * Once enough events have been collected the algorithm decides a new
+ * moderation level.
+ */
+void rdma_dim(struct dim *dim, u64 completions);
+
 #endif /* DIM_H */
diff --git a/lib/dim/Makefile b/lib/dim/Makefile
index 160afe288df0..1d6858a108cb 100644
--- a/lib/dim/Makefile
+++ b/lib/dim/Makefile
@@ -2,8 +2,6 @@
 # DIM Dynamic Interrupt Moderation library
 #
 
-obj-$(CONFIG_DIMLIB) = net_dim.o
+obj-$(CONFIG_DIMLIB) += dim.o
 
-net_dim-y = \
-	dim.o		\
-	net_dim.o
+dim-y := dim.o net_dim.o rdma_dim.o
diff --git a/lib/dim/rdma_dim.c b/lib/dim/rdma_dim.c
new file mode 100644
index 000000000000..f7e26c7b4749
--- /dev/null
+++ b/lib/dim/rdma_dim.c
@@ -0,0 +1,108 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+/*
+ * Copyright (c) 2019, Mellanox Technologies inc.  All rights reserved.
+ */
+
+#include <linux/dim.h>
+
+static int rdma_dim_step(struct dim *dim)
+{
+	if (dim->tune_state == DIM_GOING_RIGHT) {
+		if (dim->profile_ix == (RDMA_DIM_PARAMS_NUM_PROFILES - 1))
+			return DIM_ON_EDGE;
+		dim->profile_ix++;
+		dim->steps_right++;
+	}
+	if (dim->tune_state == DIM_GOING_LEFT) {
+		if (dim->profile_ix == 0)
+			return DIM_ON_EDGE;
+		dim->profile_ix--;
+		dim->steps_left++;
+	}
+
+	return DIM_STEPPED;
+}
+
+static int rdma_dim_stats_compare(struct dim_stats *curr,
+				  struct dim_stats *prev)
+{
+	/* first stat */
+	if (!prev->cpms)
+		return DIM_STATS_SAME;
+
+	if (IS_SIGNIFICANT_DIFF(curr->cpms, prev->cpms))
+		return (curr->cpms > prev->cpms) ? DIM_STATS_BETTER :
+						DIM_STATS_WORSE;
+
+	if (IS_SIGNIFICANT_DIFF(curr->cpe_ratio, prev->cpe_ratio))
+		return (curr->cpe_ratio > prev->cpe_ratio) ? DIM_STATS_BETTER :
+						DIM_STATS_WORSE;
+
+	return DIM_STATS_SAME;
+}
+
+static bool rdma_dim_decision(struct dim_stats *curr_stats, struct dim *dim)
+{
+	int prev_ix = dim->profile_ix;
+	u8 state = dim->tune_state;
+	int stats_res;
+	int step_res;
+
+	if (state != DIM_PARKING_ON_TOP && state != DIM_PARKING_TIRED) {
+		stats_res = rdma_dim_stats_compare(curr_stats,
+						   &dim->prev_stats);
+
+		switch (stats_res) {
+		case DIM_STATS_SAME:
+			if (curr_stats->cpe_ratio <= 50 * prev_ix)
+				dim->profile_ix = 0;
+			break;
+		case DIM_STATS_WORSE:
+			dim_turn(dim);
+			/* fall through */
+		case DIM_STATS_BETTER:
+			step_res = rdma_dim_step(dim);
+			if (step_res == DIM_ON_EDGE)
+				dim_turn(dim);
+			break;
+		}
+	}
+
+	dim->prev_stats = *curr_stats;
+
+	return dim->profile_ix != prev_ix;
+}
+
+void rdma_dim(struct dim *dim, u64 completions)
+{
+	struct dim_sample *curr_sample = &dim->measuring_sample;
+	struct dim_stats curr_stats;
+	u32 nevents;
+
+	dim_update_sample_with_comps(curr_sample->event_ctr + 1, 0, 0,
+				     curr_sample->comp_ctr + completions,
+				     &dim->measuring_sample);
+
+	switch (dim->state) {
+	case DIM_MEASURE_IN_PROGRESS:
+		nevents = curr_sample->event_ctr - dim->start_sample.event_ctr;
+		if (nevents < DIM_NEVENTS)
+			break;
+		dim_calc_stats(&dim->start_sample, curr_sample, &curr_stats);
+		if (rdma_dim_decision(&curr_stats, dim)) {
+			dim->state = DIM_APPLY_NEW_PROFILE;
+			schedule_work(&dim->work);
+			break;
+		}
+		/* fall through */
+	case DIM_START_MEASURE:
+		dim->state = DIM_MEASURE_IN_PROGRESS;
+		dim_update_sample_with_comps(curr_sample->event_ctr, 0, 0,
+					     curr_sample->comp_ctr,
+					     &dim->start_sample);
+		break;
+	case DIM_APPLY_NEW_PROFILE:
+		break;
+	}
+}
+EXPORT_SYMBOL(rdma_dim);
-- 
cgit v1.2.3


From 15ffe5e1acf5fe1512e98b20702e46ce9f25e2f7 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 8 Jul 2019 12:55:27 -0700
Subject: dma-mapping: mark dma_alloc_need_uncached as __always_inline

Without the __always_inline at least i386 configs that have
CONFIG_OPTIMIZE_INLINING set seem to fail to inline
dma_alloc_need_uncached, leading to a linker error because of
undefined symbols.

Reported-by: Randy Dunlap <rdunlap@infradead.org>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Acked-by: Randy Dunlap <rdunlap@infradead.org> # build-tested
---
 include/linux/dma-noncoherent.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/dma-noncoherent.h b/include/linux/dma-noncoherent.h
index 53ee36ecdf37..3813211a9aad 100644
--- a/include/linux/dma-noncoherent.h
+++ b/include/linux/dma-noncoherent.h
@@ -23,7 +23,7 @@ static inline bool dev_is_dma_coherent(struct device *dev)
 /*
  * Check if an allocation needs to be marked uncached to be coherent.
  */
-static inline bool dma_alloc_need_uncached(struct device *dev,
+static __always_inline bool dma_alloc_need_uncached(struct device *dev,
 		unsigned long attrs)
 {
 	if (dev_is_dma_coherent(dev))
-- 
cgit v1.2.3


From 67d874c3b2c69d65274fa5ce44ba939788d5729d Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Mon, 8 Jul 2019 16:27:52 +0530
Subject: cpufreq: Register notifiers with the PM QoS framework

Register notifiers for min/max frequency constraints with the PM QoS
framework. The constraints are also taken into consideration in
cpufreq_set_policy().

This also relocates cpufreq_policy_put_kobj() as it is required to be
called from cpufreq_policy_alloc() now.

refresh_frequency_limits() is updated to avoid calling
cpufreq_set_policy() for inactive policies and handle_update() is
updated to have proper locking in place.

No constraints are added yet, though.

Reviewed-by: Matthias Kaehlcke <mka@chromium.org>
Reviewed-by: Ulf Hansson <ulf.hansson@linaro.org>
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
Tested-by: Pavel Machek <pavel@ucw.cz>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/cpufreq.c | 135 +++++++++++++++++++++++++++++++++++-----------
 include/linux/cpufreq.h   |   3 ++
 2 files changed, 108 insertions(+), 30 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index ceb57af15ca0..b96ef6db1bfe 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -26,6 +26,7 @@
 #include <linux/kernel_stat.h>
 #include <linux/module.h>
 #include <linux/mutex.h>
+#include <linux/pm_qos.h>
 #include <linux/slab.h>
 #include <linux/suspend.h>
 #include <linux/syscore_ops.h>
@@ -999,7 +1000,7 @@ static void add_cpu_dev_symlink(struct cpufreq_policy *policy, unsigned int cpu)
 {
 	struct device *dev = get_cpu_device(cpu);
 
-	if (!dev)
+	if (unlikely(!dev))
 		return;
 
 	if (cpumask_test_and_set_cpu(cpu, policy->real_cpus))
@@ -1117,14 +1118,16 @@ static int cpufreq_add_policy_cpu(struct cpufreq_policy *policy, unsigned int cp
 
 static void refresh_frequency_limits(struct cpufreq_policy *policy)
 {
-	struct cpufreq_policy new_policy = *policy;
-
-	pr_debug("updating policy for CPU %u\n", policy->cpu);
+	struct cpufreq_policy new_policy;
 
-	new_policy.min = policy->user_policy.min;
-	new_policy.max = policy->user_policy.max;
+	if (!policy_is_inactive(policy)) {
+		new_policy = *policy;
+		pr_debug("updating policy for CPU %u\n", policy->cpu);
 
-	cpufreq_set_policy(policy, &new_policy);
+		new_policy.min = policy->user_policy.min;
+		new_policy.max = policy->user_policy.max;
+		cpufreq_set_policy(policy, &new_policy);
+	}
 }
 
 static void handle_update(struct work_struct *work)
@@ -1133,14 +1136,60 @@ static void handle_update(struct work_struct *work)
 		container_of(work, struct cpufreq_policy, update);
 
 	pr_debug("handle_update for cpu %u called\n", policy->cpu);
+	down_write(&policy->rwsem);
 	refresh_frequency_limits(policy);
+	up_write(&policy->rwsem);
+}
+
+static int cpufreq_notifier_min(struct notifier_block *nb, unsigned long freq,
+				void *data)
+{
+	struct cpufreq_policy *policy = container_of(nb, struct cpufreq_policy, nb_min);
+
+	schedule_work(&policy->update);
+	return 0;
+}
+
+static int cpufreq_notifier_max(struct notifier_block *nb, unsigned long freq,
+				void *data)
+{
+	struct cpufreq_policy *policy = container_of(nb, struct cpufreq_policy, nb_max);
+
+	schedule_work(&policy->update);
+	return 0;
+}
+
+static void cpufreq_policy_put_kobj(struct cpufreq_policy *policy)
+{
+	struct kobject *kobj;
+	struct completion *cmp;
+
+	down_write(&policy->rwsem);
+	cpufreq_stats_free_table(policy);
+	kobj = &policy->kobj;
+	cmp = &policy->kobj_unregister;
+	up_write(&policy->rwsem);
+	kobject_put(kobj);
+
+	/*
+	 * We need to make sure that the underlying kobj is
+	 * actually not referenced anymore by anybody before we
+	 * proceed with unloading.
+	 */
+	pr_debug("waiting for dropping of refcount\n");
+	wait_for_completion(cmp);
+	pr_debug("wait complete\n");
 }
 
 static struct cpufreq_policy *cpufreq_policy_alloc(unsigned int cpu)
 {
 	struct cpufreq_policy *policy;
+	struct device *dev = get_cpu_device(cpu);
 	int ret;
 
+	if (!dev)
+		return NULL;
+
 	policy = kzalloc(sizeof(*policy), GFP_KERNEL);
 	if (!policy)
 		return NULL;
@@ -1157,7 +1206,7 @@ static struct cpufreq_policy *cpufreq_policy_alloc(unsigned int cpu)
 	ret = kobject_init_and_add(&policy->kobj, &ktype_cpufreq,
 				   cpufreq_global_kobject, "policy%u", cpu);
 	if (ret) {
-		pr_err("%s: failed to init policy->kobj: %d\n", __func__, ret);
+		dev_err(dev, "%s: failed to init policy->kobj: %d\n", __func__, ret);
 		/*
 		 * The entire policy object will be freed below, but the extra
 		 * memory allocated for the kobject name needs to be freed by
@@ -1167,6 +1216,25 @@ static struct cpufreq_policy *cpufreq_policy_alloc(unsigned int cpu)
 		goto err_free_real_cpus;
 	}
 
+	policy->nb_min.notifier_call = cpufreq_notifier_min;
+	policy->nb_max.notifier_call = cpufreq_notifier_max;
+
+	ret = dev_pm_qos_add_notifier(dev, &policy->nb_min,
+				      DEV_PM_QOS_MIN_FREQUENCY);
+	if (ret) {
+		dev_err(dev, "Failed to register MIN QoS notifier: %d (%*pbl)\n",
+			ret, cpumask_pr_args(policy->cpus));
+		goto err_kobj_remove;
+	}
+
+	ret = dev_pm_qos_add_notifier(dev, &policy->nb_max,
+				      DEV_PM_QOS_MAX_FREQUENCY);
+	if (ret) {
+		dev_err(dev, "Failed to register MAX QoS notifier: %d (%*pbl)\n",
+			ret, cpumask_pr_args(policy->cpus));
+		goto err_min_qos_notifier;
+	}
+
 	INIT_LIST_HEAD(&policy->policy_list);
 	init_rwsem(&policy->rwsem);
 	spin_lock_init(&policy->transition_lock);
@@ -1177,6 +1245,11 @@ static struct cpufreq_policy *cpufreq_policy_alloc(unsigned int cpu)
 	policy->cpu = cpu;
 	return policy;
 
+err_min_qos_notifier:
+	dev_pm_qos_remove_notifier(dev, &policy->nb_min,
+				   DEV_PM_QOS_MIN_FREQUENCY);
+err_kobj_remove:
+	cpufreq_policy_put_kobj(policy);
 err_free_real_cpus:
 	free_cpumask_var(policy->real_cpus);
 err_free_rcpumask:
@@ -1189,30 +1262,9 @@ err_free_policy:
 	return NULL;
 }
 
-static void cpufreq_policy_put_kobj(struct cpufreq_policy *policy)
-{
-	struct kobject *kobj;
-	struct completion *cmp;
-
-	down_write(&policy->rwsem);
-	cpufreq_stats_free_table(policy);
-	kobj = &policy->kobj;
-	cmp = &policy->kobj_unregister;
-	up_write(&policy->rwsem);
-	kobject_put(kobj);
-
-	/*
-	 * We need to make sure that the underlying kobj is
-	 * actually not referenced anymore by anybody before we
-	 * proceed with unloading.
-	 */
-	pr_debug("waiting for dropping of refcount\n");
-	wait_for_completion(cmp);
-	pr_debug("wait complete\n");
-}
-
 static void cpufreq_policy_free(struct cpufreq_policy *policy)
 {
+	struct device *dev = get_cpu_device(policy->cpu);
 	unsigned long flags;
 	int cpu;
 
@@ -1224,6 +1276,11 @@ static void cpufreq_policy_free(struct cpufreq_policy *policy)
 		per_cpu(cpufreq_cpu_data, cpu) = NULL;
 	write_unlock_irqrestore(&cpufreq_driver_lock, flags);
 
+	dev_pm_qos_remove_notifier(dev, &policy->nb_max,
+				   DEV_PM_QOS_MAX_FREQUENCY);
+	dev_pm_qos_remove_notifier(dev, &policy->nb_min,
+				   DEV_PM_QOS_MIN_FREQUENCY);
+
 	cpufreq_policy_put_kobj(policy);
 	free_cpumask_var(policy->real_cpus);
 	free_cpumask_var(policy->related_cpus);
@@ -2283,6 +2340,8 @@ int cpufreq_set_policy(struct cpufreq_policy *policy,
 		       struct cpufreq_policy *new_policy)
 {
 	struct cpufreq_governor *old_gov;
+	struct device *cpu_dev = get_cpu_device(policy->cpu);
+	unsigned long min, max;
 	int ret;
 
 	pr_debug("setting new policy for CPU %u: %u - %u kHz\n",
@@ -2297,11 +2356,27 @@ int cpufreq_set_policy(struct cpufreq_policy *policy,
 	if (new_policy->min > new_policy->max)
 		return -EINVAL;
 
+	/*
+	 * PM QoS framework collects all the requests from users and provide us
+	 * the final aggregated value here.
+	 */
+	min = dev_pm_qos_read_value(cpu_dev, DEV_PM_QOS_MIN_FREQUENCY);
+	max = dev_pm_qos_read_value(cpu_dev, DEV_PM_QOS_MAX_FREQUENCY);
+
+	if (min > new_policy->min)
+		new_policy->min = min;
+	if (max < new_policy->max)
+		new_policy->max = max;
+
 	/* verify the cpu speed can be set within this limit */
 	ret = cpufreq_driver->verify(new_policy);
 	if (ret)
 		return ret;
 
+	/*
+	 * The notifier-chain shall be removed once all the users of
+	 * CPUFREQ_ADJUST are moved to use the QoS framework.
+	 */
 	/* adjust if necessary - all reasons */
 	blocking_notifier_call_chain(&cpufreq_policy_notifier_list,
 			CPUFREQ_ADJUST, new_policy);
diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h
index a1467aa7f58b..95425941f46d 100644
--- a/include/linux/cpufreq.h
+++ b/include/linux/cpufreq.h
@@ -147,6 +147,9 @@ struct cpufreq_policy {
 
 	/* Pointer to the cooling device if used for thermal mitigation */
 	struct thermal_cooling_device *cdev;
+
+	struct notifier_block nb_min;
+	struct notifier_block nb_max;
 };
 
 struct cpufreq_freqs {
-- 
cgit v1.2.3


From c57b25bdf7cd374af106992356536bf5df7c255b Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Thu, 4 Jul 2019 13:06:22 +0530
Subject: cpufreq: intel_pstate: Reuse refresh_frequency_limits()

The implementation of intel_pstate_update_max_freq() is quite similar to
refresh_frequency_limits(), so let's reuse it.

Finding minimum of policy->user_policy.max and policy->cpuinfo.max_freq
in intel_pstate_update_max_freq() is redundant as cpufreq_set_policy()
will call the ->verify() callback of intel-pstate driver, which will do
this comparison anyway and so dropping it from
intel_pstate_update_max_freq() doesn't harm.

Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/cpufreq.c      | 3 ++-
 drivers/cpufreq/intel_pstate.c | 7 +------
 include/linux/cpufreq.h        | 1 +
 3 files changed, 4 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index b96ef6db1bfe..79bac52919a5 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -1116,7 +1116,7 @@ static int cpufreq_add_policy_cpu(struct cpufreq_policy *policy, unsigned int cp
 	return ret;
 }
 
-static void refresh_frequency_limits(struct cpufreq_policy *policy)
+void refresh_frequency_limits(struct cpufreq_policy *policy)
 {
 	struct cpufreq_policy new_policy;
 
@@ -1129,6 +1129,7 @@ static void refresh_frequency_limits(struct cpufreq_policy *policy)
 		cpufreq_set_policy(policy, &new_policy);
 	}
 }
+EXPORT_SYMBOL(refresh_frequency_limits);
 
 static void handle_update(struct work_struct *work)
 {
diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
index f2ff5de988c1..cc27d4c59dca 100644
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -898,7 +898,6 @@ static void intel_pstate_update_policies(void)
 static void intel_pstate_update_max_freq(unsigned int cpu)
 {
 	struct cpufreq_policy *policy = cpufreq_cpu_acquire(cpu);
-	struct cpufreq_policy new_policy;
 	struct cpudata *cpudata;
 
 	if (!policy)
@@ -908,11 +907,7 @@ static void intel_pstate_update_max_freq(unsigned int cpu)
 	policy->cpuinfo.max_freq = global.turbo_disabled_mf ?
 			cpudata->pstate.max_freq : cpudata->pstate.turbo_freq;
 
-	memcpy(&new_policy, policy, sizeof(*policy));
-	new_policy.max = min(policy->user_policy.max, policy->cpuinfo.max_freq);
-	new_policy.min = min(policy->user_policy.min, new_policy.max);
-
-	cpufreq_set_policy(policy, &new_policy);
+	refresh_frequency_limits(policy);
 
 	cpufreq_cpu_release(policy);
 }
diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h
index 95425941f46d..1fa37b675a80 100644
--- a/include/linux/cpufreq.h
+++ b/include/linux/cpufreq.h
@@ -207,6 +207,7 @@ void cpufreq_cpu_release(struct cpufreq_policy *policy);
 int cpufreq_get_policy(struct cpufreq_policy *policy, unsigned int cpu);
 int cpufreq_set_policy(struct cpufreq_policy *policy,
 		       struct cpufreq_policy *new_policy);
+void refresh_frequency_limits(struct cpufreq_policy *policy);
 void cpufreq_update_policy(unsigned int cpu);
 void cpufreq_update_limits(unsigned int cpu);
 bool have_governor_per_policy(void);
-- 
cgit v1.2.3


From 18c49926c4bf4915e5194d1de3299c0537229f9f Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Fri, 5 Jul 2019 16:21:24 +0530
Subject: cpufreq: Add QoS requests for userspace constraints

This implements QoS requests to manage userspace configuration of min
and max frequency.

Reviewed-by: Matthias Kaehlcke <mka@chromium.org>
Reviewed-by: Ulf Hansson <ulf.hansson@linaro.org>
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
Tested-by: syzbot <syzbot+de771ae9390dffed7266@syzkaller.appspotmail.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/cpufreq.c | 98 ++++++++++++++++++++++++++---------------------
 include/linux/cpufreq.h   |  8 +---
 2 files changed, 57 insertions(+), 49 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index 79bac52919a5..99aa7d20b458 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -718,23 +718,15 @@ static ssize_t show_scaling_cur_freq(struct cpufreq_policy *policy, char *buf)
 static ssize_t store_##file_name					\
 (struct cpufreq_policy *policy, const char *buf, size_t count)		\
 {									\
-	int ret, temp;							\
-	struct cpufreq_policy new_policy;				\
+	unsigned long val;						\
+	int ret;							\
 									\
-	memcpy(&new_policy, policy, sizeof(*policy));			\
-	new_policy.min = policy->user_policy.min;			\
-	new_policy.max = policy->user_policy.max;			\
-									\
-	ret = sscanf(buf, "%u", &new_policy.object);			\
+	ret = sscanf(buf, "%lu", &val);					\
 	if (ret != 1)							\
 		return -EINVAL;						\
 									\
-	temp = new_policy.object;					\
-	ret = cpufreq_set_policy(policy, &new_policy);		\
-	if (!ret)							\
-		policy->user_policy.object = temp;			\
-									\
-	return ret ? ret : count;					\
+	ret = dev_pm_qos_update_request(policy->object##_freq_req, val);\
+	return ret >= 0 ? count : ret;					\
 }
 
 store_one(scaling_min_freq, min);
@@ -1124,8 +1116,6 @@ void refresh_frequency_limits(struct cpufreq_policy *policy)
 		new_policy = *policy;
 		pr_debug("updating policy for CPU %u\n", policy->cpu);
 
-		new_policy.min = policy->user_policy.min;
-		new_policy.max = policy->user_policy.max;
 		cpufreq_set_policy(policy, &new_policy);
 	}
 }
@@ -1281,6 +1271,9 @@ static void cpufreq_policy_free(struct cpufreq_policy *policy)
 				   DEV_PM_QOS_MAX_FREQUENCY);
 	dev_pm_qos_remove_notifier(dev, &policy->nb_min,
 				   DEV_PM_QOS_MIN_FREQUENCY);
+	dev_pm_qos_remove_request(policy->max_freq_req);
+	dev_pm_qos_remove_request(policy->min_freq_req);
+	kfree(policy->min_freq_req);
 
 	cpufreq_policy_put_kobj(policy);
 	free_cpumask_var(policy->real_cpus);
@@ -1359,16 +1352,50 @@ static int cpufreq_online(unsigned int cpu)
 	cpumask_and(policy->cpus, policy->cpus, cpu_online_mask);
 
 	if (new_policy) {
-		policy->user_policy.min = policy->min;
-		policy->user_policy.max = policy->max;
+		struct device *dev = get_cpu_device(cpu);
 
 		for_each_cpu(j, policy->related_cpus) {
 			per_cpu(cpufreq_cpu_data, j) = policy;
 			add_cpu_dev_symlink(policy, j);
 		}
-	} else {
-		policy->min = policy->user_policy.min;
-		policy->max = policy->user_policy.max;
+
+		policy->min_freq_req = kzalloc(2 * sizeof(*policy->min_freq_req),
+					       GFP_KERNEL);
+		if (!policy->min_freq_req)
+			goto out_destroy_policy;
+
+		ret = dev_pm_qos_add_request(dev, policy->min_freq_req,
+					     DEV_PM_QOS_MIN_FREQUENCY,
+					     policy->min);
+		if (ret < 0) {
+			/*
+			 * So we don't call dev_pm_qos_remove_request() for an
+			 * uninitialized request.
+			 */
+			kfree(policy->min_freq_req);
+			policy->min_freq_req = NULL;
+
+			dev_err(dev, "Failed to add min-freq constraint (%d)\n",
+				ret);
+			goto out_destroy_policy;
+		}
+
+		/*
+		 * This must be initialized right here to avoid calling
+		 * dev_pm_qos_remove_request() on uninitialized request in case
+		 * of errors.
+		 */
+		policy->max_freq_req = policy->min_freq_req + 1;
+
+		ret = dev_pm_qos_add_request(dev, policy->max_freq_req,
+					     DEV_PM_QOS_MAX_FREQUENCY,
+					     policy->max);
+		if (ret < 0) {
+			policy->max_freq_req = NULL;
+			dev_err(dev, "Failed to add max-freq constraint (%d)\n",
+				ret);
+			goto out_destroy_policy;
+		}
 	}
 
 	if (cpufreq_driver->get && has_target()) {
@@ -2342,7 +2369,6 @@ int cpufreq_set_policy(struct cpufreq_policy *policy,
 {
 	struct cpufreq_governor *old_gov;
 	struct device *cpu_dev = get_cpu_device(policy->cpu);
-	unsigned long min, max;
 	int ret;
 
 	pr_debug("setting new policy for CPU %u: %u - %u kHz\n",
@@ -2350,24 +2376,12 @@ int cpufreq_set_policy(struct cpufreq_policy *policy,
 
 	memcpy(&new_policy->cpuinfo, &policy->cpuinfo, sizeof(policy->cpuinfo));
 
-	/*
-	* This check works well when we store new min/max freq attributes,
-	* because new_policy is a copy of policy with one field updated.
-	*/
-	if (new_policy->min > new_policy->max)
-		return -EINVAL;
-
 	/*
 	 * PM QoS framework collects all the requests from users and provide us
 	 * the final aggregated value here.
 	 */
-	min = dev_pm_qos_read_value(cpu_dev, DEV_PM_QOS_MIN_FREQUENCY);
-	max = dev_pm_qos_read_value(cpu_dev, DEV_PM_QOS_MAX_FREQUENCY);
-
-	if (min > new_policy->min)
-		new_policy->min = min;
-	if (max < new_policy->max)
-		new_policy->max = max;
+	new_policy->min = dev_pm_qos_read_value(cpu_dev, DEV_PM_QOS_MIN_FREQUENCY);
+	new_policy->max = dev_pm_qos_read_value(cpu_dev, DEV_PM_QOS_MAX_FREQUENCY);
 
 	/* verify the cpu speed can be set within this limit */
 	ret = cpufreq_driver->verify(new_policy);
@@ -2456,10 +2470,9 @@ int cpufreq_set_policy(struct cpufreq_policy *policy,
  * @cpu: CPU to re-evaluate the policy for.
  *
  * Update the current frequency for the cpufreq policy of @cpu and use
- * cpufreq_set_policy() to re-apply the min and max limits saved in the
- * user_policy sub-structure of that policy, which triggers the evaluation
- * of policy notifiers and the cpufreq driver's ->verify() callback for the
- * policy in question, among other things.
+ * cpufreq_set_policy() to re-apply the min and max limits, which triggers the
+ * evaluation of policy notifiers and the cpufreq driver's ->verify() callback
+ * for the policy in question, among other things.
  */
 void cpufreq_update_policy(unsigned int cpu)
 {
@@ -2519,10 +2532,9 @@ static int cpufreq_boost_set_sw(int state)
 			break;
 		}
 
-		down_write(&policy->rwsem);
-		policy->user_policy.max = policy->max;
-		cpufreq_governor_limits(policy);
-		up_write(&policy->rwsem);
+		ret = dev_pm_qos_update_request(policy->max_freq_req, policy->max);
+		if (ret)
+			break;
 	}
 
 	return ret;
diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h
index 1fa37b675a80..afc683021ac5 100644
--- a/include/linux/cpufreq.h
+++ b/include/linux/cpufreq.h
@@ -50,11 +50,6 @@ struct cpufreq_cpuinfo {
 	unsigned int		transition_latency;
 };
 
-struct cpufreq_user_policy {
-	unsigned int		min;    /* in kHz */
-	unsigned int		max;    /* in kHz */
-};
-
 struct cpufreq_policy {
 	/* CPUs sharing clock, require sw coordination */
 	cpumask_var_t		cpus;	/* Online CPUs only */
@@ -84,7 +79,8 @@ struct cpufreq_policy {
 	struct work_struct	update; /* if update_policy() needs to be
 					 * called, but you're in IRQ context */
 
-	struct cpufreq_user_policy user_policy;
+	struct dev_pm_qos_request *min_freq_req;
+	struct dev_pm_qos_request *max_freq_req;
 	struct cpufreq_frequency_table	*freq_table;
 	enum cpufreq_table_sorting freq_table_sorted;
 
-- 
cgit v1.2.3


From d4117d63a30876a3654f587c3a419db63d8b529d Mon Sep 17 00:00:00 2001
From: Kweh Hock Leong <hock.leong.kweh@intel.com>
Date: Sat, 6 Jul 2019 01:33:27 +0800
Subject: net: stmmac: enable clause 45 mdio support

DWMAC4 is capable of supporting clause 45 mdio communication.
This patch enables the feature in stmmac_mdio_write() and
stmmac_mdio_read() by following the phy_write_mmd() and
phy_read_mmd() mdiobus read/write implementation format.

Reviewed-by: Li, Yifan <yifan2.li@intel.com>
Signed-off-by: Kweh Hock Leong <hock.leong.kweh@intel.com>
Signed-off-by: Ong Boon Leong <boon.leong.ong@intel.com>
Signed-off-by: Voon Weifeng <weifeng.voon@intel.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c | 43 ++++++++++++++++++-----
 include/linux/phy.h                               |  2 ++
 2 files changed, 37 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c
index 18cadf0b0d66..4304c1abc5d1 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c
@@ -24,11 +24,14 @@
 
 #define MII_BUSY 0x00000001
 #define MII_WRITE 0x00000002
+#define MII_DATA_MASK GENMASK(15, 0)
 
 /* GMAC4 defines */
 #define MII_GMAC4_GOC_SHIFT		2
+#define MII_GMAC4_REG_ADDR_SHIFT	16
 #define MII_GMAC4_WRITE			(1 << MII_GMAC4_GOC_SHIFT)
 #define MII_GMAC4_READ			(3 << MII_GMAC4_GOC_SHIFT)
+#define MII_GMAC4_C45E			BIT(1)
 
 /* XGMAC defines */
 #define MII_XGMAC_SADDR			BIT(18)
@@ -155,22 +158,34 @@ static int stmmac_mdio_read(struct mii_bus *bus, int phyaddr, int phyreg)
 	struct stmmac_priv *priv = netdev_priv(ndev);
 	unsigned int mii_address = priv->hw->mii.addr;
 	unsigned int mii_data = priv->hw->mii.data;
-	u32 v;
-	int data;
 	u32 value = MII_BUSY;
+	int data = 0;
+	u32 v;
 
 	value |= (phyaddr << priv->hw->mii.addr_shift)
 		& priv->hw->mii.addr_mask;
 	value |= (phyreg << priv->hw->mii.reg_shift) & priv->hw->mii.reg_mask;
 	value |= (priv->clk_csr << priv->hw->mii.clk_csr_shift)
 		& priv->hw->mii.clk_csr_mask;
-	if (priv->plat->has_gmac4)
+	if (priv->plat->has_gmac4) {
 		value |= MII_GMAC4_READ;
+		if (phyreg & MII_ADDR_C45) {
+			value |= MII_GMAC4_C45E;
+			value &= ~priv->hw->mii.reg_mask;
+			value |= ((phyreg >> MII_DEVADDR_C45_SHIFT) <<
+			       priv->hw->mii.reg_shift) &
+			       priv->hw->mii.reg_mask;
+
+			data |= (phyreg & MII_REGADDR_C45_MASK) <<
+				MII_GMAC4_REG_ADDR_SHIFT;
+		}
+	}
 
 	if (readl_poll_timeout(priv->ioaddr + mii_address, v, !(v & MII_BUSY),
 			       100, 10000))
 		return -EBUSY;
 
+	writel(data, priv->ioaddr + mii_data);
 	writel(value, priv->ioaddr + mii_address);
 
 	if (readl_poll_timeout(priv->ioaddr + mii_address, v, !(v & MII_BUSY),
@@ -178,7 +193,7 @@ static int stmmac_mdio_read(struct mii_bus *bus, int phyaddr, int phyreg)
 		return -EBUSY;
 
 	/* Read the data from the MII data register */
-	data = (int)readl(priv->ioaddr + mii_data);
+	data = (int)readl(priv->ioaddr + mii_data) & MII_DATA_MASK;
 
 	return data;
 }
@@ -198,8 +213,9 @@ static int stmmac_mdio_write(struct mii_bus *bus, int phyaddr, int phyreg,
 	struct stmmac_priv *priv = netdev_priv(ndev);
 	unsigned int mii_address = priv->hw->mii.addr;
 	unsigned int mii_data = priv->hw->mii.data;
-	u32 v;
 	u32 value = MII_BUSY;
+	int data = phydata;
+	u32 v;
 
 	value |= (phyaddr << priv->hw->mii.addr_shift)
 		& priv->hw->mii.addr_mask;
@@ -207,10 +223,21 @@ static int stmmac_mdio_write(struct mii_bus *bus, int phyaddr, int phyreg,
 
 	value |= (priv->clk_csr << priv->hw->mii.clk_csr_shift)
 		& priv->hw->mii.clk_csr_mask;
-	if (priv->plat->has_gmac4)
+	if (priv->plat->has_gmac4) {
 		value |= MII_GMAC4_WRITE;
-	else
+		if (phyreg & MII_ADDR_C45) {
+			value |= MII_GMAC4_C45E;
+			value &= ~priv->hw->mii.reg_mask;
+			value |= ((phyreg >> MII_DEVADDR_C45_SHIFT) <<
+			       priv->hw->mii.reg_shift) &
+			       priv->hw->mii.reg_mask;
+
+			data |= (phyreg & MII_REGADDR_C45_MASK) <<
+				MII_GMAC4_REG_ADDR_SHIFT;
+		}
+	} else {
 		value |= MII_WRITE;
+	}
 
 	/* Wait until any existing MII operation is complete */
 	if (readl_poll_timeout(priv->ioaddr + mii_address, v, !(v & MII_BUSY),
@@ -218,7 +245,7 @@ static int stmmac_mdio_write(struct mii_bus *bus, int phyaddr, int phyreg,
 		return -EBUSY;
 
 	/* Set the MII address register to write */
-	writel(phydata, priv->ioaddr + mii_data);
+	writel(data, priv->ioaddr + mii_data);
 	writel(value, priv->ioaddr + mii_address);
 
 	/* Wait until any existing MII operation is complete */
diff --git a/include/linux/phy.h b/include/linux/phy.h
index d0af7d37fdf9..1739c6dc470e 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -195,6 +195,8 @@ static inline const char *phy_modes(phy_interface_t interface)
 /* Or MII_ADDR_C45 into regnum for read/write on mii_bus to enable the 21 bit
    IEEE 802.3ae clause 45 addressing mode used by 10GIGE phy chips. */
 #define MII_ADDR_C45 (1<<30)
+#define MII_DEVADDR_C45_SHIFT	16
+#define MII_REGADDR_C45_MASK	GENMASK(15, 0)
 
 struct device;
 struct phylink;
-- 
cgit v1.2.3


From 333f7909a8573145811c4ab7d8c9092301707721 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 5 Jul 2019 20:14:16 +0100
Subject: coallocate socket_wq with socket itself

socket->wq is assign-once, set when we are initializing both
struct socket it's in and struct socket_wq it points to.  As a
matter of fact, the only reason for separate allocation was the
ability to RCU-delay freeing of socket_wq.  RCU-delaying the
freeing of socket itself gets rid of that need, so we can just
fold struct socket_wq into the end of struct socket and simplify
the life both for sock_alloc_inode() (one allocation instead of
two) and for tun/tap oddballs, where we used to embed struct socket
and struct socket_wq into the same structure (now - embedding just
the struct socket).

Note that reference to struct socket_wq in struct sock does remain
a reference - that's unchanged.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/tap.c      |  5 ++---
 drivers/net/tun.c      |  8 +++-----
 include/linux/if_tap.h |  1 -
 include/linux/net.h    |  4 ++--
 include/net/sock.h     |  4 ++--
 net/core/sock.c        |  2 +-
 net/socket.c           | 19 +++++--------------
 7 files changed, 15 insertions(+), 28 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/tap.c b/drivers/net/tap.c
index 8e01390c738e..dd614c2cd994 100644
--- a/drivers/net/tap.c
+++ b/drivers/net/tap.c
@@ -520,8 +520,7 @@ static int tap_open(struct inode *inode, struct file *file)
 		goto err;
 	}
 
-	RCU_INIT_POINTER(q->sock.wq, &q->wq);
-	init_waitqueue_head(&q->wq.wait);
+	init_waitqueue_head(&q->sock.wq.wait);
 	q->sock.type = SOCK_RAW;
 	q->sock.state = SS_CONNECTED;
 	q->sock.file = file;
@@ -579,7 +578,7 @@ static __poll_t tap_poll(struct file *file, poll_table *wait)
 		goto out;
 
 	mask = 0;
-	poll_wait(file, &q->wq.wait, wait);
+	poll_wait(file, &q->sock.wq.wait, wait);
 
 	if (!ptr_ring_empty(&q->ring))
 		mask |= EPOLLIN | EPOLLRDNORM;
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index d7c55e0fa8f4..3d443597bd04 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -160,7 +160,6 @@ struct tun_pcpu_stats {
 struct tun_file {
 	struct sock sk;
 	struct socket socket;
-	struct socket_wq wq;
 	struct tun_struct __rcu *tun;
 	struct fasync_struct *fasync;
 	/* only used for fasnyc */
@@ -2165,7 +2164,7 @@ static void *tun_ring_recv(struct tun_file *tfile, int noblock, int *err)
 		goto out;
 	}
 
-	add_wait_queue(&tfile->wq.wait, &wait);
+	add_wait_queue(&tfile->socket.wq.wait, &wait);
 
 	while (1) {
 		set_current_state(TASK_INTERRUPTIBLE);
@@ -2185,7 +2184,7 @@ static void *tun_ring_recv(struct tun_file *tfile, int noblock, int *err)
 	}
 
 	__set_current_state(TASK_RUNNING);
-	remove_wait_queue(&tfile->wq.wait, &wait);
+	remove_wait_queue(&tfile->socket.wq.wait, &wait);
 
 out:
 	*err = error;
@@ -3415,8 +3414,7 @@ static int tun_chr_open(struct inode *inode, struct file * file)
 	tfile->flags = 0;
 	tfile->ifindex = 0;
 
-	init_waitqueue_head(&tfile->wq.wait);
-	RCU_INIT_POINTER(tfile->socket.wq, &tfile->wq);
+	init_waitqueue_head(&tfile->socket.wq.wait);
 
 	tfile->socket.file = file;
 	tfile->socket.ops = &tun_socket_ops;
diff --git a/include/linux/if_tap.h b/include/linux/if_tap.h
index 8e66866c11be..915a187cfabd 100644
--- a/include/linux/if_tap.h
+++ b/include/linux/if_tap.h
@@ -62,7 +62,6 @@ struct tap_dev {
 struct tap_queue {
 	struct sock sk;
 	struct socket sock;
-	struct socket_wq wq;
 	int vnet_hdr_sz;
 	struct tap_dev __rcu *tap;
 	struct file *file;
diff --git a/include/linux/net.h b/include/linux/net.h
index f7d672cf25b5..9cafb5f353a9 100644
--- a/include/linux/net.h
+++ b/include/linux/net.h
@@ -116,11 +116,11 @@ struct socket {
 
 	unsigned long		flags;
 
-	struct socket_wq	*wq;
-
 	struct file		*file;
 	struct sock		*sk;
 	const struct proto_ops	*ops;
+
+	struct socket_wq	wq;
 };
 
 struct vm_area_struct;
diff --git a/include/net/sock.h b/include/net/sock.h
index 6cbc16136357..228db3998e46 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1822,7 +1822,7 @@ static inline void sock_graft(struct sock *sk, struct socket *parent)
 {
 	WARN_ON(parent->sk);
 	write_lock_bh(&sk->sk_callback_lock);
-	rcu_assign_pointer(sk->sk_wq, parent->wq);
+	rcu_assign_pointer(sk->sk_wq, &parent->wq);
 	parent->sk = sk;
 	sk_set_socket(sk, parent);
 	sk->sk_uid = SOCK_INODE(parent)->i_uid;
@@ -2100,7 +2100,7 @@ static inline void sock_poll_wait(struct file *filp, struct socket *sock,
 				  poll_table *p)
 {
 	if (!poll_does_not_wait(p)) {
-		poll_wait(filp, &sock->wq->wait, p);
+		poll_wait(filp, &sock->wq.wait, p);
 		/* We need to be sure we are in sync with the
 		 * socket flags modification.
 		 *
diff --git a/net/core/sock.c b/net/core/sock.c
index 0eb21384079d..3e073ca6138f 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -2847,7 +2847,7 @@ void sock_init_data(struct socket *sock, struct sock *sk)
 
 	if (sock) {
 		sk->sk_type	=	sock->type;
-		RCU_INIT_POINTER(sk->sk_wq, sock->wq);
+		RCU_INIT_POINTER(sk->sk_wq, &sock->wq);
 		sock->sk	=	sk;
 		sk->sk_uid	=	SOCK_INODE(sock)->i_uid;
 	} else {
diff --git a/net/socket.c b/net/socket.c
index 541719a2443d..16449d6daeca 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -234,20 +234,13 @@ static struct kmem_cache *sock_inode_cachep __ro_after_init;
 static struct inode *sock_alloc_inode(struct super_block *sb)
 {
 	struct socket_alloc *ei;
-	struct socket_wq *wq;
 
 	ei = kmem_cache_alloc(sock_inode_cachep, GFP_KERNEL);
 	if (!ei)
 		return NULL;
-	wq = kmalloc(sizeof(*wq), GFP_KERNEL);
-	if (!wq) {
-		kmem_cache_free(sock_inode_cachep, ei);
-		return NULL;
-	}
-	init_waitqueue_head(&wq->wait);
-	wq->fasync_list = NULL;
-	wq->flags = 0;
-	ei->socket.wq = wq;
+	init_waitqueue_head(&ei->socket.wq.wait);
+	ei->socket.wq.fasync_list = NULL;
+	ei->socket.wq.flags = 0;
 
 	ei->socket.state = SS_UNCONNECTED;
 	ei->socket.flags = 0;
@@ -263,7 +256,6 @@ static void sock_free_inode(struct inode *inode)
 	struct socket_alloc *ei;
 
 	ei = container_of(inode, struct socket_alloc, vfs_inode);
-	kfree(ei->socket.wq);
 	kmem_cache_free(sock_inode_cachep, ei);
 }
 
@@ -599,7 +591,7 @@ static void __sock_release(struct socket *sock, struct inode *inode)
 		module_put(owner);
 	}
 
-	if (sock->wq->fasync_list)
+	if (sock->wq.fasync_list)
 		pr_err("%s: fasync list not empty!\n", __func__);
 
 	if (!sock->file) {
@@ -1288,13 +1280,12 @@ static int sock_fasync(int fd, struct file *filp, int on)
 {
 	struct socket *sock = filp->private_data;
 	struct sock *sk = sock->sk;
-	struct socket_wq *wq;
+	struct socket_wq *wq = &sock->wq;
 
 	if (sk == NULL)
 		return -EINVAL;
 
 	lock_sock(sk);
-	wq = sock->wq;
 	fasync_helper(fd, filp, on, &wq->fasync_list);
 
 	if (!wq->fasync_list)
-- 
cgit v1.2.3


From 6413139dfc641aaaa30580b59696a5f7ea274194 Mon Sep 17 00:00:00 2001
From: Willem de Bruijn <willemb@google.com>
Date: Sun, 7 Jul 2019 05:51:55 -0400
Subject: skbuff: increase verbosity when dumping skb data

skb_warn_bad_offload and netdev_rx_csum_fault trigger on hard to debug
issues. Dump more state and the header.

Optionally dump the entire packet and linear segment. This is required
to debug checksum bugs that may include bytes past skb_tail_pointer().

Both call sites call this function inside a net_ratelimit() block.
Limit full packet log further to a hard limit of can_dump_full (5).

Based on an earlier patch by Cong Wang, see link below.

Changes v1 -> v2
  - dump frag_list only on full_pkt

Link: https://patchwork.ozlabs.org/patch/1000841/
Signed-off-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h |  1 +
 net/core/dev.c         | 16 ++------
 net/core/skbuff.c      | 99 ++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 104 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 7ece49d5f8ef..1fdfdbb34e8e 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -1024,6 +1024,7 @@ static inline bool skb_unref(struct sk_buff *skb)
 void skb_release_head_state(struct sk_buff *skb);
 void kfree_skb(struct sk_buff *skb);
 void kfree_skb_list(struct sk_buff *segs);
+void skb_dump(const char *level, const struct sk_buff *skb, bool full_pkt);
 void skb_tx_error(struct sk_buff *skb);
 void consume_skb(struct sk_buff *skb);
 void __consume_stateless_skb(struct sk_buff *skb);
diff --git a/net/core/dev.c b/net/core/dev.c
index 58529318b3a9..fc676b2610e3 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2900,12 +2900,10 @@ static void skb_warn_bad_offload(const struct sk_buff *skb)
 		else
 			name = netdev_name(dev);
 	}
-	WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d "
-	     "gso_type=%d ip_summed=%d\n",
+	skb_dump(KERN_WARNING, skb, false);
+	WARN(1, "%s: caps=(%pNF, %pNF)\n",
 	     name, dev ? &dev->features : &null_features,
-	     skb->sk ? &skb->sk->sk_route_caps : &null_features,
-	     skb->len, skb->data_len, skb_shinfo(skb)->gso_size,
-	     skb_shinfo(skb)->gso_type, skb->ip_summed);
+	     skb->sk ? &skb->sk->sk_route_caps : &null_features);
 }
 
 /*
@@ -3124,13 +3122,7 @@ void netdev_rx_csum_fault(struct net_device *dev, struct sk_buff *skb)
 {
 	if (net_ratelimit()) {
 		pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
-		if (dev)
-			pr_err("dev features: %pNF\n", &dev->features);
-		pr_err("skb len=%u data_len=%u pkt_type=%u gso_size=%u gso_type=%u nr_frags=%u ip_summed=%u csum=%x csum_complete_sw=%d csum_valid=%d csum_level=%u\n",
-		       skb->len, skb->data_len, skb->pkt_type,
-		       skb_shinfo(skb)->gso_size, skb_shinfo(skb)->gso_type,
-		       skb_shinfo(skb)->nr_frags, skb->ip_summed, skb->csum,
-		       skb->csum_complete_sw, skb->csum_valid, skb->csum_level);
+		skb_dump(KERN_ERR, skb, true);
 		dump_stack();
 	}
 }
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 5323441a12cc..cdb0ccdaac0b 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -707,6 +707,105 @@ void kfree_skb_list(struct sk_buff *segs)
 }
 EXPORT_SYMBOL(kfree_skb_list);
 
+/* Dump skb information and contents.
+ *
+ * Must only be called from net_ratelimit()-ed paths.
+ *
+ * Dumps up to can_dump_full whole packets if full_pkt, headers otherwise.
+ */
+void skb_dump(const char *level, const struct sk_buff *skb, bool full_pkt)
+{
+	static atomic_t can_dump_full = ATOMIC_INIT(5);
+	struct skb_shared_info *sh = skb_shinfo(skb);
+	struct net_device *dev = skb->dev;
+	struct sock *sk = skb->sk;
+	struct sk_buff *list_skb;
+	bool has_mac, has_trans;
+	int headroom, tailroom;
+	int i, len, seg_len;
+
+	if (full_pkt)
+		full_pkt = atomic_dec_if_positive(&can_dump_full) >= 0;
+
+	if (full_pkt)
+		len = skb->len;
+	else
+		len = min_t(int, skb->len, MAX_HEADER + 128);
+
+	headroom = skb_headroom(skb);
+	tailroom = skb_tailroom(skb);
+
+	has_mac = skb_mac_header_was_set(skb);
+	has_trans = skb_transport_header_was_set(skb);
+
+	printk("%sskb len=%u headroom=%u headlen=%u tailroom=%u\n"
+	       "mac=(%d,%d) net=(%d,%d) trans=%d\n"
+	       "shinfo(txflags=%u nr_frags=%u gso(size=%hu type=%u segs=%hu))\n"
+	       "csum(0x%x ip_summed=%u complete_sw=%u valid=%u level=%u)\n"
+	       "hash(0x%x sw=%u l4=%u) proto=0x%04x pkttype=%u iif=%d\n",
+	       level, skb->len, headroom, skb_headlen(skb), tailroom,
+	       has_mac ? skb->mac_header : -1,
+	       has_mac ? skb_mac_header_len(skb) : -1,
+	       skb->network_header,
+	       has_trans ? skb_network_header_len(skb) : -1,
+	       has_trans ? skb->transport_header : -1,
+	       sh->tx_flags, sh->nr_frags,
+	       sh->gso_size, sh->gso_type, sh->gso_segs,
+	       skb->csum, skb->ip_summed, skb->csum_complete_sw,
+	       skb->csum_valid, skb->csum_level,
+	       skb->hash, skb->sw_hash, skb->l4_hash,
+	       ntohs(skb->protocol), skb->pkt_type, skb->skb_iif);
+
+	if (dev)
+		printk("%sdev name=%s feat=0x%pNF\n",
+		       level, dev->name, &dev->features);
+	if (sk)
+		printk("%ssk family=%hu type=%hu proto=%hu\n",
+		       level, sk->sk_family, sk->sk_type, sk->sk_protocol);
+
+	if (full_pkt && headroom)
+		print_hex_dump(level, "skb headroom: ", DUMP_PREFIX_OFFSET,
+			       16, 1, skb->head, headroom, false);
+
+	seg_len = min_t(int, skb_headlen(skb), len);
+	if (seg_len)
+		print_hex_dump(level, "skb linear:   ", DUMP_PREFIX_OFFSET,
+			       16, 1, skb->data, seg_len, false);
+	len -= seg_len;
+
+	if (full_pkt && tailroom)
+		print_hex_dump(level, "skb tailroom: ", DUMP_PREFIX_OFFSET,
+			       16, 1, skb_tail_pointer(skb), tailroom, false);
+
+	for (i = 0; len && i < skb_shinfo(skb)->nr_frags; i++) {
+		skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+		u32 p_off, p_len, copied;
+		struct page *p;
+		u8 *vaddr;
+
+		skb_frag_foreach_page(frag, frag->page_offset,
+				      skb_frag_size(frag), p, p_off, p_len,
+				      copied) {
+			seg_len = min_t(int, p_len, len);
+			vaddr = kmap_atomic(p);
+			print_hex_dump(level, "skb frag:     ",
+				       DUMP_PREFIX_OFFSET,
+				       16, 1, vaddr + p_off, seg_len, false);
+			kunmap_atomic(vaddr);
+			len -= seg_len;
+			if (!len)
+				break;
+		}
+	}
+
+	if (full_pkt && skb_has_frag_list(skb)) {
+		printk("skb fraglist:\n");
+		skb_walk_frags(skb, list_skb)
+			skb_dump(level, list_skb, true);
+	}
+}
+EXPORT_SYMBOL(skb_dump);
+
 /**
  *	skb_tx_error - report an sk_buff xmit error
  *	@skb: buffer that triggered an error
-- 
cgit v1.2.3


From 8822e270d697010e6a4fd42a319dbefc33db91e1 Mon Sep 17 00:00:00 2001
From: John Hurley <john.hurley@netronome.com>
Date: Sun, 7 Jul 2019 15:01:54 +0100
Subject: net: core: move push MPLS functionality from OvS to core helper

Open vSwitch provides code to push an MPLS header to a packet. In
preparation for supporting this in TC, move the push code to an skb helper
that can be reused.

Signed-off-by: John Hurley <john.hurley@netronome.com>
Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Simon Horman <simon.horman@netronome.com>
Reviewed-by: Willem de Bruijn <willemb@google.com>
Acked-by: Cong Wang <xiyou.wangcong@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h    |  1 +
 net/core/skbuff.c         | 64 +++++++++++++++++++++++++++++++++++++++++++++++
 net/openvswitch/actions.c | 31 +++--------------------
 3 files changed, 69 insertions(+), 27 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 1fdfdbb34e8e..1dc55000710c 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -3447,6 +3447,7 @@ int skb_ensure_writable(struct sk_buff *skb, int write_len);
 int __skb_vlan_pop(struct sk_buff *skb, u16 *vlan_tci);
 int skb_vlan_pop(struct sk_buff *skb);
 int skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci);
+int skb_mpls_push(struct sk_buff *skb, __be32 mpls_lse, __be16 mpls_proto);
 struct sk_buff *pskb_extract(struct sk_buff *skb, int off, int to_copy,
 			     gfp_t gfp);
 
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index cdb0ccdaac0b..495fd743a935 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -66,6 +66,7 @@
 #include <net/checksum.h>
 #include <net/ip6_checksum.h>
 #include <net/xfrm.h>
+#include <net/mpls.h>
 
 #include <linux/uaccess.h>
 #include <trace/events/skb.h>
@@ -5425,6 +5426,69 @@ int skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci)
 }
 EXPORT_SYMBOL(skb_vlan_push);
 
+/* Update the ethertype of hdr and the skb csum value if required. */
+static void skb_mod_eth_type(struct sk_buff *skb, struct ethhdr *hdr,
+			     __be16 ethertype)
+{
+	if (skb->ip_summed == CHECKSUM_COMPLETE) {
+		__be16 diff[] = { ~hdr->h_proto, ethertype };
+
+		skb->csum = csum_partial((char *)diff, sizeof(diff), skb->csum);
+	}
+
+	hdr->h_proto = ethertype;
+}
+
+/**
+ * skb_mpls_push() - push a new MPLS header after the mac header
+ *
+ * @skb: buffer
+ * @mpls_lse: MPLS label stack entry to push
+ * @mpls_proto: ethertype of the new MPLS header (expects 0x8847 or 0x8848)
+ *
+ * Expects skb->data at mac header.
+ *
+ * Returns 0 on success, -errno otherwise.
+ */
+int skb_mpls_push(struct sk_buff *skb, __be32 mpls_lse, __be16 mpls_proto)
+{
+	struct mpls_shim_hdr *lse;
+	int err;
+
+	if (unlikely(!eth_p_mpls(mpls_proto)))
+		return -EINVAL;
+
+	/* Networking stack does not allow simultaneous Tunnel and MPLS GSO. */
+	if (skb->encapsulation)
+		return -EINVAL;
+
+	err = skb_cow_head(skb, MPLS_HLEN);
+	if (unlikely(err))
+		return err;
+
+	if (!skb->inner_protocol) {
+		skb_set_inner_network_header(skb, skb->mac_len);
+		skb_set_inner_protocol(skb, skb->protocol);
+	}
+
+	skb_push(skb, MPLS_HLEN);
+	memmove(skb_mac_header(skb) - MPLS_HLEN, skb_mac_header(skb),
+		skb->mac_len);
+	skb_reset_mac_header(skb);
+	skb_set_network_header(skb, skb->mac_len);
+
+	lse = mpls_hdr(skb);
+	lse->label_stack_entry = mpls_lse;
+	skb_postpush_rcsum(skb, lse, MPLS_HLEN);
+
+	if (skb->dev && skb->dev->type == ARPHRD_ETHER)
+		skb_mod_eth_type(skb, eth_hdr(skb), mpls_proto);
+	skb->protocol = mpls_proto;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(skb_mpls_push);
+
 /**
  * alloc_skb_with_frags - allocate skb with page frags
  *
diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c
index bd131469e4ca..a9a6c9cbf946 100644
--- a/net/openvswitch/actions.c
+++ b/net/openvswitch/actions.c
@@ -175,34 +175,11 @@ static void update_ethertype(struct sk_buff *skb, struct ethhdr *hdr,
 static int push_mpls(struct sk_buff *skb, struct sw_flow_key *key,
 		     const struct ovs_action_push_mpls *mpls)
 {
-	struct mpls_shim_hdr *new_mpls_lse;
-
-	/* Networking stack do not allow simultaneous Tunnel and MPLS GSO. */
-	if (skb->encapsulation)
-		return -ENOTSUPP;
-
-	if (skb_cow_head(skb, MPLS_HLEN) < 0)
-		return -ENOMEM;
-
-	if (!skb->inner_protocol) {
-		skb_set_inner_network_header(skb, skb->mac_len);
-		skb_set_inner_protocol(skb, skb->protocol);
-	}
-
-	skb_push(skb, MPLS_HLEN);
-	memmove(skb_mac_header(skb) - MPLS_HLEN, skb_mac_header(skb),
-		skb->mac_len);
-	skb_reset_mac_header(skb);
-	skb_set_network_header(skb, skb->mac_len);
-
-	new_mpls_lse = mpls_hdr(skb);
-	new_mpls_lse->label_stack_entry = mpls->mpls_lse;
-
-	skb_postpush_rcsum(skb, new_mpls_lse, MPLS_HLEN);
+	int err;
 
-	if (ovs_key_mac_proto(key) == MAC_PROTO_ETHERNET)
-		update_ethertype(skb, eth_hdr(skb), mpls->mpls_ethertype);
-	skb->protocol = mpls->mpls_ethertype;
+	err = skb_mpls_push(skb, mpls->mpls_lse, mpls->mpls_ethertype);
+	if (err)
+		return err;
 
 	invalidate_flow_key(key);
 	return 0;
-- 
cgit v1.2.3


From ed246cee09b9865145a2e1e34f63ec0e31dd83a5 Mon Sep 17 00:00:00 2001
From: John Hurley <john.hurley@netronome.com>
Date: Sun, 7 Jul 2019 15:01:55 +0100
Subject: net: core: move pop MPLS functionality from OvS to core helper

Open vSwitch provides code to pop an MPLS header from a packet. In
preparation for supporting this in TC, move the pop code to an skb helper
that can be reused.

Remove the, now unused, update_ethertype static function from OvS.

Signed-off-by: John Hurley <john.hurley@netronome.com>
Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Simon Horman <simon.horman@netronome.com>
Reviewed-by: Willem de Bruijn <willemb@google.com>
Acked-by: Cong Wang <xiyou.wangcong@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h    |  1 +
 net/core/skbuff.c         | 42 ++++++++++++++++++++++++++++++++++++++++++
 net/openvswitch/actions.c | 37 ++-----------------------------------
 3 files changed, 45 insertions(+), 35 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 1dc55000710c..08d1c8e70540 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -3448,6 +3448,7 @@ int __skb_vlan_pop(struct sk_buff *skb, u16 *vlan_tci);
 int skb_vlan_pop(struct sk_buff *skb);
 int skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci);
 int skb_mpls_push(struct sk_buff *skb, __be32 mpls_lse, __be16 mpls_proto);
+int skb_mpls_pop(struct sk_buff *skb, __be16 next_proto);
 struct sk_buff *pskb_extract(struct sk_buff *skb, int off, int to_copy,
 			     gfp_t gfp);
 
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 495fd743a935..8c00be4d8919 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -5489,6 +5489,48 @@ int skb_mpls_push(struct sk_buff *skb, __be32 mpls_lse, __be16 mpls_proto)
 }
 EXPORT_SYMBOL_GPL(skb_mpls_push);
 
+/**
+ * skb_mpls_pop() - pop the outermost MPLS header
+ *
+ * @skb: buffer
+ * @next_proto: ethertype of header after popped MPLS header
+ *
+ * Expects skb->data at mac header.
+ *
+ * Returns 0 on success, -errno otherwise.
+ */
+int skb_mpls_pop(struct sk_buff *skb, __be16 next_proto)
+{
+	int err;
+
+	if (unlikely(!eth_p_mpls(skb->protocol)))
+		return -EINVAL;
+
+	err = skb_ensure_writable(skb, skb->mac_len + MPLS_HLEN);
+	if (unlikely(err))
+		return err;
+
+	skb_postpull_rcsum(skb, mpls_hdr(skb), MPLS_HLEN);
+	memmove(skb_mac_header(skb) + MPLS_HLEN, skb_mac_header(skb),
+		skb->mac_len);
+
+	__skb_pull(skb, MPLS_HLEN);
+	skb_reset_mac_header(skb);
+	skb_set_network_header(skb, skb->mac_len);
+
+	if (skb->dev && skb->dev->type == ARPHRD_ETHER) {
+		struct ethhdr *hdr;
+
+		/* use mpls_hdr() to get ethertype to account for VLANs. */
+		hdr = (struct ethhdr *)((void *)mpls_hdr(skb) - ETH_HLEN);
+		skb_mod_eth_type(skb, hdr, next_proto);
+	}
+	skb->protocol = next_proto;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(skb_mpls_pop);
+
 /**
  * alloc_skb_with_frags - allocate skb with page frags
  *
diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c
index a9a6c9cbf946..62715bb8d611 100644
--- a/net/openvswitch/actions.c
+++ b/net/openvswitch/actions.c
@@ -160,18 +160,6 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb,
 			      struct sw_flow_key *key,
 			      const struct nlattr *attr, int len);
 
-static void update_ethertype(struct sk_buff *skb, struct ethhdr *hdr,
-			     __be16 ethertype)
-{
-	if (skb->ip_summed == CHECKSUM_COMPLETE) {
-		__be16 diff[] = { ~(hdr->h_proto), ethertype };
-
-		skb->csum = csum_partial((char *)diff, sizeof(diff), skb->csum);
-	}
-
-	hdr->h_proto = ethertype;
-}
-
 static int push_mpls(struct sk_buff *skb, struct sw_flow_key *key,
 		     const struct ovs_action_push_mpls *mpls)
 {
@@ -190,31 +178,10 @@ static int pop_mpls(struct sk_buff *skb, struct sw_flow_key *key,
 {
 	int err;
 
-	err = skb_ensure_writable(skb, skb->mac_len + MPLS_HLEN);
-	if (unlikely(err))
+	err = skb_mpls_pop(skb, ethertype);
+	if (err)
 		return err;
 
-	skb_postpull_rcsum(skb, mpls_hdr(skb), MPLS_HLEN);
-
-	memmove(skb_mac_header(skb) + MPLS_HLEN, skb_mac_header(skb),
-		skb->mac_len);
-
-	__skb_pull(skb, MPLS_HLEN);
-	skb_reset_mac_header(skb);
-	skb_set_network_header(skb, skb->mac_len);
-
-	if (ovs_key_mac_proto(key) == MAC_PROTO_ETHERNET) {
-		struct ethhdr *hdr;
-
-		/* mpls_hdr() is used to locate the ethertype field correctly in the
-		 * presence of VLAN tags.
-		 */
-		hdr = (struct ethhdr *)((void *)mpls_hdr(skb) - ETH_HLEN);
-		update_ethertype(skb, hdr, ethertype);
-	}
-	if (eth_p_mpls(skb->protocol))
-		skb->protocol = ethertype;
-
 	invalidate_flow_key(key);
 	return 0;
 }
-- 
cgit v1.2.3


From d27cf5c59a12f66425df29cd81f61aa73ef14ac1 Mon Sep 17 00:00:00 2001
From: John Hurley <john.hurley@netronome.com>
Date: Sun, 7 Jul 2019 15:01:56 +0100
Subject: net: core: add MPLS update core helper and use in OvS

Open vSwitch allows the updating of an existing MPLS header on a packet.
In preparation for supporting similar functionality in TC, move this to a
common skb helper function.

Signed-off-by: John Hurley <john.hurley@netronome.com>
Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Simon Horman <simon.horman@netronome.com>
Reviewed-by: Willem de Bruijn <willemb@google.com>
Acked-by: Cong Wang <xiyou.wangcong@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h    |  1 +
 net/core/skbuff.c         | 33 +++++++++++++++++++++++++++++++++
 net/openvswitch/actions.c | 13 +++----------
 3 files changed, 37 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 08d1c8e70540..9f7e01f2be83 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -3449,6 +3449,7 @@ int skb_vlan_pop(struct sk_buff *skb);
 int skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci);
 int skb_mpls_push(struct sk_buff *skb, __be32 mpls_lse, __be16 mpls_proto);
 int skb_mpls_pop(struct sk_buff *skb, __be16 next_proto);
+int skb_mpls_update_lse(struct sk_buff *skb, __be32 mpls_lse);
 struct sk_buff *pskb_extract(struct sk_buff *skb, int off, int to_copy,
 			     gfp_t gfp);
 
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 8c00be4d8919..93443a01ab39 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -5531,6 +5531,39 @@ int skb_mpls_pop(struct sk_buff *skb, __be16 next_proto)
 }
 EXPORT_SYMBOL_GPL(skb_mpls_pop);
 
+/**
+ * skb_mpls_update_lse() - modify outermost MPLS header and update csum
+ *
+ * @skb: buffer
+ * @mpls_lse: new MPLS label stack entry to update to
+ *
+ * Expects skb->data at mac header.
+ *
+ * Returns 0 on success, -errno otherwise.
+ */
+int skb_mpls_update_lse(struct sk_buff *skb, __be32 mpls_lse)
+{
+	int err;
+
+	if (unlikely(!eth_p_mpls(skb->protocol)))
+		return -EINVAL;
+
+	err = skb_ensure_writable(skb, skb->mac_len + MPLS_HLEN);
+	if (unlikely(err))
+		return err;
+
+	if (skb->ip_summed == CHECKSUM_COMPLETE) {
+		__be32 diff[] = { ~mpls_hdr(skb)->label_stack_entry, mpls_lse };
+
+		skb->csum = csum_partial((char *)diff, sizeof(diff), skb->csum);
+	}
+
+	mpls_hdr(skb)->label_stack_entry = mpls_lse;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(skb_mpls_update_lse);
+
 /**
  * alloc_skb_with_frags - allocate skb with page frags
  *
diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c
index 62715bb8d611..3572e11b6f21 100644
--- a/net/openvswitch/actions.c
+++ b/net/openvswitch/actions.c
@@ -193,19 +193,12 @@ static int set_mpls(struct sk_buff *skb, struct sw_flow_key *flow_key,
 	__be32 lse;
 	int err;
 
-	err = skb_ensure_writable(skb, skb->mac_len + MPLS_HLEN);
-	if (unlikely(err))
-		return err;
-
 	stack = mpls_hdr(skb);
 	lse = OVS_MASKED(stack->label_stack_entry, *mpls_lse, *mask);
-	if (skb->ip_summed == CHECKSUM_COMPLETE) {
-		__be32 diff[] = { ~(stack->label_stack_entry), lse };
-
-		skb->csum = csum_partial((char *)diff, sizeof(diff), skb->csum);
-	}
+	err = skb_mpls_update_lse(skb, lse);
+	if (err)
+		return err;
 
-	stack->label_stack_entry = lse;
 	flow_key->mpls.top_lse = lse;
 	return 0;
 }
-- 
cgit v1.2.3


From 2a2ea50870baa3fb4de0872c5b60828138654ca7 Mon Sep 17 00:00:00 2001
From: John Hurley <john.hurley@netronome.com>
Date: Sun, 7 Jul 2019 15:01:57 +0100
Subject: net: sched: add mpls manipulation actions to TC

Currently, TC offers the ability to match on the MPLS fields of a packet
through the use of the flow_dissector_key_mpls struct. However, as yet, TC
actions do not allow the modification or manipulation of such fields.

Add a new module that registers TC action ops to allow manipulation of
MPLS. This includes the ability to push and pop headers as well as modify
the contents of new or existing headers. A further action to decrement the
TTL field of an MPLS header is also provided with a new helper added to
support this.

Examples of the usage of the new action with flower rules to push and pop
MPLS labels are:

tc filter add dev eth0 protocol ip parent ffff: flower \
    action mpls push protocol mpls_uc label 123  \
    action mirred egress redirect dev eth1

tc filter add dev eth0 protocol mpls_uc parent ffff: flower \
    action mpls pop protocol ipv4  \
    action mirred egress redirect dev eth1

Signed-off-by: John Hurley <john.hurley@netronome.com>
Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Simon Horman <simon.horman@netronome.com>
Reviewed-by: Willem de Bruijn <willemb@google.com>
Acked-by: Cong Wang <xiyou.wangcong@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h              |   1 +
 include/net/tc_act/tc_mpls.h        |  30 +++
 include/uapi/linux/pkt_cls.h        |   3 +-
 include/uapi/linux/tc_act/tc_mpls.h |  33 +++
 net/core/skbuff.c                   |  30 +++
 net/sched/Kconfig                   |  11 +
 net/sched/Makefile                  |   1 +
 net/sched/act_mpls.c                | 406 ++++++++++++++++++++++++++++++++++++
 8 files changed, 514 insertions(+), 1 deletion(-)
 create mode 100644 include/net/tc_act/tc_mpls.h
 create mode 100644 include/uapi/linux/tc_act/tc_mpls.h
 create mode 100644 net/sched/act_mpls.c

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 9f7e01f2be83..9d7a2c28ea35 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -3450,6 +3450,7 @@ int skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci);
 int skb_mpls_push(struct sk_buff *skb, __be32 mpls_lse, __be16 mpls_proto);
 int skb_mpls_pop(struct sk_buff *skb, __be16 next_proto);
 int skb_mpls_update_lse(struct sk_buff *skb, __be32 mpls_lse);
+int skb_mpls_dec_ttl(struct sk_buff *skb);
 struct sk_buff *pskb_extract(struct sk_buff *skb, int off, int to_copy,
 			     gfp_t gfp);
 
diff --git a/include/net/tc_act/tc_mpls.h b/include/net/tc_act/tc_mpls.h
new file mode 100644
index 000000000000..4bc3d9250ef0
--- /dev/null
+++ b/include/net/tc_act/tc_mpls.h
@@ -0,0 +1,30 @@
+/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */
+/* Copyright (C) 2019 Netronome Systems, Inc. */
+
+#ifndef __NET_TC_MPLS_H
+#define __NET_TC_MPLS_H
+
+#include <linux/tc_act/tc_mpls.h>
+#include <net/act_api.h>
+
+struct tcf_mpls_params {
+	int tcfm_action;
+	u32 tcfm_label;
+	u8 tcfm_tc;
+	u8 tcfm_ttl;
+	u8 tcfm_bos;
+	__be16 tcfm_proto;
+	struct rcu_head	rcu;
+};
+
+#define ACT_MPLS_TC_NOT_SET	0xff
+#define ACT_MPLS_BOS_NOT_SET	0xff
+#define ACT_MPLS_LABEL_NOT_SET	0xffffffff
+
+struct tcf_mpls {
+	struct tc_action common;
+	struct tcf_mpls_params __rcu *mpls_p;
+};
+#define to_mpls(a) ((struct tcf_mpls *)a)
+
+#endif /* __NET_TC_MPLS_H */
diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h
index 8cc6b6777b3c..e22ef4a940bc 100644
--- a/include/uapi/linux/pkt_cls.h
+++ b/include/uapi/linux/pkt_cls.h
@@ -104,8 +104,9 @@ enum tca_id {
 	TCA_ID_SIMP = TCA_ACT_SIMP,
 	TCA_ID_IFE = TCA_ACT_IFE,
 	TCA_ID_SAMPLE = TCA_ACT_SAMPLE,
-	/* other actions go here */
 	TCA_ID_CTINFO,
+	TCA_ID_MPLS,
+	/* other actions go here */
 	__TCA_ID_MAX = 255
 };
 
diff --git a/include/uapi/linux/tc_act/tc_mpls.h b/include/uapi/linux/tc_act/tc_mpls.h
new file mode 100644
index 000000000000..9360e95273c7
--- /dev/null
+++ b/include/uapi/linux/tc_act/tc_mpls.h
@@ -0,0 +1,33 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+/* Copyright (C) 2019 Netronome Systems, Inc. */
+
+#ifndef __LINUX_TC_MPLS_H
+#define __LINUX_TC_MPLS_H
+
+#include <linux/pkt_cls.h>
+
+#define TCA_MPLS_ACT_POP	1
+#define TCA_MPLS_ACT_PUSH	2
+#define TCA_MPLS_ACT_MODIFY	3
+#define TCA_MPLS_ACT_DEC_TTL	4
+
+struct tc_mpls {
+	tc_gen;		/* generic TC action fields. */
+	int m_action;	/* action of type TCA_MPLS_ACT_*. */
+};
+
+enum {
+	TCA_MPLS_UNSPEC,
+	TCA_MPLS_TM,	/* struct tcf_t; time values associated with action. */
+	TCA_MPLS_PARMS,	/* struct tc_mpls; action type and general TC fields. */
+	TCA_MPLS_PAD,
+	TCA_MPLS_PROTO,	/* be16; eth_type of pushed or next (for pop) header. */
+	TCA_MPLS_LABEL,	/* u32; MPLS label. Lower 20 bits are used. */
+	TCA_MPLS_TC,	/* u8; MPLS TC field. Lower 3 bits are used. */
+	TCA_MPLS_TTL,	/* u8; MPLS TTL field. Must not be 0. */
+	TCA_MPLS_BOS,	/* u8; MPLS BOS field. Either 1 or 0. */
+	__TCA_MPLS_MAX,
+};
+#define TCA_MPLS_MAX (__TCA_MPLS_MAX - 1)
+
+#endif
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 93443a01ab39..6f1e31f674a3 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -59,6 +59,7 @@
 #include <linux/errqueue.h>
 #include <linux/prefetch.h>
 #include <linux/if_vlan.h>
+#include <linux/mpls.h>
 
 #include <net/protocol.h>
 #include <net/dst.h>
@@ -5564,6 +5565,35 @@ int skb_mpls_update_lse(struct sk_buff *skb, __be32 mpls_lse)
 }
 EXPORT_SYMBOL_GPL(skb_mpls_update_lse);
 
+/**
+ * skb_mpls_dec_ttl() - decrement the TTL of the outermost MPLS header
+ *
+ * @skb: buffer
+ *
+ * Expects skb->data at mac header.
+ *
+ * Returns 0 on success, -errno otherwise.
+ */
+int skb_mpls_dec_ttl(struct sk_buff *skb)
+{
+	u32 lse;
+	u8 ttl;
+
+	if (unlikely(!eth_p_mpls(skb->protocol)))
+		return -EINVAL;
+
+	lse = be32_to_cpu(mpls_hdr(skb)->label_stack_entry);
+	ttl = (lse & MPLS_LS_TTL_MASK) >> MPLS_LS_TTL_SHIFT;
+	if (!--ttl)
+		return -EINVAL;
+
+	lse &= ~MPLS_LS_TTL_MASK;
+	lse |= ttl << MPLS_LS_TTL_SHIFT;
+
+	return skb_mpls_update_lse(skb, cpu_to_be32(lse));
+}
+EXPORT_SYMBOL_GPL(skb_mpls_dec_ttl);
+
 /**
  * alloc_skb_with_frags - allocate skb with page frags
  *
diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index 360fdd3eaa77..731f5fbc2a3c 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -842,6 +842,17 @@ config NET_ACT_CSUM
 	  To compile this code as a module, choose M here: the
 	  module will be called act_csum.
 
+config NET_ACT_MPLS
+	tristate "MPLS manipulation"
+	depends on NET_CLS_ACT
+	help
+	  Say Y here to push, pop or modify MPLS headers, or decrement their TTL.
+
+	  If unsure, say N.
+
+	  To compile this code as a module, choose M here: the
+	  module will be called act_mpls.
+
 config NET_ACT_VLAN
         tristate "Vlan manipulation"
         depends on NET_CLS_ACT
diff --git a/net/sched/Makefile b/net/sched/Makefile
index d54bfcbd7981..c26603606c22 100644
--- a/net/sched/Makefile
+++ b/net/sched/Makefile
@@ -18,6 +18,7 @@ obj-$(CONFIG_NET_ACT_PEDIT)	+= act_pedit.o
 obj-$(CONFIG_NET_ACT_SIMP)	+= act_simple.o
 obj-$(CONFIG_NET_ACT_SKBEDIT)	+= act_skbedit.o
 obj-$(CONFIG_NET_ACT_CSUM)	+= act_csum.o
+obj-$(CONFIG_NET_ACT_MPLS)	+= act_mpls.o
 obj-$(CONFIG_NET_ACT_VLAN)	+= act_vlan.o
 obj-$(CONFIG_NET_ACT_BPF)	+= act_bpf.o
 obj-$(CONFIG_NET_ACT_CONNMARK)	+= act_connmark.o
diff --git a/net/sched/act_mpls.c b/net/sched/act_mpls.c
new file mode 100644
index 000000000000..ca2597ce4ac9
--- /dev/null
+++ b/net/sched/act_mpls.c
@@ -0,0 +1,406 @@
+// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+/* Copyright (C) 2019 Netronome Systems, Inc. */
+
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/mpls.h>
+#include <linux/rtnetlink.h>
+#include <linux/skbuff.h>
+#include <linux/tc_act/tc_mpls.h>
+#include <net/mpls.h>
+#include <net/netlink.h>
+#include <net/pkt_sched.h>
+#include <net/pkt_cls.h>
+#include <net/tc_act/tc_mpls.h>
+
+static unsigned int mpls_net_id;
+static struct tc_action_ops act_mpls_ops;
+
+#define ACT_MPLS_TTL_DEFAULT	255
+
+static __be32 tcf_mpls_get_lse(struct mpls_shim_hdr *lse,
+			       struct tcf_mpls_params *p, bool set_bos)
+{
+	u32 new_lse = 0;
+
+	if (lse)
+		new_lse = be32_to_cpu(lse->label_stack_entry);
+
+	if (p->tcfm_label != ACT_MPLS_LABEL_NOT_SET) {
+		new_lse &= ~MPLS_LS_LABEL_MASK;
+		new_lse |= p->tcfm_label << MPLS_LS_LABEL_SHIFT;
+	}
+	if (p->tcfm_ttl) {
+		new_lse &= ~MPLS_LS_TTL_MASK;
+		new_lse |= p->tcfm_ttl << MPLS_LS_TTL_SHIFT;
+	}
+	if (p->tcfm_tc != ACT_MPLS_TC_NOT_SET) {
+		new_lse &= ~MPLS_LS_TC_MASK;
+		new_lse |= p->tcfm_tc << MPLS_LS_TC_SHIFT;
+	}
+	if (p->tcfm_bos != ACT_MPLS_BOS_NOT_SET) {
+		new_lse &= ~MPLS_LS_S_MASK;
+		new_lse |= p->tcfm_bos << MPLS_LS_S_SHIFT;
+	} else if (set_bos) {
+		new_lse |= 1 << MPLS_LS_S_SHIFT;
+	}
+
+	return cpu_to_be32(new_lse);
+}
+
+static int tcf_mpls_act(struct sk_buff *skb, const struct tc_action *a,
+			struct tcf_result *res)
+{
+	struct tcf_mpls *m = to_mpls(a);
+	struct tcf_mpls_params *p;
+	__be32 new_lse;
+	int ret;
+
+	tcf_lastuse_update(&m->tcf_tm);
+	bstats_cpu_update(this_cpu_ptr(m->common.cpu_bstats), skb);
+
+	/* Ensure 'data' points at mac_header prior to calling mpls manipulating
+	 * functions.
+	 */
+	if (skb_at_tc_ingress(skb))
+		skb_push_rcsum(skb, skb->mac_len);
+
+	ret = READ_ONCE(m->tcf_action);
+
+	p = rcu_dereference_bh(m->mpls_p);
+
+	switch (p->tcfm_action) {
+	case TCA_MPLS_ACT_POP:
+		if (skb_mpls_pop(skb, p->tcfm_proto))
+			goto drop;
+		break;
+	case TCA_MPLS_ACT_PUSH:
+		new_lse = tcf_mpls_get_lse(NULL, p, !eth_p_mpls(skb->protocol));
+		if (skb_mpls_push(skb, new_lse, p->tcfm_proto))
+			goto drop;
+		break;
+	case TCA_MPLS_ACT_MODIFY:
+		new_lse = tcf_mpls_get_lse(mpls_hdr(skb), p, false);
+		if (skb_mpls_update_lse(skb, new_lse))
+			goto drop;
+		break;
+	case TCA_MPLS_ACT_DEC_TTL:
+		if (skb_mpls_dec_ttl(skb))
+			goto drop;
+		break;
+	}
+
+	if (skb_at_tc_ingress(skb))
+		skb_pull_rcsum(skb, skb->mac_len);
+
+	return ret;
+
+drop:
+	qstats_drop_inc(this_cpu_ptr(m->common.cpu_qstats));
+	return TC_ACT_SHOT;
+}
+
+static int valid_label(const struct nlattr *attr,
+		       struct netlink_ext_ack *extack)
+{
+	const u32 *label = nla_data(attr);
+
+	if (*label & ~MPLS_LABEL_MASK || *label == MPLS_LABEL_IMPLNULL) {
+		NL_SET_ERR_MSG_MOD(extack, "MPLS label out of range");
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static const struct nla_policy mpls_policy[TCA_MPLS_MAX + 1] = {
+	[TCA_MPLS_UNSPEC]	= { .strict_start_type = TCA_MPLS_UNSPEC + 1 },
+	[TCA_MPLS_PARMS]	= NLA_POLICY_EXACT_LEN(sizeof(struct tc_mpls)),
+	[TCA_MPLS_PROTO]	= { .type = NLA_U16 },
+	[TCA_MPLS_LABEL]	= NLA_POLICY_VALIDATE_FN(NLA_U32, valid_label),
+	[TCA_MPLS_TC]		= NLA_POLICY_RANGE(NLA_U8, 0, 7),
+	[TCA_MPLS_TTL]		= NLA_POLICY_MIN(NLA_U8, 1),
+	[TCA_MPLS_BOS]		= NLA_POLICY_RANGE(NLA_U8, 0, 1),
+};
+
+static int tcf_mpls_init(struct net *net, struct nlattr *nla,
+			 struct nlattr *est, struct tc_action **a,
+			 int ovr, int bind, bool rtnl_held,
+			 struct tcf_proto *tp, struct netlink_ext_ack *extack)
+{
+	struct tc_action_net *tn = net_generic(net, mpls_net_id);
+	struct nlattr *tb[TCA_MPLS_MAX + 1];
+	struct tcf_chain *goto_ch = NULL;
+	struct tcf_mpls_params *p;
+	struct tc_mpls *parm;
+	bool exists = false;
+	struct tcf_mpls *m;
+	int ret = 0, err;
+	u8 mpls_ttl = 0;
+
+	if (!nla) {
+		NL_SET_ERR_MSG_MOD(extack, "Missing netlink attributes");
+		return -EINVAL;
+	}
+
+	err = nla_parse_nested(tb, TCA_MPLS_MAX, nla, mpls_policy, extack);
+	if (err < 0)
+		return err;
+
+	if (!tb[TCA_MPLS_PARMS]) {
+		NL_SET_ERR_MSG_MOD(extack, "No MPLS params");
+		return -EINVAL;
+	}
+	parm = nla_data(tb[TCA_MPLS_PARMS]);
+
+	/* Verify parameters against action type. */
+	switch (parm->m_action) {
+	case TCA_MPLS_ACT_POP:
+		if (!tb[TCA_MPLS_PROTO]) {
+			NL_SET_ERR_MSG_MOD(extack, "Protocol must be set for MPLS pop");
+			return -EINVAL;
+		}
+		if (!eth_proto_is_802_3(nla_get_be16(tb[TCA_MPLS_PROTO]))) {
+			NL_SET_ERR_MSG_MOD(extack, "Invalid protocol type for MPLS pop");
+			return -EINVAL;
+		}
+		if (tb[TCA_MPLS_LABEL] || tb[TCA_MPLS_TTL] || tb[TCA_MPLS_TC] ||
+		    tb[TCA_MPLS_BOS]) {
+			NL_SET_ERR_MSG_MOD(extack, "Label, TTL, TC or BOS cannot be used with MPLS pop");
+			return -EINVAL;
+		}
+		break;
+	case TCA_MPLS_ACT_DEC_TTL:
+		if (tb[TCA_MPLS_PROTO] || tb[TCA_MPLS_LABEL] ||
+		    tb[TCA_MPLS_TTL] || tb[TCA_MPLS_TC] || tb[TCA_MPLS_BOS]) {
+			NL_SET_ERR_MSG_MOD(extack, "Label, TTL, TC, BOS or protocol cannot be used with MPLS dec_ttl");
+			return -EINVAL;
+		}
+		break;
+	case TCA_MPLS_ACT_PUSH:
+		if (!tb[TCA_MPLS_LABEL]) {
+			NL_SET_ERR_MSG_MOD(extack, "Label is required for MPLS push");
+			return -EINVAL;
+		}
+		if (tb[TCA_MPLS_PROTO] &&
+		    !eth_p_mpls(nla_get_be16(tb[TCA_MPLS_PROTO]))) {
+			NL_SET_ERR_MSG_MOD(extack, "Protocol must be an MPLS type for MPLS push");
+			return -EPROTONOSUPPORT;
+		}
+		/* Push needs a TTL - if not specified, set a default value. */
+		if (!tb[TCA_MPLS_TTL]) {
+#if IS_ENABLED(CONFIG_MPLS)
+			mpls_ttl = net->mpls.default_ttl ?
+				   net->mpls.default_ttl : ACT_MPLS_TTL_DEFAULT;
+#else
+			mpls_ttl = ACT_MPLS_TTL_DEFAULT;
+#endif
+		}
+		break;
+	case TCA_MPLS_ACT_MODIFY:
+		if (tb[TCA_MPLS_PROTO]) {
+			NL_SET_ERR_MSG_MOD(extack, "Protocol cannot be used with MPLS modify");
+			return -EINVAL;
+		}
+		break;
+	default:
+		NL_SET_ERR_MSG_MOD(extack, "Unknown MPLS action");
+		return -EINVAL;
+	}
+
+	err = tcf_idr_check_alloc(tn, &parm->index, a, bind);
+	if (err < 0)
+		return err;
+	exists = err;
+	if (exists && bind)
+		return 0;
+
+	if (!exists) {
+		ret = tcf_idr_create(tn, parm->index, est, a,
+				     &act_mpls_ops, bind, true);
+		if (ret) {
+			tcf_idr_cleanup(tn, parm->index);
+			return ret;
+		}
+
+		ret = ACT_P_CREATED;
+	} else if (!ovr) {
+		tcf_idr_release(*a, bind);
+		return -EEXIST;
+	}
+
+	err = tcf_action_check_ctrlact(parm->action, tp, &goto_ch, extack);
+	if (err < 0)
+		goto release_idr;
+
+	m = to_mpls(*a);
+
+	p = kzalloc(sizeof(*p), GFP_KERNEL);
+	if (!p) {
+		err = -ENOMEM;
+		goto put_chain;
+	}
+
+	p->tcfm_action = parm->m_action;
+	p->tcfm_label = tb[TCA_MPLS_LABEL] ? nla_get_u32(tb[TCA_MPLS_LABEL]) :
+					     ACT_MPLS_LABEL_NOT_SET;
+	p->tcfm_tc = tb[TCA_MPLS_TC] ? nla_get_u8(tb[TCA_MPLS_TC]) :
+				       ACT_MPLS_TC_NOT_SET;
+	p->tcfm_ttl = tb[TCA_MPLS_TTL] ? nla_get_u8(tb[TCA_MPLS_TTL]) :
+					 mpls_ttl;
+	p->tcfm_bos = tb[TCA_MPLS_BOS] ? nla_get_u8(tb[TCA_MPLS_BOS]) :
+					 ACT_MPLS_BOS_NOT_SET;
+	p->tcfm_proto = tb[TCA_MPLS_PROTO] ? nla_get_be16(tb[TCA_MPLS_PROTO]) :
+					     htons(ETH_P_MPLS_UC);
+
+	spin_lock_bh(&m->tcf_lock);
+	goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch);
+	rcu_swap_protected(m->mpls_p, p, lockdep_is_held(&m->tcf_lock));
+	spin_unlock_bh(&m->tcf_lock);
+
+	if (goto_ch)
+		tcf_chain_put_by_act(goto_ch);
+	if (p)
+		kfree_rcu(p, rcu);
+
+	if (ret == ACT_P_CREATED)
+		tcf_idr_insert(tn, *a);
+	return ret;
+put_chain:
+	if (goto_ch)
+		tcf_chain_put_by_act(goto_ch);
+release_idr:
+	tcf_idr_release(*a, bind);
+	return err;
+}
+
+static void tcf_mpls_cleanup(struct tc_action *a)
+{
+	struct tcf_mpls *m = to_mpls(a);
+	struct tcf_mpls_params *p;
+
+	p = rcu_dereference_protected(m->mpls_p, 1);
+	if (p)
+		kfree_rcu(p, rcu);
+}
+
+static int tcf_mpls_dump(struct sk_buff *skb, struct tc_action *a,
+			 int bind, int ref)
+{
+	unsigned char *b = skb_tail_pointer(skb);
+	struct tcf_mpls *m = to_mpls(a);
+	struct tcf_mpls_params *p;
+	struct tc_mpls opt = {
+		.index    = m->tcf_index,
+		.refcnt   = refcount_read(&m->tcf_refcnt) - ref,
+		.bindcnt  = atomic_read(&m->tcf_bindcnt) - bind,
+	};
+	struct tcf_t t;
+
+	spin_lock_bh(&m->tcf_lock);
+	opt.action = m->tcf_action;
+	p = rcu_dereference_protected(m->mpls_p, lockdep_is_held(&m->tcf_lock));
+	opt.m_action = p->tcfm_action;
+
+	if (nla_put(skb, TCA_MPLS_PARMS, sizeof(opt), &opt))
+		goto nla_put_failure;
+
+	if (p->tcfm_label != ACT_MPLS_LABEL_NOT_SET &&
+	    nla_put_u32(skb, TCA_MPLS_LABEL, p->tcfm_label))
+		goto nla_put_failure;
+
+	if (p->tcfm_tc != ACT_MPLS_TC_NOT_SET &&
+	    nla_put_u8(skb, TCA_MPLS_TC, p->tcfm_tc))
+		goto nla_put_failure;
+
+	if (p->tcfm_ttl && nla_put_u8(skb, TCA_MPLS_TTL, p->tcfm_ttl))
+		goto nla_put_failure;
+
+	if (p->tcfm_bos != ACT_MPLS_BOS_NOT_SET &&
+	    nla_put_u8(skb, TCA_MPLS_BOS, p->tcfm_bos))
+		goto nla_put_failure;
+
+	if (nla_put_be16(skb, TCA_MPLS_PROTO, p->tcfm_proto))
+		goto nla_put_failure;
+
+	tcf_tm_dump(&t, &m->tcf_tm);
+
+	if (nla_put_64bit(skb, TCA_MPLS_TM, sizeof(t), &t, TCA_MPLS_PAD))
+		goto nla_put_failure;
+
+	spin_unlock_bh(&m->tcf_lock);
+
+	return skb->len;
+
+nla_put_failure:
+	spin_unlock_bh(&m->tcf_lock);
+	nlmsg_trim(skb, b);
+	return -EMSGSIZE;
+}
+
+static int tcf_mpls_walker(struct net *net, struct sk_buff *skb,
+			   struct netlink_callback *cb, int type,
+			   const struct tc_action_ops *ops,
+			   struct netlink_ext_ack *extack)
+{
+	struct tc_action_net *tn = net_generic(net, mpls_net_id);
+
+	return tcf_generic_walker(tn, skb, cb, type, ops, extack);
+}
+
+static int tcf_mpls_search(struct net *net, struct tc_action **a, u32 index)
+{
+	struct tc_action_net *tn = net_generic(net, mpls_net_id);
+
+	return tcf_idr_search(tn, a, index);
+}
+
+static struct tc_action_ops act_mpls_ops = {
+	.kind		=	"mpls",
+	.id		=	TCA_ID_MPLS,
+	.owner		=	THIS_MODULE,
+	.act		=	tcf_mpls_act,
+	.dump		=	tcf_mpls_dump,
+	.init		=	tcf_mpls_init,
+	.cleanup	=	tcf_mpls_cleanup,
+	.walk		=	tcf_mpls_walker,
+	.lookup		=	tcf_mpls_search,
+	.size		=	sizeof(struct tcf_mpls),
+};
+
+static __net_init int mpls_init_net(struct net *net)
+{
+	struct tc_action_net *tn = net_generic(net, mpls_net_id);
+
+	return tc_action_net_init(tn, &act_mpls_ops);
+}
+
+static void __net_exit mpls_exit_net(struct list_head *net_list)
+{
+	tc_action_net_exit(net_list, mpls_net_id);
+}
+
+static struct pernet_operations mpls_net_ops = {
+	.init = mpls_init_net,
+	.exit_batch = mpls_exit_net,
+	.id   = &mpls_net_id,
+	.size = sizeof(struct tc_action_net),
+};
+
+static int __init mpls_init_module(void)
+{
+	return tcf_register_action(&act_mpls_ops, &mpls_net_ops);
+}
+
+static void __exit mpls_cleanup_module(void)
+{
+	tcf_unregister_action(&act_mpls_ops, &mpls_net_ops);
+}
+
+module_init(mpls_init_module);
+module_exit(mpls_cleanup_module);
+
+MODULE_AUTHOR("Netronome Systems <oss-drivers@netronome.com>");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("MPLS manipulation actions");
-- 
cgit v1.2.3


From 87b512def792579641499d9bef1d640994ea9c18 Mon Sep 17 00:00:00 2001
From: Josh Poimboeuf <jpoimboe@redhat.com>
Date: Thu, 27 Jun 2019 20:50:46 -0500
Subject: objtool: Add support for C jump tables

Objtool doesn't know how to read C jump tables, so it has to whitelist
functions which use them, causing missing ORC unwinder data for such
functions, e.g. ___bpf_prog_run().

C jump tables are very similar to GCC switch jump tables, which objtool
already knows how to read.  So adding support for C jump tables is easy.
It just needs to be able to find the tables and distinguish them from
other data.

To allow the jump tables to be found, create an __annotate_jump_table
macro which can be used to annotate them.

The annotation is done by placing the jump table in an
.rodata..c_jump_table section.  The '.rodata' prefix ensures that the data
will be placed in the rodata section by the vmlinux linker script.  The
double periods are part of an existing convention which distinguishes
kernel sections from GCC sections.

Signed-off-by: Josh Poimboeuf <jpoimboe@redhat.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Song Liu <songliubraving@fb.com>
Cc: Kairui Song <kasong@redhat.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lkml.kernel.org/r/0ba2ca30442b16b97165992381ce643dc27b3d1a.1561685471.git.jpoimboe@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/compiler.h |  5 +++++
 tools/objtool/check.c    | 27 ++++++++++++++++++++-------
 2 files changed, 25 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/compiler.h b/include/linux/compiler.h
index 8aaf7cd026b0..f0fd5636fddb 100644
--- a/include/linux/compiler.h
+++ b/include/linux/compiler.h
@@ -116,9 +116,14 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val,
 	".pushsection .discard.unreachable\n\t"				\
 	".long 999b - .\n\t"						\
 	".popsection\n\t"
+
+/* Annotate a C jump table to allow objtool to follow the code flow */
+#define __annotate_jump_table __section(".rodata..c_jump_table")
+
 #else
 #define annotate_reachable()
 #define annotate_unreachable()
+#define __annotate_jump_table
 #endif
 
 #ifndef ASM_UNREACHABLE
diff --git a/tools/objtool/check.c b/tools/objtool/check.c
index 172f99195726..27818a93f0b1 100644
--- a/tools/objtool/check.c
+++ b/tools/objtool/check.c
@@ -18,6 +18,8 @@
 
 #define FAKE_JUMP_OFFSET -1
 
+#define C_JUMP_TABLE_SECTION ".rodata..c_jump_table"
+
 struct alternative {
 	struct list_head list;
 	struct instruction *insn;
@@ -1035,9 +1037,15 @@ static struct rela *find_switch_table(struct objtool_file *file,
 
 		/*
 		 * Make sure the .rodata address isn't associated with a
-		 * symbol.  gcc jump tables are anonymous data.
+		 * symbol.  GCC jump tables are anonymous data.
+		 *
+		 * Also support C jump tables which are in the same format as
+		 * switch jump tables.  For objtool to recognize them, they
+		 * need to be placed in the C_JUMP_TABLE_SECTION section.  They
+		 * have symbols associated with them.
 		 */
-		if (find_symbol_containing(rodata_sec, table_offset))
+		if (find_symbol_containing(rodata_sec, table_offset) &&
+		    strcmp(rodata_sec->name, C_JUMP_TABLE_SECTION))
 			continue;
 
 		rodata_rela = find_rela_by_dest(rodata_sec, table_offset);
@@ -1277,13 +1285,18 @@ static void mark_rodata(struct objtool_file *file)
 	bool found = false;
 
 	/*
-	 * This searches for the .rodata section or multiple .rodata.func_name
-	 * sections if -fdata-sections is being used. The .str.1.1 and .str.1.8
-	 * rodata sections are ignored as they don't contain jump tables.
+	 * Search for the following rodata sections, each of which can
+	 * potentially contain jump tables:
+	 *
+	 * - .rodata: can contain GCC switch tables
+	 * - .rodata.<func>: same, if -fdata-sections is being used
+	 * - .rodata..c_jump_table: contains C annotated jump tables
+	 *
+	 * .rodata.str1.* sections are ignored; they don't contain jump tables.
 	 */
 	for_each_sec(file, sec) {
-		if (!strncmp(sec->name, ".rodata", 7) &&
-		    !strstr(sec->name, ".str1.")) {
+		if ((!strncmp(sec->name, ".rodata", 7) && !strstr(sec->name, ".str1.")) ||
+		    !strcmp(sec->name, C_JUMP_TABLE_SECTION)) {
 			sec->rodata = true;
 			found = true;
 		}
-- 
cgit v1.2.3


From f6b6aefee70aa5261deec7feab80c249bf58397f Mon Sep 17 00:00:00 2001
From: Bjorn Helgaas <bhelgaas@google.com>
Date: Thu, 30 May 2019 08:05:58 -0500
Subject: PCI: Fix typos and whitespace errors

Fix typos in drivers/pci.  Comment and whitespace changes only.

Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Reviewed-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Reviewed-by: Randy Dunlap <rdunlap@infradead.org>
---
 drivers/pci/ats.c                            |  2 +-
 drivers/pci/controller/dwc/pcie-armada8k.c   |  2 +-
 drivers/pci/controller/dwc/pcie-kirin.c      |  2 +-
 drivers/pci/controller/pci-aardvark.c        |  2 +-
 drivers/pci/controller/pcie-iproc-platform.c |  2 +-
 drivers/pci/controller/pcie-iproc.c          |  2 +-
 drivers/pci/controller/vmd.c                 |  2 +-
 drivers/pci/mmap.c                           |  2 +-
 drivers/pci/msi.c                            | 43 ++++++++++++++--------------
 drivers/pci/p2pdma.c                         |  6 ++--
 drivers/pci/pci-bridge-emul.c                |  2 +-
 drivers/pci/pci-pf-stub.c                    |  2 +-
 drivers/pci/pci.c                            |  2 +-
 drivers/pci/pcie/aer_inject.c                |  2 +-
 include/linux/pci.h                          |  2 +-
 include/linux/pci_ids.h                      |  6 ++--
 16 files changed, 41 insertions(+), 40 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/pci/ats.c b/drivers/pci/ats.c
index 97c08146534a..e18499243f84 100644
--- a/drivers/pci/ats.c
+++ b/drivers/pci/ats.c
@@ -432,7 +432,7 @@ EXPORT_SYMBOL_GPL(pci_prg_resp_pasid_required);
  * @pdev: PCI device structure
  *
  * Returns negative value when PASID capability is not present.
- * Otherwise it returns the numer of supported PASIDs.
+ * Otherwise it returns the number of supported PASIDs.
  */
 int pci_max_pasids(struct pci_dev *pdev)
 {
diff --git a/drivers/pci/controller/dwc/pcie-armada8k.c b/drivers/pci/controller/dwc/pcie-armada8k.c
index 0c389a30ef5d..9012d5f60be9 100644
--- a/drivers/pci/controller/dwc/pcie-armada8k.c
+++ b/drivers/pci/controller/dwc/pcie-armada8k.c
@@ -55,7 +55,7 @@ struct armada8k_pcie {
 #define PCIE_ARUSER_REG			(PCIE_VENDOR_REGS_OFFSET + 0x5C)
 #define PCIE_AWUSER_REG			(PCIE_VENDOR_REGS_OFFSET + 0x60)
 /*
- * AR/AW Cache defauls: Normal memory, Write-Back, Read / Write
+ * AR/AW Cache defaults: Normal memory, Write-Back, Read / Write
  * allocate
  */
 #define ARCACHE_DEFAULT_VALUE		0x3511
diff --git a/drivers/pci/controller/dwc/pcie-kirin.c b/drivers/pci/controller/dwc/pcie-kirin.c
index 9b599296205d..8df1914226be 100644
--- a/drivers/pci/controller/dwc/pcie-kirin.c
+++ b/drivers/pci/controller/dwc/pcie-kirin.c
@@ -2,7 +2,7 @@
 /*
  * PCIe host controller driver for Kirin Phone SoCs
  *
- * Copyright (C) 2017 Hilisicon Electronics Co., Ltd.
+ * Copyright (C) 2017 HiSilicon Electronics Co., Ltd.
  *		http://www.huawei.com
  *
  * Author: Xiaowei Song <songxiaowei@huawei.com>
diff --git a/drivers/pci/controller/pci-aardvark.c b/drivers/pci/controller/pci-aardvark.c
index 134e0306ff00..fc0fe4d4de49 100644
--- a/drivers/pci/controller/pci-aardvark.c
+++ b/drivers/pci/controller/pci-aardvark.c
@@ -308,7 +308,7 @@ static void advk_pcie_setup_hw(struct advk_pcie *pcie)
 
 	advk_writel(pcie, PCIE_ISR1_ALL_MASK, PCIE_ISR1_MASK_REG);
 
-	/* Unmask all MSI's */
+	/* Unmask all MSIs */
 	advk_writel(pcie, 0, PCIE_MSI_MASK_REG);
 
 	/* Enable summary interrupt for GIC SPI source */
diff --git a/drivers/pci/controller/pcie-iproc-platform.c b/drivers/pci/controller/pcie-iproc-platform.c
index f30f5f3fb5c1..5a3550b6bb29 100644
--- a/drivers/pci/controller/pcie-iproc-platform.c
+++ b/drivers/pci/controller/pcie-iproc-platform.c
@@ -87,7 +87,7 @@ static int iproc_pcie_pltfm_probe(struct platform_device *pdev)
 
 	/*
 	 * DT nodes are not used by all platforms that use the iProc PCIe
-	 * core driver. For platforms that require explict inbound mapping
+	 * core driver. For platforms that require explicit inbound mapping
 	 * configuration, "dma-ranges" would have been present in DT
 	 */
 	pcie->need_ib_cfg = of_property_read_bool(np, "dma-ranges");
diff --git a/drivers/pci/controller/pcie-iproc.c b/drivers/pci/controller/pcie-iproc.c
index e3ca46497470..2d457bfdaf66 100644
--- a/drivers/pci/controller/pcie-iproc.c
+++ b/drivers/pci/controller/pcie-iproc.c
@@ -163,7 +163,7 @@ enum iproc_pcie_ib_map_type {
  * @size_unit: inbound mapping region size unit, could be SZ_1K, SZ_1M, or
  * SZ_1G
  * @region_sizes: list of supported inbound mapping region sizes in KB, MB, or
- * GB, depedning on the size unit
+ * GB, depending on the size unit
  * @nr_sizes: number of supported inbound mapping region sizes
  * @nr_windows: number of supported inbound mapping windows for the region
  * @imap_addr_offset: register offset between the upper and lower 32-bit
diff --git a/drivers/pci/controller/vmd.c b/drivers/pci/controller/vmd.c
index 999a5509e57e..4575e0c6dc4b 100644
--- a/drivers/pci/controller/vmd.c
+++ b/drivers/pci/controller/vmd.c
@@ -627,7 +627,7 @@ static int vmd_enable_domain(struct vmd_dev *vmd, unsigned long features)
 	 * 32-bit resources.  __pci_assign_resource() enforces that
 	 * artificial restriction to make sure everything will fit.
 	 *
-	 * The only way we could use a 64-bit non-prefechable MEMBAR is
+	 * The only way we could use a 64-bit non-prefetchable MEMBAR is
 	 * if its address is <4GB so that we can convert it to a 32-bit
 	 * resource.  To be visible to the host OS, all VMD endpoints must
 	 * be initially configured by platform BIOS, which includes setting
diff --git a/drivers/pci/mmap.c b/drivers/pci/mmap.c
index 24505b08de40..b8c9011987f4 100644
--- a/drivers/pci/mmap.c
+++ b/drivers/pci/mmap.c
@@ -73,7 +73,7 @@ int pci_mmap_resource_range(struct pci_dev *pdev, int bar,
 #elif defined(HAVE_PCI_MMAP) /* && !ARCH_GENERIC_PCI_MMAP_RESOURCE */
 
 /*
- * Legacy setup: Impement pci_mmap_resource_range() as a wrapper around
+ * Legacy setup: Implement pci_mmap_resource_range() as a wrapper around
  * the architecture's pci_mmap_page_range(), converting to "user visible"
  * addresses as necessary.
  */
diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c
index e039b740fe74..59a6d232f77a 100644
--- a/drivers/pci/msi.c
+++ b/drivers/pci/msi.c
@@ -237,7 +237,7 @@ static void msi_set_mask_bit(struct irq_data *data, u32 flag)
 }
 
 /**
- * pci_msi_mask_irq - Generic irq chip callback to mask PCI/MSI interrupts
+ * pci_msi_mask_irq - Generic IRQ chip callback to mask PCI/MSI interrupts
  * @data:	pointer to irqdata associated to that interrupt
  */
 void pci_msi_mask_irq(struct irq_data *data)
@@ -247,7 +247,7 @@ void pci_msi_mask_irq(struct irq_data *data)
 EXPORT_SYMBOL_GPL(pci_msi_mask_irq);
 
 /**
- * pci_msi_unmask_irq - Generic irq chip callback to unmask PCI/MSI interrupts
+ * pci_msi_unmask_irq - Generic IRQ chip callback to unmask PCI/MSI interrupts
  * @data:	pointer to irqdata associated to that interrupt
  */
 void pci_msi_unmask_irq(struct irq_data *data)
@@ -588,11 +588,11 @@ static int msi_verify_entries(struct pci_dev *dev)
  * msi_capability_init - configure device's MSI capability structure
  * @dev: pointer to the pci_dev data structure of MSI device function
  * @nvec: number of interrupts to allocate
- * @affd: description of automatic irq affinity assignments (may be %NULL)
+ * @affd: description of automatic IRQ affinity assignments (may be %NULL)
  *
  * Setup the MSI capability structure of the device with the requested
  * number of interrupts.  A return value of zero indicates the successful
- * setup of an entry with the new MSI irq.  A negative return value indicates
+ * setup of an entry with the new MSI IRQ.  A negative return value indicates
  * an error, and a positive return value indicates the number of interrupts
  * which could have been allocated.
  */
@@ -609,7 +609,7 @@ static int msi_capability_init(struct pci_dev *dev, int nvec,
 	if (!entry)
 		return -ENOMEM;
 
-	/* All MSIs are unmasked by default, Mask them all */
+	/* All MSIs are unmasked by default; mask them all */
 	mask = msi_mask(entry->msi_attrib.multi_cap);
 	msi_mask_irq(entry, mask, mask);
 
@@ -637,7 +637,7 @@ static int msi_capability_init(struct pci_dev *dev, int nvec,
 		return ret;
 	}
 
-	/* Set MSI enabled bits	 */
+	/* Set MSI enabled bits	*/
 	pci_intx_for_msi(dev, 0);
 	pci_msi_set_enable(dev, 1);
 	dev->msi_enabled = 1;
@@ -729,11 +729,11 @@ static void msix_program_entries(struct pci_dev *dev,
  * @dev: pointer to the pci_dev data structure of MSI-X device function
  * @entries: pointer to an array of struct msix_entry entries
  * @nvec: number of @entries
- * @affd: Optional pointer to enable automatic affinity assignement
+ * @affd: Optional pointer to enable automatic affinity assignment
  *
  * Setup the MSI-X capability structure of device function with a
- * single MSI-X irq. A return of zero indicates the successful setup of
- * requested MSI-X entries with allocated irqs or non-zero for otherwise.
+ * single MSI-X IRQ. A return of zero indicates the successful setup of
+ * requested MSI-X entries with allocated IRQs; non-zero otherwise.
  **/
 static int msix_capability_init(struct pci_dev *dev, struct msix_entry *entries,
 				int nvec, struct irq_affinity *affd)
@@ -789,7 +789,7 @@ static int msix_capability_init(struct pci_dev *dev, struct msix_entry *entries,
 out_avail:
 	if (ret < 0) {
 		/*
-		 * If we had some success, report the number of irqs
+		 * If we had some success, report the number of IRQs
 		 * we succeeded in setting up.
 		 */
 		struct msi_desc *entry;
@@ -812,7 +812,7 @@ out_free:
 /**
  * pci_msi_supported - check whether MSI may be enabled on a device
  * @dev: pointer to the pci_dev data structure of MSI device function
- * @nvec: how many MSIs have been requested ?
+ * @nvec: how many MSIs have been requested?
  *
  * Look at global flags, the device itself, and its parent buses
  * to determine if MSI/-X are supported for the device. If MSI/-X is
@@ -896,7 +896,7 @@ static void pci_msi_shutdown(struct pci_dev *dev)
 	/* Keep cached state to be restored */
 	__pci_msi_desc_mask_irq(desc, mask, ~mask);
 
-	/* Restore dev->irq to its default pin-assertion irq */
+	/* Restore dev->irq to its default pin-assertion IRQ */
 	dev->irq = desc->msi_attrib.default_irq;
 	pcibios_alloc_irq(dev);
 }
@@ -958,7 +958,7 @@ static int __pci_enable_msix(struct pci_dev *dev, struct msix_entry *entries,
 		}
 	}
 
-	/* Check whether driver already requested for MSI irq */
+	/* Check whether driver already requested for MSI IRQ */
 	if (dev->msi_enabled) {
 		pci_info(dev, "can't enable MSI-X (MSI IRQ already assigned)\n");
 		return -EINVAL;
@@ -1026,7 +1026,7 @@ static int __pci_enable_msi_range(struct pci_dev *dev, int minvec, int maxvec,
 	if (!pci_msi_supported(dev, minvec))
 		return -EINVAL;
 
-	/* Check whether driver already requested MSI-X irqs */
+	/* Check whether driver already requested MSI-X IRQs */
 	if (dev->msix_enabled) {
 		pci_info(dev, "can't enable MSI (MSI-X already enabled)\n");
 		return -EINVAL;
@@ -1113,8 +1113,8 @@ static int __pci_enable_msix_range(struct pci_dev *dev,
  * pci_enable_msix_range - configure device's MSI-X capability structure
  * @dev: pointer to the pci_dev data structure of MSI-X device function
  * @entries: pointer to an array of MSI-X entries
- * @minvec: minimum number of MSI-X irqs requested
- * @maxvec: maximum number of MSI-X irqs requested
+ * @minvec: minimum number of MSI-X IRQs requested
+ * @maxvec: maximum number of MSI-X IRQs requested
  *
  * Setup the MSI-X capability structure of device function with a maximum
  * possible number of interrupts in the range between @minvec and @maxvec
@@ -1179,7 +1179,7 @@ int pci_alloc_irq_vectors_affinity(struct pci_dev *dev, unsigned int min_vecs,
 			return msi_vecs;
 	}
 
-	/* use legacy irq if allowed */
+	/* use legacy IRQ if allowed */
 	if (flags & PCI_IRQ_LEGACY) {
 		if (min_vecs == 1 && dev->irq) {
 			/*
@@ -1248,7 +1248,7 @@ int pci_irq_vector(struct pci_dev *dev, unsigned int nr)
 EXPORT_SYMBOL(pci_irq_vector);
 
 /**
- * pci_irq_get_affinity - return the affinity of a particular msi vector
+ * pci_irq_get_affinity - return the affinity of a particular MSI vector
  * @dev:	PCI device to operate on
  * @nr:		device-relative interrupt vector index (0-based).
  */
@@ -1280,7 +1280,7 @@ const struct cpumask *pci_irq_get_affinity(struct pci_dev *dev, int nr)
 EXPORT_SYMBOL(pci_irq_get_affinity);
 
 /**
- * pci_irq_get_node - return the numa node of a particular msi vector
+ * pci_irq_get_node - return the NUMA node of a particular MSI vector
  * @pdev:	PCI device to operate on
  * @vec:	device-relative interrupt vector index (0-based).
  */
@@ -1330,7 +1330,7 @@ void pci_msi_domain_write_msg(struct irq_data *irq_data, struct msi_msg *msg)
 /**
  * pci_msi_domain_calc_hwirq - Generate a unique ID for an MSI source
  * @dev:	Pointer to the PCI device
- * @desc:	Pointer to the msi descriptor
+ * @desc:	Pointer to the MSI descriptor
  *
  * The ID number is only used within the irqdomain.
  */
@@ -1348,7 +1348,8 @@ static inline bool pci_msi_desc_is_multi_msi(struct msi_desc *desc)
 }
 
 /**
- * pci_msi_domain_check_cap - Verify that @domain supports the capabilities for @dev
+ * pci_msi_domain_check_cap - Verify that @domain supports the capabilities
+ * 			      for @dev
  * @domain:	The interrupt domain to check
  * @info:	The domain info for verification
  * @dev:	The device to check
diff --git a/drivers/pci/p2pdma.c b/drivers/pci/p2pdma.c
index 742928d0053e..d953cc7d9a54 100644
--- a/drivers/pci/p2pdma.c
+++ b/drivers/pci/p2pdma.c
@@ -223,7 +223,7 @@ EXPORT_SYMBOL_GPL(pci_p2pdma_add_resource);
 
 /*
  * Note this function returns the parent PCI device with a
- * reference taken. It is the caller's responsibily to drop
+ * reference taken. It is the caller's responsibility to drop
  * the reference.
  */
 static struct pci_dev *find_parent_pci_dev(struct device *dev)
@@ -380,7 +380,7 @@ static int upstream_bridge_distance(struct pci_dev *provider,
 
 	/*
 	 * Allow the connection if both devices are on a whitelisted root
-	 * complex, but add an arbitary large value to the distance.
+	 * complex, but add an arbitrary large value to the distance.
 	 */
 	if (root_complex_whitelist(provider) &&
 	    root_complex_whitelist(client))
@@ -439,7 +439,7 @@ static int upstream_bridge_distance_warn(struct pci_dev *provider,
 }
 
 /**
- * pci_p2pdma_distance_many - Determive the cumulative distance between
+ * pci_p2pdma_distance_many - Determine the cumulative distance between
  *	a p2pdma provider and the clients in use.
  * @provider: p2pdma provider to check against the client list
  * @clients: array of devices to check (NULL-terminated)
diff --git a/drivers/pci/pci-bridge-emul.c b/drivers/pci/pci-bridge-emul.c
index 83fb077d0b41..06083b86d4f4 100644
--- a/drivers/pci/pci-bridge-emul.c
+++ b/drivers/pci/pci-bridge-emul.c
@@ -305,7 +305,7 @@ int pci_bridge_emul_init(struct pci_bridge_emul *bridge,
 }
 
 /*
- * Cleanup a pci_bridge_emul structure that was previously initilized
+ * Cleanup a pci_bridge_emul structure that was previously initialized
  * using pci_bridge_emul_init().
  */
 void pci_bridge_emul_cleanup(struct pci_bridge_emul *bridge)
diff --git a/drivers/pci/pci-pf-stub.c b/drivers/pci/pci-pf-stub.c
index 9795649fc6f9..ef293e735c55 100644
--- a/drivers/pci/pci-pf-stub.c
+++ b/drivers/pci/pci-pf-stub.c
@@ -1,7 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0
 /* pci-pf-stub - simple stub driver for PCI SR-IOV PF device
  *
- * This driver is meant to act as a "whitelist" for devices that provde
+ * This driver is meant to act as a "whitelist" for devices that provide
  * SR-IOV functionality while at the same time not actually needing a
  * driver of their own.
  */
diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index 8abc843b1615..3fd4eaa32b21 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -4501,7 +4501,7 @@ static int pci_af_flr(struct pci_dev *dev, int probe)
 
 	/*
 	 * Wait for Transaction Pending bit to clear.  A word-aligned test
-	 * is used, so we use the conrol offset rather than status and shift
+	 * is used, so we use the control offset rather than status and shift
 	 * the test bit to match.
 	 */
 	if (!pci_wait_for_pending(dev, pos + PCI_AF_CTRL,
diff --git a/drivers/pci/pcie/aer_inject.c b/drivers/pci/pcie/aer_inject.c
index 043b8b0cfcc5..6988fe7389b9 100644
--- a/drivers/pci/pcie/aer_inject.c
+++ b/drivers/pci/pcie/aer_inject.c
@@ -2,7 +2,7 @@
 /*
  * PCIe AER software error injection support.
  *
- * Debuging PCIe AER code is quite difficult because it is hard to
+ * Debugging PCIe AER code is quite difficult because it is hard to
  * trigger various real hardware errors. Software based error
  * injection can fake almost all kinds of errors with the help of a
  * user space helper tool aer-inject, which can be gotten from:
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 4a5a84d7bdd4..fb207a22d686 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -382,7 +382,7 @@ struct pci_dev {
 
 	unsigned int	is_busmaster:1;		/* Is busmaster */
 	unsigned int	no_msi:1;		/* May not use MSI */
-	unsigned int	no_64bit_msi:1; 	/* May only use 32-bit MSIs */
+	unsigned int	no_64bit_msi:1;		/* May only use 32-bit MSIs */
 	unsigned int	block_cfg_access:1;	/* Config space access blocked */
 	unsigned int	broken_parity_status:1;	/* Generates false positive parity */
 	unsigned int	irq_reroute_variant:2;	/* Needs IRQ rerouting variant */
diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index 70e86148cb1e..0dd239f11e91 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -1112,7 +1112,7 @@
 
 #define PCI_VENDOR_ID_AL		0x10b9
 #define PCI_DEVICE_ID_AL_M1533		0x1533
-#define PCI_DEVICE_ID_AL_M1535 		0x1535
+#define PCI_DEVICE_ID_AL_M1535		0x1535
 #define PCI_DEVICE_ID_AL_M1541		0x1541
 #define PCI_DEVICE_ID_AL_M1563		0x1563
 #define PCI_DEVICE_ID_AL_M1621		0x1621
@@ -1752,7 +1752,7 @@
 #define PCI_VENDOR_ID_STALLION		0x124d
 
 /* Allied Telesyn */
-#define PCI_VENDOR_ID_AT    		0x1259
+#define PCI_VENDOR_ID_AT		0x1259
 #define PCI_SUBDEVICE_ID_AT_2700FX	0x2701
 #define PCI_SUBDEVICE_ID_AT_2701FX	0x2703
 
@@ -2550,7 +2550,7 @@
 #define PCI_DEVICE_ID_KORENIX_JETCARDF2	0x1700
 #define PCI_DEVICE_ID_KORENIX_JETCARDF3	0x17ff
 
-#define PCI_VENDOR_ID_HUAWEI         	0x19e5
+#define PCI_VENDOR_ID_HUAWEI		0x19e5
 
 #define PCI_VENDOR_ID_NETRONOME		0x19ee
 #define PCI_DEVICE_ID_NETRONOME_NFP4000	0x4000
-- 
cgit v1.2.3


From 675dd90ad0932f2c03912a5252458d792bd7033a Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Wed, 19 Jun 2019 10:33:42 -0400
Subject: xprtrdma: Modernize ops->connect

Adapt and apply changes that were made to the TCP socket connect
code. See the following commits for details on the purpose of
these changes:

Commit 7196dbb02ea0 ("SUNRPC: Allow changing of the TCP timeout parameters on the fly")
Commit 3851f1cdb2b8 ("SUNRPC: Limit the reconnect backoff timer to the max RPC message timeout")
Commit 02910177aede ("SUNRPC: Fix reconnection timeouts")

Some common transport code is moved to xprt.c to satisfy the code
duplication police.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
---
 include/linux/sunrpc/xprt.h     |  3 ++
 include/trace/events/rpcrdma.h  | 31 +++++++++++++++++++
 net/sunrpc/sched.c              |  1 +
 net/sunrpc/xprt.c               | 32 ++++++++++++++++++++
 net/sunrpc/xprtrdma/transport.c | 66 +++++++++++++++++++++++++++++++----------
 net/sunrpc/xprtrdma/xprt_rdma.h |  1 +
 net/sunrpc/xprtsock.c           | 23 ++------------
 7 files changed, 121 insertions(+), 36 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h
index a6d9fce7f20e..cc78fd38ea7d 100644
--- a/include/linux/sunrpc/xprt.h
+++ b/include/linux/sunrpc/xprt.h
@@ -334,6 +334,9 @@ struct xprt_class {
  */
 struct rpc_xprt		*xprt_create_transport(struct xprt_create *args);
 void			xprt_connect(struct rpc_task *task);
+unsigned long		xprt_reconnect_delay(const struct rpc_xprt *xprt);
+void			xprt_reconnect_backoff(struct rpc_xprt *xprt,
+					       unsigned long init_to);
 void			xprt_reserve(struct rpc_task *task);
 void			xprt_retry_reserve(struct rpc_task *task);
 int			xprt_reserve_xprt(struct rpc_xprt *xprt, struct rpc_task *task);
diff --git a/include/trace/events/rpcrdma.h b/include/trace/events/rpcrdma.h
index 98023d91a72d..f6a4eaa85a3e 100644
--- a/include/trace/events/rpcrdma.h
+++ b/include/trace/events/rpcrdma.h
@@ -375,6 +375,37 @@ DEFINE_RXPRT_EVENT(xprtrdma_op_inject_dsc);
 DEFINE_RXPRT_EVENT(xprtrdma_op_close);
 DEFINE_RXPRT_EVENT(xprtrdma_op_connect);
 
+TRACE_EVENT(xprtrdma_op_set_cto,
+	TP_PROTO(
+		const struct rpcrdma_xprt *r_xprt,
+		unsigned long connect,
+		unsigned long reconnect
+	),
+
+	TP_ARGS(r_xprt, connect, reconnect),
+
+	TP_STRUCT__entry(
+		__field(const void *, r_xprt)
+		__field(unsigned long, connect)
+		__field(unsigned long, reconnect)
+		__string(addr, rpcrdma_addrstr(r_xprt))
+		__string(port, rpcrdma_portstr(r_xprt))
+	),
+
+	TP_fast_assign(
+		__entry->r_xprt = r_xprt;
+		__entry->connect = connect;
+		__entry->reconnect = reconnect;
+		__assign_str(addr, rpcrdma_addrstr(r_xprt));
+		__assign_str(port, rpcrdma_portstr(r_xprt));
+	),
+
+	TP_printk("peer=[%s]:%s r_xprt=%p: connect=%lu reconnect=%lu",
+		__get_str(addr), __get_str(port), __entry->r_xprt,
+		__entry->connect / HZ, __entry->reconnect / HZ
+	)
+);
+
 TRACE_EVENT(xprtrdma_qp_event,
 	TP_PROTO(
 		const struct rpcrdma_xprt *r_xprt,
diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c
index bb04ae52803a..5ad5dead7bfc 100644
--- a/net/sunrpc/sched.c
+++ b/net/sunrpc/sched.c
@@ -58,6 +58,7 @@ static struct rpc_wait_queue delay_queue;
  */
 struct workqueue_struct *rpciod_workqueue __read_mostly;
 struct workqueue_struct *xprtiod_workqueue __read_mostly;
+EXPORT_SYMBOL_GPL(xprtiod_workqueue);
 
 unsigned long
 rpc_task_timeout(const struct rpc_task *task)
diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index ad21880d5601..b1f54b7ccc0c 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -850,6 +850,38 @@ void xprt_connect(struct rpc_task *task)
 	xprt_release_write(xprt, task);
 }
 
+/**
+ * xprt_reconnect_delay - compute the wait before scheduling a connect
+ * @xprt: transport instance
+ *
+ */
+unsigned long xprt_reconnect_delay(const struct rpc_xprt *xprt)
+{
+	unsigned long start, now = jiffies;
+
+	start = xprt->stat.connect_start + xprt->reestablish_timeout;
+	if (time_after(start, now))
+		return start - now;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(xprt_reconnect_delay);
+
+/**
+ * xprt_reconnect_backoff - compute the new re-establish timeout
+ * @xprt: transport instance
+ * @init_to: initial reestablish timeout
+ *
+ */
+void xprt_reconnect_backoff(struct rpc_xprt *xprt, unsigned long init_to)
+{
+	xprt->reestablish_timeout <<= 1;
+	if (xprt->reestablish_timeout > xprt->max_reconnect_timeout)
+		xprt->reestablish_timeout = xprt->max_reconnect_timeout;
+	if (xprt->reestablish_timeout < init_to)
+		xprt->reestablish_timeout = init_to;
+}
+EXPORT_SYMBOL_GPL(xprt_reconnect_backoff);
+
 enum xprt_xid_rb_cmp {
 	XID_RB_EQUAL,
 	XID_RB_LEFT,
diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
index 3688e0782587..4993aa49ecbe 100644
--- a/net/sunrpc/xprtrdma/transport.c
+++ b/net/sunrpc/xprtrdma/transport.c
@@ -298,6 +298,7 @@ xprt_rdma_destroy(struct rpc_xprt *xprt)
 	module_put(THIS_MODULE);
 }
 
+/* 60 second timeout, no retries */
 static const struct rpc_timeout xprt_rdma_default_timeout = {
 	.to_initval = 60 * HZ,
 	.to_maxval = 60 * HZ,
@@ -323,8 +324,9 @@ xprt_setup_rdma(struct xprt_create *args)
 	if (!xprt)
 		return ERR_PTR(-ENOMEM);
 
-	/* 60 second timeout, no retries */
 	xprt->timeout = &xprt_rdma_default_timeout;
+	xprt->connect_timeout = xprt->timeout->to_initval;
+	xprt->max_reconnect_timeout = xprt->timeout->to_maxval;
 	xprt->bind_timeout = RPCRDMA_BIND_TO;
 	xprt->reestablish_timeout = RPCRDMA_INIT_REEST_TO;
 	xprt->idle_timeout = RPCRDMA_IDLE_DISC_TO;
@@ -487,31 +489,64 @@ xprt_rdma_timer(struct rpc_xprt *xprt, struct rpc_task *task)
 }
 
 /**
- * xprt_rdma_connect - try to establish a transport connection
+ * xprt_rdma_tcp_set_connect_timeout - set timeouts for establishing a connection
+ * @xprt: controlling transport instance
+ * @connect_timeout: reconnect timeout after client disconnects
+ * @reconnect_timeout: reconnect timeout after server disconnects
+ *
+ */
+static void xprt_rdma_tcp_set_connect_timeout(struct rpc_xprt *xprt,
+					      unsigned long connect_timeout,
+					      unsigned long reconnect_timeout)
+{
+	struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
+
+	trace_xprtrdma_op_set_cto(r_xprt, connect_timeout, reconnect_timeout);
+
+	spin_lock(&xprt->transport_lock);
+
+	if (connect_timeout < xprt->connect_timeout) {
+		struct rpc_timeout to;
+		unsigned long initval;
+
+		to = *xprt->timeout;
+		initval = connect_timeout;
+		if (initval < RPCRDMA_INIT_REEST_TO << 1)
+			initval = RPCRDMA_INIT_REEST_TO << 1;
+		to.to_initval = initval;
+		to.to_maxval = initval;
+		r_xprt->rx_timeout = to;
+		xprt->timeout = &r_xprt->rx_timeout;
+		xprt->connect_timeout = connect_timeout;
+	}
+
+	if (reconnect_timeout < xprt->max_reconnect_timeout)
+		xprt->max_reconnect_timeout = reconnect_timeout;
+
+	spin_unlock(&xprt->transport_lock);
+}
+
+/**
+ * xprt_rdma_connect - schedule an attempt to reconnect
  * @xprt: transport state
- * @task: RPC scheduler context
+ * @task: RPC scheduler context (unused)
  *
  */
 static void
 xprt_rdma_connect(struct rpc_xprt *xprt, struct rpc_task *task)
 {
 	struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
+	unsigned long delay;
 
 	trace_xprtrdma_op_connect(r_xprt);
+
+	delay = 0;
 	if (r_xprt->rx_ep.rep_connected != 0) {
-		/* Reconnect */
-		schedule_delayed_work(&r_xprt->rx_connect_worker,
-				      xprt->reestablish_timeout);
-		xprt->reestablish_timeout <<= 1;
-		if (xprt->reestablish_timeout > RPCRDMA_MAX_REEST_TO)
-			xprt->reestablish_timeout = RPCRDMA_MAX_REEST_TO;
-		else if (xprt->reestablish_timeout < RPCRDMA_INIT_REEST_TO)
-			xprt->reestablish_timeout = RPCRDMA_INIT_REEST_TO;
-	} else {
-		schedule_delayed_work(&r_xprt->rx_connect_worker, 0);
-		if (!RPC_IS_ASYNC(task))
-			flush_delayed_work(&r_xprt->rx_connect_worker);
+		delay = xprt_reconnect_delay(xprt);
+		xprt_reconnect_backoff(xprt, RPCRDMA_INIT_REEST_TO);
 	}
+	queue_delayed_work(xprtiod_workqueue, &r_xprt->rx_connect_worker,
+			   delay);
 }
 
 /**
@@ -769,6 +804,7 @@ static const struct rpc_xprt_ops xprt_rdma_procs = {
 	.send_request		= xprt_rdma_send_request,
 	.close			= xprt_rdma_close,
 	.destroy		= xprt_rdma_destroy,
+	.set_connect_timeout	= xprt_rdma_tcp_set_connect_timeout,
 	.print_stats		= xprt_rdma_print_stats,
 	.enable_swap		= xprt_rdma_enable_swap,
 	.disable_swap		= xprt_rdma_disable_swap,
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index 117e32816e4f..8378f45d2da7 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -432,6 +432,7 @@ struct rpcrdma_xprt {
 	struct rpcrdma_ep	rx_ep;
 	struct rpcrdma_buffer	rx_buf;
 	struct delayed_work	rx_connect_worker;
+	struct rpc_timeout	rx_timeout;
 	struct rpcrdma_stats	rx_stats;
 };
 
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index c69951ed2ebc..b154600085d6 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -2402,25 +2402,6 @@ out:
 	xprt_wake_pending_tasks(xprt, status);
 }
 
-static unsigned long xs_reconnect_delay(const struct rpc_xprt *xprt)
-{
-	unsigned long start, now = jiffies;
-
-	start = xprt->stat.connect_start + xprt->reestablish_timeout;
-	if (time_after(start, now))
-		return start - now;
-	return 0;
-}
-
-static void xs_reconnect_backoff(struct rpc_xprt *xprt)
-{
-	xprt->reestablish_timeout <<= 1;
-	if (xprt->reestablish_timeout > xprt->max_reconnect_timeout)
-		xprt->reestablish_timeout = xprt->max_reconnect_timeout;
-	if (xprt->reestablish_timeout < XS_TCP_INIT_REEST_TO)
-		xprt->reestablish_timeout = XS_TCP_INIT_REEST_TO;
-}
-
 /**
  * xs_connect - connect a socket to a remote endpoint
  * @xprt: pointer to transport structure
@@ -2450,8 +2431,8 @@ static void xs_connect(struct rpc_xprt *xprt, struct rpc_task *task)
 		/* Start by resetting any existing state */
 		xs_reset_transport(transport);
 
-		delay = xs_reconnect_delay(xprt);
-		xs_reconnect_backoff(xprt);
+		delay = xprt_reconnect_delay(xprt);
+		xprt_reconnect_backoff(xprt, XS_TCP_INIT_REEST_TO);
 
 	} else
 		dprintk("RPC:       xs_connect scheduled xprt %p\n", xprt);
-- 
cgit v1.2.3


From 75a56758d6390ea6db523ad26ce378f34b907b0c Mon Sep 17 00:00:00 2001
From: Paul Blakey <paulb@mellanox.com>
Date: Tue, 9 Jul 2019 10:30:49 +0300
Subject: net/flow_dissector: add connection tracking dissection

Retrieves connection tracking zone, mark, label, and state from
a SKB.

Signed-off-by: Paul Blakey <paulb@mellanox.com>
Signed-off-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h       | 10 ++++++++++
 include/net/flow_dissector.h | 15 +++++++++++++++
 net/core/flow_dissector.c    | 44 ++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 69 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 9d7a2c28ea35..d8af86d995d6 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -1325,6 +1325,16 @@ void skb_flow_dissect_meta(const struct sk_buff *skb,
 			   struct flow_dissector *flow_dissector,
 			   void *target_container);
 
+/* Gets a skb's connection tracking info. ctinfo_map should be
+ * a map of mapsize to translate enum ip_conntrack_info states
+ * to user states.
+ */
+void
+skb_flow_dissect_ct(const struct sk_buff *skb,
+		    struct flow_dissector *flow_dissector,
+		    void *target_container,
+		    u16 *ctinfo_map,
+		    size_t mapsize);
 void
 skb_flow_dissect_tunnel_info(const struct sk_buff *skb,
 			     struct flow_dissector *flow_dissector,
diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h
index 02478e48fae4..90bd210be060 100644
--- a/include/net/flow_dissector.h
+++ b/include/net/flow_dissector.h
@@ -208,6 +208,20 @@ struct flow_dissector_key_meta {
 	int ingress_ifindex;
 };
 
+/**
+ * struct flow_dissector_key_ct:
+ * @ct_state: conntrack state after converting with map
+ * @ct_mark: conntrack mark
+ * @ct_zone: conntrack zone
+ * @ct_labels: conntrack labels
+ */
+struct flow_dissector_key_ct {
+	u16	ct_state;
+	u16	ct_zone;
+	u32	ct_mark;
+	u32	ct_labels[4];
+};
+
 enum flow_dissector_key_id {
 	FLOW_DISSECTOR_KEY_CONTROL, /* struct flow_dissector_key_control */
 	FLOW_DISSECTOR_KEY_BASIC, /* struct flow_dissector_key_basic */
@@ -234,6 +248,7 @@ enum flow_dissector_key_id {
 	FLOW_DISSECTOR_KEY_ENC_IP, /* struct flow_dissector_key_ip */
 	FLOW_DISSECTOR_KEY_ENC_OPTS, /* struct flow_dissector_key_enc_opts */
 	FLOW_DISSECTOR_KEY_META, /* struct flow_dissector_key_meta */
+	FLOW_DISSECTOR_KEY_CT, /* struct flow_dissector_key_ct */
 
 	FLOW_DISSECTOR_KEY_MAX,
 };
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index 01ad60b5aa75..3e6fedb57bc1 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -27,6 +27,10 @@
 #include <scsi/fc/fc_fcoe.h>
 #include <uapi/linux/batadv_packet.h>
 #include <linux/bpf.h>
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
+#include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/nf_conntrack_labels.h>
+#endif
 
 static DEFINE_MUTEX(flow_dissector_mutex);
 
@@ -231,6 +235,46 @@ skb_flow_dissect_set_enc_addr_type(enum flow_dissector_key_id type,
 	ctrl->addr_type = type;
 }
 
+void
+skb_flow_dissect_ct(const struct sk_buff *skb,
+		    struct flow_dissector *flow_dissector,
+		    void *target_container,
+		    u16 *ctinfo_map,
+		    size_t mapsize)
+{
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
+	struct flow_dissector_key_ct *key;
+	enum ip_conntrack_info ctinfo;
+	struct nf_conn_labels *cl;
+	struct nf_conn *ct;
+
+	if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_CT))
+		return;
+
+	ct = nf_ct_get(skb, &ctinfo);
+	if (!ct)
+		return;
+
+	key = skb_flow_dissector_target(flow_dissector,
+					FLOW_DISSECTOR_KEY_CT,
+					target_container);
+
+	if (ctinfo < mapsize)
+		key->ct_state = ctinfo_map[ctinfo];
+#if IS_ENABLED(CONFIG_NF_CONNTRACK_ZONES)
+	key->ct_zone = ct->zone.id;
+#endif
+#if IS_ENABLED(CONFIG_NF_CONNTRACK_MARK)
+	key->ct_mark = ct->mark;
+#endif
+
+	cl = nf_ct_labels_find(ct);
+	if (cl)
+		memcpy(key->ct_labels, cl->bits, sizeof(key->ct_labels));
+#endif /* CONFIG_NF_CONNTRACK */
+}
+EXPORT_SYMBOL(skb_flow_dissect_ct);
+
 void
 skb_flow_dissect_tunnel_info(const struct sk_buff *skb,
 			     struct flow_dissector *flow_dissector,
-- 
cgit v1.2.3


From 0fa03c624d8fc9932d0f27c39a9deca6a37e0e17 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Fri, 19 Apr 2019 13:34:07 -0600
Subject: io_uring: add support for sendmsg()

This is done through IORING_OP_SENDMSG. There's a new sqe->msg_flags
for the flags argument, and the msghdr struct is passed in the
sqe->addr field.

We use MSG_DONTWAIT to force an inline fast path if sendmsg() doesn't
block, and punt to async execution if it would have.

Acked-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c                 | 40 ++++++++++++++++++++++++++++++++++++++++
 include/linux/socket.h        |  4 ++++
 include/uapi/linux/io_uring.h |  2 ++
 net/socket.c                  |  7 +++++++
 4 files changed, 53 insertions(+)

(limited to 'include/linux')

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 9f0ef4956f87..5d4cd8c4132d 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -1390,6 +1390,43 @@ static int io_sync_file_range(struct io_kiocb *req,
 	return 0;
 }
 
+static int io_sendmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe,
+		      bool force_nonblock)
+{
+#if defined(CONFIG_NET)
+	struct socket *sock;
+	int ret;
+
+	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
+		return -EINVAL;
+
+	sock = sock_from_file(req->file, &ret);
+	if (sock) {
+		struct user_msghdr __user *msg;
+		unsigned flags;
+
+		flags = READ_ONCE(sqe->msg_flags);
+		if (flags & MSG_DONTWAIT)
+			req->flags |= REQ_F_NOWAIT;
+		else if (force_nonblock)
+			flags |= MSG_DONTWAIT;
+
+		msg = (struct user_msghdr __user *) (unsigned long)
+			READ_ONCE(sqe->addr);
+
+		ret = __sys_sendmsg_sock(sock, msg, flags);
+		if (force_nonblock && ret == -EAGAIN)
+			return ret;
+	}
+
+	io_cqring_add_event(req->ctx, sqe->user_data, ret);
+	io_put_req(req);
+	return 0;
+#else
+	return -EOPNOTSUPP;
+#endif
+}
+
 static void io_poll_remove_one(struct io_kiocb *req)
 {
 	struct io_poll_iocb *poll = &req->poll;
@@ -1675,6 +1712,9 @@ static int __io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
 	case IORING_OP_SYNC_FILE_RANGE:
 		ret = io_sync_file_range(req, s->sqe, force_nonblock);
 		break;
+	case IORING_OP_SENDMSG:
+		ret = io_sendmsg(req, s->sqe, force_nonblock);
+		break;
 	default:
 		ret = -EINVAL;
 		break;
diff --git a/include/linux/socket.h b/include/linux/socket.h
index b57cd8bf96e2..9d770ef3ced5 100644
--- a/include/linux/socket.h
+++ b/include/linux/socket.h
@@ -12,6 +12,7 @@
 
 struct pid;
 struct cred;
+struct socket;
 
 #define __sockaddr_check_size(size)	\
 	BUILD_BUG_ON(((size) > sizeof(struct __kernel_sockaddr_storage)))
@@ -374,6 +375,9 @@ extern int __sys_recvmmsg(int fd, struct mmsghdr __user *mmsg,
 extern int __sys_sendmmsg(int fd, struct mmsghdr __user *mmsg,
 			  unsigned int vlen, unsigned int flags,
 			  bool forbid_cmsg_compat);
+extern long __sys_sendmsg_sock(struct socket *sock,
+			       struct user_msghdr __user *msg,
+			       unsigned int flags);
 
 /* helpers which do the actual work for syscalls */
 extern int __sys_recvfrom(int fd, void __user *ubuf, size_t size,
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 10b7c45f6d57..d74742d6269f 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -27,6 +27,7 @@ struct io_uring_sqe {
 		__u32		fsync_flags;
 		__u16		poll_events;
 		__u32		sync_range_flags;
+		__u32		msg_flags;
 	};
 	__u64	user_data;	/* data to be passed back at completion time */
 	union {
@@ -58,6 +59,7 @@ struct io_uring_sqe {
 #define IORING_OP_POLL_ADD	6
 #define IORING_OP_POLL_REMOVE	7
 #define IORING_OP_SYNC_FILE_RANGE	8
+#define IORING_OP_SENDMSG	9
 
 /*
  * sqe->fsync_flags
diff --git a/net/socket.c b/net/socket.c
index bffec466b4f1..b9536940255e 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -2313,6 +2313,13 @@ out_freeiov:
 /*
  *	BSD sendmsg interface
  */
+long __sys_sendmsg_sock(struct socket *sock, struct user_msghdr __user *msg,
+			unsigned int flags)
+{
+	struct msghdr msg_sys;
+
+	return ___sys_sendmsg(sock, msg, &msg_sys, flags, NULL, 0);
+}
 
 long __sys_sendmsg(int fd, struct user_msghdr __user *msg, unsigned int flags,
 		   bool forbid_cmsg_compat)
-- 
cgit v1.2.3


From aa1fa28fc73ea6b740ee7b62bf3b07141883dbb8 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Fri, 19 Apr 2019 13:38:09 -0600
Subject: io_uring: add support for recvmsg()

This is done through IORING_OP_RECVMSG. This opcode uses the same
sqe->msg_flags that IORING_OP_SENDMSG added, and we pass in the
msghdr struct in the sqe->addr field as well.

We use MSG_DONTWAIT to force an inline fast path if recvmsg() doesn't
block, and punt to async execution if it would have.

Acked-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c                 | 31 +++++++++++++++++++++++++++----
 include/linux/socket.h        |  3 +++
 include/uapi/linux/io_uring.h |  1 +
 net/socket.c                  |  8 ++++++++
 4 files changed, 39 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 5d4cd8c4132d..8d86e31b0762 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -1390,10 +1390,12 @@ static int io_sync_file_range(struct io_kiocb *req,
 	return 0;
 }
 
-static int io_sendmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe,
-		      bool force_nonblock)
-{
 #if defined(CONFIG_NET)
+static int io_send_recvmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe,
+			   bool force_nonblock,
+		   long (*fn)(struct socket *, struct user_msghdr __user *,
+				unsigned int))
+{
 	struct socket *sock;
 	int ret;
 
@@ -1414,7 +1416,7 @@ static int io_sendmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 		msg = (struct user_msghdr __user *) (unsigned long)
 			READ_ONCE(sqe->addr);
 
-		ret = __sys_sendmsg_sock(sock, msg, flags);
+		ret = fn(sock, msg, flags);
 		if (force_nonblock && ret == -EAGAIN)
 			return ret;
 	}
@@ -1422,6 +1424,24 @@ static int io_sendmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 	io_cqring_add_event(req->ctx, sqe->user_data, ret);
 	io_put_req(req);
 	return 0;
+}
+#endif
+
+static int io_sendmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe,
+		      bool force_nonblock)
+{
+#if defined(CONFIG_NET)
+	return io_send_recvmsg(req, sqe, force_nonblock, __sys_sendmsg_sock);
+#else
+	return -EOPNOTSUPP;
+#endif
+}
+
+static int io_recvmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe,
+		      bool force_nonblock)
+{
+#if defined(CONFIG_NET)
+	return io_send_recvmsg(req, sqe, force_nonblock, __sys_recvmsg_sock);
 #else
 	return -EOPNOTSUPP;
 #endif
@@ -1715,6 +1735,9 @@ static int __io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
 	case IORING_OP_SENDMSG:
 		ret = io_sendmsg(req, s->sqe, force_nonblock);
 		break;
+	case IORING_OP_RECVMSG:
+		ret = io_recvmsg(req, s->sqe, force_nonblock);
+		break;
 	default:
 		ret = -EINVAL;
 		break;
diff --git a/include/linux/socket.h b/include/linux/socket.h
index 9d770ef3ced5..97523818cb14 100644
--- a/include/linux/socket.h
+++ b/include/linux/socket.h
@@ -378,6 +378,9 @@ extern int __sys_sendmmsg(int fd, struct mmsghdr __user *mmsg,
 extern long __sys_sendmsg_sock(struct socket *sock,
 			       struct user_msghdr __user *msg,
 			       unsigned int flags);
+extern long __sys_recvmsg_sock(struct socket *sock,
+			       struct user_msghdr __user *msg,
+			       unsigned int flags);
 
 /* helpers which do the actual work for syscalls */
 extern int __sys_recvfrom(int fd, void __user *ubuf, size_t size,
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index d74742d6269f..1e1652f25cc1 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -60,6 +60,7 @@ struct io_uring_sqe {
 #define IORING_OP_POLL_REMOVE	7
 #define IORING_OP_SYNC_FILE_RANGE	8
 #define IORING_OP_SENDMSG	9
+#define IORING_OP_RECVMSG	10
 
 /*
  * sqe->fsync_flags
diff --git a/net/socket.c b/net/socket.c
index b9536940255e..98354cc18840 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -2494,6 +2494,14 @@ out_freeiov:
  *	BSD recvmsg interface
  */
 
+long __sys_recvmsg_sock(struct socket *sock, struct user_msghdr __user *msg,
+			unsigned int flags)
+{
+	struct msghdr msg_sys;
+
+	return ___sys_recvmsg(sock, msg, &msg_sys, flags, 0);
+}
+
 long __sys_recvmsg(int fd, struct user_msghdr __user *msg, unsigned int flags,
 		   bool forbid_cmsg_compat)
 {
-- 
cgit v1.2.3


From 6605bdd59c21bb34c8f14ac4d6f2d419185f3528 Mon Sep 17 00:00:00 2001
From: Bart Van Assche <bvanassche@acm.org>
Date: Fri, 28 Jun 2019 09:53:29 -0700
Subject: nvme: add I/O characteristics fields

Several new fields have been introduced in version 1.4 of the NVMe spec
at offsets that were defined as reserved in version 1.3d of the NVMe
spec. Update the definition of the nvme_id_ns data structure such that
it is in sync with version 1.4 of the NVMe spec. This change preserves
backwards compatibility.

Signed-off-by: Bart Van Assche <bvanassche@acm.org>
Reviewed-by: Keith Busch <kbusch@kernel.org>
Reviewed-by: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Reviewed-by: Hannes Reinecke <hare@suse.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 include/linux/nvme.h | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index d98b2d8baf4e..01aa6a6c241d 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -315,7 +315,7 @@ struct nvme_id_ns {
 	__u8			nmic;
 	__u8			rescap;
 	__u8			fpi;
-	__u8			rsvd33;
+	__u8			dlfeat;
 	__le16			nawun;
 	__le16			nawupf;
 	__le16			nacwu;
@@ -324,11 +324,17 @@ struct nvme_id_ns {
 	__le16			nabspf;
 	__le16			noiob;
 	__u8			nvmcap[16];
-	__u8			rsvd64[28];
+	__le16			npwg;
+	__le16			npwa;
+	__le16			npdg;
+	__le16			npda;
+	__le16			nows;
+	__u8			rsvd74[18];
 	__le32			anagrpid;
 	__u8			rsvd96[3];
 	__u8			nsattr;
-	__u8			rsvd100[4];
+	__le16			nvmsetid;
+	__le16			endgid;
 	__u8			nguid[16];
 	__u8			eui64[8];
 	struct nvme_lbaf	lbaf[16];
-- 
cgit v1.2.3


From 13990cf8a180cc070f0b1266140e799db8754289 Mon Sep 17 00:00:00 2001
From: Amol Surati <suratiamol@gmail.com>
Date: Sun, 7 Jul 2019 14:27:29 +0530
Subject: ide: use BIT() macro for defining bit-flags

The BIT() macro is available for defining the required bit-flags.

Since it operates on an unsigned value and expands to an unsigned result,
using it, instead of an expression like (1 << x), also fixes the problem
of shifting a signed 32-bit value by 31 bits (e.g. 1 << 31).

Signed-off-by: Amol Surati <suratiamol@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/ide.h | 272 ++++++++++++++++++++++++++--------------------------
 1 file changed, 136 insertions(+), 136 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ide.h b/include/linux/ide.h
index 971cf76a78a0..46b771d6999e 100644
--- a/include/linux/ide.h
+++ b/include/linux/ide.h
@@ -253,9 +253,9 @@ static inline void ide_std_init_ports(struct ide_hw *hw,
  * Special Driver Flags
  */
 enum {
-	IDE_SFLAG_SET_GEOMETRY		= (1 << 0),
-	IDE_SFLAG_RECALIBRATE		= (1 << 1),
-	IDE_SFLAG_SET_MULTMODE		= (1 << 2),
+	IDE_SFLAG_SET_GEOMETRY		= BIT(0),
+	IDE_SFLAG_RECALIBRATE		= BIT(1),
+	IDE_SFLAG_SET_MULTMODE		= BIT(2),
 };
 
 /*
@@ -267,13 +267,13 @@ typedef enum {
 } ide_startstop_t;
 
 enum {
-	IDE_VALID_ERROR 		= (1 << 1),
+	IDE_VALID_ERROR 		= BIT(1),
 	IDE_VALID_FEATURE		= IDE_VALID_ERROR,
-	IDE_VALID_NSECT 		= (1 << 2),
-	IDE_VALID_LBAL			= (1 << 3),
-	IDE_VALID_LBAM			= (1 << 4),
-	IDE_VALID_LBAH			= (1 << 5),
-	IDE_VALID_DEVICE		= (1 << 6),
+	IDE_VALID_NSECT 		= BIT(2),
+	IDE_VALID_LBAL			= BIT(3),
+	IDE_VALID_LBAM			= BIT(4),
+	IDE_VALID_LBAH			= BIT(5),
+	IDE_VALID_DEVICE		= BIT(6),
 	IDE_VALID_LBA			= IDE_VALID_LBAL |
 					  IDE_VALID_LBAM |
 					  IDE_VALID_LBAH,
@@ -289,24 +289,24 @@ enum {
 };
 
 enum {
-	IDE_TFLAG_LBA48			= (1 << 0),
-	IDE_TFLAG_WRITE			= (1 << 1),
-	IDE_TFLAG_CUSTOM_HANDLER	= (1 << 2),
-	IDE_TFLAG_DMA_PIO_FALLBACK	= (1 << 3),
+	IDE_TFLAG_LBA48			= BIT(0),
+	IDE_TFLAG_WRITE			= BIT(1),
+	IDE_TFLAG_CUSTOM_HANDLER	= BIT(2),
+	IDE_TFLAG_DMA_PIO_FALLBACK	= BIT(3),
 	/* force 16-bit I/O operations */
-	IDE_TFLAG_IO_16BIT		= (1 << 4),
+	IDE_TFLAG_IO_16BIT		= BIT(4),
 	/* struct ide_cmd was allocated using kmalloc() */
-	IDE_TFLAG_DYN			= (1 << 5),
-	IDE_TFLAG_FS			= (1 << 6),
-	IDE_TFLAG_MULTI_PIO		= (1 << 7),
-	IDE_TFLAG_SET_XFER		= (1 << 8),
+	IDE_TFLAG_DYN			= BIT(5),
+	IDE_TFLAG_FS			= BIT(6),
+	IDE_TFLAG_MULTI_PIO		= BIT(7),
+	IDE_TFLAG_SET_XFER		= BIT(8),
 };
 
 enum {
-	IDE_FTFLAG_FLAGGED		= (1 << 0),
-	IDE_FTFLAG_SET_IN_FLAGS		= (1 << 1),
-	IDE_FTFLAG_OUT_DATA		= (1 << 2),
-	IDE_FTFLAG_IN_DATA		= (1 << 3),
+	IDE_FTFLAG_FLAGGED		= BIT(0),
+	IDE_FTFLAG_SET_IN_FLAGS		= BIT(1),
+	IDE_FTFLAG_OUT_DATA		= BIT(2),
+	IDE_FTFLAG_IN_DATA		= BIT(3),
 };
 
 struct ide_taskfile {
@@ -357,13 +357,13 @@ struct ide_cmd {
 /* ATAPI packet command flags */
 enum {
 	/* set when an error is considered normal - no retry (ide-tape) */
-	PC_FLAG_ABORT			= (1 << 0),
-	PC_FLAG_SUPPRESS_ERROR		= (1 << 1),
-	PC_FLAG_WAIT_FOR_DSC		= (1 << 2),
-	PC_FLAG_DMA_OK			= (1 << 3),
-	PC_FLAG_DMA_IN_PROGRESS		= (1 << 4),
-	PC_FLAG_DMA_ERROR		= (1 << 5),
-	PC_FLAG_WRITING			= (1 << 6),
+	PC_FLAG_ABORT			= BIT(0),
+	PC_FLAG_SUPPRESS_ERROR		= BIT(1),
+	PC_FLAG_WAIT_FOR_DSC		= BIT(2),
+	PC_FLAG_DMA_OK			= BIT(3),
+	PC_FLAG_DMA_IN_PROGRESS		= BIT(4),
+	PC_FLAG_DMA_ERROR		= BIT(5),
+	PC_FLAG_WRITING			= BIT(6),
 };
 
 #define ATAPI_WAIT_PC		(60 * HZ)
@@ -417,111 +417,111 @@ struct ide_disk_ops {
 
 /* ATAPI device flags */
 enum {
-	IDE_AFLAG_DRQ_INTERRUPT		= (1 << 0),
+	IDE_AFLAG_DRQ_INTERRUPT		= BIT(0),
 
 	/* ide-cd */
 	/* Drive cannot eject the disc. */
-	IDE_AFLAG_NO_EJECT		= (1 << 1),
+	IDE_AFLAG_NO_EJECT		= BIT(1),
 	/* Drive is a pre ATAPI 1.2 drive. */
-	IDE_AFLAG_PRE_ATAPI12		= (1 << 2),
+	IDE_AFLAG_PRE_ATAPI12		= BIT(2),
 	/* TOC addresses are in BCD. */
-	IDE_AFLAG_TOCADDR_AS_BCD	= (1 << 3),
+	IDE_AFLAG_TOCADDR_AS_BCD	= BIT(3),
 	/* TOC track numbers are in BCD. */
-	IDE_AFLAG_TOCTRACKS_AS_BCD	= (1 << 4),
+	IDE_AFLAG_TOCTRACKS_AS_BCD	= BIT(4),
 	/* Saved TOC information is current. */
-	IDE_AFLAG_TOC_VALID		= (1 << 6),
+	IDE_AFLAG_TOC_VALID		= BIT(6),
 	/* We think that the drive door is locked. */
-	IDE_AFLAG_DOOR_LOCKED		= (1 << 7),
+	IDE_AFLAG_DOOR_LOCKED		= BIT(7),
 	/* SET_CD_SPEED command is unsupported. */
-	IDE_AFLAG_NO_SPEED_SELECT	= (1 << 8),
-	IDE_AFLAG_VERTOS_300_SSD	= (1 << 9),
-	IDE_AFLAG_VERTOS_600_ESD	= (1 << 10),
-	IDE_AFLAG_SANYO_3CD		= (1 << 11),
-	IDE_AFLAG_FULL_CAPS_PAGE	= (1 << 12),
-	IDE_AFLAG_PLAY_AUDIO_OK		= (1 << 13),
-	IDE_AFLAG_LE_SPEED_FIELDS	= (1 << 14),
+	IDE_AFLAG_NO_SPEED_SELECT	= BIT(8),
+	IDE_AFLAG_VERTOS_300_SSD	= BIT(9),
+	IDE_AFLAG_VERTOS_600_ESD	= BIT(10),
+	IDE_AFLAG_SANYO_3CD		= BIT(11),
+	IDE_AFLAG_FULL_CAPS_PAGE	= BIT(12),
+	IDE_AFLAG_PLAY_AUDIO_OK		= BIT(13),
+	IDE_AFLAG_LE_SPEED_FIELDS	= BIT(14),
 
 	/* ide-floppy */
 	/* Avoid commands not supported in Clik drive */
-	IDE_AFLAG_CLIK_DRIVE		= (1 << 15),
+	IDE_AFLAG_CLIK_DRIVE		= BIT(15),
 	/* Requires BH algorithm for packets */
-	IDE_AFLAG_ZIP_DRIVE		= (1 << 16),
+	IDE_AFLAG_ZIP_DRIVE		= BIT(16),
 	/* Supports format progress report */
-	IDE_AFLAG_SRFP			= (1 << 17),
+	IDE_AFLAG_SRFP			= BIT(17),
 
 	/* ide-tape */
-	IDE_AFLAG_IGNORE_DSC		= (1 << 18),
+	IDE_AFLAG_IGNORE_DSC		= BIT(18),
 	/* 0 When the tape position is unknown */
-	IDE_AFLAG_ADDRESS_VALID		= (1 <<	19),
+	IDE_AFLAG_ADDRESS_VALID		= BIT(19),
 	/* Device already opened */
-	IDE_AFLAG_BUSY			= (1 << 20),
+	IDE_AFLAG_BUSY			= BIT(20),
 	/* Attempt to auto-detect the current user block size */
-	IDE_AFLAG_DETECT_BS		= (1 << 21),
+	IDE_AFLAG_DETECT_BS		= BIT(21),
 	/* Currently on a filemark */
-	IDE_AFLAG_FILEMARK		= (1 << 22),
+	IDE_AFLAG_FILEMARK		= BIT(22),
 	/* 0 = no tape is loaded, so we don't rewind after ejecting */
-	IDE_AFLAG_MEDIUM_PRESENT	= (1 << 23),
+	IDE_AFLAG_MEDIUM_PRESENT	= BIT(23),
 
-	IDE_AFLAG_NO_AUTOCLOSE		= (1 << 24),
+	IDE_AFLAG_NO_AUTOCLOSE		= BIT(24),
 };
 
 /* device flags */
 enum {
 	/* restore settings after device reset */
-	IDE_DFLAG_KEEP_SETTINGS		= (1 << 0),
+	IDE_DFLAG_KEEP_SETTINGS		= BIT(0),
 	/* device is using DMA for read/write */
-	IDE_DFLAG_USING_DMA		= (1 << 1),
+	IDE_DFLAG_USING_DMA		= BIT(1),
 	/* okay to unmask other IRQs */
-	IDE_DFLAG_UNMASK		= (1 << 2),
+	IDE_DFLAG_UNMASK		= BIT(2),
 	/* don't attempt flushes */
-	IDE_DFLAG_NOFLUSH		= (1 << 3),
+	IDE_DFLAG_NOFLUSH		= BIT(3),
 	/* DSC overlap */
-	IDE_DFLAG_DSC_OVERLAP		= (1 << 4),
+	IDE_DFLAG_DSC_OVERLAP		= BIT(4),
 	/* give potential excess bandwidth */
-	IDE_DFLAG_NICE1			= (1 << 5),
+	IDE_DFLAG_NICE1			= BIT(5),
 	/* device is physically present */
-	IDE_DFLAG_PRESENT		= (1 << 6),
+	IDE_DFLAG_PRESENT		= BIT(6),
 	/* disable Host Protected Area */
-	IDE_DFLAG_NOHPA			= (1 << 7),
+	IDE_DFLAG_NOHPA			= BIT(7),
 	/* id read from device (synthetic if not set) */
-	IDE_DFLAG_ID_READ		= (1 << 8),
-	IDE_DFLAG_NOPROBE		= (1 << 9),
+	IDE_DFLAG_ID_READ		= BIT(8),
+	IDE_DFLAG_NOPROBE		= BIT(9),
 	/* need to do check_media_change() */
-	IDE_DFLAG_REMOVABLE		= (1 << 10),
+	IDE_DFLAG_REMOVABLE		= BIT(10),
 	/* needed for removable devices */
-	IDE_DFLAG_ATTACH		= (1 << 11),
-	IDE_DFLAG_FORCED_GEOM		= (1 << 12),
+	IDE_DFLAG_ATTACH		= BIT(11),
+	IDE_DFLAG_FORCED_GEOM		= BIT(12),
 	/* disallow setting unmask bit */
-	IDE_DFLAG_NO_UNMASK		= (1 << 13),
+	IDE_DFLAG_NO_UNMASK		= BIT(13),
 	/* disallow enabling 32-bit I/O */
-	IDE_DFLAG_NO_IO_32BIT		= (1 << 14),
+	IDE_DFLAG_NO_IO_32BIT		= BIT(14),
 	/* for removable only: door lock/unlock works */
-	IDE_DFLAG_DOORLOCKING		= (1 << 15),
+	IDE_DFLAG_DOORLOCKING		= BIT(15),
 	/* disallow DMA */
-	IDE_DFLAG_NODMA			= (1 << 16),
+	IDE_DFLAG_NODMA			= BIT(16),
 	/* powermanagement told us not to do anything, so sleep nicely */
-	IDE_DFLAG_BLOCKED		= (1 << 17),
+	IDE_DFLAG_BLOCKED		= BIT(17),
 	/* sleeping & sleep field valid */
-	IDE_DFLAG_SLEEPING		= (1 << 18),
-	IDE_DFLAG_POST_RESET		= (1 << 19),
-	IDE_DFLAG_UDMA33_WARNED		= (1 << 20),
-	IDE_DFLAG_LBA48			= (1 << 21),
+	IDE_DFLAG_SLEEPING		= BIT(18),
+	IDE_DFLAG_POST_RESET		= BIT(19),
+	IDE_DFLAG_UDMA33_WARNED		= BIT(20),
+	IDE_DFLAG_LBA48			= BIT(21),
 	/* status of write cache */
-	IDE_DFLAG_WCACHE		= (1 << 22),
+	IDE_DFLAG_WCACHE		= BIT(22),
 	/* used for ignoring ATA_DF */
-	IDE_DFLAG_NOWERR		= (1 << 23),
+	IDE_DFLAG_NOWERR		= BIT(23),
 	/* retrying in PIO */
-	IDE_DFLAG_DMA_PIO_RETRY		= (1 << 24),
-	IDE_DFLAG_LBA			= (1 << 25),
+	IDE_DFLAG_DMA_PIO_RETRY		= BIT(24),
+	IDE_DFLAG_LBA			= BIT(25),
 	/* don't unload heads */
-	IDE_DFLAG_NO_UNLOAD		= (1 << 26),
+	IDE_DFLAG_NO_UNLOAD		= BIT(26),
 	/* heads unloaded, please don't reset port */
-	IDE_DFLAG_PARKED		= (1 << 27),
-	IDE_DFLAG_MEDIA_CHANGED		= (1 << 28),
+	IDE_DFLAG_PARKED		= BIT(27),
+	IDE_DFLAG_MEDIA_CHANGED		= BIT(28),
 	/* write protect */
-	IDE_DFLAG_WP			= (1 << 29),
-	IDE_DFLAG_FORMAT_IN_PROGRESS	= (1 << 30),
-	IDE_DFLAG_NIEN_QUIRK		= (1 << 31),
+	IDE_DFLAG_WP			= BIT(29),
+	IDE_DFLAG_FORMAT_IN_PROGRESS	= BIT(30),
+	IDE_DFLAG_NIEN_QUIRK		= BIT(31),
 };
 
 struct ide_drive_s {
@@ -709,7 +709,7 @@ struct ide_dma_ops {
 };
 
 enum {
-	IDE_PFLAG_PROBING		= (1 << 0),
+	IDE_PFLAG_PROBING		= BIT(0),
 };
 
 struct ide_host;
@@ -862,7 +862,7 @@ extern struct mutex ide_setting_mtx;
  * configurable drive settings
  */
 
-#define DS_SYNC	(1 << 0)
+#define DS_SYNC	BIT(0)
 
 struct ide_devset {
 	int		(*get)(ide_drive_t *);
@@ -1000,15 +1000,15 @@ static inline void ide_proc_unregister_driver(ide_drive_t *drive,
 
 enum {
 	/* enter/exit functions */
-	IDE_DBG_FUNC =			(1 << 0),
+	IDE_DBG_FUNC =			BIT(0),
 	/* sense key/asc handling */
-	IDE_DBG_SENSE =			(1 << 1),
+	IDE_DBG_SENSE =			BIT(1),
 	/* packet commands handling */
-	IDE_DBG_PC =			(1 << 2),
+	IDE_DBG_PC =			BIT(2),
 	/* request handling */
-	IDE_DBG_RQ =			(1 << 3),
+	IDE_DBG_RQ =			BIT(3),
 	/* driver probing/setup */
-	IDE_DBG_PROBE =			(1 << 4),
+	IDE_DBG_PROBE =			BIT(4),
 };
 
 /* DRV_NAME has to be defined in the driver before using the macro below */
@@ -1171,10 +1171,10 @@ ssize_t ide_park_store(struct device *dev, struct device_attribute *attr,
  * the tail of our block device request queue and wait for their completion.
  */
 enum {
-	REQ_IDETAPE_PC1		= (1 << 0), /* packet command (first stage) */
-	REQ_IDETAPE_PC2		= (1 << 1), /* packet command (second stage) */
-	REQ_IDETAPE_READ	= (1 << 2),
-	REQ_IDETAPE_WRITE	= (1 << 3),
+	REQ_IDETAPE_PC1		= BIT(0), /* packet command (first stage) */
+	REQ_IDETAPE_PC2		= BIT(1), /* packet command (second stage) */
+	REQ_IDETAPE_READ	= BIT(2),
+	REQ_IDETAPE_WRITE	= BIT(3),
 };
 
 int ide_queue_pc_tail(ide_drive_t *, struct gendisk *, struct ide_atapi_pc *,
@@ -1264,71 +1264,71 @@ struct ide_pci_enablebit {
 
 enum {
 	/* Uses ISA control ports not PCI ones. */
-	IDE_HFLAG_ISA_PORTS		= (1 << 0),
+	IDE_HFLAG_ISA_PORTS		= BIT(0),
 	/* single port device */
-	IDE_HFLAG_SINGLE		= (1 << 1),
+	IDE_HFLAG_SINGLE		= BIT(1),
 	/* don't use legacy PIO blacklist */
-	IDE_HFLAG_PIO_NO_BLACKLIST	= (1 << 2),
+	IDE_HFLAG_PIO_NO_BLACKLIST	= BIT(2),
 	/* set for the second port of QD65xx */
-	IDE_HFLAG_QD_2ND_PORT		= (1 << 3),
+	IDE_HFLAG_QD_2ND_PORT		= BIT(3),
 	/* use PIO8/9 for prefetch off/on */
-	IDE_HFLAG_ABUSE_PREFETCH	= (1 << 4),
+	IDE_HFLAG_ABUSE_PREFETCH	= BIT(4),
 	/* use PIO6/7 for fast-devsel off/on */
-	IDE_HFLAG_ABUSE_FAST_DEVSEL	= (1 << 5),
+	IDE_HFLAG_ABUSE_FAST_DEVSEL	= BIT(5),
 	/* use 100-102 and 200-202 PIO values to set DMA modes */
-	IDE_HFLAG_ABUSE_DMA_MODES	= (1 << 6),
+	IDE_HFLAG_ABUSE_DMA_MODES	= BIT(6),
 	/*
 	 * keep DMA setting when programming PIO mode, may be used only
 	 * for hosts which have separate PIO and DMA timings (ie. PMAC)
 	 */
-	IDE_HFLAG_SET_PIO_MODE_KEEP_DMA	= (1 << 7),
+	IDE_HFLAG_SET_PIO_MODE_KEEP_DMA	= BIT(7),
 	/* program host for the transfer mode after programming device */
-	IDE_HFLAG_POST_SET_MODE		= (1 << 8),
+	IDE_HFLAG_POST_SET_MODE		= BIT(8),
 	/* don't program host/device for the transfer mode ("smart" hosts) */
-	IDE_HFLAG_NO_SET_MODE		= (1 << 9),
+	IDE_HFLAG_NO_SET_MODE		= BIT(9),
 	/* trust BIOS for programming chipset/device for DMA */
-	IDE_HFLAG_TRUST_BIOS_FOR_DMA	= (1 << 10),
+	IDE_HFLAG_TRUST_BIOS_FOR_DMA	= BIT(10),
 	/* host is CS5510/CS5520 */
-	IDE_HFLAG_CS5520		= (1 << 11),
+	IDE_HFLAG_CS5520		= BIT(11),
 	/* ATAPI DMA is unsupported */
-	IDE_HFLAG_NO_ATAPI_DMA		= (1 << 12),
+	IDE_HFLAG_NO_ATAPI_DMA		= BIT(12),
 	/* set if host is a "non-bootable" controller */
-	IDE_HFLAG_NON_BOOTABLE		= (1 << 13),
+	IDE_HFLAG_NON_BOOTABLE		= BIT(13),
 	/* host doesn't support DMA */
-	IDE_HFLAG_NO_DMA		= (1 << 14),
+	IDE_HFLAG_NO_DMA		= BIT(14),
 	/* check if host is PCI IDE device before allowing DMA */
-	IDE_HFLAG_NO_AUTODMA		= (1 << 15),
+	IDE_HFLAG_NO_AUTODMA		= BIT(15),
 	/* host uses MMIO */
-	IDE_HFLAG_MMIO			= (1 << 16),
+	IDE_HFLAG_MMIO			= BIT(16),
 	/* no LBA48 */
-	IDE_HFLAG_NO_LBA48		= (1 << 17),
+	IDE_HFLAG_NO_LBA48		= BIT(17),
 	/* no LBA48 DMA */
-	IDE_HFLAG_NO_LBA48_DMA		= (1 << 18),
+	IDE_HFLAG_NO_LBA48_DMA		= BIT(18),
 	/* data FIFO is cleared by an error */
-	IDE_HFLAG_ERROR_STOPS_FIFO	= (1 << 19),
+	IDE_HFLAG_ERROR_STOPS_FIFO	= BIT(19),
 	/* serialize ports */
-	IDE_HFLAG_SERIALIZE		= (1 << 20),
+	IDE_HFLAG_SERIALIZE		= BIT(20),
 	/* host is DTC2278 */
-	IDE_HFLAG_DTC2278		= (1 << 21),
+	IDE_HFLAG_DTC2278		= BIT(21),
 	/* 4 devices on a single set of I/O ports */
-	IDE_HFLAG_4DRIVES		= (1 << 22),
+	IDE_HFLAG_4DRIVES		= BIT(22),
 	/* host is TRM290 */
-	IDE_HFLAG_TRM290		= (1 << 23),
+	IDE_HFLAG_TRM290		= BIT(23),
 	/* use 32-bit I/O ops */
-	IDE_HFLAG_IO_32BIT		= (1 << 24),
+	IDE_HFLAG_IO_32BIT		= BIT(24),
 	/* unmask IRQs */
-	IDE_HFLAG_UNMASK_IRQS		= (1 << 25),
-	IDE_HFLAG_BROKEN_ALTSTATUS	= (1 << 26),
+	IDE_HFLAG_UNMASK_IRQS		= BIT(25),
+	IDE_HFLAG_BROKEN_ALTSTATUS	= BIT(26),
 	/* serialize ports if DMA is possible (for sl82c105) */
-	IDE_HFLAG_SERIALIZE_DMA		= (1 << 27),
+	IDE_HFLAG_SERIALIZE_DMA		= BIT(27),
 	/* force host out of "simplex" mode */
-	IDE_HFLAG_CLEAR_SIMPLEX		= (1 << 28),
+	IDE_HFLAG_CLEAR_SIMPLEX		= BIT(28),
 	/* DSC overlap is unsupported */
-	IDE_HFLAG_NO_DSC		= (1 << 29),
+	IDE_HFLAG_NO_DSC		= BIT(29),
 	/* never use 32-bit I/O ops */
-	IDE_HFLAG_NO_IO_32BIT		= (1 << 30),
+	IDE_HFLAG_NO_IO_32BIT		= BIT(30),
 	/* never unmask IRQs */
-	IDE_HFLAG_NO_UNMASK_IRQS	= (1 << 31),
+	IDE_HFLAG_NO_UNMASK_IRQS	= BIT(31),
 };
 
 #ifdef CONFIG_BLK_DEV_OFFBOARD
@@ -1536,16 +1536,16 @@ struct ide_timing {
 };
 
 enum {
-	IDE_TIMING_SETUP	= (1 << 0),
-	IDE_TIMING_ACT8B	= (1 << 1),
-	IDE_TIMING_REC8B	= (1 << 2),
-	IDE_TIMING_CYC8B	= (1 << 3),
+	IDE_TIMING_SETUP	= BIT(0),
+	IDE_TIMING_ACT8B	= BIT(1),
+	IDE_TIMING_REC8B	= BIT(2),
+	IDE_TIMING_CYC8B	= BIT(3),
 	IDE_TIMING_8BIT		= IDE_TIMING_ACT8B | IDE_TIMING_REC8B |
 				  IDE_TIMING_CYC8B,
-	IDE_TIMING_ACTIVE	= (1 << 4),
-	IDE_TIMING_RECOVER	= (1 << 5),
-	IDE_TIMING_CYCLE	= (1 << 6),
-	IDE_TIMING_UDMA		= (1 << 7),
+	IDE_TIMING_ACTIVE	= BIT(4),
+	IDE_TIMING_RECOVER	= BIT(5),
+	IDE_TIMING_CYCLE	= BIT(6),
+	IDE_TIMING_UDMA		= BIT(7),
 	IDE_TIMING_ALL		= IDE_TIMING_SETUP | IDE_TIMING_8BIT |
 				  IDE_TIMING_ACTIVE | IDE_TIMING_RECOVER |
 				  IDE_TIMING_CYCLE | IDE_TIMING_UDMA,
-- 
cgit v1.2.3


From 79a986721decf428ba539e6e2c941c987acce655 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 25 Jun 2019 11:20:42 +0200
Subject: dma-mapping: remove dma_max_pfn

These days, the DMA mapping code must bounce buffers for any unsupported
address. If the driver needs to optimize for natively supported ranges,
then it should use dma_get_required_mask.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Acked-by: Marc Gonzalez <marc.w.gonzalez@free.fr>
Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
---
 arch/arm/include/asm/dma-mapping.h | 7 -------
 include/linux/dma-mapping.h        | 7 -------
 2 files changed, 14 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm/include/asm/dma-mapping.h b/arch/arm/include/asm/dma-mapping.h
index 03ba90ffc0f8..7e0486ad1318 100644
--- a/arch/arm/include/asm/dma-mapping.h
+++ b/arch/arm/include/asm/dma-mapping.h
@@ -89,13 +89,6 @@ static inline dma_addr_t virt_to_dma(struct device *dev, void *addr)
 }
 #endif
 
-/* The ARM override for dma_max_pfn() */
-static inline unsigned long dma_max_pfn(struct device *dev)
-{
-	return dma_to_pfn(dev, *dev->dma_mask);
-}
-#define dma_max_pfn(dev) dma_max_pfn(dev)
-
 /* do not use this function in a driver */
 static inline bool is_device_dma_coherent(struct device *dev)
 {
diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h
index 6309a721394b..8d13e28a8e07 100644
--- a/include/linux/dma-mapping.h
+++ b/include/linux/dma-mapping.h
@@ -729,13 +729,6 @@ static inline int dma_set_seg_boundary(struct device *dev, unsigned long mask)
 	return -EIO;
 }
 
-#ifndef dma_max_pfn
-static inline unsigned long dma_max_pfn(struct device *dev)
-{
-	return (*dev->dma_mask >> PAGE_SHIFT) + dev->dma_pfn_offset;
-}
-#endif
-
 static inline int dma_get_cache_alignment(void)
 {
 #ifdef ARCH_DMA_MINALIGN
-- 
cgit v1.2.3


From cdc238eb72f6b94b6c33b98c07b9fc3ac5e57b18 Mon Sep 17 00:00:00 2001
From: Yi Wang <wang.yi59@zte.com.cn>
Date: Wed, 10 Jul 2019 08:24:03 +0800
Subject: kvm: x86: Fix -Wmissing-prototypes warnings
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

We get a warning when building the kernel with W=1:

arch/x86/kvm/../../../virt/kvm/eventfd.c:48:1: warning: no previous prototype for ‘kvm_arch_irqfd_allowed’ [-Wmissing-prototypes]
 kvm_arch_irqfd_allowed(struct kvm *kvm, struct kvm_irqfd *args)
 ^

The reason is that kvm_arch_irqfd_allowed() is declared in
arch/x86/kvm/irq.h, which is not included by eventfd.c. Since
kvm_arch_irqfd_allowed() is a weakly defined function in eventfd.c,
moving the declaration to kvm_host.h fixes this.

Signed-off-by: Yi Wang <wang.yi59@zte.com.cn>
Reviewed-by: Sean Christopherson <sean.j.christopherson@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/irq.h       | 1 -
 include/linux/kvm_host.h | 1 +
 2 files changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h
index fd210cdd4983..d5005cc26521 100644
--- a/arch/x86/kvm/irq.h
+++ b/arch/x86/kvm/irq.h
@@ -114,7 +114,6 @@ static inline int irqchip_in_kernel(struct kvm *kvm)
 	return mode != KVM_IRQCHIP_NONE;
 }
 
-bool kvm_arch_irqfd_allowed(struct kvm *kvm, struct kvm_irqfd *args);
 void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu);
 void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu);
 void kvm_apic_nmi_wd_deliver(struct kvm_vcpu *vcpu);
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index abafddb9fe2c..b91829ee3db1 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -993,6 +993,7 @@ void kvm_unregister_irq_ack_notifier(struct kvm *kvm,
 				   struct kvm_irq_ack_notifier *kian);
 int kvm_request_irq_source_id(struct kvm *kvm);
 void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id);
+bool kvm_arch_irqfd_allowed(struct kvm *kvm, struct kvm_irqfd *args);
 
 /*
  * search_memslots() and __gfn_to_memslot() are here because they are
-- 
cgit v1.2.3


From 9b0eb69b75bccada2d341d7e7ca342f0cb1c9a6a Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 27 Jun 2019 13:39:48 -0700
Subject: cgroup, blkcg: Prepare some symbols for module and !CONFIG_CGROUP
 usages

btrfs is going to use css_put() and wbc helpers to improve cgroup
writeback support.  Add dummy css_get() definition and export wbc
helpers to prepare for module and !CONFIG_CGROUP builds.

Reported-by: kbuild test robot <lkp@intel.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-cgroup.c     | 1 +
 fs/fs-writeback.c      | 3 +++
 include/linux/cgroup.h | 1 +
 3 files changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 8afa52b0d148..ad7a91dec934 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -48,6 +48,7 @@ struct blkcg blkcg_root;
 EXPORT_SYMBOL_GPL(blkcg_root);
 
 struct cgroup_subsys_state * const blkcg_root_css = &blkcg_root.css;
+EXPORT_SYMBOL_GPL(blkcg_root_css);
 
 static struct blkcg_policy *blkcg_policy[BLKCG_MAX_POLS];
 
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 9ebfb1b28430..a8a40bc26c2f 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -270,6 +270,7 @@ void __inode_attach_wb(struct inode *inode, struct page *page)
 	if (unlikely(cmpxchg(&inode->i_wb, NULL, wb)))
 		wb_put(wb);
 }
+EXPORT_SYMBOL_GPL(__inode_attach_wb);
 
 /**
  * locked_inode_to_wb_and_lock_list - determine a locked inode's wb and lock it
@@ -582,6 +583,7 @@ void wbc_attach_and_unlock_inode(struct writeback_control *wbc,
 	if (unlikely(wb_dying(wbc->wb)))
 		inode_switch_wbs(inode, wbc->wb_id);
 }
+EXPORT_SYMBOL_GPL(wbc_attach_and_unlock_inode);
 
 /**
  * wbc_detach_inode - disassociate wbc from inode and perform foreign detection
@@ -701,6 +703,7 @@ void wbc_detach_inode(struct writeback_control *wbc)
 	wb_put(wbc->wb);
 	wbc->wb = NULL;
 }
+EXPORT_SYMBOL_GPL(wbc_detach_inode);
 
 /**
  * wbc_account_io - account IO issued during writeback
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 3745ecdad925..852d885df10a 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -699,6 +699,7 @@ void cgroup_path_from_kernfs_id(const union kernfs_node_id *id,
 struct cgroup_subsys_state;
 struct cgroup;
 
+static inline void css_get(struct cgroup_subsys_state *css) {}
 static inline void css_put(struct cgroup_subsys_state *css) {}
 static inline int cgroup_attach_task_all(struct task_struct *from,
 					 struct task_struct *t) { return 0; }
-- 
cgit v1.2.3


From 34e51a5e1a6e939ed7d99c38173821ab86d577f4 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 27 Jun 2019 13:39:49 -0700
Subject: blkcg, writeback: Rename wbc_account_io() to
 wbc_account_cgroup_owner()

wbc_account_io() does a very specific job - try to see which cgroup is
actually dirtying an inode and transfer its ownership to the majority
dirtier if needed.  The name is too generic and confusing.  Let's
rename it to something more specific.

Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 Documentation/admin-guide/cgroup-v2.rst | 2 +-
 fs/btrfs/extent_io.c                    | 4 ++--
 fs/buffer.c                             | 2 +-
 fs/ext4/page-io.c                       | 2 +-
 fs/f2fs/data.c                          | 4 ++--
 fs/fs-writeback.c                       | 8 ++++----
 fs/mpage.c                              | 2 +-
 include/linux/writeback.h               | 8 ++++----
 8 files changed, 16 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
index a5c845338d6d..6223f485f7e1 100644
--- a/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@ -2114,7 +2114,7 @@ following two functions.
 	a queue (device) has been associated with the bio and
 	before submission.
 
-  wbc_account_io(@wbc, @page, @bytes)
+  wbc_account_cgroup_owner(@wbc, @page, @bytes)
 	Should be called for each data segment being written out.
 	While this function doesn't care exactly when it's called
 	during the writeback session, it's the easiest and most
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index db337e53aab3..5106008f5e28 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2911,7 +2911,7 @@ static int submit_extent_page(unsigned int opf, struct extent_io_tree *tree,
 			bio = NULL;
 		} else {
 			if (wbc)
-				wbc_account_io(wbc, page, page_size);
+				wbc_account_cgroup_owner(wbc, page, page_size);
 			return 0;
 		}
 	}
@@ -2924,7 +2924,7 @@ static int submit_extent_page(unsigned int opf, struct extent_io_tree *tree,
 	bio->bi_opf = opf;
 	if (wbc) {
 		wbc_init_bio(wbc, bio);
-		wbc_account_io(wbc, page, page_size);
+		wbc_account_cgroup_owner(wbc, page, page_size);
 	}
 
 	*bio_ret = bio;
diff --git a/fs/buffer.c b/fs/buffer.c
index e450c55f6434..40547bbbea94 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -3093,7 +3093,7 @@ static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh,
 
 	if (wbc) {
 		wbc_init_bio(wbc, bio);
-		wbc_account_io(wbc, bh->b_page, bh->b_size);
+		wbc_account_cgroup_owner(wbc, bh->b_page, bh->b_size);
 	}
 
 	submit_bio(bio);
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 4690618a92e9..56e287f5ee50 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -404,7 +404,7 @@ submit_and_retry:
 	ret = bio_add_page(io->io_bio, page, bh->b_size, bh_offset(bh));
 	if (ret != bh->b_size)
 		goto submit_and_retry;
-	wbc_account_io(io->io_wbc, page, bh->b_size);
+	wbc_account_cgroup_owner(io->io_wbc, page, bh->b_size);
 	io->io_next_block++;
 	return 0;
 }
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index eda4181d2092..e1cab1717ac7 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -470,7 +470,7 @@ int f2fs_submit_page_bio(struct f2fs_io_info *fio)
 	}
 
 	if (fio->io_wbc && !is_read_io(fio->op))
-		wbc_account_io(fio->io_wbc, page, PAGE_SIZE);
+		wbc_account_cgroup_owner(fio->io_wbc, page, PAGE_SIZE);
 
 	bio_set_op_attrs(bio, fio->op, fio->op_flags);
 
@@ -537,7 +537,7 @@ alloc_new:
 	}
 
 	if (fio->io_wbc)
-		wbc_account_io(fio->io_wbc, bio_page, PAGE_SIZE);
+		wbc_account_cgroup_owner(fio->io_wbc, bio_page, PAGE_SIZE);
 
 	io->last_block_in_bio = fio->new_blkaddr;
 	f2fs_trace_ios(fio, 0);
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index a8a40bc26c2f..0aef79e934bb 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -706,7 +706,7 @@ void wbc_detach_inode(struct writeback_control *wbc)
 EXPORT_SYMBOL_GPL(wbc_detach_inode);
 
 /**
- * wbc_account_io - account IO issued during writeback
+ * wbc_account_cgroup_owner - account writeback to update inode cgroup ownership
  * @wbc: writeback_control of the writeback in progress
  * @page: page being written out
  * @bytes: number of bytes being written out
@@ -715,8 +715,8 @@ EXPORT_SYMBOL_GPL(wbc_detach_inode);
  * controlled by @wbc.  Keep the book for foreign inode detection.  See
  * wbc_detach_inode().
  */
-void wbc_account_io(struct writeback_control *wbc, struct page *page,
-		    size_t bytes)
+void wbc_account_cgroup_owner(struct writeback_control *wbc, struct page *page,
+			      size_t bytes)
 {
 	struct cgroup_subsys_state *css;
 	int id;
@@ -753,7 +753,7 @@ void wbc_account_io(struct writeback_control *wbc, struct page *page,
 	else
 		wbc->wb_tcand_bytes -= min(bytes, wbc->wb_tcand_bytes);
 }
-EXPORT_SYMBOL_GPL(wbc_account_io);
+EXPORT_SYMBOL_GPL(wbc_account_cgroup_owner);
 
 /**
  * inode_congested - test whether an inode is congested
diff --git a/fs/mpage.c b/fs/mpage.c
index 436a85260394..a63620cdb73a 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -647,7 +647,7 @@ alloc_new:
 	 * the confused fail path above (OOM) will be very confused when
 	 * it finds all bh marked clean (i.e. it will not write anything)
 	 */
-	wbc_account_io(wbc, page, PAGE_SIZE);
+	wbc_account_cgroup_owner(wbc, page, PAGE_SIZE);
 	length = first_unmapped << blkbits;
 	if (bio_add_page(bio, page, length, 0) < length) {
 		bio = mpage_bio_submit(REQ_OP_WRITE, op_flags, bio);
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index 738a0c24874f..dda5cf228172 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -188,8 +188,8 @@ void wbc_attach_and_unlock_inode(struct writeback_control *wbc,
 				 struct inode *inode)
 	__releases(&inode->i_lock);
 void wbc_detach_inode(struct writeback_control *wbc);
-void wbc_account_io(struct writeback_control *wbc, struct page *page,
-		    size_t bytes);
+void wbc_account_cgroup_owner(struct writeback_control *wbc, struct page *page,
+			      size_t bytes);
 void cgroup_writeback_umount(void);
 
 /**
@@ -291,8 +291,8 @@ static inline void wbc_init_bio(struct writeback_control *wbc, struct bio *bio)
 {
 }
 
-static inline void wbc_account_io(struct writeback_control *wbc,
-				  struct page *page, size_t bytes)
+static inline void wbc_account_cgroup_owner(struct writeback_control *wbc,
+					    struct page *page, size_t bytes)
 {
 }
 
-- 
cgit v1.2.3


From 27b36d8fa81fa8274fb72f4eb1484026f6b6daa8 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 27 Jun 2019 13:39:50 -0700
Subject: blkcg, writeback: Add wbc->no_cgroup_owner

When writeback IOs are bounced through async layers, the IOs should
only be accounted against the wbc from the original bdi writeback to
avoid confusing cgroup inode ownership arbitration.  Add
wbc->no_cgroup_owner to allow disabling wbc cgroup owner accounting.
This will be used to make btrfs compression work well with cgroup IO
control.

v2: Renamed from no_wbc_acct to no_cgroup_owner and added comment as
    per Jan.

Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/fs-writeback.c         | 2 +-
 include/linux/writeback.h | 9 +++++++++
 2 files changed, 10 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 0aef79e934bb..542b02d170f8 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -727,7 +727,7 @@ void wbc_account_cgroup_owner(struct writeback_control *wbc, struct page *page,
 	 * behind a slow cgroup.  Ultimately, we want pageout() to kick off
 	 * regular writeback instead of writing things out itself.
 	 */
-	if (!wbc->wb)
+	if (!wbc->wb || wbc->no_cgroup_owner)
 		return;
 
 	css = mem_cgroup_css_from_page(page);
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index dda5cf228172..33a50fa09fac 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -68,6 +68,15 @@ struct writeback_control {
 	unsigned for_reclaim:1;		/* Invoked from the page allocator */
 	unsigned range_cyclic:1;	/* range_start is cyclic */
 	unsigned for_sync:1;		/* sync(2) WB_SYNC_ALL writeback */
+
+	/*
+	 * When writeback IOs are bounced through async layers, only the
+	 * initial synchronous phase should be accounted towards inode
+	 * cgroup ownership arbitration to avoid confusion.  Later stages
+	 * can set the following flag to disable the accounting.
+	 */
+	unsigned no_cgroup_owner:1;
+
 #ifdef CONFIG_CGROUP_WRITEBACK
 	struct bdi_writeback *wb;	/* wb this writeback is issued under */
 	struct inode *inode;		/* inode being written out */
-- 
cgit v1.2.3


From 653c45c6b90c9659facbef10546d1f3a8e37d0cf Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 27 Jun 2019 13:39:51 -0700
Subject: blkcg, writeback: Implement wbc_blkcg_css()

Add a helper to determine the target blkcg from wbc.

Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/writeback.h | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index 33a50fa09fac..e056a22075cf 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -11,6 +11,7 @@
 #include <linux/flex_proportions.h>
 #include <linux/backing-dev-defs.h>
 #include <linux/blk_types.h>
+#include <linux/blk-cgroup.h>
 
 struct bio;
 
@@ -101,6 +102,16 @@ static inline int wbc_to_write_flags(struct writeback_control *wbc)
 	return 0;
 }
 
+static inline struct cgroup_subsys_state *
+wbc_blkcg_css(struct writeback_control *wbc)
+{
+#ifdef CONFIG_CGROUP_WRITEBACK
+	if (wbc->wb)
+		return wbc->wb->blkcg_css;
+#endif
+	return blkcg_root_css;
+}
+
 /*
  * A wb_domain represents a domain that wb's (bdi_writeback's) belong to
  * and are measured against each other in.  There always is one global
-- 
cgit v1.2.3


From d3f77dfdc71835f8db71ca57d272b1fbec9dfc18 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 27 Jun 2019 13:39:52 -0700
Subject: blkcg: implement REQ_CGROUP_PUNT

When a shared kthread needs to issue a bio for a cgroup, doing so
synchronously can lead to priority inversions as the kthread can be
trapped waiting for that cgroup.  This patch implements
REQ_CGROUP_PUNT flag which makes submit_bio() punt the actual issuing
to a dedicated per-blkcg work item to avoid such priority inversions.

This will be used to fix priority inversions in btrfs compression and
should be generally useful as we grow filesystem support for
comprehensive IO control.

Cc: Chris Mason <clm@fb.com>
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-cgroup.c          | 53 +++++++++++++++++++++++++++++++++++++++++++++
 block/blk-core.c            |  3 +++
 include/linux/backing-dev.h |  1 +
 include/linux/blk-cgroup.h  | 16 +++++++++++++-
 include/linux/blk_types.h   | 10 +++++++++
 include/linux/writeback.h   | 13 ++++++++---
 6 files changed, 92 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index ad7a91dec934..24ed26957367 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -55,6 +55,7 @@ static struct blkcg_policy *blkcg_policy[BLKCG_MAX_POLS];
 static LIST_HEAD(all_blkcgs);		/* protected by blkcg_pol_mutex */
 
 static bool blkcg_debug_stats = false;
+static struct workqueue_struct *blkcg_punt_bio_wq;
 
 static bool blkcg_policy_enabled(struct request_queue *q,
 				 const struct blkcg_policy *pol)
@@ -89,6 +90,8 @@ static void __blkg_release(struct rcu_head *rcu)
 {
 	struct blkcg_gq *blkg = container_of(rcu, struct blkcg_gq, rcu_head);
 
+	WARN_ON(!bio_list_empty(&blkg->async_bios));
+
 	/* release the blkcg and parent blkg refs this blkg has been holding */
 	css_put(&blkg->blkcg->css);
 	if (blkg->parent)
@@ -114,6 +117,23 @@ static void blkg_release(struct percpu_ref *ref)
 	call_rcu(&blkg->rcu_head, __blkg_release);
 }
 
+static void blkg_async_bio_workfn(struct work_struct *work)
+{
+	struct blkcg_gq *blkg = container_of(work, struct blkcg_gq,
+					     async_bio_work);
+	struct bio_list bios = BIO_EMPTY_LIST;
+	struct bio *bio;
+
+	/* as long as there are pending bios, @blkg can't go away */
+	spin_lock_bh(&blkg->async_bio_lock);
+	bio_list_merge(&bios, &blkg->async_bios);
+	bio_list_init(&blkg->async_bios);
+	spin_unlock_bh(&blkg->async_bio_lock);
+
+	while ((bio = bio_list_pop(&bios)))
+		submit_bio(bio);
+}
+
 /**
  * blkg_alloc - allocate a blkg
  * @blkcg: block cgroup the new blkg is associated with
@@ -142,6 +162,9 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q,
 
 	blkg->q = q;
 	INIT_LIST_HEAD(&blkg->q_node);
+	spin_lock_init(&blkg->async_bio_lock);
+	bio_list_init(&blkg->async_bios);
+	INIT_WORK(&blkg->async_bio_work, blkg_async_bio_workfn);
 	blkg->blkcg = blkcg;
 
 	for (i = 0; i < BLKCG_MAX_POLS; i++) {
@@ -1528,6 +1551,25 @@ out_unlock:
 }
 EXPORT_SYMBOL_GPL(blkcg_policy_unregister);
 
+bool __blkcg_punt_bio_submit(struct bio *bio)
+{
+	struct blkcg_gq *blkg = bio->bi_blkg;
+
+	/* consume the flag first */
+	bio->bi_opf &= ~REQ_CGROUP_PUNT;
+
+	/* never bounce for the root cgroup */
+	if (!blkg->parent)
+		return false;
+
+	spin_lock_bh(&blkg->async_bio_lock);
+	bio_list_add(&blkg->async_bios, bio);
+	spin_unlock_bh(&blkg->async_bio_lock);
+
+	queue_work(blkcg_punt_bio_wq, &blkg->async_bio_work);
+	return true;
+}
+
 /*
  * Scale the accumulated delay based on how long it has been since we updated
  * the delay.  We only call this when we are adding delay, in case it's been a
@@ -1729,5 +1771,16 @@ void blkcg_add_delay(struct blkcg_gq *blkg, u64 now, u64 delta)
 	atomic64_add(delta, &blkg->delay_nsec);
 }
 
+static int __init blkcg_init(void)
+{
+	blkcg_punt_bio_wq = alloc_workqueue("blkcg_punt_bio",
+					    WQ_MEM_RECLAIM | WQ_FREEZABLE |
+					    WQ_UNBOUND | WQ_SYSFS, 0);
+	if (!blkcg_punt_bio_wq)
+		return -ENOMEM;
+	return 0;
+}
+subsys_initcall(blkcg_init);
+
 module_param(blkcg_debug_stats, bool, 0644);
 MODULE_PARM_DESC(blkcg_debug_stats, "True if you want debug stats, false if not");
diff --git a/block/blk-core.c b/block/blk-core.c
index edd009213f5b..260e36a2c343 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1128,6 +1128,9 @@ EXPORT_SYMBOL_GPL(direct_make_request);
  */
 blk_qc_t submit_bio(struct bio *bio)
 {
+	if (blkcg_punt_bio_submit(bio))
+		return BLK_QC_T_NONE;
+
 	/*
 	 * If it's a regular read/write or a barrier with data attached,
 	 * go through the normal accounting stuff before submission.
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index f9b029180241..35b31d176f74 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -48,6 +48,7 @@ extern spinlock_t bdi_lock;
 extern struct list_head bdi_list;
 
 extern struct workqueue_struct *bdi_wq;
+extern struct workqueue_struct *bdi_async_bio_wq;
 
 static inline bool wb_has_dirty_io(struct bdi_writeback *wb)
 {
diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h
index 33f23a858438..689a58231288 100644
--- a/include/linux/blk-cgroup.h
+++ b/include/linux/blk-cgroup.h
@@ -132,13 +132,17 @@ struct blkcg_gq {
 
 	struct blkg_policy_data		*pd[BLKCG_MAX_POLS];
 
-	struct rcu_head			rcu_head;
+	spinlock_t			async_bio_lock;
+	struct bio_list			async_bios;
+	struct work_struct		async_bio_work;
 
 	atomic_t			use_delay;
 	atomic64_t			delay_nsec;
 	atomic64_t			delay_start;
 	u64				last_delay;
 	int				last_use;
+
+	struct rcu_head			rcu_head;
 };
 
 typedef struct blkcg_policy_data *(blkcg_pol_alloc_cpd_fn)(gfp_t gfp);
@@ -701,6 +705,15 @@ static inline bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg
 				  struct bio *bio) { return false; }
 #endif
 
+bool __blkcg_punt_bio_submit(struct bio *bio);
+
+static inline bool blkcg_punt_bio_submit(struct bio *bio)
+{
+	if (bio->bi_opf & REQ_CGROUP_PUNT)
+		return __blkcg_punt_bio_submit(bio);
+	else
+		return false;
+}
 
 static inline void blkcg_bio_issue_init(struct bio *bio)
 {
@@ -848,6 +861,7 @@ static inline char *blkg_path(struct blkcg_gq *blkg) { return NULL; }
 static inline void blkg_get(struct blkcg_gq *blkg) { }
 static inline void blkg_put(struct blkcg_gq *blkg) { }
 
+static inline bool blkcg_punt_bio_submit(struct bio *bio) { return false; }
 static inline void blkcg_bio_issue_init(struct bio *bio) { }
 static inline bool blkcg_bio_issue_check(struct request_queue *q,
 					 struct bio *bio) { return true; }
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 6a53799c3fe2..feff3fe4467e 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -311,6 +311,14 @@ enum req_flag_bits {
 	__REQ_RAHEAD,		/* read ahead, can fail anytime */
 	__REQ_BACKGROUND,	/* background IO */
 	__REQ_NOWAIT,           /* Don't wait if request will block */
+	/*
+	 * When a shared kthread needs to issue a bio for a cgroup, doing
+	 * so synchronously can lead to priority inversions as the kthread
+	 * can be trapped waiting for that cgroup.  CGROUP_PUNT flag makes
+	 * submit_bio() punt the actual issuing to a dedicated per-blkcg
+	 * work item to avoid such priority inversions.
+	 */
+	__REQ_CGROUP_PUNT,
 
 	/* command specific flags for REQ_OP_WRITE_ZEROES: */
 	__REQ_NOUNMAP,		/* do not free blocks when zeroing */
@@ -337,6 +345,8 @@ enum req_flag_bits {
 #define REQ_RAHEAD		(1ULL << __REQ_RAHEAD)
 #define REQ_BACKGROUND		(1ULL << __REQ_BACKGROUND)
 #define REQ_NOWAIT		(1ULL << __REQ_NOWAIT)
+#define REQ_CGROUP_PUNT		(1ULL << __REQ_CGROUP_PUNT)
+
 #define REQ_NOUNMAP		(1ULL << __REQ_NOUNMAP)
 #define REQ_HIPRI		(1ULL << __REQ_HIPRI)
 
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index e056a22075cf..8945aac31392 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -78,6 +78,8 @@ struct writeback_control {
 	 */
 	unsigned no_cgroup_owner:1;
 
+	unsigned punt_to_cgroup:1;	/* cgrp punting, see __REQ_CGROUP_PUNT */
+
 #ifdef CONFIG_CGROUP_WRITEBACK
 	struct bdi_writeback *wb;	/* wb this writeback is issued under */
 	struct inode *inode;		/* inode being written out */
@@ -94,12 +96,17 @@ struct writeback_control {
 
 static inline int wbc_to_write_flags(struct writeback_control *wbc)
 {
+	int flags = 0;
+
+	if (wbc->punt_to_cgroup)
+		flags = REQ_CGROUP_PUNT;
+
 	if (wbc->sync_mode == WB_SYNC_ALL)
-		return REQ_SYNC;
+		flags |= REQ_SYNC;
 	else if (wbc->for_kupdate || wbc->for_background)
-		return REQ_BACKGROUND;
+		flags |= REQ_BACKGROUND;
 
-	return 0;
+	return flags;
 }
 
 static inline struct cgroup_subsys_state *
-- 
cgit v1.2.3


From 113ab72ed4794c193509a97d7c6d32a6886e1682 Mon Sep 17 00:00:00 2001
From: Damien Le Moal <damien.lemoal@wdc.com>
Date: Wed, 10 Jul 2019 13:53:10 +0900
Subject: block: Fix potential overflow in blk_report_zones()

For large values of the number of zones reported and/or large zone
sizes, the sector increment calculated with

blk_queue_zone_sectors(q) * n

in blk_report_zones() loop can overflow the unsigned int type used for
the calculation as both "n" and blk_queue_zone_sectors() value are
unsigned int. E.g. for a device with 256 MB zones (524288 sectors),
overflow happens with 8192 or more zones reported.

Changing the return type of blk_queue_zone_sectors() to sector_t fixes
this problem and avoids overflow problems for all other callers of this
helper too. The same change is also applied to the bdev_zone_sectors()
helper.

Fixes: e76239a3748c ("block: add a report_zones method")
Cc: stable@vger.kernel.org
Signed-off-by: Damien Le Moal <damien.lemoal@wdc.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-zoned.c      | 2 +-
 include/linux/blkdev.h | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk-zoned.c b/block/blk-zoned.c
index ae7e91bd0618..3249738242b4 100644
--- a/block/blk-zoned.c
+++ b/block/blk-zoned.c
@@ -70,7 +70,7 @@ EXPORT_SYMBOL_GPL(__blk_req_zone_write_unlock);
 static inline unsigned int __blkdev_nr_zones(struct request_queue *q,
 					     sector_t nr_sectors)
 {
-	unsigned long zone_sectors = blk_queue_zone_sectors(q);
+	sector_t zone_sectors = blk_queue_zone_sectors(q);
 
 	return (nr_sectors + zone_sectors - 1) >> ilog2(zone_sectors);
 }
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 0c482371c8b3..259bd7ad8312 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -681,7 +681,7 @@ static inline bool blk_queue_is_zoned(struct request_queue *q)
 	}
 }
 
-static inline unsigned int blk_queue_zone_sectors(struct request_queue *q)
+static inline sector_t blk_queue_zone_sectors(struct request_queue *q)
 {
 	return blk_queue_is_zoned(q) ? q->limits.chunk_sectors : 0;
 }
@@ -1418,7 +1418,7 @@ static inline bool bdev_is_zoned(struct block_device *bdev)
 	return false;
 }
 
-static inline unsigned int bdev_zone_sectors(struct block_device *bdev)
+static inline sector_t bdev_zone_sectors(struct block_device *bdev)
 {
 	struct request_queue *q = bdev_get_queue(bdev);
 
-- 
cgit v1.2.3


From 36847a005489cfb74dc6388952da73346f867dca Mon Sep 17 00:00:00 2001
From: Damien Le Moal <damien.lemoal@wdc.com>
Date: Thu, 11 Jul 2019 00:56:08 +0900
Subject: block: Remove unused definitions

The ELV_MQUEUE_XXX definitions in include/linux/elevator.h are unused
since the removal of elevator_may_queue_fn in kernel 5.0. Remove these
definitions and also remove the documentation of elevator_may_queue_fn
in Documentation/block/biodoc.txt.

Acked-by: Marcos Paulo de Souza <marcos.souza.org@gmail.com>
Signed-off-by: Damien Le Moal <damien.lemoal@wdc.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 Documentation/block/biodoc.txt | 5 -----
 include/linux/elevator.h       | 9 ---------
 2 files changed, 14 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/block/biodoc.txt b/Documentation/block/biodoc.txt
index 31c177663ed5..5a4a799fe61b 100644
--- a/Documentation/block/biodoc.txt
+++ b/Documentation/block/biodoc.txt
@@ -843,11 +843,6 @@ elevator_latter_req_fn		These return the request before or after the
 
 elevator_completed_req_fn	called when a request is completed.
 
-elevator_may_queue_fn		returns true if the scheduler wants to allow the
-				current context to queue a new request even if
-				it is over the queue limit. This must be used
-				very carefully!!
-
 elevator_set_req_fn
 elevator_put_req_fn		Must be used to allocate and free any elevator
 				specific storage for a request.
diff --git a/include/linux/elevator.h b/include/linux/elevator.h
index 169bb2e02516..38590c30a11d 100644
--- a/include/linux/elevator.h
+++ b/include/linux/elevator.h
@@ -160,15 +160,6 @@ extern struct request *elv_rb_find(struct rb_root *, sector_t);
 #define ELEVATOR_INSERT_FLUSH	5
 #define ELEVATOR_INSERT_SORT_MERGE	6
 
-/*
- * return values from elevator_may_queue_fn
- */
-enum {
-	ELV_MQUEUE_MAY,
-	ELV_MQUEUE_NO,
-	ELV_MQUEUE_MUST,
-};
-
 #define rq_end_sector(rq)	(blk_rq_pos(rq) + blk_rq_sectors(rq))
 #define rb_entry_rq(node)	rb_entry((node), struct request, rb_node)
 
-- 
cgit v1.2.3


From 9305d5d721f2bd5e2eeb670035159b560ca211ca Mon Sep 17 00:00:00 2001
From: Damien Le Moal <damien.lemoal@wdc.com>
Date: Thu, 11 Jul 2019 00:57:41 +0900
Subject: block: Fix elevator name declaration

The elevator_name field in struct elevator_type is declared as an array
of characters (ELV_NAME_MAX size) but in practice used as a string
pointer with its initialization done statically within each
elevator elevator_type structure declaration.

Change the declaration of elevator_name to the more appropriate
"const char *" type.

Acked-by: Marcos Paulo de Souza <marcos.souza.org@gmail.com>
Signed-off-by: Damien Le Moal <damien.lemoal@wdc.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/elevator.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/elevator.h b/include/linux/elevator.h
index 38590c30a11d..17cd0078377c 100644
--- a/include/linux/elevator.h
+++ b/include/linux/elevator.h
@@ -75,7 +75,7 @@ struct elevator_type
 	size_t icq_size;	/* see iocontext.h */
 	size_t icq_align;	/* ditto */
 	struct elv_fs_entry *elevator_attrs;
-	char elevator_name[ELV_NAME_MAX];
+	const char *elevator_name;
 	const char *elevator_alias;
 	struct module *elevator_owner;
 #ifdef CONFIG_BLK_DEBUG_FS
-- 
cgit v1.2.3


From 028db3e290f15ac509084c0fc3b9d021f668f877 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Wed, 10 Jul 2019 18:43:43 -0700
Subject: Revert "Merge tag 'keys-acl-20190703' of
 git://git.kernel.org/pub/scm/linux/kernel/git/dhowells/linux-fs"

This reverts merge 0f75ef6a9cff49ff612f7ce0578bced9d0b38325 (and thus
effectively commits

   7a1ade847596 ("keys: Provide KEYCTL_GRANT_PERMISSION")
   2e12256b9a76 ("keys: Replace uid/gid/perm permissions checking with an ACL")

that the merge brought in).

It turns out that it breaks booting with an encrypted volume, and Eric
Biggers reports that it also breaks the fscrypt tests [1] and loading of
in-kernel X.509 certificates [2].

The root cause of all the breakage is likely the same, but David Howells
is off email so rather than try to work it out it's getting reverted in
order to not impact the rest of the merge window.

 [1] https://lore.kernel.org/lkml/20190710011559.GA7973@sol.localdomain/
 [2] https://lore.kernel.org/lkml/20190710013225.GB7973@sol.localdomain/

Link: https://lore.kernel.org/lkml/CAHk-=wjxoeMJfeBahnWH=9zShKp2bsVy527vo3_y8HfOdhwAAw@mail.gmail.com/
Reported-by: Eric Biggers <ebiggers@kernel.org>
Cc: David Howells <dhowells@redhat.com>
Cc: James Morris <jmorris@namei.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/security/keys/core.rst               | 128 ++------
 Documentation/security/keys/request-key.rst        |   9 +-
 certs/blacklist.c                                  |   7 +-
 certs/system_keyring.c                             |  12 +-
 drivers/md/dm-crypt.c                              |   2 +-
 drivers/nvdimm/security.c                          |   2 +-
 fs/afs/security.c                                  |   2 +-
 fs/cifs/cifs_spnego.c                              |  25 +-
 fs/cifs/cifsacl.c                                  |  28 +-
 fs/cifs/connect.c                                  |   4 +-
 fs/crypto/keyinfo.c                                |   2 +-
 fs/ecryptfs/ecryptfs_kernel.h                      |   2 +-
 fs/ecryptfs/keystore.c                             |   2 +-
 fs/fscache/object-list.c                           |   2 +-
 fs/nfs/nfs4idmap.c                                 |  30 +-
 fs/ubifs/auth.c                                    |   2 +-
 include/linux/key.h                                | 121 ++++---
 include/uapi/linux/keyctl.h                        |  65 ----
 lib/digsig.c                                       |   2 +-
 net/ceph/ceph_common.c                             |   2 +-
 net/dns_resolver/dns_key.c                         |  12 +-
 net/dns_resolver/dns_query.c                       |  15 +-
 net/rxrpc/key.c                                    |  19 +-
 net/wireless/reg.c                                 |   6 +-
 security/integrity/digsig.c                        |  31 +-
 security/integrity/digsig_asymmetric.c             |   2 +-
 security/integrity/evm/evm_crypto.c                |   2 +-
 security/integrity/ima/ima_mok.c                   |  13 +-
 security/integrity/integrity.h                     |   6 +-
 .../integrity/platform_certs/platform_keyring.c    |  14 +-
 security/keys/compat.c                             |   2 -
 security/keys/encrypted-keys/encrypted.c           |   2 +-
 security/keys/encrypted-keys/masterkey_trusted.c   |   2 +-
 security/keys/gc.c                                 |   2 +-
 security/keys/internal.h                           |  16 +-
 security/keys/key.c                                |  29 +-
 security/keys/keyctl.c                             | 104 ++----
 security/keys/keyring.c                            |  27 +-
 security/keys/permission.c                         | 361 ++-------------------
 security/keys/persistent.c                         |  27 +-
 security/keys/proc.c                               |  22 +-
 security/keys/process_keys.c                       |  86 ++---
 security/keys/request_key.c                        |  34 +-
 security/keys/request_key_auth.c                   |  15 +-
 security/selinux/hooks.c                           |  16 +-
 security/smack/smack_lsm.c                         |   3 +-
 46 files changed, 325 insertions(+), 992 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/security/keys/core.rst b/Documentation/security/keys/core.rst
index bc561ca95c86..d6d8b0b756b6 100644
--- a/Documentation/security/keys/core.rst
+++ b/Documentation/security/keys/core.rst
@@ -57,9 +57,9 @@ Each key has a number of attributes:
      type provides an operation to perform a match between the description on a
      key and a criterion string.
 
-  *  Each key has an owner user ID, a group ID and an ACL.  These are used to
-     control what a process may do to a key from userspace, and whether a
-     kernel service will be able to find the key.
+  *  Each key has an owner user ID, a group ID and a permissions mask. These
+     are used to control what a process may do to a key from userspace, and
+     whether a kernel service will be able to find the key.
 
   *  Each key can be set to expire at a specific time by the key type's
      instantiation function. Keys can also be immortal.
@@ -198,110 +198,43 @@ The key service provides a number of features besides keys:
 Key Access Permissions
 ======================
 
-Keys have an owner user ID, a group ID and an ACL.  The ACL is made up of a
-sequence of ACEs that each contain three elements:
+Keys have an owner user ID, a group access ID, and a permissions mask. The mask
+has up to eight bits each for possessor, user, group and other access. Only
+six of each set of eight bits are defined. These permissions granted are:
 
-  * The type of subject.
-  * The subject.
+  *  View
 
-    These two together indicate the subject to whom the permits are granted.
-    The type can be one of:
+     This permits a key or keyring's attributes to be viewed - including key
+     type and description.
 
-     * ``KEY_ACE_SUBJ_STANDARD``
+  *  Read
 
-       The subject is a standard 'macro' type.  The subject can be one of:
-
-        * ``KEY_ACE_EVERYONE``
-
-	  The permits are granted to everyone.  It replaces the old 'other'
-	  type on the assumption that you wouldn't grant a permission to other
-	  that you you wouldn't grant to everyone else.
-
-	* ``KEY_ACE_OWNER``
-
-	  The permits are granted to the owner of the key (key->uid).
-
-	* ``KEY_ACE_GROUP``
-
-	  The permits are granted to the key's group (key->gid).
-
-	* ``KEY_ACE_POSSESSOR``
-
-	  The permits are granted to anyone who possesses the key.
-
-  * The set of permits granted to the subject.  These include:
-
-     * ``KEY_ACE_VIEW``
-
-       This permits a key or keyring's attributes to be viewed - including the
-       key type and description.
-
-     * ``KEY_ACE_READ``
-
-       This permits a key's payload to be viewed or a keyring's list of linked
-       keys.
-
-     * ``KEY_ACE_WRITE``
-
-       This permits a key's payload to be instantiated or updated, or it allows
-       a link to be added to or removed from a keyring.
-
-     * ``KEY_ACE_SEARCH``
-
-       This permits keyrings to be searched and keys to be found. Searches can
-       only recurse into nested keyrings that have search permission set.
-
-     * ``KEY_ACE_LINK``
-
-       This permits a key or keyring to be linked to. To create a link from a
-       keyring to a key, a process must have Write permission on the keyring
-       and Link permission on the key.
-
-     * ``KEY_ACE_SET_SECURITY``
-
-       This permits a key's UID, GID and permissions mask to be changed.
+     This permits a key's payload to be viewed or a keyring's list of linked
+     keys.
 
-     * ``KEY_ACE_INVAL``
+  *  Write
 
-       This permits a key to be invalidated with KEYCTL_INVALIDATE.
+     This permits a key's payload to be instantiated or updated, or it allows a
+     link to be added to or removed from a keyring.
 
-     * ``KEY_ACE_REVOKE``
+  *  Search
 
-       This permits a key to be revoked with KEYCTL_REVOKE.
+     This permits keyrings to be searched and keys to be found. Searches can
+     only recurse into nested keyrings that have search permission set.
 
-     * ``KEY_ACE_JOIN``
+  *  Link
 
-       This permits a keyring to be joined as a session by
-       KEYCTL_JOIN_SESSION_KEYRING or KEYCTL_SESSION_TO_PARENT.
+     This permits a key or keyring to be linked to. To create a link from a
+     keyring to a key, a process must have Write permission on the keyring and
+     Link permission on the key.
 
-     * ``KEY_ACE_CLEAR``
+  *  Set Attribute
 
-       This permits a keyring to be cleared.
+     This permits a key's UID, GID and permissions mask to be changed.
 
 For changing the ownership, group ID or permissions mask, being the owner of
 the key or having the sysadmin capability is sufficient.
 
-The legacy KEYCTL_SETPERM and KEYCTL_DESCRIBE functions can only see/generate
-View, Read, Write, Search, Link and SetAttr permits, and do this for each of
-possessor, user, group and other permission sets as a 32-bit flag mask.  These
-will be approximated/inferred:
-
-	SETPERM Permit	Implied ACE Permit
-	===============	=======================
-	Search		Inval, Join
-	Write		Revoke, Clear
-	Setattr		Set Security, Revoke
-
-	ACE Permit	Described as
-	===============	=======================
-	Inval		Search
-	Join		Search
-	Revoke		Write (unless Setattr)
-	Clear		write
-	Set Security	Setattr
-
-'Other' will be approximated as/inferred from the 'Everyone' subject.
-
 
 SELinux Support
 ===============
@@ -1151,8 +1084,7 @@ payload contents" for more information.
 
 	struct key *request_key(const struct key_type *type,
 				const char *description,
-				const char *callout_info,
-				struct key_acl *acl);
+				const char *callout_info);
 
     This is used to request a key or keyring with a description that matches
     the description specified according to the key type's match_preparse()
@@ -1167,8 +1099,6 @@ payload contents" for more information.
     If successful, the key will have been attached to the default keyring for
     implicitly obtained request-key keys, as set by KEYCTL_SET_REQKEY_KEYRING.
 
-    If a key is created, it will be given the specified ACL.
-
     See also Documentation/security/keys/request-key.rst.
 
 
@@ -1177,8 +1107,7 @@ payload contents" for more information.
 	struct key *request_key_tag(const struct key_type *type,
 				    const char *description,
 				    struct key_tag *domain_tag,
-				    const char *callout_info,
-				    struct key_acl *acl);
+				    const char *callout_info);
 
     This is identical to request_key(), except that a domain tag may be
     specifies that causes search algorithm to only match keys matching that
@@ -1193,8 +1122,7 @@ payload contents" for more information.
 					     struct key_tag *domain_tag,
 					     const void *callout_info,
 					     size_t callout_len,
-					     void *aux,
-					     struct key_acl *acl);
+					     void *aux);
 
     This is identical to request_key_tag(), except that the auxiliary data is
     passed to the key_type->request_key() op if it exists, and the
@@ -1267,7 +1195,7 @@ payload contents" for more information.
 
 	struct key *keyring_alloc(const char *description, uid_t uid, gid_t gid,
 				  const struct cred *cred,
-				  struct key_acl *acl,
+				  key_perm_t perm,
 				  struct key_restriction *restrict_link,
 				  unsigned long flags,
 				  struct key *dest);
diff --git a/Documentation/security/keys/request-key.rst b/Documentation/security/keys/request-key.rst
index f356fd06c8d5..35f2296b704a 100644
--- a/Documentation/security/keys/request-key.rst
+++ b/Documentation/security/keys/request-key.rst
@@ -11,16 +11,14 @@ The process starts by either the kernel requesting a service by calling
 
 	struct key *request_key(const struct key_type *type,
 				const char *description,
-				const char *callout_info,
-				struct key_acl *acl);
+				const char *callout_info);
 
 or::
 
 	struct key *request_key_tag(const struct key_type *type,
 				    const char *description,
 				    const struct key_tag *domain_tag,
-				    const char *callout_info,
-				    struct key_acl *acl);
+				    const char *callout_info);
 
 or::
 
@@ -29,8 +27,7 @@ or::
 					     const struct key_tag *domain_tag,
 					     const char *callout_info,
 					     size_t callout_len,
-					     void *aux,
-					     struct key_acl *acl);
+					     void *aux);
 
 or::
 
diff --git a/certs/blacklist.c b/certs/blacklist.c
index 93d70b885f8e..ec00bf337eb6 100644
--- a/certs/blacklist.c
+++ b/certs/blacklist.c
@@ -89,7 +89,8 @@ int mark_hash_blacklisted(const char *hash)
 				   hash,
 				   NULL,
 				   0,
-				   &internal_key_acl,
+				   ((KEY_POS_ALL & ~KEY_POS_SETATTR) |
+				    KEY_USR_VIEW),
 				   KEY_ALLOC_NOT_IN_QUOTA |
 				   KEY_ALLOC_BUILT_IN);
 	if (IS_ERR(key)) {
@@ -148,7 +149,9 @@ static int __init blacklist_init(void)
 		keyring_alloc(".blacklist",
 			      KUIDT_INIT(0), KGIDT_INIT(0),
 			      current_cred(),
-			      &internal_keyring_acl,
+			      (KEY_POS_ALL & ~KEY_POS_SETATTR) |
+			      KEY_USR_VIEW | KEY_USR_READ |
+			      KEY_USR_SEARCH,
 			      KEY_ALLOC_NOT_IN_QUOTA |
 			      KEY_FLAG_KEEP,
 			      NULL, NULL);
diff --git a/certs/system_keyring.c b/certs/system_keyring.c
index 57be78b5fdfc..1eba08a1af82 100644
--- a/certs/system_keyring.c
+++ b/certs/system_keyring.c
@@ -99,7 +99,9 @@ static __init int system_trusted_keyring_init(void)
 	builtin_trusted_keys =
 		keyring_alloc(".builtin_trusted_keys",
 			      KUIDT_INIT(0), KGIDT_INIT(0), current_cred(),
-			      &internal_key_acl, KEY_ALLOC_NOT_IN_QUOTA,
+			      ((KEY_POS_ALL & ~KEY_POS_SETATTR) |
+			      KEY_USR_VIEW | KEY_USR_READ | KEY_USR_SEARCH),
+			      KEY_ALLOC_NOT_IN_QUOTA,
 			      NULL, NULL);
 	if (IS_ERR(builtin_trusted_keys))
 		panic("Can't allocate builtin trusted keyring\n");
@@ -108,7 +110,10 @@ static __init int system_trusted_keyring_init(void)
 	secondary_trusted_keys =
 		keyring_alloc(".secondary_trusted_keys",
 			      KUIDT_INIT(0), KGIDT_INIT(0), current_cred(),
-			      &internal_writable_keyring_acl, KEY_ALLOC_NOT_IN_QUOTA,
+			      ((KEY_POS_ALL & ~KEY_POS_SETATTR) |
+			       KEY_USR_VIEW | KEY_USR_READ | KEY_USR_SEARCH |
+			       KEY_USR_WRITE),
+			      KEY_ALLOC_NOT_IN_QUOTA,
 			      get_builtin_and_secondary_restriction(),
 			      NULL);
 	if (IS_ERR(secondary_trusted_keys))
@@ -158,7 +163,8 @@ static __init int load_system_certificate_list(void)
 					   NULL,
 					   p,
 					   plen,
-					   &internal_key_acl,
+					   ((KEY_POS_ALL & ~KEY_POS_SETATTR) |
+					   KEY_USR_VIEW | KEY_USR_READ),
 					   KEY_ALLOC_NOT_IN_QUOTA |
 					   KEY_ALLOC_BUILT_IN |
 					   KEY_ALLOC_BYPASS_RESTRICTION);
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 0fd3ca9bfe54..1b16d34bb785 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -2035,7 +2035,7 @@ static int crypt_set_keyring_key(struct crypt_config *cc, const char *key_string
 		return -ENOMEM;
 
 	key = request_key(key_string[0] == 'l' ? &key_type_logon : &key_type_user,
-			  key_desc + 1, NULL, NULL);
+			  key_desc + 1, NULL);
 	if (IS_ERR(key)) {
 		kzfree(new_key_string);
 		return PTR_ERR(key);
diff --git a/drivers/nvdimm/security.c b/drivers/nvdimm/security.c
index 99a5708b37e3..a570f2263a42 100644
--- a/drivers/nvdimm/security.c
+++ b/drivers/nvdimm/security.c
@@ -55,7 +55,7 @@ static struct key *nvdimm_request_key(struct nvdimm *nvdimm)
 	struct device *dev = &nvdimm->dev;
 
 	sprintf(desc, "%s%s", NVDIMM_PREFIX, nvdimm->dimm_id);
-	key = request_key(&key_type_encrypted, desc, "", NULL);
+	key = request_key(&key_type_encrypted, desc, "");
 	if (IS_ERR(key)) {
 		if (PTR_ERR(key) == -ENOKEY)
 			dev_dbg(dev, "request_key() found no key\n");
diff --git a/fs/afs/security.c b/fs/afs/security.c
index 8866703b2e6c..71e71c07568f 100644
--- a/fs/afs/security.c
+++ b/fs/afs/security.c
@@ -28,7 +28,7 @@ struct key *afs_request_key(struct afs_cell *cell)
 
 	_debug("key %s", cell->anonymous_key->description);
 	key = request_key(&key_type_rxrpc, cell->anonymous_key->description,
-			  NULL, NULL);
+			  NULL);
 	if (IS_ERR(key)) {
 		if (PTR_ERR(key) != -ENOKEY) {
 			_leave(" = %ld", PTR_ERR(key));
diff --git a/fs/cifs/cifs_spnego.c b/fs/cifs/cifs_spnego.c
index d1b439ad0f1a..7f01c6e60791 100644
--- a/fs/cifs/cifs_spnego.c
+++ b/fs/cifs/cifs_spnego.c
@@ -32,25 +32,6 @@
 #include "cifsproto.h"
 static const struct cred *spnego_cred;
 
-static struct key_acl cifs_spnego_key_acl = {
-	.usage	= REFCOUNT_INIT(1),
-	.nr_ace	= 2,
-	.possessor_viewable = true,
-	.aces = {
-		KEY_POSSESSOR_ACE(KEY_ACE_VIEW | KEY_ACE_SEARCH | KEY_ACE_READ),
-		KEY_OWNER_ACE(KEY_ACE_VIEW),
-	}
-};
-
-static struct key_acl cifs_spnego_keyring_acl = {
-	.usage	= REFCOUNT_INIT(1),
-	.nr_ace	= 2,
-	.aces = {
-		KEY_POSSESSOR_ACE(KEY_ACE_SEARCH | KEY_ACE_WRITE),
-		KEY_OWNER_ACE(KEY_ACE_VIEW | KEY_ACE_READ | KEY_ACE_CLEAR),
-	}
-};
-
 /* create a new cifs key */
 static int
 cifs_spnego_key_instantiate(struct key *key, struct key_preparsed_payload *prep)
@@ -189,8 +170,7 @@ cifs_get_spnego_key(struct cifs_ses *sesInfo)
 
 	cifs_dbg(FYI, "key description = %s\n", description);
 	saved_cred = override_creds(spnego_cred);
-	spnego_key = request_key(&cifs_spnego_key_type, description, "",
-				 &cifs_spnego_key_acl);
+	spnego_key = request_key(&cifs_spnego_key_type, description, "");
 	revert_creds(saved_cred);
 
 #ifdef CONFIG_CIFS_DEBUG2
@@ -227,7 +207,8 @@ init_cifs_spnego(void)
 
 	keyring = keyring_alloc(".cifs_spnego",
 				GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, cred,
-				&cifs_spnego_keyring_acl,
+				(KEY_POS_ALL & ~KEY_POS_SETATTR) |
+				KEY_USR_VIEW | KEY_USR_READ,
 				KEY_ALLOC_NOT_IN_QUOTA, NULL, NULL);
 	if (IS_ERR(keyring)) {
 		ret = PTR_ERR(keyring);
diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c
index 78eed72f3af0..1d377b7f2860 100644
--- a/fs/cifs/cifsacl.c
+++ b/fs/cifs/cifsacl.c
@@ -33,25 +33,6 @@
 #include "cifsproto.h"
 #include "cifs_debug.h"
 
-static struct key_acl cifs_idmap_key_acl = {
-	.usage	= REFCOUNT_INIT(1),
-	.nr_ace	= 2,
-	.possessor_viewable = true,
-	.aces = {
-		KEY_POSSESSOR_ACE(KEY_ACE_VIEW | KEY_ACE_SEARCH | KEY_ACE_READ),
-		KEY_OWNER_ACE(KEY_ACE_VIEW),
-	}
-};
-
-static struct key_acl cifs_idmap_keyring_acl = {
-	.usage	= REFCOUNT_INIT(1),
-	.nr_ace	= 2,
-	.aces = {
-		KEY_POSSESSOR_ACE(KEY_ACE_SEARCH | KEY_ACE_WRITE),
-		KEY_OWNER_ACE(KEY_ACE_VIEW | KEY_ACE_READ),
-	}
-};
-
 /* security id for everyone/world system group */
 static const struct cifs_sid sid_everyone = {
 	1, 1, {0, 0, 0, 0, 0, 1}, {0} };
@@ -317,8 +298,7 @@ id_to_sid(unsigned int cid, uint sidtype, struct cifs_sid *ssid)
 
 	rc = 0;
 	saved_cred = override_creds(root_cred);
-	sidkey = request_key(&cifs_idmap_key_type, desc, "",
-			     &cifs_idmap_key_acl);
+	sidkey = request_key(&cifs_idmap_key_type, desc, "");
 	if (IS_ERR(sidkey)) {
 		rc = -EINVAL;
 		cifs_dbg(FYI, "%s: Can't map %cid %u to a SID\n",
@@ -423,8 +403,7 @@ try_upcall_to_get_id:
 		return -ENOMEM;
 
 	saved_cred = override_creds(root_cred);
-	sidkey = request_key(&cifs_idmap_key_type, sidstr, "",
-			     &cifs_idmap_key_acl);
+	sidkey = request_key(&cifs_idmap_key_type, sidstr, "");
 	if (IS_ERR(sidkey)) {
 		rc = -EINVAL;
 		cifs_dbg(FYI, "%s: Can't map SID %s to a %cid\n",
@@ -502,7 +481,8 @@ init_cifs_idmap(void)
 
 	keyring = keyring_alloc(".cifs_idmap",
 				GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, cred,
-				&cifs_idmap_keyring_acl,
+				(KEY_POS_ALL & ~KEY_POS_SETATTR) |
+				KEY_USR_VIEW | KEY_USR_READ,
 				KEY_ALLOC_NOT_IN_QUOTA, NULL, NULL);
 	if (IS_ERR(keyring)) {
 		ret = PTR_ERR(keyring);
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index ae6bae2ecb5d..714a359c7c8d 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -2992,7 +2992,7 @@ cifs_set_cifscreds(struct smb_vol *vol, struct cifs_ses *ses)
 	}
 
 	cifs_dbg(FYI, "%s: desc=%s\n", __func__, desc);
-	key = request_key(&key_type_logon, desc, "", NULL);
+	key = request_key(&key_type_logon, desc, "");
 	if (IS_ERR(key)) {
 		if (!ses->domainName) {
 			cifs_dbg(FYI, "domainName is NULL\n");
@@ -3003,7 +3003,7 @@ cifs_set_cifscreds(struct smb_vol *vol, struct cifs_ses *ses)
 		/* didn't work, try to find a domain key */
 		sprintf(desc, "cifs:d:%s", ses->domainName);
 		cifs_dbg(FYI, "%s: desc=%s\n", __func__, desc);
-		key = request_key(&key_type_logon, desc, "", NULL);
+		key = request_key(&key_type_logon, desc, "");
 		if (IS_ERR(key)) {
 			rc = PTR_ERR(key);
 			goto out_err;
diff --git a/fs/crypto/keyinfo.c b/fs/crypto/keyinfo.c
index 4f85af8ab239..dcd91a3fbe49 100644
--- a/fs/crypto/keyinfo.c
+++ b/fs/crypto/keyinfo.c
@@ -92,7 +92,7 @@ find_and_lock_process_key(const char *prefix,
 	if (!description)
 		return ERR_PTR(-ENOMEM);
 
-	key = request_key(&key_type_logon, description, NULL, NULL);
+	key = request_key(&key_type_logon, description, NULL);
 	kfree(description);
 	if (IS_ERR(key))
 		return key;
diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h
index 67844fe41a61..1c1a56be7ea2 100644
--- a/fs/ecryptfs/ecryptfs_kernel.h
+++ b/fs/ecryptfs/ecryptfs_kernel.h
@@ -91,7 +91,7 @@ ecryptfs_get_encrypted_key_payload_data(struct key *key)
 
 static inline struct key *ecryptfs_get_encrypted_key(char *sig)
 {
-	return request_key(&key_type_encrypted, sig, NULL, NULL);
+	return request_key(&key_type_encrypted, sig, NULL);
 }
 
 #else
diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c
index ba382f135918..9536e592e25a 100644
--- a/fs/ecryptfs/keystore.c
+++ b/fs/ecryptfs/keystore.c
@@ -1610,7 +1610,7 @@ int ecryptfs_keyring_auth_tok_for_sig(struct key **auth_tok_key,
 {
 	int rc = 0;
 
-	(*auth_tok_key) = request_key(&key_type_user, sig, NULL, NULL);
+	(*auth_tok_key) = request_key(&key_type_user, sig, NULL);
 	if (!(*auth_tok_key) || IS_ERR(*auth_tok_key)) {
 		(*auth_tok_key) = ecryptfs_get_encrypted_key(sig);
 		if (!(*auth_tok_key) || IS_ERR(*auth_tok_key)) {
diff --git a/fs/fscache/object-list.c b/fs/fscache/object-list.c
index 67b7bda5647a..72ebfe578f40 100644
--- a/fs/fscache/object-list.c
+++ b/fs/fscache/object-list.c
@@ -317,7 +317,7 @@ static void fscache_objlist_config(struct fscache_objlist_data *data)
 	const char *buf;
 	int len;
 
-	key = request_key(&key_type_user, "fscache:objlist", NULL, NULL);
+	key = request_key(&key_type_user, "fscache:objlist", NULL);
 	if (IS_ERR(key))
 		goto no_config;
 
diff --git a/fs/nfs/nfs4idmap.c b/fs/nfs/nfs4idmap.c
index 69679f4f2e6c..1e7296395d71 100644
--- a/fs/nfs/nfs4idmap.c
+++ b/fs/nfs/nfs4idmap.c
@@ -72,25 +72,6 @@ struct idmap {
 	const struct cred	*cred;
 };
 
-static struct key_acl nfs_idmap_key_acl = {
-	.usage	= REFCOUNT_INIT(1),
-	.nr_ace	= 2,
-	.possessor_viewable = true,
-	.aces = {
-		KEY_POSSESSOR_ACE(KEY_ACE_VIEW | KEY_ACE_SEARCH | KEY_ACE_READ),
-		KEY_OWNER_ACE(KEY_ACE_VIEW),
-	}
-};
-
-static struct key_acl nfs_idmap_keyring_acl = {
-	.usage	= REFCOUNT_INIT(1),
-	.nr_ace	= 2,
-	.aces = {
-		KEY_POSSESSOR_ACE(KEY_ACE_SEARCH | KEY_ACE_WRITE),
-		KEY_OWNER_ACE(KEY_ACE_VIEW | KEY_ACE_READ),
-	}
-};
-
 static struct user_namespace *idmap_userns(const struct idmap *idmap)
 {
 	if (idmap && idmap->cred)
@@ -227,7 +208,8 @@ int nfs_idmap_init(void)
 
 	keyring = keyring_alloc(".id_resolver",
 				GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, cred,
-				&nfs_idmap_keyring_acl,
+				(KEY_POS_ALL & ~KEY_POS_SETATTR) |
+				KEY_USR_VIEW | KEY_USR_READ,
 				KEY_ALLOC_NOT_IN_QUOTA, NULL, NULL);
 	if (IS_ERR(keyring)) {
 		ret = PTR_ERR(keyring);
@@ -305,13 +287,11 @@ static struct key *nfs_idmap_request_key(const char *name, size_t namelen,
 		return ERR_PTR(ret);
 
 	if (!idmap->cred || idmap->cred->user_ns == &init_user_ns)
-		rkey = request_key(&key_type_id_resolver, desc, "",
-				   &nfs_idmap_key_acl);
+		rkey = request_key(&key_type_id_resolver, desc, "");
 	if (IS_ERR(rkey)) {
 		mutex_lock(&idmap->idmap_mutex);
 		rkey = request_key_with_auxdata(&key_type_id_resolver_legacy,
-						desc, NULL, "", 0, idmap,
-						&nfs_idmap_key_acl);
+						desc, NULL, "", 0, idmap);
 		mutex_unlock(&idmap->idmap_mutex);
 	}
 	if (!IS_ERR(rkey))
@@ -340,6 +320,8 @@ static ssize_t nfs_idmap_get_key(const char *name, size_t namelen,
 	}
 
 	rcu_read_lock();
+	rkey->perm |= KEY_USR_VIEW;
+
 	ret = key_validate(rkey);
 	if (ret < 0)
 		goto out_up;
diff --git a/fs/ubifs/auth.c b/fs/ubifs/auth.c
index 38718026ad0b..60f43b93d06e 100644
--- a/fs/ubifs/auth.c
+++ b/fs/ubifs/auth.c
@@ -227,7 +227,7 @@ int ubifs_init_authentication(struct ubifs_info *c)
 	snprintf(hmac_name, CRYPTO_MAX_ALG_NAME, "hmac(%s)",
 		 c->auth_hash_name);
 
-	keyring_key = request_key(&key_type_logon, c->auth_key_name, NULL, NULL);
+	keyring_key = request_key(&key_type_logon, c->auth_key_name, NULL);
 
 	if (IS_ERR(keyring_key)) {
 		ubifs_err(c, "Failed to request key: %ld",
diff --git a/include/linux/key.h b/include/linux/key.h
index 6fef6684501f..91f391cd272e 100644
--- a/include/linux/key.h
+++ b/include/linux/key.h
@@ -27,15 +27,50 @@
 /* key handle serial number */
 typedef int32_t key_serial_t;
 
+/* key handle permissions mask */
+typedef uint32_t key_perm_t;
+
 struct key;
 struct net;
 
 #ifdef CONFIG_KEYS
 
-#include <linux/keyctl.h>
-
 #undef KEY_DEBUGGING
 
+#define KEY_POS_VIEW	0x01000000	/* possessor can view a key's attributes */
+#define KEY_POS_READ	0x02000000	/* possessor can read key payload / view keyring */
+#define KEY_POS_WRITE	0x04000000	/* possessor can update key payload / add link to keyring */
+#define KEY_POS_SEARCH	0x08000000	/* possessor can find a key in search / search a keyring */
+#define KEY_POS_LINK	0x10000000	/* possessor can create a link to a key/keyring */
+#define KEY_POS_SETATTR	0x20000000	/* possessor can set key attributes */
+#define KEY_POS_ALL	0x3f000000
+
+#define KEY_USR_VIEW	0x00010000	/* user permissions... */
+#define KEY_USR_READ	0x00020000
+#define KEY_USR_WRITE	0x00040000
+#define KEY_USR_SEARCH	0x00080000
+#define KEY_USR_LINK	0x00100000
+#define KEY_USR_SETATTR	0x00200000
+#define KEY_USR_ALL	0x003f0000
+
+#define KEY_GRP_VIEW	0x00000100	/* group permissions... */
+#define KEY_GRP_READ	0x00000200
+#define KEY_GRP_WRITE	0x00000400
+#define KEY_GRP_SEARCH	0x00000800
+#define KEY_GRP_LINK	0x00001000
+#define KEY_GRP_SETATTR	0x00002000
+#define KEY_GRP_ALL	0x00003f00
+
+#define KEY_OTH_VIEW	0x00000001	/* third party permissions... */
+#define KEY_OTH_READ	0x00000002
+#define KEY_OTH_WRITE	0x00000004
+#define KEY_OTH_SEARCH	0x00000008
+#define KEY_OTH_LINK	0x00000010
+#define KEY_OTH_SETATTR	0x00000020
+#define KEY_OTH_ALL	0x0000003f
+
+#define KEY_PERM_UNDEF	0xffffffff
+
 struct seq_file;
 struct user_struct;
 struct signal_struct;
@@ -78,36 +113,6 @@ union key_payload {
 	void			*data[4];
 };
 
-struct key_ace {
-	unsigned int		type;
-	unsigned int		perm;
-	union {
-		kuid_t		uid;
-		kgid_t		gid;
-		unsigned int	subject_id;
-	};
-};
-
-struct key_acl {
-	refcount_t		usage;
-	unsigned short		nr_ace;
-	bool			possessor_viewable;
-	struct rcu_head		rcu;
-	struct key_ace		aces[];
-};
-
-#define KEY_POSSESSOR_ACE(perms) {			\
-		.type = KEY_ACE_SUBJ_STANDARD,		\
-		.perm = perms,				\
-		.subject_id = KEY_ACE_POSSESSOR		\
-	}
-
-#define KEY_OWNER_ACE(perms) {				\
-		.type = KEY_ACE_SUBJ_STANDARD,		\
-		.perm = perms,				\
-		.subject_id = KEY_ACE_OWNER		\
-	}
-
 /*****************************************************************************/
 /*
  * key reference with possession attribute handling
@@ -174,7 +179,6 @@ struct key {
 	struct rw_semaphore	sem;		/* change vs change sem */
 	struct key_user		*user;		/* owner of this key */
 	void			*security;	/* security data for this key */
-	struct key_acl		__rcu *acl;
 	union {
 		time64_t	expiry;		/* time at which key expires (or 0) */
 		time64_t	revoked_at;	/* time at which key was revoked */
@@ -182,6 +186,7 @@ struct key {
 	time64_t		last_used_at;	/* last time used for LRU keyring discard */
 	kuid_t			uid;
 	kgid_t			gid;
+	key_perm_t		perm;		/* access permissions */
 	unsigned short		quotalen;	/* length added to quota */
 	unsigned short		datalen;	/* payload data length
 						 * - may not match RCU dereferenced payload
@@ -205,7 +210,6 @@ struct key {
 #define KEY_FLAG_ROOT_CAN_INVAL	7	/* set if key can be invalidated by root without permission */
 #define KEY_FLAG_KEEP		8	/* set if key should not be removed */
 #define KEY_FLAG_UID_KEYRING	9	/* set if key is a user or user session keyring */
-#define KEY_FLAG_HAS_ACL	10	/* Set if KEYCTL_SETACL called on key */
 
 	/* the key type and key description string
 	 * - the desc is used to match a key against search criteria
@@ -254,7 +258,7 @@ extern struct key *key_alloc(struct key_type *type,
 			     const char *desc,
 			     kuid_t uid, kgid_t gid,
 			     const struct cred *cred,
-			     struct key_acl *acl,
+			     key_perm_t perm,
 			     unsigned long flags,
 			     struct key_restriction *restrict_link);
 
@@ -291,8 +295,7 @@ static inline void key_ref_put(key_ref_t key_ref)
 extern struct key *request_key_tag(struct key_type *type,
 				   const char *description,
 				   struct key_tag *domain_tag,
-				   const char *callout_info,
-				   struct key_acl *acl);
+				   const char *callout_info);
 
 extern struct key *request_key_rcu(struct key_type *type,
 				   const char *description,
@@ -303,24 +306,21 @@ extern struct key *request_key_with_auxdata(struct key_type *type,
 					    struct key_tag *domain_tag,
 					    const void *callout_info,
 					    size_t callout_len,
-					    void *aux,
-					    struct key_acl *acl);
+					    void *aux);
 
 /**
  * request_key - Request a key and wait for construction
  * @type: Type of key.
  * @description: The searchable description of the key.
  * @callout_info: The data to pass to the instantiation upcall (or NULL).
- * @acl: The ACL to attach to a new key (or NULL).
  *
  * As for request_key_tag(), but with the default global domain tag.
  */
 static inline struct key *request_key(struct key_type *type,
 				      const char *description,
-				      const char *callout_info,
-				      struct key_acl *acl)
+				      const char *callout_info)
 {
-	return request_key_tag(type, description, NULL, callout_info, acl);
+	return request_key_tag(type, description, NULL, callout_info);
 }
 
 #ifdef CONFIG_NET
@@ -330,7 +330,6 @@ static inline struct key *request_key(struct key_type *type,
  * @description: The searchable description of the key.
  * @net: The network namespace that is the key's domain of operation.
  * @callout_info: The data to pass to the instantiation upcall (or NULL).
- * @acl: The ACL to attach to a new key (or NULL).
  *
  * As for request_key() except that it does not add the returned key to a
  * keyring if found, new keys are always allocated in the user's quota, the
@@ -340,8 +339,8 @@ static inline struct key *request_key(struct key_type *type,
  * Furthermore, it then works as wait_for_key_construction() to wait for the
  * completion of keys undergoing construction with a non-interruptible wait.
  */
-#define request_key_net(type, description, net, callout_info, acl)	\
-	request_key_tag(type, description, net->key_domain, callout_info, acl);
+#define request_key_net(type, description, net, callout_info) \
+	request_key_tag(type, description, net->key_domain, callout_info);
 #endif /* CONFIG_NET */
 
 extern int wait_for_key_construction(struct key *key, bool intr);
@@ -353,7 +352,7 @@ extern key_ref_t key_create_or_update(key_ref_t keyring,
 				      const char *description,
 				      const void *payload,
 				      size_t plen,
-				      struct key_acl *acl,
+				      key_perm_t perm,
 				      unsigned long flags);
 
 extern int key_update(key_ref_t key,
@@ -373,7 +372,7 @@ extern int key_unlink(struct key *keyring,
 
 extern struct key *keyring_alloc(const char *description, kuid_t uid, kgid_t gid,
 				 const struct cred *cred,
-				 struct key_acl *acl,
+				 key_perm_t perm,
 				 unsigned long flags,
 				 struct key_restriction *restrict_link,
 				 struct key *dest);
@@ -406,29 +405,19 @@ static inline key_serial_t key_serial(const struct key *key)
 extern void key_set_timeout(struct key *, unsigned);
 
 extern key_ref_t lookup_user_key(key_serial_t id, unsigned long flags,
-				 u32 desired_perm);
+				 key_perm_t perm);
 extern void key_free_user_ns(struct user_namespace *);
 
 /*
  * The permissions required on a key that we're looking up.
  */
-#define	KEY_NEED_VIEW	0x001	/* Require permission to view attributes */
-#define	KEY_NEED_READ	0x002	/* Require permission to read content */
-#define	KEY_NEED_WRITE	0x004	/* Require permission to update / modify */
-#define	KEY_NEED_SEARCH	0x008	/* Require permission to search (keyring) or find (key) */
-#define	KEY_NEED_LINK	0x010	/* Require permission to link */
-#define	KEY_NEED_SETSEC	0x020	/* Require permission to set owner, group, ACL */
-#define	KEY_NEED_INVAL	0x040	/* Require permission to invalidate key */
-#define	KEY_NEED_REVOKE	0x080	/* Require permission to revoke key */
-#define	KEY_NEED_JOIN	0x100	/* Require permission to join keyring as session */
-#define	KEY_NEED_CLEAR	0x200	/* Require permission to clear a keyring */
-#define KEY_NEED_ALL	0x3ff
-
-#define OLD_KEY_NEED_SETATTR 0x20 /* Used to be Require permission to change attributes */
-
-extern struct key_acl internal_key_acl;
-extern struct key_acl internal_keyring_acl;
-extern struct key_acl internal_writable_keyring_acl;
+#define	KEY_NEED_VIEW	0x01	/* Require permission to view attributes */
+#define	KEY_NEED_READ	0x02	/* Require permission to read content */
+#define	KEY_NEED_WRITE	0x04	/* Require permission to update / modify */
+#define	KEY_NEED_SEARCH	0x08	/* Require permission to search (keyring) or find (key) */
+#define	KEY_NEED_LINK	0x10	/* Require permission to link */
+#define	KEY_NEED_SETATTR 0x20	/* Require permission to change attributes */
+#define	KEY_NEED_ALL	0x3f	/* All the above permissions */
 
 static inline short key_read_state(const struct key *key)
 {
diff --git a/include/uapi/linux/keyctl.h b/include/uapi/linux/keyctl.h
index 1f7a4e737214..ed3d5893830d 100644
--- a/include/uapi/linux/keyctl.h
+++ b/include/uapi/linux/keyctl.h
@@ -15,69 +15,6 @@
 
 #include <linux/types.h>
 
-/*
- * Keyring permission grant definitions
- */
-enum key_ace_subject_type {
-	KEY_ACE_SUBJ_STANDARD	= 0,	/* subject is one of key_ace_standard_subject */
-	nr__key_ace_subject_type
-};
-
-enum key_ace_standard_subject {
-	KEY_ACE_EVERYONE	= 0,	/* Everyone, including owner and group */
-	KEY_ACE_GROUP		= 1,	/* The key's group */
-	KEY_ACE_OWNER		= 2,	/* The owner of the key */
-	KEY_ACE_POSSESSOR	= 3,	/* Any process that possesses of the key */
-	nr__key_ace_standard_subject
-};
-
-#define KEY_ACE_VIEW		0x00000001 /* Can describe the key */
-#define KEY_ACE_READ		0x00000002 /* Can read the key content */
-#define KEY_ACE_WRITE		0x00000004 /* Can update/modify the key content */
-#define KEY_ACE_SEARCH		0x00000008 /* Can find the key by search */
-#define KEY_ACE_LINK		0x00000010 /* Can make a link to the key */
-#define KEY_ACE_SET_SECURITY	0x00000020 /* Can set owner, group, ACL */
-#define KEY_ACE_INVAL		0x00000040 /* Can invalidate the key */
-#define KEY_ACE_REVOKE		0x00000080 /* Can revoke the key */
-#define KEY_ACE_JOIN		0x00000100 /* Can join keyring */
-#define KEY_ACE_CLEAR		0x00000200 /* Can clear keyring */
-#define KEY_ACE__PERMS		0xffffffff
-
-/*
- * Old-style permissions mask, deprecated in favour of ACL.
- */
-#define KEY_POS_VIEW	0x01000000	/* possessor can view a key's attributes */
-#define KEY_POS_READ	0x02000000	/* possessor can read key payload / view keyring */
-#define KEY_POS_WRITE	0x04000000	/* possessor can update key payload / add link to keyring */
-#define KEY_POS_SEARCH	0x08000000	/* possessor can find a key in search / search a keyring */
-#define KEY_POS_LINK	0x10000000	/* possessor can create a link to a key/keyring */
-#define KEY_POS_SETATTR	0x20000000	/* possessor can set key attributes */
-#define KEY_POS_ALL	0x3f000000
-
-#define KEY_USR_VIEW	0x00010000	/* user permissions... */
-#define KEY_USR_READ	0x00020000
-#define KEY_USR_WRITE	0x00040000
-#define KEY_USR_SEARCH	0x00080000
-#define KEY_USR_LINK	0x00100000
-#define KEY_USR_SETATTR	0x00200000
-#define KEY_USR_ALL	0x003f0000
-
-#define KEY_GRP_VIEW	0x00000100	/* group permissions... */
-#define KEY_GRP_READ	0x00000200
-#define KEY_GRP_WRITE	0x00000400
-#define KEY_GRP_SEARCH	0x00000800
-#define KEY_GRP_LINK	0x00001000
-#define KEY_GRP_SETATTR	0x00002000
-#define KEY_GRP_ALL	0x00003f00
-
-#define KEY_OTH_VIEW	0x00000001	/* third party permissions... */
-#define KEY_OTH_READ	0x00000002
-#define KEY_OTH_WRITE	0x00000004
-#define KEY_OTH_SEARCH	0x00000008
-#define KEY_OTH_LINK	0x00000010
-#define KEY_OTH_SETATTR	0x00000020
-#define KEY_OTH_ALL	0x0000003f
-
 /* special process keyring shortcut IDs */
 #define KEY_SPEC_THREAD_KEYRING		-1	/* - key ID for thread-specific keyring */
 #define KEY_SPEC_PROCESS_KEYRING	-2	/* - key ID for process-specific keyring */
@@ -132,7 +69,6 @@ enum key_ace_standard_subject {
 #define KEYCTL_RESTRICT_KEYRING		29	/* Restrict keys allowed to link to a keyring */
 #define KEYCTL_MOVE			30	/* Move keys between keyrings */
 #define KEYCTL_CAPABILITIES		31	/* Find capabilities of keyrings subsystem */
-#define KEYCTL_GRANT_PERMISSION		32	/* Grant a permit to a key */
 
 /* keyctl structures */
 struct keyctl_dh_params {
@@ -194,6 +130,5 @@ struct keyctl_pkey_params {
 #define KEYCTL_CAPS0_MOVE		0x80 /* KEYCTL_MOVE supported */
 #define KEYCTL_CAPS1_NS_KEYRING_NAME	0x01 /* Keyring names are per-user_namespace */
 #define KEYCTL_CAPS1_NS_KEY_TAG		0x02 /* Key indexing can include a namespace tag */
-#define KEYCTL_CAPS1_ACL_ALTERABLE	0x04 /* Keys have internal ACL that can be altered */
 
 #endif /*  _LINUX_KEYCTL_H */
diff --git a/lib/digsig.c b/lib/digsig.c
index ab0800f98eaf..e0627c3e53b2 100644
--- a/lib/digsig.c
+++ b/lib/digsig.c
@@ -224,7 +224,7 @@ int digsig_verify(struct key *keyring, const char *sig, int siglen,
 		else
 			key = key_ref_to_ptr(kref);
 	} else {
-		key = request_key(&key_type_user, name, NULL, NULL);
+		key = request_key(&key_type_user, name, NULL);
 	}
 	if (IS_ERR(key)) {
 		pr_err("key not found, id: %s\n", name);
diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c
index 38de80d01aae..1c811c74bfc0 100644
--- a/net/ceph/ceph_common.c
+++ b/net/ceph/ceph_common.c
@@ -306,7 +306,7 @@ static int get_secret(struct ceph_crypto_key *dst, const char *name) {
 	int err = 0;
 	struct ceph_crypto_key *ckey;
 
-	ukey = request_key(&key_type_ceph, name, NULL, NULL);
+	ukey = request_key(&key_type_ceph, name, NULL);
 	if (IS_ERR(ukey)) {
 		/* request_key errors don't map nicely to mount(2)
 		   errors; don't even try, but still printk */
diff --git a/net/dns_resolver/dns_key.c b/net/dns_resolver/dns_key.c
index 6b201531b165..3e1a90669006 100644
--- a/net/dns_resolver/dns_key.c
+++ b/net/dns_resolver/dns_key.c
@@ -46,15 +46,6 @@ const struct cred *dns_resolver_cache;
 
 #define	DNS_ERRORNO_OPTION	"dnserror"
 
-static struct key_acl dns_keyring_acl = {
-	.usage	= REFCOUNT_INIT(1),
-	.nr_ace	= 2,
-	.aces = {
-		KEY_POSSESSOR_ACE(KEY_ACE_SEARCH | KEY_ACE_WRITE),
-		KEY_OWNER_ACE(KEY_ACE_VIEW | KEY_ACE_READ | KEY_ACE_CLEAR),
-	}
-};
-
 /*
  * Preparse instantiation data for a dns_resolver key.
  *
@@ -352,7 +343,8 @@ static int __init init_dns_resolver(void)
 
 	keyring = keyring_alloc(".dns_resolver",
 				GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, cred,
-				&dns_keyring_acl,
+				(KEY_POS_ALL & ~KEY_POS_SETATTR) |
+				KEY_USR_VIEW | KEY_USR_READ,
 				KEY_ALLOC_NOT_IN_QUOTA, NULL, NULL);
 	if (IS_ERR(keyring)) {
 		ret = PTR_ERR(keyring);
diff --git a/net/dns_resolver/dns_query.c b/net/dns_resolver/dns_query.c
index 236baf2bfa4c..cab4e0df924f 100644
--- a/net/dns_resolver/dns_query.c
+++ b/net/dns_resolver/dns_query.c
@@ -47,16 +47,6 @@
 
 #include "internal.h"
 
-static struct key_acl dns_key_acl = {
-	.usage	= REFCOUNT_INIT(1),
-	.nr_ace	= 2,
-	.possessor_viewable = true,
-	.aces = {
-		KEY_POSSESSOR_ACE(KEY_ACE_VIEW | KEY_ACE_SEARCH | KEY_ACE_READ),
-		KEY_OWNER_ACE(KEY_ACE_VIEW | KEY_ACE_INVAL),
-	}
-};
-
 /**
  * dns_query - Query the DNS
  * @net: The network namespace to operate in.
@@ -135,8 +125,7 @@ int dns_query(struct net *net,
 	 * add_key() to preinstall malicious redirections
 	 */
 	saved_cred = override_creds(dns_resolver_cache);
-	rkey = request_key_net(&key_type_dns_resolver, desc, net, options,
-			       &dns_key_acl);
+	rkey = request_key_net(&key_type_dns_resolver, desc, net, options);
 	revert_creds(saved_cred);
 	kfree(desc);
 	if (IS_ERR(rkey)) {
@@ -146,6 +135,8 @@ int dns_query(struct net *net,
 
 	down_read(&rkey->sem);
 	set_bit(KEY_FLAG_ROOT_CAN_INVAL, &rkey->flags);
+	rkey->perm |= KEY_USR_VIEW;
+
 	ret = key_validate(rkey);
 	if (ret < 0)
 		goto put;
diff --git a/net/rxrpc/key.c b/net/rxrpc/key.c
index 2032f6a8225e..6c3f35fac42d 100644
--- a/net/rxrpc/key.c
+++ b/net/rxrpc/key.c
@@ -23,14 +23,6 @@
 #include <keys/user-type.h>
 #include "ar-internal.h"
 
-static struct key_acl rxrpc_null_key_acl = {
-	.usage	= REFCOUNT_INIT(1),
-	.nr_ace	= 1,
-	.aces = {
-		KEY_POSSESSOR_ACE(KEY_ACE_SEARCH | KEY_ACE_READ),
-	}
-};
-
 static int rxrpc_vet_description_s(const char *);
 static int rxrpc_preparse(struct key_preparsed_payload *);
 static int rxrpc_preparse_s(struct key_preparsed_payload *);
@@ -918,8 +910,7 @@ int rxrpc_request_key(struct rxrpc_sock *rx, char __user *optval, int optlen)
 	if (IS_ERR(description))
 		return PTR_ERR(description);
 
-	key = request_key_net(&key_type_rxrpc, description, sock_net(&rx->sk),
-			      NULL, NULL);
+	key = request_key_net(&key_type_rxrpc, description, sock_net(&rx->sk), NULL);
 	if (IS_ERR(key)) {
 		kfree(description);
 		_leave(" = %ld", PTR_ERR(key));
@@ -950,8 +941,7 @@ int rxrpc_server_keyring(struct rxrpc_sock *rx, char __user *optval,
 	if (IS_ERR(description))
 		return PTR_ERR(description);
 
-	key = request_key_net(&key_type_keyring, description, sock_net(&rx->sk),
-			      NULL, NULL);
+	key = request_key_net(&key_type_keyring, description, sock_net(&rx->sk), NULL);
 	if (IS_ERR(key)) {
 		kfree(description);
 		_leave(" = %ld", PTR_ERR(key));
@@ -984,8 +974,7 @@ int rxrpc_get_server_data_key(struct rxrpc_connection *conn,
 	_enter("");
 
 	key = key_alloc(&key_type_rxrpc, "x",
-			GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, cred,
-			&internal_key_acl,
+			GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, cred, 0,
 			KEY_ALLOC_NOT_IN_QUOTA, NULL);
 	if (IS_ERR(key)) {
 		_leave(" = -ENOMEM [alloc %ld]", PTR_ERR(key));
@@ -1033,7 +1022,7 @@ struct key *rxrpc_get_null_key(const char *keyname)
 
 	key = key_alloc(&key_type_rxrpc, keyname,
 			GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, cred,
-			&rxrpc_null_key_acl, KEY_ALLOC_NOT_IN_QUOTA, NULL);
+			KEY_POS_SEARCH, KEY_ALLOC_NOT_IN_QUOTA, NULL);
 	if (IS_ERR(key))
 		return key;
 
diff --git a/net/wireless/reg.c b/net/wireless/reg.c
index 298fe91557f7..4831ad745f91 100644
--- a/net/wireless/reg.c
+++ b/net/wireless/reg.c
@@ -741,7 +741,8 @@ static void __init load_keys_from_buffer(const u8 *p, unsigned int buflen)
 
 		key = key_create_or_update(make_key_ref(builtin_regdb_keys, 1),
 					   "asymmetric", NULL, p, plen,
-					   &internal_key_acl, 
+					   ((KEY_POS_ALL & ~KEY_POS_SETATTR) |
+					    KEY_USR_VIEW | KEY_USR_READ),
 					   KEY_ALLOC_NOT_IN_QUOTA |
 					   KEY_ALLOC_BUILT_IN |
 					   KEY_ALLOC_BYPASS_RESTRICTION);
@@ -767,7 +768,8 @@ static int __init load_builtin_regdb_keys(void)
 	builtin_regdb_keys =
 		keyring_alloc(".builtin_regdb_keys",
 			      KUIDT_INIT(0), KGIDT_INIT(0), current_cred(),
-			      &internal_keyring_acl, 
+			      ((KEY_POS_ALL & ~KEY_POS_SETATTR) |
+			      KEY_USR_VIEW | KEY_USR_READ | KEY_USR_SEARCH),
 			      KEY_ALLOC_NOT_IN_QUOTA, NULL, NULL);
 	if (IS_ERR(builtin_regdb_keys))
 		return PTR_ERR(builtin_regdb_keys);
diff --git a/security/integrity/digsig.c b/security/integrity/digsig.c
index f9f3c8ffe786..868ade3e8970 100644
--- a/security/integrity/digsig.c
+++ b/security/integrity/digsig.c
@@ -47,8 +47,7 @@ int integrity_digsig_verify(const unsigned int id, const char *sig, int siglen,
 
 	if (!keyring[id]) {
 		keyring[id] =
-			request_key(&key_type_keyring, keyring_name[id],
-				    NULL, NULL);
+			request_key(&key_type_keyring, keyring_name[id], NULL);
 		if (IS_ERR(keyring[id])) {
 			int err = PTR_ERR(keyring[id]);
 			pr_err("no %s keyring: %d\n", keyring_name[id], err);
@@ -71,14 +70,14 @@ int integrity_digsig_verify(const unsigned int id, const char *sig, int siglen,
 }
 
 static int __init __integrity_init_keyring(const unsigned int id,
-					   struct key_acl *acl,
+					   key_perm_t perm,
 					   struct key_restriction *restriction)
 {
 	const struct cred *cred = current_cred();
 	int err = 0;
 
 	keyring[id] = keyring_alloc(keyring_name[id], KUIDT_INIT(0),
-				    KGIDT_INIT(0), cred, acl,
+				    KGIDT_INIT(0), cred, perm,
 				    KEY_ALLOC_NOT_IN_QUOTA, restriction, NULL);
 	if (IS_ERR(keyring[id])) {
 		err = PTR_ERR(keyring[id]);
@@ -96,7 +95,10 @@ static int __init __integrity_init_keyring(const unsigned int id,
 int __init integrity_init_keyring(const unsigned int id)
 {
 	struct key_restriction *restriction;
-	struct key_acl *acl = &internal_keyring_acl;
+	key_perm_t perm;
+
+	perm = (KEY_POS_ALL & ~KEY_POS_SETATTR) | KEY_USR_VIEW
+		| KEY_USR_READ | KEY_USR_SEARCH;
 
 	if (id == INTEGRITY_KEYRING_PLATFORM) {
 		restriction = NULL;
@@ -111,14 +113,14 @@ int __init integrity_init_keyring(const unsigned int id)
 		return -ENOMEM;
 
 	restriction->check = restrict_link_to_ima;
-	acl = &internal_writable_keyring_acl;
+	perm |= KEY_USR_WRITE;
 
 out:
-	return __integrity_init_keyring(id, acl, restriction);
+	return __integrity_init_keyring(id, perm, restriction);
 }
 
-static int __init integrity_add_key(const unsigned int id, const void *data,
-				    off_t size, struct key_acl *acl)
+int __init integrity_add_key(const unsigned int id, const void *data,
+			     off_t size, key_perm_t perm)
 {
 	key_ref_t key;
 	int rc = 0;
@@ -127,7 +129,7 @@ static int __init integrity_add_key(const unsigned int id, const void *data,
 		return -EINVAL;
 
 	key = key_create_or_update(make_key_ref(keyring[id], 1), "asymmetric",
-				   NULL, data, size, acl ?: &internal_key_acl,
+				   NULL, data, size, perm,
 				   KEY_ALLOC_NOT_IN_QUOTA);
 	if (IS_ERR(key)) {
 		rc = PTR_ERR(key);
@@ -147,6 +149,7 @@ int __init integrity_load_x509(const unsigned int id, const char *path)
 	void *data;
 	loff_t size;
 	int rc;
+	key_perm_t perm;
 
 	rc = kernel_read_file_from_path(path, &data, &size, 0,
 					READING_X509_CERTIFICATE);
@@ -155,19 +158,21 @@ int __init integrity_load_x509(const unsigned int id, const char *path)
 		return rc;
 	}
 
+	perm = (KEY_POS_ALL & ~KEY_POS_SETATTR) | KEY_USR_VIEW | KEY_USR_READ;
+
 	pr_info("Loading X.509 certificate: %s\n", path);
-	rc = integrity_add_key(id, data, size, NULL);
+	rc = integrity_add_key(id, (const void *)data, size, perm);
 
 	vfree(data);
 	return rc;
 }
 
 int __init integrity_load_cert(const unsigned int id, const char *source,
-			       const void *data, size_t len, struct key_acl *acl)
+			       const void *data, size_t len, key_perm_t perm)
 {
 	if (!data)
 		return -EINVAL;
 
 	pr_info("Loading X.509 certificate: %s\n", source);
-	return integrity_add_key(id, data, len, acl);
+	return integrity_add_key(id, data, len, perm);
 }
diff --git a/security/integrity/digsig_asymmetric.c b/security/integrity/digsig_asymmetric.c
index a29df775fdd8..55aec161d0e1 100644
--- a/security/integrity/digsig_asymmetric.c
+++ b/security/integrity/digsig_asymmetric.c
@@ -53,7 +53,7 @@ static struct key *request_asymmetric_key(struct key *keyring, uint32_t keyid)
 		else
 			key = key_ref_to_ptr(kref);
 	} else {
-		key = request_key(&key_type_asymmetric, name, NULL, NULL);
+		key = request_key(&key_type_asymmetric, name, NULL);
 	}
 
 	if (IS_ERR(key)) {
diff --git a/security/integrity/evm/evm_crypto.c b/security/integrity/evm/evm_crypto.c
index 466eebd3b4aa..d485f6fc908e 100644
--- a/security/integrity/evm/evm_crypto.c
+++ b/security/integrity/evm/evm_crypto.c
@@ -356,7 +356,7 @@ int evm_init_key(void)
 	struct encrypted_key_payload *ekp;
 	int rc;
 
-	evm_key = request_key(&key_type_encrypted, EVMKEY, NULL, NULL);
+	evm_key = request_key(&key_type_encrypted, EVMKEY, NULL);
 	if (IS_ERR(evm_key))
 		return -ENOENT;
 
diff --git a/security/integrity/ima/ima_mok.c b/security/integrity/ima/ima_mok.c
index b52ae1476ec3..36cadadbfba4 100644
--- a/security/integrity/ima/ima_mok.c
+++ b/security/integrity/ima/ima_mok.c
@@ -16,15 +16,6 @@
 #include <keys/system_keyring.h>
 
 
-static struct key_acl integrity_blacklist_keyring_acl = {
-	.usage	= REFCOUNT_INIT(1),
-	.nr_ace	= 2,
-	.aces = {
-		KEY_POSSESSOR_ACE(KEY_ACE_SEARCH | KEY_ACE_WRITE),
-		KEY_OWNER_ACE(KEY_ACE_VIEW | KEY_ACE_READ | KEY_ACE_WRITE | KEY_ACE_SEARCH),
-	}
-};
-
 struct key *ima_blacklist_keyring;
 
 /*
@@ -44,7 +35,9 @@ __init int ima_mok_init(void)
 
 	ima_blacklist_keyring = keyring_alloc(".ima_blacklist",
 				KUIDT_INIT(0), KGIDT_INIT(0), current_cred(),
-			        &integrity_blacklist_keyring_acl,
+				(KEY_POS_ALL & ~KEY_POS_SETATTR) |
+				KEY_USR_VIEW | KEY_USR_READ |
+				KEY_USR_WRITE | KEY_USR_SEARCH,
 				KEY_ALLOC_NOT_IN_QUOTA,
 				restriction, NULL);
 
diff --git a/security/integrity/integrity.h b/security/integrity/integrity.h
index 875c6a7a5af1..ed12d8e13d04 100644
--- a/security/integrity/integrity.h
+++ b/security/integrity/integrity.h
@@ -12,8 +12,6 @@
 #include <linux/key.h>
 #include <linux/audit.h>
 
-struct key_acl;
-
 /* iint action cache flags */
 #define IMA_MEASURE		0x00000001
 #define IMA_MEASURED		0x00000002
@@ -157,7 +155,7 @@ int integrity_digsig_verify(const unsigned int id, const char *sig, int siglen,
 int __init integrity_init_keyring(const unsigned int id);
 int __init integrity_load_x509(const unsigned int id, const char *path);
 int __init integrity_load_cert(const unsigned int id, const char *source,
-			       const void *data, size_t len, struct key_acl *acl);
+			       const void *data, size_t len, key_perm_t perm);
 #else
 
 static inline int integrity_digsig_verify(const unsigned int id,
@@ -175,7 +173,7 @@ static inline int integrity_init_keyring(const unsigned int id)
 static inline int __init integrity_load_cert(const unsigned int id,
 					     const char *source,
 					     const void *data, size_t len,
-					     struct key_acl *acl)
+					     key_perm_t perm)
 {
 	return 0;
 }
diff --git a/security/integrity/platform_certs/platform_keyring.c b/security/integrity/platform_certs/platform_keyring.c
index 7646e35f2d91..bcafd7387729 100644
--- a/security/integrity/platform_certs/platform_keyring.c
+++ b/security/integrity/platform_certs/platform_keyring.c
@@ -14,15 +14,6 @@
 #include <linux/slab.h>
 #include "../integrity.h"
 
-static struct key_acl platform_key_acl = {
-	.usage	= REFCOUNT_INIT(1),
-	.nr_ace	= 2,
-	.aces = {
-		KEY_POSSESSOR_ACE(KEY_ACE_SEARCH | KEY_ACE_READ),
-		KEY_OWNER_ACE(KEY_ACE_VIEW),
-	}
-};
-
 /**
  * add_to_platform_keyring - Add to platform keyring without validation.
  * @source: Source of key
@@ -35,10 +26,13 @@ static struct key_acl platform_key_acl = {
 void __init add_to_platform_keyring(const char *source, const void *data,
 				    size_t len)
 {
+	key_perm_t perm;
 	int rc;
 
+	perm = (KEY_POS_ALL & ~KEY_POS_SETATTR) | KEY_USR_VIEW;
+
 	rc = integrity_load_cert(INTEGRITY_KEYRING_PLATFORM, source, data, len,
-				 &platform_key_acl);
+				 perm);
 	if (rc)
 		pr_info("Error adding keys to platform keyring %s\n", source);
 }
diff --git a/security/keys/compat.c b/security/keys/compat.c
index b0e59546e7bd..9bcc404131aa 100644
--- a/security/keys/compat.c
+++ b/security/keys/compat.c
@@ -157,8 +157,6 @@ COMPAT_SYSCALL_DEFINE5(keyctl, u32, option,
 
 	case KEYCTL_MOVE:
 		return keyctl_keyring_move(arg2, arg3, arg4, arg5);
-	case KEYCTL_GRANT_PERMISSION:
-		return keyctl_grant_permission(arg2, arg3, arg4, arg5);
 
 	case KEYCTL_CAPABILITIES:
 		return keyctl_capabilities(compat_ptr(arg2), arg3);
diff --git a/security/keys/encrypted-keys/encrypted.c b/security/keys/encrypted-keys/encrypted.c
index 9df560e477c2..60720f58cbe0 100644
--- a/security/keys/encrypted-keys/encrypted.c
+++ b/security/keys/encrypted-keys/encrypted.c
@@ -304,7 +304,7 @@ static struct key *request_user_key(const char *master_desc, const u8 **master_k
 	const struct user_key_payload *upayload;
 	struct key *ukey;
 
-	ukey = request_key(&key_type_user, master_desc, NULL, NULL);
+	ukey = request_key(&key_type_user, master_desc, NULL);
 	if (IS_ERR(ukey))
 		goto error;
 
diff --git a/security/keys/encrypted-keys/masterkey_trusted.c b/security/keys/encrypted-keys/masterkey_trusted.c
index d649f2f29475..c68528aa49c6 100644
--- a/security/keys/encrypted-keys/masterkey_trusted.c
+++ b/security/keys/encrypted-keys/masterkey_trusted.c
@@ -30,7 +30,7 @@ struct key *request_trusted_key(const char *trusted_desc,
 	struct trusted_key_payload *tpayload;
 	struct key *tkey;
 
-	tkey = request_key(&key_type_trusted, trusted_desc, NULL, NULL);
+	tkey = request_key(&key_type_trusted, trusted_desc, NULL);
 	if (IS_ERR(tkey))
 		goto error;
 
diff --git a/security/keys/gc.c b/security/keys/gc.c
index 48c3e124c272..671dd730ecfc 100644
--- a/security/keys/gc.c
+++ b/security/keys/gc.c
@@ -151,7 +151,6 @@ static noinline void key_gc_unused_keys(struct list_head *keys)
 
 		key_user_put(key->user);
 		key_put_tag(key->domain_tag);
-		key_put_acl(rcu_access_pointer(key->acl));
 		kfree(key->description);
 
 		memzero_explicit(key, sizeof(*key));
@@ -221,6 +220,7 @@ continue_scanning:
 			if (key->type == key_gc_dead_keytype) {
 				gc_state |= KEY_GC_FOUND_DEAD_KEY;
 				set_bit(KEY_FLAG_DEAD, &key->flags);
+				key->perm = 0;
 				goto skip_dead_key;
 			} else if (key->type == &key_type_keyring &&
 				   key->restrict_link) {
diff --git a/security/keys/internal.h b/security/keys/internal.h
index e0c5bb8b1685..c039373488bd 100644
--- a/security/keys/internal.h
+++ b/security/keys/internal.h
@@ -84,11 +84,8 @@ extern struct rb_root key_serial_tree;
 extern spinlock_t key_serial_lock;
 extern struct mutex key_construction_mutex;
 extern wait_queue_head_t request_key_conswq;
-extern struct key_acl default_key_acl;
-extern struct key_acl joinable_keyring_acl;
 
 extern void key_set_index_key(struct keyring_index_key *index_key);
-
 extern struct key_type *key_type_lookup(const char *type);
 extern void key_type_put(struct key_type *ktype);
 
@@ -159,7 +156,6 @@ extern struct key *request_key_and_link(struct key_type *type,
 					const void *callout_info,
 					size_t callout_len,
 					void *aux,
-					struct key_acl *acl,
 					struct key *dest_keyring,
 					unsigned long flags);
 
@@ -183,10 +179,7 @@ extern void key_gc_keytype(struct key_type *ktype);
 
 extern int key_task_permission(const key_ref_t key_ref,
 			       const struct cred *cred,
-			       u32 desired_perm);
-extern unsigned int key_acl_to_perm(const struct key_acl *acl);
-extern long key_set_acl(struct key *key, struct key_acl *acl);
-extern void key_put_acl(struct key_acl *acl);
+			       key_perm_t perm);
 
 /*
  * Check to see whether permission is granted to use a key in the desired way.
@@ -233,7 +226,7 @@ extern long keyctl_keyring_search(key_serial_t, const char __user *,
 				  const char __user *, key_serial_t);
 extern long keyctl_read_key(key_serial_t, char __user *, size_t);
 extern long keyctl_chown_key(key_serial_t, uid_t, gid_t);
-extern long keyctl_setperm_key(key_serial_t, unsigned int);
+extern long keyctl_setperm_key(key_serial_t, key_perm_t);
 extern long keyctl_instantiate_key(key_serial_t, const void __user *,
 				   size_t, key_serial_t);
 extern long keyctl_negate_key(key_serial_t, unsigned, key_serial_t);
@@ -338,11 +331,6 @@ static inline long keyctl_pkey_e_d_s(int op,
 
 extern long keyctl_capabilities(unsigned char __user *_buffer, size_t buflen);
 
-extern long keyctl_grant_permission(key_serial_t keyid,
-				    enum key_ace_subject_type type,
-				    unsigned int subject,
-				    unsigned int perm);
-
 /*
  * Debugging key validation
  */
diff --git a/security/keys/key.c b/security/keys/key.c
index 519211a996e7..764f4c57913e 100644
--- a/security/keys/key.c
+++ b/security/keys/key.c
@@ -195,7 +195,7 @@ serial_exists:
  * @uid: The owner of the new key.
  * @gid: The group ID for the new key's group permissions.
  * @cred: The credentials specifying UID namespace.
- * @acl: The ACL to attach to the new key.
+ * @perm: The permissions mask of the new key.
  * @flags: Flags specifying quota properties.
  * @restrict_link: Optional link restriction for new keyrings.
  *
@@ -223,7 +223,7 @@ serial_exists:
  */
 struct key *key_alloc(struct key_type *type, const char *desc,
 		      kuid_t uid, kgid_t gid, const struct cred *cred,
-		      struct key_acl *acl, unsigned long flags,
+		      key_perm_t perm, unsigned long flags,
 		      struct key_restriction *restrict_link)
 {
 	struct key_user *user = NULL;
@@ -246,9 +246,6 @@ struct key *key_alloc(struct key_type *type, const char *desc,
 	desclen = strlen(desc);
 	quotalen = desclen + 1 + type->def_datalen;
 
-	if (!acl)
-		acl = &default_key_acl;
-
 	/* get hold of the key tracking for this user */
 	user = key_user_lookup(uid);
 	if (!user)
@@ -295,8 +292,7 @@ struct key *key_alloc(struct key_type *type, const char *desc,
 	key->datalen = type->def_datalen;
 	key->uid = uid;
 	key->gid = gid;
-	refcount_inc(&acl->usage);
-	rcu_assign_pointer(key->acl, acl);
+	key->perm = perm;
 	key->restrict_link = restrict_link;
 	key->last_used_at = ktime_get_real_seconds();
 
@@ -791,7 +787,7 @@ error:
  * @description: The searchable description for the key.
  * @payload: The data to use to instantiate or update the key.
  * @plen: The length of @payload.
- * @acl: The ACL to attach if a key is created.
+ * @perm: The permissions mask for a new key.
  * @flags: The quota flags for a new key.
  *
  * Search the destination keyring for a key of the same description and if one
@@ -814,7 +810,7 @@ key_ref_t key_create_or_update(key_ref_t keyring_ref,
 			       const char *description,
 			       const void *payload,
 			       size_t plen,
-			       struct key_acl *acl,
+			       key_perm_t perm,
 			       unsigned long flags)
 {
 	struct keyring_index_key index_key = {
@@ -911,9 +907,22 @@ key_ref_t key_create_or_update(key_ref_t keyring_ref,
 			goto found_matching_key;
 	}
 
+	/* if the client doesn't provide, decide on the permissions we want */
+	if (perm == KEY_PERM_UNDEF) {
+		perm = KEY_POS_VIEW | KEY_POS_SEARCH | KEY_POS_LINK | KEY_POS_SETATTR;
+		perm |= KEY_USR_VIEW;
+
+		if (index_key.type->read)
+			perm |= KEY_POS_READ;
+
+		if (index_key.type == &key_type_keyring ||
+		    index_key.type->update)
+			perm |= KEY_POS_WRITE;
+	}
+
 	/* allocate a new key */
 	key = key_alloc(index_key.type, index_key.description,
-			cred->fsuid, cred->fsgid, cred, acl, flags, NULL);
+			cred->fsuid, cred->fsgid, cred, perm, flags, NULL);
 	if (IS_ERR(key)) {
 		key_ref = ERR_CAST(key);
 		goto error_link_end;
diff --git a/security/keys/keyctl.c b/security/keys/keyctl.c
index c2dd66d556d4..9b898c969558 100644
--- a/security/keys/keyctl.c
+++ b/security/keys/keyctl.c
@@ -37,8 +37,7 @@ static const unsigned char keyrings_capabilities[2] = {
 	       KEYCTL_CAPS0_MOVE
 	       ),
 	[1] = (KEYCTL_CAPS1_NS_KEYRING_NAME |
-	       KEYCTL_CAPS1_NS_KEY_TAG |
-	       KEYCTL_CAPS1_ACL_ALTERABLE),
+	       KEYCTL_CAPS1_NS_KEY_TAG),
 };
 
 static int key_get_type_from_user(char *type,
@@ -131,7 +130,8 @@ SYSCALL_DEFINE5(add_key, const char __user *, _type,
 	/* create or update the requested key and add it to the target
 	 * keyring */
 	key_ref = key_create_or_update(keyring_ref, type, description,
-				       payload, plen, NULL, KEY_ALLOC_IN_QUOTA);
+				       payload, plen, KEY_PERM_UNDEF,
+				       KEY_ALLOC_IN_QUOTA);
 	if (!IS_ERR(key_ref)) {
 		ret = key_ref_to_ptr(key_ref)->serial;
 		key_ref_put(key_ref);
@@ -221,8 +221,7 @@ SYSCALL_DEFINE4(request_key, const char __user *, _type,
 
 	/* do the search */
 	key = request_key_and_link(ktype, description, NULL, callout_info,
-				   callout_len, NULL, NULL,
-				   key_ref_to_ptr(dest_ref),
+				   callout_len, NULL, key_ref_to_ptr(dest_ref),
 				   KEY_ALLOC_IN_QUOTA);
 	if (IS_ERR(key)) {
 		ret = PTR_ERR(key);
@@ -384,10 +383,16 @@ long keyctl_revoke_key(key_serial_t id)
 	struct key *key;
 	long ret;
 
-	key_ref = lookup_user_key(id, 0, KEY_NEED_REVOKE);
+	key_ref = lookup_user_key(id, 0, KEY_NEED_WRITE);
 	if (IS_ERR(key_ref)) {
 		ret = PTR_ERR(key_ref);
-		goto error;
+		if (ret != -EACCES)
+			goto error;
+		key_ref = lookup_user_key(id, 0, KEY_NEED_SETATTR);
+		if (IS_ERR(key_ref)) {
+			ret = PTR_ERR(key_ref);
+			goto error;
+		}
 	}
 
 	key = key_ref_to_ptr(key_ref);
@@ -421,7 +426,7 @@ long keyctl_invalidate_key(key_serial_t id)
 
 	kenter("%d", id);
 
-	key_ref = lookup_user_key(id, 0, KEY_NEED_INVAL);
+	key_ref = lookup_user_key(id, 0, KEY_NEED_SEARCH);
 	if (IS_ERR(key_ref)) {
 		ret = PTR_ERR(key_ref);
 
@@ -466,7 +471,7 @@ long keyctl_keyring_clear(key_serial_t ringid)
 	struct key *keyring;
 	long ret;
 
-	keyring_ref = lookup_user_key(ringid, KEY_LOOKUP_CREATE, KEY_NEED_CLEAR);
+	keyring_ref = lookup_user_key(ringid, KEY_LOOKUP_CREATE, KEY_NEED_WRITE);
 	if (IS_ERR(keyring_ref)) {
 		ret = PTR_ERR(keyring_ref);
 
@@ -641,7 +646,6 @@ long keyctl_describe_key(key_serial_t keyid,
 			 size_t buflen)
 {
 	struct key *key, *instkey;
-	unsigned int perm;
 	key_ref_t key_ref;
 	char *infobuf;
 	long ret;
@@ -671,10 +675,6 @@ okay:
 	key = key_ref_to_ptr(key_ref);
 	desclen = strlen(key->description);
 
-	rcu_read_lock();
-	perm = key_acl_to_perm(rcu_dereference(key->acl));
-	rcu_read_unlock();
-
 	/* calculate how much information we're going to return */
 	ret = -ENOMEM;
 	infobuf = kasprintf(GFP_KERNEL,
@@ -682,7 +682,7 @@ okay:
 			    key->type->name,
 			    from_kuid_munged(current_user_ns(), key->uid),
 			    from_kgid_munged(current_user_ns(), key->gid),
-			    perm);
+			    key->perm);
 	if (!infobuf)
 		goto error2;
 	infolen = strlen(infobuf);
@@ -899,7 +899,7 @@ long keyctl_chown_key(key_serial_t id, uid_t user, gid_t group)
 		goto error;
 
 	key_ref = lookup_user_key(id, KEY_LOOKUP_CREATE | KEY_LOOKUP_PARTIAL,
-				  KEY_NEED_SETSEC);
+				  KEY_NEED_SETATTR);
 	if (IS_ERR(key_ref)) {
 		ret = PTR_ERR(key_ref);
 		goto error;
@@ -994,25 +994,18 @@ quota_overrun:
  * the key need not be fully instantiated yet.  If the caller does not have
  * sysadmin capability, it may only change the permission on keys that it owns.
  */
-long keyctl_setperm_key(key_serial_t id, unsigned int perm)
+long keyctl_setperm_key(key_serial_t id, key_perm_t perm)
 {
-	struct key_acl *acl;
 	struct key *key;
 	key_ref_t key_ref;
 	long ret;
-	int nr, i, j;
 
+	ret = -EINVAL;
 	if (perm & ~(KEY_POS_ALL | KEY_USR_ALL | KEY_GRP_ALL | KEY_OTH_ALL))
-		return -EINVAL;
-
-	nr = 0;
-	if (perm & KEY_POS_ALL) nr++;
-	if (perm & KEY_USR_ALL) nr++;
-	if (perm & KEY_GRP_ALL) nr++;
-	if (perm & KEY_OTH_ALL) nr++;
+		goto error;
 
 	key_ref = lookup_user_key(id, KEY_LOOKUP_CREATE | KEY_LOOKUP_PARTIAL,
-				  KEY_NEED_SETSEC);
+				  KEY_NEED_SETATTR);
 	if (IS_ERR(key_ref)) {
 		ret = PTR_ERR(key_ref);
 		goto error;
@@ -1020,45 +1013,17 @@ long keyctl_setperm_key(key_serial_t id, unsigned int perm)
 
 	key = key_ref_to_ptr(key_ref);
 
-	ret = -EOPNOTSUPP;
-	if (test_bit(KEY_FLAG_HAS_ACL, &key->flags))
-		goto error_key;
+	/* make the changes with the locks held to prevent chown/chmod races */
+	ret = -EACCES;
+	down_write(&key->sem);
 
-	ret = -ENOMEM;
-	acl = kzalloc(struct_size(acl, aces, nr), GFP_KERNEL);
-	if (!acl)
-		goto error_key;
-
-	refcount_set(&acl->usage, 1);
-	acl->nr_ace = nr;
-	j = 0;
-	for (i = 0; i < 4; i++) {
-		struct key_ace *ace = &acl->aces[j];
-		unsigned int subset = (perm >> (i * 8)) & KEY_OTH_ALL;
-
-		if (!subset)
-			continue;
-		ace->type = KEY_ACE_SUBJ_STANDARD;
-		ace->subject_id = KEY_ACE_EVERYONE + i;
-		ace->perm = subset;
-		if (subset & (KEY_OTH_WRITE | KEY_OTH_SETATTR))
-			ace->perm |= KEY_ACE_REVOKE;
-		if (subset & KEY_OTH_SEARCH)
-			ace->perm |= KEY_ACE_INVAL;
-		if (key->type == &key_type_keyring) {
-			if (subset & KEY_OTH_SEARCH)
-				ace->perm |= KEY_ACE_JOIN;
-			if (subset & KEY_OTH_WRITE)
-				ace->perm |= KEY_ACE_CLEAR;
-		}
-		j++;
+	/* if we're not the sysadmin, we can only change a key that we own */
+	if (capable(CAP_SYS_ADMIN) || uid_eq(key->uid, current_fsuid())) {
+		key->perm = perm;
+		ret = 0;
 	}
 
-	/* make the changes with the locks held to prevent chown/chmod races */
-	down_write(&key->sem);
-	ret = key_set_acl(key, acl);
 	up_write(&key->sem);
-error_key:
 	key_put(key);
 error:
 	return ret;
@@ -1423,7 +1388,7 @@ long keyctl_set_timeout(key_serial_t id, unsigned timeout)
 	long ret;
 
 	key_ref = lookup_user_key(id, KEY_LOOKUP_CREATE | KEY_LOOKUP_PARTIAL,
-				  KEY_NEED_SETSEC);
+				  KEY_NEED_SETATTR);
 	if (IS_ERR(key_ref)) {
 		/* setting the timeout on a key under construction is permitted
 		 * if we have the authorisation token handy */
@@ -1574,7 +1539,7 @@ long keyctl_get_security(key_serial_t keyid,
  * Attempt to install the calling process's session keyring on the process's
  * parent process.
  *
- * The keyring must exist and must grant the caller JOIN permission, and the
+ * The keyring must exist and must grant the caller LINK permission, and the
  * parent process must be single-threaded and must have the same effective
  * ownership as this process and mustn't be SUID/SGID.
  *
@@ -1591,7 +1556,7 @@ long keyctl_session_to_parent(void)
 	struct cred *cred;
 	int ret;
 
-	keyring_r = lookup_user_key(KEY_SPEC_SESSION_KEYRING, 0, KEY_NEED_JOIN);
+	keyring_r = lookup_user_key(KEY_SPEC_SESSION_KEYRING, 0, KEY_NEED_LINK);
 	if (IS_ERR(keyring_r))
 		return PTR_ERR(keyring_r);
 
@@ -1693,7 +1658,7 @@ long keyctl_restrict_keyring(key_serial_t id, const char __user *_type,
 	char *restriction = NULL;
 	long ret;
 
-	key_ref = lookup_user_key(id, 0, KEY_NEED_SETSEC);
+	key_ref = lookup_user_key(id, 0, KEY_NEED_SETATTR);
 	if (IS_ERR(key_ref))
 		return PTR_ERR(key_ref);
 
@@ -1799,7 +1764,7 @@ SYSCALL_DEFINE5(keyctl, int, option, unsigned long, arg2, unsigned long, arg3,
 
 	case KEYCTL_SETPERM:
 		return keyctl_setperm_key((key_serial_t) arg2,
-					  (unsigned int)arg3);
+					  (key_perm_t) arg3);
 
 	case KEYCTL_INSTANTIATE:
 		return keyctl_instantiate_key((key_serial_t) arg2,
@@ -1888,11 +1853,6 @@ SYSCALL_DEFINE5(keyctl, int, option, unsigned long, arg2, unsigned long, arg3,
 					   (key_serial_t)arg3,
 					   (key_serial_t)arg4,
 					   (unsigned int)arg5);
-	case KEYCTL_GRANT_PERMISSION:
-		return keyctl_grant_permission((key_serial_t)arg2,
-					       (enum key_ace_subject_type)arg3,
-					       (unsigned int)arg4,
-					       (unsigned int)arg5);
 
 	case KEYCTL_CAPABILITIES:
 		return keyctl_capabilities((unsigned char __user *)arg2, (size_t)arg3);
diff --git a/security/keys/keyring.c b/security/keys/keyring.c
index 3b5458f23a95..febf36c6ddc5 100644
--- a/security/keys/keyring.c
+++ b/security/keys/keyring.c
@@ -515,19 +515,11 @@ static long keyring_read(const struct key *keyring,
 	return ret;
 }
 
-/**
- * keyring_alloc - Allocate a keyring and link into the destination
- * @description: The key description to allow the key to be searched out.
- * @uid: The owner of the new key.
- * @gid: The group ID for the new key's group permissions.
- * @cred: The credentials specifying UID namespace.
- * @acl: The ACL to attach to the new key.
- * @flags: Flags specifying quota properties.
- * @restrict_link: Optional link restriction for new keyrings.
- * @dest: Destination keyring.
+/*
+ * Allocate a keyring and link into the destination keyring.
  */
 struct key *keyring_alloc(const char *description, kuid_t uid, kgid_t gid,
-			  const struct cred *cred, struct key_acl *acl,
+			  const struct cred *cred, key_perm_t perm,
 			  unsigned long flags,
 			  struct key_restriction *restrict_link,
 			  struct key *dest)
@@ -536,7 +528,7 @@ struct key *keyring_alloc(const char *description, kuid_t uid, kgid_t gid,
 	int ret;
 
 	keyring = key_alloc(&key_type_keyring, description,
-			    uid, gid, cred, acl, flags, restrict_link);
+			    uid, gid, cred, perm, flags, restrict_link);
 	if (!IS_ERR(keyring)) {
 		ret = key_instantiate_and_link(keyring, NULL, 0, dest, NULL);
 		if (ret < 0) {
@@ -1140,11 +1132,10 @@ found:
 /*
  * Find a keyring with the specified name.
  *
- * Only keyrings that have nonzero refcount, are not revoked, and are owned by
- * a user in the current user namespace are considered.  If @uid_keyring is
- * %true, the keyring additionally must have been allocated as a user or user
- * session keyring; otherwise, it must grant JOIN permission directly to the
- * caller (ie. not through possession).
+ * Only keyrings that have nonzero refcount, are not revoked, and are owned by a
+ * user in the current user namespace are considered.  If @uid_keyring is %true,
+ * the keyring additionally must have been allocated as a user or user session
+ * keyring; otherwise, it must grant Search permission directly to the caller.
  *
  * Returns a pointer to the keyring with the keyring's refcount having being
  * incremented on success.  -ENOKEY is returned if a key could not be found.
@@ -1178,7 +1169,7 @@ struct key *find_keyring_by_name(const char *name, bool uid_keyring)
 				continue;
 		} else {
 			if (key_permission(make_key_ref(keyring, 0),
-					   KEY_NEED_JOIN) < 0)
+					   KEY_NEED_SEARCH) < 0)
 				continue;
 		}
 
diff --git a/security/keys/permission.c b/security/keys/permission.c
index fd8a5dc6910a..085f907b64ac 100644
--- a/security/keys/permission.c
+++ b/security/keys/permission.c
@@ -7,67 +7,13 @@
 
 #include <linux/export.h>
 #include <linux/security.h>
-#include <linux/user_namespace.h>
-#include <linux/uaccess.h>
 #include "internal.h"
 
-struct key_acl default_key_acl = {
-	.usage	= REFCOUNT_INIT(1),
-	.nr_ace	= 2,
-	.possessor_viewable = true,
-	.aces = {
-		KEY_POSSESSOR_ACE(KEY_ACE__PERMS & ~KEY_ACE_JOIN),
-		KEY_OWNER_ACE(KEY_ACE_VIEW),
-	}
-};
-EXPORT_SYMBOL(default_key_acl);
-
-struct key_acl joinable_keyring_acl = {
-	.usage	= REFCOUNT_INIT(1),
-	.nr_ace	= 2,
-	.possessor_viewable = true,
-	.aces	= {
-		KEY_POSSESSOR_ACE(KEY_ACE__PERMS & ~KEY_ACE_JOIN),
-		KEY_OWNER_ACE(KEY_ACE_VIEW | KEY_ACE_READ | KEY_ACE_LINK | KEY_ACE_JOIN),
-	}
-};
-EXPORT_SYMBOL(joinable_keyring_acl);
-
-struct key_acl internal_key_acl = {
-	.usage	= REFCOUNT_INIT(1),
-	.nr_ace	= 2,
-	.aces = {
-		KEY_POSSESSOR_ACE(KEY_ACE_SEARCH),
-		KEY_OWNER_ACE(KEY_ACE_VIEW | KEY_ACE_READ | KEY_ACE_SEARCH),
-	}
-};
-EXPORT_SYMBOL(internal_key_acl);
-
-struct key_acl internal_keyring_acl = {
-	.usage	= REFCOUNT_INIT(1),
-	.nr_ace	= 2,
-	.aces = {
-		KEY_POSSESSOR_ACE(KEY_ACE_SEARCH),
-		KEY_OWNER_ACE(KEY_ACE_VIEW | KEY_ACE_READ | KEY_ACE_SEARCH),
-	}
-};
-EXPORT_SYMBOL(internal_keyring_acl);
-
-struct key_acl internal_writable_keyring_acl = {
-	.usage	= REFCOUNT_INIT(1),
-	.nr_ace	= 2,
-	.aces = {
-		KEY_POSSESSOR_ACE(KEY_ACE_SEARCH | KEY_ACE_WRITE),
-		KEY_OWNER_ACE(KEY_ACE_VIEW | KEY_ACE_READ | KEY_ACE_WRITE | KEY_ACE_SEARCH),
-	}
-};
-EXPORT_SYMBOL(internal_writable_keyring_acl);
-
 /**
  * key_task_permission - Check a key can be used
  * @key_ref: The key to check.
  * @cred: The credentials to use.
- * @desired_perm: The permission to check for.
+ * @perm: The permissions to check for.
  *
  * Check to see whether permission is granted to use a key in the desired way,
  * but permit the security modules to override.
@@ -78,73 +24,53 @@ EXPORT_SYMBOL(internal_writable_keyring_acl);
  * permissions bits or the LSM check.
  */
 int key_task_permission(const key_ref_t key_ref, const struct cred *cred,
-			unsigned int desired_perm)
+			unsigned perm)
 {
-	const struct key_acl *acl;
-	const struct key *key;
-	unsigned int allow = 0;
-	int i;
-
-	BUILD_BUG_ON(KEY_NEED_VIEW	!= KEY_ACE_VIEW		||
-		     KEY_NEED_READ	!= KEY_ACE_READ		||
-		     KEY_NEED_WRITE	!= KEY_ACE_WRITE	||
-		     KEY_NEED_SEARCH	!= KEY_ACE_SEARCH	||
-		     KEY_NEED_LINK	!= KEY_ACE_LINK		||
-		     KEY_NEED_SETSEC	!= KEY_ACE_SET_SECURITY	||
-		     KEY_NEED_INVAL	!= KEY_ACE_INVAL	||
-		     KEY_NEED_REVOKE	!= KEY_ACE_REVOKE	||
-		     KEY_NEED_JOIN	!= KEY_ACE_JOIN		||
-		     KEY_NEED_CLEAR	!= KEY_ACE_CLEAR);
+	struct key *key;
+	key_perm_t kperm;
+	int ret;
 
 	key = key_ref_to_ptr(key_ref);
 
-	rcu_read_lock();
-
-	acl = rcu_dereference(key->acl);
-	if (!acl || acl->nr_ace == 0)
-		goto no_access_rcu;
+	/* use the second 8-bits of permissions for keys the caller owns */
+	if (uid_eq(key->uid, cred->fsuid)) {
+		kperm = key->perm >> 16;
+		goto use_these_perms;
+	}
 
-	for (i = 0; i < acl->nr_ace; i++) {
-		const struct key_ace *ace = &acl->aces[i];
+	/* use the third 8-bits of permissions for keys the caller has a group
+	 * membership in common with */
+	if (gid_valid(key->gid) && key->perm & KEY_GRP_ALL) {
+		if (gid_eq(key->gid, cred->fsgid)) {
+			kperm = key->perm >> 8;
+			goto use_these_perms;
+		}
 
-		switch (ace->type) {
-		case KEY_ACE_SUBJ_STANDARD:
-			switch (ace->subject_id) {
-			case KEY_ACE_POSSESSOR:
-				if (is_key_possessed(key_ref))
-					allow |= ace->perm;
-				break;
-			case KEY_ACE_OWNER:
-				if (uid_eq(key->uid, cred->fsuid))
-					allow |= ace->perm;
-				break;
-			case KEY_ACE_GROUP:
-				if (gid_valid(key->gid)) {
-					if (gid_eq(key->gid, cred->fsgid))
-						allow |= ace->perm;
-					else if (groups_search(cred->group_info, key->gid))
-						allow |= ace->perm;
-				}
-				break;
-			case KEY_ACE_EVERYONE:
-				allow |= ace->perm;
-				break;
-			}
-			break;
+		ret = groups_search(cred->group_info, key->gid);
+		if (ret) {
+			kperm = key->perm >> 8;
+			goto use_these_perms;
 		}
 	}
 
-	rcu_read_unlock();
+	/* otherwise use the least-significant 8-bits */
+	kperm = key->perm;
+
+use_these_perms:
 
-	if (!(allow & desired_perm))
-		goto no_access;
+	/* use the top 8-bits of permissions for keys the caller possesses
+	 * - possessor permissions are additive with other permissions
+	 */
+	if (is_key_possessed(key_ref))
+		kperm |= key->perm >> 24;
 
-	return security_key_permission(key_ref, cred, desired_perm);
+	kperm = kperm & perm & KEY_NEED_ALL;
 
-no_access_rcu:
-	rcu_read_unlock();
-no_access:
-	return -EACCES;
+	if (kperm != perm)
+		return -EACCES;
+
+	/* let LSM be the final arbiter */
+	return security_key_permission(key_ref, cred, perm);
 }
 EXPORT_SYMBOL(key_task_permission);
 
@@ -178,218 +104,3 @@ int key_validate(const struct key *key)
 	return 0;
 }
 EXPORT_SYMBOL(key_validate);
-
-/*
- * Roughly render an ACL to an old-style permissions mask.  We cannot
- * accurately render what the ACL, particularly if it has ACEs that represent
- * subjects outside of { poss, user, group, other }.
- */
-unsigned int key_acl_to_perm(const struct key_acl *acl)
-{
-	unsigned int perm = 0, tperm;
-	int i;
-
-	BUILD_BUG_ON(KEY_OTH_VIEW	!= KEY_ACE_VIEW		||
-		     KEY_OTH_READ	!= KEY_ACE_READ		||
-		     KEY_OTH_WRITE	!= KEY_ACE_WRITE	||
-		     KEY_OTH_SEARCH	!= KEY_ACE_SEARCH	||
-		     KEY_OTH_LINK	!= KEY_ACE_LINK		||
-		     KEY_OTH_SETATTR	!= KEY_ACE_SET_SECURITY);
-
-	if (!acl || acl->nr_ace == 0)
-		return 0;
-
-	for (i = 0; i < acl->nr_ace; i++) {
-		const struct key_ace *ace = &acl->aces[i];
-
-		switch (ace->type) {
-		case KEY_ACE_SUBJ_STANDARD:
-			tperm = ace->perm & KEY_OTH_ALL;
-
-			/* Invalidation and joining were allowed by SEARCH */
-			if (ace->perm & (KEY_ACE_INVAL | KEY_ACE_JOIN))
-				tperm |= KEY_OTH_SEARCH;
-
-			/* Revocation was allowed by either SETATTR or WRITE */
-			if ((ace->perm & KEY_ACE_REVOKE) && !(tperm & KEY_OTH_SETATTR))
-				tperm |= KEY_OTH_WRITE;
-
-			/* Clearing was allowed by WRITE */
-			if (ace->perm & KEY_ACE_CLEAR)
-				tperm |= KEY_OTH_WRITE;
-
-			switch (ace->subject_id) {
-			case KEY_ACE_POSSESSOR:
-				perm |= tperm << 24;
-				break;
-			case KEY_ACE_OWNER:
-				perm |= tperm << 16;
-				break;
-			case KEY_ACE_GROUP:
-				perm |= tperm << 8;
-				break;
-			case KEY_ACE_EVERYONE:
-				perm |= tperm << 0;
-				break;
-			}
-		}
-	}
-
-	return perm;
-}
-
-/*
- * Destroy a key's ACL.
- */
-void key_put_acl(struct key_acl *acl)
-{
-	if (acl && refcount_dec_and_test(&acl->usage))
-		kfree_rcu(acl, rcu);
-}
-
-/*
- * Try to set the ACL.  This either attaches or discards the proposed ACL.
- */
-long key_set_acl(struct key *key, struct key_acl *acl)
-{
-	int i;
-
-	/* If we're not the sysadmin, we can only change a key that we own. */
-	if (!capable(CAP_SYS_ADMIN) && !uid_eq(key->uid, current_fsuid())) {
-		key_put_acl(acl);
-		return -EACCES;
-	}
-
-	for (i = 0; i < acl->nr_ace; i++) {
-		const struct key_ace *ace = &acl->aces[i];
-		if (ace->type == KEY_ACE_SUBJ_STANDARD &&
-		    ace->subject_id == KEY_ACE_POSSESSOR) {
-			if (ace->perm & KEY_ACE_VIEW)
-				acl->possessor_viewable = true;
-			break;
-		}
-	}
-
-	rcu_swap_protected(key->acl, acl, lockdep_is_held(&key->sem));
-	key_put_acl(acl);
-	return 0;
-}
-
-/*
- * Allocate a new ACL with an extra ACE slot.
- */
-static struct key_acl *key_alloc_acl(const struct key_acl *old_acl, int nr, int skip)
-{
-	struct key_acl *acl;
-	int nr_ace, i, j = 0;
-
-	nr_ace = old_acl->nr_ace + nr;
-	if (nr_ace > 16)
-		return ERR_PTR(-EINVAL);
-
-	acl = kzalloc(struct_size(acl, aces, nr_ace), GFP_KERNEL);
-	if (!acl)
-		return ERR_PTR(-ENOMEM);
-
-	refcount_set(&acl->usage, 1);
-	acl->nr_ace = nr_ace;
-	for (i = 0; i < old_acl->nr_ace; i++) {
-		if (i == skip)
-			continue;
-		acl->aces[j] = old_acl->aces[i];
-		j++;
-	}
-	return acl;
-}
-
-/*
- * Generate the revised ACL.
- */
-static long key_change_acl(struct key *key, struct key_ace *new_ace)
-{
-	struct key_acl *acl, *old;
-	int i;
-
-	old = rcu_dereference_protected(key->acl, lockdep_is_held(&key->sem));
-
-	for (i = 0; i < old->nr_ace; i++)
-		if (old->aces[i].type == new_ace->type &&
-		    old->aces[i].subject_id == new_ace->subject_id)
-			goto found_match;
-
-	if (new_ace->perm == 0)
-		return 0; /* No permissions to remove.  Add deny record? */
-
-	acl = key_alloc_acl(old, 1, -1);
-	if (IS_ERR(acl))
-		return PTR_ERR(acl);
-	acl->aces[i] = *new_ace;
-	goto change;
-
-found_match:
-	if (new_ace->perm == 0)
-		goto delete_ace;
-	if (new_ace->perm == old->aces[i].perm)
-		return 0;
-	acl = key_alloc_acl(old, 0, -1);
-	if (IS_ERR(acl))
-		return PTR_ERR(acl);
-	acl->aces[i].perm = new_ace->perm;
-	goto change;
-
-delete_ace:
-	acl = key_alloc_acl(old, -1, i);
-	if (IS_ERR(acl))
-		return PTR_ERR(acl);
-	goto change;
-
-change:
-	return key_set_acl(key, acl);
-}
-
-/*
- * Add, alter or remove (if perm == 0) an ACE in a key's ACL.
- */
-long keyctl_grant_permission(key_serial_t keyid,
-			     enum key_ace_subject_type type,
-			     unsigned int subject,
-			     unsigned int perm)
-{
-	struct key_ace new_ace;
-	struct key *key;
-	key_ref_t key_ref;
-	long ret;
-
-	new_ace.type = type;
-	new_ace.perm = perm;
-
-	switch (type) {
-	case KEY_ACE_SUBJ_STANDARD:
-		if (subject >= nr__key_ace_standard_subject)
-			return -ENOENT;
-		new_ace.subject_id = subject;
-		break;
-
-	default:
-		return -ENOENT;
-	}
-
-	key_ref = lookup_user_key(keyid, KEY_LOOKUP_PARTIAL, KEY_NEED_SETSEC);
-	if (IS_ERR(key_ref)) {
-		ret = PTR_ERR(key_ref);
-		goto error;
-	}
-
-	key = key_ref_to_ptr(key_ref);
-
-	down_write(&key->sem);
-
-	/* If we're not the sysadmin, we can only change a key that we own */
-	ret = -EACCES;
-	if (capable(CAP_SYS_ADMIN) || uid_eq(key->uid, current_fsuid()))
-		ret = key_change_acl(key, &new_ace);
-	up_write(&key->sem);
-	key_put(key);
-error:
-	return ret;
-}
diff --git a/security/keys/persistent.c b/security/keys/persistent.c
index 8171c90d4c9a..97af230aa4b2 100644
--- a/security/keys/persistent.c
+++ b/security/keys/persistent.c
@@ -12,27 +12,6 @@
 
 unsigned persistent_keyring_expiry = 3 * 24 * 3600; /* Expire after 3 days of non-use */
 
-static struct key_acl persistent_register_keyring_acl = {
-	.usage	= REFCOUNT_INIT(1),
-	.nr_ace	= 2,
-	.aces = {
-		KEY_POSSESSOR_ACE(KEY_ACE_SEARCH | KEY_ACE_WRITE),
-		KEY_OWNER_ACE(KEY_ACE_VIEW | KEY_ACE_READ),
-	}
-};
-
-static struct key_acl persistent_keyring_acl = {
-	.usage	= REFCOUNT_INIT(1),
-	.nr_ace	= 2,
-	.possessor_viewable = true,
-	.aces = {
-		KEY_POSSESSOR_ACE(KEY_ACE_VIEW | KEY_ACE_READ | KEY_ACE_WRITE |
-				  KEY_ACE_SEARCH | KEY_ACE_LINK |
-				  KEY_ACE_CLEAR | KEY_ACE_INVAL),
-		KEY_OWNER_ACE(KEY_ACE_VIEW | KEY_ACE_READ),
-	}
-};
-
 /*
  * Create the persistent keyring register for the current user namespace.
  *
@@ -43,7 +22,8 @@ static int key_create_persistent_register(struct user_namespace *ns)
 	struct key *reg = keyring_alloc(".persistent_register",
 					KUIDT_INIT(0), KGIDT_INIT(0),
 					current_cred(),
-					&persistent_register_keyring_acl,
+					((KEY_POS_ALL & ~KEY_POS_SETATTR) |
+					 KEY_USR_VIEW | KEY_USR_READ),
 					KEY_ALLOC_NOT_IN_QUOTA, NULL, NULL);
 	if (IS_ERR(reg))
 		return PTR_ERR(reg);
@@ -76,7 +56,8 @@ static key_ref_t key_create_persistent(struct user_namespace *ns, kuid_t uid,
 
 	persistent = keyring_alloc(index_key->description,
 				   uid, INVALID_GID, current_cred(),
-				   &persistent_keyring_acl,
+				   ((KEY_POS_ALL & ~KEY_POS_SETATTR) |
+				    KEY_USR_VIEW | KEY_USR_READ),
 				   KEY_ALLOC_NOT_IN_QUOTA, NULL,
 				   ns->persistent_keyring_register);
 	if (IS_ERR(persistent))
diff --git a/security/keys/proc.c b/security/keys/proc.c
index b394ad1e874b..415f3f1c2da0 100644
--- a/security/keys/proc.c
+++ b/security/keys/proc.c
@@ -110,13 +110,11 @@ static struct key *find_ge_key(struct seq_file *p, key_serial_t id)
 }
 
 static void *proc_keys_start(struct seq_file *p, loff_t *_pos)
-	__acquires(rcu)
 	__acquires(key_serial_lock)
 {
 	key_serial_t pos = *_pos;
 	struct key *key;
 
-	rcu_read_lock();
 	spin_lock(&key_serial_lock);
 
 	if (*_pos > INT_MAX)
@@ -146,15 +144,12 @@ static void *proc_keys_next(struct seq_file *p, void *v, loff_t *_pos)
 
 static void proc_keys_stop(struct seq_file *p, void *v)
 	__releases(key_serial_lock)
-	__releases(rcu)
 {
 	spin_unlock(&key_serial_lock);
-	rcu_read_unlock();
 }
 
 static int proc_keys_show(struct seq_file *m, void *v)
 {
-	const struct key_acl *acl;
 	struct rb_node *_p = v;
 	struct key *key = rb_entry(_p, struct key, serial_node);
 	unsigned long flags;
@@ -162,7 +157,6 @@ static int proc_keys_show(struct seq_file *m, void *v)
 	time64_t now, expiry;
 	char xbuf[16];
 	short state;
-	bool check_pos;
 	u64 timo;
 	int rc;
 
@@ -176,15 +170,15 @@ static int proc_keys_show(struct seq_file *m, void *v)
 					   KEYRING_SEARCH_RECURSE),
 	};
 
-	acl = rcu_dereference(key->acl);
-	check_pos = acl->possessor_viewable;
+	key_ref = make_key_ref(key, 0);
 
 	/* determine if the key is possessed by this process (a test we can
 	 * skip if the key does not indicate the possessor can view it
 	 */
-	key_ref = make_key_ref(key, 0);
-	if (check_pos) {
+	if (key->perm & KEY_POS_VIEW) {
+		rcu_read_lock();
 		skey_ref = search_cred_keyrings_rcu(&ctx);
+		rcu_read_unlock();
 		if (!IS_ERR(skey_ref)) {
 			key_ref_put(skey_ref);
 			key_ref = make_key_ref(key, 1);
@@ -194,10 +188,12 @@ static int proc_keys_show(struct seq_file *m, void *v)
 	/* check whether the current task is allowed to view the key */
 	rc = key_task_permission(key_ref, ctx.cred, KEY_NEED_VIEW);
 	if (rc < 0)
-		goto out;
+		return 0;
 
 	now = ktime_get_real_seconds();
 
+	rcu_read_lock();
+
 	/* come up with a suitable timeout value */
 	expiry = READ_ONCE(key->expiry);
 	if (expiry == 0) {
@@ -236,7 +232,7 @@ static int proc_keys_show(struct seq_file *m, void *v)
 		   showflag(flags, 'i', KEY_FLAG_INVALIDATED),
 		   refcount_read(&key->usage),
 		   xbuf,
-		   key_acl_to_perm(acl),
+		   key->perm,
 		   from_kuid_munged(seq_user_ns(m), key->uid),
 		   from_kgid_munged(seq_user_ns(m), key->gid),
 		   key->type->name);
@@ -247,7 +243,7 @@ static int proc_keys_show(struct seq_file *m, void *v)
 		key->type->describe(key, m);
 	seq_putc(m, '\n');
 
-out:
+	rcu_read_unlock();
 	return 0;
 }
 
diff --git a/security/keys/process_keys.c b/security/keys/process_keys.c
index aa3bfcadbc66..09541de31f2f 100644
--- a/security/keys/process_keys.c
+++ b/security/keys/process_keys.c
@@ -32,47 +32,6 @@ struct key_user root_key_user = {
 	.uid		= GLOBAL_ROOT_UID,
 };
 
-static struct key_acl user_reg_keyring_acl = {
-	.usage	= REFCOUNT_INIT(1),
-	.possessor_viewable = true,
-	.nr_ace	= 2,
-	.aces = {
-		KEY_POSSESSOR_ACE(KEY_ACE_WRITE | KEY_ACE_SEARCH),
-		KEY_OWNER_ACE(KEY_ACE_VIEW | KEY_ACE_READ),
-	}
-};
-
-static struct key_acl user_keyring_acl = {
-	.usage	= REFCOUNT_INIT(1),
-	.possessor_viewable = true,
-	.nr_ace	= 2,
-	.aces = {
-		KEY_POSSESSOR_ACE(KEY_ACE_VIEW | KEY_ACE_READ | KEY_ACE_WRITE |
-				  KEY_ACE_SEARCH | KEY_ACE_LINK),
-		KEY_OWNER_ACE(KEY_ACE__PERMS & ~(KEY_ACE_JOIN | KEY_ACE_SET_SECURITY)),
-	}
-};
-
-static struct key_acl session_keyring_acl = {
-	.usage	= REFCOUNT_INIT(1),
-	.possessor_viewable = true,
-	.nr_ace	= 2,
-	.aces = {
-		KEY_POSSESSOR_ACE(KEY_ACE__PERMS & ~KEY_ACE_JOIN),
-		KEY_OWNER_ACE(KEY_ACE_VIEW | KEY_ACE_READ),
-	}
-};
-
-static struct key_acl thread_and_process_keyring_acl = {
-	.usage	= REFCOUNT_INIT(1),
-	.possessor_viewable = true,
-	.nr_ace	= 2,
-	.aces = {
-		KEY_POSSESSOR_ACE(KEY_ACE__PERMS & ~(KEY_ACE_JOIN | KEY_ACE_SET_SECURITY)),
-		KEY_OWNER_ACE(KEY_ACE_VIEW),
-	}
-};
-
 /*
  * Get or create a user register keyring.
  */
@@ -92,8 +51,11 @@ static struct key *get_user_register(struct user_namespace *user_ns)
 	if (!reg_keyring) {
 		reg_keyring = keyring_alloc(".user_reg",
 					    user_ns->owner, INVALID_GID,
-					    &init_cred, &user_reg_keyring_acl,
-					    0, NULL, NULL);
+					    &init_cred,
+					    KEY_POS_WRITE | KEY_POS_SEARCH |
+					    KEY_USR_VIEW | KEY_USR_READ,
+					    0,
+					    NULL, NULL);
 		if (!IS_ERR(reg_keyring))
 			smp_store_release(&user_ns->user_keyring_register,
 					  reg_keyring);
@@ -115,11 +77,14 @@ int look_up_user_keyrings(struct key **_user_keyring,
 	const struct cred *cred = current_cred();
 	struct user_namespace *user_ns = current_user_ns();
 	struct key *reg_keyring, *uid_keyring, *session_keyring;
+	key_perm_t user_keyring_perm;
 	key_ref_t uid_keyring_r, session_keyring_r;
 	uid_t uid = from_kuid(user_ns, cred->user->uid);
 	char buf[20];
 	int ret;
 
+	user_keyring_perm = (KEY_POS_ALL & ~KEY_POS_SETATTR) | KEY_USR_ALL;
+
 	kenter("%u", uid);
 
 	reg_keyring = get_user_register(user_ns);
@@ -139,7 +104,7 @@ int look_up_user_keyrings(struct key **_user_keyring,
 	kdebug("_uid %p", uid_keyring_r);
 	if (uid_keyring_r == ERR_PTR(-EAGAIN)) {
 		uid_keyring = keyring_alloc(buf, cred->user->uid, INVALID_GID,
-					    cred, &user_keyring_acl,
+					    cred, user_keyring_perm,
 					    KEY_ALLOC_UID_KEYRING |
 					    KEY_ALLOC_IN_QUOTA,
 					    NULL, reg_keyring);
@@ -161,7 +126,7 @@ int look_up_user_keyrings(struct key **_user_keyring,
 	kdebug("_uid_ses %p", session_keyring_r);
 	if (session_keyring_r == ERR_PTR(-EAGAIN)) {
 		session_keyring = keyring_alloc(buf, cred->user->uid, INVALID_GID,
-						cred, &user_keyring_acl,
+						cred, user_keyring_perm,
 						KEY_ALLOC_UID_KEYRING |
 						KEY_ALLOC_IN_QUOTA,
 						NULL, NULL);
@@ -261,7 +226,7 @@ int install_thread_keyring_to_cred(struct cred *new)
 		return 0;
 
 	keyring = keyring_alloc("_tid", new->uid, new->gid, new,
-				&thread_and_process_keyring_acl,
+				KEY_POS_ALL | KEY_USR_VIEW,
 				KEY_ALLOC_QUOTA_OVERRUN,
 				NULL, NULL);
 	if (IS_ERR(keyring))
@@ -308,7 +273,7 @@ int install_process_keyring_to_cred(struct cred *new)
 		return 0;
 
 	keyring = keyring_alloc("_pid", new->uid, new->gid, new,
-				&thread_and_process_keyring_acl,
+				KEY_POS_ALL | KEY_USR_VIEW,
 				KEY_ALLOC_QUOTA_OVERRUN,
 				NULL, NULL);
 	if (IS_ERR(keyring))
@@ -363,7 +328,8 @@ int install_session_keyring_to_cred(struct cred *cred, struct key *keyring)
 			flags = KEY_ALLOC_IN_QUOTA;
 
 		keyring = keyring_alloc("_ses", cred->uid, cred->gid, cred,
-					&session_keyring_acl, flags, NULL, NULL);
+					KEY_POS_ALL | KEY_USR_VIEW | KEY_USR_READ,
+					flags, NULL, NULL);
 		if (IS_ERR(keyring))
 			return PTR_ERR(keyring);
 	} else {
@@ -643,7 +609,7 @@ bool lookup_user_key_possessed(const struct key *key,
  * returned key reference.
  */
 key_ref_t lookup_user_key(key_serial_t id, unsigned long lflags,
-			  unsigned int desired_perm)
+			  key_perm_t perm)
 {
 	struct keyring_search_context ctx = {
 		.match_data.cmp		= lookup_user_key_possessed,
@@ -818,12 +784,12 @@ try_again:
 		case -ERESTARTSYS:
 			goto invalid_key;
 		default:
-			if (desired_perm)
+			if (perm)
 				goto invalid_key;
 		case 0:
 			break;
 		}
-	} else if (desired_perm) {
+	} else if (perm) {
 		ret = key_validate(key);
 		if (ret < 0)
 			goto invalid_key;
@@ -835,11 +801,9 @@ try_again:
 		goto invalid_key;
 
 	/* check the permissions */
-	if (desired_perm) {
-		ret = key_task_permission(key_ref, ctx.cred, desired_perm);
-		if (ret < 0)
-			goto invalid_key;
-	}
+	ret = key_task_permission(key_ref, ctx.cred, perm);
+	if (ret < 0)
+		goto invalid_key;
 
 	key->last_used_at = ktime_get_real_seconds();
 
@@ -904,13 +868,13 @@ long join_session_keyring(const char *name)
 	if (PTR_ERR(keyring) == -ENOKEY) {
 		/* not found - try and create a new one */
 		keyring = keyring_alloc(
-			name, old->uid, old->gid, old, &joinable_keyring_acl,
+			name, old->uid, old->gid, old,
+			KEY_POS_ALL | KEY_USR_VIEW | KEY_USR_READ | KEY_USR_LINK,
 			KEY_ALLOC_IN_QUOTA, NULL, NULL);
 		if (IS_ERR(keyring)) {
 			ret = PTR_ERR(keyring);
 			goto error2;
 		}
-		goto no_perm_test;
 	} else if (IS_ERR(keyring)) {
 		ret = PTR_ERR(keyring);
 		goto error2;
@@ -919,12 +883,6 @@ long join_session_keyring(const char *name)
 		goto error3;
 	}
 
-	ret = key_task_permission(make_key_ref(keyring, false), old,
-				  KEY_NEED_JOIN);
-	if (ret < 0)
-		goto error3;
-
-no_perm_test:
 	/* we've got a keyring - now to install it */
 	ret = install_session_keyring_to_cred(new, keyring);
 	if (ret < 0)
diff --git a/security/keys/request_key.c b/security/keys/request_key.c
index 46c5187ce03f..7325f382dbf4 100644
--- a/security/keys/request_key.c
+++ b/security/keys/request_key.c
@@ -135,7 +135,8 @@ static int call_sbin_request_key(struct key *authkey, void *aux)
 
 	cred = get_current_cred();
 	keyring = keyring_alloc(desc, cred->fsuid, cred->fsgid, cred,
-				NULL, KEY_ALLOC_QUOTA_OVERRUN, NULL, NULL);
+				KEY_POS_ALL | KEY_USR_VIEW | KEY_USR_READ,
+				KEY_ALLOC_QUOTA_OVERRUN, NULL, NULL);
 	put_cred(cred);
 	if (IS_ERR(keyring)) {
 		ret = PTR_ERR(keyring);
@@ -366,11 +367,11 @@ static int construct_alloc_key(struct keyring_search_context *ctx,
 			       struct key *dest_keyring,
 			       unsigned long flags,
 			       struct key_user *user,
-			       struct key_acl *acl,
 			       struct key **_key)
 {
 	struct assoc_array_edit *edit = NULL;
 	struct key *key;
+	key_perm_t perm;
 	key_ref_t key_ref;
 	int ret;
 
@@ -380,9 +381,17 @@ static int construct_alloc_key(struct keyring_search_context *ctx,
 	*_key = NULL;
 	mutex_lock(&user->cons_lock);
 
+	perm = KEY_POS_VIEW | KEY_POS_SEARCH | KEY_POS_LINK | KEY_POS_SETATTR;
+	perm |= KEY_USR_VIEW;
+	if (ctx->index_key.type->read)
+		perm |= KEY_POS_READ;
+	if (ctx->index_key.type == &key_type_keyring ||
+	    ctx->index_key.type->update)
+		perm |= KEY_POS_WRITE;
+
 	key = key_alloc(ctx->index_key.type, ctx->index_key.description,
 			ctx->cred->fsuid, ctx->cred->fsgid, ctx->cred,
-			acl, flags, NULL);
+			perm, flags, NULL);
 	if (IS_ERR(key))
 		goto alloc_failed;
 
@@ -465,7 +474,6 @@ static struct key *construct_key_and_link(struct keyring_search_context *ctx,
 					  const char *callout_info,
 					  size_t callout_len,
 					  void *aux,
-					  struct key_acl *acl,
 					  struct key *dest_keyring,
 					  unsigned long flags)
 {
@@ -488,7 +496,7 @@ static struct key *construct_key_and_link(struct keyring_search_context *ctx,
 		goto error_put_dest_keyring;
 	}
 
-	ret = construct_alloc_key(ctx, dest_keyring, flags, user, acl, &key);
+	ret = construct_alloc_key(ctx, dest_keyring, flags, user, &key);
 	key_user_put(user);
 
 	if (ret == 0) {
@@ -526,7 +534,6 @@ error:
  * @callout_info: The data to pass to the instantiation upcall (or NULL).
  * @callout_len: The length of callout_info.
  * @aux: Auxiliary data for the upcall.
- * @acl: The ACL to attach if a new key is created.
  * @dest_keyring: Where to cache the key.
  * @flags: Flags to key_alloc().
  *
@@ -554,7 +561,6 @@ struct key *request_key_and_link(struct key_type *type,
 				 const void *callout_info,
 				 size_t callout_len,
 				 void *aux,
-				 struct key_acl *acl,
 				 struct key *dest_keyring,
 				 unsigned long flags)
 {
@@ -629,7 +635,7 @@ struct key *request_key_and_link(struct key_type *type,
 			goto error_free;
 
 		key = construct_key_and_link(&ctx, callout_info, callout_len,
-					     aux, acl, dest_keyring, flags);
+					     aux, dest_keyring, flags);
 	}
 
 error_free:
@@ -672,7 +678,6 @@ EXPORT_SYMBOL(wait_for_key_construction);
  * @description: The searchable description of the key.
  * @domain_tag: The domain in which the key operates.
  * @callout_info: The data to pass to the instantiation upcall (or NULL).
- * @acl: The ACL to attach if a new key is created.
  *
  * As for request_key_and_link() except that it does not add the returned key
  * to a keyring if found, new keys are always allocated in the user's quota,
@@ -685,8 +690,7 @@ EXPORT_SYMBOL(wait_for_key_construction);
 struct key *request_key_tag(struct key_type *type,
 			    const char *description,
 			    struct key_tag *domain_tag,
-			    const char *callout_info,
-			    struct key_acl *acl)
+			    const char *callout_info)
 {
 	struct key *key;
 	size_t callout_len = 0;
@@ -696,7 +700,7 @@ struct key *request_key_tag(struct key_type *type,
 		callout_len = strlen(callout_info);
 	key = request_key_and_link(type, description, domain_tag,
 				   callout_info, callout_len,
-				   NULL, acl, NULL, KEY_ALLOC_IN_QUOTA);
+				   NULL, NULL, KEY_ALLOC_IN_QUOTA);
 	if (!IS_ERR(key)) {
 		ret = wait_for_key_construction(key, false);
 		if (ret < 0) {
@@ -716,7 +720,6 @@ EXPORT_SYMBOL(request_key_tag);
  * @callout_info: The data to pass to the instantiation upcall (or NULL).
  * @callout_len: The length of callout_info.
  * @aux: Auxiliary data for the upcall.
- * @acl: The ACL to attach if a new key is created.
  *
  * As for request_key_and_link() except that it does not add the returned key
  * to a keyring if found and new keys are always allocated in the user's quota.
@@ -729,15 +732,14 @@ struct key *request_key_with_auxdata(struct key_type *type,
 				     struct key_tag *domain_tag,
 				     const void *callout_info,
 				     size_t callout_len,
-				     void *aux,
-				     struct key_acl *acl)
+				     void *aux)
 {
 	struct key *key;
 	int ret;
 
 	key = request_key_and_link(type, description, domain_tag,
 				   callout_info, callout_len,
-				   aux, acl, NULL, KEY_ALLOC_IN_QUOTA);
+				   aux, NULL, KEY_ALLOC_IN_QUOTA);
 	if (!IS_ERR(key)) {
 		ret = wait_for_key_construction(key, false);
 		if (ret < 0) {
diff --git a/security/keys/request_key_auth.c b/security/keys/request_key_auth.c
index 27e437d94b81..e73ec040e250 100644
--- a/security/keys/request_key_auth.c
+++ b/security/keys/request_key_auth.c
@@ -24,17 +24,6 @@ static void request_key_auth_revoke(struct key *);
 static void request_key_auth_destroy(struct key *);
 static long request_key_auth_read(const struct key *, char __user *, size_t);
 
-static struct key_acl request_key_auth_acl = {
-	.usage	= REFCOUNT_INIT(1),
-	.nr_ace	= 2,
-	.possessor_viewable = true,
-	.aces = {
-		KEY_POSSESSOR_ACE(KEY_ACE_VIEW | KEY_ACE_READ | KEY_ACE_SEARCH |
-				  KEY_ACE_LINK),
-		KEY_OWNER_ACE(KEY_ACE_VIEW),
-	}
-};
-
 /*
  * The request-key authorisation key type definition.
  */
@@ -221,8 +210,8 @@ struct key *request_key_auth_new(struct key *target, const char *op,
 
 	authkey = key_alloc(&key_type_request_key_auth, desc,
 			    cred->fsuid, cred->fsgid, cred,
-			    &request_key_auth_acl,
-			    KEY_ALLOC_NOT_IN_QUOTA, NULL);
+			    KEY_POS_VIEW | KEY_POS_READ | KEY_POS_SEARCH | KEY_POS_LINK |
+			    KEY_USR_VIEW, KEY_ALLOC_NOT_IN_QUOTA, NULL);
 	if (IS_ERR(authkey)) {
 		ret = PTR_ERR(authkey);
 		goto error_free_rka;
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index 4bef86ed463b..74dd46de01b6 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -6502,7 +6502,6 @@ static int selinux_key_permission(key_ref_t key_ref,
 {
 	struct key *key;
 	struct key_security_struct *ksec;
-	unsigned oldstyle_perm;
 	u32 sid;
 
 	/* if no specific permissions are requested, we skip the
@@ -6511,26 +6510,13 @@ static int selinux_key_permission(key_ref_t key_ref,
 	if (perm == 0)
 		return 0;
 
-	oldstyle_perm = perm & (KEY_NEED_VIEW | KEY_NEED_READ | KEY_NEED_WRITE |
-				KEY_NEED_SEARCH | KEY_NEED_LINK);
-	if (perm & KEY_NEED_SETSEC)
-		oldstyle_perm |= OLD_KEY_NEED_SETATTR;
-	if (perm & KEY_NEED_INVAL)
-		oldstyle_perm |= KEY_NEED_SEARCH;
-	if (perm & KEY_NEED_REVOKE && !(perm & OLD_KEY_NEED_SETATTR))
-		oldstyle_perm |= KEY_NEED_WRITE;
-	if (perm & KEY_NEED_JOIN)
-		oldstyle_perm |= KEY_NEED_SEARCH;
-	if (perm & KEY_NEED_CLEAR)
-		oldstyle_perm |= KEY_NEED_WRITE;
-
 	sid = cred_sid(cred);
 
 	key = key_ref_to_ptr(key_ref);
 	ksec = key->security;
 
 	return avc_has_perm(&selinux_state,
-			    sid, ksec->sid, SECCLASS_KEY, oldstyle_perm, NULL);
+			    sid, ksec->sid, SECCLASS_KEY, perm, NULL);
 }
 
 static int selinux_key_getsecurity(struct key *key, char **_buffer)
diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c
index 50c536cad85b..4c5e5a438f8b 100644
--- a/security/smack/smack_lsm.c
+++ b/security/smack/smack_lsm.c
@@ -4284,8 +4284,7 @@ static int smack_key_permission(key_ref_t key_ref,
 #endif
 	if (perm & (KEY_NEED_READ | KEY_NEED_SEARCH | KEY_NEED_VIEW))
 		request |= MAY_READ;
-	if (perm & (KEY_NEED_WRITE | KEY_NEED_LINK | KEY_NEED_SETSEC |
-		    KEY_NEED_INVAL | KEY_NEED_REVOKE | KEY_NEED_CLEAR))
+	if (perm & (KEY_NEED_WRITE | KEY_NEED_LINK | KEY_NEED_SETATTR))
 		request |= MAY_WRITE;
 	rc = smk_access(tkp, keyp->security, request, &ad);
 	rc = smk_bu_note("key access", tkp, keyp->security, request, rc);
-- 
cgit v1.2.3


From 9c1f2a5dc2948b9f1170d4202c84745f0b0ff0c9 Mon Sep 17 00:00:00 2001
From: Suman Anna <s-anna@ti.com>
Date: Tue, 4 Jun 2019 12:01:46 -0500
Subject: mailbox: omap: Add support for TI K3 SoCs

The TI K3 AM65x and J721E family of SoCs have a new Mailbox IP that
is based on the existing Mailbox IP present in OMAP architecture based
SoCs. Each instance of the legacy OMAP Mailbox IP is now a single cluster
within the newer Mailbox IP instance on K3 architecture based SoCs. A
single K3 Mailbox IP instance has multiple clusters with each cluster
providing the same functionality as the existing OMAP Mailbox IP.

Reuse the existing OMAP Mailbox driver to extend the support for this
newer IP present within the Main NavSS block on K3 SoCs. The K3 family
of SoCs use 64-bit ARMv8 processors for running Linux, so the driver is
also enhanced to deal with the differences between the 32-bit message
payloads and the 64-bit pointers used by the client drivers.

Signed-off-by: Suman Anna <s-anna@ti.com>
Signed-off-by: Jassi Brar <jaswinder.singh@linaro.org>
---
 drivers/mailbox/Kconfig        |  2 +-
 drivers/mailbox/omap-mailbox.c | 43 +++++++++++++++++++++++++-----------------
 include/linux/omap-mailbox.h   |  4 +++-
 3 files changed, 30 insertions(+), 19 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mailbox/Kconfig b/drivers/mailbox/Kconfig
index b709481a8de6..ab4eb750bbdd 100644
--- a/drivers/mailbox/Kconfig
+++ b/drivers/mailbox/Kconfig
@@ -54,7 +54,7 @@ config ARMADA_37XX_RWTM_MBOX
 
 config OMAP2PLUS_MBOX
 	tristate "OMAP2+ Mailbox framework support"
-	depends on ARCH_OMAP2PLUS
+	depends on ARCH_OMAP2PLUS || ARCH_K3
 	help
 	  Mailbox implementation for OMAP family chips with hardware for
 	  interprocessor communication involving DSP, IVA1.0 and IVA2 in
diff --git a/drivers/mailbox/omap-mailbox.c b/drivers/mailbox/omap-mailbox.c
index ca50177a33f2..a3cd63583cf7 100644
--- a/drivers/mailbox/omap-mailbox.c
+++ b/drivers/mailbox/omap-mailbox.c
@@ -3,7 +3,7 @@
  * OMAP mailbox driver
  *
  * Copyright (C) 2006-2009 Nokia Corporation. All rights reserved.
- * Copyright (C) 2013-2016 Texas Instruments Incorporated - http://www.ti.com
+ * Copyright (C) 2013-2019 Texas Instruments Incorporated - http://www.ti.com
  *
  * Contact: Hiroshi DOYU <Hiroshi.DOYU@nokia.com>
  *          Suman Anna <s-anna@ti.com>
@@ -141,14 +141,14 @@ void mbox_write_reg(struct omap_mbox_device *mdev, u32 val, size_t ofs)
 }
 
 /* Mailbox FIFO handle functions */
-static mbox_msg_t mbox_fifo_read(struct omap_mbox *mbox)
+static u32 mbox_fifo_read(struct omap_mbox *mbox)
 {
 	struct omap_mbox_fifo *fifo = &mbox->rx_fifo;
 
-	return (mbox_msg_t)mbox_read_reg(mbox->parent, fifo->msg);
+	return mbox_read_reg(mbox->parent, fifo->msg);
 }
 
-static void mbox_fifo_write(struct omap_mbox *mbox, mbox_msg_t msg)
+static void mbox_fifo_write(struct omap_mbox *mbox, u32 msg)
 {
 	struct omap_mbox_fifo *fifo = &mbox->tx_fifo;
 
@@ -256,14 +256,16 @@ static void mbox_rx_work(struct work_struct *work)
 {
 	struct omap_mbox_queue *mq =
 			container_of(work, struct omap_mbox_queue, work);
-	mbox_msg_t msg;
+	mbox_msg_t data;
+	u32 msg;
 	int len;
 
 	while (kfifo_len(&mq->fifo) >= sizeof(msg)) {
 		len = kfifo_out(&mq->fifo, (unsigned char *)&msg, sizeof(msg));
 		WARN_ON(len != sizeof(msg));
+		data = msg;
 
-		mbox_chan_received_data(mq->mbox->chan, (void *)msg);
+		mbox_chan_received_data(mq->mbox->chan, (void *)data);
 		spin_lock_irq(&mq->lock);
 		if (mq->full) {
 			mq->full = false;
@@ -286,7 +288,7 @@ static void __mbox_tx_interrupt(struct omap_mbox *mbox)
 static void __mbox_rx_interrupt(struct omap_mbox *mbox)
 {
 	struct omap_mbox_queue *mq = mbox->rxq;
-	mbox_msg_t msg;
+	u32 msg;
 	int len;
 
 	while (!mbox_fifo_empty(mbox)) {
@@ -540,13 +542,13 @@ static void omap_mbox_chan_shutdown(struct mbox_chan *chan)
 	mutex_unlock(&mdev->cfg_lock);
 }
 
-static int omap_mbox_chan_send_noirq(struct omap_mbox *mbox, void *data)
+static int omap_mbox_chan_send_noirq(struct omap_mbox *mbox, u32 msg)
 {
 	int ret = -EBUSY;
 
 	if (!mbox_fifo_full(mbox)) {
 		_omap_mbox_enable_irq(mbox, IRQ_RX);
-		mbox_fifo_write(mbox, (mbox_msg_t)data);
+		mbox_fifo_write(mbox, msg);
 		ret = 0;
 		_omap_mbox_disable_irq(mbox, IRQ_RX);
 
@@ -558,12 +560,12 @@ static int omap_mbox_chan_send_noirq(struct omap_mbox *mbox, void *data)
 	return ret;
 }
 
-static int omap_mbox_chan_send(struct omap_mbox *mbox, void *data)
+static int omap_mbox_chan_send(struct omap_mbox *mbox, u32 msg)
 {
 	int ret = -EBUSY;
 
 	if (!mbox_fifo_full(mbox)) {
-		mbox_fifo_write(mbox, (mbox_msg_t)data);
+		mbox_fifo_write(mbox, msg);
 		ret = 0;
 	}
 
@@ -576,14 +578,15 @@ static int omap_mbox_chan_send_data(struct mbox_chan *chan, void *data)
 {
 	struct omap_mbox *mbox = mbox_chan_to_omap_mbox(chan);
 	int ret;
+	u32 msg = omap_mbox_message(data);
 
 	if (!mbox)
 		return -EINVAL;
 
 	if (mbox->send_no_irq)
-		ret = omap_mbox_chan_send_noirq(mbox, data);
+		ret = omap_mbox_chan_send_noirq(mbox, msg);
 	else
-		ret = omap_mbox_chan_send(mbox, data);
+		ret = omap_mbox_chan_send(mbox, msg);
 
 	return ret;
 }
@@ -656,6 +659,10 @@ static const struct of_device_id omap_mailbox_of_match[] = {
 		.compatible	= "ti,omap4-mailbox",
 		.data		= &omap4_data,
 	},
+	{
+		.compatible	= "ti,am654-mailbox",
+		.data		= &omap4_data,
+	},
 	{
 		/* end */
 	},
@@ -830,7 +837,10 @@ static int omap_mbox_probe(struct platform_device *pdev)
 	mdev->intr_type = intr_type;
 	mdev->mboxes = list;
 
-	/* OMAP does not have a Tx-Done IRQ, but rather a Tx-Ready IRQ */
+	/*
+	 * OMAP/K3 Mailbox IP does not have a Tx-Done IRQ, but rather a Tx-Ready
+	 * IRQ and is needed to run the Tx state machine
+	 */
 	mdev->controller.txdone_irq = true;
 	mdev->controller.dev = mdev->dev;
 	mdev->controller.ops = &omap_mbox_chan_ops;
@@ -899,9 +909,8 @@ static int __init omap_mbox_init(void)
 		return err;
 
 	/* kfifo size sanity check: alignment and minimal size */
-	mbox_kfifo_size = ALIGN(mbox_kfifo_size, sizeof(mbox_msg_t));
-	mbox_kfifo_size = max_t(unsigned int, mbox_kfifo_size,
-							sizeof(mbox_msg_t));
+	mbox_kfifo_size = ALIGN(mbox_kfifo_size, sizeof(u32));
+	mbox_kfifo_size = max_t(unsigned int, mbox_kfifo_size, sizeof(u32));
 
 	err = platform_driver_register(&omap_mbox_driver);
 	if (err)
diff --git a/include/linux/omap-mailbox.h b/include/linux/omap-mailbox.h
index 6dbcd2da0332..8aa984ec1f38 100644
--- a/include/linux/omap-mailbox.h
+++ b/include/linux/omap-mailbox.h
@@ -6,7 +6,9 @@
 #ifndef OMAP_MAILBOX_H
 #define OMAP_MAILBOX_H
 
-typedef u32 mbox_msg_t;
+typedef uintptr_t mbox_msg_t;
+
+#define omap_mbox_message(data) (u32)(mbox_msg_t)(data)
 
 typedef int __bitwise omap_mbox_irq_t;
 #define IRQ_TX ((__force omap_mbox_irq_t) 1)
-- 
cgit v1.2.3


From ff956826a403f5cf189978d5ff6b3eb53aa11610 Mon Sep 17 00:00:00 2001
From: Zhang Rui <rui.zhang@intel.com>
Date: Wed, 10 Jul 2019 21:44:24 +0800
Subject: intel_rapl: introduce intel_rapl.h

Create a new header file for the common definitions that might be used
by different RAPL interfaces.

Reviewed-by: Pandruvada, Srinivas <srinivas.pandruvada@intel.com>
Tested-by: Pandruvada, Srinivas <srinivas.pandruvada@intel.com>
Signed-off-by: Zhang Rui <rui.zhang@intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 MAINTAINERS                   |   1 +
 drivers/powercap/intel_rapl.c | 101 +------------------------------------
 include/linux/intel_rapl.h    | 113 ++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 116 insertions(+), 99 deletions(-)
 create mode 100644 include/linux/intel_rapl.h

(limited to 'include/linux')

diff --git a/MAINTAINERS b/MAINTAINERS
index 28a36f1efe02..9ded49d371da 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -12679,6 +12679,7 @@ F:	drivers/base/power/
 F:	include/linux/pm.h
 F:	include/linux/pm_*
 F:	include/linux/powercap.h
+F:	include/linux/intel_rapl.h
 F:	drivers/powercap/
 F:	kernel/configs/nopm.config
 
diff --git a/drivers/powercap/intel_rapl.c b/drivers/powercap/intel_rapl.c
index 9be9f20ff056..adb35ec9f939 100644
--- a/drivers/powercap/intel_rapl.c
+++ b/drivers/powercap/intel_rapl.c
@@ -18,8 +18,9 @@
 #include <linux/cpu.h>
 #include <linux/powercap.h>
 #include <linux/suspend.h>
-#include <asm/iosf_mbi.h>
+#include <linux/intel_rapl.h>
 
+#include <asm/iosf_mbi.h>
 #include <asm/processor.h>
 #include <asm/cpu_device_id.h>
 #include <asm/intel-family.h>
@@ -74,59 +75,9 @@ enum unit_type {
 	TIME_UNIT,
 };
 
-enum rapl_domain_type {
-	RAPL_DOMAIN_PACKAGE, /* entire package/socket */
-	RAPL_DOMAIN_PP0, /* core power plane */
-	RAPL_DOMAIN_PP1, /* graphics uncore */
-	RAPL_DOMAIN_DRAM,/* DRAM control_type */
-	RAPL_DOMAIN_PLATFORM, /* PSys control_type */
-	RAPL_DOMAIN_MAX,
-};
-
-enum rapl_domain_reg_id {
-	RAPL_DOMAIN_REG_LIMIT,
-	RAPL_DOMAIN_REG_STATUS,
-	RAPL_DOMAIN_REG_PERF,
-	RAPL_DOMAIN_REG_POLICY,
-	RAPL_DOMAIN_REG_INFO,
-	RAPL_DOMAIN_REG_MAX,
-};
-
 /* per domain data, some are optional */
-enum rapl_primitives {
-	ENERGY_COUNTER,
-	POWER_LIMIT1,
-	POWER_LIMIT2,
-	FW_LOCK,
-
-	PL1_ENABLE,  /* power limit 1, aka long term */
-	PL1_CLAMP,   /* allow frequency to go below OS request */
-	PL2_ENABLE,  /* power limit 2, aka short term, instantaneous */
-	PL2_CLAMP,
-
-	TIME_WINDOW1, /* long term */
-	TIME_WINDOW2, /* short term */
-	THERMAL_SPEC_POWER,
-	MAX_POWER,
-
-	MIN_POWER,
-	MAX_TIME_WINDOW,
-	THROTTLED_TIME,
-	PRIORITY_LEVEL,
-
-	/* below are not raw primitive data */
-	AVERAGE_POWER,
-	NR_RAPL_PRIMITIVES,
-};
-
 #define NR_RAW_PRIMITIVES (NR_RAPL_PRIMITIVES - 2)
 
-/* Can be expanded to include events, etc.*/
-struct rapl_domain_data {
-	u64 primitives[NR_RAPL_PRIMITIVES];
-	unsigned long timestamp;
-};
-
 struct msrl_action {
 	u32 msr_no;
 	u64 clear_mask;
@@ -138,60 +89,12 @@ struct msrl_action {
 #define	DOMAIN_STATE_POWER_LIMIT_SET    BIT(1)
 #define DOMAIN_STATE_BIOS_LOCKED        BIT(2)
 
-#define NR_POWER_LIMITS (2)
-struct rapl_power_limit {
-	struct powercap_zone_constraint *constraint;
-	int prim_id; /* primitive ID used to enable */
-	struct rapl_domain *domain;
-	const char *name;
-	u64 last_power_limit;
-};
-
 static const char pl1_name[] = "long_term";
 static const char pl2_name[] = "short_term";
 
-struct rapl_package;
-struct rapl_domain {
-	const char *name;
-	enum rapl_domain_type id;
-	int regs[RAPL_DOMAIN_REG_MAX];
-	struct powercap_zone power_zone;
-	struct rapl_domain_data rdd;
-	struct rapl_power_limit rpl[NR_POWER_LIMITS];
-	u64 attr_map; /* track capabilities */
-	unsigned int state;
-	unsigned int domain_energy_unit;
-	struct rapl_package *rp;
-};
 #define power_zone_to_rapl_domain(_zone) \
 	container_of(_zone, struct rapl_domain, power_zone)
 
-/* maximum rapl package domain name: package-%d-die-%d */
-#define PACKAGE_DOMAIN_NAME_LENGTH 30
-
-
-/* Each rapl package contains multiple domains, these are the common
- * data across RAPL domains within a package.
- */
-struct rapl_package {
-	unsigned int id; /* logical die id, equals physical 1-die systems */
-	unsigned int nr_domains;
-	unsigned long domain_map; /* bit map of active domains */
-	unsigned int power_unit;
-	unsigned int energy_unit;
-	unsigned int time_unit;
-	struct rapl_domain *domains; /* array of domains, sized at runtime */
-	struct powercap_zone *power_zone; /* keep track of parent zone */
-	unsigned long power_limit_irq; /* keep track of package power limit
-					* notify interrupt enable status.
-					*/
-	struct list_head plist;
-	int lead_cpu; /* one active cpu per package for access */
-	/* Track active cpus */
-	struct cpumask cpumask;
-	char name[PACKAGE_DOMAIN_NAME_LENGTH];
-};
-
 struct rapl_defaults {
 	u8 floor_freq_reg_addr;
 	int (*check_unit)(struct rapl_package *rp, int cpu);
diff --git a/include/linux/intel_rapl.h b/include/linux/intel_rapl.h
new file mode 100644
index 000000000000..94716036d829
--- /dev/null
+++ b/include/linux/intel_rapl.h
@@ -0,0 +1,113 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ *  Data types and headers for RAPL support
+ *
+ *  Copyright (C) 2019  Intel Corporation.
+ *
+ *  Author: Zhang Rui <rui.zhang@intel.com>
+ */
+
+#ifndef __INTEL_RAPL_H__
+#define __INTEL_RAPL_H__
+
+#include <linux/types.h>
+#include <linux/powercap.h>
+
+enum rapl_domain_type {
+	RAPL_DOMAIN_PACKAGE,	/* entire package/socket */
+	RAPL_DOMAIN_PP0,	/* core power plane */
+	RAPL_DOMAIN_PP1,	/* graphics uncore */
+	RAPL_DOMAIN_DRAM,	/* DRAM control_type */
+	RAPL_DOMAIN_PLATFORM,	/* PSys control_type */
+	RAPL_DOMAIN_MAX,
+};
+
+enum rapl_domain_reg_id {
+	RAPL_DOMAIN_REG_LIMIT,
+	RAPL_DOMAIN_REG_STATUS,
+	RAPL_DOMAIN_REG_PERF,
+	RAPL_DOMAIN_REG_POLICY,
+	RAPL_DOMAIN_REG_INFO,
+	RAPL_DOMAIN_REG_MAX,
+};
+
+struct rapl_package;
+
+enum rapl_primitives {
+	ENERGY_COUNTER,
+	POWER_LIMIT1,
+	POWER_LIMIT2,
+	FW_LOCK,
+
+	PL1_ENABLE,		/* power limit 1, aka long term */
+	PL1_CLAMP,		/* allow frequency to go below OS request */
+	PL2_ENABLE,		/* power limit 2, aka short term, instantaneous */
+	PL2_CLAMP,
+
+	TIME_WINDOW1,		/* long term */
+	TIME_WINDOW2,		/* short term */
+	THERMAL_SPEC_POWER,
+	MAX_POWER,
+
+	MIN_POWER,
+	MAX_TIME_WINDOW,
+	THROTTLED_TIME,
+	PRIORITY_LEVEL,
+
+	/* below are not raw primitive data */
+	AVERAGE_POWER,
+	NR_RAPL_PRIMITIVES,
+};
+
+struct rapl_domain_data {
+	u64 primitives[NR_RAPL_PRIMITIVES];
+	unsigned long timestamp;
+};
+
+#define NR_POWER_LIMITS (2)
+struct rapl_power_limit {
+	struct powercap_zone_constraint *constraint;
+	int prim_id;		/* primitive ID used to enable */
+	struct rapl_domain *domain;
+	const char *name;
+	u64 last_power_limit;
+};
+
+struct rapl_package;
+
+struct rapl_domain {
+	const char *name;
+	enum rapl_domain_type id;
+	int regs[RAPL_DOMAIN_REG_MAX];
+	struct powercap_zone power_zone;
+	struct rapl_domain_data rdd;
+	struct rapl_power_limit rpl[NR_POWER_LIMITS];
+	u64 attr_map;		/* track capabilities */
+	unsigned int state;
+	unsigned int domain_energy_unit;
+	struct rapl_package *rp;
+};
+
+/* maximum rapl package domain name: package-%d-die-%d */
+#define PACKAGE_DOMAIN_NAME_LENGTH 30
+
+struct rapl_package {
+	unsigned int id;	/* logical die id, equals physical 1-die systems */
+	unsigned int nr_domains;
+	unsigned long domain_map;	/* bit map of active domains */
+	unsigned int power_unit;
+	unsigned int energy_unit;
+	unsigned int time_unit;
+	struct rapl_domain *domains;	/* array of domains, sized at runtime */
+	struct powercap_zone *power_zone;	/* keep track of parent zone */
+	unsigned long power_limit_irq;	/* keep track of package power limit
+					 * notify interrupt enable status.
+					 */
+	struct list_head plist;
+	int lead_cpu;		/* one active cpu per package for access */
+	/* Track active cpus */
+	struct cpumask cpumask;
+	char name[PACKAGE_DOMAIN_NAME_LENGTH];
+};
+
+#endif /* __INTEL_RAPL_H__ */
-- 
cgit v1.2.3


From 7ebf8eff63b4f349e7b2ded6aa5036d94bdf94b9 Mon Sep 17 00:00:00 2001
From: Zhang Rui <rui.zhang@intel.com>
Date: Wed, 10 Jul 2019 21:44:25 +0800
Subject: intel_rapl: introduce struct rapl_if_private

Introduce a new structure, rapl_if_priv, to save the private data
for different RAPL interfaces.

Reviewed-by: Pandruvada, Srinivas <srinivas.pandruvada@intel.com>
Tested-by: Pandruvada, Srinivas <srinivas.pandruvada@intel.com>
Signed-off-by: Zhang Rui <rui.zhang@intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/powercap/intel_rapl.c | 59 +++++++++++++++++++++----------------------
 include/linux/intel_rapl.h    | 15 +++++++++++
 2 files changed, 44 insertions(+), 30 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/powercap/intel_rapl.c b/drivers/powercap/intel_rapl.c
index adb35ec9f939..e05d92d67525 100644
--- a/drivers/powercap/intel_rapl.c
+++ b/drivers/powercap/intel_rapl.c
@@ -75,6 +75,9 @@ enum unit_type {
 	TIME_UNIT,
 };
 
+/* private data for RAPL MSR Interface */
+static struct rapl_if_priv rapl_msr_priv;
+
 /* per domain data, some are optional */
 #define NR_RAW_PRIMITIVES (NR_RAPL_PRIMITIVES - 2)
 
@@ -155,17 +158,14 @@ static const char * const rapl_domain_names[] = {
 	"psys",
 };
 
-static struct powercap_control_type *control_type; /* PowerCap Controller */
-static struct rapl_domain *platform_rapl_domain; /* Platform (PSys) domain */
-
 /* caller to ensure CPU hotplug lock is held */
-static struct rapl_package *rapl_find_package_domain(int cpu)
+static struct rapl_package *rapl_find_package_domain(int cpu, struct rapl_if_priv *priv)
 {
 	int id = topology_logical_die_id(cpu);
 	struct rapl_package *rp;
 
 	list_for_each_entry(rp, &rapl_packages, plist) {
-		if (rp->id == id)
+		if (rp->id == id && rp->priv->control_type == priv->control_type)
 			return rp;
 	}
 
@@ -1090,12 +1090,12 @@ static void rapl_update_domain_data(struct rapl_package *rp)
 
 static void rapl_unregister_powercap(void)
 {
-	if (platform_rapl_domain) {
-		powercap_unregister_zone(control_type,
-					 &platform_rapl_domain->power_zone);
-		kfree(platform_rapl_domain);
+	if (rapl_msr_priv.platform_rapl_domain) {
+		powercap_unregister_zone(rapl_msr_priv.control_type,
+					 &rapl_msr_priv.platform_rapl_domain->power_zone);
+		kfree(rapl_msr_priv.platform_rapl_domain);
 	}
-	powercap_unregister_control_type(control_type);
+	powercap_unregister_control_type(rapl_msr_priv.control_type);
 }
 
 static int rapl_package_register_powercap(struct rapl_package *rp)
@@ -1113,7 +1113,7 @@ static int rapl_package_register_powercap(struct rapl_package *rp)
 			nr_pl = find_nr_power_limit(rd);
 			pr_debug("register package domain %s\n", rp->name);
 			power_zone = powercap_register_zone(&rd->power_zone,
-							control_type,
+							rp->priv->control_type,
 							rp->name, NULL,
 							&zone_ops[rd->id],
 							nr_pl,
@@ -1140,7 +1140,7 @@ static int rapl_package_register_powercap(struct rapl_package *rp)
 		/* number of power limits per domain varies */
 		nr_pl = find_nr_power_limit(rd);
 		power_zone = powercap_register_zone(&rd->power_zone,
-						control_type, rd->name,
+						rp->priv->control_type, rd->name,
 						rp->power_zone,
 						&zone_ops[rd->id], nr_pl,
 						&constraint_ops);
@@ -1161,7 +1161,7 @@ err_cleanup:
 	 */
 	while (--rd >= rp->domains) {
 		pr_debug("unregister %s domain %s\n", rp->name, rd->name);
-		powercap_unregister_zone(control_type, &rd->power_zone);
+		powercap_unregister_zone(rp->priv->control_type, &rd->power_zone);
 	}
 
 	return ret;
@@ -1191,9 +1191,9 @@ static int __init rapl_register_psys(void)
 	rd->rpl[0].name = pl1_name;
 	rd->rpl[1].prim_id = PL2_ENABLE;
 	rd->rpl[1].name = pl2_name;
-	rd->rp = rapl_find_package_domain(0);
+	rd->rp = rapl_find_package_domain(0, &rapl_msr_priv);
 
-	power_zone = powercap_register_zone(&rd->power_zone, control_type,
+	power_zone = powercap_register_zone(&rd->power_zone, rapl_msr_priv.control_type,
 					    "psys", NULL,
 					    &zone_ops[RAPL_DOMAIN_PLATFORM],
 					    2, &constraint_ops);
@@ -1203,17 +1203,17 @@ static int __init rapl_register_psys(void)
 		return PTR_ERR(power_zone);
 	}
 
-	platform_rapl_domain = rd;
+	rapl_msr_priv.platform_rapl_domain = rd;
 
 	return 0;
 }
 
 static int __init rapl_register_powercap(void)
 {
-	control_type = powercap_register_control_type(NULL, "intel-rapl", NULL);
-	if (IS_ERR(control_type)) {
+	rapl_msr_priv.control_type = powercap_register_control_type(NULL, "intel-rapl", NULL);
+	if (IS_ERR(rapl_msr_priv.control_type)) {
 		pr_debug("failed to register powercap control_type.\n");
-		return PTR_ERR(control_type);
+		return PTR_ERR(rapl_msr_priv.control_type);
 	}
 	return 0;
 }
@@ -1338,16 +1338,16 @@ static void rapl_remove_package(struct rapl_package *rp)
 		}
 		pr_debug("remove package, undo power limit on %s: %s\n",
 			 rp->name, rd->name);
-		powercap_unregister_zone(control_type, &rd->power_zone);
+		powercap_unregister_zone(rp->priv->control_type, &rd->power_zone);
 	}
 	/* do parent zone last */
-	powercap_unregister_zone(control_type, &rd_package->power_zone);
+	powercap_unregister_zone(rp->priv->control_type, &rd_package->power_zone);
 	list_del(&rp->plist);
 	kfree(rp);
 }
 
 /* called from CPU hotplug notifier, hotplug lock held */
-static struct rapl_package *rapl_add_package(int cpu)
+static struct rapl_package *rapl_add_package(int cpu, struct rapl_if_priv *priv)
 {
 	int id = topology_logical_die_id(cpu);
 	struct rapl_package *rp;
@@ -1361,6 +1361,7 @@ static struct rapl_package *rapl_add_package(int cpu)
 	/* add the new package to the list */
 	rp->id = id;
 	rp->lead_cpu = cpu;
+	rp->priv = priv;
 
 	if (topology_max_die_per_package() > 1)
 		snprintf(rp->name, PACKAGE_DOMAIN_NAME_LENGTH,
@@ -1399,9 +1400,9 @@ static int rapl_cpu_online(unsigned int cpu)
 {
 	struct rapl_package *rp;
 
-	rp = rapl_find_package_domain(cpu);
+	rp = rapl_find_package_domain(cpu, &rapl_msr_priv);
 	if (!rp) {
-		rp = rapl_add_package(cpu);
+		rp = rapl_add_package(cpu, &rapl_msr_priv);
 		if (IS_ERR(rp))
 			return PTR_ERR(rp);
 	}
@@ -1414,7 +1415,7 @@ static int rapl_cpu_down_prep(unsigned int cpu)
 	struct rapl_package *rp;
 	int lead_cpu;
 
-	rp = rapl_find_package_domain(cpu);
+	rp = rapl_find_package_domain(cpu, &rapl_msr_priv);
 	if (!rp)
 		return 0;
 
@@ -1427,8 +1428,6 @@ static int rapl_cpu_down_prep(unsigned int cpu)
 	return 0;
 }
 
-static enum cpuhp_state pcap_rapl_online;
-
 static void power_limit_state_save(void)
 {
 	struct rapl_package *rp;
@@ -1538,7 +1537,7 @@ static int __init rapl_init(void)
 				rapl_cpu_online, rapl_cpu_down_prep);
 	if (ret < 0)
 		goto err_unreg;
-	pcap_rapl_online = ret;
+	rapl_msr_priv.pcap_rapl_online = ret;
 
 	/* Don't bail out if PSys is not supported */
 	rapl_register_psys();
@@ -1550,7 +1549,7 @@ static int __init rapl_init(void)
 	return 0;
 
 err_unreg_all:
-	cpuhp_remove_state(pcap_rapl_online);
+	cpuhp_remove_state(rapl_msr_priv.pcap_rapl_online);
 
 err_unreg:
 	rapl_unregister_powercap();
@@ -1560,7 +1559,7 @@ err_unreg:
 static void __exit rapl_exit(void)
 {
 	unregister_pm_notifier(&rapl_pm_notifier);
-	cpuhp_remove_state(pcap_rapl_online);
+	cpuhp_remove_state(rapl_msr_priv.pcap_rapl_online);
 	rapl_unregister_powercap();
 }
 
diff --git a/include/linux/intel_rapl.h b/include/linux/intel_rapl.h
index 94716036d829..7bf1683e4a63 100644
--- a/include/linux/intel_rapl.h
+++ b/include/linux/intel_rapl.h
@@ -88,6 +88,20 @@ struct rapl_domain {
 	struct rapl_package *rp;
 };
 
+/**
+ * struct rapl_if_priv: private data for different RAPL interfaces
+ * @control_type:		Each RAPL interface must have its own powercap
+ *				control type.
+ * @platform_rapl_domain:	Optional. Some RAPL interface may have platform
+ *				level RAPL control.
+ * @pcap_rapl_online:		CPU hotplug state for each RAPL interface.
+ */
+struct rapl_if_priv {
+	struct powercap_control_type *control_type;
+	struct rapl_domain *platform_rapl_domain;
+	enum cpuhp_state pcap_rapl_online;
+};
+
 /* maximum rapl package domain name: package-%d-die-%d */
 #define PACKAGE_DOMAIN_NAME_LENGTH 30
 
@@ -108,6 +122,7 @@ struct rapl_package {
 	/* Track active cpus */
 	struct cpumask cpumask;
 	char name[PACKAGE_DOMAIN_NAME_LENGTH];
+	struct rapl_if_priv *priv;
 };
 
 #endif /* __INTEL_RAPL_H__ */
-- 
cgit v1.2.3


From 7fde2712a7adab721eaabafbd8ff93dff3262d35 Mon Sep 17 00:00:00 2001
From: Zhang Rui <rui.zhang@intel.com>
Date: Wed, 10 Jul 2019 21:44:26 +0800
Subject: intel_rapl: abstract register address

MSR and MMIO RAPL interfaces have different sets of registers, thus the
RAPL register address should be obtained from the interface-specific
structure, i.e. struct rapl_if_priv, instead.

Reviewed-by: Pandruvada, Srinivas <srinivas.pandruvada@intel.com>
Tested-by: Pandruvada, Srinivas <srinivas.pandruvada@intel.com>
Signed-off-by: Zhang Rui <rui.zhang@intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/powercap/intel_rapl.c | 73 +++++++++++++++++++------------------------
 include/linux/intel_rapl.h    |  4 +++
 2 files changed, 37 insertions(+), 40 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/powercap/intel_rapl.c b/drivers/powercap/intel_rapl.c
index e05d92d67525..9f22aed49f24 100644
--- a/drivers/powercap/intel_rapl.c
+++ b/drivers/powercap/intel_rapl.c
@@ -76,7 +76,19 @@ enum unit_type {
 };
 
 /* private data for RAPL MSR Interface */
-static struct rapl_if_priv rapl_msr_priv;
+static struct rapl_if_priv rapl_msr_priv = {
+	.reg_unit = MSR_RAPL_POWER_UNIT,
+	.regs[RAPL_DOMAIN_PACKAGE] = {
+		MSR_PKG_POWER_LIMIT, MSR_PKG_ENERGY_STATUS, MSR_PKG_PERF_STATUS, 0, MSR_PKG_POWER_INFO },
+	.regs[RAPL_DOMAIN_PP0] = {
+		MSR_PP0_POWER_LIMIT, MSR_PP0_ENERGY_STATUS, 0, MSR_PP0_POLICY, 0 },
+	.regs[RAPL_DOMAIN_PP1] = {
+		MSR_PP1_POWER_LIMIT, MSR_PP1_ENERGY_STATUS, 0, MSR_PP1_POLICY, 0 },
+	.regs[RAPL_DOMAIN_DRAM] = {
+		MSR_DRAM_POWER_LIMIT, MSR_DRAM_ENERGY_STATUS, MSR_DRAM_PERF_STATUS, 0, MSR_DRAM_POWER_INFO },
+	.regs[RAPL_DOMAIN_PLATFORM] = {
+		MSR_PLATFORM_POWER_LIMIT, MSR_PLATFORM_ENERGY_STATUS, 0, 0, 0},
+};
 
 /* per domain data, some are optional */
 #define NR_RAW_PRIMITIVES (NR_RAPL_PRIMITIVES - 2)
@@ -541,15 +553,17 @@ static void rapl_init_domains(struct rapl_package *rp)
 
 	for (i = 0; i < RAPL_DOMAIN_MAX; i++) {
 		unsigned int mask = rp->domain_map & (1 << i);
+
+		rd->regs[RAPL_DOMAIN_REG_LIMIT] = rp->priv->regs[i][RAPL_DOMAIN_REG_LIMIT];
+		rd->regs[RAPL_DOMAIN_REG_STATUS] = rp->priv->regs[i][RAPL_DOMAIN_REG_STATUS];
+		rd->regs[RAPL_DOMAIN_REG_PERF] = rp->priv->regs[i][RAPL_DOMAIN_REG_PERF];
+		rd->regs[RAPL_DOMAIN_REG_POLICY] = rp->priv->regs[i][RAPL_DOMAIN_REG_POLICY];
+		rd->regs[RAPL_DOMAIN_REG_INFO] = rp->priv->regs[i][RAPL_DOMAIN_REG_INFO];
+
 		switch (mask) {
 		case BIT(RAPL_DOMAIN_PACKAGE):
 			rd->name = rapl_domain_names[RAPL_DOMAIN_PACKAGE];
 			rd->id = RAPL_DOMAIN_PACKAGE;
-			rd->regs[RAPL_DOMAIN_REG_LIMIT] = MSR_PKG_POWER_LIMIT;
-			rd->regs[RAPL_DOMAIN_REG_STATUS] = MSR_PKG_ENERGY_STATUS;
-			rd->regs[RAPL_DOMAIN_REG_PERF] = MSR_PKG_PERF_STATUS;
-			rd->regs[RAPL_DOMAIN_REG_POLICY] = 0;
-			rd->regs[RAPL_DOMAIN_REG_INFO] = MSR_PKG_POWER_INFO;
 			rd->rpl[0].prim_id = PL1_ENABLE;
 			rd->rpl[0].name = pl1_name;
 			rd->rpl[1].prim_id = PL2_ENABLE;
@@ -558,33 +572,18 @@ static void rapl_init_domains(struct rapl_package *rp)
 		case BIT(RAPL_DOMAIN_PP0):
 			rd->name = rapl_domain_names[RAPL_DOMAIN_PP0];
 			rd->id = RAPL_DOMAIN_PP0;
-			rd->regs[RAPL_DOMAIN_REG_LIMIT] = MSR_PP0_POWER_LIMIT;
-			rd->regs[RAPL_DOMAIN_REG_STATUS] = MSR_PP0_ENERGY_STATUS;
-			rd->regs[RAPL_DOMAIN_REG_PERF] = 0;
-			rd->regs[RAPL_DOMAIN_REG_POLICY] = MSR_PP0_POLICY;
-			rd->regs[RAPL_DOMAIN_REG_INFO] = 0;
 			rd->rpl[0].prim_id = PL1_ENABLE;
 			rd->rpl[0].name = pl1_name;
 			break;
 		case BIT(RAPL_DOMAIN_PP1):
 			rd->name = rapl_domain_names[RAPL_DOMAIN_PP1];
 			rd->id = RAPL_DOMAIN_PP1;
-			rd->regs[RAPL_DOMAIN_REG_LIMIT] = MSR_PP1_POWER_LIMIT;
-			rd->regs[RAPL_DOMAIN_REG_STATUS] = MSR_PP1_ENERGY_STATUS;
-			rd->regs[RAPL_DOMAIN_REG_PERF] = 0;
-			rd->regs[RAPL_DOMAIN_REG_POLICY] = MSR_PP1_POLICY;
-			rd->regs[RAPL_DOMAIN_REG_INFO] = 0;
 			rd->rpl[0].prim_id = PL1_ENABLE;
 			rd->rpl[0].name = pl1_name;
 			break;
 		case BIT(RAPL_DOMAIN_DRAM):
 			rd->name = rapl_domain_names[RAPL_DOMAIN_DRAM];
 			rd->id = RAPL_DOMAIN_DRAM;
-			rd->regs[RAPL_DOMAIN_REG_LIMIT] = MSR_DRAM_POWER_LIMIT;
-			rd->regs[RAPL_DOMAIN_REG_STATUS] = MSR_DRAM_ENERGY_STATUS;
-			rd->regs[RAPL_DOMAIN_REG_PERF] = MSR_DRAM_PERF_STATUS;
-			rd->regs[RAPL_DOMAIN_REG_POLICY] = 0;
-			rd->regs[RAPL_DOMAIN_REG_INFO] = MSR_DRAM_POWER_INFO;
 			rd->rpl[0].prim_id = PL1_ENABLE;
 			rd->rpl[0].name = pl1_name;
 			rd->domain_energy_unit =
@@ -806,9 +805,9 @@ static int rapl_check_unit_core(struct rapl_package *rp, int cpu)
 	u64 msr_val;
 	u32 value;
 
-	if (rdmsrl_safe_on_cpu(cpu, MSR_RAPL_POWER_UNIT, &msr_val)) {
+	if (rdmsrl_safe_on_cpu(cpu, rp->priv->reg_unit, &msr_val)) {
 		pr_err("Failed to read power unit MSR 0x%x on CPU %d, exit.\n",
-			MSR_RAPL_POWER_UNIT, cpu);
+			rp->priv->reg_unit, cpu);
 		return -ENODEV;
 	}
 
@@ -832,9 +831,9 @@ static int rapl_check_unit_atom(struct rapl_package *rp, int cpu)
 	u64 msr_val;
 	u32 value;
 
-	if (rdmsrl_safe_on_cpu(cpu, MSR_RAPL_POWER_UNIT, &msr_val)) {
+	if (rdmsrl_safe_on_cpu(cpu, rp->priv->reg_unit, &msr_val)) {
 		pr_err("Failed to read power unit MSR 0x%x on CPU %d, exit.\n",
-			MSR_RAPL_POWER_UNIT, cpu);
+			rp->priv->reg_unit, cpu);
 		return -ENODEV;
 	}
 	value = (msr_val & ENERGY_UNIT_MASK) >> ENERGY_UNIT_OFFSET;
@@ -1173,10 +1172,10 @@ static int __init rapl_register_psys(void)
 	struct powercap_zone *power_zone;
 	u64 val;
 
-	if (rdmsrl_safe_on_cpu(0, MSR_PLATFORM_ENERGY_STATUS, &val) || !val)
+	if (rdmsrl_safe_on_cpu(0, rapl_msr_priv.regs[RAPL_DOMAIN_PLATFORM][RAPL_DOMAIN_REG_STATUS], &val) || !val)
 		return -ENODEV;
 
-	if (rdmsrl_safe_on_cpu(0, MSR_PLATFORM_POWER_LIMIT, &val) || !val)
+	if (rdmsrl_safe_on_cpu(0, rapl_msr_priv.regs[RAPL_DOMAIN_PLATFORM][RAPL_DOMAIN_REG_LIMIT], &val) || !val)
 		return -ENODEV;
 
 	rd = kzalloc(sizeof(*rd), GFP_KERNEL);
@@ -1185,8 +1184,8 @@ static int __init rapl_register_psys(void)
 
 	rd->name = rapl_domain_names[RAPL_DOMAIN_PLATFORM];
 	rd->id = RAPL_DOMAIN_PLATFORM;
-	rd->regs[RAPL_DOMAIN_REG_LIMIT] = MSR_PLATFORM_POWER_LIMIT;
-	rd->regs[RAPL_DOMAIN_REG_STATUS] = MSR_PLATFORM_ENERGY_STATUS;
+	rd->regs[RAPL_DOMAIN_REG_LIMIT] = rapl_msr_priv.regs[RAPL_DOMAIN_PLATFORM][RAPL_DOMAIN_REG_LIMIT];
+	rd->regs[RAPL_DOMAIN_REG_STATUS] = rapl_msr_priv.regs[RAPL_DOMAIN_PLATFORM][RAPL_DOMAIN_REG_STATUS];
 	rd->rpl[0].prim_id = PL1_ENABLE;
 	rd->rpl[0].name = pl1_name;
 	rd->rpl[1].prim_id = PL2_ENABLE;
@@ -1218,23 +1217,17 @@ static int __init rapl_register_powercap(void)
 	return 0;
 }
 
-static int rapl_check_domain(int cpu, int domain)
+static int rapl_check_domain(int cpu, int domain, struct rapl_package *rp)
 {
-	unsigned msr;
+	u32 reg;
 	u64 val = 0;
 
 	switch (domain) {
 	case RAPL_DOMAIN_PACKAGE:
-		msr = MSR_PKG_ENERGY_STATUS;
-		break;
 	case RAPL_DOMAIN_PP0:
-		msr = MSR_PP0_ENERGY_STATUS;
-		break;
 	case RAPL_DOMAIN_PP1:
-		msr = MSR_PP1_ENERGY_STATUS;
-		break;
 	case RAPL_DOMAIN_DRAM:
-		msr = MSR_DRAM_ENERGY_STATUS;
+		reg = rp->priv->regs[domain][RAPL_DOMAIN_REG_STATUS];
 		break;
 	case RAPL_DOMAIN_PLATFORM:
 		/* PSYS(PLATFORM) is not a CPU domain, so avoid printng error */
@@ -1246,7 +1239,7 @@ static int rapl_check_domain(int cpu, int domain)
 	/* make sure domain counters are available and contains non-zero
 	 * values, otherwise skip it.
 	 */
-	if (rdmsrl_safe_on_cpu(cpu, msr, &val) || !val)
+	if (rdmsrl_safe_on_cpu(cpu, reg, &val) || !val)
 		return -ENODEV;
 
 	return 0;
@@ -1293,7 +1286,7 @@ static int rapl_detect_domains(struct rapl_package *rp, int cpu)
 
 	for (i = 0; i < RAPL_DOMAIN_MAX; i++) {
 		/* use physical package id to read counters */
-		if (!rapl_check_domain(cpu, i)) {
+		if (!rapl_check_domain(cpu, i, rp)) {
 			rp->domain_map |= 1 << i;
 			pr_info("Found RAPL domain %s\n", rapl_domain_names[i]);
 		}
diff --git a/include/linux/intel_rapl.h b/include/linux/intel_rapl.h
index 7bf1683e4a63..ec2c9e83274f 100644
--- a/include/linux/intel_rapl.h
+++ b/include/linux/intel_rapl.h
@@ -95,11 +95,15 @@ struct rapl_domain {
  * @platform_rapl_domain:	Optional. Some RAPL interface may have platform
  *				level RAPL control.
  * @pcap_rapl_online:		CPU hotplug state for each RAPL interface.
+ * @reg_unit:			Register for getting energy/power/time unit.
+ * @regs:			Register sets for different RAPL Domains.
  */
 struct rapl_if_priv {
 	struct powercap_control_type *control_type;
 	struct rapl_domain *platform_rapl_domain;
 	enum cpuhp_state pcap_rapl_online;
+	u32 reg_unit;
+	u32 regs[RAPL_DOMAIN_MAX][RAPL_DOMAIN_REG_MAX];
 };
 
 /* maximum rapl package domain name: package-%d-die-%d */
-- 
cgit v1.2.3


From beea8df821d928e7755917da6c1e45d6afde5148 Mon Sep 17 00:00:00 2001
From: Zhang Rui <rui.zhang@intel.com>
Date: Wed, 10 Jul 2019 21:44:27 +0800
Subject: intel_rapl: abstract register access operations

MSR and MMIO RAPL interfaces have different ways to access the registers,
thus in order to abstract the register access operations, two callbacks,
.read_raw()/.write_raw() are introduced, and they should be implemented by
the MSR RAPL and MMIO RAPL interface drivers respectively.

This patch implements them for the MSR I/F only.

Reviewed-by: Pandruvada, Srinivas <srinivas.pandruvada@intel.com>
Tested-by: Pandruvada, Srinivas <srinivas.pandruvada@intel.com>
Signed-off-by: Zhang Rui <rui.zhang@intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/powercap/intel_rapl.c | 110 ++++++++++++++++++++++--------------------
 include/linux/intel_rapl.h    |  13 +++++
 2 files changed, 70 insertions(+), 53 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/powercap/intel_rapl.c b/drivers/powercap/intel_rapl.c
index 9f22aed49f24..d3b9d1cf4d48 100644
--- a/drivers/powercap/intel_rapl.c
+++ b/drivers/powercap/intel_rapl.c
@@ -93,13 +93,6 @@ static struct rapl_if_priv rapl_msr_priv = {
 /* per domain data, some are optional */
 #define NR_RAW_PRIMITIVES (NR_RAPL_PRIMITIVES - 2)
 
-struct msrl_action {
-	u32 msr_no;
-	u64 clear_mask;
-	u64 set_mask;
-	int err;
-};
-
 #define	DOMAIN_STATE_INACTIVE           BIT(0)
 #define	DOMAIN_STATE_POWER_LIMIT_SET    BIT(1)
 #define DOMAIN_STATE_BIOS_LOCKED        BIT(2)
@@ -692,16 +685,16 @@ static int rapl_read_data_raw(struct rapl_domain *rd,
 			enum rapl_primitives prim,
 			bool xlate, u64 *data)
 {
-	u64 value, final;
-	u32 msr;
+	u64 value;
 	struct rapl_primitive_info *rp = &rpi[prim];
+	struct reg_action ra;
 	int cpu;
 
 	if (!rp->name || rp->flag & RAPL_PRIMITIVE_DUMMY)
 		return -EINVAL;
 
-	msr = rd->regs[rp->id];
-	if (!msr)
+	ra.reg = rd->regs[rp->id];
+	if (!ra.reg)
 		return -EINVAL;
 
 	cpu = rd->rp->lead_cpu;
@@ -717,47 +710,23 @@ static int rapl_read_data_raw(struct rapl_domain *rd,
 		return 0;
 	}
 
-	if (rdmsrl_safe_on_cpu(cpu, msr, &value)) {
-		pr_debug("failed to read msr 0x%x on cpu %d\n", msr, cpu);
+	ra.mask = rp->mask;
+
+	if (rd->rp->priv->read_raw(cpu, &ra)) {
+		pr_debug("failed to read reg 0x%x on cpu %d\n", ra.reg, cpu);
 		return -EIO;
 	}
 
-	final = value & rp->mask;
-	final = final >> rp->shift;
+	value = ra.value >> rp->shift;
+
 	if (xlate)
-		*data = rapl_unit_xlate(rd, rp->unit, final, 0);
+		*data = rapl_unit_xlate(rd, rp->unit, value, 0);
 	else
-		*data = final;
+		*data = value;
 
 	return 0;
 }
 
-
-static int msrl_update_safe(u32 msr_no, u64 clear_mask, u64 set_mask)
-{
-	int err;
-	u64 val;
-
-	err = rdmsrl_safe(msr_no, &val);
-	if (err)
-		goto out;
-
-	val &= ~clear_mask;
-	val |= set_mask;
-
-	err = wrmsrl_safe(msr_no, val);
-
-out:
-	return err;
-}
-
-static void msrl_update_func(void *info)
-{
-	struct msrl_action *ma = info;
-
-	ma->err = msrl_update_safe(ma->msr_no, ma->clear_mask, ma->set_mask);
-}
-
 /* Similar use of primitive info in the read counterpart */
 static int rapl_write_data_raw(struct rapl_domain *rd,
 			enum rapl_primitives prim,
@@ -766,7 +735,7 @@ static int rapl_write_data_raw(struct rapl_domain *rd,
 	struct rapl_primitive_info *rp = &rpi[prim];
 	int cpu;
 	u64 bits;
-	struct msrl_action ma;
+	struct reg_action ra;
 	int ret;
 
 	cpu = rd->rp->lead_cpu;
@@ -774,17 +743,13 @@ static int rapl_write_data_raw(struct rapl_domain *rd,
 	bits <<= rp->shift;
 	bits &= rp->mask;
 
-	memset(&ma, 0, sizeof(ma));
+	memset(&ra, 0, sizeof(ra));
 
-	ma.msr_no = rd->regs[rp->id];
-	ma.clear_mask = rp->mask;
-	ma.set_mask = bits;
+	ra.reg = rd->regs[rp->id];
+	ra.mask = rp->mask;
+	ra.value = bits;
 
-	ret = smp_call_function_single(cpu, msrl_update_func, &ma, 1);
-	if (ret)
-		WARN_ON_ONCE(ret);
-	else
-		ret = ma.err;
+	ret = rd->rp->priv->write_raw(cpu, &ra);
 
 	return ret;
 }
@@ -1507,6 +1472,43 @@ static struct notifier_block rapl_pm_notifier = {
 	.notifier_call = rapl_pm_callback,
 };
 
+static int rapl_msr_read_raw(int cpu, struct reg_action *ra)
+{
+	if (rdmsrl_safe_on_cpu(cpu, ra->reg, &ra->value)) {
+		pr_debug("failed to read msr 0x%x on cpu %d\n", ra->reg, cpu);
+		return -EIO;
+	}
+	ra->value &= ra->mask;
+	return 0;
+}
+
+static void rapl_msr_update_func(void *info)
+{
+	struct reg_action *ra = info;
+	u64 val;
+
+	ra->err = rdmsrl_safe(ra->reg, &val);
+	if (ra->err)
+		return;
+
+	val &= ~ra->mask;
+	val |= ra->value;
+
+	ra->err = wrmsrl_safe(ra->reg, val);
+}
+
+
+static int rapl_msr_write_raw(int cpu, struct reg_action *ra)
+{
+	int ret;
+
+	ret = smp_call_function_single(cpu, rapl_msr_update_func, ra, 1);
+	if (WARN_ON_ONCE(ret))
+		return ret;
+
+	return ra->err;
+}
+
 static int __init rapl_init(void)
 {
 	const struct x86_cpu_id *id;
@@ -1522,6 +1524,8 @@ static int __init rapl_init(void)
 
 	rapl_defaults = (struct rapl_defaults *)id->driver_data;
 
+	rapl_msr_priv.read_raw = rapl_msr_read_raw;
+	rapl_msr_priv.write_raw = rapl_msr_write_raw;
 	ret = rapl_register_powercap();
 	if (ret)
 		return ret;
diff --git a/include/linux/intel_rapl.h b/include/linux/intel_rapl.h
index ec2c9e83274f..ff215d64d114 100644
--- a/include/linux/intel_rapl.h
+++ b/include/linux/intel_rapl.h
@@ -88,6 +88,13 @@ struct rapl_domain {
 	struct rapl_package *rp;
 };
 
+struct reg_action {
+	u32 reg;
+	u64 mask;
+	u64 value;
+	int err;
+};
+
 /**
  * struct rapl_if_priv: private data for different RAPL interfaces
  * @control_type:		Each RAPL interface must have its own powercap
@@ -97,6 +104,10 @@ struct rapl_domain {
  * @pcap_rapl_online:		CPU hotplug state for each RAPL interface.
  * @reg_unit:			Register for getting energy/power/time unit.
  * @regs:			Register sets for different RAPL Domains.
+ * @read_raw:			Callback for reading RAPL interface specific
+ *				registers.
+ * @write_raw:			Callback for writing RAPL interface specific
+ *				registers.
  */
 struct rapl_if_priv {
 	struct powercap_control_type *control_type;
@@ -104,6 +115,8 @@ struct rapl_if_priv {
 	enum cpuhp_state pcap_rapl_online;
 	u32 reg_unit;
 	u32 regs[RAPL_DOMAIN_MAX][RAPL_DOMAIN_REG_MAX];
+	int (*read_raw)(int cpu, struct reg_action *ra);
+	int (*write_raw)(int cpu, struct reg_action *ra);
 };
 
 /* maximum rapl package domain name: package-%d-die-%d */
-- 
cgit v1.2.3


From 3382388d714891fc0f575926189f33d22e7c960b Mon Sep 17 00:00:00 2001
From: Zhang Rui <rui.zhang@intel.com>
Date: Wed, 10 Jul 2019 21:44:30 +0800
Subject: intel_rapl: abstract RAPL common code

Split intel_rapl.c into intel_rapl_common.c and intel_rapl_msr.c, where
intel_rapl_common.c contains the common code that can be used by both MSR
and MMIO interface.
intel_rapl_msr.c contains the implementation of RAPL MSR interface.

Reviewed-by: Pandruvada, Srinivas <srinivas.pandruvada@intel.com>
Tested-by: Pandruvada, Srinivas <srinivas.pandruvada@intel.com>
Signed-off-by: Zhang Rui <rui.zhang@intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/powercap/Kconfig             |   11 +-
 drivers/powercap/Makefile            |    3 +-
 drivers/powercap/intel_rapl.c        | 1574 ----------------------------------
 drivers/powercap/intel_rapl_common.c | 1469 +++++++++++++++++++++++++++++++
 drivers/powercap/intel_rapl_msr.c    |  163 ++++
 include/linux/intel_rapl.h           |    7 +
 6 files changed, 1648 insertions(+), 1579 deletions(-)
 delete mode 100644 drivers/powercap/intel_rapl.c
 create mode 100644 drivers/powercap/intel_rapl_common.c
 create mode 100644 drivers/powercap/intel_rapl_msr.c

(limited to 'include/linux')

diff --git a/drivers/powercap/Kconfig b/drivers/powercap/Kconfig
index 42d3798c88f0..dc1c1381d7fa 100644
--- a/drivers/powercap/Kconfig
+++ b/drivers/powercap/Kconfig
@@ -16,14 +16,17 @@ menuconfig POWERCAP
 
 if POWERCAP
 # Client driver configurations go here.
+config INTEL_RAPL_CORE
+	tristate
+
 config INTEL_RAPL
-	tristate "Intel RAPL Support"
+	tristate "Intel RAPL Support via MSR Interface"
 	depends on X86 && IOSF_MBI
-	default n
+	select INTEL_RAPL_CORE
 	---help---
 	  This enables support for the Intel Running Average Power Limit (RAPL)
-	  technology which allows power limits to be enforced and monitored on
-	  modern Intel processors (Sandy Bridge and later).
+	  technology via MSR interface, which allows power limits to be enforced
+	  and monitored on modern Intel processors (Sandy Bridge and later).
 
 	  In RAPL, the platform level settings are divided into domains for
 	  fine grained control. These domains include processor package, DRAM
diff --git a/drivers/powercap/Makefile b/drivers/powercap/Makefile
index 81c8ccaba6e7..7255c94ec61c 100644
--- a/drivers/powercap/Makefile
+++ b/drivers/powercap/Makefile
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: GPL-2.0-only
 obj-$(CONFIG_POWERCAP)	+= powercap_sys.o
-obj-$(CONFIG_INTEL_RAPL) += intel_rapl.o
+obj-$(CONFIG_INTEL_RAPL_CORE) += intel_rapl_common.o
+obj-$(CONFIG_INTEL_RAPL) += intel_rapl_msr.o
 obj-$(CONFIG_IDLE_INJECT) += idle_inject.o
diff --git a/drivers/powercap/intel_rapl.c b/drivers/powercap/intel_rapl.c
deleted file mode 100644
index aa54c06ed518..000000000000
--- a/drivers/powercap/intel_rapl.c
+++ /dev/null
@@ -1,1574 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Intel Running Average Power Limit (RAPL) Driver
- * Copyright (c) 2013, Intel Corporation.
- */
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/list.h>
-#include <linux/types.h>
-#include <linux/device.h>
-#include <linux/slab.h>
-#include <linux/log2.h>
-#include <linux/bitmap.h>
-#include <linux/delay.h>
-#include <linux/sysfs.h>
-#include <linux/cpu.h>
-#include <linux/powercap.h>
-#include <linux/suspend.h>
-#include <linux/intel_rapl.h>
-
-#include <asm/iosf_mbi.h>
-#include <asm/processor.h>
-#include <asm/cpu_device_id.h>
-#include <asm/intel-family.h>
-
-/* Local defines */
-#define MSR_PLATFORM_POWER_LIMIT	0x0000065C
-
-/* bitmasks for RAPL MSRs, used by primitive access functions */
-#define ENERGY_STATUS_MASK      0xffffffff
-
-#define POWER_LIMIT1_MASK       0x7FFF
-#define POWER_LIMIT1_ENABLE     BIT(15)
-#define POWER_LIMIT1_CLAMP      BIT(16)
-
-#define POWER_LIMIT2_MASK       (0x7FFFULL<<32)
-#define POWER_LIMIT2_ENABLE     BIT_ULL(47)
-#define POWER_LIMIT2_CLAMP      BIT_ULL(48)
-#define POWER_PACKAGE_LOCK      BIT_ULL(63)
-#define POWER_PP_LOCK           BIT(31)
-
-#define TIME_WINDOW1_MASK       (0x7FULL<<17)
-#define TIME_WINDOW2_MASK       (0x7FULL<<49)
-
-#define POWER_UNIT_OFFSET	0
-#define POWER_UNIT_MASK		0x0F
-
-#define ENERGY_UNIT_OFFSET	0x08
-#define ENERGY_UNIT_MASK	0x1F00
-
-#define TIME_UNIT_OFFSET	0x10
-#define TIME_UNIT_MASK		0xF0000
-
-#define POWER_INFO_MAX_MASK     (0x7fffULL<<32)
-#define POWER_INFO_MIN_MASK     (0x7fffULL<<16)
-#define POWER_INFO_MAX_TIME_WIN_MASK     (0x3fULL<<48)
-#define POWER_INFO_THERMAL_SPEC_MASK     0x7fff
-
-#define PERF_STATUS_THROTTLE_TIME_MASK 0xffffffff
-#define PP_POLICY_MASK         0x1F
-
-/* Non HW constants */
-#define RAPL_PRIMITIVE_DERIVED       BIT(1) /* not from raw data */
-#define RAPL_PRIMITIVE_DUMMY         BIT(2)
-
-#define TIME_WINDOW_MAX_MSEC 40000
-#define TIME_WINDOW_MIN_MSEC 250
-#define ENERGY_UNIT_SCALE    1000 /* scale from driver unit to powercap unit */
-enum unit_type {
-	ARBITRARY_UNIT, /* no translation */
-	POWER_UNIT,
-	ENERGY_UNIT,
-	TIME_UNIT,
-};
-
-/* private data for RAPL MSR Interface */
-static struct rapl_if_priv rapl_msr_priv = {
-	.reg_unit = MSR_RAPL_POWER_UNIT,
-	.regs[RAPL_DOMAIN_PACKAGE] = {
-		MSR_PKG_POWER_LIMIT, MSR_PKG_ENERGY_STATUS, MSR_PKG_PERF_STATUS, 0, MSR_PKG_POWER_INFO },
-	.regs[RAPL_DOMAIN_PP0] = {
-		MSR_PP0_POWER_LIMIT, MSR_PP0_ENERGY_STATUS, 0, MSR_PP0_POLICY, 0 },
-	.regs[RAPL_DOMAIN_PP1] = {
-		MSR_PP1_POWER_LIMIT, MSR_PP1_ENERGY_STATUS, 0, MSR_PP1_POLICY, 0 },
-	.regs[RAPL_DOMAIN_DRAM] = {
-		MSR_DRAM_POWER_LIMIT, MSR_DRAM_ENERGY_STATUS, MSR_DRAM_PERF_STATUS, 0, MSR_DRAM_POWER_INFO },
-	.regs[RAPL_DOMAIN_PLATFORM] = {
-		MSR_PLATFORM_POWER_LIMIT, MSR_PLATFORM_ENERGY_STATUS, 0, 0, 0},
-};
-
-/* per domain data, some are optional */
-#define NR_RAW_PRIMITIVES (NR_RAPL_PRIMITIVES - 2)
-
-#define	DOMAIN_STATE_INACTIVE           BIT(0)
-#define	DOMAIN_STATE_POWER_LIMIT_SET    BIT(1)
-#define DOMAIN_STATE_BIOS_LOCKED        BIT(2)
-
-static const char pl1_name[] = "long_term";
-static const char pl2_name[] = "short_term";
-
-#define power_zone_to_rapl_domain(_zone) \
-	container_of(_zone, struct rapl_domain, power_zone)
-
-struct rapl_defaults {
-	u8 floor_freq_reg_addr;
-	int (*check_unit)(struct rapl_package *rp, int cpu);
-	void (*set_floor_freq)(struct rapl_domain *rd, bool mode);
-	u64 (*compute_time_window)(struct rapl_package *rp, u64 val,
-				bool to_raw);
-	unsigned int dram_domain_energy_unit;
-};
-static struct rapl_defaults *rapl_defaults;
-
-/* Sideband MBI registers */
-#define IOSF_CPU_POWER_BUDGET_CTL_BYT (0x2)
-#define IOSF_CPU_POWER_BUDGET_CTL_TNG (0xdf)
-
-#define PACKAGE_PLN_INT_SAVED   BIT(0)
-#define MAX_PRIM_NAME (32)
-
-/* per domain data. used to describe individual knobs such that access function
- * can be consolidated into one instead of many inline functions.
- */
-struct rapl_primitive_info {
-	const char *name;
-	u64 mask;
-	int shift;
-	enum rapl_domain_reg_id id;
-	enum unit_type unit;
-	u32 flag;
-};
-
-#define PRIMITIVE_INFO_INIT(p, m, s, i, u, f) {	\
-		.name = #p,			\
-		.mask = m,			\
-		.shift = s,			\
-		.id = i,			\
-		.unit = u,			\
-		.flag = f			\
-	}
-
-static void rapl_init_domains(struct rapl_package *rp);
-static int rapl_read_data_raw(struct rapl_domain *rd,
-			enum rapl_primitives prim,
-			bool xlate, u64 *data);
-static int rapl_write_data_raw(struct rapl_domain *rd,
-			enum rapl_primitives prim,
-			unsigned long long value);
-static u64 rapl_unit_xlate(struct rapl_domain *rd,
-			enum unit_type type, u64 value,
-			int to_raw);
-static void package_power_limit_irq_save(struct rapl_package *rp);
-
-static LIST_HEAD(rapl_packages); /* guarded by CPU hotplug lock */
-
-static const char * const rapl_domain_names[] = {
-	"package",
-	"core",
-	"uncore",
-	"dram",
-	"psys",
-};
-
-/* caller to ensure CPU hotplug lock is held */
-static struct rapl_package *rapl_find_package_domain(int cpu, struct rapl_if_priv *priv)
-{
-	int id = topology_logical_die_id(cpu);
-	struct rapl_package *rp;
-
-	list_for_each_entry(rp, &rapl_packages, plist) {
-		if (rp->id == id && rp->priv->control_type == priv->control_type)
-			return rp;
-	}
-
-	return NULL;
-}
-
-static int get_energy_counter(struct powercap_zone *power_zone, u64 *energy_raw)
-{
-	struct rapl_domain *rd;
-	u64 energy_now;
-
-	/* prevent CPU hotplug, make sure the RAPL domain does not go
-	 * away while reading the counter.
-	 */
-	get_online_cpus();
-	rd = power_zone_to_rapl_domain(power_zone);
-
-	if (!rapl_read_data_raw(rd, ENERGY_COUNTER, true, &energy_now)) {
-		*energy_raw = energy_now;
-		put_online_cpus();
-
-		return 0;
-	}
-	put_online_cpus();
-
-	return -EIO;
-}
-
-static int get_max_energy_counter(struct powercap_zone *pcd_dev, u64 *energy)
-{
-	struct rapl_domain *rd = power_zone_to_rapl_domain(pcd_dev);
-
-	*energy = rapl_unit_xlate(rd, ENERGY_UNIT, ENERGY_STATUS_MASK, 0);
-	return 0;
-}
-
-static int release_zone(struct powercap_zone *power_zone)
-{
-	struct rapl_domain *rd = power_zone_to_rapl_domain(power_zone);
-	struct rapl_package *rp = rd->rp;
-
-	/* package zone is the last zone of a package, we can free
-	 * memory here since all children has been unregistered.
-	 */
-	if (rd->id == RAPL_DOMAIN_PACKAGE) {
-		kfree(rd);
-		rp->domains = NULL;
-	}
-
-	return 0;
-
-}
-
-static int find_nr_power_limit(struct rapl_domain *rd)
-{
-	int i, nr_pl = 0;
-
-	for (i = 0; i < NR_POWER_LIMITS; i++) {
-		if (rd->rpl[i].name)
-			nr_pl++;
-	}
-
-	return nr_pl;
-}
-
-static int set_domain_enable(struct powercap_zone *power_zone, bool mode)
-{
-	struct rapl_domain *rd = power_zone_to_rapl_domain(power_zone);
-
-	if (rd->state & DOMAIN_STATE_BIOS_LOCKED)
-		return -EACCES;
-
-	get_online_cpus();
-	rapl_write_data_raw(rd, PL1_ENABLE, mode);
-	if (rapl_defaults->set_floor_freq)
-		rapl_defaults->set_floor_freq(rd, mode);
-	put_online_cpus();
-
-	return 0;
-}
-
-static int get_domain_enable(struct powercap_zone *power_zone, bool *mode)
-{
-	struct rapl_domain *rd = power_zone_to_rapl_domain(power_zone);
-	u64 val;
-
-	if (rd->state & DOMAIN_STATE_BIOS_LOCKED) {
-		*mode = false;
-		return 0;
-	}
-	get_online_cpus();
-	if (rapl_read_data_raw(rd, PL1_ENABLE, true, &val)) {
-		put_online_cpus();
-		return -EIO;
-	}
-	*mode = val;
-	put_online_cpus();
-
-	return 0;
-}
-
-/* per RAPL domain ops, in the order of rapl_domain_type */
-static const struct powercap_zone_ops zone_ops[] = {
-	/* RAPL_DOMAIN_PACKAGE */
-	{
-		.get_energy_uj = get_energy_counter,
-		.get_max_energy_range_uj = get_max_energy_counter,
-		.release = release_zone,
-		.set_enable = set_domain_enable,
-		.get_enable = get_domain_enable,
-	},
-	/* RAPL_DOMAIN_PP0 */
-	{
-		.get_energy_uj = get_energy_counter,
-		.get_max_energy_range_uj = get_max_energy_counter,
-		.release = release_zone,
-		.set_enable = set_domain_enable,
-		.get_enable = get_domain_enable,
-	},
-	/* RAPL_DOMAIN_PP1 */
-	{
-		.get_energy_uj = get_energy_counter,
-		.get_max_energy_range_uj = get_max_energy_counter,
-		.release = release_zone,
-		.set_enable = set_domain_enable,
-		.get_enable = get_domain_enable,
-	},
-	/* RAPL_DOMAIN_DRAM */
-	{
-		.get_energy_uj = get_energy_counter,
-		.get_max_energy_range_uj = get_max_energy_counter,
-		.release = release_zone,
-		.set_enable = set_domain_enable,
-		.get_enable = get_domain_enable,
-	},
-	/* RAPL_DOMAIN_PLATFORM */
-	{
-		.get_energy_uj = get_energy_counter,
-		.get_max_energy_range_uj = get_max_energy_counter,
-		.release = release_zone,
-		.set_enable = set_domain_enable,
-		.get_enable = get_domain_enable,
-	},
-};
-
-
-/*
- * Constraint index used by powercap can be different than power limit (PL)
- * index in that some  PLs maybe missing due to non-existant MSRs. So we
- * need to convert here by finding the valid PLs only (name populated).
- */
-static int contraint_to_pl(struct rapl_domain *rd, int cid)
-{
-	int i, j;
-
-	for (i = 0, j = 0; i < NR_POWER_LIMITS; i++) {
-		if ((rd->rpl[i].name) && j++ == cid) {
-			pr_debug("%s: index %d\n", __func__, i);
-			return i;
-		}
-	}
-	pr_err("Cannot find matching power limit for constraint %d\n", cid);
-
-	return -EINVAL;
-}
-
-static int set_power_limit(struct powercap_zone *power_zone, int cid,
-			u64 power_limit)
-{
-	struct rapl_domain *rd;
-	struct rapl_package *rp;
-	int ret = 0;
-	int id;
-
-	get_online_cpus();
-	rd = power_zone_to_rapl_domain(power_zone);
-	id = contraint_to_pl(rd, cid);
-	if (id < 0) {
-		ret = id;
-		goto set_exit;
-	}
-
-	rp = rd->rp;
-
-	if (rd->state & DOMAIN_STATE_BIOS_LOCKED) {
-		dev_warn(&power_zone->dev, "%s locked by BIOS, monitoring only\n",
-			rd->name);
-		ret = -EACCES;
-		goto set_exit;
-	}
-
-	switch (rd->rpl[id].prim_id) {
-	case PL1_ENABLE:
-		rapl_write_data_raw(rd, POWER_LIMIT1, power_limit);
-		break;
-	case PL2_ENABLE:
-		rapl_write_data_raw(rd, POWER_LIMIT2, power_limit);
-		break;
-	default:
-		ret = -EINVAL;
-	}
-	if (!ret)
-		package_power_limit_irq_save(rp);
-set_exit:
-	put_online_cpus();
-	return ret;
-}
-
-static int get_current_power_limit(struct powercap_zone *power_zone, int cid,
-					u64 *data)
-{
-	struct rapl_domain *rd;
-	u64 val;
-	int prim;
-	int ret = 0;
-	int id;
-
-	get_online_cpus();
-	rd = power_zone_to_rapl_domain(power_zone);
-	id = contraint_to_pl(rd, cid);
-	if (id < 0) {
-		ret = id;
-		goto get_exit;
-	}
-
-	switch (rd->rpl[id].prim_id) {
-	case PL1_ENABLE:
-		prim = POWER_LIMIT1;
-		break;
-	case PL2_ENABLE:
-		prim = POWER_LIMIT2;
-		break;
-	default:
-		put_online_cpus();
-		return -EINVAL;
-	}
-	if (rapl_read_data_raw(rd, prim, true, &val))
-		ret = -EIO;
-	else
-		*data = val;
-
-get_exit:
-	put_online_cpus();
-
-	return ret;
-}
-
-static int set_time_window(struct powercap_zone *power_zone, int cid,
-								u64 window)
-{
-	struct rapl_domain *rd;
-	int ret = 0;
-	int id;
-
-	get_online_cpus();
-	rd = power_zone_to_rapl_domain(power_zone);
-	id = contraint_to_pl(rd, cid);
-	if (id < 0) {
-		ret = id;
-		goto set_time_exit;
-	}
-
-	switch (rd->rpl[id].prim_id) {
-	case PL1_ENABLE:
-		rapl_write_data_raw(rd, TIME_WINDOW1, window);
-		break;
-	case PL2_ENABLE:
-		rapl_write_data_raw(rd, TIME_WINDOW2, window);
-		break;
-	default:
-		ret = -EINVAL;
-	}
-
-set_time_exit:
-	put_online_cpus();
-	return ret;
-}
-
-static int get_time_window(struct powercap_zone *power_zone, int cid, u64 *data)
-{
-	struct rapl_domain *rd;
-	u64 val;
-	int ret = 0;
-	int id;
-
-	get_online_cpus();
-	rd = power_zone_to_rapl_domain(power_zone);
-	id = contraint_to_pl(rd, cid);
-	if (id < 0) {
-		ret = id;
-		goto get_time_exit;
-	}
-
-	switch (rd->rpl[id].prim_id) {
-	case PL1_ENABLE:
-		ret = rapl_read_data_raw(rd, TIME_WINDOW1, true, &val);
-		break;
-	case PL2_ENABLE:
-		ret = rapl_read_data_raw(rd, TIME_WINDOW2, true, &val);
-		break;
-	default:
-		put_online_cpus();
-		return -EINVAL;
-	}
-	if (!ret)
-		*data = val;
-
-get_time_exit:
-	put_online_cpus();
-
-	return ret;
-}
-
-static const char *get_constraint_name(struct powercap_zone *power_zone, int cid)
-{
-	struct rapl_domain *rd;
-	int id;
-
-	rd = power_zone_to_rapl_domain(power_zone);
-	id = contraint_to_pl(rd, cid);
-	if (id >= 0)
-		return rd->rpl[id].name;
-
-	return NULL;
-}
-
-
-static int get_max_power(struct powercap_zone *power_zone, int id,
-					u64 *data)
-{
-	struct rapl_domain *rd;
-	u64 val;
-	int prim;
-	int ret = 0;
-
-	get_online_cpus();
-	rd = power_zone_to_rapl_domain(power_zone);
-	switch (rd->rpl[id].prim_id) {
-	case PL1_ENABLE:
-		prim = THERMAL_SPEC_POWER;
-		break;
-	case PL2_ENABLE:
-		prim = MAX_POWER;
-		break;
-	default:
-		put_online_cpus();
-		return -EINVAL;
-	}
-	if (rapl_read_data_raw(rd, prim, true, &val))
-		ret = -EIO;
-	else
-		*data = val;
-
-	put_online_cpus();
-
-	return ret;
-}
-
-static const struct powercap_zone_constraint_ops constraint_ops = {
-	.set_power_limit_uw = set_power_limit,
-	.get_power_limit_uw = get_current_power_limit,
-	.set_time_window_us = set_time_window,
-	.get_time_window_us = get_time_window,
-	.get_max_power_uw = get_max_power,
-	.get_name = get_constraint_name,
-};
-
-/* called after domain detection and package level data are set */
-static void rapl_init_domains(struct rapl_package *rp)
-{
-	int i;
-	struct rapl_domain *rd = rp->domains;
-
-	for (i = 0; i < RAPL_DOMAIN_MAX; i++) {
-		unsigned int mask = rp->domain_map & (1 << i);
-
-		rd->regs[RAPL_DOMAIN_REG_LIMIT] = rp->priv->regs[i][RAPL_DOMAIN_REG_LIMIT];
-		rd->regs[RAPL_DOMAIN_REG_STATUS] = rp->priv->regs[i][RAPL_DOMAIN_REG_STATUS];
-		rd->regs[RAPL_DOMAIN_REG_PERF] = rp->priv->regs[i][RAPL_DOMAIN_REG_PERF];
-		rd->regs[RAPL_DOMAIN_REG_POLICY] = rp->priv->regs[i][RAPL_DOMAIN_REG_POLICY];
-		rd->regs[RAPL_DOMAIN_REG_INFO] = rp->priv->regs[i][RAPL_DOMAIN_REG_INFO];
-
-		switch (mask) {
-		case BIT(RAPL_DOMAIN_PACKAGE):
-			rd->name = rapl_domain_names[RAPL_DOMAIN_PACKAGE];
-			rd->id = RAPL_DOMAIN_PACKAGE;
-			rd->rpl[0].prim_id = PL1_ENABLE;
-			rd->rpl[0].name = pl1_name;
-			rd->rpl[1].prim_id = PL2_ENABLE;
-			rd->rpl[1].name = pl2_name;
-			break;
-		case BIT(RAPL_DOMAIN_PP0):
-			rd->name = rapl_domain_names[RAPL_DOMAIN_PP0];
-			rd->id = RAPL_DOMAIN_PP0;
-			rd->rpl[0].prim_id = PL1_ENABLE;
-			rd->rpl[0].name = pl1_name;
-			break;
-		case BIT(RAPL_DOMAIN_PP1):
-			rd->name = rapl_domain_names[RAPL_DOMAIN_PP1];
-			rd->id = RAPL_DOMAIN_PP1;
-			rd->rpl[0].prim_id = PL1_ENABLE;
-			rd->rpl[0].name = pl1_name;
-			break;
-		case BIT(RAPL_DOMAIN_DRAM):
-			rd->name = rapl_domain_names[RAPL_DOMAIN_DRAM];
-			rd->id = RAPL_DOMAIN_DRAM;
-			rd->rpl[0].prim_id = PL1_ENABLE;
-			rd->rpl[0].name = pl1_name;
-			rd->domain_energy_unit =
-				rapl_defaults->dram_domain_energy_unit;
-			if (rd->domain_energy_unit)
-				pr_info("DRAM domain energy unit %dpj\n",
-					rd->domain_energy_unit);
-			break;
-		}
-		if (mask) {
-			rd->rp = rp;
-			rd++;
-		}
-	}
-}
-
-static u64 rapl_unit_xlate(struct rapl_domain *rd, enum unit_type type,
-			u64 value, int to_raw)
-{
-	u64 units = 1;
-	struct rapl_package *rp = rd->rp;
-	u64 scale = 1;
-
-	switch (type) {
-	case POWER_UNIT:
-		units = rp->power_unit;
-		break;
-	case ENERGY_UNIT:
-		scale = ENERGY_UNIT_SCALE;
-		/* per domain unit takes precedence */
-		if (rd->domain_energy_unit)
-			units = rd->domain_energy_unit;
-		else
-			units = rp->energy_unit;
-		break;
-	case TIME_UNIT:
-		return rapl_defaults->compute_time_window(rp, value, to_raw);
-	case ARBITRARY_UNIT:
-	default:
-		return value;
-	};
-
-	if (to_raw)
-		return div64_u64(value, units) * scale;
-
-	value *= units;
-
-	return div64_u64(value, scale);
-}
-
-/* in the order of enum rapl_primitives */
-static struct rapl_primitive_info rpi[] = {
-	/* name, mask, shift, msr index, unit divisor */
-	PRIMITIVE_INFO_INIT(ENERGY_COUNTER, ENERGY_STATUS_MASK, 0,
-				RAPL_DOMAIN_REG_STATUS, ENERGY_UNIT, 0),
-	PRIMITIVE_INFO_INIT(POWER_LIMIT1, POWER_LIMIT1_MASK, 0,
-				RAPL_DOMAIN_REG_LIMIT, POWER_UNIT, 0),
-	PRIMITIVE_INFO_INIT(POWER_LIMIT2, POWER_LIMIT2_MASK, 32,
-				RAPL_DOMAIN_REG_LIMIT, POWER_UNIT, 0),
-	PRIMITIVE_INFO_INIT(FW_LOCK, POWER_PP_LOCK, 31,
-				RAPL_DOMAIN_REG_LIMIT, ARBITRARY_UNIT, 0),
-	PRIMITIVE_INFO_INIT(PL1_ENABLE, POWER_LIMIT1_ENABLE, 15,
-				RAPL_DOMAIN_REG_LIMIT, ARBITRARY_UNIT, 0),
-	PRIMITIVE_INFO_INIT(PL1_CLAMP, POWER_LIMIT1_CLAMP, 16,
-				RAPL_DOMAIN_REG_LIMIT, ARBITRARY_UNIT, 0),
-	PRIMITIVE_INFO_INIT(PL2_ENABLE, POWER_LIMIT2_ENABLE, 47,
-				RAPL_DOMAIN_REG_LIMIT, ARBITRARY_UNIT, 0),
-	PRIMITIVE_INFO_INIT(PL2_CLAMP, POWER_LIMIT2_CLAMP, 48,
-				RAPL_DOMAIN_REG_LIMIT, ARBITRARY_UNIT, 0),
-	PRIMITIVE_INFO_INIT(TIME_WINDOW1, TIME_WINDOW1_MASK, 17,
-				RAPL_DOMAIN_REG_LIMIT, TIME_UNIT, 0),
-	PRIMITIVE_INFO_INIT(TIME_WINDOW2, TIME_WINDOW2_MASK, 49,
-				RAPL_DOMAIN_REG_LIMIT, TIME_UNIT, 0),
-	PRIMITIVE_INFO_INIT(THERMAL_SPEC_POWER, POWER_INFO_THERMAL_SPEC_MASK,
-				0, RAPL_DOMAIN_REG_INFO, POWER_UNIT, 0),
-	PRIMITIVE_INFO_INIT(MAX_POWER, POWER_INFO_MAX_MASK, 32,
-				RAPL_DOMAIN_REG_INFO, POWER_UNIT, 0),
-	PRIMITIVE_INFO_INIT(MIN_POWER, POWER_INFO_MIN_MASK, 16,
-				RAPL_DOMAIN_REG_INFO, POWER_UNIT, 0),
-	PRIMITIVE_INFO_INIT(MAX_TIME_WINDOW, POWER_INFO_MAX_TIME_WIN_MASK, 48,
-				RAPL_DOMAIN_REG_INFO, TIME_UNIT, 0),
-	PRIMITIVE_INFO_INIT(THROTTLED_TIME, PERF_STATUS_THROTTLE_TIME_MASK, 0,
-				RAPL_DOMAIN_REG_PERF, TIME_UNIT, 0),
-	PRIMITIVE_INFO_INIT(PRIORITY_LEVEL, PP_POLICY_MASK, 0,
-				RAPL_DOMAIN_REG_POLICY, ARBITRARY_UNIT, 0),
-	/* non-hardware */
-	PRIMITIVE_INFO_INIT(AVERAGE_POWER, 0, 0, 0, POWER_UNIT,
-				RAPL_PRIMITIVE_DERIVED),
-	{NULL, 0, 0, 0},
-};
-
-/* Read primitive data based on its related struct rapl_primitive_info.
- * if xlate flag is set, return translated data based on data units, i.e.
- * time, energy, and power.
- * RAPL MSRs are non-architectual and are laid out not consistently across
- * domains. Here we use primitive info to allow writing consolidated access
- * functions.
- * For a given primitive, it is processed by MSR mask and shift. Unit conversion
- * is pre-assigned based on RAPL unit MSRs read at init time.
- * 63-------------------------- 31--------------------------- 0
- * |                           xxxxx (mask)                   |
- * |                                |<- shift ----------------|
- * 63-------------------------- 31--------------------------- 0
- */
-static int rapl_read_data_raw(struct rapl_domain *rd,
-			enum rapl_primitives prim,
-			bool xlate, u64 *data)
-{
-	u64 value;
-	struct rapl_primitive_info *rp = &rpi[prim];
-	struct reg_action ra;
-	int cpu;
-
-	if (!rp->name || rp->flag & RAPL_PRIMITIVE_DUMMY)
-		return -EINVAL;
-
-	ra.reg = rd->regs[rp->id];
-	if (!ra.reg)
-		return -EINVAL;
-
-	cpu = rd->rp->lead_cpu;
-
-	/* special-case package domain, which uses a different bit*/
-	if (prim == FW_LOCK && rd->id == RAPL_DOMAIN_PACKAGE) {
-		rp->mask = POWER_PACKAGE_LOCK;
-		rp->shift = 63;
-	}
-	/* non-hardware data are collected by the polling thread */
-	if (rp->flag & RAPL_PRIMITIVE_DERIVED) {
-		*data = rd->rdd.primitives[prim];
-		return 0;
-	}
-
-	ra.mask = rp->mask;
-
-	if (rd->rp->priv->read_raw(cpu, &ra)) {
-		pr_debug("failed to read reg 0x%x on cpu %d\n", ra.reg, cpu);
-		return -EIO;
-	}
-
-	value = ra.value >> rp->shift;
-
-	if (xlate)
-		*data = rapl_unit_xlate(rd, rp->unit, value, 0);
-	else
-		*data = value;
-
-	return 0;
-}
-
-/* Similar use of primitive info in the read counterpart */
-static int rapl_write_data_raw(struct rapl_domain *rd,
-			enum rapl_primitives prim,
-			unsigned long long value)
-{
-	struct rapl_primitive_info *rp = &rpi[prim];
-	int cpu;
-	u64 bits;
-	struct reg_action ra;
-	int ret;
-
-	cpu = rd->rp->lead_cpu;
-	bits = rapl_unit_xlate(rd, rp->unit, value, 1);
-	bits <<= rp->shift;
-	bits &= rp->mask;
-
-	memset(&ra, 0, sizeof(ra));
-
-	ra.reg = rd->regs[rp->id];
-	ra.mask = rp->mask;
-	ra.value = bits;
-
-	ret = rd->rp->priv->write_raw(cpu, &ra);
-
-	return ret;
-}
-
-/*
- * Raw RAPL data stored in MSRs are in certain scales. We need to
- * convert them into standard units based on the units reported in
- * the RAPL unit MSRs. This is specific to CPUs as the method to
- * calculate units differ on different CPUs.
- * We convert the units to below format based on CPUs.
- * i.e.
- * energy unit: picoJoules  : Represented in picoJoules by default
- * power unit : microWatts  : Represented in milliWatts by default
- * time unit  : microseconds: Represented in seconds by default
- */
-static int rapl_check_unit_core(struct rapl_package *rp, int cpu)
-{
-	struct reg_action ra;
-	u32 value;
-
-	ra.reg = rp->priv->reg_unit;
-	ra.mask = ~0;
-	if (rp->priv->read_raw(cpu, &ra)) {
-		pr_err("Failed to read power unit REG 0x%x on CPU %d, exit.\n",
-			rp->priv->reg_unit, cpu);
-		return -ENODEV;
-	}
-
-	value = (ra.value & ENERGY_UNIT_MASK) >> ENERGY_UNIT_OFFSET;
-	rp->energy_unit = ENERGY_UNIT_SCALE * 1000000 / (1 << value);
-
-	value = (ra.value & POWER_UNIT_MASK) >> POWER_UNIT_OFFSET;
-	rp->power_unit = 1000000 / (1 << value);
-
-	value = (ra.value & TIME_UNIT_MASK) >> TIME_UNIT_OFFSET;
-	rp->time_unit = 1000000 / (1 << value);
-
-	pr_debug("Core CPU %s energy=%dpJ, time=%dus, power=%duW\n",
-		rp->name, rp->energy_unit, rp->time_unit, rp->power_unit);
-
-	return 0;
-}
-
-static int rapl_check_unit_atom(struct rapl_package *rp, int cpu)
-{
-	struct reg_action ra;
-	u32 value;
-
-	ra.reg = rp->priv->reg_unit;
-	ra.mask = ~0;
-	if (rp->priv->read_raw(cpu, &ra)) {
-		pr_err("Failed to read power unit REG 0x%x on CPU %d, exit.\n",
-			rp->priv->reg_unit, cpu);
-		return -ENODEV;
-	}
-
-	value = (ra.value & ENERGY_UNIT_MASK) >> ENERGY_UNIT_OFFSET;
-	rp->energy_unit = ENERGY_UNIT_SCALE * 1 << value;
-
-	value = (ra.value & POWER_UNIT_MASK) >> POWER_UNIT_OFFSET;
-	rp->power_unit = (1 << value) * 1000;
-
-	value = (ra.value & TIME_UNIT_MASK) >> TIME_UNIT_OFFSET;
-	rp->time_unit = 1000000 / (1 << value);
-
-	pr_debug("Atom %s energy=%dpJ, time=%dus, power=%duW\n",
-		rp->name, rp->energy_unit, rp->time_unit, rp->power_unit);
-
-	return 0;
-}
-
-static void power_limit_irq_save_cpu(void *info)
-{
-	u32 l, h = 0;
-	struct rapl_package *rp = (struct rapl_package *)info;
-
-	/* save the state of PLN irq mask bit before disabling it */
-	rdmsr_safe(MSR_IA32_PACKAGE_THERM_INTERRUPT, &l, &h);
-	if (!(rp->power_limit_irq & PACKAGE_PLN_INT_SAVED)) {
-		rp->power_limit_irq = l & PACKAGE_THERM_INT_PLN_ENABLE;
-		rp->power_limit_irq |= PACKAGE_PLN_INT_SAVED;
-	}
-	l &= ~PACKAGE_THERM_INT_PLN_ENABLE;
-	wrmsr_safe(MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h);
-}
-
-
-/* REVISIT:
- * When package power limit is set artificially low by RAPL, LVT
- * thermal interrupt for package power limit should be ignored
- * since we are not really exceeding the real limit. The intention
- * is to avoid excessive interrupts while we are trying to save power.
- * A useful feature might be routing the package_power_limit interrupt
- * to userspace via eventfd. once we have a usecase, this is simple
- * to do by adding an atomic notifier.
- */
-
-static void package_power_limit_irq_save(struct rapl_package *rp)
-{
-	if (!boot_cpu_has(X86_FEATURE_PTS) || !boot_cpu_has(X86_FEATURE_PLN))
-		return;
-
-	smp_call_function_single(rp->lead_cpu, power_limit_irq_save_cpu, rp, 1);
-}
-
-/*
- * Restore per package power limit interrupt enable state. Called from cpu
- * hotplug code on package removal.
- */
-static void package_power_limit_irq_restore(struct rapl_package *rp)
-{
-	u32 l, h;
-
-	if (!boot_cpu_has(X86_FEATURE_PTS) || !boot_cpu_has(X86_FEATURE_PLN))
-		return;
-
-	/* irq enable state not saved, nothing to restore */
-	if (!(rp->power_limit_irq & PACKAGE_PLN_INT_SAVED))
-		return;
-
-	rdmsr_safe(MSR_IA32_PACKAGE_THERM_INTERRUPT, &l, &h);
-
-	if (rp->power_limit_irq & PACKAGE_THERM_INT_PLN_ENABLE)
-		l |= PACKAGE_THERM_INT_PLN_ENABLE;
-	else
-		l &= ~PACKAGE_THERM_INT_PLN_ENABLE;
-
-	wrmsr_safe(MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h);
-}
-
-static void set_floor_freq_default(struct rapl_domain *rd, bool mode)
-{
-	int nr_powerlimit = find_nr_power_limit(rd);
-
-	/* always enable clamp such that p-state can go below OS requested
-	 * range. power capping priority over guranteed frequency.
-	 */
-	rapl_write_data_raw(rd, PL1_CLAMP, mode);
-
-	/* some domains have pl2 */
-	if (nr_powerlimit > 1) {
-		rapl_write_data_raw(rd, PL2_ENABLE, mode);
-		rapl_write_data_raw(rd, PL2_CLAMP, mode);
-	}
-}
-
-static void set_floor_freq_atom(struct rapl_domain *rd, bool enable)
-{
-	static u32 power_ctrl_orig_val;
-	u32 mdata;
-
-	if (!rapl_defaults->floor_freq_reg_addr) {
-		pr_err("Invalid floor frequency config register\n");
-		return;
-	}
-
-	if (!power_ctrl_orig_val)
-		iosf_mbi_read(BT_MBI_UNIT_PMC, MBI_CR_READ,
-			      rapl_defaults->floor_freq_reg_addr,
-			      &power_ctrl_orig_val);
-	mdata = power_ctrl_orig_val;
-	if (enable) {
-		mdata &= ~(0x7f << 8);
-		mdata |= 1 << 8;
-	}
-	iosf_mbi_write(BT_MBI_UNIT_PMC, MBI_CR_WRITE,
-		       rapl_defaults->floor_freq_reg_addr, mdata);
-}
-
-static u64 rapl_compute_time_window_core(struct rapl_package *rp, u64 value,
-					bool to_raw)
-{
-	u64 f, y; /* fraction and exp. used for time unit */
-
-	/*
-	 * Special processing based on 2^Y*(1+F/4), refer
-	 * to Intel Software Developer's manual Vol.3B: CH 14.9.3.
-	 */
-	if (!to_raw) {
-		f = (value & 0x60) >> 5;
-		y = value & 0x1f;
-		value = (1 << y) * (4 + f) * rp->time_unit / 4;
-	} else {
-		do_div(value, rp->time_unit);
-		y = ilog2(value);
-		f = div64_u64(4 * (value - (1 << y)), 1 << y);
-		value = (y & 0x1f) | ((f & 0x3) << 5);
-	}
-	return value;
-}
-
-static u64 rapl_compute_time_window_atom(struct rapl_package *rp, u64 value,
-					bool to_raw)
-{
-	/*
-	 * Atom time unit encoding is straight forward val * time_unit,
-	 * where time_unit is default to 1 sec. Never 0.
-	 */
-	if (!to_raw)
-		return (value) ? value *= rp->time_unit : rp->time_unit;
-	else
-		value = div64_u64(value, rp->time_unit);
-
-	return value;
-}
-
-static const struct rapl_defaults rapl_defaults_core = {
-	.floor_freq_reg_addr = 0,
-	.check_unit = rapl_check_unit_core,
-	.set_floor_freq = set_floor_freq_default,
-	.compute_time_window = rapl_compute_time_window_core,
-};
-
-static const struct rapl_defaults rapl_defaults_hsw_server = {
-	.check_unit = rapl_check_unit_core,
-	.set_floor_freq = set_floor_freq_default,
-	.compute_time_window = rapl_compute_time_window_core,
-	.dram_domain_energy_unit = 15300,
-};
-
-static const struct rapl_defaults rapl_defaults_byt = {
-	.floor_freq_reg_addr = IOSF_CPU_POWER_BUDGET_CTL_BYT,
-	.check_unit = rapl_check_unit_atom,
-	.set_floor_freq = set_floor_freq_atom,
-	.compute_time_window = rapl_compute_time_window_atom,
-};
-
-static const struct rapl_defaults rapl_defaults_tng = {
-	.floor_freq_reg_addr = IOSF_CPU_POWER_BUDGET_CTL_TNG,
-	.check_unit = rapl_check_unit_atom,
-	.set_floor_freq = set_floor_freq_atom,
-	.compute_time_window = rapl_compute_time_window_atom,
-};
-
-static const struct rapl_defaults rapl_defaults_ann = {
-	.floor_freq_reg_addr = 0,
-	.check_unit = rapl_check_unit_atom,
-	.set_floor_freq = NULL,
-	.compute_time_window = rapl_compute_time_window_atom,
-};
-
-static const struct rapl_defaults rapl_defaults_cht = {
-	.floor_freq_reg_addr = 0,
-	.check_unit = rapl_check_unit_atom,
-	.set_floor_freq = NULL,
-	.compute_time_window = rapl_compute_time_window_atom,
-};
-
-static const struct x86_cpu_id rapl_ids[] __initconst = {
-	INTEL_CPU_FAM6(SANDYBRIDGE,		rapl_defaults_core),
-	INTEL_CPU_FAM6(SANDYBRIDGE_X,		rapl_defaults_core),
-
-	INTEL_CPU_FAM6(IVYBRIDGE,		rapl_defaults_core),
-	INTEL_CPU_FAM6(IVYBRIDGE_X,		rapl_defaults_core),
-
-	INTEL_CPU_FAM6(HASWELL_CORE,		rapl_defaults_core),
-	INTEL_CPU_FAM6(HASWELL_ULT,		rapl_defaults_core),
-	INTEL_CPU_FAM6(HASWELL_GT3E,		rapl_defaults_core),
-	INTEL_CPU_FAM6(HASWELL_X,		rapl_defaults_hsw_server),
-
-	INTEL_CPU_FAM6(BROADWELL_CORE,		rapl_defaults_core),
-	INTEL_CPU_FAM6(BROADWELL_GT3E,		rapl_defaults_core),
-	INTEL_CPU_FAM6(BROADWELL_XEON_D,	rapl_defaults_core),
-	INTEL_CPU_FAM6(BROADWELL_X,		rapl_defaults_hsw_server),
-
-	INTEL_CPU_FAM6(SKYLAKE_DESKTOP,		rapl_defaults_core),
-	INTEL_CPU_FAM6(SKYLAKE_MOBILE,		rapl_defaults_core),
-	INTEL_CPU_FAM6(SKYLAKE_X,		rapl_defaults_hsw_server),
-	INTEL_CPU_FAM6(KABYLAKE_MOBILE,		rapl_defaults_core),
-	INTEL_CPU_FAM6(KABYLAKE_DESKTOP,	rapl_defaults_core),
-	INTEL_CPU_FAM6(CANNONLAKE_MOBILE,	rapl_defaults_core),
-	INTEL_CPU_FAM6(ICELAKE_MOBILE,		rapl_defaults_core),
-
-	INTEL_CPU_FAM6(ATOM_SILVERMONT,		rapl_defaults_byt),
-	INTEL_CPU_FAM6(ATOM_AIRMONT,		rapl_defaults_cht),
-	INTEL_CPU_FAM6(ATOM_SILVERMONT_MID,	rapl_defaults_tng),
-	INTEL_CPU_FAM6(ATOM_AIRMONT_MID,	rapl_defaults_ann),
-	INTEL_CPU_FAM6(ATOM_GOLDMONT,		rapl_defaults_core),
-	INTEL_CPU_FAM6(ATOM_GOLDMONT_PLUS,	rapl_defaults_core),
-	INTEL_CPU_FAM6(ATOM_GOLDMONT_X,		rapl_defaults_core),
-	INTEL_CPU_FAM6(ATOM_TREMONT_X,		rapl_defaults_core),
-
-	INTEL_CPU_FAM6(XEON_PHI_KNL,		rapl_defaults_hsw_server),
-	INTEL_CPU_FAM6(XEON_PHI_KNM,		rapl_defaults_hsw_server),
-	{}
-};
-MODULE_DEVICE_TABLE(x86cpu, rapl_ids);
-
-/* Read once for all raw primitive data for domains */
-static void rapl_update_domain_data(struct rapl_package *rp)
-{
-	int dmn, prim;
-	u64 val;
-
-	for (dmn = 0; dmn < rp->nr_domains; dmn++) {
-		pr_debug("update %s domain %s data\n", rp->name,
-			 rp->domains[dmn].name);
-		/* exclude non-raw primitives */
-		for (prim = 0; prim < NR_RAW_PRIMITIVES; prim++) {
-			if (!rapl_read_data_raw(&rp->domains[dmn], prim,
-						rpi[prim].unit, &val))
-				rp->domains[dmn].rdd.primitives[prim] =	val;
-		}
-	}
-
-}
-
-static int rapl_package_register_powercap(struct rapl_package *rp)
-{
-	struct rapl_domain *rd;
-	struct powercap_zone *power_zone = NULL;
-	int nr_pl, ret;
-
-	/* Update the domain data of the new package */
-	rapl_update_domain_data(rp);
-
-	/* first we register package domain as the parent zone*/
-	for (rd = rp->domains; rd < rp->domains + rp->nr_domains; rd++) {
-		if (rd->id == RAPL_DOMAIN_PACKAGE) {
-			nr_pl = find_nr_power_limit(rd);
-			pr_debug("register package domain %s\n", rp->name);
-			power_zone = powercap_register_zone(&rd->power_zone,
-							rp->priv->control_type,
-							rp->name, NULL,
-							&zone_ops[rd->id],
-							nr_pl,
-							&constraint_ops);
-			if (IS_ERR(power_zone)) {
-				pr_debug("failed to register power zone %s\n",
-					rp->name);
-				return PTR_ERR(power_zone);
-			}
-			/* track parent zone in per package/socket data */
-			rp->power_zone = power_zone;
-			/* done, only one package domain per socket */
-			break;
-		}
-	}
-	if (!power_zone) {
-		pr_err("no package domain found, unknown topology!\n");
-		return -ENODEV;
-	}
-	/* now register domains as children of the socket/package*/
-	for (rd = rp->domains; rd < rp->domains + rp->nr_domains; rd++) {
-		if (rd->id == RAPL_DOMAIN_PACKAGE)
-			continue;
-		/* number of power limits per domain varies */
-		nr_pl = find_nr_power_limit(rd);
-		power_zone = powercap_register_zone(&rd->power_zone,
-						rp->priv->control_type, rd->name,
-						rp->power_zone,
-						&zone_ops[rd->id], nr_pl,
-						&constraint_ops);
-
-		if (IS_ERR(power_zone)) {
-			pr_debug("failed to register power_zone, %s:%s\n",
-				rp->name, rd->name);
-			ret = PTR_ERR(power_zone);
-			goto err_cleanup;
-		}
-	}
-	return 0;
-
-err_cleanup:
-	/*
-	 * Clean up previously initialized domains within the package if we
-	 * failed after the first domain setup.
-	 */
-	while (--rd >= rp->domains) {
-		pr_debug("unregister %s domain %s\n", rp->name, rd->name);
-		powercap_unregister_zone(rp->priv->control_type, &rd->power_zone);
-	}
-
-	return ret;
-}
-
-static int __init rapl_add_platform_domain(struct rapl_if_priv *priv)
-{
-	struct rapl_domain *rd;
-	struct powercap_zone *power_zone;
-	struct reg_action ra;
-	int ret;
-
-	ra.reg = priv->regs[RAPL_DOMAIN_PLATFORM][RAPL_DOMAIN_REG_STATUS];
-	ra.mask = ~0;
-	ret = priv->read_raw(0, &ra);
-	if (ret || !ra.value)
-		return -ENODEV;
-
-	ra.reg = priv->regs[RAPL_DOMAIN_PLATFORM][RAPL_DOMAIN_REG_LIMIT];
-	ra.mask = ~0;
-	ret = priv->read_raw(0, &ra);
-	if (ret || !ra.value)
-		return -ENODEV;
-
-	rd = kzalloc(sizeof(*rd), GFP_KERNEL);
-	if (!rd)
-		return -ENOMEM;
-
-	rd->name = rapl_domain_names[RAPL_DOMAIN_PLATFORM];
-	rd->id = RAPL_DOMAIN_PLATFORM;
-	rd->regs[RAPL_DOMAIN_REG_LIMIT] = priv->regs[RAPL_DOMAIN_PLATFORM][RAPL_DOMAIN_REG_LIMIT];
-	rd->regs[RAPL_DOMAIN_REG_STATUS] = priv->regs[RAPL_DOMAIN_PLATFORM][RAPL_DOMAIN_REG_STATUS];
-	rd->rpl[0].prim_id = PL1_ENABLE;
-	rd->rpl[0].name = pl1_name;
-	rd->rpl[1].prim_id = PL2_ENABLE;
-	rd->rpl[1].name = pl2_name;
-	rd->rp = rapl_find_package_domain(0, priv);
-
-	power_zone = powercap_register_zone(&rd->power_zone, priv->control_type,
-					    "psys", NULL,
-					    &zone_ops[RAPL_DOMAIN_PLATFORM],
-					    2, &constraint_ops);
-
-	if (IS_ERR(power_zone)) {
-		kfree(rd);
-		return PTR_ERR(power_zone);
-	}
-
-	priv->platform_rapl_domain = rd;
-
-	return 0;
-}
-
-static void rapl_remove_platform_domain(struct rapl_if_priv *priv)
-{
-	if (priv->platform_rapl_domain) {
-		powercap_unregister_zone(priv->control_type,
-			&priv->platform_rapl_domain->power_zone);
-		kfree(priv->platform_rapl_domain);
-	}
-}
-
-static int rapl_check_domain(int cpu, int domain, struct rapl_package *rp)
-{
-	struct reg_action ra;
-
-	switch (domain) {
-	case RAPL_DOMAIN_PACKAGE:
-	case RAPL_DOMAIN_PP0:
-	case RAPL_DOMAIN_PP1:
-	case RAPL_DOMAIN_DRAM:
-		ra.reg = rp->priv->regs[domain][RAPL_DOMAIN_REG_STATUS];
-		break;
-	case RAPL_DOMAIN_PLATFORM:
-		/* PSYS(PLATFORM) is not a CPU domain, so avoid printng error */
-		return -EINVAL;
-	default:
-		pr_err("invalid domain id %d\n", domain);
-		return -EINVAL;
-	}
-	/* make sure domain counters are available and contains non-zero
-	 * values, otherwise skip it.
-	 */
-
-	ra.mask = ~0;
-	if (rp->priv->read_raw(cpu, &ra) || !ra.value)
-		return -ENODEV;
-
-	return 0;
-}
-
-
-/*
- * Check if power limits are available. Two cases when they are not available:
- * 1. Locked by BIOS, in this case we still provide read-only access so that
- *    users can see what limit is set by the BIOS.
- * 2. Some CPUs make some domains monitoring only which means PLx MSRs may not
- *    exist at all. In this case, we do not show the contraints in powercap.
- *
- * Called after domains are detected and initialized.
- */
-static void rapl_detect_powerlimit(struct rapl_domain *rd)
-{
-	u64 val64;
-	int i;
-
-	/* check if the domain is locked by BIOS, ignore if MSR doesn't exist */
-	if (!rapl_read_data_raw(rd, FW_LOCK, false, &val64)) {
-		if (val64) {
-			pr_info("RAPL %s domain %s locked by BIOS\n",
-				rd->rp->name, rd->name);
-			rd->state |= DOMAIN_STATE_BIOS_LOCKED;
-		}
-	}
-	/* check if power limit MSRs exists, otherwise domain is monitoring only */
-	for (i = 0; i < NR_POWER_LIMITS; i++) {
-		int prim = rd->rpl[i].prim_id;
-		if (rapl_read_data_raw(rd, prim, false, &val64))
-			rd->rpl[i].name = NULL;
-	}
-}
-
-/* Detect active and valid domains for the given CPU, caller must
- * ensure the CPU belongs to the targeted package and CPU hotlug is disabled.
- */
-static int rapl_detect_domains(struct rapl_package *rp, int cpu)
-{
-	struct rapl_domain *rd;
-	int i;
-
-	for (i = 0; i < RAPL_DOMAIN_MAX; i++) {
-		/* use physical package id to read counters */
-		if (!rapl_check_domain(cpu, i, rp)) {
-			rp->domain_map |= 1 << i;
-			pr_info("Found RAPL domain %s\n", rapl_domain_names[i]);
-		}
-	}
-	rp->nr_domains = bitmap_weight(&rp->domain_map,	RAPL_DOMAIN_MAX);
-	if (!rp->nr_domains) {
-		pr_debug("no valid rapl domains found in %s\n", rp->name);
-		return -ENODEV;
-	}
-	pr_debug("found %d domains on %s\n", rp->nr_domains, rp->name);
-
-	rp->domains = kcalloc(rp->nr_domains + 1, sizeof(struct rapl_domain),
-			GFP_KERNEL);
-	if (!rp->domains)
-		return -ENOMEM;
-
-	rapl_init_domains(rp);
-
-	for (rd = rp->domains; rd < rp->domains + rp->nr_domains; rd++)
-		rapl_detect_powerlimit(rd);
-
-	return 0;
-}
-
-/* called from CPU hotplug notifier, hotplug lock held */
-static void rapl_remove_package(struct rapl_package *rp)
-{
-	struct rapl_domain *rd, *rd_package = NULL;
-
-	package_power_limit_irq_restore(rp);
-
-	for (rd = rp->domains; rd < rp->domains + rp->nr_domains; rd++) {
-		rapl_write_data_raw(rd, PL1_ENABLE, 0);
-		rapl_write_data_raw(rd, PL1_CLAMP, 0);
-		if (find_nr_power_limit(rd) > 1) {
-			rapl_write_data_raw(rd, PL2_ENABLE, 0);
-			rapl_write_data_raw(rd, PL2_CLAMP, 0);
-		}
-		if (rd->id == RAPL_DOMAIN_PACKAGE) {
-			rd_package = rd;
-			continue;
-		}
-		pr_debug("remove package, undo power limit on %s: %s\n",
-			 rp->name, rd->name);
-		powercap_unregister_zone(rp->priv->control_type, &rd->power_zone);
-	}
-	/* do parent zone last */
-	powercap_unregister_zone(rp->priv->control_type, &rd_package->power_zone);
-	list_del(&rp->plist);
-	kfree(rp);
-}
-
-/* called from CPU hotplug notifier, hotplug lock held */
-static struct rapl_package *rapl_add_package(int cpu, struct rapl_if_priv *priv)
-{
-	int id = topology_logical_die_id(cpu);
-	struct rapl_package *rp;
-	struct cpuinfo_x86 *c = &cpu_data(cpu);
-	int ret;
-
-	rp = kzalloc(sizeof(struct rapl_package), GFP_KERNEL);
-	if (!rp)
-		return ERR_PTR(-ENOMEM);
-
-	/* add the new package to the list */
-	rp->id = id;
-	rp->lead_cpu = cpu;
-	rp->priv = priv;
-
-	if (topology_max_die_per_package() > 1)
-		snprintf(rp->name, PACKAGE_DOMAIN_NAME_LENGTH,
-			"package-%d-die-%d", c->phys_proc_id, c->cpu_die_id);
-	else
-		snprintf(rp->name, PACKAGE_DOMAIN_NAME_LENGTH, "package-%d",
-			c->phys_proc_id);
-
-	/* check if the package contains valid domains */
-	if (rapl_detect_domains(rp, cpu) ||
-		rapl_defaults->check_unit(rp, cpu)) {
-		ret = -ENODEV;
-		goto err_free_package;
-	}
-	ret = rapl_package_register_powercap(rp);
-	if (!ret) {
-		INIT_LIST_HEAD(&rp->plist);
-		list_add(&rp->plist, &rapl_packages);
-		return rp;
-	}
-
-err_free_package:
-	kfree(rp->domains);
-	kfree(rp);
-	return ERR_PTR(ret);
-}
-
-/* Handles CPU hotplug on multi-socket systems.
- * If a CPU goes online as the first CPU of the physical package
- * we add the RAPL package to the system. Similarly, when the last
- * CPU of the package is removed, we remove the RAPL package and its
- * associated domains. Cooling devices are handled accordingly at
- * per-domain level.
- */
-static int rapl_cpu_online(unsigned int cpu)
-{
-	struct rapl_package *rp;
-
-	rp = rapl_find_package_domain(cpu, &rapl_msr_priv);
-	if (!rp) {
-		rp = rapl_add_package(cpu, &rapl_msr_priv);
-		if (IS_ERR(rp))
-			return PTR_ERR(rp);
-	}
-	cpumask_set_cpu(cpu, &rp->cpumask);
-	return 0;
-}
-
-static int rapl_cpu_down_prep(unsigned int cpu)
-{
-	struct rapl_package *rp;
-	int lead_cpu;
-
-	rp = rapl_find_package_domain(cpu, &rapl_msr_priv);
-	if (!rp)
-		return 0;
-
-	cpumask_clear_cpu(cpu, &rp->cpumask);
-	lead_cpu = cpumask_first(&rp->cpumask);
-	if (lead_cpu >= nr_cpu_ids)
-		rapl_remove_package(rp);
-	else if (rp->lead_cpu == cpu)
-		rp->lead_cpu = lead_cpu;
-	return 0;
-}
-
-static void power_limit_state_save(void)
-{
-	struct rapl_package *rp;
-	struct rapl_domain *rd;
-	int nr_pl, ret, i;
-
-	get_online_cpus();
-	list_for_each_entry(rp, &rapl_packages, plist) {
-		if (!rp->power_zone)
-			continue;
-		rd = power_zone_to_rapl_domain(rp->power_zone);
-		nr_pl = find_nr_power_limit(rd);
-		for (i = 0; i < nr_pl; i++) {
-			switch (rd->rpl[i].prim_id) {
-			case PL1_ENABLE:
-				ret = rapl_read_data_raw(rd,
-						POWER_LIMIT1,
-						true,
-						&rd->rpl[i].last_power_limit);
-				if (ret)
-					rd->rpl[i].last_power_limit = 0;
-				break;
-			case PL2_ENABLE:
-				ret = rapl_read_data_raw(rd,
-						POWER_LIMIT2,
-						true,
-						&rd->rpl[i].last_power_limit);
-				if (ret)
-					rd->rpl[i].last_power_limit = 0;
-				break;
-			}
-		}
-	}
-	put_online_cpus();
-}
-
-static void power_limit_state_restore(void)
-{
-	struct rapl_package *rp;
-	struct rapl_domain *rd;
-	int nr_pl, i;
-
-	get_online_cpus();
-	list_for_each_entry(rp, &rapl_packages, plist) {
-		if (!rp->power_zone)
-			continue;
-		rd = power_zone_to_rapl_domain(rp->power_zone);
-		nr_pl = find_nr_power_limit(rd);
-		for (i = 0; i < nr_pl; i++) {
-			switch (rd->rpl[i].prim_id) {
-			case PL1_ENABLE:
-				if (rd->rpl[i].last_power_limit)
-					rapl_write_data_raw(rd,
-						POWER_LIMIT1,
-						rd->rpl[i].last_power_limit);
-				break;
-			case PL2_ENABLE:
-				if (rd->rpl[i].last_power_limit)
-					rapl_write_data_raw(rd,
-						POWER_LIMIT2,
-						rd->rpl[i].last_power_limit);
-				break;
-			}
-		}
-	}
-	put_online_cpus();
-}
-
-static int rapl_pm_callback(struct notifier_block *nb,
-	unsigned long mode, void *_unused)
-{
-	switch (mode) {
-	case PM_SUSPEND_PREPARE:
-		power_limit_state_save();
-		break;
-	case PM_POST_SUSPEND:
-		power_limit_state_restore();
-		break;
-	}
-	return NOTIFY_OK;
-}
-
-static struct notifier_block rapl_pm_notifier = {
-	.notifier_call = rapl_pm_callback,
-};
-
-static int rapl_msr_read_raw(int cpu, struct reg_action *ra)
-{
-	if (rdmsrl_safe_on_cpu(cpu, ra->reg, &ra->value)) {
-		pr_debug("failed to read msr 0x%x on cpu %d\n", ra->reg, cpu);
-		return -EIO;
-	}
-	ra->value &= ra->mask;
-	return 0;
-}
-
-static void rapl_msr_update_func(void *info)
-{
-	struct reg_action *ra = info;
-	u64 val;
-
-	ra->err = rdmsrl_safe(ra->reg, &val);
-	if (ra->err)
-		return;
-
-	val &= ~ra->mask;
-	val |= ra->value;
-
-	ra->err = wrmsrl_safe(ra->reg, val);
-}
-
-
-static int rapl_msr_write_raw(int cpu, struct reg_action *ra)
-{
-	int ret;
-
-	ret = smp_call_function_single(cpu, rapl_msr_update_func, ra, 1);
-	if (WARN_ON_ONCE(ret))
-		return ret;
-
-	return ra->err;
-}
-
-static int __init rapl_init(void)
-{
-	const struct x86_cpu_id *id;
-	int ret;
-
-	id = x86_match_cpu(rapl_ids);
-	if (!id) {
-		pr_err("driver does not support CPU family %d model %d\n",
-			boot_cpu_data.x86, boot_cpu_data.x86_model);
-
-		return -ENODEV;
-	}
-
-	rapl_defaults = (struct rapl_defaults *)id->driver_data;
-
-	rapl_msr_priv.read_raw = rapl_msr_read_raw;
-	rapl_msr_priv.write_raw = rapl_msr_write_raw;
-
-	rapl_msr_priv.control_type = powercap_register_control_type(NULL, "intel-rapl", NULL);
-	if (IS_ERR(rapl_msr_priv.control_type)) {
-		pr_debug("failed to register powercap control_type.\n");
-		return PTR_ERR(rapl_msr_priv.control_type);
-	}
-
-	ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "powercap/rapl:online",
-				rapl_cpu_online, rapl_cpu_down_prep);
-	if (ret < 0)
-		goto err_unreg;
-	rapl_msr_priv.pcap_rapl_online = ret;
-
-	/* Don't bail out if PSys is not supported */
-	rapl_add_platform_domain(&rapl_msr_priv);
-
-	ret = register_pm_notifier(&rapl_pm_notifier);
-	if (ret)
-		goto err_unreg_all;
-
-	return 0;
-
-err_unreg_all:
-	cpuhp_remove_state(rapl_msr_priv.pcap_rapl_online);
-
-err_unreg:
-	powercap_unregister_control_type(rapl_msr_priv.control_type);
-	return ret;
-}
-
-static void __exit rapl_exit(void)
-{
-	unregister_pm_notifier(&rapl_pm_notifier);
-	cpuhp_remove_state(rapl_msr_priv.pcap_rapl_online);
-	rapl_remove_platform_domain(&rapl_msr_priv);
-	powercap_unregister_control_type(rapl_msr_priv.control_type);
-}
-
-module_init(rapl_init);
-module_exit(rapl_exit);
-
-MODULE_DESCRIPTION("Driver for Intel RAPL (Running Average Power Limit)");
-MODULE_AUTHOR("Jacob Pan <jacob.jun.pan@intel.com>");
-MODULE_LICENSE("GPL v2");
diff --git a/drivers/powercap/intel_rapl_common.c b/drivers/powercap/intel_rapl_common.c
new file mode 100644
index 000000000000..34a82531a7cf
--- /dev/null
+++ b/drivers/powercap/intel_rapl_common.c
@@ -0,0 +1,1469 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Common code for Intel Running Average Power Limit (RAPL) support.
+ * Copyright (c) 2019, Intel Corporation.
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/list.h>
+#include <linux/types.h>
+#include <linux/device.h>
+#include <linux/slab.h>
+#include <linux/log2.h>
+#include <linux/bitmap.h>
+#include <linux/delay.h>
+#include <linux/sysfs.h>
+#include <linux/cpu.h>
+#include <linux/powercap.h>
+#include <linux/suspend.h>
+#include <asm/iosf_mbi.h>
+#include <linux/intel_rapl.h>
+
+#include <linux/processor.h>
+#include <asm/cpu_device_id.h>
+#include <asm/intel-family.h>
+
+/* Local defines */
+#define MSR_PLATFORM_POWER_LIMIT	0x0000065C
+
+/* bitmasks for RAPL MSRs, used by primitive access functions */
+#define ENERGY_STATUS_MASK      0xffffffff
+
+#define POWER_LIMIT1_MASK       0x7FFF
+#define POWER_LIMIT1_ENABLE     BIT(15)
+#define POWER_LIMIT1_CLAMP      BIT(16)
+
+#define POWER_LIMIT2_MASK       (0x7FFFULL<<32)
+#define POWER_LIMIT2_ENABLE     BIT_ULL(47)
+#define POWER_LIMIT2_CLAMP      BIT_ULL(48)
+#define POWER_PACKAGE_LOCK      BIT_ULL(63)
+#define POWER_PP_LOCK           BIT(31)
+
+#define TIME_WINDOW1_MASK       (0x7FULL<<17)
+#define TIME_WINDOW2_MASK       (0x7FULL<<49)
+
+#define POWER_UNIT_OFFSET	0
+#define POWER_UNIT_MASK		0x0F
+
+#define ENERGY_UNIT_OFFSET	0x08
+#define ENERGY_UNIT_MASK	0x1F00
+
+#define TIME_UNIT_OFFSET	0x10
+#define TIME_UNIT_MASK		0xF0000
+
+#define POWER_INFO_MAX_MASK     (0x7fffULL<<32)
+#define POWER_INFO_MIN_MASK     (0x7fffULL<<16)
+#define POWER_INFO_MAX_TIME_WIN_MASK     (0x3fULL<<48)
+#define POWER_INFO_THERMAL_SPEC_MASK     0x7fff
+
+#define PERF_STATUS_THROTTLE_TIME_MASK 0xffffffff
+#define PP_POLICY_MASK         0x1F
+
+/* Non HW constants */
+#define RAPL_PRIMITIVE_DERIVED       BIT(1)	/* not from raw data */
+#define RAPL_PRIMITIVE_DUMMY         BIT(2)
+
+#define TIME_WINDOW_MAX_MSEC 40000
+#define TIME_WINDOW_MIN_MSEC 250
+#define ENERGY_UNIT_SCALE    1000	/* scale from driver unit to powercap unit */
+enum unit_type {
+	ARBITRARY_UNIT,		/* no translation */
+	POWER_UNIT,
+	ENERGY_UNIT,
+	TIME_UNIT,
+};
+
+/* per domain data, some are optional */
+#define NR_RAW_PRIMITIVES (NR_RAPL_PRIMITIVES - 2)
+
+#define	DOMAIN_STATE_INACTIVE           BIT(0)
+#define	DOMAIN_STATE_POWER_LIMIT_SET    BIT(1)
+#define DOMAIN_STATE_BIOS_LOCKED        BIT(2)
+
+static const char pl1_name[] = "long_term";
+static const char pl2_name[] = "short_term";
+
+#define power_zone_to_rapl_domain(_zone) \
+	container_of(_zone, struct rapl_domain, power_zone)
+
+struct rapl_defaults {
+	u8 floor_freq_reg_addr;
+	int (*check_unit)(struct rapl_package *rp, int cpu);
+	void (*set_floor_freq)(struct rapl_domain *rd, bool mode);
+	u64 (*compute_time_window)(struct rapl_package *rp, u64 val,
+				    bool to_raw);
+	unsigned int dram_domain_energy_unit;
+};
+static struct rapl_defaults *rapl_defaults;
+
+/* Sideband MBI registers */
+#define IOSF_CPU_POWER_BUDGET_CTL_BYT (0x2)
+#define IOSF_CPU_POWER_BUDGET_CTL_TNG (0xdf)
+
+#define PACKAGE_PLN_INT_SAVED   BIT(0)
+#define MAX_PRIM_NAME (32)
+
+/* per domain data. used to describe individual knobs such that access function
+ * can be consolidated into one instead of many inline functions.
+ */
+struct rapl_primitive_info {
+	const char *name;
+	u64 mask;
+	int shift;
+	enum rapl_domain_reg_id id;
+	enum unit_type unit;
+	u32 flag;
+};
+
+#define PRIMITIVE_INFO_INIT(p, m, s, i, u, f) {	\
+		.name = #p,			\
+		.mask = m,			\
+		.shift = s,			\
+		.id = i,			\
+		.unit = u,			\
+		.flag = f			\
+	}
+
+static void rapl_init_domains(struct rapl_package *rp);
+static int rapl_read_data_raw(struct rapl_domain *rd,
+			      enum rapl_primitives prim,
+			      bool xlate, u64 *data);
+static int rapl_write_data_raw(struct rapl_domain *rd,
+			       enum rapl_primitives prim,
+			       unsigned long long value);
+static u64 rapl_unit_xlate(struct rapl_domain *rd,
+			   enum unit_type type, u64 value, int to_raw);
+static void package_power_limit_irq_save(struct rapl_package *rp);
+static int rapl_init_core(void);
+static void rapl_remove_core(void);
+
+static LIST_HEAD(rapl_packages);	/* guarded by CPU hotplug lock */
+
+static const char *const rapl_domain_names[] = {
+	"package",
+	"core",
+	"uncore",
+	"dram",
+	"psys",
+};
+
+static int get_energy_counter(struct powercap_zone *power_zone,
+			      u64 *energy_raw)
+{
+	struct rapl_domain *rd;
+	u64 energy_now;
+
+	/* prevent CPU hotplug, make sure the RAPL domain does not go
+	 * away while reading the counter.
+	 */
+	get_online_cpus();
+	rd = power_zone_to_rapl_domain(power_zone);
+
+	if (!rapl_read_data_raw(rd, ENERGY_COUNTER, true, &energy_now)) {
+		*energy_raw = energy_now;
+		put_online_cpus();
+
+		return 0;
+	}
+	put_online_cpus();
+
+	return -EIO;
+}
+
+static int get_max_energy_counter(struct powercap_zone *pcd_dev, u64 *energy)
+{
+	struct rapl_domain *rd = power_zone_to_rapl_domain(pcd_dev);
+
+	*energy = rapl_unit_xlate(rd, ENERGY_UNIT, ENERGY_STATUS_MASK, 0);
+	return 0;
+}
+
+static int release_zone(struct powercap_zone *power_zone)
+{
+	struct rapl_domain *rd = power_zone_to_rapl_domain(power_zone);
+	struct rapl_package *rp = rd->rp;
+
+	/* package zone is the last zone of a package, we can free
+	 * memory here since all children has been unregistered.
+	 */
+	if (rd->id == RAPL_DOMAIN_PACKAGE) {
+		kfree(rd);
+		rp->domains = NULL;
+	}
+
+	return 0;
+
+}
+
+static int find_nr_power_limit(struct rapl_domain *rd)
+{
+	int i, nr_pl = 0;
+
+	for (i = 0; i < NR_POWER_LIMITS; i++) {
+		if (rd->rpl[i].name)
+			nr_pl++;
+	}
+
+	return nr_pl;
+}
+
+static int set_domain_enable(struct powercap_zone *power_zone, bool mode)
+{
+	struct rapl_domain *rd = power_zone_to_rapl_domain(power_zone);
+
+	if (rd->state & DOMAIN_STATE_BIOS_LOCKED)
+		return -EACCES;
+
+	get_online_cpus();
+	rapl_write_data_raw(rd, PL1_ENABLE, mode);
+	if (rapl_defaults->set_floor_freq)
+		rapl_defaults->set_floor_freq(rd, mode);
+	put_online_cpus();
+
+	return 0;
+}
+
+static int get_domain_enable(struct powercap_zone *power_zone, bool *mode)
+{
+	struct rapl_domain *rd = power_zone_to_rapl_domain(power_zone);
+	u64 val;
+
+	if (rd->state & DOMAIN_STATE_BIOS_LOCKED) {
+		*mode = false;
+		return 0;
+	}
+	get_online_cpus();
+	if (rapl_read_data_raw(rd, PL1_ENABLE, true, &val)) {
+		put_online_cpus();
+		return -EIO;
+	}
+	*mode = val;
+	put_online_cpus();
+
+	return 0;
+}
+
+/* per RAPL domain ops, in the order of rapl_domain_type */
+static const struct powercap_zone_ops zone_ops[] = {
+	/* RAPL_DOMAIN_PACKAGE */
+	{
+	 .get_energy_uj = get_energy_counter,
+	 .get_max_energy_range_uj = get_max_energy_counter,
+	 .release = release_zone,
+	 .set_enable = set_domain_enable,
+	 .get_enable = get_domain_enable,
+	 },
+	/* RAPL_DOMAIN_PP0 */
+	{
+	 .get_energy_uj = get_energy_counter,
+	 .get_max_energy_range_uj = get_max_energy_counter,
+	 .release = release_zone,
+	 .set_enable = set_domain_enable,
+	 .get_enable = get_domain_enable,
+	 },
+	/* RAPL_DOMAIN_PP1 */
+	{
+	 .get_energy_uj = get_energy_counter,
+	 .get_max_energy_range_uj = get_max_energy_counter,
+	 .release = release_zone,
+	 .set_enable = set_domain_enable,
+	 .get_enable = get_domain_enable,
+	 },
+	/* RAPL_DOMAIN_DRAM */
+	{
+	 .get_energy_uj = get_energy_counter,
+	 .get_max_energy_range_uj = get_max_energy_counter,
+	 .release = release_zone,
+	 .set_enable = set_domain_enable,
+	 .get_enable = get_domain_enable,
+	 },
+	/* RAPL_DOMAIN_PLATFORM */
+	{
+	 .get_energy_uj = get_energy_counter,
+	 .get_max_energy_range_uj = get_max_energy_counter,
+	 .release = release_zone,
+	 .set_enable = set_domain_enable,
+	 .get_enable = get_domain_enable,
+	 },
+};
+
+/*
+ * The constraint index used by powercap can differ from the power limit (PL)
+ * index, because some PLs may be missing due to non-existent MSRs. So we
+ * need to convert here by counting only the valid PLs (name populated).
+ */
+static int contraint_to_pl(struct rapl_domain *rd, int cid)
+{
+	int i, j;
+
+	for (i = 0, j = 0; i < NR_POWER_LIMITS; i++) {
+		if ((rd->rpl[i].name) && j++ == cid) {
+			pr_debug("%s: index %d\n", __func__, i);
+			return i;
+		}
+	}
+	pr_err("Cannot find matching power limit for constraint %d\n", cid);
+
+	return -EINVAL;
+}
+
+static int set_power_limit(struct powercap_zone *power_zone, int cid,
+			   u64 power_limit)
+{
+	struct rapl_domain *rd;
+	struct rapl_package *rp;
+	int ret = 0;
+	int id;
+
+	get_online_cpus();
+	rd = power_zone_to_rapl_domain(power_zone);
+	id = contraint_to_pl(rd, cid);
+	if (id < 0) {
+		ret = id;
+		goto set_exit;
+	}
+
+	rp = rd->rp;
+
+	if (rd->state & DOMAIN_STATE_BIOS_LOCKED) {
+		dev_warn(&power_zone->dev,
+			 "%s locked by BIOS, monitoring only\n", rd->name);
+		ret = -EACCES;
+		goto set_exit;
+	}
+
+	switch (rd->rpl[id].prim_id) {
+	case PL1_ENABLE:
+		rapl_write_data_raw(rd, POWER_LIMIT1, power_limit);
+		break;
+	case PL2_ENABLE:
+		rapl_write_data_raw(rd, POWER_LIMIT2, power_limit);
+		break;
+	default:
+		ret = -EINVAL;
+	}
+	if (!ret)
+		package_power_limit_irq_save(rp);
+set_exit:
+	put_online_cpus();
+	return ret;
+}
+
+static int get_current_power_limit(struct powercap_zone *power_zone, int cid,
+				   u64 *data)
+{
+	struct rapl_domain *rd;
+	u64 val;
+	int prim;
+	int ret = 0;
+	int id;
+
+	get_online_cpus();
+	rd = power_zone_to_rapl_domain(power_zone);
+	id = contraint_to_pl(rd, cid);
+	if (id < 0) {
+		ret = id;
+		goto get_exit;
+	}
+
+	switch (rd->rpl[id].prim_id) {
+	case PL1_ENABLE:
+		prim = POWER_LIMIT1;
+		break;
+	case PL2_ENABLE:
+		prim = POWER_LIMIT2;
+		break;
+	default:
+		put_online_cpus();
+		return -EINVAL;
+	}
+	if (rapl_read_data_raw(rd, prim, true, &val))
+		ret = -EIO;
+	else
+		*data = val;
+
+get_exit:
+	put_online_cpus();
+
+	return ret;
+}
+
+static int set_time_window(struct powercap_zone *power_zone, int cid,
+			   u64 window)
+{
+	struct rapl_domain *rd;
+	int ret = 0;
+	int id;
+
+	get_online_cpus();
+	rd = power_zone_to_rapl_domain(power_zone);
+	id = contraint_to_pl(rd, cid);
+	if (id < 0) {
+		ret = id;
+		goto set_time_exit;
+	}
+
+	switch (rd->rpl[id].prim_id) {
+	case PL1_ENABLE:
+		rapl_write_data_raw(rd, TIME_WINDOW1, window);
+		break;
+	case PL2_ENABLE:
+		rapl_write_data_raw(rd, TIME_WINDOW2, window);
+		break;
+	default:
+		ret = -EINVAL;
+	}
+
+set_time_exit:
+	put_online_cpus();
+	return ret;
+}
+
+static int get_time_window(struct powercap_zone *power_zone, int cid,
+			   u64 *data)
+{
+	struct rapl_domain *rd;
+	u64 val;
+	int ret = 0;
+	int id;
+
+	get_online_cpus();
+	rd = power_zone_to_rapl_domain(power_zone);
+	id = contraint_to_pl(rd, cid);
+	if (id < 0) {
+		ret = id;
+		goto get_time_exit;
+	}
+
+	switch (rd->rpl[id].prim_id) {
+	case PL1_ENABLE:
+		ret = rapl_read_data_raw(rd, TIME_WINDOW1, true, &val);
+		break;
+	case PL2_ENABLE:
+		ret = rapl_read_data_raw(rd, TIME_WINDOW2, true, &val);
+		break;
+	default:
+		put_online_cpus();
+		return -EINVAL;
+	}
+	if (!ret)
+		*data = val;
+
+get_time_exit:
+	put_online_cpus();
+
+	return ret;
+}
+
+static const char *get_constraint_name(struct powercap_zone *power_zone,
+				       int cid)
+{
+	struct rapl_domain *rd;
+	int id;
+
+	rd = power_zone_to_rapl_domain(power_zone);
+	id = contraint_to_pl(rd, cid);
+	if (id >= 0)
+		return rd->rpl[id].name;
+
+	return NULL;
+}
+
+/* @id is a powercap constraint index; it is mapped to a populated PL slot. */
+static int get_max_power(struct powercap_zone *power_zone, int id, u64 *data)
+{
+	struct rapl_domain *rd = power_zone_to_rapl_domain(power_zone);
+	int prim, ret = 0;
+	u64 val;
+
+	get_online_cpus();
+	id = contraint_to_pl(rd, id);
+	if (id < 0) {
+		ret = id;
+		goto out;
+	}
+	if (rd->rpl[id].prim_id == PL1_ENABLE) {
+		prim = THERMAL_SPEC_POWER;
+	} else if (rd->rpl[id].prim_id == PL2_ENABLE) {
+		prim = MAX_POWER;
+	} else {
+		ret = -EINVAL;
+		goto out;
+	}
+	if (rapl_read_data_raw(rd, prim, true, &val))
+		ret = -EIO;
+	else
+		*data = val;
+out:
+	put_online_cpus();
+	return ret;
+}
+
+static const struct powercap_zone_constraint_ops constraint_ops = {
+	.set_power_limit_uw = set_power_limit,
+	.get_power_limit_uw = get_current_power_limit,
+	.set_time_window_us = set_time_window,
+	.get_time_window_us = get_time_window,
+	.get_max_power_uw = get_max_power,
+	.get_name = get_constraint_name,
+};
+
+/* called after domain detection and package level data are set */
+static void rapl_init_domains(struct rapl_package *rp)
+{
+	int i;
+	struct rapl_domain *rd = rp->domains;
+
+	for (i = 0; i < RAPL_DOMAIN_MAX; i++) {
+		unsigned int mask = rp->domain_map & (1 << i);
+
+		rd->regs[RAPL_DOMAIN_REG_LIMIT] =
+		    rp->priv->regs[i][RAPL_DOMAIN_REG_LIMIT];
+		rd->regs[RAPL_DOMAIN_REG_STATUS] =
+		    rp->priv->regs[i][RAPL_DOMAIN_REG_STATUS];
+		rd->regs[RAPL_DOMAIN_REG_PERF] =
+		    rp->priv->regs[i][RAPL_DOMAIN_REG_PERF];
+		rd->regs[RAPL_DOMAIN_REG_POLICY] =
+		    rp->priv->regs[i][RAPL_DOMAIN_REG_POLICY];
+		rd->regs[RAPL_DOMAIN_REG_INFO] =
+		    rp->priv->regs[i][RAPL_DOMAIN_REG_INFO];
+
+		switch (mask) {
+		case BIT(RAPL_DOMAIN_PACKAGE):
+			rd->name = rapl_domain_names[RAPL_DOMAIN_PACKAGE];
+			rd->id = RAPL_DOMAIN_PACKAGE;
+			rd->rpl[0].prim_id = PL1_ENABLE;
+			rd->rpl[0].name = pl1_name;
+			rd->rpl[1].prim_id = PL2_ENABLE;
+			rd->rpl[1].name = pl2_name;
+			break;
+		case BIT(RAPL_DOMAIN_PP0):
+			rd->name = rapl_domain_names[RAPL_DOMAIN_PP0];
+			rd->id = RAPL_DOMAIN_PP0;
+			rd->rpl[0].prim_id = PL1_ENABLE;
+			rd->rpl[0].name = pl1_name;
+			break;
+		case BIT(RAPL_DOMAIN_PP1):
+			rd->name = rapl_domain_names[RAPL_DOMAIN_PP1];
+			rd->id = RAPL_DOMAIN_PP1;
+			rd->rpl[0].prim_id = PL1_ENABLE;
+			rd->rpl[0].name = pl1_name;
+			break;
+		case BIT(RAPL_DOMAIN_DRAM):
+			rd->name = rapl_domain_names[RAPL_DOMAIN_DRAM];
+			rd->id = RAPL_DOMAIN_DRAM;
+			rd->rpl[0].prim_id = PL1_ENABLE;
+			rd->rpl[0].name = pl1_name;
+			rd->domain_energy_unit =
+			    rapl_defaults->dram_domain_energy_unit;
+			if (rd->domain_energy_unit)
+				pr_info("DRAM domain energy unit %dpj\n",
+					rd->domain_energy_unit);
+			break;
+		}
+		if (mask) {
+			rd->rp = rp;
+			rd++;
+		}
+	}
+}
+
+static u64 rapl_unit_xlate(struct rapl_domain *rd, enum unit_type type,
+			   u64 value, int to_raw)
+{
+	u64 units = 1;
+	struct rapl_package *rp = rd->rp;
+	u64 scale = 1;
+
+	switch (type) {
+	case POWER_UNIT:
+		units = rp->power_unit;
+		break;
+	case ENERGY_UNIT:
+		scale = ENERGY_UNIT_SCALE;
+		/* per domain unit takes precedence */
+		if (rd->domain_energy_unit)
+			units = rd->domain_energy_unit;
+		else
+			units = rp->energy_unit;
+		break;
+	case TIME_UNIT:
+		return rapl_defaults->compute_time_window(rp, value, to_raw);
+	case ARBITRARY_UNIT:
+	default:
+		return value;
+	};
+
+	if (to_raw)
+		return div64_u64(value, units) * scale;
+
+	value *= units;
+
+	return div64_u64(value, scale);
+}
+
+/* in the order of enum rapl_primitives */
+static struct rapl_primitive_info rpi[] = {
+	/* name, mask, shift, msr index, unit divisor */
+	PRIMITIVE_INFO_INIT(ENERGY_COUNTER, ENERGY_STATUS_MASK, 0,
+			    RAPL_DOMAIN_REG_STATUS, ENERGY_UNIT, 0),
+	PRIMITIVE_INFO_INIT(POWER_LIMIT1, POWER_LIMIT1_MASK, 0,
+			    RAPL_DOMAIN_REG_LIMIT, POWER_UNIT, 0),
+	PRIMITIVE_INFO_INIT(POWER_LIMIT2, POWER_LIMIT2_MASK, 32,
+			    RAPL_DOMAIN_REG_LIMIT, POWER_UNIT, 0),
+	PRIMITIVE_INFO_INIT(FW_LOCK, POWER_PP_LOCK, 31,
+			    RAPL_DOMAIN_REG_LIMIT, ARBITRARY_UNIT, 0),
+	PRIMITIVE_INFO_INIT(PL1_ENABLE, POWER_LIMIT1_ENABLE, 15,
+			    RAPL_DOMAIN_REG_LIMIT, ARBITRARY_UNIT, 0),
+	PRIMITIVE_INFO_INIT(PL1_CLAMP, POWER_LIMIT1_CLAMP, 16,
+			    RAPL_DOMAIN_REG_LIMIT, ARBITRARY_UNIT, 0),
+	PRIMITIVE_INFO_INIT(PL2_ENABLE, POWER_LIMIT2_ENABLE, 47,
+			    RAPL_DOMAIN_REG_LIMIT, ARBITRARY_UNIT, 0),
+	PRIMITIVE_INFO_INIT(PL2_CLAMP, POWER_LIMIT2_CLAMP, 48,
+			    RAPL_DOMAIN_REG_LIMIT, ARBITRARY_UNIT, 0),
+	PRIMITIVE_INFO_INIT(TIME_WINDOW1, TIME_WINDOW1_MASK, 17,
+			    RAPL_DOMAIN_REG_LIMIT, TIME_UNIT, 0),
+	PRIMITIVE_INFO_INIT(TIME_WINDOW2, TIME_WINDOW2_MASK, 49,
+			    RAPL_DOMAIN_REG_LIMIT, TIME_UNIT, 0),
+	PRIMITIVE_INFO_INIT(THERMAL_SPEC_POWER, POWER_INFO_THERMAL_SPEC_MASK,
+			    0, RAPL_DOMAIN_REG_INFO, POWER_UNIT, 0),
+	PRIMITIVE_INFO_INIT(MAX_POWER, POWER_INFO_MAX_MASK, 32,
+			    RAPL_DOMAIN_REG_INFO, POWER_UNIT, 0),
+	PRIMITIVE_INFO_INIT(MIN_POWER, POWER_INFO_MIN_MASK, 16,
+			    RAPL_DOMAIN_REG_INFO, POWER_UNIT, 0),
+	PRIMITIVE_INFO_INIT(MAX_TIME_WINDOW, POWER_INFO_MAX_TIME_WIN_MASK, 48,
+			    RAPL_DOMAIN_REG_INFO, TIME_UNIT, 0),
+	PRIMITIVE_INFO_INIT(THROTTLED_TIME, PERF_STATUS_THROTTLE_TIME_MASK, 0,
+			    RAPL_DOMAIN_REG_PERF, TIME_UNIT, 0),
+	PRIMITIVE_INFO_INIT(PRIORITY_LEVEL, PP_POLICY_MASK, 0,
+			    RAPL_DOMAIN_REG_POLICY, ARBITRARY_UNIT, 0),
+	/* non-hardware */
+	PRIMITIVE_INFO_INIT(AVERAGE_POWER, 0, 0, 0, POWER_UNIT,
+			    RAPL_PRIMITIVE_DERIVED),
+	{NULL, 0, 0, 0},
+};
+
+/* Read primitive data based on its related struct rapl_primitive_info.
+ * If the xlate flag is set, return translated data based on data units, i.e.
+ * time, energy, and power.
+ * RAPL MSRs are non-architectural and are not laid out consistently across
+ * domains. Here we use primitive info to allow writing consolidated access
+ * functions.
+ * For a given primitive, it is processed by MSR mask and shift. Unit conversion
+ * is pre-assigned based on RAPL unit MSRs read at init time.
+ * 63-------------------------- 31--------------------------- 0
+ * |                           xxxxx (mask)                   |
+ * |                                |<- shift ----------------|
+ * 63-------------------------- 31--------------------------- 0
+ */
+static int rapl_read_data_raw(struct rapl_domain *rd,
+			      enum rapl_primitives prim, bool xlate, u64 *data)
+{
+	struct rapl_primitive_info *rp = &rpi[prim];
+	u64 value, mask = rp->mask;
+	struct reg_action ra;
+	int cpu, shift = rp->shift;
+
+	if (!rp->name || rp->flag & RAPL_PRIMITIVE_DUMMY)
+		return -EINVAL;
+
+	ra.reg = rd->regs[rp->id];
+	if (!ra.reg)
+		return -EINVAL;
+
+	cpu = rd->rp->lead_cpu;
+
+	/* package uses a different lock bit; keep the shared rpi[] untouched */
+	if (prim == FW_LOCK && rd->id == RAPL_DOMAIN_PACKAGE) {
+		mask = POWER_PACKAGE_LOCK;
+		shift = 63;
+	}
+	/* non-hardware data are collected by the polling thread */
+	if (rp->flag & RAPL_PRIMITIVE_DERIVED) {
+		*data = rd->rdd.primitives[prim];
+		return 0;
+	}
+
+	ra.mask = mask;
+
+	if (rd->rp->priv->read_raw(cpu, &ra)) {
+		pr_debug("failed to read reg 0x%x on cpu %d\n", ra.reg, cpu);
+		return -EIO;
+	}
+
+	value = ra.value >> shift;
+
+	if (xlate)
+		*data = rapl_unit_xlate(rd, rp->unit, value, 0);
+	else
+		*data = value;
+
+	return 0;
+}
+
+/* Similar use of primitive info in the read counterpart */
+static int rapl_write_data_raw(struct rapl_domain *rd,
+			       enum rapl_primitives prim,
+			       unsigned long long value)
+{
+	struct rapl_primitive_info *rp = &rpi[prim];
+	int cpu;
+	u64 bits;
+	struct reg_action ra;
+	int ret;
+
+	cpu = rd->rp->lead_cpu;
+	bits = rapl_unit_xlate(rd, rp->unit, value, 1);
+	bits <<= rp->shift;
+	bits &= rp->mask;
+
+	memset(&ra, 0, sizeof(ra));
+
+	ra.reg = rd->regs[rp->id];
+	ra.mask = rp->mask;
+	ra.value = bits;
+
+	ret = rd->rp->priv->write_raw(cpu, &ra);
+
+	return ret;
+}
+
+/*
+ * Raw RAPL data stored in MSRs are in certain scales. We need to
+ * convert them into standard units based on the units reported in
+ * the RAPL unit MSRs. This is specific to CPUs as the method to
+ * calculate units differs on different CPUs.
+ * We convert the units to below format based on CPUs.
+ * i.e.
+ * energy unit: picoJoules  : Represented in picoJoules by default
+ * power unit : microWatts  : Represented in milliWatts by default
+ * time unit  : microseconds: Represented in seconds by default
+ */
+static int rapl_check_unit_core(struct rapl_package *rp, int cpu)
+{
+	struct reg_action ra;
+	u32 value;
+
+	ra.reg = rp->priv->reg_unit;
+	ra.mask = ~0;
+	if (rp->priv->read_raw(cpu, &ra)) {
+		pr_err("Failed to read power unit REG 0x%x on CPU %d, exit.\n",
+		       rp->priv->reg_unit, cpu);
+		return -ENODEV;
+	}
+
+	value = (ra.value & ENERGY_UNIT_MASK) >> ENERGY_UNIT_OFFSET;
+	rp->energy_unit = ENERGY_UNIT_SCALE * 1000000 / (1 << value);
+
+	value = (ra.value & POWER_UNIT_MASK) >> POWER_UNIT_OFFSET;
+	rp->power_unit = 1000000 / (1 << value);
+
+	value = (ra.value & TIME_UNIT_MASK) >> TIME_UNIT_OFFSET;
+	rp->time_unit = 1000000 / (1 << value);
+
+	pr_debug("Core CPU %s energy=%dpJ, time=%dus, power=%duW\n",
+		 rp->name, rp->energy_unit, rp->time_unit, rp->power_unit);
+
+	return 0;
+}
+
+static int rapl_check_unit_atom(struct rapl_package *rp, int cpu)
+{
+	struct reg_action ra;
+	u32 value;
+
+	ra.reg = rp->priv->reg_unit;
+	ra.mask = ~0;
+	if (rp->priv->read_raw(cpu, &ra)) {
+		pr_err("Failed to read power unit REG 0x%x on CPU %d, exit.\n",
+		       rp->priv->reg_unit, cpu);
+		return -ENODEV;
+	}
+	/* energy unit is a multiple here: ENERGY_UNIT_SCALE * 2^value */
+	value = (ra.value & ENERGY_UNIT_MASK) >> ENERGY_UNIT_OFFSET;
+	rp->energy_unit = ENERGY_UNIT_SCALE * 1 << value;
+	/* power unit is a multiple as well: 2^value * 1000 */
+	value = (ra.value & POWER_UNIT_MASK) >> POWER_UNIT_OFFSET;
+	rp->power_unit = (1 << value) * 1000;
+	/* time unit is a fraction: 1000000 / 2^value */
+	value = (ra.value & TIME_UNIT_MASK) >> TIME_UNIT_OFFSET;
+	rp->time_unit = 1000000 / (1 << value);
+
+	pr_debug("Atom %s energy=%dpJ, time=%dus, power=%duW\n",
+		 rp->name, rp->energy_unit, rp->time_unit, rp->power_unit);
+
+	return 0;
+}
+
+/* Invoked through smp_call_function_single() on the package's lead_cpu. */
+static void power_limit_irq_save_cpu(void *info)
+{
+	struct rapl_package *pkg = info;
+	u32 lo, hi = 0;
+
+	rdmsr_safe(MSR_IA32_PACKAGE_THERM_INTERRUPT, &lo, &hi);
+	/* record the PLN enable bit only once, then mask the interrupt */
+	if (!(pkg->power_limit_irq & PACKAGE_PLN_INT_SAVED))
+		pkg->power_limit_irq = (lo & PACKAGE_THERM_INT_PLN_ENABLE) |
+				       PACKAGE_PLN_INT_SAVED;
+	lo &= ~PACKAGE_THERM_INT_PLN_ENABLE;
+	wrmsr_safe(MSR_IA32_PACKAGE_THERM_INTERRUPT, lo, hi);
+}
+
+/*
+ * REVISIT:
+ * When package power limit is set artificially low by RAPL, LVT
+ * thermal interrupt for package power limit should be ignored
+ * since we are not really exceeding the real limit. The intention
+ * is to avoid excessive interrupts while we are trying to save power.
+ * A useful feature might be routing the package_power_limit interrupt
+ * to userspace via eventfd. Once we have a use case, this is simple
+ * to do by adding an atomic notifier.
+ */
+static void package_power_limit_irq_save(struct rapl_package *rp)
+{
+	/* needs both the PTS and the PLN CPU feature */
+	if (boot_cpu_has(X86_FEATURE_PTS) && boot_cpu_has(X86_FEATURE_PLN))
+		smp_call_function_single(rp->lead_cpu,
+					 power_limit_irq_save_cpu, rp, 1);
+}
+
+/*
+ * Put the package power limit interrupt enable bit back to the state that
+ * was recorded in rp->power_limit_irq. Called from cpu hotplug code on
+ * package removal.
+ */
+static void package_power_limit_irq_restore(struct rapl_package *rp)
+{
+	u32 lo, hi;
+
+	if (!boot_cpu_has(X86_FEATURE_PTS) || !boot_cpu_has(X86_FEATURE_PLN))
+		return;
+
+	/* irq enable state not saved, nothing to restore */
+	if (!(rp->power_limit_irq & PACKAGE_PLN_INT_SAVED))
+		return;
+
+	rdmsr_safe(MSR_IA32_PACKAGE_THERM_INTERRUPT, &lo, &hi);
+
+	lo &= ~PACKAGE_THERM_INT_PLN_ENABLE;
+	if (rp->power_limit_irq & PACKAGE_THERM_INT_PLN_ENABLE)
+		lo |= PACKAGE_THERM_INT_PLN_ENABLE;
+
+	wrmsr_safe(MSR_IA32_PACKAGE_THERM_INTERRUPT, lo, hi);
+}
+
+static void set_floor_freq_default(struct rapl_domain *rd, bool mode)
+{
+	int limits = find_nr_power_limit(rd);
+
+	/*
+	 * always enable clamp such that p-state can go below OS requested
+	 * range. power capping priority over guaranteed frequency.
+	 */
+	rapl_write_data_raw(rd, PL1_CLAMP, mode);
+	/* some domains have pl2 */
+	if (limits <= 1)
+		return;
+	rapl_write_data_raw(rd, PL2_ENABLE, mode);
+	rapl_write_data_raw(rd, PL2_CLAMP, mode);
+}
+
+static void set_floor_freq_atom(struct rapl_domain *rd, bool enable)
+{
+	/* cached original register value; read again while it is still 0 */
+	static u32 power_ctrl_orig_val;
+	u32 mdata;
+
+	if (!rapl_defaults->floor_freq_reg_addr) {
+		pr_err("Invalid floor frequency config register\n");
+		return;
+	}
+
+	if (!power_ctrl_orig_val)
+		iosf_mbi_read(BT_MBI_UNIT_PMC, MBI_CR_READ,
+			      rapl_defaults->floor_freq_reg_addr,
+			      &power_ctrl_orig_val);
+	/* enable: bits 14:8 are replaced by 1; otherwise write the original */
+	mdata = power_ctrl_orig_val;
+	if (enable)
+		mdata = (mdata & ~(0x7f << 8)) | (1 << 8);
+	iosf_mbi_write(BT_MBI_UNIT_PMC, MBI_CR_WRITE,
+		       rapl_defaults->floor_freq_reg_addr, mdata);
+}
+
+/* Time window is 2^Y * (1 + F / 4) units, see Intel SDM Vol.3B CH 14.9.3. */
+static u64 rapl_compute_time_window_core(struct rapl_package *rp, u64 value,
+					 bool to_raw)
+{
+	u64 f, y;		/* fraction and exp. used for time unit */
+
+	if (!to_raw) {
+		f = (value & 0x60) >> 5;
+		y = value & 0x1f;
+		/* 64-bit shift: 1 << 31 would overflow a signed int */
+		return (1ULL << y) * (4 + f) * rp->time_unit / 4;
+	}
+	if (value < rp->time_unit)
+		return 0;	/* under one unit; ilog2(0) is undefined */
+	do_div(value, rp->time_unit);
+	y = ilog2(value);
+	if (y > 0x1f)
+		return 0x7f;	/* exponent too large: saturate field */
+	f = div64_u64(4 * (value - (1ULL << y)), 1ULL << y);
+	return (y & 0x1f) | ((f & 0x3) << 5);
+}
+
+static u64 rapl_compute_time_window_atom(struct rapl_package *rp, u64 value,
+					 bool to_raw)
+{
+	/*
+	 * Atom time unit encoding is straightforward: val * time_unit,
+	 * where time_unit defaults to 1 sec. A raw value of 0 is reported
+	 * as one time unit.
+	 */
+	if (to_raw)
+		return div64_u64(value, rp->time_unit);
+	if (!value)
+		return rp->time_unit;
+	return value * rp->time_unit;
+}
+
+/* Core: no floor frequency register, floor_freq_reg_addr stays 0 */
+static const struct rapl_defaults rapl_defaults_core = {
+	.check_unit = rapl_check_unit_core,
+	.set_floor_freq = set_floor_freq_default,
+	.compute_time_window = rapl_compute_time_window_core,
+};
+
+static const struct rapl_defaults rapl_defaults_hsw_server = {
+	.check_unit = rapl_check_unit_core,
+	.set_floor_freq = set_floor_freq_default,
+	.compute_time_window = rapl_compute_time_window_core,
+	.dram_domain_energy_unit = 15300,
+};
+
+static const struct rapl_defaults rapl_defaults_byt = {
+	.floor_freq_reg_addr = IOSF_CPU_POWER_BUDGET_CTL_BYT,
+	.check_unit = rapl_check_unit_atom,
+	.set_floor_freq = set_floor_freq_atom,
+	.compute_time_window = rapl_compute_time_window_atom,
+};
+
+static const struct rapl_defaults rapl_defaults_tng = {
+	.floor_freq_reg_addr = IOSF_CPU_POWER_BUDGET_CTL_TNG,
+	.check_unit = rapl_check_unit_atom,
+	.set_floor_freq = set_floor_freq_atom,
+	.compute_time_window = rapl_compute_time_window_atom,
+};
+
+/* no floor frequency control: register address and callback stay unset */
+static const struct rapl_defaults rapl_defaults_ann = {
+	.check_unit = rapl_check_unit_atom,
+	.set_floor_freq = NULL,
+	.compute_time_window = rapl_compute_time_window_atom,
+};
+
+/* no floor frequency control: register address and callback stay unset */
+static const struct rapl_defaults rapl_defaults_cht = {
+	.check_unit = rapl_check_unit_atom,
+	.set_floor_freq = NULL,
+	.compute_time_window = rapl_compute_time_window_atom,
+};
+
+static const struct x86_cpu_id rapl_ids[] __initconst = {
+	INTEL_CPU_FAM6(SANDYBRIDGE, rapl_defaults_core),
+	INTEL_CPU_FAM6(SANDYBRIDGE_X, rapl_defaults_core),
+
+	INTEL_CPU_FAM6(IVYBRIDGE, rapl_defaults_core),
+	INTEL_CPU_FAM6(IVYBRIDGE_X, rapl_defaults_core),
+
+	INTEL_CPU_FAM6(HASWELL_CORE, rapl_defaults_core),
+	INTEL_CPU_FAM6(HASWELL_ULT, rapl_defaults_core),
+	INTEL_CPU_FAM6(HASWELL_GT3E, rapl_defaults_core),
+	INTEL_CPU_FAM6(HASWELL_X, rapl_defaults_hsw_server),
+
+	INTEL_CPU_FAM6(BROADWELL_CORE, rapl_defaults_core),
+	INTEL_CPU_FAM6(BROADWELL_GT3E, rapl_defaults_core),
+	INTEL_CPU_FAM6(BROADWELL_XEON_D, rapl_defaults_core),
+	INTEL_CPU_FAM6(BROADWELL_X, rapl_defaults_hsw_server),
+
+	INTEL_CPU_FAM6(SKYLAKE_DESKTOP, rapl_defaults_core),
+	INTEL_CPU_FAM6(SKYLAKE_MOBILE, rapl_defaults_core),
+	INTEL_CPU_FAM6(SKYLAKE_X, rapl_defaults_hsw_server),
+	INTEL_CPU_FAM6(KABYLAKE_MOBILE, rapl_defaults_core),
+	INTEL_CPU_FAM6(KABYLAKE_DESKTOP, rapl_defaults_core),
+	INTEL_CPU_FAM6(CANNONLAKE_MOBILE, rapl_defaults_core),
+	INTEL_CPU_FAM6(ICELAKE_MOBILE, rapl_defaults_core),
+	/* Atom models; the Goldmont and Tremont ones use the Core defaults */
+	INTEL_CPU_FAM6(ATOM_SILVERMONT, rapl_defaults_byt),
+	INTEL_CPU_FAM6(ATOM_AIRMONT, rapl_defaults_cht),
+	INTEL_CPU_FAM6(ATOM_SILVERMONT_MID, rapl_defaults_tng),
+	INTEL_CPU_FAM6(ATOM_AIRMONT_MID, rapl_defaults_ann),
+	INTEL_CPU_FAM6(ATOM_GOLDMONT, rapl_defaults_core),
+	INTEL_CPU_FAM6(ATOM_GOLDMONT_PLUS, rapl_defaults_core),
+	INTEL_CPU_FAM6(ATOM_GOLDMONT_X, rapl_defaults_core),
+	INTEL_CPU_FAM6(ATOM_TREMONT_X, rapl_defaults_core),
+	/* Xeon Phi shares the Haswell server defaults */
+	INTEL_CPU_FAM6(XEON_PHI_KNL, rapl_defaults_hsw_server),
+	INTEL_CPU_FAM6(XEON_PHI_KNM, rapl_defaults_hsw_server),
+	{}
+};
+
+MODULE_DEVICE_TABLE(x86cpu, rapl_ids);
+
+/* Cache the raw primitives of every domain in @rp, each read once */
+static void rapl_update_domain_data(struct rapl_package *rp)
+{
+	struct rapl_domain *rd;
+	int prim;
+	u64 val;
+
+	for (rd = rp->domains; rd < rp->domains + rp->nr_domains; rd++) {
+		pr_debug("update %s domain %s data\n", rp->name, rd->name);
+		/* exclude non-raw primitives */
+		for (prim = 0; prim < NR_RAW_PRIMITIVES; prim++) {
+			/* a failed read leaves the cached value untouched */
+			if (rapl_read_data_raw(rd, prim, rpi[prim].unit, &val))
+				continue;
+			rd->rdd.primitives[prim] = val;
+		}
+	}
+}
+
+static int rapl_package_register_powercap(struct rapl_package *rp)
+{
+	struct rapl_domain *rd;
+	struct powercap_zone *power_zone = NULL;
+	int nr_pl, ret;
+
+	/* Update the domain data of the new package */
+	rapl_update_domain_data(rp);
+
+	/* first we register package domain as the parent zone */
+	for (rd = rp->domains; rd < rp->domains + rp->nr_domains; rd++) {
+		if (rd->id == RAPL_DOMAIN_PACKAGE) {
+			nr_pl = find_nr_power_limit(rd);
+			pr_debug("register package domain %s\n", rp->name);
+			power_zone = powercap_register_zone(&rd->power_zone,
+					    rp->priv->control_type, rp->name,
+					    NULL, &zone_ops[rd->id], nr_pl,
+					    &constraint_ops);
+			if (IS_ERR(power_zone)) {
+				pr_debug("failed to register power zone %s\n",
+					 rp->name);
+				return PTR_ERR(power_zone);
+			}
+			/* track parent zone in per package/socket data */
+			rp->power_zone = power_zone;
+			/* done, only one package domain per socket */
+			break;
+		}
+	}
+	if (!power_zone) {
+		pr_err("no package domain found, unknown topology!\n");
+		return -ENODEV;
+	}
+	/* now register the other domains as children of the package zone */
+	for (rd = rp->domains; rd < rp->domains + rp->nr_domains; rd++) {
+		if (rd->id == RAPL_DOMAIN_PACKAGE)
+			continue;
+		/* number of power limits per domain varies */
+		nr_pl = find_nr_power_limit(rd);
+		power_zone = powercap_register_zone(&rd->power_zone,
+						    rp->priv->control_type,
+						    rd->name, rp->power_zone,
+						    &zone_ops[rd->id], nr_pl,
+						    &constraint_ops);
+
+		if (IS_ERR(power_zone)) {
+			pr_debug("failed to register power_zone, %s:%s\n",
+				 rp->name, rd->name);
+			ret = PTR_ERR(power_zone);
+			goto err_cleanup;
+		}
+	}
+	return 0;
+
+err_cleanup:
+	/*
+	 * rd is the domain that failed: unregister every domain before it.
+	 * NOTE(review): the package zone is only covered if it precedes rd.
+	 */
+	while (--rd >= rp->domains) {
+		pr_debug("unregister %s domain %s\n", rp->name, rd->name);
+		powercap_unregister_zone(rp->priv->control_type,
+					 &rd->power_zone);
+	}
+
+	return ret;
+}
+
+/*
+ * Register the platform (psys) domain as a top-level powercap zone.
+ * Returns 0, -ENODEV if the platform status or limit register is unreadable
+ * or zero, -ENOMEM, or the error from powercap_register_zone().
+ */
+int rapl_add_platform_domain(struct rapl_if_priv *priv)
+{
+	struct rapl_domain *rd;
+	struct powercap_zone *zone;
+	struct reg_action ra;
+
+	ra.reg = priv->regs[RAPL_DOMAIN_PLATFORM][RAPL_DOMAIN_REG_STATUS];
+	ra.mask = ~0;
+	if (priv->read_raw(0, &ra) || !ra.value)
+		return -ENODEV;
+
+	ra.reg = priv->regs[RAPL_DOMAIN_PLATFORM][RAPL_DOMAIN_REG_LIMIT];
+	ra.mask = ~0;
+	if (priv->read_raw(0, &ra) || !ra.value)
+		return -ENODEV;
+
+	rd = kzalloc(sizeof(*rd), GFP_KERNEL);
+	if (!rd)
+		return -ENOMEM;
+
+	rd->name = rapl_domain_names[RAPL_DOMAIN_PLATFORM];
+	rd->id = RAPL_DOMAIN_PLATFORM;
+	rd->regs[RAPL_DOMAIN_REG_LIMIT] =
+	    priv->regs[RAPL_DOMAIN_PLATFORM][RAPL_DOMAIN_REG_LIMIT];
+	rd->regs[RAPL_DOMAIN_REG_STATUS] =
+	    priv->regs[RAPL_DOMAIN_PLATFORM][RAPL_DOMAIN_REG_STATUS];
+	rd->rpl[0].prim_id = PL1_ENABLE;
+	rd->rpl[0].name = pl1_name;
+	rd->rpl[1].prim_id = PL2_ENABLE;
+	rd->rpl[1].name = pl2_name;
+	rd->rp = rapl_find_package_domain(0, priv);
+
+	zone = powercap_register_zone(&rd->power_zone, priv->control_type,
+				      "psys", NULL,
+				      &zone_ops[RAPL_DOMAIN_PLATFORM],
+				      2, &constraint_ops);
+	if (IS_ERR(zone)) {
+		kfree(rd);
+		return PTR_ERR(zone);
+	}
+
+	priv->platform_rapl_domain = rd;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(rapl_add_platform_domain);
+
+void rapl_remove_platform_domain(struct rapl_if_priv *priv)
+{
+	struct rapl_domain *rd = priv->platform_rapl_domain;
+
+	if (rd)	/* only set once rapl_add_platform_domain() succeeded */
+		powercap_unregister_zone(priv->control_type, &rd->power_zone);
+	kfree(rd);
+}
+EXPORT_SYMBOL_GPL(rapl_remove_platform_domain);
+
+/*
+ * Returns 0 if @domain's status register reads back non-zero on @cpu,
+ * -ENODEV if it is unreadable or zero, -EINVAL for platform/unknown ids.
+ */
+static int rapl_check_domain(int cpu, int domain, struct rapl_package *rp)
+{
+	struct reg_action ra;
+
+	/* PSYS(PLATFORM) is not a CPU domain, so avoid printing error */
+	if (domain == RAPL_DOMAIN_PLATFORM)
+		return -EINVAL;
+
+	if (domain != RAPL_DOMAIN_PACKAGE && domain != RAPL_DOMAIN_PP0 &&
+	    domain != RAPL_DOMAIN_PP1 && domain != RAPL_DOMAIN_DRAM) {
+		pr_err("invalid domain id %d\n", domain);
+		return -EINVAL;
+	}
+
+	/*
+	 * make sure domain counters are available and contain non-zero
+	 * values, otherwise skip it.
+	 */
+	ra.reg = rp->priv->regs[domain][RAPL_DOMAIN_REG_STATUS];
+	ra.mask = ~0;
+	if (rp->priv->read_raw(cpu, &ra) || !ra.value)
+		return -ENODEV;
+	return 0;
+}
+
+/*
+ * Check if power limits are available. Two cases when they are not available:
+ * 1. Locked by BIOS, in this case we still provide read-only access so that
+ *    users can see what limit is set by the BIOS.
+ * 2. Some CPUs make some domains monitoring only which means PLx MSRs may not
+ *    exist at all. In this case, we do not show the constraints in powercap.
+ *
+ * Case 1 sets DOMAIN_STATE_BIOS_LOCKED in rd->state; case 2 is recorded by
+ * clearing the name of each power limit whose enable primitive cannot be
+ * read.
+ *
+ * Called after domains are detected and initialized.
+ */
+static void rapl_detect_powerlimit(struct rapl_domain *rd)
+{
+	u64 val64;
+	int i;
+
+	/* check if the domain is locked by BIOS, ignore if MSR doesn't exist */
+	if (!rapl_read_data_raw(rd, FW_LOCK, false, &val64) && val64) {
+		pr_info("RAPL %s domain %s locked by BIOS\n",
+			rd->rp->name, rd->name);
+		rd->state |= DOMAIN_STATE_BIOS_LOCKED;
+	}
+	/* check if power limit MSR exists, otherwise domain is monitoring only */
+	for (i = 0; i < NR_POWER_LIMITS; i++) {
+		if (rapl_read_data_raw(rd, rd->rpl[i].prim_id, false, &val64))
+			rd->rpl[i].name = NULL;
+	}
+}
+
+/*
+ * Detect active and valid domains for the given CPU, caller must
+ * ensure the CPU belongs to the targeted package and CPU hotplug is disabled.
+ */
+static int rapl_detect_domains(struct rapl_package *rp, int cpu)
+{
+	struct rapl_domain *rd;
+	int id;
+
+	for (id = 0; id < RAPL_DOMAIN_MAX; id++) {
+		/* use physical package id to read counters */
+		if (rapl_check_domain(cpu, id, rp))
+			continue;
+		rp->domain_map |= 1 << id;
+		pr_info("Found RAPL domain %s\n", rapl_domain_names[id]);
+	}
+	rp->nr_domains = bitmap_weight(&rp->domain_map, RAPL_DOMAIN_MAX);
+	if (!rp->nr_domains) {
+		pr_debug("no valid rapl domains found in %s\n", rp->name);
+		return -ENODEV;
+	}
+	pr_debug("found %d domains on %s\n", rp->nr_domains, rp->name);
+
+	/* one zeroed entry more than the number of detected domains */
+	rp->domains = kcalloc(rp->nr_domains + 1, sizeof(struct rapl_domain),
+			      GFP_KERNEL);
+	if (!rp->domains)
+		return -ENOMEM;
+
+	rapl_init_domains(rp);
+	for (rd = rp->domains; rd < rp->domains + rp->nr_domains; rd++)
+		rapl_detect_powerlimit(rd);
+	return 0;
+}
+
+/* called from CPU hotplug notifier, hotplug lock held */
+void rapl_remove_package(struct rapl_package *rp)
+{
+	struct rapl_domain *rd, *rd_package = NULL;
+
+	package_power_limit_irq_restore(rp);
+	/* clear the limits of every domain; child zones go before the parent */
+	for (rd = rp->domains; rd < rp->domains + rp->nr_domains; rd++) {
+		rapl_write_data_raw(rd, PL1_ENABLE, 0);
+		rapl_write_data_raw(rd, PL1_CLAMP, 0);
+		if (find_nr_power_limit(rd) > 1) {
+			rapl_write_data_raw(rd, PL2_ENABLE, 0);
+			rapl_write_data_raw(rd, PL2_CLAMP, 0);
+		}
+		if (rd->id == RAPL_DOMAIN_PACKAGE) {
+			rd_package = rd;
+			continue;
+		}
+		pr_debug("remove package, undo power limit on %s: %s\n",
+			 rp->name, rd->name);
+		powercap_unregister_zone(rp->priv->control_type,
+					 &rd->power_zone);
+	}
+	/* do parent zone last */
+	powercap_unregister_zone(rp->priv->control_type,
+				 &rd_package->power_zone);
+	list_del(&rp->plist);
+	if (list_empty(&rapl_packages))
+		rapl_remove_core();
+	kfree(rp);
+}
+EXPORT_SYMBOL_GPL(rapl_remove_package);
+
+/* Find the package of @cpu's die for @priv; caller holds CPU hotplug lock */
+struct rapl_package *rapl_find_package_domain(int cpu, struct rapl_if_priv *priv)
+{
+	int id = topology_logical_die_id(cpu);
+	struct rapl_package *rp;
+
+	list_for_each_entry(rp, &rapl_packages, plist) {
+		if (rp->id != id)
+			continue;
+		if (rp->priv->control_type == priv->control_type)
+			return rp;
+	}
+	return NULL;
+}
+EXPORT_SYMBOL_GPL(rapl_find_package_domain);
+
+/* called from CPU hotplug notifier, hotplug lock held */
+struct rapl_package *rapl_add_package(int cpu, struct rapl_if_priv *priv)
+{
+	int id = topology_logical_die_id(cpu);
+	struct cpuinfo_x86 *c = &cpu_data(cpu);
+	struct rapl_package *rp;
+	int ret;
+
+	ret = rapl_init_core();
+	if (ret)
+		return ERR_PTR(ret);
+
+	rp = kzalloc(sizeof(*rp), GFP_KERNEL);
+	if (!rp)
+		return ERR_PTR(-ENOMEM);
+
+	/* fill in identity; the package is listed only after registration */
+	rp->id = id;
+	rp->lead_cpu = cpu;
+	rp->priv = priv;
+
+	if (topology_max_die_per_package() > 1)
+		snprintf(rp->name, PACKAGE_DOMAIN_NAME_LENGTH,
+			 "package-%d-die-%d", c->phys_proc_id, c->cpu_die_id);
+	else
+		snprintf(rp->name, PACKAGE_DOMAIN_NAME_LENGTH, "package-%d",
+			 c->phys_proc_id);
+
+	/* check if the package contains valid domains */
+	if (rapl_detect_domains(rp, cpu) || rapl_defaults->check_unit(rp, cpu)) {
+		ret = -ENODEV;
+		goto err_free_package;
+	}
+	ret = rapl_package_register_powercap(rp);
+	if (ret)
+		goto err_free_package;
+	INIT_LIST_HEAD(&rp->plist);
+	list_add(&rp->plist, &rapl_packages);
+	return rp;
+
+err_free_package:
+	kfree(rp->domains);
+	kfree(rp);
+	return ERR_PTR(ret);
+}
+EXPORT_SYMBOL_GPL(rapl_add_package);
+
+/*
+ * Save POWER_LIMIT1/2 of the parent zone's domain of every package, so that
+ * power_limit_state_restore() can write them back. A limit that cannot be
+ * read is stored as 0.
+ */
+static void power_limit_state_save(void)
+{
+	struct rapl_package *rp;
+	struct rapl_domain *rd;
+	struct rapl_power_limit *pl;
+	enum rapl_primitives prim;
+	int nr_pl, i;
+
+	get_online_cpus();
+	list_for_each_entry(rp, &rapl_packages, plist) {
+		if (!rp->power_zone)
+			continue;
+		rd = power_zone_to_rapl_domain(rp->power_zone);
+		nr_pl = find_nr_power_limit(rd);
+		for (i = 0; i < nr_pl; i++) {
+			pl = &rd->rpl[i];
+			prim = POWER_LIMIT1;
+			if (pl->prim_id == PL2_ENABLE)
+				prim = POWER_LIMIT2;
+			else if (pl->prim_id != PL1_ENABLE)
+				continue;
+			if (rapl_read_data_raw(rd, prim, true,
+					       &pl->last_power_limit))
+				pl->last_power_limit = 0;
+		}
+	}
+	put_online_cpus();
+}
+
+/* Write back the non-zero limits stored by power_limit_state_save(). */
+static void power_limit_state_restore(void)
+{
+	struct rapl_package *rp;
+	struct rapl_domain *rd;
+	struct rapl_power_limit *pl;
+	enum rapl_primitives prim;
+	int nr_pl, i;
+
+	get_online_cpus();
+	list_for_each_entry(rp, &rapl_packages, plist) {
+		if (!rp->power_zone)
+			continue;
+		rd = power_zone_to_rapl_domain(rp->power_zone);
+		nr_pl = find_nr_power_limit(rd);
+		for (i = 0; i < nr_pl; i++) {
+			pl = &rd->rpl[i];
+			prim = POWER_LIMIT1;
+			if (pl->prim_id == PL2_ENABLE)
+				prim = POWER_LIMIT2;
+			else if (pl->prim_id != PL1_ENABLE)
+				continue;
+			if (pl->last_power_limit)
+				rapl_write_data_raw(rd, prim,
+						    pl->last_power_limit);
+		}
+	}
+	put_online_cpus();
+}
+
+/*
+ * PM notifier: save the power limits when a suspend is being prepared and
+ * restore them once the suspend has ended; all other events are ignored.
+ */
+static int rapl_pm_callback(struct notifier_block *nb,
+			    unsigned long mode, void *_unused)
+{
+	if (mode == PM_SUSPEND_PREPARE)
+		power_limit_state_save();
+	else if (mode == PM_POST_SUSPEND)
+		power_limit_state_restore();
+	return NOTIFY_OK;
+}
+
+static struct notifier_block rapl_pm_notifier = {
+	.notifier_call = rapl_pm_callback,
+};
+
+/* One-time setup: pick the CPU's rapl_defaults and hook the PM notifier. */
+static int rapl_init_core(void)
+{
+	const struct x86_cpu_id *id;
+	int ret;
+
+	if (rapl_defaults)
+		return 0;
+
+	id = x86_match_cpu(rapl_ids);
+	if (!id) {
+		pr_err("driver does not support CPU family %d model %d\n",
+		       boot_cpu_data.x86, boot_cpu_data.x86_model);
+		return -ENODEV;
+	}
+	rapl_defaults = (struct rapl_defaults *)id->driver_data;
+
+	ret = register_pm_notifier(&rapl_pm_notifier);
+	if (ret)
+		rapl_defaults = NULL;	/* let a later call retry the setup */
+	return ret;
+}
+
+static void rapl_remove_core(void)
+{
+	unregister_pm_notifier(&rapl_pm_notifier);
+	rapl_defaults = NULL;	/* rapl_init_core() will set up again */
+}
+
+MODULE_DESCRIPTION("Intel Runtime Average Power Limit (RAPL) common code");
+MODULE_AUTHOR("Jacob Pan <jacob.jun.pan@intel.com>");
+MODULE_LICENSE("GPL v2");
diff --git a/drivers/powercap/intel_rapl_msr.c b/drivers/powercap/intel_rapl_msr.c
new file mode 100644
index 000000000000..89645222e3e0
--- /dev/null
+++ b/drivers/powercap/intel_rapl_msr.c
@@ -0,0 +1,163 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Intel Running Average Power Limit (RAPL) Driver via MSR interface
+ * Copyright (c) 2019, Intel Corporation.
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/list.h>
+#include <linux/types.h>
+#include <linux/device.h>
+#include <linux/slab.h>
+#include <linux/log2.h>
+#include <linux/bitmap.h>
+#include <linux/delay.h>
+#include <linux/sysfs.h>
+#include <linux/cpu.h>
+#include <linux/powercap.h>
+#include <linux/suspend.h>
+#include <linux/intel_rapl.h>
+#include <linux/processor.h>
+
+#include <asm/iosf_mbi.h>
+#include <asm/cpu_device_id.h>
+#include <asm/intel-family.h>
+
+/* Local defines */
+#define MSR_PLATFORM_POWER_LIMIT	0x0000065C
+
+/* private data for RAPL MSR Interface */
+static struct rapl_if_priv rapl_msr_priv = {
+	.reg_unit = MSR_RAPL_POWER_UNIT,
+	.regs[RAPL_DOMAIN_PACKAGE] = {
+		MSR_PKG_POWER_LIMIT, MSR_PKG_ENERGY_STATUS, MSR_PKG_PERF_STATUS, 0, MSR_PKG_POWER_INFO },
+	.regs[RAPL_DOMAIN_PP0] = {
+		MSR_PP0_POWER_LIMIT, MSR_PP0_ENERGY_STATUS, 0, MSR_PP0_POLICY, 0 },
+	.regs[RAPL_DOMAIN_PP1] = {
+		MSR_PP1_POWER_LIMIT, MSR_PP1_ENERGY_STATUS, 0, MSR_PP1_POLICY, 0 },
+	.regs[RAPL_DOMAIN_DRAM] = {
+		MSR_DRAM_POWER_LIMIT, MSR_DRAM_ENERGY_STATUS, MSR_DRAM_PERF_STATUS, 0, MSR_DRAM_POWER_INFO },
+	.regs[RAPL_DOMAIN_PLATFORM] = {
+		MSR_PLATFORM_POWER_LIMIT, MSR_PLATFORM_ENERGY_STATUS, 0, 0, 0},
+};
+
+/*
+ * Handles CPU hotplug on multi-socket systems.
+ * The first CPU of a package to come online adds the RAPL package; when
+ * the last CPU of the package is removed, the RAPL package and its
+ * associated domains are removed too. Cooling devices are handled
+ * accordingly at per-domain level.
+ */
+static int rapl_cpu_online(unsigned int cpu)
+{
+	struct rapl_package *rp;
+
+	rp = rapl_find_package_domain(cpu, &rapl_msr_priv);
+	if (!rp)
+		rp = rapl_add_package(cpu, &rapl_msr_priv);
+	if (IS_ERR(rp))
+		return PTR_ERR(rp);
+
+	cpumask_set_cpu(cpu, &rp->cpumask);
+	return 0;
+}
+
+static int rapl_cpu_down_prep(unsigned int cpu)
+{
+	struct rapl_package *rp;
+	int next;
+
+	rp = rapl_find_package_domain(cpu, &rapl_msr_priv);
+	if (!rp)
+		return 0;
+
+	cpumask_clear_cpu(cpu, &rp->cpumask);
+	next = cpumask_first(&rp->cpumask);
+	if (next >= nr_cpu_ids)		/* no CPU left in the package */
+		rapl_remove_package(rp);
+	else if (rp->lead_cpu == cpu)	/* hand the lead role over */
+		rp->lead_cpu = next;
+	return 0;
+}
+
+static int rapl_msr_read_raw(int cpu, struct reg_action *ra)
+{
+	if (rdmsrl_safe_on_cpu(cpu, ra->reg, &ra->value)) {
+		pr_debug("failed to read msr 0x%x on cpu %d\n", ra->reg, cpu);
+		return -EIO;
+	}
+	ra->value &= ra->mask;
+	return 0;
+}
+
+static void rapl_msr_update_func(void *info)
+{
+	struct reg_action *ra = info;
+	u64 val;
+
+	ra->err = rdmsrl_safe(ra->reg, &val);
+	if (ra->err)
+		return;
+
+	val &= ~ra->mask;
+	val |= ra->value;
+
+	ra->err = wrmsrl_safe(ra->reg, val);
+}
+
+static int rapl_msr_write_raw(int cpu, struct reg_action *ra)
+{
+	int ret;
+
+	/* run the read-modify-write on @cpu and wait for it to finish */
+	ret = smp_call_function_single(cpu, rapl_msr_update_func, ra, 1);
+	WARN_ON_ONCE(ret);
+
+	return ret ? ret : ra->err;
+}
+
+/* Module init: control type first, then the CPU hotplug callbacks. */
+static int __init rapl_msr_init(void)
+{
+	int ret;
+
+	rapl_msr_priv.read_raw = rapl_msr_read_raw;
+	rapl_msr_priv.write_raw = rapl_msr_write_raw;
+
+	rapl_msr_priv.control_type = powercap_register_control_type(NULL,
+								    "intel-rapl", NULL);
+	if (IS_ERR(rapl_msr_priv.control_type)) {
+		pr_debug("failed to register powercap control_type.\n");
+		return PTR_ERR(rapl_msr_priv.control_type);
+	}
+
+	ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "powercap/rapl:online",
+				rapl_cpu_online, rapl_cpu_down_prep);
+	if (ret < 0) {
+		/* undo the control type registration */
+		powercap_unregister_control_type(rapl_msr_priv.control_type);
+		return ret;
+	}
+	rapl_msr_priv.pcap_rapl_online = ret;
+
+	/* Don't bail out if PSys is not supported */
+	rapl_add_platform_domain(&rapl_msr_priv);
+
+	return 0;
+}
+
+static void __exit rapl_msr_exit(void)
+{
+	cpuhp_remove_state(rapl_msr_priv.pcap_rapl_online);
+	rapl_remove_platform_domain(&rapl_msr_priv);
+	powercap_unregister_control_type(rapl_msr_priv.control_type);
+}
+
+module_init(rapl_msr_init);
+module_exit(rapl_msr_exit);
+
+MODULE_DESCRIPTION("Driver for Intel RAPL (Running Average Power Limit) control via MSR interface");
+MODULE_AUTHOR("Zhang Rui <rui.zhang@intel.com>");
+MODULE_LICENSE("GPL v2");
diff --git a/include/linux/intel_rapl.h b/include/linux/intel_rapl.h
index ff215d64d114..9579f458fe4d 100644
--- a/include/linux/intel_rapl.h
+++ b/include/linux/intel_rapl.h
@@ -142,4 +142,11 @@ struct rapl_package {
 	struct rapl_if_priv *priv;
 };
 
+struct rapl_package *rapl_find_package_domain(int cpu, struct rapl_if_priv *priv);
+struct rapl_package *rapl_add_package(int cpu, struct rapl_if_priv *priv);
+void rapl_remove_package(struct rapl_package *rp);
+
+int rapl_add_platform_domain(struct rapl_if_priv *priv);
+void rapl_remove_platform_domain(struct rapl_if_priv *priv);
+
 #endif /* __INTEL_RAPL_H__ */
-- 
cgit v1.2.3


From d978e755aabe215cb67bf713e103ed3916ec306d Mon Sep 17 00:00:00 2001
From: Zhang Rui <rui.zhang@intel.com>
Date: Wed, 10 Jul 2019 21:44:31 +0800
Subject: intel_rapl: support 64 bit register

The RAPL MMIO interface uses 64-bit registers, so use 64-bit register
values throughout the RAPL code.

Reviewed-by: Pandruvada, Srinivas <srinivas.pandruvada@intel.com>
Tested-by: Pandruvada, Srinivas <srinivas.pandruvada@intel.com>
Signed-off-by: Zhang Rui <rui.zhang@intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/powercap/intel_rapl_common.c |  6 +++---
 drivers/powercap/intel_rapl_msr.c    | 11 +++++++----
 include/linux/intel_rapl.h           |  8 ++++----
 3 files changed, 14 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/powercap/intel_rapl_common.c b/drivers/powercap/intel_rapl_common.c
index 34a82531a7cf..8e4de036f6d0 100644
--- a/drivers/powercap/intel_rapl_common.c
+++ b/drivers/powercap/intel_rapl_common.c
@@ -689,7 +689,7 @@ static int rapl_read_data_raw(struct rapl_domain *rd,
 	ra.mask = rp->mask;
 
 	if (rd->rp->priv->read_raw(cpu, &ra)) {
-		pr_debug("failed to read reg 0x%x on cpu %d\n", ra.reg, cpu);
+		pr_debug("failed to read reg 0x%llx on cpu %d\n", ra.reg, cpu);
 		return -EIO;
 	}
 
@@ -749,7 +749,7 @@ static int rapl_check_unit_core(struct rapl_package *rp, int cpu)
 	ra.reg = rp->priv->reg_unit;
 	ra.mask = ~0;
 	if (rp->priv->read_raw(cpu, &ra)) {
-		pr_err("Failed to read power unit REG 0x%x on CPU %d, exit.\n",
+		pr_err("Failed to read power unit REG 0x%llx on CPU %d, exit.\n",
 		       rp->priv->reg_unit, cpu);
 		return -ENODEV;
 	}
@@ -777,7 +777,7 @@ static int rapl_check_unit_atom(struct rapl_package *rp, int cpu)
 	ra.reg = rp->priv->reg_unit;
 	ra.mask = ~0;
 	if (rp->priv->read_raw(cpu, &ra)) {
-		pr_err("Failed to read power unit REG 0x%x on CPU %d, exit.\n",
+		pr_err("Failed to read power unit REG 0x%llx on CPU %d, exit.\n",
 		       rp->priv->reg_unit, cpu);
 		return -ENODEV;
 	}
diff --git a/drivers/powercap/intel_rapl_msr.c b/drivers/powercap/intel_rapl_msr.c
index 89645222e3e0..6cd8a8fb9238 100644
--- a/drivers/powercap/intel_rapl_msr.c
+++ b/drivers/powercap/intel_rapl_msr.c
@@ -84,8 +84,10 @@ static int rapl_cpu_down_prep(unsigned int cpu)
 
 static int rapl_msr_read_raw(int cpu, struct reg_action *ra)
 {
-	if (rdmsrl_safe_on_cpu(cpu, ra->reg, &ra->value)) {
-		pr_debug("failed to read msr 0x%x on cpu %d\n", ra->reg, cpu);
+	u32 msr = (u32)ra->reg;
+
+	if (rdmsrl_safe_on_cpu(cpu, msr, &ra->value)) {
+		pr_debug("failed to read msr 0x%x on cpu %d\n", msr, cpu);
 		return -EIO;
 	}
 	ra->value &= ra->mask;
@@ -95,16 +97,17 @@ static int rapl_msr_read_raw(int cpu, struct reg_action *ra)
 static void rapl_msr_update_func(void *info)
 {
 	struct reg_action *ra = info;
+	u32 msr = (u32)ra->reg;
 	u64 val;
 
-	ra->err = rdmsrl_safe(ra->reg, &val);
+	ra->err = rdmsrl_safe(msr, &val);
 	if (ra->err)
 		return;
 
 	val &= ~ra->mask;
 	val |= ra->value;
 
-	ra->err = wrmsrl_safe(ra->reg, val);
+	ra->err = wrmsrl_safe(msr, val);
 }
 
 static int rapl_msr_write_raw(int cpu, struct reg_action *ra)
diff --git a/include/linux/intel_rapl.h b/include/linux/intel_rapl.h
index 9579f458fe4d..649e19981eb0 100644
--- a/include/linux/intel_rapl.h
+++ b/include/linux/intel_rapl.h
@@ -78,7 +78,7 @@ struct rapl_package;
 struct rapl_domain {
 	const char *name;
 	enum rapl_domain_type id;
-	int regs[RAPL_DOMAIN_REG_MAX];
+	u64 regs[RAPL_DOMAIN_REG_MAX];
 	struct powercap_zone power_zone;
 	struct rapl_domain_data rdd;
 	struct rapl_power_limit rpl[NR_POWER_LIMITS];
@@ -89,7 +89,7 @@ struct rapl_domain {
 };
 
 struct reg_action {
-	u32 reg;
+	u64 reg;
 	u64 mask;
 	u64 value;
 	int err;
@@ -113,8 +113,8 @@ struct rapl_if_priv {
 	struct powercap_control_type *control_type;
 	struct rapl_domain *platform_rapl_domain;
 	enum cpuhp_state pcap_rapl_online;
-	u32 reg_unit;
-	u32 regs[RAPL_DOMAIN_MAX][RAPL_DOMAIN_REG_MAX];
+	u64 reg_unit;
+	u64 regs[RAPL_DOMAIN_MAX][RAPL_DOMAIN_REG_MAX];
 	int (*read_raw)(int cpu, struct reg_action *ra);
 	int (*write_raw)(int cpu, struct reg_action *ra);
 };
-- 
cgit v1.2.3


From 0c2ddedd8bcb88c4100acb9e0fc5ac8752d09501 Mon Sep 17 00:00:00 2001
From: Zhang Rui <rui.zhang@intel.com>
Date: Wed, 10 Jul 2019 21:44:32 +0800
Subject: intel_rapl: support two power limits for every RAPL domain

RAPL MSR interface supports 2 power limits for package domain, and 1 power
limit for other domains, while RAPL MMIO interface supports 2 power limits
for both package and dram domains.
And when 2 power limits are supported, the FW_LOCK bit is in bit 63 of the
register, instead of bit 31.

Remove the assumption that only the package domain supports 2 power limits,
and allow the RAPL interface driver to specify the number of power limits
supported for every RAPL domain it owns.

Reviewed-by: Pandruvada, Srinivas <srinivas.pandruvada@intel.com>
Tested-by: Pandruvada, Srinivas <srinivas.pandruvada@intel.com>
Signed-off-by: Zhang Rui <rui.zhang@intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/powercap/intel_rapl_common.c | 72 +++++++++++++-----------------------
 drivers/powercap/intel_rapl_msr.c    |  1 +
 include/linux/intel_rapl.h           |  2 +
 3 files changed, 28 insertions(+), 47 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/powercap/intel_rapl_common.c b/drivers/powercap/intel_rapl_common.c
index 8e4de036f6d0..db8df19d8133 100644
--- a/drivers/powercap/intel_rapl_common.c
+++ b/drivers/powercap/intel_rapl_common.c
@@ -38,8 +38,8 @@
 #define POWER_LIMIT2_MASK       (0x7FFFULL<<32)
 #define POWER_LIMIT2_ENABLE     BIT_ULL(47)
 #define POWER_LIMIT2_CLAMP      BIT_ULL(48)
-#define POWER_PACKAGE_LOCK      BIT_ULL(63)
-#define POWER_PP_LOCK           BIT(31)
+#define POWER_HIGH_LOCK         BIT_ULL(63)
+#define POWER_LOW_LOCK          BIT(31)
 
 #define TIME_WINDOW1_MASK       (0x7FULL<<17)
 #define TIME_WINDOW2_MASK       (0x7FULL<<49)
@@ -513,60 +513,38 @@ static const struct powercap_zone_constraint_ops constraint_ops = {
 /* called after domain detection and package level data are set */
 static void rapl_init_domains(struct rapl_package *rp)
 {
-	int i;
+	enum rapl_domain_type i;
+	enum rapl_domain_reg_id j;
 	struct rapl_domain *rd = rp->domains;
 
 	for (i = 0; i < RAPL_DOMAIN_MAX; i++) {
 		unsigned int mask = rp->domain_map & (1 << i);
 
-		rd->regs[RAPL_DOMAIN_REG_LIMIT] =
-		    rp->priv->regs[i][RAPL_DOMAIN_REG_LIMIT];
-		rd->regs[RAPL_DOMAIN_REG_STATUS] =
-		    rp->priv->regs[i][RAPL_DOMAIN_REG_STATUS];
-		rd->regs[RAPL_DOMAIN_REG_PERF] =
-		    rp->priv->regs[i][RAPL_DOMAIN_REG_PERF];
-		rd->regs[RAPL_DOMAIN_REG_POLICY] =
-		    rp->priv->regs[i][RAPL_DOMAIN_REG_POLICY];
-		rd->regs[RAPL_DOMAIN_REG_INFO] =
-		    rp->priv->regs[i][RAPL_DOMAIN_REG_INFO];
-
-		switch (mask) {
-		case BIT(RAPL_DOMAIN_PACKAGE):
-			rd->name = rapl_domain_names[RAPL_DOMAIN_PACKAGE];
-			rd->id = RAPL_DOMAIN_PACKAGE;
-			rd->rpl[0].prim_id = PL1_ENABLE;
-			rd->rpl[0].name = pl1_name;
+		if (!mask)
+			continue;
+
+		rd->rp = rp;
+		rd->name = rapl_domain_names[i];
+		rd->id = i;
+		rd->rpl[0].prim_id = PL1_ENABLE;
+		rd->rpl[0].name = pl1_name;
+		/* some domain may support two power limits */
+		if (rp->priv->limits[i] == 2) {
 			rd->rpl[1].prim_id = PL2_ENABLE;
 			rd->rpl[1].name = pl2_name;
-			break;
-		case BIT(RAPL_DOMAIN_PP0):
-			rd->name = rapl_domain_names[RAPL_DOMAIN_PP0];
-			rd->id = RAPL_DOMAIN_PP0;
-			rd->rpl[0].prim_id = PL1_ENABLE;
-			rd->rpl[0].name = pl1_name;
-			break;
-		case BIT(RAPL_DOMAIN_PP1):
-			rd->name = rapl_domain_names[RAPL_DOMAIN_PP1];
-			rd->id = RAPL_DOMAIN_PP1;
-			rd->rpl[0].prim_id = PL1_ENABLE;
-			rd->rpl[0].name = pl1_name;
-			break;
-		case BIT(RAPL_DOMAIN_DRAM):
-			rd->name = rapl_domain_names[RAPL_DOMAIN_DRAM];
-			rd->id = RAPL_DOMAIN_DRAM;
-			rd->rpl[0].prim_id = PL1_ENABLE;
-			rd->rpl[0].name = pl1_name;
+		}
+
+		for (j = 0; j < RAPL_DOMAIN_REG_MAX; j++)
+			rd->regs[j] = rp->priv->regs[i][j];
+
+		if (i == RAPL_DOMAIN_DRAM) {
 			rd->domain_energy_unit =
 			    rapl_defaults->dram_domain_energy_unit;
 			if (rd->domain_energy_unit)
 				pr_info("DRAM domain energy unit %dpj\n",
 					rd->domain_energy_unit);
-			break;
-		}
-		if (mask) {
-			rd->rp = rp;
-			rd++;
 		}
+		rd++;
 	}
 }
 
@@ -613,7 +591,7 @@ static struct rapl_primitive_info rpi[] = {
 			    RAPL_DOMAIN_REG_LIMIT, POWER_UNIT, 0),
 	PRIMITIVE_INFO_INIT(POWER_LIMIT2, POWER_LIMIT2_MASK, 32,
 			    RAPL_DOMAIN_REG_LIMIT, POWER_UNIT, 0),
-	PRIMITIVE_INFO_INIT(FW_LOCK, POWER_PP_LOCK, 31,
+	PRIMITIVE_INFO_INIT(FW_LOCK, POWER_LOW_LOCK, 31,
 			    RAPL_DOMAIN_REG_LIMIT, ARBITRARY_UNIT, 0),
 	PRIMITIVE_INFO_INIT(PL1_ENABLE, POWER_LIMIT1_ENABLE, 15,
 			    RAPL_DOMAIN_REG_LIMIT, ARBITRARY_UNIT, 0),
@@ -675,9 +653,9 @@ static int rapl_read_data_raw(struct rapl_domain *rd,
 
 	cpu = rd->rp->lead_cpu;
 
-	/* special-case package domain, which uses a different bit */
-	if (prim == FW_LOCK && rd->id == RAPL_DOMAIN_PACKAGE) {
-		rp->mask = POWER_PACKAGE_LOCK;
+	/* domain with 2 limits has different bit */
+	if (prim == FW_LOCK && rd->rp->priv->limits[rd->id] == 2) {
+		rp->mask = POWER_HIGH_LOCK;
 		rp->shift = 63;
 	}
 	/* non-hardware data are collected by the polling thread */
diff --git a/drivers/powercap/intel_rapl_msr.c b/drivers/powercap/intel_rapl_msr.c
index 6cd8a8fb9238..bc14a4579acb 100644
--- a/drivers/powercap/intel_rapl_msr.c
+++ b/drivers/powercap/intel_rapl_msr.c
@@ -41,6 +41,7 @@ static struct rapl_if_priv rapl_msr_priv = {
 		MSR_DRAM_POWER_LIMIT, MSR_DRAM_ENERGY_STATUS, MSR_DRAM_PERF_STATUS, 0, MSR_DRAM_POWER_INFO },
 	.regs[RAPL_DOMAIN_PLATFORM] = {
 		MSR_PLATFORM_POWER_LIMIT, MSR_PLATFORM_ENERGY_STATUS, 0, 0, 0},
+	.limits[RAPL_DOMAIN_PACKAGE] = 2,
 };
 
 /* Handles CPU hotplug on multi-socket systems.
diff --git a/include/linux/intel_rapl.h b/include/linux/intel_rapl.h
index 649e19981eb0..0c179d92d110 100644
--- a/include/linux/intel_rapl.h
+++ b/include/linux/intel_rapl.h
@@ -104,6 +104,7 @@ struct reg_action {
  * @pcap_rapl_online:		CPU hotplug state for each RAPL interface.
  * @reg_unit:			Register for getting energy/power/time unit.
  * @regs:			Register sets for different RAPL Domains.
+ * @limits:			Number of power limits supported by each domain.
  * @read_raw:			Callback for reading RAPL interface specific
  *				registers.
  * @write_raw:			Callback for writing RAPL interface specific
@@ -115,6 +116,7 @@ struct rapl_if_priv {
 	enum cpuhp_state pcap_rapl_online;
 	u64 reg_unit;
 	u64 regs[RAPL_DOMAIN_MAX][RAPL_DOMAIN_REG_MAX];
+	int limits[RAPL_DOMAIN_MAX];
 	int (*read_raw)(int cpu, struct reg_action *ra);
 	int (*write_raw)(int cpu, struct reg_action *ra);
 };
-- 
cgit v1.2.3


From bedc0fd0f9b517698193d644f914b33951856fd2 Mon Sep 17 00:00:00 2001
From: Qian Cai <cai@lca.pw>
Date: Thu, 11 Jul 2019 09:55:56 -0400
Subject: RDMA/core: Fix -Wunused-const-variable warnings

The commit below introduced a few compilation warnings.

In file included from ./include/rdma/ib_verbs.h:64,
                 from ./include/linux/mlx5/device.h:37,
                 from ./include/linux/mlx5/driver.h:51,
                 from drivers/net/ethernet/mellanox/mlx5/core/uar.c:36:
./include/linux/dim.h:378:1: warning: 'rdma_dim_prof' defined but not
used [-Wunused-const-variable=]
 rdma_dim_prof[RDMA_DIM_PARAMS_NUM_PROFILES] = {
 ^~~~~~~~~~~~~
In file included from ./include/rdma/ib_verbs.h:64,
                 from ./include/linux/mlx5/device.h:37,
                 from ./include/linux/mlx5/driver.h:51,
                 from
drivers/net/ethernet/mellanox/mlx5/core/pagealloc.c:37:
./include/linux/dim.h:378:1: warning: 'rdma_dim_prof' defined but not
used [-Wunused-const-variable=]
 rdma_dim_prof[RDMA_DIM_PARAMS_NUM_PROFILES] = {
 ^~~~~~~~~~~~~

Since only ib_cq_rdma_dim_work() in drivers/infiniband/core/cq.c uses it,
just move the definition over there.

Fixes: f4915455dcf0 ("linux/dim: Implement RDMA adaptive moderation (DIM)")
Signed-off-by: Qian Cai <cai@lca.pw>
Reviewed-by: Jason Gunthorpe <jgg@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/core/cq.c | 13 +++++++++++++
 include/linux/dim.h          | 13 -------------
 2 files changed, 13 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/infiniband/core/cq.c b/drivers/infiniband/core/cq.c
index ffd6e24109d5..7c599878ccf7 100644
--- a/drivers/infiniband/core/cq.c
+++ b/drivers/infiniband/core/cq.c
@@ -18,6 +18,19 @@
 #define IB_POLL_FLAGS \
 	(IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS)
 
+static const struct dim_cq_moder
+rdma_dim_prof[RDMA_DIM_PARAMS_NUM_PROFILES] = {
+	{1,   0, 1,  0},
+	{1,   0, 4,  0},
+	{2,   0, 4,  0},
+	{2,   0, 8,  0},
+	{4,   0, 8,  0},
+	{16,  0, 8,  0},
+	{16,  0, 16, 0},
+	{32,  0, 16, 0},
+	{32,  0, 32, 0},
+};
+
 static void ib_cq_rdma_dim_work(struct work_struct *w)
 {
 	struct dim *dim = container_of(w, struct dim, work);
diff --git a/include/linux/dim.h b/include/linux/dim.h
index aa69730c3b8d..d3a0fbfff2bb 100644
--- a/include/linux/dim.h
+++ b/include/linux/dim.h
@@ -374,19 +374,6 @@ void net_dim(struct dim *dim, struct dim_sample end_sample);
 #define RDMA_DIM_PARAMS_NUM_PROFILES 9
 #define RDMA_DIM_START_PROFILE 0
 
-static const struct dim_cq_moder
-rdma_dim_prof[RDMA_DIM_PARAMS_NUM_PROFILES] = {
-	{1,   0, 1,  0},
-	{1,   0, 4,  0},
-	{2,   0, 4,  0},
-	{2,   0, 8,  0},
-	{4,   0, 8,  0},
-	{16,  0, 8,  0},
-	{16,  0, 16, 0},
-	{32,  0, 16, 0},
-	{32,  0, 32, 0},
-};
-
 /**
  * rdma_dim - Runs the adaptive moderation.
  * @dim: The moderation struct.
-- 
cgit v1.2.3


From b516ea586d717472178e6ef1c152e85608b0ce32 Mon Sep 17 00:00:00 2001
From: Lukas Wunner <lukas@wunner.de>
Date: Mon, 8 Jul 2019 13:17:44 +0800
Subject: PCI: Enable NVIDIA HDA controllers

Many NVIDIA GPUs can be configured as either a single-function video device
or a multi-function device with video at function 0 and an HDA audio
controller at function 1.  The HDA controller can be enabled or disabled by
a bit in the function 0 config space.

Some BIOSes leave the HDA disabled, which means the HDMI connector from the
NVIDIA GPU may not work.  Sometimes the BIOS enables the HDA if an HDMI
cable is connected at boot time, but that doesn't handle hotplug cases.

Enable the HDA controller on device enumeration and resume and re-read the
header type, which tells us whether the GPU is a multi-function device.

This quirk is limited to NVIDIA PCI devices with the VGA Controller device
class.  This is expected to correspond to product configurations where the
NVIDIA GPU has connectors attached.  Other products where the device class
is 3D Controller are expected to correspond to configurations where the
NVIDIA GPU is dedicated (dGPU) and has no connectors.  See original post
(URL below) for more details.

This commit takes inspiration from an earlier patch by Daniel Drake.

Link: https://lore.kernel.org/r/20190708051744.24039-1-drake@endlessm.com v2
Link: https://lore.kernel.org/r/20190613063514.15317-1-drake@endlessm.com v1
Link: https://devtalk.nvidia.com/default/topic/1024022
Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=75985
Signed-off-by: Lukas Wunner <lukas@wunner.de>
Signed-off-by: Daniel Drake <drake@endlessm.com>
[bhelgaas: commit log, log message, return early if already enabled]
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Cc: Aaron Plattner <aplattner@nvidia.com>
Cc: Peter Wu <peter@lekensteyn.nl>
Cc: Ilia Mirkin <imirkin@alum.mit.edu>
Cc: Karol Herbst <kherbst@redhat.com>
Cc: Maik Freudenberg <hhfeuer@gmx.de>
---
 drivers/pci/quirks.c    | 30 ++++++++++++++++++++++++++++++
 include/linux/pci_ids.h |  1 +
 2 files changed, 31 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c
index c66c0ca446c4..208aacf39329 100644
--- a/drivers/pci/quirks.c
+++ b/drivers/pci/quirks.c
@@ -5011,6 +5011,36 @@ DECLARE_PCI_FIXUP_CLASS_FINAL(PCI_VENDOR_ID_NVIDIA, PCI_ANY_ID,
 			      PCI_CLASS_SERIAL_UNKNOWN, 8,
 			      quirk_gpu_usb_typec_ucsi);
 
+/*
+ * Enable the NVIDIA GPU integrated HDA controller if the BIOS left it
+ * disabled.  https://devtalk.nvidia.com/default/topic/1024022
+ */
+static void quirk_nvidia_hda(struct pci_dev *gpu)
+{
+	u8 hdr_type;
+	u32 val;
+
+	/* There was no integrated HDA controller before MCP89 */
+	if (gpu->device < PCI_DEVICE_ID_NVIDIA_GEFORCE_320M)
+		return;
+
+	/* Bit 25 at offset 0x488 enables the HDA controller */
+	pci_read_config_dword(gpu, 0x488, &val);
+	if (val & BIT(25))
+		return;
+
+	pci_info(gpu, "Enabling HDA controller\n");
+	pci_write_config_dword(gpu, 0x488, val | BIT(25));
+
+	/* The GPU becomes a multi-function device when the HDA is enabled */
+	pci_read_config_byte(gpu, PCI_HEADER_TYPE, &hdr_type);
+	gpu->multifunction = !!(hdr_type & 0x80);
+}
+DECLARE_PCI_FIXUP_CLASS_HEADER(PCI_VENDOR_ID_NVIDIA, PCI_ANY_ID,
+			       PCI_BASE_CLASS_DISPLAY, 16, quirk_nvidia_hda);
+DECLARE_PCI_FIXUP_CLASS_RESUME_EARLY(PCI_VENDOR_ID_NVIDIA, PCI_ANY_ID,
+			       PCI_BASE_CLASS_DISPLAY, 16, quirk_nvidia_hda);
+
 /*
  * Some IDT switches incorrectly flag an ACS Source Validation error on
  * completions for config read requests even though PCIe r4.0, sec
diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index 70e86148cb1e..66898463b81f 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -1336,6 +1336,7 @@
 #define PCI_DEVICE_ID_NVIDIA_NFORCE_MCP78S_SMBUS    0x0752
 #define PCI_DEVICE_ID_NVIDIA_NFORCE_MCP77_IDE       0x0759
 #define PCI_DEVICE_ID_NVIDIA_NFORCE_MCP73_SMBUS     0x07D8
+#define PCI_DEVICE_ID_NVIDIA_GEFORCE_320M           0x08A0
 #define PCI_DEVICE_ID_NVIDIA_NFORCE_MCP79_SMBUS     0x0AA2
 #define PCI_DEVICE_ID_NVIDIA_NFORCE_MCP89_SATA	    0x0D85
 
-- 
cgit v1.2.3


From db849faa9bef993a1379dc510623f750a72fa7ce Mon Sep 17 00:00:00 2001
From: Saeed Mahameed <saeedm@mellanox.com>
Date: Fri, 3 May 2019 13:14:59 -0700
Subject: net/mlx5e: Rx, Fix checksum calculation for new hardware

CQE checksum full mode in new HW provides a full checksum of the rx frame,
covering bytes starting from the eth protocol up to the last byte in the
received frame (frame_size - ETH_HLEN), as expected by the stack.

Fixing up skb->csum by the driver is not required in such case. This fix
is to avoid wrong checksum calculation in drivers which already support
the new hardware with the new checksum mode.

Fixes: 85327a9c4150 ("net/mlx5: Update the list of the PCI supported devices")
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/en.h      | 1 +
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 3 +++
 drivers/net/ethernet/mellanox/mlx5/core/en_rx.c   | 7 ++++++-
 include/linux/mlx5/mlx5_ifc.h                     | 3 ++-
 4 files changed, 12 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index cc6797e24571..cc227a7aa79f 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -294,6 +294,7 @@ enum {
 	MLX5E_RQ_STATE_ENABLED,
 	MLX5E_RQ_STATE_AM,
 	MLX5E_RQ_STATE_NO_CSUM_COMPLETE,
+	MLX5E_RQ_STATE_CSUM_FULL, /* cqe_csum_full hw bit is set */
 };
 
 struct mlx5e_cq {
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index a8e8350b38aa..98d75271fc73 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -855,6 +855,9 @@ static int mlx5e_open_rq(struct mlx5e_channel *c,
 	if (err)
 		goto err_destroy_rq;
 
+	if (MLX5_CAP_ETH(c->mdev, cqe_checksum_full))
+		__set_bit(MLX5E_RQ_STATE_CSUM_FULL, &c->rq.state);
+
 	if (params->rx_dim_enabled)
 		__set_bit(MLX5E_RQ_STATE_AM, &c->rq.state);
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
index 13133e7f088e..8a5f9411cac6 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
@@ -873,8 +873,14 @@ static inline void mlx5e_handle_csum(struct net_device *netdev,
 		if (unlikely(get_ip_proto(skb, network_depth, proto) == IPPROTO_SCTP))
 			goto csum_unnecessary;
 
+		stats->csum_complete++;
 		skb->ip_summed = CHECKSUM_COMPLETE;
 		skb->csum = csum_unfold((__force __sum16)cqe->check_sum);
+
+		if (test_bit(MLX5E_RQ_STATE_CSUM_FULL, &rq->state))
+			return; /* CQE csum covers all received bytes */
+
+		/* csum might need some fixups ...*/
 		if (network_depth > ETH_HLEN)
 			/* CQE csum is calculated from the IP header and does
 			 * not cover VLAN headers (if present). This will add
@@ -885,7 +891,6 @@ static inline void mlx5e_handle_csum(struct net_device *netdev,
 						 skb->csum);
 
 		mlx5e_skb_padding_csum(skb, network_depth, proto, stats);
-		stats->csum_complete++;
 		return;
 	}
 
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index 5e74305e2e57..7e42efa143a0 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -749,7 +749,8 @@ struct mlx5_ifc_per_protocol_networking_offload_caps_bits {
 	u8         swp[0x1];
 	u8         swp_csum[0x1];
 	u8         swp_lso[0x1];
-	u8         reserved_at_23[0xd];
+	u8         cqe_checksum_full[0x1];
+	u8         reserved_at_24[0xc];
 	u8         max_vxlan_udp_ports[0x8];
 	u8         reserved_at_38[0x6];
 	u8         max_geneve_opt_len[0x1];
-- 
cgit v1.2.3


From bd976e52725965ddcceb9abecbcc7ca46863665c Mon Sep 17 00:00:00 2001
From: Damien Le Moal <damien.lemoal@wdc.com>
Date: Mon, 1 Jul 2019 14:09:16 +0900
Subject: block: Kill gfp_t argument of blkdev_report_zones()

Only GFP_KERNEL and GFP_NOIO are used with blkdev_report_zones(). In
preparation for using vmalloc() for large report buffer and zone array
allocations used by this function, remove its "gfp_t gfp_mask" argument
and rely on the caller context to use memalloc_noio_save/restore() where
necessary (block layer zone revalidation and dm-zoned I/O error path).

Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Damien Le Moal <damien.lemoal@wdc.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-zoned.c              | 31 +++++++++++++++++++------------
 drivers/block/null_blk.h       |  3 +--
 drivers/block/null_blk_zoned.c |  3 +--
 drivers/md/dm-flakey.c         |  5 ++---
 drivers/md/dm-linear.c         |  5 ++---
 drivers/md/dm-zoned-metadata.c | 16 ++++++++++++----
 drivers/md/dm.c                |  6 ++----
 drivers/scsi/sd.h              |  3 +--
 drivers/scsi/sd_zbc.c          |  6 ++----
 fs/f2fs/super.c                |  4 +---
 include/linux/blkdev.h         |  5 ++---
 include/linux/device-mapper.h  |  3 +--
 12 files changed, 46 insertions(+), 44 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk-zoned.c b/block/blk-zoned.c
index 3249738242b4..58ced170b424 100644
--- a/block/blk-zoned.c
+++ b/block/blk-zoned.c
@@ -14,6 +14,7 @@
 #include <linux/rbtree.h>
 #include <linux/blkdev.h>
 #include <linux/blk-mq.h>
+#include <linux/sched/mm.h>
 
 #include "blk.h"
 
@@ -117,8 +118,7 @@ static bool blkdev_report_zone(struct block_device *bdev, struct blk_zone *rep)
 }
 
 static int blk_report_zones(struct gendisk *disk, sector_t sector,
-			    struct blk_zone *zones, unsigned int *nr_zones,
-			    gfp_t gfp_mask)
+			    struct blk_zone *zones, unsigned int *nr_zones)
 {
 	struct request_queue *q = disk->queue;
 	unsigned int z = 0, n, nrz = *nr_zones;
@@ -127,8 +127,7 @@ static int blk_report_zones(struct gendisk *disk, sector_t sector,
 
 	while (z < nrz && sector < capacity) {
 		n = nrz - z;
-		ret = disk->fops->report_zones(disk, sector, &zones[z], &n,
-					       gfp_mask);
+		ret = disk->fops->report_zones(disk, sector, &zones[z], &n);
 		if (ret)
 			return ret;
 		if (!n)
@@ -149,17 +148,18 @@ static int blk_report_zones(struct gendisk *disk, sector_t sector,
  * @sector:	Sector from which to report zones
  * @zones:	Array of zone structures where to return the zones information
  * @nr_zones:	Number of zone structures in the zone array
- * @gfp_mask:	Memory allocation flags (for bio_alloc)
  *
  * Description:
  *    Get zone information starting from the zone containing @sector.
  *    The number of zone information reported may be less than the number
  *    requested by @nr_zones. The number of zones actually reported is
  *    returned in @nr_zones.
+ *    The caller must use memalloc_noXX_save/restore() calls to control
+ *    memory allocations done within this function (zone array and command
+ *    buffer allocation by the device driver).
  */
 int blkdev_report_zones(struct block_device *bdev, sector_t sector,
-			struct blk_zone *zones, unsigned int *nr_zones,
-			gfp_t gfp_mask)
+			struct blk_zone *zones, unsigned int *nr_zones)
 {
 	struct request_queue *q = bdev_get_queue(bdev);
 	unsigned int i, nrz;
@@ -184,7 +184,7 @@ int blkdev_report_zones(struct block_device *bdev, sector_t sector,
 	nrz = min(*nr_zones,
 		  __blkdev_nr_zones(q, bdev->bd_part->nr_sects - sector));
 	ret = blk_report_zones(bdev->bd_disk, get_start_sect(bdev) + sector,
-			       zones, &nrz, gfp_mask);
+			       zones, &nrz);
 	if (ret)
 		return ret;
 
@@ -305,9 +305,7 @@ int blkdev_report_zones_ioctl(struct block_device *bdev, fmode_t mode,
 	if (!zones)
 		return -ENOMEM;
 
-	ret = blkdev_report_zones(bdev, rep.sector,
-				  zones, &rep.nr_zones,
-				  GFP_KERNEL);
+	ret = blkdev_report_zones(bdev, rep.sector, zones, &rep.nr_zones);
 	if (ret)
 		goto out;
 
@@ -415,6 +413,7 @@ int blk_revalidate_disk_zones(struct gendisk *disk)
 	unsigned long *seq_zones_wlock = NULL, *seq_zones_bitmap = NULL;
 	unsigned int i, rep_nr_zones = 0, z = 0, nrz;
 	struct blk_zone *zones = NULL;
+	unsigned int noio_flag;
 	sector_t sector = 0;
 	int ret = 0;
 
@@ -427,6 +426,12 @@ int blk_revalidate_disk_zones(struct gendisk *disk)
 		return 0;
 	}
 
+	/*
+	 * Ensure that all memory allocations in this context are done as
+	 * if GFP_NOIO was specified.
+	 */
+	noio_flag = memalloc_noio_save();
+
 	if (!blk_queue_is_zoned(q) || !nr_zones) {
 		nr_zones = 0;
 		goto update;
@@ -449,7 +454,7 @@ int blk_revalidate_disk_zones(struct gendisk *disk)
 
 	while (z < nr_zones) {
 		nrz = min(nr_zones - z, rep_nr_zones);
-		ret = blk_report_zones(disk, sector, zones, &nrz, GFP_NOIO);
+		ret = blk_report_zones(disk, sector, zones, &nrz);
 		if (ret)
 			goto out;
 		if (!nrz)
@@ -480,6 +485,8 @@ update:
 	blk_mq_unfreeze_queue(q);
 
 out:
+	memalloc_noio_restore(noio_flag);
+
 	free_pages((unsigned long)zones,
 		   get_order(rep_nr_zones * sizeof(struct blk_zone)));
 	kfree(seq_zones_wlock);
diff --git a/drivers/block/null_blk.h b/drivers/block/null_blk.h
index 34b22d6523ba..4b9bbe3bb5a1 100644
--- a/drivers/block/null_blk.h
+++ b/drivers/block/null_blk.h
@@ -89,8 +89,7 @@ struct nullb {
 int null_zone_init(struct nullb_device *dev);
 void null_zone_exit(struct nullb_device *dev);
 int null_zone_report(struct gendisk *disk, sector_t sector,
-		     struct blk_zone *zones, unsigned int *nr_zones,
-		     gfp_t gfp_mask);
+		     struct blk_zone *zones, unsigned int *nr_zones);
 void null_zone_write(struct nullb_cmd *cmd, sector_t sector,
 			unsigned int nr_sectors);
 void null_zone_reset(struct nullb_cmd *cmd, sector_t sector);
diff --git a/drivers/block/null_blk_zoned.c b/drivers/block/null_blk_zoned.c
index fca0c97ff1aa..cb28d93f2bd1 100644
--- a/drivers/block/null_blk_zoned.c
+++ b/drivers/block/null_blk_zoned.c
@@ -67,8 +67,7 @@ void null_zone_exit(struct nullb_device *dev)
 }
 
 int null_zone_report(struct gendisk *disk, sector_t sector,
-		     struct blk_zone *zones, unsigned int *nr_zones,
-		     gfp_t gfp_mask)
+		     struct blk_zone *zones, unsigned int *nr_zones)
 {
 	struct nullb *nullb = disk->private_data;
 	struct nullb_device *dev = nullb->dev;
diff --git a/drivers/md/dm-flakey.c b/drivers/md/dm-flakey.c
index a9bc518156f2..2900fbde89b3 100644
--- a/drivers/md/dm-flakey.c
+++ b/drivers/md/dm-flakey.c
@@ -461,15 +461,14 @@ static int flakey_prepare_ioctl(struct dm_target *ti, struct block_device **bdev
 
 #ifdef CONFIG_BLK_DEV_ZONED
 static int flakey_report_zones(struct dm_target *ti, sector_t sector,
-			       struct blk_zone *zones, unsigned int *nr_zones,
-			       gfp_t gfp_mask)
+			       struct blk_zone *zones, unsigned int *nr_zones)
 {
 	struct flakey_c *fc = ti->private;
 	int ret;
 
 	/* Do report and remap it */
 	ret = blkdev_report_zones(fc->dev->bdev, flakey_map_sector(ti, sector),
-				  zones, nr_zones, gfp_mask);
+				  zones, nr_zones);
 	if (ret != 0)
 		return ret;
 
diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c
index ad980a38fb1e..ecefe6703736 100644
--- a/drivers/md/dm-linear.c
+++ b/drivers/md/dm-linear.c
@@ -137,15 +137,14 @@ static int linear_prepare_ioctl(struct dm_target *ti, struct block_device **bdev
 
 #ifdef CONFIG_BLK_DEV_ZONED
 static int linear_report_zones(struct dm_target *ti, sector_t sector,
-			       struct blk_zone *zones, unsigned int *nr_zones,
-			       gfp_t gfp_mask)
+			       struct blk_zone *zones, unsigned int *nr_zones)
 {
 	struct linear_c *lc = (struct linear_c *) ti->private;
 	int ret;
 
 	/* Do report and remap it */
 	ret = blkdev_report_zones(lc->dev->bdev, linear_map_sector(ti, sector),
-				  zones, nr_zones, gfp_mask);
+				  zones, nr_zones);
 	if (ret != 0)
 		return ret;
 
diff --git a/drivers/md/dm-zoned-metadata.c b/drivers/md/dm-zoned-metadata.c
index d8334cd45d7c..9faf3e49c7af 100644
--- a/drivers/md/dm-zoned-metadata.c
+++ b/drivers/md/dm-zoned-metadata.c
@@ -8,6 +8,7 @@
 
 #include <linux/module.h>
 #include <linux/crc32.h>
+#include <linux/sched/mm.h>
 
 #define	DM_MSG_PREFIX		"zoned metadata"
 
@@ -1162,8 +1163,7 @@ static int dmz_init_zones(struct dmz_metadata *zmd)
 	while (sector < dev->capacity) {
 		/* Get zone information */
 		nr_blkz = DMZ_REPORT_NR_ZONES;
-		ret = blkdev_report_zones(dev->bdev, sector, blkz,
-					  &nr_blkz, GFP_KERNEL);
+		ret = blkdev_report_zones(dev->bdev, sector, blkz, &nr_blkz);
 		if (ret) {
 			dmz_dev_err(dev, "Report zones failed %d", ret);
 			goto out;
@@ -1201,12 +1201,20 @@ out:
 static int dmz_update_zone(struct dmz_metadata *zmd, struct dm_zone *zone)
 {
 	unsigned int nr_blkz = 1;
+	unsigned int noio_flag;
 	struct blk_zone blkz;
 	int ret;
 
-	/* Get zone information from disk */
+	/*
+	 * Get zone information from disk. Since blkdev_report_zones() uses
+	 * GFP_KERNEL by default for memory allocations, set the per-task
+	 * PF_MEMALLOC_NOIO flag so that all allocations are done as if
+	 * GFP_NOIO was specified.
+	 */
+	noio_flag = memalloc_noio_save();
 	ret = blkdev_report_zones(zmd->dev->bdev, dmz_start_sect(zmd, zone),
-				  &blkz, &nr_blkz, GFP_NOIO);
+				  &blkz, &nr_blkz);
+	memalloc_noio_restore(noio_flag);
 	if (!nr_blkz)
 		ret = -EIO;
 	if (ret) {
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 5475081dcbd6..61f1152b74e9 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -441,8 +441,7 @@ static int dm_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo)
 }
 
 static int dm_blk_report_zones(struct gendisk *disk, sector_t sector,
-			       struct blk_zone *zones, unsigned int *nr_zones,
-			       gfp_t gfp_mask)
+			       struct blk_zone *zones, unsigned int *nr_zones)
 {
 #ifdef CONFIG_BLK_DEV_ZONED
 	struct mapped_device *md = disk->private_data;
@@ -480,8 +479,7 @@ static int dm_blk_report_zones(struct gendisk *disk, sector_t sector,
 	 * So there is no need to loop here trying to fill the entire array
 	 * of zones.
 	 */
-	ret = tgt->type->report_zones(tgt, sector, zones,
-				      nr_zones, gfp_mask);
+	ret = tgt->type->report_zones(tgt, sector, zones, nr_zones);
 
 out:
 	dm_put_live_table(md, srcu_idx);
diff --git a/drivers/scsi/sd.h b/drivers/scsi/sd.h
index 5796ace76225..38c50946fc42 100644
--- a/drivers/scsi/sd.h
+++ b/drivers/scsi/sd.h
@@ -213,8 +213,7 @@ extern blk_status_t sd_zbc_setup_reset_cmnd(struct scsi_cmnd *cmd);
 extern void sd_zbc_complete(struct scsi_cmnd *cmd, unsigned int good_bytes,
 			    struct scsi_sense_hdr *sshdr);
 extern int sd_zbc_report_zones(struct gendisk *disk, sector_t sector,
-			       struct blk_zone *zones, unsigned int *nr_zones,
-			       gfp_t gfp_mask);
+			       struct blk_zone *zones, unsigned int *nr_zones);
 
 #else /* CONFIG_BLK_DEV_ZONED */
 
diff --git a/drivers/scsi/sd_zbc.c b/drivers/scsi/sd_zbc.c
index 7334024b64f1..ec3764c8f3f1 100644
--- a/drivers/scsi/sd_zbc.c
+++ b/drivers/scsi/sd_zbc.c
@@ -109,13 +109,11 @@ static int sd_zbc_do_report_zones(struct scsi_disk *sdkp, unsigned char *buf,
  * @sector: Start 512B sector of the report
  * @zones: Array of zone descriptors
  * @nr_zones: Number of descriptors in the array
- * @gfp_mask: Memory allocation mask
  *
  * Execute a report zones command on the target disk.
  */
 int sd_zbc_report_zones(struct gendisk *disk, sector_t sector,
-			struct blk_zone *zones, unsigned int *nr_zones,
-			gfp_t gfp_mask)
+			struct blk_zone *zones, unsigned int *nr_zones)
 {
 	struct scsi_disk *sdkp = scsi_disk(disk);
 	unsigned int i, buflen, nrz = *nr_zones;
@@ -134,7 +132,7 @@ int sd_zbc_report_zones(struct gendisk *disk, sector_t sector,
 	 */
 	buflen = min(queue_max_hw_sectors(disk->queue) << 9,
 		     roundup((nrz + 1) * 64, 512));
-	buf = kmalloc(buflen, gfp_mask);
+	buf = kmalloc(buflen, GFP_KERNEL);
 	if (!buf)
 		return -ENOMEM;
 
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index 6b959bbb336a..4e91ba6c8a2e 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -2841,9 +2841,7 @@ static int init_blkz_info(struct f2fs_sb_info *sbi, int devi)
 	while (zones && sector < nr_sectors) {
 
 		nr_zones = F2FS_REPORT_NR_ZONES;
-		err = blkdev_report_zones(bdev, sector,
-					  zones, &nr_zones,
-					  GFP_KERNEL);
+		err = blkdev_report_zones(bdev, sector, zones, &nr_zones);
 		if (err)
 			break;
 		if (!nr_zones) {
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 259bd7ad8312..05036e3e3458 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -347,7 +347,7 @@ struct queue_limits {
 extern unsigned int blkdev_nr_zones(struct block_device *bdev);
 extern int blkdev_report_zones(struct block_device *bdev,
 			       sector_t sector, struct blk_zone *zones,
-			       unsigned int *nr_zones, gfp_t gfp_mask);
+			       unsigned int *nr_zones);
 extern int blkdev_reset_zones(struct block_device *bdev, sector_t sectors,
 			      sector_t nr_sectors, gfp_t gfp_mask);
 extern int blk_revalidate_disk_zones(struct gendisk *disk);
@@ -1673,8 +1673,7 @@ struct block_device_operations {
 	/* this callback is with swap_lock and sometimes page table lock held */
 	void (*swap_slot_free_notify) (struct block_device *, unsigned long);
 	int (*report_zones)(struct gendisk *, sector_t sector,
-			    struct blk_zone *zones, unsigned int *nr_zones,
-			    gfp_t gfp_mask);
+			    struct blk_zone *zones, unsigned int *nr_zones);
 	struct module *owner;
 	const struct pr_ops *pr_ops;
 };
diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h
index e1f51d607cc5..3b470cb03b66 100644
--- a/include/linux/device-mapper.h
+++ b/include/linux/device-mapper.h
@@ -95,8 +95,7 @@ typedef int (*dm_prepare_ioctl_fn) (struct dm_target *ti, struct block_device **
 
 typedef int (*dm_report_zones_fn) (struct dm_target *ti, sector_t sector,
 				   struct blk_zone *zones,
-				   unsigned int *nr_zones,
-				   gfp_t gfp_mask);
+				   unsigned int *nr_zones);
 
 /*
  * These iteration functions are typically used to check (and combine)
-- 
cgit v1.2.3


From 26202928fafad8bda8b478edb7e62c885be623d7 Mon Sep 17 00:00:00 2001
From: Damien Le Moal <damien.lemoal@wdc.com>
Date: Mon, 1 Jul 2019 14:09:18 +0900
Subject: block: Limit zone array allocation size

Limit the size of the struct blk_zone array used in
blk_revalidate_disk_zones() to avoid memory allocation failures leading
to disk revalidation failure. Also further reduce the likelihood of
such failures by using kvcalloc() (that is vmalloc()) instead of
allocating contiguous pages with alloc_pages().

Fixes: 515ce6061312 ("scsi: sd_zbc: Fix sd_zbc_report_zones() buffer allocation")
Fixes: e76239a3748c ("block: add a report_zones method")
Cc: stable@vger.kernel.org
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Damien Le Moal <damien.lemoal@wdc.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-zoned.c      | 36 ++++++++++++++++++++----------------
 include/linux/blkdev.h |  5 +++++
 2 files changed, 25 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk-zoned.c b/block/blk-zoned.c
index 58ced170b424..6c503824ba3f 100644
--- a/block/blk-zoned.c
+++ b/block/blk-zoned.c
@@ -14,6 +14,8 @@
 #include <linux/rbtree.h>
 #include <linux/blkdev.h>
 #include <linux/blk-mq.h>
+#include <linux/mm.h>
+#include <linux/vmalloc.h>
 #include <linux/sched/mm.h>
 
 #include "blk.h"
@@ -371,22 +373,25 @@ static inline unsigned long *blk_alloc_zone_bitmap(int node,
  * Allocate an array of struct blk_zone to get nr_zones zone information.
  * The allocated array may be smaller than nr_zones.
  */
-static struct blk_zone *blk_alloc_zones(int node, unsigned int *nr_zones)
+static struct blk_zone *blk_alloc_zones(unsigned int *nr_zones)
 {
-	size_t size = *nr_zones * sizeof(struct blk_zone);
-	struct page *page;
-	int order;
-
-	for (order = get_order(size); order >= 0; order--) {
-		page = alloc_pages_node(node, GFP_NOIO | __GFP_ZERO, order);
-		if (page) {
-			*nr_zones = min_t(unsigned int, *nr_zones,
-				(PAGE_SIZE << order) / sizeof(struct blk_zone));
-			return page_address(page);
-		}
+	struct blk_zone *zones;
+	size_t nrz = min(*nr_zones, BLK_ZONED_REPORT_MAX_ZONES);
+
+	/*
+	 * GFP_KERNEL here is meaningless as the caller task context has
+	 * the PF_MEMALLOC_NOIO flag set in blk_revalidate_disk_zones()
+	 * with memalloc_noio_save().
+	 */
+	zones = kvcalloc(nrz, sizeof(struct blk_zone), GFP_KERNEL);
+	if (!zones) {
+		*nr_zones = 0;
+		return NULL;
 	}
 
-	return NULL;
+	*nr_zones = nrz;
+
+	return zones;
 }
 
 void blk_queue_free_zone_bitmaps(struct request_queue *q)
@@ -448,7 +453,7 @@ int blk_revalidate_disk_zones(struct gendisk *disk)
 
 	/* Get zone information and initialize seq_zones_bitmap */
 	rep_nr_zones = nr_zones;
-	zones = blk_alloc_zones(q->node, &rep_nr_zones);
+	zones = blk_alloc_zones(&rep_nr_zones);
 	if (!zones)
 		goto out;
 
@@ -487,8 +492,7 @@ update:
 out:
 	memalloc_noio_restore(noio_flag);
 
-	free_pages((unsigned long)zones,
-		   get_order(rep_nr_zones * sizeof(struct blk_zone)));
+	kvfree(zones);
 	kfree(seq_zones_wlock);
 	kfree(seq_zones_bitmap);
 
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 05036e3e3458..1ef375dafb1c 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -344,6 +344,11 @@ struct queue_limits {
 
 #ifdef CONFIG_BLK_DEV_ZONED
 
+/*
+ * Maximum number of zones to report with a single report zones command.
+ */
+#define BLK_ZONED_REPORT_MAX_ZONES	8192U
+
 extern unsigned int blkdev_nr_zones(struct block_device *bdev);
 extern int blkdev_report_zones(struct block_device *bdev,
 			       sector_t sector, struct blk_zone *zones,
-- 
cgit v1.2.3


From a101b043c44dfcb63bed7f29a675e9fa0259005e Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Thu, 11 Jul 2019 16:33:12 -0400
Subject: SUNRPC: Fix transport accounting when caller specifies an rpc_xprt

Ensure that we do the required accounting for the round robin queue
when the caller to rpc_init_task() has passed in a transport to be
used.

Reported-by: Olga Kornievskaia <aglo@umich.edu>
Reported-by: Neil Brown <neilb@suse.com>
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 include/linux/sunrpc/clnt.h |  2 ++
 net/sunrpc/clnt.c           | 42 ++++++++++++++++++++----------------------
 net/sunrpc/sched.c          |  3 ++-
 3 files changed, 24 insertions(+), 23 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h
index 4619098affa3..4e070e00c143 100644
--- a/include/linux/sunrpc/clnt.h
+++ b/include/linux/sunrpc/clnt.h
@@ -164,6 +164,8 @@ void		rpc_shutdown_client(struct rpc_clnt *);
 void		rpc_release_client(struct rpc_clnt *);
 void		rpc_task_release_transport(struct rpc_task *);
 void		rpc_task_release_client(struct rpc_task *);
+struct rpc_xprt	*rpc_task_get_xprt(struct rpc_clnt *clnt,
+		struct rpc_xprt *xprt);
 
 int		rpcb_create_local(struct net *);
 void		rpcb_put_local(struct net *);
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index d599fab8adcb..383555d2b522 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -978,11 +978,10 @@ out:
 }
 EXPORT_SYMBOL_GPL(rpc_bind_new_program);
 
-static struct rpc_xprt *
-rpc_task_get_xprt(struct rpc_clnt *clnt)
+struct rpc_xprt *
+rpc_task_get_xprt(struct rpc_clnt *clnt, struct rpc_xprt *xprt)
 {
 	struct rpc_xprt_switch *xps;
-	struct rpc_xprt *xprt= xprt_iter_get_next(&clnt->cl_xpi);
 
 	if (!xprt)
 		return NULL;
@@ -995,24 +994,6 @@ rpc_task_get_xprt(struct rpc_clnt *clnt)
 	return xprt;
 }
 
-static struct rpc_xprt *
-rpc_task_get_first_xprt(struct rpc_clnt *clnt)
-{
-	struct rpc_xprt_switch *xps;
-	struct rpc_xprt *xprt;
-
-	rcu_read_lock();
-	xprt = xprt_get(rcu_dereference(clnt->cl_xprt));
-	if (xprt) {
-		atomic_long_inc(&xprt->queuelen);
-		xps = rcu_dereference(clnt->cl_xpi.xpi_xpswitch);
-		atomic_long_inc(&xps->xps_queuelen);
-	}
-	rcu_read_unlock();
-
-	return xprt;
-}
-
 static void
 rpc_task_release_xprt(struct rpc_clnt *clnt, struct rpc_xprt *xprt)
 {
@@ -1057,6 +1038,23 @@ void rpc_task_release_client(struct rpc_task *task)
 	}
 }
 
+static struct rpc_xprt *
+rpc_task_get_first_xprt(struct rpc_clnt *clnt)
+{
+	struct rpc_xprt *xprt;
+
+	rcu_read_lock();
+	xprt = xprt_get(rcu_dereference(clnt->cl_xprt));
+	rcu_read_unlock();
+	return rpc_task_get_xprt(clnt, xprt);
+}
+
+static struct rpc_xprt *
+rpc_task_get_next_xprt(struct rpc_clnt *clnt)
+{
+	return rpc_task_get_xprt(clnt, xprt_iter_get_next(&clnt->cl_xpi));
+}
+
 static
 void rpc_task_set_transport(struct rpc_task *task, struct rpc_clnt *clnt)
 {
@@ -1065,7 +1063,7 @@ void rpc_task_set_transport(struct rpc_task *task, struct rpc_clnt *clnt)
 	if (task->tk_flags & RPC_TASK_NO_ROUND_ROBIN)
 		task->tk_xprt = rpc_task_get_first_xprt(clnt);
 	else
-		task->tk_xprt = rpc_task_get_xprt(clnt);
+		task->tk_xprt = rpc_task_get_next_xprt(clnt);
 }
 
 static
diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c
index 8a0779e963f9..1f275aba786f 100644
--- a/net/sunrpc/sched.c
+++ b/net/sunrpc/sched.c
@@ -1092,7 +1092,8 @@ static void rpc_init_task(struct rpc_task *task, const struct rpc_task_setup *ta
 	/* Initialize workqueue for async tasks */
 	task->tk_workqueue = task_setup_data->workqueue;
 
-	task->tk_xprt = xprt_get(task_setup_data->rpc_xprt);
+	task->tk_xprt = rpc_task_get_xprt(task_setup_data->rpc_client,
+			xprt_get(task_setup_data->rpc_xprt));
 
 	task->tk_op_cred = get_rpccred(task_setup_data->rpc_op_cred);
 
-- 
cgit v1.2.3


From 1df379924304b687263942452836db1d725155df Mon Sep 17 00:00:00 2001
From: Stephen Rothwell <sfr@canb.auug.org.au>
Date: Tue, 2 Jul 2019 12:03:50 +1000
Subject: clk: consolidate the __clk_get_hw() declarations

Without this we were getting errors like:

In file included from drivers/clk/clkdev.c:22:0:
drivers/clk/clk.h:36:23: error: static declaration of '__clk_get_hw' follows non-static declaration
include/linux/clk-provider.h:808:16: note: previous declaration of '__clk_get_hw' was here

Fixes: 59fcdce425b7 ("clk: Remove ifdef for COMMON_CLK in clk-provider.h")
Fixes: 73e0e496afda ("clkdev: Always allocate a struct clk and call __clk_get() w/ CCF")
Signed-off-by: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Stephen Boyd <sboyd@kernel.org>
---
 drivers/clk/clk.h             | 4 ----
 drivers/clk/imx/clk-imx6q.c   | 1 +
 drivers/clk/imx/clk-imx6sll.c | 1 +
 drivers/clk/imx/clk-imx6sx.c  | 1 +
 drivers/clk/imx/clk-imx6ul.c  | 1 +
 drivers/clk/imx/clk-imx7d.c   | 1 +
 drivers/clk/imx/clk.c         | 1 +
 include/linux/clk-provider.h  | 7 +++++++
 8 files changed, 13 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/clk/clk.h b/drivers/clk/clk.h
index d8400d623b34..2d801900cad5 100644
--- a/drivers/clk/clk.h
+++ b/drivers/clk/clk.h
@@ -33,10 +33,6 @@ clk_hw_create_clk(struct device *dev, struct clk_hw *hw, const char *dev_id,
 {
 	return (struct clk *)hw;
 }
-static struct clk_hw *__clk_get_hw(struct clk *clk)
-{
-	return (struct clk_hw *)clk;
-}
 static inline void __clk_put(struct clk *clk) { }
 
 #endif
diff --git a/drivers/clk/imx/clk-imx6q.c b/drivers/clk/imx/clk-imx6q.c
index 708e7c5590dd..fa5ef3cc2240 100644
--- a/drivers/clk/imx/clk-imx6q.c
+++ b/drivers/clk/imx/clk-imx6q.c
@@ -14,6 +14,7 @@
 #include <linux/types.h>
 #include <linux/clk.h>
 #include <linux/clkdev.h>
+#include <linux/clk-provider.h>
 #include <linux/err.h>
 #include <linux/io.h>
 #include <linux/of.h>
diff --git a/drivers/clk/imx/clk-imx6sll.c b/drivers/clk/imx/clk-imx6sll.c
index 7eea448cb9a9..a9548c4b6d78 100644
--- a/drivers/clk/imx/clk-imx6sll.c
+++ b/drivers/clk/imx/clk-imx6sll.c
@@ -7,6 +7,7 @@
 #include <dt-bindings/clock/imx6sll-clock.h>
 #include <linux/clk.h>
 #include <linux/clkdev.h>
+#include <linux/clk-provider.h>
 #include <linux/err.h>
 #include <linux/init.h>
 #include <linux/io.h>
diff --git a/drivers/clk/imx/clk-imx6sx.c b/drivers/clk/imx/clk-imx6sx.c
index 91558b09bf9e..77748d6d4ccc 100644
--- a/drivers/clk/imx/clk-imx6sx.c
+++ b/drivers/clk/imx/clk-imx6sx.c
@@ -12,6 +12,7 @@
 #include <dt-bindings/clock/imx6sx-clock.h>
 #include <linux/clk.h>
 #include <linux/clkdev.h>
+#include <linux/clk-provider.h>
 #include <linux/err.h>
 #include <linux/init.h>
 #include <linux/io.h>
diff --git a/drivers/clk/imx/clk-imx6ul.c b/drivers/clk/imx/clk-imx6ul.c
index fd60d1549f71..e0e4625aacd0 100644
--- a/drivers/clk/imx/clk-imx6ul.c
+++ b/drivers/clk/imx/clk-imx6ul.c
@@ -12,6 +12,7 @@
 #include <dt-bindings/clock/imx6ul-clock.h>
 #include <linux/clk.h>
 #include <linux/clkdev.h>
+#include <linux/clk-provider.h>
 #include <linux/err.h>
 #include <linux/init.h>
 #include <linux/io.h>
diff --git a/drivers/clk/imx/clk-imx7d.c b/drivers/clk/imx/clk-imx7d.c
index 5b8a0c729f90..0ff3eb14d3af 100644
--- a/drivers/clk/imx/clk-imx7d.c
+++ b/drivers/clk/imx/clk-imx7d.c
@@ -12,6 +12,7 @@
 #include <dt-bindings/clock/imx7d-clock.h>
 #include <linux/clk.h>
 #include <linux/clkdev.h>
+#include <linux/clk-provider.h>
 #include <linux/err.h>
 #include <linux/init.h>
 #include <linux/io.h>
diff --git a/drivers/clk/imx/clk.c b/drivers/clk/imx/clk.c
index 1efed86217f7..588d1f45325d 100644
--- a/drivers/clk/imx/clk.c
+++ b/drivers/clk/imx/clk.c
@@ -1,5 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0
 #include <linux/clk.h>
+#include <linux/clk-provider.h>
 #include <linux/err.h>
 #include <linux/of.h>
 #include <linux/slab.h>
diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h
index 9ba000e3a50d..55d48140b0d0 100644
--- a/include/linux/clk-provider.h
+++ b/include/linux/clk-provider.h
@@ -805,7 +805,14 @@ void devm_clk_hw_unregister(struct device *dev, struct clk_hw *hw);
 /* helper functions */
 const char *__clk_get_name(const struct clk *clk);
 const char *clk_hw_get_name(const struct clk_hw *hw);
+#ifdef CONFIG_COMMON_CLK
 struct clk_hw *__clk_get_hw(struct clk *clk);
+#else
+static inline struct clk_hw *__clk_get_hw(struct clk *clk)
+{
+	return (struct clk_hw *)clk;
+}
+#endif
 unsigned int clk_hw_get_num_parents(const struct clk_hw *hw);
 struct clk_hw *clk_hw_get_parent(const struct clk_hw *hw);
 struct clk_hw *clk_hw_get_parent_by_index(const struct clk_hw *hw,
-- 
cgit v1.2.3


From 9bd3bb6703d8c0a5fb8aec8e3287bd55b7341dcd Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.ibm.com>
Date: Thu, 11 Jul 2019 20:52:08 -0700
Subject: mm/nvdimm: add is_ioremap_addr and use that to check ioremap address

Architectures like powerpc use different address ranges to map the
ioremap and vmalloc ranges.  The memunmap() check used by the nvdimm layer
was wrongly using is_vmalloc_addr() to check for the ioremap range, which
fails on ppc64.  This results in ppc64 not freeing the ioremap mapping.
The side effect of this is an unbind failure during module unload with
the papr_scm nvdimm driver.

Link: http://lkml.kernel.org/r/20190701134038.14165-1-aneesh.kumar@linux.ibm.com
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
Fixes: b5beae5e224f ("powerpc/pseries: Add driver for PAPR SCM regions")
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/powerpc/include/asm/pgtable.h | 14 ++++++++++++++
 include/linux/mm.h                 |  5 +++++
 kernel/iomem.c                     |  2 +-
 3 files changed, 20 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h
index 3f53be60fb01..64145751b2fd 100644
--- a/arch/powerpc/include/asm/pgtable.h
+++ b/arch/powerpc/include/asm/pgtable.h
@@ -140,6 +140,20 @@ static inline void pte_frag_set(mm_context_t *ctx, void *p)
 }
 #endif
 
+#ifdef CONFIG_PPC64
+#define is_ioremap_addr is_ioremap_addr
+static inline bool is_ioremap_addr(const void *x)
+{
+#ifdef CONFIG_MMU
+	unsigned long addr = (unsigned long)x;
+
+	return addr >= IOREMAP_BASE && addr < IOREMAP_END;
+#else
+	return false;
+#endif
+}
+#endif /* CONFIG_PPC64 */
+
 #endif /* __ASSEMBLY__ */
 
 #endif /* _ASM_POWERPC_PGTABLE_H */
diff --git a/include/linux/mm.h b/include/linux/mm.h
index dd0b5f4e1e45..0a6dae2f2b84 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -633,6 +633,11 @@ static inline bool is_vmalloc_addr(const void *x)
 	return false;
 #endif
 }
+
+#ifndef is_ioremap_addr
+#define is_ioremap_addr(x) is_vmalloc_addr(x)
+#endif
+
 #ifdef CONFIG_MMU
 extern int is_vmalloc_or_module_addr(const void *x);
 #else
diff --git a/kernel/iomem.c b/kernel/iomem.c
index 93c264444510..62c92e43aa0d 100644
--- a/kernel/iomem.c
+++ b/kernel/iomem.c
@@ -121,7 +121,7 @@ EXPORT_SYMBOL(memremap);
 
 void memunmap(void *addr)
 {
-	if (is_vmalloc_addr(addr))
+	if (is_ioremap_addr(addr))
 		iounmap((void __iomem *) addr);
 }
 EXPORT_SYMBOL(memunmap);
-- 
cgit v1.2.3


From a760f8a67cb38d19fd52f2a28c65c967e469367e Mon Sep 17 00:00:00 2001
From: Qian Cai <cai@lca.pw>
Date: Thu, 11 Jul 2019 20:52:24 -0700
Subject: include/linux/dmar.h: replace single-char identifiers in macros

A few macros in the IOMMU code have single-char identifiers, which makes
the code hard to read and debug.  Replace them with meaningful names.

Link: http://lkml.kernel.org/r/1559566783-13627-1-git-send-email-cai@lca.pw
Signed-off-by: Qian Cai <cai@lca.pw>
Suggested-by: Andrew Morton <akpm@linux-foundation.org>
Cc: Joerg Roedel <jroedel@suse.de>
Cc: Robin Murphy <robin.murphy@arm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/dmar.h | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/dmar.h b/include/linux/dmar.h
index 28813c6f44b6..a7cf3599d9a1 100644
--- a/include/linux/dmar.h
+++ b/include/linux/dmar.h
@@ -92,12 +92,14 @@ static inline bool dmar_rcu_check(void)
 
 #define	dmar_rcu_dereference(p)	rcu_dereference_check((p), dmar_rcu_check())
 
-#define	for_each_dev_scope(a, c, p, d)	\
-	for ((p) = 0; ((d) = (p) < (c) ? dmar_rcu_dereference((a)[(p)].dev) : \
-			NULL, (p) < (c)); (p)++)
-
-#define	for_each_active_dev_scope(a, c, p, d)	\
-	for_each_dev_scope((a), (c), (p), (d))	if (!(d)) { continue; } else
+#define for_each_dev_scope(devs, cnt, i, tmp)				\
+	for ((i) = 0; ((tmp) = (i) < (cnt) ?				\
+	    dmar_rcu_dereference((devs)[(i)].dev) : NULL, (i) < (cnt)); \
+	    (i)++)
+
+#define for_each_active_dev_scope(devs, cnt, i, tmp)			\
+	for_each_dev_scope((devs), (cnt), (i), (tmp))			\
+		if (!(tmp)) { continue; } else
 
 extern int dmar_table_init(void);
 extern int dmar_dev_scope_init(void);
-- 
cgit v1.2.3


From 7d8ad890dad00f6cd64bfb44d9be4fceb10cf819 Mon Sep 17 00:00:00 2001
From: Marco Elver <elver@google.com>
Date: Thu, 11 Jul 2019 20:54:03 -0700
Subject: mm/kasan: introduce __kasan_check_{read,write}

Patch series "mm/kasan: Add object validation in ksize()", v3.

This patch (of 5):

This introduces __kasan_check_{read,write}.  __kasan_check functions may
be used from anywhere, even compilation units that disable instrumentation
selectively.

This change eliminates the need for the __KASAN_INTERNAL definition.

[elver@google.com: v5]
  Link: http://lkml.kernel.org/r/20190708170706.174189-2-elver@google.com
Link: http://lkml.kernel.org/r/20190626142014.141844-2-elver@google.com
Signed-off-by: Marco Elver <elver@google.com>
Acked-by: Mark Rutland <mark.rutland@arm.com>
Reviewed-by: Andrey Ryabinin <aryabinin@virtuozzo.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Alexander Potapenko <glider@google.com>
Cc: Andrey Konovalov <andreyknvl@google.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Kees Cook <keescook@chromium.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/kasan-checks.h | 25 ++++++++++++++++++++++---
 mm/kasan/common.c            | 10 ++++------
 2 files changed, 26 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kasan-checks.h b/include/linux/kasan-checks.h
index a61dc075e2ce..221f05fbddd7 100644
--- a/include/linux/kasan-checks.h
+++ b/include/linux/kasan-checks.h
@@ -2,9 +2,28 @@
 #ifndef _LINUX_KASAN_CHECKS_H
 #define _LINUX_KASAN_CHECKS_H
 
-#if defined(__SANITIZE_ADDRESS__) || defined(__KASAN_INTERNAL)
-void kasan_check_read(const volatile void *p, unsigned int size);
-void kasan_check_write(const volatile void *p, unsigned int size);
+/*
+ * __kasan_check_*: Always available when KASAN is enabled. This may be used
+ * even in compilation units that selectively disable KASAN, but must use KASAN
+ * to validate access to an address.   Never use these in header files!
+ */
+#ifdef CONFIG_KASAN
+void __kasan_check_read(const volatile void *p, unsigned int size);
+void __kasan_check_write(const volatile void *p, unsigned int size);
+#else
+static inline void __kasan_check_read(const volatile void *p, unsigned int size)
+{ }
+static inline void __kasan_check_write(const volatile void *p, unsigned int size)
+{ }
+#endif
+
+/*
+ * kasan_check_*: Only available when the particular compilation unit has KASAN
+ * instrumentation enabled. May be used in header files.
+ */
+#ifdef __SANITIZE_ADDRESS__
+#define kasan_check_read __kasan_check_read
+#define kasan_check_write __kasan_check_write
 #else
 static inline void kasan_check_read(const volatile void *p, unsigned int size)
 { }
diff --git a/mm/kasan/common.c b/mm/kasan/common.c
index 242fdc01aaa9..6bada42cc152 100644
--- a/mm/kasan/common.c
+++ b/mm/kasan/common.c
@@ -14,8 +14,6 @@
  *
  */
 
-#define __KASAN_INTERNAL
-
 #include <linux/export.h>
 #include <linux/interrupt.h>
 #include <linux/init.h>
@@ -89,17 +87,17 @@ void kasan_disable_current(void)
 	current->kasan_depth--;
 }
 
-void kasan_check_read(const volatile void *p, unsigned int size)
+void __kasan_check_read(const volatile void *p, unsigned int size)
 {
 	check_memory_region((unsigned long)p, size, false, _RET_IP_);
 }
-EXPORT_SYMBOL(kasan_check_read);
+EXPORT_SYMBOL(__kasan_check_read);
 
-void kasan_check_write(const volatile void *p, unsigned int size)
+void __kasan_check_write(const volatile void *p, unsigned int size)
 {
 	check_memory_region((unsigned long)p, size, true, _RET_IP_);
 }
-EXPORT_SYMBOL(kasan_check_write);
+EXPORT_SYMBOL(__kasan_check_write);
 
 #undef memset
 void *memset(void *addr, int c, size_t len)
-- 
cgit v1.2.3


From b5f6e0fc7d60e0234dac82498e90dfe9027bad1f Mon Sep 17 00:00:00 2001
From: Marco Elver <elver@google.com>
Date: Thu, 11 Jul 2019 20:54:07 -0700
Subject: mm/kasan: change kasan_check_{read,write} to return boolean

This changes {,__}kasan_check_{read,write} functions to return a boolean
denoting if the access was valid or not.

[sfr@canb.auug.org.au: include types.h for "bool"]
  Link: http://lkml.kernel.org/r/20190705184949.13cdd021@canb.auug.org.au
Link: http://lkml.kernel.org/r/20190626142014.141844-3-elver@google.com
Signed-off-by: Marco Elver <elver@google.com>
Signed-off-by: Stephen Rothwell <sfr@canb.auug.org.au>
Reviewed-by: Andrey Ryabinin <aryabinin@virtuozzo.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Alexander Potapenko <glider@google.com>
Cc: Andrey Konovalov <andreyknvl@google.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Kees Cook <keescook@chromium.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/kasan-checks.h | 30 ++++++++++++++++++++----------
 mm/kasan/common.c            |  8 ++++----
 mm/kasan/generic.c           | 13 +++++++------
 mm/kasan/kasan.h             | 10 +++++++++-
 mm/kasan/tags.c              | 12 +++++++-----
 5 files changed, 47 insertions(+), 26 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kasan-checks.h b/include/linux/kasan-checks.h
index 221f05fbddd7..ac6aba632f2d 100644
--- a/include/linux/kasan-checks.h
+++ b/include/linux/kasan-checks.h
@@ -2,19 +2,25 @@
 #ifndef _LINUX_KASAN_CHECKS_H
 #define _LINUX_KASAN_CHECKS_H
 
+#include <linux/types.h>
+
 /*
  * __kasan_check_*: Always available when KASAN is enabled. This may be used
  * even in compilation units that selectively disable KASAN, but must use KASAN
  * to validate access to an address.   Never use these in header files!
  */
 #ifdef CONFIG_KASAN
-void __kasan_check_read(const volatile void *p, unsigned int size);
-void __kasan_check_write(const volatile void *p, unsigned int size);
+bool __kasan_check_read(const volatile void *p, unsigned int size);
+bool __kasan_check_write(const volatile void *p, unsigned int size);
 #else
-static inline void __kasan_check_read(const volatile void *p, unsigned int size)
-{ }
-static inline void __kasan_check_write(const volatile void *p, unsigned int size)
-{ }
+static inline bool __kasan_check_read(const volatile void *p, unsigned int size)
+{
+	return true;
+}
+static inline bool __kasan_check_write(const volatile void *p, unsigned int size)
+{
+	return true;
+}
 #endif
 
 /*
@@ -25,10 +31,14 @@ static inline void __kasan_check_write(const volatile void *p, unsigned int size
 #define kasan_check_read __kasan_check_read
 #define kasan_check_write __kasan_check_write
 #else
-static inline void kasan_check_read(const volatile void *p, unsigned int size)
-{ }
-static inline void kasan_check_write(const volatile void *p, unsigned int size)
-{ }
+static inline bool kasan_check_read(const volatile void *p, unsigned int size)
+{
+	return true;
+}
+static inline bool kasan_check_write(const volatile void *p, unsigned int size)
+{
+	return true;
+}
 #endif
 
 #endif
diff --git a/mm/kasan/common.c b/mm/kasan/common.c
index 6bada42cc152..2277b82902d8 100644
--- a/mm/kasan/common.c
+++ b/mm/kasan/common.c
@@ -87,15 +87,15 @@ void kasan_disable_current(void)
 	current->kasan_depth--;
 }
 
-void __kasan_check_read(const volatile void *p, unsigned int size)
+bool __kasan_check_read(const volatile void *p, unsigned int size)
 {
-	check_memory_region((unsigned long)p, size, false, _RET_IP_);
+	return check_memory_region((unsigned long)p, size, false, _RET_IP_);
 }
 EXPORT_SYMBOL(__kasan_check_read);
 
-void __kasan_check_write(const volatile void *p, unsigned int size)
+bool __kasan_check_write(const volatile void *p, unsigned int size)
 {
-	check_memory_region((unsigned long)p, size, true, _RET_IP_);
+	return check_memory_region((unsigned long)p, size, true, _RET_IP_);
 }
 EXPORT_SYMBOL(__kasan_check_write);
 
diff --git a/mm/kasan/generic.c b/mm/kasan/generic.c
index 504c79363a34..616f9dd82d12 100644
--- a/mm/kasan/generic.c
+++ b/mm/kasan/generic.c
@@ -166,29 +166,30 @@ static __always_inline bool memory_is_poisoned(unsigned long addr, size_t size)
 	return memory_is_poisoned_n(addr, size);
 }
 
-static __always_inline void check_memory_region_inline(unsigned long addr,
+static __always_inline bool check_memory_region_inline(unsigned long addr,
 						size_t size, bool write,
 						unsigned long ret_ip)
 {
 	if (unlikely(size == 0))
-		return;
+		return true;
 
 	if (unlikely((void *)addr <
 		kasan_shadow_to_mem((void *)KASAN_SHADOW_START))) {
 		kasan_report(addr, size, write, ret_ip);
-		return;
+		return false;
 	}
 
 	if (likely(!memory_is_poisoned(addr, size)))
-		return;
+		return true;
 
 	kasan_report(addr, size, write, ret_ip);
+	return false;
 }
 
-void check_memory_region(unsigned long addr, size_t size, bool write,
+bool check_memory_region(unsigned long addr, size_t size, bool write,
 				unsigned long ret_ip)
 {
-	check_memory_region_inline(addr, size, write, ret_ip);
+	return check_memory_region_inline(addr, size, write, ret_ip);
 }
 
 void kasan_cache_shrink(struct kmem_cache *cache)
diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h
index 1979db4763e2..014f19e76247 100644
--- a/mm/kasan/kasan.h
+++ b/mm/kasan/kasan.h
@@ -128,7 +128,15 @@ static inline bool addr_has_shadow(const void *addr)
 
 void kasan_poison_shadow(const void *address, size_t size, u8 value);
 
-void check_memory_region(unsigned long addr, size_t size, bool write,
+/**
+ * check_memory_region - Check memory region, and report if invalid access.
+ * @addr: the accessed address
+ * @size: the accessed size
+ * @write: true if access is a write access
+ * @ret_ip: return address
+ * @return: true if access was valid, false if invalid
+ */
+bool check_memory_region(unsigned long addr, size_t size, bool write,
 				unsigned long ret_ip);
 
 void *find_first_bad_addr(void *addr, size_t size);
diff --git a/mm/kasan/tags.c b/mm/kasan/tags.c
index 63fca3172659..0e987c9ca052 100644
--- a/mm/kasan/tags.c
+++ b/mm/kasan/tags.c
@@ -76,7 +76,7 @@ void *kasan_reset_tag(const void *addr)
 	return reset_tag(addr);
 }
 
-void check_memory_region(unsigned long addr, size_t size, bool write,
+bool check_memory_region(unsigned long addr, size_t size, bool write,
 				unsigned long ret_ip)
 {
 	u8 tag;
@@ -84,7 +84,7 @@ void check_memory_region(unsigned long addr, size_t size, bool write,
 	void *untagged_addr;
 
 	if (unlikely(size == 0))
-		return;
+		return true;
 
 	tag = get_tag((const void *)addr);
 
@@ -106,22 +106,24 @@ void check_memory_region(unsigned long addr, size_t size, bool write,
 	 * set to KASAN_TAG_KERNEL (0xFF)).
 	 */
 	if (tag == KASAN_TAG_KERNEL)
-		return;
+		return true;
 
 	untagged_addr = reset_tag((const void *)addr);
 	if (unlikely(untagged_addr <
 			kasan_shadow_to_mem((void *)KASAN_SHADOW_START))) {
 		kasan_report(addr, size, write, ret_ip);
-		return;
+		return false;
 	}
 	shadow_first = kasan_mem_to_shadow(untagged_addr);
 	shadow_last = kasan_mem_to_shadow(untagged_addr + size - 1);
 	for (shadow = shadow_first; shadow <= shadow_last; shadow++) {
 		if (*shadow != tag) {
 			kasan_report(addr, size, write, ret_ip);
-			return;
+			return false;
 		}
 	}
+
+	return true;
 }
 
 #define DEFINE_HWASAN_LOAD_STORE(size)					\
-- 
cgit v1.2.3


From 10d1f8cb3965a6f633bf23eb984cda552927e3a5 Mon Sep 17 00:00:00 2001
From: Marco Elver <elver@google.com>
Date: Thu, 11 Jul 2019 20:54:14 -0700
Subject: mm/slab: refactor common ksize KASAN logic into slab_common.c

This refactors common code of ksize() between the various allocators into
slab_common.c: __ksize() is the allocator-specific implementation without
instrumentation, whereas ksize() includes the required KASAN logic.

Link: http://lkml.kernel.org/r/20190626142014.141844-5-elver@google.com
Signed-off-by: Marco Elver <elver@google.com>
Acked-by: Christoph Lameter <cl@linux.com>
Reviewed-by: Andrey Ryabinin <aryabinin@virtuozzo.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Alexander Potapenko <glider@google.com>
Cc: Andrey Konovalov <andreyknvl@google.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Kees Cook <keescook@chromium.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/slab.h |  1 +
 mm/slab.c            | 22 +++++-----------------
 mm/slab_common.c     | 26 ++++++++++++++++++++++++++
 mm/slob.c            |  4 ++--
 mm/slub.c            | 14 ++------------
 5 files changed, 36 insertions(+), 31 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/slab.h b/include/linux/slab.h
index 9449b19c5f10..98c3d12b7275 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -184,6 +184,7 @@ void * __must_check __krealloc(const void *, size_t, gfp_t);
 void * __must_check krealloc(const void *, size_t, gfp_t);
 void kfree(const void *);
 void kzfree(const void *);
+size_t __ksize(const void *);
 size_t ksize(const void *);
 
 #ifdef CONFIG_HAVE_HARDENED_USERCOPY_ALLOCATOR
diff --git a/mm/slab.c b/mm/slab.c
index db01e9aae31b..3521a351ceb5 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -4204,20 +4204,12 @@ void __check_heap_object(const void *ptr, unsigned long n, struct page *page,
 #endif /* CONFIG_HARDENED_USERCOPY */
 
 /**
- * ksize - get the actual amount of memory allocated for a given object
- * @objp: Pointer to the object
+ * __ksize -- Uninstrumented ksize.
  *
- * kmalloc may internally round up allocations and return more memory
- * than requested. ksize() can be used to determine the actual amount of
- * memory allocated. The caller may use this additional memory, even though
- * a smaller amount of memory was initially specified with the kmalloc call.
- * The caller must guarantee that objp points to a valid object previously
- * allocated with either kmalloc() or kmem_cache_alloc(). The object
- * must not be freed during the duration of the call.
- *
- * Return: size of the actual memory used by @objp in bytes
+ * Unlike ksize(), __ksize() is uninstrumented, and does not provide the same
+ * safety checks as ksize() with KASAN instrumentation enabled.
  */
-size_t ksize(const void *objp)
+size_t __ksize(const void *objp)
 {
 	struct kmem_cache *c;
 	size_t size;
@@ -4228,11 +4220,7 @@ size_t ksize(const void *objp)
 
 	c = virt_to_cache(objp);
 	size = c ? c->object_size : 0;
-	/* We assume that ksize callers could use the whole allocated area,
-	 * so we need to unpoison this area.
-	 */
-	kasan_unpoison_shadow(objp, size);
 
 	return size;
 }
-EXPORT_SYMBOL(ksize);
+EXPORT_SYMBOL(__ksize);
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 58251ba63e4a..b7c6a40e436a 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -1597,6 +1597,32 @@ void kzfree(const void *p)
 }
 EXPORT_SYMBOL(kzfree);
 
+/**
+ * ksize - get the actual amount of memory allocated for a given object
+ * @objp: Pointer to the object
+ *
+ * kmalloc may internally round up allocations and return more memory
+ * than requested. ksize() can be used to determine the actual amount of
+ * memory allocated. The caller may use this additional memory, even though
+ * a smaller amount of memory was initially specified with the kmalloc call.
+ * The caller must guarantee that objp points to a valid object previously
+ * allocated with either kmalloc() or kmem_cache_alloc(). The object
+ * must not be freed during the duration of the call.
+ *
+ * Return: size of the actual memory used by @objp in bytes
+ */
+size_t ksize(const void *objp)
+{
+	size_t size = __ksize(objp);
+	/*
+	 * We assume that ksize callers could use whole allocated area,
+	 * so we need to unpoison this area.
+	 */
+	kasan_unpoison_shadow(objp, size);
+	return size;
+}
+EXPORT_SYMBOL(ksize);
+
 /* Tracepoints definitions. */
 EXPORT_TRACEPOINT_SYMBOL(kmalloc);
 EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc);
diff --git a/mm/slob.c b/mm/slob.c
index 84aefd9b91ee..7f421d0ca9ab 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -527,7 +527,7 @@ void kfree(const void *block)
 EXPORT_SYMBOL(kfree);
 
 /* can't use ksize for kmem_cache_alloc memory, only kmalloc */
-size_t ksize(const void *block)
+size_t __ksize(const void *block)
 {
 	struct page *sp;
 	int align;
@@ -545,7 +545,7 @@ size_t ksize(const void *block)
 	m = (unsigned int *)(block - align);
 	return SLOB_UNITS(*m) * SLOB_UNIT;
 }
-EXPORT_SYMBOL(ksize);
+EXPORT_SYMBOL(__ksize);
 
 int __kmem_cache_create(struct kmem_cache *c, slab_flags_t flags)
 {
diff --git a/mm/slub.c b/mm/slub.c
index d46a91759b96..5e217653286c 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -3895,7 +3895,7 @@ void __check_heap_object(const void *ptr, unsigned long n, struct page *page,
 }
 #endif /* CONFIG_HARDENED_USERCOPY */
 
-static size_t __ksize(const void *object)
+size_t __ksize(const void *object)
 {
 	struct page *page;
 
@@ -3911,17 +3911,7 @@ static size_t __ksize(const void *object)
 
 	return slab_ksize(page->slab_cache);
 }
-
-size_t ksize(const void *object)
-{
-	size_t size = __ksize(object);
-	/* We assume that ksize callers could use whole allocated area,
-	 * so we need to unpoison this area.
-	 */
-	kasan_unpoison_shadow(object, size);
-	return size;
-}
-EXPORT_SYMBOL(ksize);
+EXPORT_SYMBOL(__ksize);
 
 void kfree(const void *x)
 {
-- 
cgit v1.2.3


From 0d4ca4c9bab397b525c9a4f875d31410ce4bc738 Mon Sep 17 00:00:00 2001
From: Marco Elver <elver@google.com>
Date: Thu, 11 Jul 2019 20:54:18 -0700
Subject: mm/kasan: add object validation in ksize()

ksize() has been unconditionally unpoisoning the whole shadow memory
region associated with an allocation.  This can lead to various undetected
bugs, for example, double-kzfree().

Specifically, kzfree() uses ksize() to determine the actual allocation
size, and subsequently zeroes the memory.  Since ksize() used to just
unpoison the whole shadow memory region, no invalid free was detected.

This patch addresses this as follows:

1. Add a check in ksize(), and only then unpoison the memory region.

2. Preserve kasan_unpoison_slab() semantics by explicitly unpoisoning
   the shadow memory region using the size obtained from __ksize().

Tested:
1. With SLAB allocator: a) normal boot without warnings; b) verified the
   added double-kzfree() is detected.
2. With SLUB allocator: a) normal boot without warnings; b) verified the
   added double-kzfree() is detected.

[elver@google.com: s/BUG_ON/WARN_ON_ONCE/, per Kees]
  Link: http://lkml.kernel.org/r/20190627094445.216365-6-elver@google.com
Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=199359
Link: http://lkml.kernel.org/r/20190626142014.141844-6-elver@google.com
Signed-off-by: Marco Elver <elver@google.com>
Acked-by: Kees Cook <keescook@chromium.org>
Reviewed-by: Andrey Ryabinin <aryabinin@virtuozzo.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Alexander Potapenko <glider@google.com>
Cc: Andrey Konovalov <andreyknvl@google.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/kasan.h |  7 +++++--
 mm/slab_common.c      | 22 +++++++++++++++++++++-
 2 files changed, 26 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kasan.h b/include/linux/kasan.h
index b40ea104dd36..cc8a03cc9674 100644
--- a/include/linux/kasan.h
+++ b/include/linux/kasan.h
@@ -76,8 +76,11 @@ void kasan_free_shadow(const struct vm_struct *vm);
 int kasan_add_zero_shadow(void *start, unsigned long size);
 void kasan_remove_zero_shadow(void *start, unsigned long size);
 
-size_t ksize(const void *);
-static inline void kasan_unpoison_slab(const void *ptr) { ksize(ptr); }
+size_t __ksize(const void *);
+static inline void kasan_unpoison_slab(const void *ptr)
+{
+	kasan_unpoison_shadow(ptr, __ksize(ptr));
+}
 size_t kasan_metadata_size(struct kmem_cache *cache);
 
 bool kasan_save_enable_multi_shot(void);
diff --git a/mm/slab_common.c b/mm/slab_common.c
index b7c6a40e436a..a09bb10aa026 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -1613,7 +1613,27 @@ EXPORT_SYMBOL(kzfree);
  */
 size_t ksize(const void *objp)
 {
-	size_t size = __ksize(objp);
+	size_t size;
+
+	if (WARN_ON_ONCE(!objp))
+		return 0;
+	/*
+	 * We need to check that the pointed to object is valid, and only then
+	 * unpoison the shadow memory below. We use __kasan_check_read(), to
+	 * generate a more useful report at the time ksize() is called (rather
+	 * than later where behaviour is undefined due to potential
+	 * use-after-free or double-free).
+	 *
+	 * If the pointed to memory is invalid we return 0, to avoid users of
+	 * ksize() writing to and potentially corrupting the memory region.
+	 *
+	 * We want to perform the check before __ksize(), to avoid potentially
+	 * crashing in __ksize() due to accessing invalid metadata.
+	 */
+	if (unlikely(objp == ZERO_SIZE_PTR) || !__kasan_check_read(objp, 1))
+		return 0;
+
+	size = __ksize(objp);
 	/*
 	 * We assume that ksize callers could use whole allocated area,
 	 * so we need to unpoison this area.
-- 
cgit v1.2.3


From 2236b99d6a33df72befa7205c2d8381aca7ae701 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@linux-foundation.org>
Date: Thu, 11 Jul 2019 20:54:21 -0700
Subject: include/linux/pfn_t.h: remove pfn_t_to_virt()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

It has no callers and there is no virt_to_pfn_t().

Reported-by: Anshuman Khandual <anshuman.khandual@arm.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Jérôme Glisse <jglisse@redhat.com>
Cc: Laurent Dufour <ldufour@linux.vnet.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/pfn_t.h | 7 -------
 1 file changed, 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/pfn_t.h b/include/linux/pfn_t.h
index 3c202a11a79e..01e8037023f7 100644
--- a/include/linux/pfn_t.h
+++ b/include/linux/pfn_t.h
@@ -66,13 +66,6 @@ static inline phys_addr_t pfn_t_to_phys(pfn_t pfn)
 	return PFN_PHYS(pfn_t_to_pfn(pfn));
 }
 
-static inline void *pfn_t_to_virt(pfn_t pfn)
-{
-	if (pfn_t_has_page(pfn) && !is_device_private_page(pfn_t_to_page(pfn)))
-		return __va(pfn_t_to_phys(pfn));
-	return NULL;
-}
-
 static inline pfn_t page_to_pfn_t(struct page *page)
 {
 	return pfn_to_pfn_t(page_to_pfn(page));
-- 
cgit v1.2.3


From 442a5a9a9295bfd9b0cffd0691ef8a6ce81db7c4 Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg@mellanox.com>
Date: Thu, 11 Jul 2019 20:54:40 -0700
Subject: mm: make !CONFIG_HUGETLB_PAGE wrappers into static inlines

Instead of using defines, which loses type safety and provokes unused
variable warnings from gcc, put the constants into static inlines.

Link: http://lkml.kernel.org/r/20190522235102.GA15370@mellanox.com
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
Suggested-by: Andrew Morton <akpm@linux-foundation.org>
Reviewed-by: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Jerome Glisse <jglisse@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/hugetlb.h | 102 ++++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 86 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index edf476c8cfb9..f895a79c6f5c 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -608,22 +608,92 @@ static inline void huge_ptep_modify_prot_commit(struct vm_area_struct *vma,
 
 #else	/* CONFIG_HUGETLB_PAGE */
 struct hstate {};
-#define alloc_huge_page(v, a, r) NULL
-#define alloc_huge_page_node(h, nid) NULL
-#define alloc_huge_page_nodemask(h, preferred_nid, nmask) NULL
-#define alloc_huge_page_vma(h, vma, address) NULL
-#define alloc_bootmem_huge_page(h) NULL
-#define hstate_file(f) NULL
-#define hstate_sizelog(s) NULL
-#define hstate_vma(v) NULL
-#define hstate_inode(i) NULL
-#define page_hstate(page) NULL
-#define huge_page_size(h) PAGE_SIZE
-#define huge_page_mask(h) PAGE_MASK
-#define vma_kernel_pagesize(v) PAGE_SIZE
-#define vma_mmu_pagesize(v) PAGE_SIZE
-#define huge_page_order(h) 0
-#define huge_page_shift(h) PAGE_SHIFT
+
+static inline struct page *alloc_huge_page(struct vm_area_struct *vma,
+					   unsigned long addr,
+					   int avoid_reserve)
+{
+	return NULL;
+}
+
+static inline struct page *alloc_huge_page_node(struct hstate *h, int nid)
+{
+	return NULL;
+}
+
+static inline struct page *
+alloc_huge_page_nodemask(struct hstate *h, int preferred_nid, nodemask_t *nmask)
+{
+	return NULL;
+}
+
+static inline struct page *alloc_huge_page_vma(struct hstate *h,
+					       struct vm_area_struct *vma,
+					       unsigned long address)
+{
+	return NULL;
+}
+
+static inline int __alloc_bootmem_huge_page(struct hstate *h)
+{
+	return 0;
+}
+
+static inline struct hstate *hstate_file(struct file *f)
+{
+	return NULL;
+}
+
+static inline struct hstate *hstate_sizelog(int page_size_log)
+{
+	return NULL;
+}
+
+static inline struct hstate *hstate_vma(struct vm_area_struct *vma)
+{
+	return NULL;
+}
+
+static inline struct hstate *hstate_inode(struct inode *i)
+{
+	return NULL;
+}
+
+static inline struct hstate *page_hstate(struct page *page)
+{
+	return NULL;
+}
+
+static inline unsigned long huge_page_size(struct hstate *h)
+{
+	return PAGE_SIZE;
+}
+
+static inline unsigned long huge_page_mask(struct hstate *h)
+{
+	return PAGE_MASK;
+}
+
+static inline unsigned long vma_kernel_pagesize(struct vm_area_struct *vma)
+{
+	return PAGE_SIZE;
+}
+
+static inline unsigned long vma_mmu_pagesize(struct vm_area_struct *vma)
+{
+	return PAGE_SIZE;
+}
+
+static inline unsigned int huge_page_order(struct hstate *h)
+{
+	return 0;
+}
+
+static inline unsigned int huge_page_shift(struct hstate *h)
+{
+	return PAGE_SHIFT;
+}
+
 static inline bool hstate_is_gigantic(struct hstate *h)
 {
 	return false;
-- 
cgit v1.2.3


From 219f8a2e25f0abbe222b170a0de2fd38c22d43ad Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Thu, 11 Jul 2019 20:54:43 -0700
Subject: include/linux/mm_types.h: ifdef struct
 vm_area_struct::swap_readahead_info

The field is only used in swap code.

Link: http://lkml.kernel.org/r/20190503190500.GA30589@avx2
Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm_types.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 8ec38b11b361..1d1093474c1a 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -329,7 +329,9 @@ struct vm_area_struct {
 	struct file * vm_file;		/* File we map to (can be NULL). */
 	void * vm_private_data;		/* was vm_pte (shared mem) */
 
+#ifdef CONFIG_SWAP
 	atomic_long_t swap_readahead_info;
+#endif
 #ifndef CONFIG_MMU
 	struct vm_region *vm_region;	/* NOMMU mapping region */
 #endif
-- 
cgit v1.2.3


From 1fcf0a561cd09d7fb7f7afa2ddfe05f72f32050e Mon Sep 17 00:00:00 2001
From: Pingfan Liu <kernelfans@gmail.com>
Date: Thu, 11 Jul 2019 20:54:49 -0700
Subject: mm/page_isolation.c: change the prototype of
 undo_isolate_page_range()

undo_isolate_page_range() never fails, so there is no need for it to return
a value.

Link: http://lkml.kernel.org/r/1562075604-8979-1-git-send-email-kernelfans@gmail.com
Signed-off-by: Pingfan Liu <kernelfans@gmail.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Reviewed-by: Oscar Salvador <osalvador@suse.de>
Reviewed-by: Anshuman Khandual <anshuman.khandual@arm.com>
Cc: Qian Cai <cai@lca.pw>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/page-isolation.h | 2 +-
 mm/page_isolation.c            | 3 +--
 2 files changed, 2 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/page-isolation.h b/include/linux/page-isolation.h
index 280ae96dc4c3..1099c2fee20f 100644
--- a/include/linux/page-isolation.h
+++ b/include/linux/page-isolation.h
@@ -50,7 +50,7 @@ start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
  * Changes MIGRATE_ISOLATE to MIGRATE_MOVABLE.
  * target range is [start_pfn, end_pfn)
  */
-int
+void
 undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
 			unsigned migratetype);
 
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index e3638a5bafff..89c19c0feadb 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -230,7 +230,7 @@ undo:
 /*
  * Make isolated pages available again.
  */
-int undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
+void undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
 			    unsigned migratetype)
 {
 	unsigned long pfn;
@@ -247,7 +247,6 @@ int undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
 			continue;
 		unset_migratetype_isolate(page, migratetype);
 	}
-	return 0;
 }
 /*
  * Test all pages in the range is free(means isolated) or not.
-- 
cgit v1.2.3


From 51b176290496518d6701bc40e63f70e4b6870198 Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Thu, 11 Jul 2019 20:54:52 -0700
Subject: include/linux/vmpressure.h: use spinlock_t instead of struct spinlock

For spinlocks the type spinlock_t should be used instead of "struct
spinlock".

Use spinlock_t for spinlock's definition.

Link: http://lkml.kernel.org/r/20190704153803.12739-3-bigeasy@linutronix.de
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Reviewed-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/vmpressure.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/vmpressure.h b/include/linux/vmpressure.h
index 61e6fddfb26f..6d28bc433c1c 100644
--- a/include/linux/vmpressure.h
+++ b/include/linux/vmpressure.h
@@ -17,7 +17,7 @@ struct vmpressure {
 	unsigned long tree_scanned;
 	unsigned long tree_reclaimed;
 	/* The lock is used to keep the scanned/reclaimed above in sync. */
-	struct spinlock sr_lock;
+	spinlock_t sr_lock;
 
 	/* The list of vmpressure_event structs. */
 	struct list_head events;
-- 
cgit v1.2.3


From f445884562dd8bc51eb4136bd21f014403d1813d Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@linux-foundation.org>
Date: Thu, 11 Jul 2019 20:54:59 -0700
Subject: include/linux/pagemap.h: document trylock_page() return value

Cc: Henry Burns <henryburns@google.com>
Cc: Jonathan Adams <jwadams@google.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Mike Rapoport <rppt@linux.vnet.ibm.com>
Cc: Vitaly Wool <vitalywool@gmail.com>
Cc: Xidong Wang <wangxidong_97@163.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/pagemap.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index fe0b29bf2df7..6fd0d3aa492c 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -452,6 +452,9 @@ extern int __lock_page_or_retry(struct page *page, struct mm_struct *mm,
 				unsigned int flags);
 extern void unlock_page(struct page *page);
 
+/*
+ * Return true if the page was successfully locked
+ */
 static inline int trylock_page(struct page *page)
 {
 	page = compound_head(page);
-- 
cgit v1.2.3


From 96a2b03f281d3a3b29c27028164f43090d6495b9 Mon Sep 17 00:00:00 2001
From: Vlastimil Babka <vbabka@suse.cz>
Date: Thu, 11 Jul 2019 20:55:06 -0700
Subject: mm, debug_pagealloc: use static keys to enable debugging

Patch series "debug_pagealloc improvements".

I have been recently debugging some pcplist corruptions, where it would be
useful to perform struct page checks immediately as pages are allocated
from and freed to pcplists, which is now only possible by rebuilding the
kernel with CONFIG_DEBUG_VM (details in Patch 2 changelog).

To make this kind of debugging simpler in future on a distro kernel, I
have improved CONFIG_DEBUG_PAGEALLOC so that it has even smaller overhead
when not enabled at boot time (Patch 1) and also when enabled (Patch 3),
and extended it to perform the struct page checks more often when enabled
(Patch 2).  Now it can be configured in when building a distro kernel
without extra overhead, and debugging page use after free or double free
can be enabled simply by rebooting with debug_pagealloc=on.

This patch (of 3):

CONFIG_DEBUG_PAGEALLOC has been redesigned by 031bc5743f15
("mm/debug-pagealloc: make debug-pagealloc boottime configurable") to
allow being always enabled in a distro kernel, but only perform its
expensive functionality when booted with debug_pagealloc=on.  We can
further reduce the overhead when not boot-enabled (including page
allocator fast paths) using static keys.  This patch introduces one for
debug_pagealloc core functionality, and another for the optional guard
page functionality (enabled by booting with debug_guardpage_minorder=X).

Link: http://lkml.kernel.org/r/20190603143451.27353-2-vbabka@suse.cz
Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
Reviewed-by: Andrew Morton <akpm@linux-foundation.org>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Mel Gorman <mgorman@techsingularity.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm.h | 15 +++++++++++----
 mm/page_alloc.c    | 23 +++++++++++++++++------
 2 files changed, 28 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 0a6dae2f2b84..2c2e98cae2d1 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2701,11 +2701,18 @@ static inline void kernel_poison_pages(struct page *page, int numpages,
 					int enable) { }
 #endif
 
-extern bool _debug_pagealloc_enabled;
+#ifdef CONFIG_DEBUG_PAGEALLOC_ENABLE_DEFAULT
+DECLARE_STATIC_KEY_TRUE(_debug_pagealloc_enabled);
+#else
+DECLARE_STATIC_KEY_FALSE(_debug_pagealloc_enabled);
+#endif
 
 static inline bool debug_pagealloc_enabled(void)
 {
-	return IS_ENABLED(CONFIG_DEBUG_PAGEALLOC) && _debug_pagealloc_enabled;
+	if (!IS_ENABLED(CONFIG_DEBUG_PAGEALLOC))
+		return false;
+
+	return static_branch_unlikely(&_debug_pagealloc_enabled);
 }
 
 #if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_ARCH_HAS_SET_DIRECT_MAP)
@@ -2859,7 +2866,7 @@ extern struct page_ext_operations debug_guardpage_ops;
 
 #ifdef CONFIG_DEBUG_PAGEALLOC
 extern unsigned int _debug_guardpage_minorder;
-extern bool _debug_guardpage_enabled;
+DECLARE_STATIC_KEY_FALSE(_debug_guardpage_enabled);
 
 static inline unsigned int debug_guardpage_minorder(void)
 {
@@ -2868,7 +2875,7 @@ static inline unsigned int debug_guardpage_minorder(void)
 
 static inline bool debug_guardpage_enabled(void)
 {
-	return _debug_guardpage_enabled;
+	return static_branch_unlikely(&_debug_guardpage_enabled);
 }
 
 static inline bool page_is_guard(struct page *page)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 060303496094..3180d79be20c 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -644,16 +644,27 @@ void prep_compound_page(struct page *page, unsigned int order)
 
 #ifdef CONFIG_DEBUG_PAGEALLOC
 unsigned int _debug_guardpage_minorder;
-bool _debug_pagealloc_enabled __read_mostly
-			= IS_ENABLED(CONFIG_DEBUG_PAGEALLOC_ENABLE_DEFAULT);
+
+#ifdef CONFIG_DEBUG_PAGEALLOC_ENABLE_DEFAULT
+DEFINE_STATIC_KEY_TRUE(_debug_pagealloc_enabled);
+#else
+DEFINE_STATIC_KEY_FALSE(_debug_pagealloc_enabled);
+#endif
 EXPORT_SYMBOL(_debug_pagealloc_enabled);
-bool _debug_guardpage_enabled __read_mostly;
+
+DEFINE_STATIC_KEY_FALSE(_debug_guardpage_enabled);
 
 static int __init early_debug_pagealloc(char *buf)
 {
-	if (!buf)
+	bool enable = false;
+
+	if (kstrtobool(buf, &enable))
 		return -EINVAL;
-	return kstrtobool(buf, &_debug_pagealloc_enabled);
+
+	if (enable)
+		static_branch_enable(&_debug_pagealloc_enabled);
+
+	return 0;
 }
 early_param("debug_pagealloc", early_debug_pagealloc);
 
@@ -677,7 +688,7 @@ static void init_debug_guardpage(void)
 	if (!debug_guardpage_minorder())
 		return;
 
-	_debug_guardpage_enabled = true;
+	static_branch_enable(&_debug_guardpage_enabled);
 }
 
 struct page_ext_operations debug_guardpage_ops = {
-- 
cgit v1.2.3


From 3972f6bb1c6ae1d32dcf2e4ff635d24b77f26dcb Mon Sep 17 00:00:00 2001
From: Vlastimil Babka <vbabka@suse.cz>
Date: Thu, 11 Jul 2019 20:55:13 -0700
Subject: mm, debug_pagealloc: use a page type instead of page_ext flag

When debug_pagealloc is enabled, we currently allocate the page_ext
array to mark guard pages with the PAGE_EXT_DEBUG_GUARD flag.  Now that
we have the page_type field in struct page, we can use that instead, as
guard pages are neither PageSlab nor mapped to userspace.  This reduces
memory overhead when debug_pagealloc is enabled and there are no other
features requiring the page_ext array.

Link: http://lkml.kernel.org/r/20190603143451.27353-4-vbabka@suse.cz
Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Michal Hocko <mhocko@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/admin-guide/kernel-parameters.txt | 10 +++----
 include/linux/mm.h                              | 10 +------
 include/linux/page-flags.h                      |  6 ++++
 include/linux/page_ext.h                        |  1 -
 mm/Kconfig.debug                                |  1 -
 mm/page_alloc.c                                 | 40 ++++---------------------
 mm/page_ext.c                                   |  3 --
 7 files changed, 17 insertions(+), 54 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index f1c433daef6b..aa4e7e7b87c2 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -805,12 +805,10 @@
 			tracking down these problems.
 
 	debug_pagealloc=
-			[KNL] When CONFIG_DEBUG_PAGEALLOC is set, this
-			parameter enables the feature at boot time. In
-			default, it is disabled. We can avoid allocating huge
-			chunk of memory for debug pagealloc if we don't enable
-			it at boot time and the system will work mostly same
-			with the kernel built without CONFIG_DEBUG_PAGEALLOC.
+			[KNL] When CONFIG_DEBUG_PAGEALLOC is set, this parameter
+			enables the feature at boot time. By default, it is
+			disabled and the system will work mostly the same as a
+			kernel built without CONFIG_DEBUG_PAGEALLOC.
 			on: enable the feature
 
 	debugpat	[X86] Enable PAT debugging
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 2c2e98cae2d1..cb8d413d635e 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2862,8 +2862,6 @@ extern long copy_huge_page_from_user(struct page *dst_page,
 				bool allow_pagefault);
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */
 
-extern struct page_ext_operations debug_guardpage_ops;
-
 #ifdef CONFIG_DEBUG_PAGEALLOC
 extern unsigned int _debug_guardpage_minorder;
 DECLARE_STATIC_KEY_FALSE(_debug_guardpage_enabled);
@@ -2880,16 +2878,10 @@ static inline bool debug_guardpage_enabled(void)
 
 static inline bool page_is_guard(struct page *page)
 {
-	struct page_ext *page_ext;
-
 	if (!debug_guardpage_enabled())
 		return false;
 
-	page_ext = lookup_page_ext(page);
-	if (unlikely(!page_ext))
-		return false;
-
-	return test_bit(PAGE_EXT_DEBUG_GUARD, &page_ext->flags);
+	return PageGuard(page);
 }
 #else
 static inline unsigned int debug_guardpage_minorder(void) { return 0; }
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 9f8712a4b1a5..b848517da64c 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -703,6 +703,7 @@ PAGEFLAG_FALSE(DoubleMap)
 #define PG_offline	0x00000100
 #define PG_kmemcg	0x00000200
 #define PG_table	0x00000400
+#define PG_guard	0x00000800
 
 #define PageType(page, flag)						\
 	((page->page_type & (PAGE_TYPE_BASE | flag)) == PAGE_TYPE_BASE)
@@ -754,6 +755,11 @@ PAGE_TYPE_OPS(Kmemcg, kmemcg)
  */
 PAGE_TYPE_OPS(Table, table)
 
+/*
+ * Marks guardpages used with debug_pagealloc.
+ */
+PAGE_TYPE_OPS(Guard, guard)
+
 extern bool is_free_buddy_page(struct page *page);
 
 __PAGEFLAG(Isolated, isolated, PF_ANY);
diff --git a/include/linux/page_ext.h b/include/linux/page_ext.h
index f84f167ec04c..09592951725c 100644
--- a/include/linux/page_ext.h
+++ b/include/linux/page_ext.h
@@ -17,7 +17,6 @@ struct page_ext_operations {
 #ifdef CONFIG_PAGE_EXTENSION
 
 enum page_ext_flags {
-	PAGE_EXT_DEBUG_GUARD,
 	PAGE_EXT_OWNER,
 #if defined(CONFIG_IDLE_PAGE_TRACKING) && !defined(CONFIG_64BIT)
 	PAGE_EXT_YOUNG,
diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug
index a35ab6c55192..82b6a20898bd 100644
--- a/mm/Kconfig.debug
+++ b/mm/Kconfig.debug
@@ -12,7 +12,6 @@ config DEBUG_PAGEALLOC
 	bool "Debug page memory allocations"
 	depends on DEBUG_KERNEL
 	depends on !HIBERNATION || ARCH_SUPPORTS_DEBUG_PAGEALLOC && !PPC && !SPARC
-	select PAGE_EXTENSION
 	select PAGE_POISONING if !ARCH_SUPPORTS_DEBUG_PAGEALLOC
 	---help---
 	  Unmap pages from the kernel linear mapping after free_pages().
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 26b6ad8b065d..ae56e8feec0c 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -50,7 +50,6 @@
 #include <linux/backing-dev.h>
 #include <linux/fault-inject.h>
 #include <linux/page-isolation.h>
-#include <linux/page_ext.h>
 #include <linux/debugobjects.h>
 #include <linux/kmemleak.h>
 #include <linux/compaction.h>
@@ -668,18 +667,6 @@ static int __init early_debug_pagealloc(char *buf)
 }
 early_param("debug_pagealloc", early_debug_pagealloc);
 
-static bool need_debug_guardpage(void)
-{
-	/* If we don't use debug_pagealloc, we don't need guard page */
-	if (!debug_pagealloc_enabled())
-		return false;
-
-	if (!debug_guardpage_minorder())
-		return false;
-
-	return true;
-}
-
 static void init_debug_guardpage(void)
 {
 	if (!debug_pagealloc_enabled())
@@ -691,11 +678,6 @@ static void init_debug_guardpage(void)
 	static_branch_enable(&_debug_guardpage_enabled);
 }
 
-struct page_ext_operations debug_guardpage_ops = {
-	.need = need_debug_guardpage,
-	.init = init_debug_guardpage,
-};
-
 static int __init debug_guardpage_minorder_setup(char *buf)
 {
 	unsigned long res;
@@ -713,20 +695,13 @@ early_param("debug_guardpage_minorder", debug_guardpage_minorder_setup);
 static inline bool set_page_guard(struct zone *zone, struct page *page,
 				unsigned int order, int migratetype)
 {
-	struct page_ext *page_ext;
-
 	if (!debug_guardpage_enabled())
 		return false;
 
 	if (order >= debug_guardpage_minorder())
 		return false;
 
-	page_ext = lookup_page_ext(page);
-	if (unlikely(!page_ext))
-		return false;
-
-	__set_bit(PAGE_EXT_DEBUG_GUARD, &page_ext->flags);
-
+	__SetPageGuard(page);
 	INIT_LIST_HEAD(&page->lru);
 	set_page_private(page, order);
 	/* Guard pages are not available for any usage */
@@ -738,23 +713,16 @@ static inline bool set_page_guard(struct zone *zone, struct page *page,
 static inline void clear_page_guard(struct zone *zone, struct page *page,
 				unsigned int order, int migratetype)
 {
-	struct page_ext *page_ext;
-
 	if (!debug_guardpage_enabled())
 		return;
 
-	page_ext = lookup_page_ext(page);
-	if (unlikely(!page_ext))
-		return;
-
-	__clear_bit(PAGE_EXT_DEBUG_GUARD, &page_ext->flags);
+	__ClearPageGuard(page);
 
 	set_page_private(page, 0);
 	if (!is_migrate_isolate(migratetype))
 		__mod_zone_freepage_state(zone, (1 << order), migratetype);
 }
 #else
-struct page_ext_operations debug_guardpage_ops;
 static inline bool set_page_guard(struct zone *zone, struct page *page,
 			unsigned int order, int migratetype) { return false; }
 static inline void clear_page_guard(struct zone *zone, struct page *page,
@@ -1930,6 +1898,10 @@ void __init page_alloc_init_late(void)
 
 	for_each_populated_zone(zone)
 		set_zone_contiguous(zone);
+
+#ifdef CONFIG_DEBUG_PAGEALLOC
+	init_debug_guardpage();
+#endif
 }
 
 #ifdef CONFIG_CMA
diff --git a/mm/page_ext.c b/mm/page_ext.c
index d8f1aca4ad43..5f5769c7db3b 100644
--- a/mm/page_ext.c
+++ b/mm/page_ext.c
@@ -59,9 +59,6 @@
  */
 
 static struct page_ext_operations *page_ext_ops[] = {
-#ifdef CONFIG_DEBUG_PAGEALLOC
-	&debug_guardpage_ops,
-#endif
 #ifdef CONFIG_PAGE_OWNER
 	&page_owner_ops,
 #endif
-- 
cgit v1.2.3


From 6c45b454191b330c8bc21d1ed3cf39bb6da1a4eb Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 11 Jul 2019 20:55:20 -0700
Subject: mm/filemap: don't cast ->readpage to filler_t for do_read_cache_page

We can just pass a NULL filler and do the right thing inside of
do_read_cache_page based on the NULL parameter.

Link: http://lkml.kernel.org/r/20190520055731.24538-3-hch@lst.de
Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Kees Cook <keescook@chromium.org>
Cc: Nick Desaulniers <ndesaulniers@google.com>
Cc: Sami Tolvanen <samitolvanen@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/pagemap.h |  3 +--
 mm/filemap.c            | 10 ++++++----
 2 files changed, 7 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 6fd0d3aa492c..c7552459a15f 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -383,8 +383,7 @@ extern int read_cache_pages(struct address_space *mapping,
 static inline struct page *read_mapping_page(struct address_space *mapping,
 				pgoff_t index, void *data)
 {
-	filler_t *filler = (filler_t *)mapping->a_ops->readpage;
-	return read_cache_page(mapping, index, filler, data);
+	return read_cache_page(mapping, index, NULL, data);
 }
 
 /*
diff --git a/mm/filemap.c b/mm/filemap.c
index d6f7596f148f..1e5e006b8557 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2825,7 +2825,11 @@ repeat:
 		}
 
 filler:
-		err = filler(data, page);
+		if (filler)
+			err = filler(data, page);
+		else
+			err = mapping->a_ops->readpage(data, page);
+
 		if (err < 0) {
 			put_page(page);
 			return ERR_PTR(err);
@@ -2937,9 +2941,7 @@ struct page *read_cache_page_gfp(struct address_space *mapping,
 				pgoff_t index,
 				gfp_t gfp)
 {
-	filler_t *filler = (filler_t *)mapping->a_ops->readpage;
-
-	return do_read_cache_page(mapping, index, filler, NULL, gfp);
+	return do_read_cache_page(mapping, index, NULL, NULL, gfp);
 }
 EXPORT_SYMBOL(read_cache_page_gfp);
 
-- 
cgit v1.2.3


From eb085574a7526c4375965c5fbf7e5b0c19cdd336 Mon Sep 17 00:00:00 2001
From: Huang Ying <ying.huang@intel.com>
Date: Thu, 11 Jul 2019 20:55:33 -0700
Subject: mm, swap: fix race between swapoff and some swap operations
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When swapin is performed, after getting the swap entry information from
the page table, system will swap in the swap entry, without any lock held
to prevent the swap device from being swapoff.  This may cause the race
like below,

CPU 1				CPU 2
-----				-----
				do_swap_page
				  swapin_readahead
				    __read_swap_cache_async
swapoff				      swapcache_prepare
  p->swap_map = NULL		        __swap_duplicate
					  p->swap_map[?] /* !!! NULL pointer access */

Because swapoff is usually done only at system shutdown, the race may
not hit many people in practice.  But it is still a race that needs fixing.

To fix the race, get_swap_device() is added to check whether the specified
swap entry is valid in its swap device.  If so, it will keep the swap
entry valid via preventing the swap device from being swapoff, until
put_swap_device() is called.

Because swapoff() is a very rare code path, to make the normal path run as
fast as possible, rcu_read_lock/unlock() and synchronize_rcu() instead of
a reference count are used to implement get/put_swap_device().  From
get_swap_device() to put_swap_device(), RCU reader side is locked, so
synchronize_rcu() in swapoff() will wait until put_swap_device() is
called.

In addition to swap_map, cluster_info, etc.  data structure in the struct
swap_info_struct, the swap cache radix tree will be freed after swapoff,
so this patch fixes the race between swap cache looking up and swapoff
too.

Races between some other swap cache usages and swapoff are fixed too via
calling synchronize_rcu() between clearing PageSwapCache() and freeing
swap cache data structure.

Another possible method to fix this is to use preempt_off() +
stop_machine() to prevent the swap device from being swapoff when its data
structure is being accessed.  The overhead in hot-path of both methods is
similar.  The advantages of RCU based method are,

1. stop_machine() may disturb the normal execution code path on other
   CPUs.

2. File cache uses RCU to protect its radix tree.  If the similar
   mechanism is used for swap cache too, it is easier to share code
   between them.

3. RCU is used to protect swap cache in total_swapcache_pages() and
   exit_swap_address_space() already.  The two mechanisms can be
   merged to simplify the logic.

Link: http://lkml.kernel.org/r/20190522015423.14418-1-ying.huang@intel.com
Fixes: 235b62176712 ("mm/swap: add cluster lock")
Signed-off-by: "Huang, Ying" <ying.huang@intel.com>
Reviewed-by: Andrea Parri <andrea.parri@amarulasolutions.com>
Not-nacked-by: Hugh Dickins <hughd@google.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Daniel Jordan <daniel.m.jordan@oracle.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Tim Chen <tim.c.chen@linux.intel.com>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Jérôme Glisse <jglisse@redhat.com>
Cc: Yang Shi <yang.shi@linux.alibaba.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Dave Jiang <dave.jiang@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/swap.h |  13 ++++-
 mm/memory.c          |   2 +-
 mm/swap_state.c      |  16 +++++-
 mm/swapfile.c        | 154 ++++++++++++++++++++++++++++++++++++++++-----------
 4 files changed, 146 insertions(+), 39 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/swap.h b/include/linux/swap.h
index 4bfb5c4ac108..6358a6185634 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -175,8 +175,9 @@ enum {
 	SWP_PAGE_DISCARD = (1 << 10),	/* freed swap page-cluster discards */
 	SWP_STABLE_WRITES = (1 << 11),	/* no overwrite PG_writeback pages */
 	SWP_SYNCHRONOUS_IO = (1 << 12),	/* synchronous IO is efficient */
+	SWP_VALID	= (1 << 13),	/* swap is valid to be operated on? */
 					/* add others here before... */
-	SWP_SCANNING	= (1 << 13),	/* refcount in scan_swap_map */
+	SWP_SCANNING	= (1 << 14),	/* refcount in scan_swap_map */
 };
 
 #define SWAP_CLUSTER_MAX 32UL
@@ -460,7 +461,7 @@ extern unsigned int count_swap_pages(int, int);
 extern sector_t map_swap_page(struct page *, struct block_device **);
 extern sector_t swapdev_block(int, pgoff_t);
 extern int page_swapcount(struct page *);
-extern int __swap_count(struct swap_info_struct *si, swp_entry_t entry);
+extern int __swap_count(swp_entry_t entry);
 extern int __swp_swapcount(swp_entry_t entry);
 extern int swp_swapcount(swp_entry_t entry);
 extern struct swap_info_struct *page_swap_info(struct page *);
@@ -470,6 +471,12 @@ extern int try_to_free_swap(struct page *);
 struct backing_dev_info;
 extern int init_swap_address_space(unsigned int type, unsigned long nr_pages);
 extern void exit_swap_address_space(unsigned int type);
+extern struct swap_info_struct *get_swap_device(swp_entry_t entry);
+
+static inline void put_swap_device(struct swap_info_struct *si)
+{
+	rcu_read_unlock();
+}
 
 #else /* CONFIG_SWAP */
 
@@ -576,7 +583,7 @@ static inline int page_swapcount(struct page *page)
 	return 0;
 }
 
-static inline int __swap_count(struct swap_info_struct *si, swp_entry_t entry)
+static inline int __swap_count(swp_entry_t entry)
 {
 	return 0;
 }
diff --git a/mm/memory.c b/mm/memory.c
index ced4bedc660d..b47e4e56448a 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2805,7 +2805,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
 		struct swap_info_struct *si = swp_swap_info(entry);
 
 		if (si->flags & SWP_SYNCHRONOUS_IO &&
-				__swap_count(si, entry) == 1) {
+				__swap_count(entry) == 1) {
 			/* skip swapcache */
 			page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma,
 							vmf->address);
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 85245fdec8d9..61453f1faf72 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -310,8 +310,13 @@ struct page *lookup_swap_cache(swp_entry_t entry, struct vm_area_struct *vma,
 			       unsigned long addr)
 {
 	struct page *page;
+	struct swap_info_struct *si;
 
+	si = get_swap_device(entry);
+	if (!si)
+		return NULL;
 	page = find_get_page(swap_address_space(entry), swp_offset(entry));
+	put_swap_device(si);
 
 	INC_CACHE_INFO(find_total);
 	if (page) {
@@ -354,8 +359,8 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
 			struct vm_area_struct *vma, unsigned long addr,
 			bool *new_page_allocated)
 {
-	struct page *found_page, *new_page = NULL;
-	struct address_space *swapper_space = swap_address_space(entry);
+	struct page *found_page = NULL, *new_page = NULL;
+	struct swap_info_struct *si;
 	int err;
 	*new_page_allocated = false;
 
@@ -365,7 +370,12 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
 		 * called after lookup_swap_cache() failed, re-calling
 		 * that would confuse statistics.
 		 */
-		found_page = find_get_page(swapper_space, swp_offset(entry));
+		si = get_swap_device(entry);
+		if (!si)
+			break;
+		found_page = find_get_page(swap_address_space(entry),
+					   swp_offset(entry));
+		put_swap_device(si);
 		if (found_page)
 			break;
 
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 596ac98051c5..dbab16ddefa6 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1079,12 +1079,11 @@ fail:
 static struct swap_info_struct *__swap_info_get(swp_entry_t entry)
 {
 	struct swap_info_struct *p;
-	unsigned long offset, type;
+	unsigned long offset;
 
 	if (!entry.val)
 		goto out;
-	type = swp_type(entry);
-	p = swap_type_to_swap_info(type);
+	p = swp_swap_info(entry);
 	if (!p)
 		goto bad_nofile;
 	if (!(p->flags & SWP_USED))
@@ -1187,6 +1186,69 @@ static unsigned char __swap_entry_free_locked(struct swap_info_struct *p,
 	return usage;
 }
 
+/*
+ * Check whether swap entry is valid in the swap device.  If so,
+ * return pointer to swap_info_struct, and keep the swap entry valid
+ * via preventing the swap device from being swapoff, until
+ * put_swap_device() is called.  Otherwise return NULL.
+ *
+ * The entirety of the RCU read critical section must come before the
+ * return from or after the call to synchronize_rcu() in
+ * enable_swap_info() or swapoff().  So if "si->flags & SWP_VALID" is
+ * true, the si->map, si->cluster_info, etc. must be valid in the
+ * critical section.
+ *
+ * Notice that swapoff or swapoff+swapon can still happen before the
+ * rcu_read_lock() in get_swap_device() or after the rcu_read_unlock()
+ * in put_swap_device() if there isn't any other way to prevent
+ * swapoff, such as page lock, page table lock, etc.  The caller must
+ * be prepared for that.  For example, the following situation is
+ * possible.
+ *
+ *   CPU1				CPU2
+ *   do_swap_page()
+ *     ...				swapoff+swapon
+ *     __read_swap_cache_async()
+ *       swapcache_prepare()
+ *         __swap_duplicate()
+ *           // check swap_map
+ *     // verify PTE not changed
+ *
+ * In __swap_duplicate(), the swap_map need to be checked before
+ * changing partly because the specified swap entry may be for another
+ * swap device which has been swapoff.  And in do_swap_page(), after
+ * the page is read from the swap device, the PTE is verified not
+ * changed with the page table locked to check whether the swap device
+ * has been swapoff or swapoff+swapon.
+ */
+struct swap_info_struct *get_swap_device(swp_entry_t entry)
+{
+	struct swap_info_struct *si;
+	unsigned long offset;
+
+	if (!entry.val)
+		goto out;
+	si = swp_swap_info(entry);
+	if (!si)
+		goto bad_nofile;
+
+	rcu_read_lock();
+	if (!(si->flags & SWP_VALID))
+		goto unlock_out;
+	offset = swp_offset(entry);
+	if (offset >= si->max)
+		goto unlock_out;
+
+	return si;
+bad_nofile:
+	pr_err("%s: %s%08lx\n", __func__, Bad_file, entry.val);
+out:
+	return NULL;
+unlock_out:
+	rcu_read_unlock();
+	return NULL;
+}
+
 static unsigned char __swap_entry_free(struct swap_info_struct *p,
 				       swp_entry_t entry, unsigned char usage)
 {
@@ -1358,11 +1420,18 @@ int page_swapcount(struct page *page)
 	return count;
 }
 
-int __swap_count(struct swap_info_struct *si, swp_entry_t entry)
+int __swap_count(swp_entry_t entry)
 {
+	struct swap_info_struct *si;
 	pgoff_t offset = swp_offset(entry);
+	int count = 0;
 
-	return swap_count(si->swap_map[offset]);
+	si = get_swap_device(entry);
+	if (si) {
+		count = swap_count(si->swap_map[offset]);
+		put_swap_device(si);
+	}
+	return count;
 }
 
 static int swap_swapcount(struct swap_info_struct *si, swp_entry_t entry)
@@ -1387,9 +1456,11 @@ int __swp_swapcount(swp_entry_t entry)
 	int count = 0;
 	struct swap_info_struct *si;
 
-	si = __swap_info_get(entry);
-	if (si)
+	si = get_swap_device(entry);
+	if (si) {
 		count = swap_swapcount(si, entry);
+		put_swap_device(si);
+	}
 	return count;
 }
 
@@ -2335,9 +2406,9 @@ static int swap_node(struct swap_info_struct *p)
 	return bdev ? bdev->bd_disk->node_id : NUMA_NO_NODE;
 }
 
-static void _enable_swap_info(struct swap_info_struct *p, int prio,
-				unsigned char *swap_map,
-				struct swap_cluster_info *cluster_info)
+static void setup_swap_info(struct swap_info_struct *p, int prio,
+			    unsigned char *swap_map,
+			    struct swap_cluster_info *cluster_info)
 {
 	int i;
 
@@ -2362,7 +2433,11 @@ static void _enable_swap_info(struct swap_info_struct *p, int prio,
 	}
 	p->swap_map = swap_map;
 	p->cluster_info = cluster_info;
-	p->flags |= SWP_WRITEOK;
+}
+
+static void _enable_swap_info(struct swap_info_struct *p)
+{
+	p->flags |= SWP_WRITEOK | SWP_VALID;
 	atomic_long_add(p->pages, &nr_swap_pages);
 	total_swap_pages += p->pages;
 
@@ -2389,7 +2464,17 @@ static void enable_swap_info(struct swap_info_struct *p, int prio,
 	frontswap_init(p->type, frontswap_map);
 	spin_lock(&swap_lock);
 	spin_lock(&p->lock);
-	 _enable_swap_info(p, prio, swap_map, cluster_info);
+	setup_swap_info(p, prio, swap_map, cluster_info);
+	spin_unlock(&p->lock);
+	spin_unlock(&swap_lock);
+	/*
+	 * Guarantee swap_map, cluster_info, etc. fields are valid
+	 * between get/put_swap_device() if SWP_VALID bit is set
+	 */
+	synchronize_rcu();
+	spin_lock(&swap_lock);
+	spin_lock(&p->lock);
+	_enable_swap_info(p);
 	spin_unlock(&p->lock);
 	spin_unlock(&swap_lock);
 }
@@ -2398,7 +2483,8 @@ static void reinsert_swap_info(struct swap_info_struct *p)
 {
 	spin_lock(&swap_lock);
 	spin_lock(&p->lock);
-	_enable_swap_info(p, p->prio, p->swap_map, p->cluster_info);
+	setup_swap_info(p, p->prio, p->swap_map, p->cluster_info);
+	_enable_swap_info(p);
 	spin_unlock(&p->lock);
 	spin_unlock(&swap_lock);
 }
@@ -2501,6 +2587,17 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
 
 	reenable_swap_slots_cache_unlock();
 
+	spin_lock(&swap_lock);
+	spin_lock(&p->lock);
+	p->flags &= ~SWP_VALID;		/* mark swap device as invalid */
+	spin_unlock(&p->lock);
+	spin_unlock(&swap_lock);
+	/*
+	 * wait for swap operations protected by get/put_swap_device()
+	 * to complete
+	 */
+	synchronize_rcu();
+
 	flush_work(&p->discard_work);
 
 	destroy_swap_extents(p);
@@ -3265,17 +3362,11 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage)
 	unsigned char has_cache;
 	int err = -EINVAL;
 
-	if (non_swap_entry(entry))
-		goto out;
-
-	p = swp_swap_info(entry);
+	p = get_swap_device(entry);
 	if (!p)
-		goto bad_file;
-
-	offset = swp_offset(entry);
-	if (unlikely(offset >= p->max))
 		goto out;
 
+	offset = swp_offset(entry);
 	ci = lock_cluster_or_swap_info(p, offset);
 
 	count = p->swap_map[offset];
@@ -3321,11 +3412,9 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage)
 unlock_out:
 	unlock_cluster_or_swap_info(p, ci);
 out:
+	if (p)
+		put_swap_device(p);
 	return err;
-
-bad_file:
-	pr_err("swap_dup: %s%08lx\n", Bad_file, entry.val);
-	goto out;
 }
 
 /*
@@ -3417,6 +3506,7 @@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask)
 	struct page *list_page;
 	pgoff_t offset;
 	unsigned char count;
+	int ret = 0;
 
 	/*
 	 * When debugging, it's easier to use __GFP_ZERO here; but it's better
@@ -3424,15 +3514,15 @@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask)
 	 */
 	page = alloc_page(gfp_mask | __GFP_HIGHMEM);
 
-	si = swap_info_get(entry);
+	si = get_swap_device(entry);
 	if (!si) {
 		/*
 		 * An acceptable race has occurred since the failing
-		 * __swap_duplicate(): the swap entry has been freed,
-		 * perhaps even the whole swap_map cleared for swapoff.
+		 * __swap_duplicate(): the swap device may be swapoff
 		 */
 		goto outer;
 	}
+	spin_lock(&si->lock);
 
 	offset = swp_offset(entry);
 
@@ -3450,9 +3540,8 @@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask)
 	}
 
 	if (!page) {
-		unlock_cluster(ci);
-		spin_unlock(&si->lock);
-		return -ENOMEM;
+		ret = -ENOMEM;
+		goto out;
 	}
 
 	/*
@@ -3504,10 +3593,11 @@ out_unlock_cont:
 out:
 	unlock_cluster(ci);
 	spin_unlock(&si->lock);
+	put_swap_device(si);
 outer:
 	if (page)
 		__free_page(page);
-	return 0;
+	return ret;
 }
 
 /*
-- 
cgit v1.2.3


From 4efaceb1c5f8136d5fec3f26549d294b8e898bd7 Mon Sep 17 00:00:00 2001
From: Aaron Lu <ziqian.lzq@antfin.com>
Date: Thu, 11 Jul 2019 20:55:41 -0700
Subject: mm, swap: use rbtree for swap_extent

swap_extent is used to map swap page offset to backing device's block
offset.  For a continuous block range, one swap_extent is used and all
these swap_extents are managed in a linked list.

These swap_extents are used by map_swap_entry() during swap's read and
write path.  To find out the backing device's block offset for a page
offset, the swap_extent list will be traversed linearly, with
curr_swap_extent being used as a cache to speed up the search.

This works well as long as swap_extents are not huge or when the number
of processes that access swap device are few, but when the swap device
has many extents and there are a number of processes accessing the swap
device concurrently, it can be a problem.  On one of our servers, the
disk's remaining size is tight:

  $df -h
  Filesystem      Size  Used Avail Use% Mounted on
  ... ...
  /dev/nvme0n1p1  1.8T  1.3T  504G  72% /home/t4

When creating a 80G swapfile there, there are as many as 84656 swap
extents.  The end result is, the kernel spends about 30% of its time in
map_swap_entry() and swap throughput is only 70MB/s.

As a comparison, when I used smaller sized swapfile, like 4G whose
swap_extent dropped to 2000, swap throughput is back to 400-500MB/s and
map_swap_entry() is about 3%.

One downside of using rbtree for swap_extent is, 'struct rbtree' takes
24 bytes while 'struct list_head' takes 16 bytes, that's 8 bytes more
for each swap_extent.  For a swapfile that has 80k swap_extents, that
means 625KiB more memory consumed.

Test:

Since it's not possible to reboot that server, I can not test this patch
directly there.  Instead, I tested it on another server with NVMe disk.

I created a 20G swapfile on an NVMe backed XFS fs.  By default, the
filesystem is quite clean and the created swapfile has only 2 extents.
Testing vanilla and this patch shows no obvious performance difference
when swapfile is not fragmented.

To see the patch's effects, I used some tweaks to manually fragment the
swapfile by breaking the extent at 1M boundary.  This made the swapfile
have 20K extents.

  nr_task=4
  kernel   swapout(KB/s) map_swap_entry(perf)  swapin(KB/s) map_swap_entry(perf)
  vanilla  165191           90.77%             171798          90.21%
  patched  858993 +420%      2.16%             715827 +317%     0.77%

  nr_task=8
  kernel   swapout(KB/s) map_swap_entry(perf)  swapin(KB/s) map_swap_entry(perf)
  vanilla  306783           92.19%             318145          87.76%
  patched  954437 +211%      2.35%            1073741 +237%     1.57%

swapout: the throughput of swap out, in KB/s, higher is better
1st map_swap_entry: cpu cycles percent sampled by perf
swapin: the throughput of swap in, in KB/s, higher is better
2nd map_swap_entry: cpu cycles percent sampled by perf

nr_task=1 doesn't show any difference; this is because curr_swap_extent
can be effectively used to cache the correct swap extent for a single-task
workload.

[akpm@linux-foundation.org: s/BUG_ON(1)/BUG()/]
Link: http://lkml.kernel.org/r/20190523142404.GA181@aaronlu
Signed-off-by: Aaron Lu <ziqian.lzq@antfin.com>
Cc: Huang Ying <ying.huang@intel.com>
Cc: Hugh Dickins <hughd@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/swap.h |   5 +-
 mm/page_io.c         |   2 +-
 mm/swapfile.c        | 137 ++++++++++++++++++++++++++++-----------------------
 3 files changed, 78 insertions(+), 66 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/swap.h b/include/linux/swap.h
index 6358a6185634..de2c67a33b7e 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -148,7 +148,7 @@ struct zone;
  * We always assume that blocks are of size PAGE_SIZE.
  */
 struct swap_extent {
-	struct list_head list;
+	struct rb_node rb_node;
 	pgoff_t start_page;
 	pgoff_t nr_pages;
 	sector_t start_block;
@@ -248,8 +248,7 @@ struct swap_info_struct {
 	unsigned int cluster_next;	/* likely index for next allocation */
 	unsigned int cluster_nr;	/* countdown to next cluster search */
 	struct percpu_cluster __percpu *percpu_cluster; /* per cpu's swap location */
-	struct swap_extent *curr_swap_extent;
-	struct swap_extent first_swap_extent;
+	struct rb_root swap_extent_root;/* root of the swap extent rbtree */
 	struct block_device *bdev;	/* swap device or bdev of swap file */
 	struct file *swap_file;		/* seldom referenced */
 	unsigned int old_block_size;	/* seldom referenced */
diff --git a/mm/page_io.c b/mm/page_io.c
index a39aac2f8c8d..24ee600f9131 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -163,7 +163,7 @@ int generic_swapfile_activate(struct swap_info_struct *sis,
 	blocks_per_page = PAGE_SIZE >> blkbits;
 
 	/*
-	 * Map all the blocks into the extent list.  This code doesn't try
+	 * Map all the blocks into the extent tree.  This code doesn't try
 	 * to be very smart.
 	 */
 	probe_block = 0;
diff --git a/mm/swapfile.c b/mm/swapfile.c
index dbab16ddefa6..0789a762ce2f 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -152,6 +152,18 @@ static int __try_to_reclaim_swap(struct swap_info_struct *si,
 	return ret;
 }
 
+static inline struct swap_extent *first_se(struct swap_info_struct *sis)
+{
+	struct rb_node *rb = rb_first(&sis->swap_extent_root);
+	return rb_entry(rb, struct swap_extent, rb_node);
+}
+
+static inline struct swap_extent *next_se(struct swap_extent *se)
+{
+	struct rb_node *rb = rb_next(&se->rb_node);
+	return rb ? rb_entry(rb, struct swap_extent, rb_node) : NULL;
+}
+
 /*
  * swapon tell device that all the old swap contents can be discarded,
  * to allow the swap device to optimize its wear-levelling.
@@ -164,7 +176,7 @@ static int discard_swap(struct swap_info_struct *si)
 	int err = 0;
 
 	/* Do not discard the swap header page! */
-	se = &si->first_swap_extent;
+	se = first_se(si);
 	start_block = (se->start_block + 1) << (PAGE_SHIFT - 9);
 	nr_blocks = ((sector_t)se->nr_pages - 1) << (PAGE_SHIFT - 9);
 	if (nr_blocks) {
@@ -175,7 +187,7 @@ static int discard_swap(struct swap_info_struct *si)
 		cond_resched();
 	}
 
-	list_for_each_entry(se, &si->first_swap_extent.list, list) {
+	for (se = next_se(se); se; se = next_se(se)) {
 		start_block = se->start_block << (PAGE_SHIFT - 9);
 		nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9);
 
@@ -189,6 +201,26 @@ static int discard_swap(struct swap_info_struct *si)
 	return err;		/* That will often be -EOPNOTSUPP */
 }
 
+static struct swap_extent *
+offset_to_swap_extent(struct swap_info_struct *sis, unsigned long offset)
+{
+	struct swap_extent *se;
+	struct rb_node *rb;
+
+	rb = sis->swap_extent_root.rb_node;
+	while (rb) {
+		se = rb_entry(rb, struct swap_extent, rb_node);
+		if (offset < se->start_page)
+			rb = rb->rb_left;
+		else if (offset >= se->start_page + se->nr_pages)
+			rb = rb->rb_right;
+		else
+			return se;
+	}
+	/* It *must* be present */
+	BUG();
+}
+
 /*
  * swap allocation tell device that a cluster of swap can now be discarded,
  * to allow the swap device to optimize its wear-levelling.
@@ -196,32 +228,25 @@ static int discard_swap(struct swap_info_struct *si)
 static void discard_swap_cluster(struct swap_info_struct *si,
 				 pgoff_t start_page, pgoff_t nr_pages)
 {
-	struct swap_extent *se = si->curr_swap_extent;
-	int found_extent = 0;
+	struct swap_extent *se = offset_to_swap_extent(si, start_page);
 
 	while (nr_pages) {
-		if (se->start_page <= start_page &&
-		    start_page < se->start_page + se->nr_pages) {
-			pgoff_t offset = start_page - se->start_page;
-			sector_t start_block = se->start_block + offset;
-			sector_t nr_blocks = se->nr_pages - offset;
-
-			if (nr_blocks > nr_pages)
-				nr_blocks = nr_pages;
-			start_page += nr_blocks;
-			nr_pages -= nr_blocks;
-
-			if (!found_extent++)
-				si->curr_swap_extent = se;
-
-			start_block <<= PAGE_SHIFT - 9;
-			nr_blocks <<= PAGE_SHIFT - 9;
-			if (blkdev_issue_discard(si->bdev, start_block,
-				    nr_blocks, GFP_NOIO, 0))
-				break;
-		}
+		pgoff_t offset = start_page - se->start_page;
+		sector_t start_block = se->start_block + offset;
+		sector_t nr_blocks = se->nr_pages - offset;
+
+		if (nr_blocks > nr_pages)
+			nr_blocks = nr_pages;
+		start_page += nr_blocks;
+		nr_pages -= nr_blocks;
+
+		start_block <<= PAGE_SHIFT - 9;
+		nr_blocks <<= PAGE_SHIFT - 9;
+		if (blkdev_issue_discard(si->bdev, start_block,
+					nr_blocks, GFP_NOIO, 0))
+			break;
 
-		se = list_next_entry(se, list);
+		se = next_se(se);
 	}
 }
 
@@ -1755,7 +1780,7 @@ int swap_type_of(dev_t device, sector_t offset, struct block_device **bdev_p)
 			return type;
 		}
 		if (bdev == sis->bdev) {
-			struct swap_extent *se = &sis->first_swap_extent;
+			struct swap_extent *se = first_se(sis);
 
 			if (se->start_block == offset) {
 				if (bdev_p)
@@ -2232,7 +2257,6 @@ static void drain_mmlist(void)
 static sector_t map_swap_entry(swp_entry_t entry, struct block_device **bdev)
 {
 	struct swap_info_struct *sis;
-	struct swap_extent *start_se;
 	struct swap_extent *se;
 	pgoff_t offset;
 
@@ -2240,18 +2264,8 @@ static sector_t map_swap_entry(swp_entry_t entry, struct block_device **bdev)
 	*bdev = sis->bdev;
 
 	offset = swp_offset(entry);
-	start_se = sis->curr_swap_extent;
-	se = start_se;
-
-	for ( ; ; ) {
-		if (se->start_page <= offset &&
-				offset < (se->start_page + se->nr_pages)) {
-			return se->start_block + (offset - se->start_page);
-		}
-		se = list_next_entry(se, list);
-		sis->curr_swap_extent = se;
-		BUG_ON(se == start_se);		/* It *must* be present */
-	}
+	se = offset_to_swap_extent(sis, offset);
+	return se->start_block + (offset - se->start_page);
 }
 
 /*
@@ -2269,12 +2283,11 @@ sector_t map_swap_page(struct page *page, struct block_device **bdev)
  */
 static void destroy_swap_extents(struct swap_info_struct *sis)
 {
-	while (!list_empty(&sis->first_swap_extent.list)) {
-		struct swap_extent *se;
+	while (!RB_EMPTY_ROOT(&sis->swap_extent_root)) {
+		struct rb_node *rb = sis->swap_extent_root.rb_node;
+		struct swap_extent *se = rb_entry(rb, struct swap_extent, rb_node);
 
-		se = list_first_entry(&sis->first_swap_extent.list,
-				struct swap_extent, list);
-		list_del(&se->list);
+		rb_erase(rb, &sis->swap_extent_root);
 		kfree(se);
 	}
 
@@ -2290,7 +2303,7 @@ static void destroy_swap_extents(struct swap_info_struct *sis)
 
 /*
  * Add a block range (and the corresponding page range) into this swapdev's
- * extent list.  The extent list is kept sorted in page order.
+ * extent tree.
  *
  * This function rather assumes that it is called in ascending page order.
  */
@@ -2298,20 +2311,21 @@ int
 add_swap_extent(struct swap_info_struct *sis, unsigned long start_page,
 		unsigned long nr_pages, sector_t start_block)
 {
+	struct rb_node **link = &sis->swap_extent_root.rb_node, *parent = NULL;
 	struct swap_extent *se;
 	struct swap_extent *new_se;
-	struct list_head *lh;
-
-	if (start_page == 0) {
-		se = &sis->first_swap_extent;
-		sis->curr_swap_extent = se;
-		se->start_page = 0;
-		se->nr_pages = nr_pages;
-		se->start_block = start_block;
-		return 1;
-	} else {
-		lh = sis->first_swap_extent.list.prev;	/* Highest extent */
-		se = list_entry(lh, struct swap_extent, list);
+
+	/*
+	 * place the new node at the right most since the
+	 * function is called in ascending page order.
+	 */
+	while (*link) {
+		parent = *link;
+		link = &parent->rb_right;
+	}
+
+	if (parent) {
+		se = rb_entry(parent, struct swap_extent, rb_node);
 		BUG_ON(se->start_page + se->nr_pages != start_page);
 		if (se->start_block + se->nr_pages == start_block) {
 			/* Merge it */
@@ -2320,9 +2334,7 @@ add_swap_extent(struct swap_info_struct *sis, unsigned long start_page,
 		}
 	}
 
-	/*
-	 * No merge.  Insert a new extent, preserving ordering.
-	 */
+	/* No merge, insert a new extent. */
 	new_se = kmalloc(sizeof(*se), GFP_KERNEL);
 	if (new_se == NULL)
 		return -ENOMEM;
@@ -2330,7 +2342,8 @@ add_swap_extent(struct swap_info_struct *sis, unsigned long start_page,
 	new_se->nr_pages = nr_pages;
 	new_se->start_block = start_block;
 
-	list_add_tail(&new_se->list, &sis->first_swap_extent.list);
+	rb_link_node(&new_se->rb_node, parent, link);
+	rb_insert_color(&new_se->rb_node, &sis->swap_extent_root);
 	return 1;
 }
 EXPORT_SYMBOL_GPL(add_swap_extent);
@@ -2846,7 +2859,7 @@ static struct swap_info_struct *alloc_swap_info(void)
 		 * would be relying on p->type to remain valid.
 		 */
 	}
-	INIT_LIST_HEAD(&p->first_swap_extent.list);
+	p->swap_extent_root = RB_ROOT;
 	plist_node_init(&p->list, 0);
 	for_each_node(i)
 		plist_node_init(&p->avail_lists[i], 0);
-- 
cgit v1.2.3


From 1e577f970f66a53d429cbee37b36177c9712f488 Mon Sep 17 00:00:00 2001
From: Shakeel Butt <shakeelb@google.com>
Date: Thu, 11 Jul 2019 20:55:55 -0700
Subject: mm, memcg: introduce memory.events.local

The memory controller in cgroup v2 exposes memory.events file for each
memcg which shows the number of times events like low, high, max, oom
and oom_kill have happened for the whole tree rooted at that memcg.
Users can also poll or register notification to monitor the changes in
that file.  Any event at any level of the tree rooted at memcg will
notify all the listeners along the path till root_mem_cgroup.  There are
existing users which depend on this behavior.

However there are users which are only interested in the events
happening at a specific level of the memcg tree and not in the events in
the underlying tree rooted at that memcg.  One such use-case is a
centralized resource monitor which can dynamically adjust the limits of
the jobs running on a system.  The jobs can create their sub-hierarchy
for their own sub-tasks.  The centralized monitor is only interested in
the events at the top level memcgs of the jobs as it can then act and
adjust the limits of the jobs.  Using the current memory.events for such
centralized monitor is very inconvenient.  The monitor will keep
receiving events which it is not interested in and to find if the received
event is interesting, it has to read memory.events files of the next
level and compare it with the top level one.  So, let's introduce
memory.events.local to the memcg which shows and notify for the events
at the memcg level.

Now, do memory.stat and memory.pressure need their local versions?  IMHO
no, due to the no-internal-process constraint of cgroup v2.  The
memory.stat file of the top level memcg of a job shows the stats and
vmevents of the whole tree.  The local stats or vmevents of the top level
memcg will only change if there is a process running in that memcg but v2
does not allow that.  Similarly for memory.pressure there will not be any
process in the internal nodes and thus no chance of local pressure.

Link: http://lkml.kernel.org/r/20190527174643.209172-1-shakeelb@google.com
Signed-off-by: Shakeel Butt <shakeelb@google.com>
Reviewed-by: Roman Gushchin <guro@fb.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: Chris Down <chris@chrisdown.name>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/admin-guide/cgroup-v2.rst | 10 ++++++++++
 include/linux/memcontrol.h              |  7 ++++++-
 mm/memcontrol.c                         | 34 +++++++++++++++++++++++----------
 3 files changed, 40 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
index a5c845338d6d..a9548de56ac9 100644
--- a/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@ -1146,6 +1146,11 @@ PAGE_SIZE multiple when read back.
 	otherwise, a value change in this file generates a file
 	modified event.
 
+	Note that all fields in this file are hierarchical and the
+	file modified event can be generated due to an event down the
+	hierarchy. For the local events at the cgroup level see
+	memory.events.local.
+
 	  low
 		The number of times the cgroup is reclaimed due to
 		high memory pressure even though its usage is under
@@ -1185,6 +1190,11 @@ PAGE_SIZE multiple when read back.
 		The number of processes belonging to this cgroup
 		killed by any kind of OOM killer.
 
+  memory.events.local
+	Similar to memory.events but the fields in the file are local
+	to the cgroup i.e. not hierarchical. The file modified event
+	generated on this file reflects only the local events.
+
   memory.stat
 	A read-only flat-keyed file which exists on non-root cgroups.
 
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 1dcb763bb610..22141ebc5e15 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -233,8 +233,9 @@ struct mem_cgroup {
 	/* OOM-Killer disable */
 	int		oom_kill_disable;
 
-	/* memory.events */
+	/* memory.events and memory.events.local */
 	struct cgroup_file events_file;
+	struct cgroup_file events_local_file;
 
 	/* handle for "memory.swap.events" */
 	struct cgroup_file swap_events_file;
@@ -281,6 +282,7 @@ struct mem_cgroup {
 
 	/* memory.events */
 	atomic_long_t		memory_events[MEMCG_NR_MEMORY_EVENTS];
+	atomic_long_t		memory_events_local[MEMCG_NR_MEMORY_EVENTS];
 
 	unsigned long		socket_pressure;
 
@@ -747,6 +749,9 @@ static inline void count_memcg_event_mm(struct mm_struct *mm,
 static inline void memcg_memory_event(struct mem_cgroup *memcg,
 				      enum memcg_memory_event event)
 {
+	atomic_long_inc(&memcg->memory_events_local[event]);
+	cgroup_file_notify(&memcg->events_local_file);
+
 	do {
 		atomic_long_inc(&memcg->memory_events[event]);
 		cgroup_file_notify(&memcg->events_file);
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 2ad94d0ce22f..0a9bd604aa15 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -5624,21 +5624,29 @@ static ssize_t memory_max_write(struct kernfs_open_file *of,
 	return nbytes;
 }
 
+static void __memory_events_show(struct seq_file *m, atomic_long_t *events)
+{
+	seq_printf(m, "low %lu\n", atomic_long_read(&events[MEMCG_LOW]));
+	seq_printf(m, "high %lu\n", atomic_long_read(&events[MEMCG_HIGH]));
+	seq_printf(m, "max %lu\n", atomic_long_read(&events[MEMCG_MAX]));
+	seq_printf(m, "oom %lu\n", atomic_long_read(&events[MEMCG_OOM]));
+	seq_printf(m, "oom_kill %lu\n",
+		   atomic_long_read(&events[MEMCG_OOM_KILL]));
+}
+
 static int memory_events_show(struct seq_file *m, void *v)
 {
 	struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
 
-	seq_printf(m, "low %lu\n",
-		   atomic_long_read(&memcg->memory_events[MEMCG_LOW]));
-	seq_printf(m, "high %lu\n",
-		   atomic_long_read(&memcg->memory_events[MEMCG_HIGH]));
-	seq_printf(m, "max %lu\n",
-		   atomic_long_read(&memcg->memory_events[MEMCG_MAX]));
-	seq_printf(m, "oom %lu\n",
-		   atomic_long_read(&memcg->memory_events[MEMCG_OOM]));
-	seq_printf(m, "oom_kill %lu\n",
-		   atomic_long_read(&memcg->memory_events[MEMCG_OOM_KILL]));
+	__memory_events_show(m, memcg->memory_events);
+	return 0;
+}
+
+static int memory_events_local_show(struct seq_file *m, void *v)
+{
+	struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
 
+	__memory_events_show(m, memcg->memory_events_local);
 	return 0;
 }
 
@@ -5800,6 +5808,12 @@ static struct cftype memory_files[] = {
 		.file_offset = offsetof(struct mem_cgroup, events_file),
 		.seq_show = memory_events_show,
 	},
+	{
+		.name = "events.local",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.file_offset = offsetof(struct mem_cgroup, events_local_file),
+		.seq_show = memory_events_local_show,
+	},
 	{
 		.name = "stat",
 		.flags = CFTYPE_NOT_ON_ROOT,
-- 
cgit v1.2.3


From 0b14e8aa68223c2c124d408aa4b110b364d13c53 Mon Sep 17 00:00:00 2001
From: Roman Gushchin <guro@fb.com>
Date: Thu, 11 Jul 2019 20:56:06 -0700
Subject: mm: memcg/slab: rename slab delayed deactivation functions and fields

The delayed work/rcu deactivation infrastructure of non-root kmem_caches
can also be used for asynchronous release of these objects.  Let's get rid
of the word "deactivation" in corresponding names to make the code look
better after generalization.

It's easier to make the renaming first, so that the generalized code will
look consistent from scratch.

Let's rename struct memcg_cache_params fields:
  deact_fn -> work_fn
  deact_rcu_head -> rcu_head
  deact_work -> work

And RCU/delayed work callbacks in slab common code:
  kmemcg_deactivate_rcufn -> kmemcg_rcufn
  kmemcg_deactivate_workfn -> kmemcg_workfn

This patch contains no functional changes, only renamings.

Link: http://lkml.kernel.org/r/20190611231813.3148843-3-guro@fb.com
Signed-off-by: Roman Gushchin <guro@fb.com>
Acked-by: Vladimir Davydov <vdavydov.dev@gmail.com>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Waiman Long <longman@redhat.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: Andrei Vagin <avagin@gmail.com>
Cc: Qian Cai <cai@lca.pw>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/slab.h |  6 +++---
 mm/slab.h            |  2 +-
 mm/slab_common.c     | 30 +++++++++++++++---------------
 3 files changed, 19 insertions(+), 19 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/slab.h b/include/linux/slab.h
index 98c3d12b7275..6008d884e621 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -643,10 +643,10 @@ struct memcg_cache_params {
 			struct list_head children_node;
 			struct list_head kmem_caches_node;
 
-			void (*deact_fn)(struct kmem_cache *);
+			void (*work_fn)(struct kmem_cache *);
 			union {
-				struct rcu_head deact_rcu_head;
-				struct work_struct deact_work;
+				struct rcu_head rcu_head;
+				struct work_struct work;
 			};
 		};
 	};
diff --git a/mm/slab.h b/mm/slab.h
index 86f7ede21203..7ef695b91919 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -291,7 +291,7 @@ static __always_inline void memcg_uncharge_slab(struct page *page, int order,
 extern void slab_init_memcg_params(struct kmem_cache *);
 extern void memcg_link_cache(struct kmem_cache *s, struct mem_cgroup *memcg);
 extern void slab_deactivate_memcg_cache_rcu_sched(struct kmem_cache *s,
-				void (*deact_fn)(struct kmem_cache *));
+				void (*work_fn)(struct kmem_cache *));
 
 #else /* CONFIG_MEMCG_KMEM */
 
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 07ee4189b40c..f4dd9f75751c 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -691,17 +691,17 @@ out_unlock:
 	put_online_cpus();
 }
 
-static void kmemcg_deactivate_workfn(struct work_struct *work)
+static void kmemcg_workfn(struct work_struct *work)
 {
 	struct kmem_cache *s = container_of(work, struct kmem_cache,
-					    memcg_params.deact_work);
+					    memcg_params.work);
 
 	get_online_cpus();
 	get_online_mems();
 
 	mutex_lock(&slab_mutex);
 
-	s->memcg_params.deact_fn(s);
+	s->memcg_params.work_fn(s);
 
 	mutex_unlock(&slab_mutex);
 
@@ -712,36 +712,36 @@ static void kmemcg_deactivate_workfn(struct work_struct *work)
 	css_put(&s->memcg_params.memcg->css);
 }
 
-static void kmemcg_deactivate_rcufn(struct rcu_head *head)
+static void kmemcg_rcufn(struct rcu_head *head)
 {
 	struct kmem_cache *s = container_of(head, struct kmem_cache,
-					    memcg_params.deact_rcu_head);
+					    memcg_params.rcu_head);
 
 	/*
-	 * We need to grab blocking locks.  Bounce to ->deact_work.  The
+	 * We need to grab blocking locks.  Bounce to ->work.  The
 	 * work item shares the space with the RCU head and can't be
 	 * initialized eariler.
 	 */
-	INIT_WORK(&s->memcg_params.deact_work, kmemcg_deactivate_workfn);
-	queue_work(memcg_kmem_cache_wq, &s->memcg_params.deact_work);
+	INIT_WORK(&s->memcg_params.work, kmemcg_workfn);
+	queue_work(memcg_kmem_cache_wq, &s->memcg_params.work);
 }
 
 /**
  * slab_deactivate_memcg_cache_rcu_sched - schedule deactivation after a
  *					   sched RCU grace period
  * @s: target kmem_cache
- * @deact_fn: deactivation function to call
+ * @work_fn: deactivation function to call
  *
- * Schedule @deact_fn to be invoked with online cpus, mems and slab_mutex
+ * Schedule @work_fn to be invoked with online cpus, mems and slab_mutex
  * held after a sched RCU grace period.  The slab is guaranteed to stay
- * alive until @deact_fn is finished.  This is to be used from
+ * alive until @work_fn is finished.  This is to be used from
  * __kmemcg_cache_deactivate().
  */
 void slab_deactivate_memcg_cache_rcu_sched(struct kmem_cache *s,
-					   void (*deact_fn)(struct kmem_cache *))
+					   void (*work_fn)(struct kmem_cache *))
 {
 	if (WARN_ON_ONCE(is_root_cache(s)) ||
-	    WARN_ON_ONCE(s->memcg_params.deact_fn))
+	    WARN_ON_ONCE(s->memcg_params.work_fn))
 		return;
 
 	if (s->memcg_params.root_cache->memcg_params.dying)
@@ -750,8 +750,8 @@ void slab_deactivate_memcg_cache_rcu_sched(struct kmem_cache *s,
 	/* pin memcg so that @s doesn't get destroyed in the middle */
 	css_get(&s->memcg_params.memcg->css);
 
-	s->memcg_params.deact_fn = deact_fn;
-	call_rcu(&s->memcg_params.deact_rcu_head, kmemcg_deactivate_rcufn);
+	s->memcg_params.work_fn = work_fn;
+	call_rcu(&s->memcg_params.rcu_head, kmemcg_rcufn);
 }
 
 void memcg_deactivate_kmem_caches(struct mem_cgroup *memcg)
-- 
cgit v1.2.3


From 49a18eae2e98a794477b5af5d85938e430c0be72 Mon Sep 17 00:00:00 2001
From: Roman Gushchin <guro@fb.com>
Date: Thu, 11 Jul 2019 20:56:13 -0700
Subject: mm: memcg/slab: introduce __memcg_kmem_uncharge_memcg()

Let's separate the page counter modification code out of
__memcg_kmem_uncharge() in a way similar to how
__memcg_kmem_charge() and __memcg_kmem_charge_memcg() work.

This will allow reusing this code later via a new
memcg_kmem_uncharge_memcg() wrapper, which calls
__memcg_kmem_uncharge_memcg() if memcg_kmem_enabled()
check is passed.

Link: http://lkml.kernel.org/r/20190611231813.3148843-5-guro@fb.com
Signed-off-by: Roman Gushchin <guro@fb.com>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
Acked-by: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Waiman Long <longman@redhat.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: Andrei Vagin <avagin@gmail.com>
Cc: Qian Cai <cai@lca.pw>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/memcontrol.h | 10 ++++++++++
 mm/memcontrol.c            | 25 +++++++++++++++++--------
 2 files changed, 27 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 22141ebc5e15..68402842c337 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -1278,6 +1278,8 @@ int __memcg_kmem_charge(struct page *page, gfp_t gfp, int order);
 void __memcg_kmem_uncharge(struct page *page, int order);
 int __memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order,
 			      struct mem_cgroup *memcg);
+void __memcg_kmem_uncharge_memcg(struct mem_cgroup *memcg,
+				 unsigned int nr_pages);
 
 extern struct static_key_false memcg_kmem_enabled_key;
 extern struct workqueue_struct *memcg_kmem_cache_wq;
@@ -1319,6 +1321,14 @@ static inline int memcg_kmem_charge_memcg(struct page *page, gfp_t gfp,
 		return __memcg_kmem_charge_memcg(page, gfp, order, memcg);
 	return 0;
 }
+
+static inline void memcg_kmem_uncharge_memcg(struct page *page, int order,
+					     struct mem_cgroup *memcg)
+{
+	if (memcg_kmem_enabled())
+		__memcg_kmem_uncharge_memcg(memcg, 1 << order);
+}
+
 /*
  * helper for accessing a memcg's index. It will be used as an index in the
  * child cache array in kmem_cache, and also to derive its name. This function
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 6de79ec3cd21..25e35a8b8ba2 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2807,6 +2807,22 @@ int __memcg_kmem_charge(struct page *page, gfp_t gfp, int order)
 	css_put(&memcg->css);
 	return ret;
 }
+
+/**
+ * __memcg_kmem_uncharge_memcg: uncharge a kmem page
+ * @memcg: memcg to uncharge
+ * @nr_pages: number of pages to uncharge
+ */
+void __memcg_kmem_uncharge_memcg(struct mem_cgroup *memcg,
+				 unsigned int nr_pages)
+{
+	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
+		page_counter_uncharge(&memcg->kmem, nr_pages);
+
+	page_counter_uncharge(&memcg->memory, nr_pages);
+	if (do_memsw_account())
+		page_counter_uncharge(&memcg->memsw, nr_pages);
+}
 /**
  * __memcg_kmem_uncharge: uncharge a kmem page
  * @page: page to uncharge
@@ -2821,14 +2837,7 @@ void __memcg_kmem_uncharge(struct page *page, int order)
 		return;
 
 	VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page);
-
-	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
-		page_counter_uncharge(&memcg->kmem, nr_pages);
-
-	page_counter_uncharge(&memcg->memory, nr_pages);
-	if (do_memsw_account())
-		page_counter_uncharge(&memcg->memsw, nr_pages);
-
+	__memcg_kmem_uncharge_memcg(memcg, nr_pages);
 	page->mem_cgroup = NULL;
 
 	/* slab pages do not have PageKmemcg flag set */
-- 
cgit v1.2.3


From f0a3a24b532d9a7e56a33c5112b2a212ed6ec580 Mon Sep 17 00:00:00 2001
From: Roman Gushchin <guro@fb.com>
Date: Thu, 11 Jul 2019 20:56:27 -0700
Subject: mm: memcg/slab: rework non-root kmem_cache lifecycle management

Currently each charged slab page holds a reference to the cgroup to which
it's charged.  Kmem_caches are held by the memcg and are released all
together with the memory cgroup.  It means that none of the kmem_caches are
released as long as at least one reference to the memcg exists, which is
very far from optimal.

Let's rework it in a way that allows releasing individual kmem_caches as
soon as the cgroup is offline, the kmem_cache is empty and there are no
pending allocations.

To make it possible, let's introduce a new percpu refcounter for non-root
kmem caches.  The counter is initialized to the percpu mode, and is
switched to the atomic mode during kmem_cache deactivation.  The counter
is bumped for every charged page and also for every running allocation.
So the kmem_cache can't be released unless all allocations complete.

To shut down non-active empty kmem_caches, let's reuse the work queue,
previously used for the kmem_cache deactivation.  Once the reference
counter reaches 0, let's schedule an asynchronous kmem_cache release.

* I used the following simple approach to test the performance
(stolen from another patchset by T. Harding):

    time find / -name fname-no-exist
    echo 2 > /proc/sys/vm/drop_caches
    repeat 10 times

Results:

        orig		patched

real	0m1.455s	real	0m1.355s
user	0m0.206s	user	0m0.219s
sys	0m0.855s	sys	0m0.807s

real	0m1.487s	real	0m1.699s
user	0m0.221s	user	0m0.256s
sys	0m0.806s	sys	0m0.948s

real	0m1.515s	real	0m1.505s
user	0m0.183s	user	0m0.215s
sys	0m0.876s	sys	0m0.858s

real	0m1.291s	real	0m1.380s
user	0m0.193s	user	0m0.198s
sys	0m0.843s	sys	0m0.786s

real	0m1.364s	real	0m1.374s
user	0m0.180s	user	0m0.182s
sys	0m0.868s	sys	0m0.806s

real	0m1.352s	real	0m1.312s
user	0m0.201s	user	0m0.212s
sys	0m0.820s	sys	0m0.761s

real	0m1.302s	real	0m1.349s
user	0m0.205s	user	0m0.203s
sys	0m0.803s	sys	0m0.792s

real	0m1.334s	real	0m1.301s
user	0m0.194s	user	0m0.201s
sys	0m0.806s	sys	0m0.779s

real	0m1.426s	real	0m1.434s
user	0m0.216s	user	0m0.181s
sys	0m0.824s	sys	0m0.864s

real	0m1.350s	real	0m1.295s
user	0m0.200s	user	0m0.190s
sys	0m0.842s	sys	0m0.811s

So it looks like the difference is not noticeable in this test.

[cai@lca.pw: fix a use-after-free in kmemcg_workfn()]
  Link: http://lkml.kernel.org/r/1560977573-10715-1-git-send-email-cai@lca.pw
Link: http://lkml.kernel.org/r/20190611231813.3148843-9-guro@fb.com
Signed-off-by: Roman Gushchin <guro@fb.com>
Signed-off-by: Qian Cai <cai@lca.pw>
Acked-by: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Shakeel Butt <shakeelb@google.com>
Cc: Waiman Long <longman@redhat.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: Andrei Vagin <avagin@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/slab.h |  3 +-
 mm/memcontrol.c      | 50 +++++++++++++++++++++++++--------
 mm/slab.h            | 44 ++++++++---------------------
 mm/slab_common.c     | 78 +++++++++++++++++++++++++++++-----------------------
 4 files changed, 96 insertions(+), 79 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/slab.h b/include/linux/slab.h
index 6008d884e621..bc189a43e680 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -16,6 +16,7 @@
 #include <linux/overflow.h>
 #include <linux/types.h>
 #include <linux/workqueue.h>
+#include <linux/percpu-refcount.h>
 
 
 /*
@@ -152,7 +153,6 @@ int kmem_cache_shrink(struct kmem_cache *);
 
 void memcg_create_kmem_cache(struct mem_cgroup *, struct kmem_cache *);
 void memcg_deactivate_kmem_caches(struct mem_cgroup *);
-void memcg_destroy_kmem_caches(struct mem_cgroup *);
 
 /*
  * Please use this macro to create slab caches. Simply specify the
@@ -642,6 +642,7 @@ struct memcg_cache_params {
 			struct mem_cgroup *memcg;
 			struct list_head children_node;
 			struct list_head kmem_caches_node;
+			struct percpu_ref refcnt;
 
 			void (*work_fn)(struct kmem_cache *);
 			union {
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 25e35a8b8ba2..ce4ce5e7937b 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2667,12 +2667,13 @@ static void memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg,
 {
 	struct memcg_kmem_cache_create_work *cw;
 
+	if (!css_tryget_online(&memcg->css))
+		return;
+
 	cw = kmalloc(sizeof(*cw), GFP_NOWAIT | __GFP_NOWARN);
 	if (!cw)
 		return;
 
-	css_get(&memcg->css);
-
 	cw->memcg = memcg;
 	cw->cachep = cachep;
 	INIT_WORK(&cw->work, memcg_kmem_cache_create_func);
@@ -2707,6 +2708,7 @@ struct kmem_cache *memcg_kmem_get_cache(struct kmem_cache *cachep)
 {
 	struct mem_cgroup *memcg;
 	struct kmem_cache *memcg_cachep;
+	struct memcg_cache_array *arr;
 	int kmemcg_id;
 
 	VM_BUG_ON(!is_root_cache(cachep));
@@ -2714,14 +2716,28 @@ struct kmem_cache *memcg_kmem_get_cache(struct kmem_cache *cachep)
 	if (memcg_kmem_bypass())
 		return cachep;
 
-	memcg = get_mem_cgroup_from_current();
+	rcu_read_lock();
+
+	if (unlikely(current->active_memcg))
+		memcg = current->active_memcg;
+	else
+		memcg = mem_cgroup_from_task(current);
+
+	if (!memcg || memcg == root_mem_cgroup)
+		goto out_unlock;
+
 	kmemcg_id = READ_ONCE(memcg->kmemcg_id);
 	if (kmemcg_id < 0)
-		goto out;
+		goto out_unlock;
+
+	arr = rcu_dereference(cachep->memcg_params.memcg_caches);
 
-	memcg_cachep = cache_from_memcg_idx(cachep, kmemcg_id);
-	if (likely(memcg_cachep))
-		return memcg_cachep;
+	/*
+	 * Make sure we will access the up-to-date value. The code updating
+	 * memcg_caches issues a write barrier to match the data dependency
+	 * barrier inside READ_ONCE() (see memcg_create_kmem_cache()).
+	 */
+	memcg_cachep = READ_ONCE(arr->entries[kmemcg_id]);
 
 	/*
 	 * If we are in a safe context (can wait, and not in interrupt
@@ -2734,10 +2750,20 @@ struct kmem_cache *memcg_kmem_get_cache(struct kmem_cache *cachep)
 	 * memcg_create_kmem_cache, this means no further allocation
 	 * could happen with the slab_mutex held. So it's better to
 	 * defer everything.
+	 *
+	 * If the memcg is dying or memcg_cache is about to be released,
+	 * don't bother creating new kmem_caches. Because memcg_cachep
+	 * is ZEROed as the first step of kmem offlining, we don't need
+	 * percpu_ref_tryget_live() here. css_tryget_online() check in
+	 * memcg_schedule_kmem_cache_create() will prevent us from
+	 * creation of a new kmem_cache.
 	 */
-	memcg_schedule_kmem_cache_create(memcg, cachep);
-out:
-	css_put(&memcg->css);
+	if (unlikely(!memcg_cachep))
+		memcg_schedule_kmem_cache_create(memcg, cachep);
+	else if (percpu_ref_tryget(&memcg_cachep->memcg_params.refcnt))
+		cachep = memcg_cachep;
+out_unlock:
+	rcu_read_unlock();
 	return cachep;
 }
 
@@ -2748,7 +2774,7 @@ out:
 void memcg_kmem_put_cache(struct kmem_cache *cachep)
 {
 	if (!is_root_cache(cachep))
-		css_put(&cachep->memcg_params.memcg->css);
+		percpu_ref_put(&cachep->memcg_params.refcnt);
 }
 
 /**
@@ -3295,7 +3321,7 @@ static void memcg_free_kmem(struct mem_cgroup *memcg)
 		memcg_offline_kmem(memcg);
 
 	if (memcg->kmem_state == KMEM_ALLOCATED) {
-		memcg_destroy_kmem_caches(memcg);
+		WARN_ON(!list_empty(&memcg->kmem_caches));
 		static_branch_dec(&memcg_kmem_enabled_key);
 		WARN_ON(page_counter_read(&memcg->kmem));
 	}
diff --git a/mm/slab.h b/mm/slab.h
index 46623a576a3c..5d2b8511e6fb 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -248,31 +248,6 @@ static inline const char *cache_name(struct kmem_cache *s)
 	return s->name;
 }
 
-/*
- * Note, we protect with RCU only the memcg_caches array, not per-memcg caches.
- * That said the caller must assure the memcg's cache won't go away by either
- * taking a css reference to the owner cgroup, or holding the slab_mutex.
- */
-static inline struct kmem_cache *
-cache_from_memcg_idx(struct kmem_cache *s, int idx)
-{
-	struct kmem_cache *cachep;
-	struct memcg_cache_array *arr;
-
-	rcu_read_lock();
-	arr = rcu_dereference(s->memcg_params.memcg_caches);
-
-	/*
-	 * Make sure we will access the up-to-date value. The code updating
-	 * memcg_caches issues a write barrier to match this (see
-	 * memcg_create_kmem_cache()).
-	 */
-	cachep = READ_ONCE(arr->entries[idx]);
-	rcu_read_unlock();
-
-	return cachep;
-}
-
 static inline struct kmem_cache *memcg_root_cache(struct kmem_cache *s)
 {
 	if (is_root_cache(s))
@@ -284,14 +259,25 @@ static __always_inline int memcg_charge_slab(struct page *page,
 					     gfp_t gfp, int order,
 					     struct kmem_cache *s)
 {
+	int ret;
+
 	if (is_root_cache(s))
 		return 0;
-	return memcg_kmem_charge_memcg(page, gfp, order, s->memcg_params.memcg);
+
+	ret = memcg_kmem_charge_memcg(page, gfp, order, s->memcg_params.memcg);
+	if (ret)
+		return ret;
+
+	percpu_ref_get_many(&s->memcg_params.refcnt, 1 << order);
+
+	return 0;
 }
 
 static __always_inline void memcg_uncharge_slab(struct page *page, int order,
 						struct kmem_cache *s)
 {
+	if (!is_root_cache(s))
+		percpu_ref_put_many(&s->memcg_params.refcnt, 1 << order);
 	memcg_kmem_uncharge(page, order);
 }
 
@@ -323,12 +309,6 @@ static inline const char *cache_name(struct kmem_cache *s)
 	return s->name;
 }
 
-static inline struct kmem_cache *
-cache_from_memcg_idx(struct kmem_cache *s, int idx)
-{
-	return NULL;
-}
-
 static inline struct kmem_cache *memcg_root_cache(struct kmem_cache *s)
 {
 	return s;
diff --git a/mm/slab_common.c b/mm/slab_common.c
index a15557776d7d..ee3971f7fabc 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -132,6 +132,8 @@ int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t nr,
 LIST_HEAD(slab_root_caches);
 static DEFINE_SPINLOCK(memcg_kmem_wq_lock);
 
+static void kmemcg_cache_shutdown(struct percpu_ref *percpu_ref);
+
 void slab_init_memcg_params(struct kmem_cache *s)
 {
 	s->memcg_params.root_cache = NULL;
@@ -146,6 +148,12 @@ static int init_memcg_params(struct kmem_cache *s,
 	struct memcg_cache_array *arr;
 
 	if (root_cache) {
+		int ret = percpu_ref_init(&s->memcg_params.refcnt,
+					  kmemcg_cache_shutdown,
+					  0, GFP_KERNEL);
+		if (ret)
+			return ret;
+
 		s->memcg_params.root_cache = root_cache;
 		INIT_LIST_HEAD(&s->memcg_params.children_node);
 		INIT_LIST_HEAD(&s->memcg_params.kmem_caches_node);
@@ -171,6 +179,8 @@ static void destroy_memcg_params(struct kmem_cache *s)
 {
 	if (is_root_cache(s))
 		kvfree(rcu_access_pointer(s->memcg_params.memcg_caches));
+	else
+		percpu_ref_exit(&s->memcg_params.refcnt);
 }
 
 static void free_memcg_params(struct rcu_head *rcu)
@@ -226,6 +236,7 @@ void memcg_link_cache(struct kmem_cache *s, struct mem_cgroup *memcg)
 	if (is_root_cache(s)) {
 		list_add(&s->root_caches_node, &slab_root_caches);
 	} else {
+		css_get(&memcg->css);
 		s->memcg_params.memcg = memcg;
 		list_add(&s->memcg_params.children_node,
 			 &s->memcg_params.root_cache->memcg_params.children);
@@ -241,6 +252,7 @@ static void memcg_unlink_cache(struct kmem_cache *s)
 	} else {
 		list_del(&s->memcg_params.children_node);
 		list_del(&s->memcg_params.kmem_caches_node);
+		css_put(&s->memcg_params.memcg->css);
 	}
 }
 #else
@@ -678,7 +690,7 @@ void memcg_create_kmem_cache(struct mem_cgroup *memcg,
 	}
 
 	/*
-	 * Since readers won't lock (see cache_from_memcg_idx()), we need a
+	 * Since readers won't lock (see memcg_kmem_get_cache()), we need a
 	 * barrier here to ensure nobody will see the kmem_cache partially
 	 * initialized.
 	 */
@@ -701,16 +713,11 @@ static void kmemcg_workfn(struct work_struct *work)
 	get_online_mems();
 
 	mutex_lock(&slab_mutex);
-
 	s->memcg_params.work_fn(s);
-
 	mutex_unlock(&slab_mutex);
 
 	put_online_mems();
 	put_online_cpus();
-
-	/* done, put the ref from kmemcg_cache_deactivate() */
-	css_put(&s->memcg_params.memcg->css);
 }
 
 static void kmemcg_rcufn(struct rcu_head *head)
@@ -727,10 +734,38 @@ static void kmemcg_rcufn(struct rcu_head *head)
 	queue_work(memcg_kmem_cache_wq, &s->memcg_params.work);
 }
 
+static void kmemcg_cache_shutdown_fn(struct kmem_cache *s)
+{
+	WARN_ON(shutdown_cache(s));
+}
+
+static void kmemcg_cache_shutdown(struct percpu_ref *percpu_ref)
+{
+	struct kmem_cache *s = container_of(percpu_ref, struct kmem_cache,
+					    memcg_params.refcnt);
+	unsigned long flags;
+
+	spin_lock_irqsave(&memcg_kmem_wq_lock, flags);
+	if (s->memcg_params.root_cache->memcg_params.dying)
+		goto unlock;
+
+	s->memcg_params.work_fn = kmemcg_cache_shutdown_fn;
+	INIT_WORK(&s->memcg_params.work, kmemcg_workfn);
+	queue_work(memcg_kmem_cache_wq, &s->memcg_params.work);
+
+unlock:
+	spin_unlock_irqrestore(&memcg_kmem_wq_lock, flags);
+}
+
+static void kmemcg_cache_deactivate_after_rcu(struct kmem_cache *s)
+{
+	__kmemcg_cache_deactivate_after_rcu(s);
+	percpu_ref_kill(&s->memcg_params.refcnt);
+}
+
 static void kmemcg_cache_deactivate(struct kmem_cache *s)
 {
-	if (WARN_ON_ONCE(is_root_cache(s)) ||
-	    WARN_ON_ONCE(s->memcg_params.work_fn))
+	if (WARN_ON_ONCE(is_root_cache(s)))
 		return;
 
 	__kmemcg_cache_deactivate(s);
@@ -744,10 +779,7 @@ static void kmemcg_cache_deactivate(struct kmem_cache *s)
 	if (s->memcg_params.root_cache->memcg_params.dying)
 		goto unlock;
 
-	/* pin memcg so that @s doesn't get destroyed in the middle */
-	css_get(&s->memcg_params.memcg->css);
-
-	s->memcg_params.work_fn = __kmemcg_cache_deactivate_after_rcu;
+	s->memcg_params.work_fn = kmemcg_cache_deactivate_after_rcu;
 	call_rcu(&s->memcg_params.rcu_head, kmemcg_rcufn);
 unlock:
 	spin_unlock_irq(&memcg_kmem_wq_lock);
@@ -781,28 +813,6 @@ void memcg_deactivate_kmem_caches(struct mem_cgroup *memcg)
 	put_online_cpus();
 }
 
-void memcg_destroy_kmem_caches(struct mem_cgroup *memcg)
-{
-	struct kmem_cache *s, *s2;
-
-	get_online_cpus();
-	get_online_mems();
-
-	mutex_lock(&slab_mutex);
-	list_for_each_entry_safe(s, s2, &memcg->kmem_caches,
-				 memcg_params.kmem_caches_node) {
-		/*
-		 * The cgroup is about to be freed and therefore has no charges
-		 * left. Hence, all its caches must be empty by now.
-		 */
-		BUG_ON(shutdown_cache(s));
-	}
-	mutex_unlock(&slab_mutex);
-
-	put_online_mems();
-	put_online_cpus();
-}
-
 static int shutdown_memcg_caches(struct kmem_cache *s)
 {
 	struct memcg_cache_array *arr;
-- 
cgit v1.2.3


From fb2f2b0adb98bbbbbb51c5a5327f3f90f5dc417e Mon Sep 17 00:00:00 2001
From: Roman Gushchin <guro@fb.com>
Date: Thu, 11 Jul 2019 20:56:34 -0700
Subject: mm: memcg/slab: reparent memcg kmem_caches on cgroup removal

Let's reparent non-root kmem_caches on memcg offlining.  This allows us to
release the memory cgroup without waiting for the last outstanding kernel
object (e.g.  dentry used by another application).

Since the parent cgroup is already charged, all we need to do is to
splice the list of kmem_caches to the parent's kmem_caches list, swap the
memcg pointer, drop the css refcounter for each kmem_cache and adjust the
parent's css refcounter.

Please note that kmem_cache->memcg_params.memcg isn't a stable pointer
anymore.  It's safe to read it under rcu_read_lock(), cgroup_mutex held,
or any other way that protects the memory cgroup from being released.

We can race with the slab allocation and deallocation paths.  It's not a
big problem: parent's charge and slab global stats are always correct, and
we don't care anymore about the child usage and global stats.  The child
cgroup is already offline, so we don't use or show it anywhere.

Local slab stats (NR_SLAB_RECLAIMABLE and NR_SLAB_UNRECLAIMABLE) aren't
used anywhere except count_shadow_nodes().  But even there it won't break
anything: after reparenting "nodes" will be 0 on child level (because
we're already reparenting shrinker lists), and on parent level page stats
always were 0, and this patch won't change anything.

[guro@fb.com: properly handle kmem_caches reparented to root_mem_cgroup]
  Link: http://lkml.kernel.org/r/20190620213427.1691847-1-guro@fb.com
Link: http://lkml.kernel.org/r/20190611231813.3148843-11-guro@fb.com
Signed-off-by: Roman Gushchin <guro@fb.com>
Acked-by: Vladimir Davydov <vdavydov.dev@gmail.com>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
Acked-by: David Rientjes <rientjes@google.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Waiman Long <longman@redhat.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: Andrei Vagin <avagin@gmail.com>
Cc: Qian Cai <cai@lca.pw>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/slab.h |  2 +-
 mm/memcontrol.c      | 14 ++++++++------
 mm/slab.h            | 41 ++++++++++++++++++++++++++++++++---------
 mm/slab_common.c     | 19 +++++++++++++++++--
 4 files changed, 58 insertions(+), 18 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/slab.h b/include/linux/slab.h
index bc189a43e680..fd0ef2e16178 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -152,7 +152,7 @@ void kmem_cache_destroy(struct kmem_cache *);
 int kmem_cache_shrink(struct kmem_cache *);
 
 void memcg_create_kmem_cache(struct mem_cgroup *, struct kmem_cache *);
-void memcg_deactivate_kmem_caches(struct mem_cgroup *);
+void memcg_deactivate_kmem_caches(struct mem_cgroup *, struct mem_cgroup *);
 
 /*
  * Please use this macro to create slab caches. Simply specify the
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index fa39e51b3d94..2cb7e4e5c51a 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -3284,15 +3284,15 @@ static void memcg_offline_kmem(struct mem_cgroup *memcg)
 	 */
 	memcg->kmem_state = KMEM_ALLOCATED;
 
-	memcg_deactivate_kmem_caches(memcg);
-
-	kmemcg_id = memcg->kmemcg_id;
-	BUG_ON(kmemcg_id < 0);
-
 	parent = parent_mem_cgroup(memcg);
 	if (!parent)
 		parent = root_mem_cgroup;
 
+	memcg_deactivate_kmem_caches(memcg, parent);
+
+	kmemcg_id = memcg->kmemcg_id;
+	BUG_ON(kmemcg_id < 0);
+
 	/*
 	 * Change kmemcg_id of this cgroup and all its descendants to the
 	 * parent's id, and then move all entries from this cgroup's list_lrus
@@ -3325,7 +3325,6 @@ static void memcg_free_kmem(struct mem_cgroup *memcg)
 	if (memcg->kmem_state == KMEM_ALLOCATED) {
 		WARN_ON(!list_empty(&memcg->kmem_caches));
 		static_branch_dec(&memcg_kmem_enabled_key);
-		WARN_ON(page_counter_read(&memcg->kmem));
 	}
 }
 #else
@@ -4773,6 +4772,9 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
 
 	/* The following stuff does not apply to the root */
 	if (!parent) {
+#ifdef CONFIG_MEMCG_KMEM
+		INIT_LIST_HEAD(&memcg->kmem_caches);
+#endif
 		root_mem_cgroup = memcg;
 		return &memcg->css;
 	}
diff --git a/mm/slab.h b/mm/slab.h
index 7ead47cb9338..a62372d0f271 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -261,6 +261,9 @@ static inline struct kmem_cache *memcg_root_cache(struct kmem_cache *s)
  * which do not have slab_cache pointer set.
  * So this function assumes that the page can pass PageHead() and PageSlab()
  * checks.
+ *
+ * The kmem_cache can be reparented asynchronously. The caller must ensure
+ * the memcg lifetime, e.g. by taking rcu_read_lock() or cgroup_mutex.
  */
 static inline struct mem_cgroup *memcg_from_slab_page(struct page *page)
 {
@@ -268,7 +271,7 @@ static inline struct mem_cgroup *memcg_from_slab_page(struct page *page)
 
 	s = READ_ONCE(page->slab_cache);
 	if (s && !is_root_cache(s))
-		return s->memcg_params.memcg;
+		return READ_ONCE(s->memcg_params.memcg);
 
 	return NULL;
 }
@@ -285,10 +288,22 @@ static __always_inline int memcg_charge_slab(struct page *page,
 	struct lruvec *lruvec;
 	int ret;
 
-	memcg = s->memcg_params.memcg;
+	rcu_read_lock();
+	memcg = READ_ONCE(s->memcg_params.memcg);
+	while (memcg && !css_tryget_online(&memcg->css))
+		memcg = parent_mem_cgroup(memcg);
+	rcu_read_unlock();
+
+	if (unlikely(!memcg || mem_cgroup_is_root(memcg))) {
+		mod_node_page_state(page_pgdat(page), cache_vmstat_idx(s),
+				    (1 << order));
+		percpu_ref_get_many(&s->memcg_params.refcnt, 1 << order);
+		return 0;
+	}
+
 	ret = memcg_kmem_charge_memcg(page, gfp, order, memcg);
 	if (ret)
-		return ret;
+		goto out;
 
 	lruvec = mem_cgroup_lruvec(page_pgdat(page), memcg);
 	mod_lruvec_state(lruvec, cache_vmstat_idx(s), 1 << order);
@@ -296,8 +311,9 @@ static __always_inline int memcg_charge_slab(struct page *page,
 	/* transer try_charge() page references to kmem_cache */
 	percpu_ref_get_many(&s->memcg_params.refcnt, 1 << order);
 	css_put_many(&memcg->css, 1 << order);
-
-	return 0;
+out:
+	css_put(&memcg->css);
+	return ret;
 }
 
 /*
@@ -310,10 +326,17 @@ static __always_inline void memcg_uncharge_slab(struct page *page, int order,
 	struct mem_cgroup *memcg;
 	struct lruvec *lruvec;
 
-	memcg = s->memcg_params.memcg;
-	lruvec = mem_cgroup_lruvec(page_pgdat(page), memcg);
-	mod_lruvec_state(lruvec, cache_vmstat_idx(s), -(1 << order));
-	memcg_kmem_uncharge_memcg(page, order, memcg);
+	rcu_read_lock();
+	memcg = READ_ONCE(s->memcg_params.memcg);
+	if (likely(!mem_cgroup_is_root(memcg))) {
+		lruvec = mem_cgroup_lruvec(page_pgdat(page), memcg);
+		mod_lruvec_state(lruvec, cache_vmstat_idx(s), -(1 << order));
+		memcg_kmem_uncharge_memcg(page, order, memcg);
+	} else {
+		mod_node_page_state(page_pgdat(page), cache_vmstat_idx(s),
+				    -(1 << order));
+	}
+	rcu_read_unlock();
 
 	percpu_ref_put_many(&s->memcg_params.refcnt, 1 << order);
 }
diff --git a/mm/slab_common.c b/mm/slab_common.c
index ee3971f7fabc..b893eefb6229 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -252,7 +252,8 @@ static void memcg_unlink_cache(struct kmem_cache *s)
 	} else {
 		list_del(&s->memcg_params.children_node);
 		list_del(&s->memcg_params.kmem_caches_node);
-		css_put(&s->memcg_params.memcg->css);
+		mem_cgroup_put(s->memcg_params.memcg);
+		WRITE_ONCE(s->memcg_params.memcg, NULL);
 	}
 }
 #else
@@ -785,11 +786,13 @@ unlock:
 	spin_unlock_irq(&memcg_kmem_wq_lock);
 }
 
-void memcg_deactivate_kmem_caches(struct mem_cgroup *memcg)
+void memcg_deactivate_kmem_caches(struct mem_cgroup *memcg,
+				  struct mem_cgroup *parent)
 {
 	int idx;
 	struct memcg_cache_array *arr;
 	struct kmem_cache *s, *c;
+	unsigned int nr_reparented;
 
 	idx = memcg_cache_id(memcg);
 
@@ -807,6 +810,18 @@ void memcg_deactivate_kmem_caches(struct mem_cgroup *memcg)
 		kmemcg_cache_deactivate(c);
 		arr->entries[idx] = NULL;
 	}
+	nr_reparented = 0;
+	list_for_each_entry(s, &memcg->kmem_caches,
+			    memcg_params.kmem_caches_node) {
+		WRITE_ONCE(s->memcg_params.memcg, parent);
+		css_put(&memcg->css);
+		nr_reparented++;
+	}
+	if (nr_reparented) {
+		list_splice_init(&memcg->kmem_caches,
+				 &parent->kmem_caches);
+		css_get_many(&parent->css, nr_reparented);
+	}
 	mutex_unlock(&slab_mutex);
 
 	put_online_mems();
-- 
cgit v1.2.3


From fcf8a1e483490cd249df4e02d5425636c3f43c86 Mon Sep 17 00:00:00 2001
From: Waiman Long <longman@redhat.com>
Date: Thu, 11 Jul 2019 20:56:38 -0700
Subject: mm, memcg: add a memcg_slabinfo debugfs file

There are concerns about memory leaks from extensive use of memory cgroups
as each memory cgroup creates its own set of kmem caches.  There is a
possibility that the memcg kmem caches may remain even after the memory
cgroups have been offlined.  Therefore, it will be useful to show the
status of each of memcg kmem caches.

This patch introduces a new <debugfs>/memcg_slabinfo file which is
somewhat similar to /proc/slabinfo in format, but lists only information
about kmem caches that have child memcg kmem caches.  Information
available in /proc/slabinfo is not repeated in memcg_slabinfo.

A portion of a sample output of the file was:

  # <name> <css_id[:dead]> <active_objs> <num_objs> <active_slabs> <num_slabs>
  rpc_inode_cache   root          13     51      1      1
  rpc_inode_cache     48           0      0      0      0
  fat_inode_cache   root           1     45      1      1
  fat_inode_cache     41           2     45      1      1
  xfs_inode         root         770    816     24     24
  xfs_inode           92          22     34      1      1
  xfs_inode           88:dead      1     34      1      1
  xfs_inode           89:dead     23     34      1      1
  xfs_inode           85           4     34      1      1
  xfs_inode           84           9     34      1      1

The css id of the memcg is also listed. If a memcg is not online,
the tag ":dead" will be attached as shown above.

[longman@redhat.com: memcg: add ":deact" tag for reparented kmem caches in memcg_slabinfo]
  Link: http://lkml.kernel.org/r/20190621173005.31514-1-longman@redhat.com
[longman@redhat.com: set the flag in the common code as suggested by Roman]
  Link: http://lkml.kernel.org/r/20190627184324.5875-1-longman@redhat.com
Link: http://lkml.kernel.org/r/20190619171621.26209-1-longman@redhat.com
Signed-off-by: Waiman Long <longman@redhat.com>
Suggested-by: Shakeel Butt <shakeelb@google.com>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
Acked-by: Roman Gushchin <guro@fb.com>
Acked-by: David Rientjes <rientjes@google.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/slab.h |  4 ++++
 mm/slab_common.c     | 60 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 64 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/slab.h b/include/linux/slab.h
index fd0ef2e16178..56c9c7eed34e 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -116,6 +116,10 @@
 /* Objects are reclaimable */
 #define SLAB_RECLAIM_ACCOUNT	((slab_flags_t __force)0x00020000U)
 #define SLAB_TEMPORARY		SLAB_RECLAIM_ACCOUNT	/* Objects are short-lived */
+
+/* Slab deactivation flag */
+#define SLAB_DEACTIVATED	((slab_flags_t __force)0x10000000U)
+
 /*
  * ZERO_SIZE_PTR will be returned for zero sized kmalloc requests.
  *
diff --git a/mm/slab_common.c b/mm/slab_common.c
index b893eefb6229..6c49dbb3769e 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -17,6 +17,7 @@
 #include <linux/uaccess.h>
 #include <linux/seq_file.h>
 #include <linux/proc_fs.h>
+#include <linux/debugfs.h>
 #include <asm/cacheflush.h>
 #include <asm/tlbflush.h>
 #include <asm/page.h>
@@ -770,6 +771,7 @@ static void kmemcg_cache_deactivate(struct kmem_cache *s)
 		return;
 
 	__kmemcg_cache_deactivate(s);
+	s->flags |= SLAB_DEACTIVATED;
 
 	/*
 	 * memcg_kmem_wq_lock is used to synchronize memcg_params.dying
@@ -1521,6 +1523,64 @@ static int __init slab_proc_init(void)
 	return 0;
 }
 module_init(slab_proc_init);
+
+#if defined(CONFIG_DEBUG_FS) && defined(CONFIG_MEMCG_KMEM)
+/*
+ * Display information about kmem caches that have child memcg caches.
+ */
+static int memcg_slabinfo_show(struct seq_file *m, void *unused)
+{
+	struct kmem_cache *s, *c;
+	struct slabinfo sinfo;
+
+	mutex_lock(&slab_mutex);
+	seq_puts(m, "# <name> <css_id[:dead|deact]> <active_objs> <num_objs>");
+	seq_puts(m, " <active_slabs> <num_slabs>\n");
+	list_for_each_entry(s, &slab_root_caches, root_caches_node) {
+		/*
+		 * Skip kmem caches that don't have any memcg children.
+		 */
+		if (list_empty(&s->memcg_params.children))
+			continue;
+
+		memset(&sinfo, 0, sizeof(sinfo));
+		get_slabinfo(s, &sinfo);
+		seq_printf(m, "%-17s root       %6lu %6lu %6lu %6lu\n",
+			   cache_name(s), sinfo.active_objs, sinfo.num_objs,
+			   sinfo.active_slabs, sinfo.num_slabs);
+
+		for_each_memcg_cache(c, s) {
+			struct cgroup_subsys_state *css;
+			char *status = "";
+
+			css = &c->memcg_params.memcg->css;
+			if (!(css->flags & CSS_ONLINE))
+				status = ":dead";
+			else if (c->flags & SLAB_DEACTIVATED)
+				status = ":deact";
+
+			memset(&sinfo, 0, sizeof(sinfo));
+			get_slabinfo(c, &sinfo);
+			seq_printf(m, "%-17s %4d%-6s %6lu %6lu %6lu %6lu\n",
+				   cache_name(c), css->id, status,
+				   sinfo.active_objs, sinfo.num_objs,
+				   sinfo.active_slabs, sinfo.num_slabs);
+		}
+	}
+	mutex_unlock(&slab_mutex);
+	return 0;
+}
+DEFINE_SHOW_ATTRIBUTE(memcg_slabinfo);
+
+static int __init memcg_slabinfo_init(void)
+{
+	debugfs_create_file("memcg_slabinfo", S_IFREG | S_IRUGO,
+			    NULL, NULL, &memcg_slabinfo_fops);
+	return 0;
+}
+
+late_initcall(memcg_slabinfo_init);
+#endif /* CONFIG_DEBUG_FS && CONFIG_MEMCG_KMEM */
 #endif /* CONFIG_SLAB || CONFIG_SLUB_DEBUG */
 
 static __always_inline void *__do_krealloc(const void *p, size_t new_size,
-- 
cgit v1.2.3


From cbd34da7dc9afd521e0bea5e7d12701f4a9da7c7 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 11 Jul 2019 20:57:28 -0700
Subject: mm: move the powerpc hugepd code to mm/gup.c

While only powerpc supports the hugepd case, the code is pretty generic
and I'd like to keep all GUP internals in one place.

Link: http://lkml.kernel.org/r/20190625143715.1689-15-hch@lst.de
Signed-off-by: Christoph Hellwig <hch@lst.de>
Cc: Andrey Konovalov <andreyknvl@google.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: David Miller <davem@davemloft.net>
Cc: James Hogan <jhogan@kernel.org>
Cc: Jason Gunthorpe <jgg@mellanox.com>
Cc: Khalid Aziz <khalid.aziz@oracle.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Paul Burton <paul.burton@mips.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: Rich Felker <dalias@libc.org>
Cc: Yoshinori Sato <ysato@users.sourceforge.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/powerpc/Kconfig          |  1 +
 arch/powerpc/mm/hugetlbpage.c | 72 -------------------------------------
 include/linux/hugetlb.h       | 18 ----------
 mm/Kconfig                    | 10 ++++++
 mm/gup.c                      | 82 +++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 93 insertions(+), 90 deletions(-)

(limited to 'include/linux')

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 959866c156de..24a41f919309 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -125,6 +125,7 @@ config PPC
 	select ARCH_HAS_FORTIFY_SOURCE
 	select ARCH_HAS_GCOV_PROFILE_ALL
 	select ARCH_HAS_KCOV
+	select ARCH_HAS_HUGEPD			if HUGETLB_PAGE
 	select ARCH_HAS_MMIOWB			if PPC64
 	select ARCH_HAS_PHYS_TO_DMA
 	select ARCH_HAS_PMEM_API                if PPC64
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index b5d92dc32844..51716c11d0fb 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -511,13 +511,6 @@ retry:
 	return page;
 }
 
-static unsigned long hugepte_addr_end(unsigned long addr, unsigned long end,
-				      unsigned long sz)
-{
-	unsigned long __boundary = (addr + sz) & ~(sz-1);
-	return (__boundary - 1 < end - 1) ? __boundary : end;
-}
-
 #ifdef CONFIG_PPC_MM_SLICES
 unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
 					unsigned long len, unsigned long pgoff,
@@ -665,68 +658,3 @@ void flush_dcache_icache_hugepage(struct page *page)
 		}
 	}
 }
-
-static int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
-		       unsigned long end, int write, struct page **pages, int *nr)
-{
-	unsigned long pte_end;
-	struct page *head, *page;
-	pte_t pte;
-	int refs;
-
-	pte_end = (addr + sz) & ~(sz-1);
-	if (pte_end < end)
-		end = pte_end;
-
-	pte = READ_ONCE(*ptep);
-
-	if (!pte_access_permitted(pte, write))
-		return 0;
-
-	/* hugepages are never "special" */
-	VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
-
-	refs = 0;
-	head = pte_page(pte);
-
-	page = head + ((addr & (sz-1)) >> PAGE_SHIFT);
-	do {
-		VM_BUG_ON(compound_head(page) != head);
-		pages[*nr] = page;
-		(*nr)++;
-		page++;
-		refs++;
-	} while (addr += PAGE_SIZE, addr != end);
-
-	if (!page_cache_add_speculative(head, refs)) {
-		*nr -= refs;
-		return 0;
-	}
-
-	if (unlikely(pte_val(pte) != pte_val(*ptep))) {
-		/* Could be optimized better */
-		*nr -= refs;
-		while (refs--)
-			put_page(head);
-		return 0;
-	}
-
-	return 1;
-}
-
-int gup_huge_pd(hugepd_t hugepd, unsigned long addr, unsigned int pdshift,
-		unsigned long end, int write, struct page **pages, int *nr)
-{
-	pte_t *ptep;
-	unsigned long sz = 1UL << hugepd_shift(hugepd);
-	unsigned long next;
-
-	ptep = hugepte_offset(hugepd, addr, pdshift);
-	do {
-		next = hugepte_addr_end(addr, end, sz);
-		if (!gup_hugepte(ptep, sz, addr, end, write, pages, nr))
-			return 0;
-	} while (ptep++, addr = next, addr != end);
-
-	return 1;
-}
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index f895a79c6f5c..edfca4278319 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -16,29 +16,11 @@ struct user_struct;
 struct mmu_gather;
 
 #ifndef is_hugepd
-/*
- * Some architectures requires a hugepage directory format that is
- * required to support multiple hugepage sizes. For example
- * a4fe3ce76 "powerpc/mm: Allow more flexible layouts for hugepage pagetables"
- * introduced the same on powerpc. This allows for a more flexible hugepage
- * pagetable layout.
- */
 typedef struct { unsigned long pd; } hugepd_t;
 #define is_hugepd(hugepd) (0)
 #define __hugepd(x) ((hugepd_t) { (x) })
-static inline int gup_huge_pd(hugepd_t hugepd, unsigned long addr,
-			      unsigned pdshift, unsigned long end,
-			      int write, struct page **pages, int *nr)
-{
-	return 0;
-}
-#else
-extern int gup_huge_pd(hugepd_t hugepd, unsigned long addr,
-		       unsigned pdshift, unsigned long end,
-		       int write, struct page **pages, int *nr);
 #endif
 
-
 #ifdef CONFIG_HUGETLB_PAGE
 
 #include <linux/mempolicy.h>
diff --git a/mm/Kconfig b/mm/Kconfig
index 48840b28482b..0b4352557dd5 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -769,4 +769,14 @@ config GUP_GET_PTE_LOW_HIGH
 config ARCH_HAS_PTE_SPECIAL
 	bool
 
+#
+# Some architectures require a special hugepage directory format that is
+# required to support multiple hugepage sizes. For example a4fe3ce76
+# "powerpc/mm: Allow more flexible layouts for hugepage pagetables"
+# introduced it on powerpc.  This allows for a more flexible hugepage
+# pagetable layouts.
+#
+config ARCH_HAS_HUGEPD
+	bool
+
 endmenu
diff --git a/mm/gup.c b/mm/gup.c
index 9d68cef2fa90..2f8bf7a71c74 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -1966,6 +1966,88 @@ static int __gup_device_huge_pud(pud_t pud, pud_t *pudp, unsigned long addr,
 }
 #endif
 
+#ifdef CONFIG_ARCH_HAS_HUGEPD
+static unsigned long hugepte_addr_end(unsigned long addr, unsigned long end,
+				      unsigned long sz)
+{
+	unsigned long __boundary = (addr + sz) & ~(sz-1);
+	return (__boundary - 1 < end - 1) ? __boundary : end;
+}
+
+static int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
+		       unsigned long end, int write, struct page **pages, int *nr)
+{
+	unsigned long pte_end;
+	struct page *head, *page;
+	pte_t pte;
+	int refs;
+
+	pte_end = (addr + sz) & ~(sz-1);
+	if (pte_end < end)
+		end = pte_end;
+
+	pte = READ_ONCE(*ptep);
+
+	if (!pte_access_permitted(pte, write))
+		return 0;
+
+	/* hugepages are never "special" */
+	VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
+
+	refs = 0;
+	head = pte_page(pte);
+
+	page = head + ((addr & (sz-1)) >> PAGE_SHIFT);
+	do {
+		VM_BUG_ON(compound_head(page) != head);
+		pages[*nr] = page;
+		(*nr)++;
+		page++;
+		refs++;
+	} while (addr += PAGE_SIZE, addr != end);
+
+	if (!page_cache_add_speculative(head, refs)) {
+		*nr -= refs;
+		return 0;
+	}
+
+	if (unlikely(pte_val(pte) != pte_val(*ptep))) {
+		/* Could be optimized better */
+		*nr -= refs;
+		while (refs--)
+			put_page(head);
+		return 0;
+	}
+
+	return 1;
+}
+
+static int gup_huge_pd(hugepd_t hugepd, unsigned long addr,
+		unsigned int pdshift, unsigned long end, int write,
+		struct page **pages, int *nr)
+{
+	pte_t *ptep;
+	unsigned long sz = 1UL << hugepd_shift(hugepd);
+	unsigned long next;
+
+	ptep = hugepte_offset(hugepd, addr, pdshift);
+	do {
+		next = hugepte_addr_end(addr, end, sz);
+		if (!gup_hugepte(ptep, sz, addr, end, write, pages, nr))
+			return 0;
+	} while (ptep++, addr = next, addr != end);
+
+	return 1;
+}
+#else
+static inline int gup_huge_pd(hugepd_t hugepd, unsigned long addr,
+		unsigned pdshift, unsigned long end, int write,
+		struct page **pages, int *nr)
+{
+	return 0;
+}
+#endif /* CONFIG_ARCH_HAS_HUGEPD */
+
 static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
 		unsigned long end, unsigned int flags, struct page **pages, int *nr)
 {
-- 
cgit v1.2.3


From 8b1e0f81fb6fcf3109465a168b2e2da3f711fa86 Mon Sep 17 00:00:00 2001
From: Anshuman Khandual <anshuman.khandual@arm.com>
Date: Thu, 11 Jul 2019 20:58:43 -0700
Subject: mm/pgtable: drop pgtable_t variable from pte_fn_t functions

Drop the pgtable_t variable from all implementations of pte_fn_t, as none
of them use it.  apply_to_pte_range() should stop computing it as well.
This should help us save some cycles.

Link: http://lkml.kernel.org/r/1556803126-26596-1-git-send-email-anshuman.khandual@arm.com
Signed-off-by: Anshuman Khandual <anshuman.khandual@arm.com>
Acked-by: Matthew Wilcox <willy@infradead.org>
Cc: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Cc: Russell King <linux@armlinux.org.uk>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Logan Gunthorpe <logang@deltatee.com>
Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: <jglisse@redhat.com>
Cc: Mike Rapoport <rppt@linux.vnet.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/arm/kernel/efi.c          | 3 +--
 arch/arm/mm/dma-mapping.c      | 3 +--
 arch/arm/mm/pageattr.c         | 3 +--
 arch/arm64/kernel/efi.c        | 3 +--
 arch/arm64/mm/pageattr.c       | 3 +--
 arch/x86/xen/mmu_pv.c          | 3 +--
 drivers/gpu/drm/i915/i915_mm.c | 3 +--
 drivers/xen/gntdev.c           | 6 ++----
 drivers/xen/privcmd.c          | 6 ++----
 drivers/xen/xlate_mmu.c        | 3 +--
 include/linux/mm.h             | 3 +--
 mm/memory.c                    | 5 +----
 mm/vmalloc.c                   | 2 +-
 13 files changed, 15 insertions(+), 31 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm/kernel/efi.c b/arch/arm/kernel/efi.c
index ed005870671a..e57dbcc89123 100644
--- a/arch/arm/kernel/efi.c
+++ b/arch/arm/kernel/efi.c
@@ -8,8 +8,7 @@
 #include <asm/mach/map.h>
 #include <asm/mmu_context.h>
 
-static int __init set_permissions(pte_t *ptep, pgtable_t token,
-				  unsigned long addr, void *data)
+static int __init set_permissions(pte_t *ptep, unsigned long addr, void *data)
 {
 	efi_memory_desc_t *md = data;
 	pte_t pte = *ptep;
diff --git a/arch/arm/mm/dma-mapping.c b/arch/arm/mm/dma-mapping.c
index 439bb6a59a04..1fb5c0ca1ed8 100644
--- a/arch/arm/mm/dma-mapping.c
+++ b/arch/arm/mm/dma-mapping.c
@@ -493,8 +493,7 @@ void __init dma_contiguous_remap(void)
 	}
 }
 
-static int __dma_update_pte(pte_t *pte, pgtable_t token, unsigned long addr,
-			    void *data)
+static int __dma_update_pte(pte_t *pte, unsigned long addr, void *data)
 {
 	struct page *page = virt_to_page(addr);
 	pgprot_t prot = *(pgprot_t *)data;
diff --git a/arch/arm/mm/pageattr.c b/arch/arm/mm/pageattr.c
index 0f5faf30d9bf..d546efad7e97 100644
--- a/arch/arm/mm/pageattr.c
+++ b/arch/arm/mm/pageattr.c
@@ -14,8 +14,7 @@ struct page_change_data {
 	pgprot_t clear_mask;
 };
 
-static int change_page_range(pte_t *ptep, pgtable_t token, unsigned long addr,
-			void *data)
+static int change_page_range(pte_t *ptep, unsigned long addr, void *data)
 {
 	struct page_change_data *cdata = data;
 	pte_t pte = *ptep;
diff --git a/arch/arm64/kernel/efi.c b/arch/arm64/kernel/efi.c
index 3c33d0dd8e0e..d0cf596db82c 100644
--- a/arch/arm64/kernel/efi.c
+++ b/arch/arm64/kernel/efi.c
@@ -82,8 +82,7 @@ int __init efi_create_mapping(struct mm_struct *mm, efi_memory_desc_t *md)
 	return 0;
 }
 
-static int __init set_permissions(pte_t *ptep, pgtable_t token,
-				  unsigned long addr, void *data)
+static int __init set_permissions(pte_t *ptep, unsigned long addr, void *data)
 {
 	efi_memory_desc_t *md = data;
 	pte_t pte = READ_ONCE(*ptep);
diff --git a/arch/arm64/mm/pageattr.c b/arch/arm64/mm/pageattr.c
index fcdcf6cd7677..03c53f16ee77 100644
--- a/arch/arm64/mm/pageattr.c
+++ b/arch/arm64/mm/pageattr.c
@@ -19,8 +19,7 @@ struct page_change_data {
 
 bool rodata_full __ro_after_init = IS_ENABLED(CONFIG_RODATA_FULL_DEFAULT_ENABLED);
 
-static int change_page_range(pte_t *ptep, pgtable_t token, unsigned long addr,
-			void *data)
+static int change_page_range(pte_t *ptep, unsigned long addr, void *data)
 {
 	struct page_change_data *cdata = data;
 	pte_t pte = READ_ONCE(*ptep);
diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c
index beb44e22afdf..f6e5eeecfc69 100644
--- a/arch/x86/xen/mmu_pv.c
+++ b/arch/x86/xen/mmu_pv.c
@@ -2700,8 +2700,7 @@ struct remap_data {
 	struct mmu_update *mmu_update;
 };
 
-static int remap_area_pfn_pte_fn(pte_t *ptep, pgtable_t token,
-				 unsigned long addr, void *data)
+static int remap_area_pfn_pte_fn(pte_t *ptep, unsigned long addr, void *data)
 {
 	struct remap_data *rmd = data;
 	pte_t pte = pte_mkspecial(mfn_pte(*rmd->pfn, rmd->prot));
diff --git a/drivers/gpu/drm/i915/i915_mm.c b/drivers/gpu/drm/i915/i915_mm.c
index e4935dd1fd37..c23bb29e6d3e 100644
--- a/drivers/gpu/drm/i915/i915_mm.c
+++ b/drivers/gpu/drm/i915/i915_mm.c
@@ -35,8 +35,7 @@ struct remap_pfn {
 	pgprot_t prot;
 };
 
-static int remap_pfn(pte_t *pte, pgtable_t token,
-		     unsigned long addr, void *data)
+static int remap_pfn(pte_t *pte, unsigned long addr, void *data)
 {
 	struct remap_pfn *r = data;
 
diff --git a/drivers/xen/gntdev.c b/drivers/xen/gntdev.c
index 469dfbd6cf90..4c339c7e66e5 100644
--- a/drivers/xen/gntdev.c
+++ b/drivers/xen/gntdev.c
@@ -264,8 +264,7 @@ void gntdev_put_map(struct gntdev_priv *priv, struct gntdev_grant_map *map)
 
 /* ------------------------------------------------------------------ */
 
-static int find_grant_ptes(pte_t *pte, pgtable_t token,
-		unsigned long addr, void *data)
+static int find_grant_ptes(pte_t *pte, unsigned long addr, void *data)
 {
 	struct gntdev_grant_map *map = data;
 	unsigned int pgnr = (addr - map->vma->vm_start) >> PAGE_SHIFT;
@@ -292,8 +291,7 @@ static int find_grant_ptes(pte_t *pte, pgtable_t token,
 }
 
 #ifdef CONFIG_X86
-static int set_grant_ptes_as_special(pte_t *pte, pgtable_t token,
-				     unsigned long addr, void *data)
+static int set_grant_ptes_as_special(pte_t *pte, unsigned long addr, void *data)
 {
 	set_pte_at(current->mm, addr, pte, pte_mkspecial(*pte));
 	return 0;
diff --git a/drivers/xen/privcmd.c b/drivers/xen/privcmd.c
index 1ff38d8036e9..2f5ce7230a43 100644
--- a/drivers/xen/privcmd.c
+++ b/drivers/xen/privcmd.c
@@ -731,8 +731,7 @@ struct remap_pfn {
 	unsigned long i;
 };
 
-static int remap_pfn_fn(pte_t *ptep, pgtable_t token, unsigned long addr,
-			void *data)
+static int remap_pfn_fn(pte_t *ptep, unsigned long addr, void *data)
 {
 	struct remap_pfn *r = data;
 	struct page *page = r->pages[r->i];
@@ -966,8 +965,7 @@ static int privcmd_mmap(struct file *file, struct vm_area_struct *vma)
  * on a per pfn/pte basis. Mapping calls that fail with ENOENT
  * can be then retried until success.
  */
-static int is_mapped_fn(pte_t *pte, struct page *pmd_page,
-	                unsigned long addr, void *data)
+static int is_mapped_fn(pte_t *pte, unsigned long addr, void *data)
 {
 	return pte_none(*pte) ? 0 : -EBUSY;
 }
diff --git a/drivers/xen/xlate_mmu.c b/drivers/xen/xlate_mmu.c
index e7df65d32c91..ba883a80b3c0 100644
--- a/drivers/xen/xlate_mmu.c
+++ b/drivers/xen/xlate_mmu.c
@@ -93,8 +93,7 @@ static void setup_hparams(unsigned long gfn, void *data)
 	info->fgfn++;
 }
 
-static int remap_pte_fn(pte_t *ptep, pgtable_t token, unsigned long addr,
-			void *data)
+static int remap_pte_fn(pte_t *ptep, unsigned long addr, void *data)
 {
 	struct remap_data *info = data;
 	struct page *page = info->pages[info->index++];
diff --git a/include/linux/mm.h b/include/linux/mm.h
index cb8d413d635e..bb242ad810eb 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2686,8 +2686,7 @@ static inline int vm_fault_to_errno(vm_fault_t vm_fault, int foll_flags)
 	return 0;
 }
 
-typedef int (*pte_fn_t)(pte_t *pte, pgtable_t token, unsigned long addr,
-			void *data);
+typedef int (*pte_fn_t)(pte_t *pte, unsigned long addr, void *data);
 extern int apply_to_page_range(struct mm_struct *mm, unsigned long address,
 			       unsigned long size, pte_fn_t fn, void *data);
 
diff --git a/mm/memory.c b/mm/memory.c
index b47e4e56448a..0428ff5ee339 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2036,7 +2036,6 @@ static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
 {
 	pte_t *pte;
 	int err;
-	pgtable_t token;
 	spinlock_t *uninitialized_var(ptl);
 
 	pte = (mm == &init_mm) ?
@@ -2049,10 +2048,8 @@ static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
 
 	arch_enter_lazy_mmu_mode();
 
-	token = pmd_pgtable(*pmd);
-
 	do {
-		err = fn(pte++, token, addr, data);
+		err = fn(pte++, addr, data);
 		if (err)
 			break;
 	} while (addr += PAGE_SIZE, addr != end);
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 030a544e6602..a5413a6e51fa 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -2996,7 +2996,7 @@ void __weak vmalloc_sync_all(void)
 }
 
 
-static int f(pte_t *pte, pgtable_t table, unsigned long addr, void *data)
+static int f(pte_t *pte, unsigned long addr, void *data)
 {
 	pte_t ***p = data;
 
-- 
cgit v1.2.3


From 6471384af2a6530696fc0203bafe4de41a23c9ef Mon Sep 17 00:00:00 2001
From: Alexander Potapenko <glider@google.com>
Date: Thu, 11 Jul 2019 20:59:19 -0700
Subject: mm: security: introduce init_on_alloc=1 and init_on_free=1 boot
 options

Patch series "add init_on_alloc/init_on_free boot options", v10.

Provide init_on_alloc and init_on_free boot options.

These are aimed at preventing possible information leaks and making the
control-flow bugs that depend on uninitialized values more deterministic.

Enabling either of the options guarantees that the memory returned by the
page allocator and SL[AU]B is initialized with zeroes.  SLOB allocator
isn't supported at the moment, as its emulation of kmem caches complicates
handling of SLAB_TYPESAFE_BY_RCU caches correctly.

Enabling init_on_free also guarantees that pages and heap objects are
initialized right after they're freed, so it won't be possible to access
stale data by using a dangling pointer.

As suggested by Michal Hocko, right now we don't let heap users
disable initialization for certain allocations.  There's not enough
evidence that doing so can speed up real-life cases, and introducing ways
to opt-out may result in things going out of control.

This patch (of 2):

The new options are needed to prevent possible information leaks and make
control-flow bugs that depend on uninitialized values more deterministic.

This is expected to be on-by-default on Android and Chrome OS.  And it
gives the opportunity for anyone else to use it under distros too via the
boot args.  (The init_on_free feature is regularly requested by folks
where memory forensics is included in their threat models.)

init_on_alloc=1 makes the kernel initialize newly allocated pages and heap
objects with zeroes.  Initialization is done at allocation time at the
places where checks for __GFP_ZERO are performed.

init_on_free=1 makes the kernel initialize freed pages and heap objects
with zeroes upon their deletion.  This helps to ensure sensitive data
doesn't leak via use-after-free accesses.

Both init_on_alloc=1 and init_on_free=1 guarantee that the allocator
returns zeroed memory.  The two exceptions are slab caches with
constructors and SLAB_TYPESAFE_BY_RCU flag.  Those are never
zero-initialized to preserve their semantics.

Both init_on_alloc and init_on_free default to zero, but those defaults
can be overridden with CONFIG_INIT_ON_ALLOC_DEFAULT_ON and
CONFIG_INIT_ON_FREE_DEFAULT_ON.

If either SLUB poisoning or page poisoning is enabled, those options take
precedence over init_on_alloc and init_on_free: initialization is only
applied to unpoisoned allocations.

Slowdown for the new features compared to init_on_free=0, init_on_alloc=0:

hackbench, init_on_free=1:  +7.62% sys time (st.err 0.74%)
hackbench, init_on_alloc=1: +7.75% sys time (st.err 2.14%)

Linux build with -j12, init_on_free=1:  +8.38% wall time (st.err 0.39%)
Linux build with -j12, init_on_free=1:  +24.42% sys time (st.err 0.52%)
Linux build with -j12, init_on_alloc=1: -0.13% wall time (st.err 0.42%)
Linux build with -j12, init_on_alloc=1: +0.57% sys time (st.err 0.40%)

The slowdown for init_on_free=0, init_on_alloc=0 compared to the baseline
is within the standard error.

The new features are also going to pave the way for hardware memory
tagging (e.g.  arm64's MTE), which will require both on_alloc and on_free
hooks to set the tags for heap objects.  With MTE, tagging will have the
same cost as memory initialization.

Although init_on_free is rather costly, there are paranoid use-cases where
in-memory data lifetime is desired to be minimized.  There are various
arguments for/against the realism of the associated threat models, but
given that we'll need the infrastructure for MTE anyway, and there are
people who want wipe-on-free behavior no matter what the performance cost,
it seems reasonable to include it in this series.

[glider@google.com: v8]
  Link: http://lkml.kernel.org/r/20190626121943.131390-2-glider@google.com
[glider@google.com: v9]
  Link: http://lkml.kernel.org/r/20190627130316.254309-2-glider@google.com
[glider@google.com: v10]
  Link: http://lkml.kernel.org/r/20190628093131.199499-2-glider@google.com
Link: http://lkml.kernel.org/r/20190617151050.92663-2-glider@google.com
Signed-off-by: Alexander Potapenko <glider@google.com>
Acked-by: Kees Cook <keescook@chromium.org>
Acked-by: Michal Hocko <mhocko@suse.cz>		[page and dmapool parts]
Acked-by: James Morris <jamorris@linux.microsoft.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Masahiro Yamada <yamada.masahiro@socionext.com>
Cc: "Serge E. Hallyn" <serge@hallyn.com>
Cc: Nick Desaulniers <ndesaulniers@google.com>
Cc: Kostya Serebryany <kcc@google.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Sandeep Patil <sspatil@android.com>
Cc: Laura Abbott <labbott@redhat.com>
Cc: Randy Dunlap <rdunlap@infradead.org>
Cc: Jann Horn <jannh@google.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Marco Elver <elver@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/admin-guide/kernel-parameters.txt |  9 ++++
 drivers/infiniband/core/uverbs_ioctl.c          |  2 +-
 include/linux/mm.h                              | 24 +++++++++
 mm/dmapool.c                                    |  4 +-
 mm/page_alloc.c                                 | 71 ++++++++++++++++++++++---
 mm/slab.c                                       | 16 ++++--
 mm/slab.h                                       | 20 +++++++
 mm/slub.c                                       | 40 ++++++++++++--
 net/core/sock.c                                 |  2 +-
 security/Kconfig.hardening                      | 29 ++++++++++
 10 files changed, 199 insertions(+), 18 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index aa4e7e7b87c2..099c5a4be95b 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -1668,6 +1668,15 @@
 
 	initrd=		[BOOT] Specify the location of the initial ramdisk
 
+	init_on_alloc=	[MM] Fill newly allocated pages and heap objects with
+			zeroes.
+			Format: 0 | 1
+			Default set by CONFIG_INIT_ON_ALLOC_DEFAULT_ON.
+
+	init_on_free=	[MM] Fill freed pages and heap objects with zeroes.
+			Format: 0 | 1
+			Default set by CONFIG_INIT_ON_FREE_DEFAULT_ON.
+
 	init_pkru=	[x86] Specify the default memory protection keys rights
 			register contents for all processes.  0x55555554 by
 			default (disallow access to all but pkey 0).  Can
diff --git a/drivers/infiniband/core/uverbs_ioctl.c b/drivers/infiniband/core/uverbs_ioctl.c
index 829b0c6944d8..61758201d9b2 100644
--- a/drivers/infiniband/core/uverbs_ioctl.c
+++ b/drivers/infiniband/core/uverbs_ioctl.c
@@ -127,7 +127,7 @@ __malloc void *_uverbs_alloc(struct uverbs_attr_bundle *bundle, size_t size,
 	res = (void *)pbundle->internal_buffer + pbundle->internal_used;
 	pbundle->internal_used =
 		ALIGN(new_used, sizeof(*pbundle->internal_buffer));
-	if (flags & __GFP_ZERO)
+	if (want_init_on_alloc(flags))
 		memset(res, 0, size);
 	return res;
 }
diff --git a/include/linux/mm.h b/include/linux/mm.h
index bb242ad810eb..f88f0eabcc5e 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2700,6 +2700,30 @@ static inline void kernel_poison_pages(struct page *page, int numpages,
 					int enable) { }
 #endif
 
+#ifdef CONFIG_INIT_ON_ALLOC_DEFAULT_ON
+DECLARE_STATIC_KEY_TRUE(init_on_alloc);
+#else
+DECLARE_STATIC_KEY_FALSE(init_on_alloc);
+#endif
+static inline bool want_init_on_alloc(gfp_t flags)
+{
+	if (static_branch_unlikely(&init_on_alloc) &&
+	    !page_poisoning_enabled())
+		return true;
+	return flags & __GFP_ZERO;
+}
+
+#ifdef CONFIG_INIT_ON_FREE_DEFAULT_ON
+DECLARE_STATIC_KEY_TRUE(init_on_free);
+#else
+DECLARE_STATIC_KEY_FALSE(init_on_free);
+#endif
+static inline bool want_init_on_free(void)
+{
+	return static_branch_unlikely(&init_on_free) &&
+	       !page_poisoning_enabled();
+}
+
 #ifdef CONFIG_DEBUG_PAGEALLOC_ENABLE_DEFAULT
 DECLARE_STATIC_KEY_TRUE(_debug_pagealloc_enabled);
 #else
diff --git a/mm/dmapool.c b/mm/dmapool.c
index 8c94c89a6f7e..fe5d33060415 100644
--- a/mm/dmapool.c
+++ b/mm/dmapool.c
@@ -378,7 +378,7 @@ void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags,
 #endif
 	spin_unlock_irqrestore(&pool->lock, flags);
 
-	if (mem_flags & __GFP_ZERO)
+	if (want_init_on_alloc(mem_flags))
 		memset(retval, 0, pool->size);
 
 	return retval;
@@ -428,6 +428,8 @@ void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma)
 	}
 
 	offset = vaddr - page->vaddr;
+	if (want_init_on_free())
+		memset(vaddr, 0, pool->size);
 #ifdef	DMAPOOL_DEBUG
 	if ((dma - page->dma) != offset) {
 		spin_unlock_irqrestore(&pool->lock, flags);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 3a555ce69006..dbd0d5cbbcbb 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -135,6 +135,55 @@ unsigned long totalcma_pages __read_mostly;
 
 int percpu_pagelist_fraction;
 gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK;
+#ifdef CONFIG_INIT_ON_ALLOC_DEFAULT_ON
+DEFINE_STATIC_KEY_TRUE(init_on_alloc);
+#else
+DEFINE_STATIC_KEY_FALSE(init_on_alloc);
+#endif
+EXPORT_SYMBOL(init_on_alloc);
+
+#ifdef CONFIG_INIT_ON_FREE_DEFAULT_ON
+DEFINE_STATIC_KEY_TRUE(init_on_free);
+#else
+DEFINE_STATIC_KEY_FALSE(init_on_free);
+#endif
+EXPORT_SYMBOL(init_on_free);
+
+static int __init early_init_on_alloc(char *buf)
+{
+	int ret;
+	bool bool_result;
+
+	if (!buf)
+		return -EINVAL;
+	ret = kstrtobool(buf, &bool_result);
+	if (bool_result && page_poisoning_enabled())
+		pr_info("mem auto-init: CONFIG_PAGE_POISONING is on, will take precedence over init_on_alloc\n");
+	if (bool_result)
+		static_branch_enable(&init_on_alloc);
+	else
+		static_branch_disable(&init_on_alloc);
+	return ret;
+}
+early_param("init_on_alloc", early_init_on_alloc);
+
+static int __init early_init_on_free(char *buf)
+{
+	int ret;
+	bool bool_result;
+
+	if (!buf)
+		return -EINVAL;
+	ret = kstrtobool(buf, &bool_result);
+	if (bool_result && page_poisoning_enabled())
+		pr_info("mem auto-init: CONFIG_PAGE_POISONING is on, will take precedence over init_on_free\n");
+	if (bool_result)
+		static_branch_enable(&init_on_free);
+	else
+		static_branch_disable(&init_on_free);
+	return ret;
+}
+early_param("init_on_free", early_init_on_free);
 
 /*
  * A cached value of the page's pageblock's migratetype, used when the page is
@@ -1067,6 +1116,14 @@ out:
 	return ret;
 }
 
+static void kernel_init_free_pages(struct page *page, int numpages)
+{
+	int i;
+
+	for (i = 0; i < numpages; i++)
+		clear_highpage(page + i);
+}
+
 static __always_inline bool free_pages_prepare(struct page *page,
 					unsigned int order, bool check_free)
 {
@@ -1118,6 +1175,9 @@ static __always_inline bool free_pages_prepare(struct page *page,
 					   PAGE_SIZE << order);
 	}
 	arch_free_page(page, order);
+	if (want_init_on_free())
+		kernel_init_free_pages(page, 1 << order);
+
 	kernel_poison_pages(page, 1 << order, 0);
 	if (debug_pagealloc_enabled())
 		kernel_map_pages(page, 1 << order, 0);
@@ -2019,8 +2079,8 @@ static inline int check_new_page(struct page *page)
 
 static inline bool free_pages_prezeroed(void)
 {
-	return IS_ENABLED(CONFIG_PAGE_POISONING_ZERO) &&
-		page_poisoning_enabled();
+	return (IS_ENABLED(CONFIG_PAGE_POISONING_ZERO) &&
+		page_poisoning_enabled()) || want_init_on_free();
 }
 
 #ifdef CONFIG_DEBUG_VM
@@ -2090,13 +2150,10 @@ inline void post_alloc_hook(struct page *page, unsigned int order,
 static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags,
 							unsigned int alloc_flags)
 {
-	int i;
-
 	post_alloc_hook(page, order, gfp_flags);
 
-	if (!free_pages_prezeroed() && (gfp_flags & __GFP_ZERO))
-		for (i = 0; i < (1 << order); i++)
-			clear_highpage(page + i);
+	if (!free_pages_prezeroed() && want_init_on_alloc(gfp_flags))
+		kernel_init_free_pages(page, 1 << order);
 
 	if (order && (gfp_flags & __GFP_COMP))
 		prep_compound_page(page, order);
diff --git a/mm/slab.c b/mm/slab.c
index e9d90b0da47b..9df370558e5d 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -1811,6 +1811,14 @@ static bool set_objfreelist_slab_cache(struct kmem_cache *cachep,
 
 	cachep->num = 0;
 
+	/*
+	 * If slab auto-initialization on free is enabled, store the freelist
+	 * off-slab, so that its contents don't end up in one of the allocated
+	 * objects.
+	 */
+	if (unlikely(slab_want_init_on_free(cachep)))
+		return false;
+
 	if (cachep->ctor || flags & SLAB_TYPESAFE_BY_RCU)
 		return false;
 
@@ -3248,7 +3256,7 @@ slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid,
 	local_irq_restore(save_flags);
 	ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, caller);
 
-	if (unlikely(flags & __GFP_ZERO) && ptr)
+	if (unlikely(slab_want_init_on_alloc(flags, cachep)) && ptr)
 		memset(ptr, 0, cachep->object_size);
 
 	slab_post_alloc_hook(cachep, flags, 1, &ptr);
@@ -3305,7 +3313,7 @@ slab_alloc(struct kmem_cache *cachep, gfp_t flags, unsigned long caller)
 	objp = cache_alloc_debugcheck_after(cachep, flags, objp, caller);
 	prefetchw(objp);
 
-	if (unlikely(flags & __GFP_ZERO) && objp)
+	if (unlikely(slab_want_init_on_alloc(flags, cachep)) && objp)
 		memset(objp, 0, cachep->object_size);
 
 	slab_post_alloc_hook(cachep, flags, 1, &objp);
@@ -3426,6 +3434,8 @@ void ___cache_free(struct kmem_cache *cachep, void *objp,
 	struct array_cache *ac = cpu_cache_get(cachep);
 
 	check_irq_off();
+	if (unlikely(slab_want_init_on_free(cachep)))
+		memset(objp, 0, cachep->object_size);
 	kmemleak_free_recursive(objp, cachep->flags);
 	objp = cache_free_debugcheck(cachep, objp, caller);
 
@@ -3513,7 +3523,7 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
 	cache_alloc_debugcheck_after_bulk(s, flags, size, p, _RET_IP_);
 
 	/* Clear memory outside IRQ disabled section */
-	if (unlikely(flags & __GFP_ZERO))
+	if (unlikely(slab_want_init_on_alloc(flags, s)))
 		for (i = 0; i < size; i++)
 			memset(p[i], 0, s->object_size);
 
diff --git a/mm/slab.h b/mm/slab.h
index a62372d0f271..9057b8056b07 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -607,4 +607,24 @@ static inline int cache_random_seq_create(struct kmem_cache *cachep,
 static inline void cache_random_seq_destroy(struct kmem_cache *cachep) { }
 #endif /* CONFIG_SLAB_FREELIST_RANDOM */
 
+static inline bool slab_want_init_on_alloc(gfp_t flags, struct kmem_cache *c)
+{
+	if (static_branch_unlikely(&init_on_alloc)) {
+		if (c->ctor)
+			return false;
+		if (c->flags & (SLAB_TYPESAFE_BY_RCU | SLAB_POISON))
+			return flags & __GFP_ZERO;
+		return true;
+	}
+	return flags & __GFP_ZERO;
+}
+
+static inline bool slab_want_init_on_free(struct kmem_cache *c)
+{
+	if (static_branch_unlikely(&init_on_free))
+		return !(c->ctor ||
+			 (c->flags & (SLAB_TYPESAFE_BY_RCU | SLAB_POISON)));
+	return false;
+}
+
 #endif /* MM_SLAB_H */
diff --git a/mm/slub.c b/mm/slub.c
index c9541a480627..e6c030e47364 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1279,6 +1279,10 @@ check_slabs:
 	if (*str == ',')
 		slub_debug_slabs = str + 1;
 out:
+	if ((static_branch_unlikely(&init_on_alloc) ||
+	     static_branch_unlikely(&init_on_free)) &&
+	    (slub_debug & SLAB_POISON))
+		pr_info("mem auto-init: SLAB_POISON will take precedence over init_on_alloc/init_on_free\n");
 	return 1;
 }
 
@@ -1422,6 +1426,28 @@ static __always_inline bool slab_free_hook(struct kmem_cache *s, void *x)
 static inline bool slab_free_freelist_hook(struct kmem_cache *s,
 					   void **head, void **tail)
 {
+
+	void *object;
+	void *next = *head;
+	void *old_tail = *tail ? *tail : *head;
+	int rsize;
+
+	if (slab_want_init_on_free(s))
+		do {
+			object = next;
+			next = get_freepointer(s, object);
+			/*
+			 * Clear the object and the metadata, but don't touch
+			 * the redzone.
+			 */
+			memset(object, 0, s->object_size);
+			rsize = (s->flags & SLAB_RED_ZONE) ? s->red_left_pad
+							   : 0;
+			memset((char *)object + s->inuse, 0,
+			       s->size - s->inuse - rsize);
+			set_freepointer(s, object, next);
+		} while (object != old_tail);
+
 /*
  * Compiler cannot detect this function can be removed if slab_free_hook()
  * evaluates to nothing.  Thus, catch all relevant config debug options here.
@@ -1431,9 +1457,7 @@ static inline bool slab_free_freelist_hook(struct kmem_cache *s,
 	defined(CONFIG_DEBUG_OBJECTS_FREE) ||	\
 	defined(CONFIG_KASAN)
 
-	void *object;
-	void *next = *head;
-	void *old_tail = *tail ? *tail : *head;
+	next = *head;
 
 	/* Head and tail of the reconstructed freelist */
 	*head = NULL;
@@ -2729,8 +2753,14 @@ redo:
 		prefetch_freepointer(s, next_object);
 		stat(s, ALLOC_FASTPATH);
 	}
+	/*
+	 * If the object has been wiped upon free, make sure it's fully
+	 * initialized by zeroing out freelist pointer.
+	 */
+	if (unlikely(slab_want_init_on_free(s)) && object)
+		memset(object + s->offset, 0, sizeof(void *));
 
-	if (unlikely(gfpflags & __GFP_ZERO) && object)
+	if (unlikely(slab_want_init_on_alloc(gfpflags, s)) && object)
 		memset(object, 0, s->object_size);
 
 	slab_post_alloc_hook(s, gfpflags, 1, &object);
@@ -3151,7 +3181,7 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
 	local_irq_enable();
 
 	/* Clear memory outside IRQ disabled fastpath loop */
-	if (unlikely(flags & __GFP_ZERO)) {
+	if (unlikely(slab_want_init_on_alloc(flags, s))) {
 		int j;
 
 		for (j = 0; j < i; j++)
diff --git a/net/core/sock.c b/net/core/sock.c
index 3e073ca6138f..d57b0cc995a0 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1597,7 +1597,7 @@ static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
 		sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
 		if (!sk)
 			return sk;
-		if (priority & __GFP_ZERO)
+		if (want_init_on_alloc(priority))
 			sk_prot_clear_nulls(sk, prot->obj_size);
 	} else
 		sk = kmalloc(prot->obj_size, priority);
diff --git a/security/Kconfig.hardening b/security/Kconfig.hardening
index c6cb2d9b2905..a1ffe2eb4d5f 100644
--- a/security/Kconfig.hardening
+++ b/security/Kconfig.hardening
@@ -160,6 +160,35 @@ config STACKLEAK_RUNTIME_DISABLE
 	  runtime to control kernel stack erasing for kernels built with
 	  CONFIG_GCC_PLUGIN_STACKLEAK.
 
+config INIT_ON_ALLOC_DEFAULT_ON
+	bool "Enable heap memory zeroing on allocation by default"
+	help
+	  This has the effect of setting "init_on_alloc=1" on the kernel
+	  command line. This can be disabled with "init_on_alloc=0".
+	  When "init_on_alloc" is enabled, all page allocator and slab
+	  allocator memory will be zeroed when allocated, eliminating
+	  many kinds of "uninitialized heap memory" flaws, especially
+	  heap content exposures. The performance impact varies by
+	  workload, but most cases see <1% impact. Some synthetic
+	  workloads have measured as high as 7%.
+
+config INIT_ON_FREE_DEFAULT_ON
+	bool "Enable heap memory zeroing on free by default"
+	help
+	  This has the effect of setting "init_on_free=1" on the kernel
+	  command line. This can be disabled with "init_on_free=0".
+	  Similar to "init_on_alloc", when "init_on_free" is enabled,
+	  all page allocator and slab allocator memory will be zeroed
+	  when freed, eliminating many kinds of "uninitialized heap memory"
+	  flaws, especially heap content exposures. The primary difference
+	  with "init_on_free" is that data lifetime in memory is reduced,
+	  as anything freed is wiped immediately, making live forensics or
+	  cold boot memory attacks unable to recover freed memory contents.
+	  The performance impact varies by workload, but is more expensive
+	  than "init_on_alloc" due to the negative cache effects of
+	  touching "cold" memory areas. Most cases see 3-5% impact. Some
+	  synthetic workloads have measured as high as 8%.
+
 endmenu
 
 endmenu
-- 
cgit v1.2.3


From 97105f0ab7b877a8ece2005e214894e93793950c Mon Sep 17 00:00:00 2001
From: Roman Gushchin <guro@fb.com>
Date: Thu, 11 Jul 2019 21:00:13 -0700
Subject: mm: vmalloc: show number of vmalloc pages in /proc/meminfo

Vmalloc() is getting more and more used these days (kernel stacks, bpf and
percpu allocator are new top users), and the total % of memory consumed by
vmalloc() can be pretty significant and changes dynamically.

/proc/meminfo is the best place to display this information: its top goal
is to show top consumers of the memory.

Since the VmallocUsed field in /proc/meminfo has not been in use for quite a
long time (it has been defined to 0 by a5ad88ce8c7f ("mm: get rid of
'vmalloc_info' from /proc/meminfo")), let's reuse it for showing the
actual physical memory consumption of vmalloc().

Link: http://lkml.kernel.org/r/20190417194002.12369-3-guro@fb.com
Signed-off-by: Roman Gushchin <guro@fb.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Reviewed-by: Andrew Morton <akpm@linux-foundation.org>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Minchan Kim <minchan@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/meminfo.c       |  2 +-
 include/linux/vmalloc.h |  2 ++
 mm/vmalloc.c            | 10 ++++++++++
 3 files changed, 13 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index 568d90e17c17..465ea0153b2a 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -120,7 +120,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
 	show_val_kb(m, "Committed_AS:   ", committed);
 	seq_printf(m, "VmallocTotal:   %8lu kB\n",
 		   (unsigned long)VMALLOC_TOTAL >> 10);
-	show_val_kb(m, "VmallocUsed:    ", 0ul);
+	show_val_kb(m, "VmallocUsed:    ", vmalloc_nr_pages());
 	show_val_kb(m, "VmallocChunk:   ", 0ul);
 	show_val_kb(m, "Percpu:         ", pcpu_nr_pages());
 
diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index 51e131245379..9b21d0047710 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -72,10 +72,12 @@ extern void vm_unmap_aliases(void);
 
 #ifdef CONFIG_MMU
 extern void __init vmalloc_init(void);
+extern unsigned long vmalloc_nr_pages(void);
 #else
 static inline void vmalloc_init(void)
 {
 }
+static inline unsigned long vmalloc_nr_pages(void) { return 0; }
 #endif
 
 extern void *vmalloc(unsigned long size);
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index edb212298c8a..4fa8d84599b0 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -406,6 +406,13 @@ static void purge_vmap_area_lazy(void);
 static BLOCKING_NOTIFIER_HEAD(vmap_notify_list);
 static unsigned long lazy_max_pages(void);
 
+static atomic_long_t nr_vmalloc_pages;
+
+unsigned long vmalloc_nr_pages(void)
+{
+	return atomic_long_read(&nr_vmalloc_pages);
+}
+
 static struct vmap_area *__find_vmap_area(unsigned long addr)
 {
 	struct rb_node *n = vmap_area_root.rb_node;
@@ -2237,6 +2244,7 @@ static void __vunmap(const void *addr, int deallocate_pages)
 			BUG_ON(!page);
 			__free_pages(page, 0);
 		}
+		atomic_long_sub(area->nr_pages, &nr_vmalloc_pages);
 
 		kvfree(area->pages);
 	}
@@ -2414,12 +2422,14 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
 		if (unlikely(!page)) {
 			/* Successfully allocated i pages, free them in __vunmap() */
 			area->nr_pages = i;
+			atomic_long_add(area->nr_pages, &nr_vmalloc_pages);
 			goto fail;
 		}
 		area->pages[i] = page;
 		if (gfpflags_allow_blocking(gfp_mask|highmem_mask))
 			cond_resched();
 	}
+	atomic_long_add(area->nr_pages, &nr_vmalloc_pages);
 
 	if (map_vm_area(area, prot, pages))
 		goto fail;
-- 
cgit v1.2.3


From 6ba749ee78ef42ffdf4b95c042fc574a37d229d9 Mon Sep 17 00:00:00 2001
From: Shakeel Butt <shakeelb@google.com>
Date: Thu, 11 Jul 2019 21:00:26 -0700
Subject: mm, oom: remove redundant task_in_mem_cgroup() check

oom_unkillable_task() can be called from three different contexts i.e.
global OOM, memcg OOM and oom_score procfs interface.  At the moment
oom_unkillable_task() does a task_in_mem_cgroup() check on the given
process.  Since there is no reason to perform task_in_mem_cgroup()
check for global OOM and oom_score procfs interface, those contexts
provide NULL memcg and skip the task_in_mem_cgroup() check.  However
for memcg OOM context, the oom_unkillable_task() is always called from
mem_cgroup_scan_tasks() and thus task_in_mem_cgroup() check becomes
redundant and effectively dead code.  So, just remove the
task_in_mem_cgroup() check altogether.

Link: http://lkml.kernel.org/r/20190624212631.87212-2-shakeelb@google.com
Signed-off-by: Shakeel Butt <shakeelb@google.com>
Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Acked-by: Roman Gushchin <guro@fb.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Nick Piggin <npiggin@suse.de>
Cc: Paul Jackson <pj@sgi.com>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/base.c             |  2 +-
 include/linux/memcontrol.h |  7 -------
 include/linux/oom.h        |  2 +-
 mm/memcontrol.c            | 26 --------------------------
 mm/oom_kill.c              | 19 +++++++------------
 5 files changed, 9 insertions(+), 47 deletions(-)

(limited to 'include/linux')

diff --git a/fs/proc/base.c b/fs/proc/base.c
index 534fb1ae498a..64dadd469786 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -532,7 +532,7 @@ static int proc_oom_score(struct seq_file *m, struct pid_namespace *ns,
 	unsigned long totalpages = totalram_pages() + total_swap_pages;
 	unsigned long points = 0;
 
-	points = oom_badness(task, NULL, NULL, totalpages) *
+	points = oom_badness(task, NULL, totalpages) *
 					1000 / totalpages;
 	seq_printf(m, "%lu\n", points);
 
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 68402842c337..44c41462be33 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -394,7 +394,6 @@ out:
 
 struct lruvec *mem_cgroup_page_lruvec(struct page *, struct pglist_data *);
 
-bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg);
 struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p);
 
 struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm);
@@ -875,12 +874,6 @@ static inline bool mm_match_cgroup(struct mm_struct *mm,
 	return true;
 }
 
-static inline bool task_in_mem_cgroup(struct task_struct *task,
-				      const struct mem_cgroup *memcg)
-{
-	return true;
-}
-
 static inline struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
 {
 	return NULL;
diff --git a/include/linux/oom.h b/include/linux/oom.h
index d07992009265..b75104690311 100644
--- a/include/linux/oom.h
+++ b/include/linux/oom.h
@@ -108,7 +108,7 @@ static inline vm_fault_t check_stable_address_space(struct mm_struct *mm)
 bool __oom_reap_task_mm(struct mm_struct *mm);
 
 extern unsigned long oom_badness(struct task_struct *p,
-		struct mem_cgroup *memcg, const nodemask_t *nodemask,
+		const nodemask_t *nodemask,
 		unsigned long totalpages);
 
 extern bool out_of_memory(struct oom_control *oc);
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 773ae5674e12..4f05735b02d3 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1259,32 +1259,6 @@ void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
 		*lru_size += nr_pages;
 }
 
-bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg)
-{
-	struct mem_cgroup *task_memcg;
-	struct task_struct *p;
-	bool ret;
-
-	p = find_lock_task_mm(task);
-	if (p) {
-		task_memcg = get_mem_cgroup_from_mm(p->mm);
-		task_unlock(p);
-	} else {
-		/*
-		 * All threads may have already detached their mm's, but the oom
-		 * killer still needs to detect if they have already been oom
-		 * killed to prevent needlessly killing additional tasks.
-		 */
-		rcu_read_lock();
-		task_memcg = mem_cgroup_from_task(task);
-		css_get(&task_memcg->css);
-		rcu_read_unlock();
-	}
-	ret = mem_cgroup_is_descendant(task_memcg, memcg);
-	css_put(&task_memcg->css);
-	return ret;
-}
-
 /**
  * mem_cgroup_margin - calculate chargeable space of a memory cgroup
  * @memcg: the memory cgroup
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 59326614508a..b353f468a36a 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -153,17 +153,13 @@ static inline bool is_memcg_oom(struct oom_control *oc)
 
 /* return true if the task is not adequate as candidate victim task. */
 static bool oom_unkillable_task(struct task_struct *p,
-		struct mem_cgroup *memcg, const nodemask_t *nodemask)
+				const nodemask_t *nodemask)
 {
 	if (is_global_init(p))
 		return true;
 	if (p->flags & PF_KTHREAD)
 		return true;
 
-	/* When mem_cgroup_out_of_memory() and p is not member of the group */
-	if (memcg && !task_in_mem_cgroup(p, memcg))
-		return true;
-
 	/* p may not have freeable memory in nodemask */
 	if (!has_intersects_mems_allowed(p, nodemask))
 		return true;
@@ -194,20 +190,19 @@ static bool is_dump_unreclaim_slabs(void)
  * oom_badness - heuristic function to determine which candidate task to kill
  * @p: task struct of which task we should calculate
  * @totalpages: total present RAM allowed for page allocation
- * @memcg: task's memory controller, if constrained
  * @nodemask: nodemask passed to page allocator for mempolicy ooms
  *
  * The heuristic for determining which task to kill is made to be as simple and
  * predictable as possible.  The goal is to return the highest value for the
  * task consuming the most memory to avoid subsequent oom failures.
  */
-unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
+unsigned long oom_badness(struct task_struct *p,
 			  const nodemask_t *nodemask, unsigned long totalpages)
 {
 	long points;
 	long adj;
 
-	if (oom_unkillable_task(p, memcg, nodemask))
+	if (oom_unkillable_task(p, nodemask))
 		return 0;
 
 	p = find_lock_task_mm(p);
@@ -318,7 +313,7 @@ static int oom_evaluate_task(struct task_struct *task, void *arg)
 	struct oom_control *oc = arg;
 	unsigned long points;
 
-	if (oom_unkillable_task(task, NULL, oc->nodemask))
+	if (oom_unkillable_task(task, oc->nodemask))
 		goto next;
 
 	/*
@@ -342,7 +337,7 @@ static int oom_evaluate_task(struct task_struct *task, void *arg)
 		goto select;
 	}
 
-	points = oom_badness(task, NULL, oc->nodemask, oc->totalpages);
+	points = oom_badness(task, oc->nodemask, oc->totalpages);
 	if (!points || points < oc->chosen_points)
 		goto next;
 
@@ -387,7 +382,7 @@ static int dump_task(struct task_struct *p, void *arg)
 	struct oom_control *oc = arg;
 	struct task_struct *task;
 
-	if (oom_unkillable_task(p, NULL, oc->nodemask))
+	if (oom_unkillable_task(p, oc->nodemask))
 		return 0;
 
 	task = find_lock_task_mm(p);
@@ -1084,7 +1079,7 @@ bool out_of_memory(struct oom_control *oc)
 	check_panic_on_oom(oc);
 
 	if (!is_memcg_oom(oc) && sysctl_oom_kill_allocating_task &&
-	    current->mm && !oom_unkillable_task(current, NULL, oc->nodemask) &&
+	    current->mm && !oom_unkillable_task(current, oc->nodemask) &&
 	    current->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) {
 		get_task_struct(current);
 		oc->chosen = current;
-- 
cgit v1.2.3


From ac311a14c682dcd8a120a6244d0542ec654e3d93 Mon Sep 17 00:00:00 2001
From: Shakeel Butt <shakeelb@google.com>
Date: Thu, 11 Jul 2019 21:00:31 -0700
Subject: oom: decouple mems_allowed from oom_unkillable_task

Commit ef08e3b4981a ("[PATCH] cpusets: confine oom_killer to
mem_exclusive cpuset") introduces a heuristic where a potential
oom-killer victim is skipped if the intersection of the potential victim
and the current (the process that triggered the oom) is empty based on the
reason that killing such victim most probably will not help the current
allocating process.

However the commit 7887a3da753e ("[PATCH] oom: cpuset hint") changed the
heuristic to just decrease the oom_badness scores of such potential
victim based on the reason that the cpuset of such processes might have
changed and previously they may have allocated memory on mems where the
current allocating process can allocate from.

Unintentionally 7887a3da753e ("[PATCH] oom: cpuset hint") introduced a
side effect as the oom_badness is also exposed to the user space through
/proc/[pid]/oom_score, so, readers with different cpusets can read
different oom_score of the same process.

Later, commit 6cf86ac6f36b ("oom: filter tasks not sharing the same
cpuset") fixed the side effect introduced by 7887a3da753e by moving the
cpuset intersection back to only oom-killer context and out of
oom_badness.  However the combination of ab290adbaf8f ("oom: make
oom_unkillable_task() helper function") and 26ebc984913b ("oom:
/proc/<pid>/oom_score treat kernel thread honestly") unintentionally
brought back the cpuset intersection check into the oom_badness
calculation function.

Other than doing cpuset/mempolicy intersection from oom_badness, the memcg
oom context is also doing cpuset/mempolicy intersection which is quite
wrong and is caught by syzkaller with the following report:

kasan: CONFIG_KASAN_INLINE enabled
kasan: GPF could be caused by NULL-ptr deref or user memory access
general protection fault: 0000 [#1] PREEMPT SMP KASAN
CPU: 0 PID: 28426 Comm: syz-executor.5 Not tainted 5.2.0-rc3-next-20190607
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS
Google 01/01/2011
RIP: 0010:__read_once_size include/linux/compiler.h:194 [inline]
RIP: 0010:has_intersects_mems_allowed mm/oom_kill.c:84 [inline]
RIP: 0010:oom_unkillable_task mm/oom_kill.c:168 [inline]
RIP: 0010:oom_unkillable_task+0x180/0x400 mm/oom_kill.c:155
Code: c1 ea 03 80 3c 02 00 0f 85 80 02 00 00 4c 8b a3 10 07 00 00 48 b8 00
00 00 00 00 fc ff df 4d 8d 74 24 10 4c 89 f2 48 c1 ea 03 <80> 3c 02 00 0f
85 67 02 00 00 49 8b 44 24 10 4c 8d a0 68 fa ff ff
RSP: 0018:ffff888000127490 EFLAGS: 00010a03
RAX: dffffc0000000000 RBX: ffff8880a4cd5438 RCX: ffffffff818dae9c
RDX: 100000000c3cc602 RSI: ffffffff818dac8d RDI: 0000000000000001
RBP: ffff8880001274d0 R08: ffff888000086180 R09: ffffed1015d26be0
R10: ffffed1015d26bdf R11: ffff8880ae935efb R12: 8000000061e63007
R13: 0000000000000000 R14: 8000000061e63017 R15: 1ffff11000024ea6
FS:  00005555561f5940(0000) GS:ffff8880ae800000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 0000000000607304 CR3: 000000009237e000 CR4: 00000000001426f0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000600
Call Trace:
  oom_evaluate_task+0x49/0x520 mm/oom_kill.c:321
  mem_cgroup_scan_tasks+0xcc/0x180 mm/memcontrol.c:1169
  select_bad_process mm/oom_kill.c:374 [inline]
  out_of_memory mm/oom_kill.c:1088 [inline]
  out_of_memory+0x6b2/0x1280 mm/oom_kill.c:1035
  mem_cgroup_out_of_memory+0x1ca/0x230 mm/memcontrol.c:1573
  mem_cgroup_oom mm/memcontrol.c:1905 [inline]
  try_charge+0xfbe/0x1480 mm/memcontrol.c:2468
  mem_cgroup_try_charge+0x24d/0x5e0 mm/memcontrol.c:6073
  mem_cgroup_try_charge_delay+0x1f/0xa0 mm/memcontrol.c:6088
  do_huge_pmd_wp_page_fallback+0x24f/0x1680 mm/huge_memory.c:1201
  do_huge_pmd_wp_page+0x7fc/0x2160 mm/huge_memory.c:1359
  wp_huge_pmd mm/memory.c:3793 [inline]
  __handle_mm_fault+0x164c/0x3eb0 mm/memory.c:4006
  handle_mm_fault+0x3b7/0xa90 mm/memory.c:4053
  do_user_addr_fault arch/x86/mm/fault.c:1455 [inline]
  __do_page_fault+0x5ef/0xda0 arch/x86/mm/fault.c:1521
  do_page_fault+0x71/0x57d arch/x86/mm/fault.c:1552
  page_fault+0x1e/0x30 arch/x86/entry/entry_64.S:1156
RIP: 0033:0x400590
Code: 06 e9 49 01 00 00 48 8b 44 24 10 48 0b 44 24 28 75 1f 48 8b 14 24 48
8b 7c 24 20 be 04 00 00 00 e8 f5 56 00 00 48 8b 74 24 08 <89> 06 e9 1e 01
00 00 48 8b 44 24 08 48 8b 14 24 be 04 00 00 00 8b
RSP: 002b:00007fff7bc49780 EFLAGS: 00010206
RAX: 0000000000000001 RBX: 0000000000760000 RCX: 0000000000000000
RDX: 0000000000000000 RSI: 000000002000cffc RDI: 0000000000000001
RBP: fffffffffffffffe R08: 0000000000000000 R09: 0000000000000000
R10: 0000000000000075 R11: 0000000000000246 R12: 0000000000760008
R13: 00000000004c55f2 R14: 0000000000000000 R15: 00007fff7bc499b0
Modules linked in:
---[ end trace a65689219582ffff ]---
RIP: 0010:__read_once_size include/linux/compiler.h:194 [inline]
RIP: 0010:has_intersects_mems_allowed mm/oom_kill.c:84 [inline]
RIP: 0010:oom_unkillable_task mm/oom_kill.c:168 [inline]
RIP: 0010:oom_unkillable_task+0x180/0x400 mm/oom_kill.c:155
Code: c1 ea 03 80 3c 02 00 0f 85 80 02 00 00 4c 8b a3 10 07 00 00 48 b8 00
00 00 00 00 fc ff df 4d 8d 74 24 10 4c 89 f2 48 c1 ea 03 <80> 3c 02 00 0f
85 67 02 00 00 49 8b 44 24 10 4c 8d a0 68 fa ff ff
RSP: 0018:ffff888000127490 EFLAGS: 00010a03
RAX: dffffc0000000000 RBX: ffff8880a4cd5438 RCX: ffffffff818dae9c
RDX: 100000000c3cc602 RSI: ffffffff818dac8d RDI: 0000000000000001
RBP: ffff8880001274d0 R08: ffff888000086180 R09: ffffed1015d26be0
R10: ffffed1015d26bdf R11: ffff8880ae935efb R12: 8000000061e63007
R13: 0000000000000000 R14: 8000000061e63017 R15: 1ffff11000024ea6
FS:  00005555561f5940(0000) GS:ffff8880ae800000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 0000001b2f823000 CR3: 000000009237e000 CR4: 00000000001426f0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000600

The fix is to decouple the cpuset/mempolicy intersection check from
oom_unkillable_task() and make sure cpuset/mempolicy intersection check is
only done in the global oom context.

[shakeelb@google.com: change function name and update comment]
  Link: http://lkml.kernel.org/r/20190628152421.198994-3-shakeelb@google.com
Link: http://lkml.kernel.org/r/20190624212631.87212-3-shakeelb@google.com
Signed-off-by: Shakeel Butt <shakeelb@google.com>
Reported-by: syzbot+d0fc9d3c166bc5e4a94b@syzkaller.appspotmail.com
Acked-by: Roman Gushchin <guro@fb.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Nick Piggin <npiggin@suse.de>
Cc: Paul Jackson <pj@sgi.com>
Cc: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/base.c      |  3 +--
 include/linux/oom.h |  1 -
 mm/oom_kill.c       | 57 ++++++++++++++++++++++++++++++-----------------------
 3 files changed, 33 insertions(+), 28 deletions(-)

(limited to 'include/linux')

diff --git a/fs/proc/base.c b/fs/proc/base.c
index 64dadd469786..77eb628ecc7f 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -532,8 +532,7 @@ static int proc_oom_score(struct seq_file *m, struct pid_namespace *ns,
 	unsigned long totalpages = totalram_pages() + total_swap_pages;
 	unsigned long points = 0;
 
-	points = oom_badness(task, NULL, totalpages) *
-					1000 / totalpages;
+	points = oom_badness(task, totalpages) * 1000 / totalpages;
 	seq_printf(m, "%lu\n", points);
 
 	return 0;
diff --git a/include/linux/oom.h b/include/linux/oom.h
index b75104690311..c696c265f019 100644
--- a/include/linux/oom.h
+++ b/include/linux/oom.h
@@ -108,7 +108,6 @@ static inline vm_fault_t check_stable_address_space(struct mm_struct *mm)
 bool __oom_reap_task_mm(struct mm_struct *mm);
 
 extern unsigned long oom_badness(struct task_struct *p,
-		const nodemask_t *nodemask,
 		unsigned long totalpages);
 
 extern bool out_of_memory(struct oom_control *oc);
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index b353f468a36a..d1c9c4e66d59 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -64,21 +64,33 @@ int sysctl_oom_dump_tasks = 1;
  */
 DEFINE_MUTEX(oom_lock);
 
+static inline bool is_memcg_oom(struct oom_control *oc)
+{
+	return oc->memcg != NULL;
+}
+
 #ifdef CONFIG_NUMA
 /**
- * has_intersects_mems_allowed() - check task eligiblity for kill
+ * oom_cpuset_eligible() - check task eligibility for kill
  * @start: task struct of which task to consider
  * @mask: nodemask passed to page allocator for mempolicy ooms
  *
  * Task eligibility is determined by whether or not a candidate task, @tsk,
  * shares the same mempolicy nodes as current if it is bound by such a policy
  * and whether or not it has the same set of allowed cpuset nodes.
+ *
+ * This function is assuming oom-killer context and 'current' has triggered
+ * the oom-killer.
  */
-static bool has_intersects_mems_allowed(struct task_struct *start,
-					const nodemask_t *mask)
+static bool oom_cpuset_eligible(struct task_struct *start,
+				struct oom_control *oc)
 {
 	struct task_struct *tsk;
 	bool ret = false;
+	const nodemask_t *mask = oc->nodemask;
+
+	if (is_memcg_oom(oc))
+		return true;
 
 	rcu_read_lock();
 	for_each_thread(start, tsk) {
@@ -105,8 +117,7 @@ static bool has_intersects_mems_allowed(struct task_struct *start,
 	return ret;
 }
 #else
-static bool has_intersects_mems_allowed(struct task_struct *tsk,
-					const nodemask_t *mask)
+static bool oom_cpuset_eligible(struct task_struct *tsk, struct oom_control *oc)
 {
 	return true;
 }
@@ -146,24 +157,13 @@ static inline bool is_sysrq_oom(struct oom_control *oc)
 	return oc->order == -1;
 }
 
-static inline bool is_memcg_oom(struct oom_control *oc)
-{
-	return oc->memcg != NULL;
-}
-
 /* return true if the task is not adequate as candidate victim task. */
-static bool oom_unkillable_task(struct task_struct *p,
-				const nodemask_t *nodemask)
+static bool oom_unkillable_task(struct task_struct *p)
 {
 	if (is_global_init(p))
 		return true;
 	if (p->flags & PF_KTHREAD)
 		return true;
-
-	/* p may not have freeable memory in nodemask */
-	if (!has_intersects_mems_allowed(p, nodemask))
-		return true;
-
 	return false;
 }
 
@@ -190,19 +190,17 @@ static bool is_dump_unreclaim_slabs(void)
  * oom_badness - heuristic function to determine which candidate task to kill
  * @p: task struct of which task we should calculate
  * @totalpages: total present RAM allowed for page allocation
- * @nodemask: nodemask passed to page allocator for mempolicy ooms
  *
  * The heuristic for determining which task to kill is made to be as simple and
  * predictable as possible.  The goal is to return the highest value for the
  * task consuming the most memory to avoid subsequent oom failures.
  */
-unsigned long oom_badness(struct task_struct *p,
-			  const nodemask_t *nodemask, unsigned long totalpages)
+unsigned long oom_badness(struct task_struct *p, unsigned long totalpages)
 {
 	long points;
 	long adj;
 
-	if (oom_unkillable_task(p, nodemask))
+	if (oom_unkillable_task(p))
 		return 0;
 
 	p = find_lock_task_mm(p);
@@ -313,7 +311,11 @@ static int oom_evaluate_task(struct task_struct *task, void *arg)
 	struct oom_control *oc = arg;
 	unsigned long points;
 
-	if (oom_unkillable_task(task, oc->nodemask))
+	if (oom_unkillable_task(task))
+		goto next;
+
+	/* p may not have freeable memory in nodemask */
+	if (!is_memcg_oom(oc) && !oom_cpuset_eligible(task, oc))
 		goto next;
 
 	/*
@@ -337,7 +339,7 @@ static int oom_evaluate_task(struct task_struct *task, void *arg)
 		goto select;
 	}
 
-	points = oom_badness(task, oc->nodemask, oc->totalpages);
+	points = oom_badness(task, oc->totalpages);
 	if (!points || points < oc->chosen_points)
 		goto next;
 
@@ -382,7 +384,11 @@ static int dump_task(struct task_struct *p, void *arg)
 	struct oom_control *oc = arg;
 	struct task_struct *task;
 
-	if (oom_unkillable_task(p, oc->nodemask))
+	if (oom_unkillable_task(p))
+		return 0;
+
+	/* p may not have freeable memory in nodemask */
+	if (!is_memcg_oom(oc) && !oom_cpuset_eligible(p, oc))
 		return 0;
 
 	task = find_lock_task_mm(p);
@@ -1079,7 +1085,8 @@ bool out_of_memory(struct oom_control *oc)
 	check_panic_on_oom(oc);
 
 	if (!is_memcg_oom(oc) && sysctl_oom_kill_allocating_task &&
-	    current->mm && !oom_unkillable_task(current, oc->nodemask) &&
+	    current->mm && !oom_unkillable_task(current) &&
+	    oom_cpuset_eligible(current, oc) &&
 	    current->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) {
 		get_task_struct(current);
 		oc->chosen = current;
-- 
cgit v1.2.3


From 54638c6eaf445ecf901128599cfeb4620be47d2f Mon Sep 17 00:00:00 2001
From: Denis Efremov <efremov@linux.com>
Date: Wed, 10 Jul 2019 21:03:24 +0300
Subject: net: phy: make exported variables non-static

The variables phy_basic_ports_array, phy_fibre_port_array and
phy_all_ports_features_array are declared static and marked
EXPORT_SYMBOL_GPL(), which is at best an odd combination.
Since it was decided that these variables are part of the API, this commit
removes the static qualifiers and adds their declarations to the header.

Fixes: 3c1bcc8614db ("net: ethernet: Convert phydev advertize and supported from u32 to link mode")
Signed-off-by: Denis Efremov <efremov@linux.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/phy_device.c | 6 +++---
 include/linux/phy.h          | 3 +++
 2 files changed, 6 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c
index 53878908adf4..6b5cb87f3866 100644
--- a/drivers/net/phy/phy_device.c
+++ b/drivers/net/phy/phy_device.c
@@ -56,19 +56,19 @@ EXPORT_SYMBOL_GPL(phy_10gbit_features);
 __ETHTOOL_DECLARE_LINK_MODE_MASK(phy_10gbit_fec_features) __ro_after_init;
 EXPORT_SYMBOL_GPL(phy_10gbit_fec_features);
 
-static const int phy_basic_ports_array[] = {
+const int phy_basic_ports_array[3] = {
 	ETHTOOL_LINK_MODE_Autoneg_BIT,
 	ETHTOOL_LINK_MODE_TP_BIT,
 	ETHTOOL_LINK_MODE_MII_BIT,
 };
 EXPORT_SYMBOL_GPL(phy_basic_ports_array);
 
-static const int phy_fibre_port_array[] = {
+const int phy_fibre_port_array[1] = {
 	ETHTOOL_LINK_MODE_FIBRE_BIT,
 };
 EXPORT_SYMBOL_GPL(phy_fibre_port_array);
 
-static const int phy_all_ports_features_array[] = {
+const int phy_all_ports_features_array[7] = {
 	ETHTOOL_LINK_MODE_Autoneg_BIT,
 	ETHTOOL_LINK_MODE_TP_BIT,
 	ETHTOOL_LINK_MODE_MII_BIT,
diff --git a/include/linux/phy.h b/include/linux/phy.h
index 1739c6dc470e..462b90b73f93 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -55,6 +55,9 @@ extern __ETHTOOL_DECLARE_LINK_MODE_MASK(phy_10gbit_full_features) __ro_after_ini
 #define PHY_10GBIT_FEC_FEATURES ((unsigned long *)&phy_10gbit_fec_features)
 #define PHY_10GBIT_FULL_FEATURES ((unsigned long *)&phy_10gbit_full_features)
 
+extern const int phy_basic_ports_array[3];
+extern const int phy_fibre_port_array[1];
+extern const int phy_all_ports_features_array[7];
 extern const int phy_10_100_features_array[4];
 extern const int phy_basic_t1_features_array[2];
 extern const int phy_gbit_features_array[2];
-- 
cgit v1.2.3


From 8a58ddae23796c733c5dfbd717538d89d036c5bd Mon Sep 17 00:00:00 2001
From: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Date: Mon, 1 Jul 2019 14:07:55 +0300
Subject: perf/core: Fix exclusive events' grouping

So far, we tried to disallow grouping exclusive events for the fear of
complications they would cause with moving between contexts. Specifically,
moving a software group to a hardware context would violate the exclusivity
rules if both groups contain matching exclusive events.

This attempt was, however, unsuccessful: the check that we have in the
perf_event_open() syscall is both wrong (looks at wrong PMU) and
insufficient (group leader may still be exclusive), as can be illustrated
by running:

  $ perf record -e '{intel_pt//,cycles}' uname
  $ perf record -e '{cycles,intel_pt//}' uname

ultimately successfully.

Furthermore, we are completely free to trigger the exclusivity violation
by:

   perf -e '{cycles,intel_pt//}' -e '{intel_pt//,instructions}'

even though the helpful perf record will not allow that, the ABI will.

The warning later in the perf_event_open() path will also not trigger, because
it's also wrong.

Fix all this by validating the original group before moving, getting rid
of broken safeguards and placing a useful one to perf_install_in_context().

Signed-off-by: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: <stable@vger.kernel.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Cc: mathieu.poirier@linaro.org
Cc: will.deacon@arm.com
Fixes: bed5b25ad9c8a ("perf: Add a pmu capability for "exclusive" events")
Link: https://lkml.kernel.org/r/20190701110755.24646-1-alexander.shishkin@linux.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/perf_event.h |  5 +++++
 kernel/events/core.c       | 34 ++++++++++++++++++++++------------
 2 files changed, 27 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 16e38c286d46..e8ad3c590a23 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -1055,6 +1055,11 @@ static inline int in_software_context(struct perf_event *event)
 	return event->ctx->pmu->task_ctx_nr == perf_sw_context;
 }
 
+static inline int is_exclusive_pmu(struct pmu *pmu)
+{
+	return pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE;
+}
+
 extern struct static_key perf_swevent_enabled[PERF_COUNT_SW_MAX];
 
 extern void ___perf_sw_event(u32, u64, struct pt_regs *, u64);
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 5dd19bedbf64..eea9d52b010c 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -2553,6 +2553,9 @@ unlock:
 	return ret;
 }
 
+static bool exclusive_event_installable(struct perf_event *event,
+					struct perf_event_context *ctx);
+
 /*
  * Attach a performance event to a context.
  *
@@ -2567,6 +2570,8 @@ perf_install_in_context(struct perf_event_context *ctx,
 
 	lockdep_assert_held(&ctx->mutex);
 
+	WARN_ON_ONCE(!exclusive_event_installable(event, ctx));
+
 	if (event->cpu != -1)
 		event->cpu = cpu;
 
@@ -4360,7 +4365,7 @@ static int exclusive_event_init(struct perf_event *event)
 {
 	struct pmu *pmu = event->pmu;
 
-	if (!(pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE))
+	if (!is_exclusive_pmu(pmu))
 		return 0;
 
 	/*
@@ -4391,7 +4396,7 @@ static void exclusive_event_destroy(struct perf_event *event)
 {
 	struct pmu *pmu = event->pmu;
 
-	if (!(pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE))
+	if (!is_exclusive_pmu(pmu))
 		return;
 
 	/* see comment in exclusive_event_init() */
@@ -4411,14 +4416,15 @@ static bool exclusive_event_match(struct perf_event *e1, struct perf_event *e2)
 	return false;
 }
 
-/* Called under the same ctx::mutex as perf_install_in_context() */
 static bool exclusive_event_installable(struct perf_event *event,
 					struct perf_event_context *ctx)
 {
 	struct perf_event *iter_event;
 	struct pmu *pmu = event->pmu;
 
-	if (!(pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE))
+	lockdep_assert_held(&ctx->mutex);
+
+	if (!is_exclusive_pmu(pmu))
 		return true;
 
 	list_for_each_entry(iter_event, &ctx->event_list, event_entry) {
@@ -10947,11 +10953,6 @@ SYSCALL_DEFINE5(perf_event_open,
 		goto err_alloc;
 	}
 
-	if ((pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE) && group_leader) {
-		err = -EBUSY;
-		goto err_context;
-	}
-
 	/*
 	 * Look up the group leader (we will attach this event to it):
 	 */
@@ -11039,6 +11040,18 @@ SYSCALL_DEFINE5(perf_event_open,
 				move_group = 0;
 			}
 		}
+
+		/*
+		 * Failure to create exclusive events returns -EBUSY.
+		 */
+		err = -EBUSY;
+		if (!exclusive_event_installable(group_leader, ctx))
+			goto err_locked;
+
+		for_each_sibling_event(sibling, group_leader) {
+			if (!exclusive_event_installable(sibling, ctx))
+				goto err_locked;
+		}
 	} else {
 		mutex_lock(&ctx->mutex);
 	}
@@ -11075,9 +11088,6 @@ SYSCALL_DEFINE5(perf_event_open,
 	 * because we need to serialize with concurrent event creation.
 	 */
 	if (!exclusive_event_installable(event, ctx)) {
-		/* exclusive and group stuff are assumed mutually exclusive */
-		WARN_ON_ONCE(move_group);
-
 		err = -EBUSY;
 		goto err_locked;
 	}
-- 
cgit v1.2.3


From 028b6e8a89de9133a869bb4cd1bc72445b1ec8ca Mon Sep 17 00:00:00 2001
From: "Dmitry V. Levin" <ldv@altlinux.org>
Date: Sun, 14 Jul 2019 19:20:47 +0300
Subject: clone: fix CLONE_PIDFD support

The introduction of clone3 syscall accidentally broke CLONE_PIDFD
support in traditional clone syscall on compat x86 and those
architectures that use do_fork to implement clone syscall.

This bug was found by strace test suite.

Link: https://strace.io/logs/strace/2019-07-12
Fixes: 7f192e3cd316 ("fork: add clone3")
Bisected-and-tested-by: Anatoly Pugachev <matorola@gmail.com>
Signed-off-by: Dmitry V. Levin <ldv@altlinux.org>
Link: https://lore.kernel.org/r/20190714162047.GB10389@altlinux.org
Signed-off-by: Christian Brauner <christian@brauner.io>
---
 arch/x86/ia32/sys_ia32.c   |  4 ++++
 include/linux/sched/task.h |  1 +
 kernel/fork.c              | 17 +++++++++++++++--
 3 files changed, 20 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/ia32/sys_ia32.c b/arch/x86/ia32/sys_ia32.c
index 64a6c952091e..21790307121e 100644
--- a/arch/x86/ia32/sys_ia32.c
+++ b/arch/x86/ia32/sys_ia32.c
@@ -239,6 +239,7 @@ COMPAT_SYSCALL_DEFINE5(x86_clone, unsigned long, clone_flags,
 {
 	struct kernel_clone_args args = {
 		.flags		= (clone_flags & ~CSIGNAL),
+		.pidfd		= parent_tidptr,
 		.child_tid	= child_tidptr,
 		.parent_tid	= parent_tidptr,
 		.exit_signal	= (clone_flags & CSIGNAL),
@@ -246,5 +247,8 @@ COMPAT_SYSCALL_DEFINE5(x86_clone, unsigned long, clone_flags,
 		.tls		= tls_val,
 	};
 
+	if (!legacy_clone_args_valid(&args))
+		return -EINVAL;
+
 	return _do_fork(&args);
 }
diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h
index 109a0df5af39..0497091e40c1 100644
--- a/include/linux/sched/task.h
+++ b/include/linux/sched/task.h
@@ -89,6 +89,7 @@ extern void exit_files(struct task_struct *);
 extern void exit_itimers(struct signal_struct *);
 
 extern long _do_fork(struct kernel_clone_args *kargs);
+extern bool legacy_clone_args_valid(const struct kernel_clone_args *kargs);
 extern long do_fork(unsigned long, unsigned long, unsigned long, int __user *, int __user *);
 struct task_struct *fork_idle(int);
 struct mm_struct *copy_init_mm(void);
diff --git a/kernel/fork.c b/kernel/fork.c
index 8f3e2d97d771..ef1e05a68827 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -2406,6 +2406,16 @@ long _do_fork(struct kernel_clone_args *args)
 	return nr;
 }
 
+bool legacy_clone_args_valid(const struct kernel_clone_args *kargs)
+{
+	/* clone(CLONE_PIDFD) uses parent_tidptr to return a pidfd */
+	if ((kargs->flags & CLONE_PIDFD) &&
+	    (kargs->flags & CLONE_PARENT_SETTID))
+		return false;
+
+	return true;
+}
+
 #ifndef CONFIG_HAVE_COPY_THREAD_TLS
 /* For compatibility with architectures that call do_fork directly rather than
  * using the syscall entry points below. */
@@ -2417,6 +2427,7 @@ long do_fork(unsigned long clone_flags,
 {
 	struct kernel_clone_args args = {
 		.flags		= (clone_flags & ~CSIGNAL),
+		.pidfd		= parent_tidptr,
 		.child_tid	= child_tidptr,
 		.parent_tid	= parent_tidptr,
 		.exit_signal	= (clone_flags & CSIGNAL),
@@ -2424,6 +2435,9 @@ long do_fork(unsigned long clone_flags,
 		.stack_size	= stack_size,
 	};
 
+	if (!legacy_clone_args_valid(&args))
+		return -EINVAL;
+
 	return _do_fork(&args);
 }
 #endif
@@ -2505,8 +2519,7 @@ SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
 		.tls		= tls,
 	};
 
-	/* clone(CLONE_PIDFD) uses parent_tidptr to return a pidfd */
-	if ((clone_flags & CLONE_PIDFD) && (clone_flags & CLONE_PARENT_SETTID))
+	if (!legacy_clone_args_valid(&args))
 		return -EINVAL;
 
 	return _do_fork(&args);
-- 
cgit v1.2.3


From 6dfc43d3a19174faead54575c204aee106225f43 Mon Sep 17 00:00:00 2001
From: Dave Airlie <airlied@redhat.com>
Date: Mon, 15 Jul 2019 15:16:20 +1000
Subject: mm: adjust apply_to_pfn_range interface for dropped token.

The commit "mm/pgtable: drop pgtable_t variable from pte_fn_t functions",
which drops the token, came in via the hmm tree. This caused lots of
conflicts, but applying this cleanup patch should reduce them to
something easier to handle. Just accept that the token is unused at
this point.

Signed-off-by: Dave Airlie <airlied@redhat.com>
---
 include/linux/mm.h    | 2 +-
 mm/as_dirty_helpers.c | 6 ++----
 mm/memory.c           | 6 +++---
 3 files changed, 6 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 798cdda9560e..c45f936bd81c 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2687,7 +2687,7 @@ extern int apply_to_page_range(struct mm_struct *mm, unsigned long address,
 			       unsigned long size, pte_fn_t fn, void *data);
 
 struct pfn_range_apply;
-typedef int (*pter_fn_t)(pte_t *pte, pgtable_t token, unsigned long addr,
+typedef int (*pter_fn_t)(pte_t *pte, unsigned long addr,
 			 struct pfn_range_apply *closure);
 struct pfn_range_apply {
 	struct mm_struct *mm;
diff --git a/mm/as_dirty_helpers.c b/mm/as_dirty_helpers.c
index f600e31534fb..6352a3729408 100644
--- a/mm/as_dirty_helpers.c
+++ b/mm/as_dirty_helpers.c
@@ -26,7 +26,6 @@ struct apply_as {
 /**
  * apply_pt_wrprotect - Leaf pte callback to write-protect a pte
  * @pte: Pointer to the pte
- * @token: Page table token, see apply_to_pfn_range()
  * @addr: The virtual page address
  * @closure: Pointer to a struct pfn_range_apply embedded in a
  * struct apply_as
@@ -36,7 +35,7 @@ struct apply_as {
  *
  * Return: Always zero.
  */
-static int apply_pt_wrprotect(pte_t *pte, pgtable_t token,
+static int apply_pt_wrprotect(pte_t *pte,
 			      unsigned long addr,
 			      struct pfn_range_apply *closure)
 {
@@ -78,7 +77,6 @@ struct apply_as_clean {
 /**
  * apply_pt_clean - Leaf pte callback to clean a pte
  * @pte: Pointer to the pte
- * @token: Page table token, see apply_to_pfn_range()
  * @addr: The virtual page address
  * @closure: Pointer to a struct pfn_range_apply embedded in a
  * struct apply_as_clean
@@ -91,7 +89,7 @@ struct apply_as_clean {
  *
  * Return: Always zero.
  */
-static int apply_pt_clean(pte_t *pte, pgtable_t token,
+static int apply_pt_clean(pte_t *pte,
 			  unsigned long addr,
 			  struct pfn_range_apply *closure)
 {
diff --git a/mm/memory.c b/mm/memory.c
index 462aa47f8878..b8218e962231 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2053,7 +2053,7 @@ static int apply_to_pte_range(struct pfn_range_apply *closure, pmd_t *pmd,
 	token = pmd_pgtable(*pmd);
 
 	do {
-		err = closure->ptefn(pte++, token, addr, closure);
+		err = closure->ptefn(pte++, addr, closure);
 		if (err)
 			break;
 	} while (addr += PAGE_SIZE, addr != end);
@@ -2194,14 +2194,14 @@ struct page_range_apply {
  * Callback wrapper to enable use of apply_to_pfn_range for
  * the apply_to_page_range interface
  */
-static int apply_to_page_range_wrapper(pte_t *pte, pgtable_t token,
+static int apply_to_page_range_wrapper(pte_t *pte,
 				       unsigned long addr,
 				       struct pfn_range_apply *pter)
 {
 	struct page_range_apply *pra =
 		container_of(pter, typeof(*pra), pter);
 
-	return pra->fn(pte, token, addr, pra->data);
+	return pra->fn(pte, NULL, addr, pra->data);
 }
 
 /*
-- 
cgit v1.2.3


From dfd6f9ad36368b8dbd5f5a2b2f0a4705ae69a323 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Fri, 12 Jul 2019 11:01:21 +0200
Subject: ACPI: fix false-positive -Wuninitialized warning

clang gets confused by an uninitialized variable in what looks
to it like a never executed code path:

arch/x86/kernel/acpi/boot.c:618:13: error: variable 'polarity' is uninitialized when used here [-Werror,-Wuninitialized]
        polarity = polarity ? ACPI_ACTIVE_LOW : ACPI_ACTIVE_HIGH;
                   ^~~~~~~~
arch/x86/kernel/acpi/boot.c:606:32: note: initialize the variable 'polarity' to silence this warning
        int rc, irq, trigger, polarity;
                                      ^
                                       = 0
arch/x86/kernel/acpi/boot.c:617:12: error: variable 'trigger' is uninitialized when used here [-Werror,-Wuninitialized]
        trigger = trigger ? ACPI_LEVEL_SENSITIVE : ACPI_EDGE_SENSITIVE;
                  ^~~~~~~
arch/x86/kernel/acpi/boot.c:606:22: note: initialize the variable 'trigger' to silence this warning
        int rc, irq, trigger, polarity;
                            ^
                             = 0

This is unfortunately a design decision in clang and won't be fixed.

Changing the acpi_get_override_irq() macro to an inline function
reliably avoids the issue.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Reviewed-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Reviewed-by: Nathan Chancellor <natechancellor@gmail.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 include/linux/acpi.h | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index 451e7b544342..8309923eafe1 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -324,7 +324,10 @@ struct irq_domain *acpi_irq_create_hierarchy(unsigned int flags,
 #ifdef CONFIG_X86_IO_APIC
 extern int acpi_get_override_irq(u32 gsi, int *trigger, int *polarity);
 #else
-#define acpi_get_override_irq(gsi, trigger, polarity) (-1)
+static inline int acpi_get_override_irq(u32 gsi, int *trigger, int *polarity)
+{
+	return -1;
+}
 #endif
 /*
  * This function undoes the effect of one call to acpi_register_gsi().
-- 
cgit v1.2.3


From 8da04e05cdfc715d414a1c5f8318c03030eb68fb Mon Sep 17 00:00:00 2001
From: Stephen Rothwell <sfr@canb.auug.org.au>
Date: Mon, 15 Jul 2019 09:56:30 +1000
Subject: intel_rapl: need linux/cpuhotplug.h for enum cpuhp_state

Fixes: 7ebf8eff63b4 ("intel_rapl: introduce struct rapl_if_private")
Signed-off-by: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 include/linux/intel_rapl.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/intel_rapl.h b/include/linux/intel_rapl.h
index 0c179d92d110..efb3ce892c20 100644
--- a/include/linux/intel_rapl.h
+++ b/include/linux/intel_rapl.h
@@ -12,6 +12,7 @@
 
 #include <linux/types.h>
 #include <linux/powercap.h>
+#include <linux/cpuhotplug.h>
 
 enum rapl_domain_type {
 	RAPL_DOMAIN_PACKAGE,	/* entire package/socket */
-- 
cgit v1.2.3


From 387b14684f94483cbbb72843db406ec9a8d0d6d2 Mon Sep 17 00:00:00 2001
From: Mauro Carvalho Chehab <mchehab+samsung@kernel.org>
Date: Wed, 10 Apr 2019 08:32:41 -0300
Subject: docs: locking: convert docs to ReST and rename to *.rst

Convert the locking documents to ReST and add them to the
kernel development book where it belongs.

Most of the changes here are just to make Sphinx properly parse
the text files, as they're already in good shape and do not
require massive changes in order to be parsed.

The conversion is actually:
  - add blank lines and indentation in order to identify paragraphs;
  - fix tables markups;
  - add some lists markups;
  - mark literal blocks;
  - adjust title markups.

In the new index.rst, add :orphan: for as long as it is not linked from
the main index.rst file, in order to avoid build warnings.

Signed-off-by: Mauro Carvalho Chehab <mchehab+samsung@kernel.org>
Acked-by: Federico Vaga <federico.vaga@vaga.pv.it>
---
 Documentation/kernel-hacking/locking.rst           |   2 +-
 Documentation/locking/index.rst                    |  24 +
 Documentation/locking/lockdep-design.rst           | 394 ++++++++++++++
 Documentation/locking/lockdep-design.txt           | 389 --------------
 Documentation/locking/lockstat.rst                 | 204 ++++++++
 Documentation/locking/lockstat.txt                 | 183 -------
 Documentation/locking/locktorture.rst              | 170 ++++++
 Documentation/locking/locktorture.txt              | 145 ------
 Documentation/locking/mutex-design.rst             | 152 ++++++
 Documentation/locking/mutex-design.txt             | 142 -----
 Documentation/locking/rt-mutex-design.rst          | 574 +++++++++++++++++++++
 Documentation/locking/rt-mutex-design.txt          | 559 --------------------
 Documentation/locking/rt-mutex.rst                 |  77 +++
 Documentation/locking/rt-mutex.txt                 |  73 ---
 Documentation/locking/spinlocks.rst                | 177 +++++++
 Documentation/locking/spinlocks.txt                | 167 ------
 Documentation/locking/ww-mutex-design.rst          | 393 ++++++++++++++
 Documentation/locking/ww-mutex-design.txt          | 383 --------------
 Documentation/pi-futex.txt                         |   2 +-
 .../translations/it_IT/kernel-hacking/locking.rst  |   2 +-
 drivers/gpu/drm/drm_modeset_lock.c                 |   2 +-
 include/linux/lockdep.h                            |   2 +-
 include/linux/mutex.h                              |   2 +-
 include/linux/rwsem.h                              |   2 +-
 kernel/locking/mutex.c                             |   2 +-
 kernel/locking/rtmutex.c                           |   2 +-
 lib/Kconfig.debug                                  |   4 +-
 27 files changed, 2176 insertions(+), 2052 deletions(-)
 create mode 100644 Documentation/locking/index.rst
 create mode 100644 Documentation/locking/lockdep-design.rst
 delete mode 100644 Documentation/locking/lockdep-design.txt
 create mode 100644 Documentation/locking/lockstat.rst
 delete mode 100644 Documentation/locking/lockstat.txt
 create mode 100644 Documentation/locking/locktorture.rst
 delete mode 100644 Documentation/locking/locktorture.txt
 create mode 100644 Documentation/locking/mutex-design.rst
 delete mode 100644 Documentation/locking/mutex-design.txt
 create mode 100644 Documentation/locking/rt-mutex-design.rst
 delete mode 100644 Documentation/locking/rt-mutex-design.txt
 create mode 100644 Documentation/locking/rt-mutex.rst
 delete mode 100644 Documentation/locking/rt-mutex.txt
 create mode 100644 Documentation/locking/spinlocks.rst
 delete mode 100644 Documentation/locking/spinlocks.txt
 create mode 100644 Documentation/locking/ww-mutex-design.rst
 delete mode 100644 Documentation/locking/ww-mutex-design.txt

(limited to 'include/linux')

diff --git a/Documentation/kernel-hacking/locking.rst b/Documentation/kernel-hacking/locking.rst
index dc698ea456e0..a8518ac0d31d 100644
--- a/Documentation/kernel-hacking/locking.rst
+++ b/Documentation/kernel-hacking/locking.rst
@@ -1364,7 +1364,7 @@ Futex API reference
 Further reading
 ===============
 
--  ``Documentation/locking/spinlocks.txt``: Linus Torvalds' spinlocking
+-  ``Documentation/locking/spinlocks.rst``: Linus Torvalds' spinlocking
    tutorial in the kernel sources.
 
 -  Unix Systems for Modern Architectures: Symmetric Multiprocessing and
diff --git a/Documentation/locking/index.rst b/Documentation/locking/index.rst
new file mode 100644
index 000000000000..ef5da7fe9aac
--- /dev/null
+++ b/Documentation/locking/index.rst
@@ -0,0 +1,24 @@
+:orphan:
+
+=======
+locking
+=======
+
+.. toctree::
+    :maxdepth: 1
+
+    lockdep-design
+    lockstat
+    locktorture
+    mutex-design
+    rt-mutex-design
+    rt-mutex
+    spinlocks
+    ww-mutex-design
+
+.. only::  subproject and html
+
+   Indices
+   =======
+
+   * :ref:`genindex`
diff --git a/Documentation/locking/lockdep-design.rst b/Documentation/locking/lockdep-design.rst
new file mode 100644
index 000000000000..23fcbc4d3fc0
--- /dev/null
+++ b/Documentation/locking/lockdep-design.rst
@@ -0,0 +1,394 @@
+Runtime locking correctness validator
+=====================================
+
+started by Ingo Molnar <mingo@redhat.com>
+
+additions by Arjan van de Ven <arjan@linux.intel.com>
+
+Lock-class
+----------
+
+The basic object the validator operates upon is a 'class' of locks.
+
+A class of locks is a group of locks that are logically the same with
+respect to locking rules, even if the locks may have multiple (possibly
+tens of thousands of) instantiations. For example a lock in the inode
+struct is one class, while each inode has its own instantiation of that
+lock class.
+
+The validator tracks the 'usage state' of lock-classes, and it tracks
+the dependencies between different lock-classes. Lock usage indicates
+how a lock is used with regard to its IRQ contexts, while lock
+dependency can be understood as lock order, where L1 -> L2 suggests that
+a task is attempting to acquire L2 while holding L1. From lockdep's
+perspective, the two locks (L1 and L2) are not necessarily related; that
+dependency just means the order ever happened. The validator maintains a
+continuing effort to prove lock usages and dependencies are correct or
+the validator will shoot a splat if incorrect.
+
+A lock-class's behavior is constructed by its instances collectively:
+when the first instance of a lock-class is used after bootup the class
+gets registered, then all (subsequent) instances will be mapped to the
+class and hence their usages and dependencies will contribute to those of
+the class. A lock-class does not go away when a lock instance does, but
+it can be removed if the memory space of the lock class (static or
+dynamic) is reclaimed, this happens for example when a module is
+unloaded or a workqueue is destroyed.
+
+State
+-----
+
+The validator tracks lock-class usage history and divides the usage into
+(4 usages * n STATEs + 1) categories:
+
+where the 4 usages can be:
+- 'ever held in STATE context'
+- 'ever held as readlock in STATE context'
+- 'ever held with STATE enabled'
+- 'ever held as readlock with STATE enabled'
+
+where the n STATEs are coded in kernel/locking/lockdep_states.h and as of
+now they include:
+- hardirq
+- softirq
+
+where the last 1 category is:
+- 'ever used'                                       [ == !unused        ]
+
+When locking rules are violated, these usage bits are presented in the
+locking error messages, inside curlies, with a total of 2 * n STATEs bits.
+A contrived example::
+
+   modprobe/2287 is trying to acquire lock:
+    (&sio_locks[i].lock){-.-.}, at: [<c02867fd>] mutex_lock+0x21/0x24
+
+   but task is already holding lock:
+    (&sio_locks[i].lock){-.-.}, at: [<c02867fd>] mutex_lock+0x21/0x24
+
+
+For a given lock, the bit positions from left to right indicate the usage
+of the lock and readlock (if exists), for each of the n STATEs listed
+above respectively, and the character displayed at each bit position
+indicates:
+
+   ===  ===================================================
+   '.'  acquired while irqs disabled and not in irq context
+   '-'  acquired in irq context
+   '+'  acquired with irqs enabled
+   '?'  acquired in irq context with irqs enabled.
+   ===  ===================================================
+
+The bits are illustrated with an example::
+
+    (&sio_locks[i].lock){-.-.}, at: [<c02867fd>] mutex_lock+0x21/0x24
+                         ||||
+                         ||| \-> softirq disabled and not in softirq context
+                         || \--> acquired in softirq context
+                         | \---> hardirq disabled and not in hardirq context
+                          \----> acquired in hardirq context
+
+
+For a given STATE, whether the lock is ever acquired in that STATE
+context and whether that STATE is enabled yields four possible cases as
+shown in the table below. The bit character is able to indicate which
+exact case is for the lock as of the reporting time.
+
+  +--------------+-------------+--------------+
+  |              | irq enabled | irq disabled |
+  +--------------+-------------+--------------+
+  | ever in irq  |      ?      |       -      |
+  +--------------+-------------+--------------+
+  | never in irq |      +      |       .      |
+  +--------------+-------------+--------------+
+
+The character '-' suggests irq is disabled because if otherwise the
+character '?' would have been shown instead. Similar deduction can be
+applied for '+' too.
+
+Unused locks (e.g., mutexes) cannot be part of the cause of an error.
+
+
+Single-lock state rules:
+------------------------
+
+A lock being irq-safe means it was ever used in an irq context, while a
+lock being irq-unsafe means it was ever acquired with irq enabled.
+
+A softirq-unsafe lock-class is automatically hardirq-unsafe as well. The
+following states must be exclusive: only one of them is allowed to be set
+for any lock-class based on its usage::
+
+ <hardirq-safe> or <hardirq-unsafe>
+ <softirq-safe> or <softirq-unsafe>
+
+This is because if a lock can be used in irq context (irq-safe) then it
+cannot be ever acquired with irq enabled (irq-unsafe). Otherwise, a
+deadlock may happen. For example, if the context is interrupted after
+this lock was acquired but before it was released, the interrupting
+context may attempt to acquire the lock a second time, which creates a
+deadlock, referred to as lock recursion deadlock.
+
+The validator detects and reports lock usage that violates these
+single-lock state rules.
+
+Multi-lock dependency rules:
+----------------------------
+
+The same lock-class must not be acquired twice, because this could lead
+to lock recursion deadlocks.
+
+Furthermore, two locks can not be taken in inverse order::
+
+ <L1> -> <L2>
+ <L2> -> <L1>
+
+because this could lead to a deadlock - referred to as lock inversion
+deadlock - as attempts to acquire the two locks form a circle which
+could lead to the two contexts waiting for each other permanently. The
+validator will find such dependency circle in arbitrary complexity,
+i.e., there can be any other locking sequence between the acquire-lock
+operations; the validator will still find whether these locks can be
+acquired in a circular fashion.
+
+Furthermore, the following usage based lock dependencies are not allowed
+between any two lock-classes::
+
+   <hardirq-safe>   ->  <hardirq-unsafe>
+   <softirq-safe>   ->  <softirq-unsafe>
+
+The first rule comes from the fact that a hardirq-safe lock could be
+taken by a hardirq context, interrupting a hardirq-unsafe lock - and
+thus could result in a lock inversion deadlock. Likewise, a softirq-safe
+lock could be taken by a softirq context, interrupting a softirq-unsafe
+lock.
+
+The above rules are enforced for any locking sequence that occurs in the
+kernel: when acquiring a new lock, the validator checks whether there is
+any rule violation between the new lock and any of the held locks.
+
+When a lock-class changes its state, the following aspects of the above
+dependency rules are enforced:
+
+- if a new hardirq-safe lock is discovered, we check whether it
+  took any hardirq-unsafe lock in the past.
+
+- if a new softirq-safe lock is discovered, we check whether it took
+  any softirq-unsafe lock in the past.
+
+- if a new hardirq-unsafe lock is discovered, we check whether any
+  hardirq-safe lock took it in the past.
+
+- if a new softirq-unsafe lock is discovered, we check whether any
+  softirq-safe lock took it in the past.
+
+(Again, we do these checks too on the basis that an interrupt context
+could interrupt _any_ of the irq-unsafe or hardirq-unsafe locks, which
+could lead to a lock inversion deadlock - even if that lock scenario did
+not trigger in practice yet.)
+
+Exception: Nested data dependencies leading to nested locking
+-------------------------------------------------------------
+
+There are a few cases where the Linux kernel acquires more than one
+instance of the same lock-class. Such cases typically happen when there
+is some sort of hierarchy within objects of the same type. In these
+cases there is an inherent "natural" ordering between the two objects
+(defined by the properties of the hierarchy), and the kernel grabs the
+locks in this fixed order on each of the objects.
+
+An example of such an object hierarchy that results in "nested locking"
+is that of a "whole disk" block-dev object and a "partition" block-dev
+object; the partition is "part of" the whole device and as long as one
+always takes the whole disk lock as a higher lock than the partition
+lock, the lock ordering is fully correct. The validator does not
+automatically detect this natural ordering, as the locking rule behind
+the ordering is not static.
+
+In order to teach the validator about this correct usage model, new
+versions of the various locking primitives were added that allow you to
+specify a "nesting level". An example call, for the block device mutex,
+looks like this::
+
+  enum bdev_bd_mutex_lock_class
+  {
+       BD_MUTEX_NORMAL,
+       BD_MUTEX_WHOLE,
+       BD_MUTEX_PARTITION
+  };
+
+  mutex_lock_nested(&bdev->bd_contains->bd_mutex, BD_MUTEX_PARTITION);
+
+In this case the locking is done on a bdev object that is known to be a
+partition.
+
+The validator treats a lock that is taken in such a nested fashion as a
+separate (sub)class for the purposes of validation.
+
+Note: When changing code to use the _nested() primitives, be careful and
+check really thoroughly that the hierarchy is correctly mapped; otherwise
+you can get false positives or false negatives.
+
+Annotations
+-----------
+
+Two constructs can be used to annotate and check where and if certain locks
+must be held: lockdep_assert_held*(&lock) and lockdep_*pin_lock(&lock).
+
+As the name suggests, lockdep_assert_held* family of macros assert that a
+particular lock is held at a certain time (and generate a WARN() otherwise).
+This annotation is largely used all over the kernel, e.g. kernel/sched/
+core.c::
+
+  void update_rq_clock(struct rq *rq)
+  {
+	s64 delta;
+
+	lockdep_assert_held(&rq->lock);
+	[...]
+  }
+
+where holding rq->lock is required to safely update a rq's clock.
+
+The other family of macros is lockdep_*pin_lock(), which is admittedly only
+used for rq->lock ATM. Despite their limited adoption these annotations
+generate a WARN() if the lock of interest is "accidentally" unlocked. This turns
+out to be especially helpful to debug code with callbacks, where an upper
+layer assumes a lock remains taken, but a lower layer thinks it can maybe drop
+and reacquire the lock ("unwittingly" introducing races). lockdep_pin_lock()
+returns a 'struct pin_cookie' that is then used by lockdep_unpin_lock() to check
+that nobody tampered with the lock, e.g. kernel/sched/sched.h::
+
+  static inline void rq_pin_lock(struct rq *rq, struct rq_flags *rf)
+  {
+	rf->cookie = lockdep_pin_lock(&rq->lock);
+	[...]
+  }
+
+  static inline void rq_unpin_lock(struct rq *rq, struct rq_flags *rf)
+  {
+	[...]
+	lockdep_unpin_lock(&rq->lock, rf->cookie);
+  }
+
+While comments about locking requirements might provide useful information,
+the runtime checks performed by annotations are invaluable when debugging
+locking problems and they carry the same level of details when inspecting
+code.  Always prefer annotations when in doubt!
+
+Proof of 100% correctness:
+--------------------------
+
+The validator achieves perfect, mathematical 'closure' (proof of locking
+correctness) in the sense that for every simple, standalone single-task
+locking sequence that occurred at least once during the lifetime of the
+kernel, the validator proves it with a 100% certainty that no
+combination and timing of these locking sequences can cause any class of
+lock related deadlock. [1]_
+
+I.e. complex multi-CPU and multi-task locking scenarios do not have to
+occur in practice to prove a deadlock: only the simple 'component'
+locking chains have to occur at least once (anytime, in any
+task/context) for the validator to be able to prove correctness. (For
+example, complex deadlocks that would normally need more than 3 CPUs and
+a very unlikely constellation of tasks, irq-contexts and timings to
+occur, can be detected on a plain, lightly loaded single-CPU system as
+well!)
+
+This radically decreases the complexity of locking related QA of the
+kernel: what has to be done during QA is to trigger as many "simple"
+single-task locking dependencies in the kernel as possible, at least
+once, to prove locking correctness - instead of having to trigger every
+possible combination of locking interaction between CPUs, combined with
+every possible hardirq and softirq nesting scenario (which is impossible
+to do in practice).
+
+.. [1]
+
+    assuming that the validator itself is 100% correct, and no other
+    part of the system corrupts the state of the validator in any way.
+    We also assume that all NMI/SMM paths [which could interrupt
+    even hardirq-disabled codepaths] are correct and do not interfere
+    with the validator. We also assume that the 64-bit 'chain hash'
+    value is unique for every lock-chain in the system. Also, lock
+    recursion must not be higher than 20.
+
+Performance:
+------------
+
+The above rules require **massive** amounts of runtime checking. If we did
+that for every lock taken and for every irqs-enable event, it would
+render the system practically unusably slow. The complexity of checking
+is O(N^2), so even with just a few hundred lock-classes we'd have to do
+tens of thousands of checks for every event.
+
+This problem is solved by checking any given 'locking scenario' (unique
+sequence of locks taken after each other) only once. A simple stack of
+held locks is maintained, and a lightweight 64-bit hash value is
+calculated, which hash is unique for every lock chain. The hash value,
+when the chain is validated for the first time, is then put into a hash
+table, which hash-table can be checked in a lockfree manner. If the
+locking chain occurs again later on, the hash table tells us that we
+don't have to validate the chain again.
+
+Troubleshooting:
+----------------
+
+The validator tracks a maximum of MAX_LOCKDEP_KEYS number of lock classes.
+Exceeding this number will trigger the following lockdep warning::
+
+	(DEBUG_LOCKS_WARN_ON(id >= MAX_LOCKDEP_KEYS))
+
+By default, MAX_LOCKDEP_KEYS is currently set to 8191, and typical
+desktop systems have less than 1,000 lock classes, so this warning
+normally results from lock-class leakage or failure to properly
+initialize locks.  These two problems are illustrated below:
+
+1.	Repeated module loading and unloading while running the validator
+	will result in lock-class leakage.  The issue here is that each
+	load of the module will create a new set of lock classes for
+	that module's locks, but module unloading does not remove old
+	classes (see below discussion of reuse of lock classes for why).
+	Therefore, if that module is loaded and unloaded repeatedly,
+	the number of lock classes will eventually reach the maximum.
+
+2.	Using structures such as arrays that have large numbers of
+	locks that are not explicitly initialized.  For example,
+	a hash table with 8192 buckets where each bucket has its own
+	spinlock_t will consume 8192 lock classes -unless- each spinlock
+	is explicitly initialized at runtime, for example, using the
+	run-time spin_lock_init() as opposed to compile-time initializers
+	such as __SPIN_LOCK_UNLOCKED().  Failure to properly initialize
+	the per-bucket spinlocks would guarantee lock-class overflow.
+	In contrast, a loop that called spin_lock_init() on each lock
+	would place all 8192 locks into a single lock class.
+
+	The moral of this story is that you should always explicitly
+	initialize your locks.
+
+One might argue that the validator should be modified to allow
+lock classes to be reused.  However, if you are tempted to make this
+argument, first review the code and think through the changes that would
+be required, keeping in mind that the lock classes to be removed are
+likely to be linked into the lock-dependency graph.  This turns out to
+be harder to do than to say.
+
+Of course, if you do run out of lock classes, the next thing to do is
+to find the offending lock classes.  First, the following command gives
+you the number of lock classes currently in use along with the maximum::
+
+	grep "lock-classes" /proc/lockdep_stats
+
+This command produces the following output on a modest system::
+
+	lock-classes:                          748 [max: 8191]
+
+If the number allocated (748 above) increases continually over time,
+then there is likely a leak.  The following command can be used to
+identify the leaking lock classes::
+
+	grep "BD" /proc/lockdep
+
+Run the command and save the output, then compare against the output from
+a later run of this command to identify the leakers.  This same output
+can also help you find situations where runtime lock initialization has
+been omitted.
diff --git a/Documentation/locking/lockdep-design.txt b/Documentation/locking/lockdep-design.txt
deleted file mode 100644
index f189d130e543..000000000000
--- a/Documentation/locking/lockdep-design.txt
+++ /dev/null
@@ -1,389 +0,0 @@
-Runtime locking correctness validator
-=====================================
-
-started by Ingo Molnar <mingo@redhat.com>
-additions by Arjan van de Ven <arjan@linux.intel.com>
-
-Lock-class
-----------
-
-The basic object the validator operates upon is a 'class' of locks.
-
-A class of locks is a group of locks that are logically the same with
-respect to locking rules, even if the locks may have multiple (possibly
-tens of thousands of) instantiations. For example a lock in the inode
-struct is one class, while each inode has its own instantiation of that
-lock class.
-
-The validator tracks the 'usage state' of lock-classes, and it tracks
-the dependencies between different lock-classes. Lock usage indicates
-how a lock is used with regard to its IRQ contexts, while lock
-dependency can be understood as lock order, where L1 -> L2 suggests that
-a task is attempting to acquire L2 while holding L1. From lockdep's
-perspective, the two locks (L1 and L2) are not necessarily related; that
-dependency just means the order ever happened. The validator maintains a
-continuing effort to prove lock usages and dependencies are correct or
-the validator will shoot a splat if incorrect.
-
-A lock-class's behavior is constructed by its instances collectively:
-when the first instance of a lock-class is used after bootup the class
-gets registered, then all (subsequent) instances will be mapped to the
-class and hence their usages and dependecies will contribute to those of
-the class. A lock-class does not go away when a lock instance does, but
-it can be removed if the memory space of the lock class (static or
-dynamic) is reclaimed, this happens for example when a module is
-unloaded or a workqueue is destroyed.
-
-State
------
-
-The validator tracks lock-class usage history and divides the usage into
-(4 usages * n STATEs + 1) categories:
-
-where the 4 usages can be:
-- 'ever held in STATE context'
-- 'ever held as readlock in STATE context'
-- 'ever held with STATE enabled'
-- 'ever held as readlock with STATE enabled'
-
-where the n STATEs are coded in kernel/locking/lockdep_states.h and as of
-now they include:
-- hardirq
-- softirq
-
-where the last 1 category is:
-- 'ever used'                                       [ == !unused        ]
-
-When locking rules are violated, these usage bits are presented in the
-locking error messages, inside curlies, with a total of 2 * n STATEs bits.
-A contrived example:
-
-   modprobe/2287 is trying to acquire lock:
-    (&sio_locks[i].lock){-.-.}, at: [<c02867fd>] mutex_lock+0x21/0x24
-
-   but task is already holding lock:
-    (&sio_locks[i].lock){-.-.}, at: [<c02867fd>] mutex_lock+0x21/0x24
-
-
-For a given lock, the bit positions from left to right indicate the usage
-of the lock and readlock (if exists), for each of the n STATEs listed
-above respectively, and the character displayed at each bit position
-indicates:
-
-   '.'  acquired while irqs disabled and not in irq context
-   '-'  acquired in irq context
-   '+'  acquired with irqs enabled
-   '?'  acquired in irq context with irqs enabled.
-
-The bits are illustrated with an example:
-
-    (&sio_locks[i].lock){-.-.}, at: [<c02867fd>] mutex_lock+0x21/0x24
-                         ||||
-                         ||| \-> softirq disabled and not in softirq context
-                         || \--> acquired in softirq context
-                         | \---> hardirq disabled and not in hardirq context
-                          \----> acquired in hardirq context
-
-
-For a given STATE, whether the lock is ever acquired in that STATE
-context and whether that STATE is enabled yields four possible cases as
-shown in the table below. The bit character is able to indicate which
-exact case is for the lock as of the reporting time.
-
-   -------------------------------------------
-  |              | irq enabled | irq disabled |
-  |-------------------------------------------|
-  | ever in irq  |      ?      |       -      |
-  |-------------------------------------------|
-  | never in irq |      +      |       .      |
-   -------------------------------------------
-
-The character '-' suggests irq is disabled because if otherwise the
-charactor '?' would have been shown instead. Similar deduction can be
-applied for '+' too.
-
-Unused locks (e.g., mutexes) cannot be part of the cause of an error.
-
-
-Single-lock state rules:
-------------------------
-
-A lock is irq-safe means it was ever used in an irq context, while a lock
-is irq-unsafe means it was ever acquired with irq enabled.
-
-A softirq-unsafe lock-class is automatically hardirq-unsafe as well. The
-following states must be exclusive: only one of them is allowed to be set
-for any lock-class based on its usage:
-
- <hardirq-safe> or <hardirq-unsafe>
- <softirq-safe> or <softirq-unsafe>
-
-This is because if a lock can be used in irq context (irq-safe) then it
-cannot be ever acquired with irq enabled (irq-unsafe). Otherwise, a
-deadlock may happen. For example, in the scenario that after this lock
-was acquired but before released, if the context is interrupted this
-lock will be attempted to acquire twice, which creates a deadlock,
-referred to as lock recursion deadlock.
-
-The validator detects and reports lock usage that violates these
-single-lock state rules.
-
-Multi-lock dependency rules:
-----------------------------
-
-The same lock-class must not be acquired twice, because this could lead
-to lock recursion deadlocks.
-
-Furthermore, two locks can not be taken in inverse order:
-
- <L1> -> <L2>
- <L2> -> <L1>
-
-because this could lead to a deadlock - referred to as lock inversion
-deadlock - as attempts to acquire the two locks form a circle which
-could lead to the two contexts waiting for each other permanently. The
-validator will find such dependency circle in arbitrary complexity,
-i.e., there can be any other locking sequence between the acquire-lock
-operations; the validator will still find whether these locks can be
-acquired in a circular fashion.
-
-Furthermore, the following usage based lock dependencies are not allowed
-between any two lock-classes:
-
-   <hardirq-safe>   ->  <hardirq-unsafe>
-   <softirq-safe>   ->  <softirq-unsafe>
-
-The first rule comes from the fact that a hardirq-safe lock could be
-taken by a hardirq context, interrupting a hardirq-unsafe lock - and
-thus could result in a lock inversion deadlock. Likewise, a softirq-safe
-lock could be taken by an softirq context, interrupting a softirq-unsafe
-lock.
-
-The above rules are enforced for any locking sequence that occurs in the
-kernel: when acquiring a new lock, the validator checks whether there is
-any rule violation between the new lock and any of the held locks.
-
-When a lock-class changes its state, the following aspects of the above
-dependency rules are enforced:
-
-- if a new hardirq-safe lock is discovered, we check whether it
-  took any hardirq-unsafe lock in the past.
-
-- if a new softirq-safe lock is discovered, we check whether it took
-  any softirq-unsafe lock in the past.
-
-- if a new hardirq-unsafe lock is discovered, we check whether any
-  hardirq-safe lock took it in the past.
-
-- if a new softirq-unsafe lock is discovered, we check whether any
-  softirq-safe lock took it in the past.
-
-(Again, we do these checks too on the basis that an interrupt context
-could interrupt _any_ of the irq-unsafe or hardirq-unsafe locks, which
-could lead to a lock inversion deadlock - even if that lock scenario did
-not trigger in practice yet.)
-
-Exception: Nested data dependencies leading to nested locking
--------------------------------------------------------------
-
-There are a few cases where the Linux kernel acquires more than one
-instance of the same lock-class. Such cases typically happen when there
-is some sort of hierarchy within objects of the same type. In these
-cases there is an inherent "natural" ordering between the two objects
-(defined by the properties of the hierarchy), and the kernel grabs the
-locks in this fixed order on each of the objects.
-
-An example of such an object hierarchy that results in "nested locking"
-is that of a "whole disk" block-dev object and a "partition" block-dev
-object; the partition is "part of" the whole device and as long as one
-always takes the whole disk lock as a higher lock than the partition
-lock, the lock ordering is fully correct. The validator does not
-automatically detect this natural ordering, as the locking rule behind
-the ordering is not static.
-
-In order to teach the validator about this correct usage model, new
-versions of the various locking primitives were added that allow you to
-specify a "nesting level". An example call, for the block device mutex,
-looks like this:
-
-enum bdev_bd_mutex_lock_class
-{
-       BD_MUTEX_NORMAL,
-       BD_MUTEX_WHOLE,
-       BD_MUTEX_PARTITION
-};
-
- mutex_lock_nested(&bdev->bd_contains->bd_mutex, BD_MUTEX_PARTITION);
-
-In this case the locking is done on a bdev object that is known to be a
-partition.
-
-The validator treats a lock that is taken in such a nested fashion as a
-separate (sub)class for the purposes of validation.
-
-Note: When changing code to use the _nested() primitives, be careful and
-check really thoroughly that the hierarchy is correctly mapped; otherwise
-you can get false positives or false negatives.
-
-Annotations
------------
-
-Two constructs can be used to annotate and check where and if certain locks
-must be held: lockdep_assert_held*(&lock) and lockdep_*pin_lock(&lock).
-
-As the name suggests, lockdep_assert_held* family of macros assert that a
-particular lock is held at a certain time (and generate a WARN() otherwise).
-This annotation is largely used all over the kernel, e.g. kernel/sched/
-core.c
-
-  void update_rq_clock(struct rq *rq)
-  {
-	s64 delta;
-
-	lockdep_assert_held(&rq->lock);
-	[...]
-  }
-
-where holding rq->lock is required to safely update a rq's clock.
-
-The other family of macros is lockdep_*pin_lock(), which is admittedly only
-used for rq->lock ATM. Despite their limited adoption these annotations
-generate a WARN() if the lock of interest is "accidentally" unlocked. This turns
-out to be especially helpful to debug code with callbacks, where an upper
-layer assumes a lock remains taken, but a lower layer thinks it can maybe drop
-and reacquire the lock ("unwittingly" introducing races). lockdep_pin_lock()
-returns a 'struct pin_cookie' that is then used by lockdep_unpin_lock() to check
-that nobody tampered with the lock, e.g. kernel/sched/sched.h
-
-  static inline void rq_pin_lock(struct rq *rq, struct rq_flags *rf)
-  {
-	rf->cookie = lockdep_pin_lock(&rq->lock);
-	[...]
-  }
-
-  static inline void rq_unpin_lock(struct rq *rq, struct rq_flags *rf)
-  {
-	[...]
-	lockdep_unpin_lock(&rq->lock, rf->cookie);
-  }
-
-While comments about locking requirements might provide useful information,
-the runtime checks performed by annotations are invaluable when debugging
-locking problems and they carry the same level of details when inspecting
-code.  Always prefer annotations when in doubt!
-
-Proof of 100% correctness:
---------------------------
-
-The validator achieves perfect, mathematical 'closure' (proof of locking
-correctness) in the sense that for every simple, standalone single-task
-locking sequence that occurred at least once during the lifetime of the
-kernel, the validator proves it with a 100% certainty that no
-combination and timing of these locking sequences can cause any class of
-lock related deadlock. [*]
-
-I.e. complex multi-CPU and multi-task locking scenarios do not have to
-occur in practice to prove a deadlock: only the simple 'component'
-locking chains have to occur at least once (anytime, in any
-task/context) for the validator to be able to prove correctness. (For
-example, complex deadlocks that would normally need more than 3 CPUs and
-a very unlikely constellation of tasks, irq-contexts and timings to
-occur, can be detected on a plain, lightly loaded single-CPU system as
-well!)
-
-This radically decreases the complexity of locking related QA of the
-kernel: what has to be done during QA is to trigger as many "simple"
-single-task locking dependencies in the kernel as possible, at least
-once, to prove locking correctness - instead of having to trigger every
-possible combination of locking interaction between CPUs, combined with
-every possible hardirq and softirq nesting scenario (which is impossible
-to do in practice).
-
-[*] assuming that the validator itself is 100% correct, and no other
-    part of the system corrupts the state of the validator in any way.
-    We also assume that all NMI/SMM paths [which could interrupt
-    even hardirq-disabled codepaths] are correct and do not interfere
-    with the validator. We also assume that the 64-bit 'chain hash'
-    value is unique for every lock-chain in the system. Also, lock
-    recursion must not be higher than 20.
-
-Performance:
-------------
-
-The above rules require _massive_ amounts of runtime checking. If we did
-that for every lock taken and for every irqs-enable event, it would
-render the system practically unusably slow. The complexity of checking
-is O(N^2), so even with just a few hundred lock-classes we'd have to do
-tens of thousands of checks for every event.
-
-This problem is solved by checking any given 'locking scenario' (unique
-sequence of locks taken after each other) only once. A simple stack of
-held locks is maintained, and a lightweight 64-bit hash value is
-calculated, which hash is unique for every lock chain. The hash value,
-when the chain is validated for the first time, is then put into a hash
-table, which hash-table can be checked in a lockfree manner. If the
-locking chain occurs again later on, the hash table tells us that we
-don't have to validate the chain again.
-
-Troubleshooting:
-----------------
-
-The validator tracks a maximum of MAX_LOCKDEP_KEYS number of lock classes.
-Exceeding this number will trigger the following lockdep warning:
-
-	(DEBUG_LOCKS_WARN_ON(id >= MAX_LOCKDEP_KEYS))
-
-By default, MAX_LOCKDEP_KEYS is currently set to 8191, and typical
-desktop systems have less than 1,000 lock classes, so this warning
-normally results from lock-class leakage or failure to properly
-initialize locks.  These two problems are illustrated below:
-
-1.	Repeated module loading and unloading while running the validator
-	will result in lock-class leakage.  The issue here is that each
-	load of the module will create a new set of lock classes for
-	that module's locks, but module unloading does not remove old
-	classes (see below discussion of reuse of lock classes for why).
-	Therefore, if that module is loaded and unloaded repeatedly,
-	the number of lock classes will eventually reach the maximum.
-
-2.	Using structures such as arrays that have large numbers of
-	locks that are not explicitly initialized.  For example,
-	a hash table with 8192 buckets where each bucket has its own
-	spinlock_t will consume 8192 lock classes -unless- each spinlock
-	is explicitly initialized at runtime, for example, using the
-	run-time spin_lock_init() as opposed to compile-time initializers
-	such as __SPIN_LOCK_UNLOCKED().  Failure to properly initialize
-	the per-bucket spinlocks would guarantee lock-class overflow.
-	In contrast, a loop that called spin_lock_init() on each lock
-	would place all 8192 locks into a single lock class.
-
-	The moral of this story is that you should always explicitly
-	initialize your locks.
-
-One might argue that the validator should be modified to allow
-lock classes to be reused.  However, if you are tempted to make this
-argument, first review the code and think through the changes that would
-be required, keeping in mind that the lock classes to be removed are
-likely to be linked into the lock-dependency graph.  This turns out to
-be harder to do than to say.
-
-Of course, if you do run out of lock classes, the next thing to do is
-to find the offending lock classes.  First, the following command gives
-you the number of lock classes currently in use along with the maximum:
-
-	grep "lock-classes" /proc/lockdep_stats
-
-This command produces the following output on a modest system:
-
-	 lock-classes:                          748 [max: 8191]
-
-If the number allocated (748 above) increases continually over time,
-then there is likely a leak.  The following command can be used to
-identify the leaking lock classes:
-
-	grep "BD" /proc/lockdep
-
-Run the command and save the output, then compare against the output from
-a later run of this command to identify the leakers.  This same output
-can also help you find situations where runtime lock initialization has
-been omitted.
diff --git a/Documentation/locking/lockstat.rst b/Documentation/locking/lockstat.rst
new file mode 100644
index 000000000000..536eab8dbd99
--- /dev/null
+++ b/Documentation/locking/lockstat.rst
@@ -0,0 +1,204 @@
+===============
+Lock Statistics
+===============
+
+What
+====
+
+As the name suggests, it provides statistics on locks.
+
+
+Why
+===
+
+Because things like lock contention can severely impact performance.
+
+How
+===
+
+Lockdep already has hooks in the lock functions and maps lock instances to
+lock classes. We build on that (see Documentation/locking/lockdep-design.rst).
+The graph below shows the relation between the lock functions and the various
+hooks therein::
+
+        __acquire
+            |
+           lock _____
+            |        \
+            |    __contended
+            |         |
+            |       <wait>
+            | _______/
+            |/
+            |
+       __acquired
+            |
+            .
+          <hold>
+            .
+            |
+       __release
+            |
+         unlock
+
+  lock, unlock	- the regular lock functions
+  __*		- the hooks
+  <> 		- states
+
+With these hooks we provide the following statistics:
+
+ con-bounces
+	- number of lock contentions that involved x-cpu data
+ contentions
+	- number of lock acquisitions that had to wait
+ wait time
+     min
+	- shortest (non-0) time we ever had to wait for a lock
+     max
+	- longest time we ever had to wait for a lock
+     total
+	- total time we spent waiting on this lock
+     avg
+	- average time spent waiting on this lock
+ acq-bounces
+	- number of lock acquisitions that involved x-cpu data
+ acquisitions
+	- number of times we took the lock
+ hold time
+     min
+	- shortest (non-0) time we ever held the lock
+     max
+	- longest time we ever held the lock
+     total
+	- total time this lock was held
+     avg
+	- average time this lock was held
+
+These numbers are gathered per lock class, per read/write state (when
+applicable).
+
+It also tracks 4 contention points per class. A contention point is a call site
+that had to wait on lock acquisition.
+
+Configuration
+-------------
+
+Lock statistics are enabled via CONFIG_LOCK_STAT.
+
+Usage
+-----
+
+Enable collection of statistics::
+
+	# echo 1 >/proc/sys/kernel/lock_stat
+
+Disable collection of statistics::
+
+	# echo 0 >/proc/sys/kernel/lock_stat
+
+Look at the current lock statistics::
+
+  ( line numbers not part of actual output, done for clarity in the explanation
+    below )
+
+  # less /proc/lock_stat
+
+  01 lock_stat version 0.4
+  02-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+  03                              class name    con-bounces    contentions   waittime-min   waittime-max waittime-total   waittime-avg    acq-bounces   acquisitions   holdtime-min   holdtime-max holdtime-total   holdtime-avg
+  04-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+  05
+  06                         &mm->mmap_sem-W:            46             84           0.26         939.10       16371.53         194.90          47291        2922365           0.16     2220301.69 17464026916.32        5975.99
+  07                         &mm->mmap_sem-R:            37            100           1.31      299502.61      325629.52        3256.30         212344       34316685           0.10        7744.91    95016910.20           2.77
+  08                         ---------------
+  09                           &mm->mmap_sem              1          [<ffffffff811502a7>] khugepaged_scan_mm_slot+0x57/0x280
+  10                           &mm->mmap_sem             96          [<ffffffff815351c4>] __do_page_fault+0x1d4/0x510
+  11                           &mm->mmap_sem             34          [<ffffffff81113d77>] vm_mmap_pgoff+0x87/0xd0
+  12                           &mm->mmap_sem             17          [<ffffffff81127e71>] vm_munmap+0x41/0x80
+  13                         ---------------
+  14                           &mm->mmap_sem              1          [<ffffffff81046fda>] dup_mmap+0x2a/0x3f0
+  15                           &mm->mmap_sem             60          [<ffffffff81129e29>] SyS_mprotect+0xe9/0x250
+  16                           &mm->mmap_sem             41          [<ffffffff815351c4>] __do_page_fault+0x1d4/0x510
+  17                           &mm->mmap_sem             68          [<ffffffff81113d77>] vm_mmap_pgoff+0x87/0xd0
+  18
+  19.............................................................................................................................................................................................................................
+  20
+  21                         unix_table_lock:           110            112           0.21          49.24         163.91           1.46          21094          66312           0.12         624.42       31589.81           0.48
+  22                         ---------------
+  23                         unix_table_lock             45          [<ffffffff8150ad8e>] unix_create1+0x16e/0x1b0
+  24                         unix_table_lock             47          [<ffffffff8150b111>] unix_release_sock+0x31/0x250
+  25                         unix_table_lock             15          [<ffffffff8150ca37>] unix_find_other+0x117/0x230
+  26                         unix_table_lock              5          [<ffffffff8150a09f>] unix_autobind+0x11f/0x1b0
+  27                         ---------------
+  28                         unix_table_lock             39          [<ffffffff8150b111>] unix_release_sock+0x31/0x250
+  29                         unix_table_lock             49          [<ffffffff8150ad8e>] unix_create1+0x16e/0x1b0
+  30                         unix_table_lock             20          [<ffffffff8150ca37>] unix_find_other+0x117/0x230
+  31                         unix_table_lock              4          [<ffffffff8150a09f>] unix_autobind+0x11f/0x1b0
+
+
+This excerpt shows the first two lock class statistics. Line 01 shows the
+output version - each time the format changes this will be updated. Lines 02-04
+show the header with column descriptions. Lines 05-18 and 20-31 show the actual
+statistics. These statistics come in two parts; the actual stats separated by a
+short separator (lines 08, 13) from the contention points.
+
+Lines 09-12 show the first 4 recorded contention points (the code
+which tries to get the lock) and lines 14-17 show the first 4 recorded
+contended points (the lock holder). It is possible that the max
+con-bounces point is missing in the statistics.
+
+The first lock (05-18) is a read/write lock, and shows two lines above the
+short separator. The contention points don't match the column descriptors,
+they have two: contentions and [<IP>] symbol. The second set of contention
+points are the points we're contending with.
+
+The integer part of the time values is in us.
+
+Dealing with nested locks, subclasses may appear::
+
+  32...........................................................................................................................................................................................................................
+  33
+  34                               &rq->lock:       13128          13128           0.43         190.53      103881.26           7.91          97454        3453404           0.00         401.11    13224683.11           3.82
+  35                               ---------
+  36                               &rq->lock          645          [<ffffffff8103bfc4>] task_rq_lock+0x43/0x75
+  37                               &rq->lock          297          [<ffffffff8104ba65>] try_to_wake_up+0x127/0x25a
+  38                               &rq->lock          360          [<ffffffff8103c4c5>] select_task_rq_fair+0x1f0/0x74a
+  39                               &rq->lock          428          [<ffffffff81045f98>] scheduler_tick+0x46/0x1fb
+  40                               ---------
+  41                               &rq->lock           77          [<ffffffff8103bfc4>] task_rq_lock+0x43/0x75
+  42                               &rq->lock          174          [<ffffffff8104ba65>] try_to_wake_up+0x127/0x25a
+  43                               &rq->lock         4715          [<ffffffff8103ed4b>] double_rq_lock+0x42/0x54
+  44                               &rq->lock          893          [<ffffffff81340524>] schedule+0x157/0x7b8
+  45
+  46...........................................................................................................................................................................................................................
+  47
+  48                             &rq->lock/1:        1526          11488           0.33         388.73      136294.31          11.86          21461          38404           0.00          37.93      109388.53           2.84
+  49                             -----------
+  50                             &rq->lock/1        11526          [<ffffffff8103ed58>] double_rq_lock+0x4f/0x54
+  51                             -----------
+  52                             &rq->lock/1         5645          [<ffffffff8103ed4b>] double_rq_lock+0x42/0x54
+  53                             &rq->lock/1         1224          [<ffffffff81340524>] schedule+0x157/0x7b8
+  54                             &rq->lock/1         4336          [<ffffffff8103ed58>] double_rq_lock+0x4f/0x54
+  55                             &rq->lock/1          181          [<ffffffff8104ba65>] try_to_wake_up+0x127/0x25a
+
+Line 48 shows statistics for the second subclass (/1) of &rq->lock class
+(subclass starts from 0), since in this case, as line 50 suggests,
+double_rq_lock actually acquires a nested lock of two spinlocks.
+
+View the top contending locks::
+
+  # grep : /proc/lock_stat | head
+			clockevents_lock:       2926159        2947636           0.15       46882.81  1784540466.34         605.41        3381345        3879161           0.00        2260.97    53178395.68          13.71
+		     tick_broadcast_lock:        346460         346717           0.18        2257.43    39364622.71         113.54        3642919        4242696           0.00        2263.79    49173646.60          11.59
+		  &mapping->i_mmap_mutex:        203896         203899           3.36      645530.05 31767507988.39      155800.21        3361776        8893984           0.17        2254.15    14110121.02           1.59
+			       &rq->lock:        135014         136909           0.18         606.09      842160.68           6.15        1540728       10436146           0.00         728.72    17606683.41           1.69
+	       &(&zone->lru_lock)->rlock:         93000          94934           0.16          59.18      188253.78           1.98        1199912        3809894           0.15         391.40     3559518.81           0.93
+			 tasklist_lock-W:         40667          41130           0.23        1189.42      428980.51          10.43         270278         510106           0.16         653.51     3939674.91           7.72
+			 tasklist_lock-R:         21298          21305           0.20        1310.05      215511.12          10.12         186204         241258           0.14        1162.33     1179779.23           4.89
+			      rcu_node_1:         47656          49022           0.16         635.41      193616.41           3.95         844888        1865423           0.00         764.26     1656226.96           0.89
+       &(&dentry->d_lockref.lock)->rlock:         39791          40179           0.15        1302.08       88851.96           2.21        2790851       12527025           0.10        1910.75     3379714.27           0.27
+			      rcu_node_0:         29203          30064           0.16         786.55     1555573.00          51.74          88963         244254           0.00         398.87      428872.51           1.76
+
+Clear the statistics::
+
+  # echo 0 > /proc/lock_stat
diff --git a/Documentation/locking/lockstat.txt b/Documentation/locking/lockstat.txt
deleted file mode 100644
index fdbeb0c45ef3..000000000000
--- a/Documentation/locking/lockstat.txt
+++ /dev/null
@@ -1,183 +0,0 @@
-
-LOCK STATISTICS
-
-- WHAT
-
-As the name suggests, it provides statistics on locks.
-
-- WHY
-
-Because things like lock contention can severely impact performance.
-
-- HOW
-
-Lockdep already has hooks in the lock functions and maps lock instances to
-lock classes. We build on that (see Documentation/locking/lockdep-design.txt).
-The graph below shows the relation between the lock functions and the various
-hooks therein.
-
-        __acquire
-            |
-           lock _____
-            |        \
-            |    __contended
-            |         |
-            |       <wait>
-            | _______/
-            |/
-            |
-       __acquired
-            |
-            .
-          <hold>
-            .
-            |
-       __release
-            |
-         unlock
-
-lock, unlock	- the regular lock functions
-__*		- the hooks
-<> 		- states
-
-With these hooks we provide the following statistics:
-
- con-bounces       - number of lock contention that involved x-cpu data
- contentions       - number of lock acquisitions that had to wait
- wait time min     - shortest (non-0) time we ever had to wait for a lock
-           max     - longest time we ever had to wait for a lock
-	   total   - total time we spend waiting on this lock
-	   avg     - average time spent waiting on this lock
- acq-bounces       - number of lock acquisitions that involved x-cpu data
- acquisitions      - number of times we took the lock
- hold time min     - shortest (non-0) time we ever held the lock
-	   max     - longest time we ever held the lock
-	   total   - total time this lock was held
-	   avg     - average time this lock was held
-
-These numbers are gathered per lock class, per read/write state (when
-applicable).
-
-It also tracks 4 contention points per class. A contention point is a call site
-that had to wait on lock acquisition.
-
- - CONFIGURATION
-
-Lock statistics are enabled via CONFIG_LOCK_STAT.
-
- - USAGE
-
-Enable collection of statistics:
-
-# echo 1 >/proc/sys/kernel/lock_stat
-
-Disable collection of statistics:
-
-# echo 0 >/proc/sys/kernel/lock_stat
-
-Look at the current lock statistics:
-
-( line numbers not part of actual output, done for clarity in the explanation
-  below )
-
-# less /proc/lock_stat
-
-01 lock_stat version 0.4
-02-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
-03                              class name    con-bounces    contentions   waittime-min   waittime-max waittime-total   waittime-avg    acq-bounces   acquisitions   holdtime-min   holdtime-max holdtime-total   holdtime-avg
-04-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
-05
-06                         &mm->mmap_sem-W:            46             84           0.26         939.10       16371.53         194.90          47291        2922365           0.16     2220301.69 17464026916.32        5975.99
-07                         &mm->mmap_sem-R:            37            100           1.31      299502.61      325629.52        3256.30         212344       34316685           0.10        7744.91    95016910.20           2.77
-08                         ---------------
-09                           &mm->mmap_sem              1          [<ffffffff811502a7>] khugepaged_scan_mm_slot+0x57/0x280
-10                           &mm->mmap_sem             96          [<ffffffff815351c4>] __do_page_fault+0x1d4/0x510
-11                           &mm->mmap_sem             34          [<ffffffff81113d77>] vm_mmap_pgoff+0x87/0xd0
-12                           &mm->mmap_sem             17          [<ffffffff81127e71>] vm_munmap+0x41/0x80
-13                         ---------------
-14                           &mm->mmap_sem              1          [<ffffffff81046fda>] dup_mmap+0x2a/0x3f0
-15                           &mm->mmap_sem             60          [<ffffffff81129e29>] SyS_mprotect+0xe9/0x250
-16                           &mm->mmap_sem             41          [<ffffffff815351c4>] __do_page_fault+0x1d4/0x510
-17                           &mm->mmap_sem             68          [<ffffffff81113d77>] vm_mmap_pgoff+0x87/0xd0
-18
-19.............................................................................................................................................................................................................................
-20
-21                         unix_table_lock:           110            112           0.21          49.24         163.91           1.46          21094          66312           0.12         624.42       31589.81           0.48
-22                         ---------------
-23                         unix_table_lock             45          [<ffffffff8150ad8e>] unix_create1+0x16e/0x1b0
-24                         unix_table_lock             47          [<ffffffff8150b111>] unix_release_sock+0x31/0x250
-25                         unix_table_lock             15          [<ffffffff8150ca37>] unix_find_other+0x117/0x230
-26                         unix_table_lock              5          [<ffffffff8150a09f>] unix_autobind+0x11f/0x1b0
-27                         ---------------
-28                         unix_table_lock             39          [<ffffffff8150b111>] unix_release_sock+0x31/0x250
-29                         unix_table_lock             49          [<ffffffff8150ad8e>] unix_create1+0x16e/0x1b0
-30                         unix_table_lock             20          [<ffffffff8150ca37>] unix_find_other+0x117/0x230
-31                         unix_table_lock              4          [<ffffffff8150a09f>] unix_autobind+0x11f/0x1b0
-
-
-This excerpt shows the first two lock class statistics. Line 01 shows the
-output version - each time the format changes this will be updated. Line 02-04
-show the header with column descriptions. Lines 05-18 and 20-31 show the actual
-statistics. These statistics come in two parts; the actual stats separated by a
-short separator (line 08, 13) from the contention points.
-
-Lines 09-12 show the first 4 recorded contention points (the code
-which tries to get the lock) and lines 14-17 show the first 4 recorded
-contended points (the lock holder). It is possible that the max
-con-bounces point is missing in the statistics.
-
-The first lock (05-18) is a read/write lock, and shows two lines above the
-short separator. The contention points don't match the column descriptors,
-they have two: contentions and [<IP>] symbol. The second set of contention
-points are the points we're contending with.
-
-The integer part of the time values is in us.
-
-Dealing with nested locks, subclasses may appear:
-
-32...........................................................................................................................................................................................................................
-33
-34                               &rq->lock:       13128          13128           0.43         190.53      103881.26           7.91          97454        3453404           0.00         401.11    13224683.11           3.82
-35                               ---------
-36                               &rq->lock          645          [<ffffffff8103bfc4>] task_rq_lock+0x43/0x75
-37                               &rq->lock          297          [<ffffffff8104ba65>] try_to_wake_up+0x127/0x25a
-38                               &rq->lock          360          [<ffffffff8103c4c5>] select_task_rq_fair+0x1f0/0x74a
-39                               &rq->lock          428          [<ffffffff81045f98>] scheduler_tick+0x46/0x1fb
-40                               ---------
-41                               &rq->lock           77          [<ffffffff8103bfc4>] task_rq_lock+0x43/0x75
-42                               &rq->lock          174          [<ffffffff8104ba65>] try_to_wake_up+0x127/0x25a
-43                               &rq->lock         4715          [<ffffffff8103ed4b>] double_rq_lock+0x42/0x54
-44                               &rq->lock          893          [<ffffffff81340524>] schedule+0x157/0x7b8
-45
-46...........................................................................................................................................................................................................................
-47
-48                             &rq->lock/1:        1526          11488           0.33         388.73      136294.31          11.86          21461          38404           0.00          37.93      109388.53           2.84
-49                             -----------
-50                             &rq->lock/1        11526          [<ffffffff8103ed58>] double_rq_lock+0x4f/0x54
-51                             -----------
-52                             &rq->lock/1         5645          [<ffffffff8103ed4b>] double_rq_lock+0x42/0x54
-53                             &rq->lock/1         1224          [<ffffffff81340524>] schedule+0x157/0x7b8
-54                             &rq->lock/1         4336          [<ffffffff8103ed58>] double_rq_lock+0x4f/0x54
-55                             &rq->lock/1          181          [<ffffffff8104ba65>] try_to_wake_up+0x127/0x25a
-
-Line 48 shows statistics for the second subclass (/1) of &rq->lock class
-(subclass starts from 0), since in this case, as line 50 suggests,
-double_rq_lock actually acquires a nested lock of two spinlocks.
-
-View the top contending locks:
-
-# grep : /proc/lock_stat | head
-			clockevents_lock:       2926159        2947636           0.15       46882.81  1784540466.34         605.41        3381345        3879161           0.00        2260.97    53178395.68          13.71
-		     tick_broadcast_lock:        346460         346717           0.18        2257.43    39364622.71         113.54        3642919        4242696           0.00        2263.79    49173646.60          11.59
-		  &mapping->i_mmap_mutex:        203896         203899           3.36      645530.05 31767507988.39      155800.21        3361776        8893984           0.17        2254.15    14110121.02           1.59
-			       &rq->lock:        135014         136909           0.18         606.09      842160.68           6.15        1540728       10436146           0.00         728.72    17606683.41           1.69
-	       &(&zone->lru_lock)->rlock:         93000          94934           0.16          59.18      188253.78           1.98        1199912        3809894           0.15         391.40     3559518.81           0.93
-			 tasklist_lock-W:         40667          41130           0.23        1189.42      428980.51          10.43         270278         510106           0.16         653.51     3939674.91           7.72
-			 tasklist_lock-R:         21298          21305           0.20        1310.05      215511.12          10.12         186204         241258           0.14        1162.33     1179779.23           4.89
-			      rcu_node_1:         47656          49022           0.16         635.41      193616.41           3.95         844888        1865423           0.00         764.26     1656226.96           0.89
-       &(&dentry->d_lockref.lock)->rlock:         39791          40179           0.15        1302.08       88851.96           2.21        2790851       12527025           0.10        1910.75     3379714.27           0.27
-			      rcu_node_0:         29203          30064           0.16         786.55     1555573.00          51.74          88963         244254           0.00         398.87      428872.51           1.76
-
-Clear the statistics:
-
-# echo 0 > /proc/lock_stat
diff --git a/Documentation/locking/locktorture.rst b/Documentation/locking/locktorture.rst
new file mode 100644
index 000000000000..e79eeeca3ac6
--- /dev/null
+++ b/Documentation/locking/locktorture.rst
@@ -0,0 +1,170 @@
+==================================
+Kernel Lock Torture Test Operation
+==================================
+
+CONFIG_LOCK_TORTURE_TEST
+========================
+
+The CONFIG_LOCK_TORTURE_TEST config option provides a kernel module
+that runs torture tests on core kernel locking primitives. The kernel
+module, 'locktorture', may be built after the fact on the running
+kernel to be tested, if desired. The tests periodically output status
+messages via printk(), which can be examined via the dmesg (perhaps
+grepping for "torture").  The test is started when the module is loaded,
+and stops when the module is unloaded. This program is based on how RCU
+is tortured, via rcutorture.
+
+This torture test consists of creating a number of kernel threads which
+acquire the lock and hold it for a specific amount of time, thus simulating
+different critical region behaviors. The amount of contention on the lock
+can be simulated by either enlarging this critical region hold time and/or
+creating more kthreads.
+
+
+Module Parameters
+=================
+
+This module has the following parameters:
+
+
+Locktorture-specific
+--------------------
+
+nwriters_stress
+		  Number of kernel threads that will stress exclusive lock
+		  ownership (writers). The default value is twice the number
+		  of online CPUs.
+
+nreaders_stress
+		  Number of kernel threads that will stress shared lock
+		  ownership (readers). The default is the same as the number
+		  of writers. If the user did not specify nwriters_stress, then
+		  both readers and writers will be the number of online CPUs.
+
+torture_type
+		  Type of lock to torture. By default, only spinlocks will
+		  be tortured. This module can torture the following locks,
+		  with string values as follows:
+
+		     - "lock_busted":
+				Simulates a buggy lock implementation.
+
+		     - "spin_lock":
+				spin_lock() and spin_unlock() pairs.
+
+		     - "spin_lock_irq":
+				spin_lock_irq() and spin_unlock_irq() pairs.
+
+		     - "rw_lock":
+				read/write lock() and unlock() rwlock pairs.
+
+		     - "rw_lock_irq":
+				read/write lock_irq() and unlock_irq()
+				rwlock pairs.
+
+		     - "mutex_lock":
+				mutex_lock() and mutex_unlock() pairs.
+
+		     - "rtmutex_lock":
+				rtmutex_lock() and rtmutex_unlock() pairs.
+				Kernel must have CONFIG_RT_MUTEX=y.
+
+		     - "rwsem_lock":
+				read/write down() and up() semaphore pairs.
+
+
+Torture-framework (RCU + locking)
+---------------------------------
+
+shutdown_secs
+		  The number of seconds to run the test before terminating
+		  the test and powering off the system.  The default is
+		  zero, which disables test termination and system shutdown.
+		  This capability is useful for automated testing.
+
+onoff_interval
+		  The number of seconds between each attempt to execute a
+		  randomly selected CPU-hotplug operation.  Defaults
+		  to zero, which disables CPU hotplugging.  In
+		  CONFIG_HOTPLUG_CPU=n kernels, locktorture will silently
+		  refuse to do any CPU-hotplug operations regardless of
+		  what value is specified for onoff_interval.
+
+onoff_holdoff
+		  The number of seconds to wait until starting CPU-hotplug
+		  operations.  This would normally only be used when
+		  locktorture was built into the kernel and started
+		  automatically at boot time, in which case it is useful
+		  in order to avoid confusing boot-time code with CPUs
+		  coming and going. This parameter is only useful if
+		  CONFIG_HOTPLUG_CPU is enabled.
+
+stat_interval
+		  Number of seconds between statistics-related printk()s.
+		  By default, locktorture will report stats every 60
+		  seconds. Setting the interval to zero causes the
+		  statistics to be printed -only- when the module is
+		  unloaded.
+
+stutter
+		  The length of time to run the test before pausing for this
+		  same period of time.  Defaults to "stutter=5", so as
+		  to run and pause for (roughly) five-second intervals.
+		  Specifying "stutter=0" causes the test to run continuously
+		  without pausing, which is the old default behavior.
+
+shuffle_interval
+		  The number of seconds to keep the test threads affinitied
+		  to a particular subset of the CPUs, defaults to 3 seconds.
+		  Used in conjunction with test_no_idle_hz.
+
+verbose
+		  Enable verbose debugging printing, via printk(). Enabled
+		  by default. This extra information is mostly related to
+		  high-level errors and reports from the main 'torture'
+		  framework.
+
+
+Statistics
+==========
+
+Statistics are printed in the following format::
+
+  spin_lock-torture: Writes:  Total: 93746064  Max/Min: 0/0   Fail: 0
+     (A)		    (B)		   (C)		  (D)	       (E)
+
+  (A): Lock type that is being tortured -- torture_type parameter.
+
+  (B): Number of writer lock acquisitions. If dealing with a read/write
+       primitive a second "Reads" statistics line is printed.
+
+  (C): Number of times the lock was acquired.
+
+  (D): Min and max number of times threads failed to acquire the lock.
+
+  (E): true/false values if there were errors acquiring the lock. This should
+       -only- be positive if there is a bug in the locking primitive's
+       implementation. Otherwise a lock should never fail (i.e., spin_lock()).
+       Of course, the same applies for (C), above. A dummy example of this is
+       the "lock_busted" type.
+
+Usage
+=====
+
+The following script may be used to torture locks::
+
+	#!/bin/sh
+
+	modprobe locktorture
+	sleep 3600
+	rmmod locktorture
+	dmesg | grep torture:
+
+The output can be manually inspected for the error flag of "!!!".
+One could of course create a more elaborate script that automatically
+checked for such errors.  The "rmmod" command forces a "SUCCESS",
+"FAILURE", or "RCU_HOTPLUG" indication to be printk()ed.  The first
+two are self-explanatory, while the last indicates that while there
+were no locking failures, CPU-hotplug problems were detected.
+
+Also see: Documentation/RCU/torture.txt
diff --git a/Documentation/locking/locktorture.txt b/Documentation/locking/locktorture.txt
deleted file mode 100644
index 6a8df4cd19bf..000000000000
--- a/Documentation/locking/locktorture.txt
+++ /dev/null
@@ -1,145 +0,0 @@
-Kernel Lock Torture Test Operation
-
-CONFIG_LOCK_TORTURE_TEST
-
-The CONFIG LOCK_TORTURE_TEST config option provides a kernel module
-that runs torture tests on core kernel locking primitives. The kernel
-module, 'locktorture', may be built after the fact on the running
-kernel to be tested, if desired. The tests periodically output status
-messages via printk(), which can be examined via the dmesg (perhaps
-grepping for "torture").  The test is started when the module is loaded,
-and stops when the module is unloaded. This program is based on how RCU
-is tortured, via rcutorture.
-
-This torture test consists of creating a number of kernel threads which
-acquire the lock and hold it for specific amount of time, thus simulating
-different critical region behaviors. The amount of contention on the lock
-can be simulated by either enlarging this critical region hold time and/or
-creating more kthreads.
-
-
-MODULE PARAMETERS
-
-This module has the following parameters:
-
-
-	    ** Locktorture-specific **
-
-nwriters_stress   Number of kernel threads that will stress exclusive lock
-		  ownership (writers). The default value is twice the number
-		  of online CPUs.
-
-nreaders_stress   Number of kernel threads that will stress shared lock
-		  ownership (readers). The default is the same amount of writer
-		  locks. If the user did not specify nwriters_stress, then
-		  both readers and writers be the amount of online CPUs.
-
-torture_type	  Type of lock to torture. By default, only spinlocks will
-		  be tortured. This module can torture the following locks,
-		  with string values as follows:
-
-		     o "lock_busted": Simulates a buggy lock implementation.
-
-		     o "spin_lock": spin_lock() and spin_unlock() pairs.
-
-		     o "spin_lock_irq": spin_lock_irq() and spin_unlock_irq()
-					pairs.
-
-		     o "rw_lock": read/write lock() and unlock() rwlock pairs.
-
-		     o "rw_lock_irq": read/write lock_irq() and unlock_irq()
-				      rwlock pairs.
-
-		     o "mutex_lock": mutex_lock() and mutex_unlock() pairs.
-
-		     o "rtmutex_lock": rtmutex_lock() and rtmutex_unlock()
-				       pairs. Kernel must have CONFIG_RT_MUTEX=y.
-
-		     o "rwsem_lock": read/write down() and up() semaphore pairs.
-
-
-	    ** Torture-framework (RCU + locking) **
-
-shutdown_secs	  The number of seconds to run the test before terminating
-		  the test and powering off the system.  The default is
-		  zero, which disables test termination and system shutdown.
-		  This capability is useful for automated testing.
-
-onoff_interval	  The number of seconds between each attempt to execute a
-		  randomly selected CPU-hotplug operation.  Defaults
-		  to zero, which disables CPU hotplugging.  In
-		  CONFIG_HOTPLUG_CPU=n kernels, locktorture will silently
-		  refuse to do any CPU-hotplug operations regardless of
-		  what value is specified for onoff_interval.
-
-onoff_holdoff	  The number of seconds to wait until starting CPU-hotplug
-		  operations.  This would normally only be used when
-		  locktorture was built into the kernel and started
-		  automatically at boot time, in which case it is useful
-		  in order to avoid confusing boot-time code with CPUs
-		  coming and going. This parameter is only useful if
-		  CONFIG_HOTPLUG_CPU is enabled.
-
-stat_interval	  Number of seconds between statistics-related printk()s.
-		  By default, locktorture will report stats every 60 seconds.
-		  Setting the interval to zero causes the statistics to
-		  be printed -only- when the module is unloaded, and this
-		  is the default.
-
-stutter		  The length of time to run the test before pausing for this
-		  same period of time.  Defaults to "stutter=5", so as
-		  to run and pause for (roughly) five-second intervals.
-		  Specifying "stutter=0" causes the test to run continuously
-		  without pausing, which is the old default behavior.
-
-shuffle_interval  The number of seconds to keep the test threads affinitied
-		  to a particular subset of the CPUs, defaults to 3 seconds.
-		  Used in conjunction with test_no_idle_hz.
-
-verbose		  Enable verbose debugging printing, via printk(). Enabled
-		  by default. This extra information is mostly related to
-		  high-level errors and reports from the main 'torture'
-		  framework.
-
-
-STATISTICS
-
-Statistics are printed in the following format:
-
-spin_lock-torture: Writes:  Total: 93746064  Max/Min: 0/0   Fail: 0
-   (A)		    (B)		   (C)		  (D)	       (E)
-
-(A): Lock type that is being tortured -- torture_type parameter.
-
-(B): Number of writer lock acquisitions. If dealing with a read/write primitive
-     a second "Reads" statistics line is printed.
-
-(C): Number of times the lock was acquired.
-
-(D): Min and max number of times threads failed to acquire the lock.
-
-(E): true/false values if there were errors acquiring the lock. This should
-     -only- be positive if there is a bug in the locking primitive's
-     implementation. Otherwise a lock should never fail (i.e., spin_lock()).
-     Of course, the same applies for (C), above. A dummy example of this is
-     the "lock_busted" type.
-
-USAGE
-
-The following script may be used to torture locks:
-
-	#!/bin/sh
-
-	modprobe locktorture
-	sleep 3600
-	rmmod locktorture
-	dmesg | grep torture:
-
-The output can be manually inspected for the error flag of "!!!".
-One could of course create a more elaborate script that automatically
-checked for such errors.  The "rmmod" command forces a "SUCCESS",
-"FAILURE", or "RCU_HOTPLUG" indication to be printk()ed.  The first
-two are self-explanatory, while the last indicates that while there
-were no locking failures, CPU-hotplug problems were detected.
-
-Also see: Documentation/RCU/torture.txt
diff --git a/Documentation/locking/mutex-design.rst b/Documentation/locking/mutex-design.rst
new file mode 100644
index 000000000000..4d8236b81fa5
--- /dev/null
+++ b/Documentation/locking/mutex-design.rst
@@ -0,0 +1,152 @@
+=======================
+Generic Mutex Subsystem
+=======================
+
+started by Ingo Molnar <mingo@redhat.com>
+
+updated by Davidlohr Bueso <davidlohr@hp.com>
+
+What are mutexes?
+-----------------
+
+In the Linux kernel, mutexes refer to a particular locking primitive
+that enforces serialization on shared memory systems, and not only to
+the generic term referring to 'mutual exclusion' found in academia
+or similar theoretical text books. Mutexes are sleeping locks which
+behave similarly to binary semaphores, and were introduced in 2006[1]
+as an alternative to these. This new data structure provided a number
+of advantages, including simpler interfaces, and at that time smaller
+code (see Disadvantages).
+
+[1] http://lwn.net/Articles/164802/
+
+Implementation
+--------------
+
+Mutexes are represented by 'struct mutex', defined in include/linux/mutex.h
+and implemented in kernel/locking/mutex.c. These locks use an atomic variable
+(->owner) to keep track of the lock state during its lifetime.  Field owner
+actually contains `struct task_struct *` to the current lock owner and it is
+therefore NULL if not currently owned. Since task_struct pointers are aligned
+to at least L1_CACHE_BYTES, low bits (3) are used to store extra state (e.g.,
+if waiter list is non-empty).  In its most basic form it also includes a
+wait-queue and a spinlock that serializes access to it. Furthermore,
+CONFIG_MUTEX_SPIN_ON_OWNER=y systems use a spinner MCS lock (->osq), described
+below in (ii).
+
+When acquiring a mutex, there are three possible paths that can be
+taken, depending on the state of the lock:
+
+(i) fastpath: tries to atomically acquire the lock by cmpxchg()ing the owner with
+    the current task. This only works in the uncontended case (cmpxchg() checks
+    against 0UL, so all 3 state bits above have to be 0). If the lock is
+    contended it goes to the next possible path.
+
+(ii) midpath: aka optimistic spinning, tries to spin for acquisition
+     while the lock owner is running and there are no other tasks ready
+     to run that have higher priority (need_resched). The rationale is
+     that if the lock owner is running, it is likely to release the lock
+     soon. The mutex spinners are queued up using MCS lock so that only
+     one spinner can compete for the mutex.
+
+     The MCS lock (proposed by Mellor-Crummey and Scott) is a simple spinlock
+     with the desirable properties of being fair and with each cpu trying
+     to acquire the lock spinning on a local variable. It avoids expensive
+     cacheline bouncing that common test-and-set spinlock implementations
+     incur. An MCS-like lock is specially tailored for optimistic spinning
+     for sleeping lock implementation. An important feature of the customized
+     MCS lock is that it has the extra property that spinners are able to exit
+     the MCS spinlock queue when they need to reschedule. This further helps
+     avoid situations where MCS spinners that need to reschedule would continue
+     waiting to spin on mutex owner, only to go directly to slowpath upon
+     obtaining the MCS lock.
+
+
+(iii) slowpath: last resort, if the lock is still unable to be acquired,
+      the task is added to the wait-queue and sleeps until woken up by the
+      unlock path. Under normal circumstances it blocks as TASK_UNINTERRUPTIBLE.
+
+While formally kernel mutexes are sleepable locks, it is path (ii) that
+makes them more practically a hybrid type. By simply not interrupting a
+task and busy-waiting for a few cycles instead of immediately sleeping,
+the performance of this lock has been seen to significantly improve a
+number of workloads. Note that this technique is also used for rw-semaphores.
+
+Semantics
+---------
+
+The mutex subsystem checks and enforces the following rules:
+
+    - Only one task can hold the mutex at a time.
+    - Only the owner can unlock the mutex.
+    - Multiple unlocks are not permitted.
+    - Recursive locking/unlocking is not permitted.
+    - A mutex must only be initialized via the API (see below).
+    - A task may not exit with a mutex held.
+    - Memory areas where held locks reside must not be freed.
+    - Held mutexes must not be reinitialized.
+    - Mutexes may not be used in hardware or software interrupt
+      contexts such as tasklets and timers.
+
+These semantics are fully enforced when CONFIG_DEBUG_MUTEXES is enabled.
+In addition, the mutex debugging code also implements a number of other
+features that make lock debugging easier and faster:
+
+    - Uses symbolic names of mutexes, whenever they are printed
+      in debug output.
+    - Point-of-acquire tracking, symbolic lookup of function names,
+      list of all locks held in the system, printout of them.
+    - Owner tracking.
+    - Detects self-recursing locks and prints out all relevant info.
+    - Detects multi-task circular deadlocks and prints out all affected
+      locks and tasks (and only those tasks).
+
+
+Interfaces
+----------
+Statically define the mutex::
+
+   DEFINE_MUTEX(name);
+
+Dynamically initialize the mutex::
+
+   mutex_init(mutex);
+
+Acquire the mutex, uninterruptible::
+
+   void mutex_lock(struct mutex *lock);
+   void mutex_lock_nested(struct mutex *lock, unsigned int subclass);
+   int  mutex_trylock(struct mutex *lock);
+
+Acquire the mutex, interruptible::
+
+   int mutex_lock_interruptible_nested(struct mutex *lock,
+				       unsigned int subclass);
+   int mutex_lock_interruptible(struct mutex *lock);
+
+Acquire the mutex, interruptible, if dec to 0::
+
+   int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock);
+
+Unlock the mutex::
+
+   void mutex_unlock(struct mutex *lock);
+
+Test if the mutex is taken::
+
+   int mutex_is_locked(struct mutex *lock);
+
+Disadvantages
+-------------
+
+Unlike its original design and purpose, 'struct mutex' is among the largest
+locks in the kernel. E.g: on x86-64 it is 32 bytes, where 'struct semaphore'
+is 24 bytes and rw_semaphore is 40 bytes. Larger structure sizes mean more CPU
+cache and memory footprint.
+
+When to use mutexes
+-------------------
+
+Unless the strict semantics of mutexes are unsuitable and/or the critical
+region prevents the lock from being shared, always prefer them to any other
+locking primitive.
diff --git a/Documentation/locking/mutex-design.txt b/Documentation/locking/mutex-design.txt
deleted file mode 100644
index 818aca19612f..000000000000
--- a/Documentation/locking/mutex-design.txt
+++ /dev/null
@@ -1,142 +0,0 @@
-Generic Mutex Subsystem
-
-started by Ingo Molnar <mingo@redhat.com>
-updated by Davidlohr Bueso <davidlohr@hp.com>
-
-What are mutexes?
------------------
-
-In the Linux kernel, mutexes refer to a particular locking primitive
-that enforces serialization on shared memory systems, and not only to
-the generic term referring to 'mutual exclusion' found in academia
-or similar theoretical text books. Mutexes are sleeping locks which
-behave similarly to binary semaphores, and were introduced in 2006[1]
-as an alternative to these. This new data structure provided a number
-of advantages, including simpler interfaces, and at that time smaller
-code (see Disadvantages).
-
-[1] http://lwn.net/Articles/164802/
-
-Implementation
---------------
-
-Mutexes are represented by 'struct mutex', defined in include/linux/mutex.h
-and implemented in kernel/locking/mutex.c. These locks use an atomic variable
-(->owner) to keep track of the lock state during its lifetime.  Field owner
-actually contains 'struct task_struct *' to the current lock owner and it is
-therefore NULL if not currently owned. Since task_struct pointers are aligned
-at at least L1_CACHE_BYTES, low bits (3) are used to store extra state (e.g.,
-if waiter list is non-empty).  In its most basic form it also includes a
-wait-queue and a spinlock that serializes access to it. Furthermore,
-CONFIG_MUTEX_SPIN_ON_OWNER=y systems use a spinner MCS lock (->osq), described
-below in (ii).
-
-When acquiring a mutex, there are three possible paths that can be
-taken, depending on the state of the lock:
-
-(i) fastpath: tries to atomically acquire the lock by cmpxchg()ing the owner with
-    the current task. This only works in the uncontended case (cmpxchg() checks
-    against 0UL, so all 3 state bits above have to be 0). If the lock is
-    contended it goes to the next possible path.
-
-(ii) midpath: aka optimistic spinning, tries to spin for acquisition
-     while the lock owner is running and there are no other tasks ready
-     to run that have higher priority (need_resched). The rationale is
-     that if the lock owner is running, it is likely to release the lock
-     soon. The mutex spinners are queued up using MCS lock so that only
-     one spinner can compete for the mutex.
-
-     The MCS lock (proposed by Mellor-Crummey and Scott) is a simple spinlock
-     with the desirable properties of being fair and with each cpu trying
-     to acquire the lock spinning on a local variable. It avoids expensive
-     cacheline bouncing that common test-and-set spinlock implementations
-     incur. An MCS-like lock is specially tailored for optimistic spinning
-     for sleeping lock implementation. An important feature of the customized
-     MCS lock is that it has the extra property that spinners are able to exit
-     the MCS spinlock queue when they need to reschedule. This further helps
-     avoid situations where MCS spinners that need to reschedule would continue
-     waiting to spin on mutex owner, only to go directly to slowpath upon
-     obtaining the MCS lock.
-
-
-(iii) slowpath: last resort, if the lock is still unable to be acquired,
-      the task is added to the wait-queue and sleeps until woken up by the
-      unlock path. Under normal circumstances it blocks as TASK_UNINTERRUPTIBLE.
-
-While formally kernel mutexes are sleepable locks, it is path (ii) that
-makes them more practically a hybrid type. By simply not interrupting a
-task and busy-waiting for a few cycles instead of immediately sleeping,
-the performance of this lock has been seen to significantly improve a
-number of workloads. Note that this technique is also used for rw-semaphores.
-
-Semantics
----------
-
-The mutex subsystem checks and enforces the following rules:
-
-    - Only one task can hold the mutex at a time.
-    - Only the owner can unlock the mutex.
-    - Multiple unlocks are not permitted.
-    - Recursive locking/unlocking is not permitted.
-    - A mutex must only be initialized via the API (see below).
-    - A task may not exit with a mutex held.
-    - Memory areas where held locks reside must not be freed.
-    - Held mutexes must not be reinitialized.
-    - Mutexes may not be used in hardware or software interrupt
-      contexts such as tasklets and timers.
-
-These semantics are fully enforced when CONFIG DEBUG_MUTEXES is enabled.
-In addition, the mutex debugging code also implements a number of other
-features that make lock debugging easier and faster:
-
-    - Uses symbolic names of mutexes, whenever they are printed
-      in debug output.
-    - Point-of-acquire tracking, symbolic lookup of function names,
-      list of all locks held in the system, printout of them.
-    - Owner tracking.
-    - Detects self-recursing locks and prints out all relevant info.
-    - Detects multi-task circular deadlocks and prints out all affected
-      locks and tasks (and only those tasks).
-
-
-Interfaces
-----------
-Statically define the mutex:
-   DEFINE_MUTEX(name);
-
-Dynamically initialize the mutex:
-   mutex_init(mutex);
-
-Acquire the mutex, uninterruptible:
-   void mutex_lock(struct mutex *lock);
-   void mutex_lock_nested(struct mutex *lock, unsigned int subclass);
-   int  mutex_trylock(struct mutex *lock);
-
-Acquire the mutex, interruptible:
-   int mutex_lock_interruptible_nested(struct mutex *lock,
-				       unsigned int subclass);
-   int mutex_lock_interruptible(struct mutex *lock);
-
-Acquire the mutex, interruptible, if dec to 0:
-   int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock);
-
-Unlock the mutex:
-   void mutex_unlock(struct mutex *lock);
-
-Test if the mutex is taken:
-   int mutex_is_locked(struct mutex *lock);
-
-Disadvantages
--------------
-
-Unlike its original design and purpose, 'struct mutex' is among the largest
-locks in the kernel. E.g: on x86-64 it is 32 bytes, where 'struct semaphore'
-is 24 bytes and rw_semaphore is 40 bytes. Larger structure sizes mean more CPU
-cache and memory footprint.
-
-When to use mutexes
--------------------
-
-Unless the strict semantics of mutexes are unsuitable and/or the critical
-region prevents the lock from being shared, always prefer them to any other
-locking primitive.
diff --git a/Documentation/locking/rt-mutex-design.rst b/Documentation/locking/rt-mutex-design.rst
new file mode 100644
index 000000000000..59c2a64efb21
--- /dev/null
+++ b/Documentation/locking/rt-mutex-design.rst
@@ -0,0 +1,574 @@
+==============================
+RT-mutex implementation design
+==============================
+
+Copyright (c) 2006 Steven Rostedt
+
+Licensed under the GNU Free Documentation License, Version 1.2
+
+
+This document tries to describe the design of the rtmutex.c implementation.
+It doesn't describe the reasons why rtmutex.c exists. For that please see
+Documentation/locking/rt-mutex.rst.  This document does explain problems
+that happen without this code, but only as background needed to understand
+what the code is actually doing.
+
+The goal of this document is to help others understand the priority
+inheritance (PI) algorithm that is used, as well as reasons for the
+decisions that were made to implement PI in the manner that was done.
+
+
+Unbounded Priority Inversion
+----------------------------
+
+Priority inversion is when a lower priority process executes while a higher
+priority process wants to run.  This happens for several reasons, and
+most of the time it can't be helped.  Anytime a high priority process wants
+to use a resource that a lower priority process has (a mutex for example),
+the high priority process must wait until the lower priority process is done
+with the resource.  This is a priority inversion.  What we want to prevent
+is something called unbounded priority inversion.  That is when the high
+priority process is prevented from running by a lower priority process for
+an undetermined amount of time.
+
+The classic example of unbounded priority inversion is where you have three
+processes, let's call them processes A, B, and C, where A is the highest
+priority process, C is the lowest, and B is in between. A tries to grab a lock
+that C owns and must wait and lets C run to release the lock. But in the
+meantime, B executes, and since B is of a higher priority than C, it preempts C,
+but by doing so, it is in fact preempting A which is a higher priority process.
+Now there's no way of knowing how long A will be sleeping waiting for C
+to release the lock, because for all we know, B is a CPU hog and will
+never give C a chance to release the lock.  This is called unbounded priority
+inversion.
+
+Here's a little ASCII art to show the problem::
+
+     grab lock L1 (owned by C)
+       |
+  A ---+
+          C preempted by B
+            |
+  C    +----+
+
+  B         +-------->
+                  B now keeps A from running.
+
+
+Priority Inheritance (PI)
+-------------------------
+
+There are several ways to solve this issue, but other ways are out of scope
+for this document.  Here we only discuss PI.
+
+PI is where a process inherits the priority of another process if the other
+process blocks on a lock owned by the current process.  To make this easier
+to understand, let's use the previous example, with processes A, B, and C again.
+
+This time, when A blocks on the lock owned by C, C would inherit the priority
+of A.  So now if B becomes runnable, it would not preempt C, since C now has
+the high priority of A.  As soon as C releases the lock, it loses its
+inherited priority, and A then can continue with the resource that C had.
+
+Terminology
+-----------
+
+Here I explain some terminology that is used in this document to help describe
+the design that is used to implement PI.
+
+PI chain
+         - The PI chain is an ordered series of locks and processes that cause
+           processes to inherit priorities from a previous process that is
+           blocked on one of its locks.  This is described in more detail
+           later in this document.
+
+mutex
+         - In this document, to differentiate between locks that implement
+           PI and spin locks that are used in the PI code, from now on
+           the PI locks will be called mutexes.
+
+lock
+         - In this document from now on, I will use the term lock when
+           referring to spin locks that are used to protect parts of the PI
+           algorithm.  These locks disable preemption for UP (when
+           CONFIG_PREEMPT is enabled) and on SMP prevent multiple CPUs from
+           entering critical sections simultaneously.
+
+spin lock
+         - Same as lock above.
+
+waiter
+         - A waiter is a struct that is stored on the stack of a blocked
+           process.  Since the scope of the waiter is within the code for
+           a process being blocked on the mutex, it is fine to allocate
+           the waiter on the process's stack (local variable).  This
+           structure holds a pointer to the task, as well as the mutex that
+           the task is blocked on.  It also has rbtree node structures to
+           place the task in the waiters rbtree of a mutex as well as the
+           pi_waiters rbtree of a mutex owner task (described below).
+
+           waiter is sometimes used in reference to the task that is waiting
+           on a mutex. This is the same as waiter->task.
+
+waiters
+         - A list of processes that are blocked on a mutex.
+
+top waiter
+         - The highest priority process waiting on a specific mutex.
+
+top pi waiter
+              - The highest priority process waiting on one of the mutexes
+                that a specific process owns.
+
+Note:
+       task and process are used interchangeably in this document, mostly to
+       differentiate between two processes that are being described together.
+
+
+PI chain
+--------
+
+The PI chain is a list of processes and mutexes that may cause priority
+inheritance to take place.  Multiple chains may converge, but a chain
+would never diverge, since a process can't be blocked on more than one
+mutex at a time.
+
+Example::
+
+   Process:  A, B, C, D, E
+   Mutexes:  L1, L2, L3, L4
+
+   A owns: L1
+           B blocked on L1
+           B owns L2
+                  C blocked on L2
+                  C owns L3
+                         D blocked on L3
+                         D owns L4
+                                E blocked on L4
+
+The chain would be::
+
+   E->L4->D->L3->C->L2->B->L1->A
+
+To show where two chains merge, we could add another process F and
+another mutex L5 where B owns L5 and F is blocked on mutex L5.
+
+The chain for F would be::
+
+   F->L5->B->L1->A
+
+Since a process may own more than one mutex, but never be blocked on more than
+one, the chains merge.
+
+Here we show both chains::
+
+   E->L4->D->L3->C->L2-+
+                       |
+                       +->B->L1->A
+                       |
+                 F->L5-+
+
+For PI to work, the processes at the right end of these chains (or we may
+also call it the Top of the chain) must be equal to or higher in priority
+than the processes to the left or below in the chain.
+
+Also since a mutex may have more than one process blocked on it, we can
+have multiple chains merge at mutexes.  If we add another process G that is
+blocked on mutex L2::
+
+  G->L2->B->L1->A
+
+And once again, to show how this can grow I will show the merging chains
+again::
+
+   E->L4->D->L3->C-+
+                   +->L2-+
+                   |     |
+                 G-+     +->B->L1->A
+                         |
+                   F->L5-+
+
+If process G has the highest priority in the chain, then all the tasks up
+the chain (A and B in this example), must have their priorities increased
+to that of G.
+
+Mutex Waiters Tree
+------------------
+
+Every mutex keeps track of all the waiters that are blocked on itself. The
+mutex has a rbtree to store these waiters by priority.  This tree is protected
+by a spin lock that is located in the struct of the mutex. This lock is called
+wait_lock.
+
+
+Task PI Tree
+------------
+
+To keep track of the PI chains, each process has its own PI rbtree.  This is
+a tree of all top waiters of the mutexes that are owned by the process.
+Note that this tree only holds the top waiters and not all waiters that are
+blocked on mutexes owned by the process.
+
+The top of the task's PI tree is always the highest priority task that
+is waiting on a mutex that is owned by the task.  So if the task has
+inherited a priority, it will always be the priority of the task that is
+at the top of this tree.
+
+This tree is stored in the task structure of a process as a rbtree called
+pi_waiters.  It is protected by a spin lock also in the task structure,
+called pi_lock.  This lock may also be taken in interrupt context, so when
+locking the pi_lock, interrupts must be disabled.
+
+
+Depth of the PI Chain
+---------------------
+
+The maximum depth of the PI chain is not dynamic, and could actually be
+defined.  But it is very complex to figure out, since it depends on all
+the nesting of mutexes.  Let's look at the example where we have 3 mutexes,
+L1, L2, and L3, and four separate functions func1, func2, func3 and func4.
+The following shows a locking order of L1->L2->L3, but may not actually
+be directly nested that way::
+
+  void func1(void)
+  {
+	mutex_lock(L1);
+
+	/* do anything */
+
+	mutex_unlock(L1);
+  }
+
+  void func2(void)
+  {
+	mutex_lock(L1);
+	mutex_lock(L2);
+
+	/* do something */
+
+	mutex_unlock(L2);
+	mutex_unlock(L1);
+  }
+
+  void func3(void)
+  {
+	mutex_lock(L2);
+	mutex_lock(L3);
+
+	/* do something else */
+
+	mutex_unlock(L3);
+	mutex_unlock(L2);
+  }
+
+  void func4(void)
+  {
+	mutex_lock(L3);
+
+	/* do something again */
+
+	mutex_unlock(L3);
+  }
+
+Now we add 4 processes that run each of these functions separately.
+Processes A, B, C, and D which run functions func1, func2, func3 and func4
+respectively, and such that D runs first and A last.  With D being preempted
+in func4 in the "do something again" area, we have a locking that follows::
+
+  D owns L3
+         C blocked on L3
+         C owns L2
+                B blocked on L2
+                B owns L1
+                       A blocked on L1
+
+  And thus we have the chain A->L1->B->L2->C->L3->D.
+
+This gives us a PI depth of 4 (four processes), but looking at any of the
+functions individually, it seems as though they only have at most a locking
+depth of two.  So, although the locking depth is defined at compile time,
+it still is very difficult to find the possibilities of that depth.
+
+Now since mutexes can be defined by user-land applications, we don't want a DOS
+type of application that nests large amounts of mutexes to create a large
+PI chain, and have the code holding spin locks while looking at a large
+amount of data.  So to prevent this, the implementation not only implements
+a maximum lock depth, but also only holds at most two different locks at a
+time, as it walks the PI chain.  More about this below.
+
+
+Mutex owner and flags
+---------------------
+
+The mutex structure contains a pointer to the owner of the mutex.  If the
+mutex is not owned, this owner is set to NULL.  Since all architectures
+have the task structure on at least a two byte alignment (and if this is
+not true, the rtmutex.c code will be broken!), this allows for the least
+significant bit to be used as a flag.  Bit 0 is used as the "Has Waiters"
+flag. It's set whenever there are waiters on a mutex.
+
+See Documentation/locking/rt-mutex.rst for further details.
+
+cmpxchg Tricks
+--------------
+
+Some architectures implement an atomic cmpxchg (Compare and Exchange).  This
+is used (when applicable) to keep the fast path of grabbing and releasing
+mutexes short.
+
+cmpxchg is basically the following function performed atomically::
+
+  unsigned long _cmpxchg(unsigned long *A, unsigned long *B, unsigned long *C)
+  {
+	unsigned long T = *A;
+	if (*A == *B) {
+		*A = *C;
+	}
+	return T;
+  }
+  #define cmpxchg(a,b,c) _cmpxchg(&a,&b,&c)
+
+This is really nice to have, since it allows you to only update a variable
+if the variable is what you expect it to be.  You know if it succeeded if
+the return value (the old value of A) is equal to B.
+
+The macro rt_mutex_cmpxchg is used to try to lock and unlock mutexes. If
+the architecture does not support CMPXCHG, then this macro is simply set
+to fail every time.  But if CMPXCHG is supported, then this will
+help out extremely to keep the fast path short.
+
+The use of rt_mutex_cmpxchg with the flags in the owner field helps optimize
+the system for architectures that support it.  This will also be explained
+later in this document.
+
+
+Priority adjustments
+--------------------
+
+The implementation of the PI code in rtmutex.c has several places that a
+process must adjust its priority.  With the help of the pi_waiters of a
+process it is rather easy to know what needs to be adjusted.
+
+The functions implementing the task adjustments are rt_mutex_adjust_prio
+and rt_mutex_setprio. rt_mutex_setprio is only used in rt_mutex_adjust_prio.
+
+rt_mutex_adjust_prio examines the priority of the task, and the highest
+priority process that is waiting on any of the mutexes owned by the task. Since
+the pi_waiters of a task holds an order by priority of all the top waiters
+of all the mutexes that the task owns, we simply need to compare the top
+pi waiter to its own normal/deadline priority and take the higher one.
+Then rt_mutex_setprio is called to adjust the priority of the task to the
+new priority. Note that rt_mutex_setprio is defined in kernel/sched/core.c
+to implement the actual change in priority.
+
+Note:
+	For the "prio" field in task_struct, the lower the number, the
+	higher the priority. A "prio" of 5 is of higher priority than a
+	"prio" of 10.
+
+It is interesting to note that rt_mutex_adjust_prio can either increase
+or decrease the priority of the task.  In the case that a higher priority
+process has just blocked on a mutex owned by the task, rt_mutex_adjust_prio
+would increase/boost the task's priority.  But if a higher priority task
+were for some reason to leave the mutex (timeout or signal), this same function
+would decrease/unboost the priority of the task.  That is because the pi_waiters
+always contains the highest priority task that is waiting on a mutex owned
+by the task, so we only need to compare the priority of that top pi waiter
+to the normal priority of the given task.
+
+
+High level overview of the PI chain walk
+----------------------------------------
+
+The PI chain walk is implemented by the function rt_mutex_adjust_prio_chain.
+
+The implementation has gone through several iterations, and has ended up
+with what we believe is the best.  It walks the PI chain by only grabbing
+at most two locks at a time, and is very efficient.
+
+The rt_mutex_adjust_prio_chain can be used either to boost or lower process
+priorities.
+
+rt_mutex_adjust_prio_chain is called with a task to be checked for PI
+(de)boosting (the owner of a mutex that a process is blocking on), a flag to
+check for deadlocking, the mutex that the task owns, a pointer to a waiter
+that is the process's waiter struct that is blocked on the mutex (although this
+parameter may be NULL for deboosting), a pointer to the mutex on which the task
+is blocked, and a top_task as the top waiter of the mutex.
+
+For this explanation, I will not mention deadlock detection. This explanation
+will try to stay at a high level.
+
+When this function is called, there are no locks held.  That also means
+that the state of the owner and lock can change when entered into this function.
+
+Before this function is called, the task has already had rt_mutex_adjust_prio
+performed on it.  This means that the task is set to the priority that it
+should be at, but the rbtree nodes of the task's waiter have not been updated
+with the new priorities, and this task may not be in the proper locations
+in the pi_waiters and waiters trees that the task is blocked on. This function
+solves all that.
+
+The main operation of this function is summarized by Thomas Gleixner in
+rtmutex.c. See the 'Chain walk basics and protection scope' comment for further
+details.
+
+Taking of a mutex (The walk through)
+------------------------------------
+
+OK, now let's take a look at the detailed walk through of what happens when
+taking a mutex.
+
+The first thing that is tried is the fast taking of the mutex.  This is
+done when we have CMPXCHG enabled (otherwise the fast taking automatically
+fails).  Only when the owner field of the mutex is NULL can the lock be
+taken with the CMPXCHG and nothing else needs to be done.
+
+If there is contention on the lock, we go about the slow path
+(rt_mutex_slowlock).
+
+The slow path function is where the task's waiter structure is created on
+the stack.  This is because the waiter structure is only needed for the
+scope of this function.  The waiter structure holds the nodes to store
+the task on the waiters tree of the mutex, and if need be, the pi_waiters
+tree of the owner.
+
+The wait_lock of the mutex is taken since the slow path of unlocking the
+mutex also takes this lock.
+
+We then call try_to_take_rt_mutex.  This is where the architecture that
+does not implement CMPXCHG would always grab the lock (if there's no
+contention).
+
+try_to_take_rt_mutex is used every time the task tries to grab a mutex in the
+slow path.  The first thing that is done here is an atomic setting of
+the "Has Waiters" flag of the mutex's owner field. By setting this flag
+now, the current owner of the mutex being contended for can't release the mutex
+without going into the slow unlock path, and it would then need to grab the
+wait_lock, which this code currently holds. So setting the "Has Waiters" flag
+forces the current owner to synchronize with this code.
+
+The lock is taken if the following are true:
+
+   1) The lock has no owner
+   2) The current task is the highest priority against all other
+      waiters of the lock
+
+If the task succeeds in acquiring the lock, then the task is set as the
+owner of the lock, and if the lock still has waiters, the top_waiter
+(highest priority task waiting on the lock) is added to this task's
+pi_waiters tree.
+
+If the lock is not taken by try_to_take_rt_mutex(), then the
+task_blocks_on_rt_mutex() function is called. This will add the task to
+the lock's waiter tree and propagate the pi chain of the lock as well
+as the lock's owner's pi_waiters tree. This is described in the next
+section.
+
+Task blocks on mutex
+--------------------
+
+The accounting of a mutex and process is done with the waiter structure of
+the process.  The "task" field is set to the process, and the "lock" field
+to the mutex.  The rbtree nodes of the waiter are initialized to the process's
+current priority.
+
+Since the wait_lock was taken at the entry of the slow lock, we can safely
+add the waiter to the task waiter tree.  If the current process is the
+highest priority process currently waiting on this mutex, then we remove the
+previous top waiter process (if it exists) from the pi_waiters of the owner,
+and add the current process to that tree.  Since the pi_waiter of the owner
+has changed, we call rt_mutex_adjust_prio on the owner to see if the owner
+should adjust its priority accordingly.
+
+If the owner is also blocked on a lock, and had its pi_waiters changed
+(or deadlock checking is on), we unlock the wait_lock of the mutex and go ahead
+and run rt_mutex_adjust_prio_chain on the owner, as described earlier.
+
+Now all locks are released, and if the current process is still blocked on a
+mutex (waiter "task" field is not NULL), then we go to sleep (call schedule).
+
+Waking up in the loop
+---------------------
+
+The task can then wake up for a couple of reasons:
+  1) The previous lock owner released the lock, and the task now is top_waiter
+  2) we received a signal or timeout
+
+In both cases, the task will try again to acquire the lock. If it
+does, then it will take itself off the waiters tree and set itself back
+to the TASK_RUNNING state.
+
+In the first case, if the lock was acquired by another task before this task
+could get the lock, then it will go back to sleep and wait to be woken again.
+
+The second case is only applicable for tasks that are grabbing a mutex
+that can wake up before getting the lock, either due to a signal or
+a timeout (i.e. rt_mutex_timed_futex_lock()). When woken, it will try to
+take the lock again, if it succeeds, then the task will return with the
+lock held, otherwise it will return with -EINTR if the task was woken
+by a signal, or -ETIMEDOUT if it timed out.
+
+
+Unlocking the Mutex
+-------------------
+
+The unlocking of a mutex also has a fast path for those architectures with
+CMPXCHG.  Since the taking of a mutex on contention always sets the
+"Has Waiters" flag of the mutex's owner, we use this to know if we need to
+take the slow path when unlocking the mutex.  If the mutex doesn't have any
+waiters, the owner field of the mutex would equal the current process and
+the mutex can be unlocked by just replacing the owner field with NULL.
+
+If the owner field has the "Has Waiters" bit set (or CMPXCHG is not available),
+the slow unlock path is taken.
+
+The first thing done in the slow unlock path is to take the wait_lock of the
+mutex.  This synchronizes the locking and unlocking of the mutex.
+
+A check is made to see if the mutex has waiters or not.  On architectures that
+do not have CMPXCHG, this is the location that the owner of the mutex will
+determine if a waiter needs to be awoken or not.  On architectures that
+do have CMPXCHG, that check is done in the fast path, but it is still needed
+in the slow path too.  If a waiter of a mutex woke up because of a signal
+or timeout between the time the owner failed the fast path CMPXCHG check and
+the grabbing of the wait_lock, the mutex may not have any waiters, thus the
+owner still needs to make this check. If there are no waiters then the mutex
+owner field is set to NULL, the wait_lock is released and nothing more is
+needed.
+
+If there are waiters, then we need to wake one up.
+
+In the wake up code, the pi_lock of the current owner is taken.  The top
+waiter of the lock is found and removed from the waiters tree of the mutex
+as well as the pi_waiters tree of the current owner. The "Has Waiters" bit is
+marked to prevent lower priority tasks from stealing the lock.
+
+Finally we unlock the pi_lock of the pending owner and wake it up.
+
+
+Contact
+-------
+
+For updates on this document, please email Steven Rostedt <rostedt@goodmis.org>
+
+
+Credits
+-------
+
+Author:  Steven Rostedt <rostedt@goodmis.org>
+
+Updated: Alex Shi <alex.shi@linaro.org>	- 7/6/2017
+
+Original Reviewers:
+		     Ingo Molnar, Thomas Gleixner, Thomas Duetsch, and
+		     Randy Dunlap
+
+Update (7/6/2017) Reviewers: Steven Rostedt and Sebastian Siewior
+
+Updates
+-------
+
+This document was originally written for 2.6.17-rc3-mm1
+and was updated for 4.12
diff --git a/Documentation/locking/rt-mutex-design.txt b/Documentation/locking/rt-mutex-design.txt
deleted file mode 100644
index 3d7b865539cc..000000000000
--- a/Documentation/locking/rt-mutex-design.txt
+++ /dev/null
@@ -1,559 +0,0 @@
-#
-# Copyright (c) 2006 Steven Rostedt
-# Licensed under the GNU Free Documentation License, Version 1.2
-#
-
-RT-mutex implementation design
-------------------------------
-
-This document tries to describe the design of the rtmutex.c implementation.
-It doesn't describe the reasons why rtmutex.c exists. For that please see
-Documentation/locking/rt-mutex.txt.  Although this document does explain problems
-that happen without this code, but that is in the concept to understand
-what the code actually is doing.
-
-The goal of this document is to help others understand the priority
-inheritance (PI) algorithm that is used, as well as reasons for the
-decisions that were made to implement PI in the manner that was done.
-
-
-Unbounded Priority Inversion
-----------------------------
-
-Priority inversion is when a lower priority process executes while a higher
-priority process wants to run.  This happens for several reasons, and
-most of the time it can't be helped.  Anytime a high priority process wants
-to use a resource that a lower priority process has (a mutex for example),
-the high priority process must wait until the lower priority process is done
-with the resource.  This is a priority inversion.  What we want to prevent
-is something called unbounded priority inversion.  That is when the high
-priority process is prevented from running by a lower priority process for
-an undetermined amount of time.
-
-The classic example of unbounded priority inversion is where you have three
-processes, let's call them processes A, B, and C, where A is the highest
-priority process, C is the lowest, and B is in between. A tries to grab a lock
-that C owns and must wait and lets C run to release the lock. But in the
-meantime, B executes, and since B is of a higher priority than C, it preempts C,
-but by doing so, it is in fact preempting A which is a higher priority process.
-Now there's no way of knowing how long A will be sleeping waiting for C
-to release the lock, because for all we know, B is a CPU hog and will
-never give C a chance to release the lock.  This is called unbounded priority
-inversion.
-
-Here's a little ASCII art to show the problem.
-
-   grab lock L1 (owned by C)
-     |
-A ---+
-        C preempted by B
-          |
-C    +----+
-
-B         +-------->
-                B now keeps A from running.
-
-
-Priority Inheritance (PI)
--------------------------
-
-There are several ways to solve this issue, but other ways are out of scope
-for this document.  Here we only discuss PI.
-
-PI is where a process inherits the priority of another process if the other
-process blocks on a lock owned by the current process.  To make this easier
-to understand, let's use the previous example, with processes A, B, and C again.
-
-This time, when A blocks on the lock owned by C, C would inherit the priority
-of A.  So now if B becomes runnable, it would not preempt C, since C now has
-the high priority of A.  As soon as C releases the lock, it loses its
-inherited priority, and A then can continue with the resource that C had.
-
-Terminology
------------
-
-Here I explain some terminology that is used in this document to help describe
-the design that is used to implement PI.
-
-PI chain - The PI chain is an ordered series of locks and processes that cause
-           processes to inherit priorities from a previous process that is
-           blocked on one of its locks.  This is described in more detail
-           later in this document.
-
-mutex    - In this document, to differentiate from locks that implement
-           PI and spin locks that are used in the PI code, from now on
-           the PI locks will be called a mutex.
-
-lock     - In this document from now on, I will use the term lock when
-           referring to spin locks that are used to protect parts of the PI
-           algorithm.  These locks disable preemption for UP (when
-           CONFIG_PREEMPT is enabled) and on SMP prevents multiple CPUs from
-           entering critical sections simultaneously.
-
-spin lock - Same as lock above.
-
-waiter   - A waiter is a struct that is stored on the stack of a blocked
-           process.  Since the scope of the waiter is within the code for
-           a process being blocked on the mutex, it is fine to allocate
-           the waiter on the process's stack (local variable).  This
-           structure holds a pointer to the task, as well as the mutex that
-           the task is blocked on.  It also has rbtree node structures to
-           place the task in the waiters rbtree of a mutex as well as the
-           pi_waiters rbtree of a mutex owner task (described below).
-
-           waiter is sometimes used in reference to the task that is waiting
-           on a mutex. This is the same as waiter->task.
-
-waiters  - A list of processes that are blocked on a mutex.
-
-top waiter - The highest priority process waiting on a specific mutex.
-
-top pi waiter - The highest priority process waiting on one of the mutexes
-                that a specific process owns.
-
-Note:  task and process are used interchangeably in this document, mostly to
-       differentiate between two processes that are being described together.
-
-
-PI chain
---------
-
-The PI chain is a list of processes and mutexes that may cause priority
-inheritance to take place.  Multiple chains may converge, but a chain
-would never diverge, since a process can't be blocked on more than one
-mutex at a time.
-
-Example:
-
-   Process:  A, B, C, D, E
-   Mutexes:  L1, L2, L3, L4
-
-   A owns: L1
-           B blocked on L1
-           B owns L2
-                  C blocked on L2
-                  C owns L3
-                         D blocked on L3
-                         D owns L4
-                                E blocked on L4
-
-The chain would be:
-
-   E->L4->D->L3->C->L2->B->L1->A
-
-To show where two chains merge, we could add another process F and
-another mutex L5 where B owns L5 and F is blocked on mutex L5.
-
-The chain for F would be:
-
-   F->L5->B->L1->A
-
-Since a process may own more than one mutex, but never be blocked on more than
-one, the chains merge.
-
-Here we show both chains:
-
-   E->L4->D->L3->C->L2-+
-                       |
-                       +->B->L1->A
-                       |
-                 F->L5-+
-
-For PI to work, the processes at the right end of these chains (or we may
-also call it the Top of the chain) must be equal to or higher in priority
-than the processes to the left or below in the chain.
-
-Also since a mutex may have more than one process blocked on it, we can
-have multiple chains merge at mutexes.  If we add another process G that is
-blocked on mutex L2:
-
-  G->L2->B->L1->A
-
-And once again, to show how this can grow I will show the merging chains
-again.
-
-   E->L4->D->L3->C-+
-                   +->L2-+
-                   |     |
-                 G-+     +->B->L1->A
-                         |
-                   F->L5-+
-
-If process G has the highest priority in the chain, then all the tasks up
-the chain (A and B in this example), must have their priorities increased
-to that of G.
-
-Mutex Waiters Tree
------------------
-
-Every mutex keeps track of all the waiters that are blocked on itself. The
-mutex has a rbtree to store these waiters by priority.  This tree is protected
-by a spin lock that is located in the struct of the mutex. This lock is called
-wait_lock.
-
-
-Task PI Tree
-------------
-
-To keep track of the PI chains, each process has its own PI rbtree.  This is
-a tree of all top waiters of the mutexes that are owned by the process.
-Note that this tree only holds the top waiters and not all waiters that are
-blocked on mutexes owned by the process.
-
-The top of the task's PI tree is always the highest priority task that
-is waiting on a mutex that is owned by the task.  So if the task has
-inherited a priority, it will always be the priority of the task that is
-at the top of this tree.
-
-This tree is stored in the task structure of a process as a rbtree called
-pi_waiters.  It is protected by a spin lock also in the task structure,
-called pi_lock.  This lock may also be taken in interrupt context, so when
-locking the pi_lock, interrupts must be disabled.
-
-
-Depth of the PI Chain
----------------------
-
-The maximum depth of the PI chain is not dynamic, and could actually be
-defined.  But is very complex to figure it out, since it depends on all
-the nesting of mutexes.  Let's look at the example where we have 3 mutexes,
-L1, L2, and L3, and four separate functions func1, func2, func3 and func4.
-The following shows a locking order of L1->L2->L3, but may not actually
-be directly nested that way.
-
-void func1(void)
-{
-	mutex_lock(L1);
-
-	/* do anything */
-
-	mutex_unlock(L1);
-}
-
-void func2(void)
-{
-	mutex_lock(L1);
-	mutex_lock(L2);
-
-	/* do something */
-
-	mutex_unlock(L2);
-	mutex_unlock(L1);
-}
-
-void func3(void)
-{
-	mutex_lock(L2);
-	mutex_lock(L3);
-
-	/* do something else */
-
-	mutex_unlock(L3);
-	mutex_unlock(L2);
-}
-
-void func4(void)
-{
-	mutex_lock(L3);
-
-	/* do something again */
-
-	mutex_unlock(L3);
-}
-
-Now we add 4 processes that run each of these functions separately.
-Processes A, B, C, and D which run functions func1, func2, func3 and func4
-respectively, and such that D runs first and A last.  With D being preempted
-in func4 in the "do something again" area, we have a locking that follows:
-
-D owns L3
-       C blocked on L3
-       C owns L2
-              B blocked on L2
-              B owns L1
-                     A blocked on L1
-
-And thus we have the chain A->L1->B->L2->C->L3->D.
-
-This gives us a PI depth of 4 (four processes), but looking at any of the
-functions individually, it seems as though they only have at most a locking
-depth of two.  So, although the locking depth is defined at compile time,
-it still is very difficult to find the possibilities of that depth.
-
-Now since mutexes can be defined by user-land applications, we don't want a DOS
-type of application that nests large amounts of mutexes to create a large
-PI chain, and have the code holding spin locks while looking at a large
-amount of data.  So to prevent this, the implementation not only implements
-a maximum lock depth, but also only holds at most two different locks at a
-time, as it walks the PI chain.  More about this below.
-
-
-Mutex owner and flags
----------------------
-
-The mutex structure contains a pointer to the owner of the mutex.  If the
-mutex is not owned, this owner is set to NULL.  Since all architectures
-have the task structure on at least a two byte alignment (and if this is
-not true, the rtmutex.c code will be broken!), this allows for the least
-significant bit to be used as a flag.  Bit 0 is used as the "Has Waiters"
-flag. It's set whenever there are waiters on a mutex.
-
-See Documentation/locking/rt-mutex.txt for further details.
-
-cmpxchg Tricks
---------------
-
-Some architectures implement an atomic cmpxchg (Compare and Exchange).  This
-is used (when applicable) to keep the fast path of grabbing and releasing
-mutexes short.
-
-cmpxchg is basically the following function performed atomically:
-
-unsigned long _cmpxchg(unsigned long *A, unsigned long *B, unsigned long *C)
-{
-	unsigned long T = *A;
-	if (*A == *B) {
-		*A = *C;
-	}
-	return T;
-}
-#define cmpxchg(a,b,c) _cmpxchg(&a,&b,&c)
-
-This is really nice to have, since it allows you to only update a variable
-if the variable is what you expect it to be.  You know if it succeeded if
-the return value (the old value of A) is equal to B.
-
-The macro rt_mutex_cmpxchg is used to try to lock and unlock mutexes. If
-the architecture does not support CMPXCHG, then this macro is simply set
-to fail every time.  But if CMPXCHG is supported, then this will
-help out extremely to keep the fast path short.
-
-The use of rt_mutex_cmpxchg with the flags in the owner field help optimize
-the system for architectures that support it.  This will also be explained
-later in this document.
-
-
-Priority adjustments
---------------------
-
-The implementation of the PI code in rtmutex.c has several places that a
-process must adjust its priority.  With the help of the pi_waiters of a
-process this is rather easy to know what needs to be adjusted.
-
-The functions implementing the task adjustments are rt_mutex_adjust_prio
-and rt_mutex_setprio. rt_mutex_setprio is only used in rt_mutex_adjust_prio.
-
-rt_mutex_adjust_prio examines the priority of the task, and the highest
-priority process that is waiting any of mutexes owned by the task. Since
-the pi_waiters of a task holds an order by priority of all the top waiters
-of all the mutexes that the task owns, we simply need to compare the top
-pi waiter to its own normal/deadline priority and take the higher one.
-Then rt_mutex_setprio is called to adjust the priority of the task to the
-new priority. Note that rt_mutex_setprio is defined in kernel/sched/core.c
-to implement the actual change in priority.
-
-(Note:  For the "prio" field in task_struct, the lower the number, the
-	higher the priority. A "prio" of 5 is of higher priority than a
-	"prio" of 10.)
-
-It is interesting to note that rt_mutex_adjust_prio can either increase
-or decrease the priority of the task.  In the case that a higher priority
-process has just blocked on a mutex owned by the task, rt_mutex_adjust_prio
-would increase/boost the task's priority.  But if a higher priority task
-were for some reason to leave the mutex (timeout or signal), this same function
-would decrease/unboost the priority of the task.  That is because the pi_waiters
-always contains the highest priority task that is waiting on a mutex owned
-by the task, so we only need to compare the priority of that top pi waiter
-to the normal priority of the given task.
-
-
-High level overview of the PI chain walk
-----------------------------------------
-
-The PI chain walk is implemented by the function rt_mutex_adjust_prio_chain.
-
-The implementation has gone through several iterations, and has ended up
-with what we believe is the best.  It walks the PI chain by only grabbing
-at most two locks at a time, and is very efficient.
-
-The rt_mutex_adjust_prio_chain can be used either to boost or lower process
-priorities.
-
-rt_mutex_adjust_prio_chain is called with a task to be checked for PI
-(de)boosting (the owner of a mutex that a process is blocking on), a flag to
-check for deadlocking, the mutex that the task owns, a pointer to a waiter
-that is the process's waiter struct that is blocked on the mutex (although this
-parameter may be NULL for deboosting), a pointer to the mutex on which the task
-is blocked, and a top_task as the top waiter of the mutex.
-
-For this explanation, I will not mention deadlock detection. This explanation
-will try to stay at a high level.
-
-When this function is called, there are no locks held.  That also means
-that the state of the owner and lock can change when entered into this function.
-
-Before this function is called, the task has already had rt_mutex_adjust_prio
-performed on it.  This means that the task is set to the priority that it
-should be at, but the rbtree nodes of the task's waiter have not been updated
-with the new priorities, and this task may not be in the proper locations
-in the pi_waiters and waiters trees that the task is blocked on. This function
-solves all that.
-
-The main operation of this function is summarized by Thomas Gleixner in
-rtmutex.c. See the 'Chain walk basics and protection scope' comment for further
-details.
-
-Taking of a mutex (The walk through)
-------------------------------------
-
-OK, now let's take a look at the detailed walk through of what happens when
-taking a mutex.
-
-The first thing that is tried is the fast taking of the mutex.  This is
-done when we have CMPXCHG enabled (otherwise the fast taking automatically
-fails).  Only when the owner field of the mutex is NULL can the lock be
-taken with the CMPXCHG and nothing else needs to be done.
-
-If there is contention on the lock, we go about the slow path
-(rt_mutex_slowlock).
-
-The slow path function is where the task's waiter structure is created on
-the stack.  This is because the waiter structure is only needed for the
-scope of this function.  The waiter structure holds the nodes to store
-the task on the waiters tree of the mutex, and if need be, the pi_waiters
-tree of the owner.
-
-The wait_lock of the mutex is taken since the slow path of unlocking the
-mutex also takes this lock.
-
-We then call try_to_take_rt_mutex.  This is where the architecture that
-does not implement CMPXCHG would always grab the lock (if there's no
-contention).
-
-try_to_take_rt_mutex is used every time the task tries to grab a mutex in the
-slow path.  The first thing that is done here is an atomic setting of
-the "Has Waiters" flag of the mutex's owner field. By setting this flag
-now, the current owner of the mutex being contended for can't release the mutex
-without going into the slow unlock path, and it would then need to grab the
-wait_lock, which this code currently holds. So setting the "Has Waiters" flag
-forces the current owner to synchronize with this code.
-
-The lock is taken if the following are true:
-   1) The lock has no owner
-   2) The current task is the highest priority against all other
-      waiters of the lock
-
-If the task succeeds to acquire the lock, then the task is set as the
-owner of the lock, and if the lock still has waiters, the top_waiter
-(highest priority task waiting on the lock) is added to this task's
-pi_waiters tree.
-
-If the lock is not taken by try_to_take_rt_mutex(), then the
-task_blocks_on_rt_mutex() function is called. This will add the task to
-the lock's waiter tree and propagate the pi chain of the lock as well
-as the lock's owner's pi_waiters tree. This is described in the next
-section.
-
-Task blocks on mutex
---------------------
-
-The accounting of a mutex and process is done with the waiter structure of
-the process.  The "task" field is set to the process, and the "lock" field
-to the mutex.  The rbtree node of waiter are initialized to the processes
-current priority.
-
-Since the wait_lock was taken at the entry of the slow lock, we can safely
-add the waiter to the task waiter tree.  If the current process is the
-highest priority process currently waiting on this mutex, then we remove the
-previous top waiter process (if it exists) from the pi_waiters of the owner,
-and add the current process to that tree.  Since the pi_waiter of the owner
-has changed, we call rt_mutex_adjust_prio on the owner to see if the owner
-should adjust its priority accordingly.
-
-If the owner is also blocked on a lock, and had its pi_waiters changed
-(or deadlock checking is on), we unlock the wait_lock of the mutex and go ahead
-and run rt_mutex_adjust_prio_chain on the owner, as described earlier.
-
-Now all locks are released, and if the current process is still blocked on a
-mutex (waiter "task" field is not NULL), then we go to sleep (call schedule).
-
-Waking up in the loop
----------------------
-
-The task can then wake up for a couple of reasons:
-  1) The previous lock owner released the lock, and the task now is top_waiter
-  2) we received a signal or timeout
-
-In both cases, the task will try again to acquire the lock. If it
-does, then it will take itself off the waiters tree and set itself back
-to the TASK_RUNNING state.
-
-In first case, if the lock was acquired by another task before this task
-could get the lock, then it will go back to sleep and wait to be woken again.
-
-The second case is only applicable for tasks that are grabbing a mutex
-that can wake up before getting the lock, either due to a signal or
-a timeout (i.e. rt_mutex_timed_futex_lock()). When woken, it will try to
-take the lock again, if it succeeds, then the task will return with the
-lock held, otherwise it will return with -EINTR if the task was woken
-by a signal, or -ETIMEDOUT if it timed out.
-
-
-Unlocking the Mutex
--------------------
-
-The unlocking of a mutex also has a fast path for those architectures with
-CMPXCHG.  Since the taking of a mutex on contention always sets the
-"Has Waiters" flag of the mutex's owner, we use this to know if we need to
-take the slow path when unlocking the mutex.  If the mutex doesn't have any
-waiters, the owner field of the mutex would equal the current process and
-the mutex can be unlocked by just replacing the owner field with NULL.
-
-If the owner field has the "Has Waiters" bit set (or CMPXCHG is not available),
-the slow unlock path is taken.
-
-The first thing done in the slow unlock path is to take the wait_lock of the
-mutex.  This synchronizes the locking and unlocking of the mutex.
-
-A check is made to see if the mutex has waiters or not.  On architectures that
-do not have CMPXCHG, this is the location that the owner of the mutex will
-determine if a waiter needs to be awoken or not.  On architectures that
-do have CMPXCHG, that check is done in the fast path, but it is still needed
-in the slow path too.  If a waiter of a mutex woke up because of a signal
-or timeout between the time the owner failed the fast path CMPXCHG check and
-the grabbing of the wait_lock, the mutex may not have any waiters, thus the
-owner still needs to make this check. If there are no waiters then the mutex
-owner field is set to NULL, the wait_lock is released and nothing more is
-needed.
-
-If there are waiters, then we need to wake one up.
-
-On the wake up code, the pi_lock of the current owner is taken.  The top
-waiter of the lock is found and removed from the waiters tree of the mutex
-as well as the pi_waiters tree of the current owner. The "Has Waiters" bit is
-marked to prevent lower priority tasks from stealing the lock.
-
-Finally we unlock the pi_lock of the pending owner and wake it up.
-
-
-Contact
--------
-
-For updates on this document, please email Steven Rostedt <rostedt@goodmis.org>
-
-
-Credits
--------
-
-Author:  Steven Rostedt <rostedt@goodmis.org>
-Updated: Alex Shi <alex.shi@linaro.org>	- 7/6/2017
-
-Original Reviewers:  Ingo Molnar, Thomas Gleixner, Thomas Duetsch, and
-		     Randy Dunlap
-Update (7/6/2017) Reviewers: Steven Rostedt and Sebastian Siewior
-
-Updates
--------
-
-This document was originally written for 2.6.17-rc3-mm1
-was updated on 4.12
diff --git a/Documentation/locking/rt-mutex.rst b/Documentation/locking/rt-mutex.rst
new file mode 100644
index 000000000000..c365dc302081
--- /dev/null
+++ b/Documentation/locking/rt-mutex.rst
@@ -0,0 +1,77 @@
+==================================
+RT-mutex subsystem with PI support
+==================================
+
+RT-mutexes with priority inheritance are used to support PI-futexes,
+which enable pthread_mutex_t priority inheritance attributes
+(PTHREAD_PRIO_INHERIT). [See Documentation/pi-futex.txt for more details
+about PI-futexes.]
+
+This technology was developed in the -rt tree and streamlined for
+pthread_mutex support.
+
+Basic principles:
+-----------------
+
+RT-mutexes extend the semantics of simple mutexes by the priority
+inheritance protocol.
+
+A low priority owner of a rt-mutex inherits the priority of a higher
+priority waiter until the rt-mutex is released. If the temporarily
+boosted owner blocks on a rt-mutex itself it propagates the priority
+boosting to the owner of the other rt_mutex it gets blocked on. The
+priority boosting is immediately removed once the rt_mutex has been
+unlocked.
+
+This approach allows us to shorten the block of high-prio tasks on
+mutexes which protect shared resources. Priority inheritance is not a
+magic bullet for poorly designed applications, but it allows
+well-designed applications to use userspace locks in critical parts of
+a high priority thread, without losing determinism.
+
+The enqueueing of the waiters into the rtmutex waiter tree is done in
+priority order. For same priorities FIFO order is chosen. For each
+rtmutex, only the top priority waiter is enqueued into the owner's
+priority waiters tree. This tree too queues in priority order. Whenever
+the top priority waiter of a task changes (for example it timed out or
+got a signal), the priority of the owner task is readjusted. The
+priority enqueueing is handled by "pi_waiters".
+
+RT-mutexes are optimized for fastpath operations and have no internal
+locking overhead when locking an uncontended mutex or unlocking a mutex
+without waiters. The optimized fastpath operations require cmpxchg
+support. [If that is not available then the rt-mutex internal spinlock
+is used]
+
+The state of the rt-mutex is tracked via the owner field of the rt-mutex
+structure:
+
+lock->owner holds the task_struct pointer of the owner. Bit 0 is used to
+keep track of the "lock has waiters" state:
+
+ ============ ======= ================================================
+ owner        bit0    Notes
+ ============ ======= ================================================
+ NULL         0       lock is free (fast acquire possible)
+ NULL         1       lock is free and has waiters and the top waiter
+		      is going to take the lock [1]_
+ taskpointer  0       lock is held (fast release possible)
+ taskpointer  1       lock is held and has waiters [2]_
+ ============ ======= ================================================
+
+The fast atomic compare exchange based acquire and release is only
+possible when bit 0 of lock->owner is 0.
+
+.. [1] It also can be a transitional state when grabbing the lock
+       while ->wait_lock is held. To prevent any fast path cmpxchg to the lock,
+       we need to set the bit0 before looking at the lock, and the owner may
+       be NULL in this small time, hence this can be a transitional state.
+
+.. [2] There is a small time when bit 0 is set but there are no
+       waiters. This can happen when grabbing the lock in the slow path.
+       To prevent a cmpxchg of the owner releasing the lock, we need to
+       set this bit before looking at the lock.
+
+BTW, there is still technically a "Pending Owner", it's just not called
+that anymore. The pending owner happens to be the top_waiter of a lock
+that has no owner and has been woken up to grab the lock.
diff --git a/Documentation/locking/rt-mutex.txt b/Documentation/locking/rt-mutex.txt
deleted file mode 100644
index 35793e003041..000000000000
--- a/Documentation/locking/rt-mutex.txt
+++ /dev/null
@@ -1,73 +0,0 @@
-RT-mutex subsystem with PI support
-----------------------------------
-
-RT-mutexes with priority inheritance are used to support PI-futexes,
-which enable pthread_mutex_t priority inheritance attributes
-(PTHREAD_PRIO_INHERIT). [See Documentation/pi-futex.txt for more details
-about PI-futexes.]
-
-This technology was developed in the -rt tree and streamlined for
-pthread_mutex support.
-
-Basic principles:
------------------
-
-RT-mutexes extend the semantics of simple mutexes by the priority
-inheritance protocol.
-
-A low priority owner of a rt-mutex inherits the priority of a higher
-priority waiter until the rt-mutex is released. If the temporarily
-boosted owner blocks on a rt-mutex itself it propagates the priority
-boosting to the owner of the other rt_mutex it gets blocked on. The
-priority boosting is immediately removed once the rt_mutex has been
-unlocked.
-
-This approach allows us to shorten the block of high-prio tasks on
-mutexes which protect shared resources. Priority inheritance is not a
-magic bullet for poorly designed applications, but it allows
-well-designed applications to use userspace locks in critical parts of
-an high priority thread, without losing determinism.
-
-The enqueueing of the waiters into the rtmutex waiter tree is done in
-priority order. For same priorities FIFO order is chosen. For each
-rtmutex, only the top priority waiter is enqueued into the owner's
-priority waiters tree. This tree too queues in priority order. Whenever
-the top priority waiter of a task changes (for example it timed out or
-got a signal), the priority of the owner task is readjusted. The
-priority enqueueing is handled by "pi_waiters".
-
-RT-mutexes are optimized for fastpath operations and have no internal
-locking overhead when locking an uncontended mutex or unlocking a mutex
-without waiters. The optimized fastpath operations require cmpxchg
-support. [If that is not available then the rt-mutex internal spinlock
-is used]
-
-The state of the rt-mutex is tracked via the owner field of the rt-mutex
-structure:
-
-lock->owner holds the task_struct pointer of the owner. Bit 0 is used to
-keep track of the "lock has waiters" state.
-
- owner        bit0
- NULL         0       lock is free (fast acquire possible)
- NULL         1       lock is free and has waiters and the top waiter
-			is going to take the lock*
- taskpointer  0       lock is held (fast release possible)
- taskpointer  1       lock is held and has waiters**
-
-The fast atomic compare exchange based acquire and release is only
-possible when bit 0 of lock->owner is 0.
-
-(*) It also can be a transitional state when grabbing the lock
-with ->wait_lock is held. To prevent any fast path cmpxchg to the lock,
-we need to set the bit0 before looking at the lock, and the owner may be
-NULL in this small time, hence this can be a transitional state.
-
-(**) There is a small time when bit 0 is set but there are no
-waiters. This can happen when grabbing the lock in the slow path.
-To prevent a cmpxchg of the owner releasing the lock, we need to
-set this bit before looking at the lock.
-
-BTW, there is still technically a "Pending Owner", it's just not called
-that anymore. The pending owner happens to be the top_waiter of a lock
-that has no owner and has been woken up to grab the lock.
diff --git a/Documentation/locking/spinlocks.rst b/Documentation/locking/spinlocks.rst
new file mode 100644
index 000000000000..098107fb7d86
--- /dev/null
+++ b/Documentation/locking/spinlocks.rst
@@ -0,0 +1,177 @@
+===============
+Locking lessons
+===============
+
+Lesson 1: Spin locks
+====================
+
+The most basic primitive for locking is spinlock::
+
+  static DEFINE_SPINLOCK(xxx_lock);
+
+	unsigned long flags;
+
+	spin_lock_irqsave(&xxx_lock, flags);
+	... critical section here ..
+	spin_unlock_irqrestore(&xxx_lock, flags);
+
+The above is always safe. It will disable interrupts _locally_, but the
+spinlock itself will guarantee the global lock, so it will guarantee that
+there is only one thread-of-control within the region(s) protected by that
+lock. This works well even under UP also, so the code does _not_ need to
+worry about UP vs SMP issues: the spinlocks work correctly under both.
+
+   NOTE! Implications of spin_locks for memory are further described in:
+
+     Documentation/memory-barriers.txt
+
+       (5) LOCK operations.
+
+       (6) UNLOCK operations.
+
+The above is usually pretty simple (you usually need and want only one
+spinlock for most things - using more than one spinlock can make things a
+lot more complex and even slower and is usually worth it only for
+sequences that you **know** need to be split up: avoid it at all cost if you
+aren't sure).
+
+This is really the only really hard part about spinlocks: once you start
+using spinlocks they tend to expand to areas you might not have noticed
+before, because you have to make sure the spinlocks correctly protect the
+shared data structures **everywhere** they are used. The spinlocks are most
+easily added to places that are completely independent of other code (for
+example, internal driver data structures that nobody else ever touches).
+
+   NOTE! The spin-lock is safe only when you **also** use the lock itself
+   to do locking across CPU's, which implies that EVERYTHING that
+   touches a shared variable has to agree about the spinlock they want
+   to use.
+
+----
+
+Lesson 2: reader-writer spinlocks.
+==================================
+
+If your data accesses have a very natural pattern where you usually tend
+to mostly read from the shared variables, the reader-writer locks
+(rw_lock) versions of the spinlocks are sometimes useful. They allow multiple
+readers to be in the same critical region at once, but if somebody wants
+to change the variables it has to get an exclusive write lock.
+
+   NOTE! reader-writer locks require more atomic memory operations than
+   simple spinlocks.  Unless the reader critical section is long, you
+   are better off just using spinlocks.
+
+The routines look the same as above::
+
+   rwlock_t xxx_lock = __RW_LOCK_UNLOCKED(xxx_lock);
+
+	unsigned long flags;
+
+	read_lock_irqsave(&xxx_lock, flags);
+	.. critical section that only reads the info ...
+	read_unlock_irqrestore(&xxx_lock, flags);
+
+	write_lock_irqsave(&xxx_lock, flags);
+	.. read and write exclusive access to the info ...
+	write_unlock_irqrestore(&xxx_lock, flags);
+
+The above kind of lock may be useful for complex data structures like
+linked lists, especially searching for entries without changing the list
+itself.  The read lock allows many concurrent readers.  Anything that
+**changes** the list will have to get the write lock.
+
+   NOTE! RCU is better for list traversal, but requires careful
+   attention to design detail (see Documentation/RCU/listRCU.txt).
+
+Also, you cannot "upgrade" a read-lock to a write-lock, so if you at _any_
+time need to do any changes (even if you don't do it every time), you have
+to get the write-lock at the very beginning.
+
+   NOTE! We are working hard to remove reader-writer spinlocks in most
+   cases, so please don't add a new one without consensus.  (Instead, see
+   Documentation/RCU/rcu.txt for complete information.)
+
+----
+
+Lesson 3: spinlocks revisited.
+==============================
+
+The single spin-lock primitives above are by no means the only ones. They
+are the most safe ones, and the ones that work under all circumstances,
+but partly **because** they are safe they are also fairly slow. They are slower
+than they'd need to be, because they do have to disable interrupts
+(which is just a single instruction on an x86, but it's an expensive one -
+and on other architectures it can be worse).
+
+If you have a case where you have to protect a data structure across
+several CPU's and you want to use spinlocks you can potentially use
+cheaper versions of the spinlocks. IFF you know that the spinlocks are
+never used in interrupt handlers, you can use the non-irq versions::
+
+	spin_lock(&lock);
+	...
+	spin_unlock(&lock);
+
+(and the equivalent read-write versions too, of course). The spinlock will
+guarantee the same kind of exclusive access, and it will be much faster.
+This is useful if you know that the data in question is only ever
+manipulated from a "process context", ie no interrupts involved.
+
+The reasons you mustn't use these versions if you have interrupts that
+play with the spinlock is that you can get deadlocks::
+
+	spin_lock(&lock);
+	...
+		<- interrupt comes in:
+			spin_lock(&lock);
+
+where an interrupt tries to lock an already locked variable. This is ok if
+the other interrupt happens on another CPU, but it is _not_ ok if the
+interrupt happens on the same CPU that already holds the lock, because the
+lock will obviously never be released (because the interrupt is waiting
+for the lock, and the lock-holder is interrupted by the interrupt and will
+not continue until the interrupt has been processed).
+
+(This is also the reason why the irq-versions of the spinlocks only need
+to disable the _local_ interrupts - it's ok to use spinlocks in interrupts
+on other CPU's, because an interrupt on another CPU doesn't interrupt the
+CPU that holds the lock, so the lock-holder can continue and eventually
+releases the lock).
+
+Note that you can be clever with read-write locks and interrupts. For
+example, if you know that the interrupt only ever gets a read-lock, then
+you can use a non-irq version of read locks everywhere - because they
+don't block on each other (and thus there is no dead-lock wrt interrupts).
+But when you do the write-lock, you have to use the irq-safe version.
+
+For an example of being clever with rw-locks, see the "waitqueue_lock"
+handling in kernel/sched/core.c - nothing ever _changes_ a wait-queue from
+within an interrupt, they only read the queue in order to know whom to
+wake up. So read-locks are safe (which is good: they are very common
+indeed), while write-locks need to protect themselves against interrupts.
+
+		Linus
+
+----
+
+Reference information:
+======================
+
+For dynamic initialization, use spin_lock_init() or rwlock_init() as
+appropriate::
+
+   spinlock_t xxx_lock;
+   rwlock_t xxx_rw_lock;
+
+   static int __init xxx_init(void)
+   {
+	spin_lock_init(&xxx_lock);
+	rwlock_init(&xxx_rw_lock);
+	...
+   }
+
+   module_init(xxx_init);
+
+For static initialization, use DEFINE_SPINLOCK() / DEFINE_RWLOCK() or
+__SPIN_LOCK_UNLOCKED() / __RW_LOCK_UNLOCKED() as appropriate.
diff --git a/Documentation/locking/spinlocks.txt b/Documentation/locking/spinlocks.txt
deleted file mode 100644
index ff35e40bdf5b..000000000000
--- a/Documentation/locking/spinlocks.txt
+++ /dev/null
@@ -1,167 +0,0 @@
-Lesson 1: Spin locks
-
-The most basic primitive for locking is spinlock.
-
-static DEFINE_SPINLOCK(xxx_lock);
-
-	unsigned long flags;
-
-	spin_lock_irqsave(&xxx_lock, flags);
-	... critical section here ..
-	spin_unlock_irqrestore(&xxx_lock, flags);
-
-The above is always safe. It will disable interrupts _locally_, but the
-spinlock itself will guarantee the global lock, so it will guarantee that
-there is only one thread-of-control within the region(s) protected by that
-lock. This works well even under UP also, so the code does _not_ need to
-worry about UP vs SMP issues: the spinlocks work correctly under both.
-
-   NOTE! Implications of spin_locks for memory are further described in:
-
-     Documentation/memory-barriers.txt
-       (5) LOCK operations.
-       (6) UNLOCK operations.
-
-The above is usually pretty simple (you usually need and want only one
-spinlock for most things - using more than one spinlock can make things a
-lot more complex and even slower and is usually worth it only for
-sequences that you _know_ need to be split up: avoid it at all cost if you
-aren't sure).
-
-This is really the only really hard part about spinlocks: once you start
-using spinlocks they tend to expand to areas you might not have noticed
-before, because you have to make sure the spinlocks correctly protect the
-shared data structures _everywhere_ they are used. The spinlocks are most
-easily added to places that are completely independent of other code (for
-example, internal driver data structures that nobody else ever touches).
-
-   NOTE! The spin-lock is safe only when you _also_ use the lock itself
-   to do locking across CPU's, which implies that EVERYTHING that
-   touches a shared variable has to agree about the spinlock they want
-   to use.
-
-----
-
-Lesson 2: reader-writer spinlocks.
-
-If your data accesses have a very natural pattern where you usually tend
-to mostly read from the shared variables, the reader-writer locks
-(rw_lock) versions of the spinlocks are sometimes useful. They allow multiple
-readers to be in the same critical region at once, but if somebody wants
-to change the variables it has to get an exclusive write lock.
-
-   NOTE! reader-writer locks require more atomic memory operations than
-   simple spinlocks.  Unless the reader critical section is long, you
-   are better off just using spinlocks.
-
-The routines look the same as above:
-
-   rwlock_t xxx_lock = __RW_LOCK_UNLOCKED(xxx_lock);
-
-	unsigned long flags;
-
-	read_lock_irqsave(&xxx_lock, flags);
-	.. critical section that only reads the info ...
-	read_unlock_irqrestore(&xxx_lock, flags);
-
-	write_lock_irqsave(&xxx_lock, flags);
-	.. read and write exclusive access to the info ...
-	write_unlock_irqrestore(&xxx_lock, flags);
-
-The above kind of lock may be useful for complex data structures like
-linked lists, especially searching for entries without changing the list
-itself.  The read lock allows many concurrent readers.  Anything that
-_changes_ the list will have to get the write lock.
-
-   NOTE! RCU is better for list traversal, but requires careful
-   attention to design detail (see Documentation/RCU/listRCU.txt).
-
-Also, you cannot "upgrade" a read-lock to a write-lock, so if you at _any_
-time need to do any changes (even if you don't do it every time), you have
-to get the write-lock at the very beginning.
-
-   NOTE! We are working hard to remove reader-writer spinlocks in most
-   cases, so please don't add a new one without consensus.  (Instead, see
-   Documentation/RCU/rcu.txt for complete information.)
-
-----
-
-Lesson 3: spinlocks revisited.
-
-The single spin-lock primitives above are by no means the only ones. They
-are the most safe ones, and the ones that work under all circumstances,
-but partly _because_ they are safe they are also fairly slow. They are slower
-than they'd need to be, because they do have to disable interrupts
-(which is just a single instruction on a x86, but it's an expensive one -
-and on other architectures it can be worse).
-
-If you have a case where you have to protect a data structure across
-several CPU's and you want to use spinlocks you can potentially use
-cheaper versions of the spinlocks. IFF you know that the spinlocks are
-never used in interrupt handlers, you can use the non-irq versions:
-
-	spin_lock(&lock);
-	...
-	spin_unlock(&lock);
-
-(and the equivalent read-write versions too, of course). The spinlock will
-guarantee the same kind of exclusive access, and it will be much faster.
-This is useful if you know that the data in question is only ever
-manipulated from a "process context", ie no interrupts involved.
-
-The reasons you mustn't use these versions if you have interrupts that
-play with the spinlock is that you can get deadlocks:
-
-	spin_lock(&lock);
-	...
-		<- interrupt comes in:
-			spin_lock(&lock);
-
-where an interrupt tries to lock an already locked variable. This is ok if
-the other interrupt happens on another CPU, but it is _not_ ok if the
-interrupt happens on the same CPU that already holds the lock, because the
-lock will obviously never be released (because the interrupt is waiting
-for the lock, and the lock-holder is interrupted by the interrupt and will
-not continue until the interrupt has been processed).
-
-(This is also the reason why the irq-versions of the spinlocks only need
-to disable the _local_ interrupts - it's ok to use spinlocks in interrupts
-on other CPU's, because an interrupt on another CPU doesn't interrupt the
-CPU that holds the lock, so the lock-holder can continue and eventually
-releases the lock).
-
-Note that you can be clever with read-write locks and interrupts. For
-example, if you know that the interrupt only ever gets a read-lock, then
-you can use a non-irq version of read locks everywhere - because they
-don't block on each other (and thus there is no dead-lock wrt interrupts.
-But when you do the write-lock, you have to use the irq-safe version.
-
-For an example of being clever with rw-locks, see the "waitqueue_lock"
-handling in kernel/sched/core.c - nothing ever _changes_ a wait-queue from
-within an interrupt, they only read the queue in order to know whom to
-wake up. So read-locks are safe (which is good: they are very common
-indeed), while write-locks need to protect themselves against interrupts.
-
-		Linus
-
-----
-
-Reference information:
-
-For dynamic initialization, use spin_lock_init() or rwlock_init() as
-appropriate:
-
-   spinlock_t xxx_lock;
-   rwlock_t xxx_rw_lock;
-
-   static int __init xxx_init(void)
-   {
-	spin_lock_init(&xxx_lock);
-	rwlock_init(&xxx_rw_lock);
-	...
-   }
-
-   module_init(xxx_init);
-
-For static initialization, use DEFINE_SPINLOCK() / DEFINE_RWLOCK() or
-__SPIN_LOCK_UNLOCKED() / __RW_LOCK_UNLOCKED() as appropriate.
diff --git a/Documentation/locking/ww-mutex-design.rst b/Documentation/locking/ww-mutex-design.rst
new file mode 100644
index 000000000000..1846c199da23
--- /dev/null
+++ b/Documentation/locking/ww-mutex-design.rst
@@ -0,0 +1,393 @@
+======================================
+Wound/Wait Deadlock-Proof Mutex Design
+======================================
+
+Please read mutex-design.txt first, as it applies to wait/wound mutexes too.
+
+Motivation for WW-Mutexes
+-------------------------
+
+GPU's do operations that commonly involve many buffers.  Those buffers
+can be shared across contexts/processes, exist in different memory
+domains (for example VRAM vs system memory), and so on.  And with
+PRIME / dmabuf, they can even be shared across devices.  So there are
+a handful of situations where the driver needs to wait for buffers to
+become ready.  If you think about this in terms of waiting on a buffer
+mutex for it to become available, this presents a problem because
+there is no way to guarantee that buffers appear in an execbuf/batch in
+the same order in all contexts.  That is directly under control of
+userspace, and a result of the sequence of GL calls that an application
+makes.	Which results in the potential for deadlock.  The problem gets
+more complex when you consider that the kernel may need to migrate the
+buffer(s) into VRAM before the GPU operates on the buffer(s), which
+may in turn require evicting some other buffers (and you don't want to
+evict other buffers which are already queued up to the GPU), but for a
+simplified understanding of the problem you can ignore this.
+
+The algorithm that the TTM graphics subsystem came up with for dealing with
+this problem is quite simple.  For each group of buffers (execbuf) that need
+to be locked, the caller would be assigned a unique reservation id/ticket,
+from a global counter.  In case of deadlock while locking all the buffers
+associated with an execbuf, the one with the lowest reservation ticket (i.e.
+the oldest task) wins, and the one with the higher reservation id (i.e. the
+younger task) unlocks all of the buffers that it has already locked, and then
+tries again.
+
+In the RDBMS literature, a reservation ticket is associated with a transaction,
+and the deadlock handling approach is called Wait-Die. The name is based on
+the actions of a locking thread when it encounters an already locked mutex.
+If the transaction holding the lock is younger, the locking transaction waits.
+If the transaction holding the lock is older, the locking transaction backs off
+and dies. Hence Wait-Die.
+There is also another algorithm called Wound-Wait:
+If the transaction holding the lock is younger, the locking transaction
+wounds the transaction holding the lock, requesting it to die.
+If the transaction holding the lock is older, it waits for the other
+transaction. Hence Wound-Wait.
+The two algorithms are both fair in that a transaction will eventually succeed.
+However, the Wound-Wait algorithm is typically stated to generate fewer backoffs
+compared to Wait-Die, but is, on the other hand, associated with more work than
+Wait-Die when recovering from a backoff. Wound-Wait is also a preemptive
+algorithm in that transactions are wounded by other transactions, and that
+requires a reliable way to pick up the wounded condition and preempt the
+running transaction. Note that this is not the same as process preemption. A
+Wound-Wait transaction is considered preempted when it dies (returning
+-EDEADLK) following a wound.
+
+Concepts
+--------
+
+Compared to normal mutexes two additional concepts/objects show up in the lock
+interface for w/w mutexes:
+
+Acquire context: To ensure eventual forward progress it is important that a task
+trying to acquire locks doesn't grab a new reservation id, but keeps the one it
+acquired when starting the lock acquisition. This ticket is stored in the
+acquire context. Furthermore the acquire context keeps track of debugging state
+to catch w/w mutex interface abuse. An acquire context is representing a
+transaction.
+
+W/w class: In contrast to normal mutexes the lock class needs to be explicit for
+w/w mutexes, since it is required to initialize the acquire context. The lock
+class also specifies what algorithm to use, Wound-Wait or Wait-Die.
+
+Furthermore there are three different classes of w/w lock acquire functions:
+
+* Normal lock acquisition with a context, using ww_mutex_lock.
+
+* Slowpath lock acquisition on the contending lock, used by the task that just
+  killed its transaction after having dropped all already acquired locks.
+  These functions have the _slow postfix.
+
+  From a simple semantics point-of-view the _slow functions are not strictly
+  required, since simply calling the normal ww_mutex_lock functions on the
+  contending lock (after having dropped all other already acquired locks) will
+  work correctly. After all if no other ww mutex has been acquired yet there's
+  no deadlock potential and hence the ww_mutex_lock call will block and not
+  prematurely return -EDEADLK. The advantage of the _slow functions is in
+  interface safety:
+
+  - ww_mutex_lock has a __must_check int return type, whereas ww_mutex_lock_slow
+    has a void return type. Note that since ww mutex code needs loops/retries
+    anyway the __must_check doesn't result in spurious warnings, even though the
+    very first lock operation can never fail.
+  - When full debugging is enabled ww_mutex_lock_slow checks that all acquired
+    ww mutex have been released (preventing deadlocks) and makes sure that we
+    block on the contending lock (preventing spinning through the -EDEADLK
+    slowpath until the contended lock can be acquired).
+
+* Functions to only acquire a single w/w mutex, which results in the exact same
+  semantics as a normal mutex. This is done by calling ww_mutex_lock with a NULL
+  context.
+
+  Again this is not strictly required. But often you only want to acquire a
+  single lock in which case it's pointless to set up an acquire context (and so
+  better to avoid grabbing a deadlock avoidance ticket).
+
+Of course, all the usual variants for handling wake-ups due to signals are also
+provided.
+
+Usage
+-----
+
+The algorithm (Wait-Die vs Wound-Wait) is chosen by using either
+DEFINE_WW_CLASS() (Wound-Wait) or DEFINE_WD_CLASS() (Wait-Die).
+As a rough rule of thumb, use Wound-Wait iff you
+expect the number of simultaneous competing transactions to be typically small,
+and you want to reduce the number of rollbacks.
+
+Three different ways to acquire locks within the same w/w class. Common
+definitions for methods #1 and #2::
+
+  static DEFINE_WW_CLASS(ww_class);
+
+  struct obj {
+	struct ww_mutex lock;
+	/* obj data */
+  };
+
+  struct obj_entry {
+	struct list_head head;
+	struct obj *obj;
+  };
+
+Method 1, using a list in execbuf->buffers that's not allowed to be reordered.
+This is useful if a list of required objects is already tracked somewhere.
+Furthermore the lock helper can propagate the -EALREADY return code back to
+the caller as a signal that an object is twice on the list. This is useful if
+the list is constructed from userspace input and the ABI requires userspace to
+not have duplicate entries (e.g. for a gpu commandbuffer submission ioctl)::
+
+  int lock_objs(struct list_head *list, struct ww_acquire_ctx *ctx)
+  {
+	struct obj *res_obj = NULL;
+	struct obj_entry *contended_entry = NULL;
+	struct obj_entry *entry;
+
+	ww_acquire_init(ctx, &ww_class);
+
+  retry:
+	list_for_each_entry (entry, list, head) {
+		if (entry->obj == res_obj) {
+			res_obj = NULL;
+			continue;
+		}
+		ret = ww_mutex_lock(&entry->obj->lock, ctx);
+		if (ret < 0) {
+			contended_entry = entry;
+			goto err;
+		}
+	}
+
+	ww_acquire_done(ctx);
+	return 0;
+
+  err:
+	list_for_each_entry_continue_reverse (entry, list, head)
+		ww_mutex_unlock(&entry->obj->lock);
+
+	if (res_obj)
+		ww_mutex_unlock(&res_obj->lock);
+
+	if (ret == -EDEADLK) {
+		/* we lost out in a seqno race, lock and retry.. */
+		ww_mutex_lock_slow(&contended_entry->obj->lock, ctx);
+		res_obj = contended_entry->obj;
+		goto retry;
+	}
+	ww_acquire_fini(ctx);
+
+	return ret;
+  }
+
+Method 2, using a list in execbuf->buffers that can be reordered. Same semantics
+of duplicate entry detection using -EALREADY as method 1 above. But the
+list-reordering allows for a bit more idiomatic code::
+
+  int lock_objs(struct list_head *list, struct ww_acquire_ctx *ctx)
+  {
+	struct obj_entry *entry, *entry2;
+
+	ww_acquire_init(ctx, &ww_class);
+
+	list_for_each_entry (entry, list, head) {
+		ret = ww_mutex_lock(&entry->obj->lock, ctx);
+		if (ret < 0) {
+			entry2 = entry;
+
+			list_for_each_entry_continue_reverse (entry2, list, head)
+				ww_mutex_unlock(&entry2->obj->lock);
+
+			if (ret != -EDEADLK) {
+				ww_acquire_fini(ctx);
+				return ret;
+			}
+
+			/* we lost out in a seqno race, lock and retry.. */
+			ww_mutex_lock_slow(&entry->obj->lock, ctx);
+
+			/*
+			 * Move buf to head of the list, this will point
+			 * buf->next to the first unlocked entry,
+			 * restarting the for loop.
+			 */
+			list_del(&entry->head);
+			list_add(&entry->head, list);
+		}
+	}
+
+	ww_acquire_done(ctx);
+	return 0;
+  }
+
+Unlocking works the same way for both methods #1 and #2::
+
+  void unlock_objs(struct list_head *list, struct ww_acquire_ctx *ctx)
+  {
+	struct obj_entry *entry;
+
+	list_for_each_entry (entry, list, head)
+		ww_mutex_unlock(&entry->obj->lock);
+
+	ww_acquire_fini(ctx);
+  }
+
+Method 3 is useful if the list of objects is constructed ad-hoc and not upfront,
+e.g. when adjusting edges in a graph where each node has its own ww_mutex lock,
+and edges can only be changed when holding the locks of all involved nodes. w/w
+mutexes are a natural fit for such a case for two reasons:
+
+- They can handle lock-acquisition in any order which allows us to start walking
+  a graph from a starting point and then iteratively discovering new edges and
+  locking down the nodes those edges connect to.
+- Due to the -EALREADY return code signalling that a given object is already
+  held there's no need for additional book-keeping to break cycles in the graph
+  or keep track of which locks are already held (when using more than one node
+  as a starting point).
+
+Note that this approach differs in two important ways from the above methods:
+
+- Since the list of objects is dynamically constructed (and might very well be
+  different when retrying due to hitting the -EDEADLK die condition) there's
+  no need to keep any object on a persistent list when it's not locked. We can
+  therefore move the list_head into the object itself.
+- On the other hand the dynamic object list construction also means that the -EALREADY return
+  code can't be propagated.
+
+Note also that methods #1 and #2 and method #3 can be combined, e.g. to first lock a
+list of starting nodes (passed in from userspace) using one of the above
+methods. And then lock any additional objects affected by the operations using
+method #3 below. The backoff/retry procedure will be a bit more involved, since
+when the dynamic locking step hits -EDEADLK we also need to unlock all the
+objects acquired with the fixed list. But the w/w mutex debug checks will catch
+any interface misuse for these cases.
+
+Also, method 3 can't fail the lock acquisition step since it doesn't return
+-EALREADY. Of course this would be different when using the _interruptible
+variants, but that's outside of the scope of these examples here::
+
+  struct obj {
+	struct ww_mutex ww_mutex;
+	struct list_head locked_list;
+  };
+
+  static DEFINE_WW_CLASS(ww_class);
+
+  void __unlock_objs(struct list_head *list)
+  {
+	struct obj *entry, *temp;
+
+	list_for_each_entry_safe (entry, temp, list, locked_list) {
+		/* need to do that before unlocking, since only the current lock holder is
+		allowed to use object */
+		list_del(&entry->locked_list);
+		ww_mutex_unlock(entry->ww_mutex)
+	}
+  }
+
+  void lock_objs(struct list_head *list, struct ww_acquire_ctx *ctx)
+  {
+	struct obj *obj;
+
+	ww_acquire_init(ctx, &ww_class);
+
+  retry:
+	/* re-init loop start state */
+	loop {
+		/* magic code which walks over a graph and decides which objects
+		 * to lock */
+
+		ret = ww_mutex_lock(obj->ww_mutex, ctx);
+		if (ret == -EALREADY) {
+			/* we have that one already, get to the next object */
+			continue;
+		}
+		if (ret == -EDEADLK) {
+			__unlock_objs(list);
+
+			ww_mutex_lock_slow(obj, ctx);
+			list_add(&entry->locked_list, list);
+			goto retry;
+		}
+
+		/* locked a new object, add it to the list */
+		list_add_tail(&entry->locked_list, list);
+	}
+
+	ww_acquire_done(ctx);
+	return 0;
+  }
+
+  void unlock_objs(struct list_head *list, struct ww_acquire_ctx *ctx)
+  {
+	__unlock_objs(list);
+	ww_acquire_fini(ctx);
+  }
+
+Method 4: Only lock one single object. In that case deadlock detection and
+prevention is obviously overkill, since with grabbing just one lock you can't
+produce a deadlock within just one class. To simplify this case the w/w mutex
+api can be used with a NULL context.
+
+Implementation Details
+----------------------
+
+Design:
+^^^^^^^
+
+  ww_mutex currently encapsulates a struct mutex, this means no extra overhead for
+  normal mutex locks, which are far more common. As such there is only a small
+  increase in code size if wait/wound mutexes are not used.
+
+  We maintain the following invariants for the wait list:
+
+  (1) Waiters with an acquire context are sorted by stamp order; waiters
+      without an acquire context are interspersed in FIFO order.
+  (2) For Wait-Die, among waiters with contexts, only the first one can have
+      other locks acquired already (ctx->acquired > 0). Note that this waiter
+      may come after other waiters without contexts in the list.
+
+  The Wound-Wait preemption is implemented with a lazy-preemption scheme:
+  The wounded status of the transaction is checked only when there is
+  contention for a new lock and hence a true chance of deadlock. In that
+  situation, if the transaction is wounded, it backs off, clears the
+  wounded status and retries. A great benefit of implementing preemption in
+  this way is that the wounded transaction can identify a contending lock to
+  wait for before restarting the transaction. Just blindly restarting the
+  transaction would likely make the transaction end up in a situation where
+  it would have to back off again.
+
+  In general, not much contention is expected. The locks are typically used to
+  serialize access to resources for devices, and optimization focus should
+  therefore be directed towards the uncontended cases.
+
+Lockdep:
+^^^^^^^^
+
+  Special care has been taken to warn for as many cases of api abuse
+  as possible. Some common api abuses will be caught with
+  CONFIG_DEBUG_MUTEXES, but CONFIG_PROVE_LOCKING is recommended.
+
+  Some of the errors which will be warned about:
+   - Forgetting to call ww_acquire_fini or ww_acquire_init.
+   - Attempting to lock more mutexes after ww_acquire_done.
+   - Attempting to lock the wrong mutex after -EDEADLK and
+     unlocking all mutexes.
+   - Attempting to lock the right mutex after -EDEADLK,
+     before unlocking all mutexes.
+
+   - Calling ww_mutex_lock_slow before -EDEADLK was returned.
+
+   - Unlocking mutexes with the wrong unlock function.
+   - Calling one of the ww_acquire_* twice on the same context.
+   - Using a different ww_class for the mutex than for the ww_acquire_ctx.
+   - Normal lockdep errors that can result in deadlocks.
+
+  Some of the lockdep errors that can result in deadlocks:
+   - Calling ww_acquire_init to initialize a second ww_acquire_ctx before
+     having called ww_acquire_fini on the first.
+   - 'normal' deadlocks that can occur.
+
+FIXME:
+  Update this section once we have the TASK_DEADLOCK task state flag magic
+  implemented.
diff --git a/Documentation/locking/ww-mutex-design.txt b/Documentation/locking/ww-mutex-design.txt
deleted file mode 100644
index f0ed7c30e695..000000000000
--- a/Documentation/locking/ww-mutex-design.txt
+++ /dev/null
@@ -1,383 +0,0 @@
-Wound/Wait Deadlock-Proof Mutex Design
-======================================
-
-Please read mutex-design.txt first, as it applies to wait/wound mutexes too.
-
-Motivation for WW-Mutexes
--------------------------
-
-GPU's do operations that commonly involve many buffers.  Those buffers
-can be shared across contexts/processes, exist in different memory
-domains (for example VRAM vs system memory), and so on.  And with
-PRIME / dmabuf, they can even be shared across devices.  So there are
-a handful of situations where the driver needs to wait for buffers to
-become ready.  If you think about this in terms of waiting on a buffer
-mutex for it to become available, this presents a problem because
-there is no way to guarantee that buffers appear in a execbuf/batch in
-the same order in all contexts.  That is directly under control of
-userspace, and a result of the sequence of GL calls that an application
-makes.	Which results in the potential for deadlock.  The problem gets
-more complex when you consider that the kernel may need to migrate the
-buffer(s) into VRAM before the GPU operates on the buffer(s), which
-may in turn require evicting some other buffers (and you don't want to
-evict other buffers which are already queued up to the GPU), but for a
-simplified understanding of the problem you can ignore this.
-
-The algorithm that the TTM graphics subsystem came up with for dealing with
-this problem is quite simple.  For each group of buffers (execbuf) that need
-to be locked, the caller would be assigned a unique reservation id/ticket,
-from a global counter.  In case of deadlock while locking all the buffers
-associated with a execbuf, the one with the lowest reservation ticket (i.e.
-the oldest task) wins, and the one with the higher reservation id (i.e. the
-younger task) unlocks all of the buffers that it has already locked, and then
-tries again.
-
-In the RDBMS literature, a reservation ticket is associated with a transaction.
-and the deadlock handling approach is called Wait-Die. The name is based on
-the actions of a locking thread when it encounters an already locked mutex.
-If the transaction holding the lock is younger, the locking transaction waits.
-If the transaction holding the lock is older, the locking transaction backs off
-and dies. Hence Wait-Die.
-There is also another algorithm called Wound-Wait:
-If the transaction holding the lock is younger, the locking transaction
-wounds the transaction holding the lock, requesting it to die.
-If the transaction holding the lock is older, it waits for the other
-transaction. Hence Wound-Wait.
-The two algorithms are both fair in that a transaction will eventually succeed.
-However, the Wound-Wait algorithm is typically stated to generate fewer backoffs
-compared to Wait-Die, but is, on the other hand, associated with more work than
-Wait-Die when recovering from a backoff. Wound-Wait is also a preemptive
-algorithm in that transactions are wounded by other transactions, and that
-requires a reliable way to pick up up the wounded condition and preempt the
-running transaction. Note that this is not the same as process preemption. A
-Wound-Wait transaction is considered preempted when it dies (returning
--EDEADLK) following a wound.
-
-Concepts
---------
-
-Compared to normal mutexes two additional concepts/objects show up in the lock
-interface for w/w mutexes:
-
-Acquire context: To ensure eventual forward progress it is important the a task
-trying to acquire locks doesn't grab a new reservation id, but keeps the one it
-acquired when starting the lock acquisition. This ticket is stored in the
-acquire context. Furthermore the acquire context keeps track of debugging state
-to catch w/w mutex interface abuse. An acquire context is representing a
-transaction.
-
-W/w class: In contrast to normal mutexes the lock class needs to be explicit for
-w/w mutexes, since it is required to initialize the acquire context. The lock
-class also specifies what algorithm to use, Wound-Wait or Wait-Die.
-
-Furthermore there are three different class of w/w lock acquire functions:
-
-* Normal lock acquisition with a context, using ww_mutex_lock.
-
-* Slowpath lock acquisition on the contending lock, used by the task that just
-  killed its transaction after having dropped all already acquired locks.
-  These functions have the _slow postfix.
-
-  From a simple semantics point-of-view the _slow functions are not strictly
-  required, since simply calling the normal ww_mutex_lock functions on the
-  contending lock (after having dropped all other already acquired locks) will
-  work correctly. After all if no other ww mutex has been acquired yet there's
-  no deadlock potential and hence the ww_mutex_lock call will block and not
-  prematurely return -EDEADLK. The advantage of the _slow functions is in
-  interface safety:
-  - ww_mutex_lock has a __must_check int return type, whereas ww_mutex_lock_slow
-    has a void return type. Note that since ww mutex code needs loops/retries
-    anyway the __must_check doesn't result in spurious warnings, even though the
-    very first lock operation can never fail.
-  - When full debugging is enabled ww_mutex_lock_slow checks that all acquired
-    ww mutex have been released (preventing deadlocks) and makes sure that we
-    block on the contending lock (preventing spinning through the -EDEADLK
-    slowpath until the contended lock can be acquired).
-
-* Functions to only acquire a single w/w mutex, which results in the exact same
-  semantics as a normal mutex. This is done by calling ww_mutex_lock with a NULL
-  context.
-
-  Again this is not strictly required. But often you only want to acquire a
-  single lock in which case it's pointless to set up an acquire context (and so
-  better to avoid grabbing a deadlock avoidance ticket).
-
-Of course, all the usual variants for handling wake-ups due to signals are also
-provided.
-
-Usage
------
-
-The algorithm (Wait-Die vs Wound-Wait) is chosen by using either
-DEFINE_WW_CLASS() (Wound-Wait) or DEFINE_WD_CLASS() (Wait-Die)
-As a rough rule of thumb, use Wound-Wait iff you
-expect the number of simultaneous competing transactions to be typically small,
-and you want to reduce the number of rollbacks.
-
-Three different ways to acquire locks within the same w/w class. Common
-definitions for methods #1 and #2:
-
-static DEFINE_WW_CLASS(ww_class);
-
-struct obj {
-	struct ww_mutex lock;
-	/* obj data */
-};
-
-struct obj_entry {
-	struct list_head head;
-	struct obj *obj;
-};
-
-Method 1, using a list in execbuf->buffers that's not allowed to be reordered.
-This is useful if a list of required objects is already tracked somewhere.
-Furthermore the lock helper can use propagate the -EALREADY return code back to
-the caller as a signal that an object is twice on the list. This is useful if
-the list is constructed from userspace input and the ABI requires userspace to
-not have duplicate entries (e.g. for a gpu commandbuffer submission ioctl).
-
-int lock_objs(struct list_head *list, struct ww_acquire_ctx *ctx)
-{
-	struct obj *res_obj = NULL;
-	struct obj_entry *contended_entry = NULL;
-	struct obj_entry *entry;
-
-	ww_acquire_init(ctx, &ww_class);
-
-retry:
-	list_for_each_entry (entry, list, head) {
-		if (entry->obj == res_obj) {
-			res_obj = NULL;
-			continue;
-		}
-		ret = ww_mutex_lock(&entry->obj->lock, ctx);
-		if (ret < 0) {
-			contended_entry = entry;
-			goto err;
-		}
-	}
-
-	ww_acquire_done(ctx);
-	return 0;
-
-err:
-	list_for_each_entry_continue_reverse (entry, list, head)
-		ww_mutex_unlock(&entry->obj->lock);
-
-	if (res_obj)
-		ww_mutex_unlock(&res_obj->lock);
-
-	if (ret == -EDEADLK) {
-		/* we lost out in a seqno race, lock and retry.. */
-		ww_mutex_lock_slow(&contended_entry->obj->lock, ctx);
-		res_obj = contended_entry->obj;
-		goto retry;
-	}
-	ww_acquire_fini(ctx);
-
-	return ret;
-}
-
-Method 2, using a list in execbuf->buffers that can be reordered. Same semantics
-of duplicate entry detection using -EALREADY as method 1 above. But the
-list-reordering allows for a bit more idiomatic code.
-
-int lock_objs(struct list_head *list, struct ww_acquire_ctx *ctx)
-{
-	struct obj_entry *entry, *entry2;
-
-	ww_acquire_init(ctx, &ww_class);
-
-	list_for_each_entry (entry, list, head) {
-		ret = ww_mutex_lock(&entry->obj->lock, ctx);
-		if (ret < 0) {
-			entry2 = entry;
-
-			list_for_each_entry_continue_reverse (entry2, list, head)
-				ww_mutex_unlock(&entry2->obj->lock);
-
-			if (ret != -EDEADLK) {
-				ww_acquire_fini(ctx);
-				return ret;
-			}
-
-			/* we lost out in a seqno race, lock and retry.. */
-			ww_mutex_lock_slow(&entry->obj->lock, ctx);
-
-			/*
-			 * Move buf to head of the list, this will point
-			 * buf->next to the first unlocked entry,
-			 * restarting the for loop.
-			 */
-			list_del(&entry->head);
-			list_add(&entry->head, list);
-		}
-	}
-
-	ww_acquire_done(ctx);
-	return 0;
-}
-
-Unlocking works the same way for both methods #1 and #2:
-
-void unlock_objs(struct list_head *list, struct ww_acquire_ctx *ctx)
-{
-	struct obj_entry *entry;
-
-	list_for_each_entry (entry, list, head)
-		ww_mutex_unlock(&entry->obj->lock);
-
-	ww_acquire_fini(ctx);
-}
-
-Method 3 is useful if the list of objects is constructed ad-hoc and not upfront,
-e.g. when adjusting edges in a graph where each node has its own ww_mutex lock,
-and edges can only be changed when holding the locks of all involved nodes. w/w
-mutexes are a natural fit for such a case for two reasons:
-- They can handle lock-acquisition in any order which allows us to start walking
-  a graph from a starting point and then iteratively discovering new edges and
-  locking down the nodes those edges connect to.
-- Due to the -EALREADY return code signalling that a given objects is already
-  held there's no need for additional book-keeping to break cycles in the graph
-  or keep track off which looks are already held (when using more than one node
-  as a starting point).
-
-Note that this approach differs in two important ways from the above methods:
-- Since the list of objects is dynamically constructed (and might very well be
-  different when retrying due to hitting the -EDEADLK die condition) there's
-  no need to keep any object on a persistent list when it's not locked. We can
-  therefore move the list_head into the object itself.
-- On the other hand the dynamic object list construction also means that the -EALREADY return
-  code can't be propagated.
-
-Note also that methods #1 and #2 and method #3 can be combined, e.g. to first lock a
-list of starting nodes (passed in from userspace) using one of the above
-methods. And then lock any additional objects affected by the operations using
-method #3 below. The backoff/retry procedure will be a bit more involved, since
-when the dynamic locking step hits -EDEADLK we also need to unlock all the
-objects acquired with the fixed list. But the w/w mutex debug checks will catch
-any interface misuse for these cases.
-
-Also, method 3 can't fail the lock acquisition step since it doesn't return
--EALREADY. Of course this would be different when using the _interruptible
-variants, but that's outside of the scope of these examples here.
-
-struct obj {
-	struct ww_mutex ww_mutex;
-	struct list_head locked_list;
-};
-
-static DEFINE_WW_CLASS(ww_class);
-
-void __unlock_objs(struct list_head *list)
-{
-	struct obj *entry, *temp;
-
-	list_for_each_entry_safe (entry, temp, list, locked_list) {
-		/* need to do that before unlocking, since only the current lock holder is
-		allowed to use object */
-		list_del(&entry->locked_list);
-		ww_mutex_unlock(entry->ww_mutex)
-	}
-}
-
-void lock_objs(struct list_head *list, struct ww_acquire_ctx *ctx)
-{
-	struct obj *obj;
-
-	ww_acquire_init(ctx, &ww_class);
-
-retry:
-	/* re-init loop start state */
-	loop {
-		/* magic code which walks over a graph and decides which objects
-		 * to lock */
-
-		ret = ww_mutex_lock(obj->ww_mutex, ctx);
-		if (ret == -EALREADY) {
-			/* we have that one already, get to the next object */
-			continue;
-		}
-		if (ret == -EDEADLK) {
-			__unlock_objs(list);
-
-			ww_mutex_lock_slow(obj, ctx);
-			list_add(&entry->locked_list, list);
-			goto retry;
-		}
-
-		/* locked a new object, add it to the list */
-		list_add_tail(&entry->locked_list, list);
-	}
-
-	ww_acquire_done(ctx);
-	return 0;
-}
-
-void unlock_objs(struct list_head *list, struct ww_acquire_ctx *ctx)
-{
-	__unlock_objs(list);
-	ww_acquire_fini(ctx);
-}
-
-Method 4: Only lock one single objects. In that case deadlock detection and
-prevention is obviously overkill, since with grabbing just one lock you can't
-produce a deadlock within just one class. To simplify this case the w/w mutex
-api can be used with a NULL context.
-
-Implementation Details
-----------------------
-
-Design:
-  ww_mutex currently encapsulates a struct mutex, this means no extra overhead for
-  normal mutex locks, which are far more common. As such there is only a small
-  increase in code size if wait/wound mutexes are not used.
-
-  We maintain the following invariants for the wait list:
-  (1) Waiters with an acquire context are sorted by stamp order; waiters
-      without an acquire context are interspersed in FIFO order.
-  (2) For Wait-Die, among waiters with contexts, only the first one can have
-      other locks acquired already (ctx->acquired > 0). Note that this waiter
-      may come after other waiters without contexts in the list.
-
-  The Wound-Wait preemption is implemented with a lazy-preemption scheme:
-  The wounded status of the transaction is checked only when there is
-  contention for a new lock and hence a true chance of deadlock. In that
-  situation, if the transaction is wounded, it backs off, clears the
-  wounded status and retries. A great benefit of implementing preemption in
-  this way is that the wounded transaction can identify a contending lock to
-  wait for before restarting the transaction. Just blindly restarting the
-  transaction would likely make the transaction end up in a situation where
-  it would have to back off again.
-
-  In general, not much contention is expected. The locks are typically used to
-  serialize access to resources for devices, and optimization focus should
-  therefore be directed towards the uncontended cases.
-
-Lockdep:
-  Special care has been taken to warn for as many cases of api abuse
-  as possible. Some common api abuses will be caught with
-  CONFIG_DEBUG_MUTEXES, but CONFIG_PROVE_LOCKING is recommended.
-
-  Some of the errors which will be warned about:
-   - Forgetting to call ww_acquire_fini or ww_acquire_init.
-   - Attempting to lock more mutexes after ww_acquire_done.
-   - Attempting to lock the wrong mutex after -EDEADLK and
-     unlocking all mutexes.
-   - Attempting to lock the right mutex after -EDEADLK,
-     before unlocking all mutexes.
-
-   - Calling ww_mutex_lock_slow before -EDEADLK was returned.
-
-   - Unlocking mutexes with the wrong unlock function.
-   - Calling one of the ww_acquire_* twice on the same context.
-   - Using a different ww_class for the mutex than for the ww_acquire_ctx.
-   - Normal lockdep errors that can result in deadlocks.
-
-  Some of the lockdep errors that can result in deadlocks:
-   - Calling ww_acquire_init to initialize a second ww_acquire_ctx before
-     having called ww_acquire_fini on the first.
-   - 'normal' deadlocks that can occur.
-
-FIXME: Update this section once we have the TASK_DEADLOCK task state flag magic
-implemented.
diff --git a/Documentation/pi-futex.txt b/Documentation/pi-futex.txt
index b154f6c0c36e..c33ba2befbf8 100644
--- a/Documentation/pi-futex.txt
+++ b/Documentation/pi-futex.txt
@@ -119,4 +119,4 @@ properties of futexes, and all four combinations are possible: futex,
 robust-futex, PI-futex, robust+PI-futex.
 
 More details about priority inheritance can be found in
-Documentation/locking/rt-mutex.txt.
+Documentation/locking/rt-mutex.rst.
diff --git a/Documentation/translations/it_IT/kernel-hacking/locking.rst b/Documentation/translations/it_IT/kernel-hacking/locking.rst
index 5fd8a1abd2be..b9a6be4b8499 100644
--- a/Documentation/translations/it_IT/kernel-hacking/locking.rst
+++ b/Documentation/translations/it_IT/kernel-hacking/locking.rst
@@ -1404,7 +1404,7 @@ Riferimento per l'API dei Futex
 Approfondimenti
 ===============
 
--  ``Documentation/locking/spinlocks.txt``: la guida di Linus Torvalds agli
+-  ``Documentation/locking/spinlocks.rst``: la guida di Linus Torvalds agli
    spinlock del kernel.
 
 -  Unix Systems for Modern Architectures: Symmetric Multiprocessing and
diff --git a/drivers/gpu/drm/drm_modeset_lock.c b/drivers/gpu/drm/drm_modeset_lock.c
index 81dd11901ffd..cb5671d32ada 100644
--- a/drivers/gpu/drm/drm_modeset_lock.c
+++ b/drivers/gpu/drm/drm_modeset_lock.c
@@ -36,7 +36,7 @@
  * of extra utility/tracking out of our acquire-ctx.  This is provided
  * by &struct drm_modeset_lock and &struct drm_modeset_acquire_ctx.
  *
- * For basic principles of &ww_mutex, see: Documentation/locking/ww-mutex-design.txt
+ * For basic principles of &ww_mutex, see: Documentation/locking/ww-mutex-design.rst
  *
  * The basic usage pattern is to::
  *
diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h
index 57baa27f238c..0b0d7259276d 100644
--- a/include/linux/lockdep.h
+++ b/include/linux/lockdep.h
@@ -5,7 +5,7 @@
  *  Copyright (C) 2006,2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
  *  Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra
  *
- * see Documentation/locking/lockdep-design.txt for more details.
+ * see Documentation/locking/lockdep-design.rst for more details.
  */
 #ifndef __LINUX_LOCKDEP_H
 #define __LINUX_LOCKDEP_H
diff --git a/include/linux/mutex.h b/include/linux/mutex.h
index 3093dd162424..dcd03fee6e01 100644
--- a/include/linux/mutex.h
+++ b/include/linux/mutex.h
@@ -151,7 +151,7 @@ static inline bool mutex_is_locked(struct mutex *lock)
 
 /*
  * See kernel/locking/mutex.c for detailed documentation of these APIs.
- * Also see Documentation/locking/mutex-design.txt.
+ * Also see Documentation/locking/mutex-design.rst.
  */
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 extern void mutex_lock_nested(struct mutex *lock, unsigned int subclass);
diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h
index e401358c4e7e..9d9c663987d8 100644
--- a/include/linux/rwsem.h
+++ b/include/linux/rwsem.h
@@ -160,7 +160,7 @@ extern void downgrade_write(struct rw_semaphore *sem);
  * static then another method for expressing nested locking is
  * the explicit definition of lock class keys and the use of
  * lockdep_set_class() at lock initialization time.
- * See Documentation/locking/lockdep-design.txt for more details.)
+ * See Documentation/locking/lockdep-design.rst for more details.)
  */
 extern void down_read_nested(struct rw_semaphore *sem, int subclass);
 extern void down_write_nested(struct rw_semaphore *sem, int subclass);
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index 0c601ae072b3..edd1c082dbf5 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -16,7 +16,7 @@
  *    by Steven Rostedt, based on work by Gregory Haskins, Peter Morreale
  *    and Sven Dietrich.
  *
- * Also see Documentation/locking/mutex-design.txt.
+ * Also see Documentation/locking/mutex-design.rst.
  */
 #include <linux/mutex.h>
 #include <linux/ww_mutex.h>
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index 38fbf9fa7f1b..fa83d36e30c6 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -9,7 +9,7 @@
  *  Copyright (C) 2005 Kihon Technologies Inc., Steven Rostedt
  *  Copyright (C) 2006 Esben Nielsen
  *
- *  See Documentation/locking/rt-mutex-design.txt for details.
+ *  See Documentation/locking/rt-mutex-design.rst for details.
  */
 #include <linux/spinlock.h>
 #include <linux/export.h>
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 4ac4ca21a30a..a858b55e8ac7 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -1139,7 +1139,7 @@ config PROVE_LOCKING
 	 the proof of observed correctness is also maintained for an
 	 arbitrary combination of these separate locking variants.
 
-	 For more details, see Documentation/locking/lockdep-design.txt.
+	 For more details, see Documentation/locking/lockdep-design.rst.
 
 config LOCK_STAT
 	bool "Lock usage statistics"
@@ -1153,7 +1153,7 @@ config LOCK_STAT
 	help
 	 This feature enables tracking lock contention points
 
-	 For more details, see Documentation/locking/lockstat.txt
+	 For more details, see Documentation/locking/lockstat.rst
 
 	 This also enables lock events required by "perf lock",
 	 subcommand of perf.
-- 
cgit v1.2.3


From 720594f691e5c8fb0624f3653b20b24ba8e57742 Mon Sep 17 00:00:00 2001
From: Mauro Carvalho Chehab <mchehab+samsung@kernel.org>
Date: Sat, 13 Apr 2019 22:54:53 -0300
Subject: docs: connector: convert to ReST and rename to connector.rst

As it has some function definitions, move them to connector.h.

The remaining conversion is actually:
  - add blank lines and indentation in order to identify paragraphs;
  - fix tables markups;
  - add some lists markups;
  - mark literal blocks;
  - adjust title markups.

At its new index.rst, let's add a :orphan: while this is not linked to
the main index.rst file, in order to avoid build warnings.

Signed-off-by: Mauro Carvalho Chehab <mchehab+samsung@kernel.org>
---
 Documentation/connector/connector.rst | 156 +++++++++++++++++++++++++++
 Documentation/connector/connector.txt | 196 ----------------------------------
 drivers/w1/Kconfig                    |   2 +-
 include/linux/connector.h             |  63 ++++++++++-
 samples/Kconfig                       |   2 +-
 5 files changed, 220 insertions(+), 199 deletions(-)
 create mode 100644 Documentation/connector/connector.rst
 delete mode 100644 Documentation/connector/connector.txt

(limited to 'include/linux')

diff --git a/Documentation/connector/connector.rst b/Documentation/connector/connector.rst
new file mode 100644
index 000000000000..24e26dc22dbf
--- /dev/null
+++ b/Documentation/connector/connector.rst
@@ -0,0 +1,156 @@
+:orphan:
+
+================
+Kernel Connector
+================
+
+Kernel connector - new netlink based userspace <-> kernel space easy
+to use communication module.
+
+The Connector driver makes it easy to connect various agents using a
+netlink based network.  One must register a callback and an identifier.
+When the driver receives a special netlink message with the appropriate
+identifier, the appropriate callback will be called.
+
+From the userspace point of view it's quite straightforward:
+
+	- socket();
+	- bind();
+	- send();
+	- recv();
+
+But if kernelspace wants to use the full power of such connections, the
+driver writer must create special sockets, must know about struct sk_buff
+handling, etc...  The Connector driver allows any kernelspace agents to use
+netlink based networking for inter-process communication in a significantly
+easier way::
+
+  int cn_add_callback(struct cb_id *id, char *name, void (*callback) (struct cn_msg *, struct netlink_skb_parms *));
+  void cn_netlink_send_multi(struct cn_msg *msg, u16 len, u32 portid, u32 __group, int gfp_mask);
+  void cn_netlink_send(struct cn_msg *msg, u32 portid, u32 __group, int gfp_mask);
+
+  struct cb_id
+  {
+	__u32			idx;
+	__u32			val;
+  };
+
+idx and val are unique identifiers which must be registered in the
+connector.h header for in-kernel usage.  `void (*callback) (void *)` is a
+callback function which will be called when a message with above idx.val
+is received by the connector core.  The argument for that function must
+be dereferenced to `struct cn_msg *`::
+
+  struct cn_msg
+  {
+	struct cb_id		id;
+
+	__u32			seq;
+	__u32			ack;
+
+	__u32			len;	/* Length of the following data */
+	__u8			data[0];
+  };
+
+Connector interfaces
+====================
+
+ .. kernel-doc:: include/linux/connector.h
+
+ Note:
+   When registering new callback user, connector core assigns
+   netlink group to the user which is equal to its id.idx.
+
+Protocol description
+====================
+
+The current framework offers a transport layer with fixed headers.  The
+recommended protocol which uses such a header is as follows:
+
+msg->seq and msg->ack are used to determine message genealogy.  When
+someone sends a message, they use a locally unique sequence and random
+acknowledge number.  The sequence number may be copied into
+nlmsghdr->nlmsg_seq too.
+
+The sequence number is incremented with each message sent.
+
+If you expect a reply to the message, then the sequence number in the
+received message MUST be the same as in the original message, and the
+acknowledge number MUST be the same + 1.
+
+If we receive a message and its sequence number is not equal to one we
+are expecting, then it is a new message.  If we receive a message and
+its sequence number is the same as one we are expecting, but its
+acknowledge is not equal to the sequence number in the original
+message + 1, then it is a new message.
+
+Obviously, the protocol header contains the above id.
+
+The connector allows event notification in the following form: kernel
+driver or userspace process can ask connector to notify it when
+selected ids will be turned on or off (registered or unregistered its
+callback).  It is done by sending a special command to the connector
+driver (it also registers itself with id={-1, -1}).
+
+An example of this usage can be found in the cn_test.c module which
+uses the connector to request notification and to send messages.
+
+Reliability
+===========
+
+Netlink itself is not a reliable protocol.  That means that messages can
+be lost due to memory pressure or overflow of the process' receive queue,
+so the caller is warned that it must be prepared.  That is why the struct
+cn_msg [main connector's message header] contains u32 seq and u32 ack
+fields.
+
+Userspace usage
+===============
+
+2.6.14 has a new netlink socket implementation, which by default does not
+allow people to send data to netlink groups other than 1.
+So, if you wish to use a netlink socket (for example using connector)
+with a different group number, the userspace application must subscribe to
+that group first.  It can be achieved by the following pseudocode::
+
+  s = socket(PF_NETLINK, SOCK_DGRAM, NETLINK_CONNECTOR);
+
+  l_local.nl_family = AF_NETLINK;
+  l_local.nl_groups = 12345;
+  l_local.nl_pid = 0;
+
+  if (bind(s, (struct sockaddr *)&l_local, sizeof(struct sockaddr_nl)) == -1) {
+	perror("bind");
+	close(s);
+	return -1;
+  }
+
+  {
+	int on = l_local.nl_groups;
+	setsockopt(s, 270, 1, &on, sizeof(on));
+  }
+
+Where 270 above is SOL_NETLINK, and 1 is a NETLINK_ADD_MEMBERSHIP socket
+option.  To drop a multicast subscription, one should call the above socket
+option with the NETLINK_DROP_MEMBERSHIP parameter which is defined as 0.
+
+2.6.14 netlink code only allows selecting a group which is less than or equal
+to the maximum group number, which is used at netlink_kernel_create() time.
+In case of connector it is CN_NETLINK_USERS + 0xf, so if you want to use
+group number 12345, you must increment CN_NETLINK_USERS to that number.
+Additional 0xf numbers are allocated to be used by non-in-kernel users.
+
+Due to this limitation, group 0xffffffff does not work now, so one can
+not use add/remove connector's group notifications, but as far as I know,
+only cn_test.c test module used it.
+
+Some work in the netlink area is still being done, so things may change in the
+2.6.15 timeframe; if that happens, the documentation will be updated for that
+kernel.
+
+Code samples
+============
+
+Sample code for a connector test module and user space can be found
+in samples/connector/. To build this code, enable CONFIG_CONNECTOR
+and CONFIG_SAMPLES.
diff --git a/Documentation/connector/connector.txt b/Documentation/connector/connector.txt
deleted file mode 100644
index ab7ca897fab7..000000000000
--- a/Documentation/connector/connector.txt
+++ /dev/null
@@ -1,196 +0,0 @@
-/*****************************************/
-Kernel Connector.
-/*****************************************/
-
-Kernel connector - new netlink based userspace <-> kernel space easy
-to use communication module.
-
-The Connector driver makes it easy to connect various agents using a
-netlink based network.  One must register a callback and an identifier.
-When the driver receives a special netlink message with the appropriate
-identifier, the appropriate callback will be called.
-
-From the userspace point of view it's quite straightforward:
-
-	socket();
-	bind();
-	send();
-	recv();
-
-But if kernelspace wants to use the full power of such connections, the
-driver writer must create special sockets, must know about struct sk_buff
-handling, etc...  The Connector driver allows any kernelspace agents to use
-netlink based networking for inter-process communication in a significantly
-easier way:
-
-int cn_add_callback(struct cb_id *id, char *name, void (*callback) (struct cn_msg *, struct netlink_skb_parms *));
-void cn_netlink_send_multi(struct cn_msg *msg, u16 len, u32 portid, u32 __group, int gfp_mask);
-void cn_netlink_send(struct cn_msg *msg, u32 portid, u32 __group, int gfp_mask);
-
-struct cb_id
-{
-	__u32			idx;
-	__u32			val;
-};
-
-idx and val are unique identifiers which must be registered in the
-connector.h header for in-kernel usage.  void (*callback) (void *) is a
-callback function which will be called when a message with above idx.val
-is received by the connector core.  The argument for that function must
-be dereferenced to struct cn_msg *.
-
-struct cn_msg
-{
-	struct cb_id		id;
-
-	__u32			seq;
-	__u32			ack;
-
-	__u32			len;		/* Length of the following data */
-	__u8			data[0];
-};
-
-/*****************************************/
-Connector interfaces.
-/*****************************************/
-
-int cn_add_callback(struct cb_id *id, char *name, void (*callback) (struct cn_msg *, struct netlink_skb_parms *));
-
- Registers new callback with connector core.
-
- struct cb_id *id		- unique connector's user identifier.
-				  It must be registered in connector.h for legal in-kernel users.
- char *name			- connector's callback symbolic name.
- void (*callback) (struct cn..)	- connector's callback.
-				  cn_msg and the sender's credentials
-
-
-void cn_del_callback(struct cb_id *id);
-
- Unregisters new callback with connector core.
-
- struct cb_id *id		- unique connector's user identifier.
-
-
-int cn_netlink_send_multi(struct cn_msg *msg, u16 len, u32 portid, u32 __groups, int gfp_mask);
-int cn_netlink_send(struct cn_msg *msg, u32 portid, u32 __groups, int gfp_mask);
-
- Sends message to the specified groups.  It can be safely called from
- softirq context, but may silently fail under strong memory pressure.
- If there are no listeners for given group -ESRCH can be returned.
-
- struct cn_msg *		- message header(with attached data).
- u16 len			- for *_multi multiple cn_msg messages can be sent
- u32 port			- destination port.
- 				  If non-zero the message will be sent to the
-				  given port, which should be set to the
-				  original sender.
- u32 __group			- destination group.
-				  If port and __group is zero, then appropriate group will
-				  be searched through all registered connector users,
-				  and message will be delivered to the group which was
-				  created for user with the same ID as in msg.
-				  If __group is not zero, then message will be delivered
-				  to the specified group.
- int gfp_mask			- GFP mask.
-
- Note: When registering new callback user, connector core assigns
- netlink group to the user which is equal to its id.idx.
-
-/*****************************************/
-Protocol description.
-/*****************************************/
-
-The current framework offers a transport layer with fixed headers.  The
-recommended protocol which uses such a header is as following:
-
-msg->seq and msg->ack are used to determine message genealogy.  When
-someone sends a message, they use a locally unique sequence and random
-acknowledge number.  The sequence number may be copied into
-nlmsghdr->nlmsg_seq too.
-
-The sequence number is incremented with each message sent.
-
-If you expect a reply to the message, then the sequence number in the
-received message MUST be the same as in the original message, and the
-acknowledge number MUST be the same + 1.
-
-If we receive a message and its sequence number is not equal to one we
-are expecting, then it is a new message.  If we receive a message and
-its sequence number is the same as one we are expecting, but its
-acknowledge is not equal to the sequence number in the original
-message + 1, then it is a new message.
-
-Obviously, the protocol header contains the above id.
-
-The connector allows event notification in the following form: kernel
-driver or userspace process can ask connector to notify it when
-selected ids will be turned on or off (registered or unregistered its
-callback).  It is done by sending a special command to the connector
-driver (it also registers itself with id={-1, -1}).
-
-As example of this usage can be found in the cn_test.c module which
-uses the connector to request notification and to send messages.
-
-/*****************************************/
-Reliability.
-/*****************************************/
-
-Netlink itself is not a reliable protocol.  That means that messages can
-be lost due to memory pressure or process' receiving queue overflowed,
-so caller is warned that it must be prepared.  That is why the struct
-cn_msg [main connector's message header] contains u32 seq and u32 ack
-fields.
-
-/*****************************************/
-Userspace usage.
-/*****************************************/
-
-2.6.14 has a new netlink socket implementation, which by default does not
-allow people to send data to netlink groups other than 1.
-So, if you wish to use a netlink socket (for example using connector)
-with a different group number, the userspace application must subscribe to
-that group first.  It can be achieved by the following pseudocode:
-
-s = socket(PF_NETLINK, SOCK_DGRAM, NETLINK_CONNECTOR);
-
-l_local.nl_family = AF_NETLINK;
-l_local.nl_groups = 12345;
-l_local.nl_pid = 0;
-
-if (bind(s, (struct sockaddr *)&l_local, sizeof(struct sockaddr_nl)) == -1) {
-	perror("bind");
-	close(s);
-	return -1;
-}
-
-{
-	int on = l_local.nl_groups;
-	setsockopt(s, 270, 1, &on, sizeof(on));
-}
-
-Where 270 above is SOL_NETLINK, and 1 is a NETLINK_ADD_MEMBERSHIP socket
-option.  To drop a multicast subscription, one should call the above socket
-option with the NETLINK_DROP_MEMBERSHIP parameter which is defined as 0.
-
-2.6.14 netlink code only allows to select a group which is less or equal to
-the maximum group number, which is used at netlink_kernel_create() time.
-In case of connector it is CN_NETLINK_USERS + 0xf, so if you want to use
-group number 12345, you must increment CN_NETLINK_USERS to that number.
-Additional 0xf numbers are allocated to be used by non-in-kernel users.
-
-Due to this limitation, group 0xffffffff does not work now, so one can
-not use add/remove connector's group notifications, but as far as I know, 
-only cn_test.c test module used it.
-
-Some work in netlink area is still being done, so things can be changed in
-2.6.15 timeframe, if it will happen, documentation will be updated for that
-kernel.
-
-/*****************************************/
-Code samples
-/*****************************************/
-
-Sample code for a connector test module and user space can be found
-in samples/connector/. To build this code, enable CONFIG_CONNECTOR
-and CONFIG_SAMPLES.
diff --git a/drivers/w1/Kconfig b/drivers/w1/Kconfig
index 03dd57581df7..160053c0baea 100644
--- a/drivers/w1/Kconfig
+++ b/drivers/w1/Kconfig
@@ -19,7 +19,7 @@ config W1_CON
 	default y
 	---help---
 	  This allows to communicate with userspace using connector. For more
-	  information see <file:Documentation/connector/connector.txt>.
+	  information see <file:Documentation/connector/connector.rst>.
 	  There are three types of messages between w1 core and userspace:
 	  1. Events. They are generated each time new master or slave device found
 		either due to automatic or requested search.
diff --git a/include/linux/connector.h b/include/linux/connector.h
index 1d72ef76f24f..6b6c7396a584 100644
--- a/include/linux/connector.h
+++ b/include/linux/connector.h
@@ -55,10 +55,71 @@ struct cn_dev {
 	struct cn_queue_dev *cbdev;
 };
 
+/**
+ * cn_add_callback() - Registers new callback with connector core.
+ *
+ * @id:		unique connector's user identifier.
+ *		It must be registered in connector.h for legal
+ *		in-kernel users.
+ * @name:	connector's callback symbolic name.
+ * @callback:	connector's callback.
+ * 		parameters are %cn_msg and the sender's credentials
+ */
 int cn_add_callback(struct cb_id *id, const char *name,
 		    void (*callback)(struct cn_msg *, struct netlink_skb_parms *));
-void cn_del_callback(struct cb_id *);
+/**
+ * cn_del_callback() - Unregisters a callback from the connector core.
+ *
+ * @id:		unique connector's user identifier.
+ */
+void cn_del_callback(struct cb_id *id);
+
+
+/**
+ * cn_netlink_send_mult - Sends message to the specified groups.
+ *
+ * @msg: 	message header(with attached data).
+ * @len:	Number of @msg to be sent.
+ * @portid:	destination port.
+ *		If non-zero the message will be sent to the given port,
+ *		which should be set to the original sender.
+ * @group:	destination group.
+ *		If @portid and @group are zero, then appropriate group will
+ *		be searched through all registered connector users, and
+ *		message will be delivered to the group which was created
+ *		for user with the same ID as in @msg.
+ *		If @group is not zero, then message will be delivered
+ *		to the specified group.
+ * @gfp_mask:	GFP mask.
+ *
+ * It can be safely called from softirq context, but may silently
+ * fail under strong memory pressure.
+ *
+ * If there are no listeners for given group %-ESRCH can be returned.
+ */
 int cn_netlink_send_mult(struct cn_msg *msg, u16 len, u32 portid, u32 group, gfp_t gfp_mask);
+
+/**
+ * cn_netlink_send - Sends message to the specified groups.
+ *
+ * @msg:	message header(with attached data).
+ * @portid:	destination port.
+ *		If non-zero the message will be sent to the given port,
+ *		which should be set to the original sender.
+ * @group:	destination group.
+ *		If @portid and @group are zero, then appropriate group will
+ *		be searched through all registered connector users, and
+ *		message will be delivered to the group which was created
+ *		for user with the same ID as in @msg.
+ *		If @group is not zero, then message will be delivered
+ *		to the specified group.
+ * @gfp_mask:	GFP mask.
+ *
+ * It can be safely called from softirq context, but may silently
+ * fail under strong memory pressure.
+ *
+ * If there are no listeners for given group %-ESRCH can be returned.
+ */
 int cn_netlink_send(struct cn_msg *msg, u32 portid, u32 group, gfp_t gfp_mask);
 
 int cn_queue_add_callback(struct cn_queue_dev *dev, const char *name,
diff --git a/samples/Kconfig b/samples/Kconfig
index 71b5e833dd9e..155da47dc6a4 100644
--- a/samples/Kconfig
+++ b/samples/Kconfig
@@ -99,7 +99,7 @@ config SAMPLE_CONNECTOR
 	  When enabled, this builds both a sample kernel module for
 	  the connector interface and a user space tool to communicate
 	  with it.
-	  See also Documentation/connector/connector.txt
+	  See also Documentation/connector/connector.rst
 
 config SAMPLE_HIDRAW
 	bool "hidraw sample"
-- 
cgit v1.2.3


From fe34c89d25429e079ba67416529514120dd715f8 Mon Sep 17 00:00:00 2001
From: Mauro Carvalho Chehab <mchehab+samsung@kernel.org>
Date: Tue, 18 Jun 2019 12:34:59 -0300
Subject: docs: driver-model: move it to the driver-api book

The audience for the Kernel driver-model is clearly Kernel hackers.

Signed-off-by: Mauro Carvalho Chehab <mchehab+samsung@kernel.org>
Acked-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com> # ice driver changes
---
 Documentation/driver-api/driver-model/binding.rst  |  98 +++++
 Documentation/driver-api/driver-model/bus.rst      | 146 +++++++
 Documentation/driver-api/driver-model/class.rst    | 149 +++++++
 .../driver-api/driver-model/design-patterns.rst    | 116 ++++++
 Documentation/driver-api/driver-model/device.rst   | 109 +++++
 Documentation/driver-api/driver-model/devres.rst   | 414 +++++++++++++++++++
 Documentation/driver-api/driver-model/driver.rst   | 223 ++++++++++
 Documentation/driver-api/driver-model/index.rst    |  24 ++
 Documentation/driver-api/driver-model/overview.rst | 124 ++++++
 Documentation/driver-api/driver-model/platform.rst | 246 +++++++++++
 Documentation/driver-api/driver-model/porting.rst  | 448 +++++++++++++++++++++
 Documentation/driver-api/gpio/driver.rst           |   2 +-
 Documentation/driver-api/index.rst                 |   1 +
 Documentation/driver-model/binding.rst             |  98 -----
 Documentation/driver-model/bus.rst                 | 146 -------
 Documentation/driver-model/class.rst               | 149 -------
 Documentation/driver-model/design-patterns.rst     | 116 ------
 Documentation/driver-model/device.rst              | 109 -----
 Documentation/driver-model/devres.rst              | 414 -------------------
 Documentation/driver-model/driver.rst              | 223 ----------
 Documentation/driver-model/index.rst               |  26 --
 Documentation/driver-model/overview.rst            | 124 ------
 Documentation/driver-model/platform.rst            | 246 -----------
 Documentation/driver-model/porting.rst             | 448 ---------------------
 Documentation/eisa.txt                             |   4 +-
 Documentation/filesystems/sysfs.txt                |   2 +-
 Documentation/hwmon/submitting-patches.rst         |   2 +-
 .../translations/zh_CN/filesystems/sysfs.txt       |   2 +-
 drivers/base/platform.c                            |   2 +-
 drivers/gpio/gpio-cs5535.c                         |   2 +-
 drivers/net/ethernet/intel/ice/ice_main.c          |   2 +-
 drivers/staging/unisys/Documentation/overview.txt  |   4 +-
 include/linux/device.h                             |   2 +-
 include/linux/platform_device.h                    |   2 +-
 scripts/coccinelle/free/devm_free.cocci            |   2 +-
 35 files changed, 2112 insertions(+), 2113 deletions(-)
 create mode 100644 Documentation/driver-api/driver-model/binding.rst
 create mode 100644 Documentation/driver-api/driver-model/bus.rst
 create mode 100644 Documentation/driver-api/driver-model/class.rst
 create mode 100644 Documentation/driver-api/driver-model/design-patterns.rst
 create mode 100644 Documentation/driver-api/driver-model/device.rst
 create mode 100644 Documentation/driver-api/driver-model/devres.rst
 create mode 100644 Documentation/driver-api/driver-model/driver.rst
 create mode 100644 Documentation/driver-api/driver-model/index.rst
 create mode 100644 Documentation/driver-api/driver-model/overview.rst
 create mode 100644 Documentation/driver-api/driver-model/platform.rst
 create mode 100644 Documentation/driver-api/driver-model/porting.rst
 delete mode 100644 Documentation/driver-model/binding.rst
 delete mode 100644 Documentation/driver-model/bus.rst
 delete mode 100644 Documentation/driver-model/class.rst
 delete mode 100644 Documentation/driver-model/design-patterns.rst
 delete mode 100644 Documentation/driver-model/device.rst
 delete mode 100644 Documentation/driver-model/devres.rst
 delete mode 100644 Documentation/driver-model/driver.rst
 delete mode 100644 Documentation/driver-model/index.rst
 delete mode 100644 Documentation/driver-model/overview.rst
 delete mode 100644 Documentation/driver-model/platform.rst
 delete mode 100644 Documentation/driver-model/porting.rst

(limited to 'include/linux')

diff --git a/Documentation/driver-api/driver-model/binding.rst b/Documentation/driver-api/driver-model/binding.rst
new file mode 100644
index 000000000000..7ea1d7a41e1d
--- /dev/null
+++ b/Documentation/driver-api/driver-model/binding.rst
@@ -0,0 +1,98 @@
+==============
+Driver Binding
+==============
+
+Driver binding is the process of associating a device with a device
+driver that can control it. Bus drivers have typically handled this
+because there have been bus-specific structures to represent the
+devices and the drivers. With generic device and device driver
+structures, most of the binding can take place using common code.
+
+
+Bus
+~~~
+
+The bus type structure contains a list of all devices that are on that bus
+type in the system. When device_register is called for a device, it is
+inserted into the end of this list. The bus object also contains a
+list of all drivers of that bus type. When driver_register is called
+for a driver, it is inserted at the end of this list. These are the
+two events which trigger driver binding.
+
+
+device_register
+~~~~~~~~~~~~~~~
+
+When a new device is added, the bus's list of drivers is iterated over
+to find one that supports it. In order to determine that, the device
+ID of the device must match one of the device IDs that the driver
+supports. The format and semantics for comparing IDs are bus-specific.
+Instead of trying to derive a complex state machine and matching
+algorithm, it is up to the bus driver to provide a callback to compare
+a device against the IDs of a driver. The bus returns 1 if a match was
+found; 0 otherwise.
+
+int match(struct device * dev, struct device_driver * drv);
+
+If a match is found, the device's driver field is set to the driver
+and the driver's probe callback is called. This gives the driver a
+chance to verify that it really does support the hardware, and that
+it's in a working state.
+
+Device Class
+~~~~~~~~~~~~
+
+Upon the successful completion of probe, the device is registered with
+the class to which it belongs. Device drivers belong to one and only one
+class, and that is set in the driver's devclass field.
+devclass_add_device is called to enumerate the device within the class
+and actually register it with the class, which happens with the
+class's register_dev callback.
+
+
+Driver
+~~~~~~
+
+When a driver is attached to a device, the device is inserted into the
+driver's list of devices.
+
+
+sysfs
+~~~~~
+
+A symlink is created in the bus's 'devices' directory that points to
+the device's directory in the physical hierarchy.
+
+A symlink is created in the driver's 'devices' directory that points
+to the device's directory in the physical hierarchy.
+
+A directory for the device is created in the class's directory. A
+symlink is created in that directory that points to the device's
+physical location in the sysfs tree.
+
+A symlink can be created (though this isn't done yet) in the device's
+physical directory to either its class directory, or the class's
+top-level directory. One can also be created to point to its driver's
+directory.
+
+
+driver_register
+~~~~~~~~~~~~~~~
+
+The process is almost identical for when a new driver is added.
+The bus's list of devices is iterated over to find a match. Devices
+that already have a driver are skipped. All the devices are iterated
+over, to bind as many devices as possible to the driver.
+
+
+Removal
+~~~~~~~
+
+When a device is removed, the reference count for it will eventually
+go to 0. When it does, the remove callback of the driver is called. It
+is removed from the driver's list of devices and the reference count
+of the driver is decremented. All symlinks between the two are removed.
+
+When a driver is removed, the list of devices that it supports is
+iterated over, and the driver's remove callback is called for each
+one. The device is removed from that list and the symlinks removed.
diff --git a/Documentation/driver-api/driver-model/bus.rst b/Documentation/driver-api/driver-model/bus.rst
new file mode 100644
index 000000000000..016b15a6e8ea
--- /dev/null
+++ b/Documentation/driver-api/driver-model/bus.rst
@@ -0,0 +1,146 @@
+=========
+Bus Types
+=========
+
+Definition
+~~~~~~~~~~
+See the kerneldoc for the struct bus_type.
+
+int bus_register(struct bus_type * bus);
+
+
+Declaration
+~~~~~~~~~~~
+
+Each bus type in the kernel (PCI, USB, etc) should declare one static
+object of this type. They must initialize the name field, and may
+optionally initialize the match callback::
+
+   struct bus_type pci_bus_type = {
+          .name	= "pci",
+          .match	= pci_bus_match,
+   };
+
+The structure should be exported to drivers in a header file::
+
+   extern struct bus_type pci_bus_type;
+
+
+Registration
+~~~~~~~~~~~~
+
+When a bus driver is initialized, it calls bus_register. This
+initializes the rest of the fields in the bus object and inserts it
+into a global list of bus types. Once the bus object is registered,
+the fields in it are usable by the bus driver.
+
+
+Callbacks
+~~~~~~~~~
+
+match(): Attaching Drivers to Devices
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The format of device ID structures and the semantics for comparing
+them are inherently bus-specific. Drivers typically declare an array
+of device IDs of devices they support that reside in a bus-specific
+driver structure.
+
+The purpose of the match callback is to give the bus an opportunity to
+determine if a particular driver supports a particular device by
+comparing the device IDs the driver supports with the device ID of a
+particular device, without sacrificing bus-specific functionality or
+type-safety.
+
+When a driver is registered with the bus, the bus's list of devices is
+iterated over, and the match callback is called for each device that
+does not have a driver associated with it.
+
+
+
+Device and Driver Lists
+~~~~~~~~~~~~~~~~~~~~~~~
+
+The lists of devices and drivers are intended to replace the local
+lists that many buses keep. They are lists of struct devices and
+struct device_drivers, respectively. Bus drivers are free to use the
+lists as they please, but conversion to the bus-specific type may be
+necessary.
+
+The LDM core provides helper functions for iterating over each list::
+
+  int bus_for_each_dev(struct bus_type * bus, struct device * start,
+		       void * data,
+		       int (*fn)(struct device *, void *));
+
+  int bus_for_each_drv(struct bus_type * bus, struct device_driver * start,
+		       void * data, int (*fn)(struct device_driver *, void *));
+
+These helpers iterate over the respective list, and call the callback
+for each device or driver in the list. All list accesses are
+synchronized by taking the bus's lock (read currently). The reference
+count on each object in the list is incremented before the callback is
+called; it is decremented after the next object has been obtained. The
+lock is not held when calling the callback.
+
+
+sysfs
+~~~~~~~~
+There is a top-level directory named 'bus'.
+
+Each bus gets a directory in the bus directory, along with two default
+directories::
+
+	/sys/bus/pci/
+	|-- devices
+	`-- drivers
+
+Drivers registered with the bus get a directory in the bus's drivers
+directory::
+
+	/sys/bus/pci/
+	|-- devices
+	`-- drivers
+	    |-- Intel ICH
+	    |-- Intel ICH Joystick
+	    |-- agpgart
+	    `-- e100
+
+Each device that is discovered on a bus of that type gets a symlink in
+the bus's devices directory to the device's directory in the physical
+hierarchy::
+
+	/sys/bus/pci/
+	|-- devices
+	|   |-- 00:00.0 -> ../../../root/pci0/00:00.0
+	|   |-- 00:01.0 -> ../../../root/pci0/00:01.0
+	|   `-- 00:02.0 -> ../../../root/pci0/00:02.0
+	`-- drivers
+
+
+Exporting Attributes
+~~~~~~~~~~~~~~~~~~~~
+
+::
+
+  struct bus_attribute {
+	struct attribute	attr;
+	ssize_t (*show)(struct bus_type *, char * buf);
+	ssize_t (*store)(struct bus_type *, const char * buf, size_t count);
+  };
+
+Bus drivers can export attributes using the BUS_ATTR_RW macro that works
+similarly to the DEVICE_ATTR_RW macro for devices. For example, a
+definition like this::
+
+	static BUS_ATTR_RW(debug);
+
+is equivalent to declaring::
+
+	static struct bus_attribute bus_attr_debug;
+
+This can then be used to add and remove the attribute from the bus's
+sysfs directory using::
+
+	int bus_create_file(struct bus_type *, struct bus_attribute *);
+	void bus_remove_file(struct bus_type *, struct bus_attribute *);
diff --git a/Documentation/driver-api/driver-model/class.rst b/Documentation/driver-api/driver-model/class.rst
new file mode 100644
index 000000000000..fff55b80e86a
--- /dev/null
+++ b/Documentation/driver-api/driver-model/class.rst
@@ -0,0 +1,149 @@
+==============
+Device Classes
+==============
+
+Introduction
+~~~~~~~~~~~~
+A device class describes a type of device, like an audio or network
+device. The following device classes have been identified:
+
+<Insert List of Device Classes Here>
+
+
+Each device class defines a set of semantics and a programming interface
+that devices of that class adhere to. Device drivers are the
+implementation of that programming interface for a particular device on
+a particular bus.
+
+Device classes are agnostic with respect to what bus a device resides
+on.
+
+
+Programming Interface
+~~~~~~~~~~~~~~~~~~~~~
+The device class structure looks like::
+
+
+  typedef int (*devclass_add)(struct device *);
+  typedef void (*devclass_remove)(struct device *);
+
+See the kerneldoc for the struct class.
+
+A typical device class definition would look like::
+
+  struct device_class input_devclass = {
+        .name		= "input",
+        .add_device	= input_add_device,
+	.remove_device	= input_remove_device,
+  };
+
+Each device class structure should be exported in a header file so it
+can be used by drivers, extensions and interfaces.
+
+Device classes are registered and unregistered with the core using::
+
+  int devclass_register(struct device_class * cls);
+  void devclass_unregister(struct device_class * cls);
+
+
+Devices
+~~~~~~~
+As devices are bound to drivers, they are added to the device class
+that the driver belongs to. Before the driver model core, this would
+typically happen during the driver's probe() callback, once the device
+has been initialized. It now happens after the probe() callback
+finishes from the core.
+
+The device is enumerated in the class. Each time a device is added to
+the class, the class's devnum field is incremented and assigned to the
+device. The field is never decremented, so if the device is removed
+from the class and re-added, it will receive a different enumerated
+value.
+
+The class is allowed to create a class-specific structure for the
+device and store it in the device's class_data pointer.
+
+There is no list of devices in the device class. Each driver has a
+list of devices that it supports. The device class has a list of
+drivers of that particular class. To access all of the devices in the
+class, iterate over the device lists of each driver in the class.
+
+
+Device Drivers
+~~~~~~~~~~~~~~
+Device drivers are added to device classes when they are registered
+with the core. A driver specifies the class it belongs to by setting
+the struct device_driver::devclass field.
+
+
+sysfs directory structure
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+There is a top-level sysfs directory named 'class'.
+
+Each class gets a directory in the class directory, along with two
+default subdirectories::
+
+        class/
+        `-- input
+            |-- devices
+            `-- drivers
+
+
+Drivers registered with the class get a symlink in the drivers/ directory
+that points to the driver's directory (under its bus directory)::
+
+   class/
+   `-- input
+       |-- devices
+       `-- drivers
+           `-- usb:usb_mouse -> ../../../bus/drivers/usb_mouse/
+
+
+Each device gets a symlink in the devices/ directory that points to the
+device's directory in the physical hierarchy::
+
+   class/
+   `-- input
+       |-- devices
+       |   `-- 1 -> ../../../root/pci0/00:1f.0/usb_bus/00:1f.2-1:0/
+       `-- drivers
+
+
+Exporting Attributes
+~~~~~~~~~~~~~~~~~~~~
+
+::
+
+  struct devclass_attribute {
+        struct attribute        attr;
+        ssize_t (*show)(struct device_class *, char * buf, size_t count, loff_t off);
+        ssize_t (*store)(struct device_class *, const char * buf, size_t count, loff_t off);
+  };
+
+Class drivers can export attributes using the DEVCLASS_ATTR macro that works
+similarly to the DEVICE_ATTR macro for devices. For example, a definition
+like this::
+
+  static DEVCLASS_ATTR(debug,0644,show_debug,store_debug);
+
+is equivalent to declaring::
+
+  static struct devclass_attribute devclass_attr_debug;
+
+The bus driver can add and remove the attribute from the class's
+sysfs directory using::
+
+  int devclass_create_file(struct device_class *, struct devclass_attribute *);
+  void devclass_remove_file(struct device_class *, struct devclass_attribute *);
+
+In the example above, the file will be named 'debug' and placed in the
+class's directory in sysfs.
+
+
+Interfaces
+~~~~~~~~~~
+There may exist multiple mechanisms for accessing the same device of a
+particular class type. Device interfaces describe these mechanisms.
+
+When a device is added to a device class, the core attempts to add it
+to every interface that is registered with the device class.
diff --git a/Documentation/driver-api/driver-model/design-patterns.rst b/Documentation/driver-api/driver-model/design-patterns.rst
new file mode 100644
index 000000000000..41eb8f41f7dd
--- /dev/null
+++ b/Documentation/driver-api/driver-model/design-patterns.rst
@@ -0,0 +1,116 @@
+=============================
+Device Driver Design Patterns
+=============================
+
+This document describes a few common design patterns found in device drivers.
+It is likely that subsystem maintainers will ask driver developers to
+conform to these design patterns.
+
+1. State Container
+2. container_of()
+
+
+1. State Container
+~~~~~~~~~~~~~~~~~~
+
+While the kernel contains a few device drivers that assume that they will
+only be probed() once on a certain system (singletons), it is customary to assume
+that the device the driver binds to will appear in several instances. This
+means that the probe() function and all callbacks need to be reentrant.
+
+The most common way to achieve this is to use the state container design
+pattern. It usually has this form::
+
+  struct foo {
+      spinlock_t lock; /* Example member */
+      (...)
+  };
+
+  static int foo_probe(...)
+  {
+      struct foo *foo;
+
+      foo = devm_kzalloc(dev, sizeof(*foo), GFP_KERNEL);
+      if (!foo)
+          return -ENOMEM;
+      spin_lock_init(&foo->lock);
+      (...)
+  }
+
+This will create an instance of struct foo in memory every time probe() is
+called. This is our state container for this instance of the device driver.
+Of course it is then necessary to always pass this instance of the
+state around to all functions that need access to the state and its members.
+
+For example, if the driver is registering an interrupt handler, you would
+pass around a pointer to struct foo like this::
+
+  static irqreturn_t foo_handler(int irq, void *arg)
+  {
+      struct foo *foo = arg;
+      (...)
+  }
+
+  static int foo_probe(...)
+  {
+      struct foo *foo;
+
+      (...)
+      ret = request_irq(irq, foo_handler, 0, "foo", foo);
+  }
+
+This way you always get a pointer back to the correct instance of foo in
+your interrupt handler.
+
+
+2. container_of()
+~~~~~~~~~~~~~~~~~
+
+Continuing on the above example we add an offloaded work::
+
+  struct foo {
+      spinlock_t lock;
+      struct workqueue_struct *wq;
+      struct work_struct offload;
+      (...)
+  };
+
+  static void foo_work(struct work_struct *work)
+  {
+      struct foo *foo = container_of(work, struct foo, offload);
+
+      (...)
+  }
+
+  static irqreturn_t foo_handler(int irq, void *arg)
+  {
+      struct foo *foo = arg;
+
+      queue_work(foo->wq, &foo->offload);
+      (...)
+  }
+
+  static int foo_probe(...)
+  {
+      struct foo *foo;
+
+      foo->wq = create_singlethread_workqueue("foo-wq");
+      INIT_WORK(&foo->offload, foo_work);
+      (...)
+  }
+
+The design pattern is the same for an hrtimer or something similar that will
+return a single argument which is a pointer to a struct member in the
+callback.
+
+container_of() is a macro defined in <linux/kernel.h>.
+
+What container_of() does is to obtain a pointer to the containing struct from
+a pointer to a member by a simple subtraction using the offsetof() macro from
+standard C, which allows something similar to object oriented behaviours.
+Notice that the contained member must not be a pointer, but an actual member
+for this to work.
+
+We can see here that we avoid having global pointers to our struct foo *
+instance this way, while still keeping the number of parameters passed to the
+work function to a single pointer.
diff --git a/Documentation/driver-api/driver-model/device.rst b/Documentation/driver-api/driver-model/device.rst
new file mode 100644
index 000000000000..2b868d49d349
--- /dev/null
+++ b/Documentation/driver-api/driver-model/device.rst
@@ -0,0 +1,109 @@
+==========================
+The Basic Device Structure
+==========================
+
+See the kerneldoc for the struct device.
+
+
+Programming Interface
+~~~~~~~~~~~~~~~~~~~~~
+The bus driver that discovers the device uses this to register the
+device with the core::
+
+  int device_register(struct device * dev);
+
+The bus should initialize the following fields:
+
+    - parent
+    - name
+    - bus_id
+    - bus
+
+A device is removed from the core when its reference count goes to
+0. The reference count can be adjusted using::
+
+  struct device * get_device(struct device * dev);
+  void put_device(struct device * dev);
+
+get_device() will return a pointer to the struct device passed to it
+if the reference count is not already 0 (a count of 0 means the device
+is already in the process of being removed).
+
+A driver can access the lock in the device structure using::
+
+  void lock_device(struct device * dev);
+  void unlock_device(struct device * dev);
+
+
+Attributes
+~~~~~~~~~~
+
+::
+
+  struct device_attribute {
+	struct attribute	attr;
+	ssize_t (*show)(struct device *dev, struct device_attribute *attr,
+			char *buf);
+	ssize_t (*store)(struct device *dev, struct device_attribute *attr,
+			 const char *buf, size_t count);
+  };
+
+Attributes of devices can be exported by a device driver through sysfs.
+
+Please see Documentation/filesystems/sysfs.txt for more information
+on how sysfs works.
+
+As explained in Documentation/kobject.txt, device attributes must be
+created before the KOBJ_ADD uevent is generated. The only way to realize
+that is by defining an attribute group.
+
+Attributes are declared using a macro called DEVICE_ATTR::
+
+  #define DEVICE_ATTR(name,mode,show,store)
+
+Example::
+
+  static DEVICE_ATTR(type, 0444, show_type, NULL);
+  static DEVICE_ATTR(power, 0644, show_power, store_power);
+
+This declares two structures of type struct device_attribute with respective
+names 'dev_attr_type' and 'dev_attr_power'. These two attributes can be
+organized as follows into a group::
+
+  static struct attribute *dev_attrs[] = {
+	&dev_attr_type.attr,
+	&dev_attr_power.attr,
+	NULL,
+  };
+
+  static struct attribute_group dev_attr_group = {
+	.attrs = dev_attrs,
+  };
+
+  static const struct attribute_group *dev_attr_groups[] = {
+	&dev_attr_group,
+	NULL,
+  };
+
+This array of groups can then be associated with a device by setting the
+group pointer in struct device before device_register() is invoked::
+
+        dev->groups = dev_attr_groups;
+        device_register(dev);
+
+The device_register() function will use the 'groups' pointer to create the
+device attributes and the device_unregister() function will use this pointer
+to remove the device attributes.
+
+Word of warning:  While the kernel allows device_create_file() and
+device_remove_file() to be called on a device at any time, userspace has
+strict expectations on when attributes get created.  When a new device is
+registered in the kernel, a uevent is generated to notify userspace (like
+udev) that a new device is available.  If attributes are added after the
+device is registered, then userspace won't get notified and userspace will
+not know about the new attributes.
+
+This is important for device drivers that need to publish additional
+attributes for a device at driver probe time.  If the device driver simply
+calls device_create_file() on the device structure passed to it, then
+userspace will never be notified of the new attributes.
diff --git a/Documentation/driver-api/driver-model/devres.rst b/Documentation/driver-api/driver-model/devres.rst
new file mode 100644
index 000000000000..4ac99122b5f1
--- /dev/null
+++ b/Documentation/driver-api/driver-model/devres.rst
@@ -0,0 +1,414 @@
+================================
+Devres - Managed Device Resource
+================================
+
+Tejun Heo	<teheo@suse.de>
+
+First draft	10 January 2007
+
+.. contents
+
+   1. Intro			: Huh? Devres?
+   2. Devres			: Devres in a nutshell
+   3. Devres Group		: Group devres'es and release them together
+   4. Details			: Life time rules, calling context, ...
+   5. Overhead			: How much do we have to pay for this?
+   6. List of managed interfaces: Currently implemented managed interfaces
+
+
+1. Intro
+--------
+
+devres came up while trying to convert libata to use iomap.  Each
+iomapped address should be kept and unmapped on driver detach.  For
+example, a plain SFF ATA controller (that is, good old PCI IDE) in
+native mode makes use of 5 PCI BARs and all of them should be
+maintained.
+
+As with many other device drivers, libata low level drivers have
+sufficient bugs in ->remove and ->probe failure path.  Well, yes,
+that's probably because libata low level driver developers are a lazy
+bunch, but aren't all low level driver developers?  After spending a
+day fiddling with braindamaged hardware with no document or
+braindamaged document, if it's finally working, well, it's working.
+
+For one reason or another, low level drivers don't receive as much
+attention or testing as core code, and bugs on driver detach or
+initialization failure don't happen often enough to be noticeable.
+The init failure path is worse because it's much less travelled while
+needing to handle multiple entry points.
+
+So, many low level drivers end up leaking resources on driver detach
+and having half broken failure path implementation in ->probe() which
+would leak resources or even cause oops when failure occurs.  iomap
+adds more to this mix.  So do msi and msix.
+
+
+2. Devres
+---------
+
+devres is basically a linked list of arbitrarily sized memory areas
+associated with a struct device.  Each devres entry is associated with
+a release function.  A devres can be released in several ways.  No
+matter what, all devres entries are released on driver detach.  On
+release, the associated release function is invoked and then the
+devres entry is freed.
+
+Managed interface is created for resources commonly used by device
+drivers using devres.  For example, coherent DMA memory is acquired
+using dma_alloc_coherent().  The managed version is called
+dmam_alloc_coherent().  It is identical to dma_alloc_coherent() except
+that the DMA memory allocated using it is managed and will be
+automatically released on driver detach.  Implementation looks like
+the following::
+
+  struct dma_devres {
+	size_t		size;
+	void		*vaddr;
+	dma_addr_t	dma_handle;
+  };
+
+  static void dmam_coherent_release(struct device *dev, void *res)
+  {
+	struct dma_devres *this = res;
+
+	dma_free_coherent(dev, this->size, this->vaddr, this->dma_handle);
+  }
+
+  dmam_alloc_coherent(dev, size, dma_handle, gfp)
+  {
+	struct dma_devres *dr;
+	void *vaddr;
+
+	dr = devres_alloc(dmam_coherent_release, sizeof(*dr), gfp);
+	...
+
+	/* alloc DMA memory as usual */
+	vaddr = dma_alloc_coherent(...);
+	...
+
+	/* record size, vaddr, dma_handle in dr */
+	dr->vaddr = vaddr;
+	...
+
+	devres_add(dev, dr);
+
+	return vaddr;
+  }
+
+If a driver uses dmam_alloc_coherent(), the area is guaranteed to be
+freed whether initialization fails half-way or the device gets
+detached.  If most resources are acquired using managed interface, a
+driver can have much simpler init and exit code.  Init path basically
+looks like the following::
+
+  my_init_one()
+  {
+	struct mydev *d;
+
+	d = devm_kzalloc(dev, sizeof(*d), GFP_KERNEL);
+	if (!d)
+		return -ENOMEM;
+
+	d->ring = dmam_alloc_coherent(...);
+	if (!d->ring)
+		return -ENOMEM;
+
+	if (check something)
+		return -EINVAL;
+	...
+
+	return register_to_upper_layer(d);
+  }
+
+And exit path::
+
+  my_remove_one()
+  {
+	unregister_from_upper_layer(d);
+	shutdown_my_hardware();
+  }
+
+As shown above, low level drivers can be simplified a lot by using
+devres.  Complexity is shifted from less maintained low level drivers
+to better maintained higher layer.  Also, as init failure path is
+shared with exit path, both can get more testing.
+
+Note though that when converting current calls or assignments to
+managed devm_* versions it is up to you to check if internal operations
+like allocating memory, have failed. Managed resources pertain to the
+freeing of these resources *only* - all other checks needed are still
+on you. In some cases this may mean introducing checks that were not
+necessary before moving to the managed devm_* calls.
+
+
+3. Devres group
+---------------
+
+Devres entries can be grouped using devres group.  When a group is
+released, all contained normal devres entries and properly nested
+groups are released.  One usage is to rollback series of acquired
+resources on failure.  For example::
+
+  if (!devres_open_group(dev, NULL, GFP_KERNEL))
+	return -ENOMEM;
+
+  acquire A;
+  if (failed)
+	goto err;
+
+  acquire B;
+  if (failed)
+	goto err;
+  ...
+
+  devres_remove_group(dev, NULL);
+  return 0;
+
+ err:
+  devres_release_group(dev, NULL);
+  return err_code;
+
+As resource acquisition failure usually means probe failure, constructs
+like above are usually useful in a midlayer driver (e.g. libata core
+layer) where interface functions shouldn't have side effects on failure.
+For LLDs, just returning error code suffices in most cases.
+
+Each group is identified by `void *id`.  It can either be explicitly
+specified by @id argument to devres_open_group() or automatically
+created by passing NULL as @id as in the above example.  In both
+cases, devres_open_group() returns the group's id.  The returned id
+can be passed to other devres functions to select the target group.
+If NULL is given to those functions, the latest open group is
+selected.
+
+For example, you can do something like the following::
+
+  int my_midlayer_create_something()
+  {
+	if (!devres_open_group(dev, my_midlayer_create_something, GFP_KERNEL))
+		return -ENOMEM;
+
+	...
+
+	devres_close_group(dev, my_midlayer_create_something);
+	return 0;
+  }
+
+  void my_midlayer_destroy_something()
+  {
+	devres_release_group(dev, my_midlayer_create_something);
+  }
+
+
+4. Details
+----------
+
+Lifetime of a devres entry begins on devres allocation and finishes
+when it is released or destroyed (removed and freed) - no reference
+counting.
+
+devres core guarantees atomicity to all basic devres operations and
+has support for single-instance devres types (atomic
+lookup-and-add-if-not-found).  Other than that, synchronizing
+concurrent accesses to allocated devres data is caller's
+responsibility.  This is usually a non-issue because bus ops and
+resource allocations already do the job.
+
+For an example of single-instance devres type, read pcim_iomap_table()
+in lib/devres.c.
+
+All devres interface functions can be called without context if the
+right gfp mask is given.
+
+
+5. Overhead
+-----------
+
+Each devres bookkeeping info is allocated together with requested data
+area.  With debug option turned off, bookkeeping info occupies 16
+bytes on 32bit machines and 24 bytes on 64bit (three pointers rounded
+up to ull alignment).  If singly linked list is used, it can be
+reduced to two pointers (8 bytes on 32bit, 16 bytes on 64bit).
+
+Each devres group occupies 8 pointers.  It can be reduced to 6 if
+singly linked list is used.
+
+Memory space overhead on ahci controller with two ports is between 300
+and 400 bytes on 32bit machine after naive conversion (we can
+certainly invest a bit more effort into libata core layer).
+
+
+6. List of managed interfaces
+-----------------------------
+
+CLOCK
+  devm_clk_get()
+  devm_clk_get_optional()
+  devm_clk_put()
+  devm_clk_hw_register()
+  devm_of_clk_add_hw_provider()
+  devm_clk_hw_register_clkdev()
+
+DMA
+  dmaenginem_async_device_register()
+  dmam_alloc_coherent()
+  dmam_alloc_attrs()
+  dmam_free_coherent()
+  dmam_pool_create()
+  dmam_pool_destroy()
+
+DRM
+  devm_drm_dev_init()
+
+GPIO
+  devm_gpiod_get()
+  devm_gpiod_get_index()
+  devm_gpiod_get_index_optional()
+  devm_gpiod_get_optional()
+  devm_gpiod_put()
+  devm_gpiod_unhinge()
+  devm_gpiochip_add_data()
+  devm_gpio_request()
+  devm_gpio_request_one()
+  devm_gpio_free()
+
+I2C
+  devm_i2c_new_dummy_device()
+
+IIO
+  devm_iio_device_alloc()
+  devm_iio_device_free()
+  devm_iio_device_register()
+  devm_iio_device_unregister()
+  devm_iio_kfifo_allocate()
+  devm_iio_kfifo_free()
+  devm_iio_triggered_buffer_setup()
+  devm_iio_triggered_buffer_cleanup()
+  devm_iio_trigger_alloc()
+  devm_iio_trigger_free()
+  devm_iio_trigger_register()
+  devm_iio_trigger_unregister()
+  devm_iio_channel_get()
+  devm_iio_channel_release()
+  devm_iio_channel_get_all()
+  devm_iio_channel_release_all()
+
+INPUT
+  devm_input_allocate_device()
+
+IO region
+  devm_release_mem_region()
+  devm_release_region()
+  devm_release_resource()
+  devm_request_mem_region()
+  devm_request_region()
+  devm_request_resource()
+
+IOMAP
+  devm_ioport_map()
+  devm_ioport_unmap()
+  devm_ioremap()
+  devm_ioremap_nocache()
+  devm_ioremap_wc()
+  devm_ioremap_resource() : checks resource, requests memory region, ioremaps
+  devm_iounmap()
+  pcim_iomap()
+  pcim_iomap_regions()	: do request_region() and iomap() on multiple BARs
+  pcim_iomap_table()	: array of mapped addresses indexed by BAR
+  pcim_iounmap()
+
+IRQ
+  devm_free_irq()
+  devm_request_any_context_irq()
+  devm_request_irq()
+  devm_request_threaded_irq()
+  devm_irq_alloc_descs()
+  devm_irq_alloc_desc()
+  devm_irq_alloc_desc_at()
+  devm_irq_alloc_desc_from()
+  devm_irq_alloc_descs_from()
+  devm_irq_alloc_generic_chip()
+  devm_irq_setup_generic_chip()
+  devm_irq_sim_init()
+
+LED
+  devm_led_classdev_register()
+  devm_led_classdev_unregister()
+
+MDIO
+  devm_mdiobus_alloc()
+  devm_mdiobus_alloc_size()
+  devm_mdiobus_free()
+
+MEM
+  devm_free_pages()
+  devm_get_free_pages()
+  devm_kasprintf()
+  devm_kcalloc()
+  devm_kfree()
+  devm_kmalloc()
+  devm_kmalloc_array()
+  devm_kmemdup()
+  devm_kstrdup()
+  devm_kvasprintf()
+  devm_kzalloc()
+
+MFD
+  devm_mfd_add_devices()
+
+MUX
+  devm_mux_chip_alloc()
+  devm_mux_chip_register()
+  devm_mux_control_get()
+
+PER-CPU MEM
+  devm_alloc_percpu()
+  devm_free_percpu()
+
+PCI
+  devm_pci_alloc_host_bridge()  : managed PCI host bridge allocation
+  devm_pci_remap_cfgspace()	: ioremap PCI configuration space
+  devm_pci_remap_cfg_resource()	: ioremap PCI configuration space resource
+  pcim_enable_device()		: after success, all PCI ops become managed
+  pcim_pin_device()		: keep PCI device enabled after release
+
+PHY
+  devm_usb_get_phy()
+  devm_usb_put_phy()
+
+PINCTRL
+  devm_pinctrl_get()
+  devm_pinctrl_put()
+  devm_pinctrl_register()
+  devm_pinctrl_unregister()
+
+POWER
+  devm_reboot_mode_register()
+  devm_reboot_mode_unregister()
+
+PWM
+  devm_pwm_get()
+  devm_pwm_put()
+
+REGULATOR
+  devm_regulator_bulk_get()
+  devm_regulator_get()
+  devm_regulator_put()
+  devm_regulator_register()
+
+RESET
+  devm_reset_control_get()
+  devm_reset_controller_register()
+
+SERDEV
+  devm_serdev_device_open()
+
+SLAVE DMA ENGINE
+  devm_acpi_dma_controller_register()
+
+SPI
+  devm_spi_register_master()
+
+WATCHDOG
+  devm_watchdog_register_device()
diff --git a/Documentation/driver-api/driver-model/driver.rst b/Documentation/driver-api/driver-model/driver.rst
new file mode 100644
index 000000000000..11d281506a04
--- /dev/null
+++ b/Documentation/driver-api/driver-model/driver.rst
@@ -0,0 +1,223 @@
+==============
+Device Drivers
+==============
+
+See the kerneldoc for the struct device_driver.
+
+
+Allocation
+~~~~~~~~~~
+
+Device drivers are statically allocated structures. Though there may
+be multiple devices in a system that a driver supports, struct
+device_driver represents the driver as a whole (not a particular
+device instance).
+
+Initialization
+~~~~~~~~~~~~~~
+
+The driver must initialize at least the name and bus fields. It should
+also initialize the devclass field (when it arrives), so it may obtain
+the proper linkage internally. It should also initialize as many of
+the callbacks as possible, though each is optional.
+
+Declaration
+~~~~~~~~~~~
+
+As stated above, struct device_driver objects are statically
+allocated. Below is an example declaration of the eepro100
+driver. This declaration is hypothetical only; it relies on the driver
+being converted completely to the new model::
+
+  static struct device_driver eepro100_driver = {
+         .name		= "eepro100",
+         .bus		= &pci_bus_type,
+
+         .probe		= eepro100_probe,
+         .remove		= eepro100_remove,
+         .suspend		= eepro100_suspend,
+         .resume		= eepro100_resume,
+  };
+
+Most drivers will not be able to be converted completely to the new
+model because the bus they belong to has a bus-specific structure with
+bus-specific fields that cannot be generalized.
+
+The most common example of this is device ID structures. A driver
+typically defines an array of device IDs that it supports. The format
+of these structures and the semantics for comparing device IDs are
+completely bus-specific. Defining them as generic entities would
+sacrifice type-safety, so we keep bus-specific structures around.
+
+Bus-specific drivers should include a generic struct device_driver in
+the definition of the bus-specific driver. Like this::
+
+  struct pci_driver {
+         const struct pci_device_id *id_table;
+         struct device_driver	  driver;
+  };
+
+A definition that included bus-specific fields would look like
+(using the eepro100 driver again)::
+
+  static struct pci_driver eepro100_driver = {
+         .id_table       = eepro100_pci_tbl,
+         .driver	       = {
+		.name		= "eepro100",
+		.bus		= &pci_bus_type,
+		.probe		= eepro100_probe,
+		.remove		= eepro100_remove,
+		.suspend	= eepro100_suspend,
+		.resume		= eepro100_resume,
+         },
+  };
+
+Some may find the syntax of embedded struct initialization awkward or
+even a bit ugly. So far, it's the best way we've found to do what we want...
+
+Registration
+~~~~~~~~~~~~
+
+::
+
+  int driver_register(struct device_driver *drv);
+
+The driver registers the structure on startup. For drivers that have
+no bus-specific fields (i.e. don't have a bus-specific driver
+structure), they would use driver_register and pass a pointer to their
+struct device_driver object.
+
+Most drivers, however, will have a bus-specific structure and will
+need to register with the bus using something like pci_register_driver().
+
+It is important that drivers register their driver structure as early as
+possible. Registration with the core initializes several fields in the
+struct device_driver object, including the reference count and the
+lock. These fields are assumed to be valid at all times and may be
+used by the device model core or the bus driver.
+
+
+Transition Bus Drivers
+~~~~~~~~~~~~~~~~~~~~~~
+
+By defining wrapper functions, the transition to the new model can be
+made easier. Drivers can ignore the generic structure altogether and
+let the bus wrapper fill in the fields. For the callbacks, the bus can
+define generic callbacks that forward the call to the bus-specific
+callbacks of the drivers.
+
+This solution is intended to be only temporary. In order to get class
+information in the driver, the drivers must be modified anyway. Since
+converting drivers to the new model should reduce some infrastructural
+complexity and code size, it is recommended that they are converted as
+class information is added.
+
+Access
+~~~~~~
+
+Once the object has been registered, it may access the common fields of
+the object, like the lock and the list of devices::
+
+  int driver_for_each_dev(struct device_driver *drv, void *data,
+			  int (*callback)(struct device *dev, void *data));
+
+The devices field is a list of all the devices that have been bound to
+the driver. The LDM core provides a helper function to operate on all
+the devices a driver controls. This helper locks the driver on each
+node access, and does proper reference counting on each device as it
+accesses it.
+
+
+sysfs
+~~~~~
+
+When a driver is registered, a sysfs directory is created in its
+bus's directory. In this directory, the driver can export an interface
+to userspace to control operation of the driver on a global basis;
+e.g. toggling debugging output in the driver.
+
+A future feature of this directory will be a 'devices' directory. This
+directory will contain symlinks to the directories of devices it
+supports.
+
+
+
+Callbacks
+~~~~~~~~~
+
+::
+
+	int	(*probe)	(struct device *dev);
+
+The probe() entry is called in task context, with the bus's rwsem locked
+and the driver partially bound to the device.  Drivers commonly use
+container_of() to convert "dev" to a bus-specific type, both in probe()
+and other routines.  That type often provides device resource data, such
+as pci_dev.resource[] or platform_device.resources, which is used in
+addition to dev->platform_data to initialize the driver.
+
+This callback holds the driver-specific logic to bind the driver to a
+given device.  That includes verifying that the device is present, that
+it's a version the driver can handle, that driver data structures can
+be allocated and initialized, and that any hardware can be initialized.
+Drivers often store a pointer to their state with dev_set_drvdata().
+When the driver has successfully bound itself to that device, then probe()
+returns zero and the driver model code will finish its part of binding
+the driver to that device.
+
+A driver's probe() may return a negative errno value to indicate that
+the driver did not bind to this device, in which case it should have
+released all resources it allocated::
+
+	int 	(*remove)	(struct device *dev);
+
+remove is called to unbind a driver from a device. This may be
+called if a device is physically removed from the system, if the
+driver module is being unloaded, during a reboot sequence, or
+in other cases.
+
+It is up to the driver to determine if the device is present or
+not. It should free any resources allocated specifically for the
+device; i.e. anything in the device's driver_data field.
+
+If the device is still present, it should quiesce the device and place
+it into a supported low-power state::
+
+	int	(*suspend)	(struct device *dev, pm_message_t state);
+
+suspend is called to put the device in a low power state::
+
+	int	(*resume)	(struct device *dev);
+
+Resume is used to bring a device back from a low power state.
+
+
+Attributes
+~~~~~~~~~~
+
+::
+
+  struct driver_attribute {
+          struct attribute        attr;
+          ssize_t (*show)(struct device_driver *driver, char *buf);
+          ssize_t (*store)(struct device_driver *, const char *buf, size_t count);
+  };
+
+Device drivers can export attributes via their sysfs directories.
+Drivers can declare attributes using the DRIVER_ATTR_RW and DRIVER_ATTR_RO
+macros, which work identically to the DEVICE_ATTR_RW and DEVICE_ATTR_RO
+macros.
+
+Example::
+
+	DRIVER_ATTR_RW(debug);
+
+This is equivalent to declaring::
+
+	struct driver_attribute driver_attr_debug;
+
+This can then be used to add and remove the attribute from the
+driver's directory using::
+
+  int driver_create_file(struct device_driver *, const struct driver_attribute *);
+  void driver_remove_file(struct device_driver *, const struct driver_attribute *);
diff --git a/Documentation/driver-api/driver-model/index.rst b/Documentation/driver-api/driver-model/index.rst
new file mode 100644
index 000000000000..755016422269
--- /dev/null
+++ b/Documentation/driver-api/driver-model/index.rst
@@ -0,0 +1,24 @@
+============
+Driver Model
+============
+
+.. toctree::
+   :maxdepth: 1
+
+   binding
+   bus
+   class
+   design-patterns
+   device
+   devres
+   driver
+   overview
+   platform
+   porting
+
+.. only::  subproject and html
+
+   Indices
+   =======
+
+   * :ref:`genindex`
diff --git a/Documentation/driver-api/driver-model/overview.rst b/Documentation/driver-api/driver-model/overview.rst
new file mode 100644
index 000000000000..d4d1e9b40e0c
--- /dev/null
+++ b/Documentation/driver-api/driver-model/overview.rst
@@ -0,0 +1,124 @@
+=============================
+The Linux Kernel Device Model
+=============================
+
+Patrick Mochel	<mochel@digitalimplant.org>
+
+Drafted 26 August 2002
+Updated 31 January 2006
+
+
+Overview
+~~~~~~~~
+
+The Linux Kernel Driver Model is a unification of all the disparate driver
+models that were previously used in the kernel. It is intended to augment the
+bus-specific drivers for bridges and devices by consolidating a set of data
+and operations into globally accessible data structures.
+
+Traditional driver models implemented some sort of tree-like structure
+(sometimes just a list) for the devices they control. There wasn't any
+uniformity across the different bus types.
+
+The current driver model provides a common, uniform data model for describing
+a bus and the devices that can appear under the bus. The unified bus
+model includes a set of common attributes which all busses carry, and a set
+of common callbacks, such as device discovery during bus probing, bus
+shutdown, bus power management, etc.
+
+The common device and bridge interface reflects the goals of the modern
+computer: namely the ability to do seamless device "plug and play", power
+management, and hot plug. In particular, the model dictated by Intel and
+Microsoft (namely ACPI) ensures that almost every device on almost any bus
+on an x86-compatible system can work within this paradigm.  Of course,
+not every bus is able to support all such operations, although most
+buses support most of those operations.
+
+
+Downstream Access
+~~~~~~~~~~~~~~~~~
+
+Common data fields have been moved out of individual bus layers into a common
+data structure. These fields must still be accessed by the bus layers,
+and sometimes by the device-specific drivers.
+
+Other bus layers are encouraged to do what has been done for the PCI layer.
+struct pci_dev now looks like this::
+
+  struct pci_dev {
+	...
+
+	struct device dev;     /* Generic device interface */
+	...
+  };
+
+Note first that the struct device dev within the struct pci_dev is
+statically allocated. This means only one allocation on device discovery.
+
+Note also that the struct device dev is not necessarily defined at the
+front of the pci_dev structure.  This is to make people think about what
+they're doing when switching between the bus driver and the global driver,
+and to discourage meaningless and incorrect casts between the two.
+
+The PCI bus layer freely accesses the fields of struct device. It knows about
+the structure of struct pci_dev, and it should know the structure of struct
+device. Individual PCI device drivers that have been converted to the current
+driver model generally do not and should not touch the fields of struct device,
+unless there is a compelling reason to do so.
+
+The above abstraction prevents unnecessary pain during transitional phases.
+If it were not done this way, then when a field was renamed or removed, every
+downstream driver would break.  On the other hand, if only the bus layer
+(and not the device layer) accesses the struct device, it is only the bus
+layer that needs to change.
+
+
+User Interface
+~~~~~~~~~~~~~~
+
+By virtue of having a complete hierarchical view of all the devices in the
+system, exporting a complete hierarchical view to userspace becomes relatively
+easy. This has been accomplished by implementing a special purpose virtual
+file system named sysfs.
+
+Almost all mainstream Linux distros mount this filesystem automatically; you
+can see some variation of the following in the output of the "mount" command::
+
+  $ mount
+  ...
+  none on /sys type sysfs (rw,noexec,nosuid,nodev)
+  ...
+  $
+
+The auto-mounting of sysfs is typically accomplished by an entry similar to
+the following in the /etc/fstab file::
+
+  none     	/sys	sysfs    defaults	  	0 0
+
+or something similar in the /lib/init/fstab file on Debian-based systems::
+
+  none            /sys    sysfs    nodev,noexec,nosuid    0 0
+
+If sysfs is not automatically mounted, you can always do it manually with::
+
+	# mount -t sysfs sysfs /sys
+
+Whenever a device is inserted into the tree, a directory is created for it.
+This directory may be populated at each layer of discovery - the global layer,
+the bus layer, or the device layer.
+
+The global layer currently creates two files - 'name' and 'power'. The
+former only reports the name of the device. The latter reports the
+current power state of the device. It will also be used to set the current
+power state.
+
+The bus layer may also create files for the devices it finds while probing the
+bus. For example, the PCI layer currently creates 'irq' and 'resource' files
+for each PCI device.
+
+A device-specific driver may also export files in its directory to expose
+device-specific data or tunable interfaces.
+
+More information about the sysfs directory layout can be found in
+the other documents in this directory and in the file
+Documentation/filesystems/sysfs.txt.
diff --git a/Documentation/driver-api/driver-model/platform.rst b/Documentation/driver-api/driver-model/platform.rst
new file mode 100644
index 000000000000..334dd4071ae4
--- /dev/null
+++ b/Documentation/driver-api/driver-model/platform.rst
@@ -0,0 +1,246 @@
+============================
+Platform Devices and Drivers
+============================
+
+See <linux/platform_device.h> for the driver model interface to the
+platform bus:  platform_device, and platform_driver.  This pseudo-bus
+is used to connect devices on busses with minimal infrastructure,
+like those used to integrate peripherals on many system-on-chip
+processors, or some "legacy" PC interconnects; as opposed to large
+formally specified ones like PCI or USB.
+
+
+Platform devices
+~~~~~~~~~~~~~~~~
+Platform devices are devices that typically appear as autonomous
+entities in the system. This includes legacy port-based devices and
+host bridges to peripheral buses, and most controllers integrated
+into system-on-chip platforms.  What they usually have in common
+is direct addressing from a CPU bus.  Rarely, a platform_device will
+be connected through a segment of some other kind of bus; but its
+registers will still be directly addressable.
+
+Platform devices are given a name, used in driver binding, and a
+list of resources such as addresses and IRQs::
+
+  struct platform_device {
+	const char	*name;
+	u32		id;
+	struct device	dev;
+	u32		num_resources;
+	struct resource	*resource;
+  };
+
+
+Platform drivers
+~~~~~~~~~~~~~~~~
+Platform drivers follow the standard driver model convention, where
+discovery/enumeration is handled outside the drivers, and drivers
+provide probe() and remove() methods.  They support power management
+and shutdown notifications using the standard conventions::
+
+  struct platform_driver {
+	int (*probe)(struct platform_device *);
+	int (*remove)(struct platform_device *);
+	void (*shutdown)(struct platform_device *);
+	int (*suspend)(struct platform_device *, pm_message_t state);
+	int (*suspend_late)(struct platform_device *, pm_message_t state);
+	int (*resume_early)(struct platform_device *);
+	int (*resume)(struct platform_device *);
+	struct device_driver driver;
+  };
+
+Note that probe() should in general verify that the specified device hardware
+actually exists; sometimes platform setup code can't be sure.  The probing
+can use device resources, including clocks, and device platform_data.
+
+Platform drivers register themselves the normal way::
+
+	int platform_driver_register(struct platform_driver *drv);
+
+Or, in common situations where the device is known not to be hot-pluggable,
+the probe() routine can live in an init section to reduce the driver's
+runtime memory footprint::
+
+	int platform_driver_probe(struct platform_driver *drv,
+			  int (*probe)(struct platform_device *))
+
+Kernel modules can be composed of several platform drivers. The platform core
+provides helpers to register and unregister an array of drivers::
+
+	int __platform_register_drivers(struct platform_driver * const *drivers,
+				      unsigned int count, struct module *owner);
+	void platform_unregister_drivers(struct platform_driver * const *drivers,
+					 unsigned int count);
+
+If one of the drivers fails to register, all drivers registered up to that
+point will be unregistered in reverse order. Note that there is a convenience
+macro that passes THIS_MODULE as owner parameter::
+
+	#define platform_register_drivers(drivers, count)
+
+
+Device Enumeration
+~~~~~~~~~~~~~~~~~~
+As a rule, platform specific (and often board-specific) setup code will
+register platform devices::
+
+	int platform_device_register(struct platform_device *pdev);
+
+	int platform_add_devices(struct platform_device **pdevs, int ndev);
+
+The general rule is to register only those devices that actually exist,
+but in some cases extra devices might be registered.  For example, a kernel
+might be configured to work with an external network adapter that might not
+be populated on all boards, or likewise to work with an integrated controller
+that some boards might not hook up to any peripherals.
+
+In some cases, boot firmware will export tables describing the devices
+that are populated on a given board.   Without such tables, often the
+only way for system setup code to set up the correct devices is to build
+a kernel for a specific target board.  Such board-specific kernels are
+common with embedded and custom systems development.
+
+In many cases, the memory and IRQ resources associated with the platform
+device are not enough to let the device's driver work.  Board setup code
+will often provide additional information using the device's platform_data
+field.
+
+Embedded systems frequently need one or more clocks for platform devices,
+which are normally kept off until they're actively needed (to save power).
+System setup also associates those clocks with the device, so that
+calls to clk_get(&pdev->dev, clock_name) return them as needed.
+
+
+Legacy Drivers:  Device Probing
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Some drivers are not fully converted to the driver model, because they take
+on a non-driver role:  the driver registers its platform device, rather than
+leaving that for system infrastructure.  Such drivers can't be hotplugged
+or coldplugged, since those mechanisms require device creation to be in a
+different system component than the driver.
+
+The only "good" reason for this is to handle older system designs which, like
+original IBM PCs, rely on error-prone "probe-the-hardware" models for hardware
+configuration.  Newer systems have largely abandoned that model, in favor of
+bus-level support for dynamic configuration (PCI, USB), or device tables
+provided by the boot firmware (e.g. PNPACPI on x86).  There are too many
+conflicting options about what might be where, and even educated guesses by
+an operating system will be wrong often enough to make trouble.
+
+This style of driver is discouraged.  If you're updating such a driver,
+please try to move the device enumeration to a more appropriate location,
+outside the driver.  This will usually be cleanup, since such drivers
+tend to already have "normal" modes, such as ones using device nodes that
+were created by PNP or by platform device setup.
+
+None the less, there are some APIs to support such legacy drivers.  Avoid
+using these calls except with such hotplug-deficient drivers::
+
+	struct platform_device *platform_device_alloc(
+			const char *name, int id);
+
+You can use platform_device_alloc() to dynamically allocate a device, which
+you will then initialize with resources and platform_device_register().
+A better solution is usually::
+
+	struct platform_device *platform_device_register_simple(
+			const char *name, int id,
+			struct resource *res, unsigned int nres);
+
+You can use platform_device_register_simple() as a one-step call to allocate
+and register a device.
+
+
+Device Naming and Driver Binding
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The platform_device.dev.bus_id is the canonical name for the devices.
+It's built from two components:
+
+    * platform_device.name ... which is also used for driver matching.
+
+    * platform_device.id ... the device instance number, or else "-1"
+      to indicate there's only one.
+
+These are concatenated, so name/id "serial"/0 indicates bus_id "serial.0", and
+"serial"/3 indicates bus_id "serial.3"; both would use the platform_driver
+named "serial".  While "my_rtc"/-1 would be bus_id "my_rtc" (no instance id)
+and use the platform_driver called "my_rtc".
+
+Driver binding is performed automatically by the driver core, invoking
+driver probe() after finding a match between device and driver.  If the
+probe() succeeds, the driver and device are bound as usual.  There are
+three different ways to find such a match:
+
+    - Whenever a device is registered, the drivers for that bus are
+      checked for matches.  Platform devices should be registered very
+      early during system boot.
+
+    - When a driver is registered using platform_driver_register(), all
+      unbound devices on that bus are checked for matches.  Drivers
+      usually register later during booting, or by module loading.
+
+    - Registering a driver using platform_driver_probe() works just like
+      using platform_driver_register(), except that the driver won't
+      be probed later if another device registers.  (Which is OK, since
+      this interface is only for use with non-hotpluggable devices.)
+
+
+Early Platform Devices and Drivers
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The early platform interfaces provide platform data to platform device
+drivers early on during the system boot. The code is built on top of the
+early_param() command line parsing and can be executed very early on.
+
+Example: "earlyprintk" class early serial console in 6 steps
+
+1. Registering early platform device data
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The architecture code registers platform device data using the function
+early_platform_add_devices(). In the case of early serial console this
+should be hardware configuration for the serial port. Devices registered
+at this point will later on be matched against early platform drivers.
+
+2. Parsing kernel command line
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The architecture code calls parse_early_param() to parse the kernel
+command line. This will execute all matching early_param() callbacks.
+User specified early platform devices will be registered at this point.
+For the early serial console case the user can specify the port on the
+kernel command line as "earlyprintk=serial.0" where "earlyprintk" is
+the class string, "serial" is the name of the platform driver and
+0 is the platform device id. If the id is -1 then the dot and the
+id can be omitted.
+
+3. Installing early platform drivers belonging to a certain class
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The architecture code may optionally force registration of all early
+platform drivers belonging to a certain class using the function
+early_platform_driver_register_all(). User specified devices from
+step 2 have priority over these. This step is omitted by the serial
+driver example since the early serial driver code should be disabled
+unless the user has specified the port on the kernel command line.
+
+4. Early platform driver registration
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Compiled-in platform drivers making use of early_platform_init() are
+automatically registered during step 2 or 3. The serial driver example
+should use early_platform_init("earlyprintk", &platform_driver).
+
+5. Probing of early platform drivers belonging to a certain class
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The architecture code calls early_platform_driver_probe() to match
+registered early platform devices associated with a certain class with
+registered early platform drivers. Matched devices will get probed().
+This step can be executed at any point during the early boot. As soon
+as possible may be good for the serial port case.
+
+6. Inside the early platform driver probe()
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The driver code needs to take special care during early boot, especially
+when it comes to memory allocation and interrupt registration. The code
+in the probe() function can use is_early_platform_device() to check if
+it is called at early platform device or at the regular platform device
+time. The early serial driver performs register_console() at this point.
+
+For further information, see <linux/platform_device.h>.
diff --git a/Documentation/driver-api/driver-model/porting.rst b/Documentation/driver-api/driver-model/porting.rst
new file mode 100644
index 000000000000..931ea879af3f
--- /dev/null
+++ b/Documentation/driver-api/driver-model/porting.rst
@@ -0,0 +1,448 @@
+=======================================
+Porting Drivers to the New Driver Model
+=======================================
+
+Patrick Mochel
+
+7 January 2003
+
+
+Overview
+
+Please refer to `Documentation/driver-api/driver-model/*.rst` for definitions of
+various driver types and concepts.
+
+Most of the work of porting device drivers to the new model happens
+at the bus driver layer. This was intentional, to minimize the
+negative effect on kernel drivers, and to allow a gradual transition
+of bus drivers.
+
+In a nutshell, the driver model consists of a set of objects that can
+be embedded in larger, bus-specific objects. Fields in these generic
+objects can replace fields in the bus-specific objects.
+
+The generic objects must be registered with the driver model core. By
+doing so, they will be exported via the sysfs filesystem. sysfs can be
+mounted by doing::
+
+	# mount -t sysfs sysfs /sys
+
+
+
+The Process
+
+Step 0: Read include/linux/device.h for object and function definitions.
+
+Step 1: Registering the bus driver.
+
+
+- Define a struct bus_type for the bus driver::
+
+    struct bus_type pci_bus_type = {
+          .name           = "pci",
+    };
+
+
+- Register the bus type.
+
+  This should be done in the initialization function for the bus type,
+  which is usually the module_init(), or equivalent, function::
+
+    static int __init pci_driver_init(void)
+    {
+            return bus_register(&pci_bus_type);
+    }
+
+    subsys_initcall(pci_driver_init);
+
+
+  The bus type may be unregistered (if the bus driver may be compiled
+  as a module) by doing::
+
+     bus_unregister(&pci_bus_type);
+
+
+- Export the bus type for others to use.
+
+  Other code may wish to reference the bus type, so declare it in a
+  shared header file and export the symbol.
+
+From include/linux/pci.h::
+
+  extern struct bus_type pci_bus_type;
+
+
+From the file the above code appears in::
+
+  EXPORT_SYMBOL(pci_bus_type);
+
+
+
+- This will cause the bus to show up in /sys/bus/pci/ with two
+  subdirectories: 'devices' and 'drivers'::
+
+    # tree -d /sys/bus/pci/
+    /sys/bus/pci/
+    |-- devices
+    `-- drivers
+
+
+
+Step 2: Registering Devices.
+
+struct device represents a single device. It mainly contains metadata
+describing the relationship the device has to other entities.
+
+
+- Embed a struct device in the bus-specific device type::
+
+
+    struct pci_dev {
+           ...
+           struct  device  dev;            /* Generic device interface */
+           ...
+    };
+
+  It is recommended that the generic device not be the first item in
+  the struct to discourage programmers from doing mindless casts
+  between the object types. Instead macros, or inline functions,
+  should be created to convert from the generic object type::
+
+
+    #define to_pci_dev(n) container_of(n, struct pci_dev, dev)
+
+    or
+
+    static inline struct pci_dev * to_pci_dev(struct device * dev)
+    {
+	return container_of(dev, struct pci_dev, dev);
+    }
+
+  This allows the compiler to verify type-safety of the operations
+  that are performed (which is Good).
+
+
+- Initialize the device on registration.
+
+  When devices are discovered or registered with the bus type, the
+  bus driver should initialize the generic device. The most important
+  things to initialize are the bus_id, parent, and bus fields.
+
+  The bus_id is an ASCII string that contains the device's address on
+  the bus. The format of this string is bus-specific. This is
+  necessary for representing devices in sysfs.
+
+  parent is the physical parent of the device. It is important that
+  the bus driver sets this field correctly.
+
+  The driver model maintains an ordered list of devices that it uses
+  for power management. This list must be kept ordered to guarantee that
+  devices are shut down before their physical parents, and vice versa.
+  The order of this list is determined by the parent of registered
+  devices.
+
+  Also, the location of the device's sysfs directory depends on a
+  device's parent. sysfs exports a directory structure that mirrors
+  the device hierarchy. Accurately setting the parent guarantees that
+  sysfs will accurately represent the hierarchy.
+
+  The device's bus field is a pointer to the bus type the device
+  belongs to. This should be set to the bus_type that was declared
+  and initialized before.
+
+  Optionally, the bus driver may set the device's name and release
+  fields.
+
+  The name field is an ASCII string describing the device, like
+
+     "ATI Technologies Inc Radeon QD"
+
+  The release field is a callback that the driver model core calls
+  when the device has been removed, and all references to it have
+  been released. More on this in a moment.
+
+
+- Register the device.
+
+  Once the generic device has been initialized, it can be registered
+  with the driver model core by doing::
+
+       device_register(&dev->dev);
+
+  It can later be unregistered by doing::
+
+       device_unregister(&dev->dev);
+
+  This should happen on buses that support hotpluggable devices.
+  If a bus driver unregisters a device, it should not immediately free
+  it. It should instead wait for the driver model core to call the
+  device's release method, then free the bus-specific object.
+  (There may be other code that is currently referencing the device
+  structure, and it would be rude to free the device while that is
+  happening).
+
+
+  When the device is registered, a directory in sysfs is created.
+  The PCI tree in sysfs looks like::
+
+    /sys/devices/pci0/
+    |-- 00:00.0
+    |-- 00:01.0
+    |   `-- 01:00.0
+    |-- 00:02.0
+    |   `-- 02:1f.0
+    |       `-- 03:00.0
+    |-- 00:1e.0
+    |   `-- 04:04.0
+    |-- 00:1f.0
+    |-- 00:1f.1
+    |   |-- ide0
+    |   |   |-- 0.0
+    |   |   `-- 0.1
+    |   `-- ide1
+    |       `-- 1.0
+    |-- 00:1f.2
+    |-- 00:1f.3
+    `-- 00:1f.5
+
+  Also, symlinks are created in the bus's 'devices' directory
+  that point to the device's directory in the physical hierarchy::
+
+    /sys/bus/pci/devices/
+    |-- 00:00.0 -> ../../../devices/pci0/00:00.0
+    |-- 00:01.0 -> ../../../devices/pci0/00:01.0
+    |-- 00:02.0 -> ../../../devices/pci0/00:02.0
+    |-- 00:1e.0 -> ../../../devices/pci0/00:1e.0
+    |-- 00:1f.0 -> ../../../devices/pci0/00:1f.0
+    |-- 00:1f.1 -> ../../../devices/pci0/00:1f.1
+    |-- 00:1f.2 -> ../../../devices/pci0/00:1f.2
+    |-- 00:1f.3 -> ../../../devices/pci0/00:1f.3
+    |-- 00:1f.5 -> ../../../devices/pci0/00:1f.5
+    |-- 01:00.0 -> ../../../devices/pci0/00:01.0/01:00.0
+    |-- 02:1f.0 -> ../../../devices/pci0/00:02.0/02:1f.0
+    |-- 03:00.0 -> ../../../devices/pci0/00:02.0/02:1f.0/03:00.0
+    `-- 04:04.0 -> ../../../devices/pci0/00:1e.0/04:04.0
+
+
+
+Step 3: Registering Drivers.
+
+struct device_driver is a simple driver structure that contains a set
+of operations that the driver model core may call.
+
+
+- Embed a struct device_driver in the bus-specific driver.
+
+  Just like with devices, do something like::
+
+    struct pci_driver {
+           ...
+           struct device_driver    driver;
+    };
+
+
+- Initialize the generic driver structure.
+
+  When the driver registers with the bus (e.g. doing pci_register_driver()),
+  initialize the necessary fields of the driver: the name and bus
+  fields.
+
+
+- Register the driver.
+
+  After the generic driver has been initialized, call::
+
+	driver_register(&drv->driver);
+
+  to register the driver with the core.
+
+  When the driver is unregistered from the bus, unregister it from the
+  core by doing::
+
+        driver_unregister(&drv->driver);
+
+  Note that this will block until all references to the driver have
+  gone away. Normally, there will not be any.
+
+
+- Sysfs representation.
+
+  Drivers are exported via sysfs in their bus's 'drivers' directory.
+  For example::
+
+    /sys/bus/pci/drivers/
+    |-- 3c59x
+    |-- Ensoniq AudioPCI
+    |-- agpgart-amdk7
+    |-- e100
+    `-- serial
+
+
+Step 4: Define Generic Methods for Drivers.
+
+struct device_driver defines a set of operations that the driver model
+core calls. Most of these operations are probably similar to
+operations the bus already defines for drivers, but taking different
+parameters.
+
+It would be difficult and tedious to force every driver on a bus to
+simultaneously convert their drivers to generic format. Instead, the
+bus driver should define single instances of the generic methods that
+forward calls to the bus-specific drivers. For instance::
+
+
+  static int pci_device_remove(struct device * dev)
+  {
+          struct pci_dev * pci_dev = to_pci_dev(dev);
+          struct pci_driver * drv = pci_dev->driver;
+
+          if (drv) {
+                  if (drv->remove)
+                          drv->remove(pci_dev);
+                  pci_dev->driver = NULL;
+          }
+          return 0;
+  }
+
+
+The generic driver should be initialized with these methods before it
+is registered::
+
+        /* initialize common driver fields */
+        drv->driver.name = drv->name;
+        drv->driver.bus = &pci_bus_type;
+        drv->driver.probe = pci_device_probe;
+        drv->driver.resume = pci_device_resume;
+        drv->driver.suspend = pci_device_suspend;
+        drv->driver.remove = pci_device_remove;
+
+        /* register with core */
+        driver_register(&drv->driver);
+
+
+Ideally, the bus should only initialize the fields if they are not
+already set. This allows the drivers to implement their own generic
+methods.
+
+
+Step 5: Support generic driver binding.
+
+The model assumes that a device or driver can be dynamically
+registered with the bus at any time. When registration happens,
+devices must be bound to a driver, or drivers must be bound to all
+devices that it supports.
+
+A driver typically contains a list of device IDs that it supports. The
+bus driver compares these IDs to the IDs of devices registered with it.
+The format of the device IDs, and the semantics for comparing them are
+bus-specific, so the generic model does not attempt to generalize them.
+
+Instead, a bus may supply a method in struct bus_type that does the
+comparison::
+
+  int (*match)(struct device * dev, struct device_driver * drv);
+
+match should return a positive value if the driver supports the device,
+and zero otherwise. It may also return an error code (for example
+-EPROBE_DEFER) if determining that given driver supports the device is
+not possible.
+
+When a device is registered, the bus's list of drivers is iterated
+over. bus->match() is called for each one until a match is found.
+
+When a driver is registered, the bus's list of devices is iterated
+over. bus->match() is called for each device that is not already
+claimed by a driver.
+
+When a device is successfully bound to a driver, device->driver is
+set, the device is added to a per-driver list of devices, and a
+symlink is created in the driver's sysfs directory that points to the
+device's physical directory::
+
+  /sys/bus/pci/drivers/
+  |-- 3c59x
+  |   `-- 00:0b.0 -> ../../../../devices/pci0/00:0b.0
+  |-- Ensoniq AudioPCI
+  |-- agpgart-amdk7
+  |   `-- 00:00.0 -> ../../../../devices/pci0/00:00.0
+  |-- e100
+  |   `-- 00:0c.0 -> ../../../../devices/pci0/00:0c.0
+  `-- serial
+
+
+This driver binding should replace the existing driver binding
+mechanism the bus currently uses.
+
+
+Step 6: Supply a hotplug callback.
+
+Whenever a device is registered with the driver model core, the
+userspace program /sbin/hotplug is called to notify userspace.
+Users can define actions to perform when a device is inserted or
+removed.
+
+The driver model core passes several arguments to userspace via
+environment variables, including
+
+- ACTION: set to 'add' or 'remove'
+- DEVPATH: set to the device's physical path in sysfs.
+
+A bus driver may also supply additional parameters for userspace to
+consume. To do this, a bus must implement the 'hotplug' method in
+struct bus_type::
+
+     int (*hotplug) (struct device *dev, char **envp,
+                     int num_envp, char *buffer, int buffer_size);
+
+This is called immediately before /sbin/hotplug is executed.
+
+
+Step 7: Cleaning up the bus driver.
+
+The generic bus, device, and driver structures provide several fields
+that can replace those defined privately to the bus driver.
+
+- Device list.
+
+struct bus_type contains a list of all devices registered with the bus
+type. This includes all devices on all instances of that bus type.
+An internal list that the bus uses may be removed, in favor of using
+this one.
+
+The core provides an iterator to access these devices::
+
+  int bus_for_each_dev(struct bus_type * bus, struct device * start,
+                       void * data, int (*fn)(struct device *, void *));
+
+
+- Driver list.
+
+struct bus_type also contains a list of all drivers registered with
+it. An internal list of drivers that the bus driver maintains may
+be removed in favor of using the generic one.
+
+The drivers may be iterated over, like devices::
+
+  int bus_for_each_drv(struct bus_type * bus, struct device_driver * start,
+                       void * data, int (*fn)(struct device_driver *, void *));
+
+
+Please see drivers/base/bus.c for more information.
+
+
+- rwsem
+
+struct bus_type contains an rwsem that protects all core accesses to
+the device and driver lists. This can be used by the bus driver
+internally, and should be used when accessing the device or driver
+lists the bus maintains.
+
+
+- Device and driver fields.
+
+Some of the fields in struct device and struct device_driver duplicate
+fields in the bus-specific representations of these objects. Feel free
+to remove the bus-specific ones and favor the generic ones. Note
+though, that this will likely mean fixing up all the drivers that
+reference the bus-specific fields (though those should all be 1-line
+changes).
diff --git a/Documentation/driver-api/gpio/driver.rst b/Documentation/driver-api/gpio/driver.rst
index 349f2dc33029..921c71a3d683 100644
--- a/Documentation/driver-api/gpio/driver.rst
+++ b/Documentation/driver-api/gpio/driver.rst
@@ -399,7 +399,7 @@ symbol:
   will pass the struct gpio_chip* for the chip to all IRQ callbacks, so the
   callbacks need to embed the gpio_chip in its state container and obtain a
   pointer to the container using container_of().
-  (See Documentation/driver-model/design-patterns.rst)
+  (See Documentation/driver-api/driver-model/design-patterns.rst)
 
 - gpiochip_irqchip_add_nested(): adds a nested cascaded irqchip to a gpiochip,
   as discussed above regarding different types of cascaded irqchips. The
diff --git a/Documentation/driver-api/index.rst b/Documentation/driver-api/index.rst
index b4c993ff7655..9fb03b7bdeb1 100644
--- a/Documentation/driver-api/index.rst
+++ b/Documentation/driver-api/index.rst
@@ -14,6 +14,7 @@ available subsections can be seen below.
 .. toctree::
    :maxdepth: 2
 
+   driver-model/index
    basics
    infrastructure
    early-userspace/index
diff --git a/Documentation/driver-model/binding.rst b/Documentation/driver-model/binding.rst
deleted file mode 100644
index 7ea1d7a41e1d..000000000000
--- a/Documentation/driver-model/binding.rst
+++ /dev/null
@@ -1,98 +0,0 @@
-==============
-Driver Binding
-==============
-
-Driver binding is the process of associating a device with a device
-driver that can control it. Bus drivers have typically handled this
-because there have been bus-specific structures to represent the
-devices and the drivers. With generic device and device driver
-structures, most of the binding can take place using common code.
-
-
-Bus
-~~~
-
-The bus type structure contains a list of all devices that are on that bus
-type in the system. When device_register is called for a device, it is
-inserted into the end of this list. The bus object also contains a
-list of all drivers of that bus type. When driver_register is called
-for a driver, it is inserted at the end of this list. These are the
-two events which trigger driver binding.
-
-
-device_register
-~~~~~~~~~~~~~~~
-
-When a new device is added, the bus's list of drivers is iterated over
-to find one that supports it. In order to determine that, the device
-ID of the device must match one of the device IDs that the driver
-supports. The format and semantics for comparing IDs is bus-specific.
-Instead of trying to derive a complex state machine and matching
-algorithm, it is up to the bus driver to provide a callback to compare
-a device against the IDs of a driver. The bus returns 1 if a match was
-found; 0 otherwise.
-
-int match(struct device * dev, struct device_driver * drv);
-
-If a match is found, the device's driver field is set to the driver
-and the driver's probe callback is called. This gives the driver a
-chance to verify that it really does support the hardware, and that
-it's in a working state.
-
-Device Class
-~~~~~~~~~~~~
-
-Upon the successful completion of probe, the device is registered with
-the class to which it belongs. Device drivers belong to one and only one
-class, and that is set in the driver's devclass field.
-devclass_add_device is called to enumerate the device within the class
-and actually register it with the class, which happens with the
-class's register_dev callback.
-
-
-Driver
-~~~~~~
-
-When a driver is attached to a device, the device is inserted into the
-driver's list of devices.
-
-
-sysfs
-~~~~~
-
-A symlink is created in the bus's 'devices' directory that points to
-the device's directory in the physical hierarchy.
-
-A symlink is created in the driver's 'devices' directory that points
-to the device's directory in the physical hierarchy.
-
-A directory for the device is created in the class's directory. A
-symlink is created in that directory that points to the device's
-physical location in the sysfs tree.
-
-A symlink can be created (though this isn't done yet) in the device's
-physical directory to either its class directory, or the class's
-top-level directory. One can also be created to point to its driver's
-directory also.
-
-
-driver_register
-~~~~~~~~~~~~~~~
-
-The process is almost identical for when a new driver is added.
-The bus's list of devices is iterated over to find a match. Devices
-that already have a driver are skipped. All the devices are iterated
-over, to bind as many devices as possible to the driver.
-
-
-Removal
-~~~~~~~
-
-When a device is removed, the reference count for it will eventually
-go to 0. When it does, the remove callback of the driver is called. It
-is removed from the driver's list of devices and the reference count
-of the driver is decremented. All symlinks between the two are removed.
-
-When a driver is removed, the list of devices that it supports is
-iterated over, and the driver's remove callback is called for each
-one. The device is removed from that list and the symlinks removed.
diff --git a/Documentation/driver-model/bus.rst b/Documentation/driver-model/bus.rst
deleted file mode 100644
index 016b15a6e8ea..000000000000
--- a/Documentation/driver-model/bus.rst
+++ /dev/null
@@ -1,146 +0,0 @@
-=========
-Bus Types
-=========
-
-Definition
-~~~~~~~~~~
-See the kerneldoc for the struct bus_type.
-
-int bus_register(struct bus_type * bus);
-
-
-Declaration
-~~~~~~~~~~~
-
-Each bus type in the kernel (PCI, USB, etc) should declare one static
-object of this type. They must initialize the name field, and may
-optionally initialize the match callback::
-
-   struct bus_type pci_bus_type = {
-          .name	= "pci",
-          .match	= pci_bus_match,
-   };
-
-The structure should be exported to drivers in a header file:
-
-extern struct bus_type pci_bus_type;
-
-
-Registration
-~~~~~~~~~~~~
-
-When a bus driver is initialized, it calls bus_register. This
-initializes the rest of the fields in the bus object and inserts it
-into a global list of bus types. Once the bus object is registered,
-the fields in it are usable by the bus driver.
-
-
-Callbacks
-~~~~~~~~~
-
-match(): Attaching Drivers to Devices
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-The format of device ID structures and the semantics for comparing
-them are inherently bus-specific. Drivers typically declare an array
-of device IDs of devices they support that reside in a bus-specific
-driver structure.
-
-The purpose of the match callback is to give the bus an opportunity to
-determine if a particular driver supports a particular device by
-comparing the device IDs the driver supports with the device ID of a
-particular device, without sacrificing bus-specific functionality or
-type-safety.
-
-When a driver is registered with the bus, the bus's list of devices is
-iterated over, and the match callback is called for each device that
-does not have a driver associated with it.
-
-
-
-Device and Driver Lists
-~~~~~~~~~~~~~~~~~~~~~~~
-
-The lists of devices and drivers are intended to replace the local
-lists that many buses keep. They are lists of struct devices and
-struct device_drivers, respectively. Bus drivers are free to use the
-lists as they please, but conversion to the bus-specific type may be
-necessary.
-
-The LDM core provides helper functions for iterating over each list::
-
-  int bus_for_each_dev(struct bus_type * bus, struct device * start,
-		       void * data,
-		       int (*fn)(struct device *, void *));
-
-  int bus_for_each_drv(struct bus_type * bus, struct device_driver * start,
-		       void * data, int (*fn)(struct device_driver *, void *));
-
-These helpers iterate over the respective list, and call the callback
-for each device or driver in the list. All list accesses are
-synchronized by taking the bus's lock (read currently). The reference
-count on each object in the list is incremented before the callback is
-called; it is decremented after the next object has been obtained. The
-lock is not held when calling the callback.
-
-
-sysfs
-~~~~~~~~
-There is a top-level directory named 'bus'.
-
-Each bus gets a directory in the bus directory, along with two default
-directories::
-
-	/sys/bus/pci/
-	|-- devices
-	`-- drivers
-
-Drivers registered with the bus get a directory in the bus's drivers
-directory::
-
-	/sys/bus/pci/
-	|-- devices
-	`-- drivers
-	    |-- Intel ICH
-	    |-- Intel ICH Joystick
-	    |-- agpgart
-	    `-- e100
-
-Each device that is discovered on a bus of that type gets a symlink in
-the bus's devices directory to the device's directory in the physical
-hierarchy::
-
-	/sys/bus/pci/
-	|-- devices
-	|   |-- 00:00.0 -> ../../../root/pci0/00:00.0
-	|   |-- 00:01.0 -> ../../../root/pci0/00:01.0
-	|   `-- 00:02.0 -> ../../../root/pci0/00:02.0
-	`-- drivers
-
-
-Exporting Attributes
-~~~~~~~~~~~~~~~~~~~~
-
-::
-
-  struct bus_attribute {
-	struct attribute	attr;
-	ssize_t (*show)(struct bus_type *, char * buf);
-	ssize_t (*store)(struct bus_type *, const char * buf, size_t count);
-  };
-
-Bus drivers can export attributes using the BUS_ATTR_RW macro that works
-similarly to the DEVICE_ATTR_RW macro for devices. For example, a
-definition like this::
-
-	static BUS_ATTR_RW(debug);
-
-is equivalent to declaring::
-
-	static bus_attribute bus_attr_debug;
-
-This can then be used to add and remove the attribute from the bus's
-sysfs directory using::
-
-	int bus_create_file(struct bus_type *, struct bus_attribute *);
-	void bus_remove_file(struct bus_type *, struct bus_attribute *);
diff --git a/Documentation/driver-model/class.rst b/Documentation/driver-model/class.rst
deleted file mode 100644
index fff55b80e86a..000000000000
--- a/Documentation/driver-model/class.rst
+++ /dev/null
@@ -1,149 +0,0 @@
-==============
-Device Classes
-==============
-
-Introduction
-~~~~~~~~~~~~
-A device class describes a type of device, like an audio or network
-device. The following device classes have been identified:
-
-<Insert List of Device Classes Here>
-
-
-Each device class defines a set of semantics and a programming interface
-that devices of that class adhere to. Device drivers are the
-implementation of that programming interface for a particular device on
-a particular bus.
-
-Device classes are agnostic with respect to what bus a device resides
-on.
-
-
-Programming Interface
-~~~~~~~~~~~~~~~~~~~~~
-The device class structure looks like::
-
-
-  typedef int (*devclass_add)(struct device *);
-  typedef void (*devclass_remove)(struct device *);
-
-See the kerneldoc for the struct class.
-
-A typical device class definition would look like::
-
-  struct device_class input_devclass = {
-        .name		= "input",
-        .add_device	= input_add_device,
-	.remove_device	= input_remove_device,
-  };
-
-Each device class structure should be exported in a header file so it
-can be used by drivers, extensions and interfaces.
-
-Device classes are registered and unregistered with the core using::
-
-  int devclass_register(struct device_class * cls);
-  void devclass_unregister(struct device_class * cls);
-
-
-Devices
-~~~~~~~
-As devices are bound to drivers, they are added to the device class
-that the driver belongs to. Before the driver model core, this would
-typically happen during the driver's probe() callback, once the device
-has been initialized. It now happens after the probe() callback
-finishes from the core.
-
-The device is enumerated in the class. Each time a device is added to
-the class, the class's devnum field is incremented and assigned to the
-device. The field is never decremented, so if the device is removed
-from the class and re-added, it will receive a different enumerated
-value.
-
-The class is allowed to create a class-specific structure for the
-device and store it in the device's class_data pointer.
-
-There is no list of devices in the device class. Each driver has a
-list of devices that it supports. The device class has a list of
-drivers of that particular class. To access all of the devices in the
-class, iterate over the device lists of each driver in the class.
-
-
-Device Drivers
-~~~~~~~~~~~~~~
-Device drivers are added to device classes when they are registered
-with the core. A driver specifies the class it belongs to by setting
-the struct device_driver::devclass field.
-
-
-sysfs directory structure
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-There is a top-level sysfs directory named 'class'.
-
-Each class gets a directory in the class directory, along with two
-default subdirectories::
-
-        class/
-        `-- input
-            |-- devices
-            `-- drivers
-
-
-Drivers registered with the class get a symlink in the drivers/ directory
-that points to the driver's directory (under its bus directory)::
-
-   class/
-   `-- input
-       |-- devices
-       `-- drivers
-           `-- usb:usb_mouse -> ../../../bus/drivers/usb_mouse/
-
-
-Each device gets a symlink in the devices/ directory that points to the
-device's directory in the physical hierarchy::
-
-   class/
-   `-- input
-       |-- devices
-       |   `-- 1 -> ../../../root/pci0/00:1f.0/usb_bus/00:1f.2-1:0/
-       `-- drivers
-
-
-Exporting Attributes
-~~~~~~~~~~~~~~~~~~~~
-
-::
-
-  struct devclass_attribute {
-        struct attribute        attr;
-        ssize_t (*show)(struct device_class *, char * buf, size_t count, loff_t off);
-        ssize_t (*store)(struct device_class *, const char * buf, size_t count, loff_t off);
-  };
-
-Class drivers can export attributes using the DEVCLASS_ATTR macro that works
-similarly to the DEVICE_ATTR macro for devices. For example, a definition
-like this::
-
-  static DEVCLASS_ATTR(debug,0644,show_debug,store_debug);
-
-is equivalent to declaring::
-
-  static devclass_attribute devclass_attr_debug;
-
-The bus driver can add and remove the attribute from the class's
-sysfs directory using::
-
-  int devclass_create_file(struct device_class *, struct devclass_attribute *);
-  void devclass_remove_file(struct device_class *, struct devclass_attribute *);
-
-In the example above, the file will be named 'debug' and placed in the
-class's directory in sysfs.
-
-
-Interfaces
-~~~~~~~~~~
-There may exist multiple mechanisms for accessing the same device of a
-particular class type. Device interfaces describe these mechanisms.
-
-When a device is added to a device class, the core attempts to add it
-to every interface that is registered with the device class.
diff --git a/Documentation/driver-model/design-patterns.rst b/Documentation/driver-model/design-patterns.rst
deleted file mode 100644
index 41eb8f41f7dd..000000000000
--- a/Documentation/driver-model/design-patterns.rst
+++ /dev/null
@@ -1,116 +0,0 @@
-=============================
-Device Driver Design Patterns
-=============================
-
-This document describes a few common design patterns found in device drivers.
-It is likely that subsystem maintainers will ask driver developers to
-conform to these design patterns.
-
-1. State Container
-2. container_of()
-
-
-1. State Container
-~~~~~~~~~~~~~~~~~~
-
-While the kernel contains a few device drivers that assume that they will
-only be probed() once on a certain system (singletons), it is customary to assume
-that the device the driver binds to will appear in several instances. This
-means that the probe() function and all callbacks need to be reentrant.
-
-The most common way to achieve this is to use the state container design
-pattern. It usually has this form::
-
-  struct foo {
-      spinlock_t lock; /* Example member */
-      (...)
-  };
-
-  static int foo_probe(...)
-  {
-      struct foo *foo;
-
-      foo = devm_kzalloc(dev, sizeof(*foo), GFP_KERNEL);
-      if (!foo)
-          return -ENOMEM;
-      spin_lock_init(&foo->lock);
-      (...)
-  }
-
-This will create an instance of struct foo in memory every time probe() is
-called. This is our state container for this instance of the device driver.
-Of course it is then necessary to always pass this instance of the
-state around to all functions that need access to the state and its members.
-
-For example, if the driver is registering an interrupt handler, you would
-pass around a pointer to struct foo like this::
-
-  static irqreturn_t foo_handler(int irq, void *arg)
-  {
-      struct foo *foo = arg;
-      (...)
-  }
-
-  static int foo_probe(...)
-  {
-      struct foo *foo;
-
-      (...)
-      ret = request_irq(irq, foo_handler, 0, "foo", foo);
-  }
-
-This way you always get a pointer back to the correct instance of foo in
-your interrupt handler.
-
-
-2. container_of()
-~~~~~~~~~~~~~~~~~
-
-Continuing on the above example we add an offloaded work::
-
-  struct foo {
-      spinlock_t lock;
-      struct workqueue_struct *wq;
-      struct work_struct offload;
-      (...)
-  };
-
-  static void foo_work(struct work_struct *work)
-  {
-      struct foo *foo = container_of(work, struct foo, offload);
-
-      (...)
-  }
-
-  static irqreturn_t foo_handler(int irq, void *arg)
-  {
-      struct foo *foo = arg;
-
-      queue_work(foo->wq, &foo->offload);
-      (...)
-  }
-
-  static int foo_probe(...)
-  {
-      struct foo *foo;
-
-      foo->wq = create_singlethread_workqueue("foo-wq");
-      INIT_WORK(&foo->offload, foo_work);
-      (...)
-  }
-
-The design pattern is the same for an hrtimer or something similar that will
-return a single argument which is a pointer to a struct member in the
-callback.
-
-container_of() is a macro defined in <linux/kernel.h>
-
-What container_of() does is to obtain a pointer to the containing struct from
-a pointer to a member by a simple subtraction using the offsetof() macro from
-standard C, which allows something similar to object oriented behaviours.
-Notice that the contained member must not be a pointer, but an actual member
-for this to work.
-
-We can see here that we avoid having global pointers to our struct foo *
-instance this way, while still keeping the number of parameters passed to the
-work function to a single pointer.
diff --git a/Documentation/driver-model/device.rst b/Documentation/driver-model/device.rst
deleted file mode 100644
index 2b868d49d349..000000000000
--- a/Documentation/driver-model/device.rst
+++ /dev/null
@@ -1,109 +0,0 @@
-==========================
-The Basic Device Structure
-==========================
-
-See the kerneldoc for the struct device.
-
-
-Programming Interface
-~~~~~~~~~~~~~~~~~~~~~
-The bus driver that discovers the device uses this to register the
-device with the core::
-
-  int device_register(struct device * dev);
-
-The bus should initialize the following fields:
-
-    - parent
-    - name
-    - bus_id
-    - bus
-
-A device is removed from the core when its reference count goes to
-0. The reference count can be adjusted using::
-
-  struct device * get_device(struct device * dev);
-  void put_device(struct device * dev);
-
-get_device() will return a pointer to the struct device passed to it
-if the reference is not already 0 (if it's in the process of being
-removed already).
-
-A driver can access the lock in the device structure using::
-
-  void lock_device(struct device * dev);
-  void unlock_device(struct device * dev);
-
-
-Attributes
-~~~~~~~~~~
-
-::
-
-  struct device_attribute {
-	struct attribute	attr;
-	ssize_t (*show)(struct device *dev, struct device_attribute *attr,
-			char *buf);
-	ssize_t (*store)(struct device *dev, struct device_attribute *attr,
-			 const char *buf, size_t count);
-  };
-
-Attributes of devices can be exported by a device driver through sysfs.
-
-Please see Documentation/filesystems/sysfs.txt for more information
-on how sysfs works.
-
-As explained in Documentation/kobject.txt, device attributes must be
-created before the KOBJ_ADD uevent is generated. The only way to realize
-that is by defining an attribute group.
-
-Attributes are declared using a macro called DEVICE_ATTR::
-
-  #define DEVICE_ATTR(name,mode,show,store)
-
-Example::
-
-  static DEVICE_ATTR(type, 0444, show_type, NULL);
-  static DEVICE_ATTR(power, 0644, show_power, store_power);
-
-This declares two structures of type struct device_attribute with respective
-names 'dev_attr_type' and 'dev_attr_power'. These two attributes can be
-organized as follows into a group::
-
-  static struct attribute *dev_attrs[] = {
-	&dev_attr_type.attr,
-	&dev_attr_power.attr,
-	NULL,
-  };
-
-  static struct attribute_group dev_attr_group = {
-	.attrs = dev_attrs,
-  };
-
-  static const struct attribute_group *dev_attr_groups[] = {
-	&dev_attr_group,
-	NULL,
-  };
-
-This array of groups can then be associated with a device by setting the
-group pointer in struct device before device_register() is invoked::
-
-        dev->groups = dev_attr_groups;
-        device_register(dev);
-
-The device_register() function will use the 'groups' pointer to create the
-device attributes and the device_unregister() function will use this pointer
-to remove the device attributes.
-
-Word of warning:  While the kernel allows device_create_file() and
-device_remove_file() to be called on a device at any time, userspace has
-strict expectations on when attributes get created.  When a new device is
-registered in the kernel, a uevent is generated to notify userspace (like
-udev) that a new device is available.  If attributes are added after the
-device is registered, then userspace won't get notified and userspace will
-not know about the new attributes.
-
-This is important for device drivers that need to publish additional
-attributes for a device at driver probe time.  If the device driver simply
-calls device_create_file() on the device structure passed to it, then
-userspace will never be notified of the new attributes.
diff --git a/Documentation/driver-model/devres.rst b/Documentation/driver-model/devres.rst
deleted file mode 100644
index 4ac99122b5f1..000000000000
--- a/Documentation/driver-model/devres.rst
+++ /dev/null
@@ -1,414 +0,0 @@
-================================
-Devres - Managed Device Resource
-================================
-
-Tejun Heo	<teheo@suse.de>
-
-First draft	10 January 2007
-
-.. contents
-
-   1. Intro			: Huh? Devres?
-   2. Devres			: Devres in a nutshell
-   3. Devres Group		: Group devres'es and release them together
-   4. Details			: Life time rules, calling context, ...
-   5. Overhead			: How much do we have to pay for this?
-   6. List of managed interfaces: Currently implemented managed interfaces
-
-
-1. Intro
---------
-
-devres came up while trying to convert libata to use iomap.  Each
-iomapped address should be kept and unmapped on driver detach.  For
-example, a plain SFF ATA controller (that is, good old PCI IDE) in
-native mode makes use of 5 PCI BARs and all of them should be
-maintained.
-
-As with many other device drivers, libata low level drivers have
-sufficient bugs in ->remove and ->probe failure path.  Well, yes,
-that's probably because libata low level driver developers are a lazy
-bunch, but aren't all low level driver developers?  After spending a
-day fiddling with braindamaged hardware with no document or
-braindamaged document, if it's finally working, well, it's working.
-
-For one reason or another, low level drivers don't receive as much
-attention or testing as core code, and bugs on driver detach or
-initialization failure don't happen often enough to be noticeable.
-Init failure path is worse because it's much less travelled while it
-needs to handle multiple entry points.
-
-So, many low level drivers end up leaking resources on driver detach
-and having half broken failure path implementation in ->probe() which
-would leak resources or even cause oops when failure occurs.  iomap
-adds more to this mix.  So do msi and msix.
-
-
-2. Devres
----------
-
-devres is basically a linked list of arbitrarily sized memory areas
-associated with a struct device.  Each devres entry is associated with
-a release function.  A devres can be released in several ways.  No
-matter what, all devres entries are released on driver detach.  On
-release, the associated release function is invoked and then the
-devres entry is freed.
-
-Managed interface is created for resources commonly used by device
-drivers using devres.  For example, coherent DMA memory is acquired
-using dma_alloc_coherent().  The managed version is called
-dmam_alloc_coherent().  It is identical to dma_alloc_coherent() except
-for the DMA memory allocated using it is managed and will be
-automatically released on driver detach.  Implementation looks like
-the following::
-
-  struct dma_devres {
-	size_t		size;
-	void		*vaddr;
-	dma_addr_t	dma_handle;
-  };
-
-  static void dmam_coherent_release(struct device *dev, void *res)
-  {
-	struct dma_devres *this = res;
-
-	dma_free_coherent(dev, this->size, this->vaddr, this->dma_handle);
-  }
-
-  dmam_alloc_coherent(dev, size, dma_handle, gfp)
-  {
-	struct dma_devres *dr;
-	void *vaddr;
-
-	dr = devres_alloc(dmam_coherent_release, sizeof(*dr), gfp);
-	...
-
-	/* alloc DMA memory as usual */
-	vaddr = dma_alloc_coherent(...);
-	...
-
-	/* record size, vaddr, dma_handle in dr */
-	dr->vaddr = vaddr;
-	...
-
-	devres_add(dev, dr);
-
-	return vaddr;
-  }
-
-If a driver uses dmam_alloc_coherent(), the area is guaranteed to be
-freed whether initialization fails half-way or the device gets
-detached.  If most resources are acquired using managed interface, a
-driver can have much simpler init and exit code.  Init path basically
-looks like the following::
-
-  my_init_one()
-  {
-	struct mydev *d;
-
-	d = devm_kzalloc(dev, sizeof(*d), GFP_KERNEL);
-	if (!d)
-		return -ENOMEM;
-
-	d->ring = dmam_alloc_coherent(...);
-	if (!d->ring)
-		return -ENOMEM;
-
-	if (check something)
-		return -EINVAL;
-	...
-
-	return register_to_upper_layer(d);
-  }
-
-And exit path::
-
-  my_remove_one()
-  {
-	unregister_from_upper_layer(d);
-	shutdown_my_hardware();
-  }
-
-As shown above, low level drivers can be simplified a lot by using
-devres.  Complexity is shifted from less maintained low level drivers
-to better maintained higher layer.  Also, as init failure path is
-shared with exit path, both can get more testing.
-
-Note though that when converting current calls or assignments to
-managed devm_* versions it is up to you to check if internal operations
-like allocating memory, have failed. Managed resources pertain to the
-freeing of these resources *only* - all other checks needed are still
-on you. In some cases this may mean introducing checks that were not
-necessary before moving to the managed devm_* calls.
-
-
-3. Devres group
----------------
-
-Devres entries can be grouped using devres group.  When a group is
-released, all contained normal devres entries and properly nested
-groups are released.  One usage is to rollback series of acquired
-resources on failure.  For example::
-
-  if (!devres_open_group(dev, NULL, GFP_KERNEL))
-	return -ENOMEM;
-
-  acquire A;
-  if (failed)
-	goto err;
-
-  acquire B;
-  if (failed)
-	goto err;
-  ...
-
-  devres_remove_group(dev, NULL);
-  return 0;
-
- err:
-  devres_release_group(dev, NULL);
-  return err_code;
-
-As resource acquisition failure usually means probe failure, constructs
-like above are usually useful in midlayer driver (e.g. libata core
-layer) where interface function shouldn't have side effect on failure.
-For LLDs, just returning error code suffices in most cases.
-
-Each group is identified by `void *id`.  It can either be explicitly
-specified by @id argument to devres_open_group() or automatically
-created by passing NULL as @id as in the above example.  In both
-cases, devres_open_group() returns the group's id.  The returned id
-can be passed to other devres functions to select the target group.
-If NULL is given to those functions, the latest open group is
-selected.
-
-For example, you can do something like the following::
-
-  int my_midlayer_create_something()
-  {
-	if (!devres_open_group(dev, my_midlayer_create_something, GFP_KERNEL))
-		return -ENOMEM;
-
-	...
-
-	devres_close_group(dev, my_midlayer_create_something);
-	return 0;
-  }
-
-  void my_midlayer_destroy_something()
-  {
-	devres_release_group(dev, my_midlayer_create_something);
-  }
-
-
-4. Details
-----------
-
-Lifetime of a devres entry begins on devres allocation and finishes
-when it is released or destroyed (removed and freed) - no reference
-counting.
-
-devres core guarantees atomicity to all basic devres operations and
-has support for single-instance devres types (atomic
-lookup-and-add-if-not-found).  Other than that, synchronizing
-concurrent accesses to allocated devres data is caller's
-responsibility.  This is usually non-issue because bus ops and
-resource allocations already do the job.
-
-For an example of single-instance devres type, read pcim_iomap_table()
-in lib/devres.c.
-
-All devres interface functions can be called without context if the
-right gfp mask is given.
-
-
-5. Overhead
------------
-
-Each devres bookkeeping info is allocated together with requested data
-area.  With debug option turned off, bookkeeping info occupies 16
-bytes on 32bit machines and 24 bytes on 64bit (three pointers rounded
-up to ull alignment).  If singly linked list is used, it can be
-reduced to two pointers (8 bytes on 32bit, 16 bytes on 64bit).
-
-Each devres group occupies 8 pointers.  It can be reduced to 6 if
-singly linked list is used.
-
-Memory space overhead on ahci controller with two ports is between 300
-and 400 bytes on 32bit machine after naive conversion (we can
-certainly invest a bit more effort into libata core layer).
-
-
-6. List of managed interfaces
------------------------------
-
-CLOCK
-  devm_clk_get()
-  devm_clk_get_optional()
-  devm_clk_put()
-  devm_clk_hw_register()
-  devm_of_clk_add_hw_provider()
-  devm_clk_hw_register_clkdev()
-
-DMA
-  dmaenginem_async_device_register()
-  dmam_alloc_coherent()
-  dmam_alloc_attrs()
-  dmam_free_coherent()
-  dmam_pool_create()
-  dmam_pool_destroy()
-
-DRM
-  devm_drm_dev_init()
-
-GPIO
-  devm_gpiod_get()
-  devm_gpiod_get_index()
-  devm_gpiod_get_index_optional()
-  devm_gpiod_get_optional()
-  devm_gpiod_put()
-  devm_gpiod_unhinge()
-  devm_gpiochip_add_data()
-  devm_gpio_request()
-  devm_gpio_request_one()
-  devm_gpio_free()
-
-I2C
-  devm_i2c_new_dummy_device()
-
-IIO
-  devm_iio_device_alloc()
-  devm_iio_device_free()
-  devm_iio_device_register()
-  devm_iio_device_unregister()
-  devm_iio_kfifo_allocate()
-  devm_iio_kfifo_free()
-  devm_iio_triggered_buffer_setup()
-  devm_iio_triggered_buffer_cleanup()
-  devm_iio_trigger_alloc()
-  devm_iio_trigger_free()
-  devm_iio_trigger_register()
-  devm_iio_trigger_unregister()
-  devm_iio_channel_get()
-  devm_iio_channel_release()
-  devm_iio_channel_get_all()
-  devm_iio_channel_release_all()
-
-INPUT
-  devm_input_allocate_device()
-
-IO region
-  devm_release_mem_region()
-  devm_release_region()
-  devm_release_resource()
-  devm_request_mem_region()
-  devm_request_region()
-  devm_request_resource()
-
-IOMAP
-  devm_ioport_map()
-  devm_ioport_unmap()
-  devm_ioremap()
-  devm_ioremap_nocache()
-  devm_ioremap_wc()
-  devm_ioremap_resource() : checks resource, requests memory region, ioremaps
-  devm_iounmap()
-  pcim_iomap()
-  pcim_iomap_regions()	: do request_region() and iomap() on multiple BARs
-  pcim_iomap_table()	: array of mapped addresses indexed by BAR
-  pcim_iounmap()
-
-IRQ
-  devm_free_irq()
-  devm_request_any_context_irq()
-  devm_request_irq()
-  devm_request_threaded_irq()
-  devm_irq_alloc_descs()
-  devm_irq_alloc_desc()
-  devm_irq_alloc_desc_at()
-  devm_irq_alloc_desc_from()
-  devm_irq_alloc_descs_from()
-  devm_irq_alloc_generic_chip()
-  devm_irq_setup_generic_chip()
-  devm_irq_sim_init()
-
-LED
-  devm_led_classdev_register()
-  devm_led_classdev_unregister()
-
-MDIO
-  devm_mdiobus_alloc()
-  devm_mdiobus_alloc_size()
-  devm_mdiobus_free()
-
-MEM
-  devm_free_pages()
-  devm_get_free_pages()
-  devm_kasprintf()
-  devm_kcalloc()
-  devm_kfree()
-  devm_kmalloc()
-  devm_kmalloc_array()
-  devm_kmemdup()
-  devm_kstrdup()
-  devm_kvasprintf()
-  devm_kzalloc()
-
-MFD
-  devm_mfd_add_devices()
-
-MUX
-  devm_mux_chip_alloc()
-  devm_mux_chip_register()
-  devm_mux_control_get()
-
-PER-CPU MEM
-  devm_alloc_percpu()
-  devm_free_percpu()
-
-PCI
-  devm_pci_alloc_host_bridge()  : managed PCI host bridge allocation
-  devm_pci_remap_cfgspace()	: ioremap PCI configuration space
-  devm_pci_remap_cfg_resource()	: ioremap PCI configuration space resource
-  pcim_enable_device()		: after success, all PCI ops become managed
-  pcim_pin_device()		: keep PCI device enabled after release
-
-PHY
-  devm_usb_get_phy()
-  devm_usb_put_phy()
-
-PINCTRL
-  devm_pinctrl_get()
-  devm_pinctrl_put()
-  devm_pinctrl_register()
-  devm_pinctrl_unregister()
-
-POWER
-  devm_reboot_mode_register()
-  devm_reboot_mode_unregister()
-
-PWM
-  devm_pwm_get()
-  devm_pwm_put()
-
-REGULATOR
-  devm_regulator_bulk_get()
-  devm_regulator_get()
-  devm_regulator_put()
-  devm_regulator_register()
-
-RESET
-  devm_reset_control_get()
-  devm_reset_controller_register()
-
-SERDEV
-  devm_serdev_device_open()
-
-SLAVE DMA ENGINE
-  devm_acpi_dma_controller_register()
-
-SPI
-  devm_spi_register_master()
-
-WATCHDOG
-  devm_watchdog_register_device()
diff --git a/Documentation/driver-model/driver.rst b/Documentation/driver-model/driver.rst
deleted file mode 100644
index 11d281506a04..000000000000
--- a/Documentation/driver-model/driver.rst
+++ /dev/null
@@ -1,223 +0,0 @@
-==============
-Device Drivers
-==============
-
-See the kerneldoc for the struct device_driver.
-
-
-Allocation
-~~~~~~~~~~
-
-Device drivers are statically allocated structures. Though there may
-be multiple devices in a system that a driver supports, struct
-device_driver represents the driver as a whole (not a particular
-device instance).
-
-Initialization
-~~~~~~~~~~~~~~
-
-The driver must initialize at least the name and bus fields. It should
-also initialize the devclass field (when it arrives), so it may obtain
-the proper linkage internally. It should also initialize as many of
-the callbacks as possible, though each is optional.
-
-Declaration
-~~~~~~~~~~~
-
-As stated above, struct device_driver objects are statically
-allocated. Below is an example declaration of the eepro100
-driver. This declaration is hypothetical only; it relies on the driver
-being converted completely to the new model::
-
-  static struct device_driver eepro100_driver = {
-         .name		= "eepro100",
-         .bus		= &pci_bus_type,
-
-         .probe		= eepro100_probe,
-         .remove		= eepro100_remove,
-         .suspend		= eepro100_suspend,
-         .resume		= eepro100_resume,
-  };
-
-Most drivers will not be able to be converted completely to the new
-model because the bus they belong to has a bus-specific structure with
-bus-specific fields that cannot be generalized.
-
-The most common examples of this are device ID structures. A driver
-typically defines an array of device IDs that it supports. The format
-of these structures and the semantics for comparing device IDs are
-completely bus-specific. Defining them as bus-specific entities would
-sacrifice type-safety, so we keep bus-specific structures around.
-
-Bus-specific drivers should include a generic struct device_driver in
-the definition of the bus-specific driver. Like this::
-
-  struct pci_driver {
-         const struct pci_device_id *id_table;
-         struct device_driver	  driver;
-  };
-
-A definition that included bus-specific fields would look like
-(using the eepro100 driver again)::
-
-  static struct pci_driver eepro100_driver = {
-         .id_table       = eepro100_pci_tbl,
-         .driver	       = {
-		.name		= "eepro100",
-		.bus		= &pci_bus_type,
-		.probe		= eepro100_probe,
-		.remove		= eepro100_remove,
-		.suspend	= eepro100_suspend,
-		.resume		= eepro100_resume,
-         },
-  };
-
-Some may find the syntax of embedded struct initialization awkward or
-even a bit ugly. So far, it's the best way we've found to do what we want...
-
-Registration
-~~~~~~~~~~~~
-
-::
-
-  int driver_register(struct device_driver *drv);
-
-The driver registers the structure on startup. For drivers that have
-no bus-specific fields (i.e. don't have a bus-specific driver
-structure), they would use driver_register and pass a pointer to their
-struct device_driver object.
-
-Most drivers, however, will have a bus-specific structure and will
-need to register with the bus using something like pci_driver_register.
-
-It is important that drivers register their driver structure as early as
-possible. Registration with the core initializes several fields in the
-struct device_driver object, including the reference count and the
-lock. These fields are assumed to be valid at all times and may be
-used by the device model core or the bus driver.
-
-
-Transition Bus Drivers
-~~~~~~~~~~~~~~~~~~~~~~
-
-By defining wrapper functions, the transition to the new model can be
-made easier. Drivers can ignore the generic structure altogether and
-let the bus wrapper fill in the fields. For the callbacks, the bus can
-define generic callbacks that forward the call to the bus-specific
-callbacks of the drivers.
-
-This solution is intended to be only temporary. In order to get class
-information in the driver, the drivers must be modified anyway. Since
-converting drivers to the new model should reduce some infrastructural
-complexity and code size, it is recommended that they are converted as
-class information is added.
-
-Access
-~~~~~~
-
-Once the object has been registered, it may access the common fields of
-the object, like the lock and the list of devices::
-
-  int driver_for_each_dev(struct device_driver *drv, void *data,
-			  int (*callback)(struct device *dev, void *data));
-
-The devices field is a list of all the devices that have been bound to
-the driver. The LDM core provides a helper function to operate on all
-the devices a driver controls. This helper locks the driver on each
-node access, and does proper reference counting on each device as it
-accesses it.
-
-
-sysfs
-~~~~~
-
-When a driver is registered, a sysfs directory is created in its
-bus's directory. In this directory, the driver can export an interface
-to userspace to control operation of the driver on a global basis;
-e.g. toggling debugging output in the driver.
-
-A future feature of this directory will be a 'devices' directory. This
-directory will contain symlinks to the directories of devices it
-supports.
-
-
-
-Callbacks
-~~~~~~~~~
-
-::
-
-	int	(*probe)	(struct device *dev);
-
-The probe() entry is called in task context, with the bus's rwsem locked
-and the driver partially bound to the device.  Drivers commonly use
-container_of() to convert "dev" to a bus-specific type, both in probe()
-and other routines.  That type often provides device resource data, such
-as pci_dev.resource[] or platform_device.resources, which is used in
-addition to dev->platform_data to initialize the driver.
-
-This callback holds the driver-specific logic to bind the driver to a
-given device.  That includes verifying that the device is present, that
-it's a version the driver can handle, that driver data structures can
-be allocated and initialized, and that any hardware can be initialized.
-Drivers often store a pointer to their state with dev_set_drvdata().
-When the driver has successfully bound itself to that device, then probe()
-returns zero and the driver model code will finish its part of binding
-the driver to that device.
-
-A driver's probe() may return a negative errno value to indicate that
-the driver did not bind to this device, in which case it should have
-released all resources it allocated::
-
-	int 	(*remove)	(struct device *dev);
-
-remove is called to unbind a driver from a device. This may be
-called if a device is physically removed from the system, if the
-driver module is being unloaded, during a reboot sequence, or
-in other cases.
-
-It is up to the driver to determine if the device is present or
-not. It should free any resources allocated specifically for the
-device; i.e. anything in the device's driver_data field.
-
-If the device is still present, it should quiesce the device and place
-it into a supported low-power state::
-
-	int	(*suspend)	(struct device *dev, pm_message_t state);
-
-suspend is called to put the device in a low power state::
-
-	int	(*resume)	(struct device *dev);
-
-Resume is used to bring a device back from a low power state.
-
-
-Attributes
-~~~~~~~~~~
-
-::
-
-  struct driver_attribute {
-          struct attribute        attr;
-          ssize_t (*show)(struct device_driver *driver, char *buf);
-          ssize_t (*store)(struct device_driver *, const char *buf, size_t count);
-  };
-
-Device drivers can export attributes via their sysfs directories.
-Drivers can declare attributes using a DRIVER_ATTR_RW and DRIVER_ATTR_RO
-macro that works identically to the DEVICE_ATTR_RW and DEVICE_ATTR_RO
-macros.
-
-Example::
-
-	DRIVER_ATTR_RW(debug);
-
-This is equivalent to declaring::
-
-	struct driver_attribute driver_attr_debug;
-
-This can then be used to add and remove the attribute from the
-driver's directory using::
-
-  int driver_create_file(struct device_driver *, const struct driver_attribute *);
-  void driver_remove_file(struct device_driver *, const struct driver_attribute *);
diff --git a/Documentation/driver-model/index.rst b/Documentation/driver-model/index.rst
deleted file mode 100644
index 9f85d579ce56..000000000000
--- a/Documentation/driver-model/index.rst
+++ /dev/null
@@ -1,26 +0,0 @@
-:orphan:
-
-============
-Driver Model
-============
-
-.. toctree::
-   :maxdepth: 1
-
-   binding
-   bus
-   class
-   design-patterns
-   device
-   devres
-   driver
-   overview
-   platform
-   porting
-
-.. only::  subproject and html
-
-   Indices
-   =======
-
-   * :ref:`genindex`
diff --git a/Documentation/driver-model/overview.rst b/Documentation/driver-model/overview.rst
deleted file mode 100644
index d4d1e9b40e0c..000000000000
--- a/Documentation/driver-model/overview.rst
+++ /dev/null
@@ -1,124 +0,0 @@
-=============================
-The Linux Kernel Device Model
-=============================
-
-Patrick Mochel	<mochel@digitalimplant.org>
-
-Drafted 26 August 2002
-Updated 31 January 2006
-
-
-Overview
-~~~~~~~~
-
-The Linux Kernel Driver Model is a unification of all the disparate driver
-models that were previously used in the kernel. It is intended to augment the
-bus-specific drivers for bridges and devices by consolidating a set of data
-and operations into globally accessible data structures.
-
-Traditional driver models implemented some sort of tree-like structure
-(sometimes just a list) for the devices they control. There wasn't any
-uniformity across the different bus types.
-
-The current driver model provides a common, uniform data model for describing
-a bus and the devices that can appear under the bus. The unified bus
-model includes a set of common attributes which all busses carry, and a set
-of common callbacks, such as device discovery during bus probing, bus
-shutdown, bus power management, etc.
-
-The common device and bridge interface reflects the goals of the modern
-computer: namely the ability to do seamless device "plug and play", power
-management, and hot plug. In particular, the model dictated by Intel and
-Microsoft (namely ACPI) ensures that almost every device on almost any bus
-on an x86-compatible system can work within this paradigm.  Of course,
-not every bus is able to support all such operations, although most
-buses support most of those operations.
-
-
-Downstream Access
-~~~~~~~~~~~~~~~~~
-
-Common data fields have been moved out of individual bus layers into a common
-data structure. These fields must still be accessed by the bus layers,
-and sometimes by the device-specific drivers.
-
-Other bus layers are encouraged to do what has been done for the PCI layer.
-struct pci_dev now looks like this::
-
-  struct pci_dev {
-	...
-
-	struct device dev;     /* Generic device interface */
-	...
-  };
-
-Note first that the struct device dev within the struct pci_dev is
-statically allocated. This means only one allocation on device discovery.
-
-Note also that struct device dev is not necessarily defined at the
-front of the pci_dev structure.  This is to make people think about what
-they're doing when switching between the bus driver and the global driver,
-and to discourage meaningless and incorrect casts between the two.
-
-The PCI bus layer freely accesses the fields of struct device. It knows about
-the structure of struct pci_dev, and it should know the structure of struct
-device. Individual PCI device drivers that have been converted to the current
-driver model generally do not and should not touch the fields of struct device,
-unless there is a compelling reason to do so.
-
-The above abstraction prevents unnecessary pain during transitional phases.
-If it were not done this way, then when a field was renamed or removed, every
-downstream driver would break.  On the other hand, if only the bus layer
-(and not the device layer) accesses the struct device, it is only the bus
-layer that needs to change.
-
-
-User Interface
-~~~~~~~~~~~~~~
-
-By virtue of having a complete hierarchical view of all the devices in the
-system, exporting a complete hierarchical view to userspace becomes relatively
-easy. This has been accomplished by implementing a special purpose virtual
-file system named sysfs.
-
-Almost all mainstream Linux distros mount this filesystem automatically; you
-can see some variation of the following in the output of the "mount" command::
-
-  $ mount
-  ...
-  none on /sys type sysfs (rw,noexec,nosuid,nodev)
-  ...
-  $
-
-The auto-mounting of sysfs is typically accomplished by an entry similar to
-the following in the /etc/fstab file::
-
-  none     	/sys	sysfs    defaults	  	0 0
-
-or something similar in the /lib/init/fstab file on Debian-based systems::
-
-  none            /sys    sysfs    nodev,noexec,nosuid    0 0
-
-If sysfs is not automatically mounted, you can always do it manually with::
-
-	# mount -t sysfs sysfs /sys
-
-Whenever a device is inserted into the tree, a directory is created for it.
-This directory may be populated at each layer of discovery - the global layer,
-the bus layer, or the device layer.
-
-The global layer currently creates two files - 'name' and 'power'. The
-former only reports the name of the device. The latter reports the
-current power state of the device. It will also be used to set the current
-power state.
-
-The bus layer may also create files for the devices it finds while probing the
-bus. For example, the PCI layer currently creates 'irq' and 'resource' files
-for each PCI device.
-
-A device-specific driver may also export files in its directory to expose
-device-specific data or tunable interfaces.
-
-More information about the sysfs directory layout can be found in
-the other documents in this directory and in the file
-Documentation/filesystems/sysfs.txt.
diff --git a/Documentation/driver-model/platform.rst b/Documentation/driver-model/platform.rst
deleted file mode 100644
index 334dd4071ae4..000000000000
--- a/Documentation/driver-model/platform.rst
+++ /dev/null
@@ -1,246 +0,0 @@
-============================
-Platform Devices and Drivers
-============================
-
-See <linux/platform_device.h> for the driver model interface to the
-platform bus:  platform_device, and platform_driver.  This pseudo-bus
-is used to connect devices on busses with minimal infrastructure,
-like those used to integrate peripherals on many system-on-chip
-processors, or some "legacy" PC interconnects; as opposed to large
-formally specified ones like PCI or USB.
-
-
-Platform devices
-~~~~~~~~~~~~~~~~
-Platform devices are devices that typically appear as autonomous
-entities in the system. This includes legacy port-based devices and
-host bridges to peripheral buses, and most controllers integrated
-into system-on-chip platforms.  What they usually have in common
-is direct addressing from a CPU bus.  Rarely, a platform_device will
-be connected through a segment of some other kind of bus; but its
-registers will still be directly addressable.
-
-Platform devices are given a name, used in driver binding, and a
-list of resources such as addresses and IRQs::
-
-  struct platform_device {
-	const char	*name;
-	u32		id;
-	struct device	dev;
-	u32		num_resources;
-	struct resource	*resource;
-  };
-
-
-Platform drivers
-~~~~~~~~~~~~~~~~
-Platform drivers follow the standard driver model convention, where
-discovery/enumeration is handled outside the drivers, and drivers
-provide probe() and remove() methods.  They support power management
-and shutdown notifications using the standard conventions::
-
-  struct platform_driver {
-	int (*probe)(struct platform_device *);
-	int (*remove)(struct platform_device *);
-	void (*shutdown)(struct platform_device *);
-	int (*suspend)(struct platform_device *, pm_message_t state);
-	int (*suspend_late)(struct platform_device *, pm_message_t state);
-	int (*resume_early)(struct platform_device *);
-	int (*resume)(struct platform_device *);
-	struct device_driver driver;
-  };
-
-Note that probe() should in general verify that the specified device hardware
-actually exists; sometimes platform setup code can't be sure.  The probing
-can use device resources, including clocks, and device platform_data.
-
-Platform drivers register themselves the normal way::
-
-	int platform_driver_register(struct platform_driver *drv);
-
-Or, in common situations where the device is known not to be hot-pluggable,
-the probe() routine can live in an init section to reduce the driver's
-runtime memory footprint::
-
-	int platform_driver_probe(struct platform_driver *drv,
-			  int (*probe)(struct platform_device *))
-
-Kernel modules can be composed of several platform drivers. The platform core
-provides helpers to register and unregister an array of drivers::
-
-	int __platform_register_drivers(struct platform_driver * const *drivers,
-				      unsigned int count, struct module *owner);
-	void platform_unregister_drivers(struct platform_driver * const *drivers,
-					 unsigned int count);
-
-If one of the drivers fails to register, all drivers registered up to that
-point will be unregistered in reverse order. Note that there is a convenience
-macro that passes THIS_MODULE as owner parameter::
-
-	#define platform_register_drivers(drivers, count)
-
-
-Device Enumeration
-~~~~~~~~~~~~~~~~~~
-As a rule, platform specific (and often board-specific) setup code will
-register platform devices::
-
-	int platform_device_register(struct platform_device *pdev);
-
-	int platform_add_devices(struct platform_device **pdevs, int ndev);
-
-The general rule is to register only those devices that actually exist,
-but in some cases extra devices might be registered.  For example, a kernel
-might be configured to work with an external network adapter that might not
-be populated on all boards, or likewise to work with an integrated controller
-that some boards might not hook up to any peripherals.
-
-In some cases, boot firmware will export tables describing the devices
-that are populated on a given board.   Without such tables, often the
-only way for system setup code to set up the correct devices is to build
-a kernel for a specific target board.  Such board-specific kernels are
-common with embedded and custom systems development.
-
-In many cases, the memory and IRQ resources associated with the platform
-device are not enough to let the device's driver work.  Board setup code
-will often provide additional information using the device's platform_data
-field.
-
-Embedded systems frequently need one or more clocks for platform devices,
-which are normally kept off until they're actively needed (to save power).
-System setup also associates those clocks with the device, so that
-calls to clk_get(&pdev->dev, clock_name) return them as needed.
-
-
-Legacy Drivers:  Device Probing
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Some drivers are not fully converted to the driver model, because they take
-on a non-driver role:  the driver registers its platform device, rather than
-leaving that for system infrastructure.  Such drivers can't be hotplugged
-or coldplugged, since those mechanisms require device creation to be in a
-different system component than the driver.
-
-The only "good" reason for this is to handle older system designs which, like
-original IBM PCs, rely on error-prone "probe-the-hardware" models for hardware
-configuration.  Newer systems have largely abandoned that model, in favor of
-bus-level support for dynamic configuration (PCI, USB), or device tables
-provided by the boot firmware (e.g. PNPACPI on x86).  There are too many
-conflicting options about what might be where, and even educated guesses by
-an operating system will be wrong often enough to make trouble.
-
-This style of driver is discouraged.  If you're updating such a driver,
-please try to move the device enumeration to a more appropriate location,
-outside the driver.  This will usually be cleanup, since such drivers
-tend to already have "normal" modes, such as ones using device nodes that
-were created by PNP or by platform device setup.
-
-None the less, there are some APIs to support such legacy drivers.  Avoid
-using these calls except with such hotplug-deficient drivers::
-
-	struct platform_device *platform_device_alloc(
-			const char *name, int id);
-
-You can use platform_device_alloc() to dynamically allocate a device, which
-you will then initialize with resources and platform_device_register().
-A better solution is usually::
-
-	struct platform_device *platform_device_register_simple(
-			const char *name, int id,
-			struct resource *res, unsigned int nres);
-
-You can use platform_device_register_simple() as a one-step call to allocate
-and register a device.
-
-
-Device Naming and Driver Binding
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-The platform_device.dev.bus_id is the canonical name for the devices.
-It's built from two components:
-
-    * platform_device.name ... which is also used for driver matching.
-
-    * platform_device.id ... the device instance number, or else "-1"
-      to indicate there's only one.
-
-These are concatenated, so name/id "serial"/0 indicates bus_id "serial.0", and
-"serial"/3 indicates bus_id "serial.3"; both would use the platform_driver
-named "serial".  While "my_rtc"/-1 would be bus_id "my_rtc" (no instance id)
-and use the platform_driver called "my_rtc".
-
-Driver binding is performed automatically by the driver core, invoking
-driver probe() after finding a match between device and driver.  If the
-probe() succeeds, the driver and device are bound as usual.  There are
-three different ways to find such a match:
-
-    - Whenever a device is registered, the drivers for that bus are
-      checked for matches.  Platform devices should be registered very
-      early during system boot.
-
-    - When a driver is registered using platform_driver_register(), all
-      unbound devices on that bus are checked for matches.  Drivers
-      usually register later during booting, or by module loading.
-
-    - Registering a driver using platform_driver_probe() works just like
-      using platform_driver_register(), except that the driver won't
-      be probed later if another device registers.  (Which is OK, since
-      this interface is only for use with non-hotpluggable devices.)
-
-
-Early Platform Devices and Drivers
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-The early platform interfaces provide platform data to platform device
-drivers early on during the system boot. The code is built on top of the
-early_param() command line parsing and can be executed very early on.
-
-Example: "earlyprintk" class early serial console in 6 steps
-
-1. Registering early platform device data
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-The architecture code registers platform device data using the function
-early_platform_add_devices(). In the case of early serial console this
-should be hardware configuration for the serial port. Devices registered
-at this point will later on be matched against early platform drivers.
-
-2. Parsing kernel command line
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-The architecture code calls parse_early_param() to parse the kernel
-command line. This will execute all matching early_param() callbacks.
-User specified early platform devices will be registered at this point.
-For the early serial console case the user can specify port on the
-kernel command line as "earlyprintk=serial.0" where "earlyprintk" is
-the class string, "serial" is the name of the platform driver and
-0 is the platform device id. If the id is -1 then the dot and the
-id can be omitted.
-
-3. Installing early platform drivers belonging to a certain class
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-The architecture code may optionally force registration of all early
-platform drivers belonging to a certain class using the function
-early_platform_driver_register_all(). User specified devices from
-step 2 have priority over these. This step is omitted by the serial
-driver example since the early serial driver code should be disabled
-unless the user has specified port on the kernel command line.
-
-4. Early platform driver registration
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Compiled-in platform drivers making use of early_platform_init() are
-automatically registered during step 2 or 3. The serial driver example
-should use early_platform_init("earlyprintk", &platform_driver).
-
-5. Probing of early platform drivers belonging to a certain class
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-The architecture code calls early_platform_driver_probe() to match
-registered early platform devices associated with a certain class with
-registered early platform drivers. Matched devices will get probed().
-This step can be executed at any point during the early boot. As soon
-as possible may be good for the serial port case.
-
-6. Inside the early platform driver probe()
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-The driver code needs to take special care during early boot, especially
-when it comes to memory allocation and interrupt registration. The code
-in the probe() function can use is_early_platform_device() to check if
-it is called at early platform device or at the regular platform device
-time. The early serial driver performs register_console() at this point.
-
-For further information, see <linux/platform_device.h>.
diff --git a/Documentation/driver-model/porting.rst b/Documentation/driver-model/porting.rst
deleted file mode 100644
index ae4bf843c1d6..000000000000
--- a/Documentation/driver-model/porting.rst
+++ /dev/null
@@ -1,448 +0,0 @@
-=======================================
-Porting Drivers to the New Driver Model
-=======================================
-
-Patrick Mochel
-
-7 January 2003
-
-
-Overview
-
-Please refer to `Documentation/driver-model/*.rst` for definitions of
-various driver types and concepts.
-
-Most of the work of porting device drivers to the new model happens
-at the bus driver layer. This was intentional, to minimize the
-negative effect on kernel drivers, and to allow a gradual transition
-of bus drivers.
-
-In a nutshell, the driver model consists of a set of objects that can
-be embedded in larger, bus-specific objects. Fields in these generic
-objects can replace fields in the bus-specific objects.
-
-The generic objects must be registered with the driver model core. By
-doing so, they will be exported via the sysfs filesystem. sysfs can be
-mounted by doing::
-
-	# mount -t sysfs sysfs /sys
-
-
-
-The Process
-
-Step 0: Read include/linux/device.h for object and function definitions.
-
-Step 1: Registering the bus driver.
-
-
-- Define a struct bus_type for the bus driver::
-
-    struct bus_type pci_bus_type = {
-          .name           = "pci",
-    };
-
-
-- Register the bus type.
-
-  This should be done in the initialization function for the bus type,
-  which is usually the module_init(), or equivalent, function::
-
-    static int __init pci_driver_init(void)
-    {
-            return bus_register(&pci_bus_type);
-    }
-
-    subsys_initcall(pci_driver_init);
-
-
-  The bus type may be unregistered (if the bus driver may be compiled
-  as a module) by doing::
-
-     bus_unregister(&pci_bus_type);
-
-
-- Export the bus type for others to use.
-
-  Other code may wish to reference the bus type, so declare it in a
-  shared header file and export the symbol.
-
-From include/linux/pci.h::
-
-  extern struct bus_type pci_bus_type;
-
-
-From the file the above code appears in::
-
-  EXPORT_SYMBOL(pci_bus_type);
-
-
-
-- This will cause the bus to show up in /sys/bus/pci/ with two
-  subdirectories: 'devices' and 'drivers'::
-
-    # tree -d /sys/bus/pci/
-    /sys/bus/pci/
-    |-- devices
-    `-- drivers
-
-
-
-Step 2: Registering Devices.
-
-struct device represents a single device. It mainly contains metadata
-describing the relationship the device has to other entities.
-
-
-- Embed a struct device in the bus-specific device type::
-
-
-    struct pci_dev {
-           ...
-           struct  device  dev;            /* Generic device interface */
-           ...
-    };
-
-  It is recommended that the generic device not be the first item in
-  the struct to discourage programmers from doing mindless casts
-  between the object types. Instead macros, or inline functions,
-  should be created to convert from the generic object type::
-
-
-    #define to_pci_dev(n) container_of(n, struct pci_dev, dev)
-
-    or
-
-    static inline struct pci_dev * to_pci_dev(struct kobject * kobj)
-    {
-	return container_of(n, struct pci_dev, dev);
-    }
-
-  This allows the compiler to verify type-safety of the operations
-  that are performed (which is Good).
-
-
-- Initialize the device on registration.
-
-  When devices are discovered or registered with the bus type, the
-  bus driver should initialize the generic device. The most important
-  things to initialize are the bus_id, parent, and bus fields.
-
-  The bus_id is an ASCII string that contains the device's address on
-  the bus. The format of this string is bus-specific. This is
-  necessary for representing devices in sysfs.
-
-  parent is the physical parent of the device. It is important that
-  the bus driver sets this field correctly.
-
-  The driver model maintains an ordered list of devices that it uses
-  for power management. This list must be in order to guarantee that
-  devices are shut down before their physical parents, and vice versa.
-  The order of this list is determined by the parent of registered
-  devices.
-
-  Also, the location of the device's sysfs directory depends on a
-  device's parent. sysfs exports a directory structure that mirrors
-  the device hierarchy. Accurately setting the parent guarantees that
-  sysfs will accurately represent the hierarchy.
-
-  The device's bus field is a pointer to the bus type the device
-  belongs to. This should be set to the bus_type that was declared
-  and initialized before.
-
-  Optionally, the bus driver may set the device's name and release
-  fields.
-
-  The name field is an ASCII string describing the device, like
-
-     "ATI Technologies Inc Radeon QD"
-
-  The release field is a callback that the driver model core calls
-  when the device has been removed, and all references to it have
-  been released. More on this in a moment.
-
-
-- Register the device.
-
-  Once the generic device has been initialized, it can be registered
-  with the driver model core by doing::
-
-       device_register(&dev->dev);
-
-  It can later be unregistered by doing::
-
-       device_unregister(&dev->dev);
-
-  This should happen on buses that support hotpluggable devices.
-  If a bus driver unregisters a device, it should not immediately free
-  it. It should instead wait for the driver model core to call the
-  device's release method, then free the bus-specific object.
-  (There may be other code that is currently referencing the device
-  structure, and it would be rude to free the device while that is
-  happening).
-
-
-  When the device is registered, a directory in sysfs is created.
-  The PCI tree in sysfs looks like::
-
-    /sys/devices/pci0/
-    |-- 00:00.0
-    |-- 00:01.0
-    |   `-- 01:00.0
-    |-- 00:02.0
-    |   `-- 02:1f.0
-    |       `-- 03:00.0
-    |-- 00:1e.0
-    |   `-- 04:04.0
-    |-- 00:1f.0
-    |-- 00:1f.1
-    |   |-- ide0
-    |   |   |-- 0.0
-    |   |   `-- 0.1
-    |   `-- ide1
-    |       `-- 1.0
-    |-- 00:1f.2
-    |-- 00:1f.3
-    `-- 00:1f.5
-
-  Also, symlinks are created in the bus's 'devices' directory
-  that point to the device's directory in the physical hierarchy::
-
-    /sys/bus/pci/devices/
-    |-- 00:00.0 -> ../../../devices/pci0/00:00.0
-    |-- 00:01.0 -> ../../../devices/pci0/00:01.0
-    |-- 00:02.0 -> ../../../devices/pci0/00:02.0
-    |-- 00:1e.0 -> ../../../devices/pci0/00:1e.0
-    |-- 00:1f.0 -> ../../../devices/pci0/00:1f.0
-    |-- 00:1f.1 -> ../../../devices/pci0/00:1f.1
-    |-- 00:1f.2 -> ../../../devices/pci0/00:1f.2
-    |-- 00:1f.3 -> ../../../devices/pci0/00:1f.3
-    |-- 00:1f.5 -> ../../../devices/pci0/00:1f.5
-    |-- 01:00.0 -> ../../../devices/pci0/00:01.0/01:00.0
-    |-- 02:1f.0 -> ../../../devices/pci0/00:02.0/02:1f.0
-    |-- 03:00.0 -> ../../../devices/pci0/00:02.0/02:1f.0/03:00.0
-    `-- 04:04.0 -> ../../../devices/pci0/00:1e.0/04:04.0
-
-
-
-Step 3: Registering Drivers.
-
-struct device_driver is a simple driver structure that contains a set
-of operations that the driver model core may call.
-
-
-- Embed a struct device_driver in the bus-specific driver.
-
-  Just like with devices, do something like::
-
-    struct pci_driver {
-           ...
-           struct device_driver    driver;
-    };
-
-
-- Initialize the generic driver structure.
-
-  When the driver registers with the bus (e.g. doing pci_register_driver()),
-  initialize the necessary fields of the driver: the name and bus
-  fields.
-
-
-- Register the driver.
-
-  After the generic driver has been initialized, call::
-
-	driver_register(&drv->driver);
-
-  to register the driver with the core.
-
-  When the driver is unregistered from the bus, unregister it from the
-  core by doing::
-
-        driver_unregister(&drv->driver);
-
-  Note that this will block until all references to the driver have
-  gone away. Normally, there will not be any.
-
-
-- Sysfs representation.
-
-  Drivers are exported via sysfs in their bus's 'driver's directory.
-  For example::
-
-    /sys/bus/pci/drivers/
-    |-- 3c59x
-    |-- Ensoniq AudioPCI
-    |-- agpgart-amdk7
-    |-- e100
-    `-- serial
-
-
-Step 4: Define Generic Methods for Drivers.
-
-struct device_driver defines a set of operations that the driver model
-core calls. Most of these operations are probably similar to
-operations the bus already defines for drivers, but taking different
-parameters.
-
-It would be difficult and tedious to force every driver on a bus to
-simultaneously convert their drivers to generic format. Instead, the
-bus driver should define single instances of the generic methods that
-forward call to the bus-specific drivers. For instance::
-
-
-  static int pci_device_remove(struct device * dev)
-  {
-          struct pci_dev * pci_dev = to_pci_dev(dev);
-          struct pci_driver * drv = pci_dev->driver;
-
-          if (drv) {
-                  if (drv->remove)
-                          drv->remove(pci_dev);
-                  pci_dev->driver = NULL;
-          }
-          return 0;
-  }
-
-
-The generic driver should be initialized with these methods before it
-is registered::
-
-        /* initialize common driver fields */
-        drv->driver.name = drv->name;
-        drv->driver.bus = &pci_bus_type;
-        drv->driver.probe = pci_device_probe;
-        drv->driver.resume = pci_device_resume;
-        drv->driver.suspend = pci_device_suspend;
-        drv->driver.remove = pci_device_remove;
-
-        /* register with core */
-        driver_register(&drv->driver);
-
-
-Ideally, the bus should only initialize the fields if they are not
-already set. This allows the drivers to implement their own generic
-methods.
-
-
-Step 5: Support generic driver binding.
-
-The model assumes that a device or driver can be dynamically
-registered with the bus at any time. When registration happens,
-devices must be bound to a driver, or drivers must be bound to all
-devices that it supports.
-
-A driver typically contains a list of device IDs that it supports. The
-bus driver compares these IDs to the IDs of devices registered with it.
-The format of the device IDs, and the semantics for comparing them are
-bus-specific, so the generic model does attempt to generalize them.
-
-Instead, a bus may supply a method in struct bus_type that does the
-comparison::
-
-  int (*match)(struct device * dev, struct device_driver * drv);
-
-match should return positive value if the driver supports the device,
-and zero otherwise. It may also return error code (for example
--EPROBE_DEFER) if determining that given driver supports the device is
-not possible.
-
-When a device is registered, the bus's list of drivers is iterated
-over. bus->match() is called for each one until a match is found.
-
-When a driver is registered, the bus's list of devices is iterated
-over. bus->match() is called for each device that is not already
-claimed by a driver.
-
-When a device is successfully bound to a driver, device->driver is
-set, the device is added to a per-driver list of devices, and a
-symlink is created in the driver's sysfs directory that points to the
-device's physical directory::
-
-  /sys/bus/pci/drivers/
-  |-- 3c59x
-  |   `-- 00:0b.0 -> ../../../../devices/pci0/00:0b.0
-  |-- Ensoniq AudioPCI
-  |-- agpgart-amdk7
-  |   `-- 00:00.0 -> ../../../../devices/pci0/00:00.0
-  |-- e100
-  |   `-- 00:0c.0 -> ../../../../devices/pci0/00:0c.0
-  `-- serial
-
-
-This driver binding should replace the existing driver binding
-mechanism the bus currently uses.
-
-
-Step 6: Supply a hotplug callback.
-
-Whenever a device is registered with the driver model core, the
-userspace program /sbin/hotplug is called to notify userspace.
-Users can define actions to perform when a device is inserted or
-removed.
-
-The driver model core passes several arguments to userspace via
-environment variables, including
-
-- ACTION: set to 'add' or 'remove'
-- DEVPATH: set to the device's physical path in sysfs.
-
-A bus driver may also supply additional parameters for userspace to
-consume. To do this, a bus must implement the 'hotplug' method in
-struct bus_type::
-
-     int (*hotplug) (struct device *dev, char **envp,
-                     int num_envp, char *buffer, int buffer_size);
-
-This is called immediately before /sbin/hotplug is executed.
-
-
-Step 7: Cleaning up the bus driver.
-
-The generic bus, device, and driver structures provide several fields
-that can replace those defined privately to the bus driver.
-
-- Device list.
-
-struct bus_type contains a list of all devices registered with the bus
-type. This includes all devices on all instances of that bus type.
-An internal list that the bus uses may be removed, in favor of using
-this one.
-
-The core provides an iterator to access these devices::
-
-  int bus_for_each_dev(struct bus_type * bus, struct device * start,
-                       void * data, int (*fn)(struct device *, void *));
-
-
-- Driver list.
-
-struct bus_type also contains a list of all drivers registered with
-it. An internal list of drivers that the bus driver maintains may
-be removed in favor of using the generic one.
-
-The drivers may be iterated over, like devices::
-
-  int bus_for_each_drv(struct bus_type * bus, struct device_driver * start,
-                       void * data, int (*fn)(struct device_driver *, void *));
-
-
-Please see drivers/base/bus.c for more information.
-
-
-- rwsem
-
-struct bus_type contains an rwsem that protects all core accesses to
-the device and driver lists. This can be used by the bus driver
-internally, and should be used when accessing the device or driver
-lists the bus maintains.
-
-
-- Device and driver fields.
-
-Some of the fields in struct device and struct device_driver duplicate
-fields in the bus-specific representations of these objects. Feel free
-to remove the bus-specific ones and favor the generic ones. Note
-though, that this will likely mean fixing up all the drivers that
-reference the bus-specific fields (though those should all be 1-line
-changes).
diff --git a/Documentation/eisa.txt b/Documentation/eisa.txt
index f388545a85a7..c07565ba57da 100644
--- a/Documentation/eisa.txt
+++ b/Documentation/eisa.txt
@@ -103,7 +103,7 @@ id_table	an array of NULL terminated EISA id strings,
 		(driver_data).
 
 driver		a generic driver, such as described in
-		Documentation/driver-model/driver.rst. Only .name,
+		Documentation/driver-api/driver-model/driver.rst. Only .name,
 		.probe and .remove members are mandatory.
 =============== ====================================================
 
@@ -152,7 +152,7 @@ state    set of flags indicating the state of the device. Current
 	 flags are EISA_CONFIG_ENABLED and EISA_CONFIG_FORCED.
 res	 set of four 256 bytes I/O regions allocated to this device
 dma_mask DMA mask set from the parent device.
-dev	 generic device (see Documentation/driver-model/device.rst)
+dev	 generic device (see Documentation/driver-api/driver-model/device.rst)
 ======== ============================================================
 
 You can get the 'struct eisa_device' from 'struct device' using the
diff --git a/Documentation/filesystems/sysfs.txt b/Documentation/filesystems/sysfs.txt
index 5b5311f9358d..ddf15b1b0d5a 100644
--- a/Documentation/filesystems/sysfs.txt
+++ b/Documentation/filesystems/sysfs.txt
@@ -319,7 +319,7 @@ quick way to lookup the sysfs interface for a device from the result of
 a stat(2) operation.
 
 More information can driver-model specific features can be found in
-Documentation/driver-model/. 
+Documentation/driver-api/driver-model/.
 
 
 TODO: Finish this section.
diff --git a/Documentation/hwmon/submitting-patches.rst b/Documentation/hwmon/submitting-patches.rst
index d5b05d3e54ba..452fc28d8e0b 100644
--- a/Documentation/hwmon/submitting-patches.rst
+++ b/Documentation/hwmon/submitting-patches.rst
@@ -89,7 +89,7 @@ increase the chances of your change being accepted.
   console. Excessive logging can seriously affect system performance.
 
 * Use devres functions whenever possible to allocate resources. For rationale
-  and supported functions, please see Documentation/driver-model/devres.rst.
+  and supported functions, please see Documentation/driver-api/driver-model/devres.rst.
   If a function is not supported by devres, consider using devm_add_action().
 
 * If the driver has a detect function, make sure it is silent. Debug messages
diff --git a/Documentation/translations/zh_CN/filesystems/sysfs.txt b/Documentation/translations/zh_CN/filesystems/sysfs.txt
index 452271dda141..ee1f37da5b23 100644
--- a/Documentation/translations/zh_CN/filesystems/sysfs.txt
+++ b/Documentation/translations/zh_CN/filesystems/sysfs.txt
@@ -288,7 +288,7 @@ dev/ 包含两个子目录： char/ 和 block/。在这两个子目录中，有
 中相应的设备。/sys/dev 提供一个通过一个 stat(2) 操作结果，查找
 设备 sysfs 接口快捷的方法。
 
-更多有关 driver-model 的特性信息可以在 Documentation/driver-model/
+更多有关 driver-model 的特性信息可以在 Documentation/driver-api/driver-model/
 中找到。
 
 
diff --git a/drivers/base/platform.c b/drivers/base/platform.c
index 713903290385..506a0175a5a7 100644
--- a/drivers/base/platform.c
+++ b/drivers/base/platform.c
@@ -5,7 +5,7 @@
  * Copyright (c) 2002-3 Patrick Mochel
  * Copyright (c) 2002-3 Open Source Development Labs
  *
- * Please see Documentation/driver-model/platform.rst for more
+ * Please see Documentation/driver-api/driver-model/platform.rst for more
  * information.
  */
 
diff --git a/drivers/gpio/gpio-cs5535.c b/drivers/gpio/gpio-cs5535.c
index 3611a0571667..53b24e3ae7de 100644
--- a/drivers/gpio/gpio-cs5535.c
+++ b/drivers/gpio/gpio-cs5535.c
@@ -41,7 +41,7 @@ MODULE_PARM_DESC(mask, "GPIO channel mask.");
 
 /*
  * FIXME: convert this singleton driver to use the state container
- * design pattern, see Documentation/driver-model/design-patterns.rst
+ * design pattern, see Documentation/driver-api/driver-model/design-patterns.rst
  */
 static struct cs5535_gpio_chip {
 	struct gpio_chip chip;
diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c
index 41c90f2ddb31..63db08d9bafa 100644
--- a/drivers/net/ethernet/intel/ice/ice_main.c
+++ b/drivers/net/ethernet/intel/ice/ice_main.c
@@ -2286,7 +2286,7 @@ ice_probe(struct pci_dev *pdev, const struct pci_device_id __always_unused *ent)
 	struct ice_hw *hw;
 	int err;
 
-	/* this driver uses devres, see Documentation/driver-model/devres.rst */
+	/* this driver uses devres, see Documentation/driver-api/driver-model/devres.rst */
 	err = pcim_enable_device(pdev);
 	if (err)
 		return err;
diff --git a/drivers/staging/unisys/Documentation/overview.txt b/drivers/staging/unisys/Documentation/overview.txt
index 9ab30af265a5..f8a4144b239c 100644
--- a/drivers/staging/unisys/Documentation/overview.txt
+++ b/drivers/staging/unisys/Documentation/overview.txt
@@ -15,7 +15,7 @@ normally be unsharable, specifically:
 * visorinput - keyboard and mouse
 
 These drivers conform to the standard Linux bus/device model described
-within Documentation/driver-model/, and utilize a driver named visorbus to
+within Documentation/driver-api/driver-model/, and utilize a driver named visorbus to
 present the virtual busses involved. Drivers in the 'visor*' driver set are
 commonly referred to as "guest drivers" or "client drivers".  All drivers
 except visorbus expose a device of a specific usable class to the Linux guest
@@ -141,7 +141,7 @@ called automatically by the visorbus driver at appropriate times:
 -----------------------------------
 
 Because visorbus is a standard Linux bus driver in the model described in
-Documentation/driver-model/, the hierarchy of s-Par virtual devices is
+Documentation/driver-api/driver-model/, the hierarchy of s-Par virtual devices is
 published in the sysfs tree beneath /bus/visorbus/, e.g.,
 /sys/bus/visorbus/devices/ might look like:
 
diff --git a/include/linux/device.h b/include/linux/device.h
index 5eabfa0c4dee..c330b75c6c57 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -6,7 +6,7 @@
  * Copyright (c) 2004-2009 Greg Kroah-Hartman <gregkh@suse.de>
  * Copyright (c) 2008-2009 Novell Inc.
  *
- * See Documentation/driver-model/ for more information.
+ * See Documentation/driver-api/driver-model/ for more information.
  */
 
 #ifndef _DEVICE_H_
diff --git a/include/linux/platform_device.h b/include/linux/platform_device.h
index beb25f277889..9bc36b589827 100644
--- a/include/linux/platform_device.h
+++ b/include/linux/platform_device.h
@@ -4,7 +4,7 @@
  *
  * Copyright (c) 2001-2003 Patrick Mochel <mochel@osdl.org>
  *
- * See Documentation/driver-model/ for more information.
+ * See Documentation/driver-api/driver-model/ for more information.
  */
 
 #ifndef _PLATFORM_DEVICE_H_
diff --git a/scripts/coccinelle/free/devm_free.cocci b/scripts/coccinelle/free/devm_free.cocci
index fefd0331a2de..441799b5359b 100644
--- a/scripts/coccinelle/free/devm_free.cocci
+++ b/scripts/coccinelle/free/devm_free.cocci
@@ -3,7 +3,7 @@
 /// functions.  Values allocated using the devm_functions are freed when
 /// the device is detached, and thus the use of the standard freeing
 /// function would cause a double free.
-/// See Documentation/driver-model/devres.rst for more information.
+/// See Documentation/driver-api/driver-model/devres.rst for more information.
 ///
 /// A difficulty of detecting this problem is that the standard freeing
 /// function might be called from a different function than the one
-- 
cgit v1.2.3


From da82c92f1150f66afabf78d2c85ef9ac18dc6d38 Mon Sep 17 00:00:00 2001
From: Mauro Carvalho Chehab <mchehab+samsung@kernel.org>
Date: Thu, 27 Jun 2019 13:08:35 -0300
Subject: docs: cgroup-v1: add it to the admin-guide book

Those files belong to the admin guide, so add them.

Signed-off-by: Mauro Carvalho Chehab <mchehab+samsung@kernel.org>
---
 .../admin-guide/cgroup-v1/blkio-controller.rst     |  302 ++++++
 Documentation/admin-guide/cgroup-v1/cgroups.rst    |  695 ++++++++++++++
 Documentation/admin-guide/cgroup-v1/cpuacct.rst    |   50 +
 Documentation/admin-guide/cgroup-v1/cpusets.rst    |  866 +++++++++++++++++
 Documentation/admin-guide/cgroup-v1/devices.rst    |  132 +++
 .../admin-guide/cgroup-v1/freezer-subsystem.rst    |  127 +++
 Documentation/admin-guide/cgroup-v1/hugetlb.rst    |   50 +
 Documentation/admin-guide/cgroup-v1/index.rst      |   28 +
 Documentation/admin-guide/cgroup-v1/memcg_test.rst |  355 +++++++
 Documentation/admin-guide/cgroup-v1/memory.rst     | 1003 ++++++++++++++++++++
 Documentation/admin-guide/cgroup-v1/net_cls.rst    |   44 +
 Documentation/admin-guide/cgroup-v1/net_prio.rst   |   57 ++
 Documentation/admin-guide/cgroup-v1/pids.rst       |   92 ++
 Documentation/admin-guide/cgroup-v1/rdma.rst       |  117 +++
 Documentation/admin-guide/cgroup-v2.rst            |    2 +-
 Documentation/admin-guide/index.rst                |    1 +
 Documentation/admin-guide/kernel-parameters.txt    |    4 +-
 .../admin-guide/mm/numa_memory_policy.rst          |    2 +-
 Documentation/block/bfq-iosched.rst                |    2 +-
 Documentation/cgroup-v1/blkio-controller.rst       |  302 ------
 Documentation/cgroup-v1/cgroups.rst                |  695 --------------
 Documentation/cgroup-v1/cpuacct.rst                |   50 -
 Documentation/cgroup-v1/cpusets.rst                |  866 -----------------
 Documentation/cgroup-v1/devices.rst                |  132 ---
 Documentation/cgroup-v1/freezer-subsystem.rst      |  127 ---
 Documentation/cgroup-v1/hugetlb.rst                |   50 -
 Documentation/cgroup-v1/index.rst                  |   30 -
 Documentation/cgroup-v1/memcg_test.rst             |  355 -------
 Documentation/cgroup-v1/memory.rst                 | 1003 --------------------
 Documentation/cgroup-v1/net_cls.rst                |   44 -
 Documentation/cgroup-v1/net_prio.rst               |   57 --
 Documentation/cgroup-v1/pids.rst                   |   92 --
 Documentation/cgroup-v1/rdma.rst                   |  117 ---
 Documentation/filesystems/tmpfs.txt                |    2 +-
 Documentation/kernel-per-CPU-kthreads.txt          |    2 +-
 Documentation/scheduler/sched-deadline.rst         |    2 +-
 Documentation/scheduler/sched-design-CFS.rst       |    2 +-
 Documentation/scheduler/sched-rt-group.rst         |    2 +-
 Documentation/vm/numa.rst                          |    4 +-
 Documentation/vm/page_migration.rst                |    2 +-
 Documentation/vm/unevictable-lru.rst               |    2 +-
 Documentation/x86/x86_64/fake-numa-for-cpusets.rst |    4 +-
 MAINTAINERS                                        |    4 +-
 block/Kconfig                                      |    2 +-
 include/linux/cgroup-defs.h                        |    2 +-
 include/uapi/linux/bpf.h                           |    2 +-
 init/Kconfig                                       |    4 +-
 kernel/cgroup/cpuset.c                             |    2 +-
 security/device_cgroup.c                           |    2 +-
 tools/include/uapi/linux/bpf.h                     |    2 +-
 50 files changed, 3945 insertions(+), 3946 deletions(-)
 create mode 100644 Documentation/admin-guide/cgroup-v1/blkio-controller.rst
 create mode 100644 Documentation/admin-guide/cgroup-v1/cgroups.rst
 create mode 100644 Documentation/admin-guide/cgroup-v1/cpuacct.rst
 create mode 100644 Documentation/admin-guide/cgroup-v1/cpusets.rst
 create mode 100644 Documentation/admin-guide/cgroup-v1/devices.rst
 create mode 100644 Documentation/admin-guide/cgroup-v1/freezer-subsystem.rst
 create mode 100644 Documentation/admin-guide/cgroup-v1/hugetlb.rst
 create mode 100644 Documentation/admin-guide/cgroup-v1/index.rst
 create mode 100644 Documentation/admin-guide/cgroup-v1/memcg_test.rst
 create mode 100644 Documentation/admin-guide/cgroup-v1/memory.rst
 create mode 100644 Documentation/admin-guide/cgroup-v1/net_cls.rst
 create mode 100644 Documentation/admin-guide/cgroup-v1/net_prio.rst
 create mode 100644 Documentation/admin-guide/cgroup-v1/pids.rst
 create mode 100644 Documentation/admin-guide/cgroup-v1/rdma.rst
 delete mode 100644 Documentation/cgroup-v1/blkio-controller.rst
 delete mode 100644 Documentation/cgroup-v1/cgroups.rst
 delete mode 100644 Documentation/cgroup-v1/cpuacct.rst
 delete mode 100644 Documentation/cgroup-v1/cpusets.rst
 delete mode 100644 Documentation/cgroup-v1/devices.rst
 delete mode 100644 Documentation/cgroup-v1/freezer-subsystem.rst
 delete mode 100644 Documentation/cgroup-v1/hugetlb.rst
 delete mode 100644 Documentation/cgroup-v1/index.rst
 delete mode 100644 Documentation/cgroup-v1/memcg_test.rst
 delete mode 100644 Documentation/cgroup-v1/memory.rst
 delete mode 100644 Documentation/cgroup-v1/net_cls.rst
 delete mode 100644 Documentation/cgroup-v1/net_prio.rst
 delete mode 100644 Documentation/cgroup-v1/pids.rst
 delete mode 100644 Documentation/cgroup-v1/rdma.rst

(limited to 'include/linux')

diff --git a/Documentation/admin-guide/cgroup-v1/blkio-controller.rst b/Documentation/admin-guide/cgroup-v1/blkio-controller.rst
new file mode 100644
index 000000000000..1d7d962933be
--- /dev/null
+++ b/Documentation/admin-guide/cgroup-v1/blkio-controller.rst
@@ -0,0 +1,302 @@
+===================
+Block IO Controller
+===================
+
+Overview
+========
+cgroup subsys "blkio" implements the block io controller. There seems to be
+a need of various kinds of IO control policies (like proportional BW, max BW)
+both at leaf nodes as well as at intermediate nodes in a storage hierarchy.
+Plan is to use the same cgroup based management interface for blkio controller
+and based on user options switch IO policies in the background.
+
+One IO control policy is throttling policy which can be used to
+specify upper IO rate limits on devices. This policy is implemented in
+generic block layer and can be used on leaf nodes as well as higher
+level logical devices like device mapper.
+
+HOWTO
+=====
+Throttling/Upper Limit policy
+-----------------------------
+- Enable Block IO controller::
+
+	CONFIG_BLK_CGROUP=y
+
+- Enable throttling in block layer::
+
+	CONFIG_BLK_DEV_THROTTLING=y
+
+- Mount blkio controller (see cgroups.rst, Why are cgroups needed?)::
+
+        mount -t cgroup -o blkio none /sys/fs/cgroup/blkio
+
+- Specify a bandwidth rate on particular device for root group. The format
+  for policy is "<major>:<minor>  <bytes_per_second>"::
+
+        echo "8:16  1048576" > /sys/fs/cgroup/blkio/blkio.throttle.read_bps_device
+
+  Above will put a limit of 1MB/second on reads happening for root group
+  on device having major/minor number 8:16.
+
+- Run dd to read a file and see if rate is throttled to 1MB/s or not::
+
+        # dd iflag=direct if=/mnt/common/zerofile of=/dev/null bs=4K count=1024
+        1024+0 records in
+        1024+0 records out
+        4194304 bytes (4.2 MB) copied, 4.0001 s, 1.0 MB/s
+
+  Limits for writes can be set using the blkio.throttle.write_bps_device file.
+
+Hierarchical Cgroups
+====================
+
+Throttling implements hierarchy support; however,
+throttling's hierarchy support is enabled iff "sane_behavior" is
+enabled from cgroup side, which currently is a development option and
+not publicly available.
+
+If somebody created a hierarchy as follows::
+
+			root
+			/  \
+		     test1 test2
+			|
+		     test3
+
+Throttling with "sane_behavior" will handle the
+hierarchy correctly. For throttling, all limits apply
+to the whole subtree while all statistics are local to the IOs
+directly generated by tasks in that cgroup.
+
+Throttling without "sane_behavior" enabled from cgroup side will
+practically treat all groups at same level as if it looks like the
+following::
+
+				pivot
+			     /  /   \  \
+			root  test1 test2  test3
+
+Various user visible config options
+===================================
+CONFIG_BLK_CGROUP
+	- Block IO controller.
+
+CONFIG_BFQ_CGROUP_DEBUG
+	- Debug help. Right now some additional stats files show up in cgroup
+	  if this option is enabled.
+
+CONFIG_BLK_DEV_THROTTLING
+	- Enable block device throttling support in block layer.
+
+Details of cgroup files
+=======================
+Proportional weight policy files
+--------------------------------
+- blkio.weight
+	- Specifies per cgroup weight. This is default weight of the group
+	  on all the devices until and unless overridden by per device rule.
+	  (See blkio.weight_device).
+	  Currently allowed range of weights is from 10 to 1000.
+
+- blkio.weight_device
+	- One can specify per cgroup per device rules using this interface.
+	  These rules override the default value of group weight as specified
+	  by blkio.weight.
+
+	  Following is the format::
+
+	    # echo dev_maj:dev_minor weight > blkio.weight_device
+
+	  Configure weight=300 on /dev/sdb (8:16) in this cgroup::
+
+	    # echo 8:16 300 > blkio.weight_device
+	    # cat blkio.weight_device
+	    dev     weight
+	    8:16    300
+
+	  Configure weight=500 on /dev/sda (8:0) in this cgroup::
+
+	    # echo 8:0 500 > blkio.weight_device
+	    # cat blkio.weight_device
+	    dev     weight
+	    8:0     500
+	    8:16    300
+
+	  Remove specific weight for /dev/sda in this cgroup::
+
+	    # echo 8:0 0 > blkio.weight_device
+	    # cat blkio.weight_device
+	    dev     weight
+	    8:16    300
+
+- blkio.leaf_weight[_device]
+	- Equivalents of blkio.weight[_device] for the purpose of
+          deciding how much weight tasks in the given cgroup have while
+          competing with the cgroup's child cgroups. For details,
+          please refer to Documentation/block/cfq-iosched.txt.
+
+- blkio.time
+	- disk time allocated to cgroup per device in milliseconds. First
+	  two fields specify the major and minor number of the device and
+	  third field specifies the disk time allocated to group in
+	  milliseconds.
+
+- blkio.sectors
+	- number of sectors transferred to/from disk by the group. First
+	  two fields specify the major and minor number of the device and
+	  third field specifies the number of sectors transferred by the
+	  group to/from the device.
+
+- blkio.io_service_bytes
+	- Number of bytes transferred to/from the disk by the group. These
+	  are further divided by the type of operation - read or write, sync
+	  or async. First two fields specify the major and minor number of the
+	  device, third field specifies the operation type and the fourth field
+	  specifies the number of bytes.
+
+- blkio.io_serviced
+	- Number of IOs (bio) issued to the disk by the group. These
+	  are further divided by the type of operation - read or write, sync
+	  or async. First two fields specify the major and minor number of the
+	  device, third field specifies the operation type and the fourth field
+	  specifies the number of IOs.
+
+- blkio.io_service_time
+	- Total amount of time between request dispatch and request completion
+	  for the IOs done by this cgroup. This is in nanoseconds to make it
+	  meaningful for flash devices too. For devices with queue depth of 1,
+	  this time represents the actual service time. When queue_depth > 1,
+	  that is no longer true as requests may be served out of order. This
+	  may cause the service time for a given IO to include the service time
+	  of multiple IOs when served out of order which may result in total
+	  io_service_time > actual time elapsed. This time is further divided by
+	  the type of operation - read or write, sync or async. First two fields
+	  specify the major and minor number of the device, third field
+	  specifies the operation type and the fourth field specifies the
+	  io_service_time in ns.
+
+- blkio.io_wait_time
+	- Total amount of time the IOs for this cgroup spent waiting in the
+	  scheduler queues for service. This can be greater than the total time
+	  elapsed since it is cumulative io_wait_time for all IOs. It is not a
+	  measure of total time the cgroup spent waiting but rather a measure of
+	  the wait_time for its individual IOs. For devices with queue_depth > 1
+	  this metric does not include the time spent waiting for service once
+	  the IO is dispatched to the device but till it actually gets serviced
+	  (there might be a time lag here due to re-ordering of requests by the
+	  device). This is in nanoseconds to make it meaningful for flash
+	  devices too. This time is further divided by the type of operation -
+	  read or write, sync or async. First two fields specify the major and
+	  minor number of the device, third field specifies the operation type
+	  and the fourth field specifies the io_wait_time in ns.
+
+- blkio.io_merged
+	- Total number of bios/requests merged into requests belonging to this
+	  cgroup. This is further divided by the type of operation - read or
+	  write, sync or async.
+
+- blkio.io_queued
+	- Total number of requests queued up at any given instant for this
+	  cgroup. This is further divided by the type of operation - read or
+	  write, sync or async.
+
+- blkio.avg_queue_size
+	- Debugging aid only enabled if CONFIG_BFQ_CGROUP_DEBUG=y.
+	  The average queue size for this cgroup over the entire time of this
+	  cgroup's existence. Queue size samples are taken each time one of the
+	  queues of this cgroup gets a timeslice.
+
+- blkio.group_wait_time
+	- Debugging aid only enabled if CONFIG_BFQ_CGROUP_DEBUG=y.
+	  This is the amount of time the cgroup had to wait since it became busy
+	  (i.e., went from 0 to 1 request queued) to get a timeslice for one of
+	  its queues. This is different from the io_wait_time which is the
+	  cumulative total of the amount of time spent by each IO in that cgroup
+	  waiting in the scheduler queue. This is in nanoseconds. If this is
+	  read when the cgroup is in a waiting (for timeslice) state, the stat
+	  will only report the group_wait_time accumulated till the last time it
+	  got a timeslice and will not include the current delta.
+
+- blkio.empty_time
+	- Debugging aid only enabled if CONFIG_BFQ_CGROUP_DEBUG=y.
+	  This is the amount of time a cgroup spends without any pending
+	  requests when not being served, i.e., it does not include any time
+	  spent idling for one of the queues of the cgroup. This is in
+	  nanoseconds. If this is read when the cgroup is in an empty state,
+	  the stat will only report the empty_time accumulated till the last
+	  time it had a pending request and will not include the current delta.
+
+- blkio.idle_time
+	- Debugging aid only enabled if CONFIG_BFQ_CGROUP_DEBUG=y.
+	  This is the amount of time spent by the IO scheduler idling for a
+	  given cgroup in anticipation of a better request than the existing ones
+	  from other queues/cgroups. This is in nanoseconds. If this is read
+	  when the cgroup is in an idling state, the stat will only report the
+	  idle_time accumulated till the last idle period and will not include
+	  the current delta.
+
+- blkio.dequeue
+	- Debugging aid only enabled if CONFIG_BFQ_CGROUP_DEBUG=y. This
+	  gives the statistics about how many times a group was dequeued
+	  from service tree of the device. First two fields specify the major
+	  and minor number of the device and third field specifies the number
+	  of times a group was dequeued from a particular device.
+
+- blkio.*_recursive
+	- Recursive version of various stats. These files show the
+          same information as their non-recursive counterparts but
+          include stats from all the descendant cgroups.
+
+Throttling/Upper limit policy files
+-----------------------------------
+- blkio.throttle.read_bps_device
+	- Specifies upper limit on READ rate from the device. IO rate is
+	  specified in bytes per second. Rules are per device. Following is
+	  the format::
+
+	    echo "<major>:<minor>  <rate_bytes_per_second>" > /cgrp/blkio.throttle.read_bps_device
+
+- blkio.throttle.write_bps_device
+	- Specifies upper limit on WRITE rate to the device. IO rate is
+	  specified in bytes per second. Rules are per device. Following is
+	  the format::
+
+	    echo "<major>:<minor>  <rate_bytes_per_second>" > /cgrp/blkio.throttle.write_bps_device
+
+- blkio.throttle.read_iops_device
+	- Specifies upper limit on READ rate from the device. IO rate is
+	  specified in IO per second. Rules are per device. Following is
+	  the format::
+
+	   echo "<major>:<minor>  <rate_io_per_second>" > /cgrp/blkio.throttle.read_iops_device
+
+- blkio.throttle.write_iops_device
+	- Specifies upper limit on WRITE rate to the device. IO rate is
+	  specified in IO per second. Rules are per device. Following is
+	  the format::
+
+	    echo "<major>:<minor>  <rate_io_per_second>" > /cgrp/blkio.throttle.write_iops_device
+
+Note: If both BW and IOPS rules are specified for a device, then IO is
+      subjected to both the constraints.
+
+- blkio.throttle.io_serviced
+	- Number of IOs (bio) issued to the disk by the group. These
+	  are further divided by the type of operation - read or write, sync
+	  or async. First two fields specify the major and minor number of the
+	  device, third field specifies the operation type and the fourth field
+	  specifies the number of IOs.
+
+- blkio.throttle.io_service_bytes
+	- Number of bytes transferred to/from the disk by the group. These
+	  are further divided by the type of operation - read or write, sync
+	  or async. First two fields specify the major and minor number of the
+	  device, third field specifies the operation type and the fourth field
+	  specifies the number of bytes.
+
+Common files among various policies
+-----------------------------------
+- blkio.reset_stats
+	- Writing an int to this file will result in resetting all the stats
+	  for that cgroup.
diff --git a/Documentation/admin-guide/cgroup-v1/cgroups.rst b/Documentation/admin-guide/cgroup-v1/cgroups.rst
new file mode 100644
index 000000000000..b0688011ed06
--- /dev/null
+++ b/Documentation/admin-guide/cgroup-v1/cgroups.rst
@@ -0,0 +1,695 @@
+==============
+Control Groups
+==============
+
+Written by Paul Menage <menage@google.com> based on
+Documentation/admin-guide/cgroup-v1/cpusets.rst
+
+Original copyright statements from cpusets.txt:
+
+Portions Copyright (C) 2004 BULL SA.
+
+Portions Copyright (c) 2004-2006 Silicon Graphics, Inc.
+
+Modified by Paul Jackson <pj@sgi.com>
+
+Modified by Christoph Lameter <cl@linux.com>
+
+.. CONTENTS:
+
+	1. Control Groups
+	1.1 What are cgroups ?
+	1.2 Why are cgroups needed ?
+	1.3 How are cgroups implemented ?
+	1.4 What does notify_on_release do ?
+	1.5 What does clone_children do ?
+	1.6 How do I use cgroups ?
+	2. Usage Examples and Syntax
+	2.1 Basic Usage
+	2.2 Attaching processes
+	2.3 Mounting hierarchies by name
+	3. Kernel API
+	3.1 Overview
+	3.2 Synchronization
+	3.3 Subsystem API
+	4. Extended attribute usage
+	5. Questions
+
+1. Control Groups
+=================
+
+1.1 What are cgroups ?
+----------------------
+
+Control Groups provide a mechanism for aggregating/partitioning sets of
+tasks, and all their future children, into hierarchical groups with
+specialized behaviour.
+
+Definitions:
+
+A *cgroup* associates a set of tasks with a set of parameters for one
+or more subsystems.
+
+A *subsystem* is a module that makes use of the task grouping
+facilities provided by cgroups to treat groups of tasks in
+particular ways. A subsystem is typically a "resource controller" that
+schedules a resource or applies per-cgroup limits, but it may be
+anything that wants to act on a group of processes, e.g. a
+virtualization subsystem.
+
+A *hierarchy* is a set of cgroups arranged in a tree, such that
+every task in the system is in exactly one of the cgroups in the
+hierarchy, and a set of subsystems; each subsystem has system-specific
+state attached to each cgroup in the hierarchy.  Each hierarchy has
+an instance of the cgroup virtual filesystem associated with it.
+
+At any one time there may be multiple active hierarchies of task
+cgroups. Each hierarchy is a partition of all tasks in the system.
+
+User-level code may create and destroy cgroups by name in an
+instance of the cgroup virtual file system, specify and query to
+which cgroup a task is assigned, and list the task PIDs assigned to
+a cgroup. Those creations and assignments only affect the hierarchy
+associated with that instance of the cgroup file system.
+
+On their own, the only use for cgroups is for simple job
+tracking. The intention is that other subsystems hook into the generic
+cgroup support to provide new attributes for cgroups, such as
+accounting/limiting the resources which processes in a cgroup can
+access. For example, cpusets (see Documentation/admin-guide/cgroup-v1/cpusets.rst) allow
+you to associate a set of CPUs and a set of memory nodes with the
+tasks in each cgroup.
+
+1.2 Why are cgroups needed ?
+----------------------------
+
+There are multiple efforts to provide process aggregations in the
+Linux kernel, mainly for resource-tracking purposes. Such efforts
+include cpusets, CKRM/ResGroups, UserBeanCounters, and virtual server
+namespaces. These all require the basic notion of a
+grouping/partitioning of processes, with newly forked processes ending
+up in the same group (cgroup) as their parent process.
+
+The kernel cgroup patch provides the minimum essential kernel
+mechanisms required to efficiently implement such groups. It has
+minimal impact on the system fast paths, and provides hooks for
+specific subsystems such as cpusets to provide additional behaviour as
+desired.
+
+Multiple hierarchy support is provided to allow for situations where
+the division of tasks into cgroups is distinctly different for
+different subsystems - having parallel hierarchies allows each
+hierarchy to be a natural division of tasks, without having to handle
+complex combinations of tasks that would be present if several
+unrelated subsystems needed to be forced into the same tree of
+cgroups.
+
+At one extreme, each resource controller or subsystem could be in a
+separate hierarchy; at the other extreme, all subsystems
+would be attached to the same hierarchy.
+
+As an example of a scenario (originally proposed by vatsa@in.ibm.com)
+that can benefit from multiple hierarchies, consider a large
+university server with various users - students, professors, system
+tasks etc. The resource planning for this server could be along the
+following lines::
+
+       CPU :          "Top cpuset"
+                       /       \
+               CPUSet1         CPUSet2
+                  |               |
+               (Professors)    (Students)
+
+               In addition (system tasks) are attached to topcpuset (so
+               that they can run anywhere) with a limit of 20%
+
+       Memory : Professors (50%), Students (30%), system (20%)
+
+       Disk : Professors (50%), Students (30%), system (20%)
+
+       Network : WWW browsing (20%), Network File System (60%), others (20%)
+                               / \
+               Professors (15%)  students (5%)
+
+Browsers like Firefox/Lynx go into the WWW network class, while (k)nfsd goes
+into the NFS network class.
+
+At the same time Firefox/Lynx will share an appropriate CPU/Memory class
+depending on who launched it (prof/student).
+
+With the ability to classify tasks differently for different resources
+(by putting those resource subsystems in different hierarchies),
+the admin can easily set up a script which receives exec notifications
+and depending on who is launching the browser he can::
+
+    # echo browser_pid > /sys/fs/cgroup/<restype>/<userclass>/tasks
+
+With only a single hierarchy, he now would potentially have to create
+a separate cgroup for every browser launched and associate it with the
+appropriate network and other resource classes.  This may lead to
+proliferation of such cgroups.
+
+Also let's say that the administrator would like to give enhanced network
+access temporarily to a student's browser (since it is night and the user
+wants to do online gaming :))  OR give one of the student's simulation
+apps enhanced CPU power.
+
+With the ability to write PIDs directly to resource classes, it's just a
+matter of::
+
+       # echo pid > /sys/fs/cgroup/network/<new_class>/tasks
+       (after some time)
+       # echo pid > /sys/fs/cgroup/network/<orig_class>/tasks
+
+Without this ability, the administrator would have to split the cgroup into
+multiple separate ones and then associate the new cgroups with the
+new resource classes.
+
+
+
+1.3 How are cgroups implemented ?
+---------------------------------
+
+Control Groups extends the kernel as follows:
+
+ - Each task in the system has a reference-counted pointer to a
+   css_set.
+
+ - A css_set contains a set of reference-counted pointers to
+   cgroup_subsys_state objects, one for each cgroup subsystem
+   registered in the system. There is no direct link from a task to
+   the cgroup of which it's a member in each hierarchy, but this
+   can be determined by following pointers through the
+   cgroup_subsys_state objects. This is because accessing the
+   subsystem state is something that's expected to happen frequently
+   and in performance-critical code, whereas operations that require a
+   task's actual cgroup assignments (in particular, moving between
+   cgroups) are less common. A linked list runs through the cg_list
+   field of each task_struct using the css_set, anchored at
+   css_set->tasks.
+
+ - A cgroup hierarchy filesystem can be mounted for browsing and
+   manipulation from user space.
+
+ - You can list all the tasks (by PID) attached to any cgroup.
+
+The implementation of cgroups requires a few, simple hooks
+into the rest of the kernel, none in performance-critical paths:
+
+ - in init/main.c, to initialize the root cgroups and initial
+   css_set at system boot.
+
+ - in fork and exit, to attach and detach a task from its css_set.
+
+In addition, a new file system of type "cgroup" may be mounted, to
+enable browsing and modifying the cgroups presently known to the
+kernel.  When mounting a cgroup hierarchy, you may specify a
+comma-separated list of subsystems to mount as the filesystem mount
+options.  By default, mounting the cgroup filesystem attempts to
+mount a hierarchy containing all registered subsystems.
+
+If an active hierarchy with exactly the same set of subsystems already
+exists, it will be reused for the new mount. If no existing hierarchy
+matches, and any of the requested subsystems are in use in an existing
+hierarchy, the mount will fail with -EBUSY. Otherwise, a new hierarchy
+is activated, associated with the requested subsystems.
+
+It's not currently possible to bind a new subsystem to an active
+cgroup hierarchy, or to unbind a subsystem from an active cgroup
+hierarchy. This may be possible in future, but is fraught with nasty
+error-recovery issues.
+
+When a cgroup filesystem is unmounted, if there are any
+child cgroups created below the top-level cgroup, that hierarchy
+will remain active even though unmounted; if there are no
+child cgroups then the hierarchy will be deactivated.
+
+No new system calls are added for cgroups - all support for
+querying and modifying cgroups is via this cgroup file system.
+
+Each task under /proc has an added file named 'cgroup' displaying,
+for each active hierarchy, the subsystem names and the cgroup name
+as the path relative to the root of the cgroup file system.
+
+Each cgroup is represented by a directory in the cgroup file system
+containing the following files describing that cgroup:
+
+ - tasks: list of tasks (by PID) attached to that cgroup.  This list
+   is not guaranteed to be sorted.  Writing a thread ID into this file
+   moves the thread into this cgroup.
+ - cgroup.procs: list of thread group IDs in the cgroup.  This list is
+   not guaranteed to be sorted or free of duplicate TGIDs, and userspace
+   should sort/uniquify the list if this property is required.
+   Writing a thread group ID into this file moves all threads in that
+   group into this cgroup.
+ - notify_on_release flag: run the release agent on exit?
+ - release_agent: the path to use for release notifications (this file
+   exists in the top cgroup only)
+
+Other subsystems such as cpusets may add additional files in each
+cgroup dir.
+
+New cgroups are created using the mkdir system call or shell
+command.  The properties of a cgroup, such as its flags, are
+modified by writing to the appropriate file in that cgroup's
+directory, as listed above.
+
+The named hierarchical structure of nested cgroups allows partitioning
+a large system into nested, dynamically changeable, "soft-partitions".
+
+The attachment of each task, automatically inherited at fork by any
+children of that task, to a cgroup allows organizing the work load
+on a system into related sets of tasks.  A task may be re-attached to
+any other cgroup, if allowed by the permissions on the necessary
+cgroup file system directories.
+
+When a task is moved from one cgroup to another, it gets a new
+css_set pointer - if there's an already existing css_set with the
+desired collection of cgroups then that group is reused, otherwise a new
+css_set is allocated. The appropriate existing css_set is located by
+looking into a hash table.
+
+To allow access from a cgroup to the css_sets (and hence tasks)
+that comprise it, a set of cg_cgroup_link objects form a lattice;
+each cg_cgroup_link is linked into a list of cg_cgroup_links for
+a single cgroup on its cgrp_link_list field, and a list of
+cg_cgroup_links for a single css_set on its cg_link_list.
+
+Thus the set of tasks in a cgroup can be listed by iterating over
+each css_set that references the cgroup, and sub-iterating over
+each css_set's task set.
+
+The use of a Linux virtual file system (vfs) to represent the
+cgroup hierarchy provides for a familiar permission and name space
+for cgroups, with a minimum of additional kernel code.
+
+1.4 What does notify_on_release do ?
+------------------------------------
+
+If the notify_on_release flag is enabled (1) in a cgroup, then
+whenever the last task in the cgroup leaves (exits or attaches to
+some other cgroup) and the last child cgroup of that cgroup
+is removed, then the kernel runs the command specified by the contents
+of the "release_agent" file in that hierarchy's root directory,
+supplying the pathname (relative to the mount point of the cgroup
+file system) of the abandoned cgroup.  This enables automatic
+removal of abandoned cgroups.  The default value of
+notify_on_release in the root cgroup at system boot is disabled
+(0).  The default value of other cgroups at creation is the current
+value of their parents' notify_on_release settings. The default value of
+a cgroup hierarchy's release_agent path is empty.
+
+1.5 What does clone_children do ?
+---------------------------------
+
+This flag only affects the cpuset controller. If the clone_children
+flag is enabled (1) in a cgroup, a new cpuset cgroup will copy its
+configuration from the parent during initialization.
+
+1.6 How do I use cgroups ?
+--------------------------
+
+To start a new job that is to be contained within a cgroup, using
+the "cpuset" cgroup subsystem, the steps are something like::
+
+ 1) mount -t tmpfs cgroup_root /sys/fs/cgroup
+ 2) mkdir /sys/fs/cgroup/cpuset
+ 3) mount -t cgroup -ocpuset cpuset /sys/fs/cgroup/cpuset
+ 4) Create the new cgroup by doing mkdir's and write's (or echo's) in
+    the /sys/fs/cgroup/cpuset virtual file system.
+ 5) Start a task that will be the "founding father" of the new job.
+ 6) Attach that task to the new cgroup by writing its PID to the
+    /sys/fs/cgroup/cpuset tasks file for that cgroup.
+ 7) fork, exec or clone the job tasks from this founding father task.
+
+For example, the following sequence of commands will set up a cgroup
+named "Charlie", containing just CPUs 2 and 3, and Memory Node 1,
+and then start a subshell 'sh' in that cgroup::
+
+  mount -t tmpfs cgroup_root /sys/fs/cgroup
+  mkdir /sys/fs/cgroup/cpuset
+  mount -t cgroup cpuset -ocpuset /sys/fs/cgroup/cpuset
+  cd /sys/fs/cgroup/cpuset
+  mkdir Charlie
+  cd Charlie
+  /bin/echo 2-3 > cpuset.cpus
+  /bin/echo 1 > cpuset.mems
+  /bin/echo $$ > tasks
+  sh
+  # The subshell 'sh' is now running in cgroup Charlie
+  # The next line should display '/Charlie'
+  cat /proc/self/cgroup
+
+2. Usage Examples and Syntax
+============================
+
+2.1 Basic Usage
+---------------
+
+Creating, modifying, using cgroups can be done through the cgroup
+virtual filesystem.
+
+To mount a cgroup hierarchy with all available subsystems, type::
+
+  # mount -t cgroup xxx /sys/fs/cgroup
+
+The "xxx" is not interpreted by the cgroup code, but will appear in
+/proc/mounts so may be any useful identifying string that you like.
+
+Note: Some subsystems do not work without some user input first.  For instance,
+if cpusets are enabled the user will have to populate the cpus and mems files
+for each new cgroup created before that group can be used.
+
+As explained in section `1.2 Why are cgroups needed?` you should create
+different hierarchies of cgroups for each single resource or group of
+resources you want to control. Therefore, you should mount a tmpfs on
+/sys/fs/cgroup and create directories for each cgroup resource or resource
+group::
+
+  # mount -t tmpfs cgroup_root /sys/fs/cgroup
+  # mkdir /sys/fs/cgroup/rg1
+
+To mount a cgroup hierarchy with just the cpuset and memory
+subsystems, type::
+
+  # mount -t cgroup -o cpuset,memory hier1 /sys/fs/cgroup/rg1
+
+While remounting cgroups is currently supported, it is not recommended
+to use it. Remounting allows changing bound subsystems and
+release_agent. Rebinding is hardly useful as it only works when the
+hierarchy is empty and release_agent itself should be replaced with
+conventional fsnotify. The support for remounting will be removed in
+the future.
+
+To specify a hierarchy's release_agent::
+
+  # mount -t cgroup -o cpuset,release_agent="/sbin/cpuset_release_agent" \
+    xxx /sys/fs/cgroup/rg1
+
+Note that specifying 'release_agent' more than once will return failure.
+
+Note that changing the set of subsystems is currently only supported
+when the hierarchy consists of a single (root) cgroup. Supporting
+the ability to arbitrarily bind/unbind subsystems from an existing
+cgroup hierarchy is intended to be implemented in the future.
+
+Then under /sys/fs/cgroup/rg1 you can find a tree that corresponds to the
+tree of the cgroups in the system. For instance, /sys/fs/cgroup/rg1
+is the cgroup that holds the whole system.
+
+If you want to change the value of release_agent::
+
+  # echo "/sbin/new_release_agent" > /sys/fs/cgroup/rg1/release_agent
+
+It can also be changed via remount.
+
+If you want to create a new cgroup under /sys/fs/cgroup/rg1::
+
+  # cd /sys/fs/cgroup/rg1
+  # mkdir my_cgroup
+
+Now you want to do something with this cgroup::
+
+  # cd my_cgroup
+
+In this directory you can find several files::
+
+  # ls
+  cgroup.procs notify_on_release tasks
+  (plus whatever files added by the attached subsystems)
+
+Now attach your shell to this cgroup::
+
+  # /bin/echo $$ > tasks
+
+You can also create cgroups inside your cgroup by using mkdir in this
+directory::
+
+  # mkdir my_sub_cs
+
+To remove a cgroup, just use rmdir::
+
+  # rmdir my_sub_cs
+
+This will fail if the cgroup is in use (has cgroups inside, or
+has processes attached, or is held alive by other subsystem-specific
+reference).
+
+2.2 Attaching processes
+-----------------------
+
+::
+
+  # /bin/echo PID > tasks
+
+Note that it is PID, not PIDs. You can only attach ONE task at a time.
+If you have several tasks to attach, you have to do it one after another::
+
+  # /bin/echo PID1 > tasks
+  # /bin/echo PID2 > tasks
+	  ...
+  # /bin/echo PIDn > tasks
+
+You can attach the current shell task by echoing 0::
+
+  # echo 0 > tasks
+
+You can use the cgroup.procs file instead of the tasks file to move all
+threads in a threadgroup at once. Echoing the PID of any task in a
+threadgroup to cgroup.procs causes all tasks in that threadgroup to be
+attached to the cgroup. Writing 0 to cgroup.procs moves all tasks
+in the writing task's threadgroup.
+
+Note: Since every task is always a member of exactly one cgroup in each
+mounted hierarchy, to remove a task from its current cgroup you must
+move it into a new cgroup (possibly the root cgroup) by writing to the
+new cgroup's tasks file.
+
+Note: Due to some restrictions enforced by some cgroup subsystems, moving
+a process to another cgroup can fail.
+
+2.3 Mounting hierarchies by name
+--------------------------------
+
+Passing the name=<x> option when mounting a cgroups hierarchy
+associates the given name with the hierarchy.  This can be used when
+mounting a pre-existing hierarchy, in order to refer to it by name
+rather than by its set of active subsystems.  Each hierarchy is either
+nameless, or has a unique name.
+
+The name should match [\w.-]+
+
+When passing a name=<x> option for a new hierarchy, you need to
+specify subsystems manually; the legacy behaviour of mounting all
+subsystems when none are explicitly specified is not supported when
+you give a subsystem a name.
+
+The name of the subsystem appears as part of the hierarchy description
+in /proc/mounts and /proc/<pid>/cgroup.
+
+
+3. Kernel API
+=============
+
+3.1 Overview
+------------
+
+Each kernel subsystem that wants to hook into the generic cgroup
+system needs to create a cgroup_subsys object. This contains
+various methods, which are callbacks from the cgroup system, along
+with a subsystem ID which will be assigned by the cgroup system.
+
+Other fields in the cgroup_subsys object include:
+
+- subsys_id: a unique array index for the subsystem, indicating which
+  entry in cgroup->subsys[] this subsystem should be managing.
+
+- name: should be initialized to a unique subsystem name. Should be
+  no longer than MAX_CGROUP_TYPE_NAMELEN.
+
+- early_init: indicate if the subsystem needs early initialization
+  at system boot.
+
+Each cgroup object created by the system has an array of pointers,
+indexed by subsystem ID; this pointer is entirely managed by the
+subsystem; the generic cgroup code will never touch this pointer.
+
+3.2 Synchronization
+-------------------
+
+There is a global mutex, cgroup_mutex, used by the cgroup
+system. This should be taken by anything that wants to modify a
+cgroup. It may also be taken to prevent cgroups from being
+modified, but more specific locks may be more appropriate in that
+situation.
+
+See kernel/cgroup.c for more details.
+
+Subsystems can take/release the cgroup_mutex via the functions
+cgroup_lock()/cgroup_unlock().
+
+Accessing a task's cgroup pointer may be done in the following ways:
+- while holding cgroup_mutex
+- while holding the task's alloc_lock (via task_lock())
+- inside an rcu_read_lock() section via rcu_dereference()
+
+3.3 Subsystem API
+-----------------
+
+Each subsystem should:
+
+- add an entry in linux/cgroup_subsys.h
+- define a cgroup_subsys object called <name>_cgrp_subsys
+
+Each subsystem may export the following methods. The only mandatory
+methods are css_alloc/free. Any others that are null are presumed to
+be successful no-ops.
+
+``struct cgroup_subsys_state *css_alloc(struct cgroup *cgrp)``
+(cgroup_mutex held by caller)
+
+Called to allocate a subsystem state object for a cgroup. The
+subsystem should allocate its subsystem state object for the passed
+cgroup, returning a pointer to the new object on success or an
+ERR_PTR() value. On success, the subsystem pointer should point to
+a structure of type cgroup_subsys_state (typically embedded in a
+larger subsystem-specific object), which will be initialized by the
+cgroup system. Note that this will be called at initialization to
+create the root subsystem state for this subsystem; this case can be
+identified by the passed cgroup object having a NULL parent (since
+it's the root of the hierarchy) and may be an appropriate place for
+initialization code.
+
+``int css_online(struct cgroup *cgrp)``
+(cgroup_mutex held by caller)
+
+Called after @cgrp successfully completed all allocations and was made
+visible to cgroup_for_each_child/descendant_*() iterators. The
+subsystem may choose to fail creation by returning -errno. This
+callback can be used to implement reliable state sharing and
+propagation along the hierarchy. See the comment on
+cgroup_for_each_descendant_pre() for details.
+
+``void css_offline(struct cgroup *cgrp);``
+(cgroup_mutex held by caller)
+
+This is the counterpart of css_online() and called iff css_online()
+has succeeded on @cgrp. This signifies the beginning of the end of
+@cgrp. @cgrp is being removed and the subsystem should start dropping
+all references it's holding on @cgrp. When all references are dropped,
+cgroup removal will proceed to the next step - css_free(). After this
+callback, @cgrp should be considered dead to the subsystem.
+
+``void css_free(struct cgroup *cgrp)``
+(cgroup_mutex held by caller)
+
+The cgroup system is about to free @cgrp; the subsystem should free
+its subsystem state object. By the time this method is called, @cgrp
+is completely unused; @cgrp->parent is still valid. (Note - can also
+be called for a newly-created cgroup if an error occurs after this
+subsystem's create() method has been called for the new cgroup).
+
+``int can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)``
+(cgroup_mutex held by caller)
+
+Called prior to moving one or more tasks into a cgroup; if the
+subsystem returns an error, this will abort the attach operation.
+@tset contains the tasks to be attached and is guaranteed to have at
+least one task in it.
+
+If there are multiple tasks in the taskset, then:
+  - it's guaranteed that all are from the same thread group
+  - @tset contains all tasks from the thread group whether or not
+    they're switching cgroups
+  - the first task is the leader
+
+Each @tset entry also contains the task's old cgroup and tasks which
+aren't switching cgroup can be skipped easily using the
+cgroup_taskset_for_each() iterator. Note that this isn't called on a
+fork. If this method returns 0 (success) then this should remain valid
+while the caller holds cgroup_mutex and it is ensured that either
+attach() or cancel_attach() will be called in future.
+
+``void css_reset(struct cgroup_subsys_state *css)``
+(cgroup_mutex held by caller)
+
+An optional operation which should restore @css's configuration to the
+initial state.  This is currently only used on the unified hierarchy
+when a subsystem is disabled on a cgroup through
+"cgroup.subtree_control" but should remain enabled because other
+subsystems depend on it.  cgroup core makes such a css invisible by
+removing the associated interface files and invokes this callback so
+that the hidden subsystem can return to the initial neutral state.
+This prevents unexpected resource control from a hidden css and
+ensures that the configuration is in the initial state when it is made
+visible again later.
+
+``void cancel_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)``
+(cgroup_mutex held by caller)
+
+Called when a task attach operation has failed after can_attach() has succeeded.
+A subsystem whose can_attach() has some side-effects should provide this
+function, so that the subsystem can implement a rollback. Otherwise it is not
+necessary. This will be called only for subsystems whose can_attach() operation
+has succeeded. The parameters are identical to can_attach().
+
+``void attach(struct cgroup *cgrp, struct cgroup_taskset *tset)``
+(cgroup_mutex held by caller)
+
+Called after the task has been attached to the cgroup, to allow any
+post-attachment activity that requires memory allocations or blocking.
+The parameters are identical to can_attach().
+
+``void fork(struct task_struct *task)``
+
+Called when a task is forked into a cgroup.
+
+``void exit(struct task_struct *task)``
+
+Called during task exit.
+
+``void free(struct task_struct *task)``
+
+Called when the task_struct is freed.
+
+``void bind(struct cgroup *root)``
+(cgroup_mutex held by caller)
+
+Called when a cgroup subsystem is rebound to a different hierarchy
+and root cgroup. Currently this will only involve movement between
+the default hierarchy (which never has sub-cgroups) and a hierarchy
+that is being created/destroyed (and hence has no sub-cgroups).
+
+4. Extended attribute usage
+===========================
+
+cgroup filesystem supports certain types of extended attributes in its
+directories and files.  The currently supported types are:
+
+	- Trusted (XATTR_TRUSTED)
+	- Security (XATTR_SECURITY)
+
+Both require CAP_SYS_ADMIN capability to set.
+
+Like in tmpfs, the extended attributes in cgroup filesystem are stored
+using kernel memory and it's advised to keep the usage to a minimum.  This
+is the reason why user defined extended attributes are not supported, since
+any user can do it and there's no limit in the value size.
+
+The current known users for this feature are SELinux to limit cgroup usage
+in containers and systemd for assorted meta data like main PID in a cgroup
+(systemd creates a cgroup per service).
+
+5. Questions
+============
+
+::
+
+  Q: what's up with this '/bin/echo' ?
+  A: bash's builtin 'echo' command does not check calls to write() against
+     errors. If you use it in the cgroup file system, you won't be
+     able to tell whether a command succeeded or failed.
+
+  Q: When I attach processes, only the first of the line gets really attached !
+  A: We can only return one error code per call to write(). So you should also
+     put only ONE PID.
diff --git a/Documentation/admin-guide/cgroup-v1/cpuacct.rst b/Documentation/admin-guide/cgroup-v1/cpuacct.rst
new file mode 100644
index 000000000000..d30ed81d2ad7
--- /dev/null
+++ b/Documentation/admin-guide/cgroup-v1/cpuacct.rst
@@ -0,0 +1,50 @@
+=========================
+CPU Accounting Controller
+=========================
+
+The CPU accounting controller is used to group tasks using cgroups and
+account the CPU usage of these groups of tasks.
+
+The CPU accounting controller supports multi-hierarchy groups. An accounting
+group accumulates the CPU usage of all of its child groups and the tasks
+directly present in its group.
+
+Accounting groups can be created by first mounting the cgroup filesystem::
+
+  # mount -t cgroup -ocpuacct none /sys/fs/cgroup
+
+With the above step, the initial or the parent accounting group becomes
+visible at /sys/fs/cgroup. At bootup, this group includes all the tasks in
+the system. /sys/fs/cgroup/tasks lists the tasks in this cgroup.
+/sys/fs/cgroup/cpuacct.usage gives the CPU time (in nanoseconds) obtained
+by this group which is essentially the CPU time obtained by all the tasks
+in the system.
+
+New accounting groups can be created under the parent group /sys/fs/cgroup::
+
+  # cd /sys/fs/cgroup
+  # mkdir g1
+  # echo $$ > g1/tasks
+
+The above steps create a new group g1 and move the current shell
+process (bash) into it. CPU time consumed by this bash and its children
+can be obtained from g1/cpuacct.usage and the same is accumulated in
+/sys/fs/cgroup/cpuacct.usage also.
+
+cpuacct.stat file lists a few statistics which further divide the
+CPU time obtained by the cgroup into user and system times. Currently
+the following statistics are supported:
+
+- user: Time spent by tasks of the cgroup in user mode.
+- system: Time spent by tasks of the cgroup in kernel mode.
+
+user and system are in USER_HZ units.
+
+cpuacct controller uses percpu_counter interface to collect user and
+system times. This has two side effects:
+
+- It is theoretically possible to see wrong values for user and system times.
+  This is because percpu_counter_read() on 32bit systems isn't safe
+  against concurrent writes.
+- It is possible to see slightly outdated values for user and system times
+  due to the batch processing nature of percpu_counter.
diff --git a/Documentation/admin-guide/cgroup-v1/cpusets.rst b/Documentation/admin-guide/cgroup-v1/cpusets.rst
new file mode 100644
index 000000000000..86a6ae995d54
--- /dev/null
+++ b/Documentation/admin-guide/cgroup-v1/cpusets.rst
@@ -0,0 +1,866 @@
+=======
+CPUSETS
+=======
+
+Copyright (C) 2004 BULL SA.
+
+Written by Simon.Derr@bull.net
+
+- Portions Copyright (c) 2004-2006 Silicon Graphics, Inc.
+- Modified by Paul Jackson <pj@sgi.com>
+- Modified by Christoph Lameter <cl@linux.com>
+- Modified by Paul Menage <menage@google.com>
+- Modified by Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
+
+.. CONTENTS:
+
+   1. Cpusets
+     1.1 What are cpusets ?
+     1.2 Why are cpusets needed ?
+     1.3 How are cpusets implemented ?
+     1.4 What are exclusive cpusets ?
+     1.5 What is memory_pressure ?
+     1.6 What is memory spread ?
+     1.7 What is sched_load_balance ?
+     1.8 What is sched_relax_domain_level ?
+     1.9 How do I use cpusets ?
+   2. Usage Examples and Syntax
+     2.1 Basic Usage
+     2.2 Adding/removing cpus
+     2.3 Setting flags
+     2.4 Attaching processes
+   3. Questions
+   4. Contact
+
+1. Cpusets
+==========
+
+1.1 What are cpusets ?
+----------------------
+
+Cpusets provide a mechanism for assigning a set of CPUs and Memory
+Nodes to a set of tasks.   In this document "Memory Node" refers to
+an on-line node that contains memory.
+
+Cpusets constrain the CPU and Memory placement of tasks to only
+the resources within a task's current cpuset.  They form a nested
+hierarchy visible in a virtual file system.  These are the essential
+hooks, beyond what is already present, required to manage dynamic
+job placement on large systems.
+
+Cpusets use the generic cgroup subsystem described in
+Documentation/admin-guide/cgroup-v1/cgroups.rst.
+
+Requests by a task, using the sched_setaffinity(2) system call to
+include CPUs in its CPU affinity mask, and using the mbind(2) and
+set_mempolicy(2) system calls to include Memory Nodes in its memory
+policy, are both filtered through that task's cpuset, filtering out any
+CPUs or Memory Nodes not in that cpuset.  The scheduler will not
+schedule a task on a CPU that is not allowed in its cpus_allowed
+vector, and the kernel page allocator will not allocate a page on a
+node that is not allowed in the requesting task's mems_allowed vector.
+
+User level code may create and destroy cpusets by name in the cgroup
+virtual file system, manage the attributes and permissions of these
+cpusets and which CPUs and Memory Nodes are assigned to each cpuset,
+specify and query to which cpuset a task is assigned, and list the
+task pids assigned to a cpuset.
+
+
+1.2 Why are cpusets needed ?
+----------------------------
+
+The management of large computer systems, with many processors (CPUs),
+complex memory cache hierarchies and multiple Memory Nodes having
+non-uniform access times (NUMA) presents additional challenges for
+the efficient scheduling and memory placement of processes.
+
+Frequently more modest sized systems can be operated with adequate
+efficiency just by letting the operating system automatically share
+the available CPU and Memory resources amongst the requesting tasks.
+
+But larger systems, which benefit more from careful processor and
+memory placement to reduce memory access times and contention,
+and which typically represent a larger investment for the customer,
+can benefit from explicitly placing jobs on properly sized subsets of
+the system.
+
+This can be especially valuable on:
+
+    * Web Servers running multiple instances of the same web application,
+    * Servers running different applications (for instance, a web server
+      and a database), or
+    * NUMA systems running large HPC applications with demanding
+      performance characteristics.
+
+These subsets, or "soft partitions" must be able to be dynamically
+adjusted, as the job mix changes, without impacting other concurrently
+executing jobs. The location of the running jobs' pages may also be moved
+when the memory locations are changed.
+
+The kernel cpuset patch provides the minimum essential kernel
+mechanisms required to efficiently implement such subsets.  It
+leverages existing CPU and Memory Placement facilities in the Linux
+kernel to avoid any additional impact on the critical scheduler or
+memory allocator code.
+
+
+1.3 How are cpusets implemented ?
+---------------------------------
+
+Cpusets provide a Linux kernel mechanism to constrain which CPUs and
+Memory Nodes are used by a process or set of processes.
+
+The Linux kernel already has a pair of mechanisms to specify on which
+CPUs a task may be scheduled (sched_setaffinity) and on which Memory
+Nodes it may obtain memory (mbind, set_mempolicy).
+
+Cpusets extend these two mechanisms as follows:
+
+ - Cpusets are sets of allowed CPUs and Memory Nodes, known to the
+   kernel.
+ - Each task in the system is attached to a cpuset, via a pointer
+   in the task structure to a reference counted cgroup structure.
+ - Calls to sched_setaffinity are filtered to just those CPUs
+   allowed in that task's cpuset.
+ - Calls to mbind and set_mempolicy are filtered to just
+   those Memory Nodes allowed in that task's cpuset.
+ - The root cpuset contains all the system's CPUs and Memory
+   Nodes.
+ - For any cpuset, one can define child cpusets containing a subset
+   of the parent's CPU and Memory Node resources.
+ - The hierarchy of cpusets can be mounted at /dev/cpuset, for
+   browsing and manipulation from user space.
+ - A cpuset may be marked exclusive, which ensures that no other
+   cpuset (except direct ancestors and descendants) may contain
+   any overlapping CPUs or Memory Nodes.
+ - You can list all the tasks (by pid) attached to any cpuset.
+
+The implementation of cpusets requires a few, simple hooks
+into the rest of the kernel, none in performance critical paths:
+
+ - in init/main.c, to initialize the root cpuset at system boot.
+ - in fork and exit, to attach and detach a task from its cpuset.
+ - in sched_setaffinity, to mask the requested CPUs by what's
+   allowed in that task's cpuset.
+ - in sched.c migrate_live_tasks(), to keep migrating tasks within
+   the CPUs allowed by their cpuset, if possible.
+ - in the mbind and set_mempolicy system calls, to mask the requested
+   Memory Nodes by what's allowed in that task's cpuset.
+ - in page_alloc.c, to restrict memory to allowed nodes.
+ - in vmscan.c, to restrict page recovery to the current cpuset.
+
+You should mount the "cgroup" filesystem type in order to enable
+browsing and modifying the cpusets presently known to the kernel.  No
+new system calls are added for cpusets - all support for querying and
+modifying cpusets is via this cpuset file system.
+
+The /proc/<pid>/status file for each task has four added lines,
+displaying the task's cpus_allowed (on which CPUs it may be scheduled)
+and mems_allowed (on which Memory Nodes it may obtain memory),
+in the two formats seen in the following example::
+
+  Cpus_allowed:   ffffffff,ffffffff,ffffffff,ffffffff
+  Cpus_allowed_list:      0-127
+  Mems_allowed:   ffffffff,ffffffff
+  Mems_allowed_list:      0-63
+
+Each cpuset is represented by a directory in the cgroup file system
+containing (on top of the standard cgroup files) the following
+files describing that cpuset:
+
+ - cpuset.cpus: list of CPUs in that cpuset
+ - cpuset.mems: list of Memory Nodes in that cpuset
+ - cpuset.memory_migrate flag: if set, move pages to cpuset's nodes
+ - cpuset.cpu_exclusive flag: is cpu placement exclusive?
+ - cpuset.mem_exclusive flag: is memory placement exclusive?
+ - cpuset.mem_hardwall flag:  is memory allocation hardwalled
+ - cpuset.memory_pressure: measure of how much paging pressure in cpuset
+ - cpuset.memory_spread_page flag: if set, spread page cache evenly on allowed nodes
+ - cpuset.memory_spread_slab flag: if set, spread slab cache evenly on allowed nodes
+ - cpuset.sched_load_balance flag: if set, load balance within CPUs on that cpuset
+ - cpuset.sched_relax_domain_level: the searching range when migrating tasks
+
+In addition, only the root cpuset has the following file:
+
+ - cpuset.memory_pressure_enabled flag: compute memory_pressure?
+
+New cpusets are created using the mkdir system call or shell
+command.  The properties of a cpuset, such as its flags, allowed
+CPUs and Memory Nodes, and attached tasks, are modified by writing
+to the appropriate file in that cpuset's directory, as listed above.
+
+The named hierarchical structure of nested cpusets allows partitioning
+a large system into nested, dynamically changeable, "soft-partitions".
+
+The attachment of each task, automatically inherited at fork by any
+children of that task, to a cpuset allows organizing the work load
+on a system into related sets of tasks such that each set is constrained
+to using the CPUs and Memory Nodes of a particular cpuset.  A task
+may be re-attached to any other cpuset, if allowed by the permissions
+on the necessary cpuset file system directories.
+
+Such management of a system "in the large" integrates smoothly with
+the detailed placement done on individual tasks and memory regions
+using the sched_setaffinity, mbind and set_mempolicy system calls.
+
+The following rules apply to each cpuset:
+
+ - Its CPUs and Memory Nodes must be a subset of its parent's.
+ - It can't be marked exclusive unless its parent is.
+ - If its cpu or memory is exclusive, they may not overlap any sibling.
+
+These rules, and the natural hierarchy of cpusets, enable efficient
+enforcement of the exclusive guarantee, without having to scan all
+cpusets every time any of them change to ensure nothing overlaps an
+exclusive cpuset.  Also, the use of a Linux virtual file system (vfs)
+to represent the cpuset hierarchy provides for a familiar permission
+and name space for cpusets, with a minimum of additional kernel code.
+
+The cpus and mems files in the root (top_cpuset) cpuset are
+read-only.  The cpus file automatically tracks the value of
+cpu_online_mask using a CPU hotplug notifier, and the mems file
+automatically tracks the value of node_states[N_MEMORY]--i.e.,
+nodes with memory--using the cpuset_track_online_nodes() hook.
+
+
+1.4 What are exclusive cpusets ?
+--------------------------------
+
+If a cpuset is cpu or mem exclusive, no other cpuset, other than
+a direct ancestor or descendant, may share any of the same CPUs or
+Memory Nodes.
+
+A cpuset that is cpuset.mem_exclusive *or* cpuset.mem_hardwall is "hardwalled",
+i.e. it restricts kernel allocations for page, buffer and other data
+commonly shared by the kernel across multiple users.  All cpusets,
+whether hardwalled or not, restrict allocations of memory for user
+space.  This enables configuring a system so that several independent
+jobs can share common kernel data, such as file system pages, while
+isolating each job's user allocation in its own cpuset.  To do this,
+construct a large mem_exclusive cpuset to hold all the jobs, and
+construct child, non-mem_exclusive cpusets for each individual job.
+Only a small amount of typical kernel memory, such as requests from
+interrupt handlers, is allowed to be taken outside even a
+mem_exclusive cpuset.
+
+
+1.5 What is memory_pressure ?
+-----------------------------
+The memory_pressure of a cpuset provides a simple per-cpuset metric
+of the rate that the tasks in a cpuset are attempting to free up
+in-use memory on the nodes of the cpuset to satisfy additional memory
+requests.
+
+This enables batch managers monitoring jobs running in dedicated
+cpusets to efficiently detect what level of memory pressure that job
+is causing.
+
+This is useful both on tightly managed systems running a wide mix of
+submitted jobs, which may choose to terminate or re-prioritize jobs that
+are trying to use more memory than allowed on the nodes assigned to them,
+and with tightly coupled, long running, massively parallel scientific
+computing jobs that will dramatically fail to meet required performance
+goals if they start to use more memory than allowed to them.
+
+This mechanism provides a very economical way for the batch manager
+to monitor a cpuset for signs of memory pressure.  It's up to the
+batch manager or other user code to decide what to do about it and
+take action.
+
+==>
+    Unless this feature is enabled by writing "1" to the special file
+    /dev/cpuset/memory_pressure_enabled, the hook in the rebalance
+    code of __alloc_pages() for this metric reduces to simply noticing
+    that the cpuset_memory_pressure_enabled flag is zero.  So only
+    systems that enable this feature will compute the metric.
+
+Why a per-cpuset, running average:
+
+    Because this meter is per-cpuset, rather than per-task or mm,
+    the system load imposed by a batch scheduler monitoring this
+    metric is sharply reduced on large systems, because a scan of
+    the tasklist can be avoided on each set of queries.
+
+    Because this meter is a running average, instead of an accumulating
+    counter, a batch scheduler can detect memory pressure with a
+    single read, instead of having to read and accumulate results
+    for a period of time.
+
+    Because this meter is per-cpuset rather than per-task or mm,
+    the batch scheduler can obtain the key information, memory
+    pressure in a cpuset, with a single read, rather than having to
+    query and accumulate results over all the (dynamically changing)
+    set of tasks in the cpuset.
+
+A per-cpuset simple digital filter (requires a spinlock and 3 words
+of data per-cpuset) is kept, and updated by any task attached to that
+cpuset, if it enters the synchronous (direct) page reclaim code.
+
+A per-cpuset file provides an integer number representing the recent
+(half-life of 10 seconds) rate of direct page reclaims caused by
+the tasks in the cpuset, in units of reclaims attempted per second,
+times 1000.
+
+
+1.6 What is memory spread ?
+---------------------------
+There are two boolean flag files per cpuset that control where the
+kernel allocates pages for the file system buffers and related
+in-kernel data structures.  They are called 'cpuset.memory_spread_page' and
+'cpuset.memory_spread_slab'.
+
+If the per-cpuset boolean flag file 'cpuset.memory_spread_page' is set, then
+the kernel will spread the file system buffers (page cache) evenly
+over all the nodes that the faulting task is allowed to use, instead
+of preferring to put those pages on the node where the task is running.
+
+If the per-cpuset boolean flag file 'cpuset.memory_spread_slab' is set,
+then the kernel will spread some file system related slab caches,
+such as for inodes and dentries evenly over all the nodes that the
+faulting task is allowed to use, instead of preferring to put those
+pages on the node where the task is running.
+
+The setting of these flags does not affect anonymous data segment or
+stack segment pages of a task.
+
+By default, both kinds of memory spreading are off, and memory
+pages are allocated on the node local to where the task is running,
+except perhaps as modified by the task's NUMA mempolicy or cpuset
+configuration, so long as sufficient free memory pages are available.
+
+When new cpusets are created, they inherit the memory spread settings
+of their parent.
+
+Setting memory spreading causes allocations for the affected page
+or slab caches to ignore the task's NUMA mempolicy and be spread
+instead.    Tasks using mbind() or set_mempolicy() calls to set NUMA
+mempolicies will not notice any change in these calls as a result of
+their containing task's memory spread settings.  If memory spreading
+is turned off, then the currently specified NUMA mempolicy once again
+applies to memory page allocations.
+
+Both 'cpuset.memory_spread_page' and 'cpuset.memory_spread_slab' are boolean flag
+files.  By default they contain "0", meaning that the feature is off
+for that cpuset.  If a "1" is written to that file, then that turns
+the named feature on.
+
+The implementation is simple.
+
+Setting the flag 'cpuset.memory_spread_page' turns on a per-process flag
+PFA_SPREAD_PAGE for each task that is in that cpuset or subsequently
+joins that cpuset.  The page allocation calls for the page cache
+are modified to perform an inline check for this PFA_SPREAD_PAGE task
+flag, and if set, a call to a new routine cpuset_mem_spread_node()
+returns the node to prefer for the allocation.
+
+Similarly, setting 'cpuset.memory_spread_slab' turns on the flag
+PFA_SPREAD_SLAB, and appropriately marked slab caches will allocate
+pages from the node returned by cpuset_mem_spread_node().
+
+The cpuset_mem_spread_node() routine is also simple.  It uses the
+value of a per-task rotor cpuset_mem_spread_rotor to select the next
+node in the current task's mems_allowed to prefer for the allocation.
+
+This memory placement policy is also known (in other contexts) as
+round-robin or interleave.
+
+This policy can provide substantial improvements for jobs that need
+to place thread local data on the corresponding node, but that need
+to access large file system data sets that need to be spread across
+the several nodes in the job's cpuset in order to fit.  Without this
+policy, especially for jobs that might have one thread reading in the
+data set, the memory allocation across the nodes in the job's cpuset
+can become very uneven.
+
+1.7 What is sched_load_balance ?
+--------------------------------
+
+The kernel scheduler (kernel/sched/core.c) automatically load balances
+tasks.  If one CPU is underutilized, kernel code running on that
+CPU will look for tasks on other more overloaded CPUs and move those
+tasks to itself, within the constraints of such placement mechanisms
+as cpusets and sched_setaffinity.
+
+The algorithmic cost of load balancing and its impact on key shared
+kernel data structures such as the task list increases more than
+linearly with the number of CPUs being balanced.  So the scheduler
+has support to partition the system's CPUs into a number of sched
+domains such that it only load balances within each sched domain.
+Each sched domain covers some subset of the CPUs in the system;
+no two sched domains overlap; some CPUs might not be in any sched
+domain and hence won't be load balanced.
+
+Put simply, it costs less to balance between two smaller sched domains
+than one big one, but doing so means that overloads in one of the
+two domains won't be load balanced to the other one.
+
+By default, there is one sched domain covering all CPUs, including those
+marked isolated using the kernel boot time "isolcpus=" argument. However,
+the isolated CPUs will not participate in load balancing, and will not
+have tasks running on them unless explicitly assigned.
+
+This default load balancing across all CPUs is not well suited for
+the following two situations:
+
+ 1) On large systems, load balancing across many CPUs is expensive.
+    If the system is managed using cpusets to place independent jobs
+    on separate sets of CPUs, full load balancing is unnecessary.
+ 2) Systems supporting realtime on some CPUs need to minimize
+    system overhead on those CPUs, including avoiding task load
+    balancing if that is not needed.
+
+When the per-cpuset flag "cpuset.sched_load_balance" is enabled (the default
+setting), it requests that all the CPUs in that cpuset's allowed 'cpuset.cpus'
+be contained in a single sched domain, ensuring that load balancing
+can move a task (not otherwise pinned, as by sched_setaffinity)
+from any CPU in that cpuset to any other.
+
+When the per-cpuset flag "cpuset.sched_load_balance" is disabled, then the
+scheduler will avoid load balancing across the CPUs in that cpuset,
+--except-- in so far as is necessary because some overlapping cpuset
+has "sched_load_balance" enabled.
+
+So, for example, if the top cpuset has the flag "cpuset.sched_load_balance"
+enabled, then the scheduler will have one sched domain covering all
+CPUs, and the setting of the "cpuset.sched_load_balance" flag in any other
+cpusets won't matter, as we're already fully load balancing.
+
+Therefore in the above two situations, the top cpuset flag
+"cpuset.sched_load_balance" should be disabled, and only some of the smaller,
+child cpusets have this flag enabled.
+
+When doing this, you don't usually want to leave any unpinned tasks in
+the top cpuset that might use non-trivial amounts of CPU, as such tasks
+may be artificially constrained to some subset of CPUs, depending on
+the particulars of this flag setting in descendant cpusets.  Even if
+such a task could use spare CPU cycles in some other CPUs, the kernel
+scheduler might not consider the possibility of load balancing that
+task to that underused CPU.
+
+Of course, tasks pinned to a particular CPU can be left in a cpuset
+that disables "cpuset.sched_load_balance" as those tasks aren't going anywhere
+else anyway.
+
+There is an impedance mismatch here, between cpusets and sched domains.
+Cpusets are hierarchical and nest.  Sched domains are flat; they don't
+overlap and each CPU is in at most one sched domain.
+
+It is necessary for sched domains to be flat because load balancing
+across partially overlapping sets of CPUs would risk unstable dynamics
+that would be beyond our understanding.  So if each of two partially
+overlapping cpusets enables the flag 'cpuset.sched_load_balance', then we
+form a single sched domain that is a superset of both.  We won't move
+a task to a CPU outside its cpuset, but the scheduler load balancing
+code might waste some compute cycles considering that possibility.
+
+This mismatch is why there is not a simple one-to-one relation
+between which cpusets have the flag "cpuset.sched_load_balance" enabled,
+and the sched domain configuration.  If a cpuset enables the flag, it
+will get balancing across all its CPUs, but if it disables the flag,
+it will only be assured of no load balancing if no other overlapping
+cpuset enables the flag.
+
+If two cpusets have partially overlapping 'cpuset.cpus' allowed, and only
+one of them has this flag enabled, then the other may find its
+tasks only partially load balanced, just on the overlapping CPUs.
+This is just the general case of the top_cpuset example given a few
+paragraphs above.  In the general case, as in the top cpuset case,
+don't leave tasks that might use non-trivial amounts of CPU in
+such partially load balanced cpusets, as they may be artificially
+constrained to some subset of the CPUs allowed to them, for lack of
+load balancing to the other CPUs.
+
+CPUs in "cpuset.isolcpus" were excluded from load balancing by the
+isolcpus= kernel boot option, and will never be load balanced regardless
+of the value of "cpuset.sched_load_balance" in any cpuset.
+
+1.7.1 sched_load_balance implementation details.
+------------------------------------------------
+
+The per-cpuset flag 'cpuset.sched_load_balance' defaults to enabled (contrary
+to most cpuset flags.)  When enabled for a cpuset, the kernel will
+ensure that it can load balance across all the CPUs in that cpuset
+(makes sure that all the CPUs in the cpus_allowed of that cpuset are
+in the same sched domain.)
+
+If two overlapping cpusets both have 'cpuset.sched_load_balance' enabled,
+then they will be (must be) both in the same sched domain.
+
+If, as is the default, the top cpuset has 'cpuset.sched_load_balance' enabled,
+then by the above that means there is a single sched domain covering
+the whole system, regardless of any other cpuset settings.
+
+The kernel commits to user space that it will avoid load balancing
+where it can.  It will pick as fine a granularity partition of sched
+domains as it can while still providing load balancing for any set
+of CPUs allowed to a cpuset having 'cpuset.sched_load_balance' enabled.
+
+The internal kernel cpuset to scheduler interface passes from the
+cpuset code to the scheduler code a partition of the load balanced
+CPUs in the system. This partition is a set of subsets (represented
+as an array of struct cpumask) of CPUs, pairwise disjoint, that cover
+all the CPUs that must be load balanced.
+
+The cpuset code builds a new such partition and passes it to the
+scheduler sched domain setup code, to have the sched domains rebuilt
+as necessary, whenever:
+
+ - the 'cpuset.sched_load_balance' flag of a cpuset with non-empty CPUs changes,
+ - or CPUs come or go from a cpuset with this flag enabled,
+ - or 'cpuset.sched_relax_domain_level' value of a cpuset with non-empty CPUs
+   and with this flag enabled changes,
+ - or a cpuset with non-empty CPUs and with this flag enabled is removed,
+ - or a cpu is offlined/onlined.
+
+This partition exactly defines what sched domains the scheduler should
+set up - one sched domain for each element (struct cpumask) in the
+partition.
+
+The scheduler remembers the currently active sched domain partitions.
+When the scheduler routine partition_sched_domains() is invoked from
+the cpuset code to update these sched domains, it compares the new
+partition requested with the current, and updates its sched domains,
+removing the old and adding the new, for each change.
+
+
+1.8 What is sched_relax_domain_level ?
+--------------------------------------
+
+In a sched domain, the scheduler migrates tasks in 2 ways: periodic load
+balancing on tick, and at the time of some scheduling events.
+
+When a task is woken up, the scheduler tries to move the task to an idle
+CPU.  For example, if a task A running on CPU X activates another task B
+on the same CPU X, and if CPU Y is X's sibling and is idle, then the
+scheduler migrates task B to CPU Y so that task B can start on CPU Y
+without waiting for task A on CPU X.
+
+And if a CPU runs out of tasks in its runqueue, the CPU tries to pull
+extra tasks from other busy CPUs to help them before it is going to
+be idle.
+
+Of course, since it takes some searching cost to find movable tasks and/or
+idle CPUs, the scheduler might not search all CPUs in the domain
+every time.  In fact, on some architectures, the searching range on
+events is limited to the same socket or node where the CPU is located,
+while the load balance on tick searches all.
+
+For example, assume CPU Z is relatively far from CPU X.  Even if CPU Z
+is idle while CPU X and its siblings are busy, the scheduler can't migrate
+woken task B from X to Z since it is out of its searching range.
+As a result, task B on CPU X needs to wait for task A or wait for load
+balancing on the next tick.  For some applications in special situations,
+waiting 1 tick may be too long.
+
+The 'cpuset.sched_relax_domain_level' file allows you to request changing
+this searching range as you like.  This file takes an int value which
+indicates the size of the searching range in levels, ideally as follows;
+the initial value is -1, which indicates that the cpuset has no request.
+
+====== ===========================================================
+  -1   no request. use system default or follow request of others.
+   0   no search.
+   1   search siblings (hyperthreads in a core).
+   2   search cores in a package.
+   3   search cpus in a node [= system wide on non-NUMA system]
+   4   search nodes in a chunk of node [on NUMA system]
+   5   search system wide [on NUMA system]
+====== ===========================================================
+
+The system default is architecture dependent.  The system default
+can be changed using the relax_domain_level= boot parameter.
+
+This file is per-cpuset and affects the sched domain to which the cpuset
+belongs.  Therefore if the flag 'cpuset.sched_load_balance' of a cpuset
+is disabled, then 'cpuset.sched_relax_domain_level' has no effect since
+there is no sched domain belonging to the cpuset.
+
+If multiple cpusets are overlapping and hence they form a single sched
+domain, the largest value among those is used.  Be careful, if one
+requests 0 and others are -1 then 0 is used.
+
+Note that modifying this file will have both good and bad effects,
+and whether it is acceptable or not depends on your situation.
+Don't modify this file if you are not sure.
+
+If your situation is:
+
+ - The migration costs between each cpu can be assumed considerably
+   small (for you) due to your special application's behavior or
+   special hardware support for CPU cache etc.
+ - The searching cost doesn't have an impact (for you) or you can make
+   the searching cost small enough by managing cpuset to compact etc.
+ - The latency is required even if it sacrifices cache hit rate etc.
+   then increasing 'sched_relax_domain_level' would benefit you.
+
+
+1.9 How do I use cpusets ?
+--------------------------
+
+In order to minimize the impact of cpusets on critical kernel
+code, such as the scheduler, and due to the fact that the kernel
+does not support one task updating the memory placement of another
+task directly, the impact on a task of changing its cpuset CPU
+or Memory Node placement, or of changing to which cpuset a task
+is attached, is subtle.
+
+If a cpuset has its Memory Nodes modified, then for each task attached
+to that cpuset, the next time that the kernel attempts to allocate
+a page of memory for that task, the kernel will notice the change
+in the task's cpuset, and update its per-task memory placement to
+remain within the new cpuset's memory placement.  If the task was using
+mempolicy MPOL_BIND, and the nodes to which it was bound overlap with
+its new cpuset, then the task will continue to use whatever subset
+of MPOL_BIND nodes are still allowed in the new cpuset.  If the task
+was using MPOL_BIND and now none of its MPOL_BIND nodes are allowed
+in the new cpuset, then the task will be essentially treated as if it
+was MPOL_BIND bound to the new cpuset (even though its NUMA placement,
+as queried by get_mempolicy(), doesn't change).  If a task is moved
+from one cpuset to another, then the kernel will adjust the task's
+memory placement, as above, the next time that the kernel attempts
+to allocate a page of memory for that task.
+
+If a cpuset has its 'cpuset.cpus' modified, then each task in that cpuset
+will have its allowed CPU placement changed immediately.  Similarly,
+if a task's pid is written to another cpuset's 'tasks' file, then its
+allowed CPU placement is changed immediately.  If such a task had been
+bound to some subset of its cpuset using the sched_setaffinity() call,
+the task will be allowed to run on any CPU allowed in its new cpuset,
+negating the effect of the prior sched_setaffinity() call.
+
+In summary, the memory placement of a task whose cpuset is changed is
+updated by the kernel, on the next allocation of a page for that task,
+and the processor placement is updated immediately.
+
+Normally, once a page is allocated (given a physical page
+of main memory) then that page stays on whatever node it
+was allocated, so long as it remains allocated, even if the
+cpuset's memory placement policy 'cpuset.mems' subsequently changes.
+If the cpuset flag file 'cpuset.memory_migrate' is set true, then when
+tasks are attached to that cpuset, any pages that task had
+allocated to it on nodes in its previous cpuset are migrated
+to the task's new cpuset. The relative placement of the page within
+the cpuset is preserved during these migration operations if possible.
+For example if the page was on the second valid node of the prior cpuset
+then the page will be placed on the second valid node of the new cpuset.
+
+Also if 'cpuset.memory_migrate' is set true, then if that cpuset's
+'cpuset.mems' file is modified, pages allocated to tasks in that
+cpuset, that were on nodes in the previous setting of 'cpuset.mems',
+will be moved to nodes in the new setting of 'cpuset.mems'.
+Pages that were not in the task's prior cpuset, or in the cpuset's
+prior 'cpuset.mems' setting, will not be moved.
+
+There is an exception to the above.  If hotplug functionality is used
+to remove all the CPUs that are currently assigned to a cpuset,
+then all the tasks in that cpuset will be moved to the nearest ancestor
+with non-empty cpus.  But the moving of some (or all) tasks might fail if
+cpuset is bound with another cgroup subsystem which has some restrictions
+on task attaching.  In this failing case, those tasks will stay
+in the original cpuset, and the kernel will automatically update
+their cpus_allowed to allow all online CPUs.  When memory hotplug
+functionality for removing Memory Nodes is available, a similar exception
+is expected to apply there as well.  In general, the kernel prefers to
+violate cpuset placement, over starving a task that has had all
+its allowed CPUs or Memory Nodes taken offline.
+
+There is a second exception to the above.  GFP_ATOMIC requests are
+kernel internal allocations that must be satisfied immediately.
+The kernel may drop some request, in rare cases even panic, if a
+GFP_ATOMIC alloc fails.  If the request cannot be satisfied within
+the current task's cpuset, then we relax the cpuset, and look for
+memory anywhere we can find it.  It's better to violate the cpuset
+than stress the kernel.
+
+To start a new job that is to be contained within a cpuset, the steps are:
+
+ 1) mkdir /sys/fs/cgroup/cpuset
+ 2) mount -t cgroup -ocpuset cpuset /sys/fs/cgroup/cpuset
+ 3) Create the new cpuset by doing mkdir's and write's (or echo's) in
+    the /sys/fs/cgroup/cpuset virtual file system.
+ 4) Start a task that will be the "founding father" of the new job.
+ 5) Attach that task to the new cpuset by writing its pid to the
+    /sys/fs/cgroup/cpuset tasks file for that cpuset.
+ 6) fork, exec or clone the job tasks from this founding father task.
+
+For example, the following sequence of commands will set up a cpuset
+named "Charlie", containing just CPUs 2 and 3, and Memory Node 1,
+and then start a subshell 'sh' in that cpuset::
+
+  mount -t cgroup -ocpuset cpuset /sys/fs/cgroup/cpuset
+  cd /sys/fs/cgroup/cpuset
+  mkdir Charlie
+  cd Charlie
+  /bin/echo 2-3 > cpuset.cpus
+  /bin/echo 1 > cpuset.mems
+  /bin/echo $$ > tasks
+  sh
+  # The subshell 'sh' is now running in cpuset Charlie
+  # The next line should display '/Charlie'
+  cat /proc/self/cpuset
+
+There are ways to query or modify cpusets:
+
+ - via the cpuset file system directly, using the various cd, mkdir, echo,
+   cat, rmdir commands from the shell, or their equivalent from C.
+ - via the C library libcpuset.
+ - via the C library libcgroup.
+   (http://sourceforge.net/projects/libcg/)
+ - via the python application cset.
+   (http://code.google.com/p/cpuset/)
+
+The sched_setaffinity calls can also be done at the shell prompt using
+SGI's runon or Robert Love's taskset.  The mbind and set_mempolicy
+calls can be done at the shell prompt using the numactl command
+(part of Andi Kleen's numa package).
+
+2. Usage Examples and Syntax
+============================
+
+2.1 Basic Usage
+---------------
+
+Creating, modifying, using the cpusets can be done through the cpuset
+virtual filesystem.
+
+To mount it, type:
+``# mount -t cgroup -o cpuset cpuset /sys/fs/cgroup/cpuset``
+
+Then under /sys/fs/cgroup/cpuset you can find a tree that corresponds to the
+tree of the cpusets in the system. For instance, /sys/fs/cgroup/cpuset
+is the cpuset that holds the whole system.
+
+If you want to create a new cpuset under /sys/fs/cgroup/cpuset::
+
+  # cd /sys/fs/cgroup/cpuset
+  # mkdir my_cpuset
+
+Now you want to do something with this cpuset::
+
+  # cd my_cpuset
+
+In this directory you can find several files::
+
+  # ls
+  cgroup.clone_children  cpuset.memory_pressure
+  cgroup.event_control   cpuset.memory_spread_page
+  cgroup.procs           cpuset.memory_spread_slab
+  cpuset.cpu_exclusive   cpuset.mems
+  cpuset.cpus            cpuset.sched_load_balance
+  cpuset.mem_exclusive   cpuset.sched_relax_domain_level
+  cpuset.mem_hardwall    notify_on_release
+  cpuset.memory_migrate  tasks
+
+Reading them will give you information about the state of this cpuset:
+the CPUs and Memory Nodes it can use, the processes that are using
+it, its properties.  By writing to these files you can manipulate
+the cpuset.
+
+Set some flags::
+
+  # /bin/echo 1 > cpuset.cpu_exclusive
+
+Add some cpus::
+
+  # /bin/echo 0-7 > cpuset.cpus
+
+Add some mems::
+
+  # /bin/echo 0-7 > cpuset.mems
+
+Now attach your shell to this cpuset::
+
+  # /bin/echo $$ > tasks
+
+You can also create cpusets inside your cpuset by using mkdir in this
+directory::
+
+  # mkdir my_sub_cs
+
+To remove a cpuset, just use rmdir::
+
+  # rmdir my_sub_cs
+
+This will fail if the cpuset is in use (has cpusets inside, or has
+processes attached).
+
+Note that for legacy reasons, the "cpuset" filesystem exists as a
+wrapper around the cgroup filesystem.
+
+The command::
+
+  mount -t cpuset X /sys/fs/cgroup/cpuset
+
+is equivalent to::
+
+  mount -t cgroup -ocpuset,noprefix X /sys/fs/cgroup/cpuset
+  echo "/sbin/cpuset_release_agent" > /sys/fs/cgroup/cpuset/release_agent
+
+2.2 Adding/removing cpus
+------------------------
+
+This is the syntax to use when writing in the cpus or mems files
+in cpuset directories::
+
+  # /bin/echo 1-4 > cpuset.cpus		-> set cpus list to cpus 1,2,3,4
+  # /bin/echo 1,2,3,4 > cpuset.cpus	-> set cpus list to cpus 1,2,3,4
+
+To add a CPU to a cpuset, write the new list of CPUs including the
+CPU to be added. To add 6 to the above cpuset::
+
+  # /bin/echo 1-4,6 > cpuset.cpus	-> set cpus list to cpus 1,2,3,4,6
+
+Similarly to remove a CPU from a cpuset, write the new list of CPUs
+without the CPU to be removed.
+
+To remove all the CPUs::
+
+  # /bin/echo "" > cpuset.cpus		-> clear cpus list
+
+2.3 Setting flags
+-----------------
+
+The syntax is very simple::
+
+  # /bin/echo 1 > cpuset.cpu_exclusive 	-> set flag 'cpuset.cpu_exclusive'
+  # /bin/echo 0 > cpuset.cpu_exclusive 	-> unset flag 'cpuset.cpu_exclusive'
+
+2.4 Attaching processes
+-----------------------
+
+::
+
+  # /bin/echo PID > tasks
+
+Note that it is PID, not PIDs. You can only attach ONE task at a time.
+If you have several tasks to attach, you have to do it one after another::
+
+  # /bin/echo PID1 > tasks
+  # /bin/echo PID2 > tasks
+	...
+  # /bin/echo PIDn > tasks
+
+
+3. Questions
+============
+
+Q:
+   What's up with this '/bin/echo'?
+
+A:
+   bash's builtin 'echo' command does not check calls to write() against
+   errors. If you use it in the cpuset file system, you won't be
+   able to tell whether a command succeeded or failed.
+
+Q:
+   When I attach processes, only the first one on the line really gets attached!
+
+A:
+   We can only return one error code per call to write(). So you should
+   write only ONE PID per call.
+
+4. Contact
+==========
+
+Web: http://www.bullopensource.org/cpuset
diff --git a/Documentation/admin-guide/cgroup-v1/devices.rst b/Documentation/admin-guide/cgroup-v1/devices.rst
new file mode 100644
index 000000000000..e1886783961e
--- /dev/null
+++ b/Documentation/admin-guide/cgroup-v1/devices.rst
@@ -0,0 +1,132 @@
+===========================
+Device Whitelist Controller
+===========================
+
+1. Description
+==============
+
+Implement a cgroup to track and enforce open and mknod restrictions
+on device files.  A device cgroup associates a device access
+whitelist with each cgroup.  A whitelist entry has 4 fields.
+'type' is a (all), c (char), or b (block).  'all' means it applies
+to all types and all major and minor numbers.  Major and minor are
+either an integer or * for all.  Access is a composition of r
+(read), w (write), and m (mknod).
+
+The root device cgroup starts with rwm to 'all'.  A child device
+cgroup gets a copy of the parent.  Administrators can then remove
+devices from the whitelist or add new entries.  A child cgroup can
+never receive a device access which is denied by its parent.
+
+2. User Interface
+=================
+
+An entry is added using devices.allow, and removed using
+devices.deny.  For instance::
+
+	echo 'c 1:3 mr' > /sys/fs/cgroup/1/devices.allow
+
+allows cgroup 1 to read and mknod the device usually known as
+/dev/null.  Doing::
+
+	echo a > /sys/fs/cgroup/1/devices.deny
+
+will remove the default 'a *:* rwm' entry. Doing::
+
+	echo a > /sys/fs/cgroup/1/devices.allow
+
+will add the 'a *:* rwm' entry to the whitelist.
+
+3. Security
+===========
+
+Any task can move itself between cgroups.  This clearly won't
+suffice, but we can decide the best way to adequately restrict
+movement as people get some experience with this.  We may just want
+to require CAP_SYS_ADMIN, which at least is a separate bit from
+CAP_MKNOD.  We may want to just refuse moving to a cgroup which
+isn't a descendant of the current one.  Or we may want to use
+CAP_MAC_ADMIN, since we really are trying to lock down root.
+
+CAP_SYS_ADMIN is needed to modify the whitelist or move another
+task to a new cgroup.  (Again, we'll probably want to change that.)
+
+A cgroup may not be granted more permissions than the cgroup's
+parent has.
+
+4. Hierarchy
+============
+
+Device cgroups maintain hierarchy by making sure a cgroup never has more
+access permissions than its parent.  Every time an entry is written to
+a cgroup's devices.deny file, all its children will have that entry removed
+from their whitelist and all the locally set whitelist entries will be
+re-evaluated.  In case one of the locally set whitelist entries would provide
+more access than the cgroup's parent, it'll be removed from the whitelist.
+
+Example::
+
+      A
+     / \
+        B
+
+    group        behavior	exceptions
+    A            allow		"b 8:* rwm", "c 116:1 rw"
+    B            deny		"c 1:3 rwm", "c 116:2 rwm", "b 3:* rwm"
+
+If a device is denied in group A::
+
+	# echo "c 116:* r" > A/devices.deny
+
+it'll propagate down and after revalidating B's entries, the whitelist entry
+"c 116:2 rwm" will be removed::
+
+    group        whitelist entries                        denied devices
+    A            all                                      "b 8:* rwm", "c 116:* rw"
+    B            "c 1:3 rwm", "b 3:* rwm"                 all the rest
+
+In case parent's exceptions change and local exceptions are not allowed
+anymore, they'll be deleted.
+
+Notice that new whitelist entries will not be propagated::
+
+      A
+     / \
+        B
+
+    group        whitelist entries                        denied devices
+    A            "c 1:3 rwm", "c 1:5 r"                   all the rest
+    B            "c 1:3 rwm", "c 1:5 r"                   all the rest
+
+when adding ``c *:3 rwm``::
+
+	# echo "c *:3 rwm" >A/devices.allow
+
+the result::
+
+    group        whitelist entries                        denied devices
+    A            "c *:3 rwm", "c 1:5 r"                   all the rest
+    B            "c 1:3 rwm", "c 1:5 r"                   all the rest
+
+but now it'll be possible to add new entries to B::
+
+	# echo "c 2:3 rwm" >B/devices.allow
+	# echo "c 50:3 r" >B/devices.allow
+
+or even::
+
+	# echo "c *:3 rwm" >B/devices.allow
+
+Allowing or denying all by writing 'a' to devices.allow or devices.deny will
+not be possible once the device cgroup has children.
+
+4.1 Hierarchy (internal implementation)
+---------------------------------------
+
+Device cgroups are implemented internally using a behavior (ALLOW, DENY) and a
+list of exceptions.  The internal state is controlled using the same user
+interface to preserve compatibility with the previous whitelist-only
+implementation.  Removal or addition of exceptions that will reduce the access
+to devices will be propagated down the hierarchy.
+For every propagated exception, the effective rules will be re-evaluated based
+on current parent's access rules.
diff --git a/Documentation/admin-guide/cgroup-v1/freezer-subsystem.rst b/Documentation/admin-guide/cgroup-v1/freezer-subsystem.rst
new file mode 100644
index 000000000000..582d3427de3f
--- /dev/null
+++ b/Documentation/admin-guide/cgroup-v1/freezer-subsystem.rst
@@ -0,0 +1,127 @@
+==============
+Cgroup Freezer
+==============
+
+The cgroup freezer is useful to batch job management systems which start
+and stop sets of tasks in order to schedule the resources of a machine
+according to the desires of a system administrator. This sort of program
+is often used on HPC clusters to schedule access to the cluster as a
+whole. The cgroup freezer uses cgroups to describe the set of tasks to
+be started/stopped by the batch job management system. It also provides
+a means to start and stop the tasks composing the job.
+
+The cgroup freezer will also be useful for checkpointing running groups
+of tasks. The freezer allows the checkpoint code to obtain a consistent
+image of the tasks by attempting to force the tasks in a cgroup into a
+quiescent state. Once the tasks are quiescent another task can
+walk /proc or invoke a kernel interface to gather information about the
+quiesced tasks. Checkpointed tasks can be restarted later should a
+recoverable error occur. This also allows the checkpointed tasks to be
+migrated between nodes in a cluster by copying the gathered information
+to another node and restarting the tasks there.
+
+Sequences of SIGSTOP and SIGCONT are not always sufficient for stopping
+and resuming tasks in userspace. Both of these signals are observable
+from within the tasks we wish to freeze. While SIGSTOP cannot be caught,
+blocked, or ignored, it can be seen by waiting or ptracing parent tasks.
+SIGCONT is especially unsuitable since it can be caught by the task. Any
+programs designed to watch for SIGSTOP and SIGCONT could be broken by
+attempting to use SIGSTOP and SIGCONT to stop and resume tasks. We can
+demonstrate this problem using nested bash shells::
+
+	$ echo $$
+	16644
+	$ bash
+	$ echo $$
+	16690
+
+	From a second, unrelated bash shell:
+	$ kill -SIGSTOP 16690
+	$ kill -SIGCONT 16690
+
+	<at this point 16690 exits and causes 16644 to exit too>
+
+This happens because bash can observe both signals and choose how it
+responds to them.
+
+Another example of a program which catches and responds to these
+signals is gdb. In fact any program designed to use ptrace is likely to
+have a problem with this method of stopping and resuming tasks.
+
+In contrast, the cgroup freezer uses the kernel freezer code to
+prevent the freeze/unfreeze cycle from becoming visible to the tasks
+being frozen. This allows the bash example above and gdb to run as
+expected.
+
+The cgroup freezer is hierarchical. Freezing a cgroup freezes all
+tasks belonging to the cgroup and all its descendant cgroups. Each
+cgroup has its own state (self-state) and the state inherited from the
+parent (parent-state). Iff both states are THAWED, the cgroup is
+THAWED.
+
+The following cgroupfs files are created by cgroup freezer.
+
+* freezer.state: Read-write.
+
+  When read, returns the effective state of the cgroup - "THAWED",
+  "FREEZING" or "FROZEN". This is the combined self and parent-states.
+  If any is freezing, the cgroup is freezing (FREEZING or FROZEN).
+
+  FREEZING cgroup transitions into FROZEN state when all tasks
+  belonging to the cgroup and its descendants become frozen. Note that
+  a cgroup reverts to FREEZING from FROZEN after a new task is added
+  to the cgroup or one of its descendant cgroups until the new task is
+  frozen.
+
+  When written, sets the self-state of the cgroup. Two values are
+  allowed - "FROZEN" and "THAWED". If FROZEN is written, the cgroup,
+  if not already freezing, enters FREEZING state along with all its
+  descendant cgroups.
+
+  If THAWED is written, the self-state of the cgroup is changed to
+  THAWED.  Note that the effective state may not change to THAWED if
+  the parent-state is still freezing. If a cgroup's effective state
+  becomes THAWED, all its descendants which are freezing because of
+  the cgroup also leave the freezing state.
+
+* freezer.self_freezing: Read only.
+
+  Shows the self-state. 0 if the self-state is THAWED; otherwise, 1.
+  This value is 1 iff the last write to freezer.state was "FROZEN".
+
+* freezer.parent_freezing: Read only.
+
+  Shows the parent-state.  0 if none of the cgroup's ancestors is
+  frozen; otherwise, 1.
+
+The root cgroup is non-freezable and the above interface files don't
+exist.
+
+* Examples of usage::
+
+   # mkdir /sys/fs/cgroup/freezer
+   # mount -t cgroup -ofreezer freezer /sys/fs/cgroup/freezer
+   # mkdir /sys/fs/cgroup/freezer/0
+   # echo $some_pid > /sys/fs/cgroup/freezer/0/tasks
+
+to get status of the freezer subsystem::
+
+   # cat /sys/fs/cgroup/freezer/0/freezer.state
+   THAWED
+
+to freeze all tasks in the container::
+
+   # echo FROZEN > /sys/fs/cgroup/freezer/0/freezer.state
+   # cat /sys/fs/cgroup/freezer/0/freezer.state
+   FREEZING
+   # cat /sys/fs/cgroup/freezer/0/freezer.state
+   FROZEN
+
+to unfreeze all tasks in the container::
+
+   # echo THAWED > /sys/fs/cgroup/freezer/0/freezer.state
+   # cat /sys/fs/cgroup/freezer/0/freezer.state
+   THAWED
+
+This is the basic mechanism which should do the right thing for user space tasks
+in a simple scenario.
diff --git a/Documentation/admin-guide/cgroup-v1/hugetlb.rst b/Documentation/admin-guide/cgroup-v1/hugetlb.rst
new file mode 100644
index 000000000000..a3902aa253a9
--- /dev/null
+++ b/Documentation/admin-guide/cgroup-v1/hugetlb.rst
@@ -0,0 +1,50 @@
+==================
+HugeTLB Controller
+==================
+
+The HugeTLB controller allows limiting the HugeTLB usage per control group and
+enforces the controller limit during page fault. Since HugeTLB doesn't
+support page reclaim, enforcing the limit at page fault time implies that
+the application will get a SIGBUS signal if it tries to access HugeTLB pages
+beyond its limit. This requires the application to know beforehand how many
+HugeTLB pages it would require for its use.
+
+HugeTLB controller can be created by first mounting the cgroup filesystem::
+
+  # mount -t cgroup -o hugetlb none /sys/fs/cgroup
+
+With the above step, the initial or the parent HugeTLB group becomes
+visible at /sys/fs/cgroup. At bootup, this group includes all the tasks in
+the system. /sys/fs/cgroup/tasks lists the tasks in this cgroup.
+
+New groups can be created under the parent group /sys/fs/cgroup::
+
+  # cd /sys/fs/cgroup
+  # mkdir g1
+  # echo $$ > g1/tasks
+
+The above steps create a new group g1 and move the current shell
+process (bash) into it.
+
+Brief summary of control files::
+
+ hugetlb.<hugepagesize>.limit_in_bytes     # set/show limit of "hugepagesize" hugetlb usage
+ hugetlb.<hugepagesize>.max_usage_in_bytes # show max "hugepagesize" hugetlb  usage recorded
+ hugetlb.<hugepagesize>.usage_in_bytes     # show current usage for "hugepagesize" hugetlb
+ hugetlb.<hugepagesize>.failcnt		   # show the number of allocation failures due to HugeTLB limit
+
+For a system supporting three hugepage sizes (64k, 32M and 1G), the control
+files include::
+
+  hugetlb.1GB.limit_in_bytes
+  hugetlb.1GB.max_usage_in_bytes
+  hugetlb.1GB.usage_in_bytes
+  hugetlb.1GB.failcnt
+  hugetlb.64KB.limit_in_bytes
+  hugetlb.64KB.max_usage_in_bytes
+  hugetlb.64KB.usage_in_bytes
+  hugetlb.64KB.failcnt
+  hugetlb.32MB.limit_in_bytes
+  hugetlb.32MB.max_usage_in_bytes
+  hugetlb.32MB.usage_in_bytes
+  hugetlb.32MB.failcnt
diff --git a/Documentation/admin-guide/cgroup-v1/index.rst b/Documentation/admin-guide/cgroup-v1/index.rst
new file mode 100644
index 000000000000..10bf48bae0b0
--- /dev/null
+++ b/Documentation/admin-guide/cgroup-v1/index.rst
@@ -0,0 +1,28 @@
+========================
+Control Groups version 1
+========================
+
+.. toctree::
+    :maxdepth: 1
+
+    cgroups
+
+    blkio-controller
+    cpuacct
+    cpusets
+    devices
+    freezer-subsystem
+    hugetlb
+    memcg_test
+    memory
+    net_cls
+    net_prio
+    pids
+    rdma
+
+.. only::  subproject and html
+
+   Indices
+   =======
+
+   * :ref:`genindex`
diff --git a/Documentation/admin-guide/cgroup-v1/memcg_test.rst b/Documentation/admin-guide/cgroup-v1/memcg_test.rst
new file mode 100644
index 000000000000..3f7115e07b5d
--- /dev/null
+++ b/Documentation/admin-guide/cgroup-v1/memcg_test.rst
@@ -0,0 +1,355 @@
+=====================================================
+Memory Resource Controller(Memcg) Implementation Memo
+=====================================================
+
+Last Updated: 2010/2
+
+Base Kernel Version: based on 2.6.33-rc7-mm(candidate for 34).
+
+Because VM is getting complex (one of the reasons is memcg...), memcg's behavior
+is complex. This is a document for memcg's internal behavior.
+Please note that implementation details can be changed.
+
+(*) Topics on API should be in Documentation/admin-guide/cgroup-v1/memory.rst
+
+0. How to record usage ?
+========================
+
+   2 objects are used.
+
+   page_cgroup ....an object per page.
+
+	Allocated at boot or memory hotplug. Freed at memory hot removal.
+
+   swap_cgroup ... an entry per swp_entry.
+
+	Allocated at swapon(). Freed at swapoff().
+
+   The page_cgroup has USED bit and double count against a page_cgroup never
+   occurs. swap_cgroup is used only when a charged page is swapped-out.
+
+1. Charge
+=========
+
+   a page/swp_entry may be charged (usage += PAGE_SIZE) at
+
+	mem_cgroup_try_charge()
+
+2. Uncharge
+===========
+
+  a page/swp_entry may be uncharged (usage -= PAGE_SIZE) by
+
+	mem_cgroup_uncharge()
+	  Called when a page's refcount goes down to 0.
+
+	mem_cgroup_uncharge_swap()
+	  Called when swp_entry's refcnt goes down to 0. A charge against swap
+	  disappears.
+
+3. charge-commit-cancel
+=======================
+
+	Memcg pages are charged in two steps:
+
+		- mem_cgroup_try_charge()
+		- mem_cgroup_commit_charge() or mem_cgroup_cancel_charge()
+
+	At try_charge(), there are no flags to say "this page is charged".
+	At this point, usage += PAGE_SIZE.
+
+	At commit(), the page is associated with the memcg.
+
+	At cancel(), simply usage -= PAGE_SIZE.
+
+In the explanation below, we assume CONFIG_MEM_RES_CTRL_SWAP=y.
+
+4. Anonymous
+============
+
+	Anonymous page is newly allocated at
+		  - page fault into MAP_ANONYMOUS mapping.
+		  - Copy-On-Write.
+
+	4.1 Swap-in.
+	At swap-in, the page is taken from swap-cache. There are 2 cases.
+
+	(a) If the SwapCache is newly allocated and read, it has no charges.
+	(b) If the SwapCache has been mapped by processes, it has been
+	    charged already.
+
+	4.2 Swap-out.
+	At swap-out, typical state transition is below.
+
+	(a) add to swap cache. (marked as SwapCache)
+	    swp_entry's refcnt += 1.
+	(b) fully unmapped.
+	    swp_entry's refcnt += # of ptes.
+	(c) write back to swap.
+	(d) delete from swap cache. (remove from SwapCache)
+	    swp_entry's refcnt -= 1.
+
+
+	Finally, at task exit,
+	(e) zap_pte() is called and swp_entry's refcnt -=1 -> 0.
+
+5. Page Cache
+=============
+
+	Page Cache is charged at
+	- add_to_page_cache_locked().
+
+	The logic is very clear. (About migration, see below)
+
+	Note:
+	  __remove_from_page_cache() is called by remove_from_page_cache()
+	  and __remove_mapping().
+
+6. Shmem(tmpfs) Page Cache
+===========================
+
+	The best way to understand shmem's page state transition is to read
+	mm/shmem.c.
+
+	But brief explanation of the behavior of memcg around shmem will be
+	helpful to understand the logic.
+
+	Shmem's page (just leaf page, not direct/indirect block) can be on
+
+		- radix-tree of shmem's inode.
+		- SwapCache.
+		- Both on radix-tree and SwapCache. This happens at swap-in
+		  and swap-out.
+
+	It's charged when...
+
+	- A new page is added to shmem's radix-tree.
+	- A swp page is read. (move a charge from swap_cgroup to page_cgroup)
+
+7. Page Migration
+=================
+
+	mem_cgroup_migrate()
+
+8. LRU
+======
+        Each memcg has its own private LRU. Now, its handling is under global
+	VM's control (means that it's handled under global pgdat->lru_lock).
+	Almost all routines around memcg's LRU are called by global LRU's
+	list management functions under pgdat->lru_lock.
+
+	A special function is mem_cgroup_isolate_pages(). This scans
+	memcg's private LRU and calls __isolate_lru_page() to extract a page
+	from LRU.
+
+	(By __isolate_lru_page(), the page is removed from both of global and
+	private LRU.)
+
+
+9. Typical Tests.
+=================
+
+ Tests for racy cases.
+
+9.1 Small limit to memcg.
+-------------------------
+
+	When testing racy cases, it's a good idea to set memcg's limit
+	to be very small rather than GB. Many races were found in tests under
+	xKB or xxMB limits.
+
+	(Memory behavior under GB and memory behavior under MB show very
+	different situations.)
+
+9.2 Shmem
+---------
+
+	Historically, memcg's shmem handling was poor and we saw a fair amount
+	of trouble here. This is because shmem is page-cache but can be
+	SwapCache. Testing with shmem/tmpfs is always a good test.
+
+9.3 Migration
+-------------
+
+	For NUMA, migration is another special case. For easy testing, cpuset
+	is useful. The following is a sample script to do migration::
+
+		mount -t cgroup -o cpuset none /opt/cpuset
+
+		mkdir /opt/cpuset/01
+		echo 1 > /opt/cpuset/01/cpuset.cpus
+		echo 0 > /opt/cpuset/01/cpuset.mems
+		echo 1 > /opt/cpuset/01/cpuset.memory_migrate
+		mkdir /opt/cpuset/02
+		echo 1 > /opt/cpuset/02/cpuset.cpus
+		echo 1 > /opt/cpuset/02/cpuset.mems
+		echo 1 > /opt/cpuset/02/cpuset.memory_migrate
+
+	With the above setup, when you move a task from 01 to 02, page migration
+	from node 0 to node 1 will occur. The following is a script to migrate
+	all tasks under a cpuset::
+
+		--
+		move_task()
+		{
+		for pid in $1
+		do
+			/bin/echo $pid >$2/tasks 2>/dev/null
+			echo -n $pid
+			echo -n " "
+		done
+		echo END
+		}
+
+		G1_TASK=`cat ${G1}/tasks`
+		G2_TASK=`cat ${G2}/tasks`
+		move_task "${G1_TASK}" ${G2} &
+		--
+
+9.4 Memory hotplug
+------------------
+
+	Memory hotplug is another good test.
+
+	To offline memory, do the following::
+
+		# echo offline > /sys/devices/system/memory/memoryXXX/state
+
+	(XXX is the place of memory)
+
+	This is an easy way to test page migration, too.
+
+9.5 mkdir/rmdir
+---------------
+
+	When using hierarchy, mkdir/rmdir test should be done.
+	Use tests like the following::
+
+		echo 1 >/opt/cgroup/01/memory/use_hierarchy
+		mkdir /opt/cgroup/01/child_a
+		mkdir /opt/cgroup/01/child_b
+
+		set limit to 01.
+		add limit to 01/child_b
+		run jobs under child_a and child_b
+
+	create/delete following groups at random while jobs are running::
+
+		/opt/cgroup/01/child_a/child_aa
+		/opt/cgroup/01/child_b/child_bb
+		/opt/cgroup/01/child_c
+
+	running new jobs in new group is also good.
+
+9.6 Mount with other subsystems
+-------------------------------
+
+	Mounting with other subsystems is a good test because there is a
+	race and lock dependency with other cgroup subsystems.
+
+	example::
+
+		# mount -t cgroup none /cgroup -o cpuset,memory,cpu,devices
+
+	and do task move, mkdir, rmdir etc...under this.
+
+9.7 swapoff
+-----------
+
+	Besides the management of swap being one of the complicated parts of memcg,
+	the call path of swap-in at swapoff is not the same as the usual swap-in path.
+	It's worth testing explicitly.
+
+	For example, a test like the following is good:
+
+	(Shell-A)::
+
+		# mount -t cgroup none /cgroup -o memory
+		# mkdir /cgroup/test
+		# echo 40M > /cgroup/test/memory.limit_in_bytes
+		# echo 0 > /cgroup/test/tasks
+
+	Run malloc(100M) program under this. You'll see 60M of swaps.
+
+	(Shell-B)::
+
+		# move all tasks in /cgroup/test to /cgroup
+		# /sbin/swapoff -a
+		# rmdir /cgroup/test
+		# kill malloc task.
+
+	Of course, tmpfs vs. swapoff should be tested, too.
+
+9.8 OOM-Killer
+--------------
+
+	Out-of-memory caused by memcg's limit will kill tasks under
+	the memcg. When hierarchy is used, a task under hierarchy
+	will be killed by the kernel.
+
+	In this case, panic_on_oom shouldn't be invoked and tasks
+	in other groups shouldn't be killed.
+
+	It's not difficult to cause OOM under memcg, as follows.
+
+	Case A) when you can swapoff::
+
+		#swapoff -a
+		#echo 50M > /memory.limit_in_bytes
+
+	run 51M of malloc
+
+	Case B) when you use mem+swap limitation::
+
+		#echo 50M > memory.limit_in_bytes
+		#echo 50M > memory.memsw.limit_in_bytes
+
+	run 51M of malloc
+
+9.9 Move charges at task migration
+----------------------------------
+
+	Charges associated with a task can be moved along with task migration.
+
+	(Shell-A)::
+
+		#mkdir /cgroup/A
+		#echo $$ >/cgroup/A/tasks
+
+	run some programs which use some amount of memory in /cgroup/A.
+
+	(Shell-B)::
+
+		#mkdir /cgroup/B
+		#echo 1 >/cgroup/B/memory.move_charge_at_immigrate
+		#echo "pid of the program running in group A" >/cgroup/B/tasks
+
+	You can see charges have been moved by reading ``*.usage_in_bytes`` or
+	memory.stat of both A and B.
+
+	See 8.2 of Documentation/admin-guide/cgroup-v1/memory.rst to see what value should
+	be written to move_charge_at_immigrate.
+
+9.10 Memory thresholds
+----------------------
+
+	Memory controller implements memory thresholds using cgroups notification
+	API. You can use tools/cgroup/cgroup_event_listener.c to test it.
+
+	(Shell-A) Create cgroup and run event listener::
+
+		# mkdir /cgroup/A
+		# ./cgroup_event_listener /cgroup/A/memory.usage_in_bytes 5M
+
+	(Shell-B) Add task to cgroup and try to allocate and free memory::
+
+		# echo $$ >/cgroup/A/tasks
+		# a="$(dd if=/dev/zero bs=1M count=10)"
+		# a=
+
+	You will see a message from cgroup_event_listener every time you cross
+	the thresholds.
+
+	Use /cgroup/A/memory.memsw.usage_in_bytes to test memsw thresholds.
+
+	It's a good idea to test root cgroup as well.
diff --git a/Documentation/admin-guide/cgroup-v1/memory.rst b/Documentation/admin-guide/cgroup-v1/memory.rst
new file mode 100644
index 000000000000..41bdc038dad9
--- /dev/null
+++ b/Documentation/admin-guide/cgroup-v1/memory.rst
@@ -0,0 +1,1003 @@
+==========================
+Memory Resource Controller
+==========================
+
+NOTE:
+      This document is hopelessly outdated and it asks for a complete
+      rewrite. It still contains useful information so we are keeping it
+      here but make sure to check the current code if you need a deeper
+      understanding.
+
+NOTE:
+      The Memory Resource Controller has generically been referred to as the
+      memory controller in this document. Do not confuse memory controller
+      used here with the memory controller that is used in hardware.
+
+(For editors) In this document:
+      When we mention a cgroup (cgroupfs's directory) with memory controller,
+      we call it "memory cgroup". When you see git-log and source code, you'll
+      see patch's title and function names tend to use "memcg".
+      In this document, we avoid using it.
+
+Benefits and Purpose of the memory controller
+=============================================
+
+The memory controller isolates the memory behaviour of a group of tasks
+from the rest of the system. The article on LWN [12] mentions some probable
+uses of the memory controller. The memory controller can be used to
+
+a. Isolate an application or a group of applications
+   Memory-hungry applications can be isolated and limited to a smaller
+   amount of memory.
+b. Create a cgroup with a limited amount of memory; this can be used
+   as a good alternative to booting with mem=XXXX.
+c. Virtualization solutions can control the amount of memory they want
+   to assign to a virtual machine instance.
+d. A CD/DVD burner could control the amount of memory used by the
+   rest of the system to ensure that burning does not fail due to lack
+   of available memory.
+e. There are several other use cases; find one or use the controller just
+   for fun (to learn and hack on the VM subsystem).
+
+Current Status: linux-2.6.34-mmotm(development version of 2010/April)
+
+Features:
+
+ - accounting anonymous pages, file caches, swap caches usage and limiting them.
+ - pages are linked to per-memcg LRU exclusively, and there is no global LRU.
+ - optionally, memory+swap usage can be accounted and limited.
+ - hierarchical accounting
+ - soft limit
+ - moving (recharging) account at moving a task is selectable.
+ - usage threshold notifier
+ - memory pressure notifier
+ - oom-killer disable knob and oom-notifier
+ - Root cgroup has no limit controls.
+
+ Kernel memory support is a work in progress, and the current version provides
+ basic functionality. (See Section 2.7)
+
+Brief summary of control files.
+
+==================================== ==========================================
+ tasks				     attach a task(thread) and show list of
+				     threads
+ cgroup.procs			     show list of processes
+ cgroup.event_control		     an interface for event_fd()
+ memory.usage_in_bytes		     show current usage for memory
+				     (See 5.5 for details)
+ memory.memsw.usage_in_bytes	     show current usage for memory+Swap
+				     (See 5.5 for details)
+ memory.limit_in_bytes		     set/show limit of memory usage
+ memory.memsw.limit_in_bytes	     set/show limit of memory+Swap usage
+ memory.failcnt			     show the number of memory usage hits limits
+ memory.memsw.failcnt		     show the number of memory+Swap hits limits
+ memory.max_usage_in_bytes	     show max memory usage recorded
+ memory.memsw.max_usage_in_bytes     show max memory+Swap usage recorded
+ memory.soft_limit_in_bytes	     set/show soft limit of memory usage
+ memory.stat			     show various statistics
+ memory.use_hierarchy		     set/show hierarchical account enabled
+ memory.force_empty		     trigger forced page reclaim
+ memory.pressure_level		     set memory pressure notifications
+ memory.swappiness		     set/show swappiness parameter of vmscan
+				     (See sysctl's vm.swappiness)
+ memory.move_charge_at_immigrate     set/show controls of moving charges
+ memory.oom_control		     set/show oom controls.
+ memory.numa_stat		     show the number of memory usage per numa
+				     node
+
+ memory.kmem.limit_in_bytes          set/show hard limit for kernel memory
+ memory.kmem.usage_in_bytes          show current kernel memory allocation
+ memory.kmem.failcnt                 show the number of kernel memory usage
+				     hits limits
+ memory.kmem.max_usage_in_bytes      show max kernel memory usage recorded
+
+ memory.kmem.tcp.limit_in_bytes      set/show hard limit for tcp buf memory
+ memory.kmem.tcp.usage_in_bytes      show current tcp buf memory allocation
+ memory.kmem.tcp.failcnt             show the number of tcp buf memory usage
+				     hits limits
+ memory.kmem.tcp.max_usage_in_bytes  show max tcp buf memory usage recorded
+==================================== ==========================================
+
+1. History
+==========
+
+The memory controller has a long history. A request for comments for the memory
+controller was posted by Balbir Singh [1]. At the time the RFC was posted
+there were several implementations for memory control. The goal of the
+RFC was to build consensus and agreement for the minimal features required
+for memory control. The first RSS controller was posted by Balbir Singh[2]
+in Feb 2007. Pavel Emelianov [3][4][5] has since posted three versions of the
+RSS controller. At OLS, at the resource management BoF, everyone suggested
+that we handle both page cache and RSS together. Another request was raised
+to allow user space handling of OOM. The current memory controller is
+at version 6; it combines both mapped (RSS) and unmapped Page
+Cache Control [11].
+
+2. Memory Control
+=================
+
+Memory is a unique resource in the sense that it is present in a limited
+amount. If a task requires a lot of CPU processing, the task can spread
+its processing over a period of hours, days, months or years, but with
+memory, the same physical memory needs to be reused to accomplish the task.
+
+The memory controller implementation has been divided into phases. These
+are:
+
+1. Memory controller
+2. mlock(2) controller
+3. Kernel user memory accounting and slab control
+4. user mappings length controller
+
+The memory controller is the first controller developed.
+
+2.1. Design
+-----------
+
+The core of the design is a counter called the page_counter. The
+page_counter tracks the current memory usage and limit of the group of
+processes associated with the controller. Each cgroup has a memory controller
+specific data structure (mem_cgroup) associated with it.
+
+2.2. Accounting
+---------------
+
+::
+
+		+--------------------+
+		|  mem_cgroup        |
+		|  (page_counter)    |
+		+--------------------+
+		 /            ^      \
+		/             |       \
+           +---------------+  |        +---------------+
+           | mm_struct     |  |....    | mm_struct     |
+           |               |  |        |               |
+           +---------------+  |        +---------------+
+                              |
+                              + --------------+
+                                              |
+           +---------------+           +------+--------+
+           | page          +---------->  page_cgroup|
+           |               |           |               |
+           +---------------+           +---------------+
+
+             (Figure 1: Hierarchy of Accounting)
+
+
+Figure 1 shows the important aspects of the controller
+
+1. Accounting happens per cgroup
+2. Each mm_struct knows about which cgroup it belongs to
+3. Each page has a pointer to the page_cgroup, which in turn knows the
+   cgroup it belongs to
+
+The accounting is done as follows: mem_cgroup_charge_common() is invoked to
+set up the necessary data structures and check if the cgroup that is being
+charged is over its limit. If it is, then reclaim is invoked on the cgroup.
+More details can be found in the reclaim section of this document.
+If everything goes well, a page meta-data-structure called page_cgroup is
+updated. page_cgroup has its own LRU on cgroup.
+(*) page_cgroup structure is allocated at boot/memory-hotplug time.
+
+2.2.1 Accounting details
+------------------------
+
+All mapped anon pages (RSS) and cache pages (Page Cache) are accounted.
+Some pages which are never reclaimable and will not be on the LRU
+are not accounted. We just account pages under usual VM management.
+
+RSS pages are accounted at page_fault unless they've already been accounted
+for earlier. A file page will be accounted for as Page Cache when it's
+inserted into inode (radix-tree). While it's mapped into the page tables of
+processes, duplicate accounting is carefully avoided.
+
+An RSS page is unaccounted when it's fully unmapped. A PageCache page is
+unaccounted when it's removed from radix-tree. Even if RSS pages are fully
+unmapped (by kswapd), they may exist as SwapCache in the system until they
+are really freed. Such SwapCaches are also accounted.
+A swapped-in page is not accounted until it's mapped.
+
+Note: The kernel does swapin-readahead and reads multiple swaps at once.
+This means swapped-in pages may contain pages for other tasks than a task
+causing page fault. So, we avoid accounting at swap-in I/O.
+
+At page migration, accounting information is kept.
+
+Note: we just account pages-on-LRU because our purpose is to control amount
+of used pages; not-on-LRU pages tend to be out-of-control from VM view.
+
+2.3 Shared Page Accounting
+--------------------------
+
+Shared pages are accounted on the basis of the first touch approach. The
+cgroup that first touches a page is accounted for the page. The principle
+behind this approach is that a cgroup that aggressively uses a shared
+page will eventually get charged for it (once it is uncharged from
+the cgroup that brought it in -- this will happen on memory pressure).
+
+But see section 8.2: when moving a task to another cgroup, its pages may
+be recharged to the new cgroup, if move_charge_at_immigrate has been chosen.
+
+Exception: If CONFIG_MEMCG_SWAP is not used.
+When you do swapoff and make swapped-out pages of shmem(tmpfs) to
+be backed into memory in force, charges for pages are accounted against the
+caller of swapoff rather than the users of shmem.
+
+2.4 Swap Extension (CONFIG_MEMCG_SWAP)
+--------------------------------------
+
+Swap Extension allows you to record charge for swap. A swapped-in page is
+charged back to original page allocator if possible.
+
+When swap is accounted, the following files are added.
+
+ - memory.memsw.usage_in_bytes.
+ - memory.memsw.limit_in_bytes.
+
+memsw means memory+swap. Usage of memory+swap is limited by
+memsw.limit_in_bytes.
+
+Example: Assume a system with 4G of swap. A task which allocates 6G of memory
+(by mistake) under 2G memory limitation will use all swap.
+In this case, setting memsw.limit_in_bytes=3G will prevent bad use of swap.
+By using the memsw limit, you can avoid system OOM which can be caused by swap
+shortage.
+
+**why 'memory+swap' rather than swap**
+
+The global LRU(kswapd) can swap out arbitrary pages. Swap-out means
+to move account from memory to swap...there is no change in usage of
+memory+swap. In other words, when we want to limit the usage of swap without
+affecting global LRU, memory+swap limit is better than just limiting swap from
+an OS point of view.
+
+**What happens when a cgroup hits memory.memsw.limit_in_bytes**
+
+When a cgroup hits memory.memsw.limit_in_bytes, it's useless to do swap-out
+in this cgroup. Then, swap-out will not be done by cgroup routine and file
+caches are dropped. But as mentioned above, global LRU can swap out memory
+from it for sanity of the system's memory management state. You can't forbid
+it by cgroup.
+
+2.5 Reclaim
+-----------
+
+Each cgroup maintains a per cgroup LRU which has the same structure as
+global VM. When a cgroup goes over its limit, we first try
+to reclaim memory from the cgroup so as to make space for the new
+pages that the cgroup has touched. If the reclaim is unsuccessful,
+an OOM routine is invoked to select and kill the bulkiest task in the
+cgroup. (See 10. OOM Control below.)
+
+The reclaim algorithm has not been modified for cgroups, except that
+pages that are selected for reclaiming come from the per-cgroup LRU
+list.
+
+NOTE:
+  Reclaim does not work for the root cgroup, since we cannot set any
+  limits on the root cgroup.
+
+Note2:
+  When panic_on_oom is set to "2", the whole system will panic.
+
+When oom event notifier is registered, event will be delivered.
+(See oom_control section)
+
+2.6 Locking
+-----------
+
+   lock_page_cgroup()/unlock_page_cgroup() should not be called under
+   the i_pages lock.
+
+   The other lock order is as follows:
+
+   PG_locked.
+     mm->page_table_lock
+         pgdat->lru_lock
+	   lock_page_cgroup.
+
+  In many cases, just lock_page_cgroup() is called.
+
+  per-zone-per-cgroup LRU (cgroup's private LRU) is just guarded by
+  pgdat->lru_lock, it has no lock of its own.
+
+2.7 Kernel Memory Extension (CONFIG_MEMCG_KMEM)
+-----------------------------------------------
+
+With the Kernel memory extension, the Memory Controller is able to limit
+the amount of kernel memory used by the system. Kernel memory is fundamentally
+different than user memory, since it can't be swapped out, which makes it
+possible to DoS the system by consuming too much of this precious resource.
+
+Kernel memory accounting is enabled for all memory cgroups by default. But
+it can be disabled system-wide by passing cgroup.memory=nokmem to the kernel
+at boot time. In this case, kernel memory will not be accounted at all.
+
+Kernel memory limits are not imposed for the root cgroup. Usage for the root
+cgroup may or may not be accounted. The memory used is accumulated into
+memory.kmem.usage_in_bytes, or in a separate counter when it makes sense.
+(currently only for tcp).
+
+The main "kmem" counter is fed into the main counter, so kmem charges will
+also be visible from the user counter.
+
+Currently no soft limit is implemented for kernel memory. It is future work
+to trigger slab reclaim when those limits are reached.
+
+2.7.1 Current Kernel Memory resources accounted
+-----------------------------------------------
+
+stack pages:
+  every process consumes some stack pages. By accounting into
+  kernel memory, we prevent new processes from being created when the kernel
+  memory usage is too high.
+
+slab pages:
+  pages allocated by the SLAB or SLUB allocator are tracked. A copy
+  of each kmem_cache is created the first time the cache is touched
+  from inside the memcg. The creation is done lazily, so some objects can still be
+  skipped while the cache is being created. All objects in a slab page should
+  belong to the same memcg. This only fails to hold when a task is migrated to a
+  different memcg during the page allocation by the cache.
+
+sockets memory pressure:
+  some sockets protocols have memory pressure
+  thresholds. The Memory Controller allows them to be controlled individually
+  per cgroup, instead of globally.
+
+tcp memory pressure:
+  sockets memory pressure for the tcp protocol.
+
+2.7.2 Common use cases
+----------------------
+
+Because the "kmem" counter is fed to the main user counter, kernel memory can
+never be limited completely independently of user memory. Say "U" is the user
+limit, and "K" the kernel limit. There are three possible ways limits can be
+set:
+
+U != 0, K = unlimited:
+    This is the standard memcg limitation mechanism already present before kmem
+    accounting. Kernel memory is completely ignored.
+
+U != 0, K < U:
+    Kernel memory is a subset of the user memory. This setup is useful in
+    deployments where the total amount of memory per-cgroup is overcommitted.
+    Overcommitting kernel memory limits is definitely not recommended, since the
+    box can still run out of non-reclaimable memory.
+    In this case, the admin could set up K so that the sum of all groups is
+    never greater than the total memory, and freely set U at the cost of his
+    QoS.
+
+WARNING:
+    In the current implementation, memory reclaim will NOT be
+    triggered for a cgroup when it hits K while staying below U, which makes
+    this setup impractical.
+
+U != 0, K >= U:
+    Since kmem charges will also be fed to the user counter, reclaim will be
+    triggered for the cgroup for both kinds of memory. This setup gives the
+    admin a unified view of memory, and it is also useful for people who just
+    want to track kernel memory usage.
+
+3. User Interface
+=================
+
+3.0. Configuration
+------------------
+
+a. Enable CONFIG_CGROUPS
+b. Enable CONFIG_MEMCG
+c. Enable CONFIG_MEMCG_SWAP (to use swap extension)
+d. Enable CONFIG_MEMCG_KMEM (to use kmem extension)
+
+3.1. Prepare the cgroups (see cgroups.txt, Why are cgroups needed?)
+-------------------------------------------------------------------
+
+::
+
+	# mount -t tmpfs none /sys/fs/cgroup
+	# mkdir /sys/fs/cgroup/memory
+	# mount -t cgroup none /sys/fs/cgroup/memory -o memory
+
+3.2. Make the new group and move bash into it::
+
+	# mkdir /sys/fs/cgroup/memory/0
+	# echo $$ > /sys/fs/cgroup/memory/0/tasks
+
+Since now we're in the 0 cgroup, we can alter the memory limit::
+
+	# echo 4M > /sys/fs/cgroup/memory/0/memory.limit_in_bytes
+
+NOTE:
+  We can use a suffix (k, K, m, M, g or G) to indicate values in kilo,
+  mega or gigabytes. (Here, Kilo, Mega, Giga are Kibibytes, Mebibytes,
+  Gibibytes.)
+
+NOTE:
+  We can write "-1" to reset the ``*.limit_in_bytes(unlimited)``.
+
+NOTE:
+  We cannot set limits on the root cgroup any more.
+
+::
+
+  # cat /sys/fs/cgroup/memory/0/memory.limit_in_bytes
+  4194304
+
+We can check the usage::
+
+  # cat /sys/fs/cgroup/memory/0/memory.usage_in_bytes
+  1216512
+
+A successful write to this file does not guarantee a successful setting of
+this limit to the value written into the file. This can be due to a
+number of factors, such as rounding up to page boundaries or the total
+availability of memory on the system. The user is required to re-read
+this file after a write to guarantee the value committed by the kernel::
+
+  # echo 1 > memory.limit_in_bytes
+  # cat memory.limit_in_bytes
+  4096
+
+The memory.failcnt field gives the number of times that the cgroup limit was
+exceeded.
+
+The memory.stat file gives accounting information. Now, the number of
+caches, RSS and Active pages/Inactive pages are shown.
+
+4. Testing
+==========
+
+For testing features and implementation, see memcg_test.txt.
+
+Performance test is also important. To see pure memory controller's overhead,
+testing on tmpfs will give you good numbers of small overheads.
+Example: do kernel make on tmpfs.
+
+Page-fault scalability is also important. When measuring parallel
+page faults, a multi-process test may be better than a multi-thread
+test because the latter has noise from shared objects/status.
+
+But the above two are testing extreme situations.
+Trying usual test under memory controller is always helpful.
+
+4.1 Troubleshooting
+-------------------
+
+Sometimes a user might find that the application under a cgroup is
+terminated by the OOM killer. There are several causes for this:
+
+1. The cgroup limit is too low (just too low to do anything useful)
+2. The user is using anonymous memory and swap is turned off or too low
+
+A sync followed by echo 1 > /proc/sys/vm/drop_caches will help get rid of
+some of the pages cached in the cgroup (page cache pages).
+
+To know what happens, disabling OOM_Kill as per "10. OOM Control" (below) and
+seeing what happens will be helpful.
+
+4.2 Task migration
+------------------
+
+When a task migrates from one cgroup to another, its charge is not
+carried forward by default. The pages allocated from the original cgroup still
+remain charged to it, the charge is dropped when the page is freed or
+reclaimed.
+
+You can move charges of a task along with task migration.
+See 8. "Move charges at task migration"
+
+4.3 Removing a cgroup
+---------------------
+
+A cgroup can be removed by rmdir, but as discussed in sections 4.1 and 4.2, a
+cgroup might have some charge associated with it, even though all
+tasks have migrated away from it. (because we charge against pages, not
+against tasks.)
+
+We move the stats to root (if use_hierarchy==0) or parent (if
+use_hierarchy==1), and no change on the charge except uncharging
+from the child.
+
+Charges recorded in swap information are not updated at removal of cgroup.
+Recorded information is discarded and a cgroup which uses swap (swapcache)
+will be charged as a new owner of it.
+
+About use_hierarchy, see Section 6.
+
+5. Misc. interfaces
+===================
+
+5.1 force_empty
+---------------
+  memory.force_empty interface is provided to make cgroup's memory usage empty.
+  When writing anything to this::
+
+    # echo 0 > memory.force_empty
+
+  the cgroup will be reclaimed and as many pages reclaimed as possible.
+
+  The typical use case for this interface is before calling rmdir().
+  Though rmdir() offlines memcg, the memcg may still stay there due to
+  charged file caches. Some out-of-use page caches may keep charged until
+  memory pressure happens. If you want to avoid that, force_empty will be useful.
+
+  Also, note that when memory.kmem.limit_in_bytes is set the charges due to
+  kernel pages will still be seen. This is not considered a failure and the
+  write will still return success. In this case, it is expected that
+  memory.kmem.usage_in_bytes == memory.usage_in_bytes.
+
+  About use_hierarchy, see Section 6.
+
+5.2 stat file
+-------------
+
+memory.stat file includes following statistics
+
+per-memory cgroup local status
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+=============== ===============================================================
+cache		# of bytes of page cache memory.
+rss		# of bytes of anonymous and swap cache memory (includes
+		transparent hugepages).
+rss_huge	# of bytes of anonymous transparent hugepages.
+mapped_file	# of bytes of mapped file (includes tmpfs/shmem)
+pgpgin		# of charging events to the memory cgroup. The charging
+		event happens each time a page is accounted as either mapped
+		anon page(RSS) or cache page(Page Cache) to the cgroup.
+pgpgout		# of uncharging events to the memory cgroup. The uncharging
+		event happens each time a page is unaccounted from the cgroup.
+swap		# of bytes of swap usage
+dirty		# of bytes that are waiting to get written back to the disk.
+writeback	# of bytes of file/anon cache that are queued for syncing to
+		disk.
+inactive_anon	# of bytes of anonymous and swap cache memory on inactive
+		LRU list.
+active_anon	# of bytes of anonymous and swap cache memory on active
+		LRU list.
+inactive_file	# of bytes of file-backed memory on inactive LRU list.
+active_file	# of bytes of file-backed memory on active LRU list.
+unevictable	# of bytes of memory that cannot be reclaimed (mlocked etc).
+=============== ===============================================================
+
+status considering hierarchy (see memory.use_hierarchy settings)
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+========================= ===================================================
+hierarchical_memory_limit # of bytes of memory limit with regard to hierarchy
+			  under which the memory cgroup is
+hierarchical_memsw_limit  # of bytes of memory+swap limit with regard to
+			  hierarchy under which memory cgroup is.
+
+total_<counter>		  # hierarchical version of <counter>, which in
+			  addition to the cgroup's own value includes the
+			  sum of all hierarchical children's values of
+			  <counter>, i.e. total_cache
+========================= ===================================================
+
+The following additional stats are dependent on CONFIG_DEBUG_VM
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+========================= ========================================
+recent_rotated_anon	  VM internal parameter. (see mm/vmscan.c)
+recent_rotated_file	  VM internal parameter. (see mm/vmscan.c)
+recent_scanned_anon	  VM internal parameter. (see mm/vmscan.c)
+recent_scanned_file	  VM internal parameter. (see mm/vmscan.c)
+========================= ========================================
+
+Memo:
+	recent_rotated means recent frequency of LRU rotation.
+	recent_scanned means recent # of scans to LRU.
+	These are shown to aid debugging; please see the code for their meanings.
+
+Note:
+	Only anonymous and swap cache memory is listed as part of 'rss' stat.
+	This should not be confused with the true 'resident set size' or the
+	amount of physical memory used by the cgroup.
+
+	'rss + mapped_file' will give you resident set size of cgroup.
+
+	(Note: file and shmem may be shared among other cgroups. In that case,
+	mapped_file is accounted only when the memory cgroup is owner of page
+	cache.)
+
+5.3 swappiness
+--------------
+
+Overrides /proc/sys/vm/swappiness for the particular group. The tunable
+in the root cgroup corresponds to the global swappiness setting.
+
+Please note that unlike during the global reclaim, limit reclaim
+enforces that 0 swappiness really prevents any swapping even if
+there is a swap storage available. This might lead to memcg OOM killer
+if there are no file pages to reclaim.
+
+5.4 failcnt
+-----------
+
+A memory cgroup provides memory.failcnt and memory.memsw.failcnt files.
+This failcnt(== failure count) shows the number of times that a usage counter
+hit its limit. When a memory cgroup hits a limit, failcnt increases and
+memory under it will be reclaimed.
+
+You can reset failcnt by writing 0 to failcnt file::
+
+	# echo 0 > .../memory.failcnt
+
+5.5 usage_in_bytes
+------------------
+
+For efficiency, like other kernel components, memory cgroup uses some optimization
+to avoid unnecessary cacheline false sharing. usage_in_bytes is affected by the
+method and doesn't show 'exact' value of memory (and swap) usage, it's a fuzz
+value for efficient access. (Of course, when necessary, it's synchronized.)
+If you want to know more exact memory usage, you should use RSS+CACHE(+SWAP)
+value in memory.stat(see 5.2).
+
+5.6 numa_stat
+-------------
+
+This is similar to numa_maps but operates on a per-memcg basis.  This is
+useful for providing visibility into the numa locality information within
+a memcg since the pages are allowed to be allocated from any physical
+node.  One of the use cases is evaluating application performance by
+combining this information with the application's CPU allocation.
+
+Each memcg's numa_stat file includes "total", "file", "anon" and "unevictable"
+per-node page counts including "hierarchical_<counter>" which sums up all
+hierarchical children's values in addition to the memcg's own value.
+
+The output format of memory.numa_stat is::
+
+  total=<total pages> N0=<node 0 pages> N1=<node 1 pages> ...
+  file=<total file pages> N0=<node 0 pages> N1=<node 1 pages> ...
+  anon=<total anon pages> N0=<node 0 pages> N1=<node 1 pages> ...
+  unevictable=<total unevictable pages> N0=<node 0 pages> N1=<node 1 pages> ...
+  hierarchical_<counter>=<counter pages> N0=<node 0 pages> N1=<node 1 pages> ...
+
+The "total" count is sum of file + anon + unevictable.
+
+6. Hierarchy support
+====================
+
+The memory controller supports a deep hierarchy and hierarchical accounting.
+The hierarchy is created by creating the appropriate cgroups in the
+cgroup filesystem. Consider for example, the following cgroup filesystem
+hierarchy::
+
+	       root
+	     /  |   \
+            /	|    \
+	   a	b     c
+		      | \
+		      |  \
+		      d   e
+
+In the diagram above, with hierarchical accounting enabled, all memory
+usage of e, is accounted to its ancestors up until the root (i.e, c and root),
+that has memory.use_hierarchy enabled. If one of the ancestors goes over its
+limit, the reclaim algorithm reclaims from the tasks in the ancestor and the
+children of the ancestor.
+
+6.1 Enabling hierarchical accounting and reclaim
+------------------------------------------------
+
+A memory cgroup by default disables the hierarchy feature. Support
+can be enabled by writing 1 to memory.use_hierarchy file of the root cgroup::
+
+	# echo 1 > memory.use_hierarchy
+
+The feature can be disabled by::
+
+	# echo 0 > memory.use_hierarchy
+
+NOTE1:
+       Enabling/disabling will fail if either the cgroup already has other
+       cgroups created below it, or if the parent cgroup has use_hierarchy
+       enabled.
+
+NOTE2:
+       When panic_on_oom is set to "2", the whole system will panic in
+       case of an OOM event in any cgroup.
+
+7. Soft limits
+==============
+
+Soft limits allow for greater sharing of memory. The idea behind soft limits
+is to allow control groups to use as much of the memory as needed, provided
+
+a. There is no memory contention
+b. They do not exceed their hard limit
+
+When the system detects memory contention or low memory, control groups
+are pushed back to their soft limits. If the soft limit of each control
+group is very high, they are pushed back as much as possible to make
+sure that one control group does not starve the others of memory.
+
+Please note that the soft limit is a best-effort feature; it comes with
+no guarantees, but it does its best to make sure that when memory is
+heavily contended for, memory is allocated based on the soft limit
+hints/setup. Currently soft limit based reclaim is set up such that
+it gets invoked from balance_pgdat (kswapd).
+
+7.1 Interface
+-------------
+
+Soft limits can be set up by using the following commands (in this example we
+assume a soft limit of 256 MiB)::
+
+	# echo 256M > memory.soft_limit_in_bytes
+
+If we want to change this to 1G, we can at any time use::
+
+	# echo 1G > memory.soft_limit_in_bytes
+
+NOTE1:
+       Soft limits take effect over a long period of time, since they involve
+       reclaiming memory for balancing between memory cgroups
+NOTE2:
+       It is recommended to set the soft limit always below the hard limit,
+       otherwise the hard limit will take precedence.
+
+8. Move charges at task migration
+=================================
+
+Users can move charges associated with a task along with task migration, that
+is, uncharge task's pages from the old cgroup and charge them to the new cgroup.
+This feature is not supported in !CONFIG_MMU environments because of lack of
+page tables.
+
+8.1 Interface
+-------------
+
+This feature is disabled by default. It can be enabled (and disabled again) by
+writing to memory.move_charge_at_immigrate of the destination cgroup.
+
+If you want to enable it::
+
+	# echo (some positive value) > memory.move_charge_at_immigrate
+
+Note:
+      Each bit of move_charge_at_immigrate has its own meaning about what type
+      of charges should be moved. See 8.2 for details.
+Note:
+      Charges are moved only when you move mm->owner, in other words,
+      a leader of a thread group.
+Note:
+      If we cannot find enough space for the task in the destination cgroup, we
+      try to make space by reclaiming memory. Task migration may fail if we
+      cannot make enough space.
+Note:
+      It can take several seconds if you move many charges.
+
+And if you want to disable it again::
+
+	# echo 0 > memory.move_charge_at_immigrate
+
+8.2 Type of charges which can be moved
+--------------------------------------
+
+Each bit in move_charge_at_immigrate has its own meaning about what type of
+charges should be moved. But in any case, it must be noted that an account of
+a page or a swap can be moved only when it is charged to the task's current
+(old) memory cgroup.
+
++---+--------------------------------------------------------------------------+
+|bit| what type of charges would be moved ?                                    |
++===+==========================================================================+
+| 0 | A charge of an anonymous page (or swap of it) used by the target task.   |
+|   | You must enable Swap Extension (see 2.4) to enable move of swap charges. |
++---+--------------------------------------------------------------------------+
+| 1 | A charge of file pages (normal file, tmpfs file (e.g. ipc shared memory) |
+|   | and swaps of tmpfs file) mmapped by the target task. Unlike the case of  |
+|   | anonymous pages, file pages (and swaps) in the range mmapped by the task |
+|   | will be moved even if the task hasn't done page fault, i.e. they might   |
+|   | not be the task's "RSS", but other task's "RSS" that maps the same file. |
+|   | And mapcount of the page is ignored (the page can be moved even if       |
+|   | page_mapcount(page) > 1). You must enable Swap Extension (see 2.4) to    |
+|   | enable move of swap charges.                                             |
++---+--------------------------------------------------------------------------+
+
+8.3 TODO
+--------
+
+- All of moving charge operations are done under cgroup_mutex. It's not good
+  behavior to hold the mutex too long, so we may need some trick.
+
+9. Memory thresholds
+====================
+
+Memory cgroup implements memory thresholds using the cgroups notification
+API (see cgroups.txt). It allows registering multiple memory and memsw
+thresholds and getting notifications when usage crosses them.
+
+To register a threshold, an application must:
+
+- create an eventfd using eventfd(2);
+- open memory.usage_in_bytes or memory.memsw.usage_in_bytes;
+- write string like "<event_fd> <fd of memory.usage_in_bytes> <threshold>" to
+  cgroup.event_control.
+
+Application will be notified through eventfd when memory usage crosses
+threshold in any direction.
+
+It's applicable for root and non-root cgroup.
+
+10. OOM Control
+===============
+
+memory.oom_control file is for OOM notification and other controls.
+
+Memory cgroup implements OOM notifier using the cgroup notification
+API (See cgroups.txt). It allows registering multiple OOM notification
+deliveries and getting notified when OOM happens.
+
+To register a notifier, an application must:
+
+ - create an eventfd using eventfd(2)
+ - open memory.oom_control file
+ - write string like "<event_fd> <fd of memory.oom_control>" to
+   cgroup.event_control
+
+The application will be notified through eventfd when OOM happens.
+OOM notification doesn't work for the root cgroup.
+
+You can disable the OOM-killer by writing "1" to memory.oom_control file, as:
+
+	#echo 1 > memory.oom_control
+
+If OOM-killer is disabled, tasks under cgroup will hang/sleep
+in memory cgroup's OOM-waitqueue when they request accountable memory.
+
+For running them, you have to relax the memory cgroup's OOM status by
+
+	* enlarge limit or reduce usage.
+
+To reduce usage,
+
+	* kill some tasks.
+	* move some tasks to other group with account migration.
+	* remove some files (on tmpfs?)
+
+Then, stopped tasks will work again.
+
+At reading, current status of OOM is shown.
+
+	- oom_kill_disable 0 or 1
+	  (if 1, oom-killer is disabled)
+	- under_oom	   0 or 1
+	  (if 1, the memory cgroup is under OOM, tasks may be stopped.)
+
+11. Memory Pressure
+===================
+
+The pressure level notifications can be used to monitor the memory
+allocation cost; based on the pressure, applications can implement
+different strategies of managing their memory resources. The pressure
+levels are defined as follows:
+
+The "low" level means that the system is reclaiming memory for new
+allocations. Monitoring this reclaiming activity might be useful for
+maintaining cache level. Upon notification, the program (typically
+"Activity Manager") might analyze vmstat and act in advance (i.e.
+prematurely shut down unimportant services).
+
+The "medium" level means that the system is experiencing medium memory
+pressure, the system might be making swap, paging out active file caches,
+etc. Upon this event applications may decide to further analyze
+vmstat/zoneinfo/memcg or internal memory usage statistics and free any
+resources that can be easily reconstructed or re-read from a disk.
+
+The "critical" level means that the system is actively thrashing, it is
+about to run out of memory (OOM) or even the in-kernel OOM killer is on its
+way to trigger. Applications should do whatever they can to help the
+system. It might be too late to consult with vmstat or any other
+statistics, so it's advisable to take an immediate action.
+
+By default, events are propagated upward until the event is handled, i.e. the
+events are not pass-through. For example, you have three cgroups: A->B->C. Now
+you set up an event listener on cgroups A, B and C, and suppose group C
+experiences some pressure. In this situation, only group C will receive the
+notification, i.e. groups A and B will not receive it. This is done to avoid
+excessive "broadcasting" of messages, which disturbs the system and which is
+especially bad if we are low on memory or thrashing. Group B will receive
+notification only if there are no event listeners for group C.
+
+There are three optional modes that specify different propagation behavior:
+
+ - "default": this is the default behavior specified above. This mode is the
+   same as omitting the optional mode parameter, preserved for backwards
+   compatibility.
+
+ - "hierarchy": events always propagate up to the root, similar to the default
+   behavior, except that propagation continues regardless of whether there are
+   event listeners at each level, with the "hierarchy" mode. In the above
+   example, groups A, B, and C will receive notification of memory pressure.
+
+ - "local": events are pass-through, i.e. they only receive notifications when
+   memory pressure is experienced in the memcg for which the notification is
+   registered. In the above example, group C will receive notification if
+   registered for "local" notification and the group experiences memory
+   pressure. However, group B will never receive notification, regardless if
+   there is an event listener for group C or not, if group B is registered for
+   local notification.
+
+The level and event notification mode ("hierarchy" or "local", if necessary) are
+specified by a comma-delimited string, i.e. "low,hierarchy" specifies
+hierarchical, pass-through, notification for all ancestor memcgs. Notification
+that is the default, non pass-through behavior, does not specify a mode.
+"medium,local" specifies pass-through notification for the medium level.
+
+The file memory.pressure_level is only used to setup an eventfd. To
+register a notification, an application must:
+
+- create an eventfd using eventfd(2);
+- open memory.pressure_level;
+- write string as "<event_fd> <fd of memory.pressure_level> <level[,mode]>"
+  to cgroup.event_control.
+
+Application will be notified through eventfd when memory pressure is at
+the specific level (or higher). Read/write operations to
+memory.pressure_level are not implemented.
+
+Test:
+
+   Here is a small script example that makes a new cgroup, sets up a
+   memory limit, sets up a notification in the cgroup and then makes child
+   cgroup experience a critical pressure::
+
+	# cd /sys/fs/cgroup/memory/
+	# mkdir foo
+	# cd foo
+	# cgroup_event_listener memory.pressure_level low,hierarchy &
+	# echo 8000000 > memory.limit_in_bytes
+	# echo 8000000 > memory.memsw.limit_in_bytes
+	# echo $$ > tasks
+	# dd if=/dev/zero | read x
+
+   (Expect a bunch of notifications, and eventually, the oom-killer will
+   trigger.)
+
+12. TODO
+========
+
+1. Make per-cgroup scanner reclaim not-shared pages first
+2. Teach controller to account for shared-pages
+3. Start reclamation in the background when the limit is
+   not yet hit but the usage is getting closer
+
+Summary
+=======
+
+Overall, the memory controller has been a stable controller and has been
+commented and discussed quite extensively in the community.
+
+References
+==========
+
+1. Singh, Balbir. RFC: Memory Controller, http://lwn.net/Articles/206697/
+2. Singh, Balbir. Memory Controller (RSS Control),
+   http://lwn.net/Articles/222762/
+3. Emelianov, Pavel. Resource controllers based on process cgroups
+   http://lkml.org/lkml/2007/3/6/198
+4. Emelianov, Pavel. RSS controller based on process cgroups (v2)
+   http://lkml.org/lkml/2007/4/9/78
+5. Emelianov, Pavel. RSS controller based on process cgroups (v3)
+   http://lkml.org/lkml/2007/5/30/244
+6. Menage, Paul. Control Groups v10, http://lwn.net/Articles/236032/
+7. Vaidyanathan, Srinivasan, Control Groups: Pagecache accounting and control
+   subsystem (v3), http://lwn.net/Articles/235534/
+8. Singh, Balbir. RSS controller v2 test results (lmbench),
+   http://lkml.org/lkml/2007/5/17/232
+9. Singh, Balbir. RSS controller v2 AIM9 results
+   http://lkml.org/lkml/2007/5/18/1
+10. Singh, Balbir. Memory controller v6 test results,
+    http://lkml.org/lkml/2007/8/19/36
+11. Singh, Balbir. Memory controller introduction (v6),
+    http://lkml.org/lkml/2007/8/17/69
+12. Corbet, Jonathan, Controlling memory use in cgroups,
+    http://lwn.net/Articles/243795/
diff --git a/Documentation/admin-guide/cgroup-v1/net_cls.rst b/Documentation/admin-guide/cgroup-v1/net_cls.rst
new file mode 100644
index 000000000000..a2cf272af7a0
--- /dev/null
+++ b/Documentation/admin-guide/cgroup-v1/net_cls.rst
@@ -0,0 +1,44 @@
+=========================
+Network classifier cgroup
+=========================
+
+The Network classifier cgroup provides an interface to
+tag network packets with a class identifier (classid).
+
+The Traffic Controller (tc) can be used to assign
+different priorities to packets from different cgroups.
+Also, Netfilter (iptables) can use this tag to perform
+actions on such packets.
+
+Creating a net_cls cgroups instance creates a net_cls.classid file.
+This net_cls.classid value is initialized to 0.
+
+You can write hexadecimal values to net_cls.classid; the format for these
+values is 0xAAAABBBB; AAAA is the major handle number and BBBB
+is the minor handle number.
+Reading net_cls.classid yields a decimal result.
+
+Example::
+
+	mkdir /sys/fs/cgroup/net_cls
+	mount -t cgroup -onet_cls net_cls /sys/fs/cgroup/net_cls
+	mkdir /sys/fs/cgroup/net_cls/0
+	echo 0x100001 >  /sys/fs/cgroup/net_cls/0/net_cls.classid
+
+- setting a 10:1 handle::
+
+	cat /sys/fs/cgroup/net_cls/0/net_cls.classid
+	1048577
+
+- configuring tc::
+
+	tc qdisc add dev eth0 root handle 10: htb
+	tc class add dev eth0 parent 10: classid 10:1 htb rate 40mbit
+
+- creating traffic class 10:1::
+
+	tc filter add dev eth0 parent 10: protocol ip prio 10 handle 1: cgroup
+
+configuring iptables, basic example::
+
+	iptables -A OUTPUT -m cgroup ! --cgroup 0x100001 -j DROP
diff --git a/Documentation/admin-guide/cgroup-v1/net_prio.rst b/Documentation/admin-guide/cgroup-v1/net_prio.rst
new file mode 100644
index 000000000000..b40905871c64
--- /dev/null
+++ b/Documentation/admin-guide/cgroup-v1/net_prio.rst
@@ -0,0 +1,57 @@
+=======================
+Network priority cgroup
+=======================
+
+The Network priority cgroup provides an interface to allow an administrator to
+dynamically set the priority of network traffic generated by various
+applications.
+
+Nominally, an application would set the priority of its traffic via the
+SO_PRIORITY socket option.  This however, is not always possible because:
+
+1) The application may not have been coded to set this value
+2) The priority of application traffic is often a site-specific administrative
+   decision rather than an application defined one.
+
+This cgroup allows an administrator to assign a process to a group which defines
+the priority of egress traffic on a given interface. Network priority groups can
+be created by first mounting the cgroup filesystem::
+
+	# mount -t cgroup -onet_prio none /sys/fs/cgroup/net_prio
+
+With the above step, the initial group acting as the parent accounting group
+becomes visible at '/sys/fs/cgroup/net_prio'.  This group includes all tasks in
+the system. '/sys/fs/cgroup/net_prio/tasks' lists the tasks in this cgroup.
+
+Each net_prio cgroup contains two files that are subsystem specific
+
+net_prio.prioidx
+  This file is read-only, and is simply informative.  It contains a unique
+  integer value that the kernel uses as an internal representation of this
+  cgroup.
+
+net_prio.ifpriomap
+  This file contains a map of the priorities assigned to traffic originating
+  from processes in this group and egressing the system on various interfaces.
+  It contains a list of tuples in the form <ifname priority>.  Contents of this
+  file can be modified by echoing a string into the file using the same tuple
+  format. For example::
+
+	echo "eth0 5" > /sys/fs/cgroup/net_prio/iscsi/net_prio.ifpriomap
+
+This command would force any traffic originating from processes belonging to the
+iscsi net_prio cgroup and egressing on interface eth0 to have the priority of
+said traffic set to the value 5. The parent accounting group also has a
+writeable 'net_prio.ifpriomap' file that can be used to set a system default
+priority.
+
+Priorities are set immediately prior to queueing a frame to the device
+queueing discipline (qdisc) so priorities will be assigned prior to the hardware
+queue selection being made.
+
+One usage for the net_prio cgroup is with mqprio qdisc allowing application
+traffic to be steered to hardware/driver based traffic classes. These mappings
+can then be managed by administrators or other networking protocols such as
+DCBX.
+
+A new net_prio cgroup inherits the parent's configuration.
diff --git a/Documentation/admin-guide/cgroup-v1/pids.rst b/Documentation/admin-guide/cgroup-v1/pids.rst
new file mode 100644
index 000000000000..6acebd9e72c8
--- /dev/null
+++ b/Documentation/admin-guide/cgroup-v1/pids.rst
@@ -0,0 +1,92 @@
+=========================
+Process Number Controller
+=========================
+
+Abstract
+--------
+
+The process number controller is used to allow a cgroup hierarchy to stop any
+new tasks from being fork()'d or clone()'d after a certain limit is reached.
+
+Since it is trivial to hit the task limit without hitting any kmemcg limits in
+place, PIDs are a fundamental resource. As such, PID exhaustion must be
+preventable in the scope of a cgroup hierarchy by allowing resource limiting of
+the number of tasks in a cgroup.
+
+Usage
+-----
+
+In order to use the `pids` controller, set the maximum number of tasks in
+pids.max (this is not available in the root cgroup for obvious reasons). The
+number of processes currently in the cgroup is given by pids.current.
+
+Organisational operations are not blocked by cgroup policies, so it is possible
+to have pids.current > pids.max. This can be done by either setting the limit to
+be smaller than pids.current, or attaching enough processes to the cgroup such
+that pids.current > pids.max. However, it is not possible to violate a cgroup
+policy through fork() or clone(). fork() and clone() will return -EAGAIN if the
+creation of a new process would cause a cgroup policy to be violated.
+
+To set a cgroup to have no limit, set pids.max to "max". This is the default for
+all new cgroups (N.B. that PID limits are hierarchical, so the most stringent
+limit in the hierarchy is followed).
+
+pids.current tracks all child cgroup hierarchies, so parent/pids.current is a
+superset of parent/child/pids.current.
+
+The pids.events file contains event counters:
+
+  - max: Number of times fork failed because limit was hit.
+
+Example
+-------
+
+First, we mount the pids controller::
+
+	# mkdir -p /sys/fs/cgroup/pids
+	# mount -t cgroup -o pids none /sys/fs/cgroup/pids
+
+Then we create a hierarchy, set limits and attach processes to it::
+
+	# mkdir -p /sys/fs/cgroup/pids/parent/child
+	# echo 2 > /sys/fs/cgroup/pids/parent/pids.max
+	# echo $$ > /sys/fs/cgroup/pids/parent/cgroup.procs
+	# cat /sys/fs/cgroup/pids/parent/pids.current
+	2
+	#
+
+It should be noted that attempts to overcome the set limit (2 in this case) will
+fail::
+
+	# cat /sys/fs/cgroup/pids/parent/pids.current
+	2
+	# ( /bin/echo "Here's some processes for you." | cat )
+	sh: fork: Resource temporary unavailable
+	#
+
+Even if we migrate to a child cgroup (which doesn't have a set limit), we will
+not be able to overcome the most stringent limit in the hierarchy (in this case,
+parent's)::
+
+	# echo $$ > /sys/fs/cgroup/pids/parent/child/cgroup.procs
+	# cat /sys/fs/cgroup/pids/parent/pids.current
+	2
+	# cat /sys/fs/cgroup/pids/parent/child/pids.current
+	2
+	# cat /sys/fs/cgroup/pids/parent/child/pids.max
+	max
+	# ( /bin/echo "Here's some processes for you." | cat )
+	sh: fork: Resource temporary unavailable
+	#
+
+We can set a limit that is smaller than pids.current, which will stop any new
+processes from being forked at all (note that the shell itself counts towards
+pids.current)::
+
+	# echo 1 > /sys/fs/cgroup/pids/parent/pids.max
+	# /bin/echo "We can't even spawn a single process now."
+	sh: fork: Resource temporary unavailable
+	# echo 0 > /sys/fs/cgroup/pids/parent/pids.max
+	# /bin/echo "We can't even spawn a single process now."
+	sh: fork: Resource temporary unavailable
+	#
diff --git a/Documentation/admin-guide/cgroup-v1/rdma.rst b/Documentation/admin-guide/cgroup-v1/rdma.rst
new file mode 100644
index 000000000000..2fcb0a9bf790
--- /dev/null
+++ b/Documentation/admin-guide/cgroup-v1/rdma.rst
@@ -0,0 +1,117 @@
+===============
+RDMA Controller
+===============
+
+.. Contents
+
+   1. Overview
+     1-1. What is RDMA controller?
+     1-2. Why RDMA controller needed?
+     1-3. How is RDMA controller implemented?
+   2. Usage Examples
+
+1. Overview
+===========
+
+1-1. What is RDMA controller?
+-----------------------------
+
+RDMA controller allows user to limit RDMA/IB specific resources that a given
+set of processes can use. These processes are grouped using RDMA controller.
+
+RDMA controller defines two resources which can be limited for processes of a
+cgroup.
+
+1-2. Why RDMA controller needed?
+--------------------------------
+
+Currently user space applications can easily take away all the rdma verb
+specific resources such as AH, CQ, QP, MR etc. Due to which other applications
+in other cgroup or kernel space ULPs may not even get chance to allocate any
+rdma resources. This can lead to service unavailability.
+
+Therefore RDMA controller is needed through which resource consumption
+of processes can be limited. Through this controller different rdma
+resources can be accounted.
+
+1-3. How is RDMA controller implemented?
+----------------------------------------
+
+RDMA cgroup allows limit configuration of resources. Rdma cgroup maintains
+resource accounting per cgroup, per device using resource pool structure.
+Each such resource pool is limited up to 64 resources in given resource pool
+by rdma cgroup, which can be extended later if required.
+
+This resource pool object is linked to the cgroup css. Typically there
+are 0 to 4 resource pool instances per cgroup, per device in most use cases.
+But nothing limits to have it more. At present hundreds of RDMA devices per
+single cgroup may not be handled optimally, however there is no
+known use case or requirement for such configuration either.
+
+Since RDMA resources can be allocated from any process and can be freed by any
+of the child processes which shares the address space, rdma resources are
+always owned by the creator cgroup css. This allows process migration from one
+to other cgroup without major complexity of transferring resource ownership;
+because such ownership is not really present due to shared nature of
+rdma resources. Linking resources around css also ensures that cgroups can be
+deleted after processes migrated. This allows process migration as well with
+active resources, even though that is not a primary use case.
+
+Whenever RDMA resource charging occurs, owner rdma cgroup is returned to
+the caller. Same rdma cgroup should be passed while uncharging the resource.
+This also allows process migrated with active RDMA resource to charge
+to new owner cgroup for new resource. It also allows to uncharge resource of
+a process from previously charged cgroup which is migrated to new cgroup,
+even though that is not a primary use case.
+
+Resource pool object is created in following situations.
+(a) User sets the limit and no previous resource pool exists for the device
+of interest for the cgroup.
+(b) No resource limits were configured, but IB/RDMA stack tries to
+charge the resource. So that it correctly uncharge them when applications are
+running without limits and later on when limits are enforced during uncharging,
+otherwise usage count will drop to negative.
+
+Resource pool is destroyed if all the resource limits are set to max and
+it is the last resource getting deallocated.
+
+User should set all the limits to max value if it intends to remove/unconfigure
+the resource pool for a particular device.
+
+IB stack honors limits enforced by the rdma controller. When an application
+queries the maximum resource limits of an IB device, it returns the minimum of
+what is configured by user for a given cgroup and what is supported by
+IB device.
+
+Following resources can be accounted by rdma controller.
+
+  ==========    =============================
+  hca_handle	Maximum number of HCA Handles
+  hca_object 	Maximum number of HCA Objects
+  ==========    =============================
+
+2. Usage Examples
+=================
+
+(a) Configure resource limit::
+
+	echo mlx4_0 hca_handle=2 hca_object=2000 > /sys/fs/cgroup/rdma/1/rdma.max
+	echo ocrdma1 hca_handle=3 > /sys/fs/cgroup/rdma/2/rdma.max
+
+(b) Query resource limit::
+
+	cat /sys/fs/cgroup/rdma/2/rdma.max
+	#Output:
+	mlx4_0 hca_handle=2 hca_object=2000
+	ocrdma1 hca_handle=3 hca_object=max
+
+(c) Query current usage::
+
+	cat /sys/fs/cgroup/rdma/2/rdma.current
+	#Output:
+	mlx4_0 hca_handle=1 hca_object=20
+	ocrdma1 hca_handle=1 hca_object=23
+
+(d) Delete resource limit::
+
+	echo mlx4_0 hca_handle=max hca_object=max > /sys/fs/cgroup/rdma/1/rdma.max
diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
index 080b18ce2a5d..ed4c5977d6e1 100644
--- a/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@ -9,7 +9,7 @@ This is the authoritative documentation on the design, interface and
 conventions of cgroup v2.  It describes all userland-visible aspects
 of cgroup including core and specific controller behaviors.  All
 future changes must be reflected in this document.  Documentation for
-v1 is available under Documentation/cgroup-v1/.
+v1 is available under Documentation/admin-guide/cgroup-v1/.
 
 .. CONTENTS
 
diff --git a/Documentation/admin-guide/index.rst b/Documentation/admin-guide/index.rst
index 1f0d9b939311..a5fdb1a846ce 100644
--- a/Documentation/admin-guide/index.rst
+++ b/Documentation/admin-guide/index.rst
@@ -59,6 +59,7 @@ configure specific aspects of kernel behavior to your liking.
 
    initrd
    cgroup-v2
+   cgroup-v1/index
    serial-console
    braille-console
    parport
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 78576aa45cce..a571a67e0c85 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -4089,7 +4089,7 @@
 
 	relax_domain_level=
 			[KNL, SMP] Set scheduler's default relax_domain_level.
-			See Documentation/cgroup-v1/cpusets.rst.
+			See Documentation/admin-guide/cgroup-v1/cpusets.rst.
 
 	reserve=	[KNL,BUGS] Force kernel to ignore I/O ports or memory
 			Format: <base1>,<size1>[,<base2>,<size2>,...]
@@ -4599,7 +4599,7 @@
 	swapaccount=[0|1]
 			[KNL] Enable accounting of swap in memory resource
 			controller if no parameter or 1 is given or disable
-			it if 0 is given (See Documentation/cgroup-v1/memory.rst)
+			it if 0 is given (See Documentation/admin-guide/cgroup-v1/memory.rst)
 
 	swiotlb=	[ARM,IA-64,PPC,MIPS,X86]
 			Format: { <int> | force | noforce }
diff --git a/Documentation/admin-guide/mm/numa_memory_policy.rst b/Documentation/admin-guide/mm/numa_memory_policy.rst
index 546f174e5d6a..8463f5538fda 100644
--- a/Documentation/admin-guide/mm/numa_memory_policy.rst
+++ b/Documentation/admin-guide/mm/numa_memory_policy.rst
@@ -15,7 +15,7 @@ document attempts to describe the concepts and APIs of the 2.6 memory policy
 support.
 
 Memory policies should not be confused with cpusets
-(``Documentation/cgroup-v1/cpusets.rst``)
+(``Documentation/admin-guide/cgroup-v1/cpusets.rst``)
 which is an administrative mechanism for restricting the nodes from which
 memory may be allocated by a set of processes. Memory policies are a
 programming interface that a NUMA-aware application can take advantage of.  When
diff --git a/Documentation/block/bfq-iosched.rst b/Documentation/block/bfq-iosched.rst
index 2c13b2fc1888..0d237d402860 100644
--- a/Documentation/block/bfq-iosched.rst
+++ b/Documentation/block/bfq-iosched.rst
@@ -547,7 +547,7 @@ As for cgroups-v1 (blkio controller), the exact set of stat files
 created, and kept up-to-date by bfq, depends on whether
 CONFIG_BFQ_CGROUP_DEBUG is set. If it is set, then bfq creates all
 the stat files documented in
-Documentation/cgroup-v1/blkio-controller.rst. If, instead,
+Documentation/admin-guide/cgroup-v1/blkio-controller.rst. If, instead,
 CONFIG_BFQ_CGROUP_DEBUG is not set, then bfq creates only the files::
 
   blkio.bfq.io_service_bytes
diff --git a/Documentation/cgroup-v1/blkio-controller.rst b/Documentation/cgroup-v1/blkio-controller.rst
deleted file mode 100644
index 1d7d962933be..000000000000
--- a/Documentation/cgroup-v1/blkio-controller.rst
+++ /dev/null
@@ -1,302 +0,0 @@
-===================
-Block IO Controller
-===================
-
-Overview
-========
-cgroup subsys "blkio" implements the block io controller. There seems to be
-a need of various kinds of IO control policies (like proportional BW, max BW)
-both at leaf nodes as well as at intermediate nodes in a storage hierarchy.
-Plan is to use the same cgroup based management interface for blkio controller
-and based on user options switch IO policies in the background.
-
-One IO control policy is throttling policy which can be used to
-specify upper IO rate limits on devices. This policy is implemented in
-generic block layer and can be used on leaf nodes as well as higher
-level logical devices like device mapper.
-
-HOWTO
-=====
-Throttling/Upper Limit policy
------------------------------
-- Enable Block IO controller::
-
-	CONFIG_BLK_CGROUP=y
-
-- Enable throttling in block layer::
-
-	CONFIG_BLK_DEV_THROTTLING=y
-
-- Mount blkio controller (see cgroups.txt, Why are cgroups needed?)::
-
-        mount -t cgroup -o blkio none /sys/fs/cgroup/blkio
-
-- Specify a bandwidth rate on particular device for root group. The format
-  for policy is "<major>:<minor>  <bytes_per_second>"::
-
-        echo "8:16  1048576" > /sys/fs/cgroup/blkio/blkio.throttle.read_bps_device
-
-  Above will put a limit of 1MB/second on reads happening for root group
-  on device having major/minor number 8:16.
-
-- Run dd to read a file and see if rate is throttled to 1MB/s or not::
-
-        # dd iflag=direct if=/mnt/common/zerofile of=/dev/null bs=4K count=1024
-        1024+0 records in
-        1024+0 records out
-        4194304 bytes (4.2 MB) copied, 4.0001 s, 1.0 MB/s
-
- Limits for writes can be put using blkio.throttle.write_bps_device file.
-
-Hierarchical Cgroups
-====================
-
-Throttling implements hierarchy support; however,
-throttling's hierarchy support is enabled iff "sane_behavior" is
-enabled from cgroup side, which currently is a development option and
-not publicly available.
-
-If somebody created a hierarchy like as follows::
-
-			root
-			/  \
-		     test1 test2
-			|
-		     test3
-
-Throttling with "sane_behavior" will handle the
-hierarchy correctly. For throttling, all limits apply
-to the whole subtree while all statistics are local to the IOs
-directly generated by tasks in that cgroup.
-
-Throttling without "sane_behavior" enabled from cgroup side will
-practically treat all groups at same level as if it looks like the
-following::
-
-				pivot
-			     /  /   \  \
-			root  test1 test2  test3
-
-Various user visible config options
-===================================
-CONFIG_BLK_CGROUP
-	- Block IO controller.
-
-CONFIG_BFQ_CGROUP_DEBUG
-	- Debug help. Right now some additional stats file show up in cgroup
-	  if this option is enabled.
-
-CONFIG_BLK_DEV_THROTTLING
-	- Enable block device throttling support in block layer.
-
-Details of cgroup files
-=======================
-Proportional weight policy files
---------------------------------
-- blkio.weight
-	- Specifies per cgroup weight. This is default weight of the group
-	  on all the devices until and unless overridden by per device rule.
-	  (See blkio.weight_device).
-	  Currently allowed range of weights is from 10 to 1000.
-
-- blkio.weight_device
-	- One can specify per cgroup per device rules using this interface.
-	  These rules override the default value of group weight as specified
-	  by blkio.weight.
-
-	  Following is the format::
-
-	    # echo dev_maj:dev_minor weight > blkio.weight_device
-
-	  Configure weight=300 on /dev/sdb (8:16) in this cgroup::
-
-	    # echo 8:16 300 > blkio.weight_device
-	    # cat blkio.weight_device
-	    dev     weight
-	    8:16    300
-
-	  Configure weight=500 on /dev/sda (8:0) in this cgroup::
-
-	    # echo 8:0 500 > blkio.weight_device
-	    # cat blkio.weight_device
-	    dev     weight
-	    8:0     500
-	    8:16    300
-
-	  Remove specific weight for /dev/sda in this cgroup::
-
-	    # echo 8:0 0 > blkio.weight_device
-	    # cat blkio.weight_device
-	    dev     weight
-	    8:16    300
-
-- blkio.leaf_weight[_device]
-	- Equivalents of blkio.weight[_device] for the purpose of
-          deciding how much weight tasks in the given cgroup has while
-          competing with the cgroup's child cgroups. For details,
-          please refer to Documentation/block/cfq-iosched.txt.
-
-- blkio.time
-	- disk time allocated to cgroup per device in milliseconds. First
-	  two fields specify the major and minor number of the device and
-	  third field specifies the disk time allocated to group in
-	  milliseconds.
-
-- blkio.sectors
-	- number of sectors transferred to/from disk by the group. First
-	  two fields specify the major and minor number of the device and
-	  third field specifies the number of sectors transferred by the
-	  group to/from the device.
-
-- blkio.io_service_bytes
-	- Number of bytes transferred to/from the disk by the group. These
-	  are further divided by the type of operation - read or write, sync
-	  or async. First two fields specify the major and minor number of the
-	  device, third field specifies the operation type and the fourth field
-	  specifies the number of bytes.
-
-- blkio.io_serviced
-	- Number of IOs (bio) issued to the disk by the group. These
-	  are further divided by the type of operation - read or write, sync
-	  or async. First two fields specify the major and minor number of the
-	  device, third field specifies the operation type and the fourth field
-	  specifies the number of IOs.
-
-- blkio.io_service_time
-	- Total amount of time between request dispatch and request completion
-	  for the IOs done by this cgroup. This is in nanoseconds to make it
-	  meaningful for flash devices too. For devices with queue depth of 1,
-	  this time represents the actual service time. When queue_depth > 1,
-	  that is no longer true as requests may be served out of order. This
-	  may cause the service time for a given IO to include the service time
-	  of multiple IOs when served out of order which may result in total
-	  io_service_time > actual time elapsed. This time is further divided by
-	  the type of operation - read or write, sync or async. First two fields
-	  specify the major and minor number of the device, third field
-	  specifies the operation type and the fourth field specifies the
-	  io_service_time in ns.
-
-- blkio.io_wait_time
-	- Total amount of time the IOs for this cgroup spent waiting in the
-	  scheduler queues for service. This can be greater than the total time
-	  elapsed since it is cumulative io_wait_time for all IOs. It is not a
-	  measure of total time the cgroup spent waiting but rather a measure of
-	  the wait_time for its individual IOs. For devices with queue_depth > 1
-	  this metric does not include the time spent waiting for service once
-	  the IO is dispatched to the device but till it actually gets serviced
-	  (there might be a time lag here due to re-ordering of requests by the
-	  device). This is in nanoseconds to make it meaningful for flash
-	  devices too. This time is further divided by the type of operation -
-	  read or write, sync or async. First two fields specify the major and
-	  minor number of the device, third field specifies the operation type
-	  and the fourth field specifies the io_wait_time in ns.
-
-- blkio.io_merged
-	- Total number of bios/requests merged into requests belonging to this
-	  cgroup. This is further divided by the type of operation - read or
-	  write, sync or async.
-
-- blkio.io_queued
-	- Total number of requests queued up at any given instant for this
-	  cgroup. This is further divided by the type of operation - read or
-	  write, sync or async.
-
-- blkio.avg_queue_size
-	- Debugging aid only enabled if CONFIG_BFQ_CGROUP_DEBUG=y.
-	  The average queue size for this cgroup over the entire time of this
-	  cgroup's existence. Queue size samples are taken each time one of the
-	  queues of this cgroup gets a timeslice.
-
-- blkio.group_wait_time
-	- Debugging aid only enabled if CONFIG_BFQ_CGROUP_DEBUG=y.
-	  This is the amount of time the cgroup had to wait since it became busy
-	  (i.e., went from 0 to 1 request queued) to get a timeslice for one of
-	  its queues. This is different from the io_wait_time which is the
-	  cumulative total of the amount of time spent by each IO in that cgroup
-	  waiting in the scheduler queue. This is in nanoseconds. If this is
-	  read when the cgroup is in a waiting (for timeslice) state, the stat
-	  will only report the group_wait_time accumulated till the last time it
-	  got a timeslice and will not include the current delta.
-
-- blkio.empty_time
-	- Debugging aid only enabled if CONFIG_BFQ_CGROUP_DEBUG=y.
-	  This is the amount of time a cgroup spends without any pending
-	  requests when not being served, i.e., it does not include any time
-	  spent idling for one of the queues of the cgroup. This is in
-	  nanoseconds. If this is read when the cgroup is in an empty state,
-	  the stat will only report the empty_time accumulated till the last
-	  time it had a pending request and will not include the current delta.
-
-- blkio.idle_time
-	- Debugging aid only enabled if CONFIG_BFQ_CGROUP_DEBUG=y.
-	  This is the amount of time spent by the IO scheduler idling for a
-	  given cgroup in anticipation of a better request than the existing ones
-	  from other queues/cgroups. This is in nanoseconds. If this is read
-	  when the cgroup is in an idling state, the stat will only report the
-	  idle_time accumulated till the last idle period and will not include
-	  the current delta.
-
-- blkio.dequeue
-	- Debugging aid only enabled if CONFIG_BFQ_CGROUP_DEBUG=y. This
-	  gives the statistics about how many times a group was dequeued
-	  from service tree of the device. First two fields specify the major
-	  and minor number of the device and third field specifies the number
-	  of times a group was dequeued from a particular device.
-
-- blkio.*_recursive
-	- Recursive version of various stats. These files show the
-          same information as their non-recursive counterparts but
-          include stats from all the descendant cgroups.
-
-Throttling/Upper limit policy files
------------------------------------
-- blkio.throttle.read_bps_device
-	- Specifies upper limit on READ rate from the device. IO rate is
-	  specified in bytes per second. Rules are per device. Following is
-	  the format::
-
-	    echo "<major>:<minor>  <rate_bytes_per_second>" > /cgrp/blkio.throttle.read_bps_device
-
-- blkio.throttle.write_bps_device
-	- Specifies upper limit on WRITE rate to the device. IO rate is
-	  specified in bytes per second. Rules are per device. Following is
-	  the format::
-
-	    echo "<major>:<minor>  <rate_bytes_per_second>" > /cgrp/blkio.throttle.write_bps_device
-
-- blkio.throttle.read_iops_device
-	- Specifies upper limit on READ rate from the device. IO rate is
-	  specified in IO per second. Rules are per device. Following is
-	  the format::
-
-	   echo "<major>:<minor>  <rate_io_per_second>" > /cgrp/blkio.throttle.read_iops_device
-
-- blkio.throttle.write_iops_device
-	- Specifies upper limit on WRITE rate to the device. IO rate is
-	  specified in IO per second. Rules are per device. Following is
-	  the format::
-
-	    echo "<major>:<minor>  <rate_io_per_second>" > /cgrp/blkio.throttle.write_iops_device
-
-Note: If both BW and IOPS rules are specified for a device, then IO is
-      subjected to both the constraints.
-
-- blkio.throttle.io_serviced
-	- Number of IOs (bio) issued to the disk by the group. These
-	  are further divided by the type of operation - read or write, sync
-	  or async. First two fields specify the major and minor number of the
-	  device, third field specifies the operation type and the fourth field
-	  specifies the number of IOs.
-
-- blkio.throttle.io_service_bytes
-	- Number of bytes transferred to/from the disk by the group. These
-	  are further divided by the type of operation - read or write, sync
-	  or async. First two fields specify the major and minor number of the
-	  device, third field specifies the operation type and the fourth field
-	  specifies the number of bytes.
-
-Common files among various policies
------------------------------------
-- blkio.reset_stats
-	- Writing an int to this file will result in resetting all the stats
-	  for that cgroup.
diff --git a/Documentation/cgroup-v1/cgroups.rst b/Documentation/cgroup-v1/cgroups.rst
deleted file mode 100644
index 46bbe7e022d4..000000000000
--- a/Documentation/cgroup-v1/cgroups.rst
+++ /dev/null
@@ -1,695 +0,0 @@
-==============
-Control Groups
-==============
-
-Written by Paul Menage <menage@google.com> based on
-Documentation/cgroup-v1/cpusets.rst
-
-Original copyright statements from cpusets.txt:
-
-Portions Copyright (C) 2004 BULL SA.
-
-Portions Copyright (c) 2004-2006 Silicon Graphics, Inc.
-
-Modified by Paul Jackson <pj@sgi.com>
-
-Modified by Christoph Lameter <cl@linux.com>
-
-.. CONTENTS:
-
-	1. Control Groups
-	1.1 What are cgroups ?
-	1.2 Why are cgroups needed ?
-	1.3 How are cgroups implemented ?
-	1.4 What does notify_on_release do ?
-	1.5 What does clone_children do ?
-	1.6 How do I use cgroups ?
-	2. Usage Examples and Syntax
-	2.1 Basic Usage
-	2.2 Attaching processes
-	2.3 Mounting hierarchies by name
-	3. Kernel API
-	3.1 Overview
-	3.2 Synchronization
-	3.3 Subsystem API
-	4. Extended attributes usage
-	5. Questions
-
-1. Control Groups
-=================
-
-1.1 What are cgroups ?
-----------------------
-
-Control Groups provide a mechanism for aggregating/partitioning sets of
-tasks, and all their future children, into hierarchical groups with
-specialized behaviour.
-
-Definitions:
-
-A *cgroup* associates a set of tasks with a set of parameters for one
-or more subsystems.
-
-A *subsystem* is a module that makes use of the task grouping
-facilities provided by cgroups to treat groups of tasks in
-particular ways. A subsystem is typically a "resource controller" that
-schedules a resource or applies per-cgroup limits, but it may be
-anything that wants to act on a group of processes, e.g. a
-virtualization subsystem.
-
-A *hierarchy* is a set of cgroups arranged in a tree, such that
-every task in the system is in exactly one of the cgroups in the
-hierarchy, and a set of subsystems; each subsystem has system-specific
-state attached to each cgroup in the hierarchy.  Each hierarchy has
-an instance of the cgroup virtual filesystem associated with it.
-
-At any one time there may be multiple active hierarchies of task
-cgroups. Each hierarchy is a partition of all tasks in the system.
-
-User-level code may create and destroy cgroups by name in an
-instance of the cgroup virtual file system, specify and query to
-which cgroup a task is assigned, and list the task PIDs assigned to
-a cgroup. Those creations and assignments only affect the hierarchy
-associated with that instance of the cgroup file system.
-
-On their own, the only use for cgroups is for simple job
-tracking. The intention is that other subsystems hook into the generic
-cgroup support to provide new attributes for cgroups, such as
-accounting/limiting the resources which processes in a cgroup can
-access. For example, cpusets (see Documentation/cgroup-v1/cpusets.rst) allow
-you to associate a set of CPUs and a set of memory nodes with the
-tasks in each cgroup.
-
-1.2 Why are cgroups needed ?
-----------------------------
-
-There are multiple efforts to provide process aggregations in the
-Linux kernel, mainly for resource-tracking purposes. Such efforts
-include cpusets, CKRM/ResGroups, UserBeanCounters, and virtual server
-namespaces. These all require the basic notion of a
-grouping/partitioning of processes, with newly forked processes ending
-up in the same group (cgroup) as their parent process.
-
-The kernel cgroup patch provides the minimum essential kernel
-mechanisms required to efficiently implement such groups. It has
-minimal impact on the system fast paths, and provides hooks for
-specific subsystems such as cpusets to provide additional behaviour as
-desired.
-
-Multiple hierarchy support is provided to allow for situations where
-the division of tasks into cgroups is distinctly different for
-different subsystems - having parallel hierarchies allows each
-hierarchy to be a natural division of tasks, without having to handle
-complex combinations of tasks that would be present if several
-unrelated subsystems needed to be forced into the same tree of
-cgroups.
-
-At one extreme, each resource controller or subsystem could be in a
-separate hierarchy; at the other extreme, all subsystems
-would be attached to the same hierarchy.
-
-As an example of a scenario (originally proposed by vatsa@in.ibm.com)
-that can benefit from multiple hierarchies, consider a large
-university server with various users - students, professors, system
-tasks etc. The resource planning for this server could be along the
-following lines::
-
-       CPU :          "Top cpuset"
-                       /       \
-               CPUSet1         CPUSet2
-                  |               |
-               (Professors)    (Students)
-
-               In addition (system tasks) are attached to topcpuset (so
-               that they can run anywhere) with a limit of 20%
-
-       Memory : Professors (50%), Students (30%), system (20%)
-
-       Disk : Professors (50%), Students (30%), system (20%)
-
-       Network : WWW browsing (20%), Network File System (60%), others (20%)
-                               / \
-               Professors (15%)  students (5%)
-
-Browsers like Firefox/Lynx go into the WWW network class, while (k)nfsd goes
-into the NFS network class.
-
-At the same time Firefox/Lynx will share an appropriate CPU/Memory class
-depending on who launched it (prof/student).
-
-With the ability to classify tasks differently for different resources
-(by putting those resource subsystems in different hierarchies),
-the admin can easily set up a script which receives exec notifications
-and depending on who is launching the browser he can::
-
-    # echo browser_pid > /sys/fs/cgroup/<restype>/<userclass>/tasks
-
-With only a single hierarchy, he now would potentially have to create
-a separate cgroup for every browser launched and associate it with
-appropriate network and other resource class.  This may lead to
-proliferation of such cgroups.
-
-Also let's say that the administrator would like to give enhanced network
-access temporarily to a student's browser (since it is night and the user
-wants to do online gaming :))  OR give one of the student's simulation
-apps enhanced CPU power.
-
-With ability to write PIDs directly to resource classes, it's just a
-matter of::
-
-       # echo pid > /sys/fs/cgroup/network/<new_class>/tasks
-       (after some time)
-       # echo pid > /sys/fs/cgroup/network/<orig_class>/tasks
-
-Without this ability, the administrator would have to split the cgroup into
-multiple separate ones and then associate the new cgroups with the
-new resource classes.
-
-
-
-1.3 How are cgroups implemented ?
----------------------------------
-
-Control Groups extends the kernel as follows:
-
- - Each task in the system has a reference-counted pointer to a
-   css_set.
-
- - A css_set contains a set of reference-counted pointers to
-   cgroup_subsys_state objects, one for each cgroup subsystem
-   registered in the system. There is no direct link from a task to
-   the cgroup of which it's a member in each hierarchy, but this
-   can be determined by following pointers through the
-   cgroup_subsys_state objects. This is because accessing the
-   subsystem state is something that's expected to happen frequently
-   and in performance-critical code, whereas operations that require a
-   task's actual cgroup assignments (in particular, moving between
-   cgroups) are less common. A linked list runs through the cg_list
-   field of each task_struct using the css_set, anchored at
-   css_set->tasks.
-
- - A cgroup hierarchy filesystem can be mounted for browsing and
-   manipulation from user space.
-
- - You can list all the tasks (by PID) attached to any cgroup.
-
-The implementation of cgroups requires a few, simple hooks
-into the rest of the kernel, none in performance-critical paths:
-
- - in init/main.c, to initialize the root cgroups and initial
-   css_set at system boot.
-
- - in fork and exit, to attach and detach a task from its css_set.
-
-In addition, a new file system of type "cgroup" may be mounted, to
-enable browsing and modifying the cgroups presently known to the
-kernel.  When mounting a cgroup hierarchy, you may specify a
-comma-separated list of subsystems to mount as the filesystem mount
-options.  By default, mounting the cgroup filesystem attempts to
-mount a hierarchy containing all registered subsystems.
-
-If an active hierarchy with exactly the same set of subsystems already
-exists, it will be reused for the new mount. If no existing hierarchy
-matches, and any of the requested subsystems are in use in an existing
-hierarchy, the mount will fail with -EBUSY. Otherwise, a new hierarchy
-is activated, associated with the requested subsystems.
-
-It's not currently possible to bind a new subsystem to an active
-cgroup hierarchy, or to unbind a subsystem from an active cgroup
-hierarchy. This may be possible in future, but is fraught with nasty
-error-recovery issues.
-
-When a cgroup filesystem is unmounted, if there are any
-child cgroups created below the top-level cgroup, that hierarchy
-will remain active even though unmounted; if there are no
-child cgroups then the hierarchy will be deactivated.
-
-No new system calls are added for cgroups - all support for
-querying and modifying cgroups is via this cgroup file system.
-
-Each task under /proc has an added file named 'cgroup' displaying,
-for each active hierarchy, the subsystem names and the cgroup name
-as the path relative to the root of the cgroup file system.
-
-Each cgroup is represented by a directory in the cgroup file system
-containing the following files describing that cgroup:
-
- - tasks: list of tasks (by PID) attached to that cgroup.  This list
-   is not guaranteed to be sorted.  Writing a thread ID into this file
-   moves the thread into this cgroup.
- - cgroup.procs: list of thread group IDs in the cgroup.  This list is
-   not guaranteed to be sorted or free of duplicate TGIDs, and userspace
-   should sort/uniquify the list if this property is required.
-   Writing a thread group ID into this file moves all threads in that
-   group into this cgroup.
- - notify_on_release flag: run the release agent on exit?
- - release_agent: the path to use for release notifications (this file
-   exists in the top cgroup only)
-
-Other subsystems such as cpusets may add additional files in each
-cgroup dir.
-
-New cgroups are created using the mkdir system call or shell
-command.  The properties of a cgroup, such as its flags, are
-modified by writing to the appropriate file in that cgroup's
-directory, as listed above.
-
-The named hierarchical structure of nested cgroups allows partitioning
-a large system into nested, dynamically changeable, "soft-partitions".
-
-The attachment of each task, automatically inherited at fork by any
-children of that task, to a cgroup allows organizing the work load
-on a system into related sets of tasks.  A task may be re-attached to
-any other cgroup, if allowed by the permissions on the necessary
-cgroup file system directories.
-
-When a task is moved from one cgroup to another, it gets a new
-css_set pointer - if there's an already existing css_set with the
-desired collection of cgroups then that group is reused, otherwise a new
-css_set is allocated. The appropriate existing css_set is located by
-looking into a hash table.
-
-To allow access from a cgroup to the css_sets (and hence tasks)
-that comprise it, a set of cg_cgroup_link objects form a lattice;
-each cg_cgroup_link is linked into a list of cg_cgroup_links for
-a single cgroup on its cgrp_link_list field, and a list of
-cg_cgroup_links for a single css_set on its cg_link_list.
-
-Thus the set of tasks in a cgroup can be listed by iterating over
-each css_set that references the cgroup, and sub-iterating over
-each css_set's task set.
-
-The use of a Linux virtual file system (vfs) to represent the
-cgroup hierarchy provides for a familiar permission and name space
-for cgroups, with a minimum of additional kernel code.
-
-1.4 What does notify_on_release do ?
-------------------------------------
-
-If the notify_on_release flag is enabled (1) in a cgroup, then
-whenever the last task in the cgroup leaves (exits or attaches to
-some other cgroup) and the last child cgroup of that cgroup
-is removed, then the kernel runs the command specified by the contents
-of the "release_agent" file in that hierarchy's root directory,
-supplying the pathname (relative to the mount point of the cgroup
-file system) of the abandoned cgroup.  This enables automatic
-removal of abandoned cgroups.  The default value of
-notify_on_release in the root cgroup at system boot is disabled
-(0).  The default value of other cgroups at creation is the current
-value of their parents' notify_on_release settings. The default value of
-a cgroup hierarchy's release_agent path is empty.
-
-1.5 What does clone_children do ?
----------------------------------
-
-This flag only affects the cpuset controller. If the clone_children
-flag is enabled (1) in a cgroup, a new cpuset cgroup will copy its
-configuration from the parent during initialization.
-
-1.6 How do I use cgroups ?
---------------------------
-
-To start a new job that is to be contained within a cgroup, using
-the "cpuset" cgroup subsystem, the steps are something like::
-
- 1) mount -t tmpfs cgroup_root /sys/fs/cgroup
- 2) mkdir /sys/fs/cgroup/cpuset
- 3) mount -t cgroup -ocpuset cpuset /sys/fs/cgroup/cpuset
- 4) Create the new cgroup by doing mkdir's and write's (or echo's) in
-    the /sys/fs/cgroup/cpuset virtual file system.
- 5) Start a task that will be the "founding father" of the new job.
- 6) Attach that task to the new cgroup by writing its PID to the
-    /sys/fs/cgroup/cpuset tasks file for that cgroup.
- 7) fork, exec or clone the job tasks from this founding father task.
-
-For example, the following sequence of commands will setup a cgroup
-named "Charlie", containing just CPUs 2 and 3, and Memory Node 1,
-and then start a subshell 'sh' in that cgroup::
-
-  mount -t tmpfs cgroup_root /sys/fs/cgroup
-  mkdir /sys/fs/cgroup/cpuset
-  mount -t cgroup cpuset -ocpuset /sys/fs/cgroup/cpuset
-  cd /sys/fs/cgroup/cpuset
-  mkdir Charlie
-  cd Charlie
-  /bin/echo 2-3 > cpuset.cpus
-  /bin/echo 1 > cpuset.mems
-  /bin/echo $$ > tasks
-  sh
-  # The subshell 'sh' is now running in cgroup Charlie
-  # The next line should display '/Charlie'
-  cat /proc/self/cgroup
-
-2. Usage Examples and Syntax
-============================
-
-2.1 Basic Usage
----------------
-
-Creating, modifying, using cgroups can be done through the cgroup
-virtual filesystem.
-
-To mount a cgroup hierarchy with all available subsystems, type::
-
-  # mount -t cgroup xxx /sys/fs/cgroup
-
-The "xxx" is not interpreted by the cgroup code, but will appear in
-/proc/mounts so may be any useful identifying string that you like.
-
-Note: Some subsystems do not work without some user input first.  For instance,
-if cpusets are enabled the user will have to populate the cpus and mems files
-for each new cgroup created before that group can be used.
-
-As explained in section `1.2 Why are cgroups needed?` you should create
-different hierarchies of cgroups for each single resource or group of
-resources you want to control. Therefore, you should mount a tmpfs on
-/sys/fs/cgroup and create directories for each cgroup resource or resource
-group::
-
-  # mount -t tmpfs cgroup_root /sys/fs/cgroup
-  # mkdir /sys/fs/cgroup/rg1
-
-To mount a cgroup hierarchy with just the cpuset and memory
-subsystems, type::
-
-  # mount -t cgroup -o cpuset,memory hier1 /sys/fs/cgroup/rg1
-
-While remounting cgroups is currently supported, it is not recommended
-to use it. Remounting allows changing bound subsystems and
-release_agent. Rebinding is hardly useful as it only works when the
-hierarchy is empty and release_agent itself should be replaced with
-conventional fsnotify. The support for remounting will be removed in
-the future.
-
-To specify a hierarchy's release_agent::
-
-  # mount -t cgroup -o cpuset,release_agent="/sbin/cpuset_release_agent" \
-    xxx /sys/fs/cgroup/rg1
-
-Note that specifying 'release_agent' more than once will return failure.
-
-Note that changing the set of subsystems is currently only supported
-when the hierarchy consists of a single (root) cgroup. Supporting
-the ability to arbitrarily bind/unbind subsystems from an existing
-cgroup hierarchy is intended to be implemented in the future.
-
-Then under /sys/fs/cgroup/rg1 you can find a tree that corresponds to the
-tree of the cgroups in the system. For instance, /sys/fs/cgroup/rg1
-is the cgroup that holds the whole system.
-
-If you want to change the value of release_agent::
-
-  # echo "/sbin/new_release_agent" > /sys/fs/cgroup/rg1/release_agent
-
-It can also be changed via remount.
-
-If you want to create a new cgroup under /sys/fs/cgroup/rg1::
-
-  # cd /sys/fs/cgroup/rg1
-  # mkdir my_cgroup
-
-Now you want to do something with this cgroup::
-
-  # cd my_cgroup
-
-In this directory you can find several files::
-
-  # ls
-  cgroup.procs notify_on_release tasks
-  (plus whatever files added by the attached subsystems)
-
-Now attach your shell to this cgroup::
-
-  # /bin/echo $$ > tasks
-
-You can also create cgroups inside your cgroup by using mkdir in this
-directory::
-
-  # mkdir my_sub_cs
-
-To remove a cgroup, just use rmdir::
-
-  # rmdir my_sub_cs
-
-This will fail if the cgroup is in use (has cgroups inside, or
-has processes attached, or is held alive by other subsystem-specific
-reference).
-
-2.2 Attaching processes
------------------------
-
-::
-
-  # /bin/echo PID > tasks
-
-Note that it is PID, not PIDs. You can only attach ONE task at a time.
-If you have several tasks to attach, you have to do it one after another::
-
-  # /bin/echo PID1 > tasks
-  # /bin/echo PID2 > tasks
-	  ...
-  # /bin/echo PIDn > tasks
-
-You can attach the current shell task by echoing 0::
-
-  # echo 0 > tasks
-
-You can use the cgroup.procs file instead of the tasks file to move all
-threads in a threadgroup at once. Echoing the PID of any task in a
-threadgroup to cgroup.procs causes all tasks in that threadgroup to be
-attached to the cgroup. Writing 0 to cgroup.procs moves all tasks
-in the writing task's threadgroup.
-
-Note: Since every task is always a member of exactly one cgroup in each
-mounted hierarchy, to remove a task from its current cgroup you must
-move it into a new cgroup (possibly the root cgroup) by writing to the
-new cgroup's tasks file.
-
-Note: Due to some restrictions enforced by some cgroup subsystems, moving
-a process to another cgroup can fail.
-
-2.3 Mounting hierarchies by name
---------------------------------
-
-Passing the name=<x> option when mounting a cgroups hierarchy
-associates the given name with the hierarchy.  This can be used when
-mounting a pre-existing hierarchy, in order to refer to it by name
-rather than by its set of active subsystems.  Each hierarchy is either
-nameless, or has a unique name.
-
-The name should match [\w.-]+
-
-When passing a name=<x> option for a new hierarchy, you need to
-specify subsystems manually; the legacy behaviour of mounting all
-subsystems when none are explicitly specified is not supported when
-you give a subsystem a name.
-
-The name of the subsystem appears as part of the hierarchy description
-in /proc/mounts and /proc/<pid>/cgroups.
-
-
-3. Kernel API
-=============
-
-3.1 Overview
-------------
-
-Each kernel subsystem that wants to hook into the generic cgroup
-system needs to create a cgroup_subsys object. This contains
-various methods, which are callbacks from the cgroup system, along
-with a subsystem ID which will be assigned by the cgroup system.
-
-Other fields in the cgroup_subsys object include:
-
-- subsys_id: a unique array index for the subsystem, indicating which
-  entry in cgroup->subsys[] this subsystem should be managing.
-
-- name: should be initialized to a unique subsystem name. Should be
-  no longer than MAX_CGROUP_TYPE_NAMELEN.
-
-- early_init: indicate if the subsystem needs early initialization
-  at system boot.
-
-Each cgroup object created by the system has an array of pointers,
-indexed by subsystem ID; this pointer is entirely managed by the
-subsystem; the generic cgroup code will never touch this pointer.
-
-3.2 Synchronization
--------------------
-
-There is a global mutex, cgroup_mutex, used by the cgroup
-system. This should be taken by anything that wants to modify a
-cgroup. It may also be taken to prevent cgroups from being
-modified, but more specific locks may be more appropriate in that
-situation.
-
-See kernel/cgroup.c for more details.
-
-Subsystems can take/release the cgroup_mutex via the functions
-cgroup_lock()/cgroup_unlock().
-
-Accessing a task's cgroup pointer may be done in the following ways:
-- while holding cgroup_mutex
-- while holding the task's alloc_lock (via task_lock())
-- inside an rcu_read_lock() section via rcu_dereference()
-
-3.3 Subsystem API
------------------
-
-Each subsystem should:
-
-- add an entry in linux/cgroup_subsys.h
-- define a cgroup_subsys object called <name>_cgrp_subsys
-
-Each subsystem may export the following methods. The only mandatory
-methods are css_alloc/free. Any others that are null are presumed to
-be successful no-ops.
-
-``struct cgroup_subsys_state *css_alloc(struct cgroup *cgrp)``
-(cgroup_mutex held by caller)
-
-Called to allocate a subsystem state object for a cgroup. The
-subsystem should allocate its subsystem state object for the passed
-cgroup, returning a pointer to the new object on success or an
-ERR_PTR() value. On success, the subsystem pointer should point to
-a structure of type cgroup_subsys_state (typically embedded in a
-larger subsystem-specific object), which will be initialized by the
-cgroup system. Note that this will be called at initialization to
-create the root subsystem state for this subsystem; this case can be
-identified by the passed cgroup object having a NULL parent (since
-it's the root of the hierarchy) and may be an appropriate place for
-initialization code.
-
-``int css_online(struct cgroup *cgrp)``
-(cgroup_mutex held by caller)
-
-Called after @cgrp successfully completed all allocations and was made
-visible to cgroup_for_each_child/descendant_*() iterators. The
-subsystem may choose to fail creation by returning -errno. This
-callback can be used to implement reliable state sharing and
-propagation along the hierarchy. See the comment on
-cgroup_for_each_descendant_pre() for details.
-
-``void css_offline(struct cgroup *cgrp);``
-(cgroup_mutex held by caller)
-
-This is the counterpart of css_online() and called iff css_online()
-has succeeded on @cgrp. This signifies the beginning of the end of
-@cgrp. @cgrp is being removed and the subsystem should start dropping
-all references it's holding on @cgrp. When all references are dropped,
-cgroup removal will proceed to the next step - css_free(). After this
-callback, @cgrp should be considered dead to the subsystem.
-
-``void css_free(struct cgroup *cgrp)``
-(cgroup_mutex held by caller)
-
-The cgroup system is about to free @cgrp; the subsystem should free
-its subsystem state object. By the time this method is called, @cgrp
-is completely unused; @cgrp->parent is still valid. (Note - can also
-be called for a newly-created cgroup if an error occurs after this
-subsystem's create() method has been called for the new cgroup).
-
-``int can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)``
-(cgroup_mutex held by caller)
-
-Called prior to moving one or more tasks into a cgroup; if the
-subsystem returns an error, this will abort the attach operation.
-@tset contains the tasks to be attached and is guaranteed to have at
-least one task in it.
-
-If there are multiple tasks in the taskset, then:
-  - it's guaranteed that all are from the same thread group
-  - @tset contains all tasks from the thread group whether or not
-    they're switching cgroups
-  - the first task is the leader
-
-Each @tset entry also contains the task's old cgroup and tasks which
-aren't switching cgroup can be skipped easily using the
-cgroup_taskset_for_each() iterator. Note that this isn't called on a
-fork. If this method returns 0 (success) then this should remain valid
-while the caller holds cgroup_mutex and it is ensured that either
-attach() or cancel_attach() will be called in future.
-
-``void css_reset(struct cgroup_subsys_state *css)``
-(cgroup_mutex held by caller)
-
-An optional operation which should restore @css's configuration to the
-initial state.  This is currently only used on the unified hierarchy
-when a subsystem is disabled on a cgroup through
-"cgroup.subtree_control" but should remain enabled because other
-subsystems depend on it.  cgroup core makes such a css invisible by
-removing the associated interface files and invokes this callback so
-that the hidden subsystem can return to the initial neutral state.
-This prevents unexpected resource control from a hidden css and
-ensures that the configuration is in the initial state when it is made
-visible again later.
-
-``void cancel_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)``
-(cgroup_mutex held by caller)
-
-Called when a task attach operation has failed after can_attach() has succeeded.
-A subsystem whose can_attach() has some side-effects should provide this
-function, so that the subsystem can implement a rollback. If not, not necessary.
-This will be called only for subsystems whose can_attach() operation has
-succeeded. The parameters are identical to can_attach().
-
-``void attach(struct cgroup *cgrp, struct cgroup_taskset *tset)``
-(cgroup_mutex held by caller)
-
-Called after the task has been attached to the cgroup, to allow any
-post-attachment activity that requires memory allocations or blocking.
-The parameters are identical to can_attach().
-
-``void fork(struct task_struct *task)``
-
-Called when a task is forked into a cgroup.
-
-``void exit(struct task_struct *task)``
-
-Called during task exit.
-
-``void free(struct task_struct *task)``
-
-Called when the task_struct is freed.
-
-``void bind(struct cgroup *root)``
-(cgroup_mutex held by caller)
-
-Called when a cgroup subsystem is rebound to a different hierarchy
-and root cgroup. Currently this will only involve movement between
-the default hierarchy (which never has sub-cgroups) and a hierarchy
-that is being created/destroyed (and hence has no sub-cgroups).
-
-4. Extended attribute usage
-===========================
-
-cgroup filesystem supports certain types of extended attributes in its
-directories and files.  The current supported types are:
-
-	- Trusted (XATTR_TRUSTED)
-	- Security (XATTR_SECURITY)
-
-Both require CAP_SYS_ADMIN capability to set.
-
-Like in tmpfs, the extended attributes in cgroup filesystem are stored
-using kernel memory and it's advised to keep the usage at minimum.  This
-is the reason why user defined extended attributes are not supported, since
-any user can do it and there's no limit in the value size.
-
-The current known users for this feature are SELinux to limit cgroup usage
-in containers and systemd for assorted meta data like main PID in a cgroup
-(systemd creates a cgroup per service).
-
-5. Questions
-============
-
-::
-
-  Q: what's up with this '/bin/echo' ?
-  A: bash's builtin 'echo' command does not check calls to write() against
-     errors. If you use it in the cgroup file system, you won't be
-     able to tell whether a command succeeded or failed.
-
-  Q: When I attach processes, only the first of the line gets really attached !
-  A: We can only return one error code per call to write(). So you should also
-     put only ONE PID.
diff --git a/Documentation/cgroup-v1/cpuacct.rst b/Documentation/cgroup-v1/cpuacct.rst
deleted file mode 100644
index d30ed81d2ad7..000000000000
--- a/Documentation/cgroup-v1/cpuacct.rst
+++ /dev/null
@@ -1,50 +0,0 @@
-=========================
-CPU Accounting Controller
-=========================
-
-The CPU accounting controller is used to group tasks using cgroups and
-account the CPU usage of these groups of tasks.
-
-The CPU accounting controller supports multi-hierarchy groups. An accounting
-group accumulates the CPU usage of all of its child groups and the tasks
-directly present in its group.
-
-Accounting groups can be created by first mounting the cgroup filesystem::
-
-  # mount -t cgroup -ocpuacct none /sys/fs/cgroup
-
-With the above step, the initial or the parent accounting group becomes
-visible at /sys/fs/cgroup. At bootup, this group includes all the tasks in
-the system. /sys/fs/cgroup/tasks lists the tasks in this cgroup.
-/sys/fs/cgroup/cpuacct.usage gives the CPU time (in nanoseconds) obtained
-by this group which is essentially the CPU time obtained by all the tasks
-in the system.
-
-New accounting groups can be created under the parent group /sys/fs/cgroup::
-
-  # cd /sys/fs/cgroup
-  # mkdir g1
-  # echo $$ > g1/tasks
-
-The above steps create a new group g1 and move the current shell
-process (bash) into it. CPU time consumed by this bash and its children
-can be obtained from g1/cpuacct.usage and the same is accumulated in
-/sys/fs/cgroup/cpuacct.usage also.
-
-cpuacct.stat file lists a few statistics which further divide the
-CPU time obtained by the cgroup into user and system times. Currently
-the following statistics are supported:
-
-user: Time spent by tasks of the cgroup in user mode.
-system: Time spent by tasks of the cgroup in kernel mode.
-
-user and system are in USER_HZ unit.
-
-cpuacct controller uses percpu_counter interface to collect user and
-system times. This has two side effects:
-
-- It is theoretically possible to see wrong values for user and system times.
-  This is because percpu_counter_read() on 32bit systems isn't safe
-  against concurrent writes.
-- It is possible to see slightly outdated values for user and system times
-  due to the batch processing nature of percpu_counter.
diff --git a/Documentation/cgroup-v1/cpusets.rst b/Documentation/cgroup-v1/cpusets.rst
deleted file mode 100644
index b6a42cdea72b..000000000000
--- a/Documentation/cgroup-v1/cpusets.rst
+++ /dev/null
@@ -1,866 +0,0 @@
-=======
-CPUSETS
-=======
-
-Copyright (C) 2004 BULL SA.
-
-Written by Simon.Derr@bull.net
-
-- Portions Copyright (c) 2004-2006 Silicon Graphics, Inc.
-- Modified by Paul Jackson <pj@sgi.com>
-- Modified by Christoph Lameter <cl@linux.com>
-- Modified by Paul Menage <menage@google.com>
-- Modified by Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
-
-.. CONTENTS:
-
-   1. Cpusets
-     1.1 What are cpusets ?
-     1.2 Why are cpusets needed ?
-     1.3 How are cpusets implemented ?
-     1.4 What are exclusive cpusets ?
-     1.5 What is memory_pressure ?
-     1.6 What is memory spread ?
-     1.7 What is sched_load_balance ?
-     1.8 What is sched_relax_domain_level ?
-     1.9 How do I use cpusets ?
-   2. Usage Examples and Syntax
-     2.1 Basic Usage
-     2.2 Adding/removing cpus
-     2.3 Setting flags
-     2.4 Attaching processes
-   3. Questions
-   4. Contact
-
-1. Cpusets
-==========
-
-1.1 What are cpusets ?
-----------------------
-
-Cpusets provide a mechanism for assigning a set of CPUs and Memory
-Nodes to a set of tasks.   In this document "Memory Node" refers to
-an on-line node that contains memory.
-
-Cpusets constrain the CPU and Memory placement of tasks to only
-the resources within a task's current cpuset.  They form a nested
-hierarchy visible in a virtual file system.  These are the essential
-hooks, beyond what is already present, required to manage dynamic
-job placement on large systems.
-
-Cpusets use the generic cgroup subsystem described in
-Documentation/cgroup-v1/cgroups.rst.
-
-Requests by a task, using the sched_setaffinity(2) system call to
-include CPUs in its CPU affinity mask, and using the mbind(2) and
-set_mempolicy(2) system calls to include Memory Nodes in its memory
-policy, are both filtered through that task's cpuset, filtering out any
-CPUs or Memory Nodes not in that cpuset.  The scheduler will not
-schedule a task on a CPU that is not allowed in its cpus_allowed
-vector, and the kernel page allocator will not allocate a page on a
-node that is not allowed in the requesting task's mems_allowed vector.
-
-User level code may create and destroy cpusets by name in the cgroup
-virtual file system, manage the attributes and permissions of these
-cpusets and which CPUs and Memory Nodes are assigned to each cpuset,
-specify and query to which cpuset a task is assigned, and list the
-task pids assigned to a cpuset.
-
-
-1.2 Why are cpusets needed ?
-----------------------------
-
-The management of large computer systems, with many processors (CPUs),
-complex memory cache hierarchies and multiple Memory Nodes having
-non-uniform access times (NUMA) presents additional challenges for
-the efficient scheduling and memory placement of processes.
-
-Frequently more modest sized systems can be operated with adequate
-efficiency just by letting the operating system automatically share
-the available CPU and Memory resources amongst the requesting tasks.
-
-But larger systems, which benefit more from careful processor and
-memory placement to reduce memory access times and contention,
-and which typically represent a larger investment for the customer,
-can benefit from explicitly placing jobs on properly sized subsets of
-the system.
-
-This can be especially valuable on:
-
-    * Web Servers running multiple instances of the same web application,
-    * Servers running different applications (for instance, a web server
-      and a database), or
-    * NUMA systems running large HPC applications with demanding
-      performance characteristics.
-
-These subsets, or "soft partitions" must be able to be dynamically
-adjusted, as the job mix changes, without impacting other concurrently
-executing jobs. The location of the running jobs pages may also be moved
-when the memory locations are changed.
-
-The kernel cpuset patch provides the minimum essential kernel
-mechanisms required to efficiently implement such subsets.  It
-leverages existing CPU and Memory Placement facilities in the Linux
-kernel to avoid any additional impact on the critical scheduler or
-memory allocator code.
-
-
-1.3 How are cpusets implemented ?
----------------------------------
-
-Cpusets provide a Linux kernel mechanism to constrain which CPUs and
-Memory Nodes are used by a process or set of processes.
-
-The Linux kernel already has a pair of mechanisms to specify on which
-CPUs a task may be scheduled (sched_setaffinity) and on which Memory
-Nodes it may obtain memory (mbind, set_mempolicy).
-
-Cpusets extend these two mechanisms as follows:
-
- - Cpusets are sets of allowed CPUs and Memory Nodes, known to the
-   kernel.
- - Each task in the system is attached to a cpuset, via a pointer
-   in the task structure to a reference counted cgroup structure.
- - Calls to sched_setaffinity are filtered to just those CPUs
-   allowed in that task's cpuset.
- - Calls to mbind and set_mempolicy are filtered to just
-   those Memory Nodes allowed in that task's cpuset.
- - The root cpuset contains all the system's CPUs and Memory
-   Nodes.
- - For any cpuset, one can define child cpusets containing a subset
-   of the parent's CPU and Memory Node resources.
- - The hierarchy of cpusets can be mounted at /dev/cpuset, for
-   browsing and manipulation from user space.
- - A cpuset may be marked exclusive, which ensures that no other
-   cpuset (except direct ancestors and descendants) may contain
-   any overlapping CPUs or Memory Nodes.
- - You can list all the tasks (by pid) attached to any cpuset.
-
-The implementation of cpusets requires a few, simple hooks
-into the rest of the kernel, none in performance critical paths:
-
- - in init/main.c, to initialize the root cpuset at system boot.
- - in fork and exit, to attach and detach a task from its cpuset.
- - in sched_setaffinity, to mask the requested CPUs by what's
-   allowed in that task's cpuset.
- - in sched.c migrate_live_tasks(), to keep migrating tasks within
-   the CPUs allowed by their cpuset, if possible.
- - in the mbind and set_mempolicy system calls, to mask the requested
-   Memory Nodes by what's allowed in that task's cpuset.
- - in page_alloc.c, to restrict memory to allowed nodes.
- - in vmscan.c, to restrict page recovery to the current cpuset.
-
-You should mount the "cgroup" filesystem type in order to enable
-browsing and modifying the cpusets presently known to the kernel.  No
-new system calls are added for cpusets - all support for querying and
-modifying cpusets is via this cpuset file system.
-
-The /proc/<pid>/status file for each task has four added lines,
-displaying the task's cpus_allowed (on which CPUs it may be scheduled)
-and mems_allowed (on which Memory Nodes it may obtain memory),
-in the two formats seen in the following example::
-
-  Cpus_allowed:   ffffffff,ffffffff,ffffffff,ffffffff
-  Cpus_allowed_list:      0-127
-  Mems_allowed:   ffffffff,ffffffff
-  Mems_allowed_list:      0-63
-
-Each cpuset is represented by a directory in the cgroup file system
-containing (on top of the standard cgroup files) the following
-files describing that cpuset:
-
- - cpuset.cpus: list of CPUs in that cpuset
- - cpuset.mems: list of Memory Nodes in that cpuset
- - cpuset.memory_migrate flag: if set, move pages to cpusets nodes
- - cpuset.cpu_exclusive flag: is cpu placement exclusive?
- - cpuset.mem_exclusive flag: is memory placement exclusive?
- - cpuset.mem_hardwall flag:  is memory allocation hardwalled
- - cpuset.memory_pressure: measure of how much paging pressure in cpuset
- - cpuset.memory_spread_page flag: if set, spread page cache evenly on allowed nodes
- - cpuset.memory_spread_slab flag: if set, spread slab cache evenly on allowed nodes
- - cpuset.sched_load_balance flag: if set, load balance within CPUs on that cpuset
- - cpuset.sched_relax_domain_level: the searching range when migrating tasks
-
-In addition, only the root cpuset has the following file:
-
- - cpuset.memory_pressure_enabled flag: compute memory_pressure?
-
-New cpusets are created using the mkdir system call or shell
-command.  The properties of a cpuset, such as its flags, allowed
-CPUs and Memory Nodes, and attached tasks, are modified by writing
-to the appropriate file in that cpusets directory, as listed above.
-
-The named hierarchical structure of nested cpusets allows partitioning
-a large system into nested, dynamically changeable, "soft-partitions".
-
-The attachment of each task, automatically inherited at fork by any
-children of that task, to a cpuset allows organizing the work load
-on a system into related sets of tasks such that each set is constrained
-to using the CPUs and Memory Nodes of a particular cpuset.  A task
-may be re-attached to any other cpuset, if allowed by the permissions
-on the necessary cpuset file system directories.
-
-Such management of a system "in the large" integrates smoothly with
-the detailed placement done on individual tasks and memory regions
-using the sched_setaffinity, mbind and set_mempolicy system calls.
-
-The following rules apply to each cpuset:
-
- - Its CPUs and Memory Nodes must be a subset of its parents.
- - It can't be marked exclusive unless its parent is.
- - If its cpu or memory is exclusive, they may not overlap any sibling.
-
-These rules, and the natural hierarchy of cpusets, enable efficient
-enforcement of the exclusive guarantee, without having to scan all
-cpusets every time any of them change to ensure nothing overlaps an
-exclusive cpuset.  Also, the use of a Linux virtual file system (vfs)
-to represent the cpuset hierarchy provides for a familiar permission
-and name space for cpusets, with a minimum of additional kernel code.
-
-The cpus and mems files in the root (top_cpuset) cpuset are
-read-only.  The cpus file automatically tracks the value of
-cpu_online_mask using a CPU hotplug notifier, and the mems file
-automatically tracks the value of node_states[N_MEMORY]--i.e.,
-nodes with memory--using the cpuset_track_online_nodes() hook.
-
-
-1.4 What are exclusive cpusets ?
---------------------------------
-
-If a cpuset is cpu or mem exclusive, no other cpuset, other than
-a direct ancestor or descendant, may share any of the same CPUs or
-Memory Nodes.
-
-A cpuset that is cpuset.mem_exclusive *or* cpuset.mem_hardwall is "hardwalled",
-i.e. it restricts kernel allocations for page, buffer and other data
-commonly shared by the kernel across multiple users.  All cpusets,
-whether hardwalled or not, restrict allocations of memory for user
-space.  This enables configuring a system so that several independent
-jobs can share common kernel data, such as file system pages, while
-isolating each job's user allocation in its own cpuset.  To do this,
-construct a large mem_exclusive cpuset to hold all the jobs, and
-construct child, non-mem_exclusive cpusets for each individual job.
-Only a small amount of typical kernel memory, such as requests from
-interrupt handlers, is allowed to be taken outside even a
-mem_exclusive cpuset.
-
-
-1.5 What is memory_pressure ?
------------------------------
-The memory_pressure of a cpuset provides a simple per-cpuset metric
-of the rate that the tasks in a cpuset are attempting to free up in
-use memory on the nodes of the cpuset to satisfy additional memory
-requests.
-
-This enables batch managers monitoring jobs running in dedicated
-cpusets to efficiently detect what level of memory pressure that job
-is causing.
-
-This is useful both on tightly managed systems running a wide mix of
-submitted jobs, which may choose to terminate or re-prioritize jobs that
-are trying to use more memory than allowed on the nodes assigned to them,
-and with tightly coupled, long running, massively parallel scientific
-computing jobs that will dramatically fail to meet required performance
-goals if they start to use more memory than allowed to them.
-
-This mechanism provides a very economical way for the batch manager
-to monitor a cpuset for signs of memory pressure.  It's up to the
-batch manager or other user code to decide what to do about it and
-take action.
-
-==>
-    Unless this feature is enabled by writing "1" to the special file
-    /dev/cpuset/memory_pressure_enabled, the hook in the rebalance
-    code of __alloc_pages() for this metric reduces to simply noticing
-    that the cpuset_memory_pressure_enabled flag is zero.  So only
-    systems that enable this feature will compute the metric.
-
-Why a per-cpuset, running average:
-
-    Because this meter is per-cpuset, rather than per-task or mm,
-    the system load imposed by a batch scheduler monitoring this
-    metric is sharply reduced on large systems, because a scan of
-    the tasklist can be avoided on each set of queries.
-
-    Because this meter is a running average, instead of an accumulating
-    counter, a batch scheduler can detect memory pressure with a
-    single read, instead of having to read and accumulate results
-    for a period of time.
-
-    Because this meter is per-cpuset rather than per-task or mm,
-    the batch scheduler can obtain the key information, memory
-    pressure in a cpuset, with a single read, rather than having to
-    query and accumulate results over all the (dynamically changing)
-    set of tasks in the cpuset.
-
-A per-cpuset simple digital filter (requires a spinlock and 3 words
-of data per-cpuset) is kept, and updated by any task attached to that
-cpuset, if it enters the synchronous (direct) page reclaim code.
-
-A per-cpuset file provides an integer number representing the recent
-(half-life of 10 seconds) rate of direct page reclaims caused by
-the tasks in the cpuset, in units of reclaims attempted per second,
-times 1000.
-
-
-1.6 What is memory spread ?
----------------------------
-There are two boolean flag files per cpuset that control where the
-kernel allocates pages for the file system buffers and related in
-kernel data structures.  They are called 'cpuset.memory_spread_page' and
-'cpuset.memory_spread_slab'.
-
-If the per-cpuset boolean flag file 'cpuset.memory_spread_page' is set, then
-the kernel will spread the file system buffers (page cache) evenly
-over all the nodes that the faulting task is allowed to use, instead
-of preferring to put those pages on the node where the task is running.
-
-If the per-cpuset boolean flag file 'cpuset.memory_spread_slab' is set,
-then the kernel will spread some file system related slab caches,
-such as for inodes and dentries evenly over all the nodes that the
-faulting task is allowed to use, instead of preferring to put those
-pages on the node where the task is running.
-
-The setting of these flags does not affect anonymous data segment or
-stack segment pages of a task.
-
-By default, both kinds of memory spreading are off, and memory
-pages are allocated on the node local to where the task is running,
-except perhaps as modified by the task's NUMA mempolicy or cpuset
-configuration, so long as sufficient free memory pages are available.
-
-When new cpusets are created, they inherit the memory spread settings
-of their parent.
-
-Setting memory spreading causes allocations for the affected page
-or slab caches to ignore the task's NUMA mempolicy and be spread
-instead.    Tasks using mbind() or set_mempolicy() calls to set NUMA
-mempolicies will not notice any change in these calls as a result of
-their containing task's memory spread settings.  If memory spreading
-is turned off, then the currently specified NUMA mempolicy once again
-applies to memory page allocations.
-
-Both 'cpuset.memory_spread_page' and 'cpuset.memory_spread_slab' are boolean flag
-files.  By default they contain "0", meaning that the feature is off
-for that cpuset.  If a "1" is written to that file, then that turns
-the named feature on.
-
-The implementation is simple.
-
-Setting the flag 'cpuset.memory_spread_page' turns on a per-process flag
-PFA_SPREAD_PAGE for each task that is in that cpuset or subsequently
-joins that cpuset.  The page allocation calls for the page cache
-are modified to perform an inline check for this PFA_SPREAD_PAGE task
-flag, and if set, a call to a new routine cpuset_mem_spread_node()
-returns the node to prefer for the allocation.
-
-Similarly, setting 'cpuset.memory_spread_slab' turns on the flag
-PFA_SPREAD_SLAB, and appropriately marked slab caches will allocate
-pages from the node returned by cpuset_mem_spread_node().
-
-The cpuset_mem_spread_node() routine is also simple.  It uses the
-value of a per-task rotor cpuset_mem_spread_rotor to select the next
-node in the current task's mems_allowed to prefer for the allocation.
-
-This memory placement policy is also known (in other contexts) as
-round-robin or interleave.
-
-This policy can provide substantial improvements for jobs that need
-to place thread local data on the corresponding node, but that need
-to access large file system data sets that need to be spread across
-the several nodes in the jobs cpuset in order to fit.  Without this
-policy, especially for jobs that might have one thread reading in the
-data set, the memory allocation across the nodes in the jobs cpuset
-can become very uneven.
-
-1.7 What is sched_load_balance ?
---------------------------------
-
-The kernel scheduler (kernel/sched/core.c) automatically load balances
-tasks.  If one CPU is underutilized, kernel code running on that
-CPU will look for tasks on other more overloaded CPUs and move those
-tasks to itself, within the constraints of such placement mechanisms
-as cpusets and sched_setaffinity.
-
-The algorithmic cost of load balancing and its impact on key shared
-kernel data structures such as the task list increases more than
-linearly with the number of CPUs being balanced.  So the scheduler
-has support to partition the systems CPUs into a number of sched
-domains such that it only load balances within each sched domain.
-Each sched domain covers some subset of the CPUs in the system;
-no two sched domains overlap; some CPUs might not be in any sched
-domain and hence won't be load balanced.
-
-Put simply, it costs less to balance between two smaller sched domains
-than one big one, but doing so means that overloads in one of the
-two domains won't be load balanced to the other one.
-
-By default, there is one sched domain covering all CPUs, including those
-marked isolated using the kernel boot time "isolcpus=" argument. However,
-the isolated CPUs will not participate in load balancing, and will not
-have tasks running on them unless explicitly assigned.
-
-This default load balancing across all CPUs is not well suited for
-the following two situations:
-
- 1) On large systems, load balancing across many CPUs is expensive.
-    If the system is managed using cpusets to place independent jobs
-    on separate sets of CPUs, full load balancing is unnecessary.
- 2) Systems supporting realtime on some CPUs need to minimize
-    system overhead on those CPUs, including avoiding task load
-    balancing if that is not needed.
-
-When the per-cpuset flag "cpuset.sched_load_balance" is enabled (the default
-setting), it requests that all the CPUs in that cpusets allowed 'cpuset.cpus'
-be contained in a single sched domain, ensuring that load balancing
-can move a task (not otherwise pinned, as by sched_setaffinity)
-from any CPU in that cpuset to any other.
-
-When the per-cpuset flag "cpuset.sched_load_balance" is disabled, then the
-scheduler will avoid load balancing across the CPUs in that cpuset,
---except-- in so far as is necessary because some overlapping cpuset
-has "sched_load_balance" enabled.
-
-So, for example, if the top cpuset has the flag "cpuset.sched_load_balance"
-enabled, then the scheduler will have one sched domain covering all
-CPUs, and the setting of the "cpuset.sched_load_balance" flag in any other
-cpusets won't matter, as we're already fully load balancing.
-
-Therefore in the above two situations, the top cpuset flag
-"cpuset.sched_load_balance" should be disabled, and only some of the smaller,
-child cpusets have this flag enabled.
-
-When doing this, you don't usually want to leave any unpinned tasks in
-the top cpuset that might use non-trivial amounts of CPU, as such tasks
-may be artificially constrained to some subset of CPUs, depending on
-the particulars of this flag setting in descendant cpusets.  Even if
-such a task could use spare CPU cycles in some other CPUs, the kernel
-scheduler might not consider the possibility of load balancing that
-task to that underused CPU.
-
-Of course, tasks pinned to a particular CPU can be left in a cpuset
-that disables "cpuset.sched_load_balance" as those tasks aren't going anywhere
-else anyway.
-
-There is an impedance mismatch here, between cpusets and sched domains.
-Cpusets are hierarchical and nest.  Sched domains are flat; they don't
-overlap and each CPU is in at most one sched domain.
-
-It is necessary for sched domains to be flat because load balancing
-across partially overlapping sets of CPUs would risk unstable dynamics
-that would be beyond our understanding.  So if each of two partially
-overlapping cpusets enables the flag 'cpuset.sched_load_balance', then we
-form a single sched domain that is a superset of both.  We won't move
-a task to a CPU outside its cpuset, but the scheduler load balancing
-code might waste some compute cycles considering that possibility.
-
-This mismatch is why there is not a simple one-to-one relation
-between which cpusets have the flag "cpuset.sched_load_balance" enabled,
-and the sched domain configuration.  If a cpuset enables the flag, it
-will get balancing across all its CPUs, but if it disables the flag,
-it will only be assured of no load balancing if no other overlapping
-cpuset enables the flag.
-
-If two cpusets have partially overlapping 'cpuset.cpus' allowed, and only
-one of them has this flag enabled, then the other may find its
-tasks only partially load balanced, just on the overlapping CPUs.
-This is just the general case of the top_cpuset example given a few
-paragraphs above.  In the general case, as in the top cpuset case,
-don't leave tasks that might use non-trivial amounts of CPU in
-such partially load balanced cpusets, as they may be artificially
-constrained to some subset of the CPUs allowed to them, for lack of
-load balancing to the other CPUs.
-
-CPUs in "cpuset.isolcpus" were excluded from load balancing by the
-isolcpus= kernel boot option, and will never be load balanced regardless
-of the value of "cpuset.sched_load_balance" in any cpuset.
-
-1.7.1 sched_load_balance implementation details.
-------------------------------------------------
-
-The per-cpuset flag 'cpuset.sched_load_balance' defaults to enabled (contrary
-to most cpuset flags.)  When enabled for a cpuset, the kernel will
-ensure that it can load balance across all the CPUs in that cpuset
-(makes sure that all the CPUs in the cpus_allowed of that cpuset are
-in the same sched domain.)
-
-If two overlapping cpusets both have 'cpuset.sched_load_balance' enabled,
-then they will be (must be) both in the same sched domain.
-
-If, as is the default, the top cpuset has 'cpuset.sched_load_balance' enabled,
-then by the above that means there is a single sched domain covering
-the whole system, regardless of any other cpuset settings.
-
-The kernel commits to user space that it will avoid load balancing
-where it can.  It will pick as fine a granularity partition of sched
-domains as it can while still providing load balancing for any set
-of CPUs allowed to a cpuset having 'cpuset.sched_load_balance' enabled.
-
-The internal kernel cpuset to scheduler interface passes from the
-cpuset code to the scheduler code a partition of the load balanced
-CPUs in the system. This partition is a set of subsets (represented
-as an array of struct cpumask) of CPUs, pairwise disjoint, that cover
-all the CPUs that must be load balanced.
-
-The cpuset code builds a new such partition and passes it to the
-scheduler sched domain setup code, to have the sched domains rebuilt
-as necessary, whenever:
-
- - the 'cpuset.sched_load_balance' flag of a cpuset with non-empty CPUs changes,
- - or CPUs come or go from a cpuset with this flag enabled,
- - or 'cpuset.sched_relax_domain_level' value of a cpuset with non-empty CPUs
-   and with this flag enabled changes,
- - or a cpuset with non-empty CPUs and with this flag enabled is removed,
- - or a cpu is offlined/onlined.
-
-This partition exactly defines what sched domains the scheduler should
-setup - one sched domain for each element (struct cpumask) in the
-partition.
-
-The scheduler remembers the currently active sched domain partitions.
-When the scheduler routine partition_sched_domains() is invoked from
-the cpuset code to update these sched domains, it compares the new
-partition requested with the current, and updates its sched domains,
-removing the old and adding the new, for each change.
-
-
-1.8 What is sched_relax_domain_level ?
---------------------------------------
-
-In sched domain, the scheduler migrates tasks in 2 ways; periodic load
-balance on tick, and at time of some schedule events.
-
-When a task is woken up, the scheduler tries to move the task to an idle CPU.
-For example, if a task A running on CPU X activates another task B
-on the same CPU X, and if CPU Y is X's sibling and is idle,
-then the scheduler migrates task B to CPU Y so that task B can start on
-CPU Y without waiting for task A on CPU X.
-
-And if a CPU runs out of tasks in its runqueue, the CPU tries to pull
-extra tasks from other busy CPUs to help them before it goes
-idle.
-
-Of course, since it takes some searching cost to find movable tasks and/or
-idle CPUs, the scheduler might not search all CPUs in the domain
-every time.  In fact, on some architectures, the searching ranges on
-events are limited to the same socket or node where the CPU is located,
-while the load balance on tick searches all.
-
-For example, assume CPU Z is relatively far from CPU X.  Even if CPU Z
-is idle while CPU X and the siblings are busy, the scheduler can't migrate
-woken task B from X to Z since it is out of its searching range.
-As a result, task B on CPU X needs to wait for task A or wait for load
-balance on the next tick.  For some applications in special situations,
-waiting 1 tick may be too long.
-
-The 'cpuset.sched_relax_domain_level' file allows you to request changing
-this searching range as you like.  This file takes an int value which
-indicates the size of the searching range in levels, ideally as follows;
-otherwise it holds the initial value -1, which indicates the cpuset has no request.
-
-====== ===========================================================
-  -1   no request. use system default or follow request of others.
-   0   no search.
-   1   search siblings (hyperthreads in a core).
-   2   search cores in a package.
-   3   search cpus in a node [= system wide on non-NUMA system]
-   4   search nodes in a chunk of node [on NUMA system]
-   5   search system wide [on NUMA system]
-====== ===========================================================
-
-The system default is architecture dependent.  The system default
-can be changed using the relax_domain_level= boot parameter.
-
-This file is per-cpuset and affects the sched domain to which the cpuset
-belongs.  Therefore if the flag 'cpuset.sched_load_balance' of a cpuset
-is disabled, then 'cpuset.sched_relax_domain_level' has no effect since
-there is no sched domain belonging to the cpuset.
-
-If multiple cpusets are overlapping and hence they form a single sched
-domain, the largest value among those is used.  Be careful, if one
-requests 0 and others are -1 then 0 is used.
-
-Note that modifying this file will have both good and bad effects,
-and whether it is acceptable or not depends on your situation.
-Don't modify this file if you are not sure.
-
-If your situation is:
-
- - The migration costs between each cpu can be assumed considerably
-   small (for you) due to your special application's behavior or
-   special hardware support for CPU cache etc.
- - The searching cost doesn't have impact (for you) or you can make
-   the searching cost small enough by managing cpuset to compact etc.
- - The latency is required even if it sacrifices cache hit rate etc.
-   then increasing 'sched_relax_domain_level' would benefit you.
-
-
-1.9 How do I use cpusets ?
---------------------------
-
-In order to minimize the impact of cpusets on critical kernel
-code, such as the scheduler, and due to the fact that the kernel
-does not support one task updating the memory placement of another
-task directly, the impact on a task of changing its cpuset CPU
-or Memory Node placement, or of changing to which cpuset a task
-is attached, is subtle.
-
-If a cpuset has its Memory Nodes modified, then for each task attached
-to that cpuset, the next time that the kernel attempts to allocate
-a page of memory for that task, the kernel will notice the change
-in the task's cpuset, and update its per-task memory placement to
-remain within the new cpusets memory placement.  If the task was using
-mempolicy MPOL_BIND, and the nodes to which it was bound overlap with
-its new cpuset, then the task will continue to use whatever subset
-of MPOL_BIND nodes are still allowed in the new cpuset.  If the task
-was using MPOL_BIND and now none of its MPOL_BIND nodes are allowed
-in the new cpuset, then the task will be essentially treated as if it
-was MPOL_BIND bound to the new cpuset (even though its NUMA placement,
-as queried by get_mempolicy(), doesn't change).  If a task is moved
-from one cpuset to another, then the kernel will adjust the task's
-memory placement, as above, the next time that the kernel attempts
-to allocate a page of memory for that task.
-
-If a cpuset has its 'cpuset.cpus' modified, then each task in that cpuset
-will have its allowed CPU placement changed immediately.  Similarly,
-if a task's pid is written to another cpuset's 'tasks' file, then its
-allowed CPU placement is changed immediately.  If such a task had been
-bound to some subset of its cpuset using the sched_setaffinity() call,
-the task will be allowed to run on any CPU allowed in its new cpuset,
-negating the effect of the prior sched_setaffinity() call.
-
-In summary, the memory placement of a task whose cpuset is changed is
-updated by the kernel, on the next allocation of a page for that task,
-and the processor placement is updated immediately.
-
-Normally, once a page is allocated (given a physical page
-of main memory) then that page stays on whatever node it
-was allocated, so long as it remains allocated, even if the
-cpusets memory placement policy 'cpuset.mems' subsequently changes.
-If the cpuset flag file 'cpuset.memory_migrate' is set true, then when
-tasks are attached to that cpuset, any pages that task had
-allocated to it on nodes in its previous cpuset are migrated
-to the task's new cpuset. The relative placement of the page within
-the cpuset is preserved during these migration operations if possible.
-For example if the page was on the second valid node of the prior cpuset
-then the page will be placed on the second valid node of the new cpuset.
-
-Also if 'cpuset.memory_migrate' is set true, then if that cpuset's
-'cpuset.mems' file is modified, pages allocated to tasks in that
-cpuset, that were on nodes in the previous setting of 'cpuset.mems',
-will be moved to nodes in the new setting of 'cpuset.mems'.
-Pages that were not in the task's prior cpuset, or in the cpuset's
-prior 'cpuset.mems' setting, will not be moved.
-
-There is an exception to the above.  If hotplug functionality is used
-to remove all the CPUs that are currently assigned to a cpuset,
-then all the tasks in that cpuset will be moved to the nearest ancestor
-with non-empty cpus.  But the moving of some (or all) tasks might fail if
-cpuset is bound with another cgroup subsystem which has some restrictions
-on task attaching.  In this failing case, those tasks will stay
-in the original cpuset, and the kernel will automatically update
-their cpus_allowed to allow all online CPUs.  When memory hotplug
-functionality for removing Memory Nodes is available, a similar exception
-is expected to apply there as well.  In general, the kernel prefers to
-violate cpuset placement, over starving a task that has had all
-its allowed CPUs or Memory Nodes taken offline.
-
-There is a second exception to the above.  GFP_ATOMIC requests are
-kernel internal allocations that must be satisfied, immediately.
-The kernel may drop some request, in rare cases even panic, if a
-GFP_ATOMIC alloc fails.  If the request cannot be satisfied within
-the current task's cpuset, then we relax the cpuset, and look for
-memory anywhere we can find it.  It's better to violate the cpuset
-than stress the kernel.
-
-To start a new job that is to be contained within a cpuset, the steps are:
-
- 1) mkdir /sys/fs/cgroup/cpuset
- 2) mount -t cgroup -ocpuset cpuset /sys/fs/cgroup/cpuset
- 3) Create the new cpuset by doing mkdir's and write's (or echo's) in
-    the /sys/fs/cgroup/cpuset virtual file system.
- 4) Start a task that will be the "founding father" of the new job.
- 5) Attach that task to the new cpuset by writing its pid to the
-    /sys/fs/cgroup/cpuset tasks file for that cpuset.
- 6) fork, exec or clone the job tasks from this founding father task.
-
-For example, the following sequence of commands will setup a cpuset
-named "Charlie", containing just CPUs 2 and 3, and Memory Node 1,
-and then start a subshell 'sh' in that cpuset::
-
-  mount -t cgroup -ocpuset cpuset /sys/fs/cgroup/cpuset
-  cd /sys/fs/cgroup/cpuset
-  mkdir Charlie
-  cd Charlie
-  /bin/echo 2-3 > cpuset.cpus
-  /bin/echo 1 > cpuset.mems
-  /bin/echo $$ > tasks
-  sh
-  # The subshell 'sh' is now running in cpuset Charlie
-  # The next line should display '/Charlie'
-  cat /proc/self/cpuset
-
-There are ways to query or modify cpusets:
-
- - via the cpuset file system directly, using the various cd, mkdir, echo,
-   cat, rmdir commands from the shell, or their equivalent from C.
- - via the C library libcpuset.
- - via the C library libcgroup.
-   (http://sourceforge.net/projects/libcg/)
- - via the python application cset.
-   (http://code.google.com/p/cpuset/)
-
-The sched_setaffinity calls can also be done at the shell prompt using
-SGI's runon or Robert Love's taskset.  The mbind and set_mempolicy
-calls can be done at the shell prompt using the numactl command
-(part of Andi Kleen's numa package).
-
-2. Usage Examples and Syntax
-============================
-
-2.1 Basic Usage
----------------
-
-Creating, modifying, using the cpusets can be done through the cpuset
-virtual filesystem.
-
-To mount it, type:
-# mount -t cgroup -o cpuset cpuset /sys/fs/cgroup/cpuset
-
-Then under /sys/fs/cgroup/cpuset you can find a tree that corresponds to the
-tree of the cpusets in the system. For instance, /sys/fs/cgroup/cpuset
-is the cpuset that holds the whole system.
-
-If you want to create a new cpuset under /sys/fs/cgroup/cpuset::
-
-  # cd /sys/fs/cgroup/cpuset
-  # mkdir my_cpuset
-
-Now you want to do something with this cpuset::
-
-  # cd my_cpuset
-
-In this directory you can find several files::
-
-  # ls
-  cgroup.clone_children  cpuset.memory_pressure
-  cgroup.event_control   cpuset.memory_spread_page
-  cgroup.procs           cpuset.memory_spread_slab
-  cpuset.cpu_exclusive   cpuset.mems
-  cpuset.cpus            cpuset.sched_load_balance
-  cpuset.mem_exclusive   cpuset.sched_relax_domain_level
-  cpuset.mem_hardwall    notify_on_release
-  cpuset.memory_migrate  tasks
-
-Reading them will give you information about the state of this cpuset:
-the CPUs and Memory Nodes it can use, the processes that are using
-it, its properties.  By writing to these files you can manipulate
-the cpuset.
-
-Set some flags::
-
-  # /bin/echo 1 > cpuset.cpu_exclusive
-
-Add some cpus::
-
-  # /bin/echo 0-7 > cpuset.cpus
-
-Add some mems::
-
-  # /bin/echo 0-7 > cpuset.mems
-
-Now attach your shell to this cpuset::
-
-  # /bin/echo $$ > tasks
-
-You can also create cpusets inside your cpuset by using mkdir in this
-directory::
-
-  # mkdir my_sub_cs
-
-To remove a cpuset, just use rmdir::
-
-  # rmdir my_sub_cs
-
-This will fail if the cpuset is in use (has cpusets inside, or has
-processes attached).
-
-Note that for legacy reasons, the "cpuset" filesystem exists as a
-wrapper around the cgroup filesystem.
-
-The command::
-
-  mount -t cpuset X /sys/fs/cgroup/cpuset
-
-is equivalent to::
-
-  mount -t cgroup -ocpuset,noprefix X /sys/fs/cgroup/cpuset
-  echo "/sbin/cpuset_release_agent" > /sys/fs/cgroup/cpuset/release_agent
-
-2.2 Adding/removing cpus
-------------------------
-
-This is the syntax to use when writing in the cpus or mems files
-in cpuset directories::
-
-  # /bin/echo 1-4 > cpuset.cpus		-> set cpus list to cpus 1,2,3,4
-  # /bin/echo 1,2,3,4 > cpuset.cpus	-> set cpus list to cpus 1,2,3,4
-
-To add a CPU to a cpuset, write the new list of CPUs including the
-CPU to be added. To add 6 to the above cpuset::
-
-  # /bin/echo 1-4,6 > cpuset.cpus	-> set cpus list to cpus 1,2,3,4,6
-
-Similarly to remove a CPU from a cpuset, write the new list of CPUs
-without the CPU to be removed.
-
-To remove all the CPUs::
-
-  # /bin/echo "" > cpuset.cpus		-> clear cpus list
-
-2.3 Setting flags
------------------
-
-The syntax is very simple::
-
-  # /bin/echo 1 > cpuset.cpu_exclusive 	-> set flag 'cpuset.cpu_exclusive'
-  # /bin/echo 0 > cpuset.cpu_exclusive 	-> unset flag 'cpuset.cpu_exclusive'
-
-2.4 Attaching processes
------------------------
-
-::
-
-  # /bin/echo PID > tasks
-
-Note that it is PID, not PIDs. You can only attach ONE task at a time.
-If you have several tasks to attach, you have to do it one after another::
-
-  # /bin/echo PID1 > tasks
-  # /bin/echo PID2 > tasks
-	...
-  # /bin/echo PIDn > tasks
-
-
-3. Questions
-============
-
-Q:
-   what's up with this '/bin/echo' ?
-
-A:
-   bash's builtin 'echo' command does not check calls to write() against
-   errors. If you use it in the cpuset file system, you won't be
-   able to tell whether a command succeeded or failed.
-
-Q:
-   When I attach processes, only the first of the line gets really attached !
-
-A:
-   We can only return one error code per call to write(). So you should also
-   put only ONE pid.
-
-4. Contact
-==========
-
-Web: http://www.bullopensource.org/cpuset
diff --git a/Documentation/cgroup-v1/devices.rst b/Documentation/cgroup-v1/devices.rst
deleted file mode 100644
index e1886783961e..000000000000
--- a/Documentation/cgroup-v1/devices.rst
+++ /dev/null
@@ -1,132 +0,0 @@
-===========================
-Device Whitelist Controller
-===========================
-
-1. Description
-==============
-
-Implement a cgroup to track and enforce open and mknod restrictions
-on device files.  A device cgroup associates a device access
-whitelist with each cgroup.  A whitelist entry has 4 fields.
-'type' is a (all), c (char), or b (block).  'all' means it applies
-to all types and all major and minor numbers.  Major and minor are
-either an integer or * for all.  Access is a composition of r
-(read), w (write), and m (mknod).
-
-The root device cgroup starts with rwm to 'all'.  A child device
-cgroup gets a copy of the parent.  Administrators can then remove
-devices from the whitelist or add new entries.  A child cgroup can
-never receive a device access which is denied by its parent.
-
-2. User Interface
-=================
-
-An entry is added using devices.allow, and removed using
-devices.deny.  For instance::
-
-	echo 'c 1:3 mr' > /sys/fs/cgroup/1/devices.allow
-
-allows cgroup 1 to read and mknod the device usually known as
-/dev/null.  Doing::
-
-	echo a > /sys/fs/cgroup/1/devices.deny
-
-will remove the default 'a *:* rwm' entry. Doing::
-
-	echo a > /sys/fs/cgroup/1/devices.allow
-
-will add the 'a *:* rwm' entry to the whitelist.
-
-3. Security
-===========
-
-Any task can move itself between cgroups.  This clearly won't
-suffice, but we can decide the best way to adequately restrict
-movement as people get some experience with this.  We may just want
-to require CAP_SYS_ADMIN, which at least is a separate bit from
-CAP_MKNOD.  We may want to just refuse moving to a cgroup which
-isn't a descendant of the current one.  Or we may want to use
-CAP_MAC_ADMIN, since we really are trying to lock down root.
-
-CAP_SYS_ADMIN is needed to modify the whitelist or move another
-task to a new cgroup.  (Again we'll probably want to change that).
-
-A cgroup may not be granted more permissions than the cgroup's
-parent has.
-
-4. Hierarchy
-============
-
-device cgroups maintain hierarchy by making sure a cgroup never has more
-access permissions than its parent.  Every time an entry is written to
-a cgroup's devices.deny file, all its children will have that entry removed
-from their whitelist and all the locally set whitelist entries will be
-re-evaluated.  In case one of the locally set whitelist entries would provide
-more access than the cgroup's parent, it'll be removed from the whitelist.
-
-Example::
-
-      A
-     / \
-        B
-
-    group        behavior	exceptions
-    A            allow		"b 8:* rwm", "c 116:1 rw"
-    B            deny		"c 1:3 rwm", "c 116:2 rwm", "b 3:* rwm"
-
-If a device is denied in group A::
-
-	# echo "c 116:* r" > A/devices.deny
-
-it'll propagate down and after revalidating B's entries, the whitelist entry
-"c 116:2 rwm" will be removed::
-
-    group        whitelist entries                        denied devices
-    A            all                                      "b 8:* rwm", "c 116:* rw"
-    B            "c 1:3 rwm", "b 3:* rwm"                 all the rest
-
-In case parent's exceptions change and local exceptions are not allowed
-anymore, they'll be deleted.
-
-Notice that new whitelist entries will not be propagated::
-
-      A
-     / \
-        B
-
-    group        whitelist entries                        denied devices
-    A            "c 1:3 rwm", "c 1:5 r"                   all the rest
-    B            "c 1:3 rwm", "c 1:5 r"                   all the rest
-
-when adding ``c *:3 rwm``::
-
-	# echo "c *:3 rwm" >A/devices.allow
-
-the result::
-
-    group        whitelist entries                        denied devices
-    A            "c *:3 rwm", "c 1:5 r"                   all the rest
-    B            "c 1:3 rwm", "c 1:5 r"                   all the rest
-
-but now it'll be possible to add new entries to B::
-
-	# echo "c 2:3 rwm" >B/devices.allow
-	# echo "c 50:3 r" >B/devices.allow
-
-or even::
-
-	# echo "c *:3 rwm" >B/devices.allow
-
-Allowing or denying all by writing 'a' to devices.allow or devices.deny will
-not be possible once the device cgroups has children.
-
-4.1 Hierarchy (internal implementation)
----------------------------------------
-
-device cgroups is implemented internally using a behavior (ALLOW, DENY) and a
-list of exceptions.  The internal state is controlled using the same user
-interface to preserve compatibility with the previous whitelist-only
-implementation.  Removal or addition of exceptions that will reduce the access
-to devices will be propagated down the hierarchy.
-For every propagated exception, the effective rules will be re-evaluated based
-on current parent's access rules.
diff --git a/Documentation/cgroup-v1/freezer-subsystem.rst b/Documentation/cgroup-v1/freezer-subsystem.rst
deleted file mode 100644
index 582d3427de3f..000000000000
--- a/Documentation/cgroup-v1/freezer-subsystem.rst
+++ /dev/null
@@ -1,127 +0,0 @@
-==============
-Cgroup Freezer
-==============
-
-The cgroup freezer is useful to batch job management systems which start
-and stop sets of tasks in order to schedule the resources of a machine
-according to the desires of a system administrator. This sort of program
-is often used on HPC clusters to schedule access to the cluster as a
-whole. The cgroup freezer uses cgroups to describe the set of tasks to
-be started/stopped by the batch job management system. It also provides
-a means to start and stop the tasks composing the job.
-
-The cgroup freezer will also be useful for checkpointing running groups
-of tasks. The freezer allows the checkpoint code to obtain a consistent
-image of the tasks by attempting to force the tasks in a cgroup into a
-quiescent state. Once the tasks are quiescent another task can
-walk /proc or invoke a kernel interface to gather information about the
-quiesced tasks. Checkpointed tasks can be restarted later should a
-recoverable error occur. This also allows the checkpointed tasks to be
-migrated between nodes in a cluster by copying the gathered information
-to another node and restarting the tasks there.
-
-Sequences of SIGSTOP and SIGCONT are not always sufficient for stopping
-and resuming tasks in userspace. Both of these signals are observable
-from within the tasks we wish to freeze. While SIGSTOP cannot be caught,
-blocked, or ignored it can be seen by waiting or ptracing parent tasks.
-SIGCONT is especially unsuitable since it can be caught by the task. Any
-programs designed to watch for SIGSTOP and SIGCONT could be broken by
-attempting to use SIGSTOP and SIGCONT to stop and resume tasks. We can
-demonstrate this problem using nested bash shells::
-
-	$ echo $$
-	16644
-	$ bash
-	$ echo $$
-	16690
-
-	From a second, unrelated bash shell:
-	$ kill -SIGSTOP 16690
-	$ kill -SIGCONT 16690
-
-	<at this point 16690 exits and causes 16644 to exit too>
-
-This happens because bash can observe both signals and choose how it
-responds to them.
-
-Another example of a program which catches and responds to these
-signals is gdb. In fact any program designed to use ptrace is likely to
-have a problem with this method of stopping and resuming tasks.
-
-In contrast, the cgroup freezer uses the kernel freezer code to
-prevent the freeze/unfreeze cycle from becoming visible to the tasks
-being frozen. This allows the bash example above and gdb to run as
-expected.
-
-The cgroup freezer is hierarchical. Freezing a cgroup freezes all
-tasks belonging to the cgroup and all its descendant cgroups. Each
-cgroup has its own state (self-state) and the state inherited from the
-parent (parent-state). Iff both states are THAWED, the cgroup is
-THAWED.
-
-The following cgroupfs files are created by cgroup freezer.
-
-* freezer.state: Read-write.
-
-  When read, returns the effective state of the cgroup - "THAWED",
-  "FREEZING" or "FROZEN". This is the combined self and parent-states.
-  If any is freezing, the cgroup is freezing (FREEZING or FROZEN).
-
-  FREEZING cgroup transitions into FROZEN state when all tasks
-  belonging to the cgroup and its descendants become frozen. Note that
-  a cgroup reverts to FREEZING from FROZEN after a new task is added
-  to the cgroup or one of its descendant cgroups until the new task is
-  frozen.
-
-  When written, sets the self-state of the cgroup. Two values are
-  allowed - "FROZEN" and "THAWED". If FROZEN is written, the cgroup,
-  if not already freezing, enters FREEZING state along with all its
-  descendant cgroups.
-
-  If THAWED is written, the self-state of the cgroup is changed to
-  THAWED.  Note that the effective state may not change to THAWED if
-  the parent-state is still freezing. If a cgroup's effective state
-  becomes THAWED, all its descendants which are freezing because of
-  the cgroup also leave the freezing state.
-
-* freezer.self_freezing: Read only.
-
-  Shows the self-state. 0 if the self-state is THAWED; otherwise, 1.
-  This value is 1 iff the last write to freezer.state was "FROZEN".
-
-* freezer.parent_freezing: Read only.
-
-  Shows the parent-state.  0 if none of the cgroup's ancestors is
-  frozen; otherwise, 1.
-
-The root cgroup is non-freezable and the above interface files don't
-exist.
-
-* Examples of usage::
-
-   # mkdir /sys/fs/cgroup/freezer
-   # mount -t cgroup -ofreezer freezer /sys/fs/cgroup/freezer
-   # mkdir /sys/fs/cgroup/freezer/0
-   # echo $some_pid > /sys/fs/cgroup/freezer/0/tasks
-
-to get status of the freezer subsystem::
-
-   # cat /sys/fs/cgroup/freezer/0/freezer.state
-   THAWED
-
-to freeze all tasks in the container::
-
-   # echo FROZEN > /sys/fs/cgroup/freezer/0/freezer.state
-   # cat /sys/fs/cgroup/freezer/0/freezer.state
-   FREEZING
-   # cat /sys/fs/cgroup/freezer/0/freezer.state
-   FROZEN
-
-to unfreeze all tasks in the container::
-
-   # echo THAWED > /sys/fs/cgroup/freezer/0/freezer.state
-   # cat /sys/fs/cgroup/freezer/0/freezer.state
-   THAWED
-
-This is the basic mechanism which should do the right thing for user space task
-in a simple scenario.
diff --git a/Documentation/cgroup-v1/hugetlb.rst b/Documentation/cgroup-v1/hugetlb.rst
deleted file mode 100644
index a3902aa253a9..000000000000
--- a/Documentation/cgroup-v1/hugetlb.rst
+++ /dev/null
@@ -1,50 +0,0 @@
-==================
-HugeTLB Controller
-==================
-
-The HugeTLB controller allows users to limit the HugeTLB usage per control group and
-enforces the controller limit during page fault. Since HugeTLB doesn't
-support page reclaim, enforcing the limit at page fault time implies that,
-the application will get SIGBUS signal if it tries to access HugeTLB pages
-beyond its limit. This requires the application to know beforehand how many
-HugeTLB pages it would require for its use.
-
-HugeTLB controller can be created by first mounting the cgroup filesystem.
-
-# mount -t cgroup -o hugetlb none /sys/fs/cgroup
-
-With the above step, the initial or the parent HugeTLB group becomes
-visible at /sys/fs/cgroup. At bootup, this group includes all the tasks in
-the system. /sys/fs/cgroup/tasks lists the tasks in this cgroup.
-
-New groups can be created under the parent group /sys/fs/cgroup::
-
-  # cd /sys/fs/cgroup
-  # mkdir g1
-  # echo $$ > g1/tasks
-
-The above steps create a new group g1 and move the current shell
-process (bash) into it.
-
-Brief summary of control files::
-
- hugetlb.<hugepagesize>.limit_in_bytes     # set/show limit of "hugepagesize" hugetlb usage
- hugetlb.<hugepagesize>.max_usage_in_bytes # show max "hugepagesize" hugetlb  usage recorded
- hugetlb.<hugepagesize>.usage_in_bytes     # show current usage for "hugepagesize" hugetlb
- hugetlb.<hugepagesize>.failcnt		   # show the number of allocation failures due to HugeTLB limit
-
-For a system supporting three hugepage sizes (64k, 32M and 1G), the control
-files include::
-
-  hugetlb.1GB.limit_in_bytes
-  hugetlb.1GB.max_usage_in_bytes
-  hugetlb.1GB.usage_in_bytes
-  hugetlb.1GB.failcnt
-  hugetlb.64KB.limit_in_bytes
-  hugetlb.64KB.max_usage_in_bytes
-  hugetlb.64KB.usage_in_bytes
-  hugetlb.64KB.failcnt
-  hugetlb.32MB.limit_in_bytes
-  hugetlb.32MB.max_usage_in_bytes
-  hugetlb.32MB.usage_in_bytes
-  hugetlb.32MB.failcnt
diff --git a/Documentation/cgroup-v1/index.rst b/Documentation/cgroup-v1/index.rst
deleted file mode 100644
index fe76d42edc11..000000000000
--- a/Documentation/cgroup-v1/index.rst
+++ /dev/null
@@ -1,30 +0,0 @@
-:orphan:
-
-========================
-Control Groups version 1
-========================
-
-.. toctree::
-    :maxdepth: 1
-
-    cgroups
-
-    blkio-controller
-    cpuacct
-    cpusets
-    devices
-    freezer-subsystem
-    hugetlb
-    memcg_test
-    memory
-    net_cls
-    net_prio
-    pids
-    rdma
-
-.. only::  subproject and html
-
-   Indices
-   =======
-
-   * :ref:`genindex`
diff --git a/Documentation/cgroup-v1/memcg_test.rst b/Documentation/cgroup-v1/memcg_test.rst
deleted file mode 100644
index 91bd18c6a514..000000000000
--- a/Documentation/cgroup-v1/memcg_test.rst
+++ /dev/null
@@ -1,355 +0,0 @@
-=====================================================
-Memory Resource Controller(Memcg) Implementation Memo
-=====================================================
-
-Last Updated: 2010/2
-
-Base Kernel Version: based on 2.6.33-rc7-mm(candidate for 34).
-
-Because VM is getting complex (one of reasons is memcg...), memcg's behavior
-is complex. This is a document for memcg's internal behavior.
-Please note that implementation details can be changed.
-
-(*) Topics on API should be in Documentation/cgroup-v1/memory.rst
-
-0. How to record usage ?
-========================
-
-   2 objects are used.
-
-   page_cgroup ....an object per page.
-
-	Allocated at boot or memory hotplug. Freed at memory hot removal.
-
-   swap_cgroup ... an entry per swp_entry.
-
-	Allocated at swapon(). Freed at swapoff().
-
-   The page_cgroup has USED bit and double count against a page_cgroup never
-   occurs. swap_cgroup is used only when a charged page is swapped-out.
-
-1. Charge
-=========
-
-   a page/swp_entry may be charged (usage += PAGE_SIZE) at
-
-	mem_cgroup_try_charge()
-
-2. Uncharge
-===========
-
-  a page/swp_entry may be uncharged (usage -= PAGE_SIZE) by
-
-	mem_cgroup_uncharge()
-	  Called when a page's refcount goes down to 0.
-
-	mem_cgroup_uncharge_swap()
-	  Called when swp_entry's refcnt goes down to 0. A charge against swap
-	  disappears.
-
-3. charge-commit-cancel
-=======================
-
-	Memcg pages are charged in two steps:
-
-		- mem_cgroup_try_charge()
-		- mem_cgroup_commit_charge() or mem_cgroup_cancel_charge()
-
-	At try_charge(), there are no flags to say "this page is charged".
-	at this point, usage += PAGE_SIZE.
-
-	At commit(), the page is associated with the memcg.
-
-	At cancel(), simply usage -= PAGE_SIZE.
-
-Under below explanation, we assume CONFIG_MEM_RES_CTRL_SWAP=y.
-
-4. Anonymous
-============
-
-	Anonymous page is newly allocated at
-		  - page fault into MAP_ANONYMOUS mapping.
-		  - Copy-On-Write.
-
-	4.1 Swap-in.
-	At swap-in, the page is taken from swap-cache. There are 2 cases.
-
-	(a) If the SwapCache is newly allocated and read, it has no charges.
-	(b) If the SwapCache has been mapped by processes, it has been
-	    charged already.
-
-	4.2 Swap-out.
-	At swap-out, typical state transition is below.
-
-	(a) add to swap cache. (marked as SwapCache)
-	    swp_entry's refcnt += 1.
-	(b) fully unmapped.
-	    swp_entry's refcnt += # of ptes.
-	(c) write back to swap.
-	(d) delete from swap cache. (remove from SwapCache)
-	    swp_entry's refcnt -= 1.
-
-
-	Finally, at task exit,
-	(e) zap_pte() is called and swp_entry's refcnt -=1 -> 0.
-
-5. Page Cache
-=============
-
-	Page Cache is charged at
-	- add_to_page_cache_locked().
-
-	The logic is very clear. (About migration, see below)
-
-	Note:
-	  __remove_from_page_cache() is called by remove_from_page_cache()
-	  and __remove_mapping().
-
-6. Shmem(tmpfs) Page Cache
-===========================
-
-	The best way to understand shmem's page state transition is to read
-	mm/shmem.c.
-
-	But brief explanation of the behavior of memcg around shmem will be
-	helpful to understand the logic.
-
-	Shmem's page (just leaf page, not direct/indirect block) can be on
-
-		- radix-tree of shmem's inode.
-		- SwapCache.
-		- Both on radix-tree and SwapCache. This happens at swap-in
-		  and swap-out,
-
-	It's charged when...
-
-	- A new page is added to shmem's radix-tree.
-	- A swp page is read. (move a charge from swap_cgroup to page_cgroup)
-
-7. Page Migration
-=================
-
-	mem_cgroup_migrate()
-
-8. LRU
-======
-        Each memcg has its own private LRU. Now, its handling is under global
-	VM's control (means that it's handled under global pgdat->lru_lock).
-	Almost all routines around memcg's LRU are called by global LRU's
-	list management functions under pgdat->lru_lock.
-
-	A special function is mem_cgroup_isolate_pages(). This scans
-	memcg's private LRU and calls __isolate_lru_page() to extract a page
-	from LRU.
-
-	(By __isolate_lru_page(), the page is removed from both of global and
-	private LRU.)
-
-
-9. Typical Tests.
-=================
-
- Tests for racy cases.
-
-9.1 Small limit to memcg.
--------------------------
-
-	When testing racy cases, it's a good idea to set memcg's limit
-	to be very small rather than GB. Many races were found in tests under
-	xKB or xxMB limits.
-
-	(Memory behavior under GB and Memory behavior under MB shows very
-	different situation.)
-
-9.2 Shmem
----------
-
-	Historically, memcg's shmem handling was poor and we saw some amount
-	of troubles here. This is because shmem is page-cache but can be
-	SwapCache. Test with shmem/tmpfs is always good test.
-
-9.3 Migration
--------------
-
-	For NUMA, migration is another special case. For easy testing, cpuset
-	is useful. Following is a sample script to do migration::
-
-		mount -t cgroup -o cpuset none /opt/cpuset
-
-		mkdir /opt/cpuset/01
-		echo 1 > /opt/cpuset/01/cpuset.cpus
-		echo 0 > /opt/cpuset/01/cpuset.mems
-		echo 1 > /opt/cpuset/01/cpuset.memory_migrate
-		mkdir /opt/cpuset/02
-		echo 1 > /opt/cpuset/02/cpuset.cpus
-		echo 1 > /opt/cpuset/02/cpuset.mems
-		echo 1 > /opt/cpuset/02/cpuset.memory_migrate
-
-	In the above setup, when you move a task from 01 to 02, page migration
-	from node 0 to node 1 will occur. The following is a script to migrate
-	all tasks under a cpuset::
-
-		--
-		move_task()
-		{
-		for pid in $1
-		do
-			/bin/echo $pid >$2/tasks 2>/dev/null
-			echo -n $pid
-			echo -n " "
-		done
-		echo END
-		}
-
-		G1_TASK=`cat ${G1}/tasks`
-		G2_TASK=`cat ${G2}/tasks`
-		move_task "${G1_TASK}" ${G2} &
-		--
-
-9.4 Memory hotplug
-------------------
-
-	Memory hotplug is one of the good tests.
-
-	to offline memory, do following::
-
-		# echo offline > /sys/devices/system/memory/memoryXXX/state
-
-	(XXX is the place of memory)
-
-	This is an easy way to test page migration, too.
-
-9.5 mkdir/rmdir
----------------
-
-	When using hierarchy, mkdir/rmdir test should be done.
-	Use tests like the following::
-
-		echo 1 >/opt/cgroup/01/memory/use_hierarchy
-		mkdir /opt/cgroup/01/child_a
-		mkdir /opt/cgroup/01/child_b
-
-		set limit to 01.
-		add limit to 01/child_b
-		run jobs under child_a and child_b
-
-	create/delete following groups at random while jobs are running::
-
-		/opt/cgroup/01/child_a/child_aa
-		/opt/cgroup/01/child_b/child_bb
-		/opt/cgroup/01/child_c
-
-	running new jobs in new group is also good.
-
-9.6 Mount with other subsystems
--------------------------------
-
-	Mounting with other subsystems is a good test because there is a
-	race and lock dependency with other cgroup subsystems.
-
-	example::
-
-		# mount -t cgroup none /cgroup -o cpuset,memory,cpu,devices
-
-	and do task move, mkdir, rmdir etc...under this.
-
-9.7 swapoff
------------
-
-	Besides the management of swap being one of the complicated parts of
-	memcg, the call path of swap-in at swapoff is not the same as the usual
-	swap-in path. It's worth testing explicitly.
-
-	For example, test like following is good:
-
-	(Shell-A)::
-
-		# mount -t cgroup none /cgroup -o memory
-		# mkdir /cgroup/test
-		# echo 40M > /cgroup/test/memory.limit_in_bytes
-		# echo 0 > /cgroup/test/tasks
-
-	Run malloc(100M) program under this. You'll see 60M of swaps.
-
-	(Shell-B)::
-
-		# move all tasks in /cgroup/test to /cgroup
-		# /sbin/swapoff -a
-		# rmdir /cgroup/test
-		# kill malloc task.
-
-	Of course, tmpfs v.s. swapoff test should be tested, too.
-
-9.8 OOM-Killer
---------------
-
-	Out-of-memory caused by memcg's limit will kill tasks under
-	the memcg. When hierarchy is used, a task under hierarchy
-	will be killed by the kernel.
-
-	In this case, panic_on_oom shouldn't be invoked and tasks
-	in other groups shouldn't be killed.
-
-	It's not difficult to cause OOM under memcg as following.
-
-	Case A) when you can swapoff::
-
-		#swapoff -a
-		#echo 50M > /memory.limit_in_bytes
-
-	run 51M of malloc
-
-	Case B) when you use mem+swap limitation::
-
-		#echo 50M > memory.limit_in_bytes
-		#echo 50M > memory.memsw.limit_in_bytes
-
-	run 51M of malloc
-
-9.9 Move charges at task migration
-----------------------------------
-
-	Charges associated with a task can be moved along with task migration.
-
-	(Shell-A)::
-
-		#mkdir /cgroup/A
-		#echo $$ >/cgroup/A/tasks
-
-	run some programs which uses some amount of memory in /cgroup/A.
-
-	(Shell-B)::
-
-		#mkdir /cgroup/B
-		#echo 1 >/cgroup/B/memory.move_charge_at_immigrate
-		#echo "pid of the program running in group A" >/cgroup/B/tasks
-
-	You can see charges have been moved by reading ``*.usage_in_bytes`` or
-	memory.stat of both A and B.
-
-	See 8.2 of Documentation/cgroup-v1/memory.rst to see what value should
-	be written to move_charge_at_immigrate.
-
-9.10 Memory thresholds
-----------------------
-
-	Memory controller implements memory thresholds using cgroups notification
-	API. You can use tools/cgroup/cgroup_event_listener.c to test it.
-
-	(Shell-A) Create cgroup and run event listener::
-
-		# mkdir /cgroup/A
-		# ./cgroup_event_listener /cgroup/A/memory.usage_in_bytes 5M
-
-	(Shell-B) Add task to cgroup and try to allocate and free memory::
-
-		# echo $$ >/cgroup/A/tasks
-		# a="$(dd if=/dev/zero bs=1M count=10)"
-		# a=
-
-	You will see message from cgroup_event_listener every time you cross
-	the thresholds.
-
-	Use /cgroup/A/memory.memsw.usage_in_bytes to test memsw thresholds.
-
-	It's good idea to test root cgroup as well.
diff --git a/Documentation/cgroup-v1/memory.rst b/Documentation/cgroup-v1/memory.rst
deleted file mode 100644
index 41bdc038dad9..000000000000
--- a/Documentation/cgroup-v1/memory.rst
+++ /dev/null
@@ -1,1003 +0,0 @@
-==========================
-Memory Resource Controller
-==========================
-
-NOTE:
-      This document is hopelessly outdated and needs a complete
-      rewrite. It still contains useful information, so we are keeping it
-      here, but make sure to check the current code if you need a deeper
-      understanding.
-
-NOTE:
-      The Memory Resource Controller has generically been referred to as the
-      memory controller in this document. Do not confuse memory controller
-      used here with the memory controller that is used in hardware.
-
-(For editors) In this document:
-      When we mention a cgroup (cgroupfs's directory) with memory controller,
-      we call it "memory cgroup". When you see git-log and source code, you'll
-      see patch's title and function names tend to use "memcg".
-      In this document, we avoid using it.
-
-Benefits and Purpose of the memory controller
-=============================================
-
-The memory controller isolates the memory behaviour of a group of tasks
-from the rest of the system. The article on LWN [12] mentions some probable
-uses of the memory controller. The memory controller can be used to
-
-a. Isolate an application or a group of applications
-   Memory-hungry applications can be isolated and limited to a smaller
-   amount of memory.
-b. Create a cgroup with a limited amount of memory; this can be used
-   as a good alternative to booting with mem=XXXX.
-c. Virtualization solutions can control the amount of memory they want
-   to assign to a virtual machine instance.
-d. A CD/DVD burner could control the amount of memory used by the
-   rest of the system to ensure that burning does not fail due to lack
-   of available memory.
-e. There are several other use cases; find one or use the controller just
-   for fun (to learn and hack on the VM subsystem).
-
-Current Status: linux-2.6.34-mmotm(development version of 2010/April)
-
-Features:
-
- - accounting anonymous pages, file caches, swap caches usage and limiting them.
- - pages are linked to per-memcg LRU exclusively, and there is no global LRU.
- - optionally, memory+swap usage can be accounted and limited.
- - hierarchical accounting
- - soft limit
- - moving (recharging) account at moving a task is selectable.
- - usage threshold notifier
- - memory pressure notifier
- - oom-killer disable knob and oom-notifier
- - Root cgroup has no limit controls.
-
- Kernel memory support is a work in progress, and the current version provides
- basic functionality. (See Section 2.7)
-
-Brief summary of control files.
-
-==================================== ==========================================
- tasks				     attach a task(thread) and show list of
-				     threads
- cgroup.procs			     show list of processes
- cgroup.event_control		     an interface for event_fd()
- memory.usage_in_bytes		     show current usage for memory
-				     (See 5.5 for details)
- memory.memsw.usage_in_bytes	     show current usage for memory+Swap
-				     (See 5.5 for details)
- memory.limit_in_bytes		     set/show limit of memory usage
- memory.memsw.limit_in_bytes	     set/show limit of memory+Swap usage
- memory.failcnt			     show the number of memory usage hits limits
- memory.memsw.failcnt		     show the number of memory+Swap hits limits
- memory.max_usage_in_bytes	     show max memory usage recorded
- memory.memsw.max_usage_in_bytes     show max memory+Swap usage recorded
- memory.soft_limit_in_bytes	     set/show soft limit of memory usage
- memory.stat			     show various statistics
- memory.use_hierarchy		     set/show hierarchical account enabled
- memory.force_empty		     trigger forced page reclaim
- memory.pressure_level		     set memory pressure notifications
- memory.swappiness		     set/show swappiness parameter of vmscan
-				     (See sysctl's vm.swappiness)
- memory.move_charge_at_immigrate     set/show controls of moving charges
- memory.oom_control		     set/show oom controls.
- memory.numa_stat		     show the number of memory usage per numa
-				     node
-
- memory.kmem.limit_in_bytes          set/show hard limit for kernel memory
- memory.kmem.usage_in_bytes          show current kernel memory allocation
- memory.kmem.failcnt                 show the number of kernel memory usage
-				     hits limits
- memory.kmem.max_usage_in_bytes      show max kernel memory usage recorded
-
- memory.kmem.tcp.limit_in_bytes      set/show hard limit for tcp buf memory
- memory.kmem.tcp.usage_in_bytes      show current tcp buf memory allocation
- memory.kmem.tcp.failcnt             show the number of tcp buf memory usage
-				     hits limits
- memory.kmem.tcp.max_usage_in_bytes  show max tcp buf memory usage recorded
-==================================== ==========================================
-
-1. History
-==========
-
-The memory controller has a long history. A request for comments for the memory
-controller was posted by Balbir Singh [1]. At the time the RFC was posted
-there were several implementations for memory control. The goal of the
-RFC was to build consensus and agreement for the minimal features required
-for memory control. The first RSS controller was posted by Balbir Singh[2]
-in Feb 2007. Pavel Emelianov [3][4][5] has since posted three versions of the
-RSS controller. At OLS, at the resource management BoF, everyone suggested
-that we handle both page cache and RSS together. Another request was raised
-to allow user space handling of OOM. The current memory controller is
-at version 6; it combines both mapped (RSS) and unmapped Page
-Cache Control [11].
-
-2. Memory Control
-=================
-
-Memory is a unique resource in the sense that it is present in a limited
-amount. If a task requires a lot of CPU processing, the task can spread
-its processing over a period of hours, days, months or years, but with
-memory, the same physical memory needs to be reused to accomplish the task.
-
-The memory controller implementation has been divided into phases. These
-are:
-
-1. Memory controller
-2. mlock(2) controller
-3. Kernel user memory accounting and slab control
-4. user mappings length controller
-
-The memory controller is the first controller developed.
-
-2.1. Design
------------
-
-The core of the design is a counter called the page_counter. The
-page_counter tracks the current memory usage and limit of the group of
-processes associated with the controller. Each cgroup has a memory controller
-specific data structure (mem_cgroup) associated with it.
-
-2.2. Accounting
----------------
-
-::
-
-		+--------------------+
-		|  mem_cgroup        |
-		|  (page_counter)    |
-		+--------------------+
-		 /            ^      \
-		/             |       \
-           +---------------+  |        +---------------+
-           | mm_struct     |  |....    | mm_struct     |
-           |               |  |        |               |
-           +---------------+  |        +---------------+
-                              |
-                              + --------------+
-                                              |
-           +---------------+           +------+--------+
-           | page          +---------->  page_cgroup|
-           |               |           |               |
-           +---------------+           +---------------+
-
-             (Figure 1: Hierarchy of Accounting)
-
-
-Figure 1 shows the important aspects of the controller
-
-1. Accounting happens per cgroup
-2. Each mm_struct knows about which cgroup it belongs to
-3. Each page has a pointer to the page_cgroup, which in turn knows the
-   cgroup it belongs to
-
-The accounting is done as follows: mem_cgroup_charge_common() is invoked to
-set up the necessary data structures and check if the cgroup that is being
-charged is over its limit. If it is, then reclaim is invoked on the cgroup.
-More details can be found in the reclaim section of this document.
-If everything goes well, a page meta-data-structure called page_cgroup is
-updated. page_cgroup has its own LRU on cgroup.
-(*) page_cgroup structure is allocated at boot/memory-hotplug time.
-
-2.2.1 Accounting details
-------------------------
-
-All mapped anon pages (RSS) and cache pages (Page Cache) are accounted.
-Some pages which are never reclaimable and will not be on the LRU
-are not accounted. We just account pages under usual VM management.
-
-RSS pages are accounted at page_fault unless they've already been accounted
-for earlier. A file page will be accounted for as Page Cache when it's
-inserted into inode (radix-tree). While it's mapped into the page tables of
-processes, duplicate accounting is carefully avoided.
-
-An RSS page is unaccounted when it's fully unmapped. A PageCache page is
-unaccounted when it's removed from radix-tree. Even if RSS pages are fully
-unmapped (by kswapd), they may exist as SwapCache in the system until they
-are really freed. Such SwapCaches are also accounted.
-A swapped-in page is not accounted until it's mapped.
-
-Note: The kernel does swapin-readahead and reads multiple swaps at once.
-This means swapped-in pages may contain pages for other tasks than a task
-causing page fault. So, we avoid accounting at swap-in I/O.
-
-At page migration, accounting information is kept.
-
-Note: we just account pages-on-LRU because our purpose is to control amount
-of used pages; not-on-LRU pages tend to be out-of-control from VM view.
-
-2.3 Shared Page Accounting
---------------------------
-
-Shared pages are accounted on the basis of the first touch approach. The
-cgroup that first touches a page is accounted for the page. The principle
-behind this approach is that a cgroup that aggressively uses a shared
-page will eventually get charged for it (once it is uncharged from
-the cgroup that brought it in -- this will happen on memory pressure).
-
-But see section 8.2: when moving a task to another cgroup, its pages may
-be recharged to the new cgroup, if move_charge_at_immigrate has been chosen.
-
-Exception: If CONFIG_MEMCG_SWAP is not used.
-When you do swapoff and make swapped-out pages of shmem(tmpfs) to
-be backed into memory in force, charges for pages are accounted against the
-caller of swapoff rather than the users of shmem.
-
-2.4 Swap Extension (CONFIG_MEMCG_SWAP)
---------------------------------------
-
-Swap Extension allows you to record charge for swap. A swapped-in page is
-charged back to original page allocator if possible.
-
-When swap is accounted, following files are added.
-
- - memory.memsw.usage_in_bytes.
- - memory.memsw.limit_in_bytes.
-
-memsw means memory+swap. Usage of memory+swap is limited by
-memsw.limit_in_bytes.
-
-Example: Assume a system with 4G of swap. A task which allocates 6G of memory
-(by mistake) under 2G memory limitation will use all swap.
-In this case, setting memsw.limit_in_bytes=3G will prevent bad use of swap.
-By using the memsw limit, you can avoid system OOM which can be caused by swap
-shortage.
-
-**why 'memory+swap' rather than swap**
-
-The global LRU(kswapd) can swap out arbitrary pages. Swap-out means
-to move account from memory to swap...there is no change in usage of
-memory+swap. In other words, when we want to limit the usage of swap without
-affecting global LRU, memory+swap limit is better than just limiting swap from
-an OS point of view.
-
-**What happens when a cgroup hits memory.memsw.limit_in_bytes**
-
-When a cgroup hits memory.memsw.limit_in_bytes, it's useless to do swap-out
-in this cgroup. Then, swap-out will not be done by cgroup routine and file
-caches are dropped. But as mentioned above, global LRU can do swapout memory
-from it for sanity of the system's memory management state. You can't forbid
-it by cgroup.
-
-2.5 Reclaim
------------
-
-Each cgroup maintains a per cgroup LRU which has the same structure as
-global VM. When a cgroup goes over its limit, we first try
-to reclaim memory from the cgroup so as to make space for the new
-pages that the cgroup has touched. If the reclaim is unsuccessful,
-an OOM routine is invoked to select and kill the bulkiest task in the
-cgroup. (See 10. OOM Control below.)
-
-The reclaim algorithm has not been modified for cgroups, except that
-pages that are selected for reclaiming come from the per-cgroup LRU
-list.
-
-NOTE:
-  Reclaim does not work for the root cgroup, since we cannot set any
-  limits on the root cgroup.
-
-Note2:
-  When panic_on_oom is set to "2", the whole system will panic.
-
-When oom event notifier is registered, event will be delivered.
-(See oom_control section)
-
-2.6 Locking
------------
-
-   lock_page_cgroup()/unlock_page_cgroup() should not be called under
-   the i_pages lock.
-
-   Other lock order is following:
-
-   PG_locked.
-     mm->page_table_lock
-         pgdat->lru_lock
-	   lock_page_cgroup.
-
-  In many cases, just lock_page_cgroup() is called.
-
-  per-zone-per-cgroup LRU (cgroup's private LRU) is just guarded by
-  pgdat->lru_lock, it has no lock of its own.
-
-2.7 Kernel Memory Extension (CONFIG_MEMCG_KMEM)
------------------------------------------------
-
-With the Kernel memory extension, the Memory Controller is able to limit
-the amount of kernel memory used by the system. Kernel memory is fundamentally
-different than user memory, since it can't be swapped out, which makes it
-possible to DoS the system by consuming too much of this precious resource.
-
-Kernel memory accounting is enabled for all memory cgroups by default. But
-it can be disabled system-wide by passing cgroup.memory=nokmem to the kernel
-at boot time. In this case, kernel memory will not be accounted at all.
-
-Kernel memory limits are not imposed for the root cgroup. Usage for the root
-cgroup may or may not be accounted. The memory used is accumulated into
-memory.kmem.usage_in_bytes, or in a separate counter when it makes sense.
-(currently only for tcp).
-
-The main "kmem" counter is fed into the main counter, so kmem charges will
-also be visible from the user counter.
-
-Currently no soft limit is implemented for kernel memory. It is future work
-to trigger slab reclaim when those limits are reached.
-
-2.7.1 Current Kernel Memory resources accounted
------------------------------------------------
-
-stack pages:
-  every process consumes some stack pages. By accounting into
-  kernel memory, we prevent new processes from being created when the kernel
-  memory usage is too high.
-
-slab pages:
-  pages allocated by the SLAB or SLUB allocator are tracked. A copy
-  of each kmem_cache is created the first time the cache is touched
-  from inside the memcg. The creation is done lazily, so some objects can still be
-  skipped while the cache is being created. All objects in a slab page should
-  belong to the same memcg. This only fails to hold when a task is migrated to a
-  different memcg during the page allocation by the cache.
-
-sockets memory pressure:
-  some sockets protocols have memory pressure
-  thresholds. The Memory Controller allows them to be controlled individually
-  per cgroup, instead of globally.
-
-tcp memory pressure:
-  sockets memory pressure for the tcp protocol.
-
-2.7.2 Common use cases
-----------------------
-
-Because the "kmem" counter is fed to the main user counter, kernel memory can
-never be limited completely independently of user memory. Say "U" is the user
-limit, and "K" the kernel limit. There are three possible ways limits can be
-set:
-
-U != 0, K = unlimited:
-    This is the standard memcg limitation mechanism already present before kmem
-    accounting. Kernel memory is completely ignored.
-
-U != 0, K < U:
-    Kernel memory is a subset of the user memory. This setup is useful in
-    deployments where the total amount of memory per-cgroup is overcommitted.
-    Overcommitting kernel memory limits is definitely not recommended, since the
-    box can still run out of non-reclaimable memory.
-    In this case, the admin could set up K so that the sum of all groups is
-    never greater than the total memory, and freely set U at the cost of his
-    QoS.
-
-WARNING:
-    In the current implementation, memory reclaim will NOT be
-    triggered for a cgroup when it hits K while staying below U, which makes
-    this setup impractical.
-
-U != 0, K >= U:
-    Since kmem charges will also be fed to the user counter, reclaim will be
-    triggered for the cgroup for both kinds of memory. This setup gives the
-    admin a unified view of memory, and it is also useful for people who just
-    want to track kernel memory usage.
-
-3. User Interface
-=================
-
-3.0. Configuration
-------------------
-
-a. Enable CONFIG_CGROUPS
-b. Enable CONFIG_MEMCG
-c. Enable CONFIG_MEMCG_SWAP (to use swap extension)
-d. Enable CONFIG_MEMCG_KMEM (to use kmem extension)
-
-3.1. Prepare the cgroups (see cgroups.txt, Why are cgroups needed?)
--------------------------------------------------------------------
-
-::
-
-	# mount -t tmpfs none /sys/fs/cgroup
-	# mkdir /sys/fs/cgroup/memory
-	# mount -t cgroup none /sys/fs/cgroup/memory -o memory
-
-3.2. Make the new group and move bash into it::
-
-	# mkdir /sys/fs/cgroup/memory/0
-	# echo $$ > /sys/fs/cgroup/memory/0/tasks
-
-Since now we're in the 0 cgroup, we can alter the memory limit::
-
-	# echo 4M > /sys/fs/cgroup/memory/0/memory.limit_in_bytes
-
-NOTE:
-  We can use a suffix (k, K, m, M, g or G) to indicate values in kilo,
-  mega or gigabytes. (Here, Kilo, Mega, Giga are Kibibytes, Mebibytes,
-  Gibibytes.)
-
-NOTE:
-  We can write "-1" to reset the ``*.limit_in_bytes(unlimited)``.
-
-NOTE:
-  We cannot set limits on the root cgroup any more.
-
-::
-
-  # cat /sys/fs/cgroup/memory/0/memory.limit_in_bytes
-  4194304
-
-We can check the usage::
-
-  # cat /sys/fs/cgroup/memory/0/memory.usage_in_bytes
-  1216512
-
-A successful write to this file does not guarantee a successful setting of
-this limit to the value written into the file. This can be due to a
-number of factors, such as rounding up to page boundaries or the total
-availability of memory on the system. The user is required to re-read
-this file after a write to guarantee the value committed by the kernel::
-
-  # echo 1 > memory.limit_in_bytes
-  # cat memory.limit_in_bytes
-  4096
-
-The memory.failcnt field gives the number of times that the cgroup limit was
-exceeded.
-
-The memory.stat file gives accounting information. Now, the number of
-caches, RSS and Active pages/Inactive pages are shown.
-
-4. Testing
-==========
-
-For testing features and implementation, see memcg_test.txt.
-
-Performance test is also important. To see pure memory controller's overhead,
-testing on tmpfs will give you good numbers of small overheads.
-Example: do kernel make on tmpfs.
-
-Page-fault scalability is also important. When measuring parallel
-page faults, a multi-process test may be better than a multi-thread
-test, because the latter has noise from shared objects/status.
-
-But the above two are testing extreme situations.
-Trying usual test under memory controller is always helpful.
-
-4.1 Troubleshooting
--------------------
-
-Sometimes a user might find that the application under a cgroup is
-terminated by the OOM killer. There are several causes for this:
-
-1. The cgroup limit is too low (just too low to do anything useful)
-2. The user is using anonymous memory and swap is turned off or too low
-
-A sync followed by echo 1 > /proc/sys/vm/drop_caches will help get rid of
-some of the pages cached in the cgroup (page cache pages).
-
-To know what happens, disabling OOM_Kill as per "10. OOM Control" (below) and
-seeing what happens will be helpful.
-
-4.2 Task migration
-------------------
-
-When a task migrates from one cgroup to another, its charge is not
-carried forward by default. The pages allocated from the original cgroup still
-remain charged to it, the charge is dropped when the page is freed or
-reclaimed.
-
-You can move charges of a task along with task migration.
-See 8. "Move charges at task migration"
-
-4.3 Removing a cgroup
----------------------
-
-A cgroup can be removed by rmdir, but as discussed in sections 4.1 and 4.2, a
-cgroup might have some charge associated with it, even though all
-tasks have migrated away from it. (because we charge against pages, not
-against tasks.)
-
-We move the stats to root (if use_hierarchy==0) or parent (if
-use_hierarchy==1), and no change on the charge except uncharging
-from the child.
-
-Charges recorded in swap information are not updated at removal of cgroup.
-Recorded information is discarded and a cgroup which uses swap (swapcache)
-will be charged as a new owner of it.
-
-About use_hierarchy, see Section 6.
-
-5. Misc. interfaces
-===================
-
-5.1 force_empty
----------------
-  memory.force_empty interface is provided to make cgroup's memory usage empty.
-  When writing anything to this::
-
-    # echo 0 > memory.force_empty
-
-  the cgroup will be reclaimed and as many pages reclaimed as possible.
-
-  The typical use case for this interface is before calling rmdir().
-  Although rmdir() offlines the memcg, the memcg may still stay there due to
-  charged file caches. Some out-of-use page caches may keep charged until
-  memory pressure happens. If you want to avoid that, force_empty will be useful.
-
-  Also, note that when memory.kmem.limit_in_bytes is set the charges due to
-  kernel pages will still be seen. This is not considered a failure and the
-  write will still return success. In this case, it is expected that
-  memory.kmem.usage_in_bytes == memory.usage_in_bytes.
-
-  About use_hierarchy, see Section 6.
-
-5.2 stat file
--------------
-
-memory.stat file includes following statistics
-
-per-memory cgroup local status
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-=============== ===============================================================
-cache		# of bytes of page cache memory.
-rss		# of bytes of anonymous and swap cache memory (includes
-		transparent hugepages).
-rss_huge	# of bytes of anonymous transparent hugepages.
-mapped_file	# of bytes of mapped file (includes tmpfs/shmem)
-pgpgin		# of charging events to the memory cgroup. The charging
-		event happens each time a page is accounted as either mapped
-		anon page(RSS) or cache page(Page Cache) to the cgroup.
-pgpgout		# of uncharging events to the memory cgroup. The uncharging
-		event happens each time a page is unaccounted from the cgroup.
-swap		# of bytes of swap usage
-dirty		# of bytes that are waiting to get written back to the disk.
-writeback	# of bytes of file/anon cache that are queued for syncing to
-		disk.
-inactive_anon	# of bytes of anonymous and swap cache memory on inactive
-		LRU list.
-active_anon	# of bytes of anonymous and swap cache memory on active
-		LRU list.
-inactive_file	# of bytes of file-backed memory on inactive LRU list.
-active_file	# of bytes of file-backed memory on active LRU list.
-unevictable	# of bytes of memory that cannot be reclaimed (mlocked etc).
-=============== ===============================================================
-
-status considering hierarchy (see memory.use_hierarchy settings)
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-========================= ===================================================
-hierarchical_memory_limit # of bytes of memory limit with regard to hierarchy
-			  under which the memory cgroup is
-hierarchical_memsw_limit  # of bytes of memory+swap limit with regard to
-			  hierarchy under which memory cgroup is.
-
-total_<counter>		  # hierarchical version of <counter>, which in
-			  addition to the cgroup's own value includes the
-			  sum of all hierarchical children's values of
-			  <counter>, i.e. total_cache
-========================= ===================================================
-
-The following additional stats are dependent on CONFIG_DEBUG_VM
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-========================= ========================================
-recent_rotated_anon	  VM internal parameter. (see mm/vmscan.c)
-recent_rotated_file	  VM internal parameter. (see mm/vmscan.c)
-recent_scanned_anon	  VM internal parameter. (see mm/vmscan.c)
-recent_scanned_file	  VM internal parameter. (see mm/vmscan.c)
-========================= ========================================
-
-Memo:
-	recent_rotated means recent frequency of LRU rotation.
-	recent_scanned means recent # of scans to LRU.
-	showing for better debug please see the code for meanings.
-
-Note:
-	Only anonymous and swap cache memory is listed as part of 'rss' stat.
-	This should not be confused with the true 'resident set size' or the
-	amount of physical memory used by the cgroup.
-
-	'rss + mapped_file' will give you the resident set size of the cgroup.
-
-	(Note: file and shmem may be shared among other cgroups. In that case,
-	mapped_file is accounted only when the memory cgroup is owner of page
-	cache.)
-
-5.3 swappiness
---------------
-
-Overrides /proc/sys/vm/swappiness for the particular group. The tunable
-in the root cgroup corresponds to the global swappiness setting.
-
-Please note that unlike during the global reclaim, limit reclaim
-enforces that 0 swappiness really prevents from any swapping even if
-there is a swap storage available. This might lead to memcg OOM killer
-if there are no file pages to reclaim.
-
-5.4 failcnt
------------
-
-A memory cgroup provides memory.failcnt and memory.memsw.failcnt files.
-This failcnt(== failure count) shows the number of times that a usage counter
-hit its limit. When a memory cgroup hits a limit, failcnt increases and
-memory under it will be reclaimed.
-
-You can reset failcnt by writing 0 to failcnt file::
-
-	# echo 0 > .../memory.failcnt
-
-5.5 usage_in_bytes
-------------------
-
-For efficiency, as other kernel components, memory cgroup uses some optimization
-to avoid unnecessary cacheline false sharing. usage_in_bytes is affected by the
-method and doesn't show the 'exact' value of memory (and swap) usage; it's a fuzzy
-value for efficient access. (Of course, when necessary, it's synchronized.)
-If you want to know more exact memory usage, you should use RSS+CACHE(+SWAP)
-value in memory.stat(see 5.2).
-
-5.6 numa_stat
--------------
-
-This is similar to numa_maps but operates on a per-memcg basis.  This is
-useful for providing visibility into the numa locality information within
-a memcg since the pages are allowed to be allocated from any physical
-node.  One of the use cases is evaluating application performance by
-combining this information with the application's CPU allocation.
-
-Each memcg's numa_stat file includes "total", "file", "anon" and "unevictable"
-per-node page counts including "hierarchical_<counter>" which sums up all
-hierarchical children's values in addition to the memcg's own value.
-
-The output format of memory.numa_stat is::
-
-  total=<total pages> N0=<node 0 pages> N1=<node 1 pages> ...
-  file=<total file pages> N0=<node 0 pages> N1=<node 1 pages> ...
-  anon=<total anon pages> N0=<node 0 pages> N1=<node 1 pages> ...
-  unevictable=<total unevictable pages> N0=<node 0 pages> N1=<node 1 pages> ...
-  hierarchical_<counter>=<counter pages> N0=<node 0 pages> N1=<node 1 pages> ...
-
-The "total" count is sum of file + anon + unevictable.
-
-6. Hierarchy support
-====================
-
-The memory controller supports a deep hierarchy and hierarchical accounting.
-The hierarchy is created by creating the appropriate cgroups in the
-cgroup filesystem. Consider for example, the following cgroup filesystem
-hierarchy::
-
-	       root
-	     /  |   \
-            /	|    \
-	   a	b     c
-		      | \
-		      |  \
-		      d   e
-
-In the diagram above, with hierarchical accounting enabled, all memory
-usage of e, is accounted to its ancestors up until the root (i.e, c and root),
-that has memory.use_hierarchy enabled. If one of the ancestors goes over its
-limit, the reclaim algorithm reclaims from the tasks in the ancestor and the
-children of the ancestor.
-
-6.1 Enabling hierarchical accounting and reclaim
-------------------------------------------------
-
-A memory cgroup by default disables the hierarchy feature. Support
-can be enabled by writing 1 to memory.use_hierarchy file of the root cgroup::
-
-	# echo 1 > memory.use_hierarchy
-
-The feature can be disabled by::
-
-	# echo 0 > memory.use_hierarchy
-
-NOTE1:
-       Enabling/disabling will fail if either the cgroup already has other
-       cgroups created below it, or if the parent cgroup has use_hierarchy
-       enabled.
-
-NOTE2:
-       When panic_on_oom is set to "2", the whole system will panic in
-       case of an OOM event in any cgroup.
-
-7. Soft limits
-==============
-
-Soft limits allow for greater sharing of memory. The idea behind soft limits
-is to allow control groups to use as much of the memory as needed, provided
-
-a. There is no memory contention
-b. They do not exceed their hard limit
-
-When the system detects memory contention or low memory, control groups
-are pushed back to their soft limits. If the soft limit of each control
-group is very high, they are pushed back as much as possible to make
-sure that one control group does not starve the others of memory.
-
-Please note that soft limits is a best-effort feature; it comes with
-no guarantees, but it does its best to make sure that when memory is
-heavily contended for, memory is allocated based on the soft limit
-hints/setup. Currently soft limit based reclaim is set up such that
-it gets invoked from balance_pgdat (kswapd).
-
-7.1 Interface
--------------
-
-Soft limits can be setup by using the following commands (in this example we
-assume a soft limit of 256 MiB)::
-
-	# echo 256M > memory.soft_limit_in_bytes
-
-If we want to change this to 1G, we can at any time use::
-
-	# echo 1G > memory.soft_limit_in_bytes
-
-NOTE1:
-       Soft limits take effect over a long period of time, since they involve
-       reclaiming memory for balancing between memory cgroups
-NOTE2:
-       It is recommended to set the soft limit always below the hard limit,
-       otherwise the hard limit will take precedence.
-
-8. Move charges at task migration
-=================================
-
-Users can move charges associated with a task along with task migration, that
-is, uncharge task's pages from the old cgroup and charge them to the new cgroup.
-This feature is not supported in !CONFIG_MMU environments because of lack of
-page tables.
-
-8.1 Interface
--------------
-
-This feature is disabled by default. It can be enabled (and disabled again) by
-writing to memory.move_charge_at_immigrate of the destination cgroup.
-
-If you want to enable it::
-
-	# echo (some positive value) > memory.move_charge_at_immigrate
-
-Note:
-      Each bit of move_charge_at_immigrate has its own meaning about what type
-      of charges should be moved. See 8.2 for details.
-Note:
-      Charges are moved only when you move mm->owner, in other words,
-      a leader of a thread group.
-Note:
-      If we cannot find enough space for the task in the destination cgroup, we
-      try to make space by reclaiming memory. Task migration may fail if we
-      cannot make enough space.
-Note:
-      It can take several seconds if you move many charges.
-
-And if you want to disable it again::
-
-	# echo 0 > memory.move_charge_at_immigrate
-
-8.2 Type of charges which can be moved
---------------------------------------
-
-Each bit in move_charge_at_immigrate has its own meaning about what type of
-charges should be moved. But in any case, it must be noted that an account of
-a page or a swap can be moved only when it is charged to the task's current
-(old) memory cgroup.
-
-+---+--------------------------------------------------------------------------+
-|bit| what type of charges would be moved ?                                    |
-+===+==========================================================================+
-| 0 | A charge of an anonymous page (or swap of it) used by the target task.   |
-|   | You must enable Swap Extension (see 2.4) to enable move of swap charges. |
-+---+--------------------------------------------------------------------------+
-| 1 | A charge of file pages (normal file, tmpfs file (e.g. ipc shared memory) |
-|   | and swaps of tmpfs file) mmapped by the target task. Unlike the case of  |
-|   | anonymous pages, file pages (and swaps) in the range mmapped by the task |
-|   | will be moved even if the task hasn't done page fault, i.e. they might   |
-|   | not be the task's "RSS", but other task's "RSS" that maps the same file. |
-|   | And mapcount of the page is ignored (the page can be moved even if       |
-|   | page_mapcount(page) > 1). You must enable Swap Extension (see 2.4) to    |
-|   | enable move of swap charges.                                             |
-+---+--------------------------------------------------------------------------+
-
-8.3 TODO
---------
-
-- All of moving charge operations are done under cgroup_mutex. It's not good
-  behavior to hold the mutex too long, so we may need some trick.
-
-9. Memory thresholds
-====================
-
-Memory cgroup implements memory thresholds using the cgroups notification
-API (see cgroups.txt). It allows an application to register multiple memory
-and memsw thresholds and to be notified when usage crosses any of them.
-
-To register a threshold, an application must:
-
-- create an eventfd using eventfd(2);
-- open memory.usage_in_bytes or memory.memsw.usage_in_bytes;
-- write string like "<event_fd> <fd of memory.usage_in_bytes> <threshold>" to
-  cgroup.event_control.
-
-Application will be notified through eventfd when memory usage crosses
-threshold in any direction.
-
-It's applicable for root and non-root cgroup.
-
-10. OOM Control
-===============
-
-memory.oom_control file is for OOM notification and other controls.
-
-Memory cgroup implements OOM notifier using the cgroup notification
-API (See cgroups.txt). It allows an application to register for multiple OOM
-notification deliveries and to be notified when OOM happens.
-
-To register a notifier, an application must:
-
- - create an eventfd using eventfd(2)
- - open memory.oom_control file
- - write string like "<event_fd> <fd of memory.oom_control>" to
-   cgroup.event_control
-
-The application will be notified through eventfd when OOM happens.
-OOM notification doesn't work for the root cgroup.
-
-You can disable the OOM-killer by writing "1" to memory.oom_control file, as:
-
-	#echo 1 > memory.oom_control
-
-If OOM-killer is disabled, tasks under cgroup will hang/sleep
-in memory cgroup's OOM-waitqueue when they request accountable memory.
-
-For running them, you have to relax the memory cgroup's OOM status by
-
-	* enlarge limit or reduce usage.
-
-To reduce usage,
-
-	* kill some tasks.
-	* move some tasks to other group with account migration.
-	* remove some files (on tmpfs?)
-
-Then, stopped tasks will work again.
-
-At reading, current status of OOM is shown.
-
-	- oom_kill_disable 0 or 1
-	  (if 1, oom-killer is disabled)
-	- under_oom	   0 or 1
-	  (if 1, the memory cgroup is under OOM, tasks may be stopped.)
-
-11. Memory Pressure
-===================
-
-The pressure level notifications can be used to monitor the memory
-allocation cost; based on the pressure, applications can implement
-different strategies of managing their memory resources. The pressure
-levels are defined as follows:
-
-The "low" level means that the system is reclaiming memory for new
-allocations. Monitoring this reclaiming activity might be useful for
-maintaining cache level. Upon notification, the program (typically
-"Activity Manager") might analyze vmstat and act in advance (i.e.
-prematurely shutdown unimportant services).
-
-The "medium" level means that the system is experiencing medium memory
-pressure, the system might be making swap, paging out active file caches,
-etc. Upon this event applications may decide to further analyze
-vmstat/zoneinfo/memcg or internal memory usage statistics and free any
-resources that can be easily reconstructed or re-read from a disk.
-
-The "critical" level means that the system is actively thrashing, it is
-about to run out of memory (OOM) or even the in-kernel OOM killer is on its
-way to trigger. Applications should do whatever they can to help the
-system. It might be too late to consult with vmstat or any other
-statistics, so it's advisable to take an immediate action.
-
-By default, events are propagated upward until the event is handled, i.e. the
-events are not pass-through. For example, you have three cgroups: A->B->C. Now
-you set up an event listener on cgroups A, B and C, and suppose group C
-experiences some pressure. In this situation, only group C will receive the
-notification, i.e. groups A and B will not receive it. This is done to avoid
-excessive "broadcasting" of messages, which disturbs the system and which is
-especially bad if we are low on memory or thrashing. Group B will receive
-notification only if there are no event listeners for group C.
-
-There are three optional modes that specify different propagation behavior:
-
- - "default": this is the default behavior specified above. This mode is the
-   same as omitting the optional mode parameter, preserved for backwards
-   compatibility.
-
- - "hierarchy": events always propagate up to the root, similar to the default
-   behavior, except that propagation continues regardless of whether there are
-   event listeners at each level, with the "hierarchy" mode. In the above
-   example, groups A, B, and C will receive notification of memory pressure.
-
- - "local": events are pass-through, i.e. they only receive notifications when
-   memory pressure is experienced in the memcg for which the notification is
-   registered. In the above example, group C will receive notification if
-   registered for "local" notification and the group experiences memory
-   pressure. However, group B will never receive notification, regardless if
-   there is an event listener for group C or not, if group B is registered for
-   local notification.
-
-The level and event notification mode ("hierarchy" or "local", if necessary) are
-specified by a comma-delimited string, i.e. "low,hierarchy" specifies
-hierarchical, pass-through, notification for all ancestor memcgs. Notification
-that is the default, non pass-through behavior, does not specify a mode.
-"medium,local" specifies pass-through notification for the medium level.
-
-The file memory.pressure_level is only used to setup an eventfd. To
-register a notification, an application must:
-
-- create an eventfd using eventfd(2);
-- open memory.pressure_level;
-- write string as "<event_fd> <fd of memory.pressure_level> <level[,mode]>"
-  to cgroup.event_control.
-
-Application will be notified through eventfd when memory pressure is at
-the specific level (or higher). Read/write operations to
-memory.pressure_level are not implemented.
-
-Test:
-
-   Here is a small script example that makes a new cgroup, sets up a
-   memory limit, sets up a notification in the cgroup and then makes child
-   cgroup experience a critical pressure::
-
-	# cd /sys/fs/cgroup/memory/
-	# mkdir foo
-	# cd foo
-	# cgroup_event_listener memory.pressure_level low,hierarchy &
-	# echo 8000000 > memory.limit_in_bytes
-	# echo 8000000 > memory.memsw.limit_in_bytes
-	# echo $$ > tasks
-	# dd if=/dev/zero | read x
-
-   (Expect a bunch of notifications, and eventually, the oom-killer will
-   trigger.)
-
-12. TODO
-========
-
-1. Make per-cgroup scanner reclaim not-shared pages first
-2. Teach controller to account for shared-pages
-3. Start reclamation in the background when the limit is
-   not yet hit but the usage is getting closer
-
-Summary
-=======
-
-Overall, the memory controller has been a stable controller and has been
-commented and discussed quite extensively in the community.
-
-References
-==========
-
-1. Singh, Balbir. RFC: Memory Controller, http://lwn.net/Articles/206697/
-2. Singh, Balbir. Memory Controller (RSS Control),
-   http://lwn.net/Articles/222762/
-3. Emelianov, Pavel. Resource controllers based on process cgroups
-   http://lkml.org/lkml/2007/3/6/198
-4. Emelianov, Pavel. RSS controller based on process cgroups (v2)
-   http://lkml.org/lkml/2007/4/9/78
-5. Emelianov, Pavel. RSS controller based on process cgroups (v3)
-   http://lkml.org/lkml/2007/5/30/244
-6. Menage, Paul. Control Groups v10, http://lwn.net/Articles/236032/
-7. Vaidyanathan, Srinivasan, Control Groups: Pagecache accounting and control
-   subsystem (v3), http://lwn.net/Articles/235534/
-8. Singh, Balbir. RSS controller v2 test results (lmbench),
-   http://lkml.org/lkml/2007/5/17/232
-9. Singh, Balbir. RSS controller v2 AIM9 results
-   http://lkml.org/lkml/2007/5/18/1
-10. Singh, Balbir. Memory controller v6 test results,
-    http://lkml.org/lkml/2007/8/19/36
-11. Singh, Balbir. Memory controller introduction (v6),
-    http://lkml.org/lkml/2007/8/17/69
-12. Corbet, Jonathan, Controlling memory use in cgroups,
-    http://lwn.net/Articles/243795/
diff --git a/Documentation/cgroup-v1/net_cls.rst b/Documentation/cgroup-v1/net_cls.rst
deleted file mode 100644
index a2cf272af7a0..000000000000
--- a/Documentation/cgroup-v1/net_cls.rst
+++ /dev/null
@@ -1,44 +0,0 @@
-=========================
-Network classifier cgroup
-=========================
-
-The Network classifier cgroup provides an interface to
-tag network packets with a class identifier (classid).
-
-The Traffic Controller (tc) can be used to assign
-different priorities to packets from different cgroups.
-Also, Netfilter (iptables) can use this tag to perform
-actions on such packets.
-
-Creating a net_cls cgroups instance creates a net_cls.classid file.
-This net_cls.classid value is initialized to 0.
-
-You can write hexadecimal values to net_cls.classid; the format for these
-values is 0xAAAABBBB; AAAA is the major handle number and BBBB
-is the minor handle number.
-Reading net_cls.classid yields a decimal result.
-
-Example::
-
-	mkdir /sys/fs/cgroup/net_cls
-	mount -t cgroup -onet_cls net_cls /sys/fs/cgroup/net_cls
-	mkdir /sys/fs/cgroup/net_cls/0
-	echo 0x100001 >  /sys/fs/cgroup/net_cls/0/net_cls.classid
-
-- setting a 10:1 handle::
-
-	cat /sys/fs/cgroup/net_cls/0/net_cls.classid
-	1048577
-
-- configuring tc::
-
-	tc qdisc add dev eth0 root handle 10: htb
-	tc class add dev eth0 parent 10: classid 10:1 htb rate 40mbit
-
-- creating traffic class 10:1::
-
-	tc filter add dev eth0 parent 10: protocol ip prio 10 handle 1: cgroup
-
-configuring iptables, basic example::
-
-	iptables -A OUTPUT -m cgroup ! --cgroup 0x100001 -j DROP
diff --git a/Documentation/cgroup-v1/net_prio.rst b/Documentation/cgroup-v1/net_prio.rst
deleted file mode 100644
index b40905871c64..000000000000
--- a/Documentation/cgroup-v1/net_prio.rst
+++ /dev/null
@@ -1,57 +0,0 @@
-=======================
-Network priority cgroup
-=======================
-
-The Network priority cgroup provides an interface to allow an administrator to
-dynamically set the priority of network traffic generated by various
-applications.
-
-Nominally, an application would set the priority of its traffic via the
-SO_PRIORITY socket option.  This however, is not always possible because:
-
-1) The application may not have been coded to set this value
-2) The priority of application traffic is often a site-specific administrative
-   decision rather than an application defined one.
-
-This cgroup allows an administrator to assign a process to a group which defines
-the priority of egress traffic on a given interface. Network priority groups can
-be created by first mounting the cgroup filesystem::
-
-	# mount -t cgroup -onet_prio none /sys/fs/cgroup/net_prio
-
-With the above step, the initial group acting as the parent accounting group
-becomes visible at '/sys/fs/cgroup/net_prio'.  This group includes all tasks in
-the system. '/sys/fs/cgroup/net_prio/tasks' lists the tasks in this cgroup.
-
-Each net_prio cgroup contains two files that are subsystem specific
-
-net_prio.prioidx
-  This file is read-only, and is simply informative.  It contains a unique
-  integer value that the kernel uses as an internal representation of this
-  cgroup.
-
-net_prio.ifpriomap
-  This file contains a map of the priorities assigned to traffic originating
-  from processes in this group and egressing the system on various interfaces.
-  It contains a list of tuples in the form <ifname priority>.  Contents of this
-  file can be modified by echoing a string into the file using the same tuple
-  format. For example::
-
-	echo "eth0 5" > /sys/fs/cgroup/net_prio/iscsi/net_prio.ifpriomap
-
-This command would force any traffic originating from processes belonging to the
-iscsi net_prio cgroup and egressing on interface eth0 to have the priority of
-said traffic set to the value 5. The parent accounting group also has a
-writeable 'net_prio.ifpriomap' file that can be used to set a system default
-priority.
-
-Priorities are set immediately prior to queueing a frame to the device
-queueing discipline (qdisc) so priorities will be assigned prior to the hardware
-queue selection being made.
-
-One usage for the net_prio cgroup is with mqprio qdisc allowing application
-traffic to be steered to hardware/driver based traffic classes. These mappings
-can then be managed by administrators or other networking protocols such as
-DCBX.
-
-A new net_prio cgroup inherits the parent's configuration.
diff --git a/Documentation/cgroup-v1/pids.rst b/Documentation/cgroup-v1/pids.rst
deleted file mode 100644
index 6acebd9e72c8..000000000000
--- a/Documentation/cgroup-v1/pids.rst
+++ /dev/null
@@ -1,92 +0,0 @@
-=========================
-Process Number Controller
-=========================
-
-Abstract
---------
-
-The process number controller is used to allow a cgroup hierarchy to stop any
-new tasks from being fork()'d or clone()'d after a certain limit is reached.
-
-Since it is trivial to hit the task limit without hitting any kmemcg limits in
-place, PIDs are a fundamental resource. As such, PID exhaustion must be
-preventable in the scope of a cgroup hierarchy by allowing resource limiting of
-the number of tasks in a cgroup.
-
-Usage
------
-
-In order to use the `pids` controller, set the maximum number of tasks in
-pids.max (this is not available in the root cgroup for obvious reasons). The
-number of processes currently in the cgroup is given by pids.current.
-
-Organisational operations are not blocked by cgroup policies, so it is possible
-to have pids.current > pids.max. This can be done by either setting the limit to
-be smaller than pids.current, or attaching enough processes to the cgroup such
-that pids.current > pids.max. However, it is not possible to violate a cgroup
-policy through fork() or clone(). fork() and clone() will return -EAGAIN if the
-creation of a new process would cause a cgroup policy to be violated.
-
-To set a cgroup to have no limit, set pids.max to "max". This is the default for
-all new cgroups (N.B. that PID limits are hierarchical, so the most stringent
-limit in the hierarchy is followed).
-
-pids.current tracks all child cgroup hierarchies, so parent/pids.current is a
-superset of parent/child/pids.current.
-
-The pids.events file contains event counters:
-
-  - max: Number of times fork failed because limit was hit.
-
-Example
--------
-
-First, we mount the pids controller::
-
-	# mkdir -p /sys/fs/cgroup/pids
-	# mount -t cgroup -o pids none /sys/fs/cgroup/pids
-
-Then we create a hierarchy, set limits and attach processes to it::
-
-	# mkdir -p /sys/fs/cgroup/pids/parent/child
-	# echo 2 > /sys/fs/cgroup/pids/parent/pids.max
-	# echo $$ > /sys/fs/cgroup/pids/parent/cgroup.procs
-	# cat /sys/fs/cgroup/pids/parent/pids.current
-	2
-	#
-
-It should be noted that attempts to overcome the set limit (2 in this case) will
-fail::
-
-	# cat /sys/fs/cgroup/pids/parent/pids.current
-	2
-	# ( /bin/echo "Here's some processes for you." | cat )
-	sh: fork: Resource temporarily unavailable
-	#
-
-Even if we migrate to a child cgroup (which doesn't have a set limit), we will
-not be able to overcome the most stringent limit in the hierarchy (in this case,
-parent's)::
-
-	# echo $$ > /sys/fs/cgroup/pids/parent/child/cgroup.procs
-	# cat /sys/fs/cgroup/pids/parent/pids.current
-	2
-	# cat /sys/fs/cgroup/pids/parent/child/pids.current
-	2
-	# cat /sys/fs/cgroup/pids/parent/child/pids.max
-	max
-	# ( /bin/echo "Here's some processes for you." | cat )
-	sh: fork: Resource temporarily unavailable
-	#
-
-We can set a limit that is smaller than pids.current, which will stop any new
-processes from being forked at all (note that the shell itself counts towards
-pids.current)::
-
-	# echo 1 > /sys/fs/cgroup/pids/parent/pids.max
-	# /bin/echo "We can't even spawn a single process now."
-	sh: fork: Resource temporarily unavailable
-	# echo 0 > /sys/fs/cgroup/pids/parent/pids.max
-	# /bin/echo "We can't even spawn a single process now."
-	sh: fork: Resource temporarily unavailable
-	#
diff --git a/Documentation/cgroup-v1/rdma.rst b/Documentation/cgroup-v1/rdma.rst
deleted file mode 100644
index 2fcb0a9bf790..000000000000
--- a/Documentation/cgroup-v1/rdma.rst
+++ /dev/null
@@ -1,117 +0,0 @@
-===============
-RDMA Controller
-===============
-
-.. Contents
-
-   1. Overview
-     1-1. What is RDMA controller?
-     1-2. Why RDMA controller needed?
-     1-3. How is RDMA controller implemented?
-   2. Usage Examples
-
-1. Overview
-===========
-
-1-1. What is RDMA controller?
------------------------------
-
-RDMA controller allows user to limit RDMA/IB specific resources that a given
-set of processes can use. These processes are grouped using RDMA controller.
-
-RDMA controller defines two resources which can be limited for processes of a
-cgroup.
-
-1-2. Why RDMA controller needed?
---------------------------------
-
-Currently user space applications can easily take away all the rdma verb
-specific resources such as AH, CQ, QP, MR etc. Due to which other applications
-in other cgroup or kernel space ULPs may not even get chance to allocate any
-rdma resources. This can lead to service unavailability.
-
-Therefore RDMA controller is needed through which resource consumption
-of processes can be limited. Through this controller different rdma
-resources can be accounted.
-
-1-3. How is RDMA controller implemented?
-----------------------------------------
-
-RDMA cgroup allows limit configuration of resources. Rdma cgroup maintains
-resource accounting per cgroup, per device using resource pool structure.
-Each such resource pool is limited up to 64 resources in given resource pool
-by rdma cgroup, which can be extended later if required.
-
-This resource pool object is linked to the cgroup css. Typically there
-are 0 to 4 resource pool instances per cgroup, per device in most use cases.
-But nothing prevents having more. At present hundreds of RDMA devices per
-single cgroup may not be handled optimally, however there is no
-known use case or requirement for such configuration either.
-
-Since RDMA resources can be allocated from any process and can be freed by any
-of the child processes which shares the address space, rdma resources are
-always owned by the creator cgroup css. This allows process migration from one
-to other cgroup without major complexity of transferring resource ownership;
-because such ownership is not really present due to shared nature of
-rdma resources. Linking resources around css also ensures that cgroups can be
-deleted after processes migrated. This allows process migration as well with
-active resources, even though that is not a primary use case.
-
-Whenever RDMA resource charging occurs, owner rdma cgroup is returned to
-the caller. Same rdma cgroup should be passed while uncharging the resource.
-This also allows process migrated with active RDMA resource to charge
-to new owner cgroup for new resource. It also allows to uncharge resource of
-a process from previously charged cgroup which is migrated to new cgroup,
-even though that is not a primary use case.
-
-Resource pool object is created in following situations.
-(a) User sets the limit and no previous resource pool exists for the device
-of interest for the cgroup.
-(b) No resource limits were configured, but IB/RDMA stack tries to
-charge the resource. So that it correctly uncharge them when applications are
-running without limits and later on when limits are enforced during uncharging,
-otherwise usage count will drop to negative.
-
-Resource pool is destroyed if all the resource limits are set to max and
-it is the last resource getting deallocated.
-
-User should set all the limits to max value if it intends to remove/unconfigure
-the resource pool for a particular device.
-
-IB stack honors limits enforced by the rdma controller. When an application
-queries the maximum resource limits of an IB device, it returns the minimum of
-what is configured by user for a given cgroup and what is supported by
-IB device.
-
-Following resources can be accounted by rdma controller.
-
-  ==========    =============================
-  hca_handle	Maximum number of HCA Handles
-  hca_object 	Maximum number of HCA Objects
-  ==========    =============================
-
-2. Usage Examples
-=================
-
-(a) Configure resource limit::
-
-	echo mlx4_0 hca_handle=2 hca_object=2000 > /sys/fs/cgroup/rdma/1/rdma.max
-	echo ocrdma1 hca_handle=3 > /sys/fs/cgroup/rdma/2/rdma.max
-
-(b) Query resource limit::
-
-	cat /sys/fs/cgroup/rdma/2/rdma.max
-	#Output:
-	mlx4_0 hca_handle=2 hca_object=2000
-	ocrdma1 hca_handle=3 hca_object=max
-
-(c) Query current usage::
-
-	cat /sys/fs/cgroup/rdma/2/rdma.current
-	#Output:
-	mlx4_0 hca_handle=1 hca_object=20
-	ocrdma1 hca_handle=1 hca_object=23
-
-(d) Delete resource limit::
-
-	echo mlx4_0 hca_handle=max hca_object=max > /sys/fs/cgroup/rdma/1/rdma.max
diff --git a/Documentation/filesystems/tmpfs.txt b/Documentation/filesystems/tmpfs.txt
index cad797a8a39e..5ecbc03e6b2f 100644
--- a/Documentation/filesystems/tmpfs.txt
+++ b/Documentation/filesystems/tmpfs.txt
@@ -98,7 +98,7 @@ A memory policy with a valid NodeList will be saved, as specified, for
 use at file creation time.  When a task allocates a file in the file
 system, the mount option memory policy will be applied with a NodeList,
 if any, modified by the calling task's cpuset constraints
-[See Documentation/cgroup-v1/cpusets.rst] and any optional flags, listed
+[See Documentation/admin-guide/cgroup-v1/cpusets.rst] and any optional flags, listed
 below.  If the resulting NodeLists is the empty set, the effective memory
 policy for the file will revert to "default" policy.
 
diff --git a/Documentation/kernel-per-CPU-kthreads.txt b/Documentation/kernel-per-CPU-kthreads.txt
index 5623b9916411..4f18456dd3b1 100644
--- a/Documentation/kernel-per-CPU-kthreads.txt
+++ b/Documentation/kernel-per-CPU-kthreads.txt
@@ -12,7 +12,7 @@ References
 
 -	Documentation/IRQ-affinity.txt:  Binding interrupts to sets of CPUs.
 
--	Documentation/cgroup-v1:  Using cgroups to bind tasks to sets of CPUs.
+-	Documentation/admin-guide/cgroup-v1:  Using cgroups to bind tasks to sets of CPUs.
 
 -	man taskset:  Using the taskset command to bind tasks to sets
 	of CPUs.
diff --git a/Documentation/scheduler/sched-deadline.rst b/Documentation/scheduler/sched-deadline.rst
index 3391e86d810c..14a2f7bf63fe 100644
--- a/Documentation/scheduler/sched-deadline.rst
+++ b/Documentation/scheduler/sched-deadline.rst
@@ -669,7 +669,7 @@ Deadline Task Scheduling
 
  -deadline tasks cannot have an affinity mask smaller that the entire
  root_domain they are created on. However, affinities can be specified
- through the cpuset facility (Documentation/cgroup-v1/cpusets.rst).
+ through the cpuset facility (Documentation/admin-guide/cgroup-v1/cpusets.rst).
 
 5.1 SCHED_DEADLINE and cpusets HOWTO
 ------------------------------------
diff --git a/Documentation/scheduler/sched-design-CFS.rst b/Documentation/scheduler/sched-design-CFS.rst
index 53b30d1967cf..a96c72651877 100644
--- a/Documentation/scheduler/sched-design-CFS.rst
+++ b/Documentation/scheduler/sched-design-CFS.rst
@@ -222,7 +222,7 @@ SCHED_BATCH) tasks.
 
    These options need CONFIG_CGROUPS to be defined, and let the administrator
    create arbitrary groups of tasks, using the "cgroup" pseudo filesystem.  See
-   Documentation/cgroup-v1/cgroups.rst for more information about this filesystem.
+   Documentation/admin-guide/cgroup-v1/cgroups.rst for more information about this filesystem.
 
 When CONFIG_FAIR_GROUP_SCHED is defined, a "cpu.shares" file is created for each
 group created using the pseudo filesystem.  See example steps below to create
diff --git a/Documentation/scheduler/sched-rt-group.rst b/Documentation/scheduler/sched-rt-group.rst
index d27d3f3712fd..655a096ec8fb 100644
--- a/Documentation/scheduler/sched-rt-group.rst
+++ b/Documentation/scheduler/sched-rt-group.rst
@@ -133,7 +133,7 @@ This uses the cgroup virtual file system and "<cgroup>/cpu.rt_runtime_us"
 to control the CPU time reserved for each control group.
 
 For more information on working with control groups, you should read
-Documentation/cgroup-v1/cgroups.rst as well.
+Documentation/admin-guide/cgroup-v1/cgroups.rst as well.
 
 Group settings are checked against the following limits in order to keep the
 configuration schedulable:
diff --git a/Documentation/vm/numa.rst b/Documentation/vm/numa.rst
index 130f3cfa1c19..99fdeca917ca 100644
--- a/Documentation/vm/numa.rst
+++ b/Documentation/vm/numa.rst
@@ -67,7 +67,7 @@ nodes.  Each emulated node will manage a fraction of the underlying cells'
 physical memory.  NUMA emulation is useful for testing NUMA kernel and
 application features on non-NUMA platforms, and as a sort of memory resource
 management mechanism when used together with cpusets.
-[see Documentation/cgroup-v1/cpusets.rst]
+[see Documentation/admin-guide/cgroup-v1/cpusets.rst]
 
 For each node with memory, Linux constructs an independent memory management
 subsystem, complete with its own free page lists, in-use page lists, usage
@@ -114,7 +114,7 @@ allocation behavior using Linux NUMA memory policy. [see
 
 System administrators can restrict the CPUs and nodes' memories that a non-
 privileged user can specify in the scheduling or NUMA commands and functions
-using control groups and CPUsets.  [see Documentation/cgroup-v1/cpusets.rst]
+using control groups and CPUsets.  [see Documentation/admin-guide/cgroup-v1/cpusets.rst]
 
 On architectures that do not hide memoryless nodes, Linux will include only
 zones [nodes] with memory in the zonelists.  This means that for a memoryless
diff --git a/Documentation/vm/page_migration.rst b/Documentation/vm/page_migration.rst
index 35bba27d5fff..1d6cd7db4e43 100644
--- a/Documentation/vm/page_migration.rst
+++ b/Documentation/vm/page_migration.rst
@@ -41,7 +41,7 @@ locations.
 Larger installations usually partition the system using cpusets into
 sections of nodes. Paul Jackson has equipped cpusets with the ability to
 move pages when a task is moved to another cpuset (See
-Documentation/cgroup-v1/cpusets.rst).
+Documentation/admin-guide/cgroup-v1/cpusets.rst).
 Cpusets allows the automation of process locality. If a task is moved to
 a new cpuset then also all its pages are moved with it so that the
 performance of the process does not sink dramatically. Also the pages
diff --git a/Documentation/vm/unevictable-lru.rst b/Documentation/vm/unevictable-lru.rst
index 109052215bce..17d0861b0f1d 100644
--- a/Documentation/vm/unevictable-lru.rst
+++ b/Documentation/vm/unevictable-lru.rst
@@ -98,7 +98,7 @@ Memory Control Group Interaction
 --------------------------------
 
 The unevictable LRU facility interacts with the memory control group [aka
-memory controller; see Documentation/cgroup-v1/memory.rst] by extending the
+memory controller; see Documentation/admin-guide/cgroup-v1/memory.rst] by extending the
 lru_list enum.
 
 The memory controller data structure automatically gets a per-zone unevictable
diff --git a/Documentation/x86/x86_64/fake-numa-for-cpusets.rst b/Documentation/x86/x86_64/fake-numa-for-cpusets.rst
index 30108684ae87..ff9bcfd2cc14 100644
--- a/Documentation/x86/x86_64/fake-numa-for-cpusets.rst
+++ b/Documentation/x86/x86_64/fake-numa-for-cpusets.rst
@@ -15,7 +15,7 @@ assign them to cpusets and their attached tasks.  This is a way of limiting the
 amount of system memory that are available to a certain class of tasks.
 
 For more information on the features of cpusets, see
-Documentation/cgroup-v1/cpusets.rst.
+Documentation/admin-guide/cgroup-v1/cpusets.rst.
 There are a number of different configurations you can use for your needs.  For
 more information on the numa=fake command line option and its various ways of
 configuring fake nodes, see Documentation/x86/x86_64/boot-options.rst.
@@ -40,7 +40,7 @@ A machine may be split as follows with "numa=fake=4*512," as reported by dmesg::
 	On node 3 totalpages: 131072
 
 Now following the instructions for mounting the cpusets filesystem from
-Documentation/cgroup-v1/cpusets.rst, you can assign fake nodes (i.e. contiguous memory
+Documentation/admin-guide/cgroup-v1/cpusets.rst, you can assign fake nodes (i.e. contiguous memory
 address spaces) to individual cpusets::
 
 	[root@xroads /]# mkdir exampleset
diff --git a/MAINTAINERS b/MAINTAINERS
index 0c603ea73034..c1593a668f80 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -4158,7 +4158,7 @@ L:	cgroups@vger.kernel.org
 T:	git git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup.git
 S:	Maintained
 F:	Documentation/admin-guide/cgroup-v2.rst
-F:	Documentation/cgroup-v1/
+F:	Documentation/admin-guide/cgroup-v1/
 F:	include/linux/cgroup*
 F:	kernel/cgroup/
 
@@ -4169,7 +4169,7 @@ W:	http://www.bullopensource.org/cpuset/
 W:	http://oss.sgi.com/projects/cpusets/
 T:	git git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup.git
 S:	Maintained
-F:	Documentation/cgroup-v1/cpusets.rst
+F:	Documentation/admin-guide/cgroup-v1/cpusets.rst
 F:	include/linux/cpuset.h
 F:	kernel/cgroup/cpuset.c
 
diff --git a/block/Kconfig b/block/Kconfig
index b16b3e075d31..8b5f8e560eb4 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -89,7 +89,7 @@ config BLK_DEV_THROTTLING
 	one needs to mount and use blkio cgroup controller for creating
 	cgroups and specifying per device IO rate policies.
 
-	See Documentation/cgroup-v1/blkio-controller.rst for more information.
+	See Documentation/admin-guide/cgroup-v1/blkio-controller.rst for more information.
 
 config BLK_DEV_THROTTLING_LOW
 	bool "Block throttling .low limit interface support (EXPERIMENTAL)"
diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index c5311935239d..430e219e3aba 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -624,7 +624,7 @@ struct cftype {
 
 /*
  * Control Group subsystem type.
- * See Documentation/cgroup-v1/cgroups.rst for details
+ * See Documentation/admin-guide/cgroup-v1/cgroups.rst for details
  */
 struct cgroup_subsys {
 	struct cgroup_subsys_state *(*css_alloc)(struct cgroup_subsys_state *parent_css);
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 6f68438aa4ed..82699845ef79 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -806,7 +806,7 @@ union bpf_attr {
  * 		based on a user-provided identifier for all traffic coming from
  * 		the tasks belonging to the related cgroup. See also the related
  * 		kernel documentation, available from the Linux sources in file
- * 		*Documentation/cgroup-v1/net_cls.rst*.
+ * 		*Documentation/admin-guide/cgroup-v1/net_cls.rst*.
  *
  * 		The Linux kernel has two versions for cgroups: there are
  * 		cgroups v1 and cgroups v2. Both are available to users, who can
diff --git a/init/Kconfig b/init/Kconfig
index 9eb92ee52d40..381cdfee6e0e 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -821,7 +821,7 @@ menuconfig CGROUPS
 	  controls or device isolation.
 	  See
 		- Documentation/scheduler/sched-design-CFS.rst	(CFS)
-		- Documentation/cgroup-v1/ (features for grouping, isolation
+		- Documentation/admin-guide/cgroup-v1/ (features for grouping, isolation
 					  and resource control)
 
 	  Say N if unsure.
@@ -883,7 +883,7 @@ config BLK_CGROUP
 	CONFIG_CFQ_GROUP_IOSCHED=y; for enabling throttling policy, set
 	CONFIG_BLK_DEV_THROTTLING=y.
 
-	See Documentation/cgroup-v1/blkio-controller.rst for more information.
+	See Documentation/admin-guide/cgroup-v1/blkio-controller.rst for more information.
 
 config CGROUP_WRITEBACK
 	bool
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index b3b02b9c4405..863e434a6020 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -729,7 +729,7 @@ static inline int nr_cpusets(void)
  * load balancing domains (sched domains) as specified by that partial
  * partition.
  *
- * See "What is sched_load_balance" in Documentation/cgroup-v1/cpusets.rst
+ * See "What is sched_load_balance" in Documentation/admin-guide/cgroup-v1/cpusets.rst
  * for a background explanation of this.
  *
  * Does not return errors, on the theory that the callers of this
diff --git a/security/device_cgroup.c b/security/device_cgroup.c
index c07196502577..725674f3276d 100644
--- a/security/device_cgroup.c
+++ b/security/device_cgroup.c
@@ -509,7 +509,7 @@ static inline int may_allow_all(struct dev_cgroup *parent)
  * This is one of the three key functions for hierarchy implementation.
  * This function is responsible for re-evaluating all the cgroup's active
  * exceptions due to a parent's exception change.
- * Refer to Documentation/cgroup-v1/devices.rst for more details.
+ * Refer to Documentation/admin-guide/cgroup-v1/devices.rst for more details.
  */
 static void revalidate_active_exceptions(struct dev_cgroup *devcg)
 {
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index f506c68b2612..17e2b1713702 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -806,7 +806,7 @@ union bpf_attr {
  * 		based on a user-provided identifier for all traffic coming from
  * 		the tasks belonging to the related cgroup. See also the related
  * 		kernel documentation, available from the Linux sources in file
- * 		*Documentation/cgroup-v1/net_cls.rst*.
+ * 		*Documentation/admin-guide/cgroup-v1/net_cls.rst*.
  *
  * 		The Linux kernel has two versions for cgroups: there are
  * 		cgroups v1 and cgroups v2. Both are available to users, who can
-- 
cgit v1.2.3


From 4f4cfa6c560c93ba180c30675cf845e1597de44c Mon Sep 17 00:00:00 2001
From: Mauro Carvalho Chehab <mchehab+samsung@kernel.org>
Date: Thu, 27 Jun 2019 14:56:51 -0300
Subject: docs: admin-guide: add a series of orphaned documents

There are lots of documents that belong to the admin-guide but
are on random places (most under Documentation root dir).

Move them to the admin guide.

Signed-off-by: Mauro Carvalho Chehab <mchehab+samsung@kernel.org>
Acked-by: Alexandre Belloni <alexandre.belloni@bootlin.com>
Acked-by: Bartlomiej Zolnierkiewicz <b.zolnierkie@samsung.com>
---
 Documentation/ABI/stable/sysfs-devices-node        |   2 +-
 Documentation/ABI/testing/procfs-diskstats         |   2 +-
 Documentation/ABI/testing/sysfs-block              |   2 +-
 Documentation/ABI/testing/sysfs-devices-system-cpu |   4 +-
 Documentation/admin-guide/btmrvl.rst               | 124 +++++++
 Documentation/admin-guide/clearing-warn-once.rst   |   9 +
 Documentation/admin-guide/cpu-load.rst             | 114 +++++++
 Documentation/admin-guide/cputopology.rst          | 177 ++++++++++
 .../admin-guide/device-mapper/statistics.rst       |   4 +-
 Documentation/admin-guide/efi-stub.rst             | 100 ++++++
 Documentation/admin-guide/highuid.rst              |  80 +++++
 Documentation/admin-guide/hw-vuln/l1tf.rst         |   2 +-
 Documentation/admin-guide/hw_random.rst            | 105 ++++++
 Documentation/admin-guide/index.rst                |  17 +
 Documentation/admin-guide/iostats.rst              | 197 ++++++++++++
 Documentation/admin-guide/kernel-parameters.txt    |   2 +-
 .../admin-guide/kernel-per-CPU-kthreads.rst        | 356 +++++++++++++++++++++
 Documentation/admin-guide/lcd-panel-cgram.rst      |  27 ++
 Documentation/admin-guide/ldm.rst                  | 121 +++++++
 Documentation/admin-guide/lockup-watchdogs.rst     |  83 +++++
 Documentation/admin-guide/mm/cma_debugfs.rst       |  25 ++
 Documentation/admin-guide/mm/index.rst             |   1 +
 Documentation/admin-guide/numastat.rst             |  30 ++
 Documentation/admin-guide/pnp.rst                  | 292 +++++++++++++++++
 Documentation/admin-guide/rtc.rst                  | 140 ++++++++
 Documentation/admin-guide/svga.rst                 | 249 ++++++++++++++
 Documentation/admin-guide/sysctl/kernel.rst        |   2 +-
 Documentation/admin-guide/video-output.rst         |  34 ++
 Documentation/auxdisplay/lcd-panel-cgram.rst       |  29 --
 Documentation/btmrvl.txt                           | 124 -------
 Documentation/clearing-warn-once.txt               |   9 -
 Documentation/cma/debugfs.rst                      |  27 --
 Documentation/cpu-load.txt                         | 114 -------
 Documentation/cputopology.txt                      | 177 ----------
 Documentation/efi-stub.txt                         | 100 ------
 Documentation/fb/vesafb.rst                        |   2 +-
 Documentation/highuid.txt                          |  80 -----
 Documentation/hw_random.txt                        | 105 ------
 Documentation/iostats.txt                          | 197 ------------
 Documentation/kernel-per-CPU-kthreads.txt          | 356 ---------------------
 Documentation/ldm.txt                              | 121 -------
 Documentation/lockup-watchdogs.txt                 |  83 -----
 Documentation/numastat.txt                         |  30 --
 Documentation/pnp.txt                              | 292 -----------------
 Documentation/rtc.txt                              | 140 --------
 Documentation/svga.txt                             | 249 --------------
 Documentation/video-output.txt                     |  34 --
 Documentation/x86/topology.rst                     |   2 +-
 MAINTAINERS                                        |  12 +-
 arch/arm/Kconfig                                   |   2 +-
 arch/parisc/Kconfig                                |   2 +-
 arch/sh/Kconfig                                    |   2 +-
 arch/sparc/Kconfig                                 |   2 +-
 arch/x86/Kconfig                                   |   4 +-
 block/partitions/Kconfig                           |   2 +-
 drivers/char/Kconfig                               |   4 +-
 drivers/char/hw_random/core.c                      |   2 +-
 include/linux/hw_random.h                          |   2 +-
 58 files changed, 2310 insertions(+), 2296 deletions(-)
 create mode 100644 Documentation/admin-guide/btmrvl.rst
 create mode 100644 Documentation/admin-guide/clearing-warn-once.rst
 create mode 100644 Documentation/admin-guide/cpu-load.rst
 create mode 100644 Documentation/admin-guide/cputopology.rst
 create mode 100644 Documentation/admin-guide/efi-stub.rst
 create mode 100644 Documentation/admin-guide/highuid.rst
 create mode 100644 Documentation/admin-guide/hw_random.rst
 create mode 100644 Documentation/admin-guide/iostats.rst
 create mode 100644 Documentation/admin-guide/kernel-per-CPU-kthreads.rst
 create mode 100644 Documentation/admin-guide/lcd-panel-cgram.rst
 create mode 100644 Documentation/admin-guide/ldm.rst
 create mode 100644 Documentation/admin-guide/lockup-watchdogs.rst
 create mode 100644 Documentation/admin-guide/mm/cma_debugfs.rst
 create mode 100644 Documentation/admin-guide/numastat.rst
 create mode 100644 Documentation/admin-guide/pnp.rst
 create mode 100644 Documentation/admin-guide/rtc.rst
 create mode 100644 Documentation/admin-guide/svga.rst
 create mode 100644 Documentation/admin-guide/video-output.rst
 delete mode 100644 Documentation/auxdisplay/lcd-panel-cgram.rst
 delete mode 100644 Documentation/btmrvl.txt
 delete mode 100644 Documentation/clearing-warn-once.txt
 delete mode 100644 Documentation/cma/debugfs.rst
 delete mode 100644 Documentation/cpu-load.txt
 delete mode 100644 Documentation/cputopology.txt
 delete mode 100644 Documentation/efi-stub.txt
 delete mode 100644 Documentation/highuid.txt
 delete mode 100644 Documentation/hw_random.txt
 delete mode 100644 Documentation/iostats.txt
 delete mode 100644 Documentation/kernel-per-CPU-kthreads.txt
 delete mode 100644 Documentation/ldm.txt
 delete mode 100644 Documentation/lockup-watchdogs.txt
 delete mode 100644 Documentation/numastat.txt
 delete mode 100644 Documentation/pnp.txt
 delete mode 100644 Documentation/rtc.txt
 delete mode 100644 Documentation/svga.txt
 delete mode 100644 Documentation/video-output.txt

(limited to 'include/linux')

diff --git a/Documentation/ABI/stable/sysfs-devices-node b/Documentation/ABI/stable/sysfs-devices-node
index f7ce68fbd4b9..df8413cf1468 100644
--- a/Documentation/ABI/stable/sysfs-devices-node
+++ b/Documentation/ABI/stable/sysfs-devices-node
@@ -61,7 +61,7 @@ Date:		October 2002
 Contact:	Linux Memory Management list <linux-mm@kvack.org>
 Description:
 		The node's hit/miss statistics, in units of pages.
-		See Documentation/numastat.txt
+		See Documentation/admin-guide/numastat.rst
 
 What:		/sys/devices/system/node/nodeX/distance
 Date:		October 2002
diff --git a/Documentation/ABI/testing/procfs-diskstats b/Documentation/ABI/testing/procfs-diskstats
index abac31d216de..2c44b4f1b060 100644
--- a/Documentation/ABI/testing/procfs-diskstats
+++ b/Documentation/ABI/testing/procfs-diskstats
@@ -29,4 +29,4 @@ Description:
 		17 - sectors discarded
 		18 - time spent discarding
 
-		For more details refer to Documentation/iostats.txt
+		For more details refer to Documentation/admin-guide/iostats.rst
diff --git a/Documentation/ABI/testing/sysfs-block b/Documentation/ABI/testing/sysfs-block
index dfad7427817c..f8c7c7126bb1 100644
--- a/Documentation/ABI/testing/sysfs-block
+++ b/Documentation/ABI/testing/sysfs-block
@@ -15,7 +15,7 @@ Description:
 		 9 - I/Os currently in progress
 		10 - time spent doing I/Os (ms)
 		11 - weighted time spent doing I/Os (ms)
-		For more details refer Documentation/iostats.txt
+		For more details refer to Documentation/admin-guide/iostats.rst
 
 
 What:		/sys/block/<disk>/<part>/stat
diff --git a/Documentation/ABI/testing/sysfs-devices-system-cpu b/Documentation/ABI/testing/sysfs-devices-system-cpu
index d404603c6b52..5f7d7b14fa44 100644
--- a/Documentation/ABI/testing/sysfs-devices-system-cpu
+++ b/Documentation/ABI/testing/sysfs-devices-system-cpu
@@ -34,7 +34,7 @@ Description:	CPU topology files that describe kernel limits related to
 		present: cpus that have been identified as being present in
 		the system.
 
-		See Documentation/cputopology.txt for more information.
+		See Documentation/admin-guide/cputopology.rst for more information.
 
 
 What:		/sys/devices/system/cpu/probe
@@ -103,7 +103,7 @@ Description:	CPU topology files that describe a logical CPU's relationship
 		thread_siblings_list: human-readable list of cpu#'s hardware
 		threads within the same core as cpu#
 
-		See Documentation/cputopology.txt for more information.
+		See Documentation/admin-guide/cputopology.rst for more information.
 
 
 What:		/sys/devices/system/cpu/cpuidle/current_driver
diff --git a/Documentation/admin-guide/btmrvl.rst b/Documentation/admin-guide/btmrvl.rst
new file mode 100644
index 000000000000..ec57740ead0c
--- /dev/null
+++ b/Documentation/admin-guide/btmrvl.rst
@@ -0,0 +1,124 @@
+=============
+btmrvl driver
+=============
+
+All commands are used via debugfs interface.
+
+Set/get driver configurations
+=============================
+
+Path:	/debug/btmrvl/config/
+
+gpiogap=[n], hscfgcmd
+	These commands are used to configure the host sleep parameters::
+	bit 7:0  -- Gap
+	bit 15:8 -- GPIO
+
+	where GPIO is the pin number of GPIO used to wake up the host.
+	It could be any valid GPIO pin# (e.g. 0-7) or 0xff (SDIO interface
+	wakeup will be used instead).
+
+	where Gap is the gap in milliseconds between wakeup signal and
+	wakeup event, or 0xff for special host sleep setting.
+
+	Usage::
+
+		# Use SDIO interface to wake up the host and set GAP to 0x80:
+		echo 0xff80 > /debug/btmrvl/config/gpiogap
+		echo 1 > /debug/btmrvl/config/hscfgcmd
+
+		# Use GPIO pin #3 to wake up the host and set GAP to 0xff:
+		echo 0x03ff >  /debug/btmrvl/config/gpiogap
+		echo 1 > /debug/btmrvl/config/hscfgcmd
+
+psmode=[n], pscmd
+	These commands are used to enable/disable auto sleep mode
+
+	where the option is::
+
+			1 	-- Enable auto sleep mode
+			0 	-- Disable auto sleep mode
+
+	Usage::
+
+		# Enable auto sleep mode
+		echo 1 > /debug/btmrvl/config/psmode
+		echo 1 > /debug/btmrvl/config/pscmd
+
+		# Disable auto sleep mode
+		echo 0 > /debug/btmrvl/config/psmode
+		echo 1 > /debug/btmrvl/config/pscmd
+
+
+hsmode=[n], hscmd
+	These commands are used to enable host sleep or wake up firmware
+
+	where the option is::
+
+			1	-- Enable host sleep
+			0	-- Wake up firmware
+
+	Usage::
+
+		# Enable host sleep
+		echo 1 > /debug/btmrvl/config/hsmode
+		echo 1 > /debug/btmrvl/config/hscmd
+
+		# Wake up firmware
+		echo 0 > /debug/btmrvl/config/hsmode
+		echo 1 > /debug/btmrvl/config/hscmd
+
+
+Get driver status
+=================
+
+Path:	/debug/btmrvl/status/
+
+Usage::
+
+	cat /debug/btmrvl/status/<args>
+
+where the args are:
+
+curpsmode
+	This command displays current auto sleep status.
+
+psstate
+	This command displays the power save state.
+
+hsstate
+	This command displays the host sleep state.
+
+txdnldrdy
+	This command displays the value of Tx download ready flag.
+
+Issuing a raw hci command
+=========================
+
+Use hcitool to issue raw hci command, refer to hcitool manual
+
+Usage::
+
+	hcitool cmd <ogf> <ocf> [Parameters]
+
+Interface Control Command::
+
+	hcitool cmd 0x3f 0x5b 0xf5 0x01 0x00    --Enable All interface
+	hcitool cmd 0x3f 0x5b 0xf5 0x01 0x01    --Enable Wlan interface
+	hcitool cmd 0x3f 0x5b 0xf5 0x01 0x02    --Enable BT interface
+	hcitool cmd 0x3f 0x5b 0xf5 0x00 0x00    --Disable All interface
+	hcitool cmd 0x3f 0x5b 0xf5 0x00 0x01    --Disable Wlan interface
+	hcitool cmd 0x3f 0x5b 0xf5 0x00 0x02    --Disable BT interface
+
+SD8688 firmware
+===============
+
+Images:
+
+- /lib/firmware/sd8688_helper.bin
+- /lib/firmware/sd8688.bin
+
+
+The images can be downloaded from:
+
+git.infradead.org/users/dwmw2/linux-firmware.git/libertas/
diff --git a/Documentation/admin-guide/clearing-warn-once.rst b/Documentation/admin-guide/clearing-warn-once.rst
new file mode 100644
index 000000000000..211fd926cf00
--- /dev/null
+++ b/Documentation/admin-guide/clearing-warn-once.rst
@@ -0,0 +1,9 @@
+Clearing WARN_ONCE
+------------------
+
+WARN_ONCE / WARN_ON_ONCE / printk_once only emit a message once.
+
+echo 1 > /sys/kernel/debug/clear_warn_once
+
+clears the state and allows the warnings to print once again.
+This can be useful after test suite runs to reproduce problems.
diff --git a/Documentation/admin-guide/cpu-load.rst b/Documentation/admin-guide/cpu-load.rst
new file mode 100644
index 000000000000..2d01ce43d2a2
--- /dev/null
+++ b/Documentation/admin-guide/cpu-load.rst
@@ -0,0 +1,114 @@
+========
+CPU load
+========
+
+Linux exports various bits of information via ``/proc/stat`` and
+``/proc/uptime`` that userland tools, such as top(1), use to calculate
+the average time system spent in a particular state, for example::
+
+    $ iostat
+    Linux 2.6.18.3-exp (linmac)     02/20/2007
+
+    avg-cpu:  %user   %nice %system %iowait  %steal   %idle
+              10.01    0.00    2.92    5.44    0.00   81.63
+
+    ...
+
+Here the system thinks that over the default sampling period the
+system spent 10.01% of the time doing work in user space, 2.92% in the
+kernel, and was overall 81.63% of the time idle.
+
+In most cases the ``/proc/stat`` information reflects the reality quite
+closely, however due to the nature of how/when the kernel collects
+this data sometimes it can not be trusted at all.
+
+So how is this information collected?  Whenever a timer interrupt is
+signalled, the kernel looks at what kind of task was running at this
+moment and increments the counter that corresponds to this task's
+kind/state.  The problem with this is that the system could have
+switched between various states multiple times between two timer
+interrupts yet the counter is incremented only for the last state.
+
+
+Example
+-------
+
+If we imagine the system with one task that periodically burns cycles
+in the following manner::
+
+     time line between two timer interrupts
+    |--------------------------------------|
+     ^                                    ^
+     |_ something begins working          |
+                                          |_ something goes to sleep
+                                         (only to be awaken quite soon)
+
+In the above situation the system will be 0% loaded according to the
+``/proc/stat`` (since the timer interrupt will always happen when the
+system is executing the idle handler), but in reality the load is
+closer to 99%.
+
+One can imagine many more situations where this behavior of the kernel
+will lead to quite erratic information inside ``/proc/stat``::
+
+
+	/* gcc -o hog smallhog.c */
+	#include <time.h>
+	#include <limits.h>
+	#include <signal.h>
+	#include <sys/time.h>
+	#define HIST 10
+
+	static volatile sig_atomic_t stop;
+
+	static void sighandler (int signr)
+	{
+	(void) signr;
+	stop = 1;
+	}
+	static unsigned long hog (unsigned long niters)
+	{
+	stop = 0;
+	while (!stop && --niters);
+	return niters;
+	}
+	int main (void)
+	{
+	int i;
+	struct itimerval it = { .it_interval = { .tv_sec = 0, .tv_usec = 1 },
+				.it_value = { .tv_sec = 0, .tv_usec = 1 } };
+	sigset_t set;
+	unsigned long v[HIST];
+	double tmp = 0.0;
+	unsigned long n;
+	signal (SIGALRM, &sighandler);
+	setitimer (ITIMER_REAL, &it, NULL);
+
+	hog (ULONG_MAX);
+	for (i = 0; i < HIST; ++i) v[i] = ULONG_MAX - hog (ULONG_MAX);
+	for (i = 0; i < HIST; ++i) tmp += v[i];
+	tmp /= HIST;
+	n = tmp - (tmp / 3.0);
+
+	sigemptyset (&set);
+	sigaddset (&set, SIGALRM);
+
+	for (;;) {
+		hog (n);
+		sigwait (&set, &i);
+	}
+	return 0;
+	}
+
+
+References
+----------
+
+- http://lkml.org/lkml/2007/2/12/6
+- Documentation/filesystems/proc.txt (1.8)
+
+
+Thanks
+------
+
+Con Kolivas, Pavel Machek
diff --git a/Documentation/admin-guide/cputopology.rst b/Documentation/admin-guide/cputopology.rst
new file mode 100644
index 000000000000..b90dafcc8237
--- /dev/null
+++ b/Documentation/admin-guide/cputopology.rst
@@ -0,0 +1,177 @@
+===========================================
+How CPU topology info is exported via sysfs
+===========================================
+
+Export CPU topology info via sysfs. Items (attributes) are similar
+to /proc/cpuinfo output of some architectures.  They reside in
+/sys/devices/system/cpu/cpuX/topology/:
+
+physical_package_id:
+
+	physical package id of cpuX. Typically corresponds to a physical
+	socket number, but the actual value is architecture and platform
+	dependent.
+
+die_id:
+
+	the CPU die ID of cpuX. Typically it is the hardware platform's
+	identifier (rather than the kernel's).  The actual value is
+	architecture and platform dependent.
+
+core_id:
+
+	the CPU core ID of cpuX. Typically it is the hardware platform's
+	identifier (rather than the kernel's).  The actual value is
+	architecture and platform dependent.
+
+book_id:
+
+	the book ID of cpuX. Typically it is the hardware platform's
+	identifier (rather than the kernel's).	The actual value is
+	architecture and platform dependent.
+
+drawer_id:
+
+	the drawer ID of cpuX. Typically it is the hardware platform's
+	identifier (rather than the kernel's).	The actual value is
+	architecture and platform dependent.
+
+core_cpus:
+
+	internal kernel map of CPUs within the same core.
+	(deprecated name: "thread_siblings")
+
+core_cpus_list:
+
+	human-readable list of CPUs within the same core.
+	(deprecated name: "thread_siblings_list")
+
+package_cpus:
+
+	internal kernel map of the CPUs sharing the same physical_package_id.
+	(deprecated name: "core_siblings")
+
+package_cpus_list:
+
+	human-readable list of CPUs sharing the same physical_package_id.
+	(deprecated name: "core_siblings_list")
+
+die_cpus:
+
+	internal kernel map of CPUs within the same die.
+
+die_cpus_list:
+
+	human-readable list of CPUs within the same die.
+
+book_siblings:
+
+	internal kernel map of cpuX's hardware threads within the same
+	book_id.
+
+book_siblings_list:
+
+	human-readable list of cpuX's hardware threads within the same
+	book_id.
+
+drawer_siblings:
+
+	internal kernel map of cpuX's hardware threads within the same
+	drawer_id.
+
+drawer_siblings_list:
+
+	human-readable list of cpuX's hardware threads within the same
+	drawer_id.
+
+Architecture-neutral, drivers/base/topology.c, exports these attributes.
+However, the book and drawer related sysfs files will only be created if
+CONFIG_SCHED_BOOK and CONFIG_SCHED_DRAWER are selected, respectively.
+
+CONFIG_SCHED_BOOK and CONFIG_SCHED_DRAWER are currently only used on s390,
+where they reflect the cpu and cache hierarchy.
+
+For an architecture to support this feature, it must define some of
+these macros in include/asm-XXX/topology.h::
+
+	#define topology_physical_package_id(cpu)
+	#define topology_die_id(cpu)
+	#define topology_core_id(cpu)
+	#define topology_book_id(cpu)
+	#define topology_drawer_id(cpu)
+	#define topology_sibling_cpumask(cpu)
+	#define topology_core_cpumask(cpu)
+	#define topology_die_cpumask(cpu)
+	#define topology_book_cpumask(cpu)
+	#define topology_drawer_cpumask(cpu)
+
+The type of ``**_id macros`` is int.
+The type of ``**_cpumask macros`` is ``(const) struct cpumask *``. The latter
+correspond with appropriate ``**_siblings`` sysfs attributes (except for
+topology_sibling_cpumask() which corresponds with thread_siblings).
+
+To be consistent on all architectures, include/linux/topology.h
+provides default definitions for any of the above macros that are
+not defined by include/asm-XXX/topology.h:
+
+1) topology_physical_package_id: -1
+2) topology_die_id: -1
+3) topology_core_id: 0
+4) topology_sibling_cpumask: just the given CPU
+5) topology_core_cpumask: just the given CPU
+6) topology_die_cpumask: just the given CPU
+
+For architectures that don't support books (CONFIG_SCHED_BOOK) there are no
+default definitions for topology_book_id() and topology_book_cpumask().
+For architectures that don't support drawers (CONFIG_SCHED_DRAWER) there are
+no default definitions for topology_drawer_id() and topology_drawer_cpumask().
+
+Additionally, CPU topology information is provided under
+/sys/devices/system/cpu and includes these files.  The internal
+source for the output is in brackets ("[]").
+
+    =========== ==========================================================
+    kernel_max: the maximum CPU index allowed by the kernel configuration.
+		[NR_CPUS-1]
+
+    offline:	CPUs that are not online because they have been
+		HOTPLUGGED off (see cpu-hotplug.txt) or exceed the limit
+		of CPUs allowed by the kernel configuration (kernel_max
+		above). [~cpu_online_mask + cpus >= NR_CPUS]
+
+    online:	CPUs that are online and being scheduled [cpu_online_mask]
+
+    possible:	CPUs that have been allocated resources and can be
+		brought online if they are present. [cpu_possible_mask]
+
+    present:	CPUs that have been identified as being present in the
+		system. [cpu_present_mask]
+    =========== ==========================================================
+
+The format for the above output is compatible with cpulist_parse()
+[see <linux/cpumask.h>].  Some examples follow.
+
+In this example, there are 64 CPUs in the system but cpus 32-63 exceed
+the kernel max which is limited to 0..31 by the NR_CPUS config option
+being 32.  Note also that CPUs 2 and 4-31 are not online but could be
+brought online as they are both present and possible::
+
+     kernel_max: 31
+        offline: 2,4-31,32-63
+         online: 0-1,3
+       possible: 0-31
+        present: 0-31
+
+In this example, the NR_CPUS config option is 128, but the kernel was
+started with possible_cpus=144.  There are 4 CPUs in the system and cpu2
+was manually taken offline (and is the only CPU that can be brought
+online.)::
+
+     kernel_max: 127
+        offline: 2,4-127,128-143
+         online: 0-1,3
+       possible: 0-127
+        present: 0-3
+
+See cpu-hotplug.txt for the possible_cpus=NUM kernel start parameter
+as well as more information on the various cpumasks.
diff --git a/Documentation/admin-guide/device-mapper/statistics.rst b/Documentation/admin-guide/device-mapper/statistics.rst
index 3d80a9f850cc..41ded0bc5933 100644
--- a/Documentation/admin-guide/device-mapper/statistics.rst
+++ b/Documentation/admin-guide/device-mapper/statistics.rst
@@ -13,7 +13,7 @@ the range specified.
 
 The I/O statistics counters for each step-sized area of a region are
 in the same format as `/sys/block/*/stat` or `/proc/diskstats` (see:
-Documentation/iostats.txt).  But two extra counters (12 and 13) are
+Documentation/admin-guide/iostats.rst).  But two extra counters (12 and 13) are
 provided: total time spent reading and writing.  When the histogram
 argument is used, the 14th parameter is reported that represents the
 histogram of latencies.  All these counters may be accessed by sending
@@ -151,7 +151,7 @@ Messages
 	  The first 11 counters have the same meaning as
 	  `/sys/block/*/stat or /proc/diskstats`.
 
-	  Please refer to Documentation/iostats.txt for details.
+	  Please refer to Documentation/admin-guide/iostats.rst for details.
 
 	  1. the number of reads completed
 	  2. the number of reads merged
diff --git a/Documentation/admin-guide/efi-stub.rst b/Documentation/admin-guide/efi-stub.rst
new file mode 100644
index 000000000000..833edb0d0bc4
--- /dev/null
+++ b/Documentation/admin-guide/efi-stub.rst
@@ -0,0 +1,100 @@
+=================
+The EFI Boot Stub
+=================
+
+On the x86 and ARM platforms, a kernel zImage/bzImage can masquerade
+as a PE/COFF image, thereby convincing EFI firmware loaders to load
+it as an EFI executable. The code that modifies the bzImage header,
+along with the EFI-specific entry point that the firmware loader
+jumps to are collectively known as the "EFI boot stub", and live in
+arch/x86/boot/header.S and arch/x86/boot/compressed/eboot.c,
+respectively. For ARM the EFI stub is implemented in
+arch/arm/boot/compressed/efi-header.S and
+arch/arm/boot/compressed/efi-stub.c. EFI stub code that is shared
+between architectures is in drivers/firmware/efi/libstub.
+
+For arm64, there is no compressed kernel support, so the Image itself
+masquerades as a PE/COFF image and the EFI stub is linked into the
+kernel. The arm64 EFI stub lives in arch/arm64/kernel/efi-entry.S
+and drivers/firmware/efi/libstub/arm64-stub.c.
+
+By using the EFI boot stub it's possible to boot a Linux kernel
+without the use of a conventional EFI boot loader, such as grub or
+elilo. Since the EFI boot stub performs the jobs of a boot loader, in
+a certain sense it *IS* the boot loader.
+
+The EFI boot stub is enabled with the CONFIG_EFI_STUB kernel option.
+
+
+How to install bzImage.efi
+--------------------------
+
+The bzImage located in arch/x86/boot/bzImage must be copied to the EFI
+System Partition (ESP) and renamed with the extension ".efi". Without
+the extension the EFI firmware loader will refuse to execute it. It's
+not possible to execute bzImage.efi from the usual Linux file systems
+because EFI firmware doesn't have support for them. For ARM the
+arch/arm/boot/zImage should be copied to the system partition, and it
+may not need to be renamed. Similarly for arm64, arch/arm64/boot/Image
+should be copied but not necessarily renamed.
+
+
+Passing kernel parameters from the EFI shell
+--------------------------------------------
+
+Arguments to the kernel can be passed after bzImage.efi, e.g.::
+
+	fs0:> bzImage.efi console=ttyS0 root=/dev/sda4
+
+
+The "initrd=" option
+--------------------
+
+Like most boot loaders, the EFI stub allows the user to specify
+multiple initrd files using the "initrd=" option. This is the only EFI
+stub-specific command line parameter, everything else is passed to the
+kernel when it boots.
+
+The path to the initrd file must be an absolute path from the
+beginning of the ESP, relative path names do not work. Also, the path
+is an EFI-style path and directory elements must be separated with
+backslashes (\). For example, given the following directory layout::
+
+  fs0:>
+	Kernels\
+			bzImage.efi
+			initrd-large.img
+
+	Ramdisks\
+			initrd-small.img
+			initrd-medium.img
+
+to boot with the initrd-large.img file if the current working
+directory is fs0:\Kernels, the following command must be used::
+
+	fs0:\Kernels> bzImage.efi initrd=\Kernels\initrd-large.img
+
+Notice how bzImage.efi can be specified with a relative path. That's
+because the image we're executing is interpreted by the EFI shell,
+which understands relative paths, whereas the rest of the command line
+is passed to bzImage.efi.
+
+
+The "dtb=" option
+-----------------
+
+For the ARM and arm64 architectures, a device tree must be provided to
+the kernel. Normally firmware shall supply the device tree via the
+EFI CONFIGURATION TABLE. However, the "dtb=" command line option can
+be used to override the firmware supplied device tree, or to supply
+one when firmware is unable to.
+
+Please note: Firmware adds runtime configuration information to the
+device tree before booting the kernel. If dtb= is used to override
+the device tree, then any runtime data provided by firmware will be
+lost. The dtb= option should only be used either as a debug tool, or
+as a last resort when a device tree is not provided in the EFI
+CONFIGURATION TABLE.
+
+"dtb=" is processed in the same manner as the "initrd=" option that is
+described above.
diff --git a/Documentation/admin-guide/highuid.rst b/Documentation/admin-guide/highuid.rst
new file mode 100644
index 000000000000..6ee70465c0ea
--- /dev/null
+++ b/Documentation/admin-guide/highuid.rst
@@ -0,0 +1,80 @@
+===================================================
+Notes on the change from 16-bit UIDs to 32-bit UIDs
+===================================================
+
+:Author: Chris Wing <wingc@umich.edu>
+:Last updated: January 11, 2000
+
+- kernel code MUST take into account __kernel_uid_t and __kernel_uid32_t
+  when communicating between user and kernel space in an ioctl or data
+  structure.
+
+- kernel code should use uid_t and gid_t in kernel-private structures and
+  code.
+
+What's left to be done for 32-bit UIDs on all Linux architectures:
+
+- Disk quotas have an interesting limitation that is not related to the
+  maximum UID/GID. They are limited by the maximum file size on the
+  underlying filesystem, because quota records are written at offsets
+  corresponding to the UID in question.
+  Further investigation is needed to see if the quota system can cope
+  properly with huge UIDs. If it can deal with 64-bit file offsets on all 
+  architectures, this should not be a problem.
+
+- Decide whether or not to keep backwards compatibility with the system
+  accounting file, or if we should break it as the comments suggest
+  (currently, the old 16-bit UID and GID are still written to disk, and
+  part of the former pad space is used to store separate 32-bit UID and
+  GID)
+
+- Need to validate that OS emulation calls the 16-bit UID
+  compatibility syscalls, if the OS being emulated used 16-bit UIDs, or
+  uses the 32-bit UID system calls properly otherwise.
+
+  This affects at least:
+
+	- iBCS on Intel
+
+	- sparc32 emulation on sparc64
+	  (need to support whatever new 32-bit UID system calls are added to
+	  sparc32)
+
+- Validate that all filesystems behave properly.
+
+  At present, 32-bit UIDs _should_ work for:
+
+	- ext2
+	- ufs
+	- isofs
+	- nfs
+	- coda
+	- udf
+
+  Ioctl() fixups have been made for:
+
+	- ncpfs
+	- smbfs
+
+  Filesystems with simple fixups to prevent 16-bit UID wraparound:
+
+	- minix
+	- sysv
+	- qnx4
+
+  Other filesystems have not been checked yet.
+
+- The ncpfs and smbfs filesystems cannot presently use 32-bit UIDs in
+  all ioctl()s. Some new ioctl()s have been added with 32-bit UIDs, but
+  more are needed. (as well as new user<->kernel data structures)
+
+- The ELF core dump format only supports 16-bit UIDs on arm, i386, m68k,
+  sh, and sparc32. Fixing this is probably not that important, but would
+  require adding a new ELF section.
+
+- The ioctl()s used to control the in-kernel NFS server only support
+  16-bit UIDs on arm, i386, m68k, sh, and sparc32.
+
+- make sure that the UID mapping feature of AX25 networking works properly
+  (it should be safe because it's always used a 32-bit integer to
+  communicate between user and kernel)
diff --git a/Documentation/admin-guide/hw-vuln/l1tf.rst b/Documentation/admin-guide/hw-vuln/l1tf.rst
index 656aee262e23..f83212fae4d5 100644
--- a/Documentation/admin-guide/hw-vuln/l1tf.rst
+++ b/Documentation/admin-guide/hw-vuln/l1tf.rst
@@ -241,7 +241,7 @@ Guest mitigation mechanisms
    For further information about confining guests to a single or to a group
    of cores consult the cpusets documentation:
 
-   https://www.kernel.org/doc/Documentation/cgroup-v1/cpusets.rst
+   https://www.kernel.org/doc/Documentation/admin-guide/cgroup-v1/cpusets.rst
 
 .. _interrupt_isolation:
 
diff --git a/Documentation/admin-guide/hw_random.rst b/Documentation/admin-guide/hw_random.rst
new file mode 100644
index 000000000000..121de96e395e
--- /dev/null
+++ b/Documentation/admin-guide/hw_random.rst
@@ -0,0 +1,105 @@
+==========================================================
+Linux support for random number generator in i8xx chipsets
+==========================================================
+
+Introduction
+============
+
+The hw_random framework is software that makes use of a
+special hardware feature on your CPU or motherboard,
+a Random Number Generator (RNG).  The software has two parts:
+a core providing the /dev/hwrng character device and its
+sysfs support, plus a hardware-specific driver that plugs
+into that core.
+
+To make the most effective use of these mechanisms, you
+should download the support software as well.  Download the
+latest version of the "rng-tools" package from the
+hw_random driver's official Web site:
+
+	http://sourceforge.net/projects/gkernel/
+
+Those tools use /dev/hwrng to fill the kernel entropy pool,
+which is used internally and exported by the /dev/urandom and
+/dev/random special files.
+
+Theory of operation
+===================
+
+CHARACTER DEVICE.  Using the standard open()
+and read() system calls, you can read random data from
+the hardware RNG device.  This data is NOT CHECKED by any
+fitness tests, and could potentially be bogus (if the
+hardware is faulty or has been tampered with).  Data is only
+output if the hardware "has-data" flag is set, but nevertheless
+a security-conscious person would run fitness tests on the
+data before assuming it is truly random.
+
+The rng-tools package uses such tests in "rngd", and lets you
+run them by hand with a "rngtest" utility.
+
+/dev/hwrng is char device major 10, minor 183.
+
+CLASS DEVICE.  There is a /sys/class/misc/hw_random node with
+two unique attributes, "rng_available" and "rng_current".  The
+"rng_available" attribute lists the hardware-specific drivers
+available, while "rng_current" lists the one which is currently
+connected to /dev/hwrng.  If your system has more than one
+RNG available, you may change the one used by writing a name from
+the list in "rng_available" into "rng_current".
+
+==========================================================================
+
+
+Hardware driver for Intel/AMD/VIA Random Number Generators (RNG)
+	- Copyright 2000,2001 Jeff Garzik <jgarzik@pobox.com>
+	- Copyright 2000,2001 Philipp Rumpf <prumpf@mandrakesoft.com>
+
+
+About the Intel RNG hardware, from the firmware hub datasheet
+=============================================================
+
+The Firmware Hub integrates a Random Number Generator (RNG)
+using thermal noise generated from inherently random quantum
+mechanical properties of silicon. When not generating new random
+bits the RNG circuitry will enter a low power state. Intel will
+provide a binary software driver to give third party software
+access to our RNG for use as a security feature. At this time,
+the RNG is only to be used with a system in an OS-present state.
+
+Intel RNG Driver notes
+======================
+
+FIXME: support poll(2)
+
+.. note::
+
+	request_mem_region was removed, for three reasons:
+
+	1) Only one RNG is supported by this driver;
+	2) The location used by the RNG is a fixed location in
+	   MMIO-addressable memory;
+	3) users with properly working BIOS e820 handling will always
+	   have the region in which the RNG is located reserved, so
+	   request_mem_region calls always fail for proper setups.
+	   However, for people who use mem=XX, BIOS e820 information is
+	   **not** in /proc/iomem, and request_mem_region(RNG_ADDR) can
+	   succeed.
+
+Driver details
+==============
+
+Based on:
+	Intel 82802AB/82802AC Firmware Hub (FWH) Datasheet
+	May 1999 Order Number: 290658-002 R
+
+Intel 82802 Firmware Hub:
+	Random Number Generator
+	Programmer's Reference Manual
+	December 1999 Order Number: 298029-001 R
+
+Intel 82802 Firmware HUB Random Number Generator Driver
+	Copyright (c) 2000 Matt Sottek <msottek@quiknet.com>
+
+Special thanks to Matt Sottek.  I did the "guts", he
+did the "brains" and all the testing.
diff --git a/Documentation/admin-guide/index.rst b/Documentation/admin-guide/index.rst
index a5fdb1a846ce..4e98f5596da0 100644
--- a/Documentation/admin-guide/index.rst
+++ b/Documentation/admin-guide/index.rst
@@ -85,8 +85,25 @@ configure specific aspects of kernel behavior to your liking.
    perf-security
    acpi/index
    aoe/index
+   btmrvl
+   clearing-warn-once
+   cpu-load
+   cputopology
    device-mapper/index
+   efi-stub
+   highuid
+   hw_random
+   iostats
+   kernel-per-CPU-kthreads
    laptops/index
+   lcd-panel-cgram
+   ldm
+   lockup-watchdogs
+   numastat
+   pnp
+   rtc
+   svga
+   video-output
 
 .. only::  subproject and html
 
diff --git a/Documentation/admin-guide/iostats.rst b/Documentation/admin-guide/iostats.rst
new file mode 100644
index 000000000000..5d63b18bd6d1
--- /dev/null
+++ b/Documentation/admin-guide/iostats.rst
@@ -0,0 +1,197 @@
+=====================
+I/O statistics fields
+=====================
+
+Since 2.4.20 (and some versions before, with patches), and 2.5.45,
+more extensive disk statistics have been introduced to help measure disk
+activity. Tools such as ``sar`` and ``iostat`` typically interpret these and do
+the work for you, but in case you are interested in creating your own
+tools, the fields are explained here.
+
+In 2.4 now, the information is found as additional fields in
+``/proc/partitions``.  In 2.6 and later, the same information is found in two
+places: one is in the file ``/proc/diskstats``, and the other is within
+the sysfs file system, which must be mounted in order to obtain
+the information. Throughout this document we'll assume that sysfs
+is mounted on ``/sys``, although of course it may be mounted anywhere.
+Both ``/proc/diskstats`` and sysfs use the same source for the information
+and so should not differ.
+
+Here are examples of these different formats::
+
+   2.4:
+      3     0   39082680 hda 446216 784926 9550688 4382310 424847 312726 5922052 19310380 0 3376340 23705160
+      3     1    9221278 hda1 35486 0 35496 38030 0 0 0 0 0 38030 38030
+
+   2.6+ sysfs:
+      446216 784926 9550688 4382310 424847 312726 5922052 19310380 0 3376340 23705160
+      35486    38030    38030    38030
+
+   2.6+ diskstats:
+      3    0   hda 446216 784926 9550688 4382310 424847 312726 5922052 19310380 0 3376340 23705160
+      3    1   hda1 35486 38030 38030 38030
+
+   4.18+ diskstats:
+      3    0   hda 446216 784926 9550688 4382310 424847 312726 5922052 19310380 0 3376340 23705160 0 0 0 0
+
+On 2.4 you might execute ``grep 'hda ' /proc/partitions``. On 2.6+, you have
+a choice of ``cat /sys/block/hda/stat`` or ``grep 'hda ' /proc/diskstats``.
+
+The advantage of one over the other is that the sysfs choice works well
+if you are watching a known, small set of disks.  ``/proc/diskstats`` may
+be a better choice if you are watching a large number of disks because
+you'll avoid the overhead of 50, 100, or 500 or more opens/closes with
+each snapshot of your disk statistics.
+
+In 2.4, the statistics fields are those after the device name. In
+the above example, the first field of statistics would be 446216.
+By contrast, in 2.6+ if you look at ``/sys/block/hda/stat``, you'll
+find just the eleven fields, beginning with 446216.  If you look at
+``/proc/diskstats``, the eleven fields will be preceded by the major and
+minor device numbers, and device name.  Each of these formats provides
+eleven fields of statistics, each meaning exactly the same things.
+All fields except field 9 are cumulative since boot.  Field 9 should
+go to zero as I/Os complete; all others only increase (unless they
+overflow and wrap).  Yes, these are (32-bit or 64-bit) unsigned long
+(native word size) numbers, and on a very busy or long-lived system they
+may wrap. Applications should be prepared to deal with that; unless
+your observations are measured in large numbers of minutes or hours,
+they should not wrap twice before you notice them.
+
+Each set of stats only applies to the indicated device; if you want
+system-wide stats you'll have to find all the devices and sum them all up.
+
+Field  1 -- # of reads completed
+    This is the total number of reads completed successfully.
+
+Field  2 -- # of reads merged, field 6 -- # of writes merged
+    Reads and writes which are adjacent to each other may be merged for
+    efficiency.  Thus two 4K reads may become one 8K read before it is
+    ultimately handed to the disk, and so it will be counted (and queued)
+    as only one I/O.  This field lets you know how often this was done.
+
+Field  3 -- # of sectors read
+    This is the total number of sectors read successfully.
+
+Field  4 -- # of milliseconds spent reading
+    This is the total number of milliseconds spent by all reads (as
+    measured from __make_request() to end_that_request_last()).
+
+Field  5 -- # of writes completed
+    This is the total number of writes completed successfully.
+
+Field  6 -- # of writes merged
+    See the description of field 2.
+
+Field  7 -- # of sectors written
+    This is the total number of sectors written successfully.
+
+Field  8 -- # of milliseconds spent writing
+    This is the total number of milliseconds spent by all writes (as
+    measured from __make_request() to end_that_request_last()).
+
+Field  9 -- # of I/Os currently in progress
+    The only field that should go to zero. Incremented as requests are
+    given to appropriate struct request_queue and decremented as they finish.
+
+Field 10 -- # of milliseconds spent doing I/Os
+    This field increases so long as field 9 is nonzero.
+
+    Since 5.0 this field counts jiffies when at least one request was
+    started or completed. If a request runs more than 2 jiffies then some
+    I/O time will not be accounted unless there are other requests.
+
+Field 11 -- weighted # of milliseconds spent doing I/Os
+    This field is incremented at each I/O start, I/O completion, I/O
+    merge, or read of these stats by the number of I/Os in progress
+    (field 9) times the number of milliseconds spent doing I/O since the
+    last update of this field.  This can provide an easy measure of both
+    I/O completion time and the backlog that may be accumulating.
+
+Field 12 -- # of discards completed
+    This is the total number of discards completed successfully.
+
+Field 13 -- # of discards merged
+    See the description of field 2.
+
+Field 14 -- # of sectors discarded
+    This is the total number of sectors discarded successfully.
+
+Field 15 -- # of milliseconds spent discarding
+    This is the total number of milliseconds spent by all discards (as
+    measured from __make_request() to end_that_request_last()).
+
+To avoid introducing performance bottlenecks, no locks are held while
+modifying these counters.  This implies that minor inaccuracies may be
+introduced when changes collide, so (for instance) adding up all the
+read I/Os issued per partition should equal those made to the disks ...
+but due to the lack of locking it may only be very close.
+
+In 2.6+, there are counters for each CPU, which make the lack of locking
+almost a non-issue.  When the statistics are read, the per-CPU counters
+are summed (possibly overflowing the unsigned long variable they are
+summed to) and the result given to the user.  There is no convenient
+user interface for accessing the per-CPU counters themselves.
+
+Disks vs Partitions
+-------------------
+
+There were significant changes between 2.4 and 2.6+ in the I/O subsystem.
+As a result, some statistic information disappeared. The translation from
+a disk address relative to a partition to the disk address relative to
+the host disk happens much earlier.  All merges and timings now happen
+at the disk level rather than at both the disk and partition level as
+in 2.4.  Consequently, you'll see a different statistics output on 2.6+ for
+partitions from that for disks.  There are only *four* fields available
+for partitions on 2.6+ machines.  This is reflected in the examples above.
+
+Field  1 -- # of reads issued
+    This is the total number of reads issued to this partition.
+
+Field  2 -- # of sectors read
+    This is the total number of sectors requested to be read from this
+    partition.
+
+Field  3 -- # of writes issued
+    This is the total number of writes issued to this partition.
+
+Field  4 -- # of sectors written
+    This is the total number of sectors requested to be written to
+    this partition.
+
+Note that since the address is translated to a disk-relative one, and no
+record of the partition-relative address is kept, the subsequent success
+or failure of the read cannot be attributed to the partition.  In other
+words, the number of reads for partitions is counted slightly before time
+of queuing for partitions, and at completion for whole disks.  This is
+a subtle distinction that is probably uninteresting for most cases.
+
+More significant is the error induced by counting the numbers of
+reads/writes before merges for partitions and after for disks. Since a
+typical workload usually contains a lot of successive and adjacent requests,
+the number of reads/writes issued can be several times higher than the
+number of reads/writes completed.
+
+In 2.6.25, the full statistic set is again available for partitions and
+disk and partition statistics are consistent again. Since we still don't
+keep record of the partition-relative address, an operation is attributed to
+the partition which contains the first sector of the request after the
+eventual merges. As requests can be merged across partitions, this could lead
+to some (probably insignificant) inaccuracy.
+
+Additional notes
+----------------
+
+In 2.6+, sysfs is not mounted by default.  If your distribution of
+Linux hasn't added it already, here's the line you'll want to add to
+your ``/etc/fstab``::
+
+	none /sys sysfs defaults 0 0
+
+
+In 2.6+, all disk statistics were removed from ``/proc/stat``.  In 2.4, they
+appear in both ``/proc/partitions`` and ``/proc/stat``, although the ones in
+``/proc/stat`` take a very different format from those in ``/proc/partitions``
+(see proc(5), if your system has it.)
+
+-- ricklind@us.ibm.com
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index a571a67e0c85..19b1e3bef56c 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -5066,7 +5066,7 @@
 
 	vga=		[BOOT,X86-32] Select a particular video mode
 			See Documentation/x86/boot.rst and
-			Documentation/svga.txt.
+			Documentation/admin-guide/svga.rst.
 			Use vga=ask for menu.
 			This is actually a boot loader parameter; the value is
 			passed to the kernel using a special protocol.
diff --git a/Documentation/admin-guide/kernel-per-CPU-kthreads.rst b/Documentation/admin-guide/kernel-per-CPU-kthreads.rst
new file mode 100644
index 000000000000..4f18456dd3b1
--- /dev/null
+++ b/Documentation/admin-guide/kernel-per-CPU-kthreads.rst
@@ -0,0 +1,356 @@
+==========================================
+Reducing OS jitter due to per-cpu kthreads
+==========================================
+
+This document lists per-CPU kthreads in the Linux kernel and presents
+options to control their OS jitter.  Note that non-per-CPU kthreads are
+not listed here.  To reduce OS jitter from non-per-CPU kthreads, bind
+them to a "housekeeping" CPU dedicated to such work.
+
+References
+==========
+
+-	Documentation/IRQ-affinity.txt:  Binding interrupts to sets of CPUs.
+
+-	Documentation/admin-guide/cgroup-v1:  Using cgroups to bind tasks to sets of CPUs.
+
+-	man taskset:  Using the taskset command to bind tasks to sets
+	of CPUs.
+
+-	man sched_setaffinity:  Using the sched_setaffinity() system
+	call to bind tasks to sets of CPUs.
+
+-	/sys/devices/system/cpu/cpuN/online:  Control CPU N's hotplug state,
+	writing "0" to offline and "1" to online.
+
+-	In order to locate kernel-generated OS jitter on CPU N::
+
+		cd /sys/kernel/debug/tracing
+		echo 1 > max_graph_depth # Increase the "1" for more detail
+		echo function_graph > current_tracer
+		# run workload
+		cat per_cpu/cpuN/trace
+
+kthreads
+========
+
+Name:
+  ehca_comp/%u
+
+Purpose:
+  Periodically process Infiniband-related work.
+
+To reduce its OS jitter, do any of the following:
+
+1.	Don't use eHCA Infiniband hardware, instead choosing hardware
+	that does not require per-CPU kthreads.  This will prevent these
+	kthreads from being created in the first place.  (This will
+	work for most people, as this hardware, though important, is
+	relatively old and is produced in relatively low unit volumes.)
+2.	Do all eHCA-Infiniband-related work on other CPUs, including
+	interrupts.
+3.	Rework the eHCA driver so that its per-CPU kthreads are
+	provisioned only on selected CPUs.
+
+
+Name:
+  irq/%d-%s
+
+Purpose:
+  Handle threaded interrupts.
+
+To reduce its OS jitter, do the following:
+
+1.	Use irq affinity to force the irq threads to execute on
+	some other CPU.
+
+Name:
+  kcmtpd_ctr_%d
+
+Purpose:
+  Handle Bluetooth work.
+
+To reduce its OS jitter, do one of the following:
+
+1.	Don't use Bluetooth, in which case these kthreads won't be
+	created in the first place.
+2.	Use irq affinity to force Bluetooth-related interrupts to
+	occur on some other CPU and furthermore initiate all
+	Bluetooth activity on some other CPU.
+
+Name:
+  ksoftirqd/%u
+
+Purpose:
+  Execute softirq handlers when threaded or when under heavy load.
+
+To reduce its OS jitter, each softirq vector must be handled
+separately as follows:
+
+TIMER_SOFTIRQ
+-------------
+
+Do all of the following:
+
+1.	To the extent possible, keep the CPU out of the kernel when it
+	is non-idle, for example, by avoiding system calls and by forcing
+	both kernel threads and interrupts to execute elsewhere.
+2.	Build with CONFIG_HOTPLUG_CPU=y.  After boot completes, force
+	the CPU offline, then bring it back online.  This forces
+	recurring timers to migrate elsewhere.	If you are concerned
+	with multiple CPUs, force them all offline before bringing the
+	first one back online.  Once you have onlined the CPUs in question,
+	do not offline any other CPUs, because doing so could force the
+	timer back onto one of the CPUs in question.
+
+NET_TX_SOFTIRQ and NET_RX_SOFTIRQ
+---------------------------------
+
+Do all of the following:
+
+1.	Force networking interrupts onto other CPUs.
+2.	Initiate any network I/O on other CPUs.
+3.	Once your application has started, prevent CPU-hotplug operations
+	from being initiated from tasks that might run on the CPU to
+	be de-jittered.  (It is OK to force this CPU offline and then
+	bring it back online before you start your application.)
+
+BLOCK_SOFTIRQ
+-------------
+
+Do all of the following:
+
+1.	Force block-device interrupts onto some other CPU.
+2.	Initiate any block I/O on other CPUs.
+3.	Once your application has started, prevent CPU-hotplug operations
+	from being initiated from tasks that might run on the CPU to
+	be de-jittered.  (It is OK to force this CPU offline and then
+	bring it back online before you start your application.)
+
+IRQ_POLL_SOFTIRQ
+----------------
+
+Do all of the following:
+
+1.	Force block-device interrupts onto some other CPU.
+2.	Initiate any block I/O and block-I/O polling on other CPUs.
+3.	Once your application has started, prevent CPU-hotplug operations
+	from being initiated from tasks that might run on the CPU to
+	be de-jittered.  (It is OK to force this CPU offline and then
+	bring it back online before you start your application.)
+
+TASKLET_SOFTIRQ
+---------------
+
+Do one or more of the following:
+
+1.	Avoid use of drivers that use tasklets.  (Such drivers will contain
+	calls to things like tasklet_schedule().)
+2.	Convert all drivers that you must use from tasklets to workqueues.
+3.	Force interrupts for drivers using tasklets onto other CPUs,
+	and also do I/O involving these drivers on other CPUs.
+
+SCHED_SOFTIRQ
+-------------
+
+Do all of the following:
+
+1.	Avoid sending scheduler IPIs to the CPU to be de-jittered,
+	for example, ensure that at most one runnable kthread is present
+	on that CPU.  If a thread that expects to run on the de-jittered
+	CPU awakens, the scheduler will send an IPI that can result in
+	a subsequent SCHED_SOFTIRQ.
+2.	CONFIG_NO_HZ_FULL=y and ensure that the CPU to be de-jittered
+	is marked as an adaptive-ticks CPU using the "nohz_full="
+	boot parameter.  This reduces the number of scheduler-clock
+	interrupts that the de-jittered CPU receives, minimizing its
+	chances of being selected to do the load balancing work that
+	runs in SCHED_SOFTIRQ context.
+3.	To the extent possible, keep the CPU out of the kernel when it
+	is non-idle, for example, by avoiding system calls and by
+	forcing both kernel threads and interrupts to execute elsewhere.
+	This further reduces the number of scheduler-clock interrupts
+	received by the de-jittered CPU.
+
+HRTIMER_SOFTIRQ
+---------------
+
+Do all of the following:
+
+1.	To the extent possible, keep the CPU out of the kernel when it
+	is non-idle.  For example, avoid system calls and force both
+	kernel threads and interrupts to execute elsewhere.
+2.	Build with CONFIG_HOTPLUG_CPU=y.  Once boot completes, force the
+	CPU offline, then bring it back online.  This forces recurring
+	timers to migrate elsewhere.  If you are concerned with multiple
+	CPUs, force them all offline before bringing the first one
+	back online.  Once you have onlined the CPUs in question, do not
+	offline any other CPUs, because doing so could force the timer
+	back onto one of the CPUs in question.
+
+RCU_SOFTIRQ
+-----------
+
+Do at least one of the following:
+
+1.	Offload callbacks and keep the CPU in either dyntick-idle or
+	adaptive-ticks state by doing all of the following:
+
+	a.	CONFIG_NO_HZ_FULL=y and ensure that the CPU to be
+		de-jittered is marked as an adaptive-ticks CPU using the
+		"nohz_full=" boot parameter.  Bind the rcuo kthreads to
+		housekeeping CPUs, which can tolerate OS jitter.
+	b.	To the extent possible, keep the CPU out of the kernel
+		when it is non-idle, for example, by avoiding system
+		calls and by forcing both kernel threads and interrupts
+		to execute elsewhere.
+
+2.	Enable RCU to do its processing remotely via dyntick-idle by
+	doing all of the following:
+
+	a.	Build with CONFIG_NO_HZ=y and CONFIG_RCU_FAST_NO_HZ=y.
+	b.	Ensure that the CPU goes idle frequently, allowing other
+		CPUs to detect that it has passed through an RCU quiescent
+		state.	If the kernel is built with CONFIG_NO_HZ_FULL=y,
+		userspace execution also allows other CPUs to detect that
+		the CPU in question has passed through a quiescent state.
+	c.	To the extent possible, keep the CPU out of the kernel
+		when it is non-idle, for example, by avoiding system
+		calls and by forcing both kernel threads and interrupts
+		to execute elsewhere.
+
+Name:
+  kworker/%u:%d%s (cpu, id, priority)
+
+Purpose:
+  Execute workqueue requests
+
+To reduce its OS jitter, do any of the following:
+
+1.	Run your workload at a real-time priority, which will allow
+	preempting the kworker daemons.
+2.	A given workqueue can be made visible in the sysfs filesystem
+	by passing the WQ_SYSFS to that workqueue's alloc_workqueue().
+	Such a workqueue can be confined to a given subset of the
+	CPUs using the ``/sys/devices/virtual/workqueue/*/cpumask`` sysfs
+	files.	The set of WQ_SYSFS workqueues can be displayed using
+	"ls /sys/devices/virtual/workqueue".  That said, the workqueues
+	maintainer would like to caution people against indiscriminately
+	sprinkling WQ_SYSFS across all the workqueues.	The reason for
+	caution is that it is easy to add WQ_SYSFS, but because sysfs is
+	part of the formal user/kernel API, it can be nearly impossible
+	to remove it, even if its addition was a mistake.
+3.	Do any of the following needed to avoid jitter that your
+	application cannot tolerate:
+
+	a.	Build your kernel with CONFIG_SLUB=y rather than
+		CONFIG_SLAB=y, thus avoiding the slab allocator's periodic
+		use of each CPU's workqueues to run its cache_reap()
+		function.
+	b.	Avoid using oprofile, thus avoiding OS jitter from
+		wq_sync_buffer().
+	c.	Limit your CPU frequency so that a CPU-frequency
+		governor is not required, possibly enlisting the aid of
+		special heatsinks or other cooling technologies.  If done
+		correctly, and if your CPU architecture permits, you should
+		be able to build your kernel with CONFIG_CPU_FREQ=n to
+		avoid the CPU-frequency governor periodically running
+		on each CPU, including cs_dbs_timer() and od_dbs_timer().
+
+		WARNING:  Please check your CPU specifications to
+		make sure that this is safe on your particular system.
+	d.	As of v3.18, Christoph Lameter's on-demand vmstat workers
+		commit prevents OS jitter due to vmstat_update() on
+		CONFIG_SMP=y systems.  Before v3.18, it is not possible
+		to entirely get rid of the OS jitter, but you can
+		decrease its frequency by writing a large value to
+		/proc/sys/vm/stat_interval.  The default value is HZ,
+		for an interval of one second.	Of course, larger values
+		will make your virtual-memory statistics update more
+		slowly.  Of course, you can also run your workload at
+		a real-time priority, thus preempting vmstat_update(),
+		but if your workload is CPU-bound, this is a bad idea.
+		However, there is an RFC patch from Christoph Lameter
+		(based on an earlier one from Gilad Ben-Yossef) that
+		reduces or even eliminates vmstat overhead for some
+		workloads at https://lkml.org/lkml/2013/9/4/379.
+	e.	Boot with "elevator=noop" to avoid workqueue use by
+		the block layer.
+	f.	If running on high-end powerpc servers, build with
+		CONFIG_PPC_RTAS_DAEMON=n.  This prevents the RTAS
+		daemon from running on each CPU every second or so.
+		(This will require editing Kconfig files and will defeat
+		this platform's RAS functionality.)  This avoids jitter
+		due to the rtas_event_scan() function.
+		WARNING:  Please check your CPU specifications to
+		make sure that this is safe on your particular system.
+	g.	If running on Cell Processor, build your kernel with
+		CBE_CPUFREQ_SPU_GOVERNOR=n to avoid OS jitter from
+		spu_gov_work().
+		WARNING:  Please check your CPU specifications to
+		make sure that this is safe on your particular system.
+	h.	If running on PowerMAC, build your kernel with
+		CONFIG_PMAC_RACKMETER=n to disable the CPU-meter,
+		avoiding OS jitter from rackmeter_do_timer().
+
+Name:
+  rcuc/%u
+
+Purpose:
+  Execute RCU callbacks in CONFIG_RCU_BOOST=y kernels.
+
+To reduce its OS jitter, do at least one of the following:
+
+1.	Build the kernel with CONFIG_PREEMPT=n.  This prevents these
+	kthreads from being created in the first place, and also obviates
+	the need for RCU priority boosting.  This approach is feasible
+	for workloads that do not require high degrees of responsiveness.
+2.	Build the kernel with CONFIG_RCU_BOOST=n.  This prevents these
+	kthreads from being created in the first place.  This approach
+	is feasible only if your workload never requires RCU priority
+	boosting, for example, if you ensure frequent idle time on all
+	CPUs that might execute within the kernel.
+3.	Build with CONFIG_RCU_NOCB_CPU=y and boot with the rcu_nocbs=
+	boot parameter offloading RCU callbacks from all CPUs susceptible
+	to OS jitter.  This approach prevents the rcuc/%u kthreads from
+	having any work to do, so that they are never awakened.
+4.	Ensure that the CPU never enters the kernel, and, in particular,
+	avoid initiating any CPU hotplug operations on this CPU.  This is
+	another way of preventing any callbacks from being queued on the
+	CPU, again preventing the rcuc/%u kthreads from having any work
+	to do.
+
+Name:
+  rcuop/%d and rcuos/%d
+
+Purpose:
+  Offload RCU callbacks from the corresponding CPU.
+
+To reduce its OS jitter, do at least one of the following:
+
+1.	Use affinity, cgroups, or other mechanism to force these kthreads
+	to execute on some other CPU.
+2.	Build with CONFIG_RCU_NOCB_CPU=n, which will prevent these
+	kthreads from being created in the first place.  However, please
+	note that this will not eliminate OS jitter, but will instead
+	shift it to RCU_SOFTIRQ.
+
+Name:
+  watchdog/%u
+
+Purpose:
+  Detect software lockups on each CPU.
+
+To reduce its OS jitter, do at least one of the following:
+
+1.	Build with CONFIG_LOCKUP_DETECTOR=n, which will prevent these
+	kthreads from being created in the first place.
+2.	Boot with "nosoftlockup=0", which will also prevent these kthreads
+	from being created.  Other related watchdog and softlockup boot
+	parameters may be found in Documentation/admin-guide/kernel-parameters.rst
+	and Documentation/watchdog/watchdog-parameters.rst.
+3.	Echo a zero to /proc/sys/kernel/watchdog to disable the
+	watchdog timer.
+4.	Echo a large number to /proc/sys/kernel/watchdog_thresh in
+	order to reduce the frequency of OS jitter due to the watchdog
+	timer down to a level that is acceptable for your workload.
diff --git a/Documentation/admin-guide/lcd-panel-cgram.rst b/Documentation/admin-guide/lcd-panel-cgram.rst
new file mode 100644
index 000000000000..a3eb00c62f53
--- /dev/null
+++ b/Documentation/admin-guide/lcd-panel-cgram.rst
@@ -0,0 +1,27 @@
+======================================
+Parallel port LCD/Keypad Panel support
+======================================
+
+Some LCDs allow you to define up to 8 characters, mapped to ASCII
+characters 0 to 7. The escape code to define a new character is
+'\e[LG' followed by one digit from 0 to 7, representing the character
+number, and up to 8 pairs of hex digits terminated by a semicolon
+(';'). Each pair of digits represents a line, with 1-bits for each
+illuminated pixel with LSB on the right. Lines are numbered from the
+top of the character to the bottom. On a 5x7 matrix, only the lower 5
+bits of the first 7 bytes are used for each character. If the string
+is incomplete, only complete lines will be redefined. Here are some
+examples::
+
+  printf "\e[LG0010101050D1F0C04;"  => 0 = [enter]
+  printf "\e[LG1040E1F0000000000;"  => 1 = [up]
+  printf "\e[LG2000000001F0E0400;"  => 2 = [down]
+  printf "\e[LG3040E1F001F0E0400;"  => 3 = [up-down]
+  printf "\e[LG40002060E1E0E0602;"  => 4 = [left]
+  printf "\e[LG500080C0E0F0E0C08;"  => 5 = [right]
+  printf "\e[LG60016051516141400;"  => 6 = "IP"
+
+  printf "\e[LG00103071F1F070301;"  => big speaker
+  printf "\e[LG00002061E1E060200;"  => small speaker
+
+Willy
diff --git a/Documentation/admin-guide/ldm.rst b/Documentation/admin-guide/ldm.rst
new file mode 100644
index 000000000000..12c571368e73
--- /dev/null
+++ b/Documentation/admin-guide/ldm.rst
@@ -0,0 +1,121 @@
+==========================================
+LDM - Logical Disk Manager (Dynamic Disks)
+==========================================
+
+:Author: Originally Written by FlatCap - Richard Russon <ldm@flatcap.org>.
+:Last Updated: Anton Altaparmakov on 30 March 2007 for Windows Vista.
+
+Overview
+--------
+
+Windows 2000, XP, and Vista use a new partitioning scheme.  It is a complete
+replacement for the MSDOS style partitions.  It stores its information in a
+1MiB journalled database at the end of the physical disk.  The size of
+partitions is limited only by disk space.  The maximum number of partitions is
+nearly 2000.
+
+Any partitions created under the LDM are called "Dynamic Disks".  There are no
+longer any primary or extended partitions.  Normal MSDOS style partitions are
+now known as Basic Disks.
+
+If you wish to use Spanned, Striped, Mirrored or RAID 5 Volumes, you must use
+Dynamic Disks.  The journalling allows Windows to make changes to these
+partitions and filesystems without the need to reboot.
+
+Once the LDM driver has divided up the disk, you can use the MD driver to
+assemble any multi-partition volumes, e.g.  Stripes, RAID5.
+
+To prevent legacy applications from repartitioning the disk, the LDM creates a
+dummy MSDOS partition containing one disk-sized partition.  This is what is
+supported with the Linux LDM driver.
+
+A newer approach that has been implemented with Vista is to put LDM on top of a
+GPT label disk.  This is not supported by the Linux LDM driver yet.
+
+
+Example
+-------
+
+Below we have a 50MiB disk, divided into seven partitions.
+
+.. note::
+
+   The missing 1MiB at the end of the disk is where the LDM database is
+   stored.
+
++-------++--------------+---------+-----++--------------+---------+----+
+|Device || Offset Bytes | Sectors | MiB || Size   Bytes | Sectors | MiB|
++=======++==============+=========+=====++==============+=========+====+
+|hda    ||            0 |       0 |   0 ||     52428800 |  102400 |  50|
++-------++--------------+---------+-----++--------------+---------+----+
+|hda1   ||     51380224 |  100352 |  49 ||      1048576 |    2048 |   1|
++-------++--------------+---------+-----++--------------+---------+----+
+|hda2   ||        16384 |      32 |   0 ||      6979584 |   13632 |   6|
++-------++--------------+---------+-----++--------------+---------+----+
+|hda3   ||      6995968 |   13664 |   6 ||     10485760 |   20480 |  10|
++-------++--------------+---------+-----++--------------+---------+----+
+|hda4   ||     17481728 |   34144 |  16 ||      4194304 |    8192 |   4|
++-------++--------------+---------+-----++--------------+---------+----+
+|hda5   ||     21676032 |   42336 |  20 ||      5242880 |   10240 |   5|
++-------++--------------+---------+-----++--------------+---------+----+
+|hda6   ||     26918912 |   52576 |  25 ||     10485760 |   20480 |  10|
++-------++--------------+---------+-----++--------------+---------+----+
+|hda7   ||     37404672 |   73056 |  35 ||     13959168 |   27264 |  13|
++-------++--------------+---------+-----++--------------+---------+----+
+
+The LDM Database may not store the partitions in the order that they appear on
+disk, but the driver will sort them.
+
+When Linux boots, you will see something like::
+
+  hda: 102400 sectors w/32KiB Cache, CHS=50/64/32
+  hda: [LDM] hda1 hda2 hda3 hda4 hda5 hda6 hda7
+
+
+Compiling LDM Support
+---------------------
+
+To enable LDM, choose the following two options: 
+
+  - "Advanced partition selection" CONFIG_PARTITION_ADVANCED
+  - "Windows Logical Disk Manager (Dynamic Disk) support" CONFIG_LDM_PARTITION
+
+If you believe the driver isn't working as it should, you can enable the extra
+debugging code.  This will produce a LOT of output.  The option is:
+
+  - "Windows LDM extra logging" CONFIG_LDM_DEBUG
+
+N.B. The partition code cannot be compiled as a module.
+
+As with all the partition code, if the driver doesn't see signs of its type of
+partition, it will pass control to another driver, so there is no harm in
+enabling it.
+
+If you have Dynamic Disks but don't enable the driver, then all you will see
+is a dummy MSDOS partition filling the whole disk.  You won't be able to mount
+any of the volumes on the disk.
+
+
+Booting
+-------
+
+If you enable LDM support, then lilo is capable of booting from any of the
+discovered partitions.  However, grub does not understand the LDM partitioning
+and cannot boot from a Dynamic Disk.
+
+
+More Documentation
+------------------
+
+There is an Overview of the LDM together with complete Technical Documentation.
+It is available for download.
+
+  http://www.linux-ntfs.org/
+
+If you have any LDM questions that aren't answered in the documentation, email
+me.
+
+Cheers,
+    FlatCap - Richard Russon
+    ldm@flatcap.org
+
diff --git a/Documentation/admin-guide/lockup-watchdogs.rst b/Documentation/admin-guide/lockup-watchdogs.rst
new file mode 100644
index 000000000000..290840c160af
--- /dev/null
+++ b/Documentation/admin-guide/lockup-watchdogs.rst
@@ -0,0 +1,83 @@
+===============================================================
+Softlockup detector and hardlockup detector (aka nmi_watchdog)
+===============================================================
+
+The Linux kernel can act as a watchdog to detect both soft and hard
+lockups.
+
+A 'softlockup' is defined as a bug that causes the kernel to loop in
+kernel mode for more than 20 seconds (see "Implementation" below for
+details), without giving other tasks a chance to run. The current
+stack trace is displayed upon detection and, by default, the system
+will stay locked up. Alternatively, the kernel can be configured to
+panic; a sysctl, "kernel.softlockup_panic", a kernel parameter,
+"softlockup_panic" (see "Documentation/admin-guide/kernel-parameters.rst" for
+details), and a compile option, "BOOTPARAM_SOFTLOCKUP_PANIC", are
+provided for this.
+
+A 'hardlockup' is defined as a bug that causes the CPU to loop in
+kernel mode for more than 10 seconds (see "Implementation" below for
+details), without letting other interrupts have a chance to run.
+Similarly to the softlockup case, the current stack trace is displayed
+upon detection and the system will stay locked up unless the default
+behavior is changed, which can be done through a sysctl,
+'hardlockup_panic', a compile time knob, "BOOTPARAM_HARDLOCKUP_PANIC",
+and a kernel parameter, "nmi_watchdog"
+(see "Documentation/admin-guide/kernel-parameters.rst" for details).
+
+The panic option can be used in combination with panic_timeout (this
+timeout is set through the confusingly named "kernel.panic" sysctl),
+to cause the system to reboot automatically after a specified amount
+of time.
+
+Implementation
+==============
+
+The soft and hard lockup detectors are built on top of the hrtimer and
+perf subsystems, respectively. A direct consequence of this is that,
+in principle, they should work in any architecture where these
+subsystems are present.
+
+A periodic hrtimer runs to generate interrupts and kick the watchdog
+task. An NMI perf event is generated every "watchdog_thresh"
+(compile-time initialized to 10 and configurable through sysctl of the
+same name) seconds to check for hardlockups. If any CPU in the system
+does not receive any hrtimer interrupt during that time the
+'hardlockup detector' (the handler for the NMI perf event) will
+generate a kernel warning or call panic, depending on the
+configuration.
+
+The watchdog task is a high priority kernel thread that updates a
+timestamp every time it is scheduled. If that timestamp is not updated
+for 2*watchdog_thresh seconds (the softlockup threshold) the
+'softlockup detector' (coded inside the hrtimer callback function)
+will dump useful debug information to the system log, after which it
+will call panic if it was instructed to do so or resume execution of
+other kernel code.
+
+The period of the hrtimer is 2*watchdog_thresh/5, which means it has
+two or three chances to generate an interrupt before the hardlockup
+detector kicks in.
+
+As explained above, a kernel knob is provided that allows
+administrators to configure the period of the hrtimer and the perf
+event. The right value for a particular environment is a trade-off
+between fast response to lockups and detection overhead.
+
+By default, the watchdog runs on all online cores.  However, on a
+kernel configured with NO_HZ_FULL, by default the watchdog runs only
+on the housekeeping cores, not the cores specified in the "nohz_full"
+boot argument.  If we allowed the watchdog to run by default on
+the "nohz_full" cores, we would have to run timer ticks to activate
+the scheduler, which would prevent the "nohz_full" functionality
+from protecting the user code on those cores from the kernel.
+Of course, disabling it by default on the nohz_full cores means that
+when those cores do enter the kernel, by default we will not be
+able to detect if they lock up.  However, allowing the watchdog
+to continue to run on the housekeeping (non-tickless) cores means
+that we will continue to detect lockups properly on those cores.
+
+In either case, the set of cores excluded from running the watchdog
+may be adjusted via the kernel.watchdog_cpumask sysctl.  For
+nohz_full cores, this may be useful for debugging a case where the
+kernel seems to be hanging on the nohz_full cores.
diff --git a/Documentation/admin-guide/mm/cma_debugfs.rst b/Documentation/admin-guide/mm/cma_debugfs.rst
new file mode 100644
index 000000000000..4e06ffabd78a
--- /dev/null
+++ b/Documentation/admin-guide/mm/cma_debugfs.rst
@@ -0,0 +1,25 @@
+=====================
+CMA Debugfs Interface
+=====================
+
+The CMA debugfs interface is useful to retrieve basic information out of the
+different CMA areas and to test allocation/release in each of the areas.
+
+Each CMA zone represents a directory under <debugfs>/cma/, indexed by the
+kernel's CMA index. So the first CMA zone would be:
+
+	<debugfs>/cma/cma-0
+
+The structure of the files created under that directory is as follows:
+
+ - [RO] base_pfn: The base PFN (Page Frame Number) of the zone.
+ - [RO] count: Amount of memory in the CMA area.
+ - [RO] order_per_bit: Order of pages represented by one bit.
+ - [RO] bitmap: The bitmap of page states in the zone.
+ - [WO] alloc: Allocate N pages from that CMA area. For example::
+
+	echo 5 > <debugfs>/cma/cma-2/alloc
+
+would try to allocate 5 pages from the cma-2 area.
+
+ - [WO] free: Free N pages from that CMA area, similar to the above.
diff --git a/Documentation/admin-guide/mm/index.rst b/Documentation/admin-guide/mm/index.rst
index 5f61a6c429e0..11db46448354 100644
--- a/Documentation/admin-guide/mm/index.rst
+++ b/Documentation/admin-guide/mm/index.rst
@@ -26,6 +26,7 @@ the Linux memory management.
    :maxdepth: 1
 
    concepts
+   cma_debugfs
    hugetlbpage
    idle_page_tracking
    ksm
diff --git a/Documentation/admin-guide/numastat.rst b/Documentation/admin-guide/numastat.rst
new file mode 100644
index 000000000000..aaf1667489f8
--- /dev/null
+++ b/Documentation/admin-guide/numastat.rst
@@ -0,0 +1,30 @@
+===============================
+Numa policy hit/miss statistics
+===============================
+
+/sys/devices/system/node/node*/numastat
+
+All units are pages. Hugepages have separate counters.
+
+=============== ============================================================
+numa_hit	A process wanted to allocate memory from this node,
+		and succeeded.
+
+numa_miss	A process wanted to allocate memory from another node,
+		but ended up with memory from this node.
+
+numa_foreign	A process wanted to allocate on this node,
+		but ended up with memory from another one.
+
+local_node	A process ran on this node and got memory from it.
+
+other_node	A process ran on this node and got memory from another node.
+
+interleave_hit 	Interleaving wanted to allocate from this node
+		and succeeded.
+=============== ============================================================
+
+For easier reading you can use the numastat utility from the numactl package
+(http://oss.sgi.com/projects/libnuma/). Note that it only works
+well right now on machines with a small number of CPUs.
+
diff --git a/Documentation/admin-guide/pnp.rst b/Documentation/admin-guide/pnp.rst
new file mode 100644
index 000000000000..bab2d10631f0
--- /dev/null
+++ b/Documentation/admin-guide/pnp.rst
@@ -0,0 +1,292 @@
+=================================
+Linux Plug and Play Documentation
+=================================
+
+:Author: Adam Belay <ambx1@neo.rr.com>
+:Last updated: Oct. 16, 2002
+
+
+Overview
+--------
+
+Plug and Play provides a means of detecting and setting resources for legacy or
+otherwise unconfigurable devices.  The Linux Plug and Play Layer provides these 
+services to compatible drivers.
+
+
+The User Interface
+------------------
+
+The Linux Plug and Play user interface provides a means to activate PnP devices
+for legacy and user level drivers that do not support Linux Plug and Play.  The 
+user interface is integrated into sysfs.
+
+In addition to the standard sysfs file the following are created in each
+device's directory:
+- id - displays a list of supported EISA IDs
+- options - displays possible resource configurations
+- resources - displays currently allocated resources and allows resource changes
+
+activating a device
+^^^^^^^^^^^^^^^^^^^
+
+::
+
+	# echo "auto" > resources
+
+this will invoke the automatic resource config system to activate the device
+
+manually activating a device
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+::
+
+	# echo "manual <depnum> <mode>" > resources
+
+	<depnum> - the configuration number
+	<mode> - static or dynamic
+		 static = for next boot
+		 dynamic = now
+
+disabling a device
+^^^^^^^^^^^^^^^^^^
+
+::
+
+	# echo "disable" > resources
+
+
+EXAMPLE:
+
+Suppose you need to activate the floppy disk controller.
+
+1. change to the proper directory, in my case it is
+   /driver/bus/pnp/devices/00:0f::
+
+	# cd /driver/bus/pnp/devices/00:0f
+	# cat name
+	PC standard floppy disk controller
+
+2. check if the device is already active::
+
+	# cat resources
+	DISABLED
+
+  - Notice the string "DISABLED".  This means the device is not active.
+
+3. check the device's possible configurations (optional)::
+
+	# cat options
+	Dependent: 01 - Priority acceptable
+	    port 0x3f0-0x3f0, align 0x7, size 0x6, 16-bit address decoding
+	    port 0x3f7-0x3f7, align 0x0, size 0x1, 16-bit address decoding
+	    irq 6
+	    dma 2 8-bit compatible
+	Dependent: 02 - Priority acceptable
+	    port 0x370-0x370, align 0x7, size 0x6, 16-bit address decoding
+	    port 0x377-0x377, align 0x0, size 0x1, 16-bit address decoding
+	    irq 6
+	    dma 2 8-bit compatible
+
+4. now activate the device::
+
+	# echo "auto" > resources
+
+5. finally check if the device is active::
+
+	# cat resources
+	io 0x3f0-0x3f5
+	io 0x3f7-0x3f7
+	irq 6
+	dma 2
+
+There is also a series of kernel parameters::
+
+	pnp_reserve_irq=irq1[,irq2] ....
+	pnp_reserve_dma=dma1[,dma2] ....
+	pnp_reserve_io=io1,size1[,io2,size2] ....
+	pnp_reserve_mem=mem1,size1[,mem2,size2] ....
+
+
+
+The Unified Plug and Play Layer
+-------------------------------
+
+All Plug and Play drivers, protocols, and services meet at a central location
+called the Plug and Play Layer.  This layer is responsible for the exchange of 
+information between PnP drivers and PnP protocols.  Thus it automatically 
+forwards commands to the proper protocol.  This makes writing PnP drivers 
+significantly easier.
+
+The following functions are available from the Plug and Play Layer:
+
+pnp_get_protocol
+  increments the number of uses by one
+
+pnp_put_protocol
+  decrements the number of uses by one
+
+pnp_register_protocol
+  use this to register a new PnP protocol
+
+pnp_unregister_protocol
+  use this function to remove a PnP protocol from the Plug and Play Layer
+
+pnp_register_driver
+  adds a PnP driver to the Plug and Play Layer
+
+  this includes driver model integration
+  returns zero for success or a negative error number for failure; count
+  calls to the .add() method if you need to know how many devices bind to
+  the driver
+
+pnp_unregister_driver
+  removes a PnP driver from the Plug and Play Layer
+
+
+
+Plug and Play Protocols
+-----------------------
+
+This section contains information for PnP protocol developers.
+
+The following Protocols are currently available in the computing world:
+
+- PNPBIOS:
+    used for system devices such as serial and parallel ports.
+- ISAPNP:
+    provides PnP support for the ISA bus
+- ACPI:
+    among its many uses, ACPI provides information about system level
+    devices.
+
+ACPI is meant to replace the PNPBIOS.  It is not currently supported by Linux
+Plug and Play but it is planned to be in the near future.
+
+
+Requirements for a Linux PnP protocol:
+1. the protocol must use EISA IDs
+2. the protocol must inform the PnP Layer of a device's current configuration
+
+- the ability to set resources is optional but preferred.
+
+The following are PnP protocol related functions:
+
+pnp_add_device
+  use this function to add a PnP device to the PnP layer
+
+  only call this function when all wanted values are set in the pnp_dev
+  structure
+
+pnp_init_device
+  call this to initialize the PnP structure
+
+pnp_remove_device
+  call this to remove a device from the Plug and Play Layer.
+  it will fail if the device is still in use.
+  it automatically frees the memory used by the device and related structures
+
+pnp_add_id
+  adds an EISA ID to the list of supported IDs for the specified device
+
+For more information consult the source of a protocol such as
+/drivers/pnp/pnpbios/core.c.
+
+
+
+Linux Plug and Play Drivers
+---------------------------
+
+This section contains information for Linux PnP driver developers.
+
+The New Way
+^^^^^^^^^^^
+
+1. first make a list of supported EISA IDs
+
+   ex::
+
+	static const struct pnp_id pnp_dev_table[] = {
+		/* Standard LPT Printer Port */
+		{.id = "PNP0400", .driver_data = 0},
+		/* ECP Printer Port */
+		{.id = "PNP0401", .driver_data = 0},
+		{.id = ""}
+	};
+
+   Please note that the character 'X' can be used as a wild card in the function
+   portion (last four characters).
+
+   ex::
+
+	/* Unknown PnP modems */
+	{	"PNPCXXX",		UNKNOWN_DEV	},
+
+   Supported PnP card IDs can optionally be defined.
+   ex::
+
+	static const struct pnp_id pnp_card_table[] = {
+		{	"ANYDEVS",		0	},
+		{	"",			0	}
+	};
+
+2. Optionally define probe and remove functions.  It may make sense not to
+   define these functions if the driver already has a reliable method of detecting
+   the resources, such as the parport_pc driver.
+
+   ex::
+
+	static int
+	serial_pnp_probe(struct pnp_dev * dev, const struct pnp_id *card_id, const
+			struct pnp_id *dev_id)
+	{
+	. . .
+
+   ex::
+
+	static void serial_pnp_remove(struct pnp_dev * dev)
+	{
+	. . .
+
+   consult /drivers/serial/8250_pnp.c for more information.
+
+3. create a driver structure
+
+   ex::
+
+	static struct pnp_driver serial_pnp_driver = {
+		.name		= "serial",
+		.card_id_table	= pnp_card_table,
+		.id_table	= pnp_dev_table,
+		.probe		= serial_pnp_probe,
+		.remove		= serial_pnp_remove,
+	};
+
+   * name and id_table cannot be NULL.
+
+4. register the driver
+
+   ex::
+
+	static int __init serial8250_pnp_init(void)
+	{
+		return pnp_register_driver(&serial_pnp_driver);
+	}
+
+The Old Way
+^^^^^^^^^^^
+
+A series of compatibility functions have been created to make it easy to convert
+ISAPNP drivers.  They should serve as a temporary solution only.
+
+They are as follows::
+
+	struct pnp_card *pnp_find_card(unsigned short vendor,
+				       unsigned short device,
+				       struct pnp_card *from)
+
+	struct pnp_dev *pnp_find_dev(struct pnp_card *card,
+				     unsigned short vendor,
+				     unsigned short function,
+				     struct pnp_dev *from)
+
diff --git a/Documentation/admin-guide/rtc.rst b/Documentation/admin-guide/rtc.rst
new file mode 100644
index 000000000000..688c95b11919
--- /dev/null
+++ b/Documentation/admin-guide/rtc.rst
@@ -0,0 +1,140 @@
+=======================================
+Real Time Clock (RTC) Drivers for Linux
+=======================================
+
+When Linux developers talk about a "Real Time Clock", they usually mean
+something that tracks wall clock time and is battery backed so that it
+works even with system power off.  Such clocks will normally not track
+the local time zone or daylight savings time -- unless they dual boot
+with MS-Windows -- but will instead be set to Coordinated Universal Time
+(UTC, formerly "Greenwich Mean Time").
+
+The newest non-PC hardware tends to just count seconds, like the time(2)
+system call reports, but RTCs also very commonly represent time using
+the Gregorian calendar and 24 hour time, as reported by gmtime(3).
+
+Linux has two largely-compatible userspace RTC API families you may
+need to know about:
+
+    *	/dev/rtc ... is the RTC provided by PC compatible systems,
+	so it's not very portable to non-x86 systems.
+
+    *	/dev/rtc0, /dev/rtc1 ... are part of a framework that's
+	supported by a wide variety of RTC chips on all systems.
+
+Programmers need to understand that the PC/AT functionality is not
+always available, and some systems can do much more.  That is, the
+RTCs use the same API to make requests in both RTC frameworks (using
+different filenames of course), but the hardware may not offer the
+same functionality.  For example, not every RTC is hooked up to an
+IRQ, so they can't all issue alarms; and where standard PC RTCs can
+only issue an alarm up to 24 hours in the future, other hardware may
+be able to schedule one any time in the upcoming century.
+
+
+Old PC/AT-Compatible driver:  /dev/rtc
+--------------------------------------
+
+All PCs (even Alpha machines) have a Real Time Clock built into them.
+Usually they are built into the chipset of the computer, but some may
+actually have a Motorola MC146818 (or clone) on the board. This is the
+clock that keeps the date and time while your computer is turned off.
+
+ACPI has standardized that MC146818 functionality, and extended it in
+a few ways (enabling longer alarm periods, and wake-from-hibernate).
+That functionality is NOT exposed in the old driver.
+
+However it can also be used to generate signals from a slow 2Hz to a
+relatively fast 8192Hz, in increments of powers of two. These signals
+are reported by interrupt number 8. (Oh! So *that* is what IRQ 8 is
+for...) It can also function as a 24hr alarm, raising IRQ 8 when the
+alarm goes off. The alarm can also be programmed to only check any
+subset of the three programmable values, meaning that it could be set to
+ring on the 30th second of the 30th minute of every hour, for example.
+The clock can also be set to generate an interrupt upon every clock
+update, thus generating a 1Hz signal.
+
+The interrupts are reported via /dev/rtc (major 10, minor 135, read only
+character device) in the form of an unsigned long. The low byte contains
+the type of interrupt (update-done, alarm-rang, or periodic) that was
+raised, and the remaining bytes contain the number of interrupts since
+the last read.  Status information is reported through the pseudo-file
+/proc/driver/rtc if the /proc filesystem was enabled.  The driver has
+built in locking so that only one process is allowed to have the /dev/rtc
+interface open at a time.
+
+A user process can monitor these interrupts by doing a read(2) or a
+select(2) on /dev/rtc -- either will block/stop the user process until
+the next interrupt is received. This is useful for things like
+reasonably high frequency data acquisition where one doesn't want to
+burn up 100% CPU by polling gettimeofday etc. etc.
+
+At high frequencies, or under high loads, the user process should check
+the number of interrupts received since the last read to determine if
+there has been any interrupt "pileup" so to speak. Just for reference, a
+typical 486-33 running a tight read loop on /dev/rtc will start to suffer
+occasional interrupt pileup (i.e. > 1 IRQ event since last read) for
+frequencies above 1024Hz. So you really should check the high bytes
+of the value you read, especially at frequencies above that of the
+normal timer interrupt, which is 100Hz.
+
+Programming and/or enabling interrupt frequencies greater than 64Hz is
+only allowed by root. This is perhaps a bit conservative, but we don't want
+an evil user generating lots of IRQs on a slow 386sx-16, where it might have
+a negative impact on performance. This 64Hz limit can be changed by writing
+a different value to /proc/sys/dev/rtc/max-user-freq. Note that the
+interrupt handler is only a few lines of code to minimize any possibility
+of this effect.
+
+Also, if the kernel time is synchronized with an external source, the 
+kernel will write the time back to the CMOS clock every 11 minutes. In 
+the process of doing this, the kernel briefly turns off RTC periodic 
+interrupts, so be aware of this if you are doing serious work. If you
+don't synchronize the kernel time with an external source (via ntp or
+whatever) then the kernel will keep its hands off the RTC, allowing you
+exclusive access to the device for your applications.
+
+The alarm and/or interrupt frequency are programmed into the RTC via
+various ioctl(2) calls as listed in ./include/linux/rtc.h
+Rather than write 50 pages describing the ioctl() and so on, it is
+perhaps more useful to include a small test program that demonstrates
+how to use them, and demonstrates the features of the driver. This is
+probably a lot more useful to people interested in writing applications
+that will be using this driver.  See the reference at the end of this document.
+
+(The original /dev/rtc driver was written by Paul Gortmaker.)
+
+
+New portable "RTC Class" drivers:  /dev/rtcN
+--------------------------------------------
+
+Because Linux supports many non-ACPI and non-PC platforms, some of which
+have more than one RTC style clock, it needed a more portable solution
+than expecting a single battery-backed MC146818 clone on every system.
+Accordingly, a new "RTC Class" framework has been defined.  It offers
+three different userspace interfaces:
+
+    *	/dev/rtcN ... much the same as the older /dev/rtc interface
+
+    *	/sys/class/rtc/rtcN ... sysfs attributes support readonly
+	access to some RTC attributes.
+
+    *	/proc/driver/rtc ... the system clock RTC may expose itself
+	using a procfs interface. If there is no RTC for the system clock,
+	rtc0 is used by default. More information is (currently) shown
+	here than through sysfs.
+
+The RTC Class framework supports a wide variety of RTCs, ranging from those
+integrated into embeddable system-on-chip (SOC) processors to discrete chips
+using I2C, SPI, or some other bus to communicate with the host CPU.  There's
+even support for PC-style RTCs ... including the features exposed on newer PCs
+through ACPI.
+
+The new framework also removes the "one RTC per system" restriction.  For
+example, maybe the low-power battery-backed RTC is a discrete I2C chip, but
+a high functionality RTC is integrated into the SOC.  That system might read
+the system clock from the discrete RTC, but use the integrated one for all
+other tasks, because of its greater functionality.
+
+Check out tools/testing/selftests/rtc/rtctest.c for an example usage of the
+ioctl interface.
diff --git a/Documentation/admin-guide/svga.rst b/Documentation/admin-guide/svga.rst
new file mode 100644
index 000000000000..b6c2f9acca92
--- /dev/null
+++ b/Documentation/admin-guide/svga.rst
@@ -0,0 +1,249 @@
+.. include:: <isonum.txt>
+
+=================================
+Video Mode Selection Support 2.13
+=================================
+
+:Copyright: |copy| 1995--1999 Martin Mares, <mj@ucw.cz>
+
+Intro
+~~~~~
+
+This small document describes the "Video Mode Selection" feature which
+allows the use of various special video modes supported by the video BIOS. Due
+to usage of the BIOS, the selection is limited to boot time (before the
+kernel decompression starts) and works only on 80X86 machines.
+
+.. note::
+
+   Short intro for the impatient: Just use vga=ask for the first time,
+   enter ``scan`` on the video mode prompt, pick the mode you want to use,
+   remember its mode ID (the four-digit hexadecimal number) and then
+   set the vga parameter to this number (converted to decimal first).
+
+The video mode to be used is selected by a kernel parameter which can be
+specified in the kernel Makefile (the SVGA_MODE=... line) or by the "vga=..."
+option of LILO (or some other boot loader you use) or by the "vidmode" utility
+(present in standard Linux utility packages). You can use the following values
+of this parameter::
+
+   NORMAL_VGA - Standard 80x25 mode available on all display adapters.
+
+   EXTENDED_VGA	- Standard 8-pixel font mode: 80x43 on EGA, 80x50 on VGA.
+
+   ASK_VGA - Display a video mode menu upon startup (see below).
+
+   0..35 - Menu item number (when you have used the menu to view the list of
+      modes available on your adapter, you can specify the menu item you want
+      to use). 0..9 correspond to "0".."9", 10..35 to "a".."z". Warning: the
+      mode list displayed may vary as the kernel version changes, because the
+      modes are listed in a "first detected -- first displayed" manner. It's
+      better to use absolute mode numbers instead.
+
+   0x.... - Hexadecimal video mode ID (also displayed on the menu, see below
+      for exact meaning of the ID). Warning: rdev and LILO don't support
+      hexadecimal numbers -- you have to convert it to decimal manually.
+
+Menu
+~~~~
+
+The ASK_VGA mode causes the kernel to offer a video mode menu upon
+bootup. It displays a "Press <RETURN> to see video modes available, <SPACE>
+to continue or wait 30 secs" message. If you press <RETURN>, you enter the
+menu, if you press <SPACE> or wait 30 seconds, the kernel will boot up in
+the standard 80x25 mode.
+
+The menu looks like::
+
+	Video adapter: <name-of-detected-video-adapter>
+	Mode:    COLSxROWS:
+	0  0F00  80x25
+	1  0F01  80x50
+	2  0F02  80x43
+	3  0F03  80x26
+	....
+	Enter mode number or ``scan``: <flashing-cursor-here>
+
+<name-of-detected-video-adapter> tells which video adapter Linux detected
+-- it's either a generic adapter name (MDA, CGA, HGC, EGA, VGA, VESA VGA [a VGA
+with VESA-compliant BIOS]) or a chipset name (e.g., Trident). Direct detection
+of chipsets is turned off by default as it's inherently unreliable due to
+absolutely insane PC design.
+
+"0  0F00  80x25" means that the first menu item (the menu items are numbered
+from "0" to "9" and from "a" to "z") is an 80x25 mode with ID=0x0f00 (see the
+next section for a description of mode IDs).
+
+<flashing-cursor-here> encourages you to enter the item number or mode ID
+you wish to set and press <RETURN>. If the computer complains about
+"Unknown mode ID", it is trying to tell you that it isn't possible to set such
+a mode. It's also possible to press only <RETURN> which leaves the current mode.
+
+The mode list usually contains a few basic modes and some VESA modes.  In
+case your chipset has been detected, some chipset-specific modes are shown as
+well (some of these might be missing or unusable on your machine as different
+BIOSes are often shipped with the same card and the mode numbers depend purely
+on the VGA BIOS).
+
+The modes displayed on the menu are partially sorted: The list starts with
+the standard modes (80x25 and 80x50) followed by "special" modes (80x28 and
+80x43), local modes (if the local modes feature is enabled), VESA modes and
+finally SVGA modes for the auto-detected adapter.
+
+If you are not happy with the mode list offered (e.g., if you think your card
+is able to do more), you can enter "scan" instead of item number / mode ID.  The
+program will try to ask the BIOS for all possible video mode numbers and test
+what happens then. The screen will probably flash wildly for some time and
+strange noises will be heard from inside the monitor and so on and then, really
+all consistent video modes supported by your BIOS will appear (plus maybe some
+``ghost modes``). If you are afraid this could damage your monitor, don't use
+this function.
+
+After scanning, the mode ordering is a bit different: the auto-detected SVGA
+modes are not listed at all and the modes revealed by ``scan`` are shown before
+all VESA modes.
+
+Mode IDs
+~~~~~~~~
+
+Because of the complexity of all the video stuff, the video mode IDs
+used here are also a bit complex. A video mode ID is a 16-bit number usually
+expressed in a hexadecimal notation (starting with "0x"). You can set a mode
+by entering its ID directly if you know it, even if it isn't shown on the menu.
+
+The ID numbers can be divided into the following regions::
+
+   0x0000 to 0x00ff - menu item references. 0x0000 is the first item. Don't use
+	outside the menu as this can change from boot to boot (especially if you
+	have used the ``scan`` feature).
+
+   0x0100 to 0x017f - standard BIOS modes. The ID is a BIOS video mode number
+	(as presented to INT 10, function 00) increased by 0x0100.
+
+   0x0200 to 0x08ff - VESA BIOS modes. The ID is a VESA mode ID increased by
+	0x0100. All VESA modes should be autodetected and shown on the menu.
+
+   0x0900 to 0x09ff - Video7 special modes. Set by calling INT 0x10, AX=0x6f05.
+	(Usually 940=80x43, 941=132x25, 942=132x44, 943=80x60, 944=100x60,
+	945=132x28 for the standard Video7 BIOS)
+
+   0x0f00 to 0x0fff - special modes (they are set by various tricks -- usually
+	by modifying one of the standard modes). Currently available:
+	0x0f00	standard 80x25, don't reset mode if already set (=FFFF)
+	0x0f01	standard with 8-point font: 80x43 on EGA, 80x50 on VGA
+	0x0f02	VGA 80x43 (VGA switched to 350 scanlines with a 8-point font)
+	0x0f03	VGA 80x28 (standard VGA scans, but 14-point font)
+	0x0f04	leave current video mode
+	0x0f05	VGA 80x30 (480 scans, 16-point font)
+	0x0f06	VGA 80x34 (480 scans, 14-point font)
+	0x0f07	VGA 80x60 (480 scans, 8-point font)
+	0x0f08	Graphics hack (see the VIDEO_GFX_HACK paragraph below)
+
+   0x1000 to 0x7fff - modes specified by resolution. The code has a "0xRRCC"
+	form where RR is a number of rows and CC is a number of columns.
+	E.g., 0x1950 corresponds to a 80x25 mode, 0x2b84 to 132x43 etc.
+	This is the only fully portable way to refer to a non-standard mode,
+	but it relies on the mode being found and displayed on the menu
+	(remember that mode scanning is not done automatically).
+
+   0xff00 to 0xffff - aliases for backward compatibility:
+	0xffff	equivalent to 0x0f00 (standard 80x25)
+	0xfffe	equivalent to 0x0f01 (EGA 80x43 or VGA 80x50)
+
+If you add 0x8000 to the mode ID, the program will try to recalculate
+vertical display timing according to mode parameters, which can be used to
+eliminate some annoying bugs of certain VGA BIOSes (usually those used for
+cards with S3 chipsets and old Cirrus Logic BIOSes) -- mainly extra lines at the
+end of the display.
+
+Options
+~~~~~~~
+
+Build options for arch/x86/boot/* are selected by the kernel kconfig
+utility and the kernel .config file.
+
+VIDEO_GFX_HACK - includes special hack for setting of graphics modes
+to be used later by special drivers.
+Allows setting _any_ BIOS mode, including graphics ones, and forcing a specific
+text screen resolution instead of peeking it from BIOS variables. Don't use
+unless you think you know what you're doing. To activate this setup, use
+mode number 0x0f08 (see the Mode IDs section above).
+
+Still doesn't work?
+~~~~~~~~~~~~~~~~~~~
+
+When the mode detection doesn't work (e.g., the mode list is incorrect or
+the machine hangs instead of displaying the menu), try to switch off some of
+the configuration options listed under "Options". If it fails, you can still use
+your kernel with the video mode set directly via the kernel parameter.
+
+In either case, please send me a bug report containing what _exactly_
+happens and how the configuration switches affect the behaviour of the bug.
+
+If you start Linux from M$-DOS, you might also use some DOS tools for
+video mode setting. In this case, you must specify the 0x0f04 mode ("leave
+current settings") to Linux, because if you don't and you use any non-standard
+mode, Linux will switch to 80x25 automatically.
+
+If you set some extended mode and there's one or more extra lines on the
+bottom of the display containing already scrolled-out text, your VGA BIOS
+contains the most common video BIOS bug called "incorrect vertical display
+end setting". Adding 0x8000 to the mode ID might fix the problem. Unfortunately,
+this must be done manually -- no autodetection mechanisms are available.
+
+History
+~~~~~~~
+
+=============== ================================================================
+1.0 (??-Nov-95)	First version supporting all adapters supported by the old
+		setup.S + Cirrus Logic 54XX. Present in some 1.3.4? kernels
+		and then removed due to instability on some machines.
+2.0 (28-Jan-96)	Rewritten from scratch. Cirrus Logic 64XX support added, almost
+		everything is configurable, the VESA support should be much more
+		stable, explicit mode numbering allowed, "scan" implemented etc.
+2.1 (30-Jan-96) VESA modes moved to 0x200-0x3ff. Mode selection by resolution
+		supported. Few bugs fixed. VESA modes are listed prior to
+		modes supplied by SVGA autodetection as they are more reliable.
+		CLGD autodetect works better. Doesn't depend on 80x25 being
+		active when started. Scanning fixed. 80x43 (any VGA) added.
+		Code cleaned up.
+2.2 (01-Feb-96)	EGA 80x43 fixed. VESA extended to 0x200-0x4ff (non-standard 02XX
+		VESA modes work now). Display end bug workaround supported.
+		Special modes renumbered to allow adding of the "recalculate"
+		flag, 0xffff and 0xfffe became aliases instead of real IDs.
+		Screen contents retained during mode changes.
+2.3 (15-Mar-96)	Changed to work with 1.3.74 kernel.
+2.4 (18-Mar-96)	Added patches by Hans Lermen fixing a memory overwrite problem
+		with some boot loaders. Memory management rewritten to reflect
+		these changes. Unfortunately, screen contents retaining works
+		only with some loaders now.
+		Added a Tseng 132x60 mode.
+2.5 (19-Mar-96)	Fixed a VESA mode scanning bug introduced in 2.4.
+2.6 (25-Mar-96)	Some VESA BIOS errors not reported -- it fixes error reports on
+		several cards with broken VESA code (e.g., ATI VGA).
+2.7 (09-Apr-96)	- Accepted all VESA modes in range 0x100 to 0x7ff, because some
+		  cards use very strange mode numbers.
+		- Added Realtek VGA modes (thanks to Gonzalo Tornaria).
+		- Hardware testing order slightly changed, tests based on ROM
+		  contents done as first.
+		- Added support for special Video7 mode switching functions
+		  (thanks to Tom Vander Aa).
+		- Added 480-scanline modes (especially useful for notebooks,
+		  original version written by hhanemaa@cs.ruu.nl, patched by
+		  Jeff Chua, rewritten by me).
+		- Screen store/restore fixed.
+2.8 (14-Apr-96) - Previous release was not compilable without CONFIG_VIDEO_SVGA.
+		- Better recognition of text modes during mode scan.
+2.9 (12-May-96)	- Ignored VESA modes 0x80 - 0xff (more VESA BIOS bugs!)
+2.10(11-Nov-96) - The whole thing made optional.
+		- Added the CONFIG_VIDEO_400_HACK switch.
+		- Added the CONFIG_VIDEO_GFX_HACK switch.
+		- Code cleanup.
+2.11(03-May-97) - Yet another cleanup, now including also the documentation.
+		- Direct testing of SVGA adapters turned off by default, ``scan``
+		  offered explicitly on the prompt line.
+		- Removed the doc section describing adding of new probing
+		  functions as I try to get rid of _all_ hardware probing here.
+2.12(25-May-98) Added support for VESA frame buffer graphics.
+2.13(14-May-99) Minor documentation fixes.
+=============== ================================================================
diff --git a/Documentation/admin-guide/sysctl/kernel.rst b/Documentation/admin-guide/sysctl/kernel.rst
index a0c1d4ce403a..032c7cd3cede 100644
--- a/Documentation/admin-guide/sysctl/kernel.rst
+++ b/Documentation/admin-guide/sysctl/kernel.rst
@@ -327,7 +327,7 @@ when a hard lockup is detected.
    0 - don't panic on hard lockup
    1 - panic on hard lockup
 
-See Documentation/lockup-watchdogs.txt for more information.  This can
+See Documentation/admin-guide/lockup-watchdogs.rst for more information.  This can
 also be set using the nmi_watchdog kernel parameter.
 
 
diff --git a/Documentation/admin-guide/video-output.rst b/Documentation/admin-guide/video-output.rst
new file mode 100644
index 000000000000..56d6fa2e2368
--- /dev/null
+++ b/Documentation/admin-guide/video-output.rst
@@ -0,0 +1,34 @@
+Video Output Switcher Control
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+2006 luming.yu@intel.com
+
+The output sysfs class driver provides an abstract video output layer that
+can be used to hook platform-specific methods to enable/disable a video output
+device through a common sysfs interface. For example, on my IBM ThinkPad T42
+laptop, the ACPI video driver registered its output devices and read/write
+methods for 'state' with the output sysfs class. The user interface under sysfs is::
+
+  linux:/sys/class/video_output # tree .
+  .
+  |-- CRT0
+  |   |-- device -> ../../../devices/pci0000:00/0000:00:01.0
+  |   |-- state
+  |   |-- subsystem -> ../../../class/video_output
+  |   `-- uevent
+  |-- DVI0
+  |   |-- device -> ../../../devices/pci0000:00/0000:00:01.0
+  |   |-- state
+  |   |-- subsystem -> ../../../class/video_output
+  |   `-- uevent
+  |-- LCD0
+  |   |-- device -> ../../../devices/pci0000:00/0000:00:01.0
+  |   |-- state
+  |   |-- subsystem -> ../../../class/video_output
+  |   `-- uevent
+  `-- TV0
+     |-- device -> ../../../devices/pci0000:00/0000:00:01.0
+     |-- state
+     |-- subsystem -> ../../../class/video_output
+     `-- uevent
+
diff --git a/Documentation/auxdisplay/lcd-panel-cgram.rst b/Documentation/auxdisplay/lcd-panel-cgram.rst
deleted file mode 100644
index dfef50286018..000000000000
--- a/Documentation/auxdisplay/lcd-panel-cgram.rst
+++ /dev/null
@@ -1,29 +0,0 @@
-:orphan:
-
-======================================
-Parallel port LCD/Keypad Panel support
-======================================
-
-Some LCDs allow you to define up to 8 characters, mapped to ASCII
-characters 0 to 7. The escape code to define a new character is
-'\e[LG' followed by one digit from 0 to 7, representing the character
-number, and up to 8 couples of hex digits terminated by a semi-colon
-(';'). Each couple of digits represents a line, with 1-bits for each
-illuminated pixel with LSB on the right. Lines are numbered from the
-top of the character to the bottom. On a 5x7 matrix, only the 5 lower
-bits of the 7 first bytes are used for each character. If the string
-is incomplete, only complete lines will be redefined. Here are some
-examples::
-
-  printf "\e[LG0010101050D1F0C04;"  => 0 = [enter]
-  printf "\e[LG1040E1F0000000000;"  => 1 = [up]
-  printf "\e[LG2000000001F0E0400;"  => 2 = [down]
-  printf "\e[LG3040E1F001F0E0400;"  => 3 = [up-down]
-  printf "\e[LG40002060E1E0E0602;"  => 4 = [left]
-  printf "\e[LG500080C0E0F0E0C08;"  => 5 = [right]
-  printf "\e[LG60016051516141400;"  => 6 = "IP"
-
-  printf "\e[LG00103071F1F070301;"  => big speaker
-  printf "\e[LG00002061E1E060200;"  => small speaker
-
-Willy
diff --git a/Documentation/btmrvl.txt b/Documentation/btmrvl.txt
deleted file mode 100644
index ec57740ead0c..000000000000
--- a/Documentation/btmrvl.txt
+++ /dev/null
@@ -1,124 +0,0 @@
-=============
-btmrvl driver
-=============
-
-All commands are used via debugfs interface.
-
-Set/get driver configurations
-=============================
-
-Path:	/debug/btmrvl/config/
-
-gpiogap=[n], hscfgcmd
-	These commands are used to configure the host sleep parameters::
-	bit 8:0  -- Gap
-	bit 16:8 -- GPIO
-
-	where GPIO is the pin number of GPIO used to wake up the host.
-	It could be any valid GPIO pin# (e.g. 0-7) or 0xff (SDIO interface
-	wakeup will be used instead).
-
-	where Gap is the gap in milli seconds between wakeup signal and
-	wakeup event, or 0xff for special host sleep setting.
-
-	Usage::
-
-		# Use SDIO interface to wake up the host and set GAP to 0x80:
-		echo 0xff80 > /debug/btmrvl/config/gpiogap
-		echo 1 > /debug/btmrvl/config/hscfgcmd
-
-		# Use GPIO pin #3 to wake up the host and set GAP to 0xff:
-		echo 0x03ff >  /debug/btmrvl/config/gpiogap
-		echo 1 > /debug/btmrvl/config/hscfgcmd
-
-psmode=[n], pscmd
-	These commands are used to enable/disable auto sleep mode
-
-	where the option is::
-
-			1 	-- Enable auto sleep mode
-			0 	-- Disable auto sleep mode
-
-	Usage::
-
-		# Enable auto sleep mode
-		echo 1 > /debug/btmrvl/config/psmode
-		echo 1 > /debug/btmrvl/config/pscmd
-
-		# Disable auto sleep mode
-		echo 0 > /debug/btmrvl/config/psmode
-		echo 1 > /debug/btmrvl/config/pscmd
-
-
-hsmode=[n], hscmd
-	These commands are used to enable host sleep or wake up firmware
-
-	where the option is::
-
-			1	-- Enable host sleep
-			0	-- Wake up firmware
-
-	Usage::
-
-		# Enable host sleep
-		echo 1 > /debug/btmrvl/config/hsmode
-		echo 1 > /debug/btmrvl/config/hscmd
-
-		# Wake up firmware
-		echo 0 > /debug/btmrvl/config/hsmode
-		echo 1 > /debug/btmrvl/config/hscmd
-
-
-Get driver status
-=================
-
-Path:	/debug/btmrvl/status/
-
-Usage::
-
-	cat /debug/btmrvl/status/<args>
-
-where the args are:
-
-curpsmode
-	This command displays current auto sleep status.
-
-psstate
-	This command display the power save state.
-
-hsstate
-	This command display the host sleep state.
-
-txdnldrdy
-	This command displays the value of Tx download ready flag.
-
-Issuing a raw hci command
-=========================
-
-Use hcitool to issue raw hci command, refer to hcitool manual
-
-Usage::
-
-	Hcitool cmd <ogf> <ocf> [Parameters]
-
-Interface Control Command::
-
-	hcitool cmd 0x3f 0x5b 0xf5 0x01 0x00    --Enable All interface
-	hcitool cmd 0x3f 0x5b 0xf5 0x01 0x01    --Enable Wlan interface
-	hcitool cmd 0x3f 0x5b 0xf5 0x01 0x02    --Enable BT interface
-	hcitool cmd 0x3f 0x5b 0xf5 0x00 0x00    --Disable All interface
-	hcitool cmd 0x3f 0x5b 0xf5 0x00 0x01    --Disable Wlan interface
-	hcitool cmd 0x3f 0x5b 0xf5 0x00 0x02    --Disable BT interface
-
-SD8688 firmware
-===============
-
-Images:
-
-- /lib/firmware/sd8688_helper.bin
-- /lib/firmware/sd8688.bin
-
-
-The images can be downloaded from:
-
-git.infradead.org/users/dwmw2/linux-firmware.git/libertas/
diff --git a/Documentation/clearing-warn-once.txt b/Documentation/clearing-warn-once.txt
deleted file mode 100644
index 211fd926cf00..000000000000
--- a/Documentation/clearing-warn-once.txt
+++ /dev/null
@@ -1,9 +0,0 @@
-Clearing WARN_ONCE
-------------------
-
-WARN_ONCE / WARN_ON_ONCE / printk_once only emit a message once.
-
-echo 1 > /sys/kernel/debug/clear_warn_once
-
-clears the state and allows the warnings to print once again.
-This can be useful after test suite runs to reproduce problems.
diff --git a/Documentation/cma/debugfs.rst b/Documentation/cma/debugfs.rst
deleted file mode 100644
index 518fe401b5ee..000000000000
--- a/Documentation/cma/debugfs.rst
+++ /dev/null
@@ -1,27 +0,0 @@
-:orphan:
-
-=====================
-CMA Debugfs Interface
-=====================
-
-The CMA debugfs interface is useful to retrieve basic information out of the
-different CMA areas and to test allocation/release in each of the areas.
-
-Each CMA zone represents a directory under <debugfs>/cma/, indexed by the
-kernel's CMA index. So the first CMA zone would be:
-
-	<debugfs>/cma/cma-0
-
-The structure of the files created under that directory is as follows:
-
- - [RO] base_pfn: The base PFN (Page Frame Number) of the zone.
- - [RO] count: Amount of memory in the CMA area.
- - [RO] order_per_bit: Order of pages represented by one bit.
- - [RO] bitmap: The bitmap of page states in the zone.
- - [WO] alloc: Allocate N pages from that CMA area. For example::
-
-	echo 5 > <debugfs>/cma/cma-2/alloc
-
-would try to allocate 5 pages from the cma-2 area.
-
- - [WO] free: Free N pages from that CMA area, similar to the above.
diff --git a/Documentation/cpu-load.txt b/Documentation/cpu-load.txt
deleted file mode 100644
index 2d01ce43d2a2..000000000000
--- a/Documentation/cpu-load.txt
+++ /dev/null
@@ -1,114 +0,0 @@
-========
-CPU load
-========
-
-Linux exports various bits of information via ``/proc/stat`` and
-``/proc/uptime`` that userland tools, such as top(1), use to calculate
-the average time system spent in a particular state, for example::
-
-    $ iostat
-    Linux 2.6.18.3-exp (linmac)     02/20/2007
-
-    avg-cpu:  %user   %nice %system %iowait  %steal   %idle
-              10.01    0.00    2.92    5.44    0.00   81.63
-
-    ...
-
-Here the system thinks that over the default sampling period the
-system spent 10.01% of the time doing work in user space, 2.92% in the
-kernel, and was overall 81.63% of the time idle.
-
-In most cases the ``/proc/stat``	 information reflects the reality quite
-closely, however due to the nature of how/when the kernel collects
-this data sometimes it can not be trusted at all.
-
-So how is this information collected?  Whenever timer interrupt is
-signalled the kernel looks what kind of task was running at this
-moment and increments the counter that corresponds to this tasks
-kind/state.  The problem with this is that the system could have
-switched between various states multiple times between two timer
-interrupts yet the counter is incremented only for the last state.
-
-
-Example
--------
-
-If we imagine the system with one task that periodically burns cycles
-in the following manner::
-
-     time line between two timer interrupts
-    |--------------------------------------|
-     ^                                    ^
-     |_ something begins working          |
-                                          |_ something goes to sleep
-                                         (only to be awaken quite soon)
-
-In the above situation the system will be 0% loaded according to the
-``/proc/stat`` (since the timer interrupt will always happen when the
-system is executing the idle handler), but in reality the load is
-closer to 99%.
-
-One can imagine many more situations where this behavior of the kernel
-will lead to quite erratic information inside ``/proc/stat``::
-
-
-	/* gcc -o hog smallhog.c */
-	#include <time.h>
-	#include <limits.h>
-	#include <signal.h>
-	#include <sys/time.h>
-	#define HIST 10
-
-	static volatile sig_atomic_t stop;
-
-	static void sighandler (int signr)
-	{
-	(void) signr;
-	stop = 1;
-	}
-	static unsigned long hog (unsigned long niters)
-	{
-	stop = 0;
-	while (!stop && --niters);
-	return niters;
-	}
-	int main (void)
-	{
-	int i;
-	struct itimerval it = { .it_interval = { .tv_sec = 0, .tv_usec = 1 },
-				.it_value = { .tv_sec = 0, .tv_usec = 1 } };
-	sigset_t set;
-	unsigned long v[HIST];
-	double tmp = 0.0;
-	unsigned long n;
-	signal (SIGALRM, &sighandler);
-	setitimer (ITIMER_REAL, &it, NULL);
-
-	hog (ULONG_MAX);
-	for (i = 0; i < HIST; ++i) v[i] = ULONG_MAX - hog (ULONG_MAX);
-	for (i = 0; i < HIST; ++i) tmp += v[i];
-	tmp /= HIST;
-	n = tmp - (tmp / 3.0);
-
-	sigemptyset (&set);
-	sigaddset (&set, SIGALRM);
-
-	for (;;) {
-		hog (n);
-		sigwait (&set, &i);
-	}
-	return 0;
-	}
-
-
-References
-----------
-
-- http://lkml.org/lkml/2007/2/12/6
-- Documentation/filesystems/proc.txt (1.8)
-
-
-Thanks
-------
-
-Con Kolivas, Pavel Machek
diff --git a/Documentation/cputopology.txt b/Documentation/cputopology.txt
deleted file mode 100644
index b90dafcc8237..000000000000
--- a/Documentation/cputopology.txt
+++ /dev/null
@@ -1,177 +0,0 @@
-===========================================
-How CPU topology info is exported via sysfs
-===========================================
-
-Export CPU topology info via sysfs. Items (attributes) are similar
-to /proc/cpuinfo output of some architectures.  They reside in
-/sys/devices/system/cpu/cpuX/topology/:
-
-physical_package_id:
-
-	physical package id of cpuX. Typically corresponds to a physical
-	socket number, but the actual value is architecture and platform
-	dependent.
-
-die_id:
-
-	the CPU die ID of cpuX. Typically it is the hardware platform's
-	identifier (rather than the kernel's).  The actual value is
-	architecture and platform dependent.
-
-core_id:
-
-	the CPU core ID of cpuX. Typically it is the hardware platform's
-	identifier (rather than the kernel's).  The actual value is
-	architecture and platform dependent.
-
-book_id:
-
-	the book ID of cpuX. Typically it is the hardware platform's
-	identifier (rather than the kernel's).	The actual value is
-	architecture and platform dependent.
-
-drawer_id:
-
-	the drawer ID of cpuX. Typically it is the hardware platform's
-	identifier (rather than the kernel's).	The actual value is
-	architecture and platform dependent.
-
-core_cpus:
-
-	internal kernel map of CPUs within the same core.
-	(deprecated name: "thread_siblings")
-
-core_cpus_list:
-
-	human-readable list of CPUs within the same core.
-	(deprecated name: "thread_siblings_list");
-
-package_cpus:
-
-	internal kernel map of the CPUs sharing the same physical_package_id.
-	(deprecated name: "core_siblings")
-
-package_cpus_list:
-
-	human-readable list of CPUs sharing the same physical_package_id.
-	(deprecated name: "core_siblings_list")
-
-die_cpus:
-
-	internal kernel map of CPUs within the same die.
-
-die_cpus_list:
-
-	human-readable list of CPUs within the same die.
-
-book_siblings:
-
-	internal kernel map of cpuX's hardware threads within the same
-	book_id.
-
-book_siblings_list:
-
-	human-readable list of cpuX's hardware threads within the same
-	book_id.
-
-drawer_siblings:
-
-	internal kernel map of cpuX's hardware threads within the same
-	drawer_id.
-
-drawer_siblings_list:
-
-	human-readable list of cpuX's hardware threads within the same
-	drawer_id.
-
-Architecture-neutral, drivers/base/topology.c, exports these attributes.
-However, the book and drawer related sysfs files will only be created if
-CONFIG_SCHED_BOOK and CONFIG_SCHED_DRAWER are selected, respectively.
-
-CONFIG_SCHED_BOOK and CONFIG_SCHED_DRAWER are currently only used on s390,
-where they reflect the cpu and cache hierarchy.
-
-For an architecture to support this feature, it must define some of
-these macros in include/asm-XXX/topology.h::
-
-	#define topology_physical_package_id(cpu)
-	#define topology_die_id(cpu)
-	#define topology_core_id(cpu)
-	#define topology_book_id(cpu)
-	#define topology_drawer_id(cpu)
-	#define topology_sibling_cpumask(cpu)
-	#define topology_core_cpumask(cpu)
-	#define topology_die_cpumask(cpu)
-	#define topology_book_cpumask(cpu)
-	#define topology_drawer_cpumask(cpu)
-
-The type of ``**_id macros`` is int.
-The type of ``**_cpumask macros`` is ``(const) struct cpumask *``. The latter
-correspond with appropriate ``**_siblings`` sysfs attributes (except for
-topology_sibling_cpumask() which corresponds with thread_siblings).
-
-To be consistent on all architectures, include/linux/topology.h
-provides default definitions for any of the above macros that are
-not defined by include/asm-XXX/topology.h:
-
-1) topology_physical_package_id: -1
-2) topology_die_id: -1
-3) topology_core_id: 0
-4) topology_sibling_cpumask: just the given CPU
-5) topology_core_cpumask: just the given CPU
-6) topology_die_cpumask: just the given CPU
-
-For architectures that don't support books (CONFIG_SCHED_BOOK) there are no
-default definitions for topology_book_id() and topology_book_cpumask().
-For architectures that don't support drawers (CONFIG_SCHED_DRAWER) there are
-no default definitions for topology_drawer_id() and topology_drawer_cpumask().
-
-Additionally, CPU topology information is provided under
-/sys/devices/system/cpu and includes these files.  The internal
-source for the output is in brackets ("[]").
-
-    =========== ==========================================================
-    kernel_max: the maximum CPU index allowed by the kernel configuration.
-		[NR_CPUS-1]
-
-    offline:	CPUs that are not online because they have been
-		HOTPLUGGED off (see cpu-hotplug.txt) or exceed the limit
-		of CPUs allowed by the kernel configuration (kernel_max
-		above). [~cpu_online_mask + cpus >= NR_CPUS]
-
-    online:	CPUs that are online and being scheduled [cpu_online_mask]
-
-    possible:	CPUs that have been allocated resources and can be
-		brought online if they are present. [cpu_possible_mask]
-
-    present:	CPUs that have been identified as being present in the
-		system. [cpu_present_mask]
-    =========== ==========================================================
-
-The format for the above output is compatible with cpulist_parse()
-[see <linux/cpumask.h>].  Some examples follow.
-
-In this example, there are 64 CPUs in the system but cpus 32-63 exceed
-the kernel max which is limited to 0..31 by the NR_CPUS config option
-being 32.  Note also that CPUs 2 and 4-31 are not online but could be
-brought online as they are both present and possible::
-
-     kernel_max: 31
-        offline: 2,4-31,32-63
-         online: 0-1,3
-       possible: 0-31
-        present: 0-31
-
-In this example, the NR_CPUS config option is 128, but the kernel was
-started with possible_cpus=144.  There are 4 CPUs in the system and cpu2
-was manually taken offline (and is the only CPU that can be brought
-online.)::
-
-     kernel_max: 127
-        offline: 2,4-127,128-143
-         online: 0-1,3
-       possible: 0-127
-        present: 0-3
-
-See cpu-hotplug.txt for the possible_cpus=NUM kernel start parameter
-as well as more information on the various cpumasks.
diff --git a/Documentation/efi-stub.txt b/Documentation/efi-stub.txt
deleted file mode 100644
index 833edb0d0bc4..000000000000
--- a/Documentation/efi-stub.txt
+++ /dev/null
@@ -1,100 +0,0 @@
-=================
-The EFI Boot Stub
-=================
-
-On the x86 and ARM platforms, a kernel zImage/bzImage can masquerade
-as a PE/COFF image, thereby convincing EFI firmware loaders to load
-it as an EFI executable. The code that modifies the bzImage header,
-along with the EFI-specific entry point that the firmware loader
-jumps to are collectively known as the "EFI boot stub", and live in
-arch/x86/boot/header.S and arch/x86/boot/compressed/eboot.c,
-respectively. For ARM the EFI stub is implemented in
-arch/arm/boot/compressed/efi-header.S and
-arch/arm/boot/compressed/efi-stub.c. EFI stub code that is shared
-between architectures is in drivers/firmware/efi/libstub.
-
-For arm64, there is no compressed kernel support, so the Image itself
-masquerades as a PE/COFF image and the EFI stub is linked into the
-kernel. The arm64 EFI stub lives in arch/arm64/kernel/efi-entry.S
-and drivers/firmware/efi/libstub/arm64-stub.c.
-
-By using the EFI boot stub it's possible to boot a Linux kernel
-without the use of a conventional EFI boot loader, such as grub or
-elilo. Since the EFI boot stub performs the jobs of a boot loader, in
-a certain sense it *IS* the boot loader.
-
-The EFI boot stub is enabled with the CONFIG_EFI_STUB kernel option.
-
-
-How to install bzImage.efi
---------------------------
-
-The bzImage located in arch/x86/boot/bzImage must be copied to the EFI
-System Partition (ESP) and renamed with the extension ".efi". Without
-the extension the EFI firmware loader will refuse to execute it. It's
-not possible to execute bzImage.efi from the usual Linux file systems
-because EFI firmware doesn't have support for them. For ARM the
-arch/arm/boot/zImage should be copied to the system partition, and it
-may not need to be renamed. Similarly for arm64, arch/arm64/boot/Image
-should be copied but not necessarily renamed.
-
-
-Passing kernel parameters from the EFI shell
---------------------------------------------
-
-Arguments to the kernel can be passed after bzImage.efi, e.g.::
-
-	fs0:> bzImage.efi console=ttyS0 root=/dev/sda4
-
-
-The "initrd=" option
---------------------
-
-Like most boot loaders, the EFI stub allows the user to specify
-multiple initrd files using the "initrd=" option. This is the only EFI
-stub-specific command line parameter, everything else is passed to the
-kernel when it boots.
-
-The path to the initrd file must be an absolute path from the
-beginning of the ESP, relative path names do not work. Also, the path
-is an EFI-style path and directory elements must be separated with
-backslashes (\). For example, given the following directory layout::
-
-  fs0:>
-	Kernels\
-			bzImage.efi
-			initrd-large.img
-
-	Ramdisks\
-			initrd-small.img
-			initrd-medium.img
-
-to boot with the initrd-large.img file if the current working
-directory is fs0:\Kernels, the following command must be used::
-
-	fs0:\Kernels> bzImage.efi initrd=\Kernels\initrd-large.img
-
-Notice how bzImage.efi can be specified with a relative path. That's
-because the image we're executing is interpreted by the EFI shell,
-which understands relative paths, whereas the rest of the command line
-is passed to bzImage.efi.
-
-
-The "dtb=" option
------------------
-
-For the ARM and arm64 architectures, a device tree must be provided to
-the kernel. Normally firmware shall supply the device tree via the
-EFI CONFIGURATION TABLE. However, the "dtb=" command line option can
-be used to override the firmware supplied device tree, or to supply
-one when firmware is unable to.
-
-Please note: Firmware adds runtime configuration information to the
-device tree before booting the kernel. If dtb= is used to override
-the device tree, then any runtime data provided by firmware will be
-lost. The dtb= option should only be used either as a debug tool, or
-as a last resort when a device tree is not provided in the EFI
-CONFIGURATION TABLE.
-
-"dtb=" is processed in the same manner as the "initrd=" option that is
-described above.
diff --git a/Documentation/fb/vesafb.rst b/Documentation/fb/vesafb.rst
index 2ed0dfb661cf..6821c87b7893 100644
--- a/Documentation/fb/vesafb.rst
+++ b/Documentation/fb/vesafb.rst
@@ -30,7 +30,7 @@ How to use it?
 ==============
 
 Switching modes is done using the vga=... boot parameter.  Read
-Documentation/svga.txt for details.
+Documentation/admin-guide/svga.rst for details.
 
 You should compile in both vgacon (for text mode) and vesafb (for
 graphics mode). Which of them takes over the console depends on
diff --git a/Documentation/highuid.txt b/Documentation/highuid.txt
deleted file mode 100644
index 6ee70465c0ea..000000000000
--- a/Documentation/highuid.txt
+++ /dev/null
@@ -1,80 +0,0 @@
-===================================================
-Notes on the change from 16-bit UIDs to 32-bit UIDs
-===================================================
-
-:Author: Chris Wing <wingc@umich.edu>
-:Last updated: January 11, 2000
-
-- kernel code MUST take into account __kernel_uid_t and __kernel_uid32_t
-  when communicating between user and kernel space in an ioctl or data
-  structure.
-
-- kernel code should use uid_t and gid_t in kernel-private structures and
-  code.
-
-What's left to be done for 32-bit UIDs on all Linux architectures:
-
-- Disk quotas have an interesting limitation that is not related to the
-  maximum UID/GID. They are limited by the maximum file size on the
-  underlying filesystem, because quota records are written at offsets
-  corresponding to the UID in question.
-  Further investigation is needed to see if the quota system can cope
-  properly with huge UIDs. If it can deal with 64-bit file offsets on all 
-  architectures, this should not be a problem.
-
-- Decide whether or not to keep backwards compatibility with the system
-  accounting file, or if we should break it as the comments suggest
-  (currently, the old 16-bit UID and GID are still written to disk, and
-  part of the former pad space is used to store separate 32-bit UID and
-  GID)
-
-- Need to validate that OS emulation calls the 16-bit UID
-  compatibility syscalls, if the OS being emulated used 16-bit UIDs, or
-  uses the 32-bit UID system calls properly otherwise.
-
-  This affects at least:
-
-	- iBCS on Intel
-
-	- sparc32 emulation on sparc64
-	  (need to support whatever new 32-bit UID system calls are added to
-	  sparc32)
-
-- Validate that all filesystems behave properly.
-
-  At present, 32-bit UIDs _should_ work for:
-
-	- ext2
-	- ufs
-	- isofs
-	- nfs
-	- coda
-	- udf
-
-  Ioctl() fixups have been made for:
-
-	- ncpfs
-	- smbfs
-
-  Filesystems with simple fixups to prevent 16-bit UID wraparound:
-
-	- minix
-	- sysv
-	- qnx4
-
-  Other filesystems have not been checked yet.
-
-- The ncpfs and smbfs filesystems cannot presently use 32-bit UIDs in
-  all ioctl()s. Some new ioctl()s have been added with 32-bit UIDs, but
-  more are needed. (as well as new user<->kernel data structures)
-
-- The ELF core dump format only supports 16-bit UIDs on arm, i386, m68k,
-  sh, and sparc32. Fixing this is probably not that important, but would
-  require adding a new ELF section.
-
-- The ioctl()s used to control the in-kernel NFS server only support
-  16-bit UIDs on arm, i386, m68k, sh, and sparc32.
-
-- make sure that the UID mapping feature of AX25 networking works properly
-  (it should be safe because it's always used a 32-bit integer to
-  communicate between user and kernel)
diff --git a/Documentation/hw_random.txt b/Documentation/hw_random.txt
deleted file mode 100644
index 121de96e395e..000000000000
--- a/Documentation/hw_random.txt
+++ /dev/null
@@ -1,105 +0,0 @@
-==========================================================
-Linux support for random number generator in i8xx chipsets
-==========================================================
-
-Introduction
-============
-
-The hw_random framework is software that makes use of a
-special hardware feature on your CPU or motherboard,
-a Random Number Generator (RNG).  The software has two parts:
-a core providing the /dev/hwrng character device and its
-sysfs support, plus a hardware-specific driver that plugs
-into that core.
-
-To make the most effective use of these mechanisms, you
-should download the support software as well.  Download the
-latest version of the "rng-tools" package from the
-hw_random driver's official Web site:
-
-	http://sourceforge.net/projects/gkernel/
-
-Those tools use /dev/hwrng to fill the kernel entropy pool,
-which is used internally and exported by the /dev/urandom and
-/dev/random special files.
-
-Theory of operation
-===================
-
-CHARACTER DEVICE.  Using the standard open()
-and read() system calls, you can read random data from
-the hardware RNG device.  This data is NOT CHECKED by any
-fitness tests, and could potentially be bogus (if the
-hardware is faulty or has been tampered with).  Data is only
-output if the hardware "has-data" flag is set, but nevertheless
-a security-conscious person would run fitness tests on the
-data before assuming it is truly random.
-
-The rng-tools package uses such tests in "rngd", and lets you
-run them by hand with a "rngtest" utility.
-
-/dev/hwrng is char device major 10, minor 183.
-
-CLASS DEVICE.  There is a /sys/class/misc/hw_random node with
-two unique attributes, "rng_available" and "rng_current".  The
-"rng_available" attribute lists the hardware-specific drivers
-available, while "rng_current" lists the one which is currently
-connected to /dev/hwrng.  If your system has more than one
-RNG available, you may change the one used by writing a name from
-the list in "rng_available" into "rng_current".
-
-==========================================================================
-
-
-Hardware driver for Intel/AMD/VIA Random Number Generators (RNG)
-	- Copyright 2000,2001 Jeff Garzik <jgarzik@pobox.com>
-	- Copyright 2000,2001 Philipp Rumpf <prumpf@mandrakesoft.com>
-
-
-About the Intel RNG hardware, from the firmware hub datasheet
-=============================================================
-
-The Firmware Hub integrates a Random Number Generator (RNG)
-using thermal noise generated from inherently random quantum
-mechanical properties of silicon. When not generating new random
-bits the RNG circuitry will enter a low power state. Intel will
-provide a binary software driver to give third party software
-access to our RNG for use as a security feature. At this time,
-the RNG is only to be used with a system in an OS-present state.
-
-Intel RNG Driver notes
-======================
-
-FIXME: support poll(2)
-
-.. note::
-
-	request_mem_region was removed, for three reasons:
-
-	1) Only one RNG is supported by this driver;
-	2) The location used by the RNG is a fixed location in
-	   MMIO-addressable memory;
-	3) users with properly working BIOS e820 handling will always
-	   have the region in which the RNG is located reserved, so
-	   request_mem_region calls always fail for proper setups.
-	   However, for people who use mem=XX, BIOS e820 information is
-	   **not** in /proc/iomem, and request_mem_region(RNG_ADDR) can
-	   succeed.
-
-Driver details
-==============
-
-Based on:
-	Intel 82802AB/82802AC Firmware Hub (FWH) Datasheet
-	May 1999 Order Number: 290658-002 R
-
-Intel 82802 Firmware Hub:
-	Random Number Generator
-	Programmer's Reference Manual
-	December 1999 Order Number: 298029-001 R
-
-Intel 82802 Firmware HUB Random Number Generator Driver
-	Copyright (c) 2000 Matt Sottek <msottek@quiknet.com>
-
-Special thanks to Matt Sottek.  I did the "guts", he
-did the "brains" and all the testing.
diff --git a/Documentation/iostats.txt b/Documentation/iostats.txt
deleted file mode 100644
index 5d63b18bd6d1..000000000000
--- a/Documentation/iostats.txt
+++ /dev/null
@@ -1,197 +0,0 @@
-=====================
-I/O statistics fields
-=====================
-
-Since 2.4.20 (and some versions before, with patches), and 2.5.45,
-more extensive disk statistics have been introduced to help measure disk
-activity. Tools such as ``sar`` and ``iostat`` typically interpret these and do
-the work for you, but in case you are interested in creating your own
-tools, the fields are explained here.
-
-In 2.4 now, the information is found as additional fields in
-``/proc/partitions``.  In 2.6 and upper, the same information is found in two
-places: one is in the file ``/proc/diskstats``, and the other is within
-the sysfs file system, which must be mounted in order to obtain
-the information. Throughout this document we'll assume that sysfs
-is mounted on ``/sys``, although of course it may be mounted anywhere.
-Both ``/proc/diskstats`` and sysfs use the same source for the information
-and so should not differ.
-
-Here are examples of these different formats::
-
-   2.4:
-      3     0   39082680 hda 446216 784926 9550688 4382310 424847 312726 5922052 19310380 0 3376340 23705160
-      3     1    9221278 hda1 35486 0 35496 38030 0 0 0 0 0 38030 38030
-
-   2.6+ sysfs:
-      446216 784926 9550688 4382310 424847 312726 5922052 19310380 0 3376340 23705160
-      35486    38030    38030    38030
-
-   2.6+ diskstats:
-      3    0   hda 446216 784926 9550688 4382310 424847 312726 5922052 19310380 0 3376340 23705160
-      3    1   hda1 35486 38030 38030 38030
-
-   4.18+ diskstats:
-      3    0   hda 446216 784926 9550688 4382310 424847 312726 5922052 19310380 0 3376340 23705160 0 0 0 0
-
-On 2.4 you might execute ``grep 'hda ' /proc/partitions``. On 2.6+, you have
-a choice of ``cat /sys/block/hda/stat`` or ``grep 'hda ' /proc/diskstats``.
-
-The advantage of one over the other is that the sysfs choice works well
-if you are watching a known, small set of disks.  ``/proc/diskstats`` may
-be a better choice if you are watching a large number of disks because
-you'll avoid the overhead of 50, 100, or 500 or more opens/closes with
-each snapshot of your disk statistics.
-
-In 2.4, the statistics fields are those after the device name. In
-the above example, the first field of statistics would be 446216.
-By contrast, in 2.6+ if you look at ``/sys/block/hda/stat``, you'll
-find just the eleven fields, beginning with 446216.  If you look at
-``/proc/diskstats``, the eleven fields will be preceded by the major and
-minor device numbers, and device name.  Each of these formats provides
-eleven fields of statistics, each meaning exactly the same things.
-All fields except field 9 are cumulative since boot.  Field 9 should
-go to zero as I/Os complete; all others only increase (unless they
-overflow and wrap).  Yes, these are (32-bit or 64-bit) unsigned long
-(native word size) numbers, and on a very busy or long-lived system they
-may wrap. Applications should be prepared to deal with that; unless
-your observations are measured in large numbers of minutes or hours,
-they should not wrap twice before you notice them.
-
-Each set of stats only applies to the indicated device; if you want
-system-wide stats you'll have to find all the devices and sum them all up.
-
-Field  1 -- # of reads completed
-    This is the total number of reads completed successfully.
-
-Field  2 -- # of reads merged, field 6 -- # of writes merged
-    Reads and writes which are adjacent to each other may be merged for
-    efficiency.  Thus two 4K reads may become one 8K read before it is
-    ultimately handed to the disk, and so it will be counted (and queued)
-    as only one I/O.  This field lets you know how often this was done.
-
-Field  3 -- # of sectors read
-    This is the total number of sectors read successfully.
-
-Field  4 -- # of milliseconds spent reading
-    This is the total number of milliseconds spent by all reads (as
-    measured from __make_request() to end_that_request_last()).
-
-Field  5 -- # of writes completed
-    This is the total number of writes completed successfully.
-
-Field  6 -- # of writes merged
-    See the description of field 2.
-
-Field  7 -- # of sectors written
-    This is the total number of sectors written successfully.
-
-Field  8 -- # of milliseconds spent writing
-    This is the total number of milliseconds spent by all writes (as
-    measured from __make_request() to end_that_request_last()).
-
-Field  9 -- # of I/Os currently in progress
-    The only field that should go to zero. Incremented as requests are
-    given to appropriate struct request_queue and decremented as they finish.
-
-Field 10 -- # of milliseconds spent doing I/Os
-    This field increases so long as field 9 is nonzero.
-
-    Since 5.0 this field counts jiffies when at least one request was
-    started or completed. If request runs more than 2 jiffies then some
-    I/O time will not be accounted unless there are other requests.
-
-Field 11 -- weighted # of milliseconds spent doing I/Os
-    This field is incremented at each I/O start, I/O completion, I/O
-    merge, or read of these stats by the number of I/Os in progress
-    (field 9) times the number of milliseconds spent doing I/O since the
-    last update of this field.  This can provide an easy measure of both
-    I/O completion time and the backlog that may be accumulating.
-
-Field 12 -- # of discards completed
-    This is the total number of discards completed successfully.
-
-Field 13 -- # of discards merged
-    See the description of field 2.
-
-Field 14 -- # of sectors discarded
-    This is the total number of sectors discarded successfully.
-
-Field 15 -- # of milliseconds spent discarding
-    This is the total number of milliseconds spent by all discards (as
-    measured from __make_request() to end_that_request_last()).
-
-To avoid introducing performance bottlenecks, no locks are held while
-modifying these counters.  This implies that minor inaccuracies may be
-introduced when changes collide, so (for instance) adding up all the
-read I/Os issued per partition should equal those made to the disks ...
-but due to the lack of locking it may only be very close.
-
-In 2.6+, there are counters for each CPU, which make the lack of locking
-almost a non-issue.  When the statistics are read, the per-CPU counters
-are summed (possibly overflowing the unsigned long variable they are
-summed to) and the result given to the user.  There is no convenient
-user interface for accessing the per-CPU counters themselves.
-
-Disks vs Partitions
--------------------
-
-There were significant changes between 2.4 and 2.6+ in the I/O subsystem.
-As a result, some statistic information disappeared. The translation from
-a disk address relative to a partition to the disk address relative to
-the host disk happens much earlier.  All merges and timings now happen
-at the disk level rather than at both the disk and partition level as
-in 2.4.  Consequently, you'll see a different statistics output on 2.6+ for
-partitions from that for disks.  There are only *four* fields available
-for partitions on 2.6+ machines.  This is reflected in the examples above.
-
-Field  1 -- # of reads issued
-    This is the total number of reads issued to this partition.
-
-Field  2 -- # of sectors read
-    This is the total number of sectors requested to be read from this
-    partition.
-
-Field  3 -- # of writes issued
-    This is the total number of writes issued to this partition.
-
-Field  4 -- # of sectors written
-    This is the total number of sectors requested to be written to
-    this partition.
-
-Note that since the address is translated to a disk-relative one, and no
-record of the partition-relative address is kept, the subsequent success
-or failure of the read cannot be attributed to the partition.  In other
-words, the number of reads for partitions is counted slightly before time
-of queuing for partitions, and at completion for whole disks.  This is
-a subtle distinction that is probably uninteresting for most cases.
-
-More significant is the error induced by counting the numbers of
-reads/writes before merges for partitions and after for disks. Since a
-typical workload usually contains a lot of successive and adjacent requests,
-the number of reads/writes issued can be several times higher than the
-number of reads/writes completed.
-
-In 2.6.25, the full statistic set is again available for partitions and
-disk and partition statistics are consistent again. Since we still don't
-keep record of the partition-relative address, an operation is attributed to
-the partition which contains the first sector of the request after the
-eventual merges. As requests can be merged across partitions, this could lead
-to some (probably insignificant) inaccuracy.
-
-Additional notes
-----------------
-
-In 2.6+, sysfs is not mounted by default.  If your distribution of
-Linux hasn't added it already, here's the line you'll want to add to
-your ``/etc/fstab``::
-
-	none /sys sysfs defaults 0 0
-
-
-In 2.6+, all disk statistics were removed from ``/proc/stat``.  In 2.4, they
-appear in both ``/proc/partitions`` and ``/proc/stat``, although the ones in
-``/proc/stat`` take a very different format from those in ``/proc/partitions``
-(see proc(5), if your system has it.)
-
--- ricklind@us.ibm.com
diff --git a/Documentation/kernel-per-CPU-kthreads.txt b/Documentation/kernel-per-CPU-kthreads.txt
deleted file mode 100644
index 4f18456dd3b1..000000000000
--- a/Documentation/kernel-per-CPU-kthreads.txt
+++ /dev/null
@@ -1,356 +0,0 @@
-==========================================
-Reducing OS jitter due to per-cpu kthreads
-==========================================
-
-This document lists per-CPU kthreads in the Linux kernel and presents
-options to control their OS jitter.  Note that non-per-CPU kthreads are
-not listed here.  To reduce OS jitter from non-per-CPU kthreads, bind
-them to a "housekeeping" CPU dedicated to such work.
-
-References
-==========
-
--	Documentation/IRQ-affinity.txt:  Binding interrupts to sets of CPUs.
-
--	Documentation/admin-guide/cgroup-v1:  Using cgroups to bind tasks to sets of CPUs.
-
--	man taskset:  Using the taskset command to bind tasks to sets
-	of CPUs.
-
--	man sched_setaffinity:  Using the sched_setaffinity() system
-	call to bind tasks to sets of CPUs.
-
--	/sys/devices/system/cpu/cpuN/online:  Control CPU N's hotplug state,
-	writing "0" to offline and "1" to online.
-
--	In order to locate kernel-generated OS jitter on CPU N:
-
-		cd /sys/kernel/debug/tracing
-		echo 1 > max_graph_depth # Increase the "1" for more detail
-		echo function_graph > current_tracer
-		# run workload
-		cat per_cpu/cpuN/trace
-
-kthreads
-========
-
-Name:
-  ehca_comp/%u
-
-Purpose:
-  Periodically process Infiniband-related work.
-
-To reduce its OS jitter, do any of the following:
-
-1.	Don't use eHCA Infiniband hardware, instead choosing hardware
-	that does not require per-CPU kthreads.  This will prevent these
-	kthreads from being created in the first place.  (This will
-	work for most people, as this hardware, though important, is
-	relatively old and is produced in relatively low unit volumes.)
-2.	Do all eHCA-Infiniband-related work on other CPUs, including
-	interrupts.
-3.	Rework the eHCA driver so that its per-CPU kthreads are
-	provisioned only on selected CPUs.
-
-
-Name:
-  irq/%d-%s
-
-Purpose:
-  Handle threaded interrupts.
-
-To reduce its OS jitter, do the following:
-
-1.	Use irq affinity to force the irq threads to execute on
-	some other CPU.
-
-Name:
-  kcmtpd_ctr_%d
-
-Purpose:
-  Handle Bluetooth work.
-
-To reduce its OS jitter, do one of the following:
-
-1.	Don't use Bluetooth, in which case these kthreads won't be
-	created in the first place.
-2.	Use irq affinity to force Bluetooth-related interrupts to
-	occur on some other CPU and furthermore initiate all
-	Bluetooth activity on some other CPU.
-
-Name:
-  ksoftirqd/%u
-
-Purpose:
-  Execute softirq handlers when threaded or when under heavy load.
-
-To reduce its OS jitter, each softirq vector must be handled
-separately as follows:
-
-TIMER_SOFTIRQ
--------------
-
-Do all of the following:
-
-1.	To the extent possible, keep the CPU out of the kernel when it
-	is non-idle, for example, by avoiding system calls and by forcing
-	both kernel threads and interrupts to execute elsewhere.
-2.	Build with CONFIG_HOTPLUG_CPU=y.  After boot completes, force
-	the CPU offline, then bring it back online.  This forces
-	recurring timers to migrate elsewhere.	If you are concerned
-	with multiple CPUs, force them all offline before bringing the
-	first one back online.  Once you have onlined the CPUs in question,
-	do not offline any other CPUs, because doing so could force the
-	timer back onto one of the CPUs in question.
-
-NET_TX_SOFTIRQ and NET_RX_SOFTIRQ
----------------------------------
-
-Do all of the following:
-
-1.	Force networking interrupts onto other CPUs.
-2.	Initiate any network I/O on other CPUs.
-3.	Once your application has started, prevent CPU-hotplug operations
-	from being initiated from tasks that might run on the CPU to
-	be de-jittered.  (It is OK to force this CPU offline and then
-	bring it back online before you start your application.)
-
-BLOCK_SOFTIRQ
--------------
-
-Do all of the following:
-
-1.	Force block-device interrupts onto some other CPU.
-2.	Initiate any block I/O on other CPUs.
-3.	Once your application has started, prevent CPU-hotplug operations
-	from being initiated from tasks that might run on the CPU to
-	be de-jittered.  (It is OK to force this CPU offline and then
-	bring it back online before you start your application.)
-
-IRQ_POLL_SOFTIRQ
-----------------
-
-Do all of the following:
-
-1.	Force block-device interrupts onto some other CPU.
-2.	Initiate any block I/O and block-I/O polling on other CPUs.
-3.	Once your application has started, prevent CPU-hotplug operations
-	from being initiated from tasks that might run on the CPU to
-	be de-jittered.  (It is OK to force this CPU offline and then
-	bring it back online before you start your application.)
-
-TASKLET_SOFTIRQ
----------------
-
-Do one or more of the following:
-
-1.	Avoid use of drivers that use tasklets.  (Such drivers will contain
-	calls to things like tasklet_schedule().)
-2.	Convert all drivers that you must use from tasklets to workqueues.
-3.	Force interrupts for drivers using tasklets onto other CPUs,
-	and also do I/O involving these drivers on other CPUs.
-
-SCHED_SOFTIRQ
--------------
-
-Do all of the following:
-
-1.	Avoid sending scheduler IPIs to the CPU to be de-jittered,
-	for example, ensure that at most one runnable kthread is present
-	on that CPU.  If a thread that expects to run on the de-jittered
-	CPU awakens, the scheduler will send an IPI that can result in
-	a subsequent SCHED_SOFTIRQ.
-2.	CONFIG_NO_HZ_FULL=y and ensure that the CPU to be de-jittered
-	is marked as an adaptive-ticks CPU using the "nohz_full="
-	boot parameter.  This reduces the number of scheduler-clock
-	interrupts that the de-jittered CPU receives, minimizing its
-	chances of being selected to do the load balancing work that
-	runs in SCHED_SOFTIRQ context.
-3.	To the extent possible, keep the CPU out of the kernel when it
-	is non-idle, for example, by avoiding system calls and by
-	forcing both kernel threads and interrupts to execute elsewhere.
-	This further reduces the number of scheduler-clock interrupts
-	received by the de-jittered CPU.
-
-HRTIMER_SOFTIRQ
----------------
-
-Do all of the following:
-
-1.	To the extent possible, keep the CPU out of the kernel when it
-	is non-idle.  For example, avoid system calls and force both
-	kernel threads and interrupts to execute elsewhere.
-2.	Build with CONFIG_HOTPLUG_CPU=y.  Once boot completes, force the
-	CPU offline, then bring it back online.  This forces recurring
-	timers to migrate elsewhere.  If you are concerned with multiple
-	CPUs, force them all offline before bringing the first one
-	back online.  Once you have onlined the CPUs in question, do not
-	offline any other CPUs, because doing so could force the timer
-	back onto one of the CPUs in question.
-
-RCU_SOFTIRQ
------------
-
-Do at least one of the following:
-
-1.	Offload callbacks and keep the CPU in either dyntick-idle or
-	adaptive-ticks state by doing all of the following:
-
-	a.	CONFIG_NO_HZ_FULL=y and ensure that the CPU to be
-		de-jittered is marked as an adaptive-ticks CPU using the
-		"nohz_full=" boot parameter.  Bind the rcuo kthreads to
-		housekeeping CPUs, which can tolerate OS jitter.
-	b.	To the extent possible, keep the CPU out of the kernel
-		when it is non-idle, for example, by avoiding system
-		calls and by forcing both kernel threads and interrupts
-		to execute elsewhere.
-
-2.	Enable RCU to do its processing remotely via dyntick-idle by
-	doing all of the following:
-
-	a.	Build with CONFIG_NO_HZ=y and CONFIG_RCU_FAST_NO_HZ=y.
-	b.	Ensure that the CPU goes idle frequently, allowing other
-		CPUs to detect that it has passed through an RCU quiescent
-		state.	If the kernel is built with CONFIG_NO_HZ_FULL=y,
-		userspace execution also allows other CPUs to detect that
-		the CPU in question has passed through a quiescent state.
-	c.	To the extent possible, keep the CPU out of the kernel
-		when it is non-idle, for example, by avoiding system
-		calls and by forcing both kernel threads and interrupts
-		to execute elsewhere.
-
-Name:
-  kworker/%u:%d%s (cpu, id, priority)
-
-Purpose:
-  Execute workqueue requests
-
-To reduce its OS jitter, do any of the following:
-
-1.	Run your workload at a real-time priority, which will allow
-	preempting the kworker daemons.
-2.	A given workqueue can be made visible in the sysfs filesystem
-	by passing the WQ_SYSFS to that workqueue's alloc_workqueue().
-	Such a workqueue can be confined to a given subset of the
-	CPUs using the ``/sys/devices/virtual/workqueue/*/cpumask`` sysfs
-	files.	The set of WQ_SYSFS workqueues can be displayed using
-	"ls /sys/devices/virtual/workqueue".  That said, the workqueues
-	maintainer would like to caution people against indiscriminately
-	sprinkling WQ_SYSFS across all the workqueues.	The reason for
-	caution is that it is easy to add WQ_SYSFS, but because sysfs is
-	part of the formal user/kernel API, it can be nearly impossible
-	to remove it, even if its addition was a mistake.
-3.	Do any of the following needed to avoid jitter that your
-	application cannot tolerate:
-
-	a.	Build your kernel with CONFIG_SLUB=y rather than
-		CONFIG_SLAB=y, thus avoiding the slab allocator's periodic
-		use of each CPU's workqueues to run its cache_reap()
-		function.
-	b.	Avoid using oprofile, thus avoiding OS jitter from
-		wq_sync_buffer().
-	c.	Limit your CPU frequency so that a CPU-frequency
-		governor is not required, possibly enlisting the aid of
-		special heatsinks or other cooling technologies.  If done
-		correctly, and if your CPU architecture permits, you should
-		be able to build your kernel with CONFIG_CPU_FREQ=n to
-		avoid the CPU-frequency governor periodically running
-		on each CPU, including cs_dbs_timer() and od_dbs_timer().
-
-		WARNING:  Please check your CPU specifications to
-		make sure that this is safe on your particular system.
-	d.	As of v3.18, Christoph Lameter's on-demand vmstat workers
-		commit prevents OS jitter due to vmstat_update() on
-		CONFIG_SMP=y systems.  Before v3.18, it is not possible
-		to entirely get rid of the OS jitter, but you can
-		decrease its frequency by writing a large value to
-		/proc/sys/vm/stat_interval.  The default value is HZ,
-		for an interval of one second.	Of course, larger values
-		will make your virtual-memory statistics update more
-		slowly.  Of course, you can also run your workload at
-		a real-time priority, thus preempting vmstat_update(),
-		but if your workload is CPU-bound, this is a bad idea.
-		However, there is an RFC patch from Christoph Lameter
-		(based on an earlier one from Gilad Ben-Yossef) that
-		reduces or even eliminates vmstat overhead for some
-		workloads at https://lkml.org/lkml/2013/9/4/379.
-	e.	Boot with "elevator=noop" to avoid workqueue use by
-		the block layer.
-	f.	If running on high-end powerpc servers, build with
-		CONFIG_PPC_RTAS_DAEMON=n.  This prevents the RTAS
-		daemon from running on each CPU every second or so.
-		(This will require editing Kconfig files and will defeat
-		this platform's RAS functionality.)  This avoids jitter
-		due to the rtas_event_scan() function.
-		WARNING:  Please check your CPU specifications to
-		make sure that this is safe on your particular system.
-	g.	If running on Cell Processor, build your kernel with
-		CBE_CPUFREQ_SPU_GOVERNOR=n to avoid OS jitter from
-		spu_gov_work().
-		WARNING:  Please check your CPU specifications to
-		make sure that this is safe on your particular system.
-	h.	If running on PowerMAC, build your kernel with
-		CONFIG_PMAC_RACKMETER=n to disable the CPU-meter,
-		avoiding OS jitter from rackmeter_do_timer().
-
-Name:
-  rcuc/%u
-
-Purpose:
-  Execute RCU callbacks in CONFIG_RCU_BOOST=y kernels.
-
-To reduce its OS jitter, do at least one of the following:
-
-1.	Build the kernel with CONFIG_PREEMPT=n.  This prevents these
-	kthreads from being created in the first place, and also obviates
-	the need for RCU priority boosting.  This approach is feasible
-	for workloads that do not require high degrees of responsiveness.
-2.	Build the kernel with CONFIG_RCU_BOOST=n.  This prevents these
-	kthreads from being created in the first place.  This approach
-	is feasible only if your workload never requires RCU priority
-	boosting, for example, if you ensure frequent idle time on all
-	CPUs that might execute within the kernel.
-3.	Build with CONFIG_RCU_NOCB_CPU=y and boot with the rcu_nocbs=
-	boot parameter offloading RCU callbacks from all CPUs susceptible
-	to OS jitter.  This approach prevents the rcuc/%u kthreads from
-	having any work to do, so that they are never awakened.
-4.	Ensure that the CPU never enters the kernel, and, in particular,
-	avoid initiating any CPU hotplug operations on this CPU.  This is
-	another way of preventing any callbacks from being queued on the
-	CPU, again preventing the rcuc/%u kthreads from having any work
-	to do.
-
-Name:
-  rcuop/%d and rcuos/%d
-
-Purpose:
-  Offload RCU callbacks from the corresponding CPU.
-
-To reduce its OS jitter, do at least one of the following:
-
-1.	Use affinity, cgroups, or other mechanism to force these kthreads
-	to execute on some other CPU.
-2.	Build with CONFIG_RCU_NOCB_CPU=n, which will prevent these
-	kthreads from being created in the first place.  However, please
-	note that this will not eliminate OS jitter, but will instead
-	shift it to RCU_SOFTIRQ.
-
-Name:
-  watchdog/%u
-
-Purpose:
-  Detect software lockups on each CPU.
-
-To reduce its OS jitter, do at least one of the following:
-
-1.	Build with CONFIG_LOCKUP_DETECTOR=n, which will prevent these
-	kthreads from being created in the first place.
-2.	Boot with "nosoftlockup=0", which will also prevent these kthreads
-	from being created.  Other related watchdog and softlockup boot
-	parameters may be found in Documentation/admin-guide/kernel-parameters.rst
-	and Documentation/watchdog/watchdog-parameters.rst.
-3.	Echo a zero to /proc/sys/kernel/watchdog to disable the
-	watchdog timer.
-4.	Echo a large number to /proc/sys/kernel/watchdog_thresh in
-	order to reduce the frequency of OS jitter due to the watchdog
-	timer down to a level that is acceptable for your workload.
diff --git a/Documentation/ldm.txt b/Documentation/ldm.txt
deleted file mode 100644
index 12c571368e73..000000000000
--- a/Documentation/ldm.txt
+++ /dev/null
@@ -1,121 +0,0 @@
-==========================================
-LDM - Logical Disk Manager (Dynamic Disks)
-==========================================
-
-:Author: Originally Written by FlatCap - Richard Russon <ldm@flatcap.org>.
-:Last Updated: Anton Altaparmakov on 30 March 2007 for Windows Vista.
-
-Overview
---------
-
-Windows 2000, XP, and Vista use a new partitioning scheme.  It is a complete
-replacement for the MSDOS style partitions.  It stores its information in a
-1MiB journalled database at the end of the physical disk.  The size of
-partitions is limited only by disk space.  The maximum number of partitions is
-nearly 2000.
-
-Any partitions created under the LDM are called "Dynamic Disks".  There are no
-longer any primary or extended partitions.  Normal MSDOS style partitions are
-now known as Basic Disks.
-
-If you wish to use Spanned, Striped, Mirrored or RAID 5 Volumes, you must use
-Dynamic Disks.  The journalling allows Windows to make changes to these
-partitions and filesystems without the need to reboot.
-
-Once the LDM driver has divided up the disk, you can use the MD driver to
-assemble any multi-partition volumes, e.g.  Stripes, RAID5.
-
-To prevent legacy applications from repartitioning the disk, the LDM creates a
-dummy MSDOS partition containing one disk-sized partition.  This is what is
-supported with the Linux LDM driver.
-
-A newer approach that has been implemented with Vista is to put LDM on top of a
-GPT label disk.  This is not supported by the Linux LDM driver yet.
-
-
-Example
--------
-
-Below we have a 50MiB disk, divided into seven partitions.
-
-.. note::
-
-   The missing 1MiB at the end of the disk is where the LDM database is
-   stored.
-
-+-------++--------------+---------+-----++--------------+---------+----+
-|Device || Offset Bytes | Sectors | MiB || Size   Bytes | Sectors | MiB|
-+=======++==============+=========+=====++==============+=========+====+
-|hda    ||            0 |       0 |   0 ||     52428800 |  102400 |  50|
-+-------++--------------+---------+-----++--------------+---------+----+
-|hda1   ||     51380224 |  100352 |  49 ||      1048576 |    2048 |   1|
-+-------++--------------+---------+-----++--------------+---------+----+
-|hda2   ||        16384 |      32 |   0 ||      6979584 |   13632 |   6|
-+-------++--------------+---------+-----++--------------+---------+----+
-|hda3   ||      6995968 |   13664 |   6 ||     10485760 |   20480 |  10|
-+-------++--------------+---------+-----++--------------+---------+----+
-|hda4   ||     17481728 |   34144 |  16 ||      4194304 |    8192 |   4|
-+-------++--------------+---------+-----++--------------+---------+----+
-|hda5   ||     21676032 |   42336 |  20 ||      5242880 |   10240 |   5|
-+-------++--------------+---------+-----++--------------+---------+----+
-|hda6   ||     26918912 |   52576 |  25 ||     10485760 |   20480 |  10|
-+-------++--------------+---------+-----++--------------+---------+----+
-|hda7   ||     37404672 |   73056 |  35 ||     13959168 |   27264 |  13|
-+-------++--------------+---------+-----++--------------+---------+----+
-
-The LDM Database may not store the partitions in the order that they appear on
-disk, but the driver will sort them.
-
-When Linux boots, you will see something like::
-
-  hda: 102400 sectors w/32KiB Cache, CHS=50/64/32
-  hda: [LDM] hda1 hda2 hda3 hda4 hda5 hda6 hda7
-
-
-Compiling LDM Support
----------------------
-
-To enable LDM, choose the following two options: 
-
-  - "Advanced partition selection" CONFIG_PARTITION_ADVANCED
-  - "Windows Logical Disk Manager (Dynamic Disk) support" CONFIG_LDM_PARTITION
-
-If you believe the driver isn't working as it should, you can enable the extra
-debugging code.  This will produce a LOT of output.  The option is:
-
-  - "Windows LDM extra logging" CONFIG_LDM_DEBUG
-
-N.B. The partition code cannot be compiled as a module.
-
-As with all the partition code, if the driver doesn't see signs of its type of
-partition, it will pass control to another driver, so there is no harm in
-enabling it.
-
-If you have Dynamic Disks but don't enable the driver, then all you will see
-is a dummy MSDOS partition filling the whole disk.  You won't be able to mount
-any of the volumes on the disk.
-
-
-Booting
--------
-
-If you enable LDM support, then lilo is capable of booting from any of the
-discovered partitions.  However, grub does not understand the LDM partitioning
-and cannot boot from a Dynamic Disk.
-
-
-More Documentation
-------------------
-
-There is an Overview of the LDM together with complete Technical Documentation.
-It is available for download.
-
-  http://www.linux-ntfs.org/
-
-If you have any LDM questions that aren't answered in the documentation, email
-me.
-
-Cheers,
-    FlatCap - Richard Russon
-    ldm@flatcap.org
-
diff --git a/Documentation/lockup-watchdogs.txt b/Documentation/lockup-watchdogs.txt
deleted file mode 100644
index 290840c160af..000000000000
--- a/Documentation/lockup-watchdogs.txt
+++ /dev/null
@@ -1,83 +0,0 @@
-===============================================================
-Softlockup detector and hardlockup detector (aka nmi_watchdog)
-===============================================================
-
-The Linux kernel can act as a watchdog to detect both soft and hard
-lockups.
-
-A 'softlockup' is defined as a bug that causes the kernel to loop in
-kernel mode for more than 20 seconds (see "Implementation" below for
-details), without giving other tasks a chance to run. The current
-stack trace is displayed upon detection and, by default, the system
-will stay locked up. Alternatively, the kernel can be configured to
-panic; a sysctl, "kernel.softlockup_panic", a kernel parameter,
-"softlockup_panic" (see "Documentation/admin-guide/kernel-parameters.rst" for
-details), and a compile option, "BOOTPARAM_SOFTLOCKUP_PANIC", are
-provided for this.
-
-A 'hardlockup' is defined as a bug that causes the CPU to loop in
-kernel mode for more than 10 seconds (see "Implementation" below for
-details), without letting other interrupts have a chance to run.
-Similarly to the softlockup case, the current stack trace is displayed
-upon detection and the system will stay locked up unless the default
-behavior is changed, which can be done through a sysctl,
-'hardlockup_panic', a compile time knob, "BOOTPARAM_HARDLOCKUP_PANIC",
-and a kernel parameter, "nmi_watchdog"
-(see "Documentation/admin-guide/kernel-parameters.rst" for details).
-
-The panic option can be used in combination with panic_timeout (this
-timeout is set through the confusingly named "kernel.panic" sysctl),
-to cause the system to reboot automatically after a specified amount
-of time.
-
-Implementation
-==============
-
-The soft and hard lockup detectors are built on top of the hrtimer and
-perf subsystems, respectively. A direct consequence of this is that,
-in principle, they should work in any architecture where these
-subsystems are present.
-
-A periodic hrtimer runs to generate interrupts and kick the watchdog
-task. An NMI perf event is generated every "watchdog_thresh"
-(compile-time initialized to 10 and configurable through sysctl of the
-same name) seconds to check for hardlockups. If any CPU in the system
-does not receive any hrtimer interrupt during that time the
-'hardlockup detector' (the handler for the NMI perf event) will
-generate a kernel warning or call panic, depending on the
-configuration.
-
-The watchdog task is a high priority kernel thread that updates a
-timestamp every time it is scheduled. If that timestamp is not updated
-for 2*watchdog_thresh seconds (the softlockup threshold) the
-'softlockup detector' (coded inside the hrtimer callback function)
-will dump useful debug information to the system log, after which it
-will call panic if it was instructed to do so or resume execution of
-other kernel code.
-
-The period of the hrtimer is 2*watchdog_thresh/5, which means it has
-two or three chances to generate an interrupt before the hardlockup
-detector kicks in.
-
-As explained above, a kernel knob is provided that allows
-administrators to configure the period of the hrtimer and the perf
-event. The right value for a particular environment is a trade-off
-between fast response to lockups and detection overhead.
-
-By default, the watchdog runs on all online cores.  However, on a
-kernel configured with NO_HZ_FULL, by default the watchdog runs only
-on the housekeeping cores, not the cores specified in the "nohz_full"
-boot argument.  If we allowed the watchdog to run by default on
-the "nohz_full" cores, we would have to run timer ticks to activate
-the scheduler, which would prevent the "nohz_full" functionality
-from protecting the user code on those cores from the kernel.
-Of course, disabling it by default on the nohz_full cores means that
-when those cores do enter the kernel, by default we will not be
-able to detect if they lock up.  However, allowing the watchdog
-to continue to run on the housekeeping (non-tickless) cores means
-that we will continue to detect lockups properly on those cores.
-
-In either case, the set of cores excluded from running the watchdog
-may be adjusted via the kernel.watchdog_cpumask sysctl.  For
-nohz_full cores, this may be useful for debugging a case where the
-kernel seems to be hanging on the nohz_full cores.
diff --git a/Documentation/numastat.txt b/Documentation/numastat.txt
deleted file mode 100644
index aaf1667489f8..000000000000
--- a/Documentation/numastat.txt
+++ /dev/null
@@ -1,30 +0,0 @@
-===============================
-Numa policy hit/miss statistics
-===============================
-
-/sys/devices/system/node/node*/numastat
-
-All units are pages. Hugepages have separate counters.
-
-=============== ============================================================
-numa_hit	A process wanted to allocate memory from this node,
-		and succeeded.
-
-numa_miss	A process wanted to allocate memory from another node,
-		but ended up with memory from this node.
-
-numa_foreign	A process wanted to allocate on this node,
-		but ended up with memory from another one.
-
-local_node	A process ran on this node and got memory from it.
-
-other_node	A process ran on this node and got memory from another node.
-
-interleave_hit 	Interleaving wanted to allocate from this node
-		and succeeded.
-=============== ============================================================
-
-For easier reading you can use the numastat utility from the numactl package
-(http://oss.sgi.com/projects/libnuma/). Note that it only works
-well right now on machines with a small number of CPUs.
-
diff --git a/Documentation/pnp.txt b/Documentation/pnp.txt
deleted file mode 100644
index bab2d10631f0..000000000000
--- a/Documentation/pnp.txt
+++ /dev/null
@@ -1,292 +0,0 @@
-=================================
-Linux Plug and Play Documentation
-=================================
-
-:Author: Adam Belay <ambx1@neo.rr.com>
-:Last updated: Oct. 16, 2002
-
-
-Overview
---------
-
-Plug and Play provides a means of detecting and setting resources for legacy or
-otherwise unconfigurable devices.  The Linux Plug and Play Layer provides these 
-services to compatible drivers.
-
-
-The User Interface
-------------------
-
-The Linux Plug and Play user interface provides a means to activate PnP devices
-for legacy and user level drivers that do not support Linux Plug and Play.  The 
-user interface is integrated into sysfs.
-
-In addition to the standard sysfs file the following are created in each
-device's directory:
-- id - displays a list of supported EISA IDs
-- options - displays possible resource configurations
-- resources - displays currently allocated resources and allows resource changes
-
-activating a device
-^^^^^^^^^^^^^^^^^^^
-
-::
-
-	# echo "auto" > resources
-
-this will invoke the automatic resource config system to activate the device
-
-manually activating a device
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-::
-
-	# echo "manual <depnum> <mode>" > resources
-
-	<depnum> - the configuration number
-	<mode> - static or dynamic
-		 static = for next boot
-		 dynamic = now
-
-disabling a device
-^^^^^^^^^^^^^^^^^^
-
-::
-
-	# echo "disable" > resources
-
-
-EXAMPLE:
-
-Suppose you need to activate the floppy disk controller.
-
-1. change to the proper directory, in my case it is
-   /driver/bus/pnp/devices/00:0f::
-
-	# cd /driver/bus/pnp/devices/00:0f
-	# cat name
-	PC standard floppy disk controller
-
-2. check if the device is already active::
-
-	# cat resources
-	DISABLED
-
-  - Notice the string "DISABLED".  This means the device is not active.
-
-3. check the device's possible configurations (optional)::
-
-	# cat options
-	Dependent: 01 - Priority acceptable
-	    port 0x3f0-0x3f0, align 0x7, size 0x6, 16-bit address decoding
-	    port 0x3f7-0x3f7, align 0x0, size 0x1, 16-bit address decoding
-	    irq 6
-	    dma 2 8-bit compatible
-	Dependent: 02 - Priority acceptable
-	    port 0x370-0x370, align 0x7, size 0x6, 16-bit address decoding
-	    port 0x377-0x377, align 0x0, size 0x1, 16-bit address decoding
-	    irq 6
-	    dma 2 8-bit compatible
-
-4. now activate the device::
-
-	# echo "auto" > resources
-
-5. finally check if the device is active::
-
-	# cat resources
-	io 0x3f0-0x3f5
-	io 0x3f7-0x3f7
-	irq 6
-	dma 2
-
-also there are a series of kernel parameters::
-
-	pnp_reserve_irq=irq1[,irq2] ....
-	pnp_reserve_dma=dma1[,dma2] ....
-	pnp_reserve_io=io1,size1[,io2,size2] ....
-	pnp_reserve_mem=mem1,size1[,mem2,size2] ....
-
-
-
-The Unified Plug and Play Layer
--------------------------------
-
-All Plug and Play drivers, protocols, and services meet at a central location
-called the Plug and Play Layer.  This layer is responsible for the exchange of 
-information between PnP drivers and PnP protocols.  Thus it automatically 
-forwards commands to the proper protocol.  This makes writing PnP drivers 
-significantly easier.
-
-The following functions are available from the Plug and Play Layer:
-
-pnp_get_protocol
-  increments the number of uses by one
-
-pnp_put_protocol
-  decrements the number of uses by one
-
-pnp_register_protocol
-  use this to register a new PnP protocol
-
-pnp_unregister_protocol
-  use this function to remove a PnP protocol from the Plug and Play Layer
-
-pnp_register_driver
-  adds a PnP driver to the Plug and Play Layer
-
-  this includes driver model integration
-  returns zero for success or a negative error number for failure; count
-  calls to the .add() method if you need to know how many devices bind to
-  the driver
-
-pnp_unregister_driver
-  removes a PnP driver from the Plug and Play Layer
-
-
-
-Plug and Play Protocols
------------------------
-
-This section contains information for PnP protocol developers.
-
-The following Protocols are currently available in the computing world:
-
-- PNPBIOS:
-    used for system devices such as serial and parallel ports.
-- ISAPNP:
-    provides PnP support for the ISA bus
-- ACPI:
-    among its many uses, ACPI provides information about system level
-    devices.
-
-It is meant to replace the PNPBIOS.  It is not currently supported by Linux
-Plug and Play but it is planned to be in the near future.
-
-
-Requirements for a Linux PnP protocol:
-1. the protocol must use EISA IDs
-2. the protocol must inform the PnP Layer of a device's current configuration
-
-- the ability to set resources is optional but preferred.
-
-The following are PnP protocol related functions:
-
-pnp_add_device
-  use this function to add a PnP device to the PnP layer
-
-  only call this function when all wanted values are set in the pnp_dev
-  structure
-
-pnp_init_device
-  call this to initialize the PnP structure
-
-pnp_remove_device
-  call this to remove a device from the Plug and Play Layer.
-  it will fail if the device is still in use.
-  will automatically free memory used by the device and related structures
-
-pnp_add_id
-  adds an EISA ID to the list of supported IDs for the specified device
-
-For more information consult the source of a protocol such as
-/drivers/pnp/pnpbios/core.c.
-
-
-
-Linux Plug and Play Drivers
----------------------------
-
-This section contains information for Linux PnP driver developers.
-
-The New Way
-^^^^^^^^^^^
-
-1. first make a list of supported EISA IDS
-
-   ex::
-
-	static const struct pnp_id pnp_dev_table[] = {
-		/* Standard LPT Printer Port */
-		{.id = "PNP0400", .driver_data = 0},
-		/* ECP Printer Port */
-		{.id = "PNP0401", .driver_data = 0},
-		{.id = ""}
-	};
-
-   Please note that the character 'X' can be used as a wild card in the function
-   portion (last four characters).
-
-   ex::
-
-	/* Unknown PnP modems */
-	{	"PNPCXXX",		UNKNOWN_DEV	},
-
-   Supported PnP card IDs can optionally be defined.
-   ex::
-
-	static const struct pnp_id pnp_card_table[] = {
-		{	"ANYDEVS",		0	},
-		{	"",			0	}
-	};
-
-2. Optionally define probe and remove functions.  It may make sense not to
-   define these functions if the driver already has a reliable method of detecting
-   the resources, such as the parport_pc driver.
-
-   ex::
-
-	static int
-	serial_pnp_probe(struct pnp_dev * dev, const struct pnp_id *card_id, const
-			struct pnp_id *dev_id)
-	{
-	. . .
-
-   ex::
-
-	static void serial_pnp_remove(struct pnp_dev * dev)
-	{
-	. . .
-
-   consult /drivers/serial/8250_pnp.c for more information.
-
-3. create a driver structure
-
-   ex::
-
-	static struct pnp_driver serial_pnp_driver = {
-		.name		= "serial",
-		.card_id_table	= pnp_card_table,
-		.id_table	= pnp_dev_table,
-		.probe		= serial_pnp_probe,
-		.remove		= serial_pnp_remove,
-	};
-
-   * name and id_table cannot be NULL.
-
-4. register the driver
-
-   ex::
-
-	static int __init serial8250_pnp_init(void)
-	{
-		return pnp_register_driver(&serial_pnp_driver);
-	}
-
-The Old Way
-^^^^^^^^^^^
-
-A series of compatibility functions have been created to make it easy to convert
-ISAPNP drivers.  They should serve as a temporary solution only.
-
-They are as follows::
-
-	struct pnp_card *pnp_find_card(unsigned short vendor,
-				       unsigned short device,
-				       struct pnp_card *from)
-
-	struct pnp_dev *pnp_find_dev(struct pnp_card *card,
-				     unsigned short vendor,
-				     unsigned short function,
-				     struct pnp_dev *from)
-
diff --git a/Documentation/rtc.txt b/Documentation/rtc.txt
deleted file mode 100644
index 688c95b11919..000000000000
--- a/Documentation/rtc.txt
+++ /dev/null
@@ -1,140 +0,0 @@
-=======================================
-Real Time Clock (RTC) Drivers for Linux
-=======================================
-
-When Linux developers talk about a "Real Time Clock", they usually mean
-something that tracks wall clock time and is battery backed so that it
-works even with system power off.  Such clocks will normally not track
-the local time zone or daylight savings time -- unless they dual boot
-with MS-Windows -- but will instead be set to Coordinated Universal Time
-(UTC, formerly "Greenwich Mean Time").
-
-The newest non-PC hardware tends to just count seconds, like the time(2)
-system call reports, but RTCs also very commonly represent time using
-the Gregorian calendar and 24 hour time, as reported by gmtime(3).
-
-Linux has two largely-compatible userspace RTC API families you may
-need to know about:
-
-    *	/dev/rtc ... is the RTC provided by PC compatible systems,
-	so it's not very portable to non-x86 systems.
-
-    *	/dev/rtc0, /dev/rtc1 ... are part of a framework that's
-	supported by a wide variety of RTC chips on all systems.
-
-Programmers need to understand that the PC/AT functionality is not
-always available, and some systems can do much more.  That is, the
-RTCs use the same API to make requests in both RTC frameworks (using
-different filenames of course), but the hardware may not offer the
-same functionality.  For example, not every RTC is hooked up to an
-IRQ, so they can't all issue alarms; and where standard PC RTCs can
-only issue an alarm up to 24 hours in the future, other hardware may
-be able to schedule one any time in the upcoming century.
-
-
-Old PC/AT-Compatible driver:  /dev/rtc
---------------------------------------
-
-All PCs (even Alpha machines) have a Real Time Clock built into them.
-Usually they are built into the chipset of the computer, but some may
-actually have a Motorola MC146818 (or clone) on the board. This is the
-clock that keeps the date and time while your computer is turned off.
-
-ACPI has standardized that MC146818 functionality, and extended it in
-a few ways (enabling longer alarm periods, and wake-from-hibernate).
-That functionality is NOT exposed in the old driver.
-
-However it can also be used to generate signals from a slow 2Hz to a
-relatively fast 8192Hz, in increments of powers of two. These signals
-are reported by interrupt number 8. (Oh! So *that* is what IRQ 8 is
-for...) It can also function as a 24hr alarm, raising IRQ 8 when the
-alarm goes off. The alarm can also be programmed to only check any
-subset of the three programmable values, meaning that it could be set to
-ring on the 30th second of the 30th minute of every hour, for example.
-The clock can also be set to generate an interrupt upon every clock
-update, thus generating a 1Hz signal.
-
-The interrupts are reported via /dev/rtc (major 10, minor 135, read only
-character device) in the form of an unsigned long. The low byte contains
-the type of interrupt (update-done, alarm-rang, or periodic) that was
-raised, and the remaining bytes contain the number of interrupts since
-the last read.  Status information is reported through the pseudo-file
-/proc/driver/rtc if the /proc filesystem was enabled.  The driver has
-built in locking so that only one process is allowed to have the /dev/rtc
-interface open at a time.
-
-A user process can monitor these interrupts by doing a read(2) or a
-select(2) on /dev/rtc -- either will block/stop the user process until
-the next interrupt is received. This is useful for things like
-reasonably high frequency data acquisition where one doesn't want to
-burn up 100% CPU by polling gettimeofday etc. etc.
-
-At high frequencies, or under high loads, the user process should check
-the number of interrupts received since the last read to determine if
-there has been any interrupt "pileup" so to speak. Just for reference, a
-typical 486-33 running a tight read loop on /dev/rtc will start to suffer
-occasional interrupt pileup (i.e. > 1 IRQ event since last read) for
-frequencies above 1024Hz. So you really should check the high bytes
-of the value you read, especially at frequencies above that of the
-normal timer interrupt, which is 100Hz.
-
-Programming and/or enabling interrupt frequencies greater than 64Hz is
-only allowed by root. This is perhaps a bit conservative, but we don't want
-an evil user generating lots of IRQs on a slow 386sx-16, where it might have
-a negative impact on performance. This 64Hz limit can be changed by writing
-a different value to /proc/sys/dev/rtc/max-user-freq. Note that the
-interrupt handler is only a few lines of code to minimize any possibility
-of this effect.
-
-Also, if the kernel time is synchronized with an external source, the 
-kernel will write the time back to the CMOS clock every 11 minutes. In 
-the process of doing this, the kernel briefly turns off RTC periodic 
-interrupts, so be aware of this if you are doing serious work. If you
-don't synchronize the kernel time with an external source (via ntp or
-whatever) then the kernel will keep its hands off the RTC, allowing you
-exclusive access to the device for your applications.
-
-The alarm and/or interrupt frequency are programmed into the RTC via
-various ioctl(2) calls as listed in ./include/linux/rtc.h
-Rather than write 50 pages describing the ioctl() and so on, it is
-perhaps more useful to include a small test program that demonstrates
-how to use them, and demonstrates the features of the driver. This is
-probably a lot more useful to people interested in writing applications
-that will be using this driver.  See the code at the end of this document.
-
-(The original /dev/rtc driver was written by Paul Gortmaker.)
-
-
-New portable "RTC Class" drivers:  /dev/rtcN
---------------------------------------------
-
-Because Linux supports many non-ACPI and non-PC platforms, some of which
-have more than one RTC style clock, it needed a more portable solution
-than expecting a single battery-backed MC146818 clone on every system.
-Accordingly, a new "RTC Class" framework has been defined.  It offers
-three different userspace interfaces:
-
-    *	/dev/rtcN ... much the same as the older /dev/rtc interface
-
-    *	/sys/class/rtc/rtcN ... sysfs attributes support readonly
-	access to some RTC attributes.
-
-    *	/proc/driver/rtc ... the system clock RTC may expose itself
-	using a procfs interface. If there is no RTC for the system clock,
-	rtc0 is used by default. More information is (currently) shown
-	here than through sysfs.
-
-The RTC Class framework supports a wide variety of RTCs, ranging from those
-integrated into embeddable system-on-chip (SOC) processors to discrete chips
-using I2C, SPI, or some other bus to communicate with the host CPU.  There's
-even support for PC-style RTCs ... including the features exposed on newer PCs
-through ACPI.
-
-The new framework also removes the "one RTC per system" restriction.  For
-example, maybe the low-power battery-backed RTC is a discrete I2C chip, but
-a high functionality RTC is integrated into the SOC.  That system might read
-the system clock from the discrete RTC, but use the integrated one for all
-other tasks, because of its greater functionality.
-
-Check out tools/testing/selftests/rtc/rtctest.c for an example usage of the
-ioctl interface.
diff --git a/Documentation/svga.txt b/Documentation/svga.txt
deleted file mode 100644
index b6c2f9acca92..000000000000
--- a/Documentation/svga.txt
+++ /dev/null
@@ -1,249 +0,0 @@
-.. include:: <isonum.txt>
-
-=================================
-Video Mode Selection Support 2.13
-=================================
-
-:Copyright: |copy| 1995--1999 Martin Mares, <mj@ucw.cz>
-
-Intro
-~~~~~
-
-This small document describes the "Video Mode Selection" feature which
-allows the use of various special video modes supported by the video BIOS. Due
-to usage of the BIOS, the selection is limited to boot time (before the
-kernel decompression starts) and works only on 80X86 machines.
-
-.. note::
-
-   Short intro for the impatient: Just use vga=ask for the first time,
-   enter ``scan`` on the video mode prompt, pick the mode you want to use,
-   remember its mode ID (the four-digit hexadecimal number) and then
-   set the vga parameter to this number (converted to decimal first).
-
-The video mode to be used is selected by a kernel parameter which can be
-specified in the kernel Makefile (the SVGA_MODE=... line) or by the "vga=..."
-option of LILO (or some other boot loader you use) or by the "vidmode" utility
-(present in standard Linux utility packages). You can use the following values
-of this parameter::
-
-   NORMAL_VGA - Standard 80x25 mode available on all display adapters.
-
-   EXTENDED_VGA	- Standard 8-pixel font mode: 80x43 on EGA, 80x50 on VGA.
-
-   ASK_VGA - Display a video mode menu upon startup (see below).
-
-   0..35 - Menu item number (when you have used the menu to view the list of
-      modes available on your adapter, you can specify the menu item you want
-      to use). 0..9 correspond to "0".."9", 10..35 to "a".."z". Warning: the
-      mode list displayed may vary as the kernel version changes, because the
-      modes are listed in a "first detected -- first displayed" manner. It's
-      better to use absolute mode numbers instead.
-
-   0x.... - Hexadecimal video mode ID (also displayed on the menu, see below
-      for exact meaning of the ID). Warning: rdev and LILO don't support
-      hexadecimal numbers -- you have to convert it to decimal manually.
-
-Menu
-~~~~
-
-The ASK_VGA mode causes the kernel to offer a video mode menu upon
-bootup. It displays a "Press <RETURN> to see video modes available, <SPACE>
-to continue or wait 30 secs" message. If you press <RETURN>, you enter the
-menu, if you press <SPACE> or wait 30 seconds, the kernel will boot up in
-the standard 80x25 mode.
-
-The menu looks like::
-
-	Video adapter: <name-of-detected-video-adapter>
-	Mode:    COLSxROWS:
-	0  0F00  80x25
-	1  0F01  80x50
-	2  0F02  80x43
-	3  0F03  80x26
-	....
-	Enter mode number or ``scan``: <flashing-cursor-here>
-
-<name-of-detected-video-adapter> tells which video adapter Linux detected
--- it's either a generic adapter name (MDA, CGA, HGC, EGA, VGA, VESA VGA [a VGA
-with VESA-compliant BIOS]) or a chipset name (e.g., Trident). Direct detection
-of chipsets is turned off by default as it's inherently unreliable due to
-absolutely insane PC design.
-
-"0  0F00  80x25" means that the first menu item (the menu items are numbered
-from "0" to "9" and from "a" to "z") is a 80x25 mode with ID=0x0f00 (see the
-next section for a description of mode IDs).
-
-<flashing-cursor-here> encourages you to enter the item number or mode ID
-you wish to set and press <RETURN>. If the computer complains something about
-"Unknown mode ID", it is trying to tell you that it isn't possible to set such
-a mode. It's also possible to press only <RETURN> which leaves the current mode.
-
-The mode list usually contains a few basic modes and some VESA modes.  In
-case your chipset has been detected, some chipset-specific modes are shown as
-well (some of these might be missing or unusable on your machine as different
-BIOSes are often shipped with the same card and the mode numbers depend purely
-on the VGA BIOS).
-
-The modes displayed on the menu are partially sorted: The list starts with
-the standard modes (80x25 and 80x50) followed by "special" modes (80x28 and
-80x43), local modes (if the local modes feature is enabled), VESA modes and
-finally SVGA modes for the auto-detected adapter.
-
-If you are not happy with the mode list offered (e.g., if you think your card
-is able to do more), you can enter "scan" instead of item number / mode ID.  The
-program will try to ask the BIOS for all possible video mode numbers and test
-what happens then. The screen will probably be flashing wildly for some time and
-strange noises will be heard from inside the monitor and so on and then, really
-all consistent video modes supported by your BIOS will appear (plus maybe some
-``ghost modes``). If you are afraid this could damage your monitor, don't use
-this function.
-
-After scanning, the mode ordering is a bit different: the auto-detected SVGA
-modes are not listed at all and the modes revealed by ``scan`` are shown before
-all VESA modes.
-
-Mode IDs
-~~~~~~~~
-
-Because of the complexity of all the video stuff, the video mode IDs
-used here are also a bit complex. A video mode ID is a 16-bit number usually
-expressed in a hexadecimal notation (starting with "0x"). You can set a mode
-by entering its mode directly if you know it even if it isn't shown on the menu.
-
-The ID numbers can be divided into the following regions::
-
-   0x0000 to 0x00ff - menu item references. 0x0000 is the first item. Don't use
-	outside the menu as this can change from boot to boot (especially if you
-	have used the ``scan`` feature).
-
-   0x0100 to 0x017f - standard BIOS modes. The ID is a BIOS video mode number
-	(as presented to INT 10, function 00) increased by 0x0100.
-
-   0x0200 to 0x08ff - VESA BIOS modes. The ID is a VESA mode ID increased by
-	0x0100. All VESA modes should be autodetected and shown on the menu.
-
-   0x0900 to 0x09ff - Video7 special modes. Set by calling INT 0x10, AX=0x6f05.
-	(Usually 940=80x43, 941=132x25, 942=132x44, 943=80x60, 944=100x60,
-	945=132x28 for the standard Video7 BIOS)
-
-   0x0f00 to 0x0fff - special modes (they are set by various tricks -- usually
-	by modifying one of the standard modes). Currently available:
-	0x0f00	standard 80x25, don't reset mode if already set (=FFFF)
-	0x0f01	standard with 8-point font: 80x43 on EGA, 80x50 on VGA
-	0x0f02	VGA 80x43 (VGA switched to 350 scanlines with a 8-point font)
-	0x0f03	VGA 80x28 (standard VGA scans, but 14-point font)
-	0x0f04	leave current video mode
-	0x0f05	VGA 80x30 (480 scans, 16-point font)
-	0x0f06	VGA 80x34 (480 scans, 14-point font)
-	0x0f07	VGA 80x60 (480 scans, 8-point font)
-	0x0f08	Graphics hack (see the VIDEO_GFX_HACK paragraph below)
-
-   0x1000 to 0x7fff - modes specified by resolution. The code has a "0xRRCC"
-	form where RR is a number of rows and CC is a number of columns.
-	E.g., 0x1950 corresponds to a 80x25 mode, 0x2b84 to 132x43 etc.
-	This is the only fully portable way to refer to a non-standard mode,
-	but it relies on the mode being found and displayed on the menu
-	(remember that mode scanning is not done automatically).
-
-   0xff00 to 0xffff - aliases for backward compatibility:
-	0xffff	equivalent to 0x0f00 (standard 80x25)
-	0xfffe	equivalent to 0x0f01 (EGA 80x43 or VGA 80x50)
-
-If you add 0x8000 to the mode ID, the program will try to recalculate
-vertical display timing according to mode parameters, which can be used to
-eliminate some annoying bugs of certain VGA BIOSes (usually those used for
-cards with S3 chipsets and old Cirrus Logic BIOSes) -- mainly extra lines at the
-end of the display.
-
-Options
-~~~~~~~
-
-Build options for arch/x86/boot/* are selected by the kernel kconfig
-utility and the kernel .config file.
-
-VIDEO_GFX_HACK - includes special hack for setting of graphics modes
-to be used later by special drivers.
-Allows to set _any_ BIOS mode including graphic ones and forcing specific
-text screen resolution instead of peeking it from BIOS variables. Don't use
-unless you think you know what you're doing. To activate this setup, use
-mode number 0x0f08 (see the Mode IDs section above).
-
-Still doesn't work?
-~~~~~~~~~~~~~~~~~~~
-
-When the mode detection doesn't work (e.g., the mode list is incorrect or
-the machine hangs instead of displaying the menu), try to switch off some of
-the configuration options listed under "Options". If it fails, you can still use
-your kernel with the video mode set directly via the kernel parameter.
-
-In either case, please send me a bug report containing what _exactly_
-happens and how do the configuration switches affect the behaviour of the bug.
-
-If you start Linux from M$-DOS, you might also use some DOS tools for
-video mode setting. In this case, you must specify the 0x0f04 mode ("leave
-current settings") to Linux, because if you don't and you use any non-standard
-mode, Linux will switch to 80x25 automatically.
-
-If you set some extended mode and there's one or more extra lines on the
-bottom of the display containing already scrolled-out text, your VGA BIOS
-contains the most common video BIOS bug called "incorrect vertical display
-end setting". Adding 0x8000 to the mode ID might fix the problem. Unfortunately,
-this must be done manually -- no autodetection mechanisms are available.
-
-History
-~~~~~~~
-
-=============== ================================================================
-1.0 (??-Nov-95)	First version supporting all adapters supported by the old
-		setup.S + Cirrus Logic 54XX. Present in some 1.3.4? kernels
-		and then removed due to instability on some machines.
-2.0 (28-Jan-96)	Rewritten from scratch. Cirrus Logic 64XX support added, almost
-		everything is configurable, the VESA support should be much more
-		stable, explicit mode numbering allowed, "scan" implemented etc.
-2.1 (30-Jan-96) VESA modes moved to 0x200-0x3ff. Mode selection by resolution
-		supported. Few bugs fixed. VESA modes are listed prior to
-		modes supplied by SVGA autodetection as they are more reliable.
-		CLGD autodetect works better. Doesn't depend on 80x25 being
-		active when started. Scanning fixed. 80x43 (any VGA) added.
-		Code cleaned up.
-2.2 (01-Feb-96)	EGA 80x43 fixed. VESA extended to 0x200-0x4ff (non-standard 02XX
-		VESA modes work now). Display end bug workaround supported.
-		Special modes renumbered to allow adding of the "recalculate"
-		flag, 0xffff and 0xfffe became aliases instead of real IDs.
-		Screen contents retained during mode changes.
-2.3 (15-Mar-96)	Changed to work with 1.3.74 kernel.
-2.4 (18-Mar-96)	Added patches by Hans Lermen fixing a memory overwrite problem
-		with some boot loaders. Memory management rewritten to reflect
-		these changes. Unfortunately, screen contents retaining works
-		only with some loaders now.
-		Added a Tseng 132x60 mode.
-2.5 (19-Mar-96)	Fixed a VESA mode scanning bug introduced in 2.4.
-2.6 (25-Mar-96)	Some VESA BIOS errors not reported -- it fixes error reports on
-		several cards with broken VESA code (e.g., ATI VGA).
-2.7 (09-Apr-96)	- Accepted all VESA modes in range 0x100 to 0x7ff, because some
-		  cards use very strange mode numbers.
-		- Added Realtek VGA modes (thanks to Gonzalo Tornaria).
-		- Hardware testing order slightly changed, tests based on ROM
-		  contents done as first.
-		- Added support for special Video7 mode switching functions
-		  (thanks to Tom Vander Aa).
-		- Added 480-scanline modes (especially useful for notebooks,
-		  original version written by hhanemaa@cs.ruu.nl, patched by
-		  Jeff Chua, rewritten by me).
-		- Screen store/restore fixed.
-2.8 (14-Apr-96) - Previous release was not compilable without CONFIG_VIDEO_SVGA.
-		- Better recognition of text modes during mode scan.
-2.9 (12-May-96)	- Ignored VESA modes 0x80 - 0xff (more VESA BIOS bugs!)
-2.10(11-Nov-96) - The whole thing made optional.
-		- Added the CONFIG_VIDEO_400_HACK switch.
-		- Added the CONFIG_VIDEO_GFX_HACK switch.
-		- Code cleanup.
-2.11(03-May-97) - Yet another cleanup, now including also the documentation.
-		- Direct testing of SVGA adapters turned off by default, ``scan``
-		  offered explicitly on the prompt line.
-		- Removed the doc section describing adding of new probing
-		  functions as I try to get rid of _all_ hardware probing here.
-2.12(25-May-98) Added support for VESA frame buffer graphics.
-2.13(14-May-99) Minor documentation fixes.
-=============== ================================================================
diff --git a/Documentation/video-output.txt b/Documentation/video-output.txt
deleted file mode 100644
index 56d6fa2e2368..000000000000
--- a/Documentation/video-output.txt
+++ /dev/null
@@ -1,34 +0,0 @@
-Video Output Switcher Control
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-2006 luming.yu@intel.com
-
-The output sysfs class driver provides an abstract video output layer that
-can be used to hook platform specific methods to enable/disable video output
-device through common sysfs interface. For example, on my IBM ThinkPad T42
-laptop, The ACPI video driver registered its output devices and read/write
-method for 'state' with output sysfs class. The user interface under sysfs is::
-
-  linux:/sys/class/video_output # tree .
-  .
-  |-- CRT0
-  |   |-- device -> ../../../devices/pci0000:00/0000:00:01.0
-  |   |-- state
-  |   |-- subsystem -> ../../../class/video_output
-  |   `-- uevent
-  |-- DVI0
-  |   |-- device -> ../../../devices/pci0000:00/0000:00:01.0
-  |   |-- state
-  |   |-- subsystem -> ../../../class/video_output
-  |   `-- uevent
-  |-- LCD0
-  |   |-- device -> ../../../devices/pci0000:00/0000:00:01.0
-  |   |-- state
-  |   |-- subsystem -> ../../../class/video_output
-  |   `-- uevent
-  `-- TV0
-     |-- device -> ../../../devices/pci0000:00/0000:00:01.0
-     |-- state
-     |-- subsystem -> ../../../class/video_output
-     `-- uevent
-
diff --git a/Documentation/x86/topology.rst b/Documentation/x86/topology.rst
index 8e9704f61017..e29739904e37 100644
--- a/Documentation/x86/topology.rst
+++ b/Documentation/x86/topology.rst
@@ -9,7 +9,7 @@ representation in the kernel. Update/change when doing changes to the
 respective code.
 
 The architecture-agnostic topology definitions are in
-Documentation/cputopology.txt. This file holds x86-specific
+Documentation/admin-guide/cputopology.rst. This file holds x86-specific
 differences/specialities which must not necessarily apply to the generic
 definitions. Thus, the way to read up on Linux topology on x86 is to start
 with the generic one and look at this one in parallel for the x86 specifics.
diff --git a/MAINTAINERS b/MAINTAINERS
index c1593a668f80..570572627fd1 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -6080,7 +6080,7 @@ M:	Ard Biesheuvel <ard.biesheuvel@linaro.org>
 L:	linux-efi@vger.kernel.org
 T:	git git://git.kernel.org/pub/scm/linux/kernel/git/efi/efi.git
 S:	Maintained
-F:	Documentation/efi-stub.txt
+F:	Documentation/admin-guide/efi-stub.rst
 F:	arch/*/kernel/efi.c
 F:	arch/x86/boot/compressed/eboot.[ch]
 F:	arch/*/include/asm/efi.h
@@ -7088,7 +7088,7 @@ M:	Herbert Xu <herbert@gondor.apana.org.au>
 L:	linux-crypto@vger.kernel.org
 S:	Odd fixes
 F:	Documentation/devicetree/bindings/rng/
-F:	Documentation/hw_random.txt
+F:	Documentation/admin-guide/hw_random.rst
 F:	drivers/char/hw_random/
 F:	include/linux/hw_random.h
 
@@ -9398,7 +9398,7 @@ M:	"Richard Russon (FlatCap)" <ldm@flatcap.org>
 L:	linux-ntfs-dev@lists.sourceforge.net
 W:	http://www.linux-ntfs.org/content/view/19/37/
 S:	Maintained
-F:	Documentation/ldm.txt
+F:	Documentation/admin-guide/ldm.rst
 F:	block/partitions/ldm.*
 
 LSILOGIC MPT FUSION DRIVERS (FC/SAS/SPI)
@@ -12058,7 +12058,7 @@ PARALLEL LCD/KEYPAD PANEL DRIVER
 M:	Willy Tarreau <willy@haproxy.com>
 M:	Ksenija Stanojevic <ksenija.stanojevic@gmail.com>
 S:	Odd Fixes
-F:	Documentation/auxdisplay/lcd-panel-cgram.rst
+F:	Documentation/admin-guide/lcd-panel-cgram.rst
 F:	drivers/auxdisplay/panel.c
 
 PARALLEL PORT SUBSYSTEM
@@ -13476,7 +13476,7 @@ Q:	http://patchwork.ozlabs.org/project/rtc-linux/list/
 T:	git git://git.kernel.org/pub/scm/linux/kernel/git/abelloni/linux.git
 S:	Maintained
 F:	Documentation/devicetree/bindings/rtc/
-F:	Documentation/rtc.txt
+F:	Documentation/admin-guide/rtc.rst
 F:	drivers/rtc/
 F:	include/linux/rtc.h
 F:	include/uapi/linux/rtc.h
@@ -15306,7 +15306,7 @@ SVGA HANDLING
 M:	Martin Mares <mj@ucw.cz>
 L:	linux-video@atrey.karlin.mff.cuni.cz
 S:	Maintained
-F:	Documentation/svga.txt
+F:	Documentation/admin-guide/svga.rst
 F:	arch/x86/boot/video*
 
 SWIOTLB SUBSYSTEM
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index 20afd6077465..600c5ba1af41 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -1297,7 +1297,7 @@ config SMP
 	  will run faster if you say N here.
 
 	  See also <file:Documentation/x86/i386/IO-APIC.rst>,
-	  <file:Documentation/lockup-watchdogs.txt> and the SMP-HOWTO available at
+	  <file:Documentation/admin-guide/lockup-watchdogs.rst> and the SMP-HOWTO available at
 	  <http://tldp.org/HOWTO/SMP-HOWTO.html>.
 
 	  If you don't know what to do here, say N.
diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig
index 42875ff15671..6d732e451071 100644
--- a/arch/parisc/Kconfig
+++ b/arch/parisc/Kconfig
@@ -277,7 +277,7 @@ config SMP
 	  machines, but will use only one CPU of a multiprocessor machine.
 	  On a uniprocessor machine, the kernel will run faster if you say N.
 
-	  See also <file:Documentation/lockup-watchdogs.txt> and the SMP-HOWTO
+	  See also <file:Documentation/admin-guide/lockup-watchdogs.rst> and the SMP-HOWTO
 	  available at <http://www.tldp.org/docs.html#howto>.
 
 	  If you don't know what to do here, say N.
diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig
index c2858ac6a46a..6b1b5941b618 100644
--- a/arch/sh/Kconfig
+++ b/arch/sh/Kconfig
@@ -679,7 +679,7 @@ config SMP
 	  People using multiprocessor machines who say Y here should also say
 	  Y to "Enhanced Real Time Clock Support", below.
 
-	  See also <file:Documentation/lockup-watchdogs.txt> and the SMP-HOWTO
+	  See also <file:Documentation/admin-guide/lockup-watchdogs.rst> and the SMP-HOWTO
 	  available at <http://www.tldp.org/docs.html#howto>.
 
 	  If you don't know what to do here, say N.
diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig
index e9f5d62e9817..7926a2e11bdc 100644
--- a/arch/sparc/Kconfig
+++ b/arch/sparc/Kconfig
@@ -180,7 +180,7 @@ config SMP
 	  Y to "Enhanced Real Time Clock Support", below. The "Advanced Power
 	  Management" code will be disabled if you say Y here.
 
-	  See also <file:Documentation/lockup-watchdogs.txt> and the SMP-HOWTO
+	  See also <file:Documentation/admin-guide/lockup-watchdogs.rst> and the SMP-HOWTO
 	  available at <http://www.tldp.org/docs.html#howto>.
 
 	  If you don't know what to do here, say N.
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 9505066b7ba3..9e95af666b33 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -402,7 +402,7 @@ config SMP
 	  Management" code will be disabled if you say Y here.
 
 	  See also <file:Documentation/x86/i386/IO-APIC.rst>,
-	  <file:Documentation/lockup-watchdogs.txt> and the SMP-HOWTO available at
+	  <file:Documentation/admin-guide/lockup-watchdogs.rst> and the SMP-HOWTO available at
 	  <http://www.tldp.org/docs.html#howto>.
 
 	  If you don't know what to do here, say N.
@@ -1959,7 +1959,7 @@ config EFI_STUB
           This kernel feature allows a bzImage to be loaded directly
 	  by EFI firmware without the use of a bootloader.
 
-	  See Documentation/efi-stub.txt for more information.
+	  See Documentation/admin-guide/efi-stub.rst for more information.
 
 config EFI_MIXED
 	bool "EFI mixed-mode support"
diff --git a/block/partitions/Kconfig b/block/partitions/Kconfig
index 37b9710cc80a..702689a628f0 100644
--- a/block/partitions/Kconfig
+++ b/block/partitions/Kconfig
@@ -194,7 +194,7 @@ config LDM_PARTITION
 	  Normal partitions are now called Basic Disks under Windows 2000, XP,
 	  and Vista.
 
-	  For a fuller description read <file:Documentation/ldm.txt>.
+	  For a fuller description read <file:Documentation/admin-guide/ldm.rst>.
 
 	  If unsure, say N.
 
diff --git a/drivers/char/Kconfig b/drivers/char/Kconfig
index 442403abd73a..3e866885a405 100644
--- a/drivers/char/Kconfig
+++ b/drivers/char/Kconfig
@@ -291,7 +291,7 @@ config RTC
 	  and set the RTC in an SMP compatible fashion.
 
 	  If you think you have a use for such a device (such as periodic data
-	  sampling), then say Y here, and read <file:Documentation/rtc.txt>
+	  sampling), then say Y here, and read <file:Documentation/admin-guide/rtc.rst>
 	  for details.
 
 	  To compile this driver as a module, choose M here: the
@@ -313,7 +313,7 @@ config JS_RTC
 	  /dev/rtc.
 
 	  If you think you have a use for such a device (such as periodic data
-	  sampling), then say Y here, and read <file:Documentation/rtc.txt>
+	  sampling), then say Y here, and read <file:Documentation/admin-guide/rtc.rst>
 	  for details.
 
 	  To compile this driver as a module, choose M here: the
diff --git a/drivers/char/hw_random/core.c b/drivers/char/hw_random/core.c
index 95be7228f327..9044d31ab1a1 100644
--- a/drivers/char/hw_random/core.c
+++ b/drivers/char/hw_random/core.c
@@ -4,7 +4,7 @@
  * Copyright 2006 Michael Buesch <m@bues.ch>
  * Copyright 2005 (c) MontaVista Software, Inc.
  *
- * Please read Documentation/hw_random.txt for details on use.
+ * Please read Documentation/admin-guide/hw_random.rst for details on use.
  *
  * This software may be used and distributed according to the terms
  * of the GNU General Public License, incorporated herein by reference.
diff --git a/include/linux/hw_random.h b/include/linux/hw_random.h
index c0b93e0ff0c0..8e6dd908da21 100644
--- a/include/linux/hw_random.h
+++ b/include/linux/hw_random.h
@@ -1,7 +1,7 @@
 /*
 	Hardware Random Number Generator
 
-	Please read Documentation/hw_random.txt for details on use.
+	Please read Documentation/admin-guide/hw_random.rst for details on use.
 
 	----------------------------------------------------------
 	This software may be used and distributed according to the terms
-- 
cgit v1.2.3


From 65388dad1bbb51a4eb6cc91b9fa865b57646fb67 Mon Sep 17 00:00:00 2001
From: Mauro Carvalho Chehab <mchehab+samsung@kernel.org>
Date: Thu, 27 Jun 2019 16:31:35 -0300
Subject: docs: serial: move it to the driver-api

The contents of this directory is mostly driver-api stuff.

Signed-off-by: Mauro Carvalho Chehab <mchehab+samsung@kernel.org>
---
 Documentation/driver-api/index.rst                 |   1 +
 Documentation/driver-api/serial/cyclades_z.rst     |  11 +
 Documentation/driver-api/serial/driver.rst         | 549 ++++++++++++++++++
 Documentation/driver-api/serial/index.rst          |  32 ++
 Documentation/driver-api/serial/moxa-smartio.rst   | 615 +++++++++++++++++++++
 Documentation/driver-api/serial/n_gsm.rst          | 103 ++++
 Documentation/driver-api/serial/rocket.rst         | 185 +++++++
 Documentation/driver-api/serial/serial-iso7816.rst |  90 +++
 Documentation/driver-api/serial/serial-rs485.rst   | 103 ++++
 Documentation/driver-api/serial/tty.rst            | 328 +++++++++++
 Documentation/serial/cyclades_z.rst                |  11 -
 Documentation/serial/driver.rst                    | 549 ------------------
 Documentation/serial/index.rst                     |  32 --
 Documentation/serial/moxa-smartio.rst              | 615 ---------------------
 Documentation/serial/n_gsm.rst                     | 103 ----
 Documentation/serial/rocket.rst                    | 185 -------
 Documentation/serial/serial-iso7816.rst            |  90 ---
 Documentation/serial/serial-rs485.rst              | 103 ----
 Documentation/serial/tty.rst                       | 328 -----------
 MAINTAINERS                                        |   6 +-
 drivers/tty/Kconfig                                |   4 +-
 drivers/tty/serial/ucc_uart.c                      |   2 +-
 include/linux/serial_core.h                        |   2 +-
 23 files changed, 2024 insertions(+), 2023 deletions(-)
 create mode 100644 Documentation/driver-api/serial/cyclades_z.rst
 create mode 100644 Documentation/driver-api/serial/driver.rst
 create mode 100644 Documentation/driver-api/serial/index.rst
 create mode 100644 Documentation/driver-api/serial/moxa-smartio.rst
 create mode 100644 Documentation/driver-api/serial/n_gsm.rst
 create mode 100644 Documentation/driver-api/serial/rocket.rst
 create mode 100644 Documentation/driver-api/serial/serial-iso7816.rst
 create mode 100644 Documentation/driver-api/serial/serial-rs485.rst
 create mode 100644 Documentation/driver-api/serial/tty.rst
 delete mode 100644 Documentation/serial/cyclades_z.rst
 delete mode 100644 Documentation/serial/driver.rst
 delete mode 100644 Documentation/serial/index.rst
 delete mode 100644 Documentation/serial/moxa-smartio.rst
 delete mode 100644 Documentation/serial/n_gsm.rst
 delete mode 100644 Documentation/serial/rocket.rst
 delete mode 100644 Documentation/serial/serial-iso7816.rst
 delete mode 100644 Documentation/serial/serial-rs485.rst
 delete mode 100644 Documentation/serial/tty.rst

(limited to 'include/linux')

diff --git a/Documentation/driver-api/index.rst b/Documentation/driver-api/index.rst
index 1dde9692075c..cf39b8f9d0f9 100644
--- a/Documentation/driver-api/index.rst
+++ b/Documentation/driver-api/index.rst
@@ -88,6 +88,7 @@ available subsections can be seen below.
    pti_intel_mid
    pwm
    rfkill
+   serial/index
    sgi-ioc4
    sm501
    smsc_ece1099
diff --git a/Documentation/driver-api/serial/cyclades_z.rst b/Documentation/driver-api/serial/cyclades_z.rst
new file mode 100644
index 000000000000..532ff67e2f1c
--- /dev/null
+++ b/Documentation/driver-api/serial/cyclades_z.rst
@@ -0,0 +1,11 @@
+================
+Cyclades-Z notes
+================
+
+The Cyclades-Z must have firmware loaded onto the card before it will
+operate.  This operation should be performed during system startup,
+
+The firmware, loader program and the latest device driver code are
+available from Cyclades at
+
+    ftp://ftp.cyclades.com/pub/cyclades/cyclades-z/linux/
diff --git a/Documentation/driver-api/serial/driver.rst b/Documentation/driver-api/serial/driver.rst
new file mode 100644
index 000000000000..31bd4e16fb1f
--- /dev/null
+++ b/Documentation/driver-api/serial/driver.rst
@@ -0,0 +1,549 @@
+====================
+Low Level Serial API
+====================
+
+
+This document is meant as a brief overview of some aspects of the new serial
+driver.  It is not complete, any questions you have should be directed to
+<rmk@arm.linux.org.uk>
+
+The reference implementation is contained within amba-pl011.c.
+
+
+
+Low Level Serial Hardware Driver
+--------------------------------
+
+The low level serial hardware driver is responsible for supplying port
+information (defined by uart_port) and a set of control methods (defined
+by uart_ops) to the core serial driver.  The low level driver is also
+responsible for handling interrupts for the port, and providing any
+console support.
+
+
+Console Support
+---------------
+
+The serial core provides a few helper functions.  This includes identifying
+the correct port structure (via uart_get_console) and decoding command line
+arguments (uart_parse_options).
+
+There is also a helper function (uart_console_write) which performs a
+character by character write, translating newlines to CRLF sequences.
+Driver writers are recommended to use this function rather than implementing
+their own version.
+
+
+Locking
+-------
+
+It is the responsibility of the low level hardware driver to perform the
+necessary locking using port->lock.  There are some exceptions (which
+are described in the uart_ops listing below.)
+
+There are two locks.  A per-port spinlock, and an overall semaphore.
+
+From the core driver perspective, the port->lock locks the following
+data::
+
+	port->mctrl
+	port->icount
+	port->state->xmit.head (circ_buf->head)
+	port->state->xmit.tail (circ_buf->tail)
+
+The low level driver is free to use this lock to provide any additional
+locking.
+
+The port_sem semaphore is used to protect against ports being added/
+removed or reconfigured at inappropriate times. Since v2.6.27, this
+semaphore has been the 'mutex' member of the tty_port struct, and
+commonly referred to as the port mutex.
+
+
+uart_ops
+--------
+
+The uart_ops structure is the main interface between serial_core and the
+hardware specific driver.  It contains all the methods to control the
+hardware.
+
+  tx_empty(port)
+	This function tests whether the transmitter fifo and shifter
+	for the port described by 'port' is empty.  If it is empty,
+	this function should return TIOCSER_TEMT, otherwise return 0.
+	If the port does not support this operation, then it should
+	return TIOCSER_TEMT.
+
+	Locking: none.
+
+	Interrupts: caller dependent.
+
+	This call must not sleep
+
+  set_mctrl(port, mctrl)
+	This function sets the modem control lines for port described
+	by 'port' to the state described by mctrl.  The relevant bits
+	of mctrl are:
+
+		- TIOCM_RTS	RTS signal.
+		- TIOCM_DTR	DTR signal.
+		- TIOCM_OUT1	OUT1 signal.
+		- TIOCM_OUT2	OUT2 signal.
+		- TIOCM_LOOP	Set the port into loopback mode.
+
+	If the appropriate bit is set, the signal should be driven
+	active.  If the bit is clear, the signal should be driven
+	inactive.
+
+	Locking: port->lock taken.
+
+	Interrupts: locally disabled.
+
+	This call must not sleep
+
+  get_mctrl(port)
+	Returns the current state of modem control inputs.  The state
+	of the outputs should not be returned, since the core keeps
+	track of their state.  The state information should include:
+
+		- TIOCM_CAR	state of DCD signal
+		- TIOCM_CTS	state of CTS signal
+		- TIOCM_DSR	state of DSR signal
+		- TIOCM_RI	state of RI signal
+
+	The bit is set if the signal is currently driven active.  If
+	the port does not support CTS, DCD or DSR, the driver should
+	indicate that the signal is permanently active.  If RI is
+	not available, the signal should not be indicated as active.
+
+	Locking: port->lock taken.
+
+	Interrupts: locally disabled.
+
+	This call must not sleep
+
+  stop_tx(port)
+	Stop transmitting characters.  This might be due to the CTS
+	line becoming inactive or the tty layer indicating we want
+	to stop transmission due to an XOFF character.
+
+	The driver should stop transmitting characters as soon as
+	possible.
+
+	Locking: port->lock taken.
+
+	Interrupts: locally disabled.
+
+	This call must not sleep
+
+  start_tx(port)
+	Start transmitting characters.
+
+	Locking: port->lock taken.
+
+	Interrupts: locally disabled.
+
+	This call must not sleep
+
+  throttle(port)
+	Notify the serial driver that input buffers for the line discipline are
+	close to full, and it should somehow signal that no more characters
+	should be sent to the serial port.
+	This will be called only if hardware assisted flow control is enabled.
+
+	Locking: serialized with .unthrottle() and termios modification by the
+	tty layer.
+
+  unthrottle(port)
+	Notify the serial driver that characters can now be sent to the serial
+	port without fear of overrunning the input buffers of the line
+	disciplines.
+
+	This will be called only if hardware assisted flow control is enabled.
+
+	Locking: serialized with .throttle() and termios modification by the
+	tty layer.
+
+  send_xchar(port,ch)
+	Transmit a high priority character, even if the port is stopped.
+	This is used to implement XON/XOFF flow control and tcflow().  If
+	the serial driver does not implement this function, the tty core
+	will append the character to the circular buffer and then call
+	start_tx() / stop_tx() to flush the data out.
+
+	Do not transmit if ch == '\0' (__DISABLED_CHAR).
+
+	Locking: none.
+
+	Interrupts: caller dependent.
+
+  stop_rx(port)
+	Stop receiving characters; the port is in the process of
+	being closed.
+
+	Locking: port->lock taken.
+
+	Interrupts: locally disabled.
+
+	This call must not sleep
+
+  enable_ms(port)
+	Enable the modem status interrupts.
+
+	This method may be called multiple times.  Modem status
+	interrupts should be disabled when the shutdown method is
+	called.
+
+	Locking: port->lock taken.
+
+	Interrupts: locally disabled.
+
+	This call must not sleep
+
+  break_ctl(port,ctl)
+	Control the transmission of a break signal.  If ctl is
+	nonzero, the break signal should be transmitted.  The signal
+	should be terminated when another call is made with a zero
+	ctl.
+
+	Locking: caller holds tty_port->mutex
+
+  startup(port)
+	Grab any interrupt resources and initialise any low level driver
+	state.  Enable the port for reception.  It should not activate
+	RTS nor DTR; this will be done via a separate call to set_mctrl.
+
+	This method will only be called when the port is initially opened.
+
+	Locking: port_sem taken.
+
+	Interrupts: globally disabled.
+
+  shutdown(port)
+	Disable the port, disable any break condition that may be in
+	effect, and free any interrupt resources.  It should not disable
+	RTS nor DTR; this will have already been done via a separate
+	call to set_mctrl.
+
+	Drivers must not access port->state once this call has completed.
+
+	This method will only be called when there are no more users of
+	this port.
+
+	Locking: port_sem taken.
+
+	Interrupts: caller dependent.
+
+  flush_buffer(port)
+	Flush any write buffers, reset any DMA state and stop any
+	ongoing DMA transfers.
+
+	This will be called whenever the port->state->xmit circular
+	buffer is cleared.
+
+	Locking: port->lock taken.
+
+	Interrupts: locally disabled.
+
+	This call must not sleep
+
+  set_termios(port,termios,oldtermios)
+	Change the port parameters, including word length, parity, stop
+	bits.  Update read_status_mask and ignore_status_mask to indicate
+	the types of events we are interested in receiving.  Relevant
+	termios->c_cflag bits are:
+
+		CSIZE
+			- word size
+		CSTOPB
+			- 2 stop bits
+		PARENB
+			- parity enable
+		PARODD
+			- odd parity (when PARENB is in force)
+		CREAD
+			- enable reception of characters (if not set,
+			  still receive characters from the port, but
+			  throw them away).
+		CRTSCTS
+			- if set, enable CTS status change reporting
+		CLOCAL
+			- if not set, enable modem status change
+			  reporting.
+
+	Relevant termios->c_iflag bits are:
+
+		INPCK
+			- enable frame and parity error events to be
+			  passed to the TTY layer.
+		BRKINT / PARMRK
+			- both of these enable break events to be
+			  passed to the TTY layer.
+
+		IGNPAR
+			- ignore parity and framing errors
+		IGNBRK
+			- ignore break errors.  If IGNPAR is also
+			  set, ignore overrun errors as well.
+
+	The interaction of the iflag bits is as follows (parity error
+	given as an example):
+
+	=============== ======= ======  =============================
+	Parity error	INPCK	IGNPAR
+	=============== ======= ======  =============================
+	n/a		0	n/a	character received, marked as
+					TTY_NORMAL
+	None		1	n/a	character received, marked as
+					TTY_NORMAL
+	Yes		1	0	character received, marked as
+					TTY_PARITY
+	Yes		1	1	character discarded
+	=============== ======= ======  =============================
+
+	Other flags may be used (eg, xon/xoff characters) if your
+	hardware supports hardware "soft" flow control.
+
+	Locking: caller holds tty_port->mutex
+
+	Interrupts: caller dependent.
+
+	This call must not sleep
+
+  set_ldisc(port,termios)
+	Notifier for discipline change. See Documentation/driver-api/serial/tty.rst.
+
+	Locking: caller holds tty_port->mutex
+
+  pm(port,state,oldstate)
+	Perform any power management related activities on the specified
+	port.  State indicates the new state (defined by
+	enum uart_pm_state), oldstate indicates the previous state.
+
+	This function should not be used to grab any resources.
+
+	This will be called when the port is initially opened and finally
+	closed, except when the port is also the system console.  This
+	will occur even if CONFIG_PM is not set.
+
+	Locking: none.
+
+	Interrupts: caller dependent.
+
+  type(port)
+	Return a pointer to a string constant describing the specified
+	port, or return NULL, in which case the string 'unknown' is
+	substituted.
+
+	Locking: none.
+
+	Interrupts: caller dependent.
+
+  release_port(port)
+	Release any memory and IO region resources currently in use by
+	the port.
+
+	Locking: none.
+
+	Interrupts: caller dependent.
+
+  request_port(port)
+	Request any memory and IO region resources required by the port.
+	If any fail, no resources should be registered when this function
+	returns, and it should return -EBUSY on failure.
+
+	Locking: none.
+
+	Interrupts: caller dependent.
+
+  config_port(port,type)
+	Perform any autoconfiguration steps required for the port.  `type`
+	contains a bit mask of the required configuration.  UART_CONFIG_TYPE
+	indicates that the port requires detection and identification.
+	port->type should be set to the type found, or PORT_UNKNOWN if
+	no port was detected.
+
+	UART_CONFIG_IRQ indicates autoconfiguration of the interrupt signal,
+	which should be probed using standard kernel autoprobing techniques.
+	This is not necessary on platforms where ports have interrupts
+	internally hard wired (eg, system on a chip implementations).
+
+	Locking: none.
+
+	Interrupts: caller dependent.
+
+  verify_port(port,serinfo)
+	Verify the new serial port information contained within serinfo is
+	suitable for this port type.
+
+	Locking: none.
+
+	Interrupts: caller dependent.
+
+  ioctl(port,cmd,arg)
+	Perform any port specific IOCTLs.  IOCTL commands must be defined
+	using the standard numbering system found in <asm/ioctl.h>
+
+	Locking: none.
+
+	Interrupts: caller dependent.
+
+  poll_init(port)
+	Called by kgdb to perform the minimal hardware initialization needed
+	to support poll_put_char() and poll_get_char().  Unlike ->startup()
+	this should not request interrupts.
+
+	Locking: tty_mutex and tty_port->mutex taken.
+
+	Interrupts: n/a.
+
+  poll_put_char(port,ch)
+	Called by kgdb to write a single character directly to the serial
+	port.  It can and should block until there is space in the TX FIFO.
+
+	Locking: none.
+
+	Interrupts: caller dependent.
+
+	This call must not sleep
+
+  poll_get_char(port)
+	Called by kgdb to read a single character directly from the serial
+	port.  If data is available, it should be returned; otherwise
+	the function should return NO_POLL_CHAR immediately.
+
+	Locking: none.
+
+	Interrupts: caller dependent.
+
+	This call must not sleep
+
+Other functions
+---------------
+
+uart_update_timeout(port,cflag,baud)
+	Update the FIFO drain timeout, port->timeout, according to the
+	number of bits, parity, stop bits and baud rate.
+
+	Locking: caller is expected to take port->lock
+
+	Interrupts: n/a
+
+uart_get_baud_rate(port,termios,old,min,max)
+	Return the numeric baud rate for the specified termios, taking
+	account of the special 38400 baud "kludge".  The B0 baud rate
+	is mapped to 9600 baud.
+
+	If the baud rate is not within min..max, then if old is non-NULL,
+	the original baud rate will be tried.  If that exceeds the
+	min..max constraint, 9600 baud will be returned.  termios will
+	be updated to the baud rate in use.
+
+	Note: min..max must always allow 9600 baud to be selected.
+
+	Locking: caller dependent.
+
+	Interrupts: n/a
+
+uart_get_divisor(port,baud)
+	Return the divisor (baud_base / baud) for the specified baud
+	rate, appropriately rounded.
+
+	If 38400 baud and custom divisor is selected, return the
+	custom divisor instead.
+
+	Locking: caller dependent.
+
+	Interrupts: n/a
+
+uart_match_port(port1,port2)
+	This utility function can be used to determine whether two
+	uart_port structures describe the same port.
+
+	Locking: n/a
+
+	Interrupts: n/a
+
+uart_write_wakeup(port)
+	A driver is expected to call this function when the number of
+	characters in the transmit buffer has dropped below a threshold.
+
+	Locking: port->lock should be held.
+
+	Interrupts: n/a
+
+uart_register_driver(drv)
+	Register a uart driver with the core driver.  We in turn register
+	with the tty layer, and initialise the core driver per-port state.
+
+	drv->port should be NULL, and the per-port structures should be
+	registered using uart_add_one_port after this call has succeeded.
+
+	Locking: none
+
+	Interrupts: enabled
+
+uart_unregister_driver()
+	Remove all references to a driver from the core driver.  The low
+	level driver must have removed all its ports via the
+	uart_remove_one_port() if it registered them with uart_add_one_port().
+
+	Locking: none
+
+	Interrupts: enabled
+
+**uart_suspend_port()**
+
+**uart_resume_port()**
+
+**uart_add_one_port()**
+
+**uart_remove_one_port()**
+
+Other notes
+-----------
+
+It is intended some day to drop the 'unused' entries from uart_port, and
+allow low level drivers to register their own individual uart_port's with
+the core.  This will allow drivers to use uart_port as a pointer to a
+structure containing both the uart_port entry with their own extensions,
+thus::
+
+	struct my_port {
+		struct uart_port	port;
+		int			my_stuff;
+	};
+
+Modem control lines via GPIO
+----------------------------
+
+Some helpers are provided in order to set/get modem control lines via GPIO.
+
+mctrl_gpio_init(port, idx):
+	This will get the {cts,rts,...}-gpios from device tree if they are
+	present and request them, set direction etc, and return an
+	allocated structure. `devm_*` functions are used, so there's no need
+	to call mctrl_gpio_free().
+	As this sets up the irq handling make sure to not handle changes to the
+	gpio input lines in your driver, too.
+
+mctrl_gpio_free(dev, gpios):
+	This will free the requested gpios in mctrl_gpio_init().
+	As `devm_*` functions are used, there's generally no need to call
+	this function.
+
+mctrl_gpio_to_gpiod(gpios, gidx)
+	This returns the gpio_desc structure associated to the modem line
+	index.
+
+mctrl_gpio_set(gpios, mctrl):
+	This will set the gpios according to the mctrl state.
+
+mctrl_gpio_get(gpios, mctrl):
+	This will update mctrl with the gpios values.
+
+mctrl_gpio_enable_ms(gpios):
+	Enables irqs and handling of changes to the ms lines.
+
+mctrl_gpio_disable_ms(gpios):
+	Disables irqs and handling of changes to the ms lines.
diff --git a/Documentation/driver-api/serial/index.rst b/Documentation/driver-api/serial/index.rst
new file mode 100644
index 000000000000..33ad10d05b26
--- /dev/null
+++ b/Documentation/driver-api/serial/index.rst
@@ -0,0 +1,32 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+==========================
+Support for Serial devices
+==========================
+
+.. toctree::
+    :maxdepth: 1
+
+
+    driver
+    tty
+
+Serial drivers
+==============
+
+.. toctree::
+    :maxdepth: 1
+
+    cyclades_z
+    moxa-smartio
+    n_gsm
+    rocket
+    serial-iso7816
+    serial-rs485
+
+.. only::  subproject and html
+
+   Indices
+   =======
+
+   * :ref:`genindex`
diff --git a/Documentation/driver-api/serial/moxa-smartio.rst b/Documentation/driver-api/serial/moxa-smartio.rst
new file mode 100644
index 000000000000..156100f17c3f
--- /dev/null
+++ b/Documentation/driver-api/serial/moxa-smartio.rst
@@ -0,0 +1,615 @@
+=============================================================
+MOXA Smartio/Industio Family Device Driver Installation Guide
+=============================================================
+
+.. note::
+
+   This file is outdated. It needs some care in order to make it
+   updated to Kernel 5.0 and upper
+
+Copyright (C) 2008, Moxa Inc.
+
+Date: 01/21/2008
+
+.. Content
+
+   1. Introduction
+   2. System Requirement
+   3. Installation
+      3.1 Hardware installation
+      3.2 Driver files
+      3.3 Device naming convention
+      3.4 Module driver configuration
+      3.5 Static driver configuration for Linux kernel 2.4.x and 2.6.x.
+      3.6 Custom configuration
+      3.7 Verify driver installation
+   4. Utilities
+   5. Setserial
+   6. Troubleshooting
+
+1. Introduction
+^^^^^^^^^^^^^^^
+
+   The Smartio/Industio/UPCI family Linux driver supports following multiport
+   boards.
+
+    - 2 ports multiport board
+	CP-102U, CP-102UL, CP-102UF
+	CP-132U-I, CP-132UL,
+	CP-132, CP-132I, CP132S, CP-132IS,
+	CI-132, CI-132I, CI-132IS,
+	(C102H, C102HI, C102HIS, C102P, CP-102, CP-102S)
+
+    - 4 ports multiport board
+	CP-104EL,
+	CP-104UL, CP-104JU,
+	CP-134U, CP-134U-I,
+	C104H/PCI, C104HS/PCI,
+	CP-114, CP-114I, CP-114S, CP-114IS, CP-114UL,
+	C104H, C104HS,
+	CI-104J, CI-104JS,
+	CI-134, CI-134I, CI-134IS,
+	(C114HI, CT-114I, C104P),
+	POS-104UL,
+	CB-114,
+	CB-134I
+
+    - 8 ports multiport board
+	CP-118EL, CP-168EL,
+	CP-118U, CP-168U,
+	C168H/PCI,
+	C168H, C168HS,
+	(C168P),
+	CB-108
+
+   This driver and installation procedure have been developed upon Linux Kernel
+   2.4.x and 2.6.x. This driver supports Intel x86 hardware platform. In order
+   to maintain compatibility, this version has also been properly tested with
+   RedHat, Mandrake, Fedora and S.u.S.E Linux. However, if compatibility problem
+   occurs, please contact Moxa at support@moxa.com.tw.
+
+   In addition to device driver, useful utilities are also provided in this
+   version. They are:
+
+    - msdiag
+		 Diagnostic program for displaying installed Moxa
+                 Smartio/Industio boards.
+    - msmon
+		 Monitor program to observe data count and line status signals.
+    - msterm     A simple terminal program which is useful in testing serial
+	         ports.
+    - io-irq.exe
+		 Configuration program to setup ISA boards. Please note that
+                 this program can only be executed under DOS.
+
+   All the drivers and utilities are published in form of source code under
+   GNU General Public License in this version. Please refer to GNU General
+   Public License announcement in each source code file for more detail.
+
+   In Moxa's Web sites, you may always find latest driver at http://www.moxa.com/.
+
+   This version of driver can be installed as Loadable Module (Module driver)
+   or built-in into kernel (Static driver). You may refer to following
+   installation procedure for suitable one. Before you install the driver,
+   please refer to hardware installation procedure in the User's Manual.
+
+   We assume the user should be familiar with following documents.
+
+   - Serial-HOWTO
+   - Kernel-HOWTO
+
+2. System Requirement
+^^^^^^^^^^^^^^^^^^^^^
+
+   - Hardware platform: Intel x86 machine
+   - Kernel version: 2.4.x or 2.6.x
+   - gcc version 2.72 or later
+   - Maximum 4 boards can be installed in combination
+
+3. Installation
+^^^^^^^^^^^^^^^
+
+3.1 Hardware installation
+=========================
+
+   There are two types of buses, ISA and PCI, for Smartio/Industio
+   family multiport board.
+
+ISA board
+---------
+
+   You'll have to configure CAP address, I/O address, Interrupt Vector
+   as well as IRQ before installing this driver. Please refer to hardware
+   installation procedure in User's Manual before proceeding any further.
+   Please make sure the JP1 is open after the ISA board is set properly.
+
+PCI/UPCI board
+--------------
+
+   You may need to adjust IRQ usage in BIOS to avoid IRQ conflicts
+   with other ISA devices. Please refer to hardware installation
+   procedure in User's Manual in advance.
+
+PCI IRQ Sharing
+---------------
+
+   Each port within the same multiport board shares the same IRQ. Up to
+   4 Moxa Smartio/Industio PCI Family multiport boards can be installed
+   together on one system and they can share the same IRQ.
+
+
+3.2 Driver files
+================
+
+   The driver file may be obtained from ftp, CD-ROM or floppy disk. The
+   first step, anyway, is to copy driver file "mxser.tgz" into specified
+   directory. e.g. /moxa. Then execute the commands as below::
+
+       # cd /
+       # mkdir moxa
+       # cd /moxa
+       # tar xvf /dev/fd0
+
+or::
+
+       # cd /
+       # mkdir moxa
+       # cd /moxa
+       # cp /mnt/cdrom/<driver directory>/mxser.tgz .
+       # tar xvfz mxser.tgz
+
+
+3.3 Device naming convention
+============================
+
+   You may find all the driver and utilities files in /moxa/mxser.
+   Following installation procedure depends on the model you'd like to
+   run the driver. If you prefer module driver, please refer to 3.4.
+   If static driver is required, please refer to 3.5.
+
+Dialin and callout port
+-----------------------
+
+   This driver retains traditional serial device properties. There are
+   two special file names for each serial port. One is dial-in port
+   which is named "ttyMxx". For callout port, the naming convention
+   is "cumxx".
+
+Device naming when more than 2 boards installed
+-----------------------------------------------
+
+   Naming convention for each Smartio/Industio multiport board is
+   pre-defined as below.
+
+   ============ ===============       ==============
+   Board Num.	 Dial-in Port	      Callout port
+   1st board	ttyM0  - ttyM7	      cum0  - cum7
+   2nd board	ttyM8  - ttyM15       cum8  - cum15
+   3rd board	ttyM16 - ttyM23       cum16 - cum23
+   4th board	ttyM24 - ttyM31       cum24 - cum31
+   ============ ===============       ==============
+
+.. note::
+
+   Under Kernel 2.6 and upper, the cum Device is Obsolete. So use ttyM*
+   device instead.
+
+Board sequence
+--------------
+
+   This driver will activate ISA boards according to the parameter set
+   in the driver. After all specified ISA board activated, PCI board
+   will be installed in the system automatically driven.
+   Therefore the board number is sorted by the CAP address of ISA boards.
+   For PCI boards, their sequence will be after ISA boards and C168H/PCI
+   has higher priority than C104H/PCI boards.
+
+3.4 Module driver configuration
+===============================
+
+   Module driver is easiest way to install. If you prefer static driver
+   installation, please skip this paragraph.
+
+
+   ------------- Prepare to use the MOXA driver --------------------
+
+3.4.1 Create tty device with correct major number
+-------------------------------------------------
+
+   Before using MOXA driver, your system must have the tty devices
+   which are created with driver's major number. We offer one shell
+   script "msmknod" to simplify the procedure.
+   This step is only needed to be executed once. But you still
+   need to do this procedure when:
+
+   a. You change the driver's major number. Please refer to the "3.6"
+      section.
+   b. Your total installed MOXA boards number is changed. Maybe you
+      add/delete one MOXA board.
+   c. You want to change the tty name. This needs to modify the
+      shell script "msmknod"
+
+   The procedure is::
+
+	 # cd /moxa/mxser/driver
+	 # ./msmknod
+
+   This shell script will require the major number for dial-in
+   device and callout device to create tty device. You also need
+   to specify the total installed MOXA board number. Default major
+   numbers for dial-in device and callout device are 30, 35. If
+   you need to change to other number, please refer to section "3.6"
+   for more detailed procedure.
+   Msmknod will delete any special files occupying the same device
+   naming.
+
+3.4.2 Build the MOXA driver and utilities
+-----------------------------------------
+
+   Before using the MOXA driver and utilities, you need to compile
+   all the source code. This step only needs to be executed once.
+   But you still need to re-compile the source code if you modify the
+   source code. For example, if you change the driver's major number (see
+   "3.6" section), then you need to do this step again.
+
+   Find "Makefile" in /moxa/mxser, then run::
+
+	 # make clean; make install
+
+   .. note::
+
+	 For Red Hat 9, Red Hat Enterprise Linux AS3/ES3/WS3 & Fedora Core1:
+	 # make clean; make installsp1
+
+	 For Red Hat Enterprise Linux AS4/ES4/WS4:
+	 # make clean; make installsp2
+
+   The driver files "mxser.o" and utilities will be properly compiled
+   and copied to system directories respectively.
+
+------------- Load MOXA driver--------------------
+
+3.4.3 Load the MOXA driver
+--------------------------
+
+   ::
+
+	 # modprobe mxser <argument>
+
+   will activate the module driver. You may run "lsmod" to check
+   if "mxser" is activated. If the MOXA board is ISA board, the
+   <argument> is needed. Please refer to section "3.4.5" for more
+   information.
+
+------------- Load MOXA driver on boot --------------------
+
+3.4.4 Load the mxser driver
+---------------------------
+
+
+   For the above description, you may manually execute
+   "modprobe mxser" to activate this driver and run
+   "rmmod mxser" to remove it.
+
+   However, it's better to have a boot time configuration to
+   eliminate manual operation. Boot time configuration can be
+   achieved by rc file. We offer one "rc.mxser" file to simplify
+   the procedure under "moxa/mxser/driver".
+
+   But if you use ISA board, please modify the "modprobe ..." command
+   to add the argument (see "3.4.5" section). After modifying the
+   rc.mxser, please try to execute "/moxa/mxser/driver/rc.mxser"
+   manually to make sure the modification is ok. If any error
+   encountered, please try to modify again. If the modification is
+   completed, follow the below step.
+
+   Run following command for setting rc files::
+
+	 # cd /moxa/mxser/driver
+	 # cp ./rc.mxser /etc/rc.d
+	 # cd /etc/rc.d
+
+   Check "rc.serial" is existed or not. If "rc.serial" doesn't exist,
+   create it by vi, run "chmod 755 rc.serial" to change the permission.
+
+   Add "/etc/rc.d/rc.mxser" in last line.
+
+   Reboot and check if moxa.o activated by "lsmod" command.
+
+3.4.5. specify CAP address
+--------------------------
+
+   If you'd like to drive Smartio/Industio ISA boards in the system,
+   you'll have to add parameter to specify CAP address of given
+   board while activating "mxser.o". The format for parameters are
+   as follows.::
+
+	   modprobe mxser ioaddr=0x???,0x???,0x???,0x???
+				  |  |  |    |
+				  |  |  |    +- 4th ISA board
+				  |  |  +------ 3rd ISA board
+				  |  +------------ 2nd ISA board
+				  +-------------------1st ISA board
+
+3.5 Static driver configuration for Linux kernel 2.4.x and 2.6.x
+================================================================
+
+    Note:
+          To use static driver, you must install the linux kernel
+          source package.
+
+3.5.1 Backup the built-in driver in the kernel
+----------------------------------------------
+
+    ::
+
+       # cd /usr/src/linux/drivers/char
+       # mv mxser.c mxser.c.old
+
+       For Red Hat 7.x user, you need to create link:
+       # cd /usr/src
+       # ln -s linux-2.4 linux
+
+3.5.2 Create link
+-----------------
+    ::
+
+	  # cd /usr/src/linux/drivers/char
+	  # ln -s /moxa/mxser/driver/mxser.c mxser.c
+
+3.5.3 Add CAP address list for ISA boards.
+------------------------------------------
+
+    For PCI boards user, please skip this step.
+
+    In module mode, the CAP address for ISA board is given by
+    parameter. In static driver configuration, you'll have to
+    assign it within driver's source code. If you will not
+    install any ISA boards, you may skip to next portion.
+    The instructions to modify driver source code are as
+    below.
+
+    a. run::
+
+	# cd /moxa/mxser/driver
+	# vi mxser.c
+
+    b. Find the array mxserBoardCAP[] as below::
+
+	  static int mxserBoardCAP[] = {0x00, 0x00, 0x00, 0x00};
+
+    c. Change the address within this array using vi. For
+       example, to drive 2 ISA boards with CAP addresses
+       0x280 and 0x180 as 1st and 2nd board, just change
+       the source code as follows::
+
+	  static int mxserBoardCAP[] = {0x280, 0x180, 0x00, 0x00};
+
+3.5.4 Setup kernel configuration
+--------------------------------
+
+    Configure the kernel::
+
+      # cd /usr/src/linux
+      # make menuconfig
+
+    You will go into a menu-driven system. Please select [Character
+    devices][Non-standard serial port support], enable the [Moxa
+    SmartIO support] driver with "[*]" for built-in (not "[M]"), then
+    select [Exit] to exit this program.
+
+3.5.5 Rebuild kernel
+--------------------
+
+    The following are for Linux kernel rebuilding, for your
+    reference only.
+
+    For appropriate details, please refer to the Linux document:
+
+        a. Run the following commands::
+
+	     cd /usr/src/linux
+	     make clean		     # take a few minutes
+	     make dep		     # take a few minutes
+	     make bzImage	     # take probably 10-20 minutes
+	     make install	     # copy boot image to correct position
+
+	f. Please make sure the boot kernel (vmlinuz) is in the
+	   correct position.
+	g. If you use 'lilo' utility, you should check /etc/lilo.conf
+	   'image' item specified the path which is the 'vmlinuz' path,
+	   or you will load wrong (or old) boot kernel image (vmlinuz).
+	   After checking /etc/lilo.conf, please run "lilo".
+
+	  Note that if the result of "make bzImage" is ERROR, then you have to
+	  go back to Linux configuration Setup. Type "make menuconfig" in
+          directory /usr/src/linux.
+
+
+3.5.6 Make tty device and special file
+--------------------------------------
+
+    ::
+       # cd /moxa/mxser/driver
+       # ./msmknod
+
+3.5.7 Make utility
+------------------
+
+    ::
+
+	  # cd /moxa/mxser/utility
+	  # make clean; make install
+
+3.5.8 Reboot
+------------
+
+
+
+3.6 Custom configuration
+========================
+
+    Although this driver already provides you default configuration, you
+    still can change the device name and major number. The instruction to
+    change these parameters are shown as below.
+
+a. Change Device name
+
+    If you'd like to use other device names instead of default naming
+    convention, all you have to do is to modify the internal code
+    within the shell script "msmknod". First, you have to open "msmknod"
+    by vi. Locate each line contains "ttyM" and "cum" and change them
+    to the device name you desired. "msmknod" creates the device names
+    you need next time executed.
+
+b. Change Major number
+
+    If major number 30 and 35 had been occupied, you may have to select
+    2 free major numbers for this driver. There are 3 steps to change
+    major numbers.
+
+3.6.1 Find free major numbers
+-----------------------------
+
+    In /proc/devices, you may find all the major numbers occupied
+    in the system. Please select 2 major numbers that are available.
+    e.g. 40, 45.
+
+3.6.2 Create special files
+--------------------------
+
+   Run /moxa/mxser/driver/msmknod to create special files with
+   specified major numbers.
+
+3.6.3 Modify driver with new major number
+-----------------------------------------
+
+   Run vi to open /moxa/mxser/driver/mxser.c. Locate the line
+   contains "MXSERMAJOR". Change the content as below::
+
+	  #define	  MXSERMAJOR		  40
+	  #define	  MXSERCUMAJOR		  45
+
+    3.6.4 Run "make clean; make install" in /moxa/mxser/driver.
+
+3.7 Verify driver installation
+==============================
+
+    You may refer to /var/log/messages to check the latest status
+    log reported by this driver whenever it's activated.
+
+4. Utilities
+^^^^^^^^^^^^
+
+   There are 3 utilities contained in this driver. They are msdiag, msmon and
+   msterm. These 3 utilities are released in form of source code. They should
+   be compiled into executable file and copied into /usr/bin.
+
+   Before using these utilities, please load driver (refer 3.4 & 3.5) and
+   make sure you had run the "msmknod" utility.
+
+msdiag - Diagnostic
+===================
+
+   This utility provides the function to display what Moxa Smartio/Industio
+   board found by driver in the system.
+
+msmon - Port Monitoring
+=======================
+
+   This utility gives the user a quick view about all the MOXA ports'
+   activities. One can easily learn each port's total received/transmitted
+   (Rx/Tx) character count since the time when the monitoring is started.
+
+   Rx/Tx throughputs per second are also reported in interval basis (e.g.
+   the last 5 seconds) and in average basis (since the time the monitoring
+   is started). You can reset all ports' count by <HOME> key. <+> <->
+   (plus/minus) keys to change the displaying time interval. Press <ENTER>
+   on the port, that cursor stay, to view the port's communication
+   parameters, signal status, and input/output queue.
+
+msterm - Terminal Emulation
+===========================
+
+   This utility provides data sending and receiving ability of all tty ports,
+   especially for MOXA ports. It is quite useful for testing simple
+   application, for example, sending AT command to a modem connected to the
+   port or used as a terminal for login purpose. Note that this is only a
+   dumb terminal emulation without handling full screen operation.
+
+5. Setserial
+^^^^^^^^^^^^
+
+   Supported Setserial parameters are listed as below.
+
+   ============== =========================================================
+   uart		  set UART type(16450-->disable FIFO, 16550A-->enable FIFO)
+   close_delay	  set the amount of time(in 1/100 of a second) that DTR
+		  should be kept low while being closed.
+   closing_wait   set the amount of time(in 1/100 of a second) that the
+		  serial port should wait for data to be drained while
+		  being closed, before the receiver is disabled.
+   spd_hi	  Use  57.6kb  when  the application requests 38.4kb.
+   spd_vhi	  Use  115.2kb	when  the application requests 38.4kb.
+   spd_shi	  Use  230.4kb	when  the application requests 38.4kb.
+   spd_warp	  Use  460.8kb	when  the application requests 38.4kb.
+   spd_normal	  Use  38.4kb  when  the application requests 38.4kb.
+   spd_cust	  Use  the custom divisor to set the speed when  the
+		  application requests 38.4kb.
+   divisor	  This option set the custom division.
+   baud_base	  This option set the base baud rate.
+   ============== =========================================================
+
+6. Troubleshooting
+^^^^^^^^^^^^^^^^^^
+
+   The boot time error messages and solutions are stated as clearly as
+   possible. If all the possible solutions fail, please contact our technical
+   support team to get more help.
+
+
+   Error msg:
+	      More than 4 Moxa Smartio/Industio family boards found. Fifth board
+              and after are ignored.
+
+   Solution:
+   To avoid this problem, please unplug fifth and after board, because Moxa
+   driver supports up to 4 boards.
+
+   Error msg:
+	      Request_irq fail, IRQ(?) may be conflict with another device.
+
+   Solution:
+   Other PCI or ISA devices occupy the assigned IRQ. If you are not sure
+   which device causes the situation, please check /proc/interrupts to find
+   free IRQ and simply change another free IRQ for Moxa board.
+
+   Error msg:
+	      Board #: C1xx Series(CAP=xxx) interrupt number invalid.
+
+   Solution:
+   Each port within the same multiport board shares the same IRQ. Please set
+   one IRQ (IRQ doesn't equal to zero) for one Moxa board.
+
+   Error msg:
+	      No interrupt vector be set for Moxa ISA board(CAP=xxx).
+
+   Solution:
+   Moxa ISA board needs an interrupt vector. Please refer to user's manual
+   "Hardware Installation" chapter to set interrupt vector.
+
+   Error msg:
+              Couldn't install MOXA Smartio/Industio family driver!
+
+   Solution:
+   Load Moxa driver fail, the major number may conflict with other devices.
+   Please refer to previous section 3.6 to change a free major number for
+   Moxa driver.
+
+   Error msg:
+              Couldn't install MOXA Smartio/Industio family callout driver!
+
+   Solution:
+   Load Moxa callout driver fail, the callout device major number may
+   conflict with other devices. Please refer to previous section 3.6 to
+   change a free callout device major number for Moxa driver.
diff --git a/Documentation/driver-api/serial/n_gsm.rst b/Documentation/driver-api/serial/n_gsm.rst
new file mode 100644
index 000000000000..f3ad9fd26408
--- /dev/null
+++ b/Documentation/driver-api/serial/n_gsm.rst
@@ -0,0 +1,103 @@
+==============================
+GSM 0710 tty multiplexor HOWTO
+==============================
+
+This line discipline implements the GSM 07.10 multiplexing protocol
+detailed in the following 3GPP document:
+
+	http://www.3gpp.org/ftp/Specs/archive/07_series/07.10/0710-720.zip
+
+This document gives some hints on how to use this driver with GPRS and 3G
+modems connected to a physical serial port.
+
+How to use it
+-------------
+1. initialize the modem in 0710 mux mode (usually AT+CMUX= command) through
+   its serial port. Depending on the modem used, you can pass more or less
+   parameters to this command,
+2. switch the serial line to using the n_gsm line discipline by using
+   TIOCSETD ioctl,
+3. configure the mux using GSMIOC_GETCONF / GSMIOC_SETCONF ioctl,
+
+Major parts of the initialization program :
+(a good starting point is util-linux-ng/sys-utils/ldattach.c)::
+
+  #include <linux/gsmmux.h>
+  #define N_GSM0710	21	/* GSM 0710 Mux */
+  #define DEFAULT_SPEED	B115200
+  #define SERIAL_PORT	"/dev/ttyS0"
+
+	int ldisc = N_GSM0710;
+	struct gsm_config c;
+	struct termios configuration;
+
+	/* open the serial port connected to the modem */
+	fd = open(SERIAL_PORT, O_RDWR | O_NOCTTY | O_NDELAY);
+
+	/* configure the serial port : speed, flow control ... */
+
+	/* send the AT commands to switch the modem to CMUX mode
+	   and check that it's successful (should return OK) */
+	write(fd, "AT+CMUX=0\r", 10);
+
+	/* experience showed that some modems need some time before
+	   being able to answer to the first MUX packet so a delay
+	   may be needed here in some case */
+	sleep(3);
+
+	/* use n_gsm line discipline */
+	ioctl(fd, TIOCSETD, &ldisc);
+
+	/* get n_gsm configuration */
+	ioctl(fd, GSMIOC_GETCONF, &c);
+	/* we are initiator and need encoding 0 (basic) */
+	c.initiator = 1;
+	c.encapsulation = 0;
+	/* our modem defaults to a maximum size of 127 bytes */
+	c.mru = 127;
+	c.mtu = 127;
+	/* set the new configuration */
+	ioctl(fd, GSMIOC_SETCONF, &c);
+
+	/* and wait for ever to keep the line discipline enabled */
+	daemon(0,0);
+	pause();
+
+4. create the devices corresponding to the "virtual" serial ports (take care,
+   each modem has its configuration and some DLC have dedicated functions,
+   for example GPS), starting with minor 1 (DLC0 is reserved for the management
+   of the mux)::
+
+     MAJOR=`cat /proc/devices |grep gsmtty | awk '{print $1}'`
+     for i in `seq 1 4`; do
+	mknod /dev/ttygsm$i c $MAJOR $i
+     done
+
+5. use these devices as plain serial ports.
+
+   for example, it's possible:
+
+   - to use gnokii to send / receive SMS on ttygsm1
+   - and to use ppp to establish a datalink on ttygsm2
+
+6. first close all virtual ports before closing the physical port.
+
+   Note that after closing the physical port the modem is still in multiplexing
+   mode. This may prevent a successful re-opening of the port later. To avoid
+   this situation either reset the modem if your hardware allows that or send
+   a disconnect command frame manually before initializing the multiplexing mode
+   for the second time. The byte sequence for the disconnect command frame is::
+
+      0xf9, 0x03, 0xef, 0x03, 0xc3, 0x16, 0xf9.
+
+Additional Documentation
+------------------------
+More practical details on the protocol and how it's supported by industrial
+modems can be found in the following documents :
+
+- http://www.telit.com/module/infopool/download.php?id=616
+- http://www.u-blox.com/images/downloads/Product_Docs/LEON-G100-G200-MuxImplementation_ApplicationNote_%28GSM%20G1-CS-10002%29.pdf
+- http://www.sierrawireless.com/Support/Downloads/AirPrime/WMP_Series/~/media/Support_Downloads/AirPrime/Application_notes/CMUX_Feature_Application_Note-Rev004.ashx
+- http://wm.sim.com/sim/News/photo/2010721161442.pdf
+
+11-03-08 - Eric Bénard - <eric@eukrea.com>
diff --git a/Documentation/driver-api/serial/rocket.rst b/Documentation/driver-api/serial/rocket.rst
new file mode 100644
index 000000000000..23761eae4282
--- /dev/null
+++ b/Documentation/driver-api/serial/rocket.rst
@@ -0,0 +1,185 @@
+================================================
+Comtrol(tm) RocketPort(R)/RocketModem(TM) Series
+================================================
+
+Device Driver for the Linux Operating System
+============================================
+
+Product overview
+----------------
+
+This driver provides a loadable kernel driver for the Comtrol RocketPort
+and RocketModem PCI boards. These boards provide 2, 4, 8, 16, or 32
+high-speed serial ports or modems.  This driver supports up to a combination
+of four RocketPort or RocketModem boards in one machine simultaneously.
+This file assumes that you are using the RocketPort driver which is
+integrated into the kernel sources.
+
+The driver can also be installed as an external module using the usual
+"make;make install" routine.  This external module driver, obtainable
+from the Comtrol website listed below, is useful for updating the driver
+or installing it into kernels which do not have the driver configured
+into them.  Installation instructions for the external module
+are in the included README and HW_INSTALL files.
+
+RocketPort ISA and RocketModem II PCI boards currently are only supported by
+this driver in module form.
+
+The RocketPort ISA board requires I/O ports to be configured by the DIP
+switches on the board.  See the section "ISA Rocketport Boards" below for
+information on how to set the DIP switches.
+
+You pass the I/O port to the driver using the following module parameters:
+
+board1:
+	I/O port for the first ISA board
+board2:
+	I/O port for the second ISA board
+board3:
+	I/O port for the third ISA board
+board4:
+	I/O port for the fourth ISA board
+
+There is a set of utilities and scripts provided with the external driver
+(downloadable from http://www.comtrol.com) that ease the configuration and
+setup of the ISA cards.
+
+The RocketModem II PCI boards require firmware to be loaded into the card
+before it will function.  The driver has only been tested as a module for this
+board.
+
+Installation Procedures
+-----------------------
+
+RocketPort/RocketModem PCI cards require no driver configuration, they are
+automatically detected and configured.
+
+The RocketPort driver can be installed as a module (recommended) or built
+into the kernel. This is selected, as for other drivers, through the `make config`
+command from the root of the Linux source tree during the kernel build process.
+
+The RocketPort/RocketModem serial ports installed by this driver are assigned
+device major number 46, and will be named /dev/ttyRx, where x is the port number
+starting at zero (ex. /dev/ttyR0, /dev/ttyR1, ...).  If you have multiple cards
+installed in the system, the mapping of port names to serial ports is displayed
+in the system log at /var/log/messages.
+
+If installed as a module, the module must be loaded.  This can be done
+manually by entering "modprobe rocket".  To have the module loaded automatically
+upon system boot, edit a `/etc/modprobe.d/*.conf` file and add the line
+"alias char-major-46 rocket".
+
+In order to use the ports, their device names (nodes) must be created with mknod.
+This is only required once, the system will retain the names once created.  To
+create the RocketPort/RocketModem device names, use the command
+"mknod /dev/ttyRx c 46 x" where x is the port number starting at zero.
+
+For example::
+
+	> mknod /dev/ttyR0 c 46 0
+	> mknod /dev/ttyR1 c 46 1
+	> mknod /dev/ttyR2 c 46 2
+
+The Linux script MAKEDEV will create the first 16 ttyRx device names (nodes)
+for you::
+
+	>/dev/MAKEDEV ttyR
+
+ISA Rocketport Boards
+---------------------
+
+You must assign and configure the I/O addresses used by the ISA Rocketport
+card before installing and using it.  This is done by setting a set of DIP
+switches on the Rocketport board.
+
+
+Setting the I/O address
+-----------------------
+
+Before installing RocketPort(R) or RocketPort RA boards, you must find
+a range of I/O addresses for it to use. The first RocketPort card
+requires a 68-byte contiguous block of I/O addresses, starting at one
+of the following: 0x100h, 0x140h, 0x180h, 0x200h, 0x240h, 0x280h,
+0x300h, 0x340h, 0x380h.  This I/O address must be reflected in the DIP
+switches of *all* of the Rocketport cards.
+
+The second, third, and fourth RocketPort cards require a 64-byte
+contiguous block of I/O addresses, starting at one of the following
+I/O addresses: 0x100h, 0x140h, 0x180h, 0x1C0h, 0x200h, 0x240h, 0x280h,
+0x2C0h, 0x300h, 0x340h, 0x380h, 0x3C0h.  The I/O address used by the
+second, third, and fourth Rocketport cards (if present) are set via
+software control.  The DIP switch settings for the I/O address must be
+set to the value of the first Rocketport card.
+
+In order to distinguish each of the cards from the others, each card
+must have a unique board ID set on the dip switches.  The first
+Rocketport board must be set with the DIP switches corresponding to
+the first board, the second board must be set with the DIP switches
+corresponding to the second board, etc.  IMPORTANT: The board ID is
+the only place where the DIP switch settings should differ between the
+various Rocketport boards in a system.
+
+The I/O address range used by any of the RocketPort cards must not
+conflict with any other cards in the system, including other
+RocketPort cards.  Below, you will find a list of commonly used I/O
+address ranges which may be in use by other devices in your system.
+On a Linux system, "cat /proc/ioports" will also be helpful in
+identifying what I/O addresses are being used by devices on your
+system.
+
+Remember, the FIRST RocketPort uses 68 I/O addresses.  So, if you set it
+for 0x100, it will occupy 0x100 to 0x143.  This would mean that you
+CAN NOT set the second, third or fourth board for address 0x140 since
+the first 4 bytes of that range are used by the first board.  You would
+need to set the second, third, or fourth board to one of the next available
+blocks such as 0x180.
+
+RocketPort and RocketPort RA SW1 Settings::
+
+            +-------------------------------+
+            | 8 | 7 | 6 | 5 | 4 | 3 | 2 | 1 |
+            +-------+-------+---------------+
+            | Unused| Card  | I/O Port Block|
+            +-------------------------------+
+
+  DIP Switches                             DIP Switches
+  7    8                                   6    5
+  ===================                      ===================
+  On   On   UNUSED, MUST BE ON.            On   On   First Card    <==== Default
+                                           On   Off  Second Card
+                                           Off  On   Third Card
+                                           Off  Off  Fourth Card
+
+  DIP Switches         I/O Address Range
+  4    3    2    1     Used by the First Card
+  =====================================
+  On   Off  On   Off   100-143
+  On   Off  Off  On    140-183
+  On   Off  Off  Off   180-1C3       <==== Default
+  Off  On   On   Off   200-243
+  Off  On   Off  On    240-283
+  Off  On   Off  Off   280-2C3
+  Off  Off  On   Off   300-343
+  Off  Off  Off  On    340-383
+  Off  Off  Off  Off   380-3C3
+
+Reporting Bugs
+--------------
+
+For technical support, please provide the following
+information: Driver version, kernel release, distribution of
+kernel, and type of board you are using. Error messages, log
+printouts, and port configuration details are especially helpful.
+
+USA:
+    :Phone: (612) 494-4100
+    :FAX: (612) 494-4199
+    :email: support@comtrol.com
+
+Comtrol Europe:
+    :Phone: +44 (0) 1 869 323-220
+    :FAX: +44 (0) 1 869 323-211
+    :email: support@comtrol.co.uk
+
+Web:	http://www.comtrol.com
+FTP:	ftp.comtrol.com
diff --git a/Documentation/driver-api/serial/serial-iso7816.rst b/Documentation/driver-api/serial/serial-iso7816.rst
new file mode 100644
index 000000000000..d990143de0c6
--- /dev/null
+++ b/Documentation/driver-api/serial/serial-iso7816.rst
@@ -0,0 +1,90 @@
+=============================
+ISO7816 Serial Communications
+=============================
+
+1. Introduction
+===============
+
+  ISO/IEC7816 is a series of standards specifying integrated circuit cards (ICC)
+  also known as smart cards.
+
+2. Hardware-related considerations
+==================================
+
+  Some CPUs/UARTs (e.g., Microchip AT91) contain a built-in mode capable of
+  handling communication with a smart card.
+
+  For these microcontrollers, the Linux driver should be made capable of
+  working in both modes, and proper ioctls (see later) should be made
+  available at user-level to allow switching from one mode to the other, and
+  vice versa.
+
+3. Data Structures Already Available in the Kernel
+==================================================
+
+  The Linux kernel provides the serial_iso7816 structure (see [1]) to handle
+  ISO7816 communications. This data structure is used to set and configure
+  ISO7816 parameters in ioctls.
+
+  Any driver for devices capable of working both as RS232 and ISO7816 should
+  implement the iso7816_config callback in the uart_port structure. The
+  serial_core calls iso7816_config to do the device specific part in response
+  to TIOCGISO7816 and TIOCSISO7816 ioctls (see below). The iso7816_config
+  callback receives a pointer to struct serial_iso7816.
+
+4. Usage from user-level
+========================
+
+  From user-level, ISO7816 configuration can be get/set using the previous
+  ioctls. For instance, to set ISO7816 you can use the following code::
+
+	#include <linux/serial.h>
+
+	/* Include definition for ISO7816 ioctls: TIOCSISO7816 and TIOCGISO7816 */
+	#include <sys/ioctl.h>
+
+	/* Open your specific device (e.g., /dev/mydevice): */
+	int fd = open ("/dev/mydevice", O_RDWR);
+	if (fd < 0) {
+		/* Error handling. See errno. */
+	}
+
+	struct serial_iso7816 iso7816conf;
+
+	/* Reserved fields have to be zeroed */
+	memset(&iso7816conf, 0, sizeof(iso7816conf));
+
+	/* Enable ISO7816 mode: */
+	iso7816conf.flags |= SER_ISO7816_ENABLED;
+
+	/* Select the protocol: */
+	/* T=0 */
+	iso7816conf.flags |= SER_ISO7816_T(0);
+	/* or T=1 */
+	iso7816conf.flags |= SER_ISO7816_T(1);
+
+	/* Set the guard time: */
+	iso7816conf.tg = 2;
+
+	/* Set the clock frequency: */
+	iso7816conf.clk = 3571200;
+
+	/* Set transmission factors: */
+	iso7816conf.sc_fi = 372;
+	iso7816conf.sc_di = 1;
+
+	if (ioctl(fd, TIOCSISO7816, &iso7816conf) < 0) {
+		/* Error handling. See errno. */
+	}
+
+	/* Use read() and write() syscalls here... */
+
+	/* Close the device when finished: */
+	if (close (fd) < 0) {
+		/* Error handling. See errno. */
+	}
+
+5. References
+=============
+
+ [1]    include/uapi/linux/serial.h
diff --git a/Documentation/driver-api/serial/serial-rs485.rst b/Documentation/driver-api/serial/serial-rs485.rst
new file mode 100644
index 000000000000..6bc824f948f9
--- /dev/null
+++ b/Documentation/driver-api/serial/serial-rs485.rst
@@ -0,0 +1,103 @@
+===========================
+RS485 Serial Communications
+===========================
+
+1. Introduction
+===============
+
+   EIA-485, also known as TIA/EIA-485 or RS-485, is a standard defining the
+   electrical characteristics of drivers and receivers for use in balanced
+   digital multipoint systems.
+   This standard is widely used for communications in industrial automation
+   because it can be used effectively over long distances and in electrically
+   noisy environments.
+
+2. Hardware-related Considerations
+==================================
+
+   Some CPUs/UARTs (e.g., Atmel AT91 or 16C950 UART) contain a built-in
+   half-duplex mode capable of automatically controlling line direction by
+   toggling RTS or DTR signals. That can be used to control external
+   half-duplex hardware like an RS485 transceiver or any RS232-connected
+   half-duplex devices like some modems.
+
+   For these microcontrollers, the Linux driver should be made capable of
+   working in both modes, and proper ioctls (see later) should be made
+   available at user-level to allow switching from one mode to the other, and
+   vice versa.
+
+3. Data Structures Already Available in the Kernel
+==================================================
+
+   The Linux kernel provides the serial_rs485 structure (see [1]) to handle
+   RS485 communications. This data structure is used to set and configure RS485
+   parameters in the platform data and in ioctls.
+
+   The device tree can also provide RS485 boot time parameters (see [2]
+   for bindings). The driver is in charge of filling this data structure from
+   the values given by the device tree.
+
+   Any driver for devices capable of working both as RS232 and RS485 should
+   implement the rs485_config callback in the uart_port structure. The
+   serial_core calls rs485_config to do the device specific part in response
+   to TIOCSRS485 and TIOCGRS485 ioctls (see below). The rs485_config callback
+   receives a pointer to struct serial_rs485.
+
+4. Usage from user-level
+========================
+
+   From user-level, RS485 configuration can be get/set using the previous
+   ioctls. For instance, to set RS485 you can use the following code::
+
+	#include <linux/serial.h>
+
+	/* Include definition for RS485 ioctls: TIOCGRS485 and TIOCSRS485 */
+	#include <sys/ioctl.h>
+
+	/* Open your specific device (e.g., /dev/mydevice): */
+	int fd = open ("/dev/mydevice", O_RDWR);
+	if (fd < 0) {
+		/* Error handling. See errno. */
+	}
+
+	struct serial_rs485 rs485conf;
+
+	/* Enable RS485 mode: */
+	rs485conf.flags |= SER_RS485_ENABLED;
+
+	/* Set logical level for RTS pin equal to 1 when sending: */
+	rs485conf.flags |= SER_RS485_RTS_ON_SEND;
+	/* or, set logical level for RTS pin equal to 0 when sending: */
+	rs485conf.flags &= ~(SER_RS485_RTS_ON_SEND);
+
+	/* Set logical level for RTS pin equal to 1 after sending: */
+	rs485conf.flags |= SER_RS485_RTS_AFTER_SEND;
+	/* or, set logical level for RTS pin equal to 0 after sending: */
+	rs485conf.flags &= ~(SER_RS485_RTS_AFTER_SEND);
+
+	/* Set rts delay before send, if needed: */
+	rs485conf.delay_rts_before_send = ...;
+
+	/* Set rts delay after send, if needed: */
+	rs485conf.delay_rts_after_send = ...;
+
+	/* Set this flag if you want to receive data even while sending data */
+	rs485conf.flags |= SER_RS485_RX_DURING_TX;
+
+	if (ioctl (fd, TIOCSRS485, &rs485conf) < 0) {
+		/* Error handling. See errno. */
+	}
+
+	/* Use read() and write() syscalls here... */
+
+	/* Close the device when finished: */
+	if (close (fd) < 0) {
+		/* Error handling. See errno. */
+	}
+
+5. References
+=============
+
+ [1]	include/uapi/linux/serial.h
+
+ [2]	Documentation/devicetree/bindings/serial/rs485.txt
diff --git a/Documentation/driver-api/serial/tty.rst b/Documentation/driver-api/serial/tty.rst
new file mode 100644
index 000000000000..dd972caacf3e
--- /dev/null
+++ b/Documentation/driver-api/serial/tty.rst
@@ -0,0 +1,328 @@
+=================
+The Lockronomicon
+=================
+
+Your guide to the ancient and twisted locking policies of the tty layer and
+the warped logic behind them. Beware all ye who read on.
+
+
+Line Discipline
+---------------
+
+Line disciplines are registered with tty_register_ldisc() passing the
+discipline number and the ldisc structure. At the point of registration the
+discipline must be ready to use and it is possible it will get used before
+the call returns success. If the call returns an error then it won't get
+called. Do not re-use ldisc numbers as they are part of the userspace ABI
+and writing over an existing ldisc will cause demons to eat your computer.
+After the return the ldisc data has been copied so you may free your own
+copy of the structure. You must not re-register over the top of the line
+discipline even with the same data or your computer again will be eaten by
+demons.
+
+In order to remove a line discipline call tty_unregister_ldisc().
+In ancient times this always worked. In modern times the function will
+return -EBUSY if the ldisc is currently in use. Since the ldisc referencing
+code manages the module counts this should not usually be a concern.
+
+Heed this warning: the reference count field of the registered copies of the
+tty_ldisc structure in the ldisc table counts the number of lines using this
+discipline. The reference count of the tty_ldisc structure within a tty
+counts the number of active users of the ldisc at this instant. In effect it
+counts the number of threads of execution within an ldisc method (plus those
+about to enter and exit although this detail matters not).
+
+Line Discipline Methods
+-----------------------
+
+TTY side interfaces
+^^^^^^^^^^^^^^^^^^^
+
+======================= =======================================================
+open()			Called when the line discipline is attached to
+			the terminal. No other call into the line
+			discipline for this tty will occur until it
+			completes successfully. Should initialize any
+			state needed by the ldisc, and set receive_room
+			in the tty_struct to the maximum amount of data
+			the line discipline is willing to accept from the
+			driver with a single call to receive_buf().
+			Returning an error will prevent the ldisc from
+			being attached. Can sleep.
+
+close()			This is called on a terminal when the line
+			discipline is being unplugged. At the point of
+			execution no further users will enter the
+			ldisc code for this tty. Can sleep.
+
+hangup()		Called when the tty line is hung up.
+			The line discipline should cease I/O to the tty.
+			No further calls into the ldisc code will occur.
+			The return value is ignored. Can sleep.
+
+read()			(optional) A process requests reading data from
+			the line. Multiple read calls may occur in parallel
+			and the ldisc must deal with serialization issues.
+			If not defined, the process will receive an EIO
+			error. May sleep.
+
+write()			(optional) A process requests writing data to the
+			line. Multiple write calls are serialized by the
+			tty layer for the ldisc. If not defined, the
+			process will receive an EIO error. May sleep.
+
+flush_buffer()		(optional) May be called at any point between
+			open and close, and instructs the line discipline
+			to empty its input buffer.
+
+set_termios()		(optional) Called on termios structure changes.
+			The caller passes the old termios data and the
+			current data is in the tty. Called under the
+			termios semaphore so allowed to sleep. Serialized
+			against itself only.
+
+poll()			(optional) Check the status for the poll/select
+			calls. Multiple poll calls may occur in parallel.
+			May sleep.
+
+ioctl()			(optional) Called when an ioctl is handed to the
+			tty layer that might be for the ldisc. Multiple
+			ioctl calls may occur in parallel. May sleep.
+
+compat_ioctl()		(optional) Called when a 32 bit ioctl is handed
+			to the tty layer that might be for the ldisc.
+			Multiple ioctl calls may occur in parallel.
+			May sleep.
+======================= =======================================================
+
+Driver Side Interfaces
+^^^^^^^^^^^^^^^^^^^^^^
+
+======================= =======================================================
+receive_buf()		(optional) Called by the low-level driver to hand
+			a buffer of received bytes to the ldisc for
+			processing. The number of bytes is guaranteed not
+			to exceed the current value of tty->receive_room.
+			All bytes must be processed.
+
+receive_buf2()		(optional) Called by the low-level driver to hand
+			a buffer of received bytes to the ldisc for
+			processing. Returns the number of bytes processed.
+
+			If both receive_buf() and receive_buf2() are
+			defined, receive_buf2() should be preferred.
+
+write_wakeup()		May be called at any point between open and close.
+			The TTY_DO_WRITE_WAKEUP flag indicates if a call
+			is needed but always races versus calls. Thus the
+			ldisc must be careful about setting order and to
+			handle unexpected calls. Must not sleep.
+
+			The driver is forbidden from calling this directly
+			from the ->write call from the ldisc as the ldisc
+			is permitted to call the driver write method from
+			this function. In such a situation defer it.
+
+dcd_change()		Report to the tty line the current DCD pin status
+			changes and the relative timestamp. The timestamp
+			cannot be NULL.
+======================= =======================================================
+
+
+Driver Access
+^^^^^^^^^^^^^
+
+Line discipline methods can call the following methods of the underlying
+hardware driver through the function pointers within the tty->driver
+structure:
+
+======================= =======================================================
+write()			Write a block of characters to the tty device.
+			Returns the number of characters accepted. The
+			character buffer passed to this method is already
+			in kernel space.
+
+put_char()		Queues a character for writing to the tty device.
+			If there is no room in the queue, the character is
+			ignored.
+
+flush_chars()		(Optional) If defined, must be called after
+			queueing characters with put_char() in order to
+			start transmission.
+
+write_room()		Returns the number of characters the tty driver
+			will accept for queueing to be written.
+
+ioctl()			Invoke device specific ioctl.
+			Expects data pointers to refer to userspace.
+			Returns ENOIOCTLCMD for unrecognized ioctl numbers.
+
+set_termios()		Notify the tty driver that the device's termios
+			settings have changed. New settings are in
+			tty->termios. Previous settings should be passed in
+			the "old" argument.
+
+			The API is defined such that the driver should return
+			the actual modes selected. This means that the
+			driver function is responsible for modifying any
+			bits in the request it cannot fulfill to indicate
+			the actual modes being used. A device with no
+			hardware capability for change (e.g. a USB dongle or
+			virtual port) can provide NULL for this method.
+
+throttle()		Notify the tty driver that input buffers for the
+			line discipline are close to full, and it should
+			somehow signal that no more characters should be
+			sent to the tty.
+
+unthrottle()		Notify the tty driver that characters can now be
+			sent to the tty without fear of overrunning the
+			input buffers of the line disciplines.
+
+stop()			Ask the tty driver to stop outputting characters
+			to the tty device.
+
+start()			Ask the tty driver to resume sending characters
+			to the tty device.
+
+hangup()		Ask the tty driver to hang up the tty device.
+
+break_ctl()		(Optional) Ask the tty driver to turn on or off
+			BREAK status on the RS-232 port.  If state is -1,
+			then the BREAK status should be turned on; if
+			state is 0, then BREAK should be turned off.
+			If this routine is not implemented, use ioctls
+			TIOCSBRK / TIOCCBRK instead.
+
+wait_until_sent()	Waits until the device has written out all of the
+			characters in its transmitter FIFO.
+
+send_xchar()		Send a high-priority XON/XOFF character to the device.
+======================= =======================================================
+
+
+Flags
+^^^^^
+
+Line discipline methods have access to tty->flags field containing the
+following interesting flags:
+
+======================= =======================================================
+TTY_THROTTLED		Driver input is throttled. The ldisc should call
+			tty->driver->unthrottle() in order to resume
+			reception when it is ready to process more data.
+
+TTY_DO_WRITE_WAKEUP	If set, causes the driver to call the ldisc's
+			write_wakeup() method in order to resume
+			transmission when it can accept more data
+			to transmit.
+
+TTY_IO_ERROR		If set, causes all subsequent userspace read/write
+			calls on the tty to fail, returning -EIO.
+
+TTY_OTHER_CLOSED	Device is a pty and the other side has closed.
+
+TTY_NO_WRITE_SPLIT	Prevent driver from splitting up writes into
+			smaller chunks.
+======================= =======================================================
+
+
+Locking
+^^^^^^^
+
+Callers to the line discipline functions from the tty layer are required to
+take line discipline locks. The same is true of calls from the driver side
+but not yet enforced.
+
+Three calls are now provided::
+
+	ldisc = tty_ldisc_ref(tty);
+
+takes a handle to the line discipline in the tty and returns it. If no ldisc
+is currently attached or the ldisc is being closed and re-opened at this
+point then NULL is returned. While this handle is held the ldisc will not
+change or go away::
+
+	tty_ldisc_deref(ldisc)
+
+Returns the ldisc reference and allows the ldisc to be closed. Returning the
+reference takes away your right to call the ldisc functions until you take
+a new reference::
+
+	ldisc = tty_ldisc_ref_wait(tty);
+
+Performs the same function as tty_ldisc_ref except that it will wait for an
+ldisc change to complete and then return a reference to the new ldisc.
+
+While these functions are slightly slower than the old code they should have
+minimal impact as most receive logic uses the flip buffers and they only
+need to take a reference when they push bits up through the driver.
+
+A caution: The ldisc->open(), ldisc->close() and driver->set_ldisc
+functions are called with the ldisc unavailable. Thus tty_ldisc_ref will
+fail in this situation if used within these functions. Ldisc and driver
+code calling its own functions must be careful in this case.
+
+
+Driver Interface
+----------------
+
+======================= =======================================================
+open()			Called when a device is opened. May sleep
+
+close()			Called when a device is closed. At the point of
+			return from this call the driver must make no
+			further ldisc calls of any kind. May sleep
+
+write()			Called to write bytes to the device. May not
+			sleep. May occur in parallel in special cases.
+			Because this includes panic paths drivers generally
+			shouldn't try and do clever locking here.
+
+put_char()		Stuff a single character onto the queue. The
+			driver is guaranteed a follow-up call to
+			flush_chars.
+
+flush_chars()		Ask the kernel to write the put_char queue
+
+write_room()		Return the number of characters that can be stuffed
+			into the port buffers without overflow (or less).
+			The ldisc is responsible for being intelligent
+			about multi-threading of write_room/write calls
+
+ioctl()			Called when an ioctl may be for the driver
+
+set_termios()		Called on termios change, serialized against
+			itself by a semaphore. May sleep.
+
+set_ldisc()		Notifier for discipline change. At the point this
+			is done the discipline is not yet usable. Can now
+			sleep (I think)
+
+throttle()		Called by the ldisc to ask the driver to do flow
+			control.  Serialization including with unthrottle
+			is the job of the ldisc layer.
+
+unthrottle()		Called by the ldisc to ask the driver to stop flow
+			control.
+
+stop()			Ldisc notifier to the driver to stop output. As with
+			throttle the serializations with start() are down
+			to the ldisc layer.
+
+start()			Ldisc notifier to the driver to start output.
+
+hangup()		Ask the tty driver to cause a hangup initiated
+			from the host side. [Can sleep ??]
+
+break_ctl()		Send RS232 break. Can sleep. Can get called in
+			parallel, driver must serialize (for now), and
+			with write calls.
+
+wait_until_sent()	Wait for characters to exit the hardware queue
+			of the driver. Can sleep
+
+send_xchar()	  	Send XON/XOFF and if possible jump the queue with
+			it in order to get fast flow control responses.
+			Cannot sleep ??
+======================= =======================================================
diff --git a/Documentation/serial/cyclades_z.rst b/Documentation/serial/cyclades_z.rst
deleted file mode 100644
index 532ff67e2f1c..000000000000
--- a/Documentation/serial/cyclades_z.rst
+++ /dev/null
@@ -1,11 +0,0 @@
-================
-Cyclades-Z notes
-================
-
-The Cyclades-Z must have firmware loaded onto the card before it will
-operate.  This operation should be performed during system startup,
-
-The firmware, loader program and the latest device driver code are
-available from Cyclades at
-
-    ftp://ftp.cyclades.com/pub/cyclades/cyclades-z/linux/
diff --git a/Documentation/serial/driver.rst b/Documentation/serial/driver.rst
deleted file mode 100644
index 4537119bf624..000000000000
--- a/Documentation/serial/driver.rst
+++ /dev/null
@@ -1,549 +0,0 @@
-====================
-Low Level Serial API
-====================
-
-
-This document is meant as a brief overview of some aspects of the new serial
-driver.  It is not complete, any questions you have should be directed to
-<rmk@arm.linux.org.uk>
-
-The reference implementation is contained within amba-pl011.c.
-
-
-
-Low Level Serial Hardware Driver
---------------------------------
-
-The low level serial hardware driver is responsible for supplying port
-information (defined by uart_port) and a set of control methods (defined
-by uart_ops) to the core serial driver.  The low level driver is also
-responsible for handling interrupts for the port, and providing any
-console support.
-
-
-Console Support
----------------
-
-The serial core provides a few helper functions.  This includes identifying
-the correct port structure (via uart_get_console) and decoding command line
-arguments (uart_parse_options).
-
-There is also a helper function (uart_console_write) which performs a
-character by character write, translating newlines to CRLF sequences.
-Driver writers are recommended to use this function rather than implementing
-their own version.
-
-
-Locking
--------
-
-It is the responsibility of the low level hardware driver to perform the
-necessary locking using port->lock.  There are some exceptions (which
-are described in the uart_ops listing below.)
-
-There are two locks.  A per-port spinlock, and an overall semaphore.
-
-From the core driver perspective, the port->lock locks the following
-data::
-
-	port->mctrl
-	port->icount
-	port->state->xmit.head (circ_buf->head)
-	port->state->xmit.tail (circ_buf->tail)
-
-The low level driver is free to use this lock to provide any additional
-locking.
-
-The port_sem semaphore is used to protect against ports being added/
-removed or reconfigured at inappropriate times. Since v2.6.27, this
-semaphore has been the 'mutex' member of the tty_port struct, and
-commonly referred to as the port mutex.
-
-
-uart_ops
---------
-
-The uart_ops structure is the main interface between serial_core and the
-hardware specific driver.  It contains all the methods to control the
-hardware.
-
-  tx_empty(port)
-	This function tests whether the transmitter fifo and shifter
-	for the port described by 'port' is empty.  If it is empty,
-	this function should return TIOCSER_TEMT, otherwise return 0.
-	If the port does not support this operation, then it should
-	return TIOCSER_TEMT.
-
-	Locking: none.
-
-	Interrupts: caller dependent.
-
-	This call must not sleep
-
-  set_mctrl(port, mctrl)
-	This function sets the modem control lines for port described
-	by 'port' to the state described by mctrl.  The relevant bits
-	of mctrl are:
-
-		- TIOCM_RTS	RTS signal.
-		- TIOCM_DTR	DTR signal.
-		- TIOCM_OUT1	OUT1 signal.
-		- TIOCM_OUT2	OUT2 signal.
-		- TIOCM_LOOP	Set the port into loopback mode.
-
-	If the appropriate bit is set, the signal should be driven
-	active.  If the bit is clear, the signal should be driven
-	inactive.
-
-	Locking: port->lock taken.
-
-	Interrupts: locally disabled.
-
-	This call must not sleep
-
-  get_mctrl(port)
-	Returns the current state of modem control inputs.  The state
-	of the outputs should not be returned, since the core keeps
-	track of their state.  The state information should include:
-
-		- TIOCM_CAR	state of DCD signal
-		- TIOCM_CTS	state of CTS signal
-		- TIOCM_DSR	state of DSR signal
-		- TIOCM_RI	state of RI signal
-
-	The bit is set if the signal is currently driven active.  If
-	the port does not support CTS, DCD or DSR, the driver should
-	indicate that the signal is permanently active.  If RI is
-	not available, the signal should not be indicated as active.
-
-	Locking: port->lock taken.
-
-	Interrupts: locally disabled.
-
-	This call must not sleep
-
-  stop_tx(port)
-	Stop transmitting characters.  This might be due to the CTS
-	line becoming inactive or the tty layer indicating we want
-	to stop transmission due to an XOFF character.
-
-	The driver should stop transmitting characters as soon as
-	possible.
-
-	Locking: port->lock taken.
-
-	Interrupts: locally disabled.
-
-	This call must not sleep
-
-  start_tx(port)
-	Start transmitting characters.
-
-	Locking: port->lock taken.
-
-	Interrupts: locally disabled.
-
-	This call must not sleep
-
-  throttle(port)
-	Notify the serial driver that input buffers for the line discipline are
-	close to full, and it should somehow signal that no more characters
-	should be sent to the serial port.
-	This will be called only if hardware assisted flow control is enabled.
-
-	Locking: serialized with .unthrottle() and termios modification by the
-	tty layer.
-
-  unthrottle(port)
-	Notify the serial driver that characters can now be sent to the serial
-	port without fear of overrunning the input buffers of the line
-	disciplines.
-
-	This will be called only if hardware assisted flow control is enabled.
-
-	Locking: serialized with .throttle() and termios modification by the
-	tty layer.
-
-  send_xchar(port,ch)
-	Transmit a high priority character, even if the port is stopped.
-	This is used to implement XON/XOFF flow control and tcflow().  If
-	the serial driver does not implement this function, the tty core
-	will append the character to the circular buffer and then call
-	start_tx() / stop_tx() to flush the data out.
-
-	Do not transmit if ch == '\0' (__DISABLED_CHAR).
-
-	Locking: none.
-
-	Interrupts: caller dependent.
-
-  stop_rx(port)
-	Stop receiving characters; the port is in the process of
-	being closed.
-
-	Locking: port->lock taken.
-
-	Interrupts: locally disabled.
-
-	This call must not sleep
-
-  enable_ms(port)
-	Enable the modem status interrupts.
-
-	This method may be called multiple times.  Modem status
-	interrupts should be disabled when the shutdown method is
-	called.
-
-	Locking: port->lock taken.
-
-	Interrupts: locally disabled.
-
-	This call must not sleep
-
-  break_ctl(port,ctl)
-	Control the transmission of a break signal.  If ctl is
-	nonzero, the break signal should be transmitted.  The signal
-	should be terminated when another call is made with a zero
-	ctl.
-
-	Locking: caller holds tty_port->mutex
-
-  startup(port)
-	Grab any interrupt resources and initialise any low level driver
-	state.  Enable the port for reception.  It should not activate
-	RTS nor DTR; this will be done via a separate call to set_mctrl.
-
-	This method will only be called when the port is initially opened.
-
-	Locking: port_sem taken.
-
-	Interrupts: globally disabled.
-
-  shutdown(port)
-	Disable the port, disable any break condition that may be in
-	effect, and free any interrupt resources.  It should not disable
-	RTS nor DTR; this will have already been done via a separate
-	call to set_mctrl.
-
-	Drivers must not access port->state once this call has completed.
-
-	This method will only be called when there are no more users of
-	this port.
-
-	Locking: port_sem taken.
-
-	Interrupts: caller dependent.
-
-  flush_buffer(port)
-	Flush any write buffers, reset any DMA state and stop any
-	ongoing DMA transfers.
-
-	This will be called whenever the port->state->xmit circular
-	buffer is cleared.
-
-	Locking: port->lock taken.
-
-	Interrupts: locally disabled.
-
-	This call must not sleep
-
-  set_termios(port,termios,oldtermios)
-	Change the port parameters, including word length, parity, stop
-	bits.  Update read_status_mask and ignore_status_mask to indicate
-	the types of events we are interested in receiving.  Relevant
-	termios->c_cflag bits are:
-
-		CSIZE
-			- word size
-		CSTOPB
-			- 2 stop bits
-		PARENB
-			- parity enable
-		PARODD
-			- odd parity (when PARENB is in force)
-		CREAD
-			- enable reception of characters (if not set,
-			  still receive characters from the port, but
-			  throw them away).
-		CRTSCTS
-			- if set, enable CTS status change reporting
-		CLOCAL
-			- if not set, enable modem status change
-			  reporting.
-
-	Relevant termios->c_iflag bits are:
-
-		INPCK
-			- enable frame and parity error events to be
-			  passed to the TTY layer.
-		BRKINT / PARMRK
-			- both of these enable break events to be
-			  passed to the TTY layer.
-
-		IGNPAR
-			- ignore parity and framing errors
-		IGNBRK
-			- ignore break errors.  If IGNPAR is also
-			  set, ignore overrun errors as well.
-
-	The interaction of the iflag bits is as follows (parity error
-	given as an example):
-
-	=============== ======= ======  =============================
-	Parity error	INPCK	IGNPAR
-	=============== ======= ======  =============================
-	n/a		0	n/a	character received, marked as
-					TTY_NORMAL
-	None		1	n/a	character received, marked as
-					TTY_NORMAL
-	Yes		1	0	character received, marked as
-					TTY_PARITY
-	Yes		1	1	character discarded
-	=============== ======= ======  =============================
-
-	Other flags may be used (eg, xon/xoff characters) if your
-	hardware supports hardware "soft" flow control.
-
-	Locking: caller holds tty_port->mutex
-
-	Interrupts: caller dependent.
-
-	This call must not sleep
-
-  set_ldisc(port,termios)
-	Notifier for discipline change. See Documentation/serial/tty.rst.
-
-	Locking: caller holds tty_port->mutex
-
-  pm(port,state,oldstate)
-	Perform any power management related activities on the specified
-	port.  State indicates the new state (defined by
-	enum uart_pm_state), oldstate indicates the previous state.
-
-	This function should not be used to grab any resources.
-
-	This will be called when the port is initially opened and finally
-	closed, except when the port is also the system console.  This
-	will occur even if CONFIG_PM is not set.
-
-	Locking: none.
-
-	Interrupts: caller dependent.
-
-  type(port)
-	Return a pointer to a string constant describing the specified
-	port, or return NULL, in which case the string 'unknown' is
-	substituted.
-
-	Locking: none.
-
-	Interrupts: caller dependent.
-
-  release_port(port)
-	Release any memory and IO region resources currently in use by
-	the port.
-
-	Locking: none.
-
-	Interrupts: caller dependent.
-
-  request_port(port)
-	Request any memory and IO region resources required by the port.
-	If any fail, no resources should be registered when this function
-	returns, and it should return -EBUSY on failure.
-
-	Locking: none.
-
-	Interrupts: caller dependent.
-
-  config_port(port,type)
-	Perform any autoconfiguration steps required for the port.  `type`
-	contains a bit mask of the required configuration.  UART_CONFIG_TYPE
-	indicates that the port requires detection and identification.
-	port->type should be set to the type found, or PORT_UNKNOWN if
-	no port was detected.
-
-	UART_CONFIG_IRQ indicates autoconfiguration of the interrupt signal,
-	which should be probed using standard kernel autoprobing techniques.
-	This is not necessary on platforms where ports have interrupts
-	internally hard wired (eg, system on a chip implementations).
-
-	Locking: none.
-
-	Interrupts: caller dependent.
-
-  verify_port(port,serinfo)
-	Verify the new serial port information contained within serinfo is
-	suitable for this port type.
-
-	Locking: none.
-
-	Interrupts: caller dependent.
-
-  ioctl(port,cmd,arg)
-	Perform any port specific IOCTLs.  IOCTL commands must be defined
-	using the standard numbering system found in <asm/ioctl.h>
-
-	Locking: none.
-
-	Interrupts: caller dependent.
-
-  poll_init(port)
-	Called by kgdb to perform the minimal hardware initialization needed
-	to support poll_put_char() and poll_get_char().  Unlike ->startup()
-	this should not request interrupts.
-
-	Locking: tty_mutex and tty_port->mutex taken.
-
-	Interrupts: n/a.
-
-  poll_put_char(port,ch)
-	Called by kgdb to write a single character directly to the serial
-	port.  It can and should block until there is space in the TX FIFO.
-
-	Locking: none.
-
-	Interrupts: caller dependent.
-
-	This call must not sleep
-
-  poll_get_char(port)
-	Called by kgdb to read a single character directly from the serial
-	port.  If data is available, it should be returned; otherwise
-	the function should return NO_POLL_CHAR immediately.
-
-	Locking: none.
-
-	Interrupts: caller dependent.
-
-	This call must not sleep
-
-Other functions
----------------
-
-uart_update_timeout(port,cflag,baud)
-	Update the FIFO drain timeout, port->timeout, according to the
-	number of bits, parity, stop bits and baud rate.
-
-	Locking: caller is expected to take port->lock
-
-	Interrupts: n/a
-
-uart_get_baud_rate(port,termios,old,min,max)
-	Return the numeric baud rate for the specified termios, taking
-	account of the special 38400 baud "kludge".  The B0 baud rate
-	is mapped to 9600 baud.
-
-	If the baud rate is not within min..max, then if old is non-NULL,
-	the original baud rate will be tried.  If that exceeds the
-	min..max constraint, 9600 baud will be returned.  termios will
-	be updated to the baud rate in use.
-
-	Note: min..max must always allow 9600 baud to be selected.
-
-	Locking: caller dependent.
-
-	Interrupts: n/a
-
-uart_get_divisor(port,baud)
-	Return the divisor (baud_base / baud) for the specified baud
-	rate, appropriately rounded.
-
-	If 38400 baud and custom divisor is selected, return the
-	custom divisor instead.
-
-	Locking: caller dependent.
-
-	Interrupts: n/a
-
-uart_match_port(port1,port2)
-	This utility function can be used to determine whether two
-	uart_port structures describe the same port.
-
-	Locking: n/a
-
-	Interrupts: n/a
-
-uart_write_wakeup(port)
-	A driver is expected to call this function when the number of
-	characters in the transmit buffer has dropped below a threshold.
-
-	Locking: port->lock should be held.
-
-	Interrupts: n/a
-
-uart_register_driver(drv)
-	Register a uart driver with the core driver.  We in turn register
-	with the tty layer, and initialise the core driver per-port state.
-
-	drv->port should be NULL, and the per-port structures should be
-	registered using uart_add_one_port after this call has succeeded.
-
-	Locking: none
-
-	Interrupts: enabled
-
-uart_unregister_driver()
-	Remove all references to a driver from the core driver.  The low
-	level driver must have removed all its ports via the
-	uart_remove_one_port() if it registered them with uart_add_one_port().
-
-	Locking: none
-
-	Interrupts: enabled
-
-**uart_suspend_port()**
-
-**uart_resume_port()**
-
-**uart_add_one_port()**
-
-**uart_remove_one_port()**
-
-Other notes
------------
-
-It is intended some day to drop the 'unused' entries from uart_port, and
-allow low level drivers to register their own individual uart_port's with
-the core.  This will allow drivers to use uart_port as a pointer to a
-structure containing both the uart_port entry with their own extensions,
-thus::
-
-	struct my_port {
-		struct uart_port	port;
-		int			my_stuff;
-	};
-
-Modem control lines via GPIO
-----------------------------
-
-Some helpers are provided in order to set/get modem control lines via GPIO.
-
-mctrl_gpio_init(port, idx):
-	This will get the {cts,rts,...}-gpios from device tree if they are
-	present and request them, set direction etc, and return an
-	allocated structure. `devm_*` functions are used, so there's no need
-	to call mctrl_gpio_free().
-	As this sets up the irq handling make sure to not handle changes to the
-	gpio input lines in your driver, too.
-
-mctrl_gpio_free(dev, gpios):
-	This will free the requested gpios in mctrl_gpio_init().
-	As `devm_*` functions are used, there's generally no need to call
-	this function.
-
-mctrl_gpio_to_gpiod(gpios, gidx)
-	This returns the gpio_desc structure associated to the modem line
-	index.
-
-mctrl_gpio_set(gpios, mctrl):
-	This will set the gpios according to the mctrl state.
-
-mctrl_gpio_get(gpios, mctrl):
-	This will update mctrl with the gpios values.
-
-mctrl_gpio_enable_ms(gpios):
-	Enables irqs and handling of changes to the ms lines.
-
-mctrl_gpio_disable_ms(gpios):
-	Disables irqs and handling of changes to the ms lines.
diff --git a/Documentation/serial/index.rst b/Documentation/serial/index.rst
deleted file mode 100644
index d0ba22ea23bf..000000000000
--- a/Documentation/serial/index.rst
+++ /dev/null
@@ -1,32 +0,0 @@
-:orphan:
-
-==========================
-Support for Serial devices
-==========================
-
-.. toctree::
-    :maxdepth: 1
-
-
-    driver
-    tty
-
-Serial drivers
-==============
-
-.. toctree::
-    :maxdepth: 1
-
-    cyclades_z
-    moxa-smartio
-    n_gsm
-    rocket
-    serial-iso7816
-    serial-rs485
-
-.. only::  subproject and html
-
-   Indices
-   =======
-
-   * :ref:`genindex`
diff --git a/Documentation/serial/moxa-smartio.rst b/Documentation/serial/moxa-smartio.rst
deleted file mode 100644
index 156100f17c3f..000000000000
--- a/Documentation/serial/moxa-smartio.rst
+++ /dev/null
@@ -1,615 +0,0 @@
-=============================================================
-MOXA Smartio/Industio Family Device Driver Installation Guide
-=============================================================
-
-.. note::
-
-   This file is outdated. It needs some care in order to bring it
-   up to date with Kernel 5.0 and later.
-
-Copyright (C) 2008, Moxa Inc.
-
-Date: 01/21/2008
-
-.. Content
-
-   1. Introduction
-   2. System Requirement
-   3. Installation
-      3.1 Hardware installation
-      3.2 Driver files
-      3.3 Device naming convention
-      3.4 Module driver configuration
-      3.5 Static driver configuration for Linux kernel 2.4.x and 2.6.x.
-      3.6 Custom configuration
-      3.7 Verify driver installation
-   4. Utilities
-   5. Setserial
-   6. Troubleshooting
-
-1. Introduction
-^^^^^^^^^^^^^^^
-
-   The Smartio/Industio/UPCI family Linux driver supports following multiport
-   boards.
-
-    - 2 ports multiport board
-	CP-102U, CP-102UL, CP-102UF
-	CP-132U-I, CP-132UL,
-	CP-132, CP-132I, CP132S, CP-132IS,
-	CI-132, CI-132I, CI-132IS,
-	(C102H, C102HI, C102HIS, C102P, CP-102, CP-102S)
-
-    - 4 ports multiport board
-	CP-104EL,
-	CP-104UL, CP-104JU,
-	CP-134U, CP-134U-I,
-	C104H/PCI, C104HS/PCI,
-	CP-114, CP-114I, CP-114S, CP-114IS, CP-114UL,
-	C104H, C104HS,
-	CI-104J, CI-104JS,
-	CI-134, CI-134I, CI-134IS,
-	(C114HI, CT-114I, C104P),
-	POS-104UL,
-	CB-114,
-	CB-134I
-
-    - 8 ports multiport board
-	CP-118EL, CP-168EL,
-	CP-118U, CP-168U,
-	C168H/PCI,
-	C168H, C168HS,
-	(C168P),
-	CB-108
-
-   This driver and installation procedure have been developed upon Linux Kernel
-   2.4.x and 2.6.x. This driver supports Intel x86 hardware platform. In order
-   to maintain compatibility, this version has also been properly tested with
-   RedHat, Mandrake, Fedora and S.u.S.E Linux. However, if a compatibility
-   problem occurs, please contact Moxa at support@moxa.com.tw.
-
-   In addition to device driver, useful utilities are also provided in this
-   version. They are:
-
-    - msdiag
-		 Diagnostic program for displaying installed Moxa
-                 Smartio/Industio boards.
-    - msmon
-		 Monitor program to observe data count and line status signals.
-    - msterm     A simple terminal program which is useful in testing serial
-	         ports.
-    - io-irq.exe
-		 Configuration program to setup ISA boards. Please note that
-                 this program can only be executed under DOS.
-
-   All the drivers and utilities are published in form of source code under
-   GNU General Public License in this version. Please refer to GNU General
-   Public License announcement in each source code file for more detail.
-
-   In Moxa's Web sites, you may always find latest driver at http://www.moxa.com/.
-
-   This version of driver can be installed as Loadable Module (Module driver)
-   or built-in into kernel (Static driver). You may refer to following
-   installation procedure for suitable one. Before you install the driver,
-   please refer to hardware installation procedure in the User's Manual.
-
-   We assume the user should be familiar with following documents.
-
-   - Serial-HOWTO
-   - Kernel-HOWTO
-
-2. System Requirement
-^^^^^^^^^^^^^^^^^^^^^
-
-   - Hardware platform: Intel x86 machine
-   - Kernel version: 2.4.x or 2.6.x
-   - gcc version 2.72 or later
-   - Maximum 4 boards can be installed in combination
-
-3. Installation
-^^^^^^^^^^^^^^^
-
-3.1 Hardware installation
-=========================
-
-   There are two types of buses, ISA and PCI, for Smartio/Industio
-   family multiport board.
-
-ISA board
----------
-
-   You'll have to configure CAP address, I/O address, Interrupt Vector
-   as well as IRQ before installing this driver. Please refer to hardware
-   installation procedure in User's Manual before proceeding any further.
-   Please make sure the JP1 is open after the ISA board is set properly.
-
-PCI/UPCI board
---------------
-
-   You may need to adjust IRQ usage in BIOS to avoid IRQ conflicts
-   with other ISA devices. Please refer to hardware installation
-   procedure in User's Manual in advance.
-
-PCI IRQ Sharing
----------------
-
-   Each port within the same multiport board shares the same IRQ. Up to
-   4 Moxa Smartio/Industio PCI Family multiport boards can be installed
-   together on one system and they can share the same IRQ.
-
-
-3.2 Driver files
-================
-
-   The driver file may be obtained from ftp, CD-ROM or floppy disk. The
-   first step, anyway, is to copy driver file "mxser.tgz" into specified
-   directory, e.g. /moxa. Then execute the commands below::
-
-       # cd /
-       # mkdir moxa
-       # cd /moxa
-       # tar xvf /dev/fd0
-
-or::
-
-       # cd /
-       # mkdir moxa
-       # cd /moxa
-       # cp /mnt/cdrom/<driver directory>/mxser.tgz .
-       # tar xvfz mxser.tgz
-
-
-3.3 Device naming convention
-============================
-
-   You may find all the driver and utilities files in /moxa/mxser.
-   Following installation procedure depends on the model you'd like to
-   run the driver. If you prefer module driver, please refer to 3.4.
-   If static driver is required, please refer to 3.5.
-
-Dialin and callout port
------------------------
-
-   This driver retains traditional serial device properties. There are
-   two special file names for each serial port. One is dial-in port
-   which is named "ttyMxx". For callout port, the naming convention
-   is "cumxx".
-
-Device naming when more than 2 boards installed
------------------------------------------------
-
-   Naming convention for each Smartio/Industio multiport board is
-   pre-defined as below.
-
-   ============ ===============       ==============
-   Board Num.	 Dial-in Port	      Callout port
-   1st board	ttyM0  - ttyM7	      cum0  - cum7
-   2nd board	ttyM8  - ttyM15       cum8  - cum15
-   3rd board	ttyM16 - ttyM23       cum16 - cum23
-   4th board	ttyM24 - ttyM31       cum24 - cum31
-   ============ ===============       ==============
-
-.. note::
-
-   Under Kernel 2.6 and upper, the cum Device is Obsolete. So use ttyM*
-   device instead.
-
-Board sequence
---------------
-
-   This driver will activate ISA boards according to the parameter set
-   in the driver. After all specified ISA boards are activated, PCI
-   boards in the system will be driven automatically.
-   Therefore the board number is sorted by the CAP address of ISA boards.
-   For PCI boards, their sequence will be after ISA boards and C168H/PCI
-   has higher priority than C104H/PCI boards.
-
-3.4 Module driver configuration
-===============================
-
-   Module driver is easiest way to install. If you prefer static driver
-   installation, please skip this paragraph.
-
-
-   ------------- Prepare to use the MOXA driver --------------------
-
-3.4.1 Create tty device with correct major number
--------------------------------------------------
-
-   Before using MOXA driver, your system must have the tty devices
-   which are created with driver's major number. We offer one shell
-   script "msmknod" to simplify the procedure.
-   This step is only needed to be executed once. But you still
-   need to do this procedure when:
-
-   a. You change the driver's major number. Please refer the "3.7"
-      section.
-   b. Your total installed MOXA boards number is changed. Maybe you
-      add/delete one MOXA board.
-   c. You want to change the tty name. This needs to modify the
-      shell script "msmknod"
-
-   The procedure is::
-
-	 # cd /moxa/mxser/driver
-	 # ./msmknod
-
-   This shell script will require the major number for dial-in
-   device and callout device to create tty device. You also need
-   to specify the total installed MOXA board number. Default major
-   numbers for dial-in device and callout device are 30, 35. If
-   you need to change to other number, please refer section "3.7"
-   for more detailed procedure.
-   Msmknod will delete any special files occupying the same device
-   naming.
-
-3.4.2 Build the MOXA driver and utilities
------------------------------------------
-
-   Before using the MOXA driver and utilities, you need to compile
-   all the source code. This step only needs to be executed once,
-   but you must re-compile the source code if you modify it.
-   For example, if you change the driver's major number (see
-   "3.7" section), then you need to do this step again.
-
-   Find "Makefile" in /moxa/mxser, then run
-
-	 # make clean; make install
-
-   .. note::
-
-	 For Red Hat 9, Red Hat Enterprise Linux AS3/ES3/WS3 & Fedora Core1:
-	 # make clean; make installsp1
-
-	 For Red Hat Enterprise Linux AS4/ES4/WS4:
-	 # make clean; make installsp2
-
-   The driver files "mxser.o" and utilities will be properly compiled
-   and copied to system directories respectively.
-
-------------- Load MOXA driver--------------------
-
-3.4.3 Load the MOXA driver
---------------------------
-
-   ::
-
-	 # modprobe mxser <argument>
-
-   will activate the module driver. You may run "lsmod" to check
-   if "mxser" is activated. If the MOXA board is ISA board, the
-   <argument> is needed. Please refer to section "3.4.5" for more
-   information.
-
-------------- Load MOXA driver on boot --------------------
-
-3.4.4 Load the mxser driver
----------------------------
-
-
-   For the above description, you may manually execute
-   "modprobe mxser" to activate this driver and run
-   "rmmod mxser" to remove it.
-
-   However, it's better to have a boot time configuration to
-   eliminate manual operation. Boot time configuration can be
-   achieved by rc file. We offer one "rc.mxser" file to simplify
-   the procedure under "moxa/mxser/driver".
-
-   But if you use ISA board, please modify the "modprobe ..." command
-   to add the argument (see "3.4.5" section). After modifying the
-   rc.mxser, please try to execute "/moxa/mxser/driver/rc.mxser"
-   manually to make sure the modification is ok. If any error
-   encountered, please try to modify again. If the modification is
-   completed, follow the below step.
-
-   Run following command for setting rc files::
-
-	 # cd /moxa/mxser/driver
-	 # cp ./rc.mxser /etc/rc.d
-	 # cd /etc/rc.d
-
-   Check whether "rc.serial" exists. If "rc.serial" doesn't exist,
-   create it by vi, run "chmod 755 rc.serial" to change the permission.
-
-   Add "/etc/rc.d/rc.mxser" in last line.
-
-   Reboot and check if moxa.o activated by "lsmod" command.
-
-3.4.5. specify CAP address
---------------------------
-
-   If you'd like to drive Smartio/Industio ISA boards in the system,
-   you'll have to add parameter to specify CAP address of given
-   board while activating "mxser.o". The format for parameters are
-   as follows.::
-
-	   modprobe mxser ioaddr=0x???,0x???,0x???,0x???
-				  |  |  |    |
-				  |  |  |    +- 4th ISA board
-				  |  |  +------ 3rd ISA board
-				  |  +------------ 2nd ISA board
-				  +-------------------1st ISA board
-
-3.5 Static driver configuration for Linux kernel 2.4.x and 2.6.x
-================================================================
-
-    Note:
-          To use static driver, you must install the linux kernel
-          source package.
-
-3.5.1 Backup the built-in driver in the kernel
-----------------------------------------------
-
-    ::
-
-       # cd /usr/src/linux/drivers/char
-       # mv mxser.c mxser.c.old
-
-       For Red Hat 7.x user, you need to create link:
-       # cd /usr/src
-       # ln -s linux-2.4 linux
-
-3.5.2 Create link
------------------
-    ::
-
-	  # cd /usr/src/linux/drivers/char
-	  # ln -s /moxa/mxser/driver/mxser.c mxser.c
-
-3.5.3 Add CAP address list for ISA boards.
-------------------------------------------
-
-    For PCI boards user, please skip this step.
-
-    In module mode, the CAP address for ISA board is given by
-    parameter. In static driver configuration, you'll have to
-    assign it within driver's source code. If you will not
-    install any ISA boards, you may skip to next portion.
-    The instructions to modify driver source code are as
-    below.
-
-    a. run::
-
-	# cd /moxa/mxser/driver
-	# vi mxser.c
-
-    b. Find the array mxserBoardCAP[] as below::
-
-	  static int mxserBoardCAP[] = {0x00, 0x00, 0x00, 0x00};
-
-    c. Change the address within this array using vi. For
-       example, to driver 2 ISA boards with CAP address
-       0x280 and 0x180 as 1st and 2nd board. Just to change
-       the source code as follows::
-
-	  static int mxserBoardCAP[] = {0x280, 0x180, 0x00, 0x00};
-
-3.5.4 Setup kernel configuration
---------------------------------
-
-    Configure the kernel::
-
-      # cd /usr/src/linux
-      # make menuconfig
-
-    You will go into a menu-driven system. Please select [Character
-    devices][Non-standard serial port support], enable the [Moxa
-    SmartIO support] driver with "[*]" for built-in (not "[M]"), then
-    select [Exit] to exit this program.
-
-3.5.5 Rebuild kernel
---------------------
-
-    The following are for Linux kernel rebuilding, for your
-    reference only.
-
-    For appropriate details, please refer to the Linux document:
-
-        a. Run the following commands::
-
-	     cd /usr/src/linux
-	     make clean		     # take a few minutes
-	     make dep		     # take a few minutes
-	     make bzImage	     # take probably 10-20 minutes
-	     make install	     # copy boot image to correct position
-
-	b. Please make sure the boot kernel (vmlinuz) is in the
-	   correct position.
-	c. If you use the 'lilo' utility, check that the 'image' item in
-	   /etc/lilo.conf specifies the path of the 'vmlinuz' file,
-	   or you will load the wrong (or old) boot kernel image (vmlinuz).
-	   After checking /etc/lilo.conf, please run "lilo".
-
-	  Note that if the result of "make bzImage" is ERROR, then you have to
-	  go back to Linux configuration Setup. Type "make menuconfig" in
-          directory /usr/src/linux.
-
-
-3.5.6 Make tty device and special file
---------------------------------------
-
-    ::
-       # cd /moxa/mxser/driver
-       # ./msmknod
-
-3.5.7 Make utility
-------------------
-
-    ::
-
-	  # cd /moxa/mxser/utility
-	  # make clean; make install
-
-3.5.8 Reboot
-------------
-
-
-
-3.6 Custom configuration
-========================
-
-    Although this driver already provides you default configuration, you
-    still can change the device name and major number. The instruction to
-    change these parameters are shown as below.
-
-a. Change Device name
-
-    If you'd like to use other device names instead of default naming
-    convention, all you have to do is to modify the internal code
-    within the shell script "msmknod". First, you have to open "msmknod"
-    by vi. Locate each line contains "ttyM" and "cum" and change them
-    to the device name you desired. "msmknod" creates the device names
-    you need next time executed.
-
-b. Change Major number
-
-    If major number 30 and 35 had been occupied, you may have to select
-    2 free major numbers for this driver. There are 3 steps to change
-    major numbers.
-
-3.6.1 Find free major numbers
------------------------------
-
-    In /proc/devices, you may find all the major numbers occupied
-    in the system. Please select 2 major numbers that are available.
-    e.g. 40, 45.
-
-3.6.2 Create special files
---------------------------
-
-   Run /moxa/mxser/driver/msmknod to create special files with
-   specified major numbers.
-
-3.6.3 Modify driver with new major number
------------------------------------------
-
-   Run vi to open /moxa/mxser/driver/mxser.c. Locate the line that
-   contains "MXSERMAJOR". Change the content as below::
-
-	  #define	  MXSERMAJOR		  40
-	  #define	  MXSERCUMAJOR		  45
-
-    3.6.4 Run "make clean; make install" in /moxa/mxser/driver.
-
-3.7 Verify driver installation
-==============================
-
-    You may refer to /var/log/messages to check the latest status
-    log reported by this driver whenever it's activated.
-
-4. Utilities
-^^^^^^^^^^^^
-
-   There are 3 utilities contained in this driver. They are msdiag, msmon and
-   msterm. These 3 utilities are released in form of source code. They should
-   be compiled into executable file and copied into /usr/bin.
-
-   Before using these utilities, please load driver (refer 3.4 & 3.5) and
-   make sure you had run the "msmknod" utility.
-
-msdiag - Diagnostic
-===================
-
-   This utility displays which Moxa Smartio/Industio boards were found
-   by the driver in the system.
-
-msmon - Port Monitoring
-=======================
-
-   This utility gives the user a quick view about all the MOXA ports'
-   activities. One can easily learn each port's total received/transmitted
-   (Rx/Tx) character count since the time when the monitoring is started.
-
-   Rx/Tx throughputs per second are also reported in interval basis (e.g.
-   the last 5 seconds) and in average basis (since the time the monitoring
-   is started). You can reset all ports' counts with the <HOME> key. Use the
-   <+> <-> (plus/minus) keys to change the displayed time interval. Press
-   <ENTER> on the port under the cursor to view that port's communication
-   parameters, signal status, and input/output queue.
-
-msterm - Terminal Emulation
-===========================
-
-   This utility provides data sending and receiving ability of all tty ports,
-   especially for MOXA ports. It is quite useful for testing simple
-   application, for example, sending AT command to a modem connected to the
-   port or used as a terminal for login purpose. Note that this is only a
-   dumb terminal emulation without handling full screen operation.
-
-5. Setserial
-^^^^^^^^^^^^
-
-   Supported Setserial parameters are listed as below.
-
-   ============== =========================================================
-   uart		  set UART type(16450-->disable FIFO, 16550A-->enable FIFO)
-   close_delay	  set the amount of time(in 1/100 of a second) that DTR
-		  should be kept low while being closed.
-   closing_wait   set the amount of time(in 1/100 of a second) that the
-		  serial port should wait for data to be drained while
-		  being closed, before the receiver is disabled.
-   spd_hi	  Use  57.6kb  when  the application requests 38.4kb.
-   spd_vhi	  Use  115.2kb	when  the application requests 38.4kb.
-   spd_shi	  Use  230.4kb	when  the application requests 38.4kb.
-   spd_warp	  Use  460.8kb	when  the application requests 38.4kb.
-   spd_normal	  Use  38.4kb  when  the application requests 38.4kb.
-   spd_cust	  Use  the custom divisor to set the speed when  the
-		  application requests 38.4kb.
-   divisor	  This option sets the custom divisor.
-   baud_base	  This option sets the base baud rate.
-   ============== =========================================================
-
-6. Troubleshooting
-^^^^^^^^^^^^^^^^^^
-
-   The boot time error messages and solutions are stated as clearly as
-   possible. If all the possible solutions fail, please contact our technical
-   support team to get more help.
-
-
-   Error msg:
-	      More than 4 Moxa Smartio/Industio family boards found. Fifth board
-              and after are ignored.
-
-   Solution:
-   To avoid this problem, please unplug the fifth and any later boards,
-   because the Moxa driver supports up to 4 boards.
-
-   Error msg:
-	      Request_irq fail, IRQ(?) may be conflict with another device.
-
-   Solution:
-   Other PCI or ISA devices occupy the assigned IRQ. If you are not sure
-   which device causes the situation, please check /proc/interrupts to find
-   a free IRQ and simply assign that free IRQ to the Moxa board.
-
-   Error msg:
-	      Board #: C1xx Series(CAP=xxx) interrupt number invalid.
-
-   Solution:
-   Each port within the same multiport board shares the same IRQ. Please set
-   one IRQ (IRQ doesn't equal to zero) for one Moxa board.
-
-   Error msg:
-	      No interrupt vector be set for Moxa ISA board(CAP=xxx).
-
-   Solution:
-   Moxa ISA board needs an interrupt vector. Please refer to user's manual
-   "Hardware Installation" chapter to set interrupt vector.
-
-   Error msg:
-              Couldn't install MOXA Smartio/Industio family driver!
-
-   Solution:
-   Loading the Moxa driver failed; the major number may conflict with other
-   devices. Please refer to section 3.6 above to select a free major number
-   for the Moxa driver.
-
-   Error msg:
-              Couldn't install MOXA Smartio/Industio family callout driver!
-
-   Solution:
-   Loading the Moxa callout driver failed; the callout device major number
-   may conflict with other devices. Please refer to section 3.6 above to
-   select a free callout device major number for the Moxa driver.
diff --git a/Documentation/serial/n_gsm.rst b/Documentation/serial/n_gsm.rst
deleted file mode 100644
index f3ad9fd26408..000000000000
--- a/Documentation/serial/n_gsm.rst
+++ /dev/null
@@ -1,103 +0,0 @@
-==============================
-GSM 0710 tty multiplexor HOWTO
-==============================
-
-This line discipline implements the GSM 07.10 multiplexing protocol
-detailed in the following 3GPP document:
-
-	http://www.3gpp.org/ftp/Specs/archive/07_series/07.10/0710-720.zip
-
-This document gives some hints on how to use this driver with GPRS and 3G
-modems connected to a physical serial port.
-
-How to use it
--------------
-1. initialize the modem in 0710 mux mode (usually AT+CMUX= command) through
-   its serial port. Depending on the modem used, you can pass more or less
-   parameters to this command,
-2. switch the serial line to using the n_gsm line discipline by using
-   TIOCSETD ioctl,
-3. configure the mux using GSMIOC_GETCONF / GSMIOC_SETCONF ioctl,
-
-Major parts of the initialization program :
-(a good starting point is util-linux-ng/sys-utils/ldattach.c)::
-
-  #include <linux/gsmmux.h>
-  #define N_GSM0710	21	/* GSM 0710 Mux */
-  #define DEFAULT_SPEED	B115200
-  #define SERIAL_PORT	/dev/ttyS0
-
-	int ldisc = N_GSM0710;
-	struct gsm_config c;
-	struct termios configuration;
-
-	/* open the serial port connected to the modem */
-	fd = open(SERIAL_PORT, O_RDWR | O_NOCTTY | O_NDELAY);
-
-	/* configure the serial port : speed, flow control ... */
-
-	/* send the AT commands to switch the modem to CMUX mode
-	   and check that it's successful (should return OK) */
-	write(fd, "AT+CMUX=0\r", 10);
-
-	/* experience showed that some modems need some time before
-	   being able to answer to the first MUX packet so a delay
-	   may be needed here in some case */
-	sleep(3);
-
-	/* use n_gsm line discipline */
-	ioctl(fd, TIOCSETD, &ldisc);
-
-	/* get n_gsm configuration */
-	ioctl(fd, GSMIOC_GETCONF, &c);
-	/* we are initiator and need encoding 0 (basic) */
-	c.initiator = 1;
-	c.encapsulation = 0;
-	/* our modem defaults to a maximum size of 127 bytes */
-	c.mru = 127;
-	c.mtu = 127;
-	/* set the new configuration */
-	ioctl(fd, GSMIOC_SETCONF, &c);
-
-	/* and wait for ever to keep the line discipline enabled */
-	daemon(0,0);
-	pause();
-
-4. create the devices corresponding to the "virtual" serial ports (take care,
-   each modem has its configuration and some DLC have dedicated functions,
-   for example GPS), starting with minor 1 (DLC0 is reserved for the management
-   of the mux)::
-
-     MAJOR=`cat /proc/devices |grep gsmtty | awk '{print $1}'`
-     for i in `seq 1 4`; do
-	mknod /dev/ttygsm$i c $MAJOR $i
-     done
-
-5. use these devices as plain serial ports.
-
-   for example, it's possible:
-
-   - to use gnokii to send / receive SMS on ttygsm1
-   - and to use ppp to establish a datalink on ttygsm2
-
-6. first close all virtual ports before closing the physical port.
-
-   Note that after closing the physical port the modem is still in multiplexing
-   mode. This may prevent a successful re-opening of the port later. To avoid
-   this situation either reset the modem if your hardware allows that or send
-   a disconnect command frame manually before initializing the multiplexing mode
-   for the second time. The byte sequence for the disconnect command frame is::
-
-      0xf9, 0x03, 0xef, 0x03, 0xc3, 0x16, 0xf9.
-
-Additional Documentation
-------------------------
-More practical details on the protocol and how it's supported by industrial
-modems can be found in the following documents :
-
-- http://www.telit.com/module/infopool/download.php?id=616
-- http://www.u-blox.com/images/downloads/Product_Docs/LEON-G100-G200-MuxImplementation_ApplicationNote_%28GSM%20G1-CS-10002%29.pdf
-- http://www.sierrawireless.com/Support/Downloads/AirPrime/WMP_Series/~/media/Support_Downloads/AirPrime/Application_notes/CMUX_Feature_Application_Note-Rev004.ashx
-- http://wm.sim.com/sim/News/photo/2010721161442.pdf
-
-11-03-08 - Eric Bénard - <eric@eukrea.com>
diff --git a/Documentation/serial/rocket.rst b/Documentation/serial/rocket.rst
deleted file mode 100644
index 23761eae4282..000000000000
--- a/Documentation/serial/rocket.rst
+++ /dev/null
@@ -1,185 +0,0 @@
-================================================
-Comtrol(tm) RocketPort(R)/RocketModem(TM) Series
-================================================
-
-Device Driver for the Linux Operating System
-============================================
-
-Product overview
-----------------
-
-This driver provides a loadable kernel driver for the Comtrol RocketPort
-and RocketModem PCI boards. These boards provide, 2, 4, 8, 16, or 32
-high-speed serial ports or modems.  This driver supports up to a combination
-of four RocketPort or RocketModems boards in one machine simultaneously.
-This file assumes that you are using the RocketPort driver which is
-integrated into the kernel sources.
-
-The driver can also be installed as an external module using the usual
-"make;make install" routine.  This external module driver, obtainable
-from the Comtrol website listed below, is useful for updating the driver
-or installing it into kernels which do not have the driver configured
-into them.  Installations instructions for the external module
-are in the included README and HW_INSTALL files.
-
-RocketPort ISA and RocketModem II PCI boards currently are only supported by
-this driver in module form.
-
-The RocketPort ISA board requires I/O ports to be configured by the DIP
-switches on the board.  See the section "ISA Rocketport Boards" below for
-information on how to set the DIP switches.
-
-You pass the I/O port to the driver using the following module parameters:
-
-board1:
-	I/O port for the first ISA board
-board2:
-	I/O port for the second ISA board
-board3:
-	I/O port for the third ISA board
-board4:
-	I/O port for the fourth ISA board
-
-There is a set of utilities and scripts provided with the external driver
-(downloadable from http://www.comtrol.com) that ease the configuration and
-setup of the ISA cards.
-
-The RocketModem II PCI boards require firmware to be loaded into the card
-before it will function.  The driver has only been tested as a module for this
-board.
-
-Installation Procedures
------------------------
-
-RocketPort/RocketModem PCI cards require no driver configuration, they are
-automatically detected and configured.
-
-The RocketPort driver can be installed as a module (recommended) or built
-into the kernel. This is selected, as for other drivers, through the `make config`
-command from the root of the Linux source tree during the kernel build process.
-
-The RocketPort/RocketModem serial ports installed by this driver are assigned
-device major number 46, and will be named /dev/ttyRx, where x is the port number
-starting at zero (ex. /dev/ttyR0, /dev/ttyR1, ...).  If you have multiple cards
-installed in the system, the mapping of port names to serial ports is displayed
-in the system log at /var/log/messages.
-
-If installed as a module, the module must be loaded.  This can be done
-manually by entering "modprobe rocket".  To have the module loaded automatically
-upon system boot, edit a `/etc/modprobe.d/*.conf` file and add the line
-"alias char-major-46 rocket".
-
-In order to use the ports, their device names (nodes) must be created with mknod.
-This is only required once, the system will retain the names once created.  To
-create the RocketPort/RocketModem device names, use the command
-"mknod /dev/ttyRx c 46 x" where x is the port number starting at zero.
-
-For example::
-
-	> mknod /dev/ttyR0 c 46 0
-	> mknod /dev/ttyR1 c 46 1
-	> mknod /dev/ttyR2 c 46 2
-
-The Linux script MAKEDEV will create the first 16 ttyRx device names (nodes)
-for you::
-
-	>/dev/MAKEDEV ttyR
-
-ISA Rocketport Boards
----------------------
-
-You must assign and configure the I/O addresses used by the ISA Rocketport
-card before installing and using it.  This is done by setting a set of DIP
-switches on the Rocketport board.
-
-
-Setting the I/O address
------------------------
-
-Before installing RocketPort(R) or RocketPort RA boards, you must find
-a range of I/O addresses for it to use. The first RocketPort card
-requires a 68-byte contiguous block of I/O addresses, starting at one
-of the following: 0x100h, 0x140h, 0x180h, 0x200h, 0x240h, 0x280h,
-0x300h, 0x340h, 0x380h.  This I/O address must be reflected in the DIP
-switches of *all* of the Rocketport cards.
-
-The second, third, and fourth RocketPort cards require a 64-byte
-contiguous block of I/O addresses, starting at one of the following
-I/O addresses: 0x100h, 0x140h, 0x180h, 0x1C0h, 0x200h, 0x240h, 0x280h,
-0x2C0h, 0x300h, 0x340h, 0x380h, 0x3C0h.  The I/O address used by the
-second, third, and fourth Rocketport cards (if present) are set via
-software control.  The DIP switch settings for the I/O address must be
-set to the value of the first Rocketport cards.
-
-In order to distinguish each of the card from the others, each card
-must have a unique board ID set on the dip switches.  The first
-Rocketport board must be set with the DIP switches corresponding to
-the first board, the second board must be set with the DIP switches
-corresponding to the second board, etc.  IMPORTANT: The board ID is
-the only place where the DIP switch settings should differ between the
-various Rocketport boards in a system.
-
-The I/O address range used by any of the RocketPort cards must not
-conflict with any other cards in the system, including other
-RocketPort cards.  Below, you will find a list of commonly used I/O
-address ranges which may be in use by other devices in your system.
-On a Linux system, "cat /proc/ioports" will also be helpful in
-identifying what I/O addresses are being used by devices on your
-system.
-
-Remember, the FIRST RocketPort uses 68 I/O addresses.  So, if you set it
-for 0x100, it will occupy 0x100 to 0x143.  This would mean that you
-CAN NOT set the second, third or fourth board for address 0x140 since
-the first 4 bytes of that range are used by the first board.  You would
-need to set the second, third, or fourth board to one of the next available
-blocks such as 0x180.
-
-RocketPort and RocketPort RA SW1 Settings::
-
-            +-------------------------------+
-            | 8 | 7 | 6 | 5 | 4 | 3 | 2 | 1 |
-            +-------+-------+---------------+
-            | Unused| Card  | I/O Port Block|
-            +-------------------------------+
-
-  DIP Switches                             DIP Switches
-  7    8                                   6    5
-  ===================                      ===================
-  On   On   UNUSED, MUST BE ON.            On   On   First Card    <==== Default
-                                           On   Off  Second Card
-                                           Off  On   Third Card
-                                           Off  Off  Fourth Card
-
-  DIP Switches         I/O Address Range
-  4    3    2    1     Used by the First Card
-  =====================================
-  On   Off  On   Off   100-143
-  On   Off  Off  On    140-183
-  On   Off  Off  Off   180-1C3       <==== Default
-  Off  On   On   Off   200-243
-  Off  On   Off  On    240-283
-  Off  On   Off  Off   280-2C3
-  Off  Off  On   Off   300-343
-  Off  Off  Off  On    340-383
-  Off  Off  Off  Off   380-3C3
-
-Reporting Bugs
---------------
-
-For technical support, please provide the following
-information: Driver version, kernel release, distribution of
-kernel, and type of board you are using. Error messages, log
-printouts, and port configuration details are especially helpful.
-
-USA:
-    :Phone: (612) 494-4100
-    :FAX: (612) 494-4199
-    :email: support@comtrol.com
-
-Comtrol Europe:
-    :Phone: +44 (0) 1 869 323-220
-    :FAX: +44 (0) 1 869 323-211
-    :email: support@comtrol.co.uk
-
-Web:	http://www.comtrol.com
-FTP:	ftp.comtrol.com
diff --git a/Documentation/serial/serial-iso7816.rst b/Documentation/serial/serial-iso7816.rst
deleted file mode 100644
index d990143de0c6..000000000000
--- a/Documentation/serial/serial-iso7816.rst
+++ /dev/null
@@ -1,90 +0,0 @@
-=============================
-ISO7816 Serial Communications
-=============================
-
-1. Introduction
-===============
-
-  ISO/IEC7816 is a series of standards specifying integrated circuit cards (ICC)
-  also known as smart cards.
-
-2. Hardware-related considerations
-==================================
-
-  Some CPUs/UARTs (e.g., Microchip AT91) contain a built-in mode capable of
-  handling communication with a smart card.
-
-  For these microcontrollers, the Linux driver should be made capable of
-  working in both modes, and proper ioctls (see later) should be made
-  available at user-level to allow switching from one mode to the other, and
-  vice versa.
-
-3. Data Structures Already Available in the Kernel
-==================================================
-
-  The Linux kernel provides the serial_iso7816 structure (see [1]) to handle
-  ISO7816 communications. This data structure is used to set and configure
-  ISO7816 parameters in ioctls.
-
-  Any driver for devices capable of working both as RS232 and ISO7816 should
-  implement the iso7816_config callback in the uart_port structure. The
-  serial_core calls iso7816_config to do the device specific part in response
-  to TIOCGISO7816 and TIOCSISO7816 ioctls (see below). The iso7816_config
-  callback receives a pointer to struct serial_iso7816.
-
-4. Usage from user-level
-========================
-
-  From user-level, ISO7816 configuration can be get/set using the previous
-  ioctls. For instance, to set ISO7816 you can use the following code::
-
-	#include <linux/serial.h>
-
-	/* Include definition for ISO7816 ioctls: TIOCSISO7816 and TIOCGISO7816 */
-	#include <sys/ioctl.h>
-
-	/* Open your specific device (e.g., /dev/mydevice): */
-	int fd = open ("/dev/mydevice", O_RDWR);
-	if (fd < 0) {
-		/* Error handling. See errno. */
-	}
-
-	struct serial_iso7816 iso7816conf;
-
-	/* Reserved fields have to be zeroed */
-	memset(&iso7816conf, 0, sizeof(iso7816conf));
-
-	/* Enable ISO7816 mode: */
-	iso7816conf.flags |= SER_ISO7816_ENABLED;
-
-	/* Select the protocol: */
-	/* T=0 */
-	iso7816conf.flags |= SER_ISO7816_T(0);
-	/* or T=1 */
-	iso7816conf.flags |= SER_ISO7816_T(1);
-
-	/* Set the guard time: */
-	iso7816conf.tg = 2;
-
-	/* Set the clock frequency*/
-	iso7816conf.clk = 3571200;
-
-	/* Set transmission factors: */
-	iso7816conf.sc_fi = 372;
-	iso7816conf.sc_di = 1;
-
-	if (ioctl(fd, TIOCSISO7816, &iso7816conf) < 0) {
-		/* Error handling. See errno. */
-	}
-
-	/* Use read() and write() syscalls here... */
-
-	/* Close the device when finished: */
-	if (close (fd) < 0) {
-		/* Error handling. See errno. */
-	}
-
-5. References
-=============
-
- [1]    include/uapi/linux/serial.h
diff --git a/Documentation/serial/serial-rs485.rst b/Documentation/serial/serial-rs485.rst
deleted file mode 100644
index 6bc824f948f9..000000000000
--- a/Documentation/serial/serial-rs485.rst
+++ /dev/null
@@ -1,103 +0,0 @@
-===========================
-RS485 Serial Communications
-===========================
-
-1. Introduction
-===============
-
-   EIA-485, also known as TIA/EIA-485 or RS-485, is a standard defining the
-   electrical characteristics of drivers and receivers for use in balanced
-   digital multipoint systems.
-   This standard is widely used for communications in industrial automation
-   because it can be used effectively over long distances and in electrically
-   noisy environments.
-
-2. Hardware-related Considerations
-==================================
-
-   Some CPUs/UARTs (e.g., Atmel AT91 or 16C950 UART) contain a built-in
-   half-duplex mode capable of automatically controlling line direction by
-   toggling RTS or DTR signals. That can be used to control external
-   half-duplex hardware like an RS485 transceiver or any RS232-connected
-   half-duplex devices like some modems.
-
-   For these microcontrollers, the Linux driver should be made capable of
-   working in both modes, and proper ioctls (see later) should be made
-   available at user-level to allow switching from one mode to the other, and
-   vice versa.
-
-3. Data Structures Already Available in the Kernel
-==================================================
-
-   The Linux kernel provides the serial_rs485 structure (see [1]) to handle
-   RS485 communications. This data structure is used to set and configure RS485
-   parameters in the platform data and in ioctls.
-
-   The device tree can also provide RS485 boot time parameters (see [2]
-   for bindings). The driver is in charge of filling this data structure from
-   the values given by the device tree.
-
-   Any driver for devices capable of working both as RS232 and RS485 should
-   implement the rs485_config callback in the uart_port structure. The
-   serial_core calls rs485_config to do the device specific part in response
-   to TIOCSRS485 and TIOCGRS485 ioctls (see below). The rs485_config callback
-   receives a pointer to struct serial_rs485.
-
-4. Usage from user-level
-========================
-
-   From user-level, RS485 configuration can be get/set using the previous
-   ioctls. For instance, to set RS485 you can use the following code::
-
-	#include <linux/serial.h>
-
-	/* Include definition for RS485 ioctls: TIOCGRS485 and TIOCSRS485 */
-	#include <sys/ioctl.h>
-
-	/* Open your specific device (e.g., /dev/mydevice): */
-	int fd = open ("/dev/mydevice", O_RDWR);
-	if (fd < 0) {
-		/* Error handling. See errno. */
-	}
-
-	struct serial_rs485 rs485conf;
-
-	/* Enable RS485 mode: */
-	rs485conf.flags |= SER_RS485_ENABLED;
-
-	/* Set logical level for RTS pin equal to 1 when sending: */
-	rs485conf.flags |= SER_RS485_RTS_ON_SEND;
-	/* or, set logical level for RTS pin equal to 0 when sending: */
-	rs485conf.flags &= ~(SER_RS485_RTS_ON_SEND);
-
-	/* Set logical level for RTS pin equal to 1 after sending: */
-	rs485conf.flags |= SER_RS485_RTS_AFTER_SEND;
-	/* or, set logical level for RTS pin equal to 0 after sending: */
-	rs485conf.flags &= ~(SER_RS485_RTS_AFTER_SEND);
-
-	/* Set rts delay before send, if needed: */
-	rs485conf.delay_rts_before_send = ...;
-
-	/* Set rts delay after send, if needed: */
-	rs485conf.delay_rts_after_send = ...;
-
-	/* Set this flag if you want to receive data even while sending data */
-	rs485conf.flags |= SER_RS485_RX_DURING_TX;
-
-	if (ioctl (fd, TIOCSRS485, &rs485conf) < 0) {
-		/* Error handling. See errno. */
-	}
-
-	/* Use read() and write() syscalls here... */
-
-	/* Close the device when finished: */
-	if (close (fd) < 0) {
-		/* Error handling. See errno. */
-	}
-
-5. References
-=============
-
- [1]	include/uapi/linux/serial.h
-
- [2]	Documentation/devicetree/bindings/serial/rs485.txt
diff --git a/Documentation/serial/tty.rst b/Documentation/serial/tty.rst
deleted file mode 100644
index dd972caacf3e..000000000000
--- a/Documentation/serial/tty.rst
+++ /dev/null
@@ -1,328 +0,0 @@
-=================
-The Lockronomicon
-=================
-
-Your guide to the ancient and twisted locking policies of the tty layer and
-the warped logic behind them. Beware all ye who read on.
-
-
-Line Discipline
----------------
-
-Line disciplines are registered with tty_register_ldisc() passing the
-discipline number and the ldisc structure. At the point of registration the
-discipline must be ready to use and it is possible it will get used before
-the call returns success. If the call returns an error then it won't get
-called. Do not re-use ldisc numbers as they are part of the userspace ABI
-and writing over an existing ldisc will cause demons to eat your computer.
-After the return the ldisc data has been copied so you may free your own
-copy of the structure. You must not re-register over the top of the line
-discipline even with the same data or your computer again will be eaten by
-demons.
-
-In order to remove a line discipline call tty_unregister_ldisc().
-In ancient times this always worked. In modern times the function will
-return -EBUSY if the ldisc is currently in use. Since the ldisc referencing
-code manages the module counts this should not usually be a concern.
-
-Heed this warning: the reference count field of the registered copies of the
-tty_ldisc structure in the ldisc table counts the number of lines using this
-discipline. The reference count of the tty_ldisc structure within a tty
-counts the number of active users of the ldisc at this instant. In effect it
-counts the number of threads of execution within an ldisc method (plus those
-about to enter and exit although this detail matters not).
-
-Line Discipline Methods
------------------------
-
-TTY side interfaces
-^^^^^^^^^^^^^^^^^^^
-
-======================= =======================================================
-open()			Called when the line discipline is attached to
-			the terminal. No other call into the line
-			discipline for this tty will occur until it
-			completes successfully. Should initialize any
-			state needed by the ldisc, and set receive_room
-			in the tty_struct to the maximum amount of data
-			the line discipline is willing to accept from the
-			driver with a single call to receive_buf().
-			Returning an error will prevent the ldisc from
-			being attached. Can sleep.
-
-close()			This is called on a terminal when the line
-			discipline is being unplugged. At the point of
-			execution no further users will enter the
-			ldisc code for this tty. Can sleep.
-
-hangup()		Called when the tty line is hung up.
-			The line discipline should cease I/O to the tty.
-			No further calls into the ldisc code will occur.
-			The return value is ignored. Can sleep.
-
-read()			(optional) A process requests reading data from
-			the line. Multiple read calls may occur in parallel
-			and the ldisc must deal with serialization issues.
-			If not defined, the process will receive an EIO
-			error. May sleep.
-
-write()			(optional) A process requests writing data to the
-			line. Multiple write calls are serialized by the
-			tty layer for the ldisc. If not defined, the
-			process will receive an EIO error. May sleep.
-
-flush_buffer()		(optional) May be called at any point between
-			open and close, and instructs the line discipline
-			to empty its input buffer.
-
-set_termios()		(optional) Called on termios structure changes.
-			The caller passes the old termios data and the
-			current data is in the tty. Called under the
-			termios semaphore so allowed to sleep. Serialized
-			against itself only.
-
-poll()			(optional) Check the status for the poll/select
-			calls. Multiple poll calls may occur in parallel.
-			May sleep.
-
-ioctl()			(optional) Called when an ioctl is handed to the
-			tty layer that might be for the ldisc. Multiple
-			ioctl calls may occur in parallel. May sleep.
-
-compat_ioctl()		(optional) Called when a 32 bit ioctl is handed
-			to the tty layer that might be for the ldisc.
-			Multiple ioctl calls may occur in parallel.
-			May sleep.
-======================= =======================================================
-
-Driver Side Interfaces
-^^^^^^^^^^^^^^^^^^^^^^
-
-======================= =======================================================
-receive_buf()		(optional) Called by the low-level driver to hand
-			a buffer of received bytes to the ldisc for
-			processing. The number of bytes is guaranteed not
-			to exceed the current value of tty->receive_room.
-			All bytes must be processed.
-
-receive_buf2()		(optional) Called by the low-level driver to hand
-			a buffer of received bytes to the ldisc for
-			processing. Returns the number of bytes processed.
-
-			If both receive_buf() and receive_buf2() are
-			defined, receive_buf2() should be preferred.
-
-write_wakeup()		May be called at any point between open and close.
-			The TTY_DO_WRITE_WAKEUP flag indicates if a call
-			is needed but always races versus calls. Thus the
-			ldisc must be careful about setting order and to
-			handle unexpected calls. Must not sleep.
-
-			The driver is forbidden from calling this directly
-			from the ->write call from the ldisc as the ldisc
-			is permitted to call the driver write method from
-			this function. In such a situation defer it.
-
-dcd_change()		Report to the tty line the current DCD pin status
-			changes and the relative timestamp. The timestamp
-			cannot be NULL.
-======================= =======================================================
-
-
-Driver Access
-^^^^^^^^^^^^^
-
-Line discipline methods can call the following methods of the underlying
-hardware driver through the function pointers within the tty->driver
-structure:
-
-======================= =======================================================
-write()			Write a block of characters to the tty device.
-			Returns the number of characters accepted. The
-			character buffer passed to this method is already
-			in kernel space.
-
-put_char()		Queues a character for writing to the tty device.
-			If there is no room in the queue, the character is
-			ignored.
-
-flush_chars()		(Optional) If defined, must be called after
-			queueing characters with put_char() in order to
-			start transmission.
-
-write_room()		Returns the numbers of characters the tty driver
-			will accept for queueing to be written.
-
-ioctl()			Invoke device specific ioctl.
-			Expects data pointers to refer to userspace.
-			Returns ENOIOCTLCMD for unrecognized ioctl numbers.
-
-set_termios()		Notify the tty driver that the device's termios
-			settings have changed. New settings are in
-			tty->termios. Previous settings should be passed in
-			the "old" argument.
-
-			The API is defined such that the driver should return
-			the actual modes selected. This means that the
-			driver function is responsible for modifying any
-			bits in the request it cannot fulfill to indicate
-			the actual modes being used. A device with no
-			hardware capability for change (e.g. a USB dongle or
-			virtual port) can provide NULL for this method.
-
-throttle()		Notify the tty driver that input buffers for the
-			line discipline are close to full, and it should
-			somehow signal that no more characters should be
-			sent to the tty.
-
-unthrottle()		Notify the tty driver that characters can now be
-			sent to the tty without fear of overrunning the
-			input buffers of the line disciplines.
-
-stop()			Ask the tty driver to stop outputting characters
-			to the tty device.
-
-start()			Ask the tty driver to resume sending characters
-			to the tty device.
-
-hangup()		Ask the tty driver to hang up the tty device.
-
-break_ctl()		(Optional) Ask the tty driver to turn on or off
-			BREAK status on the RS-232 port.  If state is -1,
-			then the BREAK status should be turned on; if
-			state is 0, then BREAK should be turned off.
-			If this routine is not implemented, use ioctls
-			TIOCSBRK / TIOCCBRK instead.
-
-wait_until_sent()	Waits until the device has written out all of the
-			characters in its transmitter FIFO.
-
-send_xchar()		Send a high-priority XON/XOFF character to the device.
-======================= =======================================================
-
-
-Flags
-^^^^^
-
-Line discipline methods have access to tty->flags field containing the
-following interesting flags:
-
-======================= =======================================================
-TTY_THROTTLED		Driver input is throttled. The ldisc should call
-			tty->driver->unthrottle() in order to resume
-			reception when it is ready to process more data.
-
-TTY_DO_WRITE_WAKEUP	If set, causes the driver to call the ldisc's
-			write_wakeup() method in order to resume
-			transmission when it can accept more data
-			to transmit.
-
-TTY_IO_ERROR		If set, causes all subsequent userspace read/write
-			calls on the tty to fail, returning -EIO.
-
-TTY_OTHER_CLOSED	Device is a pty and the other side has closed.
-
-TTY_NO_WRITE_SPLIT	Prevent driver from splitting up writes into
-			smaller chunks.
-======================= =======================================================
-
-
-Locking
-^^^^^^^
-
-Callers to the line discipline functions from the tty layer are required to
-take line discipline locks. The same is true of calls from the driver side
-but not yet enforced.
-
-Three calls are now provided::
-
-	ldisc = tty_ldisc_ref(tty);
-
-takes a handle to the line discipline in the tty and returns it. If no ldisc
-is currently attached or the ldisc is being closed and re-opened at this
-point then NULL is returned. While this handle is held the ldisc will not
-change or go away::
-
-	tty_ldisc_deref(ldisc)
-
-Returns the ldisc reference and allows the ldisc to be closed. Returning the
-reference takes away your right to call the ldisc functions until you take
-a new reference::
-
-	ldisc = tty_ldisc_ref_wait(tty);
-
-Performs the same function as tty_ldisc_ref except that it will wait for an
-ldisc change to complete and then return a reference to the new ldisc.
-
-While these functions are slightly slower than the old code they should have
-minimal impact as most receive logic uses the flip buffers and they only
-need to take a reference when they push bits up through the driver.
-
-A caution: The ldisc->open(), ldisc->close() and driver->set_ldisc
-functions are called with the ldisc unavailable. Thus tty_ldisc_ref will
-fail in this situation if used within these functions. Ldisc and driver
-code calling its own functions must be careful in this case.
-
-
-Driver Interface
-----------------
-
-======================= =======================================================
-open()			Called when a device is opened. May sleep
-
-close()			Called when a device is closed. At the point of
-			return from this call the driver must make no
-			further ldisc calls of any kind. May sleep
-
-write()			Called to write bytes to the device. May not
-			sleep. May occur in parallel in special cases.
-			Because this includes panic paths drivers generally
-			shouldn't try and do clever locking here.
-
-put_char()		Stuff a single character onto the queue. The
-			driver is guaranteed following up calls to
-			flush_chars.
-
-flush_chars()		Ask the kernel to write put_char queue
-
-write_room()		Return the number of characters that can be stuffed
-			into the port buffers without overflow (or less).
-			The ldisc is responsible for being intelligent
-			about multi-threading of write_room/write calls
-
-ioctl()			Called when an ioctl may be for the driver
-
-set_termios()		Called on termios change, serialized against
-			itself by a semaphore. May sleep.
-
-set_ldisc()		Notifier for discipline change. At the point this
-			is done the discipline is not yet usable. Can now
-			sleep (I think)
-
-throttle()		Called by the ldisc to ask the driver to do flow
-			control.  Serialization including with unthrottle
-			is the job of the ldisc layer.
-
-unthrottle()		Called by the ldisc to ask the driver to stop flow
-			control.
-
-stop()			Ldisc notifier to the driver to stop output. As with
-			throttle the serializations with start() are down
-			to the ldisc layer.
-
-start()			Ldisc notifier to the driver to start output.
-
-hangup()		Ask the tty driver to cause a hangup initiated
-			from the host side. [Can sleep ??]
-
-break_ctl()		Send RS232 break. Can sleep. Can get called in
-			parallel, driver must serialize (for now), and
-			with write calls.
-
-wait_until_sent()	Wait for characters to exit the hardware queue
-			of the driver. Can sleep
-
-send_xchar()	  	Send XON/XOFF and if possible jump the queue with
-			it in order to get fast flow control responses.
-			Cannot sleep ??
-======================= =======================================================
diff --git a/MAINTAINERS b/MAINTAINERS
index d1a0a817dd92..4f88bca37c55 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -10767,7 +10767,7 @@ F:	include/uapi/linux/meye.h
 MOXA SMARTIO/INDUSTIO/INTELLIO SERIAL CARD
 M:	Jiri Slaby <jirislaby@gmail.com>
 S:	Maintained
-F:	Documentation/serial/moxa-smartio.rst
+F:	Documentation/driver-api/serial/moxa-smartio.rst
 F:	drivers/tty/mxser.*
 
 MR800 AVERMEDIA USB FM RADIO DRIVER
@@ -13689,7 +13689,7 @@ ROCKETPORT DRIVER
 P:	Comtrol Corp.
 W:	http://www.comtrol.com
 S:	Maintained
-F:	Documentation/serial/rocket.rst
+F:	Documentation/driver-api/serial/rocket.rst
 F:	drivers/tty/rocket*
 
 ROCKETPORT EXPRESS/INFINITY DRIVER
@@ -16228,7 +16228,7 @@ M:	Greg Kroah-Hartman <gregkh@linuxfoundation.org>
 M:	Jiri Slaby <jslaby@suse.com>
 S:	Supported
 T:	git git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/tty.git
-F:	Documentation/serial/
+F:	Documentation/driver-api/serial/
 F:	drivers/tty/
 F:	drivers/tty/serial/serial_core.c
 F:	include/linux/serial_core.h
diff --git a/drivers/tty/Kconfig b/drivers/tty/Kconfig
index ee51b9514225..c7623f99ac0f 100644
--- a/drivers/tty/Kconfig
+++ b/drivers/tty/Kconfig
@@ -175,7 +175,7 @@ config ROCKETPORT
 	  This driver supports Comtrol RocketPort and RocketModem PCI boards.   
           These boards provide 2, 4, 8, 16, or 32 high-speed serial ports or
           modems.  For information about the RocketPort/RocketModem  boards
-          and this driver read <file:Documentation/serial/rocket.rst>.
+          and this driver read <file:Documentation/driver-api/serial/rocket.rst>.
 
 	  To compile this driver as a module, choose M here: the
 	  module will be called rocket.
@@ -193,7 +193,7 @@ config CYCLADES
 	  your Linux box, for instance in order to become a dial-in server.
 
 	  For information about the Cyclades-Z card, read
-	  <file:Documentation/serial/cyclades_z.rst>.
+	  <file:Documentation/driver-api/serial/cyclades_z.rst>.
 
 	  To compile this driver as a module, choose M here: the
 	  module will be called cyclades.
diff --git a/drivers/tty/serial/ucc_uart.c b/drivers/tty/serial/ucc_uart.c
index 6e3c66ab0e62..a0555ae2b1ef 100644
--- a/drivers/tty/serial/ucc_uart.c
+++ b/drivers/tty/serial/ucc_uart.c
@@ -1081,7 +1081,7 @@ static int qe_uart_verify_port(struct uart_port *port,
 }
 /* UART operations
  *
- * Details on these functions can be found in Documentation/serial/driver.rst
+ * Details on these functions can be found in Documentation/driver-api/serial/driver.rst
  */
 static const struct uart_ops qe_uart_pops = {
 	.tx_empty       = qe_uart_tx_empty,
diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h
index 05b179015d6c..2b78cc734719 100644
--- a/include/linux/serial_core.h
+++ b/include/linux/serial_core.h
@@ -32,7 +32,7 @@ struct device;
 
 /*
  * This structure describes all the operations that can be done on the
- * physical hardware.  See Documentation/serial/driver.rst for details.
+ * physical hardware.  See Documentation/driver-api/serial/driver.rst for details.
  */
 struct uart_ops {
 	unsigned int	(*tx_empty)(struct uart_port *);
-- 
cgit v1.2.3


From 7e4b4dfc98d54bc79f7ca29c8bc6307ed2948014 Mon Sep 17 00:00:00 2001
From: Dave Airlie <airlied@redhat.com>
Date: Tue, 16 Jul 2019 04:06:29 +1000
Subject: Revert "mm: adjust apply_to_pfn_range interface for dropped token."

This reverts commit 6dfc43d3a19174faead54575c204aee106225f43.

Going to revert the whole vmwgfx pull.

Signed-off-by: Dave Airlie <airlied@redhat.com>
---
 include/linux/mm.h    | 2 +-
 mm/as_dirty_helpers.c | 6 ++++--
 mm/memory.c           | 6 +++---
 3 files changed, 8 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index c45f936bd81c..798cdda9560e 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2687,7 +2687,7 @@ extern int apply_to_page_range(struct mm_struct *mm, unsigned long address,
 			       unsigned long size, pte_fn_t fn, void *data);
 
 struct pfn_range_apply;
-typedef int (*pter_fn_t)(pte_t *pte, unsigned long addr,
+typedef int (*pter_fn_t)(pte_t *pte, pgtable_t token, unsigned long addr,
 			 struct pfn_range_apply *closure);
 struct pfn_range_apply {
 	struct mm_struct *mm;
diff --git a/mm/as_dirty_helpers.c b/mm/as_dirty_helpers.c
index 6352a3729408..f600e31534fb 100644
--- a/mm/as_dirty_helpers.c
+++ b/mm/as_dirty_helpers.c
@@ -26,6 +26,7 @@ struct apply_as {
 /**
  * apply_pt_wrprotect - Leaf pte callback to write-protect a pte
  * @pte: Pointer to the pte
+ * @token: Page table token, see apply_to_pfn_range()
  * @addr: The virtual page address
  * @closure: Pointer to a struct pfn_range_apply embedded in a
  * struct apply_as
@@ -35,7 +36,7 @@ struct apply_as {
  *
  * Return: Always zero.
  */
-static int apply_pt_wrprotect(pte_t *pte,
+static int apply_pt_wrprotect(pte_t *pte, pgtable_t token,
 			      unsigned long addr,
 			      struct pfn_range_apply *closure)
 {
@@ -77,6 +78,7 @@ struct apply_as_clean {
 /**
  * apply_pt_clean - Leaf pte callback to clean a pte
  * @pte: Pointer to the pte
+ * @token: Page table token, see apply_to_pfn_range()
  * @addr: The virtual page address
  * @closure: Pointer to a struct pfn_range_apply embedded in a
  * struct apply_as_clean
@@ -89,7 +91,7 @@ struct apply_as_clean {
  *
  * Return: Always zero.
  */
-static int apply_pt_clean(pte_t *pte,
+static int apply_pt_clean(pte_t *pte, pgtable_t token,
 			  unsigned long addr,
 			  struct pfn_range_apply *closure)
 {
diff --git a/mm/memory.c b/mm/memory.c
index b8218e962231..462aa47f8878 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2053,7 +2053,7 @@ static int apply_to_pte_range(struct pfn_range_apply *closure, pmd_t *pmd,
 	token = pmd_pgtable(*pmd);
 
 	do {
-		err = closure->ptefn(pte++, addr, closure);
+		err = closure->ptefn(pte++, token, addr, closure);
 		if (err)
 			break;
 	} while (addr += PAGE_SIZE, addr != end);
@@ -2194,14 +2194,14 @@ struct page_range_apply {
  * Callback wrapper to enable use of apply_to_pfn_range for
  * the apply_to_page_range interface
  */
-static int apply_to_page_range_wrapper(pte_t *pte,
+static int apply_to_page_range_wrapper(pte_t *pte, pgtable_t token,
 				       unsigned long addr,
 				       struct pfn_range_apply *pter)
 {
 	struct page_range_apply *pra =
 		container_of(pter, typeof(*pra), pter);
 
-	return pra->fn(pte, NULL, addr, pra->data);
+	return pra->fn(pte, token, addr, pra->data);
 }
 
 /*
-- 
cgit v1.2.3


From 3729fe2bc2a01f4cc1aa88be8f64af06084c87d6 Mon Sep 17 00:00:00 2001
From: Dave Airlie <airlied@redhat.com>
Date: Tue, 16 Jul 2019 04:07:13 +1000
Subject: Revert "Merge branch 'vmwgfx-next' of
 git://people.freedesktop.org/~thomash/linux into drm-next"

This reverts commit 031e610a6a21448a63dff7a0416e5e206724caac, reversing
changes made to 52d2d44eee8091e740d0d275df1311fb8373c9a9.

The mm changes in there were premature and not fully acked or reviewed by core mm folks.
I dropped the ball by merging them via this tree, so let's take em all back out.

Signed-off-by: Dave Airlie <airlied@redhat.com>
---
 MAINTAINERS                                        |   1 -
 drivers/gpu/drm/ttm/ttm_bo.c                       |   1 -
 drivers/gpu/drm/ttm/ttm_bo_vm.c                    | 169 +++-----
 drivers/gpu/drm/vmwgfx/Kconfig                     |   1 -
 drivers/gpu/drm/vmwgfx/Makefile                    |   2 +-
 .../drm/vmwgfx/device_include/svga3d_surfacedefs.h | 233 +---------
 drivers/gpu/drm/vmwgfx/ttm_lock.c                  | 100 +++++
 drivers/gpu/drm/vmwgfx/ttm_lock.h                  |  30 ++
 drivers/gpu/drm/vmwgfx/vmwgfx_bo.c                 |  12 +-
 drivers/gpu/drm/vmwgfx/vmwgfx_context.c            |   4 -
 drivers/gpu/drm/vmwgfx/vmwgfx_cotable.c            |  13 +-
 drivers/gpu/drm/vmwgfx/vmwgfx_drv.c                | 167 +++++++-
 drivers/gpu/drm/vmwgfx/vmwgfx_drv.h                | 139 ++----
 drivers/gpu/drm/vmwgfx/vmwgfx_execbuf.c            |   1 +
 drivers/gpu/drm/vmwgfx/vmwgfx_kms.c                |  23 +-
 drivers/gpu/drm/vmwgfx/vmwgfx_page_dirty.c         | 472 ---------------------
 drivers/gpu/drm/vmwgfx/vmwgfx_resource.c           | 245 ++---------
 drivers/gpu/drm/vmwgfx/vmwgfx_resource_priv.h      |  15 -
 drivers/gpu/drm/vmwgfx/vmwgfx_shader.c             |   8 +-
 drivers/gpu/drm/vmwgfx/vmwgfx_surface.c            | 405 +-----------------
 drivers/gpu/drm/vmwgfx/vmwgfx_validation.c         |  74 +---
 drivers/gpu/drm/vmwgfx/vmwgfx_validation.h         |  16 +-
 include/drm/ttm/ttm_bo_api.h                       |  10 -
 include/drm/ttm/ttm_bo_driver.h                    |   6 -
 include/linux/mm.h                                 |  19 +-
 include/uapi/drm/vmwgfx_drm.h                      |   4 +-
 mm/Kconfig                                         |   3 -
 mm/Makefile                                        |   1 -
 mm/as_dirty_helpers.c                              | 300 -------------
 mm/memory.c                                        | 145 ++-----
 30 files changed, 483 insertions(+), 2136 deletions(-)
 delete mode 100644 drivers/gpu/drm/vmwgfx/vmwgfx_page_dirty.c
 delete mode 100644 mm/as_dirty_helpers.c

(limited to 'include/linux')

diff --git a/MAINTAINERS b/MAINTAINERS
index d6600715a662..2abf6d28db64 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -5191,7 +5191,6 @@ T:	git git://people.freedesktop.org/~thomash/linux
 S:	Supported
 F:	drivers/gpu/drm/vmwgfx/
 F:	include/uapi/drm/vmwgfx_drm.h
-F:	mm/as_dirty_helpers.c
 
 DRM DRIVERS
 M:	David Airlie <airlied@linux.ie>
diff --git a/drivers/gpu/drm/ttm/ttm_bo.c b/drivers/gpu/drm/ttm/ttm_bo.c
index a7fd5a4955c9..58c403eda04e 100644
--- a/drivers/gpu/drm/ttm/ttm_bo.c
+++ b/drivers/gpu/drm/ttm/ttm_bo.c
@@ -1739,7 +1739,6 @@ int ttm_bo_device_init(struct ttm_bo_device *bdev,
 	mutex_lock(&ttm_global_mutex);
 	list_add_tail(&bdev->device_list, &glob->device_list);
 	mutex_unlock(&ttm_global_mutex);
-	bdev->vm_ops = &ttm_bo_vm_ops;
 
 	return 0;
 out_no_sys:
diff --git a/drivers/gpu/drm/ttm/ttm_bo_vm.c b/drivers/gpu/drm/ttm/ttm_bo_vm.c
index 0c4576cbafcf..6dacff49c1cc 100644
--- a/drivers/gpu/drm/ttm/ttm_bo_vm.c
+++ b/drivers/gpu/drm/ttm/ttm_bo_vm.c
@@ -42,6 +42,8 @@
 #include <linux/uaccess.h>
 #include <linux/mem_encrypt.h>
 
+#define TTM_BO_VM_NUM_PREFAULT 16
+
 static vm_fault_t ttm_bo_vm_fault_idle(struct ttm_buffer_object *bo,
 				struct vm_fault *vmf)
 {
@@ -104,30 +106,25 @@ static unsigned long ttm_bo_io_mem_pfn(struct ttm_buffer_object *bo,
 		+ page_offset;
 }
 
-/**
- * ttm_bo_vm_reserve - Reserve a buffer object in a retryable vm callback
- * @bo: The buffer object
- * @vmf: The fault structure handed to the callback
- *
- * vm callbacks like fault() and *_mkwrite() allow for the mm_sem to be dropped
- * during long waits, and after the wait the callback will be restarted. This
- * is to allow other threads using the same virtual memory space concurrent
- * access to map(), unmap() completely unrelated buffer objects. TTM buffer
- * object reservations sometimes wait for GPU and should therefore be
- * considered long waits. This function reserves the buffer object interruptibly
- * taking this into account. Starvation is avoided by the vm system not
- * allowing too many repeated restarts.
- * This function is intended to be used in customized fault() and _mkwrite()
- * handlers.
- *
- * Return:
- *    0 on success and the bo was reserved.
- *    VM_FAULT_RETRY if blocking wait.
- *    VM_FAULT_NOPAGE if blocking wait and retrying was not allowed.
- */
-vm_fault_t ttm_bo_vm_reserve(struct ttm_buffer_object *bo,
-			     struct vm_fault *vmf)
+static vm_fault_t ttm_bo_vm_fault(struct vm_fault *vmf)
 {
+	struct vm_area_struct *vma = vmf->vma;
+	struct ttm_buffer_object *bo = (struct ttm_buffer_object *)
+	    vma->vm_private_data;
+	struct ttm_bo_device *bdev = bo->bdev;
+	unsigned long page_offset;
+	unsigned long page_last;
+	unsigned long pfn;
+	struct ttm_tt *ttm = NULL;
+	struct page *page;
+	int err;
+	int i;
+	vm_fault_t ret = VM_FAULT_NOPAGE;
+	unsigned long address = vmf->address;
+	struct ttm_mem_type_manager *man =
+		&bdev->man[bo->mem.mem_type];
+	struct vm_area_struct cvma;
+
 	/*
 	 * Work around locking order reversal in fault / nopfn
 	 * between mmap_sem and bo_reserve: Perform a trylock operation
@@ -154,55 +151,14 @@ vm_fault_t ttm_bo_vm_reserve(struct ttm_buffer_object *bo,
 		return VM_FAULT_NOPAGE;
 	}
 
-	return 0;
-}
-EXPORT_SYMBOL(ttm_bo_vm_reserve);
-
-/**
- * ttm_bo_vm_fault_reserved - TTM fault helper
- * @vmf: The struct vm_fault given as argument to the fault callback
- * @prot: The page protection to be used for this memory area.
- * @num_prefault: Maximum number of prefault pages. The caller may want to
- * specify this based on madvice settings and the size of the GPU object
- * backed by the memory.
- *
- * This function inserts one or more page table entries pointing to the
- * memory backing the buffer object, and then returns a return code
- * instructing the caller to retry the page access.
- *
- * Return:
- *   VM_FAULT_NOPAGE on success or pending signal
- *   VM_FAULT_SIGBUS on unspecified error
- *   VM_FAULT_OOM on out-of-memory
- *   VM_FAULT_RETRY if retryable wait
- */
-vm_fault_t ttm_bo_vm_fault_reserved(struct vm_fault *vmf,
-				    pgprot_t prot,
-				    pgoff_t num_prefault)
-{
-	struct vm_area_struct *vma = vmf->vma;
-	struct vm_area_struct cvma = *vma;
-	struct ttm_buffer_object *bo = (struct ttm_buffer_object *)
-	    vma->vm_private_data;
-	struct ttm_bo_device *bdev = bo->bdev;
-	unsigned long page_offset;
-	unsigned long page_last;
-	unsigned long pfn;
-	struct ttm_tt *ttm = NULL;
-	struct page *page;
-	int err;
-	pgoff_t i;
-	vm_fault_t ret = VM_FAULT_NOPAGE;
-	unsigned long address = vmf->address;
-	struct ttm_mem_type_manager *man =
-		&bdev->man[bo->mem.mem_type];
-
 	/*
 	 * Refuse to fault imported pages. This should be handled
 	 * (if at all) by redirecting mmap to the exporter.
 	 */
-	if (bo->ttm && (bo->ttm->page_flags & TTM_PAGE_FLAG_SG))
-		return VM_FAULT_SIGBUS;
+	if (bo->ttm && (bo->ttm->page_flags & TTM_PAGE_FLAG_SG)) {
+		ret = VM_FAULT_SIGBUS;
+		goto out_unlock;
+	}
 
 	if (bdev->driver->fault_reserve_notify) {
 		struct dma_fence *moving = dma_fence_get(bo->moving);
@@ -213,9 +169,11 @@ vm_fault_t ttm_bo_vm_fault_reserved(struct vm_fault *vmf,
 			break;
 		case -EBUSY:
 		case -ERESTARTSYS:
-			return VM_FAULT_NOPAGE;
+			ret = VM_FAULT_NOPAGE;
+			goto out_unlock;
 		default:
-			return VM_FAULT_SIGBUS;
+			ret = VM_FAULT_SIGBUS;
+			goto out_unlock;
 		}
 
 		if (bo->moving != moving) {
@@ -231,12 +189,21 @@ vm_fault_t ttm_bo_vm_fault_reserved(struct vm_fault *vmf,
 	 * move.
 	 */
 	ret = ttm_bo_vm_fault_idle(bo, vmf);
-	if (unlikely(ret != 0))
-		return ret;
+	if (unlikely(ret != 0)) {
+		if (ret == VM_FAULT_RETRY &&
+		    !(vmf->flags & FAULT_FLAG_RETRY_NOWAIT)) {
+			/* The BO has already been unreserved. */
+			return ret;
+		}
+
+		goto out_unlock;
+	}
 
 	err = ttm_mem_io_lock(man, true);
-	if (unlikely(err != 0))
-		return VM_FAULT_NOPAGE;
+	if (unlikely(err != 0)) {
+		ret = VM_FAULT_NOPAGE;
+		goto out_unlock;
+	}
 	err = ttm_mem_io_reserve_vm(bo);
 	if (unlikely(err != 0)) {
 		ret = VM_FAULT_SIGBUS;
@@ -253,8 +220,18 @@ vm_fault_t ttm_bo_vm_fault_reserved(struct vm_fault *vmf,
 		goto out_io_unlock;
 	}
 
-	cvma.vm_page_prot = ttm_io_prot(bo->mem.placement, prot);
-	if (!bo->mem.bus.is_iomem) {
+	/*
+	 * Make a local vma copy to modify the page_prot member
+	 * and vm_flags if necessary. The vma parameter is protected
+	 * by mmap_sem in write mode.
+	 */
+	cvma = *vma;
+	cvma.vm_page_prot = vm_get_page_prot(cvma.vm_flags);
+
+	if (bo->mem.bus.is_iomem) {
+		cvma.vm_page_prot = ttm_io_prot(bo->mem.placement,
+						cvma.vm_page_prot);
+	} else {
 		struct ttm_operation_ctx ctx = {
 			.interruptible = false,
 			.no_wait_gpu = false,
@@ -263,21 +240,24 @@ vm_fault_t ttm_bo_vm_fault_reserved(struct vm_fault *vmf,
 		};
 
 		ttm = bo->ttm;
-		if (ttm_tt_populate(bo->ttm, &ctx)) {
+		cvma.vm_page_prot = ttm_io_prot(bo->mem.placement,
+						cvma.vm_page_prot);
+
+		/* Allocate all page at once, most common usage */
+		if (ttm_tt_populate(ttm, &ctx)) {
 			ret = VM_FAULT_OOM;
 			goto out_io_unlock;
 		}
-	} else {
-		/* Iomem should not be marked encrypted */
-		cvma.vm_page_prot = pgprot_decrypted(cvma.vm_page_prot);
 	}
 
 	/*
 	 * Speculatively prefault a number of pages. Only error on
 	 * first page.
 	 */
-	for (i = 0; i < num_prefault; ++i) {
+	for (i = 0; i < TTM_BO_VM_NUM_PREFAULT; ++i) {
 		if (bo->mem.bus.is_iomem) {
+			/* Iomem should not be marked encrypted */
+			cvma.vm_page_prot = pgprot_decrypted(cvma.vm_page_prot);
 			pfn = ttm_bo_io_mem_pfn(bo, page_offset);
 		} else {
 			page = ttm->pages[page_offset];
@@ -315,26 +295,7 @@ vm_fault_t ttm_bo_vm_fault_reserved(struct vm_fault *vmf,
 	ret = VM_FAULT_NOPAGE;
 out_io_unlock:
 	ttm_mem_io_unlock(man);
-	return ret;
-}
-EXPORT_SYMBOL(ttm_bo_vm_fault_reserved);
-
-static vm_fault_t ttm_bo_vm_fault(struct vm_fault *vmf)
-{
-	struct vm_area_struct *vma = vmf->vma;
-	pgprot_t prot;
-	struct ttm_buffer_object *bo = vma->vm_private_data;
-	vm_fault_t ret;
-
-	ret = ttm_bo_vm_reserve(bo, vmf);
-	if (ret)
-		return ret;
-
-	prot = vm_get_page_prot(vma->vm_flags);
-	ret = ttm_bo_vm_fault_reserved(vmf, prot, TTM_BO_VM_NUM_PREFAULT);
-	if (ret == VM_FAULT_RETRY && !(vmf->flags & FAULT_FLAG_RETRY_NOWAIT))
-		return ret;
-
+out_unlock:
 	reservation_object_unlock(bo->resv);
 	return ret;
 }
@@ -434,7 +395,7 @@ static int ttm_bo_vm_access(struct vm_area_struct *vma, unsigned long addr,
 	return ret;
 }
 
-const struct vm_operations_struct ttm_bo_vm_ops = {
+static const struct vm_operations_struct ttm_bo_vm_ops = {
 	.fault = ttm_bo_vm_fault,
 	.open = ttm_bo_vm_open,
 	.close = ttm_bo_vm_close,
@@ -487,7 +448,7 @@ int ttm_bo_mmap(struct file *filp, struct vm_area_struct *vma,
 	if (unlikely(ret != 0))
 		goto out_unref;
 
-	vma->vm_ops = bdev->vm_ops;
+	vma->vm_ops = &ttm_bo_vm_ops;
 
 	/*
 	 * Note: We're transferring the bo reference to
@@ -519,7 +480,7 @@ int ttm_fbdev_mmap(struct vm_area_struct *vma, struct ttm_buffer_object *bo)
 
 	ttm_bo_get(bo);
 
-	vma->vm_ops = bo->bdev->vm_ops;
+	vma->vm_ops = &ttm_bo_vm_ops;
 	vma->vm_private_data = bo;
 	vma->vm_flags |= VM_MIXEDMAP;
 	vma->vm_flags |= VM_IO | VM_DONTEXPAND;
diff --git a/drivers/gpu/drm/vmwgfx/Kconfig b/drivers/gpu/drm/vmwgfx/Kconfig
index d5fd81a521f6..6b28a326f8bb 100644
--- a/drivers/gpu/drm/vmwgfx/Kconfig
+++ b/drivers/gpu/drm/vmwgfx/Kconfig
@@ -8,7 +8,6 @@ config DRM_VMWGFX
 	select FB_CFB_IMAGEBLIT
 	select DRM_TTM
 	select FB
-	select AS_DIRTY_HELPERS
 	# Only needed for the transitional use of drm_crtc_init - can be removed
 	# again once vmwgfx sets up the primary plane itself.
 	select DRM_KMS_HELPER
diff --git a/drivers/gpu/drm/vmwgfx/Makefile b/drivers/gpu/drm/vmwgfx/Makefile
index c877a21a0739..8841bd30e1e5 100644
--- a/drivers/gpu/drm/vmwgfx/Makefile
+++ b/drivers/gpu/drm/vmwgfx/Makefile
@@ -8,7 +8,7 @@ vmwgfx-y := vmwgfx_execbuf.o vmwgfx_gmr.o vmwgfx_kms.o vmwgfx_drv.o \
 	    vmwgfx_cmdbuf_res.o vmwgfx_cmdbuf.o vmwgfx_stdu.o \
 	    vmwgfx_cotable.o vmwgfx_so.o vmwgfx_binding.o vmwgfx_msg.o \
 	    vmwgfx_simple_resource.o vmwgfx_va.o vmwgfx_blit.o \
-	    vmwgfx_validation.o vmwgfx_page_dirty.o \
+	    vmwgfx_validation.o \
 	    ttm_object.o ttm_lock.o
 
 obj-$(CONFIG_DRM_VMWGFX) := vmwgfx.o
diff --git a/drivers/gpu/drm/vmwgfx/device_include/svga3d_surfacedefs.h b/drivers/gpu/drm/vmwgfx/device_include/svga3d_surfacedefs.h
index 61414f105c67..f2bfd3d80598 100644
--- a/drivers/gpu/drm/vmwgfx/device_include/svga3d_surfacedefs.h
+++ b/drivers/gpu/drm/vmwgfx/device_include/svga3d_surfacedefs.h
@@ -1280,6 +1280,7 @@ svga3dsurface_get_pixel_offset(SVGA3dSurfaceFormat format,
 	return offset;
 }
 
+
 static inline u32
 svga3dsurface_get_image_offset(SVGA3dSurfaceFormat format,
 			       surf_size_struct baseLevelSize,
@@ -1374,236 +1375,4 @@ svga3dsurface_is_screen_target_format(SVGA3dSurfaceFormat format)
 	return svga3dsurface_is_dx_screen_target_format(format);
 }
 
-/**
- * struct svga3dsurface_mip - Mimpmap level information
- * @bytes: Bytes required in the backing store of this mipmap level.
- * @img_stride: Byte stride per image.
- * @row_stride: Byte stride per block row.
- * @size: The size of the mipmap.
- */
-struct svga3dsurface_mip {
-	size_t bytes;
-	size_t img_stride;
-	size_t row_stride;
-	struct drm_vmw_size size;
-
-};
-
-/**
- * struct svga3dsurface_cache - Cached surface information
- * @desc: Pointer to the surface descriptor
- * @mip: Array of mipmap level information. Valid size is @num_mip_levels.
- * @mip_chain_bytes: Bytes required in the backing store for the whole chain
- * of mip levels.
- * @sheet_bytes: Bytes required in the backing store for a sheet
- * representing a single sample.
- * @num_mip_levels: Valid size of the @mip array. Number of mipmap levels in
- * a chain.
- * @num_layers: Number of slices in an array texture or number of faces in
- * a cubemap texture.
- */
-struct svga3dsurface_cache {
-	const struct svga3d_surface_desc *desc;
-	struct svga3dsurface_mip mip[DRM_VMW_MAX_MIP_LEVELS];
-	size_t mip_chain_bytes;
-	size_t sheet_bytes;
-	u32 num_mip_levels;
-	u32 num_layers;
-};
-
-/**
- * struct svga3dsurface_loc - Surface location
- * @sub_resource: Surface subresource. Defined as layer * num_mip_levels +
- * mip_level.
- * @x: X coordinate.
- * @y: Y coordinate.
- * @z: Z coordinate.
- */
-struct svga3dsurface_loc {
-	u32 sub_resource;
-	u32 x, y, z;
-};
-
-/**
- * svga3dsurface_subres - Compute the subresource from layer and mipmap.
- * @cache: Surface layout data.
- * @mip_level: The mipmap level.
- * @layer: The surface layer (face or array slice).
- *
- * Return: The subresource.
- */
-static inline u32 svga3dsurface_subres(const struct svga3dsurface_cache *cache,
-				       u32 mip_level, u32 layer)
-{
-	return cache->num_mip_levels * layer + mip_level;
-}
-
-/**
- * svga3dsurface_setup_cache - Build a surface cache entry
- * @size: The surface base level dimensions.
- * @format: The surface format.
- * @num_mip_levels: Number of mipmap levels.
- * @num_layers: Number of layers.
- * @cache: Pointer to a struct svga3dsurface_cach object to be filled in.
- *
- * Return: Zero on success, -EINVAL on invalid surface layout.
- */
-static inline int svga3dsurface_setup_cache(const struct drm_vmw_size *size,
-					    SVGA3dSurfaceFormat format,
-					    u32 num_mip_levels,
-					    u32 num_layers,
-					    u32 num_samples,
-					    struct svga3dsurface_cache *cache)
-{
-	const struct svga3d_surface_desc *desc;
-	u32 i;
-
-	memset(cache, 0, sizeof(*cache));
-	cache->desc = desc = svga3dsurface_get_desc(format);
-	cache->num_mip_levels = num_mip_levels;
-	cache->num_layers = num_layers;
-	for (i = 0; i < cache->num_mip_levels; i++) {
-		struct svga3dsurface_mip *mip = &cache->mip[i];
-
-		mip->size = svga3dsurface_get_mip_size(*size, i);
-		mip->bytes = svga3dsurface_get_image_buffer_size
-			(desc, &mip->size, 0);
-		mip->row_stride =
-			__KERNEL_DIV_ROUND_UP(mip->size.width,
-					      desc->block_size.width) *
-			desc->bytes_per_block * num_samples;
-		if (!mip->row_stride)
-			goto invalid_dim;
-
-		mip->img_stride =
-			__KERNEL_DIV_ROUND_UP(mip->size.height,
-					      desc->block_size.height) *
-			mip->row_stride;
-		if (!mip->img_stride)
-			goto invalid_dim;
-
-		cache->mip_chain_bytes += mip->bytes;
-	}
-	cache->sheet_bytes = cache->mip_chain_bytes * num_layers;
-	if (!cache->sheet_bytes)
-		goto invalid_dim;
-
-	return 0;
-
-invalid_dim:
-	VMW_DEBUG_USER("Invalid surface layout for dirty tracking.\n");
-	return -EINVAL;
-}
-
-/**
- * svga3dsurface_get_loc - Get a surface location from an offset into the
- * backing store
- * @cache: Surface layout data.
- * @loc: Pointer to a struct svga3dsurface_loc to be filled in.
- * @offset: Offset into the surface backing store.
- */
-static inline void
-svga3dsurface_get_loc(const struct svga3dsurface_cache *cache,
-		      struct svga3dsurface_loc *loc,
-		      size_t offset)
-{
-	const struct svga3dsurface_mip *mip = &cache->mip[0];
-	const struct svga3d_surface_desc *desc = cache->desc;
-	u32 layer;
-	int i;
-
-	if (offset >= cache->sheet_bytes)
-		offset %= cache->sheet_bytes;
-
-	layer = offset / cache->mip_chain_bytes;
-	offset -= layer * cache->mip_chain_bytes;
-	for (i = 0; i < cache->num_mip_levels; ++i, ++mip) {
-		if (mip->bytes > offset)
-			break;
-		offset -= mip->bytes;
-	}
-
-	loc->sub_resource = svga3dsurface_subres(cache, i, layer);
-	loc->z = offset / mip->img_stride;
-	offset -= loc->z * mip->img_stride;
-	loc->z *= desc->block_size.depth;
-	loc->y = offset / mip->row_stride;
-	offset -= loc->y * mip->row_stride;
-	loc->y *= desc->block_size.height;
-	loc->x = offset / desc->bytes_per_block;
-	loc->x *= desc->block_size.width;
-}
-
-/**
- * svga3dsurface_inc_loc - Clamp increment a surface location with one block
- * size
- * in each dimension.
- * @loc: Pointer to a struct svga3dsurface_loc to be incremented.
- *
- * When computing the size of a range as size = end - start, the range does not
- * include the end element. However a location representing the last byte
- * of a touched region in the backing store *is* included in the range.
- * This function modifies such a location to match the end definition
- * given as start + size which is the one used in a SVGA3dBox.
- */
-static inline void
-svga3dsurface_inc_loc(const struct svga3dsurface_cache *cache,
-		      struct svga3dsurface_loc *loc)
-{
-	const struct svga3d_surface_desc *desc = cache->desc;
-	u32 mip = loc->sub_resource % cache->num_mip_levels;
-	const struct drm_vmw_size *size = &cache->mip[mip].size;
-
-	loc->sub_resource++;
-	loc->x += desc->block_size.width;
-	if (loc->x > size->width)
-		loc->x = size->width;
-	loc->y += desc->block_size.height;
-	if (loc->y > size->height)
-		loc->y = size->height;
-	loc->z += desc->block_size.depth;
-	if (loc->z > size->depth)
-		loc->z = size->depth;
-}
-
-/**
- * svga3dsurface_min_loc - The start location in a subresource
- * @cache: Surface layout data.
- * @sub_resource: The subresource.
- * @loc: Pointer to a struct svga3dsurface_loc to be filled in.
- */
-static inline void
-svga3dsurface_min_loc(const struct svga3dsurface_cache *cache,
-		      u32 sub_resource,
-		      struct svga3dsurface_loc *loc)
-{
-	loc->sub_resource = sub_resource;
-	loc->x = loc->y = loc->z = 0;
-}
-
-/**
- * svga3dsurface_min_loc - The end location in a subresource
- * @cache: Surface layout data.
- * @sub_resource: The subresource.
- * @loc: Pointer to a struct svga3dsurface_loc to be filled in.
- *
- * Following the end definition given in svga3dsurface_inc_loc(),
- * Compute the end location of a surface subresource.
- */
-static inline void
-svga3dsurface_max_loc(const struct svga3dsurface_cache *cache,
-		      u32 sub_resource,
-		      struct svga3dsurface_loc *loc)
-{
-	const struct drm_vmw_size *size;
-	u32 mip;
-
-	loc->sub_resource = sub_resource + 1;
-	mip = sub_resource % cache->num_mip_levels;
-	size = &cache->mip[mip].size;
-	loc->x = size->width;
-	loc->y = size->height;
-	loc->z = size->depth;
-}
-
 #endif /* _SVGA3D_SURFACEDEFS_H_ */
diff --git a/drivers/gpu/drm/vmwgfx/ttm_lock.c b/drivers/gpu/drm/vmwgfx/ttm_lock.c
index 5971c72e6d10..16b2083cb9d4 100644
--- a/drivers/gpu/drm/vmwgfx/ttm_lock.c
+++ b/drivers/gpu/drm/vmwgfx/ttm_lock.c
@@ -29,6 +29,7 @@
  * Authors: Thomas Hellstrom <thellstrom-at-vmware-dot-com>
  */
 
+#include <drm/ttm/ttm_module.h>
 #include <linux/atomic.h>
 #include <linux/errno.h>
 #include <linux/wait.h>
@@ -48,6 +49,8 @@ void ttm_lock_init(struct ttm_lock *lock)
 	init_waitqueue_head(&lock->queue);
 	lock->rw = 0;
 	lock->flags = 0;
+	lock->kill_takers = false;
+	lock->signal = SIGKILL;
 }
 
 void ttm_read_unlock(struct ttm_lock *lock)
@@ -63,6 +66,11 @@ static bool __ttm_read_lock(struct ttm_lock *lock)
 	bool locked = false;
 
 	spin_lock(&lock->lock);
+	if (unlikely(lock->kill_takers)) {
+		send_sig(lock->signal, current, 0);
+		spin_unlock(&lock->lock);
+		return false;
+	}
 	if (lock->rw >= 0 && lock->flags == 0) {
 		++lock->rw;
 		locked = true;
@@ -90,6 +98,11 @@ static bool __ttm_read_trylock(struct ttm_lock *lock, bool *locked)
 	*locked = false;
 
 	spin_lock(&lock->lock);
+	if (unlikely(lock->kill_takers)) {
+		send_sig(lock->signal, current, 0);
+		spin_unlock(&lock->lock);
+		return false;
+	}
 	if (lock->rw >= 0 && lock->flags == 0) {
 		++lock->rw;
 		block = false;
@@ -134,6 +147,11 @@ static bool __ttm_write_lock(struct ttm_lock *lock)
 	bool locked = false;
 
 	spin_lock(&lock->lock);
+	if (unlikely(lock->kill_takers)) {
+		send_sig(lock->signal, current, 0);
+		spin_unlock(&lock->lock);
+		return false;
+	}
 	if (lock->rw == 0 && ((lock->flags & ~TTM_WRITE_LOCK_PENDING) == 0)) {
 		lock->rw = -1;
 		lock->flags &= ~TTM_WRITE_LOCK_PENDING;
@@ -164,6 +182,88 @@ int ttm_write_lock(struct ttm_lock *lock, bool interruptible)
 	return ret;
 }
 
+static int __ttm_vt_unlock(struct ttm_lock *lock)
+{
+	int ret = 0;
+
+	spin_lock(&lock->lock);
+	if (unlikely(!(lock->flags & TTM_VT_LOCK)))
+		ret = -EINVAL;
+	lock->flags &= ~TTM_VT_LOCK;
+	wake_up_all(&lock->queue);
+	spin_unlock(&lock->lock);
+
+	return ret;
+}
+
+static void ttm_vt_lock_remove(struct ttm_base_object **p_base)
+{
+	struct ttm_base_object *base = *p_base;
+	struct ttm_lock *lock = container_of(base, struct ttm_lock, base);
+	int ret;
+
+	*p_base = NULL;
+	ret = __ttm_vt_unlock(lock);
+	BUG_ON(ret != 0);
+}
+
+static bool __ttm_vt_lock(struct ttm_lock *lock)
+{
+	bool locked = false;
+
+	spin_lock(&lock->lock);
+	if (lock->rw == 0) {
+		lock->flags &= ~TTM_VT_LOCK_PENDING;
+		lock->flags |= TTM_VT_LOCK;
+		locked = true;
+	} else {
+		lock->flags |= TTM_VT_LOCK_PENDING;
+	}
+	spin_unlock(&lock->lock);
+	return locked;
+}
+
+int ttm_vt_lock(struct ttm_lock *lock,
+		bool interruptible,
+		struct ttm_object_file *tfile)
+{
+	int ret = 0;
+
+	if (interruptible) {
+		ret = wait_event_interruptible(lock->queue,
+					       __ttm_vt_lock(lock));
+		if (unlikely(ret != 0)) {
+			spin_lock(&lock->lock);
+			lock->flags &= ~TTM_VT_LOCK_PENDING;
+			wake_up_all(&lock->queue);
+			spin_unlock(&lock->lock);
+			return ret;
+		}
+	} else
+		wait_event(lock->queue, __ttm_vt_lock(lock));
+
+	/*
+	 * Add a base-object, the destructor of which will
+	 * make sure the lock is released if the client dies
+	 * while holding it.
+	 */
+
+	ret = ttm_base_object_init(tfile, &lock->base, false,
+				   ttm_lock_type, &ttm_vt_lock_remove, NULL);
+	if (ret)
+		(void)__ttm_vt_unlock(lock);
+	else
+		lock->vt_holder = tfile;
+
+	return ret;
+}
+
+int ttm_vt_unlock(struct ttm_lock *lock)
+{
+	return ttm_ref_object_base_unref(lock->vt_holder,
+					 lock->base.handle, TTM_REF_USAGE);
+}
+
 void ttm_suspend_unlock(struct ttm_lock *lock)
 {
 	spin_lock(&lock->lock);
diff --git a/drivers/gpu/drm/vmwgfx/ttm_lock.h b/drivers/gpu/drm/vmwgfx/ttm_lock.h
index 3d454e8b491f..0c3af9836863 100644
--- a/drivers/gpu/drm/vmwgfx/ttm_lock.h
+++ b/drivers/gpu/drm/vmwgfx/ttm_lock.h
@@ -63,6 +63,8 @@
  * @lock: Spinlock protecting some lock members.
  * @rw: Read-write lock counter. Protected by @lock.
  * @flags: Lock state. Protected by @lock.
+ * @kill_takers: Boolean whether to kill takers of the lock.
+ * @signal: Signal to send when kill_takers is true.
  */
 
 struct ttm_lock {
@@ -71,6 +73,9 @@ struct ttm_lock {
 	spinlock_t lock;
 	int32_t rw;
 	uint32_t flags;
+	bool kill_takers;
+	int signal;
+	struct ttm_object_file *vt_holder;
 };
 
 
@@ -215,4 +220,29 @@ extern void ttm_write_unlock(struct ttm_lock *lock);
  */
 extern int ttm_write_lock(struct ttm_lock *lock, bool interruptible);
 
+/**
+ * ttm_lock_set_kill
+ *
+ * @lock: Pointer to a struct ttm_lock
+ * @val: Boolean whether to kill processes taking the lock.
+ * @signal: Signal to send to the process taking the lock.
+ *
+ * The kill-when-taking-lock functionality is used to kill processes that keep
+ * on using the TTM functionality when its resources have been taken down, for
+ * example when the X server exits. A typical sequence would look like this:
+ * - X server takes lock in write mode.
+ * - ttm_lock_set_kill() is called with @val set to true.
+ * - As part of X server exit, TTM resources are taken down.
+ * - X server releases the lock on file release.
+ * - Another dri client wants to render, takes the lock and is killed.
+ *
+ */
+static inline void ttm_lock_set_kill(struct ttm_lock *lock, bool val,
+				     int signal)
+{
+	lock->kill_takers = val;
+	if (val)
+		lock->signal = signal;
+}
+
 #endif
diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_bo.c b/drivers/gpu/drm/vmwgfx/vmwgfx_bo.c
index e8bc7a7ac031..5d5c2bce01f3 100644
--- a/drivers/gpu/drm/vmwgfx/vmwgfx_bo.c
+++ b/drivers/gpu/drm/vmwgfx/vmwgfx_bo.c
@@ -463,8 +463,6 @@ void vmw_bo_bo_free(struct ttm_buffer_object *bo)
 {
 	struct vmw_buffer_object *vmw_bo = vmw_buffer_object(bo);
 
-	WARN_ON(vmw_bo->dirty);
-	WARN_ON(!RB_EMPTY_ROOT(&vmw_bo->res_tree));
 	vmw_bo_unmap(vmw_bo);
 	kfree(vmw_bo);
 }
@@ -478,11 +476,8 @@ void vmw_bo_bo_free(struct ttm_buffer_object *bo)
 static void vmw_user_bo_destroy(struct ttm_buffer_object *bo)
 {
 	struct vmw_user_buffer_object *vmw_user_bo = vmw_user_buffer_object(bo);
-	struct vmw_buffer_object *vbo = &vmw_user_bo->vbo;
 
-	WARN_ON(vbo->dirty);
-	WARN_ON(!RB_EMPTY_ROOT(&vbo->res_tree));
-	vmw_bo_unmap(vbo);
+	vmw_bo_unmap(&vmw_user_bo->vbo);
 	ttm_prime_object_kfree(vmw_user_bo, prime);
 }
 
@@ -515,9 +510,8 @@ int vmw_bo_init(struct vmw_private *dev_priv,
 
 	acc_size = vmw_bo_acc_size(dev_priv, size, user);
 	memset(vmw_bo, 0, sizeof(*vmw_bo));
-	BUILD_BUG_ON(TTM_MAX_BO_PRIORITY <= 3);
-	vmw_bo->base.priority = 3;
-	vmw_bo->res_tree = RB_ROOT;
+
+	INIT_LIST_HEAD(&vmw_bo->res_list);
 
 	ret = ttm_bo_init(bdev, &vmw_bo->base, size,
 			  ttm_bo_type_device, placement,
diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_context.c b/drivers/gpu/drm/vmwgfx/vmwgfx_context.c
index a56c9d802382..63f111068a44 100644
--- a/drivers/gpu/drm/vmwgfx/vmwgfx_context.c
+++ b/drivers/gpu/drm/vmwgfx/vmwgfx_context.c
@@ -88,8 +88,6 @@ static const struct vmw_res_func vmw_gb_context_func = {
 	.res_type = vmw_res_context,
 	.needs_backup = true,
 	.may_evict = true,
-	.prio = 3,
-	.dirty_prio = 3,
 	.type_name = "guest backed contexts",
 	.backup_placement = &vmw_mob_placement,
 	.create = vmw_gb_context_create,
@@ -102,8 +100,6 @@ static const struct vmw_res_func vmw_dx_context_func = {
 	.res_type = vmw_res_dx_context,
 	.needs_backup = true,
 	.may_evict = true,
-	.prio = 3,
-	.dirty_prio = 3,
 	.type_name = "dx contexts",
 	.backup_placement = &vmw_mob_placement,
 	.create = vmw_dx_context_create,
diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_cotable.c b/drivers/gpu/drm/vmwgfx/vmwgfx_cotable.c
index 8c699cb2565b..b4f6e1217c9d 100644
--- a/drivers/gpu/drm/vmwgfx/vmwgfx_cotable.c
+++ b/drivers/gpu/drm/vmwgfx/vmwgfx_cotable.c
@@ -116,8 +116,6 @@ static const struct vmw_res_func vmw_cotable_func = {
 	.res_type = vmw_res_cotable,
 	.needs_backup = true,
 	.may_evict = true,
-	.prio = 3,
-	.dirty_prio = 3,
 	.type_name = "context guest backed object tables",
 	.backup_placement = &vmw_mob_placement,
 	.create = vmw_cotable_create,
@@ -309,7 +307,7 @@ static int vmw_cotable_unbind(struct vmw_resource *res,
 	struct ttm_buffer_object *bo = val_buf->bo;
 	struct vmw_fence_obj *fence;
 
-	if (!vmw_resource_mob_attached(res))
+	if (list_empty(&res->mob_head))
 		return 0;
 
 	WARN_ON_ONCE(bo->mem.mem_type != VMW_PL_MOB);
@@ -455,7 +453,6 @@ static int vmw_cotable_resize(struct vmw_resource *res, size_t new_size)
 		goto out_wait;
 	}
 
-	vmw_resource_mob_detach(res);
 	res->backup = buf;
 	res->backup_size = new_size;
 	vcotbl->size_read_back = cur_size_read_back;
@@ -470,12 +467,12 @@ static int vmw_cotable_resize(struct vmw_resource *res, size_t new_size)
 		res->backup = old_buf;
 		res->backup_size = old_size;
 		vcotbl->size_read_back = old_size_read_back;
-		vmw_resource_mob_attach(res);
 		goto out_wait;
 	}
 
-	vmw_resource_mob_attach(res);
 	/* Let go of the old mob. */
+	list_del(&res->mob_head);
+	list_add_tail(&res->mob_head, &buf->res_list);
 	vmw_bo_unreference(&old_buf);
 	res->id = vcotbl->type;
 
@@ -499,7 +496,7 @@ out_wait:
  * is called before bind() in the validation sequence is instead used for two
  * things.
  * 1) Unscrub the cotable if it is scrubbed and still attached to a backup
- *    buffer.
+ *    buffer, that is, if @res->mob_head is non-empty.
  * 2) Resize the cotable if needed.
  */
 static int vmw_cotable_create(struct vmw_resource *res)
@@ -515,7 +512,7 @@ static int vmw_cotable_create(struct vmw_resource *res)
 		new_size *= 2;
 
 	if (likely(new_size <= res->backup_size)) {
-		if (vcotbl->scrubbed && vmw_resource_mob_attached(res)) {
+		if (vcotbl->scrubbed && !list_empty(&res->mob_head)) {
 			ret = vmw_cotable_unscrub(res);
 			if (ret)
 				return ret;
diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c b/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c
index 8349a6cc126f..4ff11a0077e1 100644
--- a/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c
+++ b/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c
@@ -254,6 +254,7 @@ static int vmw_restrict_dma_mask;
 static int vmw_assume_16bpp;
 
 static int vmw_probe(struct pci_dev *, const struct pci_device_id *);
+static void vmw_master_init(struct vmw_master *);
 static int vmwgfx_pm_notifier(struct notifier_block *nb, unsigned long val,
 			      void *ptr);
 
@@ -761,6 +762,10 @@ static int vmw_driver_load(struct drm_device *dev, unsigned long chipset)
 	DRM_INFO("MMIO at 0x%08x size is %u kiB\n",
 		 dev_priv->mmio_start, dev_priv->mmio_size / 1024);
 
+	vmw_master_init(&dev_priv->fbdev_master);
+	ttm_lock_set_kill(&dev_priv->fbdev_master.lock, false, SIGTERM);
+	dev_priv->active_master = &dev_priv->fbdev_master;
+
 	dev_priv->mmio_virt = memremap(dev_priv->mmio_start,
 				       dev_priv->mmio_size, MEMREMAP_WB);
 
@@ -828,11 +833,6 @@ static int vmw_driver_load(struct drm_device *dev, unsigned long chipset)
 		DRM_ERROR("Failed initializing TTM buffer object driver.\n");
 		goto out_no_bdev;
 	}
-	dev_priv->vm_ops = *dev_priv->bdev.vm_ops;
-	dev_priv->vm_ops.fault = vmw_bo_vm_fault;
-	dev_priv->vm_ops.pfn_mkwrite = vmw_bo_vm_mkwrite;
-	dev_priv->vm_ops.page_mkwrite = vmw_bo_vm_mkwrite;
-	dev_priv->bdev.vm_ops = &dev_priv->vm_ops;
 
 	/*
 	 * Enable VRAM, but initially don't use it until SVGA is enabled and
@@ -1007,7 +1007,18 @@ static void vmw_driver_unload(struct drm_device *dev)
 static void vmw_postclose(struct drm_device *dev,
 			 struct drm_file *file_priv)
 {
-	struct vmw_fpriv *vmw_fp = vmw_fpriv(file_priv);
+	struct vmw_fpriv *vmw_fp;
+
+	vmw_fp = vmw_fpriv(file_priv);
+
+	if (vmw_fp->locked_master) {
+		struct vmw_master *vmaster =
+			vmw_master(vmw_fp->locked_master);
+
+		ttm_lock_set_kill(&vmaster->lock, true, SIGTERM);
+		ttm_vt_unlock(&vmaster->lock);
+		drm_master_put(&vmw_fp->locked_master);
+	}
 
 	ttm_object_file_release(&vmw_fp->tfile);
 	kfree(vmw_fp);
@@ -1036,6 +1047,55 @@ out_no_tfile:
 	return ret;
 }
 
+static struct vmw_master *vmw_master_check(struct drm_device *dev,
+					   struct drm_file *file_priv,
+					   unsigned int flags)
+{
+	int ret;
+	struct vmw_fpriv *vmw_fp = vmw_fpriv(file_priv);
+	struct vmw_master *vmaster;
+
+	if (!drm_is_primary_client(file_priv) || !(flags & DRM_AUTH))
+		return NULL;
+
+	ret = mutex_lock_interruptible(&dev->master_mutex);
+	if (unlikely(ret != 0))
+		return ERR_PTR(-ERESTARTSYS);
+
+	if (drm_is_current_master(file_priv)) {
+		mutex_unlock(&dev->master_mutex);
+		return NULL;
+	}
+
+	/*
+	 * Check if we were previously master, but now dropped. In that
+	 * case, allow at least render node functionality.
+	 */
+	if (vmw_fp->locked_master) {
+		mutex_unlock(&dev->master_mutex);
+
+		if (flags & DRM_RENDER_ALLOW)
+			return NULL;
+
+		DRM_ERROR("Dropped master trying to access ioctl that "
+			  "requires authentication.\n");
+		return ERR_PTR(-EACCES);
+	}
+	mutex_unlock(&dev->master_mutex);
+
+	/*
+	 * Take the TTM lock. Possibly sleep waiting for the authenticating
+	 * master to become master again, or for a SIGTERM if the
+	 * authenticating master exits.
+	 */
+	vmaster = vmw_master(file_priv->master);
+	ret = ttm_read_lock(&vmaster->lock, true);
+	if (unlikely(ret != 0))
+		vmaster = ERR_PTR(ret);
+
+	return vmaster;
+}
+
 static long vmw_generic_ioctl(struct file *filp, unsigned int cmd,
 			      unsigned long arg,
 			      long (*ioctl_func)(struct file *, unsigned int,
@@ -1044,6 +1104,7 @@ static long vmw_generic_ioctl(struct file *filp, unsigned int cmd,
 	struct drm_file *file_priv = filp->private_data;
 	struct drm_device *dev = file_priv->minor->dev;
 	unsigned int nr = DRM_IOCTL_NR(cmd);
+	struct vmw_master *vmaster;
 	unsigned int flags;
 	long ret;
 
@@ -1079,7 +1140,21 @@ static long vmw_generic_ioctl(struct file *filp, unsigned int cmd,
 	} else if (!drm_ioctl_flags(nr, &flags))
 		return -EINVAL;
 
-	return ioctl_func(filp, cmd, arg);
+	vmaster = vmw_master_check(dev, file_priv, flags);
+	if (IS_ERR(vmaster)) {
+		ret = PTR_ERR(vmaster);
+
+		if (ret != -ERESTARTSYS)
+			DRM_INFO("IOCTL ERROR Command %d, Error %ld.\n",
+				 nr, ret);
+		return ret;
+	}
+
+	ret = ioctl_func(filp, cmd, arg);
+	if (vmaster)
+		ttm_read_unlock(&vmaster->lock);
+
+	return ret;
 
 out_io_encoding:
 	DRM_ERROR("Invalid command format, ioctl %d\n",
@@ -1106,10 +1181,65 @@ static void vmw_lastclose(struct drm_device *dev)
 {
 }
 
+static void vmw_master_init(struct vmw_master *vmaster)
+{
+	ttm_lock_init(&vmaster->lock);
+}
+
+static int vmw_master_create(struct drm_device *dev,
+			     struct drm_master *master)
+{
+	struct vmw_master *vmaster;
+
+	vmaster = kzalloc(sizeof(*vmaster), GFP_KERNEL);
+	if (unlikely(!vmaster))
+		return -ENOMEM;
+
+	vmw_master_init(vmaster);
+	ttm_lock_set_kill(&vmaster->lock, true, SIGTERM);
+	master->driver_priv = vmaster;
+
+	return 0;
+}
+
+static void vmw_master_destroy(struct drm_device *dev,
+			       struct drm_master *master)
+{
+	struct vmw_master *vmaster = vmw_master(master);
+
+	master->driver_priv = NULL;
+	kfree(vmaster);
+}
+
 static int vmw_master_set(struct drm_device *dev,
 			  struct drm_file *file_priv,
 			  bool from_open)
 {
+	struct vmw_private *dev_priv = vmw_priv(dev);
+	struct vmw_fpriv *vmw_fp = vmw_fpriv(file_priv);
+	struct vmw_master *active = dev_priv->active_master;
+	struct vmw_master *vmaster = vmw_master(file_priv->master);
+	int ret = 0;
+
+	if (active) {
+		BUG_ON(active != &dev_priv->fbdev_master);
+		ret = ttm_vt_lock(&active->lock, false, vmw_fp->tfile);
+		if (unlikely(ret != 0))
+			return ret;
+
+		ttm_lock_set_kill(&active->lock, true, SIGTERM);
+		dev_priv->active_master = NULL;
+	}
+
+	ttm_lock_set_kill(&vmaster->lock, false, SIGTERM);
+	if (!from_open) {
+		ttm_vt_unlock(&vmaster->lock);
+		BUG_ON(vmw_fp->locked_master != file_priv->master);
+		drm_master_put(&vmw_fp->locked_master);
+	}
+
+	dev_priv->active_master = vmaster;
+
 	/*
 	 * Inform a new master that the layout may have changed while
 	 * it was gone.
@@ -1124,10 +1254,31 @@ static void vmw_master_drop(struct drm_device *dev,
 			    struct drm_file *file_priv)
 {
 	struct vmw_private *dev_priv = vmw_priv(dev);
+	struct vmw_fpriv *vmw_fp = vmw_fpriv(file_priv);
+	struct vmw_master *vmaster = vmw_master(file_priv->master);
+	int ret;
+
+	/*
+	 * Make sure the master doesn't disappear while we have
+	 * it locked.
+	 */
 
+	vmw_fp->locked_master = drm_master_get(file_priv->master);
+	ret = ttm_vt_lock(&vmaster->lock, false, vmw_fp->tfile);
 	vmw_kms_legacy_hotspot_clear(dev_priv);
+	if (unlikely((ret != 0))) {
+		DRM_ERROR("Unable to lock TTM at VT switch.\n");
+		drm_master_put(&vmw_fp->locked_master);
+	}
+
+	ttm_lock_set_kill(&vmaster->lock, false, SIGTERM);
+
 	if (!dev_priv->enable_fb)
 		vmw_svga_disable(dev_priv);
+
+	dev_priv->active_master = &dev_priv->fbdev_master;
+	ttm_lock_set_kill(&dev_priv->fbdev_master.lock, false, SIGTERM);
+	ttm_vt_unlock(&dev_priv->fbdev_master.lock);
 }
 
 /**
@@ -1406,6 +1557,8 @@ static struct drm_driver driver = {
 	.disable_vblank = vmw_disable_vblank,
 	.ioctls = vmw_ioctls,
 	.num_ioctls = ARRAY_SIZE(vmw_ioctls),
+	.master_create = vmw_master_create,
+	.master_destroy = vmw_master_destroy,
 	.master_set = vmw_master_set,
 	.master_drop = vmw_master_drop,
 	.open = vmw_driver_open,
diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_drv.h b/drivers/gpu/drm/vmwgfx/vmwgfx_drv.h
index 3a358a5495e4..366dcfc1f9bb 100644
--- a/drivers/gpu/drm/vmwgfx/vmwgfx_drv.h
+++ b/drivers/gpu/drm/vmwgfx/vmwgfx_drv.h
@@ -44,9 +44,9 @@
 #include <linux/sync_file.h>
 
 #define VMWGFX_DRIVER_NAME "vmwgfx"
-#define VMWGFX_DRIVER_DATE "20190328"
+#define VMWGFX_DRIVER_DATE "20180704"
 #define VMWGFX_DRIVER_MAJOR 2
-#define VMWGFX_DRIVER_MINOR 16
+#define VMWGFX_DRIVER_MINOR 15
 #define VMWGFX_DRIVER_PATCHLEVEL 0
 #define VMWGFX_FIFO_STATIC_SIZE (1024*1024)
 #define VMWGFX_MAX_RELOCATIONS 2048
@@ -81,30 +81,19 @@
 #define VMW_RES_SHADER ttm_driver_type4
 
 struct vmw_fpriv {
+	struct drm_master *locked_master;
 	struct ttm_object_file *tfile;
 	bool gb_aware; /* user-space is guest-backed aware */
 };
 
-/**
- * struct vmw_buffer_object - TTM buffer object with vmwgfx additions
- * @base: The TTM buffer object
- * @res_tree: RB tree of resources using this buffer object as a backing MOB
- * @pin_count: pin depth
- * @dx_query_ctx: DX context if this buffer object is used as a DX query MOB
- * @map: Kmap object for semi-persistent mappings
- * @res_prios: Eviction priority counts for attached resources
- * @dirty: structure for user-space dirty-tracking
- */
 struct vmw_buffer_object {
 	struct ttm_buffer_object base;
-	struct rb_root res_tree;
+	struct list_head res_list;
 	s32 pin_count;
 	/* Not ref-counted.  Protected by binding_mutex */
 	struct vmw_resource *dx_query_ctx;
 	/* Protected by reservation */
 	struct ttm_bo_kmap_obj map;
-	u32 res_prios[TTM_MAX_BO_PRIORITY];
-	struct vmw_bo_dirty *dirty;
 };
 
 /**
@@ -135,8 +124,7 @@ struct vmw_res_func;
  * @res_dirty: Resource contains data not yet in the backup buffer. Protected
  * by resource reserved.
  * @backup_dirty: Backup buffer contains data not yet in the HW resource.
- * Protected by resource reserved.
- * @coherent: Emulate coherency by tracking vm accesses.
+ * Protected by resource reserved.
  * @backup: The backup buffer if any. Protected by resource reserved.
  * @backup_offset: Offset into the backup buffer if any. Protected by resource
  * reserved. Note that only a few resource types can have a @backup_offset
@@ -145,32 +133,28 @@ struct vmw_res_func;
  * pin-count greater than zero. It is not on the resource LRU lists and its
  * backup buffer is pinned. Hence it can't be evicted.
  * @func: Method vtable for this resource. Immutable.
- * @mob_node; Node for the MOB backup rbtree. Protected by @backup reserved.
  * @lru_head: List head for the LRU list. Protected by @dev_priv::resource_lock.
+ * @mob_head: List head for the MOB backup list. Protected by @backup reserved.
  * @binding_head: List head for the context binding list. Protected by
  * the @dev_priv::binding_mutex
  * @res_free: The resource destructor.
  * @hw_destroy: Callback to destroy the resource on the device, as part of
  * resource destruction.
  */
-struct vmw_resource_dirty;
 struct vmw_resource {
 	struct kref kref;
 	struct vmw_private *dev_priv;
 	int id;
-	u32 used_prio;
 	unsigned long backup_size;
-	u32 res_dirty : 1;
-	u32 backup_dirty : 1;
-	u32 coherent : 1;
+	bool res_dirty;
+	bool backup_dirty;
 	struct vmw_buffer_object *backup;
 	unsigned long backup_offset;
 	unsigned long pin_count;
 	const struct vmw_res_func *func;
-	struct rb_node mob_node;
 	struct list_head lru_head;
+	struct list_head mob_head;
 	struct list_head binding_head;
-	struct vmw_resource_dirty *dirty;
 	void (*res_free) (struct vmw_resource *res);
 	void (*hw_destroy) (struct vmw_resource *res);
 };
@@ -392,6 +376,10 @@ struct vmw_sw_context{
 struct vmw_legacy_display;
 struct vmw_overlay;
 
+struct vmw_master {
+	struct ttm_lock lock;
+};
+
 struct vmw_vga_topology_state {
 	uint32_t width;
 	uint32_t height;
@@ -554,8 +542,11 @@ struct vmw_private {
 	spinlock_t svga_lock;
 
 	/**
-	 * PM management.
+	 * Master management.
 	 */
+
+	struct vmw_master *active_master;
+	struct vmw_master fbdev_master;
 	struct notifier_block pm_nb;
 	bool refuse_hibernation;
 	bool suspend_locked;
@@ -604,9 +595,6 @@ struct vmw_private {
 
 	/* Validation memory reservation */
 	struct vmw_validation_mem vvm;
-
-	/* VM operations */
-	struct vm_operations_struct vm_ops;
 };
 
 static inline struct vmw_surface *vmw_res_to_srf(struct vmw_resource *res)
@@ -624,6 +612,11 @@ static inline struct vmw_fpriv *vmw_fpriv(struct drm_file *file_priv)
 	return (struct vmw_fpriv *)file_priv->driver_priv;
 }
 
+static inline struct vmw_master *vmw_master(struct drm_master *master)
+{
+	return (struct vmw_master *) master->driver_priv;
+}
+
 /*
  * The locking here is fine-grained, so that it is performed once
  * for every read- and write operation. This is of course costly, but we
@@ -676,8 +669,7 @@ extern void vmw_resource_unreference(struct vmw_resource **p_res);
 extern struct vmw_resource *vmw_resource_reference(struct vmw_resource *res);
 extern struct vmw_resource *
 vmw_resource_reference_unless_doomed(struct vmw_resource *res);
-extern int vmw_resource_validate(struct vmw_resource *res, bool intr,
-				 bool dirtying);
+extern int vmw_resource_validate(struct vmw_resource *res, bool intr);
 extern int vmw_resource_reserve(struct vmw_resource *res, bool interruptible,
 				bool no_backup);
 extern bool vmw_resource_needs_backup(const struct vmw_resource *res);
@@ -717,23 +709,6 @@ extern void vmw_query_move_notify(struct ttm_buffer_object *bo,
 extern int vmw_query_readback_all(struct vmw_buffer_object *dx_query_mob);
 extern void vmw_resource_evict_all(struct vmw_private *dev_priv);
 extern void vmw_resource_unbind_list(struct vmw_buffer_object *vbo);
-void vmw_resource_mob_attach(struct vmw_resource *res);
-void vmw_resource_mob_detach(struct vmw_resource *res);
-void vmw_resource_dirty_update(struct vmw_resource *res, pgoff_t start,
-			       pgoff_t end);
-int vmw_resources_clean(struct vmw_buffer_object *vbo, pgoff_t start,
-			pgoff_t end, pgoff_t *num_prefault);
-
-/**
- * vmw_resource_mob_attached - Whether a resource currently has a mob attached
- * @res: The resource
- *
- * Return: true if the resource has a mob attached, false otherwise.
- */
-static inline bool vmw_resource_mob_attached(const struct vmw_resource *res)
-{
-	return !RB_EMPTY_NODE(&res->mob_node);
-}
 
 /**
  * vmw_user_resource_noref_release - release a user resource pointer looked up
@@ -812,54 +787,6 @@ static inline void vmw_user_bo_noref_release(void)
 	ttm_base_object_noref_release();
 }
 
-/**
- * vmw_bo_adjust_prio - Adjust the buffer object eviction priority
- * according to attached resources
- * @vbo: The struct vmw_buffer_object
- */
-static inline void vmw_bo_prio_adjust(struct vmw_buffer_object *vbo)
-{
-	int i = ARRAY_SIZE(vbo->res_prios);
-
-	while (i--) {
-		if (vbo->res_prios[i]) {
-			vbo->base.priority = i;
-			return;
-		}
-	}
-
-	vbo->base.priority = 3;
-}
-
-/**
- * vmw_bo_prio_add - Notify a buffer object of a newly attached resource
- * eviction priority
- * @vbo: The struct vmw_buffer_object
- * @prio: The resource priority
- *
- * After being notified, the code assigns the highest resource eviction priority
- * to the backing buffer object (mob).
- */
-static inline void vmw_bo_prio_add(struct vmw_buffer_object *vbo, int prio)
-{
-	if (vbo->res_prios[prio]++ == 0)
-		vmw_bo_prio_adjust(vbo);
-}
-
-/**
- * vmw_bo_prio_del - Notify a buffer object of a resource with a certain
- * priority being removed
- * @vbo: The struct vmw_buffer_object
- * @prio: The resource priority
- *
- * After being notified, the code assigns the highest resource eviction priority
- * to the backing buffer object (mob).
- */
-static inline void vmw_bo_prio_del(struct vmw_buffer_object *vbo, int prio)
-{
-	if (--vbo->res_prios[prio] == 0)
-		vmw_bo_prio_adjust(vbo);
-}
 
 /**
  * Misc Ioctl functionality - vmwgfx_ioctl.c
@@ -1089,6 +1016,7 @@ void vmw_kms_cursor_snoop(struct vmw_surface *srf,
 int vmw_kms_write_svga(struct vmw_private *vmw_priv,
 		       unsigned width, unsigned height, unsigned pitch,
 		       unsigned bpp, unsigned depth);
+void vmw_kms_idle_workqueues(struct vmw_master *vmaster);
 bool vmw_kms_validate_mode_vram(struct vmw_private *dev_priv,
 				uint32_t pitch,
 				uint32_t height);
@@ -1410,25 +1338,6 @@ int vmw_host_log(const char *log);
 #define VMW_DEBUG_USER(fmt, ...)                                              \
 	DRM_DEBUG_DRIVER(fmt, ##__VA_ARGS__)
 
-/**
- * VMW_DEBUG_KMS - Debug output for kernel mode-setting
- *
- * This macro is for debugging vmwgfx mode-setting code.
- */
-#define VMW_DEBUG_KMS(fmt, ...)                                               \
-	DRM_DEBUG_DRIVER(fmt, ##__VA_ARGS__)
-
-/* Resource dirtying - vmwgfx_page_dirty.c */
-void vmw_bo_dirty_scan(struct vmw_buffer_object *vbo);
-int vmw_bo_dirty_add(struct vmw_buffer_object *vbo);
-void vmw_bo_dirty_transfer_to_res(struct vmw_resource *res);
-void vmw_bo_dirty_clear_res(struct vmw_resource *res);
-void vmw_bo_dirty_release(struct vmw_buffer_object *vbo);
-void vmw_bo_dirty_unmap(struct vmw_buffer_object *vbo,
-			pgoff_t start, pgoff_t end);
-vm_fault_t vmw_bo_vm_fault(struct vm_fault *vmf);
-vm_fault_t vmw_bo_vm_mkwrite(struct vm_fault *vmf);
-
 /**
  * Inline helper functions
  */
diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_execbuf.c b/drivers/gpu/drm/vmwgfx/vmwgfx_execbuf.c
index 319c1ca35663..33533d126277 100644
--- a/drivers/gpu/drm/vmwgfx/vmwgfx_execbuf.c
+++ b/drivers/gpu/drm/vmwgfx/vmwgfx_execbuf.c
@@ -2560,6 +2560,7 @@ static int vmw_cmd_dx_check_subresource(struct vmw_private *dev_priv,
 		     offsetof(typeof(*cmd), sid));
 
 	cmd = container_of(header, typeof(*cmd), header);
+
 	return vmw_cmd_res_check(dev_priv, sw_context, vmw_res_surface,
 				 VMW_RES_DIRTY_NONE, user_surface_converter,
 				 &cmd->sid, NULL);
diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_kms.c b/drivers/gpu/drm/vmwgfx/vmwgfx_kms.c
index e7222fa2cfdf..b97bc8e5944b 100644
--- a/drivers/gpu/drm/vmwgfx/vmwgfx_kms.c
+++ b/drivers/gpu/drm/vmwgfx/vmwgfx_kms.c
@@ -1462,7 +1462,7 @@ static int vmw_kms_check_display_memory(struct drm_device *dev,
 		if (dev_priv->active_display_unit == vmw_du_screen_target &&
 		    (drm_rect_width(&rects[i]) > dev_priv->stdu_max_width ||
 		     drm_rect_height(&rects[i]) > dev_priv->stdu_max_height)) {
-			VMW_DEBUG_KMS("Screen size not supported.\n");
+			DRM_ERROR("Screen size not supported.\n");
 			return -EINVAL;
 		}
 
@@ -1486,7 +1486,7 @@ static int vmw_kms_check_display_memory(struct drm_device *dev,
 	 * limit on primary bounding box
 	 */
 	if (pixel_mem > dev_priv->prim_bb_mem) {
-		VMW_DEBUG_KMS("Combined output size too large.\n");
+		DRM_ERROR("Combined output size too large.\n");
 		return -EINVAL;
 	}
 
@@ -1496,7 +1496,7 @@ static int vmw_kms_check_display_memory(struct drm_device *dev,
 		bb_mem = (u64) bounding_box.x2 * bounding_box.y2 * 4;
 
 		if (bb_mem > dev_priv->prim_bb_mem) {
-			VMW_DEBUG_KMS("Topology is beyond supported limits.\n");
+			DRM_ERROR("Topology is beyond supported limits.\n");
 			return -EINVAL;
 		}
 	}
@@ -1645,7 +1645,6 @@ static int vmw_kms_check_topology(struct drm_device *dev,
 		struct vmw_connector_state *vmw_conn_state;
 
 		if (!du->pref_active && new_crtc_state->enable) {
-			VMW_DEBUG_KMS("Enabling a disabled display unit\n");
 			ret = -EINVAL;
 			goto clean;
 		}
@@ -1702,10 +1701,8 @@ vmw_kms_atomic_check_modeset(struct drm_device *dev,
 		return ret;
 
 	ret = vmw_kms_check_implicit(dev, state);
-	if (ret) {
-		VMW_DEBUG_KMS("Invalid implicit state\n");
+	if (ret)
 		return ret;
-	}
 
 	if (!state->allow_modeset)
 		return ret;
@@ -2350,9 +2347,6 @@ int vmw_kms_update_layout_ioctl(struct drm_device *dev, void *data,
 
 	if (!arg->num_outputs) {
 		struct drm_rect def_rect = {0, 0, 800, 600};
-		VMW_DEBUG_KMS("Default layout x1 = %d y1 = %d x2 = %d y2 = %d\n",
-			      def_rect.x1, def_rect.y1,
-			      def_rect.x2, def_rect.y2);
 		vmw_du_update_layout(dev_priv, 1, &def_rect);
 		return 0;
 	}
@@ -2373,7 +2367,6 @@ int vmw_kms_update_layout_ioctl(struct drm_device *dev, void *data,
 
 	drm_rects = (struct drm_rect *)rects;
 
-	VMW_DEBUG_KMS("Layout count = %u\n", arg->num_outputs);
 	for (i = 0; i < arg->num_outputs; i++) {
 		struct drm_vmw_rect curr_rect;
 
@@ -2390,10 +2383,6 @@ int vmw_kms_update_layout_ioctl(struct drm_device *dev, void *data,
 		drm_rects[i].x2 = curr_rect.x + curr_rect.w;
 		drm_rects[i].y2 = curr_rect.y + curr_rect.h;
 
-		VMW_DEBUG_KMS("  x1 = %d y1 = %d x2 = %d y2 = %d\n",
-			      drm_rects[i].x1, drm_rects[i].y1,
-			      drm_rects[i].x2, drm_rects[i].y2);
-
 		/*
 		 * Currently this check is limiting the topology within
 		 * mode_config->max (which actually is max texture size
@@ -2404,9 +2393,7 @@ int vmw_kms_update_layout_ioctl(struct drm_device *dev, void *data,
 		if (drm_rects[i].x1 < 0 ||  drm_rects[i].y1 < 0 ||
 		    drm_rects[i].x2 > mode_config->max_width ||
 		    drm_rects[i].y2 > mode_config->max_height) {
-			VMW_DEBUG_KMS("Invalid layout %d %d %d %d\n",
-				      drm_rects[i].x1, drm_rects[i].y1,
-				      drm_rects[i].x2, drm_rects[i].y2);
+			DRM_ERROR("Invalid GUI layout.\n");
 			ret = -EINVAL;
 			goto out_free;
 		}
diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_page_dirty.c b/drivers/gpu/drm/vmwgfx/vmwgfx_page_dirty.c
deleted file mode 100644
index 730c51e397dd..000000000000
--- a/drivers/gpu/drm/vmwgfx/vmwgfx_page_dirty.c
+++ /dev/null
@@ -1,472 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0 OR MIT
-/**************************************************************************
- *
- * Copyright 2019 VMware, Inc., Palo Alto, CA., USA
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
- * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
- * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
- * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
- * USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- **************************************************************************/
-#include "vmwgfx_drv.h"
-
-/*
- * Different methods for tracking dirty:
- * VMW_BO_DIRTY_PAGETABLE - Scan the pagetable for hardware dirty bits
- * VMW_BO_DIRTY_MKWRITE - Write-protect page table entries and record write-
- * accesses in the VM mkwrite() callback
- */
-enum vmw_bo_dirty_method {
-	VMW_BO_DIRTY_PAGETABLE,
-	VMW_BO_DIRTY_MKWRITE,
-};
-
-/*
- * No dirtied pages at scan trigger a transition to the _MKWRITE method,
- * similarly a certain percentage of dirty pages trigger a transition to
- * the _PAGETABLE method. How many triggers should we wait for before
- * changing method?
- */
-#define VMW_DIRTY_NUM_CHANGE_TRIGGERS 2
-
-/* Percentage to trigger a transition to the _PAGETABLE method */
-#define VMW_DIRTY_PERCENTAGE 10
-
-/**
- * struct vmw_bo_dirty - Dirty information for buffer objects
- * @start: First currently dirty bit
- * @end: Last currently dirty bit + 1
- * @method: The currently used dirty method
- * @change_count: Number of consecutive method change triggers
- * @ref_count: Reference count for this structure
- * @bitmap_size: The size of the bitmap in bits. Typically equal to the
- * nuber of pages in the bo.
- * @size: The accounting size for this struct.
- * @bitmap: A bitmap where each bit represents a page. A set bit means a
- * dirty page.
- */
-struct vmw_bo_dirty {
-	unsigned long start;
-	unsigned long end;
-	enum vmw_bo_dirty_method method;
-	unsigned int change_count;
-	unsigned int ref_count;
-	unsigned long bitmap_size;
-	size_t size;
-	unsigned long bitmap[0];
-};
-
-/**
- * vmw_bo_dirty_scan_pagetable - Perform a pagetable scan for dirty bits
- * @vbo: The buffer object to scan
- *
- * Scans the pagetable for dirty bits. Clear those bits and modify the
- * dirty structure with the results. This function may change the
- * dirty-tracking method.
- */
-static void vmw_bo_dirty_scan_pagetable(struct vmw_buffer_object *vbo)
-{
-	struct vmw_bo_dirty *dirty = vbo->dirty;
-	pgoff_t offset = drm_vma_node_start(&vbo->base.vma_node);
-	struct address_space *mapping = vbo->base.bdev->dev_mapping;
-	pgoff_t num_marked;
-
-	num_marked = apply_as_clean(mapping,
-				    offset, dirty->bitmap_size,
-				    offset, &dirty->bitmap[0],
-				    &dirty->start, &dirty->end);
-	if (num_marked == 0)
-		dirty->change_count++;
-	else
-		dirty->change_count = 0;
-
-	if (dirty->change_count > VMW_DIRTY_NUM_CHANGE_TRIGGERS) {
-		dirty->change_count = 0;
-		dirty->method = VMW_BO_DIRTY_MKWRITE;
-		apply_as_wrprotect(mapping,
-				   offset, dirty->bitmap_size);
-		apply_as_clean(mapping,
-			       offset, dirty->bitmap_size,
-			       offset, &dirty->bitmap[0],
-			       &dirty->start, &dirty->end);
-	}
-}
-
-/**
- * vmw_bo_dirty_scan_mkwrite - Reset the mkwrite dirty-tracking method
- * @vbo: The buffer object to scan
- *
- * Write-protect pages written to so that consecutive write accesses will
- * trigger a call to mkwrite.
- *
- * This function may change the dirty-tracking method.
- */
-static void vmw_bo_dirty_scan_mkwrite(struct vmw_buffer_object *vbo)
-{
-	struct vmw_bo_dirty *dirty = vbo->dirty;
-	unsigned long offset = drm_vma_node_start(&vbo->base.vma_node);
-	struct address_space *mapping = vbo->base.bdev->dev_mapping;
-	pgoff_t num_marked;
-
-	if (dirty->end <= dirty->start)
-		return;
-
-	num_marked = apply_as_wrprotect(vbo->base.bdev->dev_mapping,
-					dirty->start + offset,
-					dirty->end - dirty->start);
-
-	if (100UL * num_marked / dirty->bitmap_size >
-	    VMW_DIRTY_PERCENTAGE) {
-		dirty->change_count++;
-	} else {
-		dirty->change_count = 0;
-	}
-
-	if (dirty->change_count > VMW_DIRTY_NUM_CHANGE_TRIGGERS) {
-		pgoff_t start = 0;
-		pgoff_t end = dirty->bitmap_size;
-
-		dirty->method = VMW_BO_DIRTY_PAGETABLE;
-		apply_as_clean(mapping, offset, end, offset, &dirty->bitmap[0],
-			       &start, &end);
-		bitmap_clear(&dirty->bitmap[0], 0, dirty->bitmap_size);
-		if (dirty->start < dirty->end)
-			bitmap_set(&dirty->bitmap[0], dirty->start,
-				   dirty->end - dirty->start);
-		dirty->change_count = 0;
-	}
-}
-
-/**
- * vmw_bo_dirty_scan - Scan for dirty pages and add them to the dirty
- * tracking structure
- * @vbo: The buffer object to scan
- *
- * This function may change the dirty tracking method.
- */
-void vmw_bo_dirty_scan(struct vmw_buffer_object *vbo)
-{
-	struct vmw_bo_dirty *dirty = vbo->dirty;
-
-	if (dirty->method == VMW_BO_DIRTY_PAGETABLE)
-		vmw_bo_dirty_scan_pagetable(vbo);
-	else
-		vmw_bo_dirty_scan_mkwrite(vbo);
-}
-
-/**
- * vmw_bo_dirty_pre_unmap - write-protect and pick up dirty pages before
- * an unmap_mapping_range operation.
- * @vbo: The buffer object,
- * @start: First page of the range within the buffer object.
- * @end: Last page of the range within the buffer object + 1.
- *
- * If we're using the _PAGETABLE scan method, we may leak dirty pages
- * when calling unmap_mapping_range(). This function makes sure we pick
- * up all dirty pages.
- */
-static void vmw_bo_dirty_pre_unmap(struct vmw_buffer_object *vbo,
-				   pgoff_t start, pgoff_t end)
-{
-	struct vmw_bo_dirty *dirty = vbo->dirty;
-	unsigned long offset = drm_vma_node_start(&vbo->base.vma_node);
-	struct address_space *mapping = vbo->base.bdev->dev_mapping;
-
-	if (dirty->method != VMW_BO_DIRTY_PAGETABLE || start >= end)
-		return;
-
-	apply_as_wrprotect(mapping, start + offset, end - start);
-	apply_as_clean(mapping, start + offset, end - start, offset,
-		       &dirty->bitmap[0], &dirty->start, &dirty->end);
-}
-
-/**
- * vmw_bo_dirty_unmap - Clear all ptes pointing to a range within a bo
- * @vbo: The buffer object,
- * @start: First page of the range within the buffer object.
- * @end: Last page of the range within the buffer object + 1.
- *
- * This is similar to ttm_bo_unmap_virtual_locked() except it takes a subrange.
- */
-void vmw_bo_dirty_unmap(struct vmw_buffer_object *vbo,
-			pgoff_t start, pgoff_t end)
-{
-	unsigned long offset = drm_vma_node_start(&vbo->base.vma_node);
-	struct address_space *mapping = vbo->base.bdev->dev_mapping;
-
-	vmw_bo_dirty_pre_unmap(vbo, start, end);
-	unmap_shared_mapping_range(mapping, (offset + start) << PAGE_SHIFT,
-				   (loff_t) (end - start) << PAGE_SHIFT);
-}
-
-/**
- * vmw_bo_dirty_add - Add a dirty-tracking user to a buffer object
- * @vbo: The buffer object
- *
- * This function registers a dirty-tracking user to a buffer object.
- * A user can be for example a resource or a vma in a special user-space
- * mapping.
- *
- * Return: Zero on success, -ENOMEM on memory allocation failure.
- */
-int vmw_bo_dirty_add(struct vmw_buffer_object *vbo)
-{
-	struct vmw_bo_dirty *dirty = vbo->dirty;
-	pgoff_t num_pages = vbo->base.num_pages;
-	size_t size, acc_size;
-	int ret;
-	static struct ttm_operation_ctx ctx = {
-		.interruptible = false,
-		.no_wait_gpu = false
-	};
-
-	if (dirty) {
-		dirty->ref_count++;
-		return 0;
-	}
-
-	size = sizeof(*dirty) + BITS_TO_LONGS(num_pages) * sizeof(long);
-	acc_size = ttm_round_pot(size);
-	ret = ttm_mem_global_alloc(&ttm_mem_glob, acc_size, &ctx);
-	if (ret) {
-		VMW_DEBUG_USER("Out of graphics memory for buffer object "
-			       "dirty tracker.\n");
-		return ret;
-	}
-	dirty = kvzalloc(size, GFP_KERNEL);
-	if (!dirty) {
-		ret = -ENOMEM;
-		goto out_no_dirty;
-	}
-
-	dirty->size = acc_size;
-	dirty->bitmap_size = num_pages;
-	dirty->start = dirty->bitmap_size;
-	dirty->end = 0;
-	dirty->ref_count = 1;
-	if (num_pages < PAGE_SIZE / sizeof(pte_t)) {
-		dirty->method = VMW_BO_DIRTY_PAGETABLE;
-	} else {
-		struct address_space *mapping = vbo->base.bdev->dev_mapping;
-		pgoff_t offset = drm_vma_node_start(&vbo->base.vma_node);
-
-		dirty->method = VMW_BO_DIRTY_MKWRITE;
-
-		/* Write-protect and then pick up already dirty bits */
-		apply_as_wrprotect(mapping, offset, num_pages);
-		apply_as_clean(mapping, offset, num_pages, offset,
-			       &dirty->bitmap[0], &dirty->start, &dirty->end);
-	}
-
-	vbo->dirty = dirty;
-
-	return 0;
-
-out_no_dirty:
-	ttm_mem_global_free(&ttm_mem_glob, acc_size);
-	return ret;
-}
-
-/**
- * vmw_bo_dirty_release - Release a dirty-tracking user from a buffer object
- * @vbo: The buffer object
- *
- * This function releases a dirty-tracking user from a buffer object.
- * If the reference count reaches zero, then the dirty-tracking object is
- * freed and the pointer to it cleared.
- *
- * Return: Zero on success, -ENOMEM on memory allocation failure.
- */
-void vmw_bo_dirty_release(struct vmw_buffer_object *vbo)
-{
-	struct vmw_bo_dirty *dirty = vbo->dirty;
-
-	if (dirty && --dirty->ref_count == 0) {
-		size_t acc_size = dirty->size;
-
-		kvfree(dirty);
-		ttm_mem_global_free(&ttm_mem_glob, acc_size);
-		vbo->dirty = NULL;
-	}
-}
-
-/**
- * vmw_bo_dirty_transfer_to_res - Pick up a resource's dirty region from
- * its backing mob.
- * @res: The resource
- *
- * This function will pick up all dirty ranges affecting the resource from
- * it's backup mob, and call vmw_resource_dirty_update() once for each
- * range. The transferred ranges will be cleared from the backing mob's
- * dirty tracking.
- */
-void vmw_bo_dirty_transfer_to_res(struct vmw_resource *res)
-{
-	struct vmw_buffer_object *vbo = res->backup;
-	struct vmw_bo_dirty *dirty = vbo->dirty;
-	pgoff_t start, cur, end;
-	unsigned long res_start = res->backup_offset;
-	unsigned long res_end = res->backup_offset + res->backup_size;
-
-	WARN_ON_ONCE(res_start & ~PAGE_MASK);
-	res_start >>= PAGE_SHIFT;
-	res_end = DIV_ROUND_UP(res_end, PAGE_SIZE);
-
-	if (res_start >= dirty->end || res_end <= dirty->start)
-		return;
-
-	cur = max(res_start, dirty->start);
-	res_end = max(res_end, dirty->end);
-	while (cur < res_end) {
-		unsigned long num;
-
-		start = find_next_bit(&dirty->bitmap[0], res_end, cur);
-		if (start >= res_end)
-			break;
-
-		end = find_next_zero_bit(&dirty->bitmap[0], res_end, start + 1);
-		cur = end + 1;
-		num = end - start;
-		bitmap_clear(&dirty->bitmap[0], start, num);
-		vmw_resource_dirty_update(res, start, end);
-	}
-
-	if (res_start <= dirty->start && res_end > dirty->start)
-		dirty->start = res_end;
-	if (res_start < dirty->end && res_end >= dirty->end)
-		dirty->end = res_start;
-}
-
-/**
- * vmw_bo_dirty_clear_res - Clear a resource's dirty region from
- * its backing mob.
- * @res: The resource
- *
- * This function will clear all dirty ranges affecting the resource from
- * it's backup mob's dirty tracking.
- */
-void vmw_bo_dirty_clear_res(struct vmw_resource *res)
-{
-	unsigned long res_start = res->backup_offset;
-	unsigned long res_end = res->backup_offset + res->backup_size;
-	struct vmw_buffer_object *vbo = res->backup;
-	struct vmw_bo_dirty *dirty = vbo->dirty;
-
-	res_start >>= PAGE_SHIFT;
-	res_end = DIV_ROUND_UP(res_end, PAGE_SIZE);
-
-	if (res_start >= dirty->end || res_end <= dirty->start)
-		return;
-
-	res_start = max(res_start, dirty->start);
-	res_end = min(res_end, dirty->end);
-	bitmap_clear(&dirty->bitmap[0], res_start, res_end - res_start);
-
-	if (res_start <= dirty->start && res_end > dirty->start)
-		dirty->start = res_end;
-	if (res_start < dirty->end && res_end >= dirty->end)
-		dirty->end = res_start;
-}
-
-vm_fault_t vmw_bo_vm_mkwrite(struct vm_fault *vmf)
-{
-	struct vm_area_struct *vma = vmf->vma;
-	struct ttm_buffer_object *bo = (struct ttm_buffer_object *)
-	    vma->vm_private_data;
-	vm_fault_t ret;
-	unsigned long page_offset;
-	struct vmw_buffer_object *vbo =
-		container_of(bo, typeof(*vbo), base);
-
-	ret = ttm_bo_vm_reserve(bo, vmf);
-	if (ret)
-		return ret;
-
-	page_offset = vmf->pgoff - drm_vma_node_start(&bo->vma_node);
-	if (unlikely(page_offset >= bo->num_pages)) {
-		ret = VM_FAULT_SIGBUS;
-		goto out_unlock;
-	}
-
-	if (vbo->dirty && vbo->dirty->method == VMW_BO_DIRTY_MKWRITE &&
-	    !test_bit(page_offset, &vbo->dirty->bitmap[0])) {
-		struct vmw_bo_dirty *dirty = vbo->dirty;
-
-		__set_bit(page_offset, &dirty->bitmap[0]);
-		dirty->start = min(dirty->start, page_offset);
-		dirty->end = max(dirty->end, page_offset + 1);
-	}
-
-out_unlock:
-	reservation_object_unlock(bo->resv);
-	return ret;
-}
-
-vm_fault_t vmw_bo_vm_fault(struct vm_fault *vmf)
-{
-	struct vm_area_struct *vma = vmf->vma;
-	struct ttm_buffer_object *bo = (struct ttm_buffer_object *)
-	    vma->vm_private_data;
-	struct vmw_buffer_object *vbo =
-		container_of(bo, struct vmw_buffer_object, base);
-	pgoff_t num_prefault;
-	pgprot_t prot;
-	vm_fault_t ret;
-
-	ret = ttm_bo_vm_reserve(bo, vmf);
-	if (ret)
-		return ret;
-
-	num_prefault = (vma->vm_flags & VM_RAND_READ) ? 1 :
-		TTM_BO_VM_NUM_PREFAULT;
-
-	if (vbo->dirty) {
-		pgoff_t allowed_prefault;
-		unsigned long page_offset;
-
-		page_offset = vmf->pgoff - drm_vma_node_start(&bo->vma_node);
-		if (page_offset >= bo->num_pages ||
-		    vmw_resources_clean(vbo, page_offset,
-					page_offset + PAGE_SIZE,
-					&allowed_prefault)) {
-			ret = VM_FAULT_SIGBUS;
-			goto out_unlock;
-		}
-
-		num_prefault = min(num_prefault, allowed_prefault);
-	}
-
-	/*
-	 * If we don't track dirty using the MKWRITE method, make sure
-	 * sure the page protection is write-enabled so we don't get
-	 * a lot of unnecessary write faults.
-	 */
-	if (vbo->dirty && vbo->dirty->method == VMW_BO_DIRTY_MKWRITE)
-		prot = vma->vm_page_prot;
-	else
-		prot = vm_get_page_prot(vma->vm_flags);
-
-	ret = ttm_bo_vm_fault_reserved(vmf, prot, num_prefault);
-	if (ret == VM_FAULT_RETRY && !(vmf->flags & FAULT_FLAG_RETRY_NOWAIT))
-		return ret;
-
-out_unlock:
-	reservation_object_unlock(bo->resv);
-	return ret;
-}
diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_resource.c b/drivers/gpu/drm/vmwgfx/vmwgfx_resource.c
index d70ee0df5c13..1d38a8b2f2ec 100644
--- a/drivers/gpu/drm/vmwgfx/vmwgfx_resource.c
+++ b/drivers/gpu/drm/vmwgfx/vmwgfx_resource.c
@@ -34,51 +34,6 @@
 
 #define VMW_RES_EVICT_ERR_COUNT 10
 
-/**
- * vmw_resource_mob_attach - Mark a resource as attached to its backing mob
- * @res: The resource
- */
-void vmw_resource_mob_attach(struct vmw_resource *res)
-{
-	struct vmw_buffer_object *backup = res->backup;
-	struct rb_node **new = &backup->res_tree.rb_node, *parent = NULL;
-
-	lockdep_assert_held(&backup->base.resv->lock.base);
-	res->used_prio = (res->res_dirty) ? res->func->dirty_prio :
-		res->func->prio;
-
-	while (*new) {
-		struct vmw_resource *this =
-			container_of(*new, struct vmw_resource, mob_node);
-
-		parent = *new;
-		new = (res->backup_offset < this->backup_offset) ?
-			&((*new)->rb_left) : &((*new)->rb_right);
-	}
-
-	rb_link_node(&res->mob_node, parent, new);
-	rb_insert_color(&res->mob_node, &backup->res_tree);
-
-	vmw_bo_prio_add(backup, res->used_prio);
-}
-
-/**
- * vmw_resource_mob_detach - Mark a resource as detached from its backing mob
- * @res: The resource
- */
-void vmw_resource_mob_detach(struct vmw_resource *res)
-{
-	struct vmw_buffer_object *backup = res->backup;
-
-	lockdep_assert_held(&backup->base.resv->lock.base);
-	if (vmw_resource_mob_attached(res)) {
-		rb_erase(&res->mob_node, &backup->res_tree);
-		RB_CLEAR_NODE(&res->mob_node);
-		vmw_bo_prio_del(backup, res->used_prio);
-	}
-}
-
-
 struct vmw_resource *vmw_resource_reference(struct vmw_resource *res)
 {
 	kref_get(&res->kref);
@@ -125,7 +80,7 @@ static void vmw_resource_release(struct kref *kref)
 		struct ttm_buffer_object *bo = &res->backup->base;
 
 		ttm_bo_reserve(bo, false, false, NULL);
-		if (vmw_resource_mob_attached(res) &&
+		if (!list_empty(&res->mob_head) &&
 		    res->func->unbind != NULL) {
 			struct ttm_validate_buffer val_buf;
 
@@ -134,11 +89,7 @@ static void vmw_resource_release(struct kref *kref)
 			res->func->unbind(res, false, &val_buf);
 		}
 		res->backup_dirty = false;
-		vmw_resource_mob_detach(res);
-		if (res->dirty)
-			res->func->dirty_free(res);
-		if (res->coherent)
-			vmw_bo_dirty_release(res->backup);
+		list_del_init(&res->mob_head);
 		ttm_bo_unreserve(bo);
 		vmw_bo_unreference(&res->backup);
 	}
@@ -220,17 +171,14 @@ int vmw_resource_init(struct vmw_private *dev_priv, struct vmw_resource *res,
 	res->res_free = res_free;
 	res->dev_priv = dev_priv;
 	res->func = func;
-	RB_CLEAR_NODE(&res->mob_node);
 	INIT_LIST_HEAD(&res->lru_head);
+	INIT_LIST_HEAD(&res->mob_head);
 	INIT_LIST_HEAD(&res->binding_head);
 	res->id = -1;
 	res->backup = NULL;
 	res->backup_offset = 0;
 	res->backup_dirty = false;
 	res->res_dirty = false;
-	res->coherent = false;
-	res->used_prio = 3;
-	res->dirty = NULL;
 	if (delay_id)
 		return 0;
 	else
@@ -395,8 +343,7 @@ out_no_bo:
  * should be retried once resources have been freed up.
  */
 static int vmw_resource_do_validate(struct vmw_resource *res,
-				    struct ttm_validate_buffer *val_buf,
-				    bool dirtying)
+				    struct ttm_validate_buffer *val_buf)
 {
 	int ret = 0;
 	const struct vmw_res_func *func = res->func;
@@ -408,47 +355,14 @@ static int vmw_resource_do_validate(struct vmw_resource *res,
 	}
 
 	if (func->bind &&
-	    ((func->needs_backup && !vmw_resource_mob_attached(res) &&
+	    ((func->needs_backup && list_empty(&res->mob_head) &&
 	      val_buf->bo != NULL) ||
 	     (!func->needs_backup && val_buf->bo != NULL))) {
 		ret = func->bind(res, val_buf);
 		if (unlikely(ret != 0))
 			goto out_bind_failed;
 		if (func->needs_backup)
-			vmw_resource_mob_attach(res);
-	}
-
-	/*
-	 * Handle the case where the backup mob is marked coherent but
-	 * the resource isn't.
-	 */
-	if (func->dirty_alloc && vmw_resource_mob_attached(res) &&
-	    !res->coherent) {
-		if (res->backup->dirty && !res->dirty) {
-			ret = func->dirty_alloc(res);
-			if (ret)
-				return ret;
-		} else if (!res->backup->dirty && res->dirty) {
-			func->dirty_free(res);
-		}
-	}
-
-	/*
-	 * Transfer the dirty regions to the resource and update
-	 * the resource.
-	 */
-	if (res->dirty) {
-		if (dirtying && !res->res_dirty) {
-			pgoff_t start = res->backup_offset >> PAGE_SHIFT;
-			pgoff_t end = __KERNEL_DIV_ROUND_UP
-				(res->backup_offset + res->backup_size,
-				 PAGE_SIZE);
-
-			vmw_bo_dirty_unmap(res->backup, start, end);
-		}
-
-		vmw_bo_dirty_transfer_to_res(res);
-		return func->dirty_sync(res);
+			list_add_tail(&res->mob_head, &res->backup->res_list);
 	}
 
 	return 0;
@@ -488,29 +402,19 @@ void vmw_resource_unreserve(struct vmw_resource *res,
 
 	if (switch_backup && new_backup != res->backup) {
 		if (res->backup) {
-			vmw_resource_mob_detach(res);
-			if (res->coherent)
-				vmw_bo_dirty_release(res->backup);
+			lockdep_assert_held(&res->backup->base.resv->lock.base);
+			list_del_init(&res->mob_head);
 			vmw_bo_unreference(&res->backup);
 		}
 
 		if (new_backup) {
 			res->backup = vmw_bo_reference(new_backup);
-
-			/*
-			 * The validation code should already have added a
-			 * dirty tracker here.
-			 */
-			WARN_ON(res->coherent && !new_backup->dirty);
-
-			vmw_resource_mob_attach(res);
+			lockdep_assert_held(&new_backup->base.resv->lock.base);
+			list_add_tail(&res->mob_head, &new_backup->res_list);
 		} else {
 			res->backup = NULL;
 		}
-	} else if (switch_backup && res->coherent) {
-		vmw_bo_dirty_release(res->backup);
 	}
-
 	if (switch_backup)
 		res->backup_offset = new_backup_offset;
 
@@ -565,7 +469,7 @@ vmw_resource_check_buffer(struct ww_acquire_ctx *ticket,
 	if (unlikely(ret != 0))
 		goto out_no_reserve;
 
-	if (res->func->needs_backup && !vmw_resource_mob_attached(res))
+	if (res->func->needs_backup && list_empty(&res->mob_head))
 		return 0;
 
 	backup_dirty = res->backup_dirty;
@@ -670,11 +574,11 @@ static int vmw_resource_do_evict(struct ww_acquire_ctx *ticket,
 		return ret;
 
 	if (unlikely(func->unbind != NULL &&
-		     (!func->needs_backup || vmw_resource_mob_attached(res)))) {
+		     (!func->needs_backup || !list_empty(&res->mob_head)))) {
 		ret = func->unbind(res, res->res_dirty, &val_buf);
 		if (unlikely(ret != 0))
 			goto out_no_unbind;
-		vmw_resource_mob_detach(res);
+		list_del_init(&res->mob_head);
 	}
 	ret = func->destroy(res);
 	res->backup_dirty = true;
@@ -691,7 +595,6 @@ out_no_unbind:
  *                         to the device.
  * @res: The resource to make visible to the device.
  * @intr: Perform waits interruptible if possible.
- * @dirtying: Pending GPU operation will dirty the resource
  *
  * On succesful return, any backup DMA buffer pointed to by @res->backup will
  * be reserved and validated.
@@ -701,8 +604,7 @@ out_no_unbind:
  * Return: Zero on success, -ERESTARTSYS if interrupted, negative error code
  * on failure.
  */
-int vmw_resource_validate(struct vmw_resource *res, bool intr,
-			  bool dirtying)
+int vmw_resource_validate(struct vmw_resource *res, bool intr)
 {
 	int ret;
 	struct vmw_resource *evict_res;
@@ -719,7 +621,7 @@ int vmw_resource_validate(struct vmw_resource *res, bool intr,
 	if (res->backup)
 		val_buf.bo = &res->backup->base;
 	do {
-		ret = vmw_resource_do_validate(res, &val_buf, dirtying);
+		ret = vmw_resource_do_validate(res, &val_buf);
 		if (likely(ret != -EBUSY))
 			break;
 
@@ -758,7 +660,7 @@ int vmw_resource_validate(struct vmw_resource *res, bool intr,
 	if (unlikely(ret != 0))
 		goto out_no_validate;
 	else if (!res->func->needs_backup && res->backup) {
-		WARN_ON_ONCE(vmw_resource_mob_attached(res));
+		list_del_init(&res->mob_head);
 		vmw_bo_unreference(&res->backup);
 	}
 
@@ -782,23 +684,22 @@ out_no_validate:
  */
 void vmw_resource_unbind_list(struct vmw_buffer_object *vbo)
 {
+
+	struct vmw_resource *res, *next;
 	struct ttm_validate_buffer val_buf = {
 		.bo = &vbo->base,
 		.num_shared = 0
 	};
 
 	lockdep_assert_held(&vbo->base.resv->lock.base);
-	while (!RB_EMPTY_ROOT(&vbo->res_tree)) {
-		struct rb_node *node = vbo->res_tree.rb_node;
-		struct vmw_resource *res =
-			container_of(node, struct vmw_resource, mob_node);
-
-		if (!WARN_ON_ONCE(!res->func->unbind))
-			(void) res->func->unbind(res, res->res_dirty, &val_buf);
+	list_for_each_entry_safe(res, next, &vbo->res_list, mob_head) {
+		if (!res->func->unbind)
+			continue;
 
+		(void) res->func->unbind(res, res->res_dirty, &val_buf);
 		res->backup_dirty = true;
 		res->res_dirty = false;
-		vmw_resource_mob_detach(res);
+		list_del_init(&res->mob_head);
 	}
 
 	(void) ttm_bo_wait(&vbo->base, false, false);
@@ -1019,7 +920,7 @@ int vmw_resource_pin(struct vmw_resource *res, bool interruptible)
 			/* Do we really need to pin the MOB as well? */
 			vmw_bo_pin_reserved(vbo, true);
 		}
-		ret = vmw_resource_validate(res, interruptible, true);
+		ret = vmw_resource_validate(res, interruptible);
 		if (vbo)
 			ttm_bo_unreserve(&vbo->base);
 		if (ret)
@@ -1079,101 +980,3 @@ enum vmw_res_type vmw_res_type(const struct vmw_resource *res)
 {
 	return res->func->res_type;
 }
-
-/**
- * vmw_resource_update_dirty - Update a resource's dirty tracker with a
- * sequential range of touched backing store memory.
- * @res: The resource.
- * @start: The first page touched.
- * @end: The last page touched + 1.
- */
-void vmw_resource_dirty_update(struct vmw_resource *res, pgoff_t start,
-			       pgoff_t end)
-{
-	if (res->dirty)
-		res->func->dirty_range_add(res, start << PAGE_SHIFT,
-					   end << PAGE_SHIFT);
-}
-
-/**
- * vmw_resources_clean - Clean resources intersecting a mob range
- * @vbo: The mob buffer object
- * @start: The mob page offset starting the range
- * @end: The mob page offset ending the range
- * @num_prefault: Returns how many pages including the first have been
- * cleaned and are ok to prefault
- */
-int vmw_resources_clean(struct vmw_buffer_object *vbo, pgoff_t start,
-			pgoff_t end, pgoff_t *num_prefault)
-{
-	struct rb_node *cur = vbo->res_tree.rb_node;
-	struct vmw_resource *found = NULL;
-	unsigned long res_start = start << PAGE_SHIFT;
-	unsigned long res_end = end << PAGE_SHIFT;
-	unsigned long last_cleaned = 0;
-
-	/*
-	 * Find the resource with lowest backup_offset that intersects the
-	 * range.
-	 */
-	while (cur) {
-		struct vmw_resource *cur_res =
-			container_of(cur, struct vmw_resource, mob_node);
-
-		if (cur_res->backup_offset >= res_end) {
-			cur = cur->rb_left;
-		} else if (cur_res->backup_offset + cur_res->backup_size <=
-			   res_start) {
-			cur = cur->rb_right;
-		} else {
-			found = cur_res;
-			cur = cur->rb_left;
-			/* Continue to look for resources with lower offsets */
-		}
-	}
-
-	/*
-	 * In order of increasing backup_offset, clean dirty resorces
-	 * intersecting the range.
-	 */
-	while (found) {
-		if (found->res_dirty) {
-			int ret;
-
-			if (!found->func->clean)
-				return -EINVAL;
-
-			ret = found->func->clean(found);
-			if (ret)
-				return ret;
-
-			found->res_dirty = false;
-		}
-		last_cleaned = found->backup_offset + found->backup_size;
-		cur = rb_next(&found->mob_node);
-		if (!cur)
-			break;
-
-		found = container_of(cur, struct vmw_resource, mob_node);
-		if (found->backup_offset >= res_end)
-			break;
-	}
-
-	/*
-	 * Set number of pages allowed prefaulting and fence the buffer object
-	 */
-	*num_prefault = 1;
-	if (last_cleaned > res_start) {
-		struct ttm_buffer_object *bo = &vbo->base;
-
-		*num_prefault = __KERNEL_DIV_ROUND_UP(last_cleaned - res_start,
-						      PAGE_SIZE);
-		vmw_bo_fence_single(bo, NULL);
-		if (bo->moving)
-			dma_fence_put(bo->moving);
-		bo->moving = dma_fence_get
-			(reservation_object_get_excl(bo->resv));
-	}
-
-	return 0;
-}
diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_resource_priv.h b/drivers/gpu/drm/vmwgfx/vmwgfx_resource_priv.h
index 3b7438b2d289..7e19eba0b0b8 100644
--- a/drivers/gpu/drm/vmwgfx/vmwgfx_resource_priv.h
+++ b/drivers/gpu/drm/vmwgfx/vmwgfx_resource_priv.h
@@ -71,13 +71,6 @@ struct vmw_user_resource_conv {
  * @commit_notify:     If the resource is a command buffer managed resource,
  *                     callback to notify that a define or remove command
  *                     has been committed to the device.
- * @dirty_alloc:       Allocate a dirty tracker. NULL if dirty-tracking is not
- *                     supported.
- * @dirty_free:        Free the dirty tracker.
- * @dirty_sync:        Upload the dirty mob contents to the resource.
- * @dirty_add_range:   Add a sequential dirty range to the resource
- *                     dirty tracker.
- * @clean:             Clean the resource.
  */
 struct vmw_res_func {
 	enum vmw_res_type res_type;
@@ -85,8 +78,6 @@ struct vmw_res_func {
 	const char *type_name;
 	struct ttm_placement *backup_placement;
 	bool may_evict;
-	u32 prio;
-	u32 dirty_prio;
 
 	int (*create) (struct vmw_resource *res);
 	int (*destroy) (struct vmw_resource *res);
@@ -97,12 +88,6 @@ struct vmw_res_func {
 		       struct ttm_validate_buffer *val_buf);
 	void (*commit_notify)(struct vmw_resource *res,
 			      enum vmw_cmdbuf_res_state state);
-	int (*dirty_alloc)(struct vmw_resource *res);
-	void (*dirty_free)(struct vmw_resource *res);
-	int (*dirty_sync)(struct vmw_resource *res);
-	void (*dirty_range_add)(struct vmw_resource *res, size_t start,
-				 size_t end);
-	int (*clean)(struct vmw_resource *res);
 };
 
 /**
diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_shader.c b/drivers/gpu/drm/vmwgfx/vmwgfx_shader.c
index e139fdfd1635..d310d21f0d54 100644
--- a/drivers/gpu/drm/vmwgfx/vmwgfx_shader.c
+++ b/drivers/gpu/drm/vmwgfx/vmwgfx_shader.c
@@ -95,8 +95,6 @@ static const struct vmw_res_func vmw_gb_shader_func = {
 	.res_type = vmw_res_shader,
 	.needs_backup = true,
 	.may_evict = true,
-	.prio = 3,
-	.dirty_prio = 3,
 	.type_name = "guest backed shaders",
 	.backup_placement = &vmw_mob_placement,
 	.create = vmw_gb_shader_create,
@@ -108,9 +106,7 @@ static const struct vmw_res_func vmw_gb_shader_func = {
 static const struct vmw_res_func vmw_dx_shader_func = {
 	.res_type = vmw_res_shader,
 	.needs_backup = true,
-	.may_evict = true,
-	.prio = 3,
-	.dirty_prio = 3,
+	.may_evict = false,
 	.type_name = "dx shaders",
 	.backup_placement = &vmw_mob_placement,
 	.create = vmw_dx_shader_create,
@@ -427,7 +423,7 @@ static int vmw_dx_shader_create(struct vmw_resource *res)
 
 	WARN_ON_ONCE(!shader->committed);
 
-	if (vmw_resource_mob_attached(res)) {
+	if (!list_empty(&res->mob_head)) {
 		mutex_lock(&dev_priv->binding_mutex);
 		ret = vmw_dx_shader_unscrub(res);
 		mutex_unlock(&dev_priv->binding_mutex);
diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_surface.c b/drivers/gpu/drm/vmwgfx/vmwgfx_surface.c
index 862ca44680ca..219471903bc1 100644
--- a/drivers/gpu/drm/vmwgfx/vmwgfx_surface.c
+++ b/drivers/gpu/drm/vmwgfx/vmwgfx_surface.c
@@ -68,20 +68,6 @@ struct vmw_surface_offset {
 	uint32_t bo_offset;
 };
 
-/**
- * vmw_surface_dirty - Surface dirty-tracker
- * @cache: Cached layout information of the surface.
- * @size: Accounting size for the struct vmw_surface_dirty.
- * @num_subres: Number of subresources.
- * @boxes: Array of SVGA3dBoxes indicating dirty regions. One per subresource.
- */
-struct vmw_surface_dirty {
-	struct svga3dsurface_cache cache;
-	size_t size;
-	u32 num_subres;
-	SVGA3dBox boxes[0];
-};
-
 static void vmw_user_surface_free(struct vmw_resource *res);
 static struct vmw_resource *
 vmw_user_surface_base_to_res(struct ttm_base_object *base);
@@ -110,13 +96,6 @@ vmw_gb_surface_reference_internal(struct drm_device *dev,
 				  struct drm_vmw_gb_surface_ref_ext_rep *rep,
 				  struct drm_file *file_priv);
 
-static void vmw_surface_dirty_free(struct vmw_resource *res);
-static int vmw_surface_dirty_alloc(struct vmw_resource *res);
-static int vmw_surface_dirty_sync(struct vmw_resource *res);
-static void vmw_surface_dirty_range_add(struct vmw_resource *res, size_t start,
-					size_t end);
-static int vmw_surface_clean(struct vmw_resource *res);
-
 static const struct vmw_user_resource_conv user_surface_conv = {
 	.object_type = VMW_RES_SURFACE,
 	.base_obj_to_res = vmw_user_surface_base_to_res,
@@ -133,8 +112,6 @@ static const struct vmw_res_func vmw_legacy_surface_func = {
 	.res_type = vmw_res_surface,
 	.needs_backup = false,
 	.may_evict = true,
-	.prio = 1,
-	.dirty_prio = 1,
 	.type_name = "legacy surfaces",
 	.backup_placement = &vmw_srf_placement,
 	.create = &vmw_legacy_srf_create,
@@ -147,19 +124,12 @@ static const struct vmw_res_func vmw_gb_surface_func = {
 	.res_type = vmw_res_surface,
 	.needs_backup = true,
 	.may_evict = true,
-	.prio = 1,
-	.dirty_prio = 2,
 	.type_name = "guest backed surfaces",
 	.backup_placement = &vmw_mob_placement,
 	.create = vmw_gb_surface_create,
 	.destroy = vmw_gb_surface_destroy,
 	.bind = vmw_gb_surface_bind,
-	.unbind = vmw_gb_surface_unbind,
-	.dirty_alloc = vmw_surface_dirty_alloc,
-	.dirty_free = vmw_surface_dirty_free,
-	.dirty_sync = vmw_surface_dirty_sync,
-	.dirty_range_add = vmw_surface_dirty_range_add,
-	.clean = vmw_surface_clean,
+	.unbind = vmw_gb_surface_unbind
 };
 
 /**
@@ -667,7 +637,6 @@ static void vmw_user_surface_free(struct vmw_resource *res)
 	struct vmw_private *dev_priv = srf->res.dev_priv;
 	uint32_t size = user_srf->size;
 
-	WARN_ON_ONCE(res->dirty);
 	if (user_srf->master)
 		drm_master_put(&user_srf->master);
 	kfree(srf->offsets);
@@ -946,6 +915,12 @@ vmw_surface_handle_reference(struct vmw_private *dev_priv,
 		if (unlikely(drm_is_render_client(file_priv)))
 			require_exist = true;
 
+		if (READ_ONCE(vmw_fpriv(file_priv)->locked_master)) {
+			DRM_ERROR("Locked master refused legacy "
+				  "surface reference.\n");
+			return -EACCES;
+		}
+
 		handle = u_handle;
 	}
 
@@ -1195,16 +1170,10 @@ static int vmw_gb_surface_bind(struct vmw_resource *res,
 		cmd2->header.id = SVGA_3D_CMD_UPDATE_GB_SURFACE;
 		cmd2->header.size = sizeof(cmd2->body);
 		cmd2->body.sid = res->id;
+		res->backup_dirty = false;
 	}
 	vmw_fifo_commit(dev_priv, submit_size);
 
-	if (res->backup->dirty && res->backup_dirty) {
-		/* We've just made a full upload. Cear dirty regions. */
-		vmw_bo_dirty_clear_res(res);
-	}
-
-	res->backup_dirty = false;
-
 	return 0;
 }
 
@@ -1669,8 +1638,7 @@ vmw_gb_surface_define_internal(struct drm_device *dev,
 			}
 		}
 	} else if (req->base.drm_surface_flags &
-		   (drm_vmw_surface_flag_create_buffer |
-		    drm_vmw_surface_flag_coherent))
+		   drm_vmw_surface_flag_create_buffer)
 		ret = vmw_user_bo_alloc(dev_priv, tfile,
 					res->backup_size,
 					req->base.drm_surface_flags &
@@ -1684,26 +1652,6 @@ vmw_gb_surface_define_internal(struct drm_device *dev,
 		goto out_unlock;
 	}
 
-	if (req->base.drm_surface_flags & drm_vmw_surface_flag_coherent) {
-		struct vmw_buffer_object *backup = res->backup;
-
-		ttm_bo_reserve(&backup->base, false, false, NULL);
-		if (!res->func->dirty_alloc)
-			ret = -EINVAL;
-		if (!ret)
-			ret = vmw_bo_dirty_add(backup);
-		if (!ret) {
-			res->coherent = true;
-			ret = res->func->dirty_alloc(res);
-		}
-		ttm_bo_unreserve(&backup->base);
-		if (ret) {
-			vmw_resource_unreference(&res);
-			goto out_unlock;
-		}
-
-	}
-
 	tmp = vmw_resource_reference(res);
 	ret = ttm_prime_object_init(tfile, res->backup_size, &user_srf->prime,
 				    req->base.drm_surface_flags &
@@ -1812,338 +1760,3 @@ out_bad_resource:
 
 	return ret;
 }
-
-/**
- * vmw_subres_dirty_add - Add a dirty region to a subresource
- * @dirty: The surfaces's dirty tracker.
- * @loc_start: The location corresponding to the start of the region.
- * @loc_end: The location corresponding to the end of the region.
- *
- * As we are assuming that @loc_start and @loc_end represent a sequential
- * range of backing store memory, if the region spans multiple lines then
- * regardless of the x coordinate, the full lines are dirtied.
- * Correspondingly if the region spans multiple z slices, then full rather
- * than partial z slices are dirtied.
- */
-static void vmw_subres_dirty_add(struct vmw_surface_dirty *dirty,
-				 const struct svga3dsurface_loc *loc_start,
-				 const struct svga3dsurface_loc *loc_end)
-{
-	const struct svga3dsurface_cache *cache = &dirty->cache;
-	SVGA3dBox *box = &dirty->boxes[loc_start->sub_resource];
-	u32 mip = loc_start->sub_resource % cache->num_mip_levels;
-	const struct drm_vmw_size *size = &cache->mip[mip].size;
-	u32 box_c2 = box->z + box->d;
-
-	if (WARN_ON(loc_start->sub_resource >= dirty->num_subres))
-		return;
-
-	if (box->d == 0 || box->z > loc_start->z)
-		box->z = loc_start->z;
-	if (box_c2 < loc_end->z)
-		box->d = loc_end->z - box->z;
-
-	if (loc_start->z + 1 == loc_end->z) {
-		box_c2 = box->y + box->h;
-		if (box->h == 0 || box->y > loc_start->y)
-			box->y = loc_start->y;
-		if (box_c2 < loc_end->y)
-			box->h = loc_end->y - box->y;
-
-		if (loc_start->y + 1 == loc_end->y) {
-			box_c2 = box->x + box->w;
-			if (box->w == 0 || box->x > loc_start->x)
-				box->x = loc_start->x;
-			if (box_c2 < loc_end->x)
-				box->w = loc_end->x - box->x;
-		} else {
-			box->x = 0;
-			box->w = size->width;
-		}
-	} else {
-		box->y = 0;
-		box->h = size->height;
-		box->x = 0;
-		box->w = size->width;
-	}
-}
-
-/**
- * vmw_subres_dirty_full - Mark a full subresource as dirty
- * @dirty: The surface's dirty tracker.
- * @subres: The subresource
- */
-static void vmw_subres_dirty_full(struct vmw_surface_dirty *dirty, u32 subres)
-{
-	const struct svga3dsurface_cache *cache = &dirty->cache;
-	u32 mip = subres % cache->num_mip_levels;
-	const struct drm_vmw_size *size = &cache->mip[mip].size;
-	SVGA3dBox *box = &dirty->boxes[subres];
-
-	box->x = 0;
-	box->y = 0;
-	box->z = 0;
-	box->w = size->width;
-	box->h = size->height;
-	box->d = size->depth;
-}
-
-/*
- * vmw_surface_tex_dirty_add_range - The dirty_add_range callback for texture
- * surfaces.
- */
-static void vmw_surface_tex_dirty_range_add(struct vmw_resource *res,
-					    size_t start, size_t end)
-{
-	struct vmw_surface_dirty *dirty =
-		(struct vmw_surface_dirty *) res->dirty;
-	size_t backup_end = res->backup_offset + res->backup_size;
-	struct svga3dsurface_loc loc1, loc2;
-	const struct svga3dsurface_cache *cache;
-
-	start = max_t(size_t, start, res->backup_offset) - res->backup_offset;
-	end = min(end, backup_end) - res->backup_offset;
-	cache = &dirty->cache;
-	svga3dsurface_get_loc(cache, &loc1, start);
-	svga3dsurface_get_loc(cache, &loc2, end - 1);
-	svga3dsurface_inc_loc(cache, &loc2);
-
-	if (loc1.sub_resource + 1 == loc2.sub_resource) {
-		/* Dirty range covers a single sub-resource */
-		vmw_subres_dirty_add(dirty, &loc1, &loc2);
-	} else {
-		/* Dirty range covers multiple sub-resources */
-		struct svga3dsurface_loc loc_min, loc_max;
-		u32 sub_res = loc1.sub_resource;
-
-		svga3dsurface_max_loc(cache, loc1.sub_resource, &loc_max);
-		vmw_subres_dirty_add(dirty, &loc1, &loc_max);
-		svga3dsurface_min_loc(cache, loc2.sub_resource - 1, &loc_min);
-		vmw_subres_dirty_add(dirty, &loc_min, &loc2);
-		for (sub_res = loc1.sub_resource + 1;
-		     sub_res < loc2.sub_resource - 1; ++sub_res)
-			vmw_subres_dirty_full(dirty, sub_res);
-	}
-}
-
-/*
- * vmw_surface_tex_dirty_add_range - The dirty_add_range callback for buffer
- * surfaces.
- */
-static void vmw_surface_buf_dirty_range_add(struct vmw_resource *res,
-					    size_t start, size_t end)
-{
-	struct vmw_surface_dirty *dirty =
-		(struct vmw_surface_dirty *) res->dirty;
-	const struct svga3dsurface_cache *cache = &dirty->cache;
-	size_t backup_end = res->backup_offset + cache->mip_chain_bytes;
-	SVGA3dBox *box = &dirty->boxes[0];
-	u32 box_c2;
-
-	box->h = box->d = 1;
-	start = max_t(size_t, start, res->backup_offset) - res->backup_offset;
-	end = min(end, backup_end) - res->backup_offset;
-	box_c2 = box->x + box->w;
-	if (box->w == 0 || box->x > start)
-		box->x = start;
-	if (box_c2 < end)
-		box->w = end - box->x;
-}
-
-/*
- * vmw_surface_tex_dirty_add_range - The dirty_add_range callback for surfaces
- */
-static void vmw_surface_dirty_range_add(struct vmw_resource *res, size_t start,
-					size_t end)
-{
-	struct vmw_surface *srf = vmw_res_to_srf(res);
-
-	if (WARN_ON(end <= res->backup_offset ||
-		    start >= res->backup_offset + res->backup_size))
-		return;
-
-	if (srf->format == SVGA3D_BUFFER)
-		vmw_surface_buf_dirty_range_add(res, start, end);
-	else
-		vmw_surface_tex_dirty_range_add(res, start, end);
-}
-
-/*
- * vmw_surface_dirty_sync - The surface's dirty_sync callback.
- */
-static int vmw_surface_dirty_sync(struct vmw_resource *res)
-{
-	struct vmw_private *dev_priv = res->dev_priv;
-	bool has_dx = 0;
-	u32 i, num_dirty;
-	struct vmw_surface_dirty *dirty =
-		(struct vmw_surface_dirty *) res->dirty;
-	size_t alloc_size;
-	const struct svga3dsurface_cache *cache = &dirty->cache;
-	struct {
-		SVGA3dCmdHeader header;
-		SVGA3dCmdDXUpdateSubResource body;
-	} *cmd1;
-	struct {
-		SVGA3dCmdHeader header;
-		SVGA3dCmdUpdateGBImage body;
-	} *cmd2;
-	void *cmd;
-
-	num_dirty = 0;
-	for (i = 0; i < dirty->num_subres; ++i) {
-		const SVGA3dBox *box = &dirty->boxes[i];
-
-		if (box->d)
-			num_dirty++;
-	}
-
-	if (!num_dirty)
-		goto out;
-
-	alloc_size = num_dirty * ((has_dx) ? sizeof(*cmd1) : sizeof(*cmd2));
-	cmd = VMW_FIFO_RESERVE(dev_priv, alloc_size);
-	if (!cmd)
-		return -ENOMEM;
-
-	cmd1 = cmd;
-	cmd2 = cmd;
-
-	for (i = 0; i < dirty->num_subres; ++i) {
-		const SVGA3dBox *box = &dirty->boxes[i];
-
-		if (!box->d)
-			continue;
-
-		/*
-		 * DX_UPDATE_SUBRESOURCE is aware of array surfaces.
-		 * UPDATE_GB_IMAGE is not.
-		 */
-		if (has_dx) {
-			cmd1->header.id = SVGA_3D_CMD_DX_UPDATE_SUBRESOURCE;
-			cmd1->header.size = sizeof(cmd1->body);
-			cmd1->body.sid = res->id;
-			cmd1->body.subResource = i;
-			cmd1->body.box = *box;
-			cmd1++;
-		} else {
-			cmd2->header.id = SVGA_3D_CMD_UPDATE_GB_IMAGE;
-			cmd2->header.size = sizeof(cmd2->body);
-			cmd2->body.image.sid = res->id;
-			cmd2->body.image.face = i / cache->num_mip_levels;
-			cmd2->body.image.mipmap = i -
-				(cache->num_mip_levels * cmd2->body.image.face);
-			cmd2->body.box = *box;
-			cmd2++;
-		}
-
-	}
-	vmw_fifo_commit(dev_priv, alloc_size);
- out:
-	memset(&dirty->boxes[0], 0, sizeof(dirty->boxes[0]) *
-	       dirty->num_subres);
-
-	return 0;
-}
-
-/*
- * vmw_surface_dirty_alloc - The surface's dirty_alloc callback.
- */
-static int vmw_surface_dirty_alloc(struct vmw_resource *res)
-{
-	struct vmw_surface *srf = vmw_res_to_srf(res);
-	struct vmw_surface_dirty *dirty;
-	u32 num_layers = 1;
-	u32 num_mip;
-	u32 num_subres;
-	u32 num_samples;
-	size_t dirty_size, acc_size;
-	static struct ttm_operation_ctx ctx = {
-		.interruptible = false,
-		.no_wait_gpu = false
-	};
-	int ret;
-
-	if (srf->array_size)
-		num_layers = srf->array_size;
-	else if (srf->flags & SVGA3D_SURFACE_CUBEMAP)
-		num_layers *= SVGA3D_MAX_SURFACE_FACES;
-
-	num_mip = srf->mip_levels[0];
-	if (!num_mip)
-		num_mip = 1;
-
-	num_subres = num_layers * num_mip;
-	dirty_size = sizeof(*dirty) + num_subres * sizeof(dirty->boxes[0]);
-	acc_size = ttm_round_pot(dirty_size);
-	ret = ttm_mem_global_alloc(vmw_mem_glob(res->dev_priv),
-				   acc_size, &ctx);
-	if (ret) {
-		VMW_DEBUG_USER("Out of graphics memory for surface "
-			       "dirty tracker.\n");
-		return ret;
-	}
-
-	dirty = kvzalloc(dirty_size, GFP_KERNEL);
-	if (!dirty) {
-		ret = -ENOMEM;
-		goto out_no_dirty;
-	}
-
-	num_samples = max_t(u32, 1, srf->multisample_count);
-	ret = svga3dsurface_setup_cache(&srf->base_size, srf->format, num_mip,
-					num_layers, num_samples, &dirty->cache);
-	if (ret)
-		goto out_no_cache;
-
-	dirty->num_subres = num_subres;
-	dirty->size = acc_size;
-	res->dirty = (struct vmw_resource_dirty *) dirty;
-
-	return 0;
-
-out_no_cache:
-	kvfree(dirty);
-out_no_dirty:
-	ttm_mem_global_free(vmw_mem_glob(res->dev_priv), acc_size);
-	return ret;
-}
-
-/*
- * vmw_surface_dirty_free - The surface's dirty_free callback
- */
-static void vmw_surface_dirty_free(struct vmw_resource *res)
-{
-	struct vmw_surface_dirty *dirty =
-		(struct vmw_surface_dirty *) res->dirty;
-	size_t acc_size = dirty->size;
-
-	kvfree(dirty);
-	ttm_mem_global_free(vmw_mem_glob(res->dev_priv), acc_size);
-	res->dirty = NULL;
-}
-
-/*
- * vmw_surface_clean - The surface's clean callback
- */
-static int vmw_surface_clean(struct vmw_resource *res)
-{
-	struct vmw_private *dev_priv = res->dev_priv;
-	size_t alloc_size;
-	struct {
-		SVGA3dCmdHeader header;
-		SVGA3dCmdReadbackGBSurface body;
-	} *cmd;
-
-	alloc_size = sizeof(*cmd);
-	cmd = VMW_FIFO_RESERVE(dev_priv, alloc_size);
-	if (!cmd)
-		return -ENOMEM;
-
-	cmd->header.id = SVGA_3D_CMD_READBACK_GB_SURFACE;
-	cmd->header.size = sizeof(cmd->body);
-	cmd->body.sid = res->id;
-	vmw_fifo_commit(dev_priv, alloc_size);
-
-	return 0;
-}
diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_validation.c b/drivers/gpu/drm/vmwgfx/vmwgfx_validation.c
index 9aaf807ed73c..f611b2290a1b 100644
--- a/drivers/gpu/drm/vmwgfx/vmwgfx_validation.c
+++ b/drivers/gpu/drm/vmwgfx/vmwgfx_validation.c
@@ -33,8 +33,6 @@
  * struct vmw_validation_bo_node - Buffer object validation metadata.
  * @base: Metadata used for TTM reservation- and validation.
  * @hash: A hash entry used for the duplicate detection hash table.
- * @coherent_count: If switching backup buffers, number of new coherent
- * resources that will have this buffer as a backup buffer.
  * @as_mob: Validate as mob.
  * @cpu_blit: Validate for cpu blit access.
  *
@@ -44,7 +42,6 @@
 struct vmw_validation_bo_node {
 	struct ttm_validate_buffer base;
 	struct drm_hash_item hash;
-	unsigned int coherent_count;
 	u32 as_mob : 1;
 	u32 cpu_blit : 1;
 };
@@ -462,19 +459,6 @@ int vmw_validation_res_reserve(struct vmw_validation_context *ctx,
 			if (ret)
 				goto out_unreserve;
 		}
-
-		if (val->switching_backup && val->new_backup &&
-		    res->coherent) {
-			struct vmw_validation_bo_node *bo_node =
-				vmw_validation_find_bo_dup(ctx,
-							   val->new_backup);
-
-			if (WARN_ON(!bo_node)) {
-				ret = -EINVAL;
-				goto out_unreserve;
-			}
-			bo_node->coherent_count++;
-		}
 	}
 
 	return 0;
@@ -578,9 +562,6 @@ int vmw_validation_bo_validate(struct vmw_validation_context *ctx, bool intr)
 	int ret;
 
 	list_for_each_entry(entry, &ctx->bo_list, base.head) {
-		struct vmw_buffer_object *vbo =
-			container_of(entry->base.bo, typeof(*vbo), base);
-
 		if (entry->cpu_blit) {
 			struct ttm_operation_ctx ctx = {
 				.interruptible = intr,
@@ -595,27 +576,6 @@ int vmw_validation_bo_validate(struct vmw_validation_context *ctx, bool intr)
 		}
 		if (ret)
 			return ret;
-
-		/*
-		 * Rather than having the resource code allocating the bo
-		 * dirty tracker in resource_unreserve() where we can't fail,
-		 * Do it here when validating the buffer object.
-		 */
-		if (entry->coherent_count) {
-			unsigned int coherent_count = entry->coherent_count;
-
-			while (coherent_count) {
-				ret = vmw_bo_dirty_add(vbo);
-				if (ret)
-					return ret;
-
-				coherent_count--;
-			}
-			entry->coherent_count -= coherent_count;
-		}
-
-		if (vbo->dirty)
-			vmw_bo_dirty_scan(vbo);
 	}
 	return 0;
 }
@@ -641,8 +601,7 @@ int vmw_validation_res_validate(struct vmw_validation_context *ctx, bool intr)
 		struct vmw_resource *res = val->res;
 		struct vmw_buffer_object *backup = res->backup;
 
-		ret = vmw_resource_validate(res, intr, val->dirty_set &&
-					    val->dirty);
+		ret = vmw_resource_validate(res, intr);
 		if (ret) {
 			if (ret != -ERESTARTSYS)
 				DRM_ERROR("Failed to validate resource.\n");
@@ -869,34 +828,3 @@ int vmw_validation_preload_res(struct vmw_validation_context *ctx,
 	ctx->mem_size_left += size;
 	return 0;
 }
-
-/**
- * vmw_validation_bo_backoff - Unreserve buffer objects registered with a
- * validation context
- * @ctx: The validation context
- *
- * This function unreserves the buffer objects previously reserved using
- * vmw_validation_bo_reserve. It's typically used as part of an error path
- */
-void vmw_validation_bo_backoff(struct vmw_validation_context *ctx)
-{
-	struct vmw_validation_bo_node *entry;
-
-	/*
-	 * Switching coherent resource backup buffers failed.
-	 * Release corresponding buffer object dirty trackers.
-	 */
-	list_for_each_entry(entry, &ctx->bo_list, base.head) {
-		if (entry->coherent_count) {
-			unsigned int coherent_count = entry->coherent_count;
-			struct vmw_buffer_object *vbo =
-				container_of(entry->base.bo, typeof(*vbo),
-					     base);
-
-			while (coherent_count--)
-				vmw_bo_dirty_release(vbo);
-		}
-	}
-
-	ttm_eu_backoff_reservation(&ctx->ticket, &ctx->bo_list);
-}
diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_validation.h b/drivers/gpu/drm/vmwgfx/vmwgfx_validation.h
index fd83e017c2a5..1d2322ad6fd5 100644
--- a/drivers/gpu/drm/vmwgfx/vmwgfx_validation.h
+++ b/drivers/gpu/drm/vmwgfx/vmwgfx_validation.h
@@ -172,6 +172,20 @@ vmw_validation_bo_reserve(struct vmw_validation_context *ctx,
 				      NULL, true);
 }
 
+/**
+ * vmw_validation_bo_backoff - Unreserve buffer objects registered with a
+ * validation context
+ * @ctx: The validation context
+ *
+ * This function unreserves the buffer objects previously reserved using
+ * vmw_validation_bo_reserve. It's typically used as part of an error path
+ */
+static inline void
+vmw_validation_bo_backoff(struct vmw_validation_context *ctx)
+{
+	ttm_eu_backoff_reservation(&ctx->ticket, &ctx->bo_list);
+}
+
 /**
  * vmw_validation_bo_fence - Unreserve and fence buffer objects registered
  * with a validation context
@@ -254,6 +268,4 @@ int vmw_validation_preload_res(struct vmw_validation_context *ctx,
 			       unsigned int size);
 void vmw_validation_res_set_dirty(struct vmw_validation_context *ctx,
 				  void *val_private, u32 dirty);
-void vmw_validation_bo_backoff(struct vmw_validation_context *ctx);
-
 #endif
diff --git a/include/drm/ttm/ttm_bo_api.h b/include/drm/ttm/ttm_bo_api.h
index 435d02f719a8..49d9cdfc58f2 100644
--- a/include/drm/ttm/ttm_bo_api.h
+++ b/include/drm/ttm/ttm_bo_api.h
@@ -768,14 +768,4 @@ int ttm_bo_swapout(struct ttm_bo_global *glob,
 			struct ttm_operation_ctx *ctx);
 void ttm_bo_swapout_all(struct ttm_bo_device *bdev);
 int ttm_bo_wait_unreserved(struct ttm_buffer_object *bo);
-
-/* Default number of pre-faulted pages in the TTM fault handler */
-#define TTM_BO_VM_NUM_PREFAULT 16
-
-vm_fault_t ttm_bo_vm_reserve(struct ttm_buffer_object *bo,
-			     struct vm_fault *vmf);
-
-vm_fault_t ttm_bo_vm_fault_reserved(struct vm_fault *vmf,
-				    pgprot_t prot,
-				    pgoff_t num_prefault);
 #endif
diff --git a/include/drm/ttm/ttm_bo_driver.h b/include/drm/ttm/ttm_bo_driver.h
index a2d810a2504d..c9b8ba492f24 100644
--- a/include/drm/ttm/ttm_bo_driver.h
+++ b/include/drm/ttm/ttm_bo_driver.h
@@ -442,9 +442,6 @@ extern struct ttm_bo_global {
  * @driver: Pointer to a struct ttm_bo_driver struct setup by the driver.
  * @man: An array of mem_type_managers.
  * @vma_manager: Address space manager
- * @vm_ops: Pointer to the struct vm_operations_struct used for this
- * device's VM operations. The driver may override this before the first
- * mmap() call.
  * lru_lock: Spinlock that protects the buffer+device lru lists and
  * ddestroy lists.
  * @dev_mapping: A pointer to the struct address_space representing the
@@ -463,7 +460,6 @@ struct ttm_bo_device {
 	struct ttm_bo_global *glob;
 	struct ttm_bo_driver *driver;
 	struct ttm_mem_type_manager man[TTM_NUM_MEM_TYPES];
-	const struct vm_operations_struct *vm_ops;
 
 	/*
 	 * Protected by internal locks.
@@ -492,8 +488,6 @@ struct ttm_bo_device {
 	bool no_retry;
 };
 
-extern const struct vm_operations_struct ttm_bo_vm_ops;
-
 /**
  * struct ttm_lru_bulk_move_pos
  *
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 798cdda9560e..dd0b5f4e1e45 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2686,24 +2686,7 @@ typedef int (*pte_fn_t)(pte_t *pte, pgtable_t token, unsigned long addr,
 extern int apply_to_page_range(struct mm_struct *mm, unsigned long address,
 			       unsigned long size, pte_fn_t fn, void *data);
 
-struct pfn_range_apply;
-typedef int (*pter_fn_t)(pte_t *pte, pgtable_t token, unsigned long addr,
-			 struct pfn_range_apply *closure);
-struct pfn_range_apply {
-	struct mm_struct *mm;
-	pter_fn_t ptefn;
-	unsigned int alloc;
-};
-extern int apply_to_pfn_range(struct pfn_range_apply *closure,
-			      unsigned long address, unsigned long size);
-unsigned long apply_as_wrprotect(struct address_space *mapping,
-				 pgoff_t first_index, pgoff_t nr);
-unsigned long apply_as_clean(struct address_space *mapping,
-			     pgoff_t first_index, pgoff_t nr,
-			     pgoff_t bitmap_pgoff,
-			     unsigned long *bitmap,
-			     pgoff_t *start,
-			     pgoff_t *end);
+
 #ifdef CONFIG_PAGE_POISONING
 extern bool page_poisoning_enabled(void);
 extern void kernel_poison_pages(struct page *page, int numpages, int enable);
diff --git a/include/uapi/drm/vmwgfx_drm.h b/include/uapi/drm/vmwgfx_drm.h
index 02cab33f2f25..399f58317cff 100644
--- a/include/uapi/drm/vmwgfx_drm.h
+++ b/include/uapi/drm/vmwgfx_drm.h
@@ -891,13 +891,11 @@ struct drm_vmw_shader_arg {
  *                                      surface.
  * @drm_vmw_surface_flag_create_buffer: Create a backup buffer if none is
  *                                      given.
- * @drm_vmw_surface_flag_coherent:      Back surface with coherent memory.
  */
 enum drm_vmw_surface_flags {
 	drm_vmw_surface_flag_shareable = (1 << 0),
 	drm_vmw_surface_flag_scanout = (1 << 1),
-	drm_vmw_surface_flag_create_buffer = (1 << 2),
-	drm_vmw_surface_flag_coherent = (1 << 3),
+	drm_vmw_surface_flag_create_buffer = (1 << 2)
 };
 
 /**
diff --git a/mm/Kconfig b/mm/Kconfig
index 5006d0e6a5c7..f0c76ba47695 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -765,7 +765,4 @@ config GUP_BENCHMARK
 config ARCH_HAS_PTE_SPECIAL
 	bool
 
-config AS_DIRTY_HELPERS
-        bool
-
 endmenu
diff --git a/mm/Makefile b/mm/Makefile
index f5d412bbc2f7..ac5e5ba78874 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -104,4 +104,3 @@ obj-$(CONFIG_HARDENED_USERCOPY) += usercopy.o
 obj-$(CONFIG_PERCPU_STATS) += percpu-stats.o
 obj-$(CONFIG_HMM) += hmm.o
 obj-$(CONFIG_MEMFD_CREATE) += memfd.o
-obj-$(CONFIG_AS_DIRTY_HELPERS) += as_dirty_helpers.o
diff --git a/mm/as_dirty_helpers.c b/mm/as_dirty_helpers.c
deleted file mode 100644
index f600e31534fb..000000000000
--- a/mm/as_dirty_helpers.c
+++ /dev/null
@@ -1,300 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include <linux/mm.h>
-#include <linux/mm_types.h>
-#include <linux/hugetlb.h>
-#include <linux/bitops.h>
-#include <linux/mmu_notifier.h>
-#include <asm/cacheflush.h>
-#include <asm/tlbflush.h>
-
-/**
- * struct apply_as - Closure structure for apply_as_range
- * @base: struct pfn_range_apply we derive from
- * @start: Address of first modified pte
- * @end: Address of last modified pte + 1
- * @total: Total number of modified ptes
- * @vma: Pointer to the struct vm_area_struct we're currently operating on
- */
-struct apply_as {
-	struct pfn_range_apply base;
-	unsigned long start;
-	unsigned long end;
-	unsigned long total;
-	struct vm_area_struct *vma;
-};
-
-/**
- * apply_pt_wrprotect - Leaf pte callback to write-protect a pte
- * @pte: Pointer to the pte
- * @token: Page table token, see apply_to_pfn_range()
- * @addr: The virtual page address
- * @closure: Pointer to a struct pfn_range_apply embedded in a
- * struct apply_as
- *
- * The function write-protects a pte and records the range in
- * virtual address space of touched ptes for efficient range TLB flushes.
- *
- * Return: Always zero.
- */
-static int apply_pt_wrprotect(pte_t *pte, pgtable_t token,
-			      unsigned long addr,
-			      struct pfn_range_apply *closure)
-{
-	struct apply_as *aas = container_of(closure, typeof(*aas), base);
-	pte_t ptent = *pte;
-
-	if (pte_write(ptent)) {
-		pte_t old_pte = ptep_modify_prot_start(aas->vma, addr, pte);
-
-		ptent = pte_wrprotect(old_pte);
-		ptep_modify_prot_commit(aas->vma, addr, pte, old_pte, ptent);
-		aas->total++;
-		aas->start = min(aas->start, addr);
-		aas->end = max(aas->end, addr + PAGE_SIZE);
-	}
-
-	return 0;
-}
-
-/**
- * struct apply_as_clean - Closure structure for apply_as_clean
- * @base: struct apply_as we derive from
- * @bitmap_pgoff: Address_space Page offset of the first bit in @bitmap
- * @bitmap: Bitmap with one bit for each page offset in the address_space range
- * covered.
- * @start: Address_space page offset of first modified pte relative
- * to @bitmap_pgoff
- * @end: Address_space page offset of last modified pte relative
- * to @bitmap_pgoff
- */
-struct apply_as_clean {
-	struct apply_as base;
-	pgoff_t bitmap_pgoff;
-	unsigned long *bitmap;
-	pgoff_t start;
-	pgoff_t end;
-};
-
-/**
- * apply_pt_clean - Leaf pte callback to clean a pte
- * @pte: Pointer to the pte
- * @token: Page table token, see apply_to_pfn_range()
- * @addr: The virtual page address
- * @closure: Pointer to a struct pfn_range_apply embedded in a
- * struct apply_as_clean
- *
- * The function cleans a pte and records the range in
- * virtual address space of touched ptes for efficient TLB flushes.
- * It also records dirty ptes in a bitmap representing page offsets
- * in the address_space, as well as the first and last of the bits
- * touched.
- *
- * Return: Always zero.
- */
-static int apply_pt_clean(pte_t *pte, pgtable_t token,
-			  unsigned long addr,
-			  struct pfn_range_apply *closure)
-{
-	struct apply_as *aas = container_of(closure, typeof(*aas), base);
-	struct apply_as_clean *clean = container_of(aas, typeof(*clean), base);
-	pte_t ptent = *pte;
-
-	if (pte_dirty(ptent)) {
-		pgoff_t pgoff = ((addr - aas->vma->vm_start) >> PAGE_SHIFT) +
-			aas->vma->vm_pgoff - clean->bitmap_pgoff;
-		pte_t old_pte = ptep_modify_prot_start(aas->vma, addr, pte);
-
-		ptent = pte_mkclean(old_pte);
-		ptep_modify_prot_commit(aas->vma, addr, pte, old_pte, ptent);
-
-		aas->total++;
-		aas->start = min(aas->start, addr);
-		aas->end = max(aas->end, addr + PAGE_SIZE);
-
-		__set_bit(pgoff, clean->bitmap);
-		clean->start = min(clean->start, pgoff);
-		clean->end = max(clean->end, pgoff + 1);
-	}
-
-	return 0;
-}
-
-/**
- * apply_as_range - Apply a pte callback to all PTEs pointing into a range
- * of an address_space.
- * @mapping: Pointer to the struct address_space
- * @aas: Closure structure
- * @first_index: First page offset in the address_space
- * @nr: Number of incremental page offsets to cover
- *
- * Return: Number of ptes touched. Note that this number might be larger
- * than @nr if there are overlapping vmas
- */
-static unsigned long apply_as_range(struct address_space *mapping,
-				    struct apply_as *aas,
-				    pgoff_t first_index, pgoff_t nr)
-{
-	struct vm_area_struct *vma;
-	pgoff_t vba, vea, cba, cea;
-	unsigned long start_addr, end_addr;
-	struct mmu_notifier_range range;
-
-	i_mmap_lock_read(mapping);
-	vma_interval_tree_foreach(vma, &mapping->i_mmap, first_index,
-				  first_index + nr - 1) {
-		unsigned long vm_flags = READ_ONCE(vma->vm_flags);
-
-		/*
-		 * We can only do advisory flag tests below, since we can't
-		 * require the vm's mmap_sem to be held to protect the flags.
-		 * Therefore, callers that strictly depend on specific mmap
-		 * flags to remain constant throughout the operation must
-		 * either ensure those flags are immutable for all relevant
-		 * vmas or can't use this function. Fixing this properly would
-		 * require the vma::vm_flags to be protected by a separate
-		 * lock taken after the i_mmap_lock
-		 */
-
-		/* Skip non-applicable VMAs */
-		if ((vm_flags & (VM_SHARED | VM_WRITE)) !=
-		    (VM_SHARED | VM_WRITE))
-			continue;
-
-		/* Warn on and skip VMAs whose flags indicate illegal usage */
-		if (WARN_ON((vm_flags & (VM_HUGETLB | VM_IO)) != VM_IO))
-			continue;
-
-		/* Clip to the vma */
-		vba = vma->vm_pgoff;
-		vea = vba + vma_pages(vma);
-		cba = first_index;
-		cba = max(cba, vba);
-		cea = first_index + nr;
-		cea = min(cea, vea);
-
-		/* Translate to virtual address */
-		start_addr = ((cba - vba) << PAGE_SHIFT) + vma->vm_start;
-		end_addr = ((cea - vba) << PAGE_SHIFT) + vma->vm_start;
-		if (start_addr >= end_addr)
-			continue;
-
-		aas->base.mm = vma->vm_mm;
-		aas->vma = vma;
-		aas->start = end_addr;
-		aas->end = start_addr;
-
-		mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_PAGE, 0,
-					vma, vma->vm_mm, start_addr, end_addr);
-		mmu_notifier_invalidate_range_start(&range);
-
-		/* Needed when we only change protection? */
-		flush_cache_range(vma, start_addr, end_addr);
-
-		/*
-		 * We're not using tlb_gather_mmu() since typically
-		 * only a small subrange of PTEs are affected.
-		 */
-		inc_tlb_flush_pending(vma->vm_mm);
-
-		/* Should not error since aas->base.alloc == 0 */
-		WARN_ON(apply_to_pfn_range(&aas->base, start_addr,
-					   end_addr - start_addr));
-		if (aas->end > aas->start)
-			flush_tlb_range(vma, aas->start, aas->end);
-
-		mmu_notifier_invalidate_range_end(&range);
-		dec_tlb_flush_pending(vma->vm_mm);
-	}
-	i_mmap_unlock_read(mapping);
-
-	return aas->total;
-}
-
-/**
- * apply_as_wrprotect - Write-protect all ptes in an address_space range
- * @mapping: The address_space we want to write protect
- * @first_index: The first page offset in the range
- * @nr: Number of incremental page offsets to cover
- *
- * WARNING: This function should only be used for address spaces whose
- * vmas are marked VM_IO and that do not contain huge pages.
- * To avoid interference with COW'd pages, vmas not marked VM_SHARED are
- * simply skipped.
- *
- * Return: The number of ptes actually write-protected. Note that
- * already write-protected ptes are not counted.
- */
-unsigned long apply_as_wrprotect(struct address_space *mapping,
-				 pgoff_t first_index, pgoff_t nr)
-{
-	struct apply_as aas = {
-		.base = {
-			.alloc = 0,
-			.ptefn = apply_pt_wrprotect,
-		},
-		.total = 0,
-	};
-
-	return apply_as_range(mapping, &aas, first_index, nr);
-}
-EXPORT_SYMBOL_GPL(apply_as_wrprotect);
-
-/**
- * apply_as_clean - Clean all ptes in an address_space range
- * @mapping: The address_space we want to clean
- * @first_index: The first page offset in the range
- * @nr: Number of incremental page offsets to cover
- * @bitmap_pgoff: The page offset of the first bit in @bitmap
- * @bitmap: Pointer to a bitmap of at least @nr bits. The bitmap needs to
- * cover the whole range @first_index..@first_index + @nr.
- * @start: Pointer to number of the first set bit in @bitmap.
- * is modified as new bits are set by the function.
- * @end: Pointer to the number of the last set bit in @bitmap.
- * none set. The value is modified as new bits are set by the function.
- *
- * Note: When this function returns there is no guarantee that a CPU has
- * not already dirtied new ptes. However it will not clean any ptes not
- * reported in the bitmap.
- *
- * If a caller needs to make sure all dirty ptes are picked up and none
- * additional are added, it first needs to write-protect the address-space
- * range and make sure new writers are blocked in page_mkwrite() or
- * pfn_mkwrite(). And then after a TLB flush following the write-protection
- * pick up all dirty bits.
- *
- * WARNING: This function should only be used for address spaces whose
- * vmas are marked VM_IO and that do not contain huge pages.
- * To avoid interference with COW'd pages, vmas not marked VM_SHARED are
- * simply skipped.
- *
- * Return: The number of dirty ptes actually cleaned.
- */
-unsigned long apply_as_clean(struct address_space *mapping,
-			     pgoff_t first_index, pgoff_t nr,
-			     pgoff_t bitmap_pgoff,
-			     unsigned long *bitmap,
-			     pgoff_t *start,
-			     pgoff_t *end)
-{
-	bool none_set = (*start >= *end);
-	struct apply_as_clean clean = {
-		.base = {
-			.base = {
-				.alloc = 0,
-				.ptefn = apply_pt_clean,
-			},
-			.total = 0,
-		},
-		.bitmap_pgoff = bitmap_pgoff,
-		.bitmap = bitmap,
-		.start = none_set ? nr : *start,
-		.end = none_set ? 0 : *end,
-	};
-	unsigned long ret = apply_as_range(mapping, &clean.base, first_index,
-					   nr);
-
-	*start = clean.start;
-	*end = clean.end;
-	return ret;
-}
-EXPORT_SYMBOL_GPL(apply_as_clean);
diff --git a/mm/memory.c b/mm/memory.c
index 462aa47f8878..ddf20bd0c317 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2032,17 +2032,18 @@ int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long
 }
 EXPORT_SYMBOL(vm_iomap_memory);
 
-static int apply_to_pte_range(struct pfn_range_apply *closure, pmd_t *pmd,
-			      unsigned long addr, unsigned long end)
+static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
+				     unsigned long addr, unsigned long end,
+				     pte_fn_t fn, void *data)
 {
 	pte_t *pte;
 	int err;
 	pgtable_t token;
 	spinlock_t *uninitialized_var(ptl);
 
-	pte = (closure->mm == &init_mm) ?
+	pte = (mm == &init_mm) ?
 		pte_alloc_kernel(pmd, addr) :
-		pte_alloc_map_lock(closure->mm, pmd, addr, &ptl);
+		pte_alloc_map_lock(mm, pmd, addr, &ptl);
 	if (!pte)
 		return -ENOMEM;
 
@@ -2053,109 +2054,86 @@ static int apply_to_pte_range(struct pfn_range_apply *closure, pmd_t *pmd,
 	token = pmd_pgtable(*pmd);
 
 	do {
-		err = closure->ptefn(pte++, token, addr, closure);
+		err = fn(pte++, token, addr, data);
 		if (err)
 			break;
 	} while (addr += PAGE_SIZE, addr != end);
 
 	arch_leave_lazy_mmu_mode();
 
-	if (closure->mm != &init_mm)
+	if (mm != &init_mm)
 		pte_unmap_unlock(pte-1, ptl);
 	return err;
 }
 
-static int apply_to_pmd_range(struct pfn_range_apply *closure, pud_t *pud,
-			      unsigned long addr, unsigned long end)
+static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud,
+				     unsigned long addr, unsigned long end,
+				     pte_fn_t fn, void *data)
 {
 	pmd_t *pmd;
 	unsigned long next;
-	int err = 0;
+	int err;
 
 	BUG_ON(pud_huge(*pud));
 
-	pmd = pmd_alloc(closure->mm, pud, addr);
+	pmd = pmd_alloc(mm, pud, addr);
 	if (!pmd)
 		return -ENOMEM;
-
 	do {
 		next = pmd_addr_end(addr, end);
-		if (!closure->alloc && pmd_none_or_clear_bad(pmd))
-			continue;
-		err = apply_to_pte_range(closure, pmd, addr, next);
+		err = apply_to_pte_range(mm, pmd, addr, next, fn, data);
 		if (err)
 			break;
 	} while (pmd++, addr = next, addr != end);
 	return err;
 }
 
-static int apply_to_pud_range(struct pfn_range_apply *closure, p4d_t *p4d,
-			      unsigned long addr, unsigned long end)
+static int apply_to_pud_range(struct mm_struct *mm, p4d_t *p4d,
+				     unsigned long addr, unsigned long end,
+				     pte_fn_t fn, void *data)
 {
 	pud_t *pud;
 	unsigned long next;
-	int err = 0;
+	int err;
 
-	pud = pud_alloc(closure->mm, p4d, addr);
+	pud = pud_alloc(mm, p4d, addr);
 	if (!pud)
 		return -ENOMEM;
-
 	do {
 		next = pud_addr_end(addr, end);
-		if (!closure->alloc && pud_none_or_clear_bad(pud))
-			continue;
-		err = apply_to_pmd_range(closure, pud, addr, next);
+		err = apply_to_pmd_range(mm, pud, addr, next, fn, data);
 		if (err)
 			break;
 	} while (pud++, addr = next, addr != end);
 	return err;
 }
 
-static int apply_to_p4d_range(struct pfn_range_apply *closure, pgd_t *pgd,
-			      unsigned long addr, unsigned long end)
+static int apply_to_p4d_range(struct mm_struct *mm, pgd_t *pgd,
+				     unsigned long addr, unsigned long end,
+				     pte_fn_t fn, void *data)
 {
 	p4d_t *p4d;
 	unsigned long next;
-	int err = 0;
+	int err;
 
-	p4d = p4d_alloc(closure->mm, pgd, addr);
+	p4d = p4d_alloc(mm, pgd, addr);
 	if (!p4d)
 		return -ENOMEM;
-
 	do {
 		next = p4d_addr_end(addr, end);
-		if (!closure->alloc && p4d_none_or_clear_bad(p4d))
-			continue;
-		err = apply_to_pud_range(closure, p4d, addr, next);
+		err = apply_to_pud_range(mm, p4d, addr, next, fn, data);
 		if (err)
 			break;
 	} while (p4d++, addr = next, addr != end);
 	return err;
 }
 
-/**
- * apply_to_pfn_range - Scan a region of virtual memory, calling a provided
- * function on each leaf page table entry
- * @closure: Details about how to scan and what function to apply
- * @addr: Start virtual address
- * @size: Size of the region
- *
- * If @closure->alloc is set to 1, the function will fill in the page table
- * as necessary. Otherwise it will skip non-present parts.
- * Note: The caller must ensure that the range does not contain huge pages.
- * The caller must also assure that the proper mmu_notifier functions are
- * called before and after the call to apply_to_pfn_range.
- *
- * WARNING: Do not use this function unless you know exactly what you are
- * doing. It is lacking support for huge pages and transparent huge pages.
- *
- * Return: Zero on success. If the provided function returns a non-zero status,
- * the page table walk will terminate and that status will be returned.
- * If @closure->alloc is set to 1, then this function may also return memory
- * allocation errors arising from allocating page table memory.
+/*
+ * Scan a region of virtual memory, filling in page tables as necessary
+ * and calling a provided function on each leaf page table.
  */
-int apply_to_pfn_range(struct pfn_range_apply *closure,
-		       unsigned long addr, unsigned long size)
+int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
+			unsigned long size, pte_fn_t fn, void *data)
 {
 	pgd_t *pgd;
 	unsigned long next;
@@ -2165,65 +2143,16 @@ int apply_to_pfn_range(struct pfn_range_apply *closure,
 	if (WARN_ON(addr >= end))
 		return -EINVAL;
 
-	pgd = pgd_offset(closure->mm, addr);
+	pgd = pgd_offset(mm, addr);
 	do {
 		next = pgd_addr_end(addr, end);
-		if (!closure->alloc && pgd_none_or_clear_bad(pgd))
-			continue;
-		err = apply_to_p4d_range(closure, pgd, addr, next);
+		err = apply_to_p4d_range(mm, pgd, addr, next, fn, data);
 		if (err)
 			break;
 	} while (pgd++, addr = next, addr != end);
 
 	return err;
 }
-
-/**
- * struct page_range_apply - Closure structure for apply_to_page_range()
- * @pter: The base closure structure we derive from
- * @fn: The leaf pte function to call
- * @data: The leaf pte function closure
- */
-struct page_range_apply {
-	struct pfn_range_apply pter;
-	pte_fn_t fn;
-	void *data;
-};
-
-/*
- * Callback wrapper to enable use of apply_to_pfn_range for
- * the apply_to_page_range interface
- */
-static int apply_to_page_range_wrapper(pte_t *pte, pgtable_t token,
-				       unsigned long addr,
-				       struct pfn_range_apply *pter)
-{
-	struct page_range_apply *pra =
-		container_of(pter, typeof(*pra), pter);
-
-	return pra->fn(pte, token, addr, pra->data);
-}
-
-/*
- * Scan a region of virtual memory, filling in page tables as necessary
- * and calling a provided function on each leaf page table.
- *
- * WARNING: Do not use this function unless you know exactly what you are
- * doing. It is lacking support for huge pages and transparent huge pages.
- */
-int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
-			unsigned long size, pte_fn_t fn, void *data)
-{
-	struct page_range_apply pra = {
-		.pter = {.mm = mm,
-			 .alloc = 1,
-			 .ptefn = apply_to_page_range_wrapper },
-		.fn = fn,
-		.data = data
-	};
-
-	return apply_to_pfn_range(&pra.pter, addr, size);
-}
 EXPORT_SYMBOL_GPL(apply_to_page_range);
 
 /*
@@ -2309,7 +2238,7 @@ static vm_fault_t do_page_mkwrite(struct vm_fault *vmf)
 	ret = vmf->vma->vm_ops->page_mkwrite(vmf);
 	/* Restore original flags so that caller is not surprised */
 	vmf->flags = old_flags;
-	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
+	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))
 		return ret;
 	if (unlikely(!(ret & VM_FAULT_LOCKED))) {
 		lock_page(page);
@@ -2586,7 +2515,7 @@ static vm_fault_t wp_pfn_shared(struct vm_fault *vmf)
 		pte_unmap_unlock(vmf->pte, vmf->ptl);
 		vmf->flags |= FAULT_FLAG_MKWRITE;
 		ret = vma->vm_ops->pfn_mkwrite(vmf);
-		if (ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))
+		if (ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))
 			return ret;
 		return finish_mkwrite_fault(vmf);
 	}
@@ -2607,8 +2536,7 @@ static vm_fault_t wp_page_shared(struct vm_fault *vmf)
 		pte_unmap_unlock(vmf->pte, vmf->ptl);
 		tmp = do_page_mkwrite(vmf);
 		if (unlikely(!tmp || (tmp &
-				      (VM_FAULT_ERROR | VM_FAULT_NOPAGE |
-				       VM_FAULT_RETRY)))) {
+				      (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
 			put_page(vmf->page);
 			return tmp;
 		}
@@ -3673,8 +3601,7 @@ static vm_fault_t do_shared_fault(struct vm_fault *vmf)
 		unlock_page(vmf->page);
 		tmp = do_page_mkwrite(vmf);
 		if (unlikely(!tmp ||
-				(tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE |
-					VM_FAULT_RETRY)))) {
+				(tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
 			put_page(vmf->page);
 			return tmp;
 		}
-- 
cgit v1.2.3


From b43995469e5804636a55372e9bbb17ccb22441c5 Mon Sep 17 00:00:00 2001
From: Stanislav Fomichev <sdf@google.com>
Date: Mon, 15 Jul 2019 09:39:52 -0700
Subject: bpf: rename bpf_ctx_wide_store_ok to bpf_ctx_wide_access_ok

Rename bpf_ctx_wide_store_ok to bpf_ctx_wide_access_ok to indicate
that it can be used for both loads and stores.

Cc: Yonghong Song <yhs@fb.com>
Signed-off-by: Stanislav Fomichev <sdf@google.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/linux/filter.h |  2 +-
 net/core/filter.c      | 12 ++++++------
 2 files changed, 7 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/filter.h b/include/linux/filter.h
index 6d944369ca87..ff65d22cf336 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -747,7 +747,7 @@ bpf_ctx_narrow_access_ok(u32 off, u32 size, u32 size_default)
 	return size <= size_default && (size & (size - 1)) == 0;
 }
 
-#define bpf_ctx_wide_store_ok(off, size, type, field)			\
+#define bpf_ctx_wide_access_ok(off, size, type, field)			\
 	(size == sizeof(__u64) &&					\
 	off >= offsetof(type, field) &&					\
 	off + sizeof(__u64) <= offsetofend(type, field) &&		\
diff --git a/net/core/filter.c b/net/core/filter.c
index 47f6386fb17a..c5983ddb1a9f 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -6890,14 +6890,14 @@ static bool sock_addr_is_valid_access(int off, int size,
 			if (!bpf_ctx_narrow_access_ok(off, size, size_default))
 				return false;
 		} else {
-			if (bpf_ctx_wide_store_ok(off, size,
-						  struct bpf_sock_addr,
-						  user_ip6))
+			if (bpf_ctx_wide_access_ok(off, size,
+						   struct bpf_sock_addr,
+						   user_ip6))
 				return true;
 
-			if (bpf_ctx_wide_store_ok(off, size,
-						  struct bpf_sock_addr,
-						  msg_src_ip6))
+			if (bpf_ctx_wide_access_ok(off, size,
+						   struct bpf_sock_addr,
+						   msg_src_ip6))
 				return true;
 
 			if (size != size_default)
-- 
cgit v1.2.3


From c4dcc8a162784c1f827c7f6d8409598f19708fe6 Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Tue, 16 Jul 2019 09:36:08 +0530
Subject: cpufreq: Make cpufreq_generic_init() return void

It always returns 0 (success) and its return type should really be void.

On top of that, many drivers have added error handling code based on
its return value, which is not required at all.

Change its return type to void and update all the callers.

Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/bmips-cpufreq.c     | 17 ++++++-----------
 drivers/cpufreq/cpufreq.c           |  4 +---
 drivers/cpufreq/davinci-cpufreq.c   |  3 ++-
 drivers/cpufreq/imx6q-cpufreq.c     |  6 ++----
 drivers/cpufreq/kirkwood-cpufreq.c  |  3 ++-
 drivers/cpufreq/loongson1-cpufreq.c |  8 +++-----
 drivers/cpufreq/loongson2_cpufreq.c |  3 ++-
 drivers/cpufreq/maple-cpufreq.c     |  3 ++-
 drivers/cpufreq/omap-cpufreq.c      | 15 +++++----------
 drivers/cpufreq/pasemi-cpufreq.c    |  3 ++-
 drivers/cpufreq/pmac32-cpufreq.c    |  3 ++-
 drivers/cpufreq/pmac64-cpufreq.c    |  3 ++-
 drivers/cpufreq/s3c2416-cpufreq.c   |  9 ++-------
 drivers/cpufreq/s3c64xx-cpufreq.c   | 15 +++------------
 drivers/cpufreq/s5pv210-cpufreq.c   |  3 ++-
 drivers/cpufreq/sa1100-cpufreq.c    |  3 ++-
 drivers/cpufreq/sa1110-cpufreq.c    |  3 ++-
 drivers/cpufreq/spear-cpufreq.c     |  3 ++-
 drivers/cpufreq/tegra20-cpufreq.c   |  8 +-------
 include/linux/cpufreq.h             |  2 +-
 20 files changed, 46 insertions(+), 71 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/cpufreq/bmips-cpufreq.c b/drivers/cpufreq/bmips-cpufreq.c
index 56a4ebbf00e0..f7c23fa468f0 100644
--- a/drivers/cpufreq/bmips-cpufreq.c
+++ b/drivers/cpufreq/bmips-cpufreq.c
@@ -131,23 +131,18 @@ static int bmips_cpufreq_exit(struct cpufreq_policy *policy)
 static int bmips_cpufreq_init(struct cpufreq_policy *policy)
 {
 	struct cpufreq_frequency_table *freq_table;
-	int ret;
 
 	freq_table = bmips_cpufreq_get_freq_table(policy);
 	if (IS_ERR(freq_table)) {
-		ret = PTR_ERR(freq_table);
-		pr_err("%s: couldn't determine frequency table (%d).\n",
-			BMIPS_CPUFREQ_NAME, ret);
-		return ret;
+		pr_err("%s: couldn't determine frequency table (%ld).\n",
+			BMIPS_CPUFREQ_NAME, PTR_ERR(freq_table));
+		return PTR_ERR(freq_table);
 	}
 
-	ret = cpufreq_generic_init(policy, freq_table, TRANSITION_LATENCY);
-	if (ret)
-		bmips_cpufreq_exit(policy);
-	else
-		pr_info("%s: registered\n", BMIPS_CPUFREQ_NAME);
+	cpufreq_generic_init(policy, freq_table, TRANSITION_LATENCY);
+	pr_info("%s: registered\n", BMIPS_CPUFREQ_NAME);
 
-	return ret;
+	return 0;
 }
 
 static struct cpufreq_driver bmips_cpufreq_driver = {
diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index 99aa7d20b458..efab334d6ab2 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -162,7 +162,7 @@ EXPORT_SYMBOL_GPL(arch_set_freq_scale);
  * - set policies transition latency
  * - policy->cpus with all possible CPUs
  */
-int cpufreq_generic_init(struct cpufreq_policy *policy,
+void cpufreq_generic_init(struct cpufreq_policy *policy,
 		struct cpufreq_frequency_table *table,
 		unsigned int transition_latency)
 {
@@ -174,8 +174,6 @@ int cpufreq_generic_init(struct cpufreq_policy *policy,
 	 * share the clock and voltage and clock.
 	 */
 	cpumask_setall(policy->cpus);
-
-	return 0;
 }
 EXPORT_SYMBOL_GPL(cpufreq_generic_init);
 
diff --git a/drivers/cpufreq/davinci-cpufreq.c b/drivers/cpufreq/davinci-cpufreq.c
index 940fe85db97a..664fa4ab9d1c 100644
--- a/drivers/cpufreq/davinci-cpufreq.c
+++ b/drivers/cpufreq/davinci-cpufreq.c
@@ -93,7 +93,8 @@ static int davinci_cpu_init(struct cpufreq_policy *policy)
 	 * Setting the latency to 2000 us to accommodate addition of drivers
 	 * to pre/post change notification list.
 	 */
-	return cpufreq_generic_init(policy, freq_table, 2000 * 1000);
+	cpufreq_generic_init(policy, freq_table, 2000 * 1000);
+	return 0;
 }
 
 static struct cpufreq_driver davinci_driver = {
diff --git a/drivers/cpufreq/imx6q-cpufreq.c b/drivers/cpufreq/imx6q-cpufreq.c
index 3e17560b1efe..91ea95c97bb2 100644
--- a/drivers/cpufreq/imx6q-cpufreq.c
+++ b/drivers/cpufreq/imx6q-cpufreq.c
@@ -193,14 +193,12 @@ static int imx6q_set_target(struct cpufreq_policy *policy, unsigned int index)
 
 static int imx6q_cpufreq_init(struct cpufreq_policy *policy)
 {
-	int ret;
-
 	policy->clk = clks[ARM].clk;
-	ret = cpufreq_generic_init(policy, freq_table, transition_latency);
+	cpufreq_generic_init(policy, freq_table, transition_latency);
 	policy->suspend_freq = max_freq;
 	dev_pm_opp_of_register_em(policy->cpus);
 
-	return ret;
+	return 0;
 }
 
 static struct cpufreq_driver imx6q_cpufreq_driver = {
diff --git a/drivers/cpufreq/kirkwood-cpufreq.c b/drivers/cpufreq/kirkwood-cpufreq.c
index 7ab564c1f7ae..cb74bdc5baaa 100644
--- a/drivers/cpufreq/kirkwood-cpufreq.c
+++ b/drivers/cpufreq/kirkwood-cpufreq.c
@@ -85,7 +85,8 @@ static int kirkwood_cpufreq_target(struct cpufreq_policy *policy,
 /* Module init and exit code */
 static int kirkwood_cpufreq_cpu_init(struct cpufreq_policy *policy)
 {
-	return cpufreq_generic_init(policy, kirkwood_freq_table, 5000);
+	cpufreq_generic_init(policy, kirkwood_freq_table, 5000);
+	return 0;
 }
 
 static struct cpufreq_driver kirkwood_cpufreq_driver = {
diff --git a/drivers/cpufreq/loongson1-cpufreq.c b/drivers/cpufreq/loongson1-cpufreq.c
index 21c9ce8526c0..0ea88778882a 100644
--- a/drivers/cpufreq/loongson1-cpufreq.c
+++ b/drivers/cpufreq/loongson1-cpufreq.c
@@ -81,7 +81,7 @@ static int ls1x_cpufreq_init(struct cpufreq_policy *policy)
 	struct device *cpu_dev = get_cpu_device(policy->cpu);
 	struct cpufreq_frequency_table *freq_tbl;
 	unsigned int pll_freq, freq;
-	int steps, i, ret;
+	int steps, i;
 
 	pll_freq = clk_get_rate(cpufreq->pll_clk) / 1000;
 
@@ -103,11 +103,9 @@ static int ls1x_cpufreq_init(struct cpufreq_policy *policy)
 	freq_tbl[i].frequency = CPUFREQ_TABLE_END;
 
 	policy->clk = cpufreq->clk;
-	ret = cpufreq_generic_init(policy, freq_tbl, 0);
-	if (ret)
-		kfree(freq_tbl);
+	cpufreq_generic_init(policy, freq_tbl, 0);
 
-	return ret;
+	return 0;
 }
 
 static int ls1x_cpufreq_exit(struct cpufreq_policy *policy)
diff --git a/drivers/cpufreq/loongson2_cpufreq.c b/drivers/cpufreq/loongson2_cpufreq.c
index da344696beed..890813e0bb76 100644
--- a/drivers/cpufreq/loongson2_cpufreq.c
+++ b/drivers/cpufreq/loongson2_cpufreq.c
@@ -95,7 +95,8 @@ static int loongson2_cpufreq_cpu_init(struct cpufreq_policy *policy)
 	}
 
 	policy->clk = cpuclk;
-	return cpufreq_generic_init(policy, &loongson2_clockmod_table[0], 0);
+	cpufreq_generic_init(policy, &loongson2_clockmod_table[0], 0);
+	return 0;
 }
 
 static int loongson2_cpufreq_exit(struct cpufreq_policy *policy)
diff --git a/drivers/cpufreq/maple-cpufreq.c b/drivers/cpufreq/maple-cpufreq.c
index a94355723ef8..a03cd3ad170f 100644
--- a/drivers/cpufreq/maple-cpufreq.c
+++ b/drivers/cpufreq/maple-cpufreq.c
@@ -143,7 +143,8 @@ static unsigned int maple_cpufreq_get_speed(unsigned int cpu)
 
 static int maple_cpufreq_cpu_init(struct cpufreq_policy *policy)
 {
-	return cpufreq_generic_init(policy, maple_cpu_freqs, 12000);
+	cpufreq_generic_init(policy, maple_cpu_freqs, 12000);
+	return 0;
 }
 
 static struct cpufreq_driver maple_cpufreq_driver = {
diff --git a/drivers/cpufreq/omap-cpufreq.c b/drivers/cpufreq/omap-cpufreq.c
index 68052b74d28f..edda20119cfd 100644
--- a/drivers/cpufreq/omap-cpufreq.c
+++ b/drivers/cpufreq/omap-cpufreq.c
@@ -125,23 +125,18 @@ static int omap_cpu_init(struct cpufreq_policy *policy)
 			dev_err(mpu_dev,
 				"%s: cpu%d: failed creating freq table[%d]\n",
 				__func__, policy->cpu, result);
-			goto fail;
+			clk_put(policy->clk);
+			return result;
 		}
 	}
 
 	atomic_inc_return(&freq_table_users);
 
 	/* FIXME: what's the actual transition time? */
-	result = cpufreq_generic_init(policy, freq_table, 300 * 1000);
-	if (!result) {
-		dev_pm_opp_of_register_em(policy->cpus);
-		return 0;
-	}
+	cpufreq_generic_init(policy, freq_table, 300 * 1000);
+	dev_pm_opp_of_register_em(policy->cpus);
 
-	freq_table_free();
-fail:
-	clk_put(policy->clk);
-	return result;
+	return 0;
 }
 
 static int omap_cpu_exit(struct cpufreq_policy *policy)
diff --git a/drivers/cpufreq/pasemi-cpufreq.c b/drivers/cpufreq/pasemi-cpufreq.c
index 6b1e4abe3248..93f39a1d4c3d 100644
--- a/drivers/cpufreq/pasemi-cpufreq.c
+++ b/drivers/cpufreq/pasemi-cpufreq.c
@@ -196,7 +196,8 @@ static int pas_cpufreq_cpu_init(struct cpufreq_policy *policy)
 	policy->cur = pas_freqs[cur_astate].frequency;
 	ppc_proc_freq = policy->cur * 1000ul;
 
-	return cpufreq_generic_init(policy, pas_freqs, get_gizmo_latency());
+	cpufreq_generic_init(policy, pas_freqs, get_gizmo_latency());
+	return 0;
 
 out_unmap_sdcpwr:
 	iounmap(sdcpwr_mapbase);
diff --git a/drivers/cpufreq/pmac32-cpufreq.c b/drivers/cpufreq/pmac32-cpufreq.c
index 9b4ce2eb8222..bc7fc930294e 100644
--- a/drivers/cpufreq/pmac32-cpufreq.c
+++ b/drivers/cpufreq/pmac32-cpufreq.c
@@ -376,7 +376,8 @@ static int pmac_cpufreq_target(	struct cpufreq_policy *policy,
 
 static int pmac_cpufreq_cpu_init(struct cpufreq_policy *policy)
 {
-	return cpufreq_generic_init(policy, pmac_cpu_freqs, transition_latency);
+	cpufreq_generic_init(policy, pmac_cpu_freqs, transition_latency);
+	return 0;
 }
 
 static u32 read_gpio(struct device_node *np)
diff --git a/drivers/cpufreq/pmac64-cpufreq.c b/drivers/cpufreq/pmac64-cpufreq.c
index 1d32a863332d..045881494cc9 100644
--- a/drivers/cpufreq/pmac64-cpufreq.c
+++ b/drivers/cpufreq/pmac64-cpufreq.c
@@ -324,7 +324,8 @@ static unsigned int g5_cpufreq_get_speed(unsigned int cpu)
 
 static int g5_cpufreq_cpu_init(struct cpufreq_policy *policy)
 {
-	return cpufreq_generic_init(policy, g5_cpu_freqs, transition_latency);
+	cpufreq_generic_init(policy, g5_cpu_freqs, transition_latency);
+	return 0;
 }
 
 static struct cpufreq_driver g5_cpufreq_driver = {
diff --git a/drivers/cpufreq/s3c2416-cpufreq.c b/drivers/cpufreq/s3c2416-cpufreq.c
index 5b2db3c6568f..124a4c68c5ec 100644
--- a/drivers/cpufreq/s3c2416-cpufreq.c
+++ b/drivers/cpufreq/s3c2416-cpufreq.c
@@ -450,21 +450,16 @@ static int s3c2416_cpufreq_driver_init(struct cpufreq_policy *policy)
 	/* Datasheet says PLL stabalisation time must be at least 300us,
 	 * so but add some fudge. (reference in LOCKCON0 register description)
 	 */
-	ret = cpufreq_generic_init(policy, s3c_freq->freq_table,
+	cpufreq_generic_init(policy, s3c_freq->freq_table,
 			(500 * 1000) + s3c_freq->regulator_latency);
-	if (ret)
-		goto err_freq_table;
-
 	register_reboot_notifier(&s3c2416_cpufreq_reboot_notifier);
 
 	return 0;
 
-err_freq_table:
 #ifdef CONFIG_ARM_S3C2416_CPUFREQ_VCORESCALE
-	regulator_put(s3c_freq->vddarm);
 err_vddarm:
-#endif
 	clk_put(s3c_freq->armclk);
+#endif
 err_armclk:
 	clk_put(s3c_freq->hclk);
 err_hclk:
diff --git a/drivers/cpufreq/s3c64xx-cpufreq.c b/drivers/cpufreq/s3c64xx-cpufreq.c
index 0cb9040eca49..40aafa8299a0 100644
--- a/drivers/cpufreq/s3c64xx-cpufreq.c
+++ b/drivers/cpufreq/s3c64xx-cpufreq.c
@@ -147,7 +147,6 @@ out:
 
 static int s3c64xx_cpufreq_driver_init(struct cpufreq_policy *policy)
 {
-	int ret;
 	struct cpufreq_frequency_table *freq;
 
 	if (policy->cpu != 0)
@@ -168,8 +167,7 @@ static int s3c64xx_cpufreq_driver_init(struct cpufreq_policy *policy)
 #ifdef CONFIG_REGULATOR
 	vddarm = regulator_get(NULL, "vddarm");
 	if (IS_ERR(vddarm)) {
-		ret = PTR_ERR(vddarm);
-		pr_err("Failed to obtain VDDARM: %d\n", ret);
+		pr_err("Failed to obtain VDDARM: %ld\n", PTR_ERR(vddarm));
 		pr_err("Only frequency scaling available\n");
 		vddarm = NULL;
 	} else {
@@ -199,16 +197,9 @@ static int s3c64xx_cpufreq_driver_init(struct cpufreq_policy *policy)
 	 * the PLLs, which we don't currently) is ~300us worst case,
 	 * but add some fudge.
 	 */
-	ret = cpufreq_generic_init(policy, s3c64xx_freq_table,
+	cpufreq_generic_init(policy, s3c64xx_freq_table,
 			(500 * 1000) + regulator_latency);
-	if (ret != 0) {
-		pr_err("Failed to configure frequency table: %d\n",
-		       ret);
-		regulator_put(vddarm);
-		clk_put(policy->clk);
-	}
-
-	return ret;
+	return 0;
 }
 
 static struct cpufreq_driver s3c64xx_cpufreq_driver = {
diff --git a/drivers/cpufreq/s5pv210-cpufreq.c b/drivers/cpufreq/s5pv210-cpufreq.c
index c7b7d1e65b08..0663cc935fa6 100644
--- a/drivers/cpufreq/s5pv210-cpufreq.c
+++ b/drivers/cpufreq/s5pv210-cpufreq.c
@@ -544,7 +544,8 @@ static int s5pv210_cpu_init(struct cpufreq_policy *policy)
 	s5pv210_dram_conf[1].freq = clk_get_rate(dmc1_clk);
 
 	policy->suspend_freq = SLEEP_FREQ;
-	return cpufreq_generic_init(policy, s5pv210_freq_table, 40000);
+	cpufreq_generic_init(policy, s5pv210_freq_table, 40000);
+	return 0;
 
 out_dmc1:
 	clk_put(dmc0_clk);
diff --git a/drivers/cpufreq/sa1100-cpufreq.c b/drivers/cpufreq/sa1100-cpufreq.c
index ab5cab93e638..5c075ef6adc0 100644
--- a/drivers/cpufreq/sa1100-cpufreq.c
+++ b/drivers/cpufreq/sa1100-cpufreq.c
@@ -181,7 +181,8 @@ static int sa1100_target(struct cpufreq_policy *policy, unsigned int ppcr)
 
 static int __init sa1100_cpu_init(struct cpufreq_policy *policy)
 {
-	return cpufreq_generic_init(policy, sa11x0_freq_table, 0);
+	cpufreq_generic_init(policy, sa11x0_freq_table, 0);
+	return 0;
 }
 
 static struct cpufreq_driver sa1100_driver __refdata = {
diff --git a/drivers/cpufreq/sa1110-cpufreq.c b/drivers/cpufreq/sa1110-cpufreq.c
index 66e5fb088ecc..1057d7f65118 100644
--- a/drivers/cpufreq/sa1110-cpufreq.c
+++ b/drivers/cpufreq/sa1110-cpufreq.c
@@ -306,7 +306,8 @@ static int sa1110_target(struct cpufreq_policy *policy, unsigned int ppcr)
 
 static int __init sa1110_cpu_init(struct cpufreq_policy *policy)
 {
-	return cpufreq_generic_init(policy, sa11x0_freq_table, 0);
+	cpufreq_generic_init(policy, sa11x0_freq_table, 0);
+	return 0;
 }
 
 /* sa1110_driver needs __refdata because it must remain after init registers
diff --git a/drivers/cpufreq/spear-cpufreq.c b/drivers/cpufreq/spear-cpufreq.c
index 4074e2615522..73bd8dc47074 100644
--- a/drivers/cpufreq/spear-cpufreq.c
+++ b/drivers/cpufreq/spear-cpufreq.c
@@ -153,8 +153,9 @@ static int spear_cpufreq_target(struct cpufreq_policy *policy,
 static int spear_cpufreq_init(struct cpufreq_policy *policy)
 {
 	policy->clk = spear_cpufreq.clk;
-	return cpufreq_generic_init(policy, spear_cpufreq.freq_tbl,
+	cpufreq_generic_init(policy, spear_cpufreq.freq_tbl,
 			spear_cpufreq.transition_latency);
+	return 0;
 }
 
 static struct cpufreq_driver spear_cpufreq_driver = {
diff --git a/drivers/cpufreq/tegra20-cpufreq.c b/drivers/cpufreq/tegra20-cpufreq.c
index 3c32cc7b0671..f84ecd22f488 100644
--- a/drivers/cpufreq/tegra20-cpufreq.c
+++ b/drivers/cpufreq/tegra20-cpufreq.c
@@ -118,17 +118,11 @@ static int tegra_target(struct cpufreq_policy *policy, unsigned int index)
 static int tegra_cpu_init(struct cpufreq_policy *policy)
 {
 	struct tegra20_cpufreq *cpufreq = cpufreq_get_driver_data();
-	int ret;
 
 	clk_prepare_enable(cpufreq->cpu_clk);
 
 	/* FIXME: what's the actual transition time? */
-	ret = cpufreq_generic_init(policy, freq_table, 300 * 1000);
-	if (ret) {
-		clk_disable_unprepare(cpufreq->cpu_clk);
-		return ret;
-	}
-
+	cpufreq_generic_init(policy, freq_table, 300 * 1000);
 	policy->clk = cpufreq->cpu_clk;
 	policy->suspend_freq = freq_table[0].frequency;
 	return 0;
diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h
index afc683021ac5..441ff15b7768 100644
--- a/include/linux/cpufreq.h
+++ b/include/linux/cpufreq.h
@@ -995,7 +995,7 @@ extern struct freq_attr *cpufreq_generic_attr[];
 int cpufreq_table_validate_and_sort(struct cpufreq_policy *policy);
 
 unsigned int cpufreq_generic_get(unsigned int cpu);
-int cpufreq_generic_init(struct cpufreq_policy *policy,
+void cpufreq_generic_init(struct cpufreq_policy *policy,
 		struct cpufreq_frequency_table *table,
 		unsigned int transition_latency);
 #endif /* _LINUX_CPUFREQ_H */
-- 
cgit v1.2.3


From 05ba4c895363db795f3d54f2da0de56d6520e52d Mon Sep 17 00:00:00 2001
From: Yonatan Goldschmidt <yon.goldschmidt@gmail.com>
Date: Mon, 8 Jul 2019 15:57:09 -0700
Subject: netfilter: Update obsolete comments referring to ip_conntrack

In 9fb9cbb1082d ("[NETFILTER]: Add nf_conntrack subsystem.") the new
generic nf_conntrack was introduced, and it came to supersede the old
ip_conntrack.

This change updates some of the obsolete comments referring to old
file/function names of the ip_conntrack mechanism, as well as removes a
few self-referencing comments that we shouldn't maintain anymore.

I did not update any comments referring to historical actions (e.g.,
comments like "this file was derived from ..." were left untouched, even
if the referenced file is no longer here).

Signed-off-by: Yonatan Goldschmidt <yon.goldschmidt@gmail.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter/nf_conntrack_h323_asn1.h | 3 +--
 net/ipv4/netfilter/ipt_CLUSTERIP.c               | 4 ++--
 net/netfilter/Kconfig                            | 6 ++----
 net/netfilter/nf_conntrack_core.c                | 4 +---
 net/netfilter/nf_conntrack_h323_asn1.c           | 5 ++---
 net/netfilter/nf_conntrack_proto_gre.c           | 2 --
 net/netfilter/nf_conntrack_proto_icmp.c          | 2 +-
 net/netfilter/nf_nat_core.c                      | 2 +-
 8 files changed, 10 insertions(+), 18 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter/nf_conntrack_h323_asn1.h b/include/linux/netfilter/nf_conntrack_h323_asn1.h
index 91d6275292a5..19df78341fb3 100644
--- a/include/linux/netfilter/nf_conntrack_h323_asn1.h
+++ b/include/linux/netfilter/nf_conntrack_h323_asn1.h
@@ -1,7 +1,6 @@
 /* SPDX-License-Identifier: GPL-2.0-only */
 /****************************************************************************
- * ip_conntrack_h323_asn1.h - BER and PER decoding library for H.323
- * 			      conntrack/NAT module.
+ * BER and PER decoding library for H.323 conntrack/NAT module.
  *
  * Copyright (c) 2006 by Jing Min Zhao <zhaojingmin@users.sourceforge.net>
  *
diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c
index 4d6bf7ac0792..6bdb1ab8af61 100644
--- a/net/ipv4/netfilter/ipt_CLUSTERIP.c
+++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c
@@ -416,8 +416,8 @@ clusterip_tg(struct sk_buff *skb, const struct xt_action_param *par)
 	     ctinfo == IP_CT_RELATED_REPLY))
 		return XT_CONTINUE;
 
-	/* ip_conntrack_icmp guarantees us that we only have ICMP_ECHO,
-	 * TIMESTAMP, INFO_REQUEST or ADDRESS type icmp packets from here
+	/* nf_conntrack_proto_icmp guarantees us that we only have ICMP_ECHO,
+	 * TIMESTAMP, INFO_REQUEST or ICMP_ADDRESS type icmp packets from here
 	 * on, which all have an ID field [relevant for hashing]. */
 
 	hash = clusterip_hashfn(skb, cipinfo->config);
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index 32a45c03786e..0d65f4d39494 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -223,8 +223,6 @@ config NF_CONNTRACK_FTP
 	  of Network Address Translation on them.
 
 	  This is FTP support on Layer 3 independent connection tracking.
-	  Layer 3 independent connection tracking is experimental scheme
-	  which generalize ip_conntrack to support other layer 3 protocols.
 
 	  To compile it as a module, choose M here.  If unsure, say N.
 
@@ -338,7 +336,7 @@ config NF_CONNTRACK_SIP
 	help
 	  SIP is an application-layer control protocol that can establish,
 	  modify, and terminate multimedia sessions (conferences) such as
-	  Internet telephony calls. With the ip_conntrack_sip and
+	  Internet telephony calls. With the nf_conntrack_sip and
 	  the nf_nat_sip modules you can support the protocol on a connection
 	  tracking/NATing firewall.
 
@@ -1313,7 +1311,7 @@ config NETFILTER_XT_MATCH_HELPER
 	depends on NETFILTER_ADVANCED
 	help
 	  Helper matching allows you to match packets in dynamic connections
-	  tracked by a conntrack-helper, ie. ip_conntrack_ftp
+	  tracked by a conntrack-helper, ie. nf_conntrack_ftp
 
 	  To compile it as a module, choose M here.  If unsure, say Y.
 
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index bdfeacee0817..a542761e90d1 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -1817,9 +1817,7 @@ EXPORT_SYMBOL_GPL(nf_ct_kill_acct);
 #include <linux/netfilter/nfnetlink_conntrack.h>
 #include <linux/mutex.h>
 
-/* Generic function for tcp/udp/sctp/dccp and alike. This needs to be
- * in ip_conntrack_core, since we don't want the protocols to autoload
- * or depend on ctnetlink */
+/* Generic function for tcp/udp/sctp/dccp and alike. */
 int nf_ct_port_tuple_to_nlattr(struct sk_buff *skb,
 			       const struct nf_conntrack_tuple *tuple)
 {
diff --git a/net/netfilter/nf_conntrack_h323_asn1.c b/net/netfilter/nf_conntrack_h323_asn1.c
index 8f6ba8162f0b..573cb4481481 100644
--- a/net/netfilter/nf_conntrack_h323_asn1.c
+++ b/net/netfilter/nf_conntrack_h323_asn1.c
@@ -1,11 +1,10 @@
 // SPDX-License-Identifier: GPL-2.0-only
 /*
- * ip_conntrack_helper_h323_asn1.c - BER and PER decoding library for H.323
- * 			      	     conntrack/NAT module.
+ * BER and PER decoding library for H.323 conntrack/NAT module.
  *
  * Copyright (c) 2006 by Jing Min Zhao <zhaojingmin@users.sourceforge.net>
  *
- * See ip_conntrack_helper_h323_asn1.h for details.
+ * See nf_conntrack_helper_h323_asn1.h for details.
  */
 
 #ifdef __KERNEL__
diff --git a/net/netfilter/nf_conntrack_proto_gre.c b/net/netfilter/nf_conntrack_proto_gre.c
index c2eb365f1723..5b05487a60d2 100644
--- a/net/netfilter/nf_conntrack_proto_gre.c
+++ b/net/netfilter/nf_conntrack_proto_gre.c
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-only
 /*
- * ip_conntrack_proto_gre.c - Version 3.0
- *
  * Connection tracking protocol helper module for GRE.
  *
  * GRE is a generic encapsulation protocol, which is generally not very
diff --git a/net/netfilter/nf_conntrack_proto_icmp.c b/net/netfilter/nf_conntrack_proto_icmp.c
index dd53e2b20f6b..097deba7441a 100644
--- a/net/netfilter/nf_conntrack_proto_icmp.c
+++ b/net/netfilter/nf_conntrack_proto_icmp.c
@@ -215,7 +215,7 @@ int nf_conntrack_icmpv4_error(struct nf_conn *tmpl,
 		return -NF_ACCEPT;
 	}
 
-	/* See ip_conntrack_proto_tcp.c */
+	/* See nf_conntrack_proto_tcp.c */
 	if (state->net->ct.sysctl_checksum &&
 	    state->hook == NF_INET_PRE_ROUTING &&
 	    nf_ip_checksum(skb, state->hook, dataoff, IPPROTO_ICMP)) {
diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c
index 9ab410455992..3f6023ed4966 100644
--- a/net/netfilter/nf_nat_core.c
+++ b/net/netfilter/nf_nat_core.c
@@ -519,7 +519,7 @@ another_round:
  * and NF_INET_LOCAL_OUT, we change the destination to map into the
  * range. It might not be possible to get a unique tuple, but we try.
  * At worst (or if we race), we will end up with a final duplicate in
- * __ip_conntrack_confirm and drop the packet. */
+ * __nf_conntrack_confirm and drop the packet. */
 static void
 get_unique_tuple(struct nf_conntrack_tuple *tuple,
 		 const struct nf_conntrack_tuple *orig_tuple,
-- 
cgit v1.2.3


From 07b0fdecb2477396bcb69609019aade2b22124a1 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 16 Jul 2019 07:58:31 -0700
Subject: blkcg: allow blkcg_policy->pd_stat() to print non-debug info too

Currently, ->pd_stat() is called only when moduleparam
blkcg_debug_stats is set which prevents it from printing non-debug
policy-specific statistics.  Let's move debug testing down so that
->pd_stat() can print non-debug stat too.  This patch doesn't cause
any visible behavior change.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-cgroup.c         | 9 +++------
 block/blk-iolatency.c      | 3 +++
 include/linux/blk-cgroup.h | 1 +
 3 files changed, 7 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 24ed26957367..55a7dc227dfb 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -54,7 +54,7 @@ static struct blkcg_policy *blkcg_policy[BLKCG_MAX_POLS];
 
 static LIST_HEAD(all_blkcgs);		/* protected by blkcg_pol_mutex */
 
-static bool blkcg_debug_stats = false;
+bool blkcg_debug_stats = false;
 static struct workqueue_struct *blkcg_punt_bio_wq;
 
 static bool blkcg_policy_enabled(struct request_queue *q,
@@ -944,10 +944,7 @@ static int blkcg_print_stat(struct seq_file *sf, void *v)
 					 dbytes, dios);
 		}
 
-		if (!blkcg_debug_stats)
-			goto next;
-
-		if (atomic_read(&blkg->use_delay)) {
+		if (blkcg_debug_stats && atomic_read(&blkg->use_delay)) {
 			has_stats = true;
 			off += scnprintf(buf+off, size-off,
 					 " use_delay=%d delay_nsec=%llu",
@@ -967,7 +964,7 @@ static int blkcg_print_stat(struct seq_file *sf, void *v)
 				has_stats = true;
 			off += written;
 		}
-next:
+
 		if (has_stats) {
 			if (off < size - 1) {
 				off += scnprintf(buf+off, size-off, "\n");
diff --git a/block/blk-iolatency.c b/block/blk-iolatency.c
index d973c38ee4fd..0fff7b56df0e 100644
--- a/block/blk-iolatency.c
+++ b/block/blk-iolatency.c
@@ -917,6 +917,9 @@ static size_t iolatency_pd_stat(struct blkg_policy_data *pd, char *buf,
 	unsigned long long avg_lat;
 	unsigned long long cur_win;
 
+	if (!blkcg_debug_stats)
+		return 0;
+
 	if (iolat->ssd)
 		return iolatency_ssd_stat(iolat, buf, size);
 
diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h
index 689a58231288..12811091fd50 100644
--- a/include/linux/blk-cgroup.h
+++ b/include/linux/blk-cgroup.h
@@ -181,6 +181,7 @@ struct blkcg_policy {
 
 extern struct blkcg blkcg_root;
 extern struct cgroup_subsys_state * const blkcg_root_css;
+extern bool blkcg_debug_stats;
 
 struct blkcg_gq *blkg_lookup_slowpath(struct blkcg *blkcg,
 				      struct request_queue *q, bool update_hint);
-- 
cgit v1.2.3


From 46710f3a34b592ac5c51a95f696b2d2a2a0d9419 Mon Sep 17 00:00:00 2001
From: Cong Wang <xiyou.wangcong@gmail.com>
Date: Sat, 25 May 2019 09:57:59 -0700
Subject: tracing: Pass type into tracing_generic_entry_update()

All callers of tracing_generic_entry_update() have to initialize
entry->type, so let's just simply move it inside.
Link: http://lkml.kernel.org/r/20190525165802.25944-2-xiyou.wangcong@gmail.com

Cc: Ingo Molnar <mingo@redhat.com>
Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 include/linux/trace_events.h    | 1 +
 kernel/trace/trace.c            | 8 ++++----
 kernel/trace/trace_event_perf.c | 3 +--
 3 files changed, 6 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
index 8a62731673f7..5c6f2a6c8cd2 100644
--- a/include/linux/trace_events.h
+++ b/include/linux/trace_events.h
@@ -142,6 +142,7 @@ enum print_line_t {
 enum print_line_t trace_handle_return(struct trace_seq *s);
 
 void tracing_generic_entry_update(struct trace_entry *entry,
+				  unsigned short type,
 				  unsigned long flags,
 				  int pc);
 struct trace_event_file;
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 77b9c4ca5faa..6b62e1718548 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -743,8 +743,7 @@ trace_event_setup(struct ring_buffer_event *event,
 {
 	struct trace_entry *ent = ring_buffer_event_data(event);
 
-	tracing_generic_entry_update(ent, flags, pc);
-	ent->type = type;
+	tracing_generic_entry_update(ent, type, flags, pc);
 }
 
 static __always_inline struct ring_buffer_event *
@@ -2312,13 +2311,14 @@ enum print_line_t trace_handle_return(struct trace_seq *s)
 EXPORT_SYMBOL_GPL(trace_handle_return);
 
 void
-tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags,
-			     int pc)
+tracing_generic_entry_update(struct trace_entry *entry, unsigned short type,
+			     unsigned long flags, int pc)
 {
 	struct task_struct *tsk = current;
 
 	entry->preempt_count		= pc & 0xff;
 	entry->pid			= (tsk) ? tsk->pid : 0;
+	entry->type			= type;
 	entry->flags =
 #ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT
 		(irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) |
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index 4629a6104474..0892e38ed6fb 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -416,8 +416,7 @@ void perf_trace_buf_update(void *record, u16 type)
 	unsigned long flags;
 
 	local_save_flags(flags);
-	tracing_generic_entry_update(entry, flags, pc);
-	entry->type = type;
+	tracing_generic_entry_update(entry, type, flags, pc);
 }
 NOKPROBE_SYMBOL(perf_trace_buf_update);
 
-- 
cgit v1.2.3


From 0aeb1def44169cbe7119f26cf10b974a2046142e Mon Sep 17 00:00:00 2001
From: Cong Wang <xiyou.wangcong@gmail.com>
Date: Sat, 25 May 2019 09:58:01 -0700
Subject: tracing: Make trace_get_fields() global

trace_get_fields() is the only way to read tracepoint fields at
run time, as their fields are defined at compile-time with macros.
Make this function visible to all users and it will be used by
trace event injection code to calculate the size of a tracepoint
entry.
Link: http://lkml.kernel.org/r/20190525165802.25944-4-xiyou.wangcong@gmail.com

Cc: Ingo Molnar <mingo@redhat.com>
Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 include/linux/trace_events.h | 8 ++++++++
 kernel/trace/trace_events.c  | 8 --------
 2 files changed, 8 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
index 5c6f2a6c8cd2..5150436783e8 100644
--- a/include/linux/trace_events.h
+++ b/include/linux/trace_events.h
@@ -318,6 +318,14 @@ trace_event_name(struct trace_event_call *call)
 		return call->name;
 }
 
+static inline struct list_head *
+trace_get_fields(struct trace_event_call *event_call)
+{
+	if (!event_call->class->get_fields)
+		return &event_call->class->fields;
+	return event_call->class->get_fields(event_call);
+}
+
 struct trace_array;
 struct trace_subsystem_dir;
 
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index edc72f3b080c..c7506bc81b75 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -70,14 +70,6 @@ static int system_refcount_dec(struct event_subsystem *system)
 #define while_for_each_event_file()		\
 	}
 
-static struct list_head *
-trace_get_fields(struct trace_event_call *event_call)
-{
-	if (!event_call->class->get_fields)
-		return &event_call->class->fields;
-	return event_call->class->get_fields(event_call);
-}
-
 static struct ftrace_event_field *
 __find_event_field(struct list_head *head, char *name)
 {
-- 
cgit v1.2.3


From 9087c37584fb7d8315877bb55f85e4268cc0b4f4 Mon Sep 17 00:00:00 2001
From: Tom Lendacky <thomas.lendacky@amd.com>
Date: Wed, 10 Jul 2019 19:01:19 +0000
Subject: dma-direct: Force unencrypted DMA under SME for certain DMA masks

If a device doesn't support DMA to a physical address that includes the
encryption bit (currently bit 47, so 48-bit DMA), then the DMA must
occur to unencrypted memory. SWIOTLB is used to satisfy that requirement
if an IOMMU is not active (enabled or configured in passthrough mode).

However, commit fafadcd16595 ("swiotlb: don't dip into swiotlb pool for
coherent allocations") modified the coherent allocation support in
SWIOTLB to use the DMA direct coherent allocation support. When an IOMMU
is not active, this resulted in dma_alloc_coherent() failing for devices
that didn't support DMA addresses that included the encryption bit.

Addressing this requires changes to the force_dma_unencrypted() function
in kernel/dma/direct.c. Since the function is now non-trivial and
SME/SEV specific, update the DMA direct support to add an arch override
for the force_dma_unencrypted() function. The arch override is selected
when CONFIG_AMD_MEM_ENCRYPT is set. The arch override function resides in
the arch/x86/mm/mem_encrypt.c file and forces unencrypted DMA when either
SEV is active or SME is active and the device does not support DMA to
physical addresses that include the encryption bit.

Fixes: fafadcd16595 ("swiotlb: don't dip into swiotlb pool for coherent allocations")
Suggested-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
[hch: moved the force_dma_unencrypted declaration to dma-mapping.h,
      fold the s390 fix from Halil Pasic]
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 arch/s390/Kconfig          |  1 +
 arch/s390/mm/init.c        |  7 ++++++-
 arch/x86/Kconfig           |  1 +
 arch/x86/mm/mem_encrypt.c  | 30 ++++++++++++++++++++++++++++++
 include/linux/dma-direct.h |  9 +++++++++
 kernel/dma/Kconfig         |  3 +++
 kernel/dma/direct.c        | 16 ++++------------
 7 files changed, 54 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index 5d8570ed6cab..a4ad2733eedf 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -189,6 +189,7 @@ config S390
 	select VIRT_CPU_ACCOUNTING
 	select ARCH_HAS_SCALED_CPUTIME
 	select HAVE_NMI
+	select ARCH_HAS_FORCE_DMA_UNENCRYPTED
 	select SWIOTLB
 	select GENERIC_ALLOCATOR
 
diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c
index f0bee6af3960..78c319c5ce48 100644
--- a/arch/s390/mm/init.c
+++ b/arch/s390/mm/init.c
@@ -30,7 +30,7 @@
 #include <linux/export.h>
 #include <linux/cma.h>
 #include <linux/gfp.h>
-#include <linux/dma-mapping.h>
+#include <linux/dma-direct.h>
 #include <asm/processor.h>
 #include <linux/uaccess.h>
 #include <asm/pgtable.h>
@@ -161,6 +161,11 @@ bool sev_active(void)
 	return is_prot_virt_guest();
 }
 
+bool force_dma_unencrypted(struct device *dev)
+{
+	return sev_active();
+}
+
 /* protected virtualization */
 static void pv_init(void)
 {
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 879741336771..d1afe92bf994 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1528,6 +1528,7 @@ config AMD_MEM_ENCRYPT
 	depends on X86_64 && CPU_SUP_AMD
 	select DYNAMIC_PHYSICAL_MASK
 	select ARCH_USE_MEMREMAP_PROT
+	select ARCH_HAS_FORCE_DMA_UNENCRYPTED
 	---help---
 	  Say yes to enable support for the encryption of system memory.
 	  This requires an AMD processor that supports Secure Memory
diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c
index e0df96fdfe46..c805f0a5c16e 100644
--- a/arch/x86/mm/mem_encrypt.c
+++ b/arch/x86/mm/mem_encrypt.c
@@ -15,6 +15,10 @@
 #include <linux/dma-direct.h>
 #include <linux/swiotlb.h>
 #include <linux/mem_encrypt.h>
+#include <linux/device.h>
+#include <linux/kernel.h>
+#include <linux/bitops.h>
+#include <linux/dma-mapping.h>
 
 #include <asm/tlbflush.h>
 #include <asm/fixmap.h>
@@ -348,6 +352,32 @@ bool sev_active(void)
 }
 EXPORT_SYMBOL(sev_active);
 
+/* Override for DMA direct allocation check - ARCH_HAS_FORCE_DMA_UNENCRYPTED */
+bool force_dma_unencrypted(struct device *dev)
+{
+	/*
+	 * For SEV, all DMA must be to unencrypted addresses.
+	 */
+	if (sev_active())
+		return true;
+
+	/*
+	 * For SME, all DMA must be to unencrypted addresses if the
+	 * device does not support DMA to addresses that include the
+	 * encryption mask.
+	 */
+	if (sme_active()) {
+		u64 dma_enc_mask = DMA_BIT_MASK(__ffs64(sme_me_mask));
+		u64 dma_dev_mask = min_not_zero(dev->coherent_dma_mask,
+						dev->bus_dma_mask);
+
+		if (dma_dev_mask <= dma_enc_mask)
+			return true;
+	}
+
+	return false;
+}
+
 /* Architecture __weak replacement functions */
 void __init mem_encrypt_free_decrypted_mem(void)
 {
diff --git a/include/linux/dma-direct.h b/include/linux/dma-direct.h
index b7338702592a..adf993a3bd58 100644
--- a/include/linux/dma-direct.h
+++ b/include/linux/dma-direct.h
@@ -32,6 +32,15 @@ static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size)
 }
 #endif /* !CONFIG_ARCH_HAS_PHYS_TO_DMA */
 
+#ifdef CONFIG_ARCH_HAS_FORCE_DMA_UNENCRYPTED
+bool force_dma_unencrypted(struct device *dev);
+#else
+static inline bool force_dma_unencrypted(struct device *dev)
+{
+	return false;
+}
+#endif /* CONFIG_ARCH_HAS_FORCE_DMA_UNENCRYPTED */
+
 /*
  * If memory encryption is supported, phys_to_dma will set the memory encryption
  * bit in the DMA address, and dma_to_phys will clear it.  The raw __phys_to_dma
diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig
index 70f8f8d9200e..9decbba255fc 100644
--- a/kernel/dma/Kconfig
+++ b/kernel/dma/Kconfig
@@ -48,6 +48,9 @@ config ARCH_HAS_DMA_COHERENT_TO_PFN
 config ARCH_HAS_DMA_MMAP_PGPROT
 	bool
 
+config ARCH_HAS_FORCE_DMA_UNENCRYPTED
+	bool
+
 config DMA_NONCOHERENT_CACHE_SYNC
 	bool
 
diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
index b90e1aede743..d7cec866d16b 100644
--- a/kernel/dma/direct.c
+++ b/kernel/dma/direct.c
@@ -23,14 +23,6 @@
 #define ARCH_ZONE_DMA_BITS 24
 #endif
 
-/*
- * For AMD SEV all DMA must be to unencrypted addresses.
- */
-static inline bool force_dma_unencrypted(void)
-{
-	return sev_active();
-}
-
 static void report_addr(struct device *dev, dma_addr_t dma_addr, size_t size)
 {
 	if (!dev->dma_mask) {
@@ -46,7 +38,7 @@ static void report_addr(struct device *dev, dma_addr_t dma_addr, size_t size)
 static inline dma_addr_t phys_to_dma_direct(struct device *dev,
 		phys_addr_t phys)
 {
-	if (force_dma_unencrypted())
+	if (force_dma_unencrypted(dev))
 		return __phys_to_dma(dev, phys);
 	return phys_to_dma(dev, phys);
 }
@@ -67,7 +59,7 @@ static gfp_t __dma_direct_optimal_gfp_mask(struct device *dev, u64 dma_mask,
 	if (dev->bus_dma_mask && dev->bus_dma_mask < dma_mask)
 		dma_mask = dev->bus_dma_mask;
 
-	if (force_dma_unencrypted())
+	if (force_dma_unencrypted(dev))
 		*phys_mask = __dma_to_phys(dev, dma_mask);
 	else
 		*phys_mask = dma_to_phys(dev, dma_mask);
@@ -159,7 +151,7 @@ void *dma_direct_alloc_pages(struct device *dev, size_t size,
 	}
 
 	ret = page_address(page);
-	if (force_dma_unencrypted()) {
+	if (force_dma_unencrypted(dev)) {
 		set_memory_decrypted((unsigned long)ret, 1 << get_order(size));
 		*dma_handle = __phys_to_dma(dev, page_to_phys(page));
 	} else {
@@ -192,7 +184,7 @@ void dma_direct_free_pages(struct device *dev, size_t size, void *cpu_addr,
 		return;
 	}
 
-	if (force_dma_unencrypted())
+	if (force_dma_unencrypted(dev))
 		set_memory_encrypted((unsigned long)cpu_addr, 1 << page_order);
 
 	if (IS_ENABLED(CONFIG_ARCH_HAS_UNCACHED_SEGMENT) &&
-- 
cgit v1.2.3


From 89165b8b0ee97bd775ac4376b932fd030f7462bd Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 16 Jul 2019 16:26:30 -0700
Subject: mm: provide a print_vma_addr stub for !CONFIG_MMU

Link: http://lkml.kernel.org/r/20190703122359.18200-3-hch@lst.de
Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Vladimir Murzin <vladimir.murzin@arm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 0389c34ac529..74797ed20c2c 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2767,7 +2767,13 @@ extern int randomize_va_space;
 #endif
 
 const char * arch_vma_name(struct vm_area_struct *vma);
+#ifdef CONFIG_MMU
 void print_vma_addr(char *prefix, unsigned long rip);
+#else
+static inline void print_vma_addr(char *prefix, unsigned long rip)
+{
+}
+#endif
 
 void *sparse_buffer_alloc(unsigned long size);
 struct page *sparse_mem_map_populate(unsigned long pnum, int nid,
-- 
cgit v1.2.3


From 9b98fa22948551e20a15b0b9d22589e3724c361a Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 16 Jul 2019 16:26:33 -0700
Subject: mm: stub out all of swapops.h for !CONFIG_MMU

The whole header file deals with swap entries and PTEs, none of which
can exist for nommu builds.  The current nommu ports have lots of stubs
to allow the inline functions in swapops.h to compile, but as none of
this functionality is actually used there is no point in even providing
it.  This way we don't have to provide the stubs for the upcoming RISC-V
nommu port, and can eventually remove it from the existing ports.

Link: http://lkml.kernel.org/r/20190703122359.18200-4-hch@lst.de
Signed-off-by: Christoph Hellwig <hch@lst.de>
Cc: Vladimir Murzin <vladimir.murzin@arm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/swapops.h | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/swapops.h b/include/linux/swapops.h
index 15bdb6fe71e5..877fd239b6ff 100644
--- a/include/linux/swapops.h
+++ b/include/linux/swapops.h
@@ -6,6 +6,8 @@
 #include <linux/bug.h>
 #include <linux/mm_types.h>
 
+#ifdef CONFIG_MMU
+
 /*
  * swapcache pages are stored in the swapper_space radix tree.  We want to
  * get good packing density in that tree, so the index should be dense in
@@ -50,13 +52,11 @@ static inline pgoff_t swp_offset(swp_entry_t entry)
 	return entry.val & SWP_OFFSET_MASK;
 }
 
-#ifdef CONFIG_MMU
 /* check whether a pte points to a swap entry */
 static inline int is_swap_pte(pte_t pte)
 {
 	return !pte_none(pte) && !pte_present(pte);
 }
-#endif
 
 /*
  * Convert the arch-dependent pte representation of a swp_entry_t into an
@@ -360,4 +360,5 @@ static inline int non_swap_entry(swp_entry_t entry)
 }
 #endif
 
+#endif /* CONFIG_MMU */
 #endif /* _LINUX_SWAPOPS_H */
-- 
cgit v1.2.3


From ce251e0e3c0597ea8cab5787df579bd1f9c1aca1 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Tue, 16 Jul 2019 16:26:42 -0700
Subject: include/linux/kernel.h: add typeof_member() macro

Add typeof_member() macro so that types can be extracted without
introducing dummy variables.

Link: http://lkml.kernel.org/r/20190529190720.GA5703@avx2
Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/kernel.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index 0c9bc231107f..4fa360a13c1e 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -88,6 +88,8 @@
  */
 #define FIELD_SIZEOF(t, f) (sizeof(((t*)0)->f))
 
+#define typeof_member(T, m)	typeof(((T*)0)->m)
+
 #define DIV_ROUND_UP __KERNEL_DIV_ROUND_UP
 
 #define DIV_ROUND_DOWN_ULL(ll, d) \
-- 
cgit v1.2.3


From 95b980d62d52c4c1768ee719e8db3efe27ef52b2 Mon Sep 17 00:00:00 2001
From: Masahiro Yamada <yamada.masahiro@socionext.com>
Date: Tue, 16 Jul 2019 16:26:57 -0700
Subject: linux/bits.h: make BIT(), GENMASK(), and friends available in
 assembly

BIT(),  GENMASK(), etc. are useful to define register bits of hardware.
However, low-level code is often written in assembly, where they are
not available due to the hard-coded 1UL, 0UL.

In fact, in-kernel headers such as arch/arm64/include/asm/sysreg.h
use _BITUL() instead of BIT() so that the register bit macros are
available in assembly.

There are two reasons to use the macros in include/uapi/linux/const.h:

[1] For use in uapi headers
  We should use underscore-prefixed variants for user-space.

[2] For use in assembly code
  Since _BITUL() uses UL(1) instead of 1UL, it can be used as an
  alternative of BIT().

For [2], it is pretty easy to change BIT() etc. for use in assembly.

This allows replacing _BITUL() in kernel-space headers with BIT().

Link: http://lkml.kernel.org/r/20190609153941.17249-1-yamada.masahiro@socionext.com
Signed-off-by: Masahiro Yamada <yamada.masahiro@socionext.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Christian Borntraeger <borntraeger@de.ibm.com>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Vineet Gupta <vgupta@synopsys.com>
Cc: Will Deacon <will.deacon@arm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/bits.h | 17 ++++++++++-------
 1 file changed, 10 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bits.h b/include/linux/bits.h
index 2b7b532c1d51..669d69441a62 100644
--- a/include/linux/bits.h
+++ b/include/linux/bits.h
@@ -1,13 +1,15 @@
 /* SPDX-License-Identifier: GPL-2.0 */
 #ifndef __LINUX_BITS_H
 #define __LINUX_BITS_H
+
+#include <linux/const.h>
 #include <asm/bitsperlong.h>
 
-#define BIT(nr)			(1UL << (nr))
-#define BIT_ULL(nr)		(1ULL << (nr))
-#define BIT_MASK(nr)		(1UL << ((nr) % BITS_PER_LONG))
+#define BIT(nr)			(UL(1) << (nr))
+#define BIT_ULL(nr)		(ULL(1) << (nr))
+#define BIT_MASK(nr)		(UL(1) << ((nr) % BITS_PER_LONG))
 #define BIT_WORD(nr)		((nr) / BITS_PER_LONG)
-#define BIT_ULL_MASK(nr)	(1ULL << ((nr) % BITS_PER_LONG_LONG))
+#define BIT_ULL_MASK(nr)	(ULL(1) << ((nr) % BITS_PER_LONG_LONG))
 #define BIT_ULL_WORD(nr)	((nr) / BITS_PER_LONG_LONG)
 #define BITS_PER_BYTE		8
 
@@ -17,10 +19,11 @@
  * GENMASK_ULL(39, 21) gives us the 64bit vector 0x000000ffffe00000.
  */
 #define GENMASK(h, l) \
-	(((~0UL) - (1UL << (l)) + 1) & (~0UL >> (BITS_PER_LONG - 1 - (h))))
+	(((~UL(0)) - (UL(1) << (l)) + 1) & \
+	 (~UL(0) >> (BITS_PER_LONG - 1 - (h))))
 
 #define GENMASK_ULL(h, l) \
-	(((~0ULL) - (1ULL << (l)) + 1) & \
-	 (~0ULL >> (BITS_PER_LONG_LONG - 1 - (h))))
+	(((~ULL(0)) - (ULL(1) << (l)) + 1) & \
+	 (~ULL(0) >> (BITS_PER_LONG_LONG - 1 - (h))))
 
 #endif	/* __LINUX_BITS_H */
-- 
cgit v1.2.3


From 4c6080cd6f8baad9f7faa3deac9a90e59726b119 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Tue, 16 Jul 2019 16:27:12 -0700
Subject: lib/list: tweak LIST_POISON2 for better code generation on x86_64

list_del() poisoning can generate 2 64-bit immediate loads but it also can
generate one 64-bit immediate load and an addition:

	48 b8 00 01 00 00 00 00 ad de	movabs rax,0xdead000000000100
	48 89 47 58			mov    QWORD PTR [rdi+0x58],rax
	48 05 00 01 00 00   <=====>	add    rax,0x100
	48 89 47 60			mov    QWORD PTR [rdi+0x60],rax

However on x86_64 not all constants are equal: those within [-128, 127]
range can be added with the shorter "add r64, imm8" instruction:

	48 b8 00 01 00 00 00 00 ad de	movabs rax,0xdead000000000100
	48 89 47 58			mov    QWORD PTR [rdi+0x58],rax
	48 83 c0 22	<======>	add    rax,0x22
	48 89 47 60			mov    QWORD PTR [rdi+0x60],rax

The patch saves 2 bytes at some LIST_POISON2 use sites.

(Slightly disappointing) space savings on F29 x86_64 config:

	add/remove: 0/0 grow/shrink: 0/2164 up/down: 0/-5184 (-5184)
	Function                                     old     new   delta
	zstd_get_workspace                           548     546      -2
		...
	mlx4_delete_all_resources_for_slave         4826    4804     -22
	Total: Before=83304131, After=83298947, chg -0.01%

New constants are:

	0xdead000000000100
	0xdead000000000122

Note: LIST_POISON1 can't be changed to ...11 because something in page
allocator requires low bit unset.

Link: http://lkml.kernel.org/r/20190513191502.GA8492@avx2
Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Vasiliy Kulikov <segoon@openwall.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/poison.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/poison.h b/include/linux/poison.h
index d6d980a681c7..df34330b4e34 100644
--- a/include/linux/poison.h
+++ b/include/linux/poison.h
@@ -21,7 +21,7 @@
  * non-initialized list entries.
  */
 #define LIST_POISON1  ((void *) 0x100 + POISON_POINTER_DELTA)
-#define LIST_POISON2  ((void *) 0x200 + POISON_POINTER_DELTA)
+#define LIST_POISON2  ((void *) 0x122 + POISON_POINTER_DELTA)
 
 /********** include/linux/timer.h **********/
 /*
-- 
cgit v1.2.3


From 0f472d04f59ff89d15b2a1c4eafde7317ddd67a2 Mon Sep 17 00:00:00 2001
From: Anshuman Khandual <anshuman.khandual@arm.com>
Date: Tue, 16 Jul 2019 16:27:33 -0700
Subject: mm/ioremap: probe platform for p4d huge map support

Finish up what commit c2febafc6773 ("mm: convert generic code to 5-level
paging") started while levelling up P4D huge mapping support at par with
PUD and PMD.  A new arch call back arch_ioremap_p4d_supported() is added
which just maintains status quo (P4D huge map not supported) on x86,
arm64 and powerpc.

When HAVE_ARCH_HUGE_VMAP is enabled it's just a simple check from the
arch about the support, hence runtime effects are minimal.

Link: http://lkml.kernel.org/r/1561699231-20991-1-git-send-email-anshuman.khandual@arm.com
Signed-off-by: Anshuman Khandual <anshuman.khandual@arm.com>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Michael Ellerman <mpe@ellerman.id.au> (powerpc)
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/arm64/mm/mmu.c                      | 5 +++++
 arch/powerpc/mm/book3s64/radix_pgtable.c | 5 +++++
 arch/x86/mm/ioremap.c                    | 5 +++++
 include/linux/io.h                       | 1 +
 lib/ioremap.c                            | 2 ++
 5 files changed, 18 insertions(+)

(limited to 'include/linux')

diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index 1b49c08dfa2b..e661469cabdd 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -942,6 +942,11 @@ void *__init fixmap_remap_fdt(phys_addr_t dt_phys)
 	return dt_virt;
 }
 
+int __init arch_ioremap_p4d_supported(void)
+{
+	return 0;
+}
+
 int __init arch_ioremap_pud_supported(void)
 {
 	/*
diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c b/arch/powerpc/mm/book3s64/radix_pgtable.c
index 65c2ba1e1783..b4ca9e95e678 100644
--- a/arch/powerpc/mm/book3s64/radix_pgtable.c
+++ b/arch/powerpc/mm/book3s64/radix_pgtable.c
@@ -1237,3 +1237,8 @@ int radix__ioremap_range(unsigned long ea, phys_addr_t pa, unsigned long size,
 		return 0;
 	}
 }
+
+int __init arch_ioremap_p4d_supported(void)
+{
+	return 0;
+}
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index e500f1df1140..63e99f15d7cf 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -459,6 +459,11 @@ void iounmap(volatile void __iomem *addr)
 }
 EXPORT_SYMBOL(iounmap);
 
+int __init arch_ioremap_p4d_supported(void)
+{
+	return 0;
+}
+
 int __init arch_ioremap_pud_supported(void)
 {
 #ifdef CONFIG_X86_64
diff --git a/include/linux/io.h b/include/linux/io.h
index 9876e5801a9d..accac822336a 100644
--- a/include/linux/io.h
+++ b/include/linux/io.h
@@ -33,6 +33,7 @@ static inline int ioremap_page_range(unsigned long addr, unsigned long end,
 
 #ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
 void __init ioremap_huge_init(void);
+int arch_ioremap_p4d_supported(void);
 int arch_ioremap_pud_supported(void);
 int arch_ioremap_pmd_supported(void);
 #else
diff --git a/lib/ioremap.c b/lib/ioremap.c
index a95161d9c883..0a2ffadc6d71 100644
--- a/lib/ioremap.c
+++ b/lib/ioremap.c
@@ -30,6 +30,8 @@ early_param("nohugeiomap", set_nohugeiomap);
 void __init ioremap_huge_init(void)
 {
 	if (!ioremap_huge_disabled) {
+		if (arch_ioremap_p4d_supported())
+			ioremap_p4d_capable = 1;
 		if (arch_ioremap_pud_supported())
 			ioremap_pud_capable = 1;
 		if (arch_ioremap_pmd_supported())
-- 
cgit v1.2.3


From 9f973cb38088e0cf42e0bae97ff140813e623f13 Mon Sep 17 00:00:00 2001
From: Michel Lespinasse <walken@google.com>
Date: Tue, 16 Jul 2019 16:27:45 -0700
Subject: lib/rbtree: avoid generating code twice for the cached versions

As was already noted in rbtree.h, the logic to cache rb_first (or
rb_last) can easily be implemented externally to the core rbtree api.

Change the implementation to do just that.  Previously the update of
rb_leftmost was wired deeper into the implementation, but there were some
disadvantages to that - mostly, lib/rbtree.c had separate instantiations
for rb_insert_color() vs rb_insert_color_cached(), as well as rb_erase()
vs rb_erase_cached(), which were doing exactly the same thing save for
the rb_leftmost update at the start of either function.

   text	   data	    bss	    dec	    hex	filename
   5405	    120	      0	   5525	   1595	lib/rbtree.o-vanilla
   3827	     96	      0	   3923	    f53	lib/rbtree.o-patch

[dave@stgolabs.net: changelog addition]
  Link: http://lkml.kernel.org/r/20190628171416.by5gdizl3rcxk5h5@linux-r8p5
[akpm@linux-foundation.org: coding-style fixes]
Link: http://lkml.kernel.org/r/20190628045008.39926-1-walken@google.com
Signed-off-by: Michel Lespinasse <walken@google.com>
Acked-by: Davidlohr Bueso <dbueso@suse.de>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/rbtree.h           | 70 ++++++++++++++++++++++++++--------------
 include/linux/rbtree_augmented.h | 27 ++++++----------
 lib/rbtree.c                     | 40 ++---------------------
 3 files changed, 59 insertions(+), 78 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/rbtree.h b/include/linux/rbtree.h
index e6337fce08f2..1fd61a9af45c 100644
--- a/include/linux/rbtree.h
+++ b/include/linux/rbtree.h
@@ -32,25 +32,9 @@ struct rb_root {
 	struct rb_node *rb_node;
 };
 
-/*
- * Leftmost-cached rbtrees.
- *
- * We do not cache the rightmost node based on footprint
- * size vs number of potential users that could benefit
- * from O(1) rb_last(). Just not worth it, users that want
- * this feature can always implement the logic explicitly.
- * Furthermore, users that want to cache both pointers may
- * find it a bit asymmetric, but that's ok.
- */
-struct rb_root_cached {
-	struct rb_root rb_root;
-	struct rb_node *rb_leftmost;
-};
-
 #define rb_parent(r)   ((struct rb_node *)((r)->__rb_parent_color & ~3))
 
 #define RB_ROOT	(struct rb_root) { NULL, }
-#define RB_ROOT_CACHED (struct rb_root_cached) { {NULL, }, NULL }
 #define	rb_entry(ptr, type, member) container_of(ptr, type, member)
 
 #define RB_EMPTY_ROOT(root)  (READ_ONCE((root)->rb_node) == NULL)
@@ -72,12 +56,6 @@ extern struct rb_node *rb_prev(const struct rb_node *);
 extern struct rb_node *rb_first(const struct rb_root *);
 extern struct rb_node *rb_last(const struct rb_root *);
 
-extern void rb_insert_color_cached(struct rb_node *,
-				   struct rb_root_cached *, bool);
-extern void rb_erase_cached(struct rb_node *node, struct rb_root_cached *);
-/* Same as rb_first(), but O(1) */
-#define rb_first_cached(root) (root)->rb_leftmost
-
 /* Postorder iteration - always visit the parent after its children */
 extern struct rb_node *rb_first_postorder(const struct rb_root *);
 extern struct rb_node *rb_next_postorder(const struct rb_node *);
@@ -87,8 +65,6 @@ extern void rb_replace_node(struct rb_node *victim, struct rb_node *new,
 			    struct rb_root *root);
 extern void rb_replace_node_rcu(struct rb_node *victim, struct rb_node *new,
 				struct rb_root *root);
-extern void rb_replace_node_cached(struct rb_node *victim, struct rb_node *new,
-				   struct rb_root_cached *root);
 
 static inline void rb_link_node(struct rb_node *node, struct rb_node *parent,
 				struct rb_node **rb_link)
@@ -136,4 +112,50 @@ static inline void rb_link_node_rcu(struct rb_node *node, struct rb_node *parent
 			typeof(*pos), field); 1; }); \
 	     pos = n)
 
+/*
+ * Leftmost-cached rbtrees.
+ *
+ * We do not cache the rightmost node based on footprint
+ * size vs number of potential users that could benefit
+ * from O(1) rb_last(). Just not worth it, users that want
+ * this feature can always implement the logic explicitly.
+ * Furthermore, users that want to cache both pointers may
+ * find it a bit asymmetric, but that's ok.
+ */
+struct rb_root_cached {
+	struct rb_root rb_root;
+	struct rb_node *rb_leftmost;
+};
+
+#define RB_ROOT_CACHED (struct rb_root_cached) { {NULL, }, NULL }
+
+/* Same as rb_first(), but O(1) */
+#define rb_first_cached(root) (root)->rb_leftmost
+
+static inline void rb_insert_color_cached(struct rb_node *node,
+					  struct rb_root_cached *root,
+					  bool leftmost)
+{
+	if (leftmost)
+		root->rb_leftmost = node;
+	rb_insert_color(node, &root->rb_root);
+}
+
+static inline void rb_erase_cached(struct rb_node *node,
+				   struct rb_root_cached *root)
+{
+	if (root->rb_leftmost == node)
+		root->rb_leftmost = rb_next(node);
+	rb_erase(node, &root->rb_root);
+}
+
+static inline void rb_replace_node_cached(struct rb_node *victim,
+					  struct rb_node *new,
+					  struct rb_root_cached *root)
+{
+	if (root->rb_leftmost == victim)
+		root->rb_leftmost = new;
+	rb_replace_node(victim, new, &root->rb_root);
+}
+
 #endif	/* _LINUX_RBTREE_H */
diff --git a/include/linux/rbtree_augmented.h b/include/linux/rbtree_augmented.h
index 0f902ccb48b0..179faab29f52 100644
--- a/include/linux/rbtree_augmented.h
+++ b/include/linux/rbtree_augmented.h
@@ -30,10 +30,9 @@ struct rb_augment_callbacks {
 	void (*rotate)(struct rb_node *old, struct rb_node *new);
 };
 
-extern void __rb_insert_augmented(struct rb_node *node,
-				  struct rb_root *root,
-				  bool newleft, struct rb_node **leftmost,
+extern void __rb_insert_augmented(struct rb_node *node, struct rb_root *root,
 	void (*augment_rotate)(struct rb_node *old, struct rb_node *new));
+
 /*
  * Fixup the rbtree and update the augmented information when rebalancing.
  *
@@ -48,7 +47,7 @@ static inline void
 rb_insert_augmented(struct rb_node *node, struct rb_root *root,
 		    const struct rb_augment_callbacks *augment)
 {
-	__rb_insert_augmented(node, root, false, NULL, augment->rotate);
+	__rb_insert_augmented(node, root, augment->rotate);
 }
 
 static inline void
@@ -56,8 +55,9 @@ rb_insert_augmented_cached(struct rb_node *node,
 			   struct rb_root_cached *root, bool newleft,
 			   const struct rb_augment_callbacks *augment)
 {
-	__rb_insert_augmented(node, &root->rb_root,
-			      newleft, &root->rb_leftmost, augment->rotate);
+	if (newleft)
+		root->rb_leftmost = node;
+	rb_insert_augmented(node, &root->rb_root, augment);
 }
 
 #define RB_DECLARE_CALLBACKS(rbstatic, rbname, rbstruct, rbfield,	\
@@ -150,7 +150,6 @@ extern void __rb_erase_color(struct rb_node *parent, struct rb_root *root,
 
 static __always_inline struct rb_node *
 __rb_erase_augmented(struct rb_node *node, struct rb_root *root,
-		     struct rb_node **leftmost,
 		     const struct rb_augment_callbacks *augment)
 {
 	struct rb_node *child = node->rb_right;
@@ -158,9 +157,6 @@ __rb_erase_augmented(struct rb_node *node, struct rb_root *root,
 	struct rb_node *parent, *rebalance;
 	unsigned long pc;
 
-	if (leftmost && node == *leftmost)
-		*leftmost = rb_next(node);
-
 	if (!tmp) {
 		/*
 		 * Case 1: node to erase has no more than 1 child (easy!)
@@ -260,8 +256,7 @@ static __always_inline void
 rb_erase_augmented(struct rb_node *node, struct rb_root *root,
 		   const struct rb_augment_callbacks *augment)
 {
-	struct rb_node *rebalance = __rb_erase_augmented(node, root,
-							 NULL, augment);
+	struct rb_node *rebalance = __rb_erase_augmented(node, root, augment);
 	if (rebalance)
 		__rb_erase_color(rebalance, root, augment->rotate);
 }
@@ -270,11 +265,9 @@ static __always_inline void
 rb_erase_augmented_cached(struct rb_node *node, struct rb_root_cached *root,
 			  const struct rb_augment_callbacks *augment)
 {
-	struct rb_node *rebalance = __rb_erase_augmented(node, &root->rb_root,
-							 &root->rb_leftmost,
-							 augment);
-	if (rebalance)
-		__rb_erase_color(rebalance, &root->rb_root, augment->rotate);
+	if (root->rb_leftmost == node)
+		root->rb_leftmost = rb_next(node);
+	rb_erase_augmented(node, &root->rb_root, augment);
 }
 
 #endif	/* _LINUX_RBTREE_AUGMENTED_H */
diff --git a/lib/rbtree.c b/lib/rbtree.c
index 1ef6e25d031c..abc86c6a3177 100644
--- a/lib/rbtree.c
+++ b/lib/rbtree.c
@@ -83,14 +83,10 @@ __rb_rotate_set_parents(struct rb_node *old, struct rb_node *new,
 
 static __always_inline void
 __rb_insert(struct rb_node *node, struct rb_root *root,
-	    bool newleft, struct rb_node **leftmost,
 	    void (*augment_rotate)(struct rb_node *old, struct rb_node *new))
 {
 	struct rb_node *parent = rb_red_parent(node), *gparent, *tmp;
 
-	if (newleft)
-		*leftmost = node;
-
 	while (true) {
 		/*
 		 * Loop invariant: node is red.
@@ -437,38 +433,19 @@ static const struct rb_augment_callbacks dummy_callbacks = {
 
 void rb_insert_color(struct rb_node *node, struct rb_root *root)
 {
-	__rb_insert(node, root, false, NULL, dummy_rotate);
+	__rb_insert(node, root, dummy_rotate);
 }
 EXPORT_SYMBOL(rb_insert_color);
 
 void rb_erase(struct rb_node *node, struct rb_root *root)
 {
 	struct rb_node *rebalance;
-	rebalance = __rb_erase_augmented(node, root,
-					 NULL, &dummy_callbacks);
+	rebalance = __rb_erase_augmented(node, root, &dummy_callbacks);
 	if (rebalance)
 		____rb_erase_color(rebalance, root, dummy_rotate);
 }
 EXPORT_SYMBOL(rb_erase);
 
-void rb_insert_color_cached(struct rb_node *node,
-			    struct rb_root_cached *root, bool leftmost)
-{
-	__rb_insert(node, &root->rb_root, leftmost,
-		    &root->rb_leftmost, dummy_rotate);
-}
-EXPORT_SYMBOL(rb_insert_color_cached);
-
-void rb_erase_cached(struct rb_node *node, struct rb_root_cached *root)
-{
-	struct rb_node *rebalance;
-	rebalance = __rb_erase_augmented(node, &root->rb_root,
-					 &root->rb_leftmost, &dummy_callbacks);
-	if (rebalance)
-		____rb_erase_color(rebalance, &root->rb_root, dummy_rotate);
-}
-EXPORT_SYMBOL(rb_erase_cached);
-
 /*
  * Augmented rbtree manipulation functions.
  *
@@ -477,10 +454,9 @@ EXPORT_SYMBOL(rb_erase_cached);
  */
 
 void __rb_insert_augmented(struct rb_node *node, struct rb_root *root,
-			   bool newleft, struct rb_node **leftmost,
 	void (*augment_rotate)(struct rb_node *old, struct rb_node *new))
 {
-	__rb_insert(node, root, newleft, leftmost, augment_rotate);
+	__rb_insert(node, root, augment_rotate);
 }
 EXPORT_SYMBOL(__rb_insert_augmented);
 
@@ -591,16 +567,6 @@ void rb_replace_node(struct rb_node *victim, struct rb_node *new,
 }
 EXPORT_SYMBOL(rb_replace_node);
 
-void rb_replace_node_cached(struct rb_node *victim, struct rb_node *new,
-			    struct rb_root_cached *root)
-{
-	rb_replace_node(victim, new, &root->rb_root);
-
-	if (root->rb_leftmost == victim)
-		root->rb_leftmost = new;
-}
-EXPORT_SYMBOL(rb_replace_node_cached);
-
 void rb_replace_node_rcu(struct rb_node *victim, struct rb_node *new,
 			 struct rb_root *root)
 {
-- 
cgit v1.2.3


From b98cca444d287a63dd96df04af7fb9793567599e Mon Sep 17 00:00:00 2001
From: Anshuman Khandual <anshuman.khandual@arm.com>
Date: Tue, 16 Jul 2019 16:28:00 -0700
Subject: mm, kprobes: generalize and rename notify_page_fault() as
 kprobe_page_fault()

Architectures which support kprobes have very similar boilerplate around
calling kprobe_fault_handler().  Use a helper function in kprobes.h to
unify them, based on the x86 code.

This changes the behaviour for other architectures when preemption is
enabled.  Previously, they would have disabled preemption while calling
the kprobe handler.  However, preemption would already be disabled if
the fault was due to a kprobe, so when the fault is taken in a
preemptible context we know it was not due to a kprobe handler and can
simply return failure.

This behaviour was introduced on x86 by commit a980c0ef9f6d ("x86/kprobes:
Refactor kprobes_fault() like kprobe_exceptions_notify()").

[anshuman.khandual@arm.com: export kprobe_fault_handler()]
  Link: http://lkml.kernel.org/r/1561133358-8876-1-git-send-email-anshuman.khandual@arm.com
Link: http://lkml.kernel.org/r/1560420444-25737-1-git-send-email-anshuman.khandual@arm.com
Signed-off-by: Anshuman Khandual <anshuman.khandual@arm.com>
Reviewed-by: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Christophe Leroy <christophe.leroy@c-s.fr>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Cc: Andrey Konovalov <andreyknvl@google.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Russell King <linux@armlinux.org.uk>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Yoshinori Sato <ysato@users.sourceforge.jp>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Vineet Gupta <vgupta@synopsys.com>
Cc: James Hogan <jhogan@kernel.org>
Cc: Paul Burton <paul.burton@mips.com>
Cc: Ralf Baechle <ralf@linux-mips.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/arm/mm/fault.c             | 24 +-----------------------
 arch/arm64/mm/fault.c           | 24 +-----------------------
 arch/ia64/mm/fault.c            | 24 +-----------------------
 arch/mips/include/asm/kprobes.h |  1 +
 arch/mips/kernel/kprobes.c      |  2 +-
 arch/powerpc/mm/fault.c         | 23 ++---------------------
 arch/s390/mm/fault.c            | 16 +---------------
 arch/sh/mm/fault.c              | 18 ++----------------
 arch/sparc/mm/fault_64.c        | 16 +---------------
 arch/x86/mm/fault.c             | 21 ++-------------------
 include/linux/kprobes.h         | 19 +++++++++++++++++++
 11 files changed, 32 insertions(+), 156 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm/mm/fault.c b/arch/arm/mm/fault.c
index 0e417233dad7..890eeaac3cbb 100644
--- a/arch/arm/mm/fault.c
+++ b/arch/arm/mm/fault.c
@@ -27,28 +27,6 @@
 
 #ifdef CONFIG_MMU
 
-#ifdef CONFIG_KPROBES
-static inline int notify_page_fault(struct pt_regs *regs, unsigned int fsr)
-{
-	int ret = 0;
-
-	if (!user_mode(regs)) {
-		/* kprobe_running() needs smp_processor_id() */
-		preempt_disable();
-		if (kprobe_running() && kprobe_fault_handler(regs, fsr))
-			ret = 1;
-		preempt_enable();
-	}
-
-	return ret;
-}
-#else
-static inline int notify_page_fault(struct pt_regs *regs, unsigned int fsr)
-{
-	return 0;
-}
-#endif
-
 /*
  * This is useful to dump out the page tables associated with
  * 'addr' in mm 'mm'.
@@ -265,7 +243,7 @@ do_page_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs)
 	vm_fault_t fault;
 	unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
 
-	if (notify_page_fault(regs, fsr))
+	if (kprobe_page_fault(regs, fsr))
 		return 0;
 
 	tsk = current;
diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
index c8c61b1eb479..9568c116ac7f 100644
--- a/arch/arm64/mm/fault.c
+++ b/arch/arm64/mm/fault.c
@@ -59,28 +59,6 @@ static inline const struct fault_info *esr_to_debug_fault_info(unsigned int esr)
 	return debug_fault_info + DBG_ESR_EVT(esr);
 }
 
-#ifdef CONFIG_KPROBES
-static inline int notify_page_fault(struct pt_regs *regs, unsigned int esr)
-{
-	int ret = 0;
-
-	/* kprobe_running() needs smp_processor_id() */
-	if (!user_mode(regs)) {
-		preempt_disable();
-		if (kprobe_running() && kprobe_fault_handler(regs, esr))
-			ret = 1;
-		preempt_enable();
-	}
-
-	return ret;
-}
-#else
-static inline int notify_page_fault(struct pt_regs *regs, unsigned int esr)
-{
-	return 0;
-}
-#endif
-
 static void data_abort_decode(unsigned int esr)
 {
 	pr_alert("Data abort info:\n");
@@ -434,7 +412,7 @@ static int __kprobes do_page_fault(unsigned long addr, unsigned int esr,
 	unsigned long vm_flags = VM_READ | VM_WRITE;
 	unsigned int mm_flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
 
-	if (notify_page_fault(regs, esr))
+	if (kprobe_page_fault(regs, esr))
 		return 0;
 
 	/*
diff --git a/arch/ia64/mm/fault.c b/arch/ia64/mm/fault.c
index 3c3a283d3172..c2f299fe9e04 100644
--- a/arch/ia64/mm/fault.c
+++ b/arch/ia64/mm/fault.c
@@ -21,28 +21,6 @@
 
 extern int die(char *, struct pt_regs *, long);
 
-#ifdef CONFIG_KPROBES
-static inline int notify_page_fault(struct pt_regs *regs, int trap)
-{
-	int ret = 0;
-
-	if (!user_mode(regs)) {
-		/* kprobe_running() needs smp_processor_id() */
-		preempt_disable();
-		if (kprobe_running() && kprobe_fault_handler(regs, trap))
-			ret = 1;
-		preempt_enable();
-	}
-
-	return ret;
-}
-#else
-static inline int notify_page_fault(struct pt_regs *regs, int trap)
-{
-	return 0;
-}
-#endif
-
 /*
  * Return TRUE if ADDRESS points at a page in the kernel's mapped segment
  * (inside region 5, on ia64) and that page is present.
@@ -116,7 +94,7 @@ ia64_do_page_fault (unsigned long address, unsigned long isr, struct pt_regs *re
 	/*
 	 * This is to handle the kprobes on user space access instructions
 	 */
-	if (notify_page_fault(regs, TRAP_BRKPT))
+	if (kprobe_page_fault(regs, TRAP_BRKPT))
 		return;
 
 	if (user_mode(regs))
diff --git a/arch/mips/include/asm/kprobes.h b/arch/mips/include/asm/kprobes.h
index 3cf8e4d5fa28..68b1e5d458cf 100644
--- a/arch/mips/include/asm/kprobes.h
+++ b/arch/mips/include/asm/kprobes.h
@@ -41,6 +41,7 @@ do {									\
 #define kretprobe_blacklist_size 0
 
 void arch_remove_kprobe(struct kprobe *p);
+int kprobe_fault_handler(struct pt_regs *regs, int trapnr);
 
 /* Architecture specific copy of original instruction*/
 struct arch_specific_insn {
diff --git a/arch/mips/kernel/kprobes.c b/arch/mips/kernel/kprobes.c
index 81ba1d3c367c..6cfae2411c04 100644
--- a/arch/mips/kernel/kprobes.c
+++ b/arch/mips/kernel/kprobes.c
@@ -398,7 +398,7 @@ out:
 	return 1;
 }
 
-static inline int kprobe_fault_handler(struct pt_regs *regs, int trapnr)
+int kprobe_fault_handler(struct pt_regs *regs, int trapnr)
 {
 	struct kprobe *cur = kprobe_running();
 	struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
index d989592b6fc8..8432c281de92 100644
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c
@@ -42,26 +42,6 @@
 #include <asm/debug.h>
 #include <asm/kup.h>
 
-static inline bool notify_page_fault(struct pt_regs *regs)
-{
-	bool ret = false;
-
-#ifdef CONFIG_KPROBES
-	/* kprobe_running() needs smp_processor_id() */
-	if (!user_mode(regs)) {
-		preempt_disable();
-		if (kprobe_running() && kprobe_fault_handler(regs, 11))
-			ret = true;
-		preempt_enable();
-	}
-#endif /* CONFIG_KPROBES */
-
-	if (unlikely(debugger_fault_handler(regs)))
-		ret = true;
-
-	return ret;
-}
-
 /*
  * Check whether the instruction inst is a store using
  * an update addressing form which will update r1.
@@ -461,8 +441,9 @@ static int __do_page_fault(struct pt_regs *regs, unsigned long address,
 	int is_write = page_fault_is_write(error_code);
 	vm_fault_t fault, major = 0;
 	bool must_retry = false;
+	bool kprobe_fault = kprobe_page_fault(regs, 11);
 
-	if (notify_page_fault(regs))
+	if (unlikely(debugger_fault_handler(regs) || kprobe_fault))
 		return 0;
 
 	if (unlikely(page_fault_is_bad(error_code))) {
diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c
index 0ba174f779da..63507662828f 100644
--- a/arch/s390/mm/fault.c
+++ b/arch/s390/mm/fault.c
@@ -67,20 +67,6 @@ static int __init fault_init(void)
 }
 early_initcall(fault_init);
 
-static inline int notify_page_fault(struct pt_regs *regs)
-{
-	int ret = 0;
-
-	/* kprobe_running() needs smp_processor_id() */
-	if (kprobes_built_in() && !user_mode(regs)) {
-		preempt_disable();
-		if (kprobe_running() && kprobe_fault_handler(regs, 14))
-			ret = 1;
-		preempt_enable();
-	}
-	return ret;
-}
-
 /*
  * Find out which address space caused the exception.
  */
@@ -412,7 +398,7 @@ static inline vm_fault_t do_exception(struct pt_regs *regs, int access)
 	 */
 	clear_pt_regs_flag(regs, PIF_PER_TRAP);
 
-	if (notify_page_fault(regs))
+	if (kprobe_page_fault(regs, 14))
 		return 0;
 
 	mm = tsk->mm;
diff --git a/arch/sh/mm/fault.c b/arch/sh/mm/fault.c
index 3093bc372138..5f51456f4fc7 100644
--- a/arch/sh/mm/fault.c
+++ b/arch/sh/mm/fault.c
@@ -24,20 +24,6 @@
 #include <asm/tlbflush.h>
 #include <asm/traps.h>
 
-static inline int notify_page_fault(struct pt_regs *regs, int trap)
-{
-	int ret = 0;
-
-	if (kprobes_built_in() && !user_mode(regs)) {
-		preempt_disable();
-		if (kprobe_running() && kprobe_fault_handler(regs, trap))
-			ret = 1;
-		preempt_enable();
-	}
-
-	return ret;
-}
-
 static void
 force_sig_info_fault(int si_signo, int si_code, unsigned long address)
 {
@@ -412,14 +398,14 @@ asmlinkage void __kprobes do_page_fault(struct pt_regs *regs,
 	if (unlikely(fault_in_kernel_space(address))) {
 		if (vmalloc_fault(address) >= 0)
 			return;
-		if (notify_page_fault(regs, vec))
+		if (kprobe_page_fault(regs, vec))
 			return;
 
 		bad_area_nosemaphore(regs, error_code, address);
 		return;
 	}
 
-	if (unlikely(notify_page_fault(regs, vec)))
+	if (unlikely(kprobe_page_fault(regs, vec)))
 		return;
 
 	/* Only enable interrupts if they were on before the fault */
diff --git a/arch/sparc/mm/fault_64.c b/arch/sparc/mm/fault_64.c
index 83fda4d9c3b2..2371fb6b97e4 100644
--- a/arch/sparc/mm/fault_64.c
+++ b/arch/sparc/mm/fault_64.c
@@ -38,20 +38,6 @@
 
 int show_unhandled_signals = 1;
 
-static inline __kprobes int notify_page_fault(struct pt_regs *regs)
-{
-	int ret = 0;
-
-	/* kprobe_running() needs smp_processor_id() */
-	if (kprobes_built_in() && !user_mode(regs)) {
-		preempt_disable();
-		if (kprobe_running() && kprobe_fault_handler(regs, 0))
-			ret = 1;
-		preempt_enable();
-	}
-	return ret;
-}
-
 static void __kprobes unhandled_fault(unsigned long address,
 				      struct task_struct *tsk,
 				      struct pt_regs *regs)
@@ -285,7 +271,7 @@ asmlinkage void __kprobes do_sparc64_fault(struct pt_regs *regs)
 
 	fault_code = get_thread_fault_code();
 
-	if (notify_page_fault(regs))
+	if (kprobe_page_fault(regs, 0))
 		goto exit_exception;
 
 	si_code = SEGV_MAPERR;
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 794f364cb882..d1634c59ed56 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -46,23 +46,6 @@ kmmio_fault(struct pt_regs *regs, unsigned long addr)
 	return 0;
 }
 
-static nokprobe_inline int kprobes_fault(struct pt_regs *regs)
-{
-	if (!kprobes_built_in())
-		return 0;
-	if (user_mode(regs))
-		return 0;
-	/*
-	 * To be potentially processing a kprobe fault and to be allowed to call
-	 * kprobe_running(), we have to be non-preemptible.
-	 */
-	if (preemptible())
-		return 0;
-	if (!kprobe_running())
-		return 0;
-	return kprobe_fault_handler(regs, X86_TRAP_PF);
-}
-
 /*
  * Prefetch quirks:
  *
@@ -1282,7 +1265,7 @@ do_kern_addr_fault(struct pt_regs *regs, unsigned long hw_error_code,
 		return;
 
 	/* kprobes don't want to hook the spurious faults: */
-	if (kprobes_fault(regs))
+	if (kprobe_page_fault(regs, X86_TRAP_PF))
 		return;
 
 	/*
@@ -1313,7 +1296,7 @@ void do_user_addr_fault(struct pt_regs *regs,
 	mm = tsk->mm;
 
 	/* kprobes don't want to hook the spurious faults: */
-	if (unlikely(kprobes_fault(regs)))
+	if (unlikely(kprobe_page_fault(regs, X86_TRAP_PF)))
 		return;
 
 	/*
diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h
index 443d9800ca3f..04bdaf01112c 100644
--- a/include/linux/kprobes.h
+++ b/include/linux/kprobes.h
@@ -458,4 +458,23 @@ static inline bool is_kprobe_optinsn_slot(unsigned long addr)
 }
 #endif
 
+/* Returns true if kprobes handled the fault */
+static nokprobe_inline bool kprobe_page_fault(struct pt_regs *regs,
+					      unsigned int trap)
+{
+	if (!kprobes_built_in())
+		return false;
+	if (user_mode(regs))
+		return false;
+	/*
+	 * To be potentially processing a kprobe fault and to be allowed
+	 * to call kprobe_running(), we have to be non-preemptible.
+	 */
+	if (preemptible())
+		return false;
+	if (!kprobe_running())
+		return false;
+	return kprobe_fault_handler(regs, trap);
+}
+
 #endif /* _LINUX_KPROBES_H */
-- 
cgit v1.2.3


From f90fb3c7e2c13ae829db2274b88b845a75038b8a Mon Sep 17 00:00:00 2001
From: Mikko Rapeli <mikko.rapeli@iki.fi>
Date: Tue, 16 Jul 2019 16:28:10 -0700
Subject: uapi linux/coda_psdev.h: move upc_req definition from uapi to kernel
 side headers

The only users of upc_req are the kernel-side fs/coda/psdev.c and
fs/coda/upcall.c, which already include linux/coda_psdev.h.

Suggested by Jan Harkes <jaharkes@cs.cmu.edu> in
  https://lore.kernel.org/lkml/20150531111913.GA23377@cs.cmu.edu/

Fixes these include/uapi/linux/coda_psdev.h compilation errors in userspace:

  linux/coda_psdev.h:12:19: error: field `uc_chain' has incomplete type
  struct list_head    uc_chain;
                   ^
  linux/coda_psdev.h:13:2: error: unknown type name `caddr_t'
  caddr_t             uc_data;
  ^
  linux/coda_psdev.h:14:2: error: unknown type name `u_short'
  u_short             uc_flags;
  ^
  linux/coda_psdev.h:15:2: error: unknown type name `u_short'
  u_short             uc_inSize;  /* Size is at most 5000 bytes */
  ^
  linux/coda_psdev.h:16:2: error: unknown type name `u_short'
  u_short             uc_outSize;
  ^
  linux/coda_psdev.h:17:2: error: unknown type name `u_short'
  u_short             uc_opcode;  /* copied from data to save lookup */
  ^
  linux/coda_psdev.h:19:2: error: unknown type name `wait_queue_head_t'
  wait_queue_head_t   uc_sleep;   /* process' wait queue */
  ^

Link: http://lkml.kernel.org/r/9f99f5ce6a0563d5266e6cf7aa9585aac2cae971.1558117389.git.jaharkes@cs.cmu.edu
Signed-off-by: Mikko Rapeli <mikko.rapeli@iki.fi>
Signed-off-by: Jan Harkes <jaharkes@cs.cmu.edu>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Colin Ian King <colin.king@canonical.com>
Cc: Dan Carpenter <dan.carpenter@oracle.com>
Cc: David Howells <dhowells@redhat.com>
Cc: Fabian Frederick <fabf@skynet.be>
Cc: Sam Protsenko <semen.protsenko@linaro.org>
Cc: Yann Droneaud <ydroneaud@opteya.com>
Cc: Zhouyang Jia <jiazhouyang09@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/coda_psdev.h      | 11 +++++++++++
 include/uapi/linux/coda_psdev.h | 13 -------------
 2 files changed, 11 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/coda_psdev.h b/include/linux/coda_psdev.h
index 15170954aa2b..57d2b2faf6a3 100644
--- a/include/linux/coda_psdev.h
+++ b/include/linux/coda_psdev.h
@@ -19,6 +19,17 @@ struct venus_comm {
 	struct mutex	    vc_mutex;
 };
 
+/* messages between coda filesystem in kernel and Venus */
+struct upc_req {
+	struct list_head	uc_chain;
+	caddr_t			uc_data;
+	u_short			uc_flags;
+	u_short			uc_inSize;  /* Size is at most 5000 bytes */
+	u_short			uc_outSize;
+	u_short			uc_opcode;  /* copied from data to save lookup */
+	int			uc_unique;
+	wait_queue_head_t	uc_sleep;   /* process' wait queue */
+};
 
 static inline struct venus_comm *coda_vcp(struct super_block *sb)
 {
diff --git a/include/uapi/linux/coda_psdev.h b/include/uapi/linux/coda_psdev.h
index aa6623efd2dd..d50d51a57fe4 100644
--- a/include/uapi/linux/coda_psdev.h
+++ b/include/uapi/linux/coda_psdev.h
@@ -7,19 +7,6 @@
 #define CODA_PSDEV_MAJOR 67
 #define MAX_CODADEVS  5	   /* how many do we allow */
 
-
-/* messages between coda filesystem in kernel and Venus */
-struct upc_req {
-	struct list_head    uc_chain;
-	caddr_t	            uc_data;
-	u_short	            uc_flags;
-	u_short             uc_inSize;  /* Size is at most 5000 bytes */
-	u_short	            uc_outSize;
-	u_short	            uc_opcode;  /* copied from data to save lookup */
-	int		    uc_unique;
-	wait_queue_head_t   uc_sleep;   /* process' wait queue */
-};
-
 #define CODA_REQ_ASYNC  0x1
 #define CODA_REQ_READ   0x2
 #define CODA_REQ_WRITE  0x4
-- 
cgit v1.2.3


From 6e51f8aa76b67d0a6eb168fd41a81e8478ae07a9 Mon Sep 17 00:00:00 2001
From: Jan Harkes <jaharkes@cs.cmu.edu>
Date: Tue, 16 Jul 2019 16:28:16 -0700
Subject: coda: potential buffer overflow in coda_psdev_write()

Add checks to make sure the downcall message we got from the Coda cache
manager is large enough to contain the data it is supposed to have,
e.g. when we get a CODA_ZAPDIR we must be able to safely access
&out->coda_zapdir.CodaFid.

Link: http://lkml.kernel.org/r/894fb6b250add09e4e3935f14649f21284a5cb18.1558117389.git.jaharkes@cs.cmu.edu
Signed-off-by: Jan Harkes <jaharkes@cs.cmu.edu>
Reported-by: Dan Carpenter <dan.carpenter@oracle.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Colin Ian King <colin.king@canonical.com>
Cc: David Howells <dhowells@redhat.com>
Cc: Fabian Frederick <fabf@skynet.be>
Cc: Mikko Rapeli <mikko.rapeli@iki.fi>
Cc: Sam Protsenko <semen.protsenko@linaro.org>
Cc: Yann Droneaud <ydroneaud@opteya.com>
Cc: Zhouyang Jia <jiazhouyang09@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/coda/psdev.c            |  8 ++++++--
 fs/coda/upcall.c           | 34 +++++++++++++++++++++++++++++++++-
 include/linux/coda_psdev.h |  3 ++-
 3 files changed, 41 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/fs/coda/psdev.c b/fs/coda/psdev.c
index 241f7e04ad04..b4da2812499e 100644
--- a/fs/coda/psdev.c
+++ b/fs/coda/psdev.c
@@ -100,8 +100,12 @@ static ssize_t coda_psdev_write(struct file *file, const char __user *buf,
 	ssize_t retval = 0, count = 0;
 	int error;
 
+	/* make sure there is enough to copy out the (opcode, unique) values */
+	if (nbytes < (2 * sizeof(u_int32_t)))
+		return -EINVAL;
+
         /* Peek at the opcode, uniquefier */
-	if (copy_from_user(&hdr, buf, 2 * sizeof(u_long)))
+	if (copy_from_user(&hdr, buf, 2 * sizeof(u_int32_t)))
 	        return -EFAULT;
 
         if (DOWNCALL(hdr.opcode)) {
@@ -127,7 +131,7 @@ static ssize_t coda_psdev_write(struct file *file, const char __user *buf,
 		}
 
 		/* what downcall errors does Venus handle ? */
-		error = coda_downcall(vcp, hdr.opcode, dcbuf);
+		error = coda_downcall(vcp, hdr.opcode, dcbuf, nbytes);
 
 		CODA_FREE(dcbuf, nbytes);
 		if (error) {
diff --git a/fs/coda/upcall.c b/fs/coda/upcall.c
index 1175a1722411..cf1e662681a5 100644
--- a/fs/coda/upcall.c
+++ b/fs/coda/upcall.c
@@ -804,12 +804,44 @@ exit:
  *
  * CODA_REPLACE -- replace one CodaFid with another throughout the name cache */
 
-int coda_downcall(struct venus_comm *vcp, int opcode, union outputArgs *out)
+int coda_downcall(struct venus_comm *vcp, int opcode, union outputArgs *out,
+		  size_t nbytes)
 {
 	struct inode *inode = NULL;
 	struct CodaFid *fid = NULL, *newfid;
 	struct super_block *sb;
 
+	/*
+	 * Make sure we have received enough data from the cache
+	 * manager to populate the necessary fields in the buffer
+	 */
+	switch (opcode) {
+	case CODA_PURGEUSER:
+		if (nbytes < sizeof(struct coda_purgeuser_out))
+			return -EINVAL;
+		break;
+
+	case CODA_ZAPDIR:
+		if (nbytes < sizeof(struct coda_zapdir_out))
+			return -EINVAL;
+		break;
+
+	case CODA_ZAPFILE:
+		if (nbytes < sizeof(struct coda_zapfile_out))
+			return -EINVAL;
+		break;
+
+	case CODA_PURGEFID:
+		if (nbytes < sizeof(struct coda_purgefid_out))
+			return -EINVAL;
+		break;
+
+	case CODA_REPLACE:
+		if (nbytes < sizeof(struct coda_replace_out))
+			return -EINVAL;
+		break;
+	}
+
 	/* Handle invalidation requests. */
 	mutex_lock(&vcp->vc_mutex);
 	sb = vcp->vc_sb;
diff --git a/include/linux/coda_psdev.h b/include/linux/coda_psdev.h
index 57d2b2faf6a3..d1672fd5e638 100644
--- a/include/linux/coda_psdev.h
+++ b/include/linux/coda_psdev.h
@@ -71,7 +71,8 @@ int venus_symlink(struct super_block *sb, struct CodaFid *fid,
 int venus_access(struct super_block *sb, struct CodaFid *fid, int mask);
 int venus_pioctl(struct super_block *sb, struct CodaFid *fid,
 		 unsigned int cmd, struct PioctlData *data);
-int coda_downcall(struct venus_comm *vcp, int opcode, union outputArgs *out);
+int coda_downcall(struct venus_comm *vcp, int opcode, union outputArgs *out,
+		  size_t nbytes);
 int venus_fsync(struct super_block *sb, struct CodaFid *fid);
 int venus_statfs(struct dentry *dentry, struct kstatfs *sfs);
 
-- 
cgit v1.2.3


From b2a57e334086602be56b74958d9f29b955cd157f Mon Sep 17 00:00:00 2001
From: Sam Protsenko <semen.protsenko@linaro.org>
Date: Tue, 16 Jul 2019 16:28:20 -0700
Subject: coda: fix build using bare-metal toolchain

The kernel is a self-contained project and can be built with a bare-metal
toolchain.  But a bare-metal toolchain doesn't define __linux__.  Because
of this, the u_quad_t type is not defined when using a bare-metal
toolchain and the codafs build fails.  This patch fixes it by defining
the u_quad_t type unconditionally.

Link: http://lkml.kernel.org/r/3cbb40b0a57b6f9923a9d67b53473c0b691a3eaa.1558117389.git.jaharkes@cs.cmu.edu
Signed-off-by: Sam Protsenko <semen.protsenko@linaro.org>
Signed-off-by: Jan Harkes <jaharkes@cs.cmu.edu>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Colin Ian King <colin.king@canonical.com>
Cc: Dan Carpenter <dan.carpenter@oracle.com>
Cc: David Howells <dhowells@redhat.com>
Cc: Fabian Frederick <fabf@skynet.be>
Cc: Mikko Rapeli <mikko.rapeli@iki.fi>
Cc: Yann Droneaud <ydroneaud@opteya.com>
Cc: Zhouyang Jia <jiazhouyang09@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/coda.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/coda.h b/include/linux/coda.h
index d30209b9cef8..0ca0c83fdb1c 100644
--- a/include/linux/coda.h
+++ b/include/linux/coda.h
@@ -58,8 +58,7 @@ Mellon the rights to redistribute these changes without encumbrance.
 #ifndef _CODA_HEADER_
 #define _CODA_HEADER_
 
-#if defined(__linux__)
 typedef unsigned long long u_quad_t;
-#endif
+
 #include <uapi/linux/coda.h>
 #endif 
-- 
cgit v1.2.3


From 2fe7491d219428a32f09948e88bfaf8e71b9a66b Mon Sep 17 00:00:00 2001
From: Jan Harkes <jaharkes@cs.cmu.edu>
Date: Tue, 16 Jul 2019 16:28:26 -0700
Subject: uapi linux/coda_psdev.h: move CODA_REQ_ from uapi to kernel side
 headers

These constants are only used internally and are not exposed to userspace.

Link: http://lkml.kernel.org/r/baeafc30dad70d8b422ee679420099c2d8aa7da0.1558117389.git.jaharkes@cs.cmu.edu
Signed-off-by: Jan Harkes <jaharkes@cs.cmu.edu>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Colin Ian King <colin.king@canonical.com>
Cc: Dan Carpenter <dan.carpenter@oracle.com>
Cc: David Howells <dhowells@redhat.com>
Cc: Fabian Frederick <fabf@skynet.be>
Cc: Mikko Rapeli <mikko.rapeli@iki.fi>
Cc: Sam Protsenko <semen.protsenko@linaro.org>
Cc: Yann Droneaud <ydroneaud@opteya.com>
Cc: Zhouyang Jia <jiazhouyang09@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/coda_psdev.h      | 5 +++++
 include/uapi/linux/coda_psdev.h | 5 -----
 2 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/coda_psdev.h b/include/linux/coda_psdev.h
index d1672fd5e638..9487f792770c 100644
--- a/include/linux/coda_psdev.h
+++ b/include/linux/coda_psdev.h
@@ -31,6 +31,11 @@ struct upc_req {
 	wait_queue_head_t	uc_sleep;   /* process' wait queue */
 };
 
+#define CODA_REQ_ASYNC  0x1
+#define CODA_REQ_READ   0x2
+#define CODA_REQ_WRITE  0x4
+#define CODA_REQ_ABORT  0x8
+
 static inline struct venus_comm *coda_vcp(struct super_block *sb)
 {
 	return (struct venus_comm *)((sb)->s_fs_info);
diff --git a/include/uapi/linux/coda_psdev.h b/include/uapi/linux/coda_psdev.h
index d50d51a57fe4..3dacb7fad66a 100644
--- a/include/uapi/linux/coda_psdev.h
+++ b/include/uapi/linux/coda_psdev.h
@@ -7,9 +7,4 @@
 #define CODA_PSDEV_MAJOR 67
 #define MAX_CODADEVS  5	   /* how many do we allow */
 
-#define CODA_REQ_ASYNC  0x1
-#define CODA_REQ_READ   0x2
-#define CODA_REQ_WRITE  0x4
-#define CODA_REQ_ABORT  0x8
-
 #endif /* _UAPI__CODA_PSDEV_H */
-- 
cgit v1.2.3


From 8fc8b9df831387e0d02c1d0f5bb53d327e0d477a Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Tue, 16 Jul 2019 16:28:47 -0700
Subject: coda: move internal defs out of include/linux/ [ver #2]

Move include/linux/coda_psdev.h to fs/coda/ as there's nothing else that
uses it.

Link: http://lkml.kernel.org/r/3ceeee0415a929b89fb02700b6b4b3a07938acb8.1558117389.git.jaharkes@cs.cmu.edu
Link: https://patchwork.kernel.org/patch/10590257/
Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Jan Harkes <jaharkes@cs.cmu.edu>
Cc: Yann Droneaud <ydroneaud@opteya.com>
Cc: Jan Harkes <jaharkes@cs.cmu.edu>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Colin Ian King <colin.king@canonical.com>
Cc: Dan Carpenter <dan.carpenter@oracle.com>
Cc: Fabian Frederick <fabf@skynet.be>
Cc: Mikko Rapeli <mikko.rapeli@iki.fi>
Cc: Sam Protsenko <semen.protsenko@linaro.org>
Cc: Zhouyang Jia <jiazhouyang09@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/coda/cache.c            |  2 +-
 fs/coda/cnode.c            |  2 +-
 fs/coda/coda_linux.c       |  2 +-
 fs/coda/coda_psdev.h       | 89 ++++++++++++++++++++++++++++++++++++++++++++++
 fs/coda/dir.c              |  2 +-
 fs/coda/file.c             |  3 +-
 fs/coda/inode.c            |  2 +-
 fs/coda/pioctl.c           |  3 +-
 fs/coda/psdev.c            |  3 +-
 fs/coda/symlink.c          |  3 +-
 fs/coda/upcall.c           |  2 +-
 include/linux/coda_psdev.h | 89 ----------------------------------------------
 12 files changed, 99 insertions(+), 103 deletions(-)
 create mode 100644 fs/coda/coda_psdev.h
 delete mode 100644 include/linux/coda_psdev.h

(limited to 'include/linux')

diff --git a/fs/coda/cache.c b/fs/coda/cache.c
index 201fc08a8b4f..3b8c4513118f 100644
--- a/fs/coda/cache.c
+++ b/fs/coda/cache.c
@@ -21,7 +21,7 @@
 #include <linux/spinlock.h>
 
 #include <linux/coda.h>
-#include <linux/coda_psdev.h>
+#include "coda_psdev.h"
 #include "coda_linux.h"
 #include "coda_cache.h"
 
diff --git a/fs/coda/cnode.c b/fs/coda/cnode.c
index 845b5a66952a..2e5badf67f98 100644
--- a/fs/coda/cnode.c
+++ b/fs/coda/cnode.c
@@ -8,8 +8,8 @@
 #include <linux/time.h>
 
 #include <linux/coda.h>
-#include <linux/coda_psdev.h>
 #include <linux/pagemap.h>
+#include "coda_psdev.h"
 #include "coda_linux.h"
 
 static inline int coda_fideq(struct CodaFid *fid1, struct CodaFid *fid2)
diff --git a/fs/coda/coda_linux.c b/fs/coda/coda_linux.c
index e4b5f02f0dd4..2e1a5a192074 100644
--- a/fs/coda/coda_linux.c
+++ b/fs/coda/coda_linux.c
@@ -18,7 +18,7 @@
 #include <linux/string.h>
 
 #include <linux/coda.h>
-#include <linux/coda_psdev.h>
+#include "coda_psdev.h"
 #include "coda_linux.h"
 
 /* initialize the debugging variables */
diff --git a/fs/coda/coda_psdev.h b/fs/coda/coda_psdev.h
new file mode 100644
index 000000000000..012e16f741a6
--- /dev/null
+++ b/fs/coda/coda_psdev.h
@@ -0,0 +1,89 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __CODA_PSDEV_H
+#define __CODA_PSDEV_H
+
+#include <linux/backing-dev.h>
+#include <linux/mutex.h>
+#include <linux/coda_psdev.h>
+
+struct kstatfs;
+
+/* messages between coda filesystem in kernel and Venus */
+struct upc_req {
+	struct list_head	uc_chain;
+	caddr_t			uc_data;
+	u_short			uc_flags;
+	u_short			uc_inSize;  /* Size is at most 5000 bytes */
+	u_short			uc_outSize;
+	u_short			uc_opcode;  /* copied from data to save lookup */
+	int			uc_unique;
+	wait_queue_head_t	uc_sleep;   /* process' wait queue */
+};
+
+#define CODA_REQ_ASYNC  0x1
+#define CODA_REQ_READ   0x2
+#define CODA_REQ_WRITE  0x4
+#define CODA_REQ_ABORT  0x8
+
+/* communication pending/processing queues */
+struct venus_comm {
+	u_long		    vc_seq;
+	wait_queue_head_t   vc_waitq; /* Venus wait queue */
+	struct list_head    vc_pending;
+	struct list_head    vc_processing;
+	int                 vc_inuse;
+	struct super_block *vc_sb;
+	struct mutex	    vc_mutex;
+};
+
+static inline struct venus_comm *coda_vcp(struct super_block *sb)
+{
+	return (struct venus_comm *)((sb)->s_fs_info);
+}
+
+/* upcalls */
+int venus_rootfid(struct super_block *sb, struct CodaFid *fidp);
+int venus_getattr(struct super_block *sb, struct CodaFid *fid,
+		  struct coda_vattr *attr);
+int venus_setattr(struct super_block *, struct CodaFid *, struct coda_vattr *);
+int venus_lookup(struct super_block *sb, struct CodaFid *fid,
+		 const char *name, int length, int *type,
+		 struct CodaFid *resfid);
+int venus_close(struct super_block *sb, struct CodaFid *fid, int flags,
+		kuid_t uid);
+int venus_open(struct super_block *sb, struct CodaFid *fid, int flags,
+	       struct file **f);
+int venus_mkdir(struct super_block *sb, struct CodaFid *dirfid,
+		const char *name, int length,
+		struct CodaFid *newfid, struct coda_vattr *attrs);
+int venus_create(struct super_block *sb, struct CodaFid *dirfid,
+		 const char *name, int length, int excl, int mode,
+		 struct CodaFid *newfid, struct coda_vattr *attrs);
+int venus_rmdir(struct super_block *sb, struct CodaFid *dirfid,
+		const char *name, int length);
+int venus_remove(struct super_block *sb, struct CodaFid *dirfid,
+		 const char *name, int length);
+int venus_readlink(struct super_block *sb, struct CodaFid *fid,
+		   char *buffer, int *length);
+int venus_rename(struct super_block *sb, struct CodaFid *new_fid,
+		 struct CodaFid *old_fid, size_t old_length,
+		 size_t new_length, const char *old_name,
+		 const char *new_name);
+int venus_link(struct super_block *sb, struct CodaFid *fid,
+		  struct CodaFid *dirfid, const char *name, int len );
+int venus_symlink(struct super_block *sb, struct CodaFid *fid,
+		  const char *name, int len, const char *symname, int symlen);
+int venus_access(struct super_block *sb, struct CodaFid *fid, int mask);
+int venus_pioctl(struct super_block *sb, struct CodaFid *fid,
+		 unsigned int cmd, struct PioctlData *data);
+int coda_downcall(struct venus_comm *vcp, int opcode, union outputArgs *out,
+		  size_t nbytes);
+int venus_fsync(struct super_block *sb, struct CodaFid *fid);
+int venus_statfs(struct dentry *dentry, struct kstatfs *sfs);
+
+/*
+ * Statistics
+ */
+
+extern struct venus_comm coda_comms[];
+#endif
diff --git a/fs/coda/dir.c b/fs/coda/dir.c
index 7e103eb8ffcd..716a0b932ec0 100644
--- a/fs/coda/dir.c
+++ b/fs/coda/dir.c
@@ -23,7 +23,7 @@
 #include <linux/uaccess.h>
 
 #include <linux/coda.h>
-#include <linux/coda_psdev.h>
+#include "coda_psdev.h"
 #include "coda_linux.h"
 #include "coda_cache.h"
 
diff --git a/fs/coda/file.c b/fs/coda/file.c
index 43d371551d2b..a6b32c883a50 100644
--- a/fs/coda/file.c
+++ b/fs/coda/file.c
@@ -22,8 +22,7 @@
 #include <linux/uaccess.h>
 
 #include <linux/coda.h>
-#include <linux/coda_psdev.h>
-
+#include "coda_psdev.h"
 #include "coda_linux.h"
 #include "coda_int.h"
 
diff --git a/fs/coda/inode.c b/fs/coda/inode.c
index 23f6ebd08e80..96d832ed23b5 100644
--- a/fs/coda/inode.c
+++ b/fs/coda/inode.c
@@ -27,7 +27,7 @@
 #include <linux/vmalloc.h>
 
 #include <linux/coda.h>
-#include <linux/coda_psdev.h>
+#include "coda_psdev.h"
 #include "coda_linux.h"
 #include "coda_cache.h"
 
diff --git a/fs/coda/pioctl.c b/fs/coda/pioctl.c
index e0c17b7dccce..644d48c12ce8 100644
--- a/fs/coda/pioctl.c
+++ b/fs/coda/pioctl.c
@@ -20,8 +20,7 @@
 #include <linux/uaccess.h>
 
 #include <linux/coda.h>
-#include <linux/coda_psdev.h>
-
+#include "coda_psdev.h"
 #include "coda_linux.h"
 
 /* pioctl ops */
diff --git a/fs/coda/psdev.c b/fs/coda/psdev.c
index e80bda1de6c5..0a61e949a430 100644
--- a/fs/coda/psdev.c
+++ b/fs/coda/psdev.c
@@ -38,8 +38,7 @@
 #include <linux/uaccess.h>
 
 #include <linux/coda.h>
-#include <linux/coda_psdev.h>
-
+#include "coda_psdev.h"
 #include "coda_linux.h"
 
 #include "coda_int.h"
diff --git a/fs/coda/symlink.c b/fs/coda/symlink.c
index 202297d156df..8907d0508198 100644
--- a/fs/coda/symlink.c
+++ b/fs/coda/symlink.c
@@ -17,8 +17,7 @@
 #include <linux/pagemap.h>
 
 #include <linux/coda.h>
-#include <linux/coda_psdev.h>
-
+#include "coda_psdev.h"
 #include "coda_linux.h"
 
 static int coda_symlink_filler(struct file *file, struct page *page)
diff --git a/fs/coda/upcall.c b/fs/coda/upcall.c
index 1e2f50722107..eb8cc30f2589 100644
--- a/fs/coda/upcall.c
+++ b/fs/coda/upcall.c
@@ -33,7 +33,7 @@
 #include <linux/vfs.h>
 
 #include <linux/coda.h>
-#include <linux/coda_psdev.h>
+#include "coda_psdev.h"
 #include "coda_linux.h"
 #include "coda_cache.h"
 
diff --git a/include/linux/coda_psdev.h b/include/linux/coda_psdev.h
deleted file mode 100644
index 9487f792770c..000000000000
--- a/include/linux/coda_psdev.h
+++ /dev/null
@@ -1,89 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef __CODA_PSDEV_H
-#define __CODA_PSDEV_H
-
-#include <linux/backing-dev.h>
-#include <linux/mutex.h>
-#include <uapi/linux/coda_psdev.h>
-
-struct kstatfs;
-
-/* communication pending/processing queues */
-struct venus_comm {
-	u_long		    vc_seq;
-	wait_queue_head_t   vc_waitq; /* Venus wait queue */
-	struct list_head    vc_pending;
-	struct list_head    vc_processing;
-	int                 vc_inuse;
-	struct super_block *vc_sb;
-	struct mutex	    vc_mutex;
-};
-
-/* messages between coda filesystem in kernel and Venus */
-struct upc_req {
-	struct list_head	uc_chain;
-	caddr_t			uc_data;
-	u_short			uc_flags;
-	u_short			uc_inSize;  /* Size is at most 5000 bytes */
-	u_short			uc_outSize;
-	u_short			uc_opcode;  /* copied from data to save lookup */
-	int			uc_unique;
-	wait_queue_head_t	uc_sleep;   /* process' wait queue */
-};
-
-#define CODA_REQ_ASYNC  0x1
-#define CODA_REQ_READ   0x2
-#define CODA_REQ_WRITE  0x4
-#define CODA_REQ_ABORT  0x8
-
-static inline struct venus_comm *coda_vcp(struct super_block *sb)
-{
-	return (struct venus_comm *)((sb)->s_fs_info);
-}
-
-/* upcalls */
-int venus_rootfid(struct super_block *sb, struct CodaFid *fidp);
-int venus_getattr(struct super_block *sb, struct CodaFid *fid,
-		  struct coda_vattr *attr);
-int venus_setattr(struct super_block *, struct CodaFid *, struct coda_vattr *);
-int venus_lookup(struct super_block *sb, struct CodaFid *fid, 
-		 const char *name, int length, int *type, 
-		 struct CodaFid *resfid);
-int venus_close(struct super_block *sb, struct CodaFid *fid, int flags,
-		kuid_t uid);
-int venus_open(struct super_block *sb, struct CodaFid *fid, int flags,
-	       struct file **f);
-int venus_mkdir(struct super_block *sb, struct CodaFid *dirfid, 
-		const char *name, int length, 
-		struct CodaFid *newfid, struct coda_vattr *attrs);
-int venus_create(struct super_block *sb, struct CodaFid *dirfid, 
-		 const char *name, int length, int excl, int mode,
-		 struct CodaFid *newfid, struct coda_vattr *attrs) ;
-int venus_rmdir(struct super_block *sb, struct CodaFid *dirfid, 
-		const char *name, int length);
-int venus_remove(struct super_block *sb, struct CodaFid *dirfid, 
-		 const char *name, int length);
-int venus_readlink(struct super_block *sb, struct CodaFid *fid, 
-		   char *buffer, int *length);
-int venus_rename(struct super_block *, struct CodaFid *new_fid, 
-		 struct CodaFid *old_fid, size_t old_length, 
-		 size_t new_length, const char *old_name, 
-		 const char *new_name);
-int venus_link(struct super_block *sb, struct CodaFid *fid, 
-		  struct CodaFid *dirfid, const char *name, int len );
-int venus_symlink(struct super_block *sb, struct CodaFid *fid,
-		  const char *name, int len, const char *symname, int symlen);
-int venus_access(struct super_block *sb, struct CodaFid *fid, int mask);
-int venus_pioctl(struct super_block *sb, struct CodaFid *fid,
-		 unsigned int cmd, struct PioctlData *data);
-int coda_downcall(struct venus_comm *vcp, int opcode, union outputArgs *out,
-		  size_t nbytes);
-int venus_fsync(struct super_block *sb, struct CodaFid *fid);
-int venus_statfs(struct dentry *dentry, struct kstatfs *sfs);
-
-/*
- * Statistics
- */
-
-extern struct venus_comm coda_comms[];
-#endif
-- 
cgit v1.2.3


From 201766a20e30f982ccfe36bebfad9602c3ff574a Mon Sep 17 00:00:00 2001
From: Elvira Khabirova <lineprinter@altlinux.org>
Date: Tue, 16 Jul 2019 16:29:42 -0700
Subject: ptrace: add PTRACE_GET_SYSCALL_INFO request

PTRACE_GET_SYSCALL_INFO is a generic ptrace API that lets ptracer obtain
details of the syscall the tracee is blocked in.

There are two reasons for a special syscall-related ptrace request.

Firstly, with the current ptrace API there are cases when ptracer cannot
retrieve necessary information about syscalls.  Some examples include:

 * The notorious int-0x80-from-64-bit-task issue. See [1] for details.
   In short, if a 64-bit task performs a syscall through int 0x80, its
   tracer has no reliable means to find out that the syscall was, in
   fact, a compat syscall, and misidentifies it.

 * Syscall-enter-stop and syscall-exit-stop look the same for the
   tracer. Common practice is to keep track of the sequence of
   ptrace-stops in order not to mix the two syscall-stops up. But it is
   not as simple as it looks; for example, strace had a (just recently
   fixed) long-standing bug where attaching strace to a tracee that is
   performing the execve system call led to the tracer identifying the
   following syscall-exit-stop as syscall-enter-stop, which messed up
   all the state tracking.

 * Since the introduction of commit 84d77d3f06e7 ("ptrace: Don't allow
   accessing an undumpable mm"), both PTRACE_PEEKDATA and
   process_vm_readv become unavailable when the process dumpable flag is
   cleared. On such architectures as ia64 this results in all syscall
   arguments being unavailable for the tracer.

Secondly, ptracers also have to support a lot of arch-specific code for
obtaining information about the tracee.  For some architectures, this
requires a ptrace(PTRACE_PEEKUSER, ...) invocation for every syscall
argument and return value.

ptrace(2) man page:

long ptrace(enum __ptrace_request request, pid_t pid,
            void *addr, void *data);
...
PTRACE_GET_SYSCALL_INFO
       Retrieve information about the syscall that caused the stop.
       The information is placed into the buffer pointed to by the "data"
       argument, which should be a pointer to a buffer of type
       "struct ptrace_syscall_info".
       The "addr" argument contains the size of the buffer pointed to
       by "data" argument (i.e., sizeof(struct ptrace_syscall_info)).
       The return value contains the number of bytes available
       to be written by the kernel.
       If the size of data to be written by the kernel exceeds the size
       specified by "addr" argument, the output is truncated.

[ldv@altlinux.org: selftests/seccomp/seccomp_bpf: update for PTRACE_GET_SYSCALL_INFO]
  Link: http://lkml.kernel.org/r/20190708182904.GA12332@altlinux.org
Link: http://lkml.kernel.org/r/20190510152842.GF28558@altlinux.org
Signed-off-by: Elvira Khabirova <lineprinter@altlinux.org>
Co-developed-by: Dmitry V. Levin <ldv@altlinux.org>
Signed-off-by: Dmitry V. Levin <ldv@altlinux.org>
Reviewed-by: Oleg Nesterov <oleg@redhat.com>
Reviewed-by: Kees Cook <keescook@chromium.org>
Reviewed-by: Andy Lutomirski <luto@kernel.org>
Cc: Eugene Syromyatnikov <esyr@redhat.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Greentime Hu <greentime@andestech.com>
Cc: Helge Deller <deller@gmx.de>	[parisc]
Cc: James E.J. Bottomley <jejb@parisc-linux.org>
Cc: James Hogan <jhogan@kernel.org>
Cc: kbuild test robot <lkp@intel.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Paul Burton <paul.burton@mips.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: Richard Kuo <rkuo@codeaurora.org>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Vincent Chen <deanbo422@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/tracehook.h                     |   9 ++-
 include/uapi/linux/ptrace.h                   |  35 +++++++++
 kernel/ptrace.c                               | 101 +++++++++++++++++++++++++-
 tools/testing/selftests/seccomp/seccomp_bpf.c |  13 +++-
 4 files changed, 150 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/tracehook.h b/include/linux/tracehook.h
index 8446573cc682..36fb3bbed6b2 100644
--- a/include/linux/tracehook.h
+++ b/include/linux/tracehook.h
@@ -54,13 +54,15 @@ struct linux_binprm;
 /*
  * ptrace report for syscall entry and exit looks identical.
  */
-static inline int ptrace_report_syscall(struct pt_regs *regs)
+static inline int ptrace_report_syscall(struct pt_regs *regs,
+					unsigned long message)
 {
 	int ptrace = current->ptrace;
 
 	if (!(ptrace & PT_PTRACED))
 		return 0;
 
+	current->ptrace_message = message;
 	ptrace_notify(SIGTRAP | ((ptrace & PT_TRACESYSGOOD) ? 0x80 : 0));
 
 	/*
@@ -73,6 +75,7 @@ static inline int ptrace_report_syscall(struct pt_regs *regs)
 		current->exit_code = 0;
 	}
 
+	current->ptrace_message = 0;
 	return fatal_signal_pending(current);
 }
 
@@ -98,7 +101,7 @@ static inline int ptrace_report_syscall(struct pt_regs *regs)
 static inline __must_check int tracehook_report_syscall_entry(
 	struct pt_regs *regs)
 {
-	return ptrace_report_syscall(regs);
+	return ptrace_report_syscall(regs, PTRACE_EVENTMSG_SYSCALL_ENTRY);
 }
 
 /**
@@ -123,7 +126,7 @@ static inline void tracehook_report_syscall_exit(struct pt_regs *regs, int step)
 	if (step)
 		user_single_step_report(regs);
 	else
-		ptrace_report_syscall(regs);
+		ptrace_report_syscall(regs, PTRACE_EVENTMSG_SYSCALL_EXIT);
 }
 
 /**
diff --git a/include/uapi/linux/ptrace.h b/include/uapi/linux/ptrace.h
index d5a1b8a492b9..a71b6e3b03eb 100644
--- a/include/uapi/linux/ptrace.h
+++ b/include/uapi/linux/ptrace.h
@@ -73,6 +73,41 @@ struct seccomp_metadata {
 	__u64 flags;		/* Output: filter's flags */
 };
 
+#define PTRACE_GET_SYSCALL_INFO		0x420e
+#define PTRACE_SYSCALL_INFO_NONE	0
+#define PTRACE_SYSCALL_INFO_ENTRY	1
+#define PTRACE_SYSCALL_INFO_EXIT	2
+#define PTRACE_SYSCALL_INFO_SECCOMP	3
+
+struct ptrace_syscall_info {
+	__u8 op;	/* PTRACE_SYSCALL_INFO_* */
+	__u32 arch __attribute__((__aligned__(sizeof(__u32))));
+	__u64 instruction_pointer;
+	__u64 stack_pointer;
+	union {
+		struct {
+			__u64 nr;
+			__u64 args[6];
+		} entry;
+		struct {
+			__s64 rval;
+			__u8 is_error;
+		} exit;
+		struct {
+			__u64 nr;
+			__u64 args[6];
+			__u32 ret_data;
+		} seccomp;
+	};
+};
+
+/*
+ * These values are stored in task->ptrace_message
+ * by tracehook_report_syscall_* to describe the current syscall-stop.
+ */
+#define PTRACE_EVENTMSG_SYSCALL_ENTRY	1
+#define PTRACE_EVENTMSG_SYSCALL_EXIT	2
+
 /* Read signals from a shared (process wide) queue */
 #define PTRACE_PEEKSIGINFO_SHARED	(1 << 0)
 
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 83a531cea2f3..cb9ddcc08119 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -32,6 +32,8 @@
 #include <linux/compat.h>
 #include <linux/sched/signal.h>
 
+#include <asm/syscall.h>	/* for syscall_get_* */
+
 /*
  * Access another process' address space via ptrace.
  * Source/target buffer must be kernel space,
@@ -897,7 +899,100 @@ static int ptrace_regset(struct task_struct *task, int req, unsigned int type,
  * to ensure no machine forgets it.
  */
 EXPORT_SYMBOL_GPL(task_user_regset_view);
-#endif
+
+static unsigned long
+ptrace_get_syscall_info_entry(struct task_struct *child, struct pt_regs *regs,
+			      struct ptrace_syscall_info *info)
+{
+	unsigned long args[ARRAY_SIZE(info->entry.args)];
+	int i;
+
+	info->op = PTRACE_SYSCALL_INFO_ENTRY;
+	info->entry.nr = syscall_get_nr(child, regs);
+	syscall_get_arguments(child, regs, args);
+	for (i = 0; i < ARRAY_SIZE(args); i++)
+		info->entry.args[i] = args[i];
+
+	/* args is the last field in struct ptrace_syscall_info.entry */
+	return offsetofend(struct ptrace_syscall_info, entry.args);
+}
+
+static unsigned long
+ptrace_get_syscall_info_seccomp(struct task_struct *child, struct pt_regs *regs,
+				struct ptrace_syscall_info *info)
+{
+	/*
+	 * As struct ptrace_syscall_info.entry is currently a subset
+	 * of struct ptrace_syscall_info.seccomp, it makes sense to
+	 * initialize that subset using ptrace_get_syscall_info_entry().
+	 * This can be reconsidered in the future if these structures
+	 * diverge significantly enough.
+	 */
+	ptrace_get_syscall_info_entry(child, regs, info);
+	info->op = PTRACE_SYSCALL_INFO_SECCOMP;
+	info->seccomp.ret_data = child->ptrace_message;
+
+	/* ret_data is the last field in struct ptrace_syscall_info.seccomp */
+	return offsetofend(struct ptrace_syscall_info, seccomp.ret_data);
+}
+
+static unsigned long
+ptrace_get_syscall_info_exit(struct task_struct *child, struct pt_regs *regs,
+			     struct ptrace_syscall_info *info)
+{
+	info->op = PTRACE_SYSCALL_INFO_EXIT;
+	info->exit.rval = syscall_get_error(child, regs);
+	info->exit.is_error = !!info->exit.rval;
+	if (!info->exit.is_error)
+		info->exit.rval = syscall_get_return_value(child, regs);
+
+	/* is_error is the last field in struct ptrace_syscall_info.exit */
+	return offsetofend(struct ptrace_syscall_info, exit.is_error);
+}
+
+static int
+ptrace_get_syscall_info(struct task_struct *child, unsigned long user_size,
+			void __user *datavp)
+{
+	struct pt_regs *regs = task_pt_regs(child);
+	struct ptrace_syscall_info info = {
+		.op = PTRACE_SYSCALL_INFO_NONE,
+		.arch = syscall_get_arch(child),
+		.instruction_pointer = instruction_pointer(regs),
+		.stack_pointer = user_stack_pointer(regs),
+	};
+	unsigned long actual_size = offsetof(struct ptrace_syscall_info, entry);
+	unsigned long write_size;
+
+	/*
+	 * This does not need lock_task_sighand() to access
+	 * child->last_siginfo because ptrace_freeze_traced()
+	 * called earlier by ptrace_check_attach() ensures that
+	 * the tracee cannot go away and clear its last_siginfo.
+	 */
+	switch (child->last_siginfo ? child->last_siginfo->si_code : 0) {
+	case SIGTRAP | 0x80:
+		switch (child->ptrace_message) {
+		case PTRACE_EVENTMSG_SYSCALL_ENTRY:
+			actual_size = ptrace_get_syscall_info_entry(child, regs,
+								    &info);
+			break;
+		case PTRACE_EVENTMSG_SYSCALL_EXIT:
+			actual_size = ptrace_get_syscall_info_exit(child, regs,
+								   &info);
+			break;
+		}
+		break;
+	case SIGTRAP | (PTRACE_EVENT_SECCOMP << 8):
+		actual_size = ptrace_get_syscall_info_seccomp(child, regs,
+							      &info);
+		break;
+	}
+
+	write_size = min(actual_size, user_size);
+	return copy_to_user(datavp, &info, write_size) ? -EFAULT : actual_size;
+}
+#endif /* CONFIG_HAVE_ARCH_TRACEHOOK */
 
 int ptrace_request(struct task_struct *child, long request,
 		   unsigned long addr, unsigned long data)
@@ -1114,6 +1209,10 @@ int ptrace_request(struct task_struct *child, long request,
 			ret = __put_user(kiov.iov_len, &uiov->iov_len);
 		break;
 	}
+
+	case PTRACE_GET_SYSCALL_INFO:
+		ret = ptrace_get_syscall_info(child, addr, datavp);
+		break;
 #endif
 
 	case PTRACE_SECCOMP_GET_FILTER:
diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c b/tools/testing/selftests/seccomp/seccomp_bpf.c
index dc66fe852768..6ef7f16c4cf5 100644
--- a/tools/testing/selftests/seccomp/seccomp_bpf.c
+++ b/tools/testing/selftests/seccomp/seccomp_bpf.c
@@ -1775,13 +1775,18 @@ void tracer_ptrace(struct __test_metadata *_metadata, pid_t tracee,
 	unsigned long msg;
 	static bool entry;
 
-	/* Make sure we got an empty message. */
+	/*
+	 * The traditional way to tell PTRACE_SYSCALL entry/exit
+	 * is by counting.
+	 */
+	entry = !entry;
+
+	/* Make sure we got an appropriate message. */
 	ret = ptrace(PTRACE_GETEVENTMSG, tracee, NULL, &msg);
 	EXPECT_EQ(0, ret);
-	EXPECT_EQ(0, msg);
+	EXPECT_EQ(entry ? PTRACE_EVENTMSG_SYSCALL_ENTRY
+			: PTRACE_EVENTMSG_SYSCALL_EXIT, msg);
 
-	/* The only way to tell PTRACE_SYSCALL entry/exit is by counting. */
-	entry = !entry;
 	if (!entry)
 		return;
 
-- 
cgit v1.2.3


From e2d9018e81ba9357d3bb8bddc0ee58d460d092fe Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Tue, 16 Jul 2019 16:29:50 -0700
Subject: signal: reorder struct sighand_struct

struct sighand_struct::siglock field is the most used field by far, put
it first so that it can be accessed without IMM8 or IMM32 encoding on
x86_64.

Space savings (on trimmed down VM test config):

add/remove: 0/0 grow/shrink: 8/68 up/down: 49/-1147 (-1098)
Function                                     old     new   delta
complete_signal                              512     533     +21
do_signalfd4                                 335     346     +11
__cleanup_sighand                             39      43      +4
unhandled_signal                              49      52      +3
prepare_signal                               692     695      +3
ignore_signals                                37      40      +3
__tty_check_change.part                      248     251      +3
ksys_unshare                                 780     781      +1
sighand_ctor                                  33      29      -4
ptrace_trap_notify                            60      56      -4
sigqueue_free                                 98      91      -7
run_posix_cpu_timers                        1389    1382      -7
proc_pid_status                             2448    2441      -7
proc_pid_limits                              344     337      -7
posix_cpu_timer_rearm                        222     215      -7
posix_cpu_timer_get                          249     242      -7
kill_pid_info_as_cred                        243     236      -7
freeze_task                                  197     190      -7
flush_old_exec                              1873    1866      -7
do_task_stat                                3363    3356      -7
do_send_sig_info                              98      91      -7
do_group_exit                                147     140      -7
init_sighand                                2088    2080      -8
do_notify_parent_cldstop                     399     391      -8
signalfd_cleanup                              50      41      -9
do_notify_parent                             557     545     -12
__send_signal                               1029    1017     -12
ptrace_stop                                  590     577     -13
get_signal                                  1576    1563     -13
__lock_task_sighand                          112      99     -13
zap_pid_ns_processes                         391     377     -14
update_rlimit_cpu                             78      64     -14
tty_signal_session_leader                    413     399     -14
tty_open_proc_set_tty                        149     135     -14
tty_jobctrl_ioctl                            936     922     -14
set_cpu_itimer                               339     325     -14
ptrace_resume                                226     212     -14
ptrace_notify                                110      96     -14
proc_clear_tty                                81      67     -14
posix_cpu_timer_del                          229     215     -14
kernel_sigaction                             156     142     -14
getrusage                                    977     963     -14
get_current_tty                               98      84     -14
force_sigsegv                                 89      75     -14
force_sig_info                               205     191     -14
flush_signals                                 83      69     -14
flush_itimer_signals                          85      71     -14
do_timer_create                             1120    1106     -14
do_sigpending                                 88      74     -14
do_signal_stop                               537     523     -14
cgroup_init_fs_context                       644     630     -14
call_usermodehelper_exec_async               402     388     -14
calculate_sigpending                          58      44     -14
__x64_sys_timer_delete                       248     234     -14
__set_current_blocked                         80      66     -14
__ptrace_unlink                              310     296     -14
__ptrace_detach.part                         187     173     -14
send_sigqueue                                362     347     -15
get_cpu_itimer                               214     199     -15
signalfd_poll                                175     159     -16
dequeue_signal                               340     323     -17
do_getitimer                                 192     174     -18
release_task.part                           1060    1040     -20
ptrace_peek_siginfo                          408     387     -21
posix_cpu_timer_set                          827     806     -21
exit_signals                                 437     416     -21
do_sigaction                                 541     520     -21
do_setitimer                                 485     464     -21
disassociate_ctty.part                       545     517     -28
__x64_sys_rt_sigtimedwait                    721     679     -42
__x64_sys_ptrace                            1319    1277     -42
ptrace_request                              1828    1782     -46
signalfd_read                                507     459     -48
wait_consider_task                          2027    1971     -56
do_coredump                                 3672    3616     -56
copy_process.part                           6936    6871     -65

Link: http://lkml.kernel.org/r/20190503192800.GA18004@avx2
Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/sched/signal.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h
index 532458698bde..01add55a609b 100644
--- a/include/linux/sched/signal.h
+++ b/include/linux/sched/signal.h
@@ -15,10 +15,10 @@
  */
 
 struct sighand_struct {
-	refcount_t		count;
-	struct k_sigaction	action[_NSIG];
 	spinlock_t		siglock;
+	refcount_t		count;
 	wait_queue_head_t	signalfd_wqh;
+	struct k_sigaction	action[_NSIG];
 };
 
 /*
-- 
cgit v1.2.3


From b772434be0891ed1081a08ae7cfd4666728f8e82 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Tue, 16 Jul 2019 16:29:53 -0700
Subject: signal: simplify set_user_sigmask/restore_user_sigmask

task->saved_sigmask and ->restore_sigmask are only used in the ret-from-
syscall paths.  This means that set_user_sigmask() can save ->blocked in
->saved_sigmask and do set_restore_sigmask() to indicate that ->blocked
was modified.

This way the callers do not need 2 sigset_t's passed to set/restore and
restore_user_sigmask() renamed to restore_saved_sigmask_unless() turns
into the trivial helper which just calls restore_saved_sigmask().

Link: http://lkml.kernel.org/r/20190606113206.GA9464@redhat.com
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Cc: Deepa Dinamani <deepa.kernel@gmail.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: Eric Wong <e@80x24.org>
Cc: Jason Baron <jbaron@akamai.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Al Viro <viro@ZenIV.linux.org.uk>
Cc: Eric W. Biederman <ebiederm@xmission.com>
Cc: David Laight <David.Laight@aculab.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/aio.c                     | 20 +++++--------
 fs/eventpoll.c               | 12 +++-----
 fs/io_uring.c                | 11 ++-----
 fs/select.c                  | 34 ++++++++--------------
 include/linux/compat.h       |  3 +-
 include/linux/sched/signal.h | 12 ++++++--
 include/linux/signal.h       |  4 ---
 kernel/signal.c              | 69 ++++++++++++--------------------------------
 8 files changed, 57 insertions(+), 108 deletions(-)

(limited to 'include/linux')

diff --git a/fs/aio.c b/fs/aio.c
index 2d405733a8c6..8327db0c8e08 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -2094,7 +2094,6 @@ SYSCALL_DEFINE6(io_pgetevents,
 		const struct __aio_sigset __user *, usig)
 {
 	struct __aio_sigset	ksig = { NULL, };
-	sigset_t		ksigmask, sigsaved;
 	struct timespec64	ts;
 	bool interrupted;
 	int ret;
@@ -2105,14 +2104,14 @@ SYSCALL_DEFINE6(io_pgetevents,
 	if (usig && copy_from_user(&ksig, usig, sizeof(ksig)))
 		return -EFAULT;
 
-	ret = set_user_sigmask(ksig.sigmask, &ksigmask, &sigsaved, ksig.sigsetsize);
+	ret = set_user_sigmask(ksig.sigmask, ksig.sigsetsize);
 	if (ret)
 		return ret;
 
 	ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &ts : NULL);
 
 	interrupted = signal_pending(current);
-	restore_user_sigmask(ksig.sigmask, &sigsaved, interrupted);
+	restore_saved_sigmask_unless(interrupted);
 	if (interrupted && !ret)
 		ret = -ERESTARTNOHAND;
 
@@ -2130,7 +2129,6 @@ SYSCALL_DEFINE6(io_pgetevents_time32,
 		const struct __aio_sigset __user *, usig)
 {
 	struct __aio_sigset	ksig = { NULL, };
-	sigset_t		ksigmask, sigsaved;
 	struct timespec64	ts;
 	bool interrupted;
 	int ret;
@@ -2142,14 +2140,14 @@ SYSCALL_DEFINE6(io_pgetevents_time32,
 		return -EFAULT;
 
 
-	ret = set_user_sigmask(ksig.sigmask, &ksigmask, &sigsaved, ksig.sigsetsize);
+	ret = set_user_sigmask(ksig.sigmask, ksig.sigsetsize);
 	if (ret)
 		return ret;
 
 	ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &ts : NULL);
 
 	interrupted = signal_pending(current);
-	restore_user_sigmask(ksig.sigmask, &sigsaved, interrupted);
+	restore_saved_sigmask_unless(interrupted);
 	if (interrupted && !ret)
 		ret = -ERESTARTNOHAND;
 
@@ -2198,7 +2196,6 @@ COMPAT_SYSCALL_DEFINE6(io_pgetevents,
 		const struct __compat_aio_sigset __user *, usig)
 {
 	struct __compat_aio_sigset ksig = { NULL, };
-	sigset_t ksigmask, sigsaved;
 	struct timespec64 t;
 	bool interrupted;
 	int ret;
@@ -2209,14 +2206,14 @@ COMPAT_SYSCALL_DEFINE6(io_pgetevents,
 	if (usig && copy_from_user(&ksig, usig, sizeof(ksig)))
 		return -EFAULT;
 
-	ret = set_compat_user_sigmask(ksig.sigmask, &ksigmask, &sigsaved, ksig.sigsetsize);
+	ret = set_compat_user_sigmask(ksig.sigmask, ksig.sigsetsize);
 	if (ret)
 		return ret;
 
 	ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &t : NULL);
 
 	interrupted = signal_pending(current);
-	restore_user_sigmask(ksig.sigmask, &sigsaved, interrupted);
+	restore_saved_sigmask_unless(interrupted);
 	if (interrupted && !ret)
 		ret = -ERESTARTNOHAND;
 
@@ -2234,7 +2231,6 @@ COMPAT_SYSCALL_DEFINE6(io_pgetevents_time64,
 		const struct __compat_aio_sigset __user *, usig)
 {
 	struct __compat_aio_sigset ksig = { NULL, };
-	sigset_t ksigmask, sigsaved;
 	struct timespec64 t;
 	bool interrupted;
 	int ret;
@@ -2245,14 +2241,14 @@ COMPAT_SYSCALL_DEFINE6(io_pgetevents_time64,
 	if (usig && copy_from_user(&ksig, usig, sizeof(ksig)))
 		return -EFAULT;
 
-	ret = set_compat_user_sigmask(ksig.sigmask, &ksigmask, &sigsaved, ksig.sigsetsize);
+	ret = set_compat_user_sigmask(ksig.sigmask, ksig.sigsetsize);
 	if (ret)
 		return ret;
 
 	ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &t : NULL);
 
 	interrupted = signal_pending(current);
-	restore_user_sigmask(ksig.sigmask, &sigsaved, interrupted);
+	restore_saved_sigmask_unless(interrupted);
 	if (interrupted && !ret)
 		ret = -ERESTARTNOHAND;
 
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 4c74c768ae43..0f9c073d78d5 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -2313,19 +2313,17 @@ SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct epoll_event __user *, events,
 		size_t, sigsetsize)
 {
 	int error;
-	sigset_t ksigmask, sigsaved;
 
 	/*
 	 * If the caller wants a certain signal mask to be set during the wait,
 	 * we apply it here.
 	 */
-	error = set_user_sigmask(sigmask, &ksigmask, &sigsaved, sigsetsize);
+	error = set_user_sigmask(sigmask, sigsetsize);
 	if (error)
 		return error;
 
 	error = do_epoll_wait(epfd, events, maxevents, timeout);
-
-	restore_user_sigmask(sigmask, &sigsaved, error == -EINTR);
+	restore_saved_sigmask_unless(error == -EINTR);
 
 	return error;
 }
@@ -2338,19 +2336,17 @@ COMPAT_SYSCALL_DEFINE6(epoll_pwait, int, epfd,
 			compat_size_t, sigsetsize)
 {
 	long err;
-	sigset_t ksigmask, sigsaved;
 
 	/*
 	 * If the caller wants a certain signal mask to be set during the wait,
 	 * we apply it here.
 	 */
-	err = set_compat_user_sigmask(sigmask, &ksigmask, &sigsaved, sigsetsize);
+	err = set_compat_user_sigmask(sigmask, sigsetsize);
 	if (err)
 		return err;
 
 	err = do_epoll_wait(epfd, events, maxevents, timeout);
-
-	restore_user_sigmask(sigmask, &sigsaved, err == -EINTR);
+	restore_saved_sigmask_unless(err == -EINTR);
 
 	return err;
 }
diff --git a/fs/io_uring.c b/fs/io_uring.c
index d682049c07b2..e2a66e12fbc6 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -2400,7 +2400,6 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
 			  const sigset_t __user *sig, size_t sigsz)
 {
 	struct io_cq_ring *ring = ctx->cq_ring;
-	sigset_t ksigmask, sigsaved;
 	int ret;
 
 	if (io_cqring_events(ring) >= min_events)
@@ -2410,21 +2409,17 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
 #ifdef CONFIG_COMPAT
 		if (in_compat_syscall())
 			ret = set_compat_user_sigmask((const compat_sigset_t __user *)sig,
-						      &ksigmask, &sigsaved, sigsz);
+						      sigsz);
 		else
 #endif
-			ret = set_user_sigmask(sig, &ksigmask,
-					       &sigsaved, sigsz);
+			ret = set_user_sigmask(sig, sigsz);
 
 		if (ret)
 			return ret;
 	}
 
 	ret = wait_event_interruptible(ctx->wait, io_cqring_events(ring) >= min_events);
-
-	if (sig)
-		restore_user_sigmask(sig, &sigsaved, ret == -ERESTARTSYS);
-
+	restore_saved_sigmask_unless(ret == -ERESTARTSYS);
 	if (ret == -ERESTARTSYS)
 		ret = -EINTR;
 
diff --git a/fs/select.c b/fs/select.c
index a4d8f6e8b63c..1fc1b247fede 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -730,7 +730,6 @@ static long do_pselect(int n, fd_set __user *inp, fd_set __user *outp,
 		       const sigset_t __user *sigmask, size_t sigsetsize,
 		       enum poll_time_type type)
 {
-	sigset_t ksigmask, sigsaved;
 	struct timespec64 ts, end_time, *to = NULL;
 	int ret;
 
@@ -753,12 +752,12 @@ static long do_pselect(int n, fd_set __user *inp, fd_set __user *outp,
 			return -EINVAL;
 	}
 
-	ret = set_user_sigmask(sigmask, &ksigmask, &sigsaved, sigsetsize);
+	ret = set_user_sigmask(sigmask, sigsetsize);
 	if (ret)
 		return ret;
 
 	ret = core_sys_select(n, inp, outp, exp, to);
-	restore_user_sigmask(sigmask, &sigsaved, ret == -ERESTARTNOHAND);
+	restore_saved_sigmask_unless(ret == -ERESTARTNOHAND);
 	ret = poll_select_copy_remaining(&end_time, tsp, type, ret);
 
 	return ret;
@@ -1086,7 +1085,6 @@ SYSCALL_DEFINE5(ppoll, struct pollfd __user *, ufds, unsigned int, nfds,
 		struct __kernel_timespec __user *, tsp, const sigset_t __user *, sigmask,
 		size_t, sigsetsize)
 {
-	sigset_t ksigmask, sigsaved;
 	struct timespec64 ts, end_time, *to = NULL;
 	int ret;
 
@@ -1099,17 +1097,16 @@ SYSCALL_DEFINE5(ppoll, struct pollfd __user *, ufds, unsigned int, nfds,
 			return -EINVAL;
 	}
 
-	ret = set_user_sigmask(sigmask, &ksigmask, &sigsaved, sigsetsize);
+	ret = set_user_sigmask(sigmask, sigsetsize);
 	if (ret)
 		return ret;
 
 	ret = do_sys_poll(ufds, nfds, to);
 
-	restore_user_sigmask(sigmask, &sigsaved, ret == -EINTR);
+	restore_saved_sigmask_unless(ret == -EINTR);
 	/* We can restart this syscall, usually */
 	if (ret == -EINTR)
 		ret = -ERESTARTNOHAND;
-
 	ret = poll_select_copy_remaining(&end_time, tsp, PT_TIMESPEC, ret);
 
 	return ret;
@@ -1121,7 +1118,6 @@ SYSCALL_DEFINE5(ppoll_time32, struct pollfd __user *, ufds, unsigned int, nfds,
 		struct old_timespec32 __user *, tsp, const sigset_t __user *, sigmask,
 		size_t, sigsetsize)
 {
-	sigset_t ksigmask, sigsaved;
 	struct timespec64 ts, end_time, *to = NULL;
 	int ret;
 
@@ -1134,17 +1130,16 @@ SYSCALL_DEFINE5(ppoll_time32, struct pollfd __user *, ufds, unsigned int, nfds,
 			return -EINVAL;
 	}
 
-	ret = set_user_sigmask(sigmask, &ksigmask, &sigsaved, sigsetsize);
+	ret = set_user_sigmask(sigmask, sigsetsize);
 	if (ret)
 		return ret;
 
 	ret = do_sys_poll(ufds, nfds, to);
 
-	restore_user_sigmask(sigmask, &sigsaved, ret == -EINTR);
+	restore_saved_sigmask_unless(ret == -EINTR);
 	/* We can restart this syscall, usually */
 	if (ret == -EINTR)
 		ret = -ERESTARTNOHAND;
-
 	ret = poll_select_copy_remaining(&end_time, tsp, PT_OLD_TIMESPEC, ret);
 
 	return ret;
@@ -1319,7 +1314,6 @@ static long do_compat_pselect(int n, compat_ulong_t __user *inp,
 	void __user *tsp, compat_sigset_t __user *sigmask,
 	compat_size_t sigsetsize, enum poll_time_type type)
 {
-	sigset_t ksigmask, sigsaved;
 	struct timespec64 ts, end_time, *to = NULL;
 	int ret;
 
@@ -1342,12 +1336,12 @@ static long do_compat_pselect(int n, compat_ulong_t __user *inp,
 			return -EINVAL;
 	}
 
-	ret = set_compat_user_sigmask(sigmask, &ksigmask, &sigsaved, sigsetsize);
+	ret = set_compat_user_sigmask(sigmask, sigsetsize);
 	if (ret)
 		return ret;
 
 	ret = compat_core_sys_select(n, inp, outp, exp, to);
-	restore_user_sigmask(sigmask, &sigsaved, ret == -ERESTARTNOHAND);
+	restore_saved_sigmask_unless(ret == -ERESTARTNOHAND);
 	ret = poll_select_copy_remaining(&end_time, tsp, type, ret);
 
 	return ret;
@@ -1402,7 +1396,6 @@ COMPAT_SYSCALL_DEFINE5(ppoll_time32, struct pollfd __user *, ufds,
 	unsigned int,  nfds, struct old_timespec32 __user *, tsp,
 	const compat_sigset_t __user *, sigmask, compat_size_t, sigsetsize)
 {
-	sigset_t ksigmask, sigsaved;
 	struct timespec64 ts, end_time, *to = NULL;
 	int ret;
 
@@ -1415,17 +1408,16 @@ COMPAT_SYSCALL_DEFINE5(ppoll_time32, struct pollfd __user *, ufds,
 			return -EINVAL;
 	}
 
-	ret = set_compat_user_sigmask(sigmask, &ksigmask, &sigsaved, sigsetsize);
+	ret = set_compat_user_sigmask(sigmask, sigsetsize);
 	if (ret)
 		return ret;
 
 	ret = do_sys_poll(ufds, nfds, to);
 
-	restore_user_sigmask(sigmask, &sigsaved, ret == -EINTR);
+	restore_saved_sigmask_unless(ret == -EINTR);
 	/* We can restart this syscall, usually */
 	if (ret == -EINTR)
 		ret = -ERESTARTNOHAND;
-
 	ret = poll_select_copy_remaining(&end_time, tsp, PT_OLD_TIMESPEC, ret);
 
 	return ret;
@@ -1437,7 +1429,6 @@ COMPAT_SYSCALL_DEFINE5(ppoll_time64, struct pollfd __user *, ufds,
 	unsigned int,  nfds, struct __kernel_timespec __user *, tsp,
 	const compat_sigset_t __user *, sigmask, compat_size_t, sigsetsize)
 {
-	sigset_t ksigmask, sigsaved;
 	struct timespec64 ts, end_time, *to = NULL;
 	int ret;
 
@@ -1450,17 +1441,16 @@ COMPAT_SYSCALL_DEFINE5(ppoll_time64, struct pollfd __user *, ufds,
 			return -EINVAL;
 	}
 
-	ret = set_compat_user_sigmask(sigmask, &ksigmask, &sigsaved, sigsetsize);
+	ret = set_compat_user_sigmask(sigmask, sigsetsize);
 	if (ret)
 		return ret;
 
 	ret = do_sys_poll(ufds, nfds, to);
 
-	restore_user_sigmask(sigmask, &sigsaved, ret == -EINTR);
+	restore_saved_sigmask_unless(ret == -EINTR);
 	/* We can restart this syscall, usually */
 	if (ret == -EINTR)
 		ret = -ERESTARTNOHAND;
-
 	ret = poll_select_copy_remaining(&end_time, tsp, PT_TIMESPEC, ret);
 
 	return ret;
diff --git a/include/linux/compat.h b/include/linux/compat.h
index ebddcb6cfcf8..16dafd9f4b86 100644
--- a/include/linux/compat.h
+++ b/include/linux/compat.h
@@ -138,8 +138,7 @@ typedef struct {
 	compat_sigset_word	sig[_COMPAT_NSIG_WORDS];
 } compat_sigset_t;
 
-int set_compat_user_sigmask(const compat_sigset_t __user *usigmask,
-			    sigset_t *set, sigset_t *oldset,
+int set_compat_user_sigmask(const compat_sigset_t __user *umask,
 			    size_t sigsetsize);
 
 struct compat_sigaction {
diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h
index 01add55a609b..efd8ce7675ed 100644
--- a/include/linux/sched/signal.h
+++ b/include/linux/sched/signal.h
@@ -420,7 +420,6 @@ void task_join_group_stop(struct task_struct *task);
 static inline void set_restore_sigmask(void)
 {
 	set_thread_flag(TIF_RESTORE_SIGMASK);
-	WARN_ON(!test_thread_flag(TIF_SIGPENDING));
 }
 
 static inline void clear_tsk_restore_sigmask(struct task_struct *task)
@@ -451,7 +450,6 @@ static inline bool test_and_clear_restore_sigmask(void)
 static inline void set_restore_sigmask(void)
 {
 	current->restore_sigmask = true;
-	WARN_ON(!test_thread_flag(TIF_SIGPENDING));
 }
 static inline void clear_tsk_restore_sigmask(struct task_struct *task)
 {
@@ -484,6 +482,16 @@ static inline void restore_saved_sigmask(void)
 		__set_current_blocked(&current->saved_sigmask);
 }
 
+extern int set_user_sigmask(const sigset_t __user *umask, size_t sigsetsize);
+
+static inline void restore_saved_sigmask_unless(bool interrupted)
+{
+	if (interrupted)
+		WARN_ON(!test_thread_flag(TIF_SIGPENDING));
+	else
+		restore_saved_sigmask();
+}
+
 static inline sigset_t *sigmask_to_save(void)
 {
 	sigset_t *res = &current->blocked;
diff --git a/include/linux/signal.h b/include/linux/signal.h
index 78c2bb376954..b5d99482d3fe 100644
--- a/include/linux/signal.h
+++ b/include/linux/signal.h
@@ -273,10 +273,6 @@ extern int group_send_sig_info(int sig, struct kernel_siginfo *info,
 			       struct task_struct *p, enum pid_type type);
 extern int __group_send_sig_info(int, struct kernel_siginfo *, struct task_struct *);
 extern int sigprocmask(int, sigset_t *, sigset_t *);
-extern int set_user_sigmask(const sigset_t __user *usigmask, sigset_t *set,
-	sigset_t *oldset, size_t sigsetsize);
-extern void restore_user_sigmask(const void __user *usigmask,
-				 sigset_t *sigsaved, bool interrupted);
 extern void set_current_blocked(sigset_t *);
 extern void __set_current_blocked(const sigset_t *);
 extern int show_unhandled_signals;
diff --git a/kernel/signal.c b/kernel/signal.c
index dabe100d2091..91b789dd6e72 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2951,80 +2951,49 @@ EXPORT_SYMBOL(sigprocmask);
  *
  * This is useful for syscalls such as ppoll, pselect, io_pgetevents and
  * epoll_pwait where a new sigmask is passed from userland for the syscalls.
+ *
+ * Note that it does set_restore_sigmask() in advance, so it must be always
+ * paired with restore_saved_sigmask_unless() before return from syscall.
  */
-int set_user_sigmask(const sigset_t __user *usigmask, sigset_t *set,
-		     sigset_t *oldset, size_t sigsetsize)
+int set_user_sigmask(const sigset_t __user *umask, size_t sigsetsize)
 {
-	if (!usigmask)
-		return 0;
+	sigset_t kmask;
 
+	if (!umask)
+		return 0;
 	if (sigsetsize != sizeof(sigset_t))
 		return -EINVAL;
-	if (copy_from_user(set, usigmask, sizeof(sigset_t)))
+	if (copy_from_user(&kmask, umask, sizeof(sigset_t)))
 		return -EFAULT;
 
-	*oldset = current->blocked;
-	set_current_blocked(set);
+	set_restore_sigmask();
+	current->saved_sigmask = current->blocked;
+	set_current_blocked(&kmask);
 
 	return 0;
 }
-EXPORT_SYMBOL(set_user_sigmask);
 
 #ifdef CONFIG_COMPAT
-int set_compat_user_sigmask(const compat_sigset_t __user *usigmask,
-			    sigset_t *set, sigset_t *oldset,
+int set_compat_user_sigmask(const compat_sigset_t __user *umask,
 			    size_t sigsetsize)
 {
-	if (!usigmask)
-		return 0;
+	sigset_t kmask;
 
+	if (!umask)
+		return 0;
 	if (sigsetsize != sizeof(compat_sigset_t))
 		return -EINVAL;
-	if (get_compat_sigset(set, usigmask))
+	if (get_compat_sigset(&kmask, umask))
 		return -EFAULT;
 
-	*oldset = current->blocked;
-	set_current_blocked(set);
+	set_restore_sigmask();
+	current->saved_sigmask = current->blocked;
+	set_current_blocked(&kmask);
 
 	return 0;
 }
-EXPORT_SYMBOL(set_compat_user_sigmask);
 #endif
 
-/*
- * restore_user_sigmask:
- * usigmask: sigmask passed in from userland.
- * sigsaved: saved sigmask when the syscall started and changed the sigmask to
- *           usigmask.
- *
- * This is useful for syscalls such as ppoll, pselect, io_pgetevents and
- * epoll_pwait where a new sigmask is passed in from userland for the syscalls.
- */
-void restore_user_sigmask(const void __user *usigmask, sigset_t *sigsaved,
-				bool interrupted)
-{
-
-	if (!usigmask)
-		return;
-	/*
-	 * When signals are pending, do not restore them here.
-	 * Restoring sigmask here can lead to delivering signals that the above
-	 * syscalls are intended to block because of the sigmask passed in.
-	 */
-	if (interrupted) {
-		current->saved_sigmask = *sigsaved;
-		set_restore_sigmask();
-		return;
-	}
-
-	/*
-	 * This is needed because the fast syscall return path does not restore
-	 * saved_sigmask when signals are not pending.
-	 */
-	set_current_blocked(sigsaved);
-}
-EXPORT_SYMBOL(restore_user_sigmask);
-
 /**
  *  sys_rt_sigprocmask - change the list of currently blocked signals
  *  @how: whether to add, remove, or set signals
-- 
cgit v1.2.3


From f57e515a1b56325a28a0972c632a623a9c84590c Mon Sep 17 00:00:00 2001
From: "Joel Fernandes (Google)" <joel@joelfernandes.org>
Date: Tue, 16 Jul 2019 16:30:06 -0700
Subject: kernel/pid.c: convert struct pid count to refcount_t

struct pid's count is an atomic_t field used as a refcount.  Use
refcount_t for it which is basically atomic_t but does additional
checking to prevent use-after-free bugs.

For memory ordering, the only change is with the following:

 -	if ((atomic_read(&pid->count) == 1) ||
 -	     atomic_dec_and_test(&pid->count)) {
 +	if (refcount_dec_and_test(&pid->count)) {
 		kmem_cache_free(ns->pid_cachep, pid);

Here the change is from: Fully ordered --> RELEASE + ACQUIRE (as per
refcount-vs-atomic.rst).  This ACQUIRE should take care of making sure the
free happens after the refcount_dec_and_test().

The above hunk also removes atomic_read() since it is not needed for the
code to work and it is unclear how beneficial it is.  The removal lets
refcount_dec_and_test() check for cases where get_pid() happened before
the object was freed.

Link: http://lkml.kernel.org/r/20190701183826.191936-1-joel@joelfernandes.org
Signed-off-by: Joel Fernandes (Google) <joel@joelfernandes.org>
Reviewed-by: Andrea Parri <andrea.parri@amarulasolutions.com>
Reviewed-by: Kees Cook <keescook@chromium.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Elena Reshetova <elena.reshetova@intel.com>
Cc: Jann Horn <jannh@google.com>
Cc: Eric W. Biederman <ebiederm@xmission.com>
Cc: KJ Tsanaktsidis <ktsanaktsidis@zendesk.com>
Cc: Michal Hocko <mhocko@suse.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/pid.h | 5 +++--
 kernel/pid.c        | 9 ++++-----
 2 files changed, 7 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/pid.h b/include/linux/pid.h
index 1484db6ca8d1..2a83e434db9d 100644
--- a/include/linux/pid.h
+++ b/include/linux/pid.h
@@ -4,6 +4,7 @@
 
 #include <linux/rculist.h>
 #include <linux/wait.h>
+#include <linux/refcount.h>
 
 enum pid_type
 {
@@ -57,7 +58,7 @@ struct upid {
 
 struct pid
 {
-	atomic_t count;
+	refcount_t count;
 	unsigned int level;
 	/* lists of tasks that use this pid */
 	struct hlist_head tasks[PIDTYPE_MAX];
@@ -74,7 +75,7 @@ extern const struct file_operations pidfd_fops;
 static inline struct pid *get_pid(struct pid *pid)
 {
 	if (pid)
-		atomic_inc(&pid->count);
+		refcount_inc(&pid->count);
 	return pid;
 }
 
diff --git a/kernel/pid.c b/kernel/pid.c
index 16263b526560..0a9f2e437217 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -37,14 +37,14 @@
 #include <linux/init_task.h>
 #include <linux/syscalls.h>
 #include <linux/proc_ns.h>
-#include <linux/proc_fs.h>
+#include <linux/refcount.h>
 #include <linux/anon_inodes.h>
 #include <linux/sched/signal.h>
 #include <linux/sched/task.h>
 #include <linux/idr.h>
 
 struct pid init_struct_pid = {
-	.count 		= ATOMIC_INIT(1),
+	.count		= REFCOUNT_INIT(1),
 	.tasks		= {
 		{ .first = NULL },
 		{ .first = NULL },
@@ -108,8 +108,7 @@ void put_pid(struct pid *pid)
 		return;
 
 	ns = pid->numbers[pid->level].ns;
-	if ((atomic_read(&pid->count) == 1) ||
-	     atomic_dec_and_test(&pid->count)) {
+	if (refcount_dec_and_test(&pid->count)) {
 		kmem_cache_free(ns->pid_cachep, pid);
 		put_pid_ns(ns);
 	}
@@ -212,7 +211,7 @@ struct pid *alloc_pid(struct pid_namespace *ns)
 	}
 
 	get_pid_ns(ns);
-	atomic_set(&pid->count, 1);
+	refcount_set(&pid->count, 1);
 	for (type = 0; type < PIDTYPE_MAX; ++type)
 		INIT_HLIST_HEAD(&pid->tasks[type]);
 
-- 
cgit v1.2.3


From 97a0efea657e986322b09b99016b3f7d2ce37021 Mon Sep 17 00:00:00 2001
From: Tom Levy <tomlevy93@gmail.com>
Date: Tue, 16 Jul 2019 16:30:24 -0700
Subject: include/linux/lz4.h: fix spelling and copy-paste errors in
 documentation

Fix a few spelling and grammar errors, and two places where fast/safe in
the documentation did not match the function.

Link: http://lkml.kernel.org/r/20190321014452.13297-1-tomlevy93@gmail.com
Signed-off-by: Tom Levy <tomlevy93@gmail.com>
Reviewed-by: Andrew Morton <akpm@linux-foundation.org>
Cc: Jiri Kosina <trivial@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/lz4.h | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/lz4.h b/include/linux/lz4.h
index 394e3d9213b8..b16e15b9587a 100644
--- a/include/linux/lz4.h
+++ b/include/linux/lz4.h
@@ -278,7 +278,7 @@ int LZ4_decompress_fast(const char *source, char *dest, int originalSize);
  * @compressedSize: is the precise full size of the compressed block
  * @maxDecompressedSize: is the size of 'dest' buffer
  *
- * Decompresses data fom 'source' into 'dest'.
+ * Decompresses data from 'source' into 'dest'.
  * If the source stream is detected malformed, the function will
  * stop decoding and return a negative result.
  * This function is protected against buffer overflow exploits,
@@ -522,7 +522,7 @@ int LZ4_setStreamDecode(LZ4_streamDecode_t *LZ4_streamDecode,
 	const char *dictionary, int dictSize);
 
 /**
- * LZ4_decompress_fast_continue() - Decompress blocks in streaming mode
+ * LZ4_decompress_safe_continue() - Decompress blocks in streaming mode
  * @LZ4_streamDecode: the 'LZ4_streamDecode_t' structure
  * @source: source address of the compressed data
  * @dest: output buffer address of the uncompressed data
@@ -530,7 +530,7 @@ int LZ4_setStreamDecode(LZ4_streamDecode_t *LZ4_streamDecode,
  * @compressedSize: is the precise full size of the compressed block
  * @maxDecompressedSize: is the size of 'dest' buffer
  *
- * These decoding function allows decompression of multiple blocks
+ * This decoding function allows decompression of multiple blocks
  * in "streaming" mode.
  * Previously decoded blocks *must* remain available at the memory position
  * where they were decoded (up to 64 KB)
@@ -569,7 +569,7 @@ int LZ4_decompress_safe_continue(LZ4_streamDecode_t *LZ4_streamDecode,
  *	which must be already allocated with 'originalSize' bytes
  * @originalSize: is the original and therefore uncompressed size
  *
- * These decoding function allows decompression of multiple blocks
+ * This decoding function allows decompression of multiple blocks
  * in "streaming" mode.
  * Previously decoded blocks *must* remain available at the memory position
  * where they were decoded (up to 64 KB)
@@ -610,10 +610,10 @@ int LZ4_decompress_fast_continue(LZ4_streamDecode_t *LZ4_streamDecode,
  * @dictStart: pointer to the start of the dictionary in memory
  * @dictSize: size of dictionary
  *
- * These decoding function works the same as
+ * This decoding function works the same as
  * a combination of LZ4_setStreamDecode() followed by
  * LZ4_decompress_safe_continue()
- * It is stand-alone, and don'tn eed a LZ4_streamDecode_t structure.
+ * It is stand-alone, and doesn't need an LZ4_streamDecode_t structure.
  *
  * Return: number of bytes decompressed into destination buffer
  *	(necessarily <= maxDecompressedSize)
@@ -633,10 +633,10 @@ int LZ4_decompress_safe_usingDict(const char *source, char *dest,
  * @dictStart: pointer to the start of the dictionary in memory
  * @dictSize: size of dictionary
  *
- * These decoding function works the same as
+ * This decoding function works the same as
  * a combination of LZ4_setStreamDecode() followed by
- * LZ4_decompress_safe_continue()
- * It is stand-alone, and don'tn eed a LZ4_streamDecode_t structure.
+ * LZ4_decompress_fast_continue()
+ * It is stand-alone, and doesn't need an LZ4_streamDecode_t structure.
  *
  * Return: number of bytes decompressed into destination buffer
  *	(necessarily <= maxDecompressedSize)
-- 
cgit v1.2.3


From eca499ab3749a4537dee77ffead47a1a2c0dee19 Mon Sep 17 00:00:00 2001
From: Pavel Tatashin <pasha.tatashin@soleen.com>
Date: Tue, 16 Jul 2019 16:30:31 -0700
Subject: mm/hotplug: make remove_memory() interface usable
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Presently the remove_memory() interface is inherently broken.  It tries
to remove memory but panics if some memory is not offline.  The problem
is that it is impossible to ensure that all memory blocks are offline as
this function also takes lock_device_hotplug that is required to change
memory state via sysfs.

So, between calling this function and offlining all memory blocks there
is always a window when lock_device_hotplug is released, and therefore,
there is always a chance for a panic during this window.

Make this interface return an error if memory removal fails.  This way
it is safe to call this function without panicking the machine, and it
also becomes symmetric to add_memory(), which already returns an error.

Link: http://lkml.kernel.org/r/20190517215438.6487-3-pasha.tatashin@soleen.com
Signed-off-by: Pavel Tatashin <pasha.tatashin@soleen.com>
Reviewed-by: David Hildenbrand <david@redhat.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Bjorn Helgaas <bhelgaas@google.com>
Cc: Borislav Petkov <bp@suse.de>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Dave Jiang <dave.jiang@intel.com>
Cc: Fengguang Wu <fengguang.wu@intel.com>
Cc: Huang Ying <ying.huang@intel.com>
Cc: James Morris <jmorris@namei.org>
Cc: Jérôme Glisse <jglisse@redhat.com>
Cc: Keith Busch <keith.busch@intel.com>
Cc: Ross Zwisler <zwisler@kernel.org>
Cc: Sasha Levin <sashal@kernel.org>
Cc: Takashi Iwai <tiwai@suse.de>
Cc: Tom Lendacky <thomas.lendacky@amd.com>
Cc: Vishal Verma <vishal.l.verma@intel.com>
Cc: Yaowei Bai <baiyaowei@cmss.chinamobile.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/memory_hotplug.h |  8 ++++--
 mm/memory_hotplug.c            | 64 ++++++++++++++++++++++++++++--------------
 2 files changed, 49 insertions(+), 23 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index ae892eef8b82..988fde33cd7f 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -324,7 +324,7 @@ static inline void pgdat_resize_init(struct pglist_data *pgdat) {}
 extern bool is_mem_section_removable(unsigned long pfn, unsigned long nr_pages);
 extern void try_offline_node(int nid);
 extern int offline_pages(unsigned long start_pfn, unsigned long nr_pages);
-extern void remove_memory(int nid, u64 start, u64 size);
+extern int remove_memory(int nid, u64 start, u64 size);
 extern void __remove_memory(int nid, u64 start, u64 size);
 
 #else
@@ -341,7 +341,11 @@ static inline int offline_pages(unsigned long start_pfn, unsigned long nr_pages)
 	return -EINVAL;
 }
 
-static inline void remove_memory(int nid, u64 start, u64 size) {}
+static inline int remove_memory(int nid, u64 start, u64 size)
+{
+	return -EBUSY;
+}
+
 static inline void __remove_memory(int nid, u64 start, u64 size) {}
 #endif /* CONFIG_MEMORY_HOTREMOVE */
 
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 6166ba5a15f3..4ebe696138e8 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1734,9 +1734,10 @@ static int check_memblock_offlined_cb(struct memory_block *mem, void *arg)
 		endpa = PFN_PHYS(section_nr_to_pfn(mem->end_section_nr + 1))-1;
 		pr_warn("removing memory fails, because memory [%pa-%pa] is onlined\n",
 			&beginpa, &endpa);
-	}
 
-	return ret;
+		return -EBUSY;
+	}
+	return 0;
 }
 
 static int check_cpu_on_node(pg_data_t *pgdat)
@@ -1819,19 +1820,9 @@ static void __release_memory_resource(resource_size_t start,
 	}
 }
 
-/**
- * remove_memory
- * @nid: the node ID
- * @start: physical address of the region to remove
- * @size: size of the region to remove
- *
- * NOTE: The caller must call lock_device_hotplug() to serialize hotplug
- * and online/offline operations before this call, as required by
- * try_offline_node().
- */
-void __ref __remove_memory(int nid, u64 start, u64 size)
+static int __ref try_remove_memory(int nid, u64 start, u64 size)
 {
-	int ret;
+	int rc = 0;
 
 	BUG_ON(check_hotplug_memory_range(start, size));
 
@@ -1839,13 +1830,13 @@ void __ref __remove_memory(int nid, u64 start, u64 size)
 
 	/*
 	 * All memory blocks must be offlined before removing memory.  Check
-	 * whether all memory blocks in question are offline and trigger a BUG()
+	 * whether all memory blocks in question are offline and return error
 	 * if this is not the case.
 	 */
-	ret = walk_memory_range(PFN_DOWN(start), PFN_UP(start + size - 1), NULL,
-				check_memblock_offlined_cb);
-	if (ret)
-		BUG();
+	rc = walk_memory_range(PFN_DOWN(start), PFN_UP(start + size - 1), NULL,
+			       check_memblock_offlined_cb);
+	if (rc)
+		goto done;
 
 	/* remove memmap entry */
 	firmware_map_remove(start, start + size, "System RAM");
@@ -1857,14 +1848,45 @@ void __ref __remove_memory(int nid, u64 start, u64 size)
 
 	try_offline_node(nid);
 
+done:
 	mem_hotplug_done();
+	return rc;
 }
 
-void remove_memory(int nid, u64 start, u64 size)
+/**
+ * remove_memory
+ * @nid: the node ID
+ * @start: physical address of the region to remove
+ * @size: size of the region to remove
+ *
+ * NOTE: The caller must call lock_device_hotplug() to serialize hotplug
+ * and online/offline operations before this call, as required by
+ * try_offline_node().
+ */
+void __remove_memory(int nid, u64 start, u64 size)
+{
+
+	/*
+	 * trigger BUG() if some memory is not offlined prior to calling this
+	 * function
+	 */
+	if (try_remove_memory(nid, start, size))
+		BUG();
+}
+
+/*
+ * Remove memory if every memory block is offline, otherwise return -EBUSY if
+ * some memory is not offline
+ */
+int remove_memory(int nid, u64 start, u64 size)
 {
+	int rc;
+
 	lock_device_hotplug();
-	__remove_memory(nid, start, size);
+	rc  = try_remove_memory(nid, start, size);
 	unlock_device_hotplug();
+
+	return rc;
 }
 EXPORT_SYMBOL_GPL(remove_memory);
 #endif /* CONFIG_MEMORY_HOTREMOVE */
-- 
cgit v1.2.3


From 7588adf8dff12c4b358557a13796a25fef796548 Mon Sep 17 00:00:00 2001
From: Robin Murphy <robin.murphy@arm.com>
Date: Tue, 16 Jul 2019 16:30:44 -0700
Subject: mm: clean up is_device_*_page() definitions

Refactor is_device_{public,private}_page() with is_pci_p2pdma_page() to
make them all consistent in depending on their respective config options
even when CONFIG_DEV_PAGEMAP_OPS is enabled for other reasons.  This
allows a little more compile-time optimisation as well as the conceptual
and cosmetic cleanup.

Link: http://lkml.kernel.org/r/187c2ab27dea70635d375a61b2f2076d26c032b0.1558547956.git.robin.murphy@arm.com
Signed-off-by: Robin Murphy <robin.murphy@arm.com>
Suggested-by: Jerome Glisse <jglisse@redhat.com>
Reviewed-by: Anshuman Khandual <anshuman.khandual@arm.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Ira Weiny <ira.weiny@intel.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Oliver O'Halloran <oohall@gmail.com>
Cc: Will Deacon <will.deacon@arm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm.h | 31 +++++++++----------------------
 1 file changed, 9 insertions(+), 22 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 74797ed20c2c..baa8b8761d8c 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -956,41 +956,28 @@ static inline bool put_devmap_managed_page(struct page *page)
 	return false;
 }
 
-static inline bool is_device_private_page(const struct page *page)
-{
-	return is_zone_device_page(page) &&
-		page->pgmap->type == MEMORY_DEVICE_PRIVATE;
-}
-
-#ifdef CONFIG_PCI_P2PDMA
-static inline bool is_pci_p2pdma_page(const struct page *page)
-{
-	return is_zone_device_page(page) &&
-		page->pgmap->type == MEMORY_DEVICE_PCI_P2PDMA;
-}
-#else /* CONFIG_PCI_P2PDMA */
-static inline bool is_pci_p2pdma_page(const struct page *page)
-{
-	return false;
-}
-#endif /* CONFIG_PCI_P2PDMA */
-
 #else /* CONFIG_DEV_PAGEMAP_OPS */
 static inline bool put_devmap_managed_page(struct page *page)
 {
 	return false;
 }
+#endif /* CONFIG_DEV_PAGEMAP_OPS */
 
 static inline bool is_device_private_page(const struct page *page)
 {
-	return false;
+	return IS_ENABLED(CONFIG_DEV_PAGEMAP_OPS) &&
+		IS_ENABLED(CONFIG_DEVICE_PRIVATE) &&
+		is_zone_device_page(page) &&
+		page->pgmap->type == MEMORY_DEVICE_PRIVATE;
 }
 
 static inline bool is_pci_p2pdma_page(const struct page *page)
 {
-	return false;
+	return IS_ENABLED(CONFIG_DEV_PAGEMAP_OPS) &&
+		IS_ENABLED(CONFIG_PCI_P2PDMA) &&
+		is_zone_device_page(page) &&
+		page->pgmap->type == MEMORY_DEVICE_PCI_P2PDMA;
 }
-#endif /* CONFIG_DEV_PAGEMAP_OPS */
 
 /* 127: arbitrary random number, small enough to assemble well */
 #define page_ref_zero_or_close_to_overflow(page) \
-- 
cgit v1.2.3


From 175967318c3018d01931ac950c82adab5deb47ca Mon Sep 17 00:00:00 2001
From: Robin Murphy <robin.murphy@arm.com>
Date: Tue, 16 Jul 2019 16:30:47 -0700
Subject: mm: introduce ARCH_HAS_PTE_DEVMAP

ARCH_HAS_ZONE_DEVICE is somewhat meaningless in itself, and combined
with the long-out-of-date comment can lead to the impression that an
architecture may just enable it (since __add_pages() now "comprehends
device memory" for itself) and expect things to work.

In practice, however, ZONE_DEVICE users have little chance of
functioning correctly without __HAVE_ARCH_PTE_DEVMAP, so let's clean
that up the same way as ARCH_HAS_PTE_SPECIAL and make it the proper
dependency so the real situation is clearer.

Link: http://lkml.kernel.org/r/87554aa78478a02a63f2c4cf60a847279ae3eb3b.1558547956.git.robin.murphy@arm.com
Signed-off-by: Robin Murphy <robin.murphy@arm.com>
Acked-by: Dan Williams <dan.j.williams@intel.com>
Reviewed-by: Ira Weiny <ira.weiny@intel.com>
Acked-by: Oliver O'Halloran <oohall@gmail.com>
Reviewed-by: Anshuman Khandual <anshuman.khandual@arm.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Jerome Glisse <jglisse@redhat.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Will Deacon <will.deacon@arm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/powerpc/Kconfig                         | 2 +-
 arch/powerpc/include/asm/book3s/64/pgtable.h | 1 -
 arch/x86/Kconfig                             | 2 +-
 arch/x86/include/asm/pgtable.h               | 4 ++--
 arch/x86/include/asm/pgtable_types.h         | 1 -
 include/linux/mm.h                           | 4 ++--
 include/linux/pfn_t.h                        | 4 ++--
 mm/Kconfig                                   | 5 ++---
 mm/gup.c                                     | 2 +-
 9 files changed, 11 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index f516796dd819..d8dcd8820369 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -129,6 +129,7 @@ config PPC
 	select ARCH_HAS_MMIOWB			if PPC64
 	select ARCH_HAS_PHYS_TO_DMA
 	select ARCH_HAS_PMEM_API                if PPC64
+	select ARCH_HAS_PTE_DEVMAP		if PPC_BOOK3S_64
 	select ARCH_HAS_PTE_SPECIAL
 	select ARCH_HAS_MEMBARRIER_CALLBACKS
 	select ARCH_HAS_SCALED_CPUTIME		if VIRT_CPU_ACCOUNTING_NATIVE && PPC64
@@ -136,7 +137,6 @@ config PPC
 	select ARCH_HAS_TICK_BROADCAST		if GENERIC_CLOCKEVENTS_BROADCAST
 	select ARCH_HAS_UACCESS_FLUSHCACHE	if PPC64
 	select ARCH_HAS_UBSAN_SANITIZE_ALL
-	select ARCH_HAS_ZONE_DEVICE		if PPC_BOOK3S_64
 	select ARCH_HAVE_NMI_SAFE_CMPXCHG
 	select ARCH_KEEP_MEMBLOCK
 	select ARCH_MIGHT_HAVE_PC_PARPORT
diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h
index 62e6ea0a7650..8308f32e9782 100644
--- a/arch/powerpc/include/asm/book3s/64/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
@@ -90,7 +90,6 @@
 #define _PAGE_SOFT_DIRTY	_RPAGE_SW3 /* software: software dirty tracking */
 #define _PAGE_SPECIAL		_RPAGE_SW2 /* software: special page */
 #define _PAGE_DEVMAP		_RPAGE_SW1 /* software: ZONE_DEVICE page */
-#define __HAVE_ARCH_PTE_DEVMAP
 
 /*
  * Drivers request for cache inhibited pte mapping using _PAGE_NO_CACHE
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 879741336771..4a55bd01e918 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -70,6 +70,7 @@ config X86
 	select ARCH_HAS_KCOV			if X86_64
 	select ARCH_HAS_MEMBARRIER_SYNC_CORE
 	select ARCH_HAS_PMEM_API		if X86_64
+	select ARCH_HAS_PTE_DEVMAP		if X86_64
 	select ARCH_HAS_PTE_SPECIAL
 	select ARCH_HAS_REFCOUNT
 	select ARCH_HAS_UACCESS_FLUSHCACHE	if X86_64
@@ -80,7 +81,6 @@ config X86
 	select ARCH_HAS_STRICT_MODULE_RWX
 	select ARCH_HAS_SYNC_CORE_BEFORE_USERMODE
 	select ARCH_HAS_UBSAN_SANITIZE_ALL
-	select ARCH_HAS_ZONE_DEVICE		if X86_64
 	select ARCH_HAVE_NMI_SAFE_CMPXCHG
 	select ARCH_MIGHT_HAVE_ACPI_PDC		if ACPI
 	select ARCH_MIGHT_HAVE_PC_PARPORT
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 5e0509b41986..0bc530c4eb13 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -271,7 +271,7 @@ static inline int has_transparent_hugepage(void)
 	return boot_cpu_has(X86_FEATURE_PSE);
 }
 
-#ifdef __HAVE_ARCH_PTE_DEVMAP
+#ifdef CONFIG_ARCH_HAS_PTE_DEVMAP
 static inline int pmd_devmap(pmd_t pmd)
 {
 	return !!(pmd_val(pmd) & _PAGE_DEVMAP);
@@ -732,7 +732,7 @@ static inline int pte_present(pte_t a)
 	return pte_flags(a) & (_PAGE_PRESENT | _PAGE_PROTNONE);
 }
 
-#ifdef __HAVE_ARCH_PTE_DEVMAP
+#ifdef CONFIG_ARCH_HAS_PTE_DEVMAP
 static inline int pte_devmap(pte_t a)
 {
 	return (pte_flags(a) & _PAGE_DEVMAP) == _PAGE_DEVMAP;
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index d6ff0bbdb394..b5e49e6bac63 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -103,7 +103,6 @@
 #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
 #define _PAGE_NX	(_AT(pteval_t, 1) << _PAGE_BIT_NX)
 #define _PAGE_DEVMAP	(_AT(u64, 1) << _PAGE_BIT_DEVMAP)
-#define __HAVE_ARCH_PTE_DEVMAP
 #else
 #define _PAGE_NX	(_AT(pteval_t, 0))
 #define _PAGE_DEVMAP	(_AT(pteval_t, 0))
diff --git a/include/linux/mm.h b/include/linux/mm.h
index baa8b8761d8c..f43f4de4de68 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -547,7 +547,7 @@ static inline void vma_set_anonymous(struct vm_area_struct *vma)
 struct mmu_gather;
 struct inode;
 
-#if !defined(__HAVE_ARCH_PTE_DEVMAP) || !defined(CONFIG_TRANSPARENT_HUGEPAGE)
+#if !defined(CONFIG_ARCH_HAS_PTE_DEVMAP) || !defined(CONFIG_TRANSPARENT_HUGEPAGE)
 static inline int pmd_devmap(pmd_t pmd)
 {
 	return 0;
@@ -1750,7 +1750,7 @@ static inline void sync_mm_rss(struct mm_struct *mm)
 }
 #endif
 
-#ifndef __HAVE_ARCH_PTE_DEVMAP
+#ifndef CONFIG_ARCH_HAS_PTE_DEVMAP
 static inline int pte_devmap(pte_t pte)
 {
 	return 0;
diff --git a/include/linux/pfn_t.h b/include/linux/pfn_t.h
index 01e8037023f7..2d9148221e9a 100644
--- a/include/linux/pfn_t.h
+++ b/include/linux/pfn_t.h
@@ -97,7 +97,7 @@ static inline pud_t pfn_t_pud(pfn_t pfn, pgprot_t pgprot)
 #endif
 #endif
 
-#ifdef __HAVE_ARCH_PTE_DEVMAP
+#ifdef CONFIG_ARCH_HAS_PTE_DEVMAP
 static inline bool pfn_t_devmap(pfn_t pfn)
 {
 	const u64 flags = PFN_DEV|PFN_MAP;
@@ -115,7 +115,7 @@ pmd_t pmd_mkdevmap(pmd_t pmd);
 	defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
 pud_t pud_mkdevmap(pud_t pud);
 #endif
-#endif /* __HAVE_ARCH_PTE_DEVMAP */
+#endif /* CONFIG_ARCH_HAS_PTE_DEVMAP */
 
 #ifdef CONFIG_ARCH_HAS_PTE_SPECIAL
 static inline bool pfn_t_special(pfn_t pfn)
diff --git a/mm/Kconfig b/mm/Kconfig
index 495d7368ced8..56cec636a1fc 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -649,8 +649,7 @@ config IDLE_PAGE_TRACKING
 	  See Documentation/admin-guide/mm/idle_page_tracking.rst for
 	  more details.
 
-# arch_add_memory() comprehends device memory
-config ARCH_HAS_ZONE_DEVICE
+config ARCH_HAS_PTE_DEVMAP
 	bool
 
 config ZONE_DEVICE
@@ -658,7 +657,7 @@ config ZONE_DEVICE
 	depends on MEMORY_HOTPLUG
 	depends on MEMORY_HOTREMOVE
 	depends on SPARSEMEM_VMEMMAP
-	depends on ARCH_HAS_ZONE_DEVICE
+	depends on ARCH_HAS_PTE_DEVMAP
 	select XARRAY_MULTI
 
 	help
diff --git a/mm/gup.c b/mm/gup.c
index 8bbaa5523116..98f13ab37bac 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -1895,7 +1895,7 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
 }
 #endif /* CONFIG_ARCH_HAS_PTE_SPECIAL */
 
-#if defined(__HAVE_ARCH_PTE_DEVMAP) && defined(CONFIG_TRANSPARENT_HUGEPAGE)
+#if defined(CONFIG_ARCH_HAS_PTE_DEVMAP) && defined(CONFIG_TRANSPARENT_HUGEPAGE)
 static int __gup_device_huge(unsigned long pfn, unsigned long addr,
 		unsigned long end, struct page **pages, int *nr)
 {
-- 
cgit v1.2.3


From 79eb597cba06c435b72f220e9d426ae413fc2579 Mon Sep 17 00:00:00 2001
From: Daniel Jordan <daniel.m.jordan@oracle.com>
Date: Tue, 16 Jul 2019 16:30:54 -0700
Subject: mm: add account_locked_vm utility function

locked_vm accounting is done roughly the same way in five places, so
unify them in a helper.

Include the helper's caller in the debug print to distinguish between
callsites.

Error codes stay the same, so user-visible behavior does too.  The one
exception is that the -EPERM case in tce_account_locked_vm is removed
because Alexey has never seen it triggered.

[daniel.m.jordan@oracle.com: v3]
  Link: http://lkml.kernel.org/r/20190529205019.20927-1-daniel.m.jordan@oracle.com
[sfr@canb.auug.org.au: fix mm/util.c]
Link: http://lkml.kernel.org/r/20190524175045.26897-1-daniel.m.jordan@oracle.com
Signed-off-by: Daniel Jordan <daniel.m.jordan@oracle.com>
Signed-off-by: Stephen Rothwell <sfr@canb.auug.org.au>
Tested-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Acked-by: Alex Williamson <alex.williamson@redhat.com>
Cc: Alan Tull <atull@kernel.org>
Cc: Alex Williamson <alex.williamson@redhat.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Christoph Lameter <cl@linux.com>
Cc: Christophe Leroy <christophe.leroy@c-s.fr>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: Jason Gunthorpe <jgg@mellanox.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Moritz Fischer <mdf@kernel.org>
Cc: Paul Mackerras <paulus@ozlabs.org>
Cc: Steve Sistare <steven.sistare@oracle.com>
Cc: Wu Hao <hao.wu@intel.com>
Cc: Ira Weiny <ira.weiny@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/powerpc/kvm/book3s_64_vio.c     | 44 ++-------------------
 arch/powerpc/mm/book3s64/iommu_api.c | 41 ++------------------
 drivers/fpga/dfl-afu-dma-region.c    | 53 ++-----------------------
 drivers/vfio/vfio_iommu_spapr_tce.c  | 54 +++-----------------------
 drivers/vfio/vfio_iommu_type1.c      | 17 +-------
 include/linux/mm.h                   |  4 ++
 mm/util.c                            | 75 ++++++++++++++++++++++++++++++++++++
 7 files changed, 98 insertions(+), 190 deletions(-)

(limited to 'include/linux')

diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c
index 5bf05cc774e2..e99a14798ab0 100644
--- a/arch/powerpc/kvm/book3s_64_vio.c
+++ b/arch/powerpc/kvm/book3s_64_vio.c
@@ -19,6 +19,7 @@
 #include <linux/anon_inodes.h>
 #include <linux/iommu.h>
 #include <linux/file.h>
+#include <linux/mm.h>
 
 #include <asm/kvm_ppc.h>
 #include <asm/kvm_book3s.h>
@@ -45,43 +46,6 @@ static unsigned long kvmppc_stt_pages(unsigned long tce_pages)
 	return tce_pages + ALIGN(stt_bytes, PAGE_SIZE) / PAGE_SIZE;
 }
 
-static long kvmppc_account_memlimit(unsigned long stt_pages, bool inc)
-{
-	long ret = 0;
-
-	if (!current || !current->mm)
-		return ret; /* process exited */
-
-	down_write(&current->mm->mmap_sem);
-
-	if (inc) {
-		unsigned long locked, lock_limit;
-
-		locked = current->mm->locked_vm + stt_pages;
-		lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
-		if (locked > lock_limit && !capable(CAP_IPC_LOCK))
-			ret = -ENOMEM;
-		else
-			current->mm->locked_vm += stt_pages;
-	} else {
-		if (WARN_ON_ONCE(stt_pages > current->mm->locked_vm))
-			stt_pages = current->mm->locked_vm;
-
-		current->mm->locked_vm -= stt_pages;
-	}
-
-	pr_debug("[%d] RLIMIT_MEMLOCK KVM %c%ld %ld/%ld%s\n", current->pid,
-			inc ? '+' : '-',
-			stt_pages << PAGE_SHIFT,
-			current->mm->locked_vm << PAGE_SHIFT,
-			rlimit(RLIMIT_MEMLOCK),
-			ret ? " - exceeded" : "");
-
-	up_write(&current->mm->mmap_sem);
-
-	return ret;
-}
-
 static void kvm_spapr_tce_iommu_table_free(struct rcu_head *head)
 {
 	struct kvmppc_spapr_tce_iommu_table *stit = container_of(head,
@@ -291,7 +255,7 @@ static int kvm_spapr_tce_release(struct inode *inode, struct file *filp)
 
 	kvm_put_kvm(stt->kvm);
 
-	kvmppc_account_memlimit(
+	account_locked_vm(current->mm,
 		kvmppc_stt_pages(kvmppc_tce_pages(stt->size)), false);
 	call_rcu(&stt->rcu, release_spapr_tce_table);
 
@@ -316,7 +280,7 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
 		return -EINVAL;
 
 	npages = kvmppc_tce_pages(size);
-	ret = kvmppc_account_memlimit(kvmppc_stt_pages(npages), true);
+	ret = account_locked_vm(current->mm, kvmppc_stt_pages(npages), true);
 	if (ret)
 		return ret;
 
@@ -362,7 +326,7 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
 
 	kfree(stt);
  fail_acct:
-	kvmppc_account_memlimit(kvmppc_stt_pages(npages), false);
+	account_locked_vm(current->mm, kvmppc_stt_pages(npages), false);
 	return ret;
 }
 
diff --git a/arch/powerpc/mm/book3s64/iommu_api.c b/arch/powerpc/mm/book3s64/iommu_api.c
index 90ee3a89722c..b056cae3388b 100644
--- a/arch/powerpc/mm/book3s64/iommu_api.c
+++ b/arch/powerpc/mm/book3s64/iommu_api.c
@@ -14,6 +14,7 @@
 #include <linux/hugetlb.h>
 #include <linux/swap.h>
 #include <linux/sizes.h>
+#include <linux/mm.h>
 #include <asm/mmu_context.h>
 #include <asm/pte-walk.h>
 #include <linux/mm_inline.h>
@@ -46,40 +47,6 @@ struct mm_iommu_table_group_mem_t {
 	u64 dev_hpa;		/* Device memory base address */
 };
 
-static long mm_iommu_adjust_locked_vm(struct mm_struct *mm,
-		unsigned long npages, bool incr)
-{
-	long ret = 0, locked, lock_limit;
-
-	if (!npages)
-		return 0;
-
-	down_write(&mm->mmap_sem);
-
-	if (incr) {
-		locked = mm->locked_vm + npages;
-		lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
-		if (locked > lock_limit && !capable(CAP_IPC_LOCK))
-			ret = -ENOMEM;
-		else
-			mm->locked_vm += npages;
-	} else {
-		if (WARN_ON_ONCE(npages > mm->locked_vm))
-			npages = mm->locked_vm;
-		mm->locked_vm -= npages;
-	}
-
-	pr_debug("[%d] RLIMIT_MEMLOCK HASH64 %c%ld %ld/%ld\n",
-			current ? current->pid : 0,
-			incr ? '+' : '-',
-			npages << PAGE_SHIFT,
-			mm->locked_vm << PAGE_SHIFT,
-			rlimit(RLIMIT_MEMLOCK));
-	up_write(&mm->mmap_sem);
-
-	return ret;
-}
-
 bool mm_iommu_preregistered(struct mm_struct *mm)
 {
 	return !list_empty(&mm->context.iommu_group_mem_list);
@@ -96,7 +63,7 @@ static long mm_iommu_do_alloc(struct mm_struct *mm, unsigned long ua,
 	unsigned long entry, chunk;
 
 	if (dev_hpa == MM_IOMMU_TABLE_INVALID_HPA) {
-		ret = mm_iommu_adjust_locked_vm(mm, entries, true);
+		ret = account_locked_vm(mm, entries, true);
 		if (ret)
 			return ret;
 
@@ -211,7 +178,7 @@ free_exit:
 	kfree(mem);
 
 unlock_exit:
-	mm_iommu_adjust_locked_vm(mm, locked_entries, false);
+	account_locked_vm(mm, locked_entries, false);
 
 	return ret;
 }
@@ -311,7 +278,7 @@ long mm_iommu_put(struct mm_struct *mm, struct mm_iommu_table_group_mem_t *mem)
 unlock_exit:
 	mutex_unlock(&mem_list_mutex);
 
-	mm_iommu_adjust_locked_vm(mm, unlock_entries, false);
+	account_locked_vm(mm, unlock_entries, false);
 
 	return ret;
 }
diff --git a/drivers/fpga/dfl-afu-dma-region.c b/drivers/fpga/dfl-afu-dma-region.c
index dcd80b088c7b..62f924489db5 100644
--- a/drivers/fpga/dfl-afu-dma-region.c
+++ b/drivers/fpga/dfl-afu-dma-region.c
@@ -12,6 +12,7 @@
 #include <linux/dma-mapping.h>
 #include <linux/sched/signal.h>
 #include <linux/uaccess.h>
+#include <linux/mm.h>
 
 #include "dfl-afu.h"
 
@@ -31,52 +32,6 @@ void afu_dma_region_init(struct dfl_feature_platform_data *pdata)
 	afu->dma_regions = RB_ROOT;
 }
 
-/**
- * afu_dma_adjust_locked_vm - adjust locked memory
- * @dev: port device
- * @npages: number of pages
- * @incr: increase or decrease locked memory
- *
- * Increase or decrease the locked memory size with npages input.
- *
- * Return 0 on success.
- * Return -ENOMEM if locked memory size is over the limit and no CAP_IPC_LOCK.
- */
-static int afu_dma_adjust_locked_vm(struct device *dev, long npages, bool incr)
-{
-	unsigned long locked, lock_limit;
-	int ret = 0;
-
-	/* the task is exiting. */
-	if (!current->mm)
-		return 0;
-
-	down_write(&current->mm->mmap_sem);
-
-	if (incr) {
-		locked = current->mm->locked_vm + npages;
-		lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
-
-		if (locked > lock_limit && !capable(CAP_IPC_LOCK))
-			ret = -ENOMEM;
-		else
-			current->mm->locked_vm += npages;
-	} else {
-		if (WARN_ON_ONCE(npages > current->mm->locked_vm))
-			npages = current->mm->locked_vm;
-		current->mm->locked_vm -= npages;
-	}
-
-	dev_dbg(dev, "[%d] RLIMIT_MEMLOCK %c%ld %ld/%ld%s\n", current->pid,
-		incr ? '+' : '-', npages << PAGE_SHIFT,
-		current->mm->locked_vm << PAGE_SHIFT, rlimit(RLIMIT_MEMLOCK),
-		ret ? "- exceeded" : "");
-
-	up_write(&current->mm->mmap_sem);
-
-	return ret;
-}
-
 /**
  * afu_dma_pin_pages - pin pages of given dma memory region
  * @pdata: feature device platform data
@@ -92,7 +47,7 @@ static int afu_dma_pin_pages(struct dfl_feature_platform_data *pdata,
 	struct device *dev = &pdata->dev->dev;
 	int ret, pinned;
 
-	ret = afu_dma_adjust_locked_vm(dev, npages, true);
+	ret = account_locked_vm(current->mm, npages, true);
 	if (ret)
 		return ret;
 
@@ -121,7 +76,7 @@ put_pages:
 free_pages:
 	kfree(region->pages);
 unlock_vm:
-	afu_dma_adjust_locked_vm(dev, npages, false);
+	account_locked_vm(current->mm, npages, false);
 	return ret;
 }
 
@@ -141,7 +96,7 @@ static void afu_dma_unpin_pages(struct dfl_feature_platform_data *pdata,
 
 	put_all_pages(region->pages, npages);
 	kfree(region->pages);
-	afu_dma_adjust_locked_vm(dev, npages, false);
+	account_locked_vm(current->mm, npages, false);
 
 	dev_dbg(dev, "%ld pages unpinned\n", npages);
 }
diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c
index 7048c9198c21..8ce9ad21129f 100644
--- a/drivers/vfio/vfio_iommu_spapr_tce.c
+++ b/drivers/vfio/vfio_iommu_spapr_tce.c
@@ -19,6 +19,7 @@
 #include <linux/vmalloc.h>
 #include <linux/sched/mm.h>
 #include <linux/sched/signal.h>
+#include <linux/mm.h>
 
 #include <asm/iommu.h>
 #include <asm/tce.h>
@@ -31,51 +32,6 @@
 static void tce_iommu_detach_group(void *iommu_data,
 		struct iommu_group *iommu_group);
 
-static long try_increment_locked_vm(struct mm_struct *mm, long npages)
-{
-	long ret = 0, locked, lock_limit;
-
-	if (WARN_ON_ONCE(!mm))
-		return -EPERM;
-
-	if (!npages)
-		return 0;
-
-	down_write(&mm->mmap_sem);
-	locked = mm->locked_vm + npages;
-	lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
-	if (locked > lock_limit && !capable(CAP_IPC_LOCK))
-		ret = -ENOMEM;
-	else
-		mm->locked_vm += npages;
-
-	pr_debug("[%d] RLIMIT_MEMLOCK +%ld %ld/%ld%s\n", current->pid,
-			npages << PAGE_SHIFT,
-			mm->locked_vm << PAGE_SHIFT,
-			rlimit(RLIMIT_MEMLOCK),
-			ret ? " - exceeded" : "");
-
-	up_write(&mm->mmap_sem);
-
-	return ret;
-}
-
-static void decrement_locked_vm(struct mm_struct *mm, long npages)
-{
-	if (!mm || !npages)
-		return;
-
-	down_write(&mm->mmap_sem);
-	if (WARN_ON_ONCE(npages > mm->locked_vm))
-		npages = mm->locked_vm;
-	mm->locked_vm -= npages;
-	pr_debug("[%d] RLIMIT_MEMLOCK -%ld %ld/%ld\n", current->pid,
-			npages << PAGE_SHIFT,
-			mm->locked_vm << PAGE_SHIFT,
-			rlimit(RLIMIT_MEMLOCK));
-	up_write(&mm->mmap_sem);
-}
-
 /*
  * VFIO IOMMU fd for SPAPR_TCE IOMMU implementation
  *
@@ -333,7 +289,7 @@ static int tce_iommu_enable(struct tce_container *container)
 		return ret;
 
 	locked = table_group->tce32_size >> PAGE_SHIFT;
-	ret = try_increment_locked_vm(container->mm, locked);
+	ret = account_locked_vm(container->mm, locked, true);
 	if (ret)
 		return ret;
 
@@ -352,7 +308,7 @@ static void tce_iommu_disable(struct tce_container *container)
 	container->enabled = false;
 
 	BUG_ON(!container->mm);
-	decrement_locked_vm(container->mm, container->locked_pages);
+	account_locked_vm(container->mm, container->locked_pages, false);
 }
 
 static void *tce_iommu_open(unsigned long arg)
@@ -656,7 +612,7 @@ static long tce_iommu_create_table(struct tce_container *container,
 	if (!table_size)
 		return -EINVAL;
 
-	ret = try_increment_locked_vm(container->mm, table_size >> PAGE_SHIFT);
+	ret = account_locked_vm(container->mm, table_size >> PAGE_SHIFT, true);
 	if (ret)
 		return ret;
 
@@ -675,7 +631,7 @@ static void tce_iommu_free_table(struct tce_container *container,
 	unsigned long pages = tbl->it_allocated_size >> PAGE_SHIFT;
 
 	iommu_tce_table_put(tbl);
-	decrement_locked_vm(container->mm, pages);
+	account_locked_vm(container->mm, pages, false);
 }
 
 static long tce_iommu_create_window(struct tce_container *container,
diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index add34adfadc7..054391f30fa8 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -272,21 +272,8 @@ static int vfio_lock_acct(struct vfio_dma *dma, long npage, bool async)
 
 	ret = down_write_killable(&mm->mmap_sem);
 	if (!ret) {
-		if (npage > 0) {
-			if (!dma->lock_cap) {
-				unsigned long limit;
-
-				limit = task_rlimit(dma->task,
-						RLIMIT_MEMLOCK) >> PAGE_SHIFT;
-
-				if (mm->locked_vm + npage > limit)
-					ret = -ENOMEM;
-			}
-		}
-
-		if (!ret)
-			mm->locked_vm += npage;
-
+		ret = __account_locked_vm(mm, abs(npage), npage > 0, dma->task,
+					  dma->lock_cap);
 		up_write(&mm->mmap_sem);
 	}
 
diff --git a/include/linux/mm.h b/include/linux/mm.h
index f43f4de4de68..bd6512559bed 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1543,6 +1543,10 @@ long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
 int get_user_pages_fast(unsigned long start, int nr_pages,
 			unsigned int gup_flags, struct page **pages);
 
+int account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc);
+int __account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc,
+			struct task_struct *task, bool bypass_rlim);
+
 /* Container for pinned pfns / pages */
 struct frame_vector {
 	unsigned int nr_allocated;	/* Number of frames we have space for */
diff --git a/mm/util.c b/mm/util.c
index 68575a315dc5..e6351a80f248 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -7,6 +7,7 @@
 #include <linux/err.h>
 #include <linux/sched.h>
 #include <linux/sched/mm.h>
+#include <linux/sched/signal.h>
 #include <linux/sched/task_stack.h>
 #include <linux/security.h>
 #include <linux/swap.h>
@@ -300,6 +301,80 @@ void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack)
 }
 #endif
 
+/**
+ * __account_locked_vm - account locked pages to an mm's locked_vm
+ * @mm:          mm to account against
+ * @pages:       number of pages to account
+ * @inc:         %true if @pages should be considered positive, %false if not
+ * @task:        task used to check RLIMIT_MEMLOCK
+ * @bypass_rlim: %true if checking RLIMIT_MEMLOCK should be skipped
+ *
+ * Assumes @task and @mm are valid (i.e. at least one reference on each), and
+ * that mmap_sem is held as writer.
+ *
+ * Return:
+ * * 0       on success
+ * * -ENOMEM if RLIMIT_MEMLOCK would be exceeded.
+ */
+int __account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc,
+			struct task_struct *task, bool bypass_rlim)
+{
+	unsigned long locked_vm, limit;
+	int ret = 0;
+
+	lockdep_assert_held_write(&mm->mmap_sem);
+
+	locked_vm = mm->locked_vm;
+	if (inc) {
+		if (!bypass_rlim) {
+			limit = task_rlimit(task, RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+			if (locked_vm + pages > limit)
+				ret = -ENOMEM;
+		}
+		if (!ret)
+			mm->locked_vm = locked_vm + pages;
+	} else {
+		WARN_ON_ONCE(pages > locked_vm);
+		mm->locked_vm = locked_vm - pages;
+	}
+
+	pr_debug("%s: [%d] caller %ps %c%lu %lu/%lu%s\n", __func__, task->pid,
+		 (void *)_RET_IP_, (inc) ? '+' : '-', pages << PAGE_SHIFT,
+		 locked_vm << PAGE_SHIFT, task_rlimit(task, RLIMIT_MEMLOCK),
+		 ret ? " - exceeded" : "");
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(__account_locked_vm);
+
+/**
+ * account_locked_vm - account locked pages to an mm's locked_vm
+ * @mm:          mm to account against, may be NULL
+ * @pages:       number of pages to account
+ * @inc:         %true if @pages should be considered positive, %false if not
+ *
+ * Assumes a non-NULL @mm is valid (i.e. at least one reference on it).
+ *
+ * Return:
+ * * 0       on success, or if mm is NULL
+ * * -ENOMEM if RLIMIT_MEMLOCK would be exceeded.
+ */
+int account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc)
+{
+	int ret;
+
+	if (pages == 0 || !mm)
+		return 0;
+
+	down_write(&mm->mmap_sem);
+	ret = __account_locked_vm(mm, pages, inc, current,
+				  capable(CAP_IPC_LOCK));
+	up_write(&mm->mmap_sem);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(account_locked_vm);
+
 unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr,
 	unsigned long len, unsigned long prot,
 	unsigned long flag, unsigned long pgoff)
-- 
cgit v1.2.3


From 56cbb429d911991170fe867b4bba14f0efed5829 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 4 Jul 2019 16:57:51 -0400
Subject: switch the remnants of releasing the mountpoint away from fs_pin

We used to need rather convoluted ordering trickery to guarantee
that dput() of ex-mountpoints happens before the final mntput()
of the same.  Since we don't need that anymore, there's no point
playing with fs_pin for that.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/fs_pin.c            | 10 ++--------
 fs/mount.h             |  7 +++++--
 fs/namespace.c         | 37 +++++++++++++++++++------------------
 include/linux/fs_pin.h |  1 -
 4 files changed, 26 insertions(+), 29 deletions(-)

(limited to 'include/linux')

diff --git a/fs/fs_pin.c b/fs/fs_pin.c
index a6497cf8ae53..47ef3c71ce90 100644
--- a/fs/fs_pin.c
+++ b/fs/fs_pin.c
@@ -19,20 +19,14 @@ void pin_remove(struct fs_pin *pin)
 	spin_unlock_irq(&pin->wait.lock);
 }
 
-void pin_insert_group(struct fs_pin *pin, struct vfsmount *m, struct hlist_head *p)
+void pin_insert(struct fs_pin *pin, struct vfsmount *m)
 {
 	spin_lock(&pin_lock);
-	if (p)
-		hlist_add_head(&pin->s_list, p);
+	hlist_add_head(&pin->s_list, &m->mnt_sb->s_pins);
 	hlist_add_head(&pin->m_list, &real_mount(m)->mnt_pins);
 	spin_unlock(&pin_lock);
 }
 
-void pin_insert(struct fs_pin *pin, struct vfsmount *m)
-{
-	pin_insert_group(pin, m, &m->mnt_sb->s_pins);
-}
-
 void pin_kill(struct fs_pin *p)
 {
 	wait_queue_entry_t wait;
diff --git a/fs/mount.h b/fs/mount.h
index 84aa8cdf4971..711a4093e475 100644
--- a/fs/mount.h
+++ b/fs/mount.h
@@ -58,7 +58,10 @@ struct mount {
 	struct mount *mnt_master;	/* slave is on master->mnt_slave_list */
 	struct mnt_namespace *mnt_ns;	/* containing namespace */
 	struct mountpoint *mnt_mp;	/* where is it mounted */
-	struct hlist_node mnt_mp_list;	/* list mounts with the same mountpoint */
+	union {
+		struct hlist_node mnt_mp_list;	/* list mounts with the same mountpoint */
+		struct hlist_node mnt_umount;
+	};
 	struct list_head mnt_umounting; /* list entry for umount propagation */
 #ifdef CONFIG_FSNOTIFY
 	struct fsnotify_mark_connector __rcu *mnt_fsnotify_marks;
@@ -68,7 +71,7 @@ struct mount {
 	int mnt_group_id;		/* peer group identifier */
 	int mnt_expiry_mark;		/* true if marked for expiry */
 	struct hlist_head mnt_pins;
-	struct fs_pin mnt_umount;
+	struct hlist_head mnt_stuck_children;
 } __randomize_layout;
 
 #define MNT_NS_INTERNAL ERR_PTR(-EINVAL) /* distinct from any mnt_namespace */
diff --git a/fs/namespace.c b/fs/namespace.c
index 46316ba15615..54a815e48ead 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -171,13 +171,6 @@ unsigned int mnt_get_count(struct mount *mnt)
 #endif
 }
 
-static void drop_mountpoint(struct fs_pin *p)
-{
-	struct mount *m = container_of(p, struct mount, mnt_umount);
-	pin_remove(p);
-	mntput(&m->mnt);
-}
-
 static struct mount *alloc_vfsmnt(const char *name)
 {
 	struct mount *mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL);
@@ -215,7 +208,7 @@ static struct mount *alloc_vfsmnt(const char *name)
 		INIT_LIST_HEAD(&mnt->mnt_slave);
 		INIT_HLIST_NODE(&mnt->mnt_mp_list);
 		INIT_LIST_HEAD(&mnt->mnt_umounting);
-		init_fs_pin(&mnt->mnt_umount, drop_mountpoint);
+		INIT_HLIST_HEAD(&mnt->mnt_stuck_children);
 	}
 	return mnt;
 
@@ -1087,19 +1080,22 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root,
 
 static void cleanup_mnt(struct mount *mnt)
 {
+	struct hlist_node *p;
+	struct mount *m;
 	/*
-	 * This probably indicates that somebody messed
-	 * up a mnt_want/drop_write() pair.  If this
-	 * happens, the filesystem was probably unable
-	 * to make r/w->r/o transitions.
-	 */
-	/*
+	 * The warning here probably indicates that somebody messed
+	 * up a mnt_want/drop_write() pair.  If this happens, the
+	 * filesystem was probably unable to make r/w->r/o transitions.
 	 * The locking used to deal with mnt_count decrement provides barriers,
 	 * so mnt_get_writers() below is safe.
 	 */
 	WARN_ON(mnt_get_writers(mnt));
 	if (unlikely(mnt->mnt_pins.first))
 		mnt_pin_kill(mnt);
+	hlist_for_each_entry_safe(m, p, &mnt->mnt_stuck_children, mnt_umount) {
+		hlist_del(&m->mnt_umount);
+		mntput(&m->mnt);
+	}
 	fsnotify_vfsmount_delete(&mnt->mnt);
 	dput(mnt->mnt.mnt_root);
 	deactivate_super(mnt->mnt.mnt_sb);
@@ -1168,6 +1164,7 @@ static void mntput_no_expire(struct mount *mnt)
 		struct mount *p, *tmp;
 		list_for_each_entry_safe(p, tmp, &mnt->mnt_mounts,  mnt_child) {
 			__put_mountpoint(unhash_mnt(p), &list);
+			hlist_add_head(&p->mnt_umount, &mnt->mnt_stuck_children);
 		}
 	}
 	unlock_mount_hash();
@@ -1360,6 +1357,8 @@ EXPORT_SYMBOL(may_umount);
 static void namespace_unlock(void)
 {
 	struct hlist_head head;
+	struct hlist_node *p;
+	struct mount *m;
 	LIST_HEAD(list);
 
 	hlist_move_list(&unmounted, &head);
@@ -1374,7 +1373,10 @@ static void namespace_unlock(void)
 
 	synchronize_rcu_expedited();
 
-	group_pin_kill(&head);
+	hlist_for_each_entry_safe(m, p, &head, mnt_umount) {
+		hlist_del(&m->mnt_umount);
+		mntput(&m->mnt);
+	}
 }
 
 static inline void namespace_lock(void)
@@ -1461,8 +1463,6 @@ static void umount_tree(struct mount *mnt, enum umount_tree_flags how)
 
 		disconnect = disconnect_mount(p, how);
 
-		pin_insert_group(&p->mnt_umount, &p->mnt_parent->mnt,
-				 disconnect ? &unmounted : NULL);
 		if (mnt_has_parent(p)) {
 			mnt_add_count(p->mnt_parent, -1);
 			if (!disconnect) {
@@ -1470,6 +1470,7 @@ static void umount_tree(struct mount *mnt, enum umount_tree_flags how)
 				list_add_tail(&p->mnt_child, &p->mnt_parent->mnt_mounts);
 			} else {
 				umount_mnt(p);
+				hlist_add_head(&p->mnt_umount, &unmounted);
 			}
 		}
 		change_mnt_propagation(p, MS_PRIVATE);
@@ -1622,8 +1623,8 @@ void __detach_mounts(struct dentry *dentry)
 	while (!hlist_empty(&mp->m_list)) {
 		mnt = hlist_entry(mp->m_list.first, struct mount, mnt_mp_list);
 		if (mnt->mnt.mnt_flags & MNT_UMOUNT) {
-			hlist_add_head(&mnt->mnt_umount.s_list, &unmounted);
 			umount_mnt(mnt);
+			hlist_add_head(&mnt->mnt_umount, &unmounted);
 		}
 		else umount_tree(mnt, UMOUNT_CONNECTED);
 	}
diff --git a/include/linux/fs_pin.h b/include/linux/fs_pin.h
index 7cab74d66f85..bdd09fd2520c 100644
--- a/include/linux/fs_pin.h
+++ b/include/linux/fs_pin.h
@@ -20,6 +20,5 @@ static inline void init_fs_pin(struct fs_pin *p, void (*kill)(struct fs_pin *))
 }
 
 void pin_remove(struct fs_pin *);
-void pin_insert_group(struct fs_pin *, struct vfsmount *, struct hlist_head *);
 void pin_insert(struct fs_pin *, struct vfsmount *);
 void pin_kill(struct fs_pin *);
-- 
cgit v1.2.3


From b866455423e040813f113d8b87e8297778ee2014 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 16 Jul 2019 21:59:11 +0200
Subject: dma-mapping: add a dma_addressing_limited helper

This helper returns whether the device has issues addressing all present
memory in the system.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 include/linux/dma-mapping.h | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h
index 8d13e28a8e07..e11b115dd0e4 100644
--- a/include/linux/dma-mapping.h
+++ b/include/linux/dma-mapping.h
@@ -679,6 +679,20 @@ static inline int dma_coerce_mask_and_coherent(struct device *dev, u64 mask)
 	return dma_set_mask_and_coherent(dev, mask);
 }
 
+/**
+ * dma_addressing_limited - return if the device is addressing limited
+ * @dev:	device to check
+ *
+ * Return %true if the devices DMA mask is too small to address all memory in
+ * the system, else %false.  Lack of addressing bits is the prime reason for
+ * bounce buffering, but might not be the only one.
+ */
+static inline bool dma_addressing_limited(struct device *dev)
+{
+	return min_not_zero(*dev->dma_mask, dev->bus_dma_mask) <
+		dma_get_required_mask(dev);
+}
+
 #ifdef CONFIG_ARCH_HAS_SETUP_DMA_OPS
 void arch_setup_dma_ops(struct device *dev, u64 dma_base, u64 size,
 		const struct iommu_ops *iommu, bool coherent);
-- 
cgit v1.2.3


From db074436f421967f4f30cfbb6fbc2a728f3e62b3 Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong" <darrick.wong@oracle.com>
Date: Mon, 15 Jul 2019 08:50:59 -0700
Subject: iomap: move the direct IO code into a separate file

Move the direct IO code into a separate file so that we can group
related functions in a single file instead of having a single enormous
source file.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
 fs/iomap.c            | 554 -------------------------------------------------
 fs/iomap/Makefile     |   1 +
 fs/iomap/direct-io.c  | 562 ++++++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/iomap.h |   7 +
 4 files changed, 570 insertions(+), 554 deletions(-)
 create mode 100644 fs/iomap/direct-io.c

(limited to 'include/linux')

diff --git a/fs/iomap.c b/fs/iomap.c
index ad994c408cb8..c983fedc7081 100644
--- a/fs/iomap.c
+++ b/fs/iomap.c
@@ -90,12 +90,6 @@ iomap_apply(struct inode *inode, loff_t pos, loff_t length, unsigned flags,
 	return written ? written : ret;
 }
 
-static sector_t
-iomap_sector(struct iomap *iomap, loff_t pos)
-{
-	return (iomap->addr + pos - iomap->offset) >> SECTOR_SHIFT;
-}
-
 static struct iomap_page *
 iomap_page_create(struct inode *inode, struct page *page)
 {
@@ -1148,551 +1142,3 @@ out_unlock:
 	return block_page_mkwrite_return(ret);
 }
 EXPORT_SYMBOL_GPL(iomap_page_mkwrite);
-
-/*
- * Private flags for iomap_dio, must not overlap with the public ones in
- * iomap.h:
- */
-#define IOMAP_DIO_WRITE_FUA	(1 << 28)
-#define IOMAP_DIO_NEED_SYNC	(1 << 29)
-#define IOMAP_DIO_WRITE		(1 << 30)
-#define IOMAP_DIO_DIRTY		(1 << 31)
-
-struct iomap_dio {
-	struct kiocb		*iocb;
-	iomap_dio_end_io_t	*end_io;
-	loff_t			i_size;
-	loff_t			size;
-	atomic_t		ref;
-	unsigned		flags;
-	int			error;
-	bool			wait_for_completion;
-
-	union {
-		/* used during submission and for synchronous completion: */
-		struct {
-			struct iov_iter		*iter;
-			struct task_struct	*waiter;
-			struct request_queue	*last_queue;
-			blk_qc_t		cookie;
-		} submit;
-
-		/* used for aio completion: */
-		struct {
-			struct work_struct	work;
-		} aio;
-	};
-};
-
-int iomap_dio_iopoll(struct kiocb *kiocb, bool spin)
-{
-	struct request_queue *q = READ_ONCE(kiocb->private);
-
-	if (!q)
-		return 0;
-	return blk_poll(q, READ_ONCE(kiocb->ki_cookie), spin);
-}
-EXPORT_SYMBOL_GPL(iomap_dio_iopoll);
-
-static void iomap_dio_submit_bio(struct iomap_dio *dio, struct iomap *iomap,
-		struct bio *bio)
-{
-	atomic_inc(&dio->ref);
-
-	if (dio->iocb->ki_flags & IOCB_HIPRI)
-		bio_set_polled(bio, dio->iocb);
-
-	dio->submit.last_queue = bdev_get_queue(iomap->bdev);
-	dio->submit.cookie = submit_bio(bio);
-}
-
-static ssize_t iomap_dio_complete(struct iomap_dio *dio)
-{
-	struct kiocb *iocb = dio->iocb;
-	struct inode *inode = file_inode(iocb->ki_filp);
-	loff_t offset = iocb->ki_pos;
-	ssize_t ret;
-
-	if (dio->end_io) {
-		ret = dio->end_io(iocb,
-				dio->error ? dio->error : dio->size,
-				dio->flags);
-	} else {
-		ret = dio->error;
-	}
-
-	if (likely(!ret)) {
-		ret = dio->size;
-		/* check for short read */
-		if (offset + ret > dio->i_size &&
-		    !(dio->flags & IOMAP_DIO_WRITE))
-			ret = dio->i_size - offset;
-		iocb->ki_pos += ret;
-	}
-
-	/*
-	 * Try again to invalidate clean pages which might have been cached by
-	 * non-direct readahead, or faulted in by get_user_pages() if the source
-	 * of the write was an mmap'ed region of the file we're writing.  Either
-	 * one is a pretty crazy thing to do, so we don't support it 100%.  If
-	 * this invalidation fails, tough, the write still worked...
-	 *
-	 * And this page cache invalidation has to be after dio->end_io(), as
-	 * some filesystems convert unwritten extents to real allocations in
-	 * end_io() when necessary, otherwise a racing buffer read would cache
-	 * zeros from unwritten extents.
-	 */
-	if (!dio->error &&
-	    (dio->flags & IOMAP_DIO_WRITE) && inode->i_mapping->nrpages) {
-		int err;
-		err = invalidate_inode_pages2_range(inode->i_mapping,
-				offset >> PAGE_SHIFT,
-				(offset + dio->size - 1) >> PAGE_SHIFT);
-		if (err)
-			dio_warn_stale_pagecache(iocb->ki_filp);
-	}
-
-	/*
-	 * If this is a DSYNC write, make sure we push it to stable storage now
-	 * that we've written data.
-	 */
-	if (ret > 0 && (dio->flags & IOMAP_DIO_NEED_SYNC))
-		ret = generic_write_sync(iocb, ret);
-
-	inode_dio_end(file_inode(iocb->ki_filp));
-	kfree(dio);
-
-	return ret;
-}
-
-static void iomap_dio_complete_work(struct work_struct *work)
-{
-	struct iomap_dio *dio = container_of(work, struct iomap_dio, aio.work);
-	struct kiocb *iocb = dio->iocb;
-
-	iocb->ki_complete(iocb, iomap_dio_complete(dio), 0);
-}
-
-/*
- * Set an error in the dio if none is set yet.  We have to use cmpxchg
- * as the submission context and the completion context(s) can race to
- * update the error.
- */
-static inline void iomap_dio_set_error(struct iomap_dio *dio, int ret)
-{
-	cmpxchg(&dio->error, 0, ret);
-}
-
-static void iomap_dio_bio_end_io(struct bio *bio)
-{
-	struct iomap_dio *dio = bio->bi_private;
-	bool should_dirty = (dio->flags & IOMAP_DIO_DIRTY);
-
-	if (bio->bi_status)
-		iomap_dio_set_error(dio, blk_status_to_errno(bio->bi_status));
-
-	if (atomic_dec_and_test(&dio->ref)) {
-		if (dio->wait_for_completion) {
-			struct task_struct *waiter = dio->submit.waiter;
-			WRITE_ONCE(dio->submit.waiter, NULL);
-			blk_wake_io_task(waiter);
-		} else if (dio->flags & IOMAP_DIO_WRITE) {
-			struct inode *inode = file_inode(dio->iocb->ki_filp);
-
-			INIT_WORK(&dio->aio.work, iomap_dio_complete_work);
-			queue_work(inode->i_sb->s_dio_done_wq, &dio->aio.work);
-		} else {
-			iomap_dio_complete_work(&dio->aio.work);
-		}
-	}
-
-	if (should_dirty) {
-		bio_check_pages_dirty(bio);
-	} else {
-		bio_release_pages(bio, false);
-		bio_put(bio);
-	}
-}
-
-static void
-iomap_dio_zero(struct iomap_dio *dio, struct iomap *iomap, loff_t pos,
-		unsigned len)
-{
-	struct page *page = ZERO_PAGE(0);
-	int flags = REQ_SYNC | REQ_IDLE;
-	struct bio *bio;
-
-	bio = bio_alloc(GFP_KERNEL, 1);
-	bio_set_dev(bio, iomap->bdev);
-	bio->bi_iter.bi_sector = iomap_sector(iomap, pos);
-	bio->bi_private = dio;
-	bio->bi_end_io = iomap_dio_bio_end_io;
-
-	get_page(page);
-	__bio_add_page(bio, page, len, 0);
-	bio_set_op_attrs(bio, REQ_OP_WRITE, flags);
-	iomap_dio_submit_bio(dio, iomap, bio);
-}
-
-static loff_t
-iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length,
-		struct iomap_dio *dio, struct iomap *iomap)
-{
-	unsigned int blkbits = blksize_bits(bdev_logical_block_size(iomap->bdev));
-	unsigned int fs_block_size = i_blocksize(inode), pad;
-	unsigned int align = iov_iter_alignment(dio->submit.iter);
-	struct iov_iter iter;
-	struct bio *bio;
-	bool need_zeroout = false;
-	bool use_fua = false;
-	int nr_pages, ret = 0;
-	size_t copied = 0;
-
-	if ((pos | length | align) & ((1 << blkbits) - 1))
-		return -EINVAL;
-
-	if (iomap->type == IOMAP_UNWRITTEN) {
-		dio->flags |= IOMAP_DIO_UNWRITTEN;
-		need_zeroout = true;
-	}
-
-	if (iomap->flags & IOMAP_F_SHARED)
-		dio->flags |= IOMAP_DIO_COW;
-
-	if (iomap->flags & IOMAP_F_NEW) {
-		need_zeroout = true;
-	} else if (iomap->type == IOMAP_MAPPED) {
-		/*
-		 * Use a FUA write if we need datasync semantics, this is a pure
-		 * data IO that doesn't require any metadata updates (including
-		 * after IO completion such as unwritten extent conversion) and
-		 * the underlying device supports FUA. This allows us to avoid
-		 * cache flushes on IO completion.
-		 */
-		if (!(iomap->flags & (IOMAP_F_SHARED|IOMAP_F_DIRTY)) &&
-		    (dio->flags & IOMAP_DIO_WRITE_FUA) &&
-		    blk_queue_fua(bdev_get_queue(iomap->bdev)))
-			use_fua = true;
-	}
-
-	/*
-	 * Operate on a partial iter trimmed to the extent we were called for.
-	 * We'll update the iter in the dio once we're done with this extent.
-	 */
-	iter = *dio->submit.iter;
-	iov_iter_truncate(&iter, length);
-
-	nr_pages = iov_iter_npages(&iter, BIO_MAX_PAGES);
-	if (nr_pages <= 0)
-		return nr_pages;
-
-	if (need_zeroout) {
-		/* zero out from the start of the block to the write offset */
-		pad = pos & (fs_block_size - 1);
-		if (pad)
-			iomap_dio_zero(dio, iomap, pos - pad, pad);
-	}
-
-	do {
-		size_t n;
-		if (dio->error) {
-			iov_iter_revert(dio->submit.iter, copied);
-			return 0;
-		}
-
-		bio = bio_alloc(GFP_KERNEL, nr_pages);
-		bio_set_dev(bio, iomap->bdev);
-		bio->bi_iter.bi_sector = iomap_sector(iomap, pos);
-		bio->bi_write_hint = dio->iocb->ki_hint;
-		bio->bi_ioprio = dio->iocb->ki_ioprio;
-		bio->bi_private = dio;
-		bio->bi_end_io = iomap_dio_bio_end_io;
-
-		ret = bio_iov_iter_get_pages(bio, &iter);
-		if (unlikely(ret)) {
-			/*
-			 * We have to stop part way through an IO. We must fall
-			 * through to the sub-block tail zeroing here, otherwise
-			 * this short IO may expose stale data in the tail of
-			 * the block we haven't written data to.
-			 */
-			bio_put(bio);
-			goto zero_tail;
-		}
-
-		n = bio->bi_iter.bi_size;
-		if (dio->flags & IOMAP_DIO_WRITE) {
-			bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_IDLE;
-			if (use_fua)
-				bio->bi_opf |= REQ_FUA;
-			else
-				dio->flags &= ~IOMAP_DIO_WRITE_FUA;
-			task_io_account_write(n);
-		} else {
-			bio->bi_opf = REQ_OP_READ;
-			if (dio->flags & IOMAP_DIO_DIRTY)
-				bio_set_pages_dirty(bio);
-		}
-
-		iov_iter_advance(dio->submit.iter, n);
-
-		dio->size += n;
-		pos += n;
-		copied += n;
-
-		nr_pages = iov_iter_npages(&iter, BIO_MAX_PAGES);
-		iomap_dio_submit_bio(dio, iomap, bio);
-	} while (nr_pages);
-
-	/*
-	 * We need to zeroout the tail of a sub-block write if the extent type
-	 * requires zeroing or the write extends beyond EOF. If we don't zero
-	 * the block tail in the latter case, we can expose stale data via mmap
-	 * reads of the EOF block.
-	 */
-zero_tail:
-	if (need_zeroout ||
-	    ((dio->flags & IOMAP_DIO_WRITE) && pos >= i_size_read(inode))) {
-		/* zero out from the end of the write to the end of the block */
-		pad = pos & (fs_block_size - 1);
-		if (pad)
-			iomap_dio_zero(dio, iomap, pos, fs_block_size - pad);
-	}
-	return copied ? copied : ret;
-}
-
-static loff_t
-iomap_dio_hole_actor(loff_t length, struct iomap_dio *dio)
-{
-	length = iov_iter_zero(length, dio->submit.iter);
-	dio->size += length;
-	return length;
-}
-
-static loff_t
-iomap_dio_inline_actor(struct inode *inode, loff_t pos, loff_t length,
-		struct iomap_dio *dio, struct iomap *iomap)
-{
-	struct iov_iter *iter = dio->submit.iter;
-	size_t copied;
-
-	BUG_ON(pos + length > PAGE_SIZE - offset_in_page(iomap->inline_data));
-
-	if (dio->flags & IOMAP_DIO_WRITE) {
-		loff_t size = inode->i_size;
-
-		if (pos > size)
-			memset(iomap->inline_data + size, 0, pos - size);
-		copied = copy_from_iter(iomap->inline_data + pos, length, iter);
-		if (copied) {
-			if (pos + copied > size)
-				i_size_write(inode, pos + copied);
-			mark_inode_dirty(inode);
-		}
-	} else {
-		copied = copy_to_iter(iomap->inline_data + pos, length, iter);
-	}
-	dio->size += copied;
-	return copied;
-}
-
-static loff_t
-iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length,
-		void *data, struct iomap *iomap)
-{
-	struct iomap_dio *dio = data;
-
-	switch (iomap->type) {
-	case IOMAP_HOLE:
-		if (WARN_ON_ONCE(dio->flags & IOMAP_DIO_WRITE))
-			return -EIO;
-		return iomap_dio_hole_actor(length, dio);
-	case IOMAP_UNWRITTEN:
-		if (!(dio->flags & IOMAP_DIO_WRITE))
-			return iomap_dio_hole_actor(length, dio);
-		return iomap_dio_bio_actor(inode, pos, length, dio, iomap);
-	case IOMAP_MAPPED:
-		return iomap_dio_bio_actor(inode, pos, length, dio, iomap);
-	case IOMAP_INLINE:
-		return iomap_dio_inline_actor(inode, pos, length, dio, iomap);
-	default:
-		WARN_ON_ONCE(1);
-		return -EIO;
-	}
-}
-
-/*
- * iomap_dio_rw() always completes O_[D]SYNC writes regardless of whether the IO
- * is being issued as AIO or not.  This allows us to optimise pure data writes
- * to use REQ_FUA rather than requiring generic_write_sync() to issue a
- * REQ_FLUSH post write. This is slightly tricky because a single request here
- * can be mapped into multiple disjoint IOs and only a subset of the IOs issued
- * may be pure data writes. In that case, we still need to do a full data sync
- * completion.
- */
-ssize_t
-iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
-		const struct iomap_ops *ops, iomap_dio_end_io_t end_io)
-{
-	struct address_space *mapping = iocb->ki_filp->f_mapping;
-	struct inode *inode = file_inode(iocb->ki_filp);
-	size_t count = iov_iter_count(iter);
-	loff_t pos = iocb->ki_pos, start = pos;
-	loff_t end = iocb->ki_pos + count - 1, ret = 0;
-	unsigned int flags = IOMAP_DIRECT;
-	bool wait_for_completion = is_sync_kiocb(iocb);
-	struct blk_plug plug;
-	struct iomap_dio *dio;
-
-	lockdep_assert_held(&inode->i_rwsem);
-
-	if (!count)
-		return 0;
-
-	dio = kmalloc(sizeof(*dio), GFP_KERNEL);
-	if (!dio)
-		return -ENOMEM;
-
-	dio->iocb = iocb;
-	atomic_set(&dio->ref, 1);
-	dio->size = 0;
-	dio->i_size = i_size_read(inode);
-	dio->end_io = end_io;
-	dio->error = 0;
-	dio->flags = 0;
-
-	dio->submit.iter = iter;
-	dio->submit.waiter = current;
-	dio->submit.cookie = BLK_QC_T_NONE;
-	dio->submit.last_queue = NULL;
-
-	if (iov_iter_rw(iter) == READ) {
-		if (pos >= dio->i_size)
-			goto out_free_dio;
-
-		if (iter_is_iovec(iter) && iov_iter_rw(iter) == READ)
-			dio->flags |= IOMAP_DIO_DIRTY;
-	} else {
-		flags |= IOMAP_WRITE;
-		dio->flags |= IOMAP_DIO_WRITE;
-
-		/* for data sync or sync, we need sync completion processing */
-		if (iocb->ki_flags & IOCB_DSYNC)
-			dio->flags |= IOMAP_DIO_NEED_SYNC;
-
-		/*
-		 * For datasync only writes, we optimistically try using FUA for
-		 * this IO.  Any non-FUA write that occurs will clear this flag,
-		 * hence we know before completion whether a cache flush is
-		 * necessary.
-		 */
-		if ((iocb->ki_flags & (IOCB_DSYNC | IOCB_SYNC)) == IOCB_DSYNC)
-			dio->flags |= IOMAP_DIO_WRITE_FUA;
-	}
-
-	if (iocb->ki_flags & IOCB_NOWAIT) {
-		if (filemap_range_has_page(mapping, start, end)) {
-			ret = -EAGAIN;
-			goto out_free_dio;
-		}
-		flags |= IOMAP_NOWAIT;
-	}
-
-	ret = filemap_write_and_wait_range(mapping, start, end);
-	if (ret)
-		goto out_free_dio;
-
-	/*
-	 * Try to invalidate cache pages for the range we're direct
-	 * writing.  If this invalidation fails, tough, the write will
-	 * still work, but racing two incompatible write paths is a
-	 * pretty crazy thing to do, so we don't support it 100%.
-	 */
-	ret = invalidate_inode_pages2_range(mapping,
-			start >> PAGE_SHIFT, end >> PAGE_SHIFT);
-	if (ret)
-		dio_warn_stale_pagecache(iocb->ki_filp);
-	ret = 0;
-
-	if (iov_iter_rw(iter) == WRITE && !wait_for_completion &&
-	    !inode->i_sb->s_dio_done_wq) {
-		ret = sb_init_dio_done_wq(inode->i_sb);
-		if (ret < 0)
-			goto out_free_dio;
-	}
-
-	inode_dio_begin(inode);
-
-	blk_start_plug(&plug);
-	do {
-		ret = iomap_apply(inode, pos, count, flags, ops, dio,
-				iomap_dio_actor);
-		if (ret <= 0) {
-			/* magic error code to fall back to buffered I/O */
-			if (ret == -ENOTBLK) {
-				wait_for_completion = true;
-				ret = 0;
-			}
-			break;
-		}
-		pos += ret;
-
-		if (iov_iter_rw(iter) == READ && pos >= dio->i_size)
-			break;
-	} while ((count = iov_iter_count(iter)) > 0);
-	blk_finish_plug(&plug);
-
-	if (ret < 0)
-		iomap_dio_set_error(dio, ret);
-
-	/*
-	 * If all the writes we issued were FUA, we don't need to flush the
-	 * cache on IO completion. Clear the sync flag for this case.
-	 */
-	if (dio->flags & IOMAP_DIO_WRITE_FUA)
-		dio->flags &= ~IOMAP_DIO_NEED_SYNC;
-
-	WRITE_ONCE(iocb->ki_cookie, dio->submit.cookie);
-	WRITE_ONCE(iocb->private, dio->submit.last_queue);
-
-	/*
-	 * We are about to drop our additional submission reference, which
-	 * might be the last reference to the dio.  There are three three
-	 * different ways we can progress here:
-	 *
-	 *  (a) If this is the last reference we will always complete and free
-	 *	the dio ourselves.
-	 *  (b) If this is not the last reference, and we serve an asynchronous
-	 *	iocb, we must never touch the dio after the decrement, the
-	 *	I/O completion handler will complete and free it.
-	 *  (c) If this is not the last reference, but we serve a synchronous
-	 *	iocb, the I/O completion handler will wake us up on the drop
-	 *	of the final reference, and we will complete and free it here
-	 *	after we got woken by the I/O completion handler.
-	 */
-	dio->wait_for_completion = wait_for_completion;
-	if (!atomic_dec_and_test(&dio->ref)) {
-		if (!wait_for_completion)
-			return -EIOCBQUEUED;
-
-		for (;;) {
-			set_current_state(TASK_UNINTERRUPTIBLE);
-			if (!READ_ONCE(dio->submit.waiter))
-				break;
-
-			if (!(iocb->ki_flags & IOCB_HIPRI) ||
-			    !dio->submit.last_queue ||
-			    !blk_poll(dio->submit.last_queue,
-					 dio->submit.cookie, true))
-				io_schedule();
-		}
-		__set_current_state(TASK_RUNNING);
-	}
-
-	return iomap_dio_complete(dio);
-
-out_free_dio:
-	kfree(dio);
-	return ret;
-}
-EXPORT_SYMBOL_GPL(iomap_dio_rw);
diff --git a/fs/iomap/Makefile b/fs/iomap/Makefile
index 5dfe8b5cf330..a67a97758858 100644
--- a/fs/iomap/Makefile
+++ b/fs/iomap/Makefile
@@ -6,6 +6,7 @@
 obj-$(CONFIG_FS_IOMAP)		+= iomap.o
 
 iomap-y				+= \
+					direct-io.o \
 					fiemap.o \
 					seek.o
 
diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
new file mode 100644
index 000000000000..10517cea9682
--- /dev/null
+++ b/fs/iomap/direct-io.c
@@ -0,0 +1,562 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2010 Red Hat, Inc.
+ * Copyright (c) 2016-2018 Christoph Hellwig.
+ */
+#include <linux/module.h>
+#include <linux/compiler.h>
+#include <linux/fs.h>
+#include <linux/iomap.h>
+#include <linux/backing-dev.h>
+#include <linux/uio.h>
+#include <linux/task_io_accounting_ops.h>
+
+#include "../internal.h"
+
+/*
+ * Private flags for iomap_dio, must not overlap with the public ones in
+ * iomap.h:
+ */
+#define IOMAP_DIO_WRITE_FUA	(1 << 28)
+#define IOMAP_DIO_NEED_SYNC	(1 << 29)
+#define IOMAP_DIO_WRITE		(1 << 30)
+#define IOMAP_DIO_DIRTY		(1 << 31)
+
+struct iomap_dio {
+	struct kiocb		*iocb;
+	iomap_dio_end_io_t	*end_io;
+	loff_t			i_size;
+	loff_t			size;
+	atomic_t		ref;
+	unsigned		flags;
+	int			error;
+	bool			wait_for_completion;
+
+	union {
+		/* used during submission and for synchronous completion: */
+		struct {
+			struct iov_iter		*iter;
+			struct task_struct	*waiter;
+			struct request_queue	*last_queue;
+			blk_qc_t		cookie;
+		} submit;
+
+		/* used for aio completion: */
+		struct {
+			struct work_struct	work;
+		} aio;
+	};
+};
+
+int iomap_dio_iopoll(struct kiocb *kiocb, bool spin)
+{
+	struct request_queue *q = READ_ONCE(kiocb->private);
+
+	if (!q)
+		return 0;
+	return blk_poll(q, READ_ONCE(kiocb->ki_cookie), spin);
+}
+EXPORT_SYMBOL_GPL(iomap_dio_iopoll);
+
+static void iomap_dio_submit_bio(struct iomap_dio *dio, struct iomap *iomap,
+		struct bio *bio)
+{
+	atomic_inc(&dio->ref);
+
+	if (dio->iocb->ki_flags & IOCB_HIPRI)
+		bio_set_polled(bio, dio->iocb);
+
+	dio->submit.last_queue = bdev_get_queue(iomap->bdev);
+	dio->submit.cookie = submit_bio(bio);
+}
+
+static ssize_t iomap_dio_complete(struct iomap_dio *dio)
+{
+	struct kiocb *iocb = dio->iocb;
+	struct inode *inode = file_inode(iocb->ki_filp);
+	loff_t offset = iocb->ki_pos;
+	ssize_t ret;
+
+	if (dio->end_io) {
+		ret = dio->end_io(iocb,
+				dio->error ? dio->error : dio->size,
+				dio->flags);
+	} else {
+		ret = dio->error;
+	}
+
+	if (likely(!ret)) {
+		ret = dio->size;
+		/* check for short read */
+		if (offset + ret > dio->i_size &&
+		    !(dio->flags & IOMAP_DIO_WRITE))
+			ret = dio->i_size - offset;
+		iocb->ki_pos += ret;
+	}
+
+	/*
+	 * Try again to invalidate clean pages which might have been cached by
+	 * non-direct readahead, or faulted in by get_user_pages() if the source
+	 * of the write was an mmap'ed region of the file we're writing.  Either
+	 * one is a pretty crazy thing to do, so we don't support it 100%.  If
+	 * this invalidation fails, tough, the write still worked...
+	 *
+	 * And this page cache invalidation has to be after dio->end_io(), as
+	 * some filesystems convert unwritten extents to real allocations in
+	 * end_io() when necessary, otherwise a racing buffer read would cache
+	 * zeros from unwritten extents.
+	 */
+	if (!dio->error &&
+	    (dio->flags & IOMAP_DIO_WRITE) && inode->i_mapping->nrpages) {
+		int err;
+		err = invalidate_inode_pages2_range(inode->i_mapping,
+				offset >> PAGE_SHIFT,
+				(offset + dio->size - 1) >> PAGE_SHIFT);
+		if (err)
+			dio_warn_stale_pagecache(iocb->ki_filp);
+	}
+
+	/*
+	 * If this is a DSYNC write, make sure we push it to stable storage now
+	 * that we've written data.
+	 */
+	if (ret > 0 && (dio->flags & IOMAP_DIO_NEED_SYNC))
+		ret = generic_write_sync(iocb, ret);
+
+	inode_dio_end(file_inode(iocb->ki_filp));
+	kfree(dio);
+
+	return ret;
+}
+
+static void iomap_dio_complete_work(struct work_struct *work)
+{
+	struct iomap_dio *dio = container_of(work, struct iomap_dio, aio.work);
+	struct kiocb *iocb = dio->iocb;
+
+	iocb->ki_complete(iocb, iomap_dio_complete(dio), 0);
+}
+
+/*
+ * Set an error in the dio if none is set yet.  We have to use cmpxchg
+ * as the submission context and the completion context(s) can race to
+ * update the error.
+ */
+static inline void iomap_dio_set_error(struct iomap_dio *dio, int ret)
+{
+	cmpxchg(&dio->error, 0, ret);
+}
+
+static void iomap_dio_bio_end_io(struct bio *bio)
+{
+	struct iomap_dio *dio = bio->bi_private;
+	bool should_dirty = (dio->flags & IOMAP_DIO_DIRTY);
+
+	if (bio->bi_status)
+		iomap_dio_set_error(dio, blk_status_to_errno(bio->bi_status));
+
+	if (atomic_dec_and_test(&dio->ref)) {
+		if (dio->wait_for_completion) {
+			struct task_struct *waiter = dio->submit.waiter;
+			WRITE_ONCE(dio->submit.waiter, NULL);
+			blk_wake_io_task(waiter);
+		} else if (dio->flags & IOMAP_DIO_WRITE) {
+			struct inode *inode = file_inode(dio->iocb->ki_filp);
+
+			INIT_WORK(&dio->aio.work, iomap_dio_complete_work);
+			queue_work(inode->i_sb->s_dio_done_wq, &dio->aio.work);
+		} else {
+			iomap_dio_complete_work(&dio->aio.work);
+		}
+	}
+
+	if (should_dirty) {
+		bio_check_pages_dirty(bio);
+	} else {
+		bio_release_pages(bio, false);
+		bio_put(bio);
+	}
+}
+
+static void
+iomap_dio_zero(struct iomap_dio *dio, struct iomap *iomap, loff_t pos,
+		unsigned len)
+{
+	struct page *page = ZERO_PAGE(0);
+	int flags = REQ_SYNC | REQ_IDLE;
+	struct bio *bio;
+
+	bio = bio_alloc(GFP_KERNEL, 1);
+	bio_set_dev(bio, iomap->bdev);
+	bio->bi_iter.bi_sector = iomap_sector(iomap, pos);
+	bio->bi_private = dio;
+	bio->bi_end_io = iomap_dio_bio_end_io;
+
+	get_page(page);
+	__bio_add_page(bio, page, len, 0);
+	bio_set_op_attrs(bio, REQ_OP_WRITE, flags);
+	iomap_dio_submit_bio(dio, iomap, bio);
+}
+
+static loff_t
+iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length,
+		struct iomap_dio *dio, struct iomap *iomap)
+{
+	unsigned int blkbits = blksize_bits(bdev_logical_block_size(iomap->bdev));
+	unsigned int fs_block_size = i_blocksize(inode), pad;
+	unsigned int align = iov_iter_alignment(dio->submit.iter);
+	struct iov_iter iter;
+	struct bio *bio;
+	bool need_zeroout = false;
+	bool use_fua = false;
+	int nr_pages, ret = 0;
+	size_t copied = 0;
+
+	if ((pos | length | align) & ((1 << blkbits) - 1))
+		return -EINVAL;
+
+	if (iomap->type == IOMAP_UNWRITTEN) {
+		dio->flags |= IOMAP_DIO_UNWRITTEN;
+		need_zeroout = true;
+	}
+
+	if (iomap->flags & IOMAP_F_SHARED)
+		dio->flags |= IOMAP_DIO_COW;
+
+	if (iomap->flags & IOMAP_F_NEW) {
+		need_zeroout = true;
+	} else if (iomap->type == IOMAP_MAPPED) {
+		/*
+		 * Use a FUA write if we need datasync semantics, this is a pure
+		 * data IO that doesn't require any metadata updates (including
+		 * after IO completion such as unwritten extent conversion) and
+		 * the underlying device supports FUA. This allows us to avoid
+		 * cache flushes on IO completion.
+		 */
+		if (!(iomap->flags & (IOMAP_F_SHARED|IOMAP_F_DIRTY)) &&
+		    (dio->flags & IOMAP_DIO_WRITE_FUA) &&
+		    blk_queue_fua(bdev_get_queue(iomap->bdev)))
+			use_fua = true;
+	}
+
+	/*
+	 * Operate on a partial iter trimmed to the extent we were called for.
+	 * We'll update the iter in the dio once we're done with this extent.
+	 */
+	iter = *dio->submit.iter;
+	iov_iter_truncate(&iter, length);
+
+	nr_pages = iov_iter_npages(&iter, BIO_MAX_PAGES);
+	if (nr_pages <= 0)
+		return nr_pages;
+
+	if (need_zeroout) {
+		/* zero out from the start of the block to the write offset */
+		pad = pos & (fs_block_size - 1);
+		if (pad)
+			iomap_dio_zero(dio, iomap, pos - pad, pad);
+	}
+
+	do {
+		size_t n;
+		if (dio->error) {
+			iov_iter_revert(dio->submit.iter, copied);
+			return 0;
+		}
+
+		bio = bio_alloc(GFP_KERNEL, nr_pages);
+		bio_set_dev(bio, iomap->bdev);
+		bio->bi_iter.bi_sector = iomap_sector(iomap, pos);
+		bio->bi_write_hint = dio->iocb->ki_hint;
+		bio->bi_ioprio = dio->iocb->ki_ioprio;
+		bio->bi_private = dio;
+		bio->bi_end_io = iomap_dio_bio_end_io;
+
+		ret = bio_iov_iter_get_pages(bio, &iter);
+		if (unlikely(ret)) {
+			/*
+			 * We have to stop part way through an IO. We must fall
+			 * through to the sub-block tail zeroing here, otherwise
+			 * this short IO may expose stale data in the tail of
+			 * the block we haven't written data to.
+			 */
+			bio_put(bio);
+			goto zero_tail;
+		}
+
+		n = bio->bi_iter.bi_size;
+		if (dio->flags & IOMAP_DIO_WRITE) {
+			bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_IDLE;
+			if (use_fua)
+				bio->bi_opf |= REQ_FUA;
+			else
+				dio->flags &= ~IOMAP_DIO_WRITE_FUA;
+			task_io_account_write(n);
+		} else {
+			bio->bi_opf = REQ_OP_READ;
+			if (dio->flags & IOMAP_DIO_DIRTY)
+				bio_set_pages_dirty(bio);
+		}
+
+		iov_iter_advance(dio->submit.iter, n);
+
+		dio->size += n;
+		pos += n;
+		copied += n;
+
+		nr_pages = iov_iter_npages(&iter, BIO_MAX_PAGES);
+		iomap_dio_submit_bio(dio, iomap, bio);
+	} while (nr_pages);
+
+	/*
+	 * We need to zeroout the tail of a sub-block write if the extent type
+	 * requires zeroing or the write extends beyond EOF. If we don't zero
+	 * the block tail in the latter case, we can expose stale data via mmap
+	 * reads of the EOF block.
+	 */
+zero_tail:
+	if (need_zeroout ||
+	    ((dio->flags & IOMAP_DIO_WRITE) && pos >= i_size_read(inode))) {
+		/* zero out from the end of the write to the end of the block */
+		pad = pos & (fs_block_size - 1);
+		if (pad)
+			iomap_dio_zero(dio, iomap, pos, fs_block_size - pad);
+	}
+	return copied ? copied : ret;
+}
+
+static loff_t
+iomap_dio_hole_actor(loff_t length, struct iomap_dio *dio)
+{
+	length = iov_iter_zero(length, dio->submit.iter);
+	dio->size += length;
+	return length;
+}
+
+static loff_t
+iomap_dio_inline_actor(struct inode *inode, loff_t pos, loff_t length,
+		struct iomap_dio *dio, struct iomap *iomap)
+{
+	struct iov_iter *iter = dio->submit.iter;
+	size_t copied;
+
+	BUG_ON(pos + length > PAGE_SIZE - offset_in_page(iomap->inline_data));
+
+	if (dio->flags & IOMAP_DIO_WRITE) {
+		loff_t size = inode->i_size;
+
+		if (pos > size)
+			memset(iomap->inline_data + size, 0, pos - size);
+		copied = copy_from_iter(iomap->inline_data + pos, length, iter);
+		if (copied) {
+			if (pos + copied > size)
+				i_size_write(inode, pos + copied);
+			mark_inode_dirty(inode);
+		}
+	} else {
+		copied = copy_to_iter(iomap->inline_data + pos, length, iter);
+	}
+	dio->size += copied;
+	return copied;
+}
+
+static loff_t
+iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length,
+		void *data, struct iomap *iomap)
+{
+	struct iomap_dio *dio = data;
+
+	switch (iomap->type) {
+	case IOMAP_HOLE:
+		if (WARN_ON_ONCE(dio->flags & IOMAP_DIO_WRITE))
+			return -EIO;
+		return iomap_dio_hole_actor(length, dio);
+	case IOMAP_UNWRITTEN:
+		if (!(dio->flags & IOMAP_DIO_WRITE))
+			return iomap_dio_hole_actor(length, dio);
+		return iomap_dio_bio_actor(inode, pos, length, dio, iomap);
+	case IOMAP_MAPPED:
+		return iomap_dio_bio_actor(inode, pos, length, dio, iomap);
+	case IOMAP_INLINE:
+		return iomap_dio_inline_actor(inode, pos, length, dio, iomap);
+	default:
+		WARN_ON_ONCE(1);
+		return -EIO;
+	}
+}
+
+/*
+ * iomap_dio_rw() always completes O_[D]SYNC writes regardless of whether the IO
+ * is being issued as AIO or not.  This allows us to optimise pure data writes
+ * to use REQ_FUA rather than requiring generic_write_sync() to issue a
+ * REQ_FLUSH post write. This is slightly tricky because a single request here
+ * can be mapped into multiple disjoint IOs and only a subset of the IOs issued
+ * may be pure data writes. In that case, we still need to do a full data sync
+ * completion.
+ */
+ssize_t
+iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
+		const struct iomap_ops *ops, iomap_dio_end_io_t end_io)
+{
+	struct address_space *mapping = iocb->ki_filp->f_mapping;
+	struct inode *inode = file_inode(iocb->ki_filp);
+	size_t count = iov_iter_count(iter);
+	loff_t pos = iocb->ki_pos, start = pos;
+	loff_t end = iocb->ki_pos + count - 1, ret = 0;
+	unsigned int flags = IOMAP_DIRECT;
+	bool wait_for_completion = is_sync_kiocb(iocb);
+	struct blk_plug plug;
+	struct iomap_dio *dio;
+
+	lockdep_assert_held(&inode->i_rwsem);
+
+	if (!count)
+		return 0;
+
+	dio = kmalloc(sizeof(*dio), GFP_KERNEL);
+	if (!dio)
+		return -ENOMEM;
+
+	dio->iocb = iocb;
+	atomic_set(&dio->ref, 1);
+	dio->size = 0;
+	dio->i_size = i_size_read(inode);
+	dio->end_io = end_io;
+	dio->error = 0;
+	dio->flags = 0;
+
+	dio->submit.iter = iter;
+	dio->submit.waiter = current;
+	dio->submit.cookie = BLK_QC_T_NONE;
+	dio->submit.last_queue = NULL;
+
+	if (iov_iter_rw(iter) == READ) {
+		if (pos >= dio->i_size)
+			goto out_free_dio;
+
+		if (iter_is_iovec(iter) && iov_iter_rw(iter) == READ)
+			dio->flags |= IOMAP_DIO_DIRTY;
+	} else {
+		flags |= IOMAP_WRITE;
+		dio->flags |= IOMAP_DIO_WRITE;
+
+		/* for data sync or sync, we need sync completion processing */
+		if (iocb->ki_flags & IOCB_DSYNC)
+			dio->flags |= IOMAP_DIO_NEED_SYNC;
+
+		/*
+		 * For datasync only writes, we optimistically try using FUA for
+		 * this IO.  Any non-FUA write that occurs will clear this flag,
+		 * hence we know before completion whether a cache flush is
+		 * necessary.
+		 */
+		if ((iocb->ki_flags & (IOCB_DSYNC | IOCB_SYNC)) == IOCB_DSYNC)
+			dio->flags |= IOMAP_DIO_WRITE_FUA;
+	}
+
+	if (iocb->ki_flags & IOCB_NOWAIT) {
+		if (filemap_range_has_page(mapping, start, end)) {
+			ret = -EAGAIN;
+			goto out_free_dio;
+		}
+		flags |= IOMAP_NOWAIT;
+	}
+
+	ret = filemap_write_and_wait_range(mapping, start, end);
+	if (ret)
+		goto out_free_dio;
+
+	/*
+	 * Try to invalidate cache pages for the range we're direct
+	 * writing.  If this invalidation fails, tough, the write will
+	 * still work, but racing two incompatible write paths is a
+	 * pretty crazy thing to do, so we don't support it 100%.
+	 */
+	ret = invalidate_inode_pages2_range(mapping,
+			start >> PAGE_SHIFT, end >> PAGE_SHIFT);
+	if (ret)
+		dio_warn_stale_pagecache(iocb->ki_filp);
+	ret = 0;
+
+	if (iov_iter_rw(iter) == WRITE && !wait_for_completion &&
+	    !inode->i_sb->s_dio_done_wq) {
+		ret = sb_init_dio_done_wq(inode->i_sb);
+		if (ret < 0)
+			goto out_free_dio;
+	}
+
+	inode_dio_begin(inode);
+
+	blk_start_plug(&plug);
+	do {
+		ret = iomap_apply(inode, pos, count, flags, ops, dio,
+				iomap_dio_actor);
+		if (ret <= 0) {
+			/* magic error code to fall back to buffered I/O */
+			if (ret == -ENOTBLK) {
+				wait_for_completion = true;
+				ret = 0;
+			}
+			break;
+		}
+		pos += ret;
+
+		if (iov_iter_rw(iter) == READ && pos >= dio->i_size)
+			break;
+	} while ((count = iov_iter_count(iter)) > 0);
+	blk_finish_plug(&plug);
+
+	if (ret < 0)
+		iomap_dio_set_error(dio, ret);
+
+	/*
+	 * If all the writes we issued were FUA, we don't need to flush the
+	 * cache on IO completion. Clear the sync flag for this case.
+	 */
+	if (dio->flags & IOMAP_DIO_WRITE_FUA)
+		dio->flags &= ~IOMAP_DIO_NEED_SYNC;
+
+	WRITE_ONCE(iocb->ki_cookie, dio->submit.cookie);
+	WRITE_ONCE(iocb->private, dio->submit.last_queue);
+
+	/*
+	 * We are about to drop our additional submission reference, which
+	 * might be the last reference to the dio.  There are three three
+	 * different ways we can progress here:
+	 *
+	 *  (a) If this is the last reference we will always complete and free
+	 *	the dio ourselves.
+	 *  (b) If this is not the last reference, and we serve an asynchronous
+	 *	iocb, we must never touch the dio after the decrement, the
+	 *	I/O completion handler will complete and free it.
+	 *  (c) If this is not the last reference, but we serve a synchronous
+	 *	iocb, the I/O completion handler will wake us up on the drop
+	 *	of the final reference, and we will complete and free it here
+	 *	after we got woken by the I/O completion handler.
+	 */
+	dio->wait_for_completion = wait_for_completion;
+	if (!atomic_dec_and_test(&dio->ref)) {
+		if (!wait_for_completion)
+			return -EIOCBQUEUED;
+
+		for (;;) {
+			set_current_state(TASK_UNINTERRUPTIBLE);
+			if (!READ_ONCE(dio->submit.waiter))
+				break;
+
+			if (!(iocb->ki_flags & IOCB_HIPRI) ||
+			    !dio->submit.last_queue ||
+			    !blk_poll(dio->submit.last_queue,
+					 dio->submit.cookie, true))
+				io_schedule();
+		}
+		__set_current_state(TASK_RUNNING);
+	}
+
+	return iomap_dio_complete(dio);
+
+out_free_dio:
+	kfree(dio);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(iomap_dio_rw);
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index 1df9ea187a9a..baa1e2d31f05 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -7,6 +7,7 @@
 #include <linux/mm.h>
 #include <linux/types.h>
 #include <linux/mm_types.h>
+#include <linux/blkdev.h>
 
 struct address_space;
 struct fiemap_extent_info;
@@ -69,6 +70,12 @@ struct iomap {
 	const struct iomap_page_ops *page_ops;
 };
 
+static inline sector_t
+iomap_sector(struct iomap *iomap, loff_t pos)
+{
+	return (iomap->addr + pos - iomap->offset) >> SECTOR_SHIFT;
+}
+
 /*
  * When a filesystem sets page_ops in an iomap mapping it returns, page_prepare
  * and page_done will be called for each page written to.  This only applies to
-- 
cgit v1.2.3


From 5d907307adc14cd5148b07629c2b4535acd06062 Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong" <darrick.wong@oracle.com>
Date: Mon, 15 Jul 2019 08:51:01 -0700
Subject: iomap: move internal declarations into fs/iomap/

Move internal function declarations out of fs/internal.h into
include/linux/iomap.h so that our transition is complete.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
 fs/dax.c              |  1 -
 fs/internal.h         | 10 ----------
 fs/iomap/apply.c      |  2 --
 fs/iomap/fiemap.c     |  2 --
 fs/iomap/seek.c       |  2 --
 fs/iomap/swapfile.c   |  2 --
 include/linux/iomap.h | 10 ++++++++++
 7 files changed, 10 insertions(+), 19 deletions(-)

(limited to 'include/linux')

diff --git a/fs/dax.c b/fs/dax.c
index fe5e33810cd4..cb53f9bd6fd7 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -26,7 +26,6 @@
 #include <linux/mmu_notifier.h>
 #include <linux/iomap.h>
 #include <asm/pgalloc.h>
-#include "internal.h"
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/fs_dax.h>
diff --git a/fs/internal.h b/fs/internal.h
index 2f3c3de51fad..2b0bebd67904 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -182,15 +182,5 @@ extern const struct dentry_operations ns_dentry_operations;
 extern int do_vfs_ioctl(struct file *file, unsigned int fd, unsigned int cmd,
 		    unsigned long arg);
 
-/*
- * iomap support:
- */
-typedef loff_t (*iomap_actor_t)(struct inode *inode, loff_t pos, loff_t len,
-		void *data, struct iomap *iomap);
-
-loff_t iomap_apply(struct inode *inode, loff_t pos, loff_t length,
-		unsigned flags, const struct iomap_ops *ops, void *data,
-		iomap_actor_t actor);
-
 /* direct-io.c: */
 int sb_init_dio_done_wq(struct super_block *sb);
diff --git a/fs/iomap/apply.c b/fs/iomap/apply.c
index 9f956cf23867..54c02aecf3cd 100644
--- a/fs/iomap/apply.c
+++ b/fs/iomap/apply.c
@@ -8,8 +8,6 @@
 #include <linux/fs.h>
 #include <linux/iomap.h>
 
-#include "../internal.h"
-
 /*
  * Execute a iomap write on a segment of the mapping that spans a
  * contiguous range of pages that have identical block mapping state.
diff --git a/fs/iomap/fiemap.c b/fs/iomap/fiemap.c
index 1fc88ec1584d..f26fdd36e383 100644
--- a/fs/iomap/fiemap.c
+++ b/fs/iomap/fiemap.c
@@ -7,8 +7,6 @@
 #include <linux/fs.h>
 #include <linux/iomap.h>
 
-#include "../internal.h"
-
 struct fiemap_ctx {
 	struct fiemap_extent_info *fi;
 	struct iomap prev;
diff --git a/fs/iomap/seek.c b/fs/iomap/seek.c
index 715442eb71aa..c04bad4b2b43 100644
--- a/fs/iomap/seek.c
+++ b/fs/iomap/seek.c
@@ -10,8 +10,6 @@
 #include <linux/pagemap.h>
 #include <linux/pagevec.h>
 
-#include "../internal.h"
-
 /*
  * Seek for SEEK_DATA / SEEK_HOLE within @page, starting at @lastoff.
  * Returns true if found and updates @lastoff to the offset in file.
diff --git a/fs/iomap/swapfile.c b/fs/iomap/swapfile.c
index b79c33631263..152a230f668d 100644
--- a/fs/iomap/swapfile.c
+++ b/fs/iomap/swapfile.c
@@ -9,8 +9,6 @@
 #include <linux/iomap.h>
 #include <linux/swap.h>
 
-#include "../internal.h"
-
 /* Swapfile activation */
 
 struct iomap_swapfile_info {
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index baa1e2d31f05..bc499ceae392 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -122,6 +122,16 @@ struct iomap_ops {
 			ssize_t written, unsigned flags, struct iomap *iomap);
 };
 
+/*
+ * Main iomap iterator function.
+ */
+typedef loff_t (*iomap_actor_t)(struct inode *inode, loff_t pos, loff_t len,
+		void *data, struct iomap *iomap);
+
+loff_t iomap_apply(struct inode *inode, loff_t pos, loff_t length,
+		unsigned flags, const struct iomap_ops *ops, void *data,
+		iomap_actor_t actor);
+
 /*
  * Structure allocate for each page when block size < PAGE_SIZE to track
  * sub-page uptodate status and I/O completions.
-- 
cgit v1.2.3


From 9af93db9e140a4e6e79cdb098919bc928a72cd59 Mon Sep 17 00:00:00 2001
From: Daniel Drake <drake@endlessm.com>
Date: Wed, 17 Jul 2019 13:10:58 +0800
Subject: platform/x86: asus: Rename "fan mode" to "fan boost mode"

The Asus WMI spec indicates that the function being controlled here
is called "Fan Boost Mode". The user-facing documentation also calls it
this.

The spec uses the term "fan mode" to refer to other things,
including functionality expected to appear on future products.
We missed this before as we are not dealing with the most readable of
specs, and didn't foresee any confusion around shortening the name.

Rename "fan mode" to "fan boost mode" to improve consistency with the
spec and to avoid a future naming conflict.

There is no interface breakage here since this has yet to be included
in an official kernel release. I also updated the kernel version listed
under ABI accordingly.

Signed-off-by: Daniel Drake <drake@endlessm.com>
Acked-by: Yurii Pavlovskyi <yurii.pavlovskyi@gmail.com>
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
---
 Documentation/ABI/testing/sysfs-platform-asus-wmi |   6 +-
 drivers/platform/x86/asus-wmi.c                   | 118 ++++++++++++----------
 include/linux/platform_data/x86/asus-wmi.h        |   2 +-
 3 files changed, 66 insertions(+), 60 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/ABI/testing/sysfs-platform-asus-wmi b/Documentation/ABI/testing/sysfs-platform-asus-wmi
index 87ae5cc983bf..9e99f2909612 100644
--- a/Documentation/ABI/testing/sysfs-platform-asus-wmi
+++ b/Documentation/ABI/testing/sysfs-platform-asus-wmi
@@ -37,9 +37,9 @@ Contact:	"AceLan Kao" <acelan.kao@canonical.com>
 Description:
 		Resume on lid open. 1 means on, 0 means off.
 
-What:		/sys/devices/platform/<platform>/fan_mode
-Date:		Apr 2019
-KernelVersion:	5.2
+What:		/sys/devices/platform/<platform>/fan_boost_mode
+Date:		Sep 2019
+KernelVersion:	5.3
 Contact:	"Yurii Pavlovskyi" <yurii.pavlovskyi@gmail.com>
 Description:
 		Fan boost mode:
diff --git a/drivers/platform/x86/asus-wmi.c b/drivers/platform/x86/asus-wmi.c
index 18f3a8bad52f..ca28d27dae63 100644
--- a/drivers/platform/x86/asus-wmi.c
+++ b/drivers/platform/x86/asus-wmi.c
@@ -68,12 +68,12 @@ MODULE_LICENSE("GPL");
 #define ASUS_FAN_CTRL_MANUAL		1
 #define ASUS_FAN_CTRL_AUTO		2
 
-#define ASUS_FAN_MODE_NORMAL		0
-#define ASUS_FAN_MODE_OVERBOOST		1
-#define ASUS_FAN_MODE_OVERBOOST_MASK	0x01
-#define ASUS_FAN_MODE_SILENT		2
-#define ASUS_FAN_MODE_SILENT_MASK	0x02
-#define ASUS_FAN_MODES_MASK		0x03
+#define ASUS_FAN_BOOST_MODE_NORMAL		0
+#define ASUS_FAN_BOOST_MODE_OVERBOOST		1
+#define ASUS_FAN_BOOST_MODE_OVERBOOST_MASK	0x01
+#define ASUS_FAN_BOOST_MODE_SILENT		2
+#define ASUS_FAN_BOOST_MODE_SILENT_MASK		0x02
+#define ASUS_FAN_BOOST_MODES_MASK		0x03
 
 #define USB_INTEL_XUSB2PR		0xD0
 #define PCI_DEVICE_ID_INTEL_LYNXPOINT_LP_XHCI	0x9c31
@@ -182,9 +182,9 @@ struct asus_wmi {
 	int asus_hwmon_num_fans;
 	int asus_hwmon_pwm;
 
-	bool fan_mode_available;
-	u8 fan_mode_mask;
-	u8 fan_mode;
+	bool fan_boost_mode_available;
+	u8 fan_boost_mode_mask;
+	u8 fan_boost_mode;
 
 	struct hotplug_slot hotplug_slot;
 	struct mutex hotplug_lock;
@@ -1487,14 +1487,15 @@ static int asus_wmi_fan_init(struct asus_wmi *asus)
 
 /* Fan mode *******************************************************************/
 
-static int fan_mode_check_present(struct asus_wmi *asus)
+static int fan_boost_mode_check_present(struct asus_wmi *asus)
 {
 	u32 result;
 	int err;
 
-	asus->fan_mode_available = false;
+	asus->fan_boost_mode_available = false;
 
-	err = asus_wmi_get_devstate(asus, ASUS_WMI_DEVID_FAN_MODE, &result);
+	err = asus_wmi_get_devstate(asus, ASUS_WMI_DEVID_FAN_BOOST_MODE,
+				    &result);
 	if (err) {
 		if (err == -ENODEV)
 			return 0;
@@ -1503,72 +1504,77 @@ static int fan_mode_check_present(struct asus_wmi *asus)
 	}
 
 	if ((result & ASUS_WMI_DSTS_PRESENCE_BIT) &&
-			(result & ASUS_FAN_MODES_MASK)) {
-		asus->fan_mode_available = true;
-		asus->fan_mode_mask = result & ASUS_FAN_MODES_MASK;
+			(result & ASUS_FAN_BOOST_MODES_MASK)) {
+		asus->fan_boost_mode_available = true;
+		asus->fan_boost_mode_mask = result & ASUS_FAN_BOOST_MODES_MASK;
 	}
 
 	return 0;
 }
 
-static int fan_mode_write(struct asus_wmi *asus)
+static int fan_boost_mode_write(struct asus_wmi *asus)
 {
 	int err;
 	u8 value;
 	u32 retval;
 
-	value = asus->fan_mode;
+	value = asus->fan_boost_mode;
 
-	pr_info("Set fan mode: %u\n", value);
-	err = asus_wmi_set_devstate(ASUS_WMI_DEVID_FAN_MODE, value, &retval);
+	pr_info("Set fan boost mode: %u\n", value);
+	err = asus_wmi_set_devstate(ASUS_WMI_DEVID_FAN_BOOST_MODE, value,
+				    &retval);
 
 	if (err) {
-		pr_warn("Failed to set fan mode: %d\n", err);
+		pr_warn("Failed to set fan boost mode: %d\n", err);
 		return err;
 	}
 
 	if (retval != 1) {
-		pr_warn("Failed to set fan mode (retval): 0x%x\n", retval);
+		pr_warn("Failed to set fan boost mode (retval): 0x%x\n",
+			retval);
 		return -EIO;
 	}
 
 	return 0;
 }
 
-static int fan_mode_switch_next(struct asus_wmi *asus)
+static int fan_boost_mode_switch_next(struct asus_wmi *asus)
 {
-	if (asus->fan_mode == ASUS_FAN_MODE_NORMAL) {
-		if (asus->fan_mode_mask & ASUS_FAN_MODE_OVERBOOST_MASK)
-			asus->fan_mode = ASUS_FAN_MODE_OVERBOOST;
-		else if (asus->fan_mode_mask & ASUS_FAN_MODE_SILENT_MASK)
-			asus->fan_mode = ASUS_FAN_MODE_SILENT;
-	} else if (asus->fan_mode == ASUS_FAN_MODE_OVERBOOST) {
-		if (asus->fan_mode_mask & ASUS_FAN_MODE_SILENT_MASK)
-			asus->fan_mode = ASUS_FAN_MODE_SILENT;
+	u8 mask = asus->fan_boost_mode_mask;
+
+	if (asus->fan_boost_mode == ASUS_FAN_BOOST_MODE_NORMAL) {
+		if (mask & ASUS_FAN_BOOST_MODE_OVERBOOST_MASK)
+			asus->fan_boost_mode = ASUS_FAN_BOOST_MODE_OVERBOOST;
+		else if (mask & ASUS_FAN_BOOST_MODE_SILENT_MASK)
+			asus->fan_boost_mode = ASUS_FAN_BOOST_MODE_SILENT;
+	} else if (asus->fan_boost_mode == ASUS_FAN_BOOST_MODE_OVERBOOST) {
+		if (mask & ASUS_FAN_BOOST_MODE_SILENT_MASK)
+			asus->fan_boost_mode = ASUS_FAN_BOOST_MODE_SILENT;
 		else
-			asus->fan_mode = ASUS_FAN_MODE_NORMAL;
+			asus->fan_boost_mode = ASUS_FAN_BOOST_MODE_NORMAL;
 	} else {
-		asus->fan_mode = ASUS_FAN_MODE_NORMAL;
+		asus->fan_boost_mode = ASUS_FAN_BOOST_MODE_NORMAL;
 	}
 
-	return fan_mode_write(asus);
+	return fan_boost_mode_write(asus);
 }
 
-static ssize_t fan_mode_show(struct device *dev,
-		struct device_attribute *attr, char *buf)
+static ssize_t fan_boost_mode_show(struct device *dev,
+				   struct device_attribute *attr, char *buf)
 {
 	struct asus_wmi *asus = dev_get_drvdata(dev);
 
-	return scnprintf(buf, PAGE_SIZE, "%d\n", asus->fan_mode);
+	return scnprintf(buf, PAGE_SIZE, "%d\n", asus->fan_boost_mode);
 }
 
-static ssize_t fan_mode_store(struct device *dev, struct device_attribute *attr,
-		const char *buf, size_t count)
+static ssize_t fan_boost_mode_store(struct device *dev,
+				    struct device_attribute *attr,
+				    const char *buf, size_t count)
 {
 	int result;
 	u8 new_mode;
-
 	struct asus_wmi *asus = dev_get_drvdata(dev);
+	u8 mask = asus->fan_boost_mode_mask;
 
 	result = kstrtou8(buf, 10, &new_mode);
 	if (result < 0) {
@@ -1576,24 +1582,24 @@ static ssize_t fan_mode_store(struct device *dev, struct device_attribute *attr,
 		return result;
 	}
 
-	if (new_mode == ASUS_FAN_MODE_OVERBOOST) {
-		if (!(asus->fan_mode_mask & ASUS_FAN_MODE_OVERBOOST_MASK))
+	if (new_mode == ASUS_FAN_BOOST_MODE_OVERBOOST) {
+		if (!(mask & ASUS_FAN_BOOST_MODE_OVERBOOST_MASK))
 			return -EINVAL;
-	} else if (new_mode == ASUS_FAN_MODE_SILENT) {
-		if (!(asus->fan_mode_mask & ASUS_FAN_MODE_SILENT_MASK))
+	} else if (new_mode == ASUS_FAN_BOOST_MODE_SILENT) {
+		if (!(mask & ASUS_FAN_BOOST_MODE_SILENT_MASK))
 			return -EINVAL;
-	} else if (new_mode != ASUS_FAN_MODE_NORMAL) {
+	} else if (new_mode != ASUS_FAN_BOOST_MODE_NORMAL) {
 		return -EINVAL;
 	}
 
-	asus->fan_mode = new_mode;
-	fan_mode_write(asus);
+	asus->fan_boost_mode = new_mode;
+	fan_boost_mode_write(asus);
 
 	return result;
 }
 
-// Fan mode: 0 - normal, 1 - overboost, 2 - silent
-static DEVICE_ATTR_RW(fan_mode);
+// Fan boost mode: 0 - normal, 1 - overboost, 2 - silent
+static DEVICE_ATTR_RW(fan_boost_mode);
 
 /* Backlight ******************************************************************/
 
@@ -1873,8 +1879,8 @@ static void asus_wmi_handle_event_code(int code, struct asus_wmi *asus)
 		return;
 	}
 
-	if (asus->fan_mode_available && code == NOTIFY_KBD_FBM) {
-		fan_mode_switch_next(asus);
+	if (asus->fan_boost_mode_available && code == NOTIFY_KBD_FBM) {
+		fan_boost_mode_switch_next(asus);
 		return;
 	}
 
@@ -2034,7 +2040,7 @@ static struct attribute *platform_attributes[] = {
 	&dev_attr_touchpad.attr,
 	&dev_attr_lid_resume.attr,
 	&dev_attr_als_enable.attr,
-	&dev_attr_fan_mode.attr,
+	&dev_attr_fan_boost_mode.attr,
 	NULL
 };
 
@@ -2056,8 +2062,8 @@ static umode_t asus_sysfs_is_visible(struct kobject *kobj,
 		devid = ASUS_WMI_DEVID_LID_RESUME;
 	else if (attr == &dev_attr_als_enable.attr)
 		devid = ASUS_WMI_DEVID_ALS_ENABLE;
-	else if (attr == &dev_attr_fan_mode.attr)
-		ok = asus->fan_mode_available;
+	else if (attr == &dev_attr_fan_boost_mode.attr)
+		ok = asus->fan_boost_mode_available;
 
 	if (devid != -1)
 		ok = !(asus_wmi_get_devstate_simple(asus, devid) < 0);
@@ -2315,9 +2321,9 @@ static int asus_wmi_add(struct platform_device *pdev)
 	if (err)
 		goto fail_platform;
 
-	err = fan_mode_check_present(asus);
+	err = fan_boost_mode_check_present(asus);
 	if (err)
-		goto fail_fan_mode;
+		goto fail_fan_boost_mode;
 
 	err = asus_wmi_sysfs_init(asus->platform_device);
 	if (err)
@@ -2402,7 +2408,7 @@ fail_hwmon:
 fail_input:
 	asus_wmi_sysfs_exit(asus->platform_device);
 fail_sysfs:
-fail_fan_mode:
+fail_fan_boost_mode:
 fail_platform:
 	kfree(asus);
 	return err;
diff --git a/include/linux/platform_data/x86/asus-wmi.h b/include/linux/platform_data/x86/asus-wmi.h
index 8551156b8dca..4802cd2c7309 100644
--- a/include/linux/platform_data/x86/asus-wmi.h
+++ b/include/linux/platform_data/x86/asus-wmi.h
@@ -57,7 +57,7 @@
 #define ASUS_WMI_DEVID_KBD_BACKLIGHT	0x00050021
 #define ASUS_WMI_DEVID_LIGHT_SENSOR	0x00050022 /* ?? */
 #define ASUS_WMI_DEVID_LIGHTBAR		0x00050025
-#define ASUS_WMI_DEVID_FAN_MODE		0x00110018
+#define ASUS_WMI_DEVID_FAN_BOOST_MODE	0x00110018
 
 /* Misc */
 #define ASUS_WMI_DEVID_CAMERA		0x00060013
-- 
cgit v1.2.3


From 733232f8c852bcc2ad6fc1db7f4c43eb01c7c217 Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Wed, 17 Jul 2019 12:57:06 -0400
Subject: dm: use printk ratelimiting functions

DM provided its own ratelimiting printk wrapper but given printk
advances this is no longer needed.

Also, switching DMDEBUG_LIMIT to using pr_debug_ratelimited() fixes the
reported issue where DMDEBUG_LIMIT() still caused a flood of "callbacks
suppressed" messages.

Reported-by: Milan Broz <gmazyland@gmail.com>
Depends-on: 29fc2bc7539386 ("printk: pr_debug_ratelimited: check state first to reduce "callbacks suppressed" messages")
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 include/linux/device-mapper.h | 17 ++++-------------
 1 file changed, 4 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h
index e1f51d607cc5..603ce5bb4fac 100644
--- a/include/linux/device-mapper.h
+++ b/include/linux/device-mapper.h
@@ -530,29 +530,20 @@ void *dm_vcalloc(unsigned long nmemb, unsigned long elem_size);
  *---------------------------------------------------------------*/
 #define DM_NAME "device-mapper"
 
-#define DM_RATELIMIT(pr_func, fmt, ...)					\
-do {									\
-	static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL,	\
-				      DEFAULT_RATELIMIT_BURST);		\
-									\
-	if (__ratelimit(&rs))						\
-		pr_func(DM_FMT(fmt), ##__VA_ARGS__);			\
-} while (0)
-
 #define DM_FMT(fmt) DM_NAME ": " DM_MSG_PREFIX ": " fmt "\n"
 
 #define DMCRIT(fmt, ...) pr_crit(DM_FMT(fmt), ##__VA_ARGS__)
 
 #define DMERR(fmt, ...) pr_err(DM_FMT(fmt), ##__VA_ARGS__)
-#define DMERR_LIMIT(fmt, ...) DM_RATELIMIT(pr_err, fmt, ##__VA_ARGS__)
+#define DMERR_LIMIT(fmt, ...) pr_err_ratelimited(DM_FMT(fmt), ##__VA_ARGS__)
 #define DMWARN(fmt, ...) pr_warn(DM_FMT(fmt), ##__VA_ARGS__)
-#define DMWARN_LIMIT(fmt, ...) DM_RATELIMIT(pr_warn, fmt, ##__VA_ARGS__)
+#define DMWARN_LIMIT(fmt, ...) pr_warn_ratelimited(DM_FMT(fmt), ##__VA_ARGS__)
 #define DMINFO(fmt, ...) pr_info(DM_FMT(fmt), ##__VA_ARGS__)
-#define DMINFO_LIMIT(fmt, ...) DM_RATELIMIT(pr_info, fmt, ##__VA_ARGS__)
+#define DMINFO_LIMIT(fmt, ...) pr_info_ratelimited(DM_FMT(fmt), ##__VA_ARGS__)
 
 #ifdef CONFIG_DM_DEBUG
 #define DMDEBUG(fmt, ...) printk(KERN_DEBUG DM_FMT(fmt), ##__VA_ARGS__)
-#define DMDEBUG_LIMIT(fmt, ...) DM_RATELIMIT(pr_debug, fmt, ##__VA_ARGS__)
+#define DMDEBUG_LIMIT(fmt, ...) pr_debug_ratelimited(DM_FMT(fmt), ##__VA_ARGS__)
 #else
 #define DMDEBUG(fmt, ...) no_printk(fmt, ##__VA_ARGS__)
 #define DMDEBUG_LIMIT(fmt, ...) no_printk(fmt, ##__VA_ARGS__)
-- 
cgit v1.2.3


From 7402a4fedc2bc448100c2d086406c708451b16dc Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Tue, 16 Jul 2019 13:51:29 -0400
Subject: SUNRPC: Fix up backchannel slot table accounting

Add a per-transport maximum limit in the socket case, and add
helpers to allow the NFSv4 code to discover that limit.

Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 fs/nfs/nfs4proc.c                 |  3 +++
 include/linux/sunrpc/bc_xprt.h    |  1 +
 include/linux/sunrpc/clnt.h       |  1 +
 include/linux/sunrpc/xprt.h       |  6 ++++--
 net/sunrpc/backchannel_rqst.c     | 40 +++++++++++++++++++++------------------
 net/sunrpc/clnt.c                 | 13 +++++++++++++
 net/sunrpc/svc.c                  |  2 +-
 net/sunrpc/xprtrdma/backchannel.c |  7 +++++++
 net/sunrpc/xprtrdma/transport.c   |  1 +
 net/sunrpc/xprtrdma/xprt_rdma.h   |  1 +
 net/sunrpc/xprtsock.c             |  1 +
 11 files changed, 55 insertions(+), 21 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 52de7245a2ee..39896afc6edf 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -8380,6 +8380,7 @@ static void nfs4_init_channel_attrs(struct nfs41_create_session_args *args,
 {
 	unsigned int max_rqst_sz, max_resp_sz;
 	unsigned int max_bc_payload = rpc_max_bc_payload(clnt);
+	unsigned int max_bc_slots = rpc_num_bc_slots(clnt);
 
 	max_rqst_sz = NFS_MAX_FILE_IO_SIZE + nfs41_maxwrite_overhead;
 	max_resp_sz = NFS_MAX_FILE_IO_SIZE + nfs41_maxread_overhead;
@@ -8402,6 +8403,8 @@ static void nfs4_init_channel_attrs(struct nfs41_create_session_args *args,
 	args->bc_attrs.max_resp_sz_cached = 0;
 	args->bc_attrs.max_ops = NFS4_MAX_BACK_CHANNEL_OPS;
 	args->bc_attrs.max_reqs = max_t(unsigned short, max_session_cb_slots, 1);
+	if (args->bc_attrs.max_reqs > max_bc_slots)
+		args->bc_attrs.max_reqs = max_bc_slots;
 
 	dprintk("%s: Back Channel : max_rqst_sz=%u max_resp_sz=%u "
 		"max_resp_sz_cached=%u max_ops=%u max_reqs=%u\n",
diff --git a/include/linux/sunrpc/bc_xprt.h b/include/linux/sunrpc/bc_xprt.h
index d4229a78524a..87d27e13d885 100644
--- a/include/linux/sunrpc/bc_xprt.h
+++ b/include/linux/sunrpc/bc_xprt.h
@@ -43,6 +43,7 @@ void xprt_destroy_backchannel(struct rpc_xprt *, unsigned int max_reqs);
 int xprt_setup_bc(struct rpc_xprt *xprt, unsigned int min_reqs);
 void xprt_destroy_bc(struct rpc_xprt *xprt, unsigned int max_reqs);
 void xprt_free_bc_rqst(struct rpc_rqst *req);
+unsigned int xprt_bc_max_slots(struct rpc_xprt *xprt);
 
 /*
  * Determine if a shared backchannel is in use
diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h
index 4e070e00c143..abc63bd1be2b 100644
--- a/include/linux/sunrpc/clnt.h
+++ b/include/linux/sunrpc/clnt.h
@@ -194,6 +194,7 @@ void		rpc_setbufsize(struct rpc_clnt *, unsigned int, unsigned int);
 struct net *	rpc_net_ns(struct rpc_clnt *);
 size_t		rpc_max_payload(struct rpc_clnt *);
 size_t		rpc_max_bc_payload(struct rpc_clnt *);
+unsigned int	rpc_num_bc_slots(struct rpc_clnt *);
 void		rpc_force_rebind(struct rpc_clnt *);
 size_t		rpc_peeraddr(struct rpc_clnt *, struct sockaddr *, size_t);
 const char	*rpc_peeraddr2str(struct rpc_clnt *, enum rpc_display_format_t);
diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h
index ed76e5fb36c1..13e108bcc9eb 100644
--- a/include/linux/sunrpc/xprt.h
+++ b/include/linux/sunrpc/xprt.h
@@ -158,6 +158,7 @@ struct rpc_xprt_ops {
 	int		(*bc_setup)(struct rpc_xprt *xprt,
 				    unsigned int min_reqs);
 	size_t		(*bc_maxpayload)(struct rpc_xprt *xprt);
+	unsigned int	(*bc_num_slots)(struct rpc_xprt *xprt);
 	void		(*bc_free_rqst)(struct rpc_rqst *rqst);
 	void		(*bc_destroy)(struct rpc_xprt *xprt,
 				      unsigned int max_reqs);
@@ -251,8 +252,9 @@ struct rpc_xprt {
 #if defined(CONFIG_SUNRPC_BACKCHANNEL)
 	struct svc_serv		*bc_serv;       /* The RPC service which will */
 						/* process the callback */
-	int			bc_alloc_count;	/* Total number of preallocs */
-	atomic_t		bc_free_slots;
+	unsigned int		bc_alloc_max;
+	unsigned int		bc_alloc_count;	/* Total number of preallocs */
+	atomic_t		bc_slot_count;	/* Number of allocated slots */
 	spinlock_t		bc_pa_lock;	/* Protects the preallocated
 						 * items */
 	struct list_head	bc_pa_list;	/* List of preallocated
diff --git a/net/sunrpc/backchannel_rqst.c b/net/sunrpc/backchannel_rqst.c
index c47d82622fd1..339e8c077c2d 100644
--- a/net/sunrpc/backchannel_rqst.c
+++ b/net/sunrpc/backchannel_rqst.c
@@ -31,25 +31,20 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define RPCDBG_FACILITY	RPCDBG_TRANS
 #endif
 
+#define BC_MAX_SLOTS	64U
+
+unsigned int xprt_bc_max_slots(struct rpc_xprt *xprt)
+{
+	return BC_MAX_SLOTS;
+}
+
 /*
  * Helper routines that track the number of preallocation elements
  * on the transport.
  */
 static inline int xprt_need_to_requeue(struct rpc_xprt *xprt)
 {
-	return xprt->bc_alloc_count < atomic_read(&xprt->bc_free_slots);
-}
-
-static inline void xprt_inc_alloc_count(struct rpc_xprt *xprt, unsigned int n)
-{
-	atomic_add(n, &xprt->bc_free_slots);
-	xprt->bc_alloc_count += n;
-}
-
-static inline int xprt_dec_alloc_count(struct rpc_xprt *xprt, unsigned int n)
-{
-	atomic_sub(n, &xprt->bc_free_slots);
-	return xprt->bc_alloc_count -= n;
+	return xprt->bc_alloc_count < xprt->bc_alloc_max;
 }
 
 /*
@@ -145,6 +140,9 @@ int xprt_setup_bc(struct rpc_xprt *xprt, unsigned int min_reqs)
 
 	dprintk("RPC:       setup backchannel transport\n");
 
+	if (min_reqs > BC_MAX_SLOTS)
+		min_reqs = BC_MAX_SLOTS;
+
 	/*
 	 * We use a temporary list to keep track of the preallocated
 	 * buffers.  Once we're done building the list we splice it
@@ -172,7 +170,9 @@ int xprt_setup_bc(struct rpc_xprt *xprt, unsigned int min_reqs)
 	 */
 	spin_lock(&xprt->bc_pa_lock);
 	list_splice(&tmp_list, &xprt->bc_pa_list);
-	xprt_inc_alloc_count(xprt, min_reqs);
+	xprt->bc_alloc_count += min_reqs;
+	xprt->bc_alloc_max += min_reqs;
+	atomic_add(min_reqs, &xprt->bc_slot_count);
 	spin_unlock(&xprt->bc_pa_lock);
 
 	dprintk("RPC:       setup backchannel transport done\n");
@@ -220,11 +220,13 @@ void xprt_destroy_bc(struct rpc_xprt *xprt, unsigned int max_reqs)
 		goto out;
 
 	spin_lock_bh(&xprt->bc_pa_lock);
-	xprt_dec_alloc_count(xprt, max_reqs);
+	xprt->bc_alloc_max -= max_reqs;
 	list_for_each_entry_safe(req, tmp, &xprt->bc_pa_list, rq_bc_pa_list) {
 		dprintk("RPC:        req=%p\n", req);
 		list_del(&req->rq_bc_pa_list);
 		xprt_free_allocation(req);
+		xprt->bc_alloc_count--;
+		atomic_dec(&xprt->bc_slot_count);
 		if (--max_reqs == 0)
 			break;
 	}
@@ -241,13 +243,14 @@ static struct rpc_rqst *xprt_get_bc_request(struct rpc_xprt *xprt, __be32 xid,
 	struct rpc_rqst *req = NULL;
 
 	dprintk("RPC:       allocate a backchannel request\n");
-	if (atomic_read(&xprt->bc_free_slots) <= 0)
-		goto not_found;
 	if (list_empty(&xprt->bc_pa_list)) {
 		if (!new)
 			goto not_found;
+		if (atomic_read(&xprt->bc_slot_count) >= BC_MAX_SLOTS)
+			goto not_found;
 		list_add_tail(&new->rq_bc_pa_list, &xprt->bc_pa_list);
 		xprt->bc_alloc_count++;
+		atomic_inc(&xprt->bc_slot_count);
 	}
 	req = list_first_entry(&xprt->bc_pa_list, struct rpc_rqst,
 				rq_bc_pa_list);
@@ -291,6 +294,7 @@ void xprt_free_bc_rqst(struct rpc_rqst *req)
 	if (xprt_need_to_requeue(xprt)) {
 		list_add_tail(&req->rq_bc_pa_list, &xprt->bc_pa_list);
 		xprt->bc_alloc_count++;
+		atomic_inc(&xprt->bc_slot_count);
 		req = NULL;
 	}
 	spin_unlock_bh(&xprt->bc_pa_lock);
@@ -357,7 +361,7 @@ void xprt_complete_bc_request(struct rpc_rqst *req, uint32_t copied)
 
 	spin_lock(&xprt->bc_pa_lock);
 	list_del(&req->rq_bc_pa_list);
-	xprt_dec_alloc_count(xprt, 1);
+	xprt->bc_alloc_count--;
 	spin_unlock(&xprt->bc_pa_lock);
 
 	req->rq_private_buf.len = copied;
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 383555d2b522..79c849391cb9 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -1526,6 +1526,19 @@ size_t rpc_max_bc_payload(struct rpc_clnt *clnt)
 }
 EXPORT_SYMBOL_GPL(rpc_max_bc_payload);
 
+unsigned int rpc_num_bc_slots(struct rpc_clnt *clnt)
+{
+	struct rpc_xprt *xprt;
+	unsigned int ret;
+
+	rcu_read_lock();
+	xprt = rcu_dereference(clnt->cl_xprt);
+	ret = xprt->ops->bc_num_slots(xprt);
+	rcu_read_unlock();
+	return ret;
+}
+EXPORT_SYMBOL_GPL(rpc_num_bc_slots);
+
 /**
  * rpc_force_rebind - force transport to check that remote port is unchanged
  * @clnt: client to rebind
diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
index e15cb704453e..220b79988000 100644
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -1595,7 +1595,7 @@ bc_svc_process(struct svc_serv *serv, struct rpc_rqst *req,
 	/* Parse and execute the bc call */
 	proc_error = svc_process_common(rqstp, argv, resv);
 
-	atomic_inc(&req->rq_xprt->bc_free_slots);
+	atomic_dec(&req->rq_xprt->bc_slot_count);
 	if (!proc_error) {
 		/* Processing error: drop the request */
 		xprt_free_bc_request(req);
diff --git a/net/sunrpc/xprtrdma/backchannel.c b/net/sunrpc/xprtrdma/backchannel.c
index ce986591f213..59e624b1d7a0 100644
--- a/net/sunrpc/xprtrdma/backchannel.c
+++ b/net/sunrpc/xprtrdma/backchannel.c
@@ -52,6 +52,13 @@ size_t xprt_rdma_bc_maxpayload(struct rpc_xprt *xprt)
 	return maxmsg - RPCRDMA_HDRLEN_MIN;
 }
 
+unsigned int xprt_rdma_bc_max_slots(struct rpc_xprt *xprt)
+{
+	struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
+
+	return r_xprt->rx_buf.rb_bc_srv_max_requests;
+}
+
 static int rpcrdma_bc_marshal_reply(struct rpc_rqst *rqst)
 {
 	struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(rqst->rq_xprt);
diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
index 4993aa49ecbe..52abddac19e5 100644
--- a/net/sunrpc/xprtrdma/transport.c
+++ b/net/sunrpc/xprtrdma/transport.c
@@ -812,6 +812,7 @@ static const struct rpc_xprt_ops xprt_rdma_procs = {
 #if defined(CONFIG_SUNRPC_BACKCHANNEL)
 	.bc_setup		= xprt_rdma_bc_setup,
 	.bc_maxpayload		= xprt_rdma_bc_maxpayload,
+	.bc_num_slots		= xprt_rdma_bc_max_slots,
 	.bc_free_rqst		= xprt_rdma_bc_free_rqst,
 	.bc_destroy		= xprt_rdma_bc_destroy,
 #endif
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index 8378f45d2da7..92ce09fcea74 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -605,6 +605,7 @@ void xprt_rdma_cleanup(void);
 #if defined(CONFIG_SUNRPC_BACKCHANNEL)
 int xprt_rdma_bc_setup(struct rpc_xprt *, unsigned int);
 size_t xprt_rdma_bc_maxpayload(struct rpc_xprt *);
+unsigned int xprt_rdma_bc_max_slots(struct rpc_xprt *);
 int rpcrdma_bc_post_recv(struct rpcrdma_xprt *, unsigned int);
 void rpcrdma_bc_receive_call(struct rpcrdma_xprt *, struct rpcrdma_rep *);
 int xprt_rdma_bc_send_reply(struct rpc_rqst *rqst);
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index 3c2cc96afcaa..6b1fca51028a 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -2788,6 +2788,7 @@ static const struct rpc_xprt_ops xs_tcp_ops = {
 #ifdef CONFIG_SUNRPC_BACKCHANNEL
 	.bc_setup		= xprt_setup_bc,
 	.bc_maxpayload		= xs_tcp_bc_maxpayload,
+	.bc_num_slots		= xprt_bc_max_slots,
 	.bc_free_rqst		= xprt_free_bc_rqst,
 	.bc_destroy		= xprt_destroy_bc,
 #endif
-- 
cgit v1.2.3


From a6d81d30d3cd87f85bfd922358eb18b8146c4925 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@toxicpanda.com>
Date: Tue, 16 Jul 2019 16:19:25 -0400
Subject: wait: add wq_has_single_sleeper helper

rq-qos sits in the io path so we want to take locks as sparingly as
possible.  To accomplish this we try not to take the waitqueue head lock
unless we are sure we need to go to sleep, and we have an optimization
to make sure that we don't starve out existing waiters.  Since we check
if there are existing waiters locklessly we need to be able to update
our view of the waitqueue list after we've added ourselves to the
waitqueue.  Accomplish this by adding this helper to see if there is
more than just ourselves on the list.

Reviewed-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/wait.h | 13 +++++++++++++
 1 file changed, 13 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/wait.h b/include/linux/wait.h
index b6f77cf60dd7..30c515520fb2 100644
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -126,6 +126,19 @@ static inline int waitqueue_active(struct wait_queue_head *wq_head)
 	return !list_empty(&wq_head->head);
 }
 
+/**
+ * wq_has_single_sleeper - check if there is only one sleeper
+ * @wq_head: wait queue head
+ *
+ * Returns true if wq_head has only one sleeper on the list.
+ *
+ * Please refer to the comment for waitqueue_active.
+ */
+static inline bool wq_has_single_sleeper(struct wait_queue_head *wq_head)
+{
+	return list_is_singular(&wq_head->head);
+}
+
 /**
  * wq_has_sleeper - check if there are any waiting processes
  * @wq_head: wait queue head
-- 
cgit v1.2.3


From 3193c0836f203a91bef96d88c64cccf0be090d9c Mon Sep 17 00:00:00 2001
From: Josh Poimboeuf <jpoimboe@redhat.com>
Date: Wed, 17 Jul 2019 20:36:45 -0500
Subject: bpf: Disable GCC -fgcse optimization for ___bpf_prog_run()

On x86-64, with CONFIG_RETPOLINE=n, GCC's "global common subexpression
elimination" optimization results in ___bpf_prog_run()'s jumptable code
changing from this:

	select_insn:
		jmp *jumptable(, %rax, 8)
		...
	ALU64_ADD_X:
		...
		jmp *jumptable(, %rax, 8)
	ALU_ADD_X:
		...
		jmp *jumptable(, %rax, 8)

to this:

	select_insn:
		mov jumptable, %r12
		jmp *(%r12, %rax, 8)
		...
	ALU64_ADD_X:
		...
		jmp *(%r12, %rax, 8)
	ALU_ADD_X:
		...
		jmp *(%r12, %rax, 8)

The jumptable address is placed in a register once, at the beginning of
the function.  The function execution can then go through multiple
indirect jumps which rely on that same register value.  This has a few
issues:

1) Objtool isn't smart enough to be able to track such a register value
   across multiple recursive indirect jumps through the jump table.

2) With CONFIG_RETPOLINE enabled, this optimization actually results in
   a small slowdown.  I measured a ~4.7% slowdown in the test_bpf
   "tcpdump port 22" selftest.

   This slowdown is actually predicted by the GCC manual:

     Note: When compiling a program using computed gotos, a GCC
     extension, you may get better run-time performance if you
     disable the global common subexpression elimination pass by
     adding -fno-gcse to the command line.

So just disable the optimization for this function.

Fixes: e55a73251da3 ("bpf: Fix ORC unwinding in non-JIT BPF code")
Reported-by: Randy Dunlap <rdunlap@infradead.org>
Signed-off-by: Josh Poimboeuf <jpoimboe@redhat.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/30c3ca29ba037afcbd860a8672eef0021addf9fe.1563413318.git.jpoimboe@redhat.com
---
 include/linux/compiler-gcc.h   | 2 ++
 include/linux/compiler_types.h | 4 ++++
 kernel/bpf/core.c              | 2 +-
 3 files changed, 7 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h
index e8579412ad21..d7ee4c6bad48 100644
--- a/include/linux/compiler-gcc.h
+++ b/include/linux/compiler-gcc.h
@@ -170,3 +170,5 @@
 #else
 #define __diag_GCC_8(s)
 #endif
+
+#define __no_fgcse __attribute__((optimize("-fno-gcse")))
diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h
index 095d55c3834d..599c27b56c29 100644
--- a/include/linux/compiler_types.h
+++ b/include/linux/compiler_types.h
@@ -189,6 +189,10 @@ struct ftrace_likely_data {
 #define asm_volatile_goto(x...) asm goto(x)
 #endif
 
+#ifndef __no_fgcse
+# define __no_fgcse
+#endif
+
 /* Are two types/vars the same type (ignoring qualifiers)? */
 #define __same_type(a, b) __builtin_types_compatible_p(typeof(a), typeof(b))
 
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 7e98f36a14e2..8191a7db2777 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -1295,7 +1295,7 @@ bool bpf_opcode_in_insntable(u8 code)
  *
  * Decode and execute eBPF instructions.
  */
-static u64 ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn, u64 *stack)
+static u64 __no_fgcse ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn, u64 *stack)
 {
 #define BPF_INSN_2_LBL(x, y)    [BPF_##x | BPF_##y] = &&x##_##y
 #define BPF_INSN_3_LBL(x, y, z) [BPF_##x | BPF_##y | BPF_##z] = &&x##_##y##_##z
-- 
cgit v1.2.3


From d5b9216fd5114be4ed98ca9c1ecc5f164cd8cf5e Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Thu, 18 Jul 2019 09:32:17 -0400
Subject: pnfs/flexfiles: Add tracepoints for detecting pnfs fallback to MDS

Add tracepoints to allow debugging of the event chain leading to
a pnfs fallback to doing I/O through the MDS.

Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 fs/nfs/flexfilelayout/flexfilelayout.c | 26 ++++++++++++
 fs/nfs/nfs4trace.c                     |  8 ++++
 fs/nfs/nfs4trace.h                     | 76 +++++++++++++++++++++++++++++++++-
 fs/nfs/pnfs.c                          |  2 +
 include/linux/nfs4.h                   |  1 +
 5 files changed, 112 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c
index bcff3bf5ae09..b04e20d28162 100644
--- a/fs/nfs/flexfilelayout/flexfilelayout.c
+++ b/fs/nfs/flexfilelayout/flexfilelayout.c
@@ -934,6 +934,10 @@ out_nolseg:
 	if (pgio->pg_error < 0)
 		return;
 out_mds:
+	trace_pnfs_mds_fallback_pg_init_read(pgio->pg_inode,
+			0, NFS4_MAX_UINT64, IOMODE_READ,
+			NFS_I(pgio->pg_inode)->layout,
+			pgio->pg_lseg);
 	pnfs_put_lseg(pgio->pg_lseg);
 	pgio->pg_lseg = NULL;
 	nfs_pageio_reset_read_mds(pgio);
@@ -1000,6 +1004,10 @@ retry:
 	return;
 
 out_mds:
+	trace_pnfs_mds_fallback_pg_init_write(pgio->pg_inode,
+			0, NFS4_MAX_UINT64, IOMODE_RW,
+			NFS_I(pgio->pg_inode)->layout,
+			pgio->pg_lseg);
 	pnfs_put_lseg(pgio->pg_lseg);
 	pgio->pg_lseg = NULL;
 	nfs_pageio_reset_write_mds(pgio);
@@ -1026,6 +1034,10 @@ ff_layout_pg_get_mirror_count_write(struct nfs_pageio_descriptor *pgio,
 	if (pgio->pg_lseg)
 		return FF_LAYOUT_MIRROR_COUNT(pgio->pg_lseg);
 
+	trace_pnfs_mds_fallback_pg_get_mirror_count(pgio->pg_inode,
+			0, NFS4_MAX_UINT64, IOMODE_RW,
+			NFS_I(pgio->pg_inode)->layout,
+			pgio->pg_lseg);
 	/* no lseg means that pnfs is not in use, so no mirroring here */
 	nfs_pageio_reset_write_mds(pgio);
 out:
@@ -1075,6 +1087,10 @@ static void ff_layout_reset_write(struct nfs_pgio_header *hdr, bool retry_pnfs)
 			hdr->args.count,
 			(unsigned long long)hdr->args.offset);
 
+		trace_pnfs_mds_fallback_write_done(hdr->inode,
+				hdr->args.offset, hdr->args.count,
+				IOMODE_RW, NFS_I(hdr->inode)->layout,
+				hdr->lseg);
 		task->tk_status = pnfs_write_done_resend_to_mds(hdr);
 	}
 }
@@ -1094,6 +1110,10 @@ static void ff_layout_reset_read(struct nfs_pgio_header *hdr)
 			hdr->args.count,
 			(unsigned long long)hdr->args.offset);
 
+		trace_pnfs_mds_fallback_read_done(hdr->inode,
+				hdr->args.offset, hdr->args.count,
+				IOMODE_READ, NFS_I(hdr->inode)->layout,
+				hdr->lseg);
 		task->tk_status = pnfs_read_done_resend_to_mds(hdr);
 	}
 }
@@ -1827,6 +1847,9 @@ ff_layout_read_pagelist(struct nfs_pgio_header *hdr)
 out_failed:
 	if (ff_layout_avoid_mds_available_ds(lseg))
 		return PNFS_TRY_AGAIN;
+	trace_pnfs_mds_fallback_read_pagelist(hdr->inode,
+			hdr->args.offset, hdr->args.count,
+			IOMODE_READ, NFS_I(hdr->inode)->layout, lseg);
 	return PNFS_NOT_ATTEMPTED;
 }
 
@@ -1892,6 +1915,9 @@ ff_layout_write_pagelist(struct nfs_pgio_header *hdr, int sync)
 out_failed:
 	if (ff_layout_avoid_mds_available_ds(lseg))
 		return PNFS_TRY_AGAIN;
+	trace_pnfs_mds_fallback_write_pagelist(hdr->inode,
+			hdr->args.offset, hdr->args.count,
+			IOMODE_RW, NFS_I(hdr->inode)->layout, lseg);
 	return PNFS_NOT_ATTEMPTED;
 }
 
diff --git a/fs/nfs/nfs4trace.c b/fs/nfs/nfs4trace.c
index e9fb3e50a999..1a8f376b3f73 100644
--- a/fs/nfs/nfs4trace.c
+++ b/fs/nfs/nfs4trace.c
@@ -16,4 +16,12 @@
 EXPORT_TRACEPOINT_SYMBOL_GPL(nfs4_pnfs_read);
 EXPORT_TRACEPOINT_SYMBOL_GPL(nfs4_pnfs_write);
 EXPORT_TRACEPOINT_SYMBOL_GPL(nfs4_pnfs_commit_ds);
+
+EXPORT_TRACEPOINT_SYMBOL_GPL(pnfs_mds_fallback_pg_init_read);
+EXPORT_TRACEPOINT_SYMBOL_GPL(pnfs_mds_fallback_pg_init_write);
+EXPORT_TRACEPOINT_SYMBOL_GPL(pnfs_mds_fallback_pg_get_mirror_count);
+EXPORT_TRACEPOINT_SYMBOL_GPL(pnfs_mds_fallback_read_done);
+EXPORT_TRACEPOINT_SYMBOL_GPL(pnfs_mds_fallback_write_done);
+EXPORT_TRACEPOINT_SYMBOL_GPL(pnfs_mds_fallback_read_pagelist);
+EXPORT_TRACEPOINT_SYMBOL_GPL(pnfs_mds_fallback_write_pagelist);
 #endif
diff --git a/fs/nfs/nfs4trace.h b/fs/nfs/nfs4trace.h
index d85f20945a2b..b2f395fa7350 100644
--- a/fs/nfs/nfs4trace.h
+++ b/fs/nfs/nfs4trace.h
@@ -1771,6 +1771,7 @@ TRACE_DEFINE_ENUM(PNFS_UPDATE_LAYOUT_BLOCKED);
 TRACE_DEFINE_ENUM(PNFS_UPDATE_LAYOUT_INVALID_OPEN);
 TRACE_DEFINE_ENUM(PNFS_UPDATE_LAYOUT_RETRY);
 TRACE_DEFINE_ENUM(PNFS_UPDATE_LAYOUT_SEND_LAYOUTGET);
+TRACE_DEFINE_ENUM(PNFS_UPDATE_LAYOUT_EXIT);
 
 #define show_pnfs_update_layout_reason(reason)				\
 	__print_symbolic(reason,					\
@@ -1786,7 +1787,8 @@ TRACE_DEFINE_ENUM(PNFS_UPDATE_LAYOUT_SEND_LAYOUTGET);
 		{ PNFS_UPDATE_LAYOUT_BLOCKED, "layouts blocked" },	\
 		{ PNFS_UPDATE_LAYOUT_INVALID_OPEN, "invalid open" },	\
 		{ PNFS_UPDATE_LAYOUT_RETRY, "retrying" },	\
-		{ PNFS_UPDATE_LAYOUT_SEND_LAYOUTGET, "sent layoutget" })
+		{ PNFS_UPDATE_LAYOUT_SEND_LAYOUTGET, "sent layoutget" }, \
+		{ PNFS_UPDATE_LAYOUT_EXIT, "exit" })
 
 TRACE_EVENT(pnfs_update_layout,
 		TP_PROTO(struct inode *inode,
@@ -1845,6 +1847,78 @@ TRACE_EVENT(pnfs_update_layout,
 		)
 );
 
+DECLARE_EVENT_CLASS(pnfs_layout_event,
+		TP_PROTO(struct inode *inode,
+			loff_t pos,
+			u64 count,
+			enum pnfs_iomode iomode,
+			struct pnfs_layout_hdr *lo,
+			struct pnfs_layout_segment *lseg
+		),
+		TP_ARGS(inode, pos, count, iomode, lo, lseg),
+		TP_STRUCT__entry(
+			__field(dev_t, dev)
+			__field(u64, fileid)
+			__field(u32, fhandle)
+			__field(loff_t, pos)
+			__field(u64, count)
+			__field(enum pnfs_iomode, iomode)
+			__field(int, layoutstateid_seq)
+			__field(u32, layoutstateid_hash)
+			__field(long, lseg)
+		),
+		TP_fast_assign(
+			__entry->dev = inode->i_sb->s_dev;
+			__entry->fileid = NFS_FILEID(inode);
+			__entry->fhandle = nfs_fhandle_hash(NFS_FH(inode));
+			__entry->pos = pos;
+			__entry->count = count;
+			__entry->iomode = iomode;
+			if (lo != NULL) {
+				__entry->layoutstateid_seq =
+				be32_to_cpu(lo->plh_stateid.seqid);
+				__entry->layoutstateid_hash =
+				nfs_stateid_hash(&lo->plh_stateid);
+			} else {
+				__entry->layoutstateid_seq = 0;
+				__entry->layoutstateid_hash = 0;
+			}
+			__entry->lseg = (long)lseg;
+		),
+		TP_printk(
+			"fileid=%02x:%02x:%llu fhandle=0x%08x "
+			"iomode=%s pos=%llu count=%llu "
+			"layoutstateid=%d:0x%08x lseg=0x%lx",
+			MAJOR(__entry->dev), MINOR(__entry->dev),
+			(unsigned long long)__entry->fileid,
+			__entry->fhandle,
+			show_pnfs_iomode(__entry->iomode),
+			(unsigned long long)__entry->pos,
+			(unsigned long long)__entry->count,
+			__entry->layoutstateid_seq, __entry->layoutstateid_hash,
+			__entry->lseg
+		)
+);
+
+#define DEFINE_PNFS_LAYOUT_EVENT(name) \
+	DEFINE_EVENT(pnfs_layout_event, name, \
+		TP_PROTO(struct inode *inode, \
+			loff_t pos, \
+			u64 count, \
+			enum pnfs_iomode iomode, \
+			struct pnfs_layout_hdr *lo, \
+			struct pnfs_layout_segment *lseg \
+		), \
+		TP_ARGS(inode, pos, count, iomode, lo, lseg))
+
+DEFINE_PNFS_LAYOUT_EVENT(pnfs_mds_fallback_pg_init_read);
+DEFINE_PNFS_LAYOUT_EVENT(pnfs_mds_fallback_pg_init_write);
+DEFINE_PNFS_LAYOUT_EVENT(pnfs_mds_fallback_pg_get_mirror_count);
+DEFINE_PNFS_LAYOUT_EVENT(pnfs_mds_fallback_read_done);
+DEFINE_PNFS_LAYOUT_EVENT(pnfs_mds_fallback_write_done);
+DEFINE_PNFS_LAYOUT_EVENT(pnfs_mds_fallback_read_pagelist);
+DEFINE_PNFS_LAYOUT_EVENT(pnfs_mds_fallback_write_pagelist);
+
 #endif /* CONFIG_NFS_V4_1 */
 
 #endif /* _TRACE_NFS4_H */
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 758917463700..75bd5b552ba4 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -2037,6 +2037,8 @@ lookup_again:
 out_put_layout_hdr:
 	if (first)
 		pnfs_clear_first_layoutget(lo);
+	trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
+				 PNFS_UPDATE_LAYOUT_EXIT);
 	pnfs_put_layout_hdr(lo);
 out:
 	dprintk("%s: inode %s/%llu pNFS layout segment %s for "
diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h
index 22494d170619..fd59904a282c 100644
--- a/include/linux/nfs4.h
+++ b/include/linux/nfs4.h
@@ -660,6 +660,7 @@ enum pnfs_update_layout_reason {
 	PNFS_UPDATE_LAYOUT_BLOCKED,
 	PNFS_UPDATE_LAYOUT_INVALID_OPEN,
 	PNFS_UPDATE_LAYOUT_SEND_LAYOUTGET,
+	PNFS_UPDATE_LAYOUT_EXIT,
 };
 
 #define NFS4_OP_MAP_NUM_LONGS					\
-- 
cgit v1.2.3


From 00289cd87676e14913d2d8492d1ce05c4baafdae Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Wed, 17 Jul 2019 18:07:53 -0700
Subject: drivers/base: Introduce kill_device()

The libnvdimm subsystem arranges for devices to be destroyed as a result
of a sysfs operation. Since device_unregister() cannot be called from
an actively running sysfs attribute of the same device, libnvdimm
arranges for device_unregister() to be performed in an out-of-line async
context.

The driver core maintains a 'dead' state for coordinating its own racing
async registration / de-registration requests. Rather than add local
'dead' state tracking infrastructure to libnvdimm device objects, export
the existing state tracking via a new kill_device() helper.

The kill_device() helper simply marks the device as dead, i.e. that it
is on its way to device_del(), or returns that the device was already
dead. This can be used in advance of calling device_unregister() for
subsystems like libnvdimm that might need to handle multiple user
threads racing to delete a device.

This refactoring does not change any behavior, but it is a pre-requisite
for follow-on fixes and therefore marked for -stable.

Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: "Rafael J. Wysocki" <rafael@kernel.org>
Fixes: 4d88a97aa9e8 ("libnvdimm, nvdimm: dimm driver and base libnvdimm device-driver...")
Cc: <stable@vger.kernel.org>
Tested-by: Jane Chu <jane.chu@oracle.com>
Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Link: https://lore.kernel.org/r/156341207332.292348.14959761496009347574.stgit@dwillia2-desk3.amr.corp.intel.com
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/base/core.c    | 27 +++++++++++++++++++--------
 include/linux/device.h |  1 +
 2 files changed, 20 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/base/core.c b/drivers/base/core.c
index fd7511e04e62..eaf3aa0cb803 100644
--- a/drivers/base/core.c
+++ b/drivers/base/core.c
@@ -2211,6 +2211,24 @@ void put_device(struct device *dev)
 }
 EXPORT_SYMBOL_GPL(put_device);
 
+bool kill_device(struct device *dev)
+{
+	/*
+	 * Require the device lock and set the "dead" flag to guarantee that
+	 * the update behavior is consistent with the other bitfields near
+	 * it and that we cannot have an asynchronous probe routine trying
+	 * to run while we are tearing out the bus/class/sysfs from
+	 * underneath the device.
+	 */
+	lockdep_assert_held(&dev->mutex);
+
+	if (dev->p->dead)
+		return false;
+	dev->p->dead = true;
+	return true;
+}
+EXPORT_SYMBOL_GPL(kill_device);
+
 /**
  * device_del - delete device from system.
  * @dev: device.
@@ -2230,15 +2248,8 @@ void device_del(struct device *dev)
 	struct kobject *glue_dir = NULL;
 	struct class_interface *class_intf;
 
-	/*
-	 * Hold the device lock and set the "dead" flag to guarantee that
-	 * the update behavior is consistent with the other bitfields near
-	 * it and that we cannot have an asynchronous probe routine trying
-	 * to run while we are tearing out the bus/class/sysfs from
-	 * underneath the device.
-	 */
 	device_lock(dev);
-	dev->p->dead = true;
+	kill_device(dev);
 	device_unlock(dev);
 
 	/* Notify clients of device removal.  This call must come
diff --git a/include/linux/device.h b/include/linux/device.h
index e85264fb6616..0da5c67f6be1 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -1373,6 +1373,7 @@ extern int (*platform_notify_remove)(struct device *dev);
  */
 extern struct device *get_device(struct device *dev);
 extern void put_device(struct device *dev);
+extern bool kill_device(struct device *dev);
 
 #ifdef CONFIG_DEVTMPFS
 extern int devtmpfs_create_node(struct device *dev);
-- 
cgit v1.2.3


From 87a30e1f05d73a34e6d1895065541369131aaf1c Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Wed, 17 Jul 2019 18:08:26 -0700
Subject: driver-core, libnvdimm: Let device subsystems add local lockdep
 coverage

For good reason, the standard device_lock() is marked
lockdep_set_novalidate_class() because there is simply no sane way to
describe the myriad ways the device_lock() is ordered with other locks.
However, that leaves subsystems that know their own local device_lock()
ordering rules to find lock ordering mistakes manually. Instead,
introduce an optional / additional lockdep-enabled lock that a subsystem
can acquire in all the same paths that the device_lock() is acquired.

A conversion of the NFIT driver and NVDIMM subsystem to a
lockdep-validated device_lock() scheme is included. The
debug_nvdimm_lock() implementation implements the correct lock-class and
stacking order for the libnvdimm device topology hierarchy.

Yes, this is a hack, but hopefully it is a useful hack for other
subsystems' device_lock() debug sessions. Quoting Greg:

    "Yeah, it feels a bit hacky but it's really up to a subsystem to mess up
     using it as much as anything else, so user beware :)

     I don't object to it if it makes things easier for you to debug."

Cc: Ingo Molnar <mingo@redhat.com>
Cc: Ira Weiny <ira.weiny@intel.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Dave Jiang <dave.jiang@intel.com>
Cc: Keith Busch <keith.busch@intel.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Vishal Verma <vishal.l.verma@intel.com>
Cc: "Rafael J. Wysocki" <rjw@rjwysocki.net>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Acked-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Reviewed-by: Ira Weiny <ira.weiny@intel.com>
Link: https://lore.kernel.org/r/156341210661.292348.7014034644265455704.stgit@dwillia2-desk3.amr.corp.intel.com
---
 drivers/acpi/nfit/core.c        | 28 ++++++++---------
 drivers/acpi/nfit/nfit.h        | 24 +++++++++++++++
 drivers/base/core.c             |  3 ++
 drivers/nvdimm/btt_devs.c       | 16 +++++-----
 drivers/nvdimm/bus.c            | 28 ++++++++++-------
 drivers/nvdimm/core.c           | 10 +++---
 drivers/nvdimm/dimm_devs.c      |  4 +--
 drivers/nvdimm/namespace_devs.c | 36 +++++++++++-----------
 drivers/nvdimm/nd-core.h        | 68 +++++++++++++++++++++++++++++++++++++++++
 drivers/nvdimm/pfn_devs.c       | 24 +++++++--------
 drivers/nvdimm/pmem.c           |  4 +--
 drivers/nvdimm/region.c         |  2 +-
 drivers/nvdimm/region_devs.c    | 16 +++++-----
 include/linux/device.h          |  5 +++
 14 files changed, 187 insertions(+), 81 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/acpi/nfit/core.c b/drivers/acpi/nfit/core.c
index 23022cf20d26..f22139458ce1 100644
--- a/drivers/acpi/nfit/core.c
+++ b/drivers/acpi/nfit/core.c
@@ -1282,7 +1282,7 @@ static ssize_t hw_error_scrub_store(struct device *dev,
 	if (rc)
 		return rc;
 
-	device_lock(dev);
+	nfit_device_lock(dev);
 	nd_desc = dev_get_drvdata(dev);
 	if (nd_desc) {
 		struct acpi_nfit_desc *acpi_desc = to_acpi_desc(nd_desc);
@@ -1299,7 +1299,7 @@ static ssize_t hw_error_scrub_store(struct device *dev,
 			break;
 		}
 	}
-	device_unlock(dev);
+	nfit_device_unlock(dev);
 	if (rc)
 		return rc;
 	return size;
@@ -1319,7 +1319,7 @@ static ssize_t scrub_show(struct device *dev,
 	ssize_t rc = -ENXIO;
 	bool busy;
 
-	device_lock(dev);
+	nfit_device_lock(dev);
 	nd_desc = dev_get_drvdata(dev);
 	if (!nd_desc) {
 		device_unlock(dev);
@@ -1339,7 +1339,7 @@ static ssize_t scrub_show(struct device *dev,
 	}
 
 	mutex_unlock(&acpi_desc->init_mutex);
-	device_unlock(dev);
+	nfit_device_unlock(dev);
 	return rc;
 }
 
@@ -1356,14 +1356,14 @@ static ssize_t scrub_store(struct device *dev,
 	if (val != 1)
 		return -EINVAL;
 
-	device_lock(dev);
+	nfit_device_lock(dev);
 	nd_desc = dev_get_drvdata(dev);
 	if (nd_desc) {
 		struct acpi_nfit_desc *acpi_desc = to_acpi_desc(nd_desc);
 
 		rc = acpi_nfit_ars_rescan(acpi_desc, ARS_REQ_LONG);
 	}
-	device_unlock(dev);
+	nfit_device_unlock(dev);
 	if (rc)
 		return rc;
 	return size;
@@ -1749,9 +1749,9 @@ static void acpi_nvdimm_notify(acpi_handle handle, u32 event, void *data)
 	struct acpi_device *adev = data;
 	struct device *dev = &adev->dev;
 
-	device_lock(dev->parent);
+	nfit_device_lock(dev->parent);
 	__acpi_nvdimm_notify(dev, event);
-	device_unlock(dev->parent);
+	nfit_device_unlock(dev->parent);
 }
 
 static bool acpi_nvdimm_has_method(struct acpi_device *adev, char *method)
@@ -3457,8 +3457,8 @@ static int acpi_nfit_flush_probe(struct nvdimm_bus_descriptor *nd_desc)
 	struct device *dev = acpi_desc->dev;
 
 	/* Bounce the device lock to flush acpi_nfit_add / acpi_nfit_notify */
-	device_lock(dev);
-	device_unlock(dev);
+	nfit_device_lock(dev);
+	nfit_device_unlock(dev);
 
 	/* Bounce the init_mutex to complete initial registration */
 	mutex_lock(&acpi_desc->init_mutex);
@@ -3602,8 +3602,8 @@ void acpi_nfit_shutdown(void *data)
 	 * acpi_nfit_ars_rescan() submissions have had a chance to
 	 * either submit or see ->cancel set.
 	 */
-	device_lock(bus_dev);
-	device_unlock(bus_dev);
+	nfit_device_lock(bus_dev);
+	nfit_device_unlock(bus_dev);
 
 	flush_workqueue(nfit_wq);
 }
@@ -3746,9 +3746,9 @@ EXPORT_SYMBOL_GPL(__acpi_nfit_notify);
 
 static void acpi_nfit_notify(struct acpi_device *adev, u32 event)
 {
-	device_lock(&adev->dev);
+	nfit_device_lock(&adev->dev);
 	__acpi_nfit_notify(&adev->dev, adev->handle, event);
-	device_unlock(&adev->dev);
+	nfit_device_unlock(&adev->dev);
 }
 
 static const struct acpi_device_id acpi_nfit_ids[] = {
diff --git a/drivers/acpi/nfit/nfit.h b/drivers/acpi/nfit/nfit.h
index 6ee2b02af73e..24241941181c 100644
--- a/drivers/acpi/nfit/nfit.h
+++ b/drivers/acpi/nfit/nfit.h
@@ -312,6 +312,30 @@ static inline struct acpi_nfit_desc *to_acpi_desc(
 	return container_of(nd_desc, struct acpi_nfit_desc, nd_desc);
 }
 
+#ifdef CONFIG_PROVE_LOCKING
+static inline void nfit_device_lock(struct device *dev)
+{
+	device_lock(dev);
+	mutex_lock(&dev->lockdep_mutex);
+}
+
+static inline void nfit_device_unlock(struct device *dev)
+{
+	mutex_unlock(&dev->lockdep_mutex);
+	device_unlock(dev);
+}
+#else
+static inline void nfit_device_lock(struct device *dev)
+{
+	device_lock(dev);
+}
+
+static inline void nfit_device_unlock(struct device *dev)
+{
+	device_unlock(dev);
+}
+#endif
+
 const guid_t *to_nfit_uuid(enum nfit_uuids id);
 int acpi_nfit_init(struct acpi_nfit_desc *acpi_desc, void *nfit, acpi_size sz);
 void acpi_nfit_shutdown(void *data);
diff --git a/drivers/base/core.c b/drivers/base/core.c
index eaf3aa0cb803..4825949d6547 100644
--- a/drivers/base/core.c
+++ b/drivers/base/core.c
@@ -1663,6 +1663,9 @@ void device_initialize(struct device *dev)
 	kobject_init(&dev->kobj, &device_ktype);
 	INIT_LIST_HEAD(&dev->dma_pools);
 	mutex_init(&dev->mutex);
+#ifdef CONFIG_PROVE_LOCKING
+	mutex_init(&dev->lockdep_mutex);
+#endif
 	lockdep_set_novalidate_class(&dev->mutex);
 	spin_lock_init(&dev->devres_lock);
 	INIT_LIST_HEAD(&dev->devres_head);
diff --git a/drivers/nvdimm/btt_devs.c b/drivers/nvdimm/btt_devs.c
index 62d00fffa4af..3508a79110c7 100644
--- a/drivers/nvdimm/btt_devs.c
+++ b/drivers/nvdimm/btt_devs.c
@@ -62,14 +62,14 @@ static ssize_t sector_size_store(struct device *dev,
 	struct nd_btt *nd_btt = to_nd_btt(dev);
 	ssize_t rc;
 
-	device_lock(dev);
+	nd_device_lock(dev);
 	nvdimm_bus_lock(dev);
 	rc = nd_size_select_store(dev, buf, &nd_btt->lbasize,
 			btt_lbasize_supported);
 	dev_dbg(dev, "result: %zd wrote: %s%s", rc, buf,
 			buf[len - 1] == '\n' ? "" : "\n");
 	nvdimm_bus_unlock(dev);
-	device_unlock(dev);
+	nd_device_unlock(dev);
 
 	return rc ? rc : len;
 }
@@ -91,11 +91,11 @@ static ssize_t uuid_store(struct device *dev,
 	struct nd_btt *nd_btt = to_nd_btt(dev);
 	ssize_t rc;
 
-	device_lock(dev);
+	nd_device_lock(dev);
 	rc = nd_uuid_store(dev, &nd_btt->uuid, buf, len);
 	dev_dbg(dev, "result: %zd wrote: %s%s", rc, buf,
 			buf[len - 1] == '\n' ? "" : "\n");
-	device_unlock(dev);
+	nd_device_unlock(dev);
 
 	return rc ? rc : len;
 }
@@ -120,13 +120,13 @@ static ssize_t namespace_store(struct device *dev,
 	struct nd_btt *nd_btt = to_nd_btt(dev);
 	ssize_t rc;
 
-	device_lock(dev);
+	nd_device_lock(dev);
 	nvdimm_bus_lock(dev);
 	rc = nd_namespace_store(dev, &nd_btt->ndns, buf, len);
 	dev_dbg(dev, "result: %zd wrote: %s%s", rc, buf,
 			buf[len - 1] == '\n' ? "" : "\n");
 	nvdimm_bus_unlock(dev);
-	device_unlock(dev);
+	nd_device_unlock(dev);
 
 	return rc;
 }
@@ -138,14 +138,14 @@ static ssize_t size_show(struct device *dev,
 	struct nd_btt *nd_btt = to_nd_btt(dev);
 	ssize_t rc;
 
-	device_lock(dev);
+	nd_device_lock(dev);
 	if (dev->driver)
 		rc = sprintf(buf, "%llu\n", nd_btt->size);
 	else {
 		/* no size to convey if the btt instance is disabled */
 		rc = -ENXIO;
 	}
-	device_unlock(dev);
+	nd_device_unlock(dev);
 
 	return rc;
 }
diff --git a/drivers/nvdimm/bus.c b/drivers/nvdimm/bus.c
index df41f3571dc9..798c5c4aea9c 100644
--- a/drivers/nvdimm/bus.c
+++ b/drivers/nvdimm/bus.c
@@ -26,7 +26,7 @@
 
 int nvdimm_major;
 static int nvdimm_bus_major;
-static struct class *nd_class;
+struct class *nd_class;
 static DEFINE_IDA(nd_ida);
 
 static int to_nd_device_type(struct device *dev)
@@ -91,7 +91,10 @@ static int nvdimm_bus_probe(struct device *dev)
 			dev->driver->name, dev_name(dev));
 
 	nvdimm_bus_probe_start(nvdimm_bus);
+	debug_nvdimm_lock(dev);
 	rc = nd_drv->probe(dev);
+	debug_nvdimm_unlock(dev);
+
 	if (rc == 0)
 		nd_region_probe_success(nvdimm_bus, dev);
 	else
@@ -113,8 +116,11 @@ static int nvdimm_bus_remove(struct device *dev)
 	struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(dev);
 	int rc = 0;
 
-	if (nd_drv->remove)
+	if (nd_drv->remove) {
+		debug_nvdimm_lock(dev);
 		rc = nd_drv->remove(dev);
+		debug_nvdimm_unlock(dev);
+	}
 	nd_region_disable(nvdimm_bus, dev);
 
 	dev_dbg(&nvdimm_bus->dev, "%s.remove(%s) = %d\n", dev->driver->name,
@@ -140,7 +146,7 @@ static void nvdimm_bus_shutdown(struct device *dev)
 
 void nd_device_notify(struct device *dev, enum nvdimm_event event)
 {
-	device_lock(dev);
+	nd_device_lock(dev);
 	if (dev->driver) {
 		struct nd_device_driver *nd_drv;
 
@@ -148,7 +154,7 @@ void nd_device_notify(struct device *dev, enum nvdimm_event event)
 		if (nd_drv->notify)
 			nd_drv->notify(dev, event);
 	}
-	device_unlock(dev);
+	nd_device_unlock(dev);
 }
 EXPORT_SYMBOL(nd_device_notify);
 
@@ -296,7 +302,7 @@ static void nvdimm_bus_release(struct device *dev)
 	kfree(nvdimm_bus);
 }
 
-static bool is_nvdimm_bus(struct device *dev)
+bool is_nvdimm_bus(struct device *dev)
 {
 	return dev->release == nvdimm_bus_release;
 }
@@ -575,9 +581,9 @@ void nd_device_unregister(struct device *dev, enum nd_async_mode mode)
 		 * or otherwise let the async path handle it if the
 		 * unregistration was already queued.
 		 */
-		device_lock(dev);
+		nd_device_lock(dev);
 		killed = kill_device(dev);
-		device_unlock(dev);
+		nd_device_unlock(dev);
 
 		if (!killed)
 			return;
@@ -888,10 +894,10 @@ void wait_nvdimm_bus_probe_idle(struct device *dev)
 		if (nvdimm_bus->probe_active == 0)
 			break;
 		nvdimm_bus_unlock(dev);
-		device_unlock(dev);
+		nd_device_unlock(dev);
 		wait_event(nvdimm_bus->wait,
 				nvdimm_bus->probe_active == 0);
-		device_lock(dev);
+		nd_device_lock(dev);
 		nvdimm_bus_lock(dev);
 	} while (true);
 }
@@ -1107,7 +1113,7 @@ static int __nd_ioctl(struct nvdimm_bus *nvdimm_bus, struct nvdimm *nvdimm,
 		goto out;
 	}
 
-	device_lock(dev);
+	nd_device_lock(dev);
 	nvdimm_bus_lock(dev);
 	rc = nd_cmd_clear_to_send(nvdimm_bus, nvdimm, func, buf);
 	if (rc)
@@ -1129,7 +1135,7 @@ static int __nd_ioctl(struct nvdimm_bus *nvdimm_bus, struct nvdimm *nvdimm,
 
 out_unlock:
 	nvdimm_bus_unlock(dev);
-	device_unlock(dev);
+	nd_device_unlock(dev);
 out:
 	kfree(in_env);
 	kfree(out_env);
diff --git a/drivers/nvdimm/core.c b/drivers/nvdimm/core.c
index 5e1f060547bf..9204f1e9fd14 100644
--- a/drivers/nvdimm/core.c
+++ b/drivers/nvdimm/core.c
@@ -246,7 +246,7 @@ static int nd_uuid_parse(struct device *dev, u8 *uuid_out, const char *buf,
  *
  * Enforce that uuids can only be changed while the device is disabled
  * (driver detached)
- * LOCKING: expects device_lock() is held on entry
+ * LOCKING: expects nd_device_lock() is held on entry
  */
 int nd_uuid_store(struct device *dev, u8 **uuid_out, const char *buf,
 		size_t len)
@@ -347,15 +347,15 @@ static DEVICE_ATTR_RO(provider);
 
 static int flush_namespaces(struct device *dev, void *data)
 {
-	device_lock(dev);
-	device_unlock(dev);
+	nd_device_lock(dev);
+	nd_device_unlock(dev);
 	return 0;
 }
 
 static int flush_regions_dimms(struct device *dev, void *data)
 {
-	device_lock(dev);
-	device_unlock(dev);
+	nd_device_lock(dev);
+	nd_device_unlock(dev);
 	device_for_each_child(dev, NULL, flush_namespaces);
 	return 0;
 }
diff --git a/drivers/nvdimm/dimm_devs.c b/drivers/nvdimm/dimm_devs.c
index dfecd6e17043..29a065e769ea 100644
--- a/drivers/nvdimm/dimm_devs.c
+++ b/drivers/nvdimm/dimm_devs.c
@@ -484,12 +484,12 @@ static ssize_t security_store(struct device *dev,
 	 * done while probing is idle and the DIMM is not in active use
 	 * in any region.
 	 */
-	device_lock(dev);
+	nd_device_lock(dev);
 	nvdimm_bus_lock(dev);
 	wait_nvdimm_bus_probe_idle(dev);
 	rc = __security_store(dev, buf, len);
 	nvdimm_bus_unlock(dev);
-	device_unlock(dev);
+	nd_device_unlock(dev);
 
 	return rc;
 }
diff --git a/drivers/nvdimm/namespace_devs.c b/drivers/nvdimm/namespace_devs.c
index a434a5964cb9..92cd809d7e43 100644
--- a/drivers/nvdimm/namespace_devs.c
+++ b/drivers/nvdimm/namespace_devs.c
@@ -410,7 +410,7 @@ static ssize_t alt_name_store(struct device *dev,
 	struct nd_region *nd_region = to_nd_region(dev->parent);
 	ssize_t rc;
 
-	device_lock(dev);
+	nd_device_lock(dev);
 	nvdimm_bus_lock(dev);
 	wait_nvdimm_bus_probe_idle(dev);
 	rc = __alt_name_store(dev, buf, len);
@@ -418,7 +418,7 @@ static ssize_t alt_name_store(struct device *dev,
 		rc = nd_namespace_label_update(nd_region, dev);
 	dev_dbg(dev, "%s(%zd)\n", rc < 0 ? "fail " : "", rc);
 	nvdimm_bus_unlock(dev);
-	device_unlock(dev);
+	nd_device_unlock(dev);
 
 	return rc < 0 ? rc : len;
 }
@@ -1077,7 +1077,7 @@ static ssize_t size_store(struct device *dev,
 	if (rc)
 		return rc;
 
-	device_lock(dev);
+	nd_device_lock(dev);
 	nvdimm_bus_lock(dev);
 	wait_nvdimm_bus_probe_idle(dev);
 	rc = __size_store(dev, val);
@@ -1103,7 +1103,7 @@ static ssize_t size_store(struct device *dev,
 	dev_dbg(dev, "%llx %s (%d)\n", val, rc < 0 ? "fail" : "success", rc);
 
 	nvdimm_bus_unlock(dev);
-	device_unlock(dev);
+	nd_device_unlock(dev);
 
 	return rc < 0 ? rc : len;
 }
@@ -1286,7 +1286,7 @@ static ssize_t uuid_store(struct device *dev,
 	} else
 		return -ENXIO;
 
-	device_lock(dev);
+	nd_device_lock(dev);
 	nvdimm_bus_lock(dev);
 	wait_nvdimm_bus_probe_idle(dev);
 	if (to_ndns(dev)->claim)
@@ -1302,7 +1302,7 @@ static ssize_t uuid_store(struct device *dev,
 	dev_dbg(dev, "result: %zd wrote: %s%s", rc, buf,
 			buf[len - 1] == '\n' ? "" : "\n");
 	nvdimm_bus_unlock(dev);
-	device_unlock(dev);
+	nd_device_unlock(dev);
 
 	return rc < 0 ? rc : len;
 }
@@ -1376,7 +1376,7 @@ static ssize_t sector_size_store(struct device *dev,
 	} else
 		return -ENXIO;
 
-	device_lock(dev);
+	nd_device_lock(dev);
 	nvdimm_bus_lock(dev);
 	if (to_ndns(dev)->claim)
 		rc = -EBUSY;
@@ -1387,7 +1387,7 @@ static ssize_t sector_size_store(struct device *dev,
 	dev_dbg(dev, "result: %zd %s: %s%s", rc, rc < 0 ? "tried" : "wrote",
 			buf, buf[len - 1] == '\n' ? "" : "\n");
 	nvdimm_bus_unlock(dev);
-	device_unlock(dev);
+	nd_device_unlock(dev);
 
 	return rc ? rc : len;
 }
@@ -1502,9 +1502,9 @@ static ssize_t holder_show(struct device *dev,
 	struct nd_namespace_common *ndns = to_ndns(dev);
 	ssize_t rc;
 
-	device_lock(dev);
+	nd_device_lock(dev);
 	rc = sprintf(buf, "%s\n", ndns->claim ? dev_name(ndns->claim) : "");
-	device_unlock(dev);
+	nd_device_unlock(dev);
 
 	return rc;
 }
@@ -1541,7 +1541,7 @@ static ssize_t holder_class_store(struct device *dev,
 	struct nd_region *nd_region = to_nd_region(dev->parent);
 	ssize_t rc;
 
-	device_lock(dev);
+	nd_device_lock(dev);
 	nvdimm_bus_lock(dev);
 	wait_nvdimm_bus_probe_idle(dev);
 	rc = __holder_class_store(dev, buf);
@@ -1549,7 +1549,7 @@ static ssize_t holder_class_store(struct device *dev,
 		rc = nd_namespace_label_update(nd_region, dev);
 	dev_dbg(dev, "%s(%zd)\n", rc < 0 ? "fail " : "", rc);
 	nvdimm_bus_unlock(dev);
-	device_unlock(dev);
+	nd_device_unlock(dev);
 
 	return rc < 0 ? rc : len;
 }
@@ -1560,7 +1560,7 @@ static ssize_t holder_class_show(struct device *dev,
 	struct nd_namespace_common *ndns = to_ndns(dev);
 	ssize_t rc;
 
-	device_lock(dev);
+	nd_device_lock(dev);
 	if (ndns->claim_class == NVDIMM_CCLASS_NONE)
 		rc = sprintf(buf, "\n");
 	else if ((ndns->claim_class == NVDIMM_CCLASS_BTT) ||
@@ -1572,7 +1572,7 @@ static ssize_t holder_class_show(struct device *dev,
 		rc = sprintf(buf, "dax\n");
 	else
 		rc = sprintf(buf, "<unknown>\n");
-	device_unlock(dev);
+	nd_device_unlock(dev);
 
 	return rc;
 }
@@ -1586,7 +1586,7 @@ static ssize_t mode_show(struct device *dev,
 	char *mode;
 	ssize_t rc;
 
-	device_lock(dev);
+	nd_device_lock(dev);
 	claim = ndns->claim;
 	if (claim && is_nd_btt(claim))
 		mode = "safe";
@@ -1599,7 +1599,7 @@ static ssize_t mode_show(struct device *dev,
 	else
 		mode = "raw";
 	rc = sprintf(buf, "%s\n", mode);
-	device_unlock(dev);
+	nd_device_unlock(dev);
 
 	return rc;
 }
@@ -1703,8 +1703,8 @@ struct nd_namespace_common *nvdimm_namespace_common_probe(struct device *dev)
 		 * Flush any in-progess probes / removals in the driver
 		 * for the raw personality of this namespace.
 		 */
-		device_lock(&ndns->dev);
-		device_unlock(&ndns->dev);
+		nd_device_lock(&ndns->dev);
+		nd_device_unlock(&ndns->dev);
 		if (ndns->dev.driver) {
 			dev_dbg(&ndns->dev, "is active, can't bind %s\n",
 					dev_name(dev));
diff --git a/drivers/nvdimm/nd-core.h b/drivers/nvdimm/nd-core.h
index 6cd470547106..0ac52b6eb00e 100644
--- a/drivers/nvdimm/nd-core.h
+++ b/drivers/nvdimm/nd-core.h
@@ -9,6 +9,7 @@
 #include <linux/sizes.h>
 #include <linux/mutex.h>
 #include <linux/nd.h>
+#include "nd.h"
 
 extern struct list_head nvdimm_bus_list;
 extern struct mutex nvdimm_bus_list_mutex;
@@ -182,4 +183,71 @@ ssize_t nd_namespace_store(struct device *dev,
 		struct nd_namespace_common **_ndns, const char *buf,
 		size_t len);
 struct nd_pfn *to_nd_pfn_safe(struct device *dev);
+bool is_nvdimm_bus(struct device *dev);
+
+#ifdef CONFIG_PROVE_LOCKING
+extern struct class *nd_class;
+
+enum {
+	LOCK_BUS,
+	LOCK_NDCTL,
+	LOCK_REGION,
+	LOCK_DIMM = LOCK_REGION,
+	LOCK_NAMESPACE,
+	LOCK_CLAIM,
+};
+
+static inline void debug_nvdimm_lock(struct device *dev)
+{
+	if (is_nd_region(dev))
+		mutex_lock_nested(&dev->lockdep_mutex, LOCK_REGION);
+	else if (is_nvdimm(dev))
+		mutex_lock_nested(&dev->lockdep_mutex, LOCK_DIMM);
+	else if (is_nd_btt(dev) || is_nd_pfn(dev) || is_nd_dax(dev))
+		mutex_lock_nested(&dev->lockdep_mutex, LOCK_CLAIM);
+	else if (dev->parent && (is_nd_region(dev->parent)))
+		mutex_lock_nested(&dev->lockdep_mutex, LOCK_NAMESPACE);
+	else if (is_nvdimm_bus(dev))
+		mutex_lock_nested(&dev->lockdep_mutex, LOCK_BUS);
+	else if (dev->class && dev->class == nd_class)
+		mutex_lock_nested(&dev->lockdep_mutex, LOCK_NDCTL);
+	else
+		dev_WARN(dev, "unknown lock level\n");
+}
+
+static inline void debug_nvdimm_unlock(struct device *dev)
+{
+	mutex_unlock(&dev->lockdep_mutex);
+}
+
+static inline void nd_device_lock(struct device *dev)
+{
+	device_lock(dev);
+	debug_nvdimm_lock(dev);
+}
+
+static inline void nd_device_unlock(struct device *dev)
+{
+	debug_nvdimm_unlock(dev);
+	device_unlock(dev);
+}
+#else
+static inline void nd_device_lock(struct device *dev)
+{
+	device_lock(dev);
+}
+
+static inline void nd_device_unlock(struct device *dev)
+{
+	device_unlock(dev);
+}
+
+static inline void debug_nvdimm_lock(struct device *dev)
+{
+}
+
+static inline void debug_nvdimm_unlock(struct device *dev)
+{
+}
+#endif
 #endif /* __ND_CORE_H__ */
diff --git a/drivers/nvdimm/pfn_devs.c b/drivers/nvdimm/pfn_devs.c
index 0f81fc56bbfd..9b09fe18e666 100644
--- a/drivers/nvdimm/pfn_devs.c
+++ b/drivers/nvdimm/pfn_devs.c
@@ -67,7 +67,7 @@ static ssize_t mode_store(struct device *dev,
 	struct nd_pfn *nd_pfn = to_nd_pfn_safe(dev);
 	ssize_t rc = 0;
 
-	device_lock(dev);
+	nd_device_lock(dev);
 	nvdimm_bus_lock(dev);
 	if (dev->driver)
 		rc = -EBUSY;
@@ -89,7 +89,7 @@ static ssize_t mode_store(struct device *dev,
 	dev_dbg(dev, "result: %zd wrote: %s%s", rc, buf,
 			buf[len - 1] == '\n' ? "" : "\n");
 	nvdimm_bus_unlock(dev);
-	device_unlock(dev);
+	nd_device_unlock(dev);
 
 	return rc ? rc : len;
 }
@@ -132,14 +132,14 @@ static ssize_t align_store(struct device *dev,
 	struct nd_pfn *nd_pfn = to_nd_pfn_safe(dev);
 	ssize_t rc;
 
-	device_lock(dev);
+	nd_device_lock(dev);
 	nvdimm_bus_lock(dev);
 	rc = nd_size_select_store(dev, buf, &nd_pfn->align,
 			nd_pfn_supported_alignments());
 	dev_dbg(dev, "result: %zd wrote: %s%s", rc, buf,
 			buf[len - 1] == '\n' ? "" : "\n");
 	nvdimm_bus_unlock(dev);
-	device_unlock(dev);
+	nd_device_unlock(dev);
 
 	return rc ? rc : len;
 }
@@ -161,11 +161,11 @@ static ssize_t uuid_store(struct device *dev,
 	struct nd_pfn *nd_pfn = to_nd_pfn_safe(dev);
 	ssize_t rc;
 
-	device_lock(dev);
+	nd_device_lock(dev);
 	rc = nd_uuid_store(dev, &nd_pfn->uuid, buf, len);
 	dev_dbg(dev, "result: %zd wrote: %s%s", rc, buf,
 			buf[len - 1] == '\n' ? "" : "\n");
-	device_unlock(dev);
+	nd_device_unlock(dev);
 
 	return rc ? rc : len;
 }
@@ -190,13 +190,13 @@ static ssize_t namespace_store(struct device *dev,
 	struct nd_pfn *nd_pfn = to_nd_pfn_safe(dev);
 	ssize_t rc;
 
-	device_lock(dev);
+	nd_device_lock(dev);
 	nvdimm_bus_lock(dev);
 	rc = nd_namespace_store(dev, &nd_pfn->ndns, buf, len);
 	dev_dbg(dev, "result: %zd wrote: %s%s", rc, buf,
 			buf[len - 1] == '\n' ? "" : "\n");
 	nvdimm_bus_unlock(dev);
-	device_unlock(dev);
+	nd_device_unlock(dev);
 
 	return rc;
 }
@@ -208,7 +208,7 @@ static ssize_t resource_show(struct device *dev,
 	struct nd_pfn *nd_pfn = to_nd_pfn_safe(dev);
 	ssize_t rc;
 
-	device_lock(dev);
+	nd_device_lock(dev);
 	if (dev->driver) {
 		struct nd_pfn_sb *pfn_sb = nd_pfn->pfn_sb;
 		u64 offset = __le64_to_cpu(pfn_sb->dataoff);
@@ -222,7 +222,7 @@ static ssize_t resource_show(struct device *dev,
 		/* no address to convey if the pfn instance is disabled */
 		rc = -ENXIO;
 	}
-	device_unlock(dev);
+	nd_device_unlock(dev);
 
 	return rc;
 }
@@ -234,7 +234,7 @@ static ssize_t size_show(struct device *dev,
 	struct nd_pfn *nd_pfn = to_nd_pfn_safe(dev);
 	ssize_t rc;
 
-	device_lock(dev);
+	nd_device_lock(dev);
 	if (dev->driver) {
 		struct nd_pfn_sb *pfn_sb = nd_pfn->pfn_sb;
 		u64 offset = __le64_to_cpu(pfn_sb->dataoff);
@@ -250,7 +250,7 @@ static ssize_t size_show(struct device *dev,
 		/* no size to convey if the pfn instance is disabled */
 		rc = -ENXIO;
 	}
-	device_unlock(dev);
+	nd_device_unlock(dev);
 
 	return rc;
 }
diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index 28cb44c61d4a..53797e7be18a 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -520,8 +520,8 @@ static int nd_pmem_remove(struct device *dev)
 		nvdimm_namespace_detach_btt(to_nd_btt(dev));
 	else {
 		/*
-		 * Note, this assumes device_lock() context to not race
-		 * nd_pmem_notify()
+		 * Note, this assumes nd_device_lock() context to not
+		 * race nd_pmem_notify()
 		 */
 		sysfs_put(pmem->bb_state);
 		pmem->bb_state = NULL;
diff --git a/drivers/nvdimm/region.c b/drivers/nvdimm/region.c
index 488c47ac4c4a..37bf8719a2a4 100644
--- a/drivers/nvdimm/region.c
+++ b/drivers/nvdimm/region.c
@@ -102,7 +102,7 @@ static int nd_region_remove(struct device *dev)
 	nvdimm_bus_unlock(dev);
 
 	/*
-	 * Note, this assumes device_lock() context to not race
+	 * Note, this assumes nd_device_lock() context to not race
 	 * nd_region_notify()
 	 */
 	sysfs_put(nd_region->bb_state);
diff --git a/drivers/nvdimm/region_devs.c b/drivers/nvdimm/region_devs.c
index a15276cdec7d..91b5a7ade0d5 100644
--- a/drivers/nvdimm/region_devs.c
+++ b/drivers/nvdimm/region_devs.c
@@ -329,7 +329,7 @@ static ssize_t set_cookie_show(struct device *dev,
 	 * the v1.1 namespace label cookie definition. To read all this
 	 * data we need to wait for probing to settle.
 	 */
-	device_lock(dev);
+	nd_device_lock(dev);
 	nvdimm_bus_lock(dev);
 	wait_nvdimm_bus_probe_idle(dev);
 	if (nd_region->ndr_mappings) {
@@ -346,7 +346,7 @@ static ssize_t set_cookie_show(struct device *dev,
 		}
 	}
 	nvdimm_bus_unlock(dev);
-	device_unlock(dev);
+	nd_device_unlock(dev);
 
 	if (rc)
 		return rc;
@@ -422,12 +422,12 @@ static ssize_t available_size_show(struct device *dev,
 	 * memory nvdimm_bus_lock() is dropped, but that's userspace's
 	 * problem to not race itself.
 	 */
-	device_lock(dev);
+	nd_device_lock(dev);
 	nvdimm_bus_lock(dev);
 	wait_nvdimm_bus_probe_idle(dev);
 	available = nd_region_available_dpa(nd_region);
 	nvdimm_bus_unlock(dev);
-	device_unlock(dev);
+	nd_device_unlock(dev);
 
 	return sprintf(buf, "%llu\n", available);
 }
@@ -439,12 +439,12 @@ static ssize_t max_available_extent_show(struct device *dev,
 	struct nd_region *nd_region = to_nd_region(dev);
 	unsigned long long available = 0;
 
-	device_lock(dev);
+	nd_device_lock(dev);
 	nvdimm_bus_lock(dev);
 	wait_nvdimm_bus_probe_idle(dev);
 	available = nd_region_allocatable_dpa(nd_region);
 	nvdimm_bus_unlock(dev);
-	device_unlock(dev);
+	nd_device_unlock(dev);
 
 	return sprintf(buf, "%llu\n", available);
 }
@@ -563,12 +563,12 @@ static ssize_t region_badblocks_show(struct device *dev,
 	struct nd_region *nd_region = to_nd_region(dev);
 	ssize_t rc;
 
-	device_lock(dev);
+	nd_device_lock(dev);
 	if (dev->driver)
 		rc = badblocks_show(&nd_region->bb, buf, 0);
 	else
 		rc = -ENXIO;
-	device_unlock(dev);
+	nd_device_unlock(dev);
 
 	return rc;
 }
diff --git a/include/linux/device.h b/include/linux/device.h
index 0da5c67f6be1..9237b857b598 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -909,6 +909,8 @@ struct dev_links_info {
  * 		This identifies the device type and carries type-specific
  * 		information.
  * @mutex:	Mutex to synchronize calls to its driver.
+ * @lockdep_mutex: An optional debug lock that a subsystem can use as a
+ * 		peer lock to gain localized lockdep coverage of the device_lock.
  * @bus:	Type of bus device is on.
  * @driver:	Which driver has allocated this
  * @platform_data: Platform data specific to the device.
@@ -991,6 +993,9 @@ struct device {
 					   core doesn't touch it */
 	void		*driver_data;	/* Driver data, set and get with
 					   dev_set_drvdata/dev_get_drvdata */
+#ifdef CONFIG_PROVE_LOCKING
+	struct mutex		lockdep_mutex;
+#endif
 	struct mutex		mutex;	/* mutex to synchronize calls to
 					 * its driver.
 					 */
-- 
cgit v1.2.3


From 80ec922dbd87fd38d15719c86a94457204648aeb Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Thu, 18 Jul 2019 15:56:51 -0700
Subject: mm/memory_hotplug: allow arch_remove_memory() without
 CONFIG_MEMORY_HOTREMOVE

We want to improve error handling while adding memory by allowing the use
of arch_remove_memory() and __remove_pages() even if
CONFIG_MEMORY_HOTREMOVE is not set, e.g., to implement something like:

	arch_add_memory()
	rc = do_something();
	if (rc) {
		arch_remove_memory();
	}

We won't get rid of CONFIG_MEMORY_HOTREMOVE for now, as it will require
quite some dependencies for memory offlining.

Link: http://lkml.kernel.org/r/20190527111152.16324-7-david@redhat.com
Signed-off-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Pavel Tatashin <pasha.tatashin@soleen.com>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Yoshinori Sato <ysato@users.sourceforge.jp>
Cc: Rich Felker <dalias@libc.org>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Borislav Petkov <bp@alien8.de>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: "Rafael J. Wysocki" <rafael@kernel.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Oscar Salvador <osalvador@suse.com>
Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Alex Deucher <alexander.deucher@amd.com>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Mark Brown <broonie@kernel.org>
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Christophe Leroy <christophe.leroy@c-s.fr>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Rob Herring <robh@kernel.org>
Cc: Masahiro Yamada <yamada.masahiro@socionext.com>
Cc: "mike.travis@hpe.com" <mike.travis@hpe.com>
Cc: Andrew Banman <andrew.banman@hpe.com>
Cc: Arun KS <arunks@codeaurora.org>
Cc: Qian Cai <cai@lca.pw>
Cc: Mathieu Malaterre <malat@debian.org>
Cc: Baoquan He <bhe@redhat.com>
Cc: Logan Gunthorpe <logang@deltatee.com>
Cc: Anshuman Khandual <anshuman.khandual@arm.com>
Cc: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Chintan Pandya <cpandya@codeaurora.org>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Jun Yao <yaojun8558363@gmail.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mike Rapoport <rppt@linux.vnet.ibm.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Robin Murphy <robin.murphy@arm.com>
Cc: Wei Yang <richard.weiyang@gmail.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Yu Zhao <yuzhao@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/arm64/mm/mmu.c            | 2 --
 arch/ia64/mm/init.c            | 2 --
 arch/powerpc/mm/mem.c          | 2 --
 arch/s390/mm/init.c            | 2 --
 arch/sh/mm/init.c              | 2 --
 arch/x86/mm/init_32.c          | 2 --
 arch/x86/mm/init_64.c          | 2 --
 drivers/base/memory.c          | 2 --
 include/linux/memory.h         | 2 --
 include/linux/memory_hotplug.h | 2 --
 mm/memory_hotplug.c            | 2 --
 mm/sparse.c                    | 6 ------
 12 files changed, 28 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index a21fa7e1167d..750a69dde39b 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -1074,7 +1074,6 @@ int arch_add_memory(int nid, u64 start, u64 size,
 	return __add_pages(nid, start >> PAGE_SHIFT, size >> PAGE_SHIFT,
 			   restrictions);
 }
-#ifdef CONFIG_MEMORY_HOTREMOVE
 void arch_remove_memory(int nid, u64 start, u64 size,
 			struct vmem_altmap *altmap)
 {
@@ -1093,4 +1092,3 @@ void arch_remove_memory(int nid, u64 start, u64 size,
 	__remove_pages(zone, start_pfn, nr_pages, altmap);
 }
 #endif
-#endif
diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c
index d28e29103bdb..aae75fd7b810 100644
--- a/arch/ia64/mm/init.c
+++ b/arch/ia64/mm/init.c
@@ -681,7 +681,6 @@ int arch_add_memory(int nid, u64 start, u64 size,
 	return ret;
 }
 
-#ifdef CONFIG_MEMORY_HOTREMOVE
 void arch_remove_memory(int nid, u64 start, u64 size,
 			struct vmem_altmap *altmap)
 {
@@ -693,4 +692,3 @@ void arch_remove_memory(int nid, u64 start, u64 size,
 	__remove_pages(zone, start_pfn, nr_pages, altmap);
 }
 #endif
-#endif
diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
index 26a8da3723bb..9259337d7374 100644
--- a/arch/powerpc/mm/mem.c
+++ b/arch/powerpc/mm/mem.c
@@ -125,7 +125,6 @@ int __ref arch_add_memory(int nid, u64 start, u64 size,
 	return __add_pages(nid, start_pfn, nr_pages, restrictions);
 }
 
-#ifdef CONFIG_MEMORY_HOTREMOVE
 void __ref arch_remove_memory(int nid, u64 start, u64 size,
 			     struct vmem_altmap *altmap)
 {
@@ -151,7 +150,6 @@ void __ref arch_remove_memory(int nid, u64 start, u64 size,
 		pr_warn("Hash collision while resizing HPT\n");
 }
 #endif
-#endif /* CONFIG_MEMORY_HOTPLUG */
 
 #ifndef CONFIG_NEED_MULTIPLE_NODES
 void __init mem_topology_setup(void)
diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c
index 5b1ec2f532e0..4e5bbe328594 100644
--- a/arch/s390/mm/init.c
+++ b/arch/s390/mm/init.c
@@ -286,7 +286,6 @@ int arch_add_memory(int nid, u64 start, u64 size,
 	return rc;
 }
 
-#ifdef CONFIG_MEMORY_HOTREMOVE
 void arch_remove_memory(int nid, u64 start, u64 size,
 			struct vmem_altmap *altmap)
 {
@@ -298,5 +297,4 @@ void arch_remove_memory(int nid, u64 start, u64 size,
 	__remove_pages(zone, start_pfn, nr_pages, altmap);
 	vmem_remove_mapping(start, size);
 }
-#endif
 #endif /* CONFIG_MEMORY_HOTPLUG */
diff --git a/arch/sh/mm/init.c b/arch/sh/mm/init.c
index 13c6a6bb5fd9..dfdbaa50946e 100644
--- a/arch/sh/mm/init.c
+++ b/arch/sh/mm/init.c
@@ -429,7 +429,6 @@ int memory_add_physaddr_to_nid(u64 addr)
 EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
 #endif
 
-#ifdef CONFIG_MEMORY_HOTREMOVE
 void arch_remove_memory(int nid, u64 start, u64 size,
 			struct vmem_altmap *altmap)
 {
@@ -440,5 +439,4 @@ void arch_remove_memory(int nid, u64 start, u64 size,
 	zone = page_zone(pfn_to_page(start_pfn));
 	__remove_pages(zone, start_pfn, nr_pages, altmap);
 }
-#endif
 #endif /* CONFIG_MEMORY_HOTPLUG */
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index f265a4316179..4068abb9427f 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -860,7 +860,6 @@ int arch_add_memory(int nid, u64 start, u64 size,
 	return __add_pages(nid, start_pfn, nr_pages, restrictions);
 }
 
-#ifdef CONFIG_MEMORY_HOTREMOVE
 void arch_remove_memory(int nid, u64 start, u64 size,
 			struct vmem_altmap *altmap)
 {
@@ -872,7 +871,6 @@ void arch_remove_memory(int nid, u64 start, u64 size,
 	__remove_pages(zone, start_pfn, nr_pages, altmap);
 }
 #endif
-#endif
 
 int kernel_set_to_readonly __read_mostly;
 
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 08bbf648827b..5a289a2ab108 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -1198,7 +1198,6 @@ void __ref vmemmap_free(unsigned long start, unsigned long end,
 	remove_pagetable(start, end, false, altmap);
 }
 
-#ifdef CONFIG_MEMORY_HOTREMOVE
 static void __meminit
 kernel_physical_mapping_remove(unsigned long start, unsigned long end)
 {
@@ -1219,7 +1218,6 @@ void __ref arch_remove_memory(int nid, u64 start, u64 size,
 	__remove_pages(zone, start_pfn, nr_pages, altmap);
 	kernel_physical_mapping_remove(start, start + size);
 }
-#endif
 #endif /* CONFIG_MEMORY_HOTPLUG */
 
 static struct kcore_list kcore_vsyscall;
diff --git a/drivers/base/memory.c b/drivers/base/memory.c
index e0aa7f9abb36..92459d6f12be 100644
--- a/drivers/base/memory.c
+++ b/drivers/base/memory.c
@@ -723,7 +723,6 @@ out:
 	return ret;
 }
 
-#ifdef CONFIG_MEMORY_HOTREMOVE
 static void
 unregister_memory(struct memory_block *memory)
 {
@@ -762,7 +761,6 @@ void unregister_memory_section(struct mem_section *section)
 out_unlock:
 	mutex_unlock(&mem_sysfs_mutex);
 }
-#endif /* CONFIG_MEMORY_HOTREMOVE */
 
 /* return true if the memory block is offlined, otherwise, return false */
 bool is_memblock_offlined(struct memory_block *mem)
diff --git a/include/linux/memory.h b/include/linux/memory.h
index e1dc1bb2b787..474c7c60c8f2 100644
--- a/include/linux/memory.h
+++ b/include/linux/memory.h
@@ -112,9 +112,7 @@ extern void unregister_memory_notifier(struct notifier_block *nb);
 extern int register_memory_isolate_notifier(struct notifier_block *nb);
 extern void unregister_memory_isolate_notifier(struct notifier_block *nb);
 int hotplug_memory_register(int nid, struct mem_section *section);
-#ifdef CONFIG_MEMORY_HOTREMOVE
 extern void unregister_memory_section(struct mem_section *);
-#endif
 extern int memory_dev_init(void);
 extern int memory_notify(unsigned long val, void *v);
 extern int memory_isolate_notify(unsigned long val, void *v);
diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index 988fde33cd7f..87bf9c4a889e 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -123,12 +123,10 @@ static inline bool movable_node_is_enabled(void)
 	return movable_node_enabled;
 }
 
-#ifdef CONFIG_MEMORY_HOTREMOVE
 extern void arch_remove_memory(int nid, u64 start, u64 size,
 			       struct vmem_altmap *altmap);
 extern void __remove_pages(struct zone *zone, unsigned long start_pfn,
 			   unsigned long nr_pages, struct vmem_altmap *altmap);
-#endif /* CONFIG_MEMORY_HOTREMOVE */
 
 /*
  * Do we want sysfs memblock files created. This will allow userspace to online
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index a8c25fd85ee3..bc11888d5d7e 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -318,7 +318,6 @@ out:
 	return err;
 }
 
-#ifdef CONFIG_MEMORY_HOTREMOVE
 /* find the smallest valid pfn in the range [start_pfn, end_pfn) */
 static unsigned long find_smallest_section_pfn(int nid, struct zone *zone,
 				     unsigned long start_pfn,
@@ -580,7 +579,6 @@ void __remove_pages(struct zone *zone, unsigned long phys_start_pfn,
 
 	set_zone_contiguous(zone);
 }
-#endif /* CONFIG_MEMORY_HOTREMOVE */
 
 int set_online_page_callback(online_page_callback_t callback)
 {
diff --git a/mm/sparse.c b/mm/sparse.c
index fd13166949b5..d1d5e05f5b8d 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -604,7 +604,6 @@ static void __kfree_section_memmap(struct page *memmap,
 
 	vmemmap_free(start, end, altmap);
 }
-#ifdef CONFIG_MEMORY_HOTREMOVE
 static void free_map_bootmem(struct page *memmap)
 {
 	unsigned long start = (unsigned long)memmap;
@@ -612,7 +611,6 @@ static void free_map_bootmem(struct page *memmap)
 
 	vmemmap_free(start, end, NULL);
 }
-#endif /* CONFIG_MEMORY_HOTREMOVE */
 #else
 static struct page *__kmalloc_section_memmap(void)
 {
@@ -651,7 +649,6 @@ static void __kfree_section_memmap(struct page *memmap,
 			   get_order(sizeof(struct page) * PAGES_PER_SECTION));
 }
 
-#ifdef CONFIG_MEMORY_HOTREMOVE
 static void free_map_bootmem(struct page *memmap)
 {
 	unsigned long maps_section_nr, removing_section_nr, i;
@@ -681,7 +678,6 @@ static void free_map_bootmem(struct page *memmap)
 			put_page_bootmem(page);
 	}
 }
-#endif /* CONFIG_MEMORY_HOTREMOVE */
 #endif /* CONFIG_SPARSEMEM_VMEMMAP */
 
 /**
@@ -746,7 +742,6 @@ out:
 	return ret;
 }
 
-#ifdef CONFIG_MEMORY_HOTREMOVE
 #ifdef CONFIG_MEMORY_FAILURE
 static void clear_hwpoisoned_pages(struct page *memmap, int nr_pages)
 {
@@ -823,5 +818,4 @@ void sparse_remove_one_section(struct zone *zone, struct mem_section *ms,
 			PAGES_PER_SECTION - map_offset);
 	free_section_usemap(memmap, usemap, altmap);
 }
-#endif /* CONFIG_MEMORY_HOTREMOVE */
 #endif /* CONFIG_MEMORY_HOTPLUG */
-- 
cgit v1.2.3


From db051a0dac13db24d58470d75cee0ce7c6b031a1 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Thu, 18 Jul 2019 15:56:56 -0700
Subject: mm/memory_hotplug: create memory block devices after
 arch_add_memory()

Only memory to be added to the buddy and to be onlined/offlined by user
space using /sys/devices/system/memory/...  needs (and should have!)
memory block devices.

Factor out creation of memory block devices.  Create all devices after
arch_add_memory() succeeded.  We can later drop the want_memblock
parameter, because it is now effectively stale.

Memory can be onlined by user space only after memory block devices have
been added.  This implies that memory is not visible to user space at all
before arch_add_memory() has succeeded.

While at it
 - use WARN_ON_ONCE instead of BUG_ON in moved unregister_memory()
 - introduce find_memory_block_by_id() to search via block id
 - Use find_memory_block_by_id() in init_memory_block() to catch
   duplicates

Link: http://lkml.kernel.org/r/20190527111152.16324-8-david@redhat.com
Signed-off-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Pavel Tatashin <pasha.tatashin@soleen.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: "Rafael J. Wysocki" <rafael@kernel.org>
Cc: David Hildenbrand <david@redhat.com>
Cc: "mike.travis@hpe.com" <mike.travis@hpe.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Andrew Banman <andrew.banman@hpe.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Qian Cai <cai@lca.pw>
Cc: Wei Yang <richard.weiyang@gmail.com>
Cc: Arun KS <arunks@codeaurora.org>
Cc: Mathieu Malaterre <malat@debian.org>
Cc: Alex Deucher <alexander.deucher@amd.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Anshuman Khandual <anshuman.khandual@arm.com>
Cc: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Cc: Baoquan He <bhe@redhat.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Chintan Pandya <cpandya@codeaurora.org>
Cc: Christophe Leroy <christophe.leroy@c-s.fr>
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Jun Yao <yaojun8558363@gmail.com>
Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Logan Gunthorpe <logang@deltatee.com>
Cc: Mark Brown <broonie@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Masahiro Yamada <yamada.masahiro@socionext.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Mike Rapoport <rppt@linux.vnet.ibm.com>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Oscar Salvador <osalvador@suse.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rich Felker <dalias@libc.org>
Cc: Rob Herring <robh@kernel.org>
Cc: Robin Murphy <robin.murphy@arm.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Yoshinori Sato <ysato@users.sourceforge.jp>
Cc: Yu Zhao <yuzhao@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/base/memory.c  | 82 +++++++++++++++++++++++++++++++++-----------------
 include/linux/memory.h |  2 +-
 mm/memory_hotplug.c    | 15 ++++-----
 3 files changed, 63 insertions(+), 36 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/base/memory.c b/drivers/base/memory.c
index 92459d6f12be..18a30c3ac0ef 100644
--- a/drivers/base/memory.c
+++ b/drivers/base/memory.c
@@ -39,6 +39,11 @@ static inline int base_memory_block_id(int section_nr)
 	return section_nr / sections_per_block;
 }
 
+static inline int pfn_to_block_id(unsigned long pfn)
+{
+	return base_memory_block_id(pfn_to_section_nr(pfn));
+}
+
 static int memory_subsys_online(struct device *dev);
 static int memory_subsys_offline(struct device *dev);
 
@@ -582,10 +587,9 @@ int __weak arch_get_memory_phys_device(unsigned long start_pfn)
  * A reference for the returned object is held and the reference for the
  * hinted object is released.
  */
-struct memory_block *find_memory_block_hinted(struct mem_section *section,
-					      struct memory_block *hint)
+static struct memory_block *find_memory_block_by_id(int block_id,
+						    struct memory_block *hint)
 {
-	int block_id = base_memory_block_id(__section_nr(section));
 	struct device *hintdev = hint ? &hint->dev : NULL;
 	struct device *dev;
 
@@ -597,6 +601,14 @@ struct memory_block *find_memory_block_hinted(struct mem_section *section,
 	return to_memory_block(dev);
 }
 
+struct memory_block *find_memory_block_hinted(struct mem_section *section,
+					      struct memory_block *hint)
+{
+	int block_id = base_memory_block_id(__section_nr(section));
+
+	return find_memory_block_by_id(block_id, hint);
+}
+
 /*
  * For now, we have a linear search to go find the appropriate
  * memory_block corresponding to a particular phys_index. If
@@ -658,6 +670,11 @@ static int init_memory_block(struct memory_block **memory, int block_id,
 	unsigned long start_pfn;
 	int ret = 0;
 
+	mem = find_memory_block_by_id(block_id, NULL);
+	if (mem) {
+		put_device(&mem->dev);
+		return -EEXIST;
+	}
 	mem = kzalloc(sizeof(*mem), GFP_KERNEL);
 	if (!mem)
 		return -ENOMEM;
@@ -695,44 +712,53 @@ static int add_memory_block(int base_section_nr)
 	return 0;
 }
 
+static void unregister_memory(struct memory_block *memory)
+{
+	if (WARN_ON_ONCE(memory->dev.bus != &memory_subsys))
+		return;
+
+	/* drop the ref. we got via find_memory_block() */
+	put_device(&memory->dev);
+	device_unregister(&memory->dev);
+}
+
 /*
- * need an interface for the VM to add new memory regions,
- * but without onlining it.
+ * Create memory block devices for the given memory area. Start and size
+ * have to be aligned to memory block granularity. Memory block devices
+ * will be initialized as offline.
  */
-int hotplug_memory_register(int nid, struct mem_section *section)
+int create_memory_block_devices(unsigned long start, unsigned long size)
 {
-	int block_id = base_memory_block_id(__section_nr(section));
-	int ret = 0;
+	const int start_block_id = pfn_to_block_id(PFN_DOWN(start));
+	int end_block_id = pfn_to_block_id(PFN_DOWN(start + size));
 	struct memory_block *mem;
+	unsigned long block_id;
+	int ret = 0;
 
-	mutex_lock(&mem_sysfs_mutex);
+	if (WARN_ON_ONCE(!IS_ALIGNED(start, memory_block_size_bytes()) ||
+			 !IS_ALIGNED(size, memory_block_size_bytes())))
+		return -EINVAL;
 
-	mem = find_memory_block(section);
-	if (mem) {
-		mem->section_count++;
-		put_device(&mem->dev);
-	} else {
+	mutex_lock(&mem_sysfs_mutex);
+	for (block_id = start_block_id; block_id != end_block_id; block_id++) {
 		ret = init_memory_block(&mem, block_id, MEM_OFFLINE);
 		if (ret)
-			goto out;
-		mem->section_count++;
+			break;
+		mem->section_count = sections_per_block;
+	}
+	if (ret) {
+		end_block_id = block_id;
+		for (block_id = start_block_id; block_id != end_block_id;
+		     block_id++) {
+			mem = find_memory_block_by_id(block_id, NULL);
+			mem->section_count = 0;
+			unregister_memory(mem);
+		}
 	}
-
-out:
 	mutex_unlock(&mem_sysfs_mutex);
 	return ret;
 }
 
-static void
-unregister_memory(struct memory_block *memory)
-{
-	BUG_ON(memory->dev.bus != &memory_subsys);
-
-	/* drop the ref. we got via find_memory_block() */
-	put_device(&memory->dev);
-	device_unregister(&memory->dev);
-}
-
 void unregister_memory_section(struct mem_section *section)
 {
 	struct memory_block *mem;
diff --git a/include/linux/memory.h b/include/linux/memory.h
index 474c7c60c8f2..db3e8567f900 100644
--- a/include/linux/memory.h
+++ b/include/linux/memory.h
@@ -111,7 +111,7 @@ extern int register_memory_notifier(struct notifier_block *nb);
 extern void unregister_memory_notifier(struct notifier_block *nb);
 extern int register_memory_isolate_notifier(struct notifier_block *nb);
 extern void unregister_memory_isolate_notifier(struct notifier_block *nb);
-int hotplug_memory_register(int nid, struct mem_section *section);
+int create_memory_block_devices(unsigned long start, unsigned long size);
 extern void unregister_memory_section(struct mem_section *);
 extern int memory_dev_init(void);
 extern int memory_notify(unsigned long val, void *v);
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index bc11888d5d7e..78291526eb4d 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -259,13 +259,7 @@ static int __meminit __add_section(int nid, unsigned long phys_start_pfn,
 		return -EEXIST;
 
 	ret = sparse_add_one_section(nid, phys_start_pfn, altmap);
-	if (ret < 0)
-		return ret;
-
-	if (!want_memblock)
-		return 0;
-
-	return hotplug_memory_register(nid, __pfn_to_section(phys_start_pfn));
+	return ret < 0 ? ret : 0;
 }
 
 /*
@@ -1105,6 +1099,13 @@ int __ref add_memory_resource(int nid, struct resource *res)
 	if (ret < 0)
 		goto error;
 
+	/* create memory block devices after memory was added */
+	ret = create_memory_block_devices(start, size);
+	if (ret) {
+		arch_remove_memory(nid, start, size, NULL);
+		goto error;
+	}
+
 	if (new_node) {
 		/* If sysfs file of new node can't be created, cpu on the node
 		 * can't be hot-added. There is no rollback way now.
-- 
cgit v1.2.3


From 05f800a0bd08e14606ac63e0a5c63ed6880acaab Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Thu, 18 Jul 2019 15:57:01 -0700
Subject: mm/memory_hotplug: drop MHP_MEMBLOCK_API

No longer needed, the callers of arch_add_memory() can handle this
manually.

Link: http://lkml.kernel.org/r/20190527111152.16324-9-david@redhat.com
Signed-off-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Wei Yang <richardw.yang@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Oscar Salvador <osalvador@suse.com>
Cc: Pavel Tatashin <pasha.tatashin@soleen.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Qian Cai <cai@lca.pw>
Cc: Arun KS <arunks@codeaurora.org>
Cc: Mathieu Malaterre <malat@debian.org>
Cc: Mike Rapoport <rppt@linux.vnet.ibm.com>
Cc: Alex Deucher <alexander.deucher@amd.com>
Cc: Andrew Banman <andrew.banman@hpe.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Anshuman Khandual <anshuman.khandual@arm.com>
Cc: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Cc: Baoquan He <bhe@redhat.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Chintan Pandya <cpandya@codeaurora.org>
Cc: Christophe Leroy <christophe.leroy@c-s.fr>
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Cc: Jun Yao <yaojun8558363@gmail.com>
Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Logan Gunthorpe <logang@deltatee.com>
Cc: Mark Brown <broonie@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Masahiro Yamada <yamada.masahiro@socionext.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Mike Rapoport <rppt@linux.ibm.com>
Cc: "mike.travis@hpe.com" <mike.travis@hpe.com>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: "Rafael J. Wysocki" <rafael@kernel.org>
Cc: Rich Felker <dalias@libc.org>
Cc: Rob Herring <robh@kernel.org>
Cc: Robin Murphy <robin.murphy@arm.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Yoshinori Sato <ysato@users.sourceforge.jp>
Cc: Yu Zhao <yuzhao@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/memory_hotplug.h | 8 --------
 mm/memory_hotplug.c            | 9 +++------
 2 files changed, 3 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index 87bf9c4a889e..36c514b80cf1 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -128,14 +128,6 @@ extern void arch_remove_memory(int nid, u64 start, u64 size,
 extern void __remove_pages(struct zone *zone, unsigned long start_pfn,
 			   unsigned long nr_pages, struct vmem_altmap *altmap);
 
-/*
- * Do we want sysfs memblock files created. This will allow userspace to online
- * and offline memory explicitly. Lack of this bit means that the caller has to
- * call move_pfn_range_to_zone to finish the initialization.
- */
-
-#define MHP_MEMBLOCK_API               (1<<0)
-
 /* reasonably generic interface to expand the physical pages */
 extern int __add_pages(int nid, unsigned long start_pfn, unsigned long nr_pages,
 		       struct mhp_restrictions *restrictions);
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 78291526eb4d..fb9dc3fa1138 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -251,7 +251,7 @@ void __init register_page_bootmem_info_node(struct pglist_data *pgdat)
 #endif /* CONFIG_HAVE_BOOTMEM_INFO_NODE */
 
 static int __meminit __add_section(int nid, unsigned long phys_start_pfn,
-		struct vmem_altmap *altmap, bool want_memblock)
+				   struct vmem_altmap *altmap)
 {
 	int ret;
 
@@ -294,8 +294,7 @@ int __ref __add_pages(int nid, unsigned long phys_start_pfn,
 	}
 
 	for (i = start_sec; i <= end_sec; i++) {
-		err = __add_section(nid, section_nr_to_pfn(i), altmap,
-				restrictions->flags & MHP_MEMBLOCK_API);
+		err = __add_section(nid, section_nr_to_pfn(i), altmap);
 
 		/*
 		 * EEXIST is finally dealt with by ioresource collision
@@ -1065,9 +1064,7 @@ static int online_memory_block(struct memory_block *mem, void *arg)
  */
 int __ref add_memory_resource(int nid, struct resource *res)
 {
-	struct mhp_restrictions restrictions = {
-		.flags = MHP_MEMBLOCK_API,
-	};
+	struct mhp_restrictions restrictions = {};
 	u64 start, size;
 	bool new_node = false;
 	int ret;
-- 
cgit v1.2.3


From 4c4b7f9ba9486c565aead99a198ceeef73ae81f6 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Thu, 18 Jul 2019 15:57:06 -0700
Subject: mm/memory_hotplug: remove memory block devices before
 arch_remove_memory()

Let's factor out the removal of memory block devices, which is only
necessary for memory added via add_memory() and friends that created
memory block devices.  Remove the devices before calling
arch_remove_memory().

This finishes factoring out memory block device handling from
arch_add_memory() and arch_remove_memory().

Link: http://lkml.kernel.org/r/20190527111152.16324-10-david@redhat.com
Signed-off-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Dan Williams <dan.j.williams@intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: "Rafael J. Wysocki" <rafael@kernel.org>
Cc: David Hildenbrand <david@redhat.com>
Cc: "mike.travis@hpe.com" <mike.travis@hpe.com>
Cc: Andrew Banman <andrew.banman@hpe.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Alex Deucher <alexander.deucher@amd.com>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Mark Brown <broonie@kernel.org>
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Cc: Arun KS <arunks@codeaurora.org>
Cc: Mathieu Malaterre <malat@debian.org>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Anshuman Khandual <anshuman.khandual@arm.com>
Cc: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Cc: Baoquan He <bhe@redhat.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Chintan Pandya <cpandya@codeaurora.org>
Cc: Christophe Leroy <christophe.leroy@c-s.fr>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Jun Yao <yaojun8558363@gmail.com>
Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Logan Gunthorpe <logang@deltatee.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Masahiro Yamada <yamada.masahiro@socionext.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Mike Rapoport <rppt@linux.vnet.ibm.com>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Oscar Salvador <osalvador@suse.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Pavel Tatashin <pasha.tatashin@soleen.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Qian Cai <cai@lca.pw>
Cc: Rich Felker <dalias@libc.org>
Cc: Rob Herring <robh@kernel.org>
Cc: Robin Murphy <robin.murphy@arm.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Wei Yang <richard.weiyang@gmail.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Yoshinori Sato <ysato@users.sourceforge.jp>
Cc: Yu Zhao <yuzhao@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/base/memory.c  | 37 ++++++++++++++++++-------------------
 drivers/base/node.c    | 11 ++++++-----
 include/linux/memory.h |  2 +-
 include/linux/node.h   |  6 ++----
 mm/memory_hotplug.c    |  5 +++--
 5 files changed, 30 insertions(+), 31 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/base/memory.c b/drivers/base/memory.c
index 18a30c3ac0ef..826dd76f662e 100644
--- a/drivers/base/memory.c
+++ b/drivers/base/memory.c
@@ -759,32 +759,31 @@ int create_memory_block_devices(unsigned long start, unsigned long size)
 	return ret;
 }
 
-void unregister_memory_section(struct mem_section *section)
+/*
+ * Remove memory block devices for the given memory area. Start and size
+ * have to be aligned to memory block granularity. Memory block devices
+ * have to be offline.
+ */
+void remove_memory_block_devices(unsigned long start, unsigned long size)
 {
+	const int start_block_id = pfn_to_block_id(PFN_DOWN(start));
+	const int end_block_id = pfn_to_block_id(PFN_DOWN(start + size));
 	struct memory_block *mem;
+	int block_id;
 
-	if (WARN_ON_ONCE(!present_section(section)))
+	if (WARN_ON_ONCE(!IS_ALIGNED(start, memory_block_size_bytes()) ||
+			 !IS_ALIGNED(size, memory_block_size_bytes())))
 		return;
 
 	mutex_lock(&mem_sysfs_mutex);
-
-	/*
-	 * Some users of the memory hotplug do not want/need memblock to
-	 * track all sections. Skip over those.
-	 */
-	mem = find_memory_block(section);
-	if (!mem)
-		goto out_unlock;
-
-	unregister_mem_sect_under_nodes(mem, __section_nr(section));
-
-	mem->section_count--;
-	if (mem->section_count == 0)
+	for (block_id = start_block_id; block_id != end_block_id; block_id++) {
+		mem = find_memory_block_by_id(block_id, NULL);
+		if (WARN_ON_ONCE(!mem))
+			continue;
+		mem->section_count = 0;
+		unregister_memory_block_under_nodes(mem);
 		unregister_memory(mem);
-	else
-		put_device(&mem->dev);
-
-out_unlock:
+	}
 	mutex_unlock(&mem_sysfs_mutex);
 }
 
diff --git a/drivers/base/node.c b/drivers/base/node.c
index aa878fbcf705..0b0f38c2c7cd 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -802,9 +802,10 @@ int register_mem_sect_under_node(struct memory_block *mem_blk, void *arg)
 	return 0;
 }
 
-/* unregister memory section under all nodes that it spans */
-int unregister_mem_sect_under_nodes(struct memory_block *mem_blk,
-				    unsigned long phys_index)
+/*
+ * Unregister memory block device under all nodes that it spans.
+ */
+int unregister_memory_block_under_nodes(struct memory_block *mem_blk)
 {
 	NODEMASK_ALLOC(nodemask_t, unlinked_nodes, GFP_KERNEL);
 	unsigned long pfn, sect_start_pfn, sect_end_pfn;
@@ -817,8 +818,8 @@ int unregister_mem_sect_under_nodes(struct memory_block *mem_blk,
 		return -ENOMEM;
 	nodes_clear(*unlinked_nodes);
 
-	sect_start_pfn = section_nr_to_pfn(phys_index);
-	sect_end_pfn = sect_start_pfn + PAGES_PER_SECTION - 1;
+	sect_start_pfn = section_nr_to_pfn(mem_blk->start_section_nr);
+	sect_end_pfn = section_nr_to_pfn(mem_blk->end_section_nr);
 	for (pfn = sect_start_pfn; pfn <= sect_end_pfn; pfn++) {
 		int nid;
 
diff --git a/include/linux/memory.h b/include/linux/memory.h
index db3e8567f900..f26a5417ec5d 100644
--- a/include/linux/memory.h
+++ b/include/linux/memory.h
@@ -112,7 +112,7 @@ extern void unregister_memory_notifier(struct notifier_block *nb);
 extern int register_memory_isolate_notifier(struct notifier_block *nb);
 extern void unregister_memory_isolate_notifier(struct notifier_block *nb);
 int create_memory_block_devices(unsigned long start, unsigned long size);
-extern void unregister_memory_section(struct mem_section *);
+void remove_memory_block_devices(unsigned long start, unsigned long size);
 extern int memory_dev_init(void);
 extern int memory_notify(unsigned long val, void *v);
 extern int memory_isolate_notify(unsigned long val, void *v);
diff --git a/include/linux/node.h b/include/linux/node.h
index 1a557c589ecb..02a29e71b175 100644
--- a/include/linux/node.h
+++ b/include/linux/node.h
@@ -139,8 +139,7 @@ extern int register_cpu_under_node(unsigned int cpu, unsigned int nid);
 extern int unregister_cpu_under_node(unsigned int cpu, unsigned int nid);
 extern int register_mem_sect_under_node(struct memory_block *mem_blk,
 						void *arg);
-extern int unregister_mem_sect_under_nodes(struct memory_block *mem_blk,
-					   unsigned long phys_index);
+extern int unregister_memory_block_under_nodes(struct memory_block *mem_blk);
 
 extern int register_memory_node_under_compute_node(unsigned int mem_nid,
 						   unsigned int cpu_nid,
@@ -176,8 +175,7 @@ static inline int register_mem_sect_under_node(struct memory_block *mem_blk,
 {
 	return 0;
 }
-static inline int unregister_mem_sect_under_nodes(struct memory_block *mem_blk,
-						  unsigned long phys_index)
+static inline int unregister_memory_block_under_nodes(struct memory_block *mem_blk)
 {
 	return 0;
 }
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index fb9dc3fa1138..37c861e7a717 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -520,8 +520,6 @@ static void __remove_section(struct zone *zone, struct mem_section *ms,
 	if (WARN_ON_ONCE(!valid_section(ms)))
 		return;
 
-	unregister_memory_section(ms);
-
 	scn_nr = __section_nr(ms);
 	start_pfn = section_nr_to_pfn((unsigned long)scn_nr);
 	__remove_zone(zone, start_pfn);
@@ -1834,6 +1832,9 @@ static int __ref try_remove_memory(int nid, u64 start, u64 size)
 	memblock_free(start, size);
 	memblock_remove(start, size);
 
+	/* remove memory block devices before removing memory */
+	remove_memory_block_devices(start, size);
+
 	arch_remove_memory(nid, start, size, NULL);
 	__release_memory_resource(start, size);
 
-- 
cgit v1.2.3


From a31b264c2b415b29660da0bc2ba291a98629ce51 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Thu, 18 Jul 2019 15:57:12 -0700
Subject: mm/memory_hotplug: make unregister_memory_block_under_nodes() never
 fail

We really don't want anything during memory hotunplug to fail.  We
always pass a valid memory block device, that check can go.  Avoid
allocating memory and eventually failing.  As we are always called under
lock, we can use a static piece of memory.  This avoids having to put
the structure onto the stack, having to guess about the stack size of
callers.

Patch inspired by a patch from Oscar Salvador.

In the future, there might be no need to iterate over nodes at all.
mem->nid should tell us exactly what to remove.  Memory block devices
with mixed nodes (added during boot) should be properly fenced off and
never removed.

Link: http://lkml.kernel.org/r/20190527111152.16324-11-david@redhat.com
Signed-off-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Wei Yang <richardw.yang@linux.intel.com>
Reviewed-by: Oscar Salvador <osalvador@suse.de>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: "Rafael J. Wysocki" <rafael@kernel.org>
Cc: Alex Deucher <alexander.deucher@amd.com>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Mark Brown <broonie@kernel.org>
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Cc: David Hildenbrand <david@redhat.com>
Cc: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Cc: Andrew Banman <andrew.banman@hpe.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Anshuman Khandual <anshuman.khandual@arm.com>
Cc: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Cc: Arun KS <arunks@codeaurora.org>
Cc: Baoquan He <bhe@redhat.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Chintan Pandya <cpandya@codeaurora.org>
Cc: Christophe Leroy <christophe.leroy@c-s.fr>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Jun Yao <yaojun8558363@gmail.com>
Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Logan Gunthorpe <logang@deltatee.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Masahiro Yamada <yamada.masahiro@socionext.com>
Cc: Mathieu Malaterre <malat@debian.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Mike Rapoport <rppt@linux.vnet.ibm.com>
Cc: "mike.travis@hpe.com" <mike.travis@hpe.com>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Pavel Tatashin <pasha.tatashin@soleen.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Qian Cai <cai@lca.pw>
Cc: Rich Felker <dalias@libc.org>
Cc: Rob Herring <robh@kernel.org>
Cc: Robin Murphy <robin.murphy@arm.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Yoshinori Sato <ysato@users.sourceforge.jp>
Cc: Yu Zhao <yuzhao@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/base/node.c  | 18 +++++-------------
 include/linux/node.h |  5 ++---
 2 files changed, 7 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/base/node.c b/drivers/base/node.c
index 0b0f38c2c7cd..beec80649b33 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -804,20 +804,14 @@ int register_mem_sect_under_node(struct memory_block *mem_blk, void *arg)
 
 /*
  * Unregister memory block device under all nodes that it spans.
+ * Has to be called with mem_sysfs_mutex held (due to unlinked_nodes).
  */
-int unregister_memory_block_under_nodes(struct memory_block *mem_blk)
+void unregister_memory_block_under_nodes(struct memory_block *mem_blk)
 {
-	NODEMASK_ALLOC(nodemask_t, unlinked_nodes, GFP_KERNEL);
 	unsigned long pfn, sect_start_pfn, sect_end_pfn;
+	static nodemask_t unlinked_nodes;
 
-	if (!mem_blk) {
-		NODEMASK_FREE(unlinked_nodes);
-		return -EFAULT;
-	}
-	if (!unlinked_nodes)
-		return -ENOMEM;
-	nodes_clear(*unlinked_nodes);
-
+	nodes_clear(unlinked_nodes);
 	sect_start_pfn = section_nr_to_pfn(mem_blk->start_section_nr);
 	sect_end_pfn = section_nr_to_pfn(mem_blk->end_section_nr);
 	for (pfn = sect_start_pfn; pfn <= sect_end_pfn; pfn++) {
@@ -828,15 +822,13 @@ int unregister_memory_block_under_nodes(struct memory_block *mem_blk)
 			continue;
 		if (!node_online(nid))
 			continue;
-		if (node_test_and_set(nid, *unlinked_nodes))
+		if (node_test_and_set(nid, unlinked_nodes))
 			continue;
 		sysfs_remove_link(&node_devices[nid]->dev.kobj,
 			 kobject_name(&mem_blk->dev.kobj));
 		sysfs_remove_link(&mem_blk->dev.kobj,
 			 kobject_name(&node_devices[nid]->dev.kobj));
 	}
-	NODEMASK_FREE(unlinked_nodes);
-	return 0;
 }
 
 int link_mem_sections(int nid, unsigned long start_pfn, unsigned long end_pfn)
diff --git a/include/linux/node.h b/include/linux/node.h
index 02a29e71b175..548c226966a2 100644
--- a/include/linux/node.h
+++ b/include/linux/node.h
@@ -139,7 +139,7 @@ extern int register_cpu_under_node(unsigned int cpu, unsigned int nid);
 extern int unregister_cpu_under_node(unsigned int cpu, unsigned int nid);
 extern int register_mem_sect_under_node(struct memory_block *mem_blk,
 						void *arg);
-extern int unregister_memory_block_under_nodes(struct memory_block *mem_blk);
+extern void unregister_memory_block_under_nodes(struct memory_block *mem_blk);
 
 extern int register_memory_node_under_compute_node(unsigned int mem_nid,
 						   unsigned int cpu_nid,
@@ -175,9 +175,8 @@ static inline int register_mem_sect_under_node(struct memory_block *mem_blk,
 {
 	return 0;
 }
-static inline int unregister_memory_block_under_nodes(struct memory_block *mem_blk)
+static inline void unregister_memory_block_under_nodes(struct memory_block *mem_blk)
 {
-	return 0;
 }
 
 static inline void register_hugetlbfs_with_node(node_registration_func_t reg,
-- 
cgit v1.2.3


From b9bf8d342d9b443c0d19aa57883d8ddb38d965de Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Thu, 18 Jul 2019 15:57:17 -0700
Subject: mm/memory_hotplug: remove "zone" parameter from
 sparse_remove_one_section

The parameter is unused, so let's drop it.  Memory removal paths should
never care about zones.  This is the job of memory offlining and will
require more refactorings.

Link: http://lkml.kernel.org/r/20190527111152.16324-12-david@redhat.com
Signed-off-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Dan Williams <dan.j.williams@intel.com>
Reviewed-by: Wei Yang <richardw.yang@linux.intel.com>
Reviewed-by: Oscar Salvador <osalvador@suse.de>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Alex Deucher <alexander.deucher@amd.com>
Cc: Andrew Banman <andrew.banman@hpe.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Anshuman Khandual <anshuman.khandual@arm.com>
Cc: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Cc: Arun KS <arunks@codeaurora.org>
Cc: Baoquan He <bhe@redhat.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Chintan Pandya <cpandya@codeaurora.org>
Cc: Christophe Leroy <christophe.leroy@c-s.fr>
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Jun Yao <yaojun8558363@gmail.com>
Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Logan Gunthorpe <logang@deltatee.com>
Cc: Mark Brown <broonie@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Masahiro Yamada <yamada.masahiro@socionext.com>
Cc: Mathieu Malaterre <malat@debian.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Mike Rapoport <rppt@linux.vnet.ibm.com>
Cc: "mike.travis@hpe.com" <mike.travis@hpe.com>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Pavel Tatashin <pasha.tatashin@soleen.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Qian Cai <cai@lca.pw>
Cc: "Rafael J. Wysocki" <rafael@kernel.org>
Cc: Rich Felker <dalias@libc.org>
Cc: Rob Herring <robh@kernel.org>
Cc: Robin Murphy <robin.murphy@arm.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Yoshinori Sato <ysato@users.sourceforge.jp>
Cc: Yu Zhao <yuzhao@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/memory_hotplug.h | 2 +-
 mm/memory_hotplug.c            | 2 +-
 mm/sparse.c                    | 4 ++--
 3 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index 36c514b80cf1..79e0add6a597 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -350,7 +350,7 @@ extern void move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn,
 extern bool is_memblock_offlined(struct memory_block *mem);
 extern int sparse_add_one_section(int nid, unsigned long start_pfn,
 				  struct vmem_altmap *altmap);
-extern void sparse_remove_one_section(struct zone *zone, struct mem_section *ms,
+extern void sparse_remove_one_section(struct mem_section *ms,
 		unsigned long map_offset, struct vmem_altmap *altmap);
 extern struct page *sparse_decode_mem_map(unsigned long coded_mem_map,
 					  unsigned long pnum);
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 37c861e7a717..d1d0ceaaca88 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -524,7 +524,7 @@ static void __remove_section(struct zone *zone, struct mem_section *ms,
 	start_pfn = section_nr_to_pfn((unsigned long)scn_nr);
 	__remove_zone(zone, start_pfn);
 
-	sparse_remove_one_section(zone, ms, map_offset, altmap);
+	sparse_remove_one_section(ms, map_offset, altmap);
 }
 
 /**
diff --git a/mm/sparse.c b/mm/sparse.c
index d1d5e05f5b8d..1552c855d62a 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -800,8 +800,8 @@ static void free_section_usemap(struct page *memmap, unsigned long *usemap,
 		free_map_bootmem(memmap);
 }
 
-void sparse_remove_one_section(struct zone *zone, struct mem_section *ms,
-		unsigned long map_offset, struct vmem_altmap *altmap)
+void sparse_remove_one_section(struct mem_section *ms, unsigned long map_offset,
+			       struct vmem_altmap *altmap)
 {
 	struct page *memmap = NULL;
 	unsigned long *usemap = NULL;
-- 
cgit v1.2.3


From 43675e6fbbeadca90c6c5031557ff95e217e6d2f Mon Sep 17 00:00:00 2001
From: Yang Shi <yang.shi@linux.alibaba.com>
Date: Thu, 18 Jul 2019 15:57:24 -0700
Subject: mm: thp: make transhuge_vma_suitable available for anonymous THP

transhuge_vma_suitable() was only available for shmem THP, but anonymous
THP has the same check except for the pgoff check.  It will also be used
for the THP eligibility check in a later patch, so make it available for
all kinds of THP.  This also helps reduce code duplication slightly.

Since anonymous THP doesn't have to check pgoff, restrict the pgoff
check to non-anonymous (shmem) vmas.

Also regroup some functions in include/linux/mm.h to solve a compile
issue: transhuge_vma_suitable() needs to call vma_is_anonymous(), which
was defined after huge_mm.h is included.

[akpm@linux-foundation.org: fix typo]
[yang.shi@linux.alibaba.com: v4]
  Link: http://lkml.kernel.org/r/1563400758-124759-2-git-send-email-yang.shi@linux.alibaba.com
Link: http://lkml.kernel.org/r/1560401041-32207-2-git-send-email-yang.shi@linux.alibaba.com
Signed-off-by: Yang Shi <yang.shi@linux.alibaba.com>
Acked-by: Hugh Dickins <hughd@google.com>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: David Rientjes <rientjes@google.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/huge_mm.h | 23 +++++++++++++++++++++++
 include/linux/mm.h      | 34 +++++++++++++++++-----------------
 mm/huge_memory.c        |  2 +-
 mm/memory.c             | 13 -------------
 4 files changed, 41 insertions(+), 31 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 7cd5c150c21d..45ede62aa85b 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -121,6 +121,23 @@ static inline bool __transparent_hugepage_enabled(struct vm_area_struct *vma)
 
 bool transparent_hugepage_enabled(struct vm_area_struct *vma);
 
+#define HPAGE_CACHE_INDEX_MASK (HPAGE_PMD_NR - 1)
+
+static inline bool transhuge_vma_suitable(struct vm_area_struct *vma,
+		unsigned long haddr)
+{
+	/* Don't have to check pgoff for anonymous vma */
+	if (!vma_is_anonymous(vma)) {
+		if (((vma->vm_start >> PAGE_SHIFT) & HPAGE_CACHE_INDEX_MASK) !=
+			(vma->vm_pgoff & HPAGE_CACHE_INDEX_MASK))
+			return false;
+	}
+
+	if (haddr < vma->vm_start || haddr + HPAGE_PMD_SIZE > vma->vm_end)
+		return false;
+	return true;
+}
+
 #define transparent_hugepage_use_zero_page()				\
 	(transparent_hugepage_flags &					\
 	 (1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG))
@@ -271,6 +288,12 @@ static inline bool transparent_hugepage_enabled(struct vm_area_struct *vma)
 	return false;
 }
 
+static inline bool transhuge_vma_suitable(struct vm_area_struct *vma,
+		unsigned long haddr)
+{
+	return false;
+}
+
 static inline void prep_transhuge_page(struct page *page) {}
 
 #define transparent_hugepage_flags 0UL
diff --git a/include/linux/mm.h b/include/linux/mm.h
index bd6512559bed..48ab7b982d82 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -541,6 +541,23 @@ static inline void vma_set_anonymous(struct vm_area_struct *vma)
 	vma->vm_ops = NULL;
 }
 
+static inline bool vma_is_anonymous(struct vm_area_struct *vma)
+{
+	return !vma->vm_ops;
+}
+
+#ifdef CONFIG_SHMEM
+/*
+ * The vma_is_shmem is not inline because it is used only by slow
+ * paths in userfault.
+ */
+bool vma_is_shmem(struct vm_area_struct *vma);
+#else
+static inline bool vma_is_shmem(struct vm_area_struct *vma) { return false; }
+#endif
+
+int vma_is_stack_for_current(struct vm_area_struct *vma);
+
 /* flush_tlb_range() takes a vma, not a mm, and can care about flags */
 #define TLB_FLUSH_VMA(mm,flags) { .vm_mm = (mm), .vm_flags = (flags) }
 
@@ -1620,23 +1637,6 @@ int clear_page_dirty_for_io(struct page *page);
 
 int get_cmdline(struct task_struct *task, char *buffer, int buflen);
 
-static inline bool vma_is_anonymous(struct vm_area_struct *vma)
-{
-	return !vma->vm_ops;
-}
-
-#ifdef CONFIG_SHMEM
-/*
- * The vma_is_shmem is not inline because it is used only by slow
- * paths in userfault.
- */
-bool vma_is_shmem(struct vm_area_struct *vma);
-#else
-static inline bool vma_is_shmem(struct vm_area_struct *vma) { return false; }
-#endif
-
-int vma_is_stack_for_current(struct vm_area_struct *vma);
-
 extern unsigned long move_page_tables(struct vm_area_struct *vma,
 		unsigned long old_addr, struct vm_area_struct *new_vma,
 		unsigned long new_addr, unsigned long len,
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 885642c82aaa..782dd1446a3e 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -689,7 +689,7 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf)
 	struct page *page;
 	unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
 
-	if (haddr < vma->vm_start || haddr + HPAGE_PMD_SIZE > vma->vm_end)
+	if (!transhuge_vma_suitable(vma, haddr))
 		return VM_FAULT_FALLBACK;
 	if (unlikely(anon_vma_prepare(vma)))
 		return VM_FAULT_OOM;
diff --git a/mm/memory.c b/mm/memory.c
index 89325f9c6173..e2bb51b6242e 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3162,19 +3162,6 @@ map_pte:
 }
 
 #ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE
-
-#define HPAGE_CACHE_INDEX_MASK (HPAGE_PMD_NR - 1)
-static inline bool transhuge_vma_suitable(struct vm_area_struct *vma,
-		unsigned long haddr)
-{
-	if (((vma->vm_start >> PAGE_SHIFT) & HPAGE_CACHE_INDEX_MASK) !=
-			(vma->vm_pgoff & HPAGE_CACHE_INDEX_MASK))
-		return false;
-	if (haddr < vma->vm_start || haddr + HPAGE_PMD_SIZE > vma->vm_end)
-		return false;
-	return true;
-}
-
 static void deposit_prealloc_pte(struct vm_fault *vmf)
 {
 	struct vm_area_struct *vma = vmf->vma;
-- 
cgit v1.2.3


From 2491f0a2c0b117b9097e9c9eee0c21f2e5f716d7 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Thu, 18 Jul 2019 15:57:37 -0700
Subject: mm: section numbers use the type "unsigned long"

Patch series "mm: Further memory block device cleanups", v1.

Some further cleanups around memory block devices.  Especially, clean up
and simplify walk_memory_range().  Including some other minor cleanups.

This patch (of 6):

We are using a mixture of "int" and "unsigned long".  Let's make this
consistent by using "unsigned long" everywhere.  We'll do the same with
memory block ids next.

While at it, turn the "unsigned long i" in removable_show() into an int
- sections_per_block is an int.

[akpm@linux-foundation.org: s/unsigned long i/unsigned long nr/]
[david@redhat.com: v3]
  Link: http://lkml.kernel.org/r/20190620183139.4352-2-david@redhat.com
Link: http://lkml.kernel.org/r/20190614100114.311-2-david@redhat.com
Signed-off-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Andrew Morton <akpm@linux-foundation.org>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: "Rafael J. Wysocki" <rafael@kernel.org>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Wei Yang <richard.weiyang@gmail.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Arun KS <arunks@codeaurora.org>
Cc: Pavel Tatashin <pasha.tatashin@soleen.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Cc: Mike Rapoport <rppt@linux.vnet.ibm.com>
Cc: Baoquan He <bhe@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/base/memory.c  | 27 +++++++++++++--------------
 include/linux/mmzone.h |  4 ++--
 mm/sparse.c            | 12 ++++++------
 3 files changed, 21 insertions(+), 22 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/base/memory.c b/drivers/base/memory.c
index 826dd76f662e..5947b5a5686d 100644
--- a/drivers/base/memory.c
+++ b/drivers/base/memory.c
@@ -34,7 +34,7 @@ static DEFINE_MUTEX(mem_sysfs_mutex);
 
 static int sections_per_block;
 
-static inline int base_memory_block_id(int section_nr)
+static inline int base_memory_block_id(unsigned long section_nr)
 {
 	return section_nr / sections_per_block;
 }
@@ -131,9 +131,9 @@ static ssize_t phys_index_show(struct device *dev,
 static ssize_t removable_show(struct device *dev, struct device_attribute *attr,
 			      char *buf)
 {
-	unsigned long i, pfn;
-	int ret = 1;
 	struct memory_block *mem = to_memory_block(dev);
+	unsigned long pfn;
+	int ret = 1, i;
 
 	if (mem->state != MEM_ONLINE)
 		goto out;
@@ -691,15 +691,15 @@ static int init_memory_block(struct memory_block **memory, int block_id,
 	return ret;
 }
 
-static int add_memory_block(int base_section_nr)
+static int add_memory_block(unsigned long base_section_nr)
 {
+	int ret, section_count = 0;
 	struct memory_block *mem;
-	int i, ret, section_count = 0;
+	unsigned long nr;
 
-	for (i = base_section_nr;
-	     i < base_section_nr + sections_per_block;
-	     i++)
-		if (present_section_nr(i))
+	for (nr = base_section_nr; nr < base_section_nr + sections_per_block;
+	     nr++)
+		if (present_section_nr(nr))
 			section_count++;
 
 	if (section_count == 0)
@@ -822,10 +822,9 @@ static const struct attribute_group *memory_root_attr_groups[] = {
  */
 int __init memory_dev_init(void)
 {
-	unsigned int i;
 	int ret;
 	int err;
-	unsigned long block_sz;
+	unsigned long block_sz, nr;
 
 	ret = subsys_system_register(&memory_subsys, memory_root_attr_groups);
 	if (ret)
@@ -839,9 +838,9 @@ int __init memory_dev_init(void)
 	 * during boot and have been initialized
 	 */
 	mutex_lock(&mem_sysfs_mutex);
-	for (i = 0; i <= __highest_present_section_nr;
-		i += sections_per_block) {
-		err = add_memory_block(i);
+	for (nr = 0; nr <= __highest_present_section_nr;
+	     nr += sections_per_block) {
+		err = add_memory_block(nr);
 		if (!ret)
 			ret = err;
 	}
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 70394cabaf4e..298d1c3e4c2e 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -1219,7 +1219,7 @@ static inline struct mem_section *__nr_to_section(unsigned long nr)
 		return NULL;
 	return &mem_section[SECTION_NR_TO_ROOT(nr)][nr & SECTION_ROOT_MASK];
 }
-extern int __section_nr(struct mem_section* ms);
+extern unsigned long __section_nr(struct mem_section *ms);
 extern unsigned long usemap_size(void);
 
 /*
@@ -1291,7 +1291,7 @@ static inline struct mem_section *__pfn_to_section(unsigned long pfn)
 	return __nr_to_section(pfn_to_section_nr(pfn));
 }
 
-extern int __highest_present_section_nr;
+extern unsigned long __highest_present_section_nr;
 
 #ifndef CONFIG_HAVE_ARCH_PFN_VALID
 static inline int pfn_valid(unsigned long pfn)
diff --git a/mm/sparse.c b/mm/sparse.c
index fe44b2d3bd7e..b29534cea8c0 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -102,7 +102,7 @@ static inline int sparse_index_init(unsigned long section_nr, int nid)
 #endif
 
 #ifdef CONFIG_SPARSEMEM_EXTREME
-int __section_nr(struct mem_section* ms)
+unsigned long __section_nr(struct mem_section *ms)
 {
 	unsigned long root_nr;
 	struct mem_section *root = NULL;
@@ -121,9 +121,9 @@ int __section_nr(struct mem_section* ms)
 	return (root_nr * SECTIONS_PER_ROOT) + (ms - root);
 }
 #else
-int __section_nr(struct mem_section* ms)
+unsigned long __section_nr(struct mem_section *ms)
 {
-	return (int)(ms - mem_section[0]);
+	return (unsigned long)(ms - mem_section[0]);
 }
 #endif
 
@@ -178,10 +178,10 @@ void __meminit mminit_validate_memmodel_limits(unsigned long *start_pfn,
  * Keeping track of this gives us an easy way to break out of
  * those loops early.
  */
-int __highest_present_section_nr;
+unsigned long __highest_present_section_nr;
 static void section_mark_present(struct mem_section *ms)
 {
-	int section_nr = __section_nr(ms);
+	unsigned long section_nr = __section_nr(ms);
 
 	if (section_nr > __highest_present_section_nr)
 		__highest_present_section_nr = section_nr;
@@ -189,7 +189,7 @@ static void section_mark_present(struct mem_section *ms)
 	ms->section_mem_map |= SECTION_MARKED_PRESENT;
 }
 
-static inline int next_present_section_nr(int section_nr)
+static inline unsigned long next_present_section_nr(unsigned long section_nr)
 {
 	do {
 		section_nr++;
-- 
cgit v1.2.3


From 8d595c4c0f768f19db043d378b22e98405f9fd47 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Thu, 18 Jul 2019 15:57:43 -0700
Subject: mm: make register_mem_sect_under_node() static

It is only used internally.

Link: http://lkml.kernel.org/r/20190614100114.311-4-david@redhat.com
Signed-off-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Andrew Morton <akpm@linux-foundation.org>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: "Rafael J. Wysocki" <rafael@kernel.org>
Cc: Keith Busch <keith.busch@intel.com>
Cc: Oscar Salvador <osalvador@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/base/node.c  | 3 ++-
 include/linux/node.h | 7 -------
 2 files changed, 2 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/base/node.c b/drivers/base/node.c
index beec80649b33..27391f1e8f60 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -753,7 +753,8 @@ static int __ref get_nid_for_pfn(unsigned long pfn)
 }
 
 /* register memory section under specified node if it spans that node */
-int register_mem_sect_under_node(struct memory_block *mem_blk, void *arg)
+static int register_mem_sect_under_node(struct memory_block *mem_blk,
+					 void *arg)
 {
 	int ret, nid = *(int *)arg;
 	unsigned long pfn, sect_start_pfn, sect_end_pfn;
diff --git a/include/linux/node.h b/include/linux/node.h
index 548c226966a2..4866f32a02d8 100644
--- a/include/linux/node.h
+++ b/include/linux/node.h
@@ -137,8 +137,6 @@ static inline int register_one_node(int nid)
 extern void unregister_one_node(int nid);
 extern int register_cpu_under_node(unsigned int cpu, unsigned int nid);
 extern int unregister_cpu_under_node(unsigned int cpu, unsigned int nid);
-extern int register_mem_sect_under_node(struct memory_block *mem_blk,
-						void *arg);
 extern void unregister_memory_block_under_nodes(struct memory_block *mem_blk);
 
 extern int register_memory_node_under_compute_node(unsigned int mem_nid,
@@ -170,11 +168,6 @@ static inline int unregister_cpu_under_node(unsigned int cpu, unsigned int nid)
 {
 	return 0;
 }
-static inline int register_mem_sect_under_node(struct memory_block *mem_blk,
-							void *arg)
-{
-	return 0;
-}
 static inline void unregister_memory_block_under_nodes(struct memory_block *mem_blk)
 {
 }
-- 
cgit v1.2.3


From fbcf73ce65827c3d8935f38b832a43153a0c78d1 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Thu, 18 Jul 2019 15:57:46 -0700
Subject: mm/memory_hotplug: rename walk_memory_range() and pass start+size
 instead of pfns

walk_memory_range() was once used to iterate over sections.  Now, it
iterates over memory blocks.  Rename the function, fixup the
documentation.

Also, pass start+size instead of PFNs, which is what most callers
already have at hand.  (we'll rework link_mem_sections() most probably
soon)

Follow-up patches will rework, simplify, and move walk_memory_blocks()
to drivers/base/memory.c.

Note: walk_memory_blocks() only works correctly right now if the
start_pfn is aligned to a section start.  This is the case right now,
but we'll generalize the function in a follow up patch so the semantics
match the documentation.

[akpm@linux-foundation.org: remove unused variable]
Link: http://lkml.kernel.org/r/20190614100114.311-5-david@redhat.com
Signed-off-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Andrew Morton <akpm@linux-foundation.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: "Rafael J. Wysocki" <rjw@rjwysocki.net>
Cc: Len Brown <lenb@kernel.org>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: David Hildenbrand <david@redhat.com>
Cc: Rashmica Gupta <rashmica.g@gmail.com>
Cc: Pavel Tatashin <pasha.tatashin@soleen.com>
Cc: Anshuman Khandual <anshuman.khandual@arm.com>
Cc: Michael Neuling <mikey@neuling.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Wei Yang <richard.weiyang@gmail.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Qian Cai <cai@lca.pw>
Cc: Arun KS <arunks@codeaurora.org>
Cc: Nick Desaulniers <ndesaulniers@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/powerpc/platforms/powernv/memtrace.c | 23 +++++++++++------------
 drivers/acpi/acpi_memhotplug.c            | 19 ++++---------------
 drivers/base/node.c                       |  5 +++--
 include/linux/memory_hotplug.h            |  2 +-
 mm/memory_hotplug.c                       | 24 +++++++++++++-----------
 5 files changed, 32 insertions(+), 41 deletions(-)

(limited to 'include/linux')

diff --git a/arch/powerpc/platforms/powernv/memtrace.c b/arch/powerpc/platforms/powernv/memtrace.c
index 5e53c1392d3b..eb2e75dac369 100644
--- a/arch/powerpc/platforms/powernv/memtrace.c
+++ b/arch/powerpc/platforms/powernv/memtrace.c
@@ -70,23 +70,23 @@ static int change_memblock_state(struct memory_block *mem, void *arg)
 /* called with device_hotplug_lock held */
 static bool memtrace_offline_pages(u32 nid, u64 start_pfn, u64 nr_pages)
 {
-	u64 end_pfn = start_pfn + nr_pages - 1;
+	const unsigned long start = PFN_PHYS(start_pfn);
+	const unsigned long size = PFN_PHYS(nr_pages);
 
-	if (walk_memory_range(start_pfn, end_pfn, NULL,
-	    check_memblock_online))
+	if (walk_memory_blocks(start, size, NULL, check_memblock_online))
 		return false;
 
-	walk_memory_range(start_pfn, end_pfn, (void *)MEM_GOING_OFFLINE,
-			  change_memblock_state);
+	walk_memory_blocks(start, size, (void *)MEM_GOING_OFFLINE,
+			   change_memblock_state);
 
 	if (offline_pages(start_pfn, nr_pages)) {
-		walk_memory_range(start_pfn, end_pfn, (void *)MEM_ONLINE,
-				  change_memblock_state);
+		walk_memory_blocks(start, size, (void *)MEM_ONLINE,
+				   change_memblock_state);
 		return false;
 	}
 
-	walk_memory_range(start_pfn, end_pfn, (void *)MEM_OFFLINE,
-			  change_memblock_state);
+	walk_memory_blocks(start, size, (void *)MEM_OFFLINE,
+			   change_memblock_state);
 
 
 	return true;
@@ -242,9 +242,8 @@ static int memtrace_online(void)
 		 */
 		if (!memhp_auto_online) {
 			lock_device_hotplug();
-			walk_memory_range(PFN_DOWN(ent->start),
-					  PFN_UP(ent->start + ent->size - 1),
-					  NULL, online_mem_block);
+			walk_memory_blocks(ent->start, ent->size, NULL,
+					   online_mem_block);
 			unlock_device_hotplug();
 		}
 
diff --git a/drivers/acpi/acpi_memhotplug.c b/drivers/acpi/acpi_memhotplug.c
index db013dc21c02..e294f44a7850 100644
--- a/drivers/acpi/acpi_memhotplug.c
+++ b/drivers/acpi/acpi_memhotplug.c
@@ -155,16 +155,6 @@ static int acpi_memory_check_device(struct acpi_memory_device *mem_device)
 	return 0;
 }
 
-static unsigned long acpi_meminfo_start_pfn(struct acpi_memory_info *info)
-{
-	return PFN_DOWN(info->start_addr);
-}
-
-static unsigned long acpi_meminfo_end_pfn(struct acpi_memory_info *info)
-{
-	return PFN_UP(info->start_addr + info->length-1);
-}
-
 static int acpi_bind_memblk(struct memory_block *mem, void *arg)
 {
 	return acpi_bind_one(&mem->dev, arg);
@@ -173,9 +163,8 @@ static int acpi_bind_memblk(struct memory_block *mem, void *arg)
 static int acpi_bind_memory_blocks(struct acpi_memory_info *info,
 				   struct acpi_device *adev)
 {
-	return walk_memory_range(acpi_meminfo_start_pfn(info),
-				 acpi_meminfo_end_pfn(info), adev,
-				 acpi_bind_memblk);
+	return walk_memory_blocks(info->start_addr, info->length, adev,
+				  acpi_bind_memblk);
 }
 
 static int acpi_unbind_memblk(struct memory_block *mem, void *arg)
@@ -186,8 +175,8 @@ static int acpi_unbind_memblk(struct memory_block *mem, void *arg)
 
 static void acpi_unbind_memory_blocks(struct acpi_memory_info *info)
 {
-	walk_memory_range(acpi_meminfo_start_pfn(info),
-			  acpi_meminfo_end_pfn(info), NULL, acpi_unbind_memblk);
+	walk_memory_blocks(info->start_addr, info->length, NULL,
+			   acpi_unbind_memblk);
 }
 
 static int acpi_memory_enable_device(struct acpi_memory_device *mem_device)
diff --git a/drivers/base/node.c b/drivers/base/node.c
index 27391f1e8f60..75b7e6f6535b 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -834,8 +834,9 @@ void unregister_memory_block_under_nodes(struct memory_block *mem_blk)
 
 int link_mem_sections(int nid, unsigned long start_pfn, unsigned long end_pfn)
 {
-	return walk_memory_range(start_pfn, end_pfn, (void *)&nid,
-					register_mem_sect_under_node);
+	return walk_memory_blocks(PFN_PHYS(start_pfn),
+				  PFN_PHYS(end_pfn - start_pfn), (void *)&nid,
+				  register_mem_sect_under_node);
 }
 
 #ifdef CONFIG_HUGETLBFS
diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index 79e0add6a597..d9fffc34949f 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -340,7 +340,7 @@ static inline void __remove_memory(int nid, u64 start, u64 size) {}
 #endif /* CONFIG_MEMORY_HOTREMOVE */
 
 extern void __ref free_area_init_core_hotplug(int nid);
-extern int walk_memory_range(unsigned long start_pfn, unsigned long end_pfn,
+extern int walk_memory_blocks(unsigned long start, unsigned long size,
 		void *arg, int (*func)(struct memory_block *, void *));
 extern int __add_memory(int nid, u64 start, u64 size);
 extern int add_memory(int nid, u64 start, u64 size);
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index d1d0ceaaca88..b3ef84e408fa 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1124,8 +1124,7 @@ int __ref add_memory_resource(int nid, struct resource *res)
 
 	/* online pages if requested */
 	if (memhp_auto_online)
-		walk_memory_range(PFN_DOWN(start), PFN_UP(start + size - 1),
-				  NULL, online_memory_block);
+		walk_memory_blocks(start, size, NULL, online_memory_block);
 
 	return ret;
 error:
@@ -1663,20 +1662,24 @@ int offline_pages(unsigned long start_pfn, unsigned long nr_pages)
 #endif /* CONFIG_MEMORY_HOTREMOVE */
 
 /**
- * walk_memory_range - walks through all mem sections in [start_pfn, end_pfn)
- * @start_pfn: start pfn of the memory range
- * @end_pfn: end pfn of the memory range
+ * walk_memory_blocks - walk through all present memory blocks overlapped
+ *			by the range [start, start + size)
+ *
+ * @start: start address of the memory range
+ * @size: size of the memory range
  * @arg: argument passed to func
- * @func: callback for each memory section walked
+ * @func: callback for each memory block walked
  *
- * This function walks through all present mem sections in range
- * [start_pfn, end_pfn) and call func on each mem section.
+ * This function walks through all present memory blocks overlapped by the
+ * range [start, start + size), calling func on each memory block.
  *
  * Returns the return value of func.
  */
-int walk_memory_range(unsigned long start_pfn, unsigned long end_pfn,
+int walk_memory_blocks(unsigned long start, unsigned long size,
 		void *arg, int (*func)(struct memory_block *, void *))
 {
+	const unsigned long start_pfn = PFN_DOWN(start);
+	const unsigned long end_pfn = PFN_UP(start + size - 1);
 	struct memory_block *mem = NULL;
 	struct mem_section *section;
 	unsigned long pfn, section_nr;
@@ -1822,8 +1825,7 @@ static int __ref try_remove_memory(int nid, u64 start, u64 size)
 	 * whether all memory blocks in question are offline and return error
 	 * if this is not the case.
 	 */
-	rc = walk_memory_range(PFN_DOWN(start), PFN_UP(start + size - 1), NULL,
-			       check_memblock_offlined_cb);
+	rc = walk_memory_blocks(start, size, NULL, check_memblock_offlined_cb);
 	if (rc)
 		goto done;
 
-- 
cgit v1.2.3


From ea8846411ad686ff626e00bb2c3821b3db2ab56a Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Thu, 18 Jul 2019 15:57:50 -0700
Subject: mm/memory_hotplug: move and simplify walk_memory_blocks()

Let's move walk_memory_blocks() to the place where memory block logic
resides and simplify it.  While at it, add a type for the callback
function.

Link: http://lkml.kernel.org/r/20190614100114.311-6-david@redhat.com
Signed-off-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Andrew Morton <akpm@linux-foundation.org>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: "Rafael J. Wysocki" <rafael@kernel.org>
Cc: David Hildenbrand <david@redhat.com>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Cc: Pavel Tatashin <pasha.tatashin@soleen.com>
Cc: Andrew Banman <andrew.banman@hpe.com>
Cc: Mike Travis <mike.travis@hpe.com>
Cc: Oscar Salvador <osalvador@suse.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Wei Yang <richard.weiyang@gmail.com>
Cc: Arun KS <arunks@codeaurora.org>
Cc: Qian Cai <cai@lca.pw>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/base/memory.c          | 42 ++++++++++++++++++++++++++++++++
 include/linux/memory.h         |  3 +++
 include/linux/memory_hotplug.h |  2 --
 mm/memory_hotplug.c            | 55 ------------------------------------------
 4 files changed, 45 insertions(+), 57 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/base/memory.c b/drivers/base/memory.c
index c54e80fd25a8..0204384b4d1d 100644
--- a/drivers/base/memory.c
+++ b/drivers/base/memory.c
@@ -44,6 +44,11 @@ static inline unsigned long pfn_to_block_id(unsigned long pfn)
 	return base_memory_block_id(pfn_to_section_nr(pfn));
 }
 
+static inline unsigned long phys_to_block_id(unsigned long phys)
+{
+	return pfn_to_block_id(PFN_DOWN(phys));
+}
+
 static int memory_subsys_online(struct device *dev);
 static int memory_subsys_offline(struct device *dev);
 
@@ -851,3 +856,40 @@ out:
 		printk(KERN_ERR "%s() failed: %d\n", __func__, ret);
 	return ret;
 }
+
+/**
+ * walk_memory_blocks - walk through all present memory blocks overlapped
+ *			by the range [start, start + size)
+ *
+ * @start: start address of the memory range
+ * @size: size of the memory range
+ * @arg: argument passed to func
+ * @func: callback for each memory block walked
+ *
+ * This function walks through all present memory blocks overlapped by the
+ * range [start, start + size), calling func on each memory block.
+ *
+ * In case func() returns an error, walking is aborted and the error is
+ * returned.
+ */
+int walk_memory_blocks(unsigned long start, unsigned long size,
+		       void *arg, walk_memory_blocks_func_t func)
+{
+	const unsigned long start_block_id = phys_to_block_id(start);
+	const unsigned long end_block_id = phys_to_block_id(start + size - 1);
+	struct memory_block *mem;
+	unsigned long block_id;
+	int ret = 0;
+
+	for (block_id = start_block_id; block_id <= end_block_id; block_id++) {
+		mem = find_memory_block_by_id(block_id, NULL);
+		if (!mem)
+			continue;
+
+		ret = func(mem, arg);
+		put_device(&mem->dev);
+		if (ret)
+			break;
+	}
+	return ret;
+}
diff --git a/include/linux/memory.h b/include/linux/memory.h
index f26a5417ec5d..b3b388775a30 100644
--- a/include/linux/memory.h
+++ b/include/linux/memory.h
@@ -119,6 +119,9 @@ extern int memory_isolate_notify(unsigned long val, void *v);
 extern struct memory_block *find_memory_block_hinted(struct mem_section *,
 							struct memory_block *);
 extern struct memory_block *find_memory_block(struct mem_section *);
+typedef int (*walk_memory_blocks_func_t)(struct memory_block *, void *);
+extern int walk_memory_blocks(unsigned long start, unsigned long size,
+			      void *arg, walk_memory_blocks_func_t func);
 #define CONFIG_MEM_BLOCK_SIZE	(PAGES_PER_SECTION<<PAGE_SHIFT)
 #endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */
 
diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index d9fffc34949f..475aff8efbf8 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -340,8 +340,6 @@ static inline void __remove_memory(int nid, u64 start, u64 size) {}
 #endif /* CONFIG_MEMORY_HOTREMOVE */
 
 extern void __ref free_area_init_core_hotplug(int nid);
-extern int walk_memory_blocks(unsigned long start, unsigned long size,
-		void *arg, int (*func)(struct memory_block *, void *));
 extern int __add_memory(int nid, u64 start, u64 size);
 extern int add_memory(int nid, u64 start, u64 size);
 extern int add_memory_resource(int nid, struct resource *resource);
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index b3ef84e408fa..fafee5f13ef2 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1659,62 +1659,7 @@ int offline_pages(unsigned long start_pfn, unsigned long nr_pages)
 {
 	return __offline_pages(start_pfn, start_pfn + nr_pages);
 }
-#endif /* CONFIG_MEMORY_HOTREMOVE */
 
-/**
- * walk_memory_blocks - walk through all present memory blocks overlapped
- *			by the range [start, start + size)
- *
- * @start: start address of the memory range
- * @size: size of the memory range
- * @arg: argument passed to func
- * @func: callback for each memory block walked
- *
- * This function walks through all present memory blocks overlapped by the
- * range [start, start + size), calling func on each memory block.
- *
- * Returns the return value of func.
- */
-int walk_memory_blocks(unsigned long start, unsigned long size,
-		void *arg, int (*func)(struct memory_block *, void *))
-{
-	const unsigned long start_pfn = PFN_DOWN(start);
-	const unsigned long end_pfn = PFN_UP(start + size - 1);
-	struct memory_block *mem = NULL;
-	struct mem_section *section;
-	unsigned long pfn, section_nr;
-	int ret;
-
-	for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
-		section_nr = pfn_to_section_nr(pfn);
-		if (!present_section_nr(section_nr))
-			continue;
-
-		section = __nr_to_section(section_nr);
-		/* same memblock? */
-		if (mem)
-			if ((section_nr >= mem->start_section_nr) &&
-			    (section_nr <= mem->end_section_nr))
-				continue;
-
-		mem = find_memory_block_hinted(section, mem);
-		if (!mem)
-			continue;
-
-		ret = func(mem, arg);
-		if (ret) {
-			kobject_put(&mem->dev.kobj);
-			return ret;
-		}
-	}
-
-	if (mem)
-		kobject_put(&mem->dev.kobj);
-
-	return 0;
-}
-
-#ifdef CONFIG_MEMORY_HOTREMOVE
 static int check_memblock_offlined_cb(struct memory_block *mem, void *arg)
 {
 	int ret = !is_memblock_offlined(mem);
-- 
cgit v1.2.3


From dd625285910d3cff535fa76355e49949513918a4 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Thu, 18 Jul 2019 15:57:53 -0700
Subject: drivers/base/memory.c: get rid of find_memory_block_hinted()

No longer needed, let's remove it.  Also, drop the "hint" parameter
completely from "find_memory_block_by_id", as nobody needs it anymore.

[david@redhat.com: v3]
  Link: http://lkml.kernel.org/r/20190620183139.4352-7-david@redhat.com
[david@redhat.com: handle zero-length walks]
  Link: http://lkml.kernel.org/r/1c2edc22-afd7-2211-c4c7-40e54e5007e8@redhat.com
Link: http://lkml.kernel.org/r/20190614100114.311-7-david@redhat.com
Signed-off-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Andrew Morton <akpm@linux-foundation.org>
Tested-by: Qian Cai <cai@lca.pw>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: "Rafael J. Wysocki" <rafael@kernel.org>
Cc: David Hildenbrand <david@redhat.com>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Cc: Pavel Tatashin <pasha.tatashin@soleen.com>
Cc: Andrew Banman <andrew.banman@hpe.com>
Cc: Mike Travis <mike.travis@hpe.com>
Cc: Oscar Salvador <osalvador@suse.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Wei Yang <richard.weiyang@gmail.com>
Cc: Arun KS <arunks@codeaurora.org>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/base/memory.c  | 40 ++++++++++++++--------------------------
 include/linux/memory.h |  2 --
 2 files changed, 14 insertions(+), 28 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/base/memory.c b/drivers/base/memory.c
index 0204384b4d1d..20c39d1bcef8 100644
--- a/drivers/base/memory.c
+++ b/drivers/base/memory.c
@@ -588,30 +588,13 @@ int __weak arch_get_memory_phys_device(unsigned long start_pfn)
 	return 0;
 }
 
-/*
- * A reference for the returned object is held and the reference for the
- * hinted object is released.
- */
-static struct memory_block *find_memory_block_by_id(unsigned long block_id,
-						    struct memory_block *hint)
+/* A reference for the returned memory block device is acquired. */
+static struct memory_block *find_memory_block_by_id(unsigned long block_id)
 {
-	struct device *hintdev = hint ? &hint->dev : NULL;
 	struct device *dev;
 
-	dev = subsys_find_device_by_id(&memory_subsys, block_id, hintdev);
-	if (hint)
-		put_device(&hint->dev);
-	if (!dev)
-		return NULL;
-	return to_memory_block(dev);
-}
-
-struct memory_block *find_memory_block_hinted(struct mem_section *section,
-					      struct memory_block *hint)
-{
-	unsigned long block_id = base_memory_block_id(__section_nr(section));
-
-	return find_memory_block_by_id(block_id, hint);
+	dev = subsys_find_device_by_id(&memory_subsys, block_id, NULL);
+	return dev ? to_memory_block(dev) : NULL;
 }
 
 /*
@@ -624,7 +607,9 @@ struct memory_block *find_memory_block_hinted(struct mem_section *section,
  */
 struct memory_block *find_memory_block(struct mem_section *section)
 {
-	return find_memory_block_hinted(section, NULL);
+	unsigned long block_id = base_memory_block_id(__section_nr(section));
+
+	return find_memory_block_by_id(block_id);
 }
 
 static struct attribute *memory_memblk_attrs[] = {
@@ -675,7 +660,7 @@ static int init_memory_block(struct memory_block **memory,
 	unsigned long start_pfn;
 	int ret = 0;
 
-	mem = find_memory_block_by_id(block_id, NULL);
+	mem = find_memory_block_by_id(block_id);
 	if (mem) {
 		put_device(&mem->dev);
 		return -EEXIST;
@@ -755,7 +740,7 @@ int create_memory_block_devices(unsigned long start, unsigned long size)
 		end_block_id = block_id;
 		for (block_id = start_block_id; block_id != end_block_id;
 		     block_id++) {
-			mem = find_memory_block_by_id(block_id, NULL);
+			mem = find_memory_block_by_id(block_id);
 			mem->section_count = 0;
 			unregister_memory(mem);
 		}
@@ -782,7 +767,7 @@ void remove_memory_block_devices(unsigned long start, unsigned long size)
 
 	mutex_lock(&mem_sysfs_mutex);
 	for (block_id = start_block_id; block_id != end_block_id; block_id++) {
-		mem = find_memory_block_by_id(block_id, NULL);
+		mem = find_memory_block_by_id(block_id);
 		if (WARN_ON_ONCE(!mem))
 			continue;
 		mem->section_count = 0;
@@ -881,8 +866,11 @@ int walk_memory_blocks(unsigned long start, unsigned long size,
 	unsigned long block_id;
 	int ret = 0;
 
+	if (!size)
+		return 0;
+
 	for (block_id = start_block_id; block_id <= end_block_id; block_id++) {
-		mem = find_memory_block_by_id(block_id, NULL);
+		mem = find_memory_block_by_id(block_id);
 		if (!mem)
 			continue;
 
diff --git a/include/linux/memory.h b/include/linux/memory.h
index b3b388775a30..02e633f3ede0 100644
--- a/include/linux/memory.h
+++ b/include/linux/memory.h
@@ -116,8 +116,6 @@ void remove_memory_block_devices(unsigned long start, unsigned long size);
 extern int memory_dev_init(void);
 extern int memory_notify(unsigned long val, void *v);
 extern int memory_isolate_notify(unsigned long val, void *v);
-extern struct memory_block *find_memory_block_hinted(struct mem_section *,
-							struct memory_block *);
 extern struct memory_block *find_memory_block(struct mem_section *);
 typedef int (*walk_memory_blocks_func_t)(struct memory_block *, void *);
 extern int walk_memory_blocks(unsigned long start, unsigned long size,
-- 
cgit v1.2.3


From f1eca35a0dc7cb3cdb00c88c8c5e5138a65face0 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Thu, 18 Jul 2019 15:57:57 -0700
Subject: mm/sparsemem: introduce struct mem_section_usage
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Patch series "mm: Sub-section memory hotplug support", v10.

The memory hotplug section is an arbitrary / convenient unit for memory
hotplug.  'Section-size' units have bled into the user interface
('memblock' sysfs) and can not be changed without breaking existing
userspace.  The section-size constraint, while mostly benign for typical
memory hotplug, has and continues to wreak havoc with 'device-memory'
use cases, persistent memory (pmem) in particular.  Recall that pmem
uses devm_memremap_pages(), and subsequently arch_add_memory(), to
allocate a 'struct page' memmap for pmem.  However, it does not use the
'bottom half' of memory hotplug, i.e.  never marks pmem pages online and
never exposes the userspace memblock interface for pmem.  This leaves an
opening to redress the section-size constraint.

To date, the libnvdimm subsystem has attempted to inject padding to
satisfy the internal constraints of arch_add_memory().  Beyond
complicating the code, leading to bugs [2], wasting memory, and limiting
configuration flexibility, the padding hack is broken when the platform
changes the physical memory alignment of pmem from one boot to the
next.  Device failure (intermittent or permanent) and physical
reconfiguration are events that can cause the platform firmware to
change the physical placement of pmem on a subsequent boot, and device
failure is an everyday event in a data-center.

It turns out that sections are only a hard requirement of the
user-facing interface for memory hotplug and with a bit more
infrastructure sub-section arch_add_memory() support can be added for
kernel internal usages like devm_memremap_pages().  Here is an analysis
of the current design assumptions in the current code and how they are
addressed in the new implementation:

Current design assumptions:

 - Sections that describe boot memory (early sections) are never
   unplugged / removed.

 - pfn_valid(), in the CONFIG_SPARSEMEM_VMEMMAP=y case, devolves to a
   valid_section() check

 - __add_pages() and helper routines assume all operations occur in
   PAGES_PER_SECTION units.

 - The memblock sysfs interface only comprehends full sections

New design assumptions:

 - Sections are instrumented with a sub-section bitmask to track (on
   x86) individual 2MB sub-divisions of a 128MB section.

 - Partially populated early sections can be extended with additional
   sub-sections, and those sub-sections can be removed with
   arch_remove_memory(). With this in place we no longer lose usable
   memory capacity to padding.

 - pfn_valid() is updated to look deeper than valid_section() to also
   check the active-sub-section mask. This indication is in the same
   cacheline as the valid_section() so the performance impact is
   expected to be negligible. So far the lkp robot has not reported any
   regressions.

 - Outside of the core vmemmap population routines which are replaced,
   other helper routines like shrink_{zone,pgdat}_span() are updated to
   handle the smaller granularity. Core memory hotplug routines that
   deal with online memory are not touched.

 - The existing memblock sysfs user api guarantees / assumptions are not
   touched since this capability is limited to !online
   !memblock-sysfs-accessible sections.

Meanwhile the issue reports continue to roll in from users that do not
understand when and how the 128MB constraint will bite them.  The current
implementation relied on being able to support at least one misaligned
namespace, but that immediately falls over on any moderately complex
namespace creation attempt.  Beyond the initial problem of 'System RAM'
colliding with pmem, and the unsolvable problem of physical alignment
changes, Linux is now being exposed to platforms that collide pmem ranges
with other pmem ranges by default [3].  In short, devm_memremap_pages()
has pushed the venerable section-size constraint past the breaking point,
and the simplicity of section-aligned arch_add_memory() is no longer
tenable.

These patches are exposed to the kbuild robot on a subsection-v10 branch
[4], and a preview of the unit test for this functionality is available
on the 'subsection-pending' branch of ndctl [5].

[2]: https://lore.kernel.org/r/155000671719.348031.2347363160141119237.stgit@dwillia2-desk3.amr.corp.intel.com
[3]: https://github.com/pmem/ndctl/issues/76
[4]: https://git.kernel.org/pub/scm/linux/kernel/git/djbw/nvdimm.git/log/?h=subsection-v10
[5]: https://github.com/pmem/ndctl/commit/7c59b4867e1c

This patch (of 13):

Towards enabling memory hotplug to track partial population of a section,
introduce 'struct mem_section_usage'.

A pointer to a 'struct mem_section_usage' instance replaces the existing
pointer to a 'pageblock_flags' bitmap.  Effectively it adds one more
'unsigned long' beyond the 'pageblock_flags' (usemap) allocation to house
a new 'subsection_map' bitmap.  The new bitmap enables the memory
hot{plug,remove} implementation to act on incremental sub-divisions of a
section.

SUBSECTION_SHIFT is defined as a global constant instead of a
per-architecture value like SECTION_SIZE_BITS in order to allow cross-arch
compatibility of subsection users.  Specifically, a common subsection size
allows for the possibility that persistent memory namespace configurations
be made compatible across architectures.

The primary motivation for this functionality is to support platforms that
mix "System RAM" and "Persistent Memory" within a single section, or
multiple PMEM ranges with different mapping lifetimes within a single
section.  The section restriction for hotplug has caused an ongoing saga
of hacks and bugs for devm_memremap_pages() users.

Beyond the fixups to teach existing paths how to retrieve the 'usemap'
from a section, and updates to usemap allocation path, there are no
expected behavior changes.

Link: http://lkml.kernel.org/r/156092349845.979959.73333291612799019.stgit@dwillia2-desk3.amr.corp.intel.com
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Reviewed-by: Oscar Salvador <osalvador@suse.de>
Reviewed-by: Wei Yang <richardw.yang@linux.intel.com>
Tested-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>	[ppc64]
Cc: Michal Hocko <mhocko@suse.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Logan Gunthorpe <logang@deltatee.com>
Cc: Pavel Tatashin <pasha.tatashin@soleen.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Jérôme Glisse <jglisse@redhat.com>
Cc: Mike Rapoport <rppt@linux.ibm.com>
Cc: Jane Chu <jane.chu@oracle.com>
Cc: Pavel Tatashin <pasha.tatashin@soleen.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Qian Cai <cai@lca.pw>
Cc: Logan Gunthorpe <logang@deltatee.com>
Cc: Toshi Kani <toshi.kani@hpe.com>
Cc: Jeff Moyer <jmoyer@redhat.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Jason Gunthorpe <jgg@mellanox.com>
Cc: Christoph Hellwig <hch@lst.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mmzone.h | 28 +++++++++++++++--
 mm/memory_hotplug.c    | 18 ++++++-----
 mm/page_alloc.c        |  2 +-
 mm/sparse.c            | 81 +++++++++++++++++++++++++-------------------------
 4 files changed, 76 insertions(+), 53 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 298d1c3e4c2e..2520336bdfd1 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -1160,6 +1160,24 @@ static inline unsigned long section_nr_to_pfn(unsigned long sec)
 #define SECTION_ALIGN_UP(pfn)	(((pfn) + PAGES_PER_SECTION - 1) & PAGE_SECTION_MASK)
 #define SECTION_ALIGN_DOWN(pfn)	((pfn) & PAGE_SECTION_MASK)
 
+#define SUBSECTION_SHIFT 21
+
+#define PFN_SUBSECTION_SHIFT (SUBSECTION_SHIFT - PAGE_SHIFT)
+#define PAGES_PER_SUBSECTION (1UL << PFN_SUBSECTION_SHIFT)
+#define PAGE_SUBSECTION_MASK (~(PAGES_PER_SUBSECTION-1))
+
+#if SUBSECTION_SHIFT > SECTION_SIZE_BITS
+#error Subsection size exceeds section size
+#else
+#define SUBSECTIONS_PER_SECTION (1UL << (SECTION_SIZE_BITS - SUBSECTION_SHIFT))
+#endif
+
+struct mem_section_usage {
+	DECLARE_BITMAP(subsection_map, SUBSECTIONS_PER_SECTION);
+	/* See declaration of similar field in struct zone */
+	unsigned long pageblock_flags[0];
+};
+
 struct page;
 struct page_ext;
 struct mem_section {
@@ -1177,8 +1195,7 @@ struct mem_section {
 	 */
 	unsigned long section_mem_map;
 
-	/* See declaration of similar field in struct zone */
-	unsigned long *pageblock_flags;
+	struct mem_section_usage *usage;
 #ifdef CONFIG_PAGE_EXTENSION
 	/*
 	 * If SPARSEMEM, pgdat doesn't have page_ext pointer. We use
@@ -1209,6 +1226,11 @@ extern struct mem_section **mem_section;
 extern struct mem_section mem_section[NR_SECTION_ROOTS][SECTIONS_PER_ROOT];
 #endif
 
+static inline unsigned long *section_to_usemap(struct mem_section *ms)
+{
+	return ms->usage->pageblock_flags;
+}
+
 static inline struct mem_section *__nr_to_section(unsigned long nr)
 {
 #ifdef CONFIG_SPARSEMEM_EXTREME
@@ -1220,7 +1242,7 @@ static inline struct mem_section *__nr_to_section(unsigned long nr)
 	return &mem_section[SECTION_NR_TO_ROOT(nr)][nr & SECTION_ROOT_MASK];
 }
 extern unsigned long __section_nr(struct mem_section *ms);
-extern unsigned long usemap_size(void);
+extern size_t mem_section_usage_size(void);
 
 /*
  * We use the lower bits of the mem_map pointer to store
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index fafee5f13ef2..cf9d979a6498 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -166,9 +166,10 @@ void put_page_bootmem(struct page *page)
 #ifndef CONFIG_SPARSEMEM_VMEMMAP
 static void register_page_bootmem_info_section(unsigned long start_pfn)
 {
-	unsigned long *usemap, mapsize, section_nr, i;
+	unsigned long mapsize, section_nr, i;
 	struct mem_section *ms;
 	struct page *page, *memmap;
+	struct mem_section_usage *usage;
 
 	section_nr = pfn_to_section_nr(start_pfn);
 	ms = __nr_to_section(section_nr);
@@ -188,10 +189,10 @@ static void register_page_bootmem_info_section(unsigned long start_pfn)
 	for (i = 0; i < mapsize; i++, page++)
 		get_page_bootmem(section_nr, page, SECTION_INFO);
 
-	usemap = ms->pageblock_flags;
-	page = virt_to_page(usemap);
+	usage = ms->usage;
+	page = virt_to_page(usage);
 
-	mapsize = PAGE_ALIGN(usemap_size()) >> PAGE_SHIFT;
+	mapsize = PAGE_ALIGN(mem_section_usage_size()) >> PAGE_SHIFT;
 
 	for (i = 0; i < mapsize; i++, page++)
 		get_page_bootmem(section_nr, page, MIX_SECTION_INFO);
@@ -200,9 +201,10 @@ static void register_page_bootmem_info_section(unsigned long start_pfn)
 #else /* CONFIG_SPARSEMEM_VMEMMAP */
 static void register_page_bootmem_info_section(unsigned long start_pfn)
 {
-	unsigned long *usemap, mapsize, section_nr, i;
+	unsigned long mapsize, section_nr, i;
 	struct mem_section *ms;
 	struct page *page, *memmap;
+	struct mem_section_usage *usage;
 
 	section_nr = pfn_to_section_nr(start_pfn);
 	ms = __nr_to_section(section_nr);
@@ -211,10 +213,10 @@ static void register_page_bootmem_info_section(unsigned long start_pfn)
 
 	register_page_bootmem_memmap(section_nr, memmap, PAGES_PER_SECTION);
 
-	usemap = ms->pageblock_flags;
-	page = virt_to_page(usemap);
+	usage = ms->usage;
+	page = virt_to_page(usage);
 
-	mapsize = PAGE_ALIGN(usemap_size()) >> PAGE_SHIFT;
+	mapsize = PAGE_ALIGN(mem_section_usage_size()) >> PAGE_SHIFT;
 
 	for (i = 0; i < mapsize; i++, page++)
 		get_page_bootmem(section_nr, page, MIX_SECTION_INFO);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index e515bfcf7f28..be78bafbfe3a 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -450,7 +450,7 @@ static inline unsigned long *get_pageblock_bitmap(struct page *page,
 							unsigned long pfn)
 {
 #ifdef CONFIG_SPARSEMEM
-	return __pfn_to_section(pfn)->pageblock_flags;
+	return section_to_usemap(__pfn_to_section(pfn));
 #else
 	return page_zone(page)->pageblock_flags;
 #endif /* CONFIG_SPARSEMEM */
diff --git a/mm/sparse.c b/mm/sparse.c
index b29534cea8c0..41bef8e1f65c 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -288,33 +288,31 @@ struct page *sparse_decode_mem_map(unsigned long coded_mem_map, unsigned long pn
 
 static void __meminit sparse_init_one_section(struct mem_section *ms,
 		unsigned long pnum, struct page *mem_map,
-		unsigned long *pageblock_bitmap)
+		struct mem_section_usage *usage)
 {
 	ms->section_mem_map &= ~SECTION_MAP_MASK;
 	ms->section_mem_map |= sparse_encode_mem_map(mem_map, pnum) |
 							SECTION_HAS_MEM_MAP;
- 	ms->pageblock_flags = pageblock_bitmap;
+	ms->usage = usage;
 }
 
-unsigned long usemap_size(void)
+static unsigned long usemap_size(void)
 {
 	return BITS_TO_LONGS(SECTION_BLOCKFLAGS_BITS) * sizeof(unsigned long);
 }
 
-#ifdef CONFIG_MEMORY_HOTPLUG
-static unsigned long *__kmalloc_section_usemap(void)
+size_t mem_section_usage_size(void)
 {
-	return kmalloc(usemap_size(), GFP_KERNEL);
+	return sizeof(struct mem_section_usage) + usemap_size();
 }
-#endif /* CONFIG_MEMORY_HOTPLUG */
 
 #ifdef CONFIG_MEMORY_HOTREMOVE
-static unsigned long * __init
+static struct mem_section_usage * __init
 sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat,
 					 unsigned long size)
 {
+	struct mem_section_usage *usage;
 	unsigned long goal, limit;
-	unsigned long *p;
 	int nid;
 	/*
 	 * A page may contain usemaps for other sections preventing the
@@ -330,15 +328,16 @@ sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat,
 	limit = goal + (1UL << PA_SECTION_SHIFT);
 	nid = early_pfn_to_nid(goal >> PAGE_SHIFT);
 again:
-	p = memblock_alloc_try_nid(size, SMP_CACHE_BYTES, goal, limit, nid);
-	if (!p && limit) {
+	usage = memblock_alloc_try_nid(size, SMP_CACHE_BYTES, goal, limit, nid);
+	if (!usage && limit) {
 		limit = 0;
 		goto again;
 	}
-	return p;
+	return usage;
 }
 
-static void __init check_usemap_section_nr(int nid, unsigned long *usemap)
+static void __init check_usemap_section_nr(int nid,
+		struct mem_section_usage *usage)
 {
 	unsigned long usemap_snr, pgdat_snr;
 	static unsigned long old_usemap_snr;
@@ -352,7 +351,7 @@ static void __init check_usemap_section_nr(int nid, unsigned long *usemap)
 		old_pgdat_snr = NR_MEM_SECTIONS;
 	}
 
-	usemap_snr = pfn_to_section_nr(__pa(usemap) >> PAGE_SHIFT);
+	usemap_snr = pfn_to_section_nr(__pa(usage) >> PAGE_SHIFT);
 	pgdat_snr = pfn_to_section_nr(__pa(pgdat) >> PAGE_SHIFT);
 	if (usemap_snr == pgdat_snr)
 		return;
@@ -380,14 +379,15 @@ static void __init check_usemap_section_nr(int nid, unsigned long *usemap)
 		usemap_snr, pgdat_snr, nid);
 }
 #else
-static unsigned long * __init
+static struct mem_section_usage * __init
 sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat,
 					 unsigned long size)
 {
 	return memblock_alloc_node(size, SMP_CACHE_BYTES, pgdat->node_id);
 }
 
-static void __init check_usemap_section_nr(int nid, unsigned long *usemap)
+static void __init check_usemap_section_nr(int nid,
+		struct mem_section_usage *usage)
 {
 }
 #endif /* CONFIG_MEMORY_HOTREMOVE */
@@ -474,14 +474,13 @@ static void __init sparse_init_nid(int nid, unsigned long pnum_begin,
 				   unsigned long pnum_end,
 				   unsigned long map_count)
 {
-	unsigned long pnum, usemap_longs, *usemap;
+	struct mem_section_usage *usage;
+	unsigned long pnum;
 	struct page *map;
 
-	usemap_longs = BITS_TO_LONGS(SECTION_BLOCKFLAGS_BITS);
-	usemap = sparse_early_usemaps_alloc_pgdat_section(NODE_DATA(nid),
-							  usemap_size() *
-							  map_count);
-	if (!usemap) {
+	usage = sparse_early_usemaps_alloc_pgdat_section(NODE_DATA(nid),
+			mem_section_usage_size() * map_count);
+	if (!usage) {
 		pr_err("%s: node[%d] usemap allocation failed", __func__, nid);
 		goto failed;
 	}
@@ -497,9 +496,9 @@ static void __init sparse_init_nid(int nid, unsigned long pnum_begin,
 			pnum_begin = pnum;
 			goto failed;
 		}
-		check_usemap_section_nr(nid, usemap);
-		sparse_init_one_section(__nr_to_section(pnum), pnum, map, usemap);
-		usemap += usemap_longs;
+		check_usemap_section_nr(nid, usage);
+		sparse_init_one_section(__nr_to_section(pnum), pnum, map, usage);
+		usage = (void *) usage + mem_section_usage_size();
 	}
 	sparse_buffer_fini();
 	return;
@@ -697,9 +696,9 @@ int __meminit sparse_add_one_section(int nid, unsigned long start_pfn,
 				     struct vmem_altmap *altmap)
 {
 	unsigned long section_nr = pfn_to_section_nr(start_pfn);
+	struct mem_section_usage *usage;
 	struct mem_section *ms;
 	struct page *memmap;
-	unsigned long *usemap;
 	int ret;
 
 	/*
@@ -713,8 +712,8 @@ int __meminit sparse_add_one_section(int nid, unsigned long start_pfn,
 	memmap = kmalloc_section_memmap(section_nr, nid, altmap);
 	if (!memmap)
 		return -ENOMEM;
-	usemap = __kmalloc_section_usemap();
-	if (!usemap) {
+	usage = kzalloc(mem_section_usage_size(), GFP_KERNEL);
+	if (!usage) {
 		__kfree_section_memmap(memmap, altmap);
 		return -ENOMEM;
 	}
@@ -733,11 +732,11 @@ int __meminit sparse_add_one_section(int nid, unsigned long start_pfn,
 
 	set_section_nid(section_nr, nid);
 	section_mark_present(ms);
-	sparse_init_one_section(ms, section_nr, memmap, usemap);
+	sparse_init_one_section(ms, section_nr, memmap, usage);
 
 out:
 	if (ret < 0) {
-		kfree(usemap);
+		kfree(usage);
 		__kfree_section_memmap(memmap, altmap);
 	}
 	return ret;
@@ -773,20 +772,20 @@ static inline void clear_hwpoisoned_pages(struct page *memmap, int nr_pages)
 }
 #endif
 
-static void free_section_usemap(struct page *memmap, unsigned long *usemap,
-		struct vmem_altmap *altmap)
+static void free_section_usage(struct page *memmap,
+		struct mem_section_usage *usage, struct vmem_altmap *altmap)
 {
-	struct page *usemap_page;
+	struct page *usage_page;
 
-	if (!usemap)
+	if (!usage)
 		return;
 
-	usemap_page = virt_to_page(usemap);
+	usage_page = virt_to_page(usage);
 	/*
 	 * Check to see if allocation came from hot-plug-add
 	 */
-	if (PageSlab(usemap_page) || PageCompound(usemap_page)) {
-		kfree(usemap);
+	if (PageSlab(usage_page) || PageCompound(usage_page)) {
+		kfree(usage);
 		if (memmap)
 			__kfree_section_memmap(memmap, altmap);
 		return;
@@ -805,18 +804,18 @@ void sparse_remove_one_section(struct mem_section *ms, unsigned long map_offset,
 			       struct vmem_altmap *altmap)
 {
 	struct page *memmap = NULL;
-	unsigned long *usemap = NULL;
+	struct mem_section_usage *usage = NULL;
 
 	if (ms->section_mem_map) {
-		usemap = ms->pageblock_flags;
+		usage = ms->usage;
 		memmap = sparse_decode_mem_map(ms->section_mem_map,
 						__section_nr(ms));
 		ms->section_mem_map = 0;
-		ms->pageblock_flags = NULL;
+		ms->usage = NULL;
 	}
 
 	clear_hwpoisoned_pages(memmap + map_offset,
 			PAGES_PER_SECTION - map_offset);
-	free_section_usemap(memmap, usemap, altmap);
+	free_section_usage(memmap, usage, altmap);
 }
 #endif /* CONFIG_MEMORY_HOTPLUG */
-- 
cgit v1.2.3


From 326e1b8f83a4318b09033ef754f40c785aed5e68 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Thu, 18 Jul 2019 15:58:00 -0700
Subject: mm/sparsemem: introduce a SECTION_IS_EARLY flag
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

In preparation for sub-section hotplug, track whether a given section
was created during early memory initialization, or later via memory
hotplug.  This distinction is needed to maintain the coarse expectation
that pfn_valid() returns true for any pfn within a given section even if
that section has pages that are reserved from the page allocator.

For example, one of the goals of subsection hotplug is to support
cases where the system physical memory layout collides System RAM and
PMEM within a section.  Several pfn_valid() users expect to just check
if a section is valid, but they are not careful to check if the given
pfn is within a "System RAM" boundary and instead expect pgdat
information to further validate the pfn.

Rather than unwind those paths to make their pfn_valid() queries more
precise, a follow-on patch uses the SECTION_IS_EARLY flag to maintain the
traditional expectation that pfn_valid() returns true for all early
sections.

Link: https://lore.kernel.org/lkml/1560366952-10660-1-git-send-email-cai@lca.pw/
Link: http://lkml.kernel.org/r/156092350358.979959.5817209875548072819.stgit@dwillia2-desk3.amr.corp.intel.com
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Reported-by: Qian Cai <cai@lca.pw>
Tested-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>	[ppc64]
Reviewed-by: Oscar Salvador <osalvador@suse.de>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Logan Gunthorpe <logang@deltatee.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Pavel Tatashin <pasha.tatashin@soleen.com>
Cc: Jane Chu <jane.chu@oracle.com>
Cc: Jeff Moyer <jmoyer@redhat.com>
Cc: Jérôme Glisse <jglisse@redhat.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Mike Rapoport <rppt@linux.ibm.com>
Cc: Toshi Kani <toshi.kani@hpe.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Wei Yang <richardw.yang@linux.intel.com>
Cc: Jason Gunthorpe <jgg@mellanox.com>
Cc: Christoph Hellwig <hch@lst.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mmzone.h |  8 +++++++-
 mm/sparse.c            | 20 +++++++++-----------
 2 files changed, 16 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 2520336bdfd1..4be40634238b 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -1260,7 +1260,8 @@ extern size_t mem_section_usage_size(void);
 #define	SECTION_MARKED_PRESENT	(1UL<<0)
 #define SECTION_HAS_MEM_MAP	(1UL<<1)
 #define SECTION_IS_ONLINE	(1UL<<2)
-#define SECTION_MAP_LAST_BIT	(1UL<<3)
+#define SECTION_IS_EARLY	(1UL<<3)
+#define SECTION_MAP_LAST_BIT	(1UL<<4)
 #define SECTION_MAP_MASK	(~(SECTION_MAP_LAST_BIT-1))
 #define SECTION_NID_SHIFT	3
 
@@ -1286,6 +1287,11 @@ static inline int valid_section(struct mem_section *section)
 	return (section && (section->section_mem_map & SECTION_HAS_MEM_MAP));
 }
 
+static inline int early_section(struct mem_section *section)
+{
+	return (section && (section->section_mem_map & SECTION_IS_EARLY));
+}
+
 static inline int valid_section_nr(unsigned long nr)
 {
 	return valid_section(__nr_to_section(nr));
diff --git a/mm/sparse.c b/mm/sparse.c
index 41bef8e1f65c..6d23a526279a 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -288,11 +288,11 @@ struct page *sparse_decode_mem_map(unsigned long coded_mem_map, unsigned long pn
 
 static void __meminit sparse_init_one_section(struct mem_section *ms,
 		unsigned long pnum, struct page *mem_map,
-		struct mem_section_usage *usage)
+		struct mem_section_usage *usage, unsigned long flags)
 {
 	ms->section_mem_map &= ~SECTION_MAP_MASK;
-	ms->section_mem_map |= sparse_encode_mem_map(mem_map, pnum) |
-							SECTION_HAS_MEM_MAP;
+	ms->section_mem_map |= sparse_encode_mem_map(mem_map, pnum)
+		| SECTION_HAS_MEM_MAP | flags;
 	ms->usage = usage;
 }
 
@@ -497,7 +497,8 @@ static void __init sparse_init_nid(int nid, unsigned long pnum_begin,
 			goto failed;
 		}
 		check_usemap_section_nr(nid, usage);
-		sparse_init_one_section(__nr_to_section(pnum), pnum, map, usage);
+		sparse_init_one_section(__nr_to_section(pnum), pnum, map, usage,
+				SECTION_IS_EARLY);
 		usage = (void *) usage + mem_section_usage_size();
 	}
 	sparse_buffer_fini();
@@ -732,7 +733,7 @@ int __meminit sparse_add_one_section(int nid, unsigned long start_pfn,
 
 	set_section_nid(section_nr, nid);
 	section_mark_present(ms);
-	sparse_init_one_section(ms, section_nr, memmap, usage);
+	sparse_init_one_section(ms, section_nr, memmap, usage, 0);
 
 out:
 	if (ret < 0) {
@@ -772,19 +773,16 @@ static inline void clear_hwpoisoned_pages(struct page *memmap, int nr_pages)
 }
 #endif
 
-static void free_section_usage(struct page *memmap,
+static void free_section_usage(struct mem_section *ms, struct page *memmap,
 		struct mem_section_usage *usage, struct vmem_altmap *altmap)
 {
-	struct page *usage_page;
-
 	if (!usage)
 		return;
 
-	usage_page = virt_to_page(usage);
 	/*
 	 * Check to see if allocation came from hot-plug-add
 	 */
-	if (PageSlab(usage_page) || PageCompound(usage_page)) {
+	if (!early_section(ms)) {
 		kfree(usage);
 		if (memmap)
 			__kfree_section_memmap(memmap, altmap);
@@ -816,6 +814,6 @@ void sparse_remove_one_section(struct mem_section *ms, unsigned long map_offset,
 
 	clear_hwpoisoned_pages(memmap + map_offset,
 			PAGES_PER_SECTION - map_offset);
-	free_section_usage(memmap, usage, altmap);
+	free_section_usage(ms, memmap, usage, altmap);
 }
 #endif /* CONFIG_MEMORY_HOTPLUG */
-- 
cgit v1.2.3


From f46edbd1b1516da1fb34c917775168d5df576f78 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Thu, 18 Jul 2019 15:58:04 -0700
Subject: mm/sparsemem: add helpers track active portions of a section at boot
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Prepare for hot{plug,remove} of sub-ranges of a section by tracking a
sub-section active bitmask, each bit representing a PMD_SIZE span of the
architecture's memory hotplug section size.

The implication of a partially populated section is that pfn_valid()
needs to go beyond a valid_section() check and either determine that the
section is an "early section", or read the sub-section active ranges
from the bitmask.  The expectation is that the bitmask (subsection_map)
fits in the same cacheline as the valid_section() / early_section()
data, so the incremental performance overhead to pfn_valid() should be
negligible.

The rationale for using early_section() to short-circuit the
subsection_map check is that there are legacy code paths that use
pfn_valid() at section granularity before validating the pfn against
pgdat data.  So, the early_section() check allows those traditional
assumptions to persist while also permitting subsection_map to tell the
truth for purposes of populating the unused portions of early sections
with PMEM and other ZONE_DEVICE mappings.

Link: http://lkml.kernel.org/r/156092350874.979959.18185938451405518285.stgit@dwillia2-desk3.amr.corp.intel.com
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Reported-by: Qian Cai <cai@lca.pw>
Tested-by: Jane Chu <jane.chu@oracle.com>
Tested-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>	[ppc64]
Reviewed-by: Oscar Salvador <osalvador@suse.de>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Logan Gunthorpe <logang@deltatee.com>
Cc: Pavel Tatashin <pasha.tatashin@soleen.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Jeff Moyer <jmoyer@redhat.com>
Cc: Jérôme Glisse <jglisse@redhat.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Mike Rapoport <rppt@linux.ibm.com>
Cc: Toshi Kani <toshi.kani@hpe.com>
Cc: Wei Yang <richardw.yang@linux.intel.com>
Cc: Jason Gunthorpe <jgg@mellanox.com>
Cc: Christoph Hellwig <hch@lst.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mmzone.h | 33 ++++++++++++++++++++++++++++++++-
 mm/page_alloc.c        | 10 ++++++++--
 mm/sparse.c            | 35 +++++++++++++++++++++++++++++++++++
 3 files changed, 75 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 4be40634238b..7747ec9de588 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -1178,6 +1178,8 @@ struct mem_section_usage {
 	unsigned long pageblock_flags[0];
 };
 
+void subsection_map_init(unsigned long pfn, unsigned long nr_pages);
+
 struct page;
 struct page_ext;
 struct mem_section {
@@ -1321,12 +1323,40 @@ static inline struct mem_section *__pfn_to_section(unsigned long pfn)
 
 extern unsigned long __highest_present_section_nr;
 
+static inline int subsection_map_index(unsigned long pfn)
+{
+	return (pfn & ~(PAGE_SECTION_MASK)) / PAGES_PER_SUBSECTION;
+}
+
+#ifdef CONFIG_SPARSEMEM_VMEMMAP
+static inline int pfn_section_valid(struct mem_section *ms, unsigned long pfn)
+{
+	int idx = subsection_map_index(pfn);
+
+	return test_bit(idx, ms->usage->subsection_map);
+}
+#else
+static inline int pfn_section_valid(struct mem_section *ms, unsigned long pfn)
+{
+	return 1;
+}
+#endif
+
 #ifndef CONFIG_HAVE_ARCH_PFN_VALID
 static inline int pfn_valid(unsigned long pfn)
 {
+	struct mem_section *ms;
+
 	if (pfn_to_section_nr(pfn) >= NR_MEM_SECTIONS)
 		return 0;
-	return valid_section(__nr_to_section(pfn_to_section_nr(pfn)));
+	ms = __nr_to_section(pfn_to_section_nr(pfn));
+	if (!valid_section(ms))
+		return 0;
+	/*
+	 * Traditionally early sections always returned pfn_valid() for
+	 * the entire section-sized span.
+	 */
+	return early_section(ms) || pfn_section_valid(ms, pfn);
 }
 #endif
 
@@ -1358,6 +1388,7 @@ void sparse_init(void);
 #define sparse_init()	do {} while (0)
 #define sparse_index_init(_sec, _nid)  do {} while (0)
 #define pfn_present pfn_valid
+#define subsection_map_init(_pfn, _nr_pages) do {} while (0)
 #endif /* CONFIG_SPARSEMEM */
 
 /*
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index be78bafbfe3a..c4cdd3954804 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -7351,12 +7351,18 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
 			       (u64)zone_movable_pfn[i] << PAGE_SHIFT);
 	}
 
-	/* Print out the early node map */
+	/*
+	 * Print out the early node map, and initialize the
+	 * subsection-map relative to active online memory ranges to
+	 * enable future "sub-section" extensions of the memory map.
+	 */
 	pr_info("Early memory node ranges\n");
-	for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid)
+	for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
 		pr_info("  node %3d: [mem %#018Lx-%#018Lx]\n", nid,
 			(u64)start_pfn << PAGE_SHIFT,
 			((u64)end_pfn << PAGE_SHIFT) - 1);
+		subsection_map_init(start_pfn, end_pfn - start_pfn);
+	}
 
 	/* Initialise every node */
 	mminit_verify_pageflags_layout();
diff --git a/mm/sparse.c b/mm/sparse.c
index 6d23a526279a..26b48ee1a262 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -210,6 +210,41 @@ static inline unsigned long first_present_section_nr(void)
 	return next_present_section_nr(-1);
 }
 
+void subsection_mask_set(unsigned long *map, unsigned long pfn,
+		unsigned long nr_pages)
+{
+	int idx = subsection_map_index(pfn);
+	int end = subsection_map_index(pfn + nr_pages - 1);
+
+	bitmap_set(map, idx, end - idx + 1);
+}
+
+void __init subsection_map_init(unsigned long pfn, unsigned long nr_pages)
+{
+	int end_sec = pfn_to_section_nr(pfn + nr_pages - 1);
+	int i, start_sec = pfn_to_section_nr(pfn);
+
+	if (!nr_pages)
+		return;
+
+	for (i = start_sec; i <= end_sec; i++) {
+		struct mem_section *ms;
+		unsigned long pfns;
+
+		pfns = min(nr_pages, PAGES_PER_SECTION
+				- (pfn & ~PAGE_SECTION_MASK));
+		ms = __nr_to_section(i);
+		subsection_mask_set(ms->usage->subsection_map, pfn, pfns);
+
+		pr_debug("%s: sec: %d pfns: %ld set(%d, %d)\n", __func__, i,
+				pfns, subsection_map_index(pfn),
+				subsection_map_index(pfn + pfns - 1));
+
+		pfn += pfns;
+		nr_pages -= pfns;
+	}
+}
+
 /* Record a memory area against a node. */
 void __init memory_present(int nid, unsigned long start, unsigned long end)
 {
-- 
cgit v1.2.3


From e9c0a3f05477e18d2dae816cb61b62be1b7e90d3 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Thu, 18 Jul 2019 15:58:11 -0700
Subject: mm/sparsemem: convert kmalloc_section_memmap() to
 populate_section_memmap()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Allow sub-section sized ranges to be added to the memmap.

populate_section_memmap() takes an explicit pfn range rather than
assuming a full section, and those parameters are plumbed all the way
through to vmemmap_populate().  There should be no sub-section usage in
current deployments.  New warnings are added to clarify which memmap
allocation paths are sub-section capable.

Link: http://lkml.kernel.org/r/156092352058.979959.6551283472062305149.stgit@dwillia2-desk3.amr.corp.intel.com
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Reviewed-by: Pavel Tatashin <pasha.tatashin@soleen.com>
Tested-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>	[ppc64]
Reviewed-by: Oscar Salvador <osalvador@suse.de>
Cc: Michal Hocko <mhocko@suse.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Logan Gunthorpe <logang@deltatee.com>
Cc: Jane Chu <jane.chu@oracle.com>
Cc: Jeff Moyer <jmoyer@redhat.com>
Cc: Jérôme Glisse <jglisse@redhat.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Mike Rapoport <rppt@linux.ibm.com>
Cc: Toshi Kani <toshi.kani@hpe.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Wei Yang <richardw.yang@linux.intel.com>
Cc: Jason Gunthorpe <jgg@mellanox.com>
Cc: Christoph Hellwig <hch@lst.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/mm/init_64.c |  4 +++-
 include/linux/mm.h    |  4 ++--
 mm/sparse-vmemmap.c   | 21 ++++++++++++++-------
 mm/sparse.c           | 50 +++++++++++++++++++++++++++-----------------------
 4 files changed, 46 insertions(+), 33 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 5a289a2ab108..a6b5c653727b 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -1518,7 +1518,9 @@ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
 {
 	int err;
 
-	if (boot_cpu_has(X86_FEATURE_PSE))
+	if (end - start < PAGES_PER_SECTION * sizeof(struct page))
+		err = vmemmap_populate_basepages(start, end, node);
+	else if (boot_cpu_has(X86_FEATURE_PSE))
 		err = vmemmap_populate_hugepages(start, end, node, altmap);
 	else if (altmap) {
 		pr_err_once("%s: no cpu support for altmap allocations\n",
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 48ab7b982d82..0334ca97c584 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2767,8 +2767,8 @@ static inline void print_vma_addr(char *prefix, unsigned long rip)
 #endif
 
 void *sparse_buffer_alloc(unsigned long size);
-struct page *sparse_mem_map_populate(unsigned long pnum, int nid,
-		struct vmem_altmap *altmap);
+struct page * __populate_section_memmap(unsigned long pfn,
+		unsigned long nr_pages, int nid, struct vmem_altmap *altmap);
 pgd_t *vmemmap_pgd_populate(unsigned long addr, int node);
 p4d_t *vmemmap_p4d_populate(pgd_t *pgd, unsigned long addr, int node);
 pud_t *vmemmap_pud_populate(p4d_t *p4d, unsigned long addr, int node);
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index 7fec05796796..200aef686722 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -245,19 +245,26 @@ int __meminit vmemmap_populate_basepages(unsigned long start,
 	return 0;
 }
 
-struct page * __meminit sparse_mem_map_populate(unsigned long pnum, int nid,
-		struct vmem_altmap *altmap)
+struct page * __meminit __populate_section_memmap(unsigned long pfn,
+		unsigned long nr_pages, int nid, struct vmem_altmap *altmap)
 {
 	unsigned long start;
 	unsigned long end;
-	struct page *map;
 
-	map = pfn_to_page(pnum * PAGES_PER_SECTION);
-	start = (unsigned long)map;
-	end = (unsigned long)(map + PAGES_PER_SECTION);
+	/*
+	 * The minimum granularity of memmap extensions is
+	 * PAGES_PER_SUBSECTION as allocations are tracked in the
+	 * 'subsection_map' bitmap of the section.
+	 */
+	end = ALIGN(pfn + nr_pages, PAGES_PER_SUBSECTION);
+	pfn &= PAGE_SUBSECTION_MASK;
+	nr_pages = end - pfn;
+
+	start = (unsigned long) pfn_to_page(pfn);
+	end = start + nr_pages * sizeof(struct page);
 
 	if (vmemmap_populate(start, end, nid, altmap))
 		return NULL;
 
-	return map;
+	return pfn_to_page(pfn);
 }
diff --git a/mm/sparse.c b/mm/sparse.c
index 26b48ee1a262..6b01022e23a9 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -439,8 +439,8 @@ static unsigned long __init section_map_size(void)
 	return PAGE_ALIGN(sizeof(struct page) * PAGES_PER_SECTION);
 }
 
-struct page __init *sparse_mem_map_populate(unsigned long pnum, int nid,
-		struct vmem_altmap *altmap)
+struct page __init *__populate_section_memmap(unsigned long pfn,
+		unsigned long nr_pages, int nid, struct vmem_altmap *altmap)
 {
 	unsigned long size = section_map_size();
 	struct page *map = sparse_buffer_alloc(size);
@@ -521,10 +521,13 @@ static void __init sparse_init_nid(int nid, unsigned long pnum_begin,
 	}
 	sparse_buffer_init(map_count * section_map_size(), nid);
 	for_each_present_section_nr(pnum_begin, pnum) {
+		unsigned long pfn = section_nr_to_pfn(pnum);
+
 		if (pnum >= pnum_end)
 			break;
 
-		map = sparse_mem_map_populate(pnum, nid, NULL);
+		map = __populate_section_memmap(pfn, PAGES_PER_SECTION,
+				nid, NULL);
 		if (!map) {
 			pr_err("%s: node[%d] memory map backing failed. Some memory will not be available.",
 			       __func__, nid);
@@ -625,17 +628,17 @@ void offline_mem_sections(unsigned long start_pfn, unsigned long end_pfn)
 #endif
 
 #ifdef CONFIG_SPARSEMEM_VMEMMAP
-static inline struct page *kmalloc_section_memmap(unsigned long pnum, int nid,
-		struct vmem_altmap *altmap)
+static struct page *populate_section_memmap(unsigned long pfn,
+		unsigned long nr_pages, int nid, struct vmem_altmap *altmap)
 {
-	/* This will make the necessary allocations eventually. */
-	return sparse_mem_map_populate(pnum, nid, altmap);
+	return __populate_section_memmap(pfn, nr_pages, nid, altmap);
 }
-static void __kfree_section_memmap(struct page *memmap,
+
+static void depopulate_section_memmap(unsigned long pfn, unsigned long nr_pages,
 		struct vmem_altmap *altmap)
 {
-	unsigned long start = (unsigned long)memmap;
-	unsigned long end = (unsigned long)(memmap + PAGES_PER_SECTION);
+	unsigned long start = (unsigned long) pfn_to_page(pfn);
+	unsigned long end = start + nr_pages * sizeof(struct page);
 
 	vmemmap_free(start, end, altmap);
 }
@@ -647,7 +650,8 @@ static void free_map_bootmem(struct page *memmap)
 	vmemmap_free(start, end, NULL);
 }
 #else
-static struct page *__kmalloc_section_memmap(void)
+struct page *populate_section_memmap(unsigned long pfn,
+		unsigned long nr_pages, int nid, struct vmem_altmap *altmap)
 {
 	struct page *page, *ret;
 	unsigned long memmap_size = sizeof(struct page) * PAGES_PER_SECTION;
@@ -668,15 +672,11 @@ got_map_ptr:
 	return ret;
 }
 
-static inline struct page *kmalloc_section_memmap(unsigned long pnum, int nid,
+static void depopulate_section_memmap(unsigned long pfn, unsigned long nr_pages,
 		struct vmem_altmap *altmap)
 {
-	return __kmalloc_section_memmap();
-}
+	struct page *memmap = pfn_to_page(pfn);
 
-static void __kfree_section_memmap(struct page *memmap,
-		struct vmem_altmap *altmap)
-{
 	if (is_vmalloc_addr(memmap))
 		vfree(memmap);
 	else
@@ -745,12 +745,13 @@ int __meminit sparse_add_one_section(int nid, unsigned long start_pfn,
 	if (ret < 0 && ret != -EEXIST)
 		return ret;
 	ret = 0;
-	memmap = kmalloc_section_memmap(section_nr, nid, altmap);
+	memmap = populate_section_memmap(start_pfn, PAGES_PER_SECTION, nid,
+			altmap);
 	if (!memmap)
 		return -ENOMEM;
 	usage = kzalloc(mem_section_usage_size(), GFP_KERNEL);
 	if (!usage) {
-		__kfree_section_memmap(memmap, altmap);
+		depopulate_section_memmap(start_pfn, PAGES_PER_SECTION, altmap);
 		return -ENOMEM;
 	}
 
@@ -773,7 +774,7 @@ int __meminit sparse_add_one_section(int nid, unsigned long start_pfn,
 out:
 	if (ret < 0) {
 		kfree(usage);
-		__kfree_section_memmap(memmap, altmap);
+		depopulate_section_memmap(start_pfn, PAGES_PER_SECTION, altmap);
 	}
 	return ret;
 }
@@ -809,7 +810,8 @@ static inline void clear_hwpoisoned_pages(struct page *memmap, int nr_pages)
 #endif
 
 static void free_section_usage(struct mem_section *ms, struct page *memmap,
-		struct mem_section_usage *usage, struct vmem_altmap *altmap)
+		struct mem_section_usage *usage, unsigned long pfn,
+		unsigned long nr_pages, struct vmem_altmap *altmap)
 {
 	if (!usage)
 		return;
@@ -820,7 +822,7 @@ static void free_section_usage(struct mem_section *ms, struct page *memmap,
 	if (!early_section(ms)) {
 		kfree(usage);
 		if (memmap)
-			__kfree_section_memmap(memmap, altmap);
+			depopulate_section_memmap(pfn, nr_pages, altmap);
 		return;
 	}
 
@@ -849,6 +851,8 @@ void sparse_remove_one_section(struct mem_section *ms, unsigned long map_offset,
 
 	clear_hwpoisoned_pages(memmap + map_offset,
 			PAGES_PER_SECTION - map_offset);
-	free_section_usage(ms, memmap, usage, altmap);
+	free_section_usage(ms, memmap, usage,
+			section_nr_to_pfn(__section_nr(ms)),
+			PAGES_PER_SECTION, altmap);
 }
 #endif /* CONFIG_MEMORY_HOTPLUG */
-- 
cgit v1.2.3


From 46d945aeab4d7dd837bd0724662de2caf712f047 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Thu, 18 Jul 2019 15:58:18 -0700
Subject: mm: kill is_dev_zone() helper
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Given there are no more usages of is_dev_zone() outside of 'ifdef
CONFIG_ZONE_DEVICE' protection, kill off the compilation helper.

Link: http://lkml.kernel.org/r/156092353211.979959.1489004866360828964.stgit@dwillia2-desk3.amr.corp.intel.com
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Reviewed-by: Oscar Salvador <osalvador@suse.de>
Reviewed-by: Pavel Tatashin <pasha.tatashin@soleen.com>
Reviewed-by: Wei Yang <richardw.yang@linux.intel.com>
Acked-by: David Hildenbrand <david@redhat.com>
Tested-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>	[ppc64]
Cc: Michal Hocko <mhocko@suse.com>
Cc: Logan Gunthorpe <logang@deltatee.com>
Cc: Jane Chu <jane.chu@oracle.com>
Cc: Jeff Moyer <jmoyer@redhat.com>
Cc: Jérôme Glisse <jglisse@redhat.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Mike Rapoport <rppt@linux.ibm.com>
Cc: Toshi Kani <toshi.kani@hpe.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Jason Gunthorpe <jgg@mellanox.com>
Cc: Christoph Hellwig <hch@lst.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mmzone.h | 12 ------------
 mm/page_alloc.c        |  2 +-
 2 files changed, 1 insertion(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 7747ec9de588..8331e76677c0 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -855,18 +855,6 @@ static inline int local_memory_node(int node_id) { return node_id; };
  */
 #define zone_idx(zone)		((zone) - (zone)->zone_pgdat->node_zones)
 
-#ifdef CONFIG_ZONE_DEVICE
-static inline bool is_dev_zone(const struct zone *zone)
-{
-	return zone_idx(zone) == ZONE_DEVICE;
-}
-#else
-static inline bool is_dev_zone(const struct zone *zone)
-{
-	return false;
-}
-#endif
-
 /*
  * Returns true if a zone has pages managed by the buddy allocator.
  * All the reclaim decisions have to use this function rather than
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index c4cdd3954804..2c74367a8eba 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -5926,7 +5926,7 @@ void __ref memmap_init_zone_device(struct zone *zone,
 	unsigned long start = jiffies;
 	int nid = pgdat->node_id;
 
-	if (WARN_ON_ONCE(!pgmap || !is_dev_zone(zone)))
+	if (WARN_ON_ONCE(!pgmap || zone_idx(zone) != ZONE_DEVICE))
 		return;
 
 	/*
-- 
cgit v1.2.3


From 7ea6216049ff9cf250a6722cd766d99c8d1424e5 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Thu, 18 Jul 2019 15:58:22 -0700
Subject: mm/sparsemem: prepare for sub-section ranges
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Prepare the memory hot-{add,remove} paths for handling sub-section
ranges by plumbing the starting page frame and number of pages being
handled through arch_{add,remove}_memory() to
sparse_{add,remove}_one_section().

This is simply plumbing, small cleanups, and some identifier renames.
No intended functional changes.

Link: http://lkml.kernel.org/r/156092353780.979959.9713046515562743194.stgit@dwillia2-desk3.amr.corp.intel.com
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Reviewed-by: Pavel Tatashin <pasha.tatashin@soleen.com>
Tested-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>	[ppc64]
Reviewed-by: Oscar Salvador <osalvador@suse.de>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Logan Gunthorpe <logang@deltatee.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Jane Chu <jane.chu@oracle.com>
Cc: Jeff Moyer <jmoyer@redhat.com>
Cc: Jérôme Glisse <jglisse@redhat.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Mike Rapoport <rppt@linux.ibm.com>
Cc: Toshi Kani <toshi.kani@hpe.com>
Cc: Wei Yang <richardw.yang@linux.intel.com>
Cc: Jason Gunthorpe <jgg@mellanox.com>
Cc: Christoph Hellwig <hch@lst.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/memory_hotplug.h |   5 +-
 mm/memory_hotplug.c            | 114 +++++++++++++++++++++++++----------------
 mm/sparse.c                    |  16 +++---
 3 files changed, 81 insertions(+), 54 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index 475aff8efbf8..2d636a7491a4 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -346,9 +346,10 @@ extern int add_memory_resource(int nid, struct resource *resource);
 extern void move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn,
 		unsigned long nr_pages, struct vmem_altmap *altmap);
 extern bool is_memblock_offlined(struct memory_block *mem);
-extern int sparse_add_one_section(int nid, unsigned long start_pfn,
-				  struct vmem_altmap *altmap);
+extern int sparse_add_section(int nid, unsigned long pfn,
+		unsigned long nr_pages, struct vmem_altmap *altmap);
 extern void sparse_remove_one_section(struct mem_section *ms,
+		unsigned long pfn, unsigned long nr_pages,
 		unsigned long map_offset, struct vmem_altmap *altmap);
 extern struct page *sparse_decode_mem_map(unsigned long coded_mem_map,
 					  unsigned long pnum);
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 11220044b01a..3fbb2cfab126 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -252,51 +252,84 @@ void __init register_page_bootmem_info_node(struct pglist_data *pgdat)
 }
 #endif /* CONFIG_HAVE_BOOTMEM_INFO_NODE */
 
-static int __meminit __add_section(int nid, unsigned long phys_start_pfn,
-				   struct vmem_altmap *altmap)
+static int __meminit __add_section(int nid, unsigned long pfn,
+		unsigned long nr_pages,	struct vmem_altmap *altmap)
 {
 	int ret;
 
-	if (pfn_valid(phys_start_pfn))
+	if (pfn_valid(pfn))
 		return -EEXIST;
 
-	ret = sparse_add_one_section(nid, phys_start_pfn, altmap);
+	ret = sparse_add_section(nid, pfn, nr_pages, altmap);
 	return ret < 0 ? ret : 0;
 }
 
+static int check_pfn_span(unsigned long pfn, unsigned long nr_pages,
+		const char *reason)
+{
+	/*
+	 * Disallow all operations smaller than a sub-section and only
+	 * allow operations smaller than a section for SPARSEMEM_VMEMMAP.
+	 * Note that check_hotplug_memory_range() enforces a larger
+	 * memory_block_size_bytes() granularity for memory that will be
+	 * marked online, so this check should only fire for direct
+	 * arch_{add,remove}_memory() users outside of add_memory_resource().
+	 * Returns 0 when aligned; otherwise WARNs and returns -EINVAL.
+	 */
+	unsigned long min_align;
+
+	if (IS_ENABLED(CONFIG_SPARSEMEM_VMEMMAP))
+		min_align = PAGES_PER_SUBSECTION;
+	else
+		min_align = PAGES_PER_SECTION;
+	if (!IS_ALIGNED(pfn, min_align)
+			|| !IS_ALIGNED(nr_pages, min_align)) {
+		WARN(1, "Misaligned __%s_pages start: %#lx end: %#lx\n",
+				reason, pfn, pfn + nr_pages - 1);
+		return -EINVAL;
+	}
+	return 0;
+}
+
 /*
  * Reasonably generic function for adding memory.  It is
  * expected that archs that support memory hotplug will
  * call this function after deciding the zone to which to
  * add the new pages.
  */
-int __ref __add_pages(int nid, unsigned long phys_start_pfn,
-		unsigned long nr_pages, struct mhp_restrictions *restrictions)
+int __ref __add_pages(int nid, unsigned long pfn, unsigned long nr_pages,
+		struct mhp_restrictions *restrictions)
 {
 	unsigned long i;
-	int err = 0;
-	int start_sec, end_sec;
+	int start_sec, end_sec, err;
 	struct vmem_altmap *altmap = restrictions->altmap;
 
-	/* during initialize mem_map, align hot-added range to section */
-	start_sec = pfn_to_section_nr(phys_start_pfn);
-	end_sec = pfn_to_section_nr(phys_start_pfn + nr_pages - 1);
-
 	if (altmap) {
 		/*
 		 * Validate altmap is within bounds of the total request
 		 */
-		if (altmap->base_pfn != phys_start_pfn
+		if (altmap->base_pfn != pfn
 				|| vmem_altmap_offset(altmap) > nr_pages) {
 			pr_warn_once("memory add fail, invalid altmap\n");
-			err = -EINVAL;
-			goto out;
+			return -EINVAL;
 		}
 		altmap->alloc = 0;
 	}
 
+	err = check_pfn_span(pfn, nr_pages, "add");
+	if (err)
+		return err;
+
+	start_sec = pfn_to_section_nr(pfn);
+	end_sec = pfn_to_section_nr(pfn + nr_pages - 1);
 	for (i = start_sec; i <= end_sec; i++) {
-		err = __add_section(nid, section_nr_to_pfn(i), altmap);
+		unsigned long pfns;
+
+		pfns = min(nr_pages, PAGES_PER_SECTION
+				- (pfn & ~PAGE_SECTION_MASK));
+		err = __add_section(nid, pfn, pfns, altmap);
+		pfn += pfns;
+		nr_pages -= pfns;
 
 		/*
 		 * EEXIST is finally dealt with by ioresource collision
@@ -309,7 +342,6 @@ int __ref __add_pages(int nid, unsigned long phys_start_pfn,
 		cond_resched();
 	}
 	vmemmap_populate_print_last();
-out:
 	return err;
 }
 
@@ -487,10 +519,10 @@ static void shrink_pgdat_span(struct pglist_data *pgdat,
 	pgdat->node_spanned_pages = 0;
 }
 
-static void __remove_zone(struct zone *zone, unsigned long start_pfn)
+static void __remove_zone(struct zone *zone, unsigned long start_pfn,
+		unsigned long nr_pages)
 {
 	struct pglist_data *pgdat = zone->zone_pgdat;
-	int nr_pages = PAGES_PER_SECTION;
 	unsigned long flags;
 
 	pgdat_resize_lock(zone->zone_pgdat, &flags);
@@ -499,27 +531,23 @@ static void __remove_zone(struct zone *zone, unsigned long start_pfn)
 	pgdat_resize_unlock(zone->zone_pgdat, &flags);
 }
 
-static void __remove_section(struct zone *zone, struct mem_section *ms,
-			     unsigned long map_offset,
-			     struct vmem_altmap *altmap)
+static void __remove_section(struct zone *zone, unsigned long pfn,
+		unsigned long nr_pages, unsigned long map_offset,
+		struct vmem_altmap *altmap)
 {
-	unsigned long start_pfn;
-	int scn_nr;
+	struct mem_section *ms = __nr_to_section(pfn_to_section_nr(pfn));
 
 	if (WARN_ON_ONCE(!valid_section(ms)))
 		return;
 
-	scn_nr = __section_nr(ms);
-	start_pfn = section_nr_to_pfn((unsigned long)scn_nr);
-	__remove_zone(zone, start_pfn);
-
-	sparse_remove_one_section(ms, map_offset, altmap);
+	__remove_zone(zone, pfn, nr_pages);
+	sparse_remove_one_section(ms, pfn, nr_pages, map_offset, altmap);
 }
 
 /**
  * __remove_pages() - remove sections of pages from a zone
  * @zone: zone from which pages need to be removed
- * @phys_start_pfn: starting pageframe (must be aligned to start of a section)
+ * @pfn: starting pageframe (must be aligned to start of a section)
  * @nr_pages: number of pages to remove (must be multiple of section size)
  * @altmap: alternative device page map or %NULL if default memmap is used
  *
@@ -528,30 +556,30 @@ static void __remove_section(struct zone *zone, struct mem_section *ms,
  * sure that pages are marked reserved and zones are adjust properly by
  * calling offline_pages().
  */
-void __remove_pages(struct zone *zone, unsigned long phys_start_pfn,
+void __remove_pages(struct zone *zone, unsigned long pfn,
 		    unsigned long nr_pages, struct vmem_altmap *altmap)
 {
-	unsigned long i;
 	unsigned long map_offset = 0;
-	int sections_to_remove;
+	int i, start_sec, end_sec;
 
 	map_offset = vmem_altmap_offset(altmap);
 
 	clear_zone_contiguous(zone);
 
-	/*
-	 * We can only remove entire sections
-	 */
-	BUG_ON(phys_start_pfn & ~PAGE_SECTION_MASK);
-	BUG_ON(nr_pages % PAGES_PER_SECTION);
+	if (check_pfn_span(pfn, nr_pages, "remove"))
+		return;
 
-	sections_to_remove = nr_pages / PAGES_PER_SECTION;
-	for (i = 0; i < sections_to_remove; i++) {
-		unsigned long pfn = phys_start_pfn + i*PAGES_PER_SECTION;
+	start_sec = pfn_to_section_nr(pfn);
+	end_sec = pfn_to_section_nr(pfn + nr_pages - 1);
+	for (i = start_sec; i <= end_sec; i++) {
+		unsigned long pfns;
 
 		cond_resched();
-		__remove_section(zone, __pfn_to_section(pfn), map_offset,
-				 altmap);
+		pfns = min(nr_pages, PAGES_PER_SECTION
+				- (pfn & ~PAGE_SECTION_MASK));
+		__remove_section(zone, pfn, pfns, map_offset, altmap);
+		pfn += pfns;
+		nr_pages -= pfns;
 		map_offset = 0;
 	}
 
diff --git a/mm/sparse.c b/mm/sparse.c
index 6b01022e23a9..41579b66fff1 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -728,8 +728,8 @@ static void free_map_bootmem(struct page *memmap)
  * * -EEXIST	- Section has been present.
  * * -ENOMEM	- Out of memory.
  */
-int __meminit sparse_add_one_section(int nid, unsigned long start_pfn,
-				     struct vmem_altmap *altmap)
+int __meminit sparse_add_section(int nid, unsigned long start_pfn,
+		unsigned long nr_pages, struct vmem_altmap *altmap)
 {
 	unsigned long section_nr = pfn_to_section_nr(start_pfn);
 	struct mem_section_usage *usage;
@@ -835,8 +835,9 @@ static void free_section_usage(struct mem_section *ms, struct page *memmap,
 		free_map_bootmem(memmap);
 }
 
-void sparse_remove_one_section(struct mem_section *ms, unsigned long map_offset,
-			       struct vmem_altmap *altmap)
+void sparse_remove_one_section(struct mem_section *ms, unsigned long pfn,
+		unsigned long nr_pages, unsigned long map_offset,
+		struct vmem_altmap *altmap)
 {
 	struct page *memmap = NULL;
 	struct mem_section_usage *usage = NULL;
@@ -849,10 +850,7 @@ void sparse_remove_one_section(struct mem_section *ms, unsigned long map_offset,
 		ms->usage = NULL;
 	}
 
-	clear_hwpoisoned_pages(memmap + map_offset,
-			PAGES_PER_SECTION - map_offset);
-	free_section_usage(ms, memmap, usage,
-			section_nr_to_pfn(__section_nr(ms)),
-			PAGES_PER_SECTION, altmap);
+	clear_hwpoisoned_pages(memmap + map_offset, nr_pages - map_offset);
+	free_section_usage(ms, memmap, usage, pfn, nr_pages, altmap);
 }
 #endif /* CONFIG_MEMORY_HOTPLUG */
-- 
cgit v1.2.3


From ba72b4c8cf60e452cf6f0258ed9ee697957b7dfd Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Thu, 18 Jul 2019 15:58:26 -0700
Subject: mm/sparsemem: support sub-section hotplug
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The libnvdimm sub-system has suffered a series of hacks and broken
workarounds for the memory-hotplug implementation's awkward
section-aligned (128MB) granularity.

For example the following backtrace is emitted when attempting
arch_add_memory() with physical address ranges that intersect 'System
RAM' (RAM) with 'Persistent Memory' (PMEM) within a given section:

    # cat /proc/iomem | grep -A1 -B1 Persistent\ Memory
    100000000-1ffffffff : System RAM
    200000000-303ffffff : Persistent Memory (legacy)
    304000000-43fffffff : System RAM
    440000000-23ffffffff : Persistent Memory
    2400000000-43bfffffff : Persistent Memory
      2400000000-43bfffffff : namespace2.0

    WARNING: CPU: 38 PID: 928 at arch/x86/mm/init_64.c:850 add_pages+0x5c/0x60
    [..]
    RIP: 0010:add_pages+0x5c/0x60
    [..]
    Call Trace:
     devm_memremap_pages+0x460/0x6e0
     pmem_attach_disk+0x29e/0x680 [nd_pmem]
     ? nd_dax_probe+0xfc/0x120 [libnvdimm]
     nvdimm_bus_probe+0x66/0x160 [libnvdimm]

It was discovered that the problem goes beyond RAM vs PMEM collisions as
some platforms produce PMEM vs PMEM collisions within a given section.
The libnvdimm workaround for that case revealed that the libnvdimm
section-alignment-padding implementation has been broken for a long
while.

A fix for that long-standing breakage introduces as many problems as it
solves as it would require a backward-incompatible change to the
namespace metadata interpretation.  Instead of that dubious route [1],
address the root problem in the memory-hotplug implementation.

Note that EEXIST is no longer treated as success as that is how
sparse_add_section() reports subsection collisions, it was also obviated
by recent changes to perform the request_region() for 'System RAM'
before arch_add_memory() in the add_memory() sequence.

[1] https://lore.kernel.org/r/155000671719.348031.2347363160141119237.stgit@dwillia2-desk3.amr.corp.intel.com

[osalvador@suse.de: fix deactivate_section for early sections]
  Link: http://lkml.kernel.org/r/20190715081549.32577-2-osalvador@suse.de
Link: http://lkml.kernel.org/r/156092354368.979959.6232443923440952359.stgit@dwillia2-desk3.amr.corp.intel.com
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Oscar Salvador <osalvador@suse.de>
Tested-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>	[ppc64]
Reviewed-by: Oscar Salvador <osalvador@suse.de>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Logan Gunthorpe <logang@deltatee.com>
Cc: Pavel Tatashin <pasha.tatashin@soleen.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Jane Chu <jane.chu@oracle.com>
Cc: Jeff Moyer <jmoyer@redhat.com>
Cc: Jérôme Glisse <jglisse@redhat.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Mike Rapoport <rppt@linux.ibm.com>
Cc: Toshi Kani <toshi.kani@hpe.com>
Cc: Wei Yang <richardw.yang@linux.intel.com>
Cc: Jason Gunthorpe <jgg@mellanox.com>
Cc: Christoph Hellwig <hch@lst.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/memory_hotplug.h |   2 +-
 mm/memory_hotplug.c            |  27 +-----
 mm/page_alloc.c                |   2 +-
 mm/sparse.c                    | 206 +++++++++++++++++++++++++++--------------
 4 files changed, 141 insertions(+), 96 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index 2d636a7491a4..f46ea71b4ffd 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -348,7 +348,7 @@ extern void move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn,
 extern bool is_memblock_offlined(struct memory_block *mem);
 extern int sparse_add_section(int nid, unsigned long pfn,
 		unsigned long nr_pages, struct vmem_altmap *altmap);
-extern void sparse_remove_one_section(struct mem_section *ms,
+extern void sparse_remove_section(struct mem_section *ms,
 		unsigned long pfn, unsigned long nr_pages,
 		unsigned long map_offset, struct vmem_altmap *altmap);
 extern struct page *sparse_decode_mem_map(unsigned long coded_mem_map,
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 3fbb2cfab126..aafb71594ee3 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -252,18 +252,6 @@ void __init register_page_bootmem_info_node(struct pglist_data *pgdat)
 }
 #endif /* CONFIG_HAVE_BOOTMEM_INFO_NODE */
 
-static int __meminit __add_section(int nid, unsigned long pfn,
-		unsigned long nr_pages,	struct vmem_altmap *altmap)
-{
-	int ret;
-
-	if (pfn_valid(pfn))
-		return -EEXIST;
-
-	ret = sparse_add_section(nid, pfn, nr_pages, altmap);
-	return ret < 0 ? ret : 0;
-}
-
 static int check_pfn_span(unsigned long pfn, unsigned long nr_pages,
 		const char *reason)
 {
@@ -327,18 +315,11 @@ int __ref __add_pages(int nid, unsigned long pfn, unsigned long nr_pages,
 
 		pfns = min(nr_pages, PAGES_PER_SECTION
 				- (pfn & ~PAGE_SECTION_MASK));
-		err = __add_section(nid, pfn, pfns, altmap);
+		err = sparse_add_section(nid, pfn, pfns, altmap);
+		if (err)
+			break;
 		pfn += pfns;
 		nr_pages -= pfns;
-
-		/*
-		 * EEXIST is finally dealt with by ioresource collision
-		 * check. see add_memory() => register_memory_resource()
-		 * Warning will be printed if there is collision.
-		 */
-		if (err && (err != -EEXIST))
-			break;
-		err = 0;
 		cond_resched();
 	}
 	vmemmap_populate_print_last();
@@ -541,7 +522,7 @@ static void __remove_section(struct zone *zone, unsigned long pfn,
 		return;
 
 	__remove_zone(zone, pfn, nr_pages);
-	sparse_remove_one_section(ms, pfn, nr_pages, map_offset, altmap);
+	sparse_remove_section(ms, pfn, nr_pages, map_offset, altmap);
 }
 
 /**
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 2c74367a8eba..272c6de1bf4e 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -5974,7 +5974,7 @@ void __ref memmap_init_zone_device(struct zone *zone,
 		 * pfn out of zone.
 		 *
 		 * Please note that MEMMAP_HOTPLUG path doesn't clear memmap
-		 * because this is done early in sparse_add_one_section
+		 * because this is done early in section_activate()
 		 */
 		if (!(pfn & (pageblock_nr_pages - 1))) {
 			set_pageblock_migratetype(page, MIGRATE_MOVABLE);
diff --git a/mm/sparse.c b/mm/sparse.c
index 41579b66fff1..a205a2ac66a4 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -83,8 +83,15 @@ static int __meminit sparse_index_init(unsigned long section_nr, int nid)
 	unsigned long root = SECTION_NR_TO_ROOT(section_nr);
 	struct mem_section *section;
 
+	/*
+	 * An existing section is possible in the sub-section hotplug
+	 * case. First hot-add instantiates, follow-on hot-add reuses
+	 * the existing section.
+	 *
+	 * The mem_hotplug_lock resolves the apparent race below.
+	 */
 	if (mem_section[root])
-		return -EEXIST;
+		return 0;
 
 	section = sparse_index_alloc(nid);
 	if (!section)
@@ -715,10 +722,120 @@ static void free_map_bootmem(struct page *memmap)
 }
 #endif /* CONFIG_SPARSEMEM_VMEMMAP */
 
+static void section_deactivate(unsigned long pfn, unsigned long nr_pages,
+		struct vmem_altmap *altmap)
+{
+	DECLARE_BITMAP(map, SUBSECTIONS_PER_SECTION) = { 0 };
+	DECLARE_BITMAP(tmp, SUBSECTIONS_PER_SECTION) = { 0 };
+	struct mem_section *ms = __pfn_to_section(pfn);
+	bool section_is_early = early_section(ms);
+	struct page *memmap = NULL;
+	unsigned long *subsection_map = ms->usage
+		? &ms->usage->subsection_map[0] : NULL;
+
+	subsection_mask_set(map, pfn, nr_pages);
+	if (subsection_map)
+		bitmap_and(tmp, map, subsection_map, SUBSECTIONS_PER_SECTION);
+
+	if (WARN(!subsection_map || !bitmap_equal(tmp, map, SUBSECTIONS_PER_SECTION),
+				"section already deactivated (%#lx + %ld)\n",
+				pfn, nr_pages))
+		return;
+
+	/*
+	 * There are 3 cases to handle across two configurations
+	 * (SPARSEMEM_VMEMMAP={y,n}):
+	 *
+	 * 1/ deactivation of a partial hot-added section (only possible
+	 * in the SPARSEMEM_VMEMMAP=y case).
+	 *    a/ section was present at memory init
+	 *    b/ section was hot-added post memory init
+	 * 2/ deactivation of a complete hot-added section
+	 * 3/ deactivation of a complete section from memory init
+	 *
+	 * For 1/, when subsection_map does not become empty we will not be
+	 * freeing the usage map, but still need to free the vmemmap
+	 * range.
+	 *
+	 * For 2/ and 3/ the SPARSEMEM_VMEMMAP={y,n} cases are unified
+	 */
+	bitmap_xor(subsection_map, map, subsection_map, SUBSECTIONS_PER_SECTION);
+	if (bitmap_empty(subsection_map, SUBSECTIONS_PER_SECTION)) {
+		unsigned long section_nr = pfn_to_section_nr(pfn);
+
+		if (!section_is_early) {
+			kfree(ms->usage);
+			ms->usage = NULL;
+		}
+		memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr);
+		ms->section_mem_map = sparse_encode_mem_map(NULL, section_nr);
+	}
+
+	if (section_is_early && memmap)
+		free_map_bootmem(memmap);
+	else
+		depopulate_section_memmap(pfn, nr_pages, altmap);
+}
+
+static struct page * __meminit section_activate(int nid, unsigned long pfn,
+		unsigned long nr_pages, struct vmem_altmap *altmap)
+{
+	DECLARE_BITMAP(map, SUBSECTIONS_PER_SECTION) = { 0 };
+	struct mem_section *ms = __pfn_to_section(pfn);
+	struct mem_section_usage *usage = NULL;
+	unsigned long *subsection_map;
+	struct page *memmap;
+	int rc = 0;
+
+	subsection_mask_set(map, pfn, nr_pages);
+
+	if (!ms->usage) {
+		usage = kzalloc(mem_section_usage_size(), GFP_KERNEL);
+		if (!usage)
+			return ERR_PTR(-ENOMEM);
+		ms->usage = usage;
+	}
+	subsection_map = &ms->usage->subsection_map[0];
+
+	if (bitmap_empty(map, SUBSECTIONS_PER_SECTION))
+		rc = -EINVAL;
+	else if (bitmap_intersects(map, subsection_map, SUBSECTIONS_PER_SECTION))
+		rc = -EEXIST;
+	else
+		bitmap_or(subsection_map, map, subsection_map,
+				SUBSECTIONS_PER_SECTION);
+
+	if (rc) {
+		if (usage)
+			ms->usage = NULL;
+		kfree(usage);
+		return ERR_PTR(rc);
+	}
+
+	/*
+	 * The early init code does not consider partially populated
+	 * initial sections, it simply assumes that memory will never be
+	 * referenced.  If we hot-add memory into such a section then we
+	 * do not need to populate the memmap and can simply reuse what
+	 * is already there.
+	 */
+	if (nr_pages < PAGES_PER_SECTION && early_section(ms))
+		return pfn_to_page(pfn);
+
+	memmap = populate_section_memmap(pfn, nr_pages, nid, altmap);
+	if (!memmap) {
+		section_deactivate(pfn, nr_pages, altmap);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	return memmap;
+}
+
 /**
- * sparse_add_one_section - add a memory section
+ * sparse_add_section - add a memory section, or populate an existing one
  * @nid: The node to add section on
  * @start_pfn: start pfn of the memory range
+ * @nr_pages: number of pfns to add in the section
  * @altmap: device page map
  *
  * This is only intended for hotplug.
@@ -732,51 +849,34 @@ int __meminit sparse_add_section(int nid, unsigned long start_pfn,
 		unsigned long nr_pages, struct vmem_altmap *altmap)
 {
 	unsigned long section_nr = pfn_to_section_nr(start_pfn);
-	struct mem_section_usage *usage;
 	struct mem_section *ms;
 	struct page *memmap;
 	int ret;
 
-	/*
-	 * no locking for this, because it does its own
-	 * plus, it does a kmalloc
-	 */
 	ret = sparse_index_init(section_nr, nid);
-	if (ret < 0 && ret != -EEXIST)
+	if (ret < 0)
 		return ret;
-	ret = 0;
-	memmap = populate_section_memmap(start_pfn, PAGES_PER_SECTION, nid,
-			altmap);
-	if (!memmap)
-		return -ENOMEM;
-	usage = kzalloc(mem_section_usage_size(), GFP_KERNEL);
-	if (!usage) {
-		depopulate_section_memmap(start_pfn, PAGES_PER_SECTION, altmap);
-		return -ENOMEM;
-	}
 
-	ms = __pfn_to_section(start_pfn);
-	if (ms->section_mem_map & SECTION_MARKED_PRESENT) {
-		ret = -EEXIST;
-		goto out;
-	}
+	memmap = section_activate(nid, start_pfn, nr_pages, altmap);
+	if (IS_ERR(memmap))
+		return PTR_ERR(memmap);
 
 	/*
 	 * Poison uninitialized struct pages in order to catch invalid flags
 	 * combinations.
 	 */
-	page_init_poison(memmap, sizeof(struct page) * PAGES_PER_SECTION);
+	page_init_poison(pfn_to_page(start_pfn), sizeof(struct page) * nr_pages);
 
+	ms = __pfn_to_section(start_pfn);
 	set_section_nid(section_nr, nid);
 	section_mark_present(ms);
-	sparse_init_one_section(ms, section_nr, memmap, usage, 0);
 
-out:
-	if (ret < 0) {
-		kfree(usage);
-		depopulate_section_memmap(start_pfn, PAGES_PER_SECTION, altmap);
-	}
-	return ret;
+	/* Align memmap to section boundary in the subsection case */
+	if (section_nr_to_pfn(section_nr) != start_pfn)
+		memmap = pfn_to_kaddr(section_nr_to_pfn(section_nr));
+	sparse_init_one_section(ms, section_nr, memmap, ms->usage, 0);
+
+	return 0;
 }
 
 #ifdef CONFIG_MEMORY_FAILURE
@@ -809,48 +909,12 @@ static inline void clear_hwpoisoned_pages(struct page *memmap, int nr_pages)
 }
 #endif
 
-static void free_section_usage(struct mem_section *ms, struct page *memmap,
-		struct mem_section_usage *usage, unsigned long pfn,
-		unsigned long nr_pages, struct vmem_altmap *altmap)
-{
-	if (!usage)
-		return;
-
-	/*
-	 * Check to see if allocation came from hot-plug-add
-	 */
-	if (!early_section(ms)) {
-		kfree(usage);
-		if (memmap)
-			depopulate_section_memmap(pfn, nr_pages, altmap);
-		return;
-	}
-
-	/*
-	 * The usemap came from bootmem. This is packed with other usemaps
-	 * on the section which has pgdat at boot time. Just keep it as is now.
-	 */
-
-	if (memmap)
-		free_map_bootmem(memmap);
-}
-
-void sparse_remove_one_section(struct mem_section *ms, unsigned long pfn,
+void sparse_remove_section(struct mem_section *ms, unsigned long pfn,
 		unsigned long nr_pages, unsigned long map_offset,
 		struct vmem_altmap *altmap)
 {
-	struct page *memmap = NULL;
-	struct mem_section_usage *usage = NULL;
-
-	if (ms->section_mem_map) {
-		usage = ms->usage;
-		memmap = sparse_decode_mem_map(ms->section_mem_map,
-						__section_nr(ms));
-		ms->section_mem_map = 0;
-		ms->usage = NULL;
-	}
-
-	clear_hwpoisoned_pages(memmap + map_offset, nr_pages - map_offset);
-	free_section_usage(ms, memmap, usage, pfn, nr_pages, altmap);
+	clear_hwpoisoned_pages(pfn_to_page(pfn) + map_offset,
+			nr_pages - map_offset);
+	section_deactivate(pfn, nr_pages, altmap);
 }
 #endif /* CONFIG_MEMORY_HOTPLUG */
-- 
cgit v1.2.3


From a3619190d62ed9d66416891be2416f6bea2b3ca4 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Thu, 18 Jul 2019 15:58:40 -0700
Subject: libnvdimm/pfn: stop padding pmem namespaces to section alignment
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Now that the mm core supports section-unaligned hotplug of ZONE_DEVICE
memory, we no longer need to add padding at pfn/dax device creation
time.  The kernel will still honor padding established by older kernels.

Link: http://lkml.kernel.org/r/156092356588.979959.6793371748950931916.stgit@dwillia2-desk3.amr.corp.intel.com
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Reported-by: Jeff Moyer <jmoyer@redhat.com>
Tested-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>	[ppc64]
Cc: David Hildenbrand <david@redhat.com>
Cc: Jane Chu <jane.chu@oracle.com>
Cc: Jérôme Glisse <jglisse@redhat.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Logan Gunthorpe <logang@deltatee.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@linux.ibm.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Pavel Tatashin <pasha.tatashin@soleen.com>
Cc: Toshi Kani <toshi.kani@hpe.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Wei Yang <richardw.yang@linux.intel.com>
Cc: Jason Gunthorpe <jgg@mellanox.com>
Cc: Christoph Hellwig <hch@lst.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/nvdimm/pfn.h      | 14 ---------
 drivers/nvdimm/pfn_devs.c | 77 ++++++++---------------------------------------
 include/linux/mmzone.h    |  3 ++
 3 files changed, 16 insertions(+), 78 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/nvdimm/pfn.h b/drivers/nvdimm/pfn.h
index dfb2bcda8f5a..7381673b7b70 100644
--- a/drivers/nvdimm/pfn.h
+++ b/drivers/nvdimm/pfn.h
@@ -33,18 +33,4 @@ struct nd_pfn_sb {
 	__le64 checksum;
 };
 
-#ifdef CONFIG_SPARSEMEM
-#define PFN_SECTION_ALIGN_DOWN(x) SECTION_ALIGN_DOWN(x)
-#define PFN_SECTION_ALIGN_UP(x) SECTION_ALIGN_UP(x)
-#else
-/*
- * In this case ZONE_DEVICE=n and we will disable 'pfn' device support,
- * but we still want pmem to compile.
- */
-#define PFN_SECTION_ALIGN_DOWN(x) (x)
-#define PFN_SECTION_ALIGN_UP(x) (x)
-#endif
-
-#define PHYS_SECTION_ALIGN_DOWN(x) PFN_PHYS(PFN_SECTION_ALIGN_DOWN(PHYS_PFN(x)))
-#define PHYS_SECTION_ALIGN_UP(x) PFN_PHYS(PFN_SECTION_ALIGN_UP(PHYS_PFN(x)))
 #endif /* __NVDIMM_PFN_H */
diff --git a/drivers/nvdimm/pfn_devs.c b/drivers/nvdimm/pfn_devs.c
index 06f465c0baf3..df2bdbd22450 100644
--- a/drivers/nvdimm/pfn_devs.c
+++ b/drivers/nvdimm/pfn_devs.c
@@ -587,14 +587,14 @@ static u32 info_block_reserve(void)
 }
 
 /*
- * We hotplug memory at section granularity, pad the reserved area from
- * the previous section base to the namespace base address.
+ * We hotplug memory at sub-section granularity, pad the reserved area
+ * from the previous sub-section base to the namespace base address.
  */
 static unsigned long init_altmap_base(resource_size_t base)
 {
 	unsigned long base_pfn = PHYS_PFN(base);
 
-	return PFN_SECTION_ALIGN_DOWN(base_pfn);
+	return SUBSECTION_ALIGN_DOWN(base_pfn);
 }
 
 static unsigned long init_altmap_reserve(resource_size_t base)
@@ -602,7 +602,7 @@ static unsigned long init_altmap_reserve(resource_size_t base)
 	unsigned long reserve = info_block_reserve() >> PAGE_SHIFT;
 	unsigned long base_pfn = PHYS_PFN(base);
 
-	reserve += base_pfn - PFN_SECTION_ALIGN_DOWN(base_pfn);
+	reserve += base_pfn - SUBSECTION_ALIGN_DOWN(base_pfn);
 	return reserve;
 }
 
@@ -632,8 +632,7 @@ static int __nvdimm_setup_pfn(struct nd_pfn *nd_pfn, struct dev_pagemap *pgmap)
 			return -EINVAL;
 		nd_pfn->npfns = le64_to_cpu(pfn_sb->npfns);
 	} else if (nd_pfn->mode == PFN_MODE_PMEM) {
-		nd_pfn->npfns = PFN_SECTION_ALIGN_UP((resource_size(res)
-					- offset) / PAGE_SIZE);
+		nd_pfn->npfns = PHYS_PFN((resource_size(res) - offset));
 		if (le64_to_cpu(nd_pfn->pfn_sb->npfns) > nd_pfn->npfns)
 			dev_info(&nd_pfn->dev,
 					"number of pfns truncated from %lld to %ld\n",
@@ -649,54 +648,14 @@ static int __nvdimm_setup_pfn(struct nd_pfn *nd_pfn, struct dev_pagemap *pgmap)
 	return 0;
 }
 
-static u64 phys_pmem_align_down(struct nd_pfn *nd_pfn, u64 phys)
-{
-	return min_t(u64, PHYS_SECTION_ALIGN_DOWN(phys),
-			ALIGN_DOWN(phys, nd_pfn->align));
-}
-
-/*
- * Check if pmem collides with 'System RAM', or other regions when
- * section aligned.  Trim it accordingly.
- */
-static void trim_pfn_device(struct nd_pfn *nd_pfn, u32 *start_pad, u32 *end_trunc)
-{
-	struct nd_namespace_common *ndns = nd_pfn->ndns;
-	struct nd_namespace_io *nsio = to_nd_namespace_io(&ndns->dev);
-	struct nd_region *nd_region = to_nd_region(nd_pfn->dev.parent);
-	const resource_size_t start = nsio->res.start;
-	const resource_size_t end = start + resource_size(&nsio->res);
-	resource_size_t adjust, size;
-
-	*start_pad = 0;
-	*end_trunc = 0;
-
-	adjust = start - PHYS_SECTION_ALIGN_DOWN(start);
-	size = resource_size(&nsio->res) + adjust;
-	if (region_intersects(start - adjust, size, IORESOURCE_SYSTEM_RAM,
-				IORES_DESC_NONE) == REGION_MIXED
-			|| nd_region_conflict(nd_region, start - adjust, size))
-		*start_pad = PHYS_SECTION_ALIGN_UP(start) - start;
-
-	/* Now check that end of the range does not collide. */
-	adjust = PHYS_SECTION_ALIGN_UP(end) - end;
-	size = resource_size(&nsio->res) + adjust;
-	if (region_intersects(start, size, IORESOURCE_SYSTEM_RAM,
-				IORES_DESC_NONE) == REGION_MIXED
-			|| !IS_ALIGNED(end, nd_pfn->align)
-			|| nd_region_conflict(nd_region, start, size))
-		*end_trunc = end - phys_pmem_align_down(nd_pfn, end);
-}
-
 static int nd_pfn_init(struct nd_pfn *nd_pfn)
 {
 	struct nd_namespace_common *ndns = nd_pfn->ndns;
 	struct nd_namespace_io *nsio = to_nd_namespace_io(&ndns->dev);
-	u32 start_pad, end_trunc, reserve = info_block_reserve();
 	resource_size_t start, size;
 	struct nd_region *nd_region;
+	unsigned long npfns, align;
 	struct nd_pfn_sb *pfn_sb;
-	unsigned long npfns;
 	phys_addr_t offset;
 	const char *sig;
 	u64 checksum;
@@ -727,43 +686,35 @@ static int nd_pfn_init(struct nd_pfn *nd_pfn)
 		return -ENXIO;
 	}
 
-	memset(pfn_sb, 0, sizeof(*pfn_sb));
-
-	trim_pfn_device(nd_pfn, &start_pad, &end_trunc);
-	if (start_pad + end_trunc)
-		dev_info(&nd_pfn->dev, "%s alignment collision, truncate %d bytes\n",
-				dev_name(&ndns->dev), start_pad + end_trunc);
-
 	/*
 	 * Note, we use 64 here for the standard size of struct page,
 	 * debugging options may cause it to be larger in which case the
 	 * implementation will limit the pfns advertised through
 	 * ->direct_access() to those that are included in the memmap.
 	 */
-	start = nsio->res.start + start_pad;
+	start = nsio->res.start;
 	size = resource_size(&nsio->res);
-	npfns = PFN_SECTION_ALIGN_UP((size - start_pad - end_trunc - reserve)
-			/ PAGE_SIZE);
+	npfns = PHYS_PFN(size - SZ_8K);
+	align = max(nd_pfn->align, (1UL << SUBSECTION_SHIFT));
 	if (nd_pfn->mode == PFN_MODE_PMEM) {
 		/*
 		 * The altmap should be padded out to the block size used
 		 * when populating the vmemmap. This *should* be equal to
 		 * PMD_SIZE for most architectures.
 		 */
-		offset = ALIGN(start + reserve + 64 * npfns,
-				max(nd_pfn->align, PMD_SIZE)) - start;
+		offset = ALIGN(start + SZ_8K + 64 * npfns, align) - start;
 	} else if (nd_pfn->mode == PFN_MODE_RAM)
-		offset = ALIGN(start + reserve, nd_pfn->align) - start;
+		offset = ALIGN(start + SZ_8K, align) - start;
 	else
 		return -ENXIO;
 
-	if (offset + start_pad + end_trunc >= size) {
+	if (offset >= size) {
 		dev_err(&nd_pfn->dev, "%s unable to satisfy requested alignment\n",
 				dev_name(&ndns->dev));
 		return -ENXIO;
 	}
 
-	npfns = (size - offset - start_pad - end_trunc) / SZ_4K;
+	npfns = PHYS_PFN(size - offset);
 	pfn_sb->mode = cpu_to_le32(nd_pfn->mode);
 	pfn_sb->dataoff = cpu_to_le64(offset);
 	pfn_sb->npfns = cpu_to_le64(npfns);
@@ -772,8 +723,6 @@ static int nd_pfn_init(struct nd_pfn *nd_pfn)
 	memcpy(pfn_sb->parent_uuid, nd_dev_to_uuid(&ndns->dev), 16);
 	pfn_sb->version_major = cpu_to_le16(1);
 	pfn_sb->version_minor = cpu_to_le16(3);
-	pfn_sb->start_pad = cpu_to_le32(start_pad);
-	pfn_sb->end_trunc = cpu_to_le32(end_trunc);
 	pfn_sb->align = cpu_to_le32(nd_pfn->align);
 	checksum = nd_sb_checksum((struct nd_gen_sb *) pfn_sb);
 	pfn_sb->checksum = cpu_to_le64(checksum);
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 8331e76677c0..d77d717c620c 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -1160,6 +1160,9 @@ static inline unsigned long section_nr_to_pfn(unsigned long sec)
 #define SUBSECTIONS_PER_SECTION (1UL << (SECTION_SIZE_BITS - SUBSECTION_SHIFT))
 #endif
 
+#define SUBSECTION_ALIGN_UP(pfn) ALIGN((pfn), PAGES_PER_SUBSECTION)
+#define SUBSECTION_ALIGN_DOWN(pfn) ((pfn) & PAGE_SUBSECTION_MASK)
+
 struct mem_section_usage {
 	DECLARE_BITMAP(subsection_map, SUBSECTIONS_PER_SECTION);
 	/* See declaration of similar field in struct zone */
-- 
cgit v1.2.3


From 371096949f0ad3950b06729989bd27de51b8c5f5 Mon Sep 17 00:00:00 2001
From: Keith Busch <keith.busch@intel.com>
Date: Thu, 18 Jul 2019 15:58:46 -0700
Subject: mm: migrate: remove unused mode argument

migrate_page_move_mapping() doesn't use the mode argument.  Remove it
and update callers accordingly.

Link: http://lkml.kernel.org/r/20190508210301.8472-1-keith.busch@intel.com
Signed-off-by: Keith Busch <keith.busch@intel.com>
Reviewed-by: Zi Yan <ziy@nvidia.com>
Cc: Mel Gorman <mgorman@techsingularity.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/aio.c                | 2 +-
 fs/f2fs/data.c          | 2 +-
 fs/iomap.c              | 2 +-
 fs/ubifs/file.c         | 2 +-
 include/linux/migrate.h | 3 +--
 mm/migrate.c            | 7 +++----
 6 files changed, 8 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/fs/aio.c b/fs/aio.c
index 8327db0c8e08..8b3aa2739906 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -425,7 +425,7 @@ static int aio_migratepage(struct address_space *mapping, struct page *new,
 	BUG_ON(PageWriteback(old));
 	get_page(new);
 
-	rc = migrate_page_move_mapping(mapping, new, old, mode, 1);
+	rc = migrate_page_move_mapping(mapping, new, old, 1);
 	if (rc != MIGRATEPAGE_SUCCESS) {
 		put_page(new);
 		goto out_unlock;
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 4eb2f3920140..abbf14e9bd72 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -2919,7 +2919,7 @@ int f2fs_migrate_page(struct address_space *mapping,
 	/* one extra reference was held for atomic_write page */
 	extra_count = atomic_written ? 1 : 0;
 	rc = migrate_page_move_mapping(mapping, newpage,
-				page, mode, extra_count);
+				page, extra_count);
 	if (rc != MIGRATEPAGE_SUCCESS) {
 		if (atomic_written)
 			mutex_unlock(&fi->inmem_lock);
diff --git a/fs/iomap.c b/fs/iomap.c
index 217c3e5a13d6..3e7f16a05653 100644
--- a/fs/iomap.c
+++ b/fs/iomap.c
@@ -566,7 +566,7 @@ iomap_migrate_page(struct address_space *mapping, struct page *newpage,
 {
 	int ret;
 
-	ret = migrate_page_move_mapping(mapping, newpage, page, mode, 0);
+	ret = migrate_page_move_mapping(mapping, newpage, page, 0);
 	if (ret != MIGRATEPAGE_SUCCESS)
 		return ret;
 
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index e5f8de62fc51..400970d740bb 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -1470,7 +1470,7 @@ static int ubifs_migrate_page(struct address_space *mapping,
 {
 	int rc;
 
-	rc = migrate_page_move_mapping(mapping, newpage, page, mode, 0);
+	rc = migrate_page_move_mapping(mapping, newpage, page, 0);
 	if (rc != MIGRATEPAGE_SUCCESS)
 		return rc;
 
diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index e13d9bf2f9a5..7f04754c7f2b 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -77,8 +77,7 @@ extern void migrate_page_copy(struct page *newpage, struct page *page);
 extern int migrate_huge_page_move_mapping(struct address_space *mapping,
 				  struct page *newpage, struct page *page);
 extern int migrate_page_move_mapping(struct address_space *mapping,
-		struct page *newpage, struct page *page, enum migrate_mode mode,
-		int extra_count);
+		struct page *newpage, struct page *page, int extra_count);
 #else
 
 static inline void putback_movable_pages(struct list_head *l) {}
diff --git a/mm/migrate.c b/mm/migrate.c
index 3445747e229d..8992741f10aa 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -394,8 +394,7 @@ static int expected_page_refs(struct address_space *mapping, struct page *page)
  * 3 for pages with a mapping and PagePrivate/PagePrivate2 set.
  */
 int migrate_page_move_mapping(struct address_space *mapping,
-		struct page *newpage, struct page *page, enum migrate_mode mode,
-		int extra_count)
+		struct page *newpage, struct page *page, int extra_count)
 {
 	XA_STATE(xas, &mapping->i_pages, page_index(page));
 	struct zone *oldzone, *newzone;
@@ -681,7 +680,7 @@ int migrate_page(struct address_space *mapping,
 
 	BUG_ON(PageWriteback(page));	/* Writeback must be complete */
 
-	rc = migrate_page_move_mapping(mapping, newpage, page, mode, 0);
+	rc = migrate_page_move_mapping(mapping, newpage, page, 0);
 
 	if (rc != MIGRATEPAGE_SUCCESS)
 		return rc;
@@ -780,7 +779,7 @@ recheck_buffers:
 		}
 	}
 
-	rc = migrate_page_move_mapping(mapping, newpage, page, mode, 0);
+	rc = migrate_page_move_mapping(mapping, newpage, page, 0);
 	if (rc != MIGRATEPAGE_SUCCESS)
 		goto unlock_buffers;
 
-- 
cgit v1.2.3


From eec4844fae7c033a0c1fc1eb3b8517aeb8b6cc49 Mon Sep 17 00:00:00 2001
From: Matteo Croce <mcroce@redhat.com>
Date: Thu, 18 Jul 2019 15:58:50 -0700
Subject: proc/sysctl: add shared variables for range check

In the sysctl code the proc_dointvec_minmax() function is often used to
validate that a user-supplied value lies within an allowed range.  This
function uses the extra1 and extra2 members of struct ctl_table as the
minimum and maximum allowed values.

Wherever sysctl handlers are declared, the source file defines some
read-only variables, each holding just an integer whose address is
assigned to the extra1 or extra2 member so that the sysctl range is
enforced.

The special values 0, 1 and INT_MAX are very often used as range
boundaries, leading to duplication of variables like zero=0, one=1,
int_max=INT_MAX in different source files:

    $ git grep -E '\.extra[12].*&(zero|one|int_max)' |wc -l
    248

Add a const int array containing the most commonly used values and some
macros to refer more easily to the correct array member, and use them
instead of creating local copies in every object file.

This is the bloat-o-meter output comparing the old and new binary
compiled with the default Fedora config:

    # scripts/bloat-o-meter -d vmlinux.o.old vmlinux.o
    add/remove: 2/2 grow/shrink: 0/2 up/down: 24/-188 (-164)
    Data                                         old     new   delta
    sysctl_vals                                    -      12     +12
    __kstrtab_sysctl_vals                          -      12     +12
    max                                           14      10      -4
    int_max                                       16       -     -16
    one                                           68       -     -68
    zero                                         128      28    -100
    Total: Before=20583249, After=20583085, chg -0.00%

[mcroce@redhat.com: tipc: remove two unused variables]
  Link: http://lkml.kernel.org/r/20190530091952.4108-1-mcroce@redhat.com
[akpm@linux-foundation.org: fix net/ipv6/sysctl_net_ipv6.c]
[arnd@arndb.de: proc/sysctl: make firmware loader table conditional]
  Link: http://lkml.kernel.org/r/20190617130014.1713870-1-arnd@arndb.de
[akpm@linux-foundation.org: fix fs/eventpoll.c]
Link: http://lkml.kernel.org/r/20190430180111.10688-1-mcroce@redhat.com
Signed-off-by: Matteo Croce <mcroce@redhat.com>
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Acked-by: Kees Cook <keescook@chromium.org>
Reviewed-by: Aaron Tomlin <atomlin@redhat.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/s390/appldata/appldata_base.c            |  15 +-
 arch/s390/kernel/topology.c                   |   6 +-
 arch/x86/entry/vdso/vdso32-setup.c            |   7 +-
 arch/x86/kernel/itmt.c                        |   6 +-
 drivers/base/firmware_loader/fallback_table.c |  13 +-
 drivers/gpu/drm/i915/i915_perf.c              |   8 +-
 drivers/hv/vmbus_drv.c                        |   6 +-
 drivers/tty/tty_ldisc.c                       |   6 +-
 drivers/xen/balloon.c                         |   7 +-
 fs/eventpoll.c                                |   4 +-
 fs/notify/inotify/inotify_user.c              |   8 +-
 fs/proc/proc_sysctl.c                         |   4 +
 include/linux/sysctl.h                        |   7 +
 ipc/ipc_sysctl.c                              |  35 +++--
 kernel/pid_namespace.c                        |   3 +-
 kernel/sysctl.c                               | 197 +++++++++++++-------------
 kernel/ucount.c                               |   6 +-
 net/core/neighbour.c                          |  20 ++-
 net/core/sysctl_net_core.c                    |  34 +++--
 net/dccp/sysctl.c                             |  16 +--
 net/ipv4/sysctl_net_ipv4.c                    |  60 ++++----
 net/ipv6/addrconf.c                           |   6 +-
 net/ipv6/route.c                              |   7 +-
 net/ipv6/sysctl_net_ipv6.c                    |  10 +-
 net/mpls/af_mpls.c                            |  10 +-
 net/netfilter/ipvs/ip_vs_ctl.c                |   3 +-
 net/rxrpc/sysctl.c                            |   9 +-
 net/sctp/sysctl.c                             |  35 +++--
 net/sunrpc/xprtrdma/transport.c               |   3 +-
 net/tipc/sysctl.c                             |   6 +-
 security/keys/sysctl.c                        |  26 ++--
 security/loadpin/loadpin.c                    |   6 +-
 security/yama/yama_lsm.c                      |   3 +-
 33 files changed, 270 insertions(+), 322 deletions(-)

(limited to 'include/linux')

diff --git a/arch/s390/appldata/appldata_base.c b/arch/s390/appldata/appldata_base.c
index e4b58240ec53..aa738cad1338 100644
--- a/arch/s390/appldata/appldata_base.c
+++ b/arch/s390/appldata/appldata_base.c
@@ -220,15 +220,13 @@ appldata_timer_handler(struct ctl_table *ctl, int write,
 			   void __user *buffer, size_t *lenp, loff_t *ppos)
 {
 	int timer_active = appldata_timer_active;
-	int zero = 0;
-	int one = 1;
 	int rc;
 	struct ctl_table ctl_entry = {
 		.procname	= ctl->procname,
 		.data		= &timer_active,
 		.maxlen		= sizeof(int),
-		.extra1		= &zero,
-		.extra2		= &one,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE,
 	};
 
 	rc = proc_douintvec_minmax(&ctl_entry, write, buffer, lenp, ppos);
@@ -255,13 +253,12 @@ appldata_interval_handler(struct ctl_table *ctl, int write,
 			   void __user *buffer, size_t *lenp, loff_t *ppos)
 {
 	int interval = appldata_interval;
-	int one = 1;
 	int rc;
 	struct ctl_table ctl_entry = {
 		.procname	= ctl->procname,
 		.data		= &interval,
 		.maxlen		= sizeof(int),
-		.extra1		= &one,
+		.extra1		= SYSCTL_ONE,
 	};
 
 	rc = proc_dointvec_minmax(&ctl_entry, write, buffer, lenp, ppos);
@@ -289,13 +286,11 @@ appldata_generic_handler(struct ctl_table *ctl, int write,
 	struct list_head *lh;
 	int rc, found;
 	int active;
-	int zero = 0;
-	int one = 1;
 	struct ctl_table ctl_entry = {
 		.data		= &active,
 		.maxlen		= sizeof(int),
-		.extra1		= &zero,
-		.extra2		= &one,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE,
 	};
 
 	found = 0;
diff --git a/arch/s390/kernel/topology.c b/arch/s390/kernel/topology.c
index 8964a3f60aad..2db6fb405a9a 100644
--- a/arch/s390/kernel/topology.c
+++ b/arch/s390/kernel/topology.c
@@ -587,15 +587,13 @@ static int topology_ctl_handler(struct ctl_table *ctl, int write,
 {
 	int enabled = topology_is_enabled();
 	int new_mode;
-	int zero = 0;
-	int one = 1;
 	int rc;
 	struct ctl_table ctl_entry = {
 		.procname	= ctl->procname,
 		.data		= &enabled,
 		.maxlen		= sizeof(int),
-		.extra1		= &zero,
-		.extra2		= &one,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE,
 	};
 
 	rc = proc_douintvec_minmax(&ctl_entry, write, buffer, lenp, ppos);
diff --git a/arch/x86/entry/vdso/vdso32-setup.c b/arch/x86/entry/vdso/vdso32-setup.c
index 42d4c89f990e..240626e7f55a 100644
--- a/arch/x86/entry/vdso/vdso32-setup.c
+++ b/arch/x86/entry/vdso/vdso32-setup.c
@@ -65,9 +65,6 @@ subsys_initcall(sysenter_setup);
 /* Register vsyscall32 into the ABI table */
 #include <linux/sysctl.h>
 
-static const int zero;
-static const int one = 1;
-
 static struct ctl_table abi_table2[] = {
 	{
 		.procname	= "vsyscall32",
@@ -75,8 +72,8 @@ static struct ctl_table abi_table2[] = {
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= (int *)&zero,
-		.extra2		= (int *)&one,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE,
 	},
 	{}
 };
diff --git a/arch/x86/kernel/itmt.c b/arch/x86/kernel/itmt.c
index 838cf8a32c49..1cb3ca9bba49 100644
--- a/arch/x86/kernel/itmt.c
+++ b/arch/x86/kernel/itmt.c
@@ -65,8 +65,6 @@ static int sched_itmt_update_handler(struct ctl_table *table, int write,
 	return ret;
 }
 
-static unsigned int zero;
-static unsigned int one = 1;
 static struct ctl_table itmt_kern_table[] = {
 	{
 		.procname	= "sched_itmt_enabled",
@@ -74,8 +72,8 @@ static struct ctl_table itmt_kern_table[] = {
 		.maxlen		= sizeof(unsigned int),
 		.mode		= 0644,
 		.proc_handler	= sched_itmt_update_handler,
-		.extra1		= &zero,
-		.extra2		= &one,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE,
 	},
 	{}
 };
diff --git a/drivers/base/firmware_loader/fallback_table.c b/drivers/base/firmware_loader/fallback_table.c
index 776dd69cf5be..ba9d30b28edc 100644
--- a/drivers/base/firmware_loader/fallback_table.c
+++ b/drivers/base/firmware_loader/fallback_table.c
@@ -16,9 +16,6 @@
  * firmware fallback configuration table
  */
 
-static unsigned int zero;
-static unsigned int one = 1;
-
 struct firmware_fallback_config fw_fallback_config = {
 	.force_sysfs_fallback = IS_ENABLED(CONFIG_FW_LOADER_USER_HELPER_FALLBACK),
 	.loading_timeout = 60,
@@ -26,6 +23,7 @@ struct firmware_fallback_config fw_fallback_config = {
 };
 EXPORT_SYMBOL_GPL(fw_fallback_config);
 
+#ifdef CONFIG_SYSCTL
 struct ctl_table firmware_config_table[] = {
 	{
 		.procname	= "force_sysfs_fallback",
@@ -33,8 +31,8 @@ struct ctl_table firmware_config_table[] = {
 		.maxlen         = sizeof(unsigned int),
 		.mode           = 0644,
 		.proc_handler   = proc_douintvec_minmax,
-		.extra1		= &zero,
-		.extra2		= &one,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE,
 	},
 	{
 		.procname	= "ignore_sysfs_fallback",
@@ -42,9 +40,10 @@ struct ctl_table firmware_config_table[] = {
 		.maxlen         = sizeof(unsigned int),
 		.mode           = 0644,
 		.proc_handler   = proc_douintvec_minmax,
-		.extra1		= &zero,
-		.extra2		= &one,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE,
 	},
 	{ }
 };
 EXPORT_SYMBOL_GPL(firmware_config_table);
+#endif
diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c
index 3d8162d28730..a700c5c3d167 100644
--- a/drivers/gpu/drm/i915/i915_perf.c
+++ b/drivers/gpu/drm/i915/i915_perf.c
@@ -274,8 +274,6 @@
 #define POLL_PERIOD (NSEC_PER_SEC / POLL_FREQUENCY)
 
 /* for sysctl proc_dointvec_minmax of dev.i915.perf_stream_paranoid */
-static int zero;
-static int one = 1;
 static u32 i915_perf_stream_paranoid = true;
 
 /* The maximum exponent the hardware accepts is 63 (essentially it selects one
@@ -3366,8 +3364,8 @@ static struct ctl_table oa_table[] = {
 	 .maxlen = sizeof(i915_perf_stream_paranoid),
 	 .mode = 0644,
 	 .proc_handler = proc_dointvec_minmax,
-	 .extra1 = &zero,
-	 .extra2 = &one,
+	 .extra1 = SYSCTL_ZERO,
+	 .extra2 = SYSCTL_ONE,
 	 },
 	{
 	 .procname = "oa_max_sample_rate",
@@ -3375,7 +3373,7 @@ static struct ctl_table oa_table[] = {
 	 .maxlen = sizeof(i915_oa_max_sample_rate),
 	 .mode = 0644,
 	 .proc_handler = proc_dointvec_minmax,
-	 .extra1 = &zero,
+	 .extra1 = SYSCTL_ZERO,
 	 .extra2 = &oa_sample_rate_hard_limit,
 	 },
 	{}
diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c
index 894da5abdc55..ebd35fc35290 100644
--- a/drivers/hv/vmbus_drv.c
+++ b/drivers/hv/vmbus_drv.c
@@ -1197,8 +1197,6 @@ static struct kmsg_dumper hv_kmsg_dumper = {
 };
 
 static struct ctl_table_header *hv_ctl_table_hdr;
-static int zero;
-static int one = 1;
 
 /*
  * sysctl option to allow the user to control whether kmsg data should be
@@ -1211,8 +1209,8 @@ static struct ctl_table hv_ctl_table[] = {
 		.maxlen         = sizeof(int),
 		.mode           = 0644,
 		.proc_handler   = proc_dointvec_minmax,
-		.extra1		= &zero,
-		.extra2		= &one
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE
 	},
 	{}
 };
diff --git a/drivers/tty/tty_ldisc.c b/drivers/tty/tty_ldisc.c
index fde8d4073e74..4c49f53afa3e 100644
--- a/drivers/tty/tty_ldisc.c
+++ b/drivers/tty/tty_ldisc.c
@@ -855,8 +855,6 @@ void tty_ldisc_deinit(struct tty_struct *tty)
 	tty->ldisc = NULL;
 }
 
-static int zero;
-static int one = 1;
 static struct ctl_table tty_table[] = {
 	{
 		.procname	= "ldisc_autoload",
@@ -864,8 +862,8 @@ static struct ctl_table tty_table[] = {
 		.maxlen		= sizeof(tty_ldisc_autoload),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec,
-		.extra1		= &zero,
-		.extra2		= &one,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE,
 	},
 	{ }
 };
diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c
index d37dd5bb7a8f..37a36c6b9f93 100644
--- a/drivers/xen/balloon.c
+++ b/drivers/xen/balloon.c
@@ -77,9 +77,6 @@ static int xen_hotplug_unpopulated;
 
 #ifdef CONFIG_XEN_BALLOON_MEMORY_HOTPLUG
 
-static int zero;
-static int one = 1;
-
 static struct ctl_table balloon_table[] = {
 	{
 		.procname	= "hotplug_unpopulated",
@@ -87,8 +84,8 @@ static struct ctl_table balloon_table[] = {
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1         = &zero,
-		.extra2         = &one,
+		.extra1         = SYSCTL_ZERO,
+		.extra2         = SYSCTL_ONE,
 	},
 	{ }
 };
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 0f9c073d78d5..d7f1f5011fac 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -291,7 +291,7 @@ static LIST_HEAD(tfile_check_list);
 
 #include <linux/sysctl.h>
 
-static long zero;
+static long long_zero;
 static long long_max = LONG_MAX;
 
 struct ctl_table epoll_table[] = {
@@ -301,7 +301,7 @@ struct ctl_table epoll_table[] = {
 		.maxlen		= sizeof(max_user_watches),
 		.mode		= 0644,
 		.proc_handler	= proc_doulongvec_minmax,
-		.extra1		= &zero,
+		.extra1		= &long_zero,
 		.extra2		= &long_max,
 	},
 	{ }
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index cce8de32779f..0b815178126e 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -45,8 +45,6 @@ struct kmem_cache *inotify_inode_mark_cachep __read_mostly;
 
 #include <linux/sysctl.h>
 
-static int zero;
-
 struct ctl_table inotify_table[] = {
 	{
 		.procname	= "max_user_instances",
@@ -54,7 +52,7 @@ struct ctl_table inotify_table[] = {
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &zero,
+		.extra1		= SYSCTL_ZERO,
 	},
 	{
 		.procname	= "max_user_watches",
@@ -62,7 +60,7 @@ struct ctl_table inotify_table[] = {
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &zero,
+		.extra1		= SYSCTL_ZERO,
 	},
 	{
 		.procname	= "max_queued_events",
@@ -70,7 +68,7 @@ struct ctl_table inotify_table[] = {
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &zero
+		.extra1		= SYSCTL_ZERO
 	},
 	{ }
 };
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index 36ad1b0d6259..d80989b6c344 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -22,6 +22,10 @@ static const struct inode_operations proc_sys_inode_operations;
 static const struct file_operations proc_sys_dir_file_operations;
 static const struct inode_operations proc_sys_dir_operations;
 
+/* shared constants to be used in various sysctls */
+const int sysctl_vals[] = { 0, 1, INT_MAX };
+EXPORT_SYMBOL(sysctl_vals);
+
 /* Support for permanently empty directories */
 
 struct ctl_table sysctl_mount_point[] = {
diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index aadd310769d0..6df477329b76 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -37,6 +37,13 @@ struct ctl_table_root;
 struct ctl_table_header;
 struct ctl_dir;
 
+/* Keep the same order as in fs/proc/proc_sysctl.c */
+#define SYSCTL_ZERO	((void *)&sysctl_vals[0])
+#define SYSCTL_ONE	((void *)&sysctl_vals[1])
+#define SYSCTL_INT_MAX	((void *)&sysctl_vals[2])
+
+extern const int sysctl_vals[];
+
 typedef int proc_handler (struct ctl_table *ctl, int write,
 			  void __user *buffer, size_t *lenp, loff_t *ppos);
 
diff --git a/ipc/ipc_sysctl.c b/ipc/ipc_sysctl.c
index 2b14ce8ce73f..affd66537e87 100644
--- a/ipc/ipc_sysctl.c
+++ b/ipc/ipc_sysctl.c
@@ -113,9 +113,6 @@ static int proc_ipc_sem_dointvec(struct ctl_table *table, int write,
 #define proc_ipc_sem_dointvec	   NULL
 #endif
 
-static int zero;
-static int one = 1;
-static int int_max = INT_MAX;
 int ipc_mni = IPCMNI;
 int ipc_mni_shift = IPCMNI_SHIFT;
 int ipc_min_cycle = RADIX_TREE_MAP_SIZE;
@@ -141,7 +138,7 @@ static struct ctl_table ipc_kern_table[] = {
 		.maxlen		= sizeof(init_ipc_ns.shm_ctlmni),
 		.mode		= 0644,
 		.proc_handler	= proc_ipc_dointvec_minmax,
-		.extra1		= &zero,
+		.extra1		= SYSCTL_ZERO,
 		.extra2		= &ipc_mni,
 	},
 	{
@@ -150,8 +147,8 @@ static struct ctl_table ipc_kern_table[] = {
 		.maxlen		= sizeof(init_ipc_ns.shm_rmid_forced),
 		.mode		= 0644,
 		.proc_handler	= proc_ipc_dointvec_minmax_orphans,
-		.extra1		= &zero,
-		.extra2		= &one,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE,
 	},
 	{
 		.procname	= "msgmax",
@@ -159,8 +156,8 @@ static struct ctl_table ipc_kern_table[] = {
 		.maxlen		= sizeof(init_ipc_ns.msg_ctlmax),
 		.mode		= 0644,
 		.proc_handler	= proc_ipc_dointvec_minmax,
-		.extra1		= &zero,
-		.extra2		= &int_max,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_INT_MAX,
 	},
 	{
 		.procname	= "msgmni",
@@ -168,7 +165,7 @@ static struct ctl_table ipc_kern_table[] = {
 		.maxlen		= sizeof(init_ipc_ns.msg_ctlmni),
 		.mode		= 0644,
 		.proc_handler	= proc_ipc_dointvec_minmax,
-		.extra1		= &zero,
+		.extra1		= SYSCTL_ZERO,
 		.extra2		= &ipc_mni,
 	},
 	{
@@ -177,8 +174,8 @@ static struct ctl_table ipc_kern_table[] = {
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= proc_ipc_auto_msgmni,
-		.extra1		= &zero,
-		.extra2		= &one,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE,
 	},
 	{
 		.procname	=  "msgmnb",
@@ -186,8 +183,8 @@ static struct ctl_table ipc_kern_table[] = {
 		.maxlen		= sizeof(init_ipc_ns.msg_ctlmnb),
 		.mode		= 0644,
 		.proc_handler	= proc_ipc_dointvec_minmax,
-		.extra1		= &zero,
-		.extra2		= &int_max,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_INT_MAX,
 	},
 	{
 		.procname	= "sem",
@@ -203,8 +200,8 @@ static struct ctl_table ipc_kern_table[] = {
 		.maxlen		= sizeof(init_ipc_ns.ids[IPC_SEM_IDS].next_id),
 		.mode		= 0644,
 		.proc_handler	= proc_ipc_dointvec_minmax,
-		.extra1		= &zero,
-		.extra2		= &int_max,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_INT_MAX,
 	},
 	{
 		.procname	= "msg_next_id",
@@ -212,8 +209,8 @@ static struct ctl_table ipc_kern_table[] = {
 		.maxlen		= sizeof(init_ipc_ns.ids[IPC_MSG_IDS].next_id),
 		.mode		= 0644,
 		.proc_handler	= proc_ipc_dointvec_minmax,
-		.extra1		= &zero,
-		.extra2		= &int_max,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_INT_MAX,
 	},
 	{
 		.procname	= "shm_next_id",
@@ -221,8 +218,8 @@ static struct ctl_table ipc_kern_table[] = {
 		.maxlen		= sizeof(init_ipc_ns.ids[IPC_SHM_IDS].next_id),
 		.mode		= 0644,
 		.proc_handler	= proc_ipc_dointvec_minmax,
-		.extra1		= &zero,
-		.extra2		= &int_max,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_INT_MAX,
 	},
 #endif
 	{}
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index 6d726cef241c..a6a79f85c81a 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -291,14 +291,13 @@ static int pid_ns_ctl_handler(struct ctl_table *table, int write,
 }
 
 extern int pid_max;
-static int zero = 0;
 static struct ctl_table pid_ns_ctl_table[] = {
 	{
 		.procname = "ns_last_pid",
 		.maxlen = sizeof(int),
 		.mode = 0666, /* permissions are checked in the handler */
 		.proc_handler = pid_ns_ctl_handler,
-		.extra1 = &zero,
+		.extra1 = SYSCTL_ZERO,
 		.extra2 = &pid_max,
 	},
 	{ }
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 43186ccfa139..078950d9605b 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -125,9 +125,6 @@ static int sixty = 60;
 #endif
 
 static int __maybe_unused neg_one = -1;
-
-static int zero;
-static int __maybe_unused one = 1;
 static int __maybe_unused two = 2;
 static int __maybe_unused four = 4;
 static unsigned long zero_ul;
@@ -385,8 +382,8 @@ static struct ctl_table kern_table[] = {
 		.maxlen		= sizeof(unsigned int),
 		.mode		= 0644,
 		.proc_handler	= sysctl_schedstats,
-		.extra1		= &zero,
-		.extra2		= &one,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE,
 	},
 #endif /* CONFIG_SCHEDSTATS */
 #endif /* CONFIG_SMP */
@@ -418,7 +415,7 @@ static struct ctl_table kern_table[] = {
 		.maxlen		= sizeof(unsigned int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &one,
+		.extra1		= SYSCTL_ONE,
 	},
 	{
 		.procname	= "numa_balancing",
@@ -426,8 +423,8 @@ static struct ctl_table kern_table[] = {
 		.maxlen		= sizeof(unsigned int),
 		.mode		= 0644,
 		.proc_handler	= sysctl_numa_balancing,
-		.extra1		= &zero,
-		.extra2		= &one,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE,
 	},
 #endif /* CONFIG_NUMA_BALANCING */
 #endif /* CONFIG_SCHED_DEBUG */
@@ -475,8 +472,8 @@ static struct ctl_table kern_table[] = {
 		.maxlen		= sizeof(unsigned int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &zero,
-		.extra2		= &one,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE,
 	},
 #endif
 #ifdef CONFIG_CFS_BANDWIDTH
@@ -486,7 +483,7 @@ static struct ctl_table kern_table[] = {
 		.maxlen		= sizeof(unsigned int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &one,
+		.extra1		= SYSCTL_ONE,
 	},
 #endif
 #if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL)
@@ -496,8 +493,8 @@ static struct ctl_table kern_table[] = {
 		.maxlen		= sizeof(unsigned int),
 		.mode		= 0644,
 		.proc_handler	= sched_energy_aware_handler,
-		.extra1		= &zero,
-		.extra2		= &one,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE,
 	},
 #endif
 #ifdef CONFIG_PROVE_LOCKING
@@ -562,7 +559,7 @@ static struct ctl_table kern_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
 		.extra1		= &neg_one,
-		.extra2		= &one,
+		.extra2		= SYSCTL_ONE,
 	},
 #endif
 #ifdef CONFIG_LATENCYTOP
@@ -696,8 +693,8 @@ static struct ctl_table kern_table[] = {
 		.mode		= 0644,
 		/* only handle a transition from default "0" to "1" */
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &one,
-		.extra2		= &one,
+		.extra1		= SYSCTL_ONE,
+		.extra2		= SYSCTL_ONE,
 	},
 #endif
 #ifdef CONFIG_MODULES
@@ -715,8 +712,8 @@ static struct ctl_table kern_table[] = {
 		.mode		= 0644,
 		/* only handle a transition from default "0" to "1" */
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &one,
-		.extra2		= &one,
+		.extra1		= SYSCTL_ONE,
+		.extra2		= SYSCTL_ONE,
 	},
 #endif
 #ifdef CONFIG_UEVENT_HELPER
@@ -875,7 +872,7 @@ static struct ctl_table kern_table[] = {
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &zero,
+		.extra1		= SYSCTL_ZERO,
 		.extra2		= &ten_thousand,
 	},
 	{
@@ -891,8 +888,8 @@ static struct ctl_table kern_table[] = {
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax_sysadmin,
-		.extra1		= &zero,
-		.extra2		= &one,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE,
 	},
 	{
 		.procname	= "kptr_restrict",
@@ -900,7 +897,7 @@ static struct ctl_table kern_table[] = {
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax_sysadmin,
-		.extra1		= &zero,
+		.extra1		= SYSCTL_ZERO,
 		.extra2		= &two,
 	},
 #endif
@@ -925,8 +922,8 @@ static struct ctl_table kern_table[] = {
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler   = proc_watchdog,
-		.extra1		= &zero,
-		.extra2		= &one,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE,
 	},
 	{
 		.procname	= "watchdog_thresh",
@@ -934,7 +931,7 @@ static struct ctl_table kern_table[] = {
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= proc_watchdog_thresh,
-		.extra1		= &zero,
+		.extra1		= SYSCTL_ZERO,
 		.extra2		= &sixty,
 	},
 	{
@@ -943,8 +940,8 @@ static struct ctl_table kern_table[] = {
 		.maxlen		= sizeof(int),
 		.mode		= NMI_WATCHDOG_SYSCTL_PERM,
 		.proc_handler   = proc_nmi_watchdog,
-		.extra1		= &zero,
-		.extra2		= &one,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE,
 	},
 	{
 		.procname	= "watchdog_cpumask",
@@ -960,8 +957,8 @@ static struct ctl_table kern_table[] = {
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler   = proc_soft_watchdog,
-		.extra1		= &zero,
-		.extra2		= &one,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE,
 	},
 	{
 		.procname	= "softlockup_panic",
@@ -969,8 +966,8 @@ static struct ctl_table kern_table[] = {
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &zero,
-		.extra2		= &one,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE,
 	},
 #ifdef CONFIG_SMP
 	{
@@ -979,8 +976,8 @@ static struct ctl_table kern_table[] = {
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &zero,
-		.extra2		= &one,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE,
 	},
 #endif /* CONFIG_SMP */
 #endif
@@ -991,8 +988,8 @@ static struct ctl_table kern_table[] = {
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &zero,
-		.extra2		= &one,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE,
 	},
 #ifdef CONFIG_SMP
 	{
@@ -1001,8 +998,8 @@ static struct ctl_table kern_table[] = {
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &zero,
-		.extra2		= &one,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE,
 	},
 #endif /* CONFIG_SMP */
 #endif
@@ -1115,8 +1112,8 @@ static struct ctl_table kern_table[] = {
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &zero,
-		.extra2		= &one,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE,
 	},
 	{
 		.procname	= "hung_task_check_count",
@@ -1124,7 +1121,7 @@ static struct ctl_table kern_table[] = {
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &zero,
+		.extra1		= SYSCTL_ZERO,
 	},
 	{
 		.procname	= "hung_task_timeout_secs",
@@ -1201,7 +1198,7 @@ static struct ctl_table kern_table[] = {
 		.maxlen		= sizeof(sysctl_perf_event_sample_rate),
 		.mode		= 0644,
 		.proc_handler	= perf_proc_update_handler,
-		.extra1		= &one,
+		.extra1		= SYSCTL_ONE,
 	},
 	{
 		.procname	= "perf_cpu_time_max_percent",
@@ -1209,7 +1206,7 @@ static struct ctl_table kern_table[] = {
 		.maxlen		= sizeof(sysctl_perf_cpu_time_max_percent),
 		.mode		= 0644,
 		.proc_handler	= perf_cpu_time_max_percent_handler,
-		.extra1		= &zero,
+		.extra1		= SYSCTL_ZERO,
 		.extra2		= &one_hundred,
 	},
 	{
@@ -1218,7 +1215,7 @@ static struct ctl_table kern_table[] = {
 		.maxlen		= sizeof(sysctl_perf_event_max_stack),
 		.mode		= 0644,
 		.proc_handler	= perf_event_max_stack_handler,
-		.extra1		= &zero,
+		.extra1		= SYSCTL_ZERO,
 		.extra2		= &six_hundred_forty_kb,
 	},
 	{
@@ -1227,7 +1224,7 @@ static struct ctl_table kern_table[] = {
 		.maxlen		= sizeof(sysctl_perf_event_max_contexts_per_stack),
 		.mode		= 0644,
 		.proc_handler	= perf_event_max_stack_handler,
-		.extra1		= &zero,
+		.extra1		= SYSCTL_ZERO,
 		.extra2		= &one_thousand,
 	},
 #endif
@@ -1237,8 +1234,8 @@ static struct ctl_table kern_table[] = {
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &zero,
-		.extra2		= &one,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE,
 	},
 #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
 	{
@@ -1247,8 +1244,8 @@ static struct ctl_table kern_table[] = {
 		.maxlen		= sizeof(unsigned int),
 		.mode		= 0644,
 		.proc_handler	= timer_migration_handler,
-		.extra1		= &zero,
-		.extra2		= &one,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE,
 	},
 #endif
 #ifdef CONFIG_BPF_SYSCALL
@@ -1259,8 +1256,8 @@ static struct ctl_table kern_table[] = {
 		.mode		= 0644,
 		/* only handle a transition from default "0" to "1" */
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &one,
-		.extra2		= &one,
+		.extra1		= SYSCTL_ONE,
+		.extra2		= SYSCTL_ONE,
 	},
 	{
 		.procname	= "bpf_stats_enabled",
@@ -1277,8 +1274,8 @@ static struct ctl_table kern_table[] = {
 		.maxlen		= sizeof(sysctl_panic_on_rcu_stall),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &zero,
-		.extra2		= &one,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE,
 	},
 #endif
 #ifdef CONFIG_STACKLEAK_RUNTIME_DISABLE
@@ -1288,8 +1285,8 @@ static struct ctl_table kern_table[] = {
 		.maxlen		= sizeof(int),
 		.mode		= 0600,
 		.proc_handler	= stack_erasing_sysctl,
-		.extra1		= &zero,
-		.extra2		= &one,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE,
 	},
 #endif
 	{ }
@@ -1302,7 +1299,7 @@ static struct ctl_table vm_table[] = {
 		.maxlen		= sizeof(sysctl_overcommit_memory),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &zero,
+		.extra1		= SYSCTL_ZERO,
 		.extra2		= &two,
 	},
 	{
@@ -1311,7 +1308,7 @@ static struct ctl_table vm_table[] = {
 		.maxlen		= sizeof(sysctl_panic_on_oom),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &zero,
+		.extra1		= SYSCTL_ZERO,
 		.extra2		= &two,
 	},
 	{
@@ -1348,7 +1345,7 @@ static struct ctl_table vm_table[] = {
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &zero,
+		.extra1		= SYSCTL_ZERO,
 	},
 	{
 		.procname	= "dirty_background_ratio",
@@ -1356,7 +1353,7 @@ static struct ctl_table vm_table[] = {
 		.maxlen		= sizeof(dirty_background_ratio),
 		.mode		= 0644,
 		.proc_handler	= dirty_background_ratio_handler,
-		.extra1		= &zero,
+		.extra1		= SYSCTL_ZERO,
 		.extra2		= &one_hundred,
 	},
 	{
@@ -1373,7 +1370,7 @@ static struct ctl_table vm_table[] = {
 		.maxlen		= sizeof(vm_dirty_ratio),
 		.mode		= 0644,
 		.proc_handler	= dirty_ratio_handler,
-		.extra1		= &zero,
+		.extra1		= SYSCTL_ZERO,
 		.extra2		= &one_hundred,
 	},
 	{
@@ -1397,7 +1394,7 @@ static struct ctl_table vm_table[] = {
 		.maxlen		= sizeof(dirty_expire_interval),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &zero,
+		.extra1		= SYSCTL_ZERO,
 	},
 	{
 		.procname	= "dirtytime_expire_seconds",
@@ -1405,7 +1402,7 @@ static struct ctl_table vm_table[] = {
 		.maxlen		= sizeof(dirtytime_expire_interval),
 		.mode		= 0644,
 		.proc_handler	= dirtytime_interval_handler,
-		.extra1		= &zero,
+		.extra1		= SYSCTL_ZERO,
 	},
 	{
 		.procname	= "swappiness",
@@ -1413,7 +1410,7 @@ static struct ctl_table vm_table[] = {
 		.maxlen		= sizeof(vm_swappiness),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &zero,
+		.extra1		= SYSCTL_ZERO,
 		.extra2		= &one_hundred,
 	},
 #ifdef CONFIG_HUGETLB_PAGE
@@ -1438,8 +1435,8 @@ static struct ctl_table vm_table[] = {
 		.maxlen			= sizeof(int),
 		.mode			= 0644,
 		.proc_handler	= sysctl_vm_numa_stat_handler,
-		.extra1			= &zero,
-		.extra2			= &one,
+		.extra1			= SYSCTL_ZERO,
+		.extra2			= SYSCTL_ONE,
 	},
 #endif
 	 {
@@ -1470,7 +1467,7 @@ static struct ctl_table vm_table[] = {
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= drop_caches_sysctl_handler,
-		.extra1		= &one,
+		.extra1		= SYSCTL_ONE,
 		.extra2		= &four,
 	},
 #ifdef CONFIG_COMPACTION
@@ -1496,8 +1493,8 @@ static struct ctl_table vm_table[] = {
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec,
-		.extra1		= &zero,
-		.extra2		= &one,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE,
 	},
 
 #endif /* CONFIG_COMPACTION */
@@ -1507,7 +1504,7 @@ static struct ctl_table vm_table[] = {
 		.maxlen		= sizeof(min_free_kbytes),
 		.mode		= 0644,
 		.proc_handler	= min_free_kbytes_sysctl_handler,
-		.extra1		= &zero,
+		.extra1		= SYSCTL_ZERO,
 	},
 	{
 		.procname	= "watermark_boost_factor",
@@ -1515,7 +1512,7 @@ static struct ctl_table vm_table[] = {
 		.maxlen		= sizeof(watermark_boost_factor),
 		.mode		= 0644,
 		.proc_handler	= watermark_boost_factor_sysctl_handler,
-		.extra1		= &zero,
+		.extra1		= SYSCTL_ZERO,
 	},
 	{
 		.procname	= "watermark_scale_factor",
@@ -1523,7 +1520,7 @@ static struct ctl_table vm_table[] = {
 		.maxlen		= sizeof(watermark_scale_factor),
 		.mode		= 0644,
 		.proc_handler	= watermark_scale_factor_sysctl_handler,
-		.extra1		= &one,
+		.extra1		= SYSCTL_ONE,
 		.extra2		= &one_thousand,
 	},
 	{
@@ -1532,7 +1529,7 @@ static struct ctl_table vm_table[] = {
 		.maxlen		= sizeof(percpu_pagelist_fraction),
 		.mode		= 0644,
 		.proc_handler	= percpu_pagelist_fraction_sysctl_handler,
-		.extra1		= &zero,
+		.extra1		= SYSCTL_ZERO,
 	},
 #ifdef CONFIG_MMU
 	{
@@ -1541,7 +1538,7 @@ static struct ctl_table vm_table[] = {
 		.maxlen		= sizeof(sysctl_max_map_count),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &zero,
+		.extra1		= SYSCTL_ZERO,
 	},
 #else
 	{
@@ -1550,7 +1547,7 @@ static struct ctl_table vm_table[] = {
 		.maxlen		= sizeof(sysctl_nr_trim_pages),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &zero,
+		.extra1		= SYSCTL_ZERO,
 	},
 #endif
 	{
@@ -1566,7 +1563,7 @@ static struct ctl_table vm_table[] = {
 		.maxlen		= sizeof(block_dump),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec,
-		.extra1		= &zero,
+		.extra1		= SYSCTL_ZERO,
 	},
 	{
 		.procname	= "vfs_cache_pressure",
@@ -1574,7 +1571,7 @@ static struct ctl_table vm_table[] = {
 		.maxlen		= sizeof(sysctl_vfs_cache_pressure),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec,
-		.extra1		= &zero,
+		.extra1		= SYSCTL_ZERO,
 	},
 #ifdef HAVE_ARCH_PICK_MMAP_LAYOUT
 	{
@@ -1583,7 +1580,7 @@ static struct ctl_table vm_table[] = {
 		.maxlen		= sizeof(sysctl_legacy_va_layout),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec,
-		.extra1		= &zero,
+		.extra1		= SYSCTL_ZERO,
 	},
 #endif
 #ifdef CONFIG_NUMA
@@ -1593,7 +1590,7 @@ static struct ctl_table vm_table[] = {
 		.maxlen		= sizeof(node_reclaim_mode),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec,
-		.extra1		= &zero,
+		.extra1		= SYSCTL_ZERO,
 	},
 	{
 		.procname	= "min_unmapped_ratio",
@@ -1601,7 +1598,7 @@ static struct ctl_table vm_table[] = {
 		.maxlen		= sizeof(sysctl_min_unmapped_ratio),
 		.mode		= 0644,
 		.proc_handler	= sysctl_min_unmapped_ratio_sysctl_handler,
-		.extra1		= &zero,
+		.extra1		= SYSCTL_ZERO,
 		.extra2		= &one_hundred,
 	},
 	{
@@ -1610,7 +1607,7 @@ static struct ctl_table vm_table[] = {
 		.maxlen		= sizeof(sysctl_min_slab_ratio),
 		.mode		= 0644,
 		.proc_handler	= sysctl_min_slab_ratio_sysctl_handler,
-		.extra1		= &zero,
+		.extra1		= SYSCTL_ZERO,
 		.extra2		= &one_hundred,
 	},
 #endif
@@ -1661,7 +1658,7 @@ static struct ctl_table vm_table[] = {
 #endif
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec,
-		.extra1		= &zero,
+		.extra1		= SYSCTL_ZERO,
 	},
 #endif
 #ifdef CONFIG_HIGHMEM
@@ -1671,8 +1668,8 @@ static struct ctl_table vm_table[] = {
 		.maxlen		= sizeof(vm_highmem_is_dirtyable),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &zero,
-		.extra2		= &one,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE,
 	},
 #endif
 #ifdef CONFIG_MEMORY_FAILURE
@@ -1682,8 +1679,8 @@ static struct ctl_table vm_table[] = {
 		.maxlen		= sizeof(sysctl_memory_failure_early_kill),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &zero,
-		.extra2		= &one,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE,
 	},
 	{
 		.procname	= "memory_failure_recovery",
@@ -1691,8 +1688,8 @@ static struct ctl_table vm_table[] = {
 		.maxlen		= sizeof(sysctl_memory_failure_recovery),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &zero,
-		.extra2		= &one,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE,
 	},
 #endif
 	{
@@ -1738,8 +1735,8 @@ static struct ctl_table vm_table[] = {
 		.maxlen		= sizeof(sysctl_unprivileged_userfaultfd),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &zero,
-		.extra2		= &one,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE,
 	},
 #endif
 	{ }
@@ -1875,8 +1872,8 @@ static struct ctl_table fs_table[] = {
 		.maxlen		= sizeof(int),
 		.mode		= 0600,
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &zero,
-		.extra2		= &one,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE,
 	},
 	{
 		.procname	= "protected_hardlinks",
@@ -1884,8 +1881,8 @@ static struct ctl_table fs_table[] = {
 		.maxlen		= sizeof(int),
 		.mode		= 0600,
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &zero,
-		.extra2		= &one,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE,
 	},
 	{
 		.procname	= "protected_fifos",
@@ -1893,7 +1890,7 @@ static struct ctl_table fs_table[] = {
 		.maxlen		= sizeof(int),
 		.mode		= 0600,
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &zero,
+		.extra1		= SYSCTL_ZERO,
 		.extra2		= &two,
 	},
 	{
@@ -1902,7 +1899,7 @@ static struct ctl_table fs_table[] = {
 		.maxlen		= sizeof(int),
 		.mode		= 0600,
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &zero,
+		.extra1		= SYSCTL_ZERO,
 		.extra2		= &two,
 	},
 	{
@@ -1911,7 +1908,7 @@ static struct ctl_table fs_table[] = {
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax_coredump,
-		.extra1		= &zero,
+		.extra1		= SYSCTL_ZERO,
 		.extra2		= &two,
 	},
 #if defined(CONFIG_BINFMT_MISC) || defined(CONFIG_BINFMT_MISC_MODULE)
@@ -1948,7 +1945,7 @@ static struct ctl_table fs_table[] = {
 		.maxlen		= sizeof(unsigned int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &one,
+		.extra1		= SYSCTL_ONE,
 	},
 	{ }
 };
@@ -1970,8 +1967,8 @@ static struct ctl_table debug_table[] = {
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= proc_kprobes_optimization_handler,
-		.extra1		= &zero,
-		.extra2		= &one,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE,
 	},
 #endif
 	{ }
@@ -3395,8 +3392,8 @@ int proc_do_static_key(struct ctl_table *table, int write,
 		.data   = &val,
 		.maxlen = sizeof(val),
 		.mode   = table->mode,
-		.extra1 = &zero,
-		.extra2 = &one,
+		.extra1 = SYSCTL_ZERO,
+		.extra2 = SYSCTL_ONE,
 	};
 
 	if (write && !capable(CAP_SYS_ADMIN))
diff --git a/kernel/ucount.c b/kernel/ucount.c
index feb128c7b5d9..a53cc2b4179c 100644
--- a/kernel/ucount.c
+++ b/kernel/ucount.c
@@ -52,16 +52,14 @@ static struct ctl_table_root set_root = {
 	.permissions = set_permissions,
 };
 
-static int zero = 0;
-static int int_max = INT_MAX;
 #define UCOUNT_ENTRY(name)				\
 	{						\
 		.procname	= name,			\
 		.maxlen		= sizeof(int),		\
 		.mode		= 0644,			\
 		.proc_handler	= proc_dointvec_minmax,	\
-		.extra1		= &zero,		\
-		.extra2		= &int_max,		\
+		.extra1		= SYSCTL_ZERO,		\
+		.extra2		= SYSCTL_INT_MAX,	\
 	}
 static struct ctl_table user_table[] = {
 	UCOUNT_ENTRY("max_user_namespaces"),
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index 742cea4ce72e..26da97359d5b 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -3374,8 +3374,6 @@ void neigh_app_ns(struct neighbour *n)
 EXPORT_SYMBOL(neigh_app_ns);
 
 #ifdef CONFIG_SYSCTL
-static int zero;
-static int int_max = INT_MAX;
 static int unres_qlen_max = INT_MAX / SKB_TRUESIZE(ETH_FRAME_LEN);
 
 static int proc_unres_qlen(struct ctl_table *ctl, int write,
@@ -3384,7 +3382,7 @@ static int proc_unres_qlen(struct ctl_table *ctl, int write,
 	int size, ret;
 	struct ctl_table tmp = *ctl;
 
-	tmp.extra1 = &zero;
+	tmp.extra1 = SYSCTL_ZERO;
 	tmp.extra2 = &unres_qlen_max;
 	tmp.data = &size;
 
@@ -3449,8 +3447,8 @@ static int neigh_proc_dointvec_zero_intmax(struct ctl_table *ctl, int write,
 	struct ctl_table tmp = *ctl;
 	int ret;
 
-	tmp.extra1 = &zero;
-	tmp.extra2 = &int_max;
+	tmp.extra1 = SYSCTL_ZERO;
+	tmp.extra2 = SYSCTL_INT_MAX;
 
 	ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
 	neigh_proc_update(ctl, write);
@@ -3595,24 +3593,24 @@ static struct neigh_sysctl_table {
 			.procname	= "gc_thresh1",
 			.maxlen		= sizeof(int),
 			.mode		= 0644,
-			.extra1 	= &zero,
-			.extra2		= &int_max,
+			.extra1		= SYSCTL_ZERO,
+			.extra2		= SYSCTL_INT_MAX,
 			.proc_handler	= proc_dointvec_minmax,
 		},
 		[NEIGH_VAR_GC_THRESH2] = {
 			.procname	= "gc_thresh2",
 			.maxlen		= sizeof(int),
 			.mode		= 0644,
-			.extra1 	= &zero,
-			.extra2		= &int_max,
+			.extra1		= SYSCTL_ZERO,
+			.extra2		= SYSCTL_INT_MAX,
 			.proc_handler	= proc_dointvec_minmax,
 		},
 		[NEIGH_VAR_GC_THRESH3] = {
 			.procname	= "gc_thresh3",
 			.maxlen		= sizeof(int),
 			.mode		= 0644,
-			.extra1 	= &zero,
-			.extra2		= &int_max,
+			.extra1		= SYSCTL_ZERO,
+			.extra2		= SYSCTL_INT_MAX,
 			.proc_handler	= proc_dointvec_minmax,
 		},
 		{},
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index f9204719aeee..8da5b3a54dac 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -22,8 +22,6 @@
 #include <net/busy_poll.h>
 #include <net/pkt_sched.h>
 
-static int zero = 0;
-static int one = 1;
 static int two __maybe_unused = 2;
 static int min_sndbuf = SOCK_MIN_SNDBUF;
 static int min_rcvbuf = SOCK_MIN_RCVBUF;
@@ -390,10 +388,10 @@ static struct ctl_table net_core_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax_bpf_enable,
 # ifdef CONFIG_BPF_JIT_ALWAYS_ON
-		.extra1		= &one,
-		.extra2		= &one,
+		.extra1		= SYSCTL_ONE,
+		.extra2		= SYSCTL_ONE,
 # else
-		.extra1		= &zero,
+		.extra1		= SYSCTL_ZERO,
 		.extra2		= &two,
 # endif
 	},
@@ -404,7 +402,7 @@ static struct ctl_table net_core_table[] = {
 		.maxlen		= sizeof(int),
 		.mode		= 0600,
 		.proc_handler	= proc_dointvec_minmax_bpf_restricted,
-		.extra1		= &zero,
+		.extra1		= SYSCTL_ZERO,
 		.extra2		= &two,
 	},
 	{
@@ -413,8 +411,8 @@ static struct ctl_table net_core_table[] = {
 		.maxlen		= sizeof(int),
 		.mode		= 0600,
 		.proc_handler	= proc_dointvec_minmax_bpf_restricted,
-		.extra1		= &zero,
-		.extra2		= &one,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE,
 	},
 # endif
 	{
@@ -461,8 +459,8 @@ static struct ctl_table net_core_table[] = {
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &zero,
-		.extra2		= &one
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE
 	},
 #ifdef CONFIG_RPS
 	{
@@ -493,7 +491,7 @@ static struct ctl_table net_core_table[] = {
 		.maxlen		= sizeof(unsigned int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &zero,
+		.extra1		= SYSCTL_ZERO,
 	},
 	{
 		.procname	= "busy_read",
@@ -501,7 +499,7 @@ static struct ctl_table net_core_table[] = {
 		.maxlen		= sizeof(unsigned int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &zero,
+		.extra1		= SYSCTL_ZERO,
 	},
 #endif
 #ifdef CONFIG_NET_SCHED
@@ -533,7 +531,7 @@ static struct ctl_table net_core_table[] = {
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &one,
+		.extra1		= SYSCTL_ONE,
 		.extra2		= &max_skb_frags,
 	},
 	{
@@ -542,7 +540,7 @@ static struct ctl_table net_core_table[] = {
 		.maxlen		= sizeof(unsigned int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &zero,
+		.extra1		= SYSCTL_ZERO,
 	},
 	{
 		.procname	= "fb_tunnels_only_for_init_net",
@@ -550,8 +548,8 @@ static struct ctl_table net_core_table[] = {
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &zero,
-		.extra2		= &one,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE,
 	},
 	{
 		.procname	= "devconf_inherit_init_net",
@@ -559,7 +557,7 @@ static struct ctl_table net_core_table[] = {
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &zero,
+		.extra1		= SYSCTL_ZERO,
 		.extra2		= &two,
 	},
 	{
@@ -578,7 +576,7 @@ static struct ctl_table netns_core_table[] = {
 		.data		= &init_net.core.sysctl_somaxconn,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.extra1		= &zero,
+		.extra1		= SYSCTL_ZERO,
 		.proc_handler	= proc_dointvec_minmax
 	},
 	{ }
diff --git a/net/dccp/sysctl.c b/net/dccp/sysctl.c
index b59040f268a9..ee8d4f5afa72 100644
--- a/net/dccp/sysctl.c
+++ b/net/dccp/sysctl.c
@@ -16,9 +16,7 @@
 #endif
 
 /* Boundary values */
-static int		zero     = 0,
-			one      = 1,
-			u8_max   = 0xFF;
+static int		u8_max   = 0xFF;
 static unsigned long	seqw_min = DCCPF_SEQ_WMIN,
 			seqw_max = 0xFFFFFFFF;		/* maximum on 32 bit */
 
@@ -38,7 +36,7 @@ static struct ctl_table dccp_default_table[] = {
 		.maxlen		= sizeof(sysctl_dccp_rx_ccid),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &zero,
+		.extra1		= SYSCTL_ZERO,
 		.extra2		= &u8_max,		/* RFC 4340, 10. */
 	},
 	{
@@ -47,7 +45,7 @@ static struct ctl_table dccp_default_table[] = {
 		.maxlen		= sizeof(sysctl_dccp_tx_ccid),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &zero,
+		.extra1		= SYSCTL_ZERO,
 		.extra2		= &u8_max,		/* RFC 4340, 10. */
 	},
 	{
@@ -56,7 +54,7 @@ static struct ctl_table dccp_default_table[] = {
 		.maxlen		= sizeof(sysctl_dccp_request_retries),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &one,
+		.extra1		= SYSCTL_ONE,
 		.extra2		= &u8_max,
 	},
 	{
@@ -65,7 +63,7 @@ static struct ctl_table dccp_default_table[] = {
 		.maxlen		= sizeof(sysctl_dccp_retries1),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &zero,
+		.extra1		= SYSCTL_ZERO,
 		.extra2		= &u8_max,
 	},
 	{
@@ -74,7 +72,7 @@ static struct ctl_table dccp_default_table[] = {
 		.maxlen		= sizeof(sysctl_dccp_retries2),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &zero,
+		.extra1		= SYSCTL_ZERO,
 		.extra2		= &u8_max,
 	},
 	{
@@ -83,7 +81,7 @@ static struct ctl_table dccp_default_table[] = {
 		.maxlen		= sizeof(sysctl_dccp_tx_qlen),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &zero,
+		.extra1		= SYSCTL_ZERO,
 	},
 	{
 		.procname	= "sync_ratelimit",
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 7d66306b5f39..0b980e841927 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -28,8 +28,6 @@
 #include <net/protocol.h>
 #include <net/netevent.h>
 
-static int zero;
-static int one = 1;
 static int two = 2;
 static int four = 4;
 static int thousand = 1000;
@@ -576,7 +574,7 @@ static struct ctl_table ipv4_table[] = {
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &zero,
+		.extra1		= SYSCTL_ZERO,
 	},
 	{
 		.procname	= "icmp_msgs_burst",
@@ -584,7 +582,7 @@ static struct ctl_table ipv4_table[] = {
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &zero,
+		.extra1		= SYSCTL_ZERO,
 	},
 	{
 		.procname	= "udp_mem",
@@ -674,8 +672,8 @@ static struct ctl_table ipv4_net_table[] = {
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &zero,
-		.extra2		= &one,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE,
 	},
 #endif
 	{
@@ -763,8 +761,8 @@ static struct ctl_table ipv4_net_table[] = {
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler   = ipv4_fwd_update_priority,
-		.extra1		= &zero,
-		.extra2		= &one,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE,
 	},
 	{
 		.procname	= "ip_nonlocal_bind",
@@ -794,8 +792,8 @@ static struct ctl_table ipv4_net_table[] = {
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &zero,
-		.extra2		= &one,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE,
 	},
 #endif
 	{
@@ -864,7 +862,7 @@ static struct ctl_table ipv4_net_table[] = {
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &one
+		.extra1		= SYSCTL_ONE
 	},
 #endif
 	{
@@ -969,7 +967,7 @@ static struct ctl_table ipv4_net_table[] = {
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &zero,
+		.extra1		= SYSCTL_ZERO,
 		.extra2		= &two,
 	},
 	{
@@ -1011,7 +1009,7 @@ static struct ctl_table ipv4_net_table[] = {
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= proc_tfo_blackhole_detect_timeout,
-		.extra1		= &zero,
+		.extra1		= SYSCTL_ZERO,
 	},
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
 	{
@@ -1020,8 +1018,8 @@ static struct ctl_table ipv4_net_table[] = {
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &zero,
-		.extra2		= &one,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE,
 	},
 	{
 		.procname	= "fib_multipath_hash_policy",
@@ -1029,8 +1027,8 @@ static struct ctl_table ipv4_net_table[] = {
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= proc_fib_multipath_hash_policy,
-		.extra1		= &zero,
+		.extra1		= SYSCTL_ZERO,
 		.extra2		= &two,
 	},
 #endif
 	{
@@ -1047,8 +1045,8 @@ static struct ctl_table ipv4_net_table[] = {
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &zero,
-		.extra2		= &one,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE,
 	},
 #endif
 	{
@@ -1078,7 +1076,7 @@ static struct ctl_table ipv4_net_table[] = {
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &zero,
+		.extra1		= SYSCTL_ZERO,
 		.extra2		= &four,
 	},
 	{
@@ -1222,7 +1220,7 @@ static struct ctl_table ipv4_net_table[] = {
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &one,
+		.extra1		= SYSCTL_ONE,
 		.extra2		= &gso_max_segs,
 	},
 	{
@@ -1231,7 +1229,7 @@ static struct ctl_table ipv4_net_table[] = {
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &zero,
+		.extra1		= SYSCTL_ZERO,
 		.extra2		= &one_day_secs
 	},
 	{
@@ -1240,8 +1238,8 @@ static struct ctl_table ipv4_net_table[] = {
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &zero,
-		.extra2		= &one,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE,
 	},
 	{
 		.procname	= "tcp_invalid_ratelimit",
@@ -1256,7 +1254,7 @@ static struct ctl_table ipv4_net_table[] = {
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &zero,
+		.extra1		= SYSCTL_ZERO,
 		.extra2		= &thousand,
 	},
 	{
@@ -1265,7 +1263,7 @@ static struct ctl_table ipv4_net_table[] = {
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &zero,
+		.extra1		= SYSCTL_ZERO,
 		.extra2		= &thousand,
 	},
 	{
@@ -1274,7 +1272,7 @@ static struct ctl_table ipv4_net_table[] = {
 		.maxlen		= sizeof(init_net.ipv4.sysctl_tcp_wmem),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &one,
+		.extra1		= SYSCTL_ONE,
 	},
 	{
 		.procname	= "tcp_rmem",
@@ -1282,7 +1280,7 @@ static struct ctl_table ipv4_net_table[] = {
 		.maxlen		= sizeof(init_net.ipv4.sysctl_tcp_rmem),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &one,
+		.extra1		= SYSCTL_ONE,
 	},
 	{
 		.procname	= "tcp_comp_sack_delay_ns",
@@ -1297,7 +1295,7 @@ static struct ctl_table ipv4_net_table[] = {
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &zero,
+		.extra1		= SYSCTL_ZERO,
 		.extra2		= &comp_sack_nr_max,
 	},
 	{
@@ -1306,7 +1304,7 @@ static struct ctl_table ipv4_net_table[] = {
 		.maxlen		= sizeof(init_net.ipv4.sysctl_udp_rmem_min),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &one
+		.extra1		= SYSCTL_ONE
 	},
 	{
 		.procname	= "udp_wmem_min",
@@ -1314,7 +1312,7 @@ static struct ctl_table ipv4_net_table[] = {
 		.maxlen		= sizeof(init_net.ipv4.sysctl_udp_wmem_min),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &one
+		.extra1		= SYSCTL_ONE
 	},
 	{ }
 };
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 521e3203e83a..dc73888c7859 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -6432,8 +6432,6 @@ int addrconf_sysctl_disable_policy(struct ctl_table *ctl, int write,
 }
 
 static int minus_one = -1;
-static const int zero = 0;
-static const int one = 1;
 static const int two_five_five = 255;
 
 static const struct ctl_table addrconf_sysctl[] = {
@@ -6450,7 +6448,7 @@ static const struct ctl_table addrconf_sysctl[] = {
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= (void *)&one,
+		.extra1		= (void *)SYSCTL_ONE,
 		.extra2		= (void *)&two_five_five,
 	},
 	{
@@ -6809,7 +6807,7 @@ static const struct ctl_table addrconf_sysctl[] = {
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= (void *)&zero,
+		.extra1		= (void *)SYSCTL_ZERO,
 		.extra2		= (void *)&two_five_five,
 	},
 	{
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 4d2e6b31a8d6..8b0c33fb19a2 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -6031,9 +6031,6 @@ int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
 	return 0;
 }
 
-static int zero;
-static int one = 1;
-
 static struct ctl_table ipv6_route_table_template[] = {
 	{
 		.procname	=	"flush",
@@ -6111,8 +6108,8 @@ static struct ctl_table ipv6_route_table_template[] = {
 		.maxlen		=	sizeof(int),
 		.mode		=	0644,
 		.proc_handler	=	proc_dointvec_minmax,
-		.extra1		=	&zero,
-		.extra2		=	&one,
+		.extra1		=	SYSCTL_ZERO,
+		.extra2		=	SYSCTL_ONE,
 	},
 	{ }
 };
diff --git a/net/ipv6/sysctl_net_ipv6.c b/net/ipv6/sysctl_net_ipv6.c
index dc4c91e0bfb8..ec8fcfc60a27 100644
--- a/net/ipv6/sysctl_net_ipv6.c
+++ b/net/ipv6/sysctl_net_ipv6.c
@@ -21,8 +21,6 @@
 #include <net/calipso.h>
 #endif
 
-static int zero;
-static int one = 1;
 static int flowlabel_reflect_max = 0x7;
 static int auto_flowlabels_min;
 static int auto_flowlabels_max = IP6_AUTO_FLOW_LABEL_MAX;
@@ -115,7 +113,7 @@ static struct ctl_table ipv6_table_template[] = {
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &zero,
+		.extra1		= SYSCTL_ZERO,
 		.extra2		= &flowlabel_reflect_max,
 	},
 	{
@@ -152,8 +150,8 @@ static struct ctl_table ipv6_table_template[] = {
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler   = proc_rt6_multipath_hash_policy,
-		.extra1		= &zero,
-		.extra2		= &one,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE,
 	},
 	{
 		.procname	= "seg6_flowlabel",
@@ -179,7 +177,7 @@ static struct ctl_table ipv6_rotable[] = {
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &one
+		.extra1		= SYSCTL_ONE
 	},
 #ifdef CONFIG_NETLABEL
 	{
diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c
index 198ec4fe4148..c312741df2ce 100644
--- a/net/mpls/af_mpls.c
+++ b/net/mpls/af_mpls.c
@@ -37,8 +37,6 @@
 
 #define MPLS_NEIGH_TABLE_UNSPEC (NEIGH_LINK_TABLE + 1)
 
-static int zero = 0;
-static int one = 1;
 static int label_limit = (1 << 20) - 1;
 static int ttl_max = 255;
 
@@ -2607,7 +2605,7 @@ static int mpls_platform_labels(struct ctl_table *table, int write,
 		.data		= &platform_labels,
 		.maxlen		= sizeof(int),
 		.mode		= table->mode,
-		.extra1		= &zero,
+		.extra1		= SYSCTL_ZERO,
 		.extra2		= &label_limit,
 	};
 
@@ -2636,8 +2634,8 @@ static const struct ctl_table mpls_table[] = {
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &zero,
-		.extra2		= &one,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE,
 	},
 	{
 		.procname	= "default_ttl",
@@ -2645,7 +2643,7 @@ static const struct ctl_table mpls_table[] = {
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &one,
+		.extra1		= SYSCTL_ONE,
 		.extra2		= &ttl_max,
 	},
 	{ }
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index 07e0967bf129..060565e7d227 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -1726,7 +1726,6 @@ static int ip_vs_zero_all(struct netns_ipvs *ipvs)
 
 #ifdef CONFIG_SYSCTL
 
-static int zero;
 static int three = 3;
 
 static int
@@ -1935,7 +1934,7 @@ static struct ctl_table vs_vars[] = {
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &zero,
+		.extra1		= SYSCTL_ZERO,
 		.extra2		= &three,
 	},
 	{
diff --git a/net/rxrpc/sysctl.c b/net/rxrpc/sysctl.c
index 1e3fa67d91aa..2bbb38161851 100644
--- a/net/rxrpc/sysctl.c
+++ b/net/rxrpc/sysctl.c
@@ -11,7 +11,6 @@
 #include "ar-internal.h"
 
 static struct ctl_table_header *rxrpc_sysctl_reg_table;
-static const unsigned int one = 1;
 static const unsigned int four = 4;
 static const unsigned int thirtytwo = 32;
 static const unsigned int n_65535 = 65535;
@@ -97,7 +96,7 @@ static struct ctl_table rxrpc_sysctl_table[] = {
 		.maxlen		= sizeof(unsigned int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= (void *)&one,
+		.extra1		= (void *)SYSCTL_ONE,
 		.extra2		= (void *)&rxrpc_max_client_connections,
 	},
 	{
@@ -115,7 +114,7 @@ static struct ctl_table rxrpc_sysctl_table[] = {
 		.maxlen		= sizeof(unsigned int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= (void *)&one,
+		.extra1		= (void *)SYSCTL_ONE,
 		.extra2		= (void *)&n_max_acks,
 	},
 	{
@@ -124,7 +123,7 @@ static struct ctl_table rxrpc_sysctl_table[] = {
 		.maxlen		= sizeof(unsigned int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= (void *)&one,
+		.extra1		= (void *)SYSCTL_ONE,
 		.extra2		= (void *)&n_65535,
 	},
 	{
@@ -133,7 +132,7 @@ static struct ctl_table rxrpc_sysctl_table[] = {
 		.maxlen		= sizeof(unsigned int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= (void *)&one,
+		.extra1		= (void *)SYSCTL_ONE,
 		.extra2		= (void *)&four,
 	},
 
diff --git a/net/sctp/sysctl.c b/net/sctp/sysctl.c
index 9a19147902f1..1250751bca1b 100644
--- a/net/sctp/sysctl.c
+++ b/net/sctp/sysctl.c
@@ -25,10 +25,7 @@
 #include <net/sctp/sctp.h>
 #include <linux/sysctl.h>
 
-static int zero = 0;
-static int one = 1;
 static int timer_max = 86400000; /* ms in one day */
-static int int_max = INT_MAX;
 static int sack_timer_min = 1;
 static int sack_timer_max = 500;
 static int addr_scope_max = SCTP_SCOPE_POLICY_MAX;
@@ -92,7 +89,7 @@ static struct ctl_table sctp_net_table[] = {
 		.maxlen		= sizeof(unsigned int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1         = &one,
+		.extra1         = SYSCTL_ONE,
 		.extra2         = &timer_max
 	},
 	{
@@ -101,7 +98,7 @@ static struct ctl_table sctp_net_table[] = {
 		.maxlen		= sizeof(unsigned int),
 		.mode		= 0644,
 		.proc_handler	= proc_sctp_do_rto_min,
-		.extra1         = &one,
+		.extra1         = SYSCTL_ONE,
 		.extra2         = &init_net.sctp.rto_max
 	},
 	{
@@ -137,8 +134,8 @@ static struct ctl_table sctp_net_table[] = {
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &zero,
-		.extra2		= &int_max
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_INT_MAX,
 	},
 	{
 		.procname	= "cookie_preserve_enable",
@@ -160,7 +157,7 @@ static struct ctl_table sctp_net_table[] = {
 		.maxlen		= sizeof(unsigned int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1         = &one,
+		.extra1         = SYSCTL_ONE,
 		.extra2         = &timer_max
 	},
 	{
@@ -178,7 +175,7 @@ static struct ctl_table sctp_net_table[] = {
 		.maxlen		= sizeof(unsigned int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1         = &one,
+		.extra1         = SYSCTL_ONE,
 		.extra2         = &timer_max
 	},
 	{
@@ -187,8 +184,8 @@ static struct ctl_table sctp_net_table[] = {
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &one,
-		.extra2		= &int_max
+		.extra1		= SYSCTL_ONE,
+		.extra2		= SYSCTL_INT_MAX,
 	},
 	{
 		.procname	= "path_max_retrans",
@@ -196,8 +193,8 @@ static struct ctl_table sctp_net_table[] = {
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &one,
-		.extra2		= &int_max
+		.extra1		= SYSCTL_ONE,
+		.extra2		= SYSCTL_INT_MAX,
 	},
 	{
 		.procname	= "max_init_retransmits",
@@ -205,8 +202,8 @@ static struct ctl_table sctp_net_table[] = {
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &one,
-		.extra2		= &int_max
+		.extra1		= SYSCTL_ONE,
+		.extra2		= SYSCTL_INT_MAX,
 	},
 	{
 		.procname	= "pf_retrans",
@@ -214,8 +211,8 @@ static struct ctl_table sctp_net_table[] = {
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &zero,
-		.extra2		= &int_max
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_INT_MAX,
 	},
 	{
 		.procname	= "sndbuf_policy",
@@ -286,7 +283,7 @@ static struct ctl_table sctp_net_table[] = {
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &zero,
+		.extra1		= SYSCTL_ZERO,
 		.extra2		= &addr_scope_max,
 	},
 	{
@@ -295,7 +292,7 @@ static struct ctl_table sctp_net_table[] = {
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= &proc_dointvec_minmax,
-		.extra1		= &one,
+		.extra1		= SYSCTL_ONE,
 		.extra2		= &rwnd_scale_max,
 	},
 	{
diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
index 1f73a6a7e43c..ffb1684c4573 100644
--- a/net/sunrpc/xprtrdma/transport.c
+++ b/net/sunrpc/xprtrdma/transport.c
@@ -80,7 +80,6 @@ static unsigned int min_slot_table_size = RPCRDMA_MIN_SLOT_TABLE;
 static unsigned int max_slot_table_size = RPCRDMA_MAX_SLOT_TABLE;
 static unsigned int min_inline_size = RPCRDMA_MIN_INLINE;
 static unsigned int max_inline_size = RPCRDMA_MAX_INLINE;
-static unsigned int zero;
 static unsigned int max_padding = PAGE_SIZE;
 static unsigned int min_memreg = RPCRDMA_BOUNCEBUFFERS;
 static unsigned int max_memreg = RPCRDMA_LAST - 1;
@@ -122,7 +121,7 @@ static struct ctl_table xr_tunables_table[] = {
 		.maxlen		= sizeof(unsigned int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &zero,
+		.extra1		= SYSCTL_ZERO,
 		.extra2		= &max_padding,
 	},
 	{
diff --git a/net/tipc/sysctl.c b/net/tipc/sysctl.c
index 9df82a573aa7..6159d327db76 100644
--- a/net/tipc/sysctl.c
+++ b/net/tipc/sysctl.c
@@ -38,8 +38,6 @@
 
 #include <linux/sysctl.h>
 
-static int zero;
-static int one = 1;
 static struct ctl_table_header *tipc_ctl_hdr;
 
 static struct ctl_table tipc_table[] = {
@@ -49,7 +47,7 @@ static struct ctl_table tipc_table[] = {
 		.maxlen		= sizeof(sysctl_tipc_rmem),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1         = &one,
+		.extra1         = SYSCTL_ONE,
 	},
 	{
 		.procname	= "named_timeout",
@@ -57,7 +55,7 @@ static struct ctl_table tipc_table[] = {
 		.maxlen		= sizeof(sysctl_tipc_named_timeout),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1         = &zero,
+		.extra1         = SYSCTL_ZERO,
 	},
 	{
 		.procname       = "sk_filter",
diff --git a/security/keys/sysctl.c b/security/keys/sysctl.c
index dd1e21fab827..b46b651b3c4c 100644
--- a/security/keys/sysctl.c
+++ b/security/keys/sysctl.c
@@ -9,8 +9,6 @@
 #include <linux/sysctl.h>
 #include "internal.h"
 
-static const int zero, one = 1, max = INT_MAX;
-
 struct ctl_table key_sysctls[] = {
 	{
 		.procname = "maxkeys",
@@ -18,8 +16,8 @@ struct ctl_table key_sysctls[] = {
 		.maxlen = sizeof(unsigned),
 		.mode = 0644,
 		.proc_handler = proc_dointvec_minmax,
-		.extra1 = (void *) &one,
-		.extra2 = (void *) &max,
+		.extra1 = (void *) SYSCTL_ONE,
+		.extra2 = (void *) SYSCTL_INT_MAX,
 	},
 	{
 		.procname = "maxbytes",
@@ -27,8 +25,8 @@ struct ctl_table key_sysctls[] = {
 		.maxlen = sizeof(unsigned),
 		.mode = 0644,
 		.proc_handler = proc_dointvec_minmax,
-		.extra1 = (void *) &one,
-		.extra2 = (void *) &max,
+		.extra1 = (void *) SYSCTL_ONE,
+		.extra2 = (void *) SYSCTL_INT_MAX,
 	},
 	{
 		.procname = "root_maxkeys",
@@ -36,8 +34,8 @@ struct ctl_table key_sysctls[] = {
 		.maxlen = sizeof(unsigned),
 		.mode = 0644,
 		.proc_handler = proc_dointvec_minmax,
-		.extra1 = (void *) &one,
-		.extra2 = (void *) &max,
+		.extra1 = (void *) SYSCTL_ONE,
+		.extra2 = (void *) SYSCTL_INT_MAX,
 	},
 	{
 		.procname = "root_maxbytes",
@@ -45,8 +43,8 @@ struct ctl_table key_sysctls[] = {
 		.maxlen = sizeof(unsigned),
 		.mode = 0644,
 		.proc_handler = proc_dointvec_minmax,
-		.extra1 = (void *) &one,
-		.extra2 = (void *) &max,
+		.extra1 = (void *) SYSCTL_ONE,
+		.extra2 = (void *) SYSCTL_INT_MAX,
 	},
 	{
 		.procname = "gc_delay",
@@ -54,8 +52,8 @@ struct ctl_table key_sysctls[] = {
 		.maxlen = sizeof(unsigned),
 		.mode = 0644,
 		.proc_handler = proc_dointvec_minmax,
-		.extra1 = (void *) &zero,
-		.extra2 = (void *) &max,
+		.extra1 = (void *) SYSCTL_ZERO,
+		.extra2 = (void *) SYSCTL_INT_MAX,
 	},
 #ifdef CONFIG_PERSISTENT_KEYRINGS
 	{
@@ -64,8 +62,8 @@ struct ctl_table key_sysctls[] = {
 		.maxlen = sizeof(unsigned),
 		.mode = 0644,
 		.proc_handler = proc_dointvec_minmax,
-		.extra1 = (void *) &zero,
-		.extra2 = (void *) &max,
+		.extra1 = (void *) SYSCTL_ZERO,
+		.extra2 = (void *) SYSCTL_INT_MAX,
 	},
 #endif
 	{ }
diff --git a/security/loadpin/loadpin.c b/security/loadpin/loadpin.c
index 81519c804888..ee5cb944f4ad 100644
--- a/security/loadpin/loadpin.c
+++ b/security/loadpin/loadpin.c
@@ -43,8 +43,6 @@ static struct super_block *pinned_root;
 static DEFINE_SPINLOCK(pinned_root_spinlock);
 
 #ifdef CONFIG_SYSCTL
-static int zero;
-static int one = 1;
 
 static struct ctl_path loadpin_sysctl_path[] = {
 	{ .procname = "kernel", },
@@ -59,8 +57,8 @@ static struct ctl_table loadpin_sysctl_table[] = {
 		.maxlen         = sizeof(int),
 		.mode           = 0644,
 		.proc_handler   = proc_dointvec_minmax,
-		.extra1         = &zero,
-		.extra2         = &one,
+		.extra1         = SYSCTL_ZERO,
+		.extra2         = SYSCTL_ONE,
 	},
 	{ }
 };
diff --git a/security/yama/yama_lsm.c b/security/yama/yama_lsm.c
index 01c6239c4493..94dc346370b1 100644
--- a/security/yama/yama_lsm.c
+++ b/security/yama/yama_lsm.c
@@ -445,7 +445,6 @@ static int yama_dointvec_minmax(struct ctl_table *table, int write,
 	return proc_dointvec_minmax(&table_copy, write, buffer, lenp, ppos);
 }
 
-static int zero;
 static int max_scope = YAMA_SCOPE_NO_ATTACH;
 
 static struct ctl_path yama_sysctl_path[] = {
@@ -461,7 +460,7 @@ static struct ctl_table yama_sysctl_table[] = {
 		.maxlen         = sizeof(int),
 		.mode           = 0644,
 		.proc_handler   = yama_dointvec_minmax,
-		.extra1         = &zero,
+		.extra1         = SYSCTL_ZERO,
 		.extra2         = &max_scope,
 	},
 	{ }
-- 
cgit v1.2.3


From 0c5f81dad46c90792e6c3c4797131323c9e96dcd Mon Sep 17 00:00:00 2001
From: Wanpeng Li <wanpengli@tencent.com>
Date: Sat, 6 Jul 2019 09:26:51 +0800
Subject: KVM: LAPIC: Inject timer interrupt via posted interrupt
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Dedicated instances are currently disturbed by unnecessary jitter due
to the emulated lapic timers firing on the same pCPUs where the
vCPUs reside.  Unlike ARM, Intel has no hardware virtual timer for
guests, so both programming the timer in the guest and the firing of the
emulated timer incur vmexits.  This patch tries to avoid the vmexit when
the emulated timer fires, at least in the dedicated-instance scenario
when nohz_full is enabled.

In that case, the emulated timers can be offloaded to the nearest busy
housekeeping CPUs, since APICv has been available in server processors
for several years. The guest timer interrupt can then be injected via
posted interrupts, which are delivered by the housekeeping CPU once the
emulated timer fires.

The host should be tuned so that vCPUs are placed on isolated physical
processors, with several surplus pCPUs left for busy housekeeping.
If the mwait/hlt/pause vmexits are disabled so that the vCPUs stay in
non-root mode, a ~3% redis performance benefit can be observed on a
Skylake server, and the number of external interrupt vmexits drops
substantially.  Without the patch:

            VM-EXIT  Samples  Samples%  Time%   Min Time  Max Time   Avg time
EXTERNAL_INTERRUPT    42916    49.43%   39.30%   0.47us   106.09us   0.71us ( +-   1.09% )

While with patch:

            VM-EXIT  Samples  Samples%  Time%   Min Time  Max Time         Avg time
EXTERNAL_INTERRUPT    6871     9.29%     2.96%   0.44us    57.88us   0.72us ( +-   4.02% )

Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Radim Krčmář <rkrcmar@redhat.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Wanpeng Li <wanpengli@tencent.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/lapic.c            | 99 ++++++++++++++++++++++++++---------------
 arch/x86/kvm/lapic.h            |  1 +
 arch/x86/kvm/vmx/vmx.c          |  3 +-
 arch/x86/kvm/x86.c              |  6 +++
 arch/x86/kvm/x86.h              |  2 +
 include/linux/sched/isolation.h |  6 +++
 kernel/sched/isolation.c        |  6 +++
 7 files changed, 87 insertions(+), 36 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 32b80ecc0ac5..0aa158657f20 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -118,6 +118,17 @@ static inline u32 kvm_x2apic_id(struct kvm_lapic *apic)
 	return apic->vcpu->vcpu_id;
 }
 
+bool kvm_can_post_timer_interrupt(struct kvm_vcpu *vcpu)
+{
+	return pi_inject_timer && kvm_vcpu_apicv_active(vcpu);
+}
+EXPORT_SYMBOL_GPL(kvm_can_post_timer_interrupt);
+
+static bool kvm_use_posted_timer_interrupt(struct kvm_vcpu *vcpu)
+{
+	return kvm_can_post_timer_interrupt(vcpu) && vcpu->mode == IN_GUEST_MODE;
+}
+
 static inline bool kvm_apic_map_get_logical_dest(struct kvm_apic_map *map,
 		u32 dest_id, struct kvm_lapic ***cluster, u16 *mask) {
 	switch (map->mode) {
@@ -1421,29 +1432,6 @@ static void apic_update_lvtt(struct kvm_lapic *apic)
 	}
 }
 
-static void apic_timer_expired(struct kvm_lapic *apic)
-{
-	struct kvm_vcpu *vcpu = apic->vcpu;
-	struct swait_queue_head *q = &vcpu->wq;
-	struct kvm_timer *ktimer = &apic->lapic_timer;
-
-	if (atomic_read(&apic->lapic_timer.pending))
-		return;
-
-	atomic_inc(&apic->lapic_timer.pending);
-	kvm_set_pending_timer(vcpu);
-
-	/*
-	 * For x86, the atomic_inc() is serialized, thus
-	 * using swait_active() is safe.
-	 */
-	if (swait_active(q))
-		swake_up_one(q);
-
-	if (apic_lvtt_tscdeadline(apic) || ktimer->hv_timer_in_use)
-		ktimer->expired_tscdeadline = ktimer->tscdeadline;
-}
-
 /*
  * On APICv, this test will cause a busy wait
  * during a higher-priority task.
@@ -1517,7 +1505,7 @@ static inline void adjust_lapic_timer_advance(struct kvm_vcpu *vcpu,
 	apic->lapic_timer.timer_advance_ns = timer_advance_ns;
 }
 
-void kvm_wait_lapic_expire(struct kvm_vcpu *vcpu)
+static void __kvm_wait_lapic_expire(struct kvm_vcpu *vcpu)
 {
 	struct kvm_lapic *apic = vcpu->arch.apic;
 	u64 guest_tsc, tsc_deadline;
@@ -1525,9 +1513,6 @@ void kvm_wait_lapic_expire(struct kvm_vcpu *vcpu)
 	if (apic->lapic_timer.expired_tscdeadline == 0)
 		return;
 
-	if (!lapic_timer_int_injected(vcpu))
-		return;
-
 	tsc_deadline = apic->lapic_timer.expired_tscdeadline;
 	apic->lapic_timer.expired_tscdeadline = 0;
 	guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc());
@@ -1539,8 +1524,57 @@ void kvm_wait_lapic_expire(struct kvm_vcpu *vcpu)
 	if (unlikely(!apic->lapic_timer.timer_advance_adjust_done))
 		adjust_lapic_timer_advance(vcpu, apic->lapic_timer.advance_expire_delta);
 }
+
+void kvm_wait_lapic_expire(struct kvm_vcpu *vcpu)
+{
+	if (lapic_timer_int_injected(vcpu))
+		__kvm_wait_lapic_expire(vcpu);
+}
 EXPORT_SYMBOL_GPL(kvm_wait_lapic_expire);
 
+static void kvm_apic_inject_pending_timer_irqs(struct kvm_lapic *apic)
+{
+	struct kvm_timer *ktimer = &apic->lapic_timer;
+
+	kvm_apic_local_deliver(apic, APIC_LVTT);
+	if (apic_lvtt_tscdeadline(apic))
+		ktimer->tscdeadline = 0;
+	if (apic_lvtt_oneshot(apic)) {
+		ktimer->tscdeadline = 0;
+		ktimer->target_expiration = 0;
+	}
+}
+
+static void apic_timer_expired(struct kvm_lapic *apic)
+{
+	struct kvm_vcpu *vcpu = apic->vcpu;
+	struct swait_queue_head *q = &vcpu->wq;
+	struct kvm_timer *ktimer = &apic->lapic_timer;
+
+	if (atomic_read(&apic->lapic_timer.pending))
+		return;
+
+	if (apic_lvtt_tscdeadline(apic) || ktimer->hv_timer_in_use)
+		ktimer->expired_tscdeadline = ktimer->tscdeadline;
+
+	if (kvm_use_posted_timer_interrupt(apic->vcpu)) {
+		if (apic->lapic_timer.timer_advance_ns)
+			__kvm_wait_lapic_expire(vcpu);
+		kvm_apic_inject_pending_timer_irqs(apic);
+		return;
+	}
+
+	atomic_inc(&apic->lapic_timer.pending);
+	kvm_set_pending_timer(vcpu);
+
+	/*
+	 * For x86, the atomic_inc() is serialized, thus
+	 * using swait_active() is safe.
+	 */
+	if (swait_active(q))
+		swake_up_one(q);
+}
+
 static void start_sw_tscdeadline(struct kvm_lapic *apic)
 {
 	struct kvm_timer *ktimer = &apic->lapic_timer;
@@ -2325,13 +2359,7 @@ void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu)
 	struct kvm_lapic *apic = vcpu->arch.apic;
 
 	if (atomic_read(&apic->lapic_timer.pending) > 0) {
-		kvm_apic_local_deliver(apic, APIC_LVTT);
-		if (apic_lvtt_tscdeadline(apic))
-			apic->lapic_timer.tscdeadline = 0;
-		if (apic_lvtt_oneshot(apic)) {
-			apic->lapic_timer.tscdeadline = 0;
-			apic->lapic_timer.target_expiration = 0;
-		}
+		kvm_apic_inject_pending_timer_irqs(apic);
 		atomic_set(&apic->lapic_timer.pending, 0);
 	}
 }
@@ -2453,7 +2481,8 @@ void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu)
 {
 	struct hrtimer *timer;
 
-	if (!lapic_in_kernel(vcpu))
+	if (!lapic_in_kernel(vcpu) ||
+		kvm_can_post_timer_interrupt(vcpu))
 		return;
 
 	timer = &vcpu->arch.apic->lapic_timer.timer;
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index 36747174e4a8..50053d2b8b7b 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -236,6 +236,7 @@ void kvm_lapic_switch_to_hv_timer(struct kvm_vcpu *vcpu);
 void kvm_lapic_expired_hv_timer(struct kvm_vcpu *vcpu);
 bool kvm_lapic_hv_timer_in_use(struct kvm_vcpu *vcpu);
 void kvm_lapic_restart_hv_timer(struct kvm_vcpu *vcpu);
+bool kvm_can_post_timer_interrupt(struct kvm_vcpu *vcpu);
 
 static inline enum lapic_mode kvm_apic_mode(u64 apic_base)
 {
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 84f8d49a2fd2..280320f74db7 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -7064,7 +7064,8 @@ static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc,
 	u64 tscl, guest_tscl, delta_tsc, lapic_timer_advance_cycles;
 	struct kvm_timer *ktimer = &vcpu->arch.apic->lapic_timer;
 
-	if (kvm_mwait_in_guest(vcpu->kvm))
+	if (kvm_mwait_in_guest(vcpu->kvm) ||
+		kvm_can_post_timer_interrupt(vcpu))
 		return -EOPNOTSUPP;
 
 	vmx = to_vmx(vcpu);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 6ab30c5e1ae0..58305cf81182 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -51,6 +51,7 @@
 #include <linux/kvm_irqfd.h>
 #include <linux/irqbypass.h>
 #include <linux/sched/stat.h>
+#include <linux/sched/isolation.h>
 #include <linux/mem_encrypt.h>
 
 #include <trace/events/kvm.h>
@@ -153,6 +154,9 @@ EXPORT_SYMBOL_GPL(enable_vmware_backdoor);
 static bool __read_mostly force_emulation_prefix = false;
 module_param(force_emulation_prefix, bool, S_IRUGO);
 
+int __read_mostly pi_inject_timer = -1;
+module_param(pi_inject_timer, bint, S_IRUGO | S_IWUSR);
+
 #define KVM_NR_SHARED_MSRS 16
 
 struct kvm_shared_msrs_global {
@@ -7058,6 +7062,8 @@ int kvm_arch_init(void *opaque)
 		host_xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
 
 	kvm_lapic_init();
+	if (pi_inject_timer == -1)
+		pi_inject_timer = housekeeping_enabled(HK_FLAG_TIMER);
 #ifdef CONFIG_X86_64
 	pvclock_gtod_register_notifier(&pvclock_gtod_notifier);
 
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index e08a12892e8b..6594020c0691 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -301,6 +301,8 @@ extern unsigned int min_timer_period_us;
 
 extern bool enable_vmware_backdoor;
 
+extern int pi_inject_timer;
+
 extern struct static_key kvm_no_apic_vcpu;
 
 static inline u64 nsec_to_cycles(struct kvm_vcpu *vcpu, u64 nsec)
diff --git a/include/linux/sched/isolation.h b/include/linux/sched/isolation.h
index b0fb1446fe04..6c8512d3be88 100644
--- a/include/linux/sched/isolation.h
+++ b/include/linux/sched/isolation.h
@@ -19,6 +19,7 @@ enum hk_flags {
 DECLARE_STATIC_KEY_FALSE(housekeeping_overridden);
 extern int housekeeping_any_cpu(enum hk_flags flags);
 extern const struct cpumask *housekeeping_cpumask(enum hk_flags flags);
+extern bool housekeeping_enabled(enum hk_flags flags);
 extern void housekeeping_affine(struct task_struct *t, enum hk_flags flags);
 extern bool housekeeping_test_cpu(int cpu, enum hk_flags flags);
 extern void __init housekeeping_init(void);
@@ -35,6 +36,11 @@ static inline const struct cpumask *housekeeping_cpumask(enum hk_flags flags)
 	return cpu_possible_mask;
 }
 
+static inline bool housekeeping_enabled(enum hk_flags flags)
+{
+	return false;
+}
+
 static inline void housekeeping_affine(struct task_struct *t,
 				       enum hk_flags flags) { }
 static inline void housekeeping_init(void) { }
diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c
index 123ea07a3f3b..ccb28085b114 100644
--- a/kernel/sched/isolation.c
+++ b/kernel/sched/isolation.c
@@ -14,6 +14,12 @@ EXPORT_SYMBOL_GPL(housekeeping_overridden);
 static cpumask_var_t housekeeping_mask;
 static unsigned int housekeeping_flags;
 
+bool housekeeping_enabled(enum hk_flags flags)
+{
+	return !!(housekeeping_flags & flags);
+}
+EXPORT_SYMBOL_GPL(housekeeping_enabled);
+
 int housekeeping_any_cpu(enum hk_flags flags)
 {
 	if (static_branch_unlikely(&housekeeping_overridden))
-- 
cgit v1.2.3


From d73eb57b80b98ae147e4e6a7d9877c2ba175f972 Mon Sep 17 00:00:00 2001
From: Wanpeng Li <wanpengli@tencent.com>
Date: Thu, 18 Jul 2019 19:39:06 +0800
Subject: KVM: Boost vCPUs that are delivering interrupts
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Inspired by commit 9cac38dd5d (KVM/s390: Set preempted flag during
vcpu wakeup and interrupt delivery), we want to boost not just
lock holders but also vCPUs that are delivering interrupts. Most
smp_call_function_many calls are synchronous, so the IPI target vCPUs
are also good yield candidates.  This patch introduces vcpu->ready to
boost vCPUs during wakeup and interrupt delivery time; unlike s390 we do
not reuse vcpu->preempted so that voluntarily preempted vCPUs are taken
into account by kvm_vcpu_on_spin, but vmx_vcpu_pi_put is not affected
(VT-d PI handles voluntary preemption separately, in pi_pre_block).

Testing on 80 HT 2 socket Xeon Skylake server, with 80 vCPUs VM 80GB RAM:
ebizzy -M

            vanilla     boosting    improved
1VM          21443       23520         9%
2VM           2800        8000       180%
3VM           1800        3100        72%

Testing on my Haswell desktop 8 HT, with 8 vCPUs VM 8GB RAM, two VMs,
one running ebizzy -M, the other running 'stress --cpu 2':

w/ boosting + w/o pv sched yield(vanilla)

            vanilla     boosting   improved
              1570         4000      155%

w/ boosting + w/ pv sched yield(vanilla)

            vanilla     boosting   improved
              1844         5157      179%

w/o boosting, perf top in VM:

 72.33%  [kernel]       [k] smp_call_function_many
  4.22%  [kernel]       [k] call_function_i
  3.71%  [kernel]       [k] async_page_fault

w/ boosting, perf top in VM:

 38.43%  [kernel]       [k] smp_call_function_many
  6.31%  [kernel]       [k] async_page_fault
  6.13%  libc-2.23.so   [.] __memcpy_avx_unaligned
  4.88%  [kernel]       [k] call_function_interrupt

Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Radim Krčmář <rkrcmar@redhat.com>
Cc: Christian Borntraeger <borntraeger@de.ibm.com>
Cc: Paul Mackerras <paulus@ozlabs.org>
Cc: Marc Zyngier <maz@kernel.org>
Signed-off-by: Wanpeng Li <wanpengli@tencent.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/s390/kvm/interrupt.c |  2 +-
 include/linux/kvm_host.h  |  1 +
 virt/kvm/kvm_main.c       | 12 ++++++++----
 3 files changed, 10 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c
index 9dde4d7d8704..26f8bf4a22a7 100644
--- a/arch/s390/kvm/interrupt.c
+++ b/arch/s390/kvm/interrupt.c
@@ -1240,7 +1240,7 @@ void kvm_s390_vcpu_wakeup(struct kvm_vcpu *vcpu)
 		 * The vcpu gave up the cpu voluntarily, mark it as a good
 		 * yield-candidate.
 		 */
-		vcpu->preempted = true;
+		vcpu->ready = true;
 		swake_up_one(&vcpu->wq);
 		vcpu->stat.halt_wakeup++;
 	}
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index c5da875f19e3..5c5b5867024c 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -318,6 +318,7 @@ struct kvm_vcpu {
 	} spin_loop;
 #endif
 	bool preempted;
+	bool ready;
 	struct kvm_vcpu_arch arch;
 	struct dentry *debugfs_dentry;
 };
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index b4ab59dd6846..887f3b0c2b60 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -314,6 +314,7 @@ int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
 	kvm_vcpu_set_in_spin_loop(vcpu, false);
 	kvm_vcpu_set_dy_eligible(vcpu, false);
 	vcpu->preempted = false;
+	vcpu->ready = false;
 
 	r = kvm_arch_vcpu_init(vcpu);
 	if (r < 0)
@@ -2387,6 +2388,7 @@ bool kvm_vcpu_wake_up(struct kvm_vcpu *vcpu)
 	wqp = kvm_arch_vcpu_wq(vcpu);
 	if (swq_has_sleeper(wqp)) {
 		swake_up_one(wqp);
+		WRITE_ONCE(vcpu->ready, true);
 		++vcpu->stat.halt_wakeup;
 		return true;
 	}
@@ -2500,7 +2502,7 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me, bool yield_to_kernel_mode)
 				continue;
 			} else if (pass && i > last_boosted_vcpu)
 				break;
-			if (!READ_ONCE(vcpu->preempted))
+			if (!READ_ONCE(vcpu->ready))
 				continue;
 			if (vcpu == me)
 				continue;
@@ -4203,8 +4205,8 @@ static void kvm_sched_in(struct preempt_notifier *pn, int cpu)
 {
 	struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);
 
-	if (vcpu->preempted)
-		vcpu->preempted = false;
+	vcpu->preempted = false;
+	WRITE_ONCE(vcpu->ready, false);
 
 	kvm_arch_sched_in(vcpu, cpu);
 
@@ -4216,8 +4218,10 @@ static void kvm_sched_out(struct preempt_notifier *pn,
 {
 	struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);
 
-	if (current->state == TASK_RUNNING)
+	if (current->state == TASK_RUNNING) {
 		vcpu->preempted = true;
+		WRITE_ONCE(vcpu->ready, true);
+	}
 	kvm_arch_vcpu_put(vcpu);
 }
 
-- 
cgit v1.2.3


From 903e9d1bffb557220af276eda97b9d6b103ec9e0 Mon Sep 17 00:00:00 2001
From: Vasily Averin <vvs@virtuozzo.com>
Date: Thu, 18 Jul 2019 07:26:46 +0300
Subject: connector: remove redundant input callback from cn_dev

A small cleanup: this callback is never used.
Originally fixed by Stanislav Kinsburskiy <skinsbursky@virtuozzo.com>
for OpenVZ7 bug OVZ-6877

cc: stanislav.kinsburskiy@gmail.com
Signed-off-by: Vasily Averin <vvs@virtuozzo.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/connector/connector.c | 6 +-----
 include/linux/connector.h     | 1 -
 2 files changed, 1 insertion(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/connector/connector.c b/drivers/connector/connector.c
index 23553ed6b548..2d22d6bf52f2 100644
--- a/drivers/connector/connector.c
+++ b/drivers/connector/connector.c
@@ -248,16 +248,12 @@ static int __maybe_unused cn_proc_show(struct seq_file *m, void *v)
 	return 0;
 }
 
-static struct cn_dev cdev = {
-	.input   = cn_rx_skb,
-};
-
 static int cn_init(void)
 {
 	struct cn_dev *dev = &cdev;
 	struct netlink_kernel_cfg cfg = {
 		.groups	= CN_NETLINK_USERS + 0xf,
-		.input	= dev->input,
+		.input	= cn_rx_skb,
 	};
 
 	dev->nls = netlink_kernel_create(&init_net, NETLINK_CONNECTOR, &cfg);
diff --git a/include/linux/connector.h b/include/linux/connector.h
index 6b6c7396a584..cb732643471b 100644
--- a/include/linux/connector.h
+++ b/include/linux/connector.h
@@ -50,7 +50,6 @@ struct cn_dev {
 
 	u32 seq, groups;
 	struct sock *nls;
-	void (*input) (struct sk_buff *skb);
 
 	struct cn_queue_dev *cbdev;
 };
-- 
cgit v1.2.3


From 893a1c97205a3ece0cbb3f571a3b972080f3b4c7 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Tue, 16 Jul 2019 13:55:23 -0600
Subject: blk-mq: allow REQ_NOWAIT to return an error inline

By default, if a caller sets REQ_NOWAIT and we need to block, we'll
return -EAGAIN through the bio->bi_end_io() callback. For some use
cases, this makes it hard to use.

Allow a caller to ask for inline return of errors related to
blocking by also setting REQ_NOWAIT_INLINE.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-mq.c            | 8 ++++++--
 include/linux/blk_types.h | 5 ++++-
 2 files changed, 10 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk-mq.c b/block/blk-mq.c
index b038ec680e84..2bc2c0705660 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1960,9 +1960,13 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
 	rq = blk_mq_get_request(q, bio, &data);
 	if (unlikely(!rq)) {
 		rq_qos_cleanup(q, bio);
-		if (bio->bi_opf & REQ_NOWAIT)
+
+		cookie = BLK_QC_T_NONE;
+		if (bio->bi_opf & REQ_NOWAIT_INLINE)
+			cookie = BLK_QC_T_EAGAIN;
+		else if (bio->bi_opf & REQ_NOWAIT)
 			bio_wouldblock_error(bio);
-		return BLK_QC_T_NONE;
+		return cookie;
 	}
 
 	trace_block_getrq(q, bio, bio->bi_opf);
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index feff3fe4467e..1b1fa1557e68 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -311,6 +311,7 @@ enum req_flag_bits {
 	__REQ_RAHEAD,		/* read ahead, can fail anytime */
 	__REQ_BACKGROUND,	/* background IO */
 	__REQ_NOWAIT,           /* Don't wait if request will block */
+	__REQ_NOWAIT_INLINE,	/* Return would-block error inline */
 	/*
 	 * When a shared kthread needs to issue a bio for a cgroup, doing
 	 * so synchronously can lead to priority inversions as the kthread
@@ -345,6 +346,7 @@ enum req_flag_bits {
 #define REQ_RAHEAD		(1ULL << __REQ_RAHEAD)
 #define REQ_BACKGROUND		(1ULL << __REQ_BACKGROUND)
 #define REQ_NOWAIT		(1ULL << __REQ_NOWAIT)
+#define REQ_NOWAIT_INLINE	(1ULL << __REQ_NOWAIT_INLINE)
 #define REQ_CGROUP_PUNT		(1ULL << __REQ_CGROUP_PUNT)
 
 #define REQ_NOUNMAP		(1ULL << __REQ_NOUNMAP)
@@ -418,12 +420,13 @@ static inline int op_stat_group(unsigned int op)
 
 typedef unsigned int blk_qc_t;
 #define BLK_QC_T_NONE		-1U
+#define BLK_QC_T_EAGAIN		-2U
 #define BLK_QC_T_SHIFT		16
 #define BLK_QC_T_INTERNAL	(1U << 31)
 
 static inline bool blk_qc_t_valid(blk_qc_t cookie)
 {
-	return cookie != BLK_QC_T_NONE;
+	return cookie != BLK_QC_T_NONE && cookie != BLK_QC_T_EAGAIN;
 }
 
 static inline unsigned int blk_qc_t_to_queue_num(blk_qc_t cookie)
-- 
cgit v1.2.3


From 95fa145479fbc0a0c1fd3274ceb42ec03c042a4a Mon Sep 17 00:00:00 2001
From: John Fastabend <john.fastabend@gmail.com>
Date: Fri, 19 Jul 2019 10:29:22 -0700
Subject: bpf: sockmap/tls, close can race with map free

When a map free is called and in parallel a socket is closed we
have two paths that can potentially reset the socket prot ops, the
bpf close() path and the map free path. This creates a problem
with which prot ops should be used from the socket closed side.

If the map_free side completes first then we want to call the
original lowest level ops. However, if the tls path runs first
we want to call the sockmap ops. Additionally, there was no locking
around prot updates in TLS code paths, so the prot ops could
be changed multiple times, once from the TLS path and again from the
sockmap side, potentially leaving ops pointed at either TLS or sockmap
when psock and/or tls context have already been destroyed.

To fix this race first only update ops inside callback lock
so that TLS, sockmap and lowest level all agree on prot state.
Second, add a ULP callback update() so that lower layers can
inform the upper layer when they are being removed, allowing the
upper layer to reset prot ops.

This gets us close to allowing sockmap and tls to be stacked
in arbitrary order but will save that patch for *next trees.

v4:
 - make sure we don't free things for device;
 - remove the checks which swap the callbacks back
   only if TLS is at the top.

Reported-by: syzbot+06537213db7ba2745c4a@syzkaller.appspotmail.com
Fixes: 02c558b2d5d6 ("bpf: sockmap, support for msg_peek in sk_msg with redirect ingress")
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Dirk van der Merwe <dirk.vandermerwe@netronome.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/linux/skmsg.h |  8 +++++++-
 include/net/tcp.h     |  3 +++
 net/core/skmsg.c      |  4 ++--
 net/ipv4/tcp_ulp.c    | 13 +++++++++++++
 net/tls/tls_main.c    | 33 ++++++++++++++++++++++++++++-----
 5 files changed, 53 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h
index 50ced8aba9db..e4b3fb4bb77c 100644
--- a/include/linux/skmsg.h
+++ b/include/linux/skmsg.h
@@ -354,7 +354,13 @@ static inline void sk_psock_restore_proto(struct sock *sk,
 	sk->sk_write_space = psock->saved_write_space;
 
 	if (psock->sk_proto) {
-		sk->sk_prot = psock->sk_proto;
+		struct inet_connection_sock *icsk = inet_csk(sk);
+		bool has_ulp = !!icsk->icsk_ulp_data;
+
+		if (has_ulp)
+			tcp_update_ulp(sk, psock->sk_proto);
+		else
+			sk->sk_prot = psock->sk_proto;
 		psock->sk_proto = NULL;
 	}
 }
diff --git a/include/net/tcp.h b/include/net/tcp.h
index f42d300f0cfa..c82a23470081 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -2103,6 +2103,8 @@ struct tcp_ulp_ops {
 
 	/* initialize ulp */
 	int (*init)(struct sock *sk);
+	/* update ulp */
+	void (*update)(struct sock *sk, struct proto *p);
 	/* cleanup ulp */
 	void (*release)(struct sock *sk);
 
@@ -2114,6 +2116,7 @@ void tcp_unregister_ulp(struct tcp_ulp_ops *type);
 int tcp_set_ulp(struct sock *sk, const char *name);
 void tcp_get_available_ulp(char *buf, size_t len);
 void tcp_cleanup_ulp(struct sock *sk);
+void tcp_update_ulp(struct sock *sk, struct proto *p);
 
 #define MODULE_ALIAS_TCP_ULP(name)				\
 	__MODULE_INFO(alias, alias_userspace, name);		\
diff --git a/net/core/skmsg.c b/net/core/skmsg.c
index 93bffaad2135..6832eeb4b785 100644
--- a/net/core/skmsg.c
+++ b/net/core/skmsg.c
@@ -585,12 +585,12 @@ EXPORT_SYMBOL_GPL(sk_psock_destroy);
 
 void sk_psock_drop(struct sock *sk, struct sk_psock *psock)
 {
-	rcu_assign_sk_user_data(sk, NULL);
 	sk_psock_cork_free(psock);
 	sk_psock_zap_ingress(psock);
-	sk_psock_restore_proto(sk, psock);
 
 	write_lock_bh(&sk->sk_callback_lock);
+	sk_psock_restore_proto(sk, psock);
+	rcu_assign_sk_user_data(sk, NULL);
 	if (psock->progs.skb_parser)
 		sk_psock_stop_strp(sk, psock);
 	write_unlock_bh(&sk->sk_callback_lock);
diff --git a/net/ipv4/tcp_ulp.c b/net/ipv4/tcp_ulp.c
index 3d8a1d835471..4849edb62d52 100644
--- a/net/ipv4/tcp_ulp.c
+++ b/net/ipv4/tcp_ulp.c
@@ -96,6 +96,19 @@ void tcp_get_available_ulp(char *buf, size_t maxlen)
 	rcu_read_unlock();
 }
 
+void tcp_update_ulp(struct sock *sk, struct proto *proto)
+{
+	struct inet_connection_sock *icsk = inet_csk(sk);
+
+	if (!icsk->icsk_ulp_ops) {
+		sk->sk_prot = proto;
+		return;
+	}
+
+	if (icsk->icsk_ulp_ops->update)
+		icsk->icsk_ulp_ops->update(sk, proto);
+}
+
 void tcp_cleanup_ulp(struct sock *sk)
 {
 	struct inet_connection_sock *icsk = inet_csk(sk);
diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c
index 48f1c26459d0..f208f8455ef2 100644
--- a/net/tls/tls_main.c
+++ b/net/tls/tls_main.c
@@ -328,7 +328,10 @@ static void tls_sk_proto_unhash(struct sock *sk)
 
 	ctx = tls_get_ctx(sk);
 	tls_sk_proto_cleanup(sk, ctx, timeo);
+	write_lock_bh(&sk->sk_callback_lock);
 	icsk->icsk_ulp_data = NULL;
+	sk->sk_prot = ctx->sk_proto;
+	write_unlock_bh(&sk->sk_callback_lock);
 
 	if (ctx->sk_proto->unhash)
 		ctx->sk_proto->unhash(sk);
@@ -337,7 +340,7 @@ static void tls_sk_proto_unhash(struct sock *sk)
 
 static void tls_sk_proto_close(struct sock *sk, long timeout)
 {
-	void (*sk_proto_close)(struct sock *sk, long timeout);
+	struct inet_connection_sock *icsk = inet_csk(sk);
 	struct tls_context *ctx = tls_get_ctx(sk);
 	long timeo = sock_sndtimeo(sk, 0);
 	bool free_ctx;
@@ -347,12 +350,15 @@ static void tls_sk_proto_close(struct sock *sk, long timeout)
 
 	lock_sock(sk);
 	free_ctx = ctx->tx_conf != TLS_HW && ctx->rx_conf != TLS_HW;
-	sk_proto_close = ctx->sk_proto_close;
 
 	if (ctx->tx_conf != TLS_BASE || ctx->rx_conf != TLS_BASE)
 		tls_sk_proto_cleanup(sk, ctx, timeo);
 
+	write_lock_bh(&sk->sk_callback_lock);
+	if (free_ctx)
+		icsk->icsk_ulp_data = NULL;
 	sk->sk_prot = ctx->sk_proto;
+	write_unlock_bh(&sk->sk_callback_lock);
 	release_sock(sk);
 	if (ctx->tx_conf == TLS_SW)
 		tls_sw_free_ctx_tx(ctx);
@@ -360,7 +366,7 @@ static void tls_sk_proto_close(struct sock *sk, long timeout)
 		tls_sw_strparser_done(ctx);
 	if (ctx->rx_conf == TLS_SW)
 		tls_sw_free_ctx_rx(ctx);
-	sk_proto_close(sk, timeout);
+	ctx->sk_proto_close(sk, timeout);
 
 	if (free_ctx)
 		tls_ctx_free(ctx);
@@ -827,7 +833,7 @@ static int tls_init(struct sock *sk)
 	int rc = 0;
 
 	if (tls_hw_prot(sk))
-		goto out;
+		return 0;
 
 	/* The TLS ulp is currently supported only for TCP sockets
 	 * in ESTABLISHED state.
@@ -838,22 +844,38 @@ static int tls_init(struct sock *sk)
 	if (sk->sk_state != TCP_ESTABLISHED)
 		return -ENOTSUPP;
 
+	tls_build_proto(sk);
+
 	/* allocate tls context */
+	write_lock_bh(&sk->sk_callback_lock);
 	ctx = create_ctx(sk);
 	if (!ctx) {
 		rc = -ENOMEM;
 		goto out;
 	}
 
-	tls_build_proto(sk);
 	ctx->tx_conf = TLS_BASE;
 	ctx->rx_conf = TLS_BASE;
 	ctx->sk_proto = sk->sk_prot;
 	update_sk_prot(sk, ctx);
 out:
+	write_unlock_bh(&sk->sk_callback_lock);
 	return rc;
 }
 
+static void tls_update(struct sock *sk, struct proto *p)
+{
+	struct tls_context *ctx;
+
+	ctx = tls_get_ctx(sk);
+	if (likely(ctx)) {
+		ctx->sk_proto_close = p->close;
+		ctx->sk_proto = p;
+	} else {
+		sk->sk_prot = p;
+	}
+}
+
 void tls_register_device(struct tls_device *device)
 {
 	spin_lock_bh(&device_spinlock);
@@ -874,6 +896,7 @@ static struct tcp_ulp_ops tcp_tls_ulp_ops __read_mostly = {
 	.name			= "tls",
 	.owner			= THIS_MODULE,
 	.init			= tls_init,
+	.update			= tls_update,
 };
 
 static int __init tls_register(void)
-- 
cgit v1.2.3


From effa467870c7612012885df4e246bdb8ffd8e44c Mon Sep 17 00:00:00 2001
From: Dmitry Safonov <dima@arista.com>
Date: Tue, 16 Jul 2019 22:38:05 +0100
Subject: iommu/vt-d: Don't queue_iova() if there is no flush queue

Intel VT-d driver was reworked to use common deferred flushing
implementation. Previously there was one global per-cpu flush queue,
afterwards - one per domain.

Before deferring a flush, the queue should be allocated and initialized.

Currently only domains with IOMMU_DOMAIN_DMA type initialize their flush
queue. It's probably worth initializing it for static or unmanaged domains
too, but that is arguable - I'm leaving it to iommu folks.

Prevent queuing an iova flush if the domain doesn't have a queue.
The defensive check seems worth keeping even if the queue were
initialized for all kinds of domains, and it is easy to backport.

On 4.19.43 stable kernel it has a user-visible effect: previously for
devices in si domain there were crashes, on sata devices:

 BUG: spinlock bad magic on CPU#6, swapper/0/1
  lock: 0xffff88844f582008, .magic: 00000000, .owner: <none>/-1, .owner_cpu: 0
 CPU: 6 PID: 1 Comm: swapper/0 Not tainted 4.19.43 #1
 Call Trace:
  <IRQ>
  dump_stack+0x61/0x7e
  spin_bug+0x9d/0xa3
  do_raw_spin_lock+0x22/0x8e
  _raw_spin_lock_irqsave+0x32/0x3a
  queue_iova+0x45/0x115
  intel_unmap+0x107/0x113
  intel_unmap_sg+0x6b/0x76
  __ata_qc_complete+0x7f/0x103
  ata_qc_complete+0x9b/0x26a
  ata_qc_complete_multiple+0xd0/0xe3
  ahci_handle_port_interrupt+0x3ee/0x48a
  ahci_handle_port_intr+0x73/0xa9
  ahci_single_level_irq_intr+0x40/0x60
  __handle_irq_event_percpu+0x7f/0x19a
  handle_irq_event_percpu+0x32/0x72
  handle_irq_event+0x38/0x56
  handle_edge_irq+0x102/0x121
  handle_irq+0x147/0x15c
  do_IRQ+0x66/0xf2
  common_interrupt+0xf/0xf
 RIP: 0010:__do_softirq+0x8c/0x2df

The same for usb devices that use ehci-pci:
 BUG: spinlock bad magic on CPU#0, swapper/0/1
  lock: 0xffff88844f402008, .magic: 00000000, .owner: <none>/-1, .owner_cpu: 0
 CPU: 0 PID: 1 Comm: swapper/0 Not tainted 4.19.43 #4
 Call Trace:
  <IRQ>
  dump_stack+0x61/0x7e
  spin_bug+0x9d/0xa3
  do_raw_spin_lock+0x22/0x8e
  _raw_spin_lock_irqsave+0x32/0x3a
  queue_iova+0x77/0x145
  intel_unmap+0x107/0x113
  intel_unmap_page+0xe/0x10
  usb_hcd_unmap_urb_setup_for_dma+0x53/0x9d
  usb_hcd_unmap_urb_for_dma+0x17/0x100
  unmap_urb_for_dma+0x22/0x24
  __usb_hcd_giveback_urb+0x51/0xc3
  usb_giveback_urb_bh+0x97/0xde
  tasklet_action_common.isra.4+0x5f/0xa1
  tasklet_action+0x2d/0x30
  __do_softirq+0x138/0x2df
  irq_exit+0x7d/0x8b
  smp_apic_timer_interrupt+0x10f/0x151
  apic_timer_interrupt+0xf/0x20
  </IRQ>
 RIP: 0010:_raw_spin_unlock_irqrestore+0x17/0x39

Cc: David Woodhouse <dwmw2@infradead.org>
Cc: Joerg Roedel <joro@8bytes.org>
Cc: Lu Baolu <baolu.lu@linux.intel.com>
Cc: iommu@lists.linux-foundation.org
Cc: <stable@vger.kernel.org> # 4.14+
Fixes: 13cf01744608 ("iommu/vt-d: Make use of iova deferred flushing")
Signed-off-by: Dmitry Safonov <dima@arista.com>
Reviewed-by: Lu Baolu <baolu.lu@linux.intel.com>
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 drivers/iommu/intel-iommu.c |  3 ++-
 drivers/iommu/iova.c        | 18 ++++++++++++++----
 include/linux/iova.h        |  6 ++++++
 3 files changed, 22 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c
index 9b1d62d03370..72c6d647bec9 100644
--- a/drivers/iommu/intel-iommu.c
+++ b/drivers/iommu/intel-iommu.c
@@ -3561,7 +3561,8 @@ static void intel_unmap(struct device *dev, dma_addr_t dev_addr, size_t size)
 
 	freelist = domain_unmap(domain, start_pfn, last_pfn);
 
-	if (intel_iommu_strict || (pdev && pdev->untrusted)) {
+	if (intel_iommu_strict || (pdev && pdev->untrusted) ||
+			!has_iova_flush_queue(&domain->iovad)) {
 		iommu_flush_iotlb_psi(iommu, domain, start_pfn,
 				      nrpages, !freelist, 0);
 		/* free iova */
diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c
index d499b2621239..8413ae54904a 100644
--- a/drivers/iommu/iova.c
+++ b/drivers/iommu/iova.c
@@ -54,9 +54,14 @@ init_iova_domain(struct iova_domain *iovad, unsigned long granule,
 }
 EXPORT_SYMBOL_GPL(init_iova_domain);
 
+bool has_iova_flush_queue(struct iova_domain *iovad)
+{
+	return !!iovad->fq;
+}
+
 static void free_iova_flush_queue(struct iova_domain *iovad)
 {
-	if (!iovad->fq)
+	if (!has_iova_flush_queue(iovad))
 		return;
 
 	if (timer_pending(&iovad->fq_timer))
@@ -74,13 +79,14 @@ static void free_iova_flush_queue(struct iova_domain *iovad)
 int init_iova_flush_queue(struct iova_domain *iovad,
 			  iova_flush_cb flush_cb, iova_entry_dtor entry_dtor)
 {
+	struct iova_fq __percpu *queue;
 	int cpu;
 
 	atomic64_set(&iovad->fq_flush_start_cnt,  0);
 	atomic64_set(&iovad->fq_flush_finish_cnt, 0);
 
-	iovad->fq = alloc_percpu(struct iova_fq);
-	if (!iovad->fq)
+	queue = alloc_percpu(struct iova_fq);
+	if (!queue)
 		return -ENOMEM;
 
 	iovad->flush_cb   = flush_cb;
@@ -89,13 +95,17 @@ int init_iova_flush_queue(struct iova_domain *iovad,
 	for_each_possible_cpu(cpu) {
 		struct iova_fq *fq;
 
-		fq = per_cpu_ptr(iovad->fq, cpu);
+		fq = per_cpu_ptr(queue, cpu);
 		fq->head = 0;
 		fq->tail = 0;
 
 		spin_lock_init(&fq->lock);
 	}
 
+	smp_wmb();
+
+	iovad->fq = queue;
+
 	timer_setup(&iovad->fq_timer, fq_flush_timeout, 0);
 	atomic_set(&iovad->fq_timer_on, 0);
 
diff --git a/include/linux/iova.h b/include/linux/iova.h
index 781b96ac706f..cd0f1de901a8 100644
--- a/include/linux/iova.h
+++ b/include/linux/iova.h
@@ -155,6 +155,7 @@ struct iova *reserve_iova(struct iova_domain *iovad, unsigned long pfn_lo,
 void copy_reserved_iova(struct iova_domain *from, struct iova_domain *to);
 void init_iova_domain(struct iova_domain *iovad, unsigned long granule,
 	unsigned long start_pfn);
+bool has_iova_flush_queue(struct iova_domain *iovad);
 int init_iova_flush_queue(struct iova_domain *iovad,
 			  iova_flush_cb flush_cb, iova_entry_dtor entry_dtor);
 struct iova *find_iova(struct iova_domain *iovad, unsigned long pfn);
@@ -235,6 +236,11 @@ static inline void init_iova_domain(struct iova_domain *iovad,
 {
 }
 
+bool has_iova_flush_queue(struct iova_domain *iovad)
+{
+	return false;
+}
+
 static inline int init_iova_flush_queue(struct iova_domain *iovad,
 					iova_flush_cb flush_cb,
 					iova_entry_dtor entry_dtor)
-- 
cgit v1.2.3


From 6ee82ef04e38b0d8b09b04bc1b068673deed6582 Mon Sep 17 00:00:00 2001
From: Sylwester Nawrocki <s.nawrocki@samsung.com>
Date: Mon, 1 Jul 2019 13:46:51 +0200
Subject: clk: Add missing documentation of devm_clk_bulk_get_optional()
 argument

Fix an incomplete devm_clk_bulk_get_optional() function documentation
by adding description of the num_clks argument as in other *clk_bulk*
functions.

Fixes: 9bd5ef0bd874 ("clk: Add devm_clk_bulk_get_optional() function")
Reported-by: kbuild test robot <lkp@intel.com>
Signed-off-by: Sylwester Nawrocki <s.nawrocki@samsung.com>
Signed-off-by: Stephen Boyd <sboyd@kernel.org>
---
 include/linux/clk.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/clk.h b/include/linux/clk.h
index 3c096c7a51dc..853a8f181394 100644
--- a/include/linux/clk.h
+++ b/include/linux/clk.h
@@ -359,6 +359,7 @@ int __must_check devm_clk_bulk_get(struct device *dev, int num_clks,
 /**
  * devm_clk_bulk_get_optional - managed get multiple optional consumer clocks
  * @dev: device for clock "consumer"
+ * @num_clks: the number of clk_bulk_data
  * @clks: pointer to the clk_bulk_data table of consumer
  *
  * Behaves the same as devm_clk_bulk_get() except where there is no clock
-- 
cgit v1.2.3


From 201c1db90cd643282185a00770f12f95da330eca Mon Sep 17 00:00:00 2001
From: Joerg Roedel <jroedel@suse.de>
Date: Tue, 23 Jul 2019 09:51:00 +0200
Subject: iommu/iova: Fix compilation error with !CONFIG_IOMMU_IOVA

The stub function for !CONFIG_IOMMU_IOVA needs to be
'static inline'.

Fixes: effa467870c76 ('iommu/vt-d: Don't queue_iova() if there is no flush queue')
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 include/linux/iova.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/iova.h b/include/linux/iova.h
index cd0f1de901a8..a0637abffee8 100644
--- a/include/linux/iova.h
+++ b/include/linux/iova.h
@@ -236,7 +236,7 @@ static inline void init_iova_domain(struct iova_domain *iovad,
 {
 }
 
-bool has_iova_flush_queue(struct iova_domain *iovad)
+static inline bool has_iova_flush_queue(struct iova_domain *iovad)
 {
 	return false;
 }
-- 
cgit v1.2.3


From 327fe1d42b83f8a06b33ba30159582b49af5fc8e Mon Sep 17 00:00:00 2001
From: Marcos Paulo de Souza <marcos.souza.org@gmail.com>
Date: Tue, 23 Jul 2019 00:27:41 -0300
Subject: block: blk-mq: Remove blk_mq_sched_started_request and
 started_request

blk_mq_sched_started_request is a function that checks if the elevator
related to the request has started_request implemented, but currently none of
the available IO schedulers implement started_request, so remove both.

Signed-off-by: Marcos Paulo de Souza <marcos.souza.org@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-mq-sched.h     | 9 ---------
 block/blk-mq.c           | 2 --
 include/linux/elevator.h | 1 -
 3 files changed, 12 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk-mq-sched.h b/block/blk-mq-sched.h
index cf22ab00fefb..126021fc3a11 100644
--- a/block/blk-mq-sched.h
+++ b/block/blk-mq-sched.h
@@ -61,15 +61,6 @@ static inline void blk_mq_sched_completed_request(struct request *rq, u64 now)
 		e->type->ops.completed_request(rq, now);
 }
 
-static inline void blk_mq_sched_started_request(struct request *rq)
-{
-	struct request_queue *q = rq->q;
-	struct elevator_queue *e = q->elevator;
-
-	if (e && e->type->ops.started_request)
-		e->type->ops.started_request(rq);
-}
-
 static inline void blk_mq_sched_requeue_request(struct request *rq)
 {
 	struct request_queue *q = rq->q;
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 2bc2c0705660..f78d3287dd82 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -669,8 +669,6 @@ void blk_mq_start_request(struct request *rq)
 {
 	struct request_queue *q = rq->q;
 
-	blk_mq_sched_started_request(rq);
-
 	trace_block_rq_issue(q, rq);
 
 	if (test_bit(QUEUE_FLAG_STATS, &q->queue_flags)) {
diff --git a/include/linux/elevator.h b/include/linux/elevator.h
index 17cd0078377c..1dd014c9c87b 100644
--- a/include/linux/elevator.h
+++ b/include/linux/elevator.h
@@ -45,7 +45,6 @@ struct elevator_mq_ops {
 	struct request *(*dispatch_request)(struct blk_mq_hw_ctx *);
 	bool (*has_work)(struct blk_mq_hw_ctx *);
 	void (*completed_request)(struct request *, u64);
-	void (*started_request)(struct request *);
 	void (*requeue_request)(struct request *);
 	struct request *(*former_request)(struct request_queue *, struct request *);
 	struct request *(*next_request)(struct request_queue *, struct request *);
-- 
cgit v1.2.3


From 06532750010e06dd4b6d69983773677df7fc5291 Mon Sep 17 00:00:00 2001
From: Eric Auger <eric.auger@redhat.com>
Date: Mon, 22 Jul 2019 18:51:49 +0200
Subject: dma-mapping: use dma_get_mask in dma_addressing_limited

We currently have cases where the dma_addressing_limited() gets
called with dma_mask unset. This causes a NULL pointer dereference.

Use dma_get_mask() accessor to prevent the crash.

Fixes: b866455423e0 ("dma-mapping: add a dma_addressing_limited helper")
Signed-off-by: Eric Auger <eric.auger@redhat.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 include/linux/dma-mapping.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h
index e11b115dd0e4..f7d1eea32c78 100644
--- a/include/linux/dma-mapping.h
+++ b/include/linux/dma-mapping.h
@@ -689,8 +689,8 @@ static inline int dma_coerce_mask_and_coherent(struct device *dev, u64 mask)
  */
 static inline bool dma_addressing_limited(struct device *dev)
 {
-	return min_not_zero(*dev->dma_mask, dev->bus_dma_mask) <
-		dma_get_required_mask(dev);
+	return min_not_zero(dma_get_mask(dev), dev->bus_dma_mask) <
+			    dma_get_required_mask(dev);
 }
 
 #ifdef CONFIG_ARCH_HAS_SETUP_DMA_OPS
-- 
cgit v1.2.3


From d9b8aadaffa65809d146cf0f8632a22a946367d7 Mon Sep 17 00:00:00 2001
From: Ilya Leoshkevich <iii@linux.ibm.com>
Date: Fri, 19 Jul 2019 11:18:15 +0200
Subject: bpf: fix narrower loads on s390

The very first check in test_pkt_md_access is failing on s390, which
happens because loading a part of a struct __sk_buff field produces
an incorrect result.

The preprocessed code of the check is:

{
	__u8 tmp = *((volatile __u8 *)&skb->len +
		((sizeof(skb->len) - sizeof(__u8)) / sizeof(__u8)));
	if (tmp != ((*(volatile __u32 *)&skb->len) & 0xFF)) return 2;
};

clang generates the following code for it:

      0:	71 21 00 03 00 00 00 00	r2 = *(u8 *)(r1 + 3)
      1:	61 31 00 00 00 00 00 00	r3 = *(u32 *)(r1 + 0)
      2:	57 30 00 00 00 00 00 ff	r3 &= 255
      3:	5d 23 00 1d 00 00 00 00	if r2 != r3 goto +29 <LBB0_10>

Finally, verifier transforms it to:

  0: (61) r2 = *(u32 *)(r1 +104)
  1: (bc) w2 = w2
  2: (74) w2 >>= 24
  3: (bc) w2 = w2
  4: (54) w2 &= 255
  5: (bc) w2 = w2

The problem is that when verifier emits the code to replace a partial
load of a struct __sk_buff field (*(u8 *)(r1 + 3)) with a full load of
struct sk_buff field (*(u32 *)(r1 + 104)), an optional shift and a
bitwise AND, it assumes that the machine is little endian and
incorrectly decides to use a shift.

Adjust shift count calculation to account for endianness.

Fixes: 31fd85816dbe ("bpf: permits narrower load from bpf program context fields")
Signed-off-by: Ilya Leoshkevich <iii@linux.ibm.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/filter.h | 13 +++++++++++++
 kernel/bpf/verifier.c  |  4 ++--
 2 files changed, 15 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/filter.h b/include/linux/filter.h
index ff65d22cf336..92c6e31fb008 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -24,6 +24,7 @@
 
 #include <net/sch_generic.h>
 
+#include <asm/byteorder.h>
 #include <uapi/linux/filter.h>
 #include <uapi/linux/bpf.h>
 
@@ -747,6 +748,18 @@ bpf_ctx_narrow_access_ok(u32 off, u32 size, u32 size_default)
 	return size <= size_default && (size & (size - 1)) == 0;
 }
 
+static inline u8
+bpf_ctx_narrow_load_shift(u32 off, u32 size, u32 size_default)
+{
+	u8 load_off = off & (size_default - 1);
+
+#ifdef __LITTLE_ENDIAN
+	return load_off * 8;
+#else
+	return (size_default - (load_off + size)) * 8;
+#endif
+}
+
 #define bpf_ctx_wide_access_ok(off, size, type, field)			\
 	(size == sizeof(__u64) &&					\
 	off >= offsetof(type, field) &&					\
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 5900cbb966b1..c84d83f86141 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -8616,8 +8616,8 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
 		}
 
 		if (is_narrower_load && size < target_size) {
-			u8 shift = (off & (size_default - 1)) * 8;
-
+			u8 shift = bpf_ctx_narrow_load_shift(off, size,
+							     size_default);
 			if (ctx_field_size <= 4) {
 				if (shift)
 					insn_buf[cnt++] = BPF_ALU32_IMM(BPF_RSH,
-- 
cgit v1.2.3


From d7852fbd0f0423937fa287a598bfde188bb68c22 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Thu, 11 Jul 2019 09:54:40 -0700
Subject: access: avoid the RCU grace period for the temporary subjective
 credentials

It turns out that 'access()' (and 'faccessat()') can cause a lot of RCU
work because it installs a temporary credential that gets allocated and
freed for each system call.

The allocation and freeing overhead is mostly benign, but because
credentials can be accessed under the RCU read lock, the freeing
involves a RCU grace period.

Which is not a huge deal normally, but if you have a lot of access()
calls, this causes a fair amount of secondary damage: instead of having a
nice alloc/free pattern that hits in hot per-CPU slab caches, you have
all those delayed free's, and on big machines with hundreds of cores,
the RCU overhead can end up being enormous.

But it turns out that all of this is entirely unnecessary.  Exactly
because access() only installs the credential as the thread-local
subjective credential, the temporary cred pointer doesn't actually need
to be RCU free'd at all.  Once we're done using it, we can just free it
synchronously and avoid all the RCU overhead.

So add a 'non_rcu' flag to 'struct cred', which can be set by users that
know they only use it in non-RCU context (there are other potential
users for this).  We can make it a union with the rcu freeing list head
that we need for the RCU case, so this doesn't need any extra storage.

Note that this also makes 'get_current_cred()' clear the new non_rcu
flag, in case we have filesystems that take a long-term reference to the
cred and then expect the RCU delayed freeing afterwards.  It's not
entirely clear that this is required, but it makes for clear semantics:
the subjective cred remains non-RCU as long as you only access it
synchronously using the thread-local accessors, but you _can_ use it as
a generic cred if you want to.

It is possible that we should just remove the whole RCU markings for
->cred entirely.  Only ->real_cred is really supposed to be accessed
through RCU, and the long-term cred copies that nfs uses might want to
explicitly re-enable RCU freeing if required, rather than have
get_current_cred() do it implicitly.

But this is a "minimal semantic changes" change for the immediate
problem.

Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Eric Dumazet <edumazet@google.com>
Acked-by: Paul E. McKenney <paulmck@linux.ibm.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Jan Glauber <jglauber@marvell.com>
Cc: Jiri Kosina <jikos@kernel.org>
Cc: Jayachandran Chandrasekharan Nair <jnair@marvell.com>
Cc: Greg KH <greg@kroah.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: David Howells <dhowells@redhat.com>
Cc: Miklos Szeredi <miklos@szeredi.hu>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/open.c            | 19 +++++++++++++++++++
 include/linux/cred.h |  8 +++++++-
 kernel/cred.c        | 21 +++++++++++++++++++--
 3 files changed, 45 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/fs/open.c b/fs/open.c
index b5b80469b93d..a59abe3c669a 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -374,6 +374,25 @@ long do_faccessat(int dfd, const char __user *filename, int mode)
 				override_cred->cap_permitted;
 	}
 
+	/*
+	 * The new set of credentials can *only* be used in
+	 * task-synchronous circumstances, and does not need
+	 * RCU freeing, unless somebody then takes a separate
+	 * reference to it.
+	 *
+	 * NOTE! This is _only_ true because this credential
+	 * is used purely for override_creds() that installs
+	 * it as the subjective cred. Other threads will be
+	 * accessing ->real_cred, not the subjective cred.
+	 *
+	 * If somebody _does_ make a copy of this (using the
+	 * 'get_current_cred()' function), that will clear the
+	 * non_rcu field, because now that other user may be
+	 * expecting RCU freeing. But normal thread-synchronous
+	 * cred accesses will keep things non-RCU.
+	 */
+	override_cred->non_rcu = 1;
+
 	old_cred = override_creds(override_cred);
 retry:
 	res = user_path_at(dfd, filename, lookup_flags, &path);
diff --git a/include/linux/cred.h b/include/linux/cred.h
index 7eb43a038330..f7a30e0099be 100644
--- a/include/linux/cred.h
+++ b/include/linux/cred.h
@@ -145,7 +145,11 @@ struct cred {
 	struct user_struct *user;	/* real user ID subscription */
 	struct user_namespace *user_ns; /* user_ns the caps and keyrings are relative to. */
 	struct group_info *group_info;	/* supplementary groups for euid/fsgid */
-	struct rcu_head	rcu;		/* RCU deletion hook */
+	/* RCU deletion */
+	union {
+		int non_rcu;			/* Can we skip RCU deletion? */
+		struct rcu_head	rcu;		/* RCU deletion hook */
+	};
 } __randomize_layout;
 
 extern void __put_cred(struct cred *);
@@ -246,6 +250,7 @@ static inline const struct cred *get_cred(const struct cred *cred)
 	if (!cred)
 		return cred;
 	validate_creds(cred);
+	nonconst_cred->non_rcu = 0;
 	return get_new_cred(nonconst_cred);
 }
 
@@ -257,6 +262,7 @@ static inline const struct cred *get_cred_rcu(const struct cred *cred)
 	if (!atomic_inc_not_zero(&nonconst_cred->usage))
 		return NULL;
 	validate_creds(cred);
+	nonconst_cred->non_rcu = 0;
 	return cred;
 }
 
diff --git a/kernel/cred.c b/kernel/cred.c
index c73a87a4df13..153ae369e024 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -144,7 +144,10 @@ void __put_cred(struct cred *cred)
 	BUG_ON(cred == current->cred);
 	BUG_ON(cred == current->real_cred);
 
-	call_rcu(&cred->rcu, put_cred_rcu);
+	if (cred->non_rcu)
+		put_cred_rcu(&cred->rcu);
+	else
+		call_rcu(&cred->rcu, put_cred_rcu);
 }
 EXPORT_SYMBOL(__put_cred);
 
@@ -256,6 +259,7 @@ struct cred *prepare_creds(void)
 	old = task->cred;
 	memcpy(new, old, sizeof(struct cred));
 
+	new->non_rcu = 0;
 	atomic_set(&new->usage, 1);
 	set_cred_subscribers(new, 0);
 	get_group_info(new->group_info);
@@ -535,7 +539,19 @@ const struct cred *override_creds(const struct cred *new)
 
 	validate_creds(old);
 	validate_creds(new);
-	get_cred(new);
+
+	/*
+	 * NOTE! This uses 'get_new_cred()' rather than 'get_cred()'.
+	 *
+	 * That means that we do not clear the 'non_rcu' flag, since
+	 * we are only installing the cred into the thread-synchronous
+	 * '->cred' pointer, not the '->real_cred' pointer that is
+	 * visible to other threads under RCU.
+	 *
+	 * Also note that we did validate_creds() manually, not depending
+	 * on the validation in 'get_cred()'.
+	 */
+	get_new_cred((struct cred *)new);
 	alter_cred_subscribers(new, 1);
 	rcu_assign_pointer(current->cred, new);
 	alter_cred_subscribers(old, -1);
@@ -672,6 +688,7 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon)
 	validate_creds(old);
 
 	*new = *old;
+	new->non_rcu = 0;
 	atomic_set(&new->usage, 1);
 	set_cred_subscribers(new, 0);
 	get_uid(new->user);
-- 
cgit v1.2.3


From 3b51c44bd6936e86a7180abd9aebc4387a479253 Mon Sep 17 00:00:00 2001
From: Atif Niyaz <atifniyaz@google.com>
Date: Wed, 24 Jul 2019 22:26:31 +0300
Subject: Input: allow drivers specify timestamp for input events

Currently, evdev stamps events with timestamps acquired in evdev_events().
However, this timestamping may not be accurate in terms of measuring
when the actual event happened.

Let's allow individual drivers to specify a timestamp in order to provide a more
accurate sense of time for the event. It is expected that drivers will set the
timestamp in their hard interrupt routine.

Signed-off-by: Atif Niyaz <atifniyaz@google.com>
Signed-off-by: Dmitry Torokhov <dmitry.torokhov@gmail.com>
---
 drivers/input/evdev.c | 35 ++++++++---------------------------
 drivers/input/input.c | 40 ++++++++++++++++++++++++++++++++++++++++
 include/linux/input.h | 14 ++++++++++++++
 3 files changed, 62 insertions(+), 27 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/input/evdev.c b/drivers/input/evdev.c
index 867c2cfd0038..d7dd6fcf2db0 100644
--- a/drivers/input/evdev.c
+++ b/drivers/input/evdev.c
@@ -25,13 +25,6 @@
 #include <linux/cdev.h>
 #include "input-compat.h"
 
-enum evdev_clock_type {
-	EV_CLK_REAL = 0,
-	EV_CLK_MONO,
-	EV_CLK_BOOT,
-	EV_CLK_MAX
-};
-
 struct evdev {
 	int open;
 	struct input_handle handle;
@@ -53,7 +46,7 @@ struct evdev_client {
 	struct fasync_struct *fasync;
 	struct evdev *evdev;
 	struct list_head node;
-	unsigned int clk_type;
+	enum input_clock_type clk_type;
 	bool revoked;
 	unsigned long *evmasks[EV_CNT];
 	unsigned int bufsize;
@@ -149,17 +142,10 @@ static void __evdev_flush_queue(struct evdev_client *client, unsigned int type)
 
 static void __evdev_queue_syn_dropped(struct evdev_client *client)
 {
+	ktime_t *ev_time = input_get_timestamp(client->evdev->handle.dev);
+	struct timespec64 ts = ktime_to_timespec64(ev_time[client->clk_type]);
 	struct input_event ev;
-	ktime_t time;
-	struct timespec64 ts;
 
-	time = client->clk_type == EV_CLK_REAL ?
-			ktime_get_real() :
-			client->clk_type == EV_CLK_MONO ?
-				ktime_get() :
-				ktime_get_boottime();
-
-	ts = ktime_to_timespec64(time);
 	ev.input_event_sec = ts.tv_sec;
 	ev.input_event_usec = ts.tv_nsec / NSEC_PER_USEC;
 	ev.type = EV_SYN;
@@ -188,18 +174,18 @@ static void evdev_queue_syn_dropped(struct evdev_client *client)
 static int evdev_set_clk_type(struct evdev_client *client, unsigned int clkid)
 {
 	unsigned long flags;
-	unsigned int clk_type;
+	enum input_clock_type clk_type;
 
 	switch (clkid) {
 
 	case CLOCK_REALTIME:
-		clk_type = EV_CLK_REAL;
+		clk_type = INPUT_CLK_REAL;
 		break;
 	case CLOCK_MONOTONIC:
-		clk_type = EV_CLK_MONO;
+		clk_type = INPUT_CLK_MONO;
 		break;
 	case CLOCK_BOOTTIME:
-		clk_type = EV_CLK_BOOT;
+		clk_type = INPUT_CLK_BOOT;
 		break;
 	default:
 		return -EINVAL;
@@ -307,12 +293,7 @@ static void evdev_events(struct input_handle *handle,
 {
 	struct evdev *evdev = handle->private;
 	struct evdev_client *client;
-	ktime_t ev_time[EV_CLK_MAX];
-
-	ev_time[EV_CLK_MONO] = ktime_get();
-	ev_time[EV_CLK_REAL] = ktime_mono_to_real(ev_time[EV_CLK_MONO]);
-	ev_time[EV_CLK_BOOT] = ktime_mono_to_any(ev_time[EV_CLK_MONO],
-						 TK_OFFS_BOOT);
+	ktime_t *ev_time = input_get_timestamp(handle->dev);
 
 	rcu_read_lock();
 
diff --git a/drivers/input/input.c b/drivers/input/input.c
index 7f3c5fcb9ed6..7494a0dede79 100644
--- a/drivers/input/input.c
+++ b/drivers/input/input.c
@@ -1894,6 +1894,46 @@ void input_free_device(struct input_dev *dev)
 }
 EXPORT_SYMBOL(input_free_device);
 
+/**
+ * input_set_timestamp - set timestamp for input events
+ * @dev: input device to set timestamp for
+ * @timestamp: the time at which the event has occurred
+ *   in CLOCK_MONOTONIC
+ *
+ * This function is intended to provide to the input system a more
+ * accurate time of when an event actually occurred. The driver should
+ * call this function as soon as a timestamp is acquired ensuring
+ * clock conversions in input_set_timestamp are done correctly.
+ *
+ * The system entering suspend state between timestamp acquisition and
+ * calling input_set_timestamp can result in inaccurate conversions.
+ */
+void input_set_timestamp(struct input_dev *dev, ktime_t timestamp)
+{
+	dev->timestamp[INPUT_CLK_MONO] = timestamp;
+	dev->timestamp[INPUT_CLK_REAL] = ktime_mono_to_real(timestamp);
+	dev->timestamp[INPUT_CLK_BOOT] = ktime_mono_to_any(timestamp,
+							   TK_OFFS_BOOT);
+}
+EXPORT_SYMBOL(input_set_timestamp);
+
+/**
+ * input_get_timestamp - get timestamp for input events
+ * @dev: input device to get timestamp from
+ *
+ * A valid timestamp is a timestamp of non-zero value.
+ */
+ktime_t *input_get_timestamp(struct input_dev *dev)
+{
+	const ktime_t invalid_timestamp = ktime_set(0, 0);
+
+	if (!ktime_compare(dev->timestamp[INPUT_CLK_MONO], invalid_timestamp))
+		input_set_timestamp(dev, ktime_get());
+
+	return dev->timestamp;
+}
+EXPORT_SYMBOL(input_get_timestamp);
+
 /**
  * input_set_capability - mark device as capable of a certain event
  * @dev: device that is capable of emitting or accepting event
diff --git a/include/linux/input.h b/include/linux/input.h
index 510e78558c10..e95a439d8bd5 100644
--- a/include/linux/input.h
+++ b/include/linux/input.h
@@ -33,6 +33,13 @@ struct input_value {
 	__s32 value;
 };
 
+enum input_clock_type {
+	INPUT_CLK_REAL = 0,
+	INPUT_CLK_MONO,
+	INPUT_CLK_BOOT,
+	INPUT_CLK_MAX
+};
+
 /**
  * struct input_dev - represents an input device
  * @name: name of the device
@@ -114,6 +121,8 @@ struct input_value {
  * @vals: array of values queued in the current frame
  * @devres_managed: indicates that devices is managed with devres framework
  *	and needs not be explicitly unregistered or freed.
+ * @timestamp: storage for a timestamp set by input_set_timestamp called
+ *  by a driver
  */
 struct input_dev {
 	const char *name;
@@ -184,6 +193,8 @@ struct input_dev {
 	struct input_value *vals;
 
 	bool devres_managed;
+
+	ktime_t timestamp[INPUT_CLK_MAX];
 };
 #define to_input_dev(d) container_of(d, struct input_dev, dev)
 
@@ -382,6 +393,9 @@ void input_close_device(struct input_handle *);
 
 int input_flush_device(struct input_handle *handle, struct file *file);
 
+void input_set_timestamp(struct input_dev *dev, ktime_t timestamp);
+ktime_t *input_get_timestamp(struct input_dev *dev);
+
 void input_event(struct input_dev *dev, unsigned int type, unsigned int code, int value);
 void input_inject_event(struct input_handle *handle, unsigned int type, unsigned int code, int value);
 
-- 
cgit v1.2.3


From 16d51a590a8ce3befb1308e0e7ab77f3b661af33 Mon Sep 17 00:00:00 2001
From: Jann Horn <jannh@google.com>
Date: Tue, 16 Jul 2019 17:20:45 +0200
Subject: sched/fair: Don't free p->numa_faults with concurrent readers

When going through execve(), zero out the NUMA fault statistics instead of
freeing them.

During execve, the task is reachable through procfs and the scheduler. A
concurrent /proc/*/sched reader can read data from a freed ->numa_faults
allocation (confirmed by KASAN) and write it back to userspace.
I believe that it would also be possible for a use-after-free read to occur
through a race between a NUMA fault and execve(): task_numa_fault() can
lead to task_numa_compare(), which invokes task_weight() on the currently
running task of a different CPU.

Another way to fix this would be to make ->numa_faults RCU-managed or add
extra locking, but it seems easier to wipe the NUMA fault statistics on
execve.

Signed-off-by: Jann Horn <jannh@google.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Petr Mladek <pmladek@suse.com>
Cc: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Will Deacon <will@kernel.org>
Fixes: 82727018b0d3 ("sched/numa: Call task_numa_free() from do_execve()")
Link: https://lkml.kernel.org/r/20190716152047.14424-1-jannh@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 fs/exec.c                            |  2 +-
 include/linux/sched/numa_balancing.h |  4 ++--
 kernel/fork.c                        |  2 +-
 kernel/sched/fair.c                  | 24 ++++++++++++++++++++----
 4 files changed, 24 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/fs/exec.c b/fs/exec.c
index c71cbfe6826a..f7f6a140856a 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1828,7 +1828,7 @@ static int __do_execve_file(int fd, struct filename *filename,
 	membarrier_execve(current);
 	rseq_execve(current);
 	acct_update_integrals(current);
-	task_numa_free(current);
+	task_numa_free(current, false);
 	free_bprm(bprm);
 	kfree(pathbuf);
 	if (filename)
diff --git a/include/linux/sched/numa_balancing.h b/include/linux/sched/numa_balancing.h
index e7dd04a84ba8..3988762efe15 100644
--- a/include/linux/sched/numa_balancing.h
+++ b/include/linux/sched/numa_balancing.h
@@ -19,7 +19,7 @@
 extern void task_numa_fault(int last_node, int node, int pages, int flags);
 extern pid_t task_numa_group_id(struct task_struct *p);
 extern void set_numabalancing_state(bool enabled);
-extern void task_numa_free(struct task_struct *p);
+extern void task_numa_free(struct task_struct *p, bool final);
 extern bool should_numa_migrate_memory(struct task_struct *p, struct page *page,
 					int src_nid, int dst_cpu);
 #else
@@ -34,7 +34,7 @@ static inline pid_t task_numa_group_id(struct task_struct *p)
 static inline void set_numabalancing_state(bool enabled)
 {
 }
-static inline void task_numa_free(struct task_struct *p)
+static inline void task_numa_free(struct task_struct *p, bool final)
 {
 }
 static inline bool should_numa_migrate_memory(struct task_struct *p,
diff --git a/kernel/fork.c b/kernel/fork.c
index d8ae0f1b4148..2852d0e76ea3 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -726,7 +726,7 @@ void __put_task_struct(struct task_struct *tsk)
 	WARN_ON(tsk == current);
 
 	cgroup_free(tsk);
-	task_numa_free(tsk);
+	task_numa_free(tsk, true);
 	security_task_free(tsk);
 	exit_creds(tsk);
 	delayacct_tsk_free(tsk);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 036be95a87e9..6adb0e0f5feb 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2353,13 +2353,23 @@ no_join:
 	return;
 }
 
-void task_numa_free(struct task_struct *p)
+/*
+ * Get rid of NUMA statistics associated with a task (either current or dead).
+ * If @final is set, the task is dead and has reached refcount zero, so we can
+ * safely free all relevant data structures. Otherwise, there might be
+ * concurrent reads from places like load balancing and procfs, and we should
+ * reset the data back to default state without freeing ->numa_faults.
+ */
+void task_numa_free(struct task_struct *p, bool final)
 {
 	struct numa_group *grp = p->numa_group;
-	void *numa_faults = p->numa_faults;
+	unsigned long *numa_faults = p->numa_faults;
 	unsigned long flags;
 	int i;
 
+	if (!numa_faults)
+		return;
+
 	if (grp) {
 		spin_lock_irqsave(&grp->lock, flags);
 		for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
@@ -2372,8 +2382,14 @@ void task_numa_free(struct task_struct *p)
 		put_numa_group(grp);
 	}
 
-	p->numa_faults = NULL;
-	kfree(numa_faults);
+	if (final) {
+		p->numa_faults = NULL;
+		kfree(numa_faults);
+	} else {
+		p->total_numa_faults = 0;
+		for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
+			numa_faults[i] = 0;
+	}
 }
 
 /*
-- 
cgit v1.2.3


From cb361d8cdef69990f6b4504dc1fd9a594d983c97 Mon Sep 17 00:00:00 2001
From: Jann Horn <jannh@google.com>
Date: Tue, 16 Jul 2019 17:20:47 +0200
Subject: sched/fair: Use RCU accessors consistently for ->numa_group

The old code used RCU annotations and accessors inconsistently for
->numa_group, which can lead to use-after-frees and NULL dereferences.

Let all accesses to ->numa_group use proper RCU helpers to prevent such
issues.

Signed-off-by: Jann Horn <jannh@google.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Petr Mladek <pmladek@suse.com>
Cc: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Will Deacon <will@kernel.org>
Fixes: 8c8a743c5087 ("sched/numa: Use {cpu, pid} to create task groups for shared faults")
Link: https://lkml.kernel.org/r/20190716152047.14424-3-jannh@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched.h |  10 ++++-
 kernel/sched/fair.c   | 120 ++++++++++++++++++++++++++++++++++----------------
 2 files changed, 90 insertions(+), 40 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 8dc1811487f5..9f51932bd543 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1092,7 +1092,15 @@ struct task_struct {
 	u64				last_sum_exec_runtime;
 	struct callback_head		numa_work;
 
-	struct numa_group		*numa_group;
+	/*
+	 * This pointer is only modified for current in syscall and
+	 * pagefault context (and for tasks being destroyed), so it can be read
+	 * from any of the following contexts:
+	 *  - RCU read-side critical section
+	 *  - current->numa_group from everywhere
+	 *  - task's runqueue locked, task not running
+	 */
+	struct numa_group __rcu		*numa_group;
 
 	/*
 	 * numa_faults is an array split into four regions:
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 6adb0e0f5feb..bc9cfeaac8bd 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1086,6 +1086,21 @@ struct numa_group {
 	unsigned long faults[0];
 };
 
+/*
+ * For functions that can be called in multiple contexts that permit reading
+ * ->numa_group (see struct task_struct for locking rules).
+ */
+static struct numa_group *deref_task_numa_group(struct task_struct *p)
+{
+	return rcu_dereference_check(p->numa_group, p == current ||
+		(lockdep_is_held(&task_rq(p)->lock) && !READ_ONCE(p->on_cpu)));
+}
+
+static struct numa_group *deref_curr_numa_group(struct task_struct *p)
+{
+	return rcu_dereference_protected(p->numa_group, p == current);
+}
+
 static inline unsigned long group_faults_priv(struct numa_group *ng);
 static inline unsigned long group_faults_shared(struct numa_group *ng);
 
@@ -1129,10 +1144,12 @@ static unsigned int task_scan_start(struct task_struct *p)
 {
 	unsigned long smin = task_scan_min(p);
 	unsigned long period = smin;
+	struct numa_group *ng;
 
 	/* Scale the maximum scan period with the amount of shared memory. */
-	if (p->numa_group) {
-		struct numa_group *ng = p->numa_group;
+	rcu_read_lock();
+	ng = rcu_dereference(p->numa_group);
+	if (ng) {
 		unsigned long shared = group_faults_shared(ng);
 		unsigned long private = group_faults_priv(ng);
 
@@ -1140,6 +1157,7 @@ static unsigned int task_scan_start(struct task_struct *p)
 		period *= shared + 1;
 		period /= private + shared + 1;
 	}
+	rcu_read_unlock();
 
 	return max(smin, period);
 }
@@ -1148,13 +1166,14 @@ static unsigned int task_scan_max(struct task_struct *p)
 {
 	unsigned long smin = task_scan_min(p);
 	unsigned long smax;
+	struct numa_group *ng;
 
 	/* Watch for min being lower than max due to floor calculations */
 	smax = sysctl_numa_balancing_scan_period_max / task_nr_scan_windows(p);
 
 	/* Scale the maximum scan period with the amount of shared memory. */
-	if (p->numa_group) {
-		struct numa_group *ng = p->numa_group;
+	ng = deref_curr_numa_group(p);
+	if (ng) {
 		unsigned long shared = group_faults_shared(ng);
 		unsigned long private = group_faults_priv(ng);
 		unsigned long period = smax;
@@ -1186,7 +1205,7 @@ void init_numa_balancing(unsigned long clone_flags, struct task_struct *p)
 	p->numa_scan_period		= sysctl_numa_balancing_scan_delay;
 	p->numa_work.next		= &p->numa_work;
 	p->numa_faults			= NULL;
-	p->numa_group			= NULL;
+	RCU_INIT_POINTER(p->numa_group, NULL);
 	p->last_task_numa_placement	= 0;
 	p->last_sum_exec_runtime	= 0;
 
@@ -1233,7 +1252,16 @@ static void account_numa_dequeue(struct rq *rq, struct task_struct *p)
 
 pid_t task_numa_group_id(struct task_struct *p)
 {
-	return p->numa_group ? p->numa_group->gid : 0;
+	struct numa_group *ng;
+	pid_t gid = 0;
+
+	rcu_read_lock();
+	ng = rcu_dereference(p->numa_group);
+	if (ng)
+		gid = ng->gid;
+	rcu_read_unlock();
+
+	return gid;
 }
 
 /*
@@ -1258,11 +1286,13 @@ static inline unsigned long task_faults(struct task_struct *p, int nid)
 
 static inline unsigned long group_faults(struct task_struct *p, int nid)
 {
-	if (!p->numa_group)
+	struct numa_group *ng = deref_task_numa_group(p);
+
+	if (!ng)
 		return 0;
 
-	return p->numa_group->faults[task_faults_idx(NUMA_MEM, nid, 0)] +
-		p->numa_group->faults[task_faults_idx(NUMA_MEM, nid, 1)];
+	return ng->faults[task_faults_idx(NUMA_MEM, nid, 0)] +
+		ng->faults[task_faults_idx(NUMA_MEM, nid, 1)];
 }
 
 static inline unsigned long group_faults_cpu(struct numa_group *group, int nid)
@@ -1400,12 +1430,13 @@ static inline unsigned long task_weight(struct task_struct *p, int nid,
 static inline unsigned long group_weight(struct task_struct *p, int nid,
 					 int dist)
 {
+	struct numa_group *ng = deref_task_numa_group(p);
 	unsigned long faults, total_faults;
 
-	if (!p->numa_group)
+	if (!ng)
 		return 0;
 
-	total_faults = p->numa_group->total_faults;
+	total_faults = ng->total_faults;
 
 	if (!total_faults)
 		return 0;
@@ -1419,7 +1450,7 @@ static inline unsigned long group_weight(struct task_struct *p, int nid,
 bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
 				int src_nid, int dst_cpu)
 {
-	struct numa_group *ng = p->numa_group;
+	struct numa_group *ng = deref_curr_numa_group(p);
 	int dst_nid = cpu_to_node(dst_cpu);
 	int last_cpupid, this_cpupid;
 
@@ -1600,13 +1631,14 @@ static bool load_too_imbalanced(long src_load, long dst_load,
 static void task_numa_compare(struct task_numa_env *env,
 			      long taskimp, long groupimp, bool maymove)
 {
+	struct numa_group *cur_ng, *p_ng = deref_curr_numa_group(env->p);
 	struct rq *dst_rq = cpu_rq(env->dst_cpu);
+	long imp = p_ng ? groupimp : taskimp;
 	struct task_struct *cur;
 	long src_load, dst_load;
-	long load;
-	long imp = env->p->numa_group ? groupimp : taskimp;
-	long moveimp = imp;
 	int dist = env->dist;
+	long moveimp = imp;
+	long load;
 
 	if (READ_ONCE(dst_rq->numa_migrate_on))
 		return;
@@ -1645,21 +1677,22 @@ static void task_numa_compare(struct task_numa_env *env,
 	 * If dst and source tasks are in the same NUMA group, or not
 	 * in any group then look only at task weights.
 	 */
-	if (cur->numa_group == env->p->numa_group) {
+	cur_ng = rcu_dereference(cur->numa_group);
+	if (cur_ng == p_ng) {
 		imp = taskimp + task_weight(cur, env->src_nid, dist) -
 		      task_weight(cur, env->dst_nid, dist);
 		/*
 		 * Add some hysteresis to prevent swapping the
 		 * tasks within a group over tiny differences.
 		 */
-		if (cur->numa_group)
+		if (cur_ng)
 			imp -= imp / 16;
 	} else {
 		/*
 		 * Compare the group weights. If a task is all by itself
 		 * (not part of a group), use the task weight instead.
 		 */
-		if (cur->numa_group && env->p->numa_group)
+		if (cur_ng && p_ng)
 			imp += group_weight(cur, env->src_nid, dist) -
 			       group_weight(cur, env->dst_nid, dist);
 		else
@@ -1757,11 +1790,12 @@ static int task_numa_migrate(struct task_struct *p)
 		.best_imp = 0,
 		.best_cpu = -1,
 	};
+	unsigned long taskweight, groupweight;
 	struct sched_domain *sd;
+	long taskimp, groupimp;
+	struct numa_group *ng;
 	struct rq *best_rq;
-	unsigned long taskweight, groupweight;
 	int nid, ret, dist;
-	long taskimp, groupimp;
 
 	/*
 	 * Pick the lowest SD_NUMA domain, as that would have the smallest
@@ -1807,7 +1841,8 @@ static int task_numa_migrate(struct task_struct *p)
 	 *   multiple NUMA nodes; in order to better consolidate the group,
 	 *   we need to check other locations.
 	 */
-	if (env.best_cpu == -1 || (p->numa_group && p->numa_group->active_nodes > 1)) {
+	ng = deref_curr_numa_group(p);
+	if (env.best_cpu == -1 || (ng && ng->active_nodes > 1)) {
 		for_each_online_node(nid) {
 			if (nid == env.src_nid || nid == p->numa_preferred_nid)
 				continue;
@@ -1840,7 +1875,7 @@ static int task_numa_migrate(struct task_struct *p)
 	 * A task that migrated to a second choice node will be better off
 	 * trying for a better one later. Do not set the preferred node here.
 	 */
-	if (p->numa_group) {
+	if (ng) {
 		if (env.best_cpu == -1)
 			nid = env.src_nid;
 		else
@@ -2135,6 +2170,7 @@ static void task_numa_placement(struct task_struct *p)
 	unsigned long total_faults;
 	u64 runtime, period;
 	spinlock_t *group_lock = NULL;
+	struct numa_group *ng;
 
 	/*
 	 * The p->mm->numa_scan_seq field gets updated without
@@ -2152,8 +2188,9 @@ static void task_numa_placement(struct task_struct *p)
 	runtime = numa_get_avg_runtime(p, &period);
 
 	/* If the task is part of a group prevent parallel updates to group stats */
-	if (p->numa_group) {
-		group_lock = &p->numa_group->lock;
+	ng = deref_curr_numa_group(p);
+	if (ng) {
+		group_lock = &ng->lock;
 		spin_lock_irq(group_lock);
 	}
 
@@ -2194,7 +2231,7 @@ static void task_numa_placement(struct task_struct *p)
 			p->numa_faults[cpu_idx] += f_diff;
 			faults += p->numa_faults[mem_idx];
 			p->total_numa_faults += diff;
-			if (p->numa_group) {
+			if (ng) {
 				/*
 				 * safe because we can only change our own group
 				 *
@@ -2202,14 +2239,14 @@ static void task_numa_placement(struct task_struct *p)
 				 * nid and priv in a specific region because it
 				 * is at the beginning of the numa_faults array.
 				 */
-				p->numa_group->faults[mem_idx] += diff;
-				p->numa_group->faults_cpu[mem_idx] += f_diff;
-				p->numa_group->total_faults += diff;
-				group_faults += p->numa_group->faults[mem_idx];
+				ng->faults[mem_idx] += diff;
+				ng->faults_cpu[mem_idx] += f_diff;
+				ng->total_faults += diff;
+				group_faults += ng->faults[mem_idx];
 			}
 		}
 
-		if (!p->numa_group) {
+		if (!ng) {
 			if (faults > max_faults) {
 				max_faults = faults;
 				max_nid = nid;
@@ -2220,8 +2257,8 @@ static void task_numa_placement(struct task_struct *p)
 		}
 	}
 
-	if (p->numa_group) {
-		numa_group_count_active_nodes(p->numa_group);
+	if (ng) {
+		numa_group_count_active_nodes(ng);
 		spin_unlock_irq(group_lock);
 		max_nid = preferred_group_nid(p, max_nid);
 	}
@@ -2255,7 +2292,7 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags,
 	int cpu = cpupid_to_cpu(cpupid);
 	int i;
 
-	if (unlikely(!p->numa_group)) {
+	if (unlikely(!deref_curr_numa_group(p))) {
 		unsigned int size = sizeof(struct numa_group) +
 				    4*nr_node_ids*sizeof(unsigned long);
 
@@ -2291,7 +2328,7 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags,
 	if (!grp)
 		goto no_join;
 
-	my_grp = p->numa_group;
+	my_grp = deref_curr_numa_group(p);
 	if (grp == my_grp)
 		goto no_join;
 
@@ -2362,7 +2399,8 @@ no_join:
  */
 void task_numa_free(struct task_struct *p, bool final)
 {
-	struct numa_group *grp = p->numa_group;
+	/* safe: p either is current or is being freed by current */
+	struct numa_group *grp = rcu_dereference_raw(p->numa_group);
 	unsigned long *numa_faults = p->numa_faults;
 	unsigned long flags;
 	int i;
@@ -2442,7 +2480,7 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
 	 * actively using should be counted as local. This allows the
 	 * scan rate to slow down when a workload has settled down.
 	 */
-	ng = p->numa_group;
+	ng = deref_curr_numa_group(p);
 	if (!priv && !local && ng && ng->active_nodes > 1 &&
 				numa_is_active_node(cpu_node, ng) &&
 				numa_is_active_node(mem_node, ng))
@@ -10460,18 +10498,22 @@ void show_numa_stats(struct task_struct *p, struct seq_file *m)
 {
 	int node;
 	unsigned long tsf = 0, tpf = 0, gsf = 0, gpf = 0;
+	struct numa_group *ng;
 
+	rcu_read_lock();
+	ng = rcu_dereference(p->numa_group);
 	for_each_online_node(node) {
 		if (p->numa_faults) {
 			tsf = p->numa_faults[task_faults_idx(NUMA_MEM, node, 0)];
 			tpf = p->numa_faults[task_faults_idx(NUMA_MEM, node, 1)];
 		}
-		if (p->numa_group) {
-			gsf = p->numa_group->faults[task_faults_idx(NUMA_MEM, node, 0)],
-			gpf = p->numa_group->faults[task_faults_idx(NUMA_MEM, node, 1)];
+		if (ng) {
+			gsf = ng->faults[task_faults_idx(NUMA_MEM, node, 0)],
+			gpf = ng->faults[task_faults_idx(NUMA_MEM, node, 1)];
 		}
 		print_numa_stats(m, node, tsf, tpf, gsf, gpf);
 	}
+	rcu_read_unlock();
 }
 #endif /* CONFIG_NUMA_BALANCING */
 #endif /* CONFIG_SCHED_DEBUG */
-- 
cgit v1.2.3


From 8732d85a69a0411f16a4b78df8fdc7b09c50a849 Mon Sep 17 00:00:00 2001
From: Mattias Jacobsson <2pi@mok.nu>
Date: Fri, 19 Jul 2019 19:51:45 +0200
Subject: platform/x86: wmi: add missing struct parameter description

Add a description for the context parameter in the struct wmi_device_id.

Reported-by: kbuild test robot <lkp@intel.com>
Fixes: a48e23385fcf ("platform/x86: wmi: add context pointer field to struct wmi_device_id")
Signed-off-by: Mattias Jacobsson <2pi@mok.nu>
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
---
 include/linux/mod_devicetable.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/mod_devicetable.h b/include/linux/mod_devicetable.h
index b2c1648f7e5d..5714fd35a83c 100644
--- a/include/linux/mod_devicetable.h
+++ b/include/linux/mod_devicetable.h
@@ -814,6 +814,7 @@ struct tee_client_device_id {
 /**
  * struct wmi_device_id - WMI device identifier
  * @guid_string: 36 char string of the form fa50ff2b-f2e8-45de-83fa-65417f2f49ba
+ * @context: pointer to driver specific data
  */
 struct wmi_device_id {
 	const char guid_string[UUID_STRING_LEN+1];
-- 
cgit v1.2.3


From f8be17b81d44aed1f9ea68c3fc70f501c9616e2d Mon Sep 17 00:00:00 2001
From: Leon Romanovsky <leonro@mellanox.com>
Date: Tue, 23 Jul 2019 10:22:48 +0300
Subject: lib/dim: Fix -Wunused-const-variable warnings

DIM causes the following warnings during kernel compilation,
which indicate that tx_profile and rx_profile are supposed to
be declared in *.c and not in *.h files.

In file included from ./include/rdma/ib_verbs.h:64,
                 from ./include/linux/mlx5/device.h:37,
                 from ./include/linux/mlx5/driver.h:51,
                 from ./include/linux/mlx5/vport.h:36,
                 from drivers/infiniband/hw/mlx5/ib_virt.c:34:
./include/linux/dim.h:326:1: warning: _tx_profile_ defined but not used [-Wunused-const-variable=]
  326 | tx_profile[DIM_CQ_PERIOD_NUM_MODES][NET_DIM_PARAMS_NUM_PROFILES] = {
      | ^~~~~~~~~~
./include/linux/dim.h:320:1: warning: _rx_profile_ defined but not used [-Wunused-const-variable=]
  320 | rx_profile[DIM_CQ_PERIOD_NUM_MODES][NET_DIM_PARAMS_NUM_PROFILES] = {
      | ^~~~~~~~~~

Fixes: 4f75da3666c0 ("linux/dim: Move implementation to .c files")
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
Reviewed-by: Bart Van Assche <bvanassche@acm.org>
Acked-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/dim.h | 56 -----------------------------------------------------
 lib/dim/net_dim.c   | 56 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 56 insertions(+), 56 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/dim.h b/include/linux/dim.h
index d3a0fbfff2bb..9fa4b3f88c39 100644
--- a/include/linux/dim.h
+++ b/include/linux/dim.h
@@ -272,62 +272,6 @@ dim_update_sample_with_comps(u16 event_ctr, u64 packets, u64 bytes, u64 comps,
 
 /* Net DIM */
 
-/*
- * Net DIM profiles:
- *        There are different set of profiles for each CQ period mode.
- *        There are different set of profiles for RX/TX CQs.
- *        Each profile size must be of NET_DIM_PARAMS_NUM_PROFILES
- */
-#define NET_DIM_PARAMS_NUM_PROFILES 5
-#define NET_DIM_DEFAULT_RX_CQ_MODERATION_PKTS_FROM_EQE 256
-#define NET_DIM_DEFAULT_TX_CQ_MODERATION_PKTS_FROM_EQE 128
-#define NET_DIM_DEF_PROFILE_CQE 1
-#define NET_DIM_DEF_PROFILE_EQE 1
-
-#define NET_DIM_RX_EQE_PROFILES { \
-	{1,   NET_DIM_DEFAULT_RX_CQ_MODERATION_PKTS_FROM_EQE}, \
-	{8,   NET_DIM_DEFAULT_RX_CQ_MODERATION_PKTS_FROM_EQE}, \
-	{64,  NET_DIM_DEFAULT_RX_CQ_MODERATION_PKTS_FROM_EQE}, \
-	{128, NET_DIM_DEFAULT_RX_CQ_MODERATION_PKTS_FROM_EQE}, \
-	{256, NET_DIM_DEFAULT_RX_CQ_MODERATION_PKTS_FROM_EQE}, \
-}
-
-#define NET_DIM_RX_CQE_PROFILES { \
-	{2,  256},             \
-	{8,  128},             \
-	{16, 64},              \
-	{32, 64},              \
-	{64, 64}               \
-}
-
-#define NET_DIM_TX_EQE_PROFILES { \
-	{1,   NET_DIM_DEFAULT_TX_CQ_MODERATION_PKTS_FROM_EQE},  \
-	{8,   NET_DIM_DEFAULT_TX_CQ_MODERATION_PKTS_FROM_EQE},  \
-	{32,  NET_DIM_DEFAULT_TX_CQ_MODERATION_PKTS_FROM_EQE},  \
-	{64,  NET_DIM_DEFAULT_TX_CQ_MODERATION_PKTS_FROM_EQE},  \
-	{128, NET_DIM_DEFAULT_TX_CQ_MODERATION_PKTS_FROM_EQE}   \
-}
-
-#define NET_DIM_TX_CQE_PROFILES { \
-	{5,  128},  \
-	{8,  64},  \
-	{16, 32},  \
-	{32, 32},  \
-	{64, 32}   \
-}
-
-static const struct dim_cq_moder
-rx_profile[DIM_CQ_PERIOD_NUM_MODES][NET_DIM_PARAMS_NUM_PROFILES] = {
-	NET_DIM_RX_EQE_PROFILES,
-	NET_DIM_RX_CQE_PROFILES,
-};
-
-static const struct dim_cq_moder
-tx_profile[DIM_CQ_PERIOD_NUM_MODES][NET_DIM_PARAMS_NUM_PROFILES] = {
-	NET_DIM_TX_EQE_PROFILES,
-	NET_DIM_TX_CQE_PROFILES,
-};
-
 /**
  *	net_dim_get_rx_moderation - provide a CQ moderation object for the given RX profile
  *	@cq_period_mode: CQ period mode
diff --git a/lib/dim/net_dim.c b/lib/dim/net_dim.c
index 5bcc902c5388..a4db51c21266 100644
--- a/lib/dim/net_dim.c
+++ b/lib/dim/net_dim.c
@@ -5,6 +5,62 @@
 
 #include <linux/dim.h>
 
+/*
+ * Net DIM profiles:
+ *        There are different set of profiles for each CQ period mode.
+ *        There are different set of profiles for RX/TX CQs.
+ *        Each profile size must be of NET_DIM_PARAMS_NUM_PROFILES
+ */
+#define NET_DIM_PARAMS_NUM_PROFILES 5
+#define NET_DIM_DEFAULT_RX_CQ_MODERATION_PKTS_FROM_EQE 256
+#define NET_DIM_DEFAULT_TX_CQ_MODERATION_PKTS_FROM_EQE 128
+#define NET_DIM_DEF_PROFILE_CQE 1
+#define NET_DIM_DEF_PROFILE_EQE 1
+
+#define NET_DIM_RX_EQE_PROFILES { \
+	{1,   NET_DIM_DEFAULT_RX_CQ_MODERATION_PKTS_FROM_EQE}, \
+	{8,   NET_DIM_DEFAULT_RX_CQ_MODERATION_PKTS_FROM_EQE}, \
+	{64,  NET_DIM_DEFAULT_RX_CQ_MODERATION_PKTS_FROM_EQE}, \
+	{128, NET_DIM_DEFAULT_RX_CQ_MODERATION_PKTS_FROM_EQE}, \
+	{256, NET_DIM_DEFAULT_RX_CQ_MODERATION_PKTS_FROM_EQE}, \
+}
+
+#define NET_DIM_RX_CQE_PROFILES { \
+	{2,  256},             \
+	{8,  128},             \
+	{16, 64},              \
+	{32, 64},              \
+	{64, 64}               \
+}
+
+#define NET_DIM_TX_EQE_PROFILES { \
+	{1,   NET_DIM_DEFAULT_TX_CQ_MODERATION_PKTS_FROM_EQE},  \
+	{8,   NET_DIM_DEFAULT_TX_CQ_MODERATION_PKTS_FROM_EQE},  \
+	{32,  NET_DIM_DEFAULT_TX_CQ_MODERATION_PKTS_FROM_EQE},  \
+	{64,  NET_DIM_DEFAULT_TX_CQ_MODERATION_PKTS_FROM_EQE},  \
+	{128, NET_DIM_DEFAULT_TX_CQ_MODERATION_PKTS_FROM_EQE}   \
+}
+
+#define NET_DIM_TX_CQE_PROFILES { \
+	{5,  128},  \
+	{8,  64},  \
+	{16, 32},  \
+	{32, 32},  \
+	{64, 32}   \
+}
+
+static const struct dim_cq_moder
+rx_profile[DIM_CQ_PERIOD_NUM_MODES][NET_DIM_PARAMS_NUM_PROFILES] = {
+	NET_DIM_RX_EQE_PROFILES,
+	NET_DIM_RX_CQE_PROFILES,
+};
+
+static const struct dim_cq_moder
+tx_profile[DIM_CQ_PERIOD_NUM_MODES][NET_DIM_PARAMS_NUM_PROFILES] = {
+	NET_DIM_TX_EQE_PROFILES,
+	NET_DIM_TX_CQE_PROFILES,
+};
+
 struct dim_cq_moder
 net_dim_get_rx_moderation(u8 cq_period_mode, int ix)
 {
-- 
cgit v1.2.3


From 02712bc3250849c1cf99d626aea98f610e695f34 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 24 Jul 2019 08:52:53 +0200
Subject: mm/hmm: move hmm_vma_range_done and hmm_vma_fault to nouveau

These two functions are marked as legacy APIs to get rid of, but seem to
suit the current nouveau flow.  Move them to the only user in preparation
for fixing a locking bug involving caller and callee.  All comments
referring to the old API have been removed as this now is a driver private
helper.

Link: https://lore.kernel.org/r/20190724065258.16603-3-hch@lst.de
Tested-by: Ralph Campbell <rcampbell@nvidia.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Jason Gunthorpe <jgg@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/gpu/drm/nouveau/nouveau_svm.c | 46 +++++++++++++++++++++++++++--
 include/linux/hmm.h                   | 54 -----------------------------------
 2 files changed, 44 insertions(+), 56 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/gpu/drm/nouveau/nouveau_svm.c b/drivers/gpu/drm/nouveau/nouveau_svm.c
index 8c92374afcf2..6c1b04de0db8 100644
--- a/drivers/gpu/drm/nouveau/nouveau_svm.c
+++ b/drivers/gpu/drm/nouveau/nouveau_svm.c
@@ -475,6 +475,48 @@ nouveau_svm_fault_cache(struct nouveau_svm *svm,
 		fault->inst, fault->addr, fault->access);
 }
 
+static inline bool
+nouveau_range_done(struct hmm_range *range)
+{
+	bool ret = hmm_range_valid(range);
+
+	hmm_range_unregister(range);
+	return ret;
+}
+
+static int
+nouveau_range_fault(struct hmm_mirror *mirror, struct hmm_range *range,
+		    bool block)
+{
+	long ret;
+
+	range->default_flags = 0;
+	range->pfn_flags_mask = -1UL;
+
+	ret = hmm_range_register(range, mirror,
+				 range->start, range->end,
+				 PAGE_SHIFT);
+	if (ret)
+		return (int)ret;
+
+	if (!hmm_range_wait_until_valid(range, HMM_RANGE_DEFAULT_TIMEOUT)) {
+		up_read(&range->vma->vm_mm->mmap_sem);
+		return -EAGAIN;
+	}
+
+	ret = hmm_range_fault(range, block);
+	if (ret <= 0) {
+		if (ret == -EBUSY || !ret) {
+			up_read(&range->vma->vm_mm->mmap_sem);
+			ret = -EBUSY;
+		} else if (ret == -EAGAIN)
+			ret = -EBUSY;
+		hmm_range_unregister(range);
+		return ret;
+	}
+	return 0;
+}
+
 static int
 nouveau_svm_fault(struct nvif_notify *notify)
 {
@@ -649,10 +691,10 @@ nouveau_svm_fault(struct nvif_notify *notify)
 		range.values = nouveau_svm_pfn_values;
 		range.pfn_shift = NVIF_VMM_PFNMAP_V0_ADDR_SHIFT;
 again:
-		ret = hmm_vma_fault(&svmm->mirror, &range, true);
+		ret = nouveau_range_fault(&svmm->mirror, &range, true);
 		if (ret == 0) {
 			mutex_lock(&svmm->mutex);
-			if (!hmm_vma_range_done(&range)) {
+			if (!nouveau_range_done(&range)) {
 				mutex_unlock(&svmm->mutex);
 				goto again;
 			}
diff --git a/include/linux/hmm.h b/include/linux/hmm.h
index b8a08b2a10ca..7ef56dc18050 100644
--- a/include/linux/hmm.h
+++ b/include/linux/hmm.h
@@ -484,60 +484,6 @@ long hmm_range_dma_unmap(struct hmm_range *range,
  */
 #define HMM_RANGE_DEFAULT_TIMEOUT 1000
 
-/* This is a temporary helper to avoid merge conflict between trees. */
-static inline bool hmm_vma_range_done(struct hmm_range *range)
-{
-	bool ret = hmm_range_valid(range);
-
-	hmm_range_unregister(range);
-	return ret;
-}
-
-/* This is a temporary helper to avoid merge conflict between trees. */
-static inline int hmm_vma_fault(struct hmm_mirror *mirror,
-				struct hmm_range *range, bool block)
-{
-	long ret;
-
-	/*
-	 * With the old API the driver must set each individual entries with
-	 * the requested flags (valid, write, ...). So here we set the mask to
-	 * keep intact the entries provided by the driver and zero out the
-	 * default_flags.
-	 */
-	range->default_flags = 0;
-	range->pfn_flags_mask = -1UL;
-
-	ret = hmm_range_register(range, mirror,
-				 range->start, range->end,
-				 PAGE_SHIFT);
-	if (ret)
-		return (int)ret;
-
-	if (!hmm_range_wait_until_valid(range, HMM_RANGE_DEFAULT_TIMEOUT)) {
-		/*
-		 * The mmap_sem was taken by driver we release it here and
-		 * returns -EAGAIN which correspond to mmap_sem have been
-		 * drop in the old API.
-		 */
-		up_read(&range->vma->vm_mm->mmap_sem);
-		return -EAGAIN;
-	}
-
-	ret = hmm_range_fault(range, block);
-	if (ret <= 0) {
-		if (ret == -EBUSY || !ret) {
-			/* Same as above, drop mmap_sem to match old API. */
-			up_read(&range->vma->vm_mm->mmap_sem);
-			ret = -EBUSY;
-		} else if (ret == -EAGAIN)
-			ret = -EBUSY;
-		hmm_range_unregister(range);
-		return ret;
-	}
-	return 0;
-}
-
 /* Below are for HMM internal use only! Not to be used by device driver! */
 static inline void hmm_mm_init(struct mm_struct *mm)
 {
-- 
cgit v1.2.3


From 7a32f2962c56d9d8a836b4469855caeee8766bd4 Mon Sep 17 00:00:00 2001
From: Edward Srouji <edwards@mellanox.com>
Date: Tue, 23 Jul 2019 10:12:55 +0300
Subject: net/mlx5: Fix modify_cq_in alignment

Fix modify_cq_in alignment to match the device specification.
After this fix the 'cq_umem_valid' field will be in the right offset.

Cc: <stable@vger.kernel.org> # 4.19
Fixes: bd37197554eb ("net/mlx5: Update mlx5_ifc with DEVX UID bits")
Signed-off-by: Edward Srouji <edwards@mellanox.com>
Reviewed-by: Yishai Hadas <yishaih@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 include/linux/mlx5/mlx5_ifc.h | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index b3d5752657d9..ec571fd7fcf8 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -5975,10 +5975,12 @@ struct mlx5_ifc_modify_cq_in_bits {
 
 	struct mlx5_ifc_cqc_bits cq_context;
 
-	u8         reserved_at_280[0x40];
+	u8         reserved_at_280[0x60];
 
 	u8         cq_umem_valid[0x1];
-	u8         reserved_at_2c1[0x5bf];
+	u8         reserved_at_2e1[0x1f];
+
+	u8         reserved_at_300[0x580];
 
 	u8         pas[0][0x40];
 };
-- 
cgit v1.2.3


From 90bb769291161cf25a818d69cf608c181654473e Mon Sep 17 00:00:00 2001
From: Ariel Levkovich <lariel@mellanox.com>
Date: Sat, 6 Jul 2019 18:06:15 +0300
Subject: net/mlx5e: Prevent encap flow counter update async to user query

This patch prevents a race between user invoked cached counters
query and a neighbor last usage updater.

The cached flow counter stats can be queried by calling
"mlx5_fc_query_cached" which provides the number of bytes and
packets that passed via this flow since the last time this counter
was queried.
It does so by subtracting the last saved stats from the current, cached
stats and then updating the last saved stats with the cached stats.
It also provides the lastuse value for that flow.

Since "mlx5e_tc_update_neigh_used_value" needs to retrieve the
last usage time of encapsulation flows, it calls the flow counter
query method periodically and async to user queries of the flow counter
using cls_flower.
This call is causing the driver to update the last reported bytes and
packets from the cache and therefore, future user queries of the flow
stats will return lower than expected number for bytes and packets
since the last saved stats in the driver was updated async to the last
saved stats in cls_flower.

This causes wrong stats presentation of encapsulation flows to user.

Since the neighbor usage updater only needs the lastuse stats from the
cached counter, the fix is to use a dedicated lastuse query call that
returns the lastuse value without synching between the cached stats and
the last saved stats.

Fixes: f6dfb4c3f216 ("net/mlx5e: Update neighbour 'used' state using HW flow rules counters")
Signed-off-by: Ariel Levkovich <lariel@mellanox.com>
Reviewed-by: Roi Dayan <roid@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/en_tc.c       | 4 ++--
 drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c | 5 +++++
 include/linux/mlx5/fs.h                               | 1 +
 3 files changed, 8 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
index cc096f6011d9..7ecfc53cf5f6 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
@@ -1230,13 +1230,13 @@ static struct mlx5_fc *mlx5e_tc_get_counter(struct mlx5e_tc_flow *flow)
 void mlx5e_tc_update_neigh_used_value(struct mlx5e_neigh_hash_entry *nhe)
 {
 	struct mlx5e_neigh *m_neigh = &nhe->m_neigh;
-	u64 bytes, packets, lastuse = 0;
 	struct mlx5e_tc_flow *flow;
 	struct mlx5e_encap_entry *e;
 	struct mlx5_fc *counter;
 	struct neigh_table *tbl;
 	bool neigh_used = false;
 	struct neighbour *n;
+	u64 lastuse;
 
 	if (m_neigh->family == AF_INET)
 		tbl = &arp_tbl;
@@ -1256,7 +1256,7 @@ void mlx5e_tc_update_neigh_used_value(struct mlx5e_neigh_hash_entry *nhe)
 					    encaps[efi->index]);
 			if (flow->flags & MLX5E_TC_FLOW_OFFLOADED) {
 				counter = mlx5e_tc_get_counter(flow);
-				mlx5_fc_query_cached(counter, &bytes, &packets, &lastuse);
+				lastuse = mlx5_fc_query_lastuse(counter);
 				if (time_after((unsigned long)lastuse, nhe->reported_lastuse)) {
 					neigh_used = true;
 					break;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c
index b3762123a69c..1834d9f3aa1c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c
@@ -369,6 +369,11 @@ int mlx5_fc_query(struct mlx5_core_dev *dev, struct mlx5_fc *counter,
 }
 EXPORT_SYMBOL(mlx5_fc_query);
 
+u64 mlx5_fc_query_lastuse(struct mlx5_fc *counter)
+{
+	return counter->cache.lastuse;
+}
+
 void mlx5_fc_query_cached(struct mlx5_fc *counter,
 			  u64 *bytes, u64 *packets, u64 *lastuse)
 {
diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h
index 04a569568eac..f049af3f3cd8 100644
--- a/include/linux/mlx5/fs.h
+++ b/include/linux/mlx5/fs.h
@@ -220,6 +220,7 @@ int mlx5_modify_rule_destination(struct mlx5_flow_handle *handler,
 
 struct mlx5_fc *mlx5_fc_create(struct mlx5_core_dev *dev, bool aging);
 void mlx5_fc_destroy(struct mlx5_core_dev *dev, struct mlx5_fc *counter);
+u64 mlx5_fc_query_lastuse(struct mlx5_fc *counter);
 void mlx5_fc_query_cached(struct mlx5_fc *counter,
 			  u64 *bytes, u64 *packets, u64 *lastuse);
 int mlx5_fc_query(struct mlx5_core_dev *dev, struct mlx5_fc *counter,
-- 
cgit v1.2.3


From a7cf3d24ee6081930feb4c830a7f6f16ebe31c49 Mon Sep 17 00:00:00 2001
From: Subash Abhinov Kasiviswanathan <subashab@codeaurora.org>
Date: Thu, 25 Jul 2019 12:07:12 -0600
Subject: net: qualcomm: rmnet: Fix incorrect UL checksum offload logic

The udp_ip4_ind bit is set only for IPv4 UDP non-fragmented packets
so that the hardware can flip the checksum to 0xFFFF if the computed
checksum is 0 per RFC768.

However, this bit had to be set for IPv6 UDP non-fragmented packets
as well, per hardware requirements. Otherwise, IPv6 UDP packets
with computed checksum as 0 were transmitted by hardware and were
dropped in the network.

In addition to setting this bit for IPv6 UDP, the field is also
appropriately renamed to udp_ind as part of this change.

Fixes: 5eb5f8608ef1 ("net: qualcomm: rmnet: Add support for TX checksum offload")
Cc: Sean Tranchetti <stranche@codeaurora.org>
Signed-off-by: Subash Abhinov Kasiviswanathan <subashab@codeaurora.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/qualcomm/rmnet/rmnet_map_data.c | 13 +++++++++----
 include/linux/if_rmnet.h                             |  4 ++--
 2 files changed, 11 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/qualcomm/rmnet/rmnet_map_data.c b/drivers/net/ethernet/qualcomm/rmnet/rmnet_map_data.c
index 60189923737a..21d38167f961 100644
--- a/drivers/net/ethernet/qualcomm/rmnet/rmnet_map_data.c
+++ b/drivers/net/ethernet/qualcomm/rmnet/rmnet_map_data.c
@@ -206,9 +206,9 @@ rmnet_map_ipv4_ul_csum_header(void *iphdr,
 	ul_header->csum_insert_offset = skb->csum_offset;
 	ul_header->csum_enabled = 1;
 	if (ip4h->protocol == IPPROTO_UDP)
-		ul_header->udp_ip4_ind = 1;
+		ul_header->udp_ind = 1;
 	else
-		ul_header->udp_ip4_ind = 0;
+		ul_header->udp_ind = 0;
 
 	/* Changing remaining fields to network order */
 	hdr++;
@@ -239,6 +239,7 @@ rmnet_map_ipv6_ul_csum_header(void *ip6hdr,
 			      struct rmnet_map_ul_csum_header *ul_header,
 			      struct sk_buff *skb)
 {
+	struct ipv6hdr *ip6h = (struct ipv6hdr *)ip6hdr;
 	__be16 *hdr = (__be16 *)ul_header, offset;
 
 	offset = htons((__force u16)(skb_transport_header(skb) -
@@ -246,7 +247,11 @@ rmnet_map_ipv6_ul_csum_header(void *ip6hdr,
 	ul_header->csum_start_offset = offset;
 	ul_header->csum_insert_offset = skb->csum_offset;
 	ul_header->csum_enabled = 1;
-	ul_header->udp_ip4_ind = 0;
+
+	if (ip6h->nexthdr == IPPROTO_UDP)
+		ul_header->udp_ind = 1;
+	else
+		ul_header->udp_ind = 0;
 
 	/* Changing remaining fields to network order */
 	hdr++;
@@ -419,7 +424,7 @@ sw_csum:
 	ul_header->csum_start_offset = 0;
 	ul_header->csum_insert_offset = 0;
 	ul_header->csum_enabled = 0;
-	ul_header->udp_ip4_ind = 0;
+	ul_header->udp_ind = 0;
 
 	priv->stats.csum_sw++;
 }
diff --git a/include/linux/if_rmnet.h b/include/linux/if_rmnet.h
index b4f5403383fc..9661416a9bb4 100644
--- a/include/linux/if_rmnet.h
+++ b/include/linux/if_rmnet.h
@@ -41,11 +41,11 @@ struct rmnet_map_ul_csum_header {
 	__be16 csum_start_offset;
 #if defined(__LITTLE_ENDIAN_BITFIELD)
 	u16 csum_insert_offset:14;
-	u16 udp_ip4_ind:1;
+	u16 udp_ind:1;
 	u16 csum_enabled:1;
 #elif defined (__BIG_ENDIAN_BITFIELD)
 	u16 csum_enabled:1;
-	u16 udp_ip4_ind:1;
+	u16 udp_ind:1;
 	u16 csum_insert_offset:14;
 #else
 #error	"Please fix <asm/byteorder.h>"
-- 
cgit v1.2.3


From f1765a1819ff3489db9500c6d464e682e6844a14 Mon Sep 17 00:00:00 2001
From: Thierry Reding <treding@nvidia.com>
Date: Fri, 26 Jul 2019 12:17:44 +0200
Subject: of: Fix typo in kerneldoc

"Findfrom" is not a word. Replace the function synopsis by something
that makes sense.

Signed-off-by: Thierry Reding <treding@nvidia.com>
Signed-off-by: Rob Herring <robh@kernel.org>
---
 include/linux/of.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/of.h b/include/linux/of.h
index 0cf857012f11..844f89e1b039 100644
--- a/include/linux/of.h
+++ b/include/linux/of.h
@@ -1164,7 +1164,7 @@ static inline int of_property_read_string_index(const struct device_node *np,
 }
 
 /**
- * of_property_read_bool - Findfrom a property
+ * of_property_read_bool - Find a property
  * @np:		device node from which the property value is to be read.
  * @propname:	name of the property to be searched.
  *
-- 
cgit v1.2.3


From ffe0bbabb0cffceceae07484fde1ec2a63b1537c Mon Sep 17 00:00:00 2001
From: Bartosz Golaszewski <bgolaszewski@baylibre.com>
Date: Mon, 8 Jul 2019 10:23:43 +0200
Subject: gpio: don't WARN() on NULL descs if gpiolib is disabled

If gpiolib is disabled, we use the inline stubs from gpio/consumer.h
instead of regular definitions of GPIO API. The stubs for 'optional'
variants of gpiod_get routines return NULL in this case as if the
relevant GPIO wasn't found. This is correct so far.

Calling other (non-gpio_get) stubs from this header triggers a warning
because the GPIO descriptor couldn't have been requested. The warning
however is unconditional (WARN_ON(1)) and is emitted even if the passed
descriptor pointer is NULL.

We don't want to force the users of 'optional' gpio_get to check the
returned pointer before calling e.g. gpiod_set_value() so let's only
WARN on non-NULL descriptors.

Cc: stable@vger.kernel.org
Reported-by: Claus H. Stovgaard <cst@phaseone.com>
Signed-off-by: Bartosz Golaszewski <bgolaszewski@baylibre.com>
---
 include/linux/gpio/consumer.h | 64 +++++++++++++++++++++----------------------
 1 file changed, 32 insertions(+), 32 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/gpio/consumer.h b/include/linux/gpio/consumer.h
index 9ddcf50a3c59..a7f08fb0f865 100644
--- a/include/linux/gpio/consumer.h
+++ b/include/linux/gpio/consumer.h
@@ -247,7 +247,7 @@ static inline void gpiod_put(struct gpio_desc *desc)
 	might_sleep();
 
 	/* GPIO can never have been requested */
-	WARN_ON(1);
+	WARN_ON(desc);
 }
 
 static inline void devm_gpiod_unhinge(struct device *dev,
@@ -256,7 +256,7 @@ static inline void devm_gpiod_unhinge(struct device *dev,
 	might_sleep();
 
 	/* GPIO can never have been requested */
-	WARN_ON(1);
+	WARN_ON(desc);
 }
 
 static inline void gpiod_put_array(struct gpio_descs *descs)
@@ -264,7 +264,7 @@ static inline void gpiod_put_array(struct gpio_descs *descs)
 	might_sleep();
 
 	/* GPIO can never have been requested */
-	WARN_ON(1);
+	WARN_ON(descs);
 }
 
 static inline struct gpio_desc *__must_check
@@ -317,7 +317,7 @@ static inline void devm_gpiod_put(struct device *dev, struct gpio_desc *desc)
 	might_sleep();
 
 	/* GPIO can never have been requested */
-	WARN_ON(1);
+	WARN_ON(desc);
 }
 
 static inline void devm_gpiod_put_array(struct device *dev,
@@ -326,32 +326,32 @@ static inline void devm_gpiod_put_array(struct device *dev,
 	might_sleep();
 
 	/* GPIO can never have been requested */
-	WARN_ON(1);
+	WARN_ON(descs);
 }
 
 
 static inline int gpiod_get_direction(const struct gpio_desc *desc)
 {
 	/* GPIO can never have been requested */
-	WARN_ON(1);
+	WARN_ON(desc);
 	return -ENOSYS;
 }
 static inline int gpiod_direction_input(struct gpio_desc *desc)
 {
 	/* GPIO can never have been requested */
-	WARN_ON(1);
+	WARN_ON(desc);
 	return -ENOSYS;
 }
 static inline int gpiod_direction_output(struct gpio_desc *desc, int value)
 {
 	/* GPIO can never have been requested */
-	WARN_ON(1);
+	WARN_ON(desc);
 	return -ENOSYS;
 }
 static inline int gpiod_direction_output_raw(struct gpio_desc *desc, int value)
 {
 	/* GPIO can never have been requested */
-	WARN_ON(1);
+	WARN_ON(desc);
 	return -ENOSYS;
 }
 
@@ -359,7 +359,7 @@ static inline int gpiod_direction_output_raw(struct gpio_desc *desc, int value)
 static inline int gpiod_get_value(const struct gpio_desc *desc)
 {
 	/* GPIO can never have been requested */
-	WARN_ON(1);
+	WARN_ON(desc);
 	return 0;
 }
 static inline int gpiod_get_array_value(unsigned int array_size,
@@ -368,13 +368,13 @@ static inline int gpiod_get_array_value(unsigned int array_size,
 					unsigned long *value_bitmap)
 {
 	/* GPIO can never have been requested */
-	WARN_ON(1);
+	WARN_ON(desc_array);
 	return 0;
 }
 static inline void gpiod_set_value(struct gpio_desc *desc, int value)
 {
 	/* GPIO can never have been requested */
-	WARN_ON(1);
+	WARN_ON(desc);
 }
 static inline int gpiod_set_array_value(unsigned int array_size,
 					struct gpio_desc **desc_array,
@@ -382,13 +382,13 @@ static inline int gpiod_set_array_value(unsigned int array_size,
 					unsigned long *value_bitmap)
 {
 	/* GPIO can never have been requested */
-	WARN_ON(1);
+	WARN_ON(desc_array);
 	return 0;
 }
 static inline int gpiod_get_raw_value(const struct gpio_desc *desc)
 {
 	/* GPIO can never have been requested */
-	WARN_ON(1);
+	WARN_ON(desc);
 	return 0;
 }
 static inline int gpiod_get_raw_array_value(unsigned int array_size,
@@ -397,13 +397,13 @@ static inline int gpiod_get_raw_array_value(unsigned int array_size,
 					    unsigned long *value_bitmap)
 {
 	/* GPIO can never have been requested */
-	WARN_ON(1);
+	WARN_ON(desc_array);
 	return 0;
 }
 static inline void gpiod_set_raw_value(struct gpio_desc *desc, int value)
 {
 	/* GPIO can never have been requested */
-	WARN_ON(1);
+	WARN_ON(desc);
 }
 static inline int gpiod_set_raw_array_value(unsigned int array_size,
 					    struct gpio_desc **desc_array,
@@ -411,14 +411,14 @@ static inline int gpiod_set_raw_array_value(unsigned int array_size,
 					    unsigned long *value_bitmap)
 {
 	/* GPIO can never have been requested */
-	WARN_ON(1);
+	WARN_ON(desc_array);
 	return 0;
 }
 
 static inline int gpiod_get_value_cansleep(const struct gpio_desc *desc)
 {
 	/* GPIO can never have been requested */
-	WARN_ON(1);
+	WARN_ON(desc);
 	return 0;
 }
 static inline int gpiod_get_array_value_cansleep(unsigned int array_size,
@@ -427,13 +427,13 @@ static inline int gpiod_get_array_value_cansleep(unsigned int array_size,
 				     unsigned long *value_bitmap)
 {
 	/* GPIO can never have been requested */
-	WARN_ON(1);
+	WARN_ON(desc_array);
 	return 0;
 }
 static inline void gpiod_set_value_cansleep(struct gpio_desc *desc, int value)
 {
 	/* GPIO can never have been requested */
-	WARN_ON(1);
+	WARN_ON(desc);
 }
 static inline int gpiod_set_array_value_cansleep(unsigned int array_size,
 					    struct gpio_desc **desc_array,
@@ -441,13 +441,13 @@ static inline int gpiod_set_array_value_cansleep(unsigned int array_size,
 					    unsigned long *value_bitmap)
 {
 	/* GPIO can never have been requested */
-	WARN_ON(1);
+	WARN_ON(desc_array);
 	return 0;
 }
 static inline int gpiod_get_raw_value_cansleep(const struct gpio_desc *desc)
 {
 	/* GPIO can never have been requested */
-	WARN_ON(1);
+	WARN_ON(desc);
 	return 0;
 }
 static inline int gpiod_get_raw_array_value_cansleep(unsigned int array_size,
@@ -456,14 +456,14 @@ static inline int gpiod_get_raw_array_value_cansleep(unsigned int array_size,
 					       unsigned long *value_bitmap)
 {
 	/* GPIO can never have been requested */
-	WARN_ON(1);
+	WARN_ON(desc_array);
 	return 0;
 }
 static inline void gpiod_set_raw_value_cansleep(struct gpio_desc *desc,
 						int value)
 {
 	/* GPIO can never have been requested */
-	WARN_ON(1);
+	WARN_ON(desc);
 }
 static inline int gpiod_set_raw_array_value_cansleep(unsigned int array_size,
 						struct gpio_desc **desc_array,
@@ -471,41 +471,41 @@ static inline int gpiod_set_raw_array_value_cansleep(unsigned int array_size,
 						unsigned long *value_bitmap)
 {
 	/* GPIO can never have been requested */
-	WARN_ON(1);
+	WARN_ON(desc_array);
 	return 0;
 }
 
 static inline int gpiod_set_debounce(struct gpio_desc *desc, unsigned debounce)
 {
 	/* GPIO can never have been requested */
-	WARN_ON(1);
+	WARN_ON(desc);
 	return -ENOSYS;
 }
 
 static inline int gpiod_set_transitory(struct gpio_desc *desc, bool transitory)
 {
 	/* GPIO can never have been requested */
-	WARN_ON(1);
+	WARN_ON(desc);
 	return -ENOSYS;
 }
 
 static inline int gpiod_is_active_low(const struct gpio_desc *desc)
 {
 	/* GPIO can never have been requested */
-	WARN_ON(1);
+	WARN_ON(desc);
 	return 0;
 }
 static inline int gpiod_cansleep(const struct gpio_desc *desc)
 {
 	/* GPIO can never have been requested */
-	WARN_ON(1);
+	WARN_ON(desc);
 	return 0;
 }
 
 static inline int gpiod_to_irq(const struct gpio_desc *desc)
 {
 	/* GPIO can never have been requested */
-	WARN_ON(1);
+	WARN_ON(desc);
 	return -EINVAL;
 }
 
@@ -513,7 +513,7 @@ static inline int gpiod_set_consumer_name(struct gpio_desc *desc,
 					  const char *name)
 {
 	/* GPIO can never have been requested */
-	WARN_ON(1);
+	WARN_ON(desc);
 	return -EINVAL;
 }
 
@@ -525,7 +525,7 @@ static inline struct gpio_desc *gpio_to_desc(unsigned gpio)
 static inline int desc_to_gpio(const struct gpio_desc *desc)
 {
 	/* GPIO can never have been requested */
-	WARN_ON(1);
+	WARN_ON(desc);
 	return -EINVAL;
 }
 
-- 
cgit v1.2.3


From 89e524c04fa966330e2e80ab2bc50b9944c5847a Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Tue, 30 Jul 2019 13:10:14 +0200
Subject: loop: Fix mount(2) failure due to race with LOOP_SET_FD

Commit 33ec3e53e7b1 ("loop: Don't change loop device under exclusive
opener") made LOOP_SET_FD ioctl acquire exclusive block device reference
while it updates loop device binding. However this can make perfectly
valid mount(2) fail with EBUSY due to racing LOOP_SET_FD holding
temporarily the exclusive bdev reference in cases like this:

for i in {a..z}{a..z}; do
        dd if=/dev/zero of=$i.image bs=1k count=0 seek=1024
        mkfs.ext2 $i.image
        mkdir mnt$i
done

echo "Run"
for i in {a..z}{a..z}; do
        mount -o loop -t ext2 $i.image mnt$i &
done

Fix the problem by not getting a full exclusive bdev reference in
LOOP_SET_FD, but instead just marking the bdev as being claimed while we
update the binding information. This just blocks new exclusive openers
instead of failing them with EBUSY, thus fixing the problem.

Fixes: 33ec3e53e7b1 ("loop: Don't change loop device under exclusive opener")
Cc: stable@vger.kernel.org
Tested-by: Kai-Heng Feng <kai.heng.feng@canonical.com>
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/block/loop.c | 16 +++++-----
 fs/block_dev.c       | 83 ++++++++++++++++++++++++++++++++++++----------------
 include/linux/fs.h   |  6 ++++
 3 files changed, 73 insertions(+), 32 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 44c9985f352a..3036883fc9f8 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -924,6 +924,7 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode,
 	struct file	*file;
 	struct inode	*inode;
 	struct address_space *mapping;
+	struct block_device *claimed_bdev = NULL;
 	int		lo_flags = 0;
 	int		error;
 	loff_t		size;
@@ -942,10 +943,11 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode,
 	 * here to avoid changing device under exclusive owner.
 	 */
 	if (!(mode & FMODE_EXCL)) {
-		bdgrab(bdev);
-		error = blkdev_get(bdev, mode | FMODE_EXCL, loop_set_fd);
-		if (error)
+		claimed_bdev = bd_start_claiming(bdev, loop_set_fd);
+		if (IS_ERR(claimed_bdev)) {
+			error = PTR_ERR(claimed_bdev);
 			goto out_putf;
+		}
 	}
 
 	error = mutex_lock_killable(&loop_ctl_mutex);
@@ -1015,15 +1017,15 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode,
 	mutex_unlock(&loop_ctl_mutex);
 	if (partscan)
 		loop_reread_partitions(lo, bdev);
-	if (!(mode & FMODE_EXCL))
-		blkdev_put(bdev, mode | FMODE_EXCL);
+	if (claimed_bdev)
+		bd_abort_claiming(bdev, claimed_bdev, loop_set_fd);
 	return 0;
 
 out_unlock:
 	mutex_unlock(&loop_ctl_mutex);
 out_bdev:
-	if (!(mode & FMODE_EXCL))
-		blkdev_put(bdev, mode | FMODE_EXCL);
+	if (claimed_bdev)
+		bd_abort_claiming(bdev, claimed_bdev, loop_set_fd);
 out_putf:
 	fput(file);
 out:
diff --git a/fs/block_dev.c b/fs/block_dev.c
index c2a85b587922..22591bad9353 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -1181,8 +1181,7 @@ static struct gendisk *bdev_get_gendisk(struct block_device *bdev, int *partno)
  * Pointer to the block device containing @bdev on success, ERR_PTR()
  * value on failure.
  */
-static struct block_device *bd_start_claiming(struct block_device *bdev,
-					      void *holder)
+struct block_device *bd_start_claiming(struct block_device *bdev, void *holder)
 {
 	struct gendisk *disk;
 	struct block_device *whole;
@@ -1229,6 +1228,62 @@ static struct block_device *bd_start_claiming(struct block_device *bdev,
 		return ERR_PTR(err);
 	}
 }
+EXPORT_SYMBOL(bd_start_claiming);
+
+static void bd_clear_claiming(struct block_device *whole, void *holder)
+{
+	lockdep_assert_held(&bdev_lock);
+	/* tell others that we're done */
+	BUG_ON(whole->bd_claiming != holder);
+	whole->bd_claiming = NULL;
+	wake_up_bit(&whole->bd_claiming, 0);
+}
+
+/**
+ * bd_finish_claiming - finish claiming of a block device
+ * @bdev: block device of interest
+ * @whole: whole block device (returned from bd_start_claiming())
+ * @holder: holder that has claimed @bdev
+ *
+ * Finish exclusive open of a block device. Mark the device as exclusively
+ * open by the holder and wake up all waiters for exclusive open to finish.
+ */
+void bd_finish_claiming(struct block_device *bdev, struct block_device *whole,
+			void *holder)
+{
+	spin_lock(&bdev_lock);
+	BUG_ON(!bd_may_claim(bdev, whole, holder));
+	/*
+	 * Note that for a whole device bd_holders will be incremented twice,
+	 * and bd_holder will be set to bd_may_claim before being set to holder
+	 */
+	whole->bd_holders++;
+	whole->bd_holder = bd_may_claim;
+	bdev->bd_holders++;
+	bdev->bd_holder = holder;
+	bd_clear_claiming(whole, holder);
+	spin_unlock(&bdev_lock);
+}
+EXPORT_SYMBOL(bd_finish_claiming);
+
+/**
+ * bd_abort_claiming - abort claiming of a block device
+ * @bdev: block device of interest
+ * @whole: whole block device (returned from bd_start_claiming())
+ * @holder: holder that has claimed @bdev
+ *
+ * Abort claiming of a block device when the exclusive open failed. This can be
+ * also used when exclusive open is not actually desired and we just needed
+ * to block other exclusive openers for a while.
+ */
+void bd_abort_claiming(struct block_device *bdev, struct block_device *whole,
+		       void *holder)
+{
+	spin_lock(&bdev_lock);
+	bd_clear_claiming(whole, holder);
+	spin_unlock(&bdev_lock);
+}
+EXPORT_SYMBOL(bd_abort_claiming);
 
 #ifdef CONFIG_SYSFS
 struct bd_holder_disk {
@@ -1698,29 +1753,7 @@ int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder)
 
 		/* finish claiming */
 		mutex_lock(&bdev->bd_mutex);
-		spin_lock(&bdev_lock);
-
-		if (!res) {
-			BUG_ON(!bd_may_claim(bdev, whole, holder));
-			/*
-			 * Note that for a whole device bd_holders
-			 * will be incremented twice, and bd_holder
-			 * will be set to bd_may_claim before being
-			 * set to holder
-			 */
-			whole->bd_holders++;
-			whole->bd_holder = bd_may_claim;
-			bdev->bd_holders++;
-			bdev->bd_holder = holder;
-		}
-
-		/* tell others that we're done */
-		BUG_ON(whole->bd_claiming != holder);
-		whole->bd_claiming = NULL;
-		wake_up_bit(&whole->bd_claiming, 0);
-
-		spin_unlock(&bdev_lock);
-
+		bd_finish_claiming(bdev, whole, holder);
 		/*
 		 * Block event polling for write claims if requested.  Any
 		 * write holder makes the write_holder state stick until
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 56b8e358af5c..997a530ff4e9 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2598,6 +2598,12 @@ extern struct block_device *blkdev_get_by_path(const char *path, fmode_t mode,
 					       void *holder);
 extern struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode,
 					      void *holder);
+extern struct block_device *bd_start_claiming(struct block_device *bdev,
+					      void *holder);
+extern void bd_finish_claiming(struct block_device *bdev,
+			       struct block_device *whole, void *holder);
+extern void bd_abort_claiming(struct block_device *bdev,
+			      struct block_device *whole, void *holder);
 extern void blkdev_put(struct block_device *bdev, fmode_t mode);
 extern int __blkdev_reread_part(struct block_device *bdev);
 extern int blkdev_reread_part(struct block_device *bdev);
-- 
cgit v1.2.3


From 055d88242a6046a1ceac3167290f054c72571cd9 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Tue, 30 Jul 2019 21:25:20 +0200
Subject: compat_ioctl: pppoe: fix PPPOEIOCSFWD handling

Support for handling the PPPOEIOCSFWD ioctl in compat mode was added in
linux-2.5.69 along with hundreds of other commands, but was always broken
since only the structure is compatible, but the command number is not,
due to the size being sizeof(size_t), or at first sizeof(sizeof(struct
sockaddr_pppox)), which is different on 64-bit architectures.

Guillaume Nault adds:

  And the implementation was broken until 2016 (see 29e73269aa4d ("pppoe:
  fix reference counting in PPPoE proxy")), and nobody ever noticed. I
  should probably have removed this ioctl entirely instead of fixing it.
  Clearly, it has never been used.

Fix it by adding a compat_ioctl handler for all pppoe variants that
translates the command number and then calls the regular ioctl function.

All other ioctl commands handled by pppoe are compatible between 32-bit
and 64-bit, and require compat_ptr() conversion.

This should apply to all stable kernels.

Acked-by: Guillaume Nault <g.nault@alphalink.fr>
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ppp/pppoe.c  |  3 +++
 drivers/net/ppp/pppox.c  | 13 +++++++++++++
 drivers/net/ppp/pptp.c   |  3 +++
 fs/compat_ioctl.c        |  3 ---
 include/linux/if_pppox.h |  3 +++
 net/l2tp/l2tp_ppp.c      |  3 +++
 6 files changed, 25 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ppp/pppoe.c b/drivers/net/ppp/pppoe.c
index 1d902ecb4aa8..a44dd3c8af63 100644
--- a/drivers/net/ppp/pppoe.c
+++ b/drivers/net/ppp/pppoe.c
@@ -1115,6 +1115,9 @@ static const struct proto_ops pppoe_ops = {
 	.recvmsg	= pppoe_recvmsg,
 	.mmap		= sock_no_mmap,
 	.ioctl		= pppox_ioctl,
+#ifdef CONFIG_COMPAT
+	.compat_ioctl	= pppox_compat_ioctl,
+#endif
 };
 
 static const struct pppox_proto pppoe_proto = {
diff --git a/drivers/net/ppp/pppox.c b/drivers/net/ppp/pppox.c
index 5ef422a43d70..08364f10a43f 100644
--- a/drivers/net/ppp/pppox.c
+++ b/drivers/net/ppp/pppox.c
@@ -17,6 +17,7 @@
 #include <linux/string.h>
 #include <linux/module.h>
 #include <linux/kernel.h>
+#include <linux/compat.h>
 #include <linux/errno.h>
 #include <linux/netdevice.h>
 #include <linux/net.h>
@@ -98,6 +99,18 @@ int pppox_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
 
 EXPORT_SYMBOL(pppox_ioctl);
 
+#ifdef CONFIG_COMPAT
+int pppox_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
+{
+	if (cmd == PPPOEIOCSFWD32)
+		cmd = PPPOEIOCSFWD;
+
+	return pppox_ioctl(sock, cmd, (unsigned long)compat_ptr(arg));
+}
+
+EXPORT_SYMBOL(pppox_compat_ioctl);
+#endif
+
 static int pppox_create(struct net *net, struct socket *sock, int protocol,
 			int kern)
 {
diff --git a/drivers/net/ppp/pptp.c b/drivers/net/ppp/pptp.c
index a8e52c8e4128..734de7de03f7 100644
--- a/drivers/net/ppp/pptp.c
+++ b/drivers/net/ppp/pptp.c
@@ -623,6 +623,9 @@ static const struct proto_ops pptp_ops = {
 	.recvmsg    = sock_no_recvmsg,
 	.mmap       = sock_no_mmap,
 	.ioctl      = pppox_ioctl,
+#ifdef CONFIG_COMPAT
+	.compat_ioctl = pppox_compat_ioctl,
+#endif
 };
 
 static const struct pppox_proto pppox_pptp_proto = {
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index 6e30949d9f77..a7ec2d3dff92 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -638,9 +638,6 @@ COMPATIBLE_IOCTL(PPPIOCDISCONN)
 COMPATIBLE_IOCTL(PPPIOCATTCHAN)
 COMPATIBLE_IOCTL(PPPIOCGCHAN)
 COMPATIBLE_IOCTL(PPPIOCGL2TPSTATS)
-/* PPPOX */
-COMPATIBLE_IOCTL(PPPOEIOCSFWD)
-COMPATIBLE_IOCTL(PPPOEIOCDFWD)
 /* Big A */
 /* sparc only */
 /* Big Q for sound/OSS */
diff --git a/include/linux/if_pppox.h b/include/linux/if_pppox.h
index 8b728750a625..69e813bcb947 100644
--- a/include/linux/if_pppox.h
+++ b/include/linux/if_pppox.h
@@ -80,6 +80,9 @@ extern int register_pppox_proto(int proto_num, const struct pppox_proto *pp);
 extern void unregister_pppox_proto(int proto_num);
 extern void pppox_unbind_sock(struct sock *sk);/* delete ppp-channel binding */
 extern int pppox_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg);
+extern int pppox_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg);
+
+#define PPPOEIOCSFWD32    _IOW(0xB1 ,0, compat_size_t)
 
 /* PPPoX socket states */
 enum {
diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c
index 1d0e5904dedf..c54cb59593ef 100644
--- a/net/l2tp/l2tp_ppp.c
+++ b/net/l2tp/l2tp_ppp.c
@@ -1681,6 +1681,9 @@ static const struct proto_ops pppol2tp_ops = {
 	.recvmsg	= pppol2tp_recvmsg,
 	.mmap		= sock_no_mmap,
 	.ioctl		= pppox_ioctl,
+#ifdef CONFIG_COMPAT
+	.compat_ioctl = pppox_compat_ioctl,
+#endif
 };
 
 static const struct pppox_proto pppol2tp_proto = {
-- 
cgit v1.2.3


From b877ac9815a8fe7e5f6d7fdde3dc34652408840a Mon Sep 17 00:00:00 2001
From: Juergen Gross <jgross@suse.com>
Date: Fri, 14 Jun 2019 07:46:04 +0200
Subject: xen/swiotlb: remember having called xen_create_contiguous_region()

Instead of always calling xen_destroy_contiguous_region() in case the
memory is DMA-able for the used device, do so only in case it has been
made DMA-able via xen_create_contiguous_region() before.

This will avoid a lot of xen_destroy_contiguous_region() calls for
64-bit capable devices.

As the memory in question is owned by swiotlb-xen the PG_owner_priv_1
flag of the first allocated page can be used for remembering.

Signed-off-by: Juergen Gross <jgross@suse.com>
Acked-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Signed-off-by: Juergen Gross <jgross@suse.com>
---
 drivers/xen/swiotlb-xen.c  | 4 +++-
 include/linux/page-flags.h | 4 ++++
 2 files changed, 7 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/xen/swiotlb-xen.c b/drivers/xen/swiotlb-xen.c
index 37ddcfcfbb21..ceb681cf64bb 100644
--- a/drivers/xen/swiotlb-xen.c
+++ b/drivers/xen/swiotlb-xen.c
@@ -322,6 +322,7 @@ xen_swiotlb_alloc_coherent(struct device *hwdev, size_t size,
 			xen_free_coherent_pages(hwdev, size, ret, (dma_addr_t)phys, attrs);
 			return NULL;
 		}
+		SetPageXenRemapped(virt_to_page(ret));
 	}
 	memset(ret, 0, size);
 	return ret;
@@ -346,7 +347,8 @@ xen_swiotlb_free_coherent(struct device *hwdev, size_t size, void *vaddr,
 	size = 1UL << (order + XEN_PAGE_SHIFT);
 
 	if (!WARN_ON((dev_addr + size - 1 > dma_mask) ||
-		     range_straddles_page_boundary(phys, size)))
+		     range_straddles_page_boundary(phys, size)) &&
+	    TestClearPageXenRemapped(virt_to_page(vaddr)))
 		xen_destroy_contiguous_region(phys, order);
 
 	xen_free_coherent_pages(hwdev, size, vaddr, (dma_addr_t)phys, attrs);
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index b848517da64c..f91cb8898ff0 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -152,6 +152,8 @@ enum pageflags {
 	PG_savepinned = PG_dirty,
 	/* Has a grant mapping of another (foreign) domain's page. */
 	PG_foreign = PG_owner_priv_1,
+	/* Remapped by swiotlb-xen. */
+	PG_xen_remapped = PG_owner_priv_1,
 
 	/* SLOB */
 	PG_slob_free = PG_private,
@@ -329,6 +331,8 @@ PAGEFLAG(Pinned, pinned, PF_NO_COMPOUND)
 	TESTSCFLAG(Pinned, pinned, PF_NO_COMPOUND)
 PAGEFLAG(SavePinned, savepinned, PF_NO_COMPOUND);
 PAGEFLAG(Foreign, foreign, PF_NO_COMPOUND);
+PAGEFLAG(XenRemapped, xen_remapped, PF_NO_COMPOUND)
+	TESTCLEARFLAG(XenRemapped, xen_remapped, PF_NO_COMPOUND)
 
 PAGEFLAG(Reserved, reserved, PF_NO_COMPOUND)
 	__CLEARPAGEFLAG(Reserved, reserved, PF_NO_COMPOUND)
-- 
cgit v1.2.3


From 9f00baf74e4b6f79a3a3dfab44fb7bb2e797b551 Mon Sep 17 00:00:00 2001
From: Gary R Hook <gary.hook@amd.com>
Date: Tue, 30 Jul 2019 16:05:24 +0000
Subject: crypto: ccp - Add support for valid authsize values less than 16

AES GCM encryption allows for authsize values of 4, 8, and 12-16 bytes.
Validate the requested authsize, and retain it to save in the request
context.

Fixes: 36cf515b9bbe2 ("crypto: ccp - Enable support for AES GCM on v5 CCPs")
Cc: <stable@vger.kernel.org>
Signed-off-by: Gary R Hook <gary.hook@amd.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 drivers/crypto/ccp/ccp-crypto-aes-galois.c | 14 ++++++++++++++
 drivers/crypto/ccp/ccp-ops.c               | 26 +++++++++++++++++++++-----
 include/linux/ccp.h                        |  2 ++
 3 files changed, 37 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/crypto/ccp/ccp-crypto-aes-galois.c b/drivers/crypto/ccp/ccp-crypto-aes-galois.c
index d22631cb2bb3..02eba84028b3 100644
--- a/drivers/crypto/ccp/ccp-crypto-aes-galois.c
+++ b/drivers/crypto/ccp/ccp-crypto-aes-galois.c
@@ -58,6 +58,19 @@ static int ccp_aes_gcm_setkey(struct crypto_aead *tfm, const u8 *key,
 static int ccp_aes_gcm_setauthsize(struct crypto_aead *tfm,
 				   unsigned int authsize)
 {
+	switch (authsize) {
+	case 16:
+	case 15:
+	case 14:
+	case 13:
+	case 12:
+	case 8:
+	case 4:
+		break;
+	default:
+		return -EINVAL;
+	}
+
 	return 0;
 }
 
@@ -104,6 +117,7 @@ static int ccp_aes_gcm_crypt(struct aead_request *req, bool encrypt)
 	memset(&rctx->cmd, 0, sizeof(rctx->cmd));
 	INIT_LIST_HEAD(&rctx->cmd.entry);
 	rctx->cmd.engine = CCP_ENGINE_AES;
+	rctx->cmd.u.aes.authsize = crypto_aead_authsize(tfm);
 	rctx->cmd.u.aes.type = ctx->u.aes.type;
 	rctx->cmd.u.aes.mode = ctx->u.aes.mode;
 	rctx->cmd.u.aes.action = encrypt;
diff --git a/drivers/crypto/ccp/ccp-ops.c b/drivers/crypto/ccp/ccp-ops.c
index 59f9849c3662..ef723e2722a8 100644
--- a/drivers/crypto/ccp/ccp-ops.c
+++ b/drivers/crypto/ccp/ccp-ops.c
@@ -622,6 +622,7 @@ static int ccp_run_aes_gcm_cmd(struct ccp_cmd_queue *cmd_q,
 
 	unsigned long long *final;
 	unsigned int dm_offset;
+	unsigned int authsize;
 	unsigned int jobid;
 	unsigned int ilen;
 	bool in_place = true; /* Default value */
@@ -643,6 +644,21 @@ static int ccp_run_aes_gcm_cmd(struct ccp_cmd_queue *cmd_q,
 	if (!aes->key) /* Gotta have a key SGL */
 		return -EINVAL;
 
+	/* Zero defaults to 16 bytes, the maximum size */
+	authsize = aes->authsize ? aes->authsize : AES_BLOCK_SIZE;
+	switch (authsize) {
+	case 16:
+	case 15:
+	case 14:
+	case 13:
+	case 12:
+	case 8:
+	case 4:
+		break;
+	default:
+		return -EINVAL;
+	}
+
 	/* First, decompose the source buffer into AAD & PT,
 	 * and the destination buffer into AAD, CT & tag, or
 	 * the input into CT & tag.
@@ -657,7 +673,7 @@ static int ccp_run_aes_gcm_cmd(struct ccp_cmd_queue *cmd_q,
 		p_tag = scatterwalk_ffwd(sg_tag, p_outp, ilen);
 	} else {
 		/* Input length for decryption includes tag */
-		ilen = aes->src_len - AES_BLOCK_SIZE;
+		ilen = aes->src_len - authsize;
 		p_tag = scatterwalk_ffwd(sg_tag, p_inp, ilen);
 	}
 
@@ -839,19 +855,19 @@ static int ccp_run_aes_gcm_cmd(struct ccp_cmd_queue *cmd_q,
 
 	if (aes->action == CCP_AES_ACTION_ENCRYPT) {
 		/* Put the ciphered tag after the ciphertext. */
-		ccp_get_dm_area(&final_wa, 0, p_tag, 0, AES_BLOCK_SIZE);
+		ccp_get_dm_area(&final_wa, 0, p_tag, 0, authsize);
 	} else {
 		/* Does this ciphered tag match the input? */
-		ret = ccp_init_dm_workarea(&tag, cmd_q, AES_BLOCK_SIZE,
+		ret = ccp_init_dm_workarea(&tag, cmd_q, authsize,
 					   DMA_BIDIRECTIONAL);
 		if (ret)
 			goto e_tag;
-		ret = ccp_set_dm_area(&tag, 0, p_tag, 0, AES_BLOCK_SIZE);
+		ret = ccp_set_dm_area(&tag, 0, p_tag, 0, authsize);
 		if (ret)
 			goto e_tag;
 
 		ret = crypto_memneq(tag.address, final_wa.address,
-				    AES_BLOCK_SIZE) ? -EBADMSG : 0;
+				    authsize) ? -EBADMSG : 0;
 		ccp_dm_free(&tag);
 	}
 
diff --git a/include/linux/ccp.h b/include/linux/ccp.h
index 7e9c991c95e0..43ed9e77cf81 100644
--- a/include/linux/ccp.h
+++ b/include/linux/ccp.h
@@ -173,6 +173,8 @@ struct ccp_aes_engine {
 	enum ccp_aes_mode mode;
 	enum ccp_aes_action action;
 
+	u32 authsize;
+
 	struct scatterlist *key;
 	u32 key_len;		/* In bytes */
 
-- 
cgit v1.2.3


From 23b6904442d08b7dbed7622ed33b236d41a3aa8b Mon Sep 17 00:00:00 2001
From: Dmitry Torokhov <dmitry.torokhov@gmail.com>
Date: Wed, 31 Jul 2019 14:43:40 +0200
Subject: driver core: add dev_groups to all drivers

Add the ability for the driver core to create and remove a list of
attribute groups automatically when the device is bound/unbound from a
specific driver.

Signed-off-by: Dmitry Torokhov <dmitry.torokhov@gmail.com>
Tested-by: Richard Gong <richard.gong@linux.intel.com>
Link: https://lore.kernel.org/r/20190731124349.4474-2-gregkh@linuxfoundation.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/base/dd.c      | 14 ++++++++++++++
 include/linux/device.h |  3 +++
 2 files changed, 17 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/base/dd.c b/drivers/base/dd.c
index 994a90747420..d811e60610d3 100644
--- a/drivers/base/dd.c
+++ b/drivers/base/dd.c
@@ -554,9 +554,16 @@ re_probe:
 			goto probe_failed;
 	}
 
+	if (device_add_groups(dev, drv->dev_groups)) {
+		dev_err(dev, "device_add_groups() failed\n");
+		goto dev_groups_failed;
+	}
+
 	if (test_remove) {
 		test_remove = false;
 
+		device_remove_groups(dev, drv->dev_groups);
+
 		if (dev->bus->remove)
 			dev->bus->remove(dev);
 		else if (drv->remove)
@@ -584,6 +591,11 @@ re_probe:
 		 drv->bus->name, __func__, dev_name(dev), drv->name);
 	goto done;
 
+dev_groups_failed:
+	if (dev->bus->remove)
+		dev->bus->remove(dev);
+	else if (drv->remove)
+		drv->remove(dev);
 probe_failed:
 	if (dev->bus)
 		blocking_notifier_call_chain(&dev->bus->p->bus_notifier,
@@ -1114,6 +1126,8 @@ static void __device_release_driver(struct device *dev, struct device *parent)
 
 		pm_runtime_put_sync(dev);
 
+		device_remove_groups(dev, drv->dev_groups);
+
 		if (dev->bus && dev->bus->remove)
 			dev->bus->remove(dev);
 		else if (drv->remove)
diff --git a/include/linux/device.h b/include/linux/device.h
index c330b75c6c57..98c00b71b598 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -262,6 +262,8 @@ enum probe_type {
  * @resume:	Called to bring a device from sleep mode.
  * @groups:	Default attributes that get created by the driver core
  *		automatically.
+ * @dev_groups:	Additional attributes attached to device instance once
+ *		it is bound to the driver.
  * @pm:		Power management operations of the device which matched
  *		this driver.
  * @coredump:	Called when sysfs entry is written to. The device driver
@@ -296,6 +298,7 @@ struct device_driver {
 	int (*suspend) (struct device *dev, pm_message_t state);
 	int (*resume) (struct device *dev);
 	const struct attribute_group **groups;
+	const struct attribute_group **dev_groups;
 
 	const struct dev_pm_ops *pm;
 	void (*coredump) (struct device *dev);
-- 
cgit v1.2.3


From ee38d94a0ad89890b770f6c876263cf9fcbfde84 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Fri, 2 Aug 2019 21:49:02 -0700
Subject: page flags: prioritize kasan bits over last-cpuid

ARM64 randconfig builds regularly run into a build error, especially
when NUMA_BALANCING and SPARSEMEM are enabled but not SPARSEMEM_VMEMMAP:

  #error "KASAN: not enough bits in page flags for tag"

The last-cpuid bits are already conditional on the available space, so
the result of the calculation is a bit random on whether they were
already left out or not.

Adding the kasan tag bits before last-cpuid makes it much more likely to
end up with a successful build here, and should be reliable for
randconfig at least, as long as that does not randomize NR_CPUS or
NODES_SHIFT but uses the defaults.

In order for the modified check to not trigger in the x86 vdso32 code
where all constants are wrong (building with -m32), enclose all the
definitions with an #ifdef.

[arnd@arndb.de: build fix]
  Link: http://lkml.kernel.org/r/CAK8P3a3Mno1SWTcuAOT0Wa9VS15pdU6EfnkxLbDpyS55yO04+g@mail.gmail.com
Link: http://lkml.kernel.org/r/20190722115520.3743282-1-arnd@arndb.de
Link: https://lore.kernel.org/lkml/20190618095347.3850490-1-arnd@arndb.de/
Fixes: 2813b9c02962 ("kasan, mm, arm64: tag non slab memory allocated via pagealloc")
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Reviewed-by: Andrey Konovalov <andreyknvl@google.com>
Reviewed-by: Andrey Ryabinin <aryabinin@virtuozzo.com>
Cc: Andrey Konovalov <andreyknvl@google.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/mips/vdso/vdso.h             |  1 +
 include/linux/page-flags-layout.h | 18 +++++++++++-------
 2 files changed, 12 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/arch/mips/vdso/vdso.h b/arch/mips/vdso/vdso.h
index 14b1931be69c..b65b169778e3 100644
--- a/arch/mips/vdso/vdso.h
+++ b/arch/mips/vdso/vdso.h
@@ -9,6 +9,7 @@
 #if _MIPS_SIM != _MIPS_SIM_ABI64 && defined(CONFIG_64BIT)
 
 /* Building 32-bit VDSO for the 64-bit kernel. Fake a 32-bit Kconfig. */
+#define BUILD_VDSO32_64
 #undef CONFIG_64BIT
 #define CONFIG_32BIT 1
 #ifndef __ASSEMBLY__
diff --git a/include/linux/page-flags-layout.h b/include/linux/page-flags-layout.h
index 1dda31825ec4..71283739ffd2 100644
--- a/include/linux/page-flags-layout.h
+++ b/include/linux/page-flags-layout.h
@@ -32,6 +32,7 @@
 
 #endif /* CONFIG_SPARSEMEM */
 
+#ifndef BUILD_VDSO32_64
 /*
  * page->flags layout:
  *
@@ -76,20 +77,22 @@
 #define LAST_CPUPID_SHIFT 0
 #endif
 
-#if SECTIONS_WIDTH+ZONES_WIDTH+NODES_SHIFT+LAST_CPUPID_SHIFT <= BITS_PER_LONG - NR_PAGEFLAGS
+#ifdef CONFIG_KASAN_SW_TAGS
+#define KASAN_TAG_WIDTH 8
+#else
+#define KASAN_TAG_WIDTH 0
+#endif
+
+#if SECTIONS_WIDTH+ZONES_WIDTH+NODES_SHIFT+LAST_CPUPID_SHIFT+KASAN_TAG_WIDTH \
+	<= BITS_PER_LONG - NR_PAGEFLAGS
 #define LAST_CPUPID_WIDTH LAST_CPUPID_SHIFT
 #else
 #define LAST_CPUPID_WIDTH 0
 #endif
 
-#ifdef CONFIG_KASAN_SW_TAGS
-#define KASAN_TAG_WIDTH 8
 #if SECTIONS_WIDTH+NODES_WIDTH+ZONES_WIDTH+LAST_CPUPID_WIDTH+KASAN_TAG_WIDTH \
 	> BITS_PER_LONG - NR_PAGEFLAGS
-#error "KASAN: not enough bits in page flags for tag"
-#endif
-#else
-#define KASAN_TAG_WIDTH 0
+#error "Not enough bits in page flags"
 #endif
 
 /*
@@ -104,4 +107,5 @@
 #define LAST_CPUPID_NOT_IN_PAGE_FLAGS
 #endif
 
+#endif
 #endif /* _LINUX_PAGE_FLAGS_LAYOUT */
-- 
cgit v1.2.3


From 17e433b54393a6269acbcb792da97791fe1592d8 Mon Sep 17 00:00:00 2001
From: Wanpeng Li <wanpengli@tencent.com>
Date: Mon, 5 Aug 2019 10:03:19 +0800
Subject: KVM: Fix leak vCPU's VMCS value into other pCPU
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

After commit d73eb57b80b (KVM: Boost vCPUs that are delivering interrupts), a
five years old bug is exposed. Running ebizzy benchmark in three 80 vCPUs VMs
on one 80 pCPUs Skylake server, a lot of rcu_sched stall warning splatting
in the VMs after stress testing:

 INFO: rcu_sched detected stalls on CPUs/tasks: { 4 41 57 62 77} (detected by 15, t=60004 jiffies, g=899, c=898, q=15073)
 Call Trace:
   flush_tlb_mm_range+0x68/0x140
   tlb_flush_mmu.part.75+0x37/0xe0
   tlb_finish_mmu+0x55/0x60
   zap_page_range+0x142/0x190
   SyS_madvise+0x3cd/0x9c0
   system_call_fastpath+0x1c/0x21

swait_active() sustains to be true before finish_swait() is called in
kvm_vcpu_block(), voluntarily preempted vCPUs are taken into account
by kvm_vcpu_on_spin() loop greatly increases the probability condition
kvm_arch_vcpu_runnable(vcpu) is checked and can be true, when APICv
is enabled the yield-candidate vCPU's VMCS RVI field leaks(by
vmx_sync_pir_to_irr()) into spinning-on-a-taken-lock vCPU's current
VMCS.

This patch fixes it by checking conservatively a subset of events.

Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Radim Krčmář <rkrcmar@redhat.com>
Cc: Christian Borntraeger <borntraeger@de.ibm.com>
Cc: Marc Zyngier <Marc.Zyngier@arm.com>
Cc: stable@vger.kernel.org
Fixes: 98f4a1467 (KVM: add kvm_arch_vcpu_runnable() test to kvm_vcpu_on_spin() loop)
Signed-off-by: Wanpeng Li <wanpengli@tencent.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/powerpc/kvm/powerpc.c      |  5 +++++
 arch/x86/include/asm/kvm_host.h |  1 +
 arch/x86/kvm/svm.c              |  6 ++++++
 arch/x86/kvm/vmx/vmx.c          |  6 ++++++
 arch/x86/kvm/x86.c              | 16 ++++++++++++++++
 include/linux/kvm_host.h        |  1 +
 virt/kvm/kvm_main.c             | 25 ++++++++++++++++++++++++-
 7 files changed, 59 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 0dba7eb24f92..3e34d5fa6708 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -50,6 +50,11 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *v)
 	return !!(v->arch.pending_exceptions) || kvm_request_pending(v);
 }
 
+bool kvm_arch_dy_runnable(struct kvm_vcpu *vcpu)
+{
+	return kvm_arch_vcpu_runnable(vcpu);
+}
+
 bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu)
 {
 	return false;
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index e74f0711eaaf..fc046ca89d32 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1175,6 +1175,7 @@ struct kvm_x86_ops {
 	int (*update_pi_irte)(struct kvm *kvm, unsigned int host_irq,
 			      uint32_t guest_irq, bool set);
 	void (*apicv_post_state_restore)(struct kvm_vcpu *vcpu);
+	bool (*dy_apicv_has_pending_interrupt)(struct kvm_vcpu *vcpu);
 
 	int (*set_hv_timer)(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc,
 			    bool *expired);
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 7eafc6907861..d685491fce4d 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -5190,6 +5190,11 @@ static void svm_deliver_avic_intr(struct kvm_vcpu *vcpu, int vec)
 		kvm_vcpu_wake_up(vcpu);
 }
 
+static bool svm_dy_apicv_has_pending_interrupt(struct kvm_vcpu *vcpu)
+{
+	return false;
+}
+
 static void svm_ir_list_del(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi)
 {
 	unsigned long flags;
@@ -7314,6 +7319,7 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = {
 
 	.pmu_ops = &amd_pmu_ops,
 	.deliver_posted_interrupt = svm_deliver_avic_intr,
+	.dy_apicv_has_pending_interrupt = svm_dy_apicv_has_pending_interrupt,
 	.update_pi_irte = svm_update_pi_irte,
 	.setup_mce = svm_setup_mce,
 
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 074385c86c09..42ed3faa6af8 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -6117,6 +6117,11 @@ static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu)
 	return max_irr;
 }
 
+static bool vmx_dy_apicv_has_pending_interrupt(struct kvm_vcpu *vcpu)
+{
+	return pi_test_on(vcpu_to_pi_desc(vcpu));
+}
+
 static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
 {
 	if (!kvm_vcpu_apicv_active(vcpu))
@@ -7726,6 +7731,7 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
 	.guest_apic_has_interrupt = vmx_guest_apic_has_interrupt,
 	.sync_pir_to_irr = vmx_sync_pir_to_irr,
 	.deliver_posted_interrupt = vmx_deliver_posted_interrupt,
+	.dy_apicv_has_pending_interrupt = vmx_dy_apicv_has_pending_interrupt,
 
 	.set_tss_addr = vmx_set_tss_addr,
 	.set_identity_map_addr = vmx_set_identity_map_addr,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index c6d951cbd76c..93b0bd45ac73 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -9698,6 +9698,22 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
 	return kvm_vcpu_running(vcpu) || kvm_vcpu_has_events(vcpu);
 }
 
+bool kvm_arch_dy_runnable(struct kvm_vcpu *vcpu)
+{
+	if (READ_ONCE(vcpu->arch.pv.pv_unhalted))
+		return true;
+
+	if (kvm_test_request(KVM_REQ_NMI, vcpu) ||
+		kvm_test_request(KVM_REQ_SMI, vcpu) ||
+		 kvm_test_request(KVM_REQ_EVENT, vcpu))
+		return true;
+
+	if (vcpu->arch.apicv_active && kvm_x86_ops->dy_apicv_has_pending_interrupt(vcpu))
+		return true;
+
+	return false;
+}
+
 bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu)
 {
 	return vcpu->arch.preempted_in_kernel;
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 5c5b5867024c..9e4c2bb90297 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -872,6 +872,7 @@ int kvm_arch_check_processor_compat(void);
 int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu);
 bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu);
 int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu);
+bool kvm_arch_dy_runnable(struct kvm_vcpu *vcpu);
 
 #ifndef __KVM_HAVE_ARCH_VM_ALLOC
 /*
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index ed061d8a457c..1f05aeb9da27 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -2477,6 +2477,29 @@ static bool kvm_vcpu_eligible_for_directed_yield(struct kvm_vcpu *vcpu)
 #endif
 }
 
+/*
+ * Unlike kvm_arch_vcpu_runnable, this function is called outside
+ * a vcpu_load/vcpu_put pair.  However, for most architectures
+ * kvm_arch_vcpu_runnable does not require vcpu_load.
+ */
+bool __weak kvm_arch_dy_runnable(struct kvm_vcpu *vcpu)
+{
+	return kvm_arch_vcpu_runnable(vcpu);
+}
+
+static bool vcpu_dy_runnable(struct kvm_vcpu *vcpu)
+{
+	if (kvm_arch_dy_runnable(vcpu))
+		return true;
+
+#ifdef CONFIG_KVM_ASYNC_PF
+	if (!list_empty_careful(&vcpu->async_pf.done))
+		return true;
+#endif
+
+	return false;
+}
+
 void kvm_vcpu_on_spin(struct kvm_vcpu *me, bool yield_to_kernel_mode)
 {
 	struct kvm *kvm = me->kvm;
@@ -2506,7 +2529,7 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me, bool yield_to_kernel_mode)
 				continue;
 			if (vcpu == me)
 				continue;
-			if (swait_active(&vcpu->wq) && !kvm_arch_vcpu_runnable(vcpu))
+			if (swait_active(&vcpu->wq) && !vcpu_dy_runnable(vcpu))
 				continue;
 			if (READ_ONCE(vcpu->preempted) && yield_to_kernel_mode &&
 				!kvm_arch_vcpu_in_kernel(vcpu))
-- 
cgit v1.2.3


From 741cbbae0768b828be2d48331eb371a4f08bbea8 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Sat, 3 Aug 2019 08:14:25 +0200
Subject: KVM: remove kvm_arch_has_vcpu_debugfs()

There is no need for this function as all arches have to implement
kvm_arch_create_vcpu_debugfs() no matter what.  A #define symbol
lets us actually simplify the code.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/mips/kvm/mips.c            | 10 ----------
 arch/powerpc/kvm/powerpc.c      | 10 ----------
 arch/s390/kvm/kvm-s390.c        | 10 ----------
 arch/x86/include/asm/kvm_host.h |  2 ++
 arch/x86/kvm/debugfs.c          |  5 -----
 include/linux/kvm_host.h        |  3 ++-
 virt/kvm/arm/arm.c              |  5 -----
 virt/kvm/kvm_main.c             |  5 ++---
 8 files changed, 6 insertions(+), 44 deletions(-)

(limited to 'include/linux')

diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c
index 2cfe839f0b3a..1109924560d8 100644
--- a/arch/mips/kvm/mips.c
+++ b/arch/mips/kvm/mips.c
@@ -150,16 +150,6 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
 	return 0;
 }
 
-bool kvm_arch_has_vcpu_debugfs(void)
-{
-	return false;
-}
-
-int kvm_arch_create_vcpu_debugfs(struct kvm_vcpu *vcpu)
-{
-	return 0;
-}
-
 void kvm_mips_free_vcpus(struct kvm *kvm)
 {
 	unsigned int i;
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 3e34d5fa6708..3e566c2e6066 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -457,16 +457,6 @@ err_out:
 	return -EINVAL;
 }
 
-bool kvm_arch_has_vcpu_debugfs(void)
-{
-	return false;
-}
-
-int kvm_arch_create_vcpu_debugfs(struct kvm_vcpu *vcpu)
-{
-	return 0;
-}
-
 void kvm_arch_destroy_vm(struct kvm *kvm)
 {
 	unsigned int i;
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 3f520cd837fb..f329dcb3f44c 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -2516,16 +2516,6 @@ out_err:
 	return rc;
 }
 
-bool kvm_arch_has_vcpu_debugfs(void)
-{
-	return false;
-}
-
-int kvm_arch_create_vcpu_debugfs(struct kvm_vcpu *vcpu)
-{
-	return 0;
-}
-
 void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
 {
 	VCPU_EVENT(vcpu, 3, "%s", "free cpu");
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index fc046ca89d32..e92725b2a46f 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -35,6 +35,8 @@
 #include <asm/kvm_vcpu_regs.h>
 #include <asm/hyperv-tlfs.h>
 
+#define __KVM_HAVE_ARCH_VCPU_DEBUGFS
+
 #define KVM_MAX_VCPUS 288
 #define KVM_SOFT_MAX_VCPUS 240
 #define KVM_MAX_VCPU_ID 1023
diff --git a/arch/x86/kvm/debugfs.c b/arch/x86/kvm/debugfs.c
index 329361b69d5e..9bd93e0d5f63 100644
--- a/arch/x86/kvm/debugfs.c
+++ b/arch/x86/kvm/debugfs.c
@@ -8,11 +8,6 @@
 #include <linux/debugfs.h>
 #include "lapic.h"
 
-bool kvm_arch_has_vcpu_debugfs(void)
-{
-	return true;
-}
-
 static int vcpu_get_timer_advance_ns(void *data, u64 *val)
 {
 	struct kvm_vcpu *vcpu = (struct kvm_vcpu *) data;
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 9e4c2bb90297..8d34db3c8bc6 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -861,8 +861,9 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu);
 void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu);
 void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu);
 
-bool kvm_arch_has_vcpu_debugfs(void);
+#ifdef __KVM_HAVE_ARCH_VCPU_DEBUGFS
 int kvm_arch_create_vcpu_debugfs(struct kvm_vcpu *vcpu);
+#endif
 
 int kvm_arch_hardware_enable(void);
 void kvm_arch_hardware_disable(void);
diff --git a/virt/kvm/arm/arm.c b/virt/kvm/arm/arm.c
index acc43242a310..13f5a1aa6d79 100644
--- a/virt/kvm/arm/arm.c
+++ b/virt/kvm/arm/arm.c
@@ -144,11 +144,6 @@ out_fail_alloc:
 	return ret;
 }
 
-bool kvm_arch_has_vcpu_debugfs(void)
-{
-	return false;
-}
-
 int kvm_arch_create_vcpu_debugfs(struct kvm_vcpu *vcpu)
 {
 	return 0;
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 1f05aeb9da27..4afb1a234018 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -2617,12 +2617,10 @@ static int create_vcpu_fd(struct kvm_vcpu *vcpu)
 
 static int kvm_create_vcpu_debugfs(struct kvm_vcpu *vcpu)
 {
+#ifdef __KVM_HAVE_ARCH_VCPU_DEBUGFS
 	char dir_name[ITOA_MAX_LEN * 2];
 	int ret;
 
-	if (!kvm_arch_has_vcpu_debugfs())
-		return 0;
-
 	if (!debugfs_initialized())
 		return 0;
 
@@ -2637,6 +2635,7 @@ static int kvm_create_vcpu_debugfs(struct kvm_vcpu *vcpu)
 		debugfs_remove_recursive(vcpu->debugfs_dentry);
 		return ret;
 	}
+#endif
 
 	return 0;
 }
-- 
cgit v1.2.3


From 3e7093d045196b1016517631645e874fe903db7e Mon Sep 17 00:00:00 2001
From: Greg KH <gregkh@linuxfoundation.org>
Date: Wed, 31 Jul 2019 20:56:20 +0200
Subject: KVM: no need to check return value of debugfs_create functions
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When calling debugfs functions, there is no need to ever check the
return value.  The function can work or not, but the code logic should
never do something different based on this.

Also, when doing this, change kvm_arch_create_vcpu_debugfs() to return
void instead of an integer, as we should not care at all about if this
function actually does anything or not.

Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: "Radim Krčmář" <rkrcmar@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: <x86@kernel.org>
Cc: <kvm@vger.kernel.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/debugfs.c   | 41 +++++++++++++----------------------------
 include/linux/kvm_host.h |  2 +-
 virt/kvm/kvm_main.c      | 21 +++++----------------
 3 files changed, 19 insertions(+), 45 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/kvm/debugfs.c b/arch/x86/kvm/debugfs.c
index 9bd93e0d5f63..018aebce33ff 100644
--- a/arch/x86/kvm/debugfs.c
+++ b/arch/x86/kvm/debugfs.c
@@ -43,37 +43,22 @@ static int vcpu_get_tsc_scaling_frac_bits(void *data, u64 *val)
 
 DEFINE_SIMPLE_ATTRIBUTE(vcpu_tsc_scaling_frac_fops, vcpu_get_tsc_scaling_frac_bits, NULL, "%llu\n");
 
-int kvm_arch_create_vcpu_debugfs(struct kvm_vcpu *vcpu)
+void kvm_arch_create_vcpu_debugfs(struct kvm_vcpu *vcpu)
 {
-	struct dentry *ret;
+	debugfs_create_file("tsc-offset", 0444, vcpu->debugfs_dentry, vcpu,
+			    &vcpu_tsc_offset_fops);
 
-	ret = debugfs_create_file("tsc-offset", 0444,
-							vcpu->debugfs_dentry,
-							vcpu, &vcpu_tsc_offset_fops);
-	if (!ret)
-		return -ENOMEM;
-
-	if (lapic_in_kernel(vcpu)) {
-		ret = debugfs_create_file("lapic_timer_advance_ns", 0444,
-								vcpu->debugfs_dentry,
-								vcpu, &vcpu_timer_advance_ns_fops);
-		if (!ret)
-			return -ENOMEM;
-	}
+	if (lapic_in_kernel(vcpu))
+		debugfs_create_file("lapic_timer_advance_ns", 0444,
+				    vcpu->debugfs_dentry, vcpu,
+				    &vcpu_timer_advance_ns_fops);
 
 	if (kvm_has_tsc_control) {
-		ret = debugfs_create_file("tsc-scaling-ratio", 0444,
-							vcpu->debugfs_dentry,
-							vcpu, &vcpu_tsc_scaling_fops);
-		if (!ret)
-			return -ENOMEM;
-		ret = debugfs_create_file("tsc-scaling-ratio-frac-bits", 0444,
-							vcpu->debugfs_dentry,
-							vcpu, &vcpu_tsc_scaling_frac_fops);
-		if (!ret)
-			return -ENOMEM;
-
+		debugfs_create_file("tsc-scaling-ratio", 0444,
+				    vcpu->debugfs_dentry, vcpu,
+				    &vcpu_tsc_scaling_fops);
+		debugfs_create_file("tsc-scaling-ratio-frac-bits", 0444,
+				    vcpu->debugfs_dentry, vcpu,
+				    &vcpu_tsc_scaling_frac_fops);
 	}
-
-	return 0;
 }
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 8d34db3c8bc6..fcb46b3374c6 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -862,7 +862,7 @@ void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu);
 void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu);
 
 #ifdef __KVM_HAVE_ARCH_VCPU_DEBUGFS
-int kvm_arch_create_vcpu_debugfs(struct kvm_vcpu *vcpu);
+void kvm_arch_create_vcpu_debugfs(struct kvm_vcpu *vcpu);
 #endif
 
 int kvm_arch_hardware_enable(void);
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 4afb1a234018..4feceaa03fb1 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -2615,29 +2615,20 @@ static int create_vcpu_fd(struct kvm_vcpu *vcpu)
 	return anon_inode_getfd(name, &kvm_vcpu_fops, vcpu, O_RDWR | O_CLOEXEC);
 }
 
-static int kvm_create_vcpu_debugfs(struct kvm_vcpu *vcpu)
+static void kvm_create_vcpu_debugfs(struct kvm_vcpu *vcpu)
 {
 #ifdef __KVM_HAVE_ARCH_VCPU_DEBUGFS
 	char dir_name[ITOA_MAX_LEN * 2];
-	int ret;
 
 	if (!debugfs_initialized())
-		return 0;
+		return;
 
 	snprintf(dir_name, sizeof(dir_name), "vcpu%d", vcpu->vcpu_id);
 	vcpu->debugfs_dentry = debugfs_create_dir(dir_name,
-								vcpu->kvm->debugfs_dentry);
-	if (!vcpu->debugfs_dentry)
-		return -ENOMEM;
+						  vcpu->kvm->debugfs_dentry);
 
-	ret = kvm_arch_create_vcpu_debugfs(vcpu);
-	if (ret < 0) {
-		debugfs_remove_recursive(vcpu->debugfs_dentry);
-		return ret;
-	}
+	kvm_arch_create_vcpu_debugfs(vcpu);
 #endif
-
-	return 0;
 }
 
 /*
@@ -2672,9 +2663,7 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id)
 	if (r)
 		goto vcpu_destroy;
 
-	r = kvm_create_vcpu_debugfs(vcpu);
-	if (r)
-		goto vcpu_destroy;
+	kvm_create_vcpu_debugfs(vcpu);
 
 	mutex_lock(&kvm->lock);
 	if (kvm_get_vcpu_by_id(kvm, id)) {
-- 
cgit v1.2.3


From 060157e1dbc133075a2e20786d6ff6d4b41909f9 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Fri, 9 Aug 2019 14:55:43 -0700
Subject: Input: remove w90x900 keyboard driver

The ARM w90x900 platform is getting removed, so this driver is obsolete.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Dmitry Torokhov <dmitry.torokhov@gmail.com>
---
 drivers/input/keyboard/Kconfig               |  11 --
 drivers/input/keyboard/Makefile              |   1 -
 drivers/input/keyboard/w90p910_keypad.c      | 264 ---------------------------
 include/linux/platform_data/keypad-w90p910.h |  16 --
 4 files changed, 292 deletions(-)
 delete mode 100644 drivers/input/keyboard/w90p910_keypad.c
 delete mode 100644 include/linux/platform_data/keypad-w90p910.h

(limited to 'include/linux')

diff --git a/drivers/input/keyboard/Kconfig b/drivers/input/keyboard/Kconfig
index 8e9c3ea9d5e7..c1da129a4eb5 100644
--- a/drivers/input/keyboard/Kconfig
+++ b/drivers/input/keyboard/Kconfig
@@ -731,17 +731,6 @@ config KEYBOARD_XTKBD
 	  To compile this driver as a module, choose M here: the
 	  module will be called xtkbd.
 
-config KEYBOARD_W90P910
-	tristate "W90P910 Matrix Keypad support"
-	depends on ARCH_W90X900
-	select INPUT_MATRIXKMAP
-	help
-	  Say Y here to enable the matrix keypad on evaluation board
-	  based on W90P910.
-
-	  To compile this driver as a module, choose M here: the
-	  module will be called w90p910_keypad.
-
 config KEYBOARD_CROS_EC
 	tristate "ChromeOS EC keyboard"
 	select INPUT_MATRIXKMAP
diff --git a/drivers/input/keyboard/Makefile b/drivers/input/keyboard/Makefile
index 06a0af6efeae..9510325c0c5d 100644
--- a/drivers/input/keyboard/Makefile
+++ b/drivers/input/keyboard/Makefile
@@ -68,4 +68,3 @@ obj-$(CONFIG_KEYBOARD_TEGRA)		+= tegra-kbc.o
 obj-$(CONFIG_KEYBOARD_TM2_TOUCHKEY)	+= tm2-touchkey.o
 obj-$(CONFIG_KEYBOARD_TWL4030)		+= twl4030_keypad.o
 obj-$(CONFIG_KEYBOARD_XTKBD)		+= xtkbd.o
-obj-$(CONFIG_KEYBOARD_W90P910)		+= w90p910_keypad.o
diff --git a/drivers/input/keyboard/w90p910_keypad.c b/drivers/input/keyboard/w90p910_keypad.c
deleted file mode 100644
index c88d05d6108a..000000000000
--- a/drivers/input/keyboard/w90p910_keypad.c
+++ /dev/null
@@ -1,264 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Copyright (c) 2008-2009 Nuvoton technology corporation.
- *
- * Wan ZongShun <mcuos.com@gmail.com>
- */
-
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/interrupt.h>
-#include <linux/input.h>
-#include <linux/device.h>
-#include <linux/platform_device.h>
-#include <linux/clk.h>
-#include <linux/err.h>
-#include <linux/io.h>
-#include <linux/slab.h>
-
-#include <linux/platform_data/keypad-w90p910.h>
-
-/* Keypad Interface Control Registers */
-#define KPI_CONF		0x00
-#define KPI_3KCONF		0x04
-#define KPI_LPCONF		0x08
-#define KPI_STATUS		0x0C
-
-#define IS1KEY			(0x01 << 16)
-#define INTTR			(0x01 << 21)
-#define KEY0R			(0x0f << 3)
-#define KEY0C			0x07
-#define DEBOUNCE_BIT		0x08
-#define KSIZE0			(0x01 << 16)
-#define KSIZE1			(0x01 << 17)
-#define KPSEL			(0x01 << 19)
-#define ENKP			(0x01 << 18)
-
-#define KGET_RAW(n)		(((n) & KEY0R) >> 3)
-#define KGET_COLUMN(n)		((n) & KEY0C)
-
-#define W90P910_NUM_ROWS	8
-#define W90P910_NUM_COLS	8
-#define W90P910_ROW_SHIFT	3
-
-struct w90p910_keypad {
-	const struct w90p910_keypad_platform_data *pdata;
-	struct clk *clk;
-	struct input_dev *input_dev;
-	void __iomem *mmio_base;
-	int irq;
-	unsigned short keymap[W90P910_NUM_ROWS * W90P910_NUM_COLS];
-};
-
-static void w90p910_keypad_scan_matrix(struct w90p910_keypad *keypad,
-							unsigned int status)
-{
-	struct input_dev *input_dev = keypad->input_dev;
-	unsigned int row = KGET_RAW(status);
-	unsigned int col = KGET_COLUMN(status);
-	unsigned int code = MATRIX_SCAN_CODE(row, col, W90P910_ROW_SHIFT);
-	unsigned int key = keypad->keymap[code];
-
-	input_event(input_dev, EV_MSC, MSC_SCAN, code);
-	input_report_key(input_dev, key, 1);
-	input_sync(input_dev);
-
-	input_event(input_dev, EV_MSC, MSC_SCAN, code);
-	input_report_key(input_dev, key, 0);
-	input_sync(input_dev);
-}
-
-static irqreturn_t w90p910_keypad_irq_handler(int irq, void *dev_id)
-{
-	struct w90p910_keypad *keypad = dev_id;
-	unsigned int  kstatus, val;
-
-	kstatus = __raw_readl(keypad->mmio_base + KPI_STATUS);
-
-	val = INTTR | IS1KEY;
-
-	if (kstatus & val)
-		w90p910_keypad_scan_matrix(keypad, kstatus);
-
-	return IRQ_HANDLED;
-}
-
-static int w90p910_keypad_open(struct input_dev *dev)
-{
-	struct w90p910_keypad *keypad = input_get_drvdata(dev);
-	const struct w90p910_keypad_platform_data *pdata = keypad->pdata;
-	unsigned int val, config;
-
-	/* Enable unit clock */
-	clk_enable(keypad->clk);
-
-	val = __raw_readl(keypad->mmio_base + KPI_CONF);
-	val |= (KPSEL | ENKP);
-	val &= ~(KSIZE0 | KSIZE1);
-
-	config = pdata->prescale | (pdata->debounce << DEBOUNCE_BIT);
-
-	val |= config;
-
-	__raw_writel(val, keypad->mmio_base + KPI_CONF);
-
-	return 0;
-}
-
-static void w90p910_keypad_close(struct input_dev *dev)
-{
-	struct w90p910_keypad *keypad = input_get_drvdata(dev);
-
-	/* Disable clock unit */
-	clk_disable(keypad->clk);
-}
-
-static int w90p910_keypad_probe(struct platform_device *pdev)
-{
-	const struct w90p910_keypad_platform_data *pdata =
-						dev_get_platdata(&pdev->dev);
-	const struct matrix_keymap_data *keymap_data;
-	struct w90p910_keypad *keypad;
-	struct input_dev *input_dev;
-	struct resource *res;
-	int irq;
-	int error;
-
-	if (!pdata) {
-		dev_err(&pdev->dev, "no platform data defined\n");
-		return -EINVAL;
-	}
-
-	keymap_data = pdata->keymap_data;
-
-	irq = platform_get_irq(pdev, 0);
-	if (irq < 0) {
-		dev_err(&pdev->dev, "failed to get keypad irq\n");
-		return -ENXIO;
-	}
-
-	keypad = kzalloc(sizeof(struct w90p910_keypad), GFP_KERNEL);
-	input_dev = input_allocate_device();
-	if (!keypad || !input_dev) {
-		dev_err(&pdev->dev, "failed to allocate driver data\n");
-		error = -ENOMEM;
-		goto failed_free;
-	}
-
-	keypad->pdata = pdata;
-	keypad->input_dev = input_dev;
-	keypad->irq = irq;
-
-	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
-	if (res == NULL) {
-		dev_err(&pdev->dev, "failed to get I/O memory\n");
-		error = -ENXIO;
-		goto failed_free;
-	}
-
-	res = request_mem_region(res->start, resource_size(res), pdev->name);
-	if (res == NULL) {
-		dev_err(&pdev->dev, "failed to request I/O memory\n");
-		error = -EBUSY;
-		goto failed_free;
-	}
-
-	keypad->mmio_base = ioremap(res->start, resource_size(res));
-	if (keypad->mmio_base == NULL) {
-		dev_err(&pdev->dev, "failed to remap I/O memory\n");
-		error = -ENXIO;
-		goto failed_free_res;
-	}
-
-	keypad->clk = clk_get(&pdev->dev, NULL);
-	if (IS_ERR(keypad->clk)) {
-		dev_err(&pdev->dev, "failed to get keypad clock\n");
-		error = PTR_ERR(keypad->clk);
-		goto failed_free_io;
-	}
-
-	/* set multi-function pin for w90p910 kpi. */
-	mfp_set_groupi(&pdev->dev);
-
-	input_dev->name = pdev->name;
-	input_dev->id.bustype = BUS_HOST;
-	input_dev->open = w90p910_keypad_open;
-	input_dev->close = w90p910_keypad_close;
-	input_dev->dev.parent = &pdev->dev;
-
-	error = matrix_keypad_build_keymap(keymap_data, NULL,
-					   W90P910_NUM_ROWS, W90P910_NUM_COLS,
-					   keypad->keymap, input_dev);
-	if (error) {
-		dev_err(&pdev->dev, "failed to build keymap\n");
-		goto failed_put_clk;
-	}
-
-	error = request_irq(keypad->irq, w90p910_keypad_irq_handler,
-			    0, pdev->name, keypad);
-	if (error) {
-		dev_err(&pdev->dev, "failed to request IRQ\n");
-		goto failed_put_clk;
-	}
-
-	__set_bit(EV_REP, input_dev->evbit);
-	input_set_capability(input_dev, EV_MSC, MSC_SCAN);
-	input_set_drvdata(input_dev, keypad);
-
-	/* Register the input device */
-	error = input_register_device(input_dev);
-	if (error) {
-		dev_err(&pdev->dev, "failed to register input device\n");
-		goto failed_free_irq;
-	}
-
-	platform_set_drvdata(pdev, keypad);
-	return 0;
-
-failed_free_irq:
-	free_irq(irq, keypad);
-failed_put_clk:
-	clk_put(keypad->clk);
-failed_free_io:
-	iounmap(keypad->mmio_base);
-failed_free_res:
-	release_mem_region(res->start, resource_size(res));
-failed_free:
-	input_free_device(input_dev);
-	kfree(keypad);
-	return error;
-}
-
-static int w90p910_keypad_remove(struct platform_device *pdev)
-{
-	struct w90p910_keypad *keypad = platform_get_drvdata(pdev);
-	struct resource *res;
-
-	free_irq(keypad->irq, keypad);
-
-	clk_put(keypad->clk);
-
-	input_unregister_device(keypad->input_dev);
-
-	iounmap(keypad->mmio_base);
-	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
-	release_mem_region(res->start, resource_size(res));
-
-	kfree(keypad);
-
-	return 0;
-}
-
-static struct platform_driver w90p910_keypad_driver = {
-	.probe		= w90p910_keypad_probe,
-	.remove		= w90p910_keypad_remove,
-	.driver		= {
-		.name	= "nuc900-kpi",
-	},
-};
-module_platform_driver(w90p910_keypad_driver);
-
-MODULE_AUTHOR("Wan ZongShun <mcuos.com@gmail.com>");
-MODULE_DESCRIPTION("w90p910 keypad driver");
-MODULE_LICENSE("GPL");
-MODULE_ALIAS("platform:nuc900-keypad");
diff --git a/include/linux/platform_data/keypad-w90p910.h b/include/linux/platform_data/keypad-w90p910.h
deleted file mode 100644
index 206ca4ecd93f..000000000000
--- a/include/linux/platform_data/keypad-w90p910.h
+++ /dev/null
@@ -1,16 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef __ASM_ARCH_W90P910_KEYPAD_H
-#define __ASM_ARCH_W90P910_KEYPAD_H
-
-#include <linux/input/matrix_keypad.h>
-
-extern void mfp_set_groupi(struct device *dev);
-
-struct w90p910_keypad_platform_data {
-	const struct matrix_keymap_data *keymap_data;
-
-	unsigned int	prescale;
-	unsigned int	debounce;
-};
-
-#endif /* __ASM_ARCH_W90P910_KEYPAD_H */
-- 
cgit v1.2.3


From e95656ea15e54d4e6a192d560d84008b53fc1eb5 Mon Sep 17 00:00:00 2001
From: Dmitry Torokhov <dmitry.torokhov@gmail.com>
Date: Tue, 18 Apr 2017 17:28:30 -0700
Subject: Input: add support for polling to input devices
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Separating "normal" and "polled" input devices was a mistake, as we often
want to allow the very same device to work in both interrupt-driven and
polled modes, depending on the board on which the device is used.

This introduces new APIs:

- input_setup_polling
- input_set_poll_interval
- input_set_min_poll_interval
- input_set_max_poll_interval

These new APIs allow switching an input device into polled mode, with sysfs
attributes matching those of drivers using the input_polled_dev APIs, which
will eventually be removed.

Tested-by: Michal Vokáč <michal.vokac@ysoft.com>
Acked-by: Benjamin Tissoires <benjamin.tissoires@redhat.com>
Signed-off-by: Dmitry Torokhov <dmitry.torokhov@gmail.com>
---
 drivers/input/Makefile       |   2 +-
 drivers/input/input-poller.c | 213 +++++++++++++++++++++++++++++++++++++++++++
 drivers/input/input-poller.h |  18 ++++
 drivers/input/input.c        |  36 ++++++--
 include/linux/input.h        |  12 +++
 5 files changed, 273 insertions(+), 8 deletions(-)
 create mode 100644 drivers/input/input-poller.c
 create mode 100644 drivers/input/input-poller.h

(limited to 'include/linux')

diff --git a/drivers/input/Makefile b/drivers/input/Makefile
index 40de6a7be641..e35650930371 100644
--- a/drivers/input/Makefile
+++ b/drivers/input/Makefile
@@ -6,7 +6,7 @@
 # Each configuration option enables a list of files.
 
 obj-$(CONFIG_INPUT)		+= input-core.o
-input-core-y := input.o input-compat.o input-mt.o ff-core.o
+input-core-y := input.o input-compat.o input-mt.o input-poller.o ff-core.o
 
 obj-$(CONFIG_INPUT_FF_MEMLESS)	+= ff-memless.o
 obj-$(CONFIG_INPUT_POLLDEV)	+= input-polldev.o
diff --git a/drivers/input/input-poller.c b/drivers/input/input-poller.c
new file mode 100644
index 000000000000..1b3d28964bb2
--- /dev/null
+++ b/drivers/input/input-poller.c
@@ -0,0 +1,213 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Support for polling mode for input devices.
+ */
+
+#include <linux/device.h>
+#include <linux/input.h>
+#include <linux/jiffies.h>
+#include <linux/mutex.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+#include <linux/workqueue.h>
+#include "input-poller.h"
+
+struct input_dev_poller {
+	void (*poll)(struct input_dev *dev);
+
+	unsigned int poll_interval; /* msec */
+	unsigned int poll_interval_max; /* msec */
+	unsigned int poll_interval_min; /* msec */
+
+	struct input_dev *input;
+	struct delayed_work work;
+};
+
+static void input_dev_poller_queue_work(struct input_dev_poller *poller)
+{
+	unsigned long delay;
+
+	delay = msecs_to_jiffies(poller->poll_interval);
+	if (delay >= HZ)
+		delay = round_jiffies_relative(delay);
+
+	queue_delayed_work(system_freezable_wq, &poller->work, delay);
+}
+
+static void input_dev_poller_work(struct work_struct *work)
+{
+	struct input_dev_poller *poller =
+		container_of(work, struct input_dev_poller, work.work);
+
+	poller->poll(poller->input);
+	input_dev_poller_queue_work(poller);
+}
+
+void input_dev_poller_finalize(struct input_dev_poller *poller)
+{
+	if (!poller->poll_interval)
+		poller->poll_interval = 500;
+	if (!poller->poll_interval_max)
+		poller->poll_interval_max = poller->poll_interval;
+}
+
+void input_dev_poller_start(struct input_dev_poller *poller)
+{
+	/* Only start polling if polling is enabled */
+	if (poller->poll_interval > 0) {
+		poller->poll(poller->input);
+		input_dev_poller_queue_work(poller);
+	}
+}
+
+void input_dev_poller_stop(struct input_dev_poller *poller)
+{
+	cancel_delayed_work_sync(&poller->work);
+}
+
+int input_setup_polling(struct input_dev *dev,
+			void (*poll_fn)(struct input_dev *dev))
+{
+	struct input_dev_poller *poller;
+
+	poller = kzalloc(sizeof(*poller), GFP_KERNEL);
+	if (!poller) {
+		/*
+		 * We want to show message even though kzalloc() may have
+		 * printed backtrace as knowing what instance of input
+		 * device we were dealing with is helpful.
+		 */
+		dev_err(dev->dev.parent ?: &dev->dev,
+			"%s: unable to allocate poller structure\n", __func__);
+		return -ENOMEM;
+	}
+
+	INIT_DELAYED_WORK(&poller->work, input_dev_poller_work);
+	poller->input = dev;
+	poller->poll = poll_fn;
+
+	dev->poller = poller;
+	return 0;
+}
+EXPORT_SYMBOL(input_setup_polling);
+
+static bool input_dev_ensure_poller(struct input_dev *dev)
+{
+	if (!dev->poller) {
+		dev_err(dev->dev.parent ?: &dev->dev,
+			"poller structure has not been set up\n");
+		return false;
+	}
+
+	return true;
+}
+
+void input_set_poll_interval(struct input_dev *dev, unsigned int interval)
+{
+	if (input_dev_ensure_poller(dev))
+		dev->poller->poll_interval = interval;
+}
+EXPORT_SYMBOL(input_set_poll_interval);
+
+void input_set_min_poll_interval(struct input_dev *dev, unsigned int interval)
+{
+	if (input_dev_ensure_poller(dev))
+		dev->poller->poll_interval_min = interval;
+}
+EXPORT_SYMBOL(input_set_min_poll_interval);
+
+void input_set_max_poll_interval(struct input_dev *dev, unsigned int interval)
+{
+	if (input_dev_ensure_poller(dev))
+		dev->poller->poll_interval_max = interval;
+}
+EXPORT_SYMBOL(input_set_max_poll_interval);
+
+/* SYSFS interface */
+
+static ssize_t input_dev_get_poll_interval(struct device *dev,
+					   struct device_attribute *attr,
+					   char *buf)
+{
+	struct input_dev *input = to_input_dev(dev);
+
+	return sprintf(buf, "%d\n", input->poller->poll_interval);
+}
+
+static ssize_t input_dev_set_poll_interval(struct device *dev,
+					   struct device_attribute *attr,
+					   const char *buf, size_t count)
+{
+	struct input_dev *input = to_input_dev(dev);
+	struct input_dev_poller *poller = input->poller;
+	unsigned int interval;
+	int err;
+
+	err = kstrtouint(buf, 0, &interval);
+	if (err)
+		return err;
+
+	if (interval < poller->poll_interval_min)
+		return -EINVAL;
+
+	if (interval > poller->poll_interval_max)
+		return -EINVAL;
+
+	mutex_lock(&input->mutex);
+
+	poller->poll_interval = interval;
+
+	if (input->users) {
+		cancel_delayed_work_sync(&poller->work);
+		if (poller->poll_interval > 0)
+			input_dev_poller_queue_work(poller);
+	}
+
+	mutex_unlock(&input->mutex);
+
+	return count;
+}
+
+static DEVICE_ATTR(poll, 0644,
+		   input_dev_get_poll_interval, input_dev_set_poll_interval);
+
+static ssize_t input_dev_get_poll_max(struct device *dev,
+				      struct device_attribute *attr, char *buf)
+{
+	struct input_dev *input = to_input_dev(dev);
+
+	return sprintf(buf, "%d\n", input->poller->poll_interval_max);
+}
+
+static DEVICE_ATTR(max, 0444, input_dev_get_poll_max, NULL);
+
+static ssize_t input_dev_get_poll_min(struct device *dev,
+				     struct device_attribute *attr, char *buf)
+{
+	struct input_dev *input = to_input_dev(dev);
+
+	return sprintf(buf, "%d\n", input->poller->poll_interval_min);
+}
+
+static DEVICE_ATTR(min, 0444, input_dev_get_poll_min, NULL);
+
+static umode_t input_poller_attrs_visible(struct kobject *kobj,
+					  struct attribute *attr, int n)
+{
+	struct device *dev = kobj_to_dev(kobj);
+	struct input_dev *input = to_input_dev(dev);
+
+	return input->poller ? attr->mode : 0;
+}
+
+static struct attribute *input_poller_attrs[] = {
+	&dev_attr_poll.attr,
+	&dev_attr_max.attr,
+	&dev_attr_min.attr,
+	NULL
+};
+
+struct attribute_group input_poller_attribute_group = {
+	.is_visible	= input_poller_attrs_visible,
+	.attrs		= input_poller_attrs,
+};
diff --git a/drivers/input/input-poller.h b/drivers/input/input-poller.h
new file mode 100644
index 000000000000..e3fca0be1d32
--- /dev/null
+++ b/drivers/input/input-poller.h
@@ -0,0 +1,18 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+#ifndef _INPUT_POLLER_H
+#define _INPUT_POLLER_H
+
+/*
+ * Support for polling mode for input devices.
+ */
+#include <linux/sysfs.h>
+
+struct input_dev_poller;
+
+void input_dev_poller_finalize(struct input_dev_poller *poller);
+void input_dev_poller_start(struct input_dev_poller *poller);
+void input_dev_poller_stop(struct input_dev_poller *poller);
+
+extern struct attribute_group input_poller_attribute_group;
+
+#endif /* _INPUT_POLLER_H */
diff --git a/drivers/input/input.c b/drivers/input/input.c
index 7494a0dede79..c08aa3596144 100644
--- a/drivers/input/input.c
+++ b/drivers/input/input.c
@@ -24,6 +24,7 @@
 #include <linux/mutex.h>
 #include <linux/rcupdate.h>
 #include "input-compat.h"
+#include "input-poller.h"
 
 MODULE_AUTHOR("Vojtech Pavlik <vojtech@suse.cz>");
 MODULE_DESCRIPTION("Input core");
@@ -603,20 +604,31 @@ int input_open_device(struct input_handle *handle)
 
 	handle->open++;
 
-	if (!dev->users++ && dev->open)
-		retval = dev->open(dev);
+	if (dev->users++) {
+		/*
+		 * Device is already opened, so we can exit immediately and
+		 * report success.
+		 */
+		goto out;
+	}
 
-	if (retval) {
-		dev->users--;
-		if (!--handle->open) {
+	if (dev->open) {
+		retval = dev->open(dev);
+		if (retval) {
+			dev->users--;
+			handle->open--;
 			/*
 			 * Make sure we are not delivering any more events
 			 * through this handle
 			 */
 			synchronize_rcu();
+			goto out;
 		}
 	}
 
+	if (dev->poller)
+		input_dev_poller_start(dev->poller);
+
  out:
 	mutex_unlock(&dev->mutex);
 	return retval;
@@ -655,8 +667,13 @@ void input_close_device(struct input_handle *handle)
 
 	__input_release_device(handle);
 
-	if (!--dev->users && dev->close)
-		dev->close(dev);
+	if (!--dev->users) {
+		if (dev->poller)
+			input_dev_poller_stop(dev->poller);
+
+		if (dev->close)
+			dev->close(dev);
+	}
 
 	if (!--handle->open) {
 		/*
@@ -1502,6 +1519,7 @@ static const struct attribute_group *input_dev_attr_groups[] = {
 	&input_dev_attr_group,
 	&input_dev_id_attr_group,
 	&input_dev_caps_attr_group,
+	&input_poller_attribute_group,
 	NULL
 };
 
@@ -1511,6 +1529,7 @@ static void input_dev_release(struct device *device)
 
 	input_ff_destroy(dev);
 	input_mt_destroy_slots(dev);
+	kfree(dev->poller);
 	kfree(dev->absinfo);
 	kfree(dev->vals);
 	kfree(dev);
@@ -2175,6 +2194,9 @@ int input_register_device(struct input_dev *dev)
 	if (!dev->setkeycode)
 		dev->setkeycode = input_default_setkeycode;
 
+	if (dev->poller)
+		input_dev_poller_finalize(dev->poller);
+
 	error = device_add(&dev->dev);
 	if (error)
 		goto err_free_vals;
diff --git a/include/linux/input.h b/include/linux/input.h
index e95a439d8bd5..94f277cd806a 100644
--- a/include/linux/input.h
+++ b/include/linux/input.h
@@ -21,6 +21,8 @@
 #include <linux/timer.h>
 #include <linux/mod_devicetable.h>
 
+struct input_dev_poller;
+
 /**
  * struct input_value - input value representation
  * @type: type of value (EV_KEY, EV_ABS, etc)
@@ -71,6 +73,8 @@ enum input_clock_type {
  *	not sleep
  * @ff: force feedback structure associated with the device if device
  *	supports force feedback effects
+ * @poller: poller structure associated with the device if device is
+ *	set up to use polling mode
  * @repeat_key: stores key code of the last key pressed; used to implement
  *	software autorepeat
  * @timer: timer for software autorepeat
@@ -156,6 +160,8 @@ struct input_dev {
 
 	struct ff_device *ff;
 
+	struct input_dev_poller *poller;
+
 	unsigned int repeat_key;
 	struct timer_list timer;
 
@@ -372,6 +378,12 @@ void input_unregister_device(struct input_dev *);
 
 void input_reset_device(struct input_dev *);
 
+int input_setup_polling(struct input_dev *dev,
+			void (*poll_fn)(struct input_dev *dev));
+void input_set_poll_interval(struct input_dev *dev, unsigned int interval);
+void input_set_min_poll_interval(struct input_dev *dev, unsigned int interval);
+void input_set_max_poll_interval(struct input_dev *dev, unsigned int interval);
+
 int __must_check input_register_handler(struct input_handler *);
 void input_unregister_handler(struct input_handler *);
 
-- 
cgit v1.2.3


From a1b70a44b80af641a441937803cb8251e8e6d8e3 Mon Sep 17 00:00:00 2001
From: Linus Walleij <linus.walleij@linaro.org>
Date: Thu, 8 Aug 2019 11:03:59 -0700
Subject: Input: bu21013_ts - convert to use GPIO descriptors

This driver can use GPIO descriptors rather than GPIO numbers
without any problems, so convert it. Name the field variables after
the actual pins on the chip rather than the "reset" and "touch"
names from the devicetree bindings that are vaguely inaccurate.

No in-tree users pass GPIO numbers in platform data so drop
this. Descriptor tables can be used to get these GPIOs from a board
file if need be.

Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
Signed-off-by: Dmitry Torokhov <dmitry.torokhov@gmail.com>
---
 .../bindings/input/touchscreen/bu21013.txt         |  5 +-
 drivers/input/touchscreen/bu21013_ts.c             | 86 ++++++++++------------
 include/linux/input/bu21013.h                      |  4 -
 3 files changed, 41 insertions(+), 54 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/devicetree/bindings/input/touchscreen/bu21013.txt b/Documentation/devicetree/bindings/input/touchscreen/bu21013.txt
index 56d835242af2..43899fc36ecf 100644
--- a/Documentation/devicetree/bindings/input/touchscreen/bu21013.txt
+++ b/Documentation/devicetree/bindings/input/touchscreen/bu21013.txt
@@ -2,10 +2,11 @@
 
 Required properties:
  - compatible              : "rohm,bu21013_tp"
- - reg                     :  I2C device address
+ - reg                     : I2C device address
+ - reset-gpios             : GPIO pin enabling (selecting) chip (CS)
 
 Optional properties:
- - touch-gpio              : GPIO pin registering a touch event
+ - touch-gpios             : GPIO pin registering a touch event
  - <supply_name>-supply    : Phandle to a regulator supply
  - rohm,touch-max-x        : Maximum outward permitted limit in the X axis
  - rohm,touch-max-y        : Maximum outward permitted limit in the Y axis
diff --git a/drivers/input/touchscreen/bu21013_ts.c b/drivers/input/touchscreen/bu21013_ts.c
index 1d703e230ac3..c20f86f98ffc 100644
--- a/drivers/input/touchscreen/bu21013_ts.c
+++ b/drivers/input/touchscreen/bu21013_ts.c
@@ -14,11 +14,9 @@
 #include <linux/slab.h>
 #include <linux/regulator/consumer.h>
 #include <linux/module.h>
-#include <linux/gpio.h>
+#include <linux/gpio/consumer.h>
 #include <linux/of.h>
-#include <linux/of_gpio.h>
 
-#define PEN_DOWN_INTR	0
 #define MAX_FINGERS	2
 #define RESET_DELAY	30
 #define PENUP_TIMEOUT	(10)
@@ -143,8 +141,9 @@
  * @touch_stopped: touch stop flag
  * @chip: pointer to the touch panel controller
  * @in_dev: pointer to the input device structure
- * @intr_pin: interrupt pin value
  * @regulator: pointer to the Regulator used for touch screen
+ * @cs_gpiod: chip select GPIO line
+ * @int_gpiod: touch interrupt GPIO line
  *
  * Touch panel device data structure
  */
@@ -154,8 +153,9 @@ struct bu21013_ts_data {
 	const struct bu21013_platform_device *chip;
 	struct input_dev *in_dev;
 	struct regulator *regulator;
+	struct gpio_desc *cs_gpiod;
+	struct gpio_desc *int_gpiod;
 	unsigned int irq;
-	unsigned int intr_pin;
 	bool touch_stopped;
 };
 
@@ -257,20 +257,21 @@ static irqreturn_t bu21013_gpio_irq(int irq, void *device_data)
 {
 	struct bu21013_ts_data *data = device_data;
 	struct i2c_client *i2c = data->client;
+	int keep_polling;
 	int retval;
 
 	do {
 		retval = bu21013_do_touch_report(data);
 		if (retval < 0) {
 			dev_err(&i2c->dev, "bu21013_do_touch_report failed\n");
-			return IRQ_NONE;
+			break;
 		}
 
-		data->intr_pin = gpio_get_value(data->chip->touch_pin);
-		if (data->intr_pin == PEN_DOWN_INTR)
+		keep_polling = gpiod_get_value(data->int_gpiod);
+		if (keep_polling)
 			wait_event_timeout(data->wait, data->touch_stopped,
 					   msecs_to_jiffies(2));
-	} while (!data->intr_pin && !data->touch_stopped);
+	} while (keep_polling && !data->touch_stopped);
 
 	return IRQ_HANDLED;
 }
@@ -425,28 +426,6 @@ static void bu21013_free_irq(struct bu21013_ts_data *bu21013_data)
 	free_irq(bu21013_data->irq, bu21013_data);
 }
 
-/**
- * bu21013_cs_disable() - deconfigures the touch panel controller
- * @bu21013_data: device structure pointer
- *
- * This function is used to deconfigure the chip selection
- * for touch panel controller.
- */
-static void bu21013_cs_disable(struct bu21013_ts_data *bu21013_data)
-{
-	int error;
-
-	error = gpio_direction_output(bu21013_data->chip->cs_pin, 0);
-	if (error < 0)
-		dev_warn(&bu21013_data->client->dev,
-			 "%s: gpio direction failed, error: %d\n",
-			 __func__, error);
-	else
-		gpio_set_value(bu21013_data->chip->cs_pin, 0);
-
-	gpio_free(bu21013_data->chip->cs_pin);
-}
-
 #ifdef CONFIG_OF
 static const struct bu21013_platform_device *
 bu21013_parse_dt(struct device *dev)
@@ -471,9 +450,6 @@ bu21013_parse_dt(struct device *dev)
 	of_property_read_u32(np, "rohm,touch-max-x", &pdata->touch_x_max);
 	of_property_read_u32(np, "rohm,touch-max-y", &pdata->touch_y_max);
 
-	pdata->touch_pin = of_get_named_gpio(np, "touch-gpio", 0);
-	pdata->cs_pin = of_get_named_gpio(np, "reset-gpio", 0);
-
 	pdata->ext_clk = false;
 
 	return pdata;
@@ -516,11 +492,6 @@ static int bu21013_probe(struct i2c_client *client,
 			return PTR_ERR(pdata);
 	}
 
-	if (!gpio_is_valid(pdata->touch_pin)) {
-		dev_err(&client->dev, "invalid touch_pin supplied\n");
-		return -EINVAL;
-	}
-
 	bu21013_data = kzalloc(sizeof(struct bu21013_ts_data), GFP_KERNEL);
 	in_dev = input_allocate_device();
 	if (!bu21013_data || !in_dev) {
@@ -529,16 +500,26 @@ static int bu21013_probe(struct i2c_client *client,
 		goto err_free_mem;
 	}
 
+	/* Named "INT" on the chip, DT binding is "touch" */
+	bu21013_data->int_gpiod = gpiod_get(&client->dev, "touch", GPIOD_IN);
+	error = PTR_ERR_OR_ZERO(bu21013_data->int_gpiod);
+	if (error) {
+		if (error != -EPROBE_DEFER)
+			dev_err(&client->dev, "failed to get INT GPIO\n");
+		goto err_free_mem;
+	}
+	gpiod_set_consumer_name(bu21013_data->int_gpiod, "BU21013 INT");
+
 	bu21013_data->in_dev = in_dev;
 	bu21013_data->chip = pdata;
 	bu21013_data->client = client;
-	bu21013_data->irq = gpio_to_irq(pdata->touch_pin);
+	bu21013_data->irq = gpiod_to_irq(bu21013_data->int_gpiod);
 
 	bu21013_data->regulator = regulator_get(&client->dev, "avdd");
 	if (IS_ERR(bu21013_data->regulator)) {
 		dev_err(&client->dev, "regulator_get failed\n");
 		error = PTR_ERR(bu21013_data->regulator);
-		goto err_free_mem;
+		goto err_put_int_gpio;
 	}
 
 	error = regulator_enable(bu21013_data->regulator);
@@ -550,13 +531,16 @@ static int bu21013_probe(struct i2c_client *client,
 	bu21013_data->touch_stopped = false;
 	init_waitqueue_head(&bu21013_data->wait);
 
-	/* configure the gpio pins */
-	error = gpio_request_one(pdata->cs_pin, GPIOF_OUT_INIT_HIGH,
-				 "touchp_reset");
-	if (error < 0) {
-		dev_err(&client->dev, "Unable to request gpio reset_pin\n");
+	/* Named "CS" on the chip, DT binding is "reset" */
+	bu21013_data->cs_gpiod = gpiod_get(&client->dev, "reset",
+					   GPIOD_OUT_HIGH);
+	error = PTR_ERR_OR_ZERO(bu21013_data->cs_gpiod);
+	if (error) {
+		if (error != -EPROBE_DEFER)
+			dev_err(&client->dev, "failed to get CS GPIO\n");
 		goto err_disable_regulator;
 	}
+	gpiod_set_consumer_name(bu21013_data->cs_gpiod, "BU21013 CS");
 
 	/* configure the touch panel controller */
 	error = bu21013_init_chip(bu21013_data);
@@ -604,11 +588,14 @@ static int bu21013_probe(struct i2c_client *client,
 err_free_irq:
 	bu21013_free_irq(bu21013_data);
 err_cs_disable:
-	bu21013_cs_disable(bu21013_data);
+	gpiod_set_value(bu21013_data->cs_gpiod, 0);
+	gpiod_put(bu21013_data->cs_gpiod);
 err_disable_regulator:
 	regulator_disable(bu21013_data->regulator);
 err_put_regulator:
 	regulator_put(bu21013_data->regulator);
+err_put_int_gpio:
+	gpiod_put(bu21013_data->int_gpiod);
 err_free_mem:
 	input_free_device(in_dev);
 	kfree(bu21013_data);
@@ -628,13 +615,16 @@ static int bu21013_remove(struct i2c_client *client)
 
 	bu21013_free_irq(bu21013_data);
 
-	bu21013_cs_disable(bu21013_data);
+	gpiod_set_value(bu21013_data->cs_gpiod, 0);
+	gpiod_put(bu21013_data->cs_gpiod);
 
 	input_unregister_device(bu21013_data->in_dev);
 
 	regulator_disable(bu21013_data->regulator);
 	regulator_put(bu21013_data->regulator);
 
+	gpiod_put(bu21013_data->int_gpiod);
+
 	kfree(bu21013_data);
 
 	return 0;
diff --git a/include/linux/input/bu21013.h b/include/linux/input/bu21013.h
index 7e5b7e978e8a..58b1a9d44443 100644
--- a/include/linux/input/bu21013.h
+++ b/include/linux/input/bu21013.h
@@ -11,8 +11,6 @@
  * struct bu21013_platform_device - Handle the platform data
  * @touch_x_max: touch x max
  * @touch_y_max: touch y max
- * @cs_pin: chip select pin
- * @touch_pin: touch gpio pin
  * @ext_clk: external clock flag
  * @x_flip: x flip flag
  * @y_flip: y flip flag
@@ -23,8 +21,6 @@
 struct bu21013_platform_device {
 	int touch_x_max;
 	int touch_y_max;
-	unsigned int cs_pin;
-	unsigned int touch_pin;
 	bool ext_clk;
 	bool x_flip;
 	bool y_flip;
-- 
cgit v1.2.3


From 1eb7b4cacc01771ae42fcbc5ae9a4bc1d13c1dbc Mon Sep 17 00:00:00 2001
From: Dmitry Torokhov <dmitry.torokhov@gmail.com>
Date: Thu, 8 Aug 2019 12:09:43 -0700
Subject: Input: bu21013_ts - remove support for platform data

There are no current users of the platform data in the tree, and
any new users should either use device tree, or static device
properties to describe the device.

This change drops the platform data definition and handling and moves the
driver over to the generic device properties API. We also drop support for
the external clock. If it is needed we will have to extend the bindings to
supply the clock reference and handle it properly in the driver.

Also, the wakeup setting should come from the I2C client.

Tested-by: Linus Walleij <linus.walleij@linaro.org>
Signed-off-by: Dmitry Torokhov <dmitry.torokhov@gmail.com>
---
 drivers/input/touchscreen/bu21013_ts.c | 109 +++++++++++----------------------
 include/linux/input/bu21013.h          |  30 ---------
 2 files changed, 37 insertions(+), 102 deletions(-)
 delete mode 100644 include/linux/input/bu21013.h

(limited to 'include/linux')

diff --git a/drivers/input/touchscreen/bu21013_ts.c b/drivers/input/touchscreen/bu21013_ts.c
index 2b8538a63945..f3b3e4c72c84 100644
--- a/drivers/input/touchscreen/bu21013_ts.c
+++ b/drivers/input/touchscreen/bu21013_ts.c
@@ -4,18 +4,18 @@
  * Author: Naveen Kumar G <naveen.gaddipati@stericsson.com> for ST-Ericsson
  */
 
-#include <linux/kernel.h>
+#include <linux/bitops.h>
 #include <linux/delay.h>
-#include <linux/interrupt.h>
+#include <linux/gpio/consumer.h>
 #include <linux/i2c.h>
-#include <linux/workqueue.h>
 #include <linux/input.h>
-#include <linux/input/bu21013.h>
-#include <linux/slab.h>
-#include <linux/regulator/consumer.h>
+#include <linux/interrupt.h>
+#include <linux/kernel.h>
 #include <linux/module.h>
-#include <linux/gpio/consumer.h>
-#include <linux/of.h>
+#include <linux/property.h>
+#include <linux/regulator/consumer.h>
+#include <linux/slab.h>
+#include <linux/types.h>
 
 #define MAX_FINGERS	2
 #define RESET_DELAY	30
@@ -137,23 +137,32 @@
 /**
  * struct bu21013_ts - touch panel data structure
  * @client: pointer to the i2c client
- * @touch_stopped: touch stop flag
- * @chip: pointer to the touch panel controller
  * @in_dev: pointer to the input device structure
  * @regulator: pointer to the Regulator used for touch screen
  * @cs_gpiod: chip select GPIO line
  * @int_gpiod: touch interrupt GPIO line
+ * @irq: interrupt number the device is using
+ * @touch_x_max: maximum X coordinate reported by the device
+ * @touch_y_max: maximum Y coordinate reported by the device
+ * @x_flip: indicates that the driver should invert X coordinate before
+ *	reporting
+ * @y_flip: indicates that the driver should invert Y coordinate before
+ *	reporting
+ * @touch_stopped: touch stop flag
  *
  * Touch panel device data structure
  */
 struct bu21013_ts {
 	struct i2c_client *client;
-	const struct bu21013_platform_device *chip;
 	struct input_dev *in_dev;
 	struct regulator *regulator;
 	struct gpio_desc *cs_gpiod;
 	struct gpio_desc *int_gpiod;
 	unsigned int irq;
+	u32 touch_x_max;
+	u32 touch_y_max;
+	bool x_flip;
+	bool y_flip;
 	bool touch_stopped;
 };
 
@@ -208,10 +217,10 @@ static int bu21013_do_touch_report(struct bu21013_ts *ts)
 		}
 
 		for (i = 0; i < finger_down_count; i++) {
-			if (ts->chip->x_flip)
-				pos_x[i] = ts->chip->touch_x_max - pos_x[i];
-			if (ts->chip->y_flip)
-				pos_y[i] = ts->chip->touch_y_max - pos_y[i];
+			if (ts->x_flip)
+				pos_x[i] = ts->touch_x_max - pos_x[i];
+			if (ts->y_flip)
+				pos_y[i] = ts->touch_y_max - pos_y[i];
 
 			input_report_abs(ts->in_dev,
 					 ABS_MT_POSITION_X, pos_x[i]);
@@ -304,14 +313,9 @@ static int bu21013_init_chip(struct bu21013_ts *ts)
 		return error;
 	}
 
-	if (ts->chip->ext_clk)
-		error = i2c_smbus_write_byte_data(client, BU21013_CLK_MODE_REG,
-						  BU21013_CLK_MODE_EXT |
-							BU21013_CLK_MODE_CALIB);
-	else
-		error = i2c_smbus_write_byte_data(client, BU21013_CLK_MODE_REG,
-						  BU21013_CLK_MODE_DIV |
-							BU21013_CLK_MODE_CALIB);
+	error = i2c_smbus_write_byte_data(client, BU21013_CLK_MODE_REG,
+					  BU21013_CLK_MODE_DIV |
+						BU21013_CLK_MODE_CALIB);
 	if (error) {
 		dev_err(&client->dev, "BU21013_CLK_MODE reg write failed\n");
 		return error;
@@ -388,43 +392,6 @@ static int bu21013_init_chip(struct bu21013_ts *ts)
 	return 0;
 }
 
-#ifdef CONFIG_OF
-static const struct bu21013_platform_device *
-bu21013_parse_dt(struct device *dev)
-{
-	struct device_node *np = dev->of_node;
-	struct bu21013_platform_device *pdata;
-
-	if (!np) {
-		dev_err(dev, "no device tree or platform data\n");
-		return ERR_PTR(-EINVAL);
-	}
-
-	pdata = devm_kzalloc(dev, sizeof(*pdata), GFP_KERNEL);
-	if (!pdata)
-		return ERR_PTR(-ENOMEM);
-
-	pdata->y_flip = pdata->x_flip = false;
-
-	pdata->x_flip = of_property_read_bool(np, "rohm,flip-x");
-	pdata->y_flip = of_property_read_bool(np, "rohm,flip-y");
-
-	of_property_read_u32(np, "rohm,touch-max-x", &pdata->touch_x_max);
-	of_property_read_u32(np, "rohm,touch-max-y", &pdata->touch_y_max);
-
-	pdata->ext_clk = false;
-
-	return pdata;
-}
-#else
-static inline const struct bu21013_platform_device *
-bu21013_parse_dt(struct device *dev)
-{
-	dev_err(dev, "no platform data available\n");
-	return ERR_PTR(-EINVAL);
-}
-#endif
-
 static void bu21013_power_off(void *_ts)
 {
 	struct bu21013_ts *ts = _ts;
@@ -442,8 +409,6 @@ static void bu21013_disable_chip(void *_ts)
 static int bu21013_probe(struct i2c_client *client,
 			 const struct i2c_device_id *id)
 {
-	const struct bu21013_platform_device *pdata =
-					dev_get_platdata(&client->dev);
 	struct bu21013_ts *ts;
 	struct input_dev *in_dev;
 	int error;
@@ -454,19 +419,20 @@ static int bu21013_probe(struct i2c_client *client,
 		return -EIO;
 	}
 
-	if (!pdata) {
-		pdata = bu21013_parse_dt(&client->dev);
-		if (IS_ERR(pdata))
-			return PTR_ERR(pdata);
-	}
-
 	ts = devm_kzalloc(&client->dev, sizeof(*ts), GFP_KERNEL);
 	if (!ts)
 		return -ENOMEM;
 
-	ts->chip = pdata;
 	ts->client = client;
 
+	ts->x_flip = device_property_read_bool(&client->dev, "rohm,flip-x");
+	ts->y_flip = device_property_read_bool(&client->dev, "rohm,flip-y");
+
+	device_property_read_u32(&client->dev, "rohm,touch-max-x",
+				 &ts->touch_x_max);
+	device_property_read_u32(&client->dev, "rohm,touch-max-y",
+				 &ts->touch_y_max);
+
 	in_dev = devm_input_allocate_device(&client->dev);
 	if (!in_dev) {
 		dev_err(&client->dev, "device memory alloc failed\n");
@@ -483,9 +449,9 @@ static int bu21013_probe(struct i2c_client *client,
 	__set_bit(EV_ABS, in_dev->evbit);
 
 	input_set_abs_params(in_dev, ABS_MT_POSITION_X,
-			     0, pdata->touch_x_max, 0, 0);
+			     0, ts->touch_x_max, 0, 0);
 	input_set_abs_params(in_dev, ABS_MT_POSITION_Y,
-			     0, pdata->touch_y_max, 0, 0);
+			     0, ts->touch_y_max, 0, 0);
 	input_set_drvdata(in_dev, ts);
 
 	ts->regulator = devm_regulator_get(&client->dev, "avdd");
@@ -560,7 +526,6 @@ static int bu21013_probe(struct i2c_client *client,
 		return error;
 	}
 
-	device_init_wakeup(&client->dev, pdata->wakeup);
 	i2c_set_clientdata(client, ts);
 
 	return 0;
diff --git a/include/linux/input/bu21013.h b/include/linux/input/bu21013.h
deleted file mode 100644
index 58b1a9d44443..000000000000
--- a/include/linux/input/bu21013.h
+++ /dev/null
@@ -1,30 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * Copyright (C) ST-Ericsson SA 2010
- * Author: Naveen Kumar G <naveen.gaddipati@stericsson.com> for ST-Ericsson
- */
-
-#ifndef _BU21013_H
-#define _BU21013_H
-
-/**
- * struct bu21013_platform_device - Handle the platform data
- * @touch_x_max: touch x max
- * @touch_y_max: touch y max
- * @ext_clk: external clock flag
- * @x_flip: x flip flag
- * @y_flip: y flip flag
- * @wakeup: wakeup flag
- *
- * This is used to handle the platform data
- */
-struct bu21013_platform_device {
-	int touch_x_max;
-	int touch_y_max;
-	bool ext_clk;
-	bool x_flip;
-	bool y_flip;
-	bool wakeup;
-};
-
-#endif
-- 
cgit v1.2.3